Skip to content

Commit

Permalink
rafa fun: input does not require id column anymore
Browse files Browse the repository at this point in the history
  • Loading branch information
rafapereirabr committed Jan 7, 2025
1 parent c145541 commit 8056906
Show file tree
Hide file tree
Showing 7 changed files with 30 additions and 29 deletions.
15 changes: 8 additions & 7 deletions R/geocode_rafa.R
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,7 @@ geocode_rafa <- function(addresses_table,
checkmate::assert_number(n_cores, lower = 1, finite = TRUE)
checkmate::assert_logical(progress, any.missing = FALSE, len = 1)
checkmate::assert_logical(cache, any.missing = FALSE, len = 1)
checkmate::assert_names(
names(addresses_table),
must.include = "id"
)


# normalize input data -------------------------------------------------------

Expand All @@ -84,17 +81,21 @@ geocode_rafa <- function(addresses_table,
# same column names used in cnefe data set
data.table::setDT(input_padrao)
cols_padr <- grep("_padr", names(input_padrao), value = TRUE)
input_padrao <- input_padrao[, .SD, .SDcols = c("id", cols_padr)]
names(input_padrao) <- c("id", gsub("_padr", "", cols_padr))
input_padrao <- input_padrao[, .SD, .SDcols = c(cols_padr)]
names(input_padrao) <- c(gsub("_padr", "", cols_padr))

data.table::setnames(
x = input_padrao,
old = c('logradouro', 'bairro'),
new = c('logradouro_sem_numero', 'localidade'))

# temporary row id used as the join key, since the input no longer has to carry an id column
input_padrao[, tempidgeocodebr := 1:nrow(input_padrao) ]

### temporary: coerce numero to numeric
input_padrao[, numero := as.numeric(numero)]


# downloading cnefe. we only need to download the states present in the
# addresses table, which may save us some time.
input_states <- unique(input_padrao$estado)
Expand Down Expand Up @@ -234,7 +235,7 @@ geocode_rafa <- function(addresses_table,
con,
x='input_padrao_db',
y='output_db',
key_column='id',
key_column='tempidgeocodebr',
select_columns = x_columns
)

Expand Down
13 changes: 5 additions & 8 deletions R/geocode_rafa_arrow.R
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,6 @@ geocode_rafa_arrow <- function(addresses_table,
checkmate::assert_number(n_cores, lower = 1, finite = TRUE)
checkmate::assert_logical(progress, any.missing = FALSE, len = 1)
checkmate::assert_logical(cache, any.missing = FALSE, len = 1)
checkmate::assert_names(
names(addresses_table),
must.include = "id"
)


# normalize input data -------------------------------------------------------
Expand Down Expand Up @@ -86,15 +82,16 @@ geocode_rafa_arrow <- function(addresses_table,
# same column names used in cnefe data set
data.table::setDT(input_padrao)
cols_padr <- grep("_padr", names(input_padrao), value = TRUE)
input_padrao <- input_padrao[, .SD, .SDcols = c("id", cols_padr)]
names(input_padrao) <- c("id", gsub("_padr", "", cols_padr))
input_padrao <- input_padrao[, .SD, .SDcols = c(cols_padr)]
names(input_padrao) <- c(gsub("_padr", "", cols_padr))

data.table::setnames(
x = input_padrao,
old = c('logradouro', 'bairro'),
new = c('logradouro_sem_numero', 'localidade'))


# temporary row id used as the join key, since the input no longer has to carry an id column
input_padrao[, tempidgeocodebr := 1:nrow(input_padrao) ]

# downloading cnefe. we only need to download the states present in the
# addresses table, which may save us some time.
Expand Down Expand Up @@ -216,7 +213,7 @@ geocode_rafa_arrow <- function(addresses_table,
con,
x='input_padrao_db',
y='output_db',
key_column='id',
key_column='tempidgeocodebr',
select_columns = x_columns
)

Expand Down
2 changes: 1 addition & 1 deletion R/geocodebr.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ cache_config_file <- fs::path(
utils::globalVariables(
c(
"year", "temp_local_file", "lon_min", "lon_max", "lat_min", "lat_max", ".",
"lon_diff", "lat_diff", "lon", "lat", "estado", "municipio",
"lon_diff", "lat_diff", "lon", "lat", "estado", "municipio", "tempidgeocodebr",

# due to reverse geocoding draft
"cep", "lat_cnefe", "localidade", "logradouro_sem_numero", "lon_cnefe",
Expand Down
4 changes: 2 additions & 2 deletions R/match_cases.R
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ match_cases <- function(con, x, y, output_tb, key_cols, match_type){
# query for left join
query_match <- glue::glue(
"CREATE TEMPORARY TABLE {output_tb} AS
SELECT {x}.id, pre_aggregated_cnefe.lon, pre_aggregated_cnefe.lat, {match_type} as match_type
SELECT {x}.tempidgeocodebr, pre_aggregated_cnefe.lon, pre_aggregated_cnefe.lat, {match_type} as match_type
FROM {x}
LEFT JOIN pre_aggregated_cnefe
ON {join_condition}
Expand Down Expand Up @@ -145,7 +145,7 @@ match_cases_arrow <- function(con,
# query for left join
query_match <- glue::glue(
"CREATE TEMPORARY TABLE {output_tb} AS
SELECT {x}.id, filtered_cnefe.lon, filtered_cnefe.lat, {match_type} as match_type
SELECT {x}.tempidgeocodebr, filtered_cnefe.lon, filtered_cnefe.lat, {match_type} as match_type
FROM {x}
LEFT JOIN filtered_cnefe
ON {join_condition}
Expand Down
16 changes: 8 additions & 8 deletions R/match_weighted_cases.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,14 @@ match_weighted_cases <- function(con, x, y, output_tb, key_cols, match_type){


# # Build the dynamic select and group statement
# cols_select <- c('id', key_cols, 'numero')
# cols_select <- c('tempidgeocodebr', key_cols, 'numero')
# cols_select <- paste( glue::glue("{x}.{cols_select}"), collapse = ', ')
# cols_group <- paste(c('id', key_cols, 'numero'), collapse = ", ")
# cols_group <- paste(c('tempidgeocodebr', key_cols, 'numero'), collapse = ", ")

# Construct the SQL match query
query_match <- glue::glue(
"CREATE OR REPLACE TEMPORARY VIEW temp_db AS
SELECT {x}.id, {x}.numero, {y}.numero as numero_db, {y}.lat, {y}.lon
SELECT {x}.tempidgeocodebr, {x}.numero, {y}.numero as numero_db, {y}.lat, {y}.lon
FROM {x}
LEFT JOIN {y}
ON {join_condition}
Expand All @@ -39,12 +39,12 @@ match_weighted_cases <- function(con, x, y, output_tb, key_cols, match_type){
# summarize
query_aggregate <- glue::glue(
"CREATE TEMPORARY TABLE {output_tb} AS
SELECT id,
SELECT tempidgeocodebr,
SUM((1/ABS(numero - numero_db) * lon)) / SUM(1/ABS(numero - numero_db)) AS lon,
SUM((1/ABS(numero - numero_db) * lat)) / SUM(1/ABS(numero - numero_db)) AS lat,
{match_type} as match_type
FROM temp_db
GROUP BY id;"
GROUP BY tempidgeocodebr;"
)
temp_n <- DBI::dbExecute(con, query_aggregate)

Expand Down Expand Up @@ -113,7 +113,7 @@ match_weighted_cases_arrow <- function(con,
# Construct the SQL match query
query_match <- glue::glue(
"CREATE OR REPLACE TEMPORARY VIEW temp_db AS
SELECT {x}.id, {x}.numero, {y}.numero as numero_db, {y}.lat, {y}.lon
SELECT {x}.tempidgeocodebr, {x}.numero, {y}.numero as numero_db, {y}.lat, {y}.lon
FROM {x}
LEFT JOIN {y}
ON {join_condition}
Expand All @@ -125,12 +125,12 @@ match_weighted_cases_arrow <- function(con,
# summarize
query_aggregate <- glue::glue(
"CREATE TEMPORARY TABLE {output_tb} AS
SELECT id,
SELECT tempidgeocodebr,
SUM((1/ABS(numero - numero_db) * lon)) / SUM(1/ABS(numero - numero_db)) AS lon,
SUM((1/ABS(numero - numero_db) * lat)) / SUM(1/ABS(numero - numero_db)) AS lat,
{match_type} as match_type
FROM temp_db
GROUP BY id;"
GROUP BY tempidgeocodebr;"
)

temp_n <- DBI::dbExecute(con, query_aggregate)
Expand Down
7 changes: 5 additions & 2 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ update_input_db <- function(con, update_tb = 'input_padrao_db', reference_tb){

query_remove_matched <- sprintf("
DELETE FROM %s
WHERE id IN (SELECT id FROM %s)", update_tb, reference_tb)
WHERE tempidgeocodebr IN (SELECT tempidgeocodebr FROM %s)", update_tb, reference_tb)
DBI::dbExecute(con, query_remove_matched)
}

Expand Down Expand Up @@ -171,13 +171,16 @@ merge_results <- function(con, x, y, key_column, select_columns){

# x = 'output_db'
# y = 'output_caso_01'
# key_column = 'id'
# key_column = 'tempidgeocodebr'
select_columns_y = c('lon', 'lat', 'match_type')

# drop the temp id column so it is left out of the SELECT clause built below
select_columns <- select_columns[select_columns!='tempidgeocodebr']

# Create the SELECT clause dynamically
# select_x <- paste0(x, '.', c('lon', 'lat', 'match_type '), collapse = ', ')
select_x <- paste0(x, '.', c(select_columns), collapse = ', ')

select_clause <- paste0(
select_x, ',',
paste0(y, ".", select_columns_y, collapse = ", ")
Expand Down
2 changes: 1 addition & 1 deletion tests/tests_rafa/benchmark_LIKE.R
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ mb <- microbenchmark::microbenchmark(
dani = dani(),
rafa = rafa(),
rafa_arrow = rafa_arrow(),
times = 5
times = 10
)
mb

Expand Down

0 comments on commit 8056906

Please sign in to comment.