diff --git a/R/geocode_rafa.R b/R/geocode_rafa.R index 1710498..1e33512 100644 --- a/R/geocode_rafa.R +++ b/R/geocode_rafa.R @@ -55,10 +55,7 @@ geocode_rafa <- function(addresses_table, checkmate::assert_number(n_cores, lower = 1, finite = TRUE) checkmate::assert_logical(progress, any.missing = FALSE, len = 1) checkmate::assert_logical(cache, any.missing = FALSE, len = 1) - checkmate::assert_names( - names(addresses_table), - must.include = "id" - ) + # normalize input data ------------------------------------------------------- @@ -84,17 +81,21 @@ geocode_rafa <- function(addresses_table, # same column names used in cnefe data set data.table::setDT(input_padrao) cols_padr <- grep("_padr", names(input_padrao), value = TRUE) - input_padrao <- input_padrao[, .SD, .SDcols = c("id", cols_padr)] - names(input_padrao) <- c("id", gsub("_padr", "", cols_padr)) + input_padrao <- input_padrao[, .SD, .SDcols = c(cols_padr)] + names(input_padrao) <- c(gsub("_padr", "", cols_padr)) data.table::setnames( x = input_padrao, old = c('logradouro', 'bairro'), new = c('logradouro_sem_numero', 'localidade')) + # temp id + input_padrao[, tempidgeocodebr := 1:nrow(input_padrao) ] + ### temporary input_padrao[, numero := as.numeric(numero)] + # downloading cnefe. we only need to download the states present in the # addresses table, which may save us some time. input_states <- unique(input_padrao$estado) @@ -234,7 +235,7 @@ geocode_rafa <- function(addresses_table, con, x='input_padrao_db', y='output_db', - key_column='id', + key_column='tempidgeocodebr', select_columns = x_columns ) diff --git a/R/geocode_rafa_arrow.R b/R/geocode_rafa_arrow.R index 9be0a3b..c3e15b4 100644 --- a/R/geocode_rafa_arrow.R +++ b/R/geocode_rafa_arrow.R @@ -55,10 +55,6 @@ geocode_rafa_arrow <- function(addresses_table, checkmate::assert_number(n_cores, lower = 1, finite = TRUE) checkmate::assert_logical(progress, any.missing = FALSE, len = 1) checkmate::assert_logical(cache, any.missing = FALSE, len = 1) - checkmate::assert_names( - names(addresses_table), - must.include = "id" - ) # normalize input data ------------------------------------------------------- @@ -86,15 +82,16 @@ geocode_rafa_arrow <- function(addresses_table, # same column names used in cnefe data set data.table::setDT(input_padrao) cols_padr <- grep("_padr", names(input_padrao), value = TRUE) - input_padrao <- input_padrao[, .SD, .SDcols = c("id", cols_padr)] - names(input_padrao) <- c("id", gsub("_padr", "", cols_padr)) + input_padrao <- input_padrao[, .SD, .SDcols = c(cols_padr)] + names(input_padrao) <- c(gsub("_padr", "", cols_padr)) data.table::setnames( x = input_padrao, old = c('logradouro', 'bairro'), new = c('logradouro_sem_numero', 'localidade')) - + # temp id + input_padrao[, tempidgeocodebr := 1:nrow(input_padrao) ] # downloading cnefe. we only need to download the states present in the # addresses table, which may save us some time. @@ -216,7 +213,7 @@ geocode_rafa_arrow <- function(addresses_table, con, x='input_padrao_db', y='output_db', - key_column='id', + key_column='tempidgeocodebr', select_columns = x_columns ) diff --git a/R/geocodebr.R b/R/geocodebr.R index 2acb2dc..22dfcbf 100644 --- a/R/geocodebr.R +++ b/R/geocodebr.R @@ -28,7 +28,7 @@ cache_config_file <- fs::path( utils::globalVariables( c( "year", "temp_local_file", "lon_min", "lon_max", "lat_min", "lat_max", ".", - "lon_diff", "lat_diff", "lon", "lat", "estado", "municipio", + "lon_diff", "lat_diff", "lon", "lat", "estado", "municipio", "tempidgeocodebr", # due to reverse geocoding draft "cep", "lat_cnefe", "localidade", "logradouro_sem_numero", "lon_cnefe", diff --git a/R/match_cases.R b/R/match_cases.R index 5d665ac..0e963a3 100644 --- a/R/match_cases.R +++ b/R/match_cases.R @@ -51,7 +51,7 @@ match_cases <- function(con, x, y, output_tb, key_cols, match_type){ # query for left join query_match <- glue::glue( "CREATE TEMPORARY TABLE {output_tb} AS - SELECT {x}.id, pre_aggregated_cnefe.lon, pre_aggregated_cnefe.lat, {match_type} as match_type + SELECT {x}.tempidgeocodebr, pre_aggregated_cnefe.lon, pre_aggregated_cnefe.lat, {match_type} as match_type FROM {x} LEFT JOIN pre_aggregated_cnefe ON {join_condition} @@ -145,7 +145,7 @@ match_cases_arrow <- function(con, # query for left join query_match <- glue::glue( "CREATE TEMPORARY TABLE {output_tb} AS - SELECT {x}.id, filtered_cnefe.lon, filtered_cnefe.lat, {match_type} as match_type + SELECT {x}.tempidgeocodebr, filtered_cnefe.lon, filtered_cnefe.lat, {match_type} as match_type FROM {x} LEFT JOIN filtered_cnefe ON {join_condition} diff --git a/R/match_weighted_cases.R b/R/match_weighted_cases.R index 370d56f..09c460e 100644 --- a/R/match_weighted_cases.R +++ b/R/match_weighted_cases.R @@ -20,14 +20,14 @@ match_weighted_cases <- function(con, x, y, output_tb, key_cols, match_type){ # # Build the dynamic select and group statement -# cols_select <- c('id', key_cols, 'numero') +# cols_select <- c('tempidgeocodebr', key_cols, 'numero') # cols_select <- paste( glue::glue("{x}.{cols_select}"), collapse = ', ') -# cols_group <- paste(c('id', key_cols, 'numero'), collapse = ", ") +# cols_group <- paste(c('tempidgeocodebr', key_cols, 'numero'), collapse = ", ") # Construct the SQL match query query_match <- glue::glue( "CREATE OR REPLACE TEMPORARY VIEW temp_db AS - SELECT {x}.id, {x}.numero, {y}.numero as numero_db, {y}.lat, {y}.lon + SELECT {x}.tempidgeocodebr, {x}.numero, {y}.numero as numero_db, {y}.lat, {y}.lon FROM {x} LEFT JOIN {y} ON {join_condition} @@ -39,12 +39,12 @@ match_weighted_cases <- function(con, x, y, output_tb, key_cols, match_type){ # summarize query_aggregate <- glue::glue( "CREATE TEMPORARY TABLE {output_tb} AS - SELECT id, + SELECT tempidgeocodebr, SUM((1/ABS(numero - numero_db) * lon)) / SUM(1/ABS(numero - numero_db)) AS lon, SUM((1/ABS(numero - numero_db) * lat)) / SUM(1/ABS(numero - numero_db)) AS lat, {match_type} as match_type FROM temp_db - GROUP BY id;" + GROUP BY tempidgeocodebr;" ) temp_n <- DBI::dbExecute(con, query_aggregate) @@ -113,7 +113,7 @@ match_weighted_cases_arrow <- function(con, # Construct the SQL match query query_match <- glue::glue( "CREATE OR REPLACE TEMPORARY VIEW temp_db AS - SELECT {x}.id, {x}.numero, {y}.numero as numero_db, {y}.lat, {y}.lon + SELECT {x}.tempidgeocodebr, {x}.numero, {y}.numero as numero_db, {y}.lat, {y}.lon FROM {x} LEFT JOIN {y} ON {join_condition} @@ -125,12 +125,12 @@ match_weighted_cases_arrow <- function(con, # summarize query_aggregate <- glue::glue( "CREATE TEMPORARY TABLE {output_tb} AS - SELECT id, + SELECT tempidgeocodebr, SUM((1/ABS(numero - numero_db) * lon)) / SUM(1/ABS(numero - numero_db)) AS lon, SUM((1/ABS(numero - numero_db) * lat)) / SUM(1/ABS(numero - numero_db)) AS lat, {match_type} as match_type FROM temp_db - GROUP BY id;" + GROUP BY tempidgeocodebr;" ) temp_n <- DBI::dbExecute(con, query_aggregate) diff --git a/R/utils.R b/R/utils.R index 8363ec4..35f540b 100644 --- a/R/utils.R +++ b/R/utils.R @@ -139,7 +139,7 @@ update_input_db <- function(con, update_tb = 'input_padrao_db', reference_tb){ query_remove_matched <- sprintf(" DELETE FROM %s - WHERE id IN (SELECT id FROM %s)", update_tb, reference_tb) + WHERE tempidgeocodebr IN (SELECT tempidgeocodebr FROM %s)", update_tb, reference_tb) DBI::dbExecute(con, query_remove_matched) } @@ -171,13 +171,16 @@ merge_results <- function(con, x, y, key_column, select_columns){ # x = 'output_db' # y = 'output_caso_01' - # key_column = 'id' + # key_column = 'tempidgeocodebr' select_columns_y = c('lon', 'lat', 'match_type') + # drop temp id column + select_columns <- select_columns[select_columns!='tempidgeocodebr'] # Create the SELECT clause dynamically # select_x <- paste0(x, '.', c('lon', 'lat', 'match_type '), collapse = ', ') select_x <- paste0(x, '.', c(select_columns), collapse = ', ') + select_clause <- paste0( select_x, ',', paste0(y, ".", select_columns_y, collapse = ", ") diff --git a/tests/tests_rafa/benchmark_LIKE.R b/tests/tests_rafa/benchmark_LIKE.R index 971244b..218e9ac 100644 --- a/tests/tests_rafa/benchmark_LIKE.R +++ b/tests/tests_rafa/benchmark_LIKE.R @@ -172,7 +172,7 @@ mb <- microbenchmark::microbenchmark( dani = dani(), rafa = rafa(), rafa_arrow = rafa_arrow(), - times = 5 + times = 10 ) mb