diff --git a/R/geocode.R b/R/geocode.R index f1ab1e4..b503ad6 100644 --- a/R/geocode.R +++ b/R/geocode.R @@ -84,12 +84,15 @@ geocode <- function(input_table, campos_do_endereco = campos ) - # subset and rename colunms of input_padrao + # keep and rename colunms of input_padrao # keeping same column names used in our cnefe data set cols_padr <- grep("_padr", names(input_padrao_raw), value = TRUE) input_padrao <- input_padrao_raw[, .SD, .SDcols = c("ID", cols_padr)] names(input_padrao) <- c("ID", gsub("_padr", "", cols_padr)) + data.table::setnames(input_padrao, old = 'logradouro', new = 'logradouro_sem_numero') + + # create db connection ------------------------------------------------------- ## this creates a persistent database which allows DuckDB to @@ -127,7 +130,7 @@ geocode <- function(input_table, add_abbrev_state_col(con, update_tb = "input_padrao_db") # determine states present in the input - query <- "SELECT DISTINCT abbrev_state FROM input_padrao_db" + query <- "SELECT DISTINCT estado FROM input_padrao_db" input_states <- DBI::dbGetQuery(con, query)[[1]] # download cnefe @@ -185,13 +188,13 @@ geocode <- function(input_table, # key columns - cols_01 <- c("estado", "municipio", "logradouro", "numero", "cep", "bairro") - cols_02 <- c("estado", "municipio", "logradouro", "numero", "cep") - cols_03 <- c("estado", "municipio", "logradouro", "cep", "bairro") - cols_04 <- c("estado", "municipio", "logradouro", "numero") - cols_05 <- c("estado", "municipio", "logradouro", "cep") - cols_06 <- c("estado", "municipio", "logradouro", "bairro") - cols_07 <- c("estado", "municipio", "logradouro") + cols_01 <- c("estado", "municipio", "logradouro_sem_numero", "numero", "cep", "bairro") + cols_02 <- c("estado", "municipio", "logradouro_sem_numero", "numero", "cep") + cols_03 <- c("estado", "municipio", "logradouro_sem_numero", "cep", "bairro") + cols_04 <- c("estado", "municipio", "logradouro_sem_numero", "numero") + cols_05 <- c("estado", "municipio", "logradouro_sem_numero", "cep") + cols_06 <- c("estado", "municipio", "logradouro_sem_numero", "bairro") + cols_07 <- c("estado", "municipio", "logradouro_sem_numero") cols_08 <- c("estado", "municipio", "cep", "bairro") cols_09 <- c("estado", "municipio", "cep") cols_10 <- c("estado", "municipio", "bairro") @@ -204,8 +207,8 @@ geocode <- function(input_table, } ## CASE 1 -------------------------------------------------------------------- - - temp_n <- match_aggregated_cases( + tictoc::tic() + temp_n <- exact_match_case( con, x = 'input_padrao_db', y = 'filtered_cnefe_cep', @@ -213,7 +216,7 @@ geocode <- function(input_table, key_cols <- cols_01, precision = 1L ) - + tictoc::toc() # UPDATE input_padrao_db: Remove observations found in previous step update_input_db( @@ -417,47 +420,49 @@ geocode <- function(input_table, ## CASE 10 -------------------------------------------------------------------- - - # delete cases where Bairro is missing - delete_null_bairro <- function(tb){ - query_delete_bairro_from_input <- - sprintf('DELETE FROM "%s" WHERE "bairro" IS NOT NULL;', tb) - DBI::dbExecute(con, query_delete_bairro_from_input) - # DBI::dbGetQuery(con, 'SELECT COUNT(*) FROM "input_padrao_db"') - } - delete_null_bairro('input_padrao_db') - delete_null_bairro('filtered_cnefe_cep') - - # narrow down search scope to Bairro - query_narrow_bairros <- - 'DELETE FROM "filtered_cnefe_cep" - WHERE "bairro" NOT IN (SELECT DISTINCT "bairro" FROM "input_padrao_db");' - DBI::dbExecute(con, query_narrow_bairros) - # DBI::dbGetQuery(con, 'SELECT COUNT(*) FROM "filtered_cnefe_cep"') - - - temp_n <- match_aggregated_cases( - con, - x = 'input_padrao_db', - y = 'filtered_cnefe_cep', - output_tb = 'output_caso_10', - key_cols <- cols_10, - precision = 10L - ) - - # UPDATE input_padrao_db: Remove observations found in previous step - update_input_db( - con, - update_tb = 'input_padrao_db', - reference_tb = 'output_caso_10' - ) - - # update progress bar - if (isTRUE(progress)) { - ndone <- ndone + temp_n - utils::setTxtProgressBar(pb, ndone) - base::close(pb) - } + # 666 o cnefe padronizado preciza manter a coluna de bairro + +# # delete cases where Bairro is missing +# delete_null_bairro <- function(tb){ +# query_delete_bairro_from_input <- +# sprintf('DELETE FROM "%s" WHERE "bairro" IS NOT NULL;', tb) +# DBI::dbExecute(con, query_delete_bairro_from_input) +# # DBI::dbGetQuery(con, 'SELECT COUNT(*) FROM "input_padrao_db"') +# } +# delete_null_bairro('input_padrao_db') +# # delete_null_bairro('filtered_cnefe_cep') +# +# +# # narrow down search scope to Bairro +# query_narrow_bairros <- +# 'DELETE FROM "filtered_cnefe_cep" +# WHERE "bairro" NOT IN (SELECT DISTINCT "bairro" FROM "input_padrao_db");' +# DBI::dbExecute(con, query_narrow_bairros) +# # DBI::dbGetQuery(con, 'SELECT COUNT(*) FROM "filtered_cnefe_cep"') +# +# +# temp_n <- match_aggregated_cases( +# con, +# x = 'input_padrao_db', +# y = 'filtered_cnefe_cep', +# output_tb = 'output_caso_10', +# key_cols <- cols_10, +# precision = 10L +# ) +# +# # UPDATE input_padrao_db: Remove observations found in previous step +# update_input_db( +# con, +# update_tb = 'input_padrao_db', +# reference_tb = 'output_caso_10' +# ) +# +# # update progress bar +# if (isTRUE(progress)) { +# ndone <- ndone + temp_n +# utils::setTxtProgressBar(pb, ndone) +# base::close(pb) +# } ## CASE 11 -------------------------------------------------------------------- # TO DO @@ -527,8 +532,8 @@ geocode <- function(input_table, 'output_caso_06', 'output_caso_07', 'output_caso_08', - 'output_caso_09', - 'output_caso_10' + 'output_caso_09' + #, 'output_caso_10' #, 'output_caso_11' ) diff --git a/R/match_aggregated_cases.R b/R/match_aggregated_cases.R index 12394fb..7ccdee7 100644 --- a/R/match_aggregated_cases.R +++ b/R/match_aggregated_cases.R @@ -15,7 +15,7 @@ match_aggregated_cases <- function(con, x, y, output_tb, key_cols, precision){ # x = 'input_padrao_db' # y = 'filtered_cnefe_cep' # output_tb = 'output_caso_01' - # key_cols <- c("estado", "municipio", "logradouro", "numero", "cep", "bairro") + # key_cols <- c("estado", "municipio", "logradouro_sem_numero", "numero", "cep", "bairro") # Create the JOIN condition by concatenating the key columns diff --git a/R/match_case.R b/R/match_case.R index 7cab50b..5935608 100644 --- a/R/match_case.R +++ b/R/match_case.R @@ -15,7 +15,7 @@ match_case <- function(con, x, y, output_tb, key_cols, precision){ # x = 'input_padrao_db' # y = 'filtered_cnefe_cep' # output_tb = 'output_caso_01' - # key_cols <- c("estado", "municipio", "logradouro", "numero", "cep", "bairro") + # key_cols <- c("estado", "municipio", "logradouro_sem_numero", "numero", "cep", "bairro") # Build the dynamic select statement to keep ID and key columns from `x` cols_select <- paste(paste0(x, ".ID"), @@ -90,7 +90,7 @@ exact_match_case <- function(con, x, y, output_tb, key_cols, precision){ # x = 'input_padrao_db' # y = 'filtered_cnefe_cep' # output_tb = 'output_caso_01' - # key_cols <- c("estado", "municipio", "logradouro", "numero", "cep", "bairro") + # key_cols <- c("estado", "municipio", "logradouro_sem_numero", "numero", "cep", "bairro") # Build the dynamic select statement to keep ID and key columns from `x` cols_select <- paste(paste0(x, ".ID"), diff --git a/R/onLoad.R b/R/onLoad.R index a501fc9..628ab2c 100644 --- a/R/onLoad.R +++ b/R/onLoad.R @@ -21,7 +21,7 @@ geocodebr_env <- new.env(parent = emptyenv()) # structure is done # data release - geocodebr_env$data_release <- 'v0.0.1' + geocodebr_env$data_release <- 'v0.1.0' # local cache dir cache_d <- paste0('geocodebr/data_release_', geocodebr_env$data_release) diff --git a/R/utils.R b/R/utils.R index fe96f1a..02b9d74 100644 --- a/R/utils.R +++ b/R/utils.R @@ -76,13 +76,13 @@ cache_message <- function(local_file = parent.frame()$local_file, add_abbrev_state_col <- function(con, update_tb = "input_padrao_db"){ # Assuming `con` is your DuckDB connection and `input_padrao` already exists as a table - # Step 1: Add a new empty column to the existing table - DBI::dbExecute(con, sprintf("ALTER TABLE %s ADD COLUMN abbrev_state VARCHAR", update_tb)) + # # Step 1: Add a new empty column to the existing table + # DBI::dbExecute(con, sprintf("ALTER TABLE %s ADD COLUMN abbrev_state VARCHAR", update_tb)) # Step 2: Update the new column with the state abbreviations using CASE WHEN query_update <- sprintf(" UPDATE %s - SET abbrev_state = CASE + SET estado = CASE WHEN estado = 'RONDONIA' THEN 'RO' WHEN estado = 'ACRE' THEN 'AC' WHEN estado = 'AMAZONAS' THEN 'AM' diff --git a/tests/tests_rafa/tests_arrow_vs_duckdb.R b/tests/tests_rafa/tests_arrow_vs_duckdb.R index 7040787..0102d71 100644 --- a/tests/tests_rafa/tests_arrow_vs_duckdb.R +++ b/tests/tests_rafa/tests_arrow_vs_duckdb.R @@ -13,12 +13,12 @@ #' adicionar dados de POI da Meta /overture #' adicionar dados de enderecos Meta /overture - -# library(geocodebr) +devtools::load_all('.') library(tictoc) +library(dplyr) +# library(geocodebr) # library(enderecobr) # library(data.table) -# library(dplyr) # library(arrow) # library(duckdb) @@ -42,7 +42,7 @@ input_df$ID <- 1:nrow(input_df) # bairro = "Bairro" # municipio = "nm_municipio" # estado = "nm_uf" -# showProgress = TRUE +# progress = TRUE # output_simple = TRUE # ncores = NULL # cache = TRUE @@ -63,7 +63,7 @@ df_duck3 <- geocode( estado = "nm_uf", output_simple = F, ncores=NULL, - showProgress = T + progress = T ) tictoc::toc() #> 28: 4 - 5 (in memory)