Skip to content

Commit

Permalink
adjusting to new cnefe_padrao
Browse files Browse the repository at this point in the history
  • Loading branch information
rafapereirabr committed Dec 10, 2024
1 parent 929c475 commit c8d19ee
Show file tree
Hide file tree
Showing 6 changed files with 72 additions and 67 deletions.
115 changes: 60 additions & 55 deletions R/geocode.R
Original file line number Diff line number Diff line change
Expand Up @@ -84,12 +84,15 @@ geocode <- function(input_table,
campos_do_endereco = campos
)

# subset and rename columns of input_padrao
# keep and rename columns of input_padrao
# keeping same column names used in our cnefe data set
cols_padr <- grep("_padr", names(input_padrao_raw), value = TRUE)
input_padrao <- input_padrao_raw[, .SD, .SDcols = c("ID", cols_padr)]
names(input_padrao) <- c("ID", gsub("_padr", "", cols_padr))

data.table::setnames(input_padrao, old = 'logradouro', new = 'logradouro_sem_numero')


# create db connection -------------------------------------------------------

## this creates a persistent database which allows DuckDB to
Expand Down Expand Up @@ -127,7 +130,7 @@ geocode <- function(input_table,
add_abbrev_state_col(con, update_tb = "input_padrao_db")

# determine states present in the input
query <- "SELECT DISTINCT abbrev_state FROM input_padrao_db"
query <- "SELECT DISTINCT estado FROM input_padrao_db"
input_states <- DBI::dbGetQuery(con, query)[[1]]

# download cnefe
Expand Down Expand Up @@ -185,13 +188,13 @@ geocode <- function(input_table,


# key columns
cols_01 <- c("estado", "municipio", "logradouro", "numero", "cep", "bairro")
cols_02 <- c("estado", "municipio", "logradouro", "numero", "cep")
cols_03 <- c("estado", "municipio", "logradouro", "cep", "bairro")
cols_04 <- c("estado", "municipio", "logradouro", "numero")
cols_05 <- c("estado", "municipio", "logradouro", "cep")
cols_06 <- c("estado", "municipio", "logradouro", "bairro")
cols_07 <- c("estado", "municipio", "logradouro")
cols_01 <- c("estado", "municipio", "logradouro_sem_numero", "numero", "cep", "bairro")
cols_02 <- c("estado", "municipio", "logradouro_sem_numero", "numero", "cep")
cols_03 <- c("estado", "municipio", "logradouro_sem_numero", "cep", "bairro")
cols_04 <- c("estado", "municipio", "logradouro_sem_numero", "numero")
cols_05 <- c("estado", "municipio", "logradouro_sem_numero", "cep")
cols_06 <- c("estado", "municipio", "logradouro_sem_numero", "bairro")
cols_07 <- c("estado", "municipio", "logradouro_sem_numero")
cols_08 <- c("estado", "municipio", "cep", "bairro")
cols_09 <- c("estado", "municipio", "cep")
cols_10 <- c("estado", "municipio", "bairro")
Expand All @@ -204,16 +207,16 @@ geocode <- function(input_table,
}

## CASE 1 --------------------------------------------------------------------

temp_n <- match_aggregated_cases(
tictoc::tic()
temp_n <- exact_match_case(
con,
x = 'input_padrao_db',
y = 'filtered_cnefe_cep',
output_tb = 'output_caso_01',
key_cols <- cols_01,
precision = 1L
)

tictoc::toc()

# UPDATE input_padrao_db: Remove observations found in previous step
update_input_db(
Expand Down Expand Up @@ -417,47 +420,49 @@ geocode <- function(input_table,

## CASE 10 --------------------------------------------------------------------


# delete cases where Bairro is missing
# Delete from table `tb` every row whose "bairro" column is missing (NULL).
#
# tb: name of the table to prune, given as a single string; it is
#     interpolated into the SQL statement inside double quotes.
# Returns the value of DBI::dbExecute() for the DELETE statement.
#
# Relies on the connection object `con` found in the enclosing scope.
delete_null_bairro <- function(tb){
  # The condition must be IS NULL: the earlier IS NOT NULL form deleted the
  # rows that DO have a bairro, i.e. the only rows a bairro-based match
  # could use, and kept the ones with a missing bairro.
  query_delete_bairro_from_input <-
    sprintf('DELETE FROM "%s" WHERE "bairro" IS NULL;', tb)
  DBI::dbExecute(con, query_delete_bairro_from_input)
  # DBI::dbGetQuery(con, 'SELECT COUNT(*) FROM "input_padrao_db"')
}
delete_null_bairro('input_padrao_db')
delete_null_bairro('filtered_cnefe_cep')

# narrow down search scope to Bairro
query_narrow_bairros <-
'DELETE FROM "filtered_cnefe_cep"
WHERE "bairro" NOT IN (SELECT DISTINCT "bairro" FROM "input_padrao_db");'
DBI::dbExecute(con, query_narrow_bairros)
# DBI::dbGetQuery(con, 'SELECT COUNT(*) FROM "filtered_cnefe_cep"')


temp_n <- match_aggregated_cases(
con,
x = 'input_padrao_db',
y = 'filtered_cnefe_cep',
output_tb = 'output_caso_10',
key_cols <- cols_10,
precision = 10L
)

# UPDATE input_padrao_db: Remove observations found in previous step
update_input_db(
con,
update_tb = 'input_padrao_db',
reference_tb = 'output_caso_10'
)

# update progress bar
if (isTRUE(progress)) {
ndone <- ndone + temp_n
utils::setTxtProgressBar(pb, ndone)
base::close(pb)
}
# 666 the standardized CNEFE data needs to keep the bairro column

# # delete cases where Bairro is missing
# delete_null_bairro <- function(tb){
# query_delete_bairro_from_input <-
# sprintf('DELETE FROM "%s" WHERE "bairro" IS NOT NULL;', tb)
# DBI::dbExecute(con, query_delete_bairro_from_input)
# # DBI::dbGetQuery(con, 'SELECT COUNT(*) FROM "input_padrao_db"')
# }
# delete_null_bairro('input_padrao_db')
# # delete_null_bairro('filtered_cnefe_cep')
#
#
# # narrow down search scope to Bairro
# query_narrow_bairros <-
# 'DELETE FROM "filtered_cnefe_cep"
# WHERE "bairro" NOT IN (SELECT DISTINCT "bairro" FROM "input_padrao_db");'
# DBI::dbExecute(con, query_narrow_bairros)
# # DBI::dbGetQuery(con, 'SELECT COUNT(*) FROM "filtered_cnefe_cep"')
#
#
# temp_n <- match_aggregated_cases(
# con,
# x = 'input_padrao_db',
# y = 'filtered_cnefe_cep',
# output_tb = 'output_caso_10',
# key_cols <- cols_10,
# precision = 10L
# )
#
# # UPDATE input_padrao_db: Remove observations found in previous step
# update_input_db(
# con,
# update_tb = 'input_padrao_db',
# reference_tb = 'output_caso_10'
# )
#
# # update progress bar
# if (isTRUE(progress)) {
# ndone <- ndone + temp_n
# utils::setTxtProgressBar(pb, ndone)
# base::close(pb)
# }

## CASE 11 --------------------------------------------------------------------
# TO DO
Expand Down Expand Up @@ -527,8 +532,8 @@ geocode <- function(input_table,
'output_caso_06',
'output_caso_07',
'output_caso_08',
'output_caso_09',
'output_caso_10'
'output_caso_09'
#, 'output_caso_10'
#, 'output_caso_11'
)

Expand Down
2 changes: 1 addition & 1 deletion R/match_aggregated_cases.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ match_aggregated_cases <- function(con, x, y, output_tb, key_cols, precision){
# x = 'input_padrao_db'
# y = 'filtered_cnefe_cep'
# output_tb = 'output_caso_01'
# key_cols <- c("estado", "municipio", "logradouro", "numero", "cep", "bairro")
# key_cols <- c("estado", "municipio", "logradouro_sem_numero", "numero", "cep", "bairro")


# Create the JOIN condition by concatenating the key columns
Expand Down
4 changes: 2 additions & 2 deletions R/match_case.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ match_case <- function(con, x, y, output_tb, key_cols, precision){
# x = 'input_padrao_db'
# y = 'filtered_cnefe_cep'
# output_tb = 'output_caso_01'
# key_cols <- c("estado", "municipio", "logradouro", "numero", "cep", "bairro")
# key_cols <- c("estado", "municipio", "logradouro_sem_numero", "numero", "cep", "bairro")

# Build the dynamic select statement to keep ID and key columns from `x`
cols_select <- paste(paste0(x, ".ID"),
Expand Down Expand Up @@ -90,7 +90,7 @@ exact_match_case <- function(con, x, y, output_tb, key_cols, precision){
# x = 'input_padrao_db'
# y = 'filtered_cnefe_cep'
# output_tb = 'output_caso_01'
# key_cols <- c("estado", "municipio", "logradouro", "numero", "cep", "bairro")
# key_cols <- c("estado", "municipio", "logradouro_sem_numero", "numero", "cep", "bairro")

# Build the dynamic select statement to keep ID and key columns from `x`
cols_select <- paste(paste0(x, ".ID"),
Expand Down
2 changes: 1 addition & 1 deletion R/onLoad.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ geocodebr_env <- new.env(parent = emptyenv())
# structure is done

# data release
geocodebr_env$data_release <- 'v0.0.1'
geocodebr_env$data_release <- 'v0.1.0'

# local cache dir
cache_d <- paste0('geocodebr/data_release_', geocodebr_env$data_release)
Expand Down
6 changes: 3 additions & 3 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -76,13 +76,13 @@ cache_message <- function(local_file = parent.frame()$local_file,
add_abbrev_state_col <- function(con, update_tb = "input_padrao_db"){
# Assuming `con` is your DuckDB connection and `input_padrao` already exists as a table

# Step 1: Add a new empty column to the existing table
DBI::dbExecute(con, sprintf("ALTER TABLE %s ADD COLUMN abbrev_state VARCHAR", update_tb))
# # Step 1: Add a new empty column to the existing table
# DBI::dbExecute(con, sprintf("ALTER TABLE %s ADD COLUMN abbrev_state VARCHAR", update_tb))

# Step 2: Update the new column with the state abbreviations using CASE WHEN
query_update <- sprintf("
UPDATE %s
SET abbrev_state = CASE
SET estado = CASE
WHEN estado = 'RONDONIA' THEN 'RO'
WHEN estado = 'ACRE' THEN 'AC'
WHEN estado = 'AMAZONAS' THEN 'AM'
Expand Down
10 changes: 5 additions & 5 deletions tests/tests_rafa/tests_arrow_vs_duckdb.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,12 @@
#' adicionar dados de POI da Meta /overture
#' adicionar dados de enderecos Meta /overture


# library(geocodebr)
devtools::load_all('.')
library(tictoc)
library(dplyr)
# library(geocodebr)
# library(enderecobr)
# library(data.table)
# library(dplyr)
# library(arrow)
# library(duckdb)

Expand All @@ -42,7 +42,7 @@ input_df$ID <- 1:nrow(input_df)
# bairro = "Bairro"
# municipio = "nm_municipio"
# estado = "nm_uf"
# showProgress = TRUE
# progress = TRUE
# output_simple = TRUE
# ncores = NULL
# cache = TRUE
Expand All @@ -63,7 +63,7 @@ df_duck3 <- geocode(
estado = "nm_uf",
output_simple = F,
ncores=NULL,
showProgress = T
progress = T
)
tictoc::toc()
#> 28: 4 - 5 (in memory)
Expand Down

0 comments on commit c8d19ee

Please sign in to comment.