Skip to content

Commit

Permalink
filtered_cnefe with arrow
Browse files Browse the repository at this point in the history
  • Loading branch information
Rafael Henrique Moraes Pereira authored and Rafael Henrique Moraes Pereira committed Dec 19, 2024
1 parent ba3659f commit 046f5d3
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 25 deletions.
29 changes: 13 additions & 16 deletions R/geocode.R
Original file line number Diff line number Diff line change
Expand Up @@ -97,19 +97,14 @@ geocode <- function(addresses_table,
)

# downloading cnefe. we only need to download the states present in the
# addresses table, which may save us some time. we also subset cnefe to
# include only the municipalities present in the input table, reducing the
# search scope and consequently reducing processing time and memory usage
# addresses table, which may save us some time.

present_states <- unique(standard_locations$estado_padr)
download_cnefe(present_states, progress = progress, cache = cache)


cnefe <- arrow::open_dataset(get_cache_dir())
cnefe <- dplyr::filter(cnefe, estado %in% present_states)

# creating a temporary db and registering both the input table and the cnefe
# data
# creating a temporary db and registering the input table data

tmpdb <- tempfile(fileext = ".duckdb")
con <- duckdb::dbConnect(duckdb::duckdb(), dbdir = tmpdb)
Expand All @@ -123,17 +118,19 @@ geocode <- function(addresses_table,
temporary = TRUE
)

# register the cnefe data in the db, including only the states and
# municipalities present in the input table. this narrows the search scope
# and, consequently, reduces processing time and memory usage

unique_muns <- unique(standard_locations$municipio_padr)
muns_list <- paste(glue::glue("'{unique_muns}'"), collapse = ", ")

duckdb::duckdb_register_arrow(con, "cnefe", cnefe)
DBI::dbExecute(
con,
glue::glue(
"CREATE OR REPLACE VIEW filtered_cnefe AS ",
"SELECT * FROM cnefe WHERE municipio IN ({muns_list})"
)
)
filtered_cnefe <- arrow::open_dataset(get_cache_dir()) |>
dplyr::filter(estado %in% present_states) |>
dplyr::filter(municipio %in% unique_muns) |>
dplyr::compute()

duckdb::duckdb_register_arrow(con, "filtered_cnefe", filtered_cnefe)


# to find the coordinates of the addresses, we merge the input table with the
# cnefe data. the column names used in the input table are different than the
Expand Down
5 changes: 3 additions & 2 deletions R/geocode_rafa.R
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ geocode_rafa <- function(input_table,

# Convert input data frame to DuckDB table
duckdb::dbWriteTable(con, "input_padrao_db", input_padrao,
temporary = TRUE, overwrite=TRUE)
temporary = TRUE)

input_states <- unique(input_padrao$estado)

Expand All @@ -123,7 +123,8 @@ geocode_rafa <- function(input_table,
if(is.null(input_municipio)){ input_municipio <- "*"}

# Load CNEFE data and write to DuckDB
filtered_cnefe <- arrow_open_dataset(geocodebr::get_cache_dir()) |>
filtered_cnefe <- arrow::open_dataset(get_cache_dir()) |>
dplyr::filter(estado %in% input_states) |>
dplyr::filter(municipio %in% input_municipio) |>
dplyr::compute()

Expand Down
11 changes: 4 additions & 7 deletions tests/tests_rafa/tests_arrow_vs_duckdb.R
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,6 @@ fields <- geocodebr::setup_address_fields(
estado = 'nm_uf'
)


df_duck_dani <- geocodebr:::geocode(
addresses_table = input_df,
address_fields = fields,
Expand All @@ -79,7 +78,7 @@ df_duck_rafa <- geocodebr:::geocode_rafa(
municipio = "nm_municipio",
estado = "nm_uf",
output_simple = F,
ncores=7,
n_cores=7,
progress = T
)
tictoc::toc()
Expand Down Expand Up @@ -147,17 +146,16 @@ query_aggregate_and_match <- sprintf(


rafa <- function(){
df_duck_rafa <- geocode(
df_duck_rafa <- geocodebr:::geocode_rafa(
input_table = input_df,
logradouro = "nm_logradouro",
numero = "Numero",
complemento = "Complemento",
cep = "Cep",
bairro = "Bairro",
municipio = "nm_municipio",
estado = "nm_uf",
output_simple = F,
ncores=7,
n_cores=7,
progress = T
)
}
Expand All @@ -166,15 +164,14 @@ rafa <- function(){
fields <- geocodebr::setup_address_fields(
logradouro = 'nm_logradouro',
numero = 'Numero',
complemento = 'Complemento',
cep = 'Cep',
bairro = 'Bairro',
municipio = 'nm_municipio',
estado = 'nm_uf'
)


df_duck_dani <- geocodebr:::geocode2(
df_duck_dani <- geocodebr:::geocode(
addresses_table = input_df,
address_fields = fields,
n_cores = 7,
Expand Down

0 comments on commit 046f5d3

Please sign in to comment.