Skip to content

Commit

Permalink
new argument keep_matched_address added to geocode
Browse files Browse the repository at this point in the history
  • Loading branch information
rafapereirabr committed Jan 11, 2025
1 parent 1403f24 commit 1b82156
Show file tree
Hide file tree
Showing 5 changed files with 79 additions and 63 deletions.
12 changes: 9 additions & 3 deletions R/geocode.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
#' parameters.
#' @template n_cores
#' @template progress
#' @param keep_matched_address Logical. Whether the output should include a
#' column indicating the matched address of reference. Defaults to `FALSE`.
#' @template cache
#'
#' @return Returns the data frame passed in `addresses_table` with the latitude
Expand Down Expand Up @@ -55,13 +57,15 @@ geocode <- function(addresses_table,
address_fields = setup_address_fields(),
n_cores = 1,
progress = TRUE,
keep_matched_address = FALSE,
cache = TRUE){
# check input
assert_address_fields(address_fields, addresses_table)
checkmate::assert_data_frame(addresses_table)
checkmate::assert_number(n_cores, lower = 1, finite = TRUE)
checkmate::assert_logical(progress, any.missing = FALSE, len = 1)
checkmate::assert_logical(cache, any.missing = FALSE, len = 1)
checkmate::assert_logical(keep_matched_address, any.missing = FALSE, len = 1)

# normalize input data -------------------------------------------------------

Expand All @@ -71,7 +75,7 @@ geocode <- function(addresses_table,
if (progress) message_standardizing_addresses()


# TEMP. necessario para garantir que numero de input 0 vire 'S/N'
# TEMP. necessario para garantir que numero de input 0 vire 'S/N'
data.table::setDT(addresses_table)
addresses_table[, address_fields['numero'] := as.character( get(address_fields['numero']) )]

Expand Down Expand Up @@ -167,6 +171,7 @@ geocode <- function(addresses_table,
output_tb = paste0('output_', case),
key_cols = relevant_cols,
match_type = case,
keep_matched_address = keep_matched_address,
input_states = input_states,
input_municipio = input_municipio
)
Expand Down Expand Up @@ -213,11 +218,12 @@ geocode <- function(addresses_table,
x='input_db',
y='output_db',
key_column='tempidgeocodebr',
select_columns = x_columns
select_columns = x_columns,
keep_matched_address = keep_matched_address
)

# Disconnect from DuckDB when done
duckdb::dbDisconnect(con, shutdown=TRUE)
duckdb::dbDisconnect(con)

# Return the result
return(output_deterministic)
Expand Down
5 changes: 5 additions & 0 deletions R/match_cases.R
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ match_cases_arrow <- function(con,
output_tb,
key_cols,
match_type,
keep_matched_address,
input_states,
input_municipio
){
Expand Down Expand Up @@ -161,6 +162,10 @@ match_cases_arrow <- function(con,
query_match <- gsub("input_padrao_db.numero IS NOT NULL AND", "", query_match)
}

if (isFALSE(keep_matched_address)) {
query_match <- gsub(", filtered_cnefe.endereco_completo as matched_address", "", query_match)
}

temp_n <- DBI::dbExecute(con, query_match)
duckdb::duckdb_unregister_arrow(con, "filtered_cnefe")

Expand Down
20 changes: 19 additions & 1 deletion R/match_weighted_cases.R
Original file line number Diff line number Diff line change
Expand Up @@ -79,9 +79,11 @@ match_weighted_cases_arrow <- function(con,
output_tb,
key_cols,
match_type,
keep_matched_address,
input_states,
input_municipio){


# read corresponding parquet file
table_name <- paste(key_cols, collapse = "_")
table_name <- gsub('estado_municipio', 'municipio', table_name)
Expand Down Expand Up @@ -118,7 +120,6 @@ match_weighted_cases_arrow <- function(con,
# join_condition <- gsub("= input_padrao_db.logradouro_sem_numero", "LIKE '%' || input_padrao_db.logradouro_sem_numero || '%'", join_condition)
# }


# Construct the SQL match query
query_match <- glue::glue(
"CREATE OR REPLACE TEMPORARY VIEW temp_db AS
Expand All @@ -128,6 +129,11 @@ match_weighted_cases_arrow <- function(con,
ON {join_condition}
WHERE {x}.numero IS NOT NULL AND {y}.numero IS NOT NULL;"
)

if (isFALSE(keep_matched_address)) {
query_match <- gsub(", filtered_cnefe.endereco_completo", "", query_match)
}

DBI::dbExecute(con, query_match)


Expand All @@ -143,6 +149,18 @@ match_weighted_cases_arrow <- function(con,
GROUP BY tempidgeocodebr;"
)

if (isFALSE(keep_matched_address)) {
query_aggregate <- glue::glue(
"CREATE TEMPORARY TABLE {output_tb} AS
SELECT tempidgeocodebr,
SUM((1/ABS(numero - numero_db) * lon)) / SUM(1/ABS(numero - numero_db)) AS lon,
SUM((1/ABS(numero - numero_db) * lat)) / SUM(1/ABS(numero - numero_db)) AS lat,
'{match_type}' AS match_type
FROM temp_db
GROUP BY tempidgeocodebr;"
)
}

temp_n <- DBI::dbExecute(con, query_aggregate)
duckdb::duckdb_unregister_arrow(con, "filtered_cnefe")

Expand Down
32 changes: 20 additions & 12 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -129,13 +129,22 @@ add_precision_col <- function(con, update_tb = NULL){



merge_results <- function(con, x, y, key_column, select_columns){
merge_results <- function(con,
x,
y,
key_column,
select_columns,
keep_matched_address){

# x = 'output_db'
# y = 'output_caso_01'
# key_column = 'tempidgeocodebr'
select_columns_y = c('lon', 'lat', 'match_type', 'precision', 'matched_address')

if (isFALSE(keep_matched_address)) {
select_columns_y <- select_columns_y[select_columns_y != 'matched_address']
}

# drop temp id column
select_columns <- select_columns[select_columns!='tempidgeocodebr']

Expand All @@ -148,19 +157,18 @@ merge_results <- function(con, x, y, key_column, select_columns){
paste0(y, ".", select_columns_y, collapse = ", ")
)

# Create the SQL query
query <- sprintf("
SELECT %s
FROM %s
LEFT JOIN %s
ON %s.%s = %s.%s",
select_clause, # Selected columns
x, # Left table
y, # Right table
x, key_column, # Left table and key column
y, key_column # Right table and key column
join_condition <- paste(
glue::glue("{x}.{key_column} = {y}.{key_column}"),
collapse = ' ON '
)

# Create the SQL query
query <- glue::glue(
"SELECT {select_clause}
FROM {x}
LEFT JOIN {y}
ON {join_condition} "
)

# Execute the query and fetch the merged data
merged_data <- DBI::dbGetQuery(con, query)
Expand Down
73 changes: 26 additions & 47 deletions tests/tests_rafa/benchmark_LIKE.R
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ input_df <- arrow::read_parquet(data_path)
# ncores <- 7
# progress = T
# cache = TRUE
# keep_matched_address = F
# address_fields <- geocodebr::setup_address_fields(
# logradouro = 'logradouro',
# numero = 'numero',
Expand Down Expand Up @@ -86,60 +87,31 @@ ncores <- 7



fields <- geocodebr::setup_address_fields(
logradouro = 'logradouro',
numero = 'numero',
cep = 'cep',
bairro = 'bairro',
municipio = 'municipio',
estado = 'uf'
)

# Benchmark case: geocode `input_df` with geocodebr::geocode(), building the
# address-field mapping locally and leaving `keep_matched_address` at its
# default. Relies on the script-level objects `input_df` and `ncores`.
rafa <- function() {
  message('rafa')

  # Map the geocodebr address fields onto the column names used in `input_df`.
  fields <- geocodebr::setup_address_fields(
    logradouro = 'logradouro',
    numero = 'numero',
    cep = 'cep',
    bairro = 'bairro',
    municipio = 'municipio',
    estado = 'uf'
  )

  # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
  df_rafa2 <- geocodebr::geocode(
    addresses_table = input_df,
    address_fields = fields,
    n_cores = ncores,
    progress = TRUE
  )
}



rafa_arrow <- function(){ message('rafa_arrow')
fields <- geocodebr::setup_address_fields(
logradouro = 'logradouro',
numero = 'numero',
cep = 'cep',
bairro = 'bairro',
municipio = 'municipio',
estado = 'uf'
)

df_rafa_arrow <- geocodebr::geocode(
# Benchmark case: geocode `input_df` with geocodebr::geocode() WITHOUT the
# matched-address column (keep_matched_address = FALSE).
# Relies on the script-level objects `input_df`, `fields` and `ncores`.
rafaF <- function() {
  message('rafa F')

  # Spell out TRUE/FALSE: `T` and `F` are ordinary variables that can be
  # reassigned, which would silently change what this benchmark measures.
  df_rafaF <- geocodebr::geocode(
    addresses_table = input_df,
    address_fields = fields,
    n_cores = ncores,
    keep_matched_address = FALSE,
    progress = TRUE
  )
}

dani <- function(){ message('dani')
fields <- geocodebr::setup_address_fields(
logradouro = 'logradouro',
numero = 'numero',
cep = 'cep',
bairro = 'bairro',
municipio = 'municipio',
estado = 'uf'
)


df_dani <- geocodebr:::geocode_dani(
# Benchmark case: geocode `input_df` with geocodebr::geocode() WITH the
# matched-address column (keep_matched_address = TRUE).
# Relies on the script-level objects `input_df`, `fields` and `ncores`.
rafaT <- function() {
  message('rafa T')

  # Spell out TRUE: `T` is an ordinary variable that can be reassigned, which
  # would silently change what this benchmark measures.
  df_rafaT <- geocodebr::geocode(
    addresses_table = input_df,
    address_fields = fields,
    n_cores = ncores,
    keep_matched_address = TRUE,
    progress = TRUE
  )
}
Expand All @@ -148,13 +120,16 @@ dani <- function(){ message('dani')


mb <- microbenchmark::microbenchmark(
dani = dani(),
rafa = rafa(),
rafa_arrow = rafa_arrow(),
times = 10
rafa_drop = rafaF(),
rafa_keep = rafaT(),
times = 5
)
mb

library(profvis)
profvis({
rafaT()
})



Expand Down Expand Up @@ -372,3 +347,7 @@ data.table::setnames(


table(df$match_type.d, df$match_type.p)




0 comments on commit 1b82156

Please sign in to comment.