Skip to content

Commit

Permalink
fix error
Browse files Browse the repository at this point in the history
  • Loading branch information
rafapereirabr committed Dec 17, 2024
1 parent d0d62b7 commit 4401cd5
Show file tree
Hide file tree
Showing 6 changed files with 107 additions and 16 deletions.
18 changes: 14 additions & 4 deletions R/create_geocodebr_db.R
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ create_geocodebr_db <- function(db_path = tempdir(),
ncores = NULL
){



## check input -------------------------------------------------------

checkmate::assert_directory_exists(db_path)
Expand All @@ -34,10 +36,18 @@ create_geocodebr_db <- function(db_path = tempdir(),

## create db connection -------------------------------------------------------

# remove traces of previous db
old_duckdb <- list.files( db_path, pattern = '.duckdb', full.names = TRUE)

if (length(old_duckdb)>0) {
unlink(old_duckdb, recursive = TRUE, force = TRUE, expand = TRUE)
}


## this creates a persistent database which allows DuckDB to
## perform **larger-than-memory** workloads
db_path <- fs::file_temp(tmp_dir = db_path, ext = '.duckdb')
con <- duckdb::dbConnect(duckdb::duckdb(), dbdir=db_path)
con <- duckdb::dbConnect(duckdb::duckdb(), dbdir= db_path ) # db_path ":memory:"

## configure db connection -------------------------------------------------------

Expand All @@ -60,9 +70,9 @@ create_geocodebr_db <- function(db_path = tempdir(),
gc()
}

# Load CNEFE data and write it to DuckDB
cnefe <- arrow_open_dataset(cache_dir)
duckdb::duckdb_register_arrow(con, "cnefe", cnefe)
# # Load CNEFE data and write it to DuckDB
# cnefe <- arrow_open_dataset(cache_dir)
# duckdb::duckdb_register_arrow(con, "cnefe", cnefe)

return(con)
}
16 changes: 8 additions & 8 deletions R/geocode.R
Original file line number Diff line number Diff line change
Expand Up @@ -121,16 +121,16 @@ geocode <- function(input_table,
if (isFALSE(download_success)) { return(invisible(NULL)) }

# Load CNEFE data and write to DuckDB
cnefe <- arrow_open_dataset(geocodebr_env$cache_dir)
cnefe <- arrow_open_dataset(geocodebr::get_cache_dir())
duckdb::duckdb_register_arrow(con, "cnefe", cnefe)

# # more than 2x SLOWER
# dir <- fs::path(geocodebr_env$cache_dir, "/**/*.parquet")
# DBI::dbExecute(con,
# sprintf("CREATE TEMPORARY TABLE cnefe AS SELECT * FROM read_parquet('%s')",
# dir))

## DBI::dbRemoveTable(con, 'cnefe')
# # # more than 2x SLOWER
# # dir <- fs::path(geocodebr_env$cache_dir, "/**/*.parquet")
# # DBI::dbExecute(con,
# # sprintf("CREATE VIEW cnefe AS SELECT * FROM read_parquet('%s')",
# # dir))
#
# ## DBI::dbRemoveTable(con, 'cnefe')


# Narrow search scope in cnefe to municipalities and zip codes present in input
Expand Down
2 changes: 1 addition & 1 deletion R/match_case_insgle_string.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
#' @return Writes the result of the left join as a new table in con
#'
#' @keywords internal
match_case_insgle_string <- function(con, x, y, output_tb, key_cols, precision){
match_case_single_string <- function(con, x, y, output_tb, key_cols, precision){

# x = 'input_padrao_db'
# y = 'filtered_cnefe_cep'
Expand Down
28 changes: 28 additions & 0 deletions inst/extdata/states_bbox.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
abbrev_state,xmin,ymin,xmax,ymax
RO,-66.810253111633,-13.6937001241688,-59.7743528516697,-7.96929420061669
AC,-73.9904499689983,-11.1455614675226,-66.6238395939147,-7.11182413031569
AM,-73.8015583247804,-9.8180458760763,-56.0975580377312,2.24663056119678
RR,-64.8252506620746,-1.58063310691207,-58.8868843947234,5.27184107741844
PA,-58.8983283997823,-9.84115347777047,-46.0609467928189,2.59102704641964
AP,-54.8762501802638,-1.23617637395988,-49.8762166349157,4.43674631702481
TO,-50.7420639777669,-13.4677126641889,-45.6968832022868,-5.16857194577177
MA,-48.7551507366124,-10.2617647031021,-41.7958843809523,-1.04397682226038
PI,-45.9942948375065,-10.9287563495505,-40.3705097622986,-2.73930891991561
CE,-41.4235180088295,-7.85755775830388,-37.2532911248468,-2.78423065030707
RN,-38.5820985224728,-6.98273481293201,-34.9685380203107,-4.83155550793584
PB,-38.7656034316372,-8.3029550511595,-34.7928849744584,-6.02656885876282
PE,-41.3583361843337,-9.48289744404414,-32.3911090093679,-3.82878357931473
AL,-38.2375889351762,-10.5011858051009,-35.1519504514605,-8.81312703208501
SE,-38.2450317495498,-11.5682235623433,-36.3938678211131,-9.51502990879311
BA,-46.6170967574251,-18.3485602613306,-37.3411466536959,-8.53282057107809
MG,-51.0459443779647,-22.9227552583774,-39.8568288906276,-14.2331806646433
ES,-41.8796410723695,-21.3017819688833,-28.835943541245,-17.8919446401203
RJ,-44.8893205505007,-23.3689319635892,-40.9585185182187,-20.7632054626486
SP,-53.1101115316335,-25.3123209500439,-44.1613651644231,-19.7798888098656
PR,-54.6193046378125,-26.7171220086855,-48.0235374346287,-22.5163770699407
SC,-53.8368638573633,-29.3548140092243,-48.327119984172,-25.9558418227432
RS,-57.6437862155364,-33.7520812714679,-49.6914598601596,-27.0823150313345
MS,-58.1685116881068,-24.0685836291468,-50.9229090804468,-17.1666306285883
MT,-61.6333603451517,-18.0415807513547,-50.2248063821325,-7.34902838258037
GO,-53.2512093338537,-19.4991508119681,-45.9069681830834,-12.3957550632074
DF,-48.2857910682451,-16.0502642631457,-47.3083869870584,-15.5002551538495

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

53 changes: 53 additions & 0 deletions tests/tests_rafa/tests_arrow_vs_duckdb.R
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ input_df <- rbind(input_df,input_df,input_df,input_df,input_df,input_df,input_df
input_df$ID <- 1:nrow(input_df)



# input_table = input_df
# logradouro = "nm_logradouro"
# numero = "Numero"
Expand Down Expand Up @@ -76,3 +77,55 @@ tictoc::toc()
#> 7.3 milhoes: 113.01 (in memory)
#> 7.3 milhoes: 101.31 (disk) match_case
#> 7.3 milhoes: 57.36 (disk) match_aggregated_cases





# Single hand-written address record used as a minimal input for manual checks.
# The same data.frame is bound to both `input_df` and `input_table`.
input_df <- input_table <- data.frame(
ID=666,
nm_logradouro = 'SQS 308 Bloco C',
Numero = 204,
Complemento = 'Bloco C',
Cep = 70355030,
Bairro = 'Asa sul',
nm_municipio = 'Brasilia',
nm_uf = 'DF'
)

# Open the CNEFE dataset from the geocodebr cache directory and pull into
# memory only the rows whose `cep` matches the test address. Note the filter
# compares against a hyphenated string, while `Cep` above is numeric.
# NOTE(review): `filter()` / `collect()` are called unqualified -- presumably
# dplyr is attached earlier in this script; confirm.
cnefe_cep <- arrow::open_dataset( geocodebr::get_cache_dir())
df <- filter(cnefe_cep, cep =='70355-030') |> collect()



################## calculate precision as the area in m2
# Extent of the matched CNEFE points along each axis.
# Assumes `df$lon` / `df$lat` are decimal degrees, which the meters-per-degree
# factor used below implies.
range_lon <- max(df$lon) - min(df$lon)
range_lat <- max(df$lat) - min(df$lat)

# Approximate length of one degree of latitude, in meters.
meters_per_degree <- 111320

# One degree of longitude shrinks by cos(latitude). `cos()` expects radians,
# so the mean latitude must be converted from degrees first; passing degrees
# directly gives a meaningless (and frequently negative) scale factor.
mean_lat_rad <- mean(df$lat) * pi / 180
lon_meters <- meters_per_degree * range_lon * cos(mean_lat_rad)
lat_meters <- meters_per_degree * range_lat

# NOTE(review): the full ranges are used as the ellipse semi-axes here; if the
# intent is the ellipse inscribed in the bounding box, each should be halved.
area <- pi * lon_meters * lat_meters
area
##################
# Alternative spread measure: two standard deviations per axis (in degrees).
# This overwrites the min/max ranges computed above.
range_lon <- sd(df$lon) * 2
range_lat <- sd(df$lat) * 2



# Build the SQL statement that creates table `{output_tb}`:
#   1. aggregate `{y}` by `{cols_group}`, keeping the mean lon/lat and the
#      lon/lat ranges (max - min) of each group;
#   2. left join the aggregate onto `{x}` using `{join_condition}` and keep
#      only rows that found a match (non-NULL lon).
# The outer SELECT returns only ID, lon and lat; range_lon / range_lat are
# computed in the CTE but not yet propagated to the output.
# NOTE(review): the placeholders (output_tb, cols_select, y, cols_group, x,
# join_condition) are not defined in the visible part of this script and must
# exist in the calling environment when glue() is evaluated.
# NOTE(review): the SELECT list in the CTE ends with a trailing comma before
# FROM -- presumably relying on DuckDB accepting trailing commas; confirm.
query_aggregate_and_match <- glue::glue(
"CREATE TABLE {output_tb} AS
WITH pre_aggregated_cnefe AS (
SELECT {cols_select}, AVG(lon) AS lon, AVG(lat) AS lat,
MAX(lon) - MIN(lon) as range_lon, MAX(lat) - MIN(lat) as range_lat,
FROM {y}
GROUP BY {cols_group}
)
SELECT {x}.ID, pre_aggregated_cnefe.lon, pre_aggregated_cnefe.lat
FROM {x} AS {x}
LEFT JOIN pre_aggregated_cnefe
ON {join_condition}
WHERE pre_aggregated_cnefe.lon IS NOT NULL;"
)


0 comments on commit 4401cd5

Please sign in to comment.