Skip to content

Commit

Permalink
reverse geocode 2
Browse files Browse the repository at this point in the history
  • Loading branch information
rafapereirabr committed Dec 21, 2024
1 parent a7eeed2 commit 5a6a9fe
Show file tree
Hide file tree
Showing 6 changed files with 302 additions and 18 deletions.
2 changes: 1 addition & 1 deletion R/geocode_rafa.R
Original file line number Diff line number Diff line change
Expand Up @@ -626,7 +626,7 @@ geocode_rafa <- function(input_table,
}

# Disconnect from DuckDB when done
duckdb::duckdb_unregister_arrow(con, 'cnefe')
duckdb::duckdb_unregister_arrow(con, 'filtered_cnefe')
duckdb::dbDisconnect(con, shutdown=TRUE)
# gc()

Expand Down
176 changes: 176 additions & 0 deletions R/reverse_geocode2.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
#' Reverse geocoding coordinates in Brazil based on CNEFE data
#'
#' @description
#' Takes a data frame containing coordinates (latitude and longitude) and
#' returns the address in CNEFE that is the closest to the input coordinates.
#' Latitude and longitude inputs are limited to possible values within the
#' bounding box of Brazil.
#'
#' @param input_table A data frame. It must contain the columns `'id'`, `'lat'`, `'lon'`.
#' @template n_cores
#' @template progress
#' @template cache
#'
#' @return A `"data.frame"` object.
#' @family Reverse geocoding
#' @examplesIf identical(tolower(Sys.getenv("NOT_CRAN")), "true")
#'
#' # # input data
#' # df_coords <- data.frame(
#' # id = 1:6,
#' # lon = c(-67.83112, -67.83559, -67.81918, -43.47110, -51.08934, -67.8191),
#' # lat = c(-9.962392, -9.963436, -9.972736, -22.695578, -30.05981, -9.97273)
#' # )
#' #
#' # # reverse geocode
#' # df_addresses <- geocodebr::reverse_geocode2(
#' # input_table = df_coords,
#' # progress = TRUE
#' # )
#'
reverse_geocode2 <- function(input_table,
n_cores = 1,
progress = TRUE,
cache = TRUE
){

# check input
checkmate::assert_data_frame(input_table)
checkmate::assert_logical(progress)
checkmate::assert_number(n_cores, null.ok = TRUE)
checkmate::assert_logical(cache)
checkmate::assert_names(
names(input_table),
must.include = c("id","lon","lat"),
.var.name = "input_table"
)

# prep input -------------------------------------------------------

# 1 degree of latitude is always 111111.1 meters
# 1 degree of longitude is 111111.1 * cos(lat)

# create a small range around coordinates
margin <- 0.001 # 0.0001

data.table::setDT(input_table)
input_table[, c("lon_min", "lon_max", "lat_min", "lat_max") :=
.(lon - margin, lon + margin, lat - margin, lat + margin)]

bbox_lon_min <- min(input_table$lon_min)
bbox_lon_max <- max(input_table$lon_max)
bbox_lat_min <- min(input_table$lat_min)
bbox_lat_max <- max(input_table$lat_max)

# check if input falls within Brazil
bbox_brazil <- data.frame(
xmin = -73.99044997,
ymin = -33.752081270872,
xmax = -28.83594354,
ymax = 5.27184108017288)

error_msg <- 'Input coordinates outside the bounding box of Brazil.'
if(bbox_lon_min < bbox_brazil$xmin){stop(error_msg)}
if(bbox_lon_max > bbox_brazil$xmax){stop(error_msg)}
if(bbox_lat_min < bbox_brazil$ymin){stop(error_msg)}
if(bbox_lat_max > bbox_brazil$ymax){stop(error_msg)}

# download cnefe -------------------------------------------------------

download_cnefe(state = "all", progress = progress)


# create db connection -------------------------------------------------------
con <- create_geocodebr_db(n_cores = n_cores)

# Narrow search scope in cnefe to bounding box
filtered_cnefe_coords <- arrow::open_dataset(get_cache_dir()) |>
dplyr::filter(lon > bbox_lon_min &
lon < bbox_lon_max &
lat > bbox_lat_min &
lat < bbox_lat_max) |>
dplyr::compute()

duckdb::duckdb_register_arrow(con, "filtered_cnefe_coords",
filtered_cnefe_coords)


# Convert input data frame to DuckDB table
duckdb::dbWriteTable(con, "input_table_db", input_table,
temporary = TRUE)


# Find cases nearby -------------------------------------------------------

query_join_cases_nearby <- glue::glue(
"CREATE TABLE cases_nearby AS
SELECT *
FROM input_table_db
LEFT JOIN filtered_cnefe_coords
ON input_table_db.lat_min < filtered_cnefe_coords.lat
AND input_table_db.lat_max > filtered_cnefe_coords.lat
AND input_table_db.lon_min < filtered_cnefe_coords.lon
AND input_table_db.lon_max > filtered_cnefe_coords.lon"
)

temp_n <- DBI::dbExecute(con, query_join_cases_nearby)


# find closest points -------------------------------------------------------

query_coordinates_diff <- glue::glue(
"ALTER TABLE cases_nearby ADD COLUMN lon_diff DOUBLE;
ALTER TABLE cases_nearby ADD COLUMN lat_diff DOUBLE;
UPDATE cases_nearby
SET lon_diff = ABS(lon - lon_1),
lat_diff = ABS(lat - lat_1);"
)

DBI::dbExecute(con, query_coordinates_diff)


# return closets ---------------

query_return_closest <- glue::glue(
"WITH ranked_rows AS (
SELECT
*,
ROW_NUMBER() OVER (
PARTITION BY id
ORDER BY lon_diff ASC, lat_diff ASC
) AS rn
FROM cases_nearby
)
SELECT *
FROM ranked_rows
WHERE rn = 1;"
)

output <- DBI::dbGetQuery(con, query_return_closest)




# organize output -------------------------------------------------

data.table::setDT(output)
output[, c('lon_min', 'lon_max', 'lat_min', 'lat_max',
'lon_diff', 'lat_diff', 'rn') := NULL]

data.table::setnames(
x = output,
old = c('lon_1', 'lat_1'),
new = c('lon_cnefe', 'lat_cnefe')
)





# Disconnect from DuckDB when done
duckdb::duckdb_unregister_arrow(con, 'filtered_cnefe_coords')
duckdb::dbDisconnect(con, shutdown=TRUE)

return(output)
}
72 changes: 72 additions & 0 deletions man/reverse_geocode2.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

62 changes: 46 additions & 16 deletions tests/tests_rafa/benchmark_reg_adm.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,20 +3,25 @@ library(tictoc)
library(dplyr)
library(data.table)
library(ipeadatalake)
library(mapview)
library(sfheaders)
library(sf)

mapview::mapviewOptions(platform = 'leafgl')
set.seed(42)

data_path <- system.file("extdata/sample_1.csv", package = "geocodebr")
input_df <- read.csv(data_path)


# rais --------------------------------------------------------------------

rais <- ipeadatalake::ler_rais(
ano = 2021,
ano = 2019,
tipo = 'estabelecimento',
as_data_frame = F,
colunas = c("id_estab", "cnpj_raiz", "logradouro",
"bairro", "codemun", "uf", "cep", "qt_vinc_ativos")
) |>
geoloc = T) |>
select("id_estab", "logradouro", "bairro", "codemun", "uf", "cep",
'lat', 'lon', 'Addr_type', 'Match_addr') |>
compute() |>
dplyr::slice_sample(n = 50000) |> # sample 50K
filter(uf != "IG") |>
filter(uf != "") |>
Expand All @@ -33,10 +38,11 @@ rais[, logradouro_no_numbers := gsub("\\d+", "", logradouro)]
rais[, logradouro_no_numbers := gsub(",", "", logradouro_no_numbers)]



# sample 10%
rais2 <- sample_frac(tbl = rais, 0.1)
gc(T)
data.table::setnames(
rais,
old = c('lat', 'lon'),
new = c('lat_arcgis', 'lon_arcgis')
)

tictoc::tic()
fields <- geocodebr::setup_address_fields(
Expand All @@ -48,28 +54,52 @@ fields <- geocodebr::setup_address_fields(
estado = 'uf'
)

df_geo <- geocodebr:::geocode(
addresses_table = rais2,
rais <- geocodebr:::geocode(
addresses_table = rais,
address_fields = fields,
n_cores = 20, # 7
progress = T
)
tictoc::toc()


result_arcgis <- table(rais$Addr_type) / nrow(rais) *100
result_geocodebr <- table(rais$match_type) / nrow(rais) *100

aaaa <- table(rais$match_type, rais$Addr_type) / nrow(rais) *100
aaaa <- as.data.frame(aaaa)
aaaa <- subset(aaaa, Freq>0)


arc <- ipeadatalake::ler_cadunico(
data = 202312,
tipo = 'familia',
as_data_frame = F,geoloc = T)
data.table::fwrite(aaaa, 'rais.csv', dec = ',', sep = '-')



t <- subset(rais, match_type=='case_09' & Addr_type== 'PointAddress')

t_arc <- sfheaders::sf_point(t[1,], x = 'lon_arcgis', y = 'lat_arcgis',keep = T)
t_geo <- sfheaders::sf_point(t[1,], x = 'lon', y = 'lat',keep = T)

st_crs(t_arc) <- 4674
st_crs(t_geo) <- 4674

mapview::mapviewOptions(platform = 'mapdeck', )

mapview(t_arc) + t_geo
sf::st_distance(t_geo, t_arc)



jp <- geocodebr::get_cache_dir() |>
geocodebr:::arrow_open_dataset() |>
filter(estado=="PB") |>
filter(municipio == "JOAO PESSOA") |>
collect()

head(jp)

subset(jp , logradouro_sem_numero %like% "DESEMBARGADOR SOUTO MAIOR")
subset(t , logradouro_no_numbers %like% "DESEMBARGADOR SOUTO MAIOR")


# cad unico --------------------------------------------------------------------
Expand Down
6 changes: 6 additions & 0 deletions tests/tests_rafa/reverse_geocode_tests.R
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,9 @@ tictoc::toc()

ttt <- data.frame(id=1, lat=-15.814192047159876, lon=-47.90534614672923)
reverse_geocode(df = ttt)





right join(uf, in)
2 changes: 1 addition & 1 deletion tests/tests_rafa/tests_arrow_vs_duckdb.R
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ microbenchmark::microbenchmark(dani = dani(),
# precision ------------------------------------------------------------------

input_df <- input_table <- data.frame(
ID=666,
id=666,
nm_logradouro = 'SQS 308 Bloco C',
Numero = 204,
Complemento = 'Bloco C',
Expand Down

0 comments on commit 5a6a9fe

Please sign in to comment.