Nothing
#' @keywords internal
geocode_orig <- function(
enderecos,
campos_endereco = definir_campos(),
resultado_completo = FALSE,
resolver_empates = TRUE,
h3_res = NULL,
resultado_sf = FALSE,
verboso = TRUE,
cache = TRUE,
n_cores = NULL
) {
# nocov start
# ## ---- tiny timing toolkit (self-contained) ------------------------------
# .make_timer <- function(verbose = TRUE) {
# .marks <- list()
# .t0_rt <- proc.time()[["elapsed"]] # monotonic wall clock
# .t_prev <- .t0_rt
#
# fmt <- function(secs) sprintf("%.3f s", secs)
#
# mark <- function(label) {
# now <- proc.time()[["elapsed"]]
# step <- now - .t_prev
# total <- now - .t0_rt
# .marks <<- append(.marks, list(list(label = label, step = step, total = total)))
# .t_prev <<- now
# if (verbose) message(sprintf("[%s] +%s (total %s)", label, fmt(step), fmt(total)))
# invisible(now)
# }
#
# summary <- function(print_summary = verbose) {
# if (length(.marks) == 0) return(invisible(data.frame()))
# df <- data.frame(
# step = vapply(.marks, `[[`, "", "label"),
# step_sec = vapply(.marks, `[[`, 0.0, "step"),
# total_sec = vapply(.marks, `[[`, 0.0, "total"),
# stringsAsFactors = FALSE
# ) |>
# dplyr::mutate(step_relative = round(step_sec / max(total_sec)*100, 1))
#
# if (print_summary) {
# message("— Timing summary —")
# print(df, row.names = FALSE)
# }
# df
# }
#
# time_it <- function(label, expr) {
# force(label)
# res <- eval.parent(substitute(expr))
# mark(label)
# invisible(res)
# }
#
# list(mark = mark, summary = summary, time_it = time_it)
# }
# timer <- .make_timer(verbose = isTRUE(verboso))
# on.exit(timer$summary(), add = TRUE)
# ## -----------------------------------------------------------------------
# check input
checkmate::assert_data_frame(enderecos)
checkmate::assert_logical(resultado_completo, any.missing = FALSE, len = 1)
checkmate::assert_logical(resolver_empates, any.missing = FALSE, len = 1)
checkmate::assert_logical(resultado_sf, any.missing = FALSE, len = 1)
checkmate::assert_logical(verboso, any.missing = FALSE, len = 1)
checkmate::assert_logical(cache, any.missing = FALSE, len = 1)
checkmate::assert_numeric(
h3_res,
null.ok = TRUE,
lower = 0,
upper = 15,
max.len = 16
)
campos_endereco <- assert_and_assign_address_fields(
campos_endereco,
enderecos
)
# normalize input data -------------------------------------------------------
# standardizing the addresses table to increase the chances of finding a match
# in the CNEFE data
if (verboso) {
message_standardizing_addresses()
}
# systime start 66666 ----------------
# timer$mark("Start")
input_padrao <- enderecobr::padronizar_enderecos(
enderecos,
campos_do_endereco = enderecobr::correspondencia_campos(
logradouro = campos_endereco[["logradouro"]],
numero = campos_endereco[["numero"]],
cep = campos_endereco[["cep"]],
bairro = campos_endereco[["localidade"]],
municipio = campos_endereco[["municipio"]],
estado = campos_endereco[["estado"]]
),
formato_estados = "sigla",
formato_numeros = 'integer'
)
# keep and rename colunms of input_padrao to use the
# same column names used in cnefe data set
data.table::setDT(input_padrao)
cols_to_keep <- names(input_padrao)[names(input_padrao) %like% '_padr']
input_padrao <- input_padrao[, .SD, .SDcols = c(cols_to_keep)]
names(input_padrao) <- c(gsub("_padr", "", names(input_padrao)))
if ('bairro' %in% names(input_padrao)) {
data.table::setnames(
x = input_padrao,
old = 'bairro',
new = 'localidade'
)
}
# systime padronizacao 66666 ----------------
# timer$mark("Padronizacao")
# create temp id
data.table::setDT(enderecos)[, tempidgeocodebr := 1:nrow(input_padrao)]
input_padrao[, tempidgeocodebr := 1:nrow(input_padrao)]
# temp coluna de logradouro q sera usada no match probabilistico
input_padrao[, temp_lograd_determ := NA_character_]
input_padrao[, similaridade_logradouro := NA_real_]
# # sort input data
# input_padrao <- input_padrao[order(estado, municipio, logradouro, numero, cep, localidade)]
# downloading cnefe
cnefe_dir <- download_cnefe(
tabela = 'todas',
verboso = verboso,
cache = cache
)
# creating a temporary db and register the input table data
con <- create_geocodebr_db(n_cores = n_cores)
# register standardized input data
input_padrao_arrw <- arrow::as_arrow_table(input_padrao)
DBI::dbWriteTableArrow(
con,
name = "input_padrao_db",
input_padrao_arrw,
overwrite = TRUE,
temporary = TRUE
)
# systime register standardized 66666 ----------------
# timer$mark("Register standardized input")
# cria coluna "log_causa_confusao" identificando logradouros que geram confusao
# issue https://github.com/ipeaGIT/geocodebr/issues/67
cria_col_logradouro_confusao(con)
# create an empty output table that will be populated -----------------------------------------------
# Define schema
schema_output_db <- arrow::schema(
tempidgeocodebr = arrow::int32(),
lat = arrow::float16(), # Equivalent to NUMERIC(8,6)
lon = arrow::float16(),
endereco_encontrado = arrow::string(),
logradouro_encontrado = arrow::string(),
tipo_resultado = arrow::string(),
contagem_cnefe = arrow::int32(),
desvio_metros = arrow::int32(),
log_causa_confusao = arrow::boolean(),
similaridade_logradouro = arrow::float16()
)
if (isTRUE(resultado_completo)) {
schema_output_db <- arrow::schema(
tempidgeocodebr = arrow::int32(),
lat = arrow::float16(), # Equivalent to NUMERIC(8,6)
lon = arrow::float16(),
endereco_encontrado = arrow::string(),
logradouro_encontrado = arrow::string(),
tipo_resultado = arrow::string(),
contagem_cnefe = arrow::int32(),
desvio_metros = arrow::int32(),
log_causa_confusao = arrow::boolean(),
#
numero_encontrado = arrow::int32(),
localidade_encontrada = arrow::string(),
cep_encontrado = arrow::string(),
municipio_encontrado = arrow::string(),
estado_encontrado = arrow::string(),
similaridade_logradouro = arrow::float16()
)
}
output_db_arrow <- arrow::arrow_table(schema = schema_output_db)
DBI::dbWriteTableArrow(
con,
name = "output_db",
output_db_arrow,
overwrite = TRUE,
temporary = TRUE
)
# START MATCHING -----------------------------------------------
# start progress bar
if (verboso) {
prog <- create_progress_bar(input_padrao)
message_looking_for_matches()
}
n_rows <- nrow(input_padrao)
matched_rows <- 0
# start matching
for (match_type in all_possible_match_types) {
# get key cols
key_cols <- get_key_cols(match_type)
if (verboso) {
update_progress_bar(matched_rows, match_type)
}
# somente busca essa categoria match_type se todas colunas estiverem na base
# caso contrario, passa para proxima categoria
if (all(key_cols %in% names(input_padrao))) {
# select match function
match_fun <-
if (match_type %in% c(number_exact_types, exact_types_no_number)) {
match_cases
} else if (match_type %in% number_interpolation_types) {
match_weighted_cases
} else if (
match_type %in%
c(probabilistic_exact_types, probabilistic_types_no_number)
) {
match_cases_probabilistic
} else if (match_type %in% probabilistic_interpolation_types) {
match_weighted_cases_probabilistic
}
n_rows_affected <- match_fun(
con,
match_type = match_type,
key_cols = key_cols,
resultado_completo = resultado_completo
)
matched_rows <- matched_rows + n_rows_affected
# leave the loop early if we find all addresses before covering all cases
if (matched_rows == n_rows) break
}
}
if (verboso) {
finish_progress_bar(matched_rows)
}
# systime matching 66666 ----------------
# timer$mark("Matching")
if (verboso) {
message_preparando_output()
}
# casos de empate -----------------------------------------------
empates_resolvidos <- trata_empates_geocode_duckdb(
con,
resultado_completo,
resolver_empates,
verboso
)
# systime resolve empates 66666 ----------------
# timer$mark("Resolve empates")
# bring original input back -----------------------------------------------
# output with all original columns
duckdb::dbWriteTable(
con,
"input_db",
enderecos,
temporary = TRUE,
overwrite = TRUE
)
# enderecos_arrw <- arrow::as_arrow_table(enderecos)
# DBI::dbWriteTableArrow(con, name = "input_db", enderecos_arrw,
# overwrite = TRUE, temporary = TRUE)
# systime write original input back 66666 ----------------
# timer$mark("Write original input back")
# add precision column ----------------
output_table_to_use <- ifelse(
empates_resolvidos == 0,
'output_db',
'output_db2'
)
add_precision_col(con, update_tb = output_table_to_use)
# systime add precision 66666 ----------------
# timer$mark("Add precision")
x_columns <- names(enderecos)
output_df <- merge_results_to_input(
con,
x = 'input_db',
y = output_table_to_use,
key_column = 'tempidgeocodebr',
select_columns = x_columns,
resultado_completo = resultado_completo
)
# systime merge results 66666 ----------------
# timer$mark("Merge results")
# drop geocodebr temp id column
data.table::setDT(output_df)
output_df[, tempidgeocodebr := NULL]
# Disconnect from DuckDB when done
duckdb::dbDisconnect(con)
# add H3
if (!is.null(h3_res)) {
for (i in h3_res) {
colname <- paste0(
'h3_',
formatC(h3_res, width = 2, flag = "0")
)
output_df[
!is.na(lat),
{{ colname }} := h3r::latLngToCell(lat = lat, lng = lon, resolution = i)
]
}
# drop data.table
data.table::setDF(output_df)
# systime add h3 66666 ----------------
# timer$mark("Add H3")
}
# convert df to simple feature
if (isTRUE(resultado_sf)) {
output_sf <- sfheaders::sf_point(
obj = output_df,
x = 'lon',
y = 'lat',
keep = TRUE
)
sf::st_crs(output_sf) <- 4674
# systime convert to sf 66666 ----------------
# timer$mark("Convert to sf")
return(output_sf)
}
return(output_df[])
} # nocov end
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.