#' The \code{NYC.CleanGeoBoro} function utilizes the rGBAT, rUSPS, and rNYCclean packages to clean and geocode NYC addresses using a one-digit borough code.
#'
#' @title Clean and geocode NYC addresses by borough code.
#' @name NYC.CleanGeoBoro
#' @aliases NYC.CleanGeoBoro
#' @import data.table
#' @import rGBAT
#' @import rUSPS
#' @import rNYCclean
#' @export NYC.CleanGeoBoro
#' @param in_df a data frame or data table containing NYC addresses. Required.
#' @param id_colname the name of the unique identifier column as string. Required.
#' @param addr1_colname the name of the input address line one column as string. Required.
#' @param addr2_colname the name of the input address line two column as string. Optional; defaults to \code{NULL}, in which case address line two is not used.
#' @param boro_colname the name of the input one-digit borough code column as string. Required.
#' @param source_cols vector of column names from the input data frame to be returned with geocoder results. Required.
#' @param geocode_fields vector of field names generated by the geocoder to be returned with geocoder results. Required.
#' @param GBAT_name the release or version of DCP's Geosupport geocoding software as string. Required.
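#' @param in_clus the number of clusters available to the function as integer. Optional; defaults to 1.
#' @param USPS_verify if TRUE, addresses that are still not geocoded after the preceding steps will be run through the IBM Infosphere address verification service and geocoded again using the verified address and zip code. Optional; defaults to \code{TRUE}.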
#' @return A data frame or data table (depending on format of \code{in_df}) of cleaned and geocoded NYC addresses.
#' @usage NYC.CleanGeoBoro(in_df, id_colname, addr1_colname,
#' addr2_colname=NULL, boro_colname, source_cols, geocode_fields,
#' GBAT_name, in_clus=1, USPS_verify=TRUE)
#' @examples # create a data frame of addresses
#' ADDR <- c("80 CENTRE","125 WORTH S","42 09 28 S","253 BROADW",
#' "620 ATLANT","125 WOR","1 FRANKLIN","1 FRANKLIN",
#' "1 1 1 AVE","1 1 1 AVE")
#' BORO_CODE <- c(1,1,4,1,3,1,3,3,1,1)
#' u_id <- 1:length(ADDR)
#' df = data.frame(u_id, ADDR, BORO_CODE)
#'
#' #specify columns from input data frame to retain
#' source_cols <- c('u_id')
#'
#' #specify geocoder return fields
#' geocode_fields <- c('F1E.output.bin','F1E.output.bbl','F1E.longitude',
#' 'F1E.latitude','JN.ZCTA_10','F1E.output.ret_code','F1E.output.msg')
#'
#' #clean and geocode by borough code
#' gc_df <- NYC.CleanGeoBoro(in_df=df,id_colname="u_id",
#' addr1_colname="ADDR",boro_colname="BORO_CODE",
#' source_cols=source_cols, geocode_fields=geocode_fields,
#' GBAT_name="18B")
#'
#' #preview results
#' head(gc_df)
#'
#' #view metadata
#' NYC.CleanGeoBoro_metadata
NYC.CleanGeoBoro <- function (in_df, id_colname, addr1_colname, addr2_colname=NULL, boro_colname, source_cols, geocode_fields, GBAT_name, in_clus=1, USPS_verify=TRUE){
###create copy of input geocode return fields###
gc_flds <- geocode_fields
bul_str <- "\xE2\x80\xA2"
Encoding(bul_str) <- "UTF-8"
################################################
###COMMON INACCURATE ADDRESS REASSIGNMENTS######
################################################
#c("CENTRAL PARK","BRIGHTON BEACH","PROSPECT PARK","WASHINGTON SQUARE PARK","FT HAMILTON","ASTORIA PARK","CLOVE LAKE","ORCHARD BEACH","SOUTH BEACH","SUNSET PARK","BATTERY PARK","LINDEN PARK","RIVERSIDE PARK","BRONX PARK")
f_vec <- c("1011110001","3087250001","3011170001","1005490001","3061530001","4008980001","5003190001","2056500001","5035250200","3009210001","1000030001","4019760028","1012540002","2043360001")
f_col <- 'F1E.output.bbl'
################################################
################################################
################################################
ptm <- proc.time()
if(nrow(in_df)==0) stop("Your dataset is empty.")
###detect if data.table or data.frame###
is.DT <- "data.table" %in% class(in_df)
###if data.frame: convert to data.table; else: create copy of data.table so as not to affect original###
if(!is.DT){
in_df <- as.data.table(in_df)
} else{
in_df <- copy(in_df)
}
##################################################################
###PART 1a: GEOCODE RAW FREEFORM ADDR1 FIELD USING BOROUGH CODE###
##################################################################
gc.type <- "FREEFORM ADDRESS LINE 1 WITH BOROUGH CODE"
###ensure borough code is 1 character in length (strip spaces, keep first character)###
bc_col_name <- "BOROCODE_new"
in_df[,(bc_col_name) := substr(gsub(" ","",as.character(get(boro_colname))), 1, 1)]
###prepare the address 1 column by applying a function that removes illegal characters###
addr1_col_name <- "ADDR1_new"
in_df[,(addr1_col_name) := rNYCclean::prep_addr(get(addr1_colname))]
###use helper function to geocode and remove fails###
GC.list <- GC_routine(in_clus=in_clus, in_df=in_df, id_colname=id_colname, hse_num_col_name=NULL, addr_col_name=addr1_col_name, third_col_name=bc_col_name, source_cols=source_cols, geocode_fields=geocode_fields, addr_type="boro_code", GBAT_name=GBAT_name, gc_type=gc.type, results_df=NULL, filter_col_name=f_col, filter_vector=f_vec, split=FALSE)
###assign results df###
results_df <- GC.list[["results_df"]]
###assign rejects df for fails... records for unprocessed addresses will be returned at end of function###
rejects_df <- GC.list[["rejects_df"]]
###assign messages df###
message_df <- GC.list[["message_df"]]
###############################################################
###PART 1b: GEOCODE RAW SPLIT ADDR1 FIELD USING BOROUGH CODE###
###############################################################
gc.type <- "SPLIT ADDRESS LINE 1 WITH BOROUGH CODE"
###create data frame of addresses that failed###
new_fails <- in_df[!(get(id_colname) %in% results_df[,get(id_colname)])]
if(nrow(new_fails)>0){
###use helper function to geocode and remove fails###
GC.list <- GC_routine(in_clus=in_clus, in_df=new_fails, id_colname=id_colname, hse_num_col_name=NULL, addr_col_name=addr1_col_name, third_col_name=bc_col_name, source_cols=source_cols, geocode_fields=geocode_fields, addr_type="boro_code", GBAT_name=GBAT_name, gc_type=gc.type, results_df=results_df, filter_col_name=f_col, filter_vector=f_vec, split=TRUE)
###assign results df###
results_df <- GC.list[["results_df"]]
###assign messages df###
message_df <- rbindlist(list(message_df, GC.list[["message_df"]]),use.names=TRUE)
} else{
cat(paste0(bul_str, " 0 records successfully geocoded by ", gc.type, ".\n"))
###assign messages df###
message_df <- rbindlist(list(message_df, data.table(num_rec=c(0), desc=c(gc.type), error=c(0))),use.names=TRUE)
}
#########################################################################
###PART 1c: GEOCODE PAD OF RAW FREEFORM ADDR1 FIELD USING BOROUGH CODE###
#########################################################################
gc.type <- "PAD MERGE OF FREEFORM ADDRESS LINE 1 WITH BOROUGH CODE"
###create data frame of addresses that failed###
new_fails <- in_df[!(get(id_colname) %in% results_df[,get(id_colname)])]
if(nrow(new_fails)>0){
######################################################
###additional step to deal with numero-alpha combos###
######################################################
pad_in_addr_col_name <- "ADDR1_pad_in"
new_fails[,(pad_in_addr_col_name) := rNYCclean::prep_addr(gsub("^(.*[[:digit:]])(ND|RD|ST|TH)(.*)$","\\1 \\2",get(addr1_col_name)))]
######################################################
######################################################
######################################################
###get PAD address if available###
pad_addr_col_name <- "ADDR1_pad"
#new_fails <- rNYCclean::pad_addr(new_fails, pad_addr_col_name, addr1_col_name, bc_col_name, "boro_code", GBAT_name)
new_fails <- rNYCclean::pad_addr(new_fails, pad_addr_col_name, pad_in_addr_col_name, bc_col_name, "boro_code", GBAT_name)
new_fails <- new_fails[!(is.na(get(pad_addr_col_name))),]
if(nrow(new_fails)>0){
###use helper function to geocode and remove fails###
GC.list <- GC_routine(in_clus=in_clus, in_df=new_fails, id_colname=id_colname, hse_num_col_name=NULL, addr_col_name=pad_addr_col_name, third_col_name=bc_col_name, source_cols=source_cols, geocode_fields=geocode_fields, addr_type="boro_code", GBAT_name=GBAT_name, gc_type=gc.type, results_df=results_df, filter_col_name=f_col, filter_vector=f_vec, split=FALSE)
###assign results df###
results_df <- GC.list[["results_df"]]
###assign messages df###
message_df <- rbindlist(list(message_df, GC.list[["message_df"]]),use.names=TRUE)
} else{
cat(paste0(bul_str, " 0 records successfully geocoded by ", gc.type, ".\n"))
###assign messages df###
message_df <- rbindlist(list(message_df, data.table(num_rec=c(0), desc=c(gc.type), error=c(0))),use.names=TRUE)
}
} else{
cat(paste0(bul_str, " 0 records successfully geocoded by ", gc.type, ".\n"))
###assign messages df###
message_df <- rbindlist(list(message_df, data.table(num_rec=c(0), desc=c(gc.type), error=c(0))),use.names=TRUE)
}
######################################################################
###PART 1d: GEOCODE PAD OF RAW SPLIT ADDR1 FIELD USING BOROUGH CODE###
######################################################################
gc.type <- "PAD MERGE OF SPLIT ADDRESS LINE 1 WITH BOROUGH CODE"
new_fails <- new_fails[!(get(id_colname) %in% results_df[,get(id_colname)])]
if(nrow(new_fails)>0){
###use helper function to geocode and remove fails###
GC.list <- GC_routine(in_clus=in_clus, in_df=new_fails, id_colname=id_colname, hse_num_col_name=NULL, addr_col_name=pad_addr_col_name, third_col_name=bc_col_name, source_cols=source_cols, geocode_fields=geocode_fields, addr_type="boro_code", GBAT_name=GBAT_name, gc_type=gc.type, results_df=results_df, filter_col_name=f_col, filter_vector=f_vec, split=TRUE)
###assign results df###
results_df <- GC.list[["results_df"]]
###assign messages df###
message_df <- rbindlist(list(message_df, GC.list[["message_df"]]),use.names=TRUE)
} else{
###assign messages df###
message_df <- rbindlist(list(message_df, data.table(num_rec=c(0), desc=c(gc.type), error=c(0))),use.names=TRUE)
cat(paste0(bul_str, " 0 records successfully geocoded by ", gc.type, ".\n"))
}
########################################################################
###PART 1e: GEOCODE SPELL CHECKED FREEFORM ADDRESS USING BOROUGH CODE###
########################################################################
gc.type <- "FREEFORM ADDRESS LINE 1 SPELL CHECKED WITH BOROUGH CODE"
###create data frame of addresses that failed###
new_fails <- in_df[!(get(id_colname) %in% results_df[,get(id_colname)])]
if(nrow(new_fails)>0){
spchk_addr_col <- "ADDR_spchk"
###perform spell check on address###
new_fails <- rNYCclean::parallel.splchk_addr(in_clus, new_fails, spchk_addr_col, addr1_col_name, bc_col_name, "boro_code",GBAT_name)
###use helper function to geocode and remove fails###
GC.list <- GC_routine(in_clus=in_clus, in_df=new_fails, id_colname=id_colname, hse_num_col_name=NULL, addr_col_name=spchk_addr_col, third_col_name=bc_col_name, source_cols=source_cols, geocode_fields=geocode_fields, addr_type="boro_code", GBAT_name=GBAT_name, gc_type=gc.type, results_df=results_df, filter_col_name=f_col, filter_vector=f_vec, split=FALSE)
###assign results df###
results_df <- GC.list[["results_df"]]
###assign messages df###
message_df <- rbindlist(list(message_df, GC.list[["message_df"]]),use.names=TRUE)
} else{
cat(paste0(bul_str, " 0 records successfully geocoded by ", gc.type, ".\n"))
###assign messages df###
message_df <- rbindlist(list(message_df, data.table(num_rec=c(0), desc=c(gc.type), error=c(0))),use.names=TRUE)
}
#####################################################################
###PART 1f: GEOCODE SPELL CHECKED SPLIT ADDRESS USING BOROUGH CODE###
#####################################################################
gc.type <- "SPLIT ADDRESS LINE 1 SPELL CHECKED WITH BOROUGH CODE"
new_fails <- new_fails[!(get(id_colname) %in% results_df[,get(id_colname)])]
if(nrow(new_fails)>0){
###use helper function to geocode and remove fails###
GC.list <- GC_routine(in_clus=in_clus, in_df=new_fails, id_colname=id_colname, hse_num_col_name=NULL, addr_col_name=spchk_addr_col, third_col_name=bc_col_name, source_cols=source_cols, geocode_fields=geocode_fields, addr_type="boro_code", GBAT_name=GBAT_name, gc_type=gc.type, results_df=results_df, filter_col_name=f_col, filter_vector=f_vec, split=TRUE)
###assign results df###
results_df <- GC.list[["results_df"]]
###assign messages df###
message_df <- rbindlist(list(message_df, GC.list[["message_df"]]),use.names=TRUE)
} else{
###assign messages df###
message_df <- rbindlist(list(message_df, data.table(num_rec=c(0), desc=c(gc.type), error=c(0))),use.names=TRUE)
cat(paste0(bul_str, " 0 records successfully geocoded by ", gc.type, ".\n"))
}
###############################################################################
###PART 2a: GEOCODE SUBSTRING OF RAW FREEFORM ADDR1 FIELD USING BOROUGH CODE###
###############################################################################
gc.type <- "FREEFORM ADDRESS LINE 1 SUBSTRING WITH BOROUGH CODE"
###create data frame of addresses that failed###
new_fails <- in_df[!(get(id_colname) %in% results_df[,get(id_colname)])]
if(nrow(new_fails)>0){
addr1_ss_col_name <- "ADDR1_seq_spl"
###get all possible sequential string combinations###
new_fails2 <- rNYCclean::parallel.seqsplt_addr(in_clus, new_fails, addr1_ss_col_name, id_colname, addr1_col_name, bc_col_name)
new_fails <- merge(new_fails,new_fails2,by=c(id_colname,bc_col_name))
###use helper function to geocode and remove fails###
GC.list <- GC_routine(in_clus=in_clus, in_df=new_fails, id_colname=id_colname, hse_num_col_name=NULL, addr_col_name=addr1_ss_col_name, third_col_name=bc_col_name, source_cols=source_cols, geocode_fields=geocode_fields, addr_type="boro_code", GBAT_name=GBAT_name, gc_type=gc.type, results_df=results_df, filter_col_name=f_col, filter_vector=f_vec, split=FALSE)
###assign results df###
results_df <- GC.list[["results_df"]]
###assign messages df###
message_df <- rbindlist(list(message_df, GC.list[["message_df"]]),use.names=TRUE)
} else{
cat(paste0(bul_str, " 0 records successfully geocoded by ", gc.type, ".\n"))
###assign messages df###
message_df <- rbindlist(list(message_df, data.table(num_rec=c(0), desc=c(gc.type), error=c(0))),use.names=TRUE)
}
############################################################################
###PART 2b: GEOCODE SUBSTRING OF RAW SPLIT ADDR1 FIELD USING BOROUGH CODE###
############################################################################
gc.type <- "SPLIT ADDRESS ADDRESS LINE 1 SUBSTRING WITH BOROUGH CODE"
###create data frame of addresses that failed###
new_fails <- new_fails[!(get(id_colname) %in% results_df[,get(id_colname)])]
if(nrow(new_fails)>0){
###use helper function to geocode and remove fails###
GC.list <- GC_routine(in_clus=in_clus, in_df=new_fails, id_colname=id_colname, hse_num_col_name=NULL, addr_col_name=addr1_ss_col_name, third_col_name=bc_col_name, source_cols=source_cols, geocode_fields=geocode_fields, addr_type="boro_code", GBAT_name=GBAT_name, gc_type=gc.type, results_df=results_df, filter_col_name=f_col, filter_vector=f_vec, split=TRUE)
###assign results df###
results_df <- GC.list[["results_df"]]
###assign messages df###
message_df <- rbindlist(list(message_df, GC.list[["message_df"]]),use.names=TRUE)
} else{
cat(paste0(bul_str, " 0 records successfully geocoded by ", gc.type, ".\n"))
###assign messages df###
message_df <- rbindlist(list(message_df, data.table(num_rec=c(0), desc=c(gc.type), error=c(0))),use.names=TRUE)
}
if(!(is.null(addr2_colname))){
##################################################################
###PART 3a: GEOCODE RAW FREEFORM ADDR2 FIELD USING BOROUGH CODE###
##################################################################
gc.type <- "FREEFORM ADDRESS LINE 2 WITH BOROUGH CODE"
###prepare the address 2 column by applying a function that removes illegal characters###
addr2_col_name <- "ADDR2_new"
in_df[,(addr2_col_name) := rNYCclean::prep_addr(get(addr2_colname))]
###create data frame of addresses that failed###
new_fails <- in_df[!(get(id_colname) %in% results_df[,get(id_colname)])]
if(nrow(new_fails)>0){
###use helper function to geocode and remove fails###
GC.list <- GC_routine(in_clus=in_clus, in_df=in_df, id_colname=id_colname, hse_num_col_name=NULL, addr_col_name=addr2_col_name, third_col_name=bc_col_name, source_cols=source_cols, geocode_fields=geocode_fields, addr_type="boro_code", GBAT_name=GBAT_name, gc_type=gc.type, results_df=results_df, filter_col_name=f_col, filter_vector=f_vec, split=FALSE)
###assign results df###
results_df <- GC.list[["results_df"]]
###assign messages df###
message_df <- rbindlist(list(message_df, GC.list[["message_df"]]),use.names=TRUE)
} else{
cat(paste0(bul_str, " 0 records successfully geocoded by ", gc.type, ".\n"))
###assign messages df###
message_df <- rbindlist(list(message_df, data.table(num_rec=c(0), desc=c(gc.type), error=c(0))),use.names=TRUE)
}
###############################################################
###PART 3b: GEOCODE RAW SPLIT ADDR2 FIELD USING BOROUGH CODE###
###############################################################
gc.type <- "SPLIT ADDRESS LINE 2 WITH BOROUGH CODE"
###create data frame of addresses that failed###
new_fails <- in_df[!(get(id_colname) %in% results_df[,get(id_colname)])]
if(nrow(new_fails)>0){
###use helper function to geocode and remove fails###
GC.list <- GC_routine(in_clus=in_clus, in_df=new_fails, id_colname=id_colname, hse_num_col_name=NULL, addr_col_name=addr2_col_name, third_col_name=bc_col_name, source_cols=source_cols, geocode_fields=geocode_fields, addr_type="boro_code", GBAT_name=GBAT_name, gc_type=gc.type, results_df=results_df, filter_col_name=f_col, filter_vector=f_vec, split=TRUE)
###assign results df###
results_df <- GC.list[["results_df"]]
###assign messages df###
message_df <- rbindlist(list(message_df, GC.list[["message_df"]]),use.names=TRUE)
} else{
cat(paste0(bul_str, " 0 records successfully geocoded by ", gc.type, ".\n"))
###assign messages df###
message_df <- rbindlist(list(message_df, data.table(num_rec=c(0), desc=c(gc.type), error=c(0))),use.names=TRUE)
}
#########################################################################
###PART 3c: GEOCODE PAD OF RAW FREEFORM ADDR2 FIELD USING BOROUGH CODE###
#########################################################################
gc.type <- "PAD MERGE OF FREEFORM ADDRESS LINE 2 WITH BOROUGH CODE"
###create data frame of addresses that failed###
new_fails <- in_df[!(get(id_colname) %in% results_df[,get(id_colname)])]
if(nrow(new_fails)>0){
###get PAD address if available###
pad_addr_col_name <- "ADDR2_pad"
new_fails <- rNYCclean::pad_addr(new_fails,pad_addr_col_name,addr2_col_name,bc_col_name,"boro_code")
new_fails <- new_fails[!(is.na(get(pad_addr_col_name))),]
if(nrow(new_fails)>0){
###use helper function to geocode and remove fails###
GC.list <- GC_routine(in_clus=in_clus, in_df=new_fails, id_colname=id_colname, hse_num_col_name=NULL, addr_col_name=pad_addr_col_name, third_col_name=bc_col_name, source_cols=source_cols, geocode_fields=geocode_fields, addr_type="boro_code", GBAT_name=GBAT_name, gc_type=gc.type, results_df=results_df, filter_col_name=f_col, filter_vector=f_vec, split=FALSE)
###assign results df###
results_df <- GC.list[["results_df"]]
###assign messages df###
message_df <- rbindlist(list(message_df, GC.list[["message_df"]]),use.names=TRUE)
} else{
cat(paste0(bul_str, " 0 records successfully geocoded by ", gc.type, ".\n"))
###assign messages df###
message_df <- rbindlist(list(message_df, data.table(num_rec=c(0), desc=c(gc.type), error=c(0))),use.names=TRUE)
}
} else{
cat(paste0(bul_str, " 0 records successfully geocoded by ", gc.type, ".\n"))
###assign messages df###
message_df <- rbindlist(list(message_df, data.table(num_rec=c(0), desc=c(gc.type), error=c(0))),use.names=TRUE)
}
###############################################################################
###PART 4a: GEOCODE SUBSTRING OF RAW FREEFORM ADDR2 FIELD USING BOROUGH CODE###
###############################################################################
gc.type <- "FREEFORM ADDRESS LINE 2 SUBSTRING WITH BOROUGH CODE"
###create data frame of addresses that failed###
new_fails <- in_df[!(get(id_colname) %in% results_df[,get(id_colname)])]
if(nrow(new_fails)>0){
addr2_ss_col_name <- "ADDR2_seq_spl"
###get all possible sequential string combinations###
new_fails <- rNYCclean::parallel.seqsplt_addr(in_clus, new_fails, addr2_ss_col_name, id_colname, addr2_col_name, bc_col_name)
###use helper function to geocode and remove fails###
GC.list <- GC_routine(in_clus=in_clus, in_df=new_fails, id_colname=id_colname, hse_num_col_name=NULL, addr_col_name=addr2_ss_col_name, third_col_name=bc_col_name, source_cols=source_cols, geocode_fields=geocode_fields, addr_type="boro_code", GBAT_name=GBAT_name, gc_type=gc.type, results_df=results_df, filter_col_name=f_col, filter_vector=f_vec, split=FALSE)
###assign results df###
results_df <- GC.list[["results_df"]]
###assign messages df###
message_df <- rbindlist(list(message_df, GC.list[["message_df"]]),use.names=TRUE)
} else{
cat(paste0(bul_str, " 0 records successfully geocoded by ", gc.type, ".\n"))
###assign messages df###
message_df <- rbindlist(list(message_df, data.table(num_rec=c(0), desc=c(gc.type), error=c(0))),use.names=TRUE)
}
############################################################################
###PART 4b: GEOCODE SUBSTRING OF RAW SPLIT ADDR2 FIELD USING BOROUGH CODE###
############################################################################
gc.type <- "SPLIT ADDRESS LINE 2 SUBSTRING WITH BOROUGH CODE"
###create data frame of addresses that failed###
new_fails <- new_fails[!(get(id_colname) %in% results_df[,get(id_colname)])]
if(nrow(new_fails)>0){
###use helper function to geocode and remove fails###
GC.list <- GC_routine(in_clus=in_clus, in_df=new_fails, id_colname=id_colname, hse_num_col_name=NULL, addr_col_name=addr2_ss_col_name, third_col_name=bc_col_name, source_cols=source_cols, geocode_fields=geocode_fields, addr_type="boro_code", GBAT_name=GBAT_name, gc_type=gc.type, results_df=results_df, filter_col_name=f_col, filter_vector=f_vec, split=TRUE)
###assign results df###
results_df <- GC.list[["results_df"]]
###assign messages df###
message_df <- rbindlist(list(message_df, GC.list[["message_df"]]),use.names=TRUE)
} else{
cat(paste0(bul_str, " 0 records successfully geocoded by ", gc.type, ".\n"))
###assign messages df###
message_df <- rbindlist(list(message_df, data.table(num_rec=c(0), desc=c(gc.type), error=c(0))),use.names=TRUE)
}
}
########################################################################
###PART 5a: GEOCODE REGEX CLEANED FREEFORM ADDRESS USING BOROUGH CODE###
########################################################################
regex_addr_col <- "ADDR_regex"
if(!(is.null(addr2_colname))){
gc.type <- "FREEFORM ADDRESS LINES 1 AND 2 REGEX CLEANED WITH BOROUGH CODE"
###run address columns 1 and 2 through NYC regular expressions###
in_df <- rNYCclean::parallel.regex_addr(in_clus=in_clus, in_df=in_df, new_addr_col_name=regex_addr_col, addr1_col_name=addr1_col_name, GBAT_name=GBAT_name, addr2_col_name=addr2_col_name)
} else {
gc.type <- "FREEFORM ADDRESS LINES 1 REGEX CLEANED WITH BOROUGH CODE"
###run address columns 1 through NYC regular expressions###
in_df <- rNYCclean::parallel.regex_addr(in_clus=in_clus, in_df=in_df, new_addr_col_name=regex_addr_col, addr1_col_name=addr1_col_name, GBAT_name=GBAT_name)
}
###create data frame of addresses that failed###
new_fails <- in_df[!(get(id_colname) %in% results_df[,get(id_colname)])]
if(nrow(new_fails)>0){
###use helper function to geocode and remove fails###
GC.list <- GC_routine(in_clus=in_clus, in_df=new_fails, id_colname=id_colname, hse_num_col_name=NULL, addr_col_name=regex_addr_col, third_col_name=bc_col_name, source_cols=source_cols, geocode_fields=geocode_fields, addr_type="boro_code", GBAT_name=GBAT_name, gc_type=gc.type, results_df=results_df, filter_col_name=f_col, filter_vector=f_vec, split=FALSE)
###assign results df###
results_df <- GC.list[["results_df"]]
###assign messages df###
message_df <- rbindlist(list(message_df, GC.list[["message_df"]]),use.names=TRUE)
} else{
cat(paste0(bul_str, " 0 records successfully geocoded by ", gc.type, ".\n"))
###assign messages df###
message_df <- rbindlist(list(message_df, data.table(num_rec=c(0), desc=c(gc.type), error=c(0))),use.names=TRUE)
}
#####################################################################
###PART 5b: GEOCODE REGEX CLEANED SPLIT ADDRESS USING BOROUGH CODE###
#####################################################################
gc.type <- "SPLIT ADDRESS LINES 1 AND 2 REGEX CLEANED WITH BOROUGH CODE"
###create data frame of addresses that failed###
new_fails <- in_df[!(get(id_colname) %in% results_df[,get(id_colname)])]
if(nrow(new_fails)>0){
###use helper function to geocode and remove fails###
GC.list <- GC_routine(in_clus=in_clus, in_df=new_fails, id_colname=id_colname, hse_num_col_name=NULL, addr_col_name=regex_addr_col, third_col_name=bc_col_name, source_cols=source_cols, geocode_fields=geocode_fields, addr_type="boro_code", GBAT_name=GBAT_name, gc_type=gc.type, results_df=results_df, filter_col_name=f_col, filter_vector=f_vec, split=TRUE)
###assign results df###
results_df <- GC.list[["results_df"]]
###assign messages df###
message_df <- rbindlist(list(message_df, GC.list[["message_df"]]),use.names=TRUE)
} else{
cat(paste0(bul_str, " 0 records successfully geocoded by ", gc.type, ".\n"))
###assign messages df###
message_df <- rbindlist(list(message_df, data.table(num_rec=c(0), desc=c(gc.type), error=c(0))),use.names=TRUE)
}
#####################################################################################
###PART 6a: GEOCODE SUBSTRING OF REGEX CLEANED FREEFORM ADDRESS USING BOROUGH CODE###
#####################################################################################
gc.type <- "FREEFORM ADDRESS LINES 1 AND 2 REGEX CLEANED SUBSTRING WITH BOROUGH CODE"
###create data frame of addresses that failed###
new_fails <- in_df[!(get(id_colname) %in% results_df[,get(id_colname)])]
if(nrow(new_fails)>0){
addr_ss_col_name <- "ADDR_regex_seq_spl"
###get all possible sequential string combinations###
new_fails2 <- rNYCclean::parallel.seqsplt_addr(in_clus, new_fails, addr_ss_col_name, id_colname, regex_addr_col, bc_col_name)
new_fails <- merge(new_fails,new_fails2,by=c(id_colname,bc_col_name))
###use helper function to geocode and remove fails###
GC.list <- GC_routine(in_clus=in_clus, in_df=new_fails, id_colname=id_colname, hse_num_col_name=NULL, addr_col_name=addr_ss_col_name, third_col_name=bc_col_name, source_cols=source_cols, geocode_fields=geocode_fields, addr_type="boro_code", GBAT_name=GBAT_name, gc_type=gc.type, results_df=results_df, filter_col_name=f_col, filter_vector=f_vec, split=FALSE)
###assign results df###
results_df <- GC.list[["results_df"]]
###assign messages df###
message_df <- rbindlist(list(message_df, GC.list[["message_df"]]),use.names=TRUE)
} else{
cat(paste0(bul_str, " 0 records successfully geocoded by ", gc.type, ".\n"))
###assign messages df###
message_df <- rbindlist(list(message_df, data.table(num_rec=c(0), desc=c(gc.type), error=c(0))),use.names=TRUE)
}
##################################################################################
###PART 6b: GEOCODE SUBSTRING OF REGEX CLEANED SPLIT ADDRESS USING BOROUGH CODE###
##################################################################################
gc.type <- "SPLIT ADDRESS LINES 1 AND 2 REGEX CLEANED SUBSTRING WITH BOROUGH CODE"
###create data frame of addresses that failed###
new_fails <- new_fails[!(get(id_colname) %in% results_df[,get(id_colname)])]
if(nrow(new_fails)>0){
###use helper function to geocode and remove fails###
GC.list <- GC_routine(in_clus=in_clus, in_df=new_fails, id_colname=id_colname, hse_num_col_name=NULL, addr_col_name=addr_ss_col_name, third_col_name=bc_col_name, source_cols=source_cols, geocode_fields=geocode_fields, addr_type="boro_code", GBAT_name=GBAT_name, gc_type=gc.type, results_df=results_df, filter_col_name=f_col, filter_vector=f_vec, split=TRUE)
###assign results df###
results_df <- GC.list[["results_df"]]
###assign messages df###
message_df <- rbindlist(list(message_df, GC.list[["message_df"]]),use.names=TRUE)
} else{
cat(paste0(bul_str, " 0 records successfully geocoded by ", gc.type, ".\n"))
###assign messages df###
message_df <- rbindlist(list(message_df, data.table(num_rec=c(0), desc=c(gc.type), error=c(0))),use.names=TRUE)
}
##################################################
###MAKE USPS VERIFICATION OPTIONAL - 02/01/2019###
##################################################
if(USPS_verify){
###################################################################
###ERROR HANDLING TO DEAL WITH USPS VERIFICATION SERVICE FAILURE###
###################################################################
USPS_error <- FALSE
################################################################################################
###PART 7a: USPS VALIDATION OF FREEFORM ADDRESS LINES 1 AND 2 USPS VERIFIED WITH ZIP CODE###
################################################################################################
gc.type <- paste0("FREEFORM ADDRESS LINE", ifelse(!(is.null(addr2_colname)),"S 1 AND 2"," 1")," USPS VERIFIED WITH ZIP CODE")
###prepare the city column by converting borough code to borough string###
city_col_name <- "CITY_new"
in_df[,(city_col_name) := ifelse(get(bc_col_name)=="1","NEW YORK",ifelse(get(bc_col_name)=="2","BRONX",ifelse(get(bc_col_name)=="3","BROOKLYN",ifelse(get(bc_col_name)=="4","QUEENS",ifelse(get(bc_col_name)=="5","STATEN ISLAND","OTHER")))))]
###add column for state###
state_col_name <- "STATE_new"
in_df[,(state_col_name) := "NY"]
###create data frame of addresses that failed###
new_fails <- in_df[!(get(id_colname) %in% results_df[,get(id_colname)])]
if(nrow(new_fails)>0){
###error handling for the buggy USPS service###
if(!(is.null(addr2_colname))){
###run addresses through the IBM USPS Address Verification Service and return in a geocoder friendly format###
USPS_df <- try(rUSPS::USPS_addr_gc(in_df = new_fails, addr1_col_name = addr1_col_name, addr2_col_name = addr2_col_name, city_col_name = city_col_name, state_col_name = state_col_name, source_cols = source_cols), silent=TRUE)
if("try-error" %in% class(USPS_df)) {
###re-run with fewer cores is disabled (call commented out below); flag the failure instead###
#USPS_df <- try(rUSPS::USPS_addr_gc(in_df = new_fails, addr1_col_name = addr1_col_name, addr2_col_name = addr2_col_name, city_col_name = city_col_name, state_col_name = state_col_name, source_cols = source_cols))
USPS_error <- TRUE
}
} else{
###run addresses through the IBM USPS Address Verification Service and return in a geocoder friendly format###
USPS_df <- try(rUSPS::USPS_addr_gc(in_df = new_fails, addr1_col_name = addr1_col_name, city_col_name = city_col_name, state_col_name = state_col_name, source_cols = source_cols), silent=TRUE)
if("try-error" %in% class(USPS_df)) {
###re-run with fewer cores is disabled (call commented out below); flag the failure instead###
#USPS_df <- try(rUSPS::USPS_addr_gc(in_df = new_fails, addr1_col_name = addr1_col_name, city_col_name = city_col_name, state_col_name = state_col_name, source_cols = source_cols))
USPS_error <- TRUE
}
}
if(USPS_error){
###assign messages df###
message_df <- rbindlist(list(message_df, data.table(num_rec=c(0), desc=c(gc.type), error=c(1))),use.names=TRUE)
cat(paste0(bul_str, "ERROR!!! 0 records successfully geocoded by ", gc.type, ".\n"))
} else{
###use helper function to geocode and remove fails###
GC.list <- GC_routine(in_clus=in_clus, in_df=USPS_df, id_colname=id_colname, hse_num_col_name=NULL, addr_col_name="deliveryaddressline1_cass",third_col_name="zip5_cass", source_cols=source_cols, geocode_fields=geocode_fields, addr_type="zip_code", GBAT_name=GBAT_name, gc_type=gc.type, results_df=results_df, filter_col_name=f_col, filter_vector=f_vec, split=FALSE)
###assign results df###
results_df <- GC.list[["results_df"]]
###assign messages df###
message_df <- rbindlist(list(message_df, GC.list[["message_df"]]),use.names=TRUE)
}
} else{
###assign messages df###
message_df <- rbindlist(list(message_df, data.table(num_rec=c(0), desc=c(gc.type), error=c(0))),use.names=TRUE)
cat(paste0(bul_str, " 0 records successfully geocoded by ", gc.type, ".\n"))
}
#########################################################################################
###PART 7b: USPS VALIDATION OF SPLIT ADDRESS LINES 1 AND 2 USPS VERIFIED WITH ZIP CODE###
#########################################################################################
gc.type <- paste0("SPLIT ADDRESS LINE", ifelse(!(is.null(addr2_colname)),"S 1 AND 2"," 1")," USPS VERIFIED WITH ZIP CODE")
if(nrow(new_fails)>0 & !(USPS_error)){
###create data frame of addresses that failed###
new_fails <- USPS_df[!(get(id_colname) %in% results_df[,get(id_colname)])]
###use helper function to geocode and remove fails###
GC.list <- GC_routine(in_clus=in_clus, in_df=new_fails, id_colname=id_colname, hse_num_col_name=NULL, addr_col_name="deliveryaddressline1_cass",third_col_name="zip5_cass", source_cols=source_cols, geocode_fields=geocode_fields, addr_type="zip_code", GBAT_name=GBAT_name, gc_type=gc.type, results_df=results_df, filter_col_name=f_col, filter_vector=f_vec, split=TRUE)
###assign results df###
results_df <- GC.list[["results_df"]]
###assign messages df###
message_df <- rbindlist(list(message_df, GC.list[["message_df"]]),use.names=TRUE)
} else{
if(USPS_error){
###assign messages df###
message_df <- rbindlist(list(message_df, data.table(num_rec=c(0), desc=c(gc.type), error=c(1))),use.names=TRUE)
cat(paste0(bul_str, "ERROR!!! 0 records successfully geocoded by ", gc.type, ".\n"))
} else{
###assign messages df###
message_df <- rbindlist(list(message_df, data.table(num_rec=c(0), desc=c(gc.type), error=c(0))),use.names=TRUE)
cat(paste0(bul_str, " 0 records successfully geocoded by ", gc.type, ".\n"))
}
}
}
#############################################################################################################################
###PART 8a: GEOCODE FREEFORM ADDRESS WITH ALTERED HOUSE NUMBER (e.g., 1 123 SMITH ST -> 1-123 SMITH ST) USING BOROUGH CODE###
#############################################################################################################################
gc.type <- paste0("FREEFORM ADDRESS LINE", ifelse(!(is.null(addr2_colname)),"S 1 AND 2"," 1")," REGEX CLEANED WITH ALTERED HOUSE NUMBER WITH BOROUGH CODE")
###create data frame of addresses that failed###
new_fails <- in_df[!(get(id_colname) %in% results_df[,get(id_colname)])]
if(nrow(new_fails)>0){
regex_addr_col2 <- "ADDR_regex_hnum"
hnum_df <- new_fails[grepl("^[[:digit:]]{1,} [[:digit:]]{1,}.*$",get(regex_addr_col))]
if(nrow(hnum_df)>0){
hnum_df[,(regex_addr_col2) := gsub("^([[:digit:]]{1,}) ([[:digit:]]{1,}.*)$","\\1-\\2",get(regex_addr_col))]
###use helper function to geocode and remove fails###
GC.list <- GC_routine(in_clus=in_clus, in_df=hnum_df, id_colname=id_colname, hse_num_col_name=NULL, addr_col_name=regex_addr_col2, third_col_name=bc_col_name, source_cols=source_cols, geocode_fields=geocode_fields, addr_type="boro_code", GBAT_name=GBAT_name, gc_type=gc.type, results_df=results_df, filter_col_name=f_col, filter_vector=f_vec, split=FALSE)
###assign results df###
results_df <- GC.list[["results_df"]]
###assign messages df###
message_df <- rbindlist(list(message_df, GC.list[["message_df"]]),use.names=TRUE)
} else{
cat(paste0(bul_str, " 0 records successfully geocoded by ", gc.type, ".\n"))
###assign messages df###
message_df <- rbindlist(list(message_df, data.table(num_rec=c(0), desc=c(gc.type), error=c(0))),use.names=TRUE)
}
} else{
cat(paste0(bul_str, " 0 records successfully geocoded by ", gc.type, ".\n"))
###assign messages df###
message_df <- rbindlist(list(message_df, data.table(num_rec=c(0), desc=c(gc.type), error=c(0))),use.names=TRUE)
}
##########################################################################################################################
###PART 8b: GEOCODE SPLIT ADDRESS WITH ALTERED HOUSE NUMBER (e.g., 1 123 SMITH ST -> 1-123 SMITH ST) USING BOROUGH CODE###
##########################################################################################################################
gc.type <- paste0("SPLIT ADDRESS LINE", ifelse(!(is.null(addr2_colname)),"S 1 AND 2"," 1")," REGEX CLEANED WITH ALTERED HOUSE NUMBER WITH BOROUGH CODE")
if(exists('hnum_df') && is.data.frame(get('hnum_df'))){
hnum_df <- hnum_df[!(get(id_colname) %in% results_df[,get(id_colname)])]
if(nrow(hnum_df)>0){
###use helper function to geocode and remove fails###
GC.list <- GC_routine(in_clus=in_clus, in_df=hnum_df, id_colname=id_colname, hse_num_col_name=NULL, addr_col_name=regex_addr_col2, third_col_name=bc_col_name, source_cols=source_cols, geocode_fields=geocode_fields, addr_type="boro_code", GBAT_name=GBAT_name, gc_type=gc.type, results_df=results_df, filter_col_name=f_col, filter_vector=f_vec, split=TRUE)
###assign results df###
results_df <- GC.list[["results_df"]]
###assign messages df###
message_df <- rbindlist(list(message_df, GC.list[["message_df"]]),use.names=TRUE)
} else{
cat(paste0(bul_str, " 0 records successfully geocoded by ", gc.type, ".\n"))
###assign messages df###
message_df <- rbindlist(list(message_df, data.table(num_rec=c(0), desc=c(gc.type), error=c(0))),use.names=TRUE)
}
} else{
cat(paste0(bul_str, " 0 records successfully geocoded by ", gc.type, ".\n"))
###assign messages df###
message_df <- rbindlist(list(message_df, data.table(num_rec=c(0), desc=c(gc.type), error=c(0))),use.names=TRUE)
}
############################
###MERGE PASSES AND FAILS###
############################
rejects_df <- rejects_df[!(get(id_colname) %in% results_df[,get(id_colname)])]
gc_type <- 'failed to return at least BBL'
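###assign messages df###
###NOTE(review): desc below uses gc.type (still the PART 8b label), not the gc_type set just above - confirm which is intended###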
message_df <- rbindlist(list(message_df, data.table(num_rec=c(nrow(rejects_df)), desc=c(gc.type), error=c(0))),use.names=TRUE)
if(nrow(rejects_df) > 0){
rejects_df[,USPS_comment := ""]
if(exists('USPS_df') && is.data.frame(get('USPS_df'))){
###merge to USPS_df to get validity of address###
rejects_df <- merge(rejects_df, USPS_df[,c(id_colname,"dpvmatchflag_cass","carrierroute_cass","boro_code_cass"),with=FALSE], by=id_colname, all.x=TRUE)
rejects_df[,USPS_DPV := ifelse(dpvmatchflag_cass %in% c("Y","D","S"),"valid","invalid")]
#B = PO BOX
#C = CITY
#G = GENERAL DELIVERY
#H = HIGHWAY
#R = RURAL
rejects_df[,USPS_type := ifelse(dpvmatchflag_cass %in% c("Y","D","S"),
ifelse( substr(carrierroute_cass, 1, 1)=="C", "city delivery",
ifelse(substr(carrierroute_cass, 1, 1)=="B","PO BOX",
ifelse(substr(carrierroute_cass, 1, 1)=="G", "general delivery",
ifelse(substr(carrierroute_cass, 1, 1)=="R", "rural delivery","highway")))),"")]
rejects_df[,USPS_boro := ifelse(dpvmatchflag_cass %in% c("Y","D","S"), ifelse(boro_code_cass > 0, "within NYC","outside of NYC"),"")]
rejects_df[,USPS_comment := gsub("(?<=[\\s])\\s*|^\\s+|\\s+$", "", paste(USPS_DPV,"USPS",USPS_type,"address",USPS_boro,sep=" "), perl=TRUE)]
rejects_df[,c('USPS_DPV','USPS_type','USPS_boro','carrierroute_cass','boro_code_cass','dpvmatchflag_cass') := NULL]
}
rejects_df <- merge(rejects_df, in_df[,c(id_colname,regex_addr_col),with=FALSE], by=id_colname, all.x=TRUE)
rejects_df[,REGEX_ASSUM := ifelse(get(regex_addr_col) %in% c("UNKNOWN","HOMELESS"),paste0(", suspected ",get(regex_addr_col)),"")]
rejects_df[,GC.type := paste0(gc_type,", ", USPS_comment, REGEX_ASSUM)]
rejects_df[,c('USPS_comment','REGEX_ASSUM',regex_addr_col) := NULL]
results_df <- data.table::rbindlist(list(rejects_df,results_df), use.names=TRUE, fill=TRUE)
}
p_num <- as.character(round(((nrow(rejects_df)/nrow(in_df))*100),2))
ptm2 <- proc.time() - ptm
time_dur <- format(.POSIXct(as.numeric(ptm2[3]),tz="GMT"), "%H:%M:%S")
cat(paste0("\tTime to process ", nrow(results_df)," records: ",time_dur,"\n"))
cat(paste0("\t",p_num, "% of records (n = ", nrow(rejects_df), ") ", gc_type, ".\n"))
invisible(gc())
results_df <- unique(results_df)
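###delete f_col if not present in input geocode fields###
###NOTE(review): the statement below drops the gc_flds columns, not f_col - confirm which is intended###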
if(!(f_col %in% gc_flds)) results_df[,c(gc_flds) := NULL]
if(!is.DT) results_df <- as.data.frame(results_df)
assign("NYC.CleanGeoBoro_metadata", unique(message_df), envir = .GlobalEnv)
return(results_df)
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.