R/NYC.CleanGeoZip.R

#' The \code{NYC.CleanGeoZip} function utilizes the rGBAT, rUSPS, and rNYCclean packages to clean and geocode NYC addresses by five digit zip code.
#'
#' @title Clean and geocode NYC addresses by zip code.
#' @name NYC.CleanGeoZip
#' @aliases NYC.CleanGeoZip
#' @import data.table
#' @import rGBAT
#' @import rUSPS
#' @import rNYCclean
#' @export NYC.CleanGeoZip
#' @param in_df a data frame containing NYC addresses.  Required.
#' @param id_colname the name of the unique identifier column as string.  Required.
#' @param addr1_colname the name of the input address line one column as string.  Required.
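#' @param addr2_colname the name of the input address line two column as string.  Optional.
#' @param city_colname the name of the input city column as string; used for USPS address verification.  Required.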
#' @param zip_colname the name of the input five digit zip code as string.  Required.
#' @param source_cols vector of column names from the input data frame to be returned with geocoder results.  Required.
#' @param geocode_fields vector of field names generated by the geocoder to be returned with geocoder results.  Required.
#' @param GBAT_name the release or version of DCP's Geosupport geocoding software as string.  Required.
#' @param in_clus the number of clusters available to the function as integer.  Optional.
#' @param USPS_verify if TRUE, addresses will be run through the IBM Infosphere address verification service.  Optional.
#' @return A data frame or data table (depending on format of \code{in_df}) of cleaned and geocoded NYC addresses.
#' @usage NYC.CleanGeoZip(in_df, id_colname, addr1_colname, 
#'     addr2_colname=NULL, city_colname, zip_colname, source_cols, 
#'     geocode_fields, GBAT_name, in_clus=1, USPS_verify=TRUE)
#' @examples # create a data frame of addresses
#' ADDR <- c("80 CENTRE","125 WORTH S","42 09 28 S","253 BROADW",
#'     "620 ATLANT","125 WOR","1 FRANKLIN","1 FRANKLIN",
#'     "1 1 1 AVE","1 1 1 AVE")
#' CITY <- c("NEW YORK","NEW YORK","LONG ISLAND CITY","NEW YORK",
#'     "BROOKLYN","NEW YORK","BROOKLYN","BROOKLYN","NEW YORK",
#'     "NEW YORK")
#' ZIP_CODE <- c('10013','10013','11101','10007','11217','10013',
#'     '11222','11249','10003','10014')
#' u_id <- 1:length(ADDR)
#' df = data.frame(u_id, ADDR, CITY, ZIP_CODE)
#'
#' #specify columns from input data frame to retain
#' source_cols <- c('u_id')
#'
#' #specify geocoder return fields
#' geocode_fields <- c('F1E.output.bin','F1E.output.bbl','F1E.longitude',
#'     'F1E.latitude','JN.ZCTA_10','F1E.output.ret_code','F1E.output.msg')
#'
#' #clean and geocode by zip code
#' gc_df <- NYC.CleanGeoZip(in_df=df,id_colname="u_id",
#'     addr1_colname="ADDR", city_colname="CITY",
#'     zip_colname="ZIP_CODE", source_cols=source_cols, 
#'     geocode_fields=geocode_fields, GBAT_name="18B")
#'
#' #preview results
#' head(gc_df)
#'
#' #view metadata
#' NYC.CleanGeoZip_metadata


NYC.CleanGeoZip <- function (in_df, id_colname, addr1_colname, addr2_colname=NULL, city_colname, zip_colname, source_cols, geocode_fields, GBAT_name, in_clus=1, USPS_verify=TRUE){
	
	###create copy of input geocode return fields###
	gc_flds <- geocode_fields
	
	bul_str <- "\xE2\x80\xA2"
	Encoding(bul_str) <- "UTF-8"

	################################################
	###COMMON INACCURATE ADDRESSES REASSIGNMENTS###
	################################################

	#c("CENTRAL PARK","BRIGHTON BEACH","PROSPECT PARK","WASHINGTON SQUARE PARK","FT HAMILTON","ASTORIA PARK","CLOVE LAKE","ORCHARD BEACH","SOUTH BEACH","SUNSET PARK","BATTERY PARK","LINDEN PARK","RIVERSIDE PARK","BRONX PARK")

	f_vec <- c("1011110001","3087250001","3011170001","1005490001","3061530001","4008980001","5003190001","2056500001","5035250200","3009210001","1000030001","4019760028","1012540002","2043360001")

	f_col <- 'F1E.output.bbl'
	
	################################################
	################################################
	################################################
	
	ptm <- proc.time()
	
	if(nrow(in_df)==0) stop("Your dataset is empty.")
	
	###detect if data.table or data.frame###
	is.DT <- "data.table" %in% class(in_df)
			
	###if data.frame: convert to data.table; else: create copy of data.table so as not to affect original###
	if(!is.DT){
		in_df <- as.data.table(in_df)
	} else{
		in_df <- copy(in_df)
	}
	
	##############################################################
	###PART 1a: GEOCODE RAW FREEFORM ADDR1 FIELD USING ZIP CODE###
	##############################################################

	gc.type <- "FREEFORM ADDRESS LINE 1 WITH ZIP CODE"
	
	###ensure zip code is 5 characters in length###
	zc_col_name <- "ZIPCODE_new"
	in_df[,(zc_col_name) := substr(gsub(" ","",as.character(get(zip_colname))), 1, 5)]
	
	###prepare the address 1 column by applying a function that removes illegal characters###
	addr1_col_name <- "ADDR1_new"
	in_df[,(addr1_col_name) := rNYCclean::prep_addr(get(addr1_colname))]
	
	###use helper function to geocode and remove fails###
	GC.list <- GC_routine(in_clus=in_clus, in_df=in_df, id_colname=id_colname, hse_num_col_name=NULL, addr_col_name=addr1_col_name, third_col_name=zc_col_name, source_cols=source_cols, geocode_fields=geocode_fields, addr_type="zip_code", GBAT_name=GBAT_name, gc_type=gc.type, results_df=NULL, filter_col_name=f_col, filter_vector=f_vec, split=FALSE)
	
	###assign results df###
	results_df <- GC.list[["results_df"]]
	
	###assign rejects df for fails... records for unprocessed addresses will be returned at end of function###
	rejects_df <- GC.list[["rejects_df"]]
	
	###assign messages df###
	message_df <- GC.list[["message_df"]]
	
	###########################################################
	###PART 1b: GEOCODE RAW SPLIT ADDR1 FIELD USING ZIP CODE###
	###########################################################

	gc.type <- "SPLIT ADDRESS LINE 1 WITH ZIP CODE"
	
	###create data frame of addresses that failed### 
	new_fails <- in_df[!(get(id_colname) %in% results_df[,get(id_colname)])]
	
	if(nrow(new_fails)>0){
		###use helper function to geocode and remove fails###
		GC.list <- GC_routine(in_clus=in_clus, in_df=new_fails, id_colname=id_colname, hse_num_col_name=NULL, addr_col_name=addr1_col_name, third_col_name=zc_col_name, source_cols=source_cols, geocode_fields=geocode_fields, addr_type="zip_code", GBAT_name=GBAT_name, gc_type=gc.type, results_df=results_df, filter_col_name=f_col, filter_vector=f_vec, split=TRUE)
		
		###assign results df###
		results_df <- GC.list[["results_df"]]
		
		###assign messages df###
		message_df <- rbindlist(list(message_df, GC.list[["message_df"]]),use.names=TRUE)
		
	} else{
		
		cat(paste0(bul_str, " 0 records successfully geocoded by ", gc.type, ".\n"))
		
		###assign messages df###
		message_df <- rbindlist(list(message_df, data.table(num_rec=c(0), desc=c(gc.type), error=c(0))),use.names=TRUE)
	}
	
	#####################################################################
	###PART 1c: GEOCODE PAD OF RAW FREEFORM ADDR1 FIELD USING ZIP CODE###
	#####################################################################
	
	gc.type <- "PAD MERGE OF FREEFORM ADDRESS LINE 1 WITH ZIP CODE"

	###create data frame of addresses that failed### 
	new_fails <- in_df[!(get(id_colname) %in% results_df[,get(id_colname)])]
	
	if(nrow(new_fails)>0){
	
		######################################################
		###additional step to deal with numero-alpha combos###
		######################################################
		pad_in_addr_col_name <- "ADDR1_pad_in"
		new_fails[,(pad_in_addr_col_name) := rNYCclean::prep_addr(gsub("^(.*[[:digit:]])(ND|RD|ST|TH)(.*)$","\\1 \\2",get(addr1_col_name)))]
		######################################################
		######################################################
		######################################################
		
		###get PAD address if available###
		pad_addr_col_name <- "ADDR1_pad"
		#new_fails <- rNYCclean::pad_addr(new_fails, pad_addr_col_name, addr1_col_name, zc_col_name, "zip_code", GBAT_name)
		new_fails <- rNYCclean::pad_addr(new_fails, pad_addr_col_name, pad_in_addr_col_name, zc_col_name, "zip_code", GBAT_name)
		
		new_fails <- new_fails[!(is.na(get(pad_addr_col_name))),]
		
		if(nrow(new_fails)>0){
		
			###use helper function to geocode and remove fails###
			GC.list <- GC_routine(in_clus=in_clus, in_df=new_fails, id_colname=id_colname, hse_num_col_name=NULL, addr_col_name=pad_addr_col_name, third_col_name=zc_col_name, source_cols=source_cols, geocode_fields=geocode_fields, addr_type="zip_code", GBAT_name=GBAT_name, gc_type=gc.type, results_df=results_df, filter_col_name=f_col, filter_vector=f_vec, split=FALSE)
			
			###assign results df###
			results_df <- GC.list[["results_df"]]
			
			###assign messages df###
			message_df <- rbindlist(list(message_df, GC.list[["message_df"]]),use.names=TRUE)
		
		} else{
		
			###assign messages df###
			message_df <- rbindlist(list(message_df, data.table(num_rec=c(0), desc=c(gc.type), error=c(0))),use.names=TRUE)
			
			cat(paste0(bul_str, " 0 records successfully geocoded by ", gc.type, ".\n"))
		}
	
	} else{
	
		###assign messages df###
		message_df <- rbindlist(list(message_df, data.table(num_rec=c(0), desc=c(gc.type), error=c(0))),use.names=TRUE)
	
		cat(paste0(bul_str, " 0 records successfully geocoded by ", gc.type, ".\n"))
	}
	
	##################################################################
	###PART 1d: GEOCODE PAD OF RAW SPLIT ADDR1 FIELD USING ZIP CODE###
	##################################################################
	
	gc.type <- "PAD MERGE OF SPLIT ADDRESS LINE 1 WITH ZIP CODE"
	
	new_fails <- new_fails[!(get(id_colname) %in% results_df[,get(id_colname)])]
	
	if(nrow(new_fails)>0){
	
		###use helper function to geocode and remove fails###
		GC.list <- GC_routine(in_clus=in_clus, in_df=new_fails, id_colname=id_colname, hse_num_col_name=NULL, addr_col_name=pad_addr_col_name, third_col_name=zc_col_name, source_cols=source_cols, geocode_fields=geocode_fields, addr_type="zip_code", GBAT_name=GBAT_name, gc_type=gc.type, results_df=results_df, filter_col_name=f_col, filter_vector=f_vec, split=TRUE)
		
		###assign results df###
		results_df <- GC.list[["results_df"]]
		
		###assign messages df###
		message_df <- rbindlist(list(message_df, GC.list[["message_df"]]),use.names=TRUE)
	
	} else{
	
		###assign messages df###
		message_df <- rbindlist(list(message_df, data.table(num_rec=c(0), desc=c(gc.type), error=c(0))),use.names=TRUE)
	
		cat(paste0(bul_str, " 0 records successfully geocoded by ", gc.type, ".\n"))
	}
		
	####################################################################
	###PART 1e: GEOCODE SPELL CHECKED FREEFORM ADDRESS USING ZIP CODE###
	####################################################################
	
	gc.type <- "FREEFORM ADDRESS LINE 1 SPELL CHECKED WITH ZIP CODE"
	
	###create data frame of addresses that failed###
	new_fails <- in_df[!(get(id_colname) %in% results_df[,get(id_colname)])]
	
	if(nrow(new_fails)>0){
	
		spchk_addr_col <- "ADDR_spchk"
		
		###perform spell check on address### 	
		new_fails <- rNYCclean::parallel.splchk_addr(in_clus, new_fails, spchk_addr_col, addr1_col_name, zc_col_name, "zip_code",GBAT_name)
		
		###use helper function to geocode and remove fails###
		GC.list <- GC_routine(in_clus=in_clus, in_df=new_fails, id_colname=id_colname, hse_num_col_name=NULL, addr_col_name=spchk_addr_col, third_col_name=zc_col_name, source_cols=source_cols, geocode_fields=geocode_fields, addr_type="zip_code", GBAT_name=GBAT_name, gc_type=gc.type, results_df=results_df, filter_col_name=f_col, filter_vector=f_vec, split=FALSE)
		
		###assign results df###
		results_df <- GC.list[["results_df"]]
		
		###assign messages df###
		message_df <- rbindlist(list(message_df, GC.list[["message_df"]]),use.names=TRUE)
	
	} else{
	
		###assign messages df###
		message_df <- rbindlist(list(message_df, data.table(num_rec=c(0), desc=c(gc.type), error=c(0))),use.names=TRUE)
	
		cat(paste0(bul_str, " 0 records successfully geocoded by ", gc.type, ".\n"))
	}
	
	#################################################################
	###PART 1f: GEOCODE SPELL CHECKED SPLIT ADDRESS USING ZIP CODE###
	#################################################################
	
	gc.type <- "SPLIT ADDRESS LINE 1 SPELL CHECKED WITH ZIP CODE"
	
	new_fails <- new_fails[!(get(id_colname) %in% results_df[,get(id_colname)])]
	
	if(nrow(new_fails)>0){
	
		###use helper function to geocode and remove fails###
		GC.list <- GC_routine(in_clus=in_clus, in_df=new_fails, id_colname=id_colname, hse_num_col_name=NULL, addr_col_name=spchk_addr_col, third_col_name=zc_col_name, source_cols=source_cols, geocode_fields=geocode_fields, addr_type="zip_code", GBAT_name=GBAT_name, gc_type=gc.type, results_df=results_df, filter_col_name=f_col, filter_vector=f_vec, split=TRUE)
		
		###assign results df###
		results_df <- GC.list[["results_df"]]	
		
		###assign messages df###
		message_df <- rbindlist(list(message_df, GC.list[["message_df"]]),use.names=TRUE)
		
	} else{
	
		###assign messages df###
		message_df <- rbindlist(list(message_df, data.table(num_rec=c(0), desc=c(gc.type), error=c(0))),use.names=TRUE)
	
		cat(paste0(bul_str, " 0 records successfully geocoded by ", gc.type, ".\n"))
	}
		
	###########################################################################
	###PART 2a: GEOCODE SUBSTRING OF RAW FREEFORM ADDR1 FIELD USING ZIP CODE###
	###########################################################################
	
	gc.type <- "FREEFORM ADDRESS LINE 1 SUBSTRING WITH ZIP CODE"

	###create data frame of addresses that failed### 
	new_fails <- in_df[!(get(id_colname) %in% results_df[,get(id_colname)])]
	
	if(nrow(new_fails)>0){
	
		addr1_ss_col_name <- "ADDR1_seq_spl"

		###get all possible sequential string combinations###
		new_fails2 <- rNYCclean::parallel.seqsplt_addr(in_clus, new_fails, addr1_ss_col_name, id_colname, addr1_col_name, zc_col_name)
		
		new_fails <- merge(new_fails,new_fails2,by=c(id_colname,zc_col_name))
		
		###use helper function to geocode and remove fails###
		GC.list <- GC_routine(in_clus=in_clus, in_df=new_fails, id_colname=id_colname, hse_num_col_name=NULL, addr_col_name=addr1_ss_col_name, third_col_name=zc_col_name, source_cols=source_cols, geocode_fields=geocode_fields, addr_type="zip_code", GBAT_name=GBAT_name, gc_type=gc.type, results_df=results_df, filter_col_name=f_col, filter_vector=f_vec, split=FALSE)
		
		###assign results df###
		results_df <- GC.list[["results_df"]]
		
		###assign messages df###
		message_df <- rbindlist(list(message_df, GC.list[["message_df"]]),use.names=TRUE)
	
	} else{
	
		###assign messages df###
		message_df <- rbindlist(list(message_df, data.table(num_rec=c(0), desc=c(gc.type), error=c(0))),use.names=TRUE)
	
		cat(paste0(bul_str, " 0 records successfully geocoded by ", gc.type, ".\n"))
	}
	
	########################################################################
	###PART 2b: GEOCODE SUBSTRING OF RAW SPLIT ADDR1 FIELD USING ZIP CODE###
	########################################################################

	gc.type <- "SPLIT ADDRESS ADDRESS LINE 1 SUBSTRING WITH ZIP CODE"
	
	###create data frame of addresses that failed### 
	new_fails <- new_fails[!(get(id_colname) %in% results_df[,get(id_colname)])]
	
	if(nrow(new_fails)>0){
	
		###use helper function to geocode and remove fails###
		GC.list <- GC_routine(in_clus=in_clus, in_df=new_fails, id_colname=id_colname, hse_num_col_name=NULL, addr_col_name=addr1_ss_col_name, third_col_name=zc_col_name, source_cols=source_cols, geocode_fields=geocode_fields, addr_type="zip_code", GBAT_name=GBAT_name, gc_type=gc.type, results_df=results_df, filter_col_name=f_col, filter_vector=f_vec, split=TRUE)
		
		###assign results df###
		results_df <- GC.list[["results_df"]]
		
		###assign messages df###
		message_df <- rbindlist(list(message_df, GC.list[["message_df"]]),use.names=TRUE)
	
	} else{
	
		###assign messages df###
		message_df <- rbindlist(list(message_df, data.table(num_rec=c(0), desc=c(gc.type), error=c(0))),use.names=TRUE)
	
		cat(paste0(bul_str, " 0 records successfully geocoded by ", gc.type, ".\n"))
	}
	
	if(!(is.null(addr2_colname))){
	
		##############################################################
		###PART 3a: GEOCODE RAW FREEFORM ADDR2 FIELD USING ZIP CODE###
		##############################################################
		
		gc.type <- "FREEFORM ADDRESS LINE 2 WITH ZIP CODE"

		###prepare the address 2 column by applying a function that removes illegal characters###
		addr2_col_name <- "ADDR2_new"
		in_df[,(addr2_col_name) := rNYCclean::prep_addr(get(addr2_colname))]
		
		###create data frame of addresses that failed### 
		new_fails <- in_df[!(get(id_colname) %in% results_df[,get(id_colname)])]
		
		if(nrow(new_fails)>0){
		
			###use helper function to geocode and remove fails###
			GC.list <- GC_routine(in_clus=in_clus, in_df=in_df, id_colname=id_colname, hse_num_col_name=NULL, addr_col_name=addr2_col_name, third_col_name=zc_col_name, source_cols=source_cols, geocode_fields=geocode_fields, addr_type="zip_code", GBAT_name=GBAT_name, gc_type=gc.type, results_df=results_df, filter_col_name=f_col, filter_vector=f_vec, split=FALSE)
			
			###assign results df###
			results_df <- GC.list[["results_df"]]
			
			###assign messages df###
			message_df <- rbindlist(list(message_df, GC.list[["message_df"]]),use.names=TRUE)
		
		} else{
		
			###assign messages df###
			message_df <- rbindlist(list(message_df, data.table(num_rec=c(0), desc=c(gc.type), error=c(0))),use.names=TRUE)
		
			cat(paste0(bul_str, " 0 records successfully geocoded by ", gc.type, ".\n"))
		}
	
		###########################################################
		###PART 3b: GEOCODE RAW SPLIT ADDR2 FIELD USING ZIP CODE###
		###########################################################
		
		gc.type <- "SPLIT ADDRESS LINE 2 WITH ZIP CODE"

		###create data frame of addresses that failed### 
		new_fails <- in_df[!(get(id_colname) %in% results_df[,get(id_colname)])]

		if(nrow(new_fails)>0){
		
			###use helper function to geocode and remove fails###
			GC.list <- GC_routine(in_clus=in_clus, in_df=new_fails, id_colname=id_colname, hse_num_col_name=NULL, addr_col_name=addr2_col_name, third_col_name=zc_col_name, source_cols=source_cols, geocode_fields=geocode_fields, addr_type="zip_code", GBAT_name=GBAT_name, gc_type=gc.type, results_df=results_df, filter_col_name=f_col, filter_vector=f_vec, split=TRUE)
			
			###assign results df###
			results_df <- GC.list[["results_df"]]
			
			###assign messages df###
			message_df <- rbindlist(list(message_df, GC.list[["message_df"]]),use.names=TRUE)
		
		} else{
		
			###assign messages df###
			message_df <- rbindlist(list(message_df, data.table(num_rec=c(0), desc=c(gc.type), error=c(0))),use.names=TRUE)
		
			cat(paste0(bul_str, " 0 records successfully geocoded by ", gc.type, ".\n"))
		}
	
		#####################################################################
		###PART 3c: GEOCODE PAD OF RAW FREEFORM ADDR2 FIELD USING ZIP CODE###
		#####################################################################
		
		gc.type <- "PAD MERGE OF FREEFORM ADDRESS LINE 2 WITH ZIP CODE"

		###create data frame of addresses that failed### 
		new_fails <- in_df[!(get(id_colname) %in% results_df[,get(id_colname)])]
		
		if(nrow(new_fails)>0){
		
			###get PAD address if available###
			pad_addr_col_name <- "ADDR2_pad"
			new_fails <- rNYCclean::pad_addr(new_fails,pad_addr_col_name,addr2_col_name,zc_col_name,"zip_code")
			
			new_fails <- new_fails[!(is.na(get(pad_addr_col_name))),]
			
			if(nrow(new_fails)>0){
				###use helper function to geocode and remove fails###
				GC.list <- GC_routine(in_clus=in_clus, in_df=new_fails, id_colname=id_colname, hse_num_col_name=NULL, addr_col_name=pad_addr_col_name, third_col_name=zc_col_name, source_cols=source_cols, geocode_fields=geocode_fields, addr_type="zip_code", GBAT_name=GBAT_name, gc_type=gc.type, results_df=results_df, filter_col_name=f_col, filter_vector=f_vec, split=FALSE)
				
				###assign results df###
				results_df <- GC.list[["results_df"]]
				
				###assign messages df###
				message_df <- rbindlist(list(message_df, GC.list[["message_df"]]),use.names=TRUE)
				
			} else{
			
				###assign messages df###
				message_df <- rbindlist(list(message_df, data.table(num_rec=c(0), desc=c(gc.type), error=c(0))),use.names=TRUE)
			
				cat(paste0(bul_str, " 0 records successfully geocoded by ", gc.type, ".\n"))
			}
		} else{
		
			###assign messages df###
			message_df <- rbindlist(list(message_df, data.table(num_rec=c(0), desc=c(gc.type), error=c(0))),use.names=TRUE)
		
			cat(paste0(bul_str, " 0 records successfully geocoded by ", gc.type, ".\n"))
		}
	
		###########################################################################
		###PART 4a: GEOCODE SUBSTRING OF RAW FREEFORM ADDR2 FIELD USING ZIP CODE###
		###########################################################################
		
		gc.type <- "FREEFORM ADDRESS LINE 2 SUBSTRING WITH ZIP CODE"
		
		###create data frame of addresses that failed### 
		new_fails <- in_df[!(get(id_colname) %in% results_df[,get(id_colname)])]
		
		if(nrow(new_fails)>0){
			addr2_ss_col_name <- "ADDR2_seq_spl"

			###get all possible sequential string combinations###
			new_fails <- rNYCclean::parallel.seqsplt_addr(in_clus, new_fails, addr2_ss_col_name, id_colname, addr2_col_name, zc_col_name)
			
			###use helper function to geocode and remove fails###
			GC.list <- GC_routine(in_clus=in_clus, in_df=new_fails, id_colname=id_colname, hse_num_col_name=NULL, addr_col_name=addr2_ss_col_name, third_col_name=zc_col_name, source_cols=source_cols, geocode_fields=geocode_fields, addr_type="zip_code", GBAT_name=GBAT_name, gc_type=gc.type, results_df=results_df, filter_col_name=f_col, filter_vector=f_vec, split=FALSE)
			
			###assign results df###
			results_df <- GC.list[["results_df"]]
			
			###assign messages df###
			message_df <- rbindlist(list(message_df, GC.list[["message_df"]]),use.names=TRUE)
		
		} else{
		
			###assign messages df###
			message_df <- rbindlist(list(message_df, data.table(num_rec=c(0), desc=c(gc.type), error=c(0))),use.names=TRUE)
		
			cat(paste0(bul_str, " 0 records successfully geocoded by ", gc.type, ".\n"))
		}
		
		########################################################################
		###PART 4b: GEOCODE SUBSTRING OF RAW SPLIT ADDR2 FIELD USING ZIP CODE###
		########################################################################
		
		gc.type <- "SPLIT ADDRESS LINE 2 SUBSTRING WITH ZIP CODE"
		
		###create data frame of addresses that failed### 
		new_fails <- new_fails[!(get(id_colname) %in% results_df[,get(id_colname)])]
		
		if(nrow(new_fails)>0){
		
			###use helper function to geocode and remove fails###
			GC.list <- GC_routine(in_clus=in_clus, in_df=new_fails, id_colname=id_colname, hse_num_col_name=NULL, addr_col_name=addr2_ss_col_name, third_col_name=zc_col_name, source_cols=source_cols, geocode_fields=geocode_fields, addr_type="zip_code", GBAT_name=GBAT_name, gc_type=gc.type, results_df=results_df, filter_col_name=f_col, filter_vector=f_vec, split=TRUE)
			
			###assign results df###
			results_df <- GC.list[["results_df"]]
			
			###assign messages df###
			message_df <- rbindlist(list(message_df, GC.list[["message_df"]]),use.names=TRUE)
			
		} else{
		
			###assign messages df###
			message_df <- rbindlist(list(message_df, data.table(num_rec=c(0), desc=c(gc.type), error=c(0))),use.names=TRUE)
		
			cat(paste0(bul_str, " 0 records successfully geocoded by ", gc.type, ".\n"))
		}
	}
	
	####################################################################
	###PART 5a: GEOCODE REGEX CLEANED FREEFORM ADDRESS USING ZIP CODE###
	####################################################################

	regex_addr_col <- "ADDR_regex"
	
	if(!(is.null(addr2_colname))){
		
		gc.type <- "FREEFORM ADDRESS LINES 1 AND 2 REGEX CLEANED WITH ZIP CODE"
		
		###run address columns 1 and 2 through NYC regular expressions###
		in_df <- rNYCclean::parallel.regex_addr(in_clus=in_clus, in_df=in_df, new_addr_col_name=regex_addr_col, addr1_col_name=addr1_col_name, GBAT_name=GBAT_name, addr2_col_name=addr2_col_name)
		
	} else {
	
		gc.type <- "FREEFORM ADDRESS LINE 1 REGEX CLEANED WITH ZIP CODE"
	
		###run address columns 1 through NYC regular expressions###
		in_df <- rNYCclean::parallel.regex_addr(in_clus=in_clus, in_df=in_df, new_addr_col_name=regex_addr_col, addr1_col_name=addr1_col_name, GBAT_name=GBAT_name)
	}
	
	###create data frame of addresses that failed### 
	new_fails <- in_df[!(get(id_colname) %in% results_df[,get(id_colname)])]
	
	if(nrow(new_fails)>0){
		###use helper function to geocode and remove fails###
		GC.list <- GC_routine(in_clus=in_clus, in_df=new_fails, id_colname=id_colname, hse_num_col_name=NULL, addr_col_name=regex_addr_col, third_col_name=zc_col_name, source_cols=source_cols, geocode_fields=geocode_fields, addr_type="zip_code", GBAT_name=GBAT_name, gc_type=gc.type, results_df=results_df, filter_col_name=f_col, filter_vector=f_vec, split=FALSE)
		
		###assign results df###
		results_df <- GC.list[["results_df"]]
		
		###assign messages df###
		message_df <- rbindlist(list(message_df, GC.list[["message_df"]]),use.names=TRUE)
		
	} else{
	
		###assign messages df###
		message_df <- rbindlist(list(message_df, data.table(num_rec=c(0), desc=c(gc.type), error=c(0))),use.names=TRUE)
	
		cat(paste0(bul_str, " 0 records successfully geocoded by ", gc.type, ".\n"))
	}
	
	#################################################################
	###PART 5b: GEOCODE REGEX CLEANED SPLIT ADDRESS USING ZIP CODE###
	#################################################################
	
	gc.type <- "SPLIT ADDRESS LINES 1 AND 2 REGEX CLEANED WITH ZIP CODE"

	###create data frame of addresses that failed### 
	new_fails <- in_df[!(get(id_colname) %in% results_df[,get(id_colname)])]
	
	if(nrow(new_fails)>0){
		###use helper function to geocode and remove fails###
		GC.list <- GC_routine(in_clus=in_clus, in_df=new_fails, id_colname=id_colname, hse_num_col_name=NULL, addr_col_name=regex_addr_col, third_col_name=zc_col_name, source_cols=source_cols, geocode_fields=geocode_fields, addr_type="zip_code", GBAT_name=GBAT_name, gc_type=gc.type, results_df=results_df, filter_col_name=f_col, filter_vector=f_vec, split=TRUE)
		
		###assign results df###
		results_df <- GC.list[["results_df"]]
		
		###assign messages df###
		message_df <- rbindlist(list(message_df, GC.list[["message_df"]]),use.names=TRUE)
		
	} else{
	
		###assign messages df###
		message_df <- rbindlist(list(message_df, data.table(num_rec=c(0), desc=c(gc.type), error=c(0))),use.names=TRUE)
	
		cat(paste0(bul_str, " 0 records successfully geocoded by ", gc.type, ".\n"))
	}
	
	#################################################################################
	###PART 6a: GEOCODE SUBSTRING OF REGEX CLEANED FREEFORM ADDRESS USING ZIP CODE###
	#################################################################################
	
	gc.type <- "FREEFORM ADDRESS LINES 1 AND 2 REGEX CLEANED SUBSTRING WITH ZIP CODE"
	
	###create data frame of addresses that failed###  
	new_fails <- in_df[!(get(id_colname) %in% results_df[,get(id_colname)])]
	
	
	if(nrow(new_fails)>0){
		addr_ss_col_name <- "ADDR_regex_seq_spl"

		###get all possible sequential string combinations###
		new_fails2 <- rNYCclean::parallel.seqsplt_addr(in_clus, new_fails, addr_ss_col_name, id_colname, regex_addr_col, zc_col_name)
		
		new_fails <- merge(new_fails,new_fails2,by=c(id_colname,zc_col_name))
		
		###use helper function to geocode and remove fails###
		GC.list <- GC_routine(in_clus=in_clus, in_df=new_fails, id_colname=id_colname, hse_num_col_name=NULL, addr_col_name=addr_ss_col_name, third_col_name=zc_col_name, source_cols=source_cols, geocode_fields=geocode_fields, addr_type="zip_code", GBAT_name=GBAT_name, gc_type=gc.type, results_df=results_df, filter_col_name=f_col, filter_vector=f_vec, split=FALSE)
		
		###assign results df###
		results_df <- GC.list[["results_df"]]
		
		###assign messages df###
		message_df <- rbindlist(list(message_df, GC.list[["message_df"]]),use.names=TRUE)
		
	} else{
	
		###assign messages df###
		message_df <- rbindlist(list(message_df, data.table(num_rec=c(0), desc=c(gc.type), error=c(0))),use.names=TRUE)
	
		cat(paste0(bul_str, " 0 records successfully geocoded by ", gc.type, ".\n"))
	}
	
	##############################################################################
	###PART 6b: GEOCODE SUBSTRING OF REGEX CLEANED SPLIT ADDRESS USING ZIP CODE###
	##############################################################################
	
	gc.type <- "SPLIT ADDRESS LINES 1 AND 2 REGEX CLEANED SUBSTRING WITH ZIP CODE"
	
	###create data frame of addresses that failed###  
	new_fails <- new_fails[!(get(id_colname) %in% results_df[,get(id_colname)])]
	
	if(nrow(new_fails)>0){
		###use helper function to geocode and remove fails###
		GC.list <- GC_routine(in_clus=in_clus, in_df=new_fails, id_colname=id_colname, hse_num_col_name=NULL, addr_col_name=addr_ss_col_name, third_col_name=zc_col_name, source_cols=source_cols, geocode_fields=geocode_fields, addr_type="zip_code", GBAT_name=GBAT_name, gc_type=gc.type, results_df=results_df, filter_col_name=f_col, filter_vector=f_vec, split=TRUE)
		
		###assign results df###
		results_df <- GC.list[["results_df"]]	
		
		###assign messages df###
		message_df <- rbindlist(list(message_df, GC.list[["message_df"]]),use.names=TRUE)
		
	} else{
	
		###assign messages df###
		message_df <- rbindlist(list(message_df, data.table(num_rec=c(0), desc=c(gc.type), error=c(0))),use.names=TRUE)
	
		cat(paste0(bul_str, " 0 records successfully geocoded by ", gc.type, ".\n"))
	}
	
	
	##################################################
	###MAKE USPS VERIFICATION OPTIONAL - 02/01/2019###
	##################################################
	
	if(USPS_verify){
	
		
		###################################################################
		###ERROR HANDLING TO DEAL WITH USPS VERIFICATION SERVICE FAILURE###
		###################################################################
		
		USPS_error <- FALSE 
		
		
		############################################################################################
		###PART 7a: USPS VALIDATION OF FREEFORM ADDRESS LINES 1 AND 2 USPS VERIFIED WITH ZIP CODE###
		############################################################################################

		gc.type <- paste0("FREEFORM ADDRESS LINE", ifelse(!(is.null(addr2_colname)),"S 1 AND 2"," 1")," USPS VERIFIED WITH ZIP CODE")
		
		###prepare the city column by applying a function that removes illegal characters###	
		city_col_name <- "CITY_new"
		in_df[,(city_col_name) := rNYCclean::prep_addr(get(city_colname))]	
		
		###add column for state###
		state_col_name <- "STATE_new"
		in_df[,(state_col_name) := "NY"]
		
		###create data frame of addresses that failed###  
		new_fails <- in_df[!(get(id_colname) %in% results_df[,get(id_colname)])]
		
		if(nrow(new_fails)>0){
			###error handling for the buggy USPS service###
			
			if(!(is.null(addr2_colname))){
			
				###run addresses through the IBM USPS Address Verification Service and return in a geocoder friendly format###
				USPS_df <- try(rUSPS::USPS_addr_gc(in_df = new_fails, addr1_col_name = addr1_col_name, addr2_col_name = addr2_col_name, city_col_name = city_col_name, state_col_name = state_col_name, zip_col_name = zc_col_name, source_cols = source_cols), silent=TRUE)
				
				if("try-error" %in% class(USPS_df)) {
					###re-run with fewer cores###
					#USPS_df <- try(rUSPS::USPS_addr_gc(in_df = new_fails, addr1_col_name = addr1_col_name, addr2_col_name = addr2_col_name, city_col_name = city_col_name, state_col_name = state_col_name, zip_col_name = zc_col_name, source_cols = source_cols))
					
					USPS_error <- TRUE
				}
			} else{
			
				###run addresses through the IBM USPS Address Verification Service and return in a geocoder friendly format###
				USPS_df <- try(rUSPS::USPS_addr_gc(in_df = new_fails, addr1_col_name = addr1_col_name, city_col_name = city_col_name, state_col_name = state_col_name, zip_col_name = zc_col_name, source_cols = source_cols), silent=TRUE)
				
				if("try-error" %in% class(USPS_df)) {
					###re-run with fewer cores###
					#USPS_df <- try(rUSPS::USPS_addr_gc(in_df = new_fails, addr1_col_name = addr1_col_name, city_col_name = city_col_name, state_col_name = state_col_name, zip_col_name = zc_col_name, source_cols = source_cols))
					
					USPS_error <- TRUE
				}
			}
			
			if(USPS_error){
			
				###assign messages df###
				message_df <- rbindlist(list(message_df, data.table(num_rec=c(0), desc=c(gc.type), error=c(1))),use.names=TRUE)
				
				cat(paste0(bul_str, "ERROR!!! 0 records successfully geocoded by ", gc.type, ".\n"))
			
			} else{
			
				###use helper function to geocode and remove fails###
				GC.list <- GC_routine(in_clus=in_clus, in_df=USPS_df, id_colname=id_colname, hse_num_col_name=NULL, addr_col_name="deliveryaddressline1_cass",third_col_name="zip5_cass", source_cols=source_cols, geocode_fields=geocode_fields, addr_type="zip_code", GBAT_name=GBAT_name, gc_type=gc.type, results_df=results_df, filter_col_name=f_col, filter_vector=f_vec, split=FALSE)
				
				###assign results df###
				results_df <- GC.list[["results_df"]]
				
				###assign messages df###
				message_df <- rbindlist(list(message_df, GC.list[["message_df"]]),use.names=TRUE)
			}
			
		} else{
		
			###assign messages df###
			message_df <- rbindlist(list(message_df, data.table(num_rec=c(0), desc=c(gc.type), error=c(0))),use.names=TRUE)
		
			cat(paste0(bul_str, " 0 records successfully geocoded by ", gc.type, ".\n"))
		}
		
		#########################################################################################
		###PART 7b: USPS VALIDATION OF SPLIT ADDRESS LINES 1 AND 2 USPS VERIFIED WITH ZIP CODE###
		#########################################################################################	

		gc.type <- paste0("SPLIT ADDRESS LINE", ifelse(!(is.null(addr2_colname)),"S 1 AND 2"," 1")," USPS VERIFIED WITH ZIP CODE")
		
		if(nrow(new_fails)>0 & !(USPS_error)){
			###create data frame of addresses that failed###  
			new_fails <- USPS_df[!(get(id_colname) %in% results_df[,get(id_colname)])]
		
			###use helper function to geocode and remove fails###
			GC.list <- GC_routine(in_clus=in_clus, in_df=new_fails, id_colname=id_colname, hse_num_col_name=NULL, addr_col_name="deliveryaddressline1_cass",third_col_name="zip5_cass", source_cols=source_cols, geocode_fields=geocode_fields, addr_type="zip_code", GBAT_name=GBAT_name, gc_type=gc.type, results_df=results_df, filter_col_name=f_col, filter_vector=f_vec, split=TRUE)
			
			###assign results df###
			results_df <- GC.list[["results_df"]]
			
			###assign messages df###
			message_df <- rbindlist(list(message_df, GC.list[["message_df"]]),use.names=TRUE)
			
		} else{
		
			if(USPS_error){
				
				###assign messages df###
				message_df <- rbindlist(list(message_df, data.table(num_rec=c(0), desc=c(gc.type), error=c(1))),use.names=TRUE)
			
				cat(paste0(bul_str, "ERROR!!! 0 records successfully geocoded by ", gc.type, ".\n"))
				
			} else{
			
				###assign messages df###
				message_df <- rbindlist(list(message_df, data.table(num_rec=c(0), desc=c(gc.type), error=c(0))),use.names=TRUE)
			
				cat(paste0(bul_str, " 0 records successfully geocoded by ", gc.type, ".\n"))
			
			}
			
		}
	
	}
	
	
	#########################################################################################################################
	###PART 8a: GEOCODE FREEFORM ADDRESS WITH ALTERED HOUSE NUMBER (e.g., 1 123 SMITH ST -> 1-123 SMITH ST) USING ZIP CODE###
	#########################################################################################################################
	
	gc.type <- paste0("FREEFORM ADDRESS LINE", ifelse(!(is.null(addr2_colname)),"S 1 AND 2"," 1")," REGEX CLEANED WITH ALTERED HOUSE NUMBER WITH ZIP CODE")
	
	###create data frame of addresses that failed### 
	new_fails <- in_df[!(get(id_colname) %in% results_df[,get(id_colname)])]
	
	if(nrow(new_fails)>0){
		regex_addr_col2 <- "ADDR_regex_hnum"
			
		hnum_df <- new_fails[grepl("^[[:digit:]]{1,} [[:digit:]]{1,}.*$",get(regex_addr_col))]
			
		if(nrow(hnum_df)>0){	
			hnum_df[,(regex_addr_col2) := gsub("^([[:digit:]]{1,}) ([[:digit:]]{1,}.*)$","\\1-\\2",get(regex_addr_col))]

			###use helper function to geocode and remove fails###
			GC.list <- GC_routine(in_clus=in_clus, in_df=hnum_df, id_colname=id_colname, hse_num_col_name=NULL, addr_col_name=regex_addr_col2, third_col_name=zc_col_name, source_cols=source_cols, geocode_fields=geocode_fields, addr_type="zip_code", GBAT_name=GBAT_name, gc_type=gc.type, results_df=results_df, filter_col_name=f_col, filter_vector=f_vec, split=FALSE)
			
			###assign results df###
			results_df <- GC.list[["results_df"]]
			
			###assign messages df###
			message_df <- rbindlist(list(message_df, GC.list[["message_df"]]),use.names=TRUE)
			
		} else{
		
			###assign messages df###
			message_df <- rbindlist(list(message_df, data.table(num_rec=c(0), desc=c(gc.type), error=c(0))),use.names=TRUE)
		
			cat(paste0(bul_str, " 0 records successfully geocoded by ", gc.type, ".\n"))
		}
	} else{
	
		###assign messages df###
		message_df <- rbindlist(list(message_df, data.table(num_rec=c(0), desc=c(gc.type), error=c(0))),use.names=TRUE)
	
		cat(paste0(bul_str, " 0 records successfully geocoded by ", gc.type, ".\n"))
	}
	
	######################################################################################################################
	###PART 8b: GEOCODE SPLIT ADDRESS WITH ALTERED HOUSE NUMBER (e.g., 1 123 SMITH ST -> 1-123 SMITH ST) USING ZIP CODE###
	######################################################################################################################
	
	gc.type <- paste0("SPLIT ADDRESS LINE", ifelse(!(is.null(addr2_colname)),"S 1 AND 2"," 1")," REGEX CLEANED WITH ALTERED HOUSE NUMBER WITH ZIP CODE")
	
	if(exists('hnum_df') && is.data.frame(get('hnum_df'))){
		hnum_df <- hnum_df[!(get(id_colname) %in% results_df[,get(id_colname)])]
		
		if(nrow(hnum_df)>0){	
			###use helper function to geocode and remove fails###
			GC.list <- GC_routine(in_clus=in_clus, in_df=hnum_df, id_colname=id_colname, hse_num_col_name=NULL, addr_col_name=regex_addr_col2, third_col_name=zc_col_name, source_cols=source_cols, geocode_fields=geocode_fields, addr_type="zip_code", GBAT_name=GBAT_name, gc_type=gc.type, results_df=results_df, filter_col_name=f_col, filter_vector=f_vec, split=TRUE)
			
			###assign results df###
			results_df <- GC.list[["results_df"]]
			
			###assign messages df###
			message_df <- rbindlist(list(message_df, GC.list[["message_df"]]),use.names=TRUE)
			
		} else{
		
			###assign messages df###
			message_df <- rbindlist(list(message_df, data.table(num_rec=c(0), desc=c(gc.type), error=c(0))),use.names=TRUE)
		
			cat(paste0(bul_str, " 0 records successfully geocoded by ", gc.type, ".\n"))
		}
	} else{
	
		###assign messages df###
		message_df <- rbindlist(list(message_df, data.table(num_rec=c(0), desc=c(gc.type), error=c(0))),use.names=TRUE)
	
		cat(paste0(bul_str, " 0 records successfully geocoded by ", gc.type, ".\n"))
	}
	
	############################
	###MERGE PASSES AND FAILS###
	############################
		
	rejects_df <- rejects_df[!(get(id_colname) %in% results_df[,get(id_colname)])]
	
	gc_type <- 'failed to return at least BBL'
	
	###assign messages df###
	message_df <- rbindlist(list(message_df, data.table(num_rec=c(nrow(rejects_df)), desc=c(gc.type), error=c(0))),use.names=TRUE)
	
	if(nrow(rejects_df) > 0){
	
		rejects_df[,USPS_comment := ""]
		
		if(exists('USPS_df') && is.data.frame(get('USPS_df'))){
		
			###merge to USPS_df to get validity of address###
			rejects_df <- merge(rejects_df, USPS_df[,c(id_colname,"dpvmatchflag_cass","carrierroute_cass","boro_code_cass"),with=FALSE], by=id_colname, all.x=TRUE)
			
			rejects_df[,USPS_DPV := ifelse(dpvmatchflag_cass %in% c("Y","D","S"),"valid","invalid")]
			
			#B = PO BOX
			#C = CITY
			#G = GENERAL DELIVERY
			#H = HIGHWAY
			#R = RURAL
			
			rejects_df[,USPS_type := ifelse(dpvmatchflag_cass %in% c("Y","D","S"), 
				ifelse( substr(carrierroute_cass, 1, 1)=="C", "city delivery", 
					ifelse(substr(carrierroute_cass, 1, 1)=="B","PO BOX", 
						ifelse(substr(carrierroute_cass, 1, 1)=="G", "general delivery",
							ifelse(substr(carrierroute_cass, 1, 1)=="R", "rural delivery","highway")))),"")] 
			
			
			rejects_df[,USPS_boro := ifelse(dpvmatchflag_cass %in% c("Y","D","S"), ifelse(boro_code_cass > 0, "within NYC","outside of NYC"),"")]			
						
			rejects_df[,USPS_comment := gsub("(?<=[\\s])\\s*|^\\s+|\\s+$", "", paste(USPS_DPV,"USPS",USPS_type,"address",USPS_boro,sep=" "), perl=TRUE)]
			
			rejects_df[,c('USPS_DPV','USPS_type','USPS_boro','carrierroute_cass','boro_code_cass','dpvmatchflag_cass') := NULL]
			
		}
		
		rejects_df <- merge(rejects_df, in_df[,c(id_colname,regex_addr_col),with=FALSE], by=id_colname, all.x=TRUE)
		
		rejects_df[,REGEX_ASSUM := ifelse(get(regex_addr_col) %in% c("UNKNOWN","HOMELESS"),paste0(", suspected ",get(regex_addr_col)),"")]
		
		rejects_df[,GC.type := paste0(gc_type,", ", USPS_comment, REGEX_ASSUM)]
		
		rejects_df[,c('USPS_comment','REGEX_ASSUM',regex_addr_col) := NULL]
			
		results_df <- data.table::rbindlist(list(rejects_df,results_df), use.names=TRUE, fill=TRUE)
		
	}
	
	p_num <- as.character(round(((nrow(rejects_df)/nrow(in_df))*100),2))
	
	ptm2 <- proc.time() - ptm
	
	time_dur <- format(.POSIXct(as.numeric(ptm2[3]),tz="GMT"), "%H:%M:%S")
	
	cat(paste0("\tTime to process ", nrow(results_df)," records: ",time_dur,"\n"))
	
	cat(paste0("\t",p_num, "% of records (n = ", nrow(rejects_df), ") ", gc_type, ".\n"))
	
	invisible(gc())

	results_df <- unique(results_df)
	
	###delete f_col if not present in input geocode fields###
	if(!(f_col %in% gc_flds)) results_df[,c(gc_flds) := NULL]
	
	if(!is.DT) results_df <- as.data.frame(results_df)
	
	assign("NYC.CleanGeoZip_metadata", unique(message_df), envir = .GlobalEnv)
	
	return(results_df)
	
}
# gmculp/rBES documentation built on May 25, 2019, 11:31 p.m.