#' The \code{pad_addr} function performs a substring match between a data frame of NYC addresses and NYC Department of City Planning's (DCP) PAD (Property Address Directory) and returns the PAD address, if available.
#'
#' @title NYC DCP PAD address match on a data frame of NYC addresses.
#' @name pad_addr
#' @aliases pad_addr
#' @import data.table
#' @export pad_addr
#' @param in_df a data frame containing NYC addresses. Required.
#' @param addr_col_name the name of the input addresses column as string. Required.
#' @param third_col_name the name of either the borough code or zip code column as string. Required.
#' @param new_addr_col_name the name of the output addresses column as string. Required.
#' @param third_col_type either \code{"boro_code"} or \code{"zip_code"} as string. Required.
#' @param return_type option to exclude address which failed to match from output as string. Optional.
#' @return A data frame containing the input data frame plus the PAD address column.
#' @usage pad_addr(in_df, new_addr_col_name, addr_col_name, third_col_name,
#' third_col_type, return_type = "all")
#' @examples # create a data frame of addresses
#' ADDR <- c("80 CENTRE","125 WORTH S","42 09 28 S","253 BROADW",
#' "620 ATLANT","125 WOR","1 FRANKLIN","1 FRANKLIN",
#' "1 1 1 AVE","1 1 1 AVE")
#' BORO_CODE <- c(1,1,4,1,3,1,3,3,1,1)
#' ZIP_CODE <- c('10013','10013','11101','10007','11217','10013',
#' '11222','11249','10003','10014')
#' u_id <- 1:length(ADDR)
#' df = data.frame(u_id, ADDR, BORO_CODE, ZIP_CODE)
#'
#' #get version of DCP PAD used to build package data
#' rNYCclean::pad_version
#'
#' #get PAD address using borough code
#' #NOTE: slow due to expansive search area (entire borough)
#' system.time({df1 <- pad_addr(df,"ADDR.pad","ADDR","BORO_CODE","boro_code")})
#'
#' #preview records
#' head(df1)
#'
#' #get PAD address using ZIP code
#' #NOTE: much faster due to localized search area (single ZIP code)
#' system.time({df2 <- pad_addr(df,"ADDR.pad","ADDR","ZIP_CODE","zip_code")})
#'
#' #preview records
#' head(df2)
#'
pad_addr <- function(in_df, new_addr_col_name, addr_col_name, third_col_name, third_col_type, return_type = "all"){
###detect if data.table or data.frame###
is.DT <- "data.table" %in% class(in_df)
if(!is.DT) in_df <- as.data.table(in_df)
o_cols <- names(in_df)
if((addr_col_name %in% o_cols) & (third_col_name %in% o_cols) & !(new_addr_col_name %in% o_cols)) {
###determine third column name for look-up table###
this.col <- ifelse(grepl("zip",third_col_type,ignore.case=TRUE),'NYC_ab.zipcode','NYC_ab.boro')
###remove spaces and hyphens###
in_df[,ADDR1s := gsub("-","",toupper(gsub("\\s","",as.character(get(addr_col_name)))))]
###assign third column from input data.frame as arbitrary column###
in_df[,temp_col_123 := as.character(get(third_col_name))]
###grab unique addresses###
temp_df <- unique(in_df[,c("ADDR1s","temp_col_123")])
###assign temporary unique id###
temp_df[,temp_id_123 := .I]
###produce sorted unique vector of string lengths###
l_v <- sort(unique(nchar(temp_df$ADDR1s)))
###substring merge###
out_df <- data.table::rbindlist(lapply(l_v, function(i){
###attempt to limit number of records###
dt.x <- temp_df[nchar(temp_df$ADDR1s)==i,]
###make one string column by pasting address substring and third column together###
dt.x[,ADDR1s123 := paste(ADDR1s,temp_col_123,sep="_")]
dt.y <- unique(NYC_address_bank[get(this.col) %in% dt.x$temp_col_123, c("NYC_ab.ADDR1","NYC_ab.ADDR1s",this.col), with=FALSE])
dt.y[,NYC_ab.ADDR1s2 := substr(NYC_ab.ADDR1s, 1, i)]
dt.y[,NYC_ab.ADDR1s := NULL]
dt.y <- unique(dt.y)
dt.y[,n.rows := .N, by=list(NYC_ab.ADDR1s2,get(this.col))]
dt.y <- dt.y[n.rows == 1]
dt.y[,NYC_ab.ADDR1s123 := paste(NYC_ab.ADDR1s2,get(this.col),sep="_")]
dt.x[, NYC_ab.ADDR1:= dt.y[chmatch(ADDR1s123,dt.y$NYC_ab.ADDR1s123),]$NYC_ab.ADDR1]
return(dt.x[,c("ADDR1s","temp_col_123","NYC_ab.ADDR1"), with=FALSE])
}))
data.table::setnames(out_df, c("NYC_ab.ADDR1"), c(new_addr_col_name))
all_x <- ifelse(return_type == "all",TRUE,FALSE)
###merge back to original table###
out_df <- merge(in_df,out_df,by=c("ADDR1s","temp_col_123"),all.x=all_x)
###flush memory###
invisible(gc())
out_df <- out_df[,c(o_cols,new_addr_col_name),with=FALSE]
if(!is.DT) out_df <- as.data.frame(out_df)
return(out_df)
} else {
stop("Please check your column names.")
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.