R/assign.locations.R

#' \code{assign.locations} package
#'
#' assign.locations
#'
#' See the Vignette in the future
#'
#' @docType package
#' @name assign.locations
#' @importFrom dplyr %>% select
NULL

## quiets concerns of R CMD check re: the .'s that appear in pipelines
if(getRversion() >= "2.15.1")  utils::globalVariables(c("."))

#' @title funImport
#'
#' @description Assigns address to shapes
#' @param D raw data with all fields
#' @param source data source name
#' @param match.threshold string match quality
#' @param l.study.extent list with cities, zips, states
#' @param pkg.data.root path to dropbox pkg.data root
#' @keywords parcels, clean
#' @export
#' @import stringr
#'     data.table
#'     stringdist
#'     methods.string
#'     foreign
#'     parcels
#'     pkg.data.paths
funImport <- function(D, source='', match.threshold=0.6, l.study.extent=NULL,
                      pkg.data.root = NULL){
  pkg.name <- pkg.data.path <- address.id <- NULL
  # Build the substrate
  l.address <- fun.l.address(pkg.data.root)
  l <- list(); mgs.list <- list(); l$D <- D;
  l$source <- source; l$match.threshold<-match.threshold
  if (is.null(l.study.extent)){
    dt.pkg.data <- pkg.data.paths::paths(pkg.data.root, str.pkg.name = 'parcels')
    parcels.address <- parcels::address(dt.pkg.data$pkg.root[1])
    l.study.extent <- list(cities=unique(parcels.address$city), zips = unique(parcels.address$zip),
                           states=unique(parcels.address$state))
  }
  # Generate checklis
  check <- funImport.check.list(l$D)
  # Break clean if any of these condtions are true
  if (check$location.id & l$source != 'parcels.address') stop('Location already assigned')
  if (check$data.id) stop('data.id automatically created. Please remove')
  if (source==''){stop('must define data source (example: mmed.denver') } else {l$D$record.source <- l$source}
  # Define record ids
  if (check$record.id==FALSE) l$D$record.id <- as.character(1:nrow(l$D))
  # Define address ids
  l$D <- funImport.address.ids(l$D)
  l$data <- funImport.data(l) # Retains record.id specific info
  l$address <- funImport.address(l$D, lookup.address=l.address$parcels) # Cleans and formats address for location assignment
  l$location <- funImport.locations(l, match.threshold, l.study.extent, pkg.data.path) # Assigns address to parcel, outline, point
  setkey(l$data, address.id)
  setkey(l$address, address.id)
  DT <- l$data[l$address]
  setkey(DT, address.id)
  setkey(l$location, address.id)
  DT <- DT[l$location, allow.cartesian=TRUE]
  c.head <- c('record.id', 'address.id', 'location.type', 'location.id', 'street', 'city', 'state', 'zip')
  c.remain <- names(DT)[!(names(DT)%in%c.head)]
  cols <- c(c.head, c.remain)
  DT <- DT[, (cols), with=FALSE]
  return(DT)
}
#' @title funImport.check.list
#'
#' @description checks to see what data fields are being used
#' @param x data.table
#' @keywords check, import, spatial
#' @export
#' @import stringr
#'     data.table
funImport.check.list <- function(x){
  pkg.name <- pkg.data.path <- address.id <- NULL
  l <- list()
  l$location.id <- ('location.id' %in% names(x))
  l$data.id <- ('data.id' %in% names(x))
  l$cityStateZip <- ('cityStateZip' %in% names(x))
  l$street <- ('street' %in% names(x))
  l$city <- ('city' %in% names(x))
  l$state <- ('state' %in% names(x))
  l$zip <- ('zip' %in% names(x))
  l$name <- ('name' %in% names(x))
  l$name.key <- ('name.key' %in% names(x))
  l$name.lic <- ('name.lic' %in% names(x))
  l$record.id <- ('record.id' %in% names(x))
  l$address <- length(grep('(?i)(?=^)address(?!.id)', names(x), value=TRUE, perl=TRUE))>0
  return(l)
}
#' @title funImport.address
#'
#' @description Explodes and cleans address
#' @param x data.table
#' @param lookup.address data.table used to fill missing zips and cities usually parcel.address
#' @keywords check, import, spatial
#' @export
#' @import stringr
#'     data.table
#'     methods.string
#'     parcels
#'     stats
funImport.address <- function(x, lookup.address=NULL){
  city <- state <- zip <- geocode.id <- address.id <- address.cols <- NULL
  l <- na.omit <- address.id <- pkg.name <- NULL
  street.num.temp <- NULL
  regex.address <- '(?i)(^|.*.)address($|.*.)'
  regex.geo <- paste0('(?i)(^)(',paste0(c('city', 'state', 'zip', 'street', 'unit','cityStateZip'),collapse='|'),')($)|',regex.address)
  cols <- names(x)[str_detect(names(x), regex(regex.geo, perl=TRUE))]
  x <- x[,(cols),with=FALSE]
  x <- unique(x, by='address.id')
  # Formats for names from columns
  check <- funImport.check.list(x)
  # Build the street info
  if (check$street==TRUE){
    street <- explode.street(x$street)
    street$street <- NULL
    x <- data.table(x, street)
  } else {
    stop(paste('Error in funImport.address: No street for dataset', l$source))
  }
  # Try and fill city state zip
  if (check$city == FALSE|check$state==FALSE|check$zip==FALSE){
    if (check$cityStateZip==TRUE){
      cityStateZip <- methods.string::explode.cityStateZip(x)
        if (check$city==FALSE) x$city <- cityStateZip$city
      if (check$state==FALSE) x$state <- cityStateZip$state
      if (check$zip==FALSE) x$zip <- cityStateZip$zip
      x$cityStateZip <- NULL
      check <- funImport.check.list(x)
    } 
    if (check$cityStateZip==FALSE){
      if (check$city==FALSE & check$zip==FALSE) stop(paste('Error in funImport.address: No city or zip for dataset', l$source))
      if (check$city==FALSE) x$city <- ''
      if (check$zip==FALSE) x$zip <- ''
    }
    if (check$address==TRUE){ # What is address (think it is full 134 45th street, denver, co 402389)
      address.cols <- stats::na.omit(str_extract(names(x), regex(regex.address, perl=TRUE)))
      address.col <- names(which.max(lapply(x[,(address.cols),with=FALSE], function(y) length(na.omit(y)))))[1]
      address <- methods.string::explode.cityStateZip(x[, address.col, with=FALSE])
      if (check$city==FALSE) x$city <- address$city
      if (check$state==FALSE) x$state <- address$state
      if (check$zip==FALSE) x$zip <-address$zip
    }
    if(check$state==FALSE){
      dt.states <- methods.string::fill.missing.state(x)
      setkey(x, address.id)
      x <- dt.states[x]
    } 
    check <- funImport.check.list(x)
    #x <- try(methods.string::fill.missing.zip.city(x, lookup.address))
  }
  x$zip <- methods.string::extract.zip(x$zip)
  x$city <- methods.string::clean.city(x$city)
  if(check$state==FALSE & check$zip==TRUE){
    dt.states <- methods.string::fill.missing.state(x)
    setkey(x, address.id)
    x <- dt.states[x]
  }
  check <- funImport.check.list(x)
  if(check$state==TRUE){
    x$state <- methods.string::extract.state(x$state)
  }
  return(x)
}
#' @title funImport.data
#'
#' @description checks to see what data fields are being used
#' @param l list of data
#' @keywords check, import, spatial
#' @export
#' @import stringr
#'     data.table
#'     methods.string
funImport.data <- function(l){
  x <- l$D
  check <- funImport.check.list(x)
  regex.geo <- paste0('(?i)(^(',paste0(c('city', 'state', 'zip', 'unit', 'street'),collapse='|'),')($))')
  cols <- names(x)[!str_detect(names(x), regex(regex.geo, perl=TRUE))]
  x <- x[,(cols),with=FALSE]
  # name
  if (check$name.lic){
    x$name.lic <- clean.name(x$name.lic)
    x$name.lic.key <- clean.name.key(x$name.lic)
  } else {
    x$name.lic <- ''
    x$name.lic.key <- ''
  }  
  # name.key
  if (check$name){
    x$name <- clean.name(x$name)
    x$name.key <- clean.name.key(x$name)
  } else {
    x$name <- ''
    x$name.key <- ''
  }
  # Address.alts
  if (check$address){
    address.cols<-na.omit(str_extract(names(x), '(?i).*.address($|.*.)'))
    address.cols.new<- paste0(address.cols,'.alt')
    setnames(x, address.cols, address.cols.new)
  }
  return(x)
}
#' @title funImport.address.ids
#'
#' @description Creates unique address.ids for places that are different
#' @param x data.table
#' @keywords address.ids
#' @export
#' @import stringr
#'     data.table
funImport.address.ids <- function(x){
  address.id <- NULL
  regex.address <- '(?i)(^|.*.)address($|.*.)'
  regex.geo <- paste0('(?i)(^)(',paste0(c('city', 'state', 'zip', 'street', 'unit','cityStateZip'),collapse='|'),')($)|',regex.address)
  cols <- names(x)[str_detect(names(x), regex(regex.geo, perl=TRUE))]
  x <- x[,address.id:=.GRP, by=cols]
  return(x)
}
#' @title funImport.locations
#'
#' @description assigns address to locations (shapefiles)
#' @param l list with all location data.tables
#' @param match.threshold string matching criteria
#' @param l.study.extent cities, zips, states
#' @param pkg.data.root dropbox pkg root
#' @keywords assign, location, import
#' @export
#' @import stringr
#'     data.table
#'     methods.string
#'     api.keys
funImport.locations <- function(l, match.threshold, l.study.extent, pkg.data.root = NULL){
  file.name <- street <- zip <- city <- NULL
  api.key <- api.keys::import.key(str.api.name='google')
  address <- l$address
  address.ids <- address$address.id
  if (!(is.null(l.study.extent))){
    address <- address[zip %in% l.study.extent$zips | city %in% l.study.extent$cities]
  }
  address <- address[!(is.na(street))]
  locs <- funUpdate.locs(address, match.threshold, pkg.data.root)
  locs <- funImport.location.geocode(locs, address, pkg.data.root, api.key, l.study.extent, l$source)
  locs <- funImport.location.intersect(locs, address, pkg.data.root, l$source)
  return(locs)
}
#' @title funImport.location.address
#'
#' @description string matches address (x) to shapefiles address
#' @param l.location points, parcels, geocodes.bad, outlines
#' @param x the address file (to be assigned to l.locations)
#' @keywords assign location import
#' @export
#' @import stringr
#'     data.table
#'     methods.string
#'     tidyr
funImport.location.address <- function(l.location, x){
  street.num.range <- street.num <- street.num.low <- street.num.hi <- NULL
  data.street.num <- data.street.direction <- street.direction.prefix <- NULL
  combn <- data.num.max <- address.id <- location.id <- DT.geo <- NULL
  match.score <- location.street.direction <- location.source <- NULL
  match.cols.n <- match.score <- location.street.direction <- NULL
  street.body <- max.score <- location.match.score <- street <- NULL
  i.comb <- location.street.num <- city <- state <- zip <- NULL
  geocode.id <- adddress.id <- street.num.temp <- NULL
  
  # data: costar.adddress, mmed, etc.
  # locations: parcel.address, point.address
  cat(paste('Assigning', l.location$type), sep='\n')
  data <- x # MMED
  if(!is.null(data$street.num.hi)) data$street.num <- NULL; 
  data <- data %>% # For roll join
    gather(street.num.range, street.num, street.num.low:street.num.hi)
  data <- as.data.table(data)
  data$street.num.range<-NULL
  data <- data[, data.street.num := street.num]
  data <- data[, data.street.direction := street.direction.prefix]
  setkey(data,NULL)
  data <- unique(data)  
  
  location <- l.location$address # Parcels, points, outlines
  match.threshold <- l.location$match.threshold
  type <- l.location$type
  if (any(str_detect(names(data), 'record.id'))) data$record.id<-NULL
  if (missing(match.threshold)|match.threshold>0.9) match.threshold <- 0.8
  # Clone fields for join
  location$location.street.num <- location$street.num
  location$location.street.direction <- location$street.direction.prefix
  # Create combinations of street fields for matching algo
  cols.fixed <- c('street.body', 'city', 'street.num', 'state')
  cols.var.list <- list(list('street.direction.prefix', 10),
                        list('street.type', 10),
                        list('street.direction.suffix', 6),
                        list('street.unit.type', 3),
                        list('street.unit', 3),
                        list('zip',10))
  cols.var.DT <- rbindlist(cols.var.list)
  setnames(cols.var.DT, names(cols.var.DT), c('var', 'score'))
  vec.m <- seq(nrow(cols.var.DT), 1, -1)
  comb.vars <- unlist(lapply(vec.m, combn, x = cols.var.DT$var, simplify=FALSE),recursive = FALSE)
  comb.scores <- sapply(unlist(lapply(vec.m, combn, x = cols.var.DT$score, simplify=FALSE),recursive = FALSE),sum)
  combs <- cbind(comb.vars,comb.scores)
  
  # Set match.threshold
  best.score <- max(unlist(combs[,2]))
  scores.pct <- comb.scores/best.score
  comb.scores.test <- scores.pct > match.threshold
  combs <- combs[comb.scores.test,]
  combs.n <- nrow(combs)
  
  DT.list <- list()
  start.time <- Sys.time()
  for(i.comb in 1:combs.n){
    cols.comb <- combs[i.comb,]$comb.vars
    score <- combs[i.comb,]$comb.scores
    cols <-c(cols.comb, cols.fixed)
    setkeyv(data, cols)
    setkeyv(location, cols)
    cols.data <- c('address.id', 'data.street.direction', cols)
    data.sub <- data[, (cols.data), with=FALSE]
    if (nrow(data.sub)>0){ # Added tghe on 10/7/16 for cases where there is no matches (single address queries), may be buggy
      cols.locations <- c('location.id', 'location.source', 'location.street.direction', 'location.street.num', cols)
      locations.sub <- location[, (cols.locations), with=FALSE]
      setkeyv(data.sub, cols)
      setkeyv(locations.sub, cols)
      data.sub <- data.sub[, data.num.max:=max(street.num), by=address.id]
      data.location <- data.sub[locations.sub, roll=Inf][!is.na(address.id) & location.street.num<=data.num.max]
      if (nrow(data.location)>0){
        setkey(data.location, address.id, location.id)
        data.location <- unique(data.location)
        data.location[ ,match.cols.n := length(cols)]
        data.location[ ,match.score := score]
        data.location[ ,i.comb := i.comb]
        DT.list[[i.comb]]<-data.location
      }
    }
  }
  end.time <- Sys.time()
  # print(end.time-start.time)
  DT <- rbindlist(DT.list, use.names=TRUE, fill=TRUE)
  if (nrow(DT)>0){
    setkey(DT, address.id, location.id)
    #DT[, location.street.num:=NULL]
    #DT[, location.street.direction:=NULL];
    DT$location.type <- type
    # Bad n/s match
    DT <- DT[data.street.direction != location.street.direction &
               (nchar(data.street.direction) > 0 & nchar(location.street.direction) > 0),
             match.score:=0]
    # Penalize geocoded by 1
    DT <- DT[grep('geocode', location.source), match.score := match.score-1]
    DT <- DT[grep('(?i)[0-9]box {1,}(?=$)', street.body, ignore.case = TRUE, perl=TRUE), match.score:=match.score+10]
    DT <- unique(DT[, max.score:=max(match.score),
                    by=address.id][match.score == max.score][,location.match.score:=round(match.score/best.score,2)])
    cols <- c('address.id', 'location.id', 'location.type', 'location.source')
    cols <- c(cols,
              names(DT)[!(names(DT) %in% c(cols, 'data.street.direction', 'match.cols.n','match.score', 'i.comb', 'max.score', 'location.type'))])
    DT <- DT[, (cols), with=FALSE]
  }
  # Add in missing if bad geocode (may happen if bad address and missing fields)
  if (l.location$type == "geocodes.bad") {
    geocodes.bad <- l.location$address[, .(street,city,state,zip,geocode.id=location.id, location.source)]
    setkey(geocodes.bad, street, city, state, zip)
    setkey(x, street, city, state, zip)
    DT.2 <- geocodes.bad[x]
    DT.2 <- DT.2[!is.na(geocode.id)]
    setnames(DT.2, 'geocode.id', 'location.id')
    DT.2 <- DT.2[is.na(street.num), data.num.max:=0]
    DT.2$street.num.temp <- as.integer(0)
    DT.2 <- DT.2[!is.na(street.num), street.num.temp:=as.integer(street.num)]
    DT.2 <- DT.2[, data.num.max:=as.numeric(max(street.num.temp)), by=address.id]
    DT.2$location.type <- 'geocodes.bad'
    DT.2$street.num.temp <- NULL
    #DT.cols[!(DT.cols %in% names(DT.2))]
    # add onto the bottom of DT
    if (nrow(DT)>0){
      DT.address.ids <- unique(DT$address.id)
      DT.2 <- DT.2[!(address.id %in% DT.address.ids)]
      l.DT <- list(DT, DT.2)
      DT <- rbindlist(l.DT, use.names=TRUE, fill=TRUE)
      #   print(nrow(DT))
      DT <- DT[, DT.cols, with=FALSE]
    } else {
      DT <- DT.2
    }
  }
  return(DT)
}
#' @title funImport.location.geocode
#'
#' @description string matches address (x) to shapefiles address
#' @param locs table showing what shapes the address.ids are associated with
#' @param address data to be assigned
#' @param pkg.data.root source dropbox data
#' @param api.key google api key
#' @param l.study.extent cities, zips, states
#' @param location.source source data mmed.1348998
#' @keywords assign location import
#' @export
#' @import stringr
#'     data.table
#'     methods.string
#'     points
funImport.location.geocode <- function(locs, address, pkg.data.root, api.key, l.study.extent, location.source){
  pkg.name <- location.id <- street.num <- address.num.low <- address.num.hi <- address.address.id <- NULL
  street.num.low <- street.num.hi <- locs.address.id <- NULL
  geocode.n <- location.type <- city <- street <- NULL
  DT.geo <- match.threshold <- pkg.name <- cityStreetZip <- pkg.data.root <- NULL
  address.id <- NULL
  dt.pkg.data <- pkg.data.paths::paths(path.root = pkg.data.root, str.pkg.name = 'points')
  points.address <- points::address(dt.pkg.data$sys.path)
  states.abbrev <- names(table(address$state))[table(address$state)/nrow(address) > 0.5 & names(table(address$state)) != ""]
  funCheck.geocode.ids <- function(locs, address){
    pkg.name <- location.type <- address.id <- address.address.id <- address.num.low <- NULL
    address.num.hik <- street.num.range <- street.num.points.address <- NULL
    points.address <- points::address(dt.pkg.data[pkg.name=='points']$sys.path)
    check.ids <- list()
    locs.points <- locs[location.type=='points']
    check.ids$bad <- sort(locs[location.type=='geocodes.bad', address.id])
    # Check for addresses with street num ranges
    x.locs <- locs.points[,.(address.id, location.id)]
    x.points.address <- points.address[, .(location.id, street.num.points.address = street.num)]
    setkey(x.locs, location.id)
    setkey(x.points.address, location.id)
    x.locs <- x.points.address[x.locs]
    x.locs$locs.address.id <- x.locs$address.id
    x.address <- address[,.(address.id, address.address.id = address.id, address.num.low = street.num.low, address.num.hi = street.num.hi)]
    setkey(x.address, address.id)
    setkey(x.locs, address.id)
    x <- x.locs[x.address]
    setkey(x, address.id, street.num.points.address)
    x <- unique(x)
    x$street.num.range <- FALSE
    x <- x[address.num.low != address.num.hi, street.num.range:=TRUE]
    x$geocode.n <- 0
    x <- x[!(is.na(locs.address.id)), geocode.n := as.numeric(.N), by=address.id]
    check.ids$noRange <- unique(x[!(is.na(locs.address.id)) & street.num.range & geocode.n < 2 & !(address.id %in% check.ids$bad)]$address.id)
    check.ids$noCode <- unique(x[is.na(locs.address.id) & !(address.id %in% check.ids$bad)]$address.id)
    address.ids.missing <- unique(c(check.ids$noCode, check.ids$noRange))
    address.ids.missing <- address.ids.missing[!(address.ids.missing %in% check.ids$bad)]
    address.ids.skip <- unique(c(address.ids.missing, check.ids$bad))
    check.ids$missing <- address[address.id %in% address.ids.missing]
    check.ids$coded <- unique(sort(address[!(address.id %in% address.ids.skip)]$address.id))
    cat(paste('----------------------'),
        paste('gecode possible:      ', nrow(address)),
        paste('----------------------'),
        paste('complete:             ', length(unique(check.ids$coded))),
        paste('geocode missing all:  ', length(unique(check.ids$noCode))),
        paste('geocode missing one:  ', length(check.ids$noRange)),
        paste('bad:                  ', length(unique(check.ids$bad))),
        paste('-------------------------'),
        c('', ''),
        sep = '\n')
    return(check.ids)
  }
  # Geocode missing 
  check.ids <- funCheck.geocode.ids(locs, address)
  # Initialize empty table
  address.ids.missing <- check.ids$missing$address.id
  location.ids.points <- locs[location.type=='points' & address.id %in% address.ids.missing]
  i.missing <- 0
  if (nrow(check.ids$missing) > 0){
    for (missing.id in address.ids.missing){
      i.missing <- i.missing + 1
      DT.geo.point <- check.ids$missing[address.id == missing.id]
      print(paste('geocoding >> address.id:', DT.geo.point$address.id, '| street:', DT.geo.point$street, ' Number ', i.missing, 'of', length(address.ids.missing)))
      if (!(is.na(DT.geo.point$street.num))){
        DT.geo.point.unique <- DT.geo.point$street.num.low==DT.geo.point$street.num.hi
        # Only one street number
        if (DT.geo.point.unique){
          DT.geos <- geocode::geocode(pkg.data.root, DT.geo.point, api.key, location.source,  l.study.extent)
        } else { # Range of street numbers
          DT.geo.list <- list()
          DT.geo.low <- copy(DT.geo.point[, street.num:=street.num.low])
          DT.geo.high <- copy(DT.geo.point[, street.num:=street.num.hi])
          DT.geo.list$low <- geocode::geocode(pkg.data.root, DT.geo.low, api.key, location.source, l.study.extent)
          DT.geo.list$high <- geocode::geocode(pkg.data.root, DT.geo.high, api.key, location.source, l.study.extent)
          
          num.range <- unlist(DT.geo.point[,.(street.num.low, street.num.hi)])
          nums <- seq(num.range[1], num.range[2])
          nums <- nums[!(nums %in% num.range)]
          nums.n <- length(nums)
          nums.inds <- unique(sample(1:nums.n, size = 10, replace=TRUE))
          nums <- nums[nums.inds]
          loc.match <- FALSE
          iter <- 0
          i.hi <- length(nums)
          i.low <- 1
          end.loc <- c(0,0)
          while (iter <= length(nums) & loc.match==FALSE){
            iter <- iter+1
            start.loc <- nums[iter]
            if(iter %% 2 == 0){
              i <- i.hi
              i.hi <- i.hi - 1
            } else {
              i <- i.low
              i.low <- i.low + 1
            }
            DT.temp <- DT.geo.point[, street.num:=nums[i]]
            DT.geo.list[[iter+2]] <- geocode::geocode(pkg.data.root, DT.temp, api.key, location.source, l.study.extent)
            end.loc <- nums[i]
            if (iter==1){
              loc.match == FALSE
            } else {
              loc.match <- (identical(end.loc, start.loc) | DT.geo$match.rank > 2)
            }
          }
          DT.geos <- rbindlist(DT.geo.list, use.names=TRUE, fill=TRUE)
        }
      } else {
        geocodes.bad.address <- geocode::geocodes.bad.address(geocodes.bad.address.new = DT.geo.point, location.source)
      }
    }
    locs <- funUpdate.locs(address, match.threshold, pkg.data.root)
    check.ids <- funCheck.geocode.ids(locs, address)
  }
  return(locs)
}
#' @title funImport.location.intersect
#'
#' @description intersect points to shapes
#' @param locs assignment table
#' @param address to be looked up
#' @param pkg.data.root dropbox data root dir
#' @param location.source data source
#' @keywords assign location import
#' @export
#' @import stringr
#'     data.table
#'     methods.string
#'     methods.shapes
#'     points
#'     parcels
#'     pkg.data.paths
#'     sp
#'     tidyr
#'     dplyr
funImport.location.intersect <- function(locs, address, pkg.data.root, location.source){
  pkg.name <- pkg.root <- point.id <- outline.id <- parcel.id <-address.id <- NULL
  outlines.address <- parcels.address <- state <- zip <- NULL
  location.id <- location.type <- cityStateZip <- city <- match.threshold <- NULL
  l.address <- fun.l.address(pkg.data.root)
  dt.pkg.data <- pkg.data.paths::paths(path.root = pkg.data.root, str.pkg.name = 'parcels')
  shapes.parcels <- parcels::shapes(dt.pkg.data$pkg.root[1]) 
  dt.pkg.data <- pkg.data.paths::paths(path.root = pkg.data.root, str.pkg.name = 'outlines')
  shapes.outlines <- outlines::shapes(dt.pkg.data$pkg.root[1])
  funCheck.ids <- function(locs, address, pkg.data.root){
    l.address <- fun.l.address(pkg.data.root)
    setkey(locs, address.id, location.id, location.type)
    locs <- unique(locs)
    check.ids <- list()
    points.address <- l.address$points
    locs.points <- locs[location.type=='points'][, .(address.id, point.id = location.id)]
    locs.outlines <- locs[location.type=='outlines'][, .(address.id, outline.id = location.id)]
    locs.parcels <- locs[location.type=='parcels'][, .(address.id, parcel.id = location.id)]
    setkey(locs.parcels, address.id)
    setkey(locs.outlines, address.id)
    join.1 <- locs.parcels[locs.points]
    locs.spread <- locs.outlines[join.1]
    
    cat(paste('Address', length(unique(locs$address.id))), sep='\n')
    
    missing <- list()
    missing$parcel <- locs.spread[is.na(parcel.id)]
    missing$outline <- locs.spread[is.na(outline.id)]
    missing$point <- locs.spread[is.na(point.id)]
    cat('Missing:', sep='\n')
    out <- unlist(lapply(missing, function(x) length(unique(x$address.id))))
    out <- data.table(type=names(out), n.missing=out)
    print(out)
    
    no.match <- list()
    no.match$parcel <- locs.spread[parcel.id =='no.match']
    no.match$outline <- locs.spread[outline.id =='no.match']
    no.match$point <- locs.spread[point.id =='no.match']
    cat('No.match:', sep='\n')
    out <- unlist(lapply(no.match, function(x) length(unique(x$address.id))))
    out <- data.table(type=names(out), no.match=out)
    print(out)
    return(locs.spread)
  }
  # Global Overlays
  cat('Start Stats', sep='\n')
  locs.spread <- funCheck.ids(locs, address, pkg.data.root)
  cat(' ', sep='\n')
  setnames(l.address$points, 'location.id', 'point.id')
  setkey(l.address$points, point.id)
  
  # Overlay parcels
  locs.parcel <- unique(locs.spread[is.na(parcel.id)][,.(point.id)])
  setkey(locs.parcel, point.id)
  locs.parcel <- l.address$points[locs.parcel]
  if (nrow(locs.parcel)>0){
    DT.geocoded.shapes <- methods.shapes::shapes.coords2points(locs.parcel)
    # Step 3: Update [outlines.address]
    print('Intersecting parcels...')
    DT.points.parcels <- sp::over(DT.geocoded.shapes, shapes.parcels)
    parcel.ids <-  DT.points.parcels$parcel.id
    DT.geo.parcels.intersect <- as.data.table(cbind(parcel.id=parcel.ids, locs.parcel))
    DT.geo.parcels <- DT.geo.parcels.intersect[is.na(parcel.id), parcel.id := 'no.match']
    DT.geo.parcels.address.new <-  DT.geo.parcels
    setnames(DT.geo.parcels.address.new, 'parcel.id', 'location.id')
    col.names <- names(parcels.address)
    col.names <- col.names[!(col.names %in% c('area','location.source', 'location.street.direction', 'location.street.num'))]
    if (!('cityStateZip' %in% names(DT.geo.parcels.address.new))){
      DT.geo.parcels.address.new <- DT.geo.parcels.address.new[, cityStateZip := paste0(city, ' ', state, ', ',zip)]
    }
    DT.geo.parcels.address.new <- DT.geo.parcels.address.new[, (col.names), with=FALSE]
    l.address$parcels <- parcels::address.update(pkg.data.root, l.address$parcels, DT.geo.parcels.address.new)
    locs <- funUpdate.locs(address, match.threshold, pkg.data.root)
  }
  
  # Overlay outlines
  locs.outline <- unique(locs.spread[is.na(outline.id)][,.(point.id)])
  setkey(locs.outline, point.id)
  locs.outline <- l.address$points[locs.outline]
  if (nrow(locs.outline)>0){
    DT.geocoded.shapes <- methods.shapes::shapes.coords2points(locs.outline)
    # Step 3: Update [outlines.address]
    print('Intersecting outlines...')
    DT.points.outlines <- sp::over(DT.geocoded.shapes, shapes.outlines)
    outline.ids <-  DT.points.outlines$outline.id
    DT.geo.outlines.intersect <- as.data.table(cbind(outline.id=outline.ids, locs.outline))
    DT.geo.outlines <- DT.geo.outlines.intersect[is.na(outline.id), outline.id := 'no.match']
    DT.geo.outlines.address.new <-  DT.geo.outlines
    setnames(DT.geo.outlines.address.new, 'outline.id', 'location.id')
    col.names <- names(outlines.address)
    col.names <- col.names[!(col.names %in% c('area','location.source', 'location.street.direction', 'location.street.num'))]
    if (!('cityStateZip' %in% names(DT.geo.outlines.address.new))){
      DT.geo.outlines.address.new <- DT.geo.outlines.address.new[, cityStateZip := paste0(city, ' ', state, ', ',zip)]
    }
    DT.geo.outlines.address.new <- DT.geo.outlines.address.new[, (col.names), with=FALSE]
    l.address$outlines <- outlines::address.update(pkg.data.root, l.address$outlines, DT.geo.outlines.address.new)
    locs <- funUpdate.locs(address, match.threshold, pkg.data.root)
  }
  check.ids <- funCheck.ids(locs, address, pkg.data.root)
  return(locs)
}
#' @title funUpdate.locs
#'
#' @description Assigns address to locations in last stage of import
#' @param address data.table with all exploded address fields
#' @param match.threshold string match quality
#' @param pkg.data.root package path data
#' @keywords parcels clean update locations
#' @export
#' @import stringr
#'     data.table
#'     stringdist
#'     methods.string
#'     parcels
#'     outlines
funUpdate.locs <- function(address, match.threshold, pkg.data.root){
  street <- address.id <- address.id <- address.num.hi <- address.num.low <- state <- NULL
  location.id <- NULL
  totals <- nrow(address)
  names(totals) <- 'possible'
  l.address <- fun.l.address(pkg.data.root)
  l.locations <- list(parcels=list(type='parcels', address=l.address$parcels, match.threshold=match.threshold),
                      points=list(type='points', address=l.address$points, match.threshold=match.threshold),
                      outlines=list(type='outlines', address=l.address$outlines, match.threshold=match.threshold),
                      geocodes.bad=list(type='geocodes.bad', address = l.address$geocodes.bad, match.threshold=0.5))
  # Join Data to existing address -> locations
  l.locs <- sapply(l.locations, function(l.location) funImport.location.address(l.location, address), USE.NAMES=TRUE, simplify = FALSE)
  # Make sure of no overly zealous matches on geocodes.bad
  l.locs.ids <- which(lapply(l.locs, nrow)>0)
  l.locs.ids.sub <- l.locs.ids[!(l.locs.ids%in%1)]
  addresses.good <- unique(unlist(sapply(l.locs[l.locs.ids.sub], '[[', 1)))
  temp <- l.locs$geocodes.bad[!(address.id %in% addresses.good)]
  l.locs$geocodes.bad 
  diagnose <- list(n.assigned = c(totals,sapply(l.locs[l.locs.ids], function(x) length(unique(x[location.id != 'no.match', address.id])))))
  cat('-------------------', sep='\n')
  cat(sapply(diagnose, function(y) paste0('Address assigned to ',str_pad(names(y), 13, 'right'), ' : ', y)), sep='\n')
  cat('-------------------', sep='\n')
  locs <- rbindlist(l.locs, use.names = TRUE)
  locs <- locs[, c('address.id', 'location.id', 'location.type'), with=FALSE]
  ## Debug check
  address.ids <- unique(address$address.id)
  address.ids.n <- length(address.ids)
  locs.address.ids <- unique(locs$address.id)
  locs.address.ids.n <- length(locs.address.ids)
  debug.address <- c(paste('Addresses missing:   ', address.ids.n - locs.address.ids.n))
  cat(debug.address, sep='\n')
  missing.addresses <- address[!(address.ids %in% locs.address.ids), .(address.id, street)]
  l.missing <- split(missing.addresses, seq(nrow(missing.addresses)))
  if ((address.ids.n - locs.address.ids.n)!=0){
    cat(sapply(l.missing, function(x) paste('address.id: ', x$address.id, '| street:', x$street)), sep='\n')
  }
  return(locs)
}
#' @title fun.l.address
#'
#' @description Assigns a list of addresses
#' @param pkg.data.root dropbox root
#' @keywords parcels, clean, update, locations, list
#' @export
#' @import stringr
#'     data.table
#'     methods.string
#'     parcels
#'     outlines
#'     geocode
#'     points
fun.l.address <- function(pkg.data.root=NULL){
  dt.pkg.data <- pkg.data.paths::paths(pkg.data.root, 'parcels')
  parcels.address <- parcels::address(dt.pkg.data$pkg.root[1])
  dt.pkg.data <- pkg.data.paths::paths(pkg.data.root, 'outlines')
  outlines.address <- outlines::address(dt.pkg.data$pkg.root[1])
  dt.pkg.data <- pkg.data.paths::paths(pkg.data.root, 'points')
  points.address <- points::address(dt.pkg.data$sys.path)
  geocodes.bad.address <- geocode::geocodes.bad.address(pkg.data.root)
  return(list(parcels=parcels.address, points=points.address, outlines=outlines.address, geocodes.bad=geocodes.bad.address))
}
erikbjohn/assign.locations documentation built on May 30, 2019, 8:03 a.m.