R/methods.string.R

#' \code{methods.string} package
#'
#' methods.string
#'
#' See the README on
#'
#' @docType package
#' @name methods.string
#' @importFrom dplyr %>%
#' @importFrom data.table :=
NULL

## quiets concerns of R CMD check re: the .'s that appear in pipelines
if(getRversion() >= "2.15.1")  utils::globalVariables(c("."))

#' @title clean.city
#'
#' @description Extracts and cleans city from cityStateZip based on table lookup and matching.
#' @param cityStateZip character vector
#' @keywords city, clean
#' @export
#' @import stringr
#'     data.table
#'     stringdist
clean.city <- function(cityStateZip){
    city.replace <- NULL
    city <- NULL
    # city state zip
    cityStateZip <- str_trim(cityStateZip)
    cityStateZip <- str_replace_all(cityStateZip, ',|-|\\(|\\)', ' ')
    cityStateZip <- str_replace_all(cityStateZip, regex('(?i)(?<=^)ENVER', perl=TRUE), 'Denver')
    x <- data.table(cityStateZip)
    regex.remove <- '(?i)( [A-Z]{2,2}( |$|,|, )|[0-9]{1,5})(.*.|$)'
    x <- x[, city:=str_replace_all(cityStateZip, regex(regex.remove, perl=TRUE), '')]
    x <- x[, city:=str_trim(str_to_title(city))]
    # Do some string matching
    city.search <- unique(x$city)
    match.ids<-amatch(city.search, methods.string::study.cities, method='soundex')
    cities.soundex <- methods.string::study.cities[match.ids]
    cities.jw <- methods.string::study.cities[amatch(city.search, methods.string::study.cities, method='jw')]
    city.table <- data.table(city.search, cities.soundex, cities.jw)
    city.table <- city.table[, city.replace:=cities.jw]
    city.table <- city.table[is.na(cities.jw), city.replace:=cities.soundex]
    city.table <- unique(city.table[, .(city.search, city.replace)][city.search!='' & city.search != 'Na'])
    city.table <- city.table[order(city.search)]
    missing.cities <- city.table[is.na(city.replace), city.search]
    city.table <- city.table[!is.na(city.replace)]
    matches.pattern <- paste0('(?i)(?<=^)',city.table$city.search)
    matches.replacement <- city.table$city.replace
    if(length(matches.replacement>0)){
        names(matches.replacement) <- matches.pattern
        cityStateZip <- str_replace_all(cityStateZip, matches.replacement)
    }
    cityStateZip <- str_replace_all(cityStateZip, regex('[ ]{1,}', perl=TRUE), ' ')
    return(cityStateZip)
}
#' @title clean.lic.type
#'
#' @description cleans marijuana license type for mmed locations
#' @param license.type character vector
#' @keywords city, license, lic, type
#' @export
#' @import stringr
#'     data.table
clean.lic.type <- function(license.type){
    license.type <- str_trim(str_to_title(license.type))
    license.type <- gsub('Mmj Center License ', 'Center - ', license.type)
    return(license.type)
}
#' @title clean.lic.num
#'
#' @description cleans marijuana license number
#' @param license.num character vector
#' @keywords city, license, lic, num, number
#' @export
#' @import stringr
#'     data.table
clean.lic.num <- function(license.num){
    license.num <- str_trim(str_to_title(license.num))
    return(license.num)
}
#' @title clean.name
#'
#' @description Clean an d sanitize business (location) names
#' @param name character vector with names
#' @keywords clean anems
#' @export
#' @import stringr
clean.name <- function(name){
    name <- gsub('THE PLEASURE CAF\x83 LLC', 'THE PLEASURE CAFE LLC', name)
    name <- str_trim(gsub('\\&.*.\\;', "'", name, ignore.case = TRUE, perl=TRUE))
    name <- str_trim(gsub('\\&.*.\\;', "'", name, ignore.case = TRUE, perl=TRUE))
    name <- str_trim(gsub(',|\\.', "", name, ignore.case = TRUE, perl=TRUE))
    name <- str_trim(gsub('Inc\\_', "Inc", name, ignore.case = TRUE, perl=TRUE))
    name <- str_trim(gsub("'", "", name, ignore.case = TRUE, perl=TRUE))
    name <- str_trim(gsub("\\-", "", name, ignore.case = TRUE, perl=TRUE))
    name <- str_trim(gsub('[^[:alnum:] ]', "", name, ignore.case = TRUE, perl=TRUE))
    name <- str_trim(gsub('[\\_]{2,}', '', name, perl=TRUE))
    name <- str_trim(gsub('( llc| inc)(?= |$)', " ", name, ignore.case = TRUE, perl=TRUE))
    name <- gsub('\\s{2,}',' ', name, perl=TRUE)
    name <- gsub('\\.','', name, perl=TRUE)
    name <- str_to_title(name)
    return(name)
}
#' @title clean.name.key
#'
#' @description create a unique name key
#' @param name character vector with names
#' @keywords clean anems
#' @export
#' @import stringr
clean.name.key <- function(name){
    name <- str_trim(gsub('\\&.*.\\;', "'", name, ignore.case = TRUE, perl=TRUE))
    name <- str_trim(gsub(',|\\.', "", name, ignore.case = TRUE, perl=TRUE))
    name <- str_trim(gsub('Inc\\_', "Inc", name, ignore.case = TRUE, perl=TRUE))
    name <- str_trim(gsub("'", "", name, ignore.case = TRUE, perl=TRUE))
    name <- str_trim(gsub("\\-", " ", name, ignore.case = TRUE, perl=TRUE))
    name <- str_trim(gsub('[^[:alnum:] ]', "", name, ignore.case = TRUE, perl=TRUE))
    name <- str_trim(gsub('[\\_]{2,}', '', name, perl=TRUE))
    name <- str_trim(gsub('( llc| inc)(?= |$)', " ", name, ignore.case = TRUE, perl=TRUE))
    name <- gsub("'", '', name)
    name <- gsub("\\.", '', name)
    name <- gsub("\\.", '', name)
    name <- tolower(gsub('[^[:alnum:]]', '', name))
    name <- gsub("llc|group|cooperative", '', name)
    name <- gsub("coop|center|industry|industries|inc|amp", '', name)
    name <- gsub("the ", '', name)
    name <- str_to_lower(name)
    return(name)
}

#' @title clean.special
#'
#' @description Cleans up some custom strings in the c*star rent data.
#' @param DT datatable with address info, city, street.type, street.direction.prefix, street.body
#' @param DT.name string flag for c*rent
#' @keywords special clean for some weird exceptions (Denver does not have any north)
#' @export
#' @import data.table
clean.special <- function(DT, DT.name=''){
    data.id <- NULL
    city <- NULL
    street.type <- NULL
    street.direction.prefix <- NULL
    street.body <- NULL
    city <- NULL
    street <- NULL
    if (DT.name=='costar.rent'){
        DT <- DT[data.id == '1657', city := 'Denver']
        glendale.building.id <- c('105','148', '152', '153', '159')
        DT <- DT[data.id %in% glendale.building.id, city:='Denver']
        DT <- DT[data.id=='368', street.type:='Ave']
        DT <- DT[data.id=='502', street.type:='Pl']
        DT <- unique(DT)
    }
    DT <- DT[city=='Denver' & street.direction.prefix =='N', street.direction.prefix:='']
    DT <- DT[city=='Denver' & street.body == 'Broadway', street.type:='']
    DT <- DT[street=='6859 N Hills Hwy Building C', street.body:= 'Foothills Hwy']
    DT <- DT[street=='4699 Nautilus Ct South', city:='Boulder']
    DT <- DT[street=='4301 Jason Street Suite 101', street.type:='Ct']
    return(DT)
}
#' @title clean.state
#'
#' @description Just returns CO if state is missing.
#' @param cityStateZip character vector
#' @keywords silly, colorodo
#' @export
#' @import stringr
#'     data.table
#'     stringdist
clean.state <- function(cityStateZip){
    state.name <- NULL
    name <- NULL
    state.replace <- NULL
    x <- cityStateZip
    x <- str_extract(x, regex('(?<= ).+', perl=TRUE))
    states <- unlist(str_split(x, regex(', ', perl=TRUE)))
    # Those without abbreviations
    state.search <- unique(states)
    state.search <- state.search[str_length(state.search)>2 & !grepl('[0-9]{1,}', state.search)]
    states.soundex <- methods.string::lookup.state[amatch(state.search, name, method='soundex')]$abb
    states.jw <- methods.string::lookup.state[amatch(state.search, name, method='jw')]$abb
    state.table <- data.table(state.search, states.soundex, states.jw)
    state.table <- state.table[, state.replace:=states.jw]
    state.table <- state.table[is.na(states.jw), state.replace:=states.soundex]
    state.table <- unique(state.table[, .(state.search, state.replace)][state.search!='' & state.search != 'Na'])
    state.table <- state.table[order(state.search)]
    missing.states <- state.table[is.na(state.replace), state.search]
    state.table <- state.table[!is.na(state.replace)]
    matches.pattern <- paste0('(?<= )',state.table$state.search, '(?=( |,|$))')
    matches.replacement <- state.table$state.replace
    if(length(matches.replacement>0)){
        names(matches.replacement) <- matches.pattern
        cityStateZip <- str_replace_all(cityStateZip, matches.replacement)
    }
    cityStateZip <- str_replace_all(cityStateZip, regex('[ ]{1,}', perl=TRUE), ' ')
    return(cityStateZip)
}
#' @title clean.street
#'
#' @description Cleans up and standardizes street string. Needed for the street.explode function.
#' @param street character vector of streets
#' @param l.regex helper function since data.table not parallelizable
#' @keywords street.explode
#' @export
#' @import stringr
#'     data.table
#'     stats
clean.street <- function(street, l.regex){
   street.base <-''
    iter <- 0
    while(identical(street, street.base)==FALSE & iter < 6){
        #changes <- cbind(street.base[which(street !=street.base)],street[which(street !=street.base)])
        street.base <- street
        iter <- iter + 1
        street <- str_trim(str_replace_all(street, regex('[\\s]{2,}', perl=TRUE), ' '))
        street <- str_replace(street, '\\\\', '')
        street <- str_replace(street, ' Units ', ' Unit ')
        street <- str_replace(street, ' Suites ', ' Suites ')
        street <- str_replace(street, 'Unitis', 'Unit')
        street <- str_replace(street, '1529 York St 2Nd Fl', '1529 York St')
        street <- str_replace(street, 'South Boulder', 'SouthBoulder')

        #street <- '28 StandBrighton Blvd'
        street <- str_replace(street,regex(l.regex$intersection, perl=TRUE), ' and ')

        #E 17Th Ave @ Park Avenue West'
        street <- str_replace(street, '@', 'and')

        #Ste 300 425 S Wilcox St', 'Suite 2400 1600 Broadway'
        street <- str_replace(street, regex(l.regex$unit.start, perl=TRUE), '')

        #One Flatiron Circle Ste Ff232'
        str.nums <- as.character(1:10)
        names(str.nums) <-  paste0('(?i)(?<=^)',c('one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten'))
        street <- str_replace_all(street, str.nums)

        # MMED fixed
        #  1044 & 1046 W 5TH AVE
        street <- str_replace(street, regex(paste0('(?<=[0-9]{1,1})', '( |)&( |)', '(?=[0-9]{1,})'), perl=TRUE), '-')
        street <- str_replace(street, '1545 W BAYAUD STREET #3', '1545 W BAYAUD Ave #3')
        street <- str_replace(street, 'FILMORE', 'Fillmore')
        street <- str_replace(street, '1136 N YUMA CT', '1136 Yuma CT')
        street <- str_replace(street, '150 RIO GRAND BLVD', '150 RIO GRANDE BLVD')
        street <- str_replace(street, '7739 E -7741 COLFAX AVE','7739-7741 E COLFAX AVE')

        #9800 Mt Pyramid Ct Ste 4042Aand4042b # Be careful to not break up "3415 S Oleander"
        unit.parts <-  str_split(street, '\\s')
        unit.parts.and <- lapply(unit.parts, str_match, pattern=regex('.*and.*', perl=TRUE))
        unit.parts.and <- sapply(sapply(unit.parts.and, str_match, pattern=regex('.*\\d{1,}.*', perl=TRUE), simplify=TRUE), na.omit, simplify=TRUE)
        unit.parts.and <- sapply(unit.parts.and, unlist, simplify=TRUE)
        unit.street.edit <- which(sapply(unit.parts.and, length)>0)
        street[unit.street.edit] <- lapply(street[unit.street.edit], str_replace, regex('(?<=[^\\s])and(?=[^\\s])', perl=TRUE), ' and ')

        # Remove special characters and periods (keep extra space until end)
        #2517 1/2 BROADWAY'
        street <- str_replace(street,regex('(?<=[0-9]{1,1}) [0-9]{1,}\\/[0-9]{1,} (?=[A-z]{1,1})', perl=TRUE), ' ')
        street <- str_to_title(gsub('\\,|\\#|;', ' ', street))
        street <- str_replace_all(street, '((\\s){0,}\\&(\\s{0,}))',' and ')
        street <- str_replace_all(street, '\\.|\\(|\\)','')
        street <- str_replace_all(street, '\\*', '')
        # Clean up
        #"3740B E 40Th Ave"
        street <- str_replace_all(street,'\\^', '')
        street <- str_replace(street,'SuiteC & D', 'Suite C&D')
        street <- str_replace(street,'StreetSuite', 'Street Suite')
        street <- str_replace(street,'(?i)Unit(-|6)', 'Unit')
        street <- str_replace(street,'UNITA&B', 'UNIT A&B')
        street <- str_replace(street,'SUITE100', 'Suite 100')
        street <- str_replace(street,'1910 - 16th', '1910 16th')
        street <- str_replace(street,'St Paul', 'Saint Paul')
        street <- str_replace(street,'StreetPenrose', 'Street Penrose')
        street <- str_replace(street,'A150', 'Unit A150')
        street <- str_replace(street,' Cr ', ' County Road ')
        street <- str_replace(street,' U S ', ' US ')
        street <- str_replace(street,' Us ', ' US ')
        street <- str_replace(street,'Btm \\& Top', '')
        street <- str_replace(street,'First Floor', 'Floor 1')
        street <- str_replace(street, regex('[\\s]{0,}(\\+|\\-|\\/|\\&)[\\s]{0,}', perl=TRUE), '-')
        street <- str_replace(street, regex('(?<=[0-9]{1,1})\\-(?=[0-9]{1,}[A-z]{1,}\\s)', perl=TRUE), ' ')
        street <- str_replace(street, regex('[A-z]{1,}(\\-)(?=[0-9]{0,}$)', perl=TRUE), ' ')
        street <- str_replace(street, 'Jewll', 'Jewell')
        street <- str_replace(street,'Bottom-Top', '')
        street <- str_replace(street,'A-Ste F1', 'A')
        street <- str_replace(street,' Frnt St ', ' Front St ')
        street <- str_trim(str_replace_all(street, regex('[\\s]{2,}', perl=TRUE), ' '))
        #"1997 And 2000 S Acoma St"'
        street <- str_replace(street,regex('(?<=[0-9]{1,1}) And (?=[0-9]{1,1})', perl=TRUE), '-')
        #990W 6Th Ave Unit 5'
        str.extract <- str_extract(street,regex(paste0('(?<=^)[0-9]{1,}(',l.regex$abbrev.compass.search,' )'), perl=TRUE))
        str.direction <- str_extract(str.extract,regex(paste0('(',l.regex$abbrev.compass.search,')'), perl=TRUE))
        street <- str_replace(street, regex(paste0(str.direction),perl=TRUE), paste0(' ', str.direction))
        #"3740B E 40Th Ave"
        # bad.prefix <- str_detect(street, regex('(?<=^)([0-9]{1,}[A-z]{1,})(?=\\s)', perl=TRUE))
        street.num.prefix <- str_extract(street, regex('(?<=^)([0-9|-]{1,}[A-z]{0,})(?=\\s)', perl=TRUE))
        street.num.prefix[is.na(street.num.prefix)] <- ''
        street.num.extract <- str_extract(street, regex('(?<=^)([0-9|-]{1,})(?=(([A-z]{1,1})|\\s|$))', perl=TRUE))
        street.num.extract[is.na(street.num.extract)] <- ''
        street.body <- str_replace(street, paste0('(?<=^)',street.num.prefix, ' '), '')
        street <- paste(street.num.extract, street.body)
        # street.num.prefix[is.na(street.num.prefix)] <- street.num.extract
        # street.unit.suffix <- str_replace(street.num.extract, street.num.prefix, '')
        # street <- paste(street.num.prefix, street.body, street.unit.suffix)
        #9900 Stanford Ave St 107'
        street <- str_replace(street, ' Ave St ', ' Ave Ste ')
        street <- str_replace(street, 'Royal M H', 'Royal MH')
        street <- str_replace(street, '6859 N Hills Hwy', '6859 N Foothills Hwy')
        #5401 S Park Terrace Ave Ste 209 A'
        street <- str_replace(street, regex('(?<=[0-9])\\s(?=\\w$)', perl=TRUE), '')
        #1035 Pearl St 3rd Floor'
        street <- str_replace(street,  '1035 Pearl St 3rd Floor', '1035 Pearl St Floor 3')
        street <- str_replace(street, '2625 East Street Vrain A',  '2625 East St Vrain Street A')
        # abbrev.suffix <- methods.string::abbrev[class=='suffix', search]
        # regex.abbrev.suffix <- paste0('(?<= )(', paste0(abbrev.suffix, collapse='|'), ')(?= )')
        # # Speed up
        # abbrev.suffix <- unique(unlist(str_extract_all(street, regex(regex.abbrev.suffix, perl=TRUE))))
        # if (length(abbrev.suffix>0)){
        #     regex.suffix <- paste0('(', paste0(abbrev.suffix, collapse='|'), ')')
        #     regex.suffix.pair <- paste0(regex.suffix, '\\s', regex.suffix)
        #     str.search <- unique(str_extract(street,regex(regex.suffix.pair, perl=TRUE)))
        #     str.search <- str.search[!is.na(str.search)]
        #     str.replace <- str_extract(str.search,regex(paste0('(?<=^)', regex.suffix, '(?= )'), perl=TRUE))
        #     names(str.replace) <- str.search
        #     street <- str_replace_all(street, str.replace)
        # }
        # Step 3: Get rid of extra spaces and return

        # Remove two unit suffixes
        reg.unit <- paste0('(',paste0(paste0(' ',l.regex$x, ' '), collapse='|'),')')
        ids <- which(sapply(str_split(street, reg.unit), length)>2)
        pattern.neg.ahead <- paste0('(?!.* ',paste0(l.regex$s, collapse='|'),').*.')
        street[ids] <- str_trim(str_replace(street[ids], regex(paste0(str_extract(street[ids], regex(pattern.neg.ahead, perl=TRUE)),'(?=$)'), perl=TRUE),''))

        # street <- '4228 York Street Unit 104 106-107'
        pattern.neg.ahead <- paste0(l.regex$t,'(?!.*',l.regex$u,')(.*.|$)')
        street.tail <- str_extract(street, regex(pattern.neg.ahead, perl=TRUE))
        ids <- which(sapply(sapply(street.tail, str_split, ' |-| and '), length) > 2)

        if (length(ids) > 0){
            # print(paste('Cleaning', length(ids), 'bad street units'))
            for (i in 1:length(ids)){
                i.id <- ids[i]
                name.id <- names(i.id)
                search.sub <- paste0(name.id,'(?=$)')
                replace.sub <- clean.street.unit(name.id)
                names(replace.sub) <- search.sub
                # street.replace <- sapply(names(ids), clean.street.unit)
                street[i.id] <- str_replace_all(street[i.id], replace.sub)
            }
        }
        # street <- '2675 South Santa Fe Drive Building 6 Unit F Gandh' abc(?!.*abc)
        pattern.neg.ahead <- paste0(l.regex$s,'(?!.*',l.regex$s,')')
        pattern.detect <-'.*[\\w]{0,1}and[\\w]{0,1}.*'
        pattern <- paste0(pattern.neg.ahead, pattern.detect)
        step.1 <- str_extract(street, regex(pattern, perl=TRUE))
        ids <- which(str_detect(step.1, regex(paste0('(?<=',l.regex$s,') '), perl=TRUE)))
        if (length(ids)>0){
            street.sub <- street[ids]
            extract.sub <- str_extract(street.sub, regex(pattern, perl=TRUE))
            replace.sub <- str_extract(extract.sub, regex(paste0(l.regex$s, '( [:alnum:])'),perl=TRUE))
            search.sub <- paste0(extract.sub,'(?=$)')
            names(replace.sub) <- search.sub
            street[ids] <- str_replace_all(street[ids], replace.sub)
        }

        # Fix leading street num ranges
        # costar.rent.scrape: "521-& 541 14th St"
        nums.amp <- str_extract(street, regex('(?<=^)[0-9]{1,}- [0-9]{1,}'))
        ids <- which(str_detect(street, nums.amp))
        street[ids] <- str_replace_all(street[ids], '- (?=[0-9]{1,1})', '-')

        nums <- sapply(str_split(street,' '),'[',1)
        nums.split <- sapply(nums, str_split, pattern='-')
        ids <- which(sapply(nums.split, length)>1)
        ids <- ids[which(!str_detect(names(ids),'Highway|Hwy'))]
        if (length(ids)>0){
            # print(paste('Checking and fixing', length(ids), 'street number ranges'))
            replace.sub <- clean.street.num(names(ids))
            names(replace.sub) <- paste0('(?<=^)', names(ids))
            street[ids] <- str_replace_all(street[ids], replace.sub)
            street <- str_trim(str_replace_all(street, regex('[\\s]{2,}', perl=TRUE), ' '))
        }
    }
    if (iter==6) print(paste0('Error with clean.street',  cbind(street.base[which(street !=street.base)],street[which(street !=street.base)])))
    street <- str_trim(str_replace(street, regex('(?i)( LLC)', perl=TRUE), ' LLC'))
    return(street)
}
#' @title clean.street.regex
#'
#' @description Builds regex used in clean.street (depends on data.table which had mclapply bug in R 3.4.0)
#' @keywords clean street regex
#' @export
#' @import stringr
#'     data.table
#'     stats
clean.street.regex <- function(){
  class <- NULL; search <- NULL; collapse <- NULL
  l.regex <- list()
  l.regex$intersection <- paste0('(?<= ',paste0(methods.string::abbrev[class=='suffix' & search!='Wy', search], collapse='|'),')and')
  units<- paste0(methods.string::abbrev[class=='unit' & search != '#',search],collapse='|')
  l.regex$unit.start <- paste0('(?i)(?<=^)(',units,')\\s[0-9]{1,}\\s(?=[0-9]{1,1})')
  l.regex$abbrev.compass.search <- paste0(methods.string::abbrev[class=='compass',search], collapse='|')
  l.regex$s <- paste0('(',paste0(paste0(unlist(methods.string::abbrev[class!='compass'&search!='#', .(search, paste0(search,'s'))])),collapse='|'),')')
  l.regex$x <- sapply(t(methods.string::abbrev[class=='unit' & search!='#', .(search, paste0(search,'s'))]), cbind, simplify=TRUE)
  l.regex$t <- paste0('(',paste0(paste0(sort(c(unlist(methods.string::abbrev[class!='compass'&search!='#', .(search, paste0(search,'s'))]), 'Highway', 'Broadway'))),collapse=' |'),')')
  l.regex$u <- paste0('(',paste0(paste0(sort(c(unlist(methods.string::abbrev[class!='compass'&search!='#', .(search, paste0(search,'s'))]), 'Highway', 'Bay', 'Broadway'))),collapse='|'),')')
  return(l.regex)
}
#' @title clean.street.num
#'
#' @description Cleans up street.num, makes sure that hypens are set correctly.
#' @param street.num character vector of street.nums
#' @keywords street.num
#' @export
#' @import stringr
#'     data.table
clean.street.num <- function(street.num){
    high <- NULL
    low <- NULL
    diff.int <- NULL
    lead.1 <- NULL
    nchar.high <- NULL
    num.2.new <- NULL
    char.diff <- NULL
    num.split<-str_split(street.num, pattern='-')
    num.1 <- sapply(num.split, '[', 1)
    num.2 <- sapply(num.split, '[', 2)
    nums <- data.table(low=num.1, high=num.2)
    # 103-
    nums <- nums[high=='', high:=low]
    id.weird <- as.integer(nums$low)>as.integer(nums$high)
    # 10371-103
    nums$nchar.high <- nchar(nums$high)
    nums <- nums[id.weird & str_sub(low, 1, nchar.high) == high, high:=low]
    id.weird <- as.integer(nums$low)>as.integer(nums$high)
    # 3740-70
    nums <- nums[ , char.diff := nchar(low) - nchar(high)]
    nums <- nums[ , lead.1 := str_sub(low,1,char.diff)]
    nums <- nums[ ,  num.2.new := paste0(lead.1, high)]
    nums <- nums[,   diff.int := as.integer(num.2.new)-as.integer(low)]
    nums <- nums[diff.int>0 & diff.int < 200 & id.weird, high := num.2.new]
    id.weird <- as.integer(nums$low)>as.integer(nums$high)
    nums[id.weird]
    # 1565-1557
    nums <- nums[id.weird & diff.int < 0 & diff.int > -100, high:=low]
    nums <- nums[id.weird & diff.int < 0 & diff.int > -100, low:=num.2.new]
    street.num <- paste(nums$low,nums$high, sep='-')
    return(street.num)
}
#' @title clean.street.unit
#'
#' @description Cleans up street.unit number.
#' @param vec.chars character vector of street.units. This function is very confusing
#' @keywords street.num
#' @export
#' @import stringr
#'     data.table
#'     utils
clean.street.unit <- function(vec.chars){
    # print(vec.chars)
    if (grepl("Cir 7-O", vec.chars)) {
        vec.chars <- 'Cir 7-0'
        return(vec.chars)
    }
    l <- list(iterates='', leads='', repeats='')
    check <- list()
    #suffix.done <- FALSE
    vec.chars <- str_replace(vec.chars, ' and | \\+ ', ' ')
    x <- unlist(str_split(vec.chars,' |-| and '))
    l$prefix <- x[1]
    x <- x[2:length(x)]
    n.x <- length(x)
    x.nums <- table(unlist(strsplit(x, split='[A-z]{1,}')),exclude = '')
    #n.nums <-length(x.nums)
    x.chars <- table(unlist(strsplit(x, split='[0-9]{1,}')),exclude = '')
    #n.chars <- length(x.chars)
    l.mix <- list()
    if (length(x.nums)>0) {
        l.mix$x.nums <- x.nums
    }
    if (length(x.chars)>0){
        l.mix$x.chars <- x.chars
    }
    # vec.chars <- 'Unit F G H', 'Drive 6F 6G 6H', 'Drive A1 A2 A3' 'Suite 120 130 140'
    check$iterates <- which(sapply(l.mix, length)==n.x)
    if (length(check$iterates)>0){
        vals <- l.mix[[check$iterates]][l.mix[[check$iterates]]==1]
        if(length(vals)==n.x){
            chars <- sapply(names(l.mix[[check$iterates]]), nchar)
            combs <- t(combn(chars,2))
            diff <- combs[,2]-combs[,1]
            if(max(diff) == 0){
                l$iterates <- names(l.mix[[check$iterates]])
            } else {
                l$iterates <- NULL
                return(vec.chars)
            }
        }
    }
    v.mix <- unlist(l.mix)
    names(v.mix) <- sapply(str_split(names(v.mix),'\\.'),function(x) x[length(x)])
    leadstring <- names(l.mix$x.chars[1])
    greg.match <- sapply(names(v.mix), function(x) gregexpr(pattern=x, leadstring),simplify=TRUE)
    if (length(x.chars)==0){
        return(vec.chars)
    } else {
        check$leads <- which.max(sapply(greg.match[greg.match==1], function(x) attr(x,'match.length')))
        if (length(check$leads)>0){
            if(names(check$leads) %in% l$iterates){
                check$leads<-''
            }
            if(check$leads==1){
                if(names(check$leads)==leadstring){
                    l$leads <- names(check$leads)
                } else {
                    l$repeats <- names(check$leads)
                }
            }
        }
        rep.checks <- unlist(sapply(l.mix, function(x) x==n.x))
        names(rep.checks) <- sapply(str_split(names(rep.checks),'\\.'),function(x) x[length(x)])
        check$repeats <- which(rep.checks)
        if (length(check$repeats)>0){
            l$repeats <- names(check$repeats)
        }
        x.tail <- paste0(l$repeats, l$iterates[c(1,length(l$iterates))], collapse='-')
        if(nchar(x.tail)==1){
            x.tail<-''
        }
        x.out <- paste0(l$prefix, ' ',l$leads, ' ', x.tail)
        x.out <- str_trim(str_replace_all(x.out, '(\\s){1,}', ' '))
        return(x.out)
    }
}
#' @title explode.address
#'
#' @description Explodes a single string addresss using explode.street and explode.cityStateZip.
#' @param DT data.table needs to have fields street, cityStateZip
#' @param lookup.address optional table from main data usually parcels.address
#' @keywords street.num
#' @export
#' @import stringr
#'     data.table
#'     tidyr
#' @importFrom tidyr gather
explode.address <- function(DT, lookup.address=NULL){
    street.num <- NULL
    street.num.low <- NULL
    street.num.hi <- NULL
    street.num.range <- NULL
    # Break out cityStateZip
    cityStateZip <- explode.cityStateZip(DT)
    DT$cityStateZip <- NULL
    DT <- data.table(DT, cityStateZip)
     # Break out street
    street <- DT$street
    DT$street <- NULL
    street.explode <- explode.street(street)
    # Bring it back together (remove old street name from DT)
    DT <- data.table(DT, street.explode)
    DT <- DT[, street.num:=as.integer(street.num)]
    DT$street.num <- NULL
    DT <- tidyr::gather(DT, street.num.range, street.num, street.num.low:street.num.hi)
    DT <- as.data.table(DT)
    DT$street.num.range <- NULL
    DT$street <- NULL
    DT <- unique(DT)
    if(!is.null(lookup.address)){
        DT <- fill.missing.zip.city(DT, lookup.address) # from erikbjohn/location package
    }
    return(DT)
}
#' @title explode.cityStateZip
#'
#' @description Explodes cityStateZip into city state and zip
#' @param DT data.table with column name containg address or cityStateZip
#' @keywords street.num
#' @export
#' @import stringr
#'     data.table
#'     stats
explode.cityStateZip <- function(DT){
    regex.study.cities <- paste0(methods.string::study.cities, collapse='|')
    x <- DT
    address.cols <- na.omit(str_extract(names(x), regex('(?i)(?=^)(address$|cityStateZip)', perl=TRUE)))
    address.col <- names(which.max(lapply(x[,(address.cols),with=FALSE], function(y) length(na.omit(y)))))[1]
    x <- x[, (address.col), with=FALSE]
    # Clean state (must do this first)
    x$cityStateZip <- clean.state(x$cityStateZip)
    # Clean city name
    x$cityStateZip <- clean.city(x$cityStateZip)
    # First pass: Extract to the city
    # Find last 'city' location in string
    # abc(?!.*abc)
    pattern.neg.ahead <- paste0('(?i)(',regex.study.cities,')(?!.*(',regex.study.cities,'))(.*.|$)')
    cityStateZip <- as.data.table(sapply(x,function(y) str_extract(y,pattern=regex(pattern.neg.ahead, perl=TRUE))))
    pattern.city <- paste0('(?i)(',regex.study.cities,')(?!.*(',regex.study.cities,'))')
    city <- data.table(sapply(cityStateZip,function(y) str_extract(y,pattern=regex(pattern.city, perl=TRUE)),simplify=TRUE, USE.NAMES = FALSE ))
    setnames(city, names(city), 'city')
    zip <- sapply(cityStateZip, extract.zip, USE.NAMES = FALSE)
    state.regex <- paste0('(?= )', paste0(methods.string::lookup.state$abb, collapse='|'), '(?=( |,))')
    state <- str_extract(cityStateZip, regex(state.regex, perl=TRUE))
    x <- data.table(city, state, zip)
    setnames(x, names(x), c('city', 'state', 'zip'))
    return(x)
}
#' @title explode.street
#'
#' @description Explodes street, into component pieces for matching, return a data.table.
#' @param street charcater vector with address
#' @keywords explode, street
#' @export
#' @import stringr
#'     data.table
#'     parallel
explode.street <- function(street){
    parcels.street.last <- NULL
    street.body <- NULL
    street.cum <- NULL
    street.direction.prefix <- NULL
    street.direction.suffix <- NULL
    street.first<- NULL
    street.head.first <- NULL
    street.last<- NULL
    street.num<- NULL
    street.num.hi<- NULL
    street.num.lookup<- NULL
    street.num.low<- NULL
    street.num.range<- NULL
    street.second.last<- NULL
    street.type<- NULL
    street.unit<- NULL
    street.unit.type<- NULL
    l.regex <- clean.street.regex()
    # tic <- Sys.time()
    # street <- sapply(street, function(x) clean.street(x, l.regex))
    # toc <- Sys.time()
    # difftime(toc, tic, 'mins')
    tic <- Sys.time()
    n.cores <- detectCores() - 2
    n.cores <- max(n.cores, 1)
    street <- unlist(mclapply(street, function(y) clean.street(y, l.regex), mc.cores = n.cores))
    toc <- Sys.time()
    difftime(toc, tic, 'mins')

    street.explode <- data.table(street=street, street.num = '', street.direction.prefix = '', street.type = '',
                                 street.unit = '', street.unit.type = '', street.direction.suffix='',
                                 street.num.low = '', street.num.hi = '', street.last = '', street.second.last = '')

    # Initialize direction abbrev matches
    abbrev.direction <- methods.string::abbrev[class=='compass']
    search.direction <- abbrev.direction$search.regex
    return.direction <- abbrev.direction$return
    names(return.direction) <- search.direction

    # Prefix: street.num
    street.explode <- street.explode[, street.cum := street]
    street.explode <- street.explode[, street.head.first := str_extract(street.cum, regex('(?=^)[(0-9)|-]{1,}\\s', perl=TRUE))] # Debug 7, 8
    street.explode <- street.explode[is.na(street.head.first), street.head.first:='']
    street.explode <- street.explode[, street.num:=street.head.first]
    street.explode <- street.explode[street.head.first!='', street.cum:=str_trim(str_replace(street.cum, street.head.first, ''))]
    street.explode <- street.explode[, street.head.first := '']
    street.explode <- street.explode[, street.first := str_trim(lapply(str_split(street, ' '), '[[', 1))]
    street.explode <- street.explode[is.na(street.num), street.num:='9999999']

    ## Prefix: street.num.low, street.num.high
    street.explode <- street.explode[, street.num.low := str_trim(lapply(str_split(street.first, '-'), '[[', 1))]
    street.explode <- street.explode[, street.num.low := str_extract(street.num.low, regex('[0-9]{1,}(?=[^0-9]|$)', perl=TRUE))]
    street.explode <- street.explode[, street.num.hi := str_trim(lapply(str_split(street.first, '-'), '[', 2))]
    street.explode <- street.explode[, street.num.hi := str_extract(street.num.hi, regex('[0-9]{1,}(?=[^0-9]|$)', perl=TRUE))]
    street.explode <- street.explode[is.na(street.num.low), street.num.low := '']
    street.explode <- street.explode[is.na(street.num.hi), street.num.hi := street.num.low]
    street.explode$street.first <- NULL
    street.explode$street.num <- NULL

    # Prefix: street.direction.prefix
    direction.set <- abbrev.direction[, search]
    street.explode <- street.explode[, street.head.first := str_trim(lapply(str_split(street.cum, ' '), '[[', 1))]
    street.explode <- street.explode[street.head.first %in% direction.set, street.direction.prefix := street.head.first] # Debug 3 fixed here

    street.explode <- street.explode[street.head.first %in% direction.set, street.cum :=
                                         str_trim(str_replace(street.cum,
                                                              regex(paste0('(?=^)', street.head.first, '\\s'), perl=TRUE), ''))]
    street.explode <- street.explode[street.head.first %in% direction.set, street.direction.prefix :=
                                         str_trim(str_replace_all(street.direction.prefix, return.direction))]
    street.explode$street.head.first <- NULL

    # Suffix
    ## Initialize street.last, street.second.last
    street.cum.word.count <- word.count(street.explode$street.cum)
    update.log <- street.cum.word.count > 1
    street.explode <- parcels.street.last(update.log, street.explode)
    update.log <- street.cum.word.count > 2
    street.explode <- parcels.street.second.last(update.log, street.explode)
    street.cum.word.count <- word.count(street.explode$street.cum)
    '3280 Boulder Cir Bldg G4 West'
    search.set <- c(methods.string::abbrev[class=='unit'|class=='suffix', search],'')
    update.log <- !(street.explode$street.last %in% search.set) & !(street.explode$street.second.last %in% search.set)
    street.explode <- street.explode[update.log, street.last := paste(street.second.last, street.last)]
    street.explode <- parcels.street.second.last(update.log, street.explode)
    street.cum.word.count <- word.count(street.explode$street.cum)

    ## Step 0.0: First swipe of  street.unit (no street.type)
    ### Debug 9, 10, 11, 13, 14
    street.explode <- street.explode[str_detect(street.last, regex('(?<=(^|-))[0-9]{1,}(?=$)', perl=TRUE)) &
                                         !(street.second.last %in% methods.string::abbrev[class=='unit', search]),
                                     street.unit:=street.last]
    ### Debug 15 '6390 Gunpark Dr B'
    x <- street.explode
    c1 <- !(x$street.second.last %in% methods.string::abbrev[class=='unit', search])
    c2 <- !(x$street.last %in% methods.string::abbrev[class=='suffix', search])
    c3 <- !(x$street.last %in% methods.string::abbrev[class=='compass', search])
    c4 <- x$street.last !=''
    c5 <- str_detect(x$street.last, regex('[A-z]{3,}$', perl=TRUE))==FALSE  #"4845 Van Gordon
    logical <- cbind(c1,c2,c3,c4,c5)
    logical <-apply(logical, 1, function(x) length(x[which(x==TRUE)])==length(x))
    street.explode<-street.explode[logical, street.unit:=street.last]
    update.log <- street.explode$street.unit != ''
    street.explode <- parcels.street.last.promote(update.log, street.explode)
    street.explode <- parcels.street.second.last(update.log, street.explode)
    #"3201 W Hampden Ave Ste"
    #update.promote <- street.explode$street.last == '' & street.explode$street.second.last != ''
    #street.explode <- street.explode[update.promote, street.last := street.second.last]
    #street.explode <- street.explode[update.promote, street.second.last := '']
    #street.explode <- parcels.street.second.last(update.log, street.explode)

    ## Step 0.5: First swipe of  Assign street.direction.suffix (Debug 4)
    street.explode <- parcels.street.direction.suffix(street.explode, return.direction, abbrev.direction)

    ## Step 1: Revisit street.unit, street.unit.type
    regex.unit <- paste0(paste(methods.string::abbrev[class=='unit', return], collapse = '|'))
    unit.set <- methods.string::abbrev[class=='unit', search]
    street.unit.base <- street.explode$street.unit
    update.log <- street.explode$street.second.last %in% unit.set & street.unit.base==''
    street.explode <- street.explode[update.log, street.unit:= street.last]

    search.unit.type <- paste0('(?<=^)',unit.set,'(?=$)')
    return.unit.type <- methods.string::abbrev[class=='unit', return]
    names(return.unit.type) <- search.unit.type

    street.explode <- street.explode[update.log,
                                     street.unit.type := str_replace_all(street.second.last, return.unit.type)]
    street.explode <- parcels.street.last(update.log, street.explode)
    street.explode <- parcels.street.second.last(update.log, street.explode)

    # Step 2: Revisit Assign street.direction.suffix
    street.direction.suffix.base <- street.explode$street.direction.suffix
    street.explode <- parcels.street.direction.suffix(street.explode, return.direction, abbrev.direction)
    update.log <- street.explode$street.direction.suffix != '' & street.direction.suffix.base == ''  # Debug 5/6
    street.explode <- parcels.street.last(update.log, street.explode)
    street.explode <- parcels.street.second.last(update.log, street.explode)

    # Step 3 Assign street.type
    ## Speed up
    street.types <- unique(street.explode$street.last)
    abbrev.street.type <- methods.string::abbrev[class=='suffix' & search %in% street.types]
    # Debug 16
    #if (nrow(abbrev.street.type) >0){
    search.regex.street.type <- paste0('(?i)(?<=^)(',abbrev.street.type$search,')(?=$)')
    regex.street.type <- paste0('(?i)(?<=^)(',paste0(abbrev.street.type$search,collapse='|'),')(?=$)')
    return.street.type <- abbrev.street.type$return
    if (length(return.street.type)>0){
        names(return.street.type) <- search.regex.street.type
        street.explode <- street.explode[str_detect(street.last, regex(regex.street.type, perl=TRUE)),
                                         street.type := str_replace_all(street.last,return.street.type)]
    }
    update.log <- street.explode$street.type != ''  # Fixed Debug 2
    street.explode <- parcels.street.last.promote(update.log, street.explode)
    street.explode <- parcels.street.second.last(update.log, street.explode)
    #}
    # street.body (create and clean)
    vec <- c('rd','nd','th', 'st')
    matches.pattern <- paste0('(?i)(?<=(^| )[0-9]{1,10})(', vec, ')(?= |$)')
    matches.replacement <- vec
    names(matches.replacement) <- matches.pattern

    street.explode <- street.explode[, street.body:=str_trim(paste(street.cum, street.second.last, street.last))]
    street.explode <- street.explode[, street.body := str_replace_all(street.body, matches.replacement)]
    street.explode <- street.explode[, street.body := str_replace_all(street.body, '[\\s]{1,}', ' ')]

    # Error catch for PO Box
    inds.pi <- str_detect(street, regex('(?i)Box [0-9]{1,}(?=$)'))
    street.explode <- street.explode[inds.pi==TRUE, street:=street]
    street.explode <- street.explode[inds.pi==TRUE, street.num.low:='0']
    street.explode <- street.explode[inds.pi==TRUE, street.num.hi:='0']
    street.explode <- street.explode[inds.pi==TRUE, street.direction.prefix:='']
    street.explode <- street.explode[inds.pi==TRUE, street.direction.suffix:='']
    street.explode <- street.explode[inds.pi==TRUE, street.unit:=str_extract(street, regex('[0-9]{1,}(?=$)',perl=TRUE))]
    street.explode <- street.explode[inds.pi==TRUE, street.unit.type:='Box']
    street.explode <- street.explode[inds.pi==TRUE, street.body:='PO']

    street.explode$street.last <- NULL
    street.explode$street.second.last <- NULL
    street.explode$street.cum <- NULL

    street.explode <- street.explode[, street.num.hi:=as.integer(street.num.hi)]
    street.explode <- street.explode[, street.num.low:=as.integer(street.num.low)]
    street.explode$street.num <- street.explode$street.num.low

    return(street.explode)
}
#' @title extract.state
#'
#' @description Extracts state from cityStateZip.
#' @param cityStateZip character vector with cityStateZip
#' @keywords extract, state
#' @export
#' @import stringr
extract.state = function(cityStateZip){
    cityStateZip <- clean.state(cityStateZip)
    regex.state.abb <- paste0('(',paste0(methods.string::lookup.state$abb, collapse='|'), ')')
    state <- str_extract(cityStateZip, regex(regex.state.abb, perl=TRUE))
    return(state)
}
#' @title extract.zip
#'
#' @description Extracts zipcode from cityStateZip.
#' @param cityStateZip charcater vector with cityStateZip
#' @keywords extract, zipcode, zip
#' @export
#' @import stringr
extract.zip = function(cityStateZip){
    zip <- str_extract(cityStateZip, regex('(?<=( |^|,|\\w))[0-9]{5,5}', perl=TRUE))
    zip[is.na(zip)] <- ''
    return(zip)
}
#' @title fill.missing.state
#'
#' @description Fills missing state by querying zips
#' @param x data.table consisting of address.id and zip
#' @keywords fill missing, zip, city, state
#' @export
#' @import stringr
#'     data.table
#' @importFrom dplyr mutate select one_of group_by_
fill.missing.state <- function(x){
    if('address.id' %in% names(x)){
    state <- NULL
    address.id <- NULL
    zip <- NULL
    zip.find <- NULL
    dt <- x[, .(zip.find=zip, address.id)]
    setkey(dt, zip.find)
    setkey(methods.string::lookup.zips,zip)
    dt <- methods.string::lookup.zips[dt]
    # county by state and fill
    states.found <- table(dt$state)
    states.found <- states.found[str_length(names(states.found))==2]
    states.found.n <- length(states.found)
    states.found.pct <- states.found/sum(states.found)
    states.notfound.pct <- states.found/nrow(dt)
    state.fill <- names(states.found)[which.max(states.found)]
    if(max(states.found.pct) > 0.9 & max(states.notfound.pct)){
       state.filled.n <- nrow(dt[zip==''])
       dt <- dt[zip=='', state:=state.fill]
       cat('Filled', state.filled.n, 'state records with', state.fill,'based on study consistency')
    }
    dt <- dt[, .(address.id, state)]
    setkey(dt, address.id)
    return(dt)
    } else {
        print('No address.id in methods.string::fill.missing.state. Omitted function')
        return(x)
    }
}
#' @title fill.missing.zip.city
#'
#' @description Fills missing zip and city by looking up in lookup.address if it exists.
#' @param DT data.table with exploded zip city state
#' @param lookup.address data.table used for lookup values
#' @keywords fill missing, zip, city, state
#' @export
#' @import stringr
#'     data.table
#'     dplyr
fill.missing.zip.city <- function(DT, lookup.address=NULL){
    city <- NULL
    street.num <- NULL
    num.distance <- NULL
    street.num.lookup <- NULL
    print('Attempting to fill missing zip and cities')
    # Convert NA values to ''f
    DT <- DT[is.na(zip), zip:='']
    DT <- DT[is.na(city), city:='']
    DT <- DT[str_detect(city, regex('(?i)(?<=( |^))na(?=( |$))', perl=TRUE)), city:='']
    DT.colnames <- names(DT)
    lookup.var <- 'zip'
    check.id <- grepl('.id', paste0(DT.colnames,collapse=','))
    if(check.id==FALSE){
        DT$temp.id <- seq(1:nrow(DT))
        var.id <- 'temp.id'
    } else {
        var.id <- grep('.id', DT.colnames,value = TRUE)
    }
    val.id <- as.symbol(var.id)

    vars.list <- list()
    vars.list$key.vars[[1]] <- c('street.direction.prefix', 'street.body', 'street.type', 'city', 'street.num')
    vars.list$lookup.var[[1]] <- 'zip'
    vars.list$key.vars[[2]] <- c('street.direction.prefix', 'street.body', 'street.type', 'street.num')
    vars.list$lookup.var[[2]] <- 'zip'
    vars.list$key.vars[[3]] <- c('street.direction.prefix', 'street.body', 'street.type', 'zip', 'street.num')
    vars.list$lookup.var[[3]] <- 'city'
    vars.list$key.vars[[4]] <- c('street.direction.prefix', 'street.body', 'street.type', 'street.num')
    vars.list$lookup.var[[4]] <- 'city'

    # Used for join (drop zip from colname)
    for(iComb in 1:length(vars.list$key.vars)){
        #print(iComb)
        # Set variable parameters
        key.vars <- vars.list$key.vars[[iComb]]
        lookup.var <- vars.list$lookup.var[[iComb]]
        lookup.val <- as.symbol(lookup.var)
        #DT.full.vars <- grep(lookup.var, DT.colnames, invert=TRUE, value=TRUE)
        #baseVars <- c(var.id, key.vars)

        # Initialize DT tables
        #DT.good <- DT[eval(lookup.val)!='']
        DT.missing <- DT[eval(lookup.val)=='']
        n.missing <- nrow(DT.missing)
        if(n.missing>0){
            if(exists('DT.lookup')){
                rm(DT.lookup)
            }
            DT.lookup <- DT.missing[, street.num := as.numeric(street.num)] #Not sure about this
            full.vars <- c(lookup.var, key.vars)
            #key.vals <- lapply(key.vars, as.symbol)
            lookup.val <- as.symbol(lookup.var)
            #full.vals <- lapply(full.vars, as.symbol)
            #vars.lookup <- c(key.vars, 'val.new')
            val.id <- as.symbol(var.id)

            if (exists('lookup.address')){
                lookup.address.table <- lookup.address[city!='' & zip != ''] %>%
                    dplyr::select(one_of(full.vars)) %>%
                    dplyr::mutate(street.num = as.numeric(street.num)) %>%
                    dplyr::mutate(street.num.lookup = street.num)
                lookup.address.table <- as.data.table(lookup.address.table)
            } else {
                lookup.address.table <- DT[city!='' & zip != ''] %>%
                    dplyr::select(one_of(full.vars)) %>%
                    dplyr::mutate(street.num = as.numeric(street.num)) %>%
                    dplyr::mutate(street.num.lookup = street.num)
                lookup.address.table <- as.data.table(lookup.address.table)
            }
            setkeyv(DT.lookup, key.vars)
            setkeyv(lookup.address.table, key.vars)
            DT.lookup.back <- lookup.address.table[DT.lookup, roll=-Inf]
            DT.lookup.forward <- lookup.address.table[DT.lookup, roll=Inf]
            DT.lookup <- rbindlist(list(DT.lookup.back, DT.lookup.forward))
            DT.lookup <- DT.lookup[, num.distance:=abs(street.num-street.num.lookup)]

            DT.lookup <-   DT.lookup %>%
                dplyr::group_by_(.dots=list(val.id)) %>%
                filter(rank(-num.distance, ties.method='first')==1)
            DT.lookup <- as.data.table(DT.lookup)
            DT.lookup <- DT.lookup[!is.na(street.num)]
            DT.lookup <- dplyr::select(DT.lookup, one_of(names(DT.missing)))
            DT.fixed <- DT.lookup[!is.na(eval(lookup.val))]

            checks <- vars.list$key.vars[[iComb]]
            bigs <- list('city', 'zip')
            if(length(unlist(sapply(bigs, function(x) which(x %in% checks))))==0){
                val.check <- as.symbol(bigs[[which(!(bigs %in% lookup.var))]])
                bad.ids <- unlist(DT[eval(val.check)!=''][, (var.id), with=FALSE])
                DT.fixed<-DT.fixed[!(eval(val.id) %in% bad.ids)]
            }

            if (nrow(DT.fixed)>0){
                # Fixed DT.id
                ids.fixed <- unlist(DT.fixed[, get(var.id)])
                ids.full <- unlist(DT[, get(var.id)])
                ids.keep <- which(!(ids.full %in% ids.fixed))
                DT <- DT[ids.keep]
                DT <- rbindlist(list(DT, DT.fixed), use.names=TRUE)
            }
            print(paste0(paste0(lookup.var, collapse=','),
                         ': missing ', nrow(DT.missing),': fixed ', nrow(DT.fixed)))
        }
    }
    if(check.id==FALSE) DT$temp.id<-NULL
    return(DT)
}
#' @title parcels.street.last
#'
#' @description Suffix operations for explode.street
#' @param update.log logical vector
#' @param street.explode data.table
#' @keywords street.explode, explode, helper
#' @export
#' @import stringr
#'     data.table
parcels.street.last <- function(update.log, street.explode){
    street.last <- NULL
    street.cum <- NULL
    street.explode <- street.explode[update.log, street.last := str_trim(str_replace(street.cum, regex('.*. ', perl=TRUE), ''))]
    street.explode <- street.explode[update.log,
                                     street.cum := str_trim(str_replace(street.cum, regex(paste0('(^|\\s)',street.last,'(?=$)'),
                                                                                          perl=TRUE), ''))]
    return(street.explode)
    }
#' @title parcels.street.last.promote
#'
#' @description Helper function for explode.street.
#' @param update.log logical vector
#' @param street.explode data.table
#' @keywords street.explode, explode, helper
#' @export
#' @import stringr
#'    data.table
parcels.street.last.promote <- function(update.log, street.explode){
    street.second.last <- NULL
    street.last <- NULL
    street.explode <- street.explode[update.log, street.last := street.second.last]
}
#' @title parcels.street.second.last
#'
#' @description Helper function for explode.street.
#' @param update.log logical vector
#' @param street.explode data.table
#' @keywords street.explode, explode, helper
#' @export
#' @import stringr
#'     data.table
parcels.street.second.last <- function(update.log, street.explode){
    street.cum <- NULL
    street.second.last <- NULL
    street.explode <- street.explode[update.log, street.second.last := str_trim(str_replace(street.cum, regex('.* ', perl=TRUE), ''))]
    street.explode <- street.explode[update.log, street.cum:=str_trim(str_replace(street.cum, regex(paste0('(^|\\s)',street.second.last,'(?=$)'), perl=TRUE), ' '))]
}
#' @title parcels.street.direction.suffix
#'
#' @description Helper function for explode.street.
#' @param street.explode data.table
#' @param abbrev.direction list of abbreviations to search for
#' @param return.direction no idea
#' @keywords street.explode, explode, helper
#' @export
#' @import stringr
#'    data.table
parcels.street.direction.suffix <- function(street.explode, return.direction, abbrev.direction){
    street.direction.suffix <- NULL
    street.last <- NULL
    street.num<-NULL
    street.num.lookup<-NULL
    #regex.direction <- paste0('(?i)(?<=^| )(', paste(abbrev.direction$search, collapse='|'), ')(?=$)')
    direction.set <- abbrev.direction$search
    update.log <- street.explode$street.last %in% direction.set
    street.explode <- street.explode[update.log,
                                     street.direction.suffix := street.last]
    street.explode <- street.explode[update.log,
                                     street.direction.suffix := str_trim(str_replace_all(street.direction.suffix,
                                                                                         return.direction))]
    street.explode <- parcels.street.last.promote(update.log, street.explode)
    street.explode <- parcels.street.second.last(update.log, street.explode)
    return(street.explode)
}
#' @title word.count
#'
#' @description Helper function for explode.street, sees how may words are in the holder street.cum, also binds some words together.
#' @param street.cum character vector
#' @keywords street.explode, explode, helper, cound words
#' @export
#' @import stringr
word.count <- function(street.cum){
    word.count <- sapply(strsplit(street.cum, "\\s+"), length)
    notting <- str_detect(street.cum, regex(paste0('Notting Hill '), perl=TRUE))
    july <- str_detect(street.cum, regex(paste0('Fourth Of July '), perl=TRUE))
    county.road <- str_detect(street.cum, regex(paste0('County (Road|Rd) ', '[0-9]{1,}(?=\\s|$)'), perl=TRUE))
    highway <- str_detect(street.cum, regex(paste0('(Highway |Hwy )', '[0-9]{1,}(?=\\s|$)'), perl=TRUE))
    us.highway <- str_detect(street.cum, regex(paste0('US (Highway|Hwy) ', '[0-9]{1,}(?=\\s|$)'), perl=TRUE))
    state.highway <- str_detect(street.cum, regex(paste0('State (Highway|Hwy) ', '[0-9]{1,}(?=\\s|$)'), perl=TRUE))
    foothills.highway <- str_detect(street.cum, regex(paste0('Foothills Hwy ', '[0-9|A-z]{1,}(?=\\s|$)'), perl=TRUE))
    peak2peak.highway <- str_detect(street.cum, regex(paste0('Peak To Peak Hwy ', '[0-9]{1,}(?=\\s|$)'), perl=TRUE))
    peak2peak <- str_detect(street.cum, regex(paste0('(Peak To Peak |Paseo Del Prado )', '[0-9]{1,}(?=\\s|$)'), perl=TRUE))
    word.count[notting] <- word.count[notting] - 2
    word.count[july] <- word.count[july] - 3
    word.count[county.road] <- word.count[county.road] - 2
    word.count[highway] <- word.count[highway] - 1
    word.count[us.highway] <- word.count[us.highway] - 2
    word.count[state.highway] <- word.count[state.highway] - 2
    word.count[foothills.highway] <- word.count[foothills.highway] - 1
    word.count[peak2peak.highway] <- word.count[peak2peak.highway] - 3
    word.count[peak2peak] <- word.count[peak2peak] - 2
    return(word.count)
}
#' @title underscore
#'
#' @description Convert spaces, dashes, and antying else to underscore.
#' @param x character vector
#' @keywords street.explode, explode, helper, cound words
#' @export
underscore <- function(x){
    x <- tolower(x)
    x <- gsub("(_.)", "\\U\\1", x, perl = TRUE)
    x <- gsub("(-.)", "\\U\\1", x, perl = TRUE)
    x <- gsub("( .)", "\\U\\1", x, perl = TRUE)
    x <- gsub(",", ' ', x, perl = TRUE)
    x <- gsub(" ", '-', x, perl = TRUE)
    x <- gsub("--", '-', x, perl = TRUE)
    x <- gsub("--", '-', x, perl = TRUE)
    x <- gsub("--", '-', x, perl = TRUE)
    x <- gsub('_', '', x, perl = TRUE)
    x <- gsub('-', '', x, perl = TRUE)
}
erikbjohn/methods.string documentation built on May 20, 2019, 2:22 p.m.