R/authors_address.R

Defines functions authors_address

#' Parses out address information and splits it into its respective parts.
#' This is an internal function used by \code{authors_clean}
#' 
#' \code{authors_address} This function takes the output from 
#' \code{references_read} and pulls out address information. Splitting it into
#' university, department, city, state, etc. 
#' @param addresses the addresses
#' @param ID the authorID
#' @noRd
authors_address <- function(addresses, ID){
  message("\nSplitting addresses\n")

  list_address <- strsplit(addresses, ",")
  university_list <- vapply(list_address, function(x) x[1], character(1))
  country_list <- vapply(list_address, function(x) {
    gsub("\\_", "", x[length(x)]) },
    character(1))
  country_list <- trimws(country_list, which = "both")
  pc_list <- trimws(substr(country_list, 1, (vapply(regexpr("USA",
    country_list), function(x) x[1], numeric(1))) - 1), which = "right")
  state_list <- pc_list

  state_list[nchar(state_list) > 0] <- regmatches(
    state_list[nchar(state_list) > 0],
    regexpr("[[:upper:]]{2}", state_list[nchar(state_list) > 0])
  )

  pc_list[nchar(pc_list) > 2] <- regmatches(pc_list[nchar(pc_list) > 2],
    regexpr("[[:digit:]]{5}", pc_list[nchar(pc_list) > 2]))
  pc_list[nchar(pc_list) < 3] <- ""
  country_list <- ifelse(grepl("USA", country_list), "USA", country_list)

  list_address1 <- lapply(list_address, function(x) x[-c(1, length(x))])

  # Because formats of address printing is different across platforms
  # We are going to split using a tier system assuming first and last
  # info is somewhat reliable and guess the other info from the
  # remaining position of the info

  second_tier_list <- lapply(list_address1, function(x) x[length(x)])
  second_tier_list <- trimws(second_tier_list, which = "both")
  second_tier_list[second_tier_list == "character(0)"] <- NA

  list_address2 <- lapply(list_address1, function(x) x[-c(length(x))])

  third_tier_list <- lapply(list_address2, function(x) x[length(x)])
  third_tier_list <- trimws(third_tier_list, which = "both")
  third_tier_list[third_tier_list == "character(0)"] <- NA

  # All remaining info is just shoved in this category
  remain_list <- lapply(list_address2, function(x) x[-c(length(x))][1])
  remain_list <- trimws(remain_list, which = "both")
  remain_list[remain_list == "character(0)"] <- NA

  a_df <- data.frame(
    adID = ID, university = university_list,
    country = country_list,
    state = state_list, postal_code = pc_list, city = NA,
    department = NA, second_tier = second_tier_list,
    third_tier = third_tier_list,
    remain = remain_list, address = addresses,
    stringsAsFactors = FALSE
  )

  # try to fix the USA spots, which vary in format than other countries
  a_df$city[nchar(a_df$state) > 0] <- a_df$second_tier[nchar(a_df$state) > 0]
  a_df$state[nchar(a_df$state) == 0] <- NA
  a_df$postal_code[nchar(a_df$postal_code) == 0] <- NA
  a_df$department[!is.na(a_df$state) & !is.na(a_df$postal_code) &
      !is.na(a_df$state)] <- a_df$third_tier[!is.na(a_df$state) &
          !is.na(a_df$postal_code) & !is.na(a_df$state)]
  # fix a US problem when USA is not tacked onto the end

  us_reg <- "[[:alpha:]]{2}[[:space:]]{1}[[:digit:]]{5}"
  a_df$state[ grepl(us_reg, a_df$country) ] <-
    substr(a_df$country[ grepl(us_reg, a_df$country) ], 1, 2)

  a_df$postal_code[ grepl(us_reg, a_df$country) ] <-
    substr(a_df$country[grepl(us_reg, a_df$country)], 4, 8)

  a_df$country[grepl(us_reg, a_df$country)] <- "USA"

  ##########################
  # We'll use regular expression to pull zipcodes
  # These formats differ by region
  int1 <- "[[:alpha:]]{2}[[:punct:]]{1}[[:digit:]]{1,8}"
  int2 <- paste("[[:space:]][[:upper:]][[:digit:]][[:upper:]]",
                 "[[:space:]][[:digit:]][[:upper:]][[:digit:]]", sep="")
  int3 <- "[[:alpha:]][[:punct:]][[:digit:]]{4,7}"
  int4 <- "[:upper:]{1,2}[:alnum:]{1,3}[:space:][:digit:][:alnum:]{1,3}"
  int <- paste(int1, int2, int3, int4, sep = "|")

  UK <- paste("[[:upper:]]{1,2}[[:digit:]]{1,2}[[:space:]]",
              "{1}[[:digit:]]{1}[[:upper:]]{2}", sep="")

  Mexico <- "[[:space:]]{1}[[:digit:]]{5}" # technically US as well

  Panama <- "[[:digit:]]{4}-[[:digit:]]{5}"

  zip_search <- paste0(int, "|", UK, "|", Mexico, "|", Panama)

  ###########################
  id_run <- a_df$adID[is.na(a_df$state) & is.na(a_df$postal_code) &
      a_df$address != "Could not be extracted"]
  ###########################

  # We now iteratively run through the addresses using the concept that
  # certain information always exists next to each other.
  # Ex. city, state, country tend to exist next to each other.
  # We use the position of the zipcode also to help guide us
  # in where the information lies as well as how many fields were
  # given to us.
  for (i in id_run) {
    found <- FALSE
    row <- which(a_df$adID == i)
    university <- a_df$university[row]
    second_tier <- a_df$second_tier[row]
    third_tier <- a_df$third_tier[row]
    remain <- a_df$remain[row]
    city <- NA
    state <- NA
    postal_code <- NA
    department <- NA
    grepl(zip_search, second_tier)
    grepl(zip_search, third_tier)
    # 2nd tier
    if (grepl(zip_search, second_tier)) {
      found <- TRUE
      postal_code <- regmatches(second_tier, regexpr(zip_search, second_tier))
      city <- gsub(zip_search, "", second_tier)
      department <- ifelse(is.na(remain), third_tier, remain)
    }
    # 3RD tiers
    if (grepl(zip_search, third_tier) & !found) {
      found <- TRUE
      postal_code <- regmatches(third_tier, regexpr(zip_search, third_tier))
      city <- gsub(zip_search, "", third_tier)
      state <- second_tier
      department <- remain
    }

    if (!found) {
      state <- second_tier
      city <- third_tier
      department <- remain
    }
    # To make university searching more efficient we'll override values
    # based on if it has university/college in the name,
    # where university overides college
    override_univ <- grepl("\\buniv\\b|\\buniversi",
      c(second_tier, third_tier, remain, city, university),
      ignore.case = TRUE) &
      !grepl("\\bdrv\\b|\\bdrive\\b",
        c(second_tier, third_tier, remain, city, university),
        ignore.case = TRUE)

    if (any(override_univ)) {
      university <-
        c(second_tier, third_tier, remain, city, university)[override_univ][1]
      assign(
        c("second_tier", "third_tier", "remain", "city", "university")[
          override_univ][1],
        NA
      )
    }
    # only if university doesnt already exist
    override_univ_col <-
      grepl("\\bcol\\b|college|\\bcoll\\b",
        c(second_tier, third_tier, remain, city, university),
        ignore.case = TRUE) &
      !grepl("\\bdrv\\b|\\bdrive\\b",
        c(second_tier, third_tier, remain, city, university),
        ignore.case = TRUE)

    if (!any(override_univ) & any(override_univ_col)) {
      university <-
        c(second_tier, third_tier, remain, city, university )[
          override_univ_col][1]

      assign(
        c("second_tier", "third_tier", "remain", "city", "university")[
          override_univ_col][1],
        NA
      )
    }
    # more risky, but institutions as well, just incase its not a university
    override_univ_inst <- grepl("\\binst\\b|\\binstitut",
      c(second_tier, third_tier, remain, city, university),
      ignore.case = TRUE)
    if (
      !any(override_univ) & !any(override_univ_col) & any(override_univ_inst)
      ) {
      department <- c(second_tier, third_tier, remain, city, university )[
        override_univ_inst][1]

      assign(
        c("second_tier", "third_tier", "remain", "city", "university")[
          override_univ_inst][1],
        NA
      )
    }

    a_df$city[row] <- gsub("[[:digit:]]", "", city)
    a_df$state[row] <- gsub("[[:digit:]]", "", state)
    a_df$postal_code[row] <- postal_code
    a_df$department[row] <- department

    #########################Clock###############################
    total <- length(id_run)
    pb <- utils::txtProgressBar(min = 0, max = total, style = 3)
    utils::setTxtProgressBar(pb, which(id_run == i))
    #############################################################
  }

  city_fix <- is.na(a_df$city) & !is.na(a_df$state)
  a_df$city[city_fix] <- a_df$state[city_fix]
  a_df$state[city_fix] <- NA
  a_df$university[a_df$university == "Could not be extracted"] <- NA
  a_df$country[a_df$country == "Could not be extracted"] <- NA
  a_df$country[a_df$country == "Peoples R China"] <- "China"
  a_df$postal_code[grepl("[[:alpha:]]{1,2}-", a_df$postal_code)] <-
    vapply(strsplit(
      a_df$postal_code[ grepl("[[:alpha:]]{1,2}-", a_df$postal_code)],
      "-"),
      function(x) x[2], character(1)
    )
  #strip periods from the ends of city,state,country
  a_df$city <- gsub("\\.", "", a_df$city)
  a_df$state <- gsub("\\.", "", a_df$state)
  a_df$country <- gsub("\\.", "", a_df$country)
  a_df$country[a_df$country == ""] <- NA
  a_df$university[a_df$university == ""] <- NA
  a_df$postal_code[a_df$postal_code == ""] <- NA
  #convert to lower
  for (l in 2:ncol(a_df)){
    a_df[, l] <- tolower(a_df[, l])
  }
  return(a_df)
}
embruna/refsplitr documentation built on Feb. 8, 2022, 4:11 a.m.