R/cleaning_standardizing_preprocessing_functions.R

Defines functions resolve_entity_types remove_extra fill_type clean_country_iso

Documented in clean_country_iso fill_type remove_extra resolve_entity_types

# Data-Driven Lab
# Functions used for cleaning and standardizing of values in the
# pre-processing of data for fuzzy matching

#' @export
#' @title Cleans the dataset's country names and adds iso.
#' @description Cleans the dataset actors' countries using exact matches based
#' on the package's country dictionary and adds corresponding iso information to
#' the dataset. For more information on \code{clean_enc}, see the vignette.
#'
#' @details The \code{clean_enc} argument is used for users to denote whether the data is
#' read in with the correct encoding. Use \code{clean_enc = T} if users are sure of the encoding
#' that the data is in and is sure data is read in with the right encoding and
#' \code{clean_enc = F} if users are unsure of data's encoding or if unsure if data is read in
#' with the correct encoding.
#'
#' If \code{clean_enc = F}, then the function will check the encoding of the country column in
#' the user's dataset and tries to fix it if the data was read in with the wrong encoding
#' (with \code{rvest::repair_encoding()}).
#' @param dataset Dataset to clean the country names for
#' @param country.dict Country dictionary to clean the dataset against
#' @param iso Input either 2 or 3 to select for 2 or 3 letter ISO code. Defaults to ISO3
#' @param clean_enc Is the data read in with the correct encoding?
#' If unknown, set as FALSE. Defaults to TRUE
#'
#' @return The original dataset with the country names cleaned
#'
#' @example \dontrun{clean_country_iso(df, country_dict, iso = 3, clean_enc = F)}
clean_country_iso <- function(dataset, country.dict, iso = 3, clean_enc = T) {
  # If not sure if data is clean, check and convert to try and repair the encoding
  if (!is.logical(clean_enc)){
    stop("clean_enc argument requires a logical (True/False) input.")
  }
  # Check for column naming using helper function
  dataset <- .col_check(dataset, "country", environment())
  if (exists("to.stop")){
    stop("Stopping function. Missing the \"country\" ",
         "column.")
  }
  # If not sure if data is clean, check and convert to try and repair the encoding

  if (!clean_enc & !all(is.na(dataset$country))){
    dataset$country <- .check_and_convert(dataset$country)
  }
  match_country <- which(dataset$country %in% .check_and_convert(country.dict$wrong))
  dataset$country[match_country] <- country.dict$right[match(dataset$country[match_country],
                                                             .check_and_convert(country.dict$wrong))]
  country_ind <<- which(!(dataset$country %in% country.dict$right))

  if (iso != 2 & iso != 3){
    stop("Please input either 2 or 3 for the \"iso\" argument.")
  }
  if (!("iso" %in% tolower(names(dataset)))){
    dataset$iso <- NA
  }
  if (iso == 2){
    dataset$iso <- country.dict$iso2[match(toupper(dataset$country),
                                           toupper(country.dict$right))]
  } else if (iso == 3){
    dataset$iso <- country.dict$iso[match(toupper(dataset$country),
                                          toupper(country.dict$right))]
  }
  if (length(country_ind) != 0){
    print(paste0("There are ", length(country_ind),
                 " entries with no exact matches in the country dictionary. The indices",
                 " for these names are recorded in the country_ind vector",
                 " Please use the fuzzify_country function to clean those names or input",
                 " them manually."))
  }

  # Helper function returns output that checks if the name is capitalized
  # Change back to capitalized version if the check is true
  if (exists("countryname")){
    names(dataset)[grepl("^country$", names(dataset))] <- countryname
  }
  return(dataset)
}


#' @export
#' @title Fills in the corresponding actor entity type for the dataset based on the
#' actor name
#' @description Guesses the entity type for the actors within the dataset based on commonly
#' used words for the respective entity types.
#'
#' @details The fill type function looks at the actor name and looks for common words
#' for each entity type (such as "City", "Town" for cities and "Inc.", "Co." for companies)
#' to guess the entity type for that actor.
#' @param dataset Dataset to fill in the entity type for
#'
#' @return The original dataset with entity types filled for the actors
#'
#' @example \dontrun{fill_type(df)}
fill_type <- function(dataset) {
  # Check for column naming
  col <- "entity_type"
  dataset <- .col_check(dataset, col, environment())
  if (exists("to.stop")){
    stop(paste0("Stopping function. Please create or rename a \"", col, "\"",
                "column."))
  }
  # fill the corresponding entity type with "City" if it contains a U.S. state abbreviation
  dataset$entity_type[grep(", AL|, AK|, AZ|, AR|, CA|, CO|, CT|, DE|, FL|, GA|, HI|, ID|
                           , IL|, IN|, IA|, KS|, KY|, LA|, ME|, MD|, MA|, MI|, MN|, MS|
                           , MO|, MT|, NE|, NV|, NH|, NJ|, NM|, NY|, NC|, ND|, OH|, OK|
                           , OR|, PA|, RI|, SC|, SD|, TN|, TX|, UT|, VT|, VA|, WA|, WV|
                           , WI|, WY",
                           dataset$name, ignore.case = TRUE)] <- "City"

  # fill corresponding entity type with Company if it contains words that are usually associated with companies
  # this filling comes after the state abbreviations so that any company with "Inc"
  # (or similar company names that happen to have a ", ..") aren't coded as cities
  dataset$entity_type[grep(" Inc|servic|limited| Co\\.|inc\\.|util|
                           constr|contract|plc|ltd|plc\\.|ltd\\.|P\\.L\\.C\\.|
                           L\\.T\\.D\\.|produ|LLP|L\\.L\\.P\\.|L\\.L\\C\\.|LLC|
                           group|hold|ltda|l\\.t\\.d\\.a\\.|craft |corp\\.| chapter|
                           congregation|makers|method| stage|indust|organic|organiz|
                           ingredi|transpo|glass|agricul|archite|hortic|logis|bevera|
                           market|system|syst|corpo|packag|soluti|softwa|integra|perfo|
                           desig|SRl|S\\.R\\.L\\.|chemic|cream|company|freight|metal|
                           electr|intern|int'l|intl|aero|alcoh|contai|special|S\\.A\\.|
                           SA\\.|sa de cv|sa de c\\.v\\.|s\\.a\\. de cv|
                           s\\.a\\. de c\\.v\\.|C\\.V\\.|CV\\.|artform|corporat|co\\.|
                           ltd\\.|print|maint|steel|rail|bank |banco |auto|build|
                           special|plastic|health|medical|maintain|concie|office|hotel|
                           food|center|charg|therap|pharma|device|harbor|harbour|mecan|
                           mecha|ltda|L\\.T\\.D\\.A\\.|ltda\\.|styli|style|casting|
                           investm|venture|textil|knit|appare|merchan|sourc|soup|computer|
                           labora|farm|greenh|outdoor|access|custom|produc|rubber|brewing|
                           wood|lumber|bakery| baker| brand|dairy|confecti|interface|
                           corporate|contract|electric|telecom|recycl|waste|energy|
                           enviro|furnit|technolog|micro|surgic|manufac|interio|retail|
                           holiday|worldwide|company|enterprise|propert|power",
                           dataset$name, ignore.case = TRUE)] <- "Company"

  # fill the corresponding entity type with City if it contains words associated
  # with "city" (as they're coded in the key dictionary)
  # this comes after the filling of company entity types because some city names
  # will include "corporation", "brand" or similar "company-esque" nwords in
  # their name (e.g. Pune Municipal Corporation is the civic body that governs Pune, a city in India)
  dataset$entity_type[grep("City|Muni|Town|County|Shire|District|Village|
                           Comm|Metro|Council|Ministry|Canton|Reg",
                           dataset$name, ignore.case = TRUE)] <- "City"

  # fill corresponding entity type with Region if it contains words associated
  # with "regions" (as coded in the key dictionary)
  # reasoning for having this entity type filled after companies is the same as
  # why the filling of cities came after companies
  dataset$entity_type[grep("Prov[e|i]nce|Region|State|Prefect",
                           dataset$name, ignore.case = TRUE)] <- "Region"

  # alert the user to fill any NAs in the entity_type column, if they exist
  missing_ET <- sum(is.na(dataset$entity_type))
  if (missing_ET > 0) {
    print(paste0("You seem to have ", missing_ET, " missing values in the entity_type column. Please fill these before proceeding."))
  }
  cat("Warning: This function will be generally accurate for the entity type of most--but not all--entries. It is highly recommended you double check the entity types, fix any errors, and fill in any missing values. ")
  # Helper function returns output that checks if the name is slightly different
  # Change back to capitalized version if the check is true
  if (exists(paste0("entity_typename"))){
    names(dataset)[grepl("^entity_type$", names(dataset))] <- entity_typename
  }
  return(dataset)
}

# #' @export
# #' @title Standardizes existing entity types within the dataset
# #' @description Similar to function \code{fill_type()}, guesses the entity type of actors based
# #' based on commonly used words. However, this function applies specifically
# #' to differentiate between different subnational actors (City vs Region)
# #' @param dataset Dataset to standardize and clean the entity type for
# #' @return Original dataset with the entity types cleaned
# #'
# #' @example \dontrun{standardize_type(df)}
# standardize_type <- function(dataset) {
#   # Check for column naming
#   col <- "entity_type"
#   .col_check(dataset, col)
#   if (exists("to.stop")){
#     stop(paste0("Stopping function. Please create or rename a \"", col, "\"",
#                 "column."))
#   }
#   dataset$entity_type[grep("City|Muni|Town|County|Shire|District|Village|Assembly|Comm|Metro|Council|Ministry|Authority|Canton|Reg",
#                            dataset$entity_type, ignore.case = TRUE)] <- "City"
#   dataset$entity_type[grep("Prov|Region|Government|State|Pref",
#                            dataset$entity_type, ignore.case = TRUE)] <- "Region"
#   # Helper function returns output that checks if the name is capitalized
#   # Change back to capitalized version if the check is true
#   if (exists(paste0("entity_typename"))){
#     names(dataset)[grepl("entity_type", names(dataset))] <- entity_typename
#   }
#   return(dataset)
# # }

#' @export
#' @title Remove extraneous words from actors' names
#' @description Removes extraneous words such as "council", "district", etc. from the
#' names of actors to improve phonetic matching accuracy. See vignette
#' for the full list of "extraneous" words.
#'
#' @details It is recommended that you use this function after the \code{fill_type()} and
#' \code{resolve_entity_types()} functions, as those functions require the presence of these
#' "extraneous" words to be fully effective.
#'
#' @param dataset Dataset containing actors' names
#'
#' @return Dataset with extraneous words removed from actors' names
remove_extra <- function(dataset){
  # cleaning names by getting rid of extraneous words
  # this list of words can be updated in the future
  # Check for column naming using helper function
  col <- "name"
  dataset <- .col_check(dataset, col, environment())
  if (exists("to.stop")){
    stop(paste0("Stopping function. Please create or rename a \"", col, "\"",
                "column."))
  }
  words <- c("council", "adjuntament", "corporation", "government", "district",
             "mayor", "city", "autonomous", "state", "province", "provincial",
             "county", "municipality", "municipalidadde", "municipalidad",
             "municipio", "kommune", "municipal", "prefecture", "prefectural",
             "metropolitana", "metropolis", "metropolitan", "metropole",
             "town", "community", "communat", "communat", "Ayuntamiento",
             "Gemeente", "Comune", "Kommune", "Republic",
             "city of", "municipality of", "town of", "province of",
             "comune di", "municipalidad de", "prefeitura de" ,
             "munic[\u00ed]pio de", "company", "pte", "inc", "group", "ltd")
  words_rm <- paste0(c(paste("\\s", trimws(words), "\\s", sep = ""),
                       paste("\\s", trimws(words), "$", sep = ""),
                       paste("^", trimws(words), "\\s", sep = "")),
                     collapse = "|")

  dataset$name <- trimws(gsub(words_rm, " ", dataset$name,
                              ignore.case = T))

  # Helper function returns output that checks if the name is capitalized
  # Change back to capitalized version if the check is true
  if (exists(paste0("namename"))){
    names(dataset)[grepl("^name$", names(dataset))] <- namename
  }
  return(dataset)
}

#' @export
#' @title Resolve conflicting entity types
#' @description Resolve entity type conflicts between user's dataset and the key
#' dictionary. The function checks for entries that have the same name and are from the
#' same country but have different entity types.between the user's dataset and
#' the key dictionary. Users can decide whether they would like to accept the key
#' dictionary's entity types or export a dataframe of the differences for inspection.
#'
#' @param dataset Dataset containing actors' names
#' @param key.dict Key dictionary
#' @param clean_enc Is the data read in with the correct encoding?
#' If unknown, set as FALSE. Defaults to TRUE.
#' @return Returns the dataset with entity types resolved (if applicable)
#' @return Also returns a dataset that contains the different conflicts (if applicable)
#'

resolve_entity_types <- function(dataset, key.dict, clean_enc = T){
  if (!is.logical(clean_enc)){
    stop("clean_enc argument requires a logical (True/False) input.")
  }
  # If not sure if data is clean, check and convert to try and repair the encoding
  if (!clean_enc){
    dataset$name <- .check_and_convert(dataset$name)
  }
  # Check for column naming using helper function
  dataset <- .col_check(dataset, "name", environment())
  dataset <- .col_check(dataset, "entity_type", environment())
  dataset <- .col_check(dataset, "iso", environment())
  if (exists("to.stop")){
    stop("Stopping function. Missing the \"name\", \"entity_type\", or \"iso\" columns.")
  }
  # Find actors that have the correct names and iso but not the correct entity type
  # Match for both right and wrong columns in the key_dict
  # Subset data for those that have correct names and iso first
  name_iso_right <- which(paste0(dataset$name, dataset$iso) %in% paste0(key.dict$wrong, key.dict$iso))
  name_iso_right_short <- which(unique(paste0(dataset$name, dataset$iso)) %in% paste0(key.dict$wrong, key.dict$iso))
  # Coerce NA entity types to random character
  dataset$entity_type[intersect(which(is.na(dataset$entity_type)), name_iso_right)] <- ","
  # Check for wrong entity types in those that have correct names and iso
  dict_ind <- match(paste0(dataset$name[name_iso_right], dataset$iso[name_iso_right]), paste0(key.dict$wrong, key.dict$iso))
  ent_ind <- name_iso_right[dataset$entity_type[name_iso_right] != key.dict$entity_type[dict_ind]]
  dict_ent_ind <- dict_ind[dataset$entity_type[name_iso_right] != key.dict$entity_type[dict_ind]]
  # Let users decide which entity type conflicts they want to resolve
  if (length(ent_ind) != 0){
    # Print number of conflicts
    cat(paste0("We found ", length(ent_ind), " number of entries with the same actor name ",
               "and iso but conflicting entity types. Would you like to resolve all",
               " conflicts by accepting the key dictionary's entity type? (Y/N)"))
    ans <- readline(prompt = "Answer: ")
    # Make sure user enters valid response
    # Allow users to 1) accept all key dict's entity types or 2) not resolve and export
    # a dataset of conflicts
    while (substr(toupper(as.character(ans)), 1, 1) != "Y" & substr(toupper(as.character(ans)), 1, 1) != "N"){
      cat("Please enter a valid input (Y/N)")
      ans <- readline(prompt = "Answer: ")
    }
    if (substr(toupper(as.character(ans)), 1, 1) == "Y"){
      # Resolve conflicts by taking all key dict's entity types
      dataset$entity_type[ent_ind] <- key.dict$entity_type[dict_ent_ind]
      return(dataset)
    } else if (substr(toupper(as.character(ans)), 1, 1) == "N"){
      # Export a dataset consisting of conflicts
      cat(paste0("A dataframe of the actors with conflicting entity types will be ",
                 "created for closer inspection of the conflicts."))
      entity_conflicts <<- data.frame(name = dataset$name[ent_ind],
                                      iso = dataset$iso[ent_ind],
                                      user_entity = dataset$entity_type[ent_ind],
                                      keydict_entity = key.dict$entity_type[dict_ent_ind],
                                      user_index = ent_ind,
                                      keydict_index = dict_ent_ind)
      return(dataset)
    }
  } else {
    cat(paste0("There are no entries with conflicting entity types."))
    return(dataset)
  }
  if (exists(paste0("namename"))){
    names(dataset)[grepl("^name$", names(dataset))] <- namename
  }
  if (exists(paste0("entity_typename"))){
    names(dataset)[grepl("^entity_type$", names(dataset))] <- entity_typename
  }
  if (exists(paste0("isoname"))){
    names(dataset)[grepl("^iso$", names(dataset))] <- isoname
  }
}
datadrivenenvirolab/ClimActor documentation built on April 23, 2024, 7:40 a.m.