muniSpain: Working with historical local-level data in Spain

Documented in name_to_code

#' Returns INE codes corresponding to municipality names.
#'
#' @param muni Character string or character vector. It returns a character vector of
#'   INE codes equivalent to the input. If not found, it returns NA. In case two codes
#'   share a municipality name in the same province, it returns the municipality existing
#'   in latest census, or the one that had the name in latest census. Otherwise, it gives
#'   a warning. Usually, these problems are due to a municipality changing name and code,
#'   and then returning to the old denomination, but there might be more problematic cases.
#'
#' @param prov Optional, character vector. Include in case of name duplicates.
#'   If a duplicate is found and no province was specified, it returns an error message.
#'   If only one value is given for a set of municipalities, it is expanded to all of them.
#'
#'   Accepts official names, case-insensitive: Alava, Albacete, Alicante,
#'   Almeria, Avila, Badajoz, Baleares, Barcelona, Burgos, Caceres, Cadiz,
#'   Castellon, Ceuta, Ciudad Real, Cordoba, A Coruna, Cuenca, Girona,
#'   Granada, Guadalajara, Gipuzkoa, Huelva, Huesca, Jaen, Leon, Lleida,
#'   La Rioja, Lugo, Madrid, Malaga, Melilla, Murcia, Navarra, Ourense,
#'   Asturias, Palencia, Las Palmas, Pontevedra, Salamanca,
#'   Santa Cruz de Tenerife, Cantabria, Segovia, Sevilla, Soria, Tarragona,
#'   Teruel, Toledo, Valencia, Valladolid, Bizkaia, Zamora, Zaragoza.
#'   Also: Araba, La Coruna, Gerona, Lerida, Guipuzcoa, Vizcaya.
#'
#' @param year Integer, code_list will be restricted to municipalities existing in a
#'   particular census year. Must be a valid census year.
#'
#' @param year_excl Integer, code_list will exclude a particular census year.
#'   Must be a valid census year.
#'
#' @param force Logical (default = FALSE), if TRUE, it will return codes even if some municipaly
#'   names were found duplicated in more than one provinces (these will be returned as NA).
#'
#' @return Character vector equivalent to the input. NA if not found.
#'
#' @examples
#'   # name_to_code("Tapia") # Returns an error
#'   # name_to_code(c("Tapia", "Castropol")) # Returns an error
#'   name_to_code(c("Tapia", "Castropol"), force = TRUE)
#'   name_to_code("Tapia de Casariego")
#'   name_to_code(c("Tapia", "Castropol"), rep("Asturias", 2))
#'
#'
#' @export
name_to_code = function(muni, prov = NULL, year = NULL, year_excl = NULL, force = FALSE){

  if(class(muni) != "character"){
    stop("muni must be of class 'character'")
  }

  # Restrict code_list to years, if provided
  y_valid = c(1857, 1860, 1877, 1887, 1897, 1900, 1910, 1920,
    1930, 1940, 1950, 1960, 1970, 1981, 1991, 2001, 2011)
  # Only one of these options
  if(!is.null(year_excl) & !is.null(year)){
    stop("Can only give one argument: year OR year_excl")}
  # Include only these years
  if(!is.null(year)){
    if(!all(year %in% y_valid)){stop("Year must be a valid census year")}
    c_year_mat = !is.na(census[, paste0("c", year)])
    c_year = rowSums(c_year_mat) > 0
    c_year = census$muni_code[c_year]
    code_list = subset(codelist, muni_code %in% c_year)
  } else {code_list = codelist}
  # Exclude these years
  if(!is.null(year_excl)){
    if(!all(year_excl %in% y_valid)){stop("Year must be a valid census year")}
    c_year_mat = !is.na(census[, paste0("c", y_valid[!y_valid %in% year_excl])])
    c_year = rowSums(c_year_mat) > 0
    c_year = census$muni_code[c_year]
    code_list = subset(codelist, muni_code %in% c_year)
  } else {code_list = codelist}

  # Adapt muni to lower-case and add regex
  muni_ptt = gsub("\\(", "\\\\\\(", muni)
  muni_ptt = gsub("\\)", "\\\\\\)", muni_ptt)
  muni_ptt = paste0("(^|;)", muni_ptt, "(;|$)")

  # Expand province name if only one was given for many municipalities
  if(length(muni) > 1 & length(prov) == 1){
    prov = rep(prov, length(muni))
  }

  # Functions to find rows in code_list
  return_rows_noprov = function(muni_regex){
    rows = code_list[grepl(muni_regex, code_list$names, ignore.case = TRUE),]
    if(nrow(rows) != 0){rows$regex = muni_regex}
    if(length(unique(rows$prov)) > 1){
      if(force == TRUE){
        warning("Same names in different provinces (returning NA): Please specify provinces.")
        rows[1,] = NA
        rows = rows[1,]
        rows$regex = muni_regex
      } else if (force == FALSE){
        stop("Same names in different provinces (returning NA): Please specify provinces.")
      } else {stop("force argument undefined?")}
    }
    return(rows)
  }

  return_rows = function(muni_regex, prov){
    rows = code_list[grepl(muni_regex, code_list$names, ignore.case = TRUE) &
      code_list$prov_name == tolower(prov),]
    if(nrow(rows) != 0){rows$regex = muni_regex}
    return(rows)
  }

  # If something in prov, check & apply relevant variable
  if(!is.null(prov)){

    # Correct usual
    prov[tolower(prov) == "araba"] = "Alava"
    prov[tolower(prov) == "la coruna"] = "A Coruna"
    prov[tolower(prov) == "orense"] = "Ourense"
    prov[tolower(prov) == "gerona"] = "Girona"
    prov[tolower(prov) == "lerida"] = "Lleida"
    prov[tolower(prov) == "guipuzcoa"] = "Gipuzkoa"
    prov[tolower(prov) == "vizcaya"] = "Bizkaia"

    if(!all(tolower(prov) %in% code_list$prov_name)){
      stop(
        paste0(
          "Province names not valid, misspelled? NOT MACHED: ",
          paste(unique(prov[!tolower(prov) %in% code_list$prov_name]), collapse = "; ")
        )
      )
    }
    rows_found = apply(cbind(muni_ptt, prov), 1, function(x) return_rows(x[1], x[2]))

  # If prov = NULL
  } else {
    rows_found = lapply(muni_ptt, function(x) return_rows_noprov(x))
  }

  # Get municipality codes
  rows_to_code = function(code_list_row){

    # Get municipality codes and check for non-standard lengths
    code = code_list_row$muni_code

    # If no one, just return NA
    if(length(code) == 0){code = NA}

    # If more than 1, messy
    if(length(code) > 1){

      # Get census with those codes
      s_census = subset(census, muni_code %in% code)

      # If both codes were present in last census, return the one matching
      if(!any(is.na(s_census[, ncol(s_census)]))){
        s_census = subset(s_census,
          grepl(unique(code_list_row$regex), muni_name, ignore.case = TRUE))
      # If some but not all were missing from last census, return the one existing
      } else if (any(!is.na(s_census[, ncol(s_census)]))){
        s_census = s_census[!is.na(s_census[, ncol(s_census)]),]
      # If all missing, just Warning + NA
      } else {
        code = NA
        print(paste0("WARNING. Problem with ",
          gsub("\\(|\\)|\\$|\\;|\\^|\\|", "", unique(code_list_row$regex)),
          " in ", unique(code_list_row$prov_name)))
      }

      # Check if results is unique, if not just throw Warning + NA
      if(nrow(s_census) == 1){
        code = s_census$muni_code
        print(paste0("Several matches for ",
          gsub("\\(|\\)|\\$|\\;|\\^|\\|", "", unique(code_list_row$regex)),
          " in ", unique(code_list_row$prov_name), ". Returning: ", code))
      } else {
        code = NA
        print(paste0("WARNING. Problem with ",
          gsub("\\(|\\)|\\$|\\;|\\^|\\|", "", unique(code_list_row$regex)),
          " in ", unique(code_list_row$prov_name)))
      }

    }

    return(code)

  }

  codes = unlist(lapply(rows_found, rows_to_code))
  return(codes)

}