R/geocode_cities_cp.R

Defines functions check_warn_cities_cp geocode_df_cities_cp geocode_df_foreign_cities geocode_cities_cp

Documented in check_warn_cities_cp geocode_cities_cp geocode_df_cities_cp geocode_df_foreign_cities

#' geocode a data.frame with french cities and postcode
#'
#' Since there is a postcode information, this function is stricter than geo_code_cities()
#' mispleeing or wrong postcode are not taken in account but an informative warning is created.
#'
#' If country_col is provided, the
#'
#' @param .data a dataframe that needs to be updated
#' @param city_col column name containing the city name in .data
#' This value will also be used to name the new columns.
#' @param cp_col  postcode columns
#' @param country_col contry_col (used to define the type of geocoding)
#'
#' @return the input data.frame with 3 new columns with a name based on city_col (_lat, _long, _cog)
#' If the function propose a new city / postcode then 2 other columns are added based on city_col and cp_col (proposition_)
#' @keywords internal
#'

geocode_cities_cp <- function(.data, city_col, cp_col, country_col){
  ## enquo col_names
  country_col <- rlang::enquo(country_col)
  city_col <- rlang::enquo(city_col)
  cp_col <- rlang::enquo(cp_col)
  city_col_name <- rlang::as_name(city_col)

  ## Define names of new columns
  new_cols <- list(NA_real_, NA_real_, NA_character_)
  names(new_cols) <- paste0(city_col_name, c("_lat", "_lon", "_cog"))
  # Geocode Cities -----------------------------------------------------------------------
  message("\n...V\u00e9rification de ",
          city_col_name, ".............")

  result <- .data %>%
    ## Geocode french cities with postalcode
    geocode_df_cities_cp(!!city_col,
                         !!cp_col,
                         !!country_col) %>%
    ## Geocode Foreign cities with country name
    geocode_df_foreign_cities(!!city_col,
                              !!country_col)

  ## add empty cols if they have not been added during the geocoding process
  result %>%
    tibble::add_column(!!!new_cols[setdiff(names(new_cols), names(result))])

}


#' geocode df for foreign cities
#'
#' A variation of geocode_df_cities for foreign cities.
#' Instead of using banR, we use a local list of cities in the world.
#' For the remaining result to be found, we use tidygeocoder with Open Street Map (nomatim) API
#'
#' Messages are not displayed when there is no error.
#' input.data is not altered even if contains special characters.
#' Column order is also preserved.
#' Countries that not "France" or NA will not be altered by this function
#'
#' Due to the limitation of Nomatim, if you request several time the same query it will likely be blacklisted temporarly by Nomatim API.
#'
#' @param .data a data.frame
#' @param city_col name of the column that contains the city names
#' @param country_col name of the column that contains the country names
#'
#' @return .data with additional (or updated) columns named after city_col with the extension:
#' _lat _lon and _cog
#' @keywords internal

geocode_df_foreign_cities <- function(.data,
                                      city_col,
                                      country_col){
  city_col <- rlang::enquo(city_col)
  city_col_name <- rlang::as_name(city_col)
  country_col <- rlang::enquo(country_col)
  country_col_name <- rlang::as_name(country_col)
  input_data <- .data
  nomatim_error <- FALSE

  ## Geocode ----------------------
  ## Clean df and detect foreign cities
  foreign_cities <- input_data %>%
    dplyr::transmute(
      id_rows = dplyr::row_number(),
      city = !!city_col,
      country = !!country_col) %>%
    dplyr::filter(!is.na(.data$country) &
                    !is.na(.data$city) &
                    .data$country != "France") %>%
    dplyr::mutate(
      country = dplyr::case_when(
        country %in% c("Angleterre", "Ecosse", "Irlande du Nord", "Pays de Galles") ~ "Royaume-Uni",
        stringr::str_detect(country,
                            "Etat.+Uni")  ~ "US",
        TRUE ~ country)
    )

  if(nrow(foreign_cities) == 0)
    return(.data)

  ## First search cities in local database (world_cities)---------
  geocoding <- foreign_cities %>%
    dplyr::mutate(
      country = countrycode::countryname(.data$country),
      city = stringi::stri_trans_general(.data$city, id = "Latin-ASCII")
    ) %>%
    dplyr::left_join(world_cities,
                     by = c("city","country")
    ) %>%
    dplyr::filter(!is.na(.data$lon)) %>%
    dplyr::group_by(.data$id_rows) %>%
    dplyr::slice_max(.data$pop) %>% ## Get biggest city in case of multiple result
    dplyr::ungroup() %>%
    dplyr::select(.data$id_rows, .data$lat, .data$lon)
  ## Search approximate values in local database----------------
  if(nrow(geocoding) != nrow(foreign_cities)){
    remaining <- foreign_cities %>%
      dplyr::anti_join(geocoding, by = "id_rows") %>%
      dplyr::mutate(
        country = countrycode::countryname(.data$country),
        city = stringi::stri_trans_general(.data$city, id = "Latin-ASCII")
      ) %>%
      tidyr::drop_na(.data$city, .data$country)

    geocoding_approx <- remaining %>%
      fuzzyjoin::stringdist_left_join(
        world_cities %>%
          dplyr::filter(.data$country %in% unique(remaining$country)),
        by = c("city", "country"),
        ignore_case = TRUE,
        max_dist = 1,
        distance_col = "distance"
      ) %>%
      tidyr::drop_na(.data$lon) %>%
      dplyr::filter(.data$country.x == .data$country.y) %>%
      dplyr::group_by(.data$id_rows) %>%
      dplyr::slice_max(.data$pop) %>% ## Get biggest city in case of multiple result
      dplyr::ungroup()

    ## Warn user of interpretation and store the interpretation
    if(nrow(geocoding_approx) != 0){
      geocoding_approx_recap <- geocoding_approx %>%
        dplyr::select(.data$id_rows, .data$city.y) %>%
        dplyr::left_join(foreign_cities, by = "id_rows") %>%
        dplyr::distinct(.data$city, .data$country, .data$city.y) %>%
        dplyr::arrange(.data$country, .data$city)

      message("Interpr\u00e9tation de noms de communes \u00e9trang\u00e8res:\n\t",
              paste0(geocoding_approx_recap$country, " : ", geocoding_approx_recap$city,
                     " -> ",geocoding_approx_recap$city.y,
                     collapse = "\n\t"))

      geocoding <- geocoding_approx %>%
        dplyr::transmute(
          .data$id_rows, .data$lat, .data$lon,
          proposition_city = .data$city.y
        ) %>%
        dplyr::bind_rows(geocoding)
    }
  }

  ## Search remaining cities on Open Street Map (Nomatim)-----------
  ## for unknown cities (mispelled or small)
  remaining_cities <- foreign_cities %>%
    dplyr::anti_join(geocoding, by = "id_rows")

  if(!is_empty_df(remaining_cities)){
    tryCatch(
      {
        nomatim_geocoding <- geocode_nomatim(remaining_cities)
      },
      error = function(e){
        nomatim_error <<- TRUE
      })

    ## Insist on Nomatim if it fails (to be tested)
    if(nomatim_error){
      tryCatch(
        {
          nomatim_error <- FALSE
          tidygeocoder::geo("", method = "osm") ## Test to avoid being blacklisted with 2 times the same request in a row
          nomatim_geocoding <- geocode_nomatim(remaining_cities)
        },
        error = function(e){
          nomatim_error <<- TRUE
        })
    }
  }else{
    nomatim_geocoding <- data.frame()
  }



  ## Warns on geocoding that failed -------------------
  if(nomatim_error){
    message("Impossible de g\u00e9ocoder certaines villes \u00e9trang\u00e8res. Erreur lors de la requ\u00eate vers Nomatim")
  } else {
    geocoding <- geocoding %>%
      dplyr::bind_rows(nomatim_geocoding)
  }

  wrong <- foreign_cities %>%
    dplyr::anti_join(geocoding, by = "id_rows")
  if(!is_empty_df(wrong))
    message("Villes inconnues:",
            paste0("\n\t", wrong$city, " (", wrong$country, ")"))


  ## Format data for output ----------------------------
  index <- input_data %>%
    dplyr::transmute(id_rows = dplyr::row_number())

  ## Compute vectors to add to the data.frame.
  col_to_add <- geocoding %>%
    dplyr::right_join(index, by = "id_rows") %>%
    dplyr::arrange(.data$id_rows) %>%
    dplyr::select(-.data$id_rows) %>%
    dplyr::mutate(city_cog = NA_character_) %>% ## For compatibility with other functions (French)
    dplyr::rename(
      city_lat = .data$lat,
      city_lon = .data$lon,
    ) %>%
    dplyr::rename_with(
      ~ stringr::str_replace(.x, "city", city_col_name)
    )
  ## "Coalesce" new values with potential existing values.
  col_to_add <- col_to_add %>%
    dplyr::coalesce(input_data %>%
                      dplyr::select(dplyr::any_of(names(col_to_add))))
  ## update input data
  input_data %>%
    dplyr::select(-dplyr::any_of(names(col_to_add))) %>%
    dplyr::bind_cols(col_to_add) %>%
    dplyr::relocate(
      dplyr::starts_with("proposition"),
      .after = dplyr::last_col()
    )
}

#' geocode df with postal code
#'
#' A variation of geocode_df_cities for data with postcode provided
#' Use of banR. Countries that are not "France" or NA will not be altered by this function
#'
#' Messages are not displayed when there is no error.
#' input.data is not altered even if contains special characters.
#' Column order is also preserved
#'
#' @param .data a data.frame
#' @param city_col name of the column that contains the city names
#' @param cp_col name of the column that contains postal code (french)
#' @param country_col name of the column that contains the country names
#'
#' @return .data with additional (or updated) columns named after city_col with the extension:
#' _lat _lon and _cog
#'
#' @keywords internal

geocode_df_cities_cp <- function(.data,
                                 city_col,
                                 cp_col,
                                 country_col){
  city_col <- rlang::enquo(city_col)
  city_col_name <- rlang::as_name(city_col)
  cp_col <- rlang::enquo(cp_col)
  cp_col_name <- rlang::as_name(cp_col)
  country_col <- rlang::enquo(country_col)
  input_data <- .data

  # Geocode -------------------------------------------------------------------------------------

  french_cities <- input_data %>%
    dplyr::mutate(id_rows = dplyr::row_number()) %>%
    dplyr::filter(!is.na(!!city_col)) %>%
    dplyr::filter(is.na(!!country_col) | !!country_col == "France") %>%
    dplyr::select(.data$id_rows,
                  !!city_col,
                  !!cp_col) %>%
    dplyr::mutate(
      !!cp_col_name := stringr::str_pad(!!cp_col,### converting postcode that have less than 5 digits.
                                        5L,
                                        side = "left",
                                        pad ="0")
    )

  if(nrow(french_cities) == 0) ## Nothing to geocode
    return(.data)

  ## First geocode using local database (exact match)
  local_result <- french_cities %>%
    dplyr::mutate(city = rename_french_cities(!!city_col),
                  cp = !!cp_col) %>%
    dplyr::left_join(
      france_cities,
      by = c("city", "cp")) %>%
    tidyr::drop_na() %>%
    dplyr::select(.data$id_rows,
                  longitude = .data$lon,
                  latitude = .data$lat,
                  .data$cog)

  remaining_geocode <- french_cities %>%
    dplyr::anti_join(local_result, by = "id_rows")

  if(nrow(remaining_geocode) == 0){
    result <- tibble::tibble(id_rows = integer(),
                             latitude = numeric(),
                             longitude = numeric(),
                             cog = character())
  } else {

    ## Geocode the remaining using banR
    result <- french_cities %>%
      dplyr::anti_join(local_result, by = "id_rows") %>%
      dplyr::mutate(
        city_renamed = stringi::stri_trans_general(!!city_col,
                                                   id = "Latin-ASCII") # to avoid strange result from geocode_tbl
      ) %>%
      banR::geocode_tbl(tbl = .,
                        adresse = city_renamed,
                        code_postal = !!cp_col) %>%
      suppressMessages() %>%
      dplyr::mutate(
        geocode_ok = (.data$result_type == "municipality" &
                        .data$result_score >= 0.8)
      ) %>%
      dplyr::select(-.data$city_renamed)

    ## Checking wrong results and propose an alternative in warnings-------------------
    anomaly_to_check <- result %>%
      dplyr::filter(is.na(.data$geocode_ok) | .data$geocode_ok == FALSE) %>%
      dplyr::select(
        .data$id_rows,
        city = !!city_col, postcode = !!cp_col,
        .data$result_type, .data$result_score,
        .data$result_label, .data$result_postcode
      )
    ## Store propositions
    proposition <- check_warn_cities_cp(anomaly_to_check) %>%
      dplyr::rename(
        !!city_col_name := .data$proposition_city,
        !!cp_col_name := .data$proposition_cp
      ) %>%
      dplyr::rename_with(
        .fn = ~ paste0("proposition_", .x),
        .cols = -.data$id_rows
      )

    result <- result %>%
      dplyr::filter(.data$geocode_ok) %>%
      dplyr::select(.data$id_rows,
                    .data$latitude,
                    .data$longitude,
                    cog = .data$result_citycode) %>%
      dplyr::bind_rows(proposition)
  }

  ## Format data for output ----------------------------
  index <- input_data %>%
    dplyr::transmute(id_rows = dplyr::row_number())

  col_to_add <- result %>%
    dplyr::bind_rows(local_result) %>%
    dplyr::right_join(index, by = "id_rows") %>%
    dplyr::arrange(.data$id_rows) %>%
    dplyr::select(-.data$id_rows) %>%
    dplyr::rename_with(
      ~ paste0(city_col_name, "_",
               stringr::str_sub(.x, 1, 3)),
      .cols = c(.data$latitude, .data$longitude, .data$cog)
    )

  ## "Coalesce" new values with potential existing values.
  col_to_add <- col_to_add %>%
    dplyr::coalesce(input_data %>%
                      dplyr::select(dplyr::any_of(names(col_to_add))))
  ## update input data
  input_data %>%
    dplyr::select(-dplyr::any_of(names(col_to_add))) %>%
    dplyr::bind_cols(col_to_add)

}

#' Check a list of cities and try to find replacement values
#'
#' Information is printed in warnings.
#' This function returns a data.frame containing correction proposition
#'
#' @param data a data.frame with the following columns:
#' id_rows, city (string), postcode(string), result_type, result_score, result_label, result_postcode.
#' This data.frame is produced using banR::geocode_tbl()
#'
#' @return invisible(0)
#' @keywords internal
check_warn_cities_cp <- function(data){
  ## check input------------------
  df_has_cols(
    data,
    c("id_rows", "city", "postcode", "result_type", "result_score", "result_label", "result_postcode")
  )
  if(is_empty_df(data))
    return(
      tibble::tibble(
        id_rows = integer(),
        proposition_city = character(),
        proposition_cp = character()
      )
    )
  # Checking for "name" mispelling (wrong name with correct postcode)
  ## . Mispelling wih correct postcode -------------------------
  mispelling <- data %>%
    dplyr::filter(.data$result_type == "municipality" & .data$result_score < 0.8) %>%
    dplyr::select(
      .data$city, .data$postcode,
      .data$result_label, .data$result_postcode, .data$result_score
    ) %>%
    dplyr::distinct() %>%
    ## Improve score to be consistent with score given without postcode (postcode is good)
    ## Score is now between .5 and 1
    dplyr::mutate(result_score = 1-((1-.data$result_score)/2))


  ## . Wrong post code --------------------
  bad_city_nocp <- data %>%
    dplyr::filter(is.na(.data$result_type) | .data$result_type != "municipality") %>%
    dplyr::distinct(.data$city, .data$postcode)

  if(nrow(bad_city_nocp)!=0){
    bad_city_nocp <- bad_city_nocp %>%
      dplyr::mutate(
        city_renamed = stringi::stri_trans_general(.data$city,id = "Latin-ASCII") # to avoid strange result from geocode_tbl
      ) %>%
      banR::geocode_tbl(tbl = .,
                        adresse = city_renamed) %>%
      suppressMessages() %>%
      dplyr::select(
        .data$city, .data$postcode,
        .data$result_label, .data$result_postcode,
        .data$result_score, .data$result_type)

    wrong_cp <- bad_city_nocp %>%
      dplyr::filter(.data$result_type == "municipality" & .data$result_score > .8) %>%
      dplyr::select(-dplyr::any_of(c("result_type")))

    ## . Remaining mismatches  ---------------
    last_search <- bad_city_nocp %>%
      dplyr::anti_join(wrong_cp, by = c("city", "postcode")) %>%
      dplyr::filter(!is.na(.data$city)) %>%
      dplyr::select(.data$city, .data$postcode) %>%
      geocode_row_by_row(city_col = .data$city) %>%
      dplyr::select(
        dplyr::any_of(c("city", "postcode", "result_label", "result_postcode", "result_score"))
        #.data$city, .data$postcode ,.data$result_label, .data$result_postcode, .data$result_score
      )
  } else{
    wrong_cp <- data.frame()
    last_search <- data.frame()
  }


  if(nrow(last_search) != 0){
    last_search_proposal <- last_search %>%
      dplyr::filter(!is.na(.data$result_label))
    wrong_no_proposal <- last_search %>%
      dplyr::filter(is.na(.data$result_label)) %>%
      dplyr::arrange(.data$city)
  } else {# empty data.frames to avoid error subsetting non-existing cols
    last_search_proposal <- data.frame()
    wrong_no_proposal <- data.frame()
  }

  ## . Send warning---------------------

  wrong_with_proposal <- dplyr::bind_rows(last_search_proposal,
                                          mispelling,
                                          wrong_cp) %>%
    dplyr::arrange(.data$city)

  if(nrow(wrong_no_proposal)!=0)
    message("Villes inconnues:",
            paste0("\n\t", wrong_no_proposal$city, "(", wrong_no_proposal$postcode, ")"))

  if(nrow(wrong_with_proposal) != 0){
    message("Les villes suivantes ont ete ignor\u00e9es. Propositions de corrections:",
            paste0("\n\t",wrong_with_proposal$city," (", wrong_with_proposal$postcode,") -> \t",
                   wrong_with_proposal$result_label, " (",wrong_with_proposal$result_postcode, ")"))
  }

  ## . Return propositions-----------------

  data %>%
    dplyr::select(.data$id_rows, .data$postcode, .data$city) %>%
    dplyr::left_join(
      wrong_with_proposal,
      by = c("postcode", "city")
    ) %>%
    dplyr::select(
      .data$id_rows,
      proposition_city = .data$result_label,
      proposition_cp = .data$result_postcode
    )

}
JMPivette/evavelo documentation built on April 8, 2023, 4:20 p.m.