In scc-pi/addrsheff: Tools To Help With Address Data

library(testthat)

Add postcode variables

#' Add postcode variables e.g. Ward, to a data frame with postcodes  
#'
#' @param df data frame with a postcode variable
#' @param pcd_name name of the postcode variable in passed the data frame
#' @param .admin_district boolean indicating whether to add an admin_district variable
#' @param .lat_long boolean indicating whether to add a pair of latitude & longitude variables
#' @param other_vars character vector of other variables to add
#'
#' @return
#' data frame with variables added based on postcode
#' @export
#'
#' @examples
add_postcode_vars <- function(df, 
                              pcd_name = "postcode",
                              .admin_district = TRUE,
                              .lat_long = FALSE,
                              other_vars = character(0)){

  ## bulk_postcode_lookup() is currently not ignoring white space ----
  # - looks like it might be a postcodes.io issue rather than PostcodesioR

  # New variable with postcode minus white space
  df <- dplyr::mutate(
    df, 
    postcode_query = stringr::str_replace_all(postcode, " " , "")
  )

  ## Build a vector of variables of interest ----

  # Create an empty character vector  
  pcd_vars <- character(0)

  # From the arguments passed in the function call
  if(.admin_district) pcd_vars <- append(pcd_vars, "admin_district")   

  if(.lat_long) pcd_vars <- append(pcd_vars, c("longitude", "latitude"))

  if(length(other_vars) > 0) pcd_vars <- append(pcd_vars, other_vars) 

  # Remove any duplicates
  pcd_vars <- unique(pcd_vars)

  ## Build a postcode lookup template and a list of columns in the template ----

  # Empty postcode lookup data frame template
  pcd_template <- PostcodesioR::postcode_lookup("S1 2HH") %>%
    dplyr::filter(postcode == "ZZ ZZZ")

  # Data frame of postcode lookup column names, types & indices
  pcd_cols <- tidyr::tibble(
    name = colnames(pcd_template), 
    type = lapply(pcd_template, class)) %>% #TODO: lose or use & extract from list
    tibble::rowid_to_column(var = "rowid") %>% 
    dplyr::relocate(rowid, .after = last_col()) 

  # The position of "codes" helps with nested "*_code" values
  min_codes_var <- pcd_cols %>% 
    dplyr::filter(stringr::str_ends(name, "_code")) %>% 
    dplyr::filter(rowid == min(rowid))

  codes_position <- min_codes_var$rowid

  # Add index used to extract var from results
  pcd_cols <- dplyr::mutate(
    pcd_cols, index = purrr::map(
      rowid,
      ~ ifelse(.x < codes_position,
        list(c(2, .x)),
        list(c(
          2,
          codes_position,
          .x + 1 - codes_position
        ))
      )
    )) %>%
    dplyr::select(-rowid)

  # Filter list of postcode columns to vars of interest
  pcd_cols <- dplyr::filter(pcd_cols, name %in% pcd_vars)

  # Filter template columns to vars of interest
  pcd_details <- dplyr::select(pcd_template, dplyr::all_of(pcd_vars)) %>% 
    dplyr::mutate(postcode_query = as.character())


  # Need to distinguish between postcode passed & matched e.g. s1 2hh & S1 2HH
  pcd_match <- "postcode" %in% colnames(pcd_details) # flag is used more than once 

  if(pcd_match) pcd_details <- pcd_details %>% 
    dplyr::mutate(pcd_match = character(0)) 

  ## Do some checks -----

  # Check we have some postcode variables in the function arguments
  stopifnot(length(pcd_vars) > 0) 

  # Check we have some VALID postcode variables in the function arguments
  stopifnot(nrow(dplyr::filter(pcd_cols, name %in% pcd_vars)) > 0) 

  # Warn if there's an unused postcode variable from the function arguments
  if(length(pcd_vars)!=length(pcd_cols)) {
    warnings(paste0(
      "One or more of the variables requested is not available. ",
      "Check spelling and docs.ropensci.org/PostcodesioR/")
    )
  }

  ## Handle empty & duplicate postcodes -----

  # Snapshot to refer and join to later 
  df_orig <- df

  # PostcodesioR expects postcodes to be labelled "postcode" 
  df <- dplyr::rename(df, postcode = {{pcd_name}}) # we rename it back in the returned df

  # Don't process empty or duplicate postcodes  
  df <- df %>% 
    tidyr::drop_na(postcode) %>% 
    dplyr::distinct(postcode, .keep_all = TRUE)

  ## Batch-by-batch postcode lookup ----

  # PostcodesioR puts 100 row limit on bulk_postcode_lookup
  #  - so we're going to have to process the request in batches
  batch_size_max <- 100

  ### Batch loop prep ----

  # Numbers involved in breaking postcode lookup into batches
  total_rows <- nrow(df)

  whole_batches <- total_rows %/% batch_size_max

  remainder_batch_size <- total_rows %% batch_size_max

  n_batches <- ifelse(remainder_batch_size == 0, whole_batches, whole_batches + 1)

  ### Batch loop ----
  for (i in 1:n_batches) {

    #### Slice a batch ----

    # Determine the size of the batch
    batch_size <- ifelse(i != n_batches, batch_size_max, remainder_batch_size)

    # Small probability (1 in 100) but need to check ...
    batch_size <- ifelse(batch_size == 0, batch_size_max, batch_size)

    # Determine the start and end records of the batch
    batch_start <- ifelse(i == 1, 1, ((i-1) * batch_size_max) + 1)

    batch_end <- ifelse(
      i == n_batches,
      (ifelse(i == 1, i, (i - 1)) * batch_size_max) + remainder_batch_size,
      i * batch_size_max
    )

    # Get a batch of petition records
    batch <- dplyr::slice(df, batch_start:batch_end)

    #### Bulk lookup request to postcodes.io ----
    bulk_lookup_rslt <- PostcodesioR::bulk_postcode_lookup(
      list(postcodes = batch$postcode_query)
    )

    #### Extract the vars from the postcodes.io results in to a data frame ----
    df_batch_rslt <- purrr::map2_dfc(
      pcd_cols$name, 
      pcd_cols$index,
      ~ dplyr::tibble(!!.x := purrr::map(
          bulk_lookup_rslt,
          unlist(.y),
          .default = NA)
          )
      ) %>%
      tidyr::unnest(cols = dplyr::all_of(pcd_vars))

    # Need to distinguish between postcode passed & matched e.g. s1 2hh & S1 2HH
    if(pcd_match) df_batch_rslt <- df_batch_result %>% 
      dplyr::rename(pcd_match = postcode) 

    # Add postcode passed i.e. our primary key
    df_batch_rslt <- tibble::add_column(
      df_batch_rslt,  
      postcode_query = purrr::map_chr(bulk_lookup_rslt, c(1, 1), .default = ""),
      .before = 1
    )

    # Cumulative results from batches
    pcd_details <- dplyr::bind_rows(pcd_details, df_batch_rslt)
  }

  # Add postcode variables of interest to the original data
  df_new  <- df_orig %>%
    dplyr::left_join(pcd_details, by = c("postcode_query")) %>% 
    dplyr::select(-postcode_query)
}

# Create a data frame with some example records
df <- tibble::tribble(
  ~name,    ~postcode,
  "SCC",    "S1 2HH",
  "Blades", "S2 4SU",
  "Owls",   "S6 1SW"
)

add_postcode_vars(
  df,
  pcd_name = "postcode",
  .admin_district = FALSE,
  .lat_long = TRUE,
  other_vars = c("admin_ward", "msoa_code")
)

# test_that("add_postcode_vars works properly and show error if needed", {
#   expect_true(add_postcode_vars(1:12) == 6.5)
#   expect_error(add_postcode_vars("text"))
# })

# Keep eval=FALSE to avoid infinite loop in case you hit the knit button
# Execute in the console directly
fusen::inflate(flat_file = "dev/flat_full.Rmd", vignette_name = "Get started")