R/check.R

Defines functions check_names check_values closest_matches check_and_match

Documented in check_and_match check_names check_values closest_matches

#' Check a data frame against a white list for required names.
#'
#' Colorized output printed to the console with results
#' @md
#' @param data A data frame or list.
#' @param white_list A list, where the names are required columns
#' and the elements are permitted values for each column.
#' @return  The function invisibly returns a `boolean` value, `TRUE` if all
#' needed names are present and `FALSE` if any needed names are missing.
#' @examples
#' \dontrun{
#' hem_data %>% check_names(hem_wl)
#' }
#' @importFrom crayon bgRed bgGreen white
#' @importFrom glue glue
#' @export
check_names <- function(data, white_list) {
  needed_names <- names(white_list)

  if (length(needed_names[!needed_names %in% names(data)]) != 0) {
    message(crayon::bgRed$white(
      glue::glue("{needed_names[!needed_names %in% names(data)]} missing :(")
    ))
    invisible(FALSE)
  } else {
    message(crayon::bgGreen$white(glue::glue("names check out :)")))
    invisible(TRUE)
  }
}

#' Check a data frame against a white list for required names.
#'
#' Colorized output printed to the console with results
#' @md
#' @param data A dataframe or list.
#' @param white_list A list, where the names are required columns
#' and the elements are permitted values for each column.
#' @param return_bad Boolean, should non-permitted values be returned in a list?
#' @return  The function invisibly returns a `boolean` value, `TRUE` if all
#' needed names are present and `FALSE` if any needed names are missing.
#' Unless `return_bad=TRUE` then it returns a list instead.
#' @examples
#' \dontrun{
#' hem_data %>% check_values(hem_wl)
#' }
#' @export
check_values <- function(data, white_list, return_bad = FALSE) {
  bad_li <- list()

  for (n in names(white_list)) {
    ok_vals <- white_list[[n]]

    if (sum(!data[[n]] %in% ok_vals) != 0) {
      bad_vals <- unique(data[[n]][!data[[n]] %in% ok_vals])

      message(crayon::bgRed$white(
        n, " value(s):",
        paste(bad_vals, collapse = ", "),
        "not white-listed"
      ))
      bad_li[[n]] <- bad_vals
    }
  }
  if (length(bad_li) == 0) {
    message(crayon::bgGreen$white("all values check out :)"))
    invisible(TRUE)
  }
  else if (return_bad) {
    return(bad_li)
  }
  else {
    invisible(FALSE)
  }
}

#' Finds the value most similar to `value` in `values`.
#'
#' Uses [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance)
#' to compare similaririty between strings and will always
#' return a value, even if the similarity score is very low.
#' @md
#' @param value A character of length one. The target value used to find the most
#' similar strings in `values`.
#' @param values A character. The values to selected the most similar to `value`
#' from.
#' @param margin A numeric, the amount to expand the results by. The default, 0,
#' will return only the lowest similiarity score. Increasing `margin` will include
#' all values with a similarity score less than `similarity_score + margin`.
#' @param ... Arguments passed through to `adist()`.
#' @return A character vector with the subset of `values` equal to or less than
#' `similarity_score + margin`.
#' @examples
#' values <- c("dog", "cat", "bird")
#' closest_matches("cats", values)
#' @importFrom stats setNames
#' @importFrom utils adist
#' @export
closest_matches <- function(value, values, margin = 0, ...) {
  if (is.na(value)) {
    stop("NA value detected. String matching does not work with NAs. Replace missing values and retry.")
  }

  distances <- adist(value, values, ignore.case = TRUE, ...) %>%
    as.numeric() %>%
    setNames(values)

  # arrange matches so closest are first
  min_distance <- min(distances)
  guesses <- distances[distances %in% min_distance:(min_distance + margin)] %>%
    sort() %>%
    names()

  # handle targets like 'CXCL1/GROa' with grepl()
  # both 'CXCL1' and 'GROa' should return 'CXCL1/GROa' as the best guess
  values[agrepl(value, values, ignore.case = TRUE)] %>%
    c(., guesses) %>%
    unique()
}

#' A combination of `check_values()` and `closest_match()`.
#'
#' Starts interactive prompts to change not-permitted values. The white-list
#' can also be updated depending on the argument `append`.
#'
#' If the prompt response is:
#' 1. a number and the name of a printed potential match, then
#' that corresponding match is selected for substitution
#' 2. 'ok', then the value remains as is, no substituion occurs.
#' 3. Any other value, then the non-permitted value will be substituted for the
#' prompt value.
#'
#' Useful to run this without pipes if `insert_code = TRUE`,because of code-insertion, see `examples`
#'
#' @md
#' @param data A dataframe or list.
#' @param white_list A named list with permitted values for required fields.
#' Names are fields, contents are values.
#' @param append A boolean. Should the `white_list` be appended to? This modifies
#' the global variable that was passed in.
#' @param margin A numeric, passed onto to `closest_matches`. Expands potential
#' matches by increasing the distance cutoff used.
#' @param insert_code A boolean. Should code be inserted into the active script
#' with the record of `gsub()`s perfromed.
#' @examples
#' \dontrun{
#' hem_data <- check_and_match(hem_data, hem_wl)
#' }
#' @importFrom rstudioapi getSourceEditorContext
#' @importFrom glue glue
#' @export
check_and_match <- function(data, white_list, append = FALSE, margin = 1, insert_code = TRUE) {
  global_wl <- substitute(white_list)
  data_string <- deparse(substitute(data)) # used for code-building
  func_list <- list() # container for intermediate code chunks
  context <- rstudioapi::getSourceEditorContext() # document details for code-insertinon

  bad_li <- suppressMessages(check_values(data, white_list, return_bad = TRUE))

  if (length(bad_li) == 0) {
    message("Values check out.")
    return(NULL)
  }
  else {
    for (n in names(bad_li)) {
      bad_vals <- bad_li[[n]]
      good_vals <- white_list[[n]]

      for (v in bad_vals) {
        matches <- closest_matches(v, good_vals, margin = margin)
        names(matches) <- 1:length(matches)

        message(glue::glue("Closest values to {v} in `white_list${n}`:"))
        print(matches)

        choice <- readline(
          prompt = glue::glue("Replace {v} with: ")
        ) %>%
          trimws()

        if (tolower(choice) != "ok") {
          if (tolower(choice) %in% names(matches)) {
            choice <- matches[[choice]]
          }
          bad_vals <- gsub(v, choice, bad_vals)
          data[[n]] <- gsub(v, choice, data[[n]])

          func_list[[n]] <- c(func_list[[n]], glue::glue("{n} = gsub('{v}', '{choice}', {n}, fixed = TRUE)"))
        }
      }
      if (append) {
        vals_to_append <- bad_vals[!bad_vals %in% good_vals]
        white_list[[n]] <- c(white_list[[n]], vals_to_append)
      }
    }
    # overwrite the 'white_list' argument in global environment
    if (append) {
      assign(as.character(global_wl), white_list, envir = globalenv())
    }

    if (insert_code && length(func_list) > 0) {
      func_list <- unlist(func_list)

      code_string <- glue::glue("{data_string} <- dplyr::mutate({data_string}, {paste(func_list, collapse = ',\n')})\n",
        .trim = F
      ) %>%
        styler::style_text()

      range <- context$selection %>%
        unlist() %>%
        .[1:2] %>%
        as.numeric() %>%
        rstudioapi::as.document_position() %>%
        rstudioapi::document_range(start = ., end = .)

      rstudioapi::modifyRange(range,
        paste0(paste0(code_string, collapse = "\n"), "\n"), # extra new line to prevent code 'smushing'
        id = context$id
      )
    }
    data
  }
}
hemoshear/assayr2 documentation built on Nov. 8, 2019, 6:13 p.m.