# R/import_category_check.R

# Defines functions category_check test_category_check

# Documented in category_check

# If data has individual categories, look at list of all categories,
# and make sure none are being excluded because of casing/spelling issues

# The user tells us which column to check.
# The column is passed as a bare (unquoted) name and captured with enquo().

# Strategy:
# Group up the values in the column - that's our reference
# Group up the values after doing some type of standardization to the column

# Examples:
# 1. Removing spaces
# 1.1 Removing spaces from the end (trimming ws)
# 1.2 Removing spaces from the whole entry
# 1.3 Removing spaces from the beginning
# 2. Standardized capitalization - all lowercase

# Possible extensions:
# Fuzzy matching, checking for typos, handling punctuation of all types,
# Replace numbers with just ["#"] or some other alias

# To extend this function to other standardizations, modify the contents of the
# mutate statement
#' Category check
#'
#' This function works to catch any values in the column that should be combined
#' with other categories. It works by catching mismatched spaces, capitalization, or punctuation
#' Issues a warning listing the categories that the checker suspects might overlap.
#'
#' @param df An R data frame to check
#' @param col_name Bare (unquoted) name of the column to be checked for discrete categories
#'

#' @export

category_check <- function(df, col_name) {
  # Purpose: warn when distinct values of one column collapse into the same
  # value after a simple standardization (whitespace, punctuation, case).
  #
  # Arguments:
  #   df       - data frame (or tibble) holding the column to inspect.
  #   col_name - bare (unquoted) column name; captured with enquo().
  #
  # Side effect: issues a warning() naming every raw value that shares a
  # standardized form with at least one other raw value. Nothing is warned
  # when no overlaps are found.

  # Capture the column expression once and unquote it wherever it is needed.
  col <- enquo(col_name)

  suspect_categories <- df %>%
    # Work on distinct raw values: a standardized value that then appears
    # more than once means two different raw spellings collided.
    distinct(!!col) %>%
    mutate(
      # To extend this function, add new standardization steps here
      trimspaces = trimws(!!col),
      nopunctuation = gsub("[[:punct:]]", "", !!col),
      nospaces = gsub(" ", "", !!col),
      lowercase = tolower(!!col)
    ) %>%
    group_by(trimspaces) %>%
    mutate(trimspacecount = n()) %>%
    group_by(nopunctuation) %>%
    mutate(nopunctuationcount = n()) %>%
    group_by(nospaces) %>%
    mutate(nospacecount = n()) %>%
    group_by(lowercase) %>%
    mutate(lowercasecount = n()) %>%
    # Add a group by and counting step above for any new standardization,
    # and add its count column to the filter below.
    ungroup() %>%
    # Test only the count columns. Filtering on "every numeric column" would
    # also test the user's own column when it is numeric and flag any value
    # greater than 1 as an overlap.
    filter(
      trimspacecount > 1 |
        nopunctuationcount > 1 |
        nospacecount > 1 |
        lowercasecount > 1
    )

  if (nrow(suspect_categories) > 0) {
    # The first column is the raw column of interest (the result of distinct()).
    warning("category_check: After standardization, these categories overlap:\n",
            paste0("`", suspect_categories[[1]], "`", collapse = ', '))
  }
}

test_category_check <- function() {
  # Manual smoke test: load the whitespace fixture and run category_check()
  # on its `Category` column.
  fixture_path <- "testdata/category_check/spaces1.csv"

  # Read every column as character and keep leading/trailing whitespace,
  # so that whitespace differences between categories survive the import.
  all_character <- cols(.default = "c")
  fixture <- read_csv(fixture_path, col_types = all_character, trim_ws = FALSE)

  category_check(fixture, Category)
}
# associatedpress/apdatacheck documentation built on Feb. 17, 2020, 9:27 p.m.