R/miscoding_explorative.R

Defines functions compare_frame_classes compare_classes compare_frame_frequencies compare_frequencies sqrt_comparison

Documented in compare_classes compare_frame_classes compare_frame_frequencies compare_frequencies sqrt_comparison

sqrt_comparison <- function(l_unique, l_total){
  #' Function for comparing two values.  
  #' 
  #' @param l_unique number of unique values
  #' @param l_total number of observations
  #' 
  #' @return Boolean - FALSE: more unique values than the square root of the number of observations.
  l_unique < sqrt(l_total)
}

compare_frequencies <- function(values, limit=0.8){
  #' Vector function - check that no single value of a 
  #' variable is used more than a set limit.
  #' 
  #'  @param values Vector of values to check
  #'  @param limit The upper limit on how often one value should occur.  Real value [0, 1]. 
  #'  
  #'  @return Boolean 
  l <- length(values)
  freq <- table(values)
  all(freq < limit*l)
}

compare_frame_frequencies <- function(data, limit=0.8){
  #' Tests all variables in a data frame to check values aren't used too often.
  #'
  #' @param data Data frame of variables to check each column of
  #' @param limit Max percentage of a variable that can be the same value
  #' 
  #' @return Boolean-vector
  results <- blank_result(data)
  for (name in names(data)){
    values <- data[,name]
    results[name] <- compare_frequencies(values, limit)
  }
  results
}

compare_classes <- function(values, comparison=sqrt_comparison){
  #' Checks if there are enough examples for the number of classes in a variable.
  #' 
  #' @param values Vector of observations
  #' @param comparison  [optional] A function to compare the number of unique classes with the number of observations.
  #' 
  #' @return Boolean - FALSE: not enough examples present
  l_total <- length(values)
  l_unique <- length(unique(values))
  comparison(l_unique, l_total)
}

compare_frame_classes <- function(data, comparison=sqrt_comparison){
  #' Check each variable of a data frame to determine if there are enough examples for the number of distinct values.  
  #' NB: will return a spurious result for continous valued variables.
  #' 
  #' @param data Dataframe of variables for checking
  #' @param comparison [optional] A function to compare the number of unique classes with the number of observations.
  #' 
  #' @return Boolean-vector -  FALSE: not enough examples present
  #' 
  results <- blank_result(data)
  for (name in names(data)){
    values <- data[,name]
    results[name] <- compare_classes(values, comparison)
  }
  results
}
Stat-Cook/NURS.Data.Quality documentation built on Dec. 18, 2021, 2:09 p.m.