R/typing_functions.R

Defines functions get_feature_types decorate_dictionary get_stats compile_dictionary enforce_dictionary

Documented in compile_dictionary enforce_dictionary

# Extract classes from all columns of a data frame
#
# This function will extract the column classes from a data frame, returning a new data frame.
#
# @param dataset A data frame of the data for which we are building a dictionary.
# @return types A dataframe listing column names and their respective classes.
get_feature_types <- function(dataset) {
  types <- data.frame(feature = colnames(dataset), type = sapply(dataset, class))
  row.names(types) <- NULL
  return(types)
}



# Decorate the bare dicionary
#
# Adds decorators to the barebones dictionary. Adds Keep, Description, Notes columns
#
# @param dictionary A bare dictionary, in dataframe format, including column names and classes
#
# @return dictionary Decorated dictionary
decorate_dictionary <- function(dictionary) {
  dictionary$keep <- TRUE
  dictionary$description <- ""
  dictionary$notes <- ""

  return(dictionary)
}




# Add summary stats
#
# Adds summary stats to the dictionary
#   Unique levels   X
#   NA count        X
#   Mean            X
#   SD              X
#   Min/Max
#   Head            X
#
# @param dataset The dataset for summary stats
#
# @return dataset.stats Data frame of summary statistics
get_stats <- function(dataset) {
  # Unique levels
  unique.level.count <- sapply(dataset, function(x) length(unique(x)))
  unique.level.pct <- round(unique.level.count / nrow(dataset), 3)

  # Number of NAs
  na.count <- sapply(dataset, function(x) sum(is.na(x)))
  na.pct <- round(na.count / nrow(dataset), 3)

  # Heads
  feature.heads <- sapply(dataset, function(x) paste(utils::head(x), collapse = ", "))

  # Feature stats
  feature.means <- suppressWarnings(sapply(dataset, mean))
  feature.sd <- suppressWarnings(sapply(dataset, stats::sd))

  dataset.stats <- data.frame(unique.level.count, unique.level.pct,
                              na.count, na.pct,
                              feature.heads, feature.means, feature.sd)

  return(dataset.stats)

}



#' Compile a data dictionary for a data frame
#'
#' Compiles a data dictionary for a data frame input
#'
#' @param dataset The dataframe to generate the dictionary
#' @param name The filename for the dictionary
#' @param path The filepath to locate the dictionary
#'
#' @return dict.temp The dictionary dataframe (Note: Not returned, the function will save the csv dictionary to the desired path)
#'
#' @examples
#' my.dictionary <- websteR::compile_dictionary(iris)
#'
#' @export
compile_dictionary <- function(dataset, name = "Data_Dictionary.csv", path = "./") {
  dict.temp <- get_feature_types(dataset)
  dict.temp <- decorate_dictionary(dict.temp)
  dict.temp <- cbind(dict.temp, get_stats(dataset))

  dict.temp$feature <- as.character(dict.temp$feature)
  dict.temp$type <- as.character(dict.temp$type)

  utils::write.csv(dict.temp, file = paste(path, name, sep = "/"))

  return(dict.temp)
}




#' Apply data dictionary to dataset
#'
#' Applies typing and filtering rules from a data dictionary to a dataset
#'
#' @param data Dataset to be processed
#' @param dictionary.path Filepath to the data dictionary to be enforced
#' @param enforce.keep When enforce.keep = TRUE (Default), filters out columns with a "keep" value of FALSE in the dictionary
#'
#' @return data Data frame with dictionary rules applied
#'
#' @export
enforce_dictionary <- function(data, dictionary.path, enforce.keep = TRUE) {
  # Import dictionary
  dictionary <- utils::read.csv(dictionary.path, stringsAsFactors = FALSE)

  # Filter out unwanted columns
  if(enforce.keep == TRUE) {
    # Drop columns not explicitly labeled as keep = TRUE
    filter.list <- dictionary$feature[which(dictionary$keep == TRUE)]
    data <- data[, filter.list]

    # Drop the rows from the dictionary in this session
    dictionary <- dictionary[which(dictionary$keep == TRUE), ]
  }


  # If any columns in data are factors, convert to character before applying any operations
  data.factor.features <- get_feature_types(data)
  data.factor.features <- as.character(data.factor.features$feature[which(data.factor.features$type == "factor")])

  data[, data.factor.features] <- sapply(data[, data.factor.features], as.character)


  # Apply typing to columns
  ## Factors
  factor.features <- dictionary$feature[which(dictionary$type == "factor")]
  data[, factor.features] <- lapply(data[factor.features], as.factor)

  ## Numerics
  numeric.features <- dictionary$feature[which(dictionary$type == "numeric")]
  data[, numeric.features] <- lapply(data[numeric.features], as.numeric)

  ## Characters
  character.features <- dictionary$feature[which(dictionary$type == "character")]
  data[, character.features] <- lapply(data[character.features], as.character)

  ## Integers
  integer.features <- dictionary$feature[which(dictionary$type == "integer")]
  data[, integer.features] <- lapply(data[integer.features], as.integer)

  return(data)

}
DannyGsGit/websteR documentation built on May 6, 2019, 1:36 p.m.