R/peruse.R

Defines functions peruse profile profile.numeric profile.character profile.factor profile.logical

Documented in peruse profile profile.character profile.factor profile.logical profile.numeric

#' Peruse a Data Frame
#' 
#' \code{peruse} examines the variables in a data frame and returns basic 
#' summary info about the individual variables. Different values will be 
#' returned for different types of columns so the function returns a data frame 
#' with the underlying data nested.
#' @param data a data frame
#' @export
#' @examples 
#' cars_summary <- peruse(mtcars)
#' numeric_info <- cars_summary %>%
#'   dplyr::filter(Class == "Numeric") %>%
#'   tidyr::unnest(data)
#' 

peruse <- function(df){
  assertthat::assert_that(is.data.frame(df))
  assertthat::assert_that(nrow(df) > 0)
  perused <- df %>%
    purrr::map(profile) %>%
    purrr::map(tidyr::nest, -Class, -Type, -Num_Missing, -Num_Unique) %>%
    dplyr::bind_rows(.id = "Variable") %>%
    dplyr::select(Variable, Class, Type, dplyr::everything())
  return(perused)
}

#' Profile variables
#' 
#' \code{profile} takes in a vector and, depending on the class of the vector,
#' will return summary information on that vector returned as a data.frame. 
#' @param element a vector you wish to profile. 
#' @export
#' @examples 
#' set.seed(123456)
#' sample_variable <- sample(letters, size = 100, replace = T)
#' profile(sample_variable)
#' 

profile <- function(element) UseMethod("profile")

#' @describeIn profile profile numeric vectors
#' @export

profile.numeric <- function(element){
  num_missing <- sum(is.na(element))
  num_unique <- length(unique(element))
  if(min(element, na.rm = T) > 0 & max(element, na.rm = T) < 1) {
    type <- "Percentage"  
  } else if(num_unique - (num_missing > 0) == 2) {
    type <- "Binary"
  } else type <- "Numeric"
  
  summry <- summary(element)[1:6]
  std_dev <- sd(element, na.rm = T)
  terms = c("Class", "Type", "Mean", "Median", "Min", "Max", "First_Quartile", "Third_Quartile", "SD", "Num_Missing", "Num_Unique") 
  values = c("Numeric", type, unname(summry)[c(4, 3, 1, 6, 2, 5)], std_dev, num_missing, num_unique)
  var_info <- dplyr::data_frame(Term = terms, Value = values) %>%
    tidyr::spread(Term, Value)
  return(var_info)
}

#' @describeIn profile profile character vectors
#' @export

profile.character <- function(element) {
  num_missing <- sum(is.na(element))
  num_unique <- length(unique(element))
  
  if(num_unique - (num_missing > 0) == 2) {
    type <- "Binary"
  } else type <- "Character"
  
  top_5 <- head(sort(table(element), decreasing = T, na.last = NA), 5)
  terms <- c("Class", "Type", names(top_5), "Num_Unique", "Num_Missing")
  values <- c("Character", type, unname(top_5), num_unique, num_missing)
  var_info <- dplyr::data_frame(Term = terms, Value = values) %>%
    tidyr::spread(Term, Value)
  return(var_info)
}

#' @describeIn profile profile factor vectors
#' @export 

profile.factor <- function(variable) {
  factor_info <- profile.character(as.character(variable))
  factor_info$Value[factor_info$Term == "Class"] <- "Factor"
  return(factor_info)
}

#' @describeIn profile profile factor vectors
#' @export 

profile.logical <- function(variable) {
  log_info <- profile.character(as.numeric(variable))
  log_info$Value[log_info$Term == "Class"] <- "Logical"
  return(log_info)
}
mattmills49/modeler documentation built on May 21, 2019, 1:25 p.m.