R/SliceStats.R

#' Calculate multiple statistics for many variables and data subsets
#'
#' This function wraps use of data.table(), stat.desc() and helpful data
#' reshaping courtesy of reshape2.
#' @param data This is the data frame that will be operated on
#' @param vars Variables to summarize
#' @param slicevars Vector of variables which subset the data set
#' @param stats Statistics to calculate. The valid list of values comes from the stat.desc() function, including
#' @keywords data.table stat.desc pastecs reshape2
#' @export
#' @examples
#' SliceStats()

library("data.table")
library("pastecs")
library("magrittr")

SliceStats <- function(data, vars, slicevars, stats = c("mean", "SE.mean", "nbr.val"), suppress_n = NULL, id_delim = "__", label = FALSE){
  options(warn = -1) # this is done to suppress warnings that can be generated by stat.desc when performing variance operations on slices with only 1 row
  
  ### Set up inputs to the calculations
  statNames <- names(stat.desc(runif(2))) 
  vslicevars <- slicevars
  cslicevars <- paste(slicevars, collapse = ",") # strsplit(slicevars, ",")[[1]]
  nslicevars <- length(vslicevars)
  fmslice.var <- paste(vslicevars, collapse = " + ")
  vars <- unique(vars)
  
  ### Remove duplicate rows
  dtData <- data.table(data)
  bDup <- duplicated(dtData)
  dtData <- dtData[!bDup,]
  setkeyv(dtData, cols = vslicevars)
  
  ### Run calculations and select desired output
  dtStats <- dtData[, lapply(.SD, stat.desc), by = cslicevars, .SDcols = vars]
  dtStats$stat <- statNames
  dtStats <- dtStats[stat %in% stats]
  
  ### Reshape data
  dtStats_l <- melt(dtStats,
                    id.vars = c(vslicevars, "stat"),
                    variable.name = "x")
  dtStats_w <- data.table::dcast(dtStats_l,
                                 formula = as.formula(paste0(fmslice.var, " + x ~ stat")),
                                 value.name = "value",
                                 variable.name = "stat")
  
  ### Generate structured ID variable for each row
  # Pattern is "slicevar1:sliceval1__slicevar2:sliceval2__" etc
  valPairs <- cbind(sapply(vslicevars,
                           function(sv) paste0(sv, ":", dtStats_w[, get(sv)])),
                    paste0("x:", dtStats_w$x))
  dtStats_w$id <- apply(valPairs, 1, paste, collapse = id_delim)
  
  ### Suppress values for anything except the number of valid values
  if (is.numeric(suppress_n) & sum(dtStats_w$nbr.val < suppress_n) > 0 ){
    set(x = dtStats_w,
        i = which(dtStats_w$nbr.val < suppress_n),
        j = which(colnames(dtStats_w) %in% c("mean", "SE.mean")),
        value = NA)  
  }
  options(warn = 0)
  dtStats_w
  
}
nsmader/chstatsum documentation built on May 24, 2019, 7:50 a.m.