#' Calculate multiple statistics for many variables and data subsets
#'
#' This function wraps use of data.table(), stat.desc() and helpful data
#' reshaping courtesy of reshape2.
#' @param data This is the data frame that will be operated on
#' @param vars Variables to summarize
#' @param slicevars Vector of variables which subset the data set
#' @param stats Statistics to calculate. The valid list of values comes from the stat.desc() function, including
#' @keywords data.table stat.desc pastecs reshape2
#' @export
#' @examples
#' SliceStats()
library("data.table")
library("pastecs")
library("magrittr")
SliceStats <- function(data, vars, slicevars, stats = c("mean", "SE.mean", "nbr.val"), suppress_n = NULL, id_delim = "__", label = FALSE){
options(warn = -1) # this is done to suppress warnings that can be generated by stat.desc when performing variance operations on slices with only 1 row
### Set up inputs to the calculations
statNames <- names(stat.desc(runif(2)))
vslicevars <- slicevars
cslicevars <- paste(slicevars, collapse = ",") # strsplit(slicevars, ",")[[1]]
nslicevars <- length(vslicevars)
fmslice.var <- paste(vslicevars, collapse = " + ")
vars <- unique(vars)
### Remove duplicate rows
dtData <- data.table(data)
bDup <- duplicated(dtData)
dtData <- dtData[!bDup,]
setkeyv(dtData, cols = vslicevars)
### Run calculations and select desired output
dtStats <- dtData[, lapply(.SD, stat.desc), by = cslicevars, .SDcols = vars]
dtStats$stat <- statNames
dtStats <- dtStats[stat %in% stats]
### Reshape data
dtStats_l <- melt(dtStats,
id.vars = c(vslicevars, "stat"),
variable.name = "x")
dtStats_w <- data.table::dcast(dtStats_l,
formula = as.formula(paste0(fmslice.var, " + x ~ stat")),
value.name = "value",
variable.name = "stat")
### Generate structured ID variable for each row
# Pattern is "slicevar1:sliceval1__slicevar2:sliceval2__" etc
valPairs <- cbind(sapply(vslicevars,
function(sv) paste0(sv, ":", dtStats_w[, get(sv)])),
paste0("x:", dtStats_w$x))
dtStats_w$id <- apply(valPairs, 1, paste, collapse = id_delim)
### Suppress values for anything except the number of valid values
if (is.numeric(suppress_n) & sum(dtStats_w$nbr.val < suppress_n) > 0 ){
set(x = dtStats_w,
i = which(dtStats_w$nbr.val < suppress_n),
j = which(colnames(dtStats_w) %in% c("mean", "SE.mean")),
value = NA)
}
options(warn = 0)
dtStats_w
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.