Nothing
### summarize.R ---
##----------------------------------------------------------------------
## Author: Brice Ozenne
## Created: okt 7 2020 (11:12)
## Version:
## Last-Updated: nov 8 2023 (15:07)
## By: Brice Ozenne
## Update #: 304
##----------------------------------------------------------------------
##
### Commentary:
##
### Change Log:
##----------------------------------------------------------------------
##
### Code:
## * summarize (documentation)
##' @title Compute summary statistics
##' @description Compute summary statistics for multiple variables and/or multiple groups and save them in a data frame.
##'
##' @param formula [formula] on the left hand side the outcome(s) and on the right hand side the grouping variables.
##' E.g. Y1+Y2 ~ Gender + Gene will compute for each gender and gene the summary statistics for Y1 and for Y2.
##' Passed to the \code{stats::aggregate} function.
##' @param data [data.frame] dataset containing the observations.
##' @param na.action [function] a function which indicates what should happen when the data contain 'NA' values.
##' Passed to the \code{stats::aggregate} function.
##' @param na.rm [logical] Should the summary statistics be computed by omitting the missing values.
##' @param columns [character vector] name of the summary statistics to kept in the output.
##' Can be any of, or a combination of:\itemize{
##' \item \code{"observed"}: number of observations with a measurement.
##' \item \code{"missing"}: number of missing observations.
##' When specifying a grouping variable, it will also attempt to count missing rows in the dataset.
##' \item \code{"pc.missing"}: percentage missing observations.
##' \item \code{"mean"}, \code{"mean.lower"} \code{"mean.upper"}: mean with its confidence interval.
##' \item \code{"median"}, \code{"median.lower"} \code{"median.upper"}: median with its confidence interval.
##' \item \code{"sd"}: standard deviation.
##' \item \code{"q1"}, \code{"q3"}, \code{"IQR"}: 1st and 3rd quartile, interquartile range.
##' \item \code{"min"}, \code{"max"}: minimum and maximum observation.
##' \item \code{"predict.lower"}, \code{"predict.upper"}: prediction interval for normally distributed outcome.
##' \item \code{"correlation"}: correlation matrix between the outcomes (when feasible, see detail section).
##' }
##' @param FUN [function] user-defined function for computing summary statistics.
##' It should take a vector as an argument and output a named single value or a named vector.
##' @param which deprecated, use the argument columns instead.
##' @param level [numeric,0-1] the confidence level of the confidence intervals.
##' @param skip.reference [logical] should the summary statistics for the reference level of categorical variables be omitted?
##' @param digits [integer, >=0] the minimum number of significant digits to be used to display the results. Passed to \code{print.data.frame}
##' @param ... additional arguments passed to argument \code{FUN}.
##'
##' @details This function is essentially an interface to the \code{stats::aggregate} function.
##'
##' Confidence intervals (CI) and prediction intervals (PI) for the mean are computed via \code{stats::t.test}.
##' Confidence intervals (CI) for the median are computed via \code{asht::medianTest}.
##'
##' Correlation can be assessed when a grouping and ordering variable are given in the formula interface , e.g. Y ~ time|id.
##'
##' @return A data frame containing summary statistics (in columns) for each outcome and value of the grouping variables (rows). It has an attribute \code{"correlation"} when it was possible to compute the correlation matrix for each outcome with respect to the grouping variable.
##'
##' @keywords utilities
## * summarize (examples)
##' @examples
##' ## simulate data in the wide format
##' set.seed(10)
##' d <- sampleRem(1e2, n.times = 3)
##' d$treat <- sample(LETTERS[1:3], NROW(d), replace=TRUE, prob=c(0.3, 0.3, 0.4) )
##'
##' ## add a missing value
##' d2 <- d
##' d2[1,"Y2"] <- NA
##'
##' ## run summarize
##' summarize(Y1 ~ 1, data = d)
##' summarize(Y1 ~ 1, data = d, FUN = quantile, p = c(0.25,0.75))
##' summarize(Y1+Y2 ~ X1, data = d)
##' summarize(treat ~ 1, skip.reference = FALSE, data = d)
##'
##' summarize(Y1 ~ X1, data = d2)
##' summarize(Y1+Y2 ~ X1, data = d2, na.rm = TRUE)
##'
##' ## long format
##' dL <- reshape(d, idvar = "id", direction = "long",
##' v.names = "Y", varying = c("Y1","Y2","Y3"))
##' summarize(Y ~ time + X1, data = dL)
##'
##' ## compute correlations (single time variable)
##' e.S <- summarize(Y ~ time + X1 | id, data = dL, na.rm = TRUE)
##' e.S
##' attr(e.S, "correlation")
##'
##' ## compute correlations (composite time variable)
##' dL$time2 <- dL$time == 2
##' dL$time3 <- dL$time == 3
##' e.S <- summarize(Y ~ time2 + time3 + X1 | id, data = dL, na.rm = TRUE)
##' e.S
##' attr(e.S, "correlation")
## * summarize (code)
##' @export
summarize <- function(formula, data, na.action = stats::na.pass, na.rm = FALSE, level = 0.95,
columns = c("observed","missing","pc.missing","mean","sd","min","q1","median","q3","max","correlation"),
FUN = NULL,
which = NULL,
skip.reference = TRUE,
digits = NULL,
...){
data <- as.data.frame(data)
mycall <- match.call()
## ** check and normalize user imput
name.all <- all.vars(formula)
if(any(name.all %in% c("XXindexXX"))){
invalid <- name.all[name.all %in% c("XXindexXX")]
stop("Name(s) \"",paste(invalid, collapse = "\" \""),"\" are used internally. \n",
"Consider renaming the variables in the dataset and updating the formula. \n",
sep = "")
}
if(any(name.all %in% names(data) == FALSE)){
invalid <- name.all[name.all %in% names(data) == FALSE]
stop("Argument \'formula\' is inconsistent with argument \'data\'. \n",
"Variable(s) \"",paste(invalid, collapse = "\" \""),"\" could not be found in the dataset. \n",
sep = "")
}
detail.formula <- formula2var(formula)
name.Y <- detail.formula$var$response
n.Y <- length(name.Y)
if(n.Y==0){
stop("Wrong specification of argument \'formula\'. \n",
"There need to be at least one variable in the left hand side of the formula. \n")
}
name.id <- detail.formula$var$cluster
if(length(name.id)==0){
name.X <- detail.formula$var$regressor
formula <- detail.formula$formula$all
}else if("ranef" %in% names(detail.formula$vars)){
stop("Wrong specification of argument \'formula\'. \n",
"Should be something like Y ~ time or Y ~ time + G | cluster. \n")
}else if(length(name.id)==1){
name.X <- detail.formula$var$time
if(is.null(name.X)){
formula <- stats::as.formula(paste(paste(name.Y, collapse = "+"), "~1"))
}else{
formula <- stats::as.formula(paste(paste(name.Y, collapse = "+"), "~", paste(name.X, collapse = "+")))
}
test.between <- stats::setNames(sapply(name.X, function(iXvar){
max(tapply(data[[iXvar]],data[[name.id]], FUN = function(x){sum(!duplicated(x))}), na.rm = TRUE)
})==1,name.X)
if(any(test.between)){
vec.split <- nlme::collapse(data[names(which(test.between))], as.factor = TRUE)
ls.id <- tapply(as.character(data[[name.id]]), vec.split, unique)
}else{
ls.id <- list(unique(as.character(data[[name.id]])))
}
}else{
stop("Wrong specification of argument \'formula\'. \n",
"There should be exactly one variable on the right hand side of the formula after the symbol |. \n",
"Something like Y ~ time + G | cluster. \n")
}
n.X <- length(name.Y)
if("which" %in% names(match.call())){
warning("Argument \'which\' is deprecated. Consider using argument \'columns\' instead. \n")
columns <- which
}
valid.columns <- c("observed","missing","pc.missing",
"mean","mean.lower","mean.upper","predict.lower","predict.upper",
"sd","min","q1","median","q3","median.upper","median.lower","IQR","max","correlation")
columns <- match.arg(columns, choices = valid.columns, several.ok = TRUE)
dots <- list(...)
if(length(dots)>0 && is.null(FUN)){
stop("Unknown arguments \'",paste(names(dots), collapse = "\' \'"),"\'. \n")
}
if(!is.null(FUN)){
try.FUN <- try(FUN(data[[name.Y[1]]]), silent = TRUE)
if(!inherits(try.FUN,"try-error")){
if(is.null(names(try.FUN))){
stop("Argument \'FUN\' should return values with names. \n")
}else if(any(names(try.FUN) %in% valid.columns)){
stop("Argument \'FUN\' should not return values named \"",paste(names(try.FUN)[names(try.FUN) %in% valid.columns], collapse = "\" \""),"\" are used internally. \n")
}
}
}
## ** handle categorical variables
to.rm <- NULL
to.add <- NULL
for(iY in 1:n.Y){ ## iY <- 3
if(is.character(data[[name.Y[iY]]])||is.factor(data[[name.Y[iY]]])){
data[[name.Y[iY]]] <- as.factor(data[[name.Y[iY]]])
iLevel <- levels(data[[name.Y[iY]]])
if(any(paste(name.Y[iY],iLevel,sep=":") %in% names(data))){
stop("Name(s) \"",paste(paste(name.Y[iY],iLevel,sep=":")[paste(name.Y[iY],iLevel,sep=":") %in% names(data)], collapse ="\" \""),"\" are being used internally. \n",
"Consider rename or dropping columns from argument \'data\'.\n")
}
for(iL in iLevel){
if(!skip.reference || iL != iLevel[1]){
data[[paste(name.Y[iY],iL,sep=":")]] <- as.numeric(data[[name.Y[iY]]]==iL)
to.add <- c(to.add, paste(name.Y[iY],iL,sep=":"))
}
}
to.rm <- c(to.rm, name.Y[iY])
}
}
name.Y <- unique(c(setdiff(name.Y,to.rm),to.add))
n.Y <- length(name.Y)
## ** between and within variables
if(!is.null(name.id)){
time <- names(which(!test.between)) ## can be several variables
if(length(time)==0){
table.id.time <- do.call(table,stats::setNames(list(data[[name.id]],rep(1,NROW(data))),c(name.id,"XXXXXX")))
}else{
table.id.time <- do.call(table,stats::setNames(c(list(data[[name.id]]),data[,name.X,drop=FALSE]),
c(name.id,name.X)))
}
}else{
time <- NULL
table.id.time <- NULL
}
## ** compute summary statistics
out <- NULL
iFormula <- stats::update(formula, paste0("XXindexXX~."))
iData <- cbind(XXindexXX = 1:NROW(data), data)
for(iY in 1:n.Y){ ## iY <- 1
iAggregate <- stats::aggregate(iFormula, data=iData, function(x){
y <- data[x,name.Y[iY]]
## *** missing data
## as NA
n.obs <- sum(!is.na(y))
n.missing <- sum(is.na(y))
## as missing rows in the dataset
if(!is.null(table.id.time) && all(table.id.time %in% 0:1) && ("missing" %in% columns || "pc.missing" %in% columns)){
if(any(test.between)){
iIndex <- which(names(ls.id)==unique(nlme::collapse(data[x,names(which(test.between)),drop=FALSE], as.factor = FALSE)))
n.missing <- n.missing + sum(ls.id[[iIndex]] %in% unique(as.character(data[x,name.id])) == FALSE)
}else{
n.missing <- n.missing + sum(ls.id[[1]] %in% unique(as.character(data[x,name.id])) == FALSE)
}
}
## *** gather
if("mean.lower" %in% columns || "mean.upper" %in% columns || "predict.lower" %in% columns || "predict.upper" %in% columns){
if(all(y %in% 0:1)){
tty <- stats::binom.test(x = sum(y==1, na.rm = TRUE), n = sum(!is.na(y)), conf.level = level, alternative = "two.sided")
}else{
tty <- stats::t.test(y, na.rm = na.rm, conf.level = level, alternative = "two.sided")
}
}else{
tty <- list(estimate = NA, parameter = NA, stderr = NA, conf.int = c(NA, NA))
}
if(("median.lower" %in% columns || "median.upper" %in% columns) && requireNamespace("asht") && !all(y %in% 0:1)){
if(na.rm){
wty <- asht::medianTest(stats::na.omit(y), conf.level = level, alternative = "two.sided")
}else{
wty <- asht::medianTest(y, conf.level = level, alternative = "two.sided")
}
}else{
wty <- list(estimate = NA, parameter = NA, stderr = NA, conf.int = c(NA, NA))
}
if(all(is.na(y))){ ## avoid warning when taking min(), e.g. min(NA, na.rm = TRUE)
iVec <- c("observed" = n.obs,
"missing" = n.missing,
"pc.missing" = n.missing/(n.obs+n.missing),
"mean" = NA,
"mean.lower" = NA,
"mean.upper" = NA,
"predict.lower" = NA,
"predict.upper" = NA,
"sd" = NA,
"min" = NA,
"q1" = NA,
"median" = NA,
"median.lower" = NA,
"median.upper" = NA,
"q3" = NA,
"IQR" = NA,
"max" = NA)
}else{
iVec <- c("observed" = sum(!is.na(y)),
"missing" = n.missing,
"pc.missing" = n.missing/length(y),
"mean" = mean(y, na.rm = na.rm),
"mean.lower" = tty$conf.int[1], ## as.double(tty$estimate + stats::qt((1-level)/2,tty$parameter) * tty$stderr[1]) - tty$conf.int[1]
"mean.upper" = tty$conf.int[2], ## as.double(tty$estimate + stats::qt(1-(1-level)/2,tty$parameter) * tty$stderr[1]) - tty$conf.int[2]
"predict.lower" = as.double(tty$estimate + sqrt(n.obs+1) * stats::qt((1-level)/2,tty$parameter) * tty$stderr[1]),
"predict.upper" = as.double(tty$estimate + sqrt(n.obs+1) * stats::qt(1-(1-level)/2,tty$parameter) * tty$stderr[1]),
"sd" = stats::sd(y, na.rm = na.rm),
"min" = min(y, na.rm = na.rm),
"q1" = unname(stats::quantile(y, prob = 0.25, na.rm = na.rm)),
"median" = stats::median(y, na.rm = na.rm),
"median.lower" = wty$conf.int[1],
"median.upper" = wty$conf.int[2],
"q3" = unname(stats::quantile(y, prob = 0.75, na.rm = na.rm)),
"IQR" = stats::IQR(y, na.rm = na.rm),
"max" = max(y, na.rm = na.rm))
}
if(!is.null(FUN)){
iVec <- c(iVec,do.call(FUN, c(list(y), dots)))
if(all(y %in% 0:1)){
iVec[c("sd","predict.lower","predict.upper","q1","median","median.lower","median.upper","q3","IQR")] <- NA
}
}
return(iVec)
},
na.action=na.action)
iDF <- cbind(outcome = name.Y[iY],
iAggregate[name.X],
iAggregate[["XXindexXX"]][,union(setdiff(columns,"correlation"),setdiff(colnames(iAggregate[["XXindexXX"]]),valid.columns)),drop=FALSE])
out <- rbind(out,iDF)
}
## ** correlation
if(!is.null(name.id) && any(!test.between) && "correlation" %in% columns){ ## id and time variables
if(length(time)>1 && paste(time, collapse = "_X_XX_X_") %in% names(data)){
stop("Argument \'data\' should not contain a column named \"",paste(time, collapse = "_X_XX_X_"),"\" as this name is used internally. \n")
}
if(all(table.id.time %in% 0:1)){
attr(out,"correlation") <- stats::setNames(vector(mode = "list", length = length(name.Y)),
name.Y)
for(iY in 1:n.Y){ ## iY <- 1
attr(out,"correlation")[[iY]] <- stats::setNames(lapply(ls.id, function(iId){ ## iId <- ls.id[[1]]
iDataL <- data[data[[name.id]] %in% iId,,drop = FALSE]
if(length(time)>1){
iDataL[[paste(time, collapse = "_X_XX_X_")]] <- nlme::collapse(iDataL[,time, drop=FALSE], as.factor = TRUE)
Utime <- paste(time, collapse = "_X_XX_X_")
}else{
Utime <- time
}
iDataW <- stats::reshape(data = iDataL[,c(name.id, Utime, name.Y[iY])],
direction = "wide", timevar = Utime, idvar = name.id, v.names = name.Y[iY])
if(na.rm){
iCor <- stats::cor(iDataW[,-1,drop=FALSE], use = "pairwise")
}else{
iCor <- stats::cor(iDataW[,-1,drop=FALSE])
}
iLevels <- levels(as.factor(iDataL[[Utime]]))
if(NROW(iCor)==length(iLevels)){
dimnames(iCor) <- list(iLevels,iLevels)
}
return(iCor)
}), names(ls.id))
}
}
}
## ** export
if(!is.null(digits)){
attr(out,"digits") <- digits
}
attr(out,"call") <- mycall
attr(out,"name.Y") <- name.Y
attr(out,"name.X") <- name.X
attr(out,"name.time") <- time
attr(out,"name.id") <- name.id
class(out) <- append("summarize",class(out))
return(out)
}
######################################################################
### summarize.R ends here
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.