Nothing
#' @title Groupwise summary statistics
#'
#' @description Computes summary statistics by groups, similar to the `summary` procedure in SAS.
#' A more flexible alternative to base R's \code{\link[stats]{aggregate}}.
#'
#' @name by-summary
#' @param formula A formula specifying response and grouping variables.
#' @param data A data frame.
#' @param FUN A function or list of functions to apply to the response variables.
#' @param id A formula indicating variables to retain (not grouped by).
#' @param keep.names Logical; keep original variable names if only one function is applied.
#' @param p2d Replace parentheses in output names with dots?
#' @param order Logical; should result be ordered by grouping variables?
#' @param full.dimension Logical; if TRUE, repeat rows so output matches input size.
#' @param var.names Optional custom names for response variables.
#' @param fun.names Optional custom names for functions applied.
#' @param ... Additional arguments passed to functions in `FUN`.
#'
#' @return A data frame of grouped summary statistics.
#'
#' @details
#' Extra arguments in `...` are passed to all functions in `FUN`. If needed, wrap functions to handle these consistently (e.g., for `na.rm = TRUE`).
#'
#' @author Søren Højsgaard, \email{sorenh@@math.aau.dk}
#' @seealso \code{\link[stats]{aggregate}}, \code{\link{orderBy}}, \code{\link{transformBy}}, \code{\link{splitBy}}
#' @keywords summary grouping
#'
#' @examples
#' data(CO2)
#'
#' # Simple groupwise mean
#' summaryBy(uptake ~ Type + Treatment, data = CO2, FUN = mean)
#' summaryBy(cbind(uptake, conc) ~ Type + Treatment, data = CO2, FUN = mean)
#'
#' # Compare with
#' aggregate(cbind(uptake, conc) ~ Type + Treatment, data = CO2, FUN = mean)
#'
#' ## Using '.' on the right hand side of a formula means to stratify by
#' ## all variables not used elsewhere:
#' summaryBy(uptake ~ ., data = CO2, FUN = mean)
#'
#' # Multiple functions using a custom summary function
#' myfun <- function(x, ...)
#' c(m = mean(x, na.rm = TRUE), v = var(x, na.rm = TRUE), n = length(x))
#' summaryBy(uptake ~ Type + Treatment, data = CO2, FUN = myfun)
#'
#' # Summary on transformed variables
#' # works:
#' summaryBy(cbind(lu=log(uptake), conc) ~ Type, data = CO2, FUN = mean)
#' # fails:
#' #summaryBy(cbind(log(uptake), conc) ~ Type, data = CO2, FUN = mean)
#' @export
#' @rdname by-summary
summary_by <- function(data, formula, id=NULL, FUN=mean,
keep.names=FALSE,
p2d=FALSE, order=TRUE, full.dimension=FALSE,
var.names=NULL, fun.names=NULL,
...){
cl <- match.call(expand.dots = TRUE)
cl[[2]] <- formula
cl[[3]] <- data
names(cl)[2:3] <- c("formula", "data")
cl[[1]] <- as.name("summaryBy")
eval(cl)
}
#' @export
#' @rdname by-summary
summaryBy <- function (formula, data=parent.frame(), id=NULL, FUN=mean,
keep.names=FALSE,
p2d=FALSE, order=TRUE, full.dimension=FALSE,
var.names=NULL, fun.names=NULL,
...){
if (!inherits(data, "tbl_df")) is.tib = FALSE
else {is.tib = TRUE; data = as.data.frame(data)}
debug.info <- 0
zzz <- .get_variables(formula, data, id, debug.info) ## ; str(zzz)
lhs.num <- zzz$lhs.num
rhs.grp <- zzz$rhs.grp
ids.var <- zzz$form.ids.var
## str(list(lhs.num=lhs.num))
rh.trivial <- length( rhs.grp ) == 0 #; cat(sprintf("rh.trivial=%d\n", rh.trivial))
rh.string <- .get_rhs_string( data, rhs.grp )
rh.unique <- unique(rh.string)
rh.idx <- match(rh.unique, rh.string)
rh.string.factor <- factor(rh.string, levels=rh.unique) ## This is important
### Get data for id.vars; use ids.var, data, rh.idx
if (length(ids.var)>0){
id.data <- data[ rh.idx, ids.var, drop=FALSE ] ##; print(id.data)
}
### Get lhs data; use lhs.num, data
## Below, we need lhs in the form cbind(y1, y2, y3) for further computations.
##
## It is OK to have computations in this form a la cbind(y1, y2,
## y3, y4=y1+y2, y5=log(y2)).
##
## If lhs is y1 this is translated into cbind(y1).
##
## If lhs is ., this is translated into c(y1, y2, y3).
##
## It is also so (alas) that one may write y1 + y2 + y3,
## which is also translated into c(y1, y2, y3)
if (length(lhs.num) > 1) {
## If lhs.num is vector of length > 1, wrap with cbind:
lhs.num <- paste0("cbind( ", toString(lhs.num), " )")
} else {
## strip cbind( ... ) if it is there; then put cbind around
## (if lhs is simply y1, then there is no cbind around).
ff2 <- gsub("^cbind\\((.*)\\)$", "\\1", lhs.num)
lhs.num <- paste0("cbind( ", ff2, " )")
}
## aa <- lapply(paste(lhs.num), function(x)eval(parse(text=x), data))
## lh.data <- do.call(cbind, aa)
## replace two lines above with
lh.data <- eval(parse(text=lhs.num), data)
## Hack: redefine lhs.num : name of variables in data frame
lhs.num <- colnames(lh.data)
##colnames(lh.data) <- lhs.num
### Function names; use FUN
funNames <- .get_fun_names( FUN )
##print(rh.string.factor)
### Calculate groupwise statistics
if (!is.list(FUN))
FUN <- list(FUN)
out <- NULL
for (ff in 1:length(FUN)) { ## loop over functions
##currFUN <- FUN[[ff]]
currFUN <- match.fun( FUN[[ff]] )
for (vv in 1:length(lhs.num)) { ## loop over variables
currVAR <- lh.data[,lhs.num[vv]]
zzz <- tapply(currVAR, rh.string.factor,
function(x){ currFUN(x, ...) }, simplify=FALSE)
zzz <- do.call(rbind, zzz)
out <- cbind(out, zzz)
}
}
if (!is.null(var.names) && length(var.names)==length(lhs.num))
lhs.names <- var.names
else
lhs.names <- lhs.num
### Set names for columns
##print(funNames)
if (!is.null(fun.names) ) ##&& length(fun.names)==length(funNames))
funNames <- fun.names
newnames <- .get_col_names(ncol(out), colnames(out), funNames,
lhs.names, keep.names)
##cat(sprintf("newnames = %s\n", toString( newnames )))
colnames(out) <- newnames
out <- as.data.frame(out)
### Pad the rhs data to the result
if (!rh.trivial){
out <- cbind(data[rh.idx, rhs.grp, drop=FALSE], out)
}
### Pad id.data to result
##print(id.data)
if (length(ids.var)>0){
out <- cbind(out, id.data)
}
### Must the result have full dimension?
if (full.dimension){
rrr <- as.numeric(rh.string.factor)
out <- out[rrr,, drop=FALSE]
}
### Order the result by the rhs
if (order && !rh.trivial){
rhs.string <- paste (rhs.grp, collapse='+')
out <- orderBy(as.formula(paste("~", rhs.string)), data=out)
}
### Replace '('s and ')'s with '.'s
if (p2d)
names(out) <- gsub("\\)","\\.", gsub("\\(","\\.",names(out)))
### Finalize
rownames(out) <- 1:nrow(out)
if (length(unique(names(out))) != length(names(out)))
warning("dataframe contains replicate names \n", call.=FALSE)
if (is.tib) as_tibble(out) else out
## out
}
.get_rhs_string <- function(data, rhs.var, sep.string="@"){
if (length(rhs.var)==0){
rep.int("1", nrow(data))
} else {
rh.string <- paste(data[,rhs.var[1]])
if (length( rhs.var ) > 1){
for (ii in 2:length( rhs.var )){
rh.string <- paste(rh.string, sep.string, data[, rhs.var[ii]], sep='')
}
}
rh.string
}
}
.get_fun_names <- function( FUN ){
if (!is.list(FUN))
funNames <- paste(deparse(substitute(FUN, env=parent.frame())), collapse = " ")
else
funNames <- unlist(lapply(substitute(FUN, env=parent.frame())[-1], function(a) paste(a)))
##cat(sprintf("funNames = %s\n", toString(funNames)))
funNames
}
.get_col_names <- function(ncol.ans, colNames, funNames, lhs.num, keep.names){
### Names for new variables
## Does the columns of ans have names??
oldnames <- colNames #colnames(ans)
if (is.null(oldnames))
hasNames <- 0
else {
hasNames <- 1*(prod(nchar(oldnames))>0)
}
##cat(sprintf("hasNames=%i\n", hasNames))
## Dim of response (per variable on LHS)
dimr <- (ncol.ans)/length(lhs.num)
only.one.response <- (ncol.ans==length(lhs.num))
if ( keep.names && only.one.response ){
newnames <- lhs.num
} else {
if (hasNames>0){ ## newnames = var.fun
funNames <- colNames[1:dimr]
newnames <- unlist(lapply(lhs.num, function(v){paste(v, funNames, sep='.')}))
} else {
if (length(funNames) != dimr){
funNames <- paste("FUN", 1:dimr, sep='')
newnames <- unlist(lapply(lhs.num, function(v){paste(v, funNames, sep='.')}))
} else {
newnames <- unlist(lapply(funNames, function(x) paste(lhs.num, x, sep='.')))
}
if (length(newnames)!=ncol.ans){
funNames <- paste(funNames, 1:dimr, sep=".")
newnames <- unlist(lapply(funNames, function(x) paste(lhs.num, x, sep='.')))
}
}
}
newnames
}
.get_variables <- function(formula, data, id, debug.info){
data.var <- names(data)
if (!inherits(formula, c("formula", "list")))
stop("'formula' must be a formula or a list")
if (inherits(formula, "formula")){
if (length(formula) != 3) stop("Formula must have a left hand side")
rhs <- formula[[3]]
form.rhs.var <- all.vars(rhs) ## May contain "." and "1"
lhs <- formula[[2]]
form.lhs.var <- all.vars(lhs) ## May contain "."
#print(form.lhs.var)
zz <- .lhsParse(lhs)
form.lhs.var <-
if (length(zz)==1)
paste(zz)
else
paste(unlist(.lhsParse(lhs)))
##print(form.lhs.var)
} else {
if (length(formula)>=2){
lhs <- formula[[1]]
rhs <- formula[[2]]
form.lhs.var <- lhs
form.rhs.var <- rhs
} else {
stop("Invalid specification of formula")
}
}
if (is.null(id)){
form.ids.var <- character(0)
} else {
if (!inherits(id, c("formula", "character"))){
stop("'id' must be a formula or a character vector")
}
if (inherits(id, "formula")){
form.ids.var <- all.vars(id)
} else {
form.ids.var <- id
}
}
data.cls <- lapply(data, class)
data.num.idx <- data.cls %in% c("numeric","integer")
data.num.var <- data.var[ data.num.idx ]
data.fac.var <- data.var[ !data.num.idx ]
## print(form.lhs.var)
## print(data.num.var)
lhs.num <- intersect( form.lhs.var, data.num.var )
rhs.num <- intersect( form.rhs.var, data.num.var )
ids.num <- intersect( form.ids.var, data.num.var )
lhs.fac <- intersect( form.lhs.var, data.fac.var )
rhs.fac <- intersect( form.rhs.var, data.fac.var )
ids.fac <- intersect( form.ids.var, data.fac.var )
lll <- list(data.var=data.var,
form.lhs.var=form.lhs.var, form.rhs.var=form.rhs.var, form.ids.var=form.ids.var,
lhs.num=lhs.num, rhs.num=rhs.num, ids.num=ids.num,
lhs.fac=lhs.fac, rhs.fac=rhs.fac, ids.fac=ids.fac )
#if (debug.info>=1)
##{ cat("status:\n"); str(lll, vec.len=20) }
if ( "." %in% form.lhs.var ){ ## need all numeric variables not metioned elswhere on lhs
form.lhs.var <- setdiff(form.lhs.var, ".")
lhs.num <- union( form.lhs.var, setdiff(data.num.var, c(rhs.num, ids.num)))
if ( length( lhs.fac ) > 0 ){
isSpecial <- rep(NA, length( lhs.fac ))
for (j in 1:length(lhs.fac)){
isSpecial[j]<- (class(data[,lhs.fac[j]])[1] %in% c("POSIXt", "Date"))
}
lhs.num <- union( lhs.num, lhs.fac[ isSpecial ] )
}
} else {
lhs.num <- form.lhs.var
}
## The grouping variable
if ("." %in% form.rhs.var){ ## need all factors not mentioned elsewhere as grouping factors
free.fac <- setdiff(data.fac.var, c(lhs.fac, ids.fac))
rhs.grp <- c(setdiff(form.rhs.var, "."), free.fac)
} else {
rhs.grp <- form.rhs.var
}
rhs.grp <- intersect( rhs.grp, data.var )
rrr <- list(lhs.num=lhs.num, rhs.fac=rhs.fac, form.ids.var=form.ids.var,
form.rhs.var=form.rhs.var, rhs.grp=rhs.grp)
##str(rrr)
rrr
}
.lhsParse <- function(x){
##cat(".lhsParse:"); print(x); print(class(x))
if (inherits(x, 'name')){
value <- x
} else {
s <- paste(x[[1]])
value <- switch(s,
'+'={ c(.lhsParse(x[[2]]),.lhsParse(x[[3]]))},
'I'={ x[[2]]},
{ deparse(x)})
}
value
}
## '
## ' data(dietox)
## ' dietox12 <- subset(dietox,Time==12)
## '
## ' fun <- function(x){
## ' c(m=mean(x), v=var(x), n=length(x))
## ' }
## '
## ' summaryBy(cbind(Weight, Feed) ~ Evit + Cu, data=dietox12,
## ' FUN=fun)
## '
## ' summaryBy(list(c("Weight", "Feed"), c("Evit", "Cu")), data=dietox12,
## ' FUN=fun)
## '
## ' ## Computations on several variables is done using cbind( )
## ' summaryBy(cbind(Weight, Feed) ~ Evit + Cu, data=subset(dietox, Time > 1),
## ' FUN=fun)
## '
## ' ## Calculations on transformed data is possible using cbind( ), but
## ' # the transformed variables must be named
## '
## ' summaryBy(cbind(lw=log(Weight), Feed) ~ Evit + Cu, data=dietox12, FUN=mean)
## '
## ' ## There are missing values in the 'airquality' data, so we remove these
## ' ## before calculating mean and variance with 'na.rm=TRUE'. However the
## ' ## length function does not accept any such argument. Hence we get
## ' ## around this by defining our own summary function in which length is
## ' ## not supplied with this argument while mean and var are:
## '
## ' sumfun <- function(x, ...){
## ' c(m=mean(x, na.rm=TRUE, ...), v=var(x, na.rm=TRUE, ...), l=length(x))
## ' }
## ' summaryBy(cbind(Ozone, Solar.R) ~ Month, data=airquality, FUN=sumfun)
## ' ## Compare with
## ' aggregate(cbind(Ozone, Solar.R) ~ Month, data=airquality, FUN=sumfun)
## '
## ' ## Using '.' on the right hand side of a formula means to stratify by
## ' ## all variables not used elsewhere:
## '
## ' data(warpbreaks)
## ' summaryBy(breaks ~ wool + tension, warpbreaks, FUN=mean)
## ' summaryBy(breaks ~ ., warpbreaks, FUN=mean)
## ' summaryBy(. ~ wool + tension, warpbreaks, FUN=mean)
## '
## ' summaryBy(. ~ wool + tension, warpbreaks, FUN=mean)
## '
## '
## '
## ' @title Function to calculate groupwise summary statistics
## ' @description Function to calculate groupwise summary statistics,
## ' much like the summary procedure of SAS
## ' @name by-summary
## '
## ' @details Extra arguments (...) are passed onto the functions in
## ' FUN. Hence care must be taken that all functions in FUN accept
## ' these arguments - OR one can explicitly write a functions which
## ' get around this. This can particularly be an issue in
## ' connection with handling NAs. See examples below. Some code
## ' for this function has been suggested by Jim
## ' Robison-Cox. Thanks.
## '
## ' @param formula A formula object, see examples below.
## ' @param data A data frame.
## ' @param FUN A list of functions to be applied, see examples below.
## ' @param id A formula specifying variables which data are not grouped by but
## ' which should appear in the output. See examples below.
## ' @param keep.names If TRUE and if there is only ONE function in FUN, then the
## ' variables in the output will have the same name as the variables in the
## ' input, see 'examples'.
## ' @param p2d Should parentheses in output variable names be replaced by dots?
## ' @param order Should the resulting dataframe be ordered according to the
## ' variables on the right hand side of the formula? (using \link{orderBy}
## ' @param full.dimension If TRUE then rows of summary statistics are repeated
## ' such that the result will have the same number of rows as the input
## ' dataset.
## ' @param var.names Option for user to specify the names of the variables on the
## ' left hand side.
## ' @param fun.names Option for user to specify function names to apply to the
## ' variables on the left hand side.
## ' @param \dots Additional arguments to FUN. This could for example be NA actions.
## ' @return A dataframe.
## ' @author Søren Højsgaard, \email{sorenh@@math.aau.dk}
## ' @seealso \code{\link{ave}}, \code{\link{descStat}}, \code{\link{orderBy}}, \code{\link{order_by}},
## ' \code{\link{splitBy}}, \code{\link{split_by}}, \code{\link{transformBy}}, \code{\link{transform_by}}
## ' @keywords univar
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.