##' Categorical variables are summarized using counts and frequencies and compared .
##'
##' This function can generate the baseline demographic characteristics
##' that forms table 1 in many publications. It is also useful for generating
##' other tables of univariate statistics.
##'
##' The result of the function is an object (list) which containe the various data
##' generated. In most applications the \code{summary} function should be applied which generates
##' a data.frame with a (nearly) publication ready table. Standard manipulation can be
##' used to modify, add or remove columns/rows and for users not accustomed to R the table
##' generated can be exported to a text file which can be read by other software, e.g., via
##' write.csv(table,file="path/to/results/table.csv")
##'
##' By default, continuous variables are summarized by means and standard deviations
##' and compared with t-tests. When continuous variables are summarized by medians
##' and interquartile ranges the
##' Deviations from the above defaults are obtained when the
##' arguments summary.format and freq.format are combined with suitable
##' summary functions.
##'
##' @title Univariate table
##' @aliases utable univariateTable
##' @param formula Formula specifying the grouping variable (strata)
##' on the left hand side (can be omitted) and on the right hand side
##' the variables for which to obtain (descriptive) statistics.
##' @param data Data set in which formula is evaluated
##' @param summary.format Format for the numeric (non-factor)
##' variables. Default is mean (SD). If different formats are
##' desired, either special Q can be used or the function is called
##' multiple times and the results are rbinded. See examples.
##' @param Q.format Format for quantile summary of numerical
##' variables: Default is median (inter quartile range).
##' @param freq.format Format for categorical variables. Default is
##' count (percentage).
##' @param column.percent Logical, if \code{TRUE} and the default
##' freq.format is used then column percentages are given instead of
##' row percentages for categorical variables (factors).
##' @param digits Number of digits
##' @param big.mark For formatting large numbers (i.e., greater than 1,000). \code{""} turn this off.
##' @param short.groupnames If \code{TRUE} group names are abbreviated.
##' @param compare.groups Method used to compare groups. If
##' \code{"logistic"} and there are exactly two groups logistic
##' regression is used instead of t-tests and Wilcoxon rank tests to
##' compare numeric variables across groups.
##' @param show.totals If \code{TRUE} show a column with totals.
##' @param n If \code{TRUE} show the number of subjects as a separate
##' row. If equal to \code{"inNames"}, show the numbers in
##' parentheses in the column names. If \code{FALSE} do not show
##' number of subjects.
##' @param outcome Outcome data used to calculate p-values when
##' compare groups method is \code{'logistic'} or \code{'cox'}.
##' @param ... saved as part of the result to be passed on to
##' \code{labelUnits}
##' @return List with one summary table element for each variable on the right hand side of formula.
##' The summary tables can be combined with \code{rbind}. The function \code{summary.univariateTable}
##' combines the tables, and shows p-values in custom format.
##' @author Thomas A. Gerds
##' @seealso summary.univariateTable, publish.univariateTable
##' @examples
##' data(Diabetes)
##' library(data.table)
##' univariateTable(~age,data=Diabetes)
##' univariateTable(~gender,data=Diabetes)
##' univariateTable(~age+gender+ height+weight,data=Diabetes)
##' ## same thing but less typing
##' utable(~age+gender+ height+weight,data=Diabetes)
##'
##' ## summary by location:
##' univariateTable(location~Q(age)+gender+height+weight,data=Diabetes)
##' ## continuous variables marked with Q() are (by default) summarized
##' ## with median (IQR) and kruskal.test (with two groups equivalent to wilcox.test)
##' ## variables not marked with Q() are (by default) summarized
##' ## with mean (sd) and anova.glm(...,test="Chisq")
##' ## the p-value of anova(glm()) with only two groups is similar
##' ## but not exactly equal to that of a t.test
##' ## categorical variables are (by default) summarized by count
##' ## (percent) and chi-square tests (\code{chisq.test}). When \code{compare.groups ='logistic'}
##' ## anova(glm(...,family=binomial,test="Chisq")) is used to calculate p-values.
##'
##' ## export result to csv
##' table1 = summary(univariateTable(location~age+gender+height+weight,data=Diabetes),
##' show.pvalues=FALSE)
##' # write.csv(table1,file="~/table1.csv",rownames=FALSE)
##'
##' ## change labels and values
##' utable(location~age+gender+height+weight,data=Diabetes,
##' age="Age (years)",gender="Sex",
##' gender.female="Female",
##' gender.male="Male",
##' height="Body height (inches)",
##' weight="Body weight (pounds)")
##'
##' ## Use quantiles and rank tests for some variables and mean and standard deviation for others
##' univariateTable(gender~Q(age)+location+Q(BMI)+height+weight,
##' data=Diabetes)
##'
##' ## Factor with more than 2 levels
##' Diabetes$AgeGroups <- cut(Diabetes$age,
##' c(19,29,39,49,59,69,92),
##' include.lowest=TRUE)
##' univariateTable(location~AgeGroups+gender+height+weight,
##' data=Diabetes)
##'
##' ## Row percent
##' univariateTable(location~gender+age+AgeGroups,
##' data=Diabetes,
##' column.percent=FALSE)
##'
##' ## change of frequency format
##' univariateTable(location~gender+age+AgeGroups,
##' data=Diabetes,
##' column.percent=FALSE,
##' freq.format="percent(x) (n=count(x))")
##'
##' ## changing Labels
##' u <- univariateTable(location~gender+AgeGroups+ height + weight,
##' data=Diabetes,
##' column.percent=TRUE,
##' freq.format="count(x) (percent(x))")
##' summary(u,"AgeGroups"="Age (years)","height"="Height (inches)")
##'
##' ## more than two groups
##' Diabetes$frame=factor(Diabetes$frame,levels=c("small","medium","large"))
##' univariateTable(frame~gender+BMI+age,data=Diabetes)
##'
##' Diabetes$sex=as.numeric(Diabetes$gender)
##' univariateTable(frame~sex+gender+BMI+age,
##' data=Diabetes,freq.format="count(x) (percent(x))")
##'
##' ## multiple summary formats
##' ## suppose we want for some reason mean (range) for age
##' ## and median (range) for BMI.
##' ## method 1:
##' univariateTable(frame~Q(age)+BMI,
##' data=Diabetes,
##' Q.format="mean(x) (range(x))",
##' summary.format="median(x) (range(x))")
##' ## method 2:
##' u1 <- summary(univariateTable(frame~age,
##' data=na.omit(Diabetes),
##' summary.format="mean(x) (range(x))"))
##' u2 <- summary(univariateTable(frame~BMI,
##' data=na.omit(Diabetes),
##' summary.format="median(x) (range(x))"))
##' publish(rbind(u1,u2),digits=2)
##'
##' ## Large number format (big.mark)
##' n=100000
##' dat=data.frame(id=1:n,z=rbinom(n,1,.3),x=factor(sample(1:8,size=n,replace=TRUE)))
##' u3 <- summary(univariateTable(z~x,
##' data=dat,big.mark=","))
##' u3
##'
##' @export
univariateTable <- function(formula,
data=parent.frame(),
summary.format="mean(x) (sd(x))",
Q.format="median(x) [iqr(x)]",
freq.format="count(x) (percent(x))",
column.percent=TRUE,
digits=c(1,1,3),
big.mark=",",
short.groupnames,
compare.groups=TRUE,
show.totals=TRUE,
n="inNames",
outcome=NULL,
...){
if (length(digits)<3) digits <- rep(digits,3)
if (!is.numeric(digits.summary <- digits[[1]])) digits.summary <- 1
if (!is.numeric(digits.freq <- digits[[2]])) digits.freq <- 1
if (!is.numeric(pvalue.digits <- digits[[3]])) pvalue.digits <- 3
call <- match.call()
# {{{ parse formula and find data
oldnaaction <- options()$na.action
options(na.action="na.pass")
FRAME <- specialFrame(formula,
data,
specials.design=FALSE,
unspecials.design=FALSE,
specials=c("F","S","Q","strata","Strata","factor","Factor","Cont","nonpar"),
specials.factor = FALSE,
strip.specials=c("F","S","Q"),
strip.arguments=list("S"="format"),
strip.alias=list("F"=c("strata","factor","Strata","Factor"),"S"="Cont","Q"="nonpar"),
na.action="na.pass")
options(na.action=oldnaaction)
# }}}
# {{{ extract grouping variable
if (is.null(FRAME$response)){
groupvar <- NULL
groupname <- NULL
grouplabels <- NULL
groups <- NULL
n.groups <- NROW(data)
}
else{
mr <- FRAME$response
if(NCOL(mr)!=1) stop("Can only handle univariate outcome")
groupname <- colnames(mr)
groupvar <- as.character(FRAME$response[,1,drop=TRUE])
mr <- FRAME$response[,1,drop=TRUE]
## deal with missing values in group variable
if (is.factor(mr)){
if (any(is.na(groupvar))){
groupvar[is.na(groupvar)] <- "Missing"
groups <- c(levels(mr),"Missing")
}else{
groups <- levels(mr)
}
} else {
if (any(is.na(groupvar))){
groupvar[is.na(groupvar)] <- "Missing"
}
groups <- unique(groupvar)
}
groupvar <- factor(groupvar,levels=groups)
n.groups <- table(groupvar)
n.groups <- c(n.groups,sum(n.groups))
if (compare.groups=="logistic" & (length(groups)!=2))
stop("compare.groups can only be equal to 'logistic' when there are exactly two groups. You have ",length(groups)," groups")
## if (length(groups)>30) stop("More than 30 groups")
if (missing(short.groupnames)){
if(all(nchar(groups)<2) || all(groups %in% c(TRUE,FALSE)))
short.groupnames <- FALSE
else
short.groupnames <- TRUE
}
if (short.groupnames==TRUE)
grouplabels <- groups
else
grouplabels <- paste(groupname,"=",groups)
}
# }}}
# {{{ classify variables into continuous numerics and grouping factors
automatrix <- FRAME$design
continuous.matrix <- NULL
factor.matrix <- NULL
auto.type <- sapply(1:NCOL(automatrix),function(i){
x <- automatrix[,i]
# type 0=other coerced to numeric
# 1=factor
# 2=numeric
# 3=character
## set some useful default
type.i <- is.factor(x)+2*is.numeric(x)+3*is.logical(x)+4*is.character(x)
# treat character and logical as factors
if (type.i %in% c(3,4)) type.i <- 1
# treat other variables as numeric (e.g. difftime)
if (type.i==0) type.i <- 2
# force variables with less than 3 distinct values to be categorical (factors)
if (length(unique(x))<3) type.i <- 1
type.i})
if (any(auto.type==2)){
if (is.null(FRAME$S))
continuous.matrix <- automatrix[,auto.type==2,drop=FALSE]
else
continuous.matrix <- cbind(FRAME$S,automatrix[,auto.type==2,drop=FALSE])
}
if (any(auto.type==1)){
if (is.null(FRAME$F))
factor.matrix <- automatrix[,auto.type==1,drop=FALSE]
else
factor.matrix <- cbind(FRAME$F,automatrix[,auto.type==1,drop=FALSE])
}
Q.matrix <- FRAME$Q
NVARS <- NCOL(continuous.matrix)+ NCOL(continuous.matrix)+NCOL(factor.matrix)+ NCOL(Q.matrix)
# }}}
# {{{ summary numeric variables
if (!is.null(continuous.matrix)){
# prepare format
sumformat <- parseSummaryFormat(format=summary.format,digits=digits.summary)
# get summary excluding missing in groups and in totals
summaryNumeric <- getSummary(matrix=continuous.matrix,
varnames=names(continuous.matrix),
groupvar=groupvar,
groups=groups,
labels=grouplabels,
stats=sumformat$stats,
format=sumformat$format,
digits=digits.summary,big.mark=big.mark)
}
else{
sumformat <- NULL
summaryNumeric <- NULL
}
if (!is.null(Q.matrix)){
# prepare format
Qformat <- parseSummaryFormat(format=Q.format,digits=digits.summary)
# get summary excluding missing in groups and in totals
qNumeric <- getSummary(matrix=Q.matrix,
varnames=names(Q.matrix),
groupvar=groupvar,
groups=groups,
labels=grouplabels,
stats=Qformat$stats,
format=Qformat$format,digits=digits.summary,big.mark=big.mark)
}
else{
Qformat <- NULL
qNumeric <- NULL
}
# }}}
# {{{ categorical variables (factors)
if (!is.null(factor.matrix)){
if (column.percent==TRUE){
freq.format <- sub("percent","colpercent",freq.format)
freq.format <- sub("colcolpercent","colpercent",freq.format)
}
# prepare format
freqformat <- parseFrequencyFormat(format=freq.format,digits=digits.freq)
# get frequencies excluding missing in groups and in totals
freqFactor <- getFrequency(matrix=factor.matrix,
varnames=names(factor.matrix),
groupvar=groupvar,
groups=groups,
labels=grouplabels,
stats=freqformat$stats,
format=freqformat$format,big.mark=big.mark,digits=digits.freq)
}
else{
freqformat <- NULL
freqFactor <- NULL
}
# }}}
# {{{ missing values
mlist <- list(continuous.matrix,Q.matrix,factor.matrix)
allmatrix <- do.call("cbind",mlist[!sapply(mlist,is.null)])
totals.missing <- lapply(allmatrix,function(v){sum(is.na(v))})
if (!is.null(groups)){
group.missing <- lapply(allmatrix,function(v){
lapply(groups,function(g){
sum(is.na(v[groupvar==g]))
})
})}
else {
group.missing <- NULL
}
# }}}
# {{{ p-values
p.cont <- NULL
p.Q <- NULL
p.freq <- NULL
if (!is.null(groups) && (compare.groups!=FALSE)){
if (!is.null(continuous.matrix)){
p.cont <- sapply(names(continuous.matrix),function(v){
data.table::set(data,j=v,value=as.numeric(data[[v]]))
switch(tolower(as.character(compare.groups[[1]])),
"false"={NULL},
"logistic"={
## logistic regression
px <- anova(glm(update(formula,paste(".~",v)),data=data,family=binomial),test="Chisq")$"Pr(>Chi)"[2]
px
},
"cox"={
px <- anova(coxph(formula(paste("Surv(time,status)~",v)),data=cbind(outcome,data)))$"Pr(>|Chi|)"[2]
px
},
"true"={
## glm fails when there are missing values
## in outcome, so we remove missing values
fv <- formula(paste(v,"~",groupname))
vdata <- model.frame(fv,data,na.action=na.omit)
px <- anova(glm(fv,data=vdata),test="Chisq")$"Pr(>Chi)"[2]
px
},NULL)
})
}
if (!is.null(Q.matrix)){
p.Q <- sapply(names(Q.matrix),function(v){
switch(tolower(as.character(compare.groups[[1]])),
"false"={NULL},
"logistic"={
## logistic regression
## glm fails when there are missing values
## in outcome, so we remove missing values
fv <- formula(paste(v,"~",groupname))
vdata <- model.frame(fv,data,na.action=na.omit)
px <- anova(glm(update(formula,paste(".~",v)),data=vdata,family=binomial),test="Chisq")$"Pr(>Chi)"[2]
px
},
"cox"={
px <- anova(coxph(formula(paste("Surv(time,status)~",v)),data=cbind(outcome,data)))$"Pr(>|Chi|)"[2]
px
},
"true"={
if (is.character(data[[groupname]])){
data[[paste0(groupname,"asfactor")]] <- factor(data[[groupname]])
px <- kruskal.test(formula(paste0(v,"~",groupname,"asfactor")),data=data)$p.value
} else{
px <- kruskal.test(formula(paste(v,"~",groupname)),data=data)$p.value
}
px
},NULL)
})
}
if (!is.null(factor.matrix)){
p.freq <- sapply(names(factor.matrix),function(v){
switch(tolower(as.character(compare.groups[[1]])),
"false"={NULL},
"logistic"={
## logistic regression
fv <- formula(paste(v,"~",groupname))
vdata <- model.frame(fv,data,na.action=na.omit)
px <- anova(glm(update(formula,paste(".~",v)),data=vdata,family=binomial),test="Chisq")$"Pr(>Chi)"[2]
},
"cox"={
px <- anova(coxph(formula(paste("Surv(time,status)~",v)),data=cbind(outcome,data)))$"Pr(>|Chi|)"[2]
px
},
"true"={
fv <- factor.matrix[,v]
tabx <- table(fv,groupvar)
if (sum(tabx)==0) {
px <- NA
} else{
suppressWarnings(test <- chisq.test(tabx))
px <- test$p.value
}
## FIXME: need to catch and pass the warnings
## test <- suppressWarnings(fisher.test(tabx))
## if (any(test$expected < 5) && is.finite(test$parameter))
px
},NULL)
})
}
}
p.values <- c(p.cont,p.Q,p.freq)
if (length(p.values)>0)
if (is.null(p.values[[1]]))
p.values <- NULL
# }}}
# {{{ output
## xlevels <- lapply(factor.matrix,function(x){
## levels(as.factor(x,exclude=FALSE))
## levels(as.factor(x))
## })
vartypes <- rep(c("numeric","Q","factor"),c(length(names(continuous.matrix)),length(names(Q.matrix)),length(names(factor.matrix))))
names(vartypes) <- c(names(continuous.matrix),names(Q.matrix),names(factor.matrix))
out <- list(summary.groups=c(freqFactor$groupfreq,summaryNumeric$groupsummary,qNumeric$groupsummary),
summary.totals=c(freqFactor$totals,summaryNumeric$totals,qNumeric$totals),
missing=list(group=group.missing,totals=totals.missing),
n.groups=n.groups,
p.values=p.values,
formula=formula,
groups=grouplabels,
vartype=vartypes,
xlevels=freqFactor$xlevels,
Q.format=Q.format,
summary.format=summary.format,
freq.format=freq.format,
compare.groups=compare.groups,
## dots are passed to labelUnits without suitability checks
show.totals=show.totals,
n=n,
big.mark=big.mark,
labels=list(...))
class(out) <- "univariateTable"
out
# }}}
}
## the name utable is more handy
##' @export utable
utable <- univariateTable
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.