Summarize Mixed Data Types vs. Groups
Description
summaryM
summarizes the variables listed in an S formula,
computing descriptive statistics and optionally statistical tests for
group differences. This function is typically used when there are
multiple left-hand-side variables that are each summarized independently
against groups marked by a single right-hand-side variable. The summary
statistics may be passed to print
methods, plot
methods
for making annotated dot charts and extended box plots, and
latex
methods for typesetting tables using LaTeX. The
html
method uses htmlTable::htmlTable
to typeset the
table in html, by passing information to the latex
method with
html=TRUE
. This is for use with RMarkdown under RStudio.
The print
methods use the print.char.matrix
function to
print boxed tables.
The plot
method creates plotly
graphics if
options(grType='plotly')
, otherwise base graphics are used.
plotly
graphics provide extra information such as which
quantile is being displayed when hovering the mouse. Test statistics
are displayed by hovering over the mean.
Continuous variables are described by three quantiles (quartiles by
default) when printing, or by the following quantiles when plotting
extended box plots using the bpplt
function:
0.05, 0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 0.95. The box
plots are scaled to the 0.025 and 0.975 quantiles of each continuous
left-hand-side variable. Categorical variables are
described by counts and percentages.
The left hand side of formula
may contain mChoice
("multiple choice") variables. When test=TRUE
each choice is
tested separately as a binary categorical response.
The plot
method for method="reverse"
creates a temporary
function Key
as is done by the xYplot
and
Ecdf.formula
functions. After plot
runs, you can type Key()
to put a legend in a default location, or
e.g. Key(locator(1))
to draw a legend where you click the left
mouse button. This key is for categorical variables, so to have the
opportunity to put the key on the graph you will probably want to use
the command plot(object, which="categorical")
. A second function
Key2
is created if continuous variables are being plotted. It is
used the same as Key
. If the which
argument is not
specified to plot
, two pages of plots will be produced. If you
don't define par(mfrow=)
yourself,
plot.summaryM
will try to lay out a multipanel
graph to best fit all the individual charts for continuous
variables.
Usage
summaryM(formula, groups=NULL, data=NULL, subset, na.action=na.retain,
overall=FALSE, continuous=10, na.include=FALSE,
quant=c(0.025, 0.05, 0.125, 0.25, 0.375, 0.5, 0.625,
0.75, 0.875, 0.95, 0.975),
nmin=100, test=FALSE,
conTest=conTestkw, catTest=catTestchisq,
ordTest=ordTestpo)
## S3 method for class 'summaryM'
print(x, digits, prn = any(n != N),
what=c('proportion', '%'), pctdig = if(what == '%') 0 else 2,
npct = c('numerator', 'both', 'denominator', 'none'),
exclude1 = TRUE, vnames = c('labels', 'names'), prUnits = TRUE,
sep = '/', abbreviate.dimnames = FALSE,
prefix.width = max(nchar(lab)), min.colwidth, formatArgs=NULL, round=NULL,
prtest = c('P','stat','df','name'), prmsd = FALSE, long = FALSE,
pdig = 3, eps = 0.001, prob = c(0.25, 0.5, 0.75), prN = FALSE, ...)
## S3 method for class 'summaryM'
plot(x, vnames = c('labels', 'names'),
which = c('both', 'categorical', 'continuous'), vars=NULL,
xlim = c(0,1),
xlab = 'Proportion',
pch = c(16, 1, 2, 17, 15, 3, 4, 5, 0), exclude1 = TRUE,
main, subtitles = TRUE, ncols=2,
prtest = c('P', 'stat', 'df', 'name'), pdig = 3, eps = 0.001,
conType = c('bp', 'dot', 'raw'), cex.means = 0.5, cex=par('cex'),
height='auto', width=700, ...)
## S3 method for class 'summaryM'
latex(object, title =
first.word(deparse(substitute(object))),
file=paste(title, 'tex', sep='.'), append=FALSE, digits,
prn = any(n != N), what=c('proportion', '%'),
pctdig = if(what == '%') 0 else 2,
npct = c('numerator', 'both', 'denominator', 'slash', 'none'),
npct.size = if(html) mspecs$html$smaller else 'scriptsize',
Nsize = if(html) mspecs$html$smaller else 'scriptsize',
exclude1 = TRUE,
vnames=c("labels", "names"), prUnits = TRUE, middle.bold = FALSE,
outer.size = if(html) mspecs$html$smaller else "scriptsize",
caption, rowlabel = "", rowsep=html,
insert.bottom = TRUE, dcolumn = FALSE, formatArgs=NULL, round=NULL,
prtest = c('P', 'stat', 'df', 'name'), prmsd = FALSE,
msdsize = if(html) function(x) x else NULL, brmsd=FALSE,
long = FALSE, pdig = 3, eps = 0.001,
auxCol = NULL, table.env=TRUE, tabenv1=FALSE, prob=c(0.25, 0.5, 0.75),
prN=FALSE, legend.bottom=FALSE, html=FALSE,
mspecs=markupSpecs, ...)
## S3 method for class 'summaryM'
html(object, ...)

Arguments
formula 
An S formula with additive effects. There may be several variables
on the right hand side separated by "+",
or the numeral 
groups 
if there is more than one right-hand variable, specify

x 
an object created by summaryM
data 
name or number of a data frame. Default is the current frame. 
subset 
a logical vector or integer vector of subscripts used to specify the subset of data to use in the analysis. The default is to use all observations in the data frame. 
na.action 
function for handling missing data in the input data. The default is
a function defined here called 
overall 
Setting 
continuous 
specifies the threshold for when a variable is considered to be
continuous (when there are at least 
na.include 
Set 
nmin 
For categories of the response variable in which there
are less than or equal to 
test 
Set to 
conTest 
a function of two arguments (grouping variable and a continuous
variable) that returns a list with components 
catTest 
a function of a frequency table (an integer matrix) that returns a
list with the same components as created by 
ordTest 
a function of a frequency table (an integer matrix) that returns a
list with the same components as created by 
... 
For 
object 
an object created by summaryM
quant 
vector of quantiles to use for summarizing continuous variables.
These must be numbers between 0 and 1
inclusive and must include the numbers 0.5, 0.25, and 0.75 which are
used for printing and for plotting
quantile intervals. The outer quantiles are used for scaling the x-axes
for such plots. Specify outer quantiles as 
prob 
vector of quantiles to use for summarizing continuous variables.
These must be numbers between 0 and 1 inclusive and have previously been
included in the quant argument of summaryM. Warning: specifying 0 and 1 as two of the quantiles will result in computing the minimum and maximum of the variable. For many random variables the minimum will continue to become smaller as the sample size grows, and the maximum will continue to get larger. Thus the min and max are not recommended as summary statistics. 
vnames 
By default, tables and plots are usually labeled with variable labels
(see the 
pch 
vector of plotting characters to represent different groups, in order of group levels. 
abbreviate.dimnames 
see 
prefix.width 
see 
min.colwidth 
minimum column width to use for boxes printed with 
formatArgs 
a list containing other arguments to pass to 
digits 
number of significant digits to print. Default is to use the current
value of the 
what 
specifies whether proportions or percentages are to be printed or LaTeX'd 
pctdig 
number of digits to the right of the decimal place for printing
percentages or proportions. The default is zero if 
prn 
set to 
prN 
set to 
npct 
specifies which counts are to be printed to the right of percentages.
The default is to print the frequency (numerator of the percent) in
parentheses. You can specify 
npct.size 
the size for typesetting 
Nsize 
When a second row of column headings is added showing sample sizes,

exclude1 
By default, 
prUnits 
set to 
sep 
character to use to separate quantiles when printing tables 
prtest 
a vector of test statistic components to print if 
round 
Specify 
prmsd 
set to 
msdsize 
defaults to 
brmsd 
set to 
long 
set to 
pdig 
number of digits to the right of the decimal place for printing
P-values. Default is 
eps 
P-values less than 
auxCol 
an optional auxiliary column of information, right justified, to add
in front of statistics typeset by

table.env 
set to 
tabenv1 
set to 
which 
Specifies whether to plot results for categorical variables, continuous variables, or both (the default). 
vars 
Subscripts (indexes) of variables to plot for

conType 
For drawing plots for continuous variables,
extended box plots (box-percentile-type plots) are drawn by default,
using all quantiles in 
cex.means 
character size for means in box-percentile plots; default is .5 
cex 
character size for other plotted items 
height,width 
dimensions in pixels for the 
xlim 
vector of length two specifying x-axis limits. This is only used
for plotting categorical variables. Limits for continuous
variables are determined by the outer quantiles specified in

xlab 
x-axis label 
main 
a main title. This applies only to the plot for categorical variables. 
subtitles 
set to 
ncols 
number of columns for 
caption 
character string containing LaTeX table captions. 
title 
name of resulting LaTeX file omitting the 
file 
name of file to write LaTeX code to. Specifying

append 
specify 
rowlabel 
see 
rowsep 
if 
middle.bold 
set to 
outer.size 
the font size for outer quantiles 
insert.bottom 
set to 
legend.bottom 
set to 
html 
set to 
mspecs 
list defining markup syntax for various languages,
defaults to Hmisc markupSpecs 
dcolumn 
see 
Value
a list. plot.summaryM
returns the number
of pages of plots that were made if using base graphics, or
plotly
objects created by plotly::subplot
otherwise.
If both categorical and continuous variables were plotted, the
returned object is a list with two named elements Categorical
and Continuous
each containing plotly
objects.
Otherwise a plotly
object is returned.
The latex
method returns attributes legend
and
nstrata
.
Side Effects
plot.summaryM
creates a function Key
and
Key2
in frame 0 that will draw legends, if base graphics are
being used.
Author(s)
Frank Harrell
Department of Biostatistics
Vanderbilt University
f.harrell@vanderbilt.edu
References
Harrell FE (2004): Statistical tables and plots using S and LaTeX. Document available from http://biostat.mc.vanderbilt.edu/twiki/pub/Main/StatReport/summary.pdf.
See Also
mChoice
, label
, dotchart3
,
print.char.matrix
, update
,
formula
,
format.default
, latex
,
latexTranslate
, bpplt
,
tabulr
, bpplotM
, summaryP
Examples
options(digits=3)
set.seed(173)
sex <- factor(sample(c("m","f"), 500, rep=TRUE))
country <- factor(sample(c('US', 'Canada'), 500, rep=TRUE))
age <- rnorm(500, 50, 5)
sbp <- rnorm(500, 120, 12)
label(sbp) <- 'Systolic BP'
units(sbp) <- 'mmHg'
treatment <- factor(sample(c("Drug","Placebo"), 500, rep=TRUE))
treatment[1]
sbp[1] <- NA
# Generate a 3-choice variable; each of 3 variables has 5 possible levels
symp <- c('Headache','Stomach Ache','Hangnail',
'Muscle Ache','Depressed')
symptom1 <- sample(symp, 500,TRUE)
symptom2 <- sample(symp, 500,TRUE)
symptom3 <- sample(symp, 500,TRUE)
Symptoms <- mChoice(symptom1, symptom2, symptom3, label='Primary Symptoms')
table(as.character(Symptoms))
# Note: In this example, some subjects have the same symptom checked
# multiple times; in practice these redundant selections would be NAs
# mChoice will ignore these redundant selections
f <- summaryM(age + sex + sbp + Symptoms ~ treatment, test=TRUE)
f
# trio of numbers represent 25th, 50th, 75th percentile
print(f, long=TRUE)
plot(f) # first specify options(grType='plotly') to use plotly
plot(f, conType='dot', prtest='P')
bpplt() # annotated example showing layout of bp plot
# Produce separate tables by country
f <- summaryM(age + sex + sbp + Symptoms ~ treatment + country,
groups='treatment', test=TRUE)
f
## Not run:
getHdata(pbc)
s5 <- summaryM(bili + albumin + stage + protime + sex +
age + spiders ~ drug, data=pbc)
print(s5, npct='both')
# npct='both' : print both numerators and denominators
plot(s5, which='categorical')
Key(locator(1)) # draw legend at mouse click
par(oma=c(3,0,0,0)) # leave outer margin at bottom
plot(s5, which='continuous') # see also bpplotM
Key2() # draw legend at lower left corner of plot
# oma= above makes this default key fit the page better
options(digits=3)
w <- latex(s5, npct='both', here=TRUE, file='')
options(grType='plotly')
pbc <- upData(pbc, moveUnits = TRUE)
s <- summaryM(bili + albumin + alk.phos + copper + spiders + sex ~
drug, data=pbc, test=TRUE)
html(s)
a <- plot(s)
a$Categorical
a$Continuous
plot(s, which='con')
## End(Not run)
