Nothing
#' Univariate Statistics for Numerical Data
#'
#' Calculates mean, sd, min, Q1*, median, Q3*, max, MAD, IQR*, CV,
#' skewness*, SE.skewness*, and kurtosis* on numerical vectors. (*) Not
#' available when using sampling weights.
#'
#' @param x A numerical vector or a data frame.
#' @param var Unquoted expression referring to a specific column in \code{x}.
#' Provides support for piped function calls (e.g.
#' \code{my_df \%>\% descr(my_var)}.
#' @param stats Character. Which stats to produce. Either \dQuote{all} (default),
#' \dQuote{fivenum}, \dQuote{common} (see \emph{Details}), or a selection of :
#' \dQuote{mean}, \dQuote{sd}, \dQuote{min}, \dQuote{q1}, \dQuote{med},
#' \dQuote{q3}, \dQuote{max}, \dQuote{mad}, \dQuote{iqr}, \dQuote{cv},
#' \dQuote{skewness}, \dQuote{se.skewness}, \dQuote{kurtosis},
#' \dQuote{n.valid}, and \dQuote{pct.valid}. Can be set globally via
#' \code{\link{st_options}}, option \dQuote{descr.stats}.
#' @param na.rm Logical. Argument to be passed to statistical functions.
#' Defaults to \code{TRUE}.
#' @param round.digits Numeric. Number of significant digits to display.
#' Defaults to \code{2}. Can be set globally with \code{\link{st_options}}.
#' @param transpose Logical. Make variables appears as columns, and stats as
#' rows. Defaults to \code{FALSE}. Can be set globally with
#' \code{\link{st_options}}, option \dQuote{descr.transpose}.
#' @param order Character. When analyzing more than one variable, this parameter
#' determines how to order variables. Valid values are \dQuote{sort} (or
#' simply \dQuote{s}), \dQuote{preserve} (or \dQuote{p}), or a vector
#' containing all variable names in the desired order. Defaults to
#' \dQuote{sort}.
#' @param style Character. Style to be used by \code{\link[pander]{pander}}. One
#' of \dQuote{simple} (default), \dQuote{grid}, \dQuote{rmarkdown}, or
#' \dQuote{jira}. Can be set globally with \code{\link{st_options}}.
#' @param plain.ascii Logical. \code{\link[pander]{pander}} argument; when
#' \code{TRUE} (default), no markup characters will be used (useful when
#' printing to console). If \code{style = 'rmarkdown'} is specified, value
#' is set to \code{FALSE} automatically. Can be set globally using
#' \code{\link{st_options}}.
#' @param justify Character. Alignment of numbers in cells; \dQuote{l} for left,
#' \dQuote{c} for center, or \dQuote{r} for right (default). Has no effect on
#' \emph{html} tables.
#' @param headings Logical. Set to \code{FALSE} to omit heading section. Can be
#' set globally via \code{\link{st_options}}. \code{TRUE} by default.
#' @param display.labels Logical. Show variable / data frame labels in heading
#' section. Defaults to \code{TRUE}. Can be set globally with
#' \code{\link{st_options}}.
#' @param split.tables Character. \code{\link[pander]{pander}} argument that
#' specifies how many characters wide a table can be. \code{100} by default.
#' @param weights Numeric. Vector of weights having same length as \emph{x}.
#' \code{NA} (default) indicates that no weights are used.
#' @param rescale.weights Logical. When set to \code{TRUE}, a global constant is
#' apply to make the total count equal \code{nrow(x)}. \code{FALSE} by default.
#' @param \dots Additional arguments passed to \code{\link[pander]{pander}} or
#' \code{\link[base]{format}}.
#'
#' @return An object having classes \dQuote{\emph{matrix}} and
#' \dQuote{\emph{summarytools}} containing the statistics, with extra
#' attributes used by \code{\link{print}} method and \link{view} function.
#'
#' @examples
#' data("exams")
#'
#' # All stats for all numerical variabls
#' descr(exams)
#'
#' # Only common statistics
#' descr(exams, stats = "common")
#'
#' # Arbitrary selection of statistics, transposed
#' descr(exams, stats = c("mean", "sd", "min", "max"), transpose = TRUE)
#'
#' # Rmarkdown-ready
#' descr(exams, plain.ascii = FALSE, style = "rmarkdown")
#'
#' # Grouped statistics
#' data("tobacco")
#' with(tobacco, stby(BMI, gender, descr))
#'
#' # Grouped statistics, transposed
#' with(tobacco, stby(BMI, age.gr, descr, stats = "common", transpose = TRUE))
#'
#' \dontrun{
#' # Show in Viewer (or browser if not in RStudio)
#' view(descr(exams))
#'
#' # Save to html file with title
#' print(descr(exams),
#' file = "descr_exams.html",
#' report.title = "BMI by Age Group",
#' footnote = "<b>Schoolyear:</b> 2018-2019<br/><b>Semester:</b> Fall")
#'
#' }
#'
#' @keywords univar
#' @author Dominic Comtois, \email{dominic.comtois@@gmail.com}
#' @export
#' @importFrom matrixStats weightedMean weightedSd weightedMedian weightedMad
#' @importFrom rapportools skewness kurtosis nvalid
#' @importFrom stats IQR mad median sd quantile
#' @importFrom utils head
#' @importFrom dplyr %>% as_tibble select starts_with summarize_all group_keys
#' @importFrom tidyr separate gather spread
descr <- function(x,
var = NULL,
stats = st_options("descr.stats"),
na.rm = TRUE,
round.digits = st_options("round.digits"),
transpose = st_options("descr.transpose"),
order = "sort",
style = st_options("style"),
plain.ascii = st_options("plain.ascii"),
justify = "r",
headings = st_options("headings"),
display.labels = st_options("display.labels"),
split.tables = 100,
weights = NA,
rescale.weights = FALSE,
...) {
# handle objects of class "grouped_df" (dplyr::group_by)
if (inherits(x, "grouped_df")) {
if ("var" %in% names(match.call())) {
# var might contain a function call -- such as df %>% descr(na.omit(var1))
if (inherits(as.list(match.call()[-1])$var, "call")) {
var_obj <- eval(as.list(match.call()[-1])$var, envir = x)
varname <- intersect(colnames(x),
as.character(as.list(match.call()[-1])$var))
} else {
var_obj <- x[[as.list(match.call()[-1])$var]]
varname <- deparse(substitute(var))
}
} else {
var_obj <- x[ ,setdiff(colnames(x), group_vars(x))]
}
parse_info <- try(
parse_args(sys.calls(), sys.frames(), match.call(),
var_name = (ncol(x) == 1),
var_label = (ncol(x) == 1), caller = "descr"),
silent = TRUE)
outlist <- list()
gr_ks <- map_groups(group_keys(x))
gr_inds <- attr(x, "groups")$.rows
for (g in seq_along(gr_ks)) {
outlist[[g]] <- descr(x = as_tibble(var_obj)[gr_inds[[g]], ],
stats = stats,
na.rm = na.rm,
round.digits = round.digits,
transpose = transpose,
order = order,
style = style,
plain.ascii = plain.ascii,
justify = justify,
headings = headings,
display.labels = display.labels,
split.tables = split.tables,
weights = weights,
rescale.weights = rescale.weights,
... = ...)
if (!inherits(parse_info, "try-error")) {
if (!is.null(parse_info$df_name))
attr(outlist[[g]], "data_info")$Data.frame <- parse_info$df_name
if (!is.null(parse_info$df_label))
attr(outlist[[g]], "data_info")$Data.frame.label <- parse_info$df_label
if (!is.null(parse_info$var_name)) {
attr(outlist[[g]], "data_info")$Variable <- parse_info$var_name
} else if (exists("varname")) {
attr(outlist[[g]], "data_info")$Variable <- varname
if (identical(colnames(outlist[[g]]), "value"))
colnames(outlist[[g]]) <- varname
if (identical(rownames(outlist[[g]]), "value"))
rownames(outlist[[g]]) <- varname
}
if (!is.null(parse_info$var_label))
attr(outlist[[g]], "data_info")$Variable.label <- parse_info$var_label
}
attr(outlist[[g]], "data_info")$by_var <-
setdiff(colnames(attr(x, "groups")), ".rows")
attr(outlist[[g]], "data_info")$Group <- gr_ks[g]
attr(outlist[[g]], "data_info")$by_first <- g == 1
attr(outlist[[g]], "data_info")$by_last <- g == length(gr_ks)
}
if (length(group_vars(x)) == 1 && is.null(dim(var_obj))) {
names(outlist) <- sub(paste(group_vars(x), "= "), "", gr_ks)
} else {
names(outlist) <- gr_ks
}
class(outlist) <- c("stby")
attr(outlist, "groups") <- group_keys(x)
return(outlist)
}
# When var is provided, discard all other variables
if (is.data.frame(x) && ncol(x) > 1 && "var" %in% names(match.call())) {
# var might contain a function call -- such as df %>% descr(na.omit(var1))
if (inherits(as.list(match.call()[-1])$var, "call")) {
x_obj <- eval(as.list(match.call()[-1])$var, envir = x)
varname <- intersect(colnames(x),
as.character(as.list(match.call()[-1])$var))
} else {
x_obj <- x[[as.list(match.call()[-1])$var]]
varname <- deparse(substitute(var))
}
} else {
x_obj <- x
if (!is.null(colnames(x))) {
varname <- colnames(x)
}
}
# Validate arguments -------------------------------------------------------
errmsg <- character() # problems with arguments will be stored here
if (is.null(x)) {
tmp_x_name <- deparse(substitute(x))
stop(tmp_x_name, " is either NULL or does not exist")
}
if (is.atomic(x_obj) && !is.numeric(x_obj)) {
errmsg %+=% "'x' must be numeric"
}
# make x_obj a tibble
if (!inherits(x_obj, "tbl")) {
x.df <- as_tibble(x_obj)
} else {
x.df <- x_obj
}
# Get variable label
if (ncol(x.df) == 1) {
var_label <- label(x.df[[1]])
} else {
var_label <- NA
}
if (!is.data.frame(x.df)) {
errmsg %+=% paste("'x' must be a numeric vector, a data.frame, a tibble,",
"a data.table; attempted conversion to tibble failed")
}
errmsg <- c(errmsg, check_args(match.call(), list(...)))
valid_stats <- list(
no_wgts = c("mean", "sd", "min", "q1", "med", "q3","max", "mad",
"iqr", "cv", "skewness", "se.skewness", "kurtosis",
"n.valid", "pct.valid"),
wgts = c("mean", "sd", "min", "med", "max", "mad", "cv",
"n.valid", "pct.valid")
)
if (identical(stats, "all")) {
stats <- valid_stats[[2 - as.numeric(identical(weights, NA))]]
} else if (identical(stats, "fivenum")) {
if (!identical(weights, NA)) {
errmsg %+=% paste("fivenum is not supported when weights are used; valid",
"stats are:", paste(valid_stats$wgts, collapse = ", "))
}
stats <- c("min", "q1", "med", "q3", "max")
} else if (identical(stats, "common")) {
stats <- c("mean", "sd", "min", "med", "max", "n.valid", "pct.valid")
} else {
stats <- tolower(stats)
invalid_stats <-
setdiff(stats, valid_stats[[2 - as.numeric(identical(weights, NA))]])
if (length(invalid_stats) > 0) {
errmsg %+=%
paste("The following statistics are not recognized, or not allowed: ",
paste(dQuote(invalid_stats), collapse = ", "))
}
}
if (length(errmsg) > 0) {
stop(paste(errmsg, collapse = "\n "))
}
# End of arguments validation ------------------------------------------------
# When style is rmarkdown, make plain.ascii FALSE unless specified explicitly
if (style == "rmarkdown" && isTRUE(plain.ascii) &&
(!"plain.ascii" %in% (names(match.call())))) {
plain.ascii <- FALSE
}
# Get info about x from parsing function
parse_info <- try(
parse_args(sys.calls(), sys.frames(), match.call(),
var_name = (ncol(x.df) == 1),
var_label = (ncol(x.df) == 1), caller = "descr"),
silent = TRUE)
if (inherits(parse_info, "try-error")) {
parse_info <- list()
}
if (!"var_name" %in% names(parse_info)) {
if (exists("varname")) {
parse_info$var_name <- varname
} else {
parse_info$var_name <- colnames(x.df)
}
}
# Identify and exclude non-numerical columns from x
col_to_remove <- which(!vapply(x.df, is.numeric, logical(1)))
if (length(col_to_remove) > 0) {
ignored <- colnames(x.df)[col_to_remove]
x.df <- x.df[-col_to_remove]
order <- setdiff(order, ignored)
parse_info$var_name <- parse_info$var_name[-col_to_remove]
}
if (ncol(x.df) == 0) {
stop("no numerical variables found in ", deparse(match.call()$x))
}
# Verify that the order argument is still valid after column removal
if (length(order) > 1) {
if (length(ind <- which(!colnames(x.df) %in% order)) > 0) {
message("column(s) not specified in 'order' (",
paste(colnames(x.df)[ind], collapse = ", "),
") will appear at the end of the table")
order <- c(order, colnames(x.df)[ind])
}
}
# No weights being used ------------------------------------------------------
if (identical(weights, NA)) {
# Prepare the summarizing functions for dplyr::summarize; there are 3 stats
# that will be calculated later on so to not slow down the function
dummy <- function(x) NA
summar_funs <- list(~ mean(., na.rm = na.rm),
~ sd(., na.rm = na.rm),
~ min(., na.rm = na.rm),
~ quantile(., probs = .25, type = 2, names = FALSE,
na.rm = na.rm),
~ median(., na.rm = na.rm),
~ quantile(., probs = .75, type = 2, names = FALSE,
na.rm = na.rm),
~ max(., na.rm = na.rm),
~ mad(., na.rm = na.rm),
~ IQR(., na.rm = na.rm),
~ dummy(.), # placeholder for cv
~ rapportools::skewness(., na.rm = na.rm),
~ dummy(.), # placeholder for se.skewnes
~ rapportools::kurtosis(., na.rm = na.rm),
~ rapportools::nvalid(., na.rm = na.rm),
~ dummy(.)) # placeholder for pct.valid
fun_names <- c("mean", "sd", "min", "q1", "med", "q3", "max", "mad", "iqr",
"cv", "skewness", "se.skewness", "kurtosis", "n.valid",
"pct.valid")
names(summar_funs) <- fun_names
summar_funs <- summar_funs[which(fun_names %in% stats)]
if (ncol(x.df) > 1) {
results <- suppressWarnings(
x.df %>% summarize_all(.funs = summar_funs) %>%
gather("variable", "value") %>%
separate("variable", c("var", "stat"), sep = "_(?=[^_]*$)") %>%
spread("var", "value")
)
if (identical(order, "preserve")) {
results <- results[ ,c("stat", colnames(x.df))]
} else if (length(order) > 1) {
results <- results[ ,c("stat", order)]
}
# Transform results into output object
output <- as.data.frame(t(results[ ,-1]))
colnames(output) <- results$stat
} else {
output <- x.df %>% summarize_all(.funs = summar_funs, na.rm = na.rm) %>%
as.data.frame
rownames(output) <- parse_info$var_name
}
# Calculate additional stats if needed
if ("cv" %in% stats) {
output$cv <- output$sd / output$mean
}
if ("se.skewness" %in% stats) {
output$se.skewness <-
with(output,
sqrt((6 * n.valid * (n.valid - 1)) /
((n.valid - 2) * (n.valid + 1) * (n.valid + 3))))
}
if ("pct.valid" %in% stats) {
output$pct.valid <- output$n.valid *100 / nrow(x.df)
}
# Apply corrections where n.valid = 0
zerows <- which(output$n.valid == 0)
output[zerows, setdiff(stats, "n.valid")] <- NA
} else {
# Weights being used -------------------------------------------------------
weights_string <- deparse(substitute(weights))
if (sum(is.na(weights)) > 0) {
warning("Missing values on weight variable have been detected and will",
"be treated as zeroes")
weights[is.na(weights)] <- 0
}
# If some weights are 0 or negative, delete rows
zero_wgts <- which(weights <= 0)
if (length(zero_wgts)) {
x.df <- x.df[-zero_wgts, ]
message(length(zero_wgts), " rows with weight <= 0 were deleted")
}
# If weights are in x.df, remove them
if(length(parse_info$df_name) == 1 &&
grepl(parse_info$df_name, weights_string)) {
wgts_vname <- sub(paste0(parse_info$df_name, "\\$"), "", weights_string)
ind <- which(names(x.df) == wgts_vname)
if (length(ind) == 1) {
x.df <- x.df[-ind]
parse_info$var_name <- setdiff(parse_info$var_name, wgts_vname)
}
}
# Build skeleton for output dataframe
output <- data.frame(mean = numeric(),
sd = numeric(),
min = numeric(),
med = numeric(),
max = numeric(),
mad = numeric(),
cv = numeric(),
n.valid = numeric(),
pct.valid = numeric())
# Rescale weights if necessary
if (rescale.weights) {
weights <- weights / sum(weights) * nrow(x.df)
}
for(i in seq_along(x.df)) {
variable <- as.numeric(x.df[[i]])
# Extract number and proportion of missing and valid values
if (any(c("n.valid", "pct.valid") %in% stats)) {
n_valid <- sum(weights[which(!is.na(variable))])
p_valid <- n_valid / sum(weights)
} else {
# calculate n_valid for validation // all missing
n_valid <- sum(!is.na(variable))
p_valid <- NA
}
# Remove missing values from variable and from corresponding weights
if (isTRUE(na.rm)) {
ind <- which(!is.na(variable))
variable <- variable[ind]
weights_tmp <- weights[ind]
}
# Calculate mean and sd if necessary
if (any(c("mean", "cv") %in% stats)) {
variable.mean <- weightedMean(variable, weights_tmp, refine = TRUE,
na.rm = na.rm)
}
if (any(c("sd", "cv") %in% stats)) {
variable.sd <- weightedSd(variable, weights_tmp, na.rm = na.rm)
}
# Calculate and insert stats into output dataframe
output[i, ] <-
c(ifelse("mean" %in% stats, variable.mean, NA),
ifelse("sd" %in% stats, variable.sd, NA),
ifelse("min" %in% stats, min(variable, na.rm = na.rm), NA),
ifelse("med" %in% stats, weightedMedian(variable, weights_tmp,
refine = TRUE,
na.rm = na.rm), NA),
ifelse("max" %in% stats, max(variable, na.rm = na.rm), NA),
ifelse("mad" %in% stats, weightedMad(variable, weights_tmp,
refine = TRUE,
na.rm = na.rm), NA),
ifelse("cv" %in% stats, variable.sd/variable.mean, NA),
ifelse("n.valid" %in% stats, n_valid, NA),
ifelse("pct.valid" %in% stats, p_valid * 100, NA))
}
rownames(output) <- parse_info$var_name
# Apply corrections where n.valid = 0
zerows <- which(output$n.valid == 0)
output[zerows, setdiff(stats, "n.valid")] <- NA
}
# Prepare output data -------------------------------------------------------
# Keep and order required stats from output
output <- output[ ,stats]
# Corrections for special case where nrow = 0
if (nrow(x.df) == 0) {
for (cn in colnames(output)) {
if (cn == "n.valid") {
next
} else if (cn == "pct.valid") {
output[[cn]] <- NaN
} else {
output[[cn]] <- NA
}
}
}
# Apply translations to colnames
for (i in seq_along(output)) {
if (colnames(output)[i] == "sd") {
colnames(output)[i] <- trs("sd.long")
} else {
colnames(output)[i] <- trs(colnames(output)[i])
}
}
# Transpose when transpose is FALSE; even though this is counter-intuitive,
# we prefer that the "vertical" version be the default one and that at the
# same time, the default value for transpose be FALSE.
if (!isTRUE(transpose)) {
output <- t(output)
}
# Set class/attributes
class(output) <- c("summarytools", class(output))
attr(output, "st_type") <- "descr"
attr(output, "date") <- Sys.Date()
attr(output, "fn_call") <- match.call()
attr(output, "stats") <- stats
data_info <-
list(
Data.frame = ifelse("df_name" %in% names(parse_info),
parse_info$df_name, NA),
Data.frame.label = ifelse("df_label" %in% names(parse_info),
parse_info$df_label, NA),
Variable = ifelse("var_name" %in% names(parse_info) &&
length(parse_info$var_name) == 1,
parse_info$var_name, NA),
Variable.label = ifelse("var_label" %in% names(parse_info) &&
length(parse_info$var_label) == 1,
parse_info$var_label,
ifelse(!is.na(var_label), var_label, NA)),
Weights = ifelse(identical(weights, NA), NA,
sub(pattern = paste0(parse_info$df_name, "$"),
replacement = "", x = weights_string,
fixed = TRUE)),
by_var = NA,
Group = ifelse("by_group" %in% names(parse_info),
parse_info$by_group, NA),
by_first = ifelse("by_group" %in% names(parse_info),
parse_info$by_first, NA),
by_last = ifelse("by_group" %in% names(parse_info),
parse_info$by_last, NA),
transposed = transpose,
N.Obs = nrow(x.df))
if ("by_var" %in% names(parse_info)) {
data_info$by_var <- parse_info$by_var
}
attr(output, "data_info") <- data_info[!is.na(data_info)]
attr(output, "format_info") <- list(style = style,
round.digits = round.digits,
plain.ascii = plain.ascii,
justify = justify,
headings = headings,
display.labels = display.labels,
split.tables = split.tables)
if (nrow(x.df) == 0) {
attr(output, "format_info") %+=% list(missing = "N/A")
}
attr(output, "user_fmt") <- list(... = ...)
attr(output, "lang") <- st_options("lang")
if (exists("ignored"))
attr(output, "ignored") <- ignored
return(output)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.