#' Create a tidy summary of a data frame
#'
#' Create a summary (small, medium or large) of a data frame.
#'
#' @param x a \code{\link[base:data.frame]{data.frame}} to generate a summary for.
#' Can also be an \link[base:atomic]{atomic} vector (numeric, factor, boolean etc.).
#' @param size how large the summary should be. Either \code{small}, \code{medium}
#' or \code{large} (or simply \code{s}, \code{m} or \code{l}).
#' @param digits the number of decimal places for rounding. Can be
#' an integer or \code{NULL} (no rounding).
#'
#' @return A data frame with one row per column.
#'
#' @details
#'
#' The returned data frame has the following columns:
#'
#' \tabular{ll}{
#' \code{column} \tab Name of the column \cr
#' \code{n} \tab Number of observations \cr
#' \code{class} \tab Data type \cr
#' \code{min} \tab Minimum value \cr
#' \code{max} \tab Maximum value \cr
#' \code{mean} \tab Mean value \cr
#' \code{sd} \tab Standard deviation \cr
#' \code{unique} \tab Number of unique values (includes \code{NA}) \cr
#' \code{median} \tab Median value \cr
#' \code{na} \tab Number of \code{NA}s (missing values) \cr
#' \code{mode} \tab Most frequent value \cr
#' \code{quant25} \tab 25 percent quantile \cr
#' \code{quant75} \tab 75 percent quantile \cr
#' \code{levels} \tab Number of factor levels (only relevant for factors) \cr
#' \code{head} \tab Top (first) value of the data frame \cr
#' \code{tail} \tab Bottom (last) value of the data frame \cr
#' \code{se} \tab Standard error \cr
#' \code{skewness} \tab Skewness \cr
#' \code{kurtosis} \tab Kurtosis \cr
#' \code{bytes} \tab Size in bytes of the memory usage of the column
#'}
#'
#' Note:
#'
#' \itemize{
#' \item For character vectors, the text length will be used to calculate the
#' statistics.
#' \item Missing values will be automatically removed before any statistical
#' summaries are calculated
#' \item Statistical summaries are produced using base R functions with
#' defaults. Kurtosis and skewness is produced using code adapted
#' from the moments package
#' (\url{https://cran.r-project.org/web/packages/moments/index.html}).
#' }
#'
#' @export
#'
#' @examples
#' # Summary for data frame
#' tidysummary(iris)
#' tidysummary(iris, "medium")
#' tidysummary(iris, digits=2)
#'
#' # Summary for atomic vector
#' tidysummary(iris$Sepal.Length, "large")
#'
#' # Convert summary to Markdown
#' output <- tidysummary(iris)
#' to_markdown(output)
#'
#' @seealso
#' \code{\link[tidysummary:to_markdown]{to_markdown()}},
#' \code{\link[tidysummary:to_latex]{to_latex()}},
#' \code{\link[tidysummary:to_html]{to_html()}},
#' \code{\link[tidysummary:to_json]{to_json()}},
#' \code{\link[tidysummary:to_csv]{to_csv()}},
#' \code{\link[tidysummary:to_excel]{to_excel()}}
#'
#' @importFrom rlang :=
tidysummary <- function(x, size="small", digits=2) {
if (is.atomic(x)) {
# Force atomic vectors into a single row data frame
col_name <- deparse(substitute(x))
x <- dplyr::rename(data.frame(var = x), "{col_name}" := "var")
}
validate_input_arguments(x, size, digits)
df <- tidysummary_small(x)
if (base::tolower(size) %in% c("medium", "m")) {
df <- cbind(df, tidysummary_medium(x))
} else if (base::tolower(size) %in% c("large", "l")) {
df <- cbind(df, tidysummary_medium(x))
df <- cbind(df, tidysummary_large(x))
}
if (!is.null(digits)) {
df <- round_digits_dataframe(df, digits)
}
rownames(df) <- NULL
class(df) <- c("tidysummary", "data.frame")
df
}
validate_input_arguments <- function(x, size, digits) {
if (!(is.numeric(x) | is.factor(x) | is.data.frame(x))) {
stop("tidysummary() can't handle ", class(x)[1], " types, only data frames.",
call. = FALSE)
}
if (!(is.null(digits) | is.numeric(digits) && digits >= 0)) {
stop("Incorrect 'digits', must be NULL or 0 and above.", call.=FALSE)
}
if (base::tolower(size) %in% c("xl", "xs")) {
stop('Argument "size" must be small, medium, or large (s, m or l), not "', size, '".\n\n',
"Do you think that there should be extra small or extra large sizes?\n",
"Then submit a feature request at ",
"https://github.com/peterdalle/tidysummary",
call.=FALSE)
}
if (!(base::tolower(size) %in% c("small", "medium", "large", "s", "m", "l"))) {
stop('Argument "size" must be small, medium, or large (s, m or l), not "', size, '".',
call.=FALSE)
}
}
tidysummary_small <- function(x) {
data.frame(
column = names(x),
n = vapply(x, get_rows, numeric(1)),
class = vapply(x, get_class, character(1)),
min = vapply(x, get_min, numeric(1)),
max = vapply(x, get_max, numeric(1)),
mean = vapply(x, get_mean, numeric(1)),
sd = vapply(x, get_stddev, numeric(1)),
unique = vapply(x, get_unique, numeric(1)),
median = vapply(x, get_median, numeric(1)),
na = vapply(x, get_na, numeric(1))
)
}
tidysummary_medium <- function(x) {
data.frame(
mode = vapply(x, get_mode, numeric(1)),
quant25 = vapply(x, get_quant, numeric(1), 2),
quant75 = vapply(x, get_quant, numeric(1), 4),
levels = vapply(x, get_factor_levels_count, numeric(1))
)
}
tidysummary_large <- function(x) {
data.frame(
head = sapply(x, get_head, simplify="array"),
tail = sapply(x, get_tail, simplify="array"),
se = vapply(x, get_stderr, numeric(1)),
skewness = vapply(x, get_skewness, numeric(1)),
kurtosis = vapply(x, get_kurtosis, numeric(1)),
bytes = vapply(x, get_size, numeric(1))
#null = vapply(x, get_null, numeric(1)),
#label = vapply(x, get_label, character(1)),
)
}
round_digits_dataframe <- function(df, digits) {
dplyr::mutate(df, dplyr::across(.fns = function(x) try_round(x, digits=digits)))
}
try_round <- function(x, digits) {
if (is.numeric(x)) {
round(x, digits)
} else {
x
}
}
get_rows <- function(x) {
length(x) - sum(is.na(x))
}
get_class <- function(x) {
class(x)[1]
}
get_size <- function(x) {
utils::object.size(x)
}
get_label <- function(x, max_length=NULL) {
if (is.null(max_length)) {
paste0(attr(x, "label"), collapse=" ")
} else {
substr(paste0(attr(x, "label"), collapse=" "), 1, max_length)
}
}
get_quant <- function(x, part, na.rm=TRUE) {
if (is.numeric(x)) {
stats::quantile(x, names=FALSE, na.rm=na.rm)[part]
} else if (is.character(x)) {
stats::quantile(nchar(x), names=FALSE, na.rm=na.rm)[part]
} else {
NA
}
}
get_head <- function(x, n=1) {
if (length(x) > 0) {
if (is.numeric(x)) {
return(paste(x[1], collapse=" "))
}
}
NA
}
get_tail <- function(x, n=1) {
if (length(x) > 0) {
if (is.numeric(x)) {
return(paste(x[length(x):length(x) - n], collapse=" "))
}
}
NA
}
get_na <- function(x) {
sum(is.na(x))
}
get_null <- function(x) {
sum(is.null(x))
}
get_unique <- function(x) {
length(unique(x))
}
get_factor_levels_count <- function(x) {
if (is.factor(x)) {
length(levels(x))
} else {
NA
}
}
get_min <- function(x, na.rm=TRUE) {
if (is.numeric(x) | is.logical(x) | is_date_time(x)) {
min(x, na.rm=na.rm)
} else if (is.character(x)) {
min(nchar(x), na.rm=na.rm)
} else {
NA
}
}
get_max <- function(x, na.rm=TRUE) {
if (is.numeric(x) | is.logical(x) | is_date_time(x)) {
max(x, na.rm=na.rm)
} else if (is.character(x)) {
max(nchar(x), na.rm=na.rm)
} else {
NA
}
}
is_date_time <- function(value) {
any(class(value) %in% c("Date", "POSIXct", "POSIXt"))
}
get_mean <- function(x, na.rm=TRUE) {
if (is.numeric(x) | is.logical(x)) {
mean(x, na.rm=na.rm)
} else if (is.character(x)) {
mean(nchar(x), na.rm=na.rm)
} else {
NA
}
}
get_median <- function(x, na.rm=TRUE) {
if (is.numeric(x) | is.logical(x) | is_date_time(x)) {
stats::median(x, na.rm=na.rm)
} else if (is.character(x)) {
stats::median(nchar(x), na.rm=na.rm)
} else {
NA
}
}
get_mode <- function(x) {
if (is.numeric(x) | is.logical(x)) {
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
} else {
NA
}
}
get_stddev <- function(x, na.rm=TRUE) {
if (is.numeric(x) | is.logical(x)) {
stats::sd(x, na.rm=na.rm)
} else if (is.character(x)) {
stats::sd(nchar(x), na.rm=na.rm)
} else {
NA
}
}
get_stderr <- function(x, na.rm=TRUE) {
if (is.numeric(x) | is.logical(x)) {
stats::sd(x, na.rm=na.rm) / sqrt(length((x)))
} else {
NA
}
}
# Implemented from the moments package
# https://github.com/cran/moments/blob/master/R/kurtosis.R
get_kurtosis <- function(x, na.rm=TRUE) {
if (is.numeric(x) | is.logical(x)) {
if (na.rm) {
x <- x[!is.na(x)]
}
n <- length(x)
m <- mean(x)
return(n * sum((x - m) ^ 4) / (sum((x - m) ^ 2) ^ 2))
} else {
NA
}
}
# Implemented from the moments package
# https://github.com/cran/moments/blob/master/R/skewness.R
get_skewness <- function(x, na.rm=TRUE) {
if (is.numeric(x) | is.logical(x)) {
if (na.rm) {
x <- x[!is.na(x)]
}
n <- length(x)
m <- mean(x)
return((sum((x - m) ^ 3) / n) / (sum((x - m) ^ 2) / n) ^ (3 / 2))
} else {
NA
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.