Nothing
#' Calculate common descriptive statistics
#'
#' [describe_data()] returns a set of common descriptive statistics
#' (e.g., number of observations, mean, standard deviation) for one or more
#' numeric variables.
#'
#' @param data A data frame.
#' @param ... One or more unquoted column names from the data frame.
#' @param na.rm A boolean indicating whether missing values (including NaN)
#' should be excluded in calculating the descriptives? The default is TRUE.
#' @param short A boolean indicating whether only a subset of descriptives
#' should be reported? If set to `TRUE``, only the N, M, and SD will be
#' returned. The default is `FALSE`.
#'
#' @details The data can be grouped using [dplyr::group_by()] so that
#' descriptives will be calculated for each group level.
#'
#' Skew and kurtosis are based on the [datawizard::skewness()] and
#' [datawizard::kurtosis()] functions (Komsta & Novomestky, 2015).
#'
#' @examples
#' describe_data(quote_source, response)
#'
#' describe_data(quote_source, response, na.rm = FALSE)
#'
#' quote_source |>
#' dplyr::group_by(source) |>
#' describe_data(response)
#'
#' quote_source |>
#' dplyr::group_by(source) |>
#' describe_data(response, short = TRUE)
#'
#' @export
describe_data <- function(
data, ..., na.rm = TRUE, short = FALSE) {
checkmate::check_data_frame(data)
# Check if the user provided any columns.
if (length(rlang::enquos(...)) == 0) {
stop("No columns found; please provide one or more columns.")
}
# Check if the columns exist in the data frame.
column_names <- as.character(rlang::exprs(...))
if (sum(!column_names %in% names(data)) != 0) {
stop(
paste(
"Did not find the following column(s) in the data:",
paste(column_names[!column_names %in% names(data)], collapse = ", ")
)
)
}
# Check whether the values in the columns are numeric
columns <- dplyr::select(dplyr::ungroup(data), ...)
if (sum(!purrr::map_chr(columns, class) %in% c("numeric", "integer")) > 0) {
stop(
paste(
"The following columns are not numeric:",
paste(
column_names[
!purrr::map_chr(columns, class) %in% c("numeric", "integer")
]
)
)
)
}
grouping <- dplyr::group_vars(data)
data <- dplyr::select(data, dplyr::all_of(grouping), ...)
data <- tidyr::pivot_longer(data,
cols = -dplyr::all_of(grouping),
names_to = "var"
)
# Add the var to the existing grouping
data <- dplyr::group_by(data, var, .add = TRUE)
# Calculate descriptives
output <- data |>
dplyr::summarize(
missing = sum(is.na(value)),
N = dplyr::n() - missing,
M = mean(value, na.rm = na.rm),
SD = sd(value, na.rm = na.rm),
SE = SD / sqrt(N),
min = min(value, na.rm = na.rm),
max = max(value, na.rm = na.rm),
range = diff(range(value, na.rm = na.rm)),
median = median(value, na.rm = na.rm),
mode = unique(value)[which.max(tabulate(match(
value,
unique(value)
)))],
skew = (
sum((value - mean(value, na.rm = na.rm))^3, na.rm = na.rm) / N
) / (
sum((value - mean(value, na.rm = na.rm))^2, na.rm = na.rm) / N
)^(3 / 2),
kurtosis = N * sum((value - mean(value, na.rm = na.rm))^4,
na.rm = na.rm
) / (sum((value - mean(value, na.rm = na.rm))^2, na.rm = na.rm)^2)
)
# Reorder the columns and return only a subset if short was set to TRUE
if (short) {
output <- dplyr::select(
output,
dplyr::all_of(c("var", grouping, "N", "M", "SD"))
)
} else {
output <- dplyr::relocate(output, var, dplyr::all_of(grouping))
}
output <- dplyr::arrange(output, var)
# Add a tidystats class so we can use the tidy_stats() function to parse the
# the output
# Put it at the beginning otherwise we get an error when printing the tibble
class(output) <- c("tidystats_descriptives", class(output))
return(output)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.