# Statistics
# Written by Kevin Potter
# email: kevin.w.potter@gmail.com
# Please email me directly if you
# have any questions or comments
# Last updated 2024-07-05
# Table of contents
# 1) sem
# 2) statistic
# 3) boxcox_transform
# 4) pvalues
# 5) bootstrap
# 6) summa
# 7) bounds
# 8) standardize
# 9) density_points
# 10) yeo_johnson
# 11) yeo_johnson_transform
# 12) principal_components_analysis
# 13) stats_by_group
# 14) pool_over_imputations
# TO DO
# - Add unit tests for 'sem', 'statistic', 'boxcox_transform',
# 'pvalues', 'bootstrap', and 'summa'
#### 1) sem ####
#' Standard Error of the Mean
#'
#' This function calculates the standard error
#' of the mean for a set of numeric values.
#'
#' @param x A numeric vector.
#' @param na.rm Logical; if \code{TRUE}, removes \code{NA}
#' values first.
#'
#' @details Given the standard deviation \eqn{s_x} and sample size
#' \eqn{n} of a sample \eqn{x}, the standard error of the mean is:
#'
#' \deqn{ \frac{s_x}{\sqrt{n}}. }
#'
#' @return The standard error of the mean.
#'
#' @examples
#' # Simulate 100 values from a normal distribution
#' set.seed(2)
#' x <- rnorm(100)
#' # Standard error of the mean should be ~0.1
#' sem(x)
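#' # Equivalent, for a vector with no missing values,
#' # to computing the formula directly
#' sd(x) / sqrt(length(x))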
#' @export
sem <- function( x,
na.rm = TRUE) {
# Remove missing values
if (na.rm) {
x <- x[!is.na(x)]
# Close 'Remove missing values'
}
return(sd(x) / sqrt(length(x)))
}
#### 2) statistic ####
#' Compute a Statistic
#'
#' This function robustly computes a statistic
#' over a vector of values. The user can
#' specify what to return if the vector
#' is empty, values to include or exclude,
#' and how to treat missing values. Useful in
#' combination with functions like
#' \code{\link[stats]{aggregate}} or
#' \code{\link[dplyr:summarise]{dplyr's summarise}}.
#'
#' @param x A vector.
#' @param f A function that takes the vector
#' \code{x} as its first argument.
#' @param include A logical vector matching
#' the length of the vector \code{x}.
#' @param exclude A vector of unique cases
#' to match and exclude.
#' @param na.rm Logical; if \code{TRUE}
#' removes \code{NA} values from \code{x}.
#' @param default The default value to
#' return if \code{x} is empty.
#' @param ... Additional arguments to
#' pass to the function \code{f}.
#'
#' @return A computed statistic, output from
#' the user-supplied function.
#'
#' @examples
#' # Examples using the 'iris' data set
#' data("iris")
#'
#' # Default
#' statistic(iris$Sepal.Length)
#' # User-specified statistic
#' statistic(iris$Sepal.Length, f = mean)
#' # Include subset of cases
#' statistic(iris$Sepal.Length,
#' f = mean,
#' include = iris$Species == "setosa"
#' )
#' # Custom function
#' statistic(iris$Species, f = function(x) mean(x %in% "setosa"))
#' # Exclude unique cases
#' statistic(iris$Species,
#' f = function(x) mean(x %in% "setosa"),
#' exclude = "virginica"
#' )
#' # If empty vector supplied, user-specified default value returned
#' statistic(iris$wrong_name, default = 0)
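#' # Additional arguments are passed on to 'f'
#' # (e.g., the 'trim' argument of 'mean')
#' statistic(iris$Sepal.Length, f = mean, trim = .1)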
#' @export
statistic <- function( x,
f = length,
include = NULL,
exclude = NULL,
na.rm = TRUE,
default = NA,
... ) {
# Initialize output
out <- default
# If there is any data
if (length(x) > 0) {
# If no logical vector is provided
if (is.null(include)) {
include <- rep(TRUE, length(x))
# Close 'If no logical vector is provided'
}
# If categories to exclude are included
if (!is.null(exclude)) {
include <- include & !(x %in% exclude)
# Close 'If categories to exclude are included'
}
# If NA values should be removed
if (na.rm) {
include <- include & !is.na(x)
# Close 'If NA values should be removed'
} else {
include <- include | is.na(x)
# Close 'If NA values should be removed'
}
# If any data remains apply function
if (any(include)) {
out <- f(x[include], ...)
# Close 'If any data remains apply function'
}
# Close 'If there is any data'
}
return(out)
}
#### 3) boxcox_transform ####
#' Box-Cox Transformations for Linear Model
#'
#' Wrapper function to the \code{\link[MASS]{boxcox}}
#' function; estimates the optimal parameter
#' for the Box-Cox power transformation and
#' then applies the appropriate power transformation
#' to the outcome variable.
#'
#' @param x Either
#' \itemize{
#' \item A numeric vector of positive values;
#' \item A data frame with an outcome variable and
#' set of predictors for a linear regression.
#' }
#' @param outcome An optional character string with
#' the column name for the outcome variable in \code{x}.
#' Otherwise, the first column in \code{x} is assumed
#' to be the outcome variable.
#' @param parameter_grid Vector specifying the grid of
#' parameters to explore for maximum likelihood estimation.
#' @param output Logical; if \code{TRUE}, returns the
#' outcome variable following the power transformation.
#'
#' @details The Box-Cox power transformation for a vector of
#' positive values \eqn{x} and parameter \eqn{\lambda} is:
#'
#' \deqn{ f(x) = \frac{x^{\lambda} - 1}{\lambda}. }
#'
#' For \eqn{\lambda = 0}, one simply uses the log transform.
#'
#' @return Either the maximum likelihood estimate for
#' the power transformation parameter or a vector
#' for the outcome variable following the transformation.
#'
#' @examples
#' # Simulate 100 values from the normal distribution
#' set.seed(3)
#' z <- rnorm(100)
#' # Transform the simulated values
#' x <- (z * .5 + 1)^(1 / .5)
#' # Maximum likelihood estimate for
#' # transformation parameter
#' boxcox_transform(x, output = FALSE)
#'
#' # Histogram of x, transform of x, and
#' # original simulated values
#' layout(rbind(1, 2, 3))
#' hist(x)
#' hist(boxcox_transform(x))
#' hist(z)
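#' # The maximum likelihood estimate is also stored as an
#' # attribute of the transformed values
#' attributes(boxcox_transform(x))$boxcox_parameter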
#' @export
boxcox_transform <- function( x,
outcome = NULL,
parameter_grid = seq(-3, 3, .01),
output = TRUE ) {
# If data frame with predictors is provided
if (is.data.frame(x)) {
# Prep output and formula for lm call
if (is.null(outcome)) {
frm <- paste0(
colnames(x)[1], " ~ ."
)
transformed_x <- x[[colnames(x)[1]]]
# Close 'Prep output and formula for lm call'
} else {
frm <- paste0(outcome, " ~ .")
transformed_x <- x[[outcome]]
# Close else for 'Prep output and formula for lm call'
}
# Maximum likelihood estimation via grid search
# of the Box-Cox power transformation parameter
xy <- MASS::boxcox(as.formula(frm),
data = x,
plotit = FALSE,
lambda = parameter_grid
)
# Extract maximum likelihood estimate
param <- xy$x[which.max(xy$y)]
# Close 'If data frame with predictors is provided'
} else {
# Convert to data frame to avoid scoping issues
dtf <- data.frame(
outcome = x
)
transformed_x <- x
# Maximum likelihood estimation via grid search
# of the Box-Cox power transformation parameter
xy <- MASS::boxcox(outcome ~ 1,
data = dtf,
plotit = FALSE,
lambda = parameter_grid
)
# Extract maximum likelihood estimate
param <- xy$x[which.max(xy$y)]
# Close else for 'If data frame with predictors is provided'
}
# If returning transformed values
if (output) {
# If power transformation
if (param != 0) {
# Power transformation
transformed_x <- (transformed_x^param - 1) / param
# Close 'If power transformation'
} else {
# Log-transform
transformed_x <- log(transformed_x)
# Close else for 'If power transformation'
}
# Add maximum likelihood estimate as attribute
attributes(transformed_x) <- list(
boxcox_parameter = param
)
out <- transformed_x
# Close 'If returning transformed values'
} else {
# Return maximum likelihood estimate
out <- param
# Close else for 'If returning transformed values'
}
return(out)
}
#### 4) pvalues ####
#' Compute and Format P-values
#'
#' Given a set of Monte Carlo
#' samples, estimates a p-value from
#' the proportion of values that fall
#' above or below a comparison point.
#' If \code{string} is \code{TRUE},
#' takes a numeric p-value and converts
#' it into a formatted character
#' string, either 'p = ...' or
#' 'p < ...'.
#'
#' @param x Either a) a vector of numeric
#' values (Monte Carlo samples) or b)
#' a single p-value.
#' @param comparison The comparison value;
#' the p-value is computed from the
#' proportion of Monte Carlo samples
#' above or below this cut-off.
#' @param alternative A character string
#' indicating the type of alternative
#' hypothesis to test, either a) \code{'two-sided'},
#' b) \code{'less'}, or c) \code{'greater'}.
#' If \code{'two-sided'}, uses the alternative
#' hypothesis that produces the smaller p-value,
#' and then multiplies by two to adjust for
#' the two-sided comparison.
#' @param digits Number of digits to round to
#' when formatting the p-value.
#' @param string Logical; if \code{TRUE},
#' returns output as a nicely formatted
#' character string. Automatically set
#' to \code{TRUE} if length of \code{x}
#' equals 1.
#' @param pad Logical; if \code{TRUE},
#' pads number of values to right
#' of decimal to always equal \code{digits}.
#'
#' @return Either a numeric p-value or a character
#' string, a nicely formatted version of the
#' p-value.
#'
#' @examples
#' # Example based on two-sample t-test
#' set.seed(40)
#' x <- data.frame(
#' y = c(rnorm(50), rnorm(50, mean = .3)),
#' group = rep(1:2, each = 50)
#' )
#'
#' # Two-sample t-test
#' tt <- t.test(y ~ group, data = x)
#' print(pvalues(tt$p.value))
#' print(pvalues(tt$p.value, digits = 2))
#'
#' # For very small p-values, automatically
#' # converts to 'p < cut-off' format
#' print(pvalues(1e-6))
#'
#' # Computing p-values from
#' # Monte Carlo samples
#'
#' # Simulate data from standard normal;
#' # on average 50% of sample falls
#' # below zero
#' set.seed(50)
#' x <- rnorm(1000)
#'
#' # Default is two-sided
#' pvalues(x)
#' # Can specify less than or greater than
#' pvalues(x, alternative = "less")
#' pvalues(x, alternative = "greater")
#' # Against different comparison point
#' pvalues(x, alternative = "less", comparison = .68)
#'
#' # Simulate data from normal distribution
#' # with mean of 0.68, on average
#' # approximately 75% of sample falls
#' # below zero
#' set.seed(60)
#' x <- rnorm(1000, mean = .68)
#' pvalues(x)
#' pvalues(x, alternative = "less")
#' pvalues(x, alternative = "greater")
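#' # Illustrative check: the two-sided p-value here equals
#' # twice the proportion of samples below the comparison
#' # point (the tail opposite the bulk of the samples)
#' 2 * mean(x < 0)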
#' @export
pvalues <- function( x,
comparison = 0,
alternative = "two-sided",
digits = 3,
string = FALSE,
pad = FALSE ) {
# Initialize output
out <- NULL
# Assume if size of x equals 1 that
# it is a numeric p-value
# Check length
if (length(x) == 1) {
# Check if valid p-value
if (x >= 0 &
x <= 1) {
# Force argument 'string' to be TRUE
string <- TRUE
# Close 'Check if valid p-value'
} else {
stop("p-values must be between 0 and 1",
call. = FALSE
)
# Close else for 'Check if valid p-value'
}
# Close 'Check length'
}
# Estimate p-value from Monte Carlo samples
if (!string) {
check <- FALSE
# Two-sided test
if (alternative == "two-sided") {
check <- TRUE
# Lower tail
if (median(x) > comparison) {
out <- mean(x < comparison)
# Close 'Lower tail'
} else {
out <- mean(x > comparison)
# Close else for 'Lower tail'
}
out <- out * 2
# Close 'Two-sided test'
}
# Test if greater than comparison
if (alternative == "greater") {
check <- TRUE
out <- mean(x < comparison)
# Close 'Test if greater than comparison'
}
# Test if less than comparison
if (alternative == "less") {
check <- TRUE
out <- mean(x > comparison)
# Close 'Test if less than comparison'
}
# If alternative argument misspecified
if (!check) {
err_msg <- paste(
"Please specify 'alternative' as",
"either 'two-sided', 'greater', or",
"'less'."
)
stop(err_msg)
# Close 'If alternative argument misspecified'
}
# Close 'Estimate p-value from Monte Carlo samples'
}
# Convert to formatted string
if (string) {
# Pad end
if (pad) {
p <- format(round(x, digits = digits), nsmall = digits)
# Close 'Pad end'
} else {
p <- format(round(x, digits = digits))
# Close else for 'Pad end'
}
# If rounds to zero
if ( round(x, digits) == 0 ) {
# Specify as less than next smallest value
nd <- digits - 1
nd <- paste(
"0.",
paste(rep(0, nd), collapse = ""),
"1",
sep = ""
)
out <- paste("p < ", nd, sep = "")
# Close 'If rounds to zero'
} else {
out <- paste0("p = ", p)
# Close else for 'If rounds to zero'
}
# Close 'Convert to formatted string'
}
return(out)
}
#### 5) bootstrap ####
#' Non-Parametric Bootstrap
#'
#' Computes a test statistic over multiple
#' replicates from a set of data. Replicates
#' are created by drawing a sample of the same
#' size, with replacement, from the original
#' set of data.
#'
#' @param x A vector of values.
#' @param t_x A function to compute a test statistic;
#' the first argument must be for \code{x}.
#' @param N The number of samples with replacement
#' to draw from \code{x}.
#' @param summary An optional function to apply
#' to the test statistics after resampling.
#' @param ... Additional parameters for the \code{t_x}
#' function.
#'
#' @return A list consisting of...
#' \itemize{
#' \item observed = the value of the test statistic
#' for the original data set.
#' \item replicates = the values of the test statistic
#' for each of the replicates produced via resampling.
#' \item summary = the output of summary function
#' applied over the replicates; \code{NULL} if
#' no summary function was specified.
#' }
#'
#' @examples
#' # Simulate from normal distribution
#' set.seed(200)
#' x <- rnorm(50, mean = 100, sd = 15)
#'
#' # Use bootstrap method to estimate
#' # sampling distribution for the mean
#' btstrp <- bootstrap(x)
#'
#' hist(btstrp$replicates,
#' main = "",
#' xlab = "Sampling distribution - mean"
#' )
#' # True mean
#' abline(v = 100, lwd = 1)
#'
#' # Estimate of standard error
#' print(round(sem(x), 1))
#'
#' # Estimate of standard error
#' # computed from bootstrapped samples
#' btstrp <- bootstrap(x, summary = sd)
#' print(round(btstrp$summary, 1))
#'
#' # 95% confidence interval around the mean
#' # using bootstrapped samples
#' f <- function(y) {
#' paste(round(quantile(y, c(.025, .975)), 1),
#' collapse = " to "
#' )
#' }
#' btstrp <- bootstrap(x, summary = f)
#' print(btstrp$summary)
#'
#' # Use bootstrap method to estimate
#' # sampling distribution for the median
#' # (which has no closed-form solution)
#' btstrp <- bootstrap(x, t_x = median)
#' hist(btstrp$replicates,
#' main = "",
#' xlab = "Sampling distribution - median"
#' )
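#' # For illustration, a single replicate is simply the test
#' # statistic applied to a resample of the same size drawn
#' # with replacement
#' mean(sample(x, size = length(x), replace = TRUE))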
#' @export
bootstrap <- function(x, t_x = mean, N = 1000,
summary = NULL, ...) {
n <- length(x)
indices <- lapply(1:N, function(i) {
sample(1:n, size = n, replace = TRUE)
})
monte_carlo_samples <- sapply(
indices, function(index) {
t_x(x[index], ...)
}
)
out <- list(
observed = t_x(x, ...),
replicates = monte_carlo_samples
)
if (!is.null(summary)) {
out$summary <-
summary(out$replicates)
}
return(out)
}
#### 6) summa ####
#' Flexible Formatted Summary Statistics
#'
#' A function that allows users to create
#' a nicely formatted character string
#' with summary statistics based on
#' user-supplied identifiers via a
#' simple, intuitive syntax.
#'
#' @param x A vector of values.
#' @param syntax A character string with
#' identifiers in the form \code{[[.]]}
#' (where \code{.} can be a variety of
#' letter sets for different summary statistics) -
#' the function then substitutes the appropriate
#' computed value for the corresponding
#' identifier (see details for more information).
#' @param categories An optional vector of
#' elements to match in \code{x} when
#' computing frequencies, proportions,
#' or percentages.
#' @param digits Number of digits to round
#' summary statistics.
#' @param na.rm Logical; if \code{TRUE}
#' removes \code{NA} values from \code{x}.
#' @param pad Logical; if \code{TRUE} pads
#' values with zeros so they all have a matching
#' number of decimal places.
#' @param f An optional user-defined function
#' that takes \code{x} as a first argument
#' and returns a vector of values.
#' The i-th outputted value will then
#' be substituted for the corresponding
#' identifier \code{[[i]]} (see examples).
#' @param ... Additional arguments for the
#' user-defined function \code{f}.
#'
#' @details This function provides some simple syntax
#' to allow users to write out a custom phrase for
#' reporting summary statistics.
#' The function then searches the input for
#' identifiers - once found, the function computes
#' the appropriate summary statistic and
#' substitutes the numeric result in place of
#' the given identifier.
#'
#' For example, a user can provide the phrase:
#'
#' \code{'Mean = [[M]]'},
#'
#' and the function will then substitute the sample
#' mean of the vector \code{x} for the identifier
#' \code{[[M]]}.
#'
#' Pre-defined identifiers are:
#' \itemize{
#' \item \code{[[N]]} = Sample size;
#' \item \code{[[M]]} = Mean;
#' \item \code{[[SD]]} = Standard deviation;
#' \item \code{[[SE]]} = Standard error of the mean;
#' \item \code{[[Mn]]} = Minimum;
#' \item \code{[[Q1]]} = 1st quartile;
#' \item \code{[[Md]]} = Median;
#' \item \code{[[Q3]]} = 3rd quartile;
#' \item \code{[[Mx]]} = Maximum;
#' \item \code{[[IQR]]} = Inter-quartile range;
#' \item \code{[[C]]} = Counts/frequencies;
#' \item \code{[[P]]} = Percent;
#' \item \code{[[Pr]]} = Proportion;
#' \item \code{[[NAC]]} = Count of \code{NA} values;
#' \item \code{[[NAP]]} = Percent of \code{NA} values;
#' \item \code{[[NAPr]]} = Proportion of \code{NA} values.
#' }
#'
#' Users can also pass in a custom function \code{f}
#' that takes \code{x} as a first argument and
#' returns a vector of values. Then element \code{i}
#' from the outputted vector is substituted for
#' the identifier \code{[[i]]}.
#'
#' @return A character string.
#'
#' @examples
#' # Example using 'iris' data set
#' data("iris")
#' # Continuous variable - sepal length
#' x <- iris$Sepal.Length
#'
#' # Mean and standard deviation
#' summa(x)
#' # Median and IQR
#' summa(x, "[[M]] ([[IQR]])")
#' # Pad to 2 decimal places
#' summa(x, "[[M]] ([[IQR]])", pad = TRUE)
#' # Mean (SD); N [min and max]
#' summa(x, "[[N]]; [[M]] ([[SD]]); " %p%
#' "[[[Mn]], [[Q1]], [[Md]], [[Q3]], [[Mx]]]",
#' digits = 1
#' )
#'
#' # Custom measures via user-defined function
#' # (e.g., bootstrapped confidence interval)
#' fnc <- function(x) {
#' btstrp <- bootstrap(
#' x,
#' summary = function(y) quantile(y, c(.025, .975))
#' )
#' return(btstrp$summary)
#' }
#' summa(x, "[[M]] ([[SE]]) [[[1]] to [[2]]]",
#' f = fnc
#' )
#'
#' # Example using 'mtcars' data set
#' # Categorical variable - # of forward gears
#' data("mtcars")
#' x <- mtcars$gear
#'
#' # Percent and counts for 3 forward gears
#' summa(x == 3, "[[P]]% ([[C]] out of [[N]])")
#' # Percent and counts for 4 or 5 forward gears
#' summa(x, "[[P]]% ([[C]] out of [[N]])",
#' categories = c(4, 5)
#' )
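#' # Counts and percentages of missing values via the
#' # [[NAC]] and [[NAP]] identifiers
#' summa(c(x, NA, NA), "[[NAC]] missing ([[NAP]]%)")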
#' @export
summa <- function(x, syntax = "[[M]] ([[SD]])",
categories = NULL,
digits = NULL, na.rm = TRUE,
pad = FALSE,
f = NULL,
... ) {
# Initialize output
out <- syntax
# Sample size
if (grepl("[[N]]", syntax, fixed = TRUE)) {
n <- length(x)
if (na.rm) n <- length(x[!is.na(x)])
out <- gsub("[[N]]", n, out, fixed = TRUE)
# Close 'Sample size'
}
# Mean
if (grepl("[[M]]", syntax, fixed = TRUE)) {
# Default number of digits to round to
if (is.null(digits)) {
dgt <- 2
} else {
dgt <- digits
}
m <- round(mean(x, na.rm = na.rm), dgt)
if (pad) {
m <- format(m, nsmall = dgt)
}
out <- gsub("[[M]]", m, out, fixed = TRUE)
# Close 'Mean'
}
# Standard deviation
if (grepl("[[SD]]", syntax, fixed = TRUE)) {
# Default number of digits to round to
if (is.null(digits)) {
dgt <- 2
} else {
dgt <- digits
}
s <- round(sd(x, na.rm = na.rm), dgt)
if (pad) {
s <- format(s, nsmall = dgt)
}
out <- gsub("[[SD]]", s, out, fixed = TRUE)
# Close 'Standard deviation'
}
# Standard error
if (grepl("[[SE]]", syntax, fixed = TRUE)) {
# Default number of digits to round to
if (is.null(digits)) {
dgt <- 2
} else {
dgt <- digits
}
n <- length(x)
if (na.rm) n <- length(x[!is.na(x)])
se <- round(sd(x, na.rm = na.rm) / sqrt(n), dgt)
if (pad) {
se <- format(se, nsmall = dgt)
}
out <- gsub("[[SE]]", se, out, fixed = TRUE)
# Close 'Standard error'
}
# Minimum
if (grepl("[[Mn]]", syntax, fixed = TRUE)) {
# Default number of digits to round to
if (is.null(digits)) {
dgt <- 2
} else {
dgt <- digits
}
mn <- round(min(x, na.rm = na.rm), dgt)
if (pad) {
mn <- format(mn, nsmall = dgt)
}
out <- gsub("[[Mn]]", mn, out, fixed = TRUE)
# Close 'Minimum'
}
# 1st quartile
if ( grepl("[[Q1]]", syntax, fixed = TRUE) ) {
# Default number of digits to round to
if (is.null(digits)) {
dgt <- 2
} else {
dgt <- digits
}
q1 <- round(quantile(x, prob = .25, na.rm = na.rm), dgt)
if (pad) {
q1 <- format(q1, nsmall = dgt)
}
out <- gsub("[[Q1]]", q1, out, fixed = TRUE)
# Close '1st quartile'
}
# Median
if (grepl("[[Md]]", syntax, fixed = TRUE)) {
# Default number of digits to round to
if (is.null(digits)) {
dgt <- 2
} else {
dgt <- digits
}
md <- round(median(x, na.rm = na.rm), dgt)
if (pad) {
md <- format(md, nsmall = dgt)
}
out <- gsub("[[Md]]", md, out, fixed = TRUE)
# Close 'Median'
}
# 3rd quartile
if (grepl("[[Q3]]", syntax, fixed = TRUE)) {
# Default number of digits to round to
if (is.null(digits)) {
dgt <- 2
} else {
dgt <- digits
}
q3 <- round(quantile(x, prob = .75, na.rm = na.rm), dgt)
if (pad) {
q3 <- format(q3, nsmall = dgt)
}
out <- gsub("[[Q3]]", q3, out, fixed = TRUE)
# Close '3rd quartile'
}
# Maximum
if (grepl("[[Mx]]", syntax, fixed = TRUE)) {
# Default number of digits to round to
if (is.null(digits)) {
dgt <- 2
} else {
dgt <- digits
}
mx <- round(max(x, na.rm = na.rm), dgt)
if (pad) {
mx <- format(mx, nsmall = dgt)
}
out <- gsub("[[Mx]]", mx, out, fixed = TRUE)
# Close 'Maximum'
}
# Inter-quartile range
if (grepl("[[IQR]]", syntax, fixed = TRUE)) {
# Default number of digits to round to
if (is.null(digits)) {
dgt <- 2
} else {
dgt <- digits
}
q <- quantile(x, prob = c(.25, .75), na.rm = na.rm)
iqr <- round(diff(q), dgt)
if (pad) {
iqr <- format(iqr, nsmall = dgt)
}
out <- gsub("[[IQR]]", iqr, out, fixed = TRUE)
# Close 'Inter-quartile range'
}
# Counts/frequencies
if (grepl("[[C]]", syntax, fixed = TRUE)) {
if (is.null(categories)) {
categories <- TRUE
}
cnt <- sum(x %in% categories, na.rm = na.rm)
out <- gsub("[[C]]", cnt, out, fixed = TRUE)
# Close 'Counts/frequencies'
}
# Percent
if (grepl("[[P]]", syntax, fixed = TRUE)) {
# Default number of digits to round to
if (is.null(digits)) {
dgt <- 1
} else {
dgt <- digits
}
if (is.null(categories)) {
categories <- TRUE
}
per <- 100 * mean(x %in% categories, na.rm = na.rm)
per <- round(per, dgt)
if (pad) {
per <- format(per, nsmall = dgt)
}
out <- gsub("[[P]]", per, out, fixed = TRUE)
# Close 'Percent'
}
# Proportion
if (grepl("[[Pr]]", syntax, fixed = TRUE)) {
# Default number of digits to round to
if (is.null(digits)) {
dgt <- 2
} else {
dgt <- digits
}
if (is.null(categories)) {
categories <- TRUE
}
prp <- round(mean(x %in% categories, na.rm = na.rm), dgt)
if (pad) {
prp <- format(prp, nsmall = dgt)
}
out <- gsub("[[Pr]]", prp, out, fixed = TRUE)
# Close 'Proportion'
}
# Counts/frequencies for NA values
if (grepl("[[NAC]]", syntax, fixed = TRUE)) {
if (is.null(categories)) {
categories <- TRUE
}
cnt <- sum( is.na(x) )
out <- gsub("[[NAC]]", cnt, out, fixed = TRUE)
# Close 'Counts/frequencies for NA values'
}
# Percentage of NA values
if (grepl("[[NAP]]", syntax, fixed = TRUE)) {
# Default number of digits to round to
if (is.null(digits)) {
dgt <- 1
} else {
dgt <- digits
}
per <- 100 * mean( is.na(x) )
per <- round(per, dgt)
if (pad) {
per <- format(per, nsmall = dgt)
}
out <- gsub("[[NAP]]", per, out, fixed = TRUE)
# Close 'Percentage of NA values'
}
# Proportion of NA values
if (grepl("[[NAPr]]", syntax, fixed = TRUE)) {
# Default number of digits to round to
if (is.null(digits)) {
dgt <- 2
} else {
dgt <- digits
}
prp <- round(mean(is.na(x)), dgt)
if (pad) {
prp <- format(prp, nsmall = dgt)
}
out <- gsub("[[NAPr]]", prp, out, fixed = TRUE)
# Close 'Proportion of NA values'
}
# Custom function
if (!is.null(f)) {
val <- f(x, ...)
for (i in 1:length(val)) {
if (is.null(digits)) {
dgt <- 2
} else {
dgt <- digits
}
val[i] <- round(val[i], dgt)
if (pad) {
val[i] <- format(val[i], nsmall = dgt)
}
out <- gsub(paste0("[[", i, "]]"),
val[i], out,
fixed = TRUE
)
}
# Close conditional 'Custom function'
}
return(out)
}
#### 7) bounds ####
#' Lower and Upper Limits for Confidence or Credible Intervals
#'
#' Given a desired width, computes the lower and upper limit
#' for a confidence or credible interval. Can be combined
#' with functions like \code{\link[stats]{quantile}} or
#' the quantile functions for assorted probability
#' distributions (e.g., \code{\link[stats:Normal]{qnorm}},
#' \code{\link[stats:Beta]{qbeta}}, etc.).
#'
#' @param width The width of the interval, a numeric value
#' between 0 and 1.
#'
#' @return A vector with the lower and upper limits.
#'
#' @examples
#' # Lower and upper limits of 95% interval
#' bounds( .95 )
#' # Example data
#' x <- rnorm( 100, mean = 100, sd = 15 )
#' # 95% confidence interval around mean
#' mean(x) + qnorm( bounds( .95 ) ) * sem( x )
#' # Predicted
#' qnorm( bounds( .95 ), mean = 100, sd = 15/sqrt(100) )
#' # The 1st and 3rd quartiles
#' quantile( x, bounds( .5 ) )
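#' # Works with any quantile function, e.g., the beta distribution
#' qbeta( bounds( .9 ), 2, 2 )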
#'
#' @export
bounds <- function( width ) {
return( .5 + c( -.5, .5 )*width )
}
#### 8) standardize ####
#' Standardize Columns in a Data Frame or Matrix
#'
#' Function to standardize (mean-center and scale
#' by standard deviation resulting in a mean of 0
#' and standard deviation of 1) columns in a
#' matrix or data frame.
#'
#' @param x A data frame or matrix of numeric values.
#' @param y A data frame or matrix of numeric values
#' (must have the same column names in same order
#' as \code{x}).
#' @param mean_sd A list of two numeric vectors equal in
#' length to the number of columns with the means and
#' standard deviations, respectively, to use for scaling.
#' @param raw Logical; if \code{TRUE}, uses the means
#' and standard deviations given in \code{mean_sd}
#' to return the original raw values of \code{x}.
#' @param as_list Logical; if \code{TRUE} returns
#' a named list with the scaled values of \code{x}
#' (and \code{y} if provided) along with the means and
#' standard deviations used for scaling. Automatically
#' set to \code{TRUE} when \code{y} is provided.
#' @param labels A character vector with the labels for
#' the \code{x} and \code{y} data sets if returning a
#' list.
#'
#' @returns Either a scaled data frame or matrix or a list
#' with the scaled values and the means and standard deviations
#' used for scaling.
#'
#' @examples
#' # Create data frame
#' x_raw <- round( matrix( rnorm( 9, 100, 15 ), 3, 3 ) )
#' colnames(x_raw) <- paste0( 'X', 1:3 )
#' print(x_raw)
#'
#' # Standardize columns
#' x <- standardize( x_raw )
#' print(x)
#'
#' # Create second data frame with same
#' # variables but new values
#' y_raw <- round( matrix( rnorm( 9, 50, 15 ), 3, 3 ) )
#' colnames(y_raw) <- paste0( 'X', 1:3 )
#' print(y_raw)
#'
#' # Scale columns of y_raw based on means and
#' # standard deviations from x_raw
#' lst <- standardize( x_raw, y_raw, labels = c('x', 'y') )
#' y <- lst$Data$y
#' print( y )
#'
#' # Undo scaling
#' standardize( y, mean_sd = lst$Scaling, raw = TRUE )
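#' # Columns of the standardized data have mean 0 and SD 1
#' round( apply( x, 2, mean ), 8 )
#' apply( x, 2, sd )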
#'
#' @export
standardize <- function( x,
y = NULL,
mean_sd = NULL,
raw = FALSE,
as_list = FALSE,
labels = c( 'X', 'Y' ) ) {
# If scaling not provided
if ( is.null( mean_sd ) ) {
M <- apply(
x, 2, mean, na.rm = T
)
SD <- apply(
x, 2, sd, na.rm = T
)
# Close 'If scaling not provided'
} else {
M <- mean_sd[[1]]
SD <- mean_sd[[2]]
# Close else for 'If scaling not provided'
}
# If scaling should be undone
if ( raw ) {
# If scaling provided
if ( !is.null( mean_sd ) ) {
x.raw <- x
# Loop over columns
for ( k in 1:ncol(x) ) {
x.raw[, k] <- x[, k]*SD[k] + M[k]
# Close 'Loop over columns'
}
# Close 'If scaling provided'
} else {
stop(
paste0(
"Must provide list with means and standard deviations ",
"in order to compute original raw values"
)
)
# Close else for 'If scaling provided'
}
return( x.raw )
# Close 'If scaling should be undone'
}
# Initialize scaled data sets
z.x <- x
z.y <- NULL
# If second data set provided
if ( !is.null(y) ) {
# Check if same number of columns
if ( ncol(y) != ncol(x) ) {
stop( "Data set 'y' must have same number of columns as 'x'" )
# Close 'Check if same number of columns'
}
# Check if same column names
if ( any( colnames(y) != colnames(x) ) ) {
stop( "Data set 'y' should have same column names as 'x'" )
# Close 'Check if same column names'
}
z.y <- y
# Close 'If second data set provided'
}
# Loop over columns
for ( k in 1:ncol(x) ) {
z.x[, k] <- ( x[, k] - M[k] )/SD[k]
# If second data set provided
if ( !is.null(y) ) {
z.y[, k] <- ( y[, k] - M[k] )/SD[k]
# Close 'If second data set provided'
}
# Close 'Loop over columns'
}
# Return list given two data sets
if ( !is.null(y) ) as_list <- TRUE
# Return list of outputs
if ( as_list ) {
lst_output <- list(
Data = list(
X = z.x,
Y = z.y
),
Scaling = list(
Mean = M,
SD = SD
)
)
names( lst_output$Data ) <- labels
return( lst_output )
# Close 'Return list of outputs'
}
return( z.x )
}
#### 9) density_points ####
#' Estimate Densities for Individual Observations
#'
#' Given a vector of values, computes the empirical
#' density for each observation.
#'
#' @param x A numeric vector.
#' @param ... Additional arguments to be passed
#' to the density function.
#'
#' @return A list with...
#' \itemize{
#' \item x = the sorted values for the original input;
#' \item y = the associated empirical densities.
#' }
#'
#' @examples
#' plot( c(-4, 4), c(0, .5), type = 'n', ylab = 'Density', xlab = 'z-scores' )
#' x <- rnorm( 100 )
#' dp <- density_points( x )
#' points( dp$x, dp$y, pch = 19 )
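#' # Additional arguments are passed to 'density',
#' # e.g., a wider bandwidth via 'adjust'
#' dp_wide <- density_points( x, adjust = 2 )
#' points( dp_wide$x, dp_wide$y, pch = 1 )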
#'
#' @export
density_points <- function(x, ...) {
# Kernel density estimate over the range of 'x'
ed <- density(x, ...)
# Interpolating function for the estimated density
af <- approxfun(ed)
# Density at each sorted observation
y <- af(sort(x))
return( list(x = sort(x), y = y) )
}
}
#### 10) yeo_johnson ####
#' Yeo-Johnson Transform
#'
#' Transforms a numeric vector of values using
#' the Yeo-Johnson (2000) power transformation
#' family.
#'
#' @param y A numeric vector (values can be
#' positive, negative, or zero).
#' @param lambda A numeric value specifying
#' the transformation to apply.
#'
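#' @details As implemented here, the transformation of a value
#' \eqn{y} given the parameter \eqn{\lambda} is
#'
#' \deqn{ \frac{(y + 1)^\lambda - 1}{\lambda} }
#'
#' for \eqn{y \ge 0} and \eqn{\lambda \ne 0}, and \eqn{\log(y + 1)}
#' for \eqn{y \ge 0} and \eqn{\lambda = 0};
#'
#' \deqn{ -\frac{(1 - y)^{2 - \lambda} - 1}{2 - \lambda} }
#'
#' for \eqn{y < 0} and \eqn{\lambda \ne 2}, and \eqn{-\log(1 - y)}
#' for \eqn{y < 0} and \eqn{\lambda = 2}.
#'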
#' @references
#' Yeo, I. K., & Johnson, R. A. (2000). A new family of power
#' transformations to improve normality or symmetry. Biometrika,
#' 87 (4), 954-959. https://doi.org/10.1093/biomet/87.4.954
#'
#' @return A numeric vector.
#'
#' @examples
#' # Example from page 958 of Yeo & Johnson (2000)
#' y <- c(
#' 6.1, -8.4, 1.0, 2.0, 0.7, 2.9, 3.5,
#' 5.1, 1.8, 3.6, 7.0, 3.0, 9.3, 7.5, -6.0
#' )
#' shapiro.test( y ) # Test of normality
#' y_transformed <- yeo_johnson(y, 1.305)
#' shapiro.test( y_transformed ) # Test of normality shows improvement
#'
#' @export
yeo_johnson <- function(y, lambda) {
if ( !is.numeric(y) ) {
stop(
"Argument 'y' must be a numeric vector"
)
}
if ( !is.numeric(lambda) | length( lambda ) != 1 ) {
stop(
"Argument 'lambda' must be a numeric value"
)
}
y_transformed <- y
# Remove missing values
lgc_no_NA <- !is.na(y)
# Identify negative values
lgc_negative <- lgc_no_NA & y < 0
# Apply transformation
if ( lambda != 1 ) {
lgc_subset <-
lgc_no_NA & !lgc_negative
if ( lambda != 0 ) {
y_transformed[lgc_subset] <-
( (y[lgc_subset] + 1)^lambda - 1 )/lambda
}
if ( lambda == 0 ) {
y_transformed[lgc_subset] <-
log( y[lgc_subset] + 1)
}
lgc_subset <-
lgc_no_NA & lgc_negative
if ( lambda != 2 ) {
y_transformed[lgc_subset] <-
-( (-y[lgc_subset] + 1)^(2 - lambda) - 1 )/(2 - lambda)
}
if ( lambda == 2 ) {
# Negated log transform for negative values when lambda equals 2
y_transformed[lgc_subset] <-
-log( -y[lgc_subset] + 1)
}
# Close 'Apply transformation'
}
return(y_transformed)
}
#### 11) yeo_johnson_transform ####
#' Determine Best Parameter for Yeo-Johnson Transform
#'
#' Estimates and applies best-fitting Yeo-Johnson
#' transformation parameter via maximum likelihood
#' for a vector of numeric values.
#'
#' @param x A numeric vector (values can be
#' positive, negative, or zero).
#' @param lower The smallest value for the transformation
#' parameter to consider.
#' @param upper The highest value for the transformation
#' parameter to consider.
#'
#' @details
#' The transformation parameter is estimated via
#' maximum likelihood, assuming normality of the
#' transformed values, using the [stats::optimize] function.
#'
#' @references
#' Yeo, I. K., & Johnson, R. A. (2000). A new family of power
#' transformations to improve normality or symmetry. Biometrika,
#' 87 (4), 954-959. https://doi.org/10.1093/biomet/87.4.954
#'
#' @return A numeric vector, the transformed values of x.
#'
#' @examples
#' # Example from page 958 of Yeo & Johnson (2000)
#' x <- c(
#' 6.1, -8.4, 1.0, 2.0, 0.7, 2.9, 3.5,
#' 5.1, 1.8, 3.6, 7.0, 3.0, 9.3, 7.5, -6.0
#' )
#' shapiro.test( x ) # Test of normality
#' x_transformed <- yeo_johnson_transform(x)
#' # Extract results of maximum likelihood estimation
#' attributes(x_transformed)$mle_for_yeo_johnson
#' shapiro.test( x_transformed ) # Test of normality shows improvement
#'
#' @export
yeo_johnson_transform <- function(
x,
lower = -100,
upper = 100 ) {
y <- x[ !is.na(x) ]
fun_sum_of_log_likelihood <- function(lambda, y) {
n <- length(y)
yt <- yeo_johnson(y, lambda)
mu <- mean(yt)
sigma_sq <- var(yt)*( length(yt)-1 )/length(yt)
sll_part_1 <- -(n/2)*log(2*pi)
sll_part_2 <- -(n/2)*log(sigma_sq)
sll_part_3 <- ( -1/(2*sigma_sq) )*sum( (yt - mu)^2 )
sll_part_4 <- (lambda-1) * sum( sign(y) * log( abs(y) + 1) )
sum_of_log_likelihood <-
sll_part_1 + sll_part_2 + sll_part_3 + sll_part_4
return(sum_of_log_likelihood)
}
mle <- optimize(
fun_sum_of_log_likelihood,
lower = lower, upper = upper,
maximum = TRUE,
y = y
)
x_transformed <- yeo_johnson(
x, mle$maximum
)
lst_attr <- attributes(x_transformed)
if ( is.null( lst_attr ) ) {
lst_attr <- list(
mle_for_yeo_johnson = mle
)
} else {
lst_attr$mle_for_yeo_johnson <- mle
}
attributes( x_transformed ) <- lst_attr
return( x_transformed )
}
#### 12) principal_components_analysis ####
#' Principal Components Analysis
#'
#' Function for running a principal components
#' analysis (PCA) with training (and test) data.
#' Expands on the [stats::prcomp] function.
#'
#' @param train A data frame or matrix. Assumes all
#' columns should be included in the PCA.
#' @param test An optional data frame or matrix.
#' Must have the same number of columns and
#' same column names as \code{train}.
#'
#' @returns A list with the standardized data, the
#' results of the call to [stats::prcomp], the rotation
#' matrix (eigenvectors), the eigenvalues, the
#' correlations between raw scores and component scores,
#' and the root-mean square error for the training and
#' test sets when using a smaller number of components.
#'
#' @examples
#' # Simulate training and test data
#' train <- MASS::mvrnorm( 100, rep( 0, 8 ), diag(8) )
#' test <- MASS::mvrnorm( 100, rep( 0, 8 ), diag(8) )
#'
#' PCA <- principal_components_analysis(
#' train = train, test = test
#' )
#'
#' # Loading matrix
#' lambda <- cbind(
#' c( runif( 4, .3, .9 ), rep( 0, 4 ) ),
#' c( rep( 0, 4 ), runif( 4, .3, .9 ) )
#' )
#' # Communalities
#' D_tau <- diag( runif( 8, .5, 1.5 ) )
#'
#' cov_mat <- lambda %*% t( lambda ) + D_tau
#' cor_mat <- cov2cor( cov_mat )
#'
#' set.seed( 341 ) # For reproducibility
#' x <- MASS::mvrnorm( n = 200, mu = rep( 0, 8 ), Sigma = cor_mat )
#' colnames(x) <- paste0( 'C', 1:8 )
#'
#' PCA <- principal_components_analysis(
#' train = x
#' )
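#'
#' # Proportion of variance accounted for by each component
#' PCA$Proportion
#' # Eigenvalues for each component
#' PCA$Eigenvalues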
#'
#' @export
principal_components_analysis <- function( train,
test = NULL ) {
is_na <- apply(
train, 1, function(x) any( is.na(x) )
)
# If any missing training data
if ( any(is_na) ) {
train <- train[ !is_na, ]
warning(
paste0( "Removed ", sum(is_na),
" rows with missing values for 'train' argument" )
)
# Close 'If any missing training data'
}
# If test data provided
if ( !is.null(test) ) {
is_na <- apply(
test, 1, function(x) any( is.na(x) )
)
# If any missing test data
if ( any(is_na) ) {
test <- test[ !is_na, ]
warning(
paste0( "Removed ", sum(is_na), " rows with missing values for 'test' argument" )
)
# Close 'If any missing test data'
}
# Close 'If test data provided'
}
# Number of columns
K <- ncol(train)
# Initialize output
lst_output <- list(
Data = list(
Train = NULL,
Test = NULL
),
Scaling = list(
Mean = NA,
SD = NA
),
Fit = NA,
Rotation = NA,
Eigenvalues = NA,
Proportion = NA,
Scores = list(
Train = NULL,
Test = NULL
),
Correlations = list(
Train = NULL,
Test = NULL
),
RMSE = list(
Train = NA,
Test = NA
)
)
# Standardize inputs
lst_scaled <- arfpam::standardize(
x = train,
y = test,
as_list = TRUE
)
# Save data
lst_output$Data$Train <- lst_scaled$Data$X
lst_output$Scaling$Mean <- lst_scaled$Scaling$Mean
lst_output$Scaling$SD <- lst_scaled$Scaling$SD
# Run PCA using training data
lst_PCA <- prcomp(
lst_scaled$Data$X,
retx = TRUE,
center = FALSE, scale. = FALSE
)
# Extract eigenvalues and proportion accounted for
importance <- summary( lst_PCA )$importance
# Save results
lst_output$Fit <- lst_PCA
lst_output$Rotation <- lst_PCA$rotation
lst_output$Eigenvalues <- importance[1, ]^2
lst_output$Scores$Train <- lst_PCA$x
lst_output$Proportion <- importance[2, ]
# Compute correlations between components and raw variables
lst_output$Correlations$Train <- sapply(
1:K, function(p) { # Loop over components
sapply(
1:K, function(v) { # Loop over raw variables
cor( lst_scaled$Data$X[, v], lst_PCA$x[, p] )
}
)
}
)
rmse <- rep( NA, K )
iR <- solve( lst_output$Rotation ) # Inverse of rotation matrix
# Loop over components
for ( p in 1:K ) {
# Reconstruct raw scores from component scores
# If one component
if ( p == 1 ) {
# Ensure matrix algebra works
train.hat <-
cbind( lst_output$Scores$Train[, 1] ) %*% rbind( iR[1, ] )
# Close 'If one component'
} else {
train.hat <-
as.matrix( lst_output$Scores$Train[, 1:p] ) %*% iR[1:p, ]
# Close else for 'If one component'
}
rmse[p] <-
sqrt( mean( (train.hat - as.matrix( lst_scaled$Data$X ) )^2 ) )
# Close 'Loop over components'
}
lst_output$RMSE$Train <- rmse
# If test data provided
if ( !is.null(test) ) {
# Save data
lst_output$Data$Test <- lst_scaled$Data$Y
# Compute component scores
lst_output$Scores$Test <-
as.matrix( lst_scaled$Data$Y ) %*% lst_PCA$rotation
# Compute correlations between components and raw variables
lst_output$Correlations$Test <- sapply(
1:K, function(p) { # Loop over components
sapply(
1:K, function(v) { # Loop over raw variables
cor( lst_scaled$Data$Y[, v], lst_output$Scores$Test[, p] )
}
)
}
)
rmse <- rep( NA, K )
# Loop over components
for ( p in 1:K ) {
# If one component
if ( p == 1 ) {
# Ensure matrix algebra works
test.hat <-
cbind( lst_output$Scores$Test[, 1] ) %*% rbind( iR[1, ] )
# Close 'If one component'
} else {
test.hat <-
as.matrix( lst_output$Scores$Test[, 1:p] ) %*% iR[1:p, ]
# Close else for 'If one component'
}
rmse[p] <-
sqrt( mean( (test.hat - as.matrix( lst_scaled$Data$Y) )^2 ) )
# Close 'Loop over components'
}
lst_output$RMSE$Test <- rmse
# Close 'If test data provided'
}
return( lst_output )
}
#### 13) stats_by_group ####
#' Compute Statistics by Group
#'
#' A function to compute assorted univariate statistics
#' for a specified variable in a data frame over desired
#' grouping factors.
#'
#' @param dtf A data frame.
#' @param column A character string, the column in \code{dtf} to
#' compute statistics for.
#' @param groupings A character vector, the columns in \code{dtf}
#' to use as grouping factors (it is recommended that they all
#' be categorical variables).
#' @param statistics A character vector, the set of different
#' statistics to compute over groups.
#' @param method A character string, the type of method to use
#' when computing uncertainty intervals. Options include:
#' \code{"Student's T"} for means or
#' \code{"Beta-binomial"} for proportions.
#' @param categories An optional vector of elements to match over
#' when computing frequencies, proportions, or percentages.
#' @param width A numeric value between 0 and 1, the width for
#' uncertainty intervals.
#' @param na.rm A logical value; if \code{TRUE} removes
#' \code{NA} values.
#'
#' @details
#' Possible univariate statistics that can be computed:
#' \itemize{
#' \item \code{'N'} = Sample size;
#' \item \code{'M'} = Mean;
#' \item \code{'Md'} = Median;
#' \item \code{'SD'} = Standard deviation;
#' \item \code{'SE'} = Standard error of the mean;
#' \item \code{'C'} = Counts/frequencies;
#' \item \code{'Pr'} = Proportions;
#' \item \code{'P'} = Percentages.
#' }
#'
#' Additionally, specifying \code{'UI'} in combination with the
#' argument \code{method} will compute the lower and upper limits
#' of a desired uncertainty interval. The width of the interval
#' can be controlled by the argument \code{width}.
#'
#' @returns A data frame with separate rows for each combination
#' of grouping factors and separate columns for each statistic
#' to compute.
#'
#' @examples
#' # Example data set
#' data(iris)
#' dtf <- iris
#'
#' # Mean/SD for sepal length by species
#' dtf |> stats_by_group( 'Sepal.Length', 'Species' )
#'
#' # Create additional categorical variable
#' dtf$Long_petal <- c( 'No', 'Yes' )[
#' ( dtf$Petal.Length > median( dtf$Petal.Length) ) + 1
#' ]
#' # Sample size, mean, and confidence intervals using Student's T
#' # distribution by species and whether petals are long
#' dtf |> stats_by_group(
#' 'Sepal.Length', c( 'Species', 'Long_petal' ), c( 'N', 'M', 'UI' )
#' )
#'
#' # Create additional categorical variable
#' dtf$Long_sepal <- c( 'No', 'Yes' )[
#' ( dtf$Sepal.Length > median( dtf$Sepal.Length) ) + 1
#' ]
#' # Proportion and confidence intervals based on beta-binomial
#' # distribution for long sepals by long petals
#' dtf |> stats_by_group(
#' 'Long_sepal', c( 'Long_petal' ), c( 'N', 'Pr', 'UI' ),
#' categories = 'Yes', method = 'Beta-binomial'
#' )
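#'
#' # Percentages instead of proportions
#' dtf |> stats_by_group(
#'   'Long_sepal', 'Long_petal', c( 'N', 'P' ),
#'   categories = 'Yes'
#' )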
#'
#' @export
stats_by_group <- function( dtf,
column,
groupings,
statistics = c( 'M', 'SD' ),
method = "Student's T",
categories = 1,
width = .95,
na.rm = TRUE ) {
distribution_t <- c(
"T-distribution",
"t-distribution",
"T",
"t",
"Student's T",
"Student's t",
"Student T",
"Student t"
)
distribution_beta_binomial <- c(
'Beta-Binomial',
'Beta-binomial',
'beta-binomial'
)
lst_fun <- list()
l <- length( lst_fun )
# Statistic = sample size
if ( 'N' %in% statistics ) {
f <- function(x) {
if (na.rm) n <- sum( !is.na(x) ) else n <- length(x)
return(n)
}
lst_fun[[ l + 1]] <- f
names( lst_fun )[l + 1] <- 'N'
l <- l + 1
# Close 'Statistic = sample size'
}
# Statistic = mean
if ( 'M' %in% statistics ) {
f <- function(x) {
mean(x, na.rm = na.rm)
}
lst_fun[[ l + 1]] <- f
names( lst_fun )[l + 1] <- 'M'
l <- l + 1
# Close 'Statistic = mean'
}
# Statistic = median
if ( 'Md' %in% statistics ) {
f <- function(x) {
median(x, na.rm = na.rm)
}
lst_fun[[ l + 1]] <- f
names( lst_fun )[l + 1] <- 'Md'
l <- l + 1
# Close 'Statistic = median'
}
# Statistic = standard deviation
if ( 'SD' %in% statistics ) {
f <- function(x) {
sd(x, na.rm = na.rm)
}
lst_fun[[ l + 1]] <- f
names( lst_fun )[l + 1] <- 'SD'
l <- l + 1
# Close 'Statistic = standard deviation'
}
# Statistic = standard error
if ( 'SE' %in% statistics ) {
f <- function(x) {
if ( na.rm ) n <- sum( !is.na(x) ) else n <- length(x)
sd(x, na.rm = na.rm) / sqrt( n )
}
lst_fun[[ l + 1]] <- f
names( lst_fun )[l + 1] <- 'SE'
l <- l + 1
# Close 'Statistic = standard error'
}
# Statistic = count
if ( 'C' %in% statistics ) {
f <- function(x) {
sum(x %in% categories, na.rm = na.rm)
}
lst_fun[[ l + 1]] <- f
names( lst_fun )[l + 1] <- 'C'
l <- l + 1
# Close 'Statistic = count'
}
# Statistic = proportion
if ( 'Pr' %in% statistics ) {
f <- function(x) {
mean(x %in% categories, na.rm = na.rm)
}
lst_fun[[ l + 1]] <- f
names( lst_fun )[l + 1] <- 'Pr'
l <- l + 1
# Close 'Statistic = proportion'
}
# Statistic = percentage
if ( 'P' %in% statistics ) {
f <- function(x) {
100*mean(x %in% categories, na.rm = na.rm)
}
lst_fun[[ l + 1]] <- f
names( lst_fun )[l + 1] <- 'P'
l <- l + 1
# Close 'Statistic = percentage'
}
# Statistic = lower limit of 95% UI [T distribution]
if ( 'UI' %in% statistics & method %in% distribution_t ) {
f <- function(x) {
m <- mean(x, na.rm = na.rm)
s <- sd(x, na.rm = na.rm)
if ( na.rm ) n <- sum( !is.na(x) ) else n <- length( x )
return( m + qt( .5 - width/2, n - 1 )*s/sqrt(n) )
}
lst_fun[[ l + 1]] <- f
names( lst_fun )[l + 1] <- 'UI_LB'
l <- l + 1
# Close 'Statistic = lower limit of 95% UI [T distribution]'
}
# Statistic = upper limit of 95% UI [T distribution]
if ( 'UI' %in% statistics & method %in% distribution_t ) {
f <- function(x) {
m <- mean(x, na.rm = na.rm)
s <- sd(x, na.rm = na.rm)
if ( na.rm ) n <- sum( !is.na(x) ) else n <- length( x )
return( m + qt( .5 + width/2, n - 1 )*s/sqrt(n) )
}
lst_fun[[ l + 1]] <- f
names( lst_fun )[l + 1] <- 'UI_UB'
l <- l + 1
# Close 'Statistic = upper limit of 95% UI [T distribution]'
}
# Statistic = lower limit of 95% UI [Beta-binomial distribution]
if ( 'UI' %in% statistics & method %in% distribution_beta_binomial ) {
f <- function(x) {
counts <- sum(x %in% categories, na.rm = na.rm)
if ( na.rm ) n <- sum( !is.na(x) ) else n <- length( x )
return( qbeta( .5 - width/2, .5 + counts, .5 + n - counts ) )
}
lst_fun[[ l + 1]] <- f
names( lst_fun )[l + 1] <- 'UI_LB'
l <- l + 1
# Close 'Statistic = lower limit of 95% UI [Beta-binomial distribution]'
}
# Statistic = upper limit of 95% UI [Beta-binomial distribution]
if ( 'UI' %in% statistics & method %in% distribution_beta_binomial ) {
f <- function(x) {
counts <- sum(x %in% categories, na.rm = na.rm)
if ( na.rm ) n <- sum( !is.na(x) ) else n <- length( x )
return( qbeta( .5 + width/2, .5 + counts, .5 + n - counts ) )
}
lst_fun[[ l + 1]] <- f
names( lst_fun )[l + 1] <- 'UI_UB'
l <- l + 1
# Close 'Statistic = upper limit of 95% UI [Beta-binomial distribution]'
}
# If only one grouping factor
if ( length( groupings ) == 1 ) {
lst_groups <- list( dtf[[ groupings ]] )
names( lst_groups ) <- groupings
# Close 'If only one grouping factor'
} else {
lst_groups <- dtf[, groupings]
# Close else for 'If only one grouping factor'
}
dtf_summary <- aggregate(
dtf[[ column ]], lst_groups, function(x) {
sapply( seq_along(lst_fun), function(i) {
return( lst_fun[[i]](x) )
} )
}
)
mat_summary <- matrix( NA, nrow(dtf_summary), length(lst_fun) )
colnames(mat_summary) <- names( lst_fun)
# If only one statistic
if ( ncol(mat_summary) == 1 ) {
mat_summary[, 1] <- dtf_summary[, colnames(dtf_summary) != groupings]
dtf_summary <- cbind(
dtf_summary[, groupings],
as.data.frame( mat_summary )
)
# Close 'If only one statistic'
} else {
# Loop over statistics
for ( k in 1:ncol(mat_summary) ) {
mat_summary[, k] <- dtf_summary$x[, k]
# Close 'Loop over statistics'
}
dtf_summary <- cbind(
dtf_summary[, groupings],
as.data.frame( mat_summary )
)
}
colnames(dtf_summary)[ seq_along(groupings) ] <- groupings
return( dtf_summary )
}
#### 14) pool_over_imputations ####
#' Pool Parameter Estimates Over Imputations
#'
#' Function to pool parameter estimates as
#' well as their squared standard errors and
#' associated degrees of freedom over imputation
#' sets when using multiple imputation via
#' chained equations (MICE). Estimates are
#' pooled according to Rubin's rules. Input notation
#' assumes I imputations and P parameters.
#'
#' @param estimates An I x P matrix of parameter estimates.
#' @param standard_errors An I x P matrix of standard errors.
#' @param degrees_of_freedom An I x P matrix of degrees of freedom.
#'
#' @details Pools over multiple imputations using Rubin's rules.
#' Final parameter estimates are the average over imputations,
#' while standard errors are the combination of within variance
#' (average of the estimates' variance, the square of the standard
#' error, across imputations) and between variance (the variance of the
#' point estimates across imputations). Finally, a correction is applied
#' to the parameters' degrees of freedom.
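#'
#' For \eqn{m} imputations with point estimates \eqn{\hat{\theta}_i}
#' and squared standard errors \eqn{U_i}, the pooled estimate and
#' total variance are
#'
#' \deqn{ \bar{\theta} = \frac{1}{m} \sum_{i=1}^m \hat{\theta}_i, \quad
#' T = \bar{U} + \left( 1 + \frac{1}{m} \right) B, }
#'
#' where \eqn{\bar{U}} is the average within-imputation variance and
#' \eqn{B} is the between-imputation variance of the \eqn{\hat{\theta}_i}.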
#'
#' @references
#' Rubin, D. B. (1987). Multiple imputation for nonresponse
#' in surveys. John Wiley & Sons.
#'
#' van Buuren, S., & Groothuis-Oudshoorn, K. (2011). mice:
#' Multivariate imputation by chained equations in R.
#' Journal of Statistical Software, 45 (3), 1-67.
#' https://doi.org/10.18637/jss.v045.i03.
#'
#' @returns A matrix with the pooled estimates, standard errors,
#' and associated degrees of freedom for the set of parameters.
#'
#' @examples
#' \dontrun{
#' # Simulation example
#' set.seed( 222 ) # For reproducibility
#'
#' # Simulate 200 observations from
#' # 3 correlated variables
#' n <- 200
#' Sigma <- rbind(
#' c( 1.0, 0.3, 0.2 ),
#' c( 0.3, 1.0, 0.7 ),
#' c( 0.2, 0.7, 1.0 )
#' )
#' Y <- MASS::mvrnorm(
#' n, c( 0, 0, 0 ), Sigma
#' )
#' colnames(Y) <- c( 'Y1', 'Y2', 'Y3' )
#' Y <- data.frame( Y )
#'
#' # Missing values for Y2 depend on both Y1 and Y2
#' Y$R2 <- rbinom(
#' n, 1, logistic(
#' logit(.3) + log(2)*Y$Y1 + log(4)*Y$Y2
#' )
#' )
#' Y$Y2.M <- Y$Y2
#' Y$Y2.M[ Y$R2 == 1 ] <- NA
#'
#' # Predict Y1 from Y2 (All cases)
#' lm_Y1_from_Y2 <- lm(
#' Y1 ~ Y2, data = Y
#' )
#' print( round(
#' summary( lm_Y1_from_Y2 )$coefficients[2, c(1:2, 4)], 3
#' ) )
#'
#' # Predict Y1 from Y2 (Complete cases)
#' lm_Y1_from_Y2.M <- lm(
#' Y1 ~ Y2.M, data = Y
#' )
#' print( round(
#' summary( lm_Y1_from_Y2.M )$coefficients[2, c(1:2, 4)], 3
#' ) )
#'
#' # Impute missing values of Y2 from Y3 via a simplistic
#' # version of predictive mean matching (Note better
#' # methods exist, approach is for example purposes only)
#'
#' lm_Y2.M_from_Y3 <- lm(
#' Y2.M ~ Y3, data = Y[ Y$R2 == 0, ]
#' )
#' Y2.M_pred <- predict(
#' lm_Y2.M_from_Y3, newdata = Y[ Y$R2 == 1, ]
#' )
#'
#' # Impute missing values 10 times, sampling randomly
#' # from 5 closest observed values
#' i <- sapply(
#' 1:10, function(m) {
#'
#' sapply(
#' Y2.M_pred, function( yhat ) {
#' smallest_diff <-
#' order( abs( Y$Y2.M[ Y$R2 == 0 ] - yhat ) )
#' return(
#' Y$Y2.M[ Y$R2 == 0 ][smallest_diff][
#' sample( 1:5, size = 1 )
#' ]
#' )
#' }
#' )
#'
#' }
#' )
#'
#' # Predict Y1 from Y2 using imputed data sets
#' # and save estimates, standard errors, and
#' # degrees of freedom
#' cf <- sapply(
#' 1:10, function(m) {
#'
#' Y$Y2.I <- Y$Y2.M
#' Y$Y2.I[ Y$R2 == 1 ] <- i[, m]
#'
#' lm_Y1_from_Y2.I <- lm(
#' Y1 ~ Y2.I, data = Y
#' )
#'
#' sm <- summary( lm_Y1_from_Y2.I )
#'
#' return(
#' cbind( sm$coefficients[, 1:2], sm$df[2] )
#' )
#'
#' }
#' )
#' cf <- array(
#' cf, dim = c( 2, 3, 10 )
#' )
#'
#' pooled <- pool_over_imputations(
#' t( cf[, 1, ] ), # Coefficients
#' t( cf[, 2, ] ), # Standard errors
#' t( cf[, 3, ] ) # Degrees of freedom
#' )
#' print( round(
#' c( pooled[2, 1:2],
#' pt(
#' abs( pooled[2, 1]/pooled[2, 2] ), pooled[2, 3], lower.tail = F
#' )*2
#' ), 3
#' ) )
#' }
#'
#' @export
pool_over_imputations <- function(
estimates,
standard_errors,
degrees_of_freedom ) {
# If single statistic provided
if ( !is.matrix(estimates) ) {
estimates <- cbind( estimates )
standard_errors <- cbind( standard_errors )
degrees_of_freedom <- cbind( degrees_of_freedom )
}
# Convert to variances
variances <- standard_errors^2
# Number of imputations
m <- nrow(estimates)
# Average over imputations
M <- apply( estimates, 2, mean )
# Variance [Between]
VB <- apply( estimates, 2, var )
# Variance [Within]
VW <- apply( variances, 2, mean )
# Total variance
VT <- VW + (1 + 1/m) * VB
# Degrees of freedom correction
# Code adapted from the function mice:::barnard.rubin
# (van Buuren & Groothuis-Oudshoorn, 2011)
# Prop. variance attr. to missing data
lambda <- (1 + 1/m) * VB / VT
# Avoid underflow problems
lambda[lambda < 1e-04] <- 1e-04
# Degrees of freedom adjustment per Rubin (1987)
dfold <- (m - 1) / lambda^2
# Small sample adaptation to avoid degrees of
# freedom that exceed sample size
dof <- degrees_of_freedom # Simplify notation
dfobs <- (dof + 1) / (dof + 3) * dof * (1 - lambda)
DOF <- sapply(
1:ncol( degrees_of_freedom ), function(s) {
ifelse(
is.infinite(dof[s]),
dfold[s], dfold[s] * dfobs[s] / (dfold[s] + dfobs[s])
)
}
)
output <- cbind(
M = M, SE = sqrt( VT ), DOF = DOF
)
return( output )
}