R/tabstat.R

Defines functions tabstat

Documented in tabstat

#' @title Compact Table of Summary Statistics
#'
#' @description  Based on Stata's "tabstat" command. "tabstat displays summary
#'  statistics for a series of numeric variables in one table.  It allows you
#'  to specify the list of statistics to be displayed.  Statistics can be
#'  calculated (conditioned on) another variable.  tabstat allows substantial
#'  flexibility in terms of the statistics presented and the format of the
#'  table" (Stata, 2016).
#'
#' @param x A continuous variable.
#' @param digits Rounds the values returned to the specified number of decimal
#'  places (default 3).
#' @param stats Return specified statistics. Options include:
#'  \describe{
#'    \item{n}{Count of nonmissing values of x}
#'    \item{nmiss}{Count of missing values of x}
#'    \item{ci}{95 percent confidence interval for the mean of x}
#'    \item{sum}{Sum of x}
#'    \item{max}{Maximum value of x}
#'    \item{min}{Minimum value of x}
#'    \item{range}{(Maximum value of x) - (minimum value of x)}
#'    \item{sd}{Standard deviation of x}
#'    \item{var}{Variance of x}
#'    \item{cv}{Coefficient of variation (sd / mean) of x}
#'    \item{sem}{Standard error of the mean of x}
#'    \item{skew}{Skewness of x}
#'    \item{kurt}{Kurtosis of x}
#'    \item{p1}{1st percentile of x}
#'    \item{p5}{5th percentile of x}
#'    \item{p10}{10th percentile of x}
#'    \item{p25}{25th percentile of x}
#'    \item{p50}{Median value of x}
#'    \item{median}{Median value of x}
#'    \item{p75}{75th percentile of x}
#'    \item{p90}{90th percentile of x}
#'    \item{p95}{95th percentile of x}
#'    \item{p99}{99th percentile of x}
#'    \item{iqr}{Interquartile range (p75 - p25)}
#'    \item{q}{Equivalent to specifying p25 p50 p75}
#'  }
#'
#' @return A data frame. By default, the data frame contains the variable name
#'  and mean.
#' @export
#'
#' @references Stata 14 help for tabstat
#'  \url{http://www.stata.com/help.cgi?tabstat}
#'
#' @examples
#' data(mtcars)
#'
#' # Single univariate analysis with Defaults
#' tabstat(mtcars$mpg)
#'
#' # Single univariate analysis with all stats
#' tabstat(mtcars$mpg, stats = c("n", "nmiss", "ci", "sum", "max", "min",
#' "range", "sd", "var", "cv", "sem", "skew", "kurt", "p1", "p5", "p10",
#' "p25", "p50", "median", "p75", "p90", "p95", "p99", "iqr", "q"))

tabstat <- function(x, digits = 3, stats = "default") {

  if (missing(x)){                               # Check arguments
    stop()
  } else if (is.factor(x)) {
    stop("factors objects not allowed, argument 'x' should be a numeric")
  } else if (is.character(x)) {
    stop("character objects not allowed, argument 'x' should be a numeric")
  }

  name = deparse(substitute(x))                  # Grab variable string
  name = sub("^[^\\$]*\\$", "", name)            # Regex remove up to $
  x2 = stats::na.omit(x)                         # Remove NA
  n = length(x2)                                 # Count nonmissing observations
  mean = mean(x2)                                # Mean

  df <- data.frame(                              # Default data frame
    variable = name,
    mean = round(mean, digits)
  )

  if ("n" %in% stats){                           # nonmissing observations
    df <- cbind(df, n)
    df <- df[c(1, 3, 2)]                         # Reorder n before mean
  }

  if ("nmiss" %in% stats){                       # missing observations
    nmiss = sum(is.na(x2))
    df <- cbind(df, nmiss)
    if ("n" %in% stats){
      df <- df[c(1:2, 4, 3)]                     # Reorder name, n, nmiss, mean
    } else {
      df <- df[c(1, 3, 2)]                       # Reorder name, nmiss, mean
    }
  }

  if ("ci" %in% stats) {                         # 95% Confidence Intervals
    lcl = round(mean(x2) - stats::qt(.975, n - 1) * stats::sd(x2) / sqrt(n), digits)
    ucl = round(mean(x2) + stats::qt(.975, n - 1) * stats::sd(x2) / sqrt(n), digits)
    df <- cbind(df, mean_lcl = lcl, mean_ucl = ucl)
  }

  if ("sum" %in% stats) {                        # Sum values
    sum = round(sum(x2), digits)
    df <- cbind(df, sum)
  }

  if ("max" %in% stats) {                        # Maximum value
    max = round(max(x2), digits)
    df <- cbind(df, max)
  }

  if ("min" %in% stats) {                        # Minimum value
    min = round(min(x2), digits)
    df <- cbind(df, min)
  }

  if ("range" %in% stats) {                      # Range
    range = round(max(x2) - min(x2), digits)
    df <- cbind(df, range)
  }

  if ("sd" %in% stats){                          # Standard deviation
    std_dev = round(stats::sd(x2), digits)
    df <- cbind(df, std_dev)
  }

  if ("var" %in% stats){                         # Variance
    variance = round(stats::var(x2), digits)
    df <- cbind(df, variance)
  }

  if ("cv" %in% stats){                          # Coefficient of variation
    coef_var = round(stats::sd(x2) / mean(x2), digits)  # (sd / mean)
    df <- cbind(df, coef_var)
  }

  if ("sem" %in% stats){                         # Coefficient of variation
    sem = round(stats::sd(x2) / sqrt(n), digits)        # (sd / sqrt(n))
    df <- cbind(df, sem)
  }

  if ("skew" %in% stats){                        # Skewness
    skewness = round((sum((x2 - mean(x2))^3) / n) /
        (sum((x2 - mean(x2))^2) / n)^(3 / 2), digits)
    df <- cbind(df, skewness)
  }

  if ("kurt" %in% stats){                        # Kurtosis
    kurtosis = round(n * sum((x2 - mean(x2))^4) /
        (sum((x2 - mean(x2))^2)^2), digits)
    df <- cbind(df, kurtosis)
  }

  if ("p1" %in% stats){                          # 1st percentile
    p1 = round(stats::quantile(x2, .01), digits)
    names(p1) <- NULL
    df <- cbind(df, p1)
  }

  if ("p5" %in% stats){                          # 5th percentile
    p5 = round(stats::quantile(x2, .05), digits)
    names(p5) <- NULL
    df <- cbind(df, p5)
  }

  if ("p10" %in% stats){                         # 10th percentile
    p10 = round(stats::quantile(x2, .1), digits)
    names(p10) <- NULL
    df <- cbind(df, p10)
  }

  if ("p25" %in% stats){                         # 25th percentile
    p25 = round(stats::quantile(x2, .25), digits)
    names(p25) <- NULL
    df <- cbind(df, p25)
  }

  if ("median" %in% stats | "p50" %in% stats){   # median
    median = round(median(x2), digits)
    df <- cbind(df, median)
  }

  if ("p75" %in% stats){                         # 75th percentile
    p75 = round(stats::quantile(x2, .75), digits)
    names(p75) <- NULL
    df <- cbind(df, p75)
  }

  if ("p90" %in% stats){                         # 90th percentile
    p90 = round(stats::quantile(x2, .90), digits)
    names(p90) <- NULL
    df <- cbind(df, p90)
  }

  if ("p95" %in% stats){                         # 95th percentile
    p95 = round(stats::quantile(x2, .95), digits)
    names(p95) <- NULL
    df <- cbind(df, p95)
  }

  if ("p99" %in% stats){                         # 99th percentile
    p99 = round(stats::quantile(x2, .99), digits)
    names(p99) <- NULL
    df <- cbind(df, p99)
  }

  if ("iqr" %in% stats){                         # Interquartile Range
    p75 = stats::quantile(x2, .75)
    p25 = stats::quantile(x2, .25)
    iqr = round(p75 - p25, digits)
    names(iqr) <- NULL
    df <- cbind(df, iqr)
  }

  if ("q" %in% stats){                           # Shortcut for p25, p50, p75
    p25 = round(stats::quantile(x2, .25), digits)
    names(p25) <- NULL
    p50 = round(stats::quantile(x2, .50), digits)
    names(p50) <- NULL
    p75 = round(stats::quantile(x2, .75), digits)
    names(p75) <- NULL
    df <- cbind(df, p25, p50, p75)
  }

  return(df)
}

# Build out test as you go.
brad-cannell/myFunctions documentation built on July 21, 2019, 5:48 p.m.