R/descriptives.R

#' Calculate a Circular Mean
#'
#' Function to calculate circular mean
#'
#' @param x Numeric or integer values
#' @param max The theoretical maximum (e.g., if degrees, 360)
#' @param na.rm A logical value indicating whether to remove missing values.
#'   Defaults to \code{TRUE}.
#' @return A numeric value with the circular mean.
#' @export
#' @examples
#' meanCircular(c(22:23, 1:2), max = 24)
#' meanCircular(c(12, 24), max = 24)
#' meanCircular(c(6, 7, 23), max = 24)
#' meanCircular(c(6, 7, 21), max = 24)
#' meanCircular(c(6, 21), max = 24)
#' meanCircular(c(6, 23), max = 24)
#' meanCircular(c(.91, .96, .05, .16), max = 1)
#' meanCircular(c(6, 7, 8, 9), max = 24)
#' meanCircular(1:3, max = 24)
#' meanCircular(21:23, max = 24)
#' meanCircular(c(16, 17, 18, 19), max = 24)
#' meanCircular(c(355, 5, 15), max = 360)
#'
meanCircular <- function(x, max, na.rm = TRUE) {
  if (!(is.integer(x) || is.numeric(x))) {
    stop(sprintf("x must be class integer or numeric but was %s",
                 paste(class(x), collapse = "; ")))
  }

  if (na.rm) {
    x <- x[!is.na(x)]
    if (!length(x)) {
      return(NA_real_)
    }
  } else if (anyNA(x)) {
    return(NA_real_)
  }

  if (any(x < 0)) {
    stop("For cicular means, cannot have negative numbers")
  }
  if (any(x > max)) {
    stop("For cicular means, cannot have values above the theoretical maximum")
  }

  scale <- 360 / max
  rad <- x * scale * (pi / 180)
  mc <- mean.default(cos(rad))
  ms <- mean.default(sin(rad))

  if (ms >= 0) {
    out <- atan2(ms, mc)
  } else if (ms < 0) {
    out <- atan2(ms, mc) + 2 * pi
  }

  ## radians to degrees and unscale to the inputs
  out <- out * (180 / pi) / scale
  if (out == max) {
    out <- 0
  }
  return(out)
}
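
## A minimal hand check of the circular mean computed above (a sketch only):
## convert to radians, average the sine and cosine components, and take atan2
## of those means. Kept inside `if (FALSE)` so it never runs at package load.
if (FALSE) {
  x <- c(355, 5, 15)
  rad <- x * pi / 180
  deg <- (atan2(mean(sin(rad)), mean(cos(rad))) * 180 / pi) %% 360
  all.equal(deg, meanCircular(x, max = 360))  # should be TRUE (about 5)
}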

#' Calculate the Circular Difference
#'
#' @param x Numeric or integer values
#' @param y Numeric or integer values
#' @param max The theoretical maximum
#'   (e.g., if degrees, 360; if hours, 24; etc.).
#' @return A value with the circular difference.
#'   This will always be non-negative when defined.
#' @export
#' @examples
#' diffCircular(330, 30, max = 360)
#' diffCircular(22, 1, max = 24)
#' diffCircular(c(22, 23, 21, 22), c(1, 1, 23, 14), max = 24)
diffCircular <- function(x, y, max) {
  if (!(is.integer(x) || is.numeric(x))) {
    stop(sprintf(
      "x must be class integer or numeric but was %s",
      paste(class(x), collapse = "; ")
    ))
  }
  if (!(is.integer(y) || is.numeric(y))) {
    stop(sprintf(
      "y must be class integer or numeric but was %s",
      paste(class(y), collapse = "; ")
    ))
  }

  stopifnot(identical(length(x), length(y)))

  if (any(x < 0 | y < 0, na.rm = TRUE)) {
    stop("For cicular means, cannot have negative numbers")
  }
  if (any(x > max | y > max, na.rm = TRUE)) {
    stop("For cicular means, cannot have values above the theoretical maximum")
  }

  scale <- 360 / max
  torad <- scale * (pi / 180)
  toorig <- (180 / pi) / scale
  radx <- x * torad
  rady <- y * torad
  acos(cos(radx - rady)) * toorig
}
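
## Sketch checking that acos(cos(x - y)) above returns the shorter arc
## between two circular values, i.e. min(|x - y|, max - |x - y|).
## Kept inside `if (FALSE)` so it never runs at package load.
if (FALSE) {
  x <- c(22, 23, 21, 22); y <- c(1, 1, 23, 14)
  shorter <- pmin(abs(x - y), 24 - abs(x - y))
  all.equal(shorter, diffCircular(x, y, max = 24))  # should be TRUE
}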


#' Estimate the first and second moments
#'
#' This function relies on the \pkg{lavaan} package to use the
#' Expectation Maximization (EM) algorithm to estimate the first and
#' second moments (means and [co]variances) when there is missing data.
#'
#' @param data A data frame or an object coercible to a data frame.
#'   The means and covariances of all variables are estimated.
#' @param \dots Additional arguments. Not currently used; estimation is
#'   done via \code{\link[lavaan]{lavCor}} with \code{missing = "ml"}.
#' @return A list containing the estimates from the EM algorithm.
#'   \item{mu}{A named vector of the means.}
#'   \item{sigma}{The covariance matrix.}
#' @seealso \code{\link{SEMSummary}}
#' @keywords multivariate
#' @importFrom lavaan lavCor lavInspect
#' @author Suggested by Yves Rosseel, author of the \pkg{lavaan}
#'   package on which this depends
#' @export
#' @examples
#' # sample data
#' Xmiss <- as.matrix(iris[, -5])
#' # make 25% missing completely at random
#' set.seed(10)
#' Xmiss[sample(length(Xmiss), length(Xmiss) * .25)] <- NA
#' Xmiss <- as.data.frame(Xmiss)
#'
#' # true means and covariance
#' colMeans(iris[, -5])
#' # covariance with n - 1 divisor
#' cov(iris[, -5])
#'
#' # means and covariance matrix using list wise deletion
#' colMeans(na.omit(Xmiss))
#' cov(na.omit(Xmiss))
#'
#' # means and covariance matrix using EM
#' moments(Xmiss)
#' # clean up
#' rm(Xmiss)
moments <- function(data, ...) {

  if (!is.data.frame(data)) {
    data <- as.data.frame(data)
  }

  ## from Yves
  fit <- lavCor(data,
                meanstructure = TRUE,
                output = "fit", missing = "ml")
  out <- lavInspect(fit, "sampstat.h1")

  class(out$cov) <- "matrix"
  class(out$mean) <- "numeric"
  names(out) <- c("sigma", "mu")

  return(out)
}
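
## Sanity sketch (assumption: with complete data the lavaan ML sample
## statistics use an n rather than n - 1 divisor, as the example above
## implies). Kept inside `if (FALSE)` so it never runs at package load.
if (FALSE) {
  m <- moments(iris[, -5])
  n <- nrow(iris)
  all.equal(unclass(m$sigma), cov(iris[, -5]) * (n - 1) / n,
            check.attributes = FALSE)  # expected to be approximately TRUE
}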

#' Calculate Phi or Cramer's V effect size
#'
#' Simple function to calculate effect sizes for frequency tables.
#'
#' @param x A frequency table, such as from \code{xtabs()}.
#' @return A numeric value with Phi for 2 x 2 tables or Cramer's V
#'   for tables larger than 2 x 2.
#' @importFrom stats ftable
#' @importFrom MASS loglm
#' @export
#' @examples
#' cramerV(xtabs(~ am + vs, data = mtcars))
#' cramerV(xtabs(~ cyl + vs, data = mtcars))
#' cramerV(xtabs(~ cyl + am, data = mtcars))
cramerV <- function(x) {
  if (length(dim(x)) > 2L) {
    stop("Cannot calculate effect size on more than two dimensions")
  }

  x2 <- ftable(x)
  chi2 <- summary(MASS::loglm(~ 1 + 2, x2))$tests["Pearson", "X^2"]
  N <- sum(x2)
  k <- min(dim(x2))

  out <- sqrt(chi2 / (N * (k - 1)))

  if (identical(dim(x), c(2L, 2L))) {
    names(out) <- "Phi"
  } else {
    names(out) <- "Cramer's V"
  }
  return(out)
}
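
## Hand check of the effect size formula above: phi / Cramer's V equals
## sqrt(chi-square / (N * (min(dim) - 1))), with the uncorrected Pearson
## chi-square. Kept inside `if (FALSE)` so it never runs at package load.
if (FALSE) {
  tab <- xtabs(~ cyl + vs, data = mtcars)
  chi2 <- stats::chisq.test(tab, correct = FALSE)$statistic
  sqrt(chi2 / (sum(tab) * (min(dim(tab)) - 1)))
  cramerV(tab)  # should give the same value, labelled "Cramer's V"
}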

#' Calculate Standardized Mean Difference (SMD)
#'
#' Simple function to calculate effect sizes for mean differences.
#'
#' @param x A continuous variable
#' @param g A grouping variable, with two levels
#' @param index A character string: \dQuote{all} uses pooled variance,
#'   \dQuote{1} uses the first factor level variance,
#'   \dQuote{2} uses the second factor level variance.
#' @return The standardized mean difference.
#' @importFrom stats var
#' @export
#' @examples
#' smd(mtcars$mpg, mtcars$am)
#' smd(mtcars$mpg, mtcars$am, "all")
#' smd(mtcars$mpg, mtcars$am, "1")
#' smd(mtcars$mpg, mtcars$am, "2")
#'
#' smd(mtcars$hp, mtcars$vs)
#'
#' d <- data.table::as.data.table(mtcars)
#' d[, smd(mpg, vs)]
#' rm(d)
smd <- function(x, g, index = c("all", "1", "2")) {
  index <- match.arg(index)

  ok <- !is.na(x) & !is.na(g)
  x <- x[ok]
  g <- g[ok]

  if (!is.factor(g)) {
    g <- factor(g)
  }
  k <- length(levels(g))

  if (!identical(k, 2L)) stop("Must have two groups")

  m <- as.vector(by(x, g, mean))
  v <- as.vector(by(x, g, var))
  n <- as.vector(by(x, g, length))

  useV <- switch(index,
    `all` = sum((n - 1) * v) / (sum(n) - k),
    `1` = v[1],
    `2` = v[2])

  out <- abs(diff(m) / sqrt(useV))

  names(out) <- "SMD"
  return(out)
}
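
## Hand check of the default pooled-variance standardized mean difference
## above (Cohen's d with the pooled SD). Kept inside `if (FALSE)` so it
## never runs at package load.
if (FALSE) {
  m <- tapply(mtcars$mpg, mtcars$am, mean)
  v <- tapply(mtcars$mpg, mtcars$am, var)
  n <- tapply(mtcars$mpg, mtcars$am, length)
  pooled <- sum((n - 1) * v) / (sum(n) - 2)
  abs(diff(m)) / sqrt(pooled)
  smd(mtcars$mpg, mtcars$am)  # should match
}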


#' Summary Statistics for a SEM Analysis
#'
#' This function is designed to calculate the descriptive statistics and
#' summaries that are often reported on raw data when the main analyses
#' use structural equation modelling.
#'
#' This function calculates a variety of relevant statistics on the raw
#' data used in a SEM analysis.  Because it is meant for SEM style data,
#' for now it expects all variables to be numeric.  In the future I may
#' try to expand it to handle factor variables somehow.
#'
#' Both the formula and data arguments are required.  The formula should
#' be the right hand side only.  The most common way to use it would be with
#' variable names separated by \sQuote{+}.  For convenience, a \sQuote{.} is
#' expanded to mean \dQuote{all variables in the data set}.  For a large number
#' of variables or when whole datasets are being analyzed, this can
#' be considerably easier to write.  Also it facilitates column indexing
#' by simply passing a subset of the data (e.g., \code{data[, 1:10]}) and
#' using the \sQuote{.} expansion to
#' analyze the first 10 columns.  The examples section demonstrates this use.
#'
#' Also noteworthy is that \code{SEMSummary} is not really meant to be used
#' on its own.  It is the computational workhorse, but it is meant to be used
#' with a styling or printing method to produce simple output.
#' \code{APAStyler} has methods for \code{SEMSummary} output.
#'
#' There are several ways to handle missing data:
#' listwise deletion, pairwise deletion, or, the default, the EM
#' algorithm (full information maximum likelihood).
#'
#' @param formula A formula of the variables to be used in the analysis.
#'   See the \sQuote{details} section for more information.
#' @param data A data frame, matrix, or list containing the variables
#'   used in the formula.  This is a required argument.
#' @param use A character string indicating how to handle missing data.
#'   One of \dQuote{fiml} (the default), \dQuote{pairwise.complete.obs},
#'   or \dQuote{complete.obs}.
#' @return A list with S3 class \dQuote{SEMSummary}
#'   \item{names}{A character vector containing the variable names.}
#'   \item{n}{An integer vector of the length of each variable used
#'     (this includes available and missing data).}
#'   \item{nmissing}{An integer vector of the number of missing
#'                   values in each variable.}
#'   \item{mu}{A vector of the arithmetic means of each variable
#'             (on complete data).}
#'   \item{stdev}{A numeric vector of the standard deviations of each variable
#'                (on complete data).}
#'   \item{Sigma}{The numeric covariance matrix for all variables.}
#'   \item{sSigma}{The numeric correlation matrix for all variables.}
#'   \item{coverage}{A numeric matrix giving the percentage
#'                   (technically decimal) of information available
#'                   for each pairwise covariance/correlation.}
#'   \item{pvalue}{The two-sided p values for the correlation matrix.
#'                 Pairwise present N used to calculate degrees of freedom.}
#' @seealso \code{\link{APAStyler}}
#' @keywords multivariate
#' @importFrom stats terms as.formula
#' @importFrom extraoperators %?in%
#' @export
#' @examples
#' ## Example using the built in iris dataset
#' s <- SEMSummary(~ Sepal.Length + Sepal.Width + Petal.Length, data = iris)
#' s # show output ... not very nice
#'
#' ## Prettier output from SEMSummary
#' APAStyler(s)
#'
#' #### Subset the dataset and use the . expansion ####
#'
#' ## summary for all variables in mtcars data set
#' ## with 11 variables, this could be a pain to write out
#' SEMSummary(~ ., data = mtcars)
#'
#' ## . expansion is also useful when we know column positions
#' ## but not necessarily names
#' SEMSummary(~ ., data = mtcars[, c(1, 2, 3, 9, 10, 11)])
#'
#' ## clean up
#' rm(s)
#'
#' ## sample data
#' Xmiss <- as.matrix(iris[, -5])
#' # make 10% missing completely at random
#' set.seed(10)
#' Xmiss[sample(length(Xmiss), length(Xmiss) * .10)] <- NA
#' Xmiss <- as.data.frame(Xmiss)
#'
#' SEMSummary(~ ., data = Xmiss, use = "fiml")
#'
#'
#' ## pairwise
#' APAStyler(SEMSummary(~ ., data = Xmiss, use = "pair"),
#'   type = "cor")
#'
#' ## same as cor()
#' cor(Xmiss, use = "pairwise.complete.obs")
#'
#' ## complete cases only
#' SEMSummary(~ ., data = Xmiss, use = "comp")
#'
#' ## clean up
#' rm(Xmiss)
SEMSummary <- function(formula, data,
  use = c("fiml", "pairwise.complete.obs", "complete.obs")) {
  env <- environment(formula)

  if (isFALSE(is.data.frame(data))) {
    data <- as.data.frame(data)
  }

  use <- match.arg(use)

  tmp <- unlist(strsplit(paste(deparse(formula), collapse = ""), "\\|"))
  formula <- as.formula(tmp[1], env = env)

  if (length(tmp) > 1) {
    condition <- as.formula(paste0("~ ", tmp[2]), env = env)
    vars <- attr(terms(condition, data = data), "variables")
    vnames <- as.character(vars)[-1L]

    grouping <- interaction(eval(vars, data, env), drop = TRUE)

    output <- by(data[, -(colnames(data) %?in% vnames)],
      grouping,
      FUN = function(d) SEMSummary.fit(formula, d, use = use))
    output$Levels <- levels(grouping)

    class(output) <- "SEMSummary.list"
  } else {
    output <- SEMSummary.fit(formula, data, use = use)
  }

  return(output)
}
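
## The code above also accepts a conditioning variable after a vertical bar
## in the formula, returning one summary per group level ("SEMSummary.list").
## A brief sketch of that usage (not shown in the roxygen examples above);
## kept inside `if (FALSE)` so it never runs at package load.
if (FALSE) {
  s <- SEMSummary(~ Sepal.Length + Sepal.Width + Petal.Length | Species,
                  data = iris)
  s$Levels  # the grouping levels: setosa, versicolor, virginica
}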

#' Summary Statistics for a SEM Analysis
#'
#' This is the low-level fitting function used by \code{SEMSummary}.
#'
#' @param formula A formula of the variables to be used in the analysis.
#'   See the \sQuote{details} section for more information.
#' @param data A data frame, matrix, or list containing the variables
#'   used in the formula.  This is a required argument.
#' @param use A character string indicating how to handle missing data.
#'   One of \dQuote{fiml} (the default), \dQuote{pairwise.complete.obs},
#'   or \dQuote{complete.obs}.
#' @return A list with S3 class \dQuote{SEMSummary}
#'   \item{names}{A character vector containing the variable names.}
#'   \item{n}{An integer vector of the length of each variable used
#'     (this includes available and missing data).}
#'   \item{nmissing}{An integer vector of the number of missing
#'                   values in each variable.}
#'   \item{mu}{A vector of the arithmetic means of each variable
#'             (on complete data).}
#'   \item{stdev}{A numeric vector of the standard deviations of each
#'                variable (on complete data).}
#'   \item{Sigma}{The numeric covariance matrix for all variables.}
#'   \item{sSigma}{The numeric correlation matrix for all variables.}
#'   \item{coverage}{A numeric matrix giving the percentage
#'                   (technically decimal) of information available
#'                   for each pairwise covariance/correlation.}
#'   \item{pvalue}{The two-sided p values for the correlation matrix.
#'                 Pairwise present N used to calculate degrees of freedom.}
#' @seealso \code{\link{SEMSummary}}
#' @keywords multivariate
#' @importFrom stats cov cov2cor pt cor
SEMSummary.fit <- function(formula, data,
  use = c("fiml", "pairwise.complete.obs", "complete.obs")) {

  use <- match.arg(use)

  vars <- attr(terms(formula, data = data), "variables")
  vnames <- as.character(vars)[-1L]
  if (length(vnames) < 2) {
    stop("You must specify at least 2 variables to use this function")
  }
  env <- environment(formula)

  X <- eval(vars, data, env)
  names(X) <- vnames
  X <- as.data.frame(X)

  if (all(!is.na(X)) && use == "fiml") {
    use <- "complete.obs"
  }

  res <- switch(use,
    fiml = { moments(X) },
    pairwise.complete.obs = {
      list(mu = colMeans(X, na.rm = TRUE),
        sigma = cov(X, use = "pairwise.complete.obs"))
    },
    complete.obs = {
      list(mu = colMeans(na.omit(X)),
        sigma = cov(na.omit(X)))
    }
  )

  mu <- res$mu
  Sigma <- res$sigma
  stdev <- sqrt(diag(Sigma))

  if (identical(use, "pairwise.complete.obs")) {
    sSigma <- cor(X, use = "pairwise.complete.obs")
  } else {
    sSigma <- cov2cor(Sigma)
  }

  n <- nrow(X)
  L <- is.na(X)
  nmiss <- colSums(L)
  i <- which(upper.tri(Sigma), arr.ind = TRUE)
  pairmiss <- apply(i, 1L, function(j) {
    sum(L[, j[1]] | L[, j[2]])
  })
  pvalue <- coverage <- matrix(NA, nrow = ncol(X), ncol = ncol(X))
  diag(coverage) <- (n - nmiss) / n
  coverage[i] <- (n - pairmiss) / n
  coverage[i[, c(2, 1)]] <- (n - pairmiss) / n
  dimnames(coverage) <- dimnames(Sigma)

  df <- (coverage * n) - 2
  statistic <- sqrt(df) * sSigma / sqrt(1 - sSigma^2)
  p <- pt(statistic, df)
  pvalue[] <- 2 * pmin(p, 1 - p)
  diag(pvalue) <- NA
  dimnames(pvalue) <- dimnames(Sigma)

  names(nmiss) <- names(mu) <- names(stdev) <- names(X)

  output <- list(
    names = vnames, n = n, nmissing = nmiss, mu = mu, stdev = stdev,
    Sigma = Sigma, sSigma = sSigma, coverage = coverage, pvalue = pvalue)
  class(output) <- "SEMSummary"

  return(output)
}
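
## The p values above use the usual t transform of a correlation,
## t = r * sqrt(n - 2) / sqrt(1 - r^2), with the pairwise-present n.
## With complete data this should agree with cor.test(); a small check,
## kept inside `if (FALSE)` so it never runs at package load.
if (FALSE) {
  s <- SEMSummary(~ mpg + hp + wt, data = mtcars)
  s$pvalue["mpg", "hp"]
  stats::cor.test(mtcars$mpg, mtcars$hp)$p.value  # essentially identical
}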


## clear R CMD CHECK notes
if (getRversion() >= "2.15.1")  utils::globalVariables(c("dv1", "dv2"))

#' Make a Table of Descriptive Statistics
#'
#' Given a dataset and a list of variables, or just the data in
#' \code{vars}, provides a table of estimated descriptive statistics,
#' optionally by group levels.  For best results, convert categorical
#' variables into factors.
#'
#' @param vars Either an index (numeric or character) of
#'   variables to access from the \code{data} argument,
#'   or the data to be described itself.
#' @param g A variable used to group/separate the data prior
#'   to calculating descriptive statistics.
#' @param data An optional dataset containing
#'   the variables to be described.
#' @param idvar A character string indicating the variable name
#'   of the ID variable.  Currently only used to match observations
#'   when \code{paired = TRUE}; fuller support for repeated measures
#'   data may be added eventually.
#' @param strict Logical, whether to strictly follow the
#'   class of each variable.  If \code{FALSE}, numeric variables with
#'   three or fewer unique values are treated as categorical.
#' @param parametric Logical whether to use parametric tests in the
#'   case of multiple groups to test for differences.  Only applies to
#'   continuous variables. If \code{TRUE}, the default, uses one-way ANOVA
#'   and an F test. If \code{FALSE}, uses the Kruskal-Wallis test.
#'   May also be a logical vector with one element per variable.
#' @param paired Logical whether the data are paired or not. Defaults to
#'   \code{FALSE}. If \code{TRUE}, the grouping variable, \code{g},
#'   must have two levels and \code{idvar} must be specified. When used,
#'   a paired t-test is used for parametric, continuous data; a paired
#'   Wilcoxon test for non-parametric, continuous data; and a McNemar
#'   chi-square test for categorical data.
#' @param simChisq Logical whether to estimate p-values by simulation for
#'   the chi-square tests of categorical data when there are multiple
#'   groups. Defaults to \code{FALSE}. Useful when some cells are small,
#'   as it provides a more accurate test in extreme cases, similar to a
#'   Fisher exact test but generalizing to tables of any dimension.
#' @param sims Integer for the number of simulations to be used to estimate
#'   p-values for the chi-square tests for categorical variables when
#'   there are multiple groups. Defaults to one million (\code{1e6L}).
#' @return A data frame of the table.
#' @keywords utils
#' @export
#' @importFrom stats sd aov chisq.test kruskal.test quantile xtabs t.test 
#' @importFrom stats wilcox.test mcnemar.test
#' @importFrom data.table is.data.table as.data.table setnames
#' @importFrom extraoperators %snin%
#' @examples
#' egltable(iris)
#' egltable(colnames(iris)[1:4], "Species", data = iris)
#' egltable(iris, parametric = FALSE)
#' egltable(colnames(iris)[1:4], "Species", iris,
#'   parametric = FALSE)
#' egltable(colnames(iris)[1:4], "Species", iris,
#'   parametric = c(TRUE, TRUE, FALSE, FALSE))
#' egltable(colnames(iris)[1:4], "Species", iris,
#'   parametric = c(TRUE, TRUE, FALSE, FALSE), simChisq=TRUE)
#'
#' diris <- data.table::as.data.table(iris)
#' egltable("Sepal.Length", g = "Species", data = diris)
#'
#' tmp <- mtcars
#' tmp$cyl <- factor(tmp$cyl)
#' tmp$am <- factor(tmp$am, levels = 0:1)
#'
#' egltable(c("mpg", "hp"), "vs", tmp)
#' egltable(c("mpg", "hp"), "am", tmp)
#' egltable(c("am", "cyl"), "vs", tmp)
#'
#' tests <- with(sleep,
#'     wilcox.test(extra[group == 1],
#'            extra[group == 2], paired = TRUE))
#' str(tests)
#'
#' ## example with paired data
#' egltable(c("extra"), g = "group", data = sleep, idvar = "ID", paired = TRUE)
#'
#' ## what happens when ignoring pairing (p-value off)
#' # egltable(c("extra"), g = "group", data = sleep, idvar = "ID")
#'
#' ## paired categorical data example
#' ## using data on chick weights to create categorical data
#' tmp <- subset(ChickWeight, Time %in% c(0, 20))
#' tmp$WeightTertile <- cut(tmp$weight,
#'   breaks = quantile(tmp$weight, c(0, 1/3, 2/3, 1), na.rm = TRUE),
#'   include.lowest = TRUE)
#'
#' egltable(c("weight", "WeightTertile"), g = "Time",
#'   data = tmp,
#'   idvar = "Chick", paired = TRUE)
#'
#' rm(tmp)
egltable <- function(vars, g, data, idvar, strict = TRUE, parametric = TRUE,
                     paired = FALSE, simChisq = FALSE, sims = 1e6L) {
  if (isFALSE(missing(data))) {
    present <- vars %in% names(data)
    if (isFALSE(all(present))) {
      absent <- vars %snin% names(data)
      errmsg <- sprintf(
        "The following variable name(s) were not found:\n%s",
        paste(absent, collapse = "\n"))
      stop(errmsg)
    }
    if (is.data.table(data)) {
      dat <- data[, vars, with = FALSE]
    } else {
      dat <- as.data.table(data[, vars, drop = FALSE])
    }
    if (isFALSE(missing(g))) {
      if (length(g) == 1) {
        g <- data[[g]]
      }
    }
    if (isFALSE(missing(idvar))) {
      ids <- data[[idvar]]
      if (anyNA(ids)) stop("cannot have missing IDs")
    }
  } else {
    dat <- as.data.table(vars)
  }

  if (missing(g)) {
    g <- rep(1, nrow(dat))
  }

  g <- droplevels(as.factor(g))

  if (isTRUE(paired)) {
    stopifnot(identical(length(unique(g)), 2L))
    stopifnot(isFALSE(missing(idvar)))
  }

  if (identical(length(parametric), 1L)) {
    if (isTRUE(parametric)) {
      parametric <- rep(TRUE, length(vars))
    } else {
      parametric <- rep(FALSE, length(vars))
    }
  }

  vnames <- names(dat)

  ##k <- ncol(dat)

  testmiss <- .allmissing(dat)
  if (!isFALSE(testmiss)) {
    stop(testmiss)
  }

  contvars.index <- unlist(lapply(dat, function(x) {
    (is.integer(x) | is.numeric(x)) &
      ((length(unique(x)) > 3) | strict)
  }))

  catvars.index <- which(!contvars.index)
  contvars.index <- which(contvars.index)

  if (length(contvars.index)) {
    if (length(unique(parametric[contvars.index])) > 1) {
      multi <- TRUE
    } else {
      multi <- FALSE
    }
  } else {
    multi <- FALSE
  }

  if (isTRUE(length(catvars.index) > 0)) {
    for (n in vnames[catvars.index]) {
      if (isFALSE(is.factor(dat[[n]]))) {
        dat[[n]] <- factor(dat[[n]])
      }
    }
  }

  tmpout <- lapply(levels(g), function(gd) {
    d <- dat[which(g == gd)]
    testmiss <- .allmissing(d)
    if (!isFALSE(testmiss)) {
      testmiss <- sprintf("In group: %s\n%s", gd, testmiss)
      stop(testmiss)
    }

    tmpres <- NULL
    reslab <- ""

    if (isTRUE(length(contvars.index) > 0)) {
      tmpcont <- lapply(contvars.index, function(v) {
        n <- vnames[v]
        if (parametric[v]) {
          .stylemsd(n, d[[n]], digits = 2, includeLabel = multi)
        } else {
          .stylemdniqr(n, d[[n]], digits = 2, includeLabel = multi)
        }
      })

      names(tmpcont) <- vnames[contvars.index]
      tmpres <- c(tmpres, tmpcont)

      reslab <- paste0(reslab, c(ifelse(parametric[contvars.index[1]],
                                        "M (SD)", "Mdn (IQR)"), "See Rows")[multi + 1])
    }

    if (isTRUE(length(catvars.index) > 0)) {
      tmpcat <- lapply(vnames[catvars.index], function(n) {
        .stylefreq(n, d[[n]])
      })

      names(tmpcat) <- vnames[catvars.index]
      tmpres <- c(tmpres, tmpcat)

      reslab <- paste0(reslab, ifelse(nzchar(reslab),
                                      "/N (%)", "N (%)"))
    }

    tmpres <- lapply(tmpres[vnames], function(d) {
      setnames(d, old = names(d), c("Vars", reslab))
      return(d)
      })

    return(tmpres)
  })


  if (isTRUE(length(levels(g)) > 1)) {
    tmpout <- lapply(seq_along(vnames), function(v) {

      out <- do.call(cbind, lapply(seq_along(levels(g)), function(i) {
        d <- tmpout[[i]][[v]]
        setnames(d, old = names(d)[2], 
          paste(levels(g)[i], names(d)[2], sep = " "))
        if (isTRUE(i == 1)) {
          return(d)
        } else {
          return(d[, -1, with = FALSE])
        }
      }))

      if (isTRUE(length(contvars.index) > 0)) {
        if (isTRUE(v %in% contvars.index)) {
          if (isTRUE(parametric[v])) {
            if (isTRUE(length(levels(g)) > 2)) {
              out <- cbind(out, Test = c(
                .styleaov(dat[[v]], g), rep("", nrow(out) - 1)))
            } else if (isTRUE(length(levels(g)) == 2) && isFALSE(paired)) {
              out <- cbind(out, Test = c(
                .style2sttest(dat[[v]], g), rep("", nrow(out) - 1)))
            } else if (isTRUE(length(levels(g)) == 2) && isTRUE(paired)) {
              out <- cbind(out, Test = c(
                .stylepairedttest(dat[[v]], g, ids), rep("", nrow(out) - 1)))
            }
          } else {
            if (isFALSE(paired)) {
              out <- cbind(out, Test = c(
                .stylekruskal(dat[[v]], g), rep("", nrow(out) - 1)))
            } else if (isTRUE(length(levels(g)) == 2) && isTRUE(paired)) {
              out <- cbind(out, Test = c(
                .stylepairedwilcox(dat[[v]], g, ids), rep("", nrow(out) - 1)))
            }
          }
        }
      }

      if (isTRUE(length(catvars.index) > 0)) {
        if (isTRUE(v %in% catvars.index)) {

          if (isFALSE(paired)) {
            out <- cbind(out, Test = c(
              .stylechisq(dat[[v]], g, simChisq = simChisq, sims = sims),
              rep("", nrow(out) - 1)))
          } else if (isTRUE(length(levels(g)) == 2) && isTRUE(paired)) {
            out <- cbind(out, Test = c(
              .stylepairedmcnemar(dat[[v]], g, ids), rep("", nrow(out) - 1)))
          }
        }
      }

      return(out)
    })
  } else {
    tmpout <- tmpout[[1]]
  }

  out <- do.call(rbind, tmpout)
  setnames(out, old = names(out)[1], "")

  return(out)
}
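
## Sketch of how `strict` affects the classification done above: with
## strict = FALSE, a numeric variable with three or fewer unique values
## (e.g., mtcars$cyl) is summarized as categorical (N (%)) rather than
## continuous (M (SD)). Kept inside `if (FALSE)` so it never runs at
## package load.
if (FALSE) {
  egltable(c("cyl", "mpg"), data = mtcars, strict = TRUE)   # cyl as M (SD)
  egltable(c("cyl", "mpg"), data = mtcars, strict = FALSE)  # cyl as N (%)
}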

#' Winsorize at specified percentiles
#'
#' Simple function that winsorizes data at the specified percentile.
#'
#' @param d A vector, matrix, data frame, or data table to be winsorized
#' @param percentile The percentile, bounded by [0, 1], at which to winsorize
#'   the data (e.g., .05 winsorizes at the 5th and 95th percentiles).
#'   If a data frame or matrix is provided for the data, this should have the
#'   same length as the number of columns, or it will be repeated for all.
#' @param values If values are specified, these are used instead of
#'   calculating by percentiles.
#'   Should be a data frame with columns named \dQuote{low} and \dQuote{high}.
#'   If a data frame or matrix is provided for the data,
#'   \code{values} should have as many rows as the data has columns.
#' @param na.rm A logical whether to remove NAs.
#' @return The winsorized data. Attributes are included to list the exact values
#'   (for each variable, if a data frame or matrix) used to winsorize
#'   at the lower and upper ends.
#' @importFrom stats quantile
#' @importFrom data.table data.table := copy is.data.table
#' @importFrom extraoperators %age% %ale%
#' @export
#' @examples
#' dev.new(width = 10, height = 5)
#' par(mfrow = c(1, 2))
#' hist(as.vector(eurodist), main = "Eurodist")
#' hist(winsorizor(as.vector(eurodist), .05), 
#'   main = "Eurodist with lower and upper\n5% winsorized")
#'
#' library(data.table)
#' dat <- data.table(x = 1:5)
#' dat[, y := scale(1:5)]
#' winsorizor(dat$y, .01)
#'
#' ## make a copy of the data table
#' winsorizor(dat, .01)
#'
#' winsorizor(mtcars, .01)
#'
#' winsorizor(matrix(1:9, 3), .01)
#'
#' rm(dat) # clean up
winsorizor <- function(d, percentile, values, na.rm = TRUE) {
  if (isFALSE(missing(percentile))) {
    stopifnot(percentile %age% 0 && percentile %ale% 1)
  } else if (missing(percentile)) {
    percentile <- NA_real_
  }

  if (is.null(d)) {
    warning("d was NULL, no winsorization performed")
  } else {

    if (!is.vector(d) && !is.matrix(d) && 
        !is.data.frame(d) && !is.data.table(d)) {
      if (is.atomic(d) && is.null(dim(d))) {
        warning(paste0(
          "Atomic type with no dimensions, coercing to a numeric vector.\n",
          "To remove this warning, try wrapping the data in as.numeric() or\n",
          "otherwise coercing to a vector prior to passing to winsorizor()."))
        d <- as.numeric(d)
      }
    }
    stopifnot(is.vector(d) || is.matrix(d) || 
      is.data.frame(d) || is.data.table(d))
    dismatrix <- is.matrix(d)

    f <- function(x, percentile, values, na.rm) {
      if (!missing(values)) {
        low <- values[, "low"]
        high <- values[, "high"]
      } else {
        low <- quantile(x, probs = 0 + percentile, na.rm = na.rm)
        high <- quantile(x, probs = 1 - percentile, na.rm = na.rm)
      }

      out <- pmin(pmax(x, low), high)

      new.attr <- data.frame(low = low, high = high, percentile = percentile)
      rownames(new.attr) <- NULL

      attributes(out) <- c(attributes(x), winsorizedValues = list(new.attr))

      return(out)
    }

    if (is.vector(d)) {
      d <- f(d, percentile = percentile, values = values, na.rm = na.rm)
    } else if (is.matrix(d) || is.data.frame(d) || is.data.table(d)) {
      if (length(percentile) == 1) {
        percentile <- rep(percentile, ncol(d))
      }

      if (is.data.table(d)) {
        d <- copy(d)
        if (missing(values)) {
          for (i in seq_len(ncol(d))) {
            v <- names(d)[i]
            d[, (v) := f(get(v), percentile = percentile[i], na.rm = na.rm)]
          }
        } else {
          for (i in seq_len(ncol(d))) {
            v <- names(d)[i]
            d[, (v) := f(get(v), percentile = percentile[i], 
              values = values[i, ], na.rm = na.rm)]
          }
        }

        all.attr <- do.call(rbind, lapply(seq_len(ncol(d)), 
          function(i) attr(d[[i]], "winsorizedValues")))
        all.attr$variable <- colnames(d)
        rownames(all.attr) <- NULL

        for (v in names(d)) {
          d[, (v) := as.vector(get(v))]
        }

      } else {

        if (missing(values)) {
          tmp <- lapply(seq_len(ncol(d)), function(i) {
            f(d[, i], percentile = percentile[i], na.rm = na.rm)
          })
        } else {
          tmp <- lapply(seq_len(ncol(d)), function(i) {
            f(d[, i], percentile = percentile[i],
              values = values[i, ], na.rm = na.rm)
          })
        }

        all.attr <- do.call(rbind, lapply(tmp,
          function(x) attr(x, "winsorizedValues")))
        all.attr$variable <- colnames(d)
        rownames(all.attr) <- NULL
        d <- as.data.frame(lapply(tmp, as.vector))
        colnames(d) <- all.attr$variable

        if (dismatrix) {
          d <- as.matrix(d)
        }

      }

      attributes(d) <- c(attributes(d), winsorizedValues = list(all.attr))
    }
  }

  return(d)
}
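
## Hand check of the percentile winsorization above: values are clamped to
## the lower and upper quantile bounds via pmin/pmax. Kept inside
## `if (FALSE)` so it never runs at package load.
if (FALSE) {
  x <- as.vector(eurodist)
  w <- winsorizor(x, .05)
  bounds <- quantile(x, c(.05, .95))
  all.equal(as.vector(w), pmin(pmax(x, bounds[1]), bounds[2]))  # expected TRUE
  attr(w, "winsorizedValues")  # the low/high values actually used
}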

#' Calculate a rounded five number summary
#'
#' Numbers are the minimum, 25th percentile, median,
#' 75th percentile, and maximum of the non-missing data.
#' Values returned are either the significant digits or rounded values,
#' whichever ends up resulting in the fewest total digits.
#'
#' @param x The data to have the summary calculated on
#' @param round The number of digits to try rounding
#' @param sig The number of significant digits to try
#' @return The rounded or significant digit five number summary
#' @importFrom stats fivenum
#' @examples
#' JWileymisc:::roundedfivenum(rnorm(1000))
#' JWileymisc:::roundedfivenum(mtcars$hp)
roundedfivenum <- function(x, round = 2, sig = 3) {
  x <- as.numeric(quantile(x[!is.na(x)], probs = c(0, .25, .5, .75, 1),
                type = 7))
  if (max(nchar(signif(x, sig))) < max(nchar(round(x, round)))) {
    signif(x, sig)
  } else {
    round(x, round)
  }
}
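
## Quick illustration of the rule above: the five numbers are reported as
## signif(., 3) only when that prints more compactly than round(., 2);
## otherwise rounding is used. Kept inside `if (FALSE)` so it never runs
## at package load.
if (FALSE) {
  roundedfivenum(c(10234.6, 20456.8, 30987.7, 45678.1, 98765.4))
  ## -> significant digits (10200, 20500, ...) because they print shorter
  roundedfivenum(mtcars$hp)
  ## -> rounded values, because signif(., 3) is not strictly shorter here
}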

#' Calculate F and p-value from the R2
#'
#' @param r2 The R squared value
#' @param numdf The numerator degrees of freedom
#' @param dendf The denominator degrees of freedom
#' @return A named vector with the F statistic, numerator and denominator
#'   degrees of freedom, and p-value
#' @keywords internal
#' @importFrom stats pf
#' @examples
#' JWileymisc:::f.r2(.30, 1, 99)
f.r2 <- function(r2, numdf, dendf) {
  Fval <- (dendf / numdf) * (-r2 / (r2 - 1))
  p <- pf(Fval, df1 = numdf, df2 = dendf, lower.tail = FALSE)
  c(F = Fval[[1]], NumDF = numdf, DenDF = dendf, p = p[[1]])
}
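
## Hand check of the conversion above, F = (dendf / numdf) * R^2 / (1 - R^2),
## against summary.lm(). Kept inside `if (FALSE)` so it never runs at
## package load.
if (FALSE) {
  fit <- stats::lm(mpg ~ hp, data = mtcars)
  summary(fit)$fstatistic              # F with 1 and 30 df
  f.r2(summary(fit)$r.squared, 1, 30)  # same F, plus the p-value
}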