R/describe_distribution.R

Defines functions .boot_distribution print.parameters_distribution describe_distribution.grouped_df describe_distribution.data.frame describe_distribution.character describe_distribution.factor describe_distribution.numeric describe_distribution

Documented in describe_distribution describe_distribution.data.frame describe_distribution.factor describe_distribution.numeric

#' Describe a distribution
#'
#' This function describes a distribution by a set of indices (e.g., measures of
#' centrality, dispersion, range, skewness, kurtosis).
#'
#' @param x A numeric vector.
#' @param range Return the range (min and max).
#' @param quartiles Return the first and third quartiles (25th and 75pth
#'   percentiles).
#' @param include_factors Logical, if `TRUE`, factors are included in the
#'   output, however, only columns for range (first and last factor levels) as
#'   well as n and missing will contain information.
#' @param ci Confidence Interval (CI) level. Default is `NULL`, i.e. no
#'   confidence intervals are computed. If not `NULL`, confidence intervals
#'   are based on bootstrap replicates (see `iterations`). If
#'   `centrality = "all"`, the bootstrapped confidence interval refers to
#'   the first centrality index (which is typically the median).
#' @param iterations The number of bootstrap replicates for computing confidence
#'   intervals. Only applies when `ci` is not `NULL`.
#' @param iqr Logical, if `TRUE`, the interquartile range is calculated
#'   (based on [stats::IQR()], using `type = 6`).
#' @param verbose Toggle warnings and messages.
#' @inheritParams bayestestR::point_estimate
#'
#' @note There is also a
#'   [`plot()`-method](https://easystats.github.io/see/articles/parameters.html)
#'   implemented in the
#'   \href{https://easystats.github.io/see/}{\pkg{see}-package}.
#'
#' @return A data frame with columns that describe the properties of the variables.
#' @examples
#' describe_distribution(rnorm(100))
#'
#' data(iris)
#' describe_distribution(iris)
#' describe_distribution(iris, include_factors = TRUE, quartiles = TRUE)
#' @export
describe_distribution <- function(x, ...) {
  UseMethod("describe_distribution")
}


#' @rdname describe_distribution
#' @export
describe_distribution.numeric <- function(x,
                                          centrality = "mean",
                                          dispersion = TRUE,
                                          iqr = TRUE,
                                          range = TRUE,
                                          quartiles = FALSE,
                                          ci = NULL,
                                          iterations = 100,
                                          threshold = .1,
                                          verbose = TRUE,
                                          ...) {
  out <- data.frame(.temp = 0)

  # Missing
  n_missing <- sum(is.na(x))
  x <- stats::na.omit(x)

  # Point estimates
  out <- cbind(
    out,
    bayestestR::point_estimate(
      x,
      centrality = centrality,
      dispersion = dispersion,
      threshold = threshold,
      ...
    )
  )


  # interquartile range, type same as minitab and SPSS
  if (iqr) {
    out$IQR <- stats::IQR(x, na.rm = TRUE, type = 6)
  }


  # Confidence Intervals
  if (!is.null(ci)) {
    insight::check_if_installed("boot")
    results <- boot::boot(
      data = x,
      statistic = .boot_distribution,
      R = iterations,
      centrality = centrality
    )
    out_ci <- bayestestR::ci(results$t, ci = ci, verbose = FALSE)
    out <- cbind(out, data.frame(CI_low = out_ci$CI_low[1], CI_high = out_ci$CI_high[1]))
  }


  # Range
  if (range) {
    out <- cbind(
      out,
      data.frame(
        Min = min(x, na.rm = TRUE),
        Max = max(x, na.rm = TRUE)
      )
    )
  }

  # Quartiles
  if (quartiles) {
    out <- cbind(
      out,
      data.frame(
        Q1 = stats::quantile(x, probs = 0.25, na.rm = TRUE),
        Q3 = stats::quantile(x, probs = 0.75, na.rm = TRUE)
      )
    )
  }

  # Skewness
  out <- cbind(
    out,
    data.frame(
      Skewness = as.numeric(skewness(x, verbose = verbose)),
      Kurtosis = as.numeric(kurtosis(x, verbose = verbose))
    )
  )

  out$n <- length(x)
  out$n_Missing <- n_missing
  out$`.temp` <- NULL

  class(out) <- unique(c("parameters_distribution", "see_parameters_distribution", class(out)))
  attr(out, "data") <- x
  attr(out, "ci") <- ci
  attr(out, "threshold") <- threshold
  if (centrality == "all") attr(out, "first_centrality") <- colnames(out)[1]
  out
}


#' @rdname describe_distribution
#' @export
describe_distribution.factor <- function(x,
                                         dispersion = TRUE,
                                         range = TRUE,
                                         verbose = TRUE,
                                         ...) {

  # Missing
  n_missing <- sum(is.na(x))
  x <- stats::na.omit(x)

  out <- data.frame(
    Mean = NA,
    SD = NA,
    CI_low = NA,
    CI_high = NA,
    IQR = NA,
    Min = levels(x)[1],
    Max = levels(x)[nlevels(x)],
    Q1 = NA,
    Q3 = NA,
    Skewness = as.numeric(skewness(x, verbose = verbose)),
    Kurtosis = as.numeric(kurtosis(x, verbose = verbose)),
    n = length(x),
    n_Missing = n_missing,
    stringsAsFactors = FALSE
  )


  if (!dispersion) {
    out$SD <- NULL
  }


  dot.arguments <- list(...)

  if (is.null(dot.arguments[["ci"]])) {
    out$CI_low <- NULL
    out$CI_high <- NULL
  }

  if (is.null(dot.arguments[["iqr"]]) || isFALSE(dot.arguments[["iqr"]])) {
    out$IQR <- NULL
  }

  if (is.null(dot.arguments[["quartiles"]]) || isFALSE(dot.arguments[["quartiles"]])) {
    out$Q1 <- NULL
    out$Q3 <- NULL
  }

  if (!range) {
    out$Min <- NULL
    out$Max <- NULL
  }

  class(out) <- unique(c("parameters_distribution", "see_parameters_distribution", class(out)))
  attr(out, "data") <- x
  out
}



#' @export
describe_distribution.character <- function(x,
                                            dispersion = TRUE,
                                            range = TRUE,
                                            verbose = TRUE,
                                            ...) {

  # Missing
  n_missing <- sum(is.na(x))
  x <- stats::na.omit(x)
  values <- unique(x)

  out <- data.frame(
    Mean = NA,
    SD = NA,
    IQR = NA,
    CI_low = NA,
    CI_high = NA,
    Min = values[1],
    Max = values[length(values)],
    Q1 = NA,
    Q3 = NA,
    Skewness = as.numeric(skewness(x, verbose = verbose)),
    Kurtosis = as.numeric(kurtosis(x, verbose = verbose)),
    n = length(x),
    n_Missing = n_missing,
    stringsAsFactors = FALSE
  )


  if (!dispersion) {
    out$SD <- NULL
  }


  dot.arguments <- list(...)
  if (is.null(dot.arguments[["ci"]])) {
    out$CI_low <- NULL
    out$CI_high <- NULL
  }

  if (is.null(dot.arguments[["iqr"]]) || isFALSE(dot.arguments[["iqr"]])) {
    out$IQR <- NULL
  }

  if (is.null(dot.arguments[["quartiles"]]) || isFALSE(dot.arguments[["quartiles"]])) {
    out$Q1 <- NULL
    out$Q3 <- NULL
  }

  if (!range) {
    out$Min <- NULL
    out$Max <- NULL
  }

  class(out) <- unique(c("parameters_distribution", "see_parameters_distribution", class(out)))
  attr(out, "data") <- x
  out
}



#' @rdname describe_distribution
#' @export
describe_distribution.data.frame <- function(x,
                                             centrality = "mean",
                                             dispersion = TRUE,
                                             iqr = TRUE,
                                             range = TRUE,
                                             quartiles = FALSE,
                                             include_factors = FALSE,
                                             ci = NULL,
                                             iterations = 100,
                                             threshold = .1,
                                             verbose = TRUE,
                                             ...) {
  out <- do.call(rbind, lapply(x, function(i) {
    if ((include_factors && is.factor(i)) || (!is.character(i) && !is.factor(i))) {
      describe_distribution(
        i,
        centrality = centrality,
        dispersion = dispersion,
        iqr = iqr,
        range = range,
        quartiles = quartiles,
        ci = ci,
        iterations = iterations,
        threshold = threshold,
        verbose = verbose
      )
    }
  }))

  out$Variable <- row.names(out)
  row.names(out) <- NULL
  out <- out[c("Variable", setdiff(colnames(out), "Variable"))]

  class(out) <- unique(c("parameters_distribution", "see_parameters_distribution", class(out)))
  attr(out, "object_name") <- deparse(substitute(x), width.cutoff = 500)
  attr(out, "ci") <- ci
  attr(out, "threshold") <- threshold
  if (centrality == "all") attr(out, "first_centrality") <- colnames(out)[2]
  out
}




#' @export
describe_distribution.grouped_df <- function(x,
                                             centrality = "mean",
                                             dispersion = TRUE,
                                             iqr = TRUE,
                                             range = TRUE,
                                             quartiles = FALSE,
                                             include_factors = FALSE,
                                             ci = NULL,
                                             iterations = 100,
                                             threshold = .1,
                                             verbose = TRUE,
                                             ...) {
  group_vars <- setdiff(colnames(attributes(x)$groups), ".rows")
  group_data <- expand.grid(lapply(x[group_vars], function(i) unique(sort(i))))
  groups <- split(x, x[group_vars])

  out <- do.call(rbind, lapply(1:length(groups), function(i) {
    d <- describe_distribution.data.frame(
      groups[[i]],
      centrality = centrality,
      dispersion = dispersion,
      iqr = iqr,
      range = range,
      quartiles = quartiles,
      include_factors = include_factors,
      ci = ci,
      iterations = iterations,
      threshold = threshold,
      ...
    )
    d[[".group"]] <- paste(sprintf("%s=%s", group_vars, sapply(group_data[i, ], as.character)), collapse = " | ")
    d
  }))

  class(out) <- unique(c("parameters_distribution", "see_parameters_distribution", class(out)))
  attr(out, "object_name") <- deparse(substitute(x), width.cutoff = 500)
  attr(out, "ci") <- ci
  attr(out, "threshold") <- threshold
  if (centrality == "all") attr(out, "first_centrality") <- colnames(out)[2]
  out
}



#' @export
print.parameters_distribution <- function(x, digits = 2, ...) {
  formatted_table <- format(
    x,
    digits = digits,
    format = "text",
    ci_width = NULL,
    ci_brackets = TRUE,
    ...
  )
  cat(insight::export_table(formatted_table, format = "text", digits = digits))
  invisible(x)
}


# bootstrapping CIs ----------------------------------

.boot_distribution <- function(data, indices, centrality) {
  out <- datawizard::describe_distribution(
    data[indices],
    centrality = centrality,
    dispersion = FALSE,
    iqr = FALSE,
    range = FALSE,
    quartiles = FALSE,
    ci = NULL
  )
  out[[1]]
}

Try the datawizard package in your browser

Any scripts or data that you put into this service are public.

datawizard documentation built on Oct. 4, 2021, 9:07 a.m.