gtsummary: Summary Tables and Plots for Statistical Models and Data: Beautiful, Customizable, and Publication-Ready

Documented in datasummary_skim

#' Quick overview of numeric or categorical variables
#'
#' This function was inspired by the excellent `skimr` package for R.
#' See the Details and Examples sections below, and the vignettes on the
#' `modelsummary` website:
#' * https://modelsummary.com/
#' * https://modelsummary.com/vignettes/datasummary.html
#'
#' @inheritParams datasummary
#' @inheritParams modelsummary
#' @import data.table
#' @param type String. Variables to summarize: "all", "numeric", "categorical", "dataset"
#' @param by Character vector of grouping variables to compute statistics over.
#' @param fun_numeric Named list of funtions to apply to each numeric column of `data`. If `fun_numeric` includes "Histogram" or "Density", inline plots are inserted. This argument is only used when `type="numeric"` or `"all"`.
#'
#' @template kableExtra2tinytable
#' @template citation
#' @template options
#' @examplesIf FALSE
#' dat <- mtcars
#' dat$vs <- as.logical(dat$vs)
#' dat$cyl <- as.factor(dat$cyl)
#' datasummary_skim(dat)
#' datasummary_skim(dat, type = "categorical")
#'
#' @export
datasummary_skim <- function(
  data,
  output = getOption("modelsummary_output", default = "default"),
  type = getOption("modelsummary_type", default = "all"),
  fmt = 1,
  title = getOption("modelsummary_title", default = NULL),
  notes = getOption("modelsummary_notes", default = NULL),
  align = getOption("modelsummary_align", default = NULL),
  escape = getOption("modelsummary_escape", default = TRUE),
  by = getOption("modelsummary_by", default = NULL),
  fun_numeric = getOption(
    "modelsummary_fun_numeric",
    default = list(
      "Unique" = NUnique,
      "Missing Pct." = PercentMissing,
      "Mean" = Mean,
      "SD" = SD,
      "Min" = Min,
      "Median" = Median,
      "Max" = Max,
      "Histogram" = function(x) ""
    )
  ),
  ...
) {
  ## settings
  settings_init(settings = list("function_called" = "datasummary_skim"))
  tmp <- sanitize_output(output) # before sanitize_escape
  output_format <- tmp$output_format
  output_factory <- tmp$output_factory
  output_file <- tmp$output_file
  sanitize_escape(escape) # after sanitize_output
  sanity_align(align)
  checkmate::assert_list(fun_numeric, min.len = 1, names = "unique")
  for (fun_numeric_element in fun_numeric) {
    checkmate::assert_function(fun_numeric_element)
  }
  checkmate::assert_data_frame(data)
  checkmate::assert_character(by, null.ok = TRUE)
  if (!is.null(by)) {
    checkmate::assert_true(all(by %in% colnames(data)))
  }

  dots <- list(...)
  if (isFALSE(dots[["histogram"]])) {
    fun_numeric[["Histogram"]] <- NULL
    insight::format_warning(
      "The `histogram` argument is deprecated. Use `fun_numeric` instead."
    )
  }

  # in 2.0.0, histogram is a tinytable-specific option.
  if (!settings_equal("output_factory", c("tinytable", "dataframe"))) {
    insight::format_warning(
      "Inline histograms in `datasummary_skim()` are only supported for tables produced by the `tinytable` backend."
    )
    fun_numeric[["Histogram"]] <- NULL
  }

  checkmate::assert_choice(type, c("all", "numeric", "categorical", "dataset"))

  # tables does not play well with tibbles
  data <- as.data.frame(data)

  if (type == "all" && !settings_equal("output_factory", "tinytable")) {
    insight::format_warning(
      "`type='all'` is only supported for the `tinytable` backend. Set the `type` argument explicitly to suppress this warning."
    )
    type <- "numeric"
  }

  if (type == "all") {
    a <- tryCatch(
      datasummary_skim_numeric(
        data,
        output = "tinytable",
        fmt = fmt,
        by = by,
        title = title,
        notes = notes,
        align = align,
        escape = FALSE,
        fun_numeric = fun_numeric,
        ...
      ),
      error = function(e) e$message
    )
    b <- tryCatch(
      datasummary_skim_categorical(
        data,
        output = "tinytable",
        fmt = fmt,
        title = title,
        notes = notes,
        align = align,
        escape = FALSE,
        ...
      ),
      error = function(e) e$message
    )

    sanitize_output(output)

    data_list <- attr(a, "data_list")

    if (inherits(a, "tinytable") && inherits(b, "tinytable")) {
      out <- tinytable::rbind2(a, b, use_names = FALSE)
      out <- tinytable::format_tt(out, replace = "")
      out <- tinytable::style_tt(
        out,
        i = nrow(a) + 1,
        line = "t",
        line_size = .3
      )
      if (identical(output_format, "html")) {
        out <- tinytable::style_tt(
          out,
          i = nrow(a) + 1,
          bold = TRUE,
          line = "bt",
          line_color = "#d3d8dc"
        )
      }
    } else if (!inherits(a, "tinytable") && !inherits(b, "tinytable")) {
      insight::format_error(a, b)
    } else if (inherits(a, "tinytable")) {
      out <- a
    } else if (inherits(b, "tinytable")) {
      out <- b
    }

    ofmt <- output_format
    if (
      isTRUE(ofmt %in% c("latex", "markdown", "html", "typst", "dataframe"))
    ) {
      out@output <- ofmt
    }
    ofile <- output_file
    if (!is.null(ofile)) {
      tinytable::save_tt(out, output = ofile, overwrite = TRUE)
    }

    out <- tinytable::format_tt(out, escape = escape)
  } else if (type == "numeric") {
    out <- datasummary_skim_numeric(
      data,
      output = output,
      fmt = fmt,
      by = by,
      title = title,
      notes = notes,
      align = align,
      escape = escape,
      fun_numeric = fun_numeric,
      ...
    )
    sanitize_output(output)

    data_list <- attr(out, "data_list")
  } else if (type == "categorical") {
    out <- datasummary_skim_categorical(
      data,
      output = output,
      fmt = fmt,
      title = title,
      notes = notes,
      align = align,
      escape = escape,
      ...
    )
    sanitize_output(output)
  } else if (type == "dataset") {
    out <- datasummary_skim_dataset(
      data,
      output = output,
      title = title,
      notes = notes,
      align = align,
      escape = escape,
      ...
    )
    sanitize_output(output)
  }

  if (inherits(out, "tinytable")) {
    if ("Histogram" %in% out@names && !is.null(data_list)) {
      out <- tinytable::plot_tt(
        out,
        i = seq_along(data_list),
        j = "Histogram",
        fun = "histogram",
        data = data_list
      )
    }
    if ("Density" %in% out@names && !is.null(data_list)) {
      out <- tinytable::plot_tt(
        out,
        i = seq_along(data_list),
        j = "Density",
        fun = "density",
        data = data_list
      )
    }
  }

  if (!is.null(output_file)) {
    settings_rm()
    return(invisible(out))
  } else {
    if (
      output == "jupyter" ||
        (output == "default" && settings_equal("output_default", "jupyter"))
    ) {
      insight::check_if_installed("IRdisplay")
      return(invisible(IRdisplay::display_html(as.character(out))))
    }
    settings_rm()
    return(out)
  }
}

#' Internal function to skim whole datasets
#'
#' @noRd
datasummary_skim_dataset <- function(
  data,
  output,
  title,
  notes,
  align,
  escape,
  ...
) {
  is.binary <- function(x) {
    tryCatch(
      length(unique(stats::na.omit(x))) == 2,
      error = function(e) FALSE,
      silent = TRUE
    )
  }
  rounding <- fmt_decimal(digits = 0)
  out <- c(
    Rows = rounding(nrow(data)),
    Columns = rounding(ncol(data)),
    # `# Binary` = rounding(sum(sapply(data, is.binary))),
    `# Character` = rounding(sum(sapply(data, is.character))),
    `# Factor` = rounding(sum(sapply(data, is.factor))),
    `# Logical` = rounding(sum(sapply(data, is.logical))),
    `# Numeric` = rounding(sum(sapply(data, is.numeric))),
    `% Missing` = rounding(mean(is.na(data) * 100))
  )
  out <- data.frame(names(out), out)
  out <- out[out[[2]] != "0" | out[[1]] == "% Missing", ]
  row.names(out) <- NULL
  colnames(out) <- c(" ", "  ")

  out <- datasummary_df(
    data = out,
    output = output,
    title = title,
    align = align,
    notes = notes,
    ...
  )

  return(out)
}


#' Internal function to skim numeric variables
#'
#' @noRd
datasummary_skim_numeric <- function(
  data,
  output,
  fmt,
  title,
  notes,
  align,
  escape,
  by = NULL,
  fun_numeric = NULL,
  ...
) {
  # subset of numeric variables
  idx <- sapply(data, is.numeric)
  idx[colnames(data) %in% by] <- TRUE
  if (!any(idx)) insight::format_error("data contains no numeric variable.")
  dat_new <- data[, idx, drop = FALSE]

  # subset of non-NA variables
  idx <- sapply(dat_new, function(x) !all(is.na(x)))
  if (!any(idx))
    insight::format_error("all numeric variables are completely missing.")
  dat <- dat_new[, idx, drop = FALSE]

  # too large
  if (ncol(dat) > 250) {
    insight::format_error("Cannot summarize more than 250 variables at a time.")
  }

  cols <- setdiff(colnames(dat), by)

  dat <- data.table::as.data.table(dat)

  funcs <- list(
    "Variable" = function(x) "",
    "Internal Data List" = function(x) list(x)
  )
  funcs <- c(funcs, fun_numeric)

  # Compute
  rows <- list()
  for (v in cols) {
    tmp <- dat[,
      lapply(funcs, function(funny) funny(variable)),
      by = by,
      env = list("variable" = v)
    ][,
      Variable := v
    ]
    for (i in seq_along(tmp)) {
      class(tmp[[i]]) <- setdiff(
        class(tmp[[i]]),
        c("haven_labelled", "vctrs_vctr")
      )
    }
    rows <- c(rows, list(tmp))
  }
  rows <- data.table::rbindlist(rows)

  data_list <- rows[["Internal Data List"]]

  # labels
  for (col in colnames(data)) {
    lab <- attr(data[[col]], "label")
    if (!is.null(lab)) {
      rows[Variable == col, Variable := lab]
    }
  }

  rows[, Variable := dedup(Variable)]
  rows[, `Internal Data List` := NULL]
  idx <- unique(c("Variable", by, colnames(rows)))
  rows <- rows[, ..idx]
  data.table::setnames(rows, old = "Variable", new = " ")

  out <- datasummary_df(rows, fmt = fmt, output = output)

  attr(out, "data_list") <- data_list

  return(out)
}


#' Internal function to skim categorical variables
#'
#' @noRd
datasummary_skim_categorical <- function(
  data,
  output,
  fmt,
  title,
  notes,
  align,
  escape,
  ...
) {
  dat_new <- data

  # pad colnames in case one is named Min, Max, Mean, or other function name
  # colnames(dat_new) <- paste0(colnames(dat_new), " ")

  drop_too_many_levels <- NULL
  drop_entirely_na <- NULL

  for (n in colnames(dat_new)) {
    # completely missing
    if (all(is.na(dat_new[[n]]))) {
      dat_new[[n]] <- NULL
      drop_entirely_na <- c(drop_entirely_na, n)
    }

    if (
      is.logical(dat_new[[n]]) |
        is.character(dat_new[[n]]) |
        is.factor(dat_new[[n]])
    ) {
      # convert to factor and keep NAs as distinct level
      if (is.logical(dat_new[[n]]) | is.character(dat_new[[n]])) {
        dat_new[[n]] <- factor(dat_new[[n]], exclude = NULL)
      }

      # tables::tabular breaks on ""
      if (is.factor(dat_new[[n]]) && "" %in% levels(dat_new[[n]])) {
        idx <- levels(dat_new[[n]]) == ""
        levels(dat_new[[n]])[idx] <- " "
      }

      ## factors with too many levels
      if (is.factor(dat_new[[n]])) {
        if (length(levels(dat_new[[n]])) > 50) {
          dat_new[[n]] <- NULL
          drop_too_many_levels <- c(drop_too_many_levels, n)
        }
      }

      # discard non-factors
    } else {
      dat_new[[n]] <- NULL
    }
  }

  # too small
  if (ncol(dat_new) == 0) {
    stop("data contains no logical, character, or factor variable.")
  }

  # too large
  if (ncol(dat_new) > 50) {
    stop("Cannot summarize more than 50 variables at a time.")
  }

  if (!is.null(drop_too_many_levels)) {
    warning(
      sprintf(
        "These variables were omitted because they include more than 50 levels: %s.",
        paste(drop_too_many_levels, collapse = ", ")
      ),
      call. = FALSE
    )
  }

  if (!is.null(drop_entirely_na)) {
    warning(
      sprintf(
        "These variables were omitted because they are entirely missing: %s.",
        paste(drop_entirely_na, collapse = ", ")
      ),
      call. = FALSE
    )
  }

  pctformat <- sanitize_fmt(fmt)
  f <- All(
    dat_new,
    numeric = FALSE,
    factor = TRUE,
    logical = TRUE,
    character = TRUE
  ) ~
    (N = 1) * Format() + (`%` = Percent()) * Format(pctformat())

  datasummary(
    formula = f,
    data = dat_new,
    output = output,
    title = title,
    align = align,
    notes = notes
  )
}


dedup <- function(x) {
  if (length(x) < 2) {
    return(x)
  }
  for (i in length(x):2) {
    if (x[i] == x[i - 1]) {
      x[i] <- NA
    }
  }
  if (is.character(x)) {
    x[is.na(x)] <- ""
  }
  return(x)
}

vincentarelbundock/gtsummary documentation built on June 13, 2025, 5:57 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

vincentarelbundock/gtsummary
Summary Tables and Plots for Statistical Models and Data: Beautiful, Customizable, and Publication-Ready

R/datasummary_skim.R
In vincentarelbundock/gtsummary: Summary Tables and Plots for Statistical Models and Data: Beautiful, Customizable, and Publication-Ready

Defines functions dedup datasummary_skim_categorical datasummary_skim_numeric datasummary_skim_dataset datasummary_skim

Documented in datasummary_skim

R Package Documentation

Browse R Packages

We want your feedback!

vincentarelbundock/gtsummary Summary Tables and Plots for Statistical Models and Data: Beautiful, Customizable, and Publication-Ready

R/datasummary_skim.R In vincentarelbundock/gtsummary: Summary Tables and Plots for Statistical Models and Data: Beautiful, Customizable, and Publication-Ready

Defines functions dedup datasummary_skim_categorical datasummary_skim_numeric datasummary_skim_dataset datasummary_skim

Documented in datasummary_skim

R Package Documentation

Browse R Packages

We want your feedback!

vincentarelbundock/gtsummary
Summary Tables and Plots for Statistical Models and Data: Beautiful, Customizable, and Publication-Ready

R/datasummary_skim.R
In vincentarelbundock/gtsummary: Summary Tables and Plots for Statistical Models and Data: Beautiful, Customizable, and Publication-Ready