R/datasummary_skim.R
In modelsummary: Summary Tables and Plots for Statistical Models and Data: Beautiful, Customizable, and Publication-Ready

Documented in datasummary_skim

#' Quick overview of numeric or categorical variables
#'
#' This function was inspired by the excellent `skimr` package for R.
#' See the Details and Examples sections below, and the vignettes on the
#'
#' `modelsummary` website:
#' * https://modelsummary.com/
#' * https://modelsummary.com/vignettes/datasummary.html
#'
#' Note that the default `escape=TRUE` may not be best for all formats, such as LaTeX/PDF.
#'
#' @inheritParams datasummary
#' @inheritParams modelsummary
#' @import data.table
#' @param type String. Variables to summarize: "all", "numeric", "categorical", "dataset"
#' @param by Character vector of grouping variables to compute statistics over.
#' @param fun_numeric Named list of funtions to apply to each numeric column of `data`. If `fun_numeric` includes "Histogram" or "Density", inline plots are inserted. This argument is only used when `type="numeric"` or `"all"`.
#'
#' @template kableExtra2tinytable
#' @template citation
#' @template options
#' @examplesIf FALSE
#' dat <- mtcars
#' dat$vs <- as.logical(dat$vs)
#' dat$cyl <- as.factor(dat$cyl)
#' datasummary_skim(dat)
#' datasummary_skim(dat, type = "categorical")
#'
#' @export
datasummary_skim <- function(
  data,
  output = getOption("modelsummary_output", default = "default"),
  type = getOption("modelsummary_type", default = "all"),
  fmt = 1,
  title = getOption("modelsummary_title", default = NULL),
  notes = getOption("modelsummary_notes", default = NULL),
  align = getOption("modelsummary_align", default = NULL),
  escape = getOption("modelsummary_escape", default = TRUE),
  by = getOption("modelsummary_by", default = NULL),
  fun_numeric = getOption(
    "modelsummary_fun_numeric",
    default = list(
      "Unique" = NUnique,
      "Missing Pct." = PercentMissing,
      "Mean" = Mean,
      "SD" = SD,
      "Min" = Min,
      "Median" = Median,
      "Max" = Max,
      "Histogram" = function(x) ""
    )
  ),
  ...
) {
  ## settings
  settings_init(settings = list("function_called" = "datasummary_skim"))
  tmp <- sanitize_output(output) # before sanitize_escape
  output_format <- tmp$output_format
  output_factory <- tmp$output_factory
  output_file <- tmp$output_file
  sanitize_escape(escape) # after sanitize_output
  sanity_align(align)
  checkmate::assert_list(fun_numeric, min.len = 1, names = "unique")
  for (fun_numeric_element in fun_numeric) {
    checkmate::assert_function(fun_numeric_element)
  }
  checkmate::assert_data_frame(data)
  checkmate::assert_character(by, null.ok = TRUE)
  if (!is.null(by)) {
    checkmate::assert_true(all(by %in% colnames(data)))
  }

  dots <- list(...)
  if (isFALSE(dots[["histogram"]])) {
    fun_numeric[["Histogram"]] <- NULL
    insight::format_warning(
      "The `histogram` argument is deprecated. Use `fun_numeric` instead."
    )
  }

  # in 2.0.0, histogram is a tinytable-specific option.
  if (!settings_equal("output_factory", c("tinytable", "dataframe"))) {
    insight::format_warning(
      "Inline histograms in `datasummary_skim()` are only supported for tables produced by the `tinytable` backend."
    )
    fun_numeric[["Histogram"]] <- NULL
  }

  checkmate::assert_choice(type, c("all", "numeric", "categorical", "dataset"))

  # tables does not play well with tibbles
  data <- as.data.frame(data)

  if (type == "all" && !settings_equal("output_factory", "tinytable")) {
    insight::format_warning(
      "`type='all'` is only supported for the `tinytable` backend. Set the `type` argument explicitly to suppress this warning."
    )
    type <- "numeric"
  }

  if (type == "all") {
    a <- tryCatch(
      datasummary_skim_numeric(
        data,
        output = "tinytable",
        fmt = fmt,
        by = by,
        title = title,
        notes = notes,
        align = align,
        escape = FALSE,
        fun_numeric = fun_numeric,
        ...
      ),
      error = function(e) e$message
    )
    b <- tryCatch(
      datasummary_skim_categorical(
        data,
        output = "tinytable",
        fmt = fmt,
        title = title,
        notes = notes,
        align = align,
        escape = FALSE,
        ...
      ),
      error = function(e) e$message
    )

    sanitize_output(output)

    data_list <- attr(a, "data_list")

    if (inherits(a, "tinytable") && inherits(b, "tinytable")) {
      out <- tinytable::rbind2(a, b, use_names = FALSE)
      out <- tinytable::format_tt(out, replace = "")
      out <- tinytable::style_tt(
        out,
        i = nrow(a) + 1,
        line = "t",
        line_size = .3
      )
      if (identical(output_format, "html")) {
        out <- tinytable::style_tt(
          out,
          i = nrow(a) + 1,
          bold = TRUE,
          line = "bt",
          line_color = "#d3d8dc"
        )
      }
    } else if (!inherits(a, "tinytable") && !inherits(b, "tinytable")) {
      insight::format_error(a, b)
    } else if (inherits(a, "tinytable")) {
      out <- a
    } else if (inherits(b, "tinytable")) {
      out <- b
    }

    ofmt <- output_format
    if (
      isTRUE(ofmt %in% c("latex", "markdown", "html", "typst", "dataframe"))
    ) {
      out@output <- ofmt
    }
    ofile <- output_file
    if (!is.null(ofile)) {
      tinytable::save_tt(out, output = ofile, overwrite = TRUE)
    }

    out <- tinytable::format_tt(out, escape = escape)
  } else if (type == "numeric") {
    out <- datasummary_skim_numeric(
      data,
      output = output,
      fmt = fmt,
      by = by,
      title = title,
      notes = notes,
      align = align,
      escape = escape,
      fun_numeric = fun_numeric,
      ...
    )
    sanitize_output(output)

    data_list <- attr(out, "data_list")
  } else if (type == "categorical") {
    out <- datasummary_skim_categorical(
      data,
      output = output,
      fmt = fmt,
      title = title,
      notes = notes,
      align = align,
      escape = escape,
      ...
    )
    sanitize_output(output)
  } else if (type == "dataset") {
    out <- datasummary_skim_dataset(
      data,
      output = output,
      title = title,
      notes = notes,
      align = align,
      escape = escape,
      ...
    )
    sanitize_output(output)
  }

  if (inherits(out, "tinytable")) {
    if ("Histogram" %in% out@names && !is.null(data_list)) {
      out <- tinytable::plot_tt(
        out,
        i = seq_along(data_list),
        j = "Histogram",
        fun = "histogram",
        data = data_list
      )
    }
    if ("Density" %in% out@names && !is.null(data_list)) {
      out <- tinytable::plot_tt(
        out,
        i = seq_along(data_list),
        j = "Density",
        fun = "density",
        data = data_list
      )
    }
  }

  if (!is.null(output_file)) {
    settings_rm()
    return(invisible(out))
  } else {
    if (
      output == "jupyter" ||
        (output == "default" && settings_equal("output_default", "jupyter"))
    ) {
      insight::check_if_installed("IRdisplay")
      return(invisible(IRdisplay::display_html(as.character(out))))
    }
    settings_rm()
    return(out)
  }
}

#' Internal function to skim whole datasets
#'
#' @noRd
datasummary_skim_dataset <- function(
  data,
  output,
  title,
  notes,
  align,
  escape,
  ...
) {
  is.binary <- function(x) {
    tryCatch(
      length(unique(stats::na.omit(x))) == 2,
      error = function(e) FALSE,
      silent = TRUE
    )
  }
  rounding <- fmt_decimal(digits = 0)
  out <- c(
    Rows = rounding(nrow(data)),
    Columns = rounding(ncol(data)),
    # `# Binary` = rounding(sum(sapply(data, is.binary))),
    `# Character` = rounding(sum(sapply(data, is.character))),
    `# Factor` = rounding(sum(sapply(data, is.factor))),
    `# Logical` = rounding(sum(sapply(data, is.logical))),
    `# Numeric` = rounding(sum(sapply(data, is.numeric))),
    `% Missing` = rounding(mean(is.na(data) * 100))
  )
  out <- data.frame(names(out), out)
  out <- out[out[[2]] != "0" | out[[1]] == "% Missing", ]
  row.names(out) <- NULL
  colnames(out) <- c(" ", "  ")

  out <- datasummary_df(
    data = out,
    output = output,
    title = title,
    align = align,
    notes = notes,
    ...
  )

  return(out)
}


#' Internal function to skim numeric variables
#'
#' @noRd
datasummary_skim_numeric <- function(
  data,
  output,
  fmt,
  title,
  notes,
  align,
  escape,
  by = NULL,
  fun_numeric = NULL,
  ...
) {
  # subset of numeric variables
  idx <- sapply(data, is.numeric)
  idx[colnames(data) %in% by] <- TRUE
  if (!any(idx)) insight::format_error("data contains no numeric variable.")
  dat_new <- data[, idx, drop = FALSE]

  # subset of non-NA variables
  idx <- sapply(dat_new, function(x) !all(is.na(x)))
  if (!any(idx))
    insight::format_error("all numeric variables are completely missing.")
  dat <- dat_new[, idx, drop = FALSE]

  # too large
  if (ncol(dat) > 250) {
    insight::format_error("Cannot summarize more than 250 variables at a time.")
  }

  cols <- setdiff(colnames(dat), by)

  dat <- data.table::as.data.table(dat)

  funcs <- list(
    "Variable" = function(x) "",
    "Internal Data List" = function(x) list(x)
  )
  funcs <- c(funcs, fun_numeric)

  # Compute
  rows <- list()
  for (v in cols) {
    tmp <- dat[,
      lapply(funcs, function(funny) funny(variable)),
      by = by,
      env = list("variable" = v)
    ][,
      Variable := v
    ]
    for (i in seq_along(tmp)) {
      class(tmp[[i]]) <- setdiff(
        class(tmp[[i]]),
        c("haven_labelled", "vctrs_vctr")
      )
    }
    rows <- c(rows, list(tmp))
  }
  rows <- data.table::rbindlist(rows)

  data_list <- rows[["Internal Data List"]]

  # labels
  for (col in colnames(data)) {
    lab <- attr(data[[col]], "label")
    if (!is.null(lab)) {
      rows[Variable == col, Variable := lab]
    }
  }

  rows[, Variable := dedup(Variable)]
  rows[, `Internal Data List` := NULL]
  idx <- unique(c("Variable", by, colnames(rows)))
  rows <- rows[, ..idx]
  data.table::setnames(rows, old = "Variable", new = " ")

  out <- datasummary_df(rows, fmt = fmt, output = output)

  attr(out, "data_list") <- data_list

  return(out)
}


#' Internal function to skim categorical variables
#'
#' @noRd
datasummary_skim_categorical <- function(
  data,
  output,
  fmt,
  title,
  notes,
  align,
  escape,
  ...
) {
  dat_new <- data

  # pad colnames in case one is named Min, Max, Mean, or other function name
  # colnames(dat_new) <- paste0(colnames(dat_new), " ")

  drop_too_many_levels <- NULL
  drop_entirely_na <- NULL

  for (n in colnames(dat_new)) {
    # completely missing
    if (all(is.na(dat_new[[n]]))) {
      dat_new[[n]] <- NULL
      drop_entirely_na <- c(drop_entirely_na, n)
    }

    if (
      is.logical(dat_new[[n]]) |
        is.character(dat_new[[n]]) |
        is.factor(dat_new[[n]])
    ) {
      # convert to factor and keep NAs as distinct level
      if (is.logical(dat_new[[n]]) | is.character(dat_new[[n]])) {
        dat_new[[n]] <- factor(dat_new[[n]], exclude = NULL)
      }

      # tables::tabular breaks on ""
      if (is.factor(dat_new[[n]]) && "" %in% levels(dat_new[[n]])) {
        idx <- levels(dat_new[[n]]) == ""
        levels(dat_new[[n]])[idx] <- " "
      }

      ## factors with too many levels
      if (is.factor(dat_new[[n]])) {
        if (length(levels(dat_new[[n]])) > 50) {
          dat_new[[n]] <- NULL
          drop_too_many_levels <- c(drop_too_many_levels, n)
        }
      }

      # discard non-factors
    } else {
      dat_new[[n]] <- NULL
    }
  }

  # too small
  if (ncol(dat_new) == 0) {
    stop("data contains no logical, character, or factor variable.")
  }

  # too large
  if (ncol(dat_new) > 50) {
    stop("Cannot summarize more than 50 variables at a time.")
  }

  if (!is.null(drop_too_many_levels)) {
    warning(
      sprintf(
        "These variables were omitted because they include more than 50 levels: %s.",
        paste(drop_too_many_levels, collapse = ", ")
      ),
      call. = FALSE
    )
  }

  if (!is.null(drop_entirely_na)) {
    warning(
      sprintf(
        "These variables were omitted because they are entirely missing: %s.",
        paste(drop_entirely_na, collapse = ", ")
      ),
      call. = FALSE
    )
  }

  pctformat <- sanitize_fmt(fmt)
  f <- All(
    dat_new,
    numeric = FALSE,
    factor = TRUE,
    logical = TRUE,
    character = TRUE
  ) ~
    (N = 1) * Format() + (`%` = Percent()) * Format(pctformat())

  datasummary(
    formula = f,
    data = dat_new,
    output = output,
    title = title,
    align = align,
    notes = notes
  )
}


dedup <- function(x) {
  if (length(x) < 2) {
    return(x)
  }
  for (i in length(x):2) {
    if (x[i] == x[i - 1]) {
      x[i] <- NA
    }
  }
  if (is.character(x)) {
    x[is.na(x)] <- ""
  }
  return(x)
}

Any scripts or data that you put into this service are public.

modelsummary documentation built on Aug. 25, 2025, 9:51 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

modelsummary
Summary Tables and Plots for Statistical Models and Data: Beautiful, Customizable, and Publication-Ready

R/datasummary_skim.R
In modelsummary: Summary Tables and Plots for Statistical Models and Data: Beautiful, Customizable, and Publication-Ready

Defines functions dedup datasummary_skim_categorical datasummary_skim_numeric datasummary_skim_dataset datasummary_skim

Documented in datasummary_skim

Try the modelsummary package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

modelsummary Summary Tables and Plots for Statistical Models and Data: Beautiful, Customizable, and Publication-Ready

R/datasummary_skim.R In modelsummary: Summary Tables and Plots for Statistical Models and Data: Beautiful, Customizable, and Publication-Ready

Defines functions dedup datasummary_skim_categorical datasummary_skim_numeric datasummary_skim_dataset datasummary_skim

Documented in datasummary_skim

Try the modelsummary package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

modelsummary
Summary Tables and Plots for Statistical Models and Data: Beautiful, Customizable, and Publication-Ready

R/datasummary_skim.R
In modelsummary: Summary Tables and Plots for Statistical Models and Data: Beautiful, Customizable, and Publication-Ready