R/reshape.R
In skimr: Compact and Flexible Summaries of Data

Documented in bind focus partition to_long to_long.default to_long.skim_df yank

#' Separate a big `skim_df` into smaller data frames, by type.
#'
#' The data frames produced by [skim()] are wide and sparse, filled with
#' columns that are mostly `NA`. For that reason, it can be convenient to
#' work with "by type" subsets of the original data frame. These smaller
#' subsets have their `NA` columns removed.
#'
#' `partition()` creates a list of smaller `skim_df` data frames. Each entry
#' in the list is a data type from the original `skim_df`. The inverse of
#' `partition()` is `bind()`, which takes the list and produces the original
#' `skim_df`. While `partition()` keeps all of the subtables as list entries,
#' `yank()` gives you a single subtable for a data type.
#'
#' @param data A `skim_df`.
#' @param skim_type A character scalar. The subtable to extract from a
#'   `skim_df`.
#' @return A `skim_list` of `skim_df`'s, by type.
#' @examples
#' # Create a wide skimmed data frame (a skim_df)
#' skimmed <- skim(iris)
#'
#' # Separate into a list of subtables by type
#' separate <- partition(skimmed)
#'
#' # Put back together
#' identical(bind(separate), skimmed)
#' # > TRUE
#'
#' # Alternatively, get the subtable of a particular type
#' yank(skimmed, "factor")
#' @export
partition <- function(data) {
  assert_is_skim_df(data)
  data_as_list <- split(data, data$skim_type)
  groups <- group_names(data)
  base <- base_skimmers(data)

  skimmers <- reconcile_skimmers(data, groups, base)

  # Check to see that there are at least one, known used skim function
  # within the data. This can be reduced after using focus(). This indexing
  # is used so that the lookup order matches that of `data_as_list`.
  has_skim_data <- lengths(skimmers)[names(data_as_list)] > 0
  elements_to_keep <- has_skim_data | any(base %in% names(data))
  reduced <- purrr::imap(
    data_as_list[elements_to_keep],
    simplify_skimdf,
    skimmers = skimmers,
    groups = groups,
    base = base
  )

  reassign_skim_attrs(
    reduced,
    data,
    class = c("skim_list", "list"),
    skimmers_used = skimmers
  )
}

#' Align the skimmers_used attribute with the current columns in the data
#'
#' This catches the case where users partition (or more likely print) a data
#' frame that has had columns added after skimming.
#' @noRd
reconcile_skimmers <- function(data, groups, base) {
  all_columns <- colnames(data)
  skimmers_used <- skimmers_used(data)
  if (length(skimmers_used) == 0) {
    return(skimmers_used)
  }
  skimmers_from_names <- skimmers_from_names(all_columns, skimmers_used)
  with_base_columns <- c(
    "skim_variable",
    "skim_type",
    groups,
    base,
    collapse_skimmers(skimmers_used)
  )
  matched_cols <- dplyr::intersect(all_columns, with_base_columns)
  extra_cols <- dplyr::setdiff(all_columns, with_base_columns)
  if (length(extra_cols) > 0) {
    grouped <- dplyr::group_by(data, .data$skim_type)
    complete_by_type <- dplyr::summarize_at(
      grouped,
      extra_cols,
      ~ !all(is.na(.x))
    )
    complete_cols <- purrr::pmap(
      complete_by_type,
      get_complete_columns,
      names = extra_cols
    )
    new_cols_by_type <- rlang::set_names(
      complete_cols,
      complete_by_type$skim_type
    )
    skimmers_from_names <- purrr::list_merge(
      skimmers_from_names,
      !!!new_cols_by_type
    )
  }

  skimmers_from_names
}

skimmers_from_names <- function(names, skimmers) {
  matched <- purrr::imap(skimmers, match_skimmers, names)
  purrr::set_names(matched, names(skimmers))
}

match_skimmers <- function(fun_names, type, values_to_match) {
  stripped_values <- stringr::str_remove(
    values_to_match,
    paste0("^", type, "\\.")
  )
  dplyr::intersect(stripped_values, fun_names)
}

collapse_skimmers <- function(skimmers_used) {
  with_type <- purrr::imap(skimmers_used, ~ paste(.y, .x, sep = "."))
  purrr::flatten_chr(with_type)
}

get_complete_columns <- function(skim_type, ..., names) {
  names[c(...)]
}

#' For each type subtable, only select columns generated by this type's skimmers
#' This function also catches the case where the user removed columns from
#' the skim_df
#' @noRd
simplify_skimdf <- function(data, skim_type, skimmers, groups, base) {
  stopifnot(has_variable_column(data))
  names(data) <- stringr::str_remove(names(data), paste0(skim_type, "\\."))
  keep <- c("skim_variable", groups, base, skimmers[[skim_type]])
  out <- dplyr::select(data, tidyselect::any_of(keep))

  structure(
    out,
    class = c("one_skim_df", "tbl_df", "tbl", "data.frame"),
    skim_type = skim_type
  )
}

#' @describeIn partition The inverse of a `partition()`. Rebuild the original
#'   `skim_df`.
#' @export
bind <- function(data) {
  assert_is_skim_list(data)
  with_namespaces <- purrr::imap(data, add_namespaces)
  combined <- dplyr::bind_rows(!!!with_namespaces, .id = "skim_type")
  reassign_skim_attrs(combined, data)
}

add_namespaces <- function(data, skim_type) {
  base <- base_skimmers(data)
  meta_columns <- c("skim_variable", dplyr::group_vars(data), base)
  no_meta_columns <- dplyr::setdiff(names(data), meta_columns)
  with_namespace <- paste(skim_type, no_meta_columns, sep = ".")

  # TODO(michaelquinn32): Drop this when vctrs interface works correctly.
  names(data) <- c(meta_columns, with_namespace)
  attr(data, "skim_type") <- NULL
  tibble::as_tibble(data)
}

#' @describeIn partition Extract a subtable from a `skim_df` with a particular
#'   type.
#' @export
yank <- function(data, skim_type) {
  partition(data)[[skim_type]]
}

#' Only show a subset of summary statistics after skimming
#'
#' This function is a variant of [dplyr::select()] designed to work with
#' `skim_df` objects. When using `focus()`, `skimr` metadata columns are kept,
#' and `skimr` print methods are still utilized. Otherwise, the signature and
#' behavior is identical to [dplyr::select()].
#'
#' @param .data A `skim_df` object.
#' @param ...  One or more unquoted expressions separated by commas. Variable
#'   names can be used as if they were positions in the data frame, so
#'   expressions like x:y can be used to select a range of variables.
#' @examples
#' # Compare
#' iris %>%
#'   skim() %>%
#'   dplyr::select(n_missing)
#'
#' iris %>%
#'   skim() %>%
#'   focus(n_missing)
#'
#' # This is equivalent to
#' iris %>%
#'   skim() %>%
#'   dplyr::select(skim_variable, skim_type, n_missing)
#' @export
focus <- function(.data, ...) {
  assert_is_skim_df(.data)
  reduced <- dplyr::select(.data, "skim_type", "skim_variable", ...)
  if (could_be_skim_df(reduced)) {
    skimmers <- reconcile_skimmers(
      reduced,
      group_names(.data),
      base_skimmers(.data)
    )
    reassign_skim_attrs(reduced, .data, skimmers_used = skimmers)
  } else {
    stop("Cannot drop 'skim_variable' or 'skim_type' columns")
  }
}

#' Create "long" skim output
#'
#' Skim results returned as a tidy long data frame with four columns:
#' variable, type, stat and formatted.
#'
#' @param .data A data frame or an object that can be coerced into a data frame.
#' @param ...  Columns to select for skimming. When none are provided, the
#'   default is to skim all columns.
#' @param skim_fun The skim function used.
#' @return A tibble
#' @examples
#' to_long(iris)
#' to_long(skim(iris))
#' @export
to_long <- function(.data, ..., skim_fun = skim) {
  UseMethod("to_long")
}

#' @describeIn to_long Skim a data frame and convert the results to a
#'   long data frame.
#' @export
to_long.default <- function(.data, ..., skim_fun = skim) {
  skimmed <- skim_fun(.data, ...)
  to_long(skimmed, ..., skim_fun)
}

#' @describeIn  to_long Transform a skim_df to a long data frame.
#' @export
to_long.skim_df <- function(.data, ..., skim_fun = skim) {
  tidyr::gather(
    .data,
    key = "stat",
    value = "formatted",
    na.rm = TRUE,
    -"skim_type",
    -"skim_variable"
  )
}