Nothing
#' Separate a big `skim_df` into smaller data frames, by type.
#'
#' The data frames produced by [skim()] are wide and sparse, filled with
#' columns that are mostly `NA`. For that reason, it can be convenient to
#' work with "by type" subsets of the original data frame. These smaller
#' subsets have their `NA` columns removed.
#'
#' `partition()` creates a list of smaller `skim_df` data frames. Each entry
#' in the list is a data type from the original `skim_df`. The inverse of
#' `partition()` is `bind()`, which takes the list and produces the original
#' `skim_df`. While `partition()` keeps all of the subtables as list entries,
#' `yank()` gives you a single subtable for a data type.
#'
#' @param data A `skim_df`.
#' @param skim_type A character scalar. The subtable to extract from a
#' `skim_df`.
#' @return A `skim_list` of `skim_df`'s, by type.
#' @examples
#' # Create a wide skimmed data frame (a skim_df)
#' skimmed <- skim(iris)
#'
#' # Separate into a list of subtables by type
#' separate <- partition(skimmed)
#'
#' # Put back together
#' identical(bind(separate), skimmed)
#' # > TRUE
#'
#' # Alternatively, get the subtable of a particular type
#' yank(skimmed, "factor")
#' @export
partition <- function(data) {
assert_is_skim_df(data)
data_as_list <- split(data, data$skim_type)
groups <- group_names(data)
base <- base_skimmers(data)
skimmers <- reconcile_skimmers(data, groups, base)
# Check to see that there are at least one, known used skim function
# within the data. This can be reduced after using focus(). This indexing
# is used so that the lookup order matches that of `data_as_list`.
has_skim_data <- lengths(skimmers)[names(data_as_list)] > 0
elements_to_keep <- has_skim_data | any(base %in% names(data))
reduced <- purrr::imap(
data_as_list[elements_to_keep],
simplify_skimdf,
skimmers = skimmers,
groups = groups,
base = base
)
reassign_skim_attrs(
reduced,
data,
class = c("skim_list", "list"),
skimmers_used = skimmers
)
}
#' Align the skimmers_used attribute with the current columns in the data
#'
#' This catches the case where users partition (or more likely print) a data
#' frame that has had columns added after skimming.
#' @noRd
reconcile_skimmers <- function(data, groups, base) {
all_columns <- colnames(data)
skimmers_used <- skimmers_used(data)
if (length(skimmers_used) == 0) {
return(skimmers_used)
}
skimmers_from_names <- skimmers_from_names(all_columns, skimmers_used)
with_base_columns <- c(
"skim_variable",
"skim_type",
groups,
base,
collapse_skimmers(skimmers_used)
)
matched_cols <- dplyr::intersect(all_columns, with_base_columns)
extra_cols <- dplyr::setdiff(all_columns, with_base_columns)
if (length(extra_cols) > 0) {
grouped <- dplyr::group_by(data, .data$skim_type)
complete_by_type <- dplyr::summarize_at(
grouped,
extra_cols,
~ !all(is.na(.x))
)
complete_cols <- purrr::pmap(
complete_by_type,
get_complete_columns,
names = extra_cols
)
new_cols_by_type <- rlang::set_names(
complete_cols,
complete_by_type$skim_type
)
skimmers_from_names <- purrr::list_merge(
skimmers_from_names,
!!!new_cols_by_type
)
}
skimmers_from_names
}
skimmers_from_names <- function(names, skimmers) {
matched <- purrr::imap(skimmers, match_skimmers, names)
purrr::set_names(matched, names(skimmers))
}
match_skimmers <- function(fun_names, type, values_to_match) {
stripped_values <- stringr::str_remove(
values_to_match,
paste0("^", type, "\\.")
)
dplyr::intersect(stripped_values, fun_names)
}
collapse_skimmers <- function(skimmers_used) {
with_type <- purrr::imap(skimmers_used, ~ paste(.y, .x, sep = "."))
purrr::flatten_chr(with_type)
}
get_complete_columns <- function(skim_type, ..., names) {
names[c(...)]
}
#' For each type subtable, only select columns generated by this type's skimmers
#' This function also catches the case where the user removed columns from
#' the skim_df
#' @noRd
simplify_skimdf <- function(data, skim_type, skimmers, groups, base) {
stopifnot(has_variable_column(data))
names(data) <- stringr::str_remove(names(data), paste0(skim_type, "\\."))
keep <- c("skim_variable", groups, base, skimmers[[skim_type]])
out <- dplyr::select(data, tidyselect::any_of(keep))
structure(
out,
class = c("one_skim_df", "tbl_df", "tbl", "data.frame"),
skim_type = skim_type
)
}
#' @describeIn partition The inverse of a `partition()`. Rebuild the original
#' `skim_df`.
#' @export
bind <- function(data) {
assert_is_skim_list(data)
with_namespaces <- purrr::imap(data, add_namespaces)
combined <- dplyr::bind_rows(!!!with_namespaces, .id = "skim_type")
reassign_skim_attrs(combined, data)
}
add_namespaces <- function(data, skim_type) {
base <- base_skimmers(data)
meta_columns <- c("skim_variable", dplyr::group_vars(data), base)
no_meta_columns <- dplyr::setdiff(names(data), meta_columns)
with_namespace <- paste(skim_type, no_meta_columns, sep = ".")
# TODO(michaelquinn32): Drop this when vctrs interface works correctly.
names(data) <- c(meta_columns, with_namespace)
attr(data, "skim_type") <- NULL
tibble::as_tibble(data)
}
#' @describeIn partition Extract a subtable from a `skim_df` with a particular
#' type.
#' @export
yank <- function(data, skim_type) {
partition(data)[[skim_type]]
}
#' Only show a subset of summary statistics after skimming
#'
#' This function is a variant of [dplyr::select()] designed to work with
#' `skim_df` objects. When using `focus()`, `skimr` metadata columns are kept,
#' and `skimr` print methods are still utilized. Otherwise, the signature and
#' behavior is identical to [dplyr::select()].
#'
#' @param .data A `skim_df` object.
#' @param ... One or more unquoted expressions separated by commas. Variable
#' names can be used as if they were positions in the data frame, so
#' expressions like x:y can be used to select a range of variables.
#' @examples
#' # Compare
#' iris %>%
#' skim() %>%
#' dplyr::select(n_missing)
#'
#' iris %>%
#' skim() %>%
#' focus(n_missing)
#'
#' # This is equivalent to
#' iris %>%
#' skim() %>%
#' dplyr::select(skim_variable, skim_type, n_missing)
#' @export
focus <- function(.data, ...) {
assert_is_skim_df(.data)
reduced <- dplyr::select(.data, "skim_type", "skim_variable", ...)
if (could_be_skim_df(reduced)) {
skimmers <- reconcile_skimmers(
reduced,
group_names(.data),
base_skimmers(.data)
)
reassign_skim_attrs(reduced, .data, skimmers_used = skimmers)
} else {
stop("Cannot drop 'skim_variable' or 'skim_type' columns")
}
}
#' Create "long" skim output
#'
#' Skim results returned as a tidy long data frame with four columns:
#' variable, type, stat and formatted.
#'
#' @param .data A data frame or an object that can be coerced into a data frame.
#' @param ... Columns to select for skimming. When none are provided, the
#' default is to skim all columns.
#' @param skim_fun The skim function used.
#' @return A tibble
#' @examples
#' to_long(iris)
#' to_long(skim(iris))
#' @export
to_long <- function(.data, ..., skim_fun = skim) {
UseMethod("to_long")
}
#' @describeIn to_long Skim a data frame and convert the results to a
#' long data frame.
#' @export
to_long.default <- function(.data, ..., skim_fun = skim) {
skimmed <- skim_fun(.data, ...)
to_long(skimmed, ..., skim_fun)
}
#' @describeIn to_long Transform a skim_df to a long data frame.
#' @export
to_long.skim_df <- function(.data, ..., skim_fun = skim) {
tidyr::gather(
.data,
key = "stat",
value = "formatted",
na.rm = TRUE,
-"skim_type",
-"skim_variable"
)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.