#' Generate a frequency table (1-, 2-, or 3-way).
#'
#' @description
#' A fully-featured alternative to `table()`. Results are data.frames and can be
#' formatted and enhanced with janitor's family of `adorn_` functions.
#'
#' Specify a `data.frame` and the one, two, or three unquoted column names you
#' want to tabulate. Three variables generates a list of 2-way tabyls,
#' split by the third variable.
#'
#' Alternatively, you can tabulate a single variable that isn't in a `data.frame`
#' by calling `tabyl()` on a vector, e.g., `tabyl(mtcars$gear)`.
#'
#' @param dat A `data.frame` containing the variables you wish to count.
#' Or, a vector you want to tabulate.
#' @param var1 The column name of the first variable.
#' @param var2 (optional) the column name of the second variable
#' (the rows in a 2-way tabulation).
#' @param var3 (optional) the column name of the third variable
#' (the list in a 3-way tabulation).
#' @param show_na Should counts of `NA` values be displayed? In a one-way tabyl,
#' the presence of `NA` values triggers an additional column showing valid percentages
#' (calculated excluding `NA` values).
#' @param show_missing_levels Should counts of missing levels of factors be displayed?
#' These will be rows and/or columns of zeroes. Useful for keeping consistent
#' output dimensions even when certain factor levels may not be present in the data.
#' @param ... Additional arguments passed to methods.
#' @return A `data.frame` with frequencies and percentages of the tabulated variable(s).
#' A 3-way tabulation returns a list of data frames.
#' @export
#' @examples
#'
#' tabyl(mtcars, cyl)
#' tabyl(mtcars, cyl, gear)
#' tabyl(mtcars, cyl, gear, am)
#'
#' # or using the %>% pipe
#' mtcars %>%
#' tabyl(cyl, gear)
#'
#' # illustrating show_na functionality:
#' my_cars <- rbind(mtcars, rep(NA, 11))
#' my_cars %>% tabyl(cyl)
#' my_cars %>% tabyl(cyl, show_na = FALSE)
#'
#' # Calling on a single vector not in a data.frame:
#' val <- c("hi", "med", "med", "lo")
#' tabyl(val)
tabyl <- function(dat, ...) UseMethod("tabyl")
#' @export
#' @rdname tabyl
# this method runs when tabyl() is called on plain vectors; tabyl_1way
# also reverts to this method
tabyl.default <- function(dat, show_na = TRUE, show_missing_levels = TRUE, ...) {
if (is.list(dat) && !"data.frame" %in% class(dat)) {
stop("tabyl() is meant to be called on vectors and data.frames; convert non-data.frame lists to one of these types")
}
# catch and adjust input variable name.
if (is.null(names(dat)) || is.vector(dat)) {
var_name <- deparse(substitute(dat))
} else {
var_name <- names(dat)
}
# useful error message if input vector doesn't exist
if (is.null(dat)) {
stop(paste0("object ", var_name, " not found"))
}
# an odd variable name can be deparsed into a vector of length >1, rare but throws warning, see issue #87
if (length(var_name) > 1) {
var_name <- paste(var_name, collapse = "")
}
# Try to retrieve label
if (is.data.frame(dat)) {
var_label <- attr(dat[, var_name], "label", exact = TRUE) %||% var_name
} else {
var_label <- attr(dat, "label", exact = TRUE) %||% var_name
}
# if show_na is not length-1 logical, error helpfully (#377)
if (length(show_na) > 1 || !inherits(show_na, "logical")) {
stop("The value supplied to the \"show_na\" argument must be TRUE or FALSE.\n\nDid you try to call tabyl on two vectors, like tabyl(data$var1, data$var2) ? To create a two-way tabyl, the two vectors must be in the same data.frame, and the function should be called like this: \n
tabyl(data, var1, var2)
or
data %>% tabyl(var1, var2). \n\nSee ?tabyl for more.")
}
# calculate initial counts table
# convert vector to a 1 col data.frame
if (mode(dat) %in% c("logical", "numeric", "character", "list") && !is.matrix(dat)) {
# to preserve factor properties when vec is passed in as a list from data.frame method:
if (is.list(dat)) {
dat <- dat[[1]]
}
dat_df <- data.frame(dat, stringsAsFactors = is.factor(dat))
names(dat_df)[1] <- "dat"
result <- dat_df %>% dplyr::count(dat)
if (is.factor(dat) && show_missing_levels) {
expanded <- tidyr::expand(result, dat)
result <- merge( # can't use left_join b/c NA matching changed in 0.6.0
x = expanded,
y = result,
by = "dat",
all.x = TRUE,
all.y = TRUE
)
result <- dplyr::arrange(result, dat) # restore sorting by factor level
}
} else {
stop("input must be a vector of type logical, numeric, character, list, or factor")
}
# calculate percent, move NA row to bottom
result <- result %>%
dplyr::mutate(percent = n / sum(n, na.rm = TRUE))
# sort the NA row to the bottom, necessary to retain factor sorting
result <- result[order(is.na(result$dat)), ]
result$is_na <- NULL
# replace all NA values with 0 - only applies to missing factor levels
result <- tidyr::replace_na(result, replace = list(n = 0, percent = 0))
## NA handling:
# if there are NA values & show_na = T, calculate valid % as a new column
if (show_na && sum(is.na(result[[1]])) > 0) {
valid_total <- sum(result$n[!is.na(result[[1]])], na.rm = TRUE)
result$valid_percent <- result$n / valid_total
result$valid_percent[is.na(result[[1]])] <- NA
} else { # don't show NA values, which necessitates adjusting the %s
result <- result %>%
dplyr::filter(!is.na(.[, 1])) %>%
dplyr::mutate(percent = n / sum(n, na.rm = TRUE)) # recalculate % without NAs
}
# reassign correct variable name (or label if it exists)
names(result)[1] <- var_label
# in case input var name was "n" or "percent", call helper function to set unique names
result <- handle_if_special_names_used(result)
data.frame(result, check.names = FALSE) %>%
as_tabyl(axes = 1)
}
#' @export
#' @rdname tabyl
# Main dispatching function to underlying functions depending on whether "..." contains 1, 2, or 3 variables
tabyl.data.frame <- function(dat, var1, var2, var3, show_na = TRUE, show_missing_levels = TRUE, ...) {
if (missing(var1) && missing(var2) && missing(var3)) {
stop("if calling on a data.frame, specify unquoted column names(s) to tabulate. Did you mean to call tabyl() on a vector?")
}
if (dplyr::is_grouped_df(dat)) {
dat <- dplyr::ungroup(dat)
}
if (missing(var2) && missing(var3) && !missing(var1)) {
tabyl_1way(dat, rlang::enquo(var1), show_na = show_na, show_missing_levels = show_missing_levels)
} else if (missing(var3) && !missing(var1) && !missing(var2)) {
tabyl_2way(dat, rlang::enquo(var1), rlang::enquo(var2), show_na = show_na, show_missing_levels = show_missing_levels)
} else if (!missing(var1) &&
!missing(var2) &&
!missing(var3)) {
tabyl_3way(dat, rlang::enquo(var1), rlang::enquo(var2), rlang::enquo(var3), show_na = show_na, show_missing_levels = show_missing_levels)
} else {
stop("please specify var1 OR var1 & var2 OR var1 & var2 & var3")
}
}
# a one-way frequency table; this was called "tabyl" in janitor <= 0.3.0
tabyl_1way <- function(dat, var1, show_na = TRUE, show_missing_levels = TRUE) {
x <- dplyr::select(dat, !!var1)
# gather up arguments, pass them to tabyl.default
arguments <- list()
arguments$dat <- x[1]
arguments$show_na <- show_na
arguments$show_missing_levels <- show_missing_levels
do.call(tabyl.default,
args = arguments
)
}
# a two-way frequency table; this was called "crosstab" in janitor <= 0.3.0
tabyl_2way <- function(dat, var1, var2, show_na = TRUE, show_missing_levels = TRUE) {
dat <- dplyr::select(dat, !!var1, !!var2)
if (!show_na) {
dat <- dat[!is.na(dat[[1]]) & !is.na(dat[[2]]), ]
}
if (nrow(dat) == 0) { # if passed a zero-length input, or an entirely NA input, return a zero-row data.frame
message("No records to count so returning a zero-row tabyl")
return(dat %>%
dplyr::select(1) %>%
dplyr::slice(0))
}
tabl <- dat %>%
dplyr::count(!!var1, !!var2, name = "tabyl_2way_n")
# Optionally expand missing factor levels.
if (show_missing_levels) {
tabl <- tidyr::complete(tabl, !!var1, !!var2)
}
# replace NA with string NA_ in vec2 to avoid invalid col name after spreading
# if this col is a factor, need to add that level to the factor
if (is.numeric(tabl[[2]])) { # have numerics treated like factors to not spread alphabetically
tabl[[2]] <- ordered(tabl[[2]], levels = unique(tabl[[2]]))
}
if (is.factor(tabl[[2]])) {
levels(tabl[[2]]) <- c(levels(tabl[[2]]), "emptystring_", "NA_")
} else {
tabl[2] <- as.character(tabl[[2]])
}
tabl[2][is.na(tabl[2])] <- "NA_"
tabl[2][tabl[2] == ""] <- "emptystring_"
result <- tabl %>%
tidyr::pivot_wider(
names_from = !!var2,
values_from = "tabyl_2way_n",
values_fn = ~ dplyr::coalesce(.x, 0L),
values_fill = 0L,
names_sort = TRUE
)
if ("emptystring_" %in% names(result)) {
result <- result[c(setdiff(names(result), "emptystring_"), "emptystring_")]
if (getOption("tabyl.emptystring", TRUE) & interactive()) {
message("The tabyl's column variable contained the empty string value, \"\". This is not a legal column name and has been converted to \"emptystring_\".\nConsider converting \"\" to NA if appropriate.\nThis message is shown once per session and may be disabled by setting options(\"tabyl.emptystring\" = FALSE).") # nocov
options("tabyl.emptystring" = FALSE) # nocov
}
}
if ("NA_" %in% names(result)) {
# move NA_ column to end, from http://stackoverflow.com/a/18339562
result <- result[c(setdiff(names(result), "NA_"), "NA_")]
}
row_var_name <- names(dat)[1]
col_var_name <- names(dat)[2]
names(result)[1] <- attr(dat[, 1], "label", exact = TRUE) %||% names(result)[1]
data.frame(result, check.names = FALSE) %>%
as_tabyl(axes = 2, row_var_name = row_var_name, col_var_name = col_var_name)
}
# a list of two-way frequency tables, split into a list on a third variable
tabyl_3way <- function(dat, var1, var2, var3, show_na = TRUE, show_missing_levels = TRUE) {
dat <- dplyr::select(dat, !!var1, !!var2, !!var3)
var3_numeric <- is.numeric(dat[[3]])
# Preserve labels, as attributes are sometimes dropped during transformations.
var1_label <- attr(dat[, 1], "label", exact = TRUE)
var2_label <- attr(dat[, 2], "label", exact = TRUE)
# Keep factor levels for ordering the list at the end
if (is.factor(dat[[3]])) {
third_levels_for_sorting <- levels(dat[[3]])
}
dat[[3]] <- as.character(dat[[3]]) # don't want empty factor levels in the result list - they would be empty data.frames
# grab class of 1st variable to restore it later
col1_class <- class(dat[[1]])
col1_levels <- NULL
if (is.factor(dat[[1]])) {
col1_levels <- levels(dat[[1]])
}
# print NA level as its own data.frame, and make it appear last
if (show_na && sum(is.na(dat[[3]])) > 0) {
dat[[3]] <- factor(dat[[3]], levels = c(sort(unique(dat[[3]])), "NA_"))
dat[[3]][is.na(dat[[3]])] <- "NA_"
if (exists("third_levels_for_sorting")) {
third_levels_for_sorting <- c(third_levels_for_sorting, "NA_")
}
}
if (show_missing_levels) { # needed to have each crosstab in the list aware of all values in the pre-split variables
dat[[1]] <- as.factor(dat[[1]])
dat[[2]] <- as.factor(dat[[2]])
}
result <- split(dat, dat[[rlang::quo_name(var3)]])
# split() drops attributes, so we manually add back the label attributes.
result <- lapply(result, function(x) {
attr(x[[1]], "label") <- var1_label
attr(x[[2]], "label") <- var2_label
x
})
result <- result %>%
purrr::map(tabyl_2way, var1, var2, show_na = show_na, show_missing_levels = show_missing_levels) %>%
purrr::map(reset_1st_col_status, col1_class, col1_levels) # reset class of var in 1st col to its input class, #168
# reorder when var 3 is a factor, per #250
if (exists("third_levels_for_sorting")) {
result <- result[order(third_levels_for_sorting[third_levels_for_sorting %in% unique(dat[[3]])])]
}
if (var3_numeric) {
result <- result[order(suppressWarnings(as.numeric(names(result))), na.last = TRUE)]
}
result
}
### Helper functions called by tabyl() ------------
# function that checks if col 1 name is "n" or "percent",
## if so modifies the appropriate other column name to avoid duplicates
handle_if_special_names_used <- function(dat) {
if (names(dat)[1] == "n") {
names(dat)[2] <- "n_n"
} else if (names(dat)[1] == "percent") {
names(dat)[3] <- "percent_percent"
}
dat
}
# reset the 1st col's class of a data.frame to a provided class
# also reset in tabyl's core
reset_1st_col_status <- function(dat, new_class, lvls) {
if ("factor" %in% new_class) {
dat[[1]] <- factor(dat[[1]],
levels = lvls,
ordered = ("ordered" %in% new_class)
)
attr(dat, "core")[[1]] <- factor(attr(dat, "core")[[1]],
levels = lvls,
ordered = ("ordered" %in% new_class)
)
} else {
dat[[1]] <- as.character(dat[[1]]) # first do as.character in case eventual class is numeric
class(dat[[1]]) <- new_class
attr(dat, "core")[[1]] <- as.character(attr(dat, "core")[[1]])
class(attr(dat, "core")[[1]]) <- new_class
}
dat
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.