janitor: Simple Tools for Examining and Cleaning Dirty Data

Documented in tabyl tabyl.data.frame tabyl.default

#' Generate a frequency table (1-, 2-, or 3-way).
#'
#' @description
#' A fully-featured alternative to `table()`. Results are data.frames and can be
#' formatted and enhanced with janitor's family of `adorn_` functions.
#'
#' Specify a `data.frame` and the one, two, or three unquoted column names you
#' want to tabulate.  Three variables generates a list of 2-way tabyls,
#' split by the third variable.
#'
#' Alternatively, you can tabulate a single variable that isn't in a `data.frame`
#'  by calling `tabyl()` on a vector, e.g., `tabyl(mtcars$gear)`.
#'
#' @param dat A `data.frame` containing the variables you wish to count.
#'   Or, a vector you want to tabulate.
#' @param var1 The column name of the first variable.
#' @param var2 (optional) the column name of the second variable
#'   (its values become the column names in a 2-way tabulation).
#' @param var3 (optional) the column name of the third variable
#'   (a 3-way tabulation is split into a list on its values).
#' @param show_na Should counts of `NA` values be displayed?  In a one-way tabyl,
#'   the presence of `NA` values triggers an additional column showing valid percentages
#'   (calculated excluding `NA` values).
#' @param show_missing_levels Should counts of missing levels of factors be displayed?
#'   These will be rows and/or columns of zeroes.  Useful for keeping consistent
#'   output dimensions even when certain factor levels may not be present in the data.
#' @param ... Additional arguments passed to methods.
#' @return A `data.frame` with frequencies and percentages of the tabulated variable(s).
#'   A 3-way tabulation returns a list of data frames.
#' @export
#' @examples
#'
#' tabyl(mtcars, cyl)
#' tabyl(mtcars, cyl, gear)
#' tabyl(mtcars, cyl, gear, am)
#'
#' # or using the %>% pipe
#' mtcars %>%
#'   tabyl(cyl, gear)
#'
#' # illustrating show_na functionality:
#' my_cars <- rbind(mtcars, rep(NA, 11))
#' my_cars %>% tabyl(cyl)
#' my_cars %>% tabyl(cyl, show_na = FALSE)
#'
#' # Calling on a single vector not in a data.frame:
#' val <- c("hi", "med", "med", "lo")
#' tabyl(val)
tabyl <- function(dat, ...) UseMethod("tabyl")



#' @export
#' @rdname tabyl
# this method runs when tabyl() is called on plain vectors; tabyl_1way
# also reverts to this method

tabyl.default <- function(dat, show_na = TRUE, show_missing_levels = TRUE, ...) {
  if (is.list(dat) && !"data.frame" %in% class(dat)) {
    stop("tabyl() is meant to be called on vectors and data.frames; convert non-data.frame lists to one of these types")
  }
  # catch and adjust input variable name.
  if (is.null(names(dat)) || is.vector(dat)) {
    var_name <- deparse(substitute(dat))
  } else {
    var_name <- names(dat)
  }

  # useful error message if input vector doesn't exist
  if (is.null(dat)) {
    stop(paste0("object ", var_name, " not found"))
  }
  # an odd variable name can be deparsed into a vector of length >1, rare but throws warning, see issue #87
  if (length(var_name) > 1) {
    var_name <- paste(var_name, collapse = "")
  }

  # Try to retrieve label
  if (is.data.frame(dat)) {
    var_label <- attr(dat[, var_name], "label", exact = TRUE) %||% var_name
  } else {
    var_label <- attr(dat, "label", exact = TRUE) %||% var_name
  }

  # if show_na is not length-1 logical, error helpfully (#377)
  if (length(show_na) > 1 || !inherits(show_na, "logical")) {
    stop("The value supplied to the \"show_na\" argument must be TRUE or FALSE.\n\nDid you try to call tabyl on two vectors, like tabyl(data$var1, data$var2) ? To create a two-way tabyl, the two vectors must be in the same data.frame, and the function should be called like this: \n
         tabyl(data, var1, var2)
         or
         data %>% tabyl(var1, var2).  \n\nSee ?tabyl for more.")
  }

  # calculate initial counts table
  # convert vector to a 1 col data.frame
  if (mode(dat) %in% c("logical", "numeric", "character", "list") && !is.matrix(dat)) {
    # to preserve factor properties when vec is passed in as a list from data.frame method:
    if (is.list(dat)) {
      dat <- dat[[1]]
    }
    dat_df <- data.frame(dat, stringsAsFactors = is.factor(dat))
    names(dat_df)[1] <- "dat"
    result <- dat_df %>% dplyr::count(dat)

    if (is.factor(dat) && show_missing_levels) {
      expanded <- tidyr::expand(result, dat)
      result <- merge( # can't use left_join b/c NA matching changed in 0.6.0
        x = expanded,
        y = result,
        by = "dat",
        all.x = TRUE,
        all.y = TRUE
      )
      result <- dplyr::arrange(result, dat) # restore sorting by factor level
    }
  } else {
    stop("input must be a vector of type logical, numeric, character, list, or factor")
  }

  # calculate percent, move NA row to bottom
  result <- result %>%
    dplyr::mutate(percent = n / sum(n, na.rm = TRUE))

  # sort the NA row to the bottom, necessary to retain factor sorting
  result <- result[order(is.na(result$dat)), ]
  result$is_na <- NULL

  # replace all NA values with 0 - only applies to missing factor levels
  result <- tidyr::replace_na(result, replace = list(n = 0, percent = 0))

  ## NA handling:
  # if there are NA values & show_na = T, calculate valid % as a new column
  if (show_na && sum(is.na(result[[1]])) > 0) {
    valid_total <- sum(result$n[!is.na(result[[1]])], na.rm = TRUE)
    result$valid_percent <- result$n / valid_total
    result$valid_percent[is.na(result[[1]])] <- NA
  } else { # don't show NA values, which necessitates adjusting the %s
    result <- result %>%
      dplyr::filter(!is.na(.[, 1])) %>%
      dplyr::mutate(percent = n / sum(n, na.rm = TRUE)) # recalculate % without NAs
  }

  # reassign correct variable name (or label if it exists)
  names(result)[1] <- var_label

  # in case input var name was "n" or "percent", call helper function to set unique names
  result <- handle_if_special_names_used(result)

  data.frame(result, check.names = FALSE) %>%
    as_tabyl(axes = 1)
}



#' @export
#' @rdname tabyl
# Main dispatching function to underlying functions depending on whether "..." contains 1, 2, or 3 variables
tabyl.data.frame <- function(dat, var1, var2, var3, show_na = TRUE, show_missing_levels = TRUE, ...) {
  if (missing(var1) && missing(var2) && missing(var3)) {
    stop("if calling on a data.frame, specify unquoted column names(s) to tabulate.  Did you mean to call tabyl() on a vector?")
  }
  if (dplyr::is_grouped_df(dat)) {
    dat <- dplyr::ungroup(dat)
  }

  if (missing(var2) && missing(var3) && !missing(var1)) {
    tabyl_1way(dat, rlang::enquo(var1), show_na = show_na, show_missing_levels = show_missing_levels)
  } else if (missing(var3) && !missing(var1) && !missing(var2)) {
    tabyl_2way(dat, rlang::enquo(var1), rlang::enquo(var2), show_na = show_na, show_missing_levels = show_missing_levels)
  } else if (!missing(var1) &&
    !missing(var2) &&
    !missing(var3)) {
    tabyl_3way(dat, rlang::enquo(var1), rlang::enquo(var2), rlang::enquo(var3), show_na = show_na, show_missing_levels = show_missing_levels)
  } else {
    stop("please specify var1 OR var1 & var2 OR var1 & var2 & var3")
  }
}

# a one-way frequency table; this was called "tabyl" in janitor <= 0.3.0
tabyl_1way <- function(dat, var1, show_na = TRUE, show_missing_levels = TRUE) {
  x <- dplyr::select(dat, !!var1)

  # gather up arguments, pass them to tabyl.default
  arguments <- list()
  arguments$dat <- x[1]
  arguments$show_na <- show_na
  arguments$show_missing_levels <- show_missing_levels
  do.call(tabyl.default,
    args = arguments
  )
}


# a two-way frequency table; this was called "crosstab" in janitor <= 0.3.0
tabyl_2way <- function(dat, var1, var2, show_na = TRUE, show_missing_levels = TRUE) {
  dat <- dplyr::select(dat, !!var1, !!var2)

  if (!show_na) {
    dat <- dat[!is.na(dat[[1]]) & !is.na(dat[[2]]), ]
  }
  if (nrow(dat) == 0) { # if passed a zero-length input, or an entirely NA input, return a zero-row data.frame
    message("No records to count so returning a zero-row tabyl")
    return(dat %>%
      dplyr::select(1) %>%
      dplyr::slice(0))
  }

  tabl <- dat %>%
    dplyr::count(!!var1, !!var2, name = "tabyl_2way_n")

  # Optionally expand missing factor levels.
  if (show_missing_levels) {
    tabl <- tidyr::complete(tabl, !!var1, !!var2)
  }

  # replace NA with string NA_ in vec2 to avoid invalid col name after spreading
  # if this col is a factor, need to add that level to the factor
  if (is.numeric(tabl[[2]])) { # have numerics treated like factors to not spread alphabetically
    tabl[[2]] <- ordered(tabl[[2]], levels = unique(tabl[[2]]))
  }
  if (is.factor(tabl[[2]])) {
    levels(tabl[[2]]) <- c(levels(tabl[[2]]), "emptystring_", "NA_")
  } else {
    tabl[2] <- as.character(tabl[[2]])
  }
  tabl[2][is.na(tabl[2])] <- "NA_"
  tabl[2][tabl[2] == ""] <- "emptystring_"
  result <- tabl %>%
    tidyr::pivot_wider(
      names_from = !!var2,
      values_from = "tabyl_2way_n",
      values_fn = ~ dplyr::coalesce(.x, 0L),
      values_fill = 0L,
      names_sort = TRUE
    )
  if ("emptystring_" %in% names(result)) {
    result <- result[c(setdiff(names(result), "emptystring_"), "emptystring_")]
    if (getOption("tabyl.emptystring", TRUE) & interactive()) {
      message("The tabyl's column variable contained the empty string value, \"\". This is not a legal column name and has been converted to \"emptystring_\".\nConsider converting \"\" to NA if appropriate.\nThis message is shown once per session and may be disabled by setting options(\"tabyl.emptystring\" = FALSE).") # nocov
      options("tabyl.emptystring" = FALSE) # nocov
    }
  }
  if ("NA_" %in% names(result)) {
    # move NA_ column to end, from http://stackoverflow.com/a/18339562
    result <- result[c(setdiff(names(result), "NA_"), "NA_")]
  }

  row_var_name <- names(dat)[1]
  col_var_name <- names(dat)[2]
  names(result)[1] <- attr(dat[, 1], "label", exact = TRUE) %||% names(result)[1]
  data.frame(result, check.names = FALSE) %>%
    as_tabyl(axes = 2, row_var_name = row_var_name, col_var_name = col_var_name)
}


# a list of two-way frequency tables, split into a list on a third variable
tabyl_3way <- function(dat, var1, var2, var3, show_na = TRUE, show_missing_levels = TRUE) {
  dat <- dplyr::select(dat, !!var1, !!var2, !!var3)
  var3_numeric <- is.numeric(dat[[3]])

  # Preserve labels, as attributes are sometimes dropped during transformations.
  var1_label <- attr(dat[, 1], "label", exact = TRUE)
  var2_label <- attr(dat[, 2], "label", exact = TRUE)

  # Keep factor levels for ordering the list at the end
  if (is.factor(dat[[3]])) {
    third_levels_for_sorting <- levels(dat[[3]])
  }
  dat[[3]] <- as.character(dat[[3]]) # don't want empty factor levels in the result list - they would be empty data.frames

  # grab class of 1st variable to restore it later
  col1_class <- class(dat[[1]])
  col1_levels <- NULL
  if (is.factor(dat[[1]])) {
    col1_levels <- levels(dat[[1]])
  }

  # print NA level as its own data.frame, and make it appear last
  if (show_na && sum(is.na(dat[[3]])) > 0) {
    dat[[3]] <- factor(dat[[3]], levels = c(sort(unique(dat[[3]])), "NA_"))
    dat[[3]][is.na(dat[[3]])] <- "NA_"
    if (exists("third_levels_for_sorting")) {
      third_levels_for_sorting <- c(third_levels_for_sorting, "NA_")
    }
  }

  if (show_missing_levels) { # needed to have each crosstab in the list aware of all values in the pre-split variables
    dat[[1]] <- as.factor(dat[[1]])
    dat[[2]] <- as.factor(dat[[2]])
  }

  result <- split(dat, dat[[rlang::quo_name(var3)]])
  # split() drops attributes, so we manually add back the label attributes.
  result <- lapply(result, function(x) {
    attr(x[[1]], "label") <- var1_label
    attr(x[[2]], "label") <- var2_label
    x
  })
  result <- result %>%
    purrr::map(tabyl_2way, var1, var2, show_na = show_na, show_missing_levels = show_missing_levels) %>%
    purrr::map(reset_1st_col_status, col1_class, col1_levels) # reset class of var in 1st col to its input class, #168

  # reorder when var 3 is a factor, per #250
  if (exists("third_levels_for_sorting")) {
    result <- result[order(third_levels_for_sorting[third_levels_for_sorting %in% unique(dat[[3]])])]
  }

  if (var3_numeric) {
    result <- result[order(suppressWarnings(as.numeric(names(result))), na.last = TRUE)]
  }

  result
}

### Helper functions called by tabyl() ------------

# function that checks if col 1 name is "n" or "percent",
## if so modifies the appropriate other column name to avoid duplicates
handle_if_special_names_used <- function(dat) {
  if (names(dat)[1] == "n") {
    names(dat)[2] <- "n_n"
  } else if (names(dat)[1] == "percent") {
    names(dat)[3] <- "percent_percent"
  }
  dat
}

# reset the 1st col's class of a data.frame to a provided class
# also reset in tabyl's core
reset_1st_col_status <- function(dat, new_class, lvls) {
  if ("factor" %in% new_class) {
    dat[[1]] <- factor(dat[[1]],
      levels = lvls,
      ordered = ("ordered" %in% new_class)
    )
    attr(dat, "core")[[1]] <- factor(attr(dat, "core")[[1]],
      levels = lvls,
      ordered = ("ordered" %in% new_class)
    )
  } else {
    dat[[1]] <- as.character(dat[[1]]) # first do as.character in case eventual class is numeric
    class(dat[[1]]) <- new_class
    attr(dat, "core")[[1]] <- as.character(attr(dat, "core")[[1]])
    class(attr(dat, "core")[[1]]) <- new_class
  }
  dat
}