# janitor: Simple Tools for Examining and Cleaning Dirty Data

# Documented in adorn_percentages

#' Convert a data.frame of counts to percentages.
#'
#' This function defaults to excluding the first column of the input data.frame,
#' assuming that it contains a descriptive variable, but this can be overridden
#' by specifying the columns to adorn in the `...` argument.
#'
#' If the input carries a totals row and/or totals column (as recorded in its
#' `"totals"` attribute), those totals are left out of the denominators so that
#' counts are not double-counted.
#'
#' @param dat A `tabyl` or other data.frame with a tabyl-like layout.
#'   If given a list of data.frames, this function will apply itself to each
#'   `data.frame` in the list (designed for 3-way `tabyl` lists).
#' @param denominator The direction to use for calculating percentages.
#'   One of "row", "col", or "all".
#' @param na.rm should missing values (including `NaN`) be omitted from the
#'   calculations? This is honored for all three values of `denominator`; with
#'   `na.rm = FALSE`, a missing value makes the affected denominator (and the
#'   percentages derived from it) `NA`.
#' @param ... columns to adorn. This takes a <[`tidy-select`][dplyr::dplyr_tidy_select]>
#'   specification. By default, all numeric columns (besides the initial column, if numeric)
#'   are adorned, but this allows you to manually specify which columns should
#'   be adorned, for use on a `data.frame` that does not result from a call to [tabyl()].
#'
#' @return A `data.frame` of percentages, expressed as numeric values between 0 and 1.
#'   If `dat` is a list of data.frames, a list with one such result per element.
#' @export
#' @examples
#'
#' mtcars %>%
#'   tabyl(am, cyl) %>%
#'   adorn_percentages("col")
#'
#' # calculates correctly even with totals column and/or row:
#' mtcars %>%
#'   tabyl(am, cyl) %>%
#'   adorn_totals("row") %>%
#'   adorn_percentages()
#'
#' # Control the columns to be adorned with the ... variable selection argument
#' # If using only the ... argument, you can use empty commas as shorthand
#' # to supply the default values to the preceding arguments:
#'
#' cases <- data.frame(
#'   region = c("East", "West"),
#'   year = 2015,
#'   recovered = c(125, 87),
#'   died = c(13, 12)
#' )
#'
#' cases %>%
#'   adorn_percentages(, , recovered:died)
adorn_percentages <- function(dat, denominator = "row", na.rm = TRUE, ...) {
  # if input is a list, call purrr::map to recursively apply this function
  # to each data.frame
  if (is.list(dat) && !is.data.frame(dat)) {
    purrr::map(dat, adorn_percentages, denominator, na.rm, ...)
  } else {
    # catch bad inputs
    if (!is.data.frame(dat)) {
      stop("adorn_percentages() must be called on a data.frame or list of data.frames")
    }
    rlang::arg_match0(denominator, c("row", "col", "all"))

    dat <- as_tabyl(dat)

    # Which totals (if any) were previously added to this table.
    totals_attr <- attr(dat, "totals")
    has_totals_row <- "row" %in% totals_attr
    has_totals_col <- "col" %in% totals_attr

    numeric_cols <- which(vapply(dat, is.numeric, logical(1)))
    non_numeric_cols <- setdiff(seq_len(ncol(dat)), numeric_cols)
    # Assume the 1st column is descriptive and should not be adorned, so drop
    # it from numeric_cols. Done here so that if only the 1st col is numeric,
    # the function errors.
    numeric_cols <- setdiff(numeric_cols, 1)
    explicitly_exempt_totals <- FALSE

    if (rlang::dots_n(...) == 0) {
      cols_to_tally <- numeric_cols
    } else {
      expr <- rlang::expr(c(...))
      cols_to_tally <- tidyselect::eval_select(expr, data = dat)
      # if the last column is not present, it's b/c user explicitly exempted it
      explicitly_exempt_totals <- !(ncol(dat) %in% cols_to_tally)
      if (any(cols_to_tally %in% non_numeric_cols)) {
        message("At least one non-numeric column was specified.  All non-numeric columns will be removed from percentage calculations.")
        cols_to_tally <- setdiff(cols_to_tally, non_numeric_cols)
      }
    }

    if (has_totals_col) {
      # if there's a totals col, don't use it to calculate the %s
      cols_to_tally <- setdiff(cols_to_tally, ncol(dat))
    }

    # The totals column is itself converted unless the user left it out of `...`
    adorn_totals_col <- has_totals_col && !explicitly_exempt_totals

    if (denominator == "row") {
      # row-wise percentages with a totals column: the totals col is not part
      # of the denominator and is simply set to all 1s
      if (adorn_totals_col) {
        dat[[ncol(dat)]] <- rep(1, nrow(dat))
      }
      row_sum <- rowSums(dat[cols_to_tally], na.rm = na.rm)
      dat[, cols_to_tally] <- dat[cols_to_tally] / row_sum
    } else if (denominator == "col") {
      # col-wise percentages with a totals row: leave the totals row out of
      # the denominators (it then evaluates to 1 when divided by them)
      if (has_totals_row) {
        col_sum <- colSums(dat[-nrow(dat), ][cols_to_tally], na.rm = na.rm)
      } else {
        col_sum <- colSums(dat[cols_to_tally], na.rm = na.rm)
      }
      # add totals col back to be tallied, #357
      if (adorn_totals_col) {
        cols_to_tally <- c(cols_to_tally, ncol(dat))
        # use the same na.rm as the other columns so the totals column is
        # treated consistently with them
        if (has_totals_row) {
          totals_col_sum <- sum(dat[-nrow(dat), ncol(dat)], na.rm = na.rm)
        } else {
          totals_col_sum <- sum(dat[, ncol(dat)], na.rm = na.rm)
        }
        col_sum <- c(col_sum, totals_col_sum)
      }
      # from http://stackoverflow.com/questions/9447801/dividing-columns-by-colsums-in-r
      dat[cols_to_tally] <- sweep(dat[cols_to_tally], 2, col_sum, `/`)
    } else if (denominator == "all") {
      # all-wise percentages: exempt any totals col or row from the grand
      # total; honor na.rm like the "row" and "col" branches do
      if (has_totals_row) {
        complete_n <- sum(dat[-nrow(dat), cols_to_tally], na.rm = na.rm)
      } else {
        complete_n <- sum(dat[, cols_to_tally], na.rm = na.rm)
      }
      # add totals col back to be tallied, #357
      if (adorn_totals_col) {
        cols_to_tally <- c(cols_to_tally, ncol(dat))
      }
      dat[cols_to_tally] <- dat[cols_to_tally] / complete_n
    }
    dat
  }
}