#' Tabulate counts and proportions
#'
#' @param x a [data.frame()] or [tbl_svy][srvyr::as_survey_design] object
#'
#' @param ... categorical variables to tabulate
#'
#' @param strata a stratifier to split the data
#'
#' @param keep a character vector specifying which values to retain in the
#' tabulation. Defaults to `TRUE`, which keeps all the values.
#'
#' @param drop a character vector specifying which values to drop in the
#' tabulation. Defaults to `NULL`, which keeps all values.
#'
#' @param na.rm When `TRUE` (default), missing (NA) values present in `var`
#' will be removed from the data set with a warning, causing a change in
#' denominator for the tabulations. Setting this to `FALSE` creates an
#' explicit missing value called "(Missing)".
#'
#' @param prop_total if `TRUE` and `strata` is not `NULL`, then the totals of the
#' rows will be reported as proportions of the total data set, otherwise, they
#' will be proportions within the stratum (default).
#'
#' @param row_total create a new column with the total counts for each row of
#' stratified data.
#'
#' @param col_total create a new row with the total counts for each column of
#' stratified data.
#'
#' @param wide if `TRUE` (default) and strata is defined, then the results are
#' presented in a wide table with each stratification counts and estimates in
#' separate columns. If `FALSE`, then the data will be presented in a long
#' format where the counts and estimates are presented in single columns. This
#' has no effect if strata is not defined.
#'
#' @param transpose if `wide = TRUE`, then this will transpose the columns to
#' the rows, which is useful when you stratify by age group. Default is
#' `NULL`, which will not transpose anything. You have three options for
#' transpose:
#' - `transpose = "variable"`: uses the variable column, (dropping values if strata exists).
#' Use this if you know that your values are all identical or at least
#' identifiable by the variable name.
#' - `transpose = "value"` : uses the value column, (dropping variables if strata exists).
#' Use this if your values are important and the variable names are
#' generic placeholders.
#' - `transpose = "both"` : combines the variable and value columns.
#' Use this if both the variables and values are important.
#'
#' @param pretty (survey only) if `TRUE`, default, the proportion and CI are merged
#'
#' @param digits (survey only) if `pretty = FALSE`, this indicates the number
#' of digits used for proportion and CI
#'
#' @param method (survey only) a method from [survey::svyciprop()] to calculate
#' the confidence interval. Defaults to "logit".
#'
#' @param deff a logical indicating if the design effect should be reported.
#' Defaults to `TRUE`.
#'
#' @return a [tibble::tibble()] with a column for variables, a column for values,
#' and counts and proportions. If `strata` is not `NULL` and `wide = TRUE`,
#' then there will be separate columns for each strata for the counts and
#' proportions. Survey data will report confidence intervals.
#'
#' @rdname tab_functions
#' @export
#'
#' @examples
#' have_packages <- require("matchmaker") & require("epidict")
#'
#' if (have_packages) {
#' withAutoprint({
#'
#' # Simulating linelist data
#'
#' linelist <- epidict::gen_data("Measles", numcases = 1000, org = "MSF")
#' measles_dict <- epidict::msf_dict("Measles", compact = FALSE)
#'
#' # Cleaning linelist data
#' linelist_clean <- matchmaker::match_df(
#' x = linelist,
#' dictionary = measles_dict,
#' from = "option_code",
#' to = "option_name",
#' by = "data_element_shortname",
#' order = "option_order_in_set"
#' )
#'
#' # get a descriptive table by sex
#' tab_linelist(linelist_clean, sex)
#'
#' # describe prenancy statistics, but remove missing data from the tally
#' tab_linelist(linelist_clean, trimester, na.rm = TRUE)
#'
#' # describe by symptom
#'
#' tab_linelist(linelist_clean,
#' cough, nasal_discharge, severe_oral_lesions,
#' transpose = "value"
#' )
#' # describe prenancy statistics, stratifying by vitamin A perscription
#' tab_linelist(linelist_clean, trimester, sex,
#' strata = prescribed_vitamin_a,
#' na.rm = TRUE, row_total = TRUE
#' )
#' })
#' }
#'
#' have_survey_packages <- require("survey") && require("srvyr")
#' if (have_survey_packages) {
#' withAutoprint({
#' data(api)
#'
#' # stratified sample
#' surv <- apistrat %>%
#' as_survey_design(strata = stype, weights = pw)
#'
#' s <- surv %>%
#' tab_survey(awards, strata = stype, col_total = TRUE, row_total = TRUE, deff = TRUE)
#' s
#'
#' # making things pretty
#' s %>%
#' # wrap all "n" variables in braces (note space before n).
#' epikit::augment_redundant(" (n)" = " n") %>%
#' # relabel all columns containing "prop" to "% (95% CI)"
#' epikit::rename_redundant(
#' "% (95% CI)" = ci,
#' "Design Effect" = deff
#' )
#'
#' # long data
#' surv %>%
#' tab_survey(awards, strata = stype, wide = FALSE)
#'
#' # tabulate binary variables
#' surv %>%
#' tab_survey(yr.rnd, sch.wide, awards, keep = "Yes")
#'
#' # stratify the binary variables
#' surv %>%
#' tab_survey(yr.rnd, sch.wide, awards,
#' strata = stype,
#' keep = "Yes"
#' )
#'
#' # invert the tabulation
#' surv %>%
#' tab_survey(yr.rnd, sch.wide, awards,
#' strata = stype,
#' drop = "Yes",
#' deff = TRUE,
#' row_total = TRUE
#' )
#' })
#' }
tab_linelist <- function(x,
...,
strata = NULL,
keep = TRUE,
drop = NULL,
na.rm = TRUE,
prop_total = FALSE,
row_total = FALSE,
col_total = FALSE,
wide = TRUE,
transpose = NULL,
digits = 1,
pretty = TRUE) {
tab_general(x,
...,
strata = !!rlang::enquo(strata),
keep = keep,
drop = drop,
na.rm = na.rm,
prop_total = prop_total,
row_total = row_total,
col_total = col_total,
wide = wide,
transpose = transpose,
digits = digits,
pretty = pretty
)
}
#' @rdname tab_functions
#' @export
tab_survey <- function(x,
...,
strata = NULL,
keep = TRUE,
drop = NULL,
na.rm = TRUE,
prop_total = FALSE,
row_total = FALSE,
col_total = FALSE,
wide = TRUE,
transpose = NULL,
digits = 1,
method = "logit",
deff = FALSE,
pretty = TRUE) {
tab_general(x,
...,
strata = !!rlang::enquo(strata),
keep = keep,
drop = drop,
na.rm = na.rm,
prop_total = prop_total,
row_total = row_total,
col_total = col_total,
wide = wide,
transpose = transpose,
digits = digits,
method = method,
deff = deff,
pretty = pretty
)
}
tab_general <- function(x,
...,
strata = NULL,
keep = TRUE,
drop = NULL,
na.rm = TRUE,
prop_total = FALSE,
row_total = FALSE,
col_total = FALSE,
wide = TRUE,
transpose = NULL,
digits = 1,
method = "logit",
deff = FALSE,
pretty = TRUE) {
is_survey <- inherits(x, "tbl_svy")
stopifnot(is_survey || is.data.frame(x))
# We try to match the user-supplied variables to the colnames. If the user
# supplied a tidyselect verb (e.g. `starts_with("CHOICE")`, then it should
# filter properly.
xnames <- colnames(x)
names(xnames) <- xnames
# 2020-02-10
#
# tidyselect has updated when I was on vacation and changed its behavior. It
# used to return nothing if one of the columns did not exist, which we could
# fix by wrapping the call in one_of and report which columns were not found,
# but now it's going to take some rethinking about how to handle this
# properly, so at the moment, we are sliently ignoring columns that don't
# match.
vars <- tidyselect::eval_select(rlang::expr(c(...)), data = xnames, strict = FALSE)
vars <- xnames[vars]
if (length(vars) == 0) {
stop("No columns matched the data", call. = FALSE)
}
stra <- rlang::enquo(strata)
flip_it <- wide && !is.null(transpose)
if (flip_it) {
transpose <- match.arg(tolower(transpose), c("variable", "value", "both"))
}
# Create list for results to go into that will eventually be bound together
res <- vector(mode = "list", length = length(vars))
names(res) <- vars
# loop over each name in the list and tabulate the survey for that variable
for (i in names(res)) {
i <- rlang::ensym(i)
if (is_survey) {
res[[i]] <- tabulate_survey(x,
var = !!i,
strata = !!stra,
proptotal = prop_total,
coltotals = col_total,
rowtotals = row_total,
pretty = pretty,
digits = digits,
method = method,
wide = wide,
na.rm = na.rm,
deff = deff
)
} else {
res[[i]] <- descriptive(x,
counter = !!i,
grouper = !!stra,
proptotal = prop_total,
coltotals = col_total,
rowtotals = row_total,
digits = digits,
explicit_missing = !na.rm
)
}
# The ouptut columns will have the value as whatever i was, so we should
# rename this to "value" to make it consistent
names(res[[i]])[names(res[[i]]) == i] <- "value"
res[[i]][["value"]] <- as.character(res[[i]][["value"]])
}
# Combine the results into one table
suppressWarnings(res <- dplyr::bind_rows(res, .id = "variable"))
# return the results with only the selected values
if (!isTRUE(keep) && !is.null(drop)) {
stop("you can only choose to keep values or drop values. Specifying both is not allowed", call. = FALSE)
}
strata_exists <- tidyselect::vars_select(colnames(x), !!stra)
strata_exists <- length(strata_exists) > 0
if (!isTRUE(keep)) {
res <- res[res$value %in% keep, , drop = FALSE]
} else if (!is.null(drop)) {
res <- res[!res$value %in% drop, , drop = FALSE]
} else if (flip_it && !strata_exists && transpose != "both") {
flip_it <- FALSE
# This is the situation where the user doesn't have a stratafying variable,
# but they want to transpose either the variable or value.
the_column <- if (transpose == "variable") "value" else "variable"
res[[the_column]] <- forcats::fct_inorder(res[[the_column]])
res[[transpose]] <- forcats::fct_inorder(res[[transpose]])
res <- widen_tabulation(res,
!!rlang::sym(the_column),
!!rlang::sym(transpose),
pretty = if (is_survey) pretty else FALSE,
digits = digits
)
if (col_total && the_column == "value") {
# prevent Total from appearing as one of the middle rows
res[["value"]] <- forcats::fct_relevel(res[["value"]], "Total", after = Inf)
res <- res[order(res[["value"]]), ]
}
if (col_total && the_column == "variable") {
# prevent Total from appearing as one of the middle columns
good_order <- c(
grep("Total", names(res), invert = TRUE),
grep("Total", names(res))
)
res <- res[good_order]
}
} else {
if (flip_it) {
warning("Cannot transpose data that hasn't been filtered with keep or drop", call. = FALSE)
}
flip_it <- FALSE
}
# If the user wants to transpose the data, then we need to do this for each
# level of data available into separate tables, combine the columns, and then
# rearrange them so that they are grouped by variable/value
if (flip_it) {
res <- flipper(if (is_survey) x$variables else x,
res, transpose,
pretty = pretty, stra = stra
)
}
res
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.