#' Construct Value Label-Friendly Frequency Tables
#'
#' @description
#' `tabl` calculates raw or weighted frequency counts (or proportions) over
#' arbitrary categorical values (including integer values), which may be
#' expressed in terms of raw variable values or labelr label values.
#'
#' @details
#' This function creates a labelr-friendly data.frame representation of
#' multi-variable tabular data, where either value labels or raw values can be
#' displayed (value labels are the default). Various convenience options are
#' provided, such as using frequency weights, returning proportions instead of
#' counts, rounding those proportions, automatically expressing many-valued,
#' non-value-labeled numeric variables in terms of quantile category groups,
#' and pivoting / casting one categorical variable's levels (labels) to serve
#' as the columns of a cross-tab-like table.
#'
#' @param data a data.frame.
#' @param vars a quoted character vector of the names of the variables to use in
#' defining the category groups tabulated in the table. If NULL, `tabl` will
#' attempt to construct a table over all combinations of every variable in the
#' data.frame that has no decimal values and does not exceed the
#' max.unique.vals threshold. Additionally, note the effects of the qtiles
#' argument.
#' @param labs.on if TRUE (the default), then value labels -- rather than the
#' raw variable values -- will be displayed in the returned table for any
#' value-labeled variables. Variables need not be value-labeled: this function
#' works (whether this option is set to TRUE or FALSE) even when no variables
#' are value-labeled.
#' @param qtiles if not NULL, must be a single integer between 2 and 100 indicating
#' the number of quantile categories to employ in temporarily (for purposes of
#' tabulation) auto-value-labeling numeric columns that exceed the max.unique.vals
#' threshold. If NULL, no such auto-value-labeling will take place. Note: When
#' labs.on = TRUE, any pre-existing variable value labels will be used in favor
#' of the quantile value labels generated by this argument. By default,
#' qtiles = 4, and the automatically generated quantile category levels will be
#' labeled as "q025" (i.e., first quartile), "q050", "q075", and "q100".
#' @param prop.digits if non-NULL, cell percentages (proportions) will be
#' returned instead of frequency counts, and these will be rounded to the digit
#' specified (e.g., prop.digits = 3 means a value of 0.157 would be returned for
#' a cell that accounted for 8 observations if the total number of observations
#' were 51). If NULL (the default), frequency counts will be returned.
#' @param wt the optional quoted name of a column of the supplied data.frame that
#' contains frequency counts or some other idiosyncratic "importance" weight to
#' use in the tabulations. If NULL, no weighting will be employed.
#' @param div.by an optional constant by which to divide the returned counts, for
#' scaling purposes. This may be a number (e.g., div.by = 10 to divide by 10) or
#' a character string that follows the convention "number followed by 'K', 'M',
#' or 'B'", where, e.g., "10K" is translated as 10000 and "1B" as 1000000000.
#' This argument is ignored when prop.digits is non-NULL.
#' @param max.unique.vals Integer to specify the maximum number of unique values
#' of a variable that may be observed for that variable to be included in
#' tabulations. Note that labelr sets a hard ceiling of 5000 on the total number
#' of unique value labels that any variable is permitted to have under any
#' circumstance, as labelr is primarily intended for interactive use with
#' moderately-sized data.frames. See the qtiles argument for an approach to
#' incorporating many-valued numeric variables that exceed the max.unique.vals
#' threshold.
#' @param sort.freq By default, returned table rows are sorted in descending
#' order of cell frequency (most frequent categories/combinations first). If set
#' to FALSE, table rows will be sorted by the distinct values of the vars
#' (in the order vars are specified).
#' @param zero.rm If TRUE, zero-frequency category combinations of the vars (i.e.,
#' those not observed in the data.frame) will be filtered from the table. For
#' tables that would otherwise exceed 100,000 rows, this is done automatically.
#' @param irreg.rm If TRUE, tabulations exclude cases where any applicable
#' variable (see vars argument) features any of the following "irregular"
#' values: NA, NaN, Inf, -Inf, or any non-case-sensitive variation on "NA",
#' "NAN", "INF", or "-INF." If FALSE, all "irregular" values (as just defined)
#' are assigned to a "catch-all" category of NA that is featured in the
#' returned table (if/where present).
#' @param wide.col If non-NULL, this is the quoted name of a single column / var
#' of the supplied data.frame whose distinct values (category levels) you wish to be
#' columns of the returned table. For example, if you are interested in a
#' cross-tab of "edu" (highest level of education) and "race" (a race/ethnicity
#' variable), you could supply vars = c("edu") and wide.col = "race", and the
#' different racial-ethnic group categories would appear as distinct columns,
#' with "edu" category levels appearing as distinct rows, and cell values
#' representing the cross-tabbed cell "edu" level frequencies for the respective
#' "race" groups (see examples). You may supply one wide.col at most.
#'
#' @return a data.frame.
#' @importFrom stats reshape aggregate
#' @export
#'
#' @examples
#' # assign mtcars to new data.frame df
#' df <- mtcars
#'
#' # add na values to make things interesting
#' df[1, 1:11] <- NA
#' rownames(df)[1] <- "Missing Car"
#'
#' # add value labels
#' df <- add_val_labs(
#' data = df,
#' vars = "am",
#' vals = c(0, 1),
#' labs = c("automatic", "manual")
#' )
#'
#' df <- add_val_labs(
#' data = df,
#' vars = "carb",
#' vals = c(1, 2, 3, 4, 6, 8),
#' labs = c(
#' "1-carb", "2-carbs",
#' "3-carbs", "4-carbs",
#' "6-carbs", "8-carbs"
#' )
#' )
#'
#' # var arg can be unquoted if using add_val1()
#' # note that this is not add_val_labs(); add_val1() has a "var" arg instead of "vars"
#' df <- add_val1(
#' data = df,
#' var = cyl, # note, "var," not "vars" arg
#' vals = c(4, 6, 8),
#' labs = c(
#' "four-cyl",
#' "six-cyl",
#' "eight-cyl"
#' )
#' )
#'
#' df <- add_val_labs(
#' data = df,
#' vars = "gear",
#' vals = 3:5,
#' labs = c(
#' "3-speed",
#' "4-speed",
#' "5-speed"
#' )
#' )
#'
#'
#' # lookup mapping
#' get_val_labs(df)
#'
#' # introduce other "irregular" values
#' df$am[1] <- NA
#'
#' df[2, "am"] <- NaN
#' df[3, "am"] <- -Inf
#' df[5, "cyl"] <- "NAN"
#'
#' # take a look
#' head(df)
#'
#' # demonstrate tabl() frequency tabulation function
#'
#' # this is the "first call" that will be referenced repeatedly below
#' # labels on, sort by variable values, suppress/exclude NA/irregular values
#' # ...return counts
#' tabl(df,
#' vars = c("cyl", "am"),
#' labs.on = TRUE, # use variable value labels
#' sort.freq = FALSE, # sort by vars values (not frequencies)
#' irreg.rm = TRUE, # NAs and the like are suppressed
#' prop.digits = NULL
#' ) # return counts, not proportions
#'
#' # same as "first call", except now value labels are off
#' tabl(df,
#' vars = c("cyl", "am"),
#' labs.on = FALSE, # use variable values
#' sort.freq = FALSE, # sort by vars values (not frequencies)
#' irreg.rm = TRUE, # NAs and the like are suppressed
#' prop.digits = NULL
#' ) # return counts, not proportions
#'
#' # same as "first call," except now proportions instead of counts
#' tabl(df,
#' vars = c("cyl", "am"),
#' labs.on = TRUE, # use variable value labels
#' sort.freq = FALSE, # sort by vars values (not frequencies)
#' irreg.rm = TRUE, # NAs and the like are suppressed
#' prop.digits = 3
#' ) # return proportions, rounded to 3rd decimal
#'
#' # same as "first call," except now sort by frequency counts
#' tabl(df,
#' vars = c("cyl", "am"),
#' labs.on = TRUE, # use variable value labels
#' sort.freq = TRUE, # sort in order of descending frequency
#' irreg.rm = TRUE, # NAs and the like are suppressed
#' prop.digits = NULL
#' ) # return counts, not proportions
#'
#' # same as "first call," except now use weights
#' set.seed(2944) # for reproducibility
#' df$freqwt <- sample(10:50, nrow(df), replace = TRUE) # create (fake) freq wts
#' tabl(df,
#' vars = c("cyl", "am"),
#' wt = "freqwt", # use frequency weights
#' labs.on = TRUE, # use variable value labels
#' sort.freq = FALSE, # sort by vars values (not frequencies)
#' irreg.rm = FALSE, # NAs and the like are included/shown
#' prop.digits = NULL
#' ) # return counts, not proportions
#'
#' df$freqwt <- NULL # we don't need this anymore
#'
#' # now, with extremely large weights to illustrate div.by
#' set.seed(428441) # for reproducibility
#' df$freqwt <- sample(1000000:10000000, nrow(df), replace = TRUE) # large freq wts
#' tabl(df,
#' vars = c("cyl", "am"),
#' wt = "freqwt", # use frequency weights
#' labs.on = TRUE, # use variable value labels
#' sort.freq = FALSE, # sort by vars values (not frequencies)
#' irreg.rm = FALSE, # NAs and the like are included/shown
#' prop.digits = NULL
#' ) # return counts, not proportions
#'
#' # show div by - Millions
#' tabl(df,
#' vars = c("cyl", "am"),
#' wt = "freqwt", # use frequency weights
#' labs.on = TRUE, # use variable value labels
#' sort.freq = FALSE, # sort by vars values (not frequencies)
#' irreg.rm = FALSE, # NAs and the like are included/shown
#' prop.digits = NULL, # return counts, not proportions
#' div.by = "1M"
#' ) # one million
#'
#' # show div by - Tens of millions
#' tabl(df,
#' vars = c("cyl", "am"),
#' wt = "freqwt", # use frequency weights
#' labs.on = TRUE, # use variable value labels
#' sort.freq = FALSE, # sort by vars values (not frequencies)
#' irreg.rm = FALSE, # NAs and the like are included/shown
#' prop.digits = NULL, # return counts, not proportions
#' div.by = "10M"
#' ) # ten million
#'
#' # show div by - 10000
#' tabl(df,
#' vars = c("cyl", "am"),
#' wt = "freqwt", # use frequency weights
#' labs.on = TRUE, # use variable value labels
#' sort.freq = FALSE, # sort by vars values (not frequencies)
#' irreg.rm = FALSE, # NAs and the like are included/shown
#' prop.digits = NULL, # return counts, not proportions
#' div.by = 10000
#' ) # ten thousand; could've used div.by = "10K"
#'
#' # show div by - 10000, but different syntax
#' tabl(df,
#' vars = c("cyl", "am"),
#' wt = "freqwt", # use frequency weights
#' labs.on = TRUE, # use variable value labels
#' sort.freq = FALSE, # sort by vars values (not frequencies)
#' irreg.rm = FALSE, # NAs and the like are included/shown
#' prop.digits = NULL, # return counts, not proportions
#' div.by = "10K"
#' ) # ten thousand; could've used div.by = 10000
#'
#' df$freqwt <- NULL # we don't need this anymore
#'
#' # turn labels off, to make this more compact
#' # do not show zero values (zero.rm)
#' # do not show NA values (irreg.rm)
#' # many-valued numeric variables will be converted to quantile categories by
#' # ...qtiles argument
#' tabl(df,
#' vars = c("am", "gear", "carb", "mpg"),
#' qtiles = 4, # many-valued numerics converted to quantile
#' labs.on = FALSE, # use values, not variable value labels
#' sort.freq = FALSE, # sort by vars values (not frequencies)
#' irreg.rm = TRUE, # NAs and the like are suppressed
#' zero.rm = TRUE, # variable combinations that never occur are suppressed
#' prop.digits = NULL, # return counts, not proportions
#' max.unique.vals = 10
#' ) # drop from table any var with >10 distinct values
#'
#' # same as above, but now sort by frequency, include NA/irregular category
#' # values, and include unobserved (zero-count) category combinations
#' tabl(df,
#' vars = c("am", "gear", "carb", "mpg"),
#' qtiles = 4,
#' labs.on = FALSE, # use values, not variable value labels
#' sort.freq = TRUE, # sort by frequency
#' irreg.rm = FALSE, # preserve/include NAs and irregular values
#' zero.rm = FALSE, # include non-observed combinations
#' prop.digits = NULL, # return counts, not proportions
#' max.unique.vals = 10
#' ) # drop from table any var with >10 distinct values
#'
#' # show cross-tab view with wide.col arg
#' tabl(df,
#' vars = c("cyl", "am"),
#' labs.on = TRUE, # use variable value labels
#' sort.freq = TRUE, # sort in order of descending frequency
#' irreg.rm = TRUE, # NAs and the like are suppressed
#' prop.digits = NULL, # return counts, not proportions
#' wide.col = "am"
#' ) # use "am" as a column variable in a cross-tab view
#'
#' tabl(df,
#' vars = c("cyl", "am"),
#' labs.on = TRUE, # use variable value labels
#' sort.freq = TRUE, # sort in order of descending frequency
#' irreg.rm = TRUE, # NAs and the like are suppressed
#' prop.digits = NULL, # return counts, not proportions
#' wide.col = "cyl"
#' ) # use "cyl" as a column variable in a cross-tab view
#'
#' # verify select counts using base::subset()
#' nrow(subset(df, am == 0 & cyl == 4))
#' nrow(subset(df, am == 0 & cyl == 8))
#' nrow(subset(df, am == 1 & cyl == 8))
#' nrow(subset(df, am == 0 & cyl == 6))
#' nrow(subset(df, am == 1 & cyl == 6))
#'
#' # will work on an un-labeled data.frame
#' tabl(mtcars, vars = c("am", "gear", "carb", "mpg"))
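#'
#' # one further illustrative combination (using the labeled df from above):
#' # proportions (rounded to 2 digits), with "am" levels pivoted to columns
#' tabl(df,
#' vars = c("cyl", "am"),
#' labs.on = TRUE, # use variable value labels
#' irreg.rm = TRUE, # NAs and the like are suppressed
#' prop.digits = 2, # return proportions, rounded to 2nd decimal
#' wide.col = "am"
#' ) # use "am" as a column variable in a cross-tab view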
tabl <- function(data,
vars = NULL,
labs.on = TRUE,
qtiles = 4,
prop.digits = NULL,
wt = NULL,
div.by = NULL,
max.unique.vals = 10,
sort.freq = TRUE,
zero.rm = FALSE,
irreg.rm = FALSE,
wide.col = NULL) {
############################################################################
# tabl_df - internal function to calculate compact tables and weighted tables
############################################################################
tabl_df <- function(data, vars = NULL, group.id.name = "gid", wts = NULL) {
# safe_char_val() - search for presence of character stub in x
# ...and identify the first variant of stub that is not already in x
# "variant of stub" means the leading stub characters with numbers affixed
# ..afterward
safe_char_val <- function(x, stub) {
if (is.data.frame(x)) {
x <- as.data.frame(x)
x <- names(x)
}
if (!stub %in% x) {
the_name <- stub
} else {
the_name <- NULL
found_it <- FALSE
count <- 0
while (!found_it) {
count <- count + 1
the_name <- paste0(stub, "_", count)
if (!the_name %in% x) found_it <- TRUE
}
}
return(the_name)
}
# end safe_char_val
group_id_nm <- safe_char_val(names(data), group.id.name)
if (is.null(vars)) vars <- names(data)
if (!is.null(wts)) vars <- base::setdiff(vars, wts)
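# cross-tabulate all combinations of vars (table() includes zero-frequency combos)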
data_l <- data.frame(table(data[vars]))
data_l <- data.frame(col1 = rownames(data_l), data_l)
names(data_l)[1] <- group_id_nm
names(data_l)[ncol(data_l)] <- "n"
i <- sapply(data_l, is.factor)
data_l[i] <- lapply(data_l[i], as.character)
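# if weights were supplied, sum the weight variable within each group id
# to obtain the weighted counts (n.wtd)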
if (!is.null(wts)) {
data <- merge(data_l, data, all = TRUE)
data_z <- tapply(data[[wts]], data[[group_id_nm]], sum)
data_z <- data.frame(col1 = names(data_z), "n.wtd" = unname(data_z))
data_z[is.na(data_z$n.wtd), "n.wtd"] <- 0
names(data_z)[1] <- group_id_nm
i <- sapply(data_z, is.character)
data_z[i] <- lapply(data_z[i], as.numeric)
data_l <- merge(data_l, data_z, by = group_id_nm)
}
data_l[[1]] <- NULL
return(data_l)
}
############################################################################
# end tabl_df() - begin main tabl() function code
############################################################################
# make this a Base R data.frame
data <- as_base_data_frame(data)
# get nrow
if (nrow(data) > 300000) {
warning("
Note: tabl() is not optimized for data.frames this large.\n")
}
# grab weights value if present
if (!is.null(wt)) {
if (!wt %in% names(data)) {
stop("
wt arg must be a colname of supplied data.frame. No such column found.")
}
wts <- data[[wt]]
# check weights variable (numeric)
if (!is.numeric(wts)) {
stop("
wt argument must be a numeric variable.")
}
# check weights variable (irregular)
if (any(unname(check_irregular(wts)))) {
stop("
wt variable may not include NA or other irregular (e.g., NaN) values.")
}
# check weights values (all >=0)
if (any(wts < 0)) {
stop("
All weights must be numeric values >=0.")
}
}
# make sure only one wide.col supplied
if (!is.null(wide.col)) {
if (length(wide.col) != 1) {
stop("
You may not specify more than one wide.col.")
}
}
# safely drop out of scope columns
if (!is.null(vars) && !is.null(wide.col)) {
vars <- unique(c(vars, wide.col))
} else if (!is.null(vars) && is.null(wide.col)) {
vars <- vars
} else {
vars <- names(data)
}
# make sure all selected vars are found in data.frame
if (any(!vars %in% names(data))) {
stop("
At least one colname arg to vars or wide.col not found in supplied data.frame.")
} else {
data <- sbrac(data, , vars) # subset, preserving labels
}
# check max vals - 5000 unique value labels for a variable is a hard cap:
# Under no circumstances can a variable with 5000 distinct values receive value
# ...labels
if (max.unique.vals > 5000) {
stop("
\n max.unique.vals may not exceed 5000.")
}
# turn on value labels, if specified
if (labs.on) {
# add quantile labels if specified
if (!is.null(qtiles)) {
data <- suppressWarnings(all_quant_labs(data,
qtiles = qtiles,
unique.vals.thresh = max.unique.vals
))
}
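# swap value labels in for the values of any value-labeled variables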
data <- suppressWarnings(use_val_labs(data))
} else {
data <- strip_labs(data)
# add quantile labels if specified
if (!is.null(qtiles)) {
data <- suppressWarnings(all_quant_labs(data,
qtiles = qtiles,
unique.vals.thresh = max.unique.vals
))
}
data <- suppressWarnings(use_val_labs(data))
}
# drop vars with decimal points or too many unique values
num_vars_to_drop <- sapply(
data,
function(x) {
length(unique(x)) > max.unique.vals |
has_decv(x)
}
)
if (any(num_vars_to_drop)) {
names_to_drop <- names(num_vars_to_drop)[unname(num_vars_to_drop)]
for (i in seq_along(names_to_drop)) {
this_name <- names_to_drop[i]
warning(sprintf("
Excluding variable --%s-- (includes decimals or exceeds max.unique.vals).\n", this_name))
}
data <- data[!num_vars_to_drop]
data <- as.data.frame(data)
vars <- names(data)
}
# count the possible category combinations across the remaining vars
combos <- prod(sapply(data, function(x) length(unique(x))))
# zero.rm
if (combos > 100000 && !zero.rm) {
zero.rm <- TRUE
warning("
Requested table would be >100000 rows. Excluding zero-frequency (unobserved) combinations")
}
# find a safe name to use (one not already in vars)
if (!"vars" %in% vars) {
the_name <- "vars"
} else {
the_name <- NULL
found_it <- FALSE
count <- 0
while (!found_it) {
count <- count + 1
the_name <- paste0("vars", "_", count)
if (!the_name %in% vars) found_it <- TRUE
}
}
# convert factors to character
data <- as.data.frame(data)
i <- sapply(data, is.factor)
data[i] <- lapply(data[i], as.character)
# convert irregular values to "NA" function over all remaining variables
data <- lapply(data, irregular2v, to = "NA")
data <- do.call("cbind", data)
data <- as.data.frame(data)
# do weighted counts if wt arg is not NULL (see sapply() call below w/ sum())
if (!is.null(wt)) {
last_col_name <- "n.wtd"
data <- cbind(data, wts) # restore weights
data <- as.data.frame(data)
names(data)[ncol(data)] <- wt
data2 <- tabl_df(data, vars = vars, wts = wt)
data2[["n"]] <- NULL
# do unweighted counts if wt arg is NULL (see sapply() call below w/ nrow())
} else {
last_col_name <- "n"
data2 <- tabl_df(data, vars = vars)
}
# remove irregular values, as requested
if (irreg.rm) {
irreg_rows <- unname(which(apply(data2, 1, function(x) any(x == "NA"))))
if (length(irreg_rows) != 0) data2 <- data2[-c(irreg_rows), ]
}
# remove rows with zero counts, as requested
if (zero.rm) {
zero_rows <- unname(which(data2[[last_col_name]] == 0))
if (length(zero_rows) != 0) data2 <- data2[-c(zero_rows), ]
}
# sort table results: if sort.freq is TRUE, sort by frequency counts
# (descending), breaking ties by the vars values; otherwise, sort by vars values
if (sort.freq) {
sort_vars <- c(last_col_name, vars)
desc_args <- c(TRUE, rep(FALSE, length(vars)))
data2 <- suppressWarnings(
suppressMessages(
ssort(data2, sort_vars, desc_args)
)
)
# else, just sort by vars
} else {
data2 <- suppressWarnings(
suppressMessages(
ssort(data2, vars)
)
)
}
# use percents (proportions) instead of counts if prop.digits is not NULL
if (!is.null(prop.digits)) {
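# divide the final count column by its sum to convert to proportions, then round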
name_x <- names(data2)[ncol(data2)]
data2[[name_x]] <- data2[[name_x]] / sum(data2[[name_x]])
data2[[name_x]] <- round(data2[[name_x]], digits = prop.digits)
}
# allow for dividing totals by some constant, if one is specified
# and if prop.digits argument is NULL
if (!is.null(div.by) && is.null(prop.digits)) {
if (is.character(div.by)) {
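# expand the 'K'/'M'/'B' shorthand, e.g., "10K" becomes "10000", "1M" becomes "1000000"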
div.by <- sub("B", "000000000", toupper(div.by))
div.by <- sub("M", "000000", toupper(div.by))
div.by <- sub("K", "000", toupper(div.by))
div.by <- as.integer(div.by)
}
data2[[ncol(data2)]] <- data2[[ncol(data2)]] / div.by
}
rownames(data2) <- 1:nrow(data2)
# "cast" / "pivot wider" by wide.col, if we've specified one
if (!is.null(wide.col)) {
vals.var <- names(data2)[ncol(data2)]
other.vars <- names(data2)[!names(data2) %in% c(wide.col, vals.var)]
data2 <- as_base_data_frame(data2)
data2 <- stats::reshape(data2,
timevar = wide.col,
idvar = other.vars,
direction = "wide"
)
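# stats::reshape() names the new wide columns "<count column>.<level>"; strip that prefix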
vals.var <- paste0(vals.var, "\\.")
names(data2) <- gsub(vals.var, "", names(data2))
data2 <- as.data.frame(data2)
data2 <- data2[names(data2)]
data2 <- as.data.frame(data2)
# convert NA to 0 (counts) in new pivoted-wider cols
orig_vars <- base::setdiff(vars, wide.col)
new_vars <- base::setdiff(names(data2), orig_vars)
for (i in new_vars) {
data2[[i]] <- as_numv(data2[[i]])
data2[is.na(data2[[i]]), i] <- 0
}
}
# restore numeric status to any variables for which this makes sense
data2 <- as_num(data2)
# convert irregular values to NA values
data2 <- lapply(data2, irregular2v, to = NA)
data2 <- do.call("cbind", data2)
data2 <- as.data.frame(data2)
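# ensure the final count/proportion column is numeric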
data2[[ncol(data2)]] <- as.numeric(data2[[ncol(data2)]])
return(data2)
}