### tab functions
# Jesse Zlotoff
# 3/4/19
#' Build Collapses
#'
#' Combine values in a given column to create collapsed categories
#'
#' @param df dataframe to edit
#' @param variable string variable/column name to edit
#' @param collapses list of collapses with "label"=c(inputs). use "@auto" to auto-generate labels.
#' @return collapsed dataframe
#' @export
#'
collapser <- function(df, variable, collapses) {
require(lazyeval)
require(dplyr)
new_levels <- c()
for(lab in names(collapses)) {
if(substr(lab,1,5) == "@auto") {
new_lab <- paste(unlist(collapses[lab]), collapse='/')
}
else {
new_lab <- lab
}
new_levels <- c(new_levels, new_lab)
for(v in unlist(collapses[lab])) {
m_call <- lazyeval::interp(quote(ifelse(variable==v, new_lab, as.character(variable))),
variable=as.name(variable))
df <- df %>%
mutate_(.dots = setNames(list(m_call), variable))
}
}
m_call <- lazyeval::interp(quote(factor(variable, levels=new_levels)),
variable=as.name(variable))
df <- df %>%
mutate_(.dots = setNames(list(m_call), variable))
return(df)
}
#' Reorder Tab Columns
#'
#' Reorder dataframe columns to display in logical order for tabs.
#'
#' @param df dataframe to edit
#' @return updated dataframe
#' @export
#'
reorder_columns <- function(df) {
require(dplyr)
pcols <- df %>%
select(-category, -starts_with("n_"), -starts_with("lci_"), -starts_with("uci_")) %>%
names()
old <- names(df)
new <- c("category")
for (c in pcols) {
new <- append(new, c)
if (paste0("lci_",c) %in% old) {new <- append(new, paste0("lci_",c))}
if (paste0("uci_",c) %in% old) {new <- append(new, paste0("uci_",c))}
if (paste0("n_",c) %in% old) {new <- append(new, paste0("n_",c))}
}
df <- df %>%
select(new)
return(df)
}
#' Valid Tab Inputs
#'
#' Check input dataframe and column(s) to tab functions.
#'
#' @param df dataframe to check
#' @param v1 variable name to check
#' @param v2 variable name to check. DEFAULT "NULL"
#' @return error/warning message, or empty string
#' @export
#'
valid_tab_inputs <- function(df, v1, v2="NULL") {
if (!is.data.frame(df)) {
return("first argument is not a dataframe")
}
if (!is.element(v1,names(df))) {
return(paste(v1, "not present in dataframe"))
}
if ("vv1" %in% names(df)) {
return("vv1 already present in dataframe")
}
if (v2!="NULL") {
if (!is.element(v2,names(df))) {
return(paste(v2, "not present in dataframe"))
}
if ("vv2" %in% names(df)) {
return("vv2 already present in dataframe")
}
}
return("")
}
#' Weighted Tabs
#'
#' Run weighted crosstabs on one or two variables, using quoted inputs.
#'
#' @param df input dataframe
#' @param v1 string name of 1st variable/column to tabulate
#' @param v2 string name of optional 2nd variable/column to tablulate. DEFAULT "NULL"
#' @param weight_var string name of weight variable/column. DEFAULT NULL
#' @param sdesign survey package surveydesign object. DEFAULT NULL
#' @param nsize boolean flag to include n-sizes in output. DEFAULT FALSE
#' @param ci boolean flag to include lower- and upper- bounds of confidence intervals. DEFAULT FALSE
#' @param to_factor boolean flag to convert v1/v2 to factors, if needed. DEFAULT TRUE
#' @return dataframe of tab results
#' @export
wtab <- function(df, v1, v2 = "NULL", weight_var = NULL, sdesign = NULL, nsize = FALSE, ci = FALSE, to_factor=TRUE) {
require(dplyr)
require(tibble)
require(stringr)
# check input
msg <- valid_tab_inputs(df, v1, v2)
if (msg!="") {
stop(msg)
}
# drop NA values of v1 and, if needed, v2
df <- df %>%
rename_("vv1" = v1) %>%
filter(!is.na(vv1))
if (v2!="NULL") {
df <- df %>%
rename_("vv2" = v2) %>%
filter(!is.na(vv2))
}
# check that v1/v2 are factors
if (!is.factor(df$vv1)) {
if (to_factor==TRUE) {
df$vv1 <- as.factor(df$vv1)
} else {
stop(v1, ' is not a factor variable. Set to_factor=TRUE to automatically convert it.')
geterrmessage()
}
}
if (v2!="NULL") {
if (!is.factor(df$vv2)) {
if (to_factor==TRUE) {
df$vv2 <- as.factor(df$vv2)
} else {
stop(v2, ' is not a factor variable. Set to_factor=TRUE to automatically convert it.')
geterrmessage()
}
}
}
if (is.null(sdesign)) { # need to build svy design
wform <- reformulate(weight_var)
sdesign <- svydesign(ids = ~0, data = df, weights = wform)
}
est <- svymean(~vv1, design = sdesign) %>%
as.tibble() %>%
rownames_to_column("category") %>%
rename(total = mean, se_total = SE)
if(nsize == TRUE) { # calculate weighted n for total
wtdn <- svytotal(~vv1, design = sdesign) %>%
as.tibble() %>%
rownames_to_column("category") %>%
rename(n_total = total) %>%
select(-SE)
est <- est %>%
left_join(wtdn, by="category")
rm(wtdn)
}
if(ci == TRUE) { # calculate moe/ci for total
est <- est %>%
mutate(moe_total = 1.96 * se_total,
lci_total = total - moe_total,
uci_total = total + moe_total) %>%
select(-moe_total, everything())
}
est <- est %>%
select(-se_total)
# two-way tab
if(v2!="NULL") {
est2 <- svyby(~vv1, ~vv2, design = sdesign, FUN = svymean, keep.names = FALSE) %>%
as.tibble() %>%
gather(key = category, value = value, -vv2) %>%
mutate(col = ifelse(grepl("^se\\.", category), paste0("se_", vv2) , paste(vv2)),
category = sub("^se\\.(.*)", "\\1", category)
) %>%
select(-vv2) %>%
spread(key = col, value = "value")
if(nsize == TRUE) { # calculate weighted n for subgroups
wtdn <- svyby(~vv1, ~vv2, design = sdesign, FUN = svytotal, keep.names = FALSE) %>%
as.tibble() %>%
select(-starts_with("se.")) %>%
gather(key = category, value = value, -vv2) %>%
mutate(col = paste0("n_", vv2)) %>%
select(-vv2) %>%
spread(key = col, value = "value")
est2 <- est2 %>%
left_join(wtdn, by="category")
rm(wtdn)
}
if(ci == TRUE) { # calculate moe/ci for subgroups, via reshaping
ci <- est2 %>%
select(-starts_with("n_")) %>%
gather(key, val, -category) %>%
separate(key, c("prefix", "col"), fill = "left") %>%
spread(prefix, val) %>%
rename(p = `<NA>`)
ci <- ci %>%
mutate(moe = 1.96 * se,
lci = p - moe,
uci = p + moe)
ci <- ci %>%
gather(key, val, -c(category, col)) %>%
unite(col, c(key, col)) %>%
filter(!grepl("^se_", col),
!grepl("^p_", col)) %>%
spread(col, val)
est2 <- est2 %>%
left_join(ci, by="category")
rm(ci)
}
est <- est2 %>%
full_join(est, by="category")
}
est <- est %>%
mutate(category = str_replace(category, paste0("^vv1", "(.*)"), "\\1")) %>%
select(-starts_with("se_"),
-starts_with("moe_")) %>%
select(-starts_with("n_"),
-starts_with("lci_"),
-starts_with("uci_"),
everything()) %>%
select(category, total, everything())
est <- reorder_columns(est)
return(est)
}
#' Unweighted Tabs
#'
#' Run unweighted crosstabs on one or two variables, using quoted inputs.
#'
#' @param df input dataframe
#' @param v1 string name of 1st variable/column to tabulate
#' @param v2 string name of optional 2nd variable/column to tablulate. DEFAULT "NULL"
#' @param nsize boolean flag to include n-sizes in output. DEFAULT FALSE
#' @param ci boolean flag to include lower- and upper- bounds of confidence intervals. DEFAULT FALSE
#' @return dataframe of tab results
#' @export
utab <- function(df, v1, v2 = "NULL", nsize = FALSE, ci = FALSE) {
require(dplyr)
require(tibble)
require(tidyr)
require(stringr)
# check input
msg <- valid_tab_inputs(df, v1, v2)
if (msg!="") {
stop(msg)
}
est <- df %>%
select(one_of(v1)) %>%
table() %>%
as.tibble() %>%
mutate(total = n / sum(n)) %>%
rename(category = 1, n_total = n) %>%
select(category, total, n_total)
if (ci == TRUE) {
est <- est %>%
mutate(moe = 1.96 * sqrt((total * (1-total))/n_total),
lci_total = total - moe,
uci_total = total + moe
) %>%
select(-moe)
}
if(v2!="NULL") {
est2 <- df %>%
select_(v1, v2) %>%
table() %>%
as.tibble() %>%
spread(key = 2, value = n) %>%
rename_at(vars(-1), .funs=funs(paste0(., "_n"))) %>%
mutate_at(vars(-1), .funs=funs(pct = . / sum(.))) %>%
rename_at(vars(ends_with("_n_pct")), .funs = funs(substr(., 1, str_length(.) -6))) %>%
rename_at(vars(ends_with("_n")), .funs = funs(sub("^([^_]+)_n", "n_\\1", .))) %>%
rename(category = 1)
if (ci == TRUE) {
ci <- est2 %>%
gather(key, val, -category) %>%
separate(key, c("prefix", "col"), fill = "left") %>%
spread(prefix, val) %>%
rename(p = `<NA>`)
ci <- ci %>%
group_by(col) %>%
mutate(col_n = sum(n)) %>%
ungroup() %>%
mutate(moe = 1.96 * sqrt((p * (1-p))/col_n),
lci = p - moe,
uci = p + moe) %>%
select(-col_n, -moe)
ci <- ci %>%
gather(key, val, -c(category, col)) %>%
unite(col, c(key, col)) %>%
filter(!grepl("^n_", col),
!grepl("^p_", col)) %>%
spread(col, val)
est2 <- est2 %>%
left_join(ci, by="category")
rm(ci)
}
est <- est %>%
full_join(est2, by="category")
}
if(nsize == FALSE) {
est <- est %>%
select(-starts_with("n_"))
}
est <- est %>%
select(category, total, everything())
est <- reorder_columns(est)
return(est)
}
#' Tabs - Quoted Input
#'
#' Run weighted or unweighted crosstabs on one or two variables, using quoted inputs.
#'
#' @param df input dataframe
#' @param v1 string name of 1st variable/column to tabulate
#' @param v2 string name of optional 2nd variable/column to tablulate. DEFAULT "NULL"
#' @param weight_var string name of weight variable/column. DEFAULT NULL
#' @param sdesign survey package surveydesign object. DEFAULT NULL
#' @param nsize boolean flag to include n-sizes in output. DEFAULT FALSE
#' @param ci boolean flag to include lower- and upper- bounds of confidence intervals. DEFAULT FALSE
#' @param to_factor boolean flag to convert v1/v2 to factors, if needed. DEFAULT TRUE
#' @param collapses list of categories to collapse as "new name"=c(inputs) DEFAULT list()
#' @return dataframe of tab results
#' @seealso \code{\link{wtab}}, \code{\link{utab}} which this function wraps
#' @export
stab <- function(df, v1, v2 = "NULL", weight_var = "NULL", sdesign = NULL, nsize = FALSE, ci = FALSE, to_factor=TRUE, collapses=list()) {
require(survey)
if (weight_var=="NULL" & is.null(sdesign)) {
est <- utab(df, v1, v2 = v2, nsize = nsize, ci = ci)
} else {
est <- wtab(df, v1, v2 = v2, weight_var = weight_var, sdesign = sdesign, nsize = nsize, ci = ci, to_factor=to_factor)
}
if (length(collapses) > 0) {
cdf <- collapser(df, v1, collapses)
if (weight_var=="NULL" & is.null(sdesign)) {
est2 <- utab(cdf, v1, v2 = v2, nsize = nsize, ci = ci)
} else {
est2 <- wtab(cdf, v1, v2 = v2, weight_var = weight_var, sdesign = sdesign, nsize = nsize, ci = ci, to_factor=to_factor)
}
est <- est %>%
bind_rows(est2)
}
return(est)
}
#' Tabs - Unquoted Input
#'
#' Run weighted or unweighted crosstabs on one or two variables, using unquoted inputs.
#'
#' @param df input dataframe
#' @param v1 name of 1st variable/column to tabulate
#' @param v2 name of optional 2nd variable/column to tablulate. DEFAULT NULL
#' @param weight_var name of weight variable/column. DEFAULT NULL
#' @param sdesign survey package surveydesign object. DEFAULT NULL
#' @param nsize boolean flag to include n-sizes in output. DEFAULT FALSE
#' @param ci boolean flag to include lower- and upper- bounds of confidence intervals. DEFAULT FALSE
#' @param to_factor boolean flag to convert v1/v2 to factors, if needed. DEFAULT TRUE
#' @param collapses list of categories to collapse as "new name"=c(inputs) DEFAULT list()
#' @return dataframe of tab results
#' @seealso \code{\link{stab}} which this function wraps
#' @export
tab <- function(df, v1, v2 = NULL, weight_var = NULL, sdesign = NULL, nsize = FALSE, ci = FALSE, to_factor=TRUE, collapses=list()) {
wt_str <- deparse(substitute(weight_var))
v1_str <- deparse(substitute(v1))
v2_str <- deparse(substitute(v2))
paste(c(v1_str, v2_str, wt_str))
est <- stab(df, v1_str, v2 = v2_str, weight_var = wt_str, sdesign = sdesign, nsize = nsize, ci = ci, to_factor=to_factor, collapses=collapses)
return(est)
}
#' Clean Up Column Names
#'
#' Convert machine-readable tab column names to human-readable names.
#'
#' @param col_name string column name
#' @return string updated column name
#' @export
pretty_col_name <- function(col_name) {
s <- strsplit(as.character(col_name), "_")[[1]]
col_name <- paste(toupper(substring(s, 1,1)), substring(s, 2),
sep="", collapse=" ")
return(col_name)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.