#' Compares a variable across a group, both visually and with an appropriate statistical test.
#'
#' @param data data frame. The data to test.
#' @param ... The variables to compare, plus any relevant filters contained within `filters()`. Currently supports one or two variables.
#' @param type character. "Continuous" if `variable` is continuous data (like age), or "categorical" / "discrete" if `variable` is categorical data (like hometown).
#' This can be automatically inferred by whether the variable is numeric or not.
#' @param na.rm logical. Whether or not to remove NAs from the variables being considered.
#' @param top numeric. See documentation for `tab`.
#' @aliases ctable ctab
#' @export
comparison_table <- function(.data, ..., type = NULL, na.rm = FALSE, top = 0) {
dots <- lazyeval::lazy_dots(...)
filter_data <- apply_filters(.data, dots)
.data <- filter_data$data
dots <- filter_data$dots
.print_filters <- filter_data$print_filters
variable <- dots[[1]]
groupby <- dots[[2]]
comparison_table_(.data, variable, groupby, type = type, na.rm = na.rm, top = top, .print_filters = .print_filters)
}
#' @export
#' @import checkr magrittr
comparison_table_ <- checkr::ensure(
pre = list(data %is% dataframe,
variable %is% simple_string || variable %is% lazy,
groupby %is% simple_string || groupby %is% lazy,
type %is% NULL || type %in% c("continuous", "discrete", "categorical"),
na.rm %is% logical,
top %is% numeric, top >= 0,
is.null(.print_filters) || is.character(.print_filters)
),
function(data, variable, groupby, type = NULL, na.rm = FALSE, top = 0, .print_filters = NULL) {
if (variable %is% lazy) {
x <- lazyeval::lazy_eval(variable, data = data)
y <- lazyeval::lazy_eval(groupby, data = data)
} else {
x <- data[[variable]]
y <- data[[groupby]]
}
if (isTRUE(na.rm)) {
x_ <- x[!is.na_like(x) & !is.na_like(y)]
y_ <- y[!is.na_like(x) & !is.na_like(y)]
x <- x_
y <- y_
}
if (length(x) != length(y)) stop("Lengths of x and y differ.")
if (is.null(type)) {
if (is.numeric(x)) { type <- "continuous" }
else { type <- "categorical" }
}
if (identical(type, "discrete")) { type <- "categorical" }
out <- list(
table = table_for(data, variable, groupby, type = type, na.rm = na.rm,
top = top, .print_filters = .print_filters),
stat = stat_for(x, y, type = type)
)
class(out) <- "comparison_table"
out
})
stat_for <- function(x, y, type) {
if (identical(type, "continuous")) { stat_for_continuous(x, y) }
else { stat_for_categorical(x, y) }
}
stat_for_continuous <- function(x, y) {
if (length(unique(y)) == 2) { t_test(x, y) }
else { summary(lm(x ~ y)) }
}
stat_for_categorical <- function(x, y) {
chisq_test(x, y)
}
table_for <- function(data, variable, groupby, type, na.rm, top, .print_filters = NULL) {
if (identical(type, "continuous")) {
table_for_continuous(data, variable, groupby, na.rm = na.rm, .print_filters = .print_filters)
}
else {
table_for_categorical(data, variable, groupby, na.rm = na.rm,
top = top, .print_filters = .print_filters)
}
}
table_for_continuous <- function(data, variable, groupby, na.rm, .print_filters = NULL) {
t <- dplyr::select_(data, get_base_varname(variable), get_base_varname(groupby)) %>%
dplyr::mutate_(ctab__internal_variable = variable$expr) %>%
dplyr::mutate_(ctab__internal_groupby = groupby$expr) %>%
dplyr::select(ctab__internal_variable, ctab__internal_groupby)
if (isTRUE(na.rm)) {
t <- na.omit(t)
}
t <- dplyr::group_by(t, ctab__internal_groupby) %>%
dplyr::summarise_each(., dplyr::funs(
length(.), mean(., na.rm = TRUE), median(., na.rm = TRUE), sd(., na.rm = TRUE))) %>%
setNames(c(get_varname(groupby), "N", "mean", "median", "sd"))
attr(t, "left_var") <- get_varname(variable)
attr(t, "upper_var") <- get_varname(groupby)
attr(t, "na.rm") <- TRUE
attr(t, "filters") <- .print_filters
t
}
table_for_categorical <- function(data, variable, groupby, na.rm = FALSE,
top = 0, .print_filters = NULL) {
tab_(data, .dots = list(variable, groupby), percent = TRUE, freq = FALSE,
byrow = FALSE, na.rm = na.rm, top = top, .print_filters = .print_filters)
}
#' @export
ctable <- comparison_table
#' @export
ctab <- comparison_table
#' Print the table without annoyingly displaying the class.
#' @export
print.comparison_table <- function(x) {
if (x$table %is% tbl_df) {
cat(attr(x$table, "left_var"));
if (!is.null(attr(x$table, "upper_var"))) {
cat(" ### "); cat(attr(x$table, "upper_var"))
}
cat("\n")
if (!is.null(attr(x$table, "filters"))) {
cat("Filters: ", attr(x$table, "filters"), "\n")
}
if (isTRUE(attr(x, "na.rm"))) { cat(" (nas removed)") }
# Hack to not print the source on the tibble::data_frame
print(tibble::trunc_mat(x$table, n = NULL, width = NULL))
} else {
print(x$table)
}
cat("\n")
print(x$stat)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.