Nothing
#' Efficient tabulation
#'
#' Produces a tabulation: for each unique group from the variable(s),
#' \code{tab} shows the number of
#' observations with that value, proportion of observations with that
#' value, and cumulative proportion, in descending order of frequency.
#' Accepts data.table, tibble, or data.frame as input.
#' Efficient with big data: if you give it a \code{data.table},
#' \code{tab} uses \code{data.table} syntax.
#'
#' @usage tab(df, ..., by, round)
#'
#' @param df A data.table, tibble, or data.frame.
#' @param ... A column or set of columns (without quotation marks).
#' @param by A variable by which you want to group observations before tabulating (without quotation marks).
#' @param round An integer indicating the number of digits for proportion and cumulative proportion.
#'
#' @return Tabulation (frequencies, proportion, cumulative proportion) for each unique value of the variables given in \code{...} from \code{df}.
#'
#' @importFrom data.table data.table
#' @importFrom data.table .SD
#' @importFrom data.table :=
#' @importFrom data.table setcolorder
#' @importFrom data.table .GRP
#' @importFrom data.table .N
#' @importFrom magrittr %>%
#' @importFrom dplyr tibble
#' @importFrom stats quantile
#'
#' @examples
#' # data.table
#' library(data.table)
#' library(magrittr)
#' a <- data.table(varname = sample.int(20, size = 1000000, replace = TRUE))
#' a %>% tab(varname)
#'
#' # tibble
#' library(dplyr)
#' b <- tibble(varname = sample.int(20, size = 1000000, replace = TRUE))
#' b %>% tab(varname, round = 1)
#'
#' # data.frame
#' c <- data.frame(varname = sample.int(20, size = 1000000, replace = TRUE))
#' c %>% tab(varname)
#'
#' @export
tab <- function(df, ..., by = NULL, round = 2) {
UseMethod("tab", df)
}
#' @export
tab.data.table <- function(df, ..., by = NULL, round = 2) { # note ... is the variable names to group by
. <- temp_prop <- prop <- cum_prop <- N <- NULL
dt <- df # in case df has a condition on it
group_by <- rlang::enquos(...) %>% purrr::map(rlang::as_name) %>% unlist()
by__ <- rlang::enexpr(by)
if (!is.null(by__)) {
by_ <- rlang::enexpr(by) %>% rlang::as_name()
# assert constant values of group_by within by
assertthat::assert_that(
dt[, .GRP, by = c(by_, group_by)][, .N] ==
dt[, .GRP, by = by_][, .N]
)
dt <- dt[, .GRP, by = c(by_, group_by)]
tab.data.table(dt, ..., by = NULL, round = round) # recursive function
}
rowsofdata <- dt[, .N] # faster than nrow() on big data.tables
dt[, .N, by = group_by] %>%
.[, temp_prop := N / rowsofdata] %>%
.[, prop := round(temp_prop, digits = round)] %>%
# sort in descending order by N before cumulative prop
.[order(-N)] %>%
.[, cum_prop := round(cumsum(temp_prop), digits = round)] %>%
# remove temp var
.[, temp_prop := NULL] %>%
# make sure final data.table sorted
.[order(-N)]
}
#' @export
tab.tbl_df <- function(df, ..., by = NULL, round = 2) { # to check without requiring tibble
N <- temp_prop <- NULL
group_by <- rlang::enquos(...)
by_ <- rlang::enquo(by)
by__ <- rlang::enexpr(by)
if (!is.null(by__)) {
assertthat::assert_that(
df %>% dplyr::distinct(!!by_, !!!group_by) %>% nrow() ==
df %>% dplyr::distinct(!!by_) %>% nrow()
)
df <- df %>% dplyr::distinct(!!by_, !!!group_by, .keep_all = TRUE)
tab.tbl_df(df, ..., by = NULL, round = round) # recursive function
}
rowsofdata <- nrow(df)
df %>%
dplyr::group_by(!!!group_by) %>% # !!! since it's a quosure
dplyr::summarize(N = dplyr::n()) %>%
dplyr::arrange(dplyr::desc(N)) %>%
dplyr::ungroup() %>%
dplyr::mutate(
temp_prop = N / rowsofdata,
prop = round(temp_prop, digits = round),
cum_prop = round(cumsum(temp_prop), digits = round)
) %>%
dplyr::select(-temp_prop)
}
#' @export
tab.data.frame <- function(df, ..., by = NULL, round = 2) { # to check without requiring tibble
by__ <- rlang::enexpr(by)
if (!is.null(by__)) {
by_ <- rlang::enquo(by)
tab.data.table(data.table::as.data.table(df), ..., by = !!by_, round = round)
} else {
tab.data.table(data.table::as.data.table(df), ..., by = NULL, round = round)
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.