Nothing
#' Summary and comparison of the levels in categorical columns
#'
#' @description For a single dataframe, summarise the levels of each categorical
#' column. If two dataframes are supplied, compare the levels of categorical features
#' that appear in both dataframes. For grouped dataframes, summarise the levels
#' of categorical features separately for each group.
#'
#' @param df1 A dataframe.
#' @param df2 An optional second data frame for comparing categorical levels.
#' Defaults to \code{NULL}.
#' @param include_int Logical flag - whether to treat integer columns as categories. Default is \code{FALSE}.
#' @return A tibble summarising or comparing the categorical features
#' in one or a pair of dataframes.
#'
#' @details
#' For a \strong{single dataframe}, the tibble returned contains the columns: \cr
#' \itemize{
#' \item \code{col_name}, character vector containing column names of \code{df1}.
#' \item \code{cnt} integer column containing count of unique levels found in each column,
#' including \code{NA}.
#' \item \code{common}, a character column containing the name of the most common level.
#' \item \code{common_pcnt}, the percentage of each column occupied by the most common level shown in
#' \code{common}.
#' \item \code{levels}, a named list containing relative frequency tibbles for each feature.
#' }
#' For a \strong{pair of dataframes}, the tibble returned contains the columns: \cr
#' \itemize{
#' \item \code{col_name}, character vector containing names of columns appearing in both
#' \code{df1} and \code{df2}.
#' \item \code{jsd}, a numeric column containing the Jensen-Shannon divergence. This measures the
#' difference in relative frequencies of levels in a pair of categorical features. Values near
#' to 0 indicate agreement of the distributions, while 1 indicates disagreement.
#' \item \code{pval}, the p-value corresponding to a NHT that the true frequencies of the categories are equal.
#' A small p indicates evidence that the the two sets of relative frequencies are actually different. The test
#' is based on a modified Chi-squared statistic.
#' \item \code{lvls_1}, \code{lvls_2}, the relative frequency of levels in each of \code{df1} and \code{df2}.
#' }
#' For a \strong{grouped dataframe}, the tibble returned is as for a single dataframe, but where
#' the first \code{k} columns are the grouping columns. There will be as many rows in the result
#' as there are unique combinations of the grouping variables.
#' @author Alastair Rushworth
#' @seealso \code{\link{inspect_imb}}, \code{\link{show_plot}}
#' @export
#' @examples
#' # Load dplyr for starwars data & pipe
#' library(dplyr)
#'
#' # Single dataframe summary
#' inspect_cat(starwars)
#'
#' # Paired dataframe comparison
#' inspect_cat(starwars, starwars[1:20, ])
#'
#' # Grouped dataframe summary
#' starwars %>% group_by(gender) %>% inspect_cat()
#' @importFrom tibble as_tibble
#' @importFrom tibble tibble
#' @importFrom dplyr arrange
#' @importFrom dplyr contains
#' @importFrom dplyr do
#' @importFrom dplyr group_by
#' @importFrom dplyr left_join
#' @importFrom dplyr mutate
#' @importFrom dplyr mutate_if
#' @importFrom dplyr rename
#' @importFrom dplyr select_if
#' @importFrom dplyr select
#' @importFrom dplyr slice
#' @importFrom dplyr ungroup
#' @importFrom magrittr %>%
#' @importFrom progress progress_bar
#' @importFrom Rcpp compileAttributes
inspect_cat <- function(df1, df2 = NULL, include_int = FALSE){
# perform basic column check on dataframe input
input_type <- check_df_cols(df1, df2)
# capture the data frame names
df_names <- get_df_names()
# if only a single df input
if(input_type == "single"){
# pick out categorical columns
# is.date_or_time <- function(v)
col_cats <- function(v) is.character(v) | is.factor(v) | is.logical(v) | any(c("Date", "datetime") %in% class(v))
if(include_int) col_cats <- function(v) is.character(v) | is.factor(v) | is.logical(v) | any(c("Date", "datetime") %in% class(v)) | is.integer(v)
df_cat <- df1 %>%
select_if(col_cats) %>%
mutate_if(is.factor, as.character) %>%
mutate_if(is.integer, as.character)
# calculate association if categorical columns exist
if(ncol(df_cat) > 0){
# check any factors for duplicate labels
df_cat <- check_factors(df_cat)
# get the levels for each category
names_cat <- colnames(df_cat)
n_cols <- ncol(df_cat)
levels_list <- vector("list", length = length(df_cat))
# loop over columns and tabulte frequencies
pb <- start_progress(prefix = " Column", total = n_cols)
for(i in 1:n_cols){
update_progress(bar = pb, iter = i, total = n_cols, what = names_cat[i])
levels_list[[i]] <- fast_table(df_cat[[i]], show_na = TRUE, show_cnt = TRUE)
}
names(levels_list) <- names(df_cat)
# get the most common level
levels_top <- lapply(levels_list, function(M) M[1, ]) %>%
do.call("rbind", .) %>%
mutate(col_name = names_cat) %>%
select(-cnt)
# get the unique levels
levels_unique <- suppressWarnings(lapply(levels_list, nrow) %>%
do.call("rbind", .) %>%
as_tibble(rownames = "col_name"))
# combine the above tables
out <- levels_unique %>%
left_join(levels_top, by = "col_name") %>%
mutate(prop = prop * 100) %>%
rename(cnt = V1, common = value, common_pcnt = prop)
out$levels <- levels_list
# sort by alphabetical order & filter to max number of rows
out <- out %>%
arrange(col_name)
# add names to the list
names(out$levels) <- out$col_name
} else {
out <- tibble(col_name = character(), cnt = integer(),
common = character(), common_pcnt = numeric(),
levels = list())
}
}
if(input_type == "pair"){
# levels for df1
s1 <- inspect_cat(df1) %>%
select(-contains("common"), -cnt)
# levels for df2
s2 <- inspect_cat(df2) %>%
select(-contains("common"), -cnt)
# combine and clean up levels
out <- full_join(s1, s2, by = "col_name") %>%
mutate(jsd = js_divergence_vec(levels.x, levels.y)) %>%
mutate(pval = chisq(levels.x, levels.y, n_1 = nrow(df1), n_2 = nrow(df2))) %>%
select(col_name, jsd, pval, lvls_1 = levels.x, lvls_2 = levels.y)
# ensure the list names are retained
names(out[[4]]) <- names(out[[5]]) <- as.character(out$col_name)
}
if(input_type == "grouped"){
out <- apply_across_groups(df = df1, fn = inspect_cat)
}
attr(out, "type") <- list(method = "cat", input_type = input_type)
attr(out, "df_names") <- df_names
return(out)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.