knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

Show percentiles, mean, sd, min, max, first, last, and unique counts across quantitative variables in a dataset. This file works out how the ff_summ_bygroup function works from Fan's REconTools Package.

There is a quantitative variable, summarize this quantitative variable by multiple categorical groups, show a variety of statistics. including:

  1. percentiles
  2. mean, sd, etc
  3. min, max 4, first, last, unique counts

Load Packages

rm(list = ls(all.names = TRUE))
library(tibble)
library(tidyr)
library(dplyr)
library(purrr)

# library(ggplot2)
library(kableExtra)

Load Data and Parameers

data(mtcars)
df_mtcars <- mtcars
df <- df_mtcars
vars.group <- c('am', 'vs')
var.numeric <- 'mpg'
str.stats.group <- 'allperc'
ar.perc <- c(0.10, 0.25, 0.5, 0.75, 0.9)
boo.overall.stats <- TRUE

By Group Summarizing

Statistics to Include

# List of statistics
# https://rdrr.io/cran/dplyr/man/summarise.html
strs.center <- c('mean', 'median')
strs.spread <- c('sd', 'IQR', 'mad')
strs.range <- c('min', 'max')
strs.pos <- c('first', 'last')
strs.count <- c('n_distinct')

# Grouping of Statistics
if (str.stats.group == 'main') {
    strs.all <- c('mean', 'min', 'max', 'sd')
}
if (str.stats.group == 'all') {
    strs.all <- c(strs.center, strs.spread, strs.range, strs.pos, strs.count)
}
if (str.stats.group == 'allperc') {
    ar_st_percentile_func_names <- paste0(ar.perc*100, "%")
    funs_percentiles <- map(ar.perc, ~partial(quantile, probs = .x, na.rm = TRUE)) %>% set_names(nm = ar_st_percentile_func_names)
    strs.all <- c(strs.center, strs.spread, funs_percentiles, strs.range, strs.pos, strs.count)
}

Overall Statistics

# Start Transform
df <- df %>% drop_na() %>% mutate(!!(var.numeric) := as.numeric(!!sym(var.numeric)))

# Overall Statistics
if (boo.overall.stats) {
    df.overall.stats <- df %>% summarize_at(vars(var.numeric), funs(!!!strs.all))
    if (length(strs.all) == 1) {
        # give it a name, otherwise if only one stat, name of stat not saved
        df.overall.stats <- df.overall.stats %>% rename(!!strs.all := !!sym(var.numeric))
    }
    names(df.overall.stats) <- paste0(var.numeric, '.', names(df.overall.stats))
}

# Display Results
kable(df.overall.stats) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))

Summarizing by Groups

# Group Sort
df.select <- df %>%
              group_by(!!!syms(vars.group)) %>%
              arrange(!!!syms(c(vars.group, var.numeric)))



# Table of Statistics
df.table.grp.stats <- df.select %>% summarize_at(vars(var.numeric), funs(!!!strs.all))

# Display Results
kable(df.table.grp.stats) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))

# Add Stat Name
if (length(strs.all) == 1) {
    # give it a name, otherwise if only one stat, name of stat not saved
    df.table.grp.stats <- df.table.grp.stats %>% rename(!!strs.all := !!sym(var.numeric))
}

# Display Results
kable(df.table.grp.stats) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))

Statistics as Row

# Row of Statistics
str.vars.group.combine <- paste0(vars.group, collapse='_')
if (length(vars.group) == 1) {
    df.row.grp.stats <- df.table.grp.stats %>%
            mutate(!!(str.vars.group.combine) := paste0(var.numeric, '.',
                                           vars.group, '.g',
                                           (!!!syms(vars.group)))) %>%
            gather(variable, value, -one_of(vars.group)) %>%
            unite(str.vars.group.combine, c(str.vars.group.combine, 'variable')) %>%
            spread(str.vars.group.combine, value)
} else {
    df.row.grp.stats <- df.table.grp.stats %>%
                            mutate(vars.groups.combine := paste0(paste0(vars.group, collapse='.')),
                                   !!(str.vars.group.combine) := paste0(interaction(!!!(syms(vars.group))))) %>%
                            mutate(!!(str.vars.group.combine) := paste0(var.numeric, '.', vars.groups.combine, '.',
                                                                       (!!sym(str.vars.group.combine)))) %>%
                            ungroup() %>%
                            select(-vars.groups.combine, -one_of(vars.group)) %>%
            gather(variable, value, -one_of(str.vars.group.combine))  %>%
            unite(str.vars.group.combine, c(str.vars.group.combine, 'variable')) %>%
            spread(str.vars.group.combine, value)
}

# Display Results
kable(df.row.grp.stats) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))

Process Outputs

# Clean up name strings
names(df.table.grp.stats) <- gsub(x = names(df.table.grp.stats),pattern = "_", replacement = "\\.")
names(df.row.grp.stats) <- gsub(x = names(df.row.grp.stats),pattern = "_", replacement = "\\.")

# Return
list.return <- list(df_table_grp_stats = df.table.grp.stats, df_row_grp_stats = df.row.grp.stats)

# Overall Statistics, without grouping
if (boo.overall.stats) {
    df.row.stats.all <- c(df.row.grp.stats, df.overall.stats)
    list.return <- append(list.return, list(df_overall_stats = df.overall.stats,
                                            df_row_stats_all = df.row.stats.all))
}

# Display Results
kable(df.table.grp.stats) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
kable(df.row.grp.stats) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
kable(df.overall.stats) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))


FanWangEcon/REconTools documentation built on Jan. 21, 2022, 10:28 p.m.