Nothing
#Created: 2019-10-14
#Updated: 2020-02-12
#Author: Alex Zajichek
#Package: cheese
#Description: Function definitions for the package
#Name: actual_depth
#Description: Compute the explicit mapping depth based on what is implied by the input
actual_depth <-
function(
depth, #An abstract depth
ref #A reference value for adjusting depth
) {
#Check that inputs are numeric
if(!is.numeric(depth) | !is.numeric(ref))
stop("Inputs must be numeric.")
#If depth is negative, subtract from ref
if(depth < 0) {
#Use zero if it goes beyond
depth <- max(ref + depth, 0)
} else {
#Use maximum depth as the limit
depth <- min(depth, ref)
}
depth
}
#Name: divide
#Description: Stratify a data frame into a list
divide <-
function(
data, #A data frame
..., #Variables to split by
depth = Inf, #Depth to split to; default to max depth
remove = TRUE, #Should stratification variables be removed?
drop = TRUE, #Should unused combinations be dropped?
sep = "|" #Character to separate levels when necessary
) {
#Splice variable names
selected_vars <-
tidyselect::eval_select(
rlang::expr(c(...)),
data
)
#Return error if no split variables entered
if(length(selected_vars) == 0)
stop("No columns registered.")
#Set values as names
selected_vars <- as.list(names(selected_vars))
#Get the depth
depth <-
actual_depth(
depth = depth,
ref = length(selected_vars)
)
#Return data if depth is 0
if(depth == 0) {
#Give warning
warning("Split depth is 0; returning original data.")
#Return original data as a tibble
return(data %>% tibble::as_tibble())
}
#Pull elements past position 'depth' into position 'depth'
selected_vars[[depth]] <-
selected_vars[seq.int(depth, length(selected_vars))] %>%
purrr::flatten_chr()
#Only keep up to desired depth
selected_vars <- selected_vars[seq_len(depth)]
#Set initial list to start loop
data <-
data %>%
#Convert to tibble
tibble::as_tibble() %>%
#Split by first variable(s)
split(
dplyr::select(data, tidyselect::all_of(selected_vars[[1]])),
drop = drop,
sep = sep
)
#If needed, loop to split at desired depth
if(depth > 1) {
for(i in seq_len(depth - 1)) {
data <-
data %>%
#Split at leaf data frames
purrr::modify_depth(
.depth = i,
~
.x %>%
split(
dplyr::select(.x, tidyselect::all_of(selected_vars[[i + 1]])),
drop = drop,
sep = sep
)
)
}
}
#Remove all split variables if necessary
if(remove) {
data <-
data %>%
#Alter at the leaves
purrr::modify_depth(
.depth = depth,
~
.x %>%
dplyr::select(
-tidyselect::all_of(
selected_vars %>%
purrr::flatten_chr()
)
)
)
}
#Return result
data
}
#Name: depths_string
#Description: Create representation of elements satisfying a predicate in a list structure
depths_string <-
function(
list, #A list, data frame or atomic vector
predicate, #A binary function
bare = TRUE, #Only continue on bare lists
... #Additional arguments for 'predicate'
) {
#Check if function is supplied
if(missing(predicate))
stop("No predicate supplied.")
#Get continuation function
continue <- rlang::is_list
if(bare)
continue <- rlang::is_bare_list
#Check if elements satisfy the predicate
result <-
list %>%
purrr::map_lgl(
.f = predicate,
...
)
#Make vector of indices
index <- seq_along(result)
#Merge indices with indicator
result <- ifelse(result, -1*index, index)
#Check if next elements are lists
are_lists <-
list %>%
purrr::map_lgl(
.f = continue
)
#Continue traversal for list elements only
if(any(are_lists)) {
result <-
c(
#Recursive function call for list elements
result[are_lists] %>%
stringr::str_c(
list[are_lists] %>%
purrr::map_chr(
.f = depths_string,
predicate = predicate,
bare = bare,
...
)
),
#Concatenate with indices of non-list elements
result[!are_lists]
)[order(order(are_lists, decreasing = TRUE))]
}
#Collapse indices and add boundaries
stringr::str_c(
"{",
stringr::str_c(
result,
collapse = ","
),
"}"
)
}
#Name: depths
#Description: Find the depths in a list structure that satisfy a predicate
depths <-
function(
list, #A list, data frame or atomic vector
predicate, #A binary function
bare = TRUE, #Only continue on bare lists
... #Additional arguments for 'predicate'
) {
#Get string representation of structure
string <-
depths_string(
list = list,
predicate = predicate,
bare = bare,
...
)
#Split input into vectors of characters
string <- strsplit(string, split = "")[[1]]
#Get the depth of each character
result <- cumsum(string == "{") - cumsum(string == "}")
#Return depths that satisfied the predicate
unique(result[string == "-"])
}
#Name: fasten
#Description: Collapse a list of data frames of arbitrary depth to any depth
fasten <-
function(
list, #A list with data frames at the leaves
into = NULL, #Character vector of variable names (use "" for NULL)
depth = 0 #Depth to collapse to
) {
#Check for a bare list
if(!rlang::is_bare_list(list))
stop("A bare list must be supplied.")
#Find depth of leaves
bind_depth <-
list %>%
#Extract the depth
depths(
predicate = is.data.frame,
bare = TRUE
)
#Check for a data frame
if(length(bind_depth) == 0)
stop("There must be data frames at the leaves.")
#Check for single depth
if(length(bind_depth) > 1)
stop("All data frames must be at the same depth.")
#Get implied depth
depth <-
actual_depth(
depth = depth,
ref = bind_depth
)
#Return original list if no binding is needed
if(depth == bind_depth) {
#Give warning
warning("Desired depth equal to current depth. Returning original list.")
#Return original data as a tibble
return(list)
}
#Check if column names were provided
if(is.null(into)) {
#Make vector of empty strings
into <- rep("", bind_depth - depth)
} else {
#Check if input is a character vector
if(!is.character(into)) {
#Coerce to character
into <- as.character(into)
#Give warning
warning("Coercing 'into' to character vector.")
}
#Check if enough column names were provided
if(length(into) < bind_depth - depth) {
#Add on empty strings for sufficient length
into <- c(into, rep("", bind_depth - depth - length(into)))
} else if(length(into) > bind_depth - depth) {
#Give warning
warning(paste0(length(into), " column names given but only ", bind_depth - depth, " needed."))
#Only take elements needed
into <- into[seq_len(bind_depth - depth)]
}
}
#Loop to bind rows
while(bind_depth > depth) {
#Get inner column name
.id <- into[length(into)]
#Set to NULL if empty string
if(.id == "")
.id <- NULL
if(bind_depth > 1) {
list <-
list %>%
#Bind in the interior of list
purrr::modify_depth(
.depth = bind_depth - 1,
dplyr::bind_rows,
.id = .id
)
} else {
list <-
list %>%
#Bind at top level of list
dplyr::bind_rows(
.id = .id
)
}
#Move to next depth
bind_depth <- bind_depth - 1
#Remove used column header
into <- into[-length(into)]
}
list
}
#Name: muddle
#Description: Randomly permute some or all columns in a data frame
muddle <-
function(
data, #A data frame
at, #Columns to permute
... #Arguments to pass to 'sample'
) {
#Set to all columns if missing input
if(missing(at))
at <- names(data)
#Splice variable names
selected_vars <-
tidyselect::eval_select(
rlang::enquo(at),
data
)
#Return error if no split variables entered
if(length(selected_vars) == 0)
stop("No columns registered.")
#Shuffle desired variables
data %>%
purrr::map_at(
selected_vars,
sample,
...
) %>%
#Combine columns back together
dplyr::bind_cols()
}
#Name: stratiply
#Description: Stratify a data frame and apply a function
stratiply <-
function(
data, #Any data frame
f, #Function to apply to each strata
by, #Stratification variables
... #Additional arguments to pass to f
) {
#Check for a function
if(missing(f))
stop("No function supplied.")
#Check for inputs
if(missing(by))
stop("No columns supplied.")
#Splice variable names
selected_vars <-
tidyselect::eval_select(
rlang::enquo(by),
data
)
#Return error if no split variables entered
if(length(selected_vars) == 0)
stop("No columns registered.")
data <-
data %>%
#Divide frame into a list
divide(tidyselect::all_of(selected_vars))
#Find mapping depth
mapping_depth <-
data %>%
depths(
predicate = is.data.frame,
bare = TRUE
)
#Evaluate function at desired depth
data %>%
purrr::map_depth(
.depth = mapping_depth,
.f = f,
...
)
}
#Name: dish
#Description: Evaluate a function on combinations of columns
dish <-
function(
data, #A data frame
f, #A two-argument function
left, #Columns to be entered in arg1
right, #Columns to be entered in arg2
each_left = TRUE, #Should the left variables be entered individually as vectors?
each_right = TRUE, #Should the right variables be entered individually as vectors?
...
) {
#Check for function
if(missing(f))
stop("No function supplied.")
#Check for left inputs
if(missing(left)) {
#Check for right inputs
if(missing(right)) {
#Set both sides to all variables
left <-
tidyselect::eval_select(
rlang::expr(names(data)),
data
)
right <- left
} else {
#Splice right variables
right <-
tidyselect::eval_select(
rlang::enquo(right),
data
)
#Check for registration
if(length(right) == 0)
stop("No right variables registered.")
#Left variables are everything else
left <-
tidyselect::eval_select(
rlang::expr(names(data)[-right]),
data
)
}
} else {
#Splice left variables
left <-
tidyselect::eval_select(
rlang::enquo(left),
data
)
#Check for registration
if(length(left) == 0)
stop("No left variables registered.")
#Check for right inputs
if(missing(right)) {
#Right variables are everything else
right <-
tidyselect::eval_select(
rlang::expr(names(data)[-left]),
data
)
} else {
#Splice right variables
right <-
tidyselect::eval_select(
rlang::enquo(right),
data
)
#Check for registration
if(length(right) == 0)
stop("No right variables registered.")
}
}
#Select data subsets
left <- data %>% dplyr::select(tidyselect::all_of(left))
right <- data %>% dplyr::select(tidyselect::all_of(right))
###Evaluate function depending on 'each_' arguments
#Check for left evaluation
if(each_left) {
#Check for right evaluation
if(each_right) {
#Evaluate each left argument as a vector with each right argument as a vector
left %>%
#For every left variable
purrr::map(
#Evaluate the function for every right variable
function(.x)
right %>% purrr::map(function(.y) f(.x, .y, ...))
)
} else {
#Evaluate each left argument as a vector with the right columns in a single data frame
left %>%
#For every left variable
purrr::map(
#Evaluate the function with the right data frame
function(.x) f(.x, right, ...)
)
}
} else {
#Check for right evaluation
if(each_right) {
#Evaluate each right argument as a vector with the left columns in a single data frame
right %>%
#For every right variable
purrr::map(
#Evaluate the function with the left data frame
function(.x) f(left, .x,...)
)
} else {
#Evaluate with a data frame of left columns and data frame of right columns
f(left, right, ...)
}
}
}
#Name: absorb
#Description: Populate a custom text template with the values in a set of key-value pairs
absorb <-
function(
key,
value,
text,
sep = "_",
trace = FALSE,
evaluate = FALSE
) {
#Check that lengths match
if(length(key) != length(value))
stop("Keys and values must hold the same number of elements.")
#Check text is a character vector
if(!is.character(text))
stop("Text input must be a vector of character strings.")
#Check for names
names <- names(text)
#Put into data frame
lookup <-
tibble::tibble(
key = key,
value = value
) %>%
#Convert keys/values to character vectors
dplyr::mutate_all(
as.character
) %>%
#Concatenate values for each key
dplyr::group_by(
key
) %>%
dplyr::summarise(
value = stringr::str_c(value, collapse = sep)
)
#Repeat for each unique key
for(i in seq_len(nrow(lookup))) {
#Trace if desired
if(trace)
cat(text, "\n")
#Replace all substrings in text with values that match the key
text <- text %>% stringr::str_replace_all(pattern = lookup$key[i], replacement = lookup$value[i])
}
#Restore names if provided
if(!is.null(names))
names(text) <- names
#Evaluate expressions if necessary
if(evaluate)
text <- text %>% purrr::map(~eval(parse(text = .x)))
#Return result
text
}
#Name: some_type
#Description: Does an object conform to one or more types?
some_type <-
function(
object,
types
) {
#Check for inputs
if(missing(object))
stop("No object supplied")
#Check for types
if(missing(types))
stop("No types supplied")
#Loop through input
match_found <- FALSE
for(i in seq_along(types)) {
#Check if object type conforms
if(methods::is(object, types[i])) {
#Change status; exit loop
match_found <- TRUE
break
}
}
#Return indicator
match_found
}
#Name: typly
#Description: Apply a function to columns conforming (or not) to the specified types
typly <-
function(
data,
f,
types,
negated = FALSE,
...
) {
#Check for a function
if(missing(f))
stop("No function supplied.")
#Check for types
if(missing(types))
stop("No types supplied")
#Test type matching
type_test <-
data %>%
#Get a logical vector
purrr::map_lgl(
.f = some_type,
types = types
)
#Negate if needed
if(negated)
type_test <- !type_test
#If any conform, apply function
if(any(type_test)) {
data %>%
#Select columns with conforming types
dplyr::select_if(
type_test
) %>%
#Apply the function
purrr::map(
f,
...
)
} else {
NULL
}
}
#Name: order_levels
#Description: Returns order of combinations of levels
order_levels <-
function(
data,
sep = "_"
) {
#Find set of unique values for each variable
sets <-
data %>%
purrr::map(unique)
#Get rank order of levels within each column
ranks <-
sets %>%
purrr::map_df(
~
tibble::tibble(
name = as.character(.x),
value = order(.x)
),
.id = "column"
)
sets <-
sets %>%
#Convert to character
purrr::map(as.character) %>%
#Get cross-product for levels
purrr::cross_df()
sets %>%
#Add group identifier
dplyr::mutate(
.id = seq_len(nrow(sets))
) %>%
#Send down the rows
tidyr::gather(
key = "column",
value = "name",
-tidyselect::all_of(".id")
) %>%
#Get rank of each level within each column
dplyr::inner_join(
y = ranks,
by = c("column", "name")
) %>%
#Within each combination
dplyr::group_by_at(".id") %>%
#Concatenate the levels
dplyr::summarise(
index = paste(.data$value, collapse = ""),
name = paste(.data$name, collapse = sep)
) %>%
dplyr::ungroup() %>%
#Keep rank and level
dplyr::transmute(
index = order(order(.data$index)),
name = .data$name
)
}
#Name: stretch
#Description: Span keys and values across the columns
stretch <-
function(
data,
key,
value,
sep = "_"
) {
#Check for keys
if(missing(key))
stop("No key(s) supplied.")
#Splice the keys
key <-
tidyselect::eval_select(
rlang::enquo(key),
data
) %>%
names
#Gather column order for keys
column_order <-
data %>%
dplyr::select(
tidyselect::all_of(key)
) %>%
order_levels(
sep = sep
)
#Check for registration
if(length(key) == 0)
stop("No key(s) registered")
#Check for values
if(missing(value))
stop("No value(s) supplied.")
#Splice the keys
value <-
tidyselect::eval_select(
rlang::enquo(value),
data
) %>%
names
#Check for registration
if(length(value) == 0)
stop("No value(s) registered")
##Create a single column for the keys
data <-
data %>%
#Combine columns together
tidyr::unite(
col = ".key",
key,
sep = sep,
remove = TRUE
)
#Filter column ordering to those existing in the data
column_order <-
column_order %>%
dplyr::filter(
.data$name %in% purrr::pluck(data, ".key")
)
#Gather id variables
id_vars <- setdiff(names(data), c(".key", value))
#Collect column ordering
temp_column_order <- tibble::tibble()
#For each value column...
result <- list()
for(i in seq_along(value)) {
#Extract the current value
value_i <- value[i]
#Make a temporary data frame
temp_dat <-
data %>%
#Remove all value columns except the current one
dplyr::select(
-tidyselect::all_of(
setdiff(value, value_i)
)
)
#Append keys if needed
if(length(value) > 1) {
temp_dat <-
temp_dat %>%
#Append key column
dplyr::mutate_at(
".key",
~paste0(.x, sep, value_i)
)
#Add to column ordering
temp_column_order <-
temp_column_order %>%
dplyr::bind_rows(
column_order %>%
#Append value name and add index
dplyr::mutate(
name = paste0(.data$name, sep, value_i),
val_index = i
)
)
}
result[[i]] <-
temp_dat %>%
#Send this value across the columns
tidyr::spread(
key = ".key",
value = value[i]
)
}
#Combine based on presence of id columns
if(length(id_vars) > 0) {
result <-
result %>%
#Iteratively join
purrr::reduce(
.f = dplyr::inner_join,
by = id_vars
)
} else {
result <-
result %>%
dplyr::bind_cols()
}
#Find final sorting order
if(nrow(temp_column_order) == 0)
temp_column_order <- column_order
column_order <-
temp_column_order %>%
#Arrange by indices
dplyr::arrange_at(
dplyr::vars(
tidyselect::any_of(
c(
"index",
"val_index"
)
)
)
) %>%
#Extract vector
dplyr::pull(.data$name)
#Rearrange the columns
result %>%
dplyr::select(
tidyselect::all_of(
c(
id_vars,
column_order
)
)
)
}
#Name: grable
#Description: Make a hierarchical kable
grable <-
function(
data, #A dataset
at, #Columns to include in hierarchy
sep = "_", #Delimiter for parsing
reverse = FALSE, #Stack in opposite direction?
format = c("html", "latex"), #Rendering format
caption = NULL, #Table caption
... #Arguments passed to kableExtra::kable_styling
) {
#Set to all columns if missing input
if(missing(at)) {
#Extract all indices
at <- seq_along(names(data))
#Set names
names(at) <- names(data)
}
#Splice variable names
selected_vars <-
tidyselect::eval_select(
rlang::enquo(at),
data
)
#Return error if no split variables entered
if(length(selected_vars) == 0)
stop("No columns registered.")
#Split to create header matrix
headers <-
selected_vars %>%
names %>%
stringr::str_split(
pattern = sep,
simplify = TRUE
)
#Iterable index vector
index <- rev(seq_len(ncol(headers)))
#Reverse if needed
if(reverse)
index <- rev(index)
#Replace names of data with first level
names(data)[selected_vars] <- headers[,index[1]]
#Get format
format <- match.arg(format)
#Make initial kable
result <-
data %>%
knitr::kable(
format = format,
caption = caption
) %>%
kableExtra::kable_styling(...)
#Iteratively add layers
if(length(index) > 1) {
#Make a template
template <- rep(" ", ncol(data))
for(i in 2:length(index)) {
#Make index
temp_template <- template
temp_template[selected_vars] <- headers[,index[i]]
#Add empty spots
result <-
result %>%
kableExtra::add_header_above(
kableExtra::auto_index(
temp_template
)
)
}
}
result
}
#Name: default_univariate_functions
#Description: Provides list of functions for different types of data
default_univariate_functions <-
function(useNA) {
list(
#Functions for all data
f_all =
list(
length = length,
missing = ~sum(is.na(.)),
available = ~sum(!is.na(.)),
class = class,
unique = ~length(unique(.))
),
#Functions for numeric data
f_numeric =
list(
mean = ~mean(., na.rm = TRUE),
sd = ~sd(., na.rm = TRUE),
min = ~min(., na.rm = TRUE),
q1 = ~quantile(., .25, na.rm = TRUE),
median = ~median(., na.rm = TRUE),
q3 = ~quantile(., .75, na.rm = TRUE),
max = ~max(., na.rm = TRUE),
iqr = ~IQR(., na.rm = TRUE),
range = ~diff(range(., na.rm = TRUE))
),
#Functions for categorical data
f_categorical =
list(
count = ~table(., useNA = useNA),
proportion = ~prop.table(table(., useNA = useNA)),
percent = ~prop.table(table(., useNA = useNA))*100
)
) %>%
#Convert everything to a function
purrr::map_depth(
.depth = 2,
rlang::as_function
)
}
#Name: descriptives_component
#Description: Internal function to build a subset of the descriptives() call for a given result
descriptives_component <-
function(res, na_string) {
#Check if the input is numeric or character
if(!some_type(res, c("numeric", "character", "logical", "Date", "table")))
stop("Function must return a an atomic vector or table.")
#Check if result has names
names <- NA_character_
if(!is.null(names(res)))
names <- dplyr::coalesce(names(res), na_string)
#Extract values from a table
if(is.table(res))
res <- as.numeric(res)
#Add result to a data frame
temp_result <-
tibble::tibble(
val_ind = seq_along(res),
val_lab = names,
val_dbl = res
)
#Change label for non-numeric output
if(!some_type(res, "numeric"))
names(temp_result)[3] <- "val_chr"
#Return results
temp_result
}
#Name: descriptives
#Description: Compute descriptive statistics on columns of a data frame
descriptives <-
function(
data,
f_all = NULL,
f_numeric = NULL,
numeric_types = "numeric",
f_categorical = NULL,
categorical_types = "factor",
f_other = NULL,
useNA = c("ifany", "no", "always"),
round = 2,
na_string = "(missing)"
) {
#Get default functions
useNA <- match.arg(useNA)
default_functions <- default_univariate_functions(useNA = useNA)
#Add user-specified functions
f_all <- c(f_all, default_functions$f_all)
f_numeric <- c(f_numeric, default_functions$f_numeric)
f_categorical <- c(f_categorical, default_functions$f_categorical)
###Evaluate functions for...
#All columns
all_results <-
f_all %>%
#Apply each function to each column
purrr::map(
~
data %>%
purrr::map(
.f = .x
)
)
#Numeric columns
numeric_results <-
f_numeric %>%
#Apply function for each type
purrr::map(
typly,
data = data,
types = numeric_types
)
#Categorical columns
categorical_results <-
f_categorical %>%
#Apply function for each type
purrr::map(
typly,
data = data,
types = categorical_types
)
#Other columns
other_results <- NULL
if(!is.null(f_other)) {
other_results <-
f_other %>%
#Apply function excluding each type
purrr::map(
typly,
data = data,
types = c(numeric_types, categorical_types),
negated = TRUE
)
}
#Get column indices
columns <- names(data)
#Build a data frame with the results
list(
all = all_results,
numeric = numeric_results,
categorical = categorical_results,
other = other_results
) %>%
purrr::map_depth(
.depth = 3,
descriptives_component,
na_string = na_string,
.ragged = TRUE
) %>%
#Bind columns
fasten(
into = c("fun_eval", "fun_key", "col_lab")
) %>%
#Join to get column order
dplyr::inner_join(
y =
tibble::tibble(
col_lab = columns,
col_ind = seq_along(columns)
),
by = "col_lab"
) %>%
#Rearrange columns
dplyr::select(
tidyselect::all_of(
c(
"fun_eval",
"fun_key",
"col_ind",
"col_lab"
)
),
tidyselect::everything()
) %>%
#Add consolidated summary column
dplyr::mutate(
val_cbn = dplyr::coalesce(as.character(round(.data$val_dbl, round)), .data$val_chr)
) %>%
#Rearrange rows
dplyr::arrange_at(
dplyr::vars(
tidyselect::all_of(
c(
"fun_eval",
"fun_key",
"col_ind",
"val_ind"
)
)
)
)
}
#Name: univariate_associations
#Description: Evaluate a list of functions to a variable for each response
univariate_associations <-
function(
data,
f,
responses,
predictors
) {
#Check for function
if(missing(f))
stop("No function(s) supplied.")
#Check for response inputs
if(missing(responses)) {
#Check for predictor inputs
if(missing(predictors)) {
#Set both sides to all variables
responses <-
tidyselect::eval_select(
rlang::expr(names(data)),
data
)
predictors <- responses
} else {
#Splice predictor variables
predictors <-
tidyselect::eval_select(
rlang::enquo(predictors),
data
)
#Check for registration
if(length(predictors) == 0)
stop("No predictor variables registered.")
#Response variables are everything else
responses <-
tidyselect::eval_select(
rlang::expr(names(data)[-predictors]),
data
)
}
} else {
#Splice response variables
responses <-
tidyselect::eval_select(
rlang::enquo(responses),
data
)
#Check for registration
if(length(responses) == 0)
stop("No response variables registered.")
#Check for predictor inputs
if(missing(predictors)) {
#Predictor variables are everything else
predictors <-
tidyselect::eval_select(
rlang::expr(names(data)[-responses]),
data
)
} else {
#Splice predictor variables
predictors <-
tidyselect::eval_select(
rlang::enquo(predictors),
data
)
#Check for registration
if(length(predictors) == 0)
stop("No predictor variables registered.")
}
}
#Make f a list if its a single function
if(methods::is(f, "function"))
f <- list(f)
f %>%
#Map each function
purrr::imap(
~
data %>%
#Get list of association statistics
dish(
f = .x,
left = tidyselect::all_of(responses),
right = tidyselect::all_of(predictors)
) %>%
#Convert result to tibble
purrr::map_depth(
.depth = 2,
~
.x %>%
#Make tibble
tibble::enframe(
value = "value"
) %>%
#Remove key
dplyr::select(
-tidyselect::all_of("name")
)
) %>%
#Zip up result
fasten(
into = c("response", "predictor")
) %>%
#Rename value to match function
dplyr::rename_at(
"value",
function(x) .y
)
) %>%
#Iteratively join
purrr::reduce(
dplyr::inner_join,
by =
c(
"response",
"predictor"
)
)
}
#Name: absorb_descriptive_variable
#Description: Absorbs values into a custom text string for a specific variable produced by descriptives
absorb_descriptive_variable <-
function(
variable,
numeric_summary,
categorical_summary,
other_summary,
all_summary,
evaluate
) {
#Vector function evaluation types
eval_types <- unique(variable$fun_eval)
#Set parameters for summarizing
use_groups <- c("col_lab", "val_ind")
if("numeric" %in% eval_types) {
use_summary <- numeric_summary
use_filter <- "numeric"
} else if("categorical" %in% eval_types) {
use_summary <- categorical_summary
use_filter <- "categorical"
use_groups <- c(use_groups, "val_lab")
} else {
use_summary <- other_summary
use_filter <- "other"
#If the 'other' summary contains names, treat it categorically
if(any(!is.na(variable$val_lab))) {
#Set the lower level groups
use_groups <- c(use_groups, "val_lab")
}
}
#Populate strings templates
results <-
variable %>%
#Filter to categorical functions only
dplyr::filter(
.data$fun_eval %in% use_filter
)
#Check if results exist
if(nrow(results) > 0) {
results <-
results %>%
#For each level
dplyr::group_by_at(use_groups) %>%
#Absorb the strings
dplyr::do(
absorb(
key = .data$fun_key,
value = .data$val_cbn,
text = use_summary,
evaluate = evaluate
) %>%
#Make a bindable tibble
as.list() %>%
purrr::map_df(~tibble::tibble(sum_val = as.character(.x)), .id = "sum_lab")
) %>%
dplyr::ungroup()
} else {
results <- NULL
}
#Add all summary if requested
if(!is.null(all_summary)) {
#Filter to subset with all functions
variable_all <-
variable %>%
dplyr::filter(
.data$fun_eval == "all"
)
#Add extra row for categorical variables only
val_lab_all <- 1
if(length(use_groups) == 3)
val_lab_all <- 0
results <-
results %>%
#Bind additional summary to results
dplyr::bind_rows(
absorb(
key = variable_all$fun_key,
value = variable_all$val_cbn,
text = all_summary,
evaluate = evaluate
) %>%
#Make a bindable tibble
as.list() %>%
purrr::map_df(~tibble::tibble(sum_val = as.character(.x)), .id = "sum_lab") %>%
#Put index at zero
tibble::add_column(
col_lab = variable_all$col_lab[1],
val_ind = val_lab_all
)
)
}
results
}
#Name: absorb_descriptives
#Description: Gathers the absorbed descriptives for all variables
absorb_descriptives <-
function(
data,
numeric_summary,
categorical_summary,
other_summary,
all_summary,
evaluate,
...
) {
data %>%
#Get descriptive statistics
descriptives(...) %>%
#Absorb into text strings for each variable
stratiply(
f = absorb_descriptive_variable,
by = .data$col_ind,
numeric_summary = numeric_summary,
categorical_summary = categorical_summary,
other_summary = other_summary,
all_summary = all_summary,
evaluate = evaluate
) %>%
#Bind back together
dplyr::bind_rows(
.id = "col_ind"
) %>%
#Keep index numeric; convert to factor to maintain order of summaries
dplyr::mutate(
col_ind = as.numeric(.data$col_ind),
sum_lab = forcats::as_factor(.data$sum_lab)
) %>%
#Spread over the columns
tidyr::spread(
key = .data$sum_lab,
value = .data$sum_val
)
}
#Name: f_add_n
#Description: Internal function for compute sample size to avoid redundant code
f_add_n <-
function(
result, #Data to add the sample size to
by, #Column to compute sample size for
data #Data to compute sample size on
) {
result %>%
#Join to get sample size
dplyr::inner_join(
y =
data %>%
#Group by column strata
dplyr::group_by_at(by) %>%
#Compute sample size
dplyr::summarise(
col_N = dplyr::n()
) %>%
dplyr::ungroup() %>%
#Convert to character types
dplyr::mutate_at(
dplyr::vars(
tidyselect::all_of(by)
),
as.character
),
by = by
) %>%
#Concatenate sample size to last col_strata
dplyr::mutate_at(
dplyr::vars(
tidyselect::all_of(by[length(by)])
),
~paste0(.x, " (N=", .data$col_N, ")")
) %>%
#Remove sample size column
dplyr::select(
-.data$col_N
)
}
#Name: remove_duplicates
#Description: Internal function to replace duplicates with a specified value
remove_duplicates <-
function(
results,
at,
fill_blanks,
groups = NULL
) {
#Group if non-null
if(!is.null(groups)) {
results <-
results %>%
#Remove duplicate values
dplyr::group_by_at(groups)
}
results %>%
#Scope is the specified set
dplyr::mutate_at(
dplyr::vars(
tidyselect::all_of(at)
),
~
dplyr::case_when(
duplicated(as.character(.x)) ~ fill_blanks,
TRUE ~ as.character(.x)
)
) %>%
dplyr::ungroup()
}
#Name: univariate_table
#Description: Create a custom table with univariate descriptive statistics
univariate_table <-
function(
data,
strata = NULL,
associations = NULL,
numeric_summary = c(Summary = "median (q1, q3)"),
categorical_summary = c(Summary = "count (percent%)"),
other_summary = NULL,
all_summary = NULL,
evaluate = FALSE,
add_n = FALSE,
order = NULL,
labels = NULL,
levels = NULL,
format = c("html", "latex", "markdown", "pandoc", "none"),
variableName = "Variable",
levelName = "Level",
sep = "_",
fill_blanks = "",
caption = NULL,
...
) {
#Set default values
col_strata <- NULL
row_strata <- character(0)
#Extract all stratification variables
strata_vars <- all.vars(strata)
#Summarize entire data set if no stratification columns
if(length(strata_vars) == 0) {
results <-
data %>%
absorb_descriptives(
numeric_summary = numeric_summary,
categorical_summary = categorical_summary,
other_summary = other_summary,
all_summary = all_summary,
evaluate = evaluate,
...
)
#Otherwise summarize within each group
} else {
results <-
data %>%
stratiply(
f = absorb_descriptives,
by = tidyselect::all_of(strata_vars),
numeric_summary = numeric_summary,
categorical_summary = categorical_summary,
other_summary = other_summary,
all_summary = all_summary,
evaluate = evaluate,
...
) %>%
#Bind back together
fasten(
into = strata_vars
)
#Differentiate row/column strata
col_strata <- colnames(attr(terms(strata), "factors"))
row_strata <- setdiff(strata_vars, col_strata)
}
#Compute stratification group sample size if requested
if(add_n) {
#Check for column strata
if(!is.null(col_strata)) {
#Call internal function
results <- f_add_n(results, col_strata, data)
}
#Check for row strata
if(length(row_strata) > 0) {
#Call internal function
results <- f_add_n(results, row_strata, data)
}
}
#Ensure all_summary are the last columns
if(!is.null(all_summary)) {
results <-
results %>%
#Remove then add to end
dplyr::select(
-tidyselect::any_of(names(all_summary)),
tidyselect::any_of(names(all_summary))
)
}
#Span results across columns
if(!is.null(col_strata)) {
results <-
results %>%
#Convert to factor to maintain order
dplyr::mutate_at(
dplyr::vars(
tidyselect::all_of(col_strata)
),
forcats::as_factor
) %>%
#Send all results across by the strata
stretch(
key = tidyselect::all_of(col_strata),
value = -tidyselect::any_of(c(row_strata, col_strata, "col_ind", "col_lab", "val_ind", "val_lab"))
)
}
#Gather association metrics
association_results <- NULL
if(!is.null(associations)) {
#Ensure column strata are available
if(is.null(col_strata)) {
#Give a warning
warning("Association metrics can only be computed with column strata present.")
} else {
#Create secondary dataset
temp_data <-
data %>%
#Combine column strata so they are compared across all groups
tidyr::unite(
col = "col_strata",
tidyselect::all_of(col_strata),
sep = sep
)
#If there are row strata, run association metrics within
if(length(row_strata) > 0) {
#Append the sample size to the row strata variable so joining works
if(add_n) {
#Convert to character to avoid warning
temp_data <-
temp_data %>%
dplyr::mutate_at(
dplyr::vars(
tidyselect::all_of(row_strata)
),
as.character
)
#Call internal function
temp_data <- f_add_n(temp_data, row_strata, temp_data)
}
association_results <-
temp_data %>%
#Evaluate functions on each row strata
stratiply(
f = univariate_associations,
by = tidyselect::all_of(row_strata),
associations,
predictors = tidyselect::all_of("col_strata")
) %>%
#Bind results together
fasten(
into = row_strata
)
} else {
association_results <-
temp_data %>%
univariate_associations(
f = associations,
predictors = tidyselect::all_of("col_strata")
)
}
#Determine index to join on depending on type (0 for categorical, 1 for everything else)
association_results$val_ind <- as.numeric(!(association_results$response %in% unique(results$col_lab[results$val_ind != 1])))
#Extract the col_ind to maintain order
association_results <-
association_results %>%
#Join with distinct labels/indices
dplyr::inner_join(
y =
results %>%
#Get distinct col_ind, col_labs
dplyr::select(
.data$col_ind,
.data$col_lab
) %>%
dplyr::distinct(),
by = c("response" = "col_lab")
) %>%
#Remove predictor column
dplyr::select(
-.data$predictor
)
#Join back to results
results <-
results %>%
#Join on required columns
dplyr::full_join(
y = association_results,
by = c(row_strata, "col_ind", "col_lab" = "response", "val_ind")
)
}
}
#Reassign col_ind if needed
if(!is.null(order)) {
results <-
results %>%
dplyr::mutate(
#Convert to factor and apply releveling
col_lab = forcats::fct_relevel(factor(.data$col_lab), order),
#Get numeric order
col_ind = as.numeric(.data$col_lab),
#Convert back to character
col_lab = as.character(.data$col_lab)
)
}
#Relevel if requested
if(!is.null(levels)) {
results <-
results %>%
#Keep all result rows
dplyr::left_join(
y =
levels %>%
#Convert each vector to a data frame
purrr::map_df(
tibble::enframe,
name = "val_lab",
value = "val_lab_new",
.id = "col_lab"
),
by = c("col_lab", "val_lab")
) %>%
#Replace levels with new levels if present
dplyr::mutate(
val_lab = dplyr::coalesce(.data$val_lab_new, .data$val_lab)
) %>%
#Remove column of new labels
dplyr::select(
-.data$val_lab_new
)
}
#Relabel if requested
if(!is.null(labels)) {
results <-
results %>%
#Keep all result rows
dplyr::left_join(
y =
labels %>%
tibble::enframe(
name = "col_lab",
value = "col_lab_new"
),
by = "col_lab"
) %>%
#Replace labels with new labels if present
dplyr::mutate(
col_lab = dplyr::coalesce(.data$col_lab_new, .data$col_lab)
) %>%
#Remove new labels
dplyr::select(
-.data$col_lab_new
)
}
#Arrange results by indices
results <-
results %>%
dplyr::arrange_at(
dplyr::vars(
tidyselect::all_of(c(row_strata, "col_ind", "val_ind"))
)
) %>%
#Remove index columns
dplyr::select(
-.data$col_ind,
-.data$val_ind
) %>%
#Provide specified column names
dplyr::rename_all(
dplyr::recode,
col_lab = variableName,
val_lab = levelName
) %>%
#Fill in empty values
dplyr::mutate_if(
is.character,
dplyr::coalesce,
fill_blanks
)
#Proceed depending on render format
format <- match.arg(format)
if(format %in% c("html", "latex")) {
#Make sequences up to collapse
collapse_set1 <- seq_len(which(names(results) == variableName))
collapse_set2 <- NULL
if(!is.null(associations) & !is.null(col_strata))
collapse_set2 <- which(names(results) %in% setdiff(names(association_results), c(row_strata, "col_lab", "response")))
#If there are no col strata, make kable
if(is.null(col_strata)) {
results <-
results %>%
knitr::kable(
format = format,
caption = caption
) %>%
kableExtra::kable_styling(
full_width = FALSE
)
} else {
#Make a grable
results <-
results %>%
grable(
at = -tidyselect::any_of(c(row_strata, variableName, levelName, names(association_results))),
sep = sep,
format = format,
caption = caption,
full_width = FALSE
)
}
#Remove duplicate rows
results %>%
kableExtra::collapse_rows(c(collapse_set1, collapse_set2), valign = "top")
} else {
#Remove duplicate association metrics
if(!is.null(associations)) {
results <-
#Call internal function
remove_duplicates(
results = results,
at = setdiff(names(association_results), c(row_strata, "col_lab", "response", "val_ind", "col_ind")),
fill_blanks = fill_blanks,
groups = c(row_strata, variableName)
)
}
#Apply within row strata
if(length(row_strata) > 0) {
results <-
results %>%
#Group by row strata
dplyr::group_by_at(row_strata)
}
#Remove duplicate variable names
results <-
#Call internal function
remove_duplicates(
results = results,
at = variableName,
fill_blanks = fill_blanks
)
#Remove duplicate row strata if needed
if(length(row_strata) > 0) {
#Iteratively group and remove if needed
if(length(row_strata) > 1) {
for(i in rev(seq_len(length(row_strata) - 1))) {
results <-
#Call internal function
remove_duplicates(
results = results,
at = row_strata[i + 1],
fill_blanks = fill_blanks,
groups = row_strata[seq_len(i)]
)
}
}
#Remove duplicate for remaining row strata
results <-
#Call internal function
remove_duplicates(
results = results,
at = row_strata[1],
fill_blanks = fill_blanks
)
}
#Return frame if no format requested
if(format == "none") {
results
} else {
results %>%
knitr::kable(
format = format,
caption = caption
)
}
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.