#' Pasting Together Information for Two Groups
#'
#' Paste together information, often statistics, from two groups. There are two
#' predefined combinations: mean(sd) and median\[min, max\], but user may also
#' paste any single measure together.
#'
#'
#' @param data input dataset. User must use consistent naming throughout,
#' \strong{with an underscore} to separate the group names from the measures
#' (i.e. `Group1_mean` and `Group2_mean`). There also must be two
#' columns with column names that exactly match the input for `first_name`
#' and `second_name` (i.e. 'Group1' and 'Group2'), which are used to form
#' the `Comparison` variable.
#' @param vars_to_paste vector of names of common measures to paste together.
#' Can be the predefined 'median_min_max' or 'mean_sd', or any variable as long
#' as they have matching columns for each group (i.e. Group1_MyMeasure and
#' Group2_MyMeasure). Multiple measures can be requested. Default: "all" will
#' run 'median_min_max' and 'mean_sd', as well as any pairs of columns in the
#' proper format.
#' @param first_name name of first group (string before '_') . Default is 'Group1'.
#' @param second_name name of second group (string before '_'). Default is 'Group2'.
#' @param sep_val value to be pasted between the two measures. Default is ' vs. '.
#' @param na_str_out the character to replace missing values with.
#' @param alternative a character string specifying the alternative hypothesis,
#' must be one of "two.sided" (default), "greater" or "less". Will be used to
#' determine the character to be pasted between the group names
#' (`Comparison` variable). Specifying "two.sided" will use the
#' `sep_val` input.
#' @param digits integer indicating the number of decimal places to round to before pasting for numeric variables. Default is 0.
#' @param trailing_zeros logical indicating if trailing zeros should be included (i.e. 0.100 instead of 0.1). Note if set to TRUE output is a character vector.
#' @param keep_all logical indicating if all remaining, unpasted variables in `data` should be returned with the pasted variables. Default TRUE.
#' @param verbose a logical variable indicating if warnings and messages should be displayed. Default FALSE.
#' @details
#'
#' User must use consistant naming throughout, with a underscore to separate the group names from the measures (i.e. `Group1_mean` and `Group2_mean`). There also must be columns defining the group names (i.e. `Group1` and `Group2`), which are used to form the `Comparison` variable.
#'
#' `alternative` included as a parameter so the direction can easily be seen in one-sided test. If "two.sided" is selected the value to be pasted between the two group names will be set to `sep_val`, where "greater" will use " > " and "less" with use " < " as the pasting value.
#'
#'
#' @return data.frame with all the pasted values requested. Each name will have '_comparison' at the end of the names (i.e. mean_comparison, median_comparison, ...)
#' @examples
#'
#' library(dplyr)
#' library(tidyr)
#' data(exampleData_BAMA)
#'
#' descriptive_stats_by_group <- exampleData_BAMA %>%
#' group_by(visitno,antigen) %>%
#' summarise(
#' Group1 = unique(group[group == 1]), Group2 = unique(group[group == 2]),
#' Group1_n = length(magnitude[group == 1]), Group2_n = length(magnitude[group == 2]),
#' Group1_mean = mean(magnitude[group == 1]), Group2_mean = mean(magnitude[group == 2]),
#' Group1_sd = sd(magnitude[group == 1]), Group2_sd = sd(magnitude[group == 2]),
#' Group1_median = median(magnitude[group == 1]), Group2_median = median(magnitude[group == 2]),
#' Group1_min = min(magnitude[group == 1]), Group2_min = min(magnitude[group == 2]),
#' Group1_max = max(magnitude[group == 1]), Group2_max = max(magnitude[group == 2]),
#' .groups = 'drop'
#' )
#'
#' paste_tbl_grp(data = descriptive_stats_by_group, vars_to_paste = 'all', first_name = 'Group1',
#' second_name = 'Group2', sep_val = " vs. ", digits = 0, keep_all = TRUE)
#'
#' paste_tbl_grp(data = descriptive_stats_by_group, vars_to_paste = c("mean", "median_min_max"),
#' alternative= "less", keep_all = FALSE)
#'
#' paste_tbl_grp(data = descriptive_stats_by_group, vars_to_paste = 'all', first_name = 'Group1',
#' second_name = 'Group2', sep_val = " vs. ",
#' alternative = 'less', digits = 5, keep_all = FALSE)
#'
#'
#' # Same example wit tidyverse in single pipe
#'
#'
#'exampleData_BAMA %>%
#' mutate(group = paste0("Group", group)) %>%
#' group_by(group, visitno, antigen) %>%
#' summarise(N = n(), mean = mean(magnitude), sd = sd(magnitude),
#' median = median(magnitude), min = min(magnitude),
#' max = max(magnitude), q95_fun = quantile(magnitude, 0.95),
#' .groups = 'drop') %>%
#' pivot_longer(-(group:antigen)) %>% # these three chains create a wide dataset
#' unite(temp, group, name) %>%
#' pivot_wider(names_from = temp, values_from = value) %>%
#' mutate(Group1 = "Group 1", Group2 = "Group 2") %>%
#' paste_tbl_grp()
#'
#' @export
paste_tbl_grp <- function(
data, vars_to_paste = 'all', first_name = 'Group1', second_name = 'Group2',
sep_val = " vs. ", na_str_out = "---",
alternative = c("two.sided", "less", "greater"),
digits = 0, trailing_zeros = TRUE, keep_all = TRUE, verbose = FALSE
){
#####Checking variables being used
data_here <- as.data.frame(data)
alternative <- match.arg(alternative)
.check_numeric_input(digits, lower_bound = 0, scalar = TRUE)
if (first_name == second_name) stop('"first_name" and "second_name" must be different')
if (sum(first_name == names(data_here)) != 1)
stop('Expecting one column named "', first_name ,
'" in input dataset, but there are ',
sum(first_name == names(data_here)), ' present')
if (sum(second_name == names(data_here)) != 1)
stop('Expecting one column named "', second_name ,
'" in input dataset, but there are ',
sum(second_name == names(data_here)), ' present')
if (length(vars_to_paste) != 1 & any(vars_to_paste == 'all')) {
vars_to_paste = 'all'
if (verbose)
message('Since "all" specified other entries of vars_to_paste ignored.')
}
# Defining vars when vars_to_paste set to 'all'
if (length(vars_to_paste) == 1 & any(vars_to_paste == 'all')) {
#Need to address if one group name a subset of another
temp_group1_names <- names(data_here)[
(substr(names(data_here), 0, nchar(first_name)) == first_name) &
(nchar(first_name) > nchar(second_name) |
substr(names(data_here), 0, nchar(second_name)) != second_name)
]
temp_group2_names <- names(data_here)[
(substr(names(data_here), 0, nchar(second_name)) == second_name) &
(nchar(second_name) > nchar(first_name) |
substr(names(data_here), 0, nchar(first_name)) != first_name)
]
temp_group1_measures <- gsub(paste0(first_name,'_'), '',
temp_group1_names, fixed = TRUE)
temp_group2_measures <- gsub(paste0(second_name,'_'), '',
temp_group2_names, fixed = TRUE)
vars_to_paste_here <- unique(intersect(temp_group1_measures,
temp_group2_measures))
# Adding special cases
if (sum(vars_to_paste_here %in% c('median','min','max')) == 3)
vars_to_paste_here <- c(vars_to_paste_here, 'median_min_max')
if (sum(vars_to_paste_here %in% c('mean','sd')) == 2)
vars_to_paste_here <- c(vars_to_paste_here, 'mean_sd')
if (verbose) message('The following measures will be combined: ',
paste0(vars_to_paste_here, collapse = ', '))
} else {
vars_to_paste_here <- unique(vars_to_paste)
}
# Giving a message if nothing to return
if (any(vars_to_paste == 'all') & length(vars_to_paste_here) == 0) {
if (verbose) message('"all" specified, but no matching columns to paste')
if (keep_all) return(data) else return(NULL)
}
# Need to define which variables to check. Special considerations for the predefined values
vars_to_check <- vars_to_paste_here[!vars_to_paste_here %in%
c('median_min_max','mean_sd')]
if (any(vars_to_paste_here == 'median_min_max'))
vars_to_check <- unique(c(vars_to_check, 'median', 'min', 'max'))
if (any(vars_to_paste_here == 'mean_sd'))
vars_to_check <- unique(c(vars_to_check, 'mean', 'sd'))
# Need to check the group1 and group2 version of each variable being pasted
group1_vars_to_check <- paste0(first_name, '_', vars_to_check)
group2_vars_to_check <- paste0(second_name, '_', vars_to_check)
for (i in 1:length(vars_to_check)) {
if (sum(group1_vars_to_check[i] == names(data_here)) != 1)
stop('Expecting one column named "', group1_vars_to_check[i] ,
'" in input dataset, but there are ',
sum(group1_vars_to_check[i] == names(data_here)), ' present')
temp_group1_var <- data_here[, group1_vars_to_check[i] == names(data_here)]
if (is.numeric(temp_group1_var) & any(!is.na(temp_group1_var)))
.check_numeric_input(temp_group1_var)
if (sum(group2_vars_to_check[i] == names(data_here)) != 1)
stop('Expecting one column named "', group2_vars_to_check[i] ,
'" in input dataset, but there are ',
sum(group2_vars_to_check[i] == names(data_here)), ' present')
temp_group2_var <- data_here[, group2_vars_to_check[i] == names(data_here)]
if (is.numeric(temp_group2_var) & any(!is.na(temp_group2_var)))
.check_numeric_input(temp_group2_var)
}
##### Pasting variables
# Comparison variable
comparison_sep <- switch(alternative,
two.sided = sep_val,
less = ' < ',
greater = ' > ')
comparison_var <- paste0(data_here[, first_name == names(data_here)],
comparison_sep,
data_here[, second_name == names(data_here)])
# Other variables
pasted_results <- list()
for (i in 1:length(vars_to_paste_here)) {
if (vars_to_paste_here[i] == 'median_min_max') {
pasted_results[[i]] <- paste0(
stat_paste(stat1 = data_here[, paste0(first_name, '_median')],
stat2 = data_here[, paste0(first_name, '_min')],
stat3 = data_here[, paste0(first_name, '_max')],
digits = digits, bound_char = '[', sep = ', ',
na_str_out = na_str_out, trailing_zeros = trailing_zeros
),
sep_val,
stat_paste(stat1 = data_here[, paste0(second_name, '_median')],
stat2 = data_here[, paste0(second_name, '_min')],
stat3 = data_here[, paste0(second_name, '_max')],
digits = digits, bound_char = '[', sep = ', ',
na_str_out = na_str_out, trailing_zeros = trailing_zeros
)
)
} else if (vars_to_paste_here[i] == 'mean_sd') {
pasted_results[[i]] <- paste0(
stat_paste(stat1 = data_here[, paste0(first_name, '_mean')],
stat2 = data_here[, paste0(first_name, '_sd')],
digits = digits, bound_char = '(', na_str_out = na_str_out,
trailing_zeros = trailing_zeros
),
sep_val,
stat_paste(stat1 = data_here[, paste0(second_name, '_mean')],
stat2 = data_here[, paste0(second_name, '_sd')],
digits = digits, bound_char = '(', na_str_out = na_str_out,
trailing_zeros = trailing_zeros
)
)
} else {
first_var_here <- data_here[, paste0(first_name, '_', vars_to_paste_here[i])]
second_var_here <- data_here[, paste0(second_name, '_', vars_to_paste_here[i])]
both_var_here <- c(first_var_here, second_var_here)
# Want to set digits to 0 if an integer
if (is.numeric(both_var_here)) {
if (any((both_var_here %% 1) != 0)) digits_here = digits else digits_here = 0
} else digits_here = digits
pasted_results[[i]] <- paste0(
ifelse(is.na(first_var_here),
na_str_out,
.round_if_numeric(first_var_here, digits = digits_here,
trailing_zeros = trailing_zeros)),
sep_val,
ifelse(is.na(second_var_here), na_str_out,
.round_if_numeric(second_var_here, digits = digits_here,
trailing_zeros = trailing_zeros))
)
}
}
names(pasted_results) <- paste0(vars_to_paste_here, '_comparison')
pasted_results <- data.frame('Comparison' = comparison_var,pasted_results,
stringsAsFactors = FALSE)
# Returning all data if desired
if (keep_all) {
index_to_keep <- !names(data_here) %in% c(first_name,
second_name,
group1_vars_to_check,
group2_vars_to_check)
data.frame(data_here[, index_to_keep, drop = FALSE], pasted_results,
stringsAsFactors = FALSE)
} else {
pasted_results
}
}
#' Rounds and combines up to three numbers into table friendly presentation
#'
#' Takes in up to 3 numeric values, rounds each to a specified digit amount
#' (if numeric), and then combines them accordingly.
#'
#' @param stat1 first statistic to be pasted.
#' @param stat2 second statistic to be pasted (optional).
#' @param stat3 third statistic to be pasted (optional).
#' @param digits positive integer of length 1 between 0 (default) and 14,
#' giving the amount of digits to round to.
#' @param trailing_zeros logical indicating if trailing zeros should included
#' (i.e. 0.100 instead of 0.1). Note is set to TRUE output is a character vector
#' @param bound_char the character to be used between stat1 and stat2/stat3.
#' Available options are '(' (default), '[', '\{', and '|'.
#' @param sep the string to be used between stat2 and stat3. The default is ', '.
#' @param na_str_out the character to replace missing values with.
#' @param suffix a character string to add at the end of each stat
#' (i.e. `%` if doing response rates)
#'
#' @return string of combined values
#'
#' @details
#'
#' One value provided - returns a rounded value or the missing character.
#' Two values - returns stat1 (stat2). e.g., mean (sd)
#' Three values - returns stat1 (stat2, stat3). e.g., estimate (95% lower,
#' 95% upper) or median \[min, max\]
#'
#' Currently the method does work with string variables, but of course rounding
#' would not be relevant for strings.
#'
#'
#' @examples
#'
#' stat_paste(5.109293)
#' stat_paste(NA)
#' stat_paste(5.109293, 2.145)
#' stat_paste(5.109293, 1.7645, 8.0345)
#' stat_paste(NA, NA, NA)
#' stat_paste(5.109, "p < 0.001", digits = 3)
#' stat_paste(c(rep(5,5),NA),c(1:5,NA),c(1,NA,2,NA,3,NA),bound_char = '[')
#'
#' library(dplyr)
#' exampleData_BAMA %>%
#' group_by(antigen, visitno, group) %>%
#' summarise(median_min_max = stat_paste(median(magnitude, na.rm = TRUE),
#' min(magnitude, na.rm = TRUE),
#' max(magnitude, na.rm = TRUE)),
#' .groups = "keep")
#'
#' @export
stat_paste = function(
stat1, stat2 = NULL, stat3 = NULL, digits = 0, trailing_zeros = TRUE,
bound_char = c('(','[','{','|'), sep = ', ',
na_str_out = "---", suffix = NULL
){
bound_char <- match.arg(bound_char)
end_bound_char <- switch(bound_char,
`(` = ')',
`[` = ']',
`{` = '}',
`|` = '|'
)
stat1_pasted_obj <- ifelse(
is.na(stat1), na_str_out,
as.character(.round_if_numeric(stat1, digits = digits,
trailing_zeros = trailing_zeros))
)
if (!is.null(suffix)) stat1_pasted_obj <- paste0(stat1_pasted_obj, suffix)
if (is.null(stat2)) {
pasted_output <- stat1_pasted_obj
} else {
stat2_pasted_obj <- ifelse(
is.na(stat2),
na_str_out,
as.character(.round_if_numeric(stat2, digits = digits,
trailing_zeros = trailing_zeros))
)
if (!is.null(suffix)) stat2_pasted_obj <- paste0(stat2_pasted_obj, suffix)
if (is.null(stat3)) {
pasted_output <- ifelse(
is.na(stat1) & is.na(stat2),
na_str_out,
paste0(stat1_pasted_obj, " ", bound_char, stat2_pasted_obj, end_bound_char)
)
} else {
stat3_pasted_obj <- ifelse(
is.na(stat3),
na_str_out,
as.character(.round_if_numeric(stat3, digits = digits,
trailing_zeros = trailing_zeros))
)
if (!is.null(suffix)) stat3_pasted_obj <- paste0(stat3_pasted_obj, suffix)
pasted_output <- ifelse(
is.na(stat1) & is.na(stat2) & is.na(stat3),
na_str_out,
paste0(stat1_pasted_obj, " ", bound_char, stat2_pasted_obj, sep,
stat3_pasted_obj, end_bound_char))
}
}
pasted_output
}
#' Round and format a vector of p-values
#'
#' pretty_pvalues() takes a vector of p-values, rounds them to a specified
#' digit amount,
#' allows options for emphasizing p-values < the defined significance level,
#' and returns a character for missing.
#'
#' @param pvalues numeric vector of raw p-values to be formatted
#' @param digits number of digits to round to; values with zeros past this
#' number of digits are truncated
#' @param bold TRUE or FALSE: set to TRUE to bold p-values < the defined
#' significance level
#' @param italic TRUE or FALSE: set to TRUE to italicize p-values < the defined
#' significance level
#' @param background highlight color for p-values < the defined significance
#' level. Default = NULL (no highlighting)
#' @param sig_alpha the defined significance level. Default = 0.05
#' @param missing_char character string that will replace missing values from
#' the p-value vector. Default = "---"
#' @param include_p TRUE or FALSE: set to TRUE to print "p = " before each p-value
#' @param trailing_zeros TRUE or FALSE: default = TRUE, p-values are formatted
#' with trailing zeros to the defined number of digits (i.e. 0.100 instead of 0.1 if digits = 3)
#' @param output_type output type, "latex" (default), "html" , "pandoc"
#' (for Word document output), or "no_markup"
#'
#' @return Vector of transformed p-values for table output
#'
#' @details
#'
#' With this function, there are two things to be noted:
#' Since the p-value vector formatting uses `cell_spec`, which generates raw
#' HTML or LaTeX, or uses custom markdown syntax for the pandoc option, make
#' sure you remember to put `escape = FALSE` into your kable code when generating
#' your table. At the same time, you will need to escape special symbols manually.
#' Additionally, `cell_spec` needs a way to know whether you want HTML, LaTeX,
#' or Markdown (pandoc) output. You can specify it locally in the function or
#' globally using `options(knitr.table.format = "latex")`. If you don't provide
#' anything, this function will output as HTML by default.
#'
#' For pandoc markup only bold and italic can be specified. Both can jointly be
#' specified for bold italics.
#'
#' @examples
#'
#' pvalue_example = c(1, 0.06, 0.0005, NA, 1e-6)
#'
#' pretty_pvalues(pvalue_example, background = "pink")
#'
#' pretty_pvalues(pvalue_example, digits = 4, missing_char = "missing",
#' bold = TRUE)
#'
#' # How to use pretty_pvalues in reports
#' raw_pvals <- c(0.00000007, .05000001, NaN, NA, 0.783)
#' pretty_pvals <- pretty_pvalues(raw_pvals , digits = 3, background = "green",
#' italic = TRUE, bold = TRUE)
#' kableExtra::kable(pretty_pvals , format = "latex", escape = FALSE,
#' col.names = c("P-values"))
#'
#' @export
pretty_pvalues = function(
pvalues, digits = 3, bold = FALSE, italic = FALSE, background = NULL,
sig_alpha = 0.05, missing_char = '---', include_p = FALSE,
trailing_zeros = TRUE, output_type = c('latex', 'html', 'pandoc', 'no_markup')
){
output_type <- match.arg(output_type)
.check_numeric_input(pvalues, lower_bound = 0, upper_bound = 1)
.check_numeric_input(sig_alpha, lower_bound = 0, upper_bound = 1, scalar = TRUE)
.check_numeric_input(digits, lower_bound = 1, upper_bound = 14, scalar = TRUE,
whole_num = TRUE)
#Need to set options for no scientific notation, but set back to user preference on exit
op <- options()
options(scipen = 10)
on.exit(options(op))
lower_cutoff = 10^(-digits)
## relevant p-value indices for specific assignments
missing_p = which(is.na(pvalues))
below_cutoff_p = which(pvalues < lower_cutoff)
sig_p = which(pvalues < sig_alpha)
if (trailing_zeros) {
pvalues_new = round_away_0(pvalues, trailing_zeros = TRUE, digits = digits)
} else {
pvalues_new = as.character(round_away_0(pvalues, trailing_zeros = FALSE, digits))
}
## manipulate and assign pvalues as characters to output pvalue vector
pvalues_new[missing_p] = missing_char
pvalues_new[below_cutoff_p] = paste0("<", lower_cutoff)
# the letter 'p' in front of values
if (include_p) pvalues_new <- ifelse(
pvalues_new < lower_cutoff,
paste0('p', pvalues_new),
paste0('p=', pvalues_new))
# formatting
if (output_type %in% c('latex', 'html')) {
pvalues_new[sig_p] =
kableExtra::cell_spec(pvalues_new[sig_p], format = output_type,
bold = bold, italic = italic,
background = background, escape = FALSE)
} else if (output_type == 'pandoc') {
if (bold) pvalues_new[sig_p] = paste0('**', pvalues_new[sig_p], '**')
if (italic) pvalues_new[sig_p] = paste0('*', pvalues_new[sig_p], '*')
}
pvalues_new
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.