#' Frequentist Test Comparisons Between Means
#'
#' @description Creates a summarized data frame out of a raw dataframe with an indication on whether
#' differences between mean results across a subgroup are statistically significant.
#'
#' @param df A raw data frame that includes a subgroup variable, a variable to be tested, and optionally a weight variable.
#' @param x A numeric variable name to be tested, in quotes.
#' @param subgroup A factor variable name that subgroups cases, in quotes.
#' @param level Optional. A level of the subgroup factor variable that would be compared to the rest of the sample, in quotes.
#' @param weight A post-stratification weight variable name, in quotes.
#' @param min_sample A numeric value that removes comparisons if one of the subgroups' sample size is below that value.
#' @param alpha The p-value level for which it is determined whether a difference is statistically significant or not. Default is 0.05.
#' @param p.adjust.method A type of adjustment method, as per the \link[stats]{pairwise.prop.test} function. The default method is "holm".
#' @param nlabels A logical argument. Default is FALSE. If TRUE, the subgroup levels will be changed to include the unweighted sample sizes ("subgroup (n=xx)").
#' @param newline A logical argument. Default is FALSE. If TRUE, the unweighted sample sizes will be shown on a separate line.
#'
#' @note If the level argument is used, the rest of the sample will be called "Rest of Sample". This assumes that you are not comparing a subgroup level that happens to have that name to the rest of the sample!
#'
#' @return A summary data frame that provides the results of the significance tests for each possible comparison, along with group's sample sizes and their means.
#'
#' @export
#'
#' @examples
#' #Prepping the data first
#' gss_data1 <- dplyr::filter(gss_data, year == "2016", coninc > 0)
#'
#' #With weights
#' freq_t_test(gss_data1, "coninc", "degree", weight = "wtssall")
#'
#' #Without weights
#' freq_t_test(gss_data1, "coninc", "degree")
#'
#' #Comparing one level (here the "Bachelor" level of education) to the rest of the sample
#' freq_t_test(gss_data1, "coninc", "degree", level = "BACHELOR", weight = "wtssall")
freq_t_test <- function(df, x, subgroup, level = NULL, weight = NULL, min_sample = NULL, alpha = 0.05, p.adjust.method = "holm", nlabels = FALSE, newline = FALSE) {
#Sanity checks
if (!base::is.data.frame(df)) {
stop("First argument must be a data frame. The object provided is of class: ", base::class(df)[1])
}
if (!base::is.numeric(df[[x]])) {
stop("Testing variable ", x, " must be a numeric vector. The object provided is of class: ", base::class(df[[x]])[1])
}
if (base::sum(base::names(df) %in% x) == 0) {
stop("Testing variable ", x, " not found in data frame.")
}
if (base::sum(base::names(df) %in% subgroup) == 0) {
stop("Subgroup variable ", subgroup, " not found in data frame.")
}
if (!base::is.factor(df[[subgroup]])) {
stop("Subgroup variable ", subgroup, " must be a factor vector. The object provided is of class: ", base::class(df[[subgroup]])[1])
}
if (base::is.null(weight)) {
df[["weight"]] <- base::rep(1, base::nrow(df))
vars <- base::c(x, subgroup, "weight")
weight <- "weight"
} else {
if (base::sum(base::names(df) %in% weight) == 0) {
stop("Weight variable ", weight, " not found in data frame.")
}
vars <- base::c(x, subgroup, weight)
}
if (!base::is.numeric(df[[weight]])) {
stop("Weight variable ", weight, " must be a numeric vector. The object provided is of class: ", base::class(df[[weight]])[1])
}
df_test <- df[vars]
df_test <- df_test[stats::complete.cases(df_test), ]
##new in v0.0.2
if (!is.logical(nlabels)) {
stop("Argument ", nlabels, " must be either TRUE or FALSE.")
}
if (!base::is.null(level)) {
if (base::sum(level %in% base::levels(df[[subgroup]])) == 0) {
stop("Argument ", level, " not found in the levels of factor variable ", subgroup, ".")
}
lvls <- base::levels(df_test[[subgroup]])
lvls[!lvls %in% level] <- "Rest of Sample"
base::levels(df_test[[subgroup]]) <- lvls
if(base::nrow(df_test[df_test[[subgroup]] == "Rest of Sample", ]) == 0) { #In rare scenarios that there are no other cases in the sample.
return_merged <- base::data.frame(
"group1" = base::factor(level, levels = c(level, "Rest of Sample")),
"group2" = base::factor("Rest of Sample", levels = c(level, "Rest of Sample")),
"p.value" = 1,
"significant" = FALSE,
"Sample_Size_group1" = base::nrow(df_test[df_test[[subgroup]] == level, ]),
"Sample_Size_group2" = 0,
"wtd.mean_group1" = Hmisc::wtd.mean(x = df_test[[x]], weights = df_test[[weight]]),
"wtd.mean_group2" = NA
)
# if (nlabels == TRUE) { #Do we still need this now that tbl_chart is not taking this function's output as input?
# LEV <- base::levels(result_merged$group1)
#
# if (newline == TRUE) {
# result_merged$group1 <- base::paste0(result_merged$group1, "\n(n=", base::format(result_merged$Sample_Size_group1, scientific = FALSE, trim = TRUE), ")")
# result_merged$group2 <- base::paste0(result_merged$group2, "\n(n=", base::format(result_merged$Sample_Size_group2, scientific = FALSE, trim = TRUE), ")")
# } else {
# result_merged$group1 <- base::paste0(result_merged$group1, " (n=", base::format(result_merged$Sample_Size_group1, scientific = FALSE, trim = TRUE), ")")
# result_merged$group2 <- base::paste0(result_merged$group2, " (n=", base::format(result_merged$Sample_Size_group2, scientific = FALSE, trim = TRUE), ")")
# }
#
# New_element <- base::unique(base::c(result_merged$group1, result_merged$group2))
# index <- base::pmatch(LEV, New_element)
#
# result_merged$group1 <- base::factor(result_merged$group1, levels = New_element[index])
# result_merged$group2 <- base::factor(result_merged$group2, levels = New_element[index])
# }
return(return_merged)
}
}
##end of new code
df_test[[subgroup]] <- base::droplevels(df_test[[subgroup]]) #new in v0.0.1.1
base::levels(df_test[[subgroup]]) <- base::gsub(x = base::levels(df_test[[subgroup]]), pattern = " ", replacement = "_") #Avoid whitespace and dots that can be an issue with reshape.
df_test_summ <- Hmisc::summarize(base::cbind(df_test[[x]], df_test[[weight]]), Hmisc::llist(df_test[[subgroup]]), function(y) Hmisc::wtd.mean(y[, 1], y[, 2]), stat.name = "wtd.mean")
base::names(df_test_summ)[1] <- subgroup
df_test_summ_n <- stats::aggregate(df_test[[x]] ~ df_test[[subgroup]], df_test, base::length)
base::names(df_test_summ_n)[2] <- "Sample_Size"
df_test_summ <- base::cbind(df_test_summ, df_test_summ_n[2])
df_test_summ[[subgroup]] <- base::droplevels(df_test_summ[[subgroup]])
##New in v0.0.3
if (base::is.null(min_sample)) {
min_sample <- 1
}
df_test_summ[base::which(df_test_summ$Sample_Size >= min_sample), ]
if (base::nrow(df_test_summ) < 2) { #In case no comparisons are made, returning a dummy data frame.
if (base::is.null(level)) {
return_merged <- base::data.frame(
"group1" = base::levels(df_test[[subgroup]]),
"group2" = base::rep(NA, base::nlevels(df_test[[subgroup]])),
"p.value" = base::rep(1, base::nlevels(df_test[[subgroup]])),
"significant" = base::rep(FALSE, base::nlevels(df_test[[subgroup]])),
"Sample_Size_group1" = base::rep(NA, base::nlevels(df_test[[subgroup]])),
"Sample_Size_group2" = base::rep(NA, base::nlevels(df_test[[subgroup]])),
"wtd.mean_group1" = base::rep(NA, base::nlevels(df_test[[subgroup]])),
"wtd.mean_group2" = base::rep(NA, base::nlevels(df_test[[subgroup]]))
)
} else {
return_merged <- base::data.frame(
"group1" = base::factor(level, levels = c(level, "Rest of Sample")),
"group2" = base::factor("Rest of Sample", levels = c(level, "Rest of Sample")),
"p.value" = 1,
"significant" = FALSE,
"Sample_Size_group1" = NA,
"Sample_Size_group2" = NA,
"wtd.mean_group1" = NA,
"wtd.mean_group2" = NA
)
}
return(return_merged)
} else {
##
test <- freq_pair_wtd_t_test(df_test[[x]], subgroup = df_test[[subgroup]], weight = df_test[[weight]], p.adjust.method = p.adjust.method)
test$p.value[base::is.nan(test$p.value)] <- 1 #New in v0.0.1.1.1 - in case a test is not run because both results are exactly the same, the p-value is set to 1.
result_test <- base::data.frame(group1 = base::row.names(test$p.value), test$p.value)
#New in v0.0.3.1 fixing a bug with non-letter characters and level names starting with a number getting changed
lvls <- base::levels(df_test[[subgroup]])
lvls <- lvls[-base::length(lvls)]
base::names(result_test) <- base::c(base::names(result_test)[1], lvls)
##
result <- stats::reshape(result_test, varying = base::names(result_test)[2:base::ncol(result_test)], timevar = "group2", v.names = "p.value", direction = "long", times = base::names(result_test)[2:base::ncol(result_test)])
result <- result[stats::complete.cases(result), ]
base::row.names(result) <- NULL
base::levels(result$group1) <- base::gsub(x = base::levels(result$group1), pattern = "_", replacement = " ")
result$group2 <- base::gsub(x = result$group2, pattern = "_", replacement = " ")
# base::levels(df_test[[subgroup]]) <- base::gsub(x = base::levels(df_test[[subgroup]]), pattern = "_", replacement = " ")
base::levels(df_test_summ[[subgroup]]) <- base::gsub(x = base::levels(df_test_summ[[subgroup]]), pattern = "_", replacement = " ")
result <- result[!base::names(result) %in% "id"]
result[1:2] <- base::lapply(result[1:2], base::factor, base::levels(df_test_summ[[subgroup]]))
result$significant <- base::ifelse(result$p.value < alpha, TRUE, FALSE)
tmp <- df_test_summ
base::names(tmp)[1] <- "group1"
base::names(tmp)[base::grepl("Sample_Size", base::names(tmp))] <- "Sample_Size_group1"
base::names(tmp)[base::grepl("wtd.mean", base::names(tmp))] <- "wtd.mean_group1"
tmp$group2 <- tmp$group1
tmp$Sample_Size_group2 <- tmp$Sample_Size_group1
tmp$wtd.mean_group2 <- tmp$wtd.mean_group1
tmp1 <- tmp[, base::c("group1", "wtd.mean_group1", "Sample_Size_group1")]
tmp2 <- tmp[, base::c("group2", "wtd.mean_group2", "Sample_Size_group2")]
result_merged <- base::merge(result, tmp1, by = "group1", sort = FALSE)
result_merged <- base::merge(result_merged, tmp2, by = "group2", sort = FALSE)
result_merged <- result_merged[, c("group1", "group2", "p.value", "significant", "Sample_Size_group1", "Sample_Size_group2", "wtd.mean_group1", "wtd.mean_group2")] #Rearranging order of variables in the output
##New in v0.0.2
if (nlabels == TRUE) {
LEV <- base::levels(result_merged$group1)
if (newline == TRUE) {
result_merged$group1 <- base::paste0(result_merged$group1, "\n(n=", base::format(result_merged$Sample_Size_group1, big.mark = ",", scientific = FALSE, trim = TRUE), ")")
result_merged$group2 <- base::paste0(result_merged$group2, "\n(n=", base::format(result_merged$Sample_Size_group2, big.mark = ",", scientific = FALSE, trim = TRUE), ")")
} else {
result_merged$group1 <- base::paste0(result_merged$group1, " (n=", base::format(result_merged$Sample_Size_group1, big.mark = ",", scientific = FALSE, trim = TRUE), ")")
result_merged$group2 <- base::paste0(result_merged$group2, " (n=", base::format(result_merged$Sample_Size_group2, big.mark = ",", scientific = FALSE, trim = TRUE), ")")
}
New_element <- base::unique(base::c(result_merged$group1, result_merged$group2))
index <- base::pmatch(LEV, New_element)
result_merged$group1 <- base::factor(result_merged$group1, levels = New_element[index])
result_merged$group2 <- base::factor(result_merged$group2, levels = New_element[index])
}
##
return(result_merged)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.