R/freq_t_test.R

Defines functions freq_t_test

Documented in freq_t_test

#' Frequentist Test Comparisons Between Means
#'
#' @description Creates a summarized data frame out of a raw dataframe with an indication on whether
#' differences between mean results across a subgroup are statistically significant.
#'
#' @param df A raw data frame that includes a subgroup variable, a variable to be tested, and optionally a weight variable.
#' @param x A numeric variable name to be tested, in quotes.
#' @param subgroup A factor variable name that subgroups cases, in quotes.
#' @param level Optional. A level of the subgroup factor variable that would be compared to the rest of the sample, in quotes.
#' @param weight A post-stratification weight variable name, in quotes.
#' @param min_sample A numeric value that removes comparisons if one of the subgroups' sample size is below that value.
#' @param alpha The p-value level for which it is determined whether a difference is statistically significant or not. Default is 0.05.
#' @param p.adjust.method A type of adjustment method, as per the \link[stats]{pairwise.prop.test} function. The default method is "holm".
#' @param nlabels A logical argument. Default is FALSE. If TRUE, the subgroup levels will be changed to include the unweighted sample sizes ("subgroup (n=xx)").
#' @param newline A logical argument. Default is FALSE. If TRUE, the unweighted sample sizes will be shown on a separate line.
#'
#' @note If the level argument is used, the rest of the sample will be called "Rest of Sample". This assumes that you are not comparing a subgroup level that happens to have that name to the rest of the sample!
#'
#' @return A summary data frame that provides the results of the significance tests for each possible comparison, along with group's sample sizes and their means.
#'
#' @export
#'
#' @examples
#' #Prepping the data first
#' gss_data1 <- dplyr::filter(gss_data, year == "2016", coninc > 0)
#'
#' #With weights
#' freq_t_test(gss_data1, "coninc", "degree", weight = "wtssall")
#'
#' #Without weights
#' freq_t_test(gss_data1, "coninc", "degree")
#'
#' #Comparing one level (here the "Bachelor" level of education) to the rest of the sample
#' freq_t_test(gss_data1, "coninc", "degree", level = "BACHELOR", weight = "wtssall")

freq_t_test <- function(df, x, subgroup, level = NULL, weight = NULL, min_sample = NULL, alpha = 0.05, p.adjust.method = "holm", nlabels = FALSE, newline = FALSE) {

	#Sanity checks

	if (!base::is.data.frame(df)) {
		stop("First argument must be a data frame. The object provided is of class: ", base::class(df)[1])
	}

	if (!base::is.numeric(df[[x]])) {
		stop("Testing variable ", x, " must be a numeric vector. The object provided is of class: ", base::class(df[[x]])[1])
	}

	if (base::sum(base::names(df) %in% x) == 0) {
		stop("Testing variable ", x, " not found in data frame.")
	}

	if (base::sum(base::names(df) %in% subgroup) == 0) {
		stop("Subgroup variable ", subgroup, " not found in data frame.")
	}

	if (!base::is.factor(df[[subgroup]])) {
		stop("Subgroup variable ", subgroup, " must be a factor vector. The object provided is of class: ", base::class(df[[subgroup]])[1])
	}

	if (base::is.null(weight)) {
		df[["weight"]] <- base::rep(1, base::nrow(df))
		vars <- base::c(x, subgroup, "weight")
		weight <- "weight"
	} else {

		if (base::sum(base::names(df) %in% weight) == 0) {
			stop("Weight variable ", weight, " not found in data frame.")
		}

		vars <- base::c(x, subgroup, weight)
	}

	if (!base::is.numeric(df[[weight]])) {
		stop("Weight variable ", weight, " must be a numeric vector. The object provided is of class: ", base::class(df[[weight]])[1])
	}

	df_test <- df[vars]
	df_test <- df_test[stats::complete.cases(df_test), ]

	##new in v0.0.2
	if (!is.logical(nlabels)) {
		stop("Argument ", nlabels, " must be either TRUE or FALSE.")
	}

	if (!base::is.null(level)) {
		if (base::sum(level %in% base::levels(df[[subgroup]])) == 0) {
			stop("Argument ", level, " not found in the levels of factor variable ", subgroup, ".")
		}
		lvls <- base::levels(df_test[[subgroup]])
		lvls[!lvls %in% level] <- "Rest of Sample"
		base::levels(df_test[[subgroup]]) <- lvls
		if(base::nrow(df_test[df_test[[subgroup]] == "Rest of Sample", ]) == 0) { #In rare scenarios that there are no other cases in the sample.
			return_merged <- base::data.frame(
				"group1" = base::factor(level, levels = c(level, "Rest of Sample")),
				"group2" = base::factor("Rest of Sample", levels = c(level, "Rest of Sample")),
				"p.value" = 1,
				"significant" = FALSE,
				"Sample_Size_group1" = base::nrow(df_test[df_test[[subgroup]] == level, ]),
				"Sample_Size_group2" = 0,
				"wtd.mean_group1" = Hmisc::wtd.mean(x = df_test[[x]], weights = df_test[[weight]]),
				"wtd.mean_group2" = NA
				)
			# if (nlabels == TRUE) { #Do we still need this now that tbl_chart is not taking this function's output as input?
			# 	LEV <- base::levels(result_merged$group1)
			#
			# 	if (newline == TRUE) {
			# 		result_merged$group1 <- base::paste0(result_merged$group1, "\n(n=", base::format(result_merged$Sample_Size_group1, scientific = FALSE, trim = TRUE), ")")
			# 		result_merged$group2 <- base::paste0(result_merged$group2, "\n(n=", base::format(result_merged$Sample_Size_group2, scientific = FALSE, trim = TRUE), ")")
			# 	} else {
			# 		result_merged$group1 <- base::paste0(result_merged$group1, " (n=", base::format(result_merged$Sample_Size_group1, scientific = FALSE, trim = TRUE), ")")
			# 		result_merged$group2 <- base::paste0(result_merged$group2, " (n=", base::format(result_merged$Sample_Size_group2, scientific = FALSE, trim = TRUE), ")")
			# 	}
			#
			# 	New_element <- base::unique(base::c(result_merged$group1, result_merged$group2))
			# 	index <- base::pmatch(LEV, New_element)
			#
			# 	result_merged$group1 <- base::factor(result_merged$group1, levels = New_element[index])
			# 	result_merged$group2 <- base::factor(result_merged$group2, levels = New_element[index])
			# }
			return(return_merged)
		}
	}
	##end of new code

	df_test[[subgroup]] <- base::droplevels(df_test[[subgroup]]) #new in v0.0.1.1

	base::levels(df_test[[subgroup]]) <- base::gsub(x = base::levels(df_test[[subgroup]]), pattern = " ", replacement = "_") #Avoid whitespace and dots that can be an issue with reshape.
	df_test_summ <- Hmisc::summarize(base::cbind(df_test[[x]], df_test[[weight]]), Hmisc::llist(df_test[[subgroup]]), function(y) Hmisc::wtd.mean(y[, 1], y[, 2]), stat.name = "wtd.mean")
	base::names(df_test_summ)[1] <- subgroup
	df_test_summ_n <- stats::aggregate(df_test[[x]] ~ df_test[[subgroup]], df_test, base::length)
	base::names(df_test_summ_n)[2] <- "Sample_Size"
	df_test_summ <- base::cbind(df_test_summ, df_test_summ_n[2])
	df_test_summ[[subgroup]] <- base::droplevels(df_test_summ[[subgroup]])

	##New in v0.0.3
	if (base::is.null(min_sample)) {
		min_sample <- 1
	}

		df_test_summ[base::which(df_test_summ$Sample_Size >= min_sample), ]

		if (base::nrow(df_test_summ) < 2) { #In case no comparisons are made, returning a dummy data frame.
			if (base::is.null(level)) {
			return_merged <- base::data.frame(
				"group1" = base::levels(df_test[[subgroup]]),
				"group2" = base::rep(NA, base::nlevels(df_test[[subgroup]])),
				"p.value" = base::rep(1, base::nlevels(df_test[[subgroup]])),
				"significant" = base::rep(FALSE, base::nlevels(df_test[[subgroup]])),
				"Sample_Size_group1" = base::rep(NA, base::nlevels(df_test[[subgroup]])),
				"Sample_Size_group2" = base::rep(NA, base::nlevels(df_test[[subgroup]])),
				"wtd.mean_group1" = base::rep(NA, base::nlevels(df_test[[subgroup]])),
				"wtd.mean_group2" = base::rep(NA, base::nlevels(df_test[[subgroup]]))
			)
			} else {
				return_merged <- base::data.frame(
					"group1" = base::factor(level, levels = c(level, "Rest of Sample")),
					"group2" = base::factor("Rest of Sample", levels = c(level, "Rest of Sample")),
					"p.value" = 1,
					"significant" = FALSE,
					"Sample_Size_group1" = NA,
					"Sample_Size_group2" = NA,
					"wtd.mean_group1" = NA,
					"wtd.mean_group2" = NA
				)
			}
			return(return_merged)
		} else {
	##

	test <- freq_pair_wtd_t_test(df_test[[x]], subgroup = df_test[[subgroup]], weight = df_test[[weight]], p.adjust.method = p.adjust.method)
	test$p.value[base::is.nan(test$p.value)] <- 1 #New in v0.0.1.1.1 - in case a test is not run because both results are exactly the same, the p-value is set to 1.
	result_test <- base::data.frame(group1 = base::row.names(test$p.value), test$p.value)
	#New in v0.0.3.1 fixing a bug with non-letter characters and level names starting with a number getting changed
	lvls <- base::levels(df_test[[subgroup]])
	lvls <- lvls[-base::length(lvls)]
	base::names(result_test) <- base::c(base::names(result_test)[1], lvls)
	##
	result <- stats::reshape(result_test, varying = base::names(result_test)[2:base::ncol(result_test)], timevar = "group2", v.names = "p.value", direction = "long", times = base::names(result_test)[2:base::ncol(result_test)])

	result <- result[stats::complete.cases(result), ]
	base::row.names(result) <- NULL

	base::levels(result$group1) <- base::gsub(x = base::levels(result$group1), pattern = "_", replacement = " ")
	result$group2 <- base::gsub(x = result$group2, pattern = "_", replacement = " ")
	# base::levels(df_test[[subgroup]]) <- base::gsub(x = base::levels(df_test[[subgroup]]), pattern = "_", replacement = " ")
	base::levels(df_test_summ[[subgroup]]) <- base::gsub(x = base::levels(df_test_summ[[subgroup]]), pattern = "_", replacement = " ")

	result <- result[!base::names(result) %in% "id"]

	result[1:2] <- base::lapply(result[1:2], base::factor, base::levels(df_test_summ[[subgroup]]))
	result$significant <- base::ifelse(result$p.value < alpha, TRUE, FALSE)

	tmp <- df_test_summ
	base::names(tmp)[1] <- "group1"
	base::names(tmp)[base::grepl("Sample_Size", base::names(tmp))] <- "Sample_Size_group1"
	base::names(tmp)[base::grepl("wtd.mean", base::names(tmp))] <- "wtd.mean_group1"
	tmp$group2 <- tmp$group1
	tmp$Sample_Size_group2 <- tmp$Sample_Size_group1
	tmp$wtd.mean_group2 <- tmp$wtd.mean_group1

	tmp1 <- tmp[, base::c("group1", "wtd.mean_group1", "Sample_Size_group1")]
	tmp2 <- tmp[, base::c("group2", "wtd.mean_group2", "Sample_Size_group2")]

	result_merged <- base::merge(result, tmp1, by = "group1", sort = FALSE)
	result_merged <- base::merge(result_merged, tmp2, by = "group2", sort = FALSE)
	result_merged <- result_merged[, c("group1", "group2", "p.value", "significant", "Sample_Size_group1", "Sample_Size_group2", "wtd.mean_group1", "wtd.mean_group2")] #Rearranging order of variables in the output

	##New in v0.0.2
	if (nlabels == TRUE) {
		LEV <- base::levels(result_merged$group1)

		if (newline == TRUE) {
			result_merged$group1 <- base::paste0(result_merged$group1, "\n(n=", base::format(result_merged$Sample_Size_group1, big.mark = ",", scientific = FALSE, trim = TRUE), ")")
			result_merged$group2 <- base::paste0(result_merged$group2, "\n(n=", base::format(result_merged$Sample_Size_group2, big.mark = ",", scientific = FALSE, trim = TRUE), ")")
		} else {
			result_merged$group1 <- base::paste0(result_merged$group1, " (n=", base::format(result_merged$Sample_Size_group1, big.mark = ",", scientific = FALSE, trim = TRUE), ")")
			result_merged$group2 <- base::paste0(result_merged$group2, " (n=", base::format(result_merged$Sample_Size_group2, big.mark = ",", scientific = FALSE, trim = TRUE), ")")
		}

		New_element <- base::unique(base::c(result_merged$group1, result_merged$group2))
		index <- base::pmatch(LEV, New_element)

		result_merged$group1 <- base::factor(result_merged$group1, levels = New_element[index])
		result_merged$group2 <- base::factor(result_merged$group2, levels = New_element[index])
	}
	##

	return(result_merged)
		}
}
philstraforelli/ggsigmark documentation built on May 20, 2019, 1:59 p.m.