R/freq_prop_test.R

Defines functions freq_prop_test

Documented in freq_prop_test

#' Frequentist Test Comparisons Between Proportions
#'
#' @description Creates a summarized data frame out of a raw dataframe with an indication on whether
#' differences between proportion results across a subgroup are statistically significant.
#'
#' @param df A raw data frame that includes a subgroup variable, a variable to be tested, and optionally a weight variable.
#' @param x A factor variable name to be tested, in quotes.
#' @param subgroup A factor variable name that subgroups cases, in quotes.
#' @param level Optional. A level of the subgroup factor variable that would be compared to the rest of the sample, in quotes.
#' @param weight Optional. A post-stratification weight variable name, in quotes.
#' @param min_sample A numeric value that removes comparisons if one of the subgroups' sample size is below that value.
#' @param alpha The p-value level for which it is determined whether a difference is statistically significant or not. Default is 0.05.
#' @param p.adjust.method A type of adjustment method, as per the \link[stats]{pairwise.prop.test} function. The default method is "holm".
#' @param nlabels A logical argument. Default is FALSE. If TRUE, the subgroup levels will be changed to include the unweighted sample sizes ("subgroup (n=xx)").
#' @param newline A logical argument. Default is FALSE. If TRUE, the unweighted sample sizes will be shown on a separate line.
#' @note If the level argument is used, the rest of the sample will be called "Rest of Sample". This assumes that you are not comparing a subgroup level that happens to have that name to the rest of the sample!
#'
#' @return A summary data frame that provides the results of the significance tests for each possible comparison, along with group's sample sizes and their proportions.
#'
#' @seealso \link[stats]{pairwise.prop.test}
#'
#' @export
#'
#' @examples
#' #Prepping the data first
#' gss_data1 <- dplyr::filter(gss_data, year == "2016",
#'                                       conlegis %in% c("A GREAT DEAL", "ONLY SOME", "HARDLY ANY"))
#'
#' #With weights
#' freq_prop_test(gss_data1, "conlegis", "region", weight = "wtssall")
#'
#' #Without weights
#' freq_prop_test(gss_data1, "conlegis", "region")
#'
#' #Comparing one level (here the "Pacific" region) to the rest of the sample
#' freq_prop_test(gss_data1, "conlegis", "region", level = "PACIFIC", weight = "wtssall")
#'
#' #Warnings of chi-squares being approximated can be ignored.

freq_prop_test <- function(df, x, subgroup, level = NULL, weight = NULL, min_sample = NULL, alpha = 0.05, p.adjust.method = "holm", nlabels = FALSE, newline = FALSE) {

	#Sanity checks

	if (!base::is.data.frame(df)) {
		stop("First argument must be a data frame. The object provided is of class: ", base::class(df)[1])
	}

	if (!base::is.factor(df[[x]])) {
		stop("Testing variable ", x, " must be a factor vector. The object provided is of class: ", base::class(df[[x]])[1])
	}

	if (base::sum(base::names(df) %in% x) == 0) {
		stop("Testing variable ", x, " not found in data frame ", df, ".")
	}

	if (base::sum(base::names(df) %in% subgroup) == 0) {
		stop("Subgroup variable ", subgroup, " not found in data frame ", df, ".")
	}

	if (!base::is.factor(df[[subgroup]])) {
		stop("Subgroup variable ", subgroup, " must be a factor vector. The object provided is of class: ", base::class(df[[subgroup]])[1])
	}

	if (base::is.null(weight)) {
		df[["weight"]] <- base::rep(1, base::nrow(df))
		vars <- base::c(x, subgroup, "weight")
		weight <- "weight"
	} else {

		if (base::sum(base::names(df) %in% weight) == 0) {
			stop("Weight variable ", weight, " not found in data frame.")
		}

		vars <- base::c(x, subgroup, weight)
	}

	if (!base::is.numeric(df[[weight]])) {
		stop("Weight variable ", weight, " must be a numeric vector. The object provided is of class: ", base::class(df[[weight]])[1])
	}

	df_test <- df[vars]
	df_test <- df_test[stats::complete.cases(df_test), ]

	##new in v0.0.2
	if (!is.logical(nlabels)) {
		stop("Argument ", nlabels, " must be either TRUE or FALSE.")
	}

	if (!base::is.null(level)) {
		if (base::sum(level %in% base::levels(df[[subgroup]])) == 0) {
			stop("Argument ", level, " not found in the levels of factor variable ", subgroup, ".")
		}
		lvls <- base::levels(df_test[[subgroup]])
		lvls[!lvls %in% level] <- "Rest of Sample"
		base::levels(df_test[[subgroup]]) <- lvls
		if(base::nrow(df_test[df_test[[subgroup]] == "Rest of Sample", ]) == 0) { #In case the minimum sample threshold is not met.
			prop_group1 <- base::rep(NA, base::nlevels(df_test[[x]]))
			for (i in base::seq_along(base::levels(df_test[[x]]))) {
				prop_group1[i] <- (base::nrow(df_test[df_test[[x]] == base::levels(df_test[[x]])[i], ]) * df_test[[weight]]) / (base::nrow(df_test) * df_test[[weight]])
				}
			return_merged <- base::data.frame(
				"group1" = base::rep(base::factor(level, levels = c(level, "Rest of Sample")), base::nlevels(df_test[[x]])),
				"group2" = base::rep(base::factor("Rest of Sample", levels = c(level, "Rest of Sample")), base::nlevels(df_test[[x]])),
				"level" = base::levels(df_test[[x]]),
				"p.value" = base::rep(1, base::nlevels(df_test[[x]])),
				"significant" = base::rep(FALSE, base::nlevels(df_test[[x]])),
				"Sample_Size_group1" = base::rep(base::nrow(df_test[df_test[[subgroup]] == level, ]), base::nlevels(df_test[[x]])),
				"Sample_Size_group2" = base::rep(0, base::nlevels(df_test[[x]])),
				"prop_group1" = prop_group1,
				"prop_group2" = base::rep(NA, base::nlevels(df_test[[x]]))
				)
			# if (nlabels == TRUE) { #Do we still need this now that tbl_chart is not taking this function's output as input?
			# 	LEV <- base::levels(result_merged$group1)
			#
			# 	if (newline == TRUE) {
			# 		result_merged$group1 <- base::paste0(result_merged$group1, "\n(n=", base::format(result_merged$Sample_Size_group1, scientific = FALSE, trim = TRUE), ")")
			# 		result_merged$group2 <- base::paste0(result_merged$group2, "\n(n=", base::format(result_merged$Sample_Size_group2, scientific = FALSE, trim = TRUE), ")")
			# 	} else {
			# 		result_merged$group1 <- base::paste0(result_merged$group1, " (n=", base::format(result_merged$Sample_Size_group1, scientific = FALSE, trim = TRUE), ")")
			# 		result_merged$group2 <- base::paste0(result_merged$group2, " (n=", base::format(result_merged$Sample_Size_group2, scientific = FALSE, trim = TRUE), ")")
			# 	}
			#
			# 	New_element <- base::unique(base::c(result_merged$group1, result_merged$group2))
			# 	index <- base::pmatch(LEV, New_element)
			#
			# 	result_merged$group1 <- base::factor(result_merged$group1, levels = New_element[index])
			# 	result_merged$group2 <- base::factor(result_merged$group2, levels = New_element[index])
			# }
			return(return_merged)
			}
	}
	##end of new code

	df_test[[subgroup]] <- base::droplevels(df_test[[subgroup]]) #new in v0.0.1.1

	base::levels(df_test[[subgroup]]) <- base::gsub(x = base::levels(df_test[[subgroup]]), pattern = " ", replacement = "_") #Avoid whitespace and dots that can be an issue with reshape.

	df_dummy <- df_test[x]
	df_dummy_matrix <- stats::model.matrix(~ . - 1, data = df_dummy)  #Applying dummy variables
	df_dummy_matrix <- base::cbind(df_test[[weight]], df_dummy_matrix)
	df_dummy_matrix[, -1] <- df_dummy_matrix[, 1] * df_dummy_matrix[, -1] #Applying weights
	df_dummy_matrix <- df_dummy_matrix[, -1]
	df_test <- base::cbind(df_test, df_dummy_matrix)
	df_test_summ <- stats::aggregate(. ~ df_test[[subgroup]], df_test, base::sum)
	df_test_summ <- df_test_summ[!base::names(df_test_summ) %in% base::c(x, subgroup)]
	base::names(df_test_summ)[1] <- subgroup
	df_test_summ_n <- stats::aggregate(df_test[[x]] ~ df_test[[subgroup]], df_test, base::length)
	base::names(df_test_summ_n)[2] <- "Sample_Size"

	##New in v0.0.3
	if (base::is.null(min_sample)) {
		min_sample <- 1
	}

	to_remove <- df_test_summ_n[base::which(df_test_summ_n$Sample_Size < min_sample), 1]
	df_test_summ_n <- df_test_summ_n[base::which(df_test_summ_n$Sample_Size >= min_sample), ]
	df_test_summ <- df_test_summ[which(!df_test_summ[, 1] %in% to_remove), ]

	if (base::nrow(df_test_summ) < 2) { #In case no comparisons are made, returning a dummy data frame.
		if (base::is.null(level)) {
		return_merged <- base::data.frame(
			"group1" = base::rep(base::levels(df_test[[subgroup]]), base::nlevels(df_test[[x]])),
			"group2" = base::rep(NA, base::nlevels(df_test[[subgroup]]) * base::nlevels(df_test[[x]])),
			"level" = base::rep(base::levels(df_test[[x]]), base::nlevels(df_test[[subgroup]])),
			"p.value" = base::rep(1, base::nlevels(df_test[[subgroup]]) * base::nlevels(df_test[[x]])),
			"significant" = base::rep(FALSE, base::nlevels(df_test[[subgroup]]) * base::nlevels(df_test[[x]])),
			"Sample_Size_group1" = base::rep(NA, base::nlevels(df_test[[subgroup]]) * base::nlevels(df_test[[x]])),
			"Sample_Size_group2" = base::rep(NA, base::nlevels(df_test[[subgroup]]) * base::nlevels(df_test[[x]])),
			"prop_group1" = base::rep(NA, base::nlevels(df_test[[subgroup]]) * base::nlevels(df_test[[x]])),
			"prop_group2" = base::rep(NA, base::nlevels(df_test[[subgroup]]) * base::nlevels(df_test[[x]]))
		)
		} else {
			return_merged <- base::data.frame(
				"group1" = base::rep(base::factor(level, levels = c(level, "Rest of Sample")), base::nlevels(df_test[[x]])),
				"group2" = base::rep(base::factor("Rest of Sample", levels = c(level, "Rest of Sample")), base::nlevels(df_test[[x]])),
				"level" = base::levels(df_test[[x]]),
				"p.value" = base::rep(1, base::nlevels(df_test[[x]])),
				"significant" = base::rep(FALSE, base::nlevels(df_test[[x]])),
				"Sample_Size_group1" = base::rep(NA, base::nlevels(df_test[[x]])),
				"Sample_Size_group2" = base::rep(NA, base::nlevels(df_test[[x]])),
				"prop_group1" = base::rep(NA, base::nlevels(df_test[[x]])),
				"prop_group2" = base::rep(NA, base::nlevels(df_test[[x]]))
			)
		}
		return(return_merged)
	} else {
	##

	for (i in 3:base::ncol(df_test_summ)) {
		df_test_summ[base::ncol(df_test_summ) + 1] <- df_test_summ[i] / df_test_summ[[weight]] #Apply proportions for each level
		base::names(df_test_summ)[base::ncol(df_test_summ)] <- base::paste0(base::names(df_test_summ[i]), "_prop")
	}

	df_test_summ <- base::cbind(df_test_summ, df_test_summ_n[2])
	df_test_summ[[subgroup]] <- base::droplevels(df_test_summ[[subgroup]])

	for (i in base::seq_along(base::levels(df_test[[x]]))) {
		test <- stats::pairwise.prop.test(df_test_summ[[base::paste0(x, base::levels(df_test[[x]])[i])]], df_test_summ[[weight]], p.adjust.method = p.adjust.method)
		test$p.value[base::is.nan(test$p.value)] <- 1 #New in v0.1.1 - in case a test is not run because both proportions are equal to 0, the p-value is set to 1.
		result_test <- base::data.frame(group1 = base::row.names(test$p.value), test$p.value)
		result <- stats::reshape(result_test, varying = base::names(result_test)[2:base::ncol(result_test)], timevar = "group2", v.names = "p.value", direction = "long", times = base::names(result_test)[2:base::ncol(result_test)])
		result <- result[stats::complete.cases(result), ]
		base::row.names(result) <- NULL
		result <- result[!base::names(result) %in% "id"]

		base::levels(result$group1) <- base::gsub(x = base::levels(result$group1), pattern = "_", replacement = " ")
		result$group2 <- base::gsub(x = result$group2, pattern = "_", replacement = " ")
		# base::levels(df_test[[subgroup]]) <- base::gsub(x = base::levels(df_test[[subgroup]]), pattern = "_", replacement = " ") #Do I still need this?
		base::levels(df_test_summ[[subgroup]]) <- base::gsub(x = base::levels(df_test_summ[[subgroup]]), pattern = "_", replacement = " ")

		result$group1 <- base::as.numeric(base::levels(result$group1))[result$group1]
		result$group2 <- base::as.numeric(base::as.factor(result$group2))
		result$group1 <- base::factor(result$group1, levels = 1:base::nlevels(df_test_summ[[subgroup]]), labels = base::levels(df_test_summ[[subgroup]]))
		result$group2 <- base::factor(result$group2, levels = 1:base::nlevels(df_test_summ[[subgroup]]), labels = base::levels(df_test_summ[[subgroup]]))
		result$significant <- base::ifelse(result$p.value < alpha, TRUE, FALSE)
		result$level <- base::levels(df_test[[x]])[i]
		if (i == 1) {
			result_merged <- result
		} else {
			result_merged <- base::rbind(result_merged, result)
		}
	}

	tmp1a <- df_test_summ[, base::c(subgroup, "Sample_Size")]
	tmp1b <- base::cbind(df_test_summ[[subgroup]], df_test_summ[, base::grepl("_prop", base::names(df_test_summ))])
	tmp2a <- tmp1a
	tmp2b <- tmp1b
	base::names(tmp1a)[1] <- "group1"
	base::names(tmp1b)[1] <- "group1"
	base::names(tmp1a)[2] <- "Sample_Size_group1"
	base::names(tmp1b)[2:base::ncol(tmp1b)] <- base::levels(df_test[[x]])
	base::names(tmp2a)[1] <- "group2"
	base::names(tmp2b)[1] <- "group2"
	base::names(tmp2a)[2] <- "Sample_Size_group2"
	base::names(tmp2b)[2:base::ncol(tmp2b)] <- base::levels(df_test[[x]])

	tmp1b <- stats::reshape(tmp1b, varying = base::levels(df_test[[x]]), timevar = "level", idvar = "group1", v.names = "prop_group1", direction = "long", times = base::levels(df_test[[x]]))
	tmp2b <- stats::reshape(tmp2b, varying = base::levels(df_test[[x]]), timevar = "level", idvar = "group2", v.names = "prop_group2", direction = "long", times = base::levels(df_test[[x]]))

	result_merged <- base::merge(result_merged, tmp2a, by = "group2", sort = FALSE)
	result_merged <- base::merge(result_merged, tmp1a, by = "group1", sort = FALSE)

	result_merged <- base::merge(result_merged, tmp1b, by = base::c("group1", "level"), sort = FALSE)
	result_merged <- base::merge(result_merged, tmp2b, by = base::c("group2", "level"), sort = FALSE)
	result_merged$level <- base::factor(result_merged$level, levels = base::levels(df_test[[x]]))
	result_merged <- result_merged[, c("group1", "group2", "level", "p.value", "significant", "Sample_Size_group1", "Sample_Size_group2", "prop_group1", "prop_group2")] #Rearranging order of variables in the output

	##New in v0.0.2
	if (nlabels == TRUE) {
		LEV <- base::levels(result_merged$group1)

		if (newline == TRUE) {
			result_merged$group1 <- base::paste0(result_merged$group1, "\n(n=", base::format(result_merged$Sample_Size_group1, big.mark = ",", scientific = FALSE, trim = TRUE), ")")
			result_merged$group2 <- base::paste0(result_merged$group2, "\n(n=", base::format(result_merged$Sample_Size_group2, big.mark = ",", scientific = FALSE, trim = TRUE), ")")
		} else {
			result_merged$group1 <- base::paste0(result_merged$group1, " (n=", base::format(result_merged$Sample_Size_group1, big.mark = ",", scientific = FALSE, trim = TRUE), ")")
			result_merged$group2 <- base::paste0(result_merged$group2, " (n=", base::format(result_merged$Sample_Size_group2, big.mark = ",", scientific = FALSE, trim = TRUE), ")")
		}

		New_element <- base::unique(base::c(result_merged$group1, result_merged$group2))
		index <- base::pmatch(LEV, New_element)

		result_merged$group1 <- base::factor(result_merged$group1, levels = New_element[index])
		result_merged$group2 <- base::factor(result_merged$group2, levels = New_element[index])
	}
	##

	return(result_merged)
	}
}
philstraforelli/ggsigmark documentation built on May 20, 2019, 1:59 p.m.