R/shuffled_ttest.r

Defines functions shuffled_ttest

Documented in shuffled_ttest

shuffled_ttest.version = "1.20";
#initial program for running the shuffling of t-tests
#Data should be organized with Var 1 in column 1 and Var 2 in column 2

#runs independant sample t-test or paired t test, a number of times, with the data being segmented by minimum number of participants needed to reach significance

#####Parameters--------------

#data_set should refer to a dataset available in the global envir
#shuffle - denotes how many times the data will be shuffled
#alpha - desired alpha value
#paired - optional TRUE/FALSE for Paired Sample t-Test, default is False


#Required packages
packages = c("tictoc");

#use this function to check if each package is on the local machine
#if a package is installed, it will be loaded
#if any are not, the missing package(s) will be installed and loaded
package.check <- lapply(packages, FUN = function(x) {
    if (!require(x, character.only = TRUE)) {
        install.packages(x, dependencies = TRUE)
        library(x, character.only = TRUE);
    }
})
######End Packages



######Shuffled t-Test Function---------------------------
#'
#' This function runs iterative randomized replication t-tests on a dataset based on base-n value
#' @param g1 Group 1
#' @param g2 Group 2
#' @param shuffle number of times to replicate data, defaults to 50
#' @param alpha value of alpha, defaults to .05
#' @param paired Boolean to run a paired-sample t-test, defaults to FALSE
#' @param csvFileName Optional parameter to save results to .csv
#' @keywords shuffle, t-test
#' @export
#' @examples
#' shuffled_ttest(Group1, Group2, shuffle=500, alpha=.05)
#' shuffled_ttest(data$Var1, data$Var2, shuflled=10000, alpha=.01, paired=TRUE)

shuffled_ttest <- function(g1, g2, shuffle=50, alpha = 0.05, paired=FALSE, csvFileName){

######Variable setup-----------------------------------

#convert data to data.frame
data_set_orig <- as.data.frame(cbind(g1, g2), row.names = NULL, optional = FALSE,
              cut.names = FALSE, col.names = names(c(g1,g2)), fix.empty.names = TRUE,
              stringsAsFactors = default.stringsAsFactors());

shuffle <- shuffle;

tic("Run time") #start timer----


###Paired t-Test Check----
#Option for Paired-Sample t-test - Default is FALSE
if (paired == FALSE) {
	paired_test <- FALSE;
	t.method <- "Independant Sample t-Test";
	}
	else {
	paired_test <- TRUE;
	t.method <- "Paired Sample t-Test";}

###CSV Output Check----
#if csvFileName parameter is included, save the variable
if (!missing(csvFileName)) {
	#Appends '.csv' and saves desired file name as variable csvFileName
	csvFileName <- paste(csvFileName,".csv",sep="");
	}


#Create statistical output data frame named "results", with 7 headers, for ind. sampled t-tests, clears old data with each new run
results <- data.frame("iteration" = 0, "sample" = 0,"range" = character(1), "base n" = 0, "t" = double(1),"df" = double(1),"p value" = double(1), stringsAsFactors = FALSE);

sum_ZeroVar <- 0; #counter for no variance comparisons

########End Variable Setup---------------------------------


#Warning for large shuffling amounts
if (shuffle > 50) {
	print("Please wait...");
	}


######Shuffling and replication ------------------------------
	#Shuffles the data a number of times = to shuffle amount, running the replication tests for each iteration
	for (i in 1:shuffle) {

		cycle <- 1; #keep track of replications within shuffles

		x<-1; #Used for lower bounds of current selection - resets on new iteration

		#shuffles data set using 'sample()'
		data_set <- data_set_orig[sample(1:nrow(data_set_orig)),];

		#assigns shuffled data to global env
		assign("shuffled_data", data_set, envir=.GlobalEnv);

		#finds base n for each iteration
		base_n <- find_base_n(data_set, alpha, paired_test);

		#If there are no significant findings, add NA results and continue
		if (base_n == 0) {

		  results[nrow(results) + 1,] <- list(i,cycle,paste(1,':',nrow(data_set), sep=""), NA, NA, NA, NA);
		  next;
		}

		#If the test shows 0 variance, increase ZeroVar counter, ignore the findings: redo current iteration
		if (base_n == -1) {
		  sum_ZeroVar = sum_ZeroVar + 1;
			i = i-1;
			next;

		} else {

		  #y=set to min number of participants needed for each shuffle
		  y <- base_n;
		}

	#Repeats while the current selection of participants is less than the max number of participants - does not run less than base_n number of participants, so there may be missing data at the end
		#TODO - Add option for include/exclude uneven N
	while (y <= nrow(data_set)) {

	#Ind sample t-test---------------------
		if (paired_test == FALSE) {

			#Check that variance is greater than 0 in current selections
			#Use apply to get variance values of rows x:y, in columns 1 and 2
			#Use all to compare variance to 0
			if (all(apply(data_set[x:y,], 2, var) != 0 )){

				#t test on Group 1 and Group 2 using current selection of participants x through y
				ttestresults <- t.test(data_set[x:y,1],data_set[x:y,2]);

				#if the test is signficant, increase count by 1
				if (ttestresults$p.value <= alpha) {
					}

				#add statistical output to new row in results data.frame, rounding down the decimals
				#Organized as [iteration, cycle number, range, t-test statistic, degrees of freedom, p value].
				results[nrow(results) + 1,] <- list(i,cycle,paste(x,':',y, sep=""),base_n,round(ttestresults$statistic,3), round(ttestresults$parameter,4), round(ttestresults$p.value,5));
			}

		#If there is an error, NA for test statistics
		else {
			results[nrow(results) + 1,] <- list(i,cycle,paste(x,':',y, sep=""), base_n, NA, NA, NA);
			sum_ZeroVar <- sum_ZeroVar + 1;
			}
		} #end if


	#Paired sample t-test--------------------
		if (paired_test == TRUE) {
			#Check that variance is greater than 0 in current selections
			#Use apply to get variance values within-subject and between subject - 1=rows, 2=cols
			#Use all to compare variance to 0
			if (#all(apply(data_set[x:y,], 1, var) != 0) &
				#all(apply(data_set[x:y,1], 2, var) != 0) &
				all(apply(data_set[x:y,], 2, var) != 0)){

				#debug variable values when errors occur
				#assign("x",x, envir=.GlobalEnv);
				#assign("y",y, envir=.GlobalEnv);
				assign("base n",base_n, envir=.GlobalEnv);

				#Catch data is constant errors
				tryCatch({
				#t test on Group 1 and Group 2 using current selection of participants x through y
				ttestresults <- t.test(data_set[x:y,1],data_set[x:y,2],paired=TRUE);

				},
				error = function(err) {
					sum_ZeroVar <- sum_ZeroVar + 1;
					ttestresults$p.value <- 9;
				})


				#if the test is signficant, increase count by 1
				if (is.null(ttestresults$p.value) == FALSE &
					ttestresults$p.value <= alpha) {
				}

				#add statistical output to new row in results data.frame, rounding down the decimals
				#Organized as [iteration, cycle number, range, t-test statistic, degrees of freedom, p value].
				results[nrow(results) + 1,] <- list(i,cycle,paste(x,':',y, sep=""),base_n,round(ttestresults$statistic,3), round(ttestresults$parameter,4), round(ttestresults$p.value,5));
			}

			else {
				#Skips t-test and report NA findings when variance = 0
				results[nrow(results) + 1,] <- list(i,cycle,paste(x,':',y, sep=""),base_n, NA, NA, NA);
							sum_ZeroVar <- sum_ZeroVar + 1;
			}
		} #end if

	#Selects new range of participants of length base_n and increase cycle count
	x<-x+base_n;
	y<-y+base_n;
	cycle <- cycle + 1;
	}#end while loop

	end_time=Sys.time(); #End Timer
}#End for loop

#######End shuffling and replication---------------------



#######Export--------------------------------------------
	#Saves results to custom external file if option is include in parameters, if none included in argument, defaults output to 'results.csv'
	if (missing(csvFileName)){
		assign('results',results, envir=.GlobalEnv);
		write.csv(results, file="results.csv", row.names=TRUE);
		}

		#if there IS a name included
		else {

		#writes to a csv file using the variable output_fname
		write.csv(results, file=csvFileName, row.names=TRUE);
		}
#######End Export ------------------------------------------




#######Output-----------------------------------------------
	#shows the results in console if there are less than 50 rows
	if (nrow(results) < 50) {
		show(results);
	}
	toc(); #total time taken

	sum_sig_p <- NROW(results[results$p.value <= alpha & !is.na(results$p.value),]);

	#Summary of results - cat() allows use of \n for line breaks
	cat("\n*",t.method,
	  "*\nGroup 1: ", names(g1), " | Group 2: ", names(g2),
	  "\n\nSignificant findings: \n(p < ", alpha,"): ", sum_sig_p, "/", nrow(results), " (", round((sum_sig_p/nrow(results))*100,2),"%) | Avg: ", round(mean(results[,7], na.rm = TRUE),4),
	  "\nZero Variances: ", sum_ZeroVar, " | Mean Base n: ", round(mean(results[,4], na.rm=TRUE), 2), sep="");
}
baileymh/Shuffle documentation built on Sept. 4, 2019, 8:43 a.m.