Shuffle: What the Package Does (One Line, Title Case)

Documented in pwr.ttest

pwr_ttest.version = "0.2.8";
#Shuffled analysis based on Power

#runs independant sample t-test or paired t test, a number of times, with the data being segmented by
# minimum number of participants needed to reach significant power

#####Parameters--------------

#x Group 1
#y Group 2
#shuffle - denotes how many times the data will be shuffled
#alpha - desired alpha value, defaults = 0.05
#power - desired power level
#paired - optional TRUE/FALSE for Paired Sample t-Test, default is False


#Required packages
packages = c("tictoc", "pwr", "effsize");

#use this function to check if each package is on the local machine
#if a package is installed, it will be loaded
#if any are not, the missing package(s) will be installed and loaded
package.check <- lapply(packages, FUN = function(x) {
    if (!require(x, character.only = TRUE)) {
        install.packages(x, dependencies = TRUE)
        library(x, character.only = TRUE);
    }
})
######End Packages



###### pwr.ttest Function---------------------------
#'
#' This function
#' @param g1 Group 1
#' @param g2 Group 2
#' @param shuffle number of times to replicate data, defaults to 50
#' @param alpha value of alpha, defaults to .05
#' @param power value of power criteria, defaults to .85
#' @param paired Boolean to run a paired-sample t-test, defaults to FALSE
#' @param csvFileName Optional parameter to save results to .csv
#' @keywords shuffle, t-test, power
#' @export
#' @examples
#' shuffled_ttest(g1, g2, 500, alpha=.05, power=.9)
#' shuffled_ttest(group1, group2, 10000, alpha=.01, paired=TRUE, paired_test1)

pwr.ttest <- function(g1, g2, shuffle=50, alpha=.05, power=0.85, paired=FALSE, csvFileName){

######Variable setup-----------------------------------

#convert g1 and g2 to data.frame
data_set <- as.data.frame(cbind(g1,g2), row.names = NULL, optional = FALSE,
              cut.names = FALSE, col.names = c("group1", "group2"), fix.empty.names = TRUE,
              stringsAsFactors = default.stringsAsFactors());

shuffle <- shuffle;

tic("Run time") #start timer----


###Paired t-Test Check----
#Option for Paired-Sample t-test - Default is FALSE
if (paired == FALSE) {
	paired_test <- FALSE;
	t.method <- "Independant Sample t-Test";
	}
	else {
	paired_test <- TRUE;
	t.method <- "Paired Sample t-Test";}


###CSV Output Check----
#if csvFileName parameter is included, save the variable
if (!missing(csvFileName)) {
	#Appends '.csv' and saves desired file name as variable csvFileName
	csvFileName <- paste(csvFileName,".csv",sep="");
	}


#Create statistical output data frame named "results", with 8 headers, for t-test, clears old data with each new run
results <- data.frame("iteration" = numeric(0), "sample" = numeric(0),"range" = character(0), "base n" = numeric(0),
                      "t" = double(0), "df" = double(0), "power" = double(0), "p value" = double(0), stringsAsFactors = FALSE);

sum_ZeroVar <- 0; #counter for no variance comparisons

########End Variable Setup---------------------------------


# Warning for large shuffling amounts
if (shuffle > 50) {
	print("Please wait...");
	}


###### Shuffling and replication ------------------------------
#Shuffles the data a number of times = to shuffle amount, running the replication tests for each iteration
	for (i in 1:shuffle) {

		cycle <- 1; #keep track of replications within shuffles

		x<-1; #Used for lower bounds of current selection - resets on new iteration

		#shuffles data set using 'sample()'
		data_set <- data_set[sample(1:nrow(data_set)),];

		#assigns shuffled data to global env for debugging
		assign("shuffled_data",data_set, envir=.GlobalEnv);

		#finds base n for each iteration
		base_n <- pwr.base_n(data_set[,1], data_set[,2], alpha, power, paired_test);


### Handle criteria not met and 0 variance----------------

		#If there are no significant findings, stop add NA results and continue
		if (base_n$participants == 0) {

		  results[nrow(results) + 1,] <- list(i,cycle,paste(1,':',nrow(data_set), sep=""), NA, NA, NA, NA, NA);
		  next;
		}

		#If the test shows 0 variance, increase NA counter, ignore the findings: redo current iteration
		if (base_n$participants == -1) {
		  sum_ZeroVar = sum_ZeroVar + 1;
			i = i-1;
			next;

		} else {

		  #y=set to min number of participants needed for each shuffle
		  y <- base_n$participants;
		}


	#Repeats while the current selection of participants is less than the max number of participants -
		#does not run less than base_n number of participants, so there may be missing data at the end
		#TODO - Add option for include/exclude uneven N
	while (y <= nrow(data_set)) {

    #### Ind sample t-test---------------------
		if (paired_test == FALSE) {

			#Check that variance is greater than 0 in current selections
			#Use apply to get variance values of rows x:y, in columns 1 and 2
			#Use all to compare variance to 0
			if (all(apply(data_set[x:y,], 2, var) != 0 )){

				#t test on Group 1 and Group 2 using current selection of participants x through y
				ttestresults <- t.test(data_set[x:y,1],data_set[x:y,2]);

				#calculate effect size
				d_value <- cohen.d(data_set[x:y,1],data_set[x:y,2])$estimate;

				#calculate power
				pwr_value <- pwr.t.test(n = base_n$participants, d_value, sig.level = alpha, power=NULL, type="two.sample")$power;

				#add statistical output to new row in results data.frame, rounding down the decimals
				#Organized as [iteration, cycle number, range, t-test statistic, degrees of freedom, p value].
				results[nrow(results) + 1,] <- list(i, cycle,paste(x,':',y, sep=""), base_n$participants, round(ttestresults$statistic,5),
				                                    round(ttestresults$parameter,5), round(pwr_value,5), round(ttestresults$p.value,5));
			}

		#If there is 0 variance, increase NA count and retry
		else {
			results[nrow(results) + 1,] <- list(i,cycle,paste(x,':',y, sep=""), NA, NA, NA, NA, NA);
			sum_ZeroVar <- sum_ZeroVar + 1;
			}
		} #end if


    #### Paired sample t-test--------------------
		if (paired_test == TRUE) {
			#Check that variance is greater than 0 in current selections
			#Use apply to get variance values within-subject and between subject - 1=rows, 2=cols
			#Use all to compare variance to 0
			if (all(apply(data_set[x:y,], 2, var) != 0)){

				##debug variable values when errors occur
				#assign("x",x, envir=.GlobalEnv);
				#assign("y",y, envir=.GlobalEnv);
				#assign("base-n",base_n$participants, envir=.GlobalEnv);

				#Catch data is constant errors
				tryCatch({
				  #t test on Group 1 and Group 2 using current selection of participants x through y
				  ttestresults <- t.test(data_set[x:y,1], data_set[x:y,2], paired=TRUE);
				  },

				error = function(err) {
					sum_ZeroVar <- sum_ZeroVar + 1;
					ttestresults$p.value <- 9;
				})

			  #calculate effect size
			  d_value <- cohen.d(data_set[x:y,1], data_set[x:y,2])$estimate;

			  #calculate power
			  pwr_value <- pwr.t.test(n = base_n$participants, d_value, sig.level=alpha, power=NULL, type="paired")$power;


				#add statistical output to new row in results data.frame, rounding down the decimals
				#Organized as [iteration, cycle number, range, t-test statistic, degrees of freedom, power, p value].
				results[nrow(results) + 1,] <- list(i,cycle,paste(x,':',y, sep=""), base_n$participants, round(ttestresults$statistic,3),
				                                round(ttestresults$parameter,4), round(pwr_value,4), round(ttestresults$p.value,5));
			}

			else {
				#Skips t-test and report NA findings when variance = 0
				results[nrow(results) + 1,] <- list(i,cycle,paste(x,':',y, sep=""),NA, NA, NA, NA, NA);
							sum_ZeroVar <- sum_ZeroVar + 1;
			}
		} #end if

	#Selects new range of participants of length base_n and increase cycle count
	x<-x+base_n$participants;
	y<-y+base_n$participants;
	cycle <- cycle + 1;
	}#end while loop

	end_time=Sys.time(); #End Timer
}#End for loop

#######End shuffling and replication




####### Export to CSV--------------------------------------------
	#Saves results to custom external file if option is include in parameters, if none included in argument, defaults output to 'results.csv'
	if (missing(csvFileName)){
		assign('results',results, envir=.GlobalEnv);
		write.csv(results, file="results.csv", row.names=TRUE);
		}

		#if there IS a name included
		else {

		#writes to a csv file using the variable output_fname
		write.csv(results, file=csvFileName, row.names=TRUE);
		}
####### End Export




####### Output Results-----------------------------------------------
	#shows the results in console if there are less than 50 rows
	if (nrow(results) < 50) {
		show(results);
	}
	toc(); #total time taken

	sum_sig_p <- NROW(results[results$p.value <= alpha & !is.na(results$p.value),]);
	sum_sig_power <- NROW(results[results$power >= power & !is.na(results$power),]);


	#Summary of results - cat() allows use of \n for line breaks
	cat("\n*",t.method,
	  "*\nGroup 1: ", names(g1), " | Group 2: ", names(g2),
	  "\n\nSignificant findings: \n(p < ",alpha,"): ", sum_sig_p, "/", nrow(results), " (", round((sum_sig_p/nrow(results))*100,2),"%) | Avg: ", round(mean(results[,8], na.rm = TRUE),4),
	  "\n(power >= ", power, "): ", sum_sig_power, "/", nrow(results), " (",round((sum_sig_power/nrow(results))*100,2),"%) | Avg: ", round(mean(results[,7], na.rm = TRUE),4),
	  "\nZero Variances: ", sum_ZeroVar, " | Mean Base n: ", round(mean(results[,4], na.rm=TRUE), 2), sep="");
}
###### End Output