R/ccrepe.utilities.R


calculate_q_values <- function(CA)
#**********************************************************************
#	Calculate Q values    				                              *
#**********************************************************************
{
	q.values.calc.matrix <- CA$p.values   #Set the default p.values  for one and two dataset cases
	if  (CA$OneDataset == TRUE)	
		{q.values.calc.matrix[lower.tri(q.values.calc.matrix,diag=TRUE)] <- NA}    #If One dataset,  use only the upper part of the matrix

	QValuesArranged <- calculate_q_values_vector(q.values.calc.matrix,CA)

	CA$q.values<-QValuesArranged
        if (CA$OneDataset){
          CA$q.values[lower.tri(CA$q.values)] <- t(CA$q.values)[lower.tri(CA$q.values)]
        }

	return(CA)
}

calculate_q_values_vector <- function(p.values.vector,CA){

	p.values <- p.values.vector
        missing_idx <- which(is.na(p.values))
        if (length(missing_idx)>0){ 
          non.missing.p.values <- p.values[-missing_idx]
          m = length(non.missing.p.values)                                #m is the total number of p-values
	  ln_m = log(m)									#log m
	  ln_m_Plus_Gamma = ln_m + CA$Gamma
	  q.values <- p.values
	  q.values[-missing_idx] = non.missing.p.values * m * ln_m_Plus_Gamma / rank(non.missing.p.values)
        } else {
	  m = length(p.values)          
          ln_m = log(m)
          ln_m_Plus_Gamma = ln_m + CA$Gamma
          q.values <- p.values * m * ln_m_Plus_Gamma / rank(p.values)
        }

	return(q.values)
}
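# Illustrative sketch (editorial example, not part of the package): the formula above is a
# Benjamini-Yekutieli-style correction, q_i = p_i * m * (ln(m) + Gamma) / rank(p_i), where
# Gamma is the Euler-Mascheroni constant set in DecodeInputCAArguments and m counts the
# non-missing p-values.  Guarded with if (FALSE) so it never runs when this file is sourced.
if (FALSE) {
	p     <- c(0.001, 0.04, 0.20, NA, 0.75)          # toy p-values, one missing
	Gamma <- 0.5772156649                            # Euler-Mascheroni constant
	ok    <- !is.na(p)
	m     <- sum(ok)                                 # number of non-missing p-values
	q     <- p
	q[ok] <- p[ok] * m * (log(m) + Gamma) / rank(p[ok])
	q                                                # matches calculate_q_values_vector(p, CA) when CA$Gamma = Gamma
}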


ccrepe_norm <-
function(data){
#*************************************************************************************
#* A function to normalize a data matrix across the rows                             *
#*************************************************************************************
	if(0 %in% rowSums(data)){
	     ErrMsg = paste0( 
	     	    "All-zero subjects generated by permutation: ",
		    toString(which(rowSums(data)==0)),
		    ".  These will be replaced by all zeros after normalization"
		    )
	     warning(ErrMsg)
	}
	data.normalized <- data/rowSums(data)		#Normalize
	data.normalized[rowSums(data)==0,] <- 0 #Replace all zero-rows (which become NA after normalization) with zeros.
  	return(data.normalized)
}
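# Illustrative sketch (editorial example, not part of the package): ccrepe_norm converts
# counts to relative abundances by dividing each row by its sum; an all-zero row triggers
# the warning above and is left as zeros.  Guarded with if (FALSE) so it never runs.
if (FALSE) {
	counts <- matrix(c(2, 3, 5,
	                   0, 0, 0,
	                   1, 1, 2), nrow = 3, byrow = TRUE)
	ccrepe_norm(counts)   # rows sum to 1, except the all-zero second row
}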





ccrepe_process_one_dataset <-
function(data,n.iter, CA){
#*************************************************************************************
#* 	Function to apply the new method to a dataset                                    *
#*  As a note, data needs to be a matrix with no missing values                      *
#*************************************************************************************
	n.features = ncol(data)					# Number of columns, starting at 1; this is also the number of bugs
	
	CA$p.values <-matrix(data=NA,nrow=n.features,ncol=n.features)	#Build the empty PValues matrix
	CA$z.stat <-matrix(data=NA,nrow=n.features,ncol=n.features)	#Build the empty z.stat matrix
	
	CA$sim.score <-matrix(data=NA,nrow=n.features,ncol=n.features)	#Build the empty correlation matrix
	
	nsubj = nrow(data)				# Number of subjects

        # The subset of columns for which to calculate the similarity scores
        col.subset <- 1:n.features
        if( length(CA$filtered.subset.cols.1) > 0 && CA$compare.within.x )               #If the User entered a subset of columns
                    {
                    col.subset <- CA$filtered.subset.cols.1
                    } else if(length(CA$filtered.subset.cols.2) > 0 && !CA$compare.within.x)
                    {
                        col.subset <- unique(c(col.subset,CA$filtered.subset.cols.2))
                    }

        # Remove features with too many zeros from the subsets of interest
        num.feature.zeros <- apply(data,2,function(col) sum(col==0))
        CalcThresholdForError = ((CA$errthresh)^(1/nsubj))*nsubj		#Threshold on the number of zeros allowed per feature, derived from errthresh
        zero_features <- which(num.feature.zeros > CalcThresholdForError)
        if(length(zero_features)>0){
            ErrMsg = paste0(
                "Feature(s) ",
                toString(colnames(data)[intersect(zero_features,col.subset)]),
                " have more zeros than the threshold of ",
                trunc(CalcThresholdForError),
                " zeros.  Excluding from output (will still be used in normalizing.)"
                )
            warning(ErrMsg)
            col.subset <- setdiff(col.subset,zero_features)
        }

	# The matrix of possible bootstrap rows (which when multiplied by data give a specific row) 
	#  of the form with all 0s except for one 1 in each row
	possible.rows = diag(rep(1,nsubj)) 

	#Generating the bootstrap and permutation matrices
	bootstrap.matrices = list()    # The list of matrices of possible bootstrap rows 
								   #(each matrix multiplies by the data to give a resampled dataset)
	permutation.matrices = list()  # The list of permutation matrices; each matrix
								   # has columns which are permutations of the row indices
	
	

	#**************************************************************
	#*  Pre allocate                                              *
	#**************************************************************
	permutation.matrices  = vector("list", n.iter)		#Pre allocate the permutation matrices
	bootstrap.matrices = vector("list", n.iter)		#Pre allocate

	
	for(i in seq_len(n.iter)){
		if (i %% CA$iterations.gap == 0)   #If output is verbose and the iteration number 
											#is a multiple of the iterations gap - print status
			{
			print.msg = paste('Sampled ',i,' permutation and bootstrap datasets')
			log.processing.progress(CA,print.msg)  #Log progress
			}
	
   
		# Get the rows of the possible.rows matrix; these correspond to the rows
		# which will be included in the resampled dataset

		boot.rowids = sample(seq(1,nsubj),nsubj,replace=TRUE)

		# Generate the bootstrap matrix by getting the appropriate rows from the possible bootstrap rows
		boot.matrix = possible.rows[boot.rowids,] 

		# Add the bootstrap matrix to the list                   
		bootstrap.matrices [[ i ]] = boot.matrix   #Add bootstrap matrix 

		perm.matrix = replicate(n.features,sample(seq(1,nsubj),nsubj,replace=FALSE))  # Each column of the matrix is a permutation of the row indices
 
		permutation.matrices [[i]] = perm.matrix		#Populate the permutation matrix
	}

	# The bootstrapped data; resample the data using each bootstrap matrix

	
 	log.processing.progress(CA,"Building boot data")  #Log progress
	boot.data = lapply(bootstrap.matrices,resample,data=data)
	

	###  Using bootstrap.method.calcn  for the one dataset also
	
	log.processing.progress(CA,paste("Getting similarity score for bootstrap data: col.subset =",toString(col.subset)))  #Log progress
	boot.cor  = lapply(boot.data,bootstrap.method.calcn,nsubj,data,CA,col.subset )		 ###Checks whether any column is all zeros, then applies the similarity measure

	log.processing.progress(CA,"Generating permutation data")  #Log progress
	# Generating the permutation data; permute the data using each permutation matrix
	permutation.data = lapply(permutation.matrices,permute,data=data)
	# The correlation matrices for the bootstrapped data; calculate the correlation for each resampled dataset

	# The normalized permutation data; the permutation data needs to be renormalized, but not the bootstrap data
	permutation.norm.raw = lapply(permutation.data,ccrepe_norm)
        permutation.norm     = lapply(permutation.norm.raw,function(mat) mat[,col.subset])

	# The correlation matrices of the permuted data; calculate the correlation for each permuted dataset
	
	CA$verbose.requested = FALSE			#If the User requested verbose output - turn it off temporarily
	if (CA$verbose == TRUE)
		{
			CA$verbose.requested <- TRUE		#Remember that verbose output was requested
			CA$verbose <- FALSE					#But pass non-verbose to CA$method (could be nc.score or anything else)	
		}
	
	log.processing.progress(CA,paste("Calculating permutation similarity score: col.subset =",toString(col.subset)))  #Log progress
	permutation.cor <- do.call(lapply,c(list(permutation.norm,CA$method), CA$method.args))  #Invoke the measuring function
	 
	# Now, actually calculating the correlation p-values within the dataset
	log.processing.progress(CA,"Calculating p-values")  #Log progress

	
 
	loop.range1 <- which( col.subset %in% 1:n.features)						#Establish looping range default
	loop.range2 <- which( col.subset %in% 1:n.features )						#Establish looping range default
	max.comparisons <- choose(length(col.subset),2)					#The default number of comparisons

	if( length(CA$filtered.subset.cols.1) > 0 )		#If the User entered a subset of columns
	    	{
		loop.range1 <- which(col.subset %in% CA$filtered.subset.cols.1)		#Use that subset
		

		if( CA$compare.within.x )               #If comparing only within subset.cols.x
		    	{
			loop.range2 <- which(col.subset %in% CA$filtered.subset.cols.1)			#Use subset.cols.x for the inner loop as well

			} else if( length(CA$subset.cols.2)>0 ){	   #If comparing between x and y and the user input subset.cols.y

			loop.range2 <- which(col.subset %in% CA$filtered.subset.cols.2) #Use subset.cols.y for the inner loop

			} 
		}

	if( !is.na(CA$concurrent.output) || CA$make.output.table )
	    {
	    output.table = data.frame(i=rep(NA,max.comparisons),
                                      k=rep(NA,max.comparisons),
                                      feature1=rep(NA,max.comparisons), 
	    		   	      feature2=rep(NA,max.comparisons), 
				      sim.score=rep(NA,max.comparisons), 
				      z.stat=rep(NA,max.comparisons),
				      p.value=rep(NA,max.comparisons), 
				      q.value=rep(NA,max.comparisons))

	    } else {
	    output.table = NULL
	    }

	if ( !is.na(CA$concurrent.output) )						#If user requested to print the output
	   {
	   cat(colnames(output.table),sep="\t",file=CA$concurrentFile,append=TRUE)
	   cat("\n",file=CA$concurrentFile,append=TRUE)
	   }
        if    (!is.na(CA$outdist))                                              #If user requested to print the distributions
            {
                output.string0 <- paste0("iter_",seq_len(length(boot.cor)),collapse=",")
                output.string  <- paste0("dist_type,i,k,feature1,feature2,",output.string0,'\n', sep='',collapse=",")
                
                cat(output.string,file=CA$outdistFile,append=TRUE)
            }

	internal.loop.counter = 0		#Initialize the loop counter
	outer.loop.indices.completed = c()	#Initialize the list to keep track of already completed outer indices

	for( index1 in seq_len(length(loop.range1)) )
	{
		i = loop.range1[index1]
		outer.loop.indices.completed = c(outer.loop.indices.completed,i)	# Keep track of which outer loop indices have been accounted for
		inner.loop.range = setdiff(loop.range2,outer.loop.indices.completed)	# Use as the inner loop only those inner loop indices which haven't been in the outer loop
                for(index2 in seq_len(length(inner.loop.range)))
                    {
                        k = inner.loop.range[index2]
                                        # Get a vector of the (i,k)th element of each correlation matrix in the list of bootstrapped data; this is the bootstrap distribution

                        internal.loop.counter = internal.loop.counter + 1  #Increment the loop counter
                        if (internal.loop.counter %% CA$iterations.gap == 0)   #If output is verbose and the comparison count is a multiple of the iterations gap - print status
                            {
                                print.msg = paste('Completed ', internal.loop.counter, ' comparisons')
                                log.processing.progress(CA,print.msg)  #Log progress
                            }

                        bootstrap.dist = unlist(lapply(boot.cor,'[',i,k))

                                        # Get a vector of the (i,k)th element of each correlation matrix in the list of permuted data; this is the permuted distribution
                        permutation.dist = unlist(lapply(permutation.cor,'[',i,k))


                        if    (!is.na(CA$outdist))						#If user requested to print the distributions
                            {
                                RC <- print.dist(bootstrap.dist,permutation.dist,CA,col.subset[i],col.subset[k])
                            }
                        measure.parameter.list <- append(list(x=data[,col.subset[i]],y=data[,col.subset[k]]), CA$sim.score.parameters)  #build the method do.call parameter list
                        sim.score <- do.call(CA$method,measure.parameter.list)	#Invoke the measuring function
						
 
						z.stat_and_p.value.list  <- calculate.z.stat.and.p.value(bootstrap.dist,permutation.dist )  #Invoke the function that calculates them 
																		#Same calculations - just packaged as a function
						z.stat <- z.stat_and_p.value.list$z.stat		#Retrieve the value from the list
						p.value <- z.stat_and_p.value.list$p.value	    #Retrieve from the list
						
 
						
                        CA$z.stat[col.subset[i],col.subset[k]] = z.stat					#Post z.stat in output matrix
                        CA$z.stat[col.subset[k],col.subset[i]] = z.stat					#Post z.stat in output matrix
                        CA$p.values[col.subset[i],col.subset[k]] = p.value				#Post it in the p-values matrix
                        CA$p.values[col.subset[k],col.subset[i]] = p.value				#Post it in the p-values matrix
                        CA$sim.score[col.subset[i],col.subset[k]] = sim.score			#Post it in the cor matrix
                        CA$sim.score[col.subset[k],col.subset[i]] = sim.score			#Post it in the cor matrix


                        if( !is.na(CA$concurrent.output) || CA$make.output.table )
                            {
                                concurrent.vector <- c(col.subset[i],col.subset[k],colnames(data)[col.subset[i]],colnames(data)[col.subset[k]],sim.score,z.stat,p.value,NA)
                                output.table[internal.loop.counter,] = concurrent.vector
                            }

                        if ( !is.na(CA$concurrent.output) )						#If user requested to print the output
                            {
                                cat(concurrent.vector,sep="\t",file=CA$concurrentFile,append=TRUE)
                                cat("\n",file=CA$concurrentFile,append=TRUE)
                            }
                    }
            }
	

	CA <- calculate_q_values(CA)						#Calculate the QValues
	if( CA$make.output.table )
	    {
	    output.table$sim.score <- as.numeric(output.table$sim.score)
	    output.table$z.stat    <- as.numeric(output.table$z.stat)
	    output.table$p.value   <- as.numeric(output.table$p.value)
	    table.q.values <- calculate_q_values_vector(output.table$p.value,CA)   #The function returns the q-value vector directly
	    output.table$q.value <- table.q.values
	    }
 
	CA$q.values[lower.tri(CA$q.values)] = t(CA$q.values)[lower.tri(t(CA$q.values))]  #Making the q.values matrix symmetrical for the one dataset case
	

	if( !is.na(CA$concurrent.output) || CA$make.output.table )
	    {
	    CA$output.table <- output.table[1:internal.loop.counter,]		#Post it in the common Area
	    } else 
	    {
	    CA$output.table <- NULL
	    }


		
	#********************************************************************
	#*  Final Edits before exiting                                      *
	#********************************************************************
	diag(CA$q.values) <- NA											#Set diagonal of q.values to NA 
	colnames(CA$p.values)<-colnames(CA$data1.norm)						#Set the names of the columns in the p.values matrix
	rownames(CA$p.values)<-colnames(CA$data1.norm)						#Set the names of the rows in the p.values matrix
	colnames(CA$q.values)<-colnames(CA$data1.norm)						#Set the names of the columns in the q.values matrix
	rownames(CA$q.values)<-colnames(CA$data1.norm)						#Set the names of the rows in the q.values matrix
	colnames(CA$sim.score)<-colnames(CA$data1.norm)							#Set the names of the columns in the sim.score matrix
	rownames(CA$sim.score)<-colnames(CA$data1.norm)							#Set the names of the rows in the sim.score matrix
	colnames(CA$z.stat) <- colnames(CA$data1.norm)						#Set the names of the cols in the z.stat matrix
	rownames(CA$z.stat) <- colnames(CA$data1.norm)						#Set the names of the rows in the z.stat matrix
	diag(CA$p.values) <- NA											#Set diagonal of p.values to NA
	diag(CA$z.stat)   <- NA                                          	#Set diagonal of z.stat
	######################CA$sim.score <- NULL                        #Not needed to be NUll

	if (length(CA$filtered.subset.cols.1) > 0)				#If used a subset - present only the subset
		{
		CA$p.values <- CA$p.values[col.subset,col.subset]   #Display only the subset of cols and rows
		CA$q.values <- CA$q.values[col.subset,col.subset]   #Display only the subset of cols and rows
		CA$sim.score <- CA$sim.score[col.subset,col.subset]   #Display only the subset of cols and rows
		CA$z.stat    <- CA$z.stat[col.subset,col.subset]
		}
        CA <- clean_common_area_after_processing(CA)	#Clean the Common Area before returning to the User
        

		

	return(CA)														# Return the output matrix
}
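# Illustrative sketch (editorial example, not part of the package): the core of the loop
# above, stripped of the CA bookkeeping.  For one pair of features it builds a bootstrap
# distribution of the similarity score (resampling subjects with replacement) and a
# permutation null distribution (shuffling each feature independently, then renormalizing),
# and compares them with calculate.z.stat.and.p.value.  Pearson correlation stands in for
# CA$method here and the data are hypothetical.  Guarded with if (FALSE) so it never runs.
if (FALSE) {
	set.seed(1)
	n      <- 50
	raw    <- matrix(rpois(n * 4, lambda = 20), nrow = n)   # toy counts, 4 features
	rel    <- raw / rowSums(raw)                            # relative abundances
	n.iter <- 200

	boot.dist <- replicate(n.iter, {
		rows <- sample(n, n, replace = TRUE)                # bootstrap the subjects
		cor(rel[rows, 1], rel[rows, 2])
	})
	perm.dist <- replicate(n.iter, {
		shuffled <- apply(raw, 2, sample)                   # permute each feature independently
		shuffled <- shuffled / rowSums(shuffled)            # renormalize the permuted data
		cor(shuffled[, 1], shuffled[, 2])
	})
	calculate.z.stat.and.p.value(boot.dist, perm.dist)      # list with z.stat and p.value
}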


ccrepe_process_two_datasets <- function(data1.norm,data2.norm,n.iter, CA)
#*************************************************************************************
#* 	ccrepe function for two datasets                                                 *
#*************************************************************************************
{
	# Get number of bugs, subjects for each dataset
	n1.features = ncol(data1.norm)
	n2.features = ncol(data2.norm)

	log.processing.progress(CA,"Two datasets: Initial merge of the matrices")  #Log progress
	data = merge_two_matrices(data1.norm,data2.norm)
	data1 <- data[,1:n1.features]
	data2 <- data[,(n1.features+1):(n1.features+n2.features)]
	nsubj = nrow(data)


        # Subset of columns for the bootstrapped dataset
        col.subset <- 1:(n1.features+n2.features)
        n1.subset <- n1.features
	n2.subset <- n2.features
	if( length(CA$filtered.subset.cols.1) > 0 )		#If the User entered a subset of columns
	    	{
		col.subset <- CA$filtered.subset.cols.1
                n1.subset  <- length(CA$filtered.subset.cols.1)

                if(length(CA$subset.cols.2) > 0)
                       {
                       col.subset <- c(col.subset,n1.features+CA$filtered.subset.cols.2)
		       n2.subset <- length(CA$filtered.subset.cols.2)
                       }
                }
        
        # Remove features with too many zeros from the subsets of interest
        num.feature.zeros <- apply(data,2,function(col) sum(col==0))
        CalcThresholdForError = ((CA$errthresh)^(1/nsubj))*nsubj		#Threshold on the number of zeros allowed per feature, derived from errthresh
        zero_features <- which(num.feature.zeros > CalcThresholdForError)
        if(length(zero_features)>0){
            zero_features_data1 <- zero_features[which(zero_features <= n1.features)]
            zero_features_data2 <- zero_features[which(zero_features > n1.features)]
            ErrMsg = paste0(
                "Feature(s) ",
                toString(colnames(data)[intersect(zero_features_data1,col.subset)]),
                " in dataset 1 and feature(s) ",
                toString(colnames(data)[intersect(zero_features_data2,col.subset)]),
                " in dataset 2 have more zeros than the threshold of ",
                trunc(CalcThresholdForError),
                " zeros.  Excluding from output (will still be used in normalizing.)"
                )
            warning(ErrMsg)
            col.subset <- setdiff(col.subset,zero_features)
            n1.subset <- sum(col.subset <= n1.features)
            n2.subset <- sum(col.subset > n1.features)
        }
        # Subset of columns for the permutation datasets
        


	# The matrix of possible bootstrap rows (which when multiplied by data give a specific row); of the form with all 0s except for one 1 in each row
	possible.rows = diag(rep(1,nsubj))

	#Generating the bootstrap and permutation matrices
	bootstrap.matrices   = list()  # The list of matrices of possible bootstrap rows (each matrix multiplies by the data to give a resampled dataset)
	permutation.matrices1 = list()  # The list of permutation matrices; each matrix has columns which are permutations of the row indices
	permutation.matrices2 = list()  # The list of permutation matrices; each matrix has columns which are permutations of the row indices


	#*****************************************************
	#*  Pre Allocate Matrices                            *
	#*****************************************************
	bootstrap.matrices = vector("list", n.iter)		
	permutation.matrices1 = vector("list", n.iter)	
	permutation.matrices2 = vector("list", n.iter)	

	for(i in seq_len(n.iter)){

		if (i %% CA$iterations.gap == 0)   #If output is verbose and the iteration number is a multiple of the iterations gap - print status
			{
			print.msg = paste('Sampled ',i,' permutation and bootstrap datasets')
			log.processing.progress(CA,print.msg)  #Log progress
			}
		
 
		# Get the rows of the two possible.rows matrices; these correspond to the rows which will be included in the resampled datasets
		boot.rowids        = sample(seq(1,nsubj),nsubj,replace=TRUE)

		# Generate the bootstrap matrices by getting the appropriate rows from the possible bootstrap rows
		boot.matrix        = possible.rows[boot.rowids,]

		# Add the bootstrap matrices to the appropriate lists  
			
		bootstrap.matrices[[ i ]] = boot.matrix # Add the bootstrap matrix to the list
		perm.matrix1 = replicate(n1.features,sample(seq(1,nsubj),nsubj,replace=FALSE)) # Each column of the matrix is a permutation of the row indices

		permutation.matrices1 [[ i ]] = perm.matrix1      # Add the new matrix to the list

		perm.matrix2 = replicate(n2.features,sample(seq(1,nsubj),nsubj,replace=FALSE)) # Each column of the matrix is a permutation of the row indices
		permutation.matrices2 [[ i ]] = perm.matrix2     # Add the new matrix to the list

	}

	# The bootstrapped data; resample the data using each bootstrap matrix
	
	log.processing.progress(CA,"Generating boot data")  #Log progress
	boot.data = lapply(bootstrap.matrices,resample,data=data)
 


	log.processing.progress(CA,"Generating permutation data")  #Log progress
	# Generating the permutation data; permute the data using each permutation matrix
	permutation.data1 = lapply(permutation.matrices1,permute,data=data1)
	permutation.data2 = lapply(permutation.matrices2,permute,data=data2)

	# The normalized permutation data; the permutation data needs to be renormalized, but not the bootstrap data
	permutation.norm1 = lapply(permutation.data1,ccrepe_norm)
	permutation.norm2 = lapply(permutation.data2,ccrepe_norm)


	# The correlation matrices of the permuted and bootstrapped data; see extractCor function for more details
	# mapply is a function that applies over two lists, applying extractCor to the first element of each, then the
	# second element of each, etc.
 
 
	log.processing.progress(CA,paste("Calculating permutation similarity scores: col.subset =",toString(col.subset)))  #Log progress
	CA$verbose.requested = FALSE			#If the User requested verbose output - turn it off temporarily
	if (CA$verbose == TRUE)
		{
			CA$verbose.requested <- TRUE		#Remember that verbose output was requested
			CA$verbose <- FALSE					#But pass non-verbose to CA$method (could be nc.score or anything else)	
		}

	permutation.cor = mapply(extractCor,
		            mat1=permutation.norm1,
		            mat2=permutation.norm2,
		            startrow=1,
		            endrow=n1.subset,
		            startcol=(n1.subset+1),
		            endcol=length(col.subset), 				 
                	MoreArgs = list(col.subset = col.subset,
					my.method=CA$method,
					method.args=CA$method.args,
					outdist=CA$outdist,
					outdistFile=CA$outdistFile),
		            SIMPLIFY=FALSE)
		

	#******************************************************************************** 
	#  Reboot times = RT = -round(log2(CA$errthresh))                               *
	#  This is the maximum number of attempts to re-draw a bootstrap matrix when    *
	#  one of its columns is all zeros.                                             *
	#                                                                               *
	#  For each matrix, check whether any column is all zeros.                      *
	#  If so, re-draw the bootstrap sample at most RT times, until a matrix with    *
	#  no all-zero column is obtained.                                              *
	#  The limit of RT tries is hard coded and needs to be re-evaluated.            *
	#  If after RT tries there is still an all-zero column, a warning is issued     *
	#  and the last resample is used anyway.                                        *
	#********************************************************************************
	
	log.processing.progress(CA,paste("Calculating boot similarity scores: col.subset  =",toString(col.subset)))  #Log progress
	boot.cor  = lapply(boot.data,bootstrap.method.calcn,nsubj,data,CA,col.subset )		 ###Checks whether any column is all zeros, then applies the similarity measure
	
	# Now calculating the correlations and p-values between the two datasets
	log.processing.progress(CA,"Calculating p-values")  #Log progress

	
	
	CA$p.values <-matrix(data=NA,nrow=n1.features,ncol=n2.features)	#Build the empty p.values matrix
	CA$z.stat  <-matrix(data=NA,nrow=n1.features,ncol=n2.features)		#Build the empty z.stat matrix
 
	CA$sim.score <-matrix(data=NA,nrow=n1.features,ncol=n2.features)	#Build the empty correlation matrix
	
	loop.range1 <- which(col.subset %in% 1:n1.features)						#Establish looping range default

	if ( length(CA$filtered.subset.cols.1)> 0 )		#If the User entered a subset of columns
		{
		loop.range1 <- which(col.subset %in% CA$filtered.subset.cols.1)		#Use the subset of columns
		}

	loop.range2 <- which( col.subset %in% (n1.features + 1:n2.features)) - n1.subset						#Establish looping range default
	
	if ( length(CA$filtered.subset.cols.2) > 0 )		#If the User entered a subset of columns
		{
		loop.range2 <- which(col.subset %in% (n1.features + CA$filtered.subset.cols.2)) - n1.subset		#Use the subset of columns
		}

	max.comparisons <- n1.features*n2.features
	if( !is.na(CA$concurrent.output) || CA$make.output.table )
	    {
	    output.table = data.frame(i=rep(NA,max.comparisons),
                                      k=rep(NA,max.comparisons),
                                      feature1=rep(NA,max.comparisons), 
		       		      feature2=rep(NA,max.comparisons), 
				      sim.score=rep(NA,max.comparisons), 
				      z.stat=rep(NA,max.comparisons),
				      p.value=rep(NA,max.comparisons), 
				      q.value=rep(NA,max.comparisons))
	    }

	if ( !is.na(CA$concurrent.output) )						#If user requested to print the output
	   {
	   cat(colnames(output.table),sep="\t",file=CA$concurrentFile,append=TRUE)
	   cat("\n",file=CA$concurrentFile,append=TRUE)
	   }
        if    (!is.na(CA$outdist))                                              #If user requested to print the distributions
            {
                output.string0 <- paste0("iter_",seq_len(length(boot.cor)),collapse=",")
                output.string  <- paste0("dist_type,i,k,feature1,feature2,",output.string0,'\n', sep='',collapse=",")

                cat(output.string,file=CA$outdistFile,append=TRUE)
            }


	internal.loop.counter = 0   # Initialize the counter
 
	
	for(index1 in seq_len(length(loop.range1)))					#Modified seq_along to seq_len: Need to review....
	{
		i = loop.range1[index1]
		for(index2 in seq_len(length(loop.range2)))				#Modified seq_along to seq_len: Need to review....
		{
			k = loop.range2[index2]
			# Get a vector of the (i,k)th element of each correlation matrix in the list of bootstrapped data; this is the bootstrap distribution

				internal.loop.counter = internal.loop.counter + 1  #Increment the loop counter
				if (internal.loop.counter %% CA$iterations.gap == 0)   #If output is verbose and the comparison count is a multiple 
																	   #of the iterations gap - print status
				{
					print.msg = paste('Completed ', internal.loop.counter, ' comparisons')
					log.processing.progress(CA,print.msg)  #Log progress
				}
				
			bootstrap.dist = unlist(lapply(boot.cor,'[',i,n1.subset+k))
			bootstrap.dist[is.na(bootstrap.dist)] <- 0				#If there is an NA in bootstrap.dist - replace with 0 (Needs 
																	#review)
			# Get a vector of the (i,k)th element of each correlation matrix in the list of permuted data; this is the permuted distribution
			permutation.dist = unlist(lapply(permutation.cor,'[',i,k))
			
			if    (!is.na(CA$outdist))						#If user requested to print the distributions
					{
					RC <- print.dist(bootstrap.dist,permutation.dist,CA,col.subset[i],col.subset[n1.subset+k] - n1.features)
					}
                        measure.parameter.list <- append(list(x=data[,col.subset[i]],y=data[,col.subset[n1.subset+k]]), CA$sim.score.parameters)  #build the method
                                        #do.call parameter list
                        sim.score.value <- do.call(CA$method,measure.parameter.list)	#Invoke the measuring function  <-  Was cor
					
						z.stat_and_p.value.list  <- calculate.z.stat.and.p.value(bootstrap.dist,permutation.dist )  #Invoke the function that calculates them 
																		#Same calculations - just packaged as a function
						z.stat <- z.stat_and_p.value.list$z.stat		#Retrieve the value from the list
						p.value <- z.stat_and_p.value.list$p.value	    #Retrieve from the list
						
						
						
								  
			CA$p.values[col.subset[i],col.subset[n1.subset+k]-n1.features] = p.value				#Post it in the p.values matrix  
			CA$z.stat[col.subset[i],col.subset[n1.subset+k]-n1.features] = z.stat					#Post it in the z.stat matrix 
 
			CA$sim.score[col.subset[i],col.subset[n1.subset+k]-n1.features] = sim.score.value			    #Post it in the cor matrix   <--Was cor

			if( !is.na(CA$concurrent.output) || CA$make.output.table )
			    {
			    concurrent.vector <- c(col.subset[i],col.subset[n1.subset+k]-n1.features,colnames(data1)[col.subset[i]],colnames(data2)[col.subset[n1.subset+k]-n1.features],sim.score.value,z.stat,p.value,NA)   #Use the dataset-2 column index for k, matching the feature2 name
			    output.table[internal.loop.counter,] = concurrent.vector
			    }

			if ( !is.na(CA$concurrent.output) )						#If user requested to print the output
			   {
		   	   cat(concurrent.vector,sep="\t",file=CA$concurrentFile,append=TRUE)
			   cat("\n",file=CA$concurrentFile,append=TRUE)
			   }
		}
	}
	


	CA <- calculate_q_values(CA)						#Calculate the QValues

	if( CA$make.output.table )
	    {
	    output.table$sim.score <- as.numeric(output.table$sim.score)
	    output.table$z.stat    <- as.numeric(output.table$z.stat)
	    output.table$p.value   <- as.numeric(output.table$p.value)
	    table.q.values <- calculate_q_values_vector(output.table$p.value,CA)   #The function returns the q-value vector directly
	    output.table$q.value <- table.q.values
	    }


	if( !is.na(CA$concurrent.output) || CA$make.output.table )
	    {
	    CA$output.table <- output.table[1:internal.loop.counter,]
	    } else
	    {
	    CA$output.table <- NULL
	    }
	
	#********************************************************************
	#*  Final Edits before exiting                                      *
	#********************************************************************
	rownames(CA$p.values) <- colnames(CA$data1.norm)		#Post the column names
	colnames(CA$p.values) <- colnames(CA$data2.norm)		#Post the column names
 
	rownames(CA$sim.score) <- colnames(CA$data1.norm)		#Post the column names
	colnames(CA$sim.score) <- colnames(CA$data2.norm)		#Post the column names
	rownames(CA$q.values) <- colnames(CA$data1.norm)		#Post the column names
	colnames(CA$q.values) <- colnames(CA$data2.norm)		#Post the column names
	rownames(CA$z.stat) <- colnames(CA$data1.norm)		#Post the column names
	colnames(CA$z.stat) <- colnames(CA$data2.norm)		#Post the column names
 
	total.rows.to.display = 1:nrow(CA$p.values)				#Number of rows to display
	total.cols.to.display = 1:ncol(CA$p.values)				#Number of cols to display

	if  (length(CA$filtered.subset.cols.1) > 0) #If User selected a subset
		{ 
			total.rows.to.display = CA$filtered.subset.cols.1
		}
	if  (length(CA$filtered.subset.cols.2) > 0) #If User selected a subset
		{ 
			total.cols.to.display = CA$filtered.subset.cols.2
		}
	CA$p.values <- CA$p.values[total.rows.to.display,total.cols.to.display]   #Display only the subset of cols and rows
	CA$q.values <- CA$q.values[total.rows.to.display,total.cols.to.display]   #Display only the subset of cols and rows
	CA$sim.score <- CA$sim.score[total.rows.to.display,total.cols.to.display]   #Display only the subset of cols and rows
	CA$z.stat    <- CA$z.stat[total.rows.to.display,total.cols.to.display]
	CA <- clean_common_area_after_processing(CA)	#Clean the Common Area before returning to the User

	return(CA)			# Return the output matrix
}



ccrepe.calculations  <-
#*************************************************************************************
#*  	ccrepe calculations                                                          *
#*************************************************************************************
function(CA) 
{
	Output <-list()
 
	CA <- DecodeInputCAArguments(CA)							#Decode Input Parameters
        CA <- preprocess_data(CA)

	if (CA$OneDataset == TRUE)
		{
		CA  = ccrepe_process_one_dataset(CA$data1.norm,CA$iterations, CA)  				#Invoke the new function 
		}
	else
		{
		CA = ccrepe_process_two_datasets  (CA$data1.norm ,CA$data2.norm ,CA$iterations, CA)
		}
	return ( CA )
}






DecodeInputCAArguments <-
function(CA){
#*************************************************************************************
#*	Decode and validate input parameters                                             *
#*************************************************************************************
	
	CA$Gamma = 0.57721566490153286060651209008240243104215933593992  		#Euler's Gamma


	if (length(CA$data1) == 1)					#If the user did not select at least one input dataframe - Stop the run
		{
		if (is.na(CA$data1))
			stop('You must select at least one input data frame (data1<-YourData)') 
		}
	CA$OneDataset <- FALSE						#Set the symmetrical flag to FALSE (will be TRUE if data2=data1)		
	if (length(CA$data2) == 1)					#If the user did not enter data2 - assume data2=data1
		if (is.na(CA$data2))					
		{
		CA$OneDataset <- TRUE					#And set up the Symmetrical flag
		}

 
 
    if  (!is.numeric(CA$subset.cols.1)  && !(is.null(CA$subset.cols.1)))     #Check that subset.cols.1 is numeric (is.numeric also accepts integer vectors)
		{                                           #If not numeric - set to NULL                             
		ErrMsg = paste0(
	     	    "Subset columns1 is not numeric - was : "  ,
				CA$subset.cols.1,
			    " - Set to NULL"
		    )
	     warning(ErrMsg)
	    CA$subset.cols.1 = NULL					#Set to default
		}
	if  (!is.numeric(CA$subset.cols.2)  && !(is.null(CA$subset.cols.2)))     #Check that subset.cols.2 is numeric (is.numeric also accepts integer vectors)
		{                                           #If not numeric - set to NULL                             
		ErrMsg = paste0(
	     	    "Subset columns2 is not numeric - was : "  ,
				CA$subset.cols.2,
			    " - Set to NULL"
		    )
	     warning(ErrMsg)
	    CA$subset.cols.2 = NULL					#Set to default
		}
  
 
  
	if    (!is.na(CA$outdist))							#If the user passed a file - open it
		{
                if(file.exists(CA$outdist)){
                    ErrMsg = paste0(
                        "Distribution file ",
                        CA$outdist,
                        " already exists--overwriting."
                        )
                    warning(ErrMsg)
                    file.remove(CA$outdist)
                }
		CA$outdistFile = file(CA$outdist,open='at')		#Open outdist file
		}

	if    (!is.na(CA$concurrent.output))							#If the user passed a file - open it
		{
                if(file.exists(CA$concurrent.output)){
                    ErrMsg = paste0(
                        "Concurrent output file ",
                        CA$concurrent.output,
                        " already exists--overwriting."
                        )
                    warning(ErrMsg)
                    file.remove(CA$concurrent.output)
                }
		CA$concurrentFile = file(CA$concurrent.output,open='at')		#Open the concurrent output file
		}

 
	if ( !is.logical(CA$compare.within.x ) )	#compare.within.x must be either TRUE or FALSE
		{
		ErrMsg = paste0("Setting compare.within.x to TRUE - it was ",CA$compare.within.x )
		warning(ErrMsg)
		CA$compare.within.x =  TRUE									#True - is the default
		}
 		
	if ( !is.logical(CA$make.output.table ) )	#make.output.table must be either TRUE or FALSE
		{
		ErrMsg = paste0("Setting CA$make.output.table to FALSE - it was ",CA$make.output.table )
		warning(ErrMsg)
		CA$make.output.table =  FALSE								#FALSE - is the default
		}	
 
 
 
	if ( !is.logical(CA$verbose)  )	#Verbose Flag has to be either true or false, otherwise - we set it to false
		{
		ErrMsg = paste0("Verbose Flag set to False was:", CA$verbose)
		warning(ErrMsg)
		CA$verbose =  FALSE									#False - is the default
		}
 	
 
	if  ( is.na(suppressWarnings(as.integer(CA$iterations.gap)))   || !(CA$iterations.gap%%1==0) ) 	#Check the iterations gap (number of iterations after which to 
																									#print status if verbose) - cannot be fractional
		{ 
		ErrMsg = paste0("iterations.gap set to 100 -  was: ", CA$iterations.gap)
		warning(ErrMsg)
		CA$iterations.gap = 100 					#If not valid - use 100
		}
 	

 
	CA$sim.score.parameters<-list()							#Initialize the parameter list
	CA$sim.score.parameters <- CA$method.args				#Add the entries in method.args to the measuring parameter list
 

	CA$retries.max.iterations =  -round(log2(CA$errthresh)) #This is the maximum number of iterations to try to reboot a matrix if in a col all values are 0
		
		
	return(CA)			 				#Return list of decoded input parameters
}




extractCor <-
function(mat1,mat2,startrow,endrow,startcol,endcol,col.subset,my.method,method.args,outdist,outdistFile,  ...)
#******************************************************************************************
# A function to calculate the correlation of the two matrices by merging them,            *
#     calculating the correlation of the merged matrix, and extracting the appropriate    *
#     submatrix to obtain the correlation of interest                                     *
# startrow, endrow, startcol, endcol give the indices of the submatrix to extract         *
# The method and the method parameters are passed via the argument list                   *
#******************************************************************************************
{
  	mat <- merge_two_matrices(mat1,mat2)[,col.subset]	            #Merge the two matrices
	measure.function.parm.list <- append(list(x=mat), method.args)	
	mat_C <-do.call(my.method,measure.function.parm.list)	#Invoke the measuring function
        if(length(startrow:endrow)==1){
            sub_mat_C <- matrix(mat_C[startrow:endrow,startcol:endcol],nrow=1)
        } else if(length(startcol:endcol)==1){
            sub_mat_C <- matrix(mat_C[startrow:endrow,startcol:endcol],ncol=1)
        } else {
            sub_mat_C <- mat_C[startrow:endrow, startcol:endcol] # Extract the appropriate submatrix
        }
	return(sub_mat_C)
}
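# Illustrative sketch (editorial example, not part of the package): extractCor scores the
# merged matrix once and keeps only the dataset1-vs-dataset2 block.  The matrices and index
# values below are hypothetical.  Guarded with if (FALSE) so it never runs.
if (FALSE) {
	m1 <- matrix(runif(20), nrow = 5, dimnames = list(paste0("s", 1:5), paste0("x", 1:4)))
	m2 <- matrix(runif(15), nrow = 5, dimnames = list(paste0("s", 1:5), paste0("y", 1:3)))
	extractCor(m1, m2,
	           startrow = 1, endrow = 4,           # rows: features of mat1
	           startcol = 5, endcol = 7,           # cols: features of mat2 in the merged matrix
	           col.subset = 1:7,                   # keep all 7 merged columns
	           my.method = cor, method.args = NULL,
	           outdist = NA, outdistFile = NULL)   # returns the 4 x 3 cross-correlation block
}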




merge_two_matrices <-
function(mat1,mat2)
#*************************************************************************************
# A function to merge two matrices, which works when they're different dimensions    *
# merge the two matrices by the rows, merging only rows present in both              *
# (ensures no missing samples)                                                       *
# exclude the first column, which is the row names                                   *
# <----------- Important  !!! --------------->                                       *
#* The matrices are merged by row name (Which defaults to row number)                *
#* so the assumption is that SAME ROWS pertain to SAME SUBJECTS                      *
#* Also note that the first column is treated as data - not Subject.ID or any other  *
#* identifier                                                                         *
#*************************************************************************************
{
 	mat <- merge(mat1,mat2,by="row.names")
	rownames(mat) = mat[,1]    # The first column has the row names
	mat = mat[,-1]             # Remove the first column
	return(mat)
}
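# Illustrative sketch (editorial example, not part of the package): the merge is by row
# name, so only subjects present in both matrices are kept and their rows are aligned.
# Guarded with if (FALSE) so it never runs.
if (FALSE) {
	a <- matrix(1:6,  nrow = 3, dimnames = list(c("s1", "s2", "s3"), c("f1", "f2")))
	b <- matrix(7:12, nrow = 3, dimnames = list(c("s2", "s3", "s4"), c("g1", "g2")))
	merge_two_matrices(a, b)   # rows s2 and s3 only; columns f1, f2, g1, g2
}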




lappend <-
function(lst, obj) {
#*************************************************************************************
#*  Function to append obj to lst, where lst is a list                               *
#*************************************************************************************
    lst[[length(lst)+1]] <- obj
    return(lst)
}





permute <-
function(data,permute.id.matrix){
#*************************************************************************************
#* Function to permute the data using a matrix of indices                            *
#* data is the data which you want to permute                                        *
#* permute.id.matrix is a matrix where each column is some                           *
#* permutation of the row indices                                                    * 
#*************************************************************************************
	permute.data = data
	for(i in 1:ncol(data)){									# For each column
		permute.data[,i] = data[permute.id.matrix[,i],i] 	# Replace the original column with a permuted one
	}
	return(permute.data)
}
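# Illustrative sketch (editorial example, not part of the package): each column of the data
# is reordered by the matching column of the index matrix, breaking associations between
# features while preserving each feature's marginal values.  Guarded with if (FALSE).
if (FALSE) {
	dat <- matrix(1:12, nrow = 4)                      # 4 subjects x 3 features
	ids <- replicate(ncol(dat), sample(nrow(dat)))     # one permutation of 1:4 per column
	permute(dat, ids)
}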



filter_dataset <-
function(X){
#**********************************************************************
#	Filter out missing rows and rows/cols that are all zero          *
#**********************************************************************
    missing_rows <- which(
        apply(
            X,
            1,
            function(row){
                sum(is.na(row))
            }
            ) > 0
        )
    if(length(missing_rows)>0){
        X_no_missing <- X[-missing_rows,]
    } else {
        X_no_missing <- X
    }
    raw_all_zero_rows <- which(
        apply(
            X,
            1,
            function(row){
                sum(row==0)/length(row)
            }
            )==1
        )
    all_zero_rows <- which(rownames(X_no_missing) %in% rownames(X)[raw_all_zero_rows])
    raw_all_zero_cols <- which(
        apply(
            X_no_missing,
            2,
            function(col){
                sum(col==0)/length(col)
            }
            )==1
        )
    all_zero_cols <- which(colnames(X_no_missing) %in% colnames(X)[raw_all_zero_cols])
    if(length(all_zero_rows)>0 && length(all_zero_cols)>0){
        X_filtered <- X_no_missing[-all_zero_rows,-all_zero_cols]
    } else if( length(all_zero_rows) > 0 ){
        X_filtered <- X_no_missing[-all_zero_rows,]
    } else if( length(all_zero_cols) > 0 ){
        X_filtered <- X_no_missing[,-all_zero_cols]
    } else {
        X_filtered <- X_no_missing
    }
    
    return(
        list(
            X_filtered    = X_filtered,
            all_zero_rows = raw_all_zero_rows,
            all_zero_cols = raw_all_zero_cols,
            missing_rows  = missing_rows
            )
        )
}
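# Illustrative sketch (editorial example, not part of the package): rows with any missing
# value are dropped first, then rows and columns that are entirely zero; the returned index
# vectors refer to the original, unfiltered matrix.  Guarded with if (FALSE).
if (FALSE) {
	X <- matrix(c( 1, 2, 0,
	              NA, 1, 0,
	               0, 0, 0,
	               3, 4, 0), nrow = 4, byrow = TRUE,
	            dimnames = list(paste0("s", 1:4), paste0("f", 1:3)))
	filter_dataset(X)   # drops s2 (missing value), s3 (all zero) and f3 (all zero)
}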

check_data_values <-
function(CA){
#**********************************************************************
#	Check that all data values are valid 				                              *
#**********************************************************************
    # Check non-negativity
    num.neg <- sum(CA$data1[!is.na(CA$data1)] < 0)
    if( num.neg > 0 ){
        ErrMsg <- paste(
            "There are",
            num.neg,
            "negative values in x.  CCREPE only uses non-negative values--stopping run.")
        stop(ErrMsg)
    }
    # Check simplex
    num.gt.1 <- sum(CA$data1[!is.na(CA$data1)] > 1)
    if( num.gt.1 > 0){
        ErrMsg <- paste(
            "x appears to be count data, since there are",
            num.gt.1,
            "values greater than 1."
            )
        warning(ErrMsg)
    }
    
    if(!CA$OneDataset){
        # Check non-negativity
        num.neg2 <- sum(CA$data2[!is.na(CA$data2)] < 0)
        if( num.neg2 > 0 ){
            ErrMsg <- paste(
                "There are",
                num.neg2,
                "negative values in y.  CCREPE only uses non-negative values--stopping run.")
            stop(ErrMsg)
        }
        # Check simplex
        num2.gt.1 <- sum(CA$data2[!is.na(CA$data2)] > 1)
        if( num2.gt.1 > 0){
            ErrMsg <- paste(
                "y appears to be count data, since there are",
                num2.gt.1,
                "values greater than 1."
                )
            warning(ErrMsg)
        }
    }
}

preprocess_data <-
function(CA)
#**********************************************************************
#	Preprocess input data 				                              *
#**********************************************************************
{
    check_data_values(CA)
    
    if(CA$OneDataset){
        if(is.null(colnames(CA$data1))) colnames(CA$data1) <- seq_len(ncol(CA$data1))
        if(is.null(rownames(CA$data1))) rownames(CA$data1) <- seq_len(nrow(CA$data1))

        filtered_data1 <- filter_dataset(CA$data1)
    } else {
        if(is.null(colnames(CA$data1))) colnames(CA$data1) <- paste0(seq_len(ncol(CA$data1)),".X")
        if(is.null(rownames(CA$data1))) rownames(CA$data1) <- seq_len(nrow(CA$data1))
        if(is.null(colnames(CA$data2))) colnames(CA$data2) <- paste0(seq_len(ncol(CA$data2)),".Y")
        if(is.null(rownames(CA$data2))) rownames(CA$data2) <- seq_len(nrow(CA$data2))

        filtered_data1 <- filter_dataset(CA$data1)
        filtered_data2 <- filter_dataset(CA$data2)
    }

    if(length(filtered_data1$missing_rows)>0){
        ErrMsg = paste(
            "Excluding subject(s)",
            toString(rownames(CA$data1)[filtered_data1$missing_rows]),
            "from x because they have missing values."
            )
        warning(ErrMsg)
    }

    if(length(filtered_data1$all_zero_rows)>0){
        ErrMsg = paste(
            "Excluding subject(s)",
            toString(rownames(CA$data1)[filtered_data1$all_zero_rows]),
            "from x because they are entirely zero."
            )
        warning(ErrMsg)
    }

    if(length(filtered_data1$all_zero_cols)>0){
        ErrMsg = paste(
            "Excluding feature(s)",
            toString(colnames(CA$data1)[filtered_data1$all_zero_cols]),
            "from x because they are entirely zero."
            )
        warning(ErrMsg)
    }

    if(CA$OneDataset){
        if(nrow(filtered_data1$X_filtered) < CA$min.subj )
            {
                ErrMsg = paste0(
                    'Not enough data - found ',
                    nrow(filtered_data1$X_filtered),
                    ' rows of non-missing data - Less than  ',
                    CA$min.subj,
                    ' (=min.subj) - Run Stopped'
                    )
                stop(ErrMsg)
            }

        CA$data1.norm = filtered_data1$X_filtered/rowSums(filtered_data1$X_filtered)
        CA$filtered.subset.cols.1 <- NULL
        CA$filtered.subset.cols.2 <- NULL
        if(!is.null(CA$subset.cols.1)){
            CA$filtered.subset.cols.1 <- which(colnames(CA$data1.norm) %in% colnames(CA$data1)[CA$subset.cols.1])
            if( length(CA$filtered.subset.cols.1)==0 ){
                ErrMsg = paste0(
                    "User requested subset.cols.x=c(",
                    toString(CA$subset.cols.1),
                    "), but all are filtered.  Setting subset.cols.x=NULL."
                    )
                warning(ErrMsg)
                CA$subset.cols.1 <- NULL
                CA$filtered.subset.cols.1 <- NULL
            }
        }
        if(!is.null(CA$subset.cols.2)){
            CA$filtered.subset.cols.2 <- which(colnames(CA$data1.norm) %in% colnames(CA$data1)[CA$subset.cols.2])
            if( length(CA$filtered.subset.cols.2)==0 ){
                ErrMsg = paste0(
                    "User requested subset.cols.y=c(",
                    toString(CA$subset.cols.2),
                    "), but all are filtered.  Setting subset.cols.y=NULL"
                    )
                warning(ErrMsg)
                CA$subset.cols.2 <- NULL
                CA$filtered.subset.cols.2 <- NULL
            }
        }

    } else {
        
        if(length(filtered_data2$missing_rows)>0){
            ErrMsg = paste(
                "Excluding subject(s)",
                toString(rownames(CA$data2)[filtered_data2$missing_rows]),
                "from y because they have missing values."
                )
            warning(ErrMsg)
        }
        
        if(length(filtered_data2$all_zero_rows)>0){
            ErrMsg = paste(
                "Excluding subject(s)",
                toString(rownames(CA$data2)[filtered_data2$all_zero_rows]),
                "from y because they are entirely zero."
                )
            warning(ErrMsg)
        }
        
        if(length(filtered_data2$all_zero_cols)>0){
            ErrMsg = paste(
                "Excluding feature(s)",
                toString(colnames(CA$data2)[filtered_data2$all_zero_cols]),
                "from y because they are entirely zero."
                )
            warning(ErrMsg)
        }

        CA$data1.norm = filtered_data1$X_filtered/rowSums(filtered_data1$X_filtered)
        CA$data2.norm = filtered_data2$X_filtered/rowSums(filtered_data2$X_filtered)
        remove_from_d1 <- which(
            rownames(CA$data1.norm) %in% setdiff(rownames(CA$data1.norm),rownames(CA$data2.norm))
            )
        remove_from_d2 <- which(
            rownames(CA$data2.norm) %in% setdiff(rownames(CA$data2.norm),rownames(CA$data1.norm))
            )
        if(length(remove_from_d1) > 0 && length(remove_from_d2) > 0){
            ErrMsg = paste0("Removing subjects ",
                toString(rownames(CA$data1.norm)[remove_from_d1]),
                " from dataset x and subjects ",
                toString(rownames(CA$data2.norm)[remove_from_d2]),
                " from dataset y because they are not paired between the datasets."
                )
            warning(ErrMsg)
            CA$data1.norm <- CA$data1.norm[-remove_from_d1,]
            CA$data2.norm <- CA$data2.norm[-remove_from_d2,]
        } else if( length(remove_from_d1) > 0 ){
            ErrMsg = paste0(
                "Removing subjects ",
                toString(rownames(CA$data1.norm)[remove_from_d1]),
                " from dataset x because they are not in dataset y."
                )
            warning(ErrMsg)
            CA$data1.norm <- CA$data1.norm[-remove_from_d1,]
        } else if( length(remove_from_d2) > 0 ){
            ErrMsg = paste0(
                "Removing subjects ",
                toString(rownames(CA$data2.norm)[remove_from_d2]),
                " from dataset y because they are not in dataset x."
                )
            warning(ErrMsg)
            CA$data2.norm <- CA$data2.norm[-remove_from_d2,]
        }

        merged_data <- merge_two_matrices(CA$data1.norm,CA$data2.norm)
        if(nrow(merged_data) < CA$min.subj )
            {
                ErrMsg = paste0(
                    'Not enough data - found ',
                    nrow(merged_data),
                    ' paired rows of non-missing data - Less than  ',
                    CA$min.subj,
                    ' (=min.subj) - Run Stopped'
                    )
                stop(ErrMsg)
            }
        
        CA$filtered.subset.cols.1 <- NULL
        CA$filtered.subset.cols.2 <- NULL
        if(!is.null(CA$subset.cols.1)){
            CA$filtered.subset.cols.1 <- which(colnames(CA$data1.norm) %in% colnames(CA$data1)[CA$subset.cols.1])
            if( length(CA$filtered.subset.cols.1)==0 ){
                ErrMsg = paste0(
                    "User requested subset.cols.x=c(",
                    toString(CA$subset.cols.1),
                    "), but all are filtered.  Setting subset.cols.x=NULL."
                    )
                warning(ErrMsg)
                CA$subset.cols.1 <- NULL
                CA$filtered.subset.cols.1 <- NULL
            }
        }
        if(!is.null(CA$subset.cols.2)){
            CA$filtered.subset.cols.2 <- which(colnames(CA$data2.norm) %in% colnames(CA$data2)[CA$subset.cols.2])
            if( length(CA$filtered.subset.cols.2)==0 ){
                ErrMsg = paste0(
                    "User requested subset.cols.y=c(",
                    toString(CA$subset.cols.2),
                    "), but all are filtered.  Setting subset.cols.y=NULL."
                    )
                warning(ErrMsg)
                CA$subset.cols.2 <- NULL
                CA$filtered.subset.cols.2 <- NULL
            }
        }


    }
    return(CA)

}




	
print.dist <-
function(bootstrap.dist,permutation.dist,  CA,i, k)	{
#*************************************************************************************
#*  Function to print the distribution                                               *
#*************************************************************************************
	outdistFile = CA$outdistFile 						#Output file
	
	output.string0 <- paste(bootstrap.dist,sep="",collapse=',')		#Build the output string 
	if (CA$OneDataset == TRUE)							#The structure of the output is different for one dataset and two datasets
		{output.string <-paste("Boot,",i,",",k,",",colnames(CA$data1.norm)[i],",",colnames(CA$data1.norm)[k],",",output.string0,'\n', sep='',collapse=",")}
		else
		{output.string <-paste("Boot,",i,",",k,",",colnames(CA$data1.norm)[i],",",colnames(CA$data2.norm)[k],",",output.string0,'\n', sep='',collapse=",")}

	cat(output.string,file=CA$outdistFile,append=TRUE)
	
	output.string0 <- paste(permutation.dist,sep="",collapse=',')		#Build the output string 
	if (CA$OneDataset == TRUE)							#The structure of the output is different for one dataset and two datasets
		{output.string <-paste("Permutation,",i,",",k,",",colnames(CA$data1.norm)[i],",",colnames(CA$data1.norm)[k],",",output.string0,'\n', sep='',collapse=",")}
		else
		{output.string <-paste("Permutation,",i,",",k,",",colnames(CA$data1.norm)[i],",",colnames(CA$data2.norm)[k],",",output.string0,'\n', sep='',collapse=",")}
	cat(output.string,file=CA$outdistFile,append=TRUE)


	return (0)
	}

 
	
	
	
	
	
resample <-
function(data,resample.matrix){
#*************************************************************************************
#* Function to calculate the resampled data matrix; this                             *
#*    is used for getting only the bootstrapped data                                 *
#*    data is the data which you want to bootstrap                                   *
#*   resample.matrix is the matrix appropriate for getting                           *
#*    you the bootstrapped dataset: matrix is square, with                           *
#*    the same number of rows as data; each row has exactly one                      *
#*    1 in it                                                                        *
#*************************************************************************************
	data.matrix <-as.matrix(data)
	return(resample.matrix%*%data.matrix)
}
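# Illustrative sketch (editorial example, not part of the package): resample.matrix is built
# from rows of an identity matrix, so resample.matrix %*% data simply picks out the sampled
# subjects (with repetition) from the original data.  Guarded with if (FALSE).
if (FALSE) {
	dat  <- matrix(1:12, nrow = 4)            # 4 subjects x 3 features
	rows <- c(2, 2, 4, 1)                     # a hypothetical bootstrap draw of subject indices
	sel  <- diag(4)[rows, ]                   # the 0/1 selection matrix
	all.equal(resample(dat, sel), dat[rows, ] + 0)   # TRUE: same as direct row indexing
}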




clean_common_area_after_processing <-
function(CA){
#*************************************************************************************
#* 	Function to clean the Common Area After processing                               *
#*  Common Area is passed to the User as results                                     *
#*************************************************************************************
	if   (!is.na(CA$outdist))										#If user requested to print the distributions
		{
		close(CA$outdistFile)										#Close outdist file	
		CA$outdistFile <- NULL										#And remove it from the common area
		}

	if   (!is.na(CA$concurrent.output))										#If user requested concurrent output
		{
		close(CA$concurrentFile)										#Close the concurrent output file	
		CA$concurrentFile <- NULL										#And remove it from the common area
		}

	if( !CA$make.output.table )
	    {
	    CA$output.table <- NULL
	    }


	if (CA$verbose.requested == TRUE)								#Check if the User requested verbose
		{															#We might have turned it off before calling CA$method	
		CA$verbose = TRUE	
		}
	CA$verbose.requested = NULL		
	CA$data1 <- NULL													
	CA$data1.norm <- NULL  											
	CA$data2 <- NULL													
	CA$data2.norm <- NULL  												
	CA$method <- NULL													
	CA$method.args<- NULL 										
	CA$OneDataset <- NULL													
	CA$outdist <- NULL	
	CA$concurrent.output <- NULL
	CA$Gamma <- NULL 
	CA$retries.max.iterations <- NULL
	CA$subset.cols.x <-CA$subset.cols.1
	CA$subset.cols.y <-CA$subset.cols.2
	CA$subset.cols.1 <- NULL
	CA$subset.cols.2 <- NULL
        CA$filtered.subset.cols.1 <- NULL
        CA$filtered.subset.cols.2 <- NULL
	

	if  ( length(CA$subset.cols.x) == 1 &&  CA$subset.cols.x == c(0))			#If 0 (unset) - set to default
		{
		CA$subset.cols.x <-NULL					#Set to default
		}
	if  ( length(CA$subset.cols.y) == 1 &&  CA$subset.cols.y == c(0))			#If 0 (unset) - set to default
		{
		CA$subset.cols.y <-NULL					#Set to default
		}


						 
	if (!CA$verbose == TRUE)												 
		{
		CA$min.subj <- NULL		 											 
		CA$iterations <- NULL												 
		CA$method.args <- NULL										 		
		CA$verbose <- NULL													 
		CA$iterations.gap <- NULL	
		CA$compare.within.x <- NULL
		CA$sim.score.parameters <- NULL	
		CA$subset.cols.x <- NULL
		CA$subset.cols.y <- NULL	
		CA$errthresh <- NULL
		CA$make.output.table <- NULL
		}



	return(CA)
}



bootstrap.method.calcn <-
function(subsetted.matrix,nsubj,original.data,CA,col.subset){
#*************************************************************************************
#* 	Function to calculate the similarity score and check that no column total is zero   *
#*************************************************************************************
	check.col.sums <- colSums(subsetted.matrix[,col.subset])==0
	if (length(check.col.sums[check.col.sums==TRUE]))  		#If there is a column that is all zeros - try to reboot data CA$retries.max.iterations=-round(log2(CA$errthresh)) times
		{
		cnt.tries = 0							#Initialize counter of  tries that we will try to reboot
		while(cnt.tries < CA$retries.max.iterations  && length(check.col.sums[check.col.sums==TRUE]))  #Check if we succeeded rebooting the data so no cols are zero
			{
			cnt.tries <- cnt.tries+1			#Increase the counter
			b1 = try.reboot.again (nsubj,original.data)	#Try to reboot again
			check.col.sums <- colSums(b1[,col.subset])==0	#Are there columns with all zeros in the new rebooted matrix?
			if (!length(check.col.sums[check.col.sums==TRUE])) #If there is no column that is all zeros - post the result to continue processing
				{subsetted.matrix <- b1}						#Post the result
			}
		if (cnt.tries >= CA$retries.max.iterations  && length(check.col.sums[check.col.sums==TRUE]))	#If reboot did not work - issue a warning and use the last resample
			{
			ErrMsg = paste('Tried to reboot the original.data', CA$retries.max.iterations, 'times but always got a column that is all zeros')  #Warning message 
			warning(ErrMsg)

			}
		}
	
	measure.function.parm.list <- append(list(x=subsetted.matrix[,col.subset]), CA$method.args)	#Build the measure function parameter list
	boot.cor <-do.call(CA$method,measure.function.parm.list)	#Invoke the measuring function		
	return(boot.cor)				
	}

try.reboot.again <-
function(nsubj,data){
#*************************************************************************************
#* 	Function to reboot the data matrix again so that we don't get columns with all 0s    *
#*************************************************************************************
		possible.rows = diag(rep(1,nsubj))
		boot.rowids        = sample(seq(1,nsubj),nsubj,replace=TRUE)
		# Generate the bootstrap matrices by getting the appropriate rows from the possible bootstrap rows
		boot.matrix        = possible.rows[boot.rowids,]
		b1<-resample (data,boot.matrix) 		#b1 is the rebooted matrix that we return
		return(b1)
}

log.processing.progress <-
function(CA,msg){
#*************************************************************************************
#* 	Function to report the progress of processing                                    *
#*  only if verbose flag is on                                                       *
#*************************************************************************************
		if (CA$verbose == TRUE  ||  (!is.null(CA$verbose.requested)  && CA$verbose.requested == TRUE ))
			message(paste(date(),' ==> ',msg))			#Display date and time and progress
		return (0)
}

calculate.z.stat.and.p.value <-
function(bootstrap.dist,permutation.dist ){
#*************************************************************************************
#* Function to calculate the z.stat and p.value                                      *
#*************************************************************************************
	z.stat_p.value_list = list()		#Define the list
	z.stat_p.value_list$z.stat <- (mean(bootstrap.dist) - mean(permutation.dist))/sqrt(0.5*(var(permutation.dist)+var(bootstrap.dist)))
	z.stat_p.value_list$p.value <- 2*pnorm(-abs(z.stat_p.value_list$z.stat))
	return(z.stat_p.value_list)
}
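# Illustrative sketch (editorial example, not part of the package): the z statistic is the
# difference of the two distribution means scaled by the pooled standard deviation, and the
# p-value is the two-sided normal tail probability.  Guarded with if (FALSE).
if (FALSE) {
	boot <- rnorm(1000, mean = 0.4, sd = 0.1)   # toy bootstrap distribution
	perm <- rnorm(1000, mean = 0.0, sd = 0.1)   # toy permutation (null) distribution
	res  <- calculate.z.stat.and.p.value(boot, perm)
	res$z.stat    # roughly (0.4 - 0) / sqrt(0.5 * (0.01 + 0.01)) = 4
	res$p.value   # about 2 * pnorm(-4)
}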
