R/ccrepe.utilities.R

Defines functions calculate.z.stat.and.p.value log.processing.progress try.reboot.again bootstrap.method.calcn clean_common_area_after_processing resample print.dist preprocess_data check_data_values filter_dataset permute lappend merge_two_matrices extractCor DecodeInputCAArguments ccrepe_process_two_datasets ccrepe_process_one_dataset bootstrap.procedure.memory.optimized bootstrap.procedure.simple ccrepe_norm calculate_q_values_vector calculate_q_values

#' @importFrom stats complete.cases na.omit pnorm var
#'
calculate_q_values <- function(CA)
  #**********************************************************************
  #	Calculate Q values    				                              *
  #**********************************************************************
{
  q.values.calc.matrix <- CA$p.values   #Set the default p.values  for one and two dataset cases
  if  (CA$OneDataset == TRUE)
  {q.values.calc.matrix[lower.tri(q.values.calc.matrix,diag=TRUE)] <- NA}    #If One dataset,  use only the upper part of the matrix

  QValuesArranged <- calculate_q_values_vector(q.values.calc.matrix,CA)

  CA$q.values<-QValuesArranged
  if (CA$OneDataset){
    CA$q.values[lower.tri(CA$q.values)] <- t(CA$q.values)[lower.tri(CA$q.values)]
  }

  return(CA)
}

calculate_q_values_vector <- function(p.values.vector,CA){

  p.values <- p.values.vector
  missing_idx <- which(is.na(p.values))
  if (length(missing_idx)>0){
    non.missing.p.values <- p.values[-missing_idx]
    m = length(non.missing.p.values)                                #m is the total number of p-values
    ln_m = log(m)									#log m
    ln_m_Plus_Gamma = ln_m + CA$Gamma
    q.values <- p.values
    q.values[-missing_idx] = non.missing.p.values * m * ln_m_Plus_Gamma / rank(non.missing.p.values)
  } else {
    m = length(p.values)
    ln_m = log(m)
    ln_m_Plus_Gamma = ln_m + CA$Gamma
    q.values <- p.values * m * ln_m_Plus_Gamma / rank(p.values)
  }

  return(q.values)
}


ccrepe_norm <-
  function(data){
    #*************************************************************************************
    #* A function to normalize a data matrix across the rows                             *
    #*************************************************************************************
    if(0 %in% rowSums(data)){
      ErrMsg = paste0(
        "All-zero subjects generated by permutation: ",
        toString(which(rowSums(data)==0)),
        ".  These will be replaced by all zeros after normalization"
      )
      warning(ErrMsg)
    }
    data.normalized <- data/rowSums(data)		#Normalize
    data.normalized[rowSums(data)==0,] <- 0 #Replace all zero-rows (which become NA after normalization) with zeros.
    return(data.normalized)
  }



bootstrap.procedure.simple <- function(n.features,nsubj,n.iter,data,possible.rows,col.subset,CA) {
  #Generating the bootstrap and permutation matrices
  bootstrap.matrices = list()    # The list of matrices of possible bootstrap rows
  #(each matrix multiplies by the data to give a resampled dataset)
  permutation.matrices = list()  # The list of permutation matrices; each matrix will be
  # have columns which are permutations of the row indices



  #**************************************************************
  #*  Pre allocate                                              *
  #**************************************************************
  permutation.matrices  = vector("list", n.iter)		#Pre allocate the permutation matrices
  bootstrap.matrices = vector("list", n.iter)		#Pre allocate


  for(i in seq_len(n.iter)){
    if (i %% CA$iterations.gap == 0)   #If output is verbose and the number of iterations
      #is multiple of iterations gap - print status
    {
      print.msg = paste('Sampled ',i,' permutation and bootstrap datasets')
      log.processing.progress(CA,print.msg)  #Log progress
    }


    # Get the rows of the possible.rows matrix; these correspond to the rows
    # which will be included in the resampled dataset

    boot.rowids = sample(seq(1,nsubj),nsubj,replace=TRUE)

    # Generate the bootstrap matrix by getting the appropriate rows from the possible bootstrap rows
    boot.matrix = possible.rows[boot.rowids,]

    # Add the bootstrap matrix to the list
    bootstrap.matrices [[ i ]] = boot.matrix   #Add bootstrap matrix

    perm.matrix = replicate(n.features,sample(seq(1,nsubj),nsubj,replace=FALSE))  # The matrix has each column be a permutation of the row indices

    permutation.matrices [[i]] = perm.matrix		#Populate the permutation matrix
  }

  # The bootstrapped data; resample the data using each bootstrap matrix


  log.processing.progress(CA,"Building boot data")  #Log progress
  boot.data = lapply(bootstrap.matrices,resample,data=data)


  ###  Using bootstrap.method.calcn  for the one dataset also

  log.processing.progress(CA,paste("Getting similarity score for bootstrap data: col.subset =",toString(col.subset)))  #Log progress
  boot.cor  = lapply(boot.data,bootstrap.method.calcn,nsubj,data,CA,col.subset )		 ###Function to check is all cols are zeros and apply cor

  log.processing.progress(CA,"Generating permutation data")  #Log progress
  # Generating the permutation data; permute the data using each permutation matrix
  permutation.data = lapply(permutation.matrices,permute,data=data)
  # The correlation matrices for the bootstrapped data; calculate the correlation for each resampled dataset

  if (CA$renormalize){# The normalized permutation data; the permutation data needs to be renormalized, but not the bootstrap data
    permutation.norm.raw = lapply(permutation.data,ccrepe_norm)
  }
  else{
    permutation.norm.raw = permutation.data
  }
  permutation.norm     = lapply(permutation.norm.raw,function(mat) mat[,col.subset])

  # The correlation matrices of the permuted data; calculate the correlation for each permuted dataset

  CA$verbose.requested = FALSE			#If the User requested verbose output - turn it off temporarily
  if (CA$verbose == TRUE)
  {
    CA$verbose.requested <- TRUE		#Turn of the verbose flag save variable
    CA$verbose <- FALSE					#But pass non verbose to the CA$method (Could be nc.score or anything else)
  }

  log.processing.progress(CA,paste("Calculating permutation similarity score: col.subset =",toString(col.subset)))  #Log progress
  permutation.cor <- do.call(lapply,c(list(permutation.norm,CA$method), CA$method.args))  #Invoke the measuring function
  # Now, actually calculating the correlation p-values within the dataset
  log.processing.progress(CA,"Calculating p-values")  #Log progress



  loop.range1 <- which( col.subset %in% 1:n.features)						#Establish looping range default
  loop.range2 <- which( col.subset %in% 1:n.features )						#Establish looping range default
  max.comparisons <- choose(length(col.subset),2)					#The default number of comparisons

  if( length(CA$filtered.subset.cols.1) > 0 )		#If the User entered a subset of columns
  {
    loop.range1 <- which(col.subset %in% CA$filtered.subset.cols.1)		#Use that subset


    if( CA$compare.within.x )               #If comparing only within subset.cols.x
    {
      loop.range2 <- which(col.subset %in% CA$filtered.subset.cols.1)			#Use subset.cols.x for the inner loop as well

    } else if( length(CA$subset.cols.2)>0 ){	   #If comparing between x and y and the user input subset.cols.y

      loop.range2 <- which(col.subset %in% CA$filtered.subset.cols.2) #Use subset.cols.y for the inner loop

    }
  }
  if    (!is.na(CA$outdist))                                              #If user requested to print the distributions
  {
    output.string0 <- paste0("iter_",seq_len(length(boot.cor)),collapse=",")
    output.string  <- paste0("dist_type,i,k,feature1,feature2,",output.string0,'\n', sep='',collapse=",")

    cat(output.string,file=CA$outdistFile,append=TRUE)
  }

  internal.loop.counter = 0		#Initialize the loop counter
  outer.loop.indices.completed = c()	#Initialize the list to keep track of already completed outer indices

  for( index1 in seq_len(length(loop.range1)) )
  {
    i = loop.range1[index1]
    outer.loop.indices.completed = c(outer.loop.indices.completed,i)	# Keep track of which outer loop indices have been accounted for
    inner.loop.range = setdiff(loop.range2,outer.loop.indices.completed)	# Use as the inner loop only those inner loop indices which haven't been in the outer loop
    for(index2 in seq_len(length(inner.loop.range)))
    {
      k = inner.loop.range[index2]
      # Get a vector of the (i,k)th element of each correlation matrix in the list of bootstrapped data; this is the bootstrap distribution

      internal.loop.counter = internal.loop.counter + 1  #Increment the loop counter
      if (internal.loop.counter %% CA$iterations.gap == 0)   #If output is verbose and the number of iterations is multiple of iterations gap - print status
      {
        print.msg = paste('Completed ', internal.loop.counter, ' comparisons')
        log.processing.progress(CA,print.msg)  #Log progress
      }

      bootstrap.dist = unlist(lapply(boot.cor,'[',i,k))

      # Get a vector the (i,k)th element of each correlation matrix in the list of permuted data; this is the permuted distribution
      permutation.dist = unlist(lapply(permutation.cor,'[',i,k))	#sets


      if    (!is.na(CA$outdist))						#If user requested to print the distributions
      {
        RC <- print.dist(bootstrap.dist,permutation.dist,CA,col.subset[i],col.subset[k])
      }
      measure.parameter.list <- append(list(x=data[,col.subset[i]],y=data[,col.subset[k]]), CA$sim.score.parameters)  #build the method do.call parameter list
      sim.score <- do.call(CA$method,measure.parameter.list)	#Invoke the measuring function


      z.stat_and_p.value.list  <- calculate.z.stat.and.p.value(bootstrap.dist,permutation.dist )  #Invoke new function to calculate it
      #Same calculations - just packed as a function
      z.stat <- z.stat_and_p.value.list$z.stat		#Retrieve value form the list
      p.value <- z.stat_and_p.value.list$p.value	    #Retrieve from the list



      CA$z.stat[col.subset[i],col.subset[k]] = z.stat					#Post z.stat in output matrix
      CA$z.stat[col.subset[k],col.subset[i]] = z.stat					#Post z.stat in output matrix
      CA$p.values[col.subset[i],col.subset[k]] = p.value				#Post it in the p-values matrix
      CA$p.values[col.subset[k],col.subset[i]] = p.value				#Post it in the p-values matrix
      CA$sim.score[col.subset[i],col.subset[k]] = sim.score			#Post it in the cor matrix
      CA$sim.score[col.subset[k],col.subset[i]] = sim.score			#Post it in the cor matrix
    }
  }
    return(CA)
}

bootstrap.procedure.memory.optimized <- function(n.features,nsubj,n.iter,data,possible.rows,col.subset,CA){
  n.matrix.dim <- length(col.subset)
  boot.sum <- matrix(data = 0,nrow = n.matrix.dim,ncol = n.matrix.dim)
  boot.squared.sum <- matrix(data = 0,nrow = n.matrix.dim,ncol = n.matrix.dim)
  perm.sum <- matrix(data = 0,nrow = n.matrix.dim,ncol = n.matrix.dim)
  perm.squared.sum <- matrix(data = 0,nrow = n.matrix.dim,ncol = n.matrix.dim)
  for(i in seq_len(n.iter)){
    if (i %% CA$iterations.gap == 0)   #If output is verbose and the number of iterations
      #is multiple of iterations gap - print status
    {
      print.msg = paste('Sampled and computet similarity scores for',
                        i,' permutation and bootstrap datasets')
      log.processing.progress(CA,print.msg)  #Log progress
    }


    # Get the rows of the possible.rows matrix; these correspond to the rows
    # which will be included in the resampled dataset

    boot.rowids = sample(seq(1,nsubj),nsubj,replace=TRUE)

    # Generate the bootstrap matrix by getting the appropriate rows from the possible bootstrap rows
    boot.matrix = resample(data = data,resample.matrix = possible.rows[boot.rowids,])
    boot.cor <- bootstrap.method.calcn(subsetted.matrix= boot.matrix,nsubj,data,CA,col.subset)
    boot.sum <- boot.sum + boot.cor
    boot.squared.sum <- boot.squared.sum + boot.cor*boot.cor
    perm.matrix = permute(data = data, replicate(n.features,sample(seq(1,nsubj),nsubj,replace=FALSE)))  # The matrix has each column be a permutation of the row indicies.
    if (CA$renormalize){# The normalized permutation data; the permutation data needs to be renormalized, but not the bootstrap data
      permutation.norm.raw = ccrepe_norm(perm.matrix)
    }
    else{
      permutation.norm.raw = perm.matrix
    }
    permutation.norm     = permutation.norm.raw[,col.subset]
    perm.cor <- do.call(CA$method,c(list(permutation.norm),CA$method.args))
    perm.sum <- perm.sum + perm.cor
    perm.squared.sum <- perm.squared.sum + perm.cor*perm.cor
  }
  log.processing.progress(CA,"Calculating p-values")  #Log progress
  # Contrary to the usual procedure, the computations are run vectorizesed over all pairs of OTUs, avoiding memory
  # allocation trottling for high number of OTUs
  measure.parameter.list <- append(list(x=data[,col.subset]), CA$sim.score.parameters)  #build the method do.call parameter list
  CA$sim.score[col.subset,col.subset] <- do.call(CA$method,measure.parameter.list)	#Invoke the measuring function
  mean.bootstrap.value <- boot.sum / n.iter
  mean.perm.value <- perm.sum / n.iter
  var.bootstrap.value <- (boot.squared.sum - n.iter*mean.bootstrap.value*mean.bootstrap.value) / (n.iter-1)
  var.perm.value <- (perm.squared.sum - n.iter*mean.perm.value*mean.perm.value) / (n.iter - 1)
  CA$z.stat[col.subset,col.subset] <- (mean.bootstrap.value - mean.perm.value) / sqrt(0.5*(var.bootstrap.value+var.perm.value))
  CA$p.values[col.subset,col.subset] <- 2*pnorm(-abs(CA$z.stat[col.subset,col.subset]))
  # NaN-values are induced for the diagonal elements, we must eliminate them before proceeding
  diag(CA$sim.score) <- NA
  diag(CA$z.stat) <- NA
  diag(CA$p.values) <- NA
  CA$verbose.requested = FALSE			#If the User requested verbose output - turn it off temporarily
  if (CA$verbose == TRUE)
  {
    CA$verbose.requested <- TRUE		#Turn of the verbose flag save variable
    CA$verbose <- FALSE					#But pass non verbose to the CA$method (Could be nc.score or anything else)
  }
      return(CA)
}

ccrepe_process_one_dataset <-
  function(data,n.iter, CA){
    #*************************************************************************************
    #* 	Function to apply the new method to a dataset                                    *
    #*  As a note, data needs to be a matrix with no missing values                      *
    #*************************************************************************************
    n.features = ncol(data)					# Number of columns, starting at 1; this is also the number of bugs

    CA$p.values <-matrix(data=NA,nrow=n.features,ncol=n.features)	#Build the empty PValues matrix
    CA$z.stat <-matrix(data=NA,nrow=n.features,ncol=n.features)	#Build the empty z.stat matrix

    CA$sim.score <-matrix(data=NA,nrow=n.features,ncol=n.features)	#Build the empty correlation matrix

    nsubj = nrow(data)				# Number of subjects

    # The subset of columns for which to calculate the similarity scores
    col.subset <- 1:n.features
    if( length(CA$filtered.subset.cols.1) > 0 && CA$compare.within.x )               #If the User entered a subset of columns
    {
      col.subset <- CA$filtered.subset.cols.1
    } else if(length(CA$filtered.subset.cols.2) > 0 && !CA$compare.within.x)
    {
      col.subset <- unique(c(col.subset,CA$filtered.subset.cols.2))
    }

    # Remove features with too many zeros from the subsets of interest
    num.feature.zeros <- apply(data,2,function(col) sum(col==0))
    CalcThresholdForError = ((CA$errthresh)^(1/nsubj))*nsubj		#If there is not enough data
    zero_features <- which(num.feature.zeros > CalcThresholdForError)
    if(length(zero_features)>0){
      ErrMsg = paste0(
        "Feature(s) ",
        toString(colnames(data)[intersect(zero_features,col.subset)]),
        " have more zeros than the threshold of ",
        trunc(CalcThresholdForError),
        " zeros.  Excluding from output (will still be used in normalizing.)"
      )
      warning(ErrMsg)
      col.subset <- setdiff(col.subset,zero_features)
    }

    # The matrix of possible bootstrap rows (which when multiplied by data give a specific row)
    #  of the form with all 0s except for one 1 in each row
    possible.rows = diag(rep(1,nsubj))
    if(!CA$memory.optimize){
    CA <- bootstrap.procedure.simple (n.features,nsubj,n.iter,data,possible.rows,col.subset,CA)
    }
    # Assume we are memory-optimizing the application, we do not create the bootstrap and permutation matrices all
    # at once, but instead compute them one by one
    else{
      CA <- bootstrap.procedure.memory.optimized(n.features,nsubj,n.iter,data,possible.rows,col.subset,CA)
    }
    log.processing.progress(CA,"Calculating q-values")
    CA <- calculate_q_values(CA)						#Calculate the QValues
    CA$q.values[lower.tri(CA$q.values)] = t(CA$q.values)[lower.tri(t(CA$q.values))]  #Making the q.values matrix symmetrical for the one dataset case



    #********************************************************************
    #*  Final Edits before exiting                                      *
    #********************************************************************
    diag(CA$q.values) <- NA											#Set diagonal of q.values to NA
    colnames(CA$p.values)<-colnames(CA$data1.norm)						#Set the names of the columns in the p.values matrix
    rownames(CA$p.values)<-colnames(CA$data1.norm)						#Set the names of the rows in the p.values matrix
    colnames(CA$q.values)<-colnames(CA$data1.norm)						#Set the names of the columns in the q.values matrix
    rownames(CA$q.values)<-colnames(CA$data1.norm)						#Set the names of the rows in the q.values matrix
    colnames(CA$sim.score)<-colnames(CA$data1.norm)							#Set the names of the columns in the q.values matrix
    rownames(CA$sim.score)<-colnames(CA$data1.norm)							#Set the names of the rows in the q.values matrix
    colnames(CA$z.stat) <- colnames(CA$data1.norm)						#Set the names of the cols in the z.stat matrix
    rownames(CA$z.stat) <- colnames(CA$data1.norm)						#Set the names of the rows in the z.stat matrix
    diag(CA$p.values) <- NA											#Set diagonal of p.values to NA
    diag(CA$z.stat)   <- NA                                          	#Set diagonal of z.stat
    ######################CA$sim.score <- NULL                        #Not needed to be NUll

    if (length(CA$filtered.subset.cols.1) > 0)				#If used a subset - present only the subset
    {
      CA$p.values <- CA$p.values[col.subset,col.subset]   #Display only the subset of cols and rows
      CA$q.values <- CA$q.values[col.subset,col.subset]   #Display only the subset of cols and rows
      CA$sim.score <- CA$sim.score[col.subset,col.subset]   #Display only the subset of cols and rows
      CA$z.stat    <- CA$z.stat[col.subset,col.subset]
    }
    CA <- clean_common_area_after_processing(CA)	#Clean the Common Area before returning to the User




    return(CA)														# Return the output matrix
  }


ccrepe_process_two_datasets <- function(data1.norm,data2.norm,n.iter, CA)
  #*************************************************************************************
  #* 	ccrepe function for two datasets                                                 *
  #*************************************************************************************
{
  # Get number of bugs, subjects for each dataset
  n1.features = ncol(data1.norm)
  n2.features = ncol(data2.norm)

  log.processing.progress(CA,"Two datasets: Initial merge of the matrices")  #Log progress
  data = merge_two_matrices(data1.norm,data2.norm)
  data1 <- data[,1:n1.features]
  data2 <- data[,(n1.features+1):(n1.features+n2.features)]
  nsubj = nrow(data)


  # Subset of columns for the bootstrapped dataset
  col.subset <- 1:(n1.features+n2.features)
  n1.subset <- n1.features
  n2.subset <- n2.features
  if( length(CA$filtered.subset.cols.1) > 0 )		#If the User entered a subset of columns
  {
    col.subset <- CA$filtered.subset.cols.1
    n1.subset  <- length(CA$filtered.subset.cols.1)

    if(length(CA$subset.cols.2) > 0)
    {
      col.subset <- c(col.subset,n1.features+CA$filtered.subset.cols.2)
      n2.subset <- length(CA$filtered.subset.cols.2)
    }
  }

  # Remove features with too many zeros from the subsets of interest
  num.feature.zeros <- apply(data,2,function(col) sum(col==0))
  CalcThresholdForError = ((CA$errthresh)^(1/nsubj))*nsubj		#If there is not enough data
  zero_features <- which(num.feature.zeros > CalcThresholdForError)
  if(length(zero_features)>0){
    zero_features_data1 <- zero_features[which(zero_features <= n1.features)]
    zero_features_data2 <- zero_features[which(zero_features > n1.features)]
    ErrMsg = paste0(
      "Feature(s) ",
      toString(colnames(data)[intersect(zero_features_data1,col.subset)]),
      " in dataset 1 and feature(s) ",
      toString(colnames(data)[intersect(zero_features_data2,col.subset)]),
      " in dataset 2 have more zeros than the threshold of ",
      trunc(CalcThresholdForError),
      " zeros.  Excluding from output (will still be used in normalizing.)"
    )
    warning(ErrMsg)
    col.subset <- setdiff(col.subset,zero_features)
    n1.subset <- sum(col.subset <= n1.features)
    n2.subset <- sum(col.subset > n1.features)
  }
  # Subset of columns for the permutation datasets



  # The matrix of possible bootstrap rows (which when multiplied by data give a specific row); of the form with all 0s except for one 1 in each row
  possible.rows = diag(rep(1,nsubj))

  #Generating the bootstrap and permutation matrices
  bootstrap.matrices   = list()  # The list of matrices of possible bootstrap rows (each matrix multiplies by the data to give a resampled dataset)
  permutation.matrices1 = list()  # The list of permutation matrices; each matrix will be have columns which are permutations of the row indices
  permutation.matrices2 = list()  # The list of permutation matrices; each matrix will be have columns which are permutations of the row indices


  #*****************************************************
  #*  Pre Allocate Matrices                            *
  #*****************************************************
  bootstrap.matrices = vector("list", n.iter)
  permutation.matrices1 = vector("list", n.iter)
  permutation.matrices2 = vector("list", n.iter)

  for(i in seq_len(n.iter)){

    if (i %% CA$iterations.gap == 0)   #If output is verbose and the number of iterations is multiple of iterations gap - print status
    {
      print.msg = paste('Sampled ',i,' permutation and bootstrap datasets')
      log.processing.progress(CA,print.msg)  #Log progress
    }


    # Get the rows of the two possible.rows matrices; these correspond to the rows which will be included in the resampled datasets
    boot.rowids        = sample(seq(1,nsubj),nsubj,replace=TRUE)

    # Generate the bootstrap matrices by getting the appropriate rows from the possible bootstrap rows
    boot.matrix        = possible.rows[boot.rowids,]

    # Add the bootstrap matrices to the appropriate lists

    bootstrap.matrices[[ i ]] = boot.matrix # Add the bootstrap matrix to the list
    perm.matrix1 = replicate(n1.features,sample(seq(1,nsubj),nsubj,replace=FALSE)) # (Replaced nsubj1 with nsubj) The matrix has each column be a permutation of the row indices

    permutation.matrices1 [[ i ]] = perm.matrix1      # Add the new matrix to the list

    perm.matrix2 = replicate(n2.features,sample(seq(1,nsubj),nsubj,replace=FALSE)) # The matrix has each column be a permutation of the row indices
    permutation.matrices2 [[ i ]] = perm.matrix2     # Add the new matrix to the list

  }

  # The bootstrapped data; resample the data using each bootstrap matrix

  log.processing.progress(CA,"Generating boot data")  #Log progress
  boot.data = lapply(bootstrap.matrices,resample,data=data)



  log.processing.progress(CA,"Generating permutation data")  #Log progress
  # Generating the permutation data; permute the data using each permutation matrix
  permutation.data1 = lapply(permutation.matrices1,permute,data=data1)
  permutation.data2 = lapply(permutation.matrices2,permute,data=data2)

  if(CA$renormalize){
    # The normalized permutation data; the permutation data needs to be renormalized, but not the bootstrap data
    permutation.norm1 = lapply(permutation.data1,ccrepe_norm)
    permutation.norm2 = lapply(permutation.data2,ccrepe_norm)
  }
  else{
    permutation.norm1=permutation.data1
    permutation.norm2=permutation.data2
  }

  # The correlation matrices of the permuted and bootstrapped data; see extractCor function for more details
  # mapply is a function that applies over two lists, applying extractCor to the first element of each, then the
  # second element of each, etc.


  log.processing.progress(CA,paste("Calculating permutation similarity scores: col.subset =",toString(col.subset)))  #Log progress
  CA$verbose.requested = FALSE			#If the User requested verbose output - turn it off temporarily
  if (CA$verbose == TRUE)
  {
    CA$verbose.requested <- TRUE		#Turn of the verbose flag save variable
    CA$verbose <- FALSE					#But pass non verbose to the CA$method (Could be nc.score or anything else)
  }

  permutation.cor = mapply(extractCor,
                           mat1=permutation.norm1,
                           mat2=permutation.norm2,
                           startrow=1,
                           endrow=n1.subset,
                           startcol=(n1.subset+1),
                           endcol=length(col.subset),
                           MoreArgs = list(col.subset = col.subset,
                                           my.method=CA$method,
                                           method.args=CA$method.args,
                                           outdist=CA$outdist,
                                           outdistFile=CA$outdistFile),
                           SIMPLIFY=FALSE)


  #********************************************************************************
  #  Reboot times = RT =  -round(log2(CA$errthresh))                              *
  # This is the maximum number of iterations to try to reboot a matrix if in a col*
  # all values are 0                                                              *
  #                                                                               *
  # For each matrix, check if there is a column that is all zeros 		        *
  # If such matrix is found,  try to reboot it at most RT times until such matrix  *
  # is found that does not contain a column with all zeros                         *
  # The limit of RT tries is now hard coded and needs to be re-evaluated           *
  # If after RT times there is no success - we stop the run                       *
  #********************************************************************************

  log.processing.progress(CA,paste("Calculating boot similarity scores: col.subset  =",toString(col.subset)))  #Log progress
  boot.cor  = lapply(boot.data,bootstrap.method.calcn,nsubj,data,CA,col.subset )		 ###Function to check is all cols are zeros and apply cor

  # Now calculating the correlations and p-values between the two datasets
  log.processing.progress(CA,"Calculating p-values")  #Log progress



  CA$p.values <-matrix(data=NA,nrow=n1.features,ncol=n2.features)	#Build the empty p.values matrix
  CA$z.stat  <-matrix(data=NA,nrow=n1.features,ncol=n2.features)		#Build the empty z.stat matrix

  CA$sim.score <-matrix(data=NA,nrow=n1.features,ncol=n2.features)	#Build the empty correlation matrix

  loop.range1 <- which(col.subset %in% 1:n1.features)						#Establish looping range default

  if ( length(CA$filtered.subset.cols.1)> 0 )		#If the User entered a subset of columns
  {
    loop.range1 <- which(col.subset %in% CA$filtered.subset.cols.1)		#Use the subset of columns
  }

  loop.range2 <- which( col.subset %in% (n1.features + 1:n2.features)) - n1.subset						#Establish looping range default

  if ( length(CA$filtered.subset.cols.2) > 0 )		#If the User entered a subset of columns
  {
    loop.range2 <- which(col.subset %in% (n1.features + CA$filtered.subset.cols.2)) - n1.subset		#Use the subset of columns
  }

  max.comparisons <- n1.features*n2.features
  if    (!is.na(CA$outdist))                                              #If user requested to print the distributions
  {
    output.string0 <- paste0("iter_",seq_len(length(boot.cor)),collapse=",")
    output.string  <- paste0("dist_type,i,k,feature1,feature2,",output.string0,'\n', sep='',collapse=",")

    cat(output.string,file=CA$outdistFile,append=TRUE)
  }


  internal.loop.counter = 0   # Initialize the counter


  for(index1 in seq_len(length(loop.range1)))					#Modified seq_along to seq_len: Need to review....
  {
    i = loop.range1[index1]
    for(index2 in seq_len(length(loop.range2)))				#Modified seq_along to seq_len: Need to review....
    {
      k = loop.range2[index2]
      # Get a vector of the (i,k)th element of each correlation matrix in the list of bootstrapped data; this is the bootstrap distribution

      internal.loop.counter = internal.loop.counter + 1  #Increment the loop counter
      if (internal.loop.counter %% CA$iterations.gap == 0)   #If output is verbose and the number of iterations is multiple
        #of iterations gap - print status
      {
        print.msg = paste('Completed ', internal.loop.counter, ' comparisons')
        log.processing.progress(CA,print.msg)  #Log progress
      }

      bootstrap.dist = unlist(lapply(boot.cor,'[',i,n1.subset+k))
      bootstrap.dist[is.na(bootstrap.dist)] <- 0				#If there is an NA in bootstrap.dist - replace with 0 (Needs
      #review)
      # Get a vector the (i,k)th element of each correlation matrix in the list of permuted data; this is the permuted distribution
      permutation.dist = unlist(lapply(permutation.cor,'[',i,k))

      if    (!is.na(CA$outdist))						#If user requested to print the distributions
      {
        RC <- print.dist(bootstrap.dist,permutation.dist,CA,col.subset[i],col.subset[n1.subset+k] - n1.features)
      }
      measure.parameter.list <- append(list(x=data[,col.subset[i]],y=data[,col.subset[n1.subset+k]]), CA$sim.score.parameters)  #build the method
      #do.call parameter list
      sim.score.value <- do.call(CA$method,measure.parameter.list)	#Invoke the measuring function  <-  Was cor

      z.stat_and_p.value.list  <- calculate.z.stat.and.p.value(bootstrap.dist,permutation.dist )  #Invoke new function to calculate it
      #Same calculations - just packed as a function
      z.stat <- z.stat_and_p.value.list$z.stat		#Retrieve value form the list
      p.value <- z.stat_and_p.value.list$p.value	    #Retrieve from the list




      CA$p.values[col.subset[i],col.subset[n1.subset+k]-n1.features] = p.value				#Post it in the p.values matrix
      CA$z.stat[col.subset[i],col.subset[n1.subset+k]-n1.features] = z.stat					#Post it in the z.stat matrix

      CA$sim.score[col.subset[i],col.subset[n1.subset+k]-n1.features] = sim.score.value			    #Post it in the cor matrix   <--Was cor

    }
  }



  CA <- calculate_q_values(CA)						#Calculate the QValues
  #********************************************************************
  #*  Final Edits before exiting                                      *
  #********************************************************************
  rownames(CA$p.values) <- colnames(CA$data1.norm)		#Post the column names
  colnames(CA$p.values) <- colnames(CA$data2.norm)		#Post the column names

  rownames(CA$sim.score) <- colnames(CA$data1.norm)		#Post the column names
  colnames(CA$sim.score) <- colnames(CA$data2.norm)		#Post the column names
  rownames(CA$q.values) <- colnames(CA$data1.norm)		#Post the column names
  colnames(CA$q.values) <- colnames(CA$data2.norm)		#Post the column names
  rownames(CA$z.stat) <- colnames(CA$data1.norm)		#Post the column names
  colnames(CA$z.stat) <- colnames(CA$data2.norm)		#Post the column names

  total.rows.to.display = 1:nrow(CA$p.values)				#Number of rows to display
  total.cols.to.display = 1:ncol(CA$p.values)				#Number of cols to display

  if  (length(CA$filtered.subset.cols.1) > 0) #If User selected a subset
  {
    total.rows.to.display = CA$filtered.subset.cols.1
  }
  if  (length(CA$filtered.subset.cols.2) > 0) #If User selected a subset
  {
    total.cols.to.display = CA$filtered.subset.cols.2
  }
  CA$p.values <- CA$p.values[total.rows.to.display,total.cols.to.display]   #Display only the subset of cols and rows
  CA$q.values <- CA$q.values[total.rows.to.display,total.cols.to.display]   #Display only the subset of cols and rows
  CA$sim.score <- CA$sim.score[total.rows.to.display,total.cols.to.display]   #Display only the subset of cols and rows
  CA$z.stat    <- CA$z.stat[total.rows.to.display,total.cols.to.display]
  CA <- clean_common_area_after_processing(CA)	#Clean the Common Area before returning to the User

  return(CA)			# Return the output matrix
}



ccrepe.calculations  <-
  #*************************************************************************************
  #*  	ccrepe calculations                                                          *
  #*************************************************************************************
  function(CA)
  {
    Output <-list()

    CA <- DecodeInputCAArguments(CA)							#Decode Input Parameters
    CA <- preprocess_data(CA)

    if (CA$OneDataset == TRUE)
    {
      CA  = ccrepe_process_one_dataset(CA$data1.norm,CA$iterations, CA)  				#Invoke the new function
    }
    else
    {
      CA = ccrepe_process_two_datasets  (CA$data1.norm ,CA$data2.norm ,CA$iterations, CA)
    }
    return ( CA )
  }






DecodeInputCAArguments <-
  function(CA){
    #*************************************************************************************
    #*	Decode and validate input parameters                                             *
    #*************************************************************************************

    CA$Gamma = 0.57721566490153286060651209008240243104215933593992  		#Euler's Gamma


    if (length(CA$data1) == 1)					#If the user did not select at least one input dataframe - Stop the run
    {
      if (is.na(CA$data1))
        stop('You must select at least one input data frame (data1<-YourData)')
    }
    CA$OneDataset <- FALSE						#Set the symmmetrical flag to False (Will be true if data2=data1)
    if (length(CA$data2) == 1)					#If user did not enter data2 - we assume he wants data2=data1
      if (is.na(CA$data2))
      {
        CA$OneDataset <- TRUE					#And set up the Symmetrical flag
      }



    if  (!class(CA$subset.cols.1   ) == "numeric"  && !(is.null(CA$subset.cols.1)))     #Modified the check for subset.cols
    {                                           #If class is not "numeric" - set up to NULL
      ErrMsg = paste0(                            #This replaces the old checks  -   length(CA$subset.cols.1) == 0 && (is.null(CA$subset.cols.1))
        "Subset columns1 is not numeric - was : "  ,
        CA$subset.cols.1,
        " - Set to NULL"
      )
      warning(ErrMsg)
      CA$subset.cols.1 = NULL					#Set to default
    }
    if  (!class(CA$subset.cols.2  ) == "numeric"  && !(is.null(CA$subset.cols.2)))     #Modified the check for subset.cols
    {                                           #If class is not "numeric" - set up to NULL
      ErrMsg = paste0(                            #This replaces the old checks  -   length(CA$subset.cols.1) == 0 && (is.null(CA$subset.cols.1))
        "Subset columns2 is not numeric - was : "  ,
        CA$subset.cols.2,
        " - Set to NULL"
      )
      warning(ErrMsg)
      CA$subset.cols.2 = NULL					#Set to default
    }



    if    (!is.na(CA$outdist))							#If the user passed a file - open it
    {
      if(file.exists(CA$outdist)){
        ErrMsg = paste0(
          "Distribution file ",
          CA$outdist,
          " already exists--overwriting."
        )
        warning(ErrMsg)
        file.remove(CA$outdist)
      }
      CA$outdistFile = file(CA$outdist,open='at')		#Open outdist file
    }
    if ( !is.logical(CA$compare.within.x ) )	#(Modify - use is.logical)  compare.within.x must be either true or false
    {
      ErrMsg = paste0("Setting compare.within.x to TRUE - it was ",CA$compare.within.x )
      warning(ErrMsg)
      CA$compare.within.x =  TRUE									#True - is the default
    }


    if ( !is.logical(CA$verbose)  )	#Verbose Flag has to be either true or false, otherwise - we set it to false
    {
      ErrMsg = paste0("Verbose Flag set to False was:", CA$verbose)
      warning(ErrMsg)
      CA$verbose =  FALSE									#False - is the default
    }


    if  ( is.na(suppressWarnings(as.integer(CA$iterations.gap)))   || !(CA$iterations.gap%%1==0) ) 	#Check the iterations gap (Number of iterations after which to print
      #if status  verbose - Cannot be fractional
    {
      ErrMsg = paste0("iterations.gap set to 100 -  was: ", CA$iterations.gap)
      warning(ErrMsg)
      CA$iterations.gap = 100 					#If not valid - use 100
    }

    if (CA$memory.optimize && !CA$OneDataset){
      warning("The memory.optimize argument only applies to calculations on one dataset.
              Hence, this computation will not have optimized memory usage.")
    }
    if    (!is.na(CA$outdist) && CA$memory.optimize)                                              #If user requested to print the distributions
    {
      warning("In the memory optimized algorithm, output of the bootstrap distribution is not available")
    }
    CA$sim.score.parameters<-list()							#Initialize the parameter list
    CA$sim.score.parameters <- CA$method.args				#Add the entries in method.args to the measuring parameter list


    CA$retries.max.iterations =  -round(log2(CA$errthresh)) #This is the maximum number of iterations to try to reboot a matrix if in a col all values are 0


    return(CA)			 				#Return list of decoded input parameters
  }




extractCor <-
  function(mat1,mat2,startrow,endrow,startcol,endcol,col.subset,my.method,method.args,outdist,outdistFile,  ...)
    #******************************************************************************************
    # A function to calculate the correlation of the two matrices by merging them,            *
    #     calculating the correlation of the merged matrix, and extracting the appropriate    *
    #     submatrix to obtain the correlation of interest                                     *
    # startrow, endrow, startcol, endcol give the indeces of the submatrix to extract         *
    # The method and the method parameters are passed via the list
    #******************************************************************************************
  {
    mat <- merge_two_matrices(mat1,mat2)[,col.subset]	            #Merge the two matrices
    measure.function.parm.list <- append(list(x=mat), method.args)
    mat_C <-do.call(my.method,measure.function.parm.list)	#Invoke the measuring fnction
    if(length(startrow:endrow)==1){
      sub_mat_C <- matrix(mat_C[startrow:endrow,startcol:endcol],nrow=1)
    } else if(length(startcol:endcol)==1){
      sub_mat_C <- matrix(mat_C[startrow:endrow,startcol:endcol],ncol=1)
    } else {
      sub_mat_C <- mat_C[startrow:endrow, startcol:endcol] # Extract the appropriate submatrix
    }
    return(sub_mat_C)
  }




merge_two_matrices <-
  function(mat1,mat2)
    #*************************************************************************************
    # A function to merge two matrices, which works when they're different dimensions    *
    # merge the two matrices by the rows, merging only rows present in both              *
    # (ensures no missing samples)                                                       *
    # exclude the first column, which is the row names                                   *
    # <----------- Important  !!! --------------->                                       *
    #* The matrices are merged by row name (Which defaults to row number)                *
    #* so the assumption is that SAME ROWS pertain to SAME SUBJECTS                      *
    #* Also not that first column is treated as data - not Subject.ID or any other       *
    #* identifier                                                                        *
    #*************************************************************************************
  {
    mat <- merge(mat1,mat2,by="row.names")
    rownames(mat) = mat[,1]    # The first column has the row names
    mat = mat[,-1]             # Remove the first column
    return(mat)
  }




lappend <-
  function(lst, obj) {
    #*************************************************************************************
    #*  Function to append obj to lst, where lst is a list                               *
    #*************************************************************************************
    lst[[length(lst)+1]] <- obj
    return(lst)
  }





permute <-
  function(data,permute.id.matrix){
    #*************************************************************************************
    #* Function to permute the data using a matrix of indices                            *
    #* data is the data which you want to permute                                        *
    #* permute.id.matrix is a matrix where each column is some                           *
    #* permutation of the row indices                                                    *
    #*************************************************************************************
    permute.data = data
    for(i in 1:ncol(data)){									# For each column
      permute.data[,i] = data[permute.id.matrix[,i],i] 	# Replace the original column with a permuted one
    }
    return(permute.data)
  }



filter_dataset <-
  function(X){
    #**********************************************************************
    #	Filter out missing rows and rows/cols that are all zero          *
    #**********************************************************************
    missing_rows <- which(
      apply(
        X,
        1,
        function(row){
          sum(is.na(row))
        }
      ) > 0
    )
    if(length(missing_rows)>0){
      X_no_missing <- X[-missing_rows,]
    } else {
      X_no_missing <- X
    }
    raw_all_zero_rows <- which(
      apply(
        X,
        1,
        function(row){
          sum(row==0)/length(row)
        }
      )==1
    )
    all_zero_rows <- which(rownames(X_no_missing) %in% rownames(X)[raw_all_zero_rows])
    raw_all_zero_cols <- which(
      apply(
        X_no_missing,
        2,
        function(col){
          sum(col==0)/length(col)
        }
      )==1
    )
    all_zero_cols <- which(colnames(X_no_missing) %in% colnames(X)[raw_all_zero_cols])
    if(length(all_zero_rows)>0 && length(all_zero_cols)>0){
      X_filtered <- X_no_missing[-all_zero_rows,-all_zero_cols]
    } else if( length(all_zero_rows) > 0 ){
      X_filtered <- X_no_missing[-all_zero_rows,]
    } else if( length(all_zero_cols) > 0 ){
      X_filtered <- X_no_missing[,-all_zero_cols]
    } else {
      X_filtered <- X_no_missing
    }

    return(
      list(
        X_filtered    = X_filtered,
        all_zero_rows = raw_all_zero_rows,
        all_zero_cols = raw_all_zero_cols,
        missing_rows  = missing_rows
      )
    )
  }

check_data_values <-
  function(CA){
    #**********************************************************************
    #	Check that all data values are valid 				                              *
    #**********************************************************************
    # Check non-negativity
    num.neg <- sum(CA$data1[!is.na(CA$data1)] < 0)
    if( num.neg > 0 ){
      ErrMsg <- paste(
        "There are",
        num.neg,
        "negative values in x.  CCREPE only uses non-negative values--stopping run.")
      stop(ErrMsg)
    }
    # Check simplex
    if(CA$renormalize){
      num.gt.1 <- sum(CA$data1[!is.na(CA$data1)] > 1)
      if( num.gt.1 > 0){
        ErrMsg <- paste(
          "x appears to be count data or absolute abundances, since there are",
          num.gt.1,
          "values greater than 1.",
          'If you want to run CCREPE with absolute abundaces, please specify renormalize=FALSE'
        )
        warning(ErrMsg)
      }
    }
    if(!CA$OneDataset){
      # Check non-negativity
      num.neg2 <- sum(CA$data2[!is.na(CA$data2)] < 0)
      if( num.neg2 > 0 ){
        ErrMsg <- paste(
          "There are",
          num.neg2,
          "negative values in y.  CCREPE only uses non-negative values--stopping run.")
      }
      # Check simplex
      if(CA$renormalize){
        num2.gt.1 <- sum(CA$data2[!is.na(CA$data2)] > 1)
        if( num2.gt.1 > 0){
          ErrMsg <- paste(
            "y appears to be count data, since there are",
            num2.gt.1,
            "values greater than 1."
          )
          warning(ErrMsg)
        }
      }
    }
  }

preprocess_data <-
  function(CA)
    #**********************************************************************
    #	Preprocess input data 				                              *
    #**********************************************************************
  {
    check_data_values(CA)

    if(CA$OneDataset){
      if(is.null(colnames(CA$data1))) colnames(CA$data1) <- seq_len(ncol(CA$data1))
      if(is.null(rownames(CA$data1))) rownames(CA$data1) <- seq_len(nrow(CA$data1))

      filtered_data1 <- filter_dataset(CA$data1)
    } else {
      if(is.null(colnames(CA$data1))) colnames(CA$data1) <- paste0(seq_len(ncol(CA$data1)),".X")
      if(is.null(rownames(CA$data1))) rownames(CA$data1) <- seq_len(nrow(CA$data1))
      if(is.null(colnames(CA$data2))) colnames(CA$data2) <- paste0(seq_len(ncol(CA$data2)),".Y")
      if(is.null(rownames(CA$data2))) rownames(CA$data2) <- seq_len(nrow(CA$data2))

      filtered_data1 <- filter_dataset(CA$data1)
      filtered_data2 <- filter_dataset(CA$data2)
    }

    if(length(filtered_data1$missing_rows)>0){
      ErrMsg = paste(
        "Excluding subject(s)",
        toString(rownames(CA$data1)[filtered_data1$missing_rows]),
        "from x because they have missing values."
      )
      warning(ErrMsg)
    }

    if(length(filtered_data1$all_zero_rows)>0){
      ErrMsg = paste(
        "Excluding subject(s)",
        toString(rownames(CA$data1)[filtered_data1$all_zero_rows]),
        "from x because they are entirely zero."
      )
      warning(ErrMsg)
    }

    if(length(filtered_data1$all_zero_cols)>0){
      ErrMsg = paste(
        "Excluding feature(s)",
        toString(colnames(CA$data1)[filtered_data1$all_zero_cols]),
        "from x because they are entirely zero."
      )
      warning(ErrMsg)
    }

    if(CA$OneDataset){
      if(nrow(filtered_data1$X_filtered) < CA$min.subj )
      {
        ErrMsg = paste0(
          'Not enough data - found ',
          nrow(filtered_data1$X_filtered),
          ' rows of non-missing data - Less than  ',
          CA$min.subj,
          ' (=min.subj) - Run Stopped'
        )
        stop(ErrMsg)
      }


      if(CA$renormalize){
        CA$data1.norm = filtered_data1$X_filtered/rowSums(filtered_data1$X_filtered)
      }
      else{
        CA$data1.norm = filtered_data1$X_filtered
      }
      CA$filtered.subset.cols.1 <- NULL
      CA$filtered.subset.cols.2 <- NULL
      if(!is.null(CA$subset.cols.1)){
        CA$filtered.subset.cols.1 <- which(colnames(CA$data1.norm) %in% colnames(CA$data1)[CA$subset.cols.1])
        if( length(CA$filtered.subset.cols.1)==0 ){
          ErrMsg = paste0(
            "User requested subset.cols.x=c(",
            toString(CA$subset.cols.1),
            "), but all are filtered.  Setting subset.cols.x=NULL."
          )
          warning(ErrMsg)
          CA$subset.cols.1 <- NULL
          CA$filtered.subset.cols.1 <- NULL
        }
      }
      if(!is.null(CA$subset.cols.2)){
        CA$filtered.subset.cols.2 <- which(colnames(CA$data1.norm) %in% colnames(CA$data1)[CA$subset.cols.2])
        if( length(CA$filtered.subset.cols.2)==0 ){
          ErrMsg = paste0(
            "User requested subset.cols.y=c(",
            toString(CA$subset.cols.2),
            "), but all are filtered.  Setting subset.cols.y=NULL"
          )
          warning(ErrMsg)
          CA$subset.cols.2 <- NULL
          CA$filtered.subset.cols.2 <- NULL
        }
      }

    } else {

      if(length(filtered_data2$missing_rows)>0){
        ErrMsg = paste(
          "Excluding subject(s)",
          toString(rownames(CA$data2)[filtered_data2$missing_rows]),
          "from y because they have missing values."
        )
        warning(ErrMsg)
      }

      if(length(filtered_data2$all_zero_rows)>0){
        ErrMsg = paste(
          "Excluding subject(s)",
          toString(rownames(CA$data2)[filtered_data2$all_zero_rows]),
          "from y because they are entirely zero."
        )
        warning(ErrMsg)
      }

      if(length(filtered_data2$all_zero_cols)>0){
        ErrMsg = paste(
          "Excluding feature(s)",
          toString(colnames(CA$data2)[filtered_data2$all_zero_cols]),
          "from y because they are entirely zero."
        )
        warning(ErrMsg)
      }

      if(CA$renormalize){
        CA$data1.norm = filtered_data1$X_filtered/rowSums(filtered_data1$X_filtered)
        CA$data2.norm = filtered_data2$X_filtered/rowSums(filtered_data2$X_filtered)
      }
      else{
        CA$data1.norm = filtered_data1$X_filtered
        CA$data2.norm = filtered_data2$X_filtered
      }
      remove_from_d1 <- which(
        rownames(CA$data1.norm) %in% setdiff(rownames(CA$data1.norm),rownames(CA$data2.norm))
      )
      remove_from_d2 <- which(
        rownames(CA$data2.norm) %in% setdiff(rownames(CA$data2.norm),rownames(CA$data1.norm))
      )
      if(length(remove_from_d1) > 0 && length(remove_from_d2) > 0){
        ErrMsg = paste0("Removing subjects ",
                        toString(rownames(CA$data1.norm)[remove_from_d1]),
                        " from dataset x and subjects ",
                        toString(rownames(CA$data2.norm)[remove_from_d2]),
                        " from dataset y because they are not paired between the datasets."
        )
        warning(ErrMsg)
        CA$data1.norm <- CA$data1.norm[-remove_from_d1,]
        CA$data2.norm <- CA$data2.norm[-remove_from_d2,]
      } else if( length(remove_from_d1) > 0 ){
        ErrMsg = paste0(
          "Removing subjects ",
          toString(rownames(CA$data1.norm)[remove_from_d1]),
          " from dataset x because they are not in dataset y."
        )
        warning(ErrMsg)
        CA$data1.norm <- CA$data1.norm[-remove_from_d1,]
      } else if( length(remove_from_d2) > 0 ){
        ErrMsg = paste0(
          "Removing subjects ",
          toString(rownames(CA$data2.norm)[remove_from_d2]),
          " from dataset y because they are not in dataset x."
        )
        warning(ErrMsg)
        CA$data2.norm <- CA$data2.norm[-remove_from_d2,]
      }

      merged_data <- merge_two_matrices(CA$data1.norm,CA$data2.norm)
      if(nrow(merged_data) < CA$min.subj )
      {
        ErrMsg = paste0(
          'Not enough data - found ',
          nrow(merged_data),
          ' rows paired of non-missing data - Less than  ',
          CA$min.subj,
          ' (=min.subj) - Run Stopped'
        )
        stop(ErrMsg)
      }

      CA$filtered.subset.cols.1 <- NULL
      CA$filtered.subset.cols.2 <- NULL
      if(!is.null(CA$subset.cols.1)){
        CA$filtered.subset.cols.1 <- which(colnames(CA$data1.norm) %in% colnames(CA$data1)[CA$subset.cols.1])
        if( length(CA$filtered.subset.cols.1)==0 ){
          ErrMsg = paste0(
            "User requested subset.cols.x=c(",
            toString(CA$subset.cols.1),
            "), but all are filtered.  Setting subset.cols.x=NULL."
          )
          warning(ErrMsg)
          CA$subset.cols.1 <- NULL
          CA$filtered.subset.cols.1 <- NULL
        }
      }
      if(!is.null(CA$subset.cols.2)){
        CA$filtered.subset.cols.2 <- which(colnames(CA$data2.norm) %in% colnames(CA$data2)[CA$subset.cols.2])
        if( length(CA$filtered.subset.cols.2)==0 ){
          ErrMsg = paste0(
            "User requested subset.cols.y=c(",
            toString(CA$subset.cols.2),
            "), but all are filtered.  Setting subset.cols.y=NULL."
          )
          warning(ErrMsg)
          CA$subset.cols.2 <- NULL
          CA$filtered.subset.cols.2 <- NULL
        }
      }


    }
    return(CA)

  }





print.dist <-
  function(bootstrap.dist,permutation.dist,  CA,i, k)	{
    #*************************************************************************************
    #*  Function to print the distribution                                               *
    #*************************************************************************************
    outdistFile = CA$outdistFile 						#Output file

    output.string0 <- paste(bootstrap.dist,sep="",collapse=',')		#Build the output string
    if (CA$OneDataset == TRUE)							#The structure of the output is different for one dataset and two datasets
    {output.string <-paste("Boot,",i,",",k,",",colnames(CA$data1.norm)[i],",",colnames(CA$data1.norm)[k],",",output.string0,'\n', sep='',collapse=",")}
    else
    {output.string <-paste("Boot,",i,",",k,",",colnames(CA$data1.norm)[i],",",colnames(CA$data2.norm)[k],",",output.string0,'\n', sep='',collapse=",")}

    cat(output.string,file=CA$outdistFile,append=TRUE)

    output.string0 <- paste(permutation.dist,sep="",collapse=',')		#Build the output string
    if (CA$OneDataset == TRUE)							#The structure of the output is different for one dataset and two datasets
    {output.string <-paste("Permutation,",i,",",k,",",colnames(CA$data1.norm)[i],",",colnames(CA$data1.norm)[k],",",output.string0,'\n', sep='',collapse=",")}
    else
    {output.string <-paste("Permutation,",i,",",k,",",colnames(CA$data1.norm)[i],",",colnames(CA$data2.norm)[k],",",output.string0,'\n', sep='',collapse=",")}
    cat(output.string,file=CA$outdistFile,append=TRUE)


    return (0)
  }







resample <-
  function(data,resample.matrix){
    #*************************************************************************************
    #* Function to calculate the resampled data matrix; this                             *
    #*    is used for getting only the bootstrapped data                                 *
    #*    data is the data which you want to bootstrap                                   *
    #*   resample.matrix is the matrix appropriate for getting                           *
    #*    you the bootstrapped dataset: matrix is square, with                           *
    #*    the same number of rows as data; each row has exactly one                      *
    #*    1 in it                                                                        *
    #*************************************************************************************
    data.matrix <-as.matrix(data)
    return(resample.matrix%*%data.matrix)
  }




clean_common_area_after_processing <-
  function(CA){
    #*************************************************************************************
    #* 	Function to clean the Common Area After processing                               *
    #*  Common Area is passed to the User as results                                     *
    #*************************************************************************************
    if   (!is.na(CA$outdist))										#If user requested to print the distributions
    {
      close(CA$outdistFile)										#Close outdist file
      CA$outdistFile <- NULL										#And remove it from the common area
    }
    if (CA$verbose.requested == TRUE)								#Check if the User requested verbose
    {															#We might have turned it off before calling CA$method
      CA$verbose = TRUE
    }
    CA$verbose.requested = NULL
    CA$data1 <- NULL
    CA$data1.norm <- NULL
    CA$data2 <- NULL
    CA$data2.norm <- NULL
    CA$method <- NULL
    CA$method.args<- NULL
    CA$OneDataset <- NULL
    CA$outdist <- NULL
    CA$renormalize <- NULL
    CA$memory.optimize <- NULL
    CA$Gamma <- NULL
    CA$retries.max.iterations <- NULL
    CA$subset.cols.x <-CA$subset.cols.1
    CA$subset.cols.y <-CA$subset.cols.2
    CA$subset.cols.1 <- NULL
    CA$subset.cols.2 <- NULL
    CA$filtered.subset.cols.1 <- NULL
    CA$filtered.subset.cols.2 <- NULL


    if  ( length(CA$subset.cols.x) == 1 &&  CA$subset.cols.x == c(0))			#If NA - set to default
    {
      CA$subset.cols.x <-NULL					#Set to default
    }
    if  ( length(CA$subset.cols.y) == 1 &&  CA$subset.cols.y == c(0))			#If NA - set to default
    {
      CA$subset.cols.y <-NULL					#Set to default
    }



    if (!CA$verbose == TRUE)
    {
      CA$min.subj <- NULL
      CA$iterations <- NULL
      CA$method.args <- NULL
      CA$verbose <- NULL
      CA$iterations.gap <- NULL
      CA$compare.within.x <- NULL
      CA$sim.score.parameters <- NULL
      CA$subset.cols.x <- NULL
      CA$subset.cols.y <- NULL
      CA$errthresh <- NULL
    }




    return(CA)
  }



bootstrap.method.calcn <-
  function(subsetted.matrix,nsubj,original.data,CA,col.subset){
    #*************************************************************************************
    #* 	Function to calculate cor  and check that total in cols is not zero              *
    #*************************************************************************************
    check.col.sums <- colSums(subsetted.matrix[,col.subset])==0
    if (length(check.col.sums[check.col.sums==TRUE]))  		#If there is a column that is all zeros - try to reboot data CA$retries.max.iterations=-round(log2(CA$errthresh)) times
    {
      cnt.tries = 0							#Initialize counter of  tries that we will try to reboot
      while(cnt.tries < CA$retries.max.iterations  && length(check.col.sums[check.col.sums==TRUE]))  #Check if we succeeded rebooting the data so no cols are zero
      {
        cnt.tries <- cnt.tries+1			#Increase the counter
        b1 = try.reboot.again (nsubj,original.data)	#Try to reboot again
        check.col.sums <- colSums(b1[,col.subset])==0	#Are there columns with all zeros in the new rebooted matrix?
        if (!length(check.col.sums[check.col.sums==TRUE])) #If there is no column that is all zeros - post the result to continue processing
        {subsetted.matrix <- b1}						#Post the result
      }
      if (cnt.tries > CA$retries.max.iterations  && length(check.col.sums[check.col.sums==TRUE]))	#If reboot did not work - Stop the run
      {
        ErrMsg = paste('Tried to reboot the original.data', CA$retries.max.iterations, 'times but always get a column that is all zeros')  #Error
        warning(ErrMsg)

      }
    }

    measure.function.parm.list <- append(list(x=subsetted.matrix[,col.subset]), CA$method.args)	#Build the measure function parameter list
    boot.cor <-do.call(CA$method,measure.function.parm.list)	#Invoke the measuring function
    return(boot.cor)
  }

try.reboot.again <-
  function(nsubj,data){
    #*************************************************************************************
    #* 	Function to reboot the data matrix agin so that we don't get cols with all 0     *
    #*************************************************************************************
    possible.rows = diag(rep(1,nsubj))
    boot.rowids        = sample(seq(1,nsubj),nsubj,replace=TRUE)
    # Generate the bootstrap matrices by getting the appropriate rows from the possible bootstrap rows
    boot.matrix        = possible.rows[boot.rowids,]
    b1<-resample (data,boot.matrix) 		#b1 is the rebooted matrix that we return
    return(b1)
  }

log.processing.progress <-
  function(CA,msg){
    #*************************************************************************************
    #* 	Function to report the progress of processing                                    *
    #*  only if verbose flag is on                                                       *
    #*************************************************************************************
    if (CA$verbose == TRUE  ||  (!is.null(CA$verbose.requested)  && CA$verbose.requested == TRUE ))
      message(cat(date(),' ==> ',msg))			#Display date and time and progress
    return (0)
  }

calculate.z.stat.and.p.value <-
  function(bootstrap.dist,permutation.dist ){
    #*************************************************************************************
    #* Function to calculate the z.stat and p.value                                      *
    #*************************************************************************************
    z.stat_p.value_list = list()		#Define the list
    z.stat_p.value_list$z.stat <- (mean(bootstrap.dist) - mean(permutation.dist))/sqrt(0.5*(var(permutation.dist)+var(bootstrap.dist)))
    z.stat_p.value_list$p.value <- 2*pnorm(-abs(z.stat_p.value_list$z.stat))
    return(z.stat_p.value_list)
  }
AlmaasLab/micInt documentation built on April 1, 2022, 10:37 a.m.