#' @importFrom stats complete.cases na.omit pnorm var
#'
calculate_q_values <- function(CA)
#**********************************************************************
# Calculate Q values *
#**********************************************************************
{
q.values.calc.matrix <- CA$p.values #Set the default p.values for one and two dataset cases
if (CA$OneDataset == TRUE)
{q.values.calc.matrix[lower.tri(q.values.calc.matrix,diag=TRUE)] <- NA} #If One dataset, use only the upper part of the matrix
QValuesArranged <- calculate_q_values_vector(q.values.calc.matrix,CA)
CA$q.values<-QValuesArranged
if (CA$OneDataset){
CA$q.values[lower.tri(CA$q.values)] <- t(CA$q.values)[lower.tri(CA$q.values)]
}
return(CA)
}
calculate_q_values_vector <- function(p.values.vector,CA){
p.values <- p.values.vector
missing_idx <- which(is.na(p.values))
if (length(missing_idx)>0){
non.missing.p.values <- p.values[-missing_idx]
m = length(non.missing.p.values) #m is the total number of p-values
ln_m = log(m) #log m
ln_m_Plus_Gamma = ln_m + CA$Gamma
q.values <- p.values
q.values[-missing_idx] = non.missing.p.values * m * ln_m_Plus_Gamma / rank(non.missing.p.values)
} else {
m = length(p.values)
ln_m = log(m)
ln_m_Plus_Gamma = ln_m + CA$Gamma
q.values <- p.values * m * ln_m_Plus_Gamma / rank(p.values)
}
return(q.values)
}
ccrepe_norm <-
function(data){
#*************************************************************************************
#* A function to normalize a data matrix across the rows *
#*************************************************************************************
if(0 %in% rowSums(data)){
ErrMsg = paste0(
"All-zero subjects generated by permutation: ",
toString(which(rowSums(data)==0)),
". These will be replaced by all zeros after normalization"
)
warning(ErrMsg)
}
data.normalized <- data/rowSums(data) #Normalize
data.normalized[rowSums(data)==0,] <- 0 #Replace all zero-rows (which become NA after normalization) with zeros.
return(data.normalized)
}
bootstrap.procedure.simple <- function(n.features,nsubj,n.iter,data,possible.rows,col.subset,CA) {
#Generating the bootstrap and permutation matrices
bootstrap.matrices = list() # The list of matrices of possible bootstrap rows
#(each matrix multiplies by the data to give a resampled dataset)
permutation.matrices = list() # The list of permutation matrices; each matrix will be
# have columns which are permutations of the row indices
#**************************************************************
#* Pre allocate *
#**************************************************************
permutation.matrices = vector("list", n.iter) #Pre allocate the permutation matrices
bootstrap.matrices = vector("list", n.iter) #Pre allocate
for(i in seq_len(n.iter)){
if (i %% CA$iterations.gap == 0) #If output is verbose and the number of iterations
#is multiple of iterations gap - print status
{
print.msg = paste('Sampled ',i,' permutation and bootstrap datasets')
log.processing.progress(CA,print.msg) #Log progress
}
# Get the rows of the possible.rows matrix; these correspond to the rows
# which will be included in the resampled dataset
boot.rowids = sample(seq(1,nsubj),nsubj,replace=TRUE)
# Generate the bootstrap matrix by getting the appropriate rows from the possible bootstrap rows
boot.matrix = possible.rows[boot.rowids,]
# Add the bootstrap matrix to the list
bootstrap.matrices [[ i ]] = boot.matrix #Add bootstrap matrix
perm.matrix = replicate(n.features,sample(seq(1,nsubj),nsubj,replace=FALSE)) # The matrix has each column be a permutation of the row indices
permutation.matrices [[i]] = perm.matrix #Populate the permutation matrix
}
# The bootstrapped data; resample the data using each bootstrap matrix
log.processing.progress(CA,"Building boot data") #Log progress
boot.data = lapply(bootstrap.matrices,resample,data=data)
### Using bootstrap.method.calcn for the one dataset also
log.processing.progress(CA,paste("Getting similarity score for bootstrap data: col.subset =",toString(col.subset))) #Log progress
boot.cor = lapply(boot.data,bootstrap.method.calcn,nsubj,data,CA,col.subset ) ###Function to check is all cols are zeros and apply cor
log.processing.progress(CA,"Generating permutation data") #Log progress
# Generating the permutation data; permute the data using each permutation matrix
permutation.data = lapply(permutation.matrices,permute,data=data)
# The correlation matrices for the bootstrapped data; calculate the correlation for each resampled dataset
if (CA$renormalize){# The normalized permutation data; the permutation data needs to be renormalized, but not the bootstrap data
permutation.norm.raw = lapply(permutation.data,ccrepe_norm)
}
else{
permutation.norm.raw = permutation.data
}
permutation.norm = lapply(permutation.norm.raw,function(mat) mat[,col.subset])
# The correlation matrices of the permuted data; calculate the correlation for each permuted dataset
CA$verbose.requested = FALSE #If the User requested verbose output - turn it off temporarily
if (CA$verbose == TRUE)
{
CA$verbose.requested <- TRUE #Turn of the verbose flag save variable
CA$verbose <- FALSE #But pass non verbose to the CA$method (Could be nc.score or anything else)
}
log.processing.progress(CA,paste("Calculating permutation similarity score: col.subset =",toString(col.subset))) #Log progress
permutation.cor <- do.call(lapply,c(list(permutation.norm,CA$method), CA$method.args)) #Invoke the measuring function
# Now, actually calculating the correlation p-values within the dataset
log.processing.progress(CA,"Calculating p-values") #Log progress
loop.range1 <- which( col.subset %in% 1:n.features) #Establish looping range default
loop.range2 <- which( col.subset %in% 1:n.features ) #Establish looping range default
max.comparisons <- choose(length(col.subset),2) #The default number of comparisons
if( length(CA$filtered.subset.cols.1) > 0 ) #If the User entered a subset of columns
{
loop.range1 <- which(col.subset %in% CA$filtered.subset.cols.1) #Use that subset
if( CA$compare.within.x ) #If comparing only within subset.cols.x
{
loop.range2 <- which(col.subset %in% CA$filtered.subset.cols.1) #Use subset.cols.x for the inner loop as well
} else if( length(CA$subset.cols.2)>0 ){ #If comparing between x and y and the user input subset.cols.y
loop.range2 <- which(col.subset %in% CA$filtered.subset.cols.2) #Use subset.cols.y for the inner loop
}
}
if (!is.na(CA$outdist)) #If user requested to print the distributions
{
output.string0 <- paste0("iter_",seq_len(length(boot.cor)),collapse=",")
output.string <- paste0("dist_type,i,k,feature1,feature2,",output.string0,'\n', sep='',collapse=",")
cat(output.string,file=CA$outdistFile,append=TRUE)
}
internal.loop.counter = 0 #Initialize the loop counter
outer.loop.indices.completed = c() #Initialize the list to keep track of already completed outer indices
for( index1 in seq_len(length(loop.range1)) )
{
i = loop.range1[index1]
outer.loop.indices.completed = c(outer.loop.indices.completed,i) # Keep track of which outer loop indices have been accounted for
inner.loop.range = setdiff(loop.range2,outer.loop.indices.completed) # Use as the inner loop only those inner loop indices which haven't been in the outer loop
for(index2 in seq_len(length(inner.loop.range)))
{
k = inner.loop.range[index2]
# Get a vector of the (i,k)th element of each correlation matrix in the list of bootstrapped data; this is the bootstrap distribution
internal.loop.counter = internal.loop.counter + 1 #Increment the loop counter
if (internal.loop.counter %% CA$iterations.gap == 0) #If output is verbose and the number of iterations is multiple of iterations gap - print status
{
print.msg = paste('Completed ', internal.loop.counter, ' comparisons')
log.processing.progress(CA,print.msg) #Log progress
}
bootstrap.dist = unlist(lapply(boot.cor,'[',i,k))
# Get a vector the (i,k)th element of each correlation matrix in the list of permuted data; this is the permuted distribution
permutation.dist = unlist(lapply(permutation.cor,'[',i,k)) #sets
if (!is.na(CA$outdist)) #If user requested to print the distributions
{
RC <- print.dist(bootstrap.dist,permutation.dist,CA,col.subset[i],col.subset[k])
}
measure.parameter.list <- append(list(x=data[,col.subset[i]],y=data[,col.subset[k]]), CA$sim.score.parameters) #build the method do.call parameter list
sim.score <- do.call(CA$method,measure.parameter.list) #Invoke the measuring function
z.stat_and_p.value.list <- calculate.z.stat.and.p.value(bootstrap.dist,permutation.dist ) #Invoke new function to calculate it
#Same calculations - just packed as a function
z.stat <- z.stat_and_p.value.list$z.stat #Retrieve value form the list
p.value <- z.stat_and_p.value.list$p.value #Retrieve from the list
CA$z.stat[col.subset[i],col.subset[k]] = z.stat #Post z.stat in output matrix
CA$z.stat[col.subset[k],col.subset[i]] = z.stat #Post z.stat in output matrix
CA$p.values[col.subset[i],col.subset[k]] = p.value #Post it in the p-values matrix
CA$p.values[col.subset[k],col.subset[i]] = p.value #Post it in the p-values matrix
CA$sim.score[col.subset[i],col.subset[k]] = sim.score #Post it in the cor matrix
CA$sim.score[col.subset[k],col.subset[i]] = sim.score #Post it in the cor matrix
}
}
return(CA)
}
bootstrap.procedure.memory.optimized <- function(n.features,nsubj,n.iter,data,possible.rows,col.subset,CA){
n.matrix.dim <- length(col.subset)
boot.sum <- matrix(data = 0,nrow = n.matrix.dim,ncol = n.matrix.dim)
boot.squared.sum <- matrix(data = 0,nrow = n.matrix.dim,ncol = n.matrix.dim)
perm.sum <- matrix(data = 0,nrow = n.matrix.dim,ncol = n.matrix.dim)
perm.squared.sum <- matrix(data = 0,nrow = n.matrix.dim,ncol = n.matrix.dim)
for(i in seq_len(n.iter)){
if (i %% CA$iterations.gap == 0) #If output is verbose and the number of iterations
#is multiple of iterations gap - print status
{
print.msg = paste('Sampled and computet similarity scores for',
i,' permutation and bootstrap datasets')
log.processing.progress(CA,print.msg) #Log progress
}
# Get the rows of the possible.rows matrix; these correspond to the rows
# which will be included in the resampled dataset
boot.rowids = sample(seq(1,nsubj),nsubj,replace=TRUE)
# Generate the bootstrap matrix by getting the appropriate rows from the possible bootstrap rows
boot.matrix = resample(data = data,resample.matrix = possible.rows[boot.rowids,])
boot.cor <- bootstrap.method.calcn(subsetted.matrix= boot.matrix,nsubj,data,CA,col.subset)
boot.sum <- boot.sum + boot.cor
boot.squared.sum <- boot.squared.sum + boot.cor*boot.cor
perm.matrix = permute(data = data, replicate(n.features,sample(seq(1,nsubj),nsubj,replace=FALSE))) # The matrix has each column be a permutation of the row indicies.
if (CA$renormalize){# The normalized permutation data; the permutation data needs to be renormalized, but not the bootstrap data
permutation.norm.raw = ccrepe_norm(perm.matrix)
}
else{
permutation.norm.raw = perm.matrix
}
permutation.norm = permutation.norm.raw[,col.subset]
perm.cor <- do.call(CA$method,c(list(permutation.norm),CA$method.args))
perm.sum <- perm.sum + perm.cor
perm.squared.sum <- perm.squared.sum + perm.cor*perm.cor
}
log.processing.progress(CA,"Calculating p-values") #Log progress
# Contrary to the usual procedure, the computations are run vectorizesed over all pairs of OTUs, avoiding memory
# allocation trottling for high number of OTUs
measure.parameter.list <- append(list(x=data[,col.subset]), CA$sim.score.parameters) #build the method do.call parameter list
CA$sim.score[col.subset,col.subset] <- do.call(CA$method,measure.parameter.list) #Invoke the measuring function
mean.bootstrap.value <- boot.sum / n.iter
mean.perm.value <- perm.sum / n.iter
var.bootstrap.value <- (boot.squared.sum - n.iter*mean.bootstrap.value*mean.bootstrap.value) / (n.iter-1)
var.perm.value <- (perm.squared.sum - n.iter*mean.perm.value*mean.perm.value) / (n.iter - 1)
CA$z.stat[col.subset,col.subset] <- (mean.bootstrap.value - mean.perm.value) / sqrt(0.5*(var.bootstrap.value+var.perm.value))
CA$p.values[col.subset,col.subset] <- 2*pnorm(-abs(CA$z.stat[col.subset,col.subset]))
# NaN-values are induced for the diagonal elements, we must eliminate them before proceeding
diag(CA$sim.score) <- NA
diag(CA$z.stat) <- NA
diag(CA$p.values) <- NA
CA$verbose.requested = FALSE #If the User requested verbose output - turn it off temporarily
if (CA$verbose == TRUE)
{
CA$verbose.requested <- TRUE #Turn of the verbose flag save variable
CA$verbose <- FALSE #But pass non verbose to the CA$method (Could be nc.score or anything else)
}
return(CA)
}
ccrepe_process_one_dataset <-
function(data,n.iter, CA){
#*************************************************************************************
#* Function to apply the new method to a dataset *
#* As a note, data needs to be a matrix with no missing values *
#*************************************************************************************
n.features = ncol(data) # Number of columns, starting at 1; this is also the number of bugs
CA$p.values <-matrix(data=NA,nrow=n.features,ncol=n.features) #Build the empty PValues matrix
CA$z.stat <-matrix(data=NA,nrow=n.features,ncol=n.features) #Build the empty z.stat matrix
CA$sim.score <-matrix(data=NA,nrow=n.features,ncol=n.features) #Build the empty correlation matrix
nsubj = nrow(data) # Number of subjects
# The subset of columns for which to calculate the similarity scores
col.subset <- 1:n.features
if( length(CA$filtered.subset.cols.1) > 0 && CA$compare.within.x ) #If the User entered a subset of columns
{
col.subset <- CA$filtered.subset.cols.1
} else if(length(CA$filtered.subset.cols.2) > 0 && !CA$compare.within.x)
{
col.subset <- unique(c(col.subset,CA$filtered.subset.cols.2))
}
# Remove features with too many zeros from the subsets of interest
num.feature.zeros <- apply(data,2,function(col) sum(col==0))
CalcThresholdForError = ((CA$errthresh)^(1/nsubj))*nsubj #If there is not enough data
zero_features <- which(num.feature.zeros > CalcThresholdForError)
if(length(zero_features)>0){
ErrMsg = paste0(
"Feature(s) ",
toString(colnames(data)[intersect(zero_features,col.subset)]),
" have more zeros than the threshold of ",
trunc(CalcThresholdForError),
" zeros. Excluding from output (will still be used in normalizing.)"
)
warning(ErrMsg)
col.subset <- setdiff(col.subset,zero_features)
}
# The matrix of possible bootstrap rows (which when multiplied by data give a specific row)
# of the form with all 0s except for one 1 in each row
possible.rows = diag(rep(1,nsubj))
if(!CA$memory.optimize){
CA <- bootstrap.procedure.simple (n.features,nsubj,n.iter,data,possible.rows,col.subset,CA)
}
# Assume we are memory-optimizing the application, we do not create the bootstrap and permutation matrices all
# at once, but instead compute them one by one
else{
CA <- bootstrap.procedure.memory.optimized(n.features,nsubj,n.iter,data,possible.rows,col.subset,CA)
}
log.processing.progress(CA,"Calculating q-values")
CA <- calculate_q_values(CA) #Calculate the QValues
CA$q.values[lower.tri(CA$q.values)] = t(CA$q.values)[lower.tri(t(CA$q.values))] #Making the q.values matrix symmetrical for the one dataset case
#********************************************************************
#* Final Edits before exiting *
#********************************************************************
diag(CA$q.values) <- NA #Set diagonal of q.values to NA
colnames(CA$p.values)<-colnames(CA$data1.norm) #Set the names of the columns in the p.values matrix
rownames(CA$p.values)<-colnames(CA$data1.norm) #Set the names of the rows in the p.values matrix
colnames(CA$q.values)<-colnames(CA$data1.norm) #Set the names of the columns in the q.values matrix
rownames(CA$q.values)<-colnames(CA$data1.norm) #Set the names of the rows in the q.values matrix
colnames(CA$sim.score)<-colnames(CA$data1.norm) #Set the names of the columns in the q.values matrix
rownames(CA$sim.score)<-colnames(CA$data1.norm) #Set the names of the rows in the q.values matrix
colnames(CA$z.stat) <- colnames(CA$data1.norm) #Set the names of the cols in the z.stat matrix
rownames(CA$z.stat) <- colnames(CA$data1.norm) #Set the names of the rows in the z.stat matrix
diag(CA$p.values) <- NA #Set diagonal of p.values to NA
diag(CA$z.stat) <- NA #Set diagonal of z.stat
######################CA$sim.score <- NULL #Not needed to be NUll
if (length(CA$filtered.subset.cols.1) > 0) #If used a subset - present only the subset
{
CA$p.values <- CA$p.values[col.subset,col.subset] #Display only the subset of cols and rows
CA$q.values <- CA$q.values[col.subset,col.subset] #Display only the subset of cols and rows
CA$sim.score <- CA$sim.score[col.subset,col.subset] #Display only the subset of cols and rows
CA$z.stat <- CA$z.stat[col.subset,col.subset]
}
CA <- clean_common_area_after_processing(CA) #Clean the Common Area before returning to the User
return(CA) # Return the output matrix
}
ccrepe_process_two_datasets <- function(data1.norm,data2.norm,n.iter, CA)
#*************************************************************************************
#* ccrepe function for two datasets *
#*************************************************************************************
{
# Get number of bugs, subjects for each dataset
n1.features = ncol(data1.norm)
n2.features = ncol(data2.norm)
log.processing.progress(CA,"Two datasets: Initial merge of the matrices") #Log progress
data = merge_two_matrices(data1.norm,data2.norm)
data1 <- data[,1:n1.features]
data2 <- data[,(n1.features+1):(n1.features+n2.features)]
nsubj = nrow(data)
# Subset of columns for the bootstrapped dataset
col.subset <- 1:(n1.features+n2.features)
n1.subset <- n1.features
n2.subset <- n2.features
if( length(CA$filtered.subset.cols.1) > 0 ) #If the User entered a subset of columns
{
col.subset <- CA$filtered.subset.cols.1
n1.subset <- length(CA$filtered.subset.cols.1)
if(length(CA$subset.cols.2) > 0)
{
col.subset <- c(col.subset,n1.features+CA$filtered.subset.cols.2)
n2.subset <- length(CA$filtered.subset.cols.2)
}
}
# Remove features with too many zeros from the subsets of interest
num.feature.zeros <- apply(data,2,function(col) sum(col==0))
CalcThresholdForError = ((CA$errthresh)^(1/nsubj))*nsubj #If there is not enough data
zero_features <- which(num.feature.zeros > CalcThresholdForError)
if(length(zero_features)>0){
zero_features_data1 <- zero_features[which(zero_features <= n1.features)]
zero_features_data2 <- zero_features[which(zero_features > n1.features)]
ErrMsg = paste0(
"Feature(s) ",
toString(colnames(data)[intersect(zero_features_data1,col.subset)]),
" in dataset 1 and feature(s) ",
toString(colnames(data)[intersect(zero_features_data2,col.subset)]),
" in dataset 2 have more zeros than the threshold of ",
trunc(CalcThresholdForError),
" zeros. Excluding from output (will still be used in normalizing.)"
)
warning(ErrMsg)
col.subset <- setdiff(col.subset,zero_features)
n1.subset <- sum(col.subset <= n1.features)
n2.subset <- sum(col.subset > n1.features)
}
# Subset of columns for the permutation datasets
# The matrix of possible bootstrap rows (which when multiplied by data give a specific row); of the form with all 0s except for one 1 in each row
possible.rows = diag(rep(1,nsubj))
#Generating the bootstrap and permutation matrices
bootstrap.matrices = list() # The list of matrices of possible bootstrap rows (each matrix multiplies by the data to give a resampled dataset)
permutation.matrices1 = list() # The list of permutation matrices; each matrix will be have columns which are permutations of the row indices
permutation.matrices2 = list() # The list of permutation matrices; each matrix will be have columns which are permutations of the row indices
#*****************************************************
#* Pre Allocate Matrices *
#*****************************************************
bootstrap.matrices = vector("list", n.iter)
permutation.matrices1 = vector("list", n.iter)
permutation.matrices2 = vector("list", n.iter)
for(i in seq_len(n.iter)){
if (i %% CA$iterations.gap == 0) #If output is verbose and the number of iterations is multiple of iterations gap - print status
{
print.msg = paste('Sampled ',i,' permutation and bootstrap datasets')
log.processing.progress(CA,print.msg) #Log progress
}
# Get the rows of the two possible.rows matrices; these correspond to the rows which will be included in the resampled datasets
boot.rowids = sample(seq(1,nsubj),nsubj,replace=TRUE)
# Generate the bootstrap matrices by getting the appropriate rows from the possible bootstrap rows
boot.matrix = possible.rows[boot.rowids,]
# Add the bootstrap matrices to the appropriate lists
bootstrap.matrices[[ i ]] = boot.matrix # Add the bootstrap matrix to the list
perm.matrix1 = replicate(n1.features,sample(seq(1,nsubj),nsubj,replace=FALSE)) # (Replaced nsubj1 with nsubj) The matrix has each column be a permutation of the row indices
permutation.matrices1 [[ i ]] = perm.matrix1 # Add the new matrix to the list
perm.matrix2 = replicate(n2.features,sample(seq(1,nsubj),nsubj,replace=FALSE)) # The matrix has each column be a permutation of the row indices
permutation.matrices2 [[ i ]] = perm.matrix2 # Add the new matrix to the list
}
# The bootstrapped data; resample the data using each bootstrap matrix
log.processing.progress(CA,"Generating boot data") #Log progress
boot.data = lapply(bootstrap.matrices,resample,data=data)
log.processing.progress(CA,"Generating permutation data") #Log progress
# Generating the permutation data; permute the data using each permutation matrix
permutation.data1 = lapply(permutation.matrices1,permute,data=data1)
permutation.data2 = lapply(permutation.matrices2,permute,data=data2)
if(CA$renormalize){
# The normalized permutation data; the permutation data needs to be renormalized, but not the bootstrap data
permutation.norm1 = lapply(permutation.data1,ccrepe_norm)
permutation.norm2 = lapply(permutation.data2,ccrepe_norm)
}
else{
permutation.norm1=permutation.data1
permutation.norm2=permutation.data2
}
# The correlation matrices of the permuted and bootstrapped data; see extractCor function for more details
# mapply is a function that applies over two lists, applying extractCor to the first element of each, then the
# second element of each, etc.
log.processing.progress(CA,paste("Calculating permutation similarity scores: col.subset =",toString(col.subset))) #Log progress
CA$verbose.requested = FALSE #If the User requested verbose output - turn it off temporarily
if (CA$verbose == TRUE)
{
CA$verbose.requested <- TRUE #Turn of the verbose flag save variable
CA$verbose <- FALSE #But pass non verbose to the CA$method (Could be nc.score or anything else)
}
permutation.cor = mapply(extractCor,
mat1=permutation.norm1,
mat2=permutation.norm2,
startrow=1,
endrow=n1.subset,
startcol=(n1.subset+1),
endcol=length(col.subset),
MoreArgs = list(col.subset = col.subset,
my.method=CA$method,
method.args=CA$method.args,
outdist=CA$outdist,
outdistFile=CA$outdistFile),
SIMPLIFY=FALSE)
#********************************************************************************
# Reboot times = RT = -round(log2(CA$errthresh)) *
# This is the maximum number of iterations to try to reboot a matrix if in a col*
# all values are 0 *
# *
# For each matrix, check if there is a column that is all zeros *
# If such matrix is found, try to reboot it at most RT times until such matrix *
# is found that does not contain a column with all zeros *
# The limit of RT tries is now hard coded and needs to be re-evaluated *
# If after RT times there is no success - we stop the run *
#********************************************************************************
log.processing.progress(CA,paste("Calculating boot similarity scores: col.subset =",toString(col.subset))) #Log progress
boot.cor = lapply(boot.data,bootstrap.method.calcn,nsubj,data,CA,col.subset ) ###Function to check is all cols are zeros and apply cor
# Now calculating the correlations and p-values between the two datasets
log.processing.progress(CA,"Calculating p-values") #Log progress
CA$p.values <-matrix(data=NA,nrow=n1.features,ncol=n2.features) #Build the empty p.values matrix
CA$z.stat <-matrix(data=NA,nrow=n1.features,ncol=n2.features) #Build the empty z.stat matrix
CA$sim.score <-matrix(data=NA,nrow=n1.features,ncol=n2.features) #Build the empty correlation matrix
loop.range1 <- which(col.subset %in% 1:n1.features) #Establish looping range default
if ( length(CA$filtered.subset.cols.1)> 0 ) #If the User entered a subset of columns
{
loop.range1 <- which(col.subset %in% CA$filtered.subset.cols.1) #Use the subset of columns
}
loop.range2 <- which( col.subset %in% (n1.features + 1:n2.features)) - n1.subset #Establish looping range default
if ( length(CA$filtered.subset.cols.2) > 0 ) #If the User entered a subset of columns
{
loop.range2 <- which(col.subset %in% (n1.features + CA$filtered.subset.cols.2)) - n1.subset #Use the subset of columns
}
max.comparisons <- n1.features*n2.features
if (!is.na(CA$outdist)) #If user requested to print the distributions
{
output.string0 <- paste0("iter_",seq_len(length(boot.cor)),collapse=",")
output.string <- paste0("dist_type,i,k,feature1,feature2,",output.string0,'\n', sep='',collapse=",")
cat(output.string,file=CA$outdistFile,append=TRUE)
}
internal.loop.counter = 0 # Initialize the counter
for(index1 in seq_len(length(loop.range1))) #Modified seq_along to seq_len: Need to review....
{
i = loop.range1[index1]
for(index2 in seq_len(length(loop.range2))) #Modified seq_along to seq_len: Need to review....
{
k = loop.range2[index2]
# Get a vector of the (i,k)th element of each correlation matrix in the list of bootstrapped data; this is the bootstrap distribution
internal.loop.counter = internal.loop.counter + 1 #Increment the loop counter
if (internal.loop.counter %% CA$iterations.gap == 0) #If output is verbose and the number of iterations is multiple
#of iterations gap - print status
{
print.msg = paste('Completed ', internal.loop.counter, ' comparisons')
log.processing.progress(CA,print.msg) #Log progress
}
bootstrap.dist = unlist(lapply(boot.cor,'[',i,n1.subset+k))
bootstrap.dist[is.na(bootstrap.dist)] <- 0 #If there is an NA in bootstrap.dist - replace with 0 (Needs
#review)
# Get a vector the (i,k)th element of each correlation matrix in the list of permuted data; this is the permuted distribution
permutation.dist = unlist(lapply(permutation.cor,'[',i,k))
if (!is.na(CA$outdist)) #If user requested to print the distributions
{
RC <- print.dist(bootstrap.dist,permutation.dist,CA,col.subset[i],col.subset[n1.subset+k] - n1.features)
}
measure.parameter.list <- append(list(x=data[,col.subset[i]],y=data[,col.subset[n1.subset+k]]), CA$sim.score.parameters) #build the method
#do.call parameter list
sim.score.value <- do.call(CA$method,measure.parameter.list) #Invoke the measuring function <- Was cor
z.stat_and_p.value.list <- calculate.z.stat.and.p.value(bootstrap.dist,permutation.dist ) #Invoke new function to calculate it
#Same calculations - just packed as a function
z.stat <- z.stat_and_p.value.list$z.stat #Retrieve value form the list
p.value <- z.stat_and_p.value.list$p.value #Retrieve from the list
CA$p.values[col.subset[i],col.subset[n1.subset+k]-n1.features] = p.value #Post it in the p.values matrix
CA$z.stat[col.subset[i],col.subset[n1.subset+k]-n1.features] = z.stat #Post it in the z.stat matrix
CA$sim.score[col.subset[i],col.subset[n1.subset+k]-n1.features] = sim.score.value #Post it in the cor matrix <--Was cor
}
}
CA <- calculate_q_values(CA) #Calculate the QValues
#********************************************************************
#* Final Edits before exiting *
#********************************************************************
rownames(CA$p.values) <- colnames(CA$data1.norm) #Post the column names
colnames(CA$p.values) <- colnames(CA$data2.norm) #Post the column names
rownames(CA$sim.score) <- colnames(CA$data1.norm) #Post the column names
colnames(CA$sim.score) <- colnames(CA$data2.norm) #Post the column names
rownames(CA$q.values) <- colnames(CA$data1.norm) #Post the column names
colnames(CA$q.values) <- colnames(CA$data2.norm) #Post the column names
rownames(CA$z.stat) <- colnames(CA$data1.norm) #Post the column names
colnames(CA$z.stat) <- colnames(CA$data2.norm) #Post the column names
total.rows.to.display = 1:nrow(CA$p.values) #Number of rows to display
total.cols.to.display = 1:ncol(CA$p.values) #Number of cols to display
if (length(CA$filtered.subset.cols.1) > 0) #If User selected a subset
{
total.rows.to.display = CA$filtered.subset.cols.1
}
if (length(CA$filtered.subset.cols.2) > 0) #If User selected a subset
{
total.cols.to.display = CA$filtered.subset.cols.2
}
CA$p.values <- CA$p.values[total.rows.to.display,total.cols.to.display] #Display only the subset of cols and rows
CA$q.values <- CA$q.values[total.rows.to.display,total.cols.to.display] #Display only the subset of cols and rows
CA$sim.score <- CA$sim.score[total.rows.to.display,total.cols.to.display] #Display only the subset of cols and rows
CA$z.stat <- CA$z.stat[total.rows.to.display,total.cols.to.display]
CA <- clean_common_area_after_processing(CA) #Clean the Common Area before returning to the User
return(CA) # Return the output matrix
}
ccrepe.calculations <-
#*************************************************************************************
#* ccrepe calculations *
#*************************************************************************************
function(CA)
{
Output <-list()
CA <- DecodeInputCAArguments(CA) #Decode Input Parameters
CA <- preprocess_data(CA)
if (CA$OneDataset == TRUE)
{
CA = ccrepe_process_one_dataset(CA$data1.norm,CA$iterations, CA) #Invoke the new function
}
else
{
CA = ccrepe_process_two_datasets (CA$data1.norm ,CA$data2.norm ,CA$iterations, CA)
}
return ( CA )
}
DecodeInputCAArguments <-
function(CA){
#*************************************************************************************
#* Decode and validate input parameters *
#*************************************************************************************
CA$Gamma = 0.57721566490153286060651209008240243104215933593992 #Euler's Gamma
if (length(CA$data1) == 1) #If the user did not select at least one input dataframe - Stop the run
{
if (is.na(CA$data1))
stop('You must select at least one input data frame (data1<-YourData)')
}
CA$OneDataset <- FALSE #Set the symmmetrical flag to False (Will be true if data2=data1)
if (length(CA$data2) == 1) #If user did not enter data2 - we assume he wants data2=data1
if (is.na(CA$data2))
{
CA$OneDataset <- TRUE #And set up the Symmetrical flag
}
if (!class(CA$subset.cols.1 ) == "numeric" && !(is.null(CA$subset.cols.1))) #Modified the check for subset.cols
{ #If class is not "numeric" - set up to NULL
ErrMsg = paste0( #This replaces the old checks - length(CA$subset.cols.1) == 0 && (is.null(CA$subset.cols.1))
"Subset columns1 is not numeric - was : " ,
CA$subset.cols.1,
" - Set to NULL"
)
warning(ErrMsg)
CA$subset.cols.1 = NULL #Set to default
}
if (!class(CA$subset.cols.2 ) == "numeric" && !(is.null(CA$subset.cols.2))) #Modified the check for subset.cols
{ #If class is not "numeric" - set up to NULL
ErrMsg = paste0( #This replaces the old checks - length(CA$subset.cols.1) == 0 && (is.null(CA$subset.cols.1))
"Subset columns2 is not numeric - was : " ,
CA$subset.cols.2,
" - Set to NULL"
)
warning(ErrMsg)
CA$subset.cols.2 = NULL #Set to default
}
if (!is.na(CA$outdist)) #If the user passed a file - open it
{
if(file.exists(CA$outdist)){
ErrMsg = paste0(
"Distribution file ",
CA$outdist,
" already exists--overwriting."
)
warning(ErrMsg)
file.remove(CA$outdist)
}
CA$outdistFile = file(CA$outdist,open='at') #Open outdist file
}
if ( !is.logical(CA$compare.within.x ) ) #(Modify - use is.logical) compare.within.x must be either true or false
{
ErrMsg = paste0("Setting compare.within.x to TRUE - it was ",CA$compare.within.x )
warning(ErrMsg)
CA$compare.within.x = TRUE #True - is the default
}
if ( !is.logical(CA$verbose) ) #Verbose Flag has to be either true or false, otherwise - we set it to false
{
ErrMsg = paste0("Verbose Flag set to False was:", CA$verbose)
warning(ErrMsg)
CA$verbose = FALSE #False - is the default
}
if ( is.na(suppressWarnings(as.integer(CA$iterations.gap))) || !(CA$iterations.gap%%1==0) ) #Check the iterations gap (Number of iterations after which to print
#if status verbose - Cannot be fractional
{
ErrMsg = paste0("iterations.gap set to 100 - was: ", CA$iterations.gap)
warning(ErrMsg)
CA$iterations.gap = 100 #If not valid - use 100
}
if (CA$memory.optimize && !CA$OneDataset){
warning("The memory.optimize argument only applies to calculations on one dataset.
Hence, this computation will not have optimized memory usage.")
}
if (!is.na(CA$outdist) && CA$memory.optimize) #If user requested to print the distributions
{
warning("In the memory optimized algorithm, output of the bootstrap distribution is not available")
}
CA$sim.score.parameters<-list() #Initialize the parameter list
CA$sim.score.parameters <- CA$method.args #Add the entries in method.args to the measuring parameter list
CA$retries.max.iterations = -round(log2(CA$errthresh)) #This is the maximum number of iterations to try to reboot a matrix if in a col all values are 0
return(CA) #Return list of decoded input parameters
}
extractCor <-
function(mat1,mat2,startrow,endrow,startcol,endcol,col.subset,my.method,method.args,outdist,outdistFile, ...)
#******************************************************************************************
# A function to calculate the correlation of the two matrices by merging them, *
# calculating the correlation of the merged matrix, and extracting the appropriate *
# submatrix to obtain the correlation of interest *
# startrow, endrow, startcol, endcol give the indeces of the submatrix to extract *
# The method and the method parameters are passed via the list
#******************************************************************************************
{
mat <- merge_two_matrices(mat1,mat2)[,col.subset] #Merge the two matrices
measure.function.parm.list <- append(list(x=mat), method.args)
mat_C <-do.call(my.method,measure.function.parm.list) #Invoke the measuring fnction
if(length(startrow:endrow)==1){
sub_mat_C <- matrix(mat_C[startrow:endrow,startcol:endcol],nrow=1)
} else if(length(startcol:endcol)==1){
sub_mat_C <- matrix(mat_C[startrow:endrow,startcol:endcol],ncol=1)
} else {
sub_mat_C <- mat_C[startrow:endrow, startcol:endcol] # Extract the appropriate submatrix
}
return(sub_mat_C)
}
merge_two_matrices <-
function(mat1,mat2)
#*************************************************************************************
# A function to merge two matrices, which works when they're different dimensions *
# merge the two matrices by the rows, merging only rows present in both *
# (ensures no missing samples) *
# exclude the first column, which is the row names *
# <----------- Important !!! ---------------> *
#* The matrices are merged by row name (Which defaults to row number) *
#* so the assumption is that SAME ROWS pertain to SAME SUBJECTS *
#* Also not that first column is treated as data - not Subject.ID or any other *
#* identifier *
#*************************************************************************************
{
mat <- merge(mat1,mat2,by="row.names")
rownames(mat) = mat[,1] # The first column has the row names
mat = mat[,-1] # Remove the first column
return(mat)
}
lappend <-
function(lst, obj) {
#*************************************************************************************
#* Function to append obj to lst, where lst is a list *
#*************************************************************************************
lst[[length(lst)+1]] <- obj
return(lst)
}
permute <-
function(data,permute.id.matrix){
#*************************************************************************************
#* Function to permute the data using a matrix of indices *
#* data is the data which you want to permute *
#* permute.id.matrix is a matrix where each column is some *
#* permutation of the row indices *
#*************************************************************************************
permute.data = data
for(i in 1:ncol(data)){ # For each column
permute.data[,i] = data[permute.id.matrix[,i],i] # Replace the original column with a permuted one
}
return(permute.data)
}
filter_dataset <-
function(X){
#**********************************************************************
# Filter out missing rows and rows/cols that are all zero *
#**********************************************************************
missing_rows <- which(
apply(
X,
1,
function(row){
sum(is.na(row))
}
) > 0
)
if(length(missing_rows)>0){
X_no_missing <- X[-missing_rows,]
} else {
X_no_missing <- X
}
raw_all_zero_rows <- which(
apply(
X,
1,
function(row){
sum(row==0)/length(row)
}
)==1
)
all_zero_rows <- which(rownames(X_no_missing) %in% rownames(X)[raw_all_zero_rows])
raw_all_zero_cols <- which(
apply(
X_no_missing,
2,
function(col){
sum(col==0)/length(col)
}
)==1
)
all_zero_cols <- which(colnames(X_no_missing) %in% colnames(X)[raw_all_zero_cols])
if(length(all_zero_rows)>0 && length(all_zero_cols)>0){
X_filtered <- X_no_missing[-all_zero_rows,-all_zero_cols]
} else if( length(all_zero_rows) > 0 ){
X_filtered <- X_no_missing[-all_zero_rows,]
} else if( length(all_zero_cols) > 0 ){
X_filtered <- X_no_missing[,-all_zero_cols]
} else {
X_filtered <- X_no_missing
}
return(
list(
X_filtered = X_filtered,
all_zero_rows = raw_all_zero_rows,
all_zero_cols = raw_all_zero_cols,
missing_rows = missing_rows
)
)
}
check_data_values <-
function(CA){
#**********************************************************************
# Check that all data values are valid *
#**********************************************************************
# Check non-negativity
num.neg <- sum(CA$data1[!is.na(CA$data1)] < 0)
if( num.neg > 0 ){
ErrMsg <- paste(
"There are",
num.neg,
"negative values in x. CCREPE only uses non-negative values--stopping run.")
stop(ErrMsg)
}
# Check simplex
if(CA$renormalize){
num.gt.1 <- sum(CA$data1[!is.na(CA$data1)] > 1)
if( num.gt.1 > 0){
ErrMsg <- paste(
"x appears to be count data or absolute abundances, since there are",
num.gt.1,
"values greater than 1.",
'If you want to run CCREPE with absolute abundaces, please specify renormalize=FALSE'
)
warning(ErrMsg)
}
}
if(!CA$OneDataset){
# Check non-negativity
num.neg2 <- sum(CA$data2[!is.na(CA$data2)] < 0)
if( num.neg2 > 0 ){
ErrMsg <- paste(
"There are",
num.neg2,
"negative values in y. CCREPE only uses non-negative values--stopping run.")
}
# Check simplex
if(CA$renormalize){
num2.gt.1 <- sum(CA$data2[!is.na(CA$data2)] > 1)
if( num2.gt.1 > 0){
ErrMsg <- paste(
"y appears to be count data, since there are",
num2.gt.1,
"values greater than 1."
)
warning(ErrMsg)
}
}
}
}
preprocess_data <-
function(CA)
#**********************************************************************
# Preprocess input data *
#**********************************************************************
{
check_data_values(CA)
if(CA$OneDataset){
if(is.null(colnames(CA$data1))) colnames(CA$data1) <- seq_len(ncol(CA$data1))
if(is.null(rownames(CA$data1))) rownames(CA$data1) <- seq_len(nrow(CA$data1))
filtered_data1 <- filter_dataset(CA$data1)
} else {
if(is.null(colnames(CA$data1))) colnames(CA$data1) <- paste0(seq_len(ncol(CA$data1)),".X")
if(is.null(rownames(CA$data1))) rownames(CA$data1) <- seq_len(nrow(CA$data1))
if(is.null(colnames(CA$data2))) colnames(CA$data2) <- paste0(seq_len(ncol(CA$data2)),".Y")
if(is.null(rownames(CA$data2))) rownames(CA$data2) <- seq_len(nrow(CA$data2))
filtered_data1 <- filter_dataset(CA$data1)
filtered_data2 <- filter_dataset(CA$data2)
}
if(length(filtered_data1$missing_rows)>0){
ErrMsg = paste(
"Excluding subject(s)",
toString(rownames(CA$data1)[filtered_data1$missing_rows]),
"from x because they have missing values."
)
warning(ErrMsg)
}
if(length(filtered_data1$all_zero_rows)>0){
ErrMsg = paste(
"Excluding subject(s)",
toString(rownames(CA$data1)[filtered_data1$all_zero_rows]),
"from x because they are entirely zero."
)
warning(ErrMsg)
}
if(length(filtered_data1$all_zero_cols)>0){
ErrMsg = paste(
"Excluding feature(s)",
toString(colnames(CA$data1)[filtered_data1$all_zero_cols]),
"from x because they are entirely zero."
)
warning(ErrMsg)
}
if(CA$OneDataset){
if(nrow(filtered_data1$X_filtered) < CA$min.subj )
{
ErrMsg = paste0(
'Not enough data - found ',
nrow(filtered_data1$X_filtered),
' rows of non-missing data - Less than ',
CA$min.subj,
' (=min.subj) - Run Stopped'
)
stop(ErrMsg)
}
if(CA$renormalize){
CA$data1.norm = filtered_data1$X_filtered/rowSums(filtered_data1$X_filtered)
}
else{
CA$data1.norm = filtered_data1$X_filtered
}
CA$filtered.subset.cols.1 <- NULL
CA$filtered.subset.cols.2 <- NULL
if(!is.null(CA$subset.cols.1)){
CA$filtered.subset.cols.1 <- which(colnames(CA$data1.norm) %in% colnames(CA$data1)[CA$subset.cols.1])
if( length(CA$filtered.subset.cols.1)==0 ){
ErrMsg = paste0(
"User requested subset.cols.x=c(",
toString(CA$subset.cols.1),
"), but all are filtered. Setting subset.cols.x=NULL."
)
warning(ErrMsg)
CA$subset.cols.1 <- NULL
CA$filtered.subset.cols.1 <- NULL
}
}
if(!is.null(CA$subset.cols.2)){
CA$filtered.subset.cols.2 <- which(colnames(CA$data1.norm) %in% colnames(CA$data1)[CA$subset.cols.2])
if( length(CA$filtered.subset.cols.2)==0 ){
ErrMsg = paste0(
"User requested subset.cols.y=c(",
toString(CA$subset.cols.2),
"), but all are filtered. Setting subset.cols.y=NULL"
)
warning(ErrMsg)
CA$subset.cols.2 <- NULL
CA$filtered.subset.cols.2 <- NULL
}
}
} else {
if(length(filtered_data2$missing_rows)>0){
ErrMsg = paste(
"Excluding subject(s)",
toString(rownames(CA$data2)[filtered_data2$missing_rows]),
"from y because they have missing values."
)
warning(ErrMsg)
}
if(length(filtered_data2$all_zero_rows)>0){
ErrMsg = paste(
"Excluding subject(s)",
toString(rownames(CA$data2)[filtered_data2$all_zero_rows]),
"from y because they are entirely zero."
)
warning(ErrMsg)
}
if(length(filtered_data2$all_zero_cols)>0){
ErrMsg = paste(
"Excluding feature(s)",
toString(colnames(CA$data2)[filtered_data2$all_zero_cols]),
"from y because they are entirely zero."
)
warning(ErrMsg)
}
if(CA$renormalize){
CA$data1.norm = filtered_data1$X_filtered/rowSums(filtered_data1$X_filtered)
CA$data2.norm = filtered_data2$X_filtered/rowSums(filtered_data2$X_filtered)
}
else{
CA$data1.norm = filtered_data1$X_filtered
CA$data2.norm = filtered_data2$X_filtered
}
remove_from_d1 <- which(
rownames(CA$data1.norm) %in% setdiff(rownames(CA$data1.norm),rownames(CA$data2.norm))
)
remove_from_d2 <- which(
rownames(CA$data2.norm) %in% setdiff(rownames(CA$data2.norm),rownames(CA$data1.norm))
)
if(length(remove_from_d1) > 0 && length(remove_from_d2) > 0){
ErrMsg = paste0("Removing subjects ",
toString(rownames(CA$data1.norm)[remove_from_d1]),
" from dataset x and subjects ",
toString(rownames(CA$data2.norm)[remove_from_d2]),
" from dataset y because they are not paired between the datasets."
)
warning(ErrMsg)
CA$data1.norm <- CA$data1.norm[-remove_from_d1,]
CA$data2.norm <- CA$data2.norm[-remove_from_d2,]
} else if( length(remove_from_d1) > 0 ){
ErrMsg = paste0(
"Removing subjects ",
toString(rownames(CA$data1.norm)[remove_from_d1]),
" from dataset x because they are not in dataset y."
)
warning(ErrMsg)
CA$data1.norm <- CA$data1.norm[-remove_from_d1,]
} else if( length(remove_from_d2) > 0 ){
ErrMsg = paste0(
"Removing subjects ",
toString(rownames(CA$data2.norm)[remove_from_d2]),
" from dataset y because they are not in dataset x."
)
warning(ErrMsg)
CA$data2.norm <- CA$data2.norm[-remove_from_d2,]
}
merged_data <- merge_two_matrices(CA$data1.norm,CA$data2.norm)
if(nrow(merged_data) < CA$min.subj )
{
ErrMsg = paste0(
'Not enough data - found ',
nrow(merged_data),
' rows paired of non-missing data - Less than ',
CA$min.subj,
' (=min.subj) - Run Stopped'
)
stop(ErrMsg)
}
CA$filtered.subset.cols.1 <- NULL
CA$filtered.subset.cols.2 <- NULL
if(!is.null(CA$subset.cols.1)){
CA$filtered.subset.cols.1 <- which(colnames(CA$data1.norm) %in% colnames(CA$data1)[CA$subset.cols.1])
if( length(CA$filtered.subset.cols.1)==0 ){
ErrMsg = paste0(
"User requested subset.cols.x=c(",
toString(CA$subset.cols.1),
"), but all are filtered. Setting subset.cols.x=NULL."
)
warning(ErrMsg)
CA$subset.cols.1 <- NULL
CA$filtered.subset.cols.1 <- NULL
}
}
if(!is.null(CA$subset.cols.2)){
CA$filtered.subset.cols.2 <- which(colnames(CA$data2.norm) %in% colnames(CA$data2)[CA$subset.cols.2])
if( length(CA$filtered.subset.cols.2)==0 ){
ErrMsg = paste0(
"User requested subset.cols.y=c(",
toString(CA$subset.cols.2),
"), but all are filtered. Setting subset.cols.y=NULL."
)
warning(ErrMsg)
CA$subset.cols.2 <- NULL
CA$filtered.subset.cols.2 <- NULL
}
}
}
return(CA)
}
print.dist <-
function(bootstrap.dist,permutation.dist, CA,i, k) {
#*************************************************************************************
#* Function to print the distribution *
#*************************************************************************************
outdistFile = CA$outdistFile #Output file
output.string0 <- paste(bootstrap.dist,sep="",collapse=',') #Build the output string
if (CA$OneDataset == TRUE) #The structure of the output is different for one dataset and two datasets
{output.string <-paste("Boot,",i,",",k,",",colnames(CA$data1.norm)[i],",",colnames(CA$data1.norm)[k],",",output.string0,'\n', sep='',collapse=",")}
else
{output.string <-paste("Boot,",i,",",k,",",colnames(CA$data1.norm)[i],",",colnames(CA$data2.norm)[k],",",output.string0,'\n', sep='',collapse=",")}
cat(output.string,file=CA$outdistFile,append=TRUE)
output.string0 <- paste(permutation.dist,sep="",collapse=',') #Build the output string
if (CA$OneDataset == TRUE) #The structure of the output is different for one dataset and two datasets
{output.string <-paste("Permutation,",i,",",k,",",colnames(CA$data1.norm)[i],",",colnames(CA$data1.norm)[k],",",output.string0,'\n', sep='',collapse=",")}
else
{output.string <-paste("Permutation,",i,",",k,",",colnames(CA$data1.norm)[i],",",colnames(CA$data2.norm)[k],",",output.string0,'\n', sep='',collapse=",")}
cat(output.string,file=CA$outdistFile,append=TRUE)
return (0)
}
resample <-
function(data,resample.matrix){
#*************************************************************************************
#* Function to calculate the resampled data matrix; this *
#* is used for getting only the bootstrapped data *
#* data is the data which you want to bootstrap *
#* resample.matrix is the matrix appropriate for getting *
#* you the bootstrapped dataset: matrix is square, with *
#* the same number of rows as data; each row has exactly one *
#* 1 in it *
#*************************************************************************************
data.matrix <-as.matrix(data)
return(resample.matrix%*%data.matrix)
}
clean_common_area_after_processing <-
function(CA){
#*************************************************************************************
#* Function to clean the Common Area After processing *
#* Common Area is passed to the User as results *
#*************************************************************************************
if (!is.na(CA$outdist)) #If user requested to print the distributions
{
close(CA$outdistFile) #Close outdist file
CA$outdistFile <- NULL #And remove it from the common area
}
if (CA$verbose.requested == TRUE) #Check if the User requested verbose
{ #We might have turned it off before calling CA$method
CA$verbose = TRUE
}
CA$verbose.requested = NULL
CA$data1 <- NULL
CA$data1.norm <- NULL
CA$data2 <- NULL
CA$data2.norm <- NULL
CA$method <- NULL
CA$method.args<- NULL
CA$OneDataset <- NULL
CA$outdist <- NULL
CA$renormalize <- NULL
CA$memory.optimize <- NULL
CA$Gamma <- NULL
CA$retries.max.iterations <- NULL
CA$subset.cols.x <-CA$subset.cols.1
CA$subset.cols.y <-CA$subset.cols.2
CA$subset.cols.1 <- NULL
CA$subset.cols.2 <- NULL
CA$filtered.subset.cols.1 <- NULL
CA$filtered.subset.cols.2 <- NULL
if ( length(CA$subset.cols.x) == 1 && CA$subset.cols.x == c(0)) #If NA - set to default
{
CA$subset.cols.x <-NULL #Set to default
}
if ( length(CA$subset.cols.y) == 1 && CA$subset.cols.y == c(0)) #If NA - set to default
{
CA$subset.cols.y <-NULL #Set to default
}
if (!CA$verbose == TRUE)
{
CA$min.subj <- NULL
CA$iterations <- NULL
CA$method.args <- NULL
CA$verbose <- NULL
CA$iterations.gap <- NULL
CA$compare.within.x <- NULL
CA$sim.score.parameters <- NULL
CA$subset.cols.x <- NULL
CA$subset.cols.y <- NULL
CA$errthresh <- NULL
}
return(CA)
}
bootstrap.method.calcn <-
function(subsetted.matrix,nsubj,original.data,CA,col.subset){
#*************************************************************************************
#* Function to calculate cor and check that total in cols is not zero *
#*************************************************************************************
check.col.sums <- colSums(subsetted.matrix[,col.subset])==0
if (length(check.col.sums[check.col.sums==TRUE])) #If there is a column that is all zeros - try to reboot data CA$retries.max.iterations=-round(log2(CA$errthresh)) times
{
cnt.tries = 0 #Initialize counter of tries that we will try to reboot
while(cnt.tries < CA$retries.max.iterations && length(check.col.sums[check.col.sums==TRUE])) #Check if we succeeded rebooting the data so no cols are zero
{
cnt.tries <- cnt.tries+1 #Increase the counter
b1 = try.reboot.again (nsubj,original.data) #Try to reboot again
check.col.sums <- colSums(b1[,col.subset])==0 #Are there columns with all zeros in the new rebooted matrix?
if (!length(check.col.sums[check.col.sums==TRUE])) #If there is no column that is all zeros - post the result to continue processing
{subsetted.matrix <- b1} #Post the result
}
if (cnt.tries > CA$retries.max.iterations && length(check.col.sums[check.col.sums==TRUE])) #If reboot did not work - Stop the run
{
ErrMsg = paste('Tried to reboot the original.data', CA$retries.max.iterations, 'times but always get a column that is all zeros') #Error
warning(ErrMsg)
}
}
measure.function.parm.list <- append(list(x=subsetted.matrix[,col.subset]), CA$method.args) #Build the measure function parameter list
boot.cor <-do.call(CA$method,measure.function.parm.list) #Invoke the measuring function
return(boot.cor)
}
try.reboot.again <-
function(nsubj,data){
#*************************************************************************************
#* Function to reboot the data matrix agin so that we don't get cols with all 0 *
#*************************************************************************************
possible.rows = diag(rep(1,nsubj))
boot.rowids = sample(seq(1,nsubj),nsubj,replace=TRUE)
# Generate the bootstrap matrices by getting the appropriate rows from the possible bootstrap rows
boot.matrix = possible.rows[boot.rowids,]
b1<-resample (data,boot.matrix) #b1 is the rebooted matrix that we return
return(b1)
}
log.processing.progress <-
function(CA,msg){
#*************************************************************************************
#* Function to report the progress of processing *
#* only if verbose flag is on *
#*************************************************************************************
if (CA$verbose == TRUE || (!is.null(CA$verbose.requested) && CA$verbose.requested == TRUE ))
message(cat(date(),' ==> ',msg)) #Display date and time and progress
return (0)
}
calculate.z.stat.and.p.value <-
function(bootstrap.dist,permutation.dist ){
#*************************************************************************************
#* Function to calculate the z.stat and p.value *
#*************************************************************************************
z.stat_p.value_list = list() #Define the list
z.stat_p.value_list$z.stat <- (mean(bootstrap.dist) - mean(permutation.dist))/sqrt(0.5*(var(permutation.dist)+var(bootstrap.dist)))
z.stat_p.value_list$p.value <- 2*pnorm(-abs(z.stat_p.value_list$z.stat))
return(z.stat_p.value_list)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.