###############################################################################################
############################# functions to generate the weights of the edges ##########################
###############################################################################################
#' generate_df_scores
#'
#' function to generate the weights of the edges given the combinations.
#' It is automatically called by the geNet() function.
#' the correlation coefficient and its pvalue is calculated using the cor.test function from the stat package.
#' See the relative documentation for additional details.
#' @param all_combs matrix of combinations
#' @param binary_matrix binary matrix of presence/absences genes
#' @param signif_test significance test to use.
#' * cor_test: use Pearson correlation test approach (faster)
#' * chisquare: use chi-square test approach (slower)
#' Read vignettes for additional details about the significance tests used.
#' @return Object of class "dataframe" containing the connections between the nodes and relative weights
#' @importFrom stats fisher.test
#' @importFrom stats chisq.test
#' @importFrom stats cor.test
#' @importFrom stats cor
generate_df_scores<-function(all_combs,binary_matrix,signif_test="cor_test"){
# fill the ff vectors
print("generate columns df scores")
if(signif_test=="chisquare"){
list_pair_results<-lapply(1:ncol(all_combs),function(i){
pair_from<-colnames(binary_matrix)[all_combs[,i][1]]
pair_to<-colnames(binary_matrix)[all_combs[,i][2]]
# Chisquare test for the pvalue
table_freqs<-table(binary_matrix[,all_combs[,i][1]],binary_matrix[,all_combs[,i][2]])
if(any(table_freqs)<5){
out_test<-fisher.test(table_freqs)
}else{
out_test<-chisq.test(table_freqs,simulate.p.value = F, correct = F)
}
# Pearson correlation coefficient
out_cor<-cor(binary_matrix[,all_combs[,i][1]],binary_matrix[,all_combs[,i][2]],method = "pearson")
pvalue<-out_test$p.value
return(list(from=pair_from,to=pair_to,pvalue=pvalue,coefficient=out_cor))
})
}else if(signif_test=="cor_test"){
list_pair_results<-lapply(1:ncol(all_combs),function(i){
pair_from<-colnames(binary_matrix)[all_combs[,i][1]]
pair_to<-colnames(binary_matrix)[all_combs[,i][2]]
# Chisquare test for the pvalue
out_test<-cor.test(binary_matrix[,all_combs[,i][1]],binary_matrix[,all_combs[,i][2]])
# Pearson correlation coefficient
out_cor<-cor(binary_matrix[,all_combs[,i][1]],binary_matrix[,all_combs[,i][2]],method = "pearson")
pvalue<-out_test$p.value
return(list(from=pair_from,to=pair_to,pvalue=pvalue,coefficient=out_cor))
})
}
df_scores_temp<-do.call(rbind, lapply(list_pair_results, as.data.frame))
#-------------------------------- correct formatting ----------------------
# check for NAs, i.e., cases in which the sd is 0 (maybe both genes are always together,they are core genes)
out<-is.na(df_scores_temp$coefficient)
inds<-which(out==T)
if(length(inds)!=0){
df_scores_temp<-df_scores_temp[inds,]
}
return(df_scores_temp)
}
#' generate_final_df
#'
#' function to generate the final df of scores based on the weights of edges
#' generated by the generate_df_scores_ff() function.
#' It is called automatically by the geNet() function
#' @param binary_matrix binary matrix of presence/absences genes
#' @param n_cores number of cores to use. Default to 1.
#' @param test significance test to use.
#' * cor_test: use Pearson correlation test approach (faster)
#' * chisquare: use chi-square test approach (slower)
#' @return Object of class "ffdf" containing the connections between the nodes and relative weights
#' @export
#' @examples
#' \dontrun{generate_final_df(binary_matrix,progress=F,n_cores=1,test="cor_test")}
#' @import ff pbmcapply parallel
#' @importFrom utils combn
generate_final_df<-function(binary_matrix,n_cores=1,test="cor_test"){
################## generation of the combinations ####################
print("----- generation of the combinations ")
print("may take from few secs to several mins")
start_time <- Sys.time()
estimate_n_comb<-choose(ncol(binary_matrix),2)
all_combs_ff<-ff(combn(1:ncol(binary_matrix),2),dim=c(2,estimate_n_comb))
end_time <- Sys.time()
time_exec<-end_time - start_time
print("time to generate all combinations:")
print(time_exec)
n_combinations<-estimate_n_comb
print(paste0("n_combinations to analyze: ",n_combinations))
# estimate batches for parallel processing of the combinations
print("---- Estimate ideal n. batches to analyze combinations")
x<-1000
n_batches<-ceiling(n_combinations/x)
print(paste0("n. batches:",n_batches))
gc() # free unused memory
############################ generation of batches intervals #################
print("----- generation of batches intervals")
vec_n_combs<-ff(1:n_combinations)
vec_n_combs_interval<-seq(from=min(vec_n_combs), to = max(vec_n_combs), by = (max(vec_n_combs)-min(vec_n_combs))/n_batches)
vec_n_combs_interval<-as.integer(vec_n_combs_interval)
list_pairs<-lapply(1:length(vec_n_combs_interval),function(i){
if(i!=length(vec_n_combs_interval)){
first<-vec_n_combs_interval[i]
second<-vec_n_combs_interval[i+1]
vec<-c(first,second)
return(vec)
}else{
return(NULL)
}
})
# we remove the last null value
vec<-sapply(list_pairs,is.null)
ind<-which(vec==T)
list_pairs<-list_pairs[-ind]
gc() # free unused memory
################### (Parallel) processing of the batches ####################
print("---------- processing of the batches")
list_df_scores_temp<-pbmclapply(1:length(list_pairs),function(i){
print(paste0("Input batch number:",i))
current_pair<-list_pairs[[i]]
start<-current_pair[1]
end<-current_pair[2]
if(i!=1){
start<-start+1
}
inds_pair<-start:end
print(sprintf("from %d to %d comb",start,end))
current_combs_ff<-all_combs_ff[,inds_pair]
df_scores_temp<-generate_df_scores(all_combs = current_combs_ff,
binary_matrix=binary_matrix,
signif_test=test) # calculate phi pvalue score
return(df_scores_temp)
},mc.cores = n_cores,mc.preschedule = T)
gc()
df_scores<-do.call(rbind,list_df_scores_temp)
gc()
################### add color edges #####################
print("------------- add color edges")
color_edges_vec<-rep("blue",nrow(df_scores))
inds<-which(df_scores$coefficient<0)
color_edges_vec[inds]<-"orange"
df_scores$color_edge<-color_edges_vec
################## fixing extremely pvalue = 0 #############
# they are extremely low pvalues, near 0. The negative log pvalue would be undefined.
inds<-which(df_scores$pvalue==0)
df_scores$pvalue[inds]<-1e-200
############# conversion to ffdf object #################
df_scores$from<-factor(df_scores$from)
df_scores$to<-factor(df_scores$to)
df_scores$color_edge<-factor(df_scores$color_edge)
df_scores<-ffdf(from=ff(df_scores$from),to=ff(df_scores$to),
pvalue=ff(df_scores$pvalue),coefficient=ff(df_scores$coefficient),
color_edge=ff(df_scores$color_edge))
gc()
return(df_scores)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.