geNet: Generating large network of co-occurring genes

Documented in generate_df_scores generate_final_df

###############################################################################################
############################# functions to generate the weights of the edges  ##########################
###############################################################################################

#' generate_df_scores
#' 
#' function to generate the weights of the edges given the combinations. 
#' It is automatically called  by the geNet() function.
#' the correlation coefficient and its pvalue is calculated using the cor.test function from the stat package. 
#' See the relative documentation for additional details.
#' @param all_combs matrix of combinations
#' @param binary_matrix binary matrix of presence/absences genes 
#' @param signif_test significance test to use.
#' * cor_test: use Pearson correlation test approach (faster)
#' * chisquare: use chi-square test approach (slower)
#' Read vignettes for additional details about the significance tests used.
#' @return Object of class "dataframe" containing the connections between the nodes and relative weights
#' @importFrom  stats fisher.test
#' @importFrom  stats chisq.test
#' @importFrom  stats cor.test
#' @importFrom  stats cor
generate_df_scores<-function(all_combs,binary_matrix,signif_test="cor_test"){
  # fill the ff vectors
  print("generate columns df scores")
  if(signif_test=="chisquare"){
    list_pair_results<-lapply(1:ncol(all_combs),function(i){
      pair_from<-colnames(binary_matrix)[all_combs[,i][1]]
      pair_to<-colnames(binary_matrix)[all_combs[,i][2]]
      # Chisquare test for the pvalue
      table_freqs<-table(binary_matrix[,all_combs[,i][1]],binary_matrix[,all_combs[,i][2]])
      if(any(table_freqs)<5){
        out_test<-fisher.test(table_freqs)
      }else{
        out_test<-chisq.test(table_freqs,simulate.p.value = F, correct = F)
      }
      # Pearson correlation coefficient
      out_cor<-cor(binary_matrix[,all_combs[,i][1]],binary_matrix[,all_combs[,i][2]],method = "pearson")
      pvalue<-out_test$p.value
      return(list(from=pair_from,to=pair_to,pvalue=pvalue,coefficient=out_cor))
    })
  }else if(signif_test=="cor_test"){
    list_pair_results<-lapply(1:ncol(all_combs),function(i){
      pair_from<-colnames(binary_matrix)[all_combs[,i][1]]
      pair_to<-colnames(binary_matrix)[all_combs[,i][2]]
      # Chisquare test for the pvalue
      out_test<-cor.test(binary_matrix[,all_combs[,i][1]],binary_matrix[,all_combs[,i][2]])
      # Pearson correlation coefficient
      out_cor<-cor(binary_matrix[,all_combs[,i][1]],binary_matrix[,all_combs[,i][2]],method = "pearson")
      pvalue<-out_test$p.value
      return(list(from=pair_from,to=pair_to,pvalue=pvalue,coefficient=out_cor))
    })
  }
  
  df_scores_temp<-do.call(rbind, lapply(list_pair_results, as.data.frame))
  #-------------------------------- correct formatting ----------------------
  # check for NAs, i.e., cases in which the sd is 0 (maybe both genes are always together,they are core genes)
  out<-is.na(df_scores_temp$coefficient)
  inds<-which(out==T)
  if(length(inds)!=0){
    df_scores_temp<-df_scores_temp[inds,]
  }
  return(df_scores_temp)
}
#' generate_final_df
#'  
#' function to generate the final df of scores based on the weights of edges 
#' generated by the generate_df_scores_ff() function. 
#' It is called automatically by the geNet() function 
#' @param binary_matrix binary matrix of presence/absences genes 
#' @param n_cores number of cores to use. Default to 1.
#' @param test significance test to use.
#' * cor_test: use Pearson correlation test approach (faster)
#' * chisquare: use chi-square test approach (slower)
#' @return Object of class "ffdf" containing the connections between the nodes and relative weights 
#' @export
#' @examples 
#' \dontrun{generate_final_df(binary_matrix,progress=F,n_cores=1,test="cor_test")}
#' @import ff pbmcapply parallel 
#' @importFrom  utils combn
generate_final_df<-function(binary_matrix,n_cores=1,test="cor_test"){
  ################## generation of the combinations ####################
  print("----- generation of the combinations ")
  print("may take from few secs to several mins")
  start_time <- Sys.time()
  estimate_n_comb<-choose(ncol(binary_matrix),2)
  all_combs_ff<-ff(combn(1:ncol(binary_matrix),2),dim=c(2,estimate_n_comb))
  end_time <- Sys.time()
  time_exec<-end_time - start_time
  print("time to generate all combinations:")
  print(time_exec)
  n_combinations<-estimate_n_comb
  print(paste0("n_combinations to analyze: ",n_combinations))
  # estimate batches for parallel processing of the combinations
  print("---- Estimate ideal n. batches to analyze combinations")
  x<-1000
  n_batches<-ceiling(n_combinations/x)
  print(paste0("n. batches:",n_batches))
  gc() # free unused memory
  ############################ generation of batches intervals #################
  print("----- generation of batches intervals")
  vec_n_combs<-ff(1:n_combinations)
  vec_n_combs_interval<-seq(from=min(vec_n_combs), to = max(vec_n_combs), by = (max(vec_n_combs)-min(vec_n_combs))/n_batches)
  vec_n_combs_interval<-as.integer(vec_n_combs_interval)
  list_pairs<-lapply(1:length(vec_n_combs_interval),function(i){
    if(i!=length(vec_n_combs_interval)){
      first<-vec_n_combs_interval[i]
      second<-vec_n_combs_interval[i+1]
      vec<-c(first,second)
      return(vec)
    }else{
      return(NULL)
    }
  })
  # we remove the last null value
  vec<-sapply(list_pairs,is.null)
  ind<-which(vec==T)
  list_pairs<-list_pairs[-ind]
  gc() # free unused memory
  ################### (Parallel) processing of the batches ####################
  print("---------- processing of the batches")
  list_df_scores_temp<-pbmclapply(1:length(list_pairs),function(i){
    print(paste0("Input batch number:",i))
    current_pair<-list_pairs[[i]]
    start<-current_pair[1]
    end<-current_pair[2]
    if(i!=1){
      start<-start+1
    }
    inds_pair<-start:end
    print(sprintf("from %d to %d comb",start,end))
    current_combs_ff<-all_combs_ff[,inds_pair]
    df_scores_temp<-generate_df_scores(all_combs = current_combs_ff,
                                       binary_matrix=binary_matrix,
                                       signif_test=test) # calculate phi pvalue score
    
    return(df_scores_temp)
  },mc.cores = n_cores,mc.preschedule = T)
  gc()
  df_scores<-do.call(rbind,list_df_scores_temp)
  gc() 
  ################### add color edges #####################
  print("------------- add color edges")
  color_edges_vec<-rep("blue",nrow(df_scores))
  inds<-which(df_scores$coefficient<0)
  color_edges_vec[inds]<-"orange"
  df_scores$color_edge<-color_edges_vec
  ################## fixing extremely pvalue = 0 #############
  # they are extremely low pvalues, near 0. The negative log pvalue would be undefined.
  inds<-which(df_scores$pvalue==0)
  df_scores$pvalue[inds]<-1e-200
  ############# conversion to ffdf object #################
  df_scores$from<-factor(df_scores$from)
  df_scores$to<-factor(df_scores$to)
  df_scores$color_edge<-factor(df_scores$color_edge)
  df_scores<-ffdf(from=ff(df_scores$from),to=ff(df_scores$to),
                  pvalue=ff(df_scores$pvalue),coefficient=ff(df_scores$coefficient),
                  color_edge=ff(df_scores$color_edge))
  gc()
  return(df_scores)
}