R/MERIT_functions.R

Defines functions extract_TFBS_positions extract_TFBS_positions generate_ensemble_grn_based_on_stability_selection run_grn_stability_selection run_MERIT format_results do_master_regulator_hierarchy_inference identify_regulatory_hierachy identify_bottom_tier_masterRegulators annotate_links_with_treatments_and_tissues perform_treatment_and_tissue_filtering evaluate_tissues_per_treatment do_treatment_filtering_single_link do_treatment_filtering_all_links transcriptionFactorBindingInference transcriptionFactorBindingInference_with_TFBStools load_datasets load_lead_support_grn compute_ensemble_regression_with_montecarlo_based_stability_sele compute_linearRegressionWithStabilitySelection_based_GRN compute_randomforest_based_GRN prepare matrix_to_dataframe grn_dataframe_to_matrix save_pheatmap_pdf run_comparative_evaluation stabilitySelection install_and_load_libraries

Documented in annotate_links_with_treatments_and_tissues compute_ensemble_regression_with_montecarlo_based_stability_sele do_master_regulator_hierarchy_inference format_results install_and_load_libraries load_datasets load_lead_support_grn run_MERIT stabilitySelection transcriptionFactorBindingInference transcriptionFactorBindingInference_with_TFBStools

#' install dependencies
#'
#'
#' @keywords 
#' @export
#' @examples
install_and_load_libraries <- function(){
  
  # CRAN
  
  list.of.packages <- c("ggplot2", "reshape2","doParallel", "pheatmap", "igraph", "seqinr", "networkD3", "taRifx", "parmigene", "ranger", "yaml", "plotROC")
  new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
  if(length(new.packages)) install.packages(new.packages)
  
  # bioconductor
  source("https://bioconductor.org/biocLite.R")
  
  list.of.packages <- c("Biostrings", "TFBSTools","seqLogo", "PWMEnrich", "BSgenome.Athaliana.TAIR.TAIR9")
  new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
  if(length(new.packages)) biocLite(new.packages)
  
  
  library(TFBSTools)
  # library(BSgenome.Athaliana.TAIR.TAIR9)
  
  library(networkD3)
  library(reshape2)
  library(foreach)
  library(doParallel)
  library(igraph)
  library(seqinr)
  library(Biostrings)
  library(PWMEnrich)
  library(seqLogo)
  
  library(ggplot2) 
  library(pheatmap)
  library(parmigene)
  library(ranger)
  
  library(taRifx)
  library(yaml)
  
  library(plotROC)
  
  
  
}


#' stabilitySelectionProcedure
#'
#'
#' @keywords 
#' @export
#' @examples
stabilitySelection <- function(x,y,nbootstrap=500,nsteps=5,alpha=0.2,plotme=FALSE){
  
  dimx <- dim(x)
  n <- dimx[1]
  p <- dimx[2]
  halfsize <- as.integer(n/2)
  freq <- matrix(0,nsteps+1,p)
  
  for (i in seq(nbootstrap)) {
    
    # Randomly reweight each variable
    xs <- t(t(x)*runif(p,alpha,1))
    
    # Ramdomly split the sample in two sets
    perm <- sample(dimx[1])
    i1 <- perm[1:halfsize]
    i2 <- perm[(halfsize+1):n]
    
    # run the randomized lasso on each sample and check which variables are selected
    r <- lars(xs[i1,],y[i1], max.steps=nsteps,normalize=FALSE, use.Gram=FALSE) #, eps = .Machine$double.eps)
    freq <- freq + abs(sign(coef.lars(r)))
    r <- lars(xs[i2,],y[i2],max.steps=nsteps,normalize=FALSE, use.Gram=FALSE)
    freq <- freq + abs(sign(coef.lars(r)))
    
  }
  
  # normalize frequence in [0,1]
  freq <- freq/(2*nbootstrap)
  
  if (plotme) {
    matplot(freq,type='l',xlab="LARS iteration",ylab="Frequency")
  }
  
  # the final stability score is the maximum frequency over the steps
  result <- apply(freq,2,max)
}

run_comparative_evaluation <- function(l.data, 
                                       l.results){
  
  
  
  
  n.min_hit_links = 5
  
  
  df.transcriptionFactorAnnotation=l.data$df.transcriptionFactorAnnotation
  df.geneGroups=l.data$df.geneGroups
  tb.geneGroups=l.data$tb.geneGroups
  v.geneGroups=l.data$v.geneGroups
  l.geneGroups=l.data$l.geneGroups 
  v.genes_geneGroups = unique(unlist(l.geneGroups))
  
  l.res.grn = l.results$l.res.grn
  l.res.grn_tfbs = l.results$l.res.grn_tfbs
  l.res.link_annotation = l.results$l.res.link_annotation
  l.res.MR_hierarchy = l.results$l.res.MR_hierarchy
  
  
  df.transcriptionFactorAnnotation = subset(df.transcriptionFactorAnnotation, df.transcriptionFactorAnnotation$with_geneExpression == "yes")
  df.geneGroups = subset(df.geneGroups, df.geneGroups$with_geneExpression == "yes")
  v.tfs = unique(df.transcriptionFactorAnnotation$TF_ID)
  

  ####  
  
  df.ATRM <- read.table("F:/junkDNA.ai/MERIT/datasets/evaluation/Regulations_in_ATRM.txt", header = TRUE, sep = "\t", fill = TRUE, stringsAsFactors = FALSE)[,1:2]
  names(df.ATRM) <- c("TF", "Target")
  
  df.AGRIS <- read.table("F:/junkDNA.ai/MERIT/datasets/evaluation/reg_net.txt", header = FALSE, sep = "\t", fill = TRUE, stringsAsFactors = FALSE, quote = "")
  df.AGRIS$V2 <- toupper(df.AGRIS$V2)
  df.AGRIS$V5 <- toupper(df.AGRIS$V5)
  df.AGRIS <- subset(df.AGRIS, df.AGRIS$V8 == "Confirmed")
  df.AGRIS <- df.AGRIS[,c(2,5)]
  names(df.AGRIS) <- c("TF", "Target")
  #   tb.tfs <- table(df.AGRIS$TF)
  #   tb.tfs <- tb.tfs[tb.tfs <= 1000] # test this
  #   df.AGRIS <- subset(df.AGRIS, df.AGRIS$TF %in% names(tb.tfs))
  # goldstandard - dataframe with two columns, labelled - TF, Target
  
  df.AGRIS <- subset(df.AGRIS, df.AGRIS$TF != df.AGRIS$Target)
  m.AGRIS <- acast(df.AGRIS, TF~Target)
  m.AGRIS[is.na(m.AGRIS)] <- 0
  m.AGRIS[m.AGRIS != 0] <- 1
  class(m.AGRIS) <- "numeric"
  
  
  df.ATRM <- subset(df.ATRM, df.ATRM$TF != df.ATRM$Target)
  m.ATRM <- acast(df.ATRM, TF~Target)
  m.ATRM[is.na(m.ATRM)] <- 0
  m.ATRM[m.ATRM != 0] <- 1
  class(m.ATRM) <- "numeric"
  
  
  df.AGRIS_ATRM <- unique(rbind(df.AGRIS, df.ATRM))
  # removing self regulation
  df.AGRIS_ATRM <- subset(df.AGRIS_ATRM, df.AGRIS_ATRM$TF != df.AGRIS_ATRM$Target)
  m.AGRIS_ATRM <- acast(df.AGRIS_ATRM, TF~Target)
  m.AGRIS_ATRM[is.na(m.AGRIS_ATRM)] <- 0
  m.AGRIS_ATRM[m.AGRIS_ATRM != 0] <- 1
  class(m.AGRIS_ATRM) <- "numeric"
  
  df.Chip500 = read.table("F:/junkDNA.ai/MERIT/datasets/evaluation/Chip500.txt", sep = "\t", stringsAsFactors = F, header = T)
  m.Chip500 <- acast(df.Chip500, TF.Gene.ID~Target.Gene.ID)
  m.Chip500[is.na(m.Chip500)] <- 0
  m.Chip500[m.Chip500 != 0] <- 1
  class(m.Chip500) <- "numeric"  
  
  df.DE = read.table("F:/junkDNA.ai/MERIT/datasets/evaluation/DE.txt", sep = "\t", stringsAsFactors = F, header = T)
  m.DE <- acast(df.DE, TF.Gene.ID~Target.Gene.ID)
  m.DE[is.na(m.DE)] <- 0
  m.DE[m.DE != 0] <- 1
  class(m.DE) <- "numeric"  
  
  ### 
  
  # df.aranet <- read.table("datasets/evaluation/AraNet_V2.txt", header = FALSE, sep = "\t")
  # names(df.aranet) <- c("G1", "G2", "val")
  # m.aranet <- acast(df.aranet, G1~G2, value.var = "val")
  # m.aranet[is.na(m.aranet)] <- 0
  # m.aranet[m.aranet != 0] <- 1
  # class(m.aranet) <- "numeric"   
  
  df.dna = read.table("F:/junkDNA.ai/MERIT/datasets/dna_binding_motifs/df.grn.dapSeq.csv", header = T, stringsAsFactors = F, sep = ";")
  m.dna <- acast(df.dna, TF~Target)
  m.dna[is.na(m.dna)] <- 0
  m.dna[m.dna != 0] <- 1
  class(m.dna) <- "numeric"
  
  # df.Nature2015 = read.table("datasets/evaluation/Nature2015.txt", header = T, stringsAsFactors = F, sep = "\t", fill = T)
  # m.Nature2015 <- acast(df.Nature2015, Transcription.factor~Promoter)
  # m.Nature2015[is.na(m.Nature2015)] <- 0
  # m.Nature2015[m.Nature2015 != 0] <- 1
  # class(m.Nature2015) <- "numeric"
  
  
  df.NCA = read.csv("F:/junkDNA.ai/MERIT/datasets/evaluation/NCA.csv", stringsAsFactors = F, sep = ";")
  df.NCA <- subset(df.NCA, df.NCA$TFGenes %in% v.tfs & df.NCA$TG %in% v.genes)
  m.NCA <- acast(df.NCA, TFGenes~TG)
  m.NCA[is.na(m.NCA)] <- 0
  m.NCA[m.NCA != 0] <- 1
  class(m.NCA) <- "numeric"
  
  
  df.pcc_pre_NCA = read.csv("F:/junkDNA.ai/MERIT/datasets/evaluation/pcc_pre_NCA.csv", stringsAsFactors = F, sep = ",")[,1:3]
  df.pcc_pre_NCA <- subset(df.pcc_pre_NCA, df.pcc_pre_NCA$TFid %in% v.tfs & df.pcc_pre_NCA$TGid %in% v.genes)
  m.pcc_pre_NCA <- acast(df.pcc_pre_NCA, TFid~TGid, value.var = "PCC")
  m.pcc_pre_NCA[is.na(m.pcc_pre_NCA)] <- 0
  # m.pcc_pre_NC[m.pcc_pre_NC != 0] <- 1
  class(m.pcc_pre_NCA) <- "numeric"
  m.pcc_pre_NCA <- abs(m.pcc_pre_NCA)
  
  
  
  df.Vermeissen2014 = read.table("F:/junkDNA.ai/MERIT/datasets/evaluation/Vermeissen2014.txt", header = T, stringsAsFactors = F, sep = "\t")
  df.Vermeissen2014 <- subset(df.Vermeissen2014, df.Vermeissen2014$TF %in% v.tfs & df.Vermeissen2014$Target.gene %in% v.genes)
  m.Vermeissen2014 <- acast(df.Vermeissen2014, TF~Target.gene, value.var = "Rank")
  m.Vermeissen2014[is.na(m.Vermeissen2014)] <- 0
  # m.Vermeissen2014[m.Vermeissen2014 != 0] <- 1
  class(m.Vermeissen2014) <- "numeric"
  
  
  # df.GGM2013 = read.table("datasets/evaluation/GGM2013.txt", header = T, stringsAsFactors = F, sep = "\t")
  # m.GGM2013 <- acast(df.GGM2013, Gene_A_Name~Gene_B_Name)
  # m.GGM2013[is.na(m.GGM2013)] <- 0
  # m.GGM2013[m.GGM2013 != 0] <- 1
  # class(m.GGM2013) <- "numeric"
  
  # df.Carrera2009 = read.table("datasets/evaluation/Carrera2009.txt", header = F, stringsAsFactors = F, sep = "\t")
  # df.Carrera2009 <- subset(df.Carrera2009, df.Carrera2009$V1 %in% v.tfs.on_chip & df.Carrera2009$V2 %in% v.tgs.on_chip)
  # m.Carrera2009 <- acast(df.Carrera2009, V1~V2)
  # m.Carrera2009[is.na(m.Carrera2009)] <- 0
  # m.Carrera2009[m.Carrera2009 != 0] <- 1
  # class(m.Carrera2009) <- "numeric"
  
  df.CNS2014 = read.table("F:/junkDNA.ai/MERIT/datasets/evaluation/CNS2014.txt", header = T, stringsAsFactors = F, sep = "\t")[,1:2]
  df.CNS2014 <- subset(df.CNS2014, df.CNS2014$Transcription.factor %in% v.tfs & df.CNS2014$Target.gene %in% v.genes)
  m.CNS2014 <- acast(df.CNS2014, Transcription.factor~Target.gene)
  m.CNS2014[is.na(m.CNS2014)] <- 0
  m.CNS2014[m.CNS2014 != 0] <- 1
  class(m.CNS2014) <- "numeric"
  
  
  df.CNS_PCC2014 = read.table("F:/junkDNA.ai/MERIT/datasets/evaluation/CNS_PCC2014.txt", header = T, stringsAsFactors = F, sep = "\t")
  df.CNS_PCC2014 <- subset(df.CNS_PCC2014, df.CNS_PCC2014$Transcription.factor %in% v.tfs & df.CNS_PCC2014$Target.gene %in% v.genes)
  df.CNS_PCC2014.unique = unique(df.CNS_PCC2014[,1:2])
  df.CNS_PCC2014.unique["PCC"] = 0
  
  for(i in 1:nrow(df.CNS_PCC2014.unique)){
    
    TF.i = df.CNS_PCC2014.unique$Transcription.factor[i]
    TG.i = df.CNS_PCC2014.unique$Target.gene[i]
    
    df.CNS_PCC2014.i = subset(df.CNS_PCC2014, df.CNS_PCC2014$Transcription.factor %in% TF.i & df.CNS_PCC2014$Target.gene %in% TG.i)
    df.CNS_PCC2014.unique$PCC[i] = max(abs(df.CNS_PCC2014.i$PCC))
    
  }
  
  m.CNS_PCC2014 <- acast(df.CNS_PCC2014.unique, Transcription.factor~Target.gene, value.var = "PCC")
  m.CNS_PCC2014[is.na(m.CNS_PCC2014)] <- 0
  #m.CNS_PCC2014[m.CNS_PCC2014 != 0] <- 1
  class(m.CNS_PCC2014) <- "numeric"
  
  
  l.gn.comparison <- list(m.ATRM,
                          m.AGRIS,
                          m.dna,
                          m.Chip500,
                          m.DE)
  
  names(l.gn.comparison) <- c("ATRM", "AtRegNet", "DapSeq",  "TF2Network Benchmark 1", "TF2Network Benchmark 2")
  
  
  
  # # subset to the genes in question 
  # for(i in 1:length(l.gn.comparison)){
  #   tfs.i = rownames(l.gn.comparison[[i]])
  #   tgs.i = colnames(l.gn.comparison[[i]])
  #   l.gn.comparison[[i]] = l.gn.comparison[[i]][intersect(rownames(m.MERIT_grn), tfs.i), 
  #                                               intersect(colnames(m.MERIT_grn), tgs.i)]
  # }
  
  
  v.methods <- c("CNS2014", "CNS_PCC2014", "Vermeissen2014",  "Barah 2015 PCC", "Barah 2015 NCA",
                 "Linear regression", "CLR", "Random Forest Regression", "lead support grn", "DNA binding", "MERIT grn inference")
  
  l.grn <- vector(mode = "list", length = length(v.methods))
  
  l.grn[[1]] <- m.CNS2014
  l.grn[[2]] <- m.CNS_PCC2014
  l.grn[[3]] <- m.Vermeissen2014
  l.grn[[4]] <- m.pcc_pre_NCA
  l.grn[[5]] <- m.NCA
  
  l.grn[[6]] = l.res.grn$m.lr_grn
  l.grn[[7]] = l.res.grn$m.clr_grn
  l.grn[[8]] = l.res.grn$m.rf_grn
  l.grn[[9]] = l.res.grn$m.lead_support.grn

  l.grn[[10]] = l.res.grn_tfbs$m.motifNet
  l.grn[[11]] = l.res.grn_tfbs$m.lead_support_w_motif.grn
  

  
  
# 
#   l.res.grn = l.results$l.res.grn
#   l.res.grn_tfbs = l.results$l.res.grn_tfbs
#   l.res.link_annotation = l.results$l.res.link_annotation
#   l.res.MR_hierarchy = l.results$l.res.MR_hierarchy
#   
#   
  for(i in 1:length(l.grn)){

    l.grn[[i]] = as.matrix(l.grn[[i]])
    # l.grn[[i]] = as.matrix(l.grn[[i]])[, intersect(colnames(l.grn[[i]]), v.genes_geneGroups)]
  }
    
  names(l.grn) = v.methods
  
  # l.grn.selected = l.grn
  
  ####
  # 
  # l.grn <- vector(mode = "list", length = length(v.methods))
  # for(i in 1:length(l.grn.selected)){
  #   
  #   tgs = colnames(l.grn[[i]])
  #   tgs = tgs[!tgs %in% ""]
  #   
  #   l.grn[[i]] = l.grn[[i]]  # [, intersect(tg.selection, tgs)]
  #   l.grn.selected[[i]] <- l.grn[[i]] #[names(which(rowSums(l.grn[[i]]) > 0)), names(which(colSums(l.grn[[i]]) > 0))]#   * m.motifNet[names(which(rowSums(m.rf_w_treatments_w_motifs) > 0)), names(which(colSums(m.rf_w_treatments_w_motifs) > 0))]
  #   # l.grn.selected[[i]] <- l.grn[[i]][intersect(rownames(l.grn[[i]]), rownames(m.AGRIS_ATRM)),intersect(colnames(l.grn[[i]]),colnames(m.AGRIS_ATRM))]  #[names(which(rowSums(m.rf_w_treatments_w_motifs) > 0)), names(which(colSums(m.rf_w_treatments_w_motifs) > 0))]
  #   #l.grn.selected[[i]][l.grn.selected[[i]] > 0] <- 1
  # }
  # names(l.grn.selected) <- v.methods
  # 
  
  df.comparativeEvaluation.total <- c()
  for(d in 1:length(l.gn.comparison)){
    
    m.gn.comparison <- l.gn.comparison[[d]]
    m.gn.comparison[m.gn.comparison > 0] <- 1
    
    df.comparativeEvaluation <- data.frame(method = v.methods, 
                                           tfs = rep(0,length(v.methods)),
                                           tgs = rep(0,length(v.methods)),
                                           hits = rep(0,length(v.methods)),
                                           links = rep(0,length(v.methods)),
                                           foldchange = rep(0,length(v.methods)), 
                                           pval = rep(1,length(v.methods)),
                                           dataset = names(l.gn.comparison)[d]
    )
    
    
    for(i in 1:length(v.methods)){
      
      tfs <- intersect(rownames(l.grn[[i]]), rownames(m.gn.comparison))
      tgs <- intersect(colnames(l.grn[[i]]), colnames(m.gn.comparison))
      
      fc = -1
      p.val = -1
      hitInSample = -1
      sampleSize = -1
      
      if(length(tfs) >= 2 & length(tgs) >= 2){
        
        m.grn.select <- l.grn[[i]][tfs, tgs]
        m.grn.select <- m.grn.select / max(m.grn.select)
        m.gn.comparison.select <- m.gn.comparison[tfs, tgs]
        
        m.grn.select[m.grn.select > 0] <- 1
        m.grn.select[m.grn.select <= 0] <- 0
        
        # optimize recovery or as in 
        n.links.total <- dim(m.grn.select)[1] * dim(m.grn.select)[2]
        fc <- (sum(m.grn.select * m.gn.comparison.select) / sum(m.grn.select)) / (sum(m.gn.comparison.select) / n.links.total)
        
        hitInSample <- sum(m.grn.select * m.gn.comparison.select) 
        sampleSize <- sum(m.grn.select)
        
        hitInPop <- sum(m.gn.comparison.select)
        failInPop <- (n.links.total - hitInPop)
        
        p.val <- phyper(hitInSample-1, hitInPop, failInPop, sampleSize, lower.tail= FALSE)
        
        
      }
      ###
      
      df.comparativeEvaluation$foldchange[i] <- fc
      df.comparativeEvaluation$pval[i] <- p.val
      df.comparativeEvaluation$tfs[i] <- length(tfs)
      df.comparativeEvaluation$tgs[i] <- length(tgs)
      df.comparativeEvaluation$hits[i] <- hitInSample
      df.comparativeEvaluation$links[i] <- sampleSize
    }
    
    
    df.comparativeEvaluation.total <- rbind(df.comparativeEvaluation.total, df.comparativeEvaluation)
    
    
  }
  print(df.comparativeEvaluation.total)
  
  
  
  #n.min_hit_links = 5
  
  
  
  write.csv2(df.comparativeEvaluation.total, "Supp/df.GRN_Comparative_Evaluation_on_biological_datasets_subset_targets_and_tfs.csv",row.names = FALSE)
  
  
  
  
  
  
  
  ####
  
  # l.grn.selected <- vector(mode = "list", length = length(v.methods))
  # for(i in 1:length(l.grn.selected)){
  #   
  #   tgs = colnames(l.grn[[i]])
  #   tgs = tgs[!tgs %in% ""]
  #   
  #   l.grn[[i]] = l.grn[[i]]# [, intersect(tg.selection, tgs)]
  #   l.grn.selected[[i]] <- l.grn[[i]] #[names(which(rowSums(l.grn[[i]]) > 0)), names(which(colSums(l.grn[[i]]) > 0))]#   * m.motifNet[names(which(rowSums(m.rf_w_treatments_w_motifs) > 0)), names(which(colSums(m.rf_w_treatments_w_motifs) > 0))]
  #   # l.grn.selected[[i]] <- l.grn[[i]][intersect(rownames(l.grn[[i]]), rownames(m.AGRIS_ATRM)),intersect(colnames(l.grn[[i]]),colnames(m.AGRIS_ATRM))]  #[names(which(rowSums(m.rf_w_treatments_w_motifs) > 0)), names(which(colSums(m.rf_w_treatments_w_motifs) > 0))]
  #   #l.grn.selected[[i]][l.grn.selected[[i]] > 0] <- 1
  # }
  # names(l.grn.selected) <- v.methods
  # 
  # 
  
  library(plotROC)
  
  
  
  
  
  l.grn.selected <- l.grn[c(2,3,4,6,7,8,9, 11)]
  v.methods = names(l.grn.selected)
  
  dataset <- c()
  
  p.plots <- vector(mode = "list", length = length(l.gn.comparison))
  
  for(d in 1:length(l.gn.comparison)){
    
    m.gn.comparison <- l.gn.comparison[[d]]
    
    m.gn.comparison[m.gn.comparison > 0] <- 1
    df.idx.grn <- which(m.gn.comparison > 0, arr.ind = TRUE)
    df.gn.comparison <- data.frame(TF = rownames(m.gn.comparison)[df.idx.grn[,1]], TG = colnames(m.gn.comparison)[df.idx.grn[,2]], stringsAsFactors = FALSE)
    names(df.gn.comparison) <- c("TF","Target")
    
    n.selection = max(unlist(lapply(l.grn.selected, function(m) length(which(m > 0)))))
    
    l.predictions <- vector(mode = "list", length = length(l.grn.selected))
    l.performance <- vector(mode = "list", length = length(l.grn.selected))
    l.auc <- vector(mode = "list", length = length(l.grn.selected))
    
    l.D <- vector(mode = "list", length = length(l.grn.selected))
    l.M <- vector(mode = "list", length = length(l.grn.selected))
    
    for(i in 1:length(l.grn.selected)){      
      
      m.grn <- l.grn.selected[[i]]
      m.grn <- m.grn / max(m.grn) # map into range 0 - 1
      
      df.grn <- melt(m.grn, stringsAsFactors = FALSE)
      names(df.grn) <- c("TF", "Target","val")
      
      df.grn <- subset(df.grn, df.grn$val > 0)
      df.grn <- subset(df.grn, as.character(df.grn$TF) != as.character(df.grn$Target))
      df.grn <- df.grn[order(-df.grn$val),] # order by rank (highest first)
      
      n.links <- min(nrow(df.grn), n.selection) # select to top n hits (as predicted by MERIT)
      df.grn <- df.grn[1:n.links,]# nrow(df.regulatoryNetwork),]
      
      rownames(df.grn) <- seq(1:nrow(df.grn))
      
      df.grn.gs <- rbind(df.grn[,1:2], df.gn.comparison)
      i.set <- which(duplicated(df.grn.gs) == TRUE) # identify hits with GS
      
      n.add_links <- n.selection - n.links
      
      df.grn["class"] <- 0
      df.grn[df.grn$TF %in% df.grn.gs[i.set,1] & df.grn$Target %in% df.grn.gs[i.set,2], ]$class <- 1
      
      l.D[[i]] <- df.grn$class
      l.M[[i]] <- df.grn$val
      
    }
    
    rm(m.gn.comparison)
    
    
    
    D.rand <- rbinom(length(l.D[[length(v.methods)]]), 1, .5)
    M.rand <- rep(0, length(l.D[[length(v.methods)]]))
    Z.rand <- rep("random", length( l.D[[length(v.methods)]]))
    
    n_hits = numeric(length(v.methods))
    for(i in 1:length(v.methods)){
      n_hits[i] = sum(l.D[[i]])
    }
    idx = which(n_hits >= n.min_hit_links)
    
    n_hits.selection = n_hits[idx]
    v.methods.selection = v.methods[idx]
    v.colors <- c( "grey", "green", "blue", "pink", "yellow", "gray", "orange", "darkred")
    names(v.colors) <- v.methods
    
    v.colors.selection = c("black", v.colors[idx])
    names(v.colors.selection) = c("random", v.methods.selection)
    
    D <- D.rand
    M <- M.rand
    Z <- Z.rand
    
    for(i in 1:length(v.methods.selection)){
      D <- c(D, l.D[[i]])
      M <- c(M, l.M[[i]])
      Z <- c(Z, rep(v.methods.selection[i], length(l.D[[i]])))
      
      n_hits[i] = sum(l.D[[i]])
    }
    
    rocdata <- data.frame(D = D,
                          M = M,
                          method = Z)
    rocdata$method <- factor(rocdata$method, levels = c("random", v.methods.selection))
    rocdata["colors"] <- v.colors.selection[rocdata$method]
    
    ggroc5 <- ggplot(rocdata, aes(m = M, d = D, color = method)) + geom_roc(n.cuts = 0) + theme_bw() + scale_color_manual(values=v.colors.selection) + scale_size_manual(values = rep(0.5,6))  + scale_linetype_manual(values = c("dashed", rep("solid", 5))) +  ggtitle(names(l.gn.comparison)[d]) + labs(x="false positive rate",y="true positive rate") +
      theme(plot.margin=unit(c(0.5,0.5,1.5,1),"cm")) #+ geom_rocci()
    
    
    
    p.plots[[d]] <- ggroc5
    
    plot(ggroc5)
    print(calc_auc(ggroc5))    
    
    
    
    dataset = rbind(dataset , data.frame(dataset = rep(names(l.gn.comparison)[d], length(v.methods.selection) + 1 ), method = c("Random", v.methods.selection), hits = c(0, n_hits.selection), auroc  = calc_auc(ggroc5)[,3]))
    
    # pdf("figures_and_data_output/auroc_merit.pdf", 10, 4)
    # ggroc5
    # dev.off()
    
    # calc_auc(ggroc5)
    
  }
  
  p <- multiplot(p.plots[[1]], p.plots[[2]], p.plots[[3]], p.plots[[4]],p.plots[[5]], cols=3)
  filename <- paste("Supp//GRN_inference_performance_comparision_MIN_LINKS.pdf", sep = "")
  
  pdf(filename, width=20, height=10)
  p
  dev.off()  
  
  
  filename <- paste("Supp//df.GRN_inference_performance_comparision_AUROC_MIN_LINKS.csv", sep = "")
  write.csv2(dataset, filename, row.names = FALSE)
  
  
  
  # 
  
  # library
  
  
  df.auroc = dataset
  idx = which(df.auroc$hits <= n.min_hit_links & df.auroc$method != "Random")
  df.auroc$auroc[idx] = 0
  
  # create a dataset
  
  df.auroc$method <- factor(df.auroc$method, levels = c("Random", v.methods))
  
  # Grouped
  v.colors <- c("black", "cyan", "green", "blue", "pink", "yellow", "gray", "purple", "orange", "darkred")
  names(v.colors) <- c("random", v.methods)
  plot_auroc = ggplot(df.auroc, aes(fill=method, y=auroc, x=dataset)) + geom_bar(position="dodge", stat="identity") + theme_bw() + scale_fill_manual(values=v.colors)
  
  
  return(list(plot_auroc=plot_auroc, df.auroc=df.auroc, plot_auroc_curves = p, df.comparativeEvaluation.total=df.comparativeEvaluation.total ))
  
  # 
  # , fill = v.colors
  # 
  # dataset = read.csv2(filename, stringsAsFactors = F)
  # 
  # message("AUPR curve ")
  # 
  # 
  # 
  # df.comparativeEvaluation.total["AUROC"] = "-"
  # for(i in 1:nrow(df.comparativeEvaluation.total)){
  #   
  #   auroc = subset(dataset, dataset == as.character(df.comparativeEvaluation.total$dataset[i]) & 
  #                  method == as.character(df.comparativeEvaluation.total$method[i]))
  #   
  #   if(nrow(auroc) > 0)
  #     df.comparativeEvaluation.total$AUROC[i] = auroc$auroc[1]
  #   
  # }
  # 
  # df.comparativeEvaluation.total = df.comparativeEvaluation.total[,c(1,6,2,3,4,5,7)]
  # df.comparativeEvaluation.total$hits = as.integer(df.comparativeEvaluation.total$hits)
  # df.comparativeEvaluation.total$links = as.integer(df.comparativeEvaluation.total$links)
  # df.comparativeEvaluation.total$pval =  as.character(df.comparativeEvaluation.total$pval)
  # install.packages("xtable")
  #library(xtable)
  # return(list(auroc=calc_auc(ggroc5), p = ggroc5, df.gs_enrichment = df.gs_enrichment))
  # 
  
  #xtable(df.comparativeEvaluation.total)
  
  rm(l.grn)
  rm(l.grn.selected)
  rm(l.gn.comparison)
  
  
  
}

save_pheatmap_pdf <- function(x, filename, width=7, height=7) {
  stopifnot(!missing(x))
  stopifnot(!missing(filename))
  pdf(filename, width=width, height=height)
  grid::grid.newpage()
  grid::grid.draw(x$gtable)
  dev.off()
}

grn_dataframe_to_matrix <- function(df, val = "val"){
  m <- acast(df, TF~Target, value.var = val)
  m[is.na(m)] <- 0
  class(m) <- "numeric"
  return(m)
}

matrix_to_dataframe <- function(m){
  df <- melt(m, stringsAsFactors = FALSE)
  return(df)
}

prepare = function(){
  
  # transcription factor annotations
  
  tf.families = read.table("datasets/gene_annotation/Thale_cress-transcription factor.txt", header = FALSE, fill = TRUE, stringsAsFactors = FALSE)[,1:2] 
  tf.families$V1 <- gsub("\\..*", "",tf.families$V1)
  tf.families <- unique(tf.families)
  
  v.tf_families <- tf.families$V2
  names(v.tf_families) <- tf.families$V1
  
  v.tf_families[v.tf_families == "AP2/ERF-AP2"] <- "AP2_ERF-AP2"
  v.tf_families[v.tf_families == "AP2/ERF-ERF"] <- "AP2_ERF-ERF"
  v.tf_families[v.tf_families == "AP2/ERF-RAV"] <- "AP2_ERF-RAV"
  
  # add non present in thale cress - manual
  v.tffams.add <- c("C2H2", "zf-HD", "other", "other", "zf-HD", "other", "zf-HD")
  names(v.tffams.add) <- c("AT4G26030", "AT5G08750", "AT5G59430", "AT3G46590", "AT4G38170", "AT3G21890", "AT3G42860")
  
  v.tf_families <- (c(v.tf_families, v.tffams.add))
  # v.fams <- unique(tb.fams)
  v.tfs <- unique(names(v.tf_families))
  
  
  write.table(df.transcriptionFactorAnnotation, "data/df.transcriptionFactorAnnotation.txt", row.names = F, sep = "\t")
  
  
  
  
  # metabolic doains and enzymes
  df.domains <- read.table(filename.metabolicDomain_annotation, header = TRUE, stringsAsFactors = FALSE, sep = "\t")
  v.domains <- colnames(df.domains[,5:20])
  v.domains <- gsub(".Metabolism", "", v.domains)
  v.domains <- gsub("\\.", " ", v.domains)
  v.domains <- v.domains[!v.domains %in% c("Macromolecules", "Unclassified")]
  v.domains <- tolower(v.domains)
  
  df.domains <- df.domains[,c(1,5,6,7,8,9,10,11,12,13,15,16,17,18,19)]
  names(df.domains) <- c("Gene_ID",v.domains)
  tb.domains <- colSums(df.domains[,2:15])
  v.ath_coding_genes <- unique(df.domains$Gene_ID)
  v.enz <- df.domains$Gene_ID[which(rowSums(df.domains[,2:15]) > 0)]
  
  df.domains = subset(df.domains, df.domains$Gene_ID %in% v.enz)
  
  
  write.table(df.domains, "data/df.enzymes_w_metabolic_domains.txt", row.names = F,  sep = "\t")
  
  
}
  

compute_randomforest_based_GRN <- function(mat.expression, k="sqrt", nb.trees=10000, set.regulators = NULL, set.genes = NULL, seed=1234, importance.measure = "impurity", n.cpus = 5){
  
  if (!is.null(seed)) {
    set.seed(seed)
  }
  
  mat.expression.norm <- t(mat.expression)
  mat.expression.norm <- apply(mat.expression.norm, 2, function(x) { (x - mean(x)) / sd(x) } )
  n.genes <- dim(mat.expression.norm)[2]
  genes<- colnames(mat.expression.norm)
  n.samples <- dim(mat.expression.norm)[1]
  
  if(is.null(set.genes)){
    n.genes <- n.genes
    genes <- genes
    
  }else{
    n.genes <- length(set.genes)
    genes <- set.genes
    
  }
  
  #mat.expression.norm <- mat.expression.norm[,genes]
  
  if (is.null(set.regulators)) {
    n.regulators <- n.genes
    regulators <- genes
  } else {
    n.regulators <- length(set.regulators)
    # regulators provided by names
    if(is.character(set.regulators)){
      regulators <- set.regulators
      # genes.undefined <- setdiff(regulators, genes)
      # if (length(genes.undefined) != 0) {
      #   stop(paste("Error: genes: ", paste(genes.undefined, collapse = ",")," not represented in gene expression matrix \n", sep=""))
      # }
      # regulators provided by indices
    }else if (is.numeric(set.regulators)) {
      regulators <- genes[set.regulators]
    }else{
      stop("Error: invalid regulator format")
    }
  }
  # set mtry
  if (class(k) == "numeric") {
    mtry <- K
  } else if (k == "sqrt") {
    mtry <- round(sqrt(n.regulators))
  } else if (k == "all") {
    mtry <- n.regulators-1
  } else {
    stop("Error: invalid parameter k, options: \"sqrt\", \"all\", or integer")
  }
  
  print(paste("Performing random-forest regression based gene regulatory network inference (# of decision trees per gene is: ", nb.trees, ", # of regulators per decision tree node: ", mtry, sep=""))
  mat.rapid <- matrix(0.0, nrow=n.regulators, ncol=n.genes, dimnames = list(regulators, genes))
  
  strt<-Sys.time()
  for(i in 1:n.genes){
    cat("Processing... ", round(i/n.genes * 100, digits = 2) , "%", "\r"); flush.console() 
    gn.i <- genes[i]
    # remove target gene from set of regulators
    regulators.i <- setdiff(regulators, gn.i)
    x <- mat.expression.norm[,regulators.i, drop=FALSE] # avoid coercion to numeric
    y <- mat.expression.norm[,gn.i]
    
    
    
    df.xy <- cbind(as.data.frame(x),y)
    rf.model <- ranger(y ~ ., data = df.xy, mtry=mtry, num.trees=nb.trees, importance = "impurity", num.threads = n.cpus)
    imp_scores <- rf.model$variable.importance  
    imp_scores.names <- names(imp_scores)
    mat.rapid[imp_scores.names, gn.i] <- as.numeric(imp_scores)
  }
  print(Sys.time()-strt)
  print("..finished.")
  
  return(mat.rapid / n.samples)
}       


compute_linearRegressionWithStabilitySelection_based_GRN <- function(mat.expression, set.regulators = NULL, set.genes = NULL, nbootstrap = 100, nstepsLARS = 5, n.cpus = 5){
  
  mat.expression.norm <- t(mat.expression)
  mat.expression.norm <- apply(mat.expression.norm, 2, function(x) { (x - mean(x)) / sd(x) } )
  
  n.genes <- dim(mat.expression.norm)[2]
  genes<- colnames(mat.expression.norm)
  n.samples <- dim(mat.expression.norm)[1]
  
  if(is.null(set.genes)){
    n.genes <- n.genes
    genes <- genes
    
  }else{
    n.genes <- length(set.genes)
    genes <- set.genes
    
  }
  
  #mat.expression.norm <- mat.expression.norm[,genes]
  
  if (is.null(set.regulators)) {
    n.regulators <- n.genes
    regulators <- genes
  } else {
    n.regulators <- length(set.regulators)
    # regulators provided by names
    if(is.character(set.regulators)){
      regulators <- set.regulators
      # genes.undefined <- setdiff(regulators, genes)
      # if (length(genes.undefined) != 0) {
      #   stop(paste("Error: genes: ", paste(genes.undefined, collapse = ",")," not represented in gene expression matrix \n", sep=""))
      # }
      # regulators provided by indices
    }else if (is.numeric(set.regulators)) {
      regulators <- genes[set.regulators]
    }else{
      stop("Error: invalid regulator format")
    }
  }
  
  print(paste("Performing linear regression based gene regulatory network inference (# of bootstraps: ", nbootstrap, ", # of lars steps: ", nstepsLARS, sep=""))
  mat.rapid <- matrix(0.0, nrow=n.regulators, ncol=n.genes, dimnames = list(regulators, genes))
  
  strt<-Sys.time()
  
  cl<-makeCluster(n.cpus)
  registerDoParallel(cl)
  l.res <- foreach(i = 1:n.genes, .packages=c("lars", "MERIT")) %dopar% {
  
    #for(i in 1:n.genes){
    #  cat("Processing... ", round(i/n.genes * 100, digits = 2) , "%", "\r"); flush.console() 
    gn.i <- genes[i]
    # remove target gene from set of regulators
    regulators.i <- setdiff(regulators, gn.i)
    x <- mat.expression.norm[,regulators.i, drop=FALSE] # avoid coercion to numeric
    y <- mat.expression.norm[,gn.i]
    
    imp_scores <- stabilitySelection(x,y,nbootstrap=nbootstrap,nsteps=nstepsLARS,plotme=FALSE)
    
    imp_scores  
  }
  stopCluster(cl)
  
  print(Sys.time()-strt)
  print("..finished.")
  
  for(i in 1:n.genes){
    gn.i <- genes[i]
    imp_scores <- l.res[[i]]
    imp_scores.names <- names(imp_scores)
    mat.rapid[imp_scores.names, gn.i] <- as.numeric(imp_scores)
  }

  return(mat.rapid / n.samples)
}       




#' Step 1 - Gene regulatory network inference using ensemble regression with Monte Carlo based threshold selection
#'
#' 
#' @param mat.expression
#' @param k (default k="sqrt")
#' @param nb.trees
#' @param set.regulators
#' @param set.genes
#' @param seed
#' @param importance.measure
#' @param n.cpus
#' @param mat.expression
#' @param set.regulators
#' @param set.genes
#' @param nbootstrap
#' @param nstepsLARS
#' @param n.cpus
#' @return none, results are stored in tmp folder
#' @keywords 
#' @export
#' @examples
compute_ensemble_regression_with_montecarlo_based_stability_selection <- function(m.foldChange_differentialExpression,
                                                                                  df.transcriptionFactorAnnotation,
                                                                                  df.geneGroups,
                                                                                  targetSet = "geneGroups",
                                                                                  seed=1234,
                                                                                  importance.measure="impurity",
                                                                                  n.trees=1000,
                                                                                  n.lead_method_expression_shuffling = 1,
                                                                                  n.bootstrap=100,
                                                                                  n.stepsLARS=5,
                                                                                  n.cpus=5){
          
            df.transcriptionFactorAnnotation = subset(df.transcriptionFactorAnnotation, df.transcriptionFactorAnnotation$with_geneExpression == "yes")
            df.geneGroups = subset(df.geneGroups, df.geneGroups$with_geneExpression == "yes")
            v.tfs = unique(df.transcriptionFactorAnnotation$TF_ID)
            
            if(targetSet == "geneGroups"){
              v.genes = unique(c(v.tfs, rownames(df.geneGroups)))
            }
            
            if(targetSet == "all"){
              v.genes = rownames(m.foldChange_differentialExpression)
            }
            
            strt<-Sys.time()
            X <- m.foldChange_differentialExpression[v.genes,]
            colnames(X) = as.character(seq(1:dim(X)[2]))
            
            message("running lead method (random forest regression) with Monte Carlo based threshold selection")  
            m.rf_grn <- compute_randomforest_based_GRN(mat.expression=X, k="sqrt", nb.trees=n.trees, set.regulators = v.tfs, set.genes = v.genes, seed=seed, importance.measure = importance.measure, n.cpus = n.cpus)
            saveRDS(m.rf_grn, paste("tmp/m.grn.RF.rds", sep = ""))
            rm(m.rf_grn)
              
            set.seed(seed)
            message("running linear regression") 
            m.lr_grn <- compute_linearRegressionWithStabilitySelection_based_GRN(mat.expression=X, set.regulators = v.tfs, set.genes = v.genes, nbootstrap = n.bootstrap, nstepsLARS = n.stepsLARS, n.cpus = n.cpus)
            saveRDS(m.lr_grn, paste("tmp/m.grn.LR.rds", sep = ""))
            rm(m.lr_grn)
            # 
            message("running context likelihood of relatedness (CLR)")
            m.MI = knnmi.all(X)
            saveRDS(m.MI, paste("tmp/m.MI.rds"))
            m.CLR = parmigene::clr(m.MI)
            m.CLR = m.CLR[v.tfs, v.genes]
            saveRDS(m.CLR, paste("tmp/m.grn.CLR.rds"))
            print(Sys.time()-strt)
            #
            rm(m.MI)
            rm(M.CLR)

            
            for(i in 1:n.lead_method_expression_shuffling){

              message("running background monte carlo run", i)
              strt<-Sys.time()
              set.seed(seed + 25 * i)
              X.shuffled <- t(apply (X, 1,  function(m) sample(m, length(m))))
# 
              message("running random forest regression")
              m.rf.shuffled <- compute_randomforest_based_GRN(mat.expression=X.shuffled, k="sqrt", nb.trees=n.trees, set.regulators = v.tfs, set.genes = v.genes, seed= (seed + 25 * i), importance.measure = importance.measure, n.cpus = n.cpus)
              saveRDS(m.rf.shuffled, paste("tmp/m.grn.RF_bg_", i, ".rds", sep = ""))
              rm(m.rf.shuffled)
              
              message("running linear regression")
              set.seed(seed + 25 * i)
              m.lr_grn <- compute_linearRegressionWithStabilitySelection_based_GRN(mat.expression=X.shuffled, set.regulators = v.tfs, set.genes = v.genes, nbootstrap = n.bootstrap, nstepsLARS = n.stepsLARS, n.cpus = n.cpus)
              saveRDS(m.lr_grn, paste("tmp/m.grn.LR_bg_", i, ".rds", sep = ""))
              rm(m.lr_grn)
             
              message("running context likelihood of relatedness (CLR)")
              m.MI = knnmi.all(X.shuffled)
              saveRDS(m.MI, paste("tmp/m.MI_bg_", i, ".rds", sep = ""))
              
              m.MI <- readRDS(paste("tmp/m.MI_bg_", i, ".rds", sep = ""))
              m.CLR = parmigene::clr(m.MI)
              m.CLR = m.CLR[v.tfs, v.genes]
              saveRDS(m.CLR, paste("tmp/m.grn.CLR_bg_", i, ".rds", sep = ""))
              print(Sys.time()-strt)
              
              rm(m.MI)
              rm(m.CLR)
              
              
            }
    
}





#' load grns
#'
#' This function loads a computed grns 
#' @param df.transcriptionFactorAnnotation =l.data$df.transcriptionFactorAnnotation,
#' @param df.geneGroups = v.genes, 
#' @param th.lead_grn_method = 0.95,
#' @param th.support_grn_methods = 0.95,
#' @param n.lead_method_expression_shuffling = 3
#' @param ngrnSupport = 1
#' @keywords 
#' @export
load_lead_support_grn <- function(df.transcriptionFactorAnnotation,
                                   df.geneGroups, 
                                   m.foldChange_differentialExpression,
                                   targetSet = "all",
                                   th.lead_grn_method = 0.95,
                                   n.lead_method_expression_shuffling = 3,
                                   th.support_grn_methods = 0.95,
                                   n.grnSupport = 1){
  

  
  df.transcriptionFactorAnnotation = subset(df.transcriptionFactorAnnotation, df.transcriptionFactorAnnotation$with_geneExpression == "yes")
  df.geneGroups = subset(df.geneGroups, df.geneGroups$with_geneExpression == "yes")
  v.tfs = unique(df.transcriptionFactorAnnotation$TF_ID)

  if(targetSet == "geneGroups"){
    v.genes = unique(c(v.tfs, rownames(df.geneGroups)))
  }
  
  if(targetSet == "all"){
    v.genes = rownames(m.foldChange_differentialExpression)
  }
  

  # random forest regression - lead method
  m.rf_grn <- readRDS(paste("tmp/m.grn.RF.rds", sep = ""))[v.tfs,v.genes]
  v.th.grns <- numeric(n.lead_method_expression_shuffling)
  quantile(m.rf_grn, th.lead_grn_method)
  for(i in 1:n.lead_method_expression_shuffling){
    m.rf.shuffled = readRDS(paste("tmp/m.grn.RF_bg_", i, ".rds", sep = ""))[v.tfs,v.genes]
    v.th.grns[i] = quantile(m.rf.shuffled, th.lead_grn_method)
  }
  #print(v.th.grns)
  th.grns = sum(v.th.grns) / n.lead_method_expression_shuffling
  m.rf_grn[m.rf_grn < th.grns] <- 0
  m.rf_grn <- Matrix::Matrix(m.rf_grn, sparse = TRUE)

  # support method
  m.lr_grn = readRDS(paste("tmp/m.grn.LR.rds", sep = ""))[v.tfs,v.genes]
  quantile(m.lr_grn, th.lead_grn_method)
  v.th.grns <- numeric(n.lead_method_expression_shuffling)
  for(i in 1:n.lead_method_expression_shuffling){
    m.grn.shuffled = readRDS(paste("tmp/m.grn.LR_bg_", i, ".rds", sep = ""))[v.tfs,v.genes]
    v.th.grns[i] = quantile(m.grn.shuffled, th.support_grn_methods)
  }
  th.grns = sum(v.th.grns) / n.lead_method_expression_shuffling
  #print(v.th.grns)
  m.lr_grn[m.lr_grn < th.grns] <- 0
  m.lr_grn.classification = m.lr_grn
  m.lr_grn.classification[m.lr_grn.classification >= th.grns] = 1
  m.lr_grn.classification <- Matrix::Matrix(m.lr_grn.classification, sparse = TRUE)
  
  m.clr_grn = readRDS(paste("tmp/m.grn.CLR.rds"))[v.tfs,v.genes]
  quantile(m.clr_grn, th.lead_grn_method)
  v.th.grns <- numeric(n.lead_method_expression_shuffling)
  for(i in 1:n.lead_method_expression_shuffling){
    m.grn.shuffled = readRDS(paste("tmp/m.grn.CLR_bg_", i, ".rds", sep = ""))[v.tfs,v.genes]
    v.th.grns[i] = quantile(m.grn.shuffled, th.support_grn_methods)
  }
  th.grns = sum(v.th.grns) / n.lead_method_expression_shuffling
  #print(v.th.grns)
  m.clr_grn[m.clr_grn < th.grns] <- 0
  m.clr_grn.classification = m.clr_grn
  m.clr_grn.classification[m.clr_grn.classification >= th.grns] = 1
  m.clr_grn.classification <- Matrix::Matrix(m.clr_grn.classification, sparse = TRUE)
  
  # m.pcc_grn = abs(readRDS(paste("tmp/m.grn.PCC.rds", sep = "")))[v.tfs,v.genes]
  # quantile(m.pcc_grn, th.lead_grn_method)
  # v.th.grns <- numeric(n.lead_method_expression_shuffling)
  # for(i in 1:n.lead_method_expression_shuffling){
  #   m.grn.shuffled = abs(readRDS(paste("tmp/m.grn.PCC_bg_", i, ".rds", sep = ""))[v.tfs,v.genes])
  #   v.th.grns[i] = quantile(m.grn.shuffled, th.support_grn_methods)
  # }
  # th.grns = sum(v.th.grns) / n.lead_method_expression_shuffling
  # print(v.th.grns)
  # m.pcc_grn[m.pcc_grn < th.grns] <- 0
  # m.PCC.classification = m.pcc_grn
  # m.PCC.classification[m.PCC.classification >= th.grns] = 1
  # 
  sum(m.lr_grn.classification)
  sum(m.clr_grn.classification)
  #sum(m.PCC.classification)
  
  #####

  m.grn.support = m.lr_grn.classification + m.clr_grn.classification#  + m.PCC.classification
  m.grn.support[m.grn.support <  n.grnSupport] = 0
  m.grn.support[m.grn.support >= n.grnSupport] = 1
  
  m.lead_support.grn = m.rf_grn * m.grn.support 
 
  return(list(m.lead_support.grn = m.lead_support.grn, 
              m.rf_grn = m.rf_grn, 
              m.lr_grn = m.lr_grn, 
              m.clr_grn = m.clr_grn))
  
}




#' Load dataset function
#'
#' This function loads a datasets
#' @param 
#' @keywords 
#' @export
#' @examples
#' load_datasets()
load_datasets = function(filename.genes = "data/genes.txt",
                         filename.experiment_ids = "data/experiment_ids.txt",
                         filename.foldChange_differentialExpression = "data/m.foldChange_differentialExpression.txt",
                         filename.pvalue_differentialExpression =	"data/m.pvalue_differentialExpression.txt",
                         filename.experiment_condition_tissue_annotation =	"data/df.experiment_condition_annotation.txt",
                         filename.transcriptionfactor_annotation = "data/df.transcriptionFactorAnnotation.txt", 
                         filename.geneGroups = "data/df.enzymes_w_metabolic_domains.txt"){

  genes = read.table(filename.genes, header = F, sep = "\t", stringsAsFactors = F)[,1]
  experiment_series_ids = read.table(filename.experiment_ids, header = F, sep = "\t", stringsAsFactors = F)[,1]
  experiment_series_ids = as.character(experiment_series_ids)
  
  df.annotation = read.table(filename.experiment_condition_tissue_annotation, sep = "\t", stringsAsFactors = F, header = T)
  df.annotation$unique_ID = as.character(df.annotation$unique_ID)
  df.foldChange_differentialExpression = read.table(filename.foldChange_differentialExpression, header = F, sep = "\t", stringsAsFactors = F)
  df.pvalue_differentialExpression = read.table(filename.pvalue_differentialExpression, header = F, sep = "\t", stringsAsFactors = F)
  
  if(length(genes) == 0){
    stop("Error: no genes found")
  }
  if(length(experiment_series_ids) == 0){
    stop("Error: no experiments found")
  }
  if(nrow(df.annotation) == 0){
    stop("Error: no condition annotation found")
  }
  if(nrow(df.foldChange_differentialExpression) == 0){
    stop("Error: no differential expression foldchange found")
  }
  if(nrow(df.pvalue_differentialExpression) == 0){
    stop("Error: no differential expression pvalue found")
  }
  
  
  m.foldChange_differentialExpression = data.matrix(df.foldChange_differentialExpression, rownames.force = NA)
  m.pvalue_differentialExpression     = data.matrix(df.pvalue_differentialExpression, rownames.force = NA)
  rownames(m.foldChange_differentialExpression) = rownames(m.pvalue_differentialExpression) = genes
  colnames(m.foldChange_differentialExpression) = colnames(m.pvalue_differentialExpression) = experiment_series_ids
  
  v.tissues = unique(df.annotation$condition_tissue)
  v.treatments = unique(c(df.annotation$condition_treatment_1, df.annotation$condition_treatment_2))
  v.treatments = v.treatments[!v.treatments == ""]
  tb.experiment_series_ids = table(experiment_series_ids)
  
  df.annotation["number_series"] = 0
  for(i in 1:nrow(df.annotation)){
    df.annotation$number_series[i] = tb.experiment_series_ids[as.character(df.annotation$unique_ID[i])]
  }
  
  tb.tissues = numeric(length(v.tissues))
  names(tb.tissues) = v.tissues
  for(i in 1:length(v.tissues)){
    idx = which(df.annotation$condition_tissue == v.tissues[i])
    tissues.i = df.annotation$number_series[idx]
    tissues.i = tissues.i[!is.na(tissues.i)]
    tb.tissues[i] = sum(tissues.i)
  }
  tb.condition_tissues = tb.tissues
  
  
  tb.condition_treatments = numeric(length(v.treatments))
  names(tb.condition_treatments) = unique(v.treatments)
  for(i in 1:length(tb.condition_treatments)){
    idx_1 = which(df.annotation$condition_treatment_1 %in% names(tb.condition_treatments)[i])
    idx_2 = which(df.annotation$condition_treatment_2 %in% names(tb.condition_treatments)[i])
    tb.condition_treatments[i] = length(idx_1) + length(idx_2)# sum(df.annotation$number_series[idx_1]) + sum(df.annotation$number_series[idx_2])
  }
  
  
  df.transcriptionFactorAnnotation = read.table(filename.transcriptionfactor_annotation, header = T, sep = "\t", stringsAsFactors = F)
  if(nrow(df.transcriptionFactorAnnotation) == 0){
    stop("Error: no transcription factor annotation found")
  }
  df.transcriptionFactorAnnotation["with_geneExpression"] = "no"
  df.transcriptionFactorAnnotation$with_geneExpression[which(df.transcriptionFactorAnnotation$TF_ID %in% genes)] = "yes"
  

  df.geneGroups = read.table(filename.geneGroups, header = T,  sep = "\t", stringsAsFactors = F)
  if(nrow(df.geneGroups) == 0){
    stop("Error: no gene group annotation found")
  }
  
  rownames(df.geneGroups) = df.geneGroups$Gene_ID
  df.geneGroups <- df.geneGroups[,!names(df.geneGroups) %in% c("Gene_ID")]
  
  tb.geneGroups = colSums(df.geneGroups)
  v.geneGroups = colnames(df.geneGroups)
  
  
  l.geneGroups <- vector(mode = "list", length = length(v.geneGroups))
  names(l.geneGroups) <- v.geneGroups
  for(i in 1:length(v.geneGroups)){
    l.geneGroups[[i]] <- rownames(df.geneGroups)[which(df.geneGroups[,v.geneGroups[i]] == 1)]
    l.geneGroups[[i]] <- intersect(l.geneGroups[[i]], genes)
  }
  

  df.geneGroups["with_geneExpression"] = "no"
  df.geneGroups$with_geneExpression[which(rownames(df.geneGroups) %in% genes)] = "yes"
  
  
  return(list(m.foldChange_differentialExpression=m.foldChange_differentialExpression,
              m.pvalue_differentialExpression=m.pvalue_differentialExpression,
              df.experiment_condition_annotation=df.annotation,
              tb.condition_treatments=tb.condition_treatments,
              tb.condition_tissues=tb.condition_tissues,
              
              df.transcriptionFactorAnnotation=df.transcriptionFactorAnnotation, 
              df.geneGroups=df.geneGroups,
              tb.geneGroups=tb.geneGroups,
              v.geneGroups=v.geneGroups,
              l.geneGroups=l.geneGroups, 
              genes = genes
  ))
}





#' Step 2 - Transcription factor direct target promoter binding based filtering of gene regulatory link predictions
#'
#' 
#' @param m.grn gene regulatory network 
#' @return DNA binding based grn and p-values, filtered lead-support grn
#' @keywords 
#' @export
#' @examples
#' install_and_load_libraries()
transcriptionFactorBindingInference_with_TFBStools <- function(m.grn, 
                                                              file.TF_to_Motif_IDs = "",
                                                              file.TFBS_motifs = "",
                                                              file.promoterSeq = "",
                                                              file.geneSeq = "",
                                                              th.pre_tss = 1000,
                                                              th.post_tss = 200,
                                                              th.min.score.motif = "80%",
                                                              genome_nucleotide_distribution = c(0.3253439, 0.1746561, 0.1746561, 0.3253439 ),
                                                              th.pval.known_motifs = 0.05,
                                                              n.cpus = 2, 
                                                              b.load = "no"){
                
  
  if(b.load == "no"){
  
    bg.genome = genome_nucleotide_distribution
    names(bg.genome) = c("A","C","G","T") 
  
    df.tfs_motifs <- read.table(file.TF_to_Motif_IDs, sep = "\t", header = T, stringsAsFactors = F)
    l.tfs_w_motifs = vector(mode = "list", length = nrow(df.tfs_motifs))
    names(l.tfs_w_motifs) = df.tfs_motifs$TF_ID
    for(i in 1:nrow(df.tfs_motifs)){
      l.tfs_w_motifs[[i]] = unlist(strsplit(df.tfs_motifs$Motif_ID[i], ","))
      if(length(l.tfs_w_motifs[[i]]) == 0){
        l.tfs_w_motifs[[i]] = NULL
      } 
    }
    
    v.tfs.w.open_motifs <- names(l.tfs_w_motifs)[(which(lapply(l.tfs_w_motifs, function(m) is.null(m)) == FALSE))]
    
    l.motifs_files <- read.fasta(file = file.TFBS_motifs, seqtype = "DNA",as.string = TRUE, set.attributes = FALSE)
    l.motifs <- vector(mode = "list", length = length(l.motifs_files))
    names(l.motifs) <- names(l.motifs_files)
    v.motif.ids <- names(l.motifs)
    for(m in 1:length(l.motifs_files)){
      
      motif <- l.motifs_files[[m]]
      motif <- unlist(strsplit(motif, "\t"))
      
      idx.c <- which(motif == "]c")
      idx.g <- which(motif == "]g")
      idx.t <- which(motif == "]t")
      
      v.a <- motif[2:(idx.c - 1)]
      v.c <- motif[(idx.c + 1):(idx.g - 1)]
      v.g <- motif[(idx.g + 1):(idx.t - 1)]
      v.t <- motif[(idx.t + 1):(length(motif) - 1)]
      
      v.a[1] <- gsub("\\[", "", v.a[1])
      v.c[1] <- gsub("\\[", "", v.c[1])
      v.g[1] <- gsub("\\[", "", v.g[1])
      v.t[1] <- gsub("\\[", "", v.t[1])
      
      m.motif <- matrix(0, nrow = 4, ncol = length(v.a), dimnames = list(c("A", "C", "G", "T"), seq(1:length(v.a))))
      m.motif[1,] <- as.numeric(v.a)
      m.motif[2,] <- as.numeric(v.c)
      m.motif[3,] <- as.numeric(v.g)
      m.motif[4,] <- as.numeric(v.t)
      
      l.motifs[[m]] <- m.motif
    }
    
    upstream_sequences <- read.fasta(file = file.promoterSeq, seqtype = "DNA",as.string = TRUE, set.attributes = FALSE)
    upstream_sequences <- lapply(upstream_sequences, function(m) {toupper(m)})
    df.promSequences <- DNAStringSet(unlist(upstream_sequences))
   
    gene_sequences <- read.fasta(file = file.geneSeq, seqtype = "DNA",as.string = TRUE, set.attributes = FALSE)
    gene_sequences <- lapply(gene_sequences, function(m) {toupper(m)})
    df.geneSequences <- DNAStringSet(unlist(gene_sequences))
    names(df.geneSequences) <- gsub("\\..*","", names(df.geneSequences)) # remove gene model 
    
    df.idx.grn <- which(m.grn > 0, arr.ind = TRUE)
    df.grn <- data.frame(TF = rownames(m.grn)[df.idx.grn[,1]], TG = colnames(m.grn)[df.idx.grn[,2]], stringsAsFactors = FALSE)
    tfs.grn <- unique(df.grn$TF)
    tgs.grn <- unique(df.grn$TG)
    tfs.grn <- intersect(tfs.grn, v.tfs.w.open_motifs)
    
    strt <- Sys.time()
    cl<-makeCluster(min(length(tfs.grn), n.cpus), outfile = "")
    registerDoParallel(cl)
    
    pb <- txtProgressBar(title = "Iterative training", min = 0, max = length(tfs.grn), style = 3)
    l.res <-  foreach(j = 1:length(tfs.grn), .packages = c("TFBSTools", "Biostrings")) %dopar% {
      setTxtProgressBar(pb, j)
      v.pvals <- rep(1, length(tgs.grn))
      v.tgs <- rep(0,length(tgs.grn))
      names(v.pvals) <- names(v.tgs) <- tgs.grn
  
      tf.j <- tfs.grn[j]
      if(tf.j %in% names(l.tfs_w_motifs)){
        l.tfs_w_motifs.j <- l.tfs_w_motifs[tf.j]
        if(!is.null(unlist(l.tfs_w_motifs.j))){
          strt <- Sys.time()
          df.grn.r <- subset(df.grn, df.grn$TF == tf.j)
          if(nrow(df.grn.r) > 0){
            l.motifs.r <- l.motifs[unlist(l.tfs_w_motifs.j)]
            tgs.r <- df.grn.r$TG 
            tgs.r <- intersect(tgs.r, df.promSequences@ranges@NAMES)
            Sequences <- df.promSequences[tgs.r]
            
            l.pwms <- list()
            for(k in 1:length(l.motifs.r)){
              pwm.groundTruth <- l.motifs.r[[k]]
              pwm <- PWMatrix(ID = "", name = "", profileMatrix = pwm.groundTruth, bg = bg.genome)  
              l.pwms <- c(l.pwms, list(pwm))
            }
            pwmList <- do.call(PWMatrixList, l.pwms)
            
            #Sequences <- subseq(Sequences, 3001 - v.th.motif_binding_promoter[i], 3000) # pre startsite
            if(length(Sequences) > 0){
              sitesetList = searchSeq(pwmList, Sequences, min.score=th.min.score.motif, strand="*") #, mc.cores = n.cpus)
              l.pvalues_binding_per_sequence <- pvalues(sitesetList, type="sampling")
              pval.min.tgs <- unlist(lapply(l.pvalues_binding_per_sequence, function(m) min(unlist(m))))
              pval.min.tgs <- pval.min.tgs[pval.min.tgs != "Inf"]
              if(length(pval.min.tgs) > 0){
                tgs.pval_min <- unique(names(pval.min.tgs))
                pval.min <- numeric(length(tgs.pval_min))
                names(pval.min) <- tgs.pval_min
                for(k in 1:length(tgs.pval_min)){
                  idx.k <- which(names(pval.min.tgs) == tgs.pval_min[k])
                  pval.min[k] <- min(pval.min.tgs[idx.k])
                }
                pval.min.tgs <- pval.min
                tgs.bound <- names(pval.min.tgs) 
                v.pvals[tgs.bound] <- as.numeric(pval.min.tgs)
                v.tgs[tgs.bound] <- 1
              }
            }
            
            # post TSS 200 kb
            tgs.r <- intersect(tgs.r, df.geneSequences@ranges@NAMES)
            Sequences <- df.geneSequences[tgs.r]
            
            if(length(Sequences) > 0){
              
              for(k in 1:length(Sequences)){
                if(length(Sequences[[k]]) > th.post_tss){
                  Sequences[[k]] <-  subseq(Sequences[[k]], 1, th.post_tss)
                }
              }
              sitesetList = searchSeq(pwmList, Sequences, min.score=th.min.score.motif, strand="*") #, mc.cores = n.cpus)
              l.pvalues_binding_per_sequence <- pvalues(sitesetList, type="sampling")
              pval.min.tgs <- unlist(lapply(l.pvalues_binding_per_sequence, function(m) min(unlist(m))))
              pval.min.tgs <- pval.min.tgs[pval.min.tgs != "Inf"]
              
              if(length(pval.min.tgs) > 0){
                tgs.pval_min <- unique(names(pval.min.tgs))
                pval.min <- numeric(length(tgs.pval_min))
                names(pval.min) <- tgs.pval_min
                for(k in 1:length(tgs.pval_min)){
                  idx.k <- which(names(pval.min.tgs) == tgs.pval_min[k])
                  pval.min[k] <- min(pval.min.tgs[idx.k])
                }
                pval.min.tgs <- pval.min
                tgs.bound <- names(pval.min.tgs) #names(pval.min.tgs)[pval.min.tgs <= th.pval.known_motifs]
                
                v.pvals[tgs.bound] <- pmin(v.pvals[tgs.bound] , as.numeric(pval.min.tgs))
                v.tgs[tgs.bound] <- 1
                
              }
              
            }
          }
          print(Sys.time() - strt)
          
        }
      }
      
      l.tmp <- list(v.pvals=v.pvals, v.tgs = v.tgs)
      
      saveRDS(j, paste("tmp_TFBS/t_",j, ".rds", sep = ""))
      
      return(l.tmp)  
    }
    stopCluster(cl)
    print(Sys.time()-strt)
    
    #######
    
    m.motifNet.pval <- matrix(1, nrow = length(tfs.grn), ncol = length(tgs.grn), dimnames = list(tfs.grn, tgs.grn))
    m.motifNet.tgs <- matrix(0, nrow = length(tfs.grn), ncol = length(tgs.grn), dimnames = list(tfs.grn, tgs.grn))
    
    for(j in 1:length(tfs.grn)){
      m.motifNet.pval[tfs.grn[j],] <- l.res[[j]]$v.pvals
      m.motifNet.tgs[tfs.grn[j],] <- l.res[[j]]$v.tgs
    }
    
    saveRDS(m.motifNet.pval, "tmp/m.motifNet.pval.rds")
    saveRDS(m.motifNet.tgs, "tmp/m.motifNet.tgs.rds")
    
  }else{
    
    m.motifNet.pval = readRDS("tmp/m.motifNet.pval.rds")
    m.motifNet.tgs = readRDS("tmp/m.motifNet.tgs.rds")
    
    if(length(m.motifNet.pval) == 0){
      stop("Error: no m.motifNet.pval.rds")
    }
    
    if(length(m.motifNet.tgs) == 0){
      stop("Error: no m.motifNet.tgs.rds")
    }
    
  }
  
  m.motifNet = m.motifNet.pval
  m.motifNet[m.motifNet > th.pval.known_motifs] <- 10
  m.motifNet[m.motifNet <= th.pval.known_motifs] <- 1
  m.motifNet[m.motifNet == 10] <- 0
  
  tfs_w_motif_binding = intersect(rownames(m.motifNet), rownames(m.grn))
  tgs_w_motif_binding = intersect(colnames(m.motifNet), colnames(m.grn))
  m.lead_support_w_motif.grn <- m.motifNet[tfs_w_motif_binding, tgs_w_motif_binding] * m.grn[tfs_w_motif_binding, tgs_w_motif_binding]
  
  
  return(list(m.motifNet.pval=m.motifNet.pval, 
              m.motifNet.tgs=m.motifNet.tgs, 
              m.lead_support_w_motif.grn=m.lead_support_w_motif.grn))
}




#' Step 2 - Transcription factor direct target promoter binding based filtering of gene regulatory link predictions
#'
#' 
#' @param m.grn gene regulatory network 
#' @return DNA binding based grn, DNA binding filtered lead-support grn
#' @keywords 
#' @export
#' @examples
#' install_and_load_libraries()
transcriptionFactorBindingInference <- function(m.grn,
                                                file.TF_to_Motif_IDs = "",
                                               file.TFBS_motifs = "",
                                               file.promoterSeq = "",
                                               file.geneSeq = "",
                                               th.pre_tss = 1000,
                                               th.post_tss = 200,
                                               genome_nucleotide_distribution = c(A = 0.3253439, C = 0.1746561, G = 0.1746561, T = 0.3253439 ),
                                               th.pval.known_motifs = 0.05, 
                                               th.multipleHypothesisTest = "bonferroni",
                                               b.load = FALSE){
  
  

  if(b.load == "no"){
    
    bg.genome = genome_nucleotide_distribution
    
    df.tfs_motifs <- read.table(file.TF_to_Motif_IDs, sep = "\t", header = T, stringsAsFactors = F)
    l.tfs_w_motifs = vector(mode = "list", length = nrow(df.tfs_motifs))
    names(l.tfs_w_motifs) = df.tfs_motifs$TF_ID
    for(i in 1:nrow(df.tfs_motifs)){
      l.tfs_w_motifs[[i]] = unlist(strsplit(df.tfs_motifs$Motif_ID[i], ","))
      if(length(l.tfs_w_motifs[[i]]) == 0){
        l.tfs_w_motifs[[i]] = NULL
      } 
    }
    
    v.tfs.w.open_motifs <- names(l.tfs_w_motifs)[(which(lapply(l.tfs_w_motifs, function(m) is.null(m)) == FALSE))]
    
    l.motifs_files <- read.fasta(file = file.TFBS_motifs, seqtype = "DNA",as.string = TRUE, set.attributes = FALSE)
    l.motifs <- vector(mode = "list", length = length(l.motifs_files))
    names(l.motifs) <- names(l.motifs_files)
    v.motif.ids <- names(l.motifs)
    for(m in 1:length(l.motifs_files)){
      
      motif <- l.motifs_files[[m]]
      motif <- unlist(strsplit(motif, "\t"))
      
      idx.c <- which(motif == "]c")
      idx.g <- which(motif == "]g")
      idx.t <- which(motif == "]t")
      
      v.a <- motif[2:(idx.c - 1)]
      v.c <- motif[(idx.c + 1):(idx.g - 1)]
      v.g <- motif[(idx.g + 1):(idx.t - 1)]
      v.t <- motif[(idx.t + 1):(length(motif) - 1)]
      
      v.a[1] <- gsub("\\[", "", v.a[1])
      v.c[1] <- gsub("\\[", "", v.c[1])
      v.g[1] <- gsub("\\[", "", v.g[1])
      v.t[1] <- gsub("\\[", "", v.t[1])
      
      m.motif <- matrix(0, nrow = 4, ncol = length(v.a), dimnames = list(c("A", "C", "G", "T"), seq(1:length(v.a))))
      m.motif[1,] <- as.numeric(v.a)
      m.motif[2,] <- as.numeric(v.c)
      m.motif[3,] <- as.numeric(v.g)
      m.motif[4,] <- as.numeric(v.t)
      
      m.motif = apply(m.motif, 2, function(a) { a / sum(a) })
      
      l.motifs[[m]] <- m.motif
    }
    
    
    # pre TSS
    upstream_sequences <- read.fasta(file = file.promoterSeq, seqtype = "DNA",as.string = TRUE, set.attributes = FALSE)
    upstream_sequences <- lapply(upstream_sequences, function(m) {toupper(m)})
    df.promSequences <- DNAStringSet(unlist(upstream_sequences))
    Sequences <- df.promSequences
    
    tfs.grn = v.tfs.w.open_motifs
    tgs.grn = names(Sequences)
    
    
    if(th.multipleHypothesisTest == "bonferroni"){
      th.pval = th.pval.known_motifs / length(tgs.grn)
    }else{
      th.pval = th.pval.known_motifs
    }
    
    
    message("TFBS analysis pre TSS")
    m.tfbs = matrix(0, nrow = length(tfs.grn), ncol = length(tgs.grn), dimnames = list(tfs.grn, tgs.grn))
    pb <- txtProgressBar(title = "TFBS analysis pre TSS", min = 0, max = length(tfs.grn), style = 3)
    for(j in 1:length(tfs.grn)){
      
      setTxtProgressBar(pb, j)
      
      tf.j <- tfs.grn[j]
      
      if(tf.j %in% names(l.tfs_w_motifs)){
        
        l.tfs_w_motifs.j <- l.tfs_w_motifs[tf.j]
        
        if(!is.null(unlist(l.tfs_w_motifs.j))){
  
          l.motifs.j = l.motifs[unlist(l.tfs_w_motifs.j)]
          
          l.pwms <- list()
          for(k in 1:length(l.motifs.j)){
            pwm.groundTruth <- l.motifs.j[[k]]
            pwm <- PWMatrix(ID = "", name = "", strand = "+", profileMatrix = pwm.groundTruth, bg = bg.genome)  
            l.pwms <- c(l.pwms, list(pwm))
          }
          pwmList <- do.call(PWMatrixList, l.pwms)
          
          # strt <- Sys.time()
          motif_ix_DNAStringSet <- matchMotifs(pwmList, Sequences, p.cutoff = th.pval, bg = bg.genome) #out = "scores") # , out = "positions")
          
          m.matches  = motifMatches(motif_ix_DNAStringSet)
          m.matches = as.matrix(m.matches)
          
          res = apply(m.matches, 1, function(m) { any(m == TRUE)})
          
          m.tfbs[tf.j, res] = 1
          
        }
        
      }
      
    }
    close(pb)
    m.tfbs.pre_tss = m.tfbs
    
  
    ### post TSS 
    
    gene_sequences <- read.fasta(file = file.geneSeq, seqtype = "DNA",as.string = TRUE, set.attributes = FALSE)
    gene_sequences <- lapply(gene_sequences, function(m) {toupper(m)})
    df.geneSequences <- DNAStringSet(unlist(gene_sequences))
    names(df.geneSequences) <- gsub("\\..*","", names(df.geneSequences)) # remove gene model 
    Sequences <- df.geneSequences

    message("TFBS analysis post TSS")
    m.tfbs = matrix(0, nrow = length(tfs.grn), ncol = length(tgs.grn), dimnames = list(tfs.grn, tgs.grn))
    pb <- txtProgressBar(title = "TFBS analysis port TSS", min = 0, max = length(tfs.grn), style = 3)
    for(j in 1:length(tfs.grn)){
      
      setTxtProgressBar(pb, j)
      
      tf.j <- tfs.grn[j]
      
      if(tf.j %in% names(l.tfs_w_motifs)){
        
        l.tfs_w_motifs.j <- l.tfs_w_motifs[tf.j]
        
        if(!is.null(unlist(l.tfs_w_motifs.j))){
          
          # strt <- Sys.time()
          
          l.motifs.j = l.motifs[unlist(l.tfs_w_motifs.j)]
          
          l.pwms <- list()
          for(k in 1:length(l.motifs.j)){
            pwm.groundTruth <- l.motifs.j[[k]]
            pwm <- PWMatrix(ID = "", name = "", strand = "+", profileMatrix = pwm.groundTruth, bg = bg.genome)  
            l.pwms <- c(l.pwms, list(pwm))
          }
          pwmList <- do.call(PWMatrixList, l.pwms)
          
          # strt <- Sys.time()
          motif_ix_DNAStringSet <- matchMotifs(pwmList, Sequences, p.cutoff = th.pval, bg = bg.genome) #out = "scores") # , out = "positions")
          
          m.matches  = motifMatches(motif_ix_DNAStringSet)
          m.matches = as.matrix(m.matches)
          
          res = apply(m.matches, 1, function(m) { any(m == TRUE)})
          
          m.tfbs[tf.j, res] = 1
          
        }
        
      }
      
    }
    close(pb)
    m.tfbs.post_tss = m.tfbs
    
    m.tfbs = m.tfbs.pre_tss + m.tfbs.post_tss
    m.tfbs[m.tfbs > 0] = 1
    
    m.motifNet = m.tfbs
    
    
    saveRDS(m.motifNet, "tmp/m.motifNet.rds")
 
  }else{
    
    m.motifNet = readRDS("tmp/m.motifNet.rds")
   
    if(length(m.motifNet) == 0){
      stop("Error: no m.motifNet.rds")
    }
    
    
  }

  m.motifNet <- Matrix::Matrix(m.motifNet, sparse = TRUE)

  tfs_w_motif_binding = intersect(rownames(m.motifNet), rownames(m.grn))
  tgs_w_motif_binding = intersect(colnames(m.motifNet), colnames(m.grn))
  m.lead_support_w_motif.grn <- m.motifNet[tfs_w_motif_binding, tgs_w_motif_binding] * m.grn[tfs_w_motif_binding, tgs_w_motif_binding]
  
  return(list(m.motifNet=m.motifNet, 
              m.lead_support_w_motif.grn=m.lead_support_w_motif.grn))

}

do_treatment_filtering_all_links <- function(l.treatments_per_links){
  
  message("individual link based condition filtering based on hypergeometric test")
  
  pb <- txtProgressBar(min = 0, max = length(l.treatments_per_links), style = 3)
  for(l in 1:length(l.treatments_per_links)){
    setTxtProgressBar(pb, l)
    l.treatments.l <- l.treatments_per_links[[l]]
    
    for(j in 1:length(l.treatments.l)){ # root and shoot
      v.treatments.l <- l.treatments.l[[j]]
      p.prior <- l.treatments_per_links[[l]][[j]] / l.treatments.tissues[[j]][names(l.treatments_per_links[[l]][[j]])]
      if(length(v.treatments.l) > 0){
        p.treatment <- rep(1, length(v.treatments.l))
        names(p.treatment) <- names(v.treatments.l)
        for(i in 1:length(v.treatments.l)){
          hitInSample <- v.treatments.l[i] 
          sampleSize <- sum(v.treatments.l)
          hitInPop <- l.treatments.tissues[[j]][names(v.treatments.l)[i]]
          popSize <- sum(l.treatments.tissues[[j]])
          failInPop <- popSize - hitInPop
          fc <- ((hitInSample / sampleSize) / (hitInPop / popSize))
          if(fc > 1 & hitInSample >= th.min.samples)
            p.treatment[i] <- phyper(hitInSample-1, hitInPop, failInPop, sampleSize, lower.tail= FALSE)
        }
        # p.treatment <- p.adjust(p.treatment,"bonferroni") 
        i.sets <- which(p.treatment <= th.prob)
        v.sets <- names(p.treatment)[i.sets]
        l.treatments.l[[j]] <- v.treatments.l[v.sets]
      }
    }
    l.treatments_per_links[[l]] <- l.treatments.l 
  }
  close(pb)
  
  return(l.treatments_per_links)
}


do_treatment_filtering_single_link <- function(tb.link_treatments, tb.condition_treatments, 
                                               th.pval.treatment = 0.05, th.min.samples = 1, 
                                               s.multipleTestCorrection = "none"){ 
  
  p.treatment <- rep(1, length(tb.link_treatments))
  names(p.treatment) <- names(tb.link_treatments)
  
  for(i in 1:length(tb.link_treatments)){
    
    hitInSample <- tb.link_treatments[i] 
    sampleSize <- sum(tb.link_treatments)
    hitInPop <- tb.condition_treatments[names(tb.link_treatments)[i]]
    popSize <- sum(tb.condition_treatments)
    failInPop <- popSize - hitInPop
    fc <- ((hitInSample / sampleSize) / (hitInPop / popSize))
    
    if(fc > 1 & hitInSample >= th.min.samples)
      p.treatment[i] <- phyper(hitInSample, hitInPop, failInPop, sampleSize, lower.tail = FALSE)
    
  }
  
  p.treatment <- p.adjust(p.treatment, s.multipleTestCorrection)
  i.sets <- which(p.treatment <= th.pval.treatment)
  tb.link_treatments <- tb.link_treatments[i.sets]
  
  return(tb.link_treatments)
}

evaluate_tissues_per_treatment <- function(tb.tissues_per_treatment_per_link,  tb.condition_tissues, th.pval.tissue = 0.05, th.min.samples = 1, s.multipleTestCorrection = "none"){ 
  
  p.treatment <- rep(1, length(tb.tissues_per_treatment_per_link))
  names(p.treatment) <- names(tb.tissues_per_treatment_per_link)
  
  for(i in 1:length(tb.tissues_per_treatment_per_link)){
    
    hitInSample <- tb.tissues_per_treatment_per_link[i] 
    sampleSize <- sum(tb.tissues_per_treatment_per_link)
    hitInPop <- tb.condition_tissues[names(tb.tissues_per_treatment_per_link)[i]]
    popSize <- sum(tb.condition_tissues)
    failInPop <- popSize - hitInPop
    
    fc <- ((hitInSample / sampleSize) / (hitInPop / popSize))
    
    if(fc > 1 & hitInSample >= th.min.samples)
      p.treatment[i] <- phyper(hitInSample, hitInPop, failInPop, sampleSize, lower.tail = FALSE)
    
  }
  
  p.treatment <- p.adjust(p.treatment, s.multipleTestCorrection)
  tb.tissues_per_treatment_per_link <- tb.tissues_per_treatment_per_link[which(p.treatment <= th.pval.tissue)]  
  return(tb.tissues_per_treatment_per_link)    
  
}




perform_treatment_and_tissue_filtering <- function(m.grn, 
                                                   m.de.bin, 
                                                   v.conditionGroups, 
                                                   v.tissueGroups,
                                                   df.experiment_condition_annotation,
                                                   tb.condition_treatments,
                                                   tb.condition_tissues,
                                                   th.pval.treatment = 0.05, 
                                                   th.pval.tissue = 0.05,
                                                   th.min.samples = 1, 
                                                   s.multipleTestCorrection = "none"){  
  

  m.grn <- as.matrix(m.grn)
  m.rf_w_treatments <- m.grn
  idx.rf <- which(m.grn > 0, arr.ind = TRUE)
  v.tfs.rf = rownames(m.grn)
  v.tgs.rf = colnames(m.grn)
  
  
  # regulatory network 
  l.treatments_and_tissues <- vector(mode = "list", length = nrow(idx.rf))
  
  pb <- txtProgressBar(min = 0, max = nrow(idx.rf), style = 3)
  for(i in 1:nrow(idx.rf)){
    
    setTxtProgressBar(pb, i)
    
    tf <- v.tfs.rf[idx.rf[i, 1]]
    tg <- v.tgs.rf[idx.rf[i, 2]]
    
    sets.tf <- as.numeric(which(m.de.bin[tf,] == 1))
    sets.tg <- as.numeric(which(m.de.bin[tg,] == 1))
    
    sets.vals <- intersect(sets.tf, sets.tg)
    series.i <- colnames(m.de.bin)[sets.vals]
    
    tb.series.i <- table(series.i)
    series.i <- unique(series.i)
    
    df.annotation.i <- subset(df.experiment_condition_annotation, df.experiment_condition_annotation$unique_ID %in% series.i)
    
    ###
    
    tb.link_treatments <- numeric(length(v.conditionGroups))
    names(tb.link_treatments) = v.conditionGroups
    
    for(j in 1:length(series.i)){
      df.annotation.ij = subset(df.annotation.i, df.annotation.i$unique_ID == series.i[j])
      condition_treatment_1 = df.annotation.ij$condition_treatment_1
      condition_treatment_1 = condition_treatment_1[condition_treatment_1 != ""]
      if(length(condition_treatment_1) > 0){
        tb.link_treatments[v.conditionGroups[condition_treatment_1]] = tb.link_treatments[v.conditionGroups[condition_treatment_1]] + tb.series.i[series.i[j]]
      }
      condition_treatment_2 = df.annotation.ij$condition_treatment_2
      condition_treatment_2 = condition_treatment_2[condition_treatment_2 != ""]
      if(length(condition_treatment_2) > 0){
        tb.link_treatments[v.conditionGroups[condition_treatment_2]] = tb.link_treatments[v.conditionGroups[condition_treatment_2]] + tb.series.i[series.i[j]]
      }
    }
    tb.link_treatments = tb.link_treatments[tb.link_treatments > 0]
    
    if(length(tb.link_treatments) > 0){
      
      # identify significant treatments
      tb.link_treatments <- do_treatment_filtering_single_link(tb.link_treatments, 
                                                               tb.condition_treatments, 
                                                               th.pval.treatment, 
                                                               th.min.samples = th.min.samples, 
                                                               s.multipleTestCorrection = s.multipleTestCorrection)
      
      # remove all links without 
      if(length(tb.link_treatments) > 0){
        
        ## Step 2 - assign dominant tissues to these conditions 
        i.set <- which(v.conditionGroups[df.annotation.i$condition_treatment_1] %in% names(tb.link_treatments))
        i.set <- unique(c(i.set, which(v.conditionGroups[df.annotation.i$condition_treatment_2] %in% names(tb.link_treatments))))
        tb.tissues.i <- table(df.annotation.i$condition_tissue[i.set]) # tissue annotations all treatments 
        
        l.treatments <- vector(mode = "list", length = length(tb.link_treatments))
        names(l.treatments) <- names(tb.link_treatments)
        
        # dominant treatment filter per link
        for(t in 1:length(tb.link_treatments)){ 
          
          l.treatments[[t]] <- list()
          
          # which conditionset 
          i.set <- which(v.conditionGroups[df.annotation.i$condition_treatment_1] == names(tb.link_treatments)[t])
          i.set <- unique(c(i.set, which(v.conditionGroups[df.annotation.i$condition_treatment_2] == names(tb.link_treatments)[t])))
          df.annotation.it = df.annotation.i[i.set,]
          tissues.t = unique(df.annotation.it$condition_tissue)
          
          tb.tissues_link_treatments = numeric(length(tissues.t))
          names(tb.tissues_link_treatments) = tissues.t
          
          for(k in 1:length(tissues.t)){
            df.annotation.itk = subset(df.annotation.it, df.annotation.it$condition_tissue == tissues.t[k])
            tb.tissues_link_treatments[k] = sum(tb.series.i[df.annotation.itk$unique_ID])
          }
          
          tb.tissues_link_treatments <- evaluate_tissues_per_treatment(tb.tissues_link_treatments,  tb.condition_tissues, 
                                                                       th.pval.tissue = th.pval.tissue, 
                                                                       th.min.samples = th.min.samples, 
                                                                       s.multipleTestCorrection = s.multipleTestCorrection)
          
          # if significant tissues for treatment are found
          if(length(tb.tissues_link_treatments) > 0){
            l.treatments[[t]] <- names(tb.tissues_link_treatments)
          }
          
        }
        
        idx.treatment <- which(sapply(l.treatments, function(m) length(m) > 0) == TRUE)
        # list with individual (multiple) tissue elements
        l.treatments <- l.treatments[idx.treatment]
        
        if(length(l.treatments) > 0){
          l.treatments_and_tissues[[i]] <- l.treatments
        }else{
          m.rf_w_treatments[tf,tg] <- 0
          l.treatments_and_tissues[[i]] <- list()
        }
      }else{
        m.rf_w_treatments[tf,tg] <- 0
        l.treatments_and_tissues[[i]] <- list()
      }
    }else{
      m.rf_w_treatments[tf,tg] <- 0
      l.treatments_and_tissues[[i]] <- list()
    }
  }
  close(pb)
  
  ###
  
  idx.treatment <- which(sapply(l.treatments_and_tissues, function(m) length(m) > 0) == TRUE)
  l.treatments_and_tissues <- l.treatments_and_tissues[idx.treatment]
  
  # list with individual (multiple) tissue elements
  # l.treatments <- l.treatments[idx.treatment]
  idx.rf <- idx.rf[idx.treatment,]

  # extract the link condition
  l.regulatoryNetwork_treatments_and_tissues <- vector(mode = "list", length= length(l.treatments_and_tissues))
  pb <- txtProgressBar(min = 0, max = length(l.treatments_and_tissues), style = 3)
  for(i in 1:length(l.treatments_and_tissues)){
    setTxtProgressBar(pb, i)
    idx_links.treatment_tissue_filter <- which(unlist(lapply(l.treatments_and_tissues[[i]], function(m) if(length(m) == 0){FALSE}else{TRUE})) == TRUE)
    l.regulatoryNetwork_treatments_and_tissues[[i]] <-  l.treatments_and_tissues[[i]][idx_links.treatment_tissue_filter]
  }
  close(pb)
  l.regulatoryNetwork_treatments_and_tissues <-  unlist(lapply(l.regulatoryNetwork_treatments_and_tissues, function(m) return(m)), recursive=FALSE)
  

  v.gns <- rownames(m.de.bin)
  v.condition_tissue_pairs <- sapply(unique(v.conditionGroups), function(m) paste(m, "-",  unique(v.tissueGroups)))
  m.gn_condition_tissue_differentialExpression <- matrix(0, nrow = length(v.gns), ncol = (length(v.condition_tissue_pairs)),dimnames = list(v.gns, v.condition_tissue_pairs))
  pb <- txtProgressBar(min = 0, max = length(v.gns), style = 3)
  for(i in 1:length(v.gns)){
    
    setTxtProgressBar(pb, i)
    
    tg <- v.gns[i]
    
    sets.tg <- as.numeric(which(m.de.bin[tg,] == 1))
    sets.vals <- unique(colnames(m.de.bin)[sets.tg])
    
    df.annotation.i <- subset(df.experiment_condition_annotation, df.experiment_condition_annotation$unique_ID %in% sets.vals)
    
    ## Step 1 - identify conditions (sublevel treatments) per link
    tb.treatments <- table(c(v.conditionGroups[df.annotation.i$condition_treatment_1], 
                             v.conditionGroups[df.annotation.i$condition_treatment_2]))
    tb.treatments <- tb.treatments[names(tb.treatments) != ""]
    
    if(length(tb.treatments) > 0){
      
      # identify significant treatments
      # tb.treatments <- do_treatment_filtering_single_link(tb.treatments, tb.treatments.total, th.pval.treatment, th.min.samples = th.min.samples)
      
      # # remove all links without 
      # if(length(tb.treatments) > 0){
      
      ## Step 2 - assign dominant tissues to these conditions 
      # tb.tissues.total <- table(df.annotation$tissue)
      
      i.set <- which(v.conditionGroups[df.annotation.i$condition_treatment_1] %in% names(tb.treatments))
      i.set <- unique(c(i.set, which(v.conditionGroups[df.annotation.i$condition_treatment_2] %in% names(tb.treatments))))
      
      # tb.tissues.j <- table(df.annotation.j$tissue)
      tb.tissues.i <- table(df.annotation.i$condition_tissue[i.set]) # tissue annotations all treatments 
      
      # dominant treatment filter per link
      for(t in 1:length(tb.treatments)){ 
        
        # which conditionset 
        i.set <- which(v.conditionGroups[df.annotation.i$condition_treatment_1] == names(tb.treatments)[t])
        i.set <- unique(c(i.set, which(v.conditionGroups[df.annotation.i$condition_treatment_2] == names(tb.treatments)[t])))
        
        tb.tissues.i.t <- table(df.annotation.i$condition_tissue[i.set])
        #tb.tissues.i.t <- evaluate_tissues_per_treatment(tb.tissues.i.t,  tb.tissues.total, th.pval.tissue = th.pval.tissue, th.min.samples = th.min.samples)
        
        # if significant tissues for treatment are found
        if(length(tb.tissues.i.t) > 0){
          v.cond_tis_pairs.i <- paste(names(tb.treatments)[t], "-",  names(tb.tissues.i.t))
          m.gn_condition_tissue_differentialExpression[tg , v.cond_tis_pairs.i] <- 1
        }
        
      }
    }
    # }
  }
  close(pb)
  
  
  
  # perform treatment filtering for the entire random forest set - integrate with motif late
  v.tfs.rf <- rownames(m.rf_w_treatments)
  v.tgs.rf <- colnames(m.rf_w_treatments)
  idx.rf <- which(m.rf_w_treatments > 0, arr.ind = TRUE)
  
  
  idx.treatment <- which(sapply(l.treatments_and_tissues, function(m) length(m) > 0) == TRUE)
  l.treatments_and_tissues <- l.treatments_and_tissues[idx.treatment]
  
  
  tb.condition_tissue_differentialExpression <- colSums(m.gn_condition_tissue_differentialExpression)
  #message(length(which(tb.condition_tissue_differentialExpression > 0)), " out of ", length(tb.condition_tissue_differentialExpression), " conditions and tissue pairs with expressed genes ")
  
  idx.pairs <- which(tb.condition_tissue_differentialExpression > 0)
  
  m.gn_condition_tissue_differentialExpression <- m.gn_condition_tissue_differentialExpression[ , idx.pairs]
  tb.condition_tissue_differentialExpression <- tb.condition_tissue_differentialExpression[idx.pairs]
  
  v.cond_tiss_pairs <- colnames(m.gn_condition_tissue_differentialExpression)
  
  l.grn_subnetworks <- vector(mode = "list", length = length(v.cond_tiss_pairs))
  names(l.grn_subnetworks) <- v.cond_tiss_pairs
  
  for(i in 1:length(l.grn_subnetworks)){
    l.grn_subnetworks[[i]] <- matrix(0, nrow = nrow(m.grn), ncol = ncol(m.grn), dimnames = list(rownames(m.grn), colnames(m.grn)))
    l.grn_subnetworks[[i]] <- Matrix::Matrix(l.grn_subnetworks[[i]], sparse = TRUE)
  }
  
  pb <- txtProgressBar(min = 0, max = nrow(idx.rf), style = 3)
  for(i in 1:nrow(idx.rf)){
    setTxtProgressBar(pb, i)
    tf <- v.tfs.rf[idx.rf[i, 1]]
    tg <- v.tgs.rf[idx.rf[i, 2]]
    tb.treatments <- l.treatments_and_tissues[[i]]
    for(j in 1:length(tb.treatments)){
      v.ct_pair.ij <- paste(names(tb.treatments)[j], "-", tb.treatments[[j]]) 
      for(k in 1:length(v.ct_pair.ij)){
        idx <- which(names(l.grn_subnetworks) == v.ct_pair.ij[k])
        l.grn_subnetworks[[idx]][tf, tg] <- 1
      }
    }
  }
  close(pb)
  
  idx.subnetworks <- which( lapply(l.grn_subnetworks, sum) > 0)
  l.grn_subnetworks <- l.grn_subnetworks[idx.subnetworks]
  
  return(list(l.treatments_and_tissues=l.treatments_and_tissues, m.rf_w_treatments = m.rf_w_treatments, l.regulatoryNetwork_treatments_and_tissues = l.regulatoryNetwork_treatments_and_tissues, l.grn_subnetworks = l.grn_subnetworks,  m.gn_condition_tissue_differentialExpression = m.gn_condition_tissue_differentialExpression, tb.condition_tissue_differentialExpression = tb.condition_tissue_differentialExpression))
}






#' Step 3 - Context specific annotation and filtering of gene regulatory link predictions
#'
#' This function filter... 
#' @param m.grn = m.grn,
#' @param l.grn_subnetworks = l.grn_subnetworks, 
#' @param df.geneGroups,
#' @param tb.geneGroups,
#' @param v.geneGroups,
#' @param l.geneGroups,
#' @param th.min_number_targets = 2,
#' @param th.min_number_MR_targets = 2,
#' @param th.pval = 0.05
#' @return DNA binding and treatment filtered lead-support grn, condition specific subnetworks, statistics
#' @keywords 
#' @export
#' @examples
annotate_links_with_treatments_and_tissues <- function(m.lead_support_w_motif.grn, 
                                                       m.pvalue_differentialExpression,
                                                       df.experiment_condition_annotation,
                                                       tb.condition_treatments,
                                                       tb.condition_tissues,
                                                       v.conditionGroups, 
                                                       v.tissueGroups,
                                                       th.diffexp = 0.05,
                                                       th.pval.treatment = 0.05, 
                                                       th.pval.tissue = 0.05,
                                                       th.min.samples = 1, 
                                                       s.multipleTestCorrection = "none",
                                                       b.load = "no"){
  
  
  v.conditionGroups=names(tb.condition_treatments)
  names(v.conditionGroups)=names(tb.condition_treatments)
  
  v.tissueGroups=names(tb.condition_tissues)
  names(v.tissueGroups)=names(tb.condition_tissues)
  
  subDir <- paste("tmp/", th.diffexp, sep ="")
  
  if(b.load == "no"){
    
    m.grn = m.lead_support_w_motif.grn
    m.grn[m.grn > 0] = 1
    m.de = m.pvalue_differentialExpression
    
    if (!file.exists(subDir)){
      dir.create(subDir)
    }
    
    m.de.bin <- m.de
    m.de.bin[m.de.bin <= th.diffexp] <- 10
    m.de.bin[m.de.bin <= 1] <- 0
    m.de.bin[m.de.bin > 0] <- 1
    
    strt<-Sys.time()
    
    
    l.res <- perform_treatment_and_tissue_filtering(m.grn = m.grn, 
                                                    m.de.bin = m.de.bin, 
                                                    v.conditionGroups = v.conditionGroups, 
                                                    v.tissueGroups = v.tissueGroups,
                                                    df.experiment_condition_annotation = df.experiment_condition_annotation, 
                                                    tb.condition_treatments=tb.condition_treatments,
                                                    tb.condition_tissues=tb.condition_tissues,
                                                    th.pval.treatment = th.pval.treatment, 
                                                    th.pval.tissue = th.pval.tissue,
                                                    th.min.samples = th.min.samples, 
                                                    s.multipleTestCorrection = s.multipleTestCorrection)
    
    
    # sparse matrix conversion 
    
    l.treatments_and_tissues <- l.res$l.treatments_and_tissues
    m.rf_w_treatments <- l.res$m.rf_w_treatments
    l.regulatoryNetwork_treatments_and_tissues <- l.res$l.regulatoryNetwork_treatments_and_tissues
    m.gn_condition_tissue_differentialExpression <- l.res$m.gn_condition_tissue_differentialExpression
    tb.condition_tissue_differentialExpression = l.res$tb.condition_tissue_differentialExpression
    l.grn_subnetworks <- l.res$l.grn_subnetworks
    
    filename <- paste(subDir,"/l.treatments_and_tissues.rds", sep ="")
    saveRDS(l.treatments_and_tissues, filename)
    
    filename <- paste(subDir,"/m.rf_w_treatments.rds", sep ="")
    saveRDS(m.rf_w_treatments, filename)
    
    filename <- paste(subDir,"/l.regulatoryNetwork_treatments_and_tissues.rds", sep ="")
    saveRDS(l.regulatoryNetwork_treatments_and_tissues, filename)
    
    filename <- paste(subDir,"/m.gn_condition_tissue_differentialExpression.rds", sep ="")
    saveRDS(m.gn_condition_tissue_differentialExpression, filename)
    
    filename <- paste(subDir,"/l.grn_subnetworks.rds", sep ="")
    saveRDS(l.grn_subnetworks, filename)
    
    filename <- paste(subDir,"/tb.condition_tissue_differentialExpression.rds", sep ="")
    saveRDS(tb.condition_tissue_differentialExpression, filename)
    
    print(Sys.time()-strt)
    
  }
  
  filename <- paste(subDir,"/m.rf_w_treatments.rds", sep ="")
  m.grn = readRDS(filename)
  filename <- paste(subDir,"/l.grn_subnetworks.rds", sep ="")
  l.grn_subnetworks = readRDS(filename)

  filename <- paste(subDir,"/tb.condition_tissue_differentialExpression.rds", sep ="")
  tb.condition_tissue_differentialExpression = readRDS(filename)
  
  df.grn_subnetworks_statistics =  data.frame(conditions = names(l.grn_subnetworks), n_links = unlist(lapply(l.grn_subnetworks, sum)))
  
  return(list(m.grn = m.grn, 
              l.grn_subnetworks=l.grn_subnetworks,
              df.grn_subnetworks_statistics = df.grn_subnetworks_statistics, 
              tb.condition_tissue_differentialExpression = tb.condition_tissue_differentialExpression))

  
}








# identify master regulators => save iniital for stability selection
identify_bottom_tier_masterRegulators <- function(m.grn,
                                                  l.grn_subnetworks,
                                                  v.tfs,
                                                  v.genes,
                                                  v.group_genes,
                                                  v.conds,
                                                  df.transcriptionFactorAnnotation, 
                                                  df.geneGroups,
                                                  tb.geneGroups,
                                                  v.geneGroups,
                                                  l.geneGroups,
                                                  th.min_number_targets = 2,
                                                  th.pval = 0.05,
                                                  b.include_under_represented = "yes"){
  

  n.group_genes = length(intersect(v.group_genes, v.genes))
  n.genes = length(v.genes)
  m.tfs_vs_conditions <- matrix(NA, nrow = length(v.tfs), ncol = length(v.conds), dimnames = list(v.tfs, v.conds))
  
  pb <- txtProgressBar(min = 0, max = length(v.conds), style = 3)
  for(i in 1:length(v.conds)){
    
    setTxtProgressBar(pb, i)
    
    ct.i <- v.conds[i]
    
    m.grn.i <- as.matrix(l.grn_subnetworks[[i]])
    df.idx.grn.i <- which(m.grn.i == 1, arr.ind = TRUE)
    df.grn.i <- data.frame(TF = rownames(m.grn.i)[df.idx.grn.i[,1]], TG = colnames(m.grn.i)[df.idx.grn.i[,2]], stringsAsFactors = FALSE)
  
    if(nrow(df.grn.i) > 0){
      
      tfs.grn.i <- unique(df.grn.i$TF) # all transcritption factors in the network
      
      for(j in 1:length(tfs.grn.i)){
        
        tf.ij <- tfs.grn.i[j]
        
        # number of links per transcription factor family member in the subnetwork
        df.grn.ij <- subset(df.grn.i, df.grn.i$TF == tf.ij)  # number of targets in condition by TF
        
        # condition specific network 
        n.group_genes.ij = length(which(df.grn.ij$TG %in% v.group_genes))
        n.genes.ij   = nrow(df.grn.ij)
  
        hitInSample = n_A_B = n.group_genes.ij
        sampleSize = n_A = n.genes.ij
        
        hitInPop = n_B = n.group_genes
        popSize = n_C = n.genes
        failInPop = n_C - n_B
        
        if(hitInSample >= th.min_number_targets){
          
          pval <- 1
          fc <- (n_A_B / n_A) / (n_B / n_C)
          
          if(fc > 1){
            pval <- phyper(n_A_B, n_B, n_C-n_B, n_A,lower.tail= FALSE)
          }else if(fc < 1 & b.include_under_represented == "yes"){
            pval <- phyper(n_A_B, n_B, n_C-n_B, n_A,lower.tail= TRUE)
          }
          
          if(pval <= th.pval){
            m.tfs_vs_conditions[tf.ij,ct.i] <- fc # enriched or depleated over 
          }else{
            m.tfs_vs_conditions[tf.ij,ct.i] <- 1
          }
         
        }
      }
    }
  }
  close(pb)
  
  return(list(m.MR_vs_conditions = m.tfs_vs_conditions))
  
}




# define the bottom layer
identify_regulatory_hierachy = function(m.MR_vs_conditions, 
                                    
                                        m.grn,
                                        l.grn_subnetworks,
                                        
                                        v.tfs = rownames(m.grn),
                                        v.conds = names(l.grn_subnetworks),
                                        
                                        df.transcriptionFactorAnnotation,
                                        
                                        df.geneGroups,
                                        tb.geneGroups,
                                        v.geneGroups,
                                        l.geneGroups,
                                        
                                        th.min_number_MR_targets = 2,
                                        th.pval = 0.05){ # or genes
  
  
 
  v.genes = rownames(df.geneGroups)
  conds = v.conds
  
  l.Hierarchy = vector(mode = "list", length = length(conds))
  names(l.Hierarchy) = conds
  
  v.number_tiers = numeric(length(conds))
  names(v.number_tiers) = conds
  
  l.Hierarchy_nb_tfs_per_tier = vector(mode = "list", length = length(conds))
  names(l.Hierarchy_nb_tfs_per_tier) = conds
  
  l.Hierarchy_tfs_per_tier = vector(mode = "list", length = length(conds))
  names(l.Hierarchy_tfs_per_tier) = conds
  
  
  pb <- txtProgressBar(min = 0, max = length(v.conds), style = 3)
  
  for(i in 1:length(conds)){
    
    setTxtProgressBar(pb, i)
    
    m.grn.i <- as.matrix(l.grn_subnetworks[[i]])
    
    df.idx.grn.i <- which(m.grn.i == 1, arr.ind = TRUE)
    df.grn.i <- data.frame(TF = rownames(m.grn.i)[df.idx.grn.i[,1]], TG = colnames(m.grn.i)[df.idx.grn.i[,2]], stringsAsFactors = FALSE)
    v.tfs.i = unique(df.grn.i$TF)
    v.tgs.i = unique(df.grn.i$TG)
    
    v.MRs = rownames(m.MR_vs_conditions)
    
    v.MR_level1 = rownames(m.MR_vs_conditions)[which(m.MR_vs_conditions[,i] > 0)]
    l.Hierarchy[[i]] = matrix(0, nrow = length(v.MRs), ncol = length(c(v.MRs, v.genes)), dimnames = list(v.MRs, c(v.MRs, v.genes)))
    
    v.tgs_level_prevs = v.MR_level1    
    v.tfs.putative = v.tfs.i
    
    tiers = 1
    v.tfs_known = c()
    
    l.Hierarchy_tfs_per_tier[[i]] = list(v.MR_level1)
    
    b.continue <- TRUE
    
    while(b.continue){
      
      v.tgs_level_next = c()
      
      for(j in 1:length(v.tfs.i)){
        
        tf.ij <- v.tfs.i[j]
        # number of links per transcription factor family member in the subnetwork
        df.grn.ij <- subset(df.grn.i, df.grn.i$TF == tf.ij)  # number of targets in condition by T
        tgs.ij <- unique(df.grn.ij$TG)
        tgs_MR.ij = intersect(tgs.ij, v.tgs_level_prevs) # 
        
        # number of level prev master regulator trgets over all level prev master regulator | condition
        hitInSample = n_A_B = length(tgs_MR.ij) # number of master regulators as targets of TF in condition
        sampleSize = n_A = length(tgs.ij) # number of master regulators in condition (not just targets) - and level
        
        
        hitInPop = n_B = length(v.tgs_level_prevs) # sum(m.rf_w_treatments.stability_selection[tf.ij,]) #n.tgs.grn_w_treatments_w_motifs.r #sum(m.gn_condition_tissue_differentialExpression[tf.IDs.global,ct.i])
        popSize = n_C = length(v.tgs.i) # sum(m.rf_w_treatments.stability_selection) 
 
        # length(v.tgs.i) # all targets in condition (TFs and TGS)
        failInPop = n_C - n_B
        if(hitInSample >= th.min_number_MR_targets){
          pval <- 1
          fc <- (n_A_B / n_A) / (n_B / n_C)
          if(fc > 1){
            pval <- phyper(n_A_B, n_B, n_C-n_B, n_A,lower.tail= FALSE)
          } 
          if(pval <= th.pval){
            l.Hierarchy[[i]][tf.ij, tgs_MR.ij] = fc 
            v.tgs_level_next = c(v.tgs_level_next, tf.ij)
          }
        }
      }
      if(length(v.tgs_level_next) == 0 | all(v.tgs_level_next %in% v.tfs_known)){
        b.continue = FALSE
      }else{
        tiers = tiers + 1 
        v.tgs_level_prevs = v.tgs_level_next
        v.tfs_known = unique(c(v.tfs_known, v.tgs_level_next))
        l.Hierarchy_tfs_per_tier[[i]] = c(l.Hierarchy_tfs_per_tier[[i]], list(v.tgs_level_next))
      }
    } 
    
    nb.tfs_per_tier = numeric(tiers)
    if(tiers > 1){
      for(t in 1:(tiers - 1)){
        v.tfs.t = unlist(l.Hierarchy_tfs_per_tier[[i]][t])
        v.tfs.t_next = unlist(l.Hierarchy_tfs_per_tier[[i]][t + 1])
        idx = (which(v.tfs.t %in% v.tfs.t_next))
        v.tfs.t = v.tfs.t[!v.tfs.t %in% v.tfs.t[idx]]
        l.Hierarchy_tfs_per_tier[[i]][[t]] = v.tfs.t
        nb.tfs_per_tier[t] = length(v.tfs.t)
        nb.tfs_per_tier[t + 1] = length(v.tfs.t_next)
      }
    }else{
      l.Hierarchy_tfs_per_tier[[i]][[1]] = unlist(l.Hierarchy_tfs_per_tier[[i]][1])
      nb.tfs_per_tier = length(unlist(l.Hierarchy_tfs_per_tier[[i]][1]))
    }
    l.Hierarchy_nb_tfs_per_tier[[i]] = nb.tfs_per_tier
    v.number_tiers[i] = tiers
  }
  
  close(pb)
  
  for(i in 1:length(conds)){
    idx = which(l.Hierarchy_nb_tfs_per_tier[[i]] != 0)
    l.Hierarchy_nb_tfs_per_tier[[i]] = l.Hierarchy_nb_tfs_per_tier[[i]][idx]
    v.number_tiers[i] = length(l.Hierarchy_nb_tfs_per_tier[[i]])
  }
  

  layers <- paste("layer_", 1:max(v.number_tiers), sep="")
  df.masterRegulatorHierarchy <- as.data.frame(matrix(rep(NA, 2 + length(layers)), nrow=1))
  names(df.masterRegulatorHierarchy) <- c("condition", "number_of_tiers",  layers)
  
  for(i in 1:length(v.number_tiers)){
    
    df.masterRegulatorHierarchy.i <- as.data.frame(matrix(rep(NA, 2 + length(layers)), nrow=1))
    names(df.masterRegulatorHierarchy.i) <- c("condition", "number_of_tiers",  layers)
    
    df.masterRegulatorHierarchy.i$condition[1] = names(v.number_tiers)[i]
    df.masterRegulatorHierarchy.i$number_of_tiers[1] = v.number_tiers[i]
    
    if( v.number_tiers[i] > 0){
      idx = 3
      for(j in 1:length(l.Hierarchy_nb_tfs_per_tier[[i]])){
        df.masterRegulatorHierarchy.i[1,idx] = l.Hierarchy_nb_tfs_per_tier[[i]][j]
        idx = idx + 1
      }
    }
    df.masterRegulatorHierarchy = rbind(df.masterRegulatorHierarchy, df.masterRegulatorHierarchy.i)
  }
  
  
  return(list(l.Hierarchy=l.Hierarchy, 
              l.Hierarchy_tfs_per_tier=l.Hierarchy_tfs_per_tier, 
              l.Hierarchy_nb_tfs_per_tier = l.Hierarchy_nb_tfs_per_tier, 
              df.masterRegulatorHierarchy=df.masterRegulatorHierarchy, 
              v.number_tiers=v.number_tiers))
  
}



#' Step 4 - Master regulator hierarchy inference
#'
#' This function identifies the master regulator hierarchy
#' @param m.grn = m.grn,
#' @param l.grn_subnetworks = l.grn_subnetworks, 
#' @param df.geneGroups,
#' @param tb.geneGroups,
#' @param v.geneGroups,
#' @param l.geneGroups,
#' @param th.min_number_targets = 2,
#' @param th.min_number_MR_targets = 2,
#' @param th.pval = 0.05
#' @return condition specific master regulators and regulator hierarchies (across all genes and across gene groups)
#' @keywords 
#' @export
#' @examples
do_master_regulator_hierarchy_inference = function(m.grn,
                                                   m.pvalue_differentialExpression=m.pvalue_differentialExpression,
                                                   l.grn_subnetworks, 
                                                   df.transcriptionFactorAnnotation,
                                                   df.geneGroups,
                                                   tb.geneGroups,
                                                   v.geneGroups,
                                                   l.geneGroups,
                                                   th.min_number_targets = 2,
                                                   th.min_number_MR_targets = 2,
                                                   th.pval = 0.05, 
                                                   foldername.results = "results/"){
  
  
  v.genes = rownames(l.data$m.foldChange_differentialExpression)
  v.group_genes = rownames(l.data$df.geneGroups)
  
  df.transcriptionFactorAnnotation = subset(df.transcriptionFactorAnnotation, df.transcriptionFactorAnnotation$with_geneExpression == "yes")

  v.tfs = unique(df.transcriptionFactorAnnotation$TF_ID)
  v.tf_families = character(length(v.tfs))
  names(v.tf_families) = v.tfs
  
  for(i in 1:length(v.tfs)){
    df.transcriptionFactorAnnotation.i = subset(df.transcriptionFactorAnnotation, df.transcriptionFactorAnnotation$TF_ID == v.tfs[i])
    v.tf_families[i] = paste(df.transcriptionFactorAnnotation.i$TF_Fam, collapse = ", ")
  }
   
  v.tfs = rownames(m.grn)
  v.conds = names(l.grn_subnetworks)
  
  message("----- Identify bottom tier master regulators -----")
  
  # identify level 1 master regulators (bottom level - tfs x domains)
  l.res.MR <- identify_bottom_tier_masterRegulators(m.grn,
                                                 l.grn_subnetworks,
                                                 v.tfs,
                                                 v.genes = v.genes,
                                                 v.group_genes = v.group_genes,
                                                 v.conds,
                                                 df.transcriptionFactorAnnotation,
                                                 df.geneGroups,
                                                 tb.geneGroups,
                                                 v.geneGroups,
                                                 l.geneGroups,
                                                 th.min_number_targets,
                                                 th.pval,
                                                 b.include_under_represented = "yes")

  m.MR_vs_conditions <- l.res.MR$m.MR_vs_conditions  # A) TFs versus Conditions (Matrix plot) P(TF,C)
  m.MR_vs_conditions[!is.na(m.MR_vs_conditions)] = 1
  m.MR_vs_conditions[is.na(m.MR_vs_conditions)] = 0
  number_of_conditions_per_master_regulator = rowSums(m.MR_vs_conditions)
  
  
  message("----- Infer master regulator regulatory hierarchy -----")

  # per condition
  l.res = identify_regulatory_hierachy(m.MR_vs_conditions, 
                                       m.grn,
                                       l.grn_subnetworks,
                                       v.tfs = rownames(m.grn),
                                       v.conds = names(l.grn_subnetworks),
                                       df.transcriptionFactorAnnotation,
                                       df.geneGroups,
                                       tb.geneGroups,
                                       v.geneGroups,
                                       l.geneGroups,
                                       th.min_number_MR_targets = th.min_number_MR_targets,
                                       th.pval = th.pval)
    
  l.Hierarchy = l.res$l.Hierarchy
  l.Hierarchy_tfs_per_tier = l.res$l.Hierarchy_tfs_per_tier
  l.Hierarchy_nb_tfs_per_tier = l.res$l.Hierarchy_nb_tfs_per_tier
  l.df.masterRegulatorHierarchy = l.res$df.masterRegulatorHierarchy
  v.number_tiers = l.res$v.number_tiers
  
  m.MR_vs_conditions <- l.res.MR$m.MR_vs_conditions  # A) TFs versus Conditions (Matrix plot) P(TF,C)

  # # metabolic domains 
  l.tfs_vs_domains_given_condition <- vector(mode = "list", length = length(v.conds))
  names(l.tfs_vs_domains_given_condition) <- v.conds
  
  l.MR_hiearchy_vs_domains_given_condition <- vector(mode = "list", length = length(v.conds))
  names(l.MR_hiearchy_vs_domains_given_condition) <- v.conds
  
  pb <- txtProgressBar(min = 0, max = length(v.conds), style = 3)
  for(i in 1:length(v.conds)){
    
    setTxtProgressBar(pb, i)
    
    ct.i <- v.conds[i]
    m.grn.i <- as.matrix(l.grn_subnetworks[[i]])
    df.idx.grn.i <- which(m.grn.i == 1, arr.ind = TRUE)
    df.grn.i <- data.frame(TF = rownames(m.grn.i)[df.idx.grn.i[,1]], TG = colnames(m.grn.i)[df.idx.grn.i[,2]], stringsAsFactors = FALSE)
    l.tfs_vs_domains_given_condition[[i]] <- matrix(NA, nrow = length(v.tfs), ncol = length(v.geneGroups), dimnames = list(v.tfs, (v.geneGroups)))

    if(nrow(df.grn.i) > 0){
      idx = which(m.MR_vs_conditions[,ct.i] > 1 | m.MR_vs_conditions[,ct.i] < 1)
      mr.grn.i = rownames(m.MR_vs_conditions)[idx]
      for(j in 1:length(mr.grn.i)){
        tf.ij <- mr.grn.i[j]
        # number of links per transcription factor family member in the subnetwork
        df.grn.ij <- subset(df.grn.i, df.grn.i$TF == tf.ij)  # number of targets in condition by TF
        for(d in 1:length(v.geneGroups)){ # add the domain level
          
          ## domain genes in target genes - condition dependent
          hitInSample = n_A_B = length(intersect((unique(df.grn.ij$TG)), l.geneGroups[[d]]))
          sampleSize = n_A = length(unique(df.grn.ij$TG))
          hitInPop = n_B =  length(intersect(v.genes, l.geneGroups[[d]]))
          popSize = n_C = length(v.genes)
          failInPop = n_C-n_B
          
          if(hitInSample >= th.min_number_targets){
            pval <- 1
            fc <- (n_A_B / n_A) / (n_B / n_C)
            if(fc > 1){
              pval <- phyper(n_A_B, n_B, n_C-n_B, n_A,lower.tail= FALSE)
            }else if(fc < 1){
              pval <- phyper(n_A_B, n_B, n_C-n_B, n_A,lower.tail= TRUE)
            }
            if(pval <= th.pval){
              l.tfs_vs_domains_given_condition[[i]][tf.ij,d] <- fc
            }
          }
        }
      }
    }
    
    #### estimage the filtered hierarchy per domai
    l.MR_hiearchy_vs_domains_given_condition[[i]] <- matrix(NA, nrow = length(v.tfs), ncol = length(c(v.tfs, v.geneGroups)), dimnames = list(v.tfs, c(v.tfs, v.geneGroups)))
    l.MR_hiearchy_vs_domains_given_condition[[i]][rownames(l.tfs_vs_domains_given_condition[[i]]), colnames(l.tfs_vs_domains_given_condition[[i]])] <- l.tfs_vs_domains_given_condition[[i]] 
    
    tfs.d = c()
    for(d in 1:length(v.geneGroups)){ # add the domain level
      idx = which(l.MR_hiearchy_vs_domains_given_condition[[i]][,v.geneGroups[d]] > 1 | l.MR_hiearchy_vs_domains_given_condition[[i]][,v.geneGroups[d]] < 1)
      if(length(idx) > 0){
        tfs.d = c(tfs.d, names(l.MR_hiearchy_vs_domains_given_condition[[i]][idx,v.geneGroups[d]]))
      }
    }
    
    tfs_layer_minus_1 = tfs.d
    while(length(tfs_layer_minus_1) > 0){
      tfs_layer = names(which(l.Hierarchy[[i]][,tfs_layer_minus_1] > 0))
      if(length(tfs_layer) > 0){
        print(tfs_layer)
        l.MR_hiearchy_vs_domains_given_condition[[i]][tfs_layer, tfs_layer_minus_1] = l.Hierarchy[[i]][tfs_layer,tfs_layer_minus_1]
      }
      tfs_layer_minus_1 = tfs_layer
    }
    
  }
  close(pb)
  
  
  
  return(list(l.Hierarchy=l.Hierarchy, 
              l.MR_hiearchy_vs_domains_given_condition = l.MR_hiearchy_vs_domains_given_condition,
              l.Hierarchy_tfs_per_tier=l.Hierarchy_tfs_per_tier,
              l.Hierarchy_nb_tfs_per_tier=l.Hierarchy_nb_tfs_per_tier,
              l.df.masterRegulatorHierarchy=l.df.masterRegulatorHierarchy,
              l.MR_vs_geneGroups_given_condition=l.tfs_vs_domains_given_condition,
              v.number_tiers=v.number_tiers,
              m.MR_vs_conditions = l.res.MR$m.MR_vs_conditions,  # A) TFs versus Conditions (Matrix plot) P(TF,C)
              number_of_conditions_per_master_regulator=number_of_conditions_per_master_regulator))
  
}



#' format_results
#'
#' This function formats all results
#' @keywords 
#' @export
format_results = function(l.grn_subnetworks,
                          tb.condition_tissue_differentialExpression,
                          l.Hierarchy, 
                          l.Hierarchy_tfs_per_tier,
                          l.Hierarchy_nb_tfs_per_tier,
                          l.df.masterRegulatorHierarchy,
                          v.number_tiers,
                          m.MR_vs_conditions,  # A) TFs versus Conditions (Matrix plot) P(TF,C)
                          l.MR_vs_geneGroups_given_condition,  # B) per condition - TFs versus Domains (P(TF,D|C)) => also cumulative plot 
                          number_of_conditions_per_master_regulator,
                          tb.condition_treatments,
                          tb.condition_tissues,
                          df.transcriptionFactorAnnotation, 
                          df.geneGroups,
                          tb.geneGroups,
                          v.geneGroups,
                          l.geneGroups,
                          th.pval = 0.05,
                          foldername.results = "results/",
                          file.subGeneGroups = NULL){
  
  
  
  
  
  if(!is.null(file.subGeneGroups)){
    
    df.pwys = read.table(file.subGeneGroups,header = T,  stringsAsFactors = F, fill =  T, sep = "\t", quote = "")
    
    df.pwy_regulation = c()
    
    v.pwys = unique(df.pwys$Pathway.name)
    v.conds = names(l.grn_subnetworks)
    v.tfs = rownames(l.res.grn_tfbs$m.lead_support_w_motif.grn)
    
    pb <- txtProgressBar(min = 0, max = length(v.conds), style = 3)
    for(i in 1:length(v.conds)){
      
      setTxtProgressBar(pb, i)
      
      ct.i <- v.conds[i]
      
      m.grn.i <- as.matrix(l.grn_subnetworks[[i]])
      df.idx.grn.i <- which(m.grn.i == 1, arr.ind = TRUE)
      df.grn.i <- data.frame(TF = rownames(m.grn.i)[df.idx.grn.i[,1]], TG = colnames(m.grn.i)[df.idx.grn.i[,2]], stringsAsFactors = FALSE)
      
      if(nrow(df.grn.i) > 0){
        
        for(p in 1:length(v.pwys)){ # add the domain level
          
          df.pwys.p = subset(df.pwys, df.pwys$Pathway.name == v.pwys[p])
          v.gns.p = df.pwys.p$Gene.id
          
          idx = which(m.MR_vs_conditions[,ct.i] > 1 | m.MR_vs_conditions[,ct.i] < 1)
          mr.grn.i = rownames(m.MR_vs_conditions)[idx]
          
          df.grn.ij <- subset(df.grn.i, df.grn.i$TF %in% mr.grn.i ) 
          df.grn.ij <- subset(df.grn.ij, df.grn.ij$TG %in% v.gns.p ) 
          
          
          if(nrow(df.grn.ij) > 0){
            
            tgs = length(unique(df.grn.ij$TG))
            tfs = length(unique(df.grn.ij$TF))
            
            df.pwy_regulation <- rbind(df.pwy_regulation, data.frame(condition_tissue = ct.i, 
                                                                     pathway = v.pwys[p],
                                                                     tfs = tfs,
                                                                     tgs = tgs,
                                                                     ratio_tfs_vs_tgs = tfs / tgs))
            
            
          }
        }
      }
    }
    close(pb)
    
    
    write.csv(df.pwy_regulation, "results/df.condition_specific_pathway_regulation.csv", row.names = F)
    
  }
  
  df.transcriptionFactorAnnotation = subset(df.transcriptionFactorAnnotation, df.transcriptionFactorAnnotation$with_geneExpression == "yes")
  v.tfs = unique(df.transcriptionFactorAnnotation$TF_ID)
  v.tf_families = character(length(v.tfs))
  names(v.tf_families) = v.tfs
  for(i in 1:length(v.tfs)){
    df.transcriptionFactorAnnotation.i = subset(df.transcriptionFactorAnnotation, df.transcriptionFactorAnnotation$TF_ID == v.tfs[i])
    v.tf_families[i] = paste(unique(df.transcriptionFactorAnnotation.i$TF_Fam), collapse = ", ")
  }
  
  tb.tf_families = table(v.tf_families)
  
  df.transcriptionFactorAnnotation = subset(df.transcriptionFactorAnnotation, df.transcriptionFactorAnnotation$with_geneExpression == "yes")
  df.geneGroups = subset(df.geneGroups, df.geneGroups$with_geneExpression == "yes")
  v.tfs = unique(df.transcriptionFactorAnnotation$TF_ID)
  v.genes =  unique(c(v.tfs, rownames(df.geneGroups)))
  v.gns_geneGroups = rownames(df.geneGroups)
  
  ####
  
  filename = paste(foldername.results, "cumulative_numbers_of_active_genes_in_condition_groups.pdf", sep = "")
  df.data <- data.frame(cond = names(tb.condition_tissue_differentialExpression), nr = as.numeric(tb.condition_tissue_differentialExpression))
  p <- ggplot(df.data, aes(cond, nr))
  p <- p + geom_bar(stat = "identity") + theme_bw() + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + ylab("cumulative number of active (expressed and repressed) genes at differential expression p-value < 0.05")
  pdf(filename, 18,8)
  p
  dev.off()
  
  ####
  
  v.fams <- unique(v.tf_families)
  df.data <- data.frame(TF = character(), Fam = character(), Cond = character())
  
  for(i in 1:length(l.grn_subnetworks)){
    m.grn <- l.grn_subnetworks[[i]]
    m.grn = as.matrix(m.grn)
    df.idx.grn <- which(m.grn == 1, arr.ind = TRUE)
    df.grn <- data.frame(TF = rownames(m.grn)[df.idx.grn[,1]], TG = colnames(m.grn)[df.idx.grn[,2]], stringsAsFactors = FALSE)
    tfs.grn <- unique(df.grn$TF)
    ct.i <- names(l.grn_subnetworks)[i]
    for(j in 1:length(v.fams)){
      # family tfs in the grn (diff exp)
      tf.IDs <- names(v.tf_families[tfs.grn])[which(v.tf_families[tfs.grn] == v.fams[j])]
      # tf.IDs.global <- intersect(rownames(m.rf), names(v.tf_families)[which(v.tf_families == v.fams[j])])
      if(length(tf.IDs) > 0){
        df.data <- rbind(df.data, 
                         data.frame(TF = tf.IDs, 
                                    Fam = rep(v.fams[j] , length(tf.IDs)), 
                                    Cond = rep(ct.i, length(tf.IDs))))
      }
    }
  }
  
  # how many conditions does a TF occur in 
  df.data <- data.frame(TF = as.numeric(table(df.data$TF)) / length(l.grn_subnetworks)  * 100, Fam = as.character(v.tf_families[names(table(df.data$TF))]))
  p1 <- ggplot(df.data, aes(x=Fam, y=TF, fill=Fam))
  p1 <- p1 + geom_boxplot() + scale_fill_grey() + theme_bw() + theme(axis.text.x = element_text(angle = 270, hjust = 0)) + guides(fill=FALSE) + xlab("Transcription Factor family") + ylab("Percentage of conditions of TF family activity (%)")
  
  
  filename <- paste(foldername.results, "number_of_conditions_per_member_of_TFfam_in_percentage.pdf", sep = "")
  pdf(filename, 18, 8)
  p1
  dev.off()
  
  #####

  
  # Condition specific networks
  subDir = paste(foldername.results, "condition_specific_networks/", sep = "")
  # filename <- paste(getwd(),"/",subDir, ct.i,".html", sep ="")
  
  if (!file.exists(subDir)){
    dir.create(subDir)
  }
  
  for(i in 1:length(l.grn_subnetworks)){
    m.grn <- l.grn_subnetworks[[i]]
    m.grn = as.matrix(m.grn)
    df.idx.grn <- which(m.grn> 0, arr.ind = TRUE)
    df.grn <- data.frame(TF = rownames(m.grn)[df.idx.grn[,1]], TG = colnames(m.grn)[df.idx.grn[,2]], stringsAsFactors = FALSE)
    filename <- paste(subDir, names(l.grn_subnetworks)[i], ".csv", sep ="")
    write.csv2(df.grn, filename, row.names = FALSE)
  }
  

  # Master Regulator
  if(TRUE){
    
    hist(number_of_conditions_per_master_regulator / dim(m.MR_vs_conditions)[2] * 100)
    
    m.heatmap <- t(m.MR_vs_conditions)
    df.heatmap <- melt(m.heatmap)
    df.heatmap$value <- log(df.heatmap$value)
    df.heatmap$value[df.heatmap$value == -Inf] <- 0
    
    df.MR_vs_conditions <- df.heatmap
    df.MR_vs_conditions <- subset(df.MR_vs_conditions, !is.na(df.MR_vs_conditions$value))
    
    df.heatmap$Var2 <- paste(df.heatmap$Var2, "(", v.tf_families[df.heatmap$Var2], ")", sep = "")
    
    df.heatmap$value[df.heatmap$value < log(0.1)] <- log(0.1)
    df.heatmap$value[df.heatmap$value > log(10)] <- log(10)
    
    v.breaks = c(log(0.1), log(0.25), log(0.5), log(1), log(2), log(4), log(10))
    v.lables <- c("< 0.1", "0.25", "0.5","1","2","4", "> 10")#as.character(exp(v.breaks))
    
    ggheatmap <- ggplot(df.heatmap, aes(Var1, Var2, fill = value)) + geom_tile(color = "black") + scale_fill_gradient2(low = "yellow", high = "blue", mid = "black", midpoint = 0, name = "fold change", breaks=v.breaks, labels=v.lables) + theme_minimal() + theme(axis.title.x=element_blank(), axis.text.x = element_text(angle = 45, vjust = 1, size = 10, hjust = 1, color = "black")) + theme(axis.title.y=element_blank(), axis.text.y = element_text(angle = 0, vjust = , size = 10, hjust = 1, color = "black"))
    
    filename <- paste(foldername.results, "masterRegulators_vs_conditions.pdf", sep = "")
    pdf(filename, width=20, height=120)
    plot(ggheatmap)
    dev.off()
    
    ####
    
    v.MRs = rownames(m.MR_vs_conditions)
    v.Conds = colnames(m.MR_vs_conditions)

    m.MR_vs_geneGroups_across_conditions <- matrix(0, nrow = length(v.MRs), ncol = length(v.geneGroups), dimnames = list(v.MRs, v.geneGroups))
    for(i in 1:length(v.Conds)){
      m.heatmap <- l.MR_vs_geneGroups_given_condition[v.Conds[i]][[1]]
      m.heatmap[is.na(m.heatmap)] <- 0
      m.heatmap[m.heatmap < 0] <- 0
      m.heatmap[m.heatmap > 0] <- 1
      m.MR_vs_geneGroups_across_conditions[, colnames(m.heatmap)] <- m.MR_vs_geneGroups_across_conditions[, colnames(m.heatmap)] + m.heatmap[rownames(m.MR_vs_geneGroups_across_conditions),]
    }
    m.MR_vs_geneGroups_across_conditions <- round(m.MR_vs_geneGroups_across_conditions / length(v.Conds) * 100)
    df.heatmap <- melt(t(m.MR_vs_geneGroups_across_conditions))
    df.heatmap$Var2 <- paste(df.heatmap$Var2, "(", v.tf_families[df.heatmap$Var2], ")", sep = "")
    ggheatmap <- ggplot(df.heatmap, aes(Var1, Var2, fill = value)) + geom_tile() + scale_fill_gradient(low = "white", high = "red", name = "Percentage of total master regulators") + theme_minimal() + theme(axis.title.x=element_blank(), axis.text.x = element_text(angle = 45, vjust = 1, size = 8, hjust = 1, color = "black")) + theme(axis.title.y=element_blank(), axis.text.y = element_text(angle = 0, vjust = , size = 8, hjust = 1, color = "black"))
  
    filename <- paste(foldername.results, "masterRegulator_vs_GeneGroups_across_Conditions.pdf", sep = "")
    pdf(filename, width=10, height=100)
    plot(ggheatmap)
    dev.off()
    
    
    filename <- paste(foldername.results, "masterRegulator_vs_GeneGroups_across_Conditions_clustered.pdf", sep = "")
    m.heatmap <- m.MR_vs_geneGroups_across_conditions
    
    rownames(m.heatmap) <- paste(rownames(m.heatmap), "(", v.tf_families[rownames(m.heatmap)], ")", sep = "")
    
    paletteLength <- 10
    myColor <- colorRampPalette(c("white", "red"))(paletteLength)
    # length(breaks) == length(paletteLength) + 1
    # use floor and ceiling to deal with even/odd length pallettelengths
    myBreaks <- seq(min(m.heatmap), max(m.heatmap), length.out=paletteLength)
    
    p <- pheatmap(m.heatmap,  color=myColor, breaks=myBreaks)
    save_pheatmap_pdf(p, filename, width=10, height=110)
  }
  ####
  
  if(TRUE){
    filename <- paste(foldername.results, "df.masterRegulatorHierarchy_genes_of_geneGroups.csv", sep = "")
    write.table(l.df.masterRegulatorHierarchy, filename, row.names = F, sep = ";" )
  }
  
  #####

  #####


  if(TRUE){
  
    v.tf.fams.selection = unique(v.tf_families[rownames(m.MR_vs_geneGroups_across_conditions)])
    l.fams_vs_geneGroups = vector(mode = "list", length = 3)
    for(i in 1:2){l.fams_vs_geneGroups[[i]] = matrix(0, nrow = length(v.tf.fams.selection), ncol = length(v.geneGroups), dimnames = list(v.tf.fams.selection, v.geneGroups))}
    l.fams_vs_geneGroups[[3]] = matrix(1, nrow = length(v.tf.fams.selection), ncol = length(v.geneGroups), dimnames = list(v.tf.fams.selection, v.geneGroups))
    names(l.fams_vs_geneGroups) =c ("numbers", "percentage", "enrichment")
    
    for(i in 1:length(v.tf.fams.selection)){
      
      fam.i = v.tf.fams.selection[i]
      idx = which(v.tf_families[rownames(m.MR_vs_geneGroups_across_conditions)] == fam.i)
      
      for(j in 1:length(v.geneGroups)){
        
        v.ij.all = m.MR_vs_geneGroups_across_conditions[, v.geneGroups[j]]
        v.ij = m.MR_vs_geneGroups_across_conditions[idx, v.geneGroups[j]]
        v.tfs.ij = names(which(v.ij > 0))
        
        l.fams_vs_geneGroups[[1]][i,j] = length(v.tfs.ij)
        l.fams_vs_geneGroups[[2]][i,j] = length(v.tfs.ij) / tb.tf_families[fam.i] * 100
        
        # enrichment of the particular activity
        
        ## domain genes in target genes - condition dependent 
        hitInSample = n_A_B = length(v.tfs.ij)
        sampleSize = n_A = length(which(v.ij.all > 0))
        
        ## condition independent part 
        hitInPop = n_B = tb.tf_families[fam.i]
        popSize = n_C = sum(tb.tf_families)
        failInPop = n_C-n_B
        
        if(hitInSample >= 1){
          pval <- 1
          fc <- (n_A_B / n_A) / (n_B / n_C)
          if(fc > 1){
            pval <- phyper(n_A_B, n_B, n_C-n_B, n_A,lower.tail= FALSE)
          }else if(fc < 1){
            pval <- phyper(n_A_B, n_B, n_C-n_B, n_A,lower.tail= TRUE)
          }
          if(pval <= th.pval){
            l.fams_vs_geneGroups[[3]][i,j] <- fc
          }
        }
      }
    }
    
  
    subDir = paste(foldername.results, "family_v_geneGroups/", sep = "")
  
    if (!file.exists(subDir)){
      dir.create(subDir)
    }
    
    m.heatmap <- l.fams_vs_geneGroups[[1]]
    df.heatmap <- melt(t(m.heatmap))
    ggheatmap <- ggplot(df.heatmap, aes(Var1, Var2, fill = value)) + geom_tile() + scale_fill_gradient(low = "white", high = "red", name = "Numbers") + theme_minimal() + theme(axis.title.x=element_blank(), axis.text.x = element_text(angle = 45, vjust = 1, size = 8, hjust = 1, color = "black")) + theme(axis.title.y=element_blank(), axis.text.y = element_text(angle = 0, vjust = , size = 8, hjust = 1, color = "black"))
    filename <- paste(subDir, "numbers.pdf", sep ="")
    pdf(filename, width=10, height=10)
    plot(ggheatmap)
    dev.off()
    
    m.heatmap <- l.fams_vs_geneGroups[[2]]
    df.heatmap <- melt(t(m.heatmap))
    ggheatmap <- ggplot(df.heatmap, aes(Var1, Var2, fill = value)) + geom_tile() + scale_fill_gradient(low = "white", high = "red", name = "Percentage") + theme_minimal() + theme(axis.title.x=element_blank(), axis.text.x = element_text(angle = 45, vjust = 1, size = 8, hjust = 1, color = "black")) + theme(axis.title.y=element_blank(), axis.text.y = element_text(angle = 0, vjust = , size = 8, hjust = 1, color = "black"))
    filename <- paste(subDir, "percentage.pdf", sep ="")
    pdf(filename, width=10, height=10)
    plot(ggheatmap)
    dev.off()
    
    m.heatmap <- l.fams_vs_geneGroups[[3]]
    m.heatmap = log(m.heatmap)
    m.heatmap[m.heatmap == -Inf] = 0
    m.heatmap[m.heatmap == 0] = NA
    df.heatmap <- melt(t(m.heatmap))
    ggheatmap <- ggplot(df.heatmap, aes(Var1, Var2, fill = value)) + geom_tile() + scale_fill_gradient(low = "white", high = "red", name = "log FC") + theme_minimal() + theme(axis.title.x=element_blank(), axis.text.x = element_text(angle = 45, vjust = 1, size = 8, hjust = 1, color = "black")) + theme(axis.title.y=element_blank(), axis.text.y = element_text(angle = 0, vjust = , size = 8, hjust = 1, color = "black"))
    filename <- paste(subDir, "enrichment.pdf", sep ="")
    pdf(filename, width=10, height=10)
    plot(ggheatmap)
    dev.off()
    
    
    v.tf.fams.selection = unique(v.tf_families[rownames(m.MR_vs_conditions)])
    v.conds.selection = colnames(m.MR_vs_conditions)
  
    l.fams_vs_conds = vector(mode = "list", length = 3)
    for(i in 1:2)
      l.fams_vs_conds[[i]] = matrix(0, nrow = length(v.tf.fams.selection), ncol = length(v.conds.selection), dimnames = list(v.tf.fams.selection, v.conds.selection))
    l.fams_vs_conds[[3]] = matrix(1, nrow = length(v.tf.fams.selection), ncol = length(v.conds.selection), dimnames = list(v.tf.fams.selection, v.conds.selection))
    names(l.fams_vs_conds) = c("numbers", "percentage", "enrichment")
    
    for(i in 1:length(v.tf.fams.selection)){
      fam.i = v.tf.fams.selection[i]
      idx = which(v.tf_families[rownames(m.MR_vs_conditions)] == fam.i)
      for(j in 1:length(v.conds.selection)){
        v.ij.all = m.MR_vs_conditions[, v.conds.selection[j]]
        v.ij = m.MR_vs_conditions[idx, v.conds.selection[j]]
        v.tfs.ij = names(which(v.ij > 0))
        l.fams_vs_conds[[1]][i,j] = length(v.tfs.ij)
        l.fams_vs_conds[[2]][i,j] = length(v.tfs.ij) / tb.tf_families[fam.i] * 100
        
        # enrichment of the particular activity
        ## domain genes in target genes - condition dependent 
        hitInSample = n_A_B = length(v.tfs.ij)
        sampleSize = n_A = length(which(v.ij.all > 0))
        ## condition independent part 
        hitInPop = n_B = tb.tf_families[fam.i]
        popSize = n_C = sum(tb.tf_families)
        failInPop = n_C-n_B
        
        if(hitInSample >= 1){
          pval <- 1
          fc <- (n_A_B / n_A) / (n_B / n_C)
          if(fc > 1){
            pval <- phyper(n_A_B, n_B, n_C-n_B, n_A,lower.tail= FALSE)
          }else if(fc < 1){
            pval <- phyper(n_A_B, n_B, n_C-n_B, n_A,lower.tail= TRUE)
          }
          
          if(pval <= th.pval){
            l.fams_vs_conds[[3]][i,j] <- fc
            
          }
        }
        
      }
      
    }
    
    subDir = paste(foldername.results, "family_v_conditions/", sep = "")
    
    if (!file.exists(subDir)){
      dir.create(subDir)
    }
    
    m.heatmap <- l.fams_vs_conds[[1]]
    df.heatmap <- melt(t(m.heatmap))
    ggheatmap <- ggplot(df.heatmap, aes(Var1, Var2, fill = value)) + geom_tile() + scale_fill_gradient(low = "white", high = "red", name = "Numbers") + theme_minimal() + theme(axis.title.x=element_blank(), axis.text.x = element_text(angle = 45, vjust = 1, size = 8, hjust = 1, color = "black")) + theme(axis.title.y=element_blank(), axis.text.y = element_text(angle = 0, vjust = , size = 8, hjust = 1, color = "black"))
    filename <- paste(subDir, "numbers.pdf", sep ="")
    pdf(filename, width=10, height=10)
    plot(ggheatmap)
    dev.off()
    
    m.heatmap <- l.fams_vs_conds[[2]]
    df.heatmap <- melt(t(m.heatmap))
    ggheatmap <- ggplot(df.heatmap, aes(Var1, Var2, fill = value)) + geom_tile() + scale_fill_gradient(low = "white", high = "red", name = "Percentage") + theme_minimal() + theme(axis.title.x=element_blank(), axis.text.x = element_text(angle = 45, vjust = 1, size = 8, hjust = 1, color = "black")) + theme(axis.title.y=element_blank(), axis.text.y = element_text(angle = 0, vjust = , size = 8, hjust = 1, color = "black"))
    filename <- paste(subDir, "percentage.pdf", sep ="")
    pdf(filename, width=10, height=10)
    plot(ggheatmap)
    dev.off()
    
    m.heatmap <- l.fams_vs_conds[[3]]
    m.heatmap = log(m.heatmap)
    m.heatmap[m.heatmap == -Inf] = 0
    m.heatmap[m.heatmap == 0] = NA
    df.heatmap <- melt(t(m.heatmap))
    ggheatmap <- ggplot(df.heatmap, aes(Var1, Var2, fill = value)) + geom_tile() + scale_fill_gradient(low = "white", high = "red", name = "log FC") + theme_minimal() + theme(axis.title.x=element_blank(), axis.text.x = element_text(angle = 45, vjust = 1, size = 8, hjust = 1, color = "black")) + theme(axis.title.y=element_blank(), axis.text.y = element_text(angle = 0, vjust = , size = 8, hjust = 1, color = "black"))
    filename <- paste(subDir, "enrichment.pdf", sep ="")
    pdf(filename, width=10, height=10)
    plot(ggheatmap)
    dev.off()
    
    ######
    
    filename <- paste(subDir, "enrichment_clustered.pdf", sep = "")
    m.heatmap <- l.fams_vs_conds[[3]]
    m.heatmap = log(m.heatmap)
    m.heatmap[m.heatmap == -Inf] = 0
    m.heatmap[m.heatmap == 0] = -10 
    
    paletteLength <- 10
    myColor <- colorRampPalette(c("white", "red", "blue"))(paletteLength)
    # length(breaks) == length(paletteLength) + 1
    # use floor and ceiling to deal with even/odd length pallettelengths
    myBreaks <- seq(min(m.heatmap), max(m.heatmap), length.out=paletteLength)
    
    p <- pheatmap(m.heatmap,  color=myColor, breaks=myBreaks)
    save_pheatmap_pdf(p, filename, width=10, height=10)
    
    
    
    
    
    ######
  
    tb.geneGroup_activity = rep(0, length(v.geneGroups))
    names(tb.geneGroup_activity) = v.geneGroups
    
    b.tfs.active = rep(0, length(rownames(m.MR_vs_conditions)))
    names(b.tfs.active) = rownames(m.MR_vs_conditions)
    
    for(j in 1:length(v.geneGroups)){
      for(i in 1:length(v.conds.selection)){
        m.MR_vs_geneGroups_given_condition = l.MR_vs_geneGroups_given_condition[[i]]
        idx = which(m.MR_vs_geneGroups_given_condition[,v.geneGroups[j]] > 0)
        b.tfs.active[idx] = 1
      }
      tb.geneGroup_activity[j] = sum(b.tfs.active)
    }
    
    v.tf.fams.selection = unique(v.tf_families[rownames(m.MR_vs_conditions)])
    v.conds.selection = names(l.MR_vs_geneGroups_given_condition)
    
    m.fams_vs_geneGroups = matrix(0, nrow = length(v.conds.selection), ncol = length(v.geneGroups), dimnames = list(v.conds.selection, v.geneGroups))
    
    l.geneGroups_vs_conds = vector(mode = "list", length = 3)
    for(i in 1:2) 
      l.geneGroups_vs_conds[[i]] = matrix(0, nrow = length(v.conds.selection), ncol = length(v.geneGroups), dimnames = list(v.conds.selection, v.geneGroups))
    l.geneGroups_vs_conds[[3]] = matrix(1, nrow = length(v.conds.selection), ncol = length(v.geneGroups), dimnames = list(v.conds.selection, v.geneGroups))
    names(l.geneGroups_vs_conds) =c ("numbers", "percentage", "enrichment")
    
    for(i in 1:length(v.conds.selection)){
      m.MR_vs_geneGroups_given_condition = l.MR_vs_geneGroups_given_condition[[i]]
  
      for(j in 1:length(v.geneGroups)){
        
        m.ij = m.MR_vs_geneGroups_given_condition
        m.ij[is.na(m.ij)] = 0
        
        v.tfs.ij = names(which(m.MR_vs_geneGroups_given_condition[,v.geneGroups[j]] > 0))
        
        # number and percentage 
        l.geneGroups_vs_conds[[1]][i,j] = length(v.tfs.ij)
        l.geneGroups_vs_conds[[2]][i,j] = length(v.tfs.ij) / length(v.tf_families)
        
        # enrichment 
        hitInSample = n_A_B = length(v.tfs.ij) # active in domain given condition
        sampleSize = n_A = length(which(rowSums(m.ij) > 0)) # active across domains given condition
        
        ## condition independent part 
        hitInPop = n_B =  tb.geneGroup_activity[v.geneGroups[j]] # active in domain across conditions
        popSize = n_C = sum(tb.geneGroup_activity) # active in all domains across conditions
        
        if(hitInSample >= 1){
          pval <- 1
          fc <- (n_A_B / n_A) / (n_B / n_C)
          if(fc > 1){
            pval <- phyper(n_A_B, n_B, n_C-n_B, n_A,lower.tail= FALSE)
          }else if(fc < 1){
            pval <- phyper(n_A_B, n_B, n_C-n_B, n_A,lower.tail= TRUE)
          }
          
          if(pval <= th.pval){
            l.geneGroups_vs_conds[[3]][i,j] <- fc
            
          }
        }
        
      }
      
    }
  
    
    
    subDir = paste(foldername.results, "geneGroups_v_conditions/", sep = "")
    
    if (!file.exists(subDir)){
      dir.create(subDir)
    }
    
    m.heatmap <- l.geneGroups_vs_conds[[1]]
    df.heatmap <- melt(t(m.heatmap))
    ggheatmap <- ggplot(df.heatmap, aes(Var1, Var2, fill = value)) + geom_tile() + scale_fill_gradient(low = "white", high = "red", name = "Numbers") + theme_minimal() + theme(axis.title.x=element_blank(), axis.text.x = element_text(angle = 45, vjust = 1, size = 8, hjust = 1, color = "black")) + theme(axis.title.y=element_blank(), axis.text.y = element_text(angle = 0, vjust = , size = 8, hjust = 1, color = "black"))
    filename <- paste(subDir, "numbers.pdf", sep ="")
    pdf(filename, width=10, height=10)
    plot(ggheatmap)
    dev.off()
    
    m.heatmap <- l.geneGroups_vs_conds[[2]]
    df.heatmap <- melt(t(m.heatmap))
    ggheatmap <- ggplot(df.heatmap, aes(Var1, Var2, fill = value)) + geom_tile() + scale_fill_gradient(low = "white", high = "red", name = "Percentage") + theme_minimal() + theme(axis.title.x=element_blank(), axis.text.x = element_text(angle = 45, vjust = 1, size = 8, hjust = 1, color = "black")) + theme(axis.title.y=element_blank(), axis.text.y = element_text(angle = 0, vjust = , size = 8, hjust = 1, color = "black"))
    filename <- paste(subDir, "percentage.pdf", sep ="")
    pdf(filename, width=10, height=10)
    plot(ggheatmap)
    dev.off()
    
    m.heatmap <- l.geneGroups_vs_conds[[3]]
    m.heatmap = log(m.heatmap)
    m.heatmap[m.heatmap == -Inf] = 0
    m.heatmap[m.heatmap == 0] = NA
    df.heatmap <- melt(t(m.heatmap))
    ggheatmap <- ggplot(df.heatmap, aes(Var1, Var2, fill = value)) + geom_tile() + scale_fill_gradient(low = "white", high = "red", name = "log FC") + theme_minimal() + theme(axis.title.x=element_blank(), axis.text.x = element_text(angle = 45, vjust = 1, size = 8, hjust = 1, color = "black")) + theme(axis.title.y=element_blank(), axis.text.y = element_text(angle = 0, vjust = , size = 8, hjust = 1, color = "black"))
    filename <- paste(subDir, "enrichment.pdf", sep ="")
    pdf(filename, width=10, height=10)
    plot(ggheatmap)
    dev.off()
    
    
  }
    
  if(TRUE){
    
    df.ratio = data.frame(condition= character(length(l.grn_subnetworks)),
                          grn_size = numeric(length(l.grn_subnetworks)), 
                          tmt_number = numeric(length(l.grn_subnetworks)),
                          stringsAsFactors = F)
    
    v.sub_net = unlist(lapply(l.grn_subnetworks, sum))
    
    for(i in 1:length(v.sub_net)){
      
      condition = unlist(strsplit(names(v.sub_net)[i],  " - "))
      df.ratio$condition[i] = condition[1]
      df.ratio$grn_size[i] = as.numeric(v.sub_net[i])
      df.ratio$tmt_number[i] = as.numeric(tb.condition_treatments[condition[1]]) 
      
    }
    
    
    
    df.ratio["ratio"] = round(df.ratio$grn_size / df.ratio$tmt_number,2)

    
    filename <- paste(foldername.results, "df.ratio.csv", sep ="")
    write.table(df.ratio, filename, sep = ";", row.names = F)
    
    
  }

  if(TRUE){
    
    #library(xtable)
    
    df.condition_network_statistics = data.frame(condition = names(l.grn_subnetworks))
    df.condition_network_statistics["TFs"] = unlist(lapply(l.grn_subnetworks, function(m){ m = as.matrix(m);  length(which(rowSums(m) > 0))}))
    df.condition_network_statistics["TF families"] = unlist(lapply(l.grn_subnetworks, function(m){ m = as.matrix(m); length(unique(v.tf_families[(which(rowSums(m) > 0))]))}))
    
    df.condition_network_statistics["TGs"] = unlist(lapply(l.grn_subnetworks, function(m){ m = as.matrix(m);  length(which(colSums(m) > 0))}))
    df.condition_network_statistics["Links"] = unlist(lapply(l.grn_subnetworks, function(m){ m = as.matrix(m);  length(which(m > 0))}))
    
    df.condition_network_statistics["Ratio TFs vs Enzymes"] = 0
    df.condition_network_statistics["Enzymes"] = 0
    
    df.condition_network_enzymes_statistics = data.frame(condition = names(l.grn_subnetworks))
    df.condition_network_enzymes_statistics["Enzymes"] = 0
    
    # df.condition_network_enzymes_statistics["amines and polyamines"] = 0
    # df.condition_network_enzymes_statistics["amino acids"] = 0
    # df.condition_network_enzymes_statistics["carbohydrates"] = 0
    # df.condition_network_enzymes_statistics["cofactors"] = 0
    # df.condition_network_enzymes_statistics["detoxification"] = 0
    # df.condition_network_enzymes_statistics["energy"] = 0
    # df.condition_network_enzymes_statistics["fatty acids and lipids"] = 0
    # df.condition_network_enzymes_statistics["hormones"] = 0
    # df.condition_network_enzymes_statistics["inorganic nutrients"] = 0
    # df.condition_network_enzymes_statistics["nucleotides"] = 0
    # df.condition_network_enzymes_statistics["redox"] = 0
    # df.condition_network_enzymes_statistics["primary specialized interfaced"] = 0
    # df.condition_network_enzymes_statistics["specialized"] = 0
    # df.condition_network_enzymes_statistics["other"] = 0
    
    for(i in 1:length(l.grn_subnetworks)){
      
      m.grn = l.grn_subnetworks[[i]]
      m.grn = as.matrix(m.grn)
      
      df.idx.grn <- which(m.grn > 0, arr.ind = TRUE)
      df.grn <- data.frame(TF = rownames(m.grn)[df.idx.grn[,1]], TG = colnames(m.grn)[df.idx.grn[,2]], stringsAsFactors = FALSE)
      names(df.grn) <- c("TF","Target")
      
      n.gns_geneGroups.i = length(intersect(df.grn$Target, v.gns_geneGroups))
      
      df.condition_network_statistics[i,"Ratio TFs vs Enzymes"] = as.character(round((df.condition_network_statistics$TFs[i] / n.gns_geneGroups.i),2))
      
      df.condition_network_statistics$Enzymes[i] = as.character(n.gns_geneGroups.i)
      df.condition_network_enzymes_statistics$Enzymes[i] = as.character(n.gns_geneGroups.i)
      if(n.gns_geneGroups.i > 0){
        for(j in 1:length(v.geneGroups)){
          id = paste("", v.geneGroups[j], sep = "")
          n.enz_dom.j = length(intersect(df.grn$Target, unlist(l.geneGroups[id])))
          percentage = round(n.enz_dom.j / n.gns_geneGroups.i * 100,0)
          df.condition_network_enzymes_statistics[i, v.geneGroups[j]] =  paste(n.enz_dom.j, " (", percentage, "%)", sep = "")
        }
      }
    }
    
    # 
    # df.condition_network_pathways_statistics = data.frame(condition = names(l.grn_subnetworks))
    # df.condition_network_pathways_statistics["Pathways"] = 0
    # 
    # df.condition_network_pathways_statistics["amines and polyamines"] = 0
    # df.condition_network_pathways_statistics["amino acids"] = 0
    # df.condition_network_pathways_statistics["carbohydrates"] = 0
    # df.condition_network_pathways_statistics["cofactors"] = 0
    # df.condition_network_pathways_statistics["detoxification"] = 0
    # df.condition_network_pathways_statistics["energy"] = 0
    # df.condition_network_pathways_statistics["fatty acids and lipids"] = 0
    # df.condition_network_pathways_statistics["hormones"] = 0
    # df.condition_network_pathways_statistics["inorganic nutrients"] = 0
    # df.condition_network_pathways_statistics["nucleotides"] = 0
    # df.condition_network_pathways_statistics["redox"] = 0
    # df.condition_network_pathways_statistics["primary specialized interfaced"] = 0
    # df.condition_network_pathways_statistics["specialized"] = 0
    # df.condition_network_pathways_statistics["other"] = 0
    
    
    # for(i in 1:length(l.grn_subnetworks.stability_selection)){
    #   
    #   m.grn = l.grn_subnetworks.stability_selection[[i]]
    #   
    #   df.idx.grn <- which(m.grn > 0, arr.ind = TRUE)
    #   df.grn <- data.frame(TF = rownames(m.grn)[df.idx.grn[,1]], TG = colnames(m.grn)[df.idx.grn[,2]], stringsAsFactors = FALSE)
    #   names(df.grn) <- c("TF","Target")
    #   
    #   v.enz.i = intersect(df.grn$Target, v.enz)
    #   
    #   n.enz.i = length(intersect(df.grn$Target, v.enz))
    #   
    #   df.pwys.i = subset(df.enzymes.2, df.enzymes.2$Gene.id %in% v.enz.i)
    #   
    #   n.pwys.i = length(unique(df.pwys.i$Pathway.id))
    #   df.condition_network_statistics$Pathways[i] = as.character(n.pwys.i)
    #   df.condition_network_pathways_statistics$Pathways[i] = as.character(n.pwys.i)
    #   
    #   if(n.enz.i > 0){
    #     for(j in 1:length(v.domains)){
    #       id = paste("", v.domains[j], sep = "")
    #       v.enz_dom.j = intersect(df.grn$Target, unlist(l.domain_enz[id]))
    #       
    #       df.pwys.ij = subset(df.enzymes.2, df.enzymes.2$Gene.id %in% v.enz_dom.j)
    #       n.pwys.ij = length(unique(df.pwys.ij$Pathway.id))
    #       
    #       percentage = round(n.pwys.ij / n.pwys.i * 100,0)
    #       df.condition_network_pathways_statistics[i, v.domains[j]] =  paste(n.pwys.ij, " (", percentage, "%)", sep = "")
    #     }
    #   }
    # }
    
    
    #xtable(df.condition_network_statistics)
    #xtable(df.condition_network_enzymes_statistics)
    #xtable(df.condition_network_pathways_statistics)
    
    write.table(df.condition_network_statistics, "df.condition_network_statistics.txt", row.names = F, quote = F, sep = ";")
    write.table(df.condition_network_enzymes_statistics, "df.condition_network_enzymes_statistics.txt", row.names = F, quote = F, sep = ";")
    #write.table(df.condition_network_pathways_statistics, "df.condition_network_pathways_statistics.csv", row.names = F, quote = F, sep = ";")
    
    
    filename <- paste(foldername.results, "df.condition_network_statistics.txt", sep ="")
    write.table(df.condition_network_statistics, filename, sep = ";", row.names = F)
    
    filename <- paste(foldername.results, "df.condition_network_enzymes_statistics.txt", sep ="")
    write.table(df.condition_network_enzymes_statistics, filename, sep = ";", row.names = F)
    
    #filename <- paste(foldername.results, "df.condition_network_pathways_statistics.csv", sep ="")
    #write.table(df.condition_network_pathways_statistics.csv, filename, sep = ";", row.names = F)
    
    
    
    ## condition tissue heatmap
    condition_tuples = strsplit(as.character(df.condition_network_statistics$condition), " -")
    features = names(df.condition_network_statistics)[-1]
      
    for(j in 1:length(features)){
      
      features_j = as.numeric(df.condition_network_statistics[,features[j]])
    
      v.treatments = unique(as.character(lapply(condition_tuples, function(m){m[1]})))
      v.tissues = unique(as.character(lapply(condition_tuples, function(m){m[2]})))
      
      m.treatment_vs_tissue = matrix(0, nrow = length(v.treatments), ncol = length(v.tissues), 
                                      dimnames = list(v.treatments, v.tissues))
  
      for(i in 1:length(condition_tuples)){
        m.treatment_vs_tissue[condition_tuples[[i]][1], condition_tuples[[i]][2]] = features_j[i]
      }
  
      m.heatmap <- m.treatment_vs_tissue
      df.heatmap <- melt(t(m.heatmap))
      ggheatmap <- ggplot(df.heatmap, aes(Var1, Var2, fill = value)) + geom_tile() + scale_fill_gradient(low = "white", high = "red", name = "Numbers") + theme_minimal() + theme(axis.title.x=element_blank(), axis.text.x = element_text(angle = 45, vjust = 1, size = 8, hjust = 1, color = "black")) + theme(axis.title.y=element_blank(), axis.text.y = element_text(angle = 0, vjust = , size = 8, hjust = 1, color = "black"))
      filename <- paste(foldername.results, features[j], "_treatments_vs_tissue.pdf", sep ="")
      pdf(filename, width=5, height=8)
      plot(ggheatmap)
      dev.off() 
    
    }
    
    ### 
    
    l.MR_hiearchy_vs_domains_given_condition = l.results$l.res.MR_hierarchy$l.MR_hiearchy_vs_domains_given_condition
    v.conds = names(l.results$l.res.MR_hierarchy$l.MR_hiearchy_vs_domains_given_condition)
    
    
    files = paste(foldername.results, "geneGroups_masterRegulatorHierarchies/", sep = "")
    if(!file.exists(paste(foldername.results, "geneGroups_masterRegulatorHierarchies/", sep = ""))){
      dir.create(paste(foldername.results, "geneGroups_masterRegulatorHierarchies/", sep = ""))
    }
    
    if(FALSE){
     for(i in 1:length(v.conds)){
      
      ct.i = v.conds[i]
      tfs.i = rownames(l.MR_hiearchy_vs_domains_given_condition[ct.i][[1]])
      tgs.i = colnames(l.MR_hiearchy_vs_domains_given_condition[ct.i][[1]])
      m.MR_hiearchy_vs_domains_given_condition = l.MR_hiearchy_vs_domains_given_condition[ct.i][[1]]
      
      tfs.i = paste(rownames(m.MR_hiearchy_vs_domains_given_condition), " (", v.tf_families[rownames(m.MR_hiearchy_vs_domains_given_condition)], ")", sep ="")
      tgs.i = colnames(m.MR_hiearchy_vs_domains_given_condition)
      tgs_tfs.i = tgs.i[!tgs.i %in% v.geneGroups]
      tgs_tfs_w_fams.i = paste(tgs_tfs.i, "(", v.tf_families[tgs_tfs.i], ")", sep ="")
      
      m.net = m.MR_hiearchy_vs_domains_given_condition
      rownames(m.net) = tfs.i
      
      idx = which(colnames(m.net) %in% v.geneGroups)
      v.geneGroups.i = colnames(m.net)[idx]
      
      colnames(m.net) = c(tgs_tfs_w_fams.i, v.geneGroups.i)
      
      m.net[is.na(m.net)] = 0
      m.net[m.net > 1] = 1
      
      #g <- graph_from_adjacency_matrix(m.net)
      
      df.idx.MR_hierarchy.i <- which(m.net > 0, arr.ind = TRUE)
      
      if(nrow(df.idx.MR_hierarchy.i) > 0){
        
        df.MR_hierarchy.i <- data.frame(TF = rownames(m.net)[df.idx.MR_hierarchy.i[,1]], TG = colnames(m.net)[df.idx.MR_hierarchy.i[,2]], stringsAsFactors = FALSE)
        
        g <- graph_from_data_frame(df.MR_hierarchy.i, directed = TRUE)
        wc <- cluster_walktrap(g)
        members <- membership(wc)
        
        #Convert to object suitable for networkD3
        net_d3  <- igraph_to_networkD3(g,group = members)
        
        # filename = paste(output_folder, "Condition_specific_analyses\\df.regulatory_hiearchy_MR_per_tier_bottom_genes_enzymes.csv", sep = "")
        # filename = paste(output_folder, "Condition_specific_analyses\\df.regulatory_hiearchy_MR_per_tier_bottom_genes_enzymes.csv", sep = "")
        # subDir <- files[s]#  paste(output_folder, "C:/Users/Michael/Documents/Computational_Biology/MERIT_V1.0/figures/Analysis_per_condition/masterRegulatorHierarchies/", sep ="")
        
        subDir = files
        filename <- paste(getwd(), "/", subDir, ct.i,".html", sep ="")
        net_d3$links["value"] <- 1
        net_d3$nodes <- remove.factors(net_d3$nodes)
        net_d3$nodes$name[!net_d3$nodes$name %in% v.geneGroups] <- paste(net_d3$nodes$name[!net_d3$nodes$name %in% v.geneGroups], "(", v.tf_families[as.character(net_d3$nodes$name[!net_d3$nodes$name %in% v.geneGroups])], ")", sep ="")
        #
        sankeyNetwork(Links = net_d3$links, Nodes = net_d3$nodes,
                      Source = "source", Target = "target",
                      NodeID = "name", Value = "value",
                      fontSize = 20, nodeWidth = 20, units = "Letter(s)", fontFamily = "sans-serif", iterations = 0,
                      nodePadding = 1, height = 5000, width = 10000, sinksRight =TRUE, margin = NULL)%>%
          htmlwidgets::prependContent(htmltools::tags$h1(paste("Master Regulator Hierarchy in condition: ",ct.i))) %>%
          saveNetwork(file = filename)
      }
      
    }
    }
    
    if(FALSE){
      
      for(i in 1:length(v.number_tiers)){
      
      if(v.number_tiers[i] > 0){
        
        ct.i = names(v.number_tiers)[i]
        tfs.i = rownames(l.MR_vs_geneGroups_given_condition[ct.i][[1]])
        tgs.i = colnames(l.MR_vs_geneGroups_given_condition[ct.i][[1]])
        l.Hierarchy[[i]][tfs.i, tgs.i] = l.MR_vs_geneGroups_given_condition[ct.i][[1]]
        
        tfs.i = paste(rownames(l.Hierarchy[[i]]), "(", v.tf_families[rownames(l.Hierarchy[[i]])], ")", sep ="")
        tgs.i = colnames(l.Hierarchy[[i]])
        tgs_tfs.i = tgs.i[!tgs.i %in% v.geneGroups]
        tgs_tfs_w_fams.i = paste(tgs_tfs.i, "(", v.tf_families[tgs_tfs.i], ")", sep ="")
        
        m.net = l.Hierarchy[[i]]
        rownames(m.net) = tfs.i
        
        idx = which(colnames(m.net) %in% v.geneGroups)
        v.geneGroups.i = colnames(m.net)[idx]
        
        colnames(m.net) = c(tgs_tfs_w_fams.i, v.geneGroups.i)
        
        m.net[is.na(m.net)] = 0
        m.net[m.net > 1] = 1
                                                              
        #g <- graph_from_adjacency_matrix(m.net)
        
     
        df.idx.MR_hierarchy.i <- which(m.net > 0, arr.ind = TRUE)
        
        if(nrow(df.idx.MR_hierarchy.i) > 0){
          
          df.MR_hierarchy.i <- data.frame(TF = rownames(m.net)[df.idx.MR_hierarchy.i[,1]], TG = colnames(m.net)[df.idx.MR_hierarchy.i[,2]], stringsAsFactors = FALSE)
          
          g <- graph_from_data_frame(df.MR_hierarchy.i, directed = TRUE)
          wc <- cluster_walktrap(g)
          members <- membership(wc)
          
          #Convert to object suitable for networkD3
          net_d3  <- igraph_to_networkD3(g,group = members)
          
          # filename = paste(output_folder, "Condition_specific_analyses\\df.regulatory_hiearchy_MR_per_tier_bottom_genes_enzymes.csv", sep = "")
          # filename = paste(output_folder, "Condition_specific_analyses\\df.regulatory_hiearchy_MR_per_tier_bottom_genes_enzymes.csv", sep = "")
          # subDir <- files[s]#  paste(output_folder, "C:/Users/Michael/Documents/Computational_Biology/MERIT_V1.0/figures/Analysis_per_condition/masterRegulatorHierarchies/", sep ="")
          
          subDir = files
          filename <- paste(getwd(), "/", subDir, ct.i,".html", sep ="")
          net_d3$links["value"] <- 1
          net_d3$nodes <- remove.factors(net_d3$nodes)
          net_d3$nodes$name[!net_d3$nodes$name %in% v.geneGroups] <- paste(net_d3$nodes$name[!net_d3$nodes$name %in% v.geneGroups], "(", v.tf_families[as.character(net_d3$nodes$name[!net_d3$nodes$name %in% v.geneGroups])], ")", sep ="")
          #
          sankeyNetwork(Links = net_d3$links, Nodes = net_d3$nodes,
                        Source = "source", Target = "target",
                        NodeID = "name", Value = "value",
                        fontSize = 20, nodeWidth = 20, units = "Letter(s)", fontFamily = "sans-serif", iterations = 0,
                        nodePadding = 1, height = 5000, width = 10000, sinksRight =TRUE, margin = NULL)%>%
            htmlwidgets::prependContent(htmltools::tags$h1(paste("Master Regulator Hierarchy in condition: ",ct.i))) %>%
            saveNetwork(file = filename)
        }
        
      }
    }
    
    }
    
    
    ### 

    m.ratio_maps = matrix(0, nrow = length(v.conds.selection), ncol = length(v.geneGroups), dimnames = list(v.conds.selection, v.geneGroups))
    
    for(i in 1:length(v.conds.selection)){
      
      m.MR_vs_geneGroups_given_condition = l.MR_vs_geneGroups_given_condition[[i]]
      
      m.grn = as.matrix(l.grn_subnetworks[v.conds.selection[i]][[1]])

      df.idx.grn <- which(m.grn > 0, arr.ind = TRUE)
      df.grn <- data.frame(TF = rownames(m.grn)[df.idx.grn[,1]], TG = colnames(m.grn)[df.idx.grn[,2]], stringsAsFactors = FALSE)
      names(df.grn) <- c("TF","Target")
      
      
      for(j in 1:length(v.geneGroups)){
        
        m.ij = m.MR_vs_geneGroups_given_condition
        m.ij[is.na(m.ij)] = 0
        
        v.tfs.ij = names(which(m.MR_vs_geneGroups_given_condition[,v.geneGroups[j]] > 0))
        v.genes_geneGroups = l.geneGroups[[j]]
        df.grn.j = subset(df.grn, df.grn$TF %in% v.tfs.ij)
        df.grn.j <- subset(df.grn.j, df.grn.j$Target %in% v.genes_geneGroups)
        v.tgs.ij = unique(df.grn.j$Target)
        
        
        # enrichment 
        hitInSample = n_A_B = length(v.tfs.ij) # active in domain given condition
        sampleSize = n_A = length(which(rowSums(m.ij) > 0)) # active across domains given condition
        
        ## condition independent part 
        hitInPop = n_B =  tb.geneGroup_activity[v.geneGroups[j]] # active in domain across conditions
        popSize = n_C = sum(tb.geneGroup_activity) # active in all domains across conditions
        
        if(hitInSample >= 1){
          pval <- 1
          fc <- (n_A_B / n_A) / (n_B / n_C)
          if(fc > 1){
            pval <- phyper(n_A_B, n_B, n_C-n_B, n_A,lower.tail= FALSE)
          }else if(fc < 1){
            pval <- phyper(n_A_B, n_B, n_C-n_B, n_A,lower.tail= TRUE)
          }
          
          if(pval <= th.pval){
            
            
            m.ratio_maps[i,j] = length(v.tfs.ij) / length(v.tgs.ij)
            
          }
        }
        
      }
      
    }
    
    
    subDir = paste(foldername.results, "geneGroups_v_conditions/", sep = "")
    
    
    df.heatmap <- melt(t(m.ratio_maps))
    ggheatmap <- ggplot(df.heatmap, aes(Var1, Var2, fill = value)) + geom_tile() + scale_fill_gradient(low = "white", high = "red", name = "Ratio TFs vs Targets") + theme_minimal() + theme(axis.title.x=element_blank(), axis.text.x = element_text(angle = 45, vjust = 1, size = 8, hjust = 1, color = "black")) + theme(axis.title.y=element_blank(), axis.text.y = element_text(angle = 0, vjust = , size = 8, hjust = 1, color = "black"))
    filename <- paste(subDir, "ratio_TFs_vs_TGs.pdf", sep ="")
    pdf(filename, width=10, height=10)
    plot(ggheatmap)
    dev.off()
    
    
  }
  
}


#' run algorithm 
#'
#' @param b.load_grn_inference ("yes","no")
#' @param b.load_TFBS_inference "yes","no")
#' @param b.load_treatment_tissue_inference ("yes","no")
#' @param m.foldChange_differentialExpression
#' @param m.pvalue_differentialExpression
#' @param df.experiment_condition_annotation
#' @param tb.condition_treatments
#' @param tb.condition_tissues
#' @param df.transcriptionFactorAnnotation 
#' @param df.geneGroups
#' @param tb.geneGroups
#' @param v.geneGroups
#' @param l.geneGroups
#' @param n.cpus
#' @param seed (default = 1234)
#' @param importance.measure  (default = "impurity")
#' @param n.trees (default  = 1000)
#' @param n.lead_method_expression_shuffling (default  = 1)
#' @param nbootstrap (default = =100)
#' @param nstepsLARS (default = 5)
#' @param th.lead_grn_method (default = 0.95)
#' @param th.support_grn_methods (default = 0.95)
#' @param n.grnSupport (default = 1)
#' @param file.TF_to_Motif_IDs (default = "data/TF_to_Motif_IDs.txt")
#' @param file.TFBS_motifs (default = "data/Transcription_factor_weight_matrix_Arabidopsis_thaliana.txt")
#' @param file.promoterSeq (default = "data/TAIR10_upstream_1000_20101104.txt")
#' @param file.geneSeq (default = "data/TAIR10_seq_20110103_representative_gene_model_updated.txt")
#' @param th.pre_tss (default= 1000)
#' @param th.post_tss (default = 200)
#' @param genome_nucleotide_distribution ACGT distribution (default = c(0.3253439, 0.1746561, 0.1746561, 0.3253439),
#' @param th.pval.known_motifs (default = 0.05)
#' @param th.diffexp (default = 0.05)
#' @param th.pval.treatment (default = 0.05) 
#' @param th.pval.tissue (default= 0.05)
#' @param th.min.samples (default = 1) 
#' @param s.multipleTestCorrection (default = "none")
#' @param th.min_number_targets (default = 2)
#' @param th.min_number_MR_targets (default = 2)
#' @param th.pval_masterRegulator (default = 0.05)
#' @param foldername.tmp (default = "tmp/") 
#' @param foldername.results = (default = "results/")
#' @keywords 
#' @export
#' @examples
#' 
#' @return a list of results from all 4 steps
#' setwd(...) # set to dataset
#' 
#' 
#' 
#' l.data  =  load_datasets(filename.genes = "data/genes.txt",
#'                          filename.experiment_ids = "data/experiment_ids.txt",
#'                          filename.foldChange_differentialExpression = "data/m.foldChange_differentialExpression.txt",
#'                          filename.pvalue_differentialExpression =	"data/m.pvalue_differentialExpression.txt",
#'                          filename.experiment_condition_tissue_annotation =	"data/experiment_annotation.txt",
#'                          filename.transcriptionfactor_annotation = "data/df.transcriptionFactorAnnotation.txt", 
#'                          filename.geneGroups = "data/df.enzymes_w_metabolic_domains.txt")
#' 
#' 
#' 
#' 
#' l.results = run_MERIT(b.load_grn_inference = "yes",
#'                       b.load_TFBS_inference = "yes",
#'                       b.load_treatment_tissue_inference = "yes",
#'                       m.foldChange_differentialExpression=l.data$m.foldChange_differentialExpression,
#'                       m.pvalue_differentialExpression=l.data$m.pvalue_differentialExpression,
#'                       df.experiment_condition_annotation=l.data$df.experiment_condition_annotation,
#'                       tb.condition_treatments=l.data$tb.condition_treatments,
#'                       tb.condition_tissues=l.data$tb.condition_tissues,
#'                       df.transcriptionFactorAnnotation=l.data$df.transcriptionFactorAnnotation, 
#'                       df.geneGroups=l.data$df.geneGroups,
#'                       tb.geneGroups=l.data$tb.geneGroups,
#'                       v.geneGroups=l.data$v.geneGroups,
#'                       l.geneGroups=l.data$l.geneGroups, 
#'                       n.cpus = 3,
#'                       seed=1234,
#'                       importance.measure="impurity",
#'                       n.trees=1000,
#'                       n.lead_method_expression_shuffling = 1,
#'                       n.bootstrap=100,
#'                       n.stepsLARS=5,
#'                       th.lead_grn_method = 0.95,
#'                       th.support_grn_methods = 0.95,
#'                       n.grnSupport = 1,
#'                       file.TF_to_Motif_IDs = "data/TF_to_Motif_IDs.txt",
#'                       file.TFBS_motifs = "data/Transcription_factor_weight_matrix_Arabidopsis_thaliana.txt",
#'                       file.promoterSeq = "data/TAIR10_upstream_1000_20101104.txt",
#'                       file.geneSeq = "data/TAIR10_seq_20110103_representative_gene_model_updated.txt",
#'                       th.pre_tss = 1000,
#'                       th.post_tss = 200,
#'                       genome_nucleotide_distribution = c(A = 0.3253439, C = 0.1746561, G = 0.1746561, T = 0.3253439 ),
#'                       th.pval.known_motifs = 0.05,
#'                       th.diffexp = 0.05,
#'                       th.pval.treatment = 0.05, 
#'                       th.pval.tissue = 0.05,
#'                       th.min.samples = 1, 
#'                       s.multipleTestCorrection = "none",
#'                       th.min_number_targets = 2,
#'                       th.min_number_MR_targets = 2,
#'                       th.pval_masterRegulator = 0.05, 
#'                       foldername.tmp = "tmp/", 
#'                       foldername.results = "results/")
#' 
#' 
#' 
#' 
#' 
#' 
#' # Step 1 - Gene regulatory network inference using ensemble regression with Monte Carlo based threshold selection 
#' l.res.grn = l.results$l.res.grn
#' 
#' # Step 2 - Transcription factor direct target promoter binding based filtering of gene regulatory link predictions
#' l.res.grn_tfbs = l.results$l.res.grn_tfbs
#'
#' # Step3 - Context specific annotation and filtering of gene regulatory link predictions
#' l.res.link_annotation = l.results$l.res.link_annotation
#' 
#' # Step 4 - Master regulator hierarchy inference
#' l.res.MR_hierarchy = l.results$l.res.MR_hierarchy
#' 
#' 
#' 
#' format_results(l.grn_subnetworks = l.res.link_annotation$l.grn_subnetworks,
#'                tb.condition_tissue_differentialExpression = l.res.link_annotation$tb.condition_tissue_differentialExpression,
#'                l.Hierarchy=l.res.MR_hierarchy$l.Hierarchy, 
#'                l.Hierarchy_tfs_per_tier=l.res.MR_hierarchy$l.Hierarchy_tfs_per_tier,
#'                l.Hierarchy_nb_tfs_per_tier=l.res.MR_hierarchy$l.Hierarchy_nb_tfs_per_tier,
#'                l.df.masterRegulatorHierarchy=l.res.MR_hierarchy$l.df.masterRegulatorHierarchy,
#'                v.number_tiers=l.res.MR_hierarchy$v.number_tiers, 
#'                m.MR_vs_conditions = l.res.MR_hierarchy$m.MR_vs_conditions,  # A) TFs versus Conditions (Matrix plot) P(TF,C)
#'                l.MR_vs_geneGroups_given_condition = l.res.MR_hierarchy$l.MR_vs_geneGroups_given_condition,  # B) per condition - TFs versus Domains (P(TF,D|C)) => also cumulative plot 
#'                number_of_conditions_per_master_regulator=l.res.MR_hierarchy$number_of_conditions_per_master_regulator,
#'                tb.condition_treatments=l.data$tb.condition_treatments,
#'                tb.condition_tissues=l.data$tb.condition_tissues,
#'                df.transcriptionFactorAnnotation=l.data$df.transcriptionFactorAnnotation, 
#'                df.geneGroups=l.data$df.geneGroups,
#'                tb.geneGroups=l.data$tb.geneGroups,
#'                v.geneGroups=l.data$v.geneGroups,
#'                l.geneGroups=l.data$l.geneGroups,
#'                th.pval = 0.05,
#'                foldername.results = "results/",
#'                file.subGeneGroups = "data/aracyc_pathways.20180327")
#' 
#' 
#'                
run_MERIT <- function(b.load_grn_inference = "yes",
                      b.load_TFBS_inference = "yes",
                      b.load_treatment_tissue_inference = "yes",
                      m.foldChange_differentialExpression=m.foldChange_differentialExpression,
                      m.pvalue_differentialExpression=m.pvalue_differentialExpression,
                      df.experiment_condition_annotation=df.experiment_condition_annotation,
                      tb.condition_treatments=tb.condition_treatments,
                      tb.condition_tissues=tb.condition_tissues,
                      df.transcriptionFactorAnnotation=df.transcriptionFactorAnnotation, 
                      df.geneGroups=df.geneGroups,
                      tb.geneGroups=tb.geneGroups,
                      v.geneGroups=v.geneGroups,
                      l.geneGroups=l.geneGroups, 
                      n.cpus = 3,
                      seed=1234,
                      importance.measure="impurity",
                      n.trees=1000,
                      n.lead_method_expression_shuffling = 1,
                      n.bootstrap=100,
                      n.stepsLARS=5,
                      th.lead_grn_method = 0.95,
                      th.support_grn_methods = 0.95,
                      n.grnSupport = 1,
                      file.TF_to_Motif_IDs = "data/TF_to_Motif_IDs.txt",
                      file.TFBS_motifs = "data/Transcription_factor_weight_matrix_Arabidopsis_thaliana.txt",
                      file.promoterSeq = "data/TAIR10_upstream_1000_20101104.txt",
                      file.geneSeq = "data/TAIR10_seq_20110103_representative_gene_model_updated.txt",
                      th.pre_tss = 1000,
                      th.post_tss = 200,
                      genome_nucleotide_distribution = c(A = 0.3253439, C = 0.1746561, G = 0.1746561, T = 0.3253439 ),
                      th.pval.known_motifs = 0.05,
                      th.diffexp = 0.05,
                      th.pval.treatment = 0.05, 
                      th.pval.tissue = 0.05,
                      th.min.samples = 1, 
                      s.multipleTestCorrection = "none",
                      th.min_number_targets = 2,
                      th.min_number_MR_targets = 2,
                      th.pval_masterRegulator = 0.05, 
                      foldername.tmp = "tmp/", 
                      foldername.results = "results/"){
  

  
  list.of.packages <- c("Matrix", "reshape2", "foreach", "doParallel", "pheatmap", "lars", "ggplot2", "igraph", "seqinr", "networkD3", "taRifx", "parmigene", "ranger")
  new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
  if(length(new.packages)){
    install.packages(new.packages)
  } 
  
  
  library(Matrix)
  library(reshape2)
  library(foreach)
  library(doParallel)
  library(pheatmap)
  library(lars)
  library(ggplot2)
  library(igraph)
  library(seqinr)
  library(networkD3)
  library(taRifx)
  library(parmigene)
  library(ranger)
  
  
  list.of.packages <- c("Biostrings", "TFBSTools", "motifmatchr")#,"seqLogo", "PWMEnrich", "BSgenome.Athaliana.TAIR.TAIR9")
  new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
  if(length(new.packages)){
    source("https://bioconductor.org/biocLite.R")
    biocLite(new.packages)
  } 
  
 
  if(!"Biostrings" %in% installed.packages()){
    stop("Error: Biostrings package could not be installed from Bioconductor")
  }
  require(Biostrings)
  
  if(!"TFBSTools" %in% installed.packages()){
    stop("Error: TFBSTools package could not be installed from Bioconductor")
  }
  require(TFBSTools)
  
  if(!"motifmatchr" %in% installed.packages()){
    stop("Error: motifmatchr package could not be installed from Bioconductor")
  }
  require(motifmatchr)
  

  if(!file.exists(foldername.tmp)){
    dir.create(foldername.tmp)
  }
  
  if(!file.exists(foldername.results)){
    dir.create(foldername.results)
  }
  
  message("--------------------------------------------------")
  message("Step 1 - Gene regulatory network inference using ensemble regression with Monte Carlo based threshold selection")
  
  if(b.load_grn_inference == "no"){
    
    l.res.grn = compute_ensemble_regression_with_montecarlo_based_stability_selection(m.foldChange_differentialExpression=m.foldChange_differentialExpression,
                                                                                      df.transcriptionFactorAnnotation=df.transcriptionFactorAnnotation,
                                                                                      df.geneGroups=df.geneGroups,
                                                                                      targetSet = "all",
                                                                                      seed=seed,
                                                                                      importance.measure=importance.measure,
                                                                                      n.trees=n.trees,
                                                                                      n.lead_method_expression_shuffling = n.lead_method_expression_shuffling,
                                                                                      n.bootstrap=n.bootstrap,
                                                                                      n.stepsLARS=n.stepsLARS,
                                                                                      n.cpus=n.cpus)
    
  }
  
  l.res.grn = load_lead_support_grn(df.transcriptionFactorAnnotation=df.transcriptionFactorAnnotation,
                                    df.geneGroups = df.geneGroups,
                                    m.foldChange_differentialExpression,
                                    targetSet = "all",
                                    th.lead_grn_method = th.lead_grn_method,
                                    n.lead_method_expression_shuffling = n.lead_method_expression_shuffling,
                                    th.support_grn_methods = th.support_grn_methods,
                                    n.grnSupport = n.grnSupport)


  message("")
  message("--------------------------------------------------")
  message("Step 2 - Transcription factor direct target promoter binding based filtering of gene regulatory link predictions")

  l.res.grn_tfbs = transcriptionFactorBindingInference(m.grn = l.res.grn$m.lead_support.grn ,
                                                       file.TF_to_Motif_IDs = file.TF_to_Motif_IDs,
                                                       file.TFBS_motifs = file.TFBS_motifs,
                                                       file.promoterSeq = file.promoterSeq,
                                                       file.geneSeq = file.geneSeq,
                                                       th.pre_tss = th.pre_tss,
                                                       th.post_tss = th.post_tss,
                                                       genome_nucleotide_distribution = genome_nucleotide_distribution,
                                                       th.pval.known_motifs=th.pval.known_motifs,
                                                       th.multipleHypothesisTest = "bonferroni",
                                                       b.load = b.load_TFBS_inference)

  message("")
  message("--------------------------------------------------")
  message("Step 3 - Context specific annotation and filtering of gene regulatory link predictions")

  l.res.link_annotation = annotate_links_with_treatments_and_tissues(m.lead_support_w_motif.grn=l.res.grn_tfbs$m.lead_support_w_motif.grn,
                                                                     m.pvalue_differentialExpression=m.pvalue_differentialExpression,
                                                                     df.experiment_condition_annotation=df.experiment_condition_annotation,
                                                                     tb.condition_treatments=tb.condition_treatments,
                                                                     tb.condition_tissues=tb.condition_tissues,
                                                                     th.diffexp = th.diffexp,
                                                                     th.pval.treatment = th.pval.treatment,
                                                                     th.pval.tissue = th.pval.tissue,
                                                                     th.min.samples = th.min.samples,
                                                                     s.multipleTestCorrection = "none",
                                                                     b.load = b.load_treatment_tissue_inference)
  
  
  
  message("")
  message("--------------------------------------------------")
  message("Step 4 - perform Master regulator hiearchy analysis")
  
  l.res.MR_hierarchy = do_master_regulator_hierarchy_inference(m.grn = l.res.link_annotation$m.grn,
                                                               m.pvalue_differentialExpression=m.pvalue_differentialExpression,
                                                               l.grn_subnetworks = l.res.link_annotation$l.grn_subnetworks, 
                                                               df.transcriptionFactorAnnotation = df.transcriptionFactorAnnotation,
                                                               df.geneGroups,
                                                               tb.geneGroups,
                                                               v.geneGroups,
                                                               l.geneGroups,
                                                               th.min_number_targets = th.min_number_targets,
                                                               th.min_number_MR_targets = th.min_number_MR_targets,
                                                               th.pval = th.pval_masterRegulator)
  
  

  
  message("--------------------------------------------------")
  return(list(l.res.grn = l.res.grn,
              l.res.grn_tfbs = l.res.grn_tfbs,
              l.res.link_annotation = l.res.link_annotation,
              l.res.MR_hierarchy = l.res.MR_hierarchy))
  
}                       










#### OBSOLETE ####
run_grn_stability_selection <- function(m.lead_suppport_w_motif.grn=m.lead_suppport_w_motif.grn, 
                                        m.foldChange_differentialExpression=m.foldChange_differentialExpression,
                                        m.pvalue_differentialExpression=m.pvalue_differentialExpression,
                                        df.experiment_condition_annotation=df.annotation,
                                        tb.condition_treatments=tb.condition_treatments,
                                        tb.condition_tissues=tb.condition_tissues,
                                        v.conditionGroups=v.conditionGroups, 
                                        v.tissueGroups=v.tissueGroups,
                                        v.th.diffexp.stabilitySelection = c(0.01, 0.05, 0.065) ,
                                        th.pval.treatment = 0.05, 
                                        th.pval.tissue = 0.05,
                                        th.min.samples = 1, 
                                        s.multipleTestCorrection = "none"
                                        
                                        ){
  
  #### LINK CONDITION STABILITY ANALYSIS - (variation on the differential expression thresholds)
  m.grn = m.lead_suppport_w_motif.grn
  m.grn[m.grn > 0] = 1
  m.de = m.pvalue_differentialExpression

  if (!file.exists("tmp/grn_stability_selection/")){
    dir.create("tmp/grn_stability_selection/")
  }
  
  strt<-Sys.time() 
  cl<-makeCluster(min(length(v.th.diffexp.stabilitySelection), n.cpus))
  registerDoParallel(cl)
  l.res <-  foreach(p = 1:length(v.th.diffexp.stabilitySelection)) %dopar% { 
    
    th.diffexp.p <- v.th.diffexp.stabilitySelection[p]
   
    subDir <- paste("tmp/grn_stability_selection/", th.diffexp.p, sep ="")
    
    if (!file.exists(subDir)){
      dir.create(subDir)
    }
    
    m.de.bin <- m.de
    m.de.bin[m.de.bin <= th.diffexp.p] <- 10
    m.de.bin[m.de.bin <= 1] <- 0
    m.de.bin[m.de.bin > 0] <- 1
    
    strt<-Sys.time()
    l.res <- perform_treatment_and_tissue_filtering(m.grn = m.grn, 
                                                    m.de.bin = m.de.bin, 
                                                    v.conditionGroups = v.conditionGroups, 
                                                    v.tissueGroups = v.tissueGroups,
                                                    df.annotation = df.annotation, 
                                                    tb.condition_treatments=tb.condition_treatments,
                                                    tb.condition_tissues=tb.condition_tissues,
                                                    th.pval.treatment = th.pval.treatment, 
                                                    th.pval.tissue = th.pval.tissue,
                                                    th.min.samples = th.min.samples, 
                                                    s.multipleTestCorrection = s.multipleTestCorrection)
    
    
    l.treatments_and_tissues <- l.res$l.treatments_and_tissues
    m.rf_w_treatments <- l.res$m.rf_w_treatments
    l.regulatoryNetwork_treatments_and_tissues <- l.res$l.regulatoryNetwork_treatments_and_tissues
    m.gn_condition_tissue_differentialExpression <- l.res$m.gn_condition_tissue_differentialExpression
    tb.condition_tissue_differentialExpression = l.res$tb.condition_tissue_differentialExpression
    l.grn_subnetworks <- l.res$l.grn_subnetworks
    
    filename <- paste(subDir,"/l.treatments_and_tissues.rds", sep ="")
    saveRDS(l.treatments_and_tissues, filename)
    
    filename <- paste(subDir,"/m.rf_w_treatments.rds", sep ="")
    saveRDS(m.rf_w_treatments, filename)
    
    filename <- paste(subDir,"/l.regulatoryNetwork_treatments_and_tissues.rds", sep ="")
    saveRDS(l.regulatoryNetwork_treatments_and_tissues, filename)
    
    filename <- paste(subDir,"/m.gn_condition_tissue_differentialExpression.rds", sep ="")
    saveRDS(m.gn_condition_tissue_differentialExpression, filename)
    
    filename <- paste(subDir,"/l.grn_subnetworks.rds", sep ="")
    saveRDS(l.grn_subnetworks, filename)
    
    filename <- paste(subDir,"/tb.condition_tissue_differentialExpression.rds", sep ="")
    saveRDS(tb.condition_tissue_differentialExpression, filename)
    
    print(Sys.time()-strt)
    
    rm(m.de.bin)
    rm(m.rf_w_treatments)
    rm(l.res)
    rm(l.treatments_and_tissues)
    rm(l.regulatoryNetwork_treatments_and_tissues)
    rm(m.gn_condition_tissue_differentialExpression)
    rm(tb.condition_tissue_differentialExpression)
    rm(l.grn_subnetworks)
    
    return(0)
  }
  stopCluster(cl)
  print(Sys.time()-strt)
  
}


generate_ensemble_grn_based_on_stability_selection <- function(df.transcriptionFactorAnnotation=df.transcriptionFactorAnnotation,
                                                               df.geneGroups=df.geneGroups,
                                                               v.th.diffexp.stabilitySelection = c(0.01, 0.05, 0.1) , 
                                                               th.diffexp_lead = 0.05, 
                                                               th.prob = 0.55){
  
  
  df.transcriptionFactorAnnotation = subset(df.transcriptionFactorAnnotation, df.transcriptionFactorAnnotation$with_geneExpression == "yes")
  df.geneGroups = subset(df.geneGroups, df.geneGroups$with_geneExpression == "yes")
  v.tfs = unique(df.transcriptionFactorAnnotation$TF_ID)
  v.genes =  unique(c(v.tfs, rownames(df.geneGroups)))
  
  if(!th.diffexp_lead %in% v.th.diffexp.stabilitySelection){
    stop("lead diff exp.  not in selection")
  }

  # lead 
  filename.parameter <- th.diffexp_lead
  subDir <- paste("tmp/grn_stability_selection/", filename.parameter, sep ="")
  
  filename <- paste(subDir,"/l.grn_subnetworks.rds", sep ="")
  l.grn_subnetworks = readRDS(filename)
  v.conditions.selected = names(l.grn_subnetworks)
  
  l.grn_subnetworks.stability_selection = vector(mode = "list", length = length(v.conditions.selected))
  names(l.grn_subnetworks.stability_selection) = v.conditions.selected
  for(c in 1:length(v.conditions.selected)){
    l.grn_subnetworks.stability_selection[[c]] = matrix(0, nrow = length(v.tfs), ncol = length(v.genes), dimnames = list(v.tfs, v.genes))
  }
  
  n.max.grn_subnetworks.stability_selection = numeric(length(v.conditions.selected))
  names(n.max.grn_subnetworks.stability_selection) = v.conditions.selected
  
  # support 
  pb <- txtProgressBar(min = 0, max = length(v.th.diffexp.stabilitySelection), style = 3)
  for(p in 1:length(v.th.diffexp.stabilitySelection)){
    setTxtProgressBar(pb, p)
    filename.parameter <- v.th.diffexp.stabilitySelection[p]
    subDir <- paste("tmp/grn_stability_selection/", filename.parameter, sep ="")
    filename <- paste(subDir,"/m.rf_w_treatments.rds", sep ="")
    m.rf_w_treatments = readRDS(filename)
    
    filename <- paste(subDir,"/l.grn_subnetworks.rds", sep ="")
    l.grn_subnetworks = readRDS(filename)
    
    for(c in 1:length(v.conditions.selected)){
      if(v.conditions.selected[c] %in% names(l.grn_subnetworks)){
        m.grn_subnetwork = l.grn_subnetworks[v.conditions.selected[c]][[1]]
        m.grn_subnetwork[m.grn_subnetwork > 0] = 1
        l.grn_subnetworks.stability_selection[[c]][rownames(m.grn_subnetwork), colnames(m.grn_subnetwork)] = l.grn_subnetworks.stability_selection[[c]][rownames(m.grn_subnetwork), colnames(m.grn_subnetwork)] + m.grn_subnetwork
        n.max.grn_subnetworks.stability_selection[c] = n.max.grn_subnetworks.stability_selection[c] + 1
      }
    }
  }
  close(pb)
  
  filename <- "tmp/grn_stability_selection/l.grn_subnetworks.stability_selection.rds"
  saveRDS(l.grn_subnetworks.stability_selection, filename)

  filename <- "tmp/grn_stability_selection/n.max.grn_subnetworks.stability_selection.rds"
  saveRDS(n.max.grn_subnetworks.stability_selection, filename)

  
  #### 
  
  
  filename.parameter <- th.diffexp_lead
  subDir <- paste("tmp/grn_stability_selection/", filename.parameter, sep ="")
  filename <- paste(subDir,"/m.rf_w_treatments.rds", sep ="")
  m.rf_w_treatments = readRDS(filename)
  filename <- paste(subDir,"/l.grn_subnetworks.rds", sep ="")
  l.grn_subnetworks = readRDS(filename)
  
  th.stabilitySelection <- round(length(v.th.diffexp.stabilitySelection) * th.prob, 0)
  for(c in 1:length(v.conditions.selected)){
    if(sum(l.grn_subnetworks.stability_selection[[c]]) > 0){
      l.grn_subnetworks.stability_selection[[c]][l.grn_subnetworks.stability_selection[[c]] < th.stabilitySelection] = 0
      l.grn_subnetworks.stability_selection[[c]][l.grn_subnetworks.stability_selection[[c]] >= th.stabilitySelection] = 1
      l.grn_subnetworks.stability_selection[[c]] = l.grn_subnetworks.stability_selection[[c]][rownames(l.grn_subnetworks[[c]]),] * l.grn_subnetworks[[c]]
    }
  }
  
  
  #filename <- paste(subDir.selection,"/m.rf_w_treatments.rds", sep ="")
  m.rf_w_treatments.stability_selection = matrix(0, 
                                                 nrow = dim(l.grn_subnetworks.stability_selection[[1]])[1],
                                                 ncol = dim(l.grn_subnetworks.stability_selection[[1]])[2],
                                                 dimnames = list(rownames(l.grn_subnetworks.stability_selection[[1]]), 
                                                                 colnames(l.grn_subnetworks.stability_selection[[1]])))
  
  #m.rf_w_treatments.stability_selection[m.rf_w_treatments.stability_selection > 0] = 1
  for(c in 1:length(v.conditions.selected)){
    if(sum(l.grn_subnetworks.stability_selection[[c]]) > 0){
      m.rf_w_treatments.stability_selection = m.rf_w_treatments.stability_selection + l.grn_subnetworks.stability_selection[[c]]
    }
  }
  
  m.rf_w_treatments.stability_selection = m.rf_w_treatments.stability_selection * m.rf_w_treatments
  m.number_of_conditions_per_link.stability_selection = m.rf_w_treatments.stability_selection
  m.rf_w_treatments.stability_selection[m.rf_w_treatments.stability_selection > 0] = 1
  
  if (!file.exists("results/condition_specific_networks/")){
    dir.create("results/condition_specific_networks/")
  }
  
  for(i in 1:length(l.grn_subnetworks.stability_selection)){
    m.grn <- l.grn_subnetworks.stability_selection[[i]]
    df.idx.grn <- which(m.grn> 0, arr.ind = TRUE)
    df.grn <- data.frame(TF = rownames(m.grn)[df.idx.grn[,1]], TG = colnames(m.grn)[df.idx.grn[,2]], stringsAsFactors = FALSE)
    filename <- paste("results/condition_specific_networks/", names(l.grn_subnetworks.stability_selection)[i], ".csv", sep ="")
    write.csv2(df.grn, filename, row.names = FALSE)
  }
  
  df.grn_subnetworks_stability_selection_statistics =  data.frame(conditions = names(l.grn_subnetworks), n_before = unlist(lapply(l.grn_subnetworks, sum)), 
                                                                  n_stability_selection = unlist(lapply(l.grn_subnetworks.stability_selection, sum)))
  
  
  return(list(m.grn.stability_selection=m.rf_w_treatments.stability_selection, 
              l.grn_subnetworks.stability_selection=l.grn_subnetworks.stability_selection, 
              m.number_of_conditions_per_link.stability_selection=m.number_of_conditions_per_link.stability_selection,
              df.grn_subnetworks_stability_selection_statistics = df.grn_subnetworks_stability_selection_statistics))
  
}




extract_TFBS_positions <- function(file.TF_to_Motif_IDs = "",
                                    file.TFBS_motifs = "",
                                    file.promoterSeq = "",
                                    file.geneSeq = "",
                                    th.pre_tss = 1000,
                                    th.post_tss = 200,
                                    th.min.score.motif = "80%",
                                    genome_nucleotide_distribution = c(A = 0.3253439, C = 0.1746561, G = 0.1746561, T = 0.3253439 ),
                                    th.pval.known_motifs = 0.05,
                                    n.cpus = 5, 
                                    b.load = "no"){

  
  th.min.score.motif = "80%"
  

    bg.genome = genome_nucleotide_distribution
 
    df.tfs_motifs <- read.table(file.TF_to_Motif_IDs, sep = "\t", header = T, stringsAsFactors = F)
    l.tfs_w_motifs = vector(mode = "list", length = nrow(df.tfs_motifs))
    names(l.tfs_w_motifs) = df.tfs_motifs$TF_ID
    for(i in 1:nrow(df.tfs_motifs)){
      l.tfs_w_motifs[[i]] = unlist(strsplit(df.tfs_motifs$Motif_ID[i], ","))
      if(length(l.tfs_w_motifs[[i]]) == 0){
        l.tfs_w_motifs[[i]] = NULL
      } 
    }
    
    v.tfs.w.open_motifs <- names(l.tfs_w_motifs)[(which(lapply(l.tfs_w_motifs, function(m) is.null(m)) == FALSE))]
    
    l.motifs_files <- read.fasta(file = file.TFBS_motifs, seqtype = "DNA",as.string = TRUE, set.attributes = FALSE)
    l.motifs <- vector(mode = "list", length = length(l.motifs_files))
    names(l.motifs) <- names(l.motifs_files)
    v.motif.ids <- names(l.motifs)
    for(m in 1:length(l.motifs_files)){
      
      motif <- l.motifs_files[[m]]
      motif <- unlist(strsplit(motif, "\t"))
      
      idx.c <- which(motif == "]c")
      idx.g <- which(motif == "]g")
      idx.t <- which(motif == "]t")
      
      v.a <- motif[2:(idx.c - 1)]
      v.c <- motif[(idx.c + 1):(idx.g - 1)]
      v.g <- motif[(idx.g + 1):(idx.t - 1)]
      v.t <- motif[(idx.t + 1):(length(motif) - 1)]
      
      v.a[1] <- gsub("\\[", "", v.a[1])
      v.c[1] <- gsub("\\[", "", v.c[1])
      v.g[1] <- gsub("\\[", "", v.g[1])
      v.t[1] <- gsub("\\[", "", v.t[1])
      
      m.motif <- matrix(0, nrow = 4, ncol = length(v.a), dimnames = list(c("A", "C", "G", "T"), seq(1:length(v.a))))
      m.motif[1,] <- as.numeric(v.a)
      m.motif[2,] <- as.numeric(v.c)
      m.motif[3,] <- as.numeric(v.g)
      m.motif[4,] <- as.numeric(v.t)
      
      l.motifs[[m]] <- m.motif
    }
    
    upstream_sequences <- read.fasta(file = file.promoterSeq, seqtype = "DNA",as.string = TRUE, set.attributes = FALSE)
    upstream_sequences <- lapply(upstream_sequences, function(m) {toupper(m)})
    df.promSequences <- DNAStringSet(unlist(upstream_sequences))
    
    gene_sequences <- read.fasta(file = file.geneSeq, seqtype = "DNA",as.string = TRUE, set.attributes = FALSE)
    gene_sequences <- lapply(gene_sequences, function(m) {toupper(m)})
    df.geneSequences <- DNAStringSet(unlist(gene_sequences))
    names(df.geneSequences) <- gsub("\\..*","", names(df.geneSequences)) # remove gene model 
    
    # df.idx.grn <- which(m.grn > 0, arr.ind = TRUE)
    # df.grn <- data.frame(TF = rownames(m.grn)[df.idx.grn[,1]], TG = colnames(m.grn)[df.idx.grn[,2]], stringsAsFactors = FALSE)
    # tfs.grn <- unique(df.grn$TF)
    # tgs.grn <- unique(df.grn$TG)
    # tfs.grn <- intersect(tfs.grn, v.tfs.w.open_motifs)
    
    tfs.grn = v.tfs.w.open_motifs
    tgs.grn = names(df.geneSequences)
    
    strt <- Sys.time()
    cl<-makeCluster(min(length(tfs.grn), n.cpus))
    registerDoParallel(cl)
    l.res <-  foreach(j = 1:length(tfs.grn), .packages = c("TFBSTools", "Biostrings")) %dopar% {
      
      v.pvals <- rep(1, length(tgs.grn))
      v.tgs <- rep(0,length(tgs.grn))
      names(v.pvals) <- names(v.tgs) <- tgs.grn
      
      tf.j <- tfs.grn[j]
      
      if(tf.j %in% names(l.tfs_w_motifs)){
        
        l.tfs_w_motifs.j <- l.tfs_w_motifs[tf.j]
        
        if(!is.null(unlist(l.tfs_w_motifs.j))){
          
          strt <- Sys.time()
          
          l.motifs.j = l.motifs[unlist(l.tfs_w_motifs.j)]
          Sequences <- df.promSequences
          
          l.pwms <- list()
          for(k in 1:length(l.motifss.j)){
            pwm.groundTruth <- l.motifs.j[[k]]
            pwm <- PWMatrix(ID = "", name = "", profileMatrix = pwm.groundTruth, bg = bg.genome)  
            l.pwms <- c(l.pwms, list(pwm))
          }
          pwmList <- do.call(PWMatrixList, l.pwms)
          
          sitesetList = searchSeq(pwmList, Sequences, min.score=th.min.score.motif, strand="*") #, mc.cores = n.cpus)
          
          print(Sys.time() - strt)
          
          strt <- Sys.time()
          l.pvalues_binding_per_sequence <- pvalues(sitesetList, type="TFMPvalue")
          print(Sys.time() - strt)
          
          
          pval.min.tgs <- unlist(lapply(l.pvalues_binding_per_sequence, function(m) min(unlist(m))))
          pval.min.tgs <- pval.min.tgs[pval.min.tgs != "Inf"]
          if(length(pval.min.tgs) > 0){
            tgs.pval_min <- unique(names(pval.min.tgs))
            pval.min <- numeric(length(tgs.pval_min))
            names(pval.min) <- tgs.pval_min
            for(k in 1:length(tgs.pval_min)){
              idx.k <- which(names(pval.min.tgs) == tgs.pval_min[k])
              pval.min[k] <- min(pval.min.tgs[idx.k])
            }
            pval.min.tgs <- pval.min
            tgs.bound <- names(pval.min.tgs) 
            v.pvals[tgs.bound] <- as.numeric(pval.min.tgs)
            v.tgs[tgs.bound] <- 1
          }
          
          Sequences <- df.geneSequences
          for(k in 1:length(Sequences)){
            if(length(Sequences[[k]]) > th.post_tss){
              Sequences[[k]] <-  subseq(Sequences[[k]], 1, th.post_tss)
            }
          }
          sitesetList = searchSeq(pwmList, Sequences, min.score=th.min.score.motif, strand="*") #, mc.cores = n.cpus)
          l.pvalues_binding_per_sequence <- pvalues(sitesetList, type="TFMPvalue")
          pval.min.tgs <- unlist(lapply(l.pvalues_binding_per_sequence, function(m) min(unlist(m))))
          pval.min.tgs <- pval.min.tgs[pval.min.tgs != "Inf"]
          if(length(pval.min.tgs) > 0){
            
            tgs.pval_min <- unique(names(pval.min.tgs))
            pval.min <- numeric(length(tgs.pval_min))
            names(pval.min) <- tgs.pval_min
            for(k in 1:length(tgs.pval_min)){
              idx.k <- which(names(pval.min.tgs) == tgs.pval_min[k])
              pval.min[k] <- min(pval.min.tgs[idx.k])
            }
            pval.min.tgs <- pval.min
            tgs.bound <- names(pval.min.tgs) #names(pval.min.tgs)[pval.min.tgs <= th.pval.known_motifs]
            
            v.pvals[tgs.bound] <- pmin(v.pvals[tgs.bound] , as.numeric(pval.min.tgs))
            v.tgs[tgs.bound] <- 1
            
          }
              
          print(Sys.time() - strt)
        }
      }
      
      l.tmp <- list(v.pvals=v.pvals, v.tgs = v.tgs)
      
      return(l.tmp)  
    }
    stopCluster(cl)
    print(Sys.time()-strt)
    
    #######
    
    m.motifNet.pval <- matrix(1, nrow = length(tfs.grn), ncol = length(tgs.grn), dimnames = list(tfs.grn, tgs.grn))
    m.motifNet.tgs <- matrix(0, nrow = length(tfs.grn), ncol = length(tgs.grn), dimnames = list(tfs.grn, tgs.grn))
    
    for(j in 1:length(tfs.grn)){
      m.motifNet.pval[tfs.grn[j],] <- l.res[[j]]$v.pvals
      m.motifNet.tgs[tfs.grn[j],] <- l.res[[j]]$v.tgs
    }
    
    saveRDS(m.motifNet.pval, "tmp/m.motifNet.pval.rds")
    saveRDS(m.motifNet.tgs, "tmp/m.motifNet.tgs.rds")
 
  
  m.motifNet = m.motifNet.pval
  m.motifNet[m.motifNet > th.pval.known_motifs] <- 10
  m.motifNet[m.motifNet <= th.pval.known_motifs] <- 1
  m.motifNet[m.motifNet == 10] <- 0
  
  tfs_w_motif_binding = intersect(rownames(m.motifNet), rownames(m.grn))
  tgs_w_motif_binding = intersect(colnames(m.motifNet), colnames(m.grn))
  m.lead_support_w_motif.grn <- m.motifNet[tfs_w_motif_binding, tgs_w_motif_binding] * m.grn[tfs_w_motif_binding, tgs_w_motif_binding]
  
  
  return(list(m.motifNet.pval=m.motifNet.pval, 
              m.motifNet.tgs=m.motifNet.tgs, 
              m.lead_support_w_motif.grn=m.lead_support_w_motif.grn))
}






# based on motifmatchr

extract_TFBS_positions <- function(file.TF_to_Motif_IDs = "",
                                   file.TFBS_motifs = "",
                                   file.promoterSeq = "",
                                   file.geneSeq = "",
                                   th.pre_tss = 1000,
                                   th.post_tss = 200,
                                   th.min.score.motif = "80%",
                                   genome_nucleotide_distribution = c(A = 0.3253439, C = 0.1746561, G = 0.1746561, T = 0.3253439 ),
                                   th.pval.known_motifs = 0.05,
                                   n.cpus = 5){
  
 
  
  file.TF_to_Motif_IDs = "data/TF_to_Motif_IDs.txt"
  file.TFBS_motifs = "data/Transcription_factor_weight_matrix_Arabidopsis_thaliana.txt"
  file.promoterSeq = "data/TAIR10_upstream_1000_20101104.txt"
  file.geneSeq = "data/TAIR10_seq_20110103_representative_gene_model_updated.txt"
  th.pre_tss = 1000
  th.post_tss = 200
  genome_nucleotide_distribution = c(A = 0.3253439, C = 0.1746561, G = 0.1746561, T = 0.3253439 )
  th.pval.known_motifs = 5e-05
  foldername.tmp = "tmp/"
  foldername.results = "results/"
  n.cpus = 5
  
  
  
  require(Biostrings)
  require(TFBSTools)
  #library(reshape2)
  #library(foreach)
  #library(doParallel)
  library(seqinr)
  
  library(motifmatchr)
  
  
  #BiocManager::install("seqinr")
  
  
  # if (!requireNamespace("BiocManager", quietly = TRUE))
  #   install.packages("BiocManager")
  # BiocManager::install("motifmatchr", version = "3.8")
  # 
  
  
  bg.genome = genome_nucleotide_distribution
  
  df.tfs_motifs <- read.table(file.TF_to_Motif_IDs, sep = "\t", header = T, stringsAsFactors = F)
  l.tfs_w_motifs = vector(mode = "list", length = nrow(df.tfs_motifs))
  names(l.tfs_w_motifs) = df.tfs_motifs$TF_ID
  for(i in 1:nrow(df.tfs_motifs)){
    l.tfs_w_motifs[[i]] = unlist(strsplit(df.tfs_motifs$Motif_ID[i], ","))
    if(length(l.tfs_w_motifs[[i]]) == 0){
      l.tfs_w_motifs[[i]] = NULL
    } 
  }
  
  v.tfs.w.open_motifs <- names(l.tfs_w_motifs)[(which(lapply(l.tfs_w_motifs, function(m) is.null(m)) == FALSE))]
  
  l.motifs_files <- read.fasta(file = file.TFBS_motifs, seqtype = "DNA",as.string = TRUE, set.attributes = FALSE)
  l.motifs <- vector(mode = "list", length = length(l.motifs_files))
  names(l.motifs) <- names(l.motifs_files)
  v.motif.ids <- names(l.motifs)
  for(m in 1:length(l.motifs_files)){
    
    motif <- l.motifs_files[[m]]
    motif <- unlist(strsplit(motif, "\t"))
    
    idx.c <- which(motif == "]c")
    idx.g <- which(motif == "]g")
    idx.t <- which(motif == "]t")
    
    v.a <- motif[2:(idx.c - 1)]
    v.c <- motif[(idx.c + 1):(idx.g - 1)]
    v.g <- motif[(idx.g + 1):(idx.t - 1)]
    v.t <- motif[(idx.t + 1):(length(motif) - 1)]
    
    v.a[1] <- gsub("\\[", "", v.a[1])
    v.c[1] <- gsub("\\[", "", v.c[1])
    v.g[1] <- gsub("\\[", "", v.g[1])
    v.t[1] <- gsub("\\[", "", v.t[1])
    
    m.motif <- matrix(0, nrow = 4, ncol = length(v.a), dimnames = list(c("A", "C", "G", "T"), seq(1:length(v.a))))
    m.motif[1,] <- as.numeric(v.a)
    m.motif[2,] <- as.numeric(v.c)
    m.motif[3,] <- as.numeric(v.g)
    m.motif[4,] <- as.numeric(v.t)
    
    m.motif = apply(m.motif, 2, function(a) { a / sum(a) })
    
    l.motifs[[m]] <- m.motif
  }
  
  
  # pre TSS
  upstream_sequences <- read.fasta(file = file.promoterSeq, seqtype = "DNA",as.string = TRUE, set.attributes = FALSE)
  upstream_sequences <- lapply(upstream_sequences, function(m) {toupper(m)})
  df.promSequences <- DNAStringSet(unlist(upstream_sequences))
  
  tfs.grn = v.tfs.w.open_motifs
  tgs.grn = names(df.promSequences)
  
  m.tfbs = matrix(0, nrow = length(tfs.grn), ncol = length(tgs.grn), dimnames = list(tfs.grn, tgs.grn))
  pb <- txtProgressBar(title = "Iterative training", min = 0, max = length(tfs.grn), style = 3)
  for(j in 1:length(tfs.grn)){
    
    setTxtProgressBar(pb, j)
    
    tf.j <- tfs.grn[j]
    
    if(tf.j %in% names(l.tfs_w_motifs)){
      
      l.tfs_w_motifs.j <- l.tfs_w_motifs[tf.j]
      
      if(!is.null(unlist(l.tfs_w_motifs.j))){
        
        # strt <- Sys.time()
        
        l.motifs.j = l.motifs[unlist(l.tfs_w_motifs.j)]
        Sequences <- df.promSequences
        
        l.pwms <- list()
        for(k in 1:length(l.motifs.j)){
          pwm.groundTruth <- l.motifs.j[[k]]
          pwm <- PWMatrix(ID = "", name = "", strand = "+", profileMatrix = pwm.groundTruth, bg = bg.genome)  
          l.pwms <- c(l.pwms, list(pwm))
        }
        pwmList <- do.call(PWMatrixList, l.pwms)
        
        # strt <- Sys.time()
        motif_ix_DNAStringSet <- matchMotifs(pwmList, df.geneSequences, p.cutoff = th.pval.known_motifs) #out = "scores") # , out = "positions")
        
        
        m.matches  = motifMatches(motif_ix_DNAStringSet)
        m.matches = as.matrix(m.matches)
        
        res = apply(m.matches, 1, function(m) { any(m == TRUE)})
        
        m.tfbs[tf.j, res] = 1
        
      }
      
    }
    
  }
  close(pb)
  m.tfbs.pre_tss = m.tfbs
  
  
  
  ### post TSS 
  
  gene_sequences <- read.fasta(file = file.geneSeq, seqtype = "DNA",as.string = TRUE, set.attributes = FALSE)
  gene_sequences <- lapply(gene_sequences, function(m) {toupper(m)})
  df.geneSequences <- DNAStringSet(unlist(gene_sequences))
  names(df.geneSequences) <- gsub("\\..*","", names(df.geneSequences)) # remove gene model 
  
  tfs.grn = v.tfs.w.open_motifs
  tgs.grn = names(df.geneSequences)
  
  m.tfbs = matrix(0, nrow = length(tfs.grn), ncol = length(tgs.grn), dimnames = list(tfs.grn, tgs.grn))
  pb <- txtProgressBar(title = "Iterative training", min = 0, max = length(tfs.grn), style = 3)
  for(j in 1:length(tfs.grn)){
    
    setTxtProgressBar(pb, j)
    
    tf.j <- tfs.grn[j]
    
    if(tf.j %in% names(l.tfs_w_motifs)){
      
      l.tfs_w_motifs.j <- l.tfs_w_motifs[tf.j]
      
      if(!is.null(unlist(l.tfs_w_motifs.j))){
        
        # strt <- Sys.time()
        
        l.motifs.j = l.motifs[unlist(l.tfs_w_motifs.j)]
        Sequences <- df.promSequences
        
        l.pwms <- list()
        for(k in 1:length(l.motifs.j)){
          pwm.groundTruth <- l.motifs.j[[k]]
          pwm <- PWMatrix(ID = "", name = "", strand = "+", profileMatrix = pwm.groundTruth, bg = bg.genome)  
          l.pwms <- c(l.pwms, list(pwm))
        }
        pwmList <- do.call(PWMatrixList, l.pwms)
        
        # strt <- Sys.time()
        motif_ix_DNAStringSet <- matchMotifs(pwmList, df.geneSequences, p.cutoff = th.pval.known_motifs) #out = "scores") # , out = "positions")
        
        
        m.matches  = motifMatches(motif_ix_DNAStringSet)
        m.matches = as.matrix(m.matches)
        
        res = apply(m.matches, 1, function(m) { any(m == TRUE)})
        
        m.tfbs[tf.j, res] = 1
        
      }
      
    }
    
  }
  close(pb)
  m.tfbs.post_tss = m.tfbs
  
  
  
  
  
  
  
  m.tfbs.post_tss
  m.tfbs.pre_tss
  m.tfbs
  
  
  m.motifNet.pval <- matrix(1, nrow = length(tfs.grn), ncol = length(tgs.grn), dimnames = list(tfs.grn, tgs.grn))
  m.motifNet.tgs <- matrix(0, nrow = length(tfs.grn), ncol = length(tgs.grn), dimnames = list(tfs.grn, tgs.grn))
  
  for(j in 1:length(tfs.grn)){
    m.motifNet.pval[tfs.grn[j],] <- l.res[[j]]$v.pvals
    m.motifNet.tgs[tfs.grn[j],] <- l.res[[j]]$v.tgs
  }
  
  saveRDS(m.motifNet.pval, "tmp/m.motifNet.pval.rds")
  saveRDS(m.motifNet.tgs, "tmp/m.motifNet.tgs.rds")
  
  
  m.motifNet = m.motifNet.pval
  m.motifNet[m.motifNet > th.pval.known_motifs] <- 10
  m.motifNet[m.motifNet <= th.pval.known_motifs] <- 1
  m.motifNet[m.motifNet == 10] <- 0
  
  tfs_w_motif_binding = intersect(rownames(m.motifNet), rownames(m.grn))
  tgs_w_motif_binding = intersect(colnames(m.motifNet), colnames(m.grn))
  m.lead_support_w_motif.grn <- m.motifNet[tfs_w_motif_binding, tgs_w_motif_binding] * m.grn[tfs_w_motif_binding, tgs_w_motif_binding]
  
  
  return(list(m.motifNet.pval=m.motifNet.pval, 
              m.motifNet.tgs=m.motifNet.tgs, 
              m.lead_support_w_motif.grn=m.lead_support_w_motif.grn))
  
  
  
  
  
  
}
mbanf/MERIT documentation built on June 16, 2021, 1:07 p.m.