R/runSCENIC_2_createRegulons.R

Defines functions getDbTfs getDbAnnotations runSCENIC_2_createRegulons

Documented in getDbAnnotations getDbTfs runSCENIC_2_createRegulons

################################################################################
# Step 2. Identifying regulons (direct TF targets) based on DNA motif enrichment
################################################################################


#' @title runSCENIC_2_createRegulons
#' @description Step 2: RcisTarget (prune co-expression modules using TF-motif enrichment analysis)
#' @param scenicOptions Fields used: TODO
#' @param minGenes Minimum size of co-expression gene set (default: 20 genes)
#' @param signifGenesMethod Method for 'addSignificantGenes'
#' @param coexMethods Allows to select the method(s) used to generate the co-expression modules
#' @param minJakkardInd Merge overlapping modules (with Jakkard index >=minJakkardInd; reduces running time).
#' @param onlyPositiveCorr Whether to include only positive-correlated targets in the regulons (default: TRUE).
#' @param dbIndexCol Column containing the feature name in the database (default: 'features')
#' @return The output is written in the folders 'int' and 'ouput'
#' @details See the detailed vignette explaining the internal steps.
#' @examples 
#' scenicOptions <- readRDS("int/scenicOptions.Rds")
#' # In case any settings need to be modified:
#' scenicOptions@settings$nCores <- 20
#' scenicOptions@inputDatasetInfo$org <- "mgi" 
#' 
#' runSCENIC_2_createRegulons(scenicOptions)
#' @export
runSCENIC_2_createRegulons <- function(scenicOptions, 
                                       minGenes=20, 
                                       coexMethods=NULL, 
                                       minJakkardInd=0.8,
                                       signifGenesMethod="aprox", 
                                       onlyPositiveCorr=TRUE,
                                       onlyBestGsPerMotif=TRUE,
                                       dbIndexCol='features'
                                       )
{
  nCores <- getSettings(scenicOptions, "nCores")
  
  tfModules_asDF <- tryCatch(loadInt(scenicOptions, "tfModules_asDF"),
                       error = function(e) {
                         if(getStatus(scenicOptions, asID=TRUE) < 2) 
                           e$message <- paste0("It seems the co-expression modules have not been built yet. Please, run runSCENIC_1_coexNetwork2modules() first.\n", 
                                               e$message)
                         stop(e)
                       })
  if(!is.null(coexMethods)) tfModules_asDF <- tfModules_asDF[which(tfModules_asDF$method %in% coexMethods),]
  if(!is.null(minJakkardInd)) tfModules_asDF <- mergeOverlappingModules(tfModules_asDF, minJakkardInd=minJakkardInd) # New
  if(nrow(tfModules_asDF)==0) stop("The co-expression modules are empty.")
  
  # Set cores for RcisTarget::addMotifAnnotation(). The other functions use foreach package.
  if("BiocParallel" %in% installed.packages() && (nCores>1)) {
    library(BiocParallel)
    register(MulticoreParam(nCores), default=TRUE) 
  }
  
  msg <- paste0(format(Sys.time(), "%H:%M"), "\tStep 2. Identifying regulons")
  if(getSettings(scenicOptions, "verbose")) message(msg)

  ### Check load DBs
  library(AUCell)
  library(RcisTarget)
  motifAnnot <- getDbAnnotations(scenicOptions)
  
  if(is.null(names(getSettings(scenicOptions, "dbs")))) 
  {
    names(scenicOptions@settings$"dbs") <- scenicOptions@settings$"dbs"
    tmp <- sapply(strsplit(getSettings(scenicOptions, "dbs"),"-", fixed=T), function(x) x[grep("bp|kb",x)])
    if(all(lengths(tmp)>0)) names(scenicOptions@settings$"dbs") <- tmp
  }
  
  loadAttempt <- sapply(getDatabases(scenicOptions), dbLoadingAttempt, indexCol=dbIndexCol)
  if(any(!loadAttempt)) stop("It is not possible to load the following databses: \n",
                                paste(dbs[which(!loadAttempt)], collapse="\n"))
  
  genesInDb <- unique(unlist(lapply(getDatabases(scenicOptions), function(dbFilePath) {
	rf <- arrow::ReadableFile$create(dbFilePath)
	fr <- arrow::FeatherReader$create(rf)
	genesInDb <- names(fr)
	rnktype <- "features"        #TODO: add as option for custom dbs
	genesInDb <- genesInDb[genesInDb != rnktype]
	})))
  
  ## Check if annotation and rankings (potentially) match:
  featuresWithAnnot <- checkAnnots(scenicOptions, motifAnnot)
  if(any(featuresWithAnnot == 0)) warning("Missing annotations\n", names(which(rankingsInDb==0)))
  
  ### Filter & format co-expression modules
  # Remove genes missing from RcisTarget databases
  #  (In case the input matrix wasn't already filtered)
  tfModules_asDF$TF <- as.character(tfModules_asDF$TF)
  tfModules_asDF$Target <- as.character(tfModules_asDF$Target)
  allTFs <- getDbTfs(scenicOptions)
  tfModules_asDF <- tfModules_asDF[which(tfModules_asDF$TF %in% allTFs),]
  geneInDb <- tfModules_asDF$Target %in% genesInDb
      missingGene <- sort(unique(tfModules_asDF[which(!geneInDb),"Target"]))
      if(length(missingGene)>0) 
        warning(paste0("Genes in co-expression modules not available in RcisTargetDatabases: ", 
                                               paste(missingGene, collapse=", ")))
  tfModules_asDF <- tfModules_asDF[which(geneInDb),]

  ######
  # Targets with positive correlation
  if(all(is.na(tfModules_asDF$corr)))
  {
    warning("no correlation info available") 
    tfModules_Selected <- tfModules_asDF
    tfModules_Selected$geneSetName <- paste(tfModules_Selected$TF, tfModules_Selected$method, sep="_")
  }else{
    tfModules_Selected <- tfModules_asDF[which(tfModules_asDF$corr==1),]
    tfModules_Selected$geneSetName <- paste(tfModules_Selected$TF, tfModules_Selected$method, sep="_")
    
    if(!onlyPositiveCorr)
    {
      tfModules_IgnCorr <- tfModules_asDF[which(tfModules_asDF$corr!=1),]
      tfModules_IgnCorr$geneSetName <- paste0(tfModules_IgnCorr$TF,"_", tfModules_IgnCorr$method)
      
      # Include positive corrs for these geneSets: 
      # gplots::venn(list(pos=unique(tfModules_Selected$geneSetName), ign=unique(tfModules_IgnCorr$geneSetName)))
      posCorr <- tfModules_Selected[which(tfModules_Selected$geneSetName %in% unique(tfModules_IgnCorr$geneSetName)),]
      
      tfModules_IgnCorr <- rbind(tfModules_IgnCorr, posCorr)
      tfModules_IgnCorr$geneSetName <- paste0(tfModules_IgnCorr$geneSetName, "IgnCorr")
      
      tfModules_Selected <- rbind(tfModules_Selected, tfModules_IgnCorr)
    }
  }
  
  tfModules_Selected$geneSetName <- factor(as.character(tfModules_Selected$geneSetName))
  # head(tfModules_Selected)
  allGenes <- unique(tfModules_Selected$Target)

  
  #####
  # Split into tfModules (TF-modules, with several methods)
  tfModules <- split(tfModules_Selected$Target, tfModules_Selected$geneSetName)

  # Add TF to the gene set (used in the following steps, careful if editing)
  tfModules <- setNames(lapply(names(tfModules), function(gsn) {
    tf <- strsplit(gsn, "_")[[1]][1]
    unique(c(tf, tfModules[[gsn]]))
  }), names(tfModules))
  
  # Keep gene sets with at least 'minGenes' genes
  tfModules <- tfModules[which(lengths(tfModules)>=minGenes)]
  saveRDS(tfModules, file=getIntName(scenicOptions, "tfModules_forEnrichment")) #TODO as geneset? & previous step?

  if(getSettings(scenicOptions, "verbose")) {
      tfModulesSummary <- t(sapply(strsplit(names(tfModules), "_"), function(x) x[1:2]))
      message("tfModulesSummary:")
      print(cbind(sort(table(tfModulesSummary[,2]))))
  }

  ################################################################
  ### 1. Calculate motif enrichment for each TF-module (Run RcisTarget)

  ### 1.1 Calculate enrichment
  msg <- paste0(format(Sys.time(), "%H:%M"), "\tRcisTarget: Calculating AUC")
  if(getSettings(scenicOptions, "verbose")) message(msg)

  motifs_AUC <- lapply(getDatabases(scenicOptions), function(rnkName) {
    ranking <- importRankings(rnkName, columns=allGenes)
    message("Scoring database: ", ranking@description)
    RcisTarget::calcAUC(tfModules, ranking, aucMaxRank=0.03*getNumColsInDB(ranking), nCores=nCores, verbose=FALSE)})
  saveRDS(motifs_AUC, file=getIntName(scenicOptions, "motifs_AUC"))
  
  ### 1.2 Convert to table, filter by NES & add the TFs to which the motif is annotated
  # (For each database...)
  msg <- paste0(format(Sys.time(), "%H:%M"), "\tRcisTarget: Adding motif annotation")
  message(msg)
  motifEnrichment <- lapply(motifs_AUC, function(aucOutput)
  {
    # Extract the TF of the gene-set name (i.e. MITF_w001):
    tf <- sapply(setNames(strsplit(rownames(aucOutput), "_"), rownames(aucOutput)), function(x) x[[1]])
    
    # Calculate NES and add motif annotation (provide tf in 'highlightTFs'):
    addMotifAnnotation(aucOutput, 
                       nesThreshold=3, digits=3, 
                       motifAnnot=motifAnnot,
                       motifAnnot_highConfCat=c("directAnnotation", "inferredBy_Orthology"),
                       motifAnnot_lowConfCat=c("inferredBy_MotifSimilarity",
                                                 "inferredBy_MotifSimilarity_n_Orthology"), 
                       highlightTFs=tf)
  })

  # Merge both tables, adding a column that contains the 'motifDb'
  motifEnrichment <- do.call(rbind, lapply(names(motifEnrichment), function(dbName){
    cbind(motifDb=dbName, motifEnrichment[[dbName]])
  }))
  saveRDS(motifEnrichment, file=getIntName(scenicOptions, "motifEnrichment_full"))
  msg <- paste0("Number of motifs in the initial enrichment: ", nrow(motifEnrichment))
  if(getSettings(scenicOptions, "verbose")) message(msg)

  ### 1.3 Keep only the motifs annotated to the initial TF
  motifEnrichment_selfMotifs <- motifEnrichment[which(motifEnrichment$TFinDB != ""),, drop=FALSE]
  msg <- paste0("Number of motifs annotated to the matching TF: ", nrow(motifEnrichment_selfMotifs))
  if(getSettings(scenicOptions, "verbose")) message(msg)
  rm(motifEnrichment)

  if(nrow(motifEnrichment_selfMotifs)==0) 
    stop("None of the co-expression modules present enrichment of the TF motif: There are no regulons.")
  
  ####
  if(onlyBestGsPerMotif)
  {
    met_byDb <- split(motifEnrichment_selfMotifs, motifEnrichment_selfMotifs$motifDb)
    for(db in names(met_byDb))
    {
      met <- met_byDb[[db]]
      met <- split(met, factor(met$highlightedTFs))
      met <- lapply(met, function(x){
        rbindlist(lapply(split(x, x$motif), function(y) y[which.max(y$NES),]))
      })
      # sapply(met, nrow)
      met_byDb[[db]] <- rbindlist(met)
    }
    motifEnrichment_selfMotifs <- rbindlist(met_byDb)
    rm(met_byDb); rm(met)
  }
  ####
  
  
  ################################################################
  # 2. Prune targets
  msg <- paste0(format(Sys.time(), "%H:%M"), "\tRcisTarget: Pruning targets")
  if(getSettings(scenicOptions, "verbose")) message(msg)
    
  dbNames <- getDatabases(scenicOptions)
  motifEnrichment_selfMotifs_wGenes <- lapply(names(dbNames), function(motifDbName){
    ranking <- importRankings(dbNames[motifDbName], columns=allGenes)
    addSignificantGenes(resultsTable=motifEnrichment_selfMotifs[motifEnrichment_selfMotifs$motifDb==motifDbName,],
                        geneSets=tfModules,
                        rankings=ranking, 
                        plotCurve = FALSE,
                        maxRank=5000, 
                        method=signifGenesMethod, 
                        nMean=100,
                        nCores=nCores)
  })
  
  suppressPackageStartupMessages(library(data.table))
  motifEnrichment_selfMotifs_wGenes <- rbindlist(motifEnrichment_selfMotifs_wGenes)
  saveRDS(motifEnrichment_selfMotifs_wGenes, file=getIntName(scenicOptions, "motifEnrichment_selfMotifs_wGenes"))
  
  if(getSettings(scenicOptions, "verbose")) 
  {
    # TODO messages/print
    message(format(Sys.time(), "%H:%M"), "\tNumber of motifs that support the regulons: ", nrow(motifEnrichment_selfMotifs_wGenes))
    motifEnrichment_selfMotifs_wGenes[order(motifEnrichment_selfMotifs_wGenes$NES,decreasing=TRUE),][1:5,(1:ncol(motifEnrichment_selfMotifs_wGenes)-1), with=F] 
  }
  
  # Save as text:
  if(!file.exists("output")) dir.create("output") 
  write.table(motifEnrichment_selfMotifs_wGenes, file=getOutName(scenicOptions, "s2_motifEnrichment"),
              sep="\t", quote=FALSE, row.names=FALSE)
  if("DT" %in% installed.packages() && nrow(motifEnrichment_selfMotifs_wGenes)>0)
  {
    nvm <- tryCatch({
      colsToShow <- c("motifDb", "logo", "NES", "geneSet", "TF_highConf", "TF_lowConf")
      motifEnrichment_2html <- viewMotifs(motifEnrichment_selfMotifs_wGenes, colsToShow=colsToShow, options=list(pageLength=100))
      
      fileName <- getOutName(scenicOptions, "s2_motifEnrichmentHtml")
      
      dirName <- dirname(fileName)
      fileName <- basename(fileName)
      suppressWarnings(DT::saveWidget(motifEnrichment_2html, fileName))
      file.rename(fileName, file.path(dirName, fileName))
      if(getSettings(scenicOptions, "verbose")) message("\tPreview of motif enrichment saved as: ", file.path(dirName, fileName))
    }, error = function(e) print(e$message))
  }

  ################################################################
  # Format regulons & save
  motifEnrichment.asIncidList <- apply(motifEnrichment_selfMotifs_wGenes, 1, function(oneMotifRow) {
    genes <- strsplit(oneMotifRow["enrichedGenes"], ";")[[1]]
    oneMotifRow <- data.frame(rbind(oneMotifRow), stringsAsFactors=FALSE)
    data.frame(oneMotifRow[rep(1, length(genes)),c("NES", "motif", "highlightedTFs", "TFinDB", "geneSet", "motifDb")], genes, stringsAsFactors = FALSE)
  })
  motifEnrichment.asIncidList <- rbindlist(motifEnrichment.asIncidList)
  # colnames(motifEnrichment.asIncidList) <- c("NES", "motif", "TF", "annot", "gene", "motifDb", "geneSet")
  colnames(motifEnrichment.asIncidList)[which(colnames(motifEnrichment.asIncidList)=="highlightedTFs")] <- "TF"
  colnames(motifEnrichment.asIncidList)[which(colnames(motifEnrichment.asIncidList)=="TFinDB")] <- "annot"
  colnames(motifEnrichment.asIncidList)[which(colnames(motifEnrichment.asIncidList)=="genes")] <- "gene"
  
  motifEnrichment.asIncidList <- data.frame(motifEnrichment.asIncidList, stringsAsFactors = FALSE)

  # Get targets for each TF, but keep info about best motif/enrichment
  # (directly annotated motifs are considered better)
  regulonTargetsInfo <- lapply(split(motifEnrichment.asIncidList, motifEnrichment.asIncidList$TF), function(tfTargets){
    # print(unique(tfTargets$TF))
    tfTable <- as.data.frame(do.call(rbind, lapply(split(tfTargets, tfTargets$gene), function(enrOneGene){
      highConfAnnot <- "**" %in% enrOneGene$annot
      enrOneGeneByAnnot <- enrOneGene
      if(highConfAnnot) enrOneGeneByAnnot <- enrOneGeneByAnnot[which(enrOneGene$annot == "**"),]
      bestMotif <- which.max(enrOneGeneByAnnot$NES)

      tf <- unique(enrOneGene$TF)
      cbind(TF=tf, 
            gene=unique(enrOneGene$gene), 
            highConfAnnot=highConfAnnot,
            nMotifs=nrow(enrOneGene),
            bestMotif=as.character(enrOneGeneByAnnot[bestMotif,"motif"]), NES=as.numeric(enrOneGeneByAnnot[bestMotif,"NES"]), 
              motifDb=as.character(enrOneGeneByAnnot[bestMotif,"motifDb"]), coexModule=gsub(paste0(tf,"_"), "", as.character(enrOneGeneByAnnot[bestMotif,"geneSet"]), fixed=TRUE)
            )
    })), stringsAsFactors=FALSE)
    tfTable[order(tfTable$NES, decreasing = TRUE),]
  })
  rm(motifEnrichment.asIncidList)
  regulonTargetsInfo <- rbindlist(regulonTargetsInfo)

  
  # Optional: Add correlation
  corrMat <- loadInt(scenicOptions, "corrMat", ifNotExists="null")
  if(!is.null(corrMat))
  {
    regulonTargetsInfo$spearCor <- NA_real_
    for(tf in unique(regulonTargetsInfo$TF))
    {
      regulonTargetsInfo[which(regulonTargetsInfo$TF==tf),"spearCor"] <- corrMat[tf, unlist(regulonTargetsInfo[which(regulonTargetsInfo$TF==tf),"gene"])]
    }
  }else warning("It was not possible to add the correlation to the regulonTargetsInfo table.")
  
  
  # Optional: Add Genie3 score
  linkList <- loadInt(scenicOptions, "genie3ll", ifNotExists="null")
  if(!is.null(linkList) & ("weight" %in% colnames(linkList)))
  {
    if(is.data.table(linkList)) linkList <- as.data.frame(linkList)
    
    uniquePairs <- nrow(unique(linkList[,c("TF", "Target")]))
    if(uniquePairs == nrow(linkList)) {
      linkList <- linkList[which(linkList$weight>=getSettings(scenicOptions, "modules/weightThreshold")),]  # TODO: Will not work with GRNBOOST!
      rownames(linkList) <- paste(linkList$TF, linkList$Target,sep="__")
      regulonTargetsInfo <- cbind(regulonTargetsInfo, CoexWeight=linkList[paste(regulonTargetsInfo$TF, regulonTargetsInfo$gene,sep="__"),"weight"])
    }else {
      warning("There are duplicated regulator-target (gene id/name) pairs in the co-expression link list.",
              "\nThe co-expression weight was not added to the regulonTargetsInfo table.")
    }
  }else warning("It was not possible to add the weight to the regulonTargetsInfo table.")

  saveRDS(regulonTargetsInfo, file=getIntName(scenicOptions, "regulonTargetsInfo"))
  
  write.table(regulonTargetsInfo, file=getOutName(scenicOptions, "s2_regulonTargetsInfo"),
              sep="\t", col.names=TRUE, row.names=FALSE, quote=FALSE)
  rm(linkList)

  # Split into regulons... (output: list TF --> targets)
  regulonTargetsInfo_splitByAnnot <- split(regulonTargetsInfo, regulonTargetsInfo$highConfAnnot)
  regulons <- NULL
  if(!is.null(regulonTargetsInfo_splitByAnnot[["TRUE"]]))
  {
    regulons <- lapply(split(regulonTargetsInfo_splitByAnnot[["TRUE"]], regulonTargetsInfo_splitByAnnot[["TRUE"]][,"TF"]), function(x) sort(as.character(unlist(x[,"gene"]))))
  }
  regulons_extended <- NULL
  if(!is.null(regulonTargetsInfo_splitByAnnot[["FALSE"]]))
  {
    regulons_extended <- lapply(split(regulonTargetsInfo_splitByAnnot[["FALSE"]],regulonTargetsInfo_splitByAnnot[["FALSE"]][,"TF"]), function(x) unname(unlist(x[,"gene"])))
    regulons_extended <- setNames(lapply(names(regulons_extended), function(tf) sort(unique(c(regulons[[tf]], unlist(regulons_extended[[tf]]))))), names(regulons_extended))
    names(regulons_extended) <- paste(names(regulons_extended), "_extended", sep="")
  }
  regulons <- c(regulons, regulons_extended)
  saveRDS(regulons, file=getIntName(scenicOptions, "regulons"))
  
  # Save as incidence matrix (i.e. network)
  incidList <- reshape2::melt(regulons)
  incidMat <- table(incidList[,2], incidList[,1])
  saveRDS(incidMat, file=getIntName(scenicOptions, "regulons_incidMat"))
  rm(incidMat)
  #TODO NMF::aheatmap(incidMat)
  
  if(getSettings(scenicOptions, "verbose")) 
  {
    # Number of regulons and summary of sizes:
    length(regulons) # TODO
    summary(lengths(regulons))
  }
  
  # Finished. Update status.
  scenicOptions@status$current <- 2
  invisible(scenicOptions)
}

#' @title getDbAnnotations
#' @description Loads the motif annotation
#' @param scenicOptions Fields used: 
#' If scenicOptions@settings$db_annotFiles is set, it will load these files.
#' Otherwise, will load the default RcisTarget annotations based on 'scenicOptions@inputDatasetInfo$org', and 'scenicOptions@settings$db_mcVersion'
#' @return The motif annotations
#' @examples 
#' getDbAnnotations(scenicOptions)
#' @export 
getDbAnnotations <- function(scenicOptions)
{
  dbAnnotFiles <- scenicOptions@settings$db_annotFiles
  if(!is.null(dbAnnotFiles))
  {
    motifAnnotations <- NULL
    for(annotPath in dbAnnotFiles)
    {
      motifAnnot <- data.table::fread(annotPath) #; head(motifAnnot)
      motifAnnot$annotationSource <- factor(motifAnnot$annotationSource)
      colnames(motifAnnot)[1]<- "motif" # TEMP for now...
      levels(motifAnnot$annotationSource) <- c(levels(motifAnnot$annotationSource), c("directAnnotation","inferredBy_Orthology","inferredBy_MotifSimilarity","inferredBy_MotifSimilarity_n_Orthology")) # TEMP for now...
      motifAnnotations <- rbind(motifAnnotations, motifAnnot)
    }
  } else { # Default RcisTarget annotations
    if(is.na(getDatasetInfo(scenicOptions, "org"))) stop('Please provide an organism (scenicOptions@inputDatasetInfo$org).')
    org <- getDatasetInfo(scenicOptions, "org")
    if(is.na(org)) stop("Please provide an organism (scenicOptions@inputDatasetInfo$org).")
    if(!org %in% c("hgnc", "mgi", "dmel")) stop("Organism not recognized (scenicOptions@inputDatasetInfo$org).")
    
    if(org=="hgnc") motifAnnotName <- "motifAnnotations_hgnc"
    if(org=="mgi") motifAnnotName <- "motifAnnotations_mgi"
    if(org=="dmel") motifAnnotName <- "motifAnnotations_dmel"
    
    if(!is.null(scenicOptions@settings$db_mcVersion))
    {
      if(scenicOptions@settings$db_mcVersion=="v8") motifAnnotName <- paste0(motifAnnotName, "_v8")
    }
    
    library(RcisTarget) # Lazyload
    data(list=motifAnnotName, package="RcisTarget", verbose = FALSE)
    motifAnnotations <- eval(as.name(motifAnnotName))
  }
  
  return(motifAnnotations)
}

#' @title getDbTfs
#' @description Provides the list of transcription factors in the RcisTarget databases for the given organism
#' @param scenicOptions Fields used: 'scenicOptions@inputDatasetInfo$org'
#' @return List of transcription factors in the databases.
#' @examples 
#' getDbTfs(scenicOptions)
#' @export 
getDbTfs <- function(scenicOptions)
{
  motifAnnotations <- getDbAnnotations(scenicOptions)
  
  allTFs <- sort(unique(motifAnnotations$TF))
  return(allTFs)
}
aertslab/SCENIC documentation built on April 7, 2024, 10 a.m.