################################################################################
# Step 2. Identifying regulons (direct TF targets) based on DNA motif enrichment
################################################################################
#' @title runSCENIC_2_createRegulons
#' @description Step 2: RcisTarget (prune co-expression modules using TF-motif enrichment analysis)
#' @param scenicOptions Fields used: TODO
#' @param minGenes Minimum size of co-expression gene set (default: 20 genes)
#' @param signifGenesMethod Method for 'addSignificantGenes'
#' @param coexMethods Allows to select the method(s) used to generate the co-expression modules
#' @param minJakkardInd Merge overlapping modules (with Jakkard index >=minJakkardInd; reduces running time).
#' @param onlyPositiveCorr Whether to include only positive-correlated targets in the regulons (default: TRUE).
#' @param dbIndexCol Column containing the feature name in the database (default: 'features')
#' @return The output is written in the folders 'int' and 'ouput'
#' @details See the detailed vignette explaining the internal steps.
#' @examples
#' scenicOptions <- readRDS("int/scenicOptions.Rds")
#' # In case any settings need to be modified:
#' scenicOptions@settings$nCores <- 20
#' scenicOptions@inputDatasetInfo$org <- "mgi"
#'
#' runSCENIC_2_createRegulons(scenicOptions)
#' @export
runSCENIC_2_createRegulons <- function(scenicOptions,
minGenes=20,
coexMethods=NULL,
minJakkardInd=0.8,
signifGenesMethod="aprox",
onlyPositiveCorr=TRUE,
onlyBestGsPerMotif=TRUE,
dbIndexCol='features'
)
{
nCores <- getSettings(scenicOptions, "nCores")
tfModules_asDF <- tryCatch(loadInt(scenicOptions, "tfModules_asDF"),
error = function(e) {
if(getStatus(scenicOptions, asID=TRUE) < 2)
e$message <- paste0("It seems the co-expression modules have not been built yet. Please, run runSCENIC_1_coexNetwork2modules() first.\n",
e$message)
stop(e)
})
if(!is.null(coexMethods)) tfModules_asDF <- tfModules_asDF[which(tfModules_asDF$method %in% coexMethods),]
if(!is.null(minJakkardInd)) tfModules_asDF <- mergeOverlappingModules(tfModules_asDF, minJakkardInd=minJakkardInd) # New
if(nrow(tfModules_asDF)==0) stop("The co-expression modules are empty.")
# Set cores for RcisTarget::addMotifAnnotation(). The other functions use foreach package.
if("BiocParallel" %in% installed.packages() && (nCores>1)) {
library(BiocParallel)
register(MulticoreParam(nCores), default=TRUE)
}
msg <- paste0(format(Sys.time(), "%H:%M"), "\tStep 2. Identifying regulons")
if(getSettings(scenicOptions, "verbose")) message(msg)
### Check load DBs
library(AUCell)
library(RcisTarget)
motifAnnot <- getDbAnnotations(scenicOptions)
if(is.null(names(getSettings(scenicOptions, "dbs"))))
{
names(scenicOptions@settings$"dbs") <- scenicOptions@settings$"dbs"
tmp <- sapply(strsplit(getSettings(scenicOptions, "dbs"),"-", fixed=T), function(x) x[grep("bp|kb",x)])
if(all(lengths(tmp)>0)) names(scenicOptions@settings$"dbs") <- tmp
}
loadAttempt <- sapply(getDatabases(scenicOptions), dbLoadingAttempt, indexCol=dbIndexCol)
if(any(!loadAttempt)) stop("It is not possible to load the following databses: \n",
paste(dbs[which(!loadAttempt)], collapse="\n"))
genesInDb <- unique(unlist(lapply(getDatabases(scenicOptions), function(dbFilePath) {
rf <- arrow::ReadableFile$create(dbFilePath)
fr <- arrow::FeatherReader$create(rf)
genesInDb <- names(fr)
rnktype <- "features" #TODO: add as option for custom dbs
genesInDb <- genesInDb[genesInDb != rnktype]
})))
## Check if annotation and rankings (potentially) match:
featuresWithAnnot <- checkAnnots(scenicOptions, motifAnnot)
if(any(featuresWithAnnot == 0)) warning("Missing annotations\n", names(which(rankingsInDb==0)))
### Filter & format co-expression modules
# Remove genes missing from RcisTarget databases
# (In case the input matrix wasn't already filtered)
tfModules_asDF$TF <- as.character(tfModules_asDF$TF)
tfModules_asDF$Target <- as.character(tfModules_asDF$Target)
allTFs <- getDbTfs(scenicOptions)
tfModules_asDF <- tfModules_asDF[which(tfModules_asDF$TF %in% allTFs),]
geneInDb <- tfModules_asDF$Target %in% genesInDb
missingGene <- sort(unique(tfModules_asDF[which(!geneInDb),"Target"]))
if(length(missingGene)>0)
warning(paste0("Genes in co-expression modules not available in RcisTargetDatabases: ",
paste(missingGene, collapse=", ")))
tfModules_asDF <- tfModules_asDF[which(geneInDb),]
######
# Targets with positive correlation
if(all(is.na(tfModules_asDF$corr)))
{
warning("no correlation info available")
tfModules_Selected <- tfModules_asDF
tfModules_Selected$geneSetName <- paste(tfModules_Selected$TF, tfModules_Selected$method, sep="_")
}else{
tfModules_Selected <- tfModules_asDF[which(tfModules_asDF$corr==1),]
tfModules_Selected$geneSetName <- paste(tfModules_Selected$TF, tfModules_Selected$method, sep="_")
if(!onlyPositiveCorr)
{
tfModules_IgnCorr <- tfModules_asDF[which(tfModules_asDF$corr!=1),]
tfModules_IgnCorr$geneSetName <- paste0(tfModules_IgnCorr$TF,"_", tfModules_IgnCorr$method)
# Include positive corrs for these geneSets:
# gplots::venn(list(pos=unique(tfModules_Selected$geneSetName), ign=unique(tfModules_IgnCorr$geneSetName)))
posCorr <- tfModules_Selected[which(tfModules_Selected$geneSetName %in% unique(tfModules_IgnCorr$geneSetName)),]
tfModules_IgnCorr <- rbind(tfModules_IgnCorr, posCorr)
tfModules_IgnCorr$geneSetName <- paste0(tfModules_IgnCorr$geneSetName, "IgnCorr")
tfModules_Selected <- rbind(tfModules_Selected, tfModules_IgnCorr)
}
}
tfModules_Selected$geneSetName <- factor(as.character(tfModules_Selected$geneSetName))
# head(tfModules_Selected)
allGenes <- unique(tfModules_Selected$Target)
#####
# Split into tfModules (TF-modules, with several methods)
tfModules <- split(tfModules_Selected$Target, tfModules_Selected$geneSetName)
# Add TF to the gene set (used in the following steps, careful if editing)
tfModules <- setNames(lapply(names(tfModules), function(gsn) {
tf <- strsplit(gsn, "_")[[1]][1]
unique(c(tf, tfModules[[gsn]]))
}), names(tfModules))
# Keep gene sets with at least 'minGenes' genes
tfModules <- tfModules[which(lengths(tfModules)>=minGenes)]
saveRDS(tfModules, file=getIntName(scenicOptions, "tfModules_forEnrichment")) #TODO as geneset? & previous step?
if(getSettings(scenicOptions, "verbose")) {
tfModulesSummary <- t(sapply(strsplit(names(tfModules), "_"), function(x) x[1:2]))
message("tfModulesSummary:")
print(cbind(sort(table(tfModulesSummary[,2]))))
}
################################################################
### 1. Calculate motif enrichment for each TF-module (Run RcisTarget)
### 1.1 Calculate enrichment
msg <- paste0(format(Sys.time(), "%H:%M"), "\tRcisTarget: Calculating AUC")
if(getSettings(scenicOptions, "verbose")) message(msg)
motifs_AUC <- lapply(getDatabases(scenicOptions), function(rnkName) {
ranking <- importRankings(rnkName, columns=allGenes)
message("Scoring database: ", ranking@description)
RcisTarget::calcAUC(tfModules, ranking, aucMaxRank=0.03*getNumColsInDB(ranking), nCores=nCores, verbose=FALSE)})
saveRDS(motifs_AUC, file=getIntName(scenicOptions, "motifs_AUC"))
### 1.2 Convert to table, filter by NES & add the TFs to which the motif is annotated
# (For each database...)
msg <- paste0(format(Sys.time(), "%H:%M"), "\tRcisTarget: Adding motif annotation")
message(msg)
motifEnrichment <- lapply(motifs_AUC, function(aucOutput)
{
# Extract the TF of the gene-set name (i.e. MITF_w001):
tf <- sapply(setNames(strsplit(rownames(aucOutput), "_"), rownames(aucOutput)), function(x) x[[1]])
# Calculate NES and add motif annotation (provide tf in 'highlightTFs'):
addMotifAnnotation(aucOutput,
nesThreshold=3, digits=3,
motifAnnot=motifAnnot,
motifAnnot_highConfCat=c("directAnnotation", "inferredBy_Orthology"),
motifAnnot_lowConfCat=c("inferredBy_MotifSimilarity",
"inferredBy_MotifSimilarity_n_Orthology"),
highlightTFs=tf)
})
# Merge both tables, adding a column that contains the 'motifDb'
motifEnrichment <- do.call(rbind, lapply(names(motifEnrichment), function(dbName){
cbind(motifDb=dbName, motifEnrichment[[dbName]])
}))
saveRDS(motifEnrichment, file=getIntName(scenicOptions, "motifEnrichment_full"))
msg <- paste0("Number of motifs in the initial enrichment: ", nrow(motifEnrichment))
if(getSettings(scenicOptions, "verbose")) message(msg)
### 1.3 Keep only the motifs annotated to the initial TF
motifEnrichment_selfMotifs <- motifEnrichment[which(motifEnrichment$TFinDB != ""),, drop=FALSE]
msg <- paste0("Number of motifs annotated to the matching TF: ", nrow(motifEnrichment_selfMotifs))
if(getSettings(scenicOptions, "verbose")) message(msg)
rm(motifEnrichment)
if(nrow(motifEnrichment_selfMotifs)==0)
stop("None of the co-expression modules present enrichment of the TF motif: There are no regulons.")
####
if(onlyBestGsPerMotif)
{
met_byDb <- split(motifEnrichment_selfMotifs, motifEnrichment_selfMotifs$motifDb)
for(db in names(met_byDb))
{
met <- met_byDb[[db]]
met <- split(met, factor(met$highlightedTFs))
met <- lapply(met, function(x){
rbindlist(lapply(split(x, x$motif), function(y) y[which.max(y$NES),]))
})
# sapply(met, nrow)
met_byDb[[db]] <- rbindlist(met)
}
motifEnrichment_selfMotifs <- rbindlist(met_byDb)
rm(met_byDb); rm(met)
}
####
################################################################
# 2. Prune targets
msg <- paste0(format(Sys.time(), "%H:%M"), "\tRcisTarget: Pruning targets")
if(getSettings(scenicOptions, "verbose")) message(msg)
dbNames <- getDatabases(scenicOptions)
motifEnrichment_selfMotifs_wGenes <- lapply(names(dbNames), function(motifDbName){
ranking <- importRankings(dbNames[motifDbName], columns=allGenes)
addSignificantGenes(resultsTable=motifEnrichment_selfMotifs[motifEnrichment_selfMotifs$motifDb==motifDbName,],
geneSets=tfModules,
rankings=ranking,
plotCurve = FALSE,
maxRank=5000,
method=signifGenesMethod,
nMean=100,
nCores=nCores)
})
suppressPackageStartupMessages(library(data.table))
motifEnrichment_selfMotifs_wGenes <- rbindlist(motifEnrichment_selfMotifs_wGenes)
saveRDS(motifEnrichment_selfMotifs_wGenes, file=getIntName(scenicOptions, "motifEnrichment_selfMotifs_wGenes"))
if(getSettings(scenicOptions, "verbose"))
{
# TODO messages/print
message(format(Sys.time(), "%H:%M"), "\tNumber of motifs that support the regulons: ", nrow(motifEnrichment_selfMotifs_wGenes))
motifEnrichment_selfMotifs_wGenes[order(motifEnrichment_selfMotifs_wGenes$NES,decreasing=TRUE),][1:5,(1:ncol(motifEnrichment_selfMotifs_wGenes)-1), with=F]
}
# Save as text:
if(!file.exists("output")) dir.create("output")
write.table(motifEnrichment_selfMotifs_wGenes, file=getOutName(scenicOptions, "s2_motifEnrichment"),
sep="\t", quote=FALSE, row.names=FALSE)
if("DT" %in% installed.packages() && nrow(motifEnrichment_selfMotifs_wGenes)>0)
{
nvm <- tryCatch({
colsToShow <- c("motifDb", "logo", "NES", "geneSet", "TF_highConf", "TF_lowConf")
motifEnrichment_2html <- viewMotifs(motifEnrichment_selfMotifs_wGenes, colsToShow=colsToShow, options=list(pageLength=100))
fileName <- getOutName(scenicOptions, "s2_motifEnrichmentHtml")
dirName <- dirname(fileName)
fileName <- basename(fileName)
suppressWarnings(DT::saveWidget(motifEnrichment_2html, fileName))
file.rename(fileName, file.path(dirName, fileName))
if(getSettings(scenicOptions, "verbose")) message("\tPreview of motif enrichment saved as: ", file.path(dirName, fileName))
}, error = function(e) print(e$message))
}
################################################################
# Format regulons & save
motifEnrichment.asIncidList <- apply(motifEnrichment_selfMotifs_wGenes, 1, function(oneMotifRow) {
genes <- strsplit(oneMotifRow["enrichedGenes"], ";")[[1]]
oneMotifRow <- data.frame(rbind(oneMotifRow), stringsAsFactors=FALSE)
data.frame(oneMotifRow[rep(1, length(genes)),c("NES", "motif", "highlightedTFs", "TFinDB", "geneSet", "motifDb")], genes, stringsAsFactors = FALSE)
})
motifEnrichment.asIncidList <- rbindlist(motifEnrichment.asIncidList)
# colnames(motifEnrichment.asIncidList) <- c("NES", "motif", "TF", "annot", "gene", "motifDb", "geneSet")
colnames(motifEnrichment.asIncidList)[which(colnames(motifEnrichment.asIncidList)=="highlightedTFs")] <- "TF"
colnames(motifEnrichment.asIncidList)[which(colnames(motifEnrichment.asIncidList)=="TFinDB")] <- "annot"
colnames(motifEnrichment.asIncidList)[which(colnames(motifEnrichment.asIncidList)=="genes")] <- "gene"
motifEnrichment.asIncidList <- data.frame(motifEnrichment.asIncidList, stringsAsFactors = FALSE)
# Get targets for each TF, but keep info about best motif/enrichment
# (directly annotated motifs are considered better)
regulonTargetsInfo <- lapply(split(motifEnrichment.asIncidList, motifEnrichment.asIncidList$TF), function(tfTargets){
# print(unique(tfTargets$TF))
tfTable <- as.data.frame(do.call(rbind, lapply(split(tfTargets, tfTargets$gene), function(enrOneGene){
highConfAnnot <- "**" %in% enrOneGene$annot
enrOneGeneByAnnot <- enrOneGene
if(highConfAnnot) enrOneGeneByAnnot <- enrOneGeneByAnnot[which(enrOneGene$annot == "**"),]
bestMotif <- which.max(enrOneGeneByAnnot$NES)
tf <- unique(enrOneGene$TF)
cbind(TF=tf,
gene=unique(enrOneGene$gene),
highConfAnnot=highConfAnnot,
nMotifs=nrow(enrOneGene),
bestMotif=as.character(enrOneGeneByAnnot[bestMotif,"motif"]), NES=as.numeric(enrOneGeneByAnnot[bestMotif,"NES"]),
motifDb=as.character(enrOneGeneByAnnot[bestMotif,"motifDb"]), coexModule=gsub(paste0(tf,"_"), "", as.character(enrOneGeneByAnnot[bestMotif,"geneSet"]), fixed=TRUE)
)
})), stringsAsFactors=FALSE)
tfTable[order(tfTable$NES, decreasing = TRUE),]
})
rm(motifEnrichment.asIncidList)
regulonTargetsInfo <- rbindlist(regulonTargetsInfo)
# Optional: Add correlation
corrMat <- loadInt(scenicOptions, "corrMat", ifNotExists="null")
if(!is.null(corrMat))
{
regulonTargetsInfo$spearCor <- NA_real_
for(tf in unique(regulonTargetsInfo$TF))
{
regulonTargetsInfo[which(regulonTargetsInfo$TF==tf),"spearCor"] <- corrMat[tf, unlist(regulonTargetsInfo[which(regulonTargetsInfo$TF==tf),"gene"])]
}
}else warning("It was not possible to add the correlation to the regulonTargetsInfo table.")
# Optional: Add Genie3 score
linkList <- loadInt(scenicOptions, "genie3ll", ifNotExists="null")
if(!is.null(linkList) & ("weight" %in% colnames(linkList)))
{
if(is.data.table(linkList)) linkList <- as.data.frame(linkList)
uniquePairs <- nrow(unique(linkList[,c("TF", "Target")]))
if(uniquePairs == nrow(linkList)) {
linkList <- linkList[which(linkList$weight>=getSettings(scenicOptions, "modules/weightThreshold")),] # TODO: Will not work with GRNBOOST!
rownames(linkList) <- paste(linkList$TF, linkList$Target,sep="__")
regulonTargetsInfo <- cbind(regulonTargetsInfo, CoexWeight=linkList[paste(regulonTargetsInfo$TF, regulonTargetsInfo$gene,sep="__"),"weight"])
}else {
warning("There are duplicated regulator-target (gene id/name) pairs in the co-expression link list.",
"\nThe co-expression weight was not added to the regulonTargetsInfo table.")
}
}else warning("It was not possible to add the weight to the regulonTargetsInfo table.")
saveRDS(regulonTargetsInfo, file=getIntName(scenicOptions, "regulonTargetsInfo"))
write.table(regulonTargetsInfo, file=getOutName(scenicOptions, "s2_regulonTargetsInfo"),
sep="\t", col.names=TRUE, row.names=FALSE, quote=FALSE)
rm(linkList)
# Split into regulons... (output: list TF --> targets)
regulonTargetsInfo_splitByAnnot <- split(regulonTargetsInfo, regulonTargetsInfo$highConfAnnot)
regulons <- NULL
if(!is.null(regulonTargetsInfo_splitByAnnot[["TRUE"]]))
{
regulons <- lapply(split(regulonTargetsInfo_splitByAnnot[["TRUE"]], regulonTargetsInfo_splitByAnnot[["TRUE"]][,"TF"]), function(x) sort(as.character(unlist(x[,"gene"]))))
}
regulons_extended <- NULL
if(!is.null(regulonTargetsInfo_splitByAnnot[["FALSE"]]))
{
regulons_extended <- lapply(split(regulonTargetsInfo_splitByAnnot[["FALSE"]],regulonTargetsInfo_splitByAnnot[["FALSE"]][,"TF"]), function(x) unname(unlist(x[,"gene"])))
regulons_extended <- setNames(lapply(names(regulons_extended), function(tf) sort(unique(c(regulons[[tf]], unlist(regulons_extended[[tf]]))))), names(regulons_extended))
names(regulons_extended) <- paste(names(regulons_extended), "_extended", sep="")
}
regulons <- c(regulons, regulons_extended)
saveRDS(regulons, file=getIntName(scenicOptions, "regulons"))
# Save as incidence matrix (i.e. network)
incidList <- reshape2::melt(regulons)
incidMat <- table(incidList[,2], incidList[,1])
saveRDS(incidMat, file=getIntName(scenicOptions, "regulons_incidMat"))
rm(incidMat)
#TODO NMF::aheatmap(incidMat)
if(getSettings(scenicOptions, "verbose"))
{
# Number of regulons and summary of sizes:
length(regulons) # TODO
summary(lengths(regulons))
}
# Finished. Update status.
scenicOptions@status$current <- 2
invisible(scenicOptions)
}
#' @title getDbAnnotations
#' @description Loads the motif annotation
#' @param scenicOptions Fields used:
#' If scenicOptions@settings$db_annotFiles is set, it will load these files.
#' Otherwise, will load the default RcisTarget annotations based on 'scenicOptions@inputDatasetInfo$org', and 'scenicOptions@settings$db_mcVersion'
#' @return The motif annotations
#' @examples
#' getDbAnnotations(scenicOptions)
#' @export
getDbAnnotations <- function(scenicOptions)
{
dbAnnotFiles <- scenicOptions@settings$db_annotFiles
if(!is.null(dbAnnotFiles))
{
motifAnnotations <- NULL
for(annotPath in dbAnnotFiles)
{
motifAnnot <- data.table::fread(annotPath) #; head(motifAnnot)
motifAnnot$annotationSource <- factor(motifAnnot$annotationSource)
colnames(motifAnnot)[1]<- "motif" # TEMP for now...
levels(motifAnnot$annotationSource) <- c(levels(motifAnnot$annotationSource), c("directAnnotation","inferredBy_Orthology","inferredBy_MotifSimilarity","inferredBy_MotifSimilarity_n_Orthology")) # TEMP for now...
motifAnnotations <- rbind(motifAnnotations, motifAnnot)
}
} else { # Default RcisTarget annotations
if(is.na(getDatasetInfo(scenicOptions, "org"))) stop('Please provide an organism (scenicOptions@inputDatasetInfo$org).')
org <- getDatasetInfo(scenicOptions, "org")
if(is.na(org)) stop("Please provide an organism (scenicOptions@inputDatasetInfo$org).")
if(!org %in% c("hgnc", "mgi", "dmel")) stop("Organism not recognized (scenicOptions@inputDatasetInfo$org).")
if(org=="hgnc") motifAnnotName <- "motifAnnotations_hgnc"
if(org=="mgi") motifAnnotName <- "motifAnnotations_mgi"
if(org=="dmel") motifAnnotName <- "motifAnnotations_dmel"
if(!is.null(scenicOptions@settings$db_mcVersion))
{
if(scenicOptions@settings$db_mcVersion=="v8") motifAnnotName <- paste0(motifAnnotName, "_v8")
}
library(RcisTarget) # Lazyload
data(list=motifAnnotName, package="RcisTarget", verbose = FALSE)
motifAnnotations <- eval(as.name(motifAnnotName))
}
return(motifAnnotations)
}
#' @title getDbTfs
#' @description Provides the list of transcription factors in the RcisTarget databases for the given organism
#' @param scenicOptions Fields used: 'scenicOptions@inputDatasetInfo$org'
#' @return List of transcription factors in the databases.
#' @examples
#' getDbTfs(scenicOptions)
#' @export
getDbTfs <- function(scenicOptions)
{
motifAnnotations <- getDbAnnotations(scenicOptions)
allTFs <- sort(unique(motifAnnotations$TF))
return(allTFs)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.