# Suppress loading messages when building the HTML
suppressPackageStartupMessages({
  library(SCENIC)
  library(AUCell)
  library(RcisTarget)
  library(SingleCellExperiment)
})

This tutorial provides the detailed explanation of runSCENIC_1_coexNetwork2modules(): Convert the output from GENIE3/GRNBoost to co-expression modules (potential TF-targets).

All the code below is the content of the function runSCENIC_1_coexNetwork2modules(). This tutorial is meant for advanced users, who want know the details about what this function does internally, or to modify the workflow. There is no need to follow this tutorial for a regular run of SCENIC (see vignette("SCENIC_Running")).

Overview of Step 1 - Part2: Transform GENIE3 output into co-expression modules

Once GENIE3/GRNBoost are run, it provides a link list including the potential regulators for each gene and their weight. The weight represents the relevance that the transcription factor (regulator) has in the prediction of the target. However, this output includes all possible pairs of genes and regulators (even if the weight is very low) and there is not a specific method or clear recommendation to select a subset of them. The most direct way to obtain the relevant links is to keep only those with a weight over a given threshold. These links can then be split by the transcription factor, to obtain potential targets for each TF.

We explored several ways to determine the threshold (e.g. looking at the rankings, distributions, and output after pruning with RcisTarget), but there was no specific method that was optimum for all cases. On the contrary, the best results applying SCENIC resulted from the combination of several strategies. In this way, we have finally opted for building multiple gene-sets of potential targets for each transcription factor: [a] setting several weight thresholds, [b] taking the top 50 targets for each TF, and [c] keeping only the top regulators for each target gene (then, split by TF). In all these cases, only the links with weight>0.001 (or value in "modules/weightThreshold" in scenicOptions) are taken into account.

The first method to create the TF-modules is to select the best targets for each transcription factor:

  1. Targets with weight > 0.001

  2. Targets with weight > 0.005

  3. Top 50 targets (targets with highest weight)

The alternative way to create the TF-modules is to select the best regulators for each gene (this is actually how GENIE3 internally works). Then, these targets can be assigned back to each TF to form the TF-modules. In this way we will create three more gene-sets:

  1. Targets for which the TF is within its top 5 regulators

  2. Targets for which the TF is within its top 10 regulators

  3. Targets for which the TF is within its top 50 regulators

The resulting TF-modules from these steps can already be analyzed for motif enrichment (Step 2 of SCENIC). However, GENIE3 can detect both positive and negative associations. In order to distinguish potential activation from repression, we will add the correlation information, which will be used to split the targets into positive- and negative- correlated targets in the next step.

Input

setwd("SCENIC_MouseBrain")
scenicOptions <- readRDS("int/scenicOptions.Rds")

runSCENIC_1_coexNetwork2modules() code:

linkList <- loadInt(scenicOptions, "genie3ll")
if(!all(colnames(linkList) == c("TF", "Target", "weight"))) stop('The link list colnames should be "TF", "Target", "weight"')

uniquePairs <- nrow(unique(linkList[,c("TF", "Target")]))
if(uniquePairs < nrow(linkList)) 
  stop("There are duplicated regulator-target (gene id/name) pairs in the input link list.")

msg <- paste0(format(Sys.time(), "%H:%M"), "\tCreating TF modules")
if(getSettings(scenicOptions, "verbose")) message(msg)

print(quantile(linkList$weight, probs=c(0.75, 0.90)))
.openDev(fileName=getIntName(scenicOptions, "genie3weighPlot"), 
         devType=getSettings(scenicOptions, "devType"))
  plot(linkList$weight[1:1000000], type="l", ylim=c(0, max(linkList$weight)), main="Weight of the links",
       ylab="Weight", xlab="Links sorted decreasingly")
  abline(h=0.001, col="blue") # Threshold
  #sum(linkList$weight>0.001)/nrow(linkList)
dev.off()

# Keep only genes with weight > threshold
linkList_001 <- linkList[which(linkList[,"weight"]>getSettings(scenicOptions, "modules/weightThreshold")),]
if(getSettings(scenicOptions, "verbose")) message("Number of links between TFs and targets: ", nrow(linkList_001))

#### Create the gene-sets & save:
tfModules <- list()

linkList_001$TF <- as.character(linkList_001$TF)
linkList_001$Target <- as.character(linkList_001$Target)

### Create TF-modules:
# 1: Weight > 0.001 (filtered in previous step)
tfModules[["w001"]] <- split(linkList_001$Target, factor(linkList_001$TF))

# 2: Weight > 0.005
llminW <- linkList_001[which(linkList_001[,"weight"]>0.005),]
tfModules[["w005"]] <- split(llminW$Target, factor(llminW$TF))

# 3: Top 50 targets for each TF
# ("w001" should be ordered decreasingly by weight)
tfModules[["top50"]] <- lapply(tfModules[["w001"]], function(x) x[1:(min(length(x), 50))])

# 4-6: Top regulators per target
# (linkList_001 should be ordered by weight!)
linkList_001_byTarget <- split(linkList_001, factor(linkList_001$Target))

nTopTfs <- c(5, 10, 50)
nTopTfs <- setNames(nTopTfs, paste("top", nTopTfs, "perTarget", sep=""))

#library(reshape2); library(data.table)
topTFsperTarget <- lapply(linkList_001_byTarget, function(llt) {
  nTFs <- nTopTfs[which(nTopTfs <= nrow(llt))]
  reshape2::melt(lapply(nTFs, function(x) llt[1:x,"TF"]))
})

topTFsperTarget <- topTFsperTarget[which(!sapply(sapply(topTFsperTarget, nrow), is.null))]
topTFsperTarget.asDf <-  data.frame(data.table::rbindlist(topTFsperTarget, idcol=TRUE))
# topTFsperTarget.asDf <- apply(topTFsperTarget.asDf, 2, as.character)
colnames(topTFsperTarget.asDf) <- c("Target", "TF", "method")

# Merge the all the gene-sets:
tfModules.melted <- reshape2::melt(tfModules)
colnames(tfModules.melted) <- c("Target", "TF", "method")
tfModules <- rbind(tfModules.melted, topTFsperTarget.asDf)
rm(tfModules.melted); rm(topTFsperTarget.asDf)
tfModules$TF <- as.character(tfModules$TF)
tfModules$Target <- as.character(tfModules$Target)

# Basic counts:  #TODO add comment
if(getSettings(scenicOptions, "verbose")) 
  print(
    rbind(nTFs=length(unique(tfModules$TF)),
          nTargets=length(unique(tfModules$Target)),
          nGeneSets=nrow(unique(tfModules[,c("TF","method")])),
          nLinks=nrow(tfModules))
  )

### Add correlation to split into positive- and negative-correlated targets
corrMat <- loadInt(scenicOptions, "corrMat")
# Keep only correlation between TFs and potential targets
tfs <- unique(tfModules$TF)
missingTFs <- tfs[which(!tfs %in% rownames(corrMat))]
if(length(missingTFs) >0 ) 
{ 
  warning("The following TFs are missing from the correlation matrix: ", paste(missingTFs, collapse=", "))

  tfs <- tfs[which(tfs %in% rownames(corrMat))]
  corrMat <- corrMat[tfs,]
}

# Add correlation to the table
# "corr" column: 1 if the correlation between the TF and the target is > 0.03, -1 if the correlation is < -0.03 and 0 otherwise.
tfModules_byTF <- split(tfModules, as.factor(tfModules$TF))
tfModules_withCorr_byTF <- lapply(tfModules_byTF[tfs[1:4]], function(tfGeneSets)
{
  tf <- as.character(unique(tfGeneSets$TF))
  targets <- as.character(tfGeneSets$Target)
  cbind(tfGeneSets, corr=c(as.numeric(corrMat[tf,targets] > 0.03) - as.numeric(corrMat[tf,targets] < -0.03)))
})
tfModules_withCorr <- data.frame(data.table::rbindlist(tfModules_withCorr_byTF))
if(length(missingTFs) >0 ) 
{ 
  tfModules_withCorr <- rbind(tfModules_withCorr, data.frame(tfModules[tfModules$TF %in% missingTFs,], corr=NA))
}
saveRDS(tfModules_withCorr, file=getIntName(scenicOptions, "tfModules_asDF"))

# Finished. Update status.
object@status$current <- 1

From this table, we can easily select gene-sets associated to each TF (i.e. split(tfModules_withCorr$Target, tfModules_withCorr$TF)). In this way, we obtain a list of potential targets for each TF based on their co-expression (TF co-expression modules).

In the next step, we will use RcisTarget to check which of these targets present enrichment of the motifs of the corresponding TF.



aertslab/SCENIC documentation built on April 7, 2024, 10 a.m.