# Suppress loading messages when building the HTML
suppressPackageStartupMessages({
  library(SCENIC)
  library(AUCell)
  library(RcisTarget)
  library(SCopeLoomR)
  library(KernSmooth)
  library(BiocParallel)
  library(ggplot2)
  library(data.table)
  library(grid)
  library(ComplexHeatmap)
})

options(width=200)

# To build a personalized report, update this working directory:
knitr::opts_knit$set(root.dir="SCENIC_mouseBrain") 

Vignette built on r format(Sys.time(), "%b %d, %Y") with SCENIC version r packageVersion("SCENIC").

SCENIC workflow

This tutorial goes through the steps in the SCENIC workflow:

Building the gene regulatory network (GRN):

  1. Identify potential targets for each TF based on co-expression.
  2. Filtering the expression matrix and running GENIE3/GRNBoost.
  3. Formatting the targets from GENIE3/GRNBoost into co-expression modules.

  4. Select potential direct-binding targets (regulons) based on DNA-motif analysis (RcisTarget: TF motif analysis)

Identify cell states and their regulators:

  1. Analyzing the network activity in each individual cell (AUCell)
  2. Scoring regulons in the cells (calculate AUC)
  3. Optional: Convert the network activity into ON/OFF (binary activity matrix)

  4. Identify stable cell states based on their gene regulatory network activity (cell clustering) and exploring the results...

To start this tutorial you should have read the "Introduction and setup" vignette (vignette("SCENIC_Setup")) and run the setup steps.

Command list

This is an overview of the main commands used to run the SCENIC workflow. (To be used as cheatsheet or template, it is not exhaustive). The commands are explained in the following sections.

### Load data
loomPath <- system.file(package="SCENIC", "examples/mouseBrain_toy.loom")
library(SCopeLoomR)
loom <- open_loom(loomPath)
exprMat <- get_dgem(loom)
cellInfo <- get_cell_annotation(loom)
close_loom(loom)

### Initialize settings
library(SCENIC)
scenicOptions <- initializeScenic(org="mgi", dbDir="cisTarget_databases", nCores=10)
# scenicOptions@inputDatasetInfo$cellInfo <- "int/cellInfo.Rds"
saveRDS(scenicOptions, file="int/scenicOptions.Rds") 

### Co-expression network
genesKept <- geneFiltering(exprMat, scenicOptions)
exprMat_filtered <- exprMat[genesKept, ]
runCorrelation(exprMat_filtered, scenicOptions)
exprMat_filtered_log <- log2(exprMat_filtered+1) 
runGenie3(exprMat_filtered_log, scenicOptions)

### Build and score the GRN
exprMat_log <- log2(exprMat+1)
scenicOptions@settings$dbs <- scenicOptions@settings$dbs["10kb"] # Toy run settings
scenicOptions <- runSCENIC_1_coexNetwork2modules(scenicOptions)
scenicOptions <- runSCENIC_2_createRegulons(scenicOptions, coexMethod=c("top5perTarget")) # Toy run settings
scenicOptions <- runSCENIC_3_scoreCells(scenicOptions, exprMat_log)

# Optional: Binarize activity
# aucellApp <- plotTsne_AUCellApp(scenicOptions, exprMat_log)
# savedSelections <- shiny::runApp(aucellApp)
# newThresholds <- savedSelections$thresholds
# scenicOptions@fileNames$int["aucell_thresholds",1] <- "int/newThresholds.Rds"
# saveRDS(newThresholds, file=getIntName(scenicOptions, "aucell_thresholds"))
scenicOptions <- runSCENIC_4_aucell_binarize(scenicOptions)
tsneAUC(scenicOptions, aucType="AUC") # choose settings

# Export:
# saveRDS(cellInfo, file=getDatasetInfo(scenicOptions, "cellInfo")) # Temporary, to add to loom
export2loom(scenicOptions, exprMat)

# To save the current status, or any changes in settings, save the object again:
saveRDS(scenicOptions, file="int/scenicOptions.Rds") 

### Exploring output 
# Check files in folder 'output'
# Browse the output .loom file @ http://scope.aertslab.org

# output/Step2_MotifEnrichment_preview.html in detail/subset:
motifEnrichment_selfMotifs_wGenes <- loadInt(scenicOptions, "motifEnrichment_selfMotifs_wGenes")
tableSubset <- motifEnrichment_selfMotifs_wGenes[highlightedTFs=="Sox8"]
viewMotifs(tableSubset) 

# output/Step2_regulonTargetsInfo.tsv in detail: 
regulonTargetsInfo <- loadInt(scenicOptions, "regulonTargetsInfo")
tableSubset <- regulonTargetsInfo[TF=="Stat6" & highConfAnnot==TRUE]
viewMotifs(tableSubset) 

# Cell-type specific regulators (RSS): 
regulonAUC <- loadInt(scenicOptions, "aucell_regulonAUC")
rss <- calcRSS(AUC=getAUC(regulonAUC), cellAnnotation=cellInfo[colnames(regulonAUC), "CellType"], )
rssPlot <- plotRSS(rss)
plotly::ggplotly(rssPlot$plot)

Directories

During this workflow we will save multiple files. To keep them tidy, we recommend to set the working directory to a new folder.

For example:

dir.create("SCENIC_MouseBrain")
setwd("SCENIC_MouseBrain") # Or `knitr::opts_knit$set(root.dir = 'example_results/SCENIC_MouseBrain')` in the first chunk if running a notebook

The main outputs of scenic are stored into a loom file, in the the output folder, which also includes some automatically generated plots and reports which you can use to have an overview of the results.

In addition, some intermediate/temporary files will be saved into the int folder, with a numbered prefix to keep them in order. You may use these files to check details about each step, or re-run parts of the analysis with different settings.

Input

Expression matrix

The input for SCENIC is a single-cell RNA-seq expression matrix (with gene-symbol as rownames, see the vignette("SCENIC_Setup") for details). The first step is to load this matrix.

For this tutorial we provide a toy example only 200 cells and <1000 genes from the mouse brain (described in the setup vignette):

loomPath <- system.file(package="SCENIC", "examples/mouseBrain_toy.loom")
# This toy example is a subset of the 3005 cell mouse brain dataset by Zeisel et al.:
download.file("http://loom.linnarssonlab.org/clone/Previously%20Published/Cortex.loom", "Cortex.loom")
loomPath <- "Cortex.loom"

Open the loom file and load the expression matrix (and cell annotation if available)

library(SCopeLoomR)
loom <- open_loom(loomPath)
exprMat <- get_dgem(loom)
cellInfo <- get_cell_annotation(loom)
close_loom(loom)

dim(exprMat)

Cell info/phenodata

In Step 3-4 (scoring the GRN and clustering), it is interesting to compare the results with known information about the cells. You can already indicate which variables to plot, and assign them a specific color (otherwise one will be assigned automatically).

# cellInfo$nGene <- colSums(exprMat>0)
head(cellInfo)
cellInfo <- data.frame(cellInfo)
cbind(table(cellInfo$CellType))
dir.create("int")
saveRDS(cellInfo, file="int/cellInfo.Rds")
# Color to assign to the variables (same format as for NMF::aheatmap)
colVars <- list(CellType=c("microglia"="forestgreen", 
                           "endothelial-mural"="darkorange", 
                           "astrocytes_ependymal"="magenta4", 
                           "oligodendrocytes"="hotpink", 
                           "interneurons"="red3", 
                           "pyramidal CA1"="skyblue", 
                           "pyramidal SS"="darkblue"))
colVars$CellType <- colVars$CellType[intersect(names(colVars$CellType), cellInfo$CellType)]
saveRDS(colVars, file="int/colVars.Rds")
plot.new(); legend(0,1, fill=colVars$CellType, legend=names(colVars$CellType))

Initialize SCENIC settings

In order to keep consistent settings across the multiple steps of SCENIC, most functions in SCENIC package use a common object where the options for the current run are stored. This object replaces the "arguments" for most functions, and should be created at the begining of a SCENIC run with the function initializeScenic().

The default settings should be valid for most analyses. The parameters that need to be specified in all runs is the organism (mgi for mouse, hgnc for human, or dmel for fly), and the directory where the RcisTarget databases are stored (you may create a link in the current directory to avoid duplicating them, e.g. in linux: system("ln -s ~/path/to/dbs databases")).

For details on the options that can be modified check the help of ?initializeScenic or of the specific function that takes it as input.

library(SCENIC)
org <- "mgi" # or hgnc, or dmel
dbDir <- "cisTarget_databases" # RcisTarget databases location
myDatasetTitle <- "SCENIC example on Mouse brain" # choose a name for your analysis
data(defaultDbNames)
dbs <- defaultDbNames[[org]]
scenicOptions <- initializeScenic(org=org, dbDir=dbDir, dbs=dbs, datasetTitle=myDatasetTitle, nCores=10) 

# Modify if needed
scenicOptions@inputDatasetInfo$cellInfo <- "int/cellInfo.Rds"
scenicOptions@inputDatasetInfo$colVars <- "int/colVars.Rds"
# Databases:
# scenicOptions@settings$dbs <- c("mm9-5kb-mc8nr"="mm9-tss-centered-5kb-10species.mc8nr.feather")
# scenicOptions@settings$db_mcVersion <- "v8"

# Save to use at a later time...
saveRDS(scenicOptions, file="int/scenicOptions.Rds") 

Co-expression network

The first step of the SCENIC workflow is to infer potential transcription factor targets based on the expression data. To do this we use GENIE3 or GRNBoost. The input to either of these tools are the expression matrix (filtered), and a list of transcription factors (potential regulators). The output of GENIE3/GRBBoost, and a correlation matrix will be used to create the co-expression modules (runSCENIC_1_coexNetwork2modules()).

Choosing between GENIE3/GRNBoost: In principle, many of the existing methods to infer co-expression networks could be used for this step, as long as its output is a list of potential targets for each TF (and it can be applied to scRNA-seq...). We selected GENIE3 (@huynh2010) because it allows to identify non-linear relationships, even if they are only present in a subset of samples, and it was the best performer in the Network Inference DREAM5 challenge (@marbach2012). GENIE3 can easily be run within R.

However, GENIE3 is very time- and computationally-consuming (it will take several hours or days on datasets of 3-5k cells). To allow scalability to bigger datasets, we created GRNboost (see @aibar2017) and the arboreto framework. GRNBoost provides similar results to GENIE3 in just a fraction of the time (publication in press), so we highly recommend it for bigger datasets.

Subsampling cells: When there is a high proportion of low-quality cells, or if the computation time is an issue, it is also possible to infer the regulatory network using a subset of cells (e.g. selecting random or high-quality cells as input to the co-expression analysis). The activity of the regulatory network, trained on this subset of cells, can then be evaluated on all the cells in the dataset with AUCell (Step 3). Note that to avoid loss of resolution, the subset of cells should be representative of the whole dataset (e.g. contain sufficient representation of all the cell types). Examples of this approach are presented in @aibar2017 (i.e. subsampling this mouse brain dataset, and the analysis of 49k cells from mouse retina).

Gene filter/selection

To run GENIE3/GRNBoost we recommend to apply soft gene filter, to remove genes that are expressed either at very low levels or in too few cells. Here we apply a filtering based on the total number of counts of the gene, and the number of cells in which it is detected.

  1. Filter by the total number of reads per gene. This filter is meant to remove genes that are most likely noise. By default it keeps only the genes with at least r 3*.01*ncol(exprMat) UMI counts across all samples (e.g. the total number the gene would have, if it was expressed with a value of 3 in 1% of the cells). Adjust this value (minCountsPerGene) according to the dataset (it will depend on the dataset units, e.g. UMI, TPMs...).

  2. Filter by the number of cells in which the gene is detected (e.g. >0 UMI, or >1 log2(TPM)). By default (minSamples), genes that are detected in at least 1% of the cells are kept. This filtering is meant to remove genes whose reads come from one a few 'noisy' cells (genes that are only expressed in one, or very few cells, gain a lot of weight if they happen to coincide in a given cell). To avoid removing small (but potentially interesting) cell populations, we recommend to set a percentage lower than the smallest population of cells to be detected.

  3. Finally, only the genes that are available in RcisTarget databases will be kept. This filter is mostly to save some running time for GENIE3/GRNBoost, since the genes that are not available in the databases will not be used in upcoming steps.

# (Adjust minimum values according to your dataset)
genesKept <- geneFiltering(exprMat, scenicOptions=scenicOptions,
                           minCountsPerGene=3*.01*ncol(exprMat),
                           minSamples=ncol(exprMat)*.01)

Before proceeding to the network inference, check whether any known relevant genes are filtered-out (if any relevant gene is missing, double-check whether the filters are appropiate):

interestingGenes <- c("Sox9", "Sox10", "Dlx5")
# any missing?
interestingGenes[which(!interestingGenes %in% genesKept)]

We can now filter the expression matrix to contain only these r length(genesKept) genes. This matrix is now ready for the co-expression analysis.

exprMat_filtered <- exprMat[genesKept, ]
dim(exprMat_filtered)

exprMat will not be used for the co-expression analysis, it can be unloaded:

rm(exprMat)

Correlation

GENIE3/GRNBoost can detect both positive and negative associations. In order to distinguish potential activation from repression, we will split the targets into positive- and negative-correlated targets (i.e. Spearman correlation between the TF and the potential target).

(This step can be run either before/after or simultaneously to GENIE3/GRNBoost)

Calculate the correlation:

runCorrelation(exprMat_filtered, scenicOptions)

GENIE3

To run GRNBoost (in Python) instead of GENIE3. See ?exportsForGRNBoost for details.

The input to GENIE3 is typically an expression matrix and a list of candidate regulators. The function runGenie3 will run GENIE3 with default settings, which are usually adequate for most datasets, using the transcription factors available in RcisTarget databases as candidate regulators.

Since GENIE3 is based on a Random Forest approach, each time it is run the results will be slightly different. The higher the number of trees used (ntrees), the lower the variability. We recommend to use set.seed to reproduce exact results in multiple runs. For more details, check ?GENIE3 (GENIE3 help) or ?runGenie3 (SCENIC wrapper for GENIE3).

GENIE3 will typically take several hours (or days) to run. If you are running this workflow on an RStudio session, we recommend that you stop here and run the next code chunk in an independent R console (i.e. with screen/tmux) or in an server/HPC (if available). The upcoming code chunks will resume the workflow by loading GENIE3 output.

## If launched in a new session, you will need to reload...
# setwd("...")
# loomPath <- "..."
# loom <- open_loom(loomPath)
# exprMat <- get_dgem(loom)
# close_loom(loom)
# genesKept <- loadInt(scenicOptions, "genesKept")
# exprMat_filtered <- exprMat[genesKept,]
# library(SCENIC)
# scenicOptions <- readRDS("int/scenicOptions.Rds")

# Optional: add log (if it is not logged/normalized already)
exprMat_filtered <- log2(exprMat_filtered+1) 

# Run GENIE3
runGenie3(exprMat_filtered, scenicOptions)

Build and score the GRN (runSCENIC_...)

Once the results from GENIE3/GRNBoost (and the correlation) are ready, the remaining steps of SCENIC can be run.

The easiest/fastest way is to use the following wrapper functions, each of them corresponding to one of the main steps in SCENIC workflow:

Build the gene regulatory network: 1. Get co-expression modules 2. Get regulons (with RcisTarget): TF motif analysis)

Identify cell states: 3. Score GRN (regulons) in the cells (with AUCell) 4. Cluster cells according to the GRN activity

An overview of the steps workflow is explained in @aibar2017. Detailed tutorials/notebooks explaining these functions in detail are also available (see vignette(package="SCENIC")). These might be useful for users who want to know the details of the implementation, understand the results more in depth, or to modify or run only some of the steps of the workflow. We recommend to check vignette("detailedStep_2_createRegulons", package="SCENIC") to understand how the Gene Regulatory Networks are build. For more info on the scoring of the networks on the cells see vignette("detailedStep_3_scoreCells", package="SCENIC").

Re-load the expression matrix if necessary:

loom <- open_loom(loomPath)
exprMat <- get_dgem(loom)
close_loom(loom)
# Optional: log expression (for TF expression plot, it does not affect any other calculation)
exprMat_log <- log2(exprMat+1)
dim(exprMat)

Run the remaining steps using the wrapper functions:

library(SCENIC)
scenicOptions <- readRDS("int/scenicOptions.Rds")
scenicOptions@settings$verbose <- TRUE
scenicOptions@settings$nCores <- 10
scenicOptions@settings$seed <- 123

# For a very quick run: 
# coexMethod=c("top5perTarget")
scenicOptions@settings$dbs <- scenicOptions@settings$dbs["10kb"] # For toy run
# save...

scenicOptions <- runSCENIC_1_coexNetwork2modules(scenicOptions)
scenicOptions <- runSCENIC_2_createRegulons(scenicOptions, coexMethod=c("top5perTarget")) #** Only for toy run!!
scenicOptions <- runSCENIC_3_scoreCells(scenicOptions, exprMat_log)

saveRDS(scenicOptions, file="int/scenicOptions.Rds") # To save status

*For a quick run*, using the 'toy dataset', you may set runSCENIC_2_createRegulons(scenicOptions, coexMethod="top5perTarget") and only one of the RcisTarget databases. These results will not be as comprehensive as the full run, but might be enough for getting a feel of the interface.

Optional steps:

The following steps are optional. Feel free to jump to exploring the results, or exporting to the web-viewer (SCope).

Binarize the network activity (regulon on/off)

Building the GRN and scoring its activity in AUCell is often enough for datasets with very clear cell types. However, in many cases it is also useful to binarize the activity score into "on/off"; either for easier interpretation, or for maximizing the differences across cell types. It is possible to binarize only specific regulons for exploring/interpreting key TFs. However, binarizing the activity of all the regulons in the dataset allows to create the "Binarized regulon activity matrix", which can be used for upstream analysis (e.g. clustering). The binarized activity is specially useful to reduce technical biases (e.g. number of detected genes, batch effects), the grouping by patient of origin in cancer datasets, or even cross-species comparisons (see @aibar2017).

To determine in which cells each regulon is active, we will use an AUC threshold. AUCell automatically calculates possible thresholds for the binarization, but they are often too conservative. We recommend to check these thresholds manually before proceeding to the binarization. This can be a iterative process, where the thresholds can be re-adjusted after an initial exploration. Once the final thresholds are selected, the cell-regulon activity will be summarized into a binary activity matrix in which the columns represent the cells and the rows the regulons. The coordinates of the matrix that correspond to active regulons in a given cell will contain a "1" value, and "0" all the others.

You can see the selected thresholds in the output from the previous step [file: output/Step3_3.2_AUCtSNEs.html (If you are using Rstudio, you might need to download the file and accompanying folder)], and these can be adjusted with AUCell's Shiny app:

aucellApp <- plotTsne_AUCellApp(scenicOptions, exprMat_log)
savedSelections <- shiny::runApp(aucellApp)

# Save the modified thresholds:
newThresholds <- savedSelections$thresholds
scenicOptions@fileNames$int["aucell_thresholds",1] <- "int/newThresholds.Rds"
saveRDS(newThresholds, file=getIntName(scenicOptions, "aucell_thresholds"))
saveRDS(scenicOptions, file="int/scenicOptions.Rds") 

Once you have optimized the thresholds, run runSCENIC_4_aucell_binarize to binarize the AUC, and generate some extra figures and clusterings:

# scenicOptions@settings$devType="png"
scenicOptions <- runSCENIC_4_aucell_binarize(scenicOptions)

The t-SNEs can also be created using the binary activity matrix (in the same way as indicated in section "Creating/comparing t-SNEs"), just set tsneAUC( ..., aucType="binary") instead.

Clustering / dimensionality reduction on the regulon activity

The cells can be grouped/clustered based on the regulon activity, either continuous or binarized (See the section below "Exploring > Cell states" for details).

If using t-SNE as visualization, it is recommended to try different settings to evaluate the stability of the states/clusters. Feel free to use UMAP, other clustering methods (or trajectory inference methods, if appropriate) instead.

The function included in SCENIC package runs multiple t-SNEs with different settings; It will create all combinations between the selected "number of PCs" and "perplexity" (expected running time: few minutes to hours, depending on the number of cells):

nPcs <- c(5) # For toy dataset
# nPcs <- c(5,15,50)
scenicOptions@settings$seed <- 123 # same seed for all of them
# Run t-SNE with different settings:
fileNames <- tsneAUC(scenicOptions, aucType="AUC", nPcs=nPcs, perpl=c(5,15,50))
fileNames <- tsneAUC(scenicOptions, aucType="AUC", nPcs=nPcs, perpl=c(5,15,50), onlyHighConf=TRUE, filePrefix="int/tSNE_oHC")
# Plot as pdf (individual files in int/):
fileNames <- paste0("int/",grep(".Rds", grep("tSNE_", list.files("int"), value=T), value=T))

Note: The toy dataset only contains ~8 regulons; using more than 8 PCs will not provide any difference...

and to view/compare them...

par(mfrow=c(length(nPcs), 3))
fileNames <- paste0("int/",grep(".Rds", grep("tSNE_AUC", list.files("int"), value=T, perl = T), value=T))
plotTsne_compareSettings(fileNames, scenicOptions, showLegend=FALSE, varName="CellType", cex=.5)
# Using only "high-confidence" regulons (normally similar)
par(mfrow=c(3,3))
fileNames <- paste0("int/",grep(".Rds", grep("tSNE_oHC_AUC", list.files("int"), value=T, perl = T), value=T))
plotTsne_compareSettings(fileNames, scenicOptions, showLegend=FALSE, varName="CellType", cex=.5)

The chosen t-SNE can then be saved as default to use for plots (can also be "binary", see below):

scenicOptions@settings$defaultTsne$aucType <- "AUC"
scenicOptions@settings$defaultTsne$dims <- 5
scenicOptions@settings$defaultTsne$perpl <- 15
saveRDS(scenicOptions, file="int/scenicOptions.Rds")

Export to loom/SCope

The results from SCENIC can also be explored in http://scope.aertslab.org (@davie2018).

The .loom file can be created with the function export2loom() (requires the package SCopeLoomR). This function saves the the main results from SCENIC into a .loom file:

The motif enrichment analysis and co-expression modules (e.g. GRNBoost/GENIE3 output) are stored in independent text files (mostly due to their bigger size).

# DGEM (Digital gene expression matrix)
# (non-normalized counts)
exprMat <- get_dgem(open_loom(loomPath))
dgem <- exprMat
head(colnames(dgem))  #should contain the Cell ID/name

# Export:
scenicOptions@fileNames$output["loomFile",] <- "output/mouseBrain_SCENIC.loom"
export2loom(scenicOptions, exprMat)

To add extra data (e.g. embeddings or clusters), see help(package="SCopeLoomR").

Loading results from a .loom file

SCopeLoomR also provides functions to import the regulons, AUC, and embeddings back from the loom file. e.g.:

library(SCopeLoomR)
scenicLoomPath <- getOutName(scenicOptions, "loomFile")
loom <- open_loom(scenicLoomPath)

# Read information from loom file:
regulons_incidMat <- get_regulons(loom)
regulons <- regulonsToGeneLists(regulons_incidMat)
regulonsAUC <- get_regulons_AUC(loom)
regulonsAucThresholds <- get_regulon_thresholds(loom)
embeddings <- get_embeddings(loom)

Exploring/interpreting the results

The output folder contains several files that provide an overview of the results from each step. These results can be explored in more detail through the intermediate files (saved in the int folder, which can be listed with loadInt(scenicOptions)).

Some examples on how to explore the results:

Cell states

AUCell provides the activity of the regulons across the cells. By clustering the cells based on this regulon activity (either the continuous or binary AUC matrix), we can see whether there are groups of cells that tend to have the same regulons active, and reveal the network states that are recurrent across multiple cells. These states would be equivalent to the attractor states of the network. Combining these clustering with different visualization methods, we can explore the association of cell states with specific regulons.

SCENIC provides some wrapper functions to get a quick overview. For example, projecting the AUC and TF expression onto t-SNEs, and visualizing of the AUC as heatmaps, but feel free to explore alternative clustering and visualization tools.

Projection the AUC and TF expression onto t-SNEs

Briefly, a t-SNE is a 2D projection of the cells, where cells (dots) are placed close to each other if they have similar input profiles (in our case, regulon activity). The t-SNE usually allows to get a quick and easy overview of the cell states in the dataset. Note however, that t-SNE works well to identify distinct classes, but it is not appropiate for dinamic/continuous processes (e.g. trajectory-like visualizations).

AUCell's interactive app (for SCope, see section "Export to loom/SCope"):

exprMat_log <- exprMat # Better if it is logged/normalized
aucellApp <- plotTsne_AUCellApp(scenicOptions, exprMat_log) # default t-SNE
savedSelections <- shiny::runApp(aucellApp)

AUCell_plotTSNE() to save static plots:

print(tsneFileName(scenicOptions))
tSNE_scenic <- readRDS(tsneFileName(scenicOptions))
aucell_regulonAUC <- loadInt(scenicOptions, "aucell_regulonAUC")

# Show TF expression:
par(mfrow=c(2,3))
AUCell::AUCell_plotTSNE(tSNE_scenic$Y, exprMat, aucell_regulonAUC[onlyNonDuplicatedExtended(rownames(aucell_regulonAUC))[c("Dlx5", "Sox10", "Sox9","Irf1", "Stat6")],], plots="Expression")
# Save AUC as PDF:
Cairo::CairoPDF("output/Step4_BinaryRegulonActivity_tSNE_colByAUC.pdf", width=20, height=15)
par(mfrow=c(4,6))
AUCell::AUCell_plotTSNE(tSNE_scenic$Y, cellsAUC=aucell_regulonAUC, plots="AUC")
dev.off()

Density plot to detect most likely stable states (higher-density areas in the t-SNE):

library(KernSmooth)
library(RColorBrewer)
dens2d <- bkde2D(tSNE_scenic$Y, 1)$fhat
image(dens2d, col=brewer.pal(9, "YlOrBr"), axes=FALSE)
contour(dens2d, add=TRUE, nlevels=5, drawlabels=FALSE)

Show several regulons simultaneously:

#par(bg = "black")
par(mfrow=c(1,2))

regulonNames <- c( "Dlx5","Sox10")
cellCol <- SCENIC::plotEmb_rgb(scenicOptions, regulonNames, aucType="AUC", aucMaxContrast=0.6)

regulonNames <- list(red=c("Sox10", "Sox8"),
                     green=c("Irf1"),
                     blue=c( "Tef"))
cellCol <- SCENIC::plotEmb_rgb(scenicOptions, regulonNames, aucType="Binary")

GRN: Regulon targets and motifs

Genes included in the regulons:

regulons <- loadInt(scenicOptions, "regulons")
regulons[c("Dlx5", "Irf1")]

Note than only regulons with 10 genes or more are scored with AUCell (the numbers in brackets in the regulon names indicate the number of genes in the regulon):

regulons <- loadInt(scenicOptions, "aucell_regulons")
head(cbind(onlyNonDuplicatedExtended(names(regulons))))

Details on the TF-target links: For each TF-target pair, the stats from the intermediate steps are summarized in loadInt(scenicOptions, "regulonTargetsInfo") (saved as text in: getOutName(scenicOptions, "s2_regulonTargetsInfo"): r getOutName(scenicOptions, "s2_regulonTargetsInfo")). This table can be used to explore the support to specific links. Since it will typically contain several thousand rows (in this run: r nrow(loadInt(scenicOptions, "regulonTargetsInfo"))), in most cases it is advisable to subset it before exporting it as HTML.

regulonTargetsInfo <- loadInt(scenicOptions, "regulonTargetsInfo")
tableSubset <- regulonTargetsInfo[TF=="Stat6" & highConfAnnot==TRUE]
viewMotifs(tableSubset, options=list(pageLength=5)) 

The full list of TF motifs supporting the regulons can be seen in the restuls from RcisTarget motif enrichment results (for the co-expression modules). These are saved in motifEnrichment_selfMotifs_wGenes. A preview of these results is exported as html in r getOutName(scenicOptions, "s2_motifEnrichmentHtml") (and as text in: r getOutName(scenicOptions, "s2_motifEnrichment")).

Alternative tables, showing more or fewer rows/columns could be generated modifiying this code:

motifEnrichment_selfMotifs_wGenes <- loadInt(scenicOptions, "motifEnrichment_selfMotifs_wGenes")
tableSubset <- motifEnrichment_selfMotifs_wGenes[highlightedTFs=="Dlx5"]
viewMotifs(tableSubset) 

Regulators for known cell types or clusters

The regulatory analysis from SCENIC can be combined with other analyses (typically clustering), or focus on regulators for specific cell types. There are multiple options to do these analyses (your imagination is the limit!). Here are some quick examples to start:

To start from clusters/cell types from Seurat: cellInfo <- data.frame(seuratCluster=Idents(seuratObject)))

regulonAUC <- loadInt(scenicOptions, "aucell_regulonAUC")
regulonAUC <- regulonAUC[onlyNonDuplicatedExtended(rownames(regulonAUC)),]
regulonActivity_byCellType <- sapply(split(rownames(cellInfo), cellInfo$CellType),
                                     function(cells) rowMeans(getAUC(regulonAUC)[,cells]))
regulonActivity_byCellType_Scaled <- t(scale(t(regulonActivity_byCellType), center = T, scale=T))

ComplexHeatmap::Heatmap(regulonActivity_byCellType_Scaled, name="Regulon activity")
topRegulators <- reshape2::melt(regulonActivity_byCellType_Scaled)
colnames(topRegulators) <- c("Regulon", "CellType", "RelativeActivity")
topRegulators <- topRegulators[which(topRegulators$RelativeActivity>0),]
viewTable(topRegulators)
minPerc <- .7
binaryRegulonActivity <- loadInt(scenicOptions, "aucell_binary_nonDupl")
cellInfo_binarizedCells <- cellInfo[which(rownames(cellInfo)%in% colnames(binaryRegulonActivity)),, drop=FALSE]
regulonActivity_byCellType_Binarized <- sapply(split(rownames(cellInfo_binarizedCells), cellInfo_binarizedCells$CellType), 
                                               function(cells) rowMeans(binaryRegulonActivity[,cells, drop=FALSE]))
binaryActPerc_subset <- regulonActivity_byCellType_Binarized[which(rowSums(regulonActivity_byCellType_Binarized>minPerc)>0),]
ComplexHeatmap::Heatmap(binaryActPerc_subset, name="Regulon activity (%)", col = c("white","pink","red"))

topRegulators <- reshape2::melt(regulonActivity_byCellType_Binarized)
colnames(topRegulators) <- c("Regulon", "CellType", "RelativeActivity")
topRegulators <- topRegulators[which(topRegulators$RelativeActivity>minPerc),]
viewTable(topRegulators)
# regulonAUC <- loadInt(scenicOptions, "aucell_regulonAUC")
rss <- calcRSS(AUC=getAUC(regulonAUC), cellAnnotation=cellInfo[colnames(regulonAUC), "CellType"])
rssPlot <- plotRSS(rss)
plotly::ggplotly(rssPlot$plot)
plotRSS_oneSet(rss, setName = "interneurons")
library(Seurat)
dr_coords <- Embeddings(seuratObject, reduction="tsne")

tfs <- c("Sox10","Irf1","Sox9", "Dlx5")
par(mfrow=c(2,2))
AUCell::AUCell_plotTSNE(dr_coords, cellsAUC=selectRegulons(regulonAUC, tfs), plots = "AUC")

SessionInfo

date()
sessionInfo()

References



aertslab/SCENIC documentation built on April 7, 2024, 10 a.m.