###################################################################
# Functional Genomics Center Zurich
# This code is distributed under the terms of the GNU General
# Public License Version 3, June 2007.
# The terms are available here: http://www.gnu.org/licenses/gpl.html
# www.fgcz.ch
EzAppScSeurat <-
setRefClass("EzAppScSeurat",
contains = "EzApp",
methods = list(
initialize = function() {
"Initializes the application using its specific defaults."
runMethod <<- ezMethodScSeurat
name <<- "EzAppScSeurat"
appDefaults <<- rbind(
nfeatures = ezFrame(
Type = "numeric",
DefaultValue = 3000,
Description = "number of variable genes for SCT"
),
npcs = ezFrame(
Type = "numeric",
DefaultValue = 20,
Description = "The maximal dimensions to use for reduction"
),
pcGenes = ezFrame(
Type = "charVector",
DefaultValue = "",
Description = "The genes used in unsupervised clustering"
),
SCT.regress.CellCycle = ezFrame(
Type = "logical",
DefaultValue = FALSE,
Description="Choose CellCycle to be regressed out when using the SCTransform method if it is a bias."
),
DE.method = ezFrame(
Type = "charVector",
DefaultValue = "wilcoxon",
Description = "Method to be used when calculating gene cluster markers. Use LR if you want to include cell cycle in the regression model."
),
min.pct = ezFrame(
Type = "numeric",
DefaultValue = 0.1,
Description = "Used in calculating cluster markers: The minimum fraction of cells in either of the two tested populations."
),
min.diff.pct = ezFrame(
Type = "numeric",
DefaultValue = 0,
Description = "Used for filtering cluster markers: The minimum difference of cell fraction of the two tested populations."
),
logfc.threshold = ezFrame(
Type = "numeric",
DefaultValue = 0.25,
Description = "Used in calculating cluster markers: Limit testing to genes which show, on average, at least X-fold difference (log-scale) between the two groups of cells."
),
pvalue_allMarkers = ezFrame(
Type = "numeric",
DefaultValue = 0.01,
Description = "Used for filtering cluster markers: adjusted pValue threshold for marker detection"
),
resolution = ezFrame(
Type = "numeric",
DefaultValue = 0.5,
Description = "Value of the resolution parameter, use a value above (below) 1.0 if you want to obtain a larger (smaller) number of communities."
),
nreads = ezFrame(
Type = "numeric",
DefaultValue = Inf,
Description = "Low quality cells have less than \"nreads\" reads. Only when applying fixed thresholds."
),
ngenes = ezFrame(
Type = "numeric",
DefaultValue = Inf,
Description = "Low quality cells have less than \"ngenes\" genes. Only when applying fixed thresholds."
),
perc_mito = ezFrame(
Type = "numeric",
DefaultValue = Inf,
Description = "Low quality cells have more than \"perc_mito\" percent of mitochondrial genes. Only when applying fixed thresholds."
),
perc_riboprot = ezFrame(
Type = "numeric",
DefaultValue = Inf,
Description = "Low quality cells have more than \"perc_ribo\" percent of ribosomal genes. Only when applying fixed thresholds."
),
keepDoublets = ezFrame(
Type = "logical",
DefaultValue = FALSE,
Description = "Whether we should keep cells suspected of being doublets. Set to TRUE only for QC purposes."
),
maxEmptyDropPValue = ezFrame(
Type = "numeric",
DefaultValue = 1,
Description = "filter droplets based on DropletUtils::emptyDrops method"
),
cellsFraction = ezFrame(
Type = "numeric",
DefaultValue = 0.01,
Description = "A gene will be kept if it is expressed in at least this percentage of cells"
),
nUMIs = ezFrame(
Type = "numeric",
DefaultValue = 1,
Description = "A gene will be kept if it has at least nUMIs in the fraction of cells specified before"
),
nmad = ezFrame(
Type = "numeric",
DefaultValue = 3,
Description = "Median absolute deviation (MAD) from the median value of each metric across all cells"
),
filterByExpression = ezFrame(
Type = "character", DefaultValue = FALSE,
Description = "Keep cells according to specific gene expression. i.e. Set > 1 | Pkn3 > 1"
),
estimateAmbient = ezFrame(
Type = "logical", DefaultValue = TRUE,
Description = "estimate contamination with ambient RNA"
),
controlSeqs = ezFrame(
Type = "charVector",
DefaultValue = "",
Description = "control sequences to add"
),
enrichrDatabase = ezFrame(
Type = "charVector", DefaultValue = "", Description="enrichR databases to search"
),
geneCountModel = ezFrame(
Type = "character",
DefaultValue = "GeneFull_ExonOverIntron",
Description = "(STARsolo Input Only) The gene count model, i.e. Solo features, to use from the previous step"
),
computePathwayTFActivity=ezFrame(Type="logical",
DefaultValue="TRUE",
Description="Whether we should compute pathway and TF activities.")
)
}
)
)
ezMethodScSeurat <- function(input = NA, output = NA, param = NA,
htmlFile = "00index.html") {
library(HDF5Array)
library(AUCell)
library(GSEABase)
library(SingleR)
library(Seurat)
library(tidyverse)
library(scDblFinder)
library(BiocParallel)
library(scuttle)
library(DropletUtils)
library(enrichR)
library(decoupleR)
library(Azimuth)
if (param$cores > 1){
BPPARAM <- MulticoreParam(workers = param$cores)
} else {
## scDblFinder fails with many cells and MulticoreParam
BPPARAM <- SerialParam()
}
register(BPPARAM)
require(future)
plan("multicore", workers = param$cores)
set.seed(38)
future.seed = TRUE
options(future.rng.onMisuse="ignore")
options(future.globals.maxSize = param$ram*1024^3)
cwd <- getwd()
setwdNew(basename(output$getColumn("SC Cluster Report")))
on.exit(setwd(cwd), add = TRUE)
cmDir <- input$getFullPaths("CountMatrix")
if (file.exists(file.path(cmDir, param$geneCountModel))){
cmDir <- file.path(cmDir, param$geneCountModel)
}
if(!ezIsSpecified(param$cellbender)){
param$cellbender = FALSE
cts <- Read10X(cmDir, gene.column = 1)
} else if(param$cellbender){
cts <- Read10X_h5(file.path(dirname(cmDir), 'cellbender_filtered_seurat.h5'), use.names = FALSE)
}
featInfo <- ezRead.table(paste0(cmDir, "/features.tsv.gz"), header = FALSE, row.names = NULL)
featInfo <- featInfo[,1:3] # in cases where additional column exist, e.g. CellRangerARC output
colnames(featInfo) <- c("gene_id", "gene_name", "type")
featInfo$isMito = grepl( "(?i)^MT-", featInfo$gene_name)
featInfo$isRiboprot = grepl( "(?i)^RPS|^RPL", featInfo$gene_name)
geneAnnoFile <- sub("byTranscript", "byGene", param$ezRef@refAnnotationFile)
if (file.exists(geneAnnoFile)){
geneAnno <- ezRead.table(geneAnnoFile)
if (any(geneAnno$type == "rRNA")){
featInfo$isRibosomal <- geneAnno[featInfo$gene_id, "type"] == "rRNA"
if(any(is.na(featInfo[, "isRibosomal"]))){
featInfo[, "isRibosomal"][which(is.na(featInfo[, "isRibosomal"]))] <- FALSE
}
}
}
## if we have feature barcodes we keep only the expression matrix
if (is.list(cts)){
cts <- cts$`Gene Expression`
featInfo <- featInfo[ featInfo$type == "Gene Expression", ]
}
if(param$cellbender){
rownames(featInfo) <- featInfo$gene_id
matchingIds <- intersect(rownames(cts), rownames(featInfo))
cts <- cts[matchingIds,]
featInfo <- featInfo[matchingIds,]
}
## underscores in genenames will become dashes
rownames(cts) <- rownames(featInfo) <- gsub("_", "-", uniquifyFeatureNames(ID=featInfo$gene_id, names=featInfo$gene_name))
scData <- CreateSeuratObject(counts = cts[rowSums2(cts >0) >0, ])
scData$Condition <- unname(input$getColumn("Condition"))
scData@meta.data$Sample <- input$getNames()
scData[["RNA"]] <- AddMetaData(object = scData[["RNA"]], metadata = featInfo[rownames(scData), ])
scData$cellBarcode <- sub(".*_", "", colnames(scData))
scData <- addCellQcToSeurat(scData, param=param, BPPARAM = BPPARAM, ribosomalGenes = featInfo[rownames(scData), "isRibosomal"])
## use empty drops to test for ambient
if ("UnfilteredCountMatrix" %in% input$colNames) {
rawDir <- input$getFullPaths("UnfilteredCountMatrix")
if (file.exists(file.path(rawDir, param$geneCountModel))) {
rawDir <- file.path(rawDir, param$geneCountModel)
}
} else {
# DEPRECATED; all input datasets should specify the path to the unfiltered count matrix
rawDir <- sub("filtered_", "raw_", cmDir)
}
if (file.exists(rawDir) && rawDir != cmDir){
if(param$cellbender){
rawCts <- Read10X_h5(file.path(dirname(cmDir), 'cellbender_raw_seurat.h5'), use.names = FALSE)
} else {
rawCts <- Read10X(rawDir, gene.column = 1)
}
if (is.list(rawCts)) {
rawCts <- rawCts$`Gene Expression`
rawCts <- rawCts[featInfo$gene_id,]
}
if (("SCDataOrigin" %in% input$colNames) &&
input$getColumn("SCDataOrigin") == 'BDRhapsody') {
rawCts <- rawCts[featInfo$gene_id,]
}
if(param$cellbender){
rawCts <- rawCts[featInfo$gene_id,]
}
if(length(setdiff(rownames(rawCts), featInfo$gene_id)) > 0){
rawCts <- rawCts[featInfo$gene_id,]
}
stopifnot(rownames(rawCts) == featInfo$gene_id)
emptyStats <- emptyDrops(rawCts[!featInfo$isMito & !featInfo$isRiboprot, ],
BPPARAM=BPPARAM, niters=1e5)
scData$negLog10CellPValue <- - log10(emptyStats[colnames(scData), "PValue"])
emptyStats <- emptyDrops(rawCts, BPPARAM=BPPARAM, niters=1e5)
scData$negLog10CellPValue <- pmin(scData$negLog10CellPValue, -log10(emptyStats[colnames(scData), "PValue"]))
scData@meta.data$negLog10CellPValue[is.na(scData$negLog10CellPValue)] <- 0
scData$qc.empty <- FALSE
if(param$maxEmptyDropPValue < 1){
scData$qc.empty[scData$negLog10CellPValue < -log10(param$maxEmptyDropPValue)] <- TRUE
scData$useCell[scData$qc.empty] <- FALSE
}
remove(rawCts)
}
allCellsMeta <- scData@meta.data
scData <- subset(scData, cells=rownames(allCellsMeta)[allCellsMeta$useCell]) # %>% head(n=1000))
## remove low expressed genes
num.cells <- param$cellsFraction * ncol(scData) # if we expect at least one rare subpopulation of cells, we should decrease the percentage of cells
cellsPerGene <- Matrix::rowSums(GetAssayData(scData, layer="counts") >= param$nUMIs)
is.expressed <- cellsPerGene >= num.cells
cellsPerGeneFraction <- data.frame(frac = cellsPerGene/ncol(scData), row.names = rownames(cellsPerGene))
scData <- scData[is.expressed,]
## Add Cell Cycle information to Seurat object as metadata columns
scData <- addCellCycleToSeurat(scData, param$refBuild, BPPARAM)
## Get information on which variables to regress out in scaling/SCT
scData <- seuratStandardSCTPreprocessing(scData, param)
## defaultAssay is now SCT
scData <- seuratStandardWorkflow(scData, param, ident.name="seurat_clusters")
# estimate ambient first
if (ezIsSpecified(param$estimateAmbient) && param$estimateAmbient) {
scData <- addAmbientEstimateToSeurat(scData, rawDir=rawDir, threads = param$cores)
}
# get markers and annotations
anno <- getSeuratMarkersAndAnnotate(scData, param, BPPARAM = BPPARAM)
# save markers
markers <- anno$markers
writexl::write_xlsx(markers, path="posMarkers.xlsx")
## generate template for manual cluster annotation -----
## we only deal with one sample
stopifnot(length(input$getNames()) == 1)
clusterInfos <- ezFrame(Sample=input$getNames(), Cluster=levels(Idents(scData)), ClusterLabel="")
if (!is.null(anno$singler.results)){
clusterInfos$SinglerCellType <- anno$singler.results$singler.results.cluster[clusterInfos$Cluster, "pruned.labels"]
}
nTopMarkers <- 10
topMarkers <- markers %>% group_by(cluster) %>%
slice_max(n = nTopMarkers, order_by = avg_log2FC)
topMarkerString <- sapply(split(topMarkers$gene, topMarkers$cluster), paste, collapse=", ")
clusterInfos[["TopMarkers"]] <- topMarkerString[clusterInfos$Cluster]
clusterInfoFile <- "clusterInfos.xlsx"
writexl::write_xlsx(clusterInfos, path=clusterInfoFile)
makeRmdReport(param=param, output=output, scData=scData, allCellsMeta=allCellsMeta,
cellsPerGeneFraction = cellsPerGeneFraction, enrichRout=anno$enrichRout,
cells.AUC=anno$cells.AUC, singler.results=anno$singler.results, aziResults=anno$aziResults,
pathwayActivity=anno$pathwayActivity, TFActivity=anno$TFActivity,
rmdFile = "ScSeurat.Rmd", reportTitle = paste0(param$name, ": ", input$getNames()))
#remove no longer used objects
rm(scData)
gc()
return("Success")
}
addCellQcToSeurat <- function(scData, param=NULL, BPPARAM=NULL, ribosomalGenes=NULL){
library(scater)
# Cells filtering
scData <- PercentageFeatureSet(scData, "(?i)^MT-", col.name = "percent_mito")
scData <- PercentageFeatureSet(scData, "(?i)^RPS|^RPL", col.name = "percent_riboprot")
if (!is.null(ribosomalGenes)){
scData <- PercentageFeatureSet(scData, features=ribosomalGenes, col.name = "percent_ribosomal")
}
if(grepl("Spatial", param$appName)) {
assay <- "Spatial"
att_nCounts <- "nCount_Spatial"
att_nGenes <- "nFeature_Spatial"
} else {
att_nCounts <- "nCount_RNA"
att_nGenes <- "nFeature_RNA"
assay <- "RNA"
}
if (!ezIsSpecified(param$nreads)) {
scData$qc.lib <- isOutlier(scData@meta.data[,att_nCounts], log = TRUE, nmads = param$nmad, type = "lower")
} else {
scData$qc.lib <- scData@meta.data[,att_nCounts] < param$nreads
}
if (!ezIsSpecified(param$ngenes)) {
scData$qc.nexprs <- isOutlier(scData@meta.data[,att_nGenes], nmads = param$nmad, log = TRUE, type = "lower")
} else {
scData$qc.nexprs <- scData@meta.data[,att_nGenes] < param$ngenes
}
if (!ezIsSpecified(param$perc_mito)) {
scData$qc.mito <- isOutlier(scData@meta.data[,"percent_mito"], nmads = param$nmad, type = "higher")
} else {
scData$qc.mito <- scData@meta.data[,"percent_mito"] > param$perc_mito
}
if (!ezIsSpecified(param$perc_riboprot )) {
scData$qc.riboprot <- isOutlier(scData@meta.data[,"percent_riboprot"], nmads = param$nmad, type = "higher")
} else {
scData$qc.riboprot <- scData@meta.data[,"percent_riboprot"] > as.numeric(param$perc_riboprot)
}
scData$useCell <- !(scData$qc.lib | scData$qc.nexprs | scData$qc.mito | scData$qc.riboprot)
set.seed(38)
doubletsInfo <- scDblFinder(GetAssayData(scData, layer="counts")[ , scData$useCell], returnType = "table", clusters=TRUE, BPPARAM = BPPARAM)
scData$doubletScore <- doubletsInfo[colnames(scData), "score"]
scData$doubletClass <- doubletsInfo[colnames(scData), "class"]
scData$qc.doublet <- scData$doubletClass %in% "doublet"
if (ezIsSpecified(param$keepDoublets) && param$keepDoublets) {
futile.logger::flog.info("Keeping doublets...")
} else {
scData$useCell <- scData$useCell & scData$doubletClass %in% "singlet"
}
return(scData)
}
querySignificantClusterAnnotationEnrichR <- function(genesPerCluster, dbs, overlapGeneCutOff = 3, adjPvalueCutOff = 0.001, reportTopN = 5) {
enrichRout <- list()
for (cluster in unique(names(genesPerCluster))) {
enriched <- enrichr(as.character(genesPerCluster[[cluster]]), dbs)
for (db in names(enriched)) {
enriched_db <- enriched[[db]]
if (nrow(enriched_db) > 0 && colnames(enriched_db)[1] == "Term"){
enriched_db$OverlapGenesN <- sub("/.*", "", enriched_db$Overlap) %>% as.numeric()
enriched_db$Cluster <- cluster
enriched_db <- enriched_db %>%
filter(., Adjusted.P.value < adjPvalueCutOff) %>%
filter(., OverlapGenesN > overlapGeneCutOff) %>%
head(reportTopN)
enrichRout[[cluster]][[db]] <- enriched_db[, c("Term", "Cluster", "Overlap", "OverlapGenesN", "Adjusted.P.value", "Odds.Ratio", "Combined.Score")]
}
}
}
return(enrichRout)
}
computeTFActivityAnalysis <- function(cells, species){
species <- tolower(species)
# Retrieve prior knowledge network.
network <- decoupleR::get_dorothea(organism = species,
levels = c("A", "B", "C"))
# Run weighted means algorithm.
activities <- decoupleR::run_wmean(mat = as.matrix(GetAssayData(cells)),
network = network,
.source = "source",
.targe = "target",
.mor = "mor",
times = 100,
minsize = 5)
return(activities)
}
computePathwayActivityAnalysis <- function(cells, species){
species <- tolower(species)
# Retrieve prior knowledge network.
network <- decoupleR::get_progeny(organism = species)
# Run weighted means algorithm.
activities <- decoupleR::run_wmean(mat = as.matrix(GetAssayData(cells)),
network = network,
.source = "source",
.targe = "target",
.mor = "weight",
times = 100,
minsize = 5)
return(activities)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.