knitr::opts_chunk$set(error = FALSE, warning = FALSE, message = TRUE, tidy = FALSE, cache = FALSE, dpi = 100) library(BiocStyle)
scRUtils
provides various utilities for visualising and functional analysis of RNA-seq data,
particularly single-cell dataset. It evolved from a collection of helper functions that were
used in our in-house scRNA-seq processing workflow.
The documentation of this package is divided into 5 sections:
This vignette (#5) will show how the demo datasets in this package were created. The codes were not evaluated to minimise the time needed to build the vignette.
phases_assignments
from cyclone
resultsA mock reference (ref
) with 1000 cells and the expression of 2000 genes was created using
mockSCE()
from the r Biocpkg("scuttle")
package, and its gene expression data was used to
train a classifier for cell cycle phase using sandbag()
from the r Biocpkg("scran")
package.
A second mock object (phases
) with 200 cells was created and the cells were classified into
their cell cycle phases using cyclone()
from the r Biocpkg("scran")
package. The results were
saved as a RData file.
library(scuttle) library(scran) set.seed(10010) ref <- mockSCE(ncells = 1000) # Constructing a classifier: is.G1 <- which(ref$Cell_Cycle %in% c("G1", "G0")) is.S <- which(ref$Cell_Cycle == "S") is.G2M <- which(ref$Cell_Cycle == "G2M") out <- sandbag(ref, list(G1 = is.G1, S = is.S, G2M = is.G2M)) set.seed(10010) phases <- mockSCE(ncells = 200) # Run cyclone phases_assignments <- cyclone(phases, out) save(phases_assignments, file = "phases_assignments.rda", compress = "xz")
dbl_results
from findDoubletClusters
resultsA SingleCellExperiment
object with 3 clusters of cells and the expression of 200 genes was
created using mockDoubletSCE()
from the r Biocpkg("scDblFinder")
package. Using the same
package, the doublet-ness of each cluster was computed using findDoubletClusters()
. The
results were saved as a RData file.
library(SingleCellExperiment) library(scDblFinder) set.seed(10010) dbl <- mockDoubletSCE(ncells = c(100, 300, 250), ngenes = 200) # Run findDoubletClusters dbl_results <- findDoubletClusters(counts(dbl), dbl$cluster) save(dbl_results, file = "dbl_results.rda", compress = "xz")
SingleCellExperiment
objectA mock cell type reference (ref
) with 20 samples and the expression of 1000 genes was
created using the r Biocpkg("SingleR")
package, which was used in the next section to annotate
cell types. A second mock object (sce
) with 250 cells was created using .mockTestData()
and
.mockRefData()
. The genes were given mocked up symbols.
library(SingleR) library(scater) library(scran) # Generate 1000 random gene symbols set.seed(10010) symbols <- paste0(do.call(paste0, replicate(3, sample(LETTERS, 1000, TRUE), FALSE)), do.call(paste0, replicate(2, sample(1:9, 1000, TRUE), FALSE))) # table(table(symbols) == 1) # check uniqueness # Create mock reference for SingleR set.seed(10010) ref <- .mockRefData() colnames(ref) <- paste0("SAMPLE_", seq_len(ncol(ref))) rownames(ref) <- symbols ref <- logNormCounts(ref) ref$label <- paste("Type", as.numeric(factor(ref$label))) # Create mock SingleCellExperiment set.seed(10010) sce <- as(.mockTestData(.mockRefData(), ncells = 250), "SingleCellExperiment") colnames(sce) <- paste0("CELL_", seq_len(ncol(sce))) rownames(sce) <- symbols
Then, the sce
object is processed as follow:
addPerCellQC()
and addPerFeatureQC
.logNormCounts()
.getTopHVGs()
based on the variance modelling statistics
from modelGeneVar()
.SingleR()
with mock reference (ref
).findMarkers()
.The resulting object was saved as a RData file.
sce <- addPerCellQC(sce) sce <- addPerFeatureQC(sce) sce <- logNormCounts(sce) dec <- modelGeneVar(sce) hvg <- getTopHVGs(dec, n = 200, var.threshold = 0) metadata(sce)[['modelGeneVar']] <- dec metadata(sce)[['HVG']] <- hvg set.seed(10010) sce <- runPCA(sce) set.seed(10010) sce <- runTSNE(sce) set.seed(10010) sce <- runUMAP(sce) # Run SingleR pred <- SingleR(sce, ref, labels = ref$label) sce$CellType <- pred$pruned.labels metadata(sce)[['SingleR']] <- pred # Run findMarkers out1 <- findMarkers(sce, groups = sce$label, pval.type = "any") metadata(sce)[['findMarkers1']] <- out1 out2 <- findMarkers(sce, groups = sce$label, pval.type = "all") metadata(sce)[['findMarkers2']] <- out2 out3 <- findMarkers(sce, groups = sce$label, pval.type = "any", min.prop = 0.5) metadata(sce)[['findMarkers3']] <- out3 out4 <- findMarkers(sce, groups = sce$label, test.type = "wilcox") metadata(sce)[['findMarkers4']] <- out4 save(sce, file = "sce.rda", compress = "xz")
pasilla
RNA-seq datasetThe r Biocpkg("pasilla")
package containing the Drosophila melanogaster RNA-seq with RNAi
knockdown experiment of Pasilla was used as the demo dataset for generating results following
differential expression (DE) analysis by r Biocpkg("edgeR")
and r Biocpkg("DESeq2")
.
library(pasilla) # Get data from the pasilla package pasCts <- system.file("extdata", "pasilla_gene_counts.tsv", package = "pasilla", mustWork = TRUE) pasAnno <- system.file("extdata", "pasilla_sample_annotation.csv", package = "pasilla", mustWork = TRUE) cts <- as.matrix(read.csv(pasCts, sep = "\t", row.names = "gene_id")) coldata <- read.csv(pasAnno, row.names = 1)
The pasilla
RNA-seq data was processed to harmonise the samples in the provided count and
annotation data files. Additional gene annotation were obtained from BioMart using the
r Biocpkg("biomaRt")
package, genes that have annotations were retained for DE analysis.
library(biomaRt) # Prepare sample annotations coldata <- coldata[,c("condition","type")] coldata$condition <- factor(coldata$condition) coldata$condition <- relevel(coldata$condition, ref = "untreated") coldata$type <- factor(coldata$type) levels(coldata$type) <- sub("-", "_", levels(coldata$type)) rownames(coldata) <- sub("fb", "", rownames(coldata)) cts <- cts[, rownames(coldata)] # Use BioMart for gene annotations ensembl <- useEnsembl(biomart = "genes", dataset = "dmelanogaster_gene_ensembl") geneAnno <- getBM(attributes = c("chromosome_name","start_position", "end_position","ensembl_gene_id", "external_gene_name","strand","gene_biotype"), filters = "ensembl_gene_id", values = rownames(cts), mart = ensembl) # Keep genes with annotation cts <- cts[geneAnno$ensembl_gene_id,] all(rownames(cts) == geneAnno$ensembl_gene_id)
TopTags
object res_edger
The r Biocpkg("edgeR")
package was used to perform DE analysis on the pasilla
dataset,
and the results were saved as a RData file.
library(edgeR) # Object construction d <- DGEList(counts = cts, genes = geneAnno, group = coldata$condition) d <- calcNormFactors(d) # Fit the NB GLMs with QL methods design <- model.matrix(~ coldata$type + coldata$condition) d <- estimateDisp(d, design) fit <- glmQLFit(d, design) results <- glmQLFTest(fit) res_edger <- topTags(results, n = nrow(results)) save(res_edger, file = "res_edger.rda", compress = "xz")
DESeqResults
object res_deseq2
The r Biocpkg("DESeq2")
package was used to perform DE analysis on the pasilla
dataset,
and the results were saved as a RData file.
library(DESeq2) # Object construction dds <- DESeqDataSetFromMatrix(countData = cts, colData = coldata, design = ~ type + condition) dds <- DESeq(dds) res_deseq2 <- results(dds) res_deseq2@listData <- c(as.list(geneAnno), res_deseq2@listData) res_deseq2@elementMetadata <- rbind(DataFrame(type = "annotation", description = colnames(geneAnno)), res_deseq2@elementMetadata) save(res_deseq2, file = "res_deseq2.rda", compress = "xz")
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.