suppressPackageStartupMessages({ library(MultiAssayExperiment) library(SingleCellExperiment) library(scater) library(scran) library(plyr) library(dplyr) library(ggplot2) library(splatter) })
(maex <- readRDS("../../data/data_raw/GSE60749-GPL13112.rds")) ## Extract the gene-level length-scaled TPMs cts <- assays(experiments(maex)[["gene"]])[["count_lstpm"]] ## Extract the phenotype data. phn <- colData(maex) phn$phenoid <- as.character(interaction(as.data.frame(phn[, c("source_name_ch1", "characteristics_ch1.1")]))) ## Simplify labels phn$phenoid <- plyr::revalue( phn$phenoid, c("Dgcr8 knockout mouse embryonic stem cells.culture conditions: serum+LIF" = "Dgcr8 knockout mouse serum+LIF", "v6.5 mouse embryonic stem cells.culture conditions: 2i+LIF" = "v6.5 mouse 2i+LIF", "v6.5 mouse embryonic stem cells.culture conditions: serum+LIF" = "v6.5 mouse serum+LIF") ) table(phn$phenoid)
stopifnot(all(colnames(cts) == rownames(phn))) sub.cts <- cts[, phn$phenoid == "v6.5 mouse 2i+LIF"] dim(sub.cts)
is.spike <- grepl("^ERCC", rownames(sub.cts)) sub.cts <- sub.cts[!is.spike, ]
(params <- splatEstimate(sub.cts))
sce <- splatSimulate(params = params, batchCells = 500, group.prob = c(0.2, 0.15, 0.4, 0.25), method = "groups", de.prob = c(0.01, 0.05, 0.05, 0.08), verbose = FALSE) sce <- normalise(sce, exprs_values = "counts", return_log = TRUE, return_norm_as_exprs = TRUE) ## generates logcounts(sce) sce$phenoid <- colData(sce)$Group
Exclude features that are not expressed
keep_features <- rowSums(counts(sce) > 0) > 0 table(keep_features) sce <- sce[keep_features, ] dim(sce)
sce <- calculateQCMetrics(sce)
We create a PCA plot based the quality metrics for each cell, e.g., the total number of reads and the total number of features.
sce <- scater::runPCA(sce, pca_data_input = "coldata") scater::plotPCA(sce, colour_by = "phenoid")
We remove cells with log-library sizes (or total features) that are more than 3 median absolute deviations (MADs) below the median log-library size (or total features).
colData(sce)$libsize.drop <- isOutlier(sce$total_counts, nmads = 3, type = "lower", log = TRUE) ggplot(as.data.frame(colData(sce)), aes(x = total_counts)) + geom_histogram(bins = 20, fill = "grey80") + xlab("Total count") + ylab("Number of cells") + geom_vline(xintercept = min(sce$total_counts[!sce$libsize.drop]), color = "red", linetype = "dashed") + theme_bw() colData(sce)$feature.drop <- isOutlier(sce$total_features, nmads = 3, type = "lower", log = TRUE) ggplot(as.data.frame(colData(sce)), aes(x = total_features)) + geom_histogram(bins = 20, fill = "grey80") + xlab("Number of detected features") + ylab("Number of cells") + geom_vline(xintercept = min(sce$total_features[!sce$feature.drop]), color = "red", linetype = "dashed") + theme_bw()
table(libsize = sce$libsize.drop, feature = sce$feature.drop) sce <- sce[, !(sce$libsize.drop | sce$feature.drop)] dim(sce)
plotQC(sce, type = "highest-expression", n = 50)
sce <- computeSumFactors(sce, sizes = pmin(ncol(sce), seq(20, 120, 20)), min.mean = 0.1) summary(sizeFactors(sce))
sce <- normalise(sce, exprs_values = "counts", return_log = TRUE, return_norm_as_exprs = TRUE) sce <- normalise(sce, exprs_values = "counts", return_log = FALSE, return_norm_as_exprs = FALSE)
expl_vars <- c("phenoid", "log10_total_counts", "log10_total_features", "pct_dropout", "pct_counts_top_200_features", "log10_counts_feature_controls", "pct_counts_feature_controls") plotQC(sce, type = "explanatory-variables", variables = expl_vars)
set.seed(1234) sce <- runTSNE(sce, exprs_values = "logcounts", perplexity = 10) plotTSNE(sce, colour_by = "phenoid") plotTSNE(sce, colour_by = "total_features", size_by = "total_counts")
dim(sce) table(sce$phenoid) saveRDS(sce, file = "../../data/sce_full/sce_full_SimKumar4hard.rds")
date() sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.