In csoneson/DuoClustering2018: Data, Clustering Results and Visualization Functions From Duò et al (2018)

suppressPackageStartupMessages({
  library(MultiAssayExperiment)
  library(SingleCellExperiment)
  library(scater)
  library(scran)
  library(plyr)
  library(dplyr)
  library(ggplot2)
  library(splatter)
})

Load MultiAssayExperiment object

(maex <- readRDS("../../data/data_raw/GSE60749-GPL13112.rds"))

## Extract the gene-level length-scaled TPMs
cts <- assays(experiments(maex)[["gene"]])[["count_lstpm"]]

## Extract the phenotype data.
phn <- colData(maex)
phn$phenoid <- as.character(interaction(as.data.frame(phn[, c("source_name_ch1",
                                                              "characteristics_ch1.1")])))
## Simplify labels
phn$phenoid <-  plyr::revalue(
  phn$phenoid, 
  c("Dgcr8 knockout mouse embryonic stem cells.culture conditions: serum+LIF" = "Dgcr8 knockout mouse serum+LIF", 
    "v6.5 mouse embryonic stem cells.culture conditions: 2i+LIF" = "v6.5 mouse 2i+LIF",
    "v6.5 mouse embryonic stem cells.culture conditions: serum+LIF" = "v6.5 mouse serum+LIF")
)
table(phn$phenoid)

Extract one subpopulation

stopifnot(all(colnames(cts) == rownames(phn)))

sub.cts <- cts[, phn$phenoid == "v6.5 mouse 2i+LIF"]
dim(sub.cts)

Remove the ERCC spike-ins

is.spike <- grepl("^ERCC", rownames(sub.cts))
sub.cts <- sub.cts[!is.spike, ]

Estimate parameters for simulation

(params <- splatEstimate(sub.cts))

Simulate new counts

sce <- splatSimulate(params = params, batchCells = 500,
                     group.prob = c(0.1, 0.15, 0.5, 0.25), 
                     method = "groups", de.prob = c(0.05, 0.1, 0.2, 0.4),
                     verbose = FALSE)
sce <- normalise(sce, exprs_values = "counts", return_log = TRUE, 
                 return_norm_as_exprs = TRUE) ## generates logcounts(sce)
sce$phenoid <- colData(sce)$Group

Exclude features that are not expressed

keep_features <- rowSums(counts(sce) > 0) > 0
table(keep_features)
sce <- sce[keep_features, ]
dim(sce)

Calculate QC metrics

sce <- calculateQCMetrics(sce)

Quality control using PCA on column data

We create a PCA plot based the quality metrics for each cell, e.g., the total number of reads and the total number of features.

sce <- scater::runPCA(sce, pca_data_input = "coldata")
scater::plotPCA(sce, colour_by = "phenoid")

Filter cells

We remove cells with log-library sizes (or total features) that are more than 3 median absolute deviations (MADs) below the median log-library size (or total features).

colData(sce)$libsize.drop <- isOutlier(sce$total_counts, nmads = 3, type = "lower", log = TRUE)
ggplot(as.data.frame(colData(sce)), aes(x = total_counts)) + 
  geom_histogram(bins = 20, fill = "grey80") + xlab("Total count") + 
  ylab("Number of cells") + 
  geom_vline(xintercept = min(sce$total_counts[!sce$libsize.drop]), 
             color = "red", linetype = "dashed") + 
  theme_bw()

colData(sce)$feature.drop <- isOutlier(sce$total_features, nmads = 3, type = "lower", log = TRUE)
ggplot(as.data.frame(colData(sce)), aes(x = total_features)) + 
  geom_histogram(bins = 20, fill = "grey80") + xlab("Number of detected features") + 
  ylab("Number of cells") + 
  geom_vline(xintercept = min(sce$total_features[!sce$feature.drop]), 
             color = "red", linetype = "dashed") + 
  theme_bw()

table(libsize = sce$libsize.drop, feature = sce$feature.drop)
sce <- sce[, !(sce$libsize.drop | sce$feature.drop)]
dim(sce)

Quality control using highest expressed genes

plotQC(sce, type = "highest-expression", n = 50)

Data normalization

sce <- computeSumFactors(sce, sizes = pmin(ncol(sce), seq(20, 120, 20)), min.mean = 0.1)
summary(sizeFactors(sce))

sce <- normalise(sce, exprs_values = "counts", return_log = TRUE, 
                 return_norm_as_exprs = TRUE)
sce <- normalise(sce, exprs_values = "counts", return_log = FALSE, 
                 return_norm_as_exprs = FALSE)

Plot the proportion of explained variances

expl_vars <- c("phenoid", "log10_total_counts", "log10_total_features", "pct_dropout",
               "pct_counts_top_200_features", "log10_counts_feature_controls",
               "pct_counts_feature_controls")
plotQC(sce, type = "explanatory-variables", variables = expl_vars)

Plot t-SNE representations

set.seed(1234)
sce <- runTSNE(sce, exprs_values = "logcounts", perplexity = 10)
plotTSNE(sce, colour_by = "phenoid")
plotTSNE(sce, colour_by = "total_features", size_by = "total_counts")

Save the normalized and cell filtered dataset

dim(sce)
table(sce$phenoid)
saveRDS(sce, file = "../../data/sce_full/sce_full_SimKumar4easy.rds")

Session info

date()
sessionInfo()

csoneson/DuoClustering2018 documentation built on Dec. 25, 2024, 2:05 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

csoneson/DuoClustering2018
Data, Clustering Results and Visualization Functions From Duò et al (2018)

In csoneson/DuoClustering2018: Data, Clustering Results and Visualization Functions From Duò et al (2018)

Load MultiAssayExperiment object

Extract one subpopulation

Remove the ERCC spike-ins

Estimate parameters for simulation

Simulate new counts

Calculate QC metrics

Quality control using PCA on column data

Filter cells

Quality control using highest expressed genes

Data normalization

Plot the proportion of explained variances

Plot t-SNE representations

Save the normalized and cell filtered dataset

Session info

R Package Documentation

Browse R Packages

We want your feedback!

csoneson/DuoClustering2018 Data, Clustering Results and Visualization Functions From Duò et al (2018)

In csoneson/DuoClustering2018: Data, Clustering Results and Visualization Functions From Duò et al (2018)

Load MultiAssayExperiment object

Extract one subpopulation

Remove the ERCC spike-ins

Estimate parameters for simulation

Simulate new counts

Calculate QC metrics

Quality control using PCA on column data

Filter cells

Quality control using highest expressed genes

Data normalization

Plot the proportion of explained variances

Plot t-SNE representations

Save the normalized and cell filtered dataset

Session info

R Package Documentation

Browse R Packages

We want your feedback!

csoneson/DuoClustering2018
Data, Clustering Results and Visualization Functions From Duò et al (2018)