library(BiocStyle)
knitr::opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE)

Downloading the count data

We obtain a single-cell RNA sequencing dataset of 416B cells and trophoblasts from Lun et al. (2017). Counts for endogenous genes and spike-in transcripts are available from ArrayExpress under the accession number E-MTAB-5522. We set up a cache based on the BiocFileCache package so that the files only need to be downloaded once:

library(BiocFileCache)
bfc <- BiocFileCache("raw_data", ask = FALSE)
base <- "https://www.ebi.ac.uk/biostudies/files/E-MTAB-5522/"
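
For reference, files retrieved through this cache are only downloaded on first request; a repeated bfcrpath() call with the same URL simply returns the existing local path. A minimal sketch, using the SDRF file that is also fetched below:

# Illustration only: the second call re-uses the cached copy
# instead of downloading the file again.
path.a <- bfcrpath(bfc, paste0(base, "E-MTAB-5522.sdrf.txt"))
path.b <- bfcrpath(bfc, paste0(base, "E-MTAB-5522.sdrf.txt"))
identical(path.a, path.b) # TRUE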

We set up some common metadata:

meta <- list(
    title="Assessing the reliability of spike-in normalization for analyses of single-cell RNA sequencing data [%s only]",
    description="By profiling the transcriptomes of individual cells, single-cell RNA sequencing provides unparalleled resolution to study cellular heterogeneity. However, this comes at the cost of high technical noise, including cell-specific biases in capture efficiency and library generation. One strategy for removing these biases is to add a constant amount of spike-in RNA to each cell and to scale the observed expression values so that the coverage of spike-in transcripts is constant across cells. This approach has previously been criticized as its accuracy depends on the precise addition of spike-in RNA to each sample. Here, we perform mixture experiments using two different sets of spike-in RNA to quantify the variance in the amount of spike-in RNA added to each well in a plate-based protocol. We also obtain an upper bound on the variance due to differences in behavior between the two spike-in sets. We demonstrate that both factors are small contributors to the total technical variance and have only minor effects on downstream analyses, such as detection of highly variable genes and clustering. Our results suggest that scaling normalization using spike-in transcripts is reliable enough for routine use in single-cell RNA sequencing data analyses.

Maintainer note: this dataset contains only the %s from this study.",
    taxonomy_id="10090",
    genome="GRCm38",
    sources=list(
        list(provider="ArrayExpress", id="E-MTAB-5522"),
        list(provider="PubMed", id="29030468")
    ),
    maintainer_name="Aaron Lun",
    maintainer_email="infinite.monkeys.with.keyboards@gmail.com"
)
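
The %s placeholders in the title and description are filled in later with sprintf() for each subset of the data; for example, the title of the 416B subset would resolve as follows (shown purely for illustration):

sprintf(meta$title, "416B cells")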

And we create a directory for the outputs generated later:

output.dir <- "2023-12-18_output"
unlink(output.dir, recursive=TRUE)
dir.create(output.dir)

Reading in cell annotations

We read in the cell annotations from the SDRF file:

lun.sdrf <- bfcrpath(bfc, paste0(base, "E-MTAB-5522.sdrf.txt"))
coldata <- read.delim(lun.sdrf, check.names=FALSE, stringsAsFactors=FALSE)

library(S4Vectors)
coldata <- as(coldata, "DataFrame")
colnames(coldata)

We keep only the experimentally relevant metadata, discarding columns that are duplicated or that contain only a single level.

keep <- grep("Characteristics|Factor", colnames(coldata))
coldata <- coldata[,c(1, keep)] # keeping the cell IDs.

new.colnames <- sub(".*\\[(.*)\\]", "\\1", colnames(coldata))
u <- !duplicated(new.colnames)
coldata <- coldata[,u]
colnames(coldata) <- new.colnames[u]

has.multi.levels <- vapply(coldata, FUN=function(x) length(unique(x))>1L, TRUE)
coldata <- coldata[,has.multi.levels]
head(coldata)
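
To make the renaming step concrete: the regular expression keeps only the text inside the square brackets of each SDRF column name. A toy example with an invented column name:

# "Characteristics[genotype]" is an invented example, not necessarily
# a column present in this SDRF file.
sub(".*\\[(.*)\\]", "\\1", "Characteristics[genotype]") # returns "genotype"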

Processing the 416B data

We load the counts into memory for the 416B cells.

path1 <- bfcrpath(bfc, paste0(base, "counts_Calero_20160113.tsv"))
plate1.416b <- read.delim(path1, header=TRUE, row.names=1, check.names=FALSE)
path2 <- bfcrpath(bfc, paste0(base, "counts_Calero_20160325.tsv"))
plate2.416b <- read.delim(path2, header=TRUE, row.names=1, check.names=FALSE)
stopifnot(identical(rownames(plate1.416b), rownames(plate2.416b)))

We extract the gene lengths and combine the count matrices from the two plates:

gene.lengths <- plate1.416b$Length 
rowdata.416b <- DataFrame(Length=gene.lengths)
stopifnot(identical(gene.lengths, plate2.416b$Length))

plate1.416b <- as.matrix(plate1.416b[,-1])
plate2.416b <- as.matrix(plate2.416b[,-1])
counts.416b <- cbind(plate1.416b, plate2.416b)
dim(counts.416b)

We extract the relevant column data.

m <- match(colnames(counts.416b), coldata[,1])
coldata.416b <- coldata[m,]
stopifnot(identical(colnames(counts.416b), coldata.416b[,1]))

And we convert these into a SingleCellExperiment, moving the spike-in transcripts into alternative experiments and attaching the ERCC concentrations to the row data of the ERCC alternative experiment.

library(scRNAseq)

createSce <- function(counts, rowdata, coldata) {
    sce <- SingleCellExperiment(list(counts=counts), rowData=rowdata, colData=coldata)
    stopifnot(identical(sce$`Source Name`, colnames(sce)))
    sce$`Source Name` <- NULL # redundant with the column names

    spike.type <- rep("endogenous", nrow(sce))
    spike.type[grep("ERCC", rownames(sce))] <- "ERCC"
    spike.type[grep("SIRV", rownames(sce))] <- "SIRV"
    sce <- splitAltExps(sce, spike.type, ref="endogenous")

    spike.exp <- altExp(sce, "ERCC")
    spikedata <- countErccMolecules(volume = 100, dilution = 3e6)
    spikedata <- spikedata[rownames(spike.exp), ]
    rowData(spike.exp) <- cbind(rowData(spike.exp), spikedata)
    altExp(sce, "ERCC") <- spike.exp

    # Also polishing to save some disk space.
    polishDataset(sce)
}

sce.416b <- createSce(counts.416b, rowdata.416b, coldata.416b)
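
As a quick sanity check (illustrative only, not required by the pipeline), we can confirm that the spike-ins ended up in alternative experiments and that the ERCC row data now carries the concentration columns added above:

altExpNames(sce.416b) # should contain "ERCC" and "SIRV"
head(rowData(altExp(sce.416b, "ERCC")))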

And we save it to disk:

copy <- meta
copy$title <- sprintf(copy$title, "416B cells")
copy$description <- sprintf(copy$description, "416B cells")
saveDataset(sce.416b, file.path(output.dir, "416b"), copy)
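
If desired, the saved artifact can be reloaded to check that it round-trips. This sketch assumes that saveDataset() writes the standard alabaster on-disk representation, in which case alabaster.base::readObject() should recover an equivalent SingleCellExperiment:

# Assumption: the directory written by saveDataset() is readable by readObject().
reloaded <- alabaster.base::readObject(file.path(output.dir, "416b"))
stopifnot(identical(dim(reloaded), dim(sce.416b)))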

Processing the trophoblast data

We repeat this process for the trophoblasts.

path1 <- bfcrpath(bfc, paste0(base, "counts_Liora_20160906.tsv"))
plate1.tropho <- read.delim(path1, header=TRUE, row.names=1, check.names=FALSE)
path2 <- bfcrpath(bfc, paste0(base, "counts_Liora_20170201.tsv"))
plate2.tropho <- read.delim(path2, header=TRUE, row.names=1, check.names=FALSE)
stopifnot(identical(rownames(plate1.tropho), rownames(plate2.tropho)))

We extract the gene lengths and combine the count matrices from the two plates:

gene.lengths <- plate1.tropho$Length 
rowdata.tropho <- DataFrame(Length=gene.lengths)
stopifnot(identical(gene.lengths, plate2.tropho$Length))

plate1.tropho <- as.matrix(plate1.tropho[,-1])
plate2.tropho <- as.matrix(plate2.tropho[,-1])
counts.tropho <- cbind(plate1.tropho, plate2.tropho)
dim(counts.tropho)

We extract the relevant column data.

m <- match(colnames(counts.tropho), coldata[,1])
coldata.tropho <- coldata[m,]
stopifnot(identical(colnames(counts.tropho), coldata.tropho[,1]))

We assemble this into a SingleCellExperiment:

sce.tropho <- createSce(counts.tropho, rowdata.tropho, coldata.tropho)
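
Both subsets were quantified against the same gene annotation, so as an illustrative consistency check (not part of the original workflow) the two objects should share the same feature space:

# Assumes matching row order between the two subsets.
identical(rownames(sce.416b), rownames(sce.tropho))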

And we save it to file for upload.

copy <- meta
copy$title <- sprintf(copy$title, "trophoblasts")
copy$description <- sprintf(copy$description, "trophoblasts")
saveDataset(sce.tropho, file.path(output.dir, "tropho"), copy)

Session information

sessionInfo()

