In drisso/scRNAseq: Collection of Public Single-Cell RNA-Seq Datasets

library(BiocStyle)
knitr::opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE)

Downloading the count data

We obtain a single-cell RNA sequencing dataset of 416B cells and trophoblasts from @lun2017assessing. Counts for endogenous genes and spike-in transcripts are available from ArrayExpress using the accession number E-MTAB-5522. We download and cache it using the r Biocpkg("BiocFileCache") package.

library(BiocFileCache)
bfc <- BiocFileCache("raw_data", ask = FALSE)
lun.zip <- bfcrpath(bfc, 
    file.path("https://www.ebi.ac.uk/arrayexpress/files",
        "E-MTAB-5522/E-MTAB-5522.processed.1.zip"))
unzip(lun.zip, exdir=tempdir())

Reading in the metadata

We read in the metadata from the SDRF file:

lun.sdrf <- bfcrpath(bfc, 
    file.path("https://www.ebi.ac.uk/arrayexpress/files",
        "E-MTAB-5522/E-MTAB-5522.sdrf.txt"))
coldata <- read.delim(lun.sdrf, check.names=FALSE, stringsAsFactors=FALSE)

library(S4Vectors)
coldata <- as(coldata, "DataFrame")
colnames(coldata)

We keep only the experimentally interesting metadata, discarding columns that are duplicated or only have one level.

keep <- grep("Characteristics|Factor", colnames(coldata))
coldata <- coldata[,c(1, keep)] # keeping the cell IDs.

new.colnames <- sub(".*\\[(.*)\\]", "\\1", colnames(coldata))
u <- !duplicated(new.colnames)
coldata <- coldata[,u]
colnames(coldata) <- new.colnames[u]

has.multi.levels <- vapply(coldata, FUN=function(x) length(unique(x))>1L, TRUE)
coldata <- coldata[,has.multi.levels]
head(coldata)

Processing the 416B data

We load the counts into memory for the 416B cells.

plate1.416b <- read.delim(file.path(tempdir(), "counts_Calero_20160113.tsv"),
    header=TRUE, row.names=1, check.names=FALSE)
plate2.416b <- read.delim(file.path(tempdir(), "counts_Calero_20160325.tsv"),
    header=TRUE, row.names=1, check.names=FALSE)
stopifnot(identical(rownames(plate1.416b), rownames(plate2.416b)))

We extract the gene lengths and combine the matrices together:

gene.lengths <- plate1.416b$Length 
rowdata.416b <- DataFrame(Length=gene.lengths)
stopifnot(identical(gene.lengths, plate2.416b$Length))

plate1.416b <- as.matrix(plate1.416b[,-1])
plate2.416b <- as.matrix(plate2.416b[,-1])
counts.416b <- cbind(plate1.416b, plate2.416b)
dim(counts.416b)

We extract the relevant column data.

m <- match(colnames(counts.416b), coldata[,1])
coldata.416b <- coldata[m,]
stopifnot(identical(colnames(counts.416b), coldata.416b[,1]))

And we save these to file for upload to r Biocpkg("ExperimentHub").

path <- file.path("scRNAseq", "lun-spikein", "2.0.0")
dir.create(path, showWarnings=FALSE, recursive=TRUE)
saveRDS(counts.416b, file=file.path(path, "counts-416b.rds"))
saveRDS(rowdata.416b, file=file.path(path, "rowdata-416b.rds"))
saveRDS(coldata.416b, file=file.path(path, "coldata-416b.rds"))

Processing the trophoblast data

We repeat this process for the trophoblasts.

plate1.tropho <- read.delim(file.path(tempdir(), "counts_Liora_20160906.tsv"),
    header=TRUE, row.names=1, check.names=FALSE)
plate2.tropho <- read.delim(file.path(tempdir(), "counts_Liora_20170201.tsv"),
    header=TRUE, row.names=1, check.names=FALSE)
stopifnot(identical(rownames(plate1.tropho), rownames(plate2.tropho)))

We extract the gene lengths and combine the matrices together:

gene.lengths <- plate1.tropho$Length 
rowdata.tropho <- DataFrame(Length=gene.lengths)
stopifnot(identical(gene.lengths, plate2.tropho$Length))

plate1.tropho <- as.matrix(plate1.tropho[,-1])
plate2.tropho <- as.matrix(plate2.tropho[,-1])
counts.tropho <- cbind(plate1.tropho, plate2.tropho)
dim(counts.tropho)

We extract the relevant column data.

m <- match(colnames(counts.tropho), coldata[,1])
coldata.tropho <- coldata[m,]
stopifnot(identical(colnames(counts.tropho), coldata.tropho[,1]))

And we save these to file for upload to r Biocpkg("ExperimentHub").

path <- file.path("scRNAseq", "lun-spikein", "2.0.0")
dir.create(path, showWarnings=FALSE, recursive=TRUE)
saveRDS(counts.tropho, file=file.path(path, "counts-tropho.rds"))
saveRDS(rowdata.tropho, file=file.path(path, "rowdata-tropho.rds"))
saveRDS(coldata.tropho, file=file.path(path, "coldata-tropho.rds"))