In drisso/scRNAseq: Collection of Public Single-Cell RNA-Seq Datasets

library(BiocStyle)
knitr::opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE)

Downloading the data

We obtain a single-cell RNA sequencing dataset of the hashed PBMCs and cell lines from @stoeckius2018hashing. Counts for endogenous genes, ADTs and HTOs are available from the Gene Expression Omnibus using the accession number GGSE108313. We download and cache them using the r Biocpkg("BiocFileCache") package.

library(BiocFileCache)
bfc <- BiocFileCache("raw_data", ask = FALSE)

pbmc.rna <- bfcrpath(bfc, file.path("https://ftp.ncbi.nlm.nih.gov/geo/samples",
    "GSM2895nnn/GSM2895282/suppl/GSM2895282_Hashtag-RNA.umi.txt.gz"))
pbmc.hto <- bfcrpath(bfc, file.path("https://ftp.ncbi.nlm.nih.gov/geo/samples",
    "GSM2895nnn/GSM2895283/suppl/GSM2895283_Hashtag-HTO-count.csv.gz"))
pbmc.adt1 <- bfcrpath(bfc, file.path("https://ftp.ncbi.nlm.nih.gov/geo/samples",
    "GSM2895nnn/GSM2895284/suppl/GSM2895284_Hashtag-ADT1-count.csv.gz"))
pbmc.adt2 <- bfcrpath(bfc, file.path("https://ftp.ncbi.nlm.nih.gov/geo/samples",
    "GSM2895nnn/GSM2895284/suppl/GSM2895284_Hashtag-ADT2-count.csv.gz"))

cell.rna <- bfcrpath(bfc, file.path("https://ftp.ncbi.nlm.nih.gov/geo/samples",
    "GSM3501nnn/GSM3501446/suppl/GSM3501446_MixCellLines-RNA.umi.txt.gz"))
cell.hto <- bfcrpath(bfc, file.path("https://ftp.ncbi.nlm.nih.gov/geo/samples",
    "GSM3501nnn/GSM3501447/suppl/GSM3501447_MixCellLines-HTO-counts.csv.gz"))

We read the RNA counts into memory as sparse matrices, while the HTO and ADT counts are read in as dense matrices owing to, well, their density.

library(scuttle)
pbmc.rna.mat <- readSparseCounts(pbmc.rna, row.names=1)
dim(pbmc.rna.mat)
pbmc.hto.mat <- as.matrix(read.csv(pbmc.hto, row.names=1))
dim(pbmc.hto.mat)
pbmc.adt1.mat <- as.matrix(read.csv(pbmc.adt1, row.names=1))
dim(pbmc.adt1.mat)
pbmc.adt2.mat <- as.matrix(read.csv(pbmc.adt2, row.names=1))
dim(pbmc.adt2.mat)

cell.rna.mat <- readSparseCounts(cell.rna, row.names=1)
dim(cell.rna.mat)
cell.hto.mat <- t(as.matrix(read.csv(cell.hto, row.names=1)))
dim(cell.hto.mat)

Sanity checks

Somehow, none of the matrices have the same set of columns! Incredible. This will need to be filtered out before use.

length(intersect(colnames(pbmc.hto.mat), colnames(pbmc.adt1.mat)))
length(intersect(colnames(pbmc.hto.mat), colnames(pbmc.adt2.mat)))
length(intersect(colnames(pbmc.hto.mat), colnames(pbmc.rna.mat)))
length(intersect(colnames(cell.hto.mat), colnames(cell.rna.mat)))

Even more fun is the fact that the PBMC dataset contains both human and mouse transcripts. Let's try to figure out which is which based on capitalization and the fact that the cell line data is human-only. Even this is not straightforward because the cell line data does not have the full set of human features!

probably.human <- !grepl("[a-z]+", rownames(pbmc.rna.mat)) |
    grepl("^hsa-", rownames(pbmc.rna.mat)) |
    rownames(pbmc.rna.mat) %in% rownames(cell.rna.mat)
summary(probably.human)
head(rownames(pbmc.rna.mat)[probably.human])
tail(rownames(pbmc.rna.mat)[probably.human])

head(rownames(pbmc.rna.mat)[!probably.human])
tail(rownames(pbmc.rna.mat)[!probably.human])

Well, whatever.

pbmc.human.mat <- pbmc.rna.mat[probably.human,]
pbmc.mouse.mat <- pbmc.rna.mat[!probably.human,]

Saving to file

We now save all of the relevant components to file for upload to r Biocpkg("ExperimentHub").

path <- file.path("scRNAseq", "stoeckius-hashing", "2.4.0")
dir.create(path, showWarnings=FALSE, recursive=TRUE)

saveRDS(pbmc.human.mat, file=file.path(path, "counts-pbmc-human.rds"))
saveRDS(pbmc.mouse.mat, file=file.path(path, "counts-pbmc-mouse.rds"))
saveRDS(pbmc.adt1.mat, file=file.path(path, "counts-pbmc-adt1.rds"))
saveRDS(pbmc.adt2.mat, file=file.path(path, "counts-pbmc-adt2.rds"))
saveRDS(pbmc.hto.mat, file=file.path(path, "counts-pbmc-hto.rds"))

saveRDS(cell.rna.mat, file=file.path(path, "counts-mixed-rna.rds"))
saveRDS(cell.hto.mat, file=file.path(path, "counts-mixed-hto.rds"))