library(BiocStyle) knitr::opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE)
We obtain a single-cell RNA sequencing dataset of the hashed PBMCs and cell lines from @stoeckius2018hashing.
Counts for endogenous genes, ADTs and HTOs are available from the Gene Expression Omnibus
using the accession number GGSE108313.
We download and cache them using the r Biocpkg("BiocFileCache")
package.
library(BiocFileCache) bfc <- BiocFileCache("raw_data", ask = FALSE) pbmc.rna <- bfcrpath(bfc, file.path("https://ftp.ncbi.nlm.nih.gov/geo/samples", "GSM2895nnn/GSM2895282/suppl/GSM2895282_Hashtag-RNA.umi.txt.gz")) pbmc.hto <- bfcrpath(bfc, file.path("https://ftp.ncbi.nlm.nih.gov/geo/samples", "GSM2895nnn/GSM2895283/suppl/GSM2895283_Hashtag-HTO-count.csv.gz")) pbmc.adt1 <- bfcrpath(bfc, file.path("https://ftp.ncbi.nlm.nih.gov/geo/samples", "GSM2895nnn/GSM2895284/suppl/GSM2895284_Hashtag-ADT1-count.csv.gz")) pbmc.adt2 <- bfcrpath(bfc, file.path("https://ftp.ncbi.nlm.nih.gov/geo/samples", "GSM2895nnn/GSM2895284/suppl/GSM2895284_Hashtag-ADT2-count.csv.gz")) cell.rna <- bfcrpath(bfc, file.path("https://ftp.ncbi.nlm.nih.gov/geo/samples", "GSM3501nnn/GSM3501446/suppl/GSM3501446_MixCellLines-RNA.umi.txt.gz")) cell.hto <- bfcrpath(bfc, file.path("https://ftp.ncbi.nlm.nih.gov/geo/samples", "GSM3501nnn/GSM3501447/suppl/GSM3501447_MixCellLines-HTO-counts.csv.gz"))
We read the RNA counts into memory as sparse matrices, while the HTO and ADT counts are read in as dense matrices owing to, well, their density.
library(scuttle) pbmc.rna.mat <- readSparseCounts(pbmc.rna, row.names=1) dim(pbmc.rna.mat) pbmc.hto.mat <- as.matrix(read.csv(pbmc.hto, row.names=1)) dim(pbmc.hto.mat) pbmc.adt1.mat <- as.matrix(read.csv(pbmc.adt1, row.names=1)) dim(pbmc.adt1.mat) pbmc.adt2.mat <- as.matrix(read.csv(pbmc.adt2, row.names=1)) dim(pbmc.adt2.mat) cell.rna.mat <- readSparseCounts(cell.rna, row.names=1) dim(cell.rna.mat) cell.hto.mat <- t(as.matrix(read.csv(cell.hto, row.names=1))) dim(cell.hto.mat)
Somehow, none of the matrices have the same set of columns! Incredible. This will need to be filtered out before use.
length(intersect(colnames(pbmc.hto.mat), colnames(pbmc.adt1.mat))) length(intersect(colnames(pbmc.hto.mat), colnames(pbmc.adt2.mat))) length(intersect(colnames(pbmc.hto.mat), colnames(pbmc.rna.mat))) length(intersect(colnames(cell.hto.mat), colnames(cell.rna.mat)))
Even more fun is the fact that the PBMC dataset contains both human and mouse transcripts. Let's try to figure out which is which based on capitalization and the fact that the cell line data is human-only. Even this is not straightforward because the cell line data does not have the full set of human features!
probably.human <- !grepl("[a-z]+", rownames(pbmc.rna.mat)) | grepl("^hsa-", rownames(pbmc.rna.mat)) | rownames(pbmc.rna.mat) %in% rownames(cell.rna.mat) summary(probably.human) head(rownames(pbmc.rna.mat)[probably.human]) tail(rownames(pbmc.rna.mat)[probably.human]) head(rownames(pbmc.rna.mat)[!probably.human]) tail(rownames(pbmc.rna.mat)[!probably.human])
Well, whatever.
pbmc.human.mat <- pbmc.rna.mat[probably.human,] pbmc.mouse.mat <- pbmc.rna.mat[!probably.human,]
We now save all of the relevant components to file for upload to r Biocpkg("ExperimentHub")
.
path <- file.path("scRNAseq", "stoeckius-hashing", "2.4.0") dir.create(path, showWarnings=FALSE, recursive=TRUE) saveRDS(pbmc.human.mat, file=file.path(path, "counts-pbmc-human.rds")) saveRDS(pbmc.mouse.mat, file=file.path(path, "counts-pbmc-mouse.rds")) saveRDS(pbmc.adt1.mat, file=file.path(path, "counts-pbmc-adt1.rds")) saveRDS(pbmc.adt2.mat, file=file.path(path, "counts-pbmc-adt2.rds")) saveRDS(pbmc.hto.mat, file=file.path(path, "counts-pbmc-hto.rds")) saveRDS(cell.rna.mat, file=file.path(path, "counts-mixed-rna.rds")) saveRDS(cell.hto.mat, file=file.path(path, "counts-mixed-hto.rds"))
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.