In drisso/scRNAseq: Collection of Public Single-Cell RNA-Seq Datasets

library(BiocStyle)
knitr::opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE)

Downloading the data

We obtain a single-cell RNA sequencing dataset of T cells from multiple patients from @bacher2020low. Counts for endogenous genes are available from the Gene Expression Omnibus using the accession number GSE162086. We download and cache them using the r Biocpkg("BiocFileCache") package.

library(BiocFileCache)
bfc <- BiocFileCache("raw_data", ask = FALSE)

# Manually transcribed from GEO:
samples <- matrix(ncol=4, byrow=TRUE, 
    c(
        "GSM4932900","J09835","WHO 1","non-hospitalized",
        "GSM4932901","J09836","WHO 1","non-hospitalized",
        "GSM4932902","J10535","WHO 4","mild-moderate",
        "GSM4932903","J10624","WHO 2","non-hospitalized",
        "GSM4932904","J10625","WHO 2","non-hospitalized",
        "GSM4932905","J10886","WHO 5","mild-moderate",
        "GSM4932906","J10887","WHO 6","severe",
        "GSM4932907","J10888","WHO 4","mild-moderate",
        "GSM4932908","J11689","WHO 2","non-hospitalized",
        "GSM4932909","J14204","WHO 7","severe",
        "GSM4932910","J14205","WHO 5","mild-moderate",
        "GSM4932911","J15890","WHO 2","non-hospitalized",
        "GSM4932912","J15891","healthy","healthy",
        "GSM4932913","J15892","healthy","healthy",
        "GSM4932914","J15899","healthy","healthy",
        "GSM4932915","J15900","healthy","healthy",
        "GSM4932916","J15893","WHO 5","mild-moderate",
        "GSM4932917","J21854","WHO 7","severe",
        "GSM4932918","J21855","healthy","healthy",
        "GSM4932919","J21856","healthy","healthy"
    )
)

counts <- vector("list", nrow(samples))
for (x in seq_along(counts)) {
    url <- sprintf("https://www.ncbi.nlm.nih.gov/geo/download/?acc=%s&format=file&file=%s%%5F%s%%5Fcounts%%2Etsv%%2Egz", 
        samples[x,1], samples[x,1], samples[x,2])
    counts[[x]] <- bfcrpath(bfc, url) 
}

Reading all the counts in as sparse matrices:

library(scuttle)
library(BiocParallel)
counts <- bplapply(counts, readSparseCounts, quote="\"", BPPARAM=MulticoreParam())

# Sanity check:
gene.ids <- lapply(counts, rownames)
stopifnot(length(unique(gene.ids))==1L)

# Checking that the names are unique after combining.
all.cn <- unlist(lapply(counts, colnames))
stopifnot(anyDuplicated(all.cn)==0)

combined <- do.call(cbind, counts)

Creating a SingleCellExperiment object:

library(SingleCellExperiment)
sce <- SingleCellExperiment(list(counts=combined))
true.patient <- rep(samples[,2], vapply(counts, ncol, 0L))

Attaching some metadata:

meta.path <- bfcrpath(bfc, "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE162086&format=file&file=GSE162086%5Fseurat%5Fmetadata%2Etsv%2Egz")
meta <- read.delim(meta.path, check.names=FALSE)

ID <- paste0(sub("-1", "", meta$barcode), "-", meta$sample)
ref <- gsub("\"", "", colnames(combined))
stopifnot(all(ID %in% ref))

m <- match(ref, ID)
meta <- meta[m,]
rownames(meta) <- ref
meta$retained <- !is.na(m)

# Filling in some of the patient-level attributes for missing cells.
# This is done by extrapolating from cells in the same patient. 
meta$sample <- sub(".*-", "", ref)
meta$barcode <- sub("-.*", "", ref)

for (field in c("batch", "seq_run", "diagnosis")) {
    tab <- table(meta$sample, meta[[field]])
    best <- colnames(tab)[max.col(tab)]
    best <- as(best, typeof(meta[[field]]))
    names(best) <- rownames(tab)
    meta[[field]] <- best[meta$sample]
}

m <- match(meta$sample,samples[,2])
stopifnot(all(!is.na(m)))
meta$who_class <- samples[m,3]
meta$severity <- samples[m,4]

colData(sce) <- DataFrame(meta)
sce

Saving to file

We now save all of the relevant components to file for upload to r Biocpkg("ExperimentHub").

path <- file.path("scRNAseq", "bacher-tcell", "2.6.0")
dir.create(path, showWarnings=FALSE, recursive=TRUE)
saveRDS(assay(sce), file=file.path(path, "counts.rds"))
saveRDS(colData(sce), file=file.path(path, "coldata.rds"))