In LTLA/scRNAseq: Collection of Public Single-Cell RNA-Seq Datasets

library(BiocStyle)
knitr::opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE)

Download the data

We obtain a single-nucleus RNA sequencing dataset of mouse brains from Hu et al. (2017). Counts for endogenous genes and antibody-derived tags are available from the Gene Expression Omnibus using the accession number GSE106678.

library(BiocFileCache)
bfc <- BiocFileCache("raw_data", ask = FALSE)
tarred <- bfcrpath(bfc, "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE106678&format=file")

temp <- tempfile()
dir.create(temp)
untar(tarred, exdir=temp)
list.files(temp)

We also create a metadata template:

meta <- list(
    title="Dissecting cell-type composition and activity-dependent transcriptional state in mammalian brains by massively parallel single-nucleus RNA-Seq [%s only]",
    description="Massively parallel single-cell RNA sequencing can precisely resolve cellular diversity in a high-throughput manner at low cost, but unbiased isolation of intact single cells from complex tissues, such as adult mammalian brains, is challenging. Here, we integrate sucrose-gradient assisted purification of nuclei with droplet microfluidics to develop a highly scalable single-nucleus RNA-Seq approach (sNucDrop-Seq), which is free of enzymatic dissociation and nuclei sorting. By profiling ~18,000 nuclei isolated from cortical tissues of adult mice, we demonstrate that sNucDrop-Seq not only accurately reveals neuronal and non-neuronal subtype composition with high sensitivity, but also enables in-depth analysis of transient transcriptional states driven by neuronal activity, at single-cell resolution, in vivo.

Maintainer note: this dataset contains only the %s cells from the study.",
    taxonomy_id="10090",
    genome="GRCm38",
    sources=list(
        list(provider="GEO", id="GSE106678"),
        list(provider="PubMed", id="29220646")
    ),
    maintainer_name="Aaron Lun",
    maintainer_email="infinite.monkeys.with.keyboards@gmail.com"
)

We clear out any existing output directory:

output.dir <- "2023-12-20_output"
unlink(output.dir, recursive=TRUE)
dir.create(output.dir)

Processing the data

We load in each of the files.

library(scuttle)
counts <- list()
for (x in list.files(temp, full.names=TRUE)) {
    prefix <- sub("^[^_]*_", "", x)
    prefix <- sub("_.*", "", prefix)
    counts[[prefix]] <- readSparseCounts(x)
}
do.call(rbind, lapply(counts, dim))

For some unknown reason, each matrix has its own set of features! Crazy. At least the intersection is of a reasonable size.

length(Reduce(intersect, lapply(counts, rownames)))

I can't be sure that the missing features in a given matrix have all-zero expression values. So, I will make the executive decision of taking the intersection across matrices and cbind'ing them together into a single matrix for each system (i.e., cortex or 3T3). If you don't like that, take it up with the authors.

Saving the 3T3 data

Combining the 3T3 matrices for cell and nuclei:

counts.3T3 <- counts[c("cell-3T3", "nuclei-3T3")]
common.3T3 <- Reduce(intersect, lapply(counts.3T3, rownames))
combined.3T3 <- do.call(cbind, lapply(counts.3T3, function(x) x[common.3T3,]))
dim(combined.3T3)

Slapping together a SingleCellExperiment:

library(SingleCellExperiment)
sce <- SingleCellExperiment(list(counts=combined.3T3))
sce$protocol <- rep(sub("-3T3$", "", names(counts.3T3)), vapply(counts.3T3, ncol, 0L))
table(sce$protocol)

Adding some polish to optimize for space:

library(scRNAseq)
sce <- polishDataset(sce)
sce

And saving it:

copy <- meta
copy$title <- sprintf(copy$title, "3T3")
copy$description <- sprintf(copy$description, "3T3")
saveDataset(sce, file.path(output.dir, "3T3"), copy)

Saving the cortex data

Defining the sample identities:

samples <- c(
    "nuclei-ctx-1",
    "nuclei-ctx-2",
    "nuclei-ctx-3", 
    "nuclei-ctx-4",
    "nuclei-ctx-5",
    "nuclei-ctx-6",
    "nuclei-ctx-7",
    "nuclei-ctx-8",
    "nuclei-ctx-9",
    "nuclei-ctx-10",
    "nuclei-ctx-11",
    "nuclei-ctx-12",
    "nuclei-ctx-13",
    "nuclei-ctx-saline1", 
    "nuclei-ctx-PTZ1", 
    "nuclei-ctx-saline2",
    "nuclei-ctx-PTZ2" 
)

# Checking that everyone is accounted for:
cortex.samples <- sort(names(counts)[grep("^nuclei-ctx-", names(counts))])
stopifnot(identical(cortex.samples, sort(samples)))

Combining the cortex matrices across samples:

counts.cortex <- counts[samples]
common.cortex <- Reduce(intersect, lapply(counts.cortex, rownames))
combined.cortex <- do.call(cbind, lapply(counts.cortex, function(x) x[common.cortex,]))
dim(combined.cortex)

Slapping together a SingleCellExperiment:

library(SingleCellExperiment)
sce <- SingleCellExperiment(list(counts=combined.cortex))

Adding some more information about each sample.

replicator <- rep(seq_along(samples), vapply(counts.cortex, ncol, 0L))

treatment <- rep(NA_character_, length(samples))
treatment[grep("^nuclei-ctx-saline[0-9]+$", samples)] <- "saline"
treatment[grep("^nuclei-ctx-PTZ[0-9]+$", samples)] <- "PTZ"
sce$treatment <- treatment[replicator]
table(sce$treatment, useNA="always")

animal <- as.integer(sub("[^0-9]+([0-9]+)$", "\\1", samples))
stopifnot(!anyNA(animal))
sce$animal <- animal[replicator]
table(sce$animal)

Polishing it up:

sce <- polishDataset(sce)
sce

And saving it:

copy <- meta
copy$title <- sprintf(copy$title, "cortex")
copy$description <- sprintf(copy$description, "cortex")
saveDataset(sce, file.path(output.dir, "cortex"), copy)