In LTLA/scRNAseq: Collection of Public Single-Cell RNA-Seq Datasets

# BiocStyle supplies the Biocpkg() helper used in the prose of this document.
library(BiocStyle)
# Chunk defaults for the whole document: do not continue past errors, and keep
# messages and warnings out of the rendered output.
knitr::opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE)

Downloading the data

We obtain a single-cell RNA sequencing dataset of the mouse brain from Zeisel et al. (2015). Counts for endogenous genes, mitochondrial genes, repeats and spike-in transcripts are available from http://linnarssonlab.org/cortex. We download and cache them using the `r Biocpkg("BiocFileCache")` package.

library(BiocFileCache)

# File cache rooted at "raw_data"; all four tables live under the same base URL.
bfc <- BiocFileCache("raw_data", ask = FALSE)
base.url <- "https://storage.googleapis.com/linnarsson-lab-www-blobs/blobs/cortex"

# Each resource is handled as a pair: its remote URL, then the local path of
# the cached copy obtained through bfcrpath().

# Endogenous genes.
mRNA.url <- paste0(base.url, "/expression_mRNA_17-Aug-2014.txt")
mRNA.path <- bfcrpath(bfc, mRNA.url)

# Repeats.
rep.url <- paste0(base.url, "/expression_rep_17-Aug-2014.txt")
rep.path <- bfcrpath(bfc, rep.url)

# Mitochondrial genes.
mito.url <- paste0(base.url, "/expression_mito_17-Aug-2014.txt")
mito.path <- bfcrpath(bfc, mito.url)

# Spike-in transcripts.
spike.url <- paste0(base.url, "/expression_spikes_17-Aug-2014.txt")
spike.path <- bfcrpath(bfc, spike.url)

Loading the counts

We define a simple utility function for loading data in from each file. Each file contains some metadata, so we create a SingleCellExperiment object to accommodate both the counts and the metadata.

library(SingleCellExperiment)

# Reads one expression table into a SingleCellExperiment.
#
# Expected layout of 'infile' (tab-delimited, no header row):
#   - 10 rows of per-cell metadata: an empty first column, the field name in
#     the second column, then one value per cell;
#   - an 11th line that is skipped (presumably a separator - TODO confirm);
#   - one row per feature: the feature name, an unused per-gene annotation
#     column, then one count per cell.
#
# Arguments:
#   infile: path to the file to read.
#
# Returns a SingleCellExperiment with a sparse "counts" assay, column names
# taken from the 'cell_id' metadata field, and the remaining metadata fields
# as the column data.
#
# Stops if the 'cell_id' field is absent or contains duplicates.
readFormat <- function(infile) {
    # First column is empty.
    metadata <- read.delim(infile, stringsAsFactors=FALSE, header=FALSE, nrows=10)[,-1]

    # The field names become row names; drop=FALSE keeps the table shape
    # even if the file describes a single cell.
    rownames(metadata) <- metadata[,1]
    metadata <- metadata[,-1,drop=FALSE]

    # Transposing yields one row per cell and one column per field.
    metadata <- DataFrame(t(metadata), check.names=FALSE)

    # Cleaning up aspects of coercion to a DataFrame: the listed fields are
    # converted to numeric, everything else to character.
    to.coerce <- c("group #", "total mRNA mol", "well", "sex", "age", "diameter")
    for (x in colnames(metadata)) {
        FUN <- if (x %in% to.coerce) as.numeric else as.character
        metadata[[x]] <- FUN(metadata[[x]])
    }
    rownames(metadata) <- NULL

    # First column after row names is some unknown per-gene annotation, let's just ditch it.
    counts <- read.delim(infile, stringsAsFactors=FALSE, header=FALSE, row.names=1, skip=11)
    counts <- counts[,-1,drop=FALSE]
    counts <- as.matrix(counts)

    # Adding the column names. Without the explicit check, a missing 'cell_id'
    # field would silently leave the matrix without column names.
    stopifnot("cell_id" %in% colnames(metadata))
    colnames(counts) <- metadata$cell_id
    stopifnot(anyDuplicated(colnames(counts)) == 0L)
    metadata$cell_id <- NULL

    SingleCellExperiment(list(counts=as(counts, "SVT_SparseMatrix")), colData=metadata)
}

Using this function, we read in the counts for the endogenous genes, repeats, ERCC spike-in transcripts and mitochondrial genes.

# Load each cached table with readFormat(). Naming the object on its own line
# prints its summary in the rendered document.

# Endogenous genes.
endo.data <- readFormat(mRNA.path)
endo.data
# Repeats.
rep.data <- readFormat(rep.path)
rep.data
# Spike-in transcripts.
spike.data <- readFormat(spike.path)
spike.data
# Mitochondrial genes.
mito.data <- readFormat(mito.path)
mito.data

We need to rearrange the columns for the mitochondrial data, as the order is not consistent with the other files.

# Position of each endogenous-table cell within the mitochondrial table.
m <- match(colnames(endo.data), colnames(mito.data))

# Report the number of unmatched cells before using the indices, so that the
# diagnostic is still produced when the check below fails.
summary(is.na(m))

# Every cell must be present in the mitochondrial table; an NA index would not
# give a valid reordering of its columns.
stopifnot(!anyNA(m))
mito.data <- mito.data[,m]

Checking that the column data is the same for all objects:

# The per-cell annotations of every table must be identical to those of the
# endogenous table; the conditions are evaluated in order and the first
# mismatch stops the script.
stopifnot(
    identical(colData(endo.data), colData(spike.data)),
    identical(colData(endo.data), colData(rep.data)),
    identical(colData(endo.data), colData(mito.data))
)

Assembling the `SingleCellExperiment`

We combine all components into a single SingleCellExperiment object. First we start with the endogenous and mitochondrial genes, which together form the main experiment:

# Stack the endogenous rows on top of the mitochondrial rows.
sce <- rbind(endo.data, mito.data)

# Label each row with the table it came from, in the same order as the rbind().
rowData(sce)$featureType <- rep(
    c("endogenous", "mito"),
    times = c(nrow(endo.data), nrow(mito.data))
)

# Name the main experiment.
mainExpName(sce) <- "gene"

Then we add the repeats:

# Store the repeat counts as an alternative experiment named "repeat".
altExp(sce, "repeat") <- rep.data

And finally, the ERCC spike-ins with their concentrations:

# ERCC molecule counts for a volume of 9 nL per cell at a dilution of 1:20000.
spike.conc <- scRNAseq::countErccMolecules(volume = 9, dilution = 20000)

# Every spike-in in the count table needs an entry in the reference table;
# otherwise the row lookup by name below could not return a valid row for it.
stopifnot(all(rownames(spike.data) %in% rownames(spike.conc)))

# Reorder the reference table to match the count table, then attach it as
# per-row annotation and store the result as the "ERCC" alternative experiment.
spike.conc <- spike.conc[rownames(spike.data), ]
rowData(spike.data) <- cbind(rowData(spike.data), spike.conc)
altExp(sce, "ERCC") <- spike.data

Saving to file

We apply some polish to optimize the disk space:

library(scRNAseq)
# Tidy the assembled object before saving; the accompanying text describes
# this step as optimizing disk space. The polished object is then printed.
sce <- polishDataset(sce)
sce

We now save all of the relevant components of sce to file for upload.

# Today's date, recorded below as the version of each downloaded URL source.
curdate <- as.character(Sys.Date())

# Descriptive metadata handed to saveDataset() together with the object.
meta <- list(
    title="Brain structure. Cell types in the mouse cortex and hippocampus revealed by single-cell RNA-seq",
    description="The mammalian cerebral cortex supports cognitive functions such as sensorimotor integration, memory, and social behaviors. Normal brain function relies on a diverse set of differentiated cell types, including neurons, glia, and vasculature. Here, we have used large-scale single-cell RNA sequencing (RNA-seq) to classify cells in the mouse somatosensory cortex and hippocampal CA1 region. We found 47 molecularly distinct subclasses, comprising all known major cell types in the cortex. We identified numerous marker genes, which allowed alignment with known cell types, morphology, and location. We found a layer I interneuron expressing Pax6 and a distinct postmitotic oligodendrocyte subclass marked by Itpr2. Across the diversity of cortical cell types, transcription factors formed a complex, layered regulatory code, suggesting a mechanism for the maintenance of adult cell type identity.

Maintainer note: ERCC molecule counts are computed using a volume of 9 nL per cell at a dilution of 1:20000.",
    taxonomy_id="10090",
    genome="GRCm38",
    # Provenance: the four downloaded tables plus the publication.
    sources=list(
        list(provider="URL", id=mRNA.url, version=curdate),
        list(provider="URL", id=rep.url, version=curdate),
        list(provider="URL", id=spike.url, version=curdate),
        list(provider="URL", id=mito.url, version=curdate),
        list(provider="PubMed", id="25700174")
    ),
    maintainer_name="Aaron Lun",
    maintainer_email="infinite.monkeys.with.keyboards@gmail.com"
)

# NOTE(review): the output directory name carries a fixed date, whereas the
# source versions above use the current date - confirm this is intended.
saveDataset(sce, "2023-12-14_output", meta)