In LTLA/scRNAseq: Collection of Public Single-Cell RNA-Seq Datasets

library(BiocStyle)
knitr::opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE)

Downloading the data

We obtain a single-cell RNA sequencing dataset of the mouse haematopoietic stem cells from Nestorowa et al. (2016). Counts for endogenous genes and spike-in transcripts are available from the Gene Expression Omnibus using the accession number GSE81682. We download and cache them using the r Biocpkg("BiocFileCache") package.

library(BiocFileCache)
bfc <- BiocFileCache("raw_data", ask = FALSE)
url <- "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE81nnn/GSE81682/suppl/GSE81682_HTSeq_counts.txt.gz"
count.file <- bfcrpath(bfc, url)

We read this into memory as a sparse matrix.

library(scuttle)
counts <- readSparseCounts(count.file)
dim(counts)

We hack out the metrics:

is.metric <- startsWith(rownames(counts), "__")
metrics <- DataFrame(t(as.matrix(counts[is.metric,])))
counts <- counts[!is.metric,]
metrics

Downloading the per-cell metadata

We download the per-cell metadata from the website.

base.url <- "http://blood.stemcells.cam.ac.uk/data"
celltype.file <- bfcrpath(bfc, paste0(base.url, "/all_cell_types.txt"))
celltype <- read.delim(celltype.file)
celltype <- celltype[match(colnames(counts), rownames(celltype)),]
stopifnot(identical(colnames(counts), rownames(celltype)))
head(celltype)

With some detective work (mostly inspecting crossprod(celltype)), we try to compact this into a more user-friendly representation by identifying the hierarchy of terms.

broad <- rep(NA_character_, nrow(celltype))
for (x in c("LTHSC", "LMPP", "MPP", "CMP", "MEP", "GMP")) {
    chosen <- celltype[,paste0(x, "_broad")] == 1
    stopifnot(all(is.na(broad[chosen])))
    broad[chosen] <- x
}

broad.mpp.subtype <- rep(NA_character_, nrow(celltype))
for (x in c("MPP1", "MPP2", "MPP3", "STHSC")) {
    chosen <- celltype[,paste0(x, "_broad")] == 1
    stopifnot(all(is.na(broad.mpp.subtype[chosen])))
    broad.mpp.subtype[chosen] <- x
}

fine <- rep(NA_character_, nrow(celltype))
for (x in c("LTHSC", "LMPP", "MPP", "CMP", "MEP", "GMP")) {
    chosen <- celltype[,x] == 1
    stopifnot(all(is.na(fine[chosen])))
    fine[chosen] <- x
}

fine.mpp.subtype <- rep(NA_character_, nrow(celltype))
for (x in c("MPP1", "MPP2", "MPP3", "STHSC")) {
    chosen <- celltype[,x] == 1
    stopifnot(all(is.na(fine.mpp.subtype[chosen])))
    fine.mpp.subtype[chosen] <- x
}

This gives us the following:

coldata <- DataFrame(
    row.names=rownames(celltype),
    gate=sub("_[0-9]+$", "", rownames(celltype)),
    broad=broad,
    broad.mpp=broad.mpp.subtype,
    fine=fine,
    fine.mpp=fine.mpp.subtype,
    ESLAM=celltype[,"ESLAM"] == 1,
    HSC1=celltype[,"HSC1"] == 1,
    projected=celltype[,"Projected"] == 1
)

We slap in the metrics:

coldata$metrics <- metrics
coldata

Obtaining some more stuff

We also obtain a matrix of flow cytometry intensities:

flowcyt.file <- bfcrpath(bfc, paste0(base.url, "/coordinates_gene_counts_flow_cytometry.txt.gz"))
flowcyt <- as.matrix(read.delim(flowcyt.file, row.names=1))
flowcyt <- flowcyt[match(colnames(counts), rownames(flowcyt)),]
rownames(flowcyt) <- colnames(counts)

is.diffusion <- grep("^DC[0-9]+$", colnames(flowcyt))
colnames(flowcyt)[is.diffusion]
is.facs <- setdiff(grep("^ENSMUSG[0-9]+$", colnames(flowcyt), invert=TRUE), is.diffusion)
colnames(flowcyt)[is.facs]

Saving to file

We assemble this into a SingleCellExperiment object, splitting off the spike-ins:

sce <- SingleCellExperiment(list(counts=counts), colData=coldata)
status <- ifelse(grepl("^ERCC-[0-9]+", rownames(sce)), "ERCC", "endogenous")
sce <- splitAltExps(sce, status)

We add some more bits and pieces:

reducedDim(sce, "diffusion") <- flowcyt[,is.diffusion]
altExp(sce, "FACS") <- SummarizedExperiment(list(intensities=t(flowcyt[,is.facs])))

We run some polish to save disk space:

library(scRNAseq)
sce <- polishDataset(sce)
sce

And then we save it to disk.

meta <- list(
    title="A single-cell resolution map of mouse hematopoietic stem and progenitor cell differentiation",
    description="Maintenance of the blood system requires balanced cell fate decisions by hematopoietic stem and progenitor cells (HSPCs). Because cell fate choices are executed at the individual cell level, new single-cell profiling technologies offer exciting possibilities for mapping the dynamic molecular changes underlying HSPC differentiation. Here, we have used single-cell RNA sequencing to profile more than 1600 single HSPCs, and deep sequencing has enabled detection of an average of 6558 protein-coding genes per cell. Index sorting, in combination with broad sorting gates, allowed us to retrospectively assign cells to 12 commonly sorted HSPC phenotypes while also capturing intermediate cells typically excluded by conventional gating. We further show that independently generated single-cell data sets can be projected onto the single-cell resolution expression map to directly compare data from multiple groups and to build and refine new hypotheses. Reconstruction of differentiation trajectories reveals dynamic expression changes associated with early lymphoid, erythroid, and granulocyte-macrophage differentiation. The latter two trajectories were characterized by common upregulation of cell cycle and oxidative phosphorylation transcriptional programs. By using external spike-in controls, we estimate absolute messenger RNA (mRNA) levels per cell, showing for the first time that despite a general reduction in total mRNA, a subset of genes shows higher expression levels in immature stem cells consistent with active maintenance of the stem-cell state. Finally, we report the development of an intuitive Web interface as a new community resource to permit visualization of gene expression in HSPCs at single-cell resolution for any gene of choice.

Maintainer note: FACS data is stored as an alternative experiment and the diffusion map components are stored as reduced dimensions. The cell type assignments are stored in the column data, transformed from the identity matrix provided by the authors into a more user-friendly form. The gating strategy for FACS is inferred from the cell names: Prog (Lin- Sca1- c-Kit+), HSPC (Lin- Sca1+ c-Kit+) and LT-HSC (Lin- c-Kit+ Sca1+ CD34- Flk2-).",
    taxonomy_id="10090",
    genome="GRCm38",
    sources=list(
        list(provider="GEO", id="GSE81682"),
        list(provider="PubMed", id="27365425"),
        list(provider="URL", id=base.url, version=as.character(Sys.Date()))
    ),
    maintainer_name="Aaron Lun",
    maintainer_email="infinite.monkeys.with.keyboards@gmail.com"
)

saveDataset(sce, "2023-12-19_output", meta)