In LTLA/scRNAseq: Collection of Public Single-Cell RNA-Seq Datasets

library(BiocStyle)
knitr::opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE)

Downloading the data

We obtain a single-cell RNA sequencing dataset of human cortical organoids from Bhaduri et al. (2020). Normalized and batch-corrected data for endogenous genes are available from GEO using the accession GSE132672. We download and cache them using the r Biocpkg("BiocFileCache") package.

library(BiocFileCache)
bfc <- BiocFileCache("raw_data", ask = FALSE)
fname <- bfcrpath(bfc, "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE132672&format=file&file=GSE132672%5Fallorganoids%5Fwithnew%5Fmatrix%2Etxt%2Egz")

Reading the values in as a sparse matrix - this may take some time.

library(scuttle)
normalized <- readSparseCounts(fname)
dim(normalized)

We then stick this into a SingleCelLExperiment.

sce <- SingleCellExperiment(list(normalized=normalized))
sce

Grabbing the metadata

We use r Biocpkg("GEOquery") to scrape the relevant sample-level metadata.

library(GEOquery)
out <- getGEO("GSE132672")
meta <- as(phenoData(out[[1]]), "data.frame")
meta <- meta[,c(1L, grep(":ch1$", colnames(meta)))]
colnames(meta) <- sub(":ch1$", "", colnames(meta))
meta <- DataFrame(meta, check.names=FALSE)
meta

Then it's just a simple matter to expand it. Well, not so simple, because these authors couldn't keep their names straight if their lives depended on it.

observed.donor <- sub("_.*", "", colnames(sce))

mappings <- c(
    H1SWeek24="wk24H1S_S2",
    L13234PWeek24="wk2413234P",
    L13234SWeek15="L13234controlweek15",
    L13234SWeek24="wk2413234S",
    Week10P="Week10P_1323_4",
    Week10S="Week10S_1323_4",
    Week5P="Week5P_1323_4",
    Week5S="Week5S_1323_4",
    Week8P="Week8P_1323_4",
    Week8S="Week8S_1323_4"
)

failed <- observed.donor %in% names(mappings)
observed.donor[failed] <- unname(mappings[observed.donor[failed]])
table(observed.donor)

m <- match(observed.donor, meta$title)
stopifnot(all(!is.na(m)))

coldata <- meta[m,,drop=FALSE]
rownames(coldata) <- colnames(sce)
colData(sce) <- coldata

Further notes

There is, in fact, another version of this dataset, available at https://cells.ucsc.edu/organoidreportcard. Briefly put: this was hell. The column names of the count matrix simply do not match with the cell names in the metadata table. Attempts to reconstruct the cell names failed to yield an unambiguous mapping, and so this was abandoned. The code used is reported below as a warning to future generations.

library(BiocFileCache)
bfc <- BiocFileCache("raw_data", ask = FALSE)
fname <- bfcrpath(bfc, "https://cells.ucsc.edu/organoidreportcard/primary10X/primarymatrix_nonorm.txt.gz")

library(scuttle)
counts <- readSparseCounts(fname)
sce <- SingleCellExperiment(list(counts=counts))

meta.path <- bfcrpath(bfc, "https://cells.ucsc.edu/organoidreportcard/primary10X/meta.tsv")
meta <- read.delim(meta.path)
meta <- DataFrame(meta)
table(meta$Area)

ids <- colnames(sce)
ids <- sub("Central", "Central cortex", ids)
ids <- sub("Frontal", "Frontal cortex", ids)
ids <- sub("Occipital", "Occipital cortex", ids)
ids <- sub("Hippocampus", "hippocampus", ids)
ids <- sub("Motor", "motor", ids)
ids <- sub("motordeeper", "motor", ids)
ids <- sub("PFCdeeper", "PFC", ids)
ids <- sub("LPFC", "PFC", ids)
ids <- sub("parietaldeeper", "parietal", ids)
ids <- sub("Parietal", "parietal", ids)
ids <- sub("(S|s)omato", "somatosensory", ids)
ids <- sub("Temporal", "temporal", ids)

converted <- sub(".* ([A-Z]{2}[0-9]{2})","", ids)
table(converted)

idx <- sub(".*_", "", meta$Cell)

cell <- sub("([ACGT]+)_[^_]+$", "\\1", meta$Cell)
cell <- sub("^[^_]+_([ACTG]+)", "\\1", cell)
ids2 <- paste0(cell, " ", meta$Individual, meta$Area)

# Really poor matches here, but whatever. 
m <- match(ids, ids2)
table(converted, is.na(m))

Saving to file

We polish the dataset to optimize for space:

library(scRNAseq)
sce <- polishDataset(sce)
sce

We now save everything to disk in preparation for upload:

meta <- list(
title="Cell stress in cortical organoids impairs molecular subtype specification",
description="Cortical organoids are self-organizing three-dimensional cultures that model features of the developing human cerebral cortex. However, the fidelity of organoid models remains unclear. Here we analyse the transcriptomes of individual primary human cortical cells from different developmental periods and cortical areas. We find that cortical development is characterized by progenitor maturation trajectories, the emergence of diverse cell subtypes and areal specification of newborn neurons. By contrast, organoids contain broad cell classes, but do not recapitulate distinct cellular subtype identities and appropriate progenitor maturation. Although the molecular signatures of cortical areas emerge in organoid neurons, they are not spatially segregated. Organoids also ectopically activate cellular stress pathways, which impairs cell-type specification. However, organoid stress and subtype defects are alleviated by transplantation into the mouse cortex. Together, these datasets and analytical tools provide a framework for evaluating and improving the accuracy of cortical organoids as models of human brain development.",
taxonomy_id="9606",
genome="GRCh38",
sources=list(
list(provider="GEO", id="GSE132672"),
list(provider="PubMed", id="31996853")
),
maintainer_name="Aaron Lun",
maintainer_email="infinite.monkeys.with.keyboards@gmail.com"
)

saveDataset(sce, "2023-12-21_output", meta)