library(BiocStyle) knitr::opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE)
We obtain a single-cell RNA sequencing dataset of human cortical organoids from @bhaduri2020cell.
Counts for endogenous genes are available from GEO using the accession GSE132672.
We download and cache them using the r Biocpkg("BiocFileCache")
package.
library(BiocFileCache) bfc <- BiocFileCache("raw_data", ask = FALSE) fname <- bfcrpath(bfc, "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE132672&format=file&file=GSE132672%5Fallorganoids%5Fwithnew%5Fmatrix%2Etxt%2Egz")
Reading the counts in as a sparse matrix - this may take some time.
library(scuttle) counts <- readSparseCounts(fname) sce <- SingleCellExperiment(list(counts=counts)) sce
We use r Biocpkg("GEOquery")
to scrape the relevant sample-level metadata.
library(GEOquery) out <- getGEO("GSE132672") meta <- as(phenoData(out[[1]]), "data.frame") meta <- meta[,c(1L, grep(":ch1$", colnames(meta)))] colnames(meta) <- sub(":ch1$", "", colnames(meta)) meta <- DataFrame(meta, check.names=FALSE) meta
Then it's just a simple matter to expand it.
observed.donor <- sub("_.*", "", colnames(sce)) # Well, not so simple, because these authors couldn't # keep their names straight if their lives depended on it. mappings <- c( H1SWeek24="wk24H1S_S2", L13234PWeek24="wk2413234P", L13234SWeek15="L13234controlweek15", L13234SWeek24="wk2413234S", Week10P="Week10P_1323_4", Week10S="Week10S_1323_4", Week5P="Week5P_1323_4", Week5S="Week5S_1323_4", Week8P="Week8P_1323_4", Week8S="Week8S_1323_4" ) failed <- observed.donor %in% names(mappings) observed.donor[failed] <- unname(mappings[observed.donor[failed]]) table(observed.donor) m <- match(observed.donor, meta$title) stopifnot(all(!is.na(m))) coldata <- meta[m,,drop=FALSE] rownames(coldata) <- colnames(sce) colData(sce) <- coldata
There is, in fact, another version of this dataset, available at https://cells.ucsc.edu/organoidreportcard. Briefly put: this was hell. The column names of the count matrix simply do not match with the cell names in the metadata table. Attempts to reconstruct the cell names failed to yield an unambiguous mapping, and so this was abandoned. The code used is reported below as a warning to future generations.
library(BiocFileCache) bfc <- BiocFileCache("raw_data", ask = FALSE) fname <- bfcrpath(bfc, "https://cells.ucsc.edu/organoidreportcard/primary10X/primarymatrix_nonorm.txt.gz") library(scuttle) counts <- readSparseCounts(fname) sce <- SingleCellExperiment(list(counts=counts)) meta.path <- bfcrpath(bfc, "https://cells.ucsc.edu/organoidreportcard/primary10X/meta.tsv") meta <- read.delim(meta.path) meta <- DataFrame(meta) table(meta$Area) ids <- colnames(sce) ids <- sub("Central", "Central cortex", ids) ids <- sub("Frontal", "Frontal cortex", ids) ids <- sub("Occipital", "Occipital cortex", ids) ids <- sub("Hippocampus", "hippocampus", ids) ids <- sub("Motor", "motor", ids) ids <- sub("motordeeper", "motor", ids) ids <- sub("PFCdeeper", "PFC", ids) ids <- sub("LPFC", "PFC", ids) ids <- sub("parietaldeeper", "parietal", ids) ids <- sub("Parietal", "parietal", ids) ids <- sub("(S|s)omato", "somatosensory", ids) ids <- sub("Temporal", "temporal", ids) converted <- sub(".* ([A-Z]{2}[0-9]{2})","", ids) table(converted) idx <- sub(".*_", "", meta$Cell) cell <- sub("([ACGT]+)_[^_]+$", "\\1", meta$Cell) cell <- sub("^[^_]+_([ACTG]+)", "\\1", cell) ids2 <- paste0(cell, " ", meta$Individual, meta$Area) # Really poor matches here, but whatever. m <- match(ids, ids2) table(converted, is.na(m))
We now save all of the relevant components to file for upload to r Biocpkg("ExperimentHub")
.
path <- file.path("scRNAseq", "bhaduri-organoid", "2.6.0") dir.create(path, showWarnings=FALSE, recursive=TRUE) saveRDS(assay(sce), file=file.path(path, "counts.rds")) saveRDS(colData(sce), file=file.path(path, "coldata.rds"))
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.