library(BiocStyle) knitr::opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE)
We obtain a single-cell RNA sequencing dataset of the mouse brain from Jessa et al. (2019).
Counts for endogenous genes are available from GEO using the accession number GSE133531.
We download and cache them using the r Biocpkg("BiocFileCache")
package.
library(BiocFileCache) bfc <- BiocFileCache("raw_data", ask = FALSE) raw.path <- bfcrpath(bfc, "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE133531&format=file") tmp <- tempfile() untar(raw.path, exdir=tmp) list.files(tmp)
Creating a SingleCellExperiment
from the combined entries.
library(DropletUtils) nms <- c("GSM3934450_ET_CT_12", "GSM3934451_ET_CT_15", "GSM3934452_PT_CT_0", "GSM3934453_C57BL6-P3-cortex", "GSM3934454_C57BL6-P6-cortex", "GSM3934455_ET_PO_12", "GSM3934456_ET_PO_15", "GSM3934457_PT_PO_0", "GSM3934458_C57BL6-P3-pons", "GSM3934459_C57BL6-P6-pons") library(BiocParallel) all.prefixes <- file.path(tmp, paste0(nms, "_")) names(all.prefixes) <- nms sce <- read10xCounts(all.prefixes, type="prefix", BPPARAM=MulticoreParam()) sce
We don't really need a separate ID
column in the row data, as this is redundant with the row names.
stopifnot(identical(rownames(sce), rowData(sce)$ID)) rowData(sce)$ID <- NULL
We rename the sample column to avoid conflicts with later metadata fields. We also strip out the GEM group from the barcode to make life easier.
sce$Barcode <- sub("-1$", "", sce$Barcode) sce$Sample_ID <- sce$Sample sce$Sample <- NULL
Now we load up the per-sample metadata.
all.meta <- list() for (i in seq_along(nms)) { all.meta[[i]] <- read.delim(file.path(tmp, paste0(nms[i], ".metadata.tsv.gz"))) all.meta[[i]]$.Sample <- nms[i] } all.meta <- do.call(rbind, all.meta) head(all.meta)
Matching to the columns of sce
.
obs <- DataFrame(Sample_ID=all.meta$.Sample, Barcode=all.meta$Cell) stopifnot(all(obs %in% colData(sce))) m <- match(colData(sce), obs) all.meta <- DataFrame(all.meta[m,], row.names=NULL) all.meta$retained <- !is.na(m) summary(all.meta$retained)
Auto-filling sample name information, even for the cells that weren't used.
rename <- split(all.meta$Sample, all.meta$.Sample) rename <- lapply(rename, unique) stopifnot(all(lengths(rename)==1L)) all.meta$Sample <- unlist(rename)[sce$Sample_ID] table(all.meta$Sample, useNA="always")
Moving the dimensionality reduction results to a better home. Note that these are per-sample coordinates, so it doesn't make sense to plot them across samples.
pc.names <- c("PC1", "PC2") t.names <- c("tSNE_1", "tSNE_2") reducedDims(sce) <- list(Sample_PCA=as.matrix(all.meta[,pc.names]), Sample_TSNE=as.matrix(all.meta[,t.names])) keep <- colnames(all.meta) %in% c(pc.names, t.names) all.meta <- all.meta[,!keep,drop=FALSE]
Similarly, the clustering is done for each sample separately, so we rename it to make that clear.
colnames(all.meta)[colnames(all.meta)=="Cluster"] <- "Sample_Cluster" colnames(all.meta)[colnames(all.meta)=="Cluster_number"] <- "Sample_Cluster_number"
Removing the redundant sample and barcode columns before adding it to the SCE:
all.meta <- all.meta[,setdiff(colnames(all.meta), c(".Sample", "Cell"))] colData(sce) <- cbind(colData(sce), all.meta) colData(sce)
Adding the joint forebrain analysis results, consisting of various coordinates and clustering results.
fb.path <- bfcrpath(bfc, "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE133531&format=file&file=GSE133531%5FForebrain%5Fjoin%2E2D%2Etsv%2Egz") fb.meta <- DataFrame(read.delim(fb.path)) fb.meta obs <- DataFrame(Sample=fb.meta$Sample, Barcode=sub(".*_", "", fb.meta$Cell)) ref <- colData(sce)[,c("Sample", "Barcode")] stopifnot(all(obs %in% ref)) m <- match(ref, obs) summary(is.na(m)) fb.meta <- fb.meta[m,] u.names <- c("UMAP1", "UMAP2") reducedDim(sce, "Forebrain_PC") <- as.matrix(fb.meta[,pc.names]) reducedDim(sce, "Forebrain_tSNE") <- as.matrix(fb.meta[,t.names]) reducedDim(sce, "Forebrain_UMAP") <- as.matrix(fb.meta[,u.names]) sce$Forebrain_Joint_cluster_number <- fb.meta$Joint_cluster_number table(sce$Forebrain_Joint_cluster_number) sce$Forebrain_Joint_cluster <- fb.meta$Joint_cluster table(sce$Forebrain_Joint_cluster)
Repeating the dose with the Pons data.
pons.path <- bfcrpath(bfc, "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE133531&format=file&file=GSE133531%5FPons%5Fjoin%2E2D%2Etsv%2Egz") pons.meta <- DataFrame(read.delim(pons.path)) pons.meta obs <- DataFrame(Sample=pons.meta$Sample, Barcode=sub(".*_", "", pons.meta$Cell)) ref <- colData(sce)[,c("Sample", "Barcode")] stopifnot(all(obs %in% ref)) m <- match(ref, obs) summary(is.na(m)) pons.meta <- pons.meta[m,] u.names <- c("UMAP1", "UMAP2") reducedDim(sce, "Pons_PC") <- as.matrix(pons.meta[,pc.names]) reducedDim(sce, "Pons_tSNE") <- as.matrix(pons.meta[,t.names]) reducedDim(sce, "Pons_UMAP") <- as.matrix(pons.meta[,u.names]) sce$Pons_Joint_cluster_number <- pons.meta$Joint_cluster_number table(sce$Pons_Joint_cluster_number) sce$Pons_Joint_cluster <- pons.meta$Joint_cluster table(sce$Pons_Joint_cluster)
Polishing up the SCE:
library(scRNAseq) sce <- polishDataset(sce) sce
We now save it to file in preparation for upload:
meta <- list( title="Stalled developmental programs at the root of pediatric brain tumors", description="Childhood brain tumors have suspected prenatal origins. To identify vulnerable developmental states, we generated a single-cell transcriptome atlas of >65,000 cells from embryonal pons and forebrain, two major tumor locations. We derived signatures for 191 distinct cell populations and defined the regional cellular diversity and differentiation dynamics. Projection of bulk tumor transcriptomes onto this dataset shows that WNT medulloblastomas match the rhombic lip-derived mossy fiber neuronal lineage and embryonal tumors with multilayered rosettes fully recapitulate a neuronal lineage, while group 2a/b atypical teratoid/rhabdoid tumors may originate outside the neuroectoderm. Importantly, single-cell tumor profiles reveal highly defined cell hierarchies that mirror transcriptional programs of the corresponding normal lineages. Our findings identify impaired differentiation of specific neural progenitors as a common mechanism underlying these pediatric cancers and provide a rational framework for future modeling and therapeutic interventions. Maintainer note: this dataset consists of cells from all samples in this study. Clustering annotations and reduced dimensions that have names prefixed with \"Sample_\" were presumably computed within each sample and should not be used in inter-sample comparisons.", taxonomy_id="10090", genome="GRCm38", sources=list( list(provider="GEO", id="GSE133531"), list(provider="PubMed", id="31768071") ), maintainer_name="Aaron Lun", maintainer_email="infinite.monkeys.with.keyboards@gmail.com" ) saveDataset(sce, "2023-12-22_output", meta)
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.