library(BiocStyle) knitr::opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE)
Here, we process a single-cell sequencing dataset of haematopoietic stem cells from Giladi et al. (2018).
This involves both RNA and CRISPR components, so we'll be handling each one separately.
We set up a local r Biocpkg("BiocFileCache")
directory to handle the file caching:
library(BiocFileCache) bfc <- BiocFileCache("raw_data", ask = FALSE)
Counts for endogenous genes are available from the Gene Expression Omnibus using the accession number GSE92575. We download and cache them:
collected <- bfcrpath(bfc, "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE92575&format=file") dir <- tempfile() untar(collected, exdir=dir) all.files <- list.files(dir, full=TRUE) length(all.files)
Loading all of the count matrices into memory.
library(scuttle) library(BiocParallel) out <- bplapply(all.files, readSparseCounts) # Sanity check: stopifnot(length(unique(lapply(out, rownames)))==1) # Checking that the names are unique after combining. all.cn <- unlist(lapply(out, colnames)) stopifnot(anyDuplicated(all.cn)==0) combined <- do.call(cbind, out)
Constructing a SingleCellExperiment
object.
sce <- SingleCellExperiment(list(counts=combined)) sce
Extracting an intelligible gene symbol from the row names:
rowData(sce)$symbol <- sub(".*;", "", rownames(sce)) head(rowData(sce)$symbol)
We download the cell metadata as well:
meta.path <- bfcrpath(bfc, "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE92575&format=file&file=GSE92575%5Fmetadata%2Etxt%2Egz") meta <- read.delim(meta.path, skip=14, check.names=FALSE) str(meta)
We expand this to cover all cells in the SCE.
true.samples <- sub(".*_", "", sub(".txt.gz", "", basename(all.files))) true.samples <- rep(true.samples, vapply(out, ncol, 0L)) ref <- DataFrame(well=colnames(combined), sample=true.samples) obs <- DataFrame(well=meta$well, sample=meta$Amp_batch_ID) stopifnot(all(obs %in% ref)) m <- match(ref, obs) meta <- meta[m,] meta$retained <- !is.na(m) meta$sample <- true.samples stopifnot(all(meta$well == colnames(combined) | !meta$retained)) rownames(meta) <- colnames(combined) meta$well <- NULL # removing redundant columns
And then we attach it to the SCE.
colData(sce) <- DataFrame(meta) colData(sce)
Counts for CRISPR barcodes are also available from the Gene Expression Omnibus
using the accession number GSE113494.
We download and cache them using the r Biocpkg("BiocFileCache")
package.
crisp.path <- bfcrpath(bfc, "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE113494&format=file&file=GSE113494%5Fcrispseq%5Fcount%2Etxt%2Egz") crisp.df <- read.delim(crisp.path, skip=9) head(crisp.df)
For some bizarre reason, this is not actually a count matrix, but rather, a molecule information file. So we convert this into a count matrix in the simplest way - namely, counting the number of unique UMIs for each cell in each barcode.
library(DropletUtils) crisp.mat <- makeCountMatrix(crisp.df$grna, crisp.df$well) dim(crisp.mat) crisp.cd <- DataFrame(crisp.df[,c("well", "amplification.batch")]) crisp.cd <- crisp.cd[match(colnames(crisp.mat), crisp.cd$well),] rownames(crisp.cd) <- crisp.cd$well crisp.cd <- crisp.cd[,2,drop=FALSE] head(crisp.cd) crisp.rd <- DataFrame(crisp.df[,c("grna", "ugi")]) crisp.rd <- crisp.rd[match(rownames(crisp.mat), crisp.rd$grna),] rownames(crisp.rd) <- crisp.rd$grna crisp.rd <- crisp.rd[,2,drop=FALSE] head(crisp.rd)
We also extract the target gene for each barcode, and whether it is a control:
crisp.rd$gene <- sub("_[0-9]+.*", "", rownames(crisp.rd)) crisp.rd$gene <- sub("CTR?L_", "", crisp.rd$gene) crisp.rd$control <- grepl("CTR?L_", rownames(crisp.rd)) head(crisp.rd)
We put this together into a SingleCellExperiment
object:
crisp.sce <- SingleCellExperiment(list(counts=crisp.mat), rowData=crisp.rd, colData=crisp.cd) crisp.sce
Unfortunately the RNA and CRISPR datasets don't have exactly the same set of cells, so we'll save them separately. We still check that the amplification batches are the same to ensure we're dealing with the same cells.
common <- intersect(colnames(sce), colnames(crisp.sce)) mr <- match(common, colnames(sce)) mc <- match(common, colnames(crisp.sce)) stopifnot(identical(sce$Amp_batch_ID[mr], crisp.sce$amplification.batch[mc]))
We perform polishing of the dataset to optimize for space.
library(scRNAseq) sce <- polishDataset(sce) sce crisp.sce <- polishDataset(crisp.sce) crisp.sce
Setting up some metadata:
meta_raw <- list( title="Single-cell characterization of haematopoietic progenitors and their trajectories in homeostasis and perturbed haematopoiesis [%s only]", description="The dynamics of haematopoietic stem cell differentiation and the hierarchy of oligopotent stem cells in the bone marrow remain controversial. Here we dissect haematopoietic progenitor populations at single cell resolution, deriving an unbiased reference model of transcriptional states in normal and perturbed murine bone marrow. We define the signature of the naive haematopoietic stem cell and find a continuum of core progenitor states. Core cell populations mix transcription of pre-myeloid and pre-lymphoid programs, but do not mix erythroid or megakaryocyte programs with other fates. CRISP-seq perturbation analysis confirms our models and reveals that Cebpa regulates entry into all myeloid fates, while Irf8 and PU.1 deficiency block later differentiation towards monocyte or granulocyte fates. Our transcriptional map defines a reference network model for blood progenitors and their differentiation trajectories during normal and perturbed haematopoiesis. Maintainer note: this object only the %s data from this study.", taxonomy_id="10090", genome="MGSCv37", sources=list( list(provider="PubMed", id="29915358") ), maintainer_name="Aaron Lun", maintainer_email="infinite.monkeys.with.keyboards@gmail.com" )
We now save all of the relevant components to file in preparation for upload.
output.dir <- "2023-12-21_output" unlink(output.dir, recursive=TRUE) dir.create(output.dir) rna.meta <- meta_raw rna.meta$title <- sprintf(rna.meta$title, "RNA") rna.meta$description <- sprintf(rna.meta$description, "RNA") rna.meta$sources <- c(rna.meta$sources, list(list(provider="GEO", id="GSE92575"))) saveDataset(sce, file.path(output.dir, "rna"), rna.meta) crisp.meta <- meta_raw crisp.meta$title <- sprintf(crisp.meta$title, "CRISPR") crisp.meta$description <- sprintf(crisp.meta$description, "CRISPR") crisp.meta$sources <- c(crisp.meta$sources, list(list(provider="GEO", id="GSE11349"))) saveDataset(crisp.sce, file.path(output.dir, "crispr"), crisp.meta)
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.