library(BiocStyle) knitr::opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE)
We obtain a single-cell RNA sequencing dataset of haematopoietic stem cells from @giladi2018singlecell.
Counts for endogenous genes are available from the Gene Expression Omnibus
using the accession number GSE92575.
We download and cache them using the r Biocpkg("BiocFileCache")
package.
library(BiocFileCache) bfc <- BiocFileCache("raw_data", ask = FALSE) collected <- bfcrpath(bfc, "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE92575&format=file") dir <- tempfile() untar(collected, exdir=dir) all.files <- list.files(dir, full=TRUE) length(all.files)
Loading them all into memory.
library(scuttle) library(BiocParallel) out <- bplapply(all.files, readSparseCounts) # Sanity check: stopifnot(length(unique(lapply(out, rownames)))==1) # Checking that the names are unique after combining. all.cn <- unlist(lapply(out, colnames)) stopifnot(anyDuplicated(all.cn)==0) combined <- do.call(cbind, out)
Constructing a SingleCellExperiment
object.
sce <- SingleCellExperiment(list(counts=combined)) sce
Filling in the metadata as well.
meta.path <- bfcrpath(bfc, "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE92575&format=file&file=GSE92575%5Fmetadata%2Etxt%2Egz") meta <- read.delim(meta.path, skip=14) # Expanding to cover all cells in the SCE. true.samples <- sub(".*_", "", sub(".txt.gz", "", basename(all.files))) true.samples <- rep(true.samples, vapply(out, ncol, 0L)) ref <- DataFrame(well=colnames(combined), sample=true.samples) obs <- DataFrame(well=meta$well, sample=meta$Amp_batch_ID) stopifnot(all(obs %in% ref)) m <- match(ref, obs) meta <- meta[m,] meta$retained <- !is.na(m) meta$sample <- true.samples stopifnot(all(meta$well == colnames(combined) | !meta$retained)) meta$well <- colnames(combined) rownames(meta) <- meta$well colData(sce) <- DataFrame(meta) colData(sce)
Counts for CRISPR barcodes are also available from the Gene Expression Omnibus
using the accession number GSE113494.
We download and cache them using the r Biocpkg("BiocFileCache")
package.
crisp.path <- bfcrpath(bfc, "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE113494&format=file&file=GSE113494%5Fcrispseq%5Fcount%2Etxt%2Egz") crisp.df <- read.delim(crisp.path, skip=9) head(crisp.df)
For some bizarre reason, this is not actually a count matrix, but rather, a molecule information file. So we convert this into a count matrix in the simplest way - namely, counting the number of unique UMIs for each cell in each barcode.
library(DropletUtils) crisp.mat <- makeCountMatrix(crisp.df$grna, crisp.df$well) dim(crisp.mat) crisp.cd <- DataFrame(crisp.df[,c("well", "amplification.batch")]) crisp.cd <- crisp.cd[match(colnames(crisp.mat), crisp.cd$well),] rownames(crisp.cd) <- crisp.cd$well crisp.cd <- crisp.cd[,2,drop=FALSE] head(crisp.cd) crisp.rd <- DataFrame(crisp.df[,c("grna", "ugi")]) crisp.rd <- crisp.rd[match(rownames(crisp.mat), crisp.rd$grna),] rownames(crisp.rd) <- crisp.rd$grna crisp.rd <- crisp.rd[,2,drop=FALSE] head(crisp.rd)
We now save all of the relevant components to file for upload to r Biocpkg("ExperimentHub")
.
path <- file.path("scRNAseq", "giladi-hsc", "2.6.0") dir.create(path, showWarnings=FALSE, recursive=TRUE) saveRDS(assay(sce), file=file.path(path, "counts-rna.rds")) saveRDS(colData(sce), file=file.path(path, "coldata-rna.rds")) saveRDS(crisp.mat, file=file.path(path, "counts-crispr.rds")) saveRDS(crisp.cd, file=file.path(path, "coldata-crispr.rds")) saveRDS(crisp.rd, file=file.path(path, "rowdata-crispr.rds"))
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.