library(BiocStyle)
knitr::opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE)

Downloading the data

We obtain a single-cell RNA sequencing dataset of haematopoietic stem-progenitor cells from @bunis2021singlecell. Counts for endogenous genes are available from the Gene Expression Omnibus using the accession number GSE158490. We download and cache them using the r Biocpkg("BiocFileCache") package.

library(BiocFileCache)
bfc <- BiocFileCache("raw_data", ask = FALSE)
mat.path <- bfcrpath(bfc,
    "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE158490&format=file&file=GSE158490%5Fmatrix%2Emtx%2Egz")
mat <- Matrix::readMM(mat.path)
dim(mat)

Creating a SingleCellExperiment object:

library(SingleCellExperiment)
sce <- SingleCellExperiment(list(counts=mat))

cd.path <- bfcrpath(bfc,
    "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE158490&format=file&file=GSE158490%5Fbarcodes%2Etsv%2Egz")
colnames(sce) <- readLines(cd.path)

rd.path <- bfcrpath(bfc,
    "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE158490&format=file&file=GSE158490%5Fgenes%2Etsv%2Egz")
rd <- read.table(rd.path)
colnames(rd) <- c("ID", "Symbol")
rowData(sce) <- rd
rownames(sce) <- rowData(sce)$ID

sce

Pulling down the metadata

Attaching some metadata. The SNG.1ST column specifies the sample for each cell, ranging from adult bone marrow (APB), fetal bone marrow (FS) and newborn umbilical cord (UCB). Various other demuxlet statistics are also reported here. We won't expand this out to all 720k columns of mat to save some space, given that most downstream applications will only care about the observed cells.

We can import this data into our sce, using the importDemux() function of r Biocpkg("dittoSeq") to retain selected metadata. (Indeed, yes it is an odd placement for this function which should be split off into some other demultiplexing-focused package.)

We can then convert the ABM/UCB/FBM of Samples into age groups of samples.

demux.path <- bfcrpath(bfc, 
    "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE158490&format=file&file=GSE158490%5FHSPC%2Ebest%2Etxt%2Egz")
demux <- read.delim(demux.path, check.names=FALSE)
stopifnot(all(demux$BARCODE %in% colnames(sce)))
dim(demux)

sce <- dittoSeq::importDemux(sce, demuxlet.best = demux, verbose = FALSE)

# Remove the unnecessary (single-value here) Lane metadata
sce$Lane <- NULL

# Remove the "CD4_" from start of sample names, and replace "APB" and "FS"
# with "ABM" and "FBM" which reflect the cells put through bulk RNAseq to
# generate genotyping for Demuxlet, rather than these cells.
sce$Sample <- gsub("^CD4_", "", sce$Sample)
sce$Sample <- gsub("^FS", "FBM", sce$Sample)
sce$Sample <- gsub("^APB", "ABM", sce$Sample)

# Add age metadata
sce$age <- NA
sce$age[grep("^F", sce$Sample)] <- "fetal"
sce$age[grep("^U", sce$Sample)] <- "newborn"
sce$age[grep("^A", sce$Sample)] <- "adult"

colData(sce)

Adding additional metadata

From the fully processed objects shared on figshare, we will obtain cell type annotations and developmental stage scores.

full.path <- bfcrpath(bfc,
    "https://ndownloader.figshare.com/files/25953740")
raw_meta <- readRDS(full.path)@meta.data
stopifnot(all(rownames(raw_meta) %in% colnames(sce)))
dim(raw_meta)

# Add 'retained' metadata representing barcodes retained by authors.
sce$retained <- colnames(sce) %in% rownames(raw_meta)
summary(sce$retained)

# Add celltypes
authors_meta <- raw_meta[colnames(sce),]
rownames(authors_meta) <- colnames(sce)
sce$labels <- authors_meta$trajectory_calls
table(sce$labels)

# Add Developmental Stage Scoring as a DataFrame
# Retrieve and rename a bit better
DevStageScores <- DataFrame(
  HSCMPP_scores = authors_meta$MPP.RFScore,
  HSCMPP_inTraining = authors_meta$MPP.inTraining,
  GMP_scores = authors_meta$GMP.RFScore,
  GMP_inTraining = authors_meta$GMP.inTraining,
  MEP_scores = authors_meta$MEP.RFScore,
  MEP_inTraining = authors_meta$MEP.inTraining,
  row.names = rownames(authors_meta)
)
sce$DevStageScoring <- DevStageScores
sce$DevStageScoring[sce$retained,]

To save space, we only save the colData for non-NA rows. We will fill these in later.

keep <- colnames(sce) %in% c(demux$BARCODE, rownames(raw_meta))
summary(keep)

Saving to file

We now save all of the relevant components to file for upload to r Biocpkg("ExperimentHub").

path <- file.path("scRNAseq", "bunis-hspc", "2.6.0")
dir.create(path, showWarnings=FALSE, recursive=TRUE)
saveRDS(assay(sce), file=file.path(path, "counts.rds"))
saveRDS(colData(sce)[keep,], file=file.path(path, "coldata.rds"))
saveRDS(rowData(sce), file=file.path(path, "rowdata.rds"))

Session information

sessionInfo()

References



drisso/scRNAseq documentation built on Feb. 16, 2021, 1:18 a.m.