# Vignette setup: load the Bioconductor styling helpers and set the default
# knitr options applied to every chunk.
library(BiocStyle)
knitr::opts_chunk$set(
    error=FALSE,
    message=FALSE,
    warning=FALSE
)

Downloading the count data

We obtain a single-cell RNA sequencing dataset of the mouse brain from @tasic2016adult. Counts for endogenous genes and spike-in transcripts are available from the Gene Expression Omnibus under the accession number GSE71585. We download and cache them using the `r Biocpkg("BiocFileCache")` package.

# Open a file cache rooted at "raw_data" and fetch the two supplementary
# count tables for GSE71585 through it.
library(BiocFileCache)
bfc <- BiocFileCache("raw_data", ask = FALSE)
base.url <- file.path(
    "ftp://ftp.ncbi.nlm.nih.gov/geo/series",
    "GSE71nnn/GSE71585/suppl"
)
# RefSeq (endogenous gene) counts.
count.url <- file.path(base.url, "GSE71585_RefSeq_counts.csv.gz")
count.file <- bfcrpath(bfc, count.url)
# ERCC spike-in and tdTomato counts.
spike.url <- file.path(base.url, "GSE71585_ERCC_and_tdTomato_counts.csv.gz")
spike.file <- bfcrpath(bfc, spike.url)

We load them into memory.

# Read a CSV of counts into a matrix. The first column supplies the row
# names, and column names are kept exactly as written in the file.
read_count_matrix <- function(csv.path) {
    as.matrix(
        read.csv(csv.path, row.names=1, header=TRUE, check.names=FALSE)
    )
}
count.mat <- read_count_matrix(count.file)
dim(count.mat)
spike.mat <- read_count_matrix(spike.file)
dim(spike.mat)

We check that the columns of the two matrices are in the same order, and then combine the matrices by row.

# Both matrices must have identical column names, in the same order, before
# they can be stacked.
stopifnot(identical(colnames(count.mat), colnames(spike.mat)))
# Append the spike-in rows beneath the rows of count.mat.
counts <- rbind(count.mat, spike.mat)

Downloading the per-cell metadata

We also download a file containing the metadata for each cell.

# Fetch the clustering-results table through the cache and read it with
# strings kept as character vectors rather than factors.
meta.url <- file.path(base.url, "GSE71585_Clustering_Results.csv.gz")
meta.file <- bfcrpath(bfc, meta.url)
metadata <- read.csv(meta.file, stringsAsFactors=FALSE)
# Report the number of rows and preview the first few.
nrow(metadata)
head(metadata)

Some clean-up is necessary to replace "N/A" with actual NA_character_ entries, which are more appropriate for conveying missingness.

# Swap every literal "N/A" for a true missing value, one column at a time.
# %in% is used for the comparison so that entries that are already NA give
# FALSE rather than NA in the replacement mask.
metadata[] <- lapply(metadata, function(column) {
    replace(column, column %in% c("N/A"), NA)
})

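We match the rows of the metadata to the columns of the count matrix, checking that every column has a match and reordering the metadata accordingly. We then use this to create a column-level `DataFrame`.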

# Align the metadata rows with the columns of 'counts' by sample title, then
# convert the result to a DataFrame.
library(S4Vectors)
m <- match(colnames(counts), metadata$sample_title)
if (anyNA(m)) {
    # Name the offending columns instead of failing with a bare assertion,
    # so a mismatch between the two downloads can be diagnosed directly.
    unmatched <- colnames(counts)[is.na(m)]
    stop("no metadata row found for ", length(unmatched),
        " column(s) of 'counts', e.g.: ",
        paste(head(unmatched), collapse=", "))
}
# drop=FALSE keeps a data frame even if the table has only one column.
metadata <- metadata[m,,drop=FALSE]
coldata <- as(metadata, "DataFrame")

Saving to file

We now save all of the components to file for upload to `r Biocpkg("ExperimentHub")`. These will be used to construct a `SingleCellExperiment` on the client side when the dataset is requested.

# Write the combined count matrix and the column metadata as RDS files under
# the output directory, creating the directory tree first if it is missing.
path <- file.path("scRNAseq", "tasic-brain", "2.0.0")
dir.create(path, recursive=TRUE, showWarnings=FALSE)
counts.rds <- file.path(path, "counts.rds")
saveRDS(counts, file=counts.rds)
coldata.rds <- file.path(path, "coldata.rds")
saveRDS(coldata, file=coldata.rds)

Session information

# Print the R version and the attached/loaded packages used for this run.
sessionInfo()

References



drisso/scRNAseq documentation built on Feb. 16, 2021, 1:18 a.m.