library(BiocStyle)
knitr::opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE)

Downloading the data

We obtain a single-cell RNA sequencing dataset of human cerebral cortex cells from @darmanis2015survey. Counts for endogenous genes are available from the Gene Expression Omnibus using the accession number GSE67835. We download and cache them using the r Biocpkg("BiocFileCache") package.

library(BiocFileCache)
bfc <- BiocFileCache("raw_data", ask = FALSE)
tarball <- bfcrpath(bfc, 
    "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE67835&format=file")

fname <- tempfile()
untar(tarball, exdir=fname)
all.files <- list.files(fname, full=TRUE)
length(all.files)

Reading all the counts in as sparse matrices:

library(BiocParallel)
counts <- bplapply(all.files, read.table, sep="\t", header=FALSE, row.names=1,
    BPPARAM=MulticoreParam())

# Sanity check:
gene.ids <- lapply(counts, rownames)
stopifnot(length(unique(gene.ids))==1L)

combined <- as.matrix(do.call(cbind, counts))
rownames(combined) <- sub(" +$", "", rownames(combined))
colnames(combined) <- sub("_.*", "\\1", basename(all.files))
str(combined)

We pull down some metadata in SOFT format. This requires some cleaning to get rid of the useless bits of information.

library(GEOquery)
out <- GEOquery::getGEO("GSE67835")

df <- rbind(
    as(phenoData(out[[1]]), "data.frame"),
    as(phenoData(out[[2]]), "data.frame")
) 
stopifnot(anyDuplicated(rownames(df))==0)
stopifnot(identical(sort(rownames(df)), sort(colnames(combined))))

# Getting rid of fields that are either too specific or too general.
keep <- vapply(df, function(x) {
    n <- length(unique(x))
    n > 1 & n < length(x)
}, TRUE)
df <- df[,keep]

keep <- grep(":ch1$", colnames(df))
df <- df[,keep]

library(S4Vectors)
colnames(df) <- sub(":ch1$", "", colnames(df))
df <- DataFrame(df)

df <- df[colnames(combined),] # already know that rownames are valid.
df 

Saving to file

We now save all of the relevant components to file for upload to r Biocpkg("ExperimentHub").

path <- file.path("scRNAseq", "darmanis-brain", "2.6.0")
dir.create(path, showWarnings=FALSE, recursive=TRUE)
saveRDS(combined, file=file.path(path, "counts.rds"))
saveRDS(df, file=file.path(path, "coldata.rds"))

Session information

sessionInfo()

References



drisso/scRNAseq documentation built on Feb. 16, 2021, 1:18 a.m.