library(BiocStyle) knitr::opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE)
We obtain a single-cell RNA sequencing dataset of human cerebral cortex cells from @darmanis2015survey.
Counts for endogenous genes are available from the Gene Expression Omnibus
using the accession number GSE67835.
We download and cache them using the r Biocpkg("BiocFileCache")
package.
library(BiocFileCache) bfc <- BiocFileCache("raw_data", ask = FALSE) tarball <- bfcrpath(bfc, "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE67835&format=file") fname <- tempfile() untar(tarball, exdir=fname) all.files <- list.files(fname, full=TRUE) length(all.files)
Reading all the counts in as sparse matrices:
library(BiocParallel) counts <- bplapply(all.files, read.table, sep="\t", header=FALSE, row.names=1, BPPARAM=MulticoreParam()) # Sanity check: gene.ids <- lapply(counts, rownames) stopifnot(length(unique(gene.ids))==1L) combined <- as.matrix(do.call(cbind, counts)) rownames(combined) <- sub(" +$", "", rownames(combined)) colnames(combined) <- sub("_.*", "\\1", basename(all.files)) str(combined)
We pull down some metadata in SOFT format. This requires some cleaning to get rid of the useless bits of information.
library(GEOquery) out <- GEOquery::getGEO("GSE67835") df <- rbind( as(phenoData(out[[1]]), "data.frame"), as(phenoData(out[[2]]), "data.frame") ) stopifnot(anyDuplicated(rownames(df))==0) stopifnot(identical(sort(rownames(df)), sort(colnames(combined)))) # Getting rid of fields that are either too specific or too general. keep <- vapply(df, function(x) { n <- length(unique(x)) n > 1 & n < length(x) }, TRUE) df <- df[,keep] keep <- grep(":ch1$", colnames(df)) df <- df[,keep] library(S4Vectors) colnames(df) <- sub(":ch1$", "", colnames(df)) df <- DataFrame(df) df <- df[colnames(combined),] # already know that rownames are valid. df
We now save all of the relevant components to file for upload to r Biocpkg("ExperimentHub")
.
path <- file.path("scRNAseq", "darmanis-brain", "2.6.0") dir.create(path, showWarnings=FALSE, recursive=TRUE) saveRDS(combined, file=file.path(path, "counts.rds")) saveRDS(df, file=file.path(path, "coldata.rds"))
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.