library(BiocStyle) knitr::opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE)
We obtain a single-cell RNA sequencing dataset of human prefrontal cortex cells from @zhong2018singlecell.
Counts for endogenous genes are available from the Gene Expression Omnibus
using the accession number GSE104276.
We download and cache them using the r Biocpkg("BiocFileCache")
package.
library(BiocFileCache) bfc <- BiocFileCache("raw_data", ask = FALSE) fname <- bfcrpath(bfc, "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE104276&format=file&file=GSE104276%5Fall%5Fpfc%5F2394%5FUMI%5Fcount%5FNOERCC%2Exls%2Egz") tmp <- tempfile(fileext=".xls") R.utils::gunzip(fname, destname=tmp, remove=FALSE)
Reading all the counts in as sparse matrices. Despite its name, this is not actually an XLS file.
library(scuttle) counts <- readSparseCounts(tmp, skip.row=1, col.names=FALSE) colnames(counts) <- strsplit(readLines(tmp, n=1), "\t")[[1]] dim(counts)
We pull down some sample-level metadata in SOFT format.
library(GEOquery) out <- GEOquery::getGEO("GSE104276") df <- as(phenoData(out[[1]]), "data.frame") sampdata <- DataFrame( developmental_stage=df[["developmental stage:ch1"]], gender=df[["gender:ch1"]], sample=df$title ) # Fixing an error in their annotation. sampdata$developmental_stage[sampdata$developmental_stage == "week gestation"] <- "23 weeks after gestation" sampdata
Unfortunately, it's not enough to link the exact samples to their cells. We take the nuclear option of rolling through the per-sample TPM files to figure out who lives where.
fname <- bfcrpath(bfc, "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE104276&format=file") tmp <- tempfile() untar(fname, exdir=tmp) all.files <- list.files(tmp, full=TRUE) out <- lapply(all.files, function(x) { strsplit(readLines(x, n=1), "\t")[[1]] }) # So many errors in their sample names... samples <- sub("\\..*", "", basename(all.files)) samples <- sub("GSM[0-9]+_", "", samples) samples <- sub("GW8", "GW08", samples) samples <- sub("GW9", "GW09", samples) samples <- sub("GW19_PFC", "GW19_PFC1_", samples) mapping <- rep(basename(samples), lengths(out)) names(mapping) <- unlist(out) converted <- mapping[colnames(counts)] stopifnot(all(!is.na(converted))) m <- match(converted, sampdata$sample) stopifnot(all(!is.na(m))) coldata <- sampdata[m,] rownames(coldata) <- colnames(counts) coldata
But that's not all, because we can add even more annotation about the cell type.
fname <- bfcrpath(bfc, "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE104276&format=file&file=GSE104276%5Freadme%5Fsample%5Fbarcode%2Exlsx") library(readxl) more.data <- read_xlsx(fname, sheet="SampleInfo") more.data <- as.data.frame(more.data) head(more.data) m <- match(rownames(coldata), more.data[,1]) coldata <- cbind(coldata, more.data[m,c(-1, -ncol(more.data))]) colnames(coldata)
We now save all of the relevant components to file for upload to r Biocpkg("ExperimentHub")
.
path <- file.path("scRNAseq", "zhong-prefrontal", "2.6.0") dir.create(path, showWarnings=FALSE, recursive=TRUE) saveRDS(counts, file=file.path(path, "counts.rds")) saveRDS(coldata, file=file.path(path, "coldata.rds"))
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.