library(BiocStyle) knitr::opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE)
We obtain a single-cell RNA sequencing dataset of human embryonic stem cells from @messmer2019transcriptional.
Counts for endogenous genes and spike-in transcripts are available from ArrayExpress
using the accession number E-MTAB-6819.
We download and cache it using the r Biocpkg("BiocFileCache")
package.
library(BiocFileCache) bfc <- BiocFileCache("raw_data", ask = FALSE) messmer.zip <- bfcrpath(bfc, file.path("https://www.ebi.ac.uk/arrayexpress/files", "E-MTAB-6819/E-MTAB-6819.processed.1.zip")) unzip(messmer.zip, exdir=tempdir())
We read in the metadata from the SDRF file:
messmer.sdrf <- bfcrpath(bfc, file.path("https://www.ebi.ac.uk/arrayexpress/files", "E-MTAB-6819/E-MTAB-6819.sdrf.txt")) coldata <- read.delim(messmer.sdrf, check.names=FALSE, stringsAsFactors=FALSE) library(S4Vectors) coldata <- as(coldata, "DataFrame") colnames(coldata)
We sort by the batch number.
This is important for making sure that the libnames
match up with the column names of the count matrix later.
libnames <- coldata[["Assay Name"]] o <- order(coldata[["Comment[sequencing run]"]], libnames) coldata <- coldata[o,] libnames <- libnames[o]
We keep only the experimentally interesting metadata, discarding columns that are duplicated or only have one level. We also discard some ArrayExpress-specific columns.
keep <- grep("(Characteristics|Factor|Parameter Value|Comment)", colnames(coldata)) coldata <- coldata[,c(1, keep)] # keeping the cell ID. new.colnames <- sub(".*\\[(.*)\\]", "\\1", colnames(coldata)) u <- !duplicated(new.colnames) coldata <- coldata[,u] colnames(coldata) <- new.colnames[u] has.multi.levels <- vapply(coldata, FUN=function(x) length(unique(x))>1L, TRUE) coldata <- coldata[,has.multi.levels] coldata <- coldata[,setdiff(colnames(coldata), c("ENA_SAMPLE", "BioSD_SAMPLE", "technical replicate group", "ENA_EXPERIMENT", "SUBMITTED_FILE_NAME", "ENA_RUN", "FASTQ_URI", "single cell identifier", "RUN"))] coldata
Many of these are actually technical replicates or multiple entries for paired data, so we only need to keep the first one of each set.
first <- !duplicated(coldata[,1]) coldata <- coldata[first,] libnames <- libnames[first] dim(coldata)
library(edgeR) all.counts <- list() cell.names <- list() gene.names <- NULL gene.length <- NULL for (sample in c("2383", "2384", "2677", "2678", "2739", "2740", "2780", "2781")) { cur.file <- file.path(tempdir(), paste0("genic_counts_", sample, ".tsv")) current_counts <- read.table(cur.file, sep="\t", header=TRUE, row.names=1) # Checking gene names and length are the same as those in other files. if (is.null(gene.names)){ gene.names <- rownames(current_counts) gene.length <- current_counts$Length } else { stopifnot(identical(gene.names, rownames(current_counts))) stopifnot(identical(gene.length, current_counts$Length)) } current_counts$Length <- NULL # Take the technical replicates and merge them, if they exist. cellname <- colnames(current_counts) cellname <- sub("^lane[0-9]_", "", cellname) cellname <- sub("_L00[0-9]_", "_", cellname) cellname <- sub("_[12]$", "", cellname) if (any(duplicated(cellname))) { oldnames <- colnames(current_counts) current_counts <- sumTechReps(current_counts, ID=cellname) m <- match(colnames(current_counts), cellname) cellname <- colnames(current_counts) colnames(current_counts) <- oldnames[m] gc() } # Adding to the list. all.counts[[sample]] <- as.matrix(current_counts) cell.names[[sample]] <- cellname } sapply(all.counts, ncol)
We then merge technical replicates across batches (2677 + 2678, 2739 + 2740, 2780 + 2781).
stopifnot(identical(cell.names[["2677"]], cell.names[["2678"]])) all.counts[["2677"]] <- all.counts[["2677"]] + all.counts[["2678"]] all.counts[["2678"]] <- NULL stopifnot(identical(cell.names[["2739"]], cell.names[["2740"]])) all.counts[["2739"]] <- all.counts[["2739"]] + all.counts[["2740"]] all.counts[["2740"]] <- NULL stopifnot(identical(cell.names[["2780"]], cell.names[["2781"]])) all.counts[["2780"]] <- all.counts[["2780"]] + all.counts[["2781"]] all.counts[["2781"]] <- NULL sapply(all.counts, ncol)
Finally, we cbind
everything together into one large matrix.
combined.counts <- do.call(cbind, all.counts) stopifnot(identical(colnames(combined.counts), libnames))
We save these to file for upload to r Biocpkg("ExperimentHub")
.
path <- file.path("scRNAseq", "messmer-esc", "2.0.0") dir.create(path, showWarnings=FALSE, recursive=TRUE) saveRDS(combined.counts, file=file.path(path, "counts.rds")) saveRDS(coldata, file=file.path(path, "coldata.rds")) saveRDS(DataFrame(Length=gene.length), file=file.path(path, "rowdata.rds"))
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.