library(BiocStyle)
knitr::opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE)

Downloading the count data

We obtain a single-cell RNA sequencing dataset of human embryonic stem cells from @messmer2019transcriptional. Counts for endogenous genes and spike-in transcripts are available from ArrayExpress using the accession number E-MTAB-6819. We download and cache it using the r Biocpkg("BiocFileCache") package.

library(BiocFileCache)
bfc <- BiocFileCache("raw_data", ask = FALSE)
messmer.zip <- bfcrpath(bfc, 
    file.path("https://www.ebi.ac.uk/arrayexpress/files",
        "E-MTAB-6819/E-MTAB-6819.processed.1.zip"))
unzip(messmer.zip, exdir=tempdir())

Reading in the metadata

We read in the metadata from the SDRF file:

messmer.sdrf <- bfcrpath(bfc, 
    file.path("https://www.ebi.ac.uk/arrayexpress/files",
        "E-MTAB-6819/E-MTAB-6819.sdrf.txt"))
coldata <- read.delim(messmer.sdrf, check.names=FALSE, stringsAsFactors=FALSE)

library(S4Vectors)
coldata <- as(coldata, "DataFrame")
colnames(coldata)

We sort by the batch number. This is important for making sure that the libnames match up with the column names of the count matrix later.

libnames <- coldata[["Assay Name"]]
o <- order(coldata[["Comment[sequencing run]"]], libnames)
coldata <- coldata[o,]
libnames <- libnames[o]

We keep only the experimentally interesting metadata, discarding columns that are duplicated or only have one level. We also discard some ArrayExpress-specific columns.

keep <- grep("(Characteristics|Factor|Parameter Value|Comment)", colnames(coldata))
coldata <- coldata[,c(1, keep)]  # keeping the cell ID.

new.colnames <- sub(".*\\[(.*)\\]", "\\1", colnames(coldata))
u <- !duplicated(new.colnames)
coldata <- coldata[,u]
colnames(coldata) <- new.colnames[u]

has.multi.levels <- vapply(coldata, FUN=function(x) length(unique(x))>1L, TRUE)
coldata <- coldata[,has.multi.levels]
coldata <- coldata[,setdiff(colnames(coldata), c("ENA_SAMPLE", 
    "BioSD_SAMPLE", "technical replicate group", "ENA_EXPERIMENT", 
    "SUBMITTED_FILE_NAME", "ENA_RUN", "FASTQ_URI",
    "single cell identifier", "RUN"))]
coldata

Many of these are actually technical replicates or multiple entries for paired data, so we only need to keep the first one of each set.

first <- !duplicated(coldata[,1])
coldata <- coldata[first,]
libnames <- libnames[first]
dim(coldata)

Processing the read counts

library(edgeR)
all.counts <- list()
cell.names <- list()
gene.names <- NULL
gene.length <- NULL

for (sample in c("2383", "2384", "2677", "2678", "2739", "2740", "2780", "2781")) {
    cur.file <- file.path(tempdir(), paste0("genic_counts_", sample, ".tsv"))
    current_counts <- read.table(cur.file, sep="\t", header=TRUE, row.names=1)

    # Checking gene names and length are the same as those in other files.
    if (is.null(gene.names)){
        gene.names <- rownames(current_counts)
        gene.length <- current_counts$Length
    } else {
        stopifnot(identical(gene.names, rownames(current_counts)))
        stopifnot(identical(gene.length, current_counts$Length))
    }
    current_counts$Length <- NULL

    # Take the technical replicates and merge them, if they exist.
    cellname <- colnames(current_counts)
    cellname <- sub("^lane[0-9]_", "", cellname)
    cellname <- sub("_L00[0-9]_", "_", cellname)
    cellname <- sub("_[12]$", "", cellname)

    if (any(duplicated(cellname))) {
        oldnames <- colnames(current_counts)
        current_counts <- sumTechReps(current_counts, ID=cellname)

        m <- match(colnames(current_counts), cellname)
        cellname <- colnames(current_counts)
        colnames(current_counts) <- oldnames[m]
        gc()
    }

    # Adding to the list.
    all.counts[[sample]] <- as.matrix(current_counts)
    cell.names[[sample]] <- cellname
}
sapply(all.counts, ncol)

We then merge technical replicates across batches (2677 + 2678, 2739 + 2740, 2780 + 2781).

stopifnot(identical(cell.names[["2677"]], cell.names[["2678"]]))
all.counts[["2677"]] <- all.counts[["2677"]] + all.counts[["2678"]]
all.counts[["2678"]] <- NULL

stopifnot(identical(cell.names[["2739"]], cell.names[["2740"]]))
all.counts[["2739"]] <- all.counts[["2739"]] + all.counts[["2740"]]
all.counts[["2740"]] <- NULL

stopifnot(identical(cell.names[["2780"]], cell.names[["2781"]]))
all.counts[["2780"]] <- all.counts[["2780"]] + all.counts[["2781"]]
all.counts[["2781"]] <- NULL

sapply(all.counts, ncol)

Finally, we cbind everything together into one large matrix.

combined.counts <- do.call(cbind, all.counts)
stopifnot(identical(colnames(combined.counts), libnames))

Saving for upload

We save these to file for upload to r Biocpkg("ExperimentHub").

path <- file.path("scRNAseq", "messmer-esc", "2.0.0")
dir.create(path, showWarnings=FALSE, recursive=TRUE)
saveRDS(combined.counts, file=file.path(path, "counts.rds"))
saveRDS(coldata, file=file.path(path, "coldata.rds"))
saveRDS(DataFrame(Length=gene.length), file=file.path(path, "rowdata.rds"))

Session information

sessionInfo()

References



drisso/scRNAseq documentation built on Feb. 16, 2021, 1:18 a.m.