library(BiocStyle) knitr::opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE)
We obtain a single-cell RNA sequencing dataset of T cells from multiple patients from @bacher2020low.
Counts for endogenous genes are available from the Gene Expression Omnibus
using the accession number GSE162086.
We download and cache them using the r Biocpkg("BiocFileCache")
package.
library(BiocFileCache) bfc <- BiocFileCache("raw_data", ask = FALSE) # Manually transcribed from GEO: samples <- matrix(ncol=4, byrow=TRUE, c( "GSM4932900","J09835","WHO 1","non-hospitalized", "GSM4932901","J09836","WHO 1","non-hospitalized", "GSM4932902","J10535","WHO 4","mild-moderate", "GSM4932903","J10624","WHO 2","non-hospitalized", "GSM4932904","J10625","WHO 2","non-hospitalized", "GSM4932905","J10886","WHO 5","mild-moderate", "GSM4932906","J10887","WHO 6","severe", "GSM4932907","J10888","WHO 4","mild-moderate", "GSM4932908","J11689","WHO 2","non-hospitalized", "GSM4932909","J14204","WHO 7","severe", "GSM4932910","J14205","WHO 5","mild-moderate", "GSM4932911","J15890","WHO 2","non-hospitalized", "GSM4932912","J15891","healthy","healthy", "GSM4932913","J15892","healthy","healthy", "GSM4932914","J15899","healthy","healthy", "GSM4932915","J15900","healthy","healthy", "GSM4932916","J15893","WHO 5","mild-moderate", "GSM4932917","J21854","WHO 7","severe", "GSM4932918","J21855","healthy","healthy", "GSM4932919","J21856","healthy","healthy" ) ) counts <- vector("list", nrow(samples)) for (x in seq_along(counts)) { url <- sprintf("https://www.ncbi.nlm.nih.gov/geo/download/?acc=%s&format=file&file=%s%%5F%s%%5Fcounts%%2Etsv%%2Egz", samples[x,1], samples[x,1], samples[x,2]) counts[[x]] <- bfcrpath(bfc, url) }
Reading all the counts in as sparse matrices:
library(scuttle) library(BiocParallel) counts <- bplapply(counts, readSparseCounts, quote="\"", BPPARAM=MulticoreParam()) # Sanity check: gene.ids <- lapply(counts, rownames) stopifnot(length(unique(gene.ids))==1L) # Checking that the names are unique after combining. all.cn <- unlist(lapply(counts, colnames)) stopifnot(anyDuplicated(all.cn)==0) combined <- do.call(cbind, counts)
Creating a SingleCellExperiment
object:
library(SingleCellExperiment) sce <- SingleCellExperiment(list(counts=combined)) true.patient <- rep(samples[,2], vapply(counts, ncol, 0L))
Attaching some metadata:
meta.path <- bfcrpath(bfc, "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE162086&format=file&file=GSE162086%5Fseurat%5Fmetadata%2Etsv%2Egz") meta <- read.delim(meta.path, check.names=FALSE) ID <- paste0(sub("-1", "", meta$barcode), "-", meta$sample) ref <- gsub("\"", "", colnames(combined)) stopifnot(all(ID %in% ref)) m <- match(ref, ID) meta <- meta[m,] rownames(meta) <- ref meta$retained <- !is.na(m) # Filling in some of the patient-level attributes for missing cells. # This is done by extrapolating from cells in the same patient. meta$sample <- sub(".*-", "", ref) meta$barcode <- sub("-.*", "", ref) for (field in c("batch", "seq_run", "diagnosis")) { tab <- table(meta$sample, meta[[field]]) best <- colnames(tab)[max.col(tab)] best <- as(best, typeof(meta[[field]])) names(best) <- rownames(tab) meta[[field]] <- best[meta$sample] } m <- match(meta$sample,samples[,2]) stopifnot(all(!is.na(m))) meta$who_class <- samples[m,3] meta$severity <- samples[m,4] colData(sce) <- DataFrame(meta) sce
We now save all of the relevant components to file for upload to r Biocpkg("ExperimentHub")
.
path <- file.path("scRNAseq", "bacher-tcell", "2.6.0") dir.create(path, showWarnings=FALSE, recursive=TRUE) saveRDS(assay(sce), file=file.path(path, "counts.rds")) saveRDS(colData(sce), file=file.path(path, "coldata.rds"))
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.