In LTLA/scRNAseq: Collection of Public Single-Cell RNA-Seq Datasets

library(BiocStyle)
knitr::opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE)

Downloading the count data

We obtain a single-cell RNA sequencing dataset of human pancreas from Baron et al. (2016). Counts for endogenous genes are available from the Gene Expression Omnibus using the accession number GSE84133. We download and cache it using the r Biocpkg("BiocFileCache") package.

library(BiocFileCache)
bfc <- BiocFileCache("raw_data", ask=FALSE)    
tarball <- bfcrpath(bfc, "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE84133&format=file")

We unpack it to a temporary directory.

temp <- tempfile()
untar(tarball, exdir=temp)

We set up the metadata template to be used for each subdataset (for each species):

meta <- list(
    title="A Single-Cell Transcriptomic Map of the Human and Mouse Pancreas Reveals Inter- and Intra-cell Population Structure [%s cells only]",
    description="Although the function of the mammalian pancreas hinges on complex interactions of distinct cell types, gene expression profiles have primarily been described with bulk mixtures. Here we implemented a droplet-based, single-cell RNA-seq method to determine the transcriptomes of over 12,000 individual pancreatic cells from four human donors and two mouse strains. Cells could be divided into 15 clusters that matched previously characterized cell types: all endocrine cell types, including rare epsilon-cells; exocrine cell types; vascular cells; Schwann cells; quiescent and activated stellate cells; and four types of immune cells. We detected subpopulations of ductal cells with distinct expression profiles and validated their existence with immuno-histochemistry stains. Moreover, among human beta- cells, we detected heterogeneity in the regulation of genes relating to functional maturation and levels of ER stress. Finally, we deconvolved bulk gene expression samples using the single-cell data to detect disease-associated differential expression. Our dataset provides a resource for the discovery of novel cell type-specific transcription factors, signaling receptors, and medically relevant genes.

Maintainer note: this dataset contains only the %s cells from the study.",
    sources=list(
        list(provider="GEO", id="GSE84133"),
        list(provider="PubMed", id="27667365")
    ),
    maintainer_name="Aaron Lun",
    maintainer_email="infinite.monkeys.with.keyboards@gmail.com"
)

We also flush any existing directory to remove previous artifacts.

output.dir <- "2023-12-14_output"
unlink(output.dir, recursive=TRUE)
dir.create(output.dir)

Reading in human data

We set up a function to load in each set of counts as a sparse matrix.

FUN <- function(X) {
    input <- read.csv(X, stringsAsFactors=FALSE)
    rownames(input) <- input[,1]
    labels <- as.character(input$assigned_cluster)
    input <- input[,4:ncol(input)]
    input <- t(input)
    list(mat=as.matrix(input), label=labels)
}

We read in all the human datasets.

hs.files <- c(
    "GSM2230757_human1_umifm_counts.csv.gz",
    "GSM2230759_human3_umifm_counts.csv.gz",
    "GSM2230758_human2_umifm_counts.csv.gz",  
    "GSM2230760_human4_umifm_counts.csv.gz"
)
all.human <- lapply(file.path(temp, hs.files), FUN)
sapply(all.human, function(x) dim(x$mat))

We verify that the gene order is the same, and combine the counts.

stopifnot(length(unique(lapply(all.human, function(x) rownames(x$mat))))==1L)
counts <- do.call(cbind, lapply(all.human, function(x) x$mat))
dim(counts)

We do the same thing with the column metadata.

labels <- lapply(all.human, function(x) x$label)
donor <- rep(sub("_.*", "", hs.files), lengths(labels))
labels <- unlist(labels)

library(S4Vectors)
coldata <- DataFrame(donor=donor, label=labels)
coldata

We slap together a SingleCellExperiment, with some polishing to optimize disk usage.

library(SingleCellExperiment)
sce <- SingleCellExperiment(list(counts=counts), colData=coldata)

library(scRNAseq)
sce <- polishDataset(sce)
sce

We save all of the components to disk in preparation for upload.

copy <- meta
copy$title <- sprintf(copy$title, "human")
copy$description <- sprintf(copy$description, "human")
copy$taxonomy_id <- "9606"
copy$genome <- "GRCh38"
saveDataset(sce, file.path(output.dir, "human"), copy)

Reading in mouse data

We read in all the mouse datasets.

mm.files <- c(
    "GSM2230761_mouse1_umifm_counts.csv.gz",
    "GSM2230762_mouse2_umifm_counts.csv.gz"
)
all.mouse <- lapply(file.path(temp, mm.files), FUN)
sapply(all.mouse, function(x) dim(x$mat))

We verify that the gene order is the same, and combine the counts.

stopifnot(length(unique(lapply(all.mouse, function(x) rownames(x$mat))))==1L)
counts <- do.call(cbind, lapply(all.mouse, function(x) x$mat))
dim(counts)

We do the same thing with the column metadata.

labels <- lapply(all.mouse, function(x) x$label)
strain <- rep(c("ICR", "C57BL/6"), lengths(labels))
labels <- unlist(labels)
coldata <- DataFrame(strain=strain, label=labels)
coldata

We slap this all together to a SingleCellExperiment:

sce <- SingleCellExperiment(list(counts=counts), colData=coldata)
sce <- polishDataset(sce)
sce

We save all of the components to disk in preparation for upload.

copy <- meta
copy$title <- sprintf(copy$title, "mouse")
copy$description <- sprintf(copy$description, "mouse")
copy$taxonomy_id <- "10090"
copy$genome <- "GRCm38"
saveDataset(sce, file.path(output.dir, "mouse"), copy)