In LTLA/scRNAseq: Collection of Public Single-Cell RNA-Seq Datasets

library(BiocStyle)
# Set the default knitr chunk options used for the rest of this document.
knitr::opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE)

Download the data

We obtain a single-cell RNA sequencing dataset of human PBMCs from Mair et al. (2020). Counts for endogenous genes and antibody-derived tags are available from the Gene Expression Omnibus using the accession number GSE135325. Of particular interest are the `*Combined_PBMC_AbSeq*.csv.gz` files that contain the data for Figure 3A.

library(BiocFileCache)

# Cache directory for the downloaded supplementary files.
bfc <- BiocFileCache("raw_data", ask = FALSE)

# First AbSeq file (sample GSM4005486).
base.url <- "ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4005nnn/GSM4005486/suppl/"
Abseq_1 <- bfcrpath(
    bfc,
    paste0(base.url, "GSM4005486_Combined_PBMC_AbSeq_1_DBEC_MolsPerCell_with_SampleTag.csv.gz")
)
count.file <- Abseq_1

# Second AbSeq file (sample GSM4005487).
base.url <- "ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM4005nnn/GSM4005487/suppl/"
Abseq_2 <- bfcrpath(
    bfc,
    paste0(base.url, "GSM4005487_Combined_PBMC_AbSeq_2_DBEC_MolsPerCell_with_SampleTag.csv.gz")
)
count.file <- Abseq_2

Processing the data

We set up a function to process each of these files. This breaks up the data frame into the two matrices (RNA and ADT) as well as the sample-level metadata; it also harvests some row-level metadata from the feature names.

library(S4Vectors)
# Parse one AbSeq "MolsPerCell" CSV into RNA and protein (ADT) count matrices,
# together with their row-level annotation and the per-cell metadata.
#
# Arguments:
#   fname  Path to a CSV file (read with read.csv) whose first column holds the
#          cell identifiers, whose last two columns hold per-cell metadata and
#          whose remaining columns are features named with '|'-separated fields.
#
# Returns a list with elements:
#   rna.mat      features-by-cells matrix for the non-protein features
#   rna.rowdata  DataFrame with columns Symbol, RefSeq, Type
#   p.mat        features-by-cells matrix for the protein ("pAbO") features
#   p.rowdata    DataFrame with columns Symbol, Alternative, ID
#   coldata      DataFrame built from the last two columns of the file
ingester <- function(fname) {
    df <- read.csv(fname, row.names = 1, check.names = FALSE)

    # The last two columns are metadata; everything before them is a feature.
    meta <- df[,tail(colnames(df), 2)]
    # drop=FALSE keeps a data frame even if there is only one feature column.
    mat <- t(as.matrix(df[,head(colnames(df), -2),drop=FALSE]))

    # Protein features are tagged with the literal string "pAbO" in their name.
    # drop=FALSE keeps both pieces as matrices (with row names) even when one
    # of them contains a single feature; otherwise the subset collapses to a
    # vector and the row-name parsing below fails.
    is.protein <- grepl("pAbO", rownames(mat), fixed = TRUE)
    P <- mat[is.protein,,drop=FALSE]
    RNA <- mat[!is.protein,,drop=FALSE]

    # RNA feature names are split on '|' into three annotation fields.
    rna.rowdata <- strsplit(rownames(RNA), "\\|")
    rna.rowdata <- DataFrame(do.call(rbind, rna.rowdata))
    colnames(rna.rowdata) <- c("Symbol", "RefSeq", "Type")
    # Only "PolyA_1" entries keep their type as a suffix; all others get "RNA".
    rownames(RNA) <- with(rna.rowdata, paste0(Symbol, "_", ifelse(Type=="PolyA_1", Type, "RNA")))

    # Protein feature names are split the same way; only the first three
    # fields are retained.
    p.rowdata <- strsplit(rownames(P), "\\|")
    p.rowdata <- DataFrame(do.call(rbind, p.rowdata))[,1:3]
    colnames(p.rowdata) <- c("Symbol", "Alternative", "ID")
    rownames(P) <- with(p.rowdata, paste0(Symbol, "_", Alternative))

    list(rna.mat=RNA, rna.rowdata=rna.rowdata, 
        p.mat=P, p.rowdata=p.rowdata, 
        coldata=DataFrame(meta))
}

We run the ingester on each of the two files.

# First file: parse, then show matrix dimensions and the per-cell metadata.
ingested1 <- ingester(Abseq_1)
dim(ingested1[["rna.mat"]])
dim(ingested1[["p.mat"]])
ingested1[["coldata"]]

# Second file: same steps.
ingested2 <- ingester(Abseq_2)
dim(ingested2[["rna.mat"]])
dim(ingested2[["p.mat"]])
ingested2[["coldata"]]

We check that the row metadata is consistent across both files.

# Both files must agree on the feature annotation and on the metadata fields
# before their contents can be combined.
stopifnot(
    identical(ingested1$rna.rowdata, ingested2$rna.rowdata),
    identical(ingested1$p.rowdata, ingested2$p.rowdata),
    identical(colnames(ingested1$coldata), colnames(ingested2$coldata))
)
ingested1$p.rowdata
ingested1$rna.rowdata

We then combine all of the pieces of information into one matrix per set of features and a common sample metadata dataframe.

# Protein counts: cells from both files side by side.
final.p.mat <- cbind(ingested1[["p.mat"]], ingested2[["p.mat"]])
dim(final.p.mat)

# RNA counts: same layout.
final.rna.mat <- cbind(ingested1[["rna.mat"]], ingested2[["rna.mat"]])
dim(final.rna.mat)

# Per-cell metadata stacked in the same order as the matrix columns, with a
# Cartridge label ("1" or "2") recording which file each cell came from.
final.coldata <- rbind(ingested1[["coldata"]], ingested2[["coldata"]])
final.coldata$Cartridge <- rep(
    c("1", "2"),
    times = c(nrow(ingested1[["coldata"]]), nrow(ingested2[["coldata"]]))
)
final.coldata

# One metadata row per matrix column.
stopifnot(
    identical(nrow(final.coldata), ncol(final.p.mat)),
    identical(nrow(final.coldata), ncol(final.rna.mat))
)

We also coerce all sample names to the same set of values to avoid later confusion.

# Append the cartridge label to each cell identifier and use the result as the
# shared name across both matrices and the metadata.
final.names <- paste(colnames(final.p.mat), final.coldata$Cartridge, sep = "-")
colnames(final.rna.mat) <- final.names
colnames(final.p.mat) <- final.names
rownames(final.coldata) <- final.names
head(final.names)

Save for upload

We slap everything together into a SingleCellExperiment:

library(SingleCellExperiment)

# Main experiment: RNA counts with their row annotation and the cell metadata.
sce <- SingleCellExperiment(
    assays=list(counts=final.rna.mat),
    rowData=ingested1$rna.rowdata,
    colData=final.coldata
)

# Protein counts are attached as an alternative experiment named "ADT".
altExp(sce, "ADT") <- SummarizedExperiment(
    assays=list(counts=final.p.mat),
    rowData=ingested1$p.rowdata
)

We polish it up to save some disk space:

library(scRNAseq)
# Replace the object with its polished version before saving.
# NOTE(review): per the accompanying text this is meant to reduce the on-disk
# size; the exact transformations are defined by polishDataset().
sce <- polishDataset(sce)
# Print the final object for the rendered report.
sce

We now save it to disk:

# Dataset-level metadata stored alongside the object: publication title and
# abstract, species/genome identifiers, data sources and maintainer contact.
meta <- list(
    title="A Targeted Multi-omic Analysis Approach Measures Protein Expression and Low-Abundance Transcripts on the Single-Cell Level",
    # "2 x 10^4": the exponent had been flattened to "104" in the pasted abstract.
    description="High-throughput single-cell RNA sequencing (scRNA-seq) has become a frequently used tool to assess immune cell heterogeneity. Recently, the combined measurement of RNA and protein expression was developed, commonly known as cellular indexing of transcriptomes and epitopes by sequencing (CITE-seq). Acquisition of protein expression data along with transcriptome data resolves some of the limitations inherent to only assessing transcripts but also nearly doubles the sequencing read depth required per single cell. Furthermore, there is still a paucity of analysis tools to visualize combined transcript-protein datasets. Here, we describe a targeted transcriptomics approach that combines an analysis of over 400 genes with simultaneous measurement of over 40 proteins on 2 × 10^4 cells in a single experiment. This targeted approach requires only about one-tenth of the read depth compared to a whole-transcriptome approach while retaining high sensitivity for low abundance transcripts. To analyze these multi-omic datasets, we adapted one-dimensional soli expression by nonlinear stochastic embedding (One-SENSE) for intuitive visualization of protein-transcript relationships on a single-cell level.",
    taxonomy_id="9606",
    genome="GRCh38",
    # The publication, the GEO series and the two GEO samples downloaded above.
    sources=list(
        list(provider="PubMed", id="32268080"),
        list(provider="GEO", id="GSE135325"),
        list(provider="GEO", id="GSM4005486"),
        list(provider="GEO", id="GSM4005487")
    ),
    maintainer_name="Stephany Orjuela",
    maintainer_email="sorjuelal@gmail.com"
)

# Write the object and its metadata to the output directory.
saveDataset(sce, "2023-12-20_output", meta)