In LTLA/scRNAseq: Collection of Public Single-Cell RNA-Seq Datasets

library(BiocStyle)
knitr::opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE)

Downloading the count data

We obtain a single-cell RNA sequencing dataset of human and mouse lung cancer from Zilionis et al. (2019). Counts for endogenous genes are available from the Gene Expression Omnibus using the accession number GSE127465. We download and cache it using the r Biocpkg("BiocFileCache") package.

library(BiocFileCache)
bfc <- BiocFileCache("raw_data", ask=FALSE)    
tarball <- bfcrpath(bfc, "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE127465&format=file")

We unpack it to a temporary directory.

temp <- tempfile()
untar(tarball, exdir=temp)

We clean out any existing output directory:

output.dir <- "2023-12-20_output"
unlink(output.dir, recursive=TRUE)
dir.create(output.dir)

We also set up a metadata template in preparation for saving:

meta <- list(
    title="Single-Cell Transcriptomics of Human and Mouse Lung Cancers Reveals Conserved Myeloid Populations across Individuals and Species [%s only]",
    description="Tumor-infiltrating myeloid cells (TIMs) comprise monocytes, macrophages, dendritic cells, and neutrophils, and have emerged as key regulators of cancer growth. These cells can diversify into a spectrum of states, which might promote or limit tumor outgrowth but remain poorly understood. Here, we used single-cell RNA sequencing (scRNA-seq) to map TIMs in non-small-cell lung cancer patients. We uncovered 25 TIM states, most of which were reproducibly found across patients. To facilitate translational research of these populations, we also profiled TIMs in mice. In comparing TIMs across species, we identified a near-complete congruence of population structures among dendritic cells and monocytes; conserved neutrophil subsets; and species differences among macrophages. By contrast, myeloid cell population structures in patients' blood showed limited overlap with those of TIMs. This study determines the lung TIM landscape and sets the stage for future investigations into the potential of TIMs as immunotherapy targets.

Maintainer note: this dataset contains only the %s cells from this study.",
    sources=list(
        list(provider="GEO", id="GSE127465"),
        list(provider="PubMed", id="30979687")
    ),
    maintainer_name="Jens Preussner",
    maintainer_email="jens.preussner@mpi-bn.mpg.de"
)

Processing human data

Reading the counts

We read in all the human datasets as sparse matrices.

hs.files <- c(
    "GSM3635278_human_p1t1_raw_counts.tsv.gz",
    "GSM3635279_human_p1t2_raw_counts.tsv.gz",
    "GSM3635280_human_p1t3_raw_counts.tsv.gz",
    "GSM3635281_human_p1t4_raw_counts.tsv.gz",
    "GSM3635282_human_p1b1_raw_counts.tsv.gz",
    "GSM3635283_human_p1b2_raw_counts.tsv.gz",
    "GSM3635284_human_p1b3_raw_counts.tsv.gz",
    "GSM3635285_human_p2t1_raw_counts.tsv.gz",
    "GSM3635286_human_p2t2_raw_counts.tsv.gz",
    "GSM3635287_human_p2b1_raw_counts.tsv.gz",
    "GSM3635288_human_p3t1_raw_counts.tsv.gz",
    "GSM3635289_human_p3t2_raw_counts.tsv.gz",
    "GSM3635290_human_p3t3_raw_counts.tsv.gz",
    "GSM3635291_human_p3b1_raw_counts.tsv.gz",
    "GSM3635292_human_p4t1_raw_counts.tsv.gz",
    "GSM3635293_human_p4t2_raw_counts.tsv.gz",
    "GSM3635294_human_p4t3_raw_counts.tsv.gz",
    "GSM3635295_human_p4b1_raw_counts.tsv.gz",
    "GSM3635296_human_p5t1_raw_counts.tsv.gz",
    "GSM3635297_human_p5t2_raw_counts.tsv.gz",
    "GSM3635298_human_p6t1_raw_counts.tsv.gz",
    "GSM3635299_human_p6t2_raw_counts.tsv.gz",
    "GSM3635300_human_p6b1_raw_counts.tsv.gz",
    "GSM3635301_human_p7t1_raw_counts.tsv.gz",
    "GSM3635302_human_p7t2_raw_counts.tsv.gz",
    "GSM3635303_human_p7b1_raw_counts.tsv.gz"
)

library(scuttle)
all.human <- lapply(file.path(temp, hs.files), readSparseCounts)

library(Matrix) # Because the values are transposed.
all.human <- lapply(all.human, t)

t(sapply(all.human, dim))

We verify that the gene order is the same, and combine the counts.

stopifnot(length(unique(lapply(all.human, rownames)))==1L)
counts <- do.call(cbind, all.human)
dim(counts)

Creating column annotations

We derive some metadata from each file name and apply them to all of the constituent barcodes.

samples <- vapply(strsplit(hs.files, "_"), "[", i=3, "")
barcode <- lapply(all.human, colnames)
origin <- rep(samples, times = lengths(barcode))

donor <- sub("[bt].*", "", origin)
tissue <- ifelse(grepl("t", origin), "tumor", "blood")
barcode <- unlist(barcode)

library(S4Vectors)
coldata <- DataFrame(Library=origin, Barcode=barcode,
    Patient=donor, Tissue=tissue)
coldata

We then add additional metadata for a subset of cells that were used in the original paper. We convert some of the fields to logical values.

bfc <- BiocFileCache(ask=FALSE)
tarball <- bfcrpath(bfc, "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE127nnn/GSE127465/suppl/GSE127465_human_cell_metadata_54773x25.tsv.gz")

metadata <- read.delim(tarball, stringsAsFactors=FALSE, check.names = FALSE)
for (u in grep("^used", colnames(metadata))) {
    metadata[[u]] <- metadata[[u]]=="True"
}

metadata <- DataFrame(metadata, check.names=FALSE)
metadata

We merge this with our file name-derived metadata:

keys <- c("Library", "Barcode")
m <- match(coldata[,keys], metadata[,keys])
coldata$Used <- !is.na(m)

discard <- c(keys, "Patient", "Tissue")
colData <- cbind(coldata, metadata[m,setdiff(colnames(metadata), discard)])
colData

Putting it all together

We now assemble a SingleCellExperiment:

sce <- SingleCellExperiment(list(counts=counts))

We move the SPRING coordinates into the reduced dimensions for easier viewing.

cn <- colnames(colData)
is.x <- grepl("^x_", cn)
is.y <- grepl("^y_", cn)
which.x <- which(is.x)
which.y <- which(is.y)
grouping.x <- sub("^x_", "", cn[is.x])
grouping.y <- sub("^y_", "", cn[is.y])
stopifnot(identical(grouping.x, grouping.y)) # sanity check.

for (i in seq_along(grouping.x)) {
    reducedDim(sce, paste0("SPRING_", grouping.x[i])) <- cbind(
        x=colData[,which.x[i]], 
        y=colData[,which.y[i]]
    ) 
}

The remaining column annotations are added to the column data. Note that this overwrites the column names, which aren't unique across samples anyway, so it's no big deal.

colData(sce) <- colData[,!is.x & !is.y]
stopifnot(is.null(colnames(sce)))

We apply some polish to optimize disk space usage:

library(scRNAseq)
sce <- polishDataset(sce)
sce

And we save it in preparation for upload:

copy <- meta
copy$taxonomy_id <- "9606"
copy$genome <- "GRCh38"
copy$title <- sprintf(copy$title, "human")
copy$description <- sprintf(copy$description, "human")
saveDataset(sce, file.path(output.dir, "human"), copy)

rm(counts, all.human)
gc()

Processing mouse data

Reading the counts

We read in all the mouse datasets.

mm.files <- c(
    "GSM3635304_mouse_h_1_1_raw_counts.tsv.gz",
    "GSM3635305_mouse_h_1_2_raw_counts.tsv.gz",
    "GSM3635306_mouse_h_2_1_raw_counts.tsv.gz",
    "GSM3635307_mouse_h_2_2_raw_counts.tsv.gz",
    "GSM3635308_mouse_h_2_3_raw_counts.tsv.gz",
    "GSM3635309_mouse_t_1_1_raw_counts.tsv.gz",
    "GSM3635310_mouse_t_1_2_raw_counts.tsv.gz",
    "GSM3635311_mouse_t_1_3_raw_counts.tsv.gz",
    "GSM3635312_mouse_t_1_4_raw_counts.tsv.gz",
    "GSM3635313_mouse_t_1_5_raw_counts.tsv.gz",
    "GSM3635314_mouse_t_2_1_raw_counts.tsv.gz",
    "GSM3635315_mouse_t_2_2_raw_counts.tsv.gz",
    "GSM3635316_mouse_t_2_3_raw_counts.tsv.gz",
    "GSM3635317_mouse_t_2_4_raw_counts.tsv.gz"
)

all.mouse <- lapply(file.path(temp, mm.files), readSparseCounts)
all.mouse <- lapply(all.mouse, t)
t(sapply(all.mouse, dim))

We verify that the gene order is the same, and combine the counts.

stopifnot(length(unique(lapply(all.mouse, rownames)))==1L)
counts <- do.call(cbind, all.mouse)
dim(counts)

Creating column annotations

We derive some metadata from each file name and apply them to all of the constituent barcodes.

separated <- strsplit(mm.files, "_")

tissue <- vapply(separated, "[", i=3, "")
animal <- vapply(separated, "[", i=4, "")
replicate <- vapply(separated, "[", i=5, "")
barcode <- lapply(all.mouse, colnames)

animal <- rep(sprintf("%s_%s", tissue, animal), lengths(barcode))
replicate <- rep(replicate, lengths(barcode))
library <- sprintf("%s_%s", animal, replicate)
tissue <- rep(ifelse(tissue == "t", "tumor", "healthy"), times = lengths(barcode))
barcode <- unlist(barcode)

coldata <- DataFrame(Library=library, Barcode=barcode, Animal = animal, 
    Run = replicate, Tissue=tissue)
coldata

We next add additional metadata for a subset of cells that were used in the original paper. We keep only the experimentally interesting metadata, discarding columns that are duplicated or only have one level.

bfc <- BiocFileCache(ask=FALSE)
tarball <- bfcrpath(bfc, "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE127nnn/GSE127465/suppl/GSE127465_mouse_cell_metadata_15939x12.tsv.gz")

metadata <- read.delim(tarball, stringsAsFactors=FALSE, check.names = FALSE)
metadata <- DataFrame(metadata, check.names=FALSE)
metadata

We merge this with our file name-derived metadata:

keys <- c("Library", "Barcode")
m <- match(coldata[,keys], metadata[,keys])
coldata$Used <- !is.na(m)

discard <- c(keys, "Tumor or healthy", "Biological replicate")
colData <- cbind(coldata, metadata[m,setdiff(colnames(metadata), discard)])
colData

Putting it all together

We assemble all of this into a SingleCellExperiment.

sce <- SingleCellExperiment(list(counts=counts))

We move the SPRING coordinates into the reduced dimensions for easier viewing.

cn <- colnames(colData)
is.x <- cn=="x"
is.y <- cn=="y"
reducedDim(sce, "SPRING") <- cbind(x=colData[,is.x], y=colData[,is.y])

The remaining column annotations are added to the column data. Note that this overwrites the column names, which aren't unique across samples anyway, so it's no big deal.

colData(sce) <- colData[,!is.x & !is.y]
stopifnot(is.null(colnames(sce)))

We apply some polish to optimize the disk space.

sce <- polishDataset(sce)
sce

And then we save it to disk with some metadata:

copy <- meta
copy$taxonomy_id <- "10090"
copy$genome <- "GRCm38"
copy$title <- sprintf(copy$title, "mouse")
copy$description <- sprintf(copy$description, "mouse")
saveDataset(sce, file.path(output.dir, "mouse"), copy)