library(BiocStyle) knitr::opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE)
We obtain a single-cell RNA sequencing dataset of human and mouse lung cancer from Zilionis et al. (2019).
Counts for endogenous genes are available from the Gene Expression Omnibus
using the accession number GSE127465.
We download and cache it using the r Biocpkg("BiocFileCache")
package.
library(BiocFileCache) bfc <- BiocFileCache("raw_data", ask=FALSE) tarball <- bfcrpath(bfc, "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE127465&format=file")
We unpack it to a temporary directory.
temp <- tempfile() untar(tarball, exdir=temp)
We clean out any existing output directory:
output.dir <- "2023-12-20_output" unlink(output.dir, recursive=TRUE) dir.create(output.dir)
We also set up a metadata template in preparation for saving:
meta <- list( title="Single-Cell Transcriptomics of Human and Mouse Lung Cancers Reveals Conserved Myeloid Populations across Individuals and Species [%s only]", description="Tumor-infiltrating myeloid cells (TIMs) comprise monocytes, macrophages, dendritic cells, and neutrophils, and have emerged as key regulators of cancer growth. These cells can diversify into a spectrum of states, which might promote or limit tumor outgrowth but remain poorly understood. Here, we used single-cell RNA sequencing (scRNA-seq) to map TIMs in non-small-cell lung cancer patients. We uncovered 25 TIM states, most of which were reproducibly found across patients. To facilitate translational research of these populations, we also profiled TIMs in mice. In comparing TIMs across species, we identified a near-complete congruence of population structures among dendritic cells and monocytes; conserved neutrophil subsets; and species differences among macrophages. By contrast, myeloid cell population structures in patients' blood showed limited overlap with those of TIMs. This study determines the lung TIM landscape and sets the stage for future investigations into the potential of TIMs as immunotherapy targets. Maintainer note: this dataset contains only the %s cells from this study.", sources=list( list(provider="GEO", id="GSE127465"), list(provider="PubMed", id="30979687") ), maintainer_name="Jens Preussner", maintainer_email="jens.preussner@mpi-bn.mpg.de" )
We read in all the human datasets as sparse matrices.
hs.files <- c( "GSM3635278_human_p1t1_raw_counts.tsv.gz", "GSM3635279_human_p1t2_raw_counts.tsv.gz", "GSM3635280_human_p1t3_raw_counts.tsv.gz", "GSM3635281_human_p1t4_raw_counts.tsv.gz", "GSM3635282_human_p1b1_raw_counts.tsv.gz", "GSM3635283_human_p1b2_raw_counts.tsv.gz", "GSM3635284_human_p1b3_raw_counts.tsv.gz", "GSM3635285_human_p2t1_raw_counts.tsv.gz", "GSM3635286_human_p2t2_raw_counts.tsv.gz", "GSM3635287_human_p2b1_raw_counts.tsv.gz", "GSM3635288_human_p3t1_raw_counts.tsv.gz", "GSM3635289_human_p3t2_raw_counts.tsv.gz", "GSM3635290_human_p3t3_raw_counts.tsv.gz", "GSM3635291_human_p3b1_raw_counts.tsv.gz", "GSM3635292_human_p4t1_raw_counts.tsv.gz", "GSM3635293_human_p4t2_raw_counts.tsv.gz", "GSM3635294_human_p4t3_raw_counts.tsv.gz", "GSM3635295_human_p4b1_raw_counts.tsv.gz", "GSM3635296_human_p5t1_raw_counts.tsv.gz", "GSM3635297_human_p5t2_raw_counts.tsv.gz", "GSM3635298_human_p6t1_raw_counts.tsv.gz", "GSM3635299_human_p6t2_raw_counts.tsv.gz", "GSM3635300_human_p6b1_raw_counts.tsv.gz", "GSM3635301_human_p7t1_raw_counts.tsv.gz", "GSM3635302_human_p7t2_raw_counts.tsv.gz", "GSM3635303_human_p7b1_raw_counts.tsv.gz" ) library(scuttle) all.human <- lapply(file.path(temp, hs.files), readSparseCounts) library(Matrix) # Because the values are transposed. all.human <- lapply(all.human, t) t(sapply(all.human, dim))
We verify that the gene order is the same, and combine the counts.
stopifnot(length(unique(lapply(all.human, rownames)))==1L) counts <- do.call(cbind, all.human) dim(counts)
We derive some metadata from each file name and apply them to all of the constituent barcodes.
samples <- vapply(strsplit(hs.files, "_"), "[", i=3, "") barcode <- lapply(all.human, colnames) origin <- rep(samples, times = lengths(barcode)) donor <- sub("[bt].*", "", origin) tissue <- ifelse(grepl("t", origin), "tumor", "blood") barcode <- unlist(barcode) library(S4Vectors) coldata <- DataFrame(Library=origin, Barcode=barcode, Patient=donor, Tissue=tissue) coldata
We then add additional metadata for a subset of cells that were used in the original paper. We convert some of the fields to logical values.
bfc <- BiocFileCache(ask=FALSE) tarball <- bfcrpath(bfc, "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE127nnn/GSE127465/suppl/GSE127465_human_cell_metadata_54773x25.tsv.gz") metadata <- read.delim(tarball, stringsAsFactors=FALSE, check.names = FALSE) for (u in grep("^used", colnames(metadata))) { metadata[[u]] <- metadata[[u]]=="True" } metadata <- DataFrame(metadata, check.names=FALSE) metadata
We merge this with our file name-derived metadata:
keys <- c("Library", "Barcode") m <- match(coldata[,keys], metadata[,keys]) coldata$Used <- !is.na(m) discard <- c(keys, "Patient", "Tissue") colData <- cbind(coldata, metadata[m,setdiff(colnames(metadata), discard)]) colData
We now assemble a SingleCellExperiment
:
sce <- SingleCellExperiment(list(counts=counts))
We move the SPRING coordinates into the reduced dimensions for easier viewing.
cn <- colnames(colData) is.x <- grepl("^x_", cn) is.y <- grepl("^y_", cn) which.x <- which(is.x) which.y <- which(is.y) grouping.x <- sub("^x_", "", cn[is.x]) grouping.y <- sub("^y_", "", cn[is.y]) stopifnot(identical(grouping.x, grouping.y)) # sanity check. for (i in seq_along(grouping.x)) { reducedDim(sce, paste0("SPRING_", grouping.x[i])) <- cbind( x=colData[,which.x[i]], y=colData[,which.y[i]] ) }
The remaining column annotations are added to the column data. Note that this overwrites the column names, which aren't unique across samples anyway, so it's no big deal.
colData(sce) <- colData[,!is.x & !is.y] stopifnot(is.null(colnames(sce)))
We apply some polish to optimize disk space usage:
library(scRNAseq) sce <- polishDataset(sce) sce
And we save it in preparation for upload:
copy <- meta copy$taxonomy_id <- "9606" copy$genome <- "GRCh38" copy$title <- sprintf(copy$title, "human") copy$description <- sprintf(copy$description, "human") saveDataset(sce, file.path(output.dir, "human"), copy)
rm(counts, all.human) gc()
We read in all the mouse datasets.
mm.files <- c( "GSM3635304_mouse_h_1_1_raw_counts.tsv.gz", "GSM3635305_mouse_h_1_2_raw_counts.tsv.gz", "GSM3635306_mouse_h_2_1_raw_counts.tsv.gz", "GSM3635307_mouse_h_2_2_raw_counts.tsv.gz", "GSM3635308_mouse_h_2_3_raw_counts.tsv.gz", "GSM3635309_mouse_t_1_1_raw_counts.tsv.gz", "GSM3635310_mouse_t_1_2_raw_counts.tsv.gz", "GSM3635311_mouse_t_1_3_raw_counts.tsv.gz", "GSM3635312_mouse_t_1_4_raw_counts.tsv.gz", "GSM3635313_mouse_t_1_5_raw_counts.tsv.gz", "GSM3635314_mouse_t_2_1_raw_counts.tsv.gz", "GSM3635315_mouse_t_2_2_raw_counts.tsv.gz", "GSM3635316_mouse_t_2_3_raw_counts.tsv.gz", "GSM3635317_mouse_t_2_4_raw_counts.tsv.gz" ) all.mouse <- lapply(file.path(temp, mm.files), readSparseCounts) all.mouse <- lapply(all.mouse, t) t(sapply(all.mouse, dim))
We verify that the gene order is the same, and combine the counts.
stopifnot(length(unique(lapply(all.mouse, rownames)))==1L) counts <- do.call(cbind, all.mouse) dim(counts)
We derive some metadata from each file name and apply them to all of the constituent barcodes.
separated <- strsplit(mm.files, "_") tissue <- vapply(separated, "[", i=3, "") animal <- vapply(separated, "[", i=4, "") replicate <- vapply(separated, "[", i=5, "") barcode <- lapply(all.mouse, colnames) animal <- rep(sprintf("%s_%s", tissue, animal), lengths(barcode)) replicate <- rep(replicate, lengths(barcode)) library <- sprintf("%s_%s", animal, replicate) tissue <- rep(ifelse(tissue == "t", "tumor", "healthy"), times = lengths(barcode)) barcode <- unlist(barcode) coldata <- DataFrame(Library=library, Barcode=barcode, Animal = animal, Run = replicate, Tissue=tissue) coldata
We next add additional metadata for a subset of cells that were used in the original paper. We keep only the experimentally interesting metadata, discarding columns that are duplicated or only have one level.
bfc <- BiocFileCache(ask=FALSE) tarball <- bfcrpath(bfc, "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE127nnn/GSE127465/suppl/GSE127465_mouse_cell_metadata_15939x12.tsv.gz") metadata <- read.delim(tarball, stringsAsFactors=FALSE, check.names = FALSE) metadata <- DataFrame(metadata, check.names=FALSE) metadata
We merge this with our file name-derived metadata:
keys <- c("Library", "Barcode") m <- match(coldata[,keys], metadata[,keys]) coldata$Used <- !is.na(m) discard <- c(keys, "Tumor or healthy", "Biological replicate") colData <- cbind(coldata, metadata[m,setdiff(colnames(metadata), discard)]) colData
We assemble all of this into a SingleCellExperiment
.
sce <- SingleCellExperiment(list(counts=counts))
We move the SPRING coordinates into the reduced dimensions for easier viewing.
cn <- colnames(colData) is.x <- cn=="x" is.y <- cn=="y" reducedDim(sce, "SPRING") <- cbind(x=colData[,is.x], y=colData[,is.y])
The remaining column annotations are added to the column data. Note that this overwrites the column names, which aren't unique across samples anyway, so it's no big deal.
colData(sce) <- colData[,!is.x & !is.y] stopifnot(is.null(colnames(sce)))
We apply some polish to optimize the disk space.
sce <- polishDataset(sce) sce
And then we save it to disk with some metadata:
copy <- meta copy$taxonomy_id <- "10090" copy$genome <- "GRCm38" copy$title <- sprintf(copy$title, "mouse") copy$description <- sprintf(copy$description, "mouse") saveDataset(sce, file.path(output.dir, "mouse"), copy)
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.