library(BiocStyle) knitr::opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE)
We obtain a single-cell RNA sequencing dataset of human and mouse lung cancer from @zilionis2019singlecell.
Counts for endogenous genes are available from the Gene Expression Omnibus
using the accession number GSE127465.
We download and cache it using the r Biocpkg("BiocFileCache")
package.
library(BiocFileCache) bfc <- BiocFileCache(ask=FALSE) tarball <- bfcrpath(bfc, file.path("https://www.ncbi.nlm.nih.gov/geo", "download/?acc=GSE127465&format=file"))
We unpack it to a temporary directory.
temp <- tempfile() untar(tarball, exdir=temp)
We read in all the human datasets as sparse matrices.
hs.files <- c( "GSM3635278_human_p1t1_raw_counts.tsv.gz", "GSM3635279_human_p1t2_raw_counts.tsv.gz", "GSM3635280_human_p1t3_raw_counts.tsv.gz", "GSM3635281_human_p1t4_raw_counts.tsv.gz", "GSM3635282_human_p1b1_raw_counts.tsv.gz", "GSM3635283_human_p1b2_raw_counts.tsv.gz", "GSM3635284_human_p1b3_raw_counts.tsv.gz", "GSM3635285_human_p2t1_raw_counts.tsv.gz", "GSM3635286_human_p2t2_raw_counts.tsv.gz", "GSM3635287_human_p2b1_raw_counts.tsv.gz", "GSM3635288_human_p3t1_raw_counts.tsv.gz", "GSM3635289_human_p3t2_raw_counts.tsv.gz", "GSM3635290_human_p3t3_raw_counts.tsv.gz", "GSM3635291_human_p3b1_raw_counts.tsv.gz", "GSM3635292_human_p4t1_raw_counts.tsv.gz", "GSM3635293_human_p4t2_raw_counts.tsv.gz", "GSM3635294_human_p4t3_raw_counts.tsv.gz", "GSM3635295_human_p4b1_raw_counts.tsv.gz", "GSM3635296_human_p5t1_raw_counts.tsv.gz", "GSM3635297_human_p5t2_raw_counts.tsv.gz", "GSM3635298_human_p6t1_raw_counts.tsv.gz", "GSM3635299_human_p6t2_raw_counts.tsv.gz", "GSM3635300_human_p6b1_raw_counts.tsv.gz", "GSM3635301_human_p7t1_raw_counts.tsv.gz", "GSM3635302_human_p7t2_raw_counts.tsv.gz", "GSM3635303_human_p7b1_raw_counts.tsv.gz" ) library(scater) all.human <- lapply(file.path(temp, hs.files), readSparseCounts) library(Matrix) # Because the values are transposed. all.human <- lapply(all.human, t) t(sapply(all.human, dim))
We verify that the gene order is the same, and combine the counts.
stopifnot(length(unique(lapply(all.human, rownames)))==1L) counts <- do.call(cbind, all.human) dim(counts)
We derive some metadata from each file name and apply them to all of the constituent barcodes.
samples <- vapply(strsplit(hs.files, "_"), "[", i=3, "") barcode <- lapply(all.human, colnames) origin <- rep(samples, times = lengths(barcode)) donor <- sub("[bt].*", "", origin) tissue <- ifelse(grepl("t", origin), "tumor", "blood") barcode <- unlist(barcode) library(S4Vectors) coldata <- DataFrame(Library=origin, Barcode=barcode, Patient=donor, Tissue=tissue) coldata
We then add additional metadata for a subset of cells that were used in the original paper. We convert some of the fields to logical values.
bfc <- BiocFileCache(ask=FALSE) tarball <- bfcrpath(bfc, file.path("https://ftp.ncbi.nlm.nih.gov/geo/series", "GSE127nnn/GSE127465/suppl", "GSE127465_human_cell_metadata_54773x25.tsv.gz")) metadata <- read.delim(tarball, stringsAsFactors=FALSE, check.names = FALSE) for (u in grep("^used", colnames(metadata))) { metadata[[u]] <- metadata[[u]]=="True" } metadata <- DataFrame(metadata, check.names=FALSE) metadata
We merge this with our file name-derived metadata:
keys <- c("Library", "Barcode") m <- match(coldata[,keys], metadata[,keys]) coldata$Used <- !is.na(m) discard <- c(keys, "Patient", "Tissue") colData <- cbind(coldata, metadata[m,setdiff(colnames(metadata), discard)]) colData
We save all of the components to file for upload to r Biocpkg("ExperimentHub")
.
path <- file.path("scRNAseq", "zilionis-lung", "2.4.0") dir.create(path, showWarnings=FALSE, recursive=TRUE) saveRDS(counts, file=file.path(path, "counts-human.rds")) saveRDS(colData, file=file.path(path, "coldata-human.rds"))
rm(counts, all.human) gc()
We read in all the mouse datasets.
mm.files <- c( "GSM3635304_mouse_h_1_1_raw_counts.tsv.gz", "GSM3635305_mouse_h_1_2_raw_counts.tsv.gz", "GSM3635306_mouse_h_2_1_raw_counts.tsv.gz", "GSM3635307_mouse_h_2_2_raw_counts.tsv.gz", "GSM3635308_mouse_h_2_3_raw_counts.tsv.gz", "GSM3635309_mouse_t_1_1_raw_counts.tsv.gz", "GSM3635310_mouse_t_1_2_raw_counts.tsv.gz", "GSM3635311_mouse_t_1_3_raw_counts.tsv.gz", "GSM3635312_mouse_t_1_4_raw_counts.tsv.gz", "GSM3635313_mouse_t_1_5_raw_counts.tsv.gz", "GSM3635314_mouse_t_2_1_raw_counts.tsv.gz", "GSM3635315_mouse_t_2_2_raw_counts.tsv.gz", "GSM3635316_mouse_t_2_3_raw_counts.tsv.gz", "GSM3635317_mouse_t_2_4_raw_counts.tsv.gz" ) all.mouse <- lapply(file.path(temp, mm.files), readSparseCounts) all.mouse <- lapply(all.mouse, t) t(sapply(all.mouse, dim))
We verify that the gene order is the same, and combine the counts.
stopifnot(length(unique(lapply(all.mouse, rownames)))==1L) counts <- do.call(cbind, all.mouse) dim(counts)
We derive some metadata from each file name and apply them to all of the constituent barcodes.
separated <- strsplit(mm.files, "_") tissue <- vapply(separated, "[", i=3, "") animal <- vapply(separated, "[", i=4, "") replicate <- vapply(separated, "[", i=5, "") barcode <- lapply(all.mouse, colnames) animal <- rep(sprintf("%s_%s", tissue, animal), lengths(barcode)) replicate <- rep(replicate, lengths(barcode)) library <- sprintf("%s_%s", animal, replicate) tissue <- rep(ifelse(tissue == "t", "tumor", "healthy"), times = lengths(barcode)) barcode <- unlist(barcode) coldata <- DataFrame(Library=library, Barcode=barcode, Animal = animal, Run = replicate, Tissue=tissue) coldata
We next add additional metadata for a subset of cells that were used in the original paper. We keep only the experimentally interesting metadata, discarding columns that are duplicated or only have one level.
bfc <- BiocFileCache(ask=FALSE) tarball <- bfcrpath(bfc, file.path("https://ftp.ncbi.nlm.nih.gov/geo/series", "GSE127nnn/GSE127465/suppl", "GSE127465_mouse_cell_metadata_15939x12.tsv.gz")) metadata <- read.delim(tarball, stringsAsFactors=FALSE, check.names = FALSE) metadata <- DataFrame(metadata, check.names=FALSE) metadata
We merge this with our file name-derived metadata:
keys <- c("Library", "Barcode") m <- match(coldata[,keys], metadata[,keys]) coldata$Used <- !is.na(m) discard <- c(keys, "Tumor or healthy", "Biological replicate") colData <- cbind(coldata, metadata[m,setdiff(colnames(metadata), discard)]) colData
We save all of the components to file for upload to r Biocpkg("ExperimentHub")
.
path <- file.path("scRNAseq", "zilionis-lung", "2.4.0") dir.create(path, showWarnings=FALSE, recursive=TRUE) saveRDS(counts, file=file.path(path, "counts-mouse.rds")) saveRDS(colData, file=file.path(path, "coldata-mouse.rds"))
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.