Downloading the count data

We obtain a single-cell RNA sequencing dataset of human and mouse lung cancer from @zilionis2019singlecell. Counts for endogenous genes are available from the Gene Expression Omnibus using the accession number GSE127465. We download and cache the archive using the `r Biocpkg("BiocFileCache")` package.

# Open the file cache that bfcrpath() below resolves its resource against.
bfc <- BiocFileCache(ask=FALSE)    
# NOTE(review): this call is cut off after its first argument -- the resource
# to fetch is missing and the parenthesis is never closed. Restore the
# argument before running; 'tarball' is later passed to untar().
tarball <- bfcrpath(bfc, 

We unpack it to a temporary directory.

# Pick a fresh temporary path and extract the downloaded archive into it.
# 'temp' is reused later as the directory the count files are read from.
temp <- tempfile()
untar(tarball, exdir=temp)

Reading in human data

We read in all the human datasets as sparse matrices.

# NOTE(review): this vector literal is cut off -- the file names and the
# closing parenthesis are missing and must be restored before running.
# Each entry is later joined to 'temp' for reading, and split on "_" with the
# third field taken as the sample label.
hs.files <- c(

# Read every human count file from the unpacked archive as a sparse matrix.
all.human <- lapply(file.path(temp, hs.files), readSparseCounts)

# The values are stored transposed, so flip each matrix; Matrix is attached
# first so that t() is available for the sparse matrices.
library(Matrix)
all.human <- lapply(all.human, t)

# One row of (nrow, ncol) per file. vapply() pins the result to a 2-row
# integer matrix, whereas sapply() would silently change shape on empty or
# irregular input.
t(vapply(all.human, dim, integer(2)))

We verify that the gene order is the same, and combine the counts.

# All matrices must share one identical set of row names (same order),
# otherwise binding them side by side would misalign genes.
stopifnot(length(unique(lapply(all.human, rownames)))==1L)

# Bind the per-file matrices column-wise into a single count matrix.
# NOTE(review): the original line was garbled ("counts <-, all.human)");
# do.call(cbind, ...) is reconstructed from the surrounding checks, which
# compare row names and later read barcodes from column names -- confirm.
counts <- do.call(cbind, all.human)

We derive some metadata from each file name and apply them to all of the constituent barcodes.

# The third "_"-separated field of each file name is used as the sample label.
samples <- vapply(strsplit(hs.files, "_"), function(parts) parts[3], character(1))

# Repeat each sample label once per column of its matrix, then flatten the
# per-file barcode vectors into one vector in the same file order.
barcode <- lapply(all.human, colnames)
origin <- rep(samples, times=lengths(barcode))
barcode <- unlist(barcode)

# A "t" anywhere in the label marks tumor, anything else is labelled blood;
# the patient identifier is the label with everything from the first "b" or
# "t" onwards removed.
tissue <- ifelse(grepl("t", origin), "tumor", "blood")
donor <- sub("[bt].*", "", origin)

# Per-cell annotation derived purely from the file names.
coldata <- DataFrame(
    Library=origin,
    Barcode=barcode,
    Patient=donor,
    Tissue=tissue
)

We then add additional metadata for a subset of cells that were used in the original paper. We convert some of the fields to logical values.

# Open the file cache that bfcrpath() below resolves its resource against.
bfc <- BiocFileCache(ask=FALSE)
# NOTE(review): this call is cut off after its first argument -- the resource
# to fetch is missing and the parenthesis is never closed. Restore the
# argument before running; the result is read as a delimited metadata table.
tarball <- bfcrpath(bfc, 

# Load the per-cell metadata table; check.names=FALSE keeps the column names
# exactly as they appear in the file.
metadata <- read.delim(tarball, stringsAsFactors=FALSE, check.names = FALSE)

# Columns whose names start with "used" are turned into logicals by comparing
# each value against the literal string "True".
for (u in grep("^used", colnames(metadata))) {
    metadata[[u]] <- metadata[[u]]=="True"
}

# Convert to a DataFrame, again leaving column names untouched.
metadata <- DataFrame(metadata, check.names=FALSE)

We merge this with our file name-derived metadata:

# A cell is identified by the combination of its library and barcode.
keys <- c("Library", "Barcode")

# For each cell in 'coldata', the row of 'metadata' with the same key pair,
# or NA when the cell has no entry in the metadata table.
m <- match(coldata[,keys], metadata[,keys])

# Flag the cells that have a metadata entry.
# NOTE(review): the original right-hand side was cut off after "!";
# !is.na(m) is reconstructed from the match() above -- confirm.
coldata$Used <- !is.na(m)

# Drop the key columns and the fields already derived from the file names,
# then append the remaining metadata columns in 'coldata' row order.
discard <- c(keys, "Patient", "Tissue")
colData <- cbind(coldata, metadata[m,setdiff(colnames(metadata), discard)])

We save all of the components to file for upload to `r Biocpkg("ExperimentHub")`.

# Output directory for the saved components.
path <- file.path("scRNAseq", "zilionis-lung", "2.4.0")
# Create the full directory tree; showWarnings=FALSE keeps this quiet if it
# already exists.
dir.create(path, showWarnings=FALSE, recursive=TRUE)
# Serialize the combined human counts and the merged per-cell annotation.
saveRDS(counts, file=file.path(path, "counts-human.rds"))
saveRDS(colData, file=file.path(path, "coldata-human.rds"))
# Remove the human count objects from the workspace before the mouse section.
rm(counts, all.human)

Reading in mouse data

We read in all the mouse datasets.

# NOTE(review): this vector literal is cut off -- the file names and the
# closing parenthesis are missing and must be restored before running.
# Each entry is later joined to 'temp' for reading, and split on "_" with
# fields 3, 4 and 5 taken as tissue, animal and replicate.
mm.files <- c(

# Read every mouse count file from the unpacked archive as a sparse matrix,
# then transpose each one.
all.mouse <- lapply(file.path(temp, mm.files), readSparseCounts)
all.mouse <- lapply(all.mouse, t)

# One row of (nrow, ncol) per file. vapply() pins the result to a 2-row
# integer matrix, whereas sapply() would silently change shape on empty or
# irregular input.
t(vapply(all.mouse, dim, integer(2)))

We verify that the gene order is the same, and combine the counts.

# All matrices must share one identical set of row names (same order),
# otherwise binding them side by side would misalign genes.
stopifnot(length(unique(lapply(all.mouse, rownames)))==1L)

# Bind the per-file matrices column-wise into a single count matrix.
# NOTE(review): the original line was garbled ("counts <-, all.mouse)");
# do.call(cbind, ...) is reconstructed from the surrounding checks, which
# compare row names and later read barcodes from column names -- confirm.
counts <- do.call(cbind, all.mouse)

We derive some metadata from each file name and apply them to all of the constituent barcodes.

# Split every file name on "_" and collect the barcodes of each matrix.
separated <- strsplit(mm.files, "_")
barcode <- lapply(all.mouse, colnames)

# Fields 3, 4 and 5 of the file name give tissue code, animal and replicate.
tissue <- vapply(separated, function(parts) parts[3], character(1))
animal <- vapply(separated, function(parts) parts[4], character(1))
replicate <- vapply(separated, function(parts) parts[5], character(1))

# Expand the per-file values to one entry per barcode. The animal label is
# prefixed with the raw tissue code, so it must be built before 'tissue' is
# overwritten with its expanded, spelled-out form.
animal <- rep(paste(tissue, animal, sep="_"), times=lengths(barcode))
replicate <- rep(replicate, times=lengths(barcode))
library <- paste(animal, replicate, sep="_")
tissue <- rep(ifelse(tissue == "t", "tumor", "healthy"), times=lengths(barcode))
barcode <- unlist(barcode)

# Per-cell annotation derived purely from the file names.
coldata <- DataFrame(
    Library=library,
    Barcode=barcode,
    Animal=animal,
    Run=replicate,
    Tissue=tissue
)

We next add additional metadata for a subset of cells that were used in the original paper. We keep only the experimentally interesting metadata, discarding columns that are duplicated or only have one level.

# Open the file cache that bfcrpath() below resolves its resource against.
bfc <- BiocFileCache(ask=FALSE)
# NOTE(review): this call is cut off after its first argument -- the resource
# to fetch is missing and the parenthesis is never closed. Restore the
# argument before running; the result is read as a delimited metadata table.
tarball <- bfcrpath(bfc, 

# Load the per-cell metadata table and wrap it in a DataFrame in one step;
# check.names=FALSE at both stages keeps the column names exactly as they
# appear in the file.
metadata <- DataFrame(
    read.delim(tarball, stringsAsFactors=FALSE, check.names=FALSE),
    check.names=FALSE
)

We merge this with our file name-derived metadata:

# A cell is identified by the combination of its library and barcode.
keys <- c("Library", "Barcode")

# For each cell in 'coldata', the row of 'metadata' with the same key pair,
# or NA when the cell has no entry in the metadata table.
m <- match(coldata[,keys], metadata[,keys])

# Flag the cells that have a metadata entry.
# NOTE(review): the original right-hand side was cut off after "!";
# !is.na(m) is reconstructed from the match() above -- confirm.
coldata$Used <- !is.na(m)

# Drop the key columns plus the two named metadata columns, then append the
# remaining metadata columns in 'coldata' row order.
discard <- c(keys, "Tumor or healthy", "Biological replicate")
colData <- cbind(coldata, metadata[m,setdiff(colnames(metadata), discard)])

We save all of the components to file for upload to `r Biocpkg("ExperimentHub")`.

# Output directory for the saved components.
path <- file.path("scRNAseq", "zilionis-lung", "2.4.0")
# Create the full directory tree; showWarnings=FALSE keeps this quiet if it
# already exists.
dir.create(path, showWarnings=FALSE, recursive=TRUE)
# Serialize the combined mouse counts and the merged per-cell annotation.
saveRDS(counts, file=file.path(path, "counts-mouse.rds"))
saveRDS(colData, file=file.path(path, "coldata-mouse.rds"))

