library(BiocStyle)
knitr::opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE)

Downloading the count data

We obtain a single-cell RNA sequencing dataset of human and mouse lung cancer from @zilionis2019singlecell. Counts for endogenous genes are available from the Gene Expression Omnibus using the accession number GSE127465. We download and cache it using the r Biocpkg("BiocFileCache") package.

library(BiocFileCache)
bfc <- BiocFileCache(ask=FALSE)    
tarball <- bfcrpath(bfc, 
    file.path("https://www.ncbi.nlm.nih.gov/geo",
        "download/?acc=GSE127465&format=file"))

We unpack it to a temporary directory.

temp <- tempfile()
untar(tarball, exdir=temp)

Reading in human data

We read in all the human datasets as sparse matrices.

hs.files <- c(
    "GSM3635278_human_p1t1_raw_counts.tsv.gz",
    "GSM3635279_human_p1t2_raw_counts.tsv.gz",
    "GSM3635280_human_p1t3_raw_counts.tsv.gz",
    "GSM3635281_human_p1t4_raw_counts.tsv.gz",
    "GSM3635282_human_p1b1_raw_counts.tsv.gz",
    "GSM3635283_human_p1b2_raw_counts.tsv.gz",
    "GSM3635284_human_p1b3_raw_counts.tsv.gz",
    "GSM3635285_human_p2t1_raw_counts.tsv.gz",
    "GSM3635286_human_p2t2_raw_counts.tsv.gz",
    "GSM3635287_human_p2b1_raw_counts.tsv.gz",
    "GSM3635288_human_p3t1_raw_counts.tsv.gz",
    "GSM3635289_human_p3t2_raw_counts.tsv.gz",
    "GSM3635290_human_p3t3_raw_counts.tsv.gz",
    "GSM3635291_human_p3b1_raw_counts.tsv.gz",
    "GSM3635292_human_p4t1_raw_counts.tsv.gz",
    "GSM3635293_human_p4t2_raw_counts.tsv.gz",
    "GSM3635294_human_p4t3_raw_counts.tsv.gz",
    "GSM3635295_human_p4b1_raw_counts.tsv.gz",
    "GSM3635296_human_p5t1_raw_counts.tsv.gz",
    "GSM3635297_human_p5t2_raw_counts.tsv.gz",
    "GSM3635298_human_p6t1_raw_counts.tsv.gz",
    "GSM3635299_human_p6t2_raw_counts.tsv.gz",
    "GSM3635300_human_p6b1_raw_counts.tsv.gz",
    "GSM3635301_human_p7t1_raw_counts.tsv.gz",
    "GSM3635302_human_p7t2_raw_counts.tsv.gz",
    "GSM3635303_human_p7b1_raw_counts.tsv.gz"
)

library(scater)
all.human <- lapply(file.path(temp, hs.files), readSparseCounts)

library(Matrix) # Because the values are transposed.
all.human <- lapply(all.human, t)

t(sapply(all.human, dim))

We verify that the gene order is the same, and combine the counts.

stopifnot(length(unique(lapply(all.human, rownames)))==1L)
counts <- do.call(cbind, all.human)
dim(counts)

We derive some metadata from each file name and apply them to all of the constituent barcodes.

samples <- vapply(strsplit(hs.files, "_"), "[", i=3, "")
barcode <- lapply(all.human, colnames)
origin <- rep(samples, times = lengths(barcode))

donor <- sub("[bt].*", "", origin)
tissue <- ifelse(grepl("t", origin), "tumor", "blood")
barcode <- unlist(barcode)

library(S4Vectors)
coldata <- DataFrame(Library=origin, Barcode=barcode,
    Patient=donor, Tissue=tissue)
coldata

We then add additional metadata for a subset of cells that were used in the original paper. We convert some of the fields to logical values.

bfc <- BiocFileCache(ask=FALSE)
tarball <- bfcrpath(bfc, 
    file.path("https://ftp.ncbi.nlm.nih.gov/geo/series",
        "GSE127nnn/GSE127465/suppl",
        "GSE127465_human_cell_metadata_54773x25.tsv.gz"))

metadata <- read.delim(tarball, stringsAsFactors=FALSE, check.names = FALSE)
for (u in grep("^used", colnames(metadata))) {
    metadata[[u]] <- metadata[[u]]=="True"
}

metadata <- DataFrame(metadata, check.names=FALSE)
metadata

We merge this with our file name-derived metadata:

keys <- c("Library", "Barcode")
m <- match(coldata[,keys], metadata[,keys])
coldata$Used <- !is.na(m)

discard <- c(keys, "Patient", "Tissue")
colData <- cbind(coldata, metadata[m,setdiff(colnames(metadata), discard)])
colData

We save all of the components to file for upload to r Biocpkg("ExperimentHub").

path <- file.path("scRNAseq", "zilionis-lung", "2.4.0")
dir.create(path, showWarnings=FALSE, recursive=TRUE)
saveRDS(counts, file=file.path(path, "counts-human.rds"))
saveRDS(colData, file=file.path(path, "coldata-human.rds"))
rm(counts, all.human)
gc()

Reading in mouse data

We read in all the mouse datasets.

mm.files <- c(
    "GSM3635304_mouse_h_1_1_raw_counts.tsv.gz",
    "GSM3635305_mouse_h_1_2_raw_counts.tsv.gz",
    "GSM3635306_mouse_h_2_1_raw_counts.tsv.gz",
    "GSM3635307_mouse_h_2_2_raw_counts.tsv.gz",
    "GSM3635308_mouse_h_2_3_raw_counts.tsv.gz",
    "GSM3635309_mouse_t_1_1_raw_counts.tsv.gz",
    "GSM3635310_mouse_t_1_2_raw_counts.tsv.gz",
    "GSM3635311_mouse_t_1_3_raw_counts.tsv.gz",
    "GSM3635312_mouse_t_1_4_raw_counts.tsv.gz",
    "GSM3635313_mouse_t_1_5_raw_counts.tsv.gz",
    "GSM3635314_mouse_t_2_1_raw_counts.tsv.gz",
    "GSM3635315_mouse_t_2_2_raw_counts.tsv.gz",
    "GSM3635316_mouse_t_2_3_raw_counts.tsv.gz",
    "GSM3635317_mouse_t_2_4_raw_counts.tsv.gz"
)

all.mouse <- lapply(file.path(temp, mm.files), readSparseCounts)
all.mouse <- lapply(all.mouse, t)
t(sapply(all.mouse, dim))

We verify that the gene order is the same, and combine the counts.

stopifnot(length(unique(lapply(all.mouse, rownames)))==1L)
counts <- do.call(cbind, all.mouse)
dim(counts)

We derive some metadata from each file name and apply them to all of the constituent barcodes.

separated <- strsplit(mm.files, "_")

tissue <- vapply(separated, "[", i=3, "")
animal <- vapply(separated, "[", i=4, "")
replicate <- vapply(separated, "[", i=5, "")
barcode <- lapply(all.mouse, colnames)

animal <- rep(sprintf("%s_%s", tissue, animal), lengths(barcode))
replicate <- rep(replicate, lengths(barcode))
library <- sprintf("%s_%s", animal, replicate)
tissue <- rep(ifelse(tissue == "t", "tumor", "healthy"), times = lengths(barcode))
barcode <- unlist(barcode)

coldata <- DataFrame(Library=library, Barcode=barcode, Animal = animal, 
    Run = replicate, Tissue=tissue)
coldata

We next add additional metadata for a subset of cells that were used in the original paper. We keep only the experimentally interesting metadata, discarding columns that are duplicated or only have one level.

bfc <- BiocFileCache(ask=FALSE)
tarball <- bfcrpath(bfc, 
    file.path("https://ftp.ncbi.nlm.nih.gov/geo/series",
        "GSE127nnn/GSE127465/suppl",
        "GSE127465_mouse_cell_metadata_15939x12.tsv.gz"))

metadata <- read.delim(tarball, stringsAsFactors=FALSE, check.names = FALSE)
metadata <- DataFrame(metadata, check.names=FALSE)
metadata

We merge this with our file name-derived metadata:

keys <- c("Library", "Barcode")
m <- match(coldata[,keys], metadata[,keys])
coldata$Used <- !is.na(m)

discard <- c(keys, "Tumor or healthy", "Biological replicate")
colData <- cbind(coldata, metadata[m,setdiff(colnames(metadata), discard)])
colData

We save all of the components to file for upload to r Biocpkg("ExperimentHub").

path <- file.path("scRNAseq", "zilionis-lung", "2.4.0")
dir.create(path, showWarnings=FALSE, recursive=TRUE)
saveRDS(counts, file=file.path(path, "counts-mouse.rds"))
saveRDS(colData, file=file.path(path, "coldata-mouse.rds"))

Session information

sessionInfo()

References



drisso/scRNAseq documentation built on Feb. 16, 2021, 1:18 a.m.