library(BiocStyle)
knitr::opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE)

Downloading the data

We obtain a single-cell RNA sequencing dataset of human organs from @he2020singlecell. Counts for endogenous genes are available from the Gene Expression Omnibus using the accession number GSE159929. We download and cache them using the r Biocpkg("BiocFileCache") package.

library(BiocFileCache)
bfc <- BiocFileCache("raw_data", ask = FALSE)
mat.path <- bfcrpath(bfc, "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE159929&format=file")

We load this into memory. Sadly, each matrix does not contain the same set of features. This is because they filtered out low-expressing genes - it would not be correct to zero-fill.

tmp <- tempfile()
untar(mat.path, exdir=tmp)
all.files <- list.files(tmp)

library(BiocParallel)
library(scuttle)
all.counts <- bplapply(file.path(tmp, all.files), readSparseCounts, sep=",", row.names=1, quote='"')
names(all.counts) <- sub(".*_([^_]+)_Counts.csv.gz", "\\1", all.files)
do.call(rbind, lapply(all.counts, dim))

Downloading the metadata

We obtain the metadata from the GitHub repository accompanying the study (see here).

meta.path <- bfcrpath(bfc,
    "https://github.com/bei-lab/scRNA-AHCA/raw/master/Cell_barcode_and_corresponding_cell_types_of_AHCA/Annotation_AHCA_alltissues_meta.data_84363_cell.txt")
meta <- read.delim(meta.path)
meta <- DataFrame(meta, check.names=FALSE)
meta

rownames(meta) <- meta$X
meta <- meta[,-1]
colnames(meta)[1] <- "Tissue"
meta$Tissue <- sub("_cDNA", "", meta$Tissue)

ref.names <- lapply(all.counts, colnames)
ref.names <- paste0(
    rep(names(all.counts), vapply(all.counts, ncol, 0L)),
    "_cDNA_", 
    unlist(ref.names)
)
stopifnot(identical(sort(ref.names), sort(rownames(meta))))

# Don't need this crap.
meta$Color_of_tissues <- NULL

reddims <- list(tSNE=cbind(meta$tSNE_1, meta$tSNE_2))
rownames(reddims[[1]]) <- rownames(meta)
meta <- meta[,setdiff(colnames(meta), c("tSNE_1", "tSNE_2"))]

We also obtain finer cell types from reclustering.

fine.anno <- broad.anno <- rep(NA_character_, nrow(meta))

for (sub in c("B_and_plasma.meta.data.txt", "CD4_meta.data.txt", "CD8_meta.data.txt",
    "Endothelial_cell.meta.data.txt", "Epithelial_cells.meta.data.txt", 
    "FibSmo.meta.data.txt", "Myeloid.meta.data.txt")) 
{
    sub.path <- bfcrpath(bfc,
        file.path("https://github.com/bei-lab/scRNA-AHCA/raw/master/Cell_barcode_and_corresponding_cell_types_of_AHCA", sub))
    sub.df <- read.delim(sub.path)

    m <- match(sub.df$X, rownames(meta))
    stopifnot(all(is.na(fine.anno[m])))
    stopifnot(all(is.na(broad.anno[m])))
    broad.anno[m] <- sub(".meta\\.data.txt", "", sub)
    fine.anno[m] <- sub.df$annotation
}

meta$reclustered.broad <- broad.anno
meta$reclustered.fine <- fine.anno

Saving to file

We now save all of the relevant components to file for upload to r Biocpkg("ExperimentHub").

path <- file.path("scRNAseq", "he-organ-atlas", "2.6.0")
dir.create(path, showWarnings=FALSE, recursive=TRUE)
for (tissue in names(all.counts)) {
    current <- all.counts[[tissue]]
    colnames(current) <- paste0(tissue, "_cDNA_", colnames(current))
    saveRDS(current, file=file.path(path, paste0("counts-", tissue, ".rds")))

    m <- match(colnames(current), rownames(meta))
    stopifnot(all(!is.na(m)))
    stopifnot(all(meta$Tissue[m]==tissue))
    saveRDS(meta[m,], file=file.path(path, paste0("coldata-", tissue, ".rds")))
}
saveRDS(reddims, file=file.path(path, "reddim.rds"))

Session information

sessionInfo()

References



drisso/scRNAseq documentation built on Feb. 16, 2021, 1:18 a.m.