knitr::opts_chunk$set(echo = TRUE)
options(width = 100)

Loading required packages

suppressPackageStartupMessages({
    library(utils)
    library(DelayedArray)
    library(HDF5Array)
    library(SingleCellExperiment)
    library(zellkonverter)
    library(dplyr)
})

Downloading the h5ad files from figshare

The data files were downloaded from figshare:

## Create data dirs
datadir <- "droplet-raw-data"
outdir <- "TabulaMurisSenisData/tabula-muris-senis-droplet"

dir.create(datadir, showWarnings = FALSE)
dir.create(outdir, showWarnings = FALSE, recursive = TRUE)

## Read and filter list of files to download
filelist <- read.csv("tabula-muris-senis-files.csv") %>%
    dplyr::filter(dataset == "droplet")
dim(filelist)
any(duplicated(filelist$ndownloader_id))
any(duplicated(filelist$filename))
## Download data files
options(timeout = 360)
for (i in seq_len(nrow(filelist))) {
    if (!file.exists(file.path(datadir, filelist$filename[i]))) {
        download.file(url = paste0("https://ndownloader.figshare.com/files/", 
                                   filelist$ndownloader_id[i]), 
                      destfile = file.path(datadir, filelist$filename[i]))
    }
}

Processing the H5AD files to extract relevant parts

options(DelayedArray.block.size = 1e9)

for (f in filelist$filename) {
    ## Get the tissue from the filename
    tissue <- sub("\\.h5ad", "", 
                  sub("tabula-muris-senis-droplet-processed-official-annotations[-]*",
                      "", f))
    if (tissue == "") {
        tissue <- "All"
    }
    print(tissue)

    ## Read H5AD file. The 'X' matrix corresponds to processed data.
    ## We also extract the reduced dimension representations, rowData and 
    ## colData here. 
    suppressWarnings({
        tmp <- zellkonverter::readH5AD(
            file.path(datadir, f),
            use_hdf5 = TRUE
        )
    })
    mat <- assay(tmp, "X")  ## processed data
    h5_file <- file.path(outdir, paste0(tissue, "_processed.h5"))
    if (!file.exists(h5_file)) {
        mat_h5 <- writeHDF5Array(
            mat,
            filepath = h5_file,
            name = "processed",
            chunkdim = HDF5Array::getHDF5DumpChunkDim(dim(mat)))
    }

    ## rowData, colData, reduced dimension representations
    saveRDS(colData(tmp), file = file.path(outdir, paste0(tissue, "_coldata.rds")))
    saveRDS(rowData(tmp), file = file.path(outdir, paste0(tissue, "_rowdata.rds")))
    if ("X_pca" %in% reducedDimNames(tmp)) {
        tmpmat <- reducedDim(tmp, "X_pca")
        colnames(tmpmat) <- paste0("PC", seq_len(ncol(tmpmat)))
        saveRDS(tmpmat, file = file.path(outdir, paste0(tissue, "_pca.rds")))
    }
    if ("X_tsne" %in% reducedDimNames(tmp)) {
        tmpmat <- reducedDim(tmp, "X_tsne")
        colnames(tmpmat) <- paste0("TSNE", seq_len(ncol(tmpmat)))
        saveRDS(tmpmat, file = file.path(outdir, paste0(tissue, "_tsne.rds")))
    }
    if ("X_umap" %in% reducedDimNames(tmp)) {
        tmpmat <- reducedDim(tmp, "X_umap")
        colnames(tmpmat) <- paste0("UMAP", seq_len(ncol(tmpmat)))
        saveRDS(tmpmat, file = file.path(outdir, paste0(tissue, "_umap.rds")))
    }

    ## Print number of cells per tissue
    print(table(colData(tmp)$tissue))

    ## Extract the 'raw.X' matrix (counts) and save as an integer matrix
    ad <- reticulate::import("anndata")
    res <- ad$read_h5ad(file.path(datadir, f))

    ## Save color vector (and other things) for the full data set
    if (tissue == "All") {
        res$write_csvs("droplet-csvs")
    }

    cts <- t(res$raw$X)
    cts <- as(cts, "CsparseMatrix")
    cts <- DelayedArray(cts)
    type(cts) <- "integer"
    h5_file <- file.path(outdir, paste0(tissue, "_counts.h5"))
    if (!file.exists(h5_file)) {
        mat_h5 <- writeHDF5Array(
            cts,
            filepath = h5_file,
            name = "counts",
            chunkdim = HDF5Array::getHDF5DumpChunkDim(dim(cts)))
    }

    ## Note that while we download the processed counts directly, they can also be 
    ## obtained from the raw counts as follows:
    if (ncol(cts) <= 10000) {
        cts <- t(cts)
        proc_counts <- log2(cts/DelayedArray::rowSums(cts) * 1e4 + 1)
        proc_counts <- t(t(proc_counts)/sqrt(DelayedMatrixStats::colVars(proc_counts)))
        proc_counts[is.na(proc_counts)] <- 0
        proc_counts[proc_counts > 10] <- 10
        print(max(abs(proc_counts - t(mat))))
    }
}

Session info

sessionInfo()


fmicompbio/TabulaMurisSenisData documentation built on Nov. 5, 2024, 8:45 a.m.