knitr::opts_chunk$set(echo = TRUE) options(width = 100)
suppressPackageStartupMessages({ library(utils) library(DelayedArray) library(HDF5Array) library(SingleCellExperiment) library(zellkonverter) library(dplyr) })
h5ad
files from figshareThe data files were downloaded from figshare:
## Create data dirs datadir <- "droplet-raw-data" outdir <- "TabulaMurisSenisData/tabula-muris-senis-droplet" dir.create(datadir, showWarnings = FALSE) dir.create(outdir, showWarnings = FALSE, recursive = TRUE) ## Read and filter list of files to download filelist <- read.csv("tabula-muris-senis-files.csv") %>% dplyr::filter(dataset == "droplet") dim(filelist) any(duplicated(filelist$ndownloader_id)) any(duplicated(filelist$filename))
## Download data files options(timeout = 360) for (i in seq_len(nrow(filelist))) { if (!file.exists(file.path(datadir, filelist$filename[i]))) { download.file(url = paste0("https://ndownloader.figshare.com/files/", filelist$ndownloader_id[i]), destfile = file.path(datadir, filelist$filename[i])) } }
options(DelayedArray.block.size = 1e9) for (f in filelist$filename) { ## Get the tissue from the filename tissue <- sub("\\.h5ad", "", sub("tabula-muris-senis-droplet-processed-official-annotations[-]*", "", f)) if (tissue == "") { tissue <- "All" } print(tissue) ## Read H5AD file. The 'X' matrix corresponds to processed data. ## We also extract the reduced dimension representations, rowData and ## colData here. suppressWarnings({ tmp <- zellkonverter::readH5AD( file.path(datadir, f), use_hdf5 = TRUE ) }) mat <- assay(tmp, "X") ## processed data h5_file <- file.path(outdir, paste0(tissue, "_processed.h5")) if (!file.exists(h5_file)) { mat_h5 <- writeHDF5Array( mat, filepath = h5_file, name = "processed", chunkdim = HDF5Array::getHDF5DumpChunkDim(dim(mat))) } ## rowData, colData, reduced dimension representations saveRDS(colData(tmp), file = file.path(outdir, paste0(tissue, "_coldata.rds"))) saveRDS(rowData(tmp), file = file.path(outdir, paste0(tissue, "_rowdata.rds"))) if ("X_pca" %in% reducedDimNames(tmp)) { tmpmat <- reducedDim(tmp, "X_pca") colnames(tmpmat) <- paste0("PC", seq_len(ncol(tmpmat))) saveRDS(tmpmat, file = file.path(outdir, paste0(tissue, "_pca.rds"))) } if ("X_tsne" %in% reducedDimNames(tmp)) { tmpmat <- reducedDim(tmp, "X_tsne") colnames(tmpmat) <- paste0("TSNE", seq_len(ncol(tmpmat))) saveRDS(tmpmat, file = file.path(outdir, paste0(tissue, "_tsne.rds"))) } if ("X_umap" %in% reducedDimNames(tmp)) { tmpmat <- reducedDim(tmp, "X_umap") colnames(tmpmat) <- paste0("UMAP", seq_len(ncol(tmpmat))) saveRDS(tmpmat, file = file.path(outdir, paste0(tissue, "_umap.rds"))) } ## Print number of cells per tissue print(table(colData(tmp)$tissue)) ## Extract the 'raw.X' matrix (counts) and save as an integer matrix ad <- reticulate::import("anndata") res <- ad$read_h5ad(file.path(datadir, f)) ## Save color vector (and other things) for the full data set if (tissue == "All") { res$write_csvs("droplet-csvs") } cts <- t(res$raw$X) cts <- as(cts, "CsparseMatrix") cts <- DelayedArray(cts) type(cts) <- "integer" h5_file <- file.path(outdir, paste0(tissue, "_counts.h5")) if (!file.exists(h5_file)) { mat_h5 <- writeHDF5Array( cts, filepath = h5_file, name = "counts", chunkdim = HDF5Array::getHDF5DumpChunkDim(dim(cts))) } ## Note that while we download the processed counts directly, they can also be ## obtained from the raw counts as follows: if (ncol(cts) <= 10000) { cts <- t(cts) proc_counts <- log2(cts/DelayedArray::rowSums(cts) * 1e4 + 1) proc_counts <- t(t(proc_counts)/sqrt(DelayedMatrixStats::colVars(proc_counts))) proc_counts[is.na(proc_counts)] <- 0 proc_counts[proc_counts > 10] <- 10 print(max(abs(proc_counts - t(mat)))) } }
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.