knitr::opts_chunk$set(echo = TRUE) options(width = 100)
Here we will download and process the version 2 of the Seurat objects associated with the single-cell transcriptomes of the tonsil cell atlas to prepare them for submission to ExperimentHub
. We will follow a similar style and structure as the one used for the TabulaMurisSenisData
package.
library(Seurat) library(SingleCellExperiment) library(scater) library(HDF5Array) library(rhdf5) library(here) library(glue) library(readr) library(dplyr) library(stringr) library(purrr) library(tools)
The Seurat objects derived from our tonsil atlas have been archived as .rds files in Zenodo under the record 8373756. To generate the URL's for each dataset we used the zenodo_get package as follows:
zenodo_get 10.5281/zenodo.8373756 -w tmp.txt paste tmp.txt md5sums.txt | sed 's/\t/,/' | sed 's/ \{1,\}/,/g' > tonsil-atlas-zenodo-get-output_v2.csv
Let us read and curate the "tonsil-atlas-zenodo-get-output_v2.csv":
# Read data path_to_csv1 <- here("inst/scripts/tonsil-atlas-zenodo-get-output_v2.csv") files_df <- read.csv(file = path_to_csv1, header = FALSE) colnames(files_df) <- c("url", "file_id", "file_name")
We proceed to download the RNA dataset:
# Create data directories data_dir <- here("inst/scripts/rna_raw_data") out_dir <- here("inst/scripts/HCATonsilData/2.0/RNA") dir.create(data_dir, recursive = TRUE, showWarnings = TRUE) dir.create(out_dir, recursive = TRUE, showWarnings = TRUE) # Get URL and download options(timeout = 10000000) rna_url <- files_df[str_detect(files_df$file_name, "RNA"), "url"] out_file <- file.path( data_dir, files_df[str_detect(files_df$file_name, "RNA"), "file_name"] ) download.file(url = rna_url, destfile = out_file) # Check MD5 md5_original <- files_df[str_detect(files_df$file_name, "RNA"), "file_id"] md5_download <- md5sum(out_file) if (!md5_original == md5_download) { stop("The file is corrupted!") } # Uncompress untar(out_file, exdir = data_dir)
# Get gene information genes_df <- read_tsv( here("inst/extdata/features.tsv.gz"), col_names = c("ensembl_id", "symbol", "type") ) %>% filter(type == "Gene Expression") %>% as.data.frame() gene_ids <- genes_df$ensembl_id names(gene_ids) <- genes_df$symbol # Process files_seurat <- list.files(file.path(data_dir, "scRNA-seq"), pattern = "rds") files_seurat <- str_subset(files_seurat, "atac", negate = TRUE) cell_types <- files_seurat %>% str_split("_seurat_obj.rds") %>% map_chr(1) %>% str_remove("(20230911_|20220215_)") cell_types[cell_types == "tonsil_atlas_rna"] <- "All" cell_types <- str_replace(cell_types, "_", "-") names(files_seurat) <- cell_types for (cell_type in cell_types) { print(cell_type) # Read Seurat object file_path <- file.path(data_dir, "scRNA-seq", files_seurat[cell_type]) seurat <- readRDS(file_path) dataset <- "RNA" # Extract and save reductions if ("pca" %in% names(seurat@reductions)) { tmpmat <- Embeddings(seurat, "pca") path_to_save_dims <- file.path(out_dir, glue("{cell_type}_{dataset}_pca.rds")) saveRDS(tmpmat, path_to_save_dims) } if ("harmony" %in% names(seurat@reductions)) { tmpmat <- Embeddings(seurat, "harmony") path_to_save_dims <- file.path(out_dir, glue("{cell_type}_{dataset}_harmony.rds")) saveRDS(tmpmat, path_to_save_dims) } if ("umap" %in% names(seurat@reductions)) { tmpmat <- Embeddings(seurat, "umap") path_to_save_dims <- file.path(out_dir, glue("{cell_type}_{dataset}_umap.rds")) saveRDS(tmpmat, path_to_save_dims) } # Extract and save rowData and colData saveRDS( as(seurat@meta.data, "DataFrame"), file.path(out_dir, glue("{cell_type}_{dataset}_coldata.rds")) ) rowdata <- data.frame(gene_name = rownames(seurat)) rowdata$highly_variable <- ifelse( rowdata$gene %in% rownames(seurat[["RNA"]]@scale.data), TRUE, FALSE ) rowdata$gene_id <- gene_ids[rowdata$gene_name] rowdata <- as(rowdata, "DataFrame") rownames(rowdata) <- rowdata$gene_name saveRDS(rowdata, file.path(out_dir, glue("{cell_type}_{dataset}_rowdata.rds"))) # Extract and save raw and processed counts cts <- seurat[["RNA"]]@counts h5_file <- file.path(out_dir, glue("{cell_type}_{dataset}_counts.h5")) if (!file.exists(h5_file)) { mat_h5 <- writeHDF5Array( cts, filepath = h5_file, name = "counts", chunkdim = HDF5Array::getHDF5DumpChunkDim(dim(cts)) ) } mat <- seurat[["RNA"]]@data h5_file <- file.path(out_dir, glue("{cell_type}_{dataset}_processed.h5")) if (!file.exists(h5_file)) { mat_h5 <- writeHDF5Array( mat, filepath = h5_file, name = "processed", chunkdim = HDF5Array::getHDF5DumpChunkDim(dim(mat)) ) } # Remove Seurat object rm(seurat) gc() }
As a sanity check, let us ensure that we can create successfully each SingleCellExperiment object from its individual slots:
all_files <- list.files(out_dir, full.names = TRUE) for (cell_type in cell_types) { print(cell_type) path_to_cts <- str_subset(all_files, glue("{cell_type}.*counts.h5")) path_to_processed <- str_subset(all_files, glue("{cell_type}.*processed.h5")) path_to_rowdata <- str_subset(all_files, glue("{cell_type}.*rowdata.rds")) path_to_coldata <- str_subset(all_files, glue("{cell_type}.*coldata.rds")) path_to_pca <- str_subset(all_files, glue("{cell_type}.*pca.rds")) path_to_umap <- str_subset(all_files, glue("{cell_type}.*umap.rds")) path_to_harmony <-str_subset(all_files, glue("{cell_type}.*harmony.rds")) counts <- HDF5Array::HDF5Array(path_to_cts, name = "counts") processed <- HDF5Array::HDF5Array(path_to_processed, name = "processed") row_data <- readRDS(path_to_rowdata) col_data <- readRDS(path_to_coldata) pca <- readRDS(path_to_pca) umap <- readRDS(path_to_umap) harmony <- readRDS(path_to_harmony) sce <- SingleCellExperiment::SingleCellExperiment( assays = list(counts = counts, logcounts = processed), colData = col_data, rowData = row_data ) SingleCellExperiment::reducedDims(sce) <- list( PCA = pca, UMAP = umap, HARMONY = harmony ) print(sce) #print(scater::plotUMAP(sce, colour_by = "annotation_20220215")) if (class(sce) != "SingleCellExperiment") { stop(glue("The output of {cell_type} is not a SingleCellExperiment!")) } }
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.