knitr::opts_chunk$set(echo = TRUE) options(width = 100)
Here we will download and process the Seurat objects associated with the tonsil cell atlas to prepare them for submission to ExperimentHub
. We will follow a similar style and structure as the one used for the TabulaMurisSenisData
pacakge.
library(Seurat) library(SingleCellExperiment) library(scater) library(HDF5Array) library(rhdf5) library(here) library(glue) library(readr) library(dplyr) library(stringr) library(tools)
The Seurat objects derived from our tonsil atlas have been archived as .rds files in Zenodo under the record 6340174. To generate the URL's for each package we used the zenodo_get package as follows:
zenodo_get 10.5281/zenodo.6340174 -w tmp.txt paste tmp.txt md5sums.txt | sed 's/\t/,/' | sed 's/ \{1,\}/,/g' > tonsil-atlas-zenodo-get-output.csv
Let us read and curate the "tonsil-atlas-rds-files.csv" to include the file name, URL, cell type and dataset for each object:
# Read data path_to_csv1 <- here("inst/scripts/tonsil-atlas-zenodo-get-output.csv") path_to_csv2 <- here("inst/scripts/tonsil-atlas-rds-files.csv") files_df <- read.csv(file = path_to_csv1, header = FALSE) colnames(files_df) <- c("url", "file_id", "file_name") # Set cell type and dataset variables files_df$cell_type <- files_df$file_name %>% str_remove("20220215_") %>% str_remove("_seurat_obj.*$") files_df$cell_type[files_df$cell_type == "CD4_T"] <- "CD4-T" files_df$cell_type[files_df$cell_type == "CD8_T"] <- "CD8-T" files_df$cell_type[files_df$cell_type == "NBC_MBC"] <- "NBC-MBC" files_df$cell_type[files_df$cell_type == "ILC_NK"] <- "ILC-NK" files_df$cell_type[str_detect(files_df$cell_type, "tonsil")] <- "All" files_df$dataset <- case_when( str_detect(files_df$file_name, "atac") ~ "ATAC", str_detect(files_df$file_name, "spatial") ~ "Spatial", str_detect(files_df$file_name, "cite") ~ "CITE", TRUE ~ "RNA" ) # Reorder columns new_order <- c("file_name", "file_id", "cell_type", "dataset", "url") files_df <- files_df[, new_order] # Rewrite write.csv(files_df, file = path_to_csv2, row.names = FALSE, quote = FALSE)
Now that we have the URL we can download the files:
# Create data directories data_dir <- here("inst/scripts/rna_raw_data") out_dir <- here("inst/scripts/HCATonsilData/1.0/RNA") dir.create(data_dir, recursive = TRUE, showWarnings = TRUE) dir.create(out_dir, recursive = TRUE, showWarnings = TRUE) # Read and filter list of files to download files_df <- read.csv(file = path_to_csv2, header = TRUE) files_df <- files_df[files_df$dataset == "RNA", ] # Downlaod options(timeout = 1440) for (i in seq_len(nrow(files_df))) { out_file <- file.path(data_dir, files_df$file_name[i]) if (!file.exists(out_file)) { download.file(url = files_df$url[i], destfile = out_file) } # Check MD5 md5_original <- files_df$file_id[i] md5_download <- md5sum(out_file) if (!md5_original == md5_download) { stop("The file is corrupted!") } }
# Get gene information genes_df <- read_tsv( here("inst/extdata/features.tsv.gz"), col_names = c("ensembl_id", "symbol", "type") ) %>% filter(type == "Gene Expression") %>% as.data.frame() gene_ids <- genes_df$ensembl_id names(gene_ids) <- genes_df$symbol # Process for (f in files_df$file_name) { print(f) # Read Seurat object seurat <- readRDS(file.path(data_dir, f)) cell_type <- files_df[files_df$file_name == f, "cell_type"] dataset <- files_df[files_df$file_name == f, "dataset"] # Extract and save reductions if ("pca" %in% names(seurat@reductions)) { tmpmat <- Embeddings(seurat, "pca") path_to_save_dims <- file.path(out_dir, glue("{cell_type}_{dataset}_pca.rds")) saveRDS(tmpmat, path_to_save_dims) } if ("harmony" %in% names(seurat@reductions)) { tmpmat <- Embeddings(seurat, "harmony") path_to_save_dims <- file.path(out_dir, glue("{cell_type}_{dataset}_harmony.rds")) saveRDS(tmpmat, path_to_save_dims) } if ("umap" %in% names(seurat@reductions)) { tmpmat <- Embeddings(seurat, "umap") path_to_save_dims <- file.path(out_dir, glue("{cell_type}_{dataset}_umap.rds")) saveRDS(tmpmat, path_to_save_dims) } # Extract and save rowData and colData saveRDS( as(seurat@meta.data, "DataFrame"), file.path(out_dir, glue("{cell_type}_{dataset}_coldata.rds")) ) rowdata <- data.frame(gene_name = rownames(seurat)) rowdata$highly_variable <- ifelse( rowdata$gene %in% rownames(seurat[["RNA"]]@scale.data), TRUE, FALSE ) rowdata$gene_id <- gene_ids[rowdata$gene_name] rowdata <- as(rowdata, "DataFrame") rownames(rowdata) <- rowdata$gene_name saveRDS(rowdata, file.path(out_dir, glue("{cell_type}_{dataset}_rowdata.rds"))) # Extract and save raw and processed counts cts <- seurat[["RNA"]]@counts h5_file <- file.path(out_dir, glue("{cell_type}_{dataset}_counts.h5")) if (!file.exists(h5_file)) { mat_h5 <- writeHDF5Array( cts, filepath = h5_file, name = "counts", chunkdim = HDF5Array::getHDF5DumpChunkDim(dim(cts)) ) } mat <- seurat[["RNA"]]@data h5_file <- file.path(out_dir, glue("{cell_type}_{dataset}_processed.h5")) if (!file.exists(h5_file)) { mat_h5 <- writeHDF5Array( mat, filepath = h5_file, name = "processed", chunkdim = HDF5Array::getHDF5DumpChunkDim(dim(mat)) ) } # Remove Seurat object rm(seurat) gc() }
As a sanity check, let us ensure that we can create successfully each SingleCellExperiment object from its individual slots:
all_files <- list.files(out_dir, full.names = TRUE) for (cell_type in files_df$cell_type) { print(cell_type) path_to_cts <- str_subset(all_files, glue("{cell_type}.*counts.h5")) path_to_processed <- str_subset(all_files, glue("{cell_type}.*processed.h5")) path_to_rowdata <- str_subset(all_files, glue("{cell_type}.*rowdata.rds")) path_to_coldata <- str_subset(all_files, glue("{cell_type}.*coldata.rds")) path_to_pca <- str_subset(all_files, glue("{cell_type}.*pca.rds")) path_to_umap <- str_subset(all_files, glue("{cell_type}.*umap.rds")) path_to_harmony <-str_subset(all_files, glue("{cell_type}.*harmony.rds")) counts <- HDF5Array::HDF5Array(path_to_cts, name = "counts") processed <- HDF5Array::HDF5Array(path_to_processed, name = "processed") row_data <- readRDS(path_to_rowdata) col_data <- readRDS(path_to_coldata) pca <- readRDS(path_to_pca) umap <- readRDS(path_to_umap) sce <- SingleCellExperiment::SingleCellExperiment( assays = list(counts = counts, logcounts = processed), colData = col_data, rowData = row_data ) SingleCellExperiment::reducedDims(sce) <- list( PCA = pca, UMAP = umap ) if (length(path_to_harmony) == 1) { harmony <- readRDS(path_to_harmony) SingleCellExperiment::reducedDim(sce, "HARMONY") <- harmony } print(sce) print(scater::plotUMAP(sce, colour_by = "annotation_20220215")) if (class(sce) != "SingleCellExperiment") { stop(glue("The output of {cell_type} is not a SingleCellExperiment!")) } }
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.