knitr::opts_chunk$set(echo = TRUE)
options(width = 100)

Introduction

Here we will download and process the Seurat objects associated with the tonsil cell atlas to prepare them for submission to ExperimentHub. We will follow a similar style and structure as the one used for the TabulaMurisSenisData pacakge.

Loading required packages

library(Seurat)
library(SingleCellExperiment)
library(scater)
library(HDF5Array)
library(rhdf5)
library(here)
library(glue)
library(readr)
library(dplyr)
library(stringr)
library(tools)

Downloading the Seurat objects from Zenodo

The Seurat objects derived from our tonsil atlas have been archived as .rds files in Zenodo under the record 6340174. To generate the URL's for each package we used the zenodo_get package as follows:

zenodo_get 10.5281/zenodo.6340174 -w tmp.txt
paste tmp.txt md5sums.txt | sed 's/\t/,/' | sed 's/ \{1,\}/,/g' > tonsil-atlas-zenodo-get-output.csv

Let us read and curate the "tonsil-atlas-rds-files.csv" to include the file name, URL, cell type and dataset for each object:

# Read data
path_to_csv1 <- here("inst/scripts/tonsil-atlas-zenodo-get-output.csv")
path_to_csv2 <- here("inst/scripts/tonsil-atlas-rds-files.csv")
files_df <- read.csv(file = path_to_csv1, header = FALSE)
colnames(files_df) <- c("url", "file_id", "file_name")


# Set cell type and dataset variables
files_df$cell_type <- files_df$file_name %>%
  str_remove("20220215_") %>%
  str_remove("_seurat_obj.*$")
files_df$cell_type[files_df$cell_type == "CD4_T"] <- "CD4-T"
files_df$cell_type[files_df$cell_type == "CD8_T"] <- "CD8-T"
files_df$cell_type[files_df$cell_type == "NBC_MBC"] <- "NBC-MBC"
files_df$cell_type[files_df$cell_type == "ILC_NK"] <- "ILC-NK"
files_df$cell_type[str_detect(files_df$cell_type, "tonsil")] <- "All"
files_df$dataset <- case_when(
  str_detect(files_df$file_name, "atac") ~ "ATAC",
  str_detect(files_df$file_name, "spatial") ~ "Spatial",
  str_detect(files_df$file_name, "cite") ~ "CITE",
  TRUE ~ "RNA"
)


# Reorder columns
new_order <- c("file_name", "file_id", "cell_type", "dataset", "url")
files_df <- files_df[, new_order]


# Rewrite
write.csv(files_df, file = path_to_csv2, row.names = FALSE, quote = FALSE)

Now that we have the URL we can download the files:

# Create data directories
data_dir <- here("inst/scripts/rna_raw_data")
out_dir <- here("inst/scripts/HCATonsilData/1.0/RNA")
dir.create(data_dir, recursive = TRUE, showWarnings = TRUE)
dir.create(out_dir, recursive = TRUE, showWarnings = TRUE)


# Read and filter list of files to download
files_df <- read.csv(file = path_to_csv2, header = TRUE)
files_df <- files_df[files_df$dataset == "RNA", ]


# Downlaod
options(timeout = 1440)
for (i in seq_len(nrow(files_df))) {
  out_file <- file.path(data_dir, files_df$file_name[i])
  if (!file.exists(out_file)) {
    download.file(url = files_df$url[i], destfile = out_file)
  }

  # Check MD5
  md5_original <- files_df$file_id[i]
  md5_download <- md5sum(out_file)
  if (!md5_original == md5_download) {
    stop("The file is corrupted!")
  }
}

Process Seurat objects to extract relevant slots

# Get gene information
genes_df <- read_tsv(
  here("inst/extdata/features.tsv.gz"),
  col_names = c("ensembl_id", "symbol", "type")
) %>%
  filter(type == "Gene Expression") %>%
  as.data.frame()
gene_ids <- genes_df$ensembl_id
names(gene_ids) <- genes_df$symbol


# Process
for (f in files_df$file_name) {
  print(f)

  # Read Seurat object
  seurat <- readRDS(file.path(data_dir, f))
  cell_type <- files_df[files_df$file_name == f, "cell_type"]
  dataset <- files_df[files_df$file_name == f, "dataset"]

  # Extract and save reductions
  if ("pca" %in% names(seurat@reductions)) {
    tmpmat <- Embeddings(seurat, "pca")
    path_to_save_dims <- file.path(out_dir, glue("{cell_type}_{dataset}_pca.rds"))
    saveRDS(tmpmat, path_to_save_dims)
  }
  if ("harmony" %in% names(seurat@reductions)) {
    tmpmat <- Embeddings(seurat, "harmony")
    path_to_save_dims <- file.path(out_dir, glue("{cell_type}_{dataset}_harmony.rds"))
    saveRDS(tmpmat, path_to_save_dims)
  }
  if ("umap" %in% names(seurat@reductions)) {
    tmpmat <- Embeddings(seurat, "umap")
    path_to_save_dims <- file.path(out_dir, glue("{cell_type}_{dataset}_umap.rds"))
    saveRDS(tmpmat, path_to_save_dims)
  }

  # Extract and save rowData and colData
  saveRDS(
    as(seurat@meta.data, "DataFrame"),
    file.path(out_dir, glue("{cell_type}_{dataset}_coldata.rds"))
  )
  rowdata <- data.frame(gene_name = rownames(seurat))
  rowdata$highly_variable <- ifelse(
    rowdata$gene %in% rownames(seurat[["RNA"]]@scale.data),
    TRUE,
    FALSE
  )
  rowdata$gene_id <- gene_ids[rowdata$gene_name]
  rowdata <- as(rowdata, "DataFrame")
  rownames(rowdata) <- rowdata$gene_name
  saveRDS(rowdata, file.path(out_dir, glue("{cell_type}_{dataset}_rowdata.rds")))

  # Extract and save raw and processed counts
  cts <- seurat[["RNA"]]@counts
  h5_file <- file.path(out_dir, glue("{cell_type}_{dataset}_counts.h5"))
  if (!file.exists(h5_file)) {
    mat_h5 <- writeHDF5Array(
      cts,
      filepath = h5_file,
      name = "counts",
      chunkdim = HDF5Array::getHDF5DumpChunkDim(dim(cts))
    )
  }
  mat <- seurat[["RNA"]]@data
  h5_file <- file.path(out_dir, glue("{cell_type}_{dataset}_processed.h5"))
  if (!file.exists(h5_file)) {
    mat_h5 <- writeHDF5Array(
      mat,
      filepath = h5_file,
      name = "processed",
      chunkdim = HDF5Array::getHDF5DumpChunkDim(dim(mat))
    )
  }

  # Remove Seurat object
  rm(seurat)
  gc()
}

Sanity checks

As a sanity check, let us ensure that we can create successfully each SingleCellExperiment object from its individual slots:

all_files <- list.files(out_dir, full.names = TRUE)
for (cell_type in files_df$cell_type) {
  print(cell_type)
  path_to_cts <- str_subset(all_files, glue("{cell_type}.*counts.h5"))
  path_to_processed <- str_subset(all_files, glue("{cell_type}.*processed.h5"))
  path_to_rowdata <- str_subset(all_files, glue("{cell_type}.*rowdata.rds"))
  path_to_coldata <- str_subset(all_files, glue("{cell_type}.*coldata.rds"))
  path_to_pca <- str_subset(all_files, glue("{cell_type}.*pca.rds"))
  path_to_umap <- str_subset(all_files, glue("{cell_type}.*umap.rds"))
  path_to_harmony <-str_subset(all_files, glue("{cell_type}.*harmony.rds"))
  counts <- HDF5Array::HDF5Array(path_to_cts, name = "counts")
  processed <- HDF5Array::HDF5Array(path_to_processed, name = "processed")
  row_data <- readRDS(path_to_rowdata)
  col_data <- readRDS(path_to_coldata)
  pca <- readRDS(path_to_pca)
  umap <- readRDS(path_to_umap)
  sce <- SingleCellExperiment::SingleCellExperiment(
    assays = list(counts = counts, logcounts = processed),
    colData = col_data,
    rowData = row_data
  )
  SingleCellExperiment::reducedDims(sce) <- list(
    PCA = pca,
    UMAP = umap
  )
  if (length(path_to_harmony) == 1) {
    harmony <- readRDS(path_to_harmony)
    SingleCellExperiment::reducedDim(sce, "HARMONY") <- harmony
  }
  print(sce)
  print(scater::plotUMAP(sce, colour_by = "annotation_20220215"))
  if (class(sce) != "SingleCellExperiment") {
    stop(glue("The output of {cell_type} is not a SingleCellExperiment!"))
  }
}

Session Information

sessionInfo()


massonix/HCATonsilData documentation built on May 7, 2024, 8:33 a.m.