start_time <- Sys.time() quiet_library <- function(...) { suppressPackageStartupMessages(library(...)) } quiet_library(rhdf5) quiet_library(BarMixer) quiet_library(Matrix) quiet_library(ggplot2) quiet_library(cowplot)
Declaring start
stm("Merging .h5 files") stm(paste0("Using BarMixer v", installed.packages()["BarMixer","Version"]))
in_dir <- params$in_dir out_dir <- params$out_dir stm(paste0("IN H5 directory : ", in_dir)) stm(paste0("OUT Directory : ", out_dir))
print(c( paste0("IN H5 directory : ", in_dir), paste0("OUT Directory : ", out_dir) ))
in_file_names <- list.files(in_dir, pattern = "_.+h5$") # Check for per-well directories if(length(in_file_names) == 0) { in_file_dirs <- list.dirs(in_dir, recursive = FALSE) in_file_names <- unlist(lapply(in_file_dirs, list.files, pattern = "_.+h5$")) in_file_paths <- unlist(lapply(in_file_dirs, list.files, pattern = "_.+h5$", full.names = TRUE)) } else { in_file_paths <- list.files(in_dir, pattern = "_.+h5$", full.names = TRUE) } stm(paste0("Found ", length(in_file_paths), " .h5 files for processing")) print(paste0("Found ", length(in_file_paths), " .h5 files for processing"))
in_file_hashes <- sub(".h5$","",sub(".+_","",in_file_names)) unique_hashes <- unique(in_file_hashes) stm(paste0("Identified ", length(unique_hashes), " hash patterns: ", paste(unique_hashes, collapse = ", ")))
meta_list <- list() for(i in seq_along(unique_hashes)) { current_hash <- unique_hashes[i] stm(paste0("Merging for hash ", current_hash)) current_files <- in_file_paths[in_file_hashes == current_hash] for(f in current_files) { if(file.exists(f)) { stm(paste0("IN H5 file : ", f)) } else { stm(paste0("MISSING H5 file : ", f)) current_files <- current_files[current_files != f] } } if(length(current_files) > 1) { h5_ll <- lapply(current_files, h5dump) h5_list <- reduce_h5_list(h5_ll) } else { h5_list <- h5dump(current_files) } meta_list[[current_hash]] <- h5_list_cell_metadata(h5_list) if("pool_id" %in% names(meta_list[[current_hash]])) { current_pool <- meta_list[[current_hash]]$pool_id[1] } else { current_pool <- "merged" } if(current_hash == "multiplet") { out_file <- file.path(out_dir, paste0(current_pool, "_", current_hash, ".h5")) } else { current_sample_id <- meta_list[[current_hash]]$sample_id[1] out_file <- file.path(out_dir, paste0(current_pool, "_", current_sample_id, ".h5")) } stm(paste0("Writing HDF5 to : ", out_file)) write_h5_list(h5_list, out_file, overwrite = TRUE) }
stm("Generating QC Metrics and Plots for Report") all_meta <- do.call("rbind", meta_list) all_meta <- as.data.table(all_meta) if("sample_id" %in% names(all_meta)) { all_meta$plot_barcode <- paste0(all_meta$hto_barcode, "\n", all_meta$sample_id) hto_barcode_to_sample_id <- data.frame(hto_barcode = unique(all_meta$hto_barcode), sample_id = unique(all_meta$sample_id)) } else { all_meta$sample_id <- "No PBMC ID" all_meta$plot_barcode <- all_meta$hto_barcode hto_barcode_to_sample_id <- data.frame(sample_id = "No PBMC ID", hto_barcode = unique(all_meta$hto_barcode)) } singlet_meta <- all_meta[hto_category == "singlet"] hto_barcode_to_sample_id <- as.data.table(hto_barcode_to_sample_id) qc_list <- list(report_type = "merge_h5_by_hash", report_datetime = as.character(start_time), report_uuid = ids::uuid(use_time = TRUE), package = "BarMixer", package_version = sessionInfo()$otherPkgs$BarMixer$Version) out_json <- file.path(out_dir, paste0(current_pool, "_final_h5_metrics.json")) stm(paste0("OUT JSON : ", out_json)) print(paste0("OUT JSON : ", out_json))
overview_metrics <- all_meta[,.(n_cells = nrow(.SD), median_reads = median(n_reads), median_umis = median(n_umis), median_genes = median(n_genes))] qc_list$pool_stats <- as.list(overview_metrics) qc_table(overview_metrics)
hto_category_metrics <- all_meta[,.(n_cells = nrow(.SD), median_reads = as.numeric(median(n_reads)), median_umis = as.numeric(median(n_umis)), median_genes = as.numeric(median(n_genes))), by = hto_category] qc_list$hto_category_stats <- lapply(1:nrow(hto_category_metrics), function(x) { list(n_cells = hto_category_metrics$n_cells[x], median_reads = hto_category_metrics$median_reads[x], median_umis = hto_category_metrics$median_umis[x], median_genes = hto_category_metrics$median_genes[x]) }) names(qc_list$hto_category_stats) <- hto_category_metrics$hto_category qc_table(hto_category_metrics)
well_metrics <- all_meta[,.(n_cells = nrow(.SD), median_reads = as.numeric(median(n_reads)), median_umis = as.numeric(median(n_umis)), median_genes = as.numeric(median(n_genes))), by = well_id] for(well in well_metrics$well_id) { qc_list$well_stats[[well]] <- list(n_cells = well_metrics$n_cells[well_metrics$well_id == well], median_reads = well_metrics$median_reads[well_metrics$well_id == well], median_umis = well_metrics$median_umis[well_metrics$well_id == well], median_genes = well_metrics$median_genes[well_metrics$well_id == well]) } knitr::kable(well_metrics, caption = "Well Metrics Overview")
qc_violin_plot(all_meta, category_x = "well_id", name_x = "Well ID", column_y = "n_reads", name_y = "N Reads per Cell", fill = "dodgerblue") + ggtitle("Reads per Cell")
qc_violin_plot(all_meta, category_x = "well_id", name_x = "Well ID", column_y = "n_umis", name_y = "N UMIs per Cell", fill = "purple") + ggtitle("UMIs per Cell")
qc_violin_plot(all_meta, category_x = "well_id", name_x = "Well ID", column_y = "n_genes", name_y = "N Genes per Cell", fill = "orangered") + ggtitle("Genes per Cell")
hto_category_by_well <- all_meta[, .(n_cells = nrow(.SD)), by = list(well_id, hto_category)] hto_category_count_by_well <- dcast(hto_category_by_well, formula = hto_category ~ well_id, value.var = "n_cells") for(well in names(hto_category_count_by_well)[-1]) { hto_cat_list <- as.list(hto_category_count_by_well[[well]]) names(hto_cat_list) <- paste0("n_",hto_category_count_by_well$hto_category) qc_list$well_stats[[well]] <- c(qc_list$well_stats[[well]], hto_cat_list) } qc_table(hto_category_count_by_well)
qc_aligned_barplot(all_meta, category_x = "well_id", name_x = "Well ID", category_y = "hto_category", category_name = "HTO Category", colorset_y = "varibow", name_y = "N Cells", padding = 0.2) + ggtitle("HTO Category Counts per Well")
hto_category_by_well <- hto_category_by_well[, frac_cells := round(n_cells/sum(n_cells),4), by = well_id] hto_category_frac_by_well <- dcast(hto_category_by_well, formula = hto_category ~ well_id, value.var = "frac_cells") qc_table(hto_category_frac_by_well)
qc_stacked_barplot(all_meta, category_x = "well_id", name_x = "Well ID", category_y = "hto_category", category_name = "HTO Category", colorset_y = "varibow", name_y = "Fraction of Cells", as_fraction = TRUE) + ggtitle("HTO Category Fraction per Well")
hto_barcode_by_well <- singlet_meta[, .(n_cells = nrow(.SD)), by = list(well_id, hto_barcode)] hto_barcode_count_by_well <- dcast(hto_barcode_by_well, formula = hto_barcode ~ well_id, value.var = "n_cells") hto_barcode_count_by_well <- hto_barcode_to_sample_id[hto_barcode_count_by_well, on = "hto_barcode"] for(well in names(hto_category_count_by_well)[-1]) { sample_list <- as.list(hto_barcode_count_by_well[[well]]) names(sample_list) <- hto_barcode_count_by_well$sample_id qc_list$well_stats[[well]]$sample_singlet_counts <- sample_list } for(sample_id in hto_barcode_count_by_well$sample_id) { sample_row <- hto_barcode_count_by_well[hto_barcode_count_by_well$sample_id == sample_id, ,drop = FALSE] sample_row <- sample_row[,-c(1,2)] qc_list$sample_stats[[sample_id]]$well_counts <- as.list(sample_row) } qc_table(hto_barcode_count_by_well)
qc_aligned_barplot(singlet_meta, category_x = "well_id", name_x = "Well ID", category_y = "plot_barcode", category_name = "HTO Barcode", colorset_y = "varibow", name_y = "N Cells", padding = 0.2) + ggtitle("HTO Barcode Distribution in Wells")
qc_aligned_barplot(singlet_meta, category_x = "plot_barcode", name_x = "HTO Barcode", category_y = "well_id", category_name = "Well ID", colorset_y = "varibow", name_y = "N Cells", padding = 0.2) + ggtitle("Well Distribution in HTO Barcodes")
hto_barcode_by_well <- hto_barcode_by_well[, frac_cells := round(n_cells/sum(n_cells),4), by = well_id] hto_barcode_frac_by_well <- dcast(hto_barcode_by_well, formula = hto_barcode ~ well_id, value.var = "frac_cells") hto_barcode_frac_by_well <- hto_barcode_to_sample_id[hto_barcode_frac_by_well, on = "hto_barcode"] qc_table(hto_barcode_frac_by_well)
qc_stacked_barplot(singlet_meta, category_x = "well_id", name_x = "Well ID", category_y = "plot_barcode", category_name = "HTO Barcode", colorset_y = "varibow", name_y = "Fraction of Cells", as_fraction = TRUE) + ggtitle("HTO Barcode Fraction per Well")
qc_stacked_barplot(singlet_meta, category_x = "plot_barcode", category_y = "well_id", category_name = "Well ID", name_x = "HTO Barcode", colorset_y = "varibow", name_y = "Fraction of Cells", as_fraction = TRUE) + ggtitle("Well Fraction per HTO Barcode")
hto_barcode_metrics <- singlet_meta[,.(n_cells = nrow(.SD), median_reads = as.numeric(median(n_reads)), median_umis = as.numeric(median(n_umis)), median_genes = as.numeric(median(n_genes))), by = hto_barcode] hto_barcode_metrics <- hto_barcode_to_sample_id[hto_barcode_metrics, on = "hto_barcode"] for(sample_id in hto_barcode_metrics$sample_id) { sample_row <- hto_barcode_metrics[hto_barcode_metrics$sample_id == sample_id, , drop = FALSE] qc_list$sample_stats[[sample_id]] <- as.list(sample_row[, -2]) } qc_table(hto_barcode_metrics)
category_reads_violins <- qc_violin_plot(all_meta, category_x = "hto_category", name_x = "HTO Category", column_y = "n_reads", name_y = "N Reads per Cell", fill = "dodgerblue") barcode_reads_violins <- qc_violin_plot(all_meta[all_meta$hto_category == "singlet",], category_x = "plot_barcode", name_x = "HTO Barcode (singlets)", column_y = "n_reads", name_y = "N Reads per Cell", fill = "dodgerblue") + ggtitle("Reads per Cell") reads_violin_list <- list(category_reads_violins, barcode_reads_violins) plot_grid(plotlist = reads_violin_list, ncol = 2, rel_widths = c(1, 3), nrow = 1, align = "h")
reads_hto_barcode_by_well <- singlet_meta[, .(median_reads = median(n_reads)), by = list(well_id, hto_barcode)] reads_hto_barcode_by_well <- dcast(reads_hto_barcode_by_well, formula = hto_barcode ~ well_id, value.var = "median_reads") reads_hto_barcode_by_well <- hto_barcode_to_sample_id[reads_hto_barcode_by_well, on = "hto_barcode"] for(well in names(reads_hto_barcode_by_well)[-c(1,2)]) { sample_list <- as.list(reads_hto_barcode_by_well[[well]]) names(sample_list) <- reads_hto_barcode_by_well$sample_id qc_list$well_stats[[well]]$sample_mapped_reads_median <- sample_list } for(sample_id in reads_hto_barcode_by_well$sample_id) { sample_row <- reads_hto_barcode_by_well[reads_hto_barcode_by_well$sample_id == sample_id, ,drop = FALSE] sample_row <- sample_row[,-c(1,2)] qc_list$sample_stats[[sample_id]]$well_mapped_reads_median <- as.list(sample_row) } qc_table(reads_hto_barcode_by_well)
category_umis_violins <- qc_violin_plot(all_meta, category_x = "hto_category", name_x = "HTO Category", column_y = "n_umis", name_y = "N UMIs per Cell", fill = "purple") barcode_umis_violins <- qc_violin_plot(all_meta[all_meta$hto_category == "singlet",], category_x = "plot_barcode", name_x = "HTO Barcode (singlets)", column_y = "n_umis", name_y = "N UMIs per Cell", fill = "purple") + ggtitle("UMIs per Cell") umis_violin_list <- list(category_umis_violins, barcode_umis_violins) plot_grid(plotlist = umis_violin_list, ncol = 2, rel_widths = c(1, 3), nrow = 1, align = "h")
umis_hto_barcode_by_well <- singlet_meta[, .(median_umis = median(n_umis)), by = list(well_id, hto_barcode)] umis_hto_barcode_by_well <- dcast(umis_hto_barcode_by_well, formula = hto_barcode ~ well_id, value.var = "median_umis") umis_hto_barcode_by_well <- hto_barcode_to_sample_id[umis_hto_barcode_by_well, on = "hto_barcode"] for(well in names(umis_hto_barcode_by_well)[-c(1,2)]) { sample_list <- as.list(umis_hto_barcode_by_well[[well]]) names(sample_list) <- umis_hto_barcode_by_well$sample_id qc_list$well_stats[[well]]$sample_umis_median <- sample_list } for(sample_id in umis_hto_barcode_by_well$sample_id) { sample_row <- umis_hto_barcode_by_well[umis_hto_barcode_by_well$sample_id == sample_id, ,drop = FALSE] sample_row <- sample_row[,-c(1,2)] qc_list$sample_stats[[sample_id]]$well_umis_median <- as.list(sample_row) } qc_table(umis_hto_barcode_by_well)
category_genes_violins <- qc_violin_plot(all_meta, category_x = "hto_category", name_x = "HTO Category", column_y = "n_genes", name_y = "N Genes per Cell", fill = "orangered") barcode_genes_violins <- qc_violin_plot(all_meta[all_meta$hto_category == "singlet",], category_x = "plot_barcode", name_x = "HTO Barcode (singlets)", column_y = "n_genes", name_y = "N Genes per Cell", fill = "orangered") + ggtitle("Genes per Cell") genes_violin_list <- list(category_genes_violins, barcode_genes_violins) plot_grid(plotlist = genes_violin_list, ncol = 2, rel_widths = c(1, 3), nrow = 1, align = "h")
genes_hto_barcode_by_well <- singlet_meta[, .(median_genes = median(n_genes)), by = list(well_id, hto_barcode)] genes_hto_barcode_by_well <- dcast(genes_hto_barcode_by_well, formula = hto_barcode ~ well_id, value.var = "median_genes") genes_hto_barcode_by_well <- hto_barcode_to_sample_id[genes_hto_barcode_by_well, on = "hto_barcode"] for(well in names(genes_hto_barcode_by_well)[-c(1,2)]) { sample_list <- as.list(genes_hto_barcode_by_well[[well]]) names(sample_list) <- genes_hto_barcode_by_well$sample_id qc_list$well_stats[[well]]$sample_genes_median <- sample_list } for(sample_id in genes_hto_barcode_by_well$sample_id) { sample_row <- genes_hto_barcode_by_well[genes_hto_barcode_by_well$sample_id == sample_id, ,drop = FALSE] sample_row <- sample_row[,-c(1,2)] qc_list$sample_stats[[sample_id]]$well_genes_median <- as.list(sample_row) } qc_table(genes_hto_barcode_by_well)
stm(paste0("Writing JSON to ", out_json)) qc_list_json <- jsonlite::toJSON(qc_list, auto_unbox = TRUE, pretty = TRUE) writeLines(qc_list_json, out_json)
sessionInfo()
Total time elapsed
end_time <- Sys.time() diff_time <- end_time - start_time time_message <- paste0("Elapsed Time: ", round(diff_time, 3), " ", units(diff_time)) print(time_message) stm(time_message) stm("H5 merge process complete.")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.