Contents

Data Processing

QC Metrics and Plots

Session Info

Data Processing

Session Preparation

Load libraries:

start_time <- Sys.time()

quiet_library <- function(...) {
  suppressPackageStartupMessages(library(...))
}
quiet_library(BarMixer)
quiet_library(ggplot2)
quiet_library(cowplot)

Declaring start

stm("Starting HTO count processing")
stm(paste0("Using BarMixer v", installed.packages()["BarMixer","Version"]))

Argument parsing

in_file <- params$in_file
in_samples  <- params$in_samples
in_well <- params$in_well
if(is.null(params$in_min_cutoff)) {
  in_min_cutoff <- "auto"
} else {
  in_min_cutoff <- as.numeric(params$in_min_cutoff)
}
if(is.null(params$in_eel)) {
  in_eel <- TRUE
} else {
  in_eel <- as.logical(params$in_eel)
}
out_dir <- params$out_dir

out_mat <- file.path(out_dir, paste0(in_well, "_hto_count_matrix.csv.gz"))
out_tbl <- file.path(out_dir, paste0(in_well, "_hto_category_table.csv.gz"))
out_json <- file.path(out_dir, paste0(in_well, "_hto_processing_metrics.json"))

stm(paste0("IN  HTO Tag_Counts.csv : ", in_file))
stm(paste0("IN  SampleSheet.csv    : ", in_samples))
stm(paste0("IN  Well               : ", in_well))
stm(paste0("IN  Min. Cutoff        : ", in_min_cutoff))
stm(paste0("IN  Expect equal load  : ", in_eel))
stm(paste0("OUT Count matrix       : ", out_mat))
stm(paste0("OUT Category table     : ", out_tbl))
stm(paste0("OUT JSON metrics       : ", out_json))
print(c(
  paste0("IN  Tag_Counts.csv   : ", in_file),
  paste0("IN  HTO sample sheet : ", in_samples),
  paste0("IN  Well             : ", in_well),
  paste0("IN  Min. Cutoff      : ", in_min_cutoff),
  paste0("IN  Expect equal load: ", in_eel),
  paste0("OUT Count matrix     : ", out_mat),
  paste0("OUT Category table   : ", out_tbl),
  paste0("OUT JSON metrics     : ", out_json)
))

Return to Contents

Load Input Datasets

Check Input Files

if(!file.exists(in_file)) {
  stm(paste0("ERROR: Cannot find IN Tag_Counts.csv:", in_file))
  stop()
} else {
  stm(paste0("Found Tag Counts file:", in_file))
}

if(!file.exists(in_samples)) {
  stm(paste0("ERROR: Cannot find IN SampleSheet.csv:", in_samples))
  stop()
} else {
  stm(paste0("Found SampleSheet.csv:", in_samples))
}

Sample Sheet

in_hto_key <- read.csv(in_samples, header = TRUE, stringsAsFactors = FALSE)

qc_table(in_hto_key)

Read input table/matrix

in_hto_table <- fread(in_file)

Return to Contents

Process HTO Counts

Convert input data to matrix

stm("Formatting HTO matrix and filtering for valid barcodes")

keep_cols <- colnames(in_hto_table)[colnames(in_hto_table) %in% in_hto_key$hash_name]
in_hto_mat <- t(as.matrix(in_hto_table[, ..keep_cols]))
in_hto_mat <- as(in_hto_mat, "dgCMatrix")
colnames(in_hto_mat) <- in_hto_table$cell_barcode
rownames(in_hto_mat) <- in_hto_key$hash_tag[match(rownames(in_hto_mat), in_hto_key$hash_name)]

in_hto_mat <- add_missing_hto_rows(in_hto_mat,
                                   valid_htos = in_hto_key$hash_tag)
in_hto_mat <- in_hto_mat[,colSums(in_hto_mat) > 1]

Call positive and negative HTOs

stm("Binarizing HTO counts to call positive and negative hashes")
if(in_min_cutoff == "auto") {
  binary_results <- binarize_hash_matrix(in_hto_mat,
                                         valid_htos = in_hto_key$hash_tag,
                                         use_median_cut = TRUE,
                                         expect_equal_loading = in_eel)
} else {
  binary_results <- binarize_hash_matrix(in_hto_mat,
                                         valid_htos = in_hto_key$hash_tag,
                                         use_median_cut = FALSE,
                                         min_cut = as.numeric(in_min_cutoff),
                                         expect_equal_loading = in_eel)
}

Convert hashes to category calls

stm("Converting binary results to categories")
category_results <- categorize_binary_hash_matrix(binary_results$bmat)

Return to Contents

Write Outputs

Write count matrix

stm(paste0("Writing count matrix to ", out_mat))
out_hto_mat <- as(in_hto_mat, "matrix")
out_hto_mat <- as.data.table(out_hto_mat)
out_hto_mat <- cbind(data.table(hto_barcode = rownames(in_hto_mat)),
                     out_hto_mat)
fwrite(out_hto_mat,
       file = out_mat)

Write category table

stm(paste0("Writing category table to ", out_tbl))

hto_barcodes_to_sample_ids <- function(x,
                                       bc_key) {
  x <- as.character(x)
  res <- character()
  for(i in seq_along(x)) {
    bcs <- unlist(strsplit(x[i], ";")[[1]])
    res[i] <- paste(bc_key$sample_id[match(bcs, bc_key$hash_tag)],
                    collapse = ";")
  }
  res
}

hash_category_table <- category_results$hash_category_table
hash_category_table$sample_id <- hto_barcodes_to_sample_ids(hash_category_table$hto_barcode,
                                                            in_hto_key)

rownames(hash_category_table) <- NULL

fwrite(hash_category_table,
       file = out_tbl)

Return to Contents

QC Tables and Plots

qc_list <- list(report_type = "hto_processing",
                report_datetime = as.character(start_time),
                report_uuid = ids::uuid(use_time = TRUE),
                package = "HTOparser",
                package_version = sessionInfo()$otherPkgs$HTOparser$Version,
                well_id = in_well)

Hash Count Summary

bsummary <- binary_results$bsummary
bsummary$sample_id <- in_hto_key$sample_id[match(bsummary$hto_barcode, in_hto_key$hash_tag)]
bsummary <- setcolorder(bsummary,
                        c("sample_id","hto_barcode",
                          "cutoff","n_pos","n_neg","n_below_threshold",
                          "frac_pos","frac_neg","frac_below_threshold"))
rownames(bsummary) <- NULL

bsummary$hash_tag <- in_hto_key$hash_name[match(bsummary$hto_barcode, in_hto_key$hash_tag)]

qc_list$sample_hto_stats <- lapply(1:nrow(bsummary),
                                   function(x) {
                                     list(hto_barcode = as.character(bsummary$hto_barcode[x]),
                                          hash_tag = as.character(bsummary$hash_tag[x]),
                                          cutoff = bsummary$cutoff[x],
                                          n_pos = bsummary$n_pos[x],
                                          n_neg = bsummary$n_neg[x],
                                          n_below_threshold = bsummary$n_below_threshold[x],
                                          frac_pos = bsummary$frac_pos[x],
                                          frac_neg = bsummary$frac_neg[x],
                                          frac_below_threshold = bsummary$frac_below_threshold[x])
                                   })
n_no_sample_id <- sum(is.na(bsummary$sample_id))
if(n_no_sample_id > 0) {
  bsummary$sample_id[is.na(bsummary$sample_id)] <- paste0("NA_",1:n_no_sample_id)
}
names(qc_list$sample_hto_stats) <- bsummary$sample_id

qc_table(bsummary)

Return to Contents

Generate plots

stm("Generating plots for report")
plot_list <- lapply(
  1:nrow(in_hto_key),
  function(x) {
    barcode <- in_hto_key$hash_tag[x]
    sample_id <- in_hto_key$sample_id[x]
    vals <- in_hto_mat[barcode,]

    cutoff <- bsummary$cutoff[bsummary$hto_barcode == barcode]

    n_vals_below_min_cut <- sum(vals <= 5)
    vals_above_min_cut <- vals[vals > 5]

    if(length(vals_above_min_cut) > 0) {

      val_df <- data.frame(vals = log10(vals_above_min_cut))
      cut_df <- data.frame(cutoff = log10(cutoff))

      ggplot() +
        geom_histogram(data = val_df,
                       aes(x = vals),
                       fill = "darkgreen",
                       binwidth = 0.05) +
        geom_vline(data = cut_df,
                   aes(xintercept = cutoff),
                   size = 1,
                   color = "dodgerblue") +
        scale_x_continuous("log10(HTO Count)", 
                           limits = c(0,5)) +
        scale_fill_identity() +
        scale_color_identity() +
        ggtitle(paste(sample_id, "|", barcode)) +
        theme_bw(base_size = 8)
    } else {
      text_df <- data.frame(x = 2.5,
                            y = 0.5,
                            label = "Hash not detected")

      ggplot() +
        geom_text(data = text_df,
                  aes(x = x,
                      y = y,
                      label = label)) +
        scale_x_continuous("log10(HTO Count)", 
                           limits = c(0,5)) +
        scale_y_continuous("count",
                           limits = c(0,1)) +
        ggtitle(paste(sample_id, "|", barcode)) +
        theme_bw(base_size = 8)
    }
  }
)

Hash Count Distribution Plots

suppressWarnings(
  plot_grid(plotlist = plot_list,
            nrow = ceiling(length(plot_list) / 2),
            ncol = 2)
)

Return to Contents

Hash Category Summary

rownames(category_results$hash_summary) <- NULL

qc_list$hto_category_stats <- lapply(1:nrow(category_results$hash_summary),
                                     function(x) {
                                       list(n_cells = category_results$hash_summary$n_category[x],
                                            frac_cells = category_results$hash_summary$frac_category[x])
                                     })
names(qc_list$hto_category_stats) <- category_results$hash_summary$hto_category

qc_table(category_results$hash_summary)

Return to Contents

Sample/HTO Singlet Counts

singlet_summary <- make_singlet_summary(category_results$hash_category_table,
                                        valid_htos = in_hto_key$hash_tag)
singlet_summary$sample_id <- in_hto_key$sample_id[match(singlet_summary$hto_barcode, in_hto_key$hash_tag)]
singlet_summary <- setcolorder(singlet_summary,
                               c("sample_id",
                                 "hto_barcode",
                                 "n_singlets",
                                 "frac_singlets"))

n_no_sample_id <- sum(is.na(singlet_summary$sample_id))
if(n_no_sample_id > 0) {
  singlet_summary$sample_id[is.na(singlet_summary$sample_id)] <- paste0("NA_",1:n_no_sample_id)
}

for(sample_id in singlet_summary$sample_id) {
  qc_list$sample_hto_stats[[sample_id]]$n_singlets <- singlet_summary$n_singlets[singlet_summary$sample_id == sample_id]
}

qc_table(singlet_summary)

Return to Contents

Sample/HTO Singlet Counts Plot

singlet_plot_data <- singlet_summary
singlet_plot_data$x <- 1:nrow(singlet_summary)

ggplot() +
  geom_rect(data = singlet_plot_data,
            aes(xmin = x - 0.4, xmax = x + 0.4,
                ymin = 0, ymax = n_singlets)) +
  geom_text(data = singlet_plot_data,
            aes(x = x,
                y = n_singlets * 1.02,
                label = n_singlets),
            vjust = 0,
            size = 3) +
  ggtitle("Singlet Counts") +
  scale_x_continuous("",
                     breaks = singlet_plot_data$x,
                     labels = paste0(singlet_plot_data$hto_barcode,
                                     "\n",
                                     singlet_plot_data$sample_id)) +
  scale_y_continuous("N Singlets") +
  theme_bw(base_size = 8) +
  theme(axis.text.x = element_text(angle = 90,
                                   hjust = 1,
                                   vjust = 0.3))

Return to Contents

Write QC JSON

stm(paste0("Writing JSON to ", out_json))
qc_list_json <- jsonlite::toJSON(qc_list,
                                 auto_unbox = TRUE,
                                 pretty = TRUE)
writeLines(qc_list_json,
           out_json)

Return to Contents

Session Information

sessionInfo()

Total time elapsed

end_time <- Sys.time()
diff_time <- end_time - start_time
time_message <- paste0("Elapsed Time: ", 
                       round(diff_time, 3),
                       " ", units(diff_time))
print(time_message)
stm(time_message)
stm("HTO count processing complete.")

Return to Contents



AllenInstitute/BarMixer documentation built on Dec. 17, 2021, 8:42 a.m.