Contents

Data Processing

QC Metrics and Plots

Session Info

Data Processing

Session Preparation

Load libraries:

start_time <- Sys.time()

quiet_library <- function(...) {
  suppressPackageStartupMessages(library(...))
}
quiet_library(rhdf5)
quiet_library(BarMixer)
quiet_library(Matrix)
quiet_library(ggplot2)
quiet_library(cowplot)
quiet_library(jsonlite)

Declaring start

stm("Starting .h5 metadata analysis")
stm(paste0("Using BarMixer v", installed.packages()["BarMixer","Version"]))

Argument parsing

in_tenx <- params$in_tenx
in_well <- params$in_well
in_sample <- params$in_sample
out_dir <- params$out_dir

# Check Well ID format
if(grepl("^[A-Z|-]+[0-9]+-.?P[0-9]C[0-9]W[0-9]$", in_well)) {
  in_well <- sub("-.P","-P",in_well)
  well_type <- "aifi"
} else {
  well_type <- "other"
}

stm(paste0("IN  10x outs/       : ", in_tenx))
stm(paste0("IN  Well ID         : ", in_well))
stm(paste0("OUT H5 directory    : ", out_dir))

Input Parameters

print(c(
  paste0("IN  10x outs/       : ", in_tenx),
  paste0("IN  Well ID         : ", in_well),
  paste0("OUT H5 directory    : ", out_dir)
))

Check Input Files

in_h5 <- file.path(in_tenx, "filtered_feature_bc_matrix.h5")

if(!file.exists(in_h5)) {
  stm(paste0("ERROR: Cannot find IN H5 file:", in_h5))
  stop()
} else {
  stm(paste0("Found matrix file:", in_h5))
}

in_mol <- file.path(in_tenx, "molecule_info.h5")

if(!file.exists(in_mol)) {
  stm(paste0("ERROR: Cannot find IN Mol Info H5 file:", in_mol))
  stop()
} else {
  stm(paste0("Found mol_info file:", in_mol))
}

in_sum <- file.path(in_tenx, "metrics_summary.csv")

if(!file.exists(in_sum)) {
  stm(paste0("ERROR: Cannot find IN Metrics Summary file:", in_sum))
  stop()
} else {
  stm(paste0("Found metrics file:", in_sum))
}

Create out directory if missing

if(!dir.exists(out_dir)) {
  stm(paste0("Creating Output Directory: ",out_dir))
  dir.create(out_dir, 
             recursive = TRUE)
}

Return to Contents

Load inputs

Load scRNA-seq Dataset

stm(paste0("Loading HDF5 from ", in_h5))
h5_list <- h5dump(in_h5)

Set output files

out_h5 <- file.path(out_dir, paste0(in_well, ".h5"))
out_json <- sub(".h5$","_well_metrics.json",out_h5)

stm(paste0("OUT H5 file         : ", out_h5))
stm(paste0("OUT JSON file       : ", out_json))

print(c(
  paste0("OUT H5 file         : ", out_h5),
  paste0("OUT JSON file       : ", out_json)
))

Read molecule info to get read counts per cell

stm(paste0("Assembling Read Counts per Cell from ", in_mol))

bc <- sub("-1","",h5_list$matrix$barcodes)

bc_counts <- data.table(mol_idx = h5read(in_mol, "/barcode_idx"),
                        umi_count = h5read(in_mol, "/count"))

bc_sums <- bc_counts[, .(n_reads = sum(umi_count)), by = mol_idx]
rm(bc_counts)

mol_bc <- h5read(in_mol, "/barcodes")
bc_sums$cell_barcode <- mol_bc[bc_sums$mol_idx + 1]
rm(mol_bc)

bc_sums <- bc_sums[,.(cell_barcode, n_reads)]

n_reads <- bc_sums$n_reads[match(bc, bc_sums$cell_barcode)]
n_reads[is.na(n_reads)] <- 0

h5_list <- set_list_path(h5_list,
                           "/matrix/observations/n_reads",
                           n_reads)

Return to Contents

Assemble data

Compute N UMIs and N Genes per cell

stm("Computing UMI and Gene Counts per Cell")

h5_list <- h5_list_convert_to_dgCMatrix(h5_list, target = "matrix")

h5_list <- set_list_path(h5_list,
                         "/matrix/observations/n_umis",
                         unname(Matrix::colSums(h5_list$matrix_dgCMatrix)))

h5_list <- set_list_path(h5_list,
                         "/matrix/observations/n_genes",
                         unname(Matrix::colSums(h5_list$matrix_dgCMatrix > 0)))

h5_list <- h5_list_convert_from_dgCMatrix(h5_list, target = "matrix")

Add cell ids

stm("Adding Cell UUIDs and Names")

h5_list <- add_cell_ids(h5_list,
                        add_uuid = TRUE,
                        replace_barcode = TRUE,
                        retain_original_barcode = TRUE,
                        add_name = TRUE)

Generate Chip, Pool, and Batch IDs based on well_id

if(well_type == "aifi") {
  stm("Adding Batch, Pool, Chip, and Well metadata")

  h5_list <- add_well_metadata(h5_list,
                             well_id = in_well)
} else {
  h5_list <- set_list_path(h5_list,
                           "/matrix/observations/well_id",
                           rep(in_well, length(h5_list$matrix$barcodes)))
}

Add chrM gene counts

stm("Adding chrM count metadata")

h5_list <- h5_list_add_mito_umis(h5_list)

Add Well Metrics

well_metrics <- read_tenx_metrics(in_sum)

well_metrics <- as.list(well_metrics)

h5_list <- set_list_path(h5_list,
                         "/well",
                         well_metrics)

h5_list <- set_list_path(h5_list,
                         "/well/well_id",
                         in_well)

Return to Contents

Write Output

Write HDF5 files

stm(paste0("Writing HDF5 to ", out_h5))
write_h5_list(h5_list,
              h5_file = out_h5,
              overwrite = TRUE)
h5closeAll()

Return to Contents

QC Tables and Plots

Extract metadata for plotting

stm("Generating tables and plots for report")
meta <- h5_list_cell_metadata(h5_list)

qc_list <- list(report_type = "add_tenx_rna_metadata",
                report_datetime = as.character(start_time),
                report_uuid = ids::uuid(use_time = TRUE),
                package = "BarMixer",
                package_version = sessionInfo()$otherPkgs$H5weaver$Version,
                well_id = in_well)

Return to Contents

Well QC

10x Genomics Mapping Metrics

qc_well_metrics <- data.frame(tenx_metric = names(well_metrics),
                              value = as.vector(unlist(well_metrics)),
                              stringsAsFactors = FALSE)

qc_well_metrics$value[qc_well_metrics$tenx_metric == "number_of_reads"] <- as.numeric(qc_well_metrics$value[qc_well_metrics$tenx_metric == "number_of_reads"] / 1e6)
qc_well_metrics$tenx_metric[qc_well_metrics$tenx_metric == "number_of_reads"] <- "millions_of_reads"

qc_list$well_metrics <- split(qc_well_metrics$value, 1:nrow(qc_well_metrics))
names(qc_list$well_metrics) <- qc_well_metrics$tenx_metric

qc_table(qc_well_metrics)

Return to Contents

Well Read/UMI/Gene Summary Stats

overview_stats <- data.frame(statistic = c("N Mapped Reads",
                                           "N UMIs",
                                           "N Mito. UMIs",
                                           "N Genes",
                                           "Reads per UMI",
                                           "UMIs per Gene",
                                           "Reads per Gene"),
                             q_25 = c(quantile(meta$n_reads, 0.25),
                                      quantile(meta$n_umis,  0.25),
                                      quantile(meta$n_mito_umis, 0.25),
                                      quantile(meta$n_genes, 0.25),
                                      quantile(meta$n_reads/meta$n_umis, 0.25, na.rm = TRUE),
                                      quantile(meta$n_umis/meta$n_genes, 0.25, na.rm = TRUE),
                                      quantile(meta$n_reads/meta$n_genes, 0.25, na.rm = TRUE)),
                             median = c(quantile(meta$n_reads, 0.5),
                                        quantile(meta$n_umis,  0.50),
                                        quantile(meta$n_mito_umis, 0.50),
                                        quantile(meta$n_genes, 0.50),
                                        quantile(meta$n_reads/meta$n_umis, 0.50, na.rm = TRUE),
                                        quantile(meta$n_umis/meta$n_genes, 0.50, na.rm = TRUE),
                                        quantile(meta$n_reads/meta$n_genes, 0.50, na.rm = TRUE)),
                             q_75 = c(quantile(meta$n_reads, 0.75),
                                      quantile(meta$n_umis,  0.75),
                                      quantile(meta$n_mito_umis, 0.25),
                                      quantile(meta$n_genes, 0.75),
                                      quantile(meta$n_reads/meta$n_umis, 0.75, na.rm = TRUE),
                                      quantile(meta$n_umis/meta$n_genes, 0.75, na.rm = TRUE),
                                      quantile(meta$n_reads/meta$n_genes, 0.75, na.rm = TRUE)))

overview_stats$q_25 <- round(overview_stats$q_25, 2)
overview_stats$median <- round(overview_stats$median, 2)
overview_stats$q_75 <- round(overview_stats$q_75, 2)

qc_list$well_count_stats <- lapply(1:nrow(overview_stats),
                                      function(x) {
                                        list(q_25 = overview_stats$q_25[x],
                                             median = overview_stats$median[x],
                                             q_75 = overview_stats$q_75[x])
                                      })
names(qc_list$well_count_stats) <- tolower(gsub("[\\. ]+", "_", overview_stats$statistic))

qc_table(overview_stats)

Return to Contents

Well Read/UMI/Gene Histograms

reads_plot <- qc_hist_plot(meta,
                           column = "n_reads",
                           name_x = "N Reads per Cell",
                           fill = "dodgerblue",
                           target = 2e4) +
  ggtitle("Alignment/Mapping Distributions")

umis_plot <- qc_hist_plot(meta,
                          column = "n_umis",
                          name_x = "N UMIs per Cell",
                          fill = "purple",
                          target = 8e3)

genes_plot <- qc_hist_plot(meta,
                           column = "n_genes",
                           name_x = "N Genes per Cell",
                           fill = "orangered",
                           target = 2e3)

histogram_list <- list(reads_plot,
                       umis_plot,
                       genes_plot)

suppressWarnings(
  plot_grid(plotlist = histogram_list,
            ncol = 1,
            nrow = 3)
)

Return to Contents

UMIs vs Reads Scatter Plot

umi_reads_scatter <- qc_scatter_plot(meta,
                                     column_x = "n_reads",
                                     name_x = "N Reads per Cell",
                                     column_y = "n_umis",
                                     name_y = "N UMIs per Cell",
                                     color = "darkblue") +
  ggtitle("UMIs vs Reads")

suppressWarnings(
  umi_reads_scatter
)

Genes vs UMIs Scatter Plot

genes_umis_scatter <- qc_scatter_plot(meta,
                                      column_x = "n_umis",
                                      name_x = "N UMIs per Cell",
                                      column_y = "n_genes",
                                      name_y = "N Genes per Cell",
                                      color = "red") +
  ggtitle("Genes vs UMIs")

suppressWarnings(
  genes_umis_scatter
)

Genes vs Reads Scatter Plot

genes_reads_scatter <- qc_scatter_plot(meta,
                                       column_x = "n_reads",
                                       name_x = "N Reads per Cell",
                                       column_y = "n_genes",
                                       name_y = "N Genes per Cell",
                                       color = "darkgreen") +
  ggtitle("Genes vs Reads")

suppressWarnings(
  genes_reads_scatter
)

Return to Contents

Write QC JSON

stm(paste0("Writing JSON to ", out_json))

qc_list_json <- jsonlite::toJSON(qc_list,
                                 auto_unbox = TRUE,
                                 pretty = TRUE)

writeLines(qc_list_json,
           out_json)

Return to Contents

Session Information

sessionInfo()

Total time elapsed

end_time <- Sys.time()
diff_time <- end_time - start_time
time_message <- paste0("Elapsed Time: ", 
                       round(diff_time, 3),
                       " ", units(diff_time))
print(time_message)
stm(time_message)
stm("H5 metadata process complete.")

Return to Contents



AllenInstitute/BarMixer documentation built on Dec. 17, 2021, 8:42 a.m.