In Illumina/happyCompare: Reporting toolbox for hap.py output

knitr::opts_chunk$set(fig.width = 6, fig.height = 5, cache = FALSE)

library("happyCompare")
library("tidyverse")
library("ggplot2")
theme_set(theme_bw())

Usage

Loading demo data from a happyCompare samplesheet creates a happy_compare object:

samplesheet_path <- "vignettes/pcrfree_vs_nano.subset.csv"
happy_compare <- read_samplesheet(samplesheet_path, lazy = TRUE)

happy_compare <- readRDS("pcrfree_vs_nano.subset.Rds")

That contains the following fields:

samplesheet: the original samplesheet
happy_results: a list of happy_result objects as defined in happyR
build_metrics: a list of data.frames with custom metrics
ids: a vector of build ids

sapply(happy_compare, class)

hap.py results and samplesheet metadata can be accessed with extract_metrics(), leaving them ready for downstream analysis:

e <- extract_metrics(happy_compare, table = "summary")
class(e)

Example visualisations

Summary of performance metrics

# extract performance metrics and tabulate mean plus/minus SD per group and variant type
extract_metrics(happy_compare, table = "summary") %>% 
  filter(Filter == "PASS") %>% 
  hc_summarise_metrics(df = ., group_cols = c("Group.Id", "Type")) %>% 
  knitr::kable()

Precision-recall curves

# extract ROC metrics and plot a precision-recall curves, e.g. for PASS INDEL
extract_metrics(happy_compare, table = "pr.indel.pass") %>% 
  hc_plot_roc(happy_roc = ., type = "INDEL", filter = "PASS")

Custom metrics

# link build metrics to hap.py results
summary <- extract_metrics(happy_compare, table = "summary")
build_metrics <- extract_metrics(happy_compare, table = "build.metrics")
merged_df <- summary %>% 
  inner_join(build_metrics)
merged_df %>%
  filter(Type == "INDEL", Filter == "PASS") %>% 
  ggplot(aes(x = mean_coverage, y = METRIC.Precision, group = Group.Id)) +
  geom_point(aes(color = Group.Id)) +
  ylab("INDEL_precision")

Stratified counts

# extract stratified counts and visualise highest density intervals for recall in level 0 subsets
hdi <- extract_metrics(happy_compare, table = "extended") %>% 
  filter(Subtype == "*", Filter == "PASS", Subset.Level == 0, 
         Subset %in% c("high.at", "high.gc")) %>% 
  estimate_hdi(successes_col = "TRUTH.TP", totals_col = "TRUTH.TOTAL", 
               group_cols = c("Group.Id", "Subset", "Type"), aggregate_only = FALSE)

hdi %>% 
  mutate(Subset = factor(Subset, levels = rev(unique(Subset)))) %>% 
  filter(replicate_id == ".aggregate") %>% 
  ggplot(aes(x = estimated_p, y = Subset, group = Subset)) +
    geom_point(aes(color = Group.Id), size = 2) +
    geom_errorbarh(aes(xmin = lower, xmax = upper, color = Group.Id), height = 0.4) +
    facet_grid(. ~ Type) +
    scale_colour_manual(values = c("#E69F00", "#56B4E9")) +
    theme(legend.position = "bottom") +
    ggtitle("Recall estimates across L0 subsets") +
    xlab("Recall") +
    ylab("") +
    xlim(0.7, 1)