knitr::opts_chunk$set(warning = FALSE, message = FALSE)
suppressPackageStartupMessages(library("readr"))
suppressPackageStartupMessages(library("ggplot2"))
suppressPackageStartupMessages(library("dplyr"))
suppressPackageStartupMessages(library("qdapRegex"))
suppressPackageStartupMessages(library("tibble"))
library("annoFuse")

Introduction

To assess filtering sensitivity of annoFuse, we analyzed a subset of samples from TCGA and compared fusions retained with annoFuse filtering and oncogenic prioritization to those deemed the final call set in a previously published analysis by The Fusion Analysis Working Group PMID: 29617662.

A group of 160 samples were randomly selected from the following cancers: BLCA(10), BRCA(11), CESC(5), COAD(11), ESCA(5) , GBM(7), HNSC(10), KIRP(9), LGG(9), LIHC(9), LUAD(5), LUSC(11), OV(9), PAAD(8), PCPG(2), PRAD(14), SARC(6), SKCM(9), TGCT(6), THCA(4).

# Sample manifest was obtained from CAVATICA Data browser and TCGA files were copied to working project.
tcga_manifest <- read_tsv(system.file("extdata", "sample_aliquot_tcga.tsv", package = "annoFuseData"))
# Load [Final Fusion Call Set tab of Supplemental Table S1 of PMID: 29617662](https://pubmed.ncbi.nlm.nih.gov/29617662/)
final_fusion <- read_tsv(system.file("extdata", "final_fusion_mmc2.txt", package = "annoFuseData")) %>%
  # subset of final fusions in overlapping samples
  dplyr::filter(
    Sample %in% tcga_manifest$aliquot_id
  ) %>%
  as.data.frame() %>%
  unique()

We ran kf-rnaseq-workflow to obtain fusion calls from STAR-Fusion and Arriba and the expression level quantifications from RSEM. We then standardized STAR-Fusion and Arriba fusion calls using fusion_standardization.

# merged STAR-Fusion and arriba calls
tcga_arriba_df <- read_tsv(system.file("extdata", "merged_arriba_tcga.tsv", package = "annoFuseData"))
tcga_starfusion_df <- read_tsv(system.file("extdata", "merged_starfusion_tcga.tsv", package = "annoFuseData"))

# format
formatted_starfusion <- annoFuse::fusion_standardization(fusion_calls = tcga_starfusion_df, caller = "STARFUSION", tumorID = tcga_starfusion_df$Sample)

# format
formatted_arriba <- annoFuse::fusion_standardization(fusion_calls = tcga_arriba_df, caller = "ARRIBA", tumorID = tcga_arriba_df$Sample)


# Merge starfusion and arriba
formatted_calls <- rbind(formatted_starfusion, formatted_arriba)

Artifact and QC filtering was performed using fusion_filtering_QC. We filtered fusions using the following range of spanningDelta : 10, 20, 30, 40, 50, 100, 150, and 200.

merged_10 <- annoFuse::fusion_filtering_QC(standardFusioncalls = formatted_calls, readthroughFilter = FALSE, readingFrameFilter = "in-frame|frameshift|other", artifactFilter = "GTEx_Recurrent|DGD_PARALOGS|Normal|BodyMap", junctionReadCountFilter = 1, spanningFragCountFilter = 10)

merged_20 <- annoFuse::fusion_filtering_QC(standardFusioncalls = formatted_calls, readthroughFilter = FALSE, readingFrameFilter = "in-frame|frameshift|other", artifactFilter = "GTEx_Recurrent|DGD_PARALOGS|Normal|BodyMap", junctionReadCountFilter = 1, spanningFragCountFilter = 20)

merged_30 <- annoFuse::fusion_filtering_QC(standardFusioncalls = formatted_calls, readthroughFilter = FALSE, readingFrameFilter = "in-frame|frameshift|other", artifactFilter = "GTEx_Recurrent|DGD_PARALOGS|Normal|BodyMap", junctionReadCountFilter = 1, spanningFragCountFilter = 30)

merged_40 <- annoFuse::fusion_filtering_QC(standardFusioncalls = formatted_calls, readthroughFilter = FALSE, readingFrameFilter = "in-frame|frameshift|other", artifactFilter = "GTEx_Recurrent|DGD_PARALOGS|Normal|BodyMap", junctionReadCountFilter = 1, spanningFragCountFilter = 40)

merged_50 <- annoFuse::fusion_filtering_QC(standardFusioncalls = formatted_calls, readthroughFilter = FALSE, readingFrameFilter = "in-frame|frameshift|other", artifactFilter = "GTEx_Recurrent|DGD_PARALOGS|Normal|BodyMap", junctionReadCountFilter = 1, spanningFragCountFilter = 50)

merged_100 <- annoFuse::fusion_filtering_QC(standardFusioncalls = formatted_calls, readthroughFilter = FALSE, readingFrameFilter = "in-frame|frameshift|other", artifactFilter = "GTEx_Recurrent|DGD_PARALOGS|Normal|BodyMap", junctionReadCountFilter = 1, spanningFragCountFilter = 100)

merged_150 <- annoFuse::fusion_filtering_QC(standardFusioncalls = formatted_calls, readthroughFilter = FALSE, readingFrameFilter = "in-frame|frameshift|other", artifactFilter = "GTEx_Recurrent|DGD_PARALOGS|Normal|BodyMap", junctionReadCountFilter = 1, spanningFragCountFilter = 150)

merged_200 <- annoFuse::fusion_filtering_QC(standardFusioncalls = formatted_calls, readthroughFilter = FALSE, readingFrameFilter = "in-frame|frameshift|other", artifactFilter = "GTEx_Recurrent|DGD_PARALOGS|Normal|BodyMap", junctionReadCountFilter = 1, spanningFragCountFilter = 200)
# get overlap with sample_id/aliquot_id with completed tasks
get_tcga_overlap <- function(x, tcga_manifest, final_fusion) {
  samples_in_final_fusion <- x %>%
    left_join(tcga_manifest, by = c("Sample" = "sample_id")) %>%
    dplyr::filter(aliquot_id %in% final_fusion$Sample) %>%
    dplyr::select(aliquot_id, FusionName) %>%
    unique()

  final_fusion_subset <- final_fusion %>%
    dplyr::filter(Sample %in% samples_in_final_fusion$aliquot_id & Fusion %in% samples_in_final_fusion$FusionName) %>%
    nrow()
  return(final_fusion_subset)
}

We then calculated overlaps of the subsetted published final fusions with our filtered fusion calls across all spanningDelta cutoffs.

# get merged files for cutoff 10,20,30,40,50,100,150,200 with sensitivity
length_final_fusion_subset <- final_fusion$Fusion[
  # only keep fusions from raw merged fusion calls
  # since other callers were also used in the TCGA publication which captured other fusions
  which(final_fusion$Fusion %in% formatted_calls$FusionName)] %>% 
  length() # 603

hist_df <- merged_10 %>%
  dplyr::mutate("cutoff" = 10, "sensitivity" = get_tcga_overlap(merged_10, tcga_manifest, final_fusion) / length_final_fusion_subset) %>%
  dplyr::bind_rows(merged_20 %>%
    dplyr::mutate("cutoff" = 20, "sensitivity" = get_tcga_overlap(merged_20, tcga_manifest, final_fusion) / length_final_fusion_subset)) %>%
  dplyr::bind_rows(merged_30 %>%
    dplyr::mutate("cutoff" = 30, "sensitivity" = get_tcga_overlap(merged_30, tcga_manifest, final_fusion) / length_final_fusion_subset)) %>%
  dplyr::bind_rows(merged_40 %>%
    dplyr::mutate("cutoff" = 40, "sensitivity" = get_tcga_overlap(merged_40, tcga_manifest, final_fusion) / length_final_fusion_subset)) %>%
  dplyr::bind_rows(merged_50 %>%
    dplyr::mutate("cutoff" = 50, "sensitivity" = get_tcga_overlap(merged_50, tcga_manifest, final_fusion) / length_final_fusion_subset)) %>%
  dplyr::bind_rows(merged_100 %>%
    dplyr::mutate("cutoff" = 100, "sensitivity" = get_tcga_overlap(merged_100, tcga_manifest, final_fusion) / length_final_fusion_subset)) %>%
  dplyr::bind_rows(merged_150 %>%
    dplyr::mutate("cutoff" = 150, "sensitivity" = get_tcga_overlap(merged_150, tcga_manifest, final_fusion) / length_final_fusion_subset)) %>%
  dplyr::bind_rows(merged_200 %>%
    dplyr::mutate("cutoff" = 200, "sensitivity" = get_tcga_overlap(merged_200, tcga_manifest, final_fusion) / length_final_fusion_subset)) %>%
  dplyr::mutate("change_spanning" = .$SpanningFragCount - .$JunctionReadCount)

hist_df <- as.data.frame(hist_df)
dat_text <- data.frame(
  label = c(
    paste("Mean", round(mean(hist_df[which(hist_df$cutoff == 10), "change_spanning"]))),
    paste("Mean", round(mean(hist_df[which(hist_df$cutoff == 20), "change_spanning"]))),
    paste("Mean", round(mean(hist_df[which(hist_df$cutoff == 30), "change_spanning"]))),
    paste("Mean", round(mean(hist_df[which(hist_df$cutoff == 40), "change_spanning"]))),
    paste("Mean", round(mean(hist_df[which(hist_df$cutoff == 50), "change_spanning"]))),
    paste("Mean", round(mean(hist_df[which(hist_df$cutoff == 100), "change_spanning"]))),
    paste("Mean", round(mean(hist_df[which(hist_df$cutoff == 150), "change_spanning"]))),
    paste("Mean", round(mean(hist_df[which(hist_df$cutoff == 200), "change_spanning"])))
  ),
  cutoff = c(10, 20, 30, 40, 50, 100, 150, 200)
)

Here, we visualize the distribution of spanningDelta (spanningFragCount - JunctionReadCount) across fusions called from the TCGA to assess a cutoff for spanningDelta

ggplot(hist_df, aes(x = change_spanning)) +
  geom_histogram(binwidth = 5) +
  scale_x_continuous(limits = c(0, 200)) +
  facet_wrap(. ~ cutoff) +
  theme_publication(base_size = 20) +
  geom_text(
    data = dat_text,
    mapping = aes(x = 100, y = 1200, label = label),
    hjust = -0.1,
    vjust = -1
  ) +
  theme(axis.title.x = element_text(size = 15), axis.title.y = element_text(size = 15))

We find that sensitivity reaches 96% at a spanningDelta cutoff of 100

extra_breaks <- c(0.7, 0.8, .85, 0.92, 0.96, 1.00)
breaks <- sort(c(extra_breaks, with(hist_df, pretty(range(sensitivity)))))

ggplot(hist_df, aes(x = cutoff, y = sensitivity)) +
  geom_line() +
  ylab("sensitivity") +
  theme_publication(base_size = 20) +
  scale_y_continuous(
    breaks = breaks,
    limits = range(breaks)
  ) +
  theme(axis.title.x = element_text(size = 15), axis.title.y = element_text(size = 15))
hist_df %>% select(cutoff,sensitivity) %>% 
  unique() %>%
  remove_rownames()

Session info {-}

sessionInfo()


d3b-center/annoFuse documentation built on Feb. 21, 2023, 1:06 a.m.