# Chunk defaults for this vignette: merge source and output into one block
# and prefix printed output with "#>".
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

# Switch off crayon's coloured terminal output.
options(crayon.enabled = FALSE)
library(CNAqc)

# Extra packages. library() is used rather than require(): require() only
# returns FALSE when dplyr is missing, so the failure would surface later at
# the first dplyr call instead of here.
library(dplyr)

# If you use RStudio you will probably have to run this line
# options(connectionObserver = NULL)
Note: variant annotation should be carried out with dedicated tools. CNAqc functions should only be used to get a preliminary idea of the most important mutations annotated in a sample.
CNAqc can annotate input mutations and flag potential driver mutations. Using the VariantAnnotation package and the intOGen database, CNAqc performs the following steps:
1. Annotates the position of each mutation, with VariantAnnotation: `coding`, `intron`, `fiveUTR`, `threeUTR`, `intergenic`, `spliceSite`, `promoter`.
2. Annotates the consequence on the protein for coding mutations, with the change in the amino acid if known, with VariantAnnotation: `nonsynonymous`, `synonymous`, `frameshift`, `stop`.
3. Compares non-synonymous mutations to known driver genes from the intOGen database or a custom list, and flags drivers.
This functionality works with a CNAqc object.
# Load the example dataset shipped with the package
data("example_dataset_CNAqc", package = "CNAqc")

# Build the CNAqc object from the mutations, cna and purity entries of the
# example dataset, using hg19 as reference
x <- CNAqc::init(
  mutations = example_dataset_CNAqc$mutations,
  cna = example_dataset_CNAqc$cna,
  purity = example_dataset_CNAqc$purity,
  ref = "hg19"
)

# What we annotate
Mutations(x)
CNAqc uses databases from Bioconductor to annotate the variants; installation of these databases might take a bit of time because ~1GB of data have to be downloaded. This will happen only the first time the annotation is run.
# Reference against which we mapped the reads
reference_genome <- example_dataset_CNAqc$reference

# All those packages are distributed in Bioconductor; BiocManager itself is
# fetched from CRAN over https if it is not available yet.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager", repos = "https://cloud.r-project.org")
}

annotation_packages <- c(
  # txdb package for transcript annotations, matching the reference genome
  paste0("TxDb.Hsapiens.UCSC.", reference_genome, ".knownGene"),
  # BS database for the sequences (it may take some time)
  paste0("BSgenome.Hsapiens.UCSC.", reference_genome),
  # Utilities to deal with biological databases
  "Organism.dplyr",
  "org.Hs.eg.db"
)

# Only install what is not on the library path yet, so the large downloads
# happen the first time and are skipped on later runs. system.file() returns
# "" for a package that is not installed, and does not load the package.
is_installed <- vapply(
  annotation_packages,
  function(pkg) nzchar(system.file(package = pkg)),
  logical(1)
)
missing_packages <- annotation_packages[!is_installed]

if (length(missing_packages) > 0) {
  BiocManager::install(missing_packages)
}
CNAqc has pre-loaded a list of 568 driver genes for 66 cancer types, compiled from intOGen release date 2020.02.01.
# Load the driver list bundled with CNAqc. It reports:
# - gene id
# - tumour code where the gene has been flagged as driver
# - tumour code longname (Esophageal cancer)
data("intogen_drivers", package = "CNAqc")

# Number of genes (568)
unique(intogen_drivers$gene)

# Tumour types (66)
unique(intogen_drivers$synopsis)

# Organised table
intogen_drivers
We plot the genes that appear in more than 20 tumour types.
library(RColorBrewer)
library(ggplot2)

# Concatenate every qualitative ColorBrewer palette, each taken at its
# maximum number of colours, into one vector of fill colours.
qual_col_pals <- brewer.pal.info[brewer.pal.info$category == "qual", ]
col_vector <- unlist(
  Map(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals))
)

# Keep the genes with more than 20 rows in the driver table and draw one
# horizontal bar per gene, filled by tumour type (synopsis). The grouping is
# dropped before plotting so it does not leak into later steps.
intogen_drivers %>%
  group_by(gene) %>%
  filter(n() > 20) %>%
  ungroup() %>%
  ggplot() +
  geom_bar(aes(x = gene, fill = synopsis)) +
  coord_flip() +
  scale_fill_manual(values = col_vector) +
  CNAqc:::my_ggplot_theme()
# Annotate the object with the default settings
x_new <- x %>% annotate_variants()

# Show the annotated object
x_new
Note that there can be multiple locations and consequences for a single variant. This happens because we annotate the mutations in a transcript-agnostic manner; consequently, we report all possible effects and locations for any transcript (separated by `:`). The known and the newly annotated drivers appear to be consistent.
# Drivers in the original (reference) object
print(x)

# Drivers in the newly annotated object
print(x_new)
One can restrict the list of candidate genes used for driver detection. To this end, it is convenient to use the available database together with the cancer type of the input sample.
# We pretend to work with OV, ovarian cancer: keep only the rows of the
# driver table whose tumour code is OV
OV_drivers <- dplyr::filter(intogen_drivers, tumour == "OV")
unique(OV_drivers$gene)

# Run the annotation again, passing the restricted table as driver list
x_new_ov <- annotate_variants(x, drivers = OV_drivers)

# Reference driver mutations
print(x)

# New driver mutations, searching for OV drivers
print(x_new_ov)
Annotated data can undergo the canonical analysis workflow.
# One histogram per quantity (VAF, DP, NV) of the annotated object, arranged
# in a single row of three panels
histogram_panels <- lapply(
  c("VAF", "DP", "NV"),
  function(quantity) plot_data_histogram(x_new, which = quantity)
)

ggpubr::ggarrange(
  plotlist = histogram_panels,
  ncol = 3,
  nrow = 1
)
# Genome-wide panels for the annotated object, listed from top to bottom
genome_wide_panels <- list(
  plot_gw_counts(x_new),
  plot_gw_vaf(x_new, N = 10000),
  plot_gw_depth(x_new, N = 10000),
  plot_segments(x_new)
)

# The actual plot function: stack the four panels vertically aligned, giving
# most of the height to the last one
cowplot::plot_grid(
  plotlist = genome_wide_panels,
  align = "v",
  nrow = 4,
  rel_heights = c(0.15, 0.15, 0.15, 0.8)
)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.