# Load required packages
library(VariantAnnotation)
library(Espresso)

# Load other packages
library(doParallel)  # processing samples in parallel
library(magrittr)    # piping and readability

# If you are using a different MafDb annotation package (see "Installation"), change it here accordingly
maf_database <- "MafDb.gnomADex.r2.1.hs37d5"
library(maf_database, character.only = TRUE)

# Specify the genome
genome <- "hg19"  # The package currently supports hg19 and hg38. If you would like to use a different species genome, please contact us.
These files are VarScan pileup2cns outputs; see the paper's methods section (PMID: 33298453) for details on how to generate them.
In this example, we provide files for 35 patients to call mutations in AML-associated hotspots.
dir.create("example_data") download.file(url="https://raw.githubusercontent.com/abelson-lab/Espresso_paper/master/example_data/example_35pts_pileup2cnsINFO_allPositions.zip", destfile = "example_data/example_patients.zip") unzip("example_data/example_patients.zip", exdir = "example_data", overwrite = TRUE)
dir="example_data/" files=list.files(dir,pattern = "ptx") sample_paths <- paste0(dir, files) cat("First 5 Sample Paths: \n", head(sample_paths)) sample_names <- substr(files,1,5) # change this based on your file name cat("\n\nFirst 5 Sample Names: \n", head(sample_names))
These are tab-delimited files with four columns: chr, pos, ref, alt.
They contain blood cancer-associated mutations from COSMIC observed at least 10 times or at least 3 times, respectively.
As real mutations in our data are likely to occur at these hotspots, we will exclude these mutational sites from downstream error modelling.
For updated cancer associated somatic mutations, please visit the COSMIC website.
If you have no hotspot mutations that you wish to exclude from flagged allele generation and error model generation, feel free to skip this step.
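If you maintain your own list of hotspot mutations, you can supply it in the same four-column format. Below is a minimal sketch: the coordinates are placeholders, the file name my_hotspots.txt is hypothetical, and whether load_recurrent_mutations() expects a header row is an assumption here, so compare with the downloaded COSMIC files and adjust col.names accordingly.

# Minimal sketch of a custom hotspot file: four tab-delimited columns (chr, pos, ref, alt).
# The variants below are placeholder coordinates for illustration only.
my_hotspots <- data.frame(chr = c("1", "2"),
                          pos = c(123456, 654321),
                          ref = c("C", "A"),
                          alt = c("T", "G"))
write.table(my_hotspots, "example_data/my_hotspots.txt",
            sep = "\t", quote = FALSE, row.names = FALSE,
            col.names = FALSE)  # match the header convention of the COSMIC example files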
We've posted example data on GitHub that you can download: recurrent mutations in heme malignancies from COSMIC.
dir.create("example_data") download.file(url = "https://raw.githubusercontent.com/abelson-lab/Espresso_paper/master/heme_COSMIC/COSMIC_heme_freq10.txt", destfile = "example_data/COSMIC_heme_freq10.txt") download.file(url = "https://raw.githubusercontent.com/abelson-lab/Espresso_paper/master/heme_COSMIC/COSMIC_heme_freq3.txt", destfile = "example_data/COSMIC_heme_freq3.txt")
Now load the files using load_recurrent_mutations().
# blood cancer mutations from COSMIC observed over 10 times
hemeCOSMIC_10 <- load_recurrent_mutations("example_data/COSMIC_heme_freq10.txt", genome = "hg19")
# In this example those alleles will be filtered out from model generation

# blood cancer mutations from COSMIC observed over 3 times
hemeCOSMIC_3 <- load_recurrent_mutations("example_data/COSMIC_heme_freq3.txt", genome = "hg19")
# these will be filtered out from the output of the flagged alleles function (see below)

hemeCOSMIC_3
Flagged alleles are alleles that appear at a high VAF in a significant number of samples within your cohort. These are very likely to be sequencing errors and should be interpreted with caution in variant calling results. However, if you are expecting recurrent mutations (e.g. Hematologic cancer associated mutations from COSMIC), then you can use the recurrent_mutations argument to exclude them from the flagging process.
Here, we will input the recurrent mutations from COSMIC with a frequency >= 3 and ask get_flagged_alleles to ignore these alleles.
Additionally, we are setting the memory_saving argument to FALSE in this case because we are only processing 35 samples.
If you are processing > 400 samples on a machine with 16 GB of RAM, or > 200 samples on a machine with 8 GB of RAM, we recommend setting memory_saving = TRUE (a commented example is shown below).
This will employ an alternative method that consumes less memory but takes approximately twice as long to run.
flagged_alleles <- get_flagged_alleles(sample_names, sample_paths, genome,
                                       recurrent_mutations = hemeCOSMIC_3, memory_saving = FALSE)
flagged_alleles
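For larger cohorts, the same call is made with only the memory_saving argument changed; it is left commented out here because it is not needed for this 35-sample example and roughly doubles the run time.

# For large cohorts (e.g. > 400 samples with 16 GB of RAM), use the slower, lower-memory mode:
# flagged_alleles <- get_flagged_alleles(sample_names, sample_paths, genome,
#                                        recurrent_mutations = hemeCOSMIC_3, memory_saving = TRUE)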
In this example we are only interested in examining hematologic malignancy-associated mutations, so we've included a BED file with genomic positions flanking +/- 15 bp around COSMIC hotspot mutations. This allows us to limit mutation calling specifically to these hotspot locations. Note that the other positions are still used for error model generation.
download.file(url = "https://raw.githubusercontent.com/abelson-lab/Espresso_paper/master/heme_COSMIC/hemeCOSMIC_hotspot_range.bed",
              destfile = "example_data/hemeCOSMIC_hotspot_range.bed")

# Get COSMIC hotspot range to call mutations in
COSMIC_hotspot_range <- load_bed("example_data/hemeCOSMIC_hotspot_range.bed", genome)
COSMIC_hotspot_range
If you want to generate variant calls only for specific mutations (i.e. specified ref + alt) rather than for positions alone, you can load your specific mutations in the "chr, pos, ref, alt" format using load_recurrent_mutations().
If you want to call mutations in all covered positions, you can skip this step and remove the intersect_VRanges(., COSMIC_hotspot_range) call in the following code chunk, so that no positions are filtered out prior to variant calling.
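A sketch of those two alternatives is shown below; "example_data/my_mutations.txt" is a hypothetical file in the chr, pos, ref, alt format and is not part of the example data.

# Alternative 1: restrict calling to specific mutations (specified ref + alt)
# my_mutations <- load_recurrent_mutations("example_data/my_mutations.txt", genome = "hg19")
# and intersect with my_mutations instead of COSMIC_hotspot_range in the loop below.

# Alternative 2: call mutations in all covered positions by simply dropping the
# intersect_VRanges(., COSMIC_hotspot_range) step from the loop below.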
We will run this in parallel with the doParallel package, using 4 cores to speed up the job.
We're currently generating models by trinucleotide context (192 models). This can be changed by setting the context argument to 5 (substitution with 2 flanking bases on each side, constituting 3072 models) if you have a very large panel (>1 million bp at >2000x depth), or to 1 (substitution only, constituting 12 models) if you have a very small panel (see the paper for more detail).
Generally, the default trinucleotide context approach is the most versatile and effective across various panel sizes and depths.
Also note that we perform a key preprocessing step to minimize somatic mutation contamination of our error models. This is achieved through the filter_model_input function, where we remove flagged alleles, polymorphisms (MAF > 0.001), apparent private germline variants (VAF > 0.05), low-quality calls (MAPQ < 59), and recurrent mutations identified in COSMIC. Additionally, filter_model_input removes contextual outliers, i.e. non-reference alleles with an exceptionally high read count compared to the rest of the distribution.
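For reference, the context setting lives in the sequence_context() call inside the loop below; a commented sketch of the alternatives mentioned above:

# Default used in this tutorial: trinucleotide context (192 models)
# sequence_context(., genome, context = 3)

# Very large panels: pentanucleotide context (3072 models)
# sequence_context(., genome, context = 5)

# Very small panels: substitution only (12 models)
# sequence_context(., genome, context = 1)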
# Make cluster and start
cl <- makeCluster(4)
registerDoParallel(cl)

# initialize variant calls
variant_calls <- VRangesList()

# Run for all files
for (i in 1:length(sample_paths)) {

  # get sample name and path
  samp_path <- sample_paths[i]
  samp_name <- sample_names[i]
  print(samp_name)

  # get sample as VRanges and annotate with sequence context and MAF
  samp <- load_as_VRanges(samp_name, samp_path, genome) %>%
    sequence_context(., genome, context = 3) %>%
    annotate_MAF(., maf_database, genome)

  # use sample to generate the error models
  samp_models <- samp %>%
    filter_model_input(., flagged_alleles, MAF_cutoff = 0.001, VAF_cutoff = 0.05,
                       MAPQ_cutoff = 59, recurrent_mutations = hemeCOSMIC_10) %>% # preprocessing to clean training set (error models)
    generate_all_models()

  # call variants using error models, and aggregate together
  variant_calls[[samp_name]] <- samp %>%
    intersect_VRanges(., COSMIC_hotspot_range) %>% # only keep positions within the hotspot range we want to call in
    filter_MAPQ(MAPQ_cutoff = 59) %>%
    call_all_variants(., samp_models)

  # cleanup
  rm(samp_name, samp, samp_models)
}

# Unregister Cluster
stopCluster(cl)
rm(cl)
registerDoSEQ()
The output is a VRangesList object with the variant calls for each sample.
# Output is a VRangesList
variant_calls
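You can pull out a single sample's calls from the list by name or by index, for example:

# Inspect the calls for the first sample in the cohort
variant_calls[[sample_names[1]]]

# Or equivalently by position in the list
variant_calls[[1]]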
We can unlist that VRangesList object, apply the p-value correction (Bonferroni across all samples), and keep the significant calls.
Note that if we were scanning all sequenced positions for variants, we would recommend applying Bonferroni correction by sample.
# unlist and correct pvalue (bonferroni)
variant_calls_unlisted <- variant_calls %>%
  unlist() %>%
  correct_pvalues(., method = "bonferroni")

## if we want to correct sample-by-sample
# variant_calls_unlisted <- variant_calls %>% correct_pvalues(., method = "bonferroni") %>% unlist()

# show significant calls
significant_calls <- variant_calls_unlisted[which(variant_calls_unlisted$corrected_pvalue <= 0.05)]
significant_calls
We also recommend inspecting the "model" column to ensure that error models were successfully generated for each trinucleotide context. If your VarScan input contains indels, the model type will indicate "None". If a "None" model is indicated for a trinucleotide context, it is possible that not enough alt alleles were present in the data to generate the model. In this case, consider reducing the nucleotide context from 5 to 3 or from 3 to 1, thereby generating fewer models per sample while including more data in each. If Espresso cannot generate a model for a context, the p-value from VarScan is used instead.
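A quick way to check this is to tabulate the model column across the unlisted calls (assuming the column is named "model", as described above); any "None" entries indicate contexts for which Espresso fell back to the VarScan p-value.

# Tabulate which error model type was used for each call;
# "None" means no Espresso model could be generated for that context
table(variant_calls_unlisted$model)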
Significant calls can be converted into a data frame for input into ANNOVAR, or saved directly as a VCF file.
significant_calls_df <- significant_calls %>%
  tidyr::as_tibble() %>%
  dplyr::select(-width, -strand, -totalDepth) %>%
  dplyr::rename("chr" = seqnames) %>%
  dplyr::mutate(chr = sub(pattern = "chr", replacement = "", x = chr))

significant_calls_df %>% readr::write_delim("espresso_calls.txt", delim = "\t")

significant_calls_df %>% head() %>% print()
significant_calls %>% VariantAnnotation::asVCF() %>% VariantAnnotation::writeVcf("espresso_calls.vcf")
Alternatively, we can annotate the significant calls through cellBase, which returns a nested dataframe with transcript-specific functional annotations.
anno <- annotate_variants(significant_calls, genome = 'hg19')
anno %>% head()
sessionInfo()