# Load required packages
library(VariantAnnotation)
library(Espresso)

# Load other packages
library(doParallel)   # processing samples in parallel
library(magrittr)   # piping and readibility
maf_database="MafDb.gnomADex.r2.1.hs37d5" #If you are using a different MafDB annotation package (see "Installation") change it here accordingly
library(maf_database,character.only = TRUE)

# Specify the genome
genome = "hg19" #The package currently support hg19 and hg38. If you like to use a different species genome, please contact us.

Pre-Processing

Get Sample Files

These files are VarScan pileup2cns outputs - see the paper's methods section (PMID: 33298453) for information on how to generate those files.

In this example we provide files for 35 patients to call mutations in AML-associated hotspots

dir.create("example_data")
download.file(url="https://raw.githubusercontent.com/abelson-lab/Espresso_paper/master/example_data/example_35pts_pileup2cnsINFO_allPositions.zip",
              destfile = "example_data/example_patients.zip")
unzip("example_data/example_patients.zip", exdir = "example_data", overwrite = TRUE)
dir="example_data/"
files=list.files(dir,pattern = "ptx")

sample_paths <- paste0(dir, files)
cat("First 5 Sample Paths: \n", head(sample_paths))

sample_names <- substr(files,1,5) # change this based on your file name
cat("\n\nFirst 5 Sample Names: \n", head(sample_names))

Load recurrent mutations

These are tab delimited files with four columns: chr, pos, ref, alt.
Contains blood cancer associated mutations from COSMIC observed at least 10 times or 3 times, respectively.
As real mutations in our data are likely to occur at these hotspots, we will exclude these mutational sites from downstream error modelling.

For updated cancer associated somatic mutations, please visit the COSMIC website.
If you have no hotspot mutations that you wish to exclude from flagged allele generation and error model generation, feel free to skip this step.

We've posted example data on github that you can download - heme malignancies from COSMIC

dir.create("example_data")

download.file(url = "https://raw.githubusercontent.com/abelson-lab/Espresso_paper/master/heme_COSMIC/COSMIC_heme_freq10.txt", 
              destfile = "example_data/COSMIC_heme_freq10.txt")

download.file(url = "https://raw.githubusercontent.com/abelson-lab/Espresso_paper/master/heme_COSMIC/COSMIC_heme_freq3.txt", 
              destfile = "example_data/COSMIC_heme_freq3.txt")

Now load the files using load_recurrent_mutations

# blood cancer mutations from COSMIC observed over 10 times
  hemeCOSMIC_10 <- load_recurrent_mutations("example_data/COSMIC_heme_freq10.txt", genome = "hg19") # In this example those alleles will be filtered out from model generation

# blood cancer mutations from COSMIC observed over 3 times
hemeCOSMIC_3 <- load_recurrent_mutations("example_data/COSMIC_heme_freq3.txt", genome = "hg19") # these will be filtered out from the output of the flagged alleles function (see below)

hemeCOSMIC_3

Get flagged alleles

Flagged alleles are alleles that appear at a high VAF in a significant number of samples within your cohort. These are very likely to be sequencing errors and should be interpreted with caution in variant calling results. However, if you are expecting recurrent mutations (e.g. Hematologic cancer associated mutations from COSMIC), then you can use the recurrent_mutations argument to exclude them from the flagging process.

Here, we will input the recurrent mutations from COSMIC with a frequency >= 3 and ask get_flagged_alleles to ignore these alleles.
Additionally, we are setting the memory_saving argument to FALSE in this case because we are only processing 40 samples.

If you are processing > 400 samples on a 16gb RAM device or > 200 samples on a 8gb RAM device, we recommend setting memory_saving = TRUE.
This will employ an alternative method that consumes less memory but takes approximately twice as long to run.

flagged_alleles <- get_flagged_alleles(sample_names, sample_paths, genome, recurrent_mutations = hemeCOSMIC_3, memory_saving = FALSE)
flagged_alleles

Load hotspot regions to call variants in

In this example we are only interested in examining hematologic malignancy associated mutations, so we've included a bed file with genomic positions flanking +/- 15bp from COSMIC hotspot mutations. This will allow us to limit mutation calling specifically to these hotspot locations. Note that the other positions are still used for error model generation.

download.file(url = "https://raw.githubusercontent.com/abelson-lab/Espresso_paper/master/heme_COSMIC/hemeCOSMIC_hotspot_range.bed", 
              destfile = "example_data/hemeCOSMIC_hotspot_range.bed")

# Get COSMIC hotspot range to call mutations in
COSMIC_hotspot_range <- load_bed("example_data/hemeCOSMIC_hotspot_range.bed", genome)
COSMIC_hotspot_range

If you wanted to generate variant calls only for specific mutations (i.e. specified ref + alt) rather than the position alone, you can load in your specific mutations in the "chr, pos, ref, alt" format using load_recurrent_mutations().

If you want to call mutations in all covered positions, you can skip this step and remove the intersect_VRanges(., COSMIC_hotspot_range) in the following code chunk. This will prevent it from filtering out any positions prior to variant calling.

Generate Models and Call Variants

We will be running this in parallel with the package DoParallel - using 4 cores to speed up the job.

We're currently generating models by trinucleotide context (192 models), but this can be changed by tweaking the context argument to 5 (substitution with 2 flanking bases, constituting 3072 models) if you have a very large panel (>1 million bp at >2000x depth) or to 1 (substitution only, constituting 12 models) if you have a very small panel (see paper for more).
Generally, the default trinucleotide context approach is the most versatile and effective acoss various ranges and depths.

Also note that we perform a key preprocessing step to minimize somatic mutation contamination of our error models. This is achieved in through the filter_model_input function where we remove flagged alleles, polymorphisms (MAF > 0.001), clear private germline variants (VAF > 0.05), low quality calls (MAPQ < 59), and recurrent mutations identified in COSMIC. Additionally, the filter_model_input function also removes contextual outliers, which are non-reference alleles with an exceptionally high read count compared to the rest of the distribution.

# Make cluster and start
cl <- makeCluster(4)
registerDoParallel(cl)

# initialize variant calls
variant_calls <- VRangesList()

# Try for all files
for(i in 1:length(sample_paths)){

  # get sample name and path
  samp_path <- sample_paths[i]
  samp_name <- sample_names[i]

  print(samp_name)

  # get sample as VRanges and annotate with sequence context and MAF
  samp <- load_as_VRanges(samp_name, samp_path, genome) %>%
    sequence_context(., genome, context = 3) %>%
    annotate_MAF(., maf_database, genome)

# use sample to generate the error models
    samp_models <- samp %>%
        filter_model_input(., flagged_alleles, MAF_cutoff = 0.001, VAF_cutoff = 0.05, 
                           MAPQ_cutoff = 59, recurrent_mutations = hemeCOSMIC_10) %>%    # preprocessing to clean training set (error models)
        generate_all_models()

# call variants using error models, and aggregate together
    variant_calls[[samp_name]] <- samp %>% 
        intersect_VRanges(., COSMIC_hotspot_range) %>% # only keep positions within the hotspot range we want to call in
      filter_MAPQ(MAPQ_cutoff = 59) %>% 
        call_all_variants(., samp_models)

# cleanup
    rm(samp_name, samp, samp_models)
}


# Unregister Cluster
stopCluster(cl)
rm(cl)
registerDoSEQ()

The output is a vranges list object with the variant calls for each samples.

# Output is in vranges list
variant_calls

We can unlist that vranges object, conduct the pvalue correction (bonferroni across all samples), and keep the significant calls.
Note that if we were scanning all sequenced positions for variants, we would recommend applying bonferonni correction by sample.

# unlist and correct pvalue (bonferroni)
variant_calls_unlisted <- variant_calls %>% unlist() %>% correct_pvalues(., method = "bonferroni")
## if we want to correct sample-by-sample
# variant_calls_unlisted <- variant_calls %>% correct_pvalues(., method = "bonferroni") %>% unlist()


# show significant calls 
significant_calls <- variant_calls_unlisted[which(variant_calls_unlisted$corrected_pvalue <= 0.05)]
significant_calls

We also recommend inspecting the "model" column to ensure that error models were successfully generated for each trinucleotide context. In case that your varscan input contains indels the model type will indicate "None". If a "None" model is indicated for a trinucleotide context it is possible that not enough alt alleles were present in the data to generate the model. In this case consider restricting the nucleotide context from 5 to 3 or from 3 to 1, thus reducing the number of models generated per sample yet including more data in each. If no model can be generated by Espresso for a context, the pvalue from varscan will be used.

Save significant calls for annotation

Significant calls can be converted into a dataframe for input into annovar or saved directly as a VCF file.

significant_calls_df <- significant_calls %>% 
  tidyr::as_tibble() %>% 
  dplyr::select(-width, -strand, -totalDepth) %>% 
  dplyr::rename("chr" = seqnames) %>% 
  dplyr::mutate(chr = sub(pattern="chr", replacement="",x=chr))
significant_calls_df %>% readr::write_delim("espresso_calls.txt", delim = "\t")

significant_calls_df %>% head() %>% print()
significant_calls %>% 
  VariantAnnotation::asVCF() %>% 
  VariantAnnotation::writeVcf("espresso_calls.vcf")

Alternatively, we can annotate the significant calls through cellBase, which returns a nested dataframe with transcript-specific functional annotations.

anno <- annotate_variants(significant_calls, genome='hg19')
anno %>% head()
sessionInfo()


andygxzeng/ECSI documentation built on Feb. 6, 2021, 8:53 a.m.