#' Load a variant call file.
#' Allowed types are either maflite, vcf (version >=4), call_stats (MuTect1 format),
#' or snp (Varscan2 format) files
#'
#' The function is generally internal, but is exported to help the user in debugging input
#'
#' @param variant_file /path/to/variant/file
#' @param file_type either maf, vcf, call_stats, or snp
#' @return Data frame with 5 columns: chr, start, end, ref_allele, alt_allele
#'
#' @examples
#' # An example of converting oncotator output to maflite and calling load_variants
#' \dontrun{
#'
#'
#' }
load_variants <- function(vcf,reference,sample_name,file_type="vcf"){
if (! file.exists(vcf)){
stop('Variant file does not exist - Please check the path')
}
.load_and_filter_vcf(vcf,reference,sample_name)
}
.load_and_filter_vcf <- function(vcf,reference,sample_name){
# Load vcf as ranges, filter.
# Need SNVs that pass plus SNVs that only fail tlod
hard_filters <- S4Vectors::FilterRules(list(snv=VariantAnnotation::isSNV,
sample=function(x) {VariantAnnotation::sampleNames(x)==sample_name}))
vr <- VariantAnnotation::readVcfAsVRanges(vcf)
vr <- S4Vectors::subsetByFilter(vr,filter=hard_filters)
# Need to get rid of the GL sequences, leave just the chromosomes
GenomeInfoDb::seqlevels(vr) <- GenomicAlignments::seqlevelsInUse(vr)
GenomeInfoDb::genome(vr) <- GenomeInfoDb::genome(reference)[1:length(GenomeInfoDb::genome(vr))] # These need to have exactly the same name
# Now get contexts from SomaticSignatures
vr <- SomaticSignatures::mutationContext(vr,reference)
mcols(vr)$TLOD <- as.numeric(mcols(vr)$TLOD)
# TODO: softfiltermatrix values are NA in many cases, this happens because they put a "." in where all filters pass in GATK4
VariantAnnotation::softFilterMatrix(vr)[is.na(VariantAnnotation::softFilterMatrix(vr))] <- TRUE # This fixes it
# GATK 4 changes the filter t_lod_fstar to t_lod. need different functions for mutect2 in gatk3-4
if (sum(stringr::str_detect('t_lod_fstar',colnames(VariantAnnotation::softFilterMatrix(vr))))>0){
mcols(vr)$tlod_only <- rowSums(VariantAnnotation::softFilterMatrix(vr))==ncol(VariantAnnotation::softFilterMatrix(vr))-1 & !VariantAnnotation::softFilterMatrix(vr)[,"t_lod_fstar"]
}else{
mcols(vr)$tlod_only <- rowSums(VariantAnnotation::softFilterMatrix(vr))==ncol(VariantAnnotation::softFilterMatrix(vr))-1 & !VariantAnnotation::softFilterMatrix(vr)[,"t_lod"]
}
mcols(vr)$pass_all <- rowSums(VariantAnnotation::softFilterMatrix(vr))==ncol(VariantAnnotation::softFilterMatrix(vr))
# GATK 4 has lists of probabilities in mcols. For now just get rid of them
mcols(vr)$SA_MAP_AF <- ""
mcols(vr)$SA_POST_PROB <- ""
vars <- tibble::as_tibble(vr)
vars <- vars %>% dplyr::mutate(cref=stringr::str_sub(alteration,1L,1L),
calt=stringr::str_sub(alteration,2L,2L),
mutect_odds=10**(TLOD-6.0))
vars
}
# Return data frame with variant locations and ref/alt from maflite
.get_variant_locs_maf <- function(variant_file){
cols <- c('chr','start','end','ref','alt')
format <- c('ccccc')
readr::read_tsv(variant_file,comment='#',col_names=cols,col_types=format) %>%
dplyr::select(chr,start,end,ref,alt)
}
# Return data frame with variant locations and ref/alt from vcf
.get_variant_locs_vcf <- function(variant_file){
cols <- c('chr','pos','id','ref','alt','qual','filter','info','format','tumor','normal')
format <- c('ccccccccccc')
readr::read_tsv(variant_file,comment="#",col_names=cols,col_types=format) %>%
dplyr::select(chr,"start"=pos,"end"=pos,ref,alt)
}
# Return data frame with variant locations and ref/alt from Mutect1 call_stats file
.get_variant_locs_cs <- function(variant_file){
col_spec <- readr::cols_only(contig='c',position='c',ref_allele='c',alt_allele='c',judgement='c',tumor_f='d','t_lod_fstar'='d')
readr::read_tsv(variant_file,comment='#',col_types=col_spec) %>%
dplyr::mutate(start=position,end=position) %>%
dplyr::select("chr"=contig,start,end,"ref"=ref_allele,"alt"=alt_allele,'freq'=tumor_f,'tlod'=t_lod_fstar)
}
# Return data frame with variant locations and ref/alt from Varscan .snp file
.get_variant_locs_snp <- function(variant_file){
# Varscan somatic leaves GL chromosome variants in. They are removed here.
message('High confidence calls are generated with the default varscan algorithm, ',
'tumor VAF > 15% normal VAF < 5%, p-val < 0.03')
col_spec <- readr::cols_only(chrom='c',position='c',ref='c',var='c',
somatic_status='c',normal_var_freq='c',tumor_var_freq='c',
somatic_p_value='d')
readr::read_tsv(variant_file,comment='#',col_types=col_spec) %>%
dplyr::filter(! stringr::str_detect(chrom,'GL')) %>%
dplyr::mutate(normal_var_freq=readr::parse_number(normal_var_freq),
tumor_var_freq=readr::parse_number(tumor_var_freq)) %>%
dplyr::filter(somatic_status=="Somatic") %>%
dplyr::select("chr"=chrom,"start"=position,"end"=position,"ref"=ref,"alt"=var)
}
.get_oncotated_variants <- function(onc_file){
# Returns the full relevant data including rejection reasons as columns
# so that this can be filtered to get those variants that only failed
# because of TLOD
fr <- readr::read_tsv(onc_file,comment="#",col_types = cols(.default='c'),na=character())
fr <- fr %>% dplyr::filter(Tumor_Sample_Barcode=="TUMOR")
fr
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.