R/load_variants.R

Defines functions load_variants .load_and_filter_vcf .get_variant_locs_maf .get_variant_locs_vcf .get_variant_locs_cs .get_variant_locs_snp .get_oncotated_variants

#' Load a variant call file.
#' Allowed types are either maflite, vcf (version >=4), call_stats (MuTect1 format),
#' or snp (Varscan2 format) files
#'
#' The function is generally internal, but is exported to help the user in debugging input
#'
#' @param variant_file /path/to/variant/file
#' @param file_type either maf, vcf, call_stats, or snp
#' @return Data frame with 5 columns: chr, start, end, ref_allele, alt_allele
#'
#' @examples
#' # An example of converting oncotator output to maflite and calling load_variants
#' \dontrun{
#'
#'
#' }
load_variants <- function(vcf,reference,sample_name,file_type="vcf"){

  if (! file.exists(vcf)){
    stop('Variant file does not exist - Please check the path')
  }
  .load_and_filter_vcf(vcf,reference,sample_name)
}

.load_and_filter_vcf <- function(vcf,reference,sample_name){
  # Load vcf as ranges, filter.
  # Need SNVs that pass plus SNVs that only fail tlod
  hard_filters <- S4Vectors::FilterRules(list(snv=VariantAnnotation::isSNV,
                                              sample=function(x) {VariantAnnotation::sampleNames(x)==sample_name}))
  vr <- VariantAnnotation::readVcfAsVRanges(vcf)
  vr <- S4Vectors::subsetByFilter(vr,filter=hard_filters)
  # Need to get rid of the GL sequences, leave just the chromosomes
  GenomeInfoDb::seqlevels(vr) <- GenomicAlignments::seqlevelsInUse(vr)
  GenomeInfoDb::genome(vr) <- GenomeInfoDb::genome(reference)[1:length(GenomeInfoDb::genome(vr))] # These need to have exactly the same name
  # Now get contexts from SomaticSignatures
  vr <- SomaticSignatures::mutationContext(vr,reference)
  mcols(vr)$TLOD <- as.numeric(mcols(vr)$TLOD)
  # TODO: softfiltermatrix values are NA in many cases, this happens because they put a "." in where all filters pass in GATK4
  VariantAnnotation::softFilterMatrix(vr)[is.na(VariantAnnotation::softFilterMatrix(vr))] <- TRUE # This fixes it
  # GATK 4 changes the filter t_lod_fstar to t_lod. need different functions for mutect2 in gatk3-4
  if (sum(stringr::str_detect('t_lod_fstar',colnames(VariantAnnotation::softFilterMatrix(vr))))>0){
    mcols(vr)$tlod_only <- rowSums(VariantAnnotation::softFilterMatrix(vr))==ncol(VariantAnnotation::softFilterMatrix(vr))-1 & !VariantAnnotation::softFilterMatrix(vr)[,"t_lod_fstar"]
  }else{
    mcols(vr)$tlod_only <- rowSums(VariantAnnotation::softFilterMatrix(vr))==ncol(VariantAnnotation::softFilterMatrix(vr))-1 & !VariantAnnotation::softFilterMatrix(vr)[,"t_lod"]
  }
  mcols(vr)$pass_all <- rowSums(VariantAnnotation::softFilterMatrix(vr))==ncol(VariantAnnotation::softFilterMatrix(vr))
  # GATK 4 has lists of probabilities in mcols. For now just get rid of them
  mcols(vr)$SA_MAP_AF <- ""
  mcols(vr)$SA_POST_PROB <- ""
  vars <- tibble::as_tibble(vr)
  vars <- vars %>% dplyr::mutate(cref=stringr::str_sub(alteration,1L,1L),
                                 calt=stringr::str_sub(alteration,2L,2L),
                                 mutect_odds=10**(TLOD-6.0))
  vars
}
# Return data frame with variant locations and ref/alt from maflite
.get_variant_locs_maf <- function(variant_file){
  cols <- c('chr','start','end','ref','alt')
  format <- c('ccccc')
  readr::read_tsv(variant_file,comment='#',col_names=cols,col_types=format) %>%
    dplyr::select(chr,start,end,ref,alt)
}
# Return data frame with variant locations and ref/alt from vcf
.get_variant_locs_vcf <- function(variant_file){
  cols <- c('chr','pos','id','ref','alt','qual','filter','info','format','tumor','normal')
  format <- c('ccccccccccc')
  readr::read_tsv(variant_file,comment="#",col_names=cols,col_types=format) %>%
    dplyr::select(chr,"start"=pos,"end"=pos,ref,alt)
}

# Return data frame with variant locations and ref/alt from Mutect1 call_stats file
.get_variant_locs_cs <- function(variant_file){
  col_spec <- readr::cols_only(contig='c',position='c',ref_allele='c',alt_allele='c',judgement='c',tumor_f='d','t_lod_fstar'='d')
  readr::read_tsv(variant_file,comment='#',col_types=col_spec) %>%
    dplyr::mutate(start=position,end=position) %>%
    dplyr::select("chr"=contig,start,end,"ref"=ref_allele,"alt"=alt_allele,'freq'=tumor_f,'tlod'=t_lod_fstar)
}

# Return data frame with variant locations and ref/alt from Varscan .snp file
.get_variant_locs_snp <- function(variant_file){
  # Varscan somatic leaves GL chromosome variants in. They are removed here.
  message('High confidence calls are generated with the default varscan algorithm, ',
          'tumor VAF > 15% normal VAF < 5%, p-val < 0.03')
  col_spec <- readr::cols_only(chrom='c',position='c',ref='c',var='c',
                               somatic_status='c',normal_var_freq='c',tumor_var_freq='c',
                               somatic_p_value='d')
  readr::read_tsv(variant_file,comment='#',col_types=col_spec) %>%
    dplyr::filter(! stringr::str_detect(chrom,'GL')) %>%
    dplyr::mutate(normal_var_freq=readr::parse_number(normal_var_freq),
                  tumor_var_freq=readr::parse_number(tumor_var_freq)) %>%
    dplyr::filter(somatic_status=="Somatic") %>%
    dplyr::select("chr"=chrom,"start"=position,"end"=position,"ref"=ref,"alt"=var)
}

.get_oncotated_variants <- function(onc_file){
  # Returns the full relevant data including rejection reasons as columns
  # so that this can be filtered to get those variants that only failed
  # because of TLOD
  fr <- readr::read_tsv(onc_file,comment="#",col_types = cols(.default='c'),na=character())
  fr <- fr %>% dplyr::filter(Tumor_Sample_Barcode=="TUMOR")
  fr
}
bmannakee/smartcallr documentation built on May 3, 2019, 1:47 p.m.