R/filterVcfMuTect.R

Defines functions .detectCaller filterVcfMuTect

Documented in filterVcfMuTect

#' Filter VCF MuTect
#' 
#' Function to remove artifacts and low confidence/quality calls from a MuTect
#' generated VCF file. Also applies filters defined in \code{filterVcfBasic}.
#' This function will only keep variants listed in the stats file and those not
#' matching the specified failure reasons.
#' 
#' 
#' @param vcf \code{CollapsedVCF} object, read in with the \code{readVcf}
#' function from the VariantAnnotation package.
#' @param tumor.id.in.vcf The tumor id in the VCF file, optional.
#' @param stats.file MuTect stats file. If \code{NULL}, will check if VCF
#' was generated by MuTect2 and if yes will call \code{\link{filterVcfMuTect2}}
#' instead.
#' @param ignore MuTect flags that mark variants for exclusion.
#' @param \dots Additional arguments passed to \code{\link{filterVcfBasic}}.
#' @return A list with elements \code{vcf}, \code{flag} and
#' \code{flag_comment}.  \code{vcf} contains the filtered \code{CollapsedVCF},
#' \code{flag} a \code{logical(1)} flag if problems were identified, further
#' described in \code{flag_comment}.
#' @author Markus Riester
#' @seealso \code{\link{filterVcfBasic}}
#' @examples
#' 
#' ### This function is typically only called by runAbsolute via the 
#' ### fun.filterVcf and args.filterVcf comments.
#' library(VariantAnnotation)    
#' vcf.file <- system.file("extdata", "example.vcf.gz", package="PureCN")
#' vcf <- readVcf(vcf.file, "hg19")
#' vcf.filtered <- filterVcfMuTect(vcf)        
#' 
#' @export filterVcfMuTect
filterVcfMuTect <- function(vcf, tumor.id.in.vcf = NULL, stats.file = NULL, 
ignore=c("clustered_read_position", "fstar_tumor_lod", "nearby_gap_events", 
"poor_mapping_region_alternate_allele_mapq", "poor_mapping_region_mapq0", 
"possible_contamination", "strand_artifact", "seen_in_panel_of_normals"),
...){
    if (is.null(stats.file) && .detectCaller(vcf) == "MuTect2/GATK4") {
        flog.info("Detected MuTect2 VCF.")
        return(filterVcfMuTect2(vcf, tumor.id.in.vcf, ...))
    }    
    if (is.null(stats.file)) return(
        filterVcfBasic(vcf, tumor.id.in.vcf, ...))
    
    stats <- read.delim(stats.file, as.is=TRUE, skip=1)

    if (is.null(stats$contig) || is.null(stats$position)) {
        flog.warn("MuTect stats file lacks contig and position columns.")
        return(filterVcfBasic(vcf, tumor.id.in.vcf, ...))
    }

    # check for excessive nearby_gap_events
    if ("nearby_gap_events" %in% ignore &&
        sum(grepl("nearby_gap_events", stats$failure_reasons))/nrow(stats) > 0.5) {
        ignore <- ignore[-match("nearby_gap_events", ignore)]
        flog.warn("Excessive nearby_gap_events, ignoring this flag. Check your data.")
    }

    gr.stats <- GRanges(seqnames=stats$contig, 
        IRanges(start=stats$position, end=stats$position))
    
    ov <- findOverlaps(vcf, gr.stats)
     
    if (!identical(queryHits(ov),subjectHits(ov)) || 
            nrow(vcf) != nrow(stats)) {
        n <- .countVariants(vcf)
        stats <- stats[subjectHits(ov),]
        vcf <- .removeVariants(vcf, !seq(length(vcf)) %in% queryHits(ov), 
                               "MuTect align")
        flog.warn("MuTect stats file and VCF file do not align perfectly. Will remove %i unmatched variants.",
            n-.countVariants(vcf))
    }    
    if (is.null(stats$failure_reasons)) {
        flog.warn("MuTect stats file lacks failure_reasons column.%s",
            " Keeping all variants listed in stats file.")
        return(filterVcfBasic(vcf, tumor.id.in.vcf, ...))
    }

    n <- .countVariants(vcf)

    ids <- sort(unique(unlist(sapply(ignore, grep, stats$failure_reasons))))
    vcf <- .removeVariants(vcf, ids, "MuTect")

    flog.info("Removing %i MuTect calls due to blacklisted failure reasons.", 
        n-.countVariants(vcf))
    filterVcfBasic(vcf, tumor.id.in.vcf, ...)
}
    
.detectCaller <- function(vcf) {
    gatkVersion <- meta(header(vcf))[["GATKCommandLine"]]$Version[1]
    if (!is.null(gatkVersion)) {
        gatkVersion <- gsub("\\\"", "", gatkVersion)
        if (grepl("^4", gatkVersion)) return("MuTect2/GATK4")
    }    
    return("")
}    

Try the PureCN package in your browser

Any scripts or data that you put into this service are public.

PureCN documentation built on Nov. 8, 2020, 5:37 p.m.