R/mutFilterPON.R

Defines functions mutFilterPON

Documented in mutFilterPON

#' mutFilterPON
#' @description Filter variants based on Panel of Normals
#'
#' @param maf An MAF data frame, generated by \code{\link{vcfToMAF}} function.
#' @param PONfile Panel-of-Normals files, which can be either obtained through 
#' GATK (https://gatk.broadinstitute.org/hc/en-us/articles/360035890631-Panel-of-Normals-PON-)
#' or generated by users. Should have at least four columns: CHROM, POS, REF, ALT
#' @param PONformat The format of PON file, either "vcf" or "txt". Default: "vcf"
#' @param verbose Whether to generate message/notification during the 
#' filtration process. Default: TRUE.
#'
#' @return An MAF data frame where some variants have P tag in CaTag column
#' for PON filtration.
#' @import vcfR stringr
#' @importFrom methods is
#'
#' @export mutFilterPON
#' @examples
#' maf <- vcfToMAF(system.file("extdata",
#' "WES_EA_T_1_mutect2.vep.vcf", package="CaMutQC"))
#' mafF <- mutFilterPON(maf, PONfile=system.file("extdata",
#' "PON_test.txt", package="CaMutQC"), PONformat="txt")

## PON filtration using external dataset and info flag
mutFilterPON <- function(maf, PONfile, PONformat = "vcf", verbose = TRUE) {
    # check user input
    if (!(is(maf, "data.frame"))) {
        stop("maf input should be a data frame, did you get it from vcfToMAF function?")
    }
    # give recommendation for PON file
    if (verbose) {
        url <- "https://gatk.broadinstitute.org/hc/en-us/articles/360035890631-Panel-of-Normals-PON-"
        mes <- paste0("\n  PON file can be obtained from Public GATK Panel of Normals at ",
                      url)
        message(mes)
    }
    # first filter using the flag in maf column
    if('PON' %in% colnames(maf)) {
        # add P tag if variants have been labelled
        preLabelled <- rownames(maf[!(is.na(maf$PON)), ])
    }else{
        preLabelled <- NULL
    }
    # joint useful information for matching
    genomeVersion <- unique(maf$NCBI_Build)
    if (length(genomeVersion) > 1){
        stop('There are more than one version of genome in this dataset.')
    } else if (!(genomeVersion %in% c('GRCh38', 'GRCh37'))){
        stop('Invaild genome version. CaMutQC only supports human genome so far.')
    } else {
        # load PON file
        if (verbose) {
            message('  Loading PON data...')
        }
        if (PONformat == "vcf") {
            invisible(capture.output(somatic <- read.vcfR(PONfile)))
            # give PON a unique label, so they can be easily found and targeted
            somatic@fix[, 'ID'] <- 1
            pon_maf <- merge(maf, somatic@fix, all.x=TRUE, 
                           by = c("Chromosome", "Start_Position", 
                                  "Reference_Allele", "Tumor_Seq_Allele2"), 
                           by.y = c("CHROM", "POS", "REF", "ALT"), all.y=FALSE)
        }else if (PONformat == "txt") {
            PON_frame <- read.table(PONfile, header=TRUE, sep = "\t")
            PON_frame$ID <- 1
            # get the intersect veriants using merging action
            pon_maf <- merge(maf, PON_frame, all.x=TRUE, 
                             by.x = c("Chromosome", "Start_Position", 
                                      "Reference_Allele", "Tumor_Seq_Allele2"), 
                             by.y = c("CHROM", "POS", "REF", "ALT"), all.y=FALSE)
        }else{
            stop("Wrong PONformat, should be 'vcf' or 'txt'")
        }
        # add tag qualified variants
        discard <- rownames(pon_maf[which(pon_maf$ID == 1), ])
        maf[union(discard, preLabelled), 'CaTag'] <- paste0(maf[union(discard,
                                                               preLabelled),
                                                              'CaTag'], 'P')
        return(maf)
    }
}
likelet/CaMutQC documentation built on Aug. 17, 2024, 4 a.m.