R/mutFilterSB.R

Defines functions calP calSBscore mutFilterSB

Documented in mutFilterSB

#' mutFilterSB
#' @description Filter variants based on strand bias.
#'
#' @param maf An MAF object, generated by \code{\link{vcfToMAF}} function.
#' @param method Method will be used to detect strand bias,
#' including 'SOR' and 'Fisher'. Default: 'SOR'. SOR: StrandOddsRatio
#' (https://gatk.broadinstitute.org/hc/en-us/articles/360041849111-
#' StrandOddsRatio) Fisher's Exat Test: Switch to Phred socre
#' (https://gatk.broadinstitute.org/hc/en-us/articles/360035532152-Fisher-
#' s-Exact-Test)
#' @param SBscore Cutoff strand bias score used to filter variants.
#' Default: 3
#' @importFrom methods is
#'
#' @return An MAF data frame where some variants
#' have S tag in CaTag column for strand bias filtration
#'
#' @export mutFilterSB
#' @examples
#' maf <- vcfToMAF(system.file("extdata",
#' "WES_EA_T_1_mutect2.vep.vcf", package="CaMutQC"))
#' mafF <- mutFilterSB(maf)

mutFilterSB <- function(maf, method = 'SOR', SBscore = 3) {
  # check user input
  if (!(is(maf, "data.frame"))) {
    stop("maf input should be a data frame, did you get it from vcfToMAF function?")
  }
  
  if (method == 'Fisher') {
    # use for loop to get the SB score for each variation
    for (i in seq_len(nrow(maf))) {
      SBindex <- strsplit(maf$FORMAT[i], ':')[[1]] == 'SB'
      # different caller may have different field name to record strand read
      if (all(SBindex == FALSE)){
        if (length(grep('DP4', maf$FORMAT[i]))){
          DP4index <- strsplit(maf$FORMAT[i], ':')[[1]] == 'DP4'
          SBcharmatix <- strsplit(maf[i, 'tumorSampleInfo'], ':')[[1]][DP4index]
        }else{
          F1R2index <- strsplit(maf$FORMAT[i], ':')[[1]] == 'F1R2'
          F2R1index <- strsplit(maf$FORMAT[i], ':')[[1]] == 'F2R1'
          F1R2 <- strsplit(maf[i, 'tumorSampleInfo'], ':')[[1]][F1R2index]
          F2R1 <- strsplit(maf[i, 'tumorSampleInfo'], ':')[[1]][F2R1index]
          rf <- strsplit(F1R2, ',')[[1]][1]
          af <- strsplit(F1R2, ',')[[1]][2]
          rv <- strsplit(F2R1, ',')[[1]][1]
          av <- strsplit(F2R1, ',')[[1]][2]
          SBcharmatix <- paste(rf, rv, af, av, sep = ',')
        }
      }else{
        SBcharmatix <- strsplit(maf[i, 'tumorSampleInfo'], ':')[[1]][SBindex]
      }
      if (calSBscore(SBcharmatix, method) > SBscore) {
        maf[i, 'CaTag'] <- paste0(maf[i, 'CaTag'] , 'S')
      }
    }
  }else if (method == 'SOR'){
    for (i in seq_len(nrow(maf))) {
      if(length(grep('ALT_F1R2', maf$FORMAT[i]))){
        altF1R2index <- strsplit(maf$FORMAT[i], ':')[[1]] == 'ALT_F1R2'
        altF2R1index <- strsplit(maf$FORMAT[i], ':')[[1]] == 'ALT_F2R1'
        refF1R2index <- strsplit(maf$FORMAT[i], ':')[[1]] == 'REF_F1R2'
        refF2R1index <- strsplit(maf$FORMAT[i], ':')[[1]] == 'REF_F2R1'
        altF1R2 <- strsplit(maf[i, 'tumorSampleInfo'], ':')[[1]][altF1R2index]
        altF2R1 <- strsplit(maf[i, 'tumorSampleInfo'], ':')[[1]][altF2R1index]
        refF1R2 <- strsplit(maf[i, 'tumorSampleInfo'], ':')[[1]][refF1R2index]
        refF2R1 <- strsplit(maf[i, 'tumorSampleInfo'], ':')[[1]][refF2R1index]
        SBcharmatix <- paste(refF1R2, refF2R1, altF1R2, altF2R1, sep = ',')
        if (calSBscore(SBcharmatix, method, rorder=TRUE) > SBscore) {
          maf[i, 'CaTag'] <- paste0(maf[i, 'CaTag'] , 'S')
        }
      }else if (length(grep('F1R2', maf$FORMAT[i]))){
        F1R2index <- strsplit(maf$FORMAT[i], ':')[[1]] == 'F1R2'
        F2R1index <- strsplit(maf$FORMAT[i], ':')[[1]] == 'F2R1'
        F1R2 <- strsplit(maf[i, 'tumorSampleInfo'], ':')[[1]][F1R2index]
        F2R1 <- strsplit(maf[i, 'tumorSampleInfo'], ':')[[1]][F2R1index]
        SBcharmatix <- paste(F1R2, F2R1, sep = ',')
        if (calSBscore(SBcharmatix, method) > SBscore) {
            maf[i, 'CaTag'] <- paste0(maf[i, 'CaTag'] , 'S')
        }
      }else if (length(grep('DP4', maf$FORMAT[i]))){
        DP4index <- strsplit(maf$FORMAT[i], ':')[[1]] == 'DP4'
        SBcharmatix <- strsplit(maf[i, 'tumorSampleInfo'], ':')[[1]][DP4index]
        if (calSBscore(SBcharmatix, method, rorder=TRUE) > SBscore) {
            maf[i, 'CaTag'] <- paste0(maf[i, 'CaTag'] , 'S')
        }
      }
    }
  }
    return(maf)
}

## construct function to calculate the SB score for strand bias detection
calSBscore <- function(charmatrix, method = 'SOR', rorder = FALSE){
    depths <- as.numeric(strsplit(charmatrix, ",")[[1]])
    if (method == 'SOR') {
      if (!(rorder)){
        refFw <- depths[1] + 1
        refRv <- depths[3] + 1
        altFw <- depths[2] + 1
        altRv <- depths[4] + 1
      }else{
        refFw <- depths[1] + 1
        refRv <- depths[2] + 1
        altFw <- depths[3] + 1
        altRv <- depths[4] + 1
      }
      symmetricalRatio <- (refFw * altRv)/(refRv * altFw) +
        (refRv * altFw) / (refFw * altRv)
      refRatio <- min(refRv, refFw) / max(refRv, refFw)
      altRatio <- min(altRv, altFw) / max(altRv, altFw)
      score <- log(symmetricalRatio) + log(refRatio) - log(altRatio)
    }else if (method == 'Fisher')   {
      refFw <- depths[1]
      refRv <- depths[2]
      altFw <- depths[3]
      altRv <- depths[4]
      Row1 <- refRv + refFw
      Row2 <- altRv + altFw
      Col1 <- refFw + altFw
      Col2 <- refRv + altRv
      P1 <- calP(refFw, refRv, altFw, altRv)
      # check P1 whether it is over the depth
      if (is.na(P1)){
          mes <- paste0('Data is in high coverage, so the factorial of its depth',
                        ' cannot be obtained, ', ' use \'SOR\' method instead.')
          stop(mes)
      }
      # probability of observing more extreme data
      vaildP <- c()
      # get the range of refFw
      maxR <- min(Row1, Col1)
      minR <- max(0, Row1-Col2)
      for (i in seq(minR, maxR)){
          exP <- calP(i, Row1-i, Col1-i, Col2 - Row1 + i)
          if(exP <= P1){
              vaildP <- c(vaildP, exP)
          }
      }
      # get sum of P
      Pt <- sum(P1, vaildP)
      # get Phred score
      score <- -10 * log10(Pt)
    }else {
        stop('Please select a method from SOR and Fisher')
    }
    return(score)
}

# healper function to calculate probability P
calP <- function(refFw, refRv, altFw, altRv){
    R1 <- refRv + refFw
    R2 <- altRv + altFw
    C1 <- refFw + altFw
    C2 <- refRv + altRv
    # probability P
    P <- (factorial(R1) * factorial(R2) * factorial(C1) * factorial(C2))/
      (factorial(R1 + R2) * factorial(refRv) * factorial(refFw) *
         factorial(altFw) * factorial(altRv))
    return(P)
}
likelet/CaMutQC documentation built on Aug. 17, 2024, 4 a.m.