R/call.variants.fisher.R

Defines functions callVariantsPairedFisher .blockify

Documented in callVariantsPairedFisher

.blockify <- function( positions ){
  if(length(positions) == 1){
    return(c(1))
  }else{
    distances <- diff(positions)
    blocks <- rep(0, length(positions))
    blocks[1] <- 1
    for( j in seq(length(distances)) ){
      if( distances[j] == 1 ){
        blocks[j+1] <- blocks[j]
      }else{
        blocks[j+1] <- blocks[j] + 1
      }
    }
    return( blocks ) 
  }
}

callVariantsPairedFisher <- function( data, sampledata, pValCutOff = 0.05, minCoverage = 5, mergeDels = TRUE, mergeAggregator = mean ){
  blocksize <- data$h5dapplyInfo$Blockend - data$h5dapplyInfo$Blockstart + 1
  cases <- which( sampledata$Type == "Case" )
  controls <- which( sampledata$Type == "Control" )
  stopifnot(length(cases)>0, length(controls)>0)
  names(cases) <- sampledata$Sample[cases]
  callDels <- "Deletions" %in% names(data)
  the_bases <- c( "A", "C", "G", "T", "-", "N" )
  if(callDels){
    altAlleles <- the_bases[1:5]
  }else{
    altAlleles <- the_bases[1:4]
  }
  ret <- lapply(seq(along=cases), function(i) {
    caseColumn    = sampledata$Column[cases[i]]
    controlColumn = if( "Group" %in% colnames(sampledata) ){
      sampledata$Column[ (sampledata$Group == sampledata$Group[cases[i]]) & (sampledata$Type == "Control") ]
    }else{
      sampledata$Column[ (sampledata$Patient == sampledata$Patient[cases[i]]) & (sampledata$Type == "Control") ]
    }
    if( length(controlColumn) != 1 ){
      msg = paste("Number of 'Control' samples for patient", sampledata$Patient[cases[i]], "is", length(controlColumn) )
      con = textConnection("tmpstr", open="w"); tmpstr=""; sink(con); print(sampledata); sink()
      msg = paste(msg, "'sampledata' is:", paste(tmpstr, collapse="\n"), sep="\n") 
      stop(msg)
    }
    caseCounts      <- apply(data$Counts[,caseColumn,,], c(1,3), sum)
    caseCounts      <- caseCounts[1:4,] + caseCounts[5:8,] + caseCounts[9:12,]
    controlCounts   <- apply(data$Counts[,controlColumn,,], c(1,3), sum)
    controlCounts   <- controlCounts[1:4,] + controlCounts[5:8,] + controlCounts[9:12,]
    caseCoverage    <- data$Coverages[caseColumn,1,] + data$Coverages[caseColumn,2,]
    controlCoverage <- data$Coverages[controlColumn,1,] + data$Coverages[controlColumn,2,]
    if( callDels ){
      caseDeletions    <- data$Deletions[caseColumn,1,] + data$Deletions[caseColumn,2,]
      controlDeletions <- data$Deletions[controlColumn,1,] + data$Deletions[controlColumn,2,]  
    }
    refs <- data$Reference
    refs[refs == -1] = 5
    refs <- the_bases[refs + 1]
    template <- data.frame(
      Sample = names(cases)[i],
      Chrom = strsplit(data$h5dapplyInfo$Group, split="/")[[1]][3],
      Start = seq(length = blocksize) + data$h5dapplyInfo$Blockstart - 1,
      End = seq(length = blocksize) + data$h5dapplyInfo$Blockstart - 1,
      refAllele = refs
      )
    pvals <- lapply( seq(length = length(altAlleles)), function(altAllele){
      tmp <- template
      tmp$altAllele <- the_bases[altAllele]
      tmp$caseCoverage <- caseCoverage
      tmp$controlCoverage <- controlCoverage
      if( altAllele == 5 ){
        tmp$caseCount <- caseDeletions
        tmp$controlCount <- controlDeletions
        tmp$caseCoverage <- tmp$caseCoverage + caseDeletions
        tmp$controlCoverage <- tmp$controlCoverage + controlDeletions
      }else{
        tmp$caseCount <- caseCounts[altAllele,]
        tmp$controlCount <- controlCounts[altAllele,]
      }
      tmp$pValue <- sapply(
        seq(length = blocksize),
        function(gp) {
          contingencyTable <- matrix(
            c( tmp$caseCount[gp], tmp$caseCoverage[gp] - tmp$caseCount[gp], tmp$controlCount[gp], tmp$controlCoverage[gp] - tmp$controlCount[gp] ),
            nrow = 2
          )
          ft = fisher.test( contingencyTable )
          return( ft$p.value )
        })
      tmp <- subset(tmp, pValue <= pValCutOff & caseCoverage >= minCoverage & controlCoverage >= minCoverage )
      if(nrow(tmp) == 0){
        return(NULL)
      }
      if( callDels & mergeDels & altAllele == 5 ){
        tmp$BlockID <- .blockify(tmp$Start)
        blockCounts <- table(tmp$BlockID)
        singleValueBlocks <- names(blockCounts)[blockCounts == 1]
        tmptmp <- subset(tmp, !(BlockID %in% singleValueBlocks) )
        tmp <- subset(tmp, (BlockID %in% singleValueBlocks) )
        tmptmp <- lapply( split( x = tmptmp, f = tmptmp$BlockID ), function(df){
          tmp_df <- data.frame( Sample = df$Sample[1], Chrom = df$Chrom[1], Start = min(df$Start), End = max(df$End), refAllele = paste(df$refAllele, collapse=""), altAllele = "-" )
          for( slot in colnames(df) ){
            if(!(slot %in% colnames(tmp_df))){
              tmp_df[[slot]] <- mergeAggregator( df[[slot]] )
            }
          }
          return(tmp_df)
        })
        tmp <- rbind( tmp, do.call(rbind, tmptmp) )
        tmp$BlockID <- NULL
      }
      return(tmp)
      }
    )
    
    return( do.call(rbind, pvals) )
  })
  ret <- do.call( rbind, ret )
  ret$caseAF <- ret$caseCount / ret$caseCoverage
  ret$controlAF <- ret$controlCount / ret$controlCoverage
  return(ret)
}

Try the h5vc package in your browser

Any scripts or data that you put into this service are public.

h5vc documentation built on Nov. 8, 2020, 4:56 p.m.