R/call.variants.R

Defines functions callVariantsPaired vcConfParams

Documented in callVariantsPaired vcConfParams

vcConfParams <- function(
  minStrandCov = 5,
  maxStrandCov = 200,
  minStrandAltSupport = 2,
  maxStrandAltSupportControl = 0,
  minStrandDelSupport = minStrandAltSupport,
  maxStrandDelSupportControl = maxStrandAltSupportControl,
  minStrandInsSupport = minStrandAltSupport,
  maxStrandInsSupportControl = maxStrandAltSupportControl,
  minStrandCovControl = 5,
  maxStrandCovControl = 200,
  bases = 5:8,
  returnDataPoints = TRUE,
  annotateWithBackground = TRUE,
  mergeCalls = TRUE,
  mergeAggregator = mean,
  pValueAggregator = max
  ){
  return(
    list(
      minStrandCov = minStrandCov,
      maxStrandCov = maxStrandCov,
      minStrandAltSupport = minStrandAltSupport,
      maxStrandAltSupportControl = maxStrandAltSupportControl,
      minStrandDelSupport = minStrandDelSupport,
      maxStrandDelSupportControl = maxStrandDelSupportControl,
      minStrandInsSupport = minStrandInsSupport,
      maxStrandInsSupportControl = maxStrandInsSupportControl,
      minStrandCovControl = minStrandCovControl,
      maxStrandCovControl = maxStrandCovControl,
      bases = bases,
      returnDataPoints = returnDataPoints,
      annotateWithBackground = annotateWithBackground,
      mergeCalls = mergeCalls,
      mergeAggregator = mergeAggregator,
      pValueAggregator = pValueAggregator
      )
    )
}

callVariantsPaired <- function( data, sampledata, cl = vcConfParams() ){
  refBases <- c("A", "C", "G", "T", "N")
  cases = which( sampledata$Type == "Case" )
  controls = which( sampledata$Type == "Control" )
  stopifnot(length(cases)>0, length(controls)>0)
  if( cl$annotateWithBackground ){
    ## Background Error Estimates
    cc = sampledata$Column[controls]
    controlCounts = data$Counts[1:4,cc,,,drop=F] + data$Counts[5:8,cc,,,drop=F] + data$Counts[9:12,cc,,,drop=F]
    controlCoverages = data$Coverages[cc,,,drop=F]
    mismatchFrequencies = colSums(colSums( controlCounts )) / colSums(controlCoverages)
    controlDeletions = data$Deletions[cc,,]
    deletionFrequencies = colSums(controlDeletions) / ( colSums(controlCoverages) + colSums(controlDeletions) )
    if("Insertions" %in% names(data)){
      controlInsertions = data$Insertions[cc,,]
      insertionFrequencies = colSums(controlInsertions) / ( colSums(controlCoverages) + colSums(controlInsertions) )  
    }
  }
  names(cases) = sampledata$Sample[cases]
  ret = lapply(seq(along=cases), function(i) {
    caseColumn    = sampledata$Column[cases[i]]
    controlColumn = if( "Group" %in% colnames(sampledata) ){
      sampledata$Column[ (sampledata$Group == sampledata$Group[cases[i]]) & (sampledata$Type == "Control") ]
    }else{
      sampledata$Column[ (sampledata$Patient == sampledata$Patient[cases[i]]) & (sampledata$Type == "Control") ]
    }
    if( length(controlColumn) != 1 ){
      msg = paste("Number of 'Control' samples for patient", sampledata$Patient[cases[i]], "is", length(controlColumn) )
      con = textConnection("tmpstr", open="w"); tmpstr=""; sink(con); print(sampledata); sink()
      msg = paste(msg, "'sampledata' is:", paste(tmpstr, collapse="\n"), sep="\n") 
      stop(msg)
    }
    caseCounts      = data$Counts[cl$bases,caseColumn,,]
    ## sum up all mismatches, also the low quality ones
    controlCounts   = data$Counts[1:4,controlColumn,,] + data$Counts[5:8,controlColumn,,] + data$Counts[9:12,controlColumn,,] 
    caseCoverage    = data$Coverages[caseColumn,,]
    controlCoverage = data$Coverages[controlColumn,,]   
    cov_filter = (caseCoverage[1,] >= cl$minStrandCov &
                  caseCoverage[1,] <= cl$maxStrandCov &
                  caseCoverage[2,] >= cl$minStrandCov &
                  caseCoverage[2,] <= cl$maxStrandCov &
                  controlCoverage[1,] >= cl$minStrandCov &
                  controlCoverage[1,] <= cl$maxStrandCov &
                  controlCoverage[2,] >= cl$minStrandCov &
                  controlCoverage[2,] <= cl$maxStrandCov)
    count_filter = (caseCounts[,1,] >= cl$minStrandAltSupport &
                    caseCounts[,2,] >= cl$minStrandAltSupport &
                    controlCounts[,1,] <= cl$maxStrandAltSupportControl &
                    controlCounts[,2,] <= cl$maxStrandAltSupportControl)
    combined_filter = matrix(
      rep( cov_filter, dim(count_filter)[1] ),
      nrow = dim(count_filter)[1], byrow=TRUE
      ) & count_filter
    varpos = which(colSums( combined_filter ) > 0)
    caseDeletions    = data$Deletions[caseColumn,,]
    controlDeletions = data$Deletions[controlColumn,,]
    deletions_filter = (caseDeletions[1,] >= cl$minStrandDelSupport &
                          caseDeletions[2,] >= cl$minStrandDelSupport &
                          controlDeletions[1,] <= cl$maxStrandDelSupportControl &
                          controlDeletions[2,] <= cl$maxStrandDelSupportControl)
    combined_filter_del = cov_filter & deletions_filter
    delpos = which(combined_filter_del)
    if("Insertions" %in% names(data)){
      caseInsertions    = data$Insertions[caseColumn,,]
      controlInsertions = data$Insertions[controlColumn,,]
      insertions_filter = (caseInsertions[1,] >= cl$minStrandInsSupport &
                            caseInsertions[2,] >= cl$minStrandInsSupport &
                            controlInsertions[1,] <= cl$maxStrandInsSupportControl &
                            controlInsertions[2,] <= cl$maxStrandInsSupportControl)
      combined_filter_ins = cov_filter & insertions_filter
      inspos = which(combined_filter_ins)  
    }else{
      inspos <- NULL
    }
    if( cl$returnDataPoints ){
      vars <- {
        if( length(varpos) > 0 ){
          idxs = lapply( varpos, function(j) which(combined_filter[,j]) )
          alt_alleles = lapply(idxs, function(j) bases[j])
          gr_pos = unlist(lapply( seq( along = idxs ), function( j ) rep(varpos[j], length(idxs[[j]]))))
          idxs = unlist(idxs)
          gr_aa = unlist(alt_alleles)
          gr_mmf_case = sapply( seq(along = gr_pos), function( j ) caseCounts[idxs[j],1,gr_pos[j]] )
          gr_mmr_case = sapply( seq(along = gr_pos), function( j ) caseCounts[idxs[j],2,gr_pos[j]] )
          gr_cf_case  = sapply( seq(along = gr_pos), function( j ) caseCoverage[1,gr_pos[j]] )
          gr_cr_case  = sapply( seq(along = gr_pos), function( j ) caseCoverage[2,gr_pos[j]] )
          gr_mmf_cont = sapply( seq(along = gr_pos), function( j ) controlCounts[idxs[j],1,gr_pos[j]] )
          gr_mmr_cont = sapply( seq(along = gr_pos), function( j ) controlCounts[idxs[j],2,gr_pos[j]] )
          gr_cf_cont  = sapply( seq(along = gr_pos), function( j ) controlCoverage[1,gr_pos[j]] )
          gr_cr_cont  = sapply( seq(along = gr_pos), function( j ) controlCoverage[2,gr_pos[j]] )
          tmpRefIdx <- data$Reference[gr_pos] + 1
          tmpRefIdx[tmpRefIdx == 0] <- 5
          tmpRef <- refBases[tmpRefIdx]
          tmp = data.frame(
            Chrom = strsplit( data$h5dapplyInfo$Group, split="/")[[1]][3],
            Start = gr_pos + data$h5dapplyInfo$Blockstart - 1,
            End = gr_pos + data$h5dapplyInfo$Blockstart - 1,
            Sample = names(cases)[i],
            altAllele = gr_aa,
            refAllele = tmpRef,
            caseCountFwd = gr_mmf_case,
            caseCountRev = gr_mmr_case,
            caseCoverageFwd = gr_cf_case,
            caseCoverageRev = gr_cr_case,
            controlCountFwd = gr_mmf_cont,
            controlCountRev = gr_mmr_cont,
            controlCoverageFwd = gr_cf_cont,
            controlCoverageRev = gr_cr_cont
          )
          rm(tmpRef)
          rm(tmpRefIdx)
          if( cl$annotateWithBackground ){
            tmp$backgroundFrequencyFwd = mismatchFrequencies[1, gr_pos]
            tmp$backgroundFrequencyRev = mismatchFrequencies[2, gr_pos]
            tmp$pValueFwd = sapply( seq(along=gr_pos), function( j ) binom.test( caseCounts[idxs[j],1,gr_pos[j]], caseCoverage[1,gr_pos[j]], p = mismatchFrequencies[1,gr_pos[j]], alternative = "greater" )$p.value )
            tmp$pValueRev = sapply( seq(along=gr_pos), function( j ) binom.test( caseCounts[idxs[j],2,gr_pos[j]], caseCoverage[2,gr_pos[j]], p = mismatchFrequencies[2,gr_pos[j]], alternative = "greater" )$p.value )
          }
          tmp
        }else{
          NULL
        }
      }
      dels <- {
        if( length(delpos) > 0 ){
          tmpRefIdx <- data$Reference[delpos] + 1
          tmpRefIdx[tmpRefIdx == 0] <- 5
          tmpRef <- refBases[tmpRefIdx]
          tmp = data.frame(
            Chrom = strsplit( data$h5dapplyInfo$Group, split="/")[[1]][3],
            Start = delpos + data$h5dapplyInfo$Blockstart - 1,
            End = delpos + data$h5dapplyInfo$Blockstart - 1,
            Sample = names(cases)[i],
            altAllele = "-",
            refAllele = tmpRef,
            caseCountFwd = sapply( delpos, function(j) caseDeletions[1,j]),
            caseCountRev = sapply( delpos, function(j) caseDeletions[2,j]),
            caseCoverageFwd = sapply( delpos, function(j) caseCoverage[1,j]),
            caseCoverageRev = sapply( delpos, function(j) caseCoverage[2,j]),
            controlCountFwd = sapply( delpos, function(j) controlDeletions[1,j]),
            controlCountRev = sapply( delpos, function(j) controlDeletions[2,j]),
            controlCoverageFwd = sapply( delpos, function(j) controlCoverage[1,j]),
            controlCoverageRev = sapply( delpos, function(j) controlCoverage[2,j])
          )
          rm(tmpRef)
          rm(tmpRefIdx)
          # Deletions don't count as coverage but the reads still span that position
          tmp$caseCoverageFwd = tmp$caseCoverageFwd + tmp$caseCountFwd
          tmp$caseCoverageRev = tmp$caseCoverageRev + tmp$caseCountRev
          tmp$controlCoverageFwd = tmp$controlCoverageFwd + tmp$controlCountFwd
          tmp$controlCoverageRev = tmp$controlCoverageRev + tmp$controlCountRev
          if( cl$annotateWithBackground ){  
            tmp$pValueFwd = sapply( delpos, function( j ) binom.test( caseDeletions[1,j], caseCoverage[1,j] + caseDeletions[1,j], p = deletionFrequencies[1,j], alternative = "greater" )$p.value )
            tmp$pValueRev = sapply( delpos, function( j ) binom.test( caseDeletions[2,j], caseCoverage[2,j] + caseDeletions[2,j], p = deletionFrequencies[2,j], alternative = "greater" )$p.value )
            tmp$backgroundFrequencyFwd = deletionFrequencies[1, delpos]
            tmp$backgroundFrequencyRev = deletionFrequencies[2, delpos]
          }
          if( cl$mergeCalls ){
            sample_gr = list()
            chrom = strsplit( data$h5dapplyInfo$Group, split="/")[[1]][3]
            if( nrow(tmp) != 0 ){
              tmp_gr = GRanges( S4Vectors::Rle( tmp$Chrom ), ranges = IRanges::IRanges( start = tmp$Start, end = tmp$End ) )
              reduced_gr = IRanges::reduce(tmp_gr)
              reduced_gr$Sample = names(cases)[i]
              ol = IRanges::findOverlaps(reduced_gr, tmp_gr)
              reduced_gr$altAllele = "-"
              reduced_gr$refAllele = NA
              reduced_gr$caseCountFwd = NA
              reduced_gr$caseCountRev = NA
              reduced_gr$caseCoverageFwd = NA
              reduced_gr$caseCoverageRev = NA
              reduced_gr$controlCountFwd = NA
              reduced_gr$controlCountRev = NA
              reduced_gr$controlCoverageFwd = NA
              reduced_gr$controlCoverageRev = NA
              if( cl$annotateWithBackground ){
                reduced_gr$backgroundFrequencyFwd = NA
                reduced_gr$backgroundFrequencyRev = NA
                reduced_gr$pValueFwd = NA
                reduced_gr$pValueRev = NA
              }
              for( id in seq( length(reduced_gr) ) ){
                reduced_gr$refAllele[id] = paste0(tmp$refAllele[subjectHits(ol)[queryHits(ol)==id]], collapse="")
                reduced_gr$caseCoverageFwd[id] = cl$mergeAggregator(tmp$caseCoverageFwd[subjectHits(ol)[queryHits(ol)==id]])
                reduced_gr$caseCoverageRev[id] = cl$mergeAggregator(tmp$caseCoverageRev[subjectHits(ol)[queryHits(ol)==id]])
                reduced_gr$caseCountFwd[id] = cl$mergeAggregator(tmp$caseCountFwd[subjectHits(ol)[queryHits(ol)==id]])
                reduced_gr$caseCountRev[id] = cl$mergeAggregator(tmp$caseCountRev[subjectHits(ol)[queryHits(ol)==id]])
                reduced_gr$controlCoverageFwd[id] = cl$mergeAggregator(tmp$controlCoverageFwd[subjectHits(ol)[queryHits(ol)==id]])
                reduced_gr$controlCoverageRev[id] = cl$mergeAggregator(tmp$controlCoverageRev[subjectHits(ol)[queryHits(ol)==id]])
                reduced_gr$controlCountFwd[id] = cl$mergeAggregator(tmp$controlCountFwd[subjectHits(ol)[queryHits(ol)==id]])
                reduced_gr$controlCountRev[id] = cl$mergeAggregator(tmp$controlCountRev[subjectHits(ol)[queryHits(ol)==id]])
                if( cl$annotateWithBackground ){
                  reduced_gr$backgroundFrequencyFwd[id] = cl$mergeAggregator(tmp$backgroundFrequencyFwd[subjectHits(ol)[queryHits(ol)==id]])
                  reduced_gr$backgroundFrequencyRev[id] = cl$mergeAggregator(tmp$backgroundFrequencyRev[subjectHits(ol)[queryHits(ol)==id]])
                  reduced_gr$pValueFwd[id] = cl$pValueAggregator(tmp$pValueFwd[subjectHits(ol)[queryHits(ol)==id]])
                  reduced_gr$pValueRev[id] = cl$pValueAggregator(tmp$pValueRev[subjectHits(ol)[queryHits(ol)==id]])
                }
              }
              tmp = as.data.frame( reduced_gr )
              tmp$width = NULL
              tmp$strand = NULL
              colnames(tmp)[1:3] = c( "Chrom", "Start", "End" )
            }
          }
          tmp
        }else{
          NULL
        }
      }
      insertions <- {
        if( length(inspos) > 0 ){
          tmpRefIdx <- data$Reference[inspos] + 1
          tmpRefIdx[tmpRefIdx == 0] <- 5
          tmpRef <- refBases[tmpRefIdx]
          tmp = data.frame(
            Chrom = strsplit( data$h5dapplyInfo$Group, split="/")[[1]][3],
            Start = delpos + data$h5dapplyInfo$Blockstart - 1,
            End = delpos + data$h5dapplyInfo$Blockstart - 1,
            Sample = names(cases)[i],
            altAllele = "+",
            refAllele = tmpRef,
            caseCountFwd = sapply( inspos, function(j) caseInsertions[1,j]),
            caseCountRev = sapply( inspos, function(j) caseInsertions[2,j]),
            caseCoverageFwd = sapply( inspos, function(j) caseCoverage[1,j]),
            caseCoverageRev = sapply( inspos, function(j) caseCoverage[2,j]),
            controlCountFwd = sapply( inspos, function(j) controlInsertions[1,j]),
            controlCountRev = sapply( inspos, function(j) controlInsertions[2,j]),
            controlCoverageFwd = sapply( inspos, function(j) controlCoverage[1,j]),
            controlCoverageRev = sapply( inspos, function(j) controlCoverage[2,j])
          )
          rm(tmpRef)
          rm(tmpRefIdx)
          if( cl$annotateWithBackground ){  
            tmp$pValueFwd = sapply( inspos, function( j ) binom.test( caseInsertions[1,j], caseCoverage[1,j] + caseInsertions[1,j], p = insertionFrequencies[1,j], alternative = "greater" )$p.value )
            tmp$pValueRev = sapply( inspos, function( j ) binom.test( caseInsertions[2,j], caseCoverage[2,j] + caseInsertions[2,j], p = insertionFrequencies[2,j], alternative = "greater" )$p.value )
            tmp$backgroundFrequencyFwd = insertionFrequencies[1, inspos]
            tmp$backgroundFrequencyRev = insertionFrequencies[2, inspos]
          }
          tmp
        }else{
          NULL
        }
      }
      ret <- rbind( vars, dels, insertions )
      ret$caseCount <- ret$caseCountFwd + ret$caseCountRev
      ret$controlCount <- ret$controlCountFwd + ret$controlCountRev
      ret$caseCoverage <- ret$caseCoverageFwd + ret$caseCoverageRev
      ret$controlCoverage <- ret$controlCoverageFwd + ret$controlCoverageRev
      return(ret)
    }else{
      if( length(varpos) > 0 | length(delpos) > 0 | length(inspos) > 0 ){
        return( c( varpos, delpos, inspos ) + data$h5dapplyInfo$Blockstart )
      }else{
        return( NULL )
      }
    }
  })
  return( do.call( rbind, ret ) )
}

Try the h5vc package in your browser

Any scripts or data that you put into this service are public.

h5vc documentation built on Nov. 8, 2020, 4:56 p.m.