R/call.variants.single.R

Defines functions callVariantsSingle

Documented in callVariantsSingle

callVariantsSingle <- function( data, sampledata, samples = sampledata$Sample, errorRate = 0.001, minSupport = 2, minAF = 0.05, minStrandSupport = 1, mergeDels = TRUE, aggregator = mean){
  blocksize <- data$h5dapplyInfo$Blockend - data$h5dapplyInfo$Blockstart + 1
  samples <- which( sampledata$Sample %in% samples )
  names(samples) <- sampledata$Sample[samples]
  callDels <- "Deletions" %in% names(data)
  the_bases <- c( "A", "C", "G", "T", "-", "N" )
  ret = lapply(seq(along=samples), function(i) {
    sampleColumn <- sampledata$Column[samples[i]]
    covs <- apply( data$Coverages[-sampleColumn,,], 3, sum )
    snpBackground <- apply(data$Counts[,-sampleColumn,,], 4, sum) / covs
    if(callDels){
      dels <- apply(data$Deletions[-sampleColumn,,], 3, sum)
      delBackground <- dels / (covs + dels)
    }
    sampleCounts <- data$Counts[1:4,sampleColumn,,] + data$Counts[5:8,sampleColumn,,] + data$Counts[9:12,sampleColumn,,]
    sampleDeletions <- data$Deletions[sampleColumn,,]
    rownames(sampleCounts) <- h5vc::bases
    sampleCoverage <- data$Coverages[sampleColumn,,]
    
    template <- data.frame( Sample = names(samples)[i], Chrom = strsplit(data$h5dapplyInfo$Group, split="/")[[1]][3], Start = seq(length = blocksize), End = seq(length = blocksize) )
    if(callDels){
      alleles = 1:5
    }else{
      alleles = 1:4
    }
    pvals <- lapply( alleles, function( altAllele ){
      tmp <- template
      theRefs <- data$Reference+1
      theRefs[theRefs == 0] = 6
      tmp$refAllele <- the_bases[theRefs]
      tmp$altAllele <- the_bases[altAllele]
      
      if( altAllele == 5 ){
        tmp$SupFwd <- sampleDeletions[1,]
        tmp$SupRev <- sampleDeletions[2,]
        tmp$CovFwd <- sampleCoverage[1,] + tmp$SupFwd #Deleted Bases are Coverage too
        tmp$CovRev <- sampleCoverage[2,] + tmp$SupRev
      }else{
        tmp$SupFwd <- sampleCounts[altAllele,1,]
        tmp$SupRev <- sampleCounts[altAllele,2,]
        tmp$CovFwd <- sampleCoverage[1,]
        tmp$CovRev <- sampleCoverage[2,]
      }
      tmp$AF_Fwd <- tmp$SupFwd / tmp$CovFwd
      tmp$AF_Rev <- tmp$SupRev / tmp$CovRev
      tmp$Coverage <- tmp$CovFwd + tmp$CovRev
      tmp$Support <- tmp$SupFwd + tmp$SupRev
      tmp$AF <- tmp$Support / tmp$Coverage
      # Background rate
      if( altAllele == 5 ){
        tmp$fBackground <- delBackground
      }else{
        tmp$fBackground <- snpBackground
      }
      tmp <- subset( tmp, Support >= minSupport & AF >= minAF & SupFwd >= minStrandSupport & SupRev >= minStrandSupport)
      if(nrow(tmp) == 0){
        return(NULL)
      }else{
        binom.test.safe <- function( x, n, p ){
          if(n > 0){
            return( binom.test( x, n, p )$p.value )
          }else{
            return( NaN )
          }
        }
        tmp$pErrorFwd <- sapply( seq( length = nrow(tmp) ), function(i) binom.test.safe( tmp$SupFwd[i], tmp$CovFwd[i], errorRate ))
        tmp$pErrorRev <- sapply( seq( length = nrow(tmp) ), function(i) binom.test.safe( tmp$SupRev[i], tmp$CovRev[i], errorRate ))
        tmp$pError <- sapply( seq( length = nrow(tmp) ), function(i) binom.test.safe( tmp$Support[i], tmp$Coverage[i], errorRate ))
        # Strand bias test
        tmp$pStrand <- sapply( seq( length = nrow(tmp) ), function(i) fisher.test( matrix( c( tmp$SupFwd[i], tmp$CovFwd[i], tmp$SupRev[i], tmp$CovRev[i]), nrow = 2 ) )$p.value )
        if( callDels & mergeDels & altAllele == 5){
          tmp$BlockID <- .blockify(tmp$Start)
          blockCounts <- table(tmp$BlockID)
          singleValueBlocks <- names(blockCounts)[blockCounts == 1]
          tmptmp <- subset(tmp, !(BlockID %in% singleValueBlocks) )
          tmp <- subset(tmp, (BlockID %in% singleValueBlocks) )
          tmptmp <- lapply( split( x = tmptmp, f = tmptmp$BlockID ), function(df){
            tmp_df <- data.frame( Sample = df$Sample[1], Chrom = df$Chrom[1], Start = min(df$Start), End = max(df$End), refAllele = paste(df$refAllele, collapse=""), altAllele = "-" )
            for( slot in colnames(df) ){
              if(!(slot %in% colnames(tmp_df))){
                tmp_df[[slot]] <- aggregator( df[[slot]] )
              }
            }
            return(tmp_df)
          })
          tmp <- rbind( tmp, do.call(rbind, tmptmp) )
          tmp$BlockID <- NULL
        }
        return( tmp )
      }
    } )
    
    pvals <- do.call( rbind, pvals )
    if( is.null(pvals) ){
      return(NULL)
    }
    if( nrow(pvals) > 0 ){
      pvals$Start <- pvals$Start + data$h5dapplyInfo$Blockstart - 1
      pvals$End <- pvals$End + data$h5dapplyInfo$Blockstart - 1
      return( pvals )  
    }else{
      return( NULL ) 
    }
  })
  return( do.call( rbind, ret ) )
}

Try the h5vc package in your browser

Any scripts or data that you put into this service are public.

h5vc documentation built on Nov. 8, 2020, 4:56 p.m.