R/getVariantsByIndividual.R

Defines functions deleteDebugFiles newBamToVariants bamToVariants getNormalVariants inGene shareVariants QCsnp mergeVariantList QCsnps solveCigars readsToPileup getQuality importQualityScores matchToExac matchTodbSNPs SNP2GRanges chrToX vcfToPositions plotFrequencyDiagnostics getVariantsByIndividual

Documented in chrToX getVariantsByIndividual

#' Quality controls the variants in the vcfs, using the bams.
#'
#' @import BiocGenerics
#'
#' @importFrom BiocGenerics strand start start<- end end<-
#' @importFrom GenomicRanges findOverlaps
#' @importFrom IRanges %within%
#' @importFrom S4Vectors subjectHits queryHits
#' @importFrom biomaRt useMart getBM
#' @importFrom Rsamtools scanBamHeader ScanBamParam scanBamFlag scanBam
#' @importFrom R.oo charToInt
getVariantsByIndividual = function(metaData, captureRegions, fasta, genome, BQoffset, dbDir, Rdirectory, plotDirectory, cpus, exacPopulation='all', forceRedo=F, filterOffTarget=T) {
  catLog('Using variants by individual.\n')
  saveFile = paste0(Rdirectory, '/variantsBI.Rdata')
  if ( file.exists(saveFile) & !forceRedo ) {
    catLog('Loading saved variants by individual.\n')
    load(file=saveFile)
    return(variantsBI)
  } 
  
  variantsBI = list()

  #iterate over individuals
  for ( individual in unique(metaData$INDIVIDUAL) ) {
    catLog('\nVariants for', individual, '\n')
    samples = metaData$NAME[metaData$INDIVIDUAL == individual]
    positionsSaveFile = paste0(Rdirectory, '/positions.', individual,'.Rdata')
    variantsSaveFile = paste0(Rdirectory, '/variants.', individual,'.Rdata')
    if ( file.exists(variantsSaveFile) & !forceRedo ) {
      catLog('Loading saved variants for', individual,'\n')
      load(file=variantsSaveFile)
      variantsBI = c(variantsBI, variants)
      next
    } 
    if ( file.exists(positionsSaveFile) & !forceRedo ) {
      catLog('Loading saved positions for', individual,'\n')
      load(file=positionsSaveFile)
    }
    else {
      #extract positions in any vcf of any sample from this individual
      vcfFiles = metaData[samples,]$VCF
      positions = vcfToPositions(vcfFiles, genome=genome)

      #keep only positions within 300bp of a capture region
      paddedCaptureRegions = captureRegions
      start(paddedCaptureRegions) = start(captureRegions) - 300
      end(paddedCaptureRegions) = end(captureRegions) + 300
      positions = inGene(positions, paddedCaptureRegions, genome=genome)
      inCapture = SNP2GRanges(positions, genome=genome) %within% paddedCaptureRegions
      if ( filterOffTarget ) {
        catLog('Keeping ', sum(inCapture), ' out of ', length(inCapture),
               ' (', round(sum(inCapture)/length(inCapture), 3)*100,
               '%) SNVs that are inside capture regions.\n', sep='')
        positions = positions[inCapture,]
      }
      else {
        catLog(sum(inCapture), ' out of ', length(inCapture),
               ' (', round(sum(inCapture)/length(inCapture), 3)*100,
               '%) SNVs are inside capture regions. Not filtering as filterOffTarget is set to FALSE.\n', sep='')

      }
      catLog('saving positions...')
      save(positions, file=positionsSaveFile)
      catLog('done.\n')
    }

    #extract variant information over the identified positions
    variants = lapply(samples, function(sampleName) {
    	qSaveFile = paste0(Rdirectory, '/q_', sampleName, '.Rdata')
		if ( file.exists(qSaveFile) & !forceRedo ) {
		  catLog('Loading saved variants for ', sampleName, '.\n')
		  load(file=qSaveFile)
		  return(q)
		}
    	q = newBamToVariants(metaData[sampleName,]$BAM, positions,
    	                     fasta, Rdirectory, BQoffset, genome=genome, cpus=cpus)[[1]]
		save(q, file=qSaveFile)
    	return(q)
	})
	names(variants) = samples
    #bamFiles = metaData[samples,]$BAM
    #variants = newBamToVariants(bamFiles, positions, fasta, Rdirectory, BQoffset, genome=genome, cpus=cpus)
    
    
    names(variants) = samples
    variants = lapply(variants, function(q) q[!apply(is.na(q), 1, any),])
    variants = shareVariants(variants)
    catLog('saving variants to ', variantsSaveFile, '...')
    save(variants, file=variantsSaveFile)
    catLog('done.\n')

    variantsBI = c(variantsBI, variants)
  }

  #check db SNP for called variants.
  variantsBI = lapply(variantsBI, function(q) q[!is.na(q$x),])
  variantsBI = matchTodbSNPs(variantsBI, dir=dbDir, genome=genome, cpus=cpus)
  variantsBI = matchToExac(variantsBI, dir=dbDir, genome=genome, exacPopulation=exacPopulation, cpus=cpus)
  variantsBI = lapply(variantsBI, function(q) q[order(q$x, q$variant),])
    

  #generate SNPs, mainly for backwards compatibility
  q = do.call(rbind, variantsBI)
  q = q[!duplicated(q$x),]
  q = q[order(q$x),]
  inGene = xToGene(q$x, captureRegions=captureRegions, genome=genome)
  names(inGene) = q$x
  SNPs = data.frame(x=q$x, chr=xToChr(q$x, genome), start=xToPos(q$x, genome), end=xToPos(q$x, genome),
                    inGene=inGene, reference=q$reference, variant=q$variant, db=q$db)
  variantsBI = lapply(variantsBI, function(q) {
    q$inGene = inGene[as.character(q$x)]
    return(q)
    })

  variantsBI = list(variants=variantsBI, SNPs=SNPs)

  catLog('Saving variants..')
  save(variantsBI, file=saveFile)
  catLog('done.\n')

  plotFrequencyDiagnostics(variantsBI, plotDirectory)
    
  return(variantsBI)
}

plotFrequencyDiagnostics = function(variants, plotDirectory) {
  diagnosticPlotsDirectory = paste0(plotDirectory, '/diagnostics')
  if ( !file.exists(diagnosticPlotsDirectory) ) dir.create(diagnosticPlotsDirectory)
  FreqDirectory = paste0(diagnosticPlotsDirectory, '/frequencyDistribution/')
  catLog('Plotting frequency distributions to ', FreqDirectory,'..', sep='')
  if ( !file.exists(FreqDirectory) ) dir.create(FreqDirectory)
  for ( sample in names(variants$variants) ) {
    catLog(sample, '..', sep='')
    use = variants$variants[[sample]]$cov > 0 & !is.na(variants$variants[[sample]]$cov)
    if ( is.na(sum(use)) ) {
      warning('Found NA coverage in', sample, '...')
      use[is.na(use)] = F
    }
    if ( sum(use) > 0 ) {
      png(paste0(FreqDirectory, sample, '-varcov.png'), height=1000, width=2000, res=144)
      cleanUse = use & variants$variants[[sample]]$flag %in% c('', 'Svr')
      if ( any(cleanUse) ) {
        cov = noneg(variants$variants[[sample]]$cov[cleanUse] + pmax(-0.4, pmin(0.4, rnorm(sum(cleanUse), 0, 0.2))))
        var = pmin(cov, noneg(variants$variants[[sample]]$var[cleanUse] + pmax(-0.4, pmin(0.4, rnorm(sum(cleanUse), 0, 0.2)))))
        plotColourScatter((var/cov), cov, log='y', xlab='f', ylab='coverage', main=paste0(sample, ', clean variants'))
      }
      else {
        cov = noneg(variants$variants[[sample]]$cov[use] + pmax(-0.4, pmin(0.4, rnorm(sum(use), 0, 0.2))))
        var = pmin(cov, noneg(variants$variants[[sample]]$var[use] + pmax(-0.4, pmin(0.4, rnorm(sum(use), 0, 0.2)))))
        plotColourScatter((var/cov), cov, log='y', xlab='f', ylab='coverage', main=sample)
      }
      dev.off()
    }
    
    use = variants$variants[[sample]]$var > 0
    if ( any(use) ) {
      pdf(paste0(FreqDirectory, sample, '-hist.pdf'), height=7, width=14)
      hist((variants$variants[[sample]]$var/variants$variants[[sample]]$cov)[use],
           breaks=(0:100)/100, col=mcri('blue'), main='all Variants',
           xlab='variant frequency', ylab='number of variants')
      cleanUse = use & variants$variants[[sample]]$flag == ''
      if ( any(cleanUse) ) {
        hist((variants$variants[[sample]]$var/variants$variants[[sample]]$cov)[cleanUse],
             breaks=(0:100)/100, col=mcri('blue'), main='clean Variants',
             xlab='variant frequency', ylab='number of variants')
      }
      cleanDbUse = cleanUse & variants$variants[[sample]]$db
      if ( any(cleanDbUse) ) {
        hist((variants$variants[[sample]]$var/variants$variants[[sample]]$cov)[cleanDbUse],
             breaks=(0:100)/100, col=mcri('blue'), main='clean dbSNP Variants',
             xlab='variant frequency', ylab='number of variants')
      }
      dev.off()
    }
  }
  catLog('done.\n')
}



#helper function that imports the variants from a vcf file.
#takes a vector of vcf files as input, and outputs a data frame with
#chromosome, start, end, x (genomic coordinate), reference, variant and gene
#for each position. Duplicated positions are removed, and positions that fails to
#align with an ensembl gene is marked with '?'.
vcfToPositions = function(files, genome='hg19') {
  files = unique(files)
  #if more than one file, call each file separately and rbind the outputs.
  if ( length(files) > 1 ) {
    catLog('Found', length(files), 'files.', '\n')
    ret = do.call(rbind, lapply(files, function(file) vcfToPositions(file, genome=genome)))
    ret = ret[!duplicated(ret$x),]
    ret = ret[order(ret$x),]
    return(ret)
  }

  catLog('Reading file ', files, '...', sep='')
  raw = try(read.table(files, fill=T, skip=0, row.names=NULL, header=F, as.is=T))
  #empty VCF files throws an error in read.table.
  #catch and demote to a warning for that case, as there are legitimate reasons for empty VCF files as input.
  if ( class(raw) == 'try-error' ) {
    if ( grepl('no lines available in input', raw) ) {
      catLog('WARNING: Empty input VCF file ', files, '. Unless you know why this is, you probably want to check that input file, or you risk miss variants present in this sample.', sep='')
      warning('Empty input VCF file ', files, '. Unless you know why this is, you probably want to check that input file, or you risk miss variants present in this sample.')
      ret = data.frame(
        chr = character(0),
        start = numeric(0),
        end = numeric(0),
        x = numeric(0),
        reference = character(0),
        variant = character(0), stringsAsFactors=F)
      return(ret)
    }
    else stop(raw)
  }
  raw = raw[!grepl('^#', raw$V1),]
  catLog('done.\nProcessing data...')
  if ( nrow(raw) == 0 ) return(matrix(1, nrow=0, ncol=22))
  chrs = gsub('MT', 'M', gsub('chr', '', as.character(raw$V1)))
  use = chrs %in% names(chrLengths(genome))
  raw = raw[use,]
  chrs = chrs[use]
  thirdColIsReference = sum(substring(raw[,3], 1, 1) %in% c('A', 'C', 'T', 'G')) >= nrow(raw)/2
  refCol = 4
  if ( thirdColIsReference ) refCol = 3
  varCol = 5
  if ( thirdColIsReference ) varCol = 4
  variant = sapply(strsplit(raw[,varCol], split=','), function(parts) c(parts, '')[1])
  ret = data.frame(
    chr = chrs,
    start = as.numeric(as.character(raw[,2])),
    end = as.numeric(as.character(raw[,2])),
    x = chrToX(chrs, as.numeric(as.character(raw[,2])), genome=genome),
    reference = toupper(substr(as.character(raw[,refCol]), 1, 1)),
    variant = toupper(variant), stringsAsFactors=F)

  if ( any(ret$variant == ret$reference) )
    ret[ret$variant == ret$reference,]$variant = '-1'
  
  ret = ret[!duplicated(ret$x),]
  ret = ret[order(ret$x),]
  
  rownames(ret) = ret$x
  catLog('done.\n')
  
  catLog('Returning data frame of dimension', dim(ret), '\n')
  return(ret)
}

#' convert chromsome and position to the single genomic coordinate x
#'
#' @param chr character: The chromosome. Both 'chr1' and '1' works. 'Chr1' does not.
#' @param bp integer: the position on the chromosome.
#' @param genome character: the genome, such as 'hg19', 'hg38' or 'mm10'. Default 'hg19'.
#'
#' @details convert chromsome and position to the single genomic coordinate x.
#'
#' @export
#'
#' @examples
#' xToPos(1e9)
#' xToPos(1e9, genome='mm10')
chrToX = function(chr, bp, genome='hg19') {
  prevChrL = c(0, cumsum(chrLengths(genome)))
  names(prevChrL) = c(names(chrLengths(genome)), 'outside')
  return(prevChrL[gsub('chr', '', chr)] + bp)
}

#Helper function that takes a SNP data frame and returns an GRanges object for the SNPs.
SNP2GRanges = function(SNPs, genome=genome) {
  ir = IRanges(start = SNPs$start, end = SNPs$end, names = rownames(SNPs), width = rep(1, nrow(SNPs)))
  return(GRanges(seqnames = SNPs$chr, ranges = ir, seqlengths = chrLengths(genome)))
}

#hepler function that marks the variants in a SNPs object as db or non db SNPs.
matchTodbSNPs = function(variants, dir, genome='hg19', cpus=1) {

  if ( !all(sapply(variants, nrow) == nrow(variants[[1]])) ) {
  	catLog('\nWARNING: variants does not have same number of rows in all elements in matchTodbSNPs. This is not intended and is due to an upstream error. Likely to crash with unrelated hard-to-interpret error.\n\n')
  	warning('variants does not have same number of rows in all elements in matchTodbSNPs. This is not intended and is due to an upstream error. Likely to crash with unrelated hard-to-interpret error.')
  }

  variants = lapply(variants, function(q) {
    q$db = rep(F, nrow(q))
    q$dbMAF = rep(NA, nrow(q))
    q$dbValidated = rep(NA, nrow(q))
    return(q)
  })

  if ( genome == 'hg38' ) dir = paste0(dir, '/hg38')
  if ( genome == 'mm10' ) dir = paste0(dir, '/mm10')

  RsaveFile = paste0(dir,'/dbAFnew.Rdata')
  if ( !file.exists(RsaveFile) ) stop('dbSNP file not found at expected path ', RsaveFile)
  catLog('Importing dbSNP allele frequencies from ', RsaveFile, '...', sep='')
  load(RsaveFile)
  catLog('done. Match against variants...')
  qNames = unique(do.call(c, lapply(variants, rownames)))
  dbNames = names(dbAF)
  relevantDB = dbNames %in% qNames
  dbAF = dbAF[relevantDB]
  dbNames = names(dbAF)
  isDB = qNames %in% dbNames
  dbAF = dbAF[qNames[isDB]]
  if ( any(is.na(dbAF)) ) {
    dbValidated = !is.na(dbAF) & dbAF > 0
  }
  else {
    dbValidated = dbAF < 3 | dbAF > 70
    dbAF = ifelse(dbAF > 30, NA, ifelse(dbAF > 3, dbAF-10, dbAF))
  }
  
  variants = lapply(variants, function(q) {
    q$db = isDB
    q$dbMAF = NA
    q$dbValidated = NA
    q$dbMAF[isDB] = dbAF
    q$dbValidated[isDB] = dbValidated
    return(q)
  })
  catLog('done.\n')
  
  return(variants)
}


#hepler function that marks the variants in a SNPs object as db or non db SNPs.
matchToExac = function(variants, dir, genome='hg19', exacPopulation='all', cpus=1) {
  if ( !(genome %in% c('hg19', 'hg38')) ) return(variants)
  if ( genome == 'hg38' ) dir = paste0(dir, '/hg38')

  variants = lapply(variants, function(q) {
    q$exac = rep(F, nrow(q))
    q$exacAF = rep(NA, nrow(q))
    q$exacQual = rep(NA, nrow(q))
    q$exacFilter = rep(NA, nrow(q))
    return(q)
  })

  RsaveFile = paste0(dir,'/exac.Rdata')
  if ( !file.exists(RsaveFile) ) stop('exac file not found at expected path ', RsaveFile)
  catLog('Importing exac allele frequencies from ', RsaveFile, '...', sep='')
  load(RsaveFile)
  catLog('done. Match against variants...')
  qNames = unique(do.call(c, lapply(variants, rownames)))
  exacNames = rownames(exac)
  relevantEXAC = exacNames %in% qNames
  exac = exac[relevantEXAC,]
  exacNames = rownames(exac)

  #detect ethnicity and include specific population frequency as well.
  #what about privacy here? Should this be in the log file? Done quietly, not displayed?
  #shouldnt be done at all? Leaving out for now.
  #using highest AF of global AF and population AF seems suitably conservative
  populations = c('AFR', 'AMR', 'EAS', 'FIN', 'NFE', 'OTH', 'SAS')
  if ( FALSE & all(populations %in% names(exac)) ) {
      vars = unique(unlist(lapply(variants, function(q) rownames(q)[q$flag == '' & q$var > 0.25*q$cov])))
      vars = vars[vars %in% rownames(exac)]
      subexac = exac[rownames(exac) %in% vars,][vars,]
      popSums = sapply(populations, function(pop) sum(subexac[[pop]]))
      thisPop = populations[which(popSums == max(popSums))[1]]
  }

  popCol = 'alleleFrequency'
  if ( exacPopulation %in% populations ) popCol = exacPopulation
  else if ( exacPopulation != 'all' ) warning('Do not know about supplied exacPopulation ', exacPopulation, '. Defaulting to \'all\'. Other available choices are ', paste0('\'', populations, '\', '))

  if ( !(popCol %in% names(exac)) ) stop('The exac population \'', popCol, '\' wasnt found in the exac data. If you expected it to be present, you might have an outdated exac data file. If so, try deleting or renaming ', RsaveFile, ' and rerun: the latest version will be downloaded.')

  variants = lapply(variants, function(q) {
    q$exac = rownames(q) %in% exacNames
    q$exacAF[q$exac] = exac[rownames(q)[q$exac],][[popCol]]
    q$exacQual[q$exac] = exac[rownames(q)[q$exac],]$qual
    q$exacFilter[q$exac] = exac[rownames(q)[q$exac],]$filter
    return(q)
  })

  
  return(variants)
  
}




#helper functions that counts reads in favour of reference and any present variant
#at the given locations for the given bam files.
importQualityScores = function(positions, files, BQoffset, genome='hg19', cpus=1) {
  ret=list()
  chr = as.character(positions$chr)
  for ( file in files ) {
    catLog(as.character(Sys.time()), '\n')
    catLog('Importing', length(chr), 'pileups by chr from', file, '\n')
    if ( cpus > 1 ) {
      listRet = mclapply(unique(chr), function(ch) {
        use = chr == ch
        return(getQuality(file, chr[use], positions$start[use], BQoffset, cpus=1))
      }, mc.cores=cpus, mc.preschedule=F)
    }
    else {
      listRet = lapply(unique(chr), function(ch) {
        use = chr == ch
        return(getQuality(file, chr[use], positions$start[use], BQoffset, cpus=1))
      })
    }
    catLog('done!\n')
    ret[[file]] = do.call(c, listRet)
  }
  return(ret)
}
getQuality = function(file, chr, pos, BQoffset, cpus=1) {
  catLog(chr[1], '.. ', sep='')
  if ( any(grepl('^chr1', names(scanBamHeader(file)[[1]]$targets))) )
    chr = paste0('chr', chr)
  which = GRanges(chr, IRanges(pos-3, pos+3))
  chr = gsub('chr', '', chr)
  p1 = ScanBamParam(which=which, what=c('pos', 'seq', 'cigar', 'qual', 'mapq', 'strand', 'qname'),
    flag=scanBamFlag(isSecondaryAlignment=FALSE))
  index = paste0(file, '.bai')
  if ( !file.exists(index) ) {
    index = gsub('.bam$', '.bai', file)
    if ( !file.exists(index) ) {
      catLog('Could not find an index file for', file, '\nI want either', paste0(file, '.bai'), 'or', index)
      stop(paste('Could not find an index file for', file, '\nI want either', paste0(file, '.bai'), 'or', index))
    }
  }
  regionReads = scanBam(file, index=index, param=p1)
  regionReads = lapply(regionReads, function(reads) {
    keep = !is.na(reads$mapq) & !duplicated(reads$qname)
    if ( length(keep) == 0 ) return(reads)
    if ( sum(!keep) > 0 ) {
      reads$pos = reads$pos[keep]
      reads$seq = reads$seq[keep]
      reads$cigar = reads$cigar[keep]
      reads$qual = reads$qual[keep]
      reads$mapq = reads$mapq[keep]
      reads$strand = reads$strand[keep]
    }
    return(reads)
  })
  if ( cpus > 1 )
    pileups = mclapply(1:length(pos), function(i) {
      return(readsToPileup(regionReads[[i]], pos[i], BQoffset=BQoffset))}, mc.cores=cpus)
  else
    pileups = lapply(1:length(pos), function(i) readsToPileup(regionReads[[i]], pos[i], BQoffset=BQoffset))
  catLog('(done ', chr[1], ') ', sep='')
  return(pileups)
}
readsToPileup = function(reads, pos, BQoffset) {
  cigars = reads$cigar
  seqI = unlist(lapply(reads$pos, function(readPos) pos-readPos+1))
  call = rep(NA, length(cigars))
  qual = rep(NA, length(cigars))
  mapq = reads$mapq
  strand = reads$strand
  easy = grepl('^[0-9]*M$', cigars)
  call[easy] = substring(reads$seq[easy], seqI[easy], seqI[easy])
  qual[easy] = charToInt(substring(reads$qual[easy], seqI[easy], seqI[easy]))-BQoffset
  if ( length(cigars) > 0 ) {
    NAcigar = is.na(cigars)
    easy[NAcigar] = T
    call[NAcigar] = ''
    qual[NAcigar] = 0
    mapq[NAcigar] = 0
    strand[NAcigar] = '+'
  }
  if ( any(!easy) ) {
    messyCalls = solveCigars(cigars[!easy], reads$seq[!easy], reads$qual[!easy], seqI[!easy])
    seqI[!easy] = as.numeric(sapply(messyCalls, function(mC) mC[3]))

    #check for short softclipped calls, check if most other well-aligned reads agree, if so, keep, otherwise discard.
    if ( any(grepl('[a-z]', unlist(lapply(messyCalls, function(call) call[1])))) ) {
      softClipped = grep('[a-z]', unlist(lapply(messyCalls, function(call) call[1])))
      keepSoftClipped = rep(F, length(softClipped))
      if ( sum(easy) > 2 ) {
        consensus = substring(reads$seq[easy], seqI[easy]-2, seqI[easy]+2)
        consensus = consensus[nchar(consensus) == 5]
        if ( length(consensus) > 2 ) {
          before = names(sort(table(substring(consensus, 1, 2)),decreasing=T)[1])
          after = names(sort(table(substring(consensus, 4, 5)),decreasing=T)[1])
          if ( nchar(before) == 2 & nchar(after) == 2 ) {
            seqis = as.numeric(unlist(lapply(messyCalls, function(call) call[3])))[softClipped]
            keepSoftClipped =
              substring(reads$seq[!easy][softClipped], seqis-2,seqis-1) %in% c(before, gsub('^.', '^', before), '') &
            substring(reads$seq[!easy][softClipped], seqis+1, seqis+2) %in% c(after, gsub('.$', '$', before), '')
            if ( sum(keepSoftClipped) > 0 ) {
              messyCalls[softClipped][keepSoftClipped] = lapply(messyCalls[softClipped][keepSoftClipped], function(call) {
                call[1] = toupper(call[1])
                return(call)
              })
            }
          }
        }
      }
      if ( sum(!keepSoftClipped) > 0 ) {
        messyCalls[softClipped][!keepSoftClipped] = lapply(messyCalls[softClipped][!keepSoftClipped], function(call) {
          call = c('', '', 0)
          return(call)
        })
      }
    }
    
    call[!easy] = unlist(lapply(messyCalls, function(solution) solution[1]))
    qual[!easy] = unlist(lapply(messyCalls, function(solution) {
      if ( solution[2] == '' ) return(0)
      else return(mean(charToInt(unlist(strsplit(solution[2],split=NULL)))))
      })) - BQoffset
  }

  #look for stuttering in case of deletions/insertions.
  #this is removed for now, due to not really being needed and cpu intensive.
  note = rep('', length(call))
  #if ( length(easy) > 2 ) {
  #  stutterLength = 20
  #  consensus = substring(reads$seq, seqI-stutterLength, seqI+stutterLength)
  #  consensus = consensus[nchar(consensus) == 2*stutterLength+1]
  #  if ( length(consensus) > 2 ) {
  #    before = names(sort(table(substring(consensus, 1, stutterLength)),decreasing=T)[1])
  #    after = names(sort(table(substring(consensus, stutterLength+2, 2*stutterLength+1)),decreasing=T)[1])
  #    for ( repCall in c('A', 'T', 'C', 'G') ) {
  #      if ( do.call(paste0, as.list(rep(repCall, stutterLength))) %in% c(before, after) ) {
  #        note[call == repCall | grepl('[+-]', call)] = 'stutter'
  #      }
  #    }
  #  }
  #}

  readLengths = sapply(reads$seq, length)

  #look for palindromic regions.
  hasNeighourhood = !easy & seqI > 10 & readLengths > seqI + 10
  before = substring(reads$seq[hasNeighourhood], seqI[hasNeighourhood]-11, seqI[hasNeighourhood]-1)
  after = substring(reads$seq[hasNeighourhood], seqI[hasNeighourhood]+1, seqI[hasNeighourhood]+11)
  after = gsub('C', 'g', after)
  after = gsub('G', 'c', after)
  after = gsub('T', 'a', after)
  after = gsub('A', 't', after)
  compAfter = toupper(after)
  revCompAfter = sapply(lapply(strsplit(compAfter, NULL), rev), paste, collapse="")
  isPalindromic = revCompAfter == before
  note[hasNeighourhood][isPalindromic] = paste0(note[hasNeighourhood][isPalindromic], 'palindrome')

  ret = data.frame('call'=call, 'qual'=qual, 'mapq'=mapq, 'strand'=strand, 'note'=note)
  if ( any(is.na(as.matrix(ret))) ) {
    catLog('NAs in pileup. Data frame was:\n')
    for ( row in 1:nrow(ret) ) catLog(as.matrix(ret[row,]), '\n')
    catLog('cigars are\n')
    for ( row in 1:nrow(ret) ) catLog(reads$cigar[row], as.character(reads$seq[row]), as.character(reads$qual[row]), '\n')
    catLog('NA pos', pos, '\n')
    catLog('seqI', seqI, '\n')
    stop('NAs in pileup.')
  }
  return(ret[ret$call != '' & !is.na(ret$qual) & !is.na(ret$mapq) & !is.na(ret$strand) & !is.na(ret$call),])
}
solveCigars = function(cigars, seqs, quals, seqIs) {
  cigars = gsub('[X=]', 'M', cigars)
  type = strsplit(gsub('^[0-9]+', '', cigars), split='[0-9]+')
  lengths = strsplit(gsub('[MIDNSHP]+$', '', cigars), split='[MIDNSHP]')

  calls = rep('', length(cigars))
  qs = rep('', length(cigars))
  checked = rep(1, length(cigars))
  unSolved = rep(T, length(cigars))
  for ( i in 1:max(unlist(lapply(type, length))) ) {
    che = checked[unSolved]
    sI = seqIs[unSolved]
    tyIp1 = sapply(type[unSolved], function(t) if ( length(t) < i+1 ) '' else t[i+1])
    tyI = sapply(type[unSolved], function(t) if ( length(t) < i ) '' else t[i])
    se = seqs[unSolved]
    le = lengths[unSolved]
    leI = as.numeric(sapply(lengths[unSolved], function(l) l[i]))
    qu = quals[unSolved]
    
    eOR = tyI == ''  #read ends before position
    if ( any(eOR) ) {
      calls[which(unSolved)[eOR]] = ''
      qs[which(unSolved)[eOR]] = ''
      seqIs[which(unSolved)[eOR]] = 0
    }

    iAP = che == sI+1 & tyI == 'I' #insert at position
    if ( any(iAP) ) {
      calls[which(unSolved)[iAP]] = paste0('+', substring(se[iAP], sI[iAP]+1, sI[iAP]+leI[iAP]))
      qs[which(unSolved)[iAP]] = substring(qu[iAP], sI[iAP]+1, sI[iAP]+leI[iAP])
      seqIs[which(unSolved)[iAP]] = sI[iAP]
    }    

    dAP = che == sI+1 & (tyI == 'D' | tyI == 'N') #deletion at position
    if ( any(dAP) ) {
      calls[which(unSolved)[dAP]] = paste0('-', leI[dAP])
      qs[which(unSolved)[dAP]] = substring(qu[dAP], sI[dAP], sI[dAP])
      seqIs[which(unSolved)[dAP]] = sI[dAP]
    }

    dSP = che > sI & !iAP & !dAP & tyI != 'S'  #deletion skipped over position
    if ( any(dSP) ) {
      calls[which(unSolved)[dSP]] = ''
      qs[which(unSolved)[dSP]] = ''      
      seqIs[which(unSolved)[dSP]] = 0      
    }


    nAP = che + leI - 1 >= sI & che <= sI & tyI == 'M' #normal read until position
    fID = che + leI - 1 == sI & (tyIp1 == 'I' | tyIp1 == 'D' | tyIp1 == 'N')    #following indel
    if ( any(nAP & fID) ) {
      checked[which(unSolved)[nAP & fID]] = checked[which(unSolved)[nAP & fID]] + leI[nAP & fID]
    }
    if ( any(nAP & !fID) ) {
      calls[which(unSolved)[nAP & !fID]] = substring(se[nAP & !fID], sI[nAP & !fID], sI[nAP & !fID])
      qs[which(unSolved)[nAP & !fID]] = substring(qu[nAP & !fID], sI[nAP & !fID], sI[nAP & !fID])
      seqIs[which(unSolved)[nAP & !fID]] = sI[nAP & !fID]
    }
    
    sAPs =  che > sI & tyI == 'S' #soft clip over position at start of read
    lt4 = leI < 4 #3 or fewer bp are soft clipped
    if ( any(sAPs & lt4) ) {
      calls[which(unSolved)[sAPs & lt4]] = tolower(substring(se[sAPs & lt4], (sI+leI)[sAPs & lt4], (sI+leI)[sAPs & lt4]))
      qs[which(unSolved)[sAPs & lt4]] = substring(qu[sAPs & lt4], (sI+leI)[sAPs & lt4], (sI+leI)[sAPs & lt4])
      seqIs[which(unSolved)[sAPs & lt4]] = (sI+leI)[sAPs & lt4]
    }
    if ( any(sAPs & !lt4) ) {
      calls[which(unSolved)[sAPs & !lt4]] = ''
      qs[which(unSolved)[sAPs & !lt4]] = ''
      seqIs[which(unSolved)[sAPs & !lt4]] = 0
    }

    sAPe = che <= sI & che +leI > sI & tyI == 'S' & tyIp1 == '' #soft clip over position at end of read
    lt4 = leI < 4 #3 or fewer bp are soft clipped
    if ( any(sAPe & lt4) ) {
      calls[which(unSolved)[sAPe & lt4]] = tolower(substring(se[sAPe & lt4], sI[sAPe & lt4], sI[sAPe & lt4]))
      qs[which(unSolved)[sAPe & lt4]] = substring(qu[sAPe & lt4], sI[sAPe & lt4], sI[sAPe & lt4])
      seqIs[which(unSolved)[sAPe & lt4]] = sI[sAPe & lt4]
    }
    if ( any(sAPe & !lt4) ) {
      calls[which(unSolved)[sAPe & !lt4]] = ''
      qs[which(unSolved)[sAPe & !lt4]] = ''
      seqIs[which(unSolved)[sAPe & !lt4]] = 0
    }
    
    nBP = !nAP & tyI == 'M'    #normal ending before position
    if ( any(nBP) ) {
      checked[which(unSolved)[nBP]] = checked[which(unSolved)[nBP]] + leI[nBP]
    }

    iBP = !iAP & !sAPs & !sAPe & (tyI == 'I' | tyI == 'S')   #insertion before positions
    if ( any(iBP) ) {
      checked[which(unSolved)[iBP]] = checked[which(unSolved)[iBP]] + leI[iBP]
      seqIs[which(unSolved)[iBP]] = seqIs[which(unSolved)[iBP]] + leI[iBP]
    }

    dBP = !dAP & (tyI == 'D' | tyI == 'N')   #deletion before position
    if ( any(dBP) ) {
      seqIs[which(unSolved)[dBP]] = seqIs[which(unSolved)[dBP]] - leI[dBP]
    }

    unSolved[eOR | iAP | dAP | dSP | (nAP & !fID) | sAPs | sAPe] = F
  }

  solutions = lapply(1:length(cigars), function(i) c(calls[i], qs[i], seqIs[i]))
  return(solutions)
}
QCsnps = function(pileups, positions, cpus=10) {
  references = as.character(positions$reference)
  variants = as.character(positions$variant)
  xs = positions$x
  catLog('QCing', length(pileups), 'positons...')
  catLog(as.character(Sys.time()), '\n')
  ret = mclapply(1:length(pileups), function(i) {
    if ( i/10000 == round(i/10000) ) catLog(i/1000, 'k..', sep='')
    return(QCsnp(pileups[[i]], references[i], xs[i], variant='', defaultVariant=variants[i]))
  }, mc.cores=cpus)
  ret = mergeVariantList(ret)
  ret = ret[ret$reference != ret$variant,]
  rownames(ret) = paste0(ret$x, ret$variant)
  catLog('done.\n')
  catLog('Variants:', nrow(ret), '\n')
  catLog('Unflagged :', sum(ret$flag==''), '\n')
  catLog('Unflagged over 0%  freq :', sum(ret$var > 0 & ret$flag==''), '\n')
  catLog('Unflagged over 20% freq :', sum(ret$var > ret$cov/5 & ret$flag==''), '\n')
  catLog('Median coverage over unflagged variants:', median(ret$cov[ret$flag=='']), '\n')
  catLog('Repeat flags:', length(grep('Rep', ret$flag)), '\n')
  catLog('Mapping quality flags:', length(grep('Mq', ret$flag)), '\n')
  catLog('Base quality flags:', length(grep('Bq', ret$flag)), '\n')
  catLog('Strand bias flags:', length(grep('Sb', ret$flag)), '\n')
  catLog('Single variant read flags:', length(grep('Svr', ret$flag)), '\n')
  #catLog('Single reference read flags:', length(grep('Srr', ret$flag)), '\n')
  catLog('Minor variant flags:', length(grep('Mv', ret$flag)), '\n')
  catLog('Stutter flags:', length(grep('St', ret$flag)), '\n')
  return(ret)
}
mergeVariantList = function(variantList) {
  dat = unlist(variantList)
  nrow = length(variantList)
  ret = data.frame(
    'x'=as.numeric(dat[grep('^x',names(dat))]),
    'reference'=dat[grep('^reference',names(dat))],
    'variant'=dat[grep('^variant',names(dat))],
    'cov'=as.numeric(dat[grep('^cov',names(dat))]),
    'ref'=as.numeric(dat[grep('^ref[0-9]*$',names(dat))]),
    'var'=as.numeric(dat[grep('^var[0-9]*$',names(dat))]),
    'pbq'=as.numeric(dat[grep('^pbq',names(dat))]),
    'pmq'=as.numeric(dat[grep('^pmq',names(dat))]),
    'psr'=as.numeric(dat[grep('^psr',names(dat))]),
    'RIB'=as.numeric(dat[grep('^RIB',names(dat))]),
    'flag'=dat[grep('^flag',names(dat))], stringsAsFactors=F)
  return(ret)
}
QCsnp = function(pileup, reference, x, variant='', defaultVariant='') {
  if ( is.atomic(pileup) ) {
    return(data.frame('x'=x, 'reference'=reference, 'variant'='', 'cov'=0, 'ref'=0, 'var'=0,
                      'pbq'=1, 'pmq'=1, 'psr'=1, 'RIB'=0, 'flag'='', stringsAsFactors=F))
  }

  ref = pileup$call == reference
  if ( variant=='' ) {
    if ( !grepl('-[ATCG]', defaultVariant) )
      variant = unique(c(defaultVariant, as.character(pileup$call[!ref])))
    else
      variant = unique(as.character(pileup$call[!ref]))
    variant = variant[variant != '']
    if ( length(variant) > 1 ) {
      ret = do.call(rbind, lapply(variant, function(var) QCsnp(pileup, reference, x, var)))
      #flag non-zero variants that are not the strictly largest frequency
      minorVariants = ret$var > 0 & (ret$var < max(ret$var) | sum(ret$var == max(ret$var)) > 1)
      ret$flag[minorVariants] = paste0(ret$flag[minorVariants], 'Mv')
      return(ret)
    }
    else if ( length(variant) == 1 ) QCsnp(pileup, reference, x, variant)
    else variant = defaultVariant
  }
  var = pileup$call == variant

  flag = ''
  
  #check for mapQ 0 or 1, if too many, flag as repeat region.
  #then remove those reads.
  lowMqReads = 0
  if ( any(pileup$mapq < 2) ) {
    if ( sum(pileup$mapq < 2)/nrow(pileup) > 0.5 )
      flag = paste0(flag, 'Rep')
    lowMqReads = sum(pileup$mapq < 2)
    pileup = pileup[pileup$mapq > 1,]
    ref = pileup$call == reference
    var = pileup$call == variant
  }

  #check for palindromic reads and remove. Flag variant if more than half.
  if ( any(pileup$note == 'palindrome') ) {
    if ( sum(pileup$note == 'palindrome')/nrow(pileup) > 0.5 )
      flag = paste0(flag, 'Pa')
    pileup = pileup[!grepl('palindrome', pileup$note),]
    ref = pileup$call == reference
    var = pileup$call == variant
  }

  #compare base quality scores between variant and reference
  pbq = 1 
  if ( sum(ref) > 0 & sum(var) > 0 ) {
    pbq = if ( sum(ref > 0) ) wilcox.test(pileup$qual[var], pileup$qual[ref], alternative='less', exact=F)$p.value else 1
    if ( is.na(pbq) ) pbq = 1
    if ( pbq < 0.01 & mean(pileup$qual[var]) < 30 & mean(pileup$qual[ref]) - mean(pileup$qual[var]) > 10  ) flag = paste0(flag, 'Bq')
    else if ( mean(pileup$qual[var]) < 15 ) flag = paste0(flag, 'Bq')
    else if ( mean(pileup$qual[var]) < 30 - sum(var)*3 ) flag = paste0(flag, 'Bq') #if few reads, better be high quality
    #else if ( sum(pileup$qual[var] > 30) < 0.1*sum(var) ) flag = paste0(flag, 'Bq')
    #the wilcox test fails if all the scores are the same, returning NA. handle.
    if ( is.na(pbq) ) pbq=1
  }
  
  #compare mapping quality scores between variant and reference
  pmq = 1 
  if ( sum(ref) > 0 & sum(var) > 0 ) {
    pmq = if ( sum(ref > 0) ) wilcox.test(pileup$mapq[var], pileup$mapq[ref], alternative='less', exact=F)$p.value else 1
    if ( is.na(pmq) ) pmq = 1
    if ( pmq < 0.01 & mean(pileup$mapq[var]) < 30 & mean(pileup$mapq[ref]) - mean(pileup$mapq[var]) > 10 ) flag = paste0(flag, 'Mq')
    else if ( mean(pileup$mapq[var]) < 15 ) flag = paste0(flag, 'Mq')
    else if ( mean(pileup$mapq[var]) < 30 - sum(var)*3 ) flag = paste0(flag, 'Mq') #if few reads, better be high quality
    #else if ( sum(pileup$mapq[var] > 30) < 0.1*sum(var) ) flag = paste0(flag, 'Mq')
    #the wilcox test fails if all the scores are the same, returning NA. Handle.
    if ( is.na(pmq) ) pmq=1
  }
  
  #compare strand ratio between variant and reference
  psr = 1 
  if ( sum(ref) > 0 & sum(var) > 0 ) {
    psr = fisher.test(matrix(
      c(sum(pileup$strand[ref] == '+'), sum(pileup$strand[ref] == '-'),
        sum(pileup$strand[var] == '+'), sum(pileup$strand[var] == '-')), nrow=2))$p.value
    if ( psr < 0.001 ) flag = paste0(flag, 'Sb')
  }

  #check for stutter note
  if ( any(pileup$note == 'stutter') ) {
    flag = paste0(flag, 'St')
  }
  
  #probabilities that the base call and mapping is correct.
  bqW = 1-10^(-pileup$qual/10)
  mqW = 1-10^(-pileup$mapq/10)
  qW = bqW*mqW
  RIB = sum(1-qW)/length(qW)
  if ( is.na(RIB) ) RIB = 0

  #count weighted reads.
  refN = round(sum(qW[ref]))
  varN = round(sum(qW[var]))
  cov = round(sum(qW))

  if (varN == 1) flag = paste0(flag, 'Svr')
  #if (refN == 1) flag = paste0(flag, 'Srr')
  
  if ( lowMqReads > varN/2 & !grepl(flag, 'Rep') ) paste0(flag, 'Rep')
  
  ret = data.frame('x'=x, 'reference'=reference, 'variant'=variant, 'cov'=cov, 'ref'=refN, 'var'=varN,
    'pbq'=pbq, 'pmq'=pmq, 'psr'=psr, 'RIB'=RIB, 'flag'=flag, stringsAsFactors=F)
  if ( any(is.na(as.matrix(ret))) ) {
    catLog('Na in return value of QCsnp. Return value was:\n')
    for ( row in 1:nrow(ret) ) catLog(as.matrix(ret[row,]), '\n')
    catLog('input pileup:\n')
    for ( row in 1:nrow(pileup) ) catLog(as.matrix(pileup[row,]), '\n')
    catLog('input reference:', reference, '\n')
    catLog('input variant:', variant, '\n')
    catLog('input defaultVariant:', defaultVariant, '\n')
    catLog('input x:', x, '\n')
    stop('NA in return value of QCsnp. Abort.')
  }
  return(ret)
}

#helper function that makes sure all the variants are present in all the data frames.
shareVariants = function(variants) {
  catLog('Adding uncalled variants..')      
  common = Reduce(union, lapply(variants, rownames))
  if ( length(common) == 0 ) return()
  variants = lapply(variants, function(q) {
    new = common[!(common %in% rownames(q))]
    if ( length(new) > 0 ) {
      catLog('Found ', length(new), '..', sep='')      
      x = as.numeric(gsub('[AGNTCagtcn+-].*$', '', new))
      if ( !all(x %in% q$x) ) {
      	misX = x[!(x %in% q$x)]
      	top10 = misX[1:min(10,length(misX))]
      	warning('shareVariants found ', length(misX), ' positions where only some samples have data, such as x=', top10, '. This is likely to cause a crash. Cause is likely upstream related to reading out the variant information from the BAMs.')
      }
      newQ = q[q$x %in% x,]
      newQ = newQ[!duplicated(newQ$x),]
      rownames(newQ) = newQ$x
      newQ = newQ[as.character(x),]
      newQ$var = 0
      newQ$flag = ''
      rownames(newQ) = new
      newQ$variant = gsub('^[0-9]+', '', new)
      q = rbind(q, newQ)
      q = q[common,]
    }
    return(q)
  })
  catLog('done!\n')      
  return(variants)
}

#helper function that marks the SNPs with the gene they are in.
inGene = function(SNPs, genes, noHit = NA, genome='hg19') {
  SNPsGR = SNP2GRanges(SNPs, genome=genome)
  hits = findOverlaps(SNPsGR, genes)
  inGene = rep(noHit, length(SNPsGR))
  inGene[queryHits(hits)] = names(genes)[subjectHits(hits)]
  SNPs$inGene = inGene
  return(SNPs)
}



#Takes the sample variants and normal bam files and capture regions.
#return the variant information for the normals on the positions that the samples are called on.
getNormalVariants = function(variants, bamFiles, names, captureRegions, fasta, genome, BQoffset, dbDir, normalRdirectory,  Rdirectory, plotDirectory, cpus, exacPopulation='all', forceRedoSNPs=F, forceRedoVariants=F) {  
  SNPs = variants$SNPs
  variants = variants$variants
  variantsToCheck = unique(do.call(c, lapply(variants, rownames)))

  saveFile = paste0(Rdirectory, '/normalVariantsBI.Rdata')
  if ( file.exists(saveFile) & !forceRedoVariants ) {
    catLog('Loading saved variants by individual.\n')
    load(file=saveFile)
    return(normalVariantsBI)
  }
  

  #iterate over normals
  normalVariantsBI = list()
  for ( i in 1:length(bamFiles) ) {
    name = names(bamFiles)[i]
    bam = bamFiles[i]
    variantsSaveFile = paste0(Rdirectory, '/normalVariants.', name,'.Rdata')
    if ( file.exists(variantsSaveFile) & !forceRedoVariants ) {
      catLog('Loading saved variants for', name,'\n')
      load(file=variantsSaveFile)
      normalVariantsBI[[name]] = q
      next
    }

    preExistingVariantsFiles = list.files(normalRdirectory, pattern=paste0('q', name, '_[0-9]*.Rdata'), full.names=T)
    if ( length(preExistingVariantsFiles) > 0 ) {
        #load what variants are already called
      qList = mclapply(preExistingVariantsFiles, function(preBam) {
        catLog('Loading normal variants from', preBam, '.\n')
        load(preBam)
        return(qNew)
      }, mc.cores=cpus)
      q = do.call(rbind, qList)
      
        #fill in any missing variants
        missingVariants = variantsToCheck[!(variantsToCheck %in% rownames(q))]
        catLog('Can reuse ', sum(variantsToCheck %in% rownames(q)), ' calls.\n', sep='')
        if ( length(missingVariants) > 0 ) {
            missingX = variants[[1]][missingVariants,]$x
            missingSNPs = SNPs[SNPs$x %in% missingX,]
            
            catLog('Filling in missing normal variants from ', name,'.\n', sep='')
            qNew = newBamToVariants(bam, missingSNPs, fasta, Rdirectory, BQoffset=BQoffset, genome=genome, cpus=cpus)[[1]]
            q = rbind(q, qNew)
            q = q[!duplicated(paste0(q$x, q$variant)),]
            q = q[order(q$x, q$variant),]
            q = q[apply(!is.na(q), 1, any),]
  
            #save the new calls for future batches
            newSaveFile = paste0(normalRdirectory, '/q', name, '_', round(runif(1,1,1e10)), '.Rdata')
            catLog('Saving new normal variants back to', newSaveFile, '..')
            save(qNew, file=newSaveFile)
            catLog('done.\n')
        }
    }
    else {
      #extract variant information over the predetermined positions
      catLog('Normal variants from ', name,'.\n', sep='')

      q = newBamToVariants(bam, SNPs, fasta, Rdirectory, BQoffset, genome=genome, cpus=cpus)[[1]]
      q = q[apply(!is.na(q), 1, any),]
      qNew = q
      
      newSaveFile = paste0(normalRdirectory, '/q', name, '_', round(runif(1,1,1e10)), '.Rdata')
      catLog('Saving normal variants back to', newSaveFile, '..')
      save(qNew, file=newSaveFile)
      catLog('done.\n')
    }
    
    
    #filter out any new variants in the normal samples that are not previously seen
    q = q[variantsToCheck[variantsToCheck %in% rownames(q)],]
    normalVariantsBI[[name]] = q
    
    catLog('saving variants...')
    save(q, file=variantsSaveFile)
    catLog('done.\n')
  }
  #if any variants in the cancer samples are not seen in the normals, fill up with reference calls.
  normalVariantsBI = shareVariants(c(normalVariantsBI, variants))[1:length(normalVariantsBI)]

  
  #check db SNP for called variants.
  normalVariantsBI = matchTodbSNPs(normalVariantsBI, dir=dbDir, genome=genome, cpus=cpus)
  normalVariantsBI = matchToExac(normalVariantsBI, dir=dbDir, genome=genome, exacPopulation=exacPopulation, cpus=cpus)
  normalVariantsBI = lapply(normalVariantsBI, function(q) q[order(q$x, q$variant),])

  normalVariantsBI = list('variants'=normalVariantsBI, 'SNPs'=SNPs)
  
  catLog('Saving variants..')
  save(normalVariantsBI, file=saveFile)
  catLog('done.\n')

  plotFrequencyDiagnostics(normalVariantsBI, plotDirectory)
    
  return(normalVariantsBI)
}



bamToVariants = function(bamFiles, positions, BQoffset, genome='hg19', batchSize=1000, cpus=1) {
  
  variants = lapply(bamFiles, function(file) {
    catLog(as.character(Sys.time()), '\n')
    Npos = nrow(positions)
    Nbatch = ceiling(Npos/batchSize)
    catLog('Examining', nrow(positions), 'positions from', file, 'in', Nbatch, 'batches.\n')
    varList = mclapply(1:Nbatch, function(batchI) {
      setupTime = system.time({
        use = ((batchI-1)*batchSize+1):min(Npos, batchI*batchSize)
        thisChr = positions$chr[use]
        pos = positions$start[use]
        catLog(batchI, '.', sep='')
        if ( any(grepl('^chr1', names(scanBamHeader(file)[[1]]$targets))) )
          thisChr = paste0('chr', thisChr)
        which = GRanges(thisChr, IRanges(pos-3, pos+3))
        thisChr = gsub('chr', '', thisChr)
        p1 = ScanBamParam(which=which, what=c('pos', 'seq', 'cigar', 'qual', 'mapq', 'strand', 'qname'),
          flag=scanBamFlag(isSecondaryAlignment=FALSE))
        index = paste0(file, '.bai')
        if ( !file.exists(index) ) {
          index = gsub('.bam$', '.bai', file)
          if ( !file.exists(index) ) {
            catLog('Could not find an index file for', file, '\nI want either', paste0(file, '.bai'), 'or', index)
            stop(paste('Could not find an index file for', file, '\nI want either', paste0(file, '.bai'), 'or', index))
          }
        }
      })
      loadTime = system.time({
        regionReads = scanBam(file, index=index, param=p1)
      })
      pileupTime = system.time({
        regionReads = lapply(regionReads, function(reads) {
          keep = !is.na(reads$mapq) & !duplicated(reads$qname)
          if ( length(keep) == 0 ) return(reads)
          if ( sum(!keep) > 0 ) {
            reads$pos = reads$pos[keep]
            reads$seq = reads$seq[keep]
            reads$cigar = reads$cigar[keep]
            reads$qual = reads$qual[keep]
            reads$mapq = reads$mapq[keep]
            reads$strand = reads$strand[keep]
          }
          return(reads)
        })
        pileups = lapply(1:length(pos), function(i) readsToPileup(regionReads[[i]], pos[i], BQoffset=BQoffset))
        #catLog('(done pileup ', thisChr[1], ') ', sep='')
        catLog('.', sep='')
      })

      qcTime = system.time({
        references = as.character(positions$reference[use])
        variants = as.character(positions$variant[use])
        xs = positions$x[use]
        ret = lapply(1:length(pileups), function(i) {
          return(QCsnp(pileups[[i]], references[i], xs[i], variant='', defaultVariant=variants[i]))
        })
        ret = mergeVariantList(ret)
        ret = ret[ret$reference != ret$variant,]
        rownames(ret) = paste0(ret$x, ret$variant)
        #catLog('(done QCing ', thisChr[1], ') ', sep='')
      })

      cat('setupTime: ', setupTime[3], ', load time: ', loadTime[3], ', pileup time: ', pileupTime[3], ', qc time: ', qcTime[3], '\n', sep='')
      
      return(ret)
    }, mc.cores=cpus, mc.preschedule=F)
    catLog('done!\n')
    gc()
    
    ret = mergeVariantList(varList)
    ret = ret[ret$reference != ret$variant,]
    rownames(ret) = paste0(ret$x, ret$variant)
    catLog('done.\n')
    catLog('Variants:', nrow(ret), '\n')
    catLog('Unflagged :', sum(ret$flag==''), '\n')
    catLog('Unflagged over 0%  freq :', sum(ret$var > 0 & ret$flag==''), '\n')
    catLog('Unflagged over 20% freq :', sum(ret$var > ret$cov/5 & ret$flag==''), '\n')
    catLog('Median coverage over unflagged variants:', median(ret$cov[ret$flag=='']), '\n')
    catLog('Repeat flags:', length(grep('Rep', ret$flag)), '\n')
    catLog('Mapping quality flags:', length(grep('Mq', ret$flag)), '\n')
    catLog('Base quality flags:', length(grep('Bq', ret$flag)), '\n')
    catLog('Strand bias flags:', length(grep('Sb', ret$flag)), '\n')
    catLog('Single variant read flags:', length(grep('Svr', ret$flag)), '\n')
    catLog('Minor variant flags:', length(grep('Mv', ret$flag)), '\n')
    catLog('Stutter flags:', length(grep('St', ret$flag)), '\n')
    return(ret)
  })
  
  return(variants)
}




newBamToVariants = function(bamFiles, positions, fasta, Rdirectory, BQoffset=33, genome='hg19', batchSize=1000, cpus=1, stopAtNoReads=TRUE) {
  
  samtoolsVersion = system('samtools --version', intern=T)[1]

  variants = lapply(bamFiles, function(file) {
    catLog(as.character(Sys.time()), '\n')
    hasChr = any(grepl('^chr', names(scanBamHeader(file)[[1]]$targets)))
    if ( hasChr ) positions$chr = paste0('chr', positions$chr)
    
    Npos = nrow(positions)
    reducedBatchSize = pmin(batchSize, ceiling(Npos/cpus))
    Nbatch = ceiling(Npos/reducedBatchSize)
    catLog('Examining', nrow(positions), 'positions from', file, 'in', Nbatch, 'batches.\n')
    varList = mclapply(1:Nbatch, function(batchI) {
      catLog(batchI, '.', sep='')
      use = ((batchI-1)*reducedBatchSize+1):min(Npos, batchI*reducedBatchSize)
      pileups = bamToPileup(file, fasta, positions[use,], batchI, Rdirectory=Rdirectory,
      						BQoffset=BQoffset, samtoolsVersion=samtoolsVersion)
      catLog('.', sep='')
      
      references = as.character(positions$reference[use])
      variants = as.character(positions$variant[use])
      xs = positions$x[use]
      ret = lapply(1:length(pileups), function(i) {
        return(QCsnp(pileups[[i]], references[i], xs[i], variant='', defaultVariant=variants[i]))
      })
      ret = mergeVariantList(ret)
      ret = ret[ret$reference != ret$variant,]
      rownames(ret) = paste0(ret$x, ret$variant)

      if ( class(ret) != 'data.frame' || !('x' %in% names(ret)) || any(is.na(ret$x)) ) {
        debugFile = paste0(Rdirectory, '/debug_', basename(file), '_batch', batchI , '.mpileup.Rdata')
        catLog("Variants from batch ", batchI, ' are corrupt. Saving corresponding mpileups and variants to ', debugFile, '.\n',
               "Feel free to delete all debug files with\n",
               'deleteDebugFiles(\'', normalizePath(Rdirectory) ,'\')\n', sep='')
        debug = list('pileups'=pileups, 'ret'=ret)
        save(debug, file=debugFile)
      }
            
      return(ret)
    }, mc.cores=cpus, mc.preschedule=F)
    catLog('done!\n')
    gc()
    
    ret = mergeVariantList(varList)
    ret = ret[ret$reference != ret$variant,]
    rownames(ret) = paste0(ret$x, ret$variant)
    catLog('done.\n')
    catLog('Variants:', nrow(ret), '\n')
    catLog('Unflagged :', sum(ret$flag==''), '\n')
    catLog('Unflagged over 0%  freq :', sum(ret$var > 0 & ret$flag==''), '\n')
    catLog('Unflagged over 20% freq :', sum(ret$var > ret$cov/5 & ret$flag==''), '\n')
    catLog('Median coverage over unflagged variants:', median(ret$cov[ret$flag=='']), '\n')
    catLog('Repeat flags:', length(grep('Rep', ret$flag)), '\n')
    catLog('Mapping quality flags:', length(grep('Mq', ret$flag)), '\n')
    catLog('Base quality flags:', length(grep('Bq', ret$flag)), '\n')
    catLog('Strand bias flags:', length(grep('Sb', ret$flag)), '\n')
    catLog('Single variant read flags:', length(grep('Svr', ret$flag)), '\n')
    catLog('Minor variant flags:', length(grep('Mv', ret$flag)), '\n')
    catLog('Stutter flags:', length(grep('St', ret$flag)), '\n')
    if ( nrow(ret) > 0 & sum(ret$cov) == 0 & stopAtNoReads ) {
      catLog('Found no reads over any positions in the VCF. This is typically a sign of failing to read from the bam (', file, '), bam index or fasta (', fasta, '). Or it can be an issue with samtools not being properly available. Cant recover, stopping here.\n', sep='')
      stop('Found no reads over any positions in the VCF. This is typically a sign of failing to read from the bam (', file, '), bam index or fasta (', fasta, '). Or it can be an issue with samtools not being properly available.')
    }
    if ( any(is.na(ret$x)) ) {
      catLog('\nWARNING: Found ', sum(is.na(ret$x)), ' NA variants. This will likely be a problem downstreams, but trying to continue.\n',
'If no other relevant warnings, then out of memory may be the cause. If so, try increasing available memory or decrease cpus.\n\n', sep='')
      warning('WARNING: Found ', sum(is.na(ret$x)), ' NA variants. This will likely be a problem downstreams, but trying to continue.\n',
'If no other relevant warnings, then out of memory may be the cause. If so, try increasing available memory or decrease cpus.')
    }
    return(ret)
  })
  
  return(variants)
}




deleteDebugFiles = function(Rdirectory, delete=F) {
  debugFiles = list.files(normalizePath(Rdirectory), pattern='debug_.*\\.Rdata', full.names=T)
  cat('Identified files for deletion:', debugFiles, sep='\n')
  if ( !delete )
    cat('option \'delete\' is set to false. If you want to carry out the deletion, call with \'delete\' set to TRUE.\n')
  if ( delete ) unlink(debugFiles)
}
ChristofferFlensburg/superFreq documentation built on Nov. 15, 2023, 6:15 a.m.