R/mergeMethods.R

Defines functions cleanVariantRownames sampleToIndividual inSubgroup getSubgroups inProject getProjects getAllVEPdata captureRegionsToBed captureRegionsToGenome captureRegionsToBQoffset captureRegionsToNormalPath updateStory fillInMissingVariants mergeStrs getIndividualSmallVariants getIndividualVariants getAllIndividualVariants getProjectVariants cleanVariants updateMultisamplePlots updateCNVplots updateScatterPlots samplesToRdirs sampleToRdir getNormalVariant getVariant getFit getCNV

getCNV = function(metaData, samples) {
  Rdirs = unique(samplesToRdirs(samples, metaData))
  cnvs = list()
  for ( Rdir in Rdirs ) {
    catLog('Loading ', paste0(Rdir, '/clusters.Rdata'), '...', sep='')
    load(file=paste0(Rdir, '/clusters.Rdata'))
    catLog('done.\n')
    names = names(clusters)
    #names = gsub('[_-]', '.', names)
    #names(clusters) = gsub('[_-]', '.', names(clusters))
    names = names[names %in% samples]
    cnvs[names] = clusters[names]
  }
  cnvs = cnvs[samples]
  clusterColumns = Reduce(intersect, lapply(cnvs, function(cnv) colnames(cnv$clusters)))
  crColumns = Reduce(intersect, lapply(cnvs, function(cnv) colnames(cnv$CR)))
  cnvs = lapply(cnvs, function(cnv) {
    cnv$clusters =  cnv$clusters[,clusterColumns]
    cnv$CR =  cnv$CR[,crColumns]
    return(cnv)
  })
  return(cnvs)
}

getFit = function(metaData, samples) {
  Rdirs = unique(samplesToRdirs(samples, metaData))
  fits = list()
  for ( Rdir in Rdirs ) {
    loadFile = paste0(Rdir, '/fit.Rdata')
    if ( !file.exists(loadFile) ) loadFile = paste0(Rdir, '/fitP.Rdata')
    catLog('Loading ', loadFile, '...', sep='')
    fitName = load(file=loadFile)
    catLog('done.\n')
    if ( fitName == 'fit' ) fitP = fit$fit
    fitNames = gsub('-normal', '', colnames(fitP))
    #standardNames = gsub('[_-]', '.', fitNames)
    fitNames = fitNames[fitNames %in% samples]
    #standardNames = standardNames[standardNames %in% samples]
    fits[fitNames] = lapply(fitNames, function(name) subsetFit(fitP, cols=paste0(name, '-normal')))
  }
  fits = fits[samples]
  return(fits)
}

getVariant = function(metaData, samples, fillInMissing=F, cpus=1) {
  Rdirs = unique(samplesToRdirs(samples, metaData))
  variants = list('variants'=list(), 'SNPs'=c())
  for ( Rdir in Rdirs ) {
    a =
      try({
        catLog('Loading ', paste0(Rdir, '/allVariants.Rdata'), '...', sep='')
        load(file=paste0(Rdir, '/allVariants.Rdata'))
        catLog('done.\n')
        allVariants$variants$variants = cleanVariantRownames(allVariants$variants$variants)
        allVariants$variants$SNPs = allVariants$variants$SNPs[!duplicated(allVariants$variants$SNPs$x),]
        names = names(allVariants$variants$variants)
        names = names[names %in% samples]
        if ( length(names) == 0 ) stop('Couldnt find any relevant samples in ', paste0(Rdir, '/allVariants.Rdata'))
        allVariants$variants$variants = allVariants$variants$variants[names]
        #don't bother with variants that arent supported by at least two reads
        presentVariants = unique(do.call(c, lapply(allVariants$variants$variants, function(q) rownames(q[q$var > 1,]))))
        allVariants$variants$variants = lapply(allVariants$variants$variants, function(q) q[presentVariants[presentVariants %in% rownames(q)],])
        if ( length(variants$variants) > 0 ) {
          if ( fillInMissing ) {
            variants = fillInMissingVariants(metaData, variants, allVariants$variants, cpus=cpus)
            allVariants$variants = fillInMissingVariants(metaData, allVariants$variants, variants, cpus=cpus)
          }
          allVariants$variants = matchVariants(allVariants$variants, variants)
          variants = matchVariants(variants, allVariants$variants)
          variants$variants[names] = allVariants$variants$variants
        }
        else variants = allVariants$variants
      })
    if ( inherits(a, 'try-error') ) {
      warning('Failed to laod and merge variants from ', Rdir, '. Caught error message: ', a)

    }
  }
  samples = samples[samples %in% names(variants$variants)]
  variants$variants = variants$variants[samples]
  return(variants)
}

# This one doesnt work. May need to call missing variants in normals
# or retrieve from the saved variants in the normal folder.
getNormalVariant = function(metaData, variants, fillInMissing=F, cpus=1) {
  samples = names(variants$variants)
  Rdirs = unique(samplesToRdirs(samples))

  neededVariants = unique(do.call(c, lapply(variants$variants, rownames)))
  neededX = unique(variants$SNPs$x)
  ret = list(SNPs='', variants=list())
  for ( Rdir in Rdirs ) {
    #retrieve the normal variants
    load(paste0(Rdir, '/allVariants.Rdata'))
    normalVariants = allVariants$normalVariants

    #keep only needed variants
    normalVariants$variants = lapply(normalVariants$variants, function(q) q[rownames(q) %in% neededVariants,])
    normalVariants$SNPs = normalVariants$SNPs[normalVariants$SNPs$x %in% neededX,]

    ret$variants = c(ret$variants, normalVariants$variants)
    if ( inherits(ret$SNPs, 'character') ) ret$SNPs = normalVariants$SNPs
    else {
      ret$SNPs = rbind(ret$SNPs, normalVariants$SNPs)
      ret$SNPs = ret$SNPs[!duplicated(ret$SNPs),]
    }
  }

  return(ret)
}

sampleToRdir = function(bamFile, metaData, RNA=F) {
  if ( 'outputDirectories' %in% names(metaData$paths) )
    return(metaData$paths$outputDirectories$Rdirectory)
  if ( gsub('^.+/', '', dirname(bamFile)) == 'bam' ) {
    if ( RNA ) {
      candidate = gsub('bam$', 'RNAcalling/R', dirname(bamFile))
      if ( file.exists(candidate) ) return(candidate)
    }
    candidate = gsub('bam$', 'R', dirname(bamFile))
    if ( file.exists(candidate) ) return(candidate)
  }
  candidate = paste0(dirname(bamFile), '/R')
  if ( RNA ) candidate = gsub('bam$', 'RNAcalling/R', dirname(bamFile))
  if ( file.exists(candidate) ) return(candidate)
  warning(paste0('Couldnt find R directory matching bam file', bamFile))
  return(NA)
}
samplesToRdirs = function(samples, metaData) {
  sapply(samples, function(sample) sampleToRdir(metaData$samples[sample,]$BAM, metaData,
                                                metaData$samples[sample,]$DATATYPE == 'RNA'))
}

updateScatterPlots = function(metaData, cosmicDirectory='', individuals=NA, forceRedo=F, forceRedoVariants=F, cpus=1, genome='hg19') {
  if ( is.na(individuals[1]) ) individuals = metaData$individuals$individual
  
  for ( individual in individuals ) {
    logFile = paste0(metaData$paths$dataDirectory, '/R/individuals/', individual, '/runtimeTracking.log')
    assign('catLog', function(...) {cat(..., file=logFile, append=T); cat(...)}, envir = .GlobalEnv)
    catLog('\n\n\n\n\n\n\n#############################################################\n')
    catLog('Running superFreq version', superVersion(), '\n')
    catLog('\n\nDoing', individual, '...\n')
    
    
    samples = metaData$samples$NAME[metaData$samples$INDIVIDUAL == individual]
    normals = metaData$samples[samples,]$NORMAL
    names(normals) = samples

    Rdirectory = paste0(metaData$paths$dataDirectory, '/R/individuals/', individual)
    plotDirectory = paste0(metaData$paths$dataDirectory, '/plots/individuals/', individual)

        
    timePoints = metaData$samples[samples,]$TIMEPOINT
    names(timePoints) = samples
    dominantCaptureRegion = names(sort(table(metaData$samples$CAPTUREREGIONS[metaData$samples$INDIVIDUAL == individual]),
      decreasing=T))[1]
    genome = captureRegionsToGenome(dominantCaptureRegion)

    timeSeries = list(samples)
    names(timeSeries) = individual
    if ( length(timeSeries[[1]]) < 2 ) timeSeries = list()
    
    variants = getIndividualVariants(metaData, individual, cpus=cpus, forceRedo=forceRedoVariants)
    cnvs = getCNV(metaData, samples)
    stories = getStories(variants=variants, cnvs=cnvs, timeSeries=timeSeries,
      normals=normals, genome=genome, Rdirectory=Rdirectory, plotDirectory=plotDirectory, cpus=cpus, forceRedo=forceRedo)



    outputSomaticVariants(variants, genome=genome, plotDirectory=plotDirectory, cpus=cpus, forceRedo=forceRedo)
    variants = runVEP(variants, plotDirectory, cpus=cpus, genome=genome, forceRedoVEP=forceRedo)
    variants = getMoreVEPinfo(variants, plotDirectory, genome, cosmicDirectory=cosmicDirectory)

    samplePairs = metaToSamplePairs(samples, rep(individual, length(samples)), normals)
    inds = rep(individual, length(variants$variants))
    names(inds) = names(variants$variants)
    
    outputSomaticVariants(variants, genome, plotDirectory, cpus=cpus, forceRedo=forceRedo)
    makeSNPprogressionPlots(variants, timeSeries=timeSeries, normals = normals, plotDirectory=plotDirectory, forceRedo=forceRedo)
    makeRiverPlots(stories$stories, variants, genome=genome, cpus=cpus, plotDirectory=plotDirectory, forceRedo=forceRedo)
    makeScatterPlots(variants, samplePairs, timePoints, plotDirectory, genome=genome, cpus=cpus, forceRedo=forceRedo)
    makeCloneScatterPlots(variants, stories$stories, samplePairs, inds, timePoints,
                          plotDirectory, genome=genome, cpus=cpus, forceRedo=forceRedo)
    outputNewVariants(variants, samplePairs, genome, plotDirectory, cpus=cpus, forceRedo=forceRedo)


    
  }

  catLog('Done!\n')

}

updateCNVplots = function(metaData, forceRedo=F, cpus=1, genome='hg19') {
  samples = metaData$individuals$samples
  
  redoPairs = list()
  for ( ind in individuals ) {
    try({
      if ( sum(metaData$samples$INDIVIDUAL == ind) < 2 ) next
      samples = metaData$samples$NAME[metaData$samples$INDIVIDUAL == ind]
      pairs = list()
      for ( i in 2:length(samples) )
        for ( j in 1:(i-1) )
          pairs = c(pairs, list(samples[c(j,i)]))
      
      for ( pair in pairs ) {
        freqDir = paste0(metaData$paths$dataDirectory, '/plots/samples/', pair[1], '/frequencyScatters')
        ensureDirectoryExists(freqDir)
        plotDir = paste0(freqDir, '/', pair[2])
        ensureDirectoryExists(plotDir)
        if ( forceRedo | length(list.files(plotDir)) == 0 ) redoPairs = c(redoPairs, list(pair))
      }
    })
  }

  if ( length(redoPairs) == 0 ) {
    catLog('Scatter plots were already in place.\n')
    return()
  }
  
  variants = getVariant(metaData, unique(unlist(redoPairs)), cpus=cpus)

  for ( pair in redoPairs ) {
    try({
      boring = (variants$variants[[pair[1]]]$var == 0 & variants$variants[[pair[2]]]$var == 0)
      q1 = variants$variants[[pair[1]]][!boring,]
      q2 = variants$variants[[pair[2]]][!boring,]
      catLog('Caculating flagged p-values...')
      ps=qualityScatter(q1, q2, variants$SNPs, cpus=cpus, verbose=F)
      catLog(' and unflagged...')
      psuf=qualityScatter(q1, q2, variants$SNPs, cpus=cpus, plotFlagged=F, verbose=F)
      catLog('done.\n')
      
      freqDir = paste0(metaData$paths$dataDirectory, '/plots/samples/', pair[1], '/frequencyScatters')
      ensureDirectoryExists(freqDir)
      plotDir = paste0(freqDir, '/', pair[2])
      ensureDirectoryExists(plotDir)
      
      freqDir = paste0(metaData$paths$dataDirectory, '/plots/samples/', pair[2], '/frequencyScatters')
      ensureDirectoryExists(freqDir)
      linkDir = paste0(freqDir, '/', pair[1])
      ensureDirectoryExists(linkDir)
      
      outfile = paste0(plotDir, '/all.png')
      linkfile = paste0(linkDir, '/all.png')
      catLog('Plotting to', outfile, '\n')
      png(outfile, width = 10, height=10, res=144, units='in')
      qualityScatter(q1, q2, variants$SNPs, verbose=F, ps=psuf, plotFlagged=F,
                     main=paste0('clean variants: ', pair[1], ' vs ', pair[2]),
                     xlab=paste0('variant frequency for ', pair[1]),
                     ylab=paste0('variant frequency for ', pair[2]), cpus=1)
      dev.off()
      if ( file.exists(linkfile) ) system(paste0('rm ', linkfile))
      system(paste0('ln -s ', outfile, ' ', linkfile))
      
      outfile = paste0(plotDir, '/allNamed.png')
      linkfile = paste0(linkDir, '/allNamed.png')
      catLog('Plotting to', outfile, '\n')
      png(outfile, width = 10, height=10, res=144, units='in')
      qualityScatter(q1, q2, variants$SNPs, verbose=F, ps=psuf, plotFlagged=F,
                     main=paste0('clean variants: ', pair[1], ' vs ', pair[2]),
                     xlab=paste0('variant frequency for ', pair[1]),
                     ylab=paste0('variant frequency for ', pair[2]), cpus=1,
                     print=T, printRedCut=0.25)
      dev.off()
      if ( file.exists(linkfile) ) system(paste0('rm ', linkfile))
      system(paste0('ln -s ', outfile, ' ', linkfile))
      
      outfile = paste0(plotDir, '/allFlagged.png')
      linkfile = paste0(linkDir, '/allFlagged.png')
      catLog('Plotting to', outfile, '\n')
      png(outfile, width = 10, height=10, res=144, units='in')
      qualityScatter(q1, q2, variants$SNPs, verbose=F, ps=ps,
                     main=paste0('all variants: ', pair[1], ' vs ', pair[2]),
                     xlab=paste0('variant frequency for ', pair[1]),
                     ylab=paste0('variant frequency for ', pair[2]), cpus=1)
      dev.off()
      if ( file.exists(linkfile) ) system(paste0('rm ', linkfile))
      system(paste0('ln -s ', outfile, ' ', linkfile))
      
      catLog('Now by chromsome. Preparing..')
      chrs = xToChr(q1$x, genome=genome)
      catLog('done!\n  Plotting chr:')
      for ( chr in names(chrLengths(genome)) ) {
        outfile = paste0(plotDir, '/chr', chr, '.png')
        linkfile = paste0(linkDir, '/chr', chr, '.png')
        png(outfile, width = 10, height=10, res=144, units='in')
        catLog(chr, '..', sep='')
        use = chrs == chr
        qualityScatter(q1[use,], q2[use,], variants$SNPs, ps=ps[use],
                       main=paste0('all variants: ', pair[1], ' vs ', pair[2], ', chr', chr),
                       xlab=paste0('variant frequency for ', pair[1]),
                       ylab=paste0('variant frequency for ', pair[2]), cpus=1, print=T,
                       printRedCut=0.25, plotPosition=T, verbose=F)
        dev.off()
        if ( file.exists(linkfile) ) system(paste0('rm ', linkfile))
        system(paste0('ln -s ', outfile, ' ', linkfile))
      }
      catLog('done!\n')
    })
  }

  for ( ind in individuals ) {
    indDir = paste0(metaData$paths$dataDirectory, '/plots/individuals/', ind)
    ensureDirectoryExists(indDir)
    outfile = paste0(indDir, '/newVariants.xls')
    
    samples = metaData$samples$NAME[metaData$samples$INDIVIDUAL == ind]
    #set up links to sample plot directories
    for ( sample in samples ) {
      samplePlotDir = paste0(metaData$paths$dataDirectory, '/plots/samples/', sample)
      ensureDirectoryExists(samplePlotDir)
      linkfile = paste0(indDir, '/', sample)
      if ( file.exists(linkfile) ) system(paste0('rm ', linkfile))
      system(paste0('ln -s ', samplePlotDir, ' ', linkfile))
    }

    #ignore if not at least 2 samples
    if ( length(samples) < 2 ) next

    #set up pairs
    pairs = list()
    for ( i in 2:length(samples) )
      for ( j in 1:(i-1) )
        pairs = c(pairs, list(samples[c(j,i)]))

    #get the significantly different variants
    if ( (!file.exists(outfile) | forceRedo) & length(pairs) > 0 ) {
      news = list()
      for ( pair in pairs ) {
        name1 = substring(gsub('\\.', '', paste0(pair[1], ' to ', pair[2])), 1, 31)
        name2 = substring(gsub('\\.', '', paste0(pair[2], ' to ', pair[1])), 1, 31)
        catLog('Looking for new cancer variants in ', name1, '\n')
        news[[name1]] = newVariants(variants$variants[[pair[1]]], variants$variants[[pair[2]]], variants$SNPs, genome, cpus=cpus)
        catLog('Looking for new cancer variants in ', name2, '\n')
        news[[name2]] = newVariants(variants$variants[[pair[2]]], variants$variants[[pair[1]]], variants$SNPs, genome, cpus=cpus)
      }

      #print to excel
      catLog('Writing to', outfile, '\n')
      WriteXLS('news', outfile)
    }
  }
  catLog('Done!\n')

}


updateMultisamplePlots = function(metaData, forceRedo=F) {
  individuals = metaData$individuals$individual
  
  redoInd = c()
  for ( ind in individuals ) {
    if ( sum(metaData$samples$INDIVIDUAL == ind) < 2 ) next
    indDir = paste0(metaData$paths$dataDirectory, '/plots/individuals/', ind)
    ensureDirectoryExists(indDir)
    plotFile = paste0(indDir, '/multisample.pdf')
    if ( forceRedo | !file.exists(plotFile) ) redoInd = c(redoInd, ind)
  }
  
  if ( length(redoInd) == 0 ) {
    catLog('Multisample plots were already in place.\n')
    return()
  }
  
  variants = getVariant(metaData, metaData$samples$NAME[metaData$samples$INDIVIDUAL %in% redoInd])
  
  for ( ind in redoInd ) {
    try({
      indDir = paste0(metaData$paths$dataDirectory, '/plots/individuals/', ind)
      timeSeries = metaData$samples$NAME[metaData$samples$INDIVIDUAL == ind]
      normals = metaData$samples$NORMAL[metaData$samples$INDIVIDUAL == ind]
      if ( length(timeSeries) < 2 ) next
      outfile = paste0(indDir, '/frequencyHeatmap.pdf')
      excelFileDB = paste0(indDir, '/frequencyHeatmap_DB.xls')
      excelFileNotDB = paste0(indDir, '/frequencyHeatmap_NotDB.xls')
      catLog('Plotting SNP progression to ', outfile, '..', sep='')
      pdf(outfile, width = 15, height=10)
      qualityProgression(variants$variants[timeSeries], variants$SNPs, normals, nondb=F,
                         excelFile=excelFileDB, main='dbSNPs only')
      qualityProgression(variants$variants[timeSeries], variants$SNPs, normals, db=F,
                         excelFile=excelFileNotDB, main='non-dbSNPs only')
      dev.off()
      catLog('done.\n')
    })
  }
  catLog('Done!\n')

}

cleanVariants = function(variants) {
  variants$variants = lapply(variants$variants, function(q) {
    q = q[!is.na(q$x),]
    q$severity[is.na(q$severity)] = 100
    q$type[is.na(q$type)] = 'notChecked'
    return(q)
    })
  return(variants)
}


getProjectVariants = function(metaData, project, cpus=1, onlyDNA=T, includeNormal=F, forceRedo=F) {
  if ( length(inProject(metaData, project, includeNormal=T, onlyDNA=onlyDNA)) == 0 ) {
    warning(paste0('couldnt find any samples in project ', project, '.'))
    return()
  }
  saveFile = paste0(metaData$project[project,]$Rdirectory, '/variants.Rdata')
  ensureDirectoryExists(metaData$project[project,]$Rdirectory)
  if ( file.exists(saveFile) & !forceRedo ) {
    catLog('Loading saved project variants...')
    load(saveFile)
    return(variants)
  }

  samples = inProject(metaData, project, includeNormal=includeNormal, onlyDNA=onlyDNA)
  variants = getVariant(metaData, samples, cpus=cpus)

  catLog('Saving project variants to', saveFile, '.\n')
  save(variants, file=saveFile)
  return(variants)
}

getAllIndividualVariants = function(metaData, individual, cpus=1, forceRedo=F) {
  if ( !(individual %in% metaData$individuals$individual) ) {
    warning(paste0('couldnt find individual ', individual, ' in metadata.'))
    return()
  }
  saveFile = paste0(metaData$individuals[individual,]$Rdirectory, '/allVariants.Rdata')
  ensureDirectoryExists(metaData$individuals[individual,]$Rdirectory)
  if ( file.exists(saveFile) & !forceRedo ) {
    catLog('Loading saved variants...')
    load(saveFile)
    return(allVariants)
  }

  #import the variants from their respective batch directories. Match and cross-check variants if needed.
  variants = getIndividualVariants(metaData, individual, cpus=cpus, forceRedo=forceRedo)

  #get the variants in the individuals from the normal samples
  dominantCaptureRegion = names(sort(table(metaData$samples$CAPTUREREGIONS[metaData$samples$INDIVIDUAL == individual]),decreasing=T))[1]
  genome = captureRegionsToGenome(dominantCaptureRegion)
  BQoffset = captureRegionsToBQoffset(dominantCaptureRegion)
  Rdirectory = paste0(metaData$paths$dataDirectory, '/R/individuals/', individual)
  plotDirectory = paste0(metaData$paths$dataDirectory, '/plots/individuals/', individual)
  
  normalPath = captureRegionsToNormalPath(dominantCaptureRegion)
  normalBamFiles = list.files(paste0(normalPath, '/bam'), pattern = '*.bam$', full.names=T)
  normalNames = gsub('^.*/', '', gsub('.bam$', '', normalBamFiles))
  normalRdirectory = paste0(normalPath, '/R')
  normalVariants = getNormalVariants(variants, normalBamFiles, normalNames, dominantCaptureRegion,
                                     genome, BQoffset, normalRdirectory, Rdirectory, plotDirectory, cpus=cpus,
                                     forceRedoSNPs=F, forceRedoVariants=F)

  #redo flagging from the normals, and somatic analysis, by first resetting whatever was done in the batch
  #In particular this will allow the somatic score to be calculated using a matched normal from another batch.
  flagsFromNormal = c('Nnn', 'Nnc', 'Vn', 'Mc', 'Pn', 'Nr')
  variants$variants = lapply(variants$variants, function(q) {
    for ( flag in flagsFromNormal )
      q$flag = gsub(flag, '', q$flag)
    q$somaticP = 0
    return(q)
  })

  samples = metaData$samples$NAME[metaData$samples$INDIVIDUAL == individual]
  normals = metaData$samples[samples,]$NORMAL
  names(normals) = samples
  allVariants = matchFlagVariants(variants, normalVariants, individual, normals, Rdirectory,
                                  cpus=cpus, forceRedoMatchFlag=forceRedo)

  return(allVariants)
}


getIndividualVariants = function(metaData, individual, cpus=1, forceRedo=F) {
  if ( !(individual %in% metaData$individuals$individual) ) {
    warning(paste0('couldnt find individual ', individual, ' in metadata.'))
    return()
  }
  saveFile = paste0(metaData$individuals[individual,]$Rdirectory, '/variants.Rdata')
  ensureDirectoryExists(metaData$individuals[individual,]$Rdirectory)
  if ( file.exists(saveFile) & !forceRedo ) {
    catLog('Loading saved variants...')
    load(saveFile)
    return(variants)
  }

  samples = rownames(metaData$samples)[metaData$samples$INDIVIDUAL == individual]
  variants = getVariant(metaData, samples, fillInMissing=T, cpus=cpus)
  keepX = unique(do.call(c, lapply(variants$variants, function(q) q$x[q$var > 1])))
  variants$SNPs = variants$SNPs[variants$SNPs$x %in% keepX,]
  variants$variants = lapply(variants$variants, function(q) q[q$x %in% keepX,])
  keepVar = unique(do.call(c, lapply(variants$variants, function(q) rownames(q)[q$var > 1])))
  variants$variants = lapply(variants$variants, function(q) q[keepVar,])
    
  save(variants, file=saveFile)

  return(variants)
}


getIndividualSmallVariants = function(metaData, individual, cpus=1, forceRedo=F) {
  if ( !(individual %in% metaData$individuals$individual) ) {
    warning(paste0('couldnt find individual ', individual, ' in metadata.'))
    return()
  }

  saveFile = paste0(metaData$individuals[individual,]$Rdirectory, '/smallVariants.Rdata')
  ensureDirectoryExists(metaData$individuals[individual,]$Rdirectory)
  if ( file.exists(saveFile) & !forceRedo ) {
    catLog('Loading saved small variants...')
    load(saveFile)
    return(variants)
  }

  variants =
    getIndividualVariants(metaData=metaData, individual=individual,
                          cpus=cpus, forceRedo=forceRedo)

  qs = variants$variants
  keep = sapply(qs, function(q) q$flag=='' & q$cov > 5 & q$var/q$cov > 0.05 & !is.na(q$cov))
  keep = apply(keep, 1, any)

  keepColumns = c('x', 'cov', 'ref', 'var', 'flag', 'db', 'dbMAF', 'dbValidated', 'somaticP', 'germline', 'severity', 'isCosmicCensus')
  smallVariants = lapply(qs, function(q) q[keep,names(q) %in% keepColumns])

  save(smallVariants, file=saveFile)
  return(smallVariants)
}

mergeStrs = function(strs, sep='') {
  return(do.call(paste, c(as.list(strs), sep='')))
}

#takes all the variants present in qs2 that are not present in qs1,
#checks them up in the BAMs of qs1 and adds them to qs1, which is returned.
fillInMissingVariants = function(metaData, qs1, qs2, cpus=1) {
  #check that qs1 and qs2 share columns.
  if ( !setequal(colnames(qs1$variants[[1]]), colnames(qs2$variants[[1]])) ) {
    warning(paste('FillInMissingVariants called with variants with different columns. qs1 has', mergeStrs(colnames(qs1$variants[[1]])), ', qs2 has', mergeStrs(colnames(qs2$variants[[1]])), '.'))
    commonCols = intersect(colnames(qs1$variants[[1]]), colnames(qs2$variants[[1]]))
    qs1$variants = lapply(qs1$variants, function(q) q[,commonCols])
    qs2$variants = lapply(qs2$variants, function(q) q[,commonCols])
  }
  
  #if columns not in the same order, have qs1 match the qs2 order.
  if ( !identical(colnames(qs1$variants[[1]]), colnames(qs2$variants[[1]])) ) {
    catLog('Matching order of columns.\n')
    qs1$variants = lapply(qs1$variants, function(q) q[,colnames(qs2$variants[[1]],)])
  }
  
  allSNPs = qs2$SNPs[!duplicated(qs2$SNPs$x),]
  q2 = qs2$variants[[1]]
  q2 = q2[!duplicated(paste0(q2$x, q2$variant)),]
  q2 = q2[order(q2$x, q2$variant),]
  allX = q2$x
  allVar = q2$variant
  allRownames = rownames(q2)

  catLog('Filling in missing variants: Got', length(allRownames), 'variants that need to be present.\n')

  for ( i in 1:length(qs1$variants) ) {
    #first find the missing variants
    name = names(qs1$variants)[i]
    q = qs1$variants[[i]]
    bam = metaData$samples[names(qs1$variants)[i],]$BAM
    missing = !(allRownames %in% rownames(q))
    catLog('Sample', names(qs1)[i], 'has', sum(missing), 'missing variants.\n')
    if ( sum(missing) == 0 ) next

    #see if the positions of the missing variants are already called in q1 (for other variants)
    #then we can re-use the coverage and set the variant count to 0, dont need to re-check BAM
    positionChecked = allX[missing] %in% q$x
    catLog(sum(positionChecked), 'can be reused from previous analysis.\n')
    if ( sum(positionChecked) > 0 )
      q = rbind(q, shareVariants(list(q[q$x %in% allX[missing][positionChecked],], q2[missing,][positionChecked,]))[[1]])
    


    #for the remaining variants, we need to check the BAM for q1.
    missing = !(allRownames %in% rownames(q))
    catLog(sum(missing), 'need to be called from bam.\n')
    if ( sum(missing) > 0 ) {
      newSNPs = allSNPs[allSNPs$x %in% allX[missing],]
      BQoffset = captureRegionsToBQoffset(metaData$samples[name,]$CAPTUREREGIONS)
      genome = captureRegionsToGenome(metaData$samples[name,]$CAPTUREREGIONS)
      newQ = bamToVariants(bam=bam, positions=newSNPs, BQoffset=BQoffset, genome=genome, cpus=cpus)[[1]]
      #newQ = QCsnps(pileups=importQualityScores(newSNPs, bam, BQoffset, genome=genome, cpus=cpus)[[1]],
      #  SNPs=newSNPs, cpus=cpus)
      catLog('Matching new calls to the missing variants.\n')
      newQ = newQ[order(newQ$x, newQ$variant),]
      #fill in empty entry if the right variant wasn't called over the position
      newQ = rbind(newQ, shareVariants(list(newQ, q2[missing,]))[[1]])
      #only keep the missing variants that are present in q2 but not q1
      newQ = newQ[rownames(newQ) %in% allRownames[missing],]

      #add any missing columns, such as db, VEP info, etc.
      missingColumns = colnames(q2)[!(colnames(q2) %in% colnames(newQ))]
      if ( length(missingColumns) > 0 ) {
        newQ = cbind(newQ, q2[rownames(newQ),missingColumns])
      }
      #excpet somaticP that should be set to 0, as it wasnt called in q1.
      if ( 'somaticP' %in% colnames(newQ) ) newQ$somaticP = 0

      #any missing information, such as gene, dbSNP information or VEP output can be copied from q2.
      missingColumns = colnames(q2)[!(colnames(q2) %in% newQ)]
      if ( length(missingColumns) > 0 )
        newQ = cbind(newQ, q2[rownames(newQ),missingColumns])
      newQ = newQ[,colnames(q)]
      
      catLog('Adding', nrow(newQ), 'freshly called variants.\n')
      q = rbind(q, newQ)
    }
    
    q = q[order(q$x, q$variant),]
    qs1$variants[[i]] = q
  }

  if ( length(qs1$SNPs) != length(qs2$SNPs) || any(colnames(qs2$SNPs) != colnames(qs1$SNPs)) ) {
    catLog('Matching SNP columns.\n')
    commonColumns = intersect(colnames(qs1$SNPs), colnames(qs2$SNPs))
    qs1$SNPs = qs1$SNPs[,commonColumns]
    qs2$SNPs = qs2$SNPs[,commonColumns]
  }
  
  
  if ( any(!(qs2$SNPs$x %in% qs1$SNPs$x)) ) {
    catLog('Matching meta-information about variants.\n')
    qs1$SNPs = rbind(qs1$SNPs, qs2$SNPs[!(qs2$SNPs$x %in% qs1$SNPs$x),])
  }
  
  return(qs1)
}


updateStory = function(metaData, individual, cpus=1, forceRedo=F) {
  samples = metaData$samples$NAME[metaData$samples$INDIVIDUAL==individual]
  timeSeries = list(samples)
  names(timeSeries) = individual
  normals = metaData$samples[samples,]$NORMAL
  names(normals) = samples

  variants = getIndividualVariants(metaData, individual)
  dominantCaptureRegion = names(sort(table(metaData$samples$CAPTUREREGIONS[metaData$samples$INDIVIDUAL == individual]),decreasing=T))[1]
  genome = captureRegionsToGenome(dominantCaptureRegion)
  BQoffset = captureRegionsToBQoffset(dominantCaptureRegion)
  Rdirectory = paste0(metaData$paths$dataDirectory, '/R/individuals/', individual)
  plotDirectory = paste0(metaData$paths$dataDirectory, '/plots/individuals/', individual)
  
  normalPath = captureRegionsToNormalPath(dominantCaptureRegion)
  normalBamFiles = list.files(paste0(normalPath, '/bam'), pattern = '*.bam$', full.names=T)
  normalNames = gsub('^.*/', '', gsub('.bam$', '', normalBamFiles))
  normalRdirectory = paste0(normalPath, '/R')
  
  normalVariants = getNormalVariants(variants, normalBamFiles, normalNames, dominantCaptureRegion,
                                     genome, BQoffset, normalRdirectory, Rdirectory, plotDirectory, cpus=cpus,
                                     forceRedoSNPs=F, forceRedoVariants=F)
  
  cnvs = getCNV(metaData, samples)

  stories = getStories(variants, cnvs, timeSeries, normals, Rdirectory, plotDirectory, cpus=cpus, forceRedo=F)

  variants = getAllVEPdata(metaData, variants)

  
}

captureRegionsToNormalPath = function(captureRegions, coverage=F) {
  warning('Called a hardcoding function!')
  if ( captureRegions == 'SureSelect' & !coverage ) return('/wehisan/general/academic/grp_leukemia_genomics/normals/Agilent')
  if ( captureRegions == 'SureSelect' & coverage ) return('/wehisan/general/academic/grp_leukemia_genomics/normals/AgilentBlood')
  if ( captureRegions == 'TrueSeq' ) return('/wehisan/general/academic/grp_leukemia_genomics/normals/Illumina')
  if ( captureRegions == 'RNA' ) return('/wehisan/general/academic/grp_leukemia_genomics/normals/RNA')
  if ( captureRegions == 'kinome' ) return('/wehisan/general/academic/grp_leukemia_genomics/normals/kinome')
  if ( captureRegions == 'genome' ) return('/wehisan/general/academic/grp_leukemia_genomics/normals/genome')
  if ( captureRegions == 'siblingMUD' ) return('/wehisan/general/academic/grp_leukemia_genomics/normals/siblingMUD')
  warning(paste0('Dont know a normal path for capture regions ', captureRegions))
  return('')
}
captureRegionsToBQoffset = function(captureRegions) {
  warning('Called a hardcoding function!')
  if ( captureRegions == 'SureSelect' ) return(33)
  if ( captureRegions == 'TrueSeq' ) return(64)
  if ( captureRegions == 'RNA' ) return(33)
  if ( captureRegions == 'kinome' ) return(33)
  if ( captureRegions == 'genome' ) return(33)
  if ( captureRegions == 'siblingMUD' ) return(33)
  warning(paste0('Dont know a BQoffset for capture regions ', captureRegions))
  return('')
}
captureRegionsToGenome = function(captureRegions) {
  warning('Called a hardcoding function!')
  if ( captureRegions == 'SureSelect' ) return('hg19')
  if ( captureRegions == 'TrueSeq' ) return('hg19')
  if ( captureRegions == 'RNA' ) return('hg19')
  if ( captureRegions == 'kinome' ) return('hg19')
  if ( captureRegions == 'genome' ) return('hg19')
  if ( captureRegions == 'siblingMUD' ) return('hg19')
  warning(paste0('Dont know a genome for capture regions ', captureRegions))
  return('')
}
captureRegionsToBed = function(captureRegions) {
  warning('Called a hardcoding function!')
  if ( captureRegions == 'SureSelect' ) return('/wehisan/general/academic/grp_leukemia_genomics/AGILENT_EXOME/HUMAN_v5/hg19.bed')
  if ( captureRegions == 'TrueSeq' ) return('/wehisan/general/academic/grp_leukemia_genomics/F13TSFAPHT0461_HUMwkbX/analysis/captureRegions.bed')
  if ( captureRegions == 'RNA' ) return('/wehisan/general/academic/grp_leukemia_genomics/data/captureRegions/RNA/captureRegions.bed')
  if ( captureRegions == 'kinome' ) return('/wehisan/general/academic/grp_leukemia_genomics/DLBCL\\ PROJECT/analysis/captureRegions.gc.bed')
  if ( captureRegions == 'genome' ) return('/wehisan/general/academic/grp_leukemia_genomics/data/captureRegions/genomic/captureRegions.hg19.10k.bed')
  if ( captureRegions == 'siblingMUD' ) return('/wehisan/general/academic/grp_leukemia_genomics/data/captureRegions/siblingMUD/amplicons.bed')
  warning(paste0('Dont know a bed file path for capture regions ', captureRegions))
  return('')
}

getAllVEPdata = function(metaData, variants) {
  samples = names(variants$variants)
  plotDirectories = gsub('R$', 'plots', samplesToRdirs(samples, metaData))
  for ( sample in samples ) {
    variantSample = variants
    variantSample$variants = variantSample$variants[sample]
    variantSample = getMoreVEPinfo(variantSample, plotDirectories[sample])
    variants$variants[[sample]] = variantSample$variants[[sample]]
  }
  return(variants)
}

#Returns the samples that are part of a project
getProjects = function(metaData, onlyDNA=T) {
  projects = strsplit(metaData$samples$PROJECT, ',')
  ret = unique(unlist(projects))
  return(ret)
}

#Returns the samples that are part of a project
inProject = function(metaData, project, includeNormal=T, onlyDNA=T) {
  projects = strsplit(metaData$samples$PROJECT, ',')
  isInProject = sapply(projects, function(sampleProjects) any(sampleProjects == project))
  if ( !includeNormal ) isInProject = isInProject & !metaData$samples$NORMAL
  if ( onlyDNA ) isInProject = isInProject & metaData$samples$DATATYPE == 'DNA'
  return(metaData$samples$NAME[isInProject])
}

#This function returns the unique subgroups of the provided project
getSubgroups = function(metaData, project, includeNormal=T, onlyDNA=T) {
  samples = inProject(metaData, project, includeNormal=includeNormal, onlyDNA=onlyDNA)
  subgroups = metaData$samples[samples,]$PROJECT.SUBGROUP
  subgroups = subgroups[subgroups != '']
  subgroupList = strsplit(subgroups, ',')
  uniqueSubgroups = unique(unlist(subgroupList))
  isInformative = sapply(uniqueSubgroups, function(subgroup) !all(sapply(subgroupList, function(groups) subgroup %in% groups)) & any(sapply(subgroupList, function(groups) subgroup %in% groups)))
  return(uniqueSubgroups[isInformative])
}

#Returns the samples that are part of a project and subgroups
inSubgroup = function(metaData, project, subgroup, includeNormal=T, onlyDNA=T) {
  projectSamples = inProject(metaData, project, includeNormal=includeNormal, onlyDNA=onlyDNA)
  subgroups = metaData$samples[projectSamples,]$PROJECT.SUBGROUP
  subgroups = strsplit(subgroups, ',')
  isInSubgroup = sapply(subgroups, function(groups) any(groups %in% subgroup))
  subgroupSamples = projectSamples[isInSubgroup]
  if ( !includeNormal ) subgroupSamples = subgroupSamples[!metaData$samples[subgroupSamples,]$NORMAL]
  if ( onlyDNA ) subgroupSamples = subgroupSamples[metaData$samples[subgroupSamples,]$DATATYPE == 'DNA']
  return(subgroupSamples)
}

#returns the individual that the samples come from.
sampleToIndividual = function(metaData, samples) {
  return(metaData$samples[samples,]$INDIVIDUAL)
}

#takes a list of variants, and makes sure the rowname of each list entry is the appropriate combination
#of the position and variant. Returns the list of variants with fixed rownames.
cleanVariantRownames = function(variants) {
  if ( length(variants) == 0 ) return(variants)

  options(scipen = 10)
  variants = lapply(variants, function(q) {
    q = q[!is.na(q$x),]
    rownames(q) = paste0(q$x, q$variant)
    return(q)
    })
  return(variants)
}
ChristofferFlensburg/superFreq documentation built on Nov. 15, 2023, 6:15 a.m.