superFreq: Calls and tracks CNAs, SNVs and clones over multiple cancer exomes.

Documented in compareGroups

projectMeanCNV = function(metaData, project, cpus=1, cosmicDirectory='', onlyDNA=T, clonalityCut=0.4, includeNormal=F, cnvWeight=1, forceRedoMean=F, forceRedoVariants=F, forceRedoMeanPlot=F, forceRedoMatrixPlot=F, genome='hg19', ignoreCNAonly=F) {
  samples = inProject(metaData, project, includeNormal=includeNormal, onlyDNA=onlyDNA)
  if  ( length(samples) == 0 ) {
    warning('No cancer to analyse in project', project)
    return()
  }
  individuals = metaData$samples[samples,]$INDIVIDUAL

  variants = getProjectVariants(metaData, project, cpus=cpus, onlyDNA=onlyDNA,
    includeNormal=includeNormal, forceRedo=forceRedoVariants)
  
  saveFile = paste0(metaData$project[project,]$Rdirectory, '/meanCNV.Rdata')
  if ( file.exists(saveFile) & !forceRedoMean ) {
    catLog('loading cohort data...')
    load(saveFile)
    catLog('done.\n')
  }
  else {
    meanCNV = getMeanCNV(metaData, samples, variants, genome=genome, clonalityCut=clonalityCut, cpus=cpus, ignoreCNAonly=ignoreCNAonly)
    
    catLog('Saving meanCNV...')
    save(meanCNV, file=saveFile)
    catLog('done.\n')
  }

  catLog('Plotting mean CNV...')
  plotMeanCNVtoFile(metaData, project, meanCNV, cosmicDirectory=cosmicDirectory, forceRedo=forceRedoMeanPlot, genome=genome)
  catLog('mutation matrix...')
  plotMultipageMutationMatrix(metaData, meanCNV, project, cosmicDirectory=cosmicDirectory, nGenes=30, pages=10, priorCnvWeight=cnvWeight, forceRedo=forceRedoMatrixPlot, genome=genome)
  catLog('done.\n')

  variants$variants = lapply(variants$variants, function(q) q[q$severity < 11,])
  outputSomaticVariants(variants, genome=genome, plotDirectory=metaData$project[project,]$plotDirectory, cpus=cpus)
  
  subgroups = getSubgroups(metaData, project, includeNormal=includeNormal)
  catLog('Subgroups:\n')
  for ( sg in subgroups ) cat(sg, '\n')
  if ( length(subgroups) > 1 ) { 
    for ( subgroup in subgroups ) {
      catLog('Plotting mean CNV for subgroup', subgroup, '...')
      plotDirectory = paste0(metaData$project[project,]$plotDirectory, '/', subgroup)
      meanCNVs = plotSubgroupCNVtoFile(metaData, meanCNV, project, subgroup, includeNormal=includeNormal, cosmicDirectory=cosmicDirectory,
        forceRedo=forceRedoMeanPlot, genome=genome)
      plotMeanCNVtoFile(metaData, project, meanCNVs$inGroup, cosmicDirectory=cosmicDirectory, forceRedo=forceRedoMeanPlot,
                        plotFile=paste0(plotDirectory, '/meanCNVinGroup.pdf'), genome=genome)
      plotMeanCNVtoFile(metaData, project, meanCNVs$outGroup, cosmicDirectory=cosmicDirectory, forceRedo=forceRedoMeanPlot,
                        plotFile=paste0(plotDirectory, '/meanCNVoutGroup.pdf'), genome=genome)
      
      catLog('mutation matrix...')
      plotSubgroupMutationMatrix(metaData, meanCNVs, project, subgroup, cosmicDirectory=cosmicDirectory,
                                 nGenes=30, pages=10, priorCnvWeight=cnvWeight, forceRedo=forceRedoMatrixPlot)    
      plotMultipageMutationMatrix(metaData, meanCNVs$inGroup, project, cosmicDirectory=cosmicDirectory,
                                  nGenes=30, pages=10, priorCnvWeight=cnvWeight, forceRedo=forceRedoMatrixPlot,
                                  plotFile=paste0(plotDirectory, '/mutationMatrixInGroup.pdf'), genome=genome)
      plotMultipageMutationMatrix(metaData, meanCNVs$outGroup, project, cosmicDirectory=cosmicDirectory,
                                  nGenes=30, pages=10, priorCnvWeight=cnvWeight, forceRedo=forceRedoMatrixPlot,
                                  plotFile=paste0(plotDirectory, '/mutationMatrixOutGroup.pdf'), genome=genome)
      catLog('done.\n')
    }
  }
  
  catLog('Done! Invisible returning mean CNV data.\n\n')
  invisible(meanCNV)
}

#checks the mutation rates for gain, amplification, loss, complete loss, SNVs and biallelic loss
#returns matrices for genes over all samples, as well as rates in terms of fraction of individuals
#that have the gene mutated.
getMeanCNV = function(metaData, samples, variants, genome=genome, clonalityCut=0.4, cpus=1, ignoreCNAonly=F) {
  individuals = metaData$samples[samples,]$INDIVIDUAL
  uInd = unique(individuals)
  indSamples = lapply(uInd, function(i) which(individuals == i))
  cnvs = getCNV(metaData, samples)
    
  #fits = getFit(metaData, samples)
  sex = ifelse(sapply(1:length(cnvs), function(i) mean(cnvs[[i]]$clusters[xToChr(cnvs[[i]]$clusters$x1, genome)=='X',]$M)-mean(cnvs[[i]]$clusters[xToChr(cnvs[[i]]$clusters$x1, genome)=='Y',]$M)) > 3, 'female', 'male')
  if ( any(is.na(sex)) ) sex[is.na(sex)] = 'female'
  names(sex) = names(cnvs)
    
  cnvRates = getCNVrates(metaData, cnvs, sex, individuals, clonalityCut=clonalityCut, cpus=cpus, genome=genome)
    
  varSamples = names(variants$variants)
  varInd = metaData$samples[varSamples,]$INDIVIDUAL
  uVarInd = unique(varInd)
    
  snvRates = getSNVrates(metaData, variants, clonalityCut=clonalityCut, genome=genome, cpus=cpus)
  mutationRate = snvRates$mutationRate
  hitMx = snvRates$hitMx
  
  doubleLossRates = mergeDoubleLoss(snvRates, cnvRates, individuals, ignoreCNAonly=ignoreCNAonly)
  doubleLossRate = doubleLossRates$doubleLossRate
  doubleLossMx = doubleLossRates$doubleLossMx
  
  meanCNV = list(cnvs=cnvs, cnvRates=cnvRates, snvRates=snvRates, doubleLossRates=doubleLossRates, individuals=individuals)
  return(meanCNV)
}

#plots the mutation rates over the genome (CNVs and SNVs) to file.
#also outputs the data from the analysis to a spreadsheet.
plotMeanCNVtoFile = function(metaData, project, meanCNV, cosmicDirectory='', plotFile='', forceRedo=F, genome='hg19') {
  if ( plotFile == '' )
    plotFile = paste0(metaData$project[project,]$plotDirectory, '/meanCNV.pdf')
  if ( file.exists(plotFile) & !forceRedo ) return()

  pdf(plotFile, width=20, height=10)
  output = plotMeanCNV(metaData, meanCNV=meanCNV, cosmicDirectory=cosmicDirectory, add=F, genome=genome)
  dev.off()

  outputMeanCNV(metaData, project, output, outName=gsub('.pdf$', '',plotFile))
}

#plots the mutation reates over the genome.
plotMeanCNV = function(metaData, meanCNV, cosmicDirectory='', add=F, printGeneNames=T, meanCNV2=NA, genome='hg19', filterIG=T, filterTR=T, filterHLA=T, dontCountRepeatedSNVs=F, plotSex=T, plotY=T, outputData=T, plotMutations=T, reset=T, addLegend=T, chr.col=mcri('green', 0.3), label.cex=1, chr.cex=1, chr.stagger=0, chr.staggerFrom=1, maxFrequency=NULL, ...) {
  samples = names(meanCNV$cnvs)
  individuals = metaData$samples[samples,]$INDIVIDUAL
  uInd = unique(individuals)
  indSamples = lapply(uInd, function(ind) which(individuals == ind))

  cL = chrLengths(genome=genome)
  excludeChrs = c()
  if ( !plotY ) excludeChrs = c('Y', 'M')
  if ( !plotSex )  excludeChrs = c('X','Y', 'M')
  if ( length(excludeChrs) > 0 ) cL = cL[!(names(cL) %in% excludeChrs)]
  CR = meanCNV$cnvs[[1]]$CR
  
  maxCNV = max(meanCNV$cnvRates$gain, meanCNV$cnvRates$loss, meanCNV$cnvRates$loh)
  maxLOH = max(meanCNV$cnvRates$loh)
  if ( length(excludeChrs) > 0 ) {
    use = !(xToChr(CR$x1, genome=genome) %in% excludeChrs)
    maxCNV = max(meanCNV$cnvRates$gain[use], meanCNV$cnvRates$loss[use], meanCNV$cnvRates$loh[use])
    maxLOH = max(meanCNV$cnvRates$loh[use])
  }
  if ( !inherits(meanCNV2, 'logical') && !is.na(meanCNV2) ) {
    maxCNV = max(maxCNV, meanCNV2$cnvRates$gain, meanCNV2$cnvRates$loss, meanCNV2$cnvRates$loh)
    maxLOH = max(maxLOH, meanCNV2$cnvRates$loh)
  }
  
  spaceForSNVs = 0.5
  if ( !plotMutations ) spaceForSNVs = maxCNV*0.02
  spaceForLoh = pmin(maxLOH, maxCNV)/maxCNV
  
  if ( is.null(maxFrequency) ) maxFrequencyCNV = maxCNV+0.05
  else maxFrequencyCNV = maxFrequency
  indTicks = (1:length(uInd))/length(uInd)
  indTicks = indTicks[indTicks <= maxFrequencyCNV]
  heavy = rep(T, length(indTicks))
  while ( sum(heavy) > 10 ) heavy[heavy] = rep(c(F,T), length.out = sum(heavy))
  heavy[length(heavy)] = TRUE
  cnvScale = 1/max(indTicks)
  yTicks = indTicks*cnvScale

  if ( is.null(maxFrequency) ) maxFrequencyLOH = maxLOH+0.05
  else maxFrequencyLOH = maxFrequency
  indTicksLOH = (1:length(uInd))/length(uInd)
  indTicksLOH = indTicksLOH[indTicksLOH <= maxFrequencyLOH]
  heavyLOH = rep(T, length(indTicksLOH))
  while ( sum(heavyLOH) > 10*spaceForLoh ) heavyLOH[heavyLOH] = rep(c(F,T), length.out = sum(heavyLOH))
  cnvScaleLOH = 1/max(indTicksLOH)
  yTicksLOH = indTicksLOH*cnvScaleLOH
  
  yTicks = c(-spaceForLoh - spaceForSNVs - yTicks, -spaceForSNVs - yTicksLOH*spaceForLoh*0.85, spaceForSNVs + yTicks)
  indTicks = c(indTicks, indTicksLOH, indTicks)
  heavy = c(heavy, heavyLOH, heavy)
  ymax = max(yTicks)
  ymin = min(yTicks)

  if ( length(excludeChrs) > 0 ) CR = CR[!(xToChr(CR$x1, genome=genome) %in% excludeChrs),]
  xs = sort(c((CR$x1+CR$x2)/2, 0, cumsum(cL)))
  names(xs) = xs
  xsOri = c((CR$x1+CR$x2)/2)
  delimiter = xs %in% c(0, cumsum(cL))
  meanCNVori = meanCNV

  #split up delimiter points into two points, first matching previous value, second matching following.
  for ( cnv in c('gain', 'loss', 'amp', 'cl', 'loh') ) {
    vec = meanCNV$cnvRates[[cnv]]
    is = which(delimiter)
    vec[is] = vec[pmax(1,(is-1))]
    newIs = is + 0.5
    newValues = vec[pmin(length(vec), is+1)]
    vec = c(vec, newValues)[order(c(seq_along(vec), newIs))]
    meanCNV$cnvRates[[cnv]] = vec
  }
  #xs = sort(c((CR$x1+CR$x2)/2, rep(c(0, cumsum(cL)),2)))
  xs = sort(c((CR$x1+CR$x2)/2, c(0, rep(cumsum(cL),2))))
  Nx = length(xs)
  
  #set up plot
  if ( !add ) {
    if ( reset ) {
      par(oma=c(0,0,0,0))
      par(mar=c(0,0,0,0))
    }
    plot(xs, xs, type='n', xlim=c(-0.02*max(xs), max(xs)*(1 +0.2*addLegend)), ylim=c(ymin, ymax+(ymax-ymin)*0.03)*0.97,
         xaxt='n', yaxt='n', frame=F, xlab='', ylab='', ...)

    #horizontal lines for number of patients
    catLog('plotting CNVs..')
    segments(0, yTicks, max(xs), yTicks, lwd=1, col=rgb(0,0,0,0.05+heavy*0.15))
    text(-max(xs)*0.02, yTicks[heavy], round(indTicks[heavy], 2), col=rgb(0,0,0,1), cex=label.cex)
    text(-max(xs)*0.05, -spaceForLoh/2, 'frequency', srt=90, cex=label.cex)

    
    #mark chromosome boundaries
    superFreq:::addChromosomeLines(c(ymin, ymax*1.05), col=chr.col, lwd=1.5, genome=genome,
                                   onlyNumbers=!plotSex, cex=chr.cex, stagger=chr.stagger, staggerFrom=chr.staggerFrom,
                                   excludeChromosomes=excludeChrs)
  }

  #polygons for gain/amplification/loss/complete loss
  if ( !add ) {
    polygon(c(xs, rev(xs)), spaceForSNVs+c(meanCNV$cnvRates$gain[1:Nx], rep(0, length(xs)))*cnvScale,
            col=mcri('orange'), border=mcri('orange'))
    polygon(c(xs, rev(xs)), spaceForSNVs+c(meanCNV$cnvRates$amp[1:Nx], rep(0, length(xs)))*cnvScale,
            col=mcri('red'), border=mcri('red'))
    polygon(c(xs, rev(xs)), -spaceForSNVs-c(meanCNV$cnvRates$loh[1:Nx], rep(0, length(xs)))*cnvScaleLOH*spaceForLoh*0.85,
            col=mcri('green'), border=mcri('green'))
    polygon(c(xs, rev(xs)), -spaceForSNVs-spaceForLoh-c(meanCNV$cnvRates$loss[1:Nx], rep(0, length(xs)))*cnvScale,
            col=mcri('cyan'), border=mcri('cyan'))
    polygon(c(xs, rev(xs)), -spaceForSNVs-spaceForLoh-c(meanCNV$cnvRates$cl[1:Nx], rep(0, length(xs)))*cnvScale,
            col=mcri('blue'), border=mcri('blue'))
  } else {
    lines(xs, spaceForSNVs + meanCNV$cnvRates$gain[1:Nx]*cnvScale, col='white', lwd=2.5)
    lines(xs, spaceForSNVs + meanCNV$cnvRates$gain[1:Nx]*cnvScale, col=mcri('red'), lwd=1)
    lines(xs, spaceForSNVs + meanCNV$cnvRates$amp[1:Nx]*cnvScale, col='white', lwd=2.5)
    lines(xs, spaceForSNVs + meanCNV$cnvRates$amp[1:Nx]*cnvScale, col=mcri('darkred'), lwd=1)
    lines(xs, -spaceForSNVs - meanCNV$cnvRates$loh[1:Nx]*cnvScale*spaceForLoh, col='white', lwd=2.5)
    lines(xs, -spaceForSNVs - meanCNV$cnvRates$loh[1:Nx]*cnvScale*spaceForLoh, col=mcri('green'), lwd=1)
    lines(xs, -spaceForSNVs - spaceForLoh - meanCNV$cnvRates$loss[1:Nx]*cnvScale, col='white', lwd=2.5)
    lines(xs, -spaceForSNVs - spaceForLoh - meanCNV$cnvRates$loss[1:Nx]*cnvScale, col=mcri('blue'), lwd=1)
    lines(xs, -spaceForSNVs - spaceForLoh - meanCNV$cnvRates$cl[1:Nx]*cnvScale, col='white', lwd=2.5)
    lines(xs, -spaceForSNVs - spaceForLoh - meanCNV$cnvRates$cl[1:Nx]*cnvScale, col=mcri('darkblue'), lwd=1)
  }

  #legends
  if ( !add & addLegend ) {
    legend('bottomright', c('gain', 'amplification', 'CNN LOH', 'loss', 'complete loss'), lwd=rep(10, 5),
           col=mcri(c('orange', 'red', 'green', 'cyan', 'blue')), bg='white')
  }

  spaceForGenes = 0.15*spaceForSNVs

  if ( plotMutations ) {
    catLog('plotting mutations..')
    maxMut = max(c(0, meanCNV$snvRates$mutationRate))
    maxDL = max(meanCNV$doubleLossRates$doubleLossRate)
    
    maxSNV = max(maxMut, maxDL)
    if ( !inherits(meanCNV2, 'logical') && !is.na(meanCNV2) ) {
      maxSNV = max(maxSNV, meanCNV2$snvRates$mutationRate, meanCNV2$doubleLossRates$doubleLossRate)
    }
    
    indTicks = (1:length(uInd))/length(uInd)
    indTicks = indTicks[indTicks <= maxSNV]
    heavy = rep(T, length(indTicks))
    while ( sum(heavy) > 5 ) heavy[heavy] = rep(c(F,T), length.out = sum(heavy))
    snvScale = 1/max(c(0,indTicks))*(spaceForSNVs - spaceForGenes - 0.2*spaceForSNVs)
    yTicks = spaceForGenes + indTicks*snvScale
    yTicks = c(-yTicks, yTicks)
    indTicks = c(-indTicks, indTicks)
    
    if ( !add ) {
      segments(rep(0, length(yTicks)), yTicks, rep(max(xs), length(yTicks)), yTicks, lwd=1, col=mcri('cyan', 0.05+heavy*0.15))
    if ( length(indTicks[heavy]) > 0 )
      text(-max(xs)*0.02, yTicks[heavy], round(abs(indTicks[heavy]), 2), col=mcri('cyan'))
    }
    
    meanX = (CR$x1 + CR$x2)/2
    Nx = length(meanX)
    names(meanX) = rownames(meanCNV$cnvRates$gainMx)[1:Nx][!delimiter]
    SNVgenesX = meanCNV$snvRates$hitGenesX[1:Nx]
    DLgenesX = meanX[names(meanCNV$doubleLossRates$doubleLossRate)][1:Nx]
    
    if ( !printGeneNames ) {
      if ( length(SNVgenesX) > 0 )
        segments(SNVgenesX, spaceForGenes+meanCNV$snvRates$mutationRate[1:Nx]*snvScale,
                 SNVgenesX, spaceForGenes, col=mcri('cyan'), lwd=4)
      if ( length(DLgenesX) > 0 )
        segments(DLgenesX, -spaceForGenes - meanCNV$doubleLossRates$doubleLossRate[1:Nx]*snvScale,
                 DLgenesX, -spaceForGenes, col=mcri('orange'), lwd=4)
    } else {
      if ( length(SNVgenesX) > 0 )
        segments(SNVgenesX, spaceForGenes+meanCNV$snvRates$mutationRate[1:Nx]*snvScale,
                 SNVgenesX, spaceForGenes, col=mcri('blue'), lwd=1)
      if ( length(DLgenesX) > 0 )
        segments(DLgenesX, -spaceForGenes - meanCNV$doubleLossRates$doubleLossRate[1:Nx]*snvScale,
                 DLgenesX, -spaceForGenes, col=mcri('red'), lwd=1)
    }
    
    if ( printGeneNames ) {
      mutRate = meanCNV$snvRates$mutationRate[1:Nx]
      isIG = grepl('IG.[VLJC][0-9].*', names(mutRate)) | grepl('IGH[ADG][0-9].*', names(mutRate)) | grepl('LILR.*', names(mutRate))
      isTR = grepl('TR[AD][VJ][0-9].?', names(mutRate))
      isHLA = grepl('HLA-.*', names(mutRate))
      
      if ( filterIG ) mutRate = mutRate[!isIG]
      if ( filterTR ) mutRate = mutRate[!isTR]
      if ( filterHLA ) mutRate = mutRate[!isHLA]
      
      topMutGenes = names(sort(mutRate, decreasing=T)[1:min(10, length(mutRate))])
      mark = names(meanCNV$snvRates$mutationRate)[1:Nx] %in% topMutGenes
      if ( length(mark) > 0 && any(is.na(SNVgenesX[mark])) ) {
        warning('Removing genes from top 10 SNVs, as some are not found. Likely caused by unmatching gene names.')
        mark[is.na(SNVgenesX)] = F
      }
      if ( cosmicDirectory != '' )
        COSMICgenes = names(getCosmicCensusDensity(cosmicDirectory=cosmicDirectory))
      else
        COSMICgenes = c()
      printGenes = names(meanCNV$snvRates$mutationRate)[mark]
      if ( length(printGenes) > 0 ) {
        text(spreadPositions(SNVgenesX[mark], max(xs)*0.035), spaceForGenes*0.4,
             printGenes, col=ifelse(printGenes %in% COSMICgenes, mcri('green'), mcri('darkblue')),
             font=ifelse(printGenes %in% COSMICgenes, 2, 1), cex=0.6)
        segments(spreadPositions(SNVgenesX[mark], max(xs)*0.035), spaceForGenes*0.6,
                 SNVgenesX[mark], spaceForGenes, col=mcri('darkblue'))
      }
      
      dlRate = meanCNV$doubleLossRates$doubleLossRate[1:Nx]
      isIG = grepl('IG.[VLJC][0-9].*', names(dlRate)) | grepl('IGH[ADG][0-9].*', names(dlRate))
      isTR = grepl('TR[AD][VJ][0-9].?', names(dlRate))
      isHLA = grepl('HLA-.*', names(dlRate))
      
      if ( filterIG ) dlRate = dlRate[!isIG]
      if ( filterTR ) dlRate = dlRate[!isTR]
      if ( filterHLA ) dlRate = dlRate[!isHLA]
      
      topDHGenes = names(sort(dlRate, decreasing=T)[1:min(10, length(dlRate))])
      topDHGenes = topDHGenes[topDHGenes != '?']
      topDHGenes = topDHGenes[meanCNV$doubleLossRates$doubleLossRate[1:Nx][topDHGenes] > 0]
      mark = names(meanCNV$doubleLossRates$doubleLossRate)[1:Nx] %in% topDHGenes
      if ( any(is.na(DLgenesX[mark])) ) {
        warning('Removing genes from top 10 SNVs, as some are not found. Likely caused by unmatching gene names.')
        mark[is.na(DLgenesX)] = F
      }
      printGenes = names(meanCNV$doubleLossRates$doubleLossRate)[1:Nx][mark]
      if ( length(printGenes) > 0 ) {
        text(spreadPositions(DLgenesX[mark], max(xs)*0.035), -spaceForGenes*0.4,
             printGenes, col=ifelse(printGenes %in% COSMICgenes, mcri('green'), mcri('red')),
             font=ifelse(printGenes %in% COSMICgenes, 2, 1), cex=0.6)
        segments(spreadPositions(DLgenesX[mark], max(xs)*0.035), -spaceForGenes*0.6,
                 DLgenesX[mark], -spaceForGenes, col=mcri('red'))
      }
    }
    
    if ( !add ) {
      legend('right', c('changing SNV', 'biallelic loss'), col=mcri(c('darkblue', 'red')), lwd=3)
    }
    catLog('done.\n')
  }

  if ( outputData ) {
    catLog('Setting up output...')
    meanCNV = meanCNVori
    xs = sort(c((meanCNV$cnvs[[1]]$CR$x1+meanCNV$cnvs[[1]]$CR$x2)/2, 0, cumsum(chrLengths(genome=genome))))
    genes = rownames(meanCNV$cnvRates$gainMx)[!delimiter]
    allMutationRates = rep(0, length(genes))
    names(allMutationRates) = genes
    use = names(meanCNV$snvRates$mutationRate) %in% names(allMutationRates)
    allMutationRates[names(meanCNV$snvRates$mutationRate)[use]] = meanCNV$snvRates$mutationRate[use]
    allDLRates = rep(0, length(genes))
    names(allDLRates) = genes
    use = names(meanCNV$doubleLossRates$doubleLossRate) %in% names(allDLRates)
    allDLRates[names(meanCNV$doubleLossRates$doubleLossRate)[use]] = meanCNV$doubleLossRates$doubleLossRate[use]
    
    overview = data.frame(chr=xToChr(xs[!delimiter], genome), pos0=round(xToPos(xs[!delimiter], genome)), gene=genes,
      gainRate=meanCNV$cnvRates$gain[!delimiter], ampRate=meanCNV$cnvRates$amp[!delimiter],
      lossRate=meanCNV$cnvRates$loss[!delimiter], clRate=meanCNV$cnvRates$cl[!delimiter],
      lohRate=meanCNV$cnvRates$loh[!delimiter], mutationRate=allMutationRates, doubleLossRate=allDLRates)
    
    gains = data.frame(chr=xToChr(xs[!delimiter], genome), pos=xToPos(xs[!delimiter], genome), gene=genes, meanCNV$cnvRates$gainMx[!delimiter,])
    losss = data.frame(chr=xToChr(xs[!delimiter], genome), pos=xToPos(xs[!delimiter], genome), gene=genes, meanCNV$cnvRates$lossMx[!delimiter,])
    amps = data.frame(chr=xToChr(xs[!delimiter], genome), pos=xToPos(xs[!delimiter], genome), gene=genes, meanCNV$cnvRates$ampMx[!delimiter,])
    cls = data.frame(chr=xToChr(xs[!delimiter], genome), pos=xToPos(xs[!delimiter], genome), gene=genes, meanCNV$cnvRates$clMx[!delimiter,])
    lohs = data.frame(chr=xToChr(xs[!delimiter], genome), pos=xToPos(xs[!delimiter], genome), gene=genes, meanCNV$cnvRates$lohMx[!delimiter,])
  mutations = data.frame(gene=rownames(meanCNV$snvRates$hitMx), meanCNV$snvRates$hitMx)
    DLuse = meanCNV$doubleLossRates$doubleLossRate > 0 & !duplicated(names(meanCNV$doubleLossRates$doubleLossRate))
    DLs = data.frame(gene=names(meanCNV$doubleLossRates$doubleLossRate[DLuse]),
      meanCNV$doubleLossRates$doubleLossMx[DLuse,])
    
    output = list(overview=overview, gain=gains, loss=losss, amp=amps, lohs=lohs, completeLoss=cls, mutations=mutations, doubleLoss=DLs)
    catLog('done.\n')
    invisible(output)
  }
}


outputMeanCNV = function(metaData, project, output, subgroup='', inGroup=T, outName='') {
  plotDirectory = metaData$project[project,]$plotDirectory
  if ( subgroup != '' ) plotDirectory = paste0(metaData$project[project,]$plotDirectory, '/', subgroup)
  ensureDirectoryExists(plotDirectory)

  catLog('printing to csv...')
  csvFile = paste0(plotDirectory, '/meanCNV.csv')
  if ( !inGroup ) csvFile = paste0(plotDirectory, '/meanCNVnotInGroup.csv')
  if ( outName != '' ) csvFile = paste0(outName, '.csv')
  write.csv(output$overview, csvFile)
  output = lapply(output, function(dat) dat = dat[1:min(65000, nrow(dat)), 1:min(255, ncol(dat))])
  catLog('printing to xls...')
  xlsFile = paste0(plotDirectory, '/meanCNV.xls')
  if ( !inGroup ) xlsFile = paste0(plotDirectory, '/meanCNVnotInGroup.xls')
  if ( outName != '' ) xlsFile = paste0(outName, '.xls')
  WriteXLS('output', xlsFile)
    
  catLog('done.\n')
}


getCNVrates = function(metaData, cnvs, sex, individuals, clonalityCut=0.4, cpus=1, genome='hg19') {
  uInd = unique(individuals)
  indSamples = lapply(uInd, function(i) which(individuals == i))
  
  xs = sort(c((cnvs[[1]]$CR$x1+cnvs[[1]]$CR$x2)/2, 0, cumsum(chrLengths(genome=genome))))
  names(xs) = xs
  chr = xToChr(xs, genome)

  catLog('Calculating gain..')
  gainList = mclapply(cnvs, function(cnv) sapply(xs, function(x) {
    i = which(cnv$clusters$x1 < x & cnv$clusters$x2 > x)[1]
    return(((cnv$cluster[i,]$call %in% c('AAB', 'AAA') & cnv$cluster[i,]$clonality > clonalityCut) | cnv$cluster[i,]$M > log2(1 + clonalityCut/2))[1])
    }), mc.cores=cpus)
  gainList = lapply(gainList, function(gain) ifelse(is.na(gain), F, gain))
  gainMx = do.call(cbind, gainList)
  indGain = apply(gainMx, 1, FUN=function(hasGain) sum(sapply(indSamples, function(is) any(hasGain[is]))))
  gain = indGain/length(uInd)

  catLog('loss..')
  loss = mclapply(names(cnvs), function(name) sapply(xs, function(x) {
    cnv = cnvs[[name]]
    i = which(cnv$clusters$x1 < x & cnv$clusters$x2 > x)[1]
    return(((cnv$cluster[i,]$call %in% c('A', 'CL') & cnv$cluster[i,]$clonality > clonalityCut) | cnv$cluster[i,]$M < log2(1 - clonalityCut/2))[1] & !((xToChr(x, genome) %in% c('X','Y') & sex[name] == 'male') | xToChr(x, genome) == 'Y'))
  }), mc.cores=cpus)
  lossList = lapply(loss, function(loss) ifelse(is.na(loss), F, loss))
  lossMx = do.call(cbind, lossList)
  indLoss = apply(lossMx, 1, FUN=function(hasLoss) sum(sapply(indSamples, function(is) any(hasLoss[is]))))
  loss = indLoss/length(uInd)
  
  catLog('amplification..')
  amp = mclapply(cnvs, function(cnv) sapply(xs, function(x) {
    i = which(cnv$clusters$x1 < x & cnv$clusters$x2 > x)[1]
    return(((cnv$cluster[i,]$call %in% c('AAAB', 'AAAAB', 'AAAAAB', 'AAAAAAB') & cnv$cluster[i,]$clonality > clonalityCut) | cnv$cluster[i,]$M > log2(1.6))[1])
    }), mc.cores=cpus)
  ampList = lapply(amp, function(amp) ifelse(is.na(amp), F, amp))
  ampMx = do.call(cbind, ampList)
  indAmp = apply(ampMx, 1, FUN=function(hasAmp) sum(sapply(indSamples, function(is) any(hasAmp[is]))))
  amp = indAmp/length(uInd)
  
  catLog('complete loss..')
  cl = mclapply(names(cnvs), function(name) sapply(xs, function(x) {
    cnv = cnvs[[name]]
    i = which(cnv$clusters$x1 < x & cnv$clusters$x2 > x)[1]
    return(((cnv$cluster[i,]$call == 'CL' & cnv$cluster[i,]$clonality > clonalityCut) | cnv$cluster[i,]$M < log2(0.4))[1] & !((xToChr(x, genome) == 'X' & sex[name] == 'male') | xToChr(x, genome) == 'Y'))
  }), mc.cores=cpus)
  clList = lapply(cl, function(cl) ifelse(is.na(cl), F, cl))
  clMx = do.call(cbind, clList)
  indCl = apply(clMx, 1, FUN=function(hasCl) sum(sapply(indSamples, function(is) any(hasCl[is]))))
  cl = indCl/length(uInd)

  catLog('CNN LOH..')
  loh = mclapply(names(cnvs), function(name) sapply(xs, function(x) {
    cnv = cnvs[[name]]
    i = which(cnv$clusters$x1 < x & cnv$clusters$x2 > x)[1]
    return(((cnv$cluster[i,]$call == 'AA' & cnv$cluster[i,]$clonality > clonalityCut) | (abs(cnv$cluster[i,]$M) < 0.1 & cnv$cluster[i,]$f < 0.5 - clonalityCut/2))[1] & !((xToChr(x, genome) == 'X' & sex[name] == 'male') | xToChr(x, genome) == 'Y'))
  }), mc.cores=cpus)
  lohList = lapply(loh, function(loh) ifelse(is.na(loh), F, loh))
  lohMx = do.call(cbind, lohList)
  indLoh = apply(lohMx, 1, FUN=function(hasLoh) sum(sapply(indSamples, function(is) any(hasLoh[is]))))
  loh = indLoh/length(uInd)

  
  genes = rownames(cnvs[[1]]$CR)
  delimiters = c(0, cumsum(chrLengths(genome=genome)))
  rownames = c(genes, delimiters)[order(c(cnvs[[1]]$CR$x1, delimiters))]
  colnames(gainMx) = colnames(ampMx) = colnames(lossMx) = colnames(clMx) = colnames(lohMx) = names(cnvs)
  rownames(gainMx) = rownames(ampMx) = rownames(lossMx) = rownames(clMx) = rownames(lohMx) = rownames

  return(list(gain=gain, amp=amp, loss=loss, cl=cl, loh=loh, gainMx=gainMx, ampMx=ampMx, lossMx=lossMx, clMx=clMx, lohMx=lohMx))
}


getSNVrates = function(metaData, variants, clonalityCut=0.4, genome='hg19', cpus=1) {
  variants = cleanVariants(variants)
  SNPs = variants$SNPs
  samples = names(variants$variants)
  individuals = metaData$samples[samples,]$INDIVIDUAL
  uInd = unique(individuals)
  indSamples = lapply(uInd, function(i) which(individuals == i))

  catLog('somatic SNVs')
  hitX = mclapply(variants$variants, function(q) {
    catLog('.')
    q = q[q$somaticP > 0.1 & q$var > q$cov*clonalityCut/2 & q$severity <= 11,]
    return(list('x'=q$x, 'rowname'=paste0(q$x, q$variant), 'sev'=q$severity, 'genes'=q$consensusGene))
  }, mc.cores=cpus)
  hitInfo = data.frame(x=unlist(lapply(hitX, function(hit) hit$x)), rowname=unlist(lapply(hitX, function(hit) hit$rowname)), severity=unlist(lapply(hitX, function(hit) hit$sev)), genes=unlist(lapply(hitX, function(hit) hit$genes)), stringsAsFactors=F)
  hitInfo = hitInfo[!duplicated(hitInfo$rowname),]
  xMx = do.call(cbind, lapply(hitX, function(hit) hitInfo$rowname %in% hit$rowname))
  rownames(xMx) =rownames(hitInfo) = hitInfo$rowname
  xRate = rowsums(xMx)
  
  catLog('mutated genes..')
  hitGenes = mclapply(variants$variants, function(q) {
    catLog('.')
    q = q[q$somaticP > 0.1 & q$var > q$cov*clonalityCut/2 & q$severity <= 11,]
    if ( nrow(q) == 0 ) return(c())
    return(q$consensusGene)
  }, mc.cores=cpus)
  catLog('\n')
  mutatedGenes = unique(unlist(lapply(hitGenes, unique)))
  mutatedGenes = mutatedGenes[mutatedGenes != '?']
  hitMx = do.call(cbind, lapply(hitGenes, function(genes) mutatedGenes %in% genes))
  rownames(hitMx) = mutatedGenes
  indHit = apply(hitMx, 1, FUN=function(hasHit) sum(sapply(indSamples, function(is) any(hasHit[is]))))
  mutationRate = indHit/length(uInd)

  doubleHitGenes = unique(unlist(lapply(hitGenes, function(genes) genes[duplicated(genes)])))
  doubleHitGenes = doubleHitGenes[doubleHitGenes != '?']
  doubleHitMx = do.call(cbind, lapply(hitGenes, function(genes) doubleHitGenes %in% genes[duplicated(genes)]))
  rownames(doubleHitMx) = doubleHitGenes
  indDoubleHit = apply(doubleHitMx, 1, FUN=function(hasHit) sum(sapply(indSamples, function(is) any(hasHit[is]))))
  doubleMutationRate = indDoubleHit/length(uInd)

  #we use consensusGene for variants, but inGene for CNAs, might create conflicts when merged...
  hitSNPs = SNPs[SNPs$consensusGene %in% mutatedGenes,]
  hitGenesX = unlist(mclapply(mutatedGenes, function(gene) mean(hitSNPs[hitSNPs$consensusGene == gene,]$x), mc.cores=cpus))

  return(list(hitGenesX=hitGenesX, mutationRate=mutationRate, hitMx=hitMx, xRate=xRate, xMx=xMx, hitInfo=hitInfo,
              doubleHitMx=doubleHitMx, doubleMutationRate=doubleMutationRate))
}


mergeDoubleLoss = function(snvRates, cnvRates, individuals, ignoreCNAonly=F) {
  lossMx = cnvRates$lossMx
  clMx = cnvRates$clMx
  lohMx = cnvRates$lohMx
  hitMx = snvRates$hitMx
  doubleHitMx = snvRates$doubleHitMx

  if ( !setequal(colnames(lossMx), colnames(hitMx)) ) {
    warning('Couldnt do merged snv hits for the double loss analysis, as snv and cnv samples didnt match.')
    print(colnames(lossMx))
    print(colnames(hitMx))
    return(list('doubleLossMx'=clMx, 'doubleLossRate'=cnvRates$cl))
  }

  if ( !ignoreCNAonly ) lossMx = lossMx + clMx
  hitUse = rownames(hitMx) %in% rownames(lossMx)
  lossMx[rownames(hitMx[hitUse,]),] = lossMx[rownames(hitMx[hitUse,]),] + hitMx[hitUse,colnames(lossMx)]
  lossMx[rownames(hitMx[hitUse,]),] = lossMx[rownames(hitMx[hitUse,]),] + hitMx[hitUse,colnames(lossMx)]*lohMx[rownames(hitMx[hitUse,]),]
  doubleHitUse = rownames(doubleHitMx) %in% rownames(lossMx)
  lossMx[rownames(doubleHitMx[doubleHitUse,]),] = lossMx[rownames(doubleHitMx[doubleHitUse,]),] + doubleHitMx[doubleHitUse,colnames(doubleHitMx)]

  doubleLossMx = lossMx > 1
  rownames(doubleLossMx) = rownames(lossMx)

  uInd = unique(individuals)
  indSamples = lapply(uInd, function(i) which(individuals == i))
  indDoubleLoss = apply(doubleLossMx, 1, FUN=function(hasDL) sum(sapply(indSamples, function(is) any(hasDL[is]))))
  doubleLossRate = indDoubleLoss/length(uInd)
  names(doubleLossRate) = rownames(lossMx)

  return(list('doubleLossMx'=doubleLossMx, 'doubleLossRate'=doubleLossRate))
}


plotMultipageMutationMatrix =
  function(metaData, meanCNV, project, meanCNVs=NA, cosmicDirectory='',
           priorCnvWeight=1, nGenes=30, dontCountRepeatedSNVs=F,
           pages=1, forceRedo=F, plotFile='', cancerGenes=NA, genome='hg19') {
  if ( plotFile == '' )
    plotFile = paste0(metaData$project[project,]$plotDirectory, '/mutationMatrix.pdf')
  if ( file.exists(plotFile) & !forceRedo ) return()

  pdf(plotFile, width=20, height=10)
  for ( page in 1:pages ) {
    plotMutationMatrix(metaData, meanCNV, cosmicDirectory=cosmicDirectory, priorCnvWeight=priorCnvWeight, dontCountRepeatedSNVs=dontCountRepeatedSNVs,
                       nGenes=nGenes, skipFirst=(page-1)*nGenes, forceRedo=forceRedo, cancerGenes=cancerGenes, genome=genome)
  }
  dev.off()
  
}

cohortGeneScore = function(metaData, meanCNV, priorCnvWeight=1, filterIG=T, filterTR=T, filterHLA=T, genome='hg19', dontCountRepeatedSNVs=F) {
    
  xs = sort(c((meanCNV$cnvs[[1]]$CR$x1+meanCNV$cnvs[[1]]$CR$x2)/2, 0, cumsum(chrLengths(genome=genome))))
  delimiter = xs %in% c(0, cumsum(chrLengths(genome=genome)))
  
  samples = colnames(meanCNV$cnvRates$gainMx)
  individuals = sampleToIndividual(metaData, metaData$samples$NAME)
  normals = metaData$samples$NORMAL
  names(individuals) = names(normals) = metaData$samples$NAME
  hasMatchedNormal = !is.na(findCorrespondingNormal(metaData$samples$NAME, individuals, normals))[samples]
    
  weightedxMx = t((1+hasMatchedNormal)*t(meanCNV$snvRates$xMx))
  if ( dontCountRepeatedSNVs ) {
    weightedxMx = t(apply(weightedxMx, 1, function(row) {
      row[1:length(row) != which(row == max(row))[1]] = 0
      return(row)
      }))
  }
  individuals = sampleToIndividual(metaData, samples)
  uInd = unique(individuals)
  indSamples = lapply(uInd, function(i) which(individuals == i))
  snvGenes = unique(meanCNV$snvRates$hitInfo$genes)
  if ( length(snvGenes) == 0 ) return(c())
  geneMx = lapply(snvGenes, function(gene) weightedxMx[meanCNV$snvRates$hitInfo$genes==gene,,drop=F])
  indMut = sapply(geneMx, function(mx) sum(sapply(indSamples, function(is) {
    indMx = mx[,is,drop=F]
    return(min(2,sum(rowSums(indMx) > 0)))
  })))
  weightedMutationRate = indMut/length(uInd)
  names(weightedMutationRate) = snvGenes


  #weightedHitMx = t((1+hasMatchedNormal)*t(meanCNV$snvRates$hitMx))
  #individuals = sampleToIndividual(metaData, samples)
  #uInd = unique(individuals)
  #indSamples = lapply(uInd, function(i) which(individuals == i))
  #indMut = apply(weightedHitMx, 1, FUN=function(weight) sum(sapply(indSamples, function(is) max(weight[is]))))
  #weightedMutationRate = indMut/length(uInd)

  if ( dontCountRepeatedSNVs ) {
    names(geneMx) = snvGenes
    for ( gene in snvGenes ) {
      if ( !(gene %in% rownames(meanCNV$cnvRates$lossMx)) ) next
      lostOne = meanCNV$cnvRates$lossMx[gene,]
      notSNV = colSums(geneMx[[gene]]) == 0
      dl = meanCNV$doubleLossRates$doubleLossMx[gene,]
      fp = lostOne & notSNV & dl
      if ( any(fp) ) {
        meanCNV$doubleLossRates$doubleLossMx[gene,fp] = FALSE
        meanCNV$doubleLossRates$doubleLossRate[gene] =
          sum(meanCNV$doubleLossRates$doubleLossMx[gene,])/ncol(meanCNV$doubleLossRates$doubleLossMx)
      }
    }
  }

  genes = rownames(meanCNV$cnvRates$gainMx)[!delimiter]
  allMutationRates = rep(0, length(genes))
  names(allMutationRates) = genes
  use = names(weightedMutationRate) %in% names(allMutationRates)
  allMutationRates[names(weightedMutationRate)[use]] = weightedMutationRate[use]
  allDLRates = rep(0, length(genes))
  names(allDLRates) = genes
  use = names(meanCNV$doubleLossRates$doubleLossRate) %in% names(allDLRates)
  allDLRates[names(meanCNV$doubleLossRates$doubleLossRate)[use]] = meanCNV$doubleLossRates$doubleLossRate[use]

  gain = meanCNV$cnvRates$gain[!delimiter]
  amp = meanCNV$cnvRates$amp[!delimiter]
  loss = meanCNV$cnvRates$loss[!delimiter]
  cl = meanCNV$cnvRates$cl[!delimiter]
  loh = meanCNV$cnvRates$loh[!delimiter]

  mutation = allMutationRates
  
  doubleLoss = allDLRates
  names(gain) = names(amp) = names(loss) = names(cl) = names(loh) = names(mutation) = names(doubleLoss) = genes

  isIG = grepl('IG.[VLJC][0-9].*', genes) | grepl('IGH[ADG][0-9].*', genes) | grepl('LILR.*', genes)
  isTR = grepl('TR[AD][VJ][0-9].?', genes)
  isHLA = grepl('HLA-.*', genes)

  cnvScore = gain*0.5 + loss*0.5 + amp + cl + loh*0.5
  cnvWeight = sqrt(mean(mutation)/mean(cnvScore))*priorCnvWeight
  score = cnvScore*cnvWeight + mutation + (doubleLoss - cl + cl*cnvWeight)*0.5 #DL downweighted, as it is already counted as loss/cl/mutation
  if ( filterIG ) score = score*(1-isIG)
  if ( filterTR ) score = score*(1-isTR)
  if ( filterHLA ) score = score*(1-isHLA)

  return(score)
}


#This function takes the output from meanCNV, selects the most interesting genes
#and plots the mutation and CNV status of those genes for all samples.
plotMutationMatrix = function(metaData, meanCNV, cosmicDirectory='', priorCnvWeight=1, nGenes=30, filterIG=T, dontCountRepeatedSNVs=F,
                              skipFirst=0, forceRedo=F, meanCNV2=NA, add=F, cancerGenes=NA, genome='hg19') {
  if ( is.na(cancerGenes[1]) ) {
    score = cohortGeneScore(metaData, meanCNV, priorCnvWeight=priorCnvWeight, filterIG=filterIG, dontCountRepeatedSNVs=dontCountRepeatedSNVs)
    score[names(score)=='?'] = 0
    if ( !inherits(meanCNV2, 'logical') && !is.na(meanCNV2) ) {
      score2 = cohortGeneScore(metaData, meanCNV2, priorCnvWeight=priorCnvWeight, filterIG=filterIG, genome=genome, dontCountRepeatedSNVs=dontCountRepeatedSNVs)
      score = abs(score-score2)
    }
    cancerGenes = names(sort(score, decreasing=T)[skipFirst + 1:nGenes])
  }
  else {
    cancerGenes = cancerGenes[cancerGenes %in% rownames(meanCNV$cnvRates$gainMx)][skipFirst + 1:nGenes]
  }
  cancerGenes = cancerGenes[!is.na(cancerGenes)]
  if ( length(cancerGenes) == 0 ) return()

  xs = sort(c((meanCNV$cnvs[[1]]$CR$x1+meanCNV$cnvs[[1]]$CR$x2)/2, 0, cumsum(chrLengths(genome=genome))))
  delimiter = xs %in% c(0, cumsum(chrLengths(genome=genome)))

  gains = meanCNV$cnvRates$gainMx[!delimiter,,drop=F]
  losss = meanCNV$cnvRates$lossMx[!delimiter,,drop=F]
  amps = meanCNV$cnvRates$ampMx[!delimiter,,drop=F]
  cls = meanCNV$cnvRates$clMx[!delimiter,,drop=F]
  lohs = meanCNV$cnvRates$lohMx[!delimiter,,drop=F]
  mutations = gains*0
  doubleSNV = gains*0
  use = rownames(meanCNV$snvRates$hitMx) %in% rownames(gains)
  mutations[rownames(meanCNV$snvRates$hitMx)[use],] = meanCNV$snvRates$hitMx[use,,drop=F]
  use = rownames(meanCNV$snvRates$doubleHitMx) %in% rownames(gains)
  doubleSNV[rownames(meanCNV$snvRates$doubleHitMx)[use],] = meanCNV$snvRates$doubleHitMx[use,,drop=F]
  DLs = meanCNV$doubleLossRates$doubleLossMx

  samples = colnames(gains)
  samples = samples[order(metaData$samples[samples,]$INDIVIDUAL)]
  individuals = metaData$samples[samples,]$INDIVIDUAL
  gains = gains[cancerGenes,samples,drop=F]
  losss = losss[cancerGenes,samples,drop=F]
  amps = amps[cancerGenes,samples,drop=F]
  cls = cls[cancerGenes,samples,drop=F]
  lohs = lohs[cancerGenes,samples,drop=F]
  mutations = mutations[cancerGenes,samples,drop=F]
  doubleSNV = doubleSNV[cancerGenes,samples,drop=F]
  DLs = DLs[cancerGenes,samples,drop=F]

  bgCol = rgb(0.9, 0.9, 0.9)
  cnvsToCols = function(gains, amps, losss, cls, lohs) {
    is = 1:(nrow(gains)*ncol(gains))
    cols = sapply(is, function(i) cnvToCol(gains[i], amps[i], losss[i], cls[i], lohs[i]))
    return(matrix(cols, nrow=nrow(gains)))
  }
  cnvToCol = function(gain, amp, loss, cl, loh) {
    if ( cl ) return(mcri('blue'))
    if ( loss ) return(mcri('cyan'))
    if ( loh ) return(mcri('green'))
    if ( amp ) return(mcri('darkred'))
    if ( gain ) return(mcri('red'))
    return(bgCol)
  }

  cnvSize = 1
    
  x = 1:ncol(gains) + sapply(individuals, function(ind) which(unique(individuals) == ind))/3
  names(x) = samples
  y = nrow(gains):1
  names(y) = cancerGenes

  
  xmax = max(x+0.5)
  ymax = max(y+0.5)

  if ( !inherits(meanCNV2, 'logical') && !is.na(meanCNV2) ) {
    individuals2 = sampleToIndividual(metaData, names(meanCNV2$cnvs))
    maxX2 = ncol(meanCNV2$cnvRates$gainMx) + length(unique(individuals2))/3
    if ( add ) x = x + maxX2 + 1
    else xmax = xmax + maxX2 + 1
  }
  
  squareX = sapply(x, function(X) rep(X, length(y)))
  squareY = t(sapply(y, function(Y) rep(Y, length(x))))
  indX = sapply(unique(individuals), function(ind) mean(x[ind == individuals]))
  cnvCol = cnvsToCols(gains, amps, losss, cls, lohs)
  frameCol = matrix(ifelse(DLs, mcri('red'), rgb(0,0,0,0)), nrow=nrow(gains))
  snvCol = matrix(ifelse(mutations, mcri('red'), bgCol), nrow=nrow(gains))
  doubleSnvCol = matrix(ifelse(doubleSNV, mcri('white'), snvCol), nrow=nrow(gains))

  if ( !add ) {
    resetMargins()
    par(mar=c(0,0,0,0))
    plot(0, type='n', xlim=c(-xmax*0.02, xmax*1.2), ylim=c(-0.05*ymax,ymax*1.05), frame.plot=F, xaxt='n', yaxt='n')
  }
  rect(squareX-0.5, squareY-0.5, squareX+0.5, squareY+0.5, col=frameCol, border=F)
  rect(squareX-0.45, squareY-0.45, squareX+0.45, squareY+0.45, border=F, col=bgCol)
  if ( add ) segments(min(x)-1.25, 0.5, min(x)-1.25, ymax, lwd=5, col='black')

  boxScale = pmin(1, sqrt(20/length(indX)))
  indScale = pmin(1, sqrt(20/length(indX)))

  rect(squareX+0.25, squareY-0.25*cnvSize, squareX+0.25+0.15*cnvSize, squareY+0.25*cnvSize, col=cnvCol, border=F)
  points(squareX-0.15, squareY+0.2, pch=16, col=snvCol, cex=1.5*boxScale)
  points(squareX-0.15, squareY+0.2, pch=16, col=doubleSnvCol, cex=0.9*boxScale)

  if ( cosmicDirectory != '' ) {
    COSMICgenes = names(getCosmicCensusDensity(cosmicDirectory=cosmicDirectory))
    if ( file.exists(paste0(cosmicDirectory, '/CCGD_export.csv')) ) {
      CCGDdata = read.table(paste0(cosmicDirectory, '/CCGD_export.csv'), sep=',', header=T, stringsAsFactors=F)
      censusGenes = unique(CCGDdata$Mouse.Symbol)
      COSMICgenes = c(COSMICgenes, censusGenes)
    }
  }
  else
    COSMICgenes = c()
  if ( !add ) text(0.3, y, cancerGenes, adj=c(1, 0.5), srt=0, cex=1,
                   col=ifelse(cancerGenes %in% COSMICgenes, mcri('green'), 'black'),
                   font=ifelse(cancerGenes %in% COSMICgenes, 2, 1))
  text(indX, ymax, unique(individuals), srt=30, adj=c(0, 0), cex=indScale)
  text(x, min(y)-0.5, metaData$samples[samples,]$TIMEPOINT, srt=-30, adj=c(0, 1), cex=boxScale)

  legend('bottomright', c('SNV', '2 SNVs', 'complete loss', 'loss', 'loh', 'gain', 'amp', 'biallelic loss'), col=mcri(c('red', 'red', 'blue', 'cyan', 'green', 'red', 'darkred', 'red')), pch=c(16, 1, 15, 15, 15, 15, 15, 22), pt.cex=c(1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 2), pt.lwd=c(1,5,1,1,1,1,3), pt.bg=mcri(c('red', 'white', 'blue', 'cyan', 'green', 'red', 'darkred', 'white')), bg='white')
 
}


#takes a meanCNV object of a project and splits it up into two meanCNV objects (returned in a list)
#one containing the samples in the provided subgroup. the other containing the samples not in the subgroup
splitMeanCNV = function(metaData, meanCNV, project, subgroup, includeNormal=F) {
  samples = names(meanCNV$cnvs)
  groupSamples = inSubgroup(metaData, project, subgroup, includeNormal=includeNormal, onlyDNA=T)
  groupSamples = samples[samples %in% groupSamples]
  otherSamples = samples[!(samples %in% groupSamples)]
  meanCNVinSubgroup = meanCNVoutsideSubgroup = meanCNV
  
  individualsIn = sampleToIndividual(metaData, groupSamples)
  meanCNVinSubgroup$individuals = individualsIn
  uIndIn = unique(individualsIn)
  indSamplesIn = lapply(uIndIn, function(i) which(individualsIn == i))
  individualsOut = sampleToIndividual(metaData, otherSamples)
  uIndOut = unique(individualsOut)
  indSamplesOut = lapply(uIndOut, function(i) which(individualsOut == i))
  meanCNVoutsideSubgroup$individuals = individualsOut

  meanCNVinSubgroup$cnvs = meanCNVinSubgroup$cnvs[groupSamples]
  meanCNVoutsideSubgroup$cnvs = meanCNVoutsideSubgroup$cnvs[otherSamples]
  entries =
    list(list(data='cnvRates', rate='gain', matrix='gainMx'),
         list(data='cnvRates', rate='amp', matrix='ampMx'),
         list(data='cnvRates', rate='loss', matrix='lossMx'),
         list(data='cnvRates', rate='cl', matrix='clMx'),
         list(data='cnvRates', rate='loh', matrix='lohMx'),
         list(data='snvRates', rate='mutationRate', matrix='hitMx'),
         list(data='snvRates', rate='xRate', matrix='xMx'),
         list(data='snvRates', rate='doubleMutationRate', matrix='doubleHitMx'),
         list(data='doubleLossRates', rate='doubleLossRate', matrix='doubleLossMx'))
  for ( entry in entries ) {
    dat = entry$data
    mx = entry$matrix
    meanCNVinSubgroup[[dat]][[mx]] = meanCNVinSubgroup[[dat]][[mx]][,groupSamples,drop=F]
    meanCNVoutsideSubgroup[[dat]][[mx]] = meanCNVoutsideSubgroup[[dat]][[mx]][,otherSamples,drop=F]
    
    individualsHitIn = apply(meanCNVinSubgroup[[dat]][[mx]], 1,
                             function(hasHit) sum(sapply(indSamplesIn, function(is) any(hasHit[is]))))
    individualsHitOut = apply(meanCNVoutsideSubgroup[[dat]][[mx]], 1,
                             function(hasHit) sum(sapply(indSamplesOut, function(is) any(hasHit[is]))))
    meanCNVinSubgroup[[dat]][[entry$rate]] = individualsHitIn/length(uIndIn)
    meanCNVoutsideSubgroup[[dat]][[entry$rate]] = individualsHitOut/length(uIndOut)
  }

  return(list('inGroup'=meanCNVinSubgroup, 'outGroup'=meanCNVoutsideSubgroup))
}


#plots the mutation rates over the genome (CNVs and SNVs) to file.
#also outputs the data from the analysis to a spreadsheet.
#this version divides the samples of the project by the provided subgroup
#and highlights differences.
plotSubgroupCNVtoFile = function(metaData, meanCNV, project, subgroup, includeNormal=F, cosmicDirectory='', add=F, forceRedo=F, genome='hg19') {
  plotDirectory = paste0(metaData$project[project,]$plotDirectory, '/', subgroup)
  ensureDirectoryExists(plotDirectory)
  plotFile = paste0(plotDirectory, '/meanCNVs.pdf')

  catLog('Splitting stats by subgroup...')
  meanCNVs = splitMeanCNV(metaData, meanCNV, project, subgroup, includeNormal)
  catLog('done.\n')

  if ( file.exists(plotFile) & !forceRedo ) return(meanCNVs)

  pdf(plotFile, width=20, height=10)
  outputOut = plotMeanCNV(metaData, meanCNV=meanCNVs$outGroup, cosmicDirectory=cosmicDirectory,
                          add=F, printGeneNames=F, meanCNV2=meanCNVs$inGroup, genome=genome)
  outputIn = plotMeanCNV(metaData, meanCNV=meanCNVs$inGroup, cosmicDirectory=cosmicDirectory,
                         add=T, printGeneNames=T, genome=genome)
  legend('topright', c(subgroup, subgroup, subgroup, paste0('not ', subgroup), paste0('not ', subgroup), paste0('not ', subgroup)), col=mcri(c('blue', 'red', 'green', 'cyan', 'orange', 'green')), lwd=c(2, 2, 2, 10, 10, 10), bg='white')
  dev.off()

  outputMeanCNV(metaData, project, outputIn, subgroup=subgroup, inGroup=T)
  outputMeanCNV(metaData, project, outputOut, subgroup=subgroup, inGroup=F)
  return(meanCNVs)
}


plotSubgroupMutationMatrix = function(metaData, meanCNVs, project, subgroup, cosmicDirectory='', priorCnvWeight=1,
                                      nGenes=30, pages=10, forceRedo=F, genome='hg19') {
  plotDirectory = paste0(metaData$project[project,]$plotDirectory, '/', subgroup)
  ensureDirectoryExists(plotDirectory)
  plotFile = paste0(plotDirectory, '/mutationMatrix.pdf')
  if ( file.exists(plotFile) & !forceRedo ) return()

  pdf(plotFile, width=20, height=10)
  for ( page in 1:pages ) {
    plotMutationMatrix(metaData, meanCNVs$inGroup, cosmicDirectory=cosmicDirectory, priorCnvWeight=priorCnvWeight,
                       nGenes=nGenes, skipFirst=(page-1)*nGenes, meanCNV2=meanCNVs$outGroup, genome=genome)
    plotMutationMatrix(metaData, meanCNVs$outGroup, cosmicDirectory=cosmicDirectory, priorCnvWeight=priorCnvWeight,
                       nGenes=nGenes, skipFirst=(page-1)*nGenes, meanCNV2=meanCNVs$inGroup, add=T, genome=genome)
  }
  dev.off()
  catLog('done.\n')
}


#' Compares sets of individuals for reccuring mutations
#'
#' @param metaDataFile character: Path to the metaData.
#' @param outputDirectories A named list of output directories, containing the entry Rdirectory and plotDirectory where the saved data and plots will be stored respectively.
#' @param project character: The project containing the subgroups.
#' @param subgroup1 character: The first subgroup(s).
#' @param subgroup2 character: The second subgroup(s).
#' @param name character: The name of the comparison. This names the output directory.
#' @param clonalityCut numeric: the minimum required clonality to be included in the analysis. Deafult 0.4.
#' @param cosmicDirectory character: The directory with the COSMIC data.
#' @param cpus integer: The maximum number of parallel processes.
#' @param forceRedoMean boolean: Force redo the mean CNAs ans SNVs rates over individuals. Default FALSE.
#' @param forceRedoMatrixPlot boolean: Force redo the hit matrix plot. Default FALSE.
#' @param forceRedoMeanPlot boolean: Force redo the mean CNA plot. Default FALSE.
#' @param genome character: the genome being studied. Default "hg19".
#'
#' @details This function runs a cohort analysis, comparing two subgroups (within a project) to each other. See cohortAnalyseBatch for details.
#'
#'
compareGroups = function(metaDataFile, outputDirectories, project, subgroups1, subgroups2, name, includeNormal=F, clonalityCut=0.4, excludeSamples=c(), excludeIndividuals=c(), cosmicDirectory='', analysisName='cohortAnalysis', cpus=1, forceRedoVariants=F, forceRedoMean=F,
  forceRedoMeanPlot=F, forceRedoMatrixPlot=F, genome='hg19', ignoreCNAonly=F, cnvWeight=1) {

  metaData =
    makeMetaDataFromBatch(metaDataFile, outputDirectories, analysisName=analysisName,
                          excludeSamples=excludeSamples, excludeIndividuals=excludeIndividuals)
  createDirectories(metaData)
  linkBams(metaData)
  bringAnnotation(metaData, genome)
  
  samples = inProject(metaData, project, includeNormal=F, onlyDNA=T)
  individuals = metaData$samples[samples,]$INDIVIDUAL

  saveFile = paste0(metaData$project[project,]$Rdirectory, '/meanCNV.Rdata')
  if ( file.exists(saveFile) & !forceRedoMean ) {
    catLog('loading analysed cohort data...')
    load(saveFile)
    catLog('done.\n')
  }
  else {
    variants = getProjectVariants(metaData, project, cpus=cpus, forceRedo=forceRedoVariants)
    meanCNV = getMeanCNV(metaData, samples, variants, genome=genome, ignoreCNAonly=ignoreCNAonly)
    
    catLog('Saving meanCNV...')
    save(meanCNV, file=saveFile)
    catLog('done.\n')
  }

  meanCNV1 = meanCNV
  for ( subgroup in subgroups1 )
    meanCNV1 = splitMeanCNV(metaData, meanCNV1, project=project, subgroup=subgroup, includeNormal=includeNormal)$inGroup
  meanCNV2 = meanCNV
  for ( subgroup in subgroups2 )
    meanCNV2 = splitMeanCNV(metaData, meanCNV2, project=project, subgroup=subgroup, includeNormal=includeNormal)$inGroup
  #meanCNVboth = splitMeanCNV(metaData, meanCNV, project=project, subgroup=union(subgroups1, subgroups2), includeNormal=includeNormal)$inGroup
  meanCNVs = list('inGroup'=meanCNV1, 'outGroup'=meanCNV2)

  plotDirectory = paste0(metaData$project[project,]$plotDirectory, '/', name)
  ensureDirectoryExists(plotDirectory)

  plotFile = paste0(plotDirectory, '/meanCNVs.pdf')
  if ( !file.exists(plotFile) | forceRedoMeanPlot ) {
    pdf(plotFile, width=20, height=10)
    outputOut = plotMeanCNV(metaData, meanCNV=meanCNVs$outGroup, cosmicDirectory=cosmicDirectory,
      add=F, printGeneNames=F, meanCNV2=meanCNVs$inGroup, genome=genome)
    outputIn = plotMeanCNV(metaData, meanCNV=meanCNVs$inGroup, cosmicDirectory=cosmicDirectory,
      add=T, printGeneNames=T, genome=genome)
    legend('topright', c(subgroups1[1], subgroups1[1], subgroups1[1], subgroups2[1], subgroups2[1], subgroups2[1]), col=mcri(c('blue', 'red', 'green', 'cyan', 'orange', 'green')), lwd=c(2, 2, 2, 10, 10, 10), bg='white')
    dev.off()
  }
  
  plotMeanCNVtoFile(metaData, project, meanCNVs$inGroup, forceRedo=forceRedoMeanPlot,
                    plotFile=paste0(plotDirectory, '/meanCNV-', subgroups1[1], '.pdf'), genome=genome)
  plotMeanCNVtoFile(metaData, project, meanCNVs$outGroup, forceRedo=forceRedoMeanPlot,
                    plotFile=paste0(plotDirectory, '/meanCNV-', subgroups2[1], '.pdf'), genome=genome)
  
  plotSubgroupMutationMatrix(metaData, meanCNVs, project, name, cosmicDirectory=cosmicDirectory,
                             nGenes=30, pages=10, priorCnvWeight=cnvWeight, forceRedo=forceRedoMatrixPlot)    
  plotMultipageMutationMatrix(metaData, meanCNVs$inGroup, project, cosmicDirectory=cosmicDirectory,
                              nGenes=30, pages=10, priorCnvWeight=cnvWeight, forceRedo=forceRedoMatrixPlot,
                              plotFile=paste0(plotDirectory, '/mutationMatrix-', subgroups1[1], '.pdf'), genome=genome)
  plotMultipageMutationMatrix(metaData, meanCNVs$outGroup, project, cosmicDirectory=cosmicDirectory,
                              nGenes=30, pages=10, priorCnvWeight=cnvWeight, forceRedo=forceRedoMatrixPlot,
                              plotFile=paste0(plotDirectory, '/mutationMatrix-', subgroups2[1], '.pdf'), genome=genome)

  catLog('Done! Invisible returning mean CNV data.\n')
  invisible(meanCNV)
}


correctForGeneSize = function(hitMx, geneLengths) {

  
  
}
ChristofferFlensburg/superFreq documentation built on Nov. 15, 2023, 6:15 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
ChristofferFlensburg/superFreq
Calls and tracks CNAs, SNVs and clones over multiple cancer exomes.

R/meanCNV.R
In ChristofferFlensburg/superFreq: Calls and tracks CNAs, SNVs and clones over multiple cancer exomes.

Defines functions correctForGeneSize compareGroups plotSubgroupMutationMatrix plotSubgroupCNVtoFile splitMeanCNV plotMutationMatrix cohortGeneScore plotMultipageMutationMatrix mergeDoubleLoss getSNVrates getCNVrates outputMeanCNV plotMeanCNV plotMeanCNVtoFile getMeanCNV projectMeanCNV

Documented in compareGroups

R Package Documentation

Browse R Packages

We want your feedback!

ChristofferFlensburg/superFreq Calls and tracks CNAs, SNVs and clones over multiple cancer exomes.

R/meanCNV.R In ChristofferFlensburg/superFreq: Calls and tracks CNAs, SNVs and clones over multiple cancer exomes.

Defines functions correctForGeneSize compareGroups plotSubgroupMutationMatrix plotSubgroupCNVtoFile splitMeanCNV plotMutationMatrix cohortGeneScore plotMultipageMutationMatrix mergeDoubleLoss getSNVrates getCNVrates outputMeanCNV plotMeanCNV plotMeanCNVtoFile getMeanCNV projectMeanCNV

Documented in compareGroups

R Package Documentation

Browse R Packages

We want your feedback!

ChristofferFlensburg/superFreq
Calls and tracks CNAs, SNVs and clones over multiple cancer exomes.

R/meanCNV.R
In ChristofferFlensburg/superFreq: Calls and tracks CNAs, SNVs and clones over multiple cancer exomes.