R/getStories.R

Defines functions renameCloneTree renameClones clonesInTree getDodgyness whichUnitarityViolating makeTreeConsistent getBestSigma getTheoreticalError addTheoreticalCNVerrors combineStories cloneStoryRMS mergeStories enforceTransitive reverseCall findChildren findCloneTree pairScore storiesToCloneStories freqToDirectionProb mergeToOneRegion extractClonalities cnvsToStories splitEvents splitRegions getCNVstories frequencyError findSNPclonalities findLocalCNV findSNPstories getStories

Documented in getStories

#summarises somatic SNVs and CNV calls into subclone evolution over samples
#and determines which subclones are subclones of which other subclones.

#' Combines variants and CNAs to clonal evolution
#'
#' @param variants variants: The variants.
#' @param cnvs cnvs: The copy number calls.
#' @param timeSeries names list of vectors: samples to be analysed together, named by individual.
#' @param normals named boolean vector: which sameples (names of vector) are normal.
#' @param genome character: the genome.
#' @param Rdirectory character: The save directory.
#' @param plotDirectory character: the directory to plot to.
#' @param cpus integer: maximum number of parallel processes. Default 1.
#' @param forceRedo boolen: if redoing calculations even if saved data is available. Default FALSE.
#'
#' @export
#'
#' @details This function calls VEP on the output from outputSomaticVariants. For this, VEP needs to be callable by system('vep').
getStories = function(variants, cnvs, timeSeries, normals, genome, cloneDistanceCut=-qnorm(0.01),
  Rdirectory, plotDirectory, cpus=1, forceRedo=F, manualStoryMerge=F, correctReferenceBias=T, rareGermline=T, maxStories = 3000) {
  stories = list()
  saveFile = paste0(Rdirectory, '/stories.Rdata')
  if ( file.exists(saveFile) & !forceRedo ) {
    catLog('Loading stories.\n')
    load(file=saveFile)
    return(stories)
  }

  #add theoretical errors to the CNA calls
  #this accounts for things such as 100% AAB = 50% AAAB.
  #removed (for now), as the consistency and dodgyness fixes handles the same problem
  #and fixing it twice makes it too conservative.
  #cnvs = addTheoreticalCNVerrors(cnvs)
  
  if ( length(timeSeries) > 0 ) {
    for ( i in 1:length(timeSeries) ) {
      ts = timeSeries[[i]]
      name = names(timeSeries)[i]
      qs = variants$variants[ts]
      catLog('\nTracking clonal evolution in ', name, '..', sep='')
      
      #select somatic SNPs
      if ( any(!normals[ts]) ) {
        somaticMx = do.call(cbind, lapply(qs[!normals[ts]], function(q) q$somaticP > 0.95))
        somatic = apply(somaticMx, 1, any)
        catLog('Found ', sum(somatic), ' high quality somatic SNVs to track.\n', sep='')
      }
      else
        somatic = rep(FALSE, nrow(qs[[1]]))

      somaticQs = lapply(qs, function(q) q[somatic,])

      #switch to effective coverage, to not overestimate the accuracy of high-coverage SNPs
      #not too low though, keep at least 100.
      mC = max(100, getMaxCov())
      somaticQs = lapply(somaticQs, function(q) {
        d = q$cov
        effectiveCov = round(d*(1 + d/mC)/(1 + d/mC + d^2/mC^2))
        effectiveVar = round(q$var/q$cov*effectiveCov)
        q$var = effectiveVar
        q$cov = effectiveCov
        q$ref = q$cov - q$var
        return(q)
      })
      
      #set clonality of SNPs from frequency and local CNV, return stories
      catLog('SNVs..\n')
      snpStories = findSNPstories(somaticQs, cnvs[ts], normals[ts], filter=T)
      if ( nrow(snpStories) > maxStories ) {
        catLog('restricting to ', maxStories, ' random somatic SNVs in the interest of time. Rest will be added back in after clustering.\n', sep='')
        snpStories = snpStories[sample(1:nrow(snpStories), maxStories),]
      }
      anchorSNVs = snpStories
      catLog('Keeping', nrow(snpStories), 'SNV stories.\n')
      
      #combine CNV calls over sample into stories
      catLog('Tracking clonal evolution in CNVs..')
      cnvStories = getCNVstories(cnvs[ts], normals[ts], genome, filter=T)
      anchorCNAs = cnvStories
      catLog('Keeping', nrow(cnvStories), 'CNV stories.\n')

      #merge SNV and CNA stories.
      catLog('merge events into clones..')
      allStories = data.frame(row.names=c('germline', rownames(snpStories), rownames(cnvStories)), stringsAsFactors=F)
      allStories$x1 = c(NA, snpStories$x1, cnvStories$x1)
      allStories$x2 = c(NA, snpStories$x2, cnvStories$x2)
      allStories$call = c('germline', as.character(snpStories$call), as.character(cnvStories$call))
      allStories$stories = as.matrix(rbind(matrix(rep(1, length(qs)), nrow=1), snpStories$stories, cnvStories$stories))
      allStories$errors = as.matrix(rbind(matrix(rep(0, length(qs)), nrow=1), snpStories$errors, cnvStories$errors))
      rownames(allStories$stories) = rownames(allStories$errors) = rownames(allStories)
      colnames(allStories$stories) = colnames(allStories$errors) = ts

      #clusters stories into clones
      clusteredStories = storiesToCloneStories(allStories, minDistance=cloneDistanceCut, cpus=cpus, manualStoryMerge=manualStoryMerge, variants=variants)
      germlineCluster = which(apply(clusteredStories$cloneStories$stories + 1e-3 > 1, 1, all) &
        apply(clusteredStories$cloneStories$errors-1e-5 < 0, 1, all))
      rownames(clusteredStories$cloneStories)[germlineCluster] = clusteredStories$cloneStories$call[germlineCluster] = 'germline'
      rownames(clusteredStories$cloneStories$stories)[germlineCluster] = rownames(clusteredStories$cloneStories$errors)[germlineCluster] = 'germline'
      names(clusteredStories$storyList)[germlineCluster] = 'germline'
      catLog('got', nrow(clusteredStories$cloneStories), 'stories...')

      #combine the clustered stories into clonal evolution (figure out which is subclone of which)
      catLog('decide subclone structure..')
      cloneTree = findCloneTree(clusteredStories$cloneStories)

      catLog('removing inconsistent dodgy clones..')
      consistentTree = makeTreeConsistent(cloneTree, clusteredStories$cloneStories, clusteredStories$storyList)
      consistentClusteredStories = clusteredStories
      consistentClusteredStories$cloneTree = consistentTree
      consistentClusteredStories$cloneStories = consistentClusteredStories$cloneStories[clonesInTree(consistentTree),]
      consistentClusteredStories$storyList = consistentClusteredStories$storyList[clonesInTree(consistentTree)]
      catLog('got', nrow(consistentClusteredStories$cloneStories), 'consistent stories...')

      
      #add in previously filtered SNV and CNA stories if they fit with the found clones
      #also reassign anchor mutations to best fitting clone.
      #accept somatic SNVs down to 0.5 somaticP for these
      if ( any(!normals[ts]) ) {
        somaticMx = do.call(cbind, lapply(qs[!normals[ts]], function(q) q$somaticP > 0.5))
        somatic = apply(somaticMx, 1, any)
        catLog('Found ', sum(somatic), ' somatic SNVs to track. These will be linked to the found clones.\n', sep='')
      }
      else
        somatic = rep(FALSE, nrow(qs[[1]]))
      if ( any(normals[ts]) & rareGermline ) {
        rareGermlineMx = do.call(cbind, lapply(qs[normals[ts]], function(q) q$somaticP > 0.5 & q$severity < 10 & !is.na(q$severity)))
        isRareGermline = apply(rareGermlineMx, 1, any)
        catLog('Found ', sum(isRareGermline), ' rare germline variants to track.\n', sep='')
      }
      else
        isRareGermline = rep(FALSE, nrow(qs[[1]]))
      somaticQs = lapply(qs, function(q) q[isRareGermline | somatic,])
      rareGermlineNames = rownames(qs[[1]])[isRareGermline]
      
      snpStories = findSNPstories(somaticQs, cnvs[ts], normals[ts], filter=F, germlineVariants=rareGermlineNames)
      cnvStories = getCNVstories(cnvs[ts], normals[ts], genome, filter=F)
      allStories = data.frame(row.names=c('germline', rownames(snpStories), rownames(cnvStories)), stringsAsFactors=F)
      allStories$x1 = c(NA, snpStories$x1, cnvStories$x1)
      allStories$x2 = c(NA, snpStories$x2, cnvStories$x2)
      allStories$call = c('germline', as.character(snpStories$call), as.character(cnvStories$call))
      allStories$stories = as.matrix(rbind(matrix(rep(1, length(qs)), nrow=1), snpStories$stories, cnvStories$stories))
      allStories$errors = as.matrix(rbind(matrix(rep(0, length(qs)), nrow=1), snpStories$errors, cnvStories$errors))
      rownames(allStories$stories) = rownames(allStories$errors) = rownames(allStories)
      colnames(allStories$stories) = colnames(allStories$errors) = ts

      #reassign mutations to the consistent clones
      consistentClusteredStories$storyList = lapply(consistentClusteredStories$storyList, function(l) c())
      consistentClusteredStories = mergeStories(consistentClusteredStories, allStories, germlineVariants=rareGermlineNames)

      #redo clustering of mutations, this time using unfiltered mutations, and no consistency contstraints.
      use = rownames(allStories)
      if ( length(use) > maxStories ) {
        catLog('restricting to ', maxStories, ' random mutations in dodgy clustering the interest of time.\n', sep='')
        use = use[c(1,sample(2:length(use), maxStories))]
      }
      clusteredStories = storiesToCloneStories(allStories[use,], minDistance=cloneDistanceCut, cpus=cpus)
      germlineCluster = which(apply(clusteredStories$cloneStories$stories + 1e-3 > 1, 1, all) &
        apply(clusteredStories$cloneStories$errors-1e-5 < 0, 1, all))
      rownames(clusteredStories$cloneStories)[germlineCluster] = clusteredStories$cloneStories$call[germlineCluster] = 'germline'
      rownames(clusteredStories$cloneStories$stories)[germlineCluster] = rownames(clusteredStories$cloneStories$errors)[germlineCluster] = 'germline'
      names(clusteredStories$storyList)[germlineCluster] = 'germline'
      cloneTree = findCloneTree(clusteredStories$cloneStories)


      #separate stories that fitted to a consistent clone
      allConsistentStories = allStories[unlist(consistentClusteredStories$storyList),]
      
      #pick out the variants that behave like germline, ie present clonaly in all samples
      if ( nrow(clusteredStories$cloneStories) > 1 ) {
        germlineVariants = consistentClusteredStories$storyList[[which(rownames(consistentClusteredStories$cloneStories) == 'germline')]]
        germlineVariants = germlineVariants[germlineVariants != 'germline']
      }
      else germlineVariants = c()

      consistentClusteredStories = renameClones(consistentClusteredStories)
      
      stories[[name]] = list('allConsistent'=allConsistentStories, 'consistentClusters'=consistentClusteredStories, 'germlineVariants'=germlineVariants, 'all'=allStories, 'clusters'=clusteredStories, 'cloneTree'=cloneTree, anchorStories=list('anchorSNVs'=anchorSNVs, 'anchorCNAs'=anchorCNAs))
      catLog('done!\n')
    }
  }

  #flag variants that ended up in the germline clone
  #If rareGermline is switched off, instead set somaticP to 0.
  catLog('\nMarking SNV that behave like germline SNPs..')
  variants$variants = lapply(variants$variants, function(q) {
    q$germline = rep(NA, nrow(q))
    return(q)
  })
  if ( length(timeSeries) > 0 ) {
    for ( ind in names(stories) ) {
      story = stories[[ind]]
      if ( length(story$germlineVariants) > 0 ) {
        SNVs = story$germlineVariants[grep('^[0-9]', story$germlineVariants)]
        if ( length(SNVs) > 0 ) {
          samples = timeSeries[[ind]]
          variants$variants[samples] = lapply(variants$variants[samples], function(q) {
            if ( rareGermline ) q$germline = rownames(q) %in% SNVs
            else q$somaticP[rownames(q) %in% SNVs] = 0
            return(q)
          })
        }
      }
    }
  }
  catLog('saving variants back to file...')
  allVariantSaveFile = paste0(Rdirectory, '/allVariants.Rdata')
  load(allVariantSaveFile)
  allVariants$variants = variants
  save('allVariants', file=allVariantSaveFile)
  catLog('done!\n')

  #remove all variants in the germline clone from the clonal tracking.
  #This is isn't done earlier so that the somaticP can be switched off first in above chunk.
  if ( !rareGermline ) {
  	catLog('Removing potential germline variants from the output...')
  	stories = lapply(stories, function(story) {
  		#extract and remove all germline mutations from the consistent analysis
  		#keep only the first mutation that is the artificial "germline" mutation used as bait.
  		story$consistentClusters$storyList$germline = 'germline'
  		story$allConsistent = story$allConsistent[
  		           !(rownames(story$allConsistent) %in% story$germlineVariants),]
  		story$anchorStories$anchorSNVs = story$anchorStories$anchorSNVs[
  		           !(rownames(story$anchorStories$anchorSNVs) %in% story$germlineVariants),]
  		story$anchorStories$anchorCNAs = story$anchorStories$anchorCNAs[
  		           !(rownames(story$anchorStories$anchorCNAs) %in% story$germlineVariants),]
  		story$germlineVariants = 'germline'
  		#repeat for dodgy analysis
  		dodgyGLmuts = story$clusters$storyList$germline
  		dodgyGLmuts = dodgyGLmuts[dodgyGLmuts != 'germline']
  		story$clusters$storyList$germline = 'germline'
		story$all = story$all[!(rownames(story$all) %in% dodgyGLmuts),]
		return(story)
  	})
    catLog('done!\n')
  }

  #return clustered and raw stories
  stories = list('stories'=stories, 'variants'=variants, 'normalVariants'=allVariants$normalVariants)
  catLog('Saving stories..')
  save('stories', file=saveFile)
  catLog('done!\n\n\n')
  return(stories)
}

findSNPstories = function(somaticQs, cnvs, normal, filter=T, germlineVariants=c()) {
  if ( filter ) {
    cov10 = rowMeans(do.call(cbind, lapply(somaticQs, function(q) q$cov))) >= 10
    somaticQs = lapply(somaticQs, function(q) q[cov10,])
  }
  if ( nrow(somaticQs[[1]]) == 0 ) {
    emptyMx = matrix(,nrow=0, ncol=length(somaticQs))
    ret = data.frame(x1=numeric(), x2=numeric(), call=character(), stories=emptyMx, errors=emptyMx, stringsAsFactors=F)
    return(ret)
  }
  somaticQs = superFreq:::findSNPclonalities(somaticQs, cnvs)
  clonality = matrix(sapply(somaticQs, function(q) q$clonality), ncol=length(somaticQs))
  clonalityError = matrix(sapply(somaticQs, function(q) q$clonalityError), ncol=length(somaticQs))
  ret = data.frame(x1=somaticQs[[1]]$x, x2=somaticQs[[1]]$x, call=rownames(somaticQs[[1]]), row.names=rownames(somaticQs[[1]]), stringsAsFactors=F)
  clonality[which(is.na(clonality), arr.ind=T)] = 0
  clonalityError[which(is.na(clonalityError) | is.infinite(clonalityError), arr.ind=T)] = 10
  ret$stories = clonality
  ret$errors = clonalityError
  rownames(ret$stories) = rownames(ret$errors) = rownames(ret)
  
  colnames(ret$stories) = colnames(ret$errors) = names(somaticQs)
  if ( !filter ) {
    if ( any(normal) & nrow(ret) > 0 ) {
      allowed = rownames(ret) %in% germlineVariants
      presentInNormal = ret$stories[,normal,drop=F] > ret$errors[,normal,drop=F] | ret$stories[,normal,drop=F] > 0.2
      presentInNormal = apply(presentInNormal, 1, any)
      if ( inherits(presentInNormal, 'matrix') ) presentInNormal = apply(presentInNormal, 1, any)
      ret = ret[!presentInNormal | allowed,,drop=F]
      catLog('Filtered ', sum(presentInNormal & !allowed), ' present in normal stories.\n', sep='')
    }
    return(ret)
  }
  
  allSmall = rowSums(is.na(ret$errors) | ret$stories - ret$errors*2 < 0 | ret$errors > 0.2) == ncol(ret$stories)
  ret = ret[!allSmall,,drop=F]
  uncertain = rowMeans(ret$errors) > 0.2
  ret = ret[!uncertain,,drop=F]
  indel = grepl('[-\\+]', rownames(ret))
  ret = ret[!indel,,drop=F]
  if ( any(normal) & nrow(ret) > 0 ) {
    presentInNormal = ret$stories[,normal,drop=F] > ret$errors[,normal,drop=F] | ret$stories[,normal,drop=F] > 0.2
    presentInNormal = apply(presentInNormal, 1, any)
    if ( inherits(presentInNormal, 'matrix') ) presentInNormal = apply(presentInNormal, 1, any)
    ret = ret[!presentInNormal,,drop=F]
    catLog('Filtered ', sum(allSmall), ' small, ', sum(uncertain), ' uncertain, ', sum(indel), ' indel and ', sum(presentInNormal), ' present in normal stories.\n', sep='')
  }
  else catLog('Filtered ', sum(allSmall) , ' small, ', sum(uncertain), ' uncertain and ', sum(indel), ' indel stories.\n', sep='')

  return(ret)
}

findLocalCNV = function(qs, cnvs) {
  x = qs[[1]]$x
  for ( i in 1:length(cnvs) ) {
    call = sapply(x, function(X) c(cnvs[[i]]$clusters$call[which(X > cnvs[[i]]$clusters$x1-300 & X < cnvs[[i]]$clusters$x2+300)], 'AB')[1])
    clonality = sapply(x, function(X) c(cnvs[[i]]$clusters$clonality[which(X > cnvs[[i]]$clusters$x1-300 & X < cnvs[[i]]$clusters$x2+300)], 1)[1])
    clonalityError = sapply(x, function(X) c(cnvs[[i]]$clusters$clonalityError[which(X > cnvs[[i]]$clusters$x1-300 & X < cnvs[[i]]$clusters$x2+300)], 0)[1])
    if ( any(grepl('CL', call)) ) {
      call[grepl('CL', call)] = 'AB'
      clonality[grepl('CL', call)] = 1 - clonality[grepl('CL', call)]
    }
    qs[[i]]$CNV = call
    qs[[i]]$CNVclonality = clonality
    qs[[i]]$CNVclonalityError = clonalityError
  }
  return(qs)
}

#takes a quality variant object, that has gone through findLocalCNV, and adds a clonality and clonalityError column
findSNPclonalities = function(somaticQs, cnvs) {
  if ( nrow(somaticQs[[1]]) == 0 ) return(somaticQs)
  somaticQs = findLocalCNV(somaticQs, cnvs)
  somaticQs = lapply(somaticQs, function(q) {
    nA = nchar(q$CNV) - nchar(gsub('A', '', q$CNV))
    nB = nchar(q$CNV) - nchar(gsub('B', '', q$CNV))

    noCNV = q$CNV %in% c('AB', 'AB?', 'AB??')

    #calculate expected frequencies range if the SNP is on the A or B allele of the CNV, or in the AB background.
    cloneMin = ifelse(noCNV, 0, pmax(0, q$CNVclonality-q$CNVclonalityError*1.5))
    cloneMax = ifelse(noCNV, 0, pmin(1, q$CNVclonality+q$CNVclonalityError*1.5))
    fAmin = nA*cloneMin/((nA+nB)*cloneMin + 2*(1-cloneMin))
    fAmax = nA*cloneMax/((nA+nB)*cloneMax + 2*(1-cloneMax))
    fBmin = nB*cloneMin/((nA+nB)*cloneMin + 2*(1-cloneMin))
    fBmax = nB*cloneMax/((nA+nB)*cloneMax + 2*(1-cloneMax))
    fNmax = (1-cloneMin)/((nA+nB)*cloneMin + 2*(1-cloneMin))
    fNmin = (1-cloneMax)/((nA+nB)*cloneMax + 2*(1-cloneMax))
    
    #calculate p-values for the SNV living on the different alleles.
    f = superFreq:::refUnbias(q$var/q$cov)
    f[q$cov==0] = 0
    closestFA = ifelse(f < fAmin, fAmin, ifelse(f > fAmax, fAmax, f))
    closestFB = ifelse(f < fBmin, fBmin, ifelse(f > fBmax, fBmax, f))
    closestFN = ifelse(f < fNmax, f, fNmax)
    pA = superFreq:::pBinom(q$cov, q$var, closestFA)
    pB = superFreq:::pBinom(q$cov, q$var, closestFB)
    pN = superFreq:::pBinom(q$cov, q$var, closestFN)

    clonalityA = pmin(1, ifelse(nA==0 & f==0, 0, 2*f/(nA+f*(2-nA-nB))))
    clonalityB = pmin(1, ifelse(nB==0 & f==0, 0, 2*f/(nB+f*(2-nA-nB))))
    clonalityN = pmin(1, 1 - (1 - f*2)/(f*(nA+nB-2)+1))

    #frequency error estimate: add up the poissonian width with the RIB as independent normal error sources.
    fErr = sqrt(superFreq:::frequencyError(q$var, q$cov, p0=0.10)^2 + q$RIB^2)
    fErr[q$cov==0] = 1
    #propagate to clonality
    clonalityHighA = 2/(2+nA/(f+fErr)-nA-nB)
    clonalityHighB = 2/(2+nB/(f+fErr)-nA-nB)
    clonalityHighN = 1 - (1 - (f+fErr)*2)/(pmin(1, f+fErr)*(nA+nB-2)+1)
    #Not allowing the lower limit to go into negatives reduces the error estimate of low frequencies
    #and is associated with a prior bias towards clonality 0 for low frequencies.
    clonalityLowA = pmax(0, 2/(2+nA/(f-fErr)-nA-nB))
    clonalityLowB = pmax(0, 2/(2+nB/(f-fErr)-nA-nB))
    clonalityLowN = pmax(0, 1 - (1 - (f-fErr)*2)/((f-fErr)*(nA+nB-2)+1))
      
    clonalityErrorA = abs(clonalityHighA - clonalityLowA)/2
    clonalityErrorB = abs(clonalityHighB - clonalityLowB)/2
    clonalityErrorN = abs(clonalityHighN - clonalityLowN)/2
    #clonalityError[f==0] = fErr[f==0]

    #if the B allele is lost, and the SNV is clonal in the N-cells, assume that the SNV
    #was present in the cells with the lost B-allele as well, giving a clonality of 1.
    lostSNV = nB==0 & abs(clonalityN+q$CNVclonality - 1) < q$CNVclonalityError
    if ( any(lostSNV) ) clonalityErrorN[lostSNV] = 1

    consistentA = pA > 0.05 & clonalityLowA <= cloneMax
    consistentB = pB > 0.05 & clonalityLowB <= cloneMax
    consistentN = pN > 0.05 & clonalityLowN <= 1-cloneMin
    
    #if no allele fits with the point mutation and CNA present in mostly the same cells
    #then check for SNV present in subclone of CNAs with a single allele
    #inconsistent = !consistentA & !consistentB & !consistentN
    #if ( any(inconsistent) ) {
	#	consistentSubA = f < fAmax & nA == 1
	#	consistentSubB = f < fBmax & nB == 1
	#	consistentSubN = f < fNmax
	#	consistentA[inconsistent] = consistentSubA[inconsistent]
	#	consistentB[inconsistent] = consistentSubB[inconsistent]
	#	consistentN[inconsistent] = consistentSubN[inconsistent]
    #}
    
    homeAllele =
      ifelse(q$CNV %in% c('AB', 'AB?', 'AB??', 'CL'),
             'N',
             ifelse(consistentA & (!consistentB | clonalityB <= clonalityA) & (!consistentN | clonalityN <= clonalityA),
                    'A',
                    ifelse(consistentB & (!consistentN | clonalityN <= clonalityB),
                           'B',
                           'N')))
    clonality = ifelse(homeAllele == 'A', clonalityA, ifelse(homeAllele == 'B', clonalityB, clonalityN))
    homeAllele[f==0] = 'N'
    clonality[f==0] = 0
    clonalityError = ifelse(homeAllele == 'A', clonalityErrorA, ifelse(homeAllele == 'B', clonalityErrorB, clonalityErrorN))

    #check for calls where several different home alleles are possible, and increase
    #error estimate accordingly
    uncertainCalls = which(consistentA + consistentB + consistentN > 1)
    if ( length(uncertainCalls) > 0 ) {
      clonalityVar = sapply(uncertainCalls, function(i) var(c(clonalityA[i], clonalityB[i], clonalityN[i])[c(consistentA[i], consistentB[i], consistentN[i])]))
      clonalityError[uncertainCalls] = sqrt(clonalityError[uncertainCalls]^2 + clonalityVar)
    }

    #check if the snv can be a germline SNP, ie present both on the N and one of the A or B alleles.
    #if so, make sure that the clonality error reaches 100%.
    SNPfA = (q$CNVclonality*nA + 1-q$CNVclonality)/(2*(1-q$CNVclonality) + q$CNVclonality*(nA+nB))
    SNPfB = (q$CNVclonality*nB + 1-q$CNVclonality)/(2*(1-q$CNVclonality) + q$CNVclonality*(nA+nB))
    canBeSNPA = superFreq:::pBinom(q$cov, q$var, SNPfA)
    canBeSNPB = superFreq:::pBinom(q$cov, q$var, SNPfB)
    canBeSNP = canBeSNPA > 0.1 | canBeSNPB > 0.1
    clonalityError[canBeSNP] = pmax(clonalityError[canBeSNP], (1-clonality)[canBeSNP])


    q$homeAllele = homeAllele
    q$clonality = superFreq:::noneg(clonality)
    q$clonalityError = abs(clonalityError)
    return(q)
    })
  return(somaticQs)
}

frequencyError = function(var, cov, p0=0.15, reportBothEnds=F) {
  covOri = cov
  varOri = var

  converged = cov == 0
  bsolutions = ifelse(cov==0, 1, 0)
  var = var[!converged]
  cov = cov[!converged]
  b = var - round(sqrt(var))
  p = ifelse(b < 0, 0, 1-pbinom(var-1, cov, noneg(b)/cov))
  while ( any(!converged) ) {
    tooLow = p < p0
    bnew = b + sign(p0-p)
    pnew = ifelse(bnew < 0, 0, 1-pbinom(var-1, cov, noneg(bnew)/cov))
    solved = sign(p-p0) != sign(pnew-p0) 
    #handle converged cases
    if ( any(solved) ) {
      bsolutions[which(!converged)[solved]] = ((b*(pnew-p0)+bnew*(p0-p))/(pnew-p))[solved]
      converged[which(!converged)[solved]] = rep(T, sum(solved))
    }
    #update not converged cases, removing solved values
    p = pnew[!solved]
    b = bnew[!solved]
    cov = cov[!solved]
    var = var[!solved]
  }

  var = varOri
  cov = covOri
  converged = cov == 0
  tsolutions = ifelse(cov==0, 1, 0)
  var = var[!converged]
  cov = cov[!converged]
  t = var + round(sqrt(var))
  p = ifelse(t > cov, 0, pbinom(var, cov, pmin(cov, t)/cov))
  while ( any(!converged) ) {
    tooLow = p < p0
    tnew = t + sign(p-p0)
    pnew = ifelse(tnew > cov, 0, pbinom(var, cov, pmin(cov, tnew)/cov))
    solved = sign(p-p0) != sign(pnew-p0) 
    #handle converged cases
    if ( any(solved) ) {
      tsolutions[which(!converged)[solved]] = ((t*(pnew-p0)+tnew*(p0-p))/(pnew-p))[solved]
      converged[which(!converged)[solved]] = rep(T, sum(solved))
    }
    #update not converged cases, removing solved values
    p = pnew[!solved]
    t = tnew[!solved]
    cov = cov[!solved]
    var = var[!solved]
  }

  if ( reportBothEnds ) return(cbind(pmax(0,bsolutions/covOri), pmin(1,tsolutions/covOri)))
  error = pmax(tsolutions-varOri, varOri-bsolutions)/covOri
  return(error)
}

getCNVstories = function(cnvs, normal, genome, filter=T) {
  regions = splitRegions(cnvs)
  catLog(nrow(regions), ' regions..', sep='')
  events = splitEvents(cnvs, regions)
  catLog(nrow(regions), ' events.\nExtracting stories, comparing SNP shift directions..', sep='')
  stories = cnvsToStories(cnvs, events, normal, genome, filter=filter)
  return(stories)
}

splitRegions = function(cnvs) {
  regions = data.frame('x1' = c(), 'x2' = c(), stringsAsFactors=F)
  cnvX1 = unique(unlist(sapply(cnvs, function(cnv) cnv$cluster$x1)))
  cnvX2 = unique(unlist(sapply(cnvs, function(cnv) cnv$cluster$x2)))
  x1 = x2 = -Inf
  while (x1 < max(cnvX1)) {
    x1 = min(cnvX1[cnvX1 >= x2])
    x2 = min(cnvX2[cnvX2 > x1])
    regions = rbind(regions, data.frame('x1'=x1, 'x2'=x2, stringsAsFactors=F))
  }
  return(regions)
}

splitEvents = function(cnvs, regions) {
  events = data.frame('x1' = c(), 'x2' = c(), 'call' = c(), stringsAsFactors=F)
  cnvX1 = unlist(sapply(cnvs, function(cnv) cnv$cluster$x1))
  cnvX2 = unlist(sapply(cnvs, function(cnv) cnv$cluster$x2))
  cnvCalls = unlist(sapply(cnvs, function(cnv) cnv$cluster$call))
  for ( i in 1:nrow(regions) ) {
    calls = unique(cnvCalls[cnvX1 <= regions$x1[i] & cnvX2 >= regions$x2[i]])
    calls = calls[!(calls %in% c('AB', 'AB?', 'AB??'))]
    for ( call in calls ) events = rbind(events, data.frame('x1' = regions$x1[i], 'x2' = regions$x2[i], 'call' = call, stringsAsFactors=F))
  }
  events = events[!grepl('\\?', events$call),]
  return(events)
}

cnvsToStories = function(cnvs, events, normal, genome, filter=T) {
  stories = errors = sigmas = data.frame(stringsAsFactors=F)
  if ( nrow(events) > 0 ) {
    i=1
    while ( i <= nrow(events) ) {
      call = as.character(events$call[i])
      clonalities = extractClonalities(cnvs, events[i,])
      stories = rbind(stories, noneg(clonalities$clonality))
      errors = rbind(errors, clonalities$clonalityError)
      sigmas = rbind(sigmas, clonalities$sigma)
      if ( any(clonalities$clonality < 0) & any(clonalities$clonality > 0) & !any(is.na(clonalities$clonality)) ) {
        negativeClonalities = clonalities
        negativeClonalities$clonality = noneg(-negativeClonalities$clonality)
        negEvent = events[i,]
        negEvent$call = reverseCall(negEvent$call)
        events = rbind(events, negEvent)[order(c(1:nrow(events), i+0.5)),]
        stories = rbind(stories, noneg(negativeClonalities$clonality))
        errors = rbind(errors, negativeClonalities$clonalityError)
        sigmas = rbind(sigmas, negativeClonalities$sigma)
        i = i + 1
      }
      i = i + 1
    }

    if ( nrow(events) > 1 ) {
      Nevents = nrow(events)
      keep = rep(T, Nevents)
      for ( x1 in unique(events$x1) ) {
        rows = events$x1 == x1
        if ( sum(rows) == 0 ) next
        subStories = stories[rows,,drop=F]
        subSigmas = sigmas[rows,,drop=F]
        subKeep = keep[rows]
        while ( !any(is.na(colSums(subStories[subKeep,,drop=F]))) & any(colSums(subStories[subKeep,,drop=F]) > 1) ) {
          presence = rowMeans(subStories[subKeep,,drop=F])
          inconsistency = sqrt(rowMeans(subSigmas[subKeep,,drop=F]^2))
          score = presence - inconsistency/3
          subKeep[subKeep][which.min(score)] = F
        }
        keep[rows] = subKeep
      }
      catLog(sum(!keep), 'overlapping stories filtered..')
      events = events[keep,,drop=F]
      stories = stories[keep,,drop=F]
      errors = errors[keep,,drop=F]
      sigmas = sigmas[keep,,drop=F]
    }
  }
  else {
    events$stories=matrix(,nrow=0, ncol=length(cnvs))
    events$errors=matrix(,nrow=0,ncol=length(cnvs))
    return(events)
  }
  colnames(errors) = colnames(stories) = names(cnvs)
  ret = events
  rownames(ret) = make.names(paste0('chr', xToChr(ret$x1, genome), '-', events$call), unique=T)
  ret$stories = as.matrix(stories)
  ret$errors = as.matrix(errors)
  ret$errors[is.na(ret$errors)] = 1

  if ( !filter ) return(ret)
  
  allSmall = rowSums(ret$stories - ret$errors*1.5 < 0.15 | ret$errors > 0.2) == ncol(ret$stories)
  ret = ret[!allSmall,,drop=F]
  uncertain = rowMeans(ret$errors) > 0.2
  ret = ret[!uncertain,,drop=F]
  notSignificant = rowSums(abs(ret$stories) < ret$errors*1.5 | is.na(ret$errors)) >= ncol(ret$stories)
  ret = ret[!notSignificant,,drop=F]
  smallRegion = ret$x2 - ret$x1 < 5e6 & !(ret$call == 'CL' & apply(ret$stories, 1, max) > 0.7)
  ret = ret[!smallRegion,,drop=F]
  if ( any(normal) & nrow(ret) > 0 ) {
    presentInNormal = ret$stories[,normal,drop=F] > ret$errors[,normal,drop=F] | ret$stories[,normal,drop=F] > 0.2
    presentInNormal = apply(presentInNormal, 1, any)
    ret = ret[!presentInNormal,,drop=F]
    catLog('Filtered ', sum(allSmall) , ' small, ', sum(uncertain), ' uncertain, ', sum(smallRegion), ' small region and ', sum(presentInNormal), ' present in normal stories.\n', sep='')
  }
  else catLog('Filtered ', sum(allSmall) , ' small, ', sum(uncertain), ' uncertain and ', sum(smallRegion), ' small region stories.\n', sep='')
  falseSNPcalls = ret$call == 'AA' & (ret$x2 - ret$x1 < 2e6 | rowMeans(ret$errors) > 0.15 )
  ret = ret[!falseSNPcalls,,drop=F]
  catLog('Filtered ', sum(falseSNPcalls) , ' stories that are potentially based on false SNPs.\n', sep='')
  return(ret)
}

extractClonalities = function(cnvs, event) {
  nSample = length(cnvs)
  subFreqs = lapply(cnvs, function(cs) cs$eFreqs[cs$eFreqs$x >= event$x1 & cs$eFreqs$x <= event$x2,])
  subFreqsMirror = lapply(cnvs, function(cs) {
    ret=cs$eFreqs[cs$eFreqs$x >= event$x1 & cs$eFreqs$x <= event$x2,]
    ret$var = mirrorDown(ret$var, ret$cov)
    return(ret)
    })
  subCR = do.call(rbind, lapply(1:length(cnvs), function(i) {
    cs = cnvs[[i]]
    efs = subFreqsMirror[[i]]
    ret = mergeToOneRegion(cs$CR[cs$CR$x1 >= event$x1 & cs$CR$x2 <= event$x2,,drop=F], efs)
  }))
  fM = callTofM(as.character(event$call))
  prior = callPrior(as.character(event$call))
  ret = as.data.frame(do.call(rbind, lapply(1:nSample, function(sample) {
    efs=subFreqsMirror[[sample]]
    iscnv = isCNV(cluster=subCR[sample,], efs=efs, M=fM[2], f=fM[1], prior=prior)
    #theoreticalError = getTheoreticalError(cluster=subCR[sample,], call=as.character(event$call), efs=efs)
    #iscnv$clonalityError = sqrt(iscnv$clonalityError^2 + theoreticalError^2)
    bestSigma = getBestSigma(cluster=subCR[sample,], call=as.character(event$call), efs=efs)
    return(c(iscnv$call, iscnv$clonality, iscnv$sigma, iscnv$clonalityError, bestSigma))
  })), stringsAsFactors=F)
  names(ret) = c('call', 'clonality', 'sigma', 'clonalityError', 'bestSigma')
  direction = rep(0, nSample)

  if ( fM[1] != 0.5 ) {
    freqX = unique(unlist(lapply(subFreqs, function(freq) freq$x)))
    if ( length(freqX) > 0 ) {
      directionScores = do.call(cbind, lapply(1:nSample, function(i) freqToDirectionProb(subFreqs[[i]], freqX, fM, ret$clonality[i])))
      strongestSample = which(colsums(directionScores^2) >= 0.999999*max(colsums(directionScores^2)))[1]
      direction = sapply(1:nSample, function(i) {
        sN = scalarNorm(directionScores[,strongestSample], directionScores[,i])
        power = sqrt(sum(directionScores[,i]^2))
        if ( (is.na(power) == TRUE) || power < 10 ) return(NA)  #if not enough power to tell up or down, nevermind.
        return(sN)
      })
    }
  }

  #if there are other much better calls, this call probably isnt present
  wrongCall = ret$sigma > 5 & ret$bestSigma < 3
  ret$clonality[wrongCall] = 0

  #if no call gives a good fit, just up the uncertainty depending on how bad fit.
  #messedUpCall = ret$sigma > 5 & !wrongCall
  #ret$clonalityError[messedUpCall] = sqrt(ret$clonalityError[messedUpCall]^2 + ((ret$sigma[messedUpCall]-5)/5)^2)
  ret$clonalityError = sqrt(ret$clonalityError^2 + ((ret$sigma - 2)/20)^2)
  
  ret$clonality[!is.na(direction) & direction < 0] = -ret$clonality[!is.na(direction) & direction < 0]
  if ( !any(ret$clonality > 0) & any(ret$clonality < 0) ) ret$clonality = - ret$clonality
  
  return(ret)
}

mergeToOneRegion = function(cR, eFreqs) {
  if ( nrow(cR) == 0 )
    return(data.frame(x1=NA, x2=NA, M=0, width=1000, df=10, var=0, cov=0,
                      pHet=0.5, pAlt=0.5, odsHet=0.5, f=NA, ferr=NA))
  cR$x1[1] = min(cR$x1)
  cR$x2[1] = max(cR$x2)
  efs = eFreqs[eFreqs$x > cR$x1[1] & eFreqs$x < cR$x2[1],]
  cR$var[1] = sum(cR$var)
  cR$cov[1] = sum(cR$cov)
  cR$M[1] = sum(cR$M/cR$width^2)/sum(1/cR$width^2)
  cR$width[1] = 1/sqrt(sum(1/cR$width^2))
  cR$Nsnps[1] = sum(cR$Nsnps)
  cR = cR[1,]
  cf = correctedFrequency(cR, efs)
  cR$f = cf[,'f']
  cR$ferr = cf[,'ferr']
  return(cR)
}

freqToDirectionProb = function(freq, freqX, fM, clonality) {
  f = fM[1]
  M = fM[2]
  if ( f == 0.5 ) return(rep(0, length(freqX)))
  
  upProb = function(x) {
    j = which(freq$x == x)
    if ( length(j) == 0 ) return(1)
    fClone = (clonality*(2*f*2^M-1) + 1)/(clonality*(2*2^M - 2) + 2)
    return(pBinom(freq$cov[j], freq$var[j], 1-fClone))
  }
  downProb = function(x) {
    j = which(freq$x == x)
    if ( length(j) == 0 ) return(1)
    fClone = (clonality*(2*f*2^M-1) + 1)/(clonality*(2*2^M - 2) + 2)
    return(pBinom(freq$cov[j], freq$var[j], fClone))
  }
  nullProb = function(x) {
    j = which(freq$x == x)
    if ( length(j) == 0 ) return(1)
    return(pBinom(freq$cov[j], freq$var[j], 0.5))
  }
  u = 1e-5 + sapply(freqX, function(x) upProb(x))
  d = 1e-5 + sapply(freqX, function(x) downProb(x))
  n = 1e-5 + sapply(freqX, function(x) nullProb(x))
  return((1-n)*log(u/d))
}



#takes a dataframe of stories and groups them into subclone stories. returns a data frame of the subclone stories
#and a list of dataframes for the individual stories in each subclone.
storiesToCloneStories = function(stories, storyList='',
  minDistance=-qnorm(0.01), cpus=1, manualStoryMerge=F, variants=NA) {
  if ( (storyList ==  '')[1] ) storyList = as.list(rownames(stories))
  if ( length(storyList) < 2 )
    return(list('cloneStories'=stories, 'storyList'=storyList))

  #if a lot of mutations, merge them in batches, as the algorithm scales as O(N^2) otherwise
  #group less in this first pass (minDistance*0.5), and then group as specified last round.
  batchSize = round(pmin(1000, pmax(100, 1e6/length(storyList))))
  if ( length(storyList) > batchSize ) catLog('Cluster mutations by batch. Remaining clusters: ', sep='')
  while ( length(storyList) > batchSize ) {
    catLog(length(storyList), '..', sep='')
    first300 = storiesToCloneStories(stories=stories, storyList=storyList[1:batchSize],
      minDistance=minDistance*0.5, cpu=cpus)
    storyList = c(first300$storyList, storyList[(batchSize+1):length(storyList)])
    batchSize = round(pmin(1000, pmax(100, 1e6/length(storyList))))
  }

  summariseClusters = function() {
    st = do.call(rbind, lapply(storyList, function(rows) {
      err = stories$errors[rows,,drop=F]
      if ( any(err<=0) ) err[err<=0] = rep(min(c(1,err[err>0]))/1e6, sum(err<=0))
      st = stories$stories[rows,,drop=F]
      w = t(1/t(err^2)/colsums(1/err^2))
      mean = colsums(st*w)
      ret = matrix(mean, nrow=1)
    }))
    err = do.call(rbind, lapply(storyList, function(rows) {
      err = stories$errors[rows,,drop=F]
      err = 1/sqrt(colsums(1/err^2))
      ret = matrix(err, nrow=1)
    }))
    if ( nrow(st) > 0 ) rownames(err) = rownames(st) = 1:length(storyList)
    colnames(err) = colnames(st) = colnames(stories$stories)
    clusters = data.frame('call'=rep('clone', length(storyList)), 'x1'=rep(NA, length(storyList)), 'x2'=rep(NA, length(storyList)), row.names=1:length(storyList), stringsAsFactors=F)
    clusters$stories = st
    clusters$errors = err
    return(clusters)
  }
  
  distance = do.call(rbind, mclapply(1:length(storyList), function(i) c(sapply(1:i, function(j) pairScore(stories, storyList[[i]], storyList[[j]])), rep(0, length(storyList)-i)), mc.cores=cpus))
  distance = distance + t(distance)
  distance = distance + ifelse(row(distance) == col(distance), (max(distance, minDistance)+1), 0)

  loops = 0
  while ( any(distance < minDistance) ) {
    merge = which(distance == min(distance), arr.ind=TRUE)[1,]

    if ( manualStoryMerge & loops == 0 ) {
      layout(matrix(1:2, nrow=2))
      cat('\n\nUSER INPUT:\nTop left: Merging story cluster ', merge[1], ' (blue) and ', merge[2], ' (red) at a distance of ', min(distance), '.\n', sep='')
      cat('There are ', nrow(distance), ' clusters.\n', sep='')
      i=merge[1];j=merge[2];plotStories(stories[c(storyList[[i]], storyList[[j]]),], variants, col=mcri(c(rep('blue', length(storyList[[i]])), rep('red', length(storyList[[j]])))), lty=1, main=paste0('next up to merge: ', merge[1], ' (blue) and ', merge[2], ' (red)'), setPar=F)
      clusters = summariseClusters()
      plotStories(clusters, variants, main = 'clusters right now', setPar=F)
      if ( nrow(distance) < 15 ) {
        cat('Clone distance matrix:\n')
        print(distance)
      }
      layout(1)

      a = as.numeric(readline('\nType number of merges to perform, or blank for this merge only.\nType 0 to not merge these two clusters.\nType -1 to reset and start over with clustering.\nType any number smaller than -1 to stop clustering and continue downstream analysis.\nInput:'))
      if ( is.na(a) ) a = 1
      if ( a < -1 ) {
        distance = Inf
        next
      }
      else if ( a == 0 ) {
        distance[merge[1], merge[2]] = distance[merge[2], merge[1]] = Inf
        next
      }
      else if ( a == -1 ) {
        storyList=as.list(rownames(stories))
        distance = do.call(rbind, mclapply(1:length(storyList), function(i) c(sapply(1:i, function(j) pairScore(stories, storyList[[i]], storyList[[j]])), rep(0, length(storyList)-i)), mc.cores=cpus))
        distance = distance + t(distance)
        distance = distance + ifelse(row(distance) == col(distance), (max(distance)+1), 0)
        next
      }
      else if ( a == round(a) & a > 0 ) loops = a - 1
      else loops = 0
    }
    else if ( manualStoryMerge ) loops = loops - 1
    
    storyList[[merge[1]]] = c(storyList[[merge[1]]], storyList[[merge[2]]])
    distance[merge[1],] = distance[,merge[1]] = sapply(1:length(storyList), function(j)
                                      pairScore(stories, storyList[[merge[1]]], storyList[[j]]) + ifelse(j==merge[1], (max(distance)+1), 0))
    distance = distance[-merge[2], -merge[2], drop=F]
    storyList = storyList[-merge[2]]
  }

  clusters = summariseClusters()
  names(storyList) = rownames(clusters)
  
  return(list('cloneStories'=clusters, 'storyList'=storyList))
}

#The metric on stories, used for clustering similar stories into subclones.
pairScore = function(stories, is, js) {
  if ( identical(is, js) ) return(0)
  #if ( length(is) == 1 ) rms1 = noneg(0.5 - mean(stories$errors[is,]))
  #else {
  #  err1 = stories$errors[is,]
  #  if ( any(err1<=0) ) err1[err1<=0] = rep(min(c(1,err1[err1>0]))/1e6, sum(err1<=0))
  #  st1 = stories$stories[is,]
  #  w1 = t(1/t(err1^2)/colsums(1/err1^2))
  #  mean1 = colSums(st1*w1)
  #  sigma1 = abs(t(t(st1)-mean1))/err1
  #  rms1 = max(sqrt(colmeans(sigma1^2)))
  #}
  #if ( length(js) == 1 ) rms2 = noneg(0.5 - mean(stories$errors[js,]))
  #else {
  #  err2 = stories$errors[js,]
  #  if ( any(err2<=0) ) err2[err2<=0] = rep(min(c(1,err2[err2>0]))/1e6, sum(err2<=0))
  #  st2 = stories$stories[js,]
  #  w2 = t(1/t(err2^2)/colsums(1/err2^2))
  #  mean2 = colSums(st2*w2)
  #  sigma2 = abs(t(t(st2)-mean2))/err2
  #  rms2 = max(sqrt(mean(sigma2^2)))
  #}
  #unpairedRms = sqrt(rms1^2+rms2^2)

  err = stories$errors[c(is,js),,drop=F]
  if ( any(err<=0) ) err[err<=0] = rep(min(c(1,err[err>0]))/1e6, sum(err<=0))
  if ( any(is.infinite(err)) ) err[is.infinite(err)] = rep(1e6, sum(is.infinite(err)))
  st = stories$stories[c(is,js),,drop=F]
  w = t(1/t(err^2)/colsums(1/err^2))
  mean = colSums(st*w)
  sigma = abs(t(t(st)-mean))/err
  #rms = max(sqrt(colmeans(sigma^2)))

  #p = min(apply(pnorm(-sigma)*2, 2, function(ps) min(p.adjust(ps, method='fdr'))))
  #p1 = min(apply(pnorm(-sigma[1:length(is),])*2, 2, function(ps) min(p.adjust(ps, method='fdr'))))
  #p2 = min(apply(pnorm(-sigma[(length(is)+1):nrow(sigma),])*2, 2, function(ps) min(p.adjust(ps, method='fdr'))))
  #pF = min(apply(pnorm(-sigma)*2, 2, function(ps) fisherTest(ps)['pVal']))
  pF1 = min(apply(pnorm(-sigma)*2, 2, function(ps) fisherTest(ps[1:length(is)])['pVal']))
  pF2 = min(apply(pnorm(-sigma)*2, 2, function(ps) fisherTest(ps[(length(is)+1):length(ps)])['pVal']))
  return(-qnorm(min(pF1, pF2)/2))
  
  #return(rms + noneg(rms - unpairedRms) )
}


#helper function that decided which subclones are subclones of each other.
findCloneTree = function(cloneStories, storyList) {
  #add the purity as a story (this may or may not already be present)
  purity = sapply(1:ncol(cloneStories$stories), function(i) max(cloneStories$stories[,i]))
  #check which clones are subclones of which others, allowing an error bar from each clone.
  subcloneMx = as.matrix(apply(abs(cloneStories$stories) + cloneStories$errors*1.5, 1,
    function(story1) apply(abs(cloneStories$stories) - cloneStories$errors*1.5, 1,
                           function(story2) all(story1 > story2))))
  greaterSum = as.matrix(apply(abs(cloneStories$stories), 1,
    function(story1) apply(abs(cloneStories$stories), 1,
                           function(story2) sum(story1) > sum(story2))))
  subcloneMx = subcloneMx & greaterSum
  subcloneMx = enforceTransitive(subcloneMx)
  subcloneStories = cloneStories$stories
  rownames(subcloneMx) = colnames(subcloneMx) = rownames(subcloneStories) = rownames(cloneStories)
  
  cloneTree = findChildren(subcloneMx, subcloneStories)

  return(cloneTree)
}
#helper function
findChildren = function(subcloneMx, subcloneStories) {
  #if no more subclones, return empty list
  if ( nrow(subcloneStories) == 0 ) return(list())

  #there will be at least one clone that is not a subclone (aka paranetless, the one with the largest clonality sum)
  cloneNames = rownames(subcloneStories)[which(rowSums(subcloneMx) == 0)]
  
  #score the parentless subclones from the sum of clonalities.
  cloneScores = rowSums(abs(subcloneStories))[cloneNames]
  cloneScores = sort(cloneScores, decreasing=T)

  #go through the parentless clones in order of score, each one recurring with its subclones.
  ret = list()
  for ( clone in names(cloneScores) ) {
    subclones = subcloneMx[,clone]
    if ( !any(subclones) ) ret[[clone]] = list()
    else ret[[clone]] = findChildren(subcloneMx[subclones, subclones, drop=F], subcloneStories[subclones,,drop=F])
    subcloneMx = subcloneMx[!subclones, !subclones, drop=F]
    subcloneStories = subcloneStories[!subclones,,drop=F]
  }

  return(ret)
}

#helper function, switching A and B to show that the other allele is affected.
reverseCall = function(call) {
  call = gsub('A', 'a', call)
  call = gsub('B', 'A', call)
  call = gsub('a', 'B', call)
  return(call)
}

#enforces transitivity on a subclone matrix by taking the transitive closure
enforceTransitive = function(mx) {
  for ( col in 1:ncol(mx) ) {
    mx[,col] = mx[,col] | rowsums(mx[,mx[,col],drop=F]) > 0
  }
  return(mx)
}

#add the filtered stories to the clone stories if consistent.
mergeStories = function(clusteredStories, filteredStories, germlineVariants=c()) {
  germlineOnly = rownames(filteredStories) %in% germlineVariants
  distance = sapply(1:nrow(clusteredStories$cloneStories), function(cloneRow) {
    isGermlineClone = rownames(clusteredStories$cloneStories)[cloneRow] == 'germline'
    sapply(1:nrow(filteredStories), function(storyRow) {
      ret = cloneStoryRMS(clusteredStories$cloneStories[cloneRow,], filteredStories[storyRow,])
      if ( germlineOnly[storyRow] & !isGermlineClone ) ret = Inf
      #be extra generous for rare germline variants matching the germline clone.
      if ( germlineOnly[storyRow] & isGermlineClone ) ret = ret/2
      return(ret)
    })
  })
  #if either as only one row, make it a 1-row or 1-column matrix.
  if ( !inherits(distance, 'matrix') )
    distance = matrix(distance, nrow = nrow(filteredStories))
  closestDistance = apply(distance, 1, min)
  toMerge = which(closestDistance <= 1)
  bestClone = sapply(toMerge, function(row) which(distance[row,] == closestDistance[row])[1])
  for ( i in 1:length(toMerge) )
    clusteredStories$storyList[[bestClone[i]]] = c(clusteredStories$storyList[[bestClone[i]]], rownames(filteredStories)[toMerge[i]])

  return(clusteredStories)
}

#distance between a story and a clone
cloneStoryRMS = function(clone, story, systematicVariance=0.02) {
  errors = sqrt(clone$errors^2+story$errors^2)+0.02
  sigmas = (clone$stories - story$stories)/errors
  rms = sqrt(mean(sigmas^2))
  return(rms)
}

combineStories = function(stories1, stories2) {
  ret = data.frame(row.names=c(rownames(stories1), rownames(stories2)), stringsAsFactors=F)
  ret$x1 = c(stories1$x1, stories2$x1)
  ret$x2 = c(stories1$x2, stories2$x2)
  ret$call = c(as.character(stories1$call), as.character(stories2$call))
  ret$stories = as.matrix(rbind(stories1$stories, stories2$stories))
  ret$errors = as.matrix(rbind(stories1$errors, stories2$errors))
  rownames(ret$stories) = rownames(ret$errors) = rownames(ret)
  return(ret)
}


addTheoreticalCNVerrors = function(cnvs) {
  for ( sample in names(cnvs) ) {
    clusters = cnvs[[sample]]$clusters
    eFreqs = cnvs[[sample]]$eFreqs
    eFreqs$var = mirrorDown(eFreqs$var, eFreqs$cov)
    for ( row in 1:nrow(clusters) ) {
      call = clusters$call[row]
      if ( grepl('\\?', call) ) next
      if ( call == 'AB' ) next
      efs = eFreqs[eFreqs$x > clusters$x1[row] & eFreqs$x < clusters$x2[row],]

      theoreticalError = getTheoreticalError(clusters[row,], clusters$call[row], efs)
      clusters[row,]$clonalityError = sqrt(clusters[row,]$clonalityError^2 + theoreticalError^2)
    }

    cnvs[[sample]]$clusters = clusters
  }

  return(cnvs)
}

getTheoreticalError = function(cluster, call, efs) {
  isCall = isCNV(cluster, efs, callTofM(call)['M'],
    callTofM(call)['f'], callPrior(call), 5)
  bestSigma = isCall$sigma
  bestClonality = isCall$clonality
  
  secondBestCall = ''
  secondBestSigma = Inf
  secondBestClonality = 0
  for ( tryCall in allCalls()[allCalls() != call] ) {
    iscnv = isCNV(cluster, efs, callTofM(tryCall)['M'],
      callTofM(tryCall)['f'], callPrior(tryCall), bestSigma)
    if ( secondBestCall == '' || iscnv$sigma < secondBestSigma ) {
      secondBestCall = tryCall
      secondBestSigma = iscnv$sigma
      secondBestClonality = iscnv$clonality
    }
  }
  
  theoreticalError = abs(secondBestClonality - bestClonality)/
    (1 + sqrt(noneg(max(bestSigma, secondBestSigma)^2-bestSigma^2)))

  return(theoreticalError)
}


getBestSigma = function(cluster, call, efs) {
  bestSigma = Inf
  for ( tryCall in allCalls() ) {
    iscnv = isCNV(cluster, efs, callTofM(tryCall)['M'],
      callTofM(tryCall)['f'], callPrior(tryCall), 5)
    if ( iscnv$sigma < bestSigma ) {
      bestSigma = iscnv$sigma
    }
  }
  
  return(bestSigma)
}


makeTreeConsistent = function(cloneTree, cloneStories, storyList) {

  toRemove = findFirstUnitarityViolation(cloneTree, cloneStories, storyList)
  if ( length(toRemove) == 0 ) return(cloneTree)
  
  #remove story from tree.
  cloneStories = cloneStories[rownames(cloneStories) != toRemove,]
  storyList = storyList[names(storyList) != toRemove]

  cloneTree = findCloneTree(cloneStories)

  #recur until nothing needs to be removed
  cloneTree = makeTreeConsistent(cloneTree, cloneStories, storyList)
  
  return(cloneTree)
  
}

#looks recursively for unitarity violations in the clone tree, and returns the first main suspect of
#false clone that cause unitarity violation. returns empty vector if consistent with unitarity.
findFirstUnitarityViolation = function (cloneTree, cloneStories, storyList) {
  if (length(cloneTree) == 0) 
    return(c())
  for (name in names(cloneTree)) {
    if (length(cloneTree[[name]]) == 0) {
      next
    }
    whichViolating = whichUnitarityViolating(cloneStories[names(cloneTree[[name]]),], cloneStories[name,])
    if (length(whichViolating) > 0) {
      dodgyness = superFreq:::getDodgyness(storyList[whichViolating], 
        cloneStories[whichViolating,])
      suspect = whichViolating[which.max(dodgyness)]
      return(suspect)
    }
    deeperSuspect = findFirstUnitarityViolation(cloneTree[[name]], 
      cloneStories, storyList)
    if (length(deeperSuspect) > 0) {
      return(deeperSuspect)
    }
  }
  return(c())
}


#check which, if any, of the immediate subclones contribute to unitarity violation.
#returns a character vector of the contributing subclones, empty vector if consistent with unitarity.
whichUnitarityViolating = function(cloneStories, parentStory) {
  sampleTooHigh = superFreq:::colsums(superFreq:::noneg(cloneStories$stories - cloneStories$errors*1.5)) > (parentStory$stories+parentStory$errors*1.5)[1,]
  if ( !any(sampleTooHigh) ) return(c())
  contributing = superFreq:::rowsums(t(t(cloneStories$stories -
  cloneStories$errors*1.5 > 0)*sampleTooHigh)) > 0
  return(names(contributing)[contributing])
}


getDodgyness = function(storyList, stories) {
  dodgyness = sapply(names(storyList), function(name) {
    mutations = storyList[[name]]
    if ( name == 'germline' ) return(0)
    Nmut = length(mutations)
    Ncna = sum(grepl('^chr', mutations))
    Nindel = sum(grepl('[+-]', mutations))
    Nsnv = Nmut - Ncna - Nindel

    #few mutations overall
    ret = 1/Nmut
    
    #high indel/snv ratio
    ret = ret + Nindel/(Nsnv+Nindel+1)

    #close-by SNVs (less than 10kbp)
    if ( Nsnv > 1 ) {
      snvX = as.numeric(gsub('[A-Z]', '', mutations[!grepl('^chr', mutations) & !grepl('[+-]', mutations)]))
      snvX = sort(snvX)
      closeBy = snvX[2:length(snvX)] - snvX[1:(length(snvX)-1)] < 1e4
      ret = ret + noneg(sum(closeBy)/length(closeBy) - 0.1)
    }
    
    #high cna/snv ratio
    ret = ret + Ncna/(Ncna+Nsnv+1)

    #less dodgy if both CNA and SNV support
    ret = noneg(ret - (Ncna > 0 & Nsnv > 0)*0.2)

    #noise tends to be reasonably constant over samples
    highestLow = max(stories[name,]$stories - stories[name,]$errors, na.rm=T)
    lowestHigh = min(stories[name,]$stories + stories[name,]$errors, na.rm=T)
    significantChange = noneg(highestLow - lowestHigh)
    ret = ret + noneg(0.5 - significantChange)*2

    return(ret)
  })
  return(dodgyness)
}

clonesInTree = function(tree) {
  if ( length(tree) == 0 ) return('')

  namesOnThisLevel = names(tree)
  namesOnDeeperLevels = unlist(sapply(tree, clonesInTree))
  namesOnDeeperLevels = namesOnDeeperLevels[namesOnDeeperLevels != '']

  return(c(namesOnThisLevel, namesOnDeeperLevels))
}

renameClones = function(clusters) {
  before = clonesInTree(clusters$cloneTree)
  after = as.character(1:length(before))
  if ( before[1] == 'germline' )
    after = c('germline', 1:(length(before)-1))
  names(after) = before

  clusters$cloneTree = renameCloneTree(clusters$cloneTree, after)
  rownames(clusters$cloneStories) = after[rownames(clusters$cloneStories)]
  rownames(clusters$cloneStories$stories) = after[rownames(clusters$cloneStories$stories)]
  rownames(clusters$cloneStories$errors) = after[rownames(clusters$cloneStories$errors)]
  names(clusters$storyList) = after[names(clusters$storyList)]

  return(clusters)
}


renameCloneTree = function(tree, newNames) {
  if ( length(tree) == 0 ) return(tree)

  for ( i in 1:length(tree) ) {
    names(tree)[i] = newNames[names(tree)[i]]
    tree[[i]] = renameCloneTree(tree[[i]], newNames)
  }
  return(tree)
}
ChristofferFlensburg/superFreq documentation built on Nov. 15, 2023, 6:15 a.m.