#
# This file contains various functions to assign mutations to clusters
#

#' Identify clusters and assign mutations for 1-dimensional clustering
#' 
#' @param samplename Sample identifier, used for writing out data to disk
#' @param subclonal.fraction Cancer cell fraction estimates for each mutation
#' @param GS.data Output from the MCMC chain
#' @param density Posterior density estimate of where clusters are located
#' @param no.iters Number of total iterations the MCMC chain was run for
#' @param no.iters.burn.in Number of iterations to discard as burn in
#' @return Standardised mutation clustering output, including clusters, mutation assignments and likelihoods
#' @author dw9, sd11
oneDimensionalClustering <- function(samplename, subclonal.fraction, GS.data, density, no.iters, no.iters.burn.in) {
  no.muts = length(subclonal.fraction)
  normal.copy.number = rep(2,no.muts)
  post.burn.in.start = no.iters.burn.in
  
  S.i = GS.data$S.i
  V.h = GS.data$V.h
  pi.h = GS.data$pi.h[,,1]
  
  # Obtain local optima and peak indices
  res = getLocalOptima(density, hypercube.size=5)
  localOptima = res$localOptima
  peak.indices = res$peak.indices
  write.table(localOptima,paste(samplename,"_localOptima.txt",sep=""), quote=F, sep="\t")
  
  # Assign mutations to clusters
  no.optima = length(localOptima)
  if (no.optima>1) {  
    boundary = array(NA,no.optima-1)
    mutation.preferences = array(0,c(no.muts,no.optima))
    for(i in 1:(no.optima-1)){
      min.density = min(density$median.density[(peak.indices[i]+1):(peak.indices[i+1]-1)])
      min.indices = intersect(which(density$median.density == min.density),(peak.indices[i]+1):(peak.indices[i+1]-1))
      
      #the boundary between two peaks is the midpoint (in CCF) of the minimum-density region between them
      boundary[i] = (density$fraction.of.tumour.cells[max(min.indices)] + density$fraction.of.tumour.cells[min(min.indices)])/2
    }
    
    sampledIters = (no.iters.burn.in + 1) : no.iters
    #don't use the initial state
    sampledIters = sampledIters[sampledIters!=1]	
    if(length(sampledIters) > 1000){
      sampledIters=floor(post.burn.in.start + (1:1000) * (no.iters - no.iters.burn.in)/1000)  
    }
    
    S.i = data.matrix(S.i)
    for(s in sampledIters){
      temp.preferences = array(0,c(no.muts,no.optima))
      for(c in unique(S.i[s,])){
        bestOptimum = sum(pi.h[s,c]>boundary)+1
        temp.preferences[S.i[s,]==c,bestOptimum] = temp.preferences[S.i[s,]==c,bestOptimum] + 1		
      }
      iter.preferences = t(apply(temp.preferences, 1, function(p) { as.integer(p==max(p)) / sum(p==max(p)) }))
      mutation.preferences = mutation.preferences + iter.preferences
    }
    mutation.preferences = mutation.preferences / length(sampledIters)
    
    # Drop clusters with all probs for mutations zero
    not.is.empty = !apply(mutation.preferences, 2, function(x) { all(x==0) })
    mutation.preferences = mutation.preferences[, not.is.empty, drop=F]
    localOptima = localOptima[not.is.empty]
    no.optima = length(localOptima)
    
    # Save the cluster assignment probabilities table
    most.likely.cluster = max.col(mutation.preferences)
    out = cbind(mutation.preferences, most.likely.cluster)
    colnames(out)[(ncol(out)-no.optima):ncol(out)] = c(paste("prob.cluster", 1:ncol(mutation.preferences),sep=""),"most.likely.cluster")
    write.table(out, paste(samplename,"_DP_and_cluster_info.txt",sep=""), sep="\t", row.names=F, quote=F)

    # Assemble a table with mutation assignments to each cluster
    cluster_assignment_counts = sapply(1:ncol(mutation.preferences), function(x, m) { sum(m==x) }, m=most.likely.cluster) #table(most.likely.cluster)
    cluster_locations = array(NA, c(length(cluster_assignment_counts), 3))
    cluster_locations[,1] = 1:length(cluster_assignment_counts)
    cluster_locations[,2] = localOptima
    cluster_locations[,3] = cluster_assignment_counts
    
    # Keep a record of all clusters, in case it is of interest
    write.table(cluster_locations, paste(samplename,"_optimaInfo.txt",sep=""), col.names=c("cluster.no","location","no.of.mutations"), row.names=F, sep="\t", quote=F)		
    
    # Clear clusters with no mutations assigned
    non_empty_clusters = which(cluster_locations[,3] > 0)
    cluster_locations = cluster_locations[non_empty_clusters,,drop=F]
    mutation.preferences = mutation.preferences[,non_empty_clusters, drop=F]
    mutation.preferences = mutation.preferences / rowSums(mutation.preferences)
    
    # Sort clusters by CCF
    clust_order = order(cluster_locations[,2], decreasing=T)
    cluster_locations = cluster_locations[clust_order,,drop=F]
    cluster_locations[,1] = 1:nrow(cluster_locations)
    mutation.preferences = mutation.preferences[,clust_order, drop=F]
    
    # get most likely cluster
    most.likely.cluster = max.col(mutation.preferences)
    
    # Obtain likelihood of most likely cluster assignments
    most.likely.cluster.likelihood = apply(mutation.preferences, 1, max)
    
  }else{
    warning("No local optima found when assigning mutations to clusters")
    most.likely.cluster = rep(1, no.muts)
    most.likely.cluster.likelihood = rep(1, no.muts)
    # define mutation.preferences here too, as the return statement below uses it
    mutation.preferences = matrix(1, nrow=no.muts, ncol=1)
    cluster_locations = NA
  }
  
  return(list(best.node.assignments=most.likely.cluster, best.assignment.likelihoods=most.likely.cluster.likelihood, cluster.locations=cluster_locations, all.assignment.likelihoods=mutation.preferences))
}
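
# A minimal sketch (not part of the package; all numbers invented) of the
# boundary logic used above: a sampled cluster position pi.h is mapped to a
# density peak by counting how many inter-peak boundaries it exceeds.
if (FALSE) {
  localOptima = c(0.3, 0.6, 1.0)   # hypothetical peak CCFs
  boundary = c(0.45, 0.8)          # hypothetical density minima between peaks
  sampled.position = 0.72          # a hypothetical pi.h draw from one iteration
  bestOptimum = sum(sampled.position > boundary) + 1
  localOptima[bestOptimum]         # 0.6, the peak this draw is assigned to
}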

#' Assign mutations for multi-dimensional clustering by adding clusters until the likelihood no longer improves
#' 
#' @param GS.data Output from the MCMC chain
#' @param mutCount Matrix containing counts of the mutated allele
#' @param WTCount Matrix containing counts of the wild type allele
#' @param subclonal.fraction Matrix with cancer cell fraction estimates for each mutation in each sample
#' @param node.assignments Matrix with assignments of mutations to clusters during MCMC (i.e. GS.data$S.i)
#' @param opts List with various parameters, including samplename and number of iterations
#' @return Standardised clustering output, including mutation assignments, likelihoods and cluster positions
#' @author dw9, sd11
mutation_assignment_em = function(GS.data, mutCount, WTCount, subclonal.fraction, node.assignments, opts) {
  
  # Unpack analysis options required
  no.iters = opts$no.iters
  no.iters.burn.in = opts$no.iters.burn.in
  no.iters.post.burn.in = opts$no.iters.post.burn.in
  outdir = opts$outdir
  subsamplenames = opts$subsamplenames
  samplename = opts$samplename
  
  # Disabled because it returns different values from the original code below
  #identity.strengths = build_coassignment_prob_matrix_densities(GS.data$S.i, GS.data$pi.h, no.iters.burn.in)
  #identity.strengths = identity.strengths*no.iters.post.burn.in
  
  print("Setting up the data")
  no.muts = nrow(mutCount)
  no.subsamples = ncol(mutCount)
  
  if(no.muts<=1){
    stop(paste(samplename," has only 0 or 1 mutations",sep=""))
  }
  
  # Determine mutation strengths across all iterations, discarding burnin
  identity.strengths = array(0,c(no.muts,no.muts))
  for(m in 1:(no.muts-1)){
    identity.strengths[m,m] = no.iters.post.burn.in
    for(n in (m+1):no.muts){
      identity.strengths[m,n] = identity.strengths[n,m] = sum(node.assignments[(1+no.iters-no.iters.post.burn.in):no.iters,m] == node.assignments[(1+no.iters-no.iters.post.burn.in):no.iters,n])
    }
  }
  identity.strengths[no.muts,no.muts] = no.iters.post.burn.in # the diagonal counts every retained iteration, matching the entries set above
  
  #initialise: assume all mutations are assigned to a single node, with mean subclonal fractions
  likelihoods = 0 
  #subclonal.fraction = mutCount/(mutCount+WTCount)
  #subclonal.fraction[is.nan(subclonal.fraction)]=0
  mean.subclonal.fractions = colMeans(subclonal.fraction)
  
  for(i in 1:no.muts){
    lfoy = log.f.of.y(mutCount[i,], mutCount[i,] + WTCount[i,], rep(1,no.subsamples), mean.subclonal.fractions)
    if(!is.nan(lfoy)){
      likelihoods <- likelihoods + lfoy
    }
  }
  
  print("Opening devices for plotting")
  pdf(file.path(outdir, paste(samplename, "_", no.iters, "iters_", no.iters.burn.in, "burnin_histograms.pdf",sep="")),height=4,width=4*no.subsamples)
  hist.device=dev.cur()
  par(mfrow=c(2,no.subsamples))   
  pdf(file.path(outdir, paste(samplename, "_", no.iters, "iters_", no.iters.burn.in, "burnin_densities.pdf",sep="")),height=4,width=4*no.subsamples)
  density.device=dev.cur()        
  par(mfrow=c(2,no.subsamples))
  if(no.subsamples>1){
    pdf(file.path(outdir, paste(samplename, "_", no.iters, "iters_", no.iters.burn.in, "burnin_consensus_scatter.pdf",sep="")),height=4,width=no.subsamples*(no.subsamples-1)*2)
    scatter.device=dev.cur()
    par(mfrow=c(1,no.subsamples*(no.subsamples-1)/2))
  }
  
  print("Initialising storage")
  consensus.assignments = rep(1,no.muts)
  no.nodes=1
  current.agreement = sum(identity.strengths)
  
  fractional.current.agreement = current.agreement/(no.muts*no.muts*no.iters.post.burn.in)
  print(paste("1 node:",current.agreement,fractional.current.agreement))
  
  #initialise all mutations in one node, so the number of pairwise agreements is just the number of times a pair of mutations appear in the same node
  pairwise.agreements = identity.strengths
  
  all.consensus.assignments = list()
  all.consensus.assignments[[no.nodes]] = consensus.assignments
  all.node.positions = list()
  all.node.positions[[no.nodes]] = array(mean.subclonal.fractions,c(1,no.subsamples))
  all.likelihoods = list()
  all.likelihoods[[no.nodes]] = matrix(rep(1, no.muts*no.subsamples), ncol=no.subsamples)
  
  print("Start adding nodes")
  node.added=T
  while(node.added){      
    unique.nodes = unique(consensus.assignments)
    no.nodes = length(unique.nodes)
    
    new.pairwise.agreements = pairwise.agreements
    new.consensus.assignments = consensus.assignments
    new.node = max(unique.nodes)+1
    new.unique.nodes = c(unique.nodes, new.node)
    
    # iteratively move muts to the new node or back again
    mut.moved=T
    count=1
    saved.consensus.assignments = new.consensus.assignments
    #we may need to avoid infinite cycling by checking whether a set of node assignments has been repeated
    while(mut.moved){
      count=count+1
      mut.moved=F
      rand.inds = sample(no.muts)
      for(r in rand.inds){
        old.agreement = sum(new.pairwise.agreements[r,])
        if(new.consensus.assignments[r]==new.node){
          new.ass = saved.consensus.assignments[r]
        }else{
          new.ass = new.node
        }
        temp.ass = new.consensus.assignments
        temp.ass[r] = new.ass
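        # Agreement for mutation r under the proposed assignment: same-node pairs
        # score the iterations they co-clustered, different-node pairs score the
        # iterations they were apart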
        new.agreement = sum(identity.strengths[r,temp.ass==new.ass]) + sum(no.iters.post.burn.in - identity.strengths[r,temp.ass!=new.ass])             
        
        if(new.agreement > old.agreement){
          mut.moved=T
          new.consensus.assignments[r] = new.ass
          new.pairwise.agreements[r,]=NA
          new.pairwise.agreements[,r]=NA
          new.pairwise.agreements[r,new.consensus.assignments==new.ass] = identity.strengths[r,new.consensus.assignments==new.ass]
          new.pairwise.agreements[new.consensus.assignments==new.ass,r] = identity.strengths[new.consensus.assignments==new.ass,r]
          new.pairwise.agreements[r,new.consensus.assignments!=new.ass] = no.iters.post.burn.in - identity.strengths[r,new.consensus.assignments!=new.ass]
          new.pairwise.agreements[new.consensus.assignments!=new.ass,r] = no.iters.post.burn.in - identity.strengths[new.consensus.assignments!=new.ass,r]                                        
          old.agreement = new.agreement
        }
      }
    }
    new.agreements = sum(new.pairwise.agreements)
    #don't move a whole node of mutations
    if(length(unique(new.consensus.assignments))<=no.nodes){
      new.agreements = 0
    }
    muts.to.move = which(new.consensus.assignments == new.node)
    if(new.agreements > current.agreement){
      consensus.assignments[muts.to.move] = new.node
      current.agreement = new.agreements
      pairwise.agreements = new.pairwise.agreements
      fractional.current.agreement = current.agreement/(no.muts*no.muts*no.iters.post.burn.in)
      
      #fairly crude - use mean position
      node.position = array(NA,c(no.nodes+1,no.subsamples))
      for(n in 1:(no.nodes+1)){
        for(s in 1:no.subsamples){
          node.position[n,s] = mean(subclonal.fraction[consensus.assignments==n,s])
        }
      }
      
      all.likelihoods[[no.nodes+1]] = calc.new.likelihood(mutCount, mutCount+WTCount, matrix(1, nrow=nrow(mutCount), ncol=ncol(mutCount)), node.position[consensus.assignments,])
      likelihoods = c(likelihoods, sum(all.likelihoods[[no.nodes+1]]))     
      
      all.consensus.assignments[[no.nodes+1]] = consensus.assignments
      all.node.positions[[no.nodes+1]] = node.position
      
      if(no.subsamples>1){
        #it's hard to distinguish more than 8 different colours
        max.cols = 8
        cols = rainbow(min(max.cols,no.nodes))
        dev.set(which = scatter.device)
        plot.data = subclonal.fraction
        plot.data[is.na(plot.data)]=0
        for(i in 1:(no.subsamples-1)){
          for(j in (i+1):no.subsamples){
            plot(plot.data[,i],plot.data[,j],type = "n",xlab = paste(samplename,subsamplenames[i]," allele fraction",sep=""), ylab = paste(samplename,subsamplenames[j]," allele fraction",sep=""),xlim = c(0,max(plot.data[,i])*1.2))
            for(n in 1:(no.nodes+1)){
              points(plot.data[,i][consensus.assignments==n],plot.data[,j][consensus.assignments==n],col=cols[(n-1) %% max.cols + 1],pch=20 + floor((n-1)/max.cols))
            }
            #legend(max(plot.data[,i]),max(plot.data[,j]),legend = unique.nodes[n],col=cols[(n-1) %% max.cols + 1],pch=20 + floor((n-1)/max.cols))
            legend(max(plot.data[,i])*1.05,max(plot.data[,j]),legend = 1:(no.nodes+1),col=cols[(0:(no.nodes-1)) %% max.cols + 1],pch=20 + floor((0:(no.nodes-1))/max.cols),cex=1)
          }
        }
      }                                               
    }else{
      node.added = F
    }
  }
  print("Done adding nodes, cleaning up and writing output/last figures")
  dev.off(which = hist.device)
  dev.off(which = density.device)
  if(no.subsamples>1){
    dev.off(which = scatter.device)
  }
  
  tree.sizes = 1:no.nodes
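  # Select the number of nodes with the lowest BIC, trading off likelihood against model size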
  BIC = bic(likelihoods, no.subsamples, tree.sizes, no.muts)

  best.BIC.index = which.min(BIC)
  # print("likelihoods and BIC")
  # print(cbind(likelihoods,BIC))
  # print(paste("best BIC index=",best.BIC.index,sep=""))
  
  if(no.subsamples>1){
    consensus.assignments = all.consensus.assignments[[best.BIC.index]]
    pdf(file.path(outdir, paste(samplename, "_", no.iters, "iters_", no.iters.burn.in, "burnin_bestScatter.pdf",sep="")),height=4,width=4)
    
    #it's hard to distinguish more than 8 different colours
    max.cols = 8
    cols = rainbow(min(max.cols,no.nodes))
    plot.data = subclonal.fraction
    plot.data[is.na(plot.data)]=0
    for(i in 1:(no.subsamples-1)){
      for(j in (i+1):no.subsamples){
        plot(plot.data[,i],plot.data[,j],type = "n",xlab = paste(samplename,subsamplenames[i]," subclonal fraction",sep=""), ylab = paste(samplename,subsamplenames[j]," subclonal fraction",sep=""),xlim = c(0,max(plot.data[,i])*1.25))
        for(n in 1:no.nodes){
          pch=20 + floor((n-1)/max.cols)
          #pch symbols above 25 are not implemented
          if(pch>25){
            pch=pch-20
          }
          points(plot.data[,i][consensus.assignments==n],plot.data[,j][consensus.assignments==n],col=cols[(n-1) %% max.cols + 1],pch=pch)
        }
        #legend(max(plot.data[,i]),max(plot.data[,j]),legend = unique.nodes[n],col=cols[(n-1) %% max.cols + 1],pch=20 + floor((n-1)/max.cols))
        pch=20 + floor((0:(no.nodes-1))/max.cols)
        pch[pch>25] = pch[pch>25]-20
        legend(max(plot.data[,i])*1.05,max(plot.data[,j]),legend = 1:no.nodes,col=cols[(0:(no.nodes-1)) %% max.cols + 1],pch=pch,cex=1)
      }
    }       
    dev.off()
    
    #
    # Following code commented out because the input for it is not (yet) available. For example of input, see:
    # /lustre/scratch109/sanger/dw9/2D_Dirichlet_Process/Lucy_heterogeneity/AllSampleSubsv0.4Jan23rd2014forDW9.txt
    #
    #     #png(paste("/nfs/team78pc11/dw9/Lucy_heterogeneity_23Jan2014_1000iters/",samplename,"_heterogeneity_linePlot.png",sep=""),width=2000,height=1500)
    #     png(paste(outdir, "/", samplename,"_heterogeneity_linePlot.png",sep=""),width=2000,height=1500)
    #     par(mar=c(10,6,2,2),cex=2)
    #     plot(rep(1:ncol(subclonal.fraction),nrow(subclonal.fraction)),c(subclonal.fraction),type="n",xlab = "sample",xaxt="n",ann = F,xlim=c(0.5,ncol(subclonal.fraction)+2))
    #     axis(1,at=1:ncol(subclonal.fraction),labels=paste(samplename,subsamplenames,sep=""),las=2)
    #     mtext(side = 1, text = "sample", line = 6,cex=3)
    #     mtext(side = 2, text = "allele fraction", line = 4,cex=3)
    #     for(i in 1:nrow(subclonal.fraction)){
    #       linetype = 3
    #       if(data$DRIVER_CATEGORY[i]=="ONCOGENIC"){
    #         linetype=1
    #       }else if(data$DRIVER_CATEGORY[i]=="POSSIBLE_ONCOGENIC"){
    #         linetype=2
    #       }
    #       lines(1:ncol(subclonal.fraction),subclonal.fraction[i,],col=cols[consensus.assignments[i]],lwd=3,lty=linetype)
    #       #points(1:ncol(subclonal.fraction),subclonal.fraction[i,],col=cols[consensus.assignments[i]],pch=20,cex=3)
    #       points(1:ncol(subclonal.fraction),subclonal.fraction[i,],col=cols[consensus.assignments[i]],pch=(i+14)%%25,cex=3)
    #     }
    #     #legend(ncol(subclonal.fraction)+0.2,max(subclonal.fraction),legend = data$geneRef,col=cols[consensus.assignments],pch=20 + floor((0:(no.nodes-1))/max.cols),cex=1)
    #     legend(ncol(subclonal.fraction)+0.2,max(subclonal.fraction),legend = data$geneRef,col=cols[consensus.assignments],pch=(14+(1:nrow(data)))%%25,cex=1)
    #     
    #     dev.off()               
    
  }
  most.likely.cluster = all.consensus.assignments[[best.BIC.index]]
  write.table(cbind(1:length(table(most.likely.cluster)), table(most.likely.cluster), all.node.positions[[best.BIC.index]]), paste(outdir, "/", samplename, "_optimaInfo.txt", sep=""), col.names=c("cluster.no", "no.muts.in.cluster", paste(samplename,subsamplenames,sep="")), sep="\t", quote=F, row.names=F)
  assignment_counts = table(most.likely.cluster)
  cluster.locations = data.frame(cbind(as.numeric(names(assignment_counts)), all.node.positions[[best.BIC.index]], assignment_counts), stringsAsFactors=F)
  return(list(best.node.assignments=most.likely.cluster, best.assignment.likelihoods=all.likelihoods[[best.BIC.index]], cluster.locations=cluster.locations, all.assignment.likelihoods=NA))
}
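
# A minimal sketch (invented data, not from the package) of the co-assignment
# matrix built at the top of mutation_assignment_em: identity.strengths[m,n]
# counts the retained iterations in which mutations m and n shared a cluster.
if (FALSE) {
  S.i = rbind(c(1, 1, 2),   # iteration 1: mutations 1 and 2 share a cluster
              c(1, 2, 2),   # iteration 2: mutations 2 and 3 share a cluster
              c(1, 1, 1))   # iteration 3: all three share a cluster
  no.muts = ncol(S.i)
  identity.strengths = sapply(1:no.muts, function(m)
    sapply(1:no.muts, function(n) sum(S.i[, m] == S.i[, n])))
  identity.strengths  # symmetric; the diagonal equals the number of iterations
}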

#' Establish clusters and assign mutations for multi-dimensional clustering using a multi-dimensional density
#' 
#' @param mutation.copy.number Mutation copy number values for all samples
#' @param copyNumberAdjustment Multiplicity values for all samples
#' @param GS.data Output from the MCMC chain containing mutation assignments, cluster positions and cluster weights
#' @param density.smooth Parameter that determines the amount of smoothing applied when establishing multi-dimensional density
#' @param opts List with parameters, including donorname (samplename), individual samplenames (subsamples) and iterations and burnin
#' @return List with various standardised clustering results, including cluster positions, assignments and probabilities
#' @author dw9, sd11
multiDimensionalClustering = function(mutation.copy.number, copyNumberAdjustment, GS.data, density.smooth, opts) {
  #
  # Uses clustering in multiple dimensions to obtain a likelihood across all iterations for each mutation.
  # The cluster to which a mutation is assigned most often is deemed the most likely destination.
  #
  
  # Unpack the opts
  samplename = opts$samplename
  subsamples = opts$subsamplenames
  no.iters = opts$no.iters
  burn.in = opts$no.iters.burn.in
  new_output_folder = opts$outdir
  
  post.burn.in.start = burn.in
  no.subsamples = length(subsamples)
  no.muts = nrow(mutation.copy.number)
  
  # Get multi-D density
  density.out = Gibbs.subclone.density.est.nd(mutation.copy.number/copyNumberAdjustment,GS.data,density.smooth,burn.in+1,no.iters,max.burden = 1.5)
  
  range = density.out$range
  gridsize = density.out$gridsize
  median.density = density.out$median.density
  lower.CI = density.out$lower.CI
  
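  # Helper (defined inline): translate the hypercube whose lower corner sits at
  # lastMin into linear indices into the density grid (column-major, first
  # dimension fastest, matching R's array layout)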
  getHypercubeIndices<-function(gridsize,lastMin,hypercube.size){
    indices = array(0,(2*hypercube.size+1)^length(lastMin))
    pos.within.hypercube = array(0,c((2*hypercube.size+1)^length(lastMin),length(lastMin)))
    for(i in 1:length(lastMin)){
      pos.within.hypercube[,i]=rep(0:(2*hypercube.size),each=(2*hypercube.size+1)^(i-1), times = (2*hypercube.size+1)^(length(lastMin)-i))
    }
    indices = pos.within.hypercube[,1] + lastMin[1]
    for(i in 2:length(lastMin)){
      indices = indices + (lastMin[i]+pos.within.hypercube[,i]-1) * prod(gridsize[1:(i-1)])
    }
    return(indices)
  }
  
  hypercube.size = 2
  
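  # Helper (defined inline): advance lastMin odometer-style over all valid
  # hypercube start positions, returning NULL once the grid is exhausted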
  getNextHyperCube<-function(gridsize,lastMin,hypercube.size){
    current.dimension = length(gridsize)
    lastMin[current.dimension] = lastMin[current.dimension] + 1
    while(T){
      if(lastMin[current.dimension]==gridsize[current.dimension] - 2 * hypercube.size + 1){
        if(current.dimension==1){
          return(NULL)
        }
        lastMin[current.dimension] = 1
        current.dimension = current.dimension-1
        lastMin[current.dimension] = lastMin[current.dimension] + 1
      }else{
        return(lastMin)
      }
    }
  }
  
  localMins = array(NA,c(0,no.subsamples))
  above95confidence = NULL
  lastMin = rep(1,no.subsamples)
  
  # The while loop below starts at the first grid position, so checking it
  # separately beforehand would record that position twice
  while(!is.null(lastMin)){
    if(median.density[rbind(lastMin+hypercube.size)]>0 & median.density[rbind(lastMin+hypercube.size)] == max(median.density[getHypercubeIndices(gridsize,lastMin,hypercube.size)])){
      localMins = rbind(localMins,lastMin)
      above95confidence = c(above95confidence,lower.CI[rbind(lastMin+hypercube.size)]>0)
    }
    lastMin = getNextHyperCube(gridsize,lastMin,hypercube.size)
  }
  
  localMins = localMins + hypercube.size
  localOptima = array(rep(range[,1],each=no.subsamples),dim(localMins))
  localOptima = localOptima + array(rep((range[,2] - range[,1])/(gridsize-1),each=no.subsamples),dim(localMins)) * localMins
  write.table(cbind(localOptima,above95confidence),paste(new_output_folder,"/",samplename,"_localMultidimensionalOptima_",density.smooth,".txt",sep=""),quote=F,sep="\t",row.names=F,col.names = c(paste(samplename,subsamples,sep=""),"above95percentConfidence"))
  write.table(localOptima[above95confidence,,drop=F],paste(new_output_folder,"/",samplename,"_localHighConfidenceMultidimensionalOptima_",density.smooth,".txt",sep=""),quote=F,sep="\t",row.names=F,col.names = paste(samplename,subsamples,sep=""))
  
  no.optima = nrow(localOptima)
  
  if(no.optima>1){        
    stepsize = (range[,2]-range[,1])/(gridsize-1)
    peak.indices = round((localOptima - rep(range[,1],each = nrow(localOptima))) / rep(stepsize,each = nrow(localOptima)))
    
    #peak.heights are not currently used, but they could be used to construct a mixture model
    #and then estimate posterior probs of belonging to each cluster
    peak.heights = median.density[peak.indices]
    
    #if T, j is 'above' i, relative to the plane through the origin
    vector.direction = array(NA,c(no.optima,no.optima))
    
    boundary = array(NA,c(no.optima,no.optima))
    vector.length = array(NA,c(no.optima,no.optima))
    plane.vector = array(NA,c(no.optima,no.optima,no.subsamples+1))
    for(i in 1:(no.optima-1)){
      for(j in (i+1):no.optima){
        #coefficients describe a plane ax + by + cz + d = 0,
        #perpendicular to the line between optimum i and optimum j, passing through optimum i
        plane.vector[i,j,1:no.subsamples] = localOptima[j,] - localOptima[i,]
        plane.vector[i,j,no.subsamples+1] = -sum(plane.vector[i,j,1:no.subsamples] * localOptima[i,])
        
        #not sure this is needed - it may always be true
        vector.direction[i,j] = sum(plane.vector[i,j,1:no.subsamples] * localOptima[j,]) > sum(plane.vector[i,j,1:no.subsamples] * localOptima[i,])
        #this is needed, in order to normalise the distance
        vector.length[i,j] = sqrt(sum(plane.vector[i,j,1:no.subsamples]^2))
        
        longest.dimension = which.max(abs(plane.vector[i,j,1:no.subsamples])/stepsize)                  
        no.steps = max(abs(plane.vector[i,j,1:no.subsamples])/stepsize) - 1
        #how long is each step?
        Euclidean.stepsize = sqrt(sum((plane.vector[i,j,1:no.subsamples])^2)) / (no.steps + 1)
        
        step.coords = t(sapply(1:no.steps,function(o,x){o[i,] + x * (o[j,] - o[i,]) / (no.steps + 1)},o=localOptima))
        step.indices = round((step.coords - rep(range[,1],each = nrow(step.coords))) / rep(stepsize,each = nrow(step.coords)))
        densities.on.line = median.density[step.indices]
        min.indices = which(densities.on.line == min(densities.on.line))
        #what distance along the line between a pair of optima do we have to go to reach the minimum density
        boundary[i,j] = (max(min.indices) + min(min.indices))/2 * Euclidean.stepsize
        #boundary[j,i] = Euclidean.stepsize * (no.steps+1) - boundary[i,j]
      }
    }
    #boundary = boundary - plane.vector[,,no.subsamples + 1] # make distance relative to a plane through the origin
    boundary = boundary - plane.vector[,,no.subsamples + 1] / vector.length # 020714 - normalise adjustment
    
    mutation.preferences = array(0,c(no.muts,no.optima))
    
    sampledIters = (burn.in + 1) : no.iters
    #don't use the initial state
    sampledIters = sampledIters[sampledIters!=1]            
    if(length(sampledIters) > 1000){
      sampledIters=floor(post.burn.in.start + (1:1000) * (no.iters - burn.in)/1000)                   
    }
    
    S.i = data.matrix(GS.data$S.i)
    for(s in sampledIters){
      temp.preferences = array(0,c(no.muts,no.optima))
      for(c in unique(S.i[s,])){
        for(i in 1:(no.optima-1)){
          for(j in (i+1):no.optima){
            distance.from.plane = sum(GS.data$pi.h[s,c,] * plane.vector[i,j,1:no.subsamples]) / vector.length[i,j]
            if(distance.from.plane<=boundary[i,j]){
              if(vector.direction[i,j])
              {
                temp.preferences[S.i[s,]==c,i] = temp.preferences[S.i[s,]==c,i] + 1
              }else{
                temp.preferences[S.i[s,]==c,j] = temp.preferences[S.i[s,]==c,j] + 1
              }
            }else{
              if(vector.direction[i,j])
              {
                temp.preferences[S.i[s,]==c,j] = temp.preferences[S.i[s,]==c,j] + 1
              }else{
                temp.preferences[S.i[s,]==c,i] = temp.preferences[S.i[s,]==c,i] + 1
              }
            }
          }
        }                       
      }
      iter.preferences = t(sapply(1:no.muts,function(p,i){as.integer(p[i,]==max(p[i,])) / sum(p[i,]==max(p[i,]))},p=temp.preferences))
      mutation.preferences = mutation.preferences + iter.preferences
    }
    mutation.preferences = mutation.preferences / length(sampledIters)
    most.likely.cluster = sapply(1:no.muts,function(m,i){which.max(m[i,])},m=mutation.preferences)
    assignment.likelihood = sapply(1:no.muts,function(m,c,i){ m[i,c[i]] },m=mutation.preferences, c=most.likely.cluster)
    
    #get confidence intervals and median
    #subclonal.fraction = data.matrix(mutation.copy.number/copyNumberAdjustment)
    #no.perms = 10000
    #sampled.vals = array(0,c(no.perms,no.optima,no.subsamples))
    #no.muts.per.cluster = array(0,c(no.perms,no.optima))
    #for(p in 1:no.muts){
    #       print(p)
    #       sampled.cluster = sample(1:no.optima,no.perms,mutation.preferences[p,],replace=T)
    #       for(c in unique(sampled.cluster)){
    #               sampled.vals[sampled.cluster == c,c,] = sampled.vals[sampled.cluster == c,c,] + rep(subclonal.fraction[p,],each = sum(sampled.cluster == c))
    #               no.muts.per.cluster[sampled.cluster == c,c] = no.muts.per.cluster[sampled.cluster == c,c] + 1
    #       }
    #}
    #sampled.vals = sampled.vals/rep(no.muts.per.cluster,times = no.subsamples)
    #quantiles = array(NA, c(no.optima,no.subsamples,3))
    #for(c in 1:no.optima){
    #       for(s in 1:no.subsamples){
    #               quantiles[c,s,] = quantile(sampled.vals[,c,s],probs=c(0.025,0.5,0.975),na.rm=T)
    #       }
    #}
    #new method - 180714
    #quantiles = array(NA, c(no.optima,no.subsamples,3))
    #sampled.thetas = list()
    #totals = table(factor(most.likely.cluster,levels = 1:no.optima))
    #for(i in 1:no.optima){
    #       sampled.thetas[[i]] = array(NA,c(length(sampledIters)*totals[i],no.subsamples))
    #       for(s in 1:length(sampledIters)){
    #               sampled.thetas[[i]][((s-1)*totals[i]+1):(s*totals[i]),] = pi.h[sampledIters[s],S.i[sampledIters[s],most.likely.cluster==i],]
    #       }
    #       for(s in 1:no.subsamples){
    #               quantiles[i,s,] = quantile(sampled.thetas[[i]][,s],probs=c(0.025,0.5,0.975),na.rm=T)
    #       }
    #}
    #new method - 210714 - should be intermediate between previous methods
    quantiles = array(NA, c(no.optima,no.subsamples,3))
    sampled.thetas = list()
    totals = table(factor(most.likely.cluster,levels = 1:no.optima))
    for(i in 1:no.optima){
      sampled.thetas[[i]] = array(NA,c(length(sampledIters),totals[i],no.subsamples))
      for(s in 1:length(sampledIters)){
        sampled.thetas[[i]][s,,] = GS.data$pi.h[sampledIters[s],S.i[sampledIters[s],most.likely.cluster==i],]
      }
      for(s in 1:no.subsamples){
        median.sampled.vals = sapply(1:length(sampledIters),function(x){median(sampled.thetas[[i]][x,,s])})
        quantiles[i,s,] = quantile(median.sampled.vals,probs=c(0.025,0.5,0.975),na.rm=T)
      }
    }
    
    CIs = array(quantiles[,,c(1,3)],c(no.optima,no.subsamples*2))
    CIs = CIs[,rep(1:no.subsamples,each=2) + rep(c(0,no.subsamples),no.subsamples)]
    #out = cbind(data[[1]][,1:2],mutation.preferences,most.likely.cluster)
    #names(out)[(ncol(out)-no.optima):ncol(out)] = c(paste("prob.cluster",1:no.optima,sep=""),"most.likely.cluster")
    out = cbind(mutation.preferences,most.likely.cluster)
    colnames(out) = c(paste("prob.cluster",1:no.optima,sep=""),"most.likely.cluster")
    
    cluster.locations = cbind(1:ncol(mutation.preferences), quantiles[,,2], colSums(mutation.preferences), table(factor(most.likely.cluster,levels = 1:no.optima)))
    write.table(cluster.locations,
                paste(new_output_folder,"/",samplename,"_optimaInfo_",density.smooth,".txt",sep=""),
                col.names = c("cluster.no", paste(samplename, subsamples, sep=""), "estimated.no.of.mutations", "no.of.mutations.assigned"),
                row.names=F,
                sep="\t",
                quote=F)
    
    write.table(out,paste(new_output_folder,"/",samplename,"_DP_and_cluster_info_",density.smooth,".txt",sep=""),sep="\t",row.names=F,quote=F)
    write.table(CIs,paste(new_output_folder,"/",samplename,"_confInts_",density.smooth,".txt",sep=""),col.names = paste(rep(paste(samplename,subsamples,sep=""),each=2),rep(c(".lower.CI",".upper.CI"),no.subsamples),sep=""),row.names=F,sep="\t",quote=F)
  }else{
    most.likely.cluster = rep(1,no.muts)
    cluster.locations = matrix(NA, nrow=1, ncol=length(subsamples)+3)
    cluster.locations[1,1] = 1
    cluster.locations[1,2:(length(subsamples)+1)] = localOptima[1,]
    cluster.locations[1,(length(subsamples)+2):ncol(cluster.locations)] = c(no.muts, no.muts)
    mutation.preferences = data.frame(rep(1, no.muts))
    #cluster.locations = as.data.frame(cluster.locations)
    #colnames(cluster.locations) = c("cluster.no", paste(samplename, subsamples, sep=""), "estimated.no.of.mutations", "no.of.mutations.assigned")
    warning("No local optima found")
  }
  
  # Remove the estimated number of SNVs per cluster from the cluster locations table  
  cluster.locations = as.data.frame(cluster.locations[,c(1:(length(subsamples)+1),ncol(cluster.locations)), drop=F])
  # Report only the clusters that have mutations assigned
  non_empty_clusters = cluster.locations[,ncol(cluster.locations)] > 0
  cluster.locations = cluster.locations[non_empty_clusters,,drop=F]
  mutation.preferences = mutation.preferences[,non_empty_clusters, drop=F]
  mutation.preferences = mutation.preferences / rowSums(mutation.preferences)
  
  # Sort clusters by summed CCF (as a proxy for most clonal cluster first)
  clust_order = order(rowSums(cluster.locations[,2:(ncol(cluster.locations)-1)]), decreasing=T)
  cluster.locations = cluster.locations[clust_order,,drop=F]
  cluster.locations[,1] = 1:nrow(cluster.locations)
  mutation.preferences = mutation.preferences[,clust_order, drop=F]

  # get most likely cluster
  most.likely.cluster = max.col(mutation.preferences)

  # Obtain likelihood of most likely cluster assignments
  assignment.likelihood = sapply(1:no.muts,function(m,c,i){ m[i,c[i]] },m=mutation.preferences, c=most.likely.cluster)
  
  no.optima = nrow(cluster.locations)
  pdf(paste(new_output_folder,"/",samplename,"_most_likely_cluster_assignment_",density.smooth,".pdf",sep=""),height=4,width=4)
  #it's hard to distinguish more than 8 different colours
  max.cols = 8
  cols = rainbow(min(max.cols,no.optima))
  plot.data = mutation.copy.number/copyNumberAdjustment
  plot.data[is.na(plot.data)]=0
  for(i in 1:(no.subsamples-1)){
    for(j in (i+1):no.subsamples){
      plot(plot.data[,i],plot.data[,j],type = "n",xlab = paste(samplename,subsamples[i]," subclonal fraction",sep=""), ylab = paste(samplename,subsamples[j]," subclonal fraction",sep=""),xlim = c(0,max(plot.data[,i])*1.25))
      for(n in 1:no.optima){
        pch=20 + floor((n-1)/max.cols)
        #pch symbols above 25 are not implemented
        if(pch>25){
          pch=pch-20
        }
        points(plot.data[,i][most.likely.cluster==n],plot.data[,j][most.likely.cluster==n],col=cols[(n-1) %% max.cols + 1],pch=pch)
      }
      #legend(max(plot.data[,i]),max(plot.data[,j]),legend = unique.nodes[n],col=cols[(n-1) %% max.cols + 1],pch=20 + floor((n-1)/max.cols))
      #legend(max(plot.data[,i])*1.05,max(plot.data[,j]),legend = 1:no.optima,col=cols[(0:(no.optima-1)) %% max.cols + 1],pch=20 + floor((0:(no.optima-1))/max.cols),cex=1)
      pch=20 + floor((0:(no.optima-1))/max.cols)
      pch[pch>25] = pch[pch>25]-20
      legend(max(plot.data[,i])*1.05,max(plot.data[,j]),legend = 1:no.optima,col=cols[(0:(no.optima-1)) %% max.cols + 1],pch=pch,cex=1)
      
    }
  }       
  dev.off()

  return(list(best.node.assignments=most.likely.cluster, best.assignment.likelihoods=assignment.likelihood, all.assignment.likelihoods=mutation.preferences, cluster.locations=cluster.locations))
}
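
# A minimal sketch (invented 2D numbers, not from the package) of the
# hyperplane test used in multiDimensionalClustering: a sampled cluster
# position is projected onto the vector between two optima and compared
# against a boundary distance, here taken halfway between the optima rather
# than at the density minimum.
if (FALSE) {
  optimum.i = c(0.2, 0.3); optimum.j = c(0.8, 0.9)  # hypothetical optima
  v = optimum.j - optimum.i                         # direction between optima
  vlen = sqrt(sum(v^2))
  point = c(0.4, 0.5)                               # hypothetical pi.h draw
  distance.from.plane = sum(point * v) / vlen
  boundary = (sum(optimum.i * v) + sum(optimum.j * v)) / (2 * vlen)
  distance.from.plane <= boundary                   # TRUE: assigned to optimum.i
}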

#######################################################################################################################
# Binomial based mutation assignment
#######################################################################################################################

#' Note: This doesn't work better than the density-based approach
#'
#' Assign mutations to clusters by looking at the binomial probability of each cluster generating a mutation.
#' For now this only works with a single timepoint.
#' @noRd
mutation_assignment_binom = function(clustering_density, mutCount, WTCount, copyNumberAdjustment, tumourCopyNumber, normalCopyNumber, cellularity, samplename) {
  # Define convenience variables
  num.timepoints = ncol(mutCount)
  num.muts = nrow(mutCount)

  if (num.timepoints > 1) {
    # stop rather than quitting the whole R session
    stop("Assignment of mutations through binomial is only implemented for a single timepoint")
  }

  # Obtain peak locations within the given clustering density
  res = getLocalOptima(clustering_density, hypercube.size=5)
  cluster_locations = res$localOptima

  # Strip out clusters with a small density
  cluster_density = getClusterDensity(clustering_density, cluster_locations, min.window.density=1)
  # Take all clusters with at least 1% of the density
  cluster_locations = cluster_locations[cluster_density > 0.01]
  num.clusters = length(cluster_locations)

  # Calculate log likelihoods for each mutation to be part of each cluster location
  assignment_ll = array(NA, c(num.muts, num.clusters))
  for (t in 1:num.timepoints) {
    for (c in 1:num.clusters) {
      mutBurdens = mutationCopyNumberToMutationBurden(cluster_locations[c] * copyNumberAdjustment[,t], tumourCopyNumber[,t], cellularity[t], normalCopyNumber[,t])
      assignment_ll[,c] = sapply(1:num.muts, function(k, mc, wt, mb) {  mc[k]*log(mb[k]) + wt[k]*log(1-mb[k]) }, mc=mutCount[,t], wt=WTCount[,t], mb=mutBurdens)
    }
  }

  # Convert ll to prob
  assignment_probs = assignment_ll
  assignment_probs[is.na(assignment_probs)] = 0
  assignment_probs = t(apply(assignment_probs, 1, function(assignment_probs_k) { assignment_probs_k - max(assignment_probs_k) }))
  assignment_probs = exp(assignment_probs)
  assignment_probs = matrix(assignment_probs / rowSums(assignment_probs), ncol=num.clusters)

  # Hard assign mutations
  most.likely.cluster = sapply(1:num.muts, function(k, assignment_probs) { which.max(assignment_probs[k,]) }, assignment_probs=assignment_probs)
  assignment.likelihood = sapply(1:num.muts, function(k, assignment_probs, most.likely.cluster) { assignment_probs[k, most.likely.cluster[k]] }, assignment_probs=assignment_probs, most.likely.cluster=most.likely.cluster)

  # Save a table with the output as a summary
  cluster_assignments = table(most.likely.cluster)
  output = array(NA, c(length(cluster_locations), 3))
  
  for (c in 1:num.clusters) {
    cluster_id = as.character(c)

    output[c,1] = as.numeric(cluster_id)
    output[c,2] = cluster_locations[c]
    # Check if there are mutations assigned to the cluster, i.e. it's in the cluster_assignments table
    if (cluster_id %in% names(cluster_assignments)) {
      output[c,3] = cluster_assignments[names(cluster_assignments)==cluster_id]
    } else {
      output[c,3] = 0
    }
  }
  write.table(output, paste(samplename,"_optimaInfo.txt",sep=""), col.names=c("cluster.no","location","no.of.mutations"), row.names=F, sep="\t", quote=F)

  return(list(best.node.assignments=most.likely.cluster, best.assignment.likelihoods=assignment.likelihood, all.assignment.likelihoods=assignment_probs, cluster.locations=output))
}
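
# A minimal sketch (invented log-likelihoods) of the normalisation used above:
# subtracting the row maximum before exponentiating avoids numerical underflow,
# after which the values are rescaled into per-mutation probabilities.
if (FALSE) {
  ll = c(-1200, -1203, -1210)  # hypothetical per-cluster log-likelihoods
  p = exp(ll - max(ll))
  p / sum(p)                   # ~0.95, ~0.047, ~0.00004
}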

#' Function that fetches the local optima from a density function call output
#' 
#' @param cluster_density Density estimate of where clusters are located
#' @param hypercube.size Half-width of the window used when stepping through the density looking for local optima
#' @return A list with two fields: localOptima, the CCF locations of the peaks, and peak.indices, the corresponding indices into the density table
#' @author sd11, dw9
getLocalOptima = function(cluster_density, hypercube.size=20) {
  localOptima = NULL
  peak.indices = NULL
  for (i in (1+hypercube.size):(nrow(cluster_density)-hypercube.size)) {
    if (cluster_density$median.density[i] == max(cluster_density$median.density[(i-hypercube.size):(i+hypercube.size)])) {
      localOptima = c(localOptima, cluster_density$fraction.of.tumour.cells[i])
      peak.indices = c(peak.indices, i)
    }
  }
  return(list(localOptima=localOptima, peak.indices=peak.indices))
}
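
# A minimal sketch (invented density values) of the sliding-window peak
# detection above: a point is a local optimum when it carries the maximum
# median density within a window of +/- hypercube.size grid points.
if (FALSE) {
  toy.density = data.frame(
    fraction.of.tumour.cells = seq(0, 1, length.out=11),
    median.density = c(0, 1, 3, 1, 0, 2, 5, 2, 1, 0, 0))
  getLocalOptima(toy.density, hypercube.size=2)  # peaks at CCF 0.2 and 0.6
}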

#' Obtain the mean density of each cluster. 
#' 
#' This function takes the cluster_locations and, for each cluster, walks from
#' the cluster peak along CCF space in both directions until the median density
#' drops to or below the supplied minimum. After obtaining the CCF space that a
#' cluster takes up, it calculates the mean density in that space. Finally, the
#' densities are normalised across all clusters to obtain the fraction of total
#' density that each cluster represents.
#' @param clustering_density Posterior density estimate of where clusters are
#' @param cluster_locations Estimated cluster locations within the density
#' @param min.window.density Minimum median density; a cluster's window ends where the density drops to or below this value
#' @return A vector with the fraction of total density for each cluster
#' @author sd11, dw9
getClusterDensity = function(clustering_density, cluster_locations, min.window.density) {
  cluster_density = array(NA, length(cluster_locations))
  for (c in 1:length(cluster_locations)) {
    cluster_location = cluster_locations[c]
    x.cluster = which.min(abs(clustering_density$fraction.of.tumour.cells-cluster_location))
    # Obtain the left most x.axis point of the cluster
    run = T
    i = x.cluster
    while (run) {
      if (i!=0 && clustering_density[i,]$median.density > min.window.density) {
        i = i-1
      } else {
        run = F
      }
    }
    x.cluster.min = i
    
    # Obtain the right most x.axis point of the cluster
    run = T
    i = x.cluster
    while (run) {
      if (i!=(nrow(clustering_density)+1) && clustering_density[i,]$median.density > min.window.density) {
        i = i+1
      } else {
        run = F
      }
    }
    x.cluster.max = i
    
    # Take the average density
    cluster_density[c] = mean(clustering_density[x.cluster.min:x.cluster.max,]$median.density)
  }
  
  # Normalise the densities across the clusters
  cluster_density = cluster_density/sum(cluster_density)
  return(cluster_density)
}
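
# A minimal sketch (reusing the invented density from the previous example) of
# getClusterDensity: each peak is widened until the median density drops to
# min.window.density or below, and the mean densities of the resulting windows
# are normalised into fractions.
if (FALSE) {
  toy.density = data.frame(
    fraction.of.tumour.cells = seq(0, 1, length.out=11),
    median.density = c(0, 1, 3, 1, 0, 2, 5, 2, 1, 0, 0))
  getClusterDensity(toy.density, cluster_locations=c(0.2, 0.6),
                    min.window.density=1)  # ~0.45 and ~0.55
}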

##################################################################
# Confidence intervals and cluster order probabilities
##################################################################

#' Calculate confidence intervals on the cluster location
#' @param GS.data MCMC output with assignments and cluster locations
#' @param mut_assignments Final mutation to cluster assignments
#' @param clusterids Clusterids to run through
#' @param no.muts The total number of mutations
#' @param no.iters Total number of iterations
#' @param no.timepoints Total number of samples in this dataset
#' @param no.iters.burn.in Number of iterations to use as burn-in
#' @return A data.frame with the confidence intervals for each cluster
#' @author sd11
calc_cluster_conf_intervals = function(GS.data, mut_assignments, clusterids, no.muts, no.timepoints, no.iters, no.iters.burn.in) {
  assign_ccfs = get_snv_assignment_ccfs(GS.data$pi.h, GS.data$S.i, no.muts, no.timepoints, no.iters, no.iters.burn.in)
  cluster_intervals = data.frame()
  for (t in 1:no.timepoints) {
    for (i in 1:length(clusterids)) {
      # get all SNVs assigned to this cluster
      clusterid = clusterids[i]
      assigned = which(mut_assignments==clusterid)
      ccfs = assign_ccfs[, assigned, t]
      # Flatten the matrix
      dim(ccfs) = NULL
      quants = t(data.frame(quantile(ccfs, c(.05, .5, .95))))
      cluster_intervals = rbind(cluster_intervals, data.frame(clusterid=clusterid, timepoint=t, quants))
    }
  }
  return(cluster_intervals)
}
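
# A minimal sketch (invented CCF draws) of the interval summary above: the
# per-cluster confidence interval is the 5th, 50th and 95th percentile of the
# CCFs sampled for that cluster across the retained iterations.
if (FALSE) {
  set.seed(42)
  ccfs = rnorm(1000, mean=0.6, sd=0.05)  # hypothetical sampled cluster CCFs
  quantile(ccfs, c(.05, .5, .95))        # roughly 0.52, 0.60, 0.68
}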

#' Get a mutation preferences table given a density and cluster locations. The output
#' contains, for each MCMC iteration, the CCF of the preferred cluster, i.e. the cluster
#' to which the SNV would have been assigned during clustering if those were the cluster locations
get_mutation_preferences = function(GS.data, density, mut_assignments, clusterids, cluster_ccfs, no.muts, no.timepoints, no.iters, no.iters.burn.in) {
  sampledIters = (no.iters.burn.in+1):no.iters
  
  res = getLocalOptima(density, hypercube.size=5)
  localOptima = res$localOptima
  peak.indices = res$peak.indices
  
  # Check whether any peaks correspond to the supplied cluster locations
  peak_is_cluster = unlist(lapply(localOptima, function(x) any(abs(x-cluster_ccfs) < .Machine$double.eps ^ 0.5)))
  if (sum(peak_is_cluster)==0) {
    print("No corresponding cluster locations when calculating cluster order probs, this function only works with the density mutation assignment")
    dummy_matrix = array(0, c(length(sampledIters), no.muts, no.timepoints))
    return(dummy_matrix)
  }
  
  # Take only the already found clusters
  peak.indices = peak.indices[peak_is_cluster]
  localOptima = localOptima[peak_is_cluster]
  no.optima = length(localOptima)
  
  S.i = data.matrix(GS.data$S.i)
  pi.h = GS.data$pi.h #[,,1]
  
  if (no.optima == 1) {
    # If only a single optimum was found we assign all muts to that one cluster
    assign_ccfs = array(0, c(length(sampledIters), no.muts, no.timepoints))
    for (t in 1:no.timepoints) {
      for (s in sampledIters) {
        assign_ccfs[s-no.iters.burn.in, 1:no.muts, t] = localOptima[1]
      }
    }
      
  } else {
    
    boundary = array(NA,no.optima-1)
    for(i in 1:(no.optima-1)){
      min.density = min(density$median.density[(peak.indices[i]+1):(peak.indices[i+1]-1)])
      min.indices = intersect(which(density$median.density == min.density),(peak.indices[i]+1):(peak.indices[i+1]-1))
      
      #the boundary between two peaks is the midpoint (in CCF) of the minimum-density region between them
      boundary[i] = (density$fraction.of.tumour.cells[max(min.indices)] + density$fraction.of.tumour.cells[min(min.indices)])/2
    }
    
    # Get a table with the preferred cluster CCF
    assign_ccfs = array(0, c(length(sampledIters), no.muts, no.timepoints))
    for (t in 1:no.timepoints) {
      for (s in sampledIters) {
        for (c in unique(S.i[s,])) {
          bestOptimum = sum(pi.h[s, c, t]>boundary)+1
          assigned.muts = which(S.i[s,]==c)
          assign_ccfs[s-no.iters.burn.in, assigned.muts, t] = localOptima[bestOptimum] #pi.h[s, c, t]
        }
      }
    }
  }
  return(assign_ccfs)
}


#' Calculate for a pair of clusters whether one has a higher CCF than the other.
#' 
#' This function calculates cluster order probabilities by obtaining mutation preferences throughout the MCMC iterations for each provided cluster
#' and then classifies each pair of clusters into one of the groups: greater than / equal (GT-EQ), less than / equal (LT-EQ), greater than (GT),
#' less than (LT), equal (EQ) or uncertain. These classifications are obtained by sampling pairs of SNVs from either cluster and counting how often
#' SNV 1 is assigned a higher CCF in the preferences than SNV 2.
#' @param GS.data MCMC output with assignments and cluster locations
#' @param density Posterior density estimate of where clusters are located
#' @param mut_assignments Final mutation to cluster assignments
#' @param clusterids Clusterids to run through
#' @param cluster_ccfs CCFs of the provided clusters
#' @param no.muts The total number of mutations
#' @param no.timepoints Total number of samples in this dataset
#' @param no.iters Total number of iterations
#' @param no.iters.burn.in Number of iterations to use as burn-in
#' @param no.samples Number of SNV pairs to sample per cluster pair (default 1000)
#' @return A list with a classification array and the GT/LT/EQ probability arrays; each cell records whether the row cluster has a higher CCF than the column cluster, with samples along the third dimension
#' @author sd11
#' Note: This approach only works with the density based mutation assignment strategy
calc_cluster_order_probs = function(GS.data, density, mut_assignments, clusterids, cluster_ccfs, no.muts, no.timepoints, no.iters, no.iters.burn.in, no.samples=1000) {
  assign_ccfs = get_mutation_preferences(GS.data, density, mut_assignments, clusterids, cluster_ccfs, no.muts, no.timepoints, no.iters, no.iters.burn.in)
  
  num_clusters = length(clusterids)
  sampledIters = no.iters-no.iters.burn.in
  if (num_clusters > 1) {
    
    probs_gt = array(NA, c(length(clusterids), length(clusterids), no.timepoints))
    probs_lt = array(NA, c(length(clusterids), length(clusterids), no.timepoints))
    probs_eq = array(NA, c(length(clusterids), length(clusterids), no.timepoints))
    for (t in 1:no.timepoints) {
      for (c in 1:(num_clusters)) {
        # for (k in (c+1):num_clusters) {
        for (k in 1:(num_clusters)) {
          
          snvs_a = which(mut_assignments==clusterids[c])
          snvs_b = which(mut_assignments==clusterids[k])
          
          if (length(snvs_a)==1) {
            sampled_a = rep(snvs_a, no.samples)
          } else {
            sampled_a = sample(snvs_a, no.samples, replace=T)
          }
          if (length(snvs_b)==1) {
            sampled_b = rep(snvs_b, no.samples)
          } else {
            sampled_b = sample(snvs_b, no.samples, replace=T)
          }
          
          frac_gt = sum(sapply(1:no.samples, function(i) { (sum(assign_ccfs[, sampled_a[i], t] > assign_ccfs[, sampled_b[i], t])) })) / (no.samples*sampledIters)
          frac_lt = sum(sapply(1:no.samples, function(i) { (sum(assign_ccfs[, sampled_a[i], t] < assign_ccfs[, sampled_b[i], t])) })) / (no.samples*sampledIters)
          frac_eq = sum(sapply(1:no.samples, function(i) { (sum(assign_ccfs[, sampled_a[i], t] == assign_ccfs[, sampled_b[i], t])) })) / (no.samples*sampledIters)
          
          # Filling the probability matrices as row-vs-column
          probs_gt[c, k, t] = frac_gt
          probs_lt[c, k, t] = frac_lt
          probs_eq[c, k, t] = frac_eq
        }
      }
    }
    
    #' Should return classification of each cluster/cluster pair (row vs column):
    #'  * GT / LT / EQ / GT-EQ / EQ-LT / uncertain / NA
    classification = array(NA, c(length(clusterids), length(clusterids), no.timepoints))
    classification[probs_gt+probs_eq > 0.95] = "GT-EQ"
    classification[probs_lt+probs_eq > 0.95] = "LT-EQ"
    classification[probs_eq > 0.95] = "EQ"
    classification[probs_lt > 0.95] = "LT"
    classification[probs_gt > 0.95] = "GT"
    classification[probs_lt > 0.95 & probs_gt > 0.95 & probs_eq > 0.95] = "EQ-special"
    classification[is.na(classification)] = "uncertain"
    
    return(list(classification=classification, probs_gt=probs_gt, probs_lt=probs_lt, probs_eq=probs_eq))
  } else {
    dummy_matrix = array(NA, c(length(clusterids), length(clusterids), no.timepoints))
    return(list(classification=dummy_matrix, probs_gt=dummy_matrix, probs_lt=dummy_matrix, probs_eq=dummy_matrix))
  }
}
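
# A minimal sketch (invented probabilities) mirroring the classification rules
# above for a single cluster pair and timepoint; later assignments take
# precedence, exactly as in the vectorised code.
if (FALSE) {
  probs_gt = 0.60; probs_lt = 0.02; probs_eq = 0.38
  classification = NA
  if (probs_gt + probs_eq > 0.95) classification = "GT-EQ"
  if (probs_lt + probs_eq > 0.95) classification = "LT-EQ"
  if (probs_eq > 0.95) classification = "EQ"
  if (probs_lt > 0.95) classification = "LT"
  if (probs_gt > 0.95) classification = "GT"
  if (is.na(classification)) classification = "uncertain"
  classification  # "GT-EQ": the row cluster's CCF is >= the column cluster's
}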