# R/subclone_Dirichlet_Gibbs_sampler_nD_binomial.R

#
# DPClust core algorithm
#

subclone.dirichlet.gibbs <- function(mutCount, WTCount,
                                     totalCopyNumber=array(1, dim(mutCount)),
                                     normalCopyNumber=array(2, dim(mutCount)),
                                     copyNumberAdjustment=array(1, dim(mutCount)),
                                     C=30, cellularity=rep(1, ncol(mutCount)),
                                     iter=1000, conc_param=1, cluster_conc=10) {
  # mutCount is a p-by-q matrix of the number of reads reporting each variant (p = number of mutations, q = number of timepoints / related samples)
  # WTCount is a p-by-q matrix of the number of wild-type reads at each base, in the same mutation and sample order as mutCount (total depth = mutCount + WTCount)
  # C is the maximum number of clusters in the truncated Dirichlet process
  # iter is the number of iterations of the Gibbs sampler
  # Returns per-iteration cluster assignments (S.i), stick-breaking weights (V.h), cluster positions (pi.h), the concentration parameter chain (alpha) and the input counts
  if (is.null(copyNumberAdjustment)) { copyNumberAdjustment = array(1,dim(mutCount)) }
  
  num.muts <- NROW(mutCount)
  num.timepoints  = NCOL(mutCount)
  
  # Hyperparameters for the Gamma(A, B) prior on the Dirichlet process concentration parameter alpha
  #A <- B <- 0.01
  # Strong prior on alpha
  A = 1
  B = conc_param
  
  # Set up data formats for recording iterations
  pi.h = array(NA, c(iter, C,num.timepoints))
  mutBurdens = array(NA, c(C,num.timepoints,num.muts))
  
  V.h = matrix(1, nrow=iter, ncol=C)
  S.i = matrix(NA, nrow=iter, ncol=num.muts)
  Pr.S = matrix(NA, nrow=num.muts, ncol=C)
  alpha = rep(NA, iter)
  
  lower = array(NA,num.timepoints)
  upper = array(NA,num.timepoints)
  
  mutCopyNum = array(NA,c(num.muts,num.timepoints))
  for (t in 1:num.timepoints) {  
    mutCopyNum[,t] = mutationBurdenToMutationCopyNumber(mutCount[,t]/(mutCount[,t]+WTCount[,t]),totalCopyNumber[,t] ,cellularity[t],normalCopyNumber[,t]) / copyNumberAdjustment[,t]
    lower[t] = min(mutCopyNum[,t])
    upper[t] = max(mutCopyNum[,t])
    difference = upper[t]-lower[t]
    lower[t] = lower[t]-difference/10
    upper[t] = upper[t]+difference/10
    # Randomise starting positions of clusters
    pi.h[1,,t]=runif(C,lower[t],upper[t])
    for(c in 1:C){
      mutBurdens[c,t,]=mutationCopyNumberToMutationBurden(pi.h[1,c,t] * copyNumberAdjustment[,t], totalCopyNumber[,t], cellularity[t],normalCopyNumber[,t]) 
    }
  }	
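  # Initialise the chain state; fixing the final stick V.h[,C] = 1 truncates the
  # stick-breaking representation at C components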
  V.h[1,] = c(rep(0.5,C-1), 1)
  S.i[1,] = c(1, rep(0,num.muts-1))
  alpha[1] = 1
  V.h[1:iter, C] = rep(1, iter)
  
  for (m in 2:iter) {
    if (m %% 100 == 0){ print(paste(m, "/", iter)) }
    
    # Update cluster allocation for each individual mutation
    for (k in 1:num.muts) {			
      # use log-space to avoid problems with very high counts
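      # Stick-breaking prior: the (log) weight of cluster j is
      # log(V_j) + sum_{l<j} log(1 - V_l)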
      Pr.S[k,1] <- log(V.h[m-1,1])
      Pr.S[k,2:C] = sapply(2:C, FUN=function(j, V) { log(V[j]) + sum(log(1-V[1:(j-1)])) }, V=V.h[m-1,])
      
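      # Add the binomial log-likelihood for each candidate cluster across all
      # timepoints; the choose(N, y) term is constant in c and cancels when
      # Pr.S[k,] is normalised below, so it is omitted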
      for(t in 1:num.timepoints){
        for(c in 1:C){
          Pr.S[k,c] <- Pr.S[k,c] + mutCount[k,t]*log(mutBurdens[c,t,k]) + WTCount[k,t]*log(1-mutBurdens[c,t,k])
        }
        # It would be faster to use sapply here, but it returns values that differ
        # slightly from the explicit loop above, and the exponential scaling below
        # blows these small differences up to a significant impact.
        #         Pr.S2[k,] <- Pr.S2[k,] + sapply(1:C, FUN=function(c, t, k, mutCount, mutBurdens, WTCount) { mutCount[k,t]*log(mutBurdens[c,t,k]) + WTCount[k,t]*log(1-mutBurdens[c,t,k]) }, t=t,k=k,mutCount=mutCount,mutBurdens=mutBurdens,WTCount=WTCount)
      }			
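      # Zero out NA/NaN log-probabilities (e.g. from burdens falling outside (0,1)),
      # then shift by the row maximum and exponentiate to avoid numerical underflow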
      Pr.S[k,is.na(Pr.S[k,])] = 0
      Pr.S[k,] = Pr.S[k,] - max(Pr.S[k,])
      Pr.S[k,] = exp(Pr.S[k,])
      Pr.S[k,] = Pr.S[k,] / sum(Pr.S[k,])				
    }
    
    # Determine which cluster each mutation is assigned to
    S.i[m,] = apply(Pr.S, 1, function(mut) { which(rmultinom(1,1,mut)==1) })
    
    # Update stick-breaking weights
    V.h[m,1:(C-1)] = sapply(1:(C-1), function(S, curr.m, curr.alpha, h) {rbeta(1, 1+sum(S[curr.m,] == h), curr.alpha+sum(S[curr.m,] > h))}, S=S.i, curr.m=m, curr.alpha=alpha[m-1])
    V.h[m,c(V.h[m,1:(C-1)] == 1,FALSE)] = 0.999 # Need to prevent one stick from taking all the remaining weight
    
    # Get expected number of mutant reads per mutation copy number
    countsPerCopyNum = array(NA,c(num.timepoints,num.muts))
    for (t in 1:num.timepoints) {
      countsPerCopyNum[t,] = (mutCount[,t]+WTCount[,t])*mutationCopyNumberToMutationBurden(copyNumberAdjustment[,t],totalCopyNumber[,t],cellularity[t],normalCopyNumber[,t])
    }
    
    # Randomise pi.h for all clusters; positions of occupied clusters are resampled below
    for (t in 1:num.timepoints) {
      pi.h[m,,t] = runif(C,lower[t],upper[t])
    }

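    # Resample positions of occupied clusters. Treating the mutant read counts as
    # approximately Poisson with rate pi.h * countsPerCopyNum, the conditional of
    # pi.h[m,c,t] is a Gamma with shape = total mutant reads assigned to the cluster
    # and rate = the cluster's total expected reads per unit of mutation copy number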
    for(c in unique(S.i[m,])){
      for(t in 1:num.timepoints){
        # 040213 - fixed a problem when sum(countsPerCopyNum[t,S.i[m,]==c]) == 0
        if(sum(countsPerCopyNum[t,S.i[m,]==c])==0){
          pi.h[m,c,t] = 0
        }else{
          pi.h[m,c,t] = rgamma(1,shape=sum(mutCount[S.i[m,]==c,t]),rate=sum(countsPerCopyNum[t,S.i[m,]==c]))
        }
      }
    }		
    
    for(t in 1:num.timepoints){
      for(c in 1:C){
        mutBurdens[c,t,]=mutationCopyNumberToMutationBurden(pi.h[m,c,t] * copyNumberAdjustment[,t],totalCopyNumber[,t],cellularity[t],normalCopyNumber[,t])
      }
    }
    
    # Update alpha
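    # Conditional update of the concentration parameter given the stick-breaking
    # weights: alpha | V ~ Gamma(C + A - 1, B - sum(log(1 - V.h[m,1:(C-1)])))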
    alpha[m] <- rgamma(1, shape=C+A-1, rate=B-sum(log(1-V.h[m,1:(C-1)]))) 
  }
  return(list(S.i=S.i, V.h=V.h, pi.h=pi.h, mutBurdens=mutBurdens, alpha=alpha, y1=mutCount, N1=mutCount+WTCount))
}
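
# --- Illustrative usage sketch (not part of the original sampler) ---
# A minimal, hypothetical example: simulate 100 mutations from two clusters in a
# single diploid, fully clonal sample and run a short chain. It assumes the
# dpclust helpers mutationBurdenToMutationCopyNumber() and
# mutationCopyNumberToMutationBurden() are available in the session.
#
# set.seed(42)
# num.muts <- 100
# depth <- matrix(rpois(num.muts, lambda=100), ncol=1)
# true.ccf <- matrix(sample(c(0.4, 1.0), num.muts, replace=TRUE), ncol=1)
# mutCount <- matrix(rbinom(num.muts, depth, true.ccf/2), ncol=1)  # VAF = CCF/2 for diploid heterozygous SNVs
# WTCount <- depth - mutCount
# res <- subclone.dirichlet.gibbs(mutCount, WTCount, iter=200)
# table(res$S.i[200,])  # cluster assignments at the final iteration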