R/estimateQ.R

Defines functions errorAF mleAF .estimateQ

Documented in .estimateQ

#' EM algorithm to calculate q
#'
#' Estimate the proportion the proportions of SSNVs at each allele state
#' from sequencing data.
#' 
#'
#' @param x vector of number of reads supporting SSNVs
#' @param m vector of total read depth for SSNVs
#' @param alleleSet vector of possible allele frequencies
#' @param alleleFreq the observed allele frequencies
#' @param history a list of possible evolutionary history matrices, see also function "cnmutHistory"
#' @param type set it to be either "identifiable" or "butte"
#' @param init the initial values of vector q (probability of a randomly acquired SSNV having each allele state)
#' @param maxiter maximum number of iterations in calculation q
#' @param tol the tolerance in the convergence of q
#' @param xGreaterZero determines whether the likelihood will account for the fact that only observe SSNVs with mutant read count x[i] > 0
#' @return A list of possible matrices
.estimateQ <- function(x,m,alleleSet,alleleFreq=NULL,history,type=c("identifiable","butte"),
                       init=NULL,maxiter=100, tol=0.0001,xGreaterZero=TRUE, useGradient=FALSE) {
    type = match.arg(type)
        
    N<-length(x)
    possAlleles<-alleleSet
    lengthOfTheta<-length(possAlleles)-1
    if(nrow(history)!=length(possAlleles))
        stop("length of alleleSet must equal the number of rows of history.")

    ##check if it is one of the easy ones:
    Avec<-as.vector(history)
    easyType<-"no"
    if(identical(Avec,c(0, 1, 2, 0))) easyType<-"CNLOH"
    if(identical(Avec,c(1 ,1, 3, 0))) easyType<-"SingleGain"
    #if (easyType != "no") message(easyType)
    
    
    if(is.null(init)) {
        #init<-history%*%rep(1/length(possAlleles),length=length(possAlleles))
	init<-history%*%rep(1/dim(history)[2], length=dim(history)[2])
        init<-as.vector(init)/sum(init)
    }
    
    ##Break it into 2 because useful to have a function that assigns most likely allele to each
    EStep <- function(q) { ####Calculate P(Pi=a | Xi,q) each i and a
        #P(Pi=a | Xi)=P(Xi|Pi=a)P(Pi=a)/sum(P(Xi|Pi=a)P(Pi=a))
        
        #calculate the log of the probability (numerator)
        logPPerAllele<-mapply(possAlleles,q,FUN=function(a,qa){
            dbinom(x=x, size=m, prob=a,log=TRUE)+log(qa) 
        }) #returns a N x numb.Alleles Matrix; needs to be normalized
        
        #-----
        #calculate log of sum of probabilities (denominator)
        #-----
        #note if individual probabilities very small, then denominator sums to zero,
        #so divide by zero -- problem!
        #use log sum exponential trick:
        # log(e[theta1]+e[theta2]+...)=m+log(e[theta1-m]+e[theta2-m]+...), where m=max(theta[i])
        #here theta1=log[P(Xi|Pi=a)]+log[P(Pi=a)]
        #-----

        rowMax<-apply(logPPerAllele,1,max) #max of theta[i]
        logPPerAlleleMinusMax<-sweep(logPPerAllele,1,rowMax,"-") #subtract off m from each row
        logdenom<-rowMax+log(apply(exp(logPPerAlleleMinusMax),1,sum))
        
        #-----
        #calculate log of sum of probabilities (denominator)
        #-----
        #now overall prob=exp[num]/exp[denom]=exp[num-denom]
        #-----
        logPPerAllele<-sweep(logPPerAllele,1,logdenom,"-")
        
	
        # #-----
        # #Old simplistic code:
        # #-----
        #     	PPerAllele<-mapply(possAlleles,q,FUN=function(a,qa){
        # 	dbinom(x=x, size=m, prob=a,log=FALSE)*(qa) 
        # }) #returns a N x numb.Alleles Matrix; needs to be normalized
        # PPerAllele<- prop.table(PPerAllele,1) #divide by the sum of each row
        
        #compare the two:
        # cbind(exp(logPPerAllele),PPerAllele)
        
        return(exp(logPPerAllele))
    }
    
    ###Constraints for the q
    #The feasible region is defined by ui %*% theta - ci >= 0. 
    if(easyType=="no") {
        uiL1<-diag(rep(-1,lengthOfTheta),nrow=lengthOfTheta,ncol=lengthOfTheta)
        ciL1<-rep(1,lengthOfTheta)
        uiGr0<-diag(rep(1,lengthOfTheta),nrow=lengthOfTheta,ncol=lengthOfTheta)
        ciGr0<-rep(0,lengthOfTheta)
        if (type == "identifiable") {
            Ainv <- solve(history)
            uiA <- Ainv%*%(rbind(diag(lengthOfTheta),rep(-1,lengthOfTheta)))
            ciA <- - Ainv%*%c(rep(0,lengthOfTheta),1)
            ui <- rbind(uiL1,uiGr0,uiA)
            ci <- c(-ciL1,-ciGr0,ciA)
        } else if (type == "butte") {
            ui <- rbind(uiL1,uiGr0)
            ci <- c(-ciL1,-ciGr0)
        }
    }
    
    #make matrix of (1-a[j])^m[i]
    MStep<-function(PMat=NULL,qinit) {
        #estimate of q constrained so that solve(history)%*%q is all positive
        if(is.null(alleleFreq)) Na<-colSums(PMat)
        #from E-Step, the estimated values from the multinomial...
        #basically just the maximum likelihood estimation if knew Pi exactly
        else Na<-alleleFreq
        if (easyType %in% c("CNLOH","SingleGain")) {
            #print("easytype")
            if (easyType=="CNLOH") { 
                a<-0
                b<-1
            }
            if (easyType=="SingleGain") {
                a<-1/2
                b<-1
            }
            qout<-Na[1]/sum(Na)
            if(qout>b) qout<-b
            if(qout<a) qout<-a
            qout<-c(qout,1-qout)
        } else {
            f<-function(q) {
                qS <- 1-sum(q)   # sum(q) = 1; qS is the last one; constrained
                #-dmultinom(Na, prob=c(q,qS), log = TRUE)
                #just to check; should be equivalent, but faster to not calculated normalizing constant
                ll<- sum(Na*log(c(q,qS)))
                if(xGreaterZero) {
                    #missZero[i]= 1- sum_j{(1-a[j])^m[i] q[j]}
                    missZero<-1-rowSums(mapply(possAlleles,c(q,qS),FUN=function(a,qa){
                        (1-a)^m*qa 
                    }) #returns a N x numb.Alleles Matrix; needs to be normalized
                    )  #sum over all j, so missZero is vector of length i single value for each i
                    ll<-ll-sum(missZero)
                } 
                return( - ll) #because minimizing
            }

            #used in the gradient
            #calculate (1-a[S])^{m[i]} - (1-a[j])^{m[i]} 
            #returns a N x S-1 Matrix; needs to be normalized
            aS <- tail(possAlleles,1)
            # #bug reported
            # num<-mapply(head(possAlleles,-1),q,FUN=function(a,qa){
            # 	(1-aS)^m - (1-a)^m
            # })
            #fix with
            num<-sapply(head(possAlleles,-1),FUN=function(a,qa) {
                (1-aS)^m - (1-a)^m
            })
            gr<-function(q) {
                qS<-1-sum(q)
                NS<-tail(Na,1)
                g<-(head(Na,-1)/q - NS/qS)
                #was - (head(Na,-1)/q - NS/qS*q) #which is error; why got wrong answer.
                if(xGreaterZero) {
                    #missZero[i]= 1- sum_j{(1-a[j])^m[i] q[j]}
                    missZero<-1-rowSums(mapply(possAlleles,c(q,qS),FUN=function(a,qa){
                        (1-a)^m*qa 
                    }) #returns a N x numb.Alleles Matrix; needs to be normalized
                    )  #sum over all j, so missZero is vector of length i single value for each i
                    g<-g-colSums(sweep(num,1,missZero,"/"))
                }
                return(-g) #because minimizing
            }
            theta <- head(qinit,-1)
            if ( lengthOfTheta==1 ) {
                upper<-min((ci/ui)[ui<0])
                lower<-max((ci/ui)[ui>0])
                out<-optimize(f=f,interval=c(lower,upper),tol=.Machine$double.eps)
                qout<-c(out$minimum,1-out$minimum)
            } else {
                out<-constrOptim(theta=theta,f = f, grad=if(useGradient) gr else NULL, ui=ui, ci=ci)
                #when use gr, get wrong answer -- returns me to init, doesn't iterate, etc. WHY????
                qout<-c(out$par,1-sum(out$par))      # return the q estimates
            }
        }
        names(qout)<-names(possAlleles)
        return(qout)
    }
    
    #print("gr uses - (head(Na,-1)/q - NS/qS) ")
    TotalStep<-function(q) {
        MStep(EStep(q),qinit=q)
    }
    if(is.null(alleleFreq)) {
        qOld <- init;
        qNew<-TotalStep(qOld)
        allQ<-cbind(qOld,qNew)
        nIter<-1
        #convMessages<-c(firstOut$convergence)
        while(sum(abs(qNew - qOld)) >= tol & nIter <= maxiter) {
            qOld<-qNew
            pMat<-EStep(qOld)
            qNew<-MStep(pMat,qinit=qOld)
            #qNew<-c(mstepOut$par,1-sum(mstepOut$par))
            #convMessages<-c(convMessages,mstepOut$convergence)
            nIter <- nIter + 1
            allQ<-cbind(allQ,qNew)
            #if(nIter %% 20 ==0) print(sprintf("finished iteration %s", nIter))
        }
        pMat<-EStep(qNew)
    }
    #given the alleleFreq obtained from partialMLE, optimize q directly
    else {
        qNew<-MStep(qinit=init)
        pMat<-NA
        nIter<-NA
        allQ<-NA
    }
    names(qNew)<-names(possAlleles)
    #return optimization results and details
    return(list(q=qNew,perLocationProb=pMat,alleleSet=possAlleles,
                optimDetails=list(nIter=nIter,initial=init,pathOfQ=allQ)))
}


mleAF <- function(x, m, nt, nb, seqError=0, purity=1) {

    if(length(x)!=length(m)) stop("x and m must be the same length")

    maxCopy = nt-nb
    r <- 1:maxCopy 

    alleles <- vafEst(nt=nt, nb=nb, pu=purity)
    alleles <- errorAF(alleles, seqError=seqError)
    
    out <- sapply(alleles, function(z){dbinom(x, size=m, prob=z)})
    colnames(out) <- r
    assign <- r[apply(out, 1, which.max)]
    assignAlleles <- factor(alleles[apply(out,1,which.max)], levels=alleles)
    afdf <- data.frame(tumorAF=r/nt, AF=alleles, table(assignAlleles))
    row.names(afdf) <- NULL
    afdf <- afdf[,-3]
    
    outList<-list(perLocationProb=out,assignments=data.frame(nCopies=assign,nt=nt,tumorAF=assign/nt))
    outList<-c(outList,list(alleleSet=afdf))
    return(outList)
    
}


errorAF <- function(trueAF, seqError=0) {
    ((3-4*seqError)*trueAF+seqError)/(3-2*seqError)
}
SunPathLab/Butte documentation built on Sept. 19, 2023, 9:42 a.m.