emcncf2=function(x,trace=FALSE,unif=FALSE,min.nhet=15,maxiter=10,difcf=0.05,maxk=5,eps=1e-3){
#' EM estimate of copy number and cellular fraction with clonal and subclonal cluster structure
#' @description Uses genotype mixture model to estimate copy number, cellular fraction, and subclonal structures. \
#' Uses estimates based on the cnlr.median and mafR as initial values for the EM iteration.
#' @param x (list) the output from procSample. This function uses the elements jointseg, out and dipLogR from the output.
#' @param trace (logical) flag to print the EM criteria at every step
#' @param unif (logical) random EM start values of cellular fractions instead of clusteredcncf values
#' @param min.nhet (numeric) minimum number of heterozygote snps in a segment used to call minor cn
#' @param maxiter (numeric) maximum number of EM iterations
#' @param difcf (numeric) the minimal difference between segment cluster specific cellular fraction estimate and clonally constrained estimate for declaring candidate subclonal events
#' @param maxk (numeric) maximum number of clonal and subclonal clusters allowed for the fit
#' @param eps (numeric) the convergence threshold
#' @return A list containing: \item{loglik}{loglikelihood value of the fitted model}
#' \item{purity}{fraction tumor cells in the tumor sample}
#' \item{ploidy}{average total copy number of the tumor cells}
#' \item{dipLogR}{estimated logR value of diploid segments}
#' \item{cncf}{dataframe consisting of the columns of segmentation output
#' as well as cellular fraction (cf), total (tcn) and lesser (lcn) copy number of each segment
#' and their em counterpart (with .em suffix), and a clonal.cluster indicater with
#' clonal.cluster=1 indicating the clonal cluster and a higher number (2,3, etc..) indicating subclonal clusters.}
#' @export
warning("emcncf2 occationally returns quirky copy number estimates due to the clonal cluster structure imposed. please use with caution.")
jointseg=x$jointseg
out=x$out
dipLogR=x$dipLogR
nX=x$nX
seg=out
jointseg=subset(jointseg,!is.na(jointseg$cnlr))
logR=jointseg$cnlr
#center logR at dipLogR
logR.adj=logR-dipLogR
logOR=jointseg$valor
logORvar=jointseg$lorvar
logOR2=logOR^2
logOR2var=logOR2/logORvar
logORvar.clust=by(logORvar,jointseg$segclust,function(x)mean(na.omit(x)))
het=jointseg$het
nmark=seg$num.mark
segclust=seg$segclust
nmark.clust=by(nmark,segclust,mean)
cnlr.median.clust=by(seg$cnlr.median,segclust,function(x)mean(na.omit(x)))
#mafR.clust=by(seg$mafR,segclust,function(x)mean(na.omit(x)))
mafR.clust=by(seg$mafR.clust,segclust,function(x)mean(na.omit(x)))
segs=rep(1:length(nmark),nmark)
nseg=length(nmark)
nhet=seg$nhet
nhet.clust=by(nhet,segclust,mean)
chr=seg$chrom
chr.clust=by(chr,segclust,mean)
#if(nseg>500)stop("Likely hyper-segmented. Increase cval in procSample.")
endseq=jointseg[cumsum(nmark),2]
startseq=jointseg[c(1,cumsum(nmark)[-nrow(seg)]+1),2]
seglen=(endseq-startseq)/1e6
seglen.clust=by(seglen,segclust,mean)
mafR=seg$mafR
mafR[mafR<0]=0
seglogr=seg$cnlr.median
nclust=max(segclust)
seglogr.clust=by(seglogr,segclust,mean)
seglogr.clust.adj=as.vector(seglogr.clust)-dipLogR
emflags=NULL
var=var(jointseg$cnlr,na.rm=T)
if(var>0.6){
logR=rep(seglogr,nmark)
emflags=paste(emflags,"Noisy sample, Calls can be unreliable.",sep=" ")
}
#consider genotypes up to t=6, assume minor alelle is B, switch to simple moment estimates for high copy numbers (t>6) for computational efficiency
genotype=c("0","A","AA","AB","AAB","AAA","AAAB","AABB","AAAA","AAAAB","AAABB","AAAAA","AAABBB","AAAABB","AAAAAB","AAAAAA")
minor=c(0,0,0,1,1,0,1,2,0,1,2,0,3,2,1,0)
major=c(0,1,2,1,2,3,3,2,4,4,3,5,3,4,5,6)
t=ifelse(genotype=="0",0,nchar(genotype))
ng=length(genotype)
n=length(logR)
#diploid genome check
if(all(seg$cf[seg$chrom<nX]==1&seg$tcn[seg$chrom<nX]==2)|max(mafR.clust[seg$chrom<nX & seg$nhet>15], na.rm = T) < 0.05){
rho=NA
gamma=2
out1=data.frame(seg[,1:9],start=startseq,end=endseq,cf.em=seg$cf,tcn.em=seg$tcn,lcn.em=seg$lcn)
emflags=paste(emflags,"Insufficient information to estimate purity. Likely diplod or purity too low.",sep=" ")
out=list(purity=rho,ploidy=gamma,dipLogR=dipLogR,cncf=out1, emflags=emflags)
return(out)
stop("Insufficient information",call.=F)
}
#set intialize value of tumor purity from naive estimates
rhov.lsd=seg$cf
minor.lsd=seg$lcn
t.lsd=seg$tcn
major.lsd=t.lsd-minor.lsd
nas=(is.na(major.lsd)|is.na(minor.lsd))
homdel=which(major.lsd==0&major.lsd==0)
genotype.lsd=rep(NA,nseg)
a=lapply(1:sum(!nas),function(x)paste(rep("A",major.lsd[!nas][x]),collapse=""))
b=lapply(1:sum(!nas),function(x)paste(rep("B",minor.lsd[!nas][x]),collapse=""))
genotype.lsd[!nas]=unlist(lapply(1:sum(!nas),function(x)paste(a[[x]],b[[x]],sep="")))
genotype.lsd[homdel]="0"
which.geno.lsd=match(genotype.lsd,genotype)
rhov.lsd[t.lsd==2&minor.lsd==1]=NA
rhov.lsd[t.lsd==2&rhov.lsd==1]=NA
rhov.lsd[chr>=nX&rhov.lsd==1]=NA
naive=quantile(rhov.lsd,prob=0.75,na.rm=T)
rhov.lsd.subset=rhov.lsd
#avoid genotypes with identifiability issue
rhov.lsd.subset[which.geno.lsd%in%c(3,5,7,10,11,14,15,NA)]=NA
rhov.lsd.subset[t.lsd>6]=NA
rhov.lsd.subset[seglen<50]=NA
loh=which(t.lsd>=1 & minor.lsd==0 & seglen>15) #use only LOH seg for initial estimate
rho=NA
if(length(loh)>2 &!all(is.na(rhov.lsd[loh]))){
rho=max(by(rhov.lsd[loh],segclust[loh],function(x)mean(x,na.rm=T)),na.rm=T) #use LOH to estimate putity when available.
}else{
if(length(na.omit(rhov.lsd.subset))>1)rho=max(by(rhov.lsd.subset,segclust,function(x)mean(x,na.rm=T)),na.rm=T)
}
if(is.na(rho)|rho<0.2)rho=naive
rhov=rhov.lsd
rhov[is.na(rhov)]=rho
#avoid initial value too low
rhov[rhov<0.1]=rho
#avoid 1
rhov[rhov==1]=rho
rhov[nhet<min.nhet]=rho
rhov[which.geno.lsd==4]=NA
rhov=as.vector(by(rhov,segclust,function(x)mean(x,na.rm=T)))
rhov0=rhov
rhov0[rhov0>rho]=rho
#intital single clone fit. set a single cf at purity for all segment clusters
rhov=rep(rho,nclust)
#initial value for genotype priror
prior=matrix(1/ng,nrow=nclust,ncol=ng)
posterior=rep(0.5,nclust)
#initial value for sigma
sigma=rep(2,nclust)
#cold start for rho if set unif=TRUE
if(unif){
rhov=runif(nclust,0.3,0.8)
}
environment(onepass)=environment()
#fit a single clonal cluster
rho.clust=rep(1,nclust)
rhov=rep(rho+0.05,nclust)
cat("fitting 1 clonal cluster ...",'\n')
em.out=onepass(x=x,trace=trace,unif=unif,maxiter=10,min.nhet=min.nhet,eps=eps,rho.clust=rho.clust,rho=rho,rhov=rhov,prior=prior, posterior=posterior, sigma=sigma)
which.geno=em.out$which.geno
posterior=em.out$posterior
threshold=max(posterior[seglen.clust>10],na.rm=T)
rhov=em.out$rhov
rho=em.out$rho
segclust.rhov=em.out$segclust.cf
segclust.rhov[is.na(segclust.rhov)]=rho
segclust.rhov[segclust.rhov>rho]=rho
#Identify poor fits by posterior probability to refit as subclonal cluster.
cond1=(threshold-posterior)>0.05
#The cf has to be sufficiently different to allow a subclonal cluster fit
cond2=abs(segclust.rhov-rho)>difcf
#Don't allow subclonal fit of tiny segments
#cond3=seglen.clust>1
cond3=(nmark.clust>50&nhet.clust>15)
#Genotypes with identifiability issue don't allow subclonal option
ub=c(10,15)
cond4=(which.geno%in%ub)
#Exclude sex chromosome
cond5=chr.clust<nX
#Exclude small changes
cond6=(abs(seglogr.clust.adj)>0.15|mafR.clust>0.05)
refit=which(cond2&cond3&!cond4&cond5&cond6)
#recursively identify poor fits and fit additional subclonal clusters up to 4
if(em.out$rho>0.4&any(refit)){
nclone=2
dif=refit
difrho=abs(max(segclust.rhov[refit],na.rm=T)-rho)
while(any(refit)&any(dif)&difrho>difcf&nclone<maxk){
refit.old=refit
rho.clust.start=rho.clust
rho.clust.start[refit]=nclone
#categories check to avoid overring
#ncat=length(table(rho.clust))
#if(max(rho.clust)>ncat)rho.clust[rho.clust>1]=rho.clust[rho.clust>1]-1
rhos=by(rep(segclust.rhov,as.vector(nmark.clust)),rep(rho.clust.start,as.vector(nmark.clust)),function(x)mean(x,na.rm=T))
rhov.start=rhos[rho.clust.start]
rhov.start=as.vector(rhov.start)
em.out=onepass(x=x,trace=trace,unif=unif,maxiter=maxiter,min.nhet=min.nhet,eps=eps,
rho.clust=rho.clust.start,rho=rho,rhov=rhov.start, prior=prior, posterior=posterior, sigma=sigma)
posterior=em.out$posterior
which.geno=em.out$which.geno
threshold=quantile(posterior[seglen.clust>10],1,na.rm=T)
rhov1=em.out$rhov
rho1=em.out$rho
rhos1=by(rep(rhov1,as.vector(nmark.clust)),rep(em.out$rho.clust,as.vector(nmark.clust)),function(x)mean(x,na.rm=T))
segclust.rhov=em.out$segclust.cf
segclust.rhov[is.na(segclust.rhov)]=rho1
segclust.rhov[segclust.rhov>rho1]=rho1
#set smaller threshold for posterior prob difference to have sufficient sensitivity
cond1=(threshold-posterior)>0.05
cond2=abs(segclust.rhov-rhov1)>difcf
cond4=(which.geno%in%ub)
refit=which(cond2&cond3&!cond4&cond5&cond6)
dif=setdiff(refit.old,refit)
difrho=min(abs(mean(rhov1[refit.old],na.rm=T)-rhos1[-nclone]))
if(difrho>difcf){
cat("fitting",nclone,"clonal clusters ...",'\n')
rhov=rhov1
rho=rho1
rho.clust=em.out$rho.clust
nclone=nclone+1
}
}
}
#rhov=em.out$rhov
#rho=em.out$rho
rhov.em=rhov[segclust]
which.geno.em=which.geno[segclust]
genotype.em=which.geno.em
genotype.em[which(!is.na(which.geno.em))]=paste("A",major[which.geno.em],"B",minor[which.geno.em],sep="")[which(!is.na(which.geno.em))]
t.em=t[which.geno.em]
major.em=major[which.geno.em]
minor.em=minor[which.geno.em]
clevel=unique(rhov.em)
clevel=clevel[!is.na(clevel)&clevel!=1]
corder=rank(-clevel)
names(corder)=clevel
clonal.cluster=corder[as.character(rhov.em)]
clonal.cluster[is.na(rhov.em)]=NA
clonal.cluster[which.geno.em==4]=1
#clonal.cluster=rho.clust[segclust]
#calculate ploidy
gamma=(2^(-dipLogR)*(2*(1-rho)+2*rho)-2*(1-rho))/rho
#hybrid: for high copy number (t>6), use moment estimates
seglogr.adj=seg$cnlr.median-dipLogR
idx=which(seglogr.adj>1.7*rho|is.na(which.geno.em))
if(any(idx)){
maf=exp(sqrt(mafR[idx]))
tt=round((2^(seglogr.adj[idx]+1)-2*(1-rho))/rho,0)
tt[tt<0]=0 #homdel
mm=round((tt*maf*rho+(maf-1)*(1-rho))/(rho*(maf+1)),0)
re=which(mm>tt) #rounding error can cause major>t
if(any(re)){mm[re]=tt[re]}
t.em[idx]=tt
major.em[idx]=mm
minor.em[idx]=t.em[idx]-major.em[idx]
genotype.em[idx] = paste("A",major.em[idx], "B", minor.em[idx], sep="")
rhov.em[idx]=rho
clonal.cluster[idx]=1
}
clonal.cluster[which.geno.em==4]=1
#if het SNPs are too few, not sufficient information to estimate minor cn
lownhet=which(nhet<min.nhet)
minor.em[lownhet]=NA
minor.em[t.em<=1]=0
#set cf=1 for 2-1 segments (100% nothing)
rhov.em[t.em==2&minor.em==1]=1
rhov.em[t.em==2&is.na(minor.em)]=NA
clonal.cluster[t.em==2&is.na(minor.em)]=NA
clonal.cluster[chr==nX]=NA
#for male, use the empirical call
if(sum(chr==nX)>0){
prop.nhet.chrX=sum(nhet[chr==nX])/sum(nmark[chr==nX])
male=(prop.nhet.chrX<0.01)
}else{
male=FALSE
}
#normal male X is one copy. No het snps to start with, so don't call minor cn
if(male){
t.em[chr>=nX]=round(t.em[chr>=nX]/2,0)
minor.em[chr>=nX]=NA
normalX=which(t.em[chr>=nX]==1)
if(any(normalX))rhov.em[chr>=nX][normalX]=1
}
out1=data.frame(seg[,1:9],start=startseq,end=endseq, cf.em=rhov.em,tcn.em=t.em, lcn.em=minor.em, clonal.cluster=clonal.cluster)
if(rho<0.3){emflags=paste(emflags,"Low purity. Calls can be unreliable.",sep=" ")}
out=list(purity=rho,ploidy=gamma,dipLogR=dipLogR,cncf=out1, emflags=emflags)
return(out)
}
#####onepass###
onepass=function(x, trace, unif, rho, rhov, prior, posterior, sigma, min.nhet, rho.clust, maxiter, eps){
# added to make R CMD check stop complaining - didn't work
# globalVariables(c("chr", "cnlr.median.clust", "dipLogR", "het", "jointseg", "logOR", "logOR2var", "logORvar", "logORvar.clust", "logR.adj", "mafR.clust", "major", "minor", "nX", "nclust", "ng", "nhet", "nmark", "nmark.clust", "segclust"))
dif=1
iter=0
while(dif>eps && iter<maxiter) {
iter = iter + 1
rhov[is.na(rhov)]=rho
#constraint: any segment cannot have purity higher than the mode purity
rhov[rhov>rho]=rho
#avoid cf below 10%
rhov[rhov<0.1]=rho
rho.old = rho
rhov.old=rhov
sigma.old = sigma
prior.old=prior
posterior.old=posterior
########
#E-step#
########
####LogR mixture model parameter####
gamma=2
phi=2*(1-rho)+gamma*rho
mu=log2(2*(1-rhov)+matrix(rhov,ncol=1)%*%t)-log2(phi)
####LogOR mixture model parameter####
#allelic ratio
k=(matrix(rhov,ncol=1)%*%major+1-rhov)/(matrix(rhov,ncol=1)%*%minor+1-rhov)
logk=log(k)
logk2=logk^2
#posterior probability matrix
#pmatrix=NULL
pmatrix=matrix(NA,nrow=nrow(jointseg),ncol=ng)
loglik=0
clust=rep(segclust,nmark)
segc=sort(unique(segclust[chr<=nX]))
for(s in segc){
idx=which(clust==s)
x1ij=logR.adj[idx]
upper=quantile(x1ij,0.95)
lower=quantile(x1ij,0.05)
x1ij[x1ij>upper]=NA
x1ij[x1ij<lower]=NA
mus=rep(mu[s,],each=length(idx))
sd=sigma[s]
if(rhov[s]<0.5){
x1ij=rep(cnlr.median.clust[s]-dipLogR,length(idx))
sd=0.1
}
#density for logR.adj (centered logR)
d1=dnorm(x1ij,mean=mus,sd=sd)
d1[d1==Inf]=NA
#density for logOR, non-central chi-square
nu=rep(logk2[s,],each=length(idx))
lambda=nu/rep(logORvar[idx],ng)
x2ij=logOR2var[idx]
if(rhov[s]<0.5){
x2ij=rep(mafR.clust[s]/logORvar.clust[s],length(idx))
lambda=nu/logORvar.clust[s]
}
#d2=dchisq(x2ij+1,df=1,ncp=lambda)
d2=dchisq(x2ij,df=1,ncp=lambda)
d2=1/(abs(x2ij-lambda)+1e-6)
d2[d2==Inf]=NA
#likelihood
d=d1*d2
hetsum=d[rep(het[idx]==1,ng)]
homsum=d1[rep(het[idx]==0,ng)]
d=sum(hetsum[hetsum<Inf],na.rm=T)+sum(homsum[homsum<Inf],na.rm=T)
if(!is.na(d)&d>0&d<Inf){loglik=loglik+log(d)}
#heterozygous positions contribute to logR and logOR
numerator1=matrix(d1*d2,nrow=length(idx),ncol=ng,byrow=F)
numerator1=sweep(numerator1,MARGIN=2,prior[s,],`*`)
#homozygous positions contribute to logR only
numerator0=matrix(d1,nrow=length(idx),ncol=ng,byrow=F)
numerator0=sweep(numerator0,MARGIN=2,prior[s,],`*`)
numerator=numerator1
numerator[het[idx]==0,]=numerator0[het[idx]==0,]
tmp=apply(numerator,1,function(x)x/(sum(x,na.rm=T)+1e-5))
#pmatrix=rbind(pmatrix,t(tmp))
pmatrix[idx,]=t(tmp)
#update prior
prior[s,]=apply(t(tmp),2,function(x)mean(x,na.rm=T))
}
########
#M-step#
########
#get CF per segments, pick mode close to 1 (favor high purity low cn solution)
rhom=gammam=matrix(NA,nrow=nclust,ncol=ng)
geno=matrix(0,nrow=nclust,ncol=ng)
which.geno=posterior=rep(NA,nclust)
for(i in segc){
idx=which(clust==i)
idxhet=which(clust==i&het==1)
sump=apply(pmatrix[idx,,drop=F],2,function(x)sum(x,na.rm=T))
#if probability is too small (highly uncertain), use lsd estimates for stability
if(all(is.na(prior[i,]))){
prior[i,]=prior.old[i,]
}else{
if(sum(prior[i,],na.rm=T)==0)prior[i,]=prior.old[i,]
}
sump[prior[i,]<max(prior[i,])]=NA
##update k
tmphet=pmatrix[idxhet,,drop=F]
v1=as.vector((logOR[idxhet]^2-logORvar[idxhet])/logORvar[idxhet])
v2=as.vector(1/logORvar[idxhet])
sumdphet=apply(sweep(tmphet,MARGIN=1, v1, `*`), 2,function(x)sum(x,na.rm=T))
sumphet=apply(sweep(tmphet,MARGIN=1,v2,`*`), 2,function(x)sum(x,na.rm=T))
sumphet[is.na(sump)]=NA
#CF from logOR
logk2hat=pmax(0,sumdphet/sumphet) #can be negative when k=1 logk=0 set to 0
khat=exp(sqrt(logk2hat))
a=(1-khat)/(khat*(minor-1)-(major-1))
a[abs(a)==Inf]=NA
a[a<=0]=NA
a[a>1]=1
if(all(nhet[segclust==i]<min.nhet))a=rep(NA,ng)
#CF from logR
tmp=pmatrix[idx,,drop=F]
v=as.vector(logR.adj[idx])
sumdp=apply(sweep(tmp,MARGIN=1,v,`*`),2,function(x)sum(x,na.rm=T))
mu.hat=sumdp/sump #mu.hat
aa=2*(2^mu.hat-1)/(t-2)
aa[abs(aa)==Inf]=NA
aa[aa<=0]=NA
aa[aa>1]=1
aaa=pmin(a,aa,na.rm=T)
#degenerate cases
#homozygous deletion (0) and balanced gain (AABB, AAABBB), maf=0.5, purity information comes from logr only
aaa[c(1,8,13)]=aa[c(1,8,13)]
#set upper bound at sample rho
aaa=pmin(aaa,rho)
#uniparental disomy (AA) CF information comes from logOR only.
which.geno[i]=which.max(prior[i,])
postprob=pmatrix[idx,which.geno[i]]
posterior[i]=mean(postprob[postprob>0],na.rm=T)
#update sigma
y=as.vector(logR.adj[idx])*pmatrix[idx,,drop=F]
r=y-mu[i,]*pmatrix[idx,,drop=F]
ss=sqrt(sum(r[,which.geno[i]]^2,na.rm=T)/sum(pmatrix[idx,which.geno[i]]))
sigma[i]=ifelse(is.na(ss),0.5,ss)
aaa[setdiff(1:ng,which.geno[i])]=NA
rhom[i,]=aaa
} #max prior
#}
#for segments noninformative for purity, plug in sample purity
rhov=unlist(apply(rhom,1,function(x)ifelse(all(is.na(x)),NA,na.omit(x))))
segclust.cf=rhov
rhos=by(rep(rhov,as.vector(nmark.clust)),rep(rho.clust,as.vector(nmark.clust)),function(x)mean(x,na.rm=T))
rhov=rhos[rho.clust]
if(all(rho.clust==1))rhov=rep(rho,nclust)
rhov=as.vector(rhov)
rhov[rho.clust==1]=rho
rho=max(rhov,na.rm=T)
dif=c(abs(rhov-rhov.old),diff(posterior-posterior.old))
dif = max(dif,na.rm=T)
if(trace) {
cat('iter:', iter, '\n')
cat('dif:', dif, '\n')
cat('purity:', rho, '\n')
cat('ploidy:', gamma, '\n')
}
}
out=list(posterior=posterior, which.geno=which.geno, rho=rho, rhov=rhov, segclust.cf=segclust.cf, rho.clust=rho.clust)
return(out)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.