emcncf=function(x,trace=FALSE,unif=FALSE,min.nhet=15,maxiter=10,eps=1e-3){
#' EM estimate of copy number and cellular fraction of segment clusters
#' @description Uses genotype mixture model to estimate the cluster specific copy number and cellular fraction. Uses estimates based on the cnlr.median and mafR as initial values for the EM iteration.
#' @param x (list) the output from procSample. This function uses the elements jointseg, out and dipLogR from the output.
#' @param trace (logical) flag to print the EM criteria at every step
#' @param unif (logical) random EM start values of cellular fractions instead of clusteredcncf values
#' @param min.nhet (numeric) minimum number of heterozygote snps in a segment used to call minor cn
#' @param maxiter (numeric) maximum number of EM iterations
#' @param eps the (numeric) convergence threshold
#' @return A list containing: \item{loglik}{loglikelihood value of the fitted model}
#' \item{purity}{fraction tumor cells in the tumor sample}
#' \item{ploidy}{average total copy number of the tumor cells}
#' \item{dipLogR}{estimated logR value of diploid segments}
#' \item{cncf}{dataframe consisting of the columns of segmentation output as well as
#' cellular fraction (cf), total (tcn) and lesser (lcn) copy number of each segment
#' and their em counterpart (with .em suffix)}
#' @export
jointseg=x$jointseg
out=x$out
dipLogR=x$dipLogR
nX=x$nX
seg=out
jointseg=subset(jointseg,!is.na(jointseg$cnlr))
logR=jointseg$cnlr
logOR=jointseg$valor
logORvar=jointseg$lorvar
logOR2var=logOR^2/logORvar
logORvar.clust=by(logORvar,jointseg$segclust,function(x)mean(na.omit(x)))
het=jointseg$het
nmark=seg$num.mark
segclust=seg$segclust
cnlr.median.clust=by(seg$cnlr.median,segclust,function(x)mean(na.omit(x)))
#mafR.clust=by(seg$mafR,segclust,function(x)mean(na.omit(x)))
mafR.clust=by(seg$mafR.clust,segclust,function(x)mean(na.omit(x)))
segs=rep(1:length(nmark),nmark)
nseg=length(nmark)
nhet=seg$nhet
chr=seg$chrom
#if(nseg>500)stop("Likely hyper-segmented. Increase cval in procSample.")
endseq=jointseg[cumsum(nmark),2]
startseq=jointseg[c(1,cumsum(nmark)[-nrow(seg)]+1),2]
seglen=(endseq-startseq)/1e6
mafR=seg$mafR
mafR[mafR<0]=0
seglogr=seg$cnlr.median
nclust=max(segclust)
emflags=NULL
var=var(jointseg$cnlr,na.rm=T)
if(var>0.6){
logR=rep(seglogr,nmark)
emflags=paste(emflags,"Noisy sample, Calls can be unreliable.",sep=" ")
}
#consider genotypes up to t=6, assume minor alelle is B, switch to cncf for high copy numbers (t>6) for computational efficiency
genotype=c("0","A","AA","AB","AAB","AAA","AAAB","AABB","AAAA","AAAAB","AAABB","AAAAA","AAABBB","AAAABB","AAAAAB","AAAAAA")
minor=c(0,0,0,1,1,0,1,2,0,1,2,0,3,2,1,0)
major=c(0,1,2,1,2,3,3,2,4,4,3,5,3,4,5,6)
t=ifelse(genotype=="0",0,nchar(genotype))
ng=length(genotype)
n=length(logR)
#diploid genome with purity=1
if(all(seg$tcn[seg$chrom<nX]==2 & seg$lcn[seg$chrom<nX]%in%c(1, NA))|max(mafR.clust[seg$chrom<nX & seg$nhet>min.nhet], na.rm = T) < 0.05){
rho=NA
gamma=2
out1=data.frame(seg[,1:9],start=startseq,end=endseq,cf.em=seg$cf,tcn.em=seg$tcn,lcn.em=seg$lcn)
emflags=paste(emflags,"Insufficient information to estimate purity. Likely diplod or purity too low.",sep=" ")
out=list(purity=rho,ploidy=gamma,dipLogR=dipLogR,cncf=out1, emflags=emflags)
return(out)
stop("Insufficient information",call.=F)
}
#intialize cellular fraction rho vector use least squared distance estimates
rhov.lsd=seg$cf
minor.lsd=seg$lcn
t.lsd=seg$tcn
major.lsd=t.lsd-minor.lsd
nas=(is.na(major.lsd)|is.na(minor.lsd))
homdel=which(major.lsd==0&major.lsd==0)
genotype.lsd=rep(NA,nseg)
a=lapply(1:sum(!nas),function(x)paste(rep("A",major.lsd[!nas][x]),collapse=""))
b=lapply(1:sum(!nas),function(x)paste(rep("B",minor.lsd[!nas][x]),collapse=""))
genotype.lsd[!nas]=unlist(lapply(1:sum(!nas),function(x)paste(a[[x]],b[[x]],sep="")))
genotype.lsd[homdel]="0"
which.geno.lsd=match(genotype.lsd,genotype)
rhov.lsd[t.lsd==2&minor.lsd==1]=NA
rhov.lsd[t.lsd==2&rhov.lsd==1]=NA
rhov.lsd[chr>=nX&rhov.lsd==1]=NA
#if(!all(is.na(rhov.lsd[seglen>35]))){
# naive=max(by(rhov.lsd[seglen>35],segclust[seglen>35],function(x)mean(x,na.rm=T)),na.rm=T)
#}else{
naive=quantile(rhov.lsd,prob=0.75,na.rm=T)
#}
rhov.lsd.subset=rhov.lsd
rhov.lsd.subset[which.geno.lsd%in%c(3,5,7,10,11,14,15,NA)]=NA
rhov.lsd.subset[t.lsd>6]=NA
rhov.lsd.subset[seglen<50]=NA
loh=which(t.lsd>=1 & minor.lsd==0 & seglen>50) #use only LOH seg for initial estimate
rho=NA
if(length(loh)>2){
rho=max(by(rhov.lsd[loh],segclust[loh],function(x)mean(x,na.rm=T)),na.rm=T)
}else{
if(length(na.omit(rhov.lsd.subset))>1)rho=max(by(rhov.lsd.subset,segclust,function(x)mean(x,na.rm=T)),na.rm=T)
}
if(is.na(rho)|rho<0.2)rho=naive
rhov=rhov.lsd
rhov[is.na(rhov)]=rho
#avoid initial value too low
rhov[rhov<0.1]=rho
#avoid 1
rhov[rhov==1]=rho
rhov[nhet<min.nhet]=rho
rhov=as.vector(by(rhov,segclust,mean))
rhov0=rhov
lowpur=FALSE
#if(rho<0.2){lowpur=TRUE}
#if(lowpur){
#rhov=rep(rho,nclust)
#}
#center logR at diphet
logR.adj=logR-dipLogR
#initial value for genotype priror
prior=matrix(1/ng,nrow=nclust,ncol=ng)
#initial value for sigma
sigma=rep(2,nclust)
#cold start for rho if set unif=TRUE
if(unif){
rhov=runif(nclust,0.3,0.8)
}
dif=1
iter=0
while(dif>eps && iter<maxiter) {
iter = iter + 1
rhov[is.na(rhov)]=rho
#constraint: any segment cannot have purity higher than the mode purity
rhov[rhov>rho]=rho
#avoid cf below 10%
rhov[rhov<0.1]=rho
rho.old = rho
rhov.old=rhov
sigma.old = sigma
prior.old=prior
########
#E-step#
########
####LogR mixture model parameter####
gamma=2
phi=2*(1-rho)+gamma*rho
mu=log2(2*(1-rhov)+matrix(rhov,ncol=1)%*%t)-log2(phi)
####LogOR mixture model parameter####
#allelic ratio
k=(matrix(rhov,ncol=1)%*%major+1-rhov)/(matrix(rhov,ncol=1)%*%minor+1-rhov)
logk=log(k)
logk2=logk^2
#posterior probability matrix
#pmatrix=NULL
pmatrix=matrix(NA,nrow=nrow(jointseg),ncol=ng)
loglik=0
clust=rep(segclust,nmark)
segc=sort(unique(segclust[chr<=nX]))
for(s in segc){
idx=which(clust==s)
x1ij=logR.adj[idx]
upper=quantile(x1ij,0.95)
lower=quantile(x1ij,0.05)
x1ij[x1ij>upper]=NA
x1ij[x1ij<lower]=NA
mus=rep(mu[s,],each=length(idx))
sd=sigma[s]
if(rhov[s]<0.4){
x1ij=rep(cnlr.median.clust[s]-dipLogR,length(idx))
sd=0.1
}
#density for logR.adj (centered logR)
d1=dnorm(x1ij,mean=mus,sd=sd)
d1[d1==Inf]=NA
#density for logOR, non-central chi-square
nu=rep(logk2[s,],each=length(idx))
lambda=nu/rep(logORvar[idx],ng)
x2ij=logOR2var[idx]
if(rhov[s]<0.4){
x2ij=rep(mafR.clust[s]/logORvar.clust[s],length(idx))
lambda=nu/logORvar.clust[s]
}
#d2=dchisq(x2ij+1,df=1,ncp=lambda)
d2=dchisq(x2ij,df=1,ncp=lambda)
d2=1/(abs(x2ij-lambda)+1e-6)
d2[d2==Inf]=NA
#likelihood
d=d1*d2
hetsum=d[rep(het[idx]==1,ng)]
homsum=d1[rep(het[idx]==0,ng)]
d=sum(hetsum[hetsum<Inf],na.rm=T)+sum(homsum[homsum<Inf],na.rm=T)
if(!is.na(d)&d>0&d<Inf){loglik=loglik+log(d)}
#heterozygous positions contribute to logR and logOR
numerator1=matrix(d1*d2,nrow=length(idx),ncol=ng,byrow=F)
numerator1=sweep(numerator1,MARGIN=2,prior[s,],`*`)
#homozygous positions contribute to logR only
numerator0=matrix(d1,nrow=length(idx),ncol=ng,byrow=F)
numerator0=sweep(numerator0,MARGIN=2,prior[s,],`*`)
numerator=numerator1
numerator[het[idx]==0,]=numerator0[het[idx]==0,]
tmp=apply(numerator,1,function(x)x/(sum(x,na.rm=T)+1e-5))
#pmatrix=rbind(pmatrix,t(tmp))
pmatrix[idx,]=t(tmp)
#update prior
prior[s,]=apply(t(tmp),2,function(x)mean(x,na.rm=T))
}
########
#M-step#
########
#get CF per segments, pick mode close to 1 (favor high purity low cn solution)
rhom=gammam=matrix(NA,nrow=nclust,ncol=ng)
geno=matrix(0,nrow=nclust,ncol=ng)
which.geno=posterior=rep(NA,nclust)
for(i in segc){
idx=which(clust==i)
idxhet=which(clust==i&het==1)
sump=apply(pmatrix[idx,,drop=F],2,function(x)sum(x,na.rm=T))
#if probability is too small (highly uncertain), use lsd estimates for stability
if(all(is.na(prior[i,]))){
prior[i,]=prior.old[i,]
}else{
if(sum(prior[i,],na.rm=T)==0)prior[i,]=prior.old[i,]
}
if(max(prior[i,],na.rm=T)>0.05){
##calculate rho for the most likely genotype(s) for segment i
##if there more more than one likely candidates save two and pick one with higher CF
#top2=sort(prior[i,],decreasing=T)[1:2]
#if(top2[2]>0.05&abs(diff(top2))<0.0001){
# sump[prior[i,]<quantile(prior[i,],(ng-2)/ng)]=NA
# }else{
# sump[prior[i,]<max(prior[i,])]=NA
# }
sump[prior[i,]<max(prior[i,])]=NA
##update k
tmphet=pmatrix[idxhet,,drop=F]
v1=as.vector((logOR[idxhet]^2-logORvar[idxhet])/logORvar[idxhet])
v2=as.vector(1/logORvar[idxhet])
sumdphet=apply(sweep(tmphet,MARGIN=1, v1, `*`), 2,function(x)sum(x,na.rm=T))
sumphet=apply(sweep(tmphet,MARGIN=1,v2,`*`), 2,function(x)sum(x,na.rm=T))
sumphet[is.na(sump)]=NA
#CF from logOR
logk2hat=pmax(0,sumdphet/sumphet) #can be negative when k=1 logk=0 set to 0
khat=exp(sqrt(logk2hat))
a=(1-khat)/(khat*(minor-1)-(major-1))
a[abs(a)==Inf]=NA
a[a<=0]=NA
a[a>1]=1
if(all(nhet[segclust==i]<min.nhet))a=rep(NA,ng)
#CF from logR
tmp=pmatrix[idx,,drop=F]
v=as.vector(logR.adj[idx])
sumdp=apply(sweep(tmp,MARGIN=1,v,`*`),2,function(x)sum(x,na.rm=T))
mu.hat=sumdp/sump #mu.hat
aa=2*(2^mu.hat-1)/(t-2)
aa[abs(aa)==Inf]=NA
aa[aa<=0]=NA
aa[aa>1]=1
aaa=pmax(a,aa,na.rm=T)
#degenerate cases
#homozygous deletion (0) and balanced gain (AABB, AAABBB), maf=0.5, purity information comes from logr only
aaa[c(1,8,13)]=aa[c(1,8,13)]
#set upper bound at sample rho
aaa=pmin(aaa,rho)
#uniparental disomy (AA) CF information comes from logOR only.
##if there are two likely genotype, choose one with higher purity (e.g.,AAB 80% or AAAB 50%)
##if the higher CF exceeds sample purity, then the lower CF is the right one
#if(all(is.na(aaa))){which.geno[i]=which.max(prior[i,])}else{
# which.geno[i]=ifelse(max(aaa,na.rm=T)<rho,which.max(aaa),which.min(aaa))
#}
which.geno[i]=which.max(prior[i,])
postprob=pmatrix[idx,which.geno[i]]
posterior[i]=mean(postprob[postprob>0],na.rm=T)
#update sigma
y=as.vector(logR.adj[idx])*pmatrix[idx,,drop=F]
r=y-mu[i,]*pmatrix[idx,,drop=F]
ss=sqrt(sum(r[,which.geno[i]]^2,na.rm=T)/sum(pmatrix[idx,which.geno[i]]))
sigma[i]=ifelse(is.na(ss),0.5,ss)
aaa[setdiff(1:ng,which.geno[i])]=NA
#het dip (AB) seg has no information, set CF at a high value less than 1
#if(any(which(!is.na(sump))==4)){aaa[4]=0.9}
rhom[i,]=aaa
} #max prior
}
#for segments noninformative for purity, plug in sample purity
rhov=unlist(apply(rhom,1,function(x)ifelse(all(is.na(x)),NA,na.omit(x))))
#Determine sample rho
rhov.long=rhov[seg$segclust]
which.geno.long=which.geno[seg$segclust]
rhov.long[which.geno.long == 4]=NA
rhov.long[chr>=nX]=NA
if(sum(!is.na(rhov.long[seglen>35]))>1){
meanrho=max(by(rhov.long[seglen>35],segclust[seglen>35],function(x)mean(x,na.rm=T)),na.rm=T)
}else{
meanrho=quantile(rhov.long,prob=0.75,na.rm=T) #if no big segments, probably very noisy sample or over-segemnted. can't estimate rho accurately, just take upper quatile
}
rhov.long.subset=rhov.long
rhov.long.subset[which.geno.long %in% c(5,7,10,11,14,15,NA)]=NA #Imbalanced gains have big identifiability issue
rhov.long.subset[seglen<50]=NA
#if low purity, assume rhov the same
#if more than 100 seg give rho estimate, use density to find rho
#otherwise use LOH seg for purity estimate
if(all(is.na(rhov.long))){
rho=naive
}else{
if(lowpur){
rho=max(rhov.long,na.rm=T)
rhov=rep(rho,nclust)
}else{
nona=na.omit(rhov.long)
if(length(nona)>100){
rho=find.mode(nona)$rho
}else{
loh=which(major[which.geno.long]>=1 & minor[which.geno.long]==0 & seglen>50)
rhov.loh=rep(NA,length(rhov.long))
rhov.loh[loh]=rhov.long[loh]
if(length(loh)>1 & !all(is.na(rhov.loh))){
rho=max(by(rhov.loh,segclust,function(x)mean(x,na.rm=T)),na.rm=T)
}else{
if(length((na.omit(rhov.long.subset)))>1&dipLogR< -0.3)rho=max(by(rhov.long.subset,segclust,function(x)mean(x,na.rm=T)),na.rm=T)
}
}
}
}
if(is.na(rho))rho=meanrho
dif = quantile(abs(rhov-rhov.old),0.9,na.rm=T)
if(trace) {
cat('iter:', iter, '\n')
cat('dif:', dif, '\n')
cat('purity:', rho, '\n')
cat('ploidy:', gamma, '\n')
}
}
rhov.em=rhov[seg$segclust]
which.geno.em=which.geno[seg$segclust]
genotype.em=which.geno.em
genotype.em[which(!is.na(which.geno.em))]=paste("A",major[which.geno.em],"B",minor[which.geno.em],sep="")[which(!is.na(which.geno.em))]
t.em=t[which.geno.em]
major.em=major[which.geno.em]
minor.em=minor[which.geno.em]
#calculate ploidy
gamma=(2^(-dipLogR)*(2*(1-rho)+2*rho)-2*(1-rho))/rho
#hybrid: for high copy number (t>6), use moment estimates
seglogr.adj=seg$cnlr.median-dipLogR
idx=which(seglogr.adj>1.6*rho|is.na(which.geno.em))
if(any(idx)){
maf=exp(sqrt(mafR[idx]))
tt=round((2^(seglogr.adj[idx]+1)-2*(1-rho))/rho,0)
tt[tt<0]=0 #homdel
mm=round((tt*maf*rho+(maf-1)*(1-rho))/(rho*(maf+1)),0)
re=which(mm>tt) #rounding error can cause major>t
if(any(re)){mm[re]=tt[re]}
t.em[idx]=tt
major.em[idx]=mm
minor.em[idx]=t.em[idx]-major.em[idx]
genotype.em[idx] = paste("A",major.em[idx], "B", minor.em[idx], sep="")
rhov.em[idx]=rho
}
# #EM over-calling homozygous deletion at low cf, switch to lsa
# idx=which(which.geno.em==1&rhov.em<0.25&seglen>35)
# if(any(idx)){
# genotype.em[idx]=genotype.lsd[idx]
# t.em[idx]=t.lsd[idx]
# minor.em[idx]=minor.lsd[idx]
# major.em[idx]=major.lsd[idx]
# rhov.em[idx]=rhov.lsd[idx]
# }
#if het SNPs are too few, not sufficient information to estimate minor cn
lownhet=which(nhet<min.nhet)
minor.em[lownhet]=NA
minor.em[t.em<=1]=0
#set cf=1 for 2-1 segments (100% nothing)
rhov.em[t.em==2&minor.em==1]=1
rhov.em[t.em==2&is.na(minor.em)]=NA
#for male, use the empirical call
if(sum(chr==nX)>0){
prop.nhet.chrX=sum(nhet[chr==nX])/sum(nmark[chr==nX])
male=(prop.nhet.chrX<0.01)
}else{
male=FALSE
}
#normal male X is one copy. No het snps to start with, so don't call minor cn
if(male){
t.em[chr>=nX]=round(t.em[chr>=nX]/2,0)
minor.em[chr>=nX]=NA
normalX=which(t.em[chr>=nX]==1)
if(any(normalX))rhov.em[chr>=nX][normalX]=1
}
#out1=data.frame(seg,cf.em=rhov.em,tcn.em=t.em, lcn.em=minor.em)
out1=data.frame(seg[,1:9],start=startseq,end=endseq,cf.em=rhov.em,tcn.em=t.em,lcn.em=minor.em)
if(rho<0.3){emflags=paste(emflags,"Low purity. Calls can be unreliable.",sep=" ")}
out=list(loglik=loglik,purity=rho,ploidy=gamma,dipLogR=dipLogR,seglen=seglen,cncf=out1, emflags=emflags)
return(out)
}
find.mode=function (x)
{
den = density(na.omit(x),n=length(x))
y = den$y
y[y < 0.001] = 0
rho = den$x[which.max(y)]
difseq = rle(sign(diff(y)))
nmodes = length(difseq$lengths)/2
len = cumsum(difseq$lengths)
modes.pos = len[which(difseq$values == 1)] + 1
modes = y[modes.pos]
idx = order(modes, decreasing = T)
signodes = modes.pos[which(modes > 1.5)]
if (length(signodes) >= 2) {
#rho = max(den$x[modes.pos[idx]],na.rm=T)
rho = max(den$x[signodes],na.rm=T)
}
out = list(rho = rho, nmodes = nmodes)
out
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.