MutationTimeR: Timing somatic point mutations and copy number gains

Documented in addMetadataToHeader addMutTime classifyMutations classWgd getAltCount getClusterPower getHomozygousity getMetaEntry getPloidy getPurity getSex getSubclonalArchitecture getTumorDepth getWGDstatus mutationTime plotSample readVcf writeVcf

getTumorCounts <- function(vcf){
	sapply(grep("(F|R).Z", names(geno(vcf)), value=TRUE), function(i) geno(vcf)[[i]][,"TUMOUR"])
}

#' Extract tumour depth
#' @param vcf 
#' @return numeric()
#' 
#' @author mg14
#' @export
getTumorDepth <- function(vcf){
	if("t_alt_count" %in% colnames(info(vcf))){ ## consensus data, snv and indel
		return(info(vcf)$t_alt_count + info(vcf)$t_ref_count)
	}
	if("DP" %in% names(geno(vcf))){ ## Mutect2
		return(geno(vcf)$DP[,1])
	}else{ ## older data
		if("FAZ" %in% rownames(geno(header(vcf)))){
			return(rowSums(getTumorCounts(vcf)))
		}else{
			return(geno(vcf)$DEP[,2])
		}
	}
}


#' Get alt alleles counts 
#' @param vcf 
#' @return numeric()
#' 
#' @author mg14
#' @export
getAltCount <- function(vcf){
	if("t_alt_count" %in% colnames(info(vcf))){ ## consensus data, snv and indel
		return(info(vcf)$t_alt_count)
	}
	if("AD" %in% names(geno(vcf))){ ## Mutect2
		return(geno(vcf)$AD[,1,2])
	}else{ ## older formats
		if("FAZ" %in% rownames(geno(header(vcf)))){ ## ie subs
			c <- getTumorCounts(vcf)
			t <- c[,1:4] + c[,5:8]
			colnames(t) <- substring(colnames(t),2,2)
			a <- as.character(unlist(alt(vcf)))
			a[!a%in%c('A','T','C','G')] <- NA
			return(sapply(seq_along(a), function(i) if(is.na(a[i])) NA else t[i, a[i]]))
		}
		else{ ## ie indel
			#(geno(vcf)$PP + geno(vcf)$NP + geno(vcf)$PB + geno(vcf)$NB)[,"TUMOUR"]
			return(geno(vcf)$MTR[,2])
		}
	}
}

#' @importFrom VGAM pbetabinom
dtrbinom <- function(x, size, prob, xmin=0) dbinom(x,size,prob) / pbinom(xmin-1, size, prob, lower.tail=FALSE)
pbetabinom <- function(x, size, prob, rho){
	if(rho!=0)
		VGAM::pbetabinom(x, size, prob, rho)
	else
		pbinom(x, size, prob)
}
#' @importFrom VGAM dbetabinom
dbetabinom <- function(x, size, prob, rho){
	if(rho!=0)
		VGAM::dbetabinom(x, size, prob, rho)
	else
		dbinom(x, size, prob)
}
dtrbetabinom <- function(x, size, prob, rho, xmin=0) {y <- dbetabinom(x,size,prob,rho) / (1-pbetabinom(xmin-1, size, prob, rho))
	y[x<xmin] <- 0
	return(y)}
ptrbetabinom <- function(x, size, prob, rho, xmin=0) {
	pmin <- pbetabinom(xmin-1, size, prob, rho)
	pmax(0,(pbetabinom(x,size,prob,rho) - pmin) / (1-pmin))}


mergeClusters <- function(clusters, deltaFreq=0.05){
	if(nrow(clusters) <= 1) return(clusters)
	h <- hclust(dist(clusters$proportion))
	ct <- cutree(h, h=deltaFreq)
	cp <- as.matrix(cophenetic(h))
	Reduce("rbind",lapply(unique(ct), function(i) {
						n_ssms <- sum(clusters$n_ssms[ct==i])
						w <- max(cp[ct==i,ct==i])
						data.frame(new.cluster=i, n_ssms=n_ssms, proportion=sum(clusters$proportion[ct==i]*clusters$n_ssms[ct==i])/n_ssms, width=w)
					}
			))
}

#' Extract the average ploidy from copy number
#' @param cn The `GRanges` copy number data
#' @return The numeric value of average ploidy across the genome. 
#' 
#' @author Moritz Gerstung
#' @export
getPloidy <- function(cn) {
	c <- if(!is.null(cn$copy_number)) cn$copy_number else cn$total_cn
	sum(width(cn) * c * cn$clonal_frequency, na.rm=TRUE) / sum(width(cn) * cn$clonal_frequency, na.rm=TRUE)
}

#' Extract the average homozygousity from copy number
#' @param cn The `GRanges` copy number data
#' @return The numeric value of average homozygousity across the genome. 
#' 
#' @author Moritz Gerstung
#' @export
getHomozygousity <- function(cn){
	sum(width(cn) * (cn$minor_cn == 0) * cn$clonal_frequency, na.rm=TRUE) / sum(width(cn) * cn$clonal_frequency, na.rm=TRUE)
}

.classWgd <- function(ploidy, hom) 2.9 -2*hom <= ploidy ## Heuristic developed in PCAWG-11

#' Heuristic to classify WGD based on CN profile
#' 
#' The heuristic calculates the genome-wide average ploidy (ie total copy number) and also average homozygousity. Whole genome duplicated tumours
#' were found to have a ploidy in excess of 2.9 - 2 * homozygousity.
#' @param cn Copy number with consensus annotation meta data.
#' @return FALSE/TRUE
#' 
#' @author mg14
#' @export
classWgd <- function(cn) .classWgd(getPloidy(cn), getHomozygousity(cn))

defineMcnStates <- function(cn, clusters, purity, gender='female', isWgd= FALSE, deltaFreq=0.05){
	P <- vector(mode='list', length(cn))
	uniqueBB <- unique(cn)
	overlaps <- findOverlaps(uniqueBB, cn)
	
	majorCN <- split(cn$major_cn[subjectHits(overlaps)], queryHits(overlaps))
	m <- cn$minor_cn #hack: minor_cn > 0 in male samples - Battenberg bug?
	if(gender=='male')
		m[as.character(seqnames(cn)) %in% c('X','Y')] <- 0
	minorCN <- split(m[subjectHits(overlaps)], queryHits(overlaps))	
	h <- selectHits(overlaps, "first")
	H <- selectHits(overlaps, "last")
	
	cnNormal <- 2 - (gender=='male' & seqnames(uniqueBB)=="X" | seqnames(uniqueBB)=="Y")
	
	# Fix cluster and purity discrepancies
	clusters$proportion[which.max(clusters$proportion)] <- purity
	
	cloneFreq <- split(cn$clonal_frequency[subjectHits(overlaps)], queryHits(overlaps))
	cnStates <- matrix(0, nrow=10000, ncol=6)
	colnames(cnStates) <- c("state","m","f","n.m.s","pi.m.s","s")
	
	power.c <- rep(0, nrow(clusters))
	
	
	for( i in seq_along(uniqueBB)){
		if(!i %in% names(majorCN)) next
		majcni <- majorCN[[as.character(i)]]
		mincni <- minorCN[[as.character(i)]]
		cfi <- cloneFreq[[as.character(i)]]
		effCnTumor <- sum((majcni + mincni)*cfi)
		effCnNormal <- as.numeric(cnNormal[i]) * (1-purity)
		
		if(any(is.na(majcni))) next
		
		mixFlag <- FALSE
		subclonalGainFlag <- FALSE
		clonalFlag <- TRUE
		majdelta <- 0
		mindelta <- 0
		
		majanc <- majder <- majcni
		minanc <- minder <- mincni
		
		if(length(cfi)>1){ # multiple (subclonal) CN states, if so add clonal option (ie. mixture of both states), subclonal states only change by 1..delta(CN)
			d <- colSums(abs(rbind(majcni, mincni) - c(1,1) * (1+ isWgd)))
			derived <- d == max(d) ## derived state further from diploid/tetraploid
			if(all(derived)) next 
			majanc <- majcni[!derived]
			minanc <- mincni[!derived]
			majder <- majcni[derived]
			minder <- mincni[derived]
			majdelta <- majcni[derived] - majcni[!derived]
			mindelta <- mincni[derived] - mincni[!derived]
			majcni <- c(min(majcni), # clonal, sub on allele that doesn't change
					(majcni[!derived] + cfi[derived]/purity*majdelta), # clonal, sub on allele that does change
					(majcni[derived] >0) + (majdelta > 0)) # subclonal, subs after or before CNA, m=1,delta
			mincni <- c(min(mincni), (mincni[!derived] + cfi[derived]/purity*mindelta), (mincni[derived] >0) + (mindelta > 0))
			majdelta <- c(0, cfi[derived]/purity*majdelta, majdelta)
			mindelta <- c(0, cfi[derived]/purity*mindelta, mindelta)
			cfi <- c(purity, purity,  cfi[derived])
			mixFlag <- c(FALSE, TRUE, FALSE)
			clonalFlag <- c(TRUE,TRUE,FALSE)
			subclonalGainFlag <- c(FALSE, FALSE, TRUE)
		}
		
		
		a <- sapply(clusters$proportion, function(p) all(abs(p-cfi) > deltaFreq)) # subclone(s) not coinciding with CN change
		if(any(a)){ # assume subclones have derived from most abundant CN state
			majcni <- c(majcni, rep(majcni[which.max(cfi)]>0, sum(a))+0)
			mincni <- c(mincni, rep(mincni[which.max(cfi)]>0, sum(a))+0)
			cfi <- c(cfi, clusters$proportion[a])
			mixFlag <- c(mixFlag, rep(FALSE, sum(a)))
			clonalFlag <- c(clonalFlag, rep(FALSE, sum(a)))
			subclonalGainFlag <- c(subclonalGainFlag, rep(FALSE, sum(a)))
			majdelta <- c(majdelta, rep(0,sum(a)))
			mindelta <- c(mindelta, rep(0,sum(a)))
		}
		totcni <- majcni+mincni
		if(all(totcni==0)) next
		
		k <- 0
		for( j in seq_along(majcni)){
			if(majcni[j]==0 & mincni[j]==0) {
				f <- m <- 0 # allele frequency
				pi.m.s <- n.m.s <- 1
				l <- 1
			}else{
				l <- 1:max(majcni[j], mincni[j]) # mincni>majcni can occur if minor allele changes subclonally
				m <- l
				n.m.s <- rep(1, length(l)) #multiplicity of cn states
				if(!mixFlag[j]){ # single subclone, or no subclonal cn
					f <- m * cfi[j] / (effCnTumor + effCnNormal)
					if(mincni[j] > 0)
						n.m.s[1:min(majcni[j], mincni[j])] <- 2
					pi.m.s <- n.m.s/sum(n.m.s)
				}else{ # coexisting cn subclones, use mixture
					d <- if(majdelta[j] != 0) majdelta[j] else mindelta[j] 
					M <- max(mincni[j]*(mindelta[j]!=0), majcni[j]*(majdelta[j]!=0))
					m <- (d > 0):M + (M-floor(M))
					l <- seq_along(m)
					f <- m *cfi[j] / (effCnTumor + effCnNormal) # Variant on major allele
					n.m.s <- rep(1, length(l))
					pi.m.s <- n.m.s/sum(n.m.s)
				}
			}
			cnStates[k + l,"state"]=j
			cnStates[k + l,"m"]=m
			cnStates[k + l,"f"]=f
			cnStates[k + l,"pi.m.s"]=pi.m.s
			cnStates[k + l,"n.m.s"]=n.m.s
			k <- k + length(l)
		}
		whichStates <- (1:k)[cnStates[1:k,"f"]>0]
		
		# State probabilities - based on cell fractions
		cfi.s <- unique(cfi)
		cl <- clusters$n_ssms
		cl <- cl / ( majcni[1] + mincni[1])
		cl[1] <- cl[1] / ( (majcni[1]>0) + (mincni[1]>0)) * ( majcni[1] + mincni[1] )
		pi.s <- sapply(cfi.s, function(p) ifelse(min(abs(clusters$proportion - p)) < deltaFreq, cl[which.min(abs(clusters$proportion - p))], 1))
		pi.s <- pi.s/sum(pi.s)
		
		c.to.s <- sapply(cfi.s, function(p) ifelse(min(abs(clusters$proportion - p)) < deltaFreq, which.min(abs(clusters$proportion - p)), NA)) # map to cluster
		s.to.c <- sapply(clusters$proportion, function(c) which.min(abs(cfi.s - c)))
		
		cnStates[1:k,"s"] = as.numeric(factor(cfi, levels=cfi.s))[cnStates[1:k,"state"]]
		
		timing_param <- cbind(cnStates[whichStates,,drop=FALSE], cfi=cfi[cnStates[whichStates,"state"]], pi.s=pi.s[cnStates[whichStates,"s"]], P.m.sX=NA,P.m.sX.lo=NA, P.m.sX.up=NA, T.m.sX=NA, T.m.sX.lo=NA, T.m.sX.up=NA, power.s=NA, power.m.s = NA,
				majCN=majcni[cnStates[whichStates,"state"]], minCN=mincni[cnStates[whichStates,"state"]], 
				majDelta = majdelta[cnStates[whichStates,"state"]], minDelta = mindelta[cnStates[whichStates,"state"]], 
				clonalFlag=clonalFlag[cnStates[whichStates,"state"]], subclonalGainFlag=subclonalGainFlag[cnStates[whichStates,"state"]], mixFlag=mixFlag[cnStates[whichStates,"state"]], majCNanc=majanc, minCNanc=minanc, majCNder=majder, minCNder=minder)
		P[[h[i]]] <- timing_param
		if(H[i] != h[i]) P[[H[[i]]]] <- P[[h[i]]]
		
	}
	return(P)
} 


computeMutCn <- function(vcf, bb, clusters=data.frame(cluster=1, proportion=max(bb$clonal_frequency,na.rm=TRUE), n_ssms=100), purity=max(bb$clonal_frequency,na.rm=TRUE), gender='female', isWgd= FALSE, xmin=3, rho=0, n.boot=200, deltaFreq=0.05){
	n <- nrow(vcf)
	D <- DataFrame(MutCN=rep(NA,n), MutDeltaCN=rep(NA,n), MajCN=rep(NA,n), MinCN=rep(NA,n), MajDerCN=rep(NA,n), MinDerCN=rep(NA,n), CNF=rep(NA,n), CNID =as(vector("list", n),"List"), pMutCN=rep(NA,n), pGain=rep(NA,n),pSingle=rep(NA,n),pSub=rep(NA,n), pAllSubclones=as(vector(mode="list",n),"List"), pMutCNTail=rep(NA,n))	
	P <- defineMcnStates(bb,clusters, purity, gender, isWgd, deltaFreq=deltaFreq)
	if(n==0)
		return(list(D=D, P=P))
	
	altCount <- getAltCount(vcf)
	tumDepth <- getTumorDepth(vcf)
	names(altCount) <- names(tumDepth) <- NULL
	
	# Match VCF and CN
	overlaps <- findOverlaps(vcf, bb)
	D[["CNID"]] <- as(overlaps, "List")
	majorCN <- split(bb$major_cn[subjectHits(overlaps)], queryHits(overlaps))
	m <- bb$minor_cn #hack: minor_cn > 0 in male samples - Battenberg bug?
	if(gender=='male')
		m[as.character(seqnames(bb)) %in% c('X','Y')] <- 0
	minorCN <- split(m[subjectHits(overlaps)], queryHits(overlaps))	
	h <- selectHits(overlaps, "first")
	H <- selectHits(overlaps, "last")
	
	cnNormal <- 2 - (gender=='male' & seqnames(vcf)=="X" | seqnames(vcf)=="Y")
	
	# Fix cluster and purity discrepancies
	clusters$proportion[which.max(clusters$proportion)] <- purity
	
	cloneFreq <- split(bb$clonal_frequency[subjectHits(overlaps)], queryHits(overlaps))
	
	power.c <- rep(0, nrow(clusters))
	
	deltaFreq <- 0.05 # merge clusters withing deltaFreq
	
	boundaryHits <- countOverlaps(vcf, unique(bb)) > 1 # indels overlapping with CN boundaries
	
	for(globalIt in 1:2){ # 2 iterations, fist local (ie per segment) fit, then global
		for( i in which( (diff(c(-1, h)) !=0 | is.na(diff(c(-1, h)) !=0) ) & ! boundaryHits )){
			if(!i %in% names(majorCN)) next
			if(is.na(h[i])) next
			if(if(i>1) h[i] != h[i-1] | is.na(h[i-1]) else TRUE){ #ie. new segment
				
				hh <- which(h==h[i] & !is.na(altCount) &! is.na(tumDepth))
				if(length(hh)==0) next
				
				if(is.null(bb$timing_param[[h[i]]])){
					cnStates <- P[[h[i]]]
					if(is.null(cnStates)) next
					whichStates <- 1:nrow(cnStates)
					
					# State probabilities - based on cell fractions
					cfi.s <- unique(cnStates[,"cfi"])
					cl <- clusters$n_ssms
					cl <- cl / ( cnStates[1,"majCN"] + cnStates[1,"minCN"])
					cl[1] <- cl[1] / ((cnStates[1,"majCN"]>0) + (cnStates[1,"minCN"]>0)) * ( cnStates[1,"majCN"] + cnStates[1,"minCN"] )
					pi.s <- sapply(cfi.s, function(p) ifelse(min(abs(clusters$proportion - p)) < deltaFreq, cl[which.min(abs(clusters$proportion - p))], 1))
					pi.s <- pi.s/sum(pi.s)
					
					c.to.s <- sapply(cfi.s, function(p) ifelse(min(abs(clusters$proportion - p)) < deltaFreq, which.min(abs(clusters$proportion - p)), NA)) # map to cluster
					s.to.c <- sapply(clusters$proportion, function(c) which.min(abs(cfi.s - c)))
					
					
					# Likelihood
					L <- matrix(sapply(pmin(cnStates[whichStates,"f"],1), function(pp) dtrbetabinom(altCount[hh],tumDepth[hh], ifelse(pp==1, 1-.Machine$double.eps, pp), rho=rho, xmin=pmin(altCount[hh],0)) + .Machine$double.eps), ncol=length(whichStates))
					
					# Power
					power.sm <- colMeans(matrix(sapply(pmin(cnStates[whichStates,"f"],1), function(pp) 1-ptrbetabinom(pmin(altCount[hh],xmin-1),tumDepth[hh],ifelse(pp==1, 1-.Machine$double.eps, pp), rho=rho, xmin=0)), ncol=length(whichStates)), na.rm=TRUE)

					if(globalIt==2){
						P.m.sX <- P[[h[i]]][,"P.m.sX"]
						power.s <- sapply(split(power.sm * P.m.sX, cnStates[whichStates,"s"]), sum) # Power for state
						power.s[!is.na(c.to.s)] <- power.c[na.omit(c.to.s)]
						power.m.s <- power.sm # Relative power of m states (local) conditioned on s (global).
						for(s in unique(cnStates[whichStates,"s"])) power.m.s[cnStates[whichStates,"s"]==s] <- power.m.s[cnStates[whichStates,"s"]==s] / max(power.m.s[cnStates[whichStates,"s"]==s]) #power.s[s]
					}else{ # First iteration
						power.m.s <- power.sm
						power.s <- rep(1,length(whichStates))
					}
					
					mm <- function(x) {
						x <- factor(x)
						if(nlevels(x) > 1) t(model.matrix( ~ x + 0)) else matrix(1, ncol=length(x))
					}
					
					# EM algorithm (mixture fitting) for pi
					P.m.sX <- cnStates[whichStates,"pi.m.s"]
					s.from.m <- mm(cnStates[whichStates,"s"]) # indicator matrix to map
					for(em.it in 1:100){
						P.xsm <- L * rep(pi.s[cnStates[whichStates,"s"]] * P.m.sX / power.m.s / power.s[cnStates[whichStates,"s"]], each=nrow(L)) # P(X,s,m)
						P.sm.x <- P.xsm/rowSums(P.xsm) # P(s,m|Xi)
						P.sm.X <- pmax(.Machine$double.xmin,colMeans(P.sm.x)) # P(s,m|X) / piState[cnStates[1:k,"state"]] / cnStates[1:k,"pi.m.s"]
						if(em.it==100) break
						P.s.X <- s.from.m %*% P.sm.X 
						P.m.sX <- P.sm.X / P.s.X[cnStates[whichStates,"s"]]
					}
					
					toTime <- function(cnStates, P.m.sX, s.m) {
						mAnc <- cnStates[,"m"] - cnStates[,"minDelta"] - cnStates[,"majDelta"]
						mAnc.s <- factor(paste(mAnc, cnStates[,"s"], sep="."))
						n <- (mAnc <= cnStates[,"majCNanc"]) + (mAnc <= cnStates[,"minCNanc"] )
						mAnc.s.from.m <- mm(x = mAnc.s)# indicator matrix to map
						return((mAnc.s.from.m[mAnc.s,] %*% P.m.sX) / (s.m[cnStates[,"s"],] %*% (P.m.sX * mAnc)) *  (cnStates[,"majCNanc"] + cnStates[,"minCNanc"]) / n)
					}
					
					T.m.sX <- toTime(cnStates, P.m.sX, s.from.m) 
					
					if(globalIt==1){
						p <- (sapply(split(power.sm * P.m.sX, cnStates[whichStates,"s"]), sum) * nrow(L))[s.to.c]
						if(!any(is.na(p) | is.nan(p)))
							power.c <- power.c + p 
					}
					
					# Bootstrapping for CIs
					if(globalIt==2){
						b.m.sX <- if(n.boot>0) sapply(1:n.boot, function(foo){
												L <- rbind(L, rep(1e-3, each=ncol(L))) #add an uniformative row
												L <- L[sample(1:nrow(L), replace=TRUE),,drop=FALSE]
												P.m.sX <- cnStates[whichStates,"pi.m.s"]
												for(em.it in 1:50){
													P.xsm <- L * rep(pi.s[cnStates[whichStates,"s"]] * P.m.sX / power.m.s / power.s[cnStates[whichStates,"s"]], each=nrow(L)) # P(X,s,m)
													P.sm.x <- P.xsm/rowSums(P.xsm) # P(s,m|Xi)
													P.sm.X <- colMeans(P.sm.x) # P(s,m|X) / piState[cnStates[1:k,"state"]] / cnStates[1:k,"pi.m.s"]
													P.s.X <- s.from.m %*% P.sm.X 
													P.m.sX <- P.sm.X / P.s.X[cnStates[whichStates,"s"]]
												}
												return(P.m.sX)
											}) else NA
						if(n.boot>0) try({
										CI.m.sX <- apply(b.m.sX, 1, quantile, c(0.025, 0.975))
										cnStates[,"P.m.sX.lo"] <- CI.m.sX[1,] 
										cnStates[,"P.m.sX.up"] <- CI.m.sX[2,]
										B.m.sX <- toTime(cnStates = cnStates, P.m.sX = b.m.sX, s.m = s.from.m)
										C.m.sX <- apply(B.m.sX, 1, quantile, c(0.025, 0.975))
										cnStates[,"T.m.sX.lo"] <- C.m.sX[1,] 
										cnStates[,"T.m.sX.up"] <- C.m.sX[2,]
									}, silent=TRUE)
					}
					
					P.sm.x[apply(is.na(P.sm.x)|is.nan(P.sm.x),1,any),] <- NA
					cnStates[,"P.m.sX"] <- P.m.sX
					cnStates[,"T.m.sX"] <- T.m.sX
					cnStates[,"power.s"] <- power.s[cnStates[whichStates,"s"]]
					cnStates[,"power.m.s"] <- power.m.s
					
				}else{
					cnStates <- bb$timing_param[[h[i]]]
					if(is.null(cnStates)) next
					P.sm.x <- posteriorMutCN(x=altCount[hh],n=tumDepth[hh], cnStates=cnStates, xmin=0, rho=rho)
				}
				
				# Tail probs
				pMutCNTail <- matrix(sapply(pmin(cnStates[,"f"],1), function(pp) ptrbetabinom(altCount[hh],tumDepth[hh], ifelse(pp==1, 1-.Machine$double.eps, pp), rho=rho, xmin=pmin(altCount[hh],xmin))), ncol=nrow(cnStates)) #%*% c(pi.s[cnStates[whichStates,"state"]] * P.m.sX)				
				
				P[[h[i]]] <- cnStates
				if(H[i] != h[i]) P[[H[[i]]]] <- P[[h[i]]]
				
				w <- apply(P.sm.x, 1, function(x) if(any(is.na(x))) NA else which.max(x) )
				if(all(is.na(w))) next
				
				D[hh, "pSub"] <- rowSums(P.sm.x[, !cnStates[,"clonalFlag"], drop=FALSE])
				D[hh, "pGain"] <- rowSums(P.sm.x[, cnStates[,"clonalFlag"] & cnStates[,"m"] > 1.00001 + cnStates[,"majDelta"] + cnStates[,"minDelta"], drop=FALSE])
				#D[hh, "pSingle"] <- rowSums(P.sm.x[, cnStates[1:k,"state"] %in% which(clonalFlag) & cnStates[1:k,"m"]<=1, drop=FALSE])
				D[hh, "pSingle"] <-  1 - D[hh, "pSub"] - D[hh, "pGain"]			

				#currently doesn't work
				#D[hh,"pAllSubclones"] <- as(DataFrame(t(P.sm.x[, !cnStates[,"clonalFlag"], drop=FALSE])),"List")
				
				D[hh,"MutCN"]  <- cnStates[w,"m"]
				D[hh,"MutDeltaCN"]  <- cnStates[w,"majDelta"] + cnStates[w,"minDelta"]
				D[hh,"MinCN"] <- cnStates[w,"minCNanc"]
				D[hh,"MajCN"] <- cnStates[w,"majCNanc"]
				D[hh,"MinDerCN"] <- cnStates[w,"minCNder"]
				D[hh,"MajDerCN"] <- cnStates[w,"majCNder"]
				
				D[hh,"CNF"]  <- cnStates[w,"cfi"]
				D[hh,"pMutCN"] <- sapply(seq_along(w), function(i) P.sm.x[i,w[i]])
				D[hh,"pMutCNTail"] <- sapply(seq_along(w), function(i) pMutCNTail[i,w[i]])
				#D[hh,"altCount"] <- altCount[hh]
				#D[hh,"wtCount"] <- tumDepth[hh] - altCount[hh]
			}		
		}
		if(globalIt==1){
			power.c <- power.c / sum(!is.na(D[,"pSub"]))
			if(any(is.na(power.c) | power.c==0)) {
				break # Cancel 2nd iteration
				warning("Power calculation yielded NA, aborting.")
			}
			if(any(power.c > 1)) {
				warning("Calculated power > 1, rounding down.")
				power.c <- pmin(1, power.c)
			}
		}
	}
	return(list(D=D, P=P, power.c=power.c))
}

#' Compute mutation time from copy number gains and point mutations
#' @param vcf A vcf object of simple somatic mutations (SNV/MNV/indels) in generic format using `geno()` format entries `AD` and `DP` or in PCAWG format with `info()` columns `t_ref_count` and `t_alt_count` storing the allele counts of reference and alternative (mutated) sequence. 
#' See VariantAnnotation::readVcf(). `AD` and `DP` are automatically generated by conversion from `VRanges()`.
#' @param cn The copy number as a GRanges() object, meta data in PCAWG consensus format. 
#' This includes the extrac columns `major_cn` for major allele copy number, `minor_cn` for the minor allel copy number (both integers), as well as `clonal_frequency`, which should be 
#' the tumour purity. For subclonal copy number changes, two regions with identical coordinate and `clonal_frequency_1 + clonal_frequency_2 = purity` are supplied. See loadBB()
#' @param clusters A `data.frame` with information about known subclonal mutation clusters. The parameters are `cluster` identifier `proportion` (a fraction of purity) and `n_ssms` being the number of mutations assigned to each cluster, which serves as a prior. 
#' The default assumes a single subclonal cluster at about 50% of cancer cell fraction contributing 10% of mutations. This roughly
#' agrees with general observations, but a more bespoke analysis is recommented. Set to clusters = data.frame(cluster=1, proportion=max(cn$clonal_frequency,na.rm=TRUE), n_ssms=100) for a purely
#' clonal treatment.
#' @param purity The purity of the samples. Note that purity should match the values in `cn` and `clusters`.
#' @param gender 'male' or 'female' (default). This is needed to establish the normal sex chromosom configuration.
#' @param isWgd TRUE/FALSE. Whether the tumour is whole genome duplicated. The default uses classWgd(cn). WGD status changes the rules of timing subclonal copy number states such as a mixture of 1:2 and 2:2. In the case of WGD the latter would be considered to be ancestral, otherwise the former. 
#' @param xmin The minimal read support. Needed for power calculations. Default `xmin=2`.
#' @param rho Dispersion parameter for the beta-binomial model of mutant read count. Default `rho=0`
#' @param n.boot The number of bootstrap samples for confidence interval estimation. Defaul `n.boot=200`.
#' @return A list with elements (V: Data.Frame with variant-specific timing information, can be added to vcf object; T: DataFrame with timing information to be added to CN Ranges; power.c power of each cluster).
#' @example inst/example/example.R
#'  
#' @author mg14
#' @export
mutationTime <- function(vcf, cn, clusters=data.frame(cluster=1:2, proportion=max(cn$clonal_frequency,na.rm=TRUE)*c(1,0.5), n_ssms=c(90,10)), purity=max(cn$clonal_frequency,na.rm=TRUE), gender='female', isWgd=classWgd(cn), xmin=3, rho=0, n.boot=200){
	MT <- computeMutCn(vcf, cn, clusters, purity, gender, isWgd, xmin, rho, n.boot)
	MT$D$CLS <- classifyMutations(as.data.frame(MT$D))
	n.snv_mnv <- countOverlaps(cn,vcf)
	time <- mtToTime(cn, timing_param=MT$P, n.snv_mnv=n.snv_mnv)
	T <- DataFrame(timing_param=List(MT$P), time, n.snv_mnv=n.snv_mnv)
	return(list(V=MT$D, T=T))
}


# Return header elements
# @return DataFrame() to be added to VCF header
# 
# @author mg14
mtHeader <- function() {
	DataFrame(Number=c(1,1,1,1,1,1,1,1,".",1,1,1,1,".",1),Type=c("Float","Float","Integer","Integer","Integer","Integer","Float","Float","Integer","Float","Float","Float","Float","String","String"), 
			Description=c("Mutation copy number","Change in MutCN between ancestral and derived state","Major copy number (ancestral)","Minor copy number (ancestral)","Major copy number (derived)","Minor copy number (derived)","Copy number frequency (relative to all cancer cells)", "MutCN probability","BB segment ids","Posterior prob: Early clonal","Posterior prob: Late clonal","Posterior prob: Subclonal", "Tail prob of mixture model", "Assignment probability of mutation to subclone","Mutation Time {clonal [early], clonal [late], clonal [NA], subclonal} - MAP assignments"),
			row.names=c("MutCN","MutDeltaCN","MajCN","MinCN","MajDerCN","MinDerCN","CNF","pMutCN","CNID","pGain","pSingle","pSub","pMutCNTail","pAllSubclones","CLS"))
}

mcnHeader <- function() {
	DataFrame(Number=c(1,1,1,1,1,1,1,1,".",1,1,1,1,"."),Type=c("Float","Float","Integer","Integer","Integer","Integer","Float","Float","Integer","Float","Float","Float","Float","String"), 
			Description=c("Mutation copy number","Change in MutCN between ancestral and derived state","Major copy number (ancestral)","Minor copy number (ancestral)","Major copy number (derived)","Minor copy number (derived)","Copy number frequency (relative to all cancer cells)", "MutCN probability","BB segment ids","Posterior prob: Early clonal","Posterior prob: Late clonal","Posterior prob: Subclonal", "Tail prob of mixture model", "Assignment probability of mutation to subclone"),
			row.names=c("MutCN","MutDeltaCN","MajCN","MinCN","MajDerCN","MinDerCN","CNF","pMutCN","CNID","pGain","pSingle","pSub","pMutCNTail","pAllSubclones"))
}

#' Add mutation timing information to vcf object
#' @param vcf The original `VCF`
#' @param V The `DataFrame` with timing information from mutationTime
#' @return The combined vcf with updated header.
#' 
#' @author Moritz Gerstung
#' @export
addMutTime <- function(vcf, V){
	i = info(header(vcf))
	info(header(vcf)) <- rbind(i, mtHeader())
	info(vcf) <- cbind(info(vcf), V)
	return(vcf)
}

#' Add sample metadata entries to the VCF header
#' @param vcf A VCF object
#' @param purity Sample purity (Default: NULL)
#' @param ploidy Sample ploidy (Default: NULL)
#' @param sex Donor sex (Default: NULL)
#' @param is_wgd Sample whole genome doubling status (TRUE if a whole genome doubling has occurred) (Default: NULL)
#' @param subclonal_architecture Sample subclonal architecture data.frame (Default: NULL)
#' @param cluster_power Sample vector with a power estimate per mutation cluster (Default: NULL)
#' @return The VCF object with requested entries added to the metadata
#' @author sd11
addMetadataToHeader = function(vcf, purity=NULL, ploidy=NULL, sex=NULL, is_wgd=NULL, subclonal_architecture=NULL, cluster_power=NULL) {
  # convenience function that takes care of adding a metadata entry
  .addmeta = function(vcf, value, label) {
    temp = DataFrame(Value=value)
    rownames(temp) = label
    if ("META" %in% names(meta(header(vcf)))) {
      meta(header(vcf))[["META"]] = rbind(meta(header(vcf))[["META"]], temp)
    } else {
      meta(header(vcf))[["META"]] = temp
    }
    return(vcf)
  }
  
  if (!is.null(purity)) vcf = .addmeta(vcf, purity, "purity")
  if (!is.null(ploidy)) vcf = .addmeta(vcf, ploidy, "ploidy")
  if (!is.null(sex)) vcf = .addmeta(vcf, sex, "sex")
  if (!is.null(is_wgd)) vcf = .addmeta(vcf, is_wgd, "isWgd")
  if (!is.null(cluster_power)) vcf = .addmeta(vcf, paste(cluster_power, collapse=","), "clusterPower")
  
  # convenience function that collapses a data.frame column into a single string
  if (!is.null(subclonal_architecture)) {
    .collapsecol = function(subclonal_architecture, colname) paste(c(colname, ":", paste(subclonal_architecture[,colname], collapse=",")), collapse="")
    encoded_clusters = paste(sapply(colnames(subclonal_architecture), function(x) { .collapsecol(subclonal_architecture, x) }), collapse=";")
    vcf = .addmeta(vcf, encoded_clusters, "subclonalArchitecture")
  }
  
  return(vcf)
}

#' Convenience function to fetch info from metadata
#' @param vcf VCF object to query
#' @param label Row label to fetch from the metadata
#' @return The requested entry as a String, or NULL if the entry was not found
#' @author sd11
getMetaEntry = function(vcf, label) {
  if (label %in% rownames(meta(header(vcf))[["META"]])) {
    return(meta(header(vcf))[["META"]][label,"Value"])
  } else {
    return(NULL)
  }
}

#' Get sample purity from VCF metadata
#' @param VCF object to query
#' @return The sample purity, or NULL if the metadata entry is not available
#' @author sd11
#' @export
getPurity = function(vcf) {
  value = getMetaEntry(vcf, "purity")
  if (!is.null(value)) value = as.numeric(value)
  return(value)
}

#' Get sample ploidy from VCF metadata
#' @param VCF object to query
#' @return The sample ploidy, or NULL if the metadata entry is not available
#' @author sd11
#' @export
getPloidy = function(vcf) {
  value = getMetaEntry(vcf, "ploidy")
  if (!is.null(value)) value = as.numeric(value)
  return(value)
}

#' Get donor sex from VCF metadata
#' @param VCF object to query
#' @return The donor sex or, or NULL if the metadata entry is not available
#' @author sd11
#' @export
getSex = function(vcf) {
  value = getMetaEntry(vcf, "sex")
  if (!is.null(value)) value = as.character(value)
  return(value)
}

#' Get sample whole genome doubling status from VCF metadata
#' @param VCF object to query
#' @return The sample whole genome doubling status (TRUE if a WGD occurred, FALSE otherwise), or NULL if the metadata entry is not available
#' @author sd11
#' @export
getWGDstatus = function(vcf) {
  value = getMetaEntry(vcf, "isWgd")
  if (!is.null(value)) value = as.logical(value)
  return(value)
}

#' Get mutation cluster power from VCF metadata
#' @param VCF object to query
#' @return The mutation cluster power (vector with an entry per cluster), or NULL if the metadata entry is not available
#' @author sd11
#' @export
getClusterPower = function(vcf) {
  value = getMetaEntry(vcf, "clusterPower")
  if (!is.null(value)) value = as.numeric(unlist(strsplit(value, ",")))
  return(value)
}

#' Get sample subclonal architecture from VCF metadata
#' @param VCF object to query
#' @return A data.frame with the subclonal architecture of the sample, or NULL if the metadata entry is not available
#' @author sd11
#' @export
getSubclonalArchitecture = function(vcf) {
  value = getMetaEntry(vcf, "subclonalArchitecture")
  columns = lapply(unlist(strsplit(value, ";")), function(x) unlist(strsplit(x, ":")))
  columnnames = unlist(lapply(columns, function(x) x[1]))
  columnvalues = lapply(columns, function(x) unlist(strsplit(x[2], ",")))
  columnvalues = as.data.frame(do.call(cbind, columnvalues), stringsAsFactors=F)
  colnames(columnvalues) = columnnames
  
  for (columnname in c("cluster", "proportion", "n_ssms", "n_snvs", "ccf", "cp")) {
    if (columnname %in% colnames(columnvalues)) {
      columnvalues[,columnname] = as.numeric(columnvalues[,columnname])
    }
  }
  return(columnvalues)
}

#' Read a VCF file
#' 
#' This function overloads VariantAnnotation::readVcf as the pAllSubclones info column requires additional encoding to fit within the VCF standard
#' @param file Path to a VCF file
#' @return A VCF object
#' @importFrom VariantAnnotation readVcf
#' 
#' @author sd11
#' @export
readVcf = function(file, ...) {
  vcf = VariantAnnotation::readVcf(file, ...)
  if ("pAllSubclones" %in% colnames(info(vcf))) {
    info(vcf)$pAllSubclones = lapply(info(vcf)$pAllSubclones, function(x) as.numeric(unlist(strsplit(x, ","))))
  }
  return(vcf)
}

#' Write a VCF file
#' 
#' This function overloads VariantAnnotation::writeVcf as the pAllSubclones info column requires additional encoding to fit within the VCF standard
#' @param vcf A VCF object
#' @importFrom VariantAnnotation writeVcf
#' 
#' @author sd11
#' @export
writeVcf = function(vcf, ...) {
  if ("pAllSubclones" %in% colnames(info(vcf))) {
    info(vcf)$pAllSubclones = unlist(lapply(info(vcf)$pAllSubclones, function(x) paste(x, collapse=",")))
  }
  VariantAnnotation::writeVcf(vcf, ...)
}

#' Classify Mutations
#' @param x 
#' @param reclassify 
#' @return factor()
#' 
#' @importFrom S4Vectors as.matrix
#' @author mg14
classifyMutations <- function(x, reclassify=c("missing","all","none")) {
	reclassify <- match.arg(reclassify)
	if(nrow(x) ==0 )
		return(factor(NULL, levels=c("clonal [early]", "clonal [late]", "clonal [NA]", "subclonal")))
	if(class(x)=="CollapsedVCF")
		x <- info(x)
	.clsfy <- function(x) {
		cls <- x$CLS
		if(reclassify %in% c("missing", "none") &! is.null(cls)){
			if(all(unique(cls) %in% c("early", "late", "clonal", "subclonal")))
				cls <- factor(cls, levels=c("early", "late", "clonal", "subclonal"), labels=c("clonal [early]", "clonal [late]", "clonal [NA]", "subclonal"))
			cls <- as.character(cls)
			cls[cls=="NA"] <- NA
			if(reclassify=="missing" & any(is.na(cls)))
				cls[is.na(cls)] <- paste(factor(apply(as.matrix(x[is.na(cls), c("pGain","pSingle","pSub")]), 1, function(x) if(all(is.na(x))) NA else which.max(x)), levels=1:3, labels=c("clonal [early]", "clonal [late]","subclonal"))) ## reclassify missing
		}else{
			cls <- paste(factor(apply(as.matrix(x[, c("pGain","pSingle","pSub")]), 1, function(x) if(all(is.na(x))) NA else which.max(x)), levels=1:3, labels=c("clonal [early]", "clonal [late]","subclonal"))) ## reclassify missing
			
		}
		cls[x$pGain==0 & cls!="subclonal"] <- "clonal [NA]"
		if(!is.null(x$MajCN))
			cls[cls!="subclonal" & (x$MajCN == 1 | x$MinCN == 1) & abs(x$MutCN - x$MutDeltaCN -1) <= 0.0001] <- "clonal [NA]"
		cls <- factor(cls, levels=c("clonal [early]", "clonal [late]", "clonal [NA]", "subclonal"))
	}
	cls <- .clsfy(x)
	return(cls)
}

posteriorMutCN <- function(x,n, cnStates, xmin=3, rho=0.01){
	whichStates <- 1:nrow(cnStates)
	L <- matrix(sapply(pmin(cnStates[whichStates,"f"],1), function(pp) dtrbetabinom(x,n,pp, rho=rho, xmin=pmin(x,xmin)) + .Machine$double.eps), ncol=length(whichStates))
	P.xsm <- L * rep(cnStates[whichStates,"pi.s"] * cnStates[whichStates,"P.m.sX"] / cnStates[whichStates,"power.s"] / cnStates[whichStates,"power.m.s"], each=nrow(L)) # P(X,s,m)
	P.sm.x <- P.xsm/rowSums(P.xsm) # P(s,m|Xi)
	return(P.sm.x)
}

loadBB <- function(file){
  tab <- read.table(file, header=TRUE, sep='\t')
  GRanges(tab$chromosome, IRanges(tab$start, tab$end), strand="*", tab[-3:-1])
}

pGainToTime <- function(vcf){
	P <- matrix(NA, nrow=nrow(vcf), ncol=4, dimnames=list(NULL, c("pEarly","pLate","pClonal[NA]","pSub")))
	P[,c("pEarly","pClonal[NA]","pSub")] <- as.matrix(info(vcf)[,c("pGain","pSingle","pSub")])
	biAllelicGain <- (info(vcf)$MajCN > 1 & (info(vcf)$MinCN > 1 | info(vcf)$MinCN == 0) & ! ((info(vcf)$MajCN == 1 | info(vcf)$MinCN == 1) & abs(info(vcf)$MutCN - info(vcf)$MutDeltaCN -1) <= 0.0001))
	w <- which(biAllelicGain)
	P[w, "pLate"] <- P[w, "pClonal[NA]"]
	P[w, "pClonal[NA]"] <- 0
	P[which(!biAllelicGain),"pLate"] <- 0
	return(P)
}

piToTime <- function(timing_param, type=c("Mono-allelic Gain","CN-LOH", "Bi-allelic Gain (WGD)")){
	type <- match.arg(type)
	evo <- NA
	w <- timing_param[,"s"]==1 &! timing_param[,"mixFlag"] ## Clonal states, not mixture (CN subclonal)
	M <- sum(w) ## Max multiplicity = Major copy number
	m <- sum(w & timing_param[,"n.m.s"]==2) ## Minor CN
	#evo <- paste0("1:1", paste0(2,":",m), if(M>2) paste0(3,":",) else "", collapse="->")
	t <- timing_param[(M:(M-1)),c("T.m.sX","T.m.sX.lo","T.m.sX.up"), drop=FALSE] ## Timing M and M-1
	if(nrow(t)==1) t <- rbind(t, NA)
	if(!any(is.na(t))){
		if(M==4) {
			if(timing_param[M-1,"P.m.sX.lo"] < 0.001){ ## No MCN 3
				if(type=="CN-LOH"){ ## Hotfix for 4+0, treated at 1:1 -> 2:0 -> 4:0
					t <- timing_param[c(M,M-2),c("T.m.sX","T.m.sX.lo","T.m.sX.up"), drop=FALSE]*c(1,0.5) ## Timing M and M-2
					evo <- "1:1->2:0->4:0"
				}
				else if(type=="Bi-allelic Gain (WGD)"){			
					if(m==2) {## Hotfix for 4+2 regions, treated at 1:1 -> 2:1 -> 4:2
						t <- timing_param[c(M,M-2),c("T.m.sX","T.m.sX.lo","T.m.sX.up"), drop=FALSE] ## Timing M and M-2
						t[2,] <- pmax(0,2*t[2,] - t[1,])/3
						evo <- "1:1->2:1->4:2"
					} else if(m==4) {## Hotfix for 4+4 regions, treated at 1:1 -> 2:2 -> 4:4
						t <- timing_param[c(M,M-2),c("T.m.sX","T.m.sX.lo","T.m.sX.up"), drop=FALSE]*c(1,0.5) ## Timing M and M-2
						evo <- "1:1->2:2->4:4"
					}
				}			
			} else if(type=="Bi-allelic Gain (WGD)"){ ## Can't uniquely time second event
				t[2,] <- NA
			} 
			if(m==3) {
				t[2,] <- NA ## Don'time secondary 4+3 for now, needs more work
			} 
		} else {
			if(M==3 & type=="Bi-allelic Gain (WGD)") {## Hotfix for 3+2 regions, treated at 1:1 -> 2:2 -> 3:2
				t[2,] <- pmax(0,2*t[2,] - t[1,])
				evo <- "1:1->2:2->3:2"
			}
		}
	}
	colnames(t) <- c("","lo","up")
	t[,2] <- pmin(t[,1],t[,2])
	t[,3] <- pmax(t[,1],t[,3])
	t <- pmin(apply(t,2,cumsum),1) ## Times are actually deltas
	if(M < 3) t[2,] <- NA
	t[is.infinite(t)] <- NA
	rownames(t) <- c("",".2nd")
	return(c(t[1,],`.2nd`=t[2,])) ## Linearise
}

mtToTime <- function(cn, timing_param = cn$timing_param, n.snv_mnv = cn$n.snv_mnv, pseudo.count=5){
	sub <- duplicated(cn) 
	covrg <- countQueryHits(findOverlaps(cn, cn)) 
	maj <- sapply(timing_param, function(x) if(length(x) > 0) x[1, "majCNanc"] else NA) #bb$major_cn
	min <- sapply(timing_param, function(x) if(length(x) > 0) x[1, "minCNanc"] else NA) #bb$minor_cn
	type <- sapply(seq_along(cn), function(i){
				if(maj[i] < 2 | is.na(maj[i]) | sub[i] | (maj[i] > 4 & min[i] >= 2)) return(NA)
				type <- if(min[i]==1){ "Mono-allelic Gain" 
						}else if(min[i]==0){"CN-LOH"}
						else "Bi-allelic Gain (WGD)"
				return(type)
			})
	time <- t(sapply(seq_along(cn), function(i){
						if(sub[i] | is.na(type[i])) return(rep(NA,6)) 
						else piToTime(timing_param[[i]],type[i])
					}))
	
	res <- data.frame(type=factor(type, levels=c("Mono-allelic Gain","CN-LOH","Bi-allelic Gain (WGD)")), time=time)
	colnames(res) <- c("type","time","time.lo","time.up","time.2nd","time.2nd.lo","time.2nd.up")
	
	# posthoc adjustment of CI's
	res$time.up <- (pseudo.count + n.snv_mnv * res$time.up)/(pseudo.count + n.snv_mnv)
	res$time.lo <- (0 + n.snv_mnv * res$time.lo)/(pseudo.count + n.snv_mnv)
	res$time.2nd.up <- (pseudo.count + n.snv_mnv * res$time.2nd.up)/(pseudo.count + n.snv_mnv)
	res$time.2nd.lo <- (0 + n.snv_mnv * res$time.2nd.lo)/(pseudo.count + n.snv_mnv)
	
	res$time.star <- factor((covrg == 1) + (min < 2 & maj <= 2 | min==2 & maj==2) * (covrg == 1), levels=0:2, labels = c("*","**","***")) ## ***: 2+0, 2+1, 2+2; **: 2<n+1, {3,4}+2; *: subclonal gains
	res$time.star[is.na(res$time)] <- NA
	return(res)
}

getPloidy <- function(bb) {
	c <- if(!is.null(bb$copy_number)) bb$copy_number else bb$total_cn
	sum(width(bb) * c * bb$clonal_frequency, na.rm=TRUE) / sum(width(bb) * bb$clonal_frequency, na.rm=TRUE)
}

#' @importFrom VGAM rbetabinom
simulateMutations <- function(cn, purity=max(cn$clonal_frequency, na.rm=TRUE),  n=40, rho=0.01, xmin=3){
	g <- (getPloidy(cn)*purity + 2*(1-purity))
	V <- list(VRanges())#VRanges()
	for(i in which(!duplicated(cn)))
		if(cn$n.snv_mnv[i]>1 & !is.null( cn$timing_param[[i]]))try({
						cnStates <- cn$timing_param[[i]]
						p <- cnStates[,"pi.s"]* if(!any(is.na(cnStates[,"P.m.sX"]))) cnStates[,"P.m.sX"] else cnStates[,"pi.m.s"]
						pwr <- cnStates[,"power.m.s"]#(cnStates[,"power.s"] * cnStates[,"power.m.s"])
						s <- sample(1:nrow(cnStates), size=pmax(1,ceiling(cn$n.snv_mnv[i] * (p %*% (1/pwr)))), prob=p, replace=TRUE)
						f <- cnStates[s,"f"]
						mu.c <- ((cn$major_cn[i] + cn$minor_cn[i])*purity + 2*(1-purity))/g * n
						c <- rnbinom(length(f), size=1/rho, mu=mu.c)
						x <- rbetabinom(n=length(f), size=c, prob=f, rho=rho)
						pos <- round(runif(length(f), min=start(cn)[i], max=end(cn)[i]))
						w <- which(x>=xmin)
						V[[i]] <- VRanges(seqnames=seqnames(cn)[i], IRanges(pos, width=1), ref="N", alt="A", totalDepth=c, altDepth=x)[w]
					})
	V <- do.call("c", V[!sapply(V, is.null)])
	sampleNames(V) <- "SAMPLE"
	v <- as(V, "VCF")
	info(header(v)) <- rbind(info(header(v)),DataFrame(Number=1, Type=rep("Integer",2),Description=c("Tumour ref count","Tumour alt count"), row.names=c("t_ref_count","t_alt_count")))
	info(v)$t_alt_count <- altDepth(V)
	info(v)$t_ref_count <- totalDepth(V) - altDepth(V)
	return(v)
}


.plotBB <- function(bb, ylim=c(0,max(max(bb$total_cn, na.rm=TRUE))), col=RColorBrewer::brewer.pal(4,"Set2"), type=c("lines","bars"), legend=TRUE, lty.grid=1, col.grid="grey", xaxt=TRUE, xlim=c(min(chrOffset[as.character(seqnames(bb))]+start(bb)),max(chrOffset[as.character(seqnames(bb))]+end(bb)))){
	type <- match.arg(type)
	s <- c(1:22, "X","Y")
	l <- as.numeric(width(refLengths[as.character(seqnames(refLengths)) %in% s]))
	names(l) <- s
	plot(NA,NA, ylab="Copy number",xlab="",xlim=xlim, ylim=ylim, xaxt="n")
	c <- cumsum(l)
	axis(side=1, at=c(0,c), labels=rep('', length(l)+1))
	if(xaxt) mtext(side=1, at= cumsum(l) - l/2, text=names(l), line=1)
	#abline(v=c, lty=3)
	if(type=="lines"){
		x0 <- start(bb) + cumsum(l)[as.character(seqnames(bb))] - l[as.character(seqnames(bb))]
		x1 <- end(bb) + cumsum(l)[as.character(seqnames(bb))] - l[as.character(seqnames(bb))]
		lwd <- 5* bb$clonal_frequency / max(bb$clonal_frequency)
		segments(x0=x0, bb$major_cn ,x1, bb$major_cn, col=col[1], lwd=lwd)
		segments(x0=x0, bb$minor_cn -.125,x1, bb$minor_cn-.125, col=col[2], lwd=lwd)
		segments(x0=x0, bb$total_cn+.125,x1, bb$total_cn+.125, col=1, lwd=lwd)
#	cv <- coverage(bb)
#	cv <- cv[s[s%in%names(cv)]]
#	par(xpd=NA)
#	for(n in names(cv)){
#		cc <- cv[[n]]
#		segments(start(cc) + cumsum(l)[n] - l[n] ,-runValue(cc)/2,end(cc)+ cumsum(l)[n] - l[n], -runValue(cc)/2, col=4)
#	}
	}else{
		ub <- unique(bb)
		f <- findOverlaps(ub,bb)
		m <- t(model.matrix( ~ 0 + factor(queryHits(f))))
		ub$major_cn <- m %*% mg14::na.zero(bb$major_cn * bb$clonal_frequency) / max(bb$clonal_frequency)
		ub$minor_cn <- m %*% mg14::na.zero(bb$minor_cn * bb$clonal_frequency) / max(bb$clonal_frequency)
		ub$total_cn <- ub$major_cn + ub$minor_cn
		ub$clonal_frequency <- max(bb$clonal_frequency)
		x0 <- start(ub) + cumsum(l)[as.character(seqnames(ub))] - l[as.character(seqnames(ub))]
		x1 <- end(ub) + cumsum(l)[as.character(seqnames(ub))] - l[as.character(seqnames(ub))]
		rect(x0,0,x1, ub$major_cn, col=col[2], lwd=NA)
		rect(x0,ub$major_cn,x1, ub$total_cn, col=col[1], lwd=NA)
		abline(h = 1:floor(ylim[2]), lty=lty.grid, col=col.grid)
	}
	abline(v = chrOffset[1:25], lty=lty.grid, col=col.grid)
	if(xaxt) mtext(side=1, line=1, at=chrOffset[1:24] + diff(chrOffset[1:25])/2, text=names(chrOffset[1:24]))
	if(legend){
		if(type=="lines") legend("topleft", c("Total CN","Major CN","Minor CN"), col=c("black", col[1:2]), lty=1, lwd=2, bg='white')
		else legend("topleft", c("Major CN","Minor CN"), fill=col[1:2], bg='white')
	}
}

.plotVcf <- function(vcf, bb, col = RColorBrewer::brewer.pal(9, "Set1")[c(3,4,2,1,9)], ID = meta(header(vcf))[[1]]["ID",1], legend=TRUE, lty.grid=1, col.grid="grey", xaxt=TRUE, pch=16, pch.out=pch, cex=0.66, xlim=c(0,chrOffset["MT"])) {
	cls <- factor(paste(as.character(info(vcf)$CLS)), levels = c("clonal [early]","clonal [late]","clonal [NA]","subclonal" , "NA"))
	plot(NA,NA, xlab='', ylab="VAF", ylim=c(0,1), xlim=xlim, xaxt="n", cex=cex)
	abline(v = chrOffset[1:25], lty=lty.grid, col=col.grid)
	if(xaxt) mtext(side=1, line=1, at=chrOffset[1:24] + diff(chrOffset[1:25])/2, text=names(chrOffset[1:24]))
	for(i in which(!sapply(bb$timing_param, is.null))) {
		s <- start(bb)[i]
		e <- end(bb)[i]
		x <- chrOffset[as.character(seqnames(bb)[i])]
		y <- bb$timing_param[[i]][,"f"]
		l <- bb$timing_param[[i]][,"pi.s"] * bb$timing_param[[i]][,"P.m.sX"]
		l[is.na(l)] <- 0
		if(any(is.na(c(s,e,x,y,l)))) next
		segments(s+x,y,e+x,y, lwd=l*4+.1)
		#text(x=(s+e)/2 +x, y=y, paste(signif(bb$timing_param[[i]][,"m"],2),signif(bb$timing_param[[i]][,"cfi"]/purityPloidy[meta(header(vv))["ID",1],"purity"],2), sep=":"), pos=3, cex=0.5)
	}
	points(start(vcf) + chrOffset[as.character(seqnames(vcf))], getAltCount(vcf)/getTumorDepth(vcf),col=col[cls],  pch=ifelse(info(vcf)$pMutCNTail < 0.025 | info(vcf)$pMutCNTail > 0.975, pch.out , pch),  cex=cex)				
	if(legend) legend("topleft", pch=19, col=col, legend=paste(as.numeric(table(cls)), levels(cls)), bg='white')
}

.timeToBeta <- function(time){
	mu <- time[,1]
	#if(any(is.na(time))) return(c(NA,NA))
	mu <- pmax(1e-3, pmin(1 - 1e-3, mu))
	v <- (0.5 * (pmax(mu,time[,3])-pmin(mu,time[,2])))^2
	alpha <- mu * (mu * (1-mu) / v - 1)
	beta <- (1-mu) *  (mu * (1-mu) / v - 1)
	return(cbind(alpha, beta))
}

.plotTiming <- function(bb, time=mcols(bb), col=paste0(RColorBrewer::brewer.pal(5,"Set2")[c(3:5)],"88"), legend=TRUE, col.grid='grey', lty.grid=1, xlim=c(0,chrOffset["MT"]), plot=2){
	plot(NA,NA, xlab='', ylab="Time [mutations]", ylim=c(0,1), xlim=xlim, xaxt="n")
	if(any(!is.na(bb$time)))
		tryCatch({
					bb <- bb[!is.na(bb$time)]
					s <- start(bb)
					e <- end(bb)
					x <- chrOffset[as.character(seqnames(bb))]
					y <- time[,"time"]
					rect(s+x,time[,"time.lo"],e+x,time[,"time.up"], border=NA, col=col[time[,"type"]], angle = ifelse(bb$time.star=="*" | is.na(bb$time.star),45,135), density=ifelse(bb$time.star == "***", -1, 72))
					segments(s+x,y,e+x,y)
					
					if("time.2nd" %in% colnames(time)){ 
						w <- !is.na(time[,"time.2nd"])
						if(sum(w) != 0 & plot==2){
							s <- start(bb)[w]
							e <- end(bb)[w]
							x <- chrOffset[as.character(seqnames(bb))][w]
							y <- time[w,"time.2nd"]
							rect(s+x,time[w,"time.2nd.lo"],e+x,time[w,"time.2nd.up"], border=NA, col=sub("88$","44",col)[as.numeric(time[w,"type"])], angle = ifelse(bb$time.star[w]=="*" | is.na(bb$time.star[w]),45,135), density=ifelse(bb$time.star[w] == "***", -1, 72))
							segments(s+x,y,e+x,y)
						}
					}
				}, error=function(x) warning(x))
	abline(v = chrOffset[1:25], lty=lty.grid, col=col.grid)
	s <- c(1:22, "X","Y")
	l <- as.numeric(width(refLengths[as.character(seqnames(refLengths)) %in% s]))
	names(l) <- s
	c <- cumsum(l)
	axis(side=1, at=c(0,c), labels=rep('', length(l)+1))
	mtext(side=1, line=1, at=chrOffset[1:24] + diff(chrOffset[1:25])/2, text=names(chrOffset[1:24]))
	if(legend) legend("topleft", levels(time[,"type"]), fill=col, bg="white")
}

.betaFromCi <- function(x, weight.mode=5){
	if(any(is.na(x))) return(rep(NA,2))
	f <- function(par,x) {
		beta <- exp(par)
		sum((qbeta(c(0.025,0.975), beta[1], beta[2])-x[-1])^2)+weight.mode*((beta[1]-1)/(beta[1]+beta[2]-2)-x[1])^2
	}
	tryCatch(exp(optim(c(0.1,0.1), fn=f,x=x)$par), error=c(1,1))
}

.histBeta <- function(bb, time="time",n.min=10, s=seq(0.005,0.995,0.01)){
	s <- pmax(0.001,pmin(0.999, s))
	cols <- paste0(time,c("",".lo",".up"))
	w <- which(bb$n.snv_mnv > n.min & !is.na(mcols(bb)[cols[1]]) & !duplicated(bb))
	if(length(w)==0) return(rep(NA, length(s)))
	d <- apply(as.matrix(mcols(bb)[w,c(cols, "n.snv_mnv")]), 1, function(x){
				beta <- .betaFromCi(x[1:3])
				beta <- (beta * x[4] + 5*c(1,1))/(x[4]+5) # "posterior" with prior B(1,1)
				dbeta(s,beta[1],beta[2])
			})
	wd <- as.numeric(width(bb)[w])
	o <- d %*% wd
}



#' Plot timing
#' @param vcf The `VCF` file to plot, with timing information added 
#' @param cn The copy number `GRanges` object
#' @param sv Optional structural variants information.
#' @param title The titel of the plot. Default = "".
#' @param regions The region to show as a `GRanges` object. Default is to show all chromosomes.
#' @param ylim.cn The y-limits for the copy number stacked bar plot.
#' @param layout.height The relative proportions of the layout.
#' @param y.sv Where to plot arches for SVs.
#' @return NULL
#' 
#' @author mg14
#' @export
plotSample <- function(vcf, cn, sv=NULL, title="", regions=NULL, ylim.cn=c(0,5), layout.height=c(4,1.2,3.5), y.sv=ylim.cn[2]-1) {
	if(is.null(regions)) regions <- refLengths[1:24]
	p <- par()
	layout(matrix(1:3, ncol=1), height=layout.height)
	par(mar=c(0.5,3,0.5,0.5), mgp=c(2,0.25,0), bty="L", las=2, tcl=-0.25, cex=1)
	xlim=c(min(chrOffset[as.character(seqnames(regions))]+start(regions)),max(chrOffset[as.character(seqnames(regions))]+end(regions)))
	bbb <- cn[cn %over% regions]
	.plotVcf(vcf[vcf %over% regions], bbb, legend=FALSE, col.grid='white',  xaxt=FALSE, cex=0.33, xlim=xlim)
	mtext(line=-1, side=3, title, las=1)
	.plotBB(bbb, ylim=ylim.cn, legend=FALSE, type='bar', col.grid='white', col=c("lightgrey", "darkgrey"), xaxt=FALSE, xlim=xlim)
	tryCatch({
				par(xpd=NA)
				if(!is.null(sv))
					.plotSv(sv, y1=y.sv, regions=regions, add=TRUE)
				par(xpd=FALSE)
			}, error=function(x) warning(x))
	par(mar=c(3,3,0.5,0.5))
	.plotTiming(bbb, xlim=xlim, legend=FALSE, col.grid=NA)
	if(length(regions) == 1)
		axis(side=1, at=pretty(c(start(regions), end(regions)))+chrOffset[as.character(seqnames(regions))], labels=sitools::f2si(pretty(c(start(regions), end(regions)))))
	if(any(!is.na(cn$time))){
		y0 <- seq(0.005,0.995,0.01)
		s <- .histBeta(cn)
		g <- colorRampPalette(RColorBrewer::brewer.pal(4,"Set1")[c(3,2,4)])(100)
		segments(x0=chrOffset["MT"] ,y0=y0,x1=chrOffset["MT"] + s/max(s) * 1e8, col=g, lend=3)
		getMode <- function(s){
			if(all(is.na(s))) return(NA)
			w <- which.max(s)
			if(w %in% c(1, length(s))){
				m <- which(c(0,diff(s))>0 & c(diff(s),0)<0)
				if(length(m)==0) return(w)
				m <- m[which.max(s[m])]
				return(if(s[w] > 2*s[m]) w else m) 
			} else return(w)
		}
		abline(h=y0[getMode(s)], lty=5)
		if("time.2nd" %in% colnames(mcols(cn))) if(any(!is.na(cn$time.2nd))){
				s2 <- .histBeta(cn, time="time.2nd")
				segments(x0=chrOffset["MT"] + s/max(s) * 1e8 ,y0=y0,x1=chrOffset["MT"] + s/max(s) * 1e8 + s2/max(s) * 1e8, col=paste0(g,"44"), lend=3)
				abline(h=y0[getMode(s2)], lty=3)
				
			}
	}
	#print(w)
	par(p[setdiff(names(p), c("cin","cra","csi","cxy","din","page"))])
}

.plotSv <- function(sv, y0=0,y1=y0, h=1, col=paste0(RColorBrewer::brewer.pal(5,"Set1"),"44"), regions=refLengths[1:24], add=FALSE){
	if(add==FALSE){
		s <- c(1:22, "X","Y")
		l <- as.numeric(width(refLengths[as.character(seqnames(refLengths)) %in% s]))
		names(l) <- s
		plot(NA,NA, ylab="Copy number",xlab="",xlim=xlim, ylim=ylim, xaxt="n")
		c <- cumsum(l)
		axis(side=1, at=c(0,c), labels=rep('', length(l)+1))
		if(length(regions) == 1)
			axis(side=1, at=pretty(c(start(regions), end(regions)))+chrOffset[as.character(seqnames(regions))], labels=sitools::f2si(pretty(c(start(regions), end(regions)))))
		if(xaxt) mtext(side=1, at= cumsum(l) - l/2, text=names(l), line=1)
	}
	#r <- rowRanges(sv)
	#a <- unlist(alt(sv))
	vs <- GRanges(info(sv)$MATECHROM, IRanges(info(sv)$MATEPOS, width=1))
	l <- 20
	x0 <- seq(0,1,l=l) 
	y2 <- x0*(1-x0)*4*h
	cls <- factor(as.character(info(sv)$SVCLASS), levels=c("DEL", "DUP", "h2hINV","t2tINV","TRA"))
	w <- which(sv %over% regions | vs %over% regions)
	for(i in w)
		try({
					x <- seq(start(sv)[i] + chrOffset[as.character(seqnames(sv)[i])], start(vs)[i] + chrOffset[as.character(seqnames(vs)[i])], length.out=l)
					x <- c(x[1], x, x[length(x)])
					y <- y1 + y2 * if(grepl("INV", cls[i])) -1 else 1
					y <- c(y0, y , y0)
					lines(x, y, col=col[cls[i]])
					#segments(x0=c(x[1], x[l]), x1=c(x[1],x[l]), y0=y0, y1=y1, col=col[cls[i]])
				})
}
gerstung-lab/MutationTimeR documentation built on Dec. 12, 2020, 9:29 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
gerstung-lab/MutationTimeR
Timing somatic point mutations and copy number gains

R/MutationTime.R
In gerstung-lab/MutationTimeR: Timing somatic point mutations and copy number gains

Documented in addMetadataToHeader addMutTime classifyMutations classWgd getAltCount getClusterPower getHomozygousity getMetaEntry getPloidy getPurity getSex getSubclonalArchitecture getTumorDepth getWGDstatus mutationTime plotSample readVcf writeVcf

R Package Documentation

Browse R Packages

We want your feedback!

gerstung-lab/MutationTimeR Timing somatic point mutations and copy number gains

R/MutationTime.R In gerstung-lab/MutationTimeR: Timing somatic point mutations and copy number gains

Documented in addMetadataToHeader addMutTime classifyMutations classWgd getAltCount getClusterPower getHomozygousity getMetaEntry getPloidy getPurity getSex getSubclonalArchitecture getTumorDepth getWGDstatus mutationTime plotSample readVcf writeVcf

R Package Documentation

Browse R Packages

We want your feedback!

gerstung-lab/MutationTimeR
Timing somatic point mutations and copy number gains

R/MutationTime.R
In gerstung-lab/MutationTimeR: Timing somatic point mutations and copy number gains