#' Calculate timing of SCNA
#'
#' This function estimates the timing of SCNAs for two scenarios.
#' For 2:0, 3:1, 3:0 and 4:1 events, where a unique identifiable history matrix is
#' available, it returns the time estimate and confidence interval for each stage.
#' For SCNAs with multiple or non-identifiable history matrices, it instead returns
#' the lower and upper bounds of the time period for the last stage of the
#' corresponding SCNA evolution.
#'
#' @param x vector of number of reads supporting SSNVs
#' @param m vector of total read depth for SSNVs
#' @param nt total copy number
#' @param nb copy number of the minor allele
#' @param history a list of possible evolutionary history matrices, see also function "cnmutHistory"
#' @param qmethod method for estimating the q vector, either "fullMLE" (recommended, more accurate) or "partialMLE"
#' @param type either "identifiable" or "butte"; if unset or inconsistent with the supplied history, it is corrected automatically
#' @param seqError sequencing error rate
#' @param bootstrapCI confidence interval method, either "bootstrap" (non-parametric) or "parametric"; set to NULL to skip confidence interval calculation
#' @param B the number of bootstrapping samples
#' @param CILevel confidence interval level
#' @param purity tumor sample purity
#' @param verbose logical. Turn on/off additional warnings.
#' @param returnAssignments logical. Whether to return the probabilistic assignments of SSNVs to allele states generated by the EM algorithm, as well as the SSNV read counts (total depth and depth of the mutant allele).
#' @param minMutations minimum number of SSNVs required for timing analysis
#' @param init the initial value of vector q passed to function .estimateQ
#' @param maxiter maximum number of iterations in EM algorithm when calculating the allele state distribution
#' @param tol the tolerance in the convergence of q
#' @param mutationId a vector of mutation IDs; defaults to 1:length(x)
#' @param ... additional arguments (currently unused)
#' @return A list containing the estimated stage proportions ("pi"), their confidence
#'   intervals ("piCI"), the estimated q vector, optimization details, a summary table,
#'   and success/failure information
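#'
#' @examples
#' \dontrun{
#' ## A minimal usage sketch with simulated read counts (hypothetical values,
#' ## for illustration only):
#' set.seed(1)
#' m <- rpois(50, 60)                      # total depth per SSNV
#' x <- rbinom(50, m, 0.4)                 # reads supporting each SSNV
#' res <- Butte(x = x, m = m, nt = 2, nb = 0, qmethod = "fullMLE",
#'              bootstrapCI = "bootstrap", B = 100, purity = 0.8)
#' res$pi    # stage proportions (or bounds in the non-identifiable case)
#' res$piCI  # bootstrap confidence intervals
#' }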
#' @export
Butte <- function(x, m, history, nt, nb, qmethod=c("fullMLE","partialMLE"),
type=c("identifiable","butte"), seqError=0, bootstrapCI=NULL,
B=500, CILevel=0.9, purity=1, verbose=TRUE,
returnAssignments=TRUE, minMutations=10, init=NULL,
maxiter=100, tol=0.0001, mutationId=1:length(x),...) {
qmethod <- match.arg(qmethod)
doCI <- !is.null(bootstrapCI)
type <- match.arg(type)
nMuts<-length(x)
#checking input
if(length(m)!=nMuts) stop("x and m must be of the same length")
if(length(purity) != 1 || purity > 1 || purity < 0)
stop("tumor purity must be a single value between 0 and 1.")
if(length(mutationId)!=length(unique(mutationId))) {
if(verbose) warning("mutationId values are not unique; using index as id instead.")
mutationId <- seq_len(nMuts)
}
if(length(mutationId)!=nMuts) {
if(verbose) warning("mutationId has an invalid length; using index as id instead.")
mutationId <- seq_len(nMuts)
}
returnData <- returnAssignments
if (missing(history)){ #history is not defined
history = cnmutHistory(nt = nt, nb = nb)
}
#message(paste0("possible history matrices: ", length(history)))
# re-check if the type arg is appropriate
# group1: a single, unique and identifiable history matrix A
if (length(history) == 1 && nrow(history[[1]]) == ncol(history[[1]])){ #2:0 3:1 3:0 4:1
if (type == "butte"){ type = "identifiable" } #this is an identifiable case; correct the setting
# group2: the history is not identifiable, or there are multiple possible matrices A
} else {
if (type == "identifiable"){ type = "butte" } #this is a butte case; correct the setting
}
#message(type)
#input checking done, start working...
A <- history[[1]] #use the first history to calculate n steps
K <- ncol(A)-1 #number of steps
#get final allele frequencies
possAlleles <- vafEst(nt=nt, nb=nb, pu=purity)
#possAlleles <- errorAF(possAlleles,seqError=seqError)
summaryTable<-cbind(nMuts,length(m))
colnames(summaryTable)<-c("Total Mutations","Qualify for fitting")
summaryTable<-t(summaryTable)
colnames(summaryTable)<-"Number"
# call arguments for bootstrapping
call<-list(alleleSet=possAlleles,
history=history,
nt=nt,
nb=nb,
type=type,
qmethod=qmethod,
seqError=seqError,
purity=purity,
minMutations=minMutations,
init=init,
maxiter=maxiter,
tol=tol)
if(returnData) inputData=data.frame(mutationId=mutationId,x=x,m=m)
failReason <- NULL # a string describing the reason for failing the calculation, if any
success <- TRUE # logical TRUE or FALSE
if (nMuts < minMutations) { #enforce the documented minimum number of SSNVs
failReason <- paste(failReason,"not enough mutations for timing analysis",sep=",")
success <- FALSE
}
# formatting the output list
dummyOutput<-list("pi"=rep(NA,length=ncol(A)),alleleSet=possAlleles,
optimDetails=list(nIter=NA,initial=NA,pathOfQ=NA),
perLocationProb=NA,q=NA)
names(dummyOutput$pi) <- paste("Stage",0:K,sep="")
# formatting the confidence interval
ci <- matrix(rep(NA,length=2*length(dummyOutput$pi)),ncol=2)
row.names(ci) <- names(dummyOutput$pi)
dummyOutput$piCI <- ci
if ( !is.null(failReason) ) {
output <- dummyOutput #failed assign the dummy output
} else {
#estimate of time proportion
piEst <- rep(NA,ncol(A))
# optimizing q vector
if ( qmethod %in% c("fullMLE") ) {
### Estimate the q vector; the output has the same elements as dummyOutput
output<-try(.estimateQ(x, m, possAlleles, history=history[[1]], type=type,
init=init, maxiter=maxiter, tol=tol))
if( !inherits(output, "try-error") && output$optimDetails$nIter == maxiter ) {
failReason<-paste(failReason,"hit maximum number of iterations",sep=",")
success<-FALSE
}
}
if( qmethod %in% c("partialMLE") ) { #partialMLE method for estimating q
mleAlleles <- mleAF(x, m, nt=nt, nb=nb, seqError=seqError, purity = purity)
output <- try(.estimateQ(x,m, possAlleles, alleleFreq = mleAlleles$alleleSet$Freq,
history=history[[1]], type=type, init=init, maxiter=maxiter, tol=tol))
}
if(!inherits(output, "try-error")) {
# identifiable: directly calculate timing
if ( type == "identifiable" ) {
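# For a unique identifiable history A, q is proportional to A %*% pi,
# so pi is recovered by solving the linear system and renormalizing to sum to one.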
piEst <- solve(A) %*% output$q
piEst <- as.vector(piEst)
piEst <- piEst/sum(piEst)
names(piEst) <- paste("Stage", 0:K, sep="")
output$pi <- piEst
}
# non-unique: estimating the bounds instead
else if ( type == "butte" ) {
# (optional) force very small q components to zero when supported by too few mutations; currently disabled
q2 = output$q
#numq = round(q2 * nMuts) #testing force zero
#q2[which(q2 < 0.01 & numq < 5)] = 0 #testing force zero
#q2 = q2/sum(q2) #testing force zero
buttebounds = try(.lpbounds(q = q2, possible_histories = history))
pKEst <- buttebounds
pKEst[!is.finite(pKEst)] <- NA
names(pKEst) <- c("pKlower", "pKupper")
buttebounds.p0 = try(.lpbounds(q = q2, possible_histories = history, p0=TRUE)) #get initiation time bounds as well
p0Est <- buttebounds.p0
p0Est[!is.finite(p0Est)] <- NA
names(p0Est) <- c("p0lower", "p0upper") #swap p0 to facilitate output conversion
output$pi <- c(p0Est[2:1], pKEst)
}
} else { #if ERROR occurred when estimating q
output<-dummyOutput
failReason<-paste(failReason,"error returned in estimating q",sep=",")
success<-FALSE
}
if(!returnAssignments) output<-output[-grep("perLocationProb",names(output))]
else{ output[["perLocationProb"]]<-data.frame(inputData,output[["perLocationProb"]],
check.names=FALSE)}
#Calculate Confidence Interval
if ( doCI ) {
if ( type == "identifiable" & !any(is.na(output$pi)) ) { #unique history
#get pi of bootstrapping samples
bootsample <- bootstrapButte(call=call, B=B, x=x, m=m,
type=bootstrapCI, pi=output$pi)
#determining confidence interval
ci <- t(apply(bootsample, 2, quantile, c((1-CILevel)/2,1-(1-CILevel)/2)))
}
else if ( type == "butte" ) { #non-unique history, get CI for bounds instead
bootsample <- bootstrapButte(call=call, B=B, x=x, m=m,
type=bootstrapCI, pi=output$pi)
ci <- t(apply(bootsample, 2, quantile, c((1-CILevel)/2,1-(1-CILevel)/2), na.rm=TRUE))
}
else {
ci<-matrix(rep(NA,length=2*length(output$pi)),ncol=2)
row.names(ci)<-names(output$pi)
}
#colnames(ci)<-c("lower","upper")
output$piCI <- ci
#selected elements for output
if(!returnAssignments)
output <- output[c("pi","piCI","q","optimDetails")]
else
output <- output[c("pi","piCI","q","perLocationProb","optimDetails")]
} else {
if(!returnAssignments)
output <- output[c("pi","q","optimDetails")]
else
output <- output[c("pi","q","perLocationProb","optimDetails")]
}
}
return(c(output,list(summaryTable=summaryTable, success=success,
failReason=if(!is.null(failReason)) gsub("^,","",failReason)
else failReason,call=call)))
}
#' Use linear programming to solve for the bounds of TK or T0
#'
#' This function estimates the bounds of TK, the last time interval in the tumor cell evolution
#' history. Given the different possible histories for a copy number ratio, it uses linear
#' programming to minimize and maximize TK. The objective function of the optimization is
#' f(t) = tK, which can be written as [0 0 0 ... 0 1]' * [t1 t2 ... tK]. The first part of the
#' constraints is given by (A - q s')t = 0, where s' is the transpose of the vector s = colSums(A).
#' The second part of the constraints is the convexity of the time vector t; each element of t is a
#' relative fraction of time. These two constraints are combined into a single linear system. The
#' first constraint follows directly from At/c = q, where c is the normalizing constant given by
#' the product s't.
#'
#' The feasibility of the solution region of the linear programming problem is then determined only
#' by q, which was previously estimated from the data. The uncertainty in q can make the solution
#' space infeasible, so slack variables are added to elasticize the linear programming problem.
#' For details, see: http://web.mit.edu/lpsolve/doc/Infeasible.htm
#' This elasticizing method finds approximate bounds of TK close to the constraints.
#' The "scost" argument adjusts the penalty on the additional slack variables.
#'
#' @param q the q vector estimated from the data
#' @param possible_histories a list of possible SCNA-SSNV history matrices, see also function "cnmutHistory"
#' @param scost the penalty cost for the slack variables (default 100)
#' @param p0 logical; if TRUE, the bounds for T0 (the initiation time) are estimated instead of TK
#' @return the lower and upper bounds of the time duration for the last stage (or for the initiation stage if p0 = TRUE)
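#'
#' @examples
#' \dontrun{
#' ## Illustrative sketch only: a made-up 3x3 history matrix and q vector,
#' ## not actual output of cnmutHistory().
#' A <- rbind(c(1, 0, 0),
#'            c(1, 1, 0),
#'            c(2, 2, 1))
#' q <- c(0.5, 0.2, 0.3)
#' .lpbounds(q = q, possible_histories = list(A))
#' }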
#' @importFrom lpSolve lp
#' @export
.lpbounds <- function(q, possible_histories, scost=100, p0 = FALSE) { #modified from Yunong's code with slack variables
lbs = vector()
ubs = vector()
nrx = vector() #number of slack variable with non-zero coefficients
tcc = vector() #total cost coefficients
q[which(q < 1e-5)] = 0 #when some q components are extremely small, set them to 0
q = q/sum(q)
for(i in seq(possible_histories)) {
A = possible_histories[[i]]
#from A we can get the s vector: the number of copies at each time interval that survive to the end
s = colSums(A)
#as described above, we form the matrix M = A - q*s', where s' is the transpose of s, so that M %*% t = 0
M = A-q%*%t(s)
b = numeric(length(q))
#we also add the convexity constraint on the time vector t (its elements sum to 1)
M = rbind(M, rep(1,ncol(A)))
b = c(b,1)
#append slack-variable columns to M: one +/- pair per constraint row, to elasticize the problem
relaxCols = diag(dim(M)[1])
relaxCols = relaxCols[,rep(1:ncol(relaxCols), each=2)]
relaxCols[,seq(2,dim(relaxCols)[2], by=2)] = relaxCols[,seq(2,dim(relaxCols)[2], by=2)]*-1
M = cbind(M, relaxCols)
#the objective function we want to optimize is
# f(t) = [0, 0, ..., 0, 1] * t, which equals the last time interval t_K
#define objective function (represent coefficients in a vector)
f.obj <- c(rep(0,ncol(A)-1),1) #calculate lower bounds for arrival time
if (p0) #calculate lower bounds for initiation time
f.obj <- c(1, rep(0,ncol(A)-1))
f.obj <- c(f.obj, rep(scost, dim(M)[1]*2)) #relax the model
#define constraints
f.con <- M
f.dir <- rep("==",nrow(M))
f.rhs <- b
#message(paste0(i,":\n"))
lowbound = lpSolve::lp("min", f.obj, f.con, f.dir, f.rhs)
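#subtract the slack-variable penalty from the objective value so the bound reflects only the time variable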
lb = lowbound$objval - sum(tail(lowbound$solution,dim(M)[1]*2) * scost)
if (abs(lb) < 1e-6) lb = 0
n_relax = length(which(tail(lowbound$solution,dim(M)[1]*2) > 0))
t_costc = sum(tail(lowbound$solution,dim(M)[1]*2))
f.obj <- c(rep(0,ncol(A)-1),1) #calculate upper bounds for arrival time
if (p0) #calculate upper bounds for initiation time
f.obj <- c(1, rep(0,ncol(A)-1))
f.obj <- c(f.obj, rep(-scost, dim(M)[1]*2)) #relax the model
uppbound = lpSolve::lp("max", f.obj, f.con, f.dir, f.rhs)
ub = uppbound$objval - sum(tail(uppbound$solution,dim(M)[1]*2) * scost * -1)
if (abs(ub) < 1e-6) ub = 0
n_relax = n_relax + length(which(tail(uppbound$solution,dim(M)[1]*2) > 0))
t_costc = t_costc + sum(tail(uppbound$solution,dim(M)[1]*2))
if (sum(lowbound$solution[1:dim(A)[2]]) > 0) {
lbs = append(lbs, lb)
names(lbs)[length(lbs)] = i
}
if (sum(uppbound$solution[1:dim(A)[2]]) > 0) {
ubs = append(ubs, ub)
names(ubs)[length(ubs)] = i
nrx = append(nrx, n_relax)
names(nrx)[length(nrx)] = i
tcc = append(tcc, t_costc)
names(tcc)[length(tcc)] = i
}
}
return(c(min(lbs[which(tcc == min(tcc))]), max(ubs[which(tcc == min(tcc))])))
}