R/treedater0.R


#~ Treedater: fast relaxed molecular clock dating 
#~     Copyright (C) 2018  Erik Volz
#~     This program is free software: you can redistribute it and/or modify
#~     it under the terms of the GNU General Public License as published by
#~     the Free Software Foundation, either version 3 of the License, or
#~     (at your option) any later version.


#' Compute a vector of numeric sample times from labels in a sequence alignment or phylogeny
#' 
#' @param tips A character vector supplying the name of each sample 
#' @param dateFormat The format of the sample date (see ?as.Date and ?strptime for format codes)
#' @param delimiter Character(s) which separate data in each label
#' @param index Integer position of the date string in each label with respect to *delimiter*
#' @param regex A regular expression for finding the date substring. Should not be used with *delimiter* or *index*
#' @return Numeric vector with sample time in decimal format. 
#' @examples 
#' ## A couple of labels for Ebola virus sequences: 
#' sampleYearsFromLabels( c('EBOV|AA000000|EM104|SierraLeone_EM|2014-06-02'
#'                        , 'EBOV|AA000000|G3713|SierraLeone_G|2014-06-09')
#'	, delimiter='|' )
#' ## Equivalently: 
#' sampleYearsFromLabels( c('EBOV|AA000000|EM104|SierraLeone_EM|2014-06-02'
#'                        , 'EBOV|AA000000|G3713|SierraLeone_G|2014-06-09')
#'  , regex='[0-9]+-[0-9]+-[0-9]+')
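#' ## Or, selecting the date by its field position (the 5th '|'-delimited field):
#' sampleYearsFromLabels( c('EBOV|AA000000|EM104|SierraLeone_EM|2014-06-02'
#'                        , 'EBOV|AA000000|G3713|SierraLeone_G|2014-06-09')
#'  , delimiter='|', index=5 )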
#'
#' @export 
sampleYearsFromLabels <- function(tips, dateFormat='%Y-%m-%d'
 , delimiter=NULL #'|'
 , index=NULL
 , regex=NULL
)
{
	success = requireNamespace('lubridate', quietly=TRUE)
	if (!success) stop(' The `sampleYearsFromLabels` function requires the `lubridate` package. Please install `lubridate`. Stopping.')
	
	#units <- match.arg(units)
	if (!is.null(delimiter) & !is.null(regex))
	 stop('At most one of the options *delimiter* or *regex* should be supplied for determining times of sampled lineages. The *index* parameter may be supplied with delimiter, or if omitted, will assume that the last field in the tip label corresponds to date.')
	
	if(!is.null(delimiter)){
	  if(is.null(index)){
		tipdates <- sapply( strsplit( tips, delimiter, fixed=TRUE) , function(x) tail(x,1) )
	  } else {
	    tipdates <- sapply( strsplit( tips, delimiter, fixed=TRUE) , function(x) x[index] )
	  }
	} else if( !is.null(regex)){
		tipdates <- regmatches( tips, regexpr( regex, tips ))
		if (length( tipdates)!= length(tips)) 
		  stop('Error with regex: number of matches does not equal number of tips.')
	}
	
	tipdates <- as.Date( tipdates, format=dateFormat )
	sts <- lubridate::decimal_date( tipdates )
	setNames( sts, tips )
}

.make.tree.data <- function( tre, sampleTimes, s, cc)
{
	n <- length( tre$tip.label)
	
	tipEdges <- which( tre$edge[,2] <= n)
	i_tip_edge2label <- tre$tip.label[ tre$edge[tipEdges,2] ]
	sts <- sampleTimes[i_tip_edge2label]
	
	# daughters, parent
	daughters <- matrix( NA, nrow = n + n-1, ncol = 2)
	parent <- rep(NA, n + n - 1)
	for (k in 1:nrow(daughters)){
		x <- tre$edge[which(tre$edge[,1] == k),2]
		if (length(x) > 0){
			daughters[k, ] <- x
			for (u in x){
				if (!is.na(u)) parent[u] <- k
			}
		}
	}
	
	B <- tre$edge.length
	A <- matrix(0, nrow = length(B), ncol = n-1)
	
	#B[tipEdges] <- B[tipEdges] -  unname( omega * sts )
	A[ cbind(1:nrow(tre$edge), tre$edge[,1]-n) ] <- -1
	internalEdges <- setdiff( 1:length(B), tipEdges )
	A[ cbind(internalEdges, tre$edge[internalEdges,2] - n) ] <- 1
	
	# temporal constraints (optional): Ain %*% Ti <= bin ensures ancestors precede descendants
	Ain <-  matrix(0, nrow = length(B), ncol = n-1)
	Ain[ cbind(1:nrow(tre$edge), tre$edge[,1]-n) ] <- 1 # parent coefficient
	Ain[ cbind(internalEdges, tre$edge[internalEdges,2] - n) ] <- -1 # daughter coefficient
	bin <- rep(0, length(B))
	bin[tipEdges] <- sts # terminal edges bounded above by the sample time
	
	W <- abs( 1/( (tre$edge.length + cc / s)/s) ) 
	
	postorder_internal_nodes <- unique( tre$edge[ ape::postorder( tre ),1] )
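	# The matrices assembled above define a linear model in the unknown internal node
	# times Ti: on each edge, (omega * A0) %*% Ti approximates B0 (with omega * sampleTime
	# subtracted on terminal edges). This system is solved by weighted least squares in
	# .optim.Ti0, or subject to the temporal constraints Ain %*% Ti <= bin in
	# .optim.Ti5.constrained.limsolve / .optim.Ti2.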
	
	list( A0 = A, B0 = B, W0 = W, n = n, tipEdges=tipEdges
	 , i_tip_edge2label = i_tip_edge2label
	 , sts2 = sts  # in order of tipEdges
	 , sts1 = sampleTimes[ tre$tip.label] # in order of tip.label
	 , sts = sampleTimes
	 , s = s, cc = cc, tre = tre
	 , daughters = daughters
	 , parent = parent
	 , Ain = Ain
	 , bin = bin
	 , postorder_internal_nodes = postorder_internal_nodes
	)
}


.optim.r.gammatheta.nbinom0 <- function(  Ti, r0, gammatheta0, td, lnd.mean.rate.prior)
{	
	blen <- .Ti2blen( Ti, td )
	
	#NOTE relative to wikipedia page on NB:
	#size = r
	#1-prob = p
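	# Under this parameterisation the substitution count on an edge of duration blen is
	# negative binomial with size r and mean r * gammatheta * blen, so the implied mean
	# substitution rate per site and unit time is r * gammatheta / s (see mr below).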
	of <- function(x)
	{
		r <- max(1e-3, exp( x['lnr'] ) )
		gammatheta <- exp( x['lngammatheta'] )
		if (is.infinite(r) | is.infinite(gammatheta)) return(Inf)
		ps <- pmin(1 - 1e-5, gammatheta*blen / ( 1+ gammatheta * blen ) )
		ov <- -sum( dnbinom( pmax(0, round(td$tre$edge.length*td$s))
		  , size= r, prob=1-ps,  log = TRUE) )
		mr <-  r * gammatheta / td$s 
		ov <- ov - unname(lnd.mean.rate.prior( mr ))
		ov 
	}
	x0 <- c( lnr = unname(log(r0)), lngammatheta = unname( log(gammatheta0)))
	#o <- optim( par = x0, fn = of, method = 'BFGS' )
	if ( is.infinite( of( x0 )) ) stop( 'Can not optimize rate parameters from initial conditions. Try adjusting *meanRateLimits*.' )
	o <- optim( par = x0, fn = of)
	r <- unname( exp( o$par['lnr'] ))
	gammatheta <- unname( exp( o$par['lngammatheta'] ))
	list( r = r, gammatheta=gammatheta, ll = -o$value)
}

# additive variance model of X. Didelot
.optim.nbinom1 <- function(  Ti, mu0, sp0, td, lnd.mean.rate.prior)
{	
	blen <- .Ti2blen( Ti, td )
	
	#NOTE relative to wikipedia page on NB:
	#size = r
	#1-prob = p
	of <- function(x)
	{
		sp <- exp( x[ 'lnsp' ] )
		mu <- exp( x['lnmu'] )
		if (is.infinite(sp) | is.infinite(mu)) return(Inf)
		sizes <- mu * blen / sp 
		ov <- -sum( dnbinom( pmax(0, round(td$tre$edge.length*td$s))
		  , size = sizes
		  , prob = 1 - sp / ( 1+sp ),  log = TRUE) )
		ov <- ov - unname(lnd.mean.rate.prior( mu  / td$s))
		ov 
	}
	x0 <- c( lnmu = unname(log(mu0)), lnsp = unname( log(sp0)))
	#o <- optim( par = x0, fn = of, method = 'BFGS' )
	if ( is.infinite( of( x0 )) ) stop( 'Can not optimize rate parameters from initial conditions. Try adjusting *meanRateLimits*.' )
	o <- optim( par = x0, fn = of)
	mu <- unname( exp( o$par['lnmu'] ))
	sp <- unname( exp( o$par['lnsp'] ))
	list( mu = mu, sp = sp, ll = -o$value)
}

.optim.omega.poisson0 <- function(Ti, omega0, td, lnd.mean.rate.prior, meanRateLimits)
{	
	blen <- .Ti2blen( Ti, td )
	
	of <- function(omega)
	{
		-sum( dpois( pmax(0, round(td$tre$edge.length*td$s)), td$s * blen * omega ,  log = T) ) - unname(lnd.mean.rate.prior( omega ))
	}
	o <- optimise(  of, lower = max( omega0 / 10, meanRateLimits[1] )
	 , upper = min( omega0 * 10, meanRateLimits[2] ))
	list( omega = unname( o$minimum), ll = -unname(o$objective) )
}



.optim.Ti0 <- function( omegas, td , scale_var_by_rate = FALSE){
		A <- omegas * td$A0 
		B <- td$B0
		B[td$tipEdges] <- td$B0[td$tipEdges] -  unname( omegas[td$tipEdges] * td$sts2 )
		#solve( t(A) %*% A ) %*% t(A) %*% B
		if (scale_var_by_rate){
			rv <- ( coef( lm ( B ~ A -1 , weights = td$W/omegas) ) )
		} else{
			rv <- ( coef( lm ( B ~ A -1 , weights = td$W) ) ) 
		}
	if (any(is.na(rv))){
		warning('Numerical error when performing least squares optimisation. Values are approximate. Try adjusting the minimum branch length (`minblen`) and/or the initial rate omega0.')
		rv[is.na(rv)] <- max(rv, na.rm=T)
		rv <- .hack.times1(rv, td )
	}
	rv
}





# constrained least squares via quadratic programming (limSolve::lsei)
.optim.Ti5.constrained.limsolve <- function(omegas, td){
		A <- omegas * td$A0 
		B <- td$B0
		B[td$tipEdges] <- td$B0[td$tipEdges] -  unname( omegas[td$tipEdges] * td$sts2 )
	# lsei minimises || (A*w) %*% x - B*w ||^2 subject to G %*% x >= H,
	# i.e. the temporal constraints Ain %*% x <= bin
	w <- sqrt(td$W)
	unname( lsei( A = A * w
	 , B = B * w
	 , G = -td$Ain
	 , H = -td$bin
	 , type = 2
	)$X )
}


# <constrained ls>
.optim.Ti2 <- function( omegas, td ){
	success = requireNamespace('mgcv', quietly=TRUE)
	if (!success) stop('The *mgcv* option requires installation of the `mgcv` package. Stopping.') 
		A <- omegas * td$A0 
		B <- td$B0
		B[td$tipEdges] <- td$B0[td$tipEdges] -  unname( omegas[td$tipEdges] * td$sts2 )
		
		# initial feasible parameter values:
		p0 <- ( coef( lm ( B ~ A -1 , weights = td$W) ) )
		if (any(is.na(p0))){
			warning('Numerical error when performing least squares optimisation. Values are approximate. Try adjusting the minimum branch length (`minblen`) and/or the initial rate omega0.')
			p0[is.na(p0)] <- max(p0, na.rm=T)
		}
		p1 <- .hack.times1(p0, td )
		
		# design
		M <- list( 
			X  = A
			,p = p1
			,off = c()# rep(0, np)
			,S=list()
			,Ain=-td$Ain
			,bin=-td$bin
			,C=matrix(0,0,0)
			,sp= c()#rep(0,np)
			,y=B
			,w=td$W #/omegas # better performance on lsd tests w/o this  
		)
		o <- mgcv::pcls(M)
	o
}
#</ constrained ls>

.optim.omegas.gammaPoisson1 <- function( Ti, r, gammatheta, td )
{
	blen <- .Ti2blen( Ti, td )
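	# For each edge, lam_star is the (lower-bounded) maximiser of the joint
	# Poisson x Gamma log density, i.e. the conditional mode of the per-edge
	# Poisson intensity; the per-edge rate estimate is lam_star / blen / s.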
	o <- sapply( 1:nrow(td$tre$edge), function(k){
		sb <- (td$tre$edge.length[k]*td$s)
		lb <- qgamma(1e-6,  shape=r, scale = gammatheta*blen[k] )
		lam_star <- max(lb, gammatheta * blen[k] * (sb + r - 1) / (gammatheta * blen[k] + 1)  )
		ll <- 	dpois( max(0, round(sb)),lam_star, log=T )  + 
		 dgamma(lam_star, shape=r, scale = gammatheta*blen[k], log = T)
		c(lam_star / blen[k] / td$s, ll )
	}) 
	list( omegas = o[1,], ll = unname(sum( o[2,] )), lls  = unname(o[2,])  )
}


# using additive relaxed clock model 
.optim.omegas.gammaPoisson2 <- function( Ti, mu, sp , td )
{
	blen <- .Ti2blen( Ti, td )
	o <- sapply( 1:nrow( td$tre$edge ), function(k) {
		tau <- blen[k] 
		x <- (td$tre$edge.length[k]*td$s)
		#lb <- qgamma(1e-6,  shape= mu*tau/sp , scale = sp ) # gives values too close to zero
		lb <- mu / 100 # TODO: a more principled choice of this lower bound would be preferable
		lam_star <- max(lb
		 , (mu*tau - sp + x * sp)  /  (sp + 1 )  # this form is used
		# , (mu*tau - sp + x * sp)  /  (sp + tau ) # rejected alternative
		)
		ll <- dpois( max(0, round(x)),lam_star, log=T )  + 
		 dgamma(lam_star, shape=mu*tau/sp , scale = sp / tau , log = T) 
		c( lam_star / tau / td$s, ll )
	})
	list( omegas = o[1,] , ll = unname( sum( o[2,] )), lls = unname(o[2,] ) )
}

.optim.sampleTimes0 <- function( Ti, omegas, estimateSampleTimes, estimateSampleTimes_densities, td, iedge_tiplabel_est_samp_times )
{
	blen <- .Ti2blen( Ti, td )
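	# For each tip with an uncertain date, maximise the user-supplied log prior
	# density plus the Poisson log likelihood of the terminal edge, over the
	# interval [lower, upper] truncated below by the parent node time tu.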
	o <- sapply( iedge_tiplabel_est_samp_times, function(k) {
		u <- td$tre$edge[k,1]
		v <- td$tre$edge[k,2]
		V <- td$tre$tip.label[v]
		dst <- estimateSampleTimes_densities[[V]]
		tu <- Ti[u-td$n]
		of <- function(tv){
			.blen <- tv - tu 
			if ( .blen < 0 ) browser() 
			-dst(tv, V) -
			  dpois( max(0, round(td$tre$edge.length[k]*td$s))
			  , td$s * .blen * omegas[k] 
			  , log = T)
		}
		lb <- max( tu, estimateSampleTimes[V,'lower'] )
		ub <- max( tu, estimateSampleTimes[V, 'upper'] )
		if (ub == tu & lb == tu) return(tu ) #+ td$minblen
		o  <- optimise(  of, lower = lb, upper = ub )
		o$minimum
	})
	o
}

.Ti2blen <- function(Ti, td, enforce_minblen = TRUE ){
	Ti <- c( td$sts[td$tre$tip.label], Ti)
	elmat <- cbind( Ti[td$tre$edge[,1]], Ti[td$tre$edge[,2]])
	if ( enforce_minblen )
		return( pmax(td$minblen,-elmat[,1]  + elmat[,2] ) )
	else 
		return( -elmat[,1]  + elmat[,2] )
}


.hack.times1 <- function(Ti, td)
{
	#~ 	t <- c( td$sts2[td$tre$tip.label], Ti)
	t <- c( td$sts1[td$tre$tip.label], Ti)
	inodes <- (td$n+1):length(t)
	
	repeat
	{
		.t <- t
		.t[inodes] <- pmin(t[inodes], -td$minblen + pmin( t[ td$daughters[inodes,1] ], t[td$daughters[inodes,2] ] ) )
		if (identical( t, .t )) break
		t <- .t
	}
	t[inodes]
}


.fix.Ti <- function(Ti, td){
	.Ti <- Ti
	n <- 1 + length(Ti)
	sts_Ti <- c( td$sts1[td$tre$tip.label], Ti)
	for ( i in td$postorder_internal_nodes-n ){
		uv <- td$daughters[ i+n, ] 
		if ( !is.na(sts_Ti[uv[1]]))
			Ti[i] <- min( Ti[i], sts_Ti[uv[1]] - td$minblen  )
		if ( !is.na(sts_Ti[uv[2]]))
			Ti[i] <- min( Ti[i], sts_Ti[uv[2]] - td$minblen  )
		sts_Ti[ i + n ] <- Ti[i]
	}
	Ti
}


.dater <- function(tre, sts, s=1e3
 , omega0 = NA
 , minblen = NA
 , maxit=100
 , abstol = .0001
 , searchRoot = 5
 , quiet = TRUE
 , temporalConstraints = TRUE
 , clock = c('strict',  'uncorrelated', 'additive')
 , estimateSampleTimes = NULL
 , estimateSampleTimes_densities= list()
 , numStartConditions = 1
 , clsSolver=c('limSolve', 'mgcv')
 , meanRateLimits = NULL
 , ncpu = 1
 , parallel_foreach = FALSE
 , lnd.mean.rate.prior = function(x) 0
 , tiplabel_est_samp_times = NULL
)
{
	clsSolver <- match.arg( clsSolver, choices = c('limSolve', 'mgcv')) 
	clock <- match.arg( clock , choices = c('uncorrelated', 'additive', 'strict') ) 
	if ( clock == 'additive' )
		stop('ARC model not implemented')
	# defaults
	CV_LB <- 1e-6 # lsd tests indicate Gamma-Poisson model may be more accurate even in strict clock situation
	cc <- 10
	intree_rooted <- TRUE
	if (!is.rooted(tre)) stop('Tree not rooted.')
	
	EST_SAMP_TIMES <- !is.null( tiplabel_est_samp_times )
	iedge_tiplabel_est_samp_times <- match( tiplabel_est_samp_times, tre$tip.label[tre$edge[,2]] )
	
	if (is.na(omega0)){
		# guess
		#omega0 <- estimate.mu( tre, sts )
		# starting conditions based on root-to-tip regression
			g0 <- lm(ape::node.depth.edgelength(tre)[1:length(sts)] ~ sts, na.action = na.omit)
			omega0sd <- suppressWarnings(summary( g0 ))$coef[2,2]
			omega0 <- unname( coef(g0)[2] )
			if (omega0 < 0 ){
				warning('Root-to-tip regression predicts a substitution rate less than zero. The tree may be poorly rooted or there may be little temporal signal.')
				omega0 <- abs(omega0)/10
			}
			omega0s <- qnorm( unique(sort(c(.5, seq(.001, .999, l=numStartConditions*2) )))  , omega0, sd = omega0sd )
		# starting conditions based on distances to the earliest sample 
			D <- ape::cophenetic.phylo( tre ) [1:ape::Ntip(tre), 1:ape::Ntip(tre)]
			esi <- which.min( sts )[1]
			g1 <- lm( D[esi,] ~ sts )
			omega0sd.1 <- suppressWarnings(summary( g1 ))$coef[2,2]
			omega0.1 <- unname( coef(g1)[2] )
			omega0s.1 <- qnorm( unique(sort(c(.5, seq(.001, .999, l=numStartConditions*2) )))  , omega0.1, sd = omega0sd.1 )
		
		
		omega0s <- sort( unique( c( omega0s, omega0s.1 ) ))
		omega0s <- omega0s[ omega0s > 0 ]
		cat( paste( 'Initial guesses of substitution rate:', paste(collapse=',', omega0s), '\n')  )
	} else{
		omega0s <- c( omega0 )
	}
	if ( any ( omega0s < meanRateLimits[1] ) | any( omega0s > meanRateLimits[2])){
		warning('Initial guess of mean rate falls outside of user-specified limits.')
		omega0s <- omega0s[  omega0s >= meanRateLimits[1] & omega0s <= meanRateLimits[2] ]
	}
	if (length(omega0s)==0){
		warning( 'Setting initial guess of mean rate to be mid-point of *meanRateLimits*')
		omega0s <- (meanRateLimits[1] + meanRateLimits[2]) / 2
	}
	td <- .make.tree.data(tre, sts, s, cc )
	td$minblen <- minblen
	
	omega2ll <- -Inf 
	bestrv <- list()
	for ( omega0 in omega0s ){
		# initial gamma parameters with small variance (coefficient of variation = 1/sqrt(r0))
		r = r0 <- ifelse(clock=='strict', Inf, sqrt(10))
		gammatheta = gammatheta0 <- ifelse(clock=='strict', omega0, omega0 * td$s / r0)
		mu = omega0  * td$s
		sp = 1e-2
		
		done <- FALSE
		lastll <- -Inf
		iter <- 0
		nEdges <- nrow(tre$edge)
		omegas <- rep( omega0,  nEdges )
		edge_lls <- NA
		rv <- list()
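		# Block-coordinate ascent: alternate between (i) estimating internal node
		# times Ti given per-edge rates (weighted least squares, optionally with
		# temporal constraints), (ii) estimating clock parameters and per-edge
		# rates given Ti, and (iii) re-estimating uncertain sample times if
		# requested; iterate until the log likelihood converges or maxit is reached.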
		while(!done){
			if (temporalConstraints){
				if (clsSolver=='limSolve'){
					Ti <- tryCatch( .optim.Ti5.constrained.limsolve ( omegas, td ) 
					 , error = function(e) .optim.Ti2( omegas, td)  )
				} else{
					Ti <- .optim.Ti2( omegas, td)  
				}
			} else{
				Ti <- .optim.Ti0( omegas, td, scale_var_by_rate=FALSE )
			} 
			Ti <- .fix.Ti( Ti, td ) 
			if ( (1 / sqrt(r)) < CV_LB){
				# switch to poisson model
				o <- .optim.omega.poisson0(Ti
				  , mean(omegas)
				  , td, lnd.mean.rate.prior , meanRateLimits)
				gammatheta <- unname(o$omega)
				if (!is.infinite(r)) lastll <- -Inf # the first time it switches, do not do likelihood comparison 
				r <- Inf#unname(o$omega)
				ll <- o$ll
				edge_lls <- 0
				omegas <- rep( gammatheta, length(omegas))
			} else{
				if (clock == 'uncorrelated')
				{
					o <- .optim.r.gammatheta.nbinom0(  Ti, r, gammatheta, td, lnd.mean.rate.prior )
					r <- o$r
					ll <- o$ll
					gammatheta <- o$gammatheta
					oo <- .optim.omegas.gammaPoisson1( Ti, o$r, o$gammatheta, td )
				} else if (clock=='additive'){
					o <- .optim.nbinom1( Ti, mu, sp, td, lnd.mean.rate.prior )
					mu = o$mu 
					sp = o$sp 
					ll = o$ll 
					oo = .optim.omegas.gammaPoisson2( Ti, mu, sp , td )
				} else{
					stop('invalid value for *clock*')
				}
				edge_lls <- oo$lls
				omegas <- oo$omegas
			}
						
			if (EST_SAMP_TIMES)
			{
				o_sts <- .optim.sampleTimes0( Ti, omegas, estimateSampleTimes,estimateSampleTimes_densities, td, iedge_tiplabel_est_samp_times )
				sts[tiplabel_est_samp_times] <- o_sts
				td$sts[tiplabel_est_samp_times] <- o_sts
				td$sts1[tiplabel_est_samp_times] <- o_sts
				td$sts2[tiplabel_est_samp_times] <- o_sts
			}
			
			if (!quiet)
			{ 
				print( data.frame( iteration = iter, median_unadjusted_rate = median(omegas)
				 ,  coef_of_var =  sd(omegas) / mean(omegas) # 1 / sqrt(r) 
				 , tmrca = min(Ti), logLik=ll , row.names=iter) )
				cat( '---\n' )
			}
			
			if (clock !='strict'){
				blen <- .Ti2blen( Ti, td )
				omega <- sum( omegas * blen ) / sum(blen)
			} else{#strict
				omega <- omegas[1]
			}
			
			if ( clock=='strict'){
				meanRate = omegas[1]
			} else if (clock=='uncorrelated' ){
				meanRate <- ( r * gammatheta / td$s )
				# NB: size = r, 1 - prob = tau * phi / (1 + tau * phi)
				# Gamma: shape = r, scale = phi * tau 
			} else if( clock=='additive'){
				meanRate <- mu / td$s
			}

			
			if ( ll >= lastll ){
				rv <- list( omegas = omegas, r = unname(r), theta = unname(gammatheta), Ti = Ti
				 , adjusted.mean.rate = omega
				 , mean.rate = meanRate
				 , loglik = ll
				 , edge_lls = edge_lls )
			}
			
			# check convergence
			iter <- iter + 1
			if (iter > maxit) done <- TRUE
			
			if ( abs( ll - lastll ) < abstol) done <- TRUE
			if (ll < lastll) {
				done <- TRUE
			}
			
			lastll <- ll
		}
		if ( omega2ll < rv$loglik){
			bestrv <- rv
			omega2ll <- rv$loglik
		}
	}
	
	rv <- bestrv
	
	# one last round of sample time estimation because td$sts may not match bestrv 
	if (EST_SAMP_TIMES)
	{
		o_sts <- .optim.sampleTimes0( rv$Ti, rv$omegas, estimateSampleTimes,estimateSampleTimes_densities, td, iedge_tiplabel_est_samp_times )
		sts[tiplabel_est_samp_times] <- o_sts
		td$sts[tiplabel_est_samp_times] <- o_sts
		td$sts1[tiplabel_est_samp_times] <- o_sts
		td$sts2[tiplabel_est_samp_times] <- o_sts
	}

	blen <- .Ti2blen( rv$Ti, td , enforce_minblen=FALSE)
	rv$edge <- tre$edge
	rv$edge.length <- blen
	rv$tip.label <- tre$tip.label
	rv$Nnode <- tre$Nnode
	
	rv$timeOfMRCA <- min(rv$Ti)
	rv$timeToMRCA <- max(sts) - rv$timeOfMRCA
	rv$s <- s
	rv$sts <- sts
	rv$minblen <- minblen
	rv$intree <- tre #.tre
	rv$coef_of_variation <- sd(rv$omegas) / mean(rv$omegas) #ifelse( is.numeric(rv$r), 1 / sqrt(rv$r), NA )  #
	rv$clock <- clock
	rv$intree_rooted <- intree_rooted
	rv$is_intree_rooted <- intree_rooted
	rv$temporalConstraints <- temporalConstraints
	rv$estimateSampleTimes <- estimateSampleTimes
	rv$EST_SAMP_TIMES = EST_SAMP_TIMES
	if (!EST_SAMP_TIMES) rv$estimateSampleTimes <- NULL
	rv$estimateSampleTimes_densities <- estimateSampleTimes_densities
	rv$numStartConditions <- numStartConditions
	rv$lnd.mean.rate.prior <- lnd.mean.rate.prior
	rv$meanRateLimits <- meanRateLimits
	rv$relaxedClockModel = clock 
	rv$mu <- mu
	rv$sp <- sp 
	rv$omega0 = omega0
	rv$omega0s = omega0s
	
	
	# add pvals for each edge
	if (rv$clock=='uncorrelated'){
		rv$edge.p <- with(rv, {
			blen <- pmax(minblen, edge.length)
			ps <- pmax(1e-12, pmin(1 - 1e-12, theta * blen/(1 + theta * blen)) )
			pnbinom(pmax(0, round(intree$edge.length * s)), size = r, 
					prob = 1 - ps)
			})
	} else if ( rv$clock=='additive'){
			rv$edge.p <- with(rv, {
				blen <- pmax(minblen, edge.length)
				sizes = mu * blen / sp 
				pnbinom(pmax(0, round(intree$edge.length * s)), size = sizes, 
					prob = 1 - sp / (1+sp) )
			})
	} else if (rv$clock=='strict') {
		rv$edge.p <- with(rv, {
			blen <- pmax(minblen, edge.length)
			ppois(pmax(0, round(intree$edge.length * s)), blen * 
				meanRate * s)
		})
	}
	
	class(rv) <- c('treedater', 'phylo')
	rv
}

#' Estimate a time-scaled tree and fit a molecular clock
#'
#' @details 
#' Estimates the calendar time of nodes in the given phylogenetic
#' tree with branches in units of substitutions per site. The
#' calendar time of each sample must also be specified and the length
#' of the sequences used to estimate the tree. If the tree is not
#' rooted, this function will estimate the root position. 
#' For an introduction to all options and features, see the vignette on Influenza H3N2: vignette("h3n2")
#'
#' Multiple molecular clock models are supported including a strict clock and two variations on relaxed clocks. The 'uncorrelated' relaxed clock is the Gamma-Poisson mixture presented by Volz and Frost (2017), while the 'additive' variance model was developed by Didelot & Volz (2019). 
#' 
#' @section References:
#' E.M. Volz and Frost, S.D.W. (2017) Scalable relaxed clock phylogenetic dating. Virus Evolution.
#' X. Didelot and Volz, E.M. (2019) Additive uncorrelated relaxed clock models.
#' 
#' @param tre An ape::phylo which describes the phylogeny with branches in
#'        units of substitutions per site. This may be a rooted or
#'        unrooted tree. If unrooted, the root position will be
#'        estimated by checking multiple candidates chosen by
#'        root-to-tip regression.  If the tree has multifurcations,
#'        these will be resolved and a binary tree will be returned.
#' @param sts Vector of sample times for each tip in phylogenetic tree.
#'        Vector must be named with names corresponding to
#'        tre$tip.label.
#' @param s Sequence length (numeric). This should correspond to the sequence length used in the phylogenetic analysis, which will not necessarily be the same as the genome length. 
#' @param omega0 Vector providing initial guess or guesses of the mean substitution rate (substitutions
#'        per site per unit time). If not provided, will guess using
#'        root to tip regression.
#' @param minblen Minimum branch length in calendar time. By default, this will
#'        be the range of sample times (max - min) divided by sample
#'        size.
#' @param maxit Maximum number of iterations 
#' @param abstol Difference in log likelihood between successive iterations for convergence.
#' @param searchRoot Will search for the optimal root position using the top
#'        matches from root-to-tip regression.  If searchRoot=x, dates
#'        will be estimated for x trees, and the estimate with the
#'        highest likelihood will be returned.
#' @param quiet If TRUE, will suppress messages during execution 
#' @param temporalConstraints  If TRUE, will enforce the condition that an
#'        ancestor node in the phylogeny occurs before all progeny.
#'        Equivalently, this will preclude negative branch lengths.
#'        Note that execution is faster if this option is FALSE.
#' @param clock The choice of molecular clock model. Choices are 'uncorrelated', 'additive', or 'strict'. 
#' @param estimateSampleTimes If some sample times are not known with certainty,
#'         bounds can be provided with this option. This should take the
#'         form of a data frame with columns 'lower' and 'upper'
#'         providing the sample time bounds for each uncertain tip. Row
#'         names of the data frame should correspond to elements in
#'         tip.label of the input tree. Tips with sample time bounds in
#'         this data frame do not need to appear in the *sts* argument,
#'         however if they are included in *sts*, that value will be
#'         used as a starting condition for optimisation.
#' @param estimateSampleTimes_densities An optional named list of log densities
#'           which would be used as priors for unknown sample times. Names
#'           should correspond to elements in tip.label with uncertain
#'           sample times.
#' @param numStartConditions Will attempt optimisation from more than one starting point if >0
#' @param clsSolver Which package should be used for constrained least-squares? Options are "mgcv" or "limSolve"
#' @param meanRateLimits Optional constraints for the mean substitution rate 
#' @param ncpu Number of threads for parallel computing 
#' @param parallel_foreach If TRUE, will use the "foreach" package instead of the "parallel" package. This may work better on some HPC systems. 
#' 
#' @return A time-scaled tree and estimated molecular clock rate 
#' 
#' @author Erik M Volz <erik.volz@gmail.com>
#' 
#' @seealso 
#' ape::chronos
#' ape::estimate.mu
#'
#' @examples
#' ## simulate a random tree and sample times for demonstration
#' # make a random tree:
#' tre <- ape::rtree(50)
#' # sample times based on distance from root to tip:
#' sts <- setNames( ape::node.depth.edgelength( tre )[1:ape::Ntip(tre)], tre$tip.label)
#' # modify edge length to represent evolutionary distance with rate 1e-3:
#' tre$edge.length <- tre$edge.length * 1e-3
#' # treedater: 
#' td <- dater( tre, sts =sts , s = 1000, clock='strict', omega0=.0015)
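#'
#' ## (Illustrative) if some sample dates are uncertain, bounds can be given
#' ## via *estimateSampleTimes*; row names must correspond to tip labels:
#' est <- data.frame( lower = sts[1:2] - .5, upper = sts[1:2] + .5
#'  , row.names = names(sts)[1:2] )
#' td2 <- dater( tre, sts = sts, s = 1000, clock='strict', omega0=.0015
#'  , estimateSampleTimes = est )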
#'
#'
#' @export 
dater <- function(tre, sts, s=1e3
 , omega0 = NA
 , minblen = NA
 , maxit=100
 , abstol = .0001
 , searchRoot = 5
 , quiet = TRUE
 , temporalConstraints = TRUE
 , clock = c('strict' , 'uncorrelated', 'additive')
 , estimateSampleTimes = NULL
 , estimateSampleTimes_densities= list()
 , numStartConditions = 1
 , clsSolver=c('limSolve', 'mgcv')
 , meanRateLimits = NULL
 , ncpu = 1
 , parallel_foreach = FALSE
)
{ 
	clsSolver <- match.arg( clsSolver, choices = c('limSolve', 'mgcv'))
	clock <- match.arg( clock , choices = c('strict' , 'uncorrelated', 'additive') ) 
	# defaults
	if ( is.na( omega0 ) ){
		cat('Note: Initial guess of substitution rate not provided. Will attempt to guess starting conditions. Provide initial guesses of the rate using the *omega0* parameter. \n')
	}
	if (!is.binary( tre ) ){
		cat( 'Note: *dater* called with a non-binary tree. Will proceed after resolving polytomies.\n' )
		if ( !is.rooted( tre )){
			tre <- unroot( multi2di( tre ) ) 
		} else{
			tre <-  multi2di( tre ) 
		}
	}
	
	if (inherits(tre, 'treedater')){
		cat('Note: *dater* called with a treedater input tree. Will use the rooted tree with branch lengths in substitutions.\n')
		tre <- tre$intree
	}
	
	# optional limits or prior for mean rate
	lnd.mean.rate.prior <- function(x) 0 #dunif( x , 0, Inf, log=TRUE )
	if (is.null( meanRateLimits) ) {
		meanRateLimits <- c( 0, Inf)
	} else if ( is.function(meanRateLimits)){
		lnd.mean.rate.prior <- meanRateLimits
		meanRateLimits <- c(0, Inf)
	} else{
		MEANRATEERR <- '*meanRateLimits* should be a length 2 vector providing bounds on the mean rate parameter OR a function providing the log prior density of the mean rate parameter. '
		if (length( meanRateLimits) != 2) stop(MEANRATEERR)
		if (meanRateLimits[2] <= meanRateLimits[1]) stop(MEANRATEERR)
		#lnd.mean.rate.prior <- function(x) dunif( x , meanRateLimits[1], meanRateLimits[2], log= TRUE )
		lnd.mean.rate.prior <- function(x) ifelse( x >= meanRateLimits[1] & x <= meanRateLimits[2], 0, -Inf )
	}
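	# e.g. meanRateLimits = c(1e-3, 2e-3) would restrict the mean rate to that
	# interval, while meanRateLimits = function(x) dnorm(x, 1.5e-3, 5e-4, log=TRUE)
	# would supply a log prior density on the mean rate (illustrative values).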
	
	numStartConditions <- max(0, round( numStartConditions )) # number of omega0 to try for optimisation
	
	EST_SAMP_TIMES <- TRUE
	EST_SAMP_TIMES_ERR <- 'estimateSampleTimes must specify a data frame with tip.label as row names and with columns `upper` and `lower`. You may also provide a named list of log density functions (improper priors for sample times).\n'
	if (is.null(estimateSampleTimes)) EST_SAMP_TIMES <- FALSE
	tiplabel_est_samp_times <- NULL 
	if (EST_SAMP_TIMES){
		if (is.data.frame(estimateSampleTimes)){
			if ( !('lower' %in% colnames(estimateSampleTimes)) | !('upper' %in% colnames(estimateSampleTimes) ) ){
				stop(EST_SAMP_TIMES_ERR)
			}
			if ( any (estimateSampleTimes$lower > estimateSampleTimes$upper) ){
				stop(EST_SAMP_TIMES_ERR )
			}
			estimateSampleTimes <- estimateSampleTimes[ estimateSampleTimes$lower < estimateSampleTimes$upper ,]
			for (tl in rownames(estimateSampleTimes)){
				if (!(tl %in% names( estimateSampleTimes_densities))){
					estimateSampleTimes_densities[[tl]] <-  function(x,tl) dunif(x, min= estimateSampleTimes[tl,'lower'], max=estimateSampleTimes[tl,'upper'] , log = TRUE) #
				}
			}
		} else {
			stop(EST_SAMP_TIMES_ERR)
		}
		tiplabel_est_samp_times <- intersect( rownames(estimateSampleTimes), tre$tip.label)
		iedge_tiplabel_est_samp_times <- match( tiplabel_est_samp_times, tre$tip.label[tre$edge[,2]] )
	}
		
	# check for missing sample times, impute missing if needed 
	#if (any(is.na(sts))) stop( 'Some sample times are NA.' )
	stinfo_provided <- union( names(na.omit(sts)), rownames(estimateSampleTimes))
	stinfo_not_provided <-   setdiff( tre$tip.label, stinfo_provided ) 
	if (length( stinfo_not_provided ) > 0){
		cat( 'NOTE: Neither sample times nor sample time bounds were provided for the following lineages:\n')
		cat( stinfo_not_provided )
		cat('\n Provide sampling info or remove these lineages from the tree. Stopping.\n ') 
		stop('Missing sample time information.' )
	}
	initial_st_should_impute <- setdiff( rownames(estimateSampleTimes), names(na.omit(sts)))
	if (length( initial_st_should_impute ) > 0){
		cat('NOTE: an initial guess of the sample time was not provided for the following lineages:\n')
		cat ( initial_st_should_impute ) 
		cat('\n') 
		cat( 'Will proceed with midpoint of provided range as initial guess of these sample times.\n')
		sts[initial_st_should_impute] <- rowMeans( estimateSampleTimes )[initial_st_should_impute] 
	}
	
	if (is.null(names(sts))){
		if (length(sts)!=length(tre$tip.label)) stop('Sample time vector length does not match number of lineages.')
		names(sts) <- tre$tip.label
	}
	sts <- sts[tre$tip.label]
	
	if (is.na(minblen)){
		minblen <- diff(range(sts))/ length(sts) / 10 # TODO: choice of this parameter is difficult; may require separate optimisation / cross-validation
		cat(paste0('Note: Minimum temporal branch length (*minblen*) set to ', minblen, '. Increase *minblen* in the event of convergence failures. \n'))
	}
	if (!is.na(omega0) & numStartConditions > 0 ){
		warning('Providing *omega0* is incompatible with numStartConditions > 0. Setting numStartConditions to zero.')
		numStartConditions <- 0
	}
	
	intree_rooted <- TRUE
	if (!is.rooted(tre)){
		intree_rooted <- FALSE
		cat( 'Tree is not rooted. Searching for best root position. Increase searchRoot to try harder.\n')
		searchRoot <- round( searchRoot )
		rtres <- .multi.rtt(tre, sts, topx=searchRoot, ncpu = ncpu)
		if (ncpu > 1 )
		{
			if (parallel_foreach){
				success = requireNamespace('foreach', quietly=TRUE)
				if (!success) stop('*parallel_foreach* requires the `foreach` package. Stopping.')
				`%dopar%` <- foreach::`%dopar%`
				tds <- foreach::foreach( t = iterators::iter( rtres )) %dopar% {
					.dater( t, sts, s = s, omega0=omega0, minblen=minblen, maxit=maxit,abstol=abstol
						, clock = clock 
						, temporalConstraints = temporalConstraints, quiet = quiet
						, estimateSampleTimes = estimateSampleTimes
						, estimateSampleTimes_densities = estimateSampleTimes_densities  
						, numStartConditions = numStartConditions
						, meanRateLimits = meanRateLimits
						, lnd.mean.rate.prior =  lnd.mean.rate.prior 
						, tiplabel_est_samp_times = tiplabel_est_samp_times
						) 
				}
			} else{
				tds <- parallel::mclapply( rtres, function(t){
					.dater( t, sts, s = s, omega0=omega0, minblen=minblen, maxit=maxit,abstol=abstol
						, clock = clock 
						, temporalConstraints = temporalConstraints, quiet = quiet
						, estimateSampleTimes = estimateSampleTimes
						, estimateSampleTimes_densities = estimateSampleTimes_densities  
						, numStartConditions = numStartConditions
						, meanRateLimits = meanRateLimits
						, lnd.mean.rate.prior =  lnd.mean.rate.prior 
						, tiplabel_est_samp_times = tiplabel_est_samp_times
						) 
				}, mc.cores = ncpu )
			}
		} else{
			tds <- lapply( rtres, function(t) {
				.dater( t, sts, s = s, omega0=omega0, minblen=minblen, maxit=maxit,abstol=abstol
					, clock = clock 
					, temporalConstraints = temporalConstraints, quiet = quiet
					, estimateSampleTimes = estimateSampleTimes
					, estimateSampleTimes_densities = estimateSampleTimes_densities  
					, numStartConditions = numStartConditions
					, meanRateLimits = meanRateLimits
					, lnd.mean.rate.prior = lnd.mean.rate.prior 
					, tiplabel_est_samp_times = tiplabel_est_samp_times
					) 
			})
		}
		lls <- sapply( tds, function(td) td$loglik )
		td <- tds [[ which.max( lls ) ]]
		td$intree_rooted <- FALSE
		.fitDiagnostics ( td )
		return ( td )
	} else{
		cat( 'Tree is rooted. Not estimating root position.\n')
	}
	td = .dater( tre, sts, s = s, omega0=omega0, minblen=minblen, maxit=maxit,abstol=abstol
		, clock = clock
		, quiet = quiet
		, estimateSampleTimes = estimateSampleTimes
		, estimateSampleTimes_densities = estimateSampleTimes_densities  
		, numStartConditions = numStartConditions
		, meanRateLimits = meanRateLimits
		, lnd.mean.rate.prior = lnd.mean.rate.prior 
		, tiplabel_est_samp_times = tiplabel_est_samp_times
	)
	.fitDiagnostics ( td )
	return( td )
}

#' @export 
print.treedater <- function(x, ...){
	.fitDiagnostics( x )
	
    cl <- oldClass(x)
    oldClass(x) <- cl[cl != "treedater"]
    print(x$intree)
    cat('\n Time of common ancestor \n' )
    cat(paste( x$timeOfMRCA, '\n') )
    cat('\n Time to common ancestor (before most recent sample) \n' )
    cat(paste( x$timeToMRCA, '\n') )
    cat( '\n Weighted mean substitution rate (adjusted by branch lengths) \n')
    cat(paste( x$adjusted.mean.rate , '\n'))
    cat( '\n Unadjusted mean substitution rate \n')
    cat(paste( x$mean.rate , '\n'))
    cat( '\n Clock model  \n')
    cat(paste( x$clock , '\n'))
    cat( '\n Coefficient of variation of rates \n')
    cat(paste( x$coef_of_variation, '\n' ))
    
    invisible(x)
}

#' @export 
summary.treedater <- function(object, ...) {
    stopifnot(inherits(object, "treedater"))
    print.treedater( object )
}

#' Produce a goodness of fit plot
#'
#' The sorted tail probabilities (p values) for each edge in the tree under the fitted model
#'
#' @param td A treedater object generated by the \code{dater} function
#'
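#' @examples
#' ## A small simulated example, following the example in ?dater:
#' tre <- ape::rtree(50)
#' sts <- setNames( ape::node.depth.edgelength( tre )[1:ape::Ntip(tre)], tre$tip.label)
#' tre$edge.length <- tre$edge.length * 1e-3
#' td <- dater( tre, sts=sts, s=1000, clock='strict', omega0=.0015 )
#' goodnessOfFitPlot( td )
#'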
#' @export 
goodnessOfFitPlot <- function(td)
{
	stopifnot(inherits(td, "treedater"))
	with( td, 
	{
		graphics::plot( 1:length(edge.p)/length(edge.p), sort( edge.p ), type = 'l'
		 , xlab='Theoretical quantiles', ylab='Edge p value', xlim = c(0,1), ylim = c(0,1))
		graphics::abline( a = 0, b = 1 )
	})
}

# help message when model fit is poor 
.TROUBLESHOOT0 <- '
The following steps may help to fix or alleviate common problems: 
* Check that the vector of sample times is correctly named and that the units are correct. 
* If passing a rooted tree, make sure that the root position was chosen correctly, or estimate the root position by passing an unrooted tree (e.g. pass ape::unroot(tree))
* The root position may be poorly estimated. Try increasing the _searchRoot_ parameter in order to test more lineages as potential root positions. 
* The model may be fit with either a relaxed or a strict molecular clock. Try changing the _clock_ parameter. 
* A poor fit may be due to a small number of lineages with unusual / outlying branch lengths, which can occur due to sequencing error or poor alignment. Try the *outlierTips* command to identify and remove these lineages. 
* Check that there is adequate variance in sample times to estimate a molecular clock by doing a root-to-tip regression. Try the *rootToTipRegressionPlot* command. If the clock rate cannot be reliably estimated, you can constrain it to a plausible range using the _meanRateLimits_ option, which will estimate a time tree conditional on that prior estimate of the clock rate. 
'

# Check for common problems in treedater fit and suggest solutions if applicable 
.fitDiagnostics <- function( td ){
	stopifnot(inherits(td, "treedater"))
	p <- td$edge.p 
	cv <- td$coef_of_variation
	cvprob = FALSE
	pprob = FALSE
	if (cv > 1 ){
		cat('
NOTE: The estimated coefficient of variation of clock rates is high (>1). Sometimes this indicates a poor fit and/or a problem with the data.
		\n')
		cvprob <- TRUE 
	}
	#~ success = requireNamespace('harmonicmeanp', quietly=TRUE)
	#~ 	if ( success ){
	#~ 	  pp <- harmonicmeanp::p.hmp( p )
	#~ 	} else {
	#~ 	  pp <- min( p.adjust( p, 'BH' ))
	#~ 	}
	pp <- min( p.adjust( p, 'BH' ))
	
	if ( pp < .025 ){
		pprob <-  TRUE 
		cat( '
NOTE: The p values for lineage clock rates show at least one outlying value after adjusting for multiple testing. This indicates a poor fit to the data for at least a portion of the phylogenetic tree. To visualize the distribution of p-values, use *goodnessOfFitPlot*. 
		\n')
	}
	if (pprob | cvprob ){
		cat( .TROUBLESHOOT0 )
	}
}


#' Plot evolutionary distance from root to sample times and estimated internal node times and regression lines
#' 
#' If a range of sample times was given, these will be estimated. Red and black respectively indicate sample and internal nodes. 
#' This function will print statistics computed from the linear regression model. 
#' 
#' @param td A fitted treedater object 
#' @param show.tip.labels If TRUE, the names of each sample will be plotted at their corresponding time and evolutionary distance
#' @param textopts An optional list of parameters for plotted tip labels. Passed to the *text* function. 
#' @param pointopts An optional list of parameters for plotted points if showing tip labels. Passed to the *points* function. 
#' @param ... Additional arguments are passed to plot
#' @return The fitted linear model (class 'lm')
#' @examples
#' ## simulate a random tree and sample times for demonstration
#' # make a random tree:
#' tre <- ape::rtree(50)
#' # sample times based on distance from root to tip:
#' sts <- setNames( ape::node.depth.edgelength( tre )[1:ape::Ntip(tre)], tre$tip.label)
#' # modify edge length to represent evolutionary distance with rate 1e-3:
#' tre$edge.length <- tre$edge.length * 1e-3
#' # treedater: 
#' td <- dater( tre, sts =sts, clock='strict', s = 1000, omega0=.0015 )
#' # root to tip regression: 
#' fit = rootToTipRegressionPlot( td )
#' summary(fit)
#' 
#' @export 
rootToTipRegressionPlot <- function(td, show.tip.labels=FALSE, textopts = NULL, pointopts=NULL, ... ){
	stopifnot( inherits( td, 'treedater'))
	dT <- ape::node.depth.edgelength( td  )
	dG <- ape::node.depth.edgelength( td$intree )
	#scatter.smooth( dT, dG )
	sts <- (td$timeOfMRCA+dT[1:ape::Ntip(td)])
	nts <- (td$timeOfMRCA+dT)
	mtip  <- lm( dG[1:ape::Ntip(td)] ~ sts )
	mall  <- lm( dG ~ nts )
	if ( !show.tip.labels){
		graphics::plot( dT + td$timeOfMRCA, dG
		  , col = c(rep('red', ape::Ntip(td)), rep('black', ape::Nnode(td) ) )  
		  , xlab = '' 
		  , ylab = 'Evolutionary distance'
		  , ... 
		)
	} else {
		i <- 1:ape::Ntip(td)
		j <- (ape::Ntip(td)+1):( ape::Ntip(td) + ape::Nnode(td) ) 
		graphics::plot( x = NULL, y = NULL 
		  , xlab = '' 
		  , ylab = 'Evolutionary distance'
		  , xlim = range( dT + td$timeOfMRCA)
		  , ylim = range( dG ) 
		  , ...
		)
		do.call( graphics::points, c( pointopts, list(x =   dT[j] + td$timeOfMRCA, y = dG[j] )))
		do.call( graphics::text, c( list( x = dT[i] + td$timeOfMRCA, y = dG[i] , labels=td$tip.label ), textopts ) )
	}
	graphics::abline( a = coef(mtip)[1], b = coef(mtip)[2], col = 'red' ) 
	graphics::abline( a = coef(mall)[1], b = coef(mall)[2], col = 'black' ) 
	smtip <- summary( mtip )
	cat(paste( 'Root-to-tip mean rate:', coef(mtip)[2], '\n'))
	cat(paste( 'Root-to-tip p value:', smtip$coefficients[2, 4 ], '\n'))
	cat(paste( 'Root-to-tip R squared (variance explained):', smtip$r.squared, '\n'))
	cat('Returning fitted linear model.\n')
	invisible(mtip)
}