library( ape )
library( treedater )
library( phangorn )
library( ggtree )
#' @export
.mltr <- function( fn ){
file.copy( fn, getwd() )
d <- read.dna( fn, format='fasta' )
#fn = basename( path_to_align )
#tdir = tempdir()
#file.copy( fn, tdir )
success = system( paste0( 'iqtree -nt AUTO -redo -m HKY -s ', fn ), intern=FALSE )
if ( success != 0 ){
stop('iqtree problem. Is it installed? ')
}
#file.copy( paste0( tdir, '/', paste0( fn, '.treefile' ) ) , '.')
mltr = read.tree( paste0( fn, '.treefile' ) )
sids0 <- sapply( rownames(d) , function( x ){
y = strsplit( x, split = '[\\/_\\|]+')[[1]]
rey = paste( y, collapse = '([\\/_\\|]+)' )
mltr$tip.label [ grepl( mltr$tip.label, pattern = rey ) ]
})
if ( (!is.vector( sids0 )) | (length( sids0 ) != Ntip(mltr ) )){
print( setdiff( sids0, mltr$tip.label ))
print( setdiff( mltr$tip.label, sids0 ))
stop( 'Some tip labels in tree could not be matched sequence ids' )
}
sids <- setNames( rownames(d), sids0 )
tr <- mltr
tr$tip.label <- sids [ tr$tip.label ]
tr
}
#' Generate a starting tree for beast
#'
#' This assumes the sequence data are in the format generated by prep_tip_labels_phydyn
#' Requirements:
#' 1) Fasta must have labels matching this form hCoV-19/Netherlands/Haarlem_1363688/2020|EPI_ISL_413572|2020-03-01|2020.16393442623|_Il
#' 2) iqtree must be installed and in the environment
#'
#' @param fn File name of input alignment. Must be in current working directory
#' @param treeofn File name for output tree (newick)
#' @param ncpu Integer number of cores
#' @return Will write the starting tree and returns the treedater tree
#' @export
make_starting_tree = function( fn , treeofn = 'startTree.nwk', ncpu = 4){
library( treedater )
library ( ape )
tr <- .mltr( fn )
sts <- sapply( strsplit( tr$tip.label, '\\|' ), function(x){
as.numeric( tail(x,2)[1] )
})
names(sts) <- tr$tip.label
tr1 <- tr
tr1$edge.length <- pmax( 1/29e3/5 , tr1$edge.length )
td = dater( unroot(tr1), sts, s= 29e3, omega0 = .0012, numStartConditions=0, meanRateLimits = c( .0009, .0015) , ncpu = ncpu )
write.tree( td, file = treeofn )
td
}
#~ td = make_starting_tree( 'nl2.fasta' )
#' Running treedater on ML trees on HPC or work stations with many cores
#'
#' Collapses small edges and resolves polytomies randomly many times
#'
#' @param threads number of treedater fits to run in parallel
#' @param ncpu number of cpus to use for each treedater fit
#' @param meanRateLimits passed to treedater::dater
#' @return a bunch of treedater trees corresponding to random resolutions of ML tree
#' @export
cutie_treedater <- function(tr, ntres = 10, threads= 2 , ncpu = 5, meanRateLimits = c( .0008, .0015) )
{
sts <- sapply( strsplit( tr$tip.label, '\\|' ), function(x){
as.numeric( tail(x,2)[1] )
})
names(sts) <- tr$tip.label
trpl <- tr
# collapse small edges, make polytomies
tr <- di2multi( tr, tol = 1e-5 )
# resolve polytomies randomly
tres <- lapply( 1:ntres, function(i) {
tr = unroot( multi2di( tr ) )
tr$edge.length <- pmax( 1/29e3/5, tr$edge.length ) #ensures that edge is >0, makes treedater fit a little nicer
tr
})
tds <- parallel::mclapply( tres, function(tr){
dater( unroot(tr), sts[tr$tip.label], s= 29e3, omega0 = .001, numStartConditions=0, meanRateLimits = meanRateLimits, ncpu = ncpu )
}, mc.cores = threads)
tds
}
#' Make starting trees and ML tree plot
#'
#' This will generate multiple starting trees by ML & treedater.
#' Each tree is produced by a different random resolution of polytomies in the ML tree
#' Sequence names must have sample times included (see prep_tip_labels)
#'
#' @param fastafn File name of sequence data (needed for ML tree estimation)
#' @param treeoutfn File name of trees to be written
#' @param plotout File name of ML tree plot. set to Null to not plot
#' @param regionDemes demes to colour in the output tree plot
#' @param ntres integer how many start trees to produce? a distinct xml is made for each
#' @param ncpu Number of CPUs to use
#' @return Some treedater trees. New start trees are written to disk. ML tree plot written to disk
#' @export
make_starting_trees <- function( fastafn, treeoutfn='startTrees.nwk' , plotout=NULL, regionDemes=c('Il'), ntres = 1, ncpu = 4 ){
library( treedater )
library ( ape )
if ( inherits( fastafn, 'phylo' ) )
tr <- fastafn
else
tr = .mltr( fastafn )
sts <- sapply( strsplit( tr$tip.label, '\\|' ), function(x){
as.numeric( tail(x,2)[1] )
})
names(sts) <- tr$tip.label
trpl <- tr
tr <- di2multi( tr, tol = 1e-5 ) # make polytomies
tres <- lapply( 1:ntres, function(i) { # resolve polytomies randomly
tr = unroot( multi2di( tr ) )
tr$edge.length <- pmax( 1/29e3/5, tr$edge.length ) #ensures that edge is >0, makes treedater fit a little nicer
tr
})
tds <- lapply( tres, function(tr){
dater( unroot(tr), sts[tr$tip.label], s= 29e3, omega0 = .0012, numStartConditions=0, meanRateLimits = c( .0009, .0015) , ncpu = ncpu )
})
if(!is.null(plotout)){
library( phangorn )
library( ggtree )
curroot <- phangorn::getRoot(trpl)
if(curroot!=phangorn::getRoot(tds[[1]]$intree))
trroot <- root(trpl, node = phangorn::getRoot(tds[[1]]$intree))
else trroot <- trpl
treedata <- sapply(strsplit(trroot$tip.label, "_"),
tail, 1)
treedata <- data.frame(tip.label = trroot$tip.label,
region = treedata %in% regionDemes, row.names = NULL,
stringsAsFactors = FALSE)
treedata$size[!treedata$region] <- 0
treedata$region[!treedata$region] <- NA
plt <- ggtree(trroot)
plt <- plt %<+% treedata + geom_tippoint(aes(color = region),
na.rm = TRUE, show.legend = FALSE, size = 1.25) +
theme_tree2(legend.position = "none")
ggsave(plt, file = plotout, width = 4, height = 7)
}
outtrees = lapply( tds, function(x){
class(x) <- 'phylo'
x
})
class( outtrees ) <- 'multiPhylo'
write.tree( outtrees
, file = treeoutfn
)
invisible( tds )
}
#' Make starting trees, insert into beast xml and create ML tree plot
#'
#' This will generate multiple starting trees by ML & treedater.
#' Each tree is produced by a different random resolution of polytomies in the ML tree
#' Sequence names must have sample times included (see prep_tip_labels)
#'
#' @param xmlfn File name of beast xml
#' @param fastafn File name of sequence data (needed for ML tree estimation)
#' @param plotout Output file name for ML tree plot. Set NULL to not plot
#' @param regionDemes regions to colour in the output ML tree
#' @param ntres integer how many start trees to produce? a distinct xml is made for each
#' @param ncpu Number of CPUs to use
#' @return Some treedater trees. New XMLs are written to disk
#' @export
add_starting_trees_to_xml <- function( xmlfn , fastafn , plotout=NULL, regionDemes=c('Il'), ntres = 1, ncpu = 4 ){
if ( inherits( fastafn, 'phylo' ) )
tr <- fastafn
else
tr = .mltr( fastafn )
sts <- sapply( strsplit( tr$tip.label, '\\|' ), function(x){
as.numeric( tail(x,2)[1] )
})
names(sts) <- tr$tip.label
trpl <- tr
tr <- di2multi( tr, tol = 1e-5 ) # make polytomies
tres <- lapply( 1:ntres, function(i) { # resolve polytomies randomly
tr = unroot( multi2di( tr ) )
tr$edge.length <- pmax( 1/29e3/5, tr$edge.length )
tr
})
tds <- lapply( tres, function(tr){
dater( unroot(tr), sts[tr$tip.label], s= 29e3, omega0 = .0012, numStartConditions=0, meanRateLimits = c( .0009, .0015) , ncpu = ncpu )
})
if(!is.null(plotout)){
#trpl$edge.length <- pmax( 1e-6, trpl$edge.length / 29e3 )
library( phangorn )
library( ggtree )
trroot <- tryCatch( {
root(trpl, node=phangorn::getRoot(tds[[1]]$intree)) # seems to raise error some times
} , error = function(e){
root(trpl, node=which.min(tds[[1]]$sts) )
})
treedata <- sapply(strsplit(trroot$tip.label, "_"), tail, 1)
treedata <- data.frame(tip.label=trroot$tip.label,region=treedata %in% regionDemes, row.names=NULL, stringsAsFactors = FALSE)
treedata$size[ !treedata$region ] <- 0
treedata$region[ !treedata$region ] <- NA
plt <- ggtree(trroot)
plt <- plt %<+% treedata +
geom_tippoint(aes(color = region), na.rm=TRUE, show.legend=FALSE, size =1.25) + theme_tree2( legend.position = "none" )
ggsave(plt, file = plotout , width = 4, height=7)
}
x = readLines( xmlfn )
for ( k in 1:ntres ){
xk = gsub( x , pattern = 'START_TREE', replacement = write.tree( tds[[k]] ) )
if ( !grepl( pattern = '\\.xml$', xmlfn ) )
writeLines( xk, con = paste0( xmlfn, '.', k ) )
else
writeLines( xk, con = gsub( pattern = '\\.xml$', replacement = paste0('\\.',k,'\\.xml'), xmlfn ) )
}
tds
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.