R/starttree1.R

Defines functions add_starting_trees_to_xml make_starting_trees cutie_treedater make_starting_tree .mltr

Documented in add_starting_trees_to_xml cutie_treedater make_starting_tree make_starting_trees

library( ape ) 
library( treedater ) 
library( phangorn )
library( ggtree )

#' @export 
.mltr <-  function( fn ){
	file.copy( fn, getwd() )
	d <- read.dna( fn, format='fasta' ) 
	#fn = basename( path_to_align )
	#tdir = tempdir() 
	#file.copy( fn, tdir ) 
	success = system( paste0( 'iqtree -nt AUTO -redo -m HKY -s ', fn ), intern=FALSE  )
	if ( success != 0 ){
		stop('iqtree problem. Is it installed? ')
	}
	#file.copy( paste0( tdir, '/',  paste0( fn, '.treefile' ) ) , '.')
	mltr = read.tree( paste0( fn, '.treefile' ) )
	
	sids0 <- sapply( rownames(d) , function( x ){
	y = strsplit( x, split = '[\\/_\\|]+')[[1]]
	rey = paste( y, collapse = '([\\/_\\|]+)' ) 
		mltr$tip.label [ grepl( mltr$tip.label, pattern = rey ) ]
	})
	if ( (!is.vector( sids0 )) | (length( sids0 ) != Ntip(mltr ) )){
		print( setdiff( sids0, mltr$tip.label ))
		print( setdiff( mltr$tip.label, sids0 ))
		stop( 'Some tip labels in tree could not be matched sequence ids' )
	}
	
	sids <- setNames( rownames(d), sids0 )

	tr <- mltr
	tr$tip.label <- sids [ tr$tip.label ]
	
	tr 
}

#' Generate a starting tree for beast
#'
#' This assumes the sequence data are in the format generated by prep_tip_labels_phydyn
#' Requirements: 
#' 1) Fasta must have labels matching this form hCoV-19/Netherlands/Haarlem_1363688/2020|EPI_ISL_413572|2020-03-01|2020.16393442623|_Il
#' 2) iqtree must be installed and in the environment 
#' 
#' @param fn File name of input alignment. Must be in current working directory 
#' @param treeofn File name for output tree (newick)
#' @param ncpu Integer number of cores 
#' @return Will write the starting tree and returns the treedater tree 
#' @export 
make_starting_tree = function( fn , treeofn = 'startTree.nwk', ncpu = 4){
	library( treedater ) 
	library ( ape ) 
	tr <- .mltr( fn )

	sts <- sapply( strsplit( tr$tip.label, '\\|' ), function(x){
		as.numeric( tail(x,2)[1] )
	})
	names(sts) <- tr$tip.label 

	tr1 <- tr 
	tr1$edge.length <- pmax( 1/29e3/5 , tr1$edge.length )
	td = dater( unroot(tr1), sts, s= 29e3, omega0 = .0012, numStartConditions=0, meanRateLimits = c( .0009, .0015) , ncpu = ncpu )

	write.tree( td, file = treeofn ) 
	td 
}

#~ td = make_starting_tree( 'nl2.fasta' )

#' Running treedater on ML trees on HPC or work stations with many cores
#' 
#' Collapses small edges and resolves polytomies randomly many times
#'  
#' @param threads number of treedater fits to run in parallel
#' @param ncpu number of cpus to use for each treedater fit 
#' @param meanRateLimits passed to treedater::dater
#' @return a bunch of treedater trees corresponding to random resolutions of ML tree 
#' @export 
cutie_treedater <- function(tr, ntres = 10, threads= 2 , ncpu = 5,  meanRateLimits = c( .0008, .0015) )
{
  sts <- sapply( strsplit( tr$tip.label, '\\|' ), function(x){
    as.numeric( tail(x,2)[1] )
  })
  names(sts) <- tr$tip.label 
  trpl <- tr
  # collapse small edges, make polytomies 
  tr <- di2multi( tr, tol = 1e-5 ) 
  # resolve polytomies randomly 
  tres <- lapply( 1:ntres, function(i) { 
    tr = unroot( multi2di( tr )  )  
    tr$edge.length <- pmax( 1/29e3/5, tr$edge.length  ) #ensures that edge is >0, makes treedater fit a little nicer 
    tr
  })
  tds <- parallel::mclapply( tres, function(tr){
    dater( unroot(tr), sts[tr$tip.label], s= 29e3, omega0 = .001, numStartConditions=0, meanRateLimits = meanRateLimits, ncpu = ncpu )
  }, mc.cores = threads)
  tds
}

#' Make starting trees and ML tree plot
#' 
#' This will generate multiple starting trees by ML & treedater. 
#' Each tree is produced by a different random resolution of polytomies in the ML tree
#' Sequence names must have sample times included (see prep_tip_labels)
#' 
#' @param fastafn File name of sequence data (needed for ML tree estimation)
#' @param treeoutfn File name of trees to be written 
#' @param plotout File name of ML tree plot. set to Null to not plot
#' @param regionDemes demes to colour in the output tree plot
#' @param ntres integer how many start trees to produce? a distinct xml is made for each 
#' @param ncpu Number of CPUs to use 
#' @return Some treedater trees. New start trees are written to disk. ML tree plot written to disk
#' @export
make_starting_trees <- function(  fastafn, treeoutfn='startTrees.nwk' , plotout=NULL, regionDemes=c('Il'), ntres = 1,  ncpu = 4 ){
	library( treedater ) 
	library ( ape ) 
  if ( inherits( fastafn, 'phylo' )  )
    tr <- fastafn 
  else
    tr = .mltr( fastafn  )
  
  sts <- sapply( strsplit( tr$tip.label, '\\|' ), function(x){
    as.numeric( tail(x,2)[1] )
  })
  names(sts) <- tr$tip.label 
  trpl <- tr
  tr <- di2multi( tr, tol = 1e-5 ) # make polytomies 
  tres <- lapply( 1:ntres, function(i) { # resolve polytomies randomly 
    tr = unroot( multi2di( tr )  )  
    tr$edge.length <- pmax( 1/29e3/5, tr$edge.length  ) #ensures that edge is >0, makes treedater fit a little nicer 
    tr
  })
  tds <- lapply( tres, function(tr){
    dater( unroot(tr), sts[tr$tip.label], s= 29e3, omega0 = .0012, numStartConditions=0, meanRateLimits = c( .0009, .0015) , ncpu = ncpu )
  })
  
  if(!is.null(plotout)){
    library( phangorn )
    library( ggtree )
    curroot <- phangorn::getRoot(trpl)
    if(curroot!=phangorn::getRoot(tds[[1]]$intree))
      trroot <- root(trpl, node = phangorn::getRoot(tds[[1]]$intree))
    else trroot <- trpl
    treedata <- sapply(strsplit(trroot$tip.label, "_"), 
                       tail, 1)
    treedata <- data.frame(tip.label = trroot$tip.label, 
                           region = treedata %in% regionDemes, row.names = NULL, 
                           stringsAsFactors = FALSE)
    treedata$size[!treedata$region] <- 0
    treedata$region[!treedata$region] <- NA
    plt <- ggtree(trroot)
    plt <- plt %<+% treedata + geom_tippoint(aes(color = region), 
                                             na.rm = TRUE, show.legend = FALSE, size = 1.25) + 
      theme_tree2(legend.position = "none")
    ggsave(plt, file = plotout, width = 4, height = 7)
  }
  
  outtrees = lapply( tds, function(x){
    class(x) <- 'phylo'
    x
  })
  class( outtrees ) <- 'multiPhylo' 
  write.tree( outtrees 
              , file = treeoutfn 
  )
  
  invisible( tds )
}



#' Make starting trees, insert into beast xml and create ML tree plot
#' 
#' This will generate multiple starting trees by ML & treedater. 
#' Each tree is produced by a different random resolution of polytomies in the ML tree
#' Sequence names must have sample times included (see prep_tip_labels)
#' 
#' @param xmlfn File name of beast xml
#' @param fastafn File name of sequence data (needed for ML tree estimation)
#' @param plotout Output file name for ML tree plot. Set NULL to not plot
#' @param regionDemes regions to colour in the output ML tree
#' @param ntres integer how many start trees to produce? a distinct xml is made for each 
#' @param ncpu Number of CPUs to use 
#' @return Some treedater trees. New XMLs are written to disk 
#' @export
add_starting_trees_to_xml <- function( xmlfn ,  fastafn , plotout=NULL, regionDemes=c('Il'), ntres = 1,  ncpu = 4 ){
  if ( inherits( fastafn, 'phylo' )  )
    tr <- fastafn 
  else
    tr = .mltr( fastafn  )
  
  sts <- sapply( strsplit( tr$tip.label, '\\|' ), function(x){
    as.numeric( tail(x,2)[1] )
  })
  names(sts) <- tr$tip.label 
  trpl <- tr
  tr <- di2multi( tr, tol = 1e-5 ) # make polytomies 
  tres <- lapply( 1:ntres, function(i) { # resolve polytomies randomly 
    tr = unroot( multi2di( tr )  )  
    tr$edge.length <- pmax( 1/29e3/5, tr$edge.length ) 
    tr
  })
  tds <- lapply( tres, function(tr){
    dater( unroot(tr), sts[tr$tip.label], s= 29e3, omega0 = .0012, numStartConditions=0, meanRateLimits = c( .0009, .0015) , ncpu = ncpu )
  })
  if(!is.null(plotout)){
    #trpl$edge.length <- pmax( 1e-6, trpl$edge.length / 29e3 )
    library( phangorn )
    library( ggtree )
    trroot <- tryCatch( { 
		root(trpl, node=phangorn::getRoot(tds[[1]]$intree)) # seems to raise error some times 
	} , error = function(e){
		root(trpl, node=which.min(tds[[1]]$sts)  )
	})
    treedata <- sapply(strsplit(trroot$tip.label, "_"), tail, 1)
    treedata <- data.frame(tip.label=trroot$tip.label,region=treedata %in% regionDemes, row.names=NULL, stringsAsFactors = FALSE)
    treedata$size[ !treedata$region ] <- 0
    treedata$region[ !treedata$region ] <- NA
    plt <- ggtree(trroot)
    plt <- plt %<+% treedata +
      geom_tippoint(aes(color = region), na.rm=TRUE, show.legend=FALSE, size =1.25) + theme_tree2( legend.position = "none" )
    ggsave(plt, file = plotout , width = 4, height=7)
    
  }
  
  x = readLines( xmlfn ) 
  for ( k in 1:ntres ){
    xk = gsub( x , pattern = 'START_TREE', replacement = write.tree( tds[[k]] )  )
    if ( !grepl( pattern = '\\.xml$', xmlfn )  )
      writeLines( xk, con =  paste0( xmlfn, '.', k )  )
    else 
      writeLines( xk, con =  gsub( pattern = '\\.xml$', replacement = paste0('\\.',k,'\\.xml'), xmlfn ) )
  }
  tds
}
emvolz-phylodynamics/sarscov2Rutils documentation built on Nov. 17, 2020, 9:22 a.m.