beast.pool.clusters: Pool phylogenetic clusters for separate BEAST runs
In olli0601/rBEAST: rBEAST

Description Usage Author(s) See Also Examples

This function returns a list of data tables. Each data table contains the pooled taxa for a single BEAST run. See Examples.

1	beast.pool.clusters(cluphy.df, how = NA, verbose = 1, ...)

Oliver Ratmann

beastxml.from.template

require(rBEAST)
file				<- '~/git/rBEAST/inst/extra/exclupool_150414.R'
load(file)	
#expect data.table 'df' with (at least) columns
#	"TAXON_ID" 			taxon id
#	"CLU_ID" 			phylogenetic cluster ID of sequence, NA if not in any cluster
#	"SAMPLINGTIME"		sampling time of taxon

#	option 'by.samplingtime'
#	defines cluster sampling time as the earliest taxon sampling time
#	ranks clusters by cluster sampling time
#	pools clusters by evenly spaced rank, ie for 3 clusters, ranks 1, 4, 7, ... are grouped into one pool
#
#	pool.ntip: guide to the number of sequences in each pool. This is typically not met exactly, because sequences contain differing numbers of sequences.
df.clupool			<- beast.pool.clusters(df, how='by.samplingtime', pool.ntip=150, verbose=1)

#	option 'by.samplingtime.alwaysincludebeforeyear'
#	as option 'by.samplingtime'
#	and in addition includes all early cluster with cluster sampling time before 'pool.includealwaysbeforeyear' into each pool
#
#	pool.ntip: guide to the number of sequences in each pool. This is typically not met exactly, because sequences contain differing numbers of sequences.
#	pool.includealwaysbeforeyear: year before which clusters are always included.
df.clupool			<- beast.pool.clusters(df, how='by.samplingtime.alwaysincludebeforeyear', pool.ntip=150, pool.includealwaysbeforeyear=1996, verbose=1)

#	option 'by.required.seq.per.samplingperiod'

#	pool.ntip.guide: guide to the number of sequences in each pool. This is typically not met exactly, because sequences contain differing numbers of sequences.
#	pool.breaks: sampling periods defined in terms of height (max sampling time - sampling time)
#	pool.ntip.min: minimum number of sequences required per sampling period; NA if none required
df.clupool			<- beast.pool.clusters(df, how='by.required.seq.per.samplingperiod', pool.ntip.guide=150, pool.ntip.min=c(50, 70, 70, NA, NA), pool.breaks=c(0, 1.596, 3.596, 5.596, 9.596, Inf), verbose=1)
require(rBEAST)
infile.seq				<- system.file(package="rBEAST", 'extra','sim_150416_seq.R')
infile.tree				<- system.file(package="rBEAST", 'extra','sim_150416_njtree.R')
infile.xml.template		<- system.file(package="rBEAST", 'xml_templates','BEAST_template_um192rhU2080.xml')
outdir.xml				<- getwd()
outfile.xml				<- 'sim_150416_um192rhU2080.xml'

#	load data
load(infile.seq)		
#	expect data.table "df.seq" with columns
#	"TAXON_ID"		taxon ID
#	"TAXON_NAME" 	taxon name
#	"SAMPLINGTIME"	taxon sampling time
#	"GAG"   		gag sequence as character string
#	"POL"   		pol sequence as character string
#	"ENV"   		env sequence as character string
#	"CLU_ID" 		phylogenetic cluster ID of sequence

#	use POL sequences
setnames(df.seq,'POL','SEQ')
#	load tree
load(infile.tree)
#	expect ape tree 'ph'

#	illustrate automatic generation of multiple XML files, each for one pool of sequences
#	with monophyly constraints on phylogenetic clusters.
#	with this option, an appropriate starting tree should be specified
#
#	group clusters into separate pools
df.clupool	<- beast.pool.clusters(df.seq, how='by.samplingtime', pool.ntip=300, verbose=1)
#	generate XML file from template
for( pid in seq_along(df.clupool) )
{
	df			<- df.clupool[[pid]]	
	#	get newick start tree for all sequences in pool
	start.tree	<- beast.get.startingtree(ph, df, verbose=1)				
	#	get xml file from template			
	bxml		<- beastxml.from.template(infile.xml.template, df, start.tree=start.tree, xml.taxonsets4clusters=1, xml.monophyly4clusters=1,  verbose=1)
	#	adjust xml file depending on the template that is used
	beast.adjust.gmrfSkyrideLikelihood(bxml, df, verbose=1)				
	beast.adjust.prior.rootheight(bxml, df, rootHeight.idref='treeModel.rootHeight', verbose=1)
	beast.adjust.mcmc(bxml, 
			mcmc.chainLength=beast.mcmc.chainLength, 
			mcmc.logEvery=beast.mcmc.chainLength/1e4, 
			mcmc.outfile=gsub('.xml', paste('_pool',pid,'.xml',sep=''),outfile.xml), 
			verbose=1)
	#	write to file
	saveXML(bxml, file=paste(outdir.xml,gsub('.xml', paste('_pool',pid,'.xml',sep=''),outfile.xml),sep='/'))	
}