R/script.beast.R

.onAttach <- function(...)
{
	packageStartupMessage("rBEAST 1.0.0\nhttps://github.com/olli0601/rBEAST")
}
######################################################################################
#	write xml file for BEAST multilocus.hky run
#' @title Generate multilocus XML file with a codon specific GTR model and fixed starting tree
#' @description See Examples.
# import XML phytools ape data.table reshape2 ggplot2
#' @export
#' @example example/ex.beastxml.multilocus.codon.gtr.R
#' @author Oliver Ratmann
beastxml.multilocus.codon.gtr<- function( file.name, seq.select, phd, verbose=1 )
{
	stopifnot(c("CLU_ID", "TAXON_ID", "TAXON_NAME", "SEQ", "SAMPLINGTIME")%in%names(seq.select))
	file.name.base	<- gsub('.xml','',rev(strsplit(file.name,'/')[[1]])[1])
	#	write XML file with new sequences
	bxml			<- newXMLDoc(addFinalizer=T)
	bxml.beast		<- newXMLNode("beast", doc=bxml, addFinalizer=T)
	tmp				<- newXMLCommentNode(text=paste("Generated by beastxml.multilocus.codon.gtr"), parent=bxml.beast, doc=bxml, addFinalizer=T)
	#	add new taxa
	beast.add.taxa(bxml, seq.select, beast.date.direction= "forwards", beast.date.units= "years", verbose=1)
	#	add alignment for every transmission cluster
	for(clu in seq.select[, unique(CLU_ID)])
	{
		beast.add.alignment(bxml, subset(seq.select, CLU_ID==clu), df.seqfield='SEQ',  beast.alignment.id=paste("alignment_CLU",clu,sep=''), beast.alignment.dataType= "nucleotide", verbose=1)
	}
	#	add starting tree for every transmission cluster
	for(clu in names(phd))
	{
		beast.add.startingtree(bxml, phd[[clu]], beast.usingDates="true", beast.newickid=paste("startingTree_CLU",clu,sep=''), beast.brlunits="years", verbose=1)
	}
	#	add CODON patterns for every alignment
	for(clu in seq.select[, unique(CLU_ID)])
		for(k in 1:3)
		{
			beast.add.patterns(bxml, paste("patterns.CP",k,"_CLU",clu,sep=''), paste("alignment_CLU",clu,sep=''), k, beast.patterns.every=3, beast.patterns.strip='false', verbose=1)
		}
	#	add tree model for every alignment
	for(clu in seq.select[, unique(CLU_ID)])
	{
		beast.add.treemodel(bxml, paste("treeModel_CLU",clu,sep=''), paste("treeModel.rootHeight_CLU",clu,sep=''), paste("treeModel.internalNodeHeights_CLU",clu,sep=''), paste("treeModel.allInternalNodeHeights_CLU",clu,sep=''), newick.id=paste("startingTree_CLU",clu,sep=''), internalNodes='true', rootNode='true', verbose=1)
	}
	#	add skygrid
	tmp					<- sapply( seq.select[, unique(CLU_ID)], function(x) paste("treeModel_CLU",x,sep='') )
	grid.height			<- max(sapply(phd, function(ph) max(node.depth.edgelength(ph))))
	beast.add.gmrfskygrid(bxml, "skygrid", "skygrid.logPopSize", "skygrid.precision", "skygrid.numGridPoints", "skygrid.cutOff", tmp,
			80, -2.396895772465287, 1.0, 0.0, grid.height, verbose=1)
	#	add uncorrelated relaxed clock for every transmission cluster
	tmp					<- seq.select[, unique(CLU_ID)]
	clu					<- tmp[1]
	beast.add.discretizedBranchRates(bxml, paste("branchRates_CLU",clu,sep=''), paste("treeModel_CLU",clu,sep=''), paste("branchRates.categories_CLU",clu,sep=''),
			mean.id="ucld.mean", mean.value="0.0025", mean.lower="0.0", sd.id="ucld.stdev", sd.value="0.3333333333333333", sd.lower="0.0", meanInRealSpace="true",
			rateCategories.dimension= subset(seq.select, CLU_ID==clu)[, 2*(length(unique(TAXON_ID))-1)], verbose=1)
	for(clu in tmp[-1])
	{
		beast.add.discretizedBranchRates(bxml, paste("branchRates_CLU",clu,sep=''), paste("treeModel_CLU",clu,sep=''), paste("branchRates.categories_CLU",clu,sep=''),
				mean.idref="ucld.mean", sd.idref="ucld.stdev", meanInRealSpace="true",
				rateCategories.dimension= subset(seq.select, CLU_ID==clu)[, 2*(length(unique(TAXON_ID))-1)], verbose=1)
	}
	#	add rate statistic for first cluster
	for(clu in seq.select[, unique(CLU_ID)])
	{
		beast.add.rateStatistics(bxml, paste('branchRatesStat_CLU',clu, sep=''), paste('treeModel_CLU',clu, sep=''), paste('branchRates_CLU',clu, sep=''))
	}
	#	add GTR CP1 2 3  models
	for(k in 1:3)
	{
		clu			<- seq.select[, unique(CLU_ID)][1]
		beast.add.gtrModel(bxml, paste("gtr.CP",k,"_CLU", clu,sep=""), paste("patterns.CP",k,"_CLU",clu,sep=''), paste("gtr.frequencies.CP",k,"_CLU",clu,sep=''),
				rateAC.id=paste("gtr.ac.CP",k,sep=""), rateAC.value="0.3", rateAC.lower="0.0",
				rateAG.id=paste("gtr.ag.CP",k,sep=""), rateAG.value="1.0", rateAG.lower="0.0",
				rateAT.id=paste("gtr.at.CP",k,sep=""), rateAT.value="0.2", rateAT.lower="0.0",
				rateCG.id=paste("gtr.cg.CP",k,sep=""), rateCG.value="0.2", rateCG.lower="0.0",
				rateGT.id=paste("gtr.gt.CP",k,sep=""), rateGT.value="0.1", rateGT.lower="0.0",
				frequencies.dimension='4', frequencyModel.dataType='nucleotide')
		for(clu in seq.select[, unique(CLU_ID)][-1])
			beast.add.gtrModel(bxml, paste("gtr.CP",k,"_CLU", clu,sep=""), paste("patterns.CP",k,"_CLU",clu,sep=''), paste("gtr.frequencies.CP",k,"_CLU",clu,sep=''),
					rateAC.idref=paste("gtr.ac.CP",k,sep=""),
					rateAG.idref=paste("gtr.ag.CP",k,sep=""),
					rateAT.idref=paste("gtr.at.CP",k,sep=""),
					rateCG.idref=paste("gtr.cg.CP",k,sep=""),
					rateGT.idref=paste("gtr.gt.CP",k,sep=""),
					frequencies.dimension='4', frequencyModel.dataType='nucleotide')
	}
	#	add CP1 2 3  site models
	for(k in 1:3)
	{
		clu			<- seq.select[, unique(CLU_ID)][1]
		beast.add.siteModel(bxml, paste("site.CP",k,"_CLU", clu,sep=""), paste("gtr.CP",k,"_CLU", clu,sep=""),
				relativeRate.id=paste("site.mu.CP",k,sep=""), relativeRate.value="1.0", relativeRate.lower="0.0",
				gamma.id=paste("site.alpha.CP",k,sep=""), gamma.value="0.5", gamma.lower="0.0",
				gammaCategories="4", verbose=1)
		for(clu in seq.select[, unique(CLU_ID)][-1])
			beast.add.siteModel(bxml, paste("site.CP",k,"_CLU", clu,sep=""), paste("gtr.CP",k,"_CLU", clu,sep=""),
					relativeRate.idref=paste("site.mu.CP",k,sep=""), gamma.idref=paste("site.alpha.CP",k,sep=""), gammaCategories="4", verbose=1)
	}
	#	copy compoundParameter
	beast.add.compoundParameter(bxml, 'all.site.mu', select.id='site.mu', verbose=1)
	#	add treeLikelihood
	for(clu in seq.select[, unique(CLU_ID)])
		for(k in 1:3)
		{
			beast.add.treeLikelihood(bxml, paste("treeLikelihood.CP",k,"_CLU",clu,sep=""),
					paste("patterns.CP",k,"_CLU",clu,sep=''),
					paste("treeModel_CLU",clu,sep=''),
					paste('site.CP',k,'_CLU',clu,sep=''),
					paste("branchRates_CLU",clu,sep=''), useAmbiguities='false', verbose=1)
		}
	#	add compound likelihood
	tmp					<- unique(unlist(xpathApply(bxml, "//treeLikelihood[@id]", xmlGetAttr, "id")))
	beast.add.likelihood(bxml, 'likelihood', tmp)
	#	add prior element
	beast.add.priors(bxml, 'prior')
	#	add priors
	beast.add.uniformPrior(bxml, "gtr.ac.CP1", 0, 1.5)
	beast.add.gammaPrior(bxml, "gtr.ag.CP1", 0.05, 20, 0)
	beast.add.uniformPrior(bxml, "gtr.at.CP1", 0, .5)
	beast.add.uniformPrior(bxml, "gtr.cg.CP1", 0, 1)
	beast.add.uniformPrior(bxml, "gtr.gt.CP1", 0, .5)

	beast.add.uniformPrior(bxml, "gtr.ac.CP2", 0, 1.5)
	beast.add.gammaPrior(bxml, "gtr.ag.CP2", 0.05, 20, 0)
	beast.add.uniformPrior(bxml, "gtr.at.CP2", 0, .5)
	beast.add.uniformPrior(bxml, "gtr.cg.CP2", 0, 1)
	beast.add.uniformPrior(bxml, "gtr.gt.CP2", 0, .5)

	beast.add.uniformPrior(bxml, "gtr.ac.CP3", 0, 1.5)
	beast.add.gammaPrior(bxml, "gtr.ag.CP3", 0.05, 20, 0)
	beast.add.uniformPrior(bxml, "gtr.at.CP3", 0, .5)
	beast.add.uniformPrior(bxml, "gtr.cg.CP3", 0, 1)
	beast.add.uniformPrior(bxml, "gtr.gt.CP3", 0, .5)

	for(k in 1:3)
		beast.add.uniformPrior(bxml, paste("site.mu.CP",k,sep=''), 0.1, 10)
	for(k in 1:3)
		beast.add.exponentialPrior(bxml, paste("site.alpha.CP",k,sep=''), 0.5, 0)

	beast.add.uniformPrior(bxml, "ucld.mean", 0.0001, 0.006)
	beast.add.exponentialPrior(bxml, "ucld.stdev", 0.1, 0)
	beast.add.gammaPrior(bxml, "skygrid.precision", 0.001, 1000, 0)
	beast.add.gmrfSkyGridPrior(bxml, 'skygrid')
	#	add mcmc element
	beast.add.mcmc(bxml, 	mcmc.id='mcmc', posterior.id='posterior', prior.idref='prior', likelihood.idref='likelihood',
			mcmc.chainLength=20e6, mcmc.autoOptimize='true', mcmc.operatorAnalysis= paste(file.name.base, '.ops',sep=''))
	#	add operator element
	beast.add.operators(bxml, 'operators')
	#	add scale operators
	tmp					<- as.vector( sapply( c("gtr.ac.CP","gtr.ag.CP","gtr.at.CP","gtr.cg.CP","gtr.gt.CP","site.alpha.CP"), function(x) paste(x,1:3,sep='') ) )
	tmp					<- c("skygrid.precision", tmp)
	tmp					<- data.table(	ID= tmp, SF=0.75, W=0.1 )
	for(i in seq_len(nrow(tmp)))
		beast.add.scaleOperator(bxml, tmp[i,ID], tmp[i,SF], tmp[i,W])
	tmp					<- data.table(	ID= c('ucld.mean',"ucld.stdev"), SF=0.75, W=3 )
	for(i in seq_len(nrow(tmp)))
		beast.add.scaleOperator(bxml, tmp[i,ID], tmp[i,SF], tmp[i,W])
	#	add deltaExchange operators
	beast.add.deltaExchangeOperator(bxml, 'all.site.mu', delta=0.75, parameterWeights=c(948, 948, 948), weight=9)
	#	add gmrf block operators
	beast.add.gmrfGridBlockUpdateOperator(bxml, 'skygrid', scaleFactor=2, weight=2)
	#	add branch rate category operators
	for(clu in seq.select[, unique(CLU_ID)])
	{
		beast.add.uniformIntegerOperator(bxml, paste("branchRates.categories_CLU",clu,sep=''), 10)
		beast.add.swapOperator(bxml, paste("branchRates.categories_CLU",clu,sep=''), size=1, weight=10, autoOptimize="false")
	}
	#	add screen log
	beast.add.screenLog(bxml, log.id='screenLog', posterior.idref='posterior', likelihood.idref="likelihood", prior.idref="prior", logEvery=2e4)
	#	add file log
	beast.add.fileLog(bxml, log.id='fileLog', posterior.idref='posterior', likelihood.idref="likelihood", prior.idref="prior", logEvery=2e4,
			log.fileName=paste(file.name.base, '.log',sep=''), log.overwrite='false',
			select.id=c('rootHeight', 'ucld', 'gtr', 'alpha', 'skygrid','branchRatesStat'), deselect.id=c('frequencies','categories'))
	#	add tree log for every transmission cluster
	#for(clu in seq.select[, unique(CLU_ID)])
	#{
	#	beast.add.logTree(bxml, paste("logTree_CLU",clu,sep=''), paste("treeModel_CLU",clu,sep=''), paste("branchRates_CLU",clu,sep=''), 'posterior',
	#			logEvery=2e4, nexusFormat='true', sortTranslationTable='true',
	#			fileName=paste(file.name.base,'_CLU', clu, '.trees', sep=''), verbose=1)
	#}
	bxml
}
######################################################################################
#' @title Generate multilocus XML file with HKY model and fixed starting tree
#' @description See Examples.
# import XML phytools ape data.table reshape2 ggplot2
#' @export
#' @author Oliver Ratmann
beastxml.multilocus.hky.fixed.topology<- function( file.name, seq.select, phd, verbose=1 )
{
	stopifnot(c("CLU_ID", "TAXON_ID", "TAXON_NAME", "SEQ", "SAMPLINGTIME")%in%names(seq.select))
	file.name.base	<- gsub('.xml','',rev(strsplit(file.name,'/')[[1]])[1])
	#	write XML file with new sequences
	bxml			<- newXMLDoc(addFinalizer=T)
	bxml.beast		<- newXMLNode("beast", doc=bxml, addFinalizer=T)
	tmp				<- newXMLCommentNode(text=paste("Generated by beastxml.multilocus.hky.fixed.tree"), parent=bxml.beast, doc=bxml, addFinalizer=T)
	#	add new taxa
	beast.add.taxa(bxml, seq.select, beast.date.direction= "forwards", beast.date.units= "years", verbose=1)
	#	add alignment for every transmission cluster
	for(clu in seq.select[, unique(CLU_ID)])
	{
		beast.add.alignment(bxml, subset(seq.select, CLU_ID==clu), df.seqfield='SEQ', beast.alignment.id=paste("alignment_CLU",clu,sep=''), beast.alignment.dataType= "nucleotide", verbose=1)
	}
	#	add starting tree for every transmission cluster
	for(clu in names(phd))
	{
		beast.add.startingtree(bxml, phd[[clu]], beast.usingDates="true", beast.newickid=paste("startingTree_CLU",clu,sep=''), beast.brlunits="years", verbose=1)
	}
	#	add CODON patterns for every alignment
	for(clu in seq.select[, unique(CLU_ID)])
	{
		beast.add.patterns(bxml, paste("patterns_CLU",clu,sep=''), paste("alignment_CLU",clu,sep=''), 1, beast.patterns.every=NA, beast.patterns.strip='false', verbose=1)
	}
	#	add tree model for every alignment
	for(clu in seq.select[, unique(CLU_ID)])
	{
		beast.add.treemodel(bxml, paste("treeModel_CLU",clu,sep=''), paste("treeModel.rootHeight_CLU",clu,sep=''), paste("treeModel.internalNodeHeights_CLU",clu,sep=''), paste("treeModel.allInternalNodeHeights_CLU",clu,sep=''), newick.id=paste("startingTree_CLU",clu,sep=''), internalNodes='true', rootNode='true', verbose=1)
	}
	#	add skygrid
	tmp					<- sapply( seq.select[, unique(CLU_ID)], function(x) paste("treeModel_CLU",x,sep='') )
	grid.height			<- max(sapply(phd, function(ph) max(node.depth.edgelength(ph))))
	beast.add.gmrfskygrid(bxml, "skygrid", "skygrid.logPopSize", "skygrid.precision", "skygrid.numGridPoints", "skygrid.cutOff", tmp,
			80, -2.396895772465287, 1.0, 0.0, grid.height, verbose=1)
	#	add uncorrelated relaxed clock for first cluster
	tmp					<- seq.select[, unique(CLU_ID)]
	clu					<- tmp[1]
	beast.add.discretizedBranchRates(bxml, paste("branchRates_CLU",clu,sep=''), paste("treeModel_CLU",clu,sep=''), paste("branchRates.categories_CLU",clu,sep=''),
			mean.id="ucld.mean", mean.value="0.0025", mean.lower="0.0", sd.id="ucld.stdev", sd.value="0.3333333333333333", sd.lower="0.0", meanInRealSpace="true",
			rateCategories.dimension= subset(seq.select, CLU_ID==clu)[, 2*(length(unique(TAXON_ID))-1)], verbose=1)
	#	link uncorrelated relaxed clock to other clusters
	for(clu in tmp[-1])
	{
		beast.add.discretizedBranchRates(bxml, paste("branchRates_CLU",clu,sep=''), paste("treeModel_CLU",clu,sep=''), paste("branchRates.categories_CLU",clu,sep=''),
				mean.idref="ucld.mean", sd.idref="ucld.stdev", meanInRealSpace="true",
				rateCategories.dimension= subset(seq.select, CLU_ID==clu)[, 2*(length(unique(TAXON_ID))-1)], verbose=1)
	}
	#	add rate statistic for first cluster
	for(clu in seq.select[, unique(CLU_ID)])
	{
		beast.add.rateStatistics(bxml, paste('branchRatesStat_CLU',clu, sep=''), paste('treeModel_CLU',clu, sep=''), paste('branchRates_CLU',clu, sep=''))
	}
	#	add HKY model for first cluster
	clu			<- seq.select[, unique(CLU_ID)][1]
	beast.add.hkyModel(bxml, paste("hky_CLU", clu,sep=""), paste("patterns_CLU",clu,sep=''), paste("hky.frequencies_CLU",clu,sep=''),
			rate.id=paste("hky.kappa",sep=""), rate.value="1.0", rate.lower="0.0",
			frequencies.dimension='4', frequencyModel.dataType='nucleotide')
	#	link HKY model to other clusters
	for(clu in seq.select[, unique(CLU_ID)][-1])
	{
		beast.add.hkyModel(bxml, paste("hky_CLU", clu,sep=""), paste("patterns_CLU",clu,sep=''), paste("hky.frequencies_CLU",clu,sep=''),
				rate.idref=paste("hky.kappa",sep=""),
				frequencies.dimension='4', frequencyModel.dataType='nucleotide')
	}
	#	add site model for first cluster
	clu			<- seq.select[, unique(CLU_ID)][1]
	beast.add.siteModel(bxml, paste("siteModel_CLU", clu,sep=""), paste("hky_CLU", clu,sep=""),
			substitutionModel.class='HKYModel', gamma.id=paste("site.alpha",sep=""), gamma.value="0.5", gamma.lower="0.0", gammaCategories="4", verbose=1)
	#	link site model to other clusters
	for(clu in seq.select[, unique(CLU_ID)][-1])
		beast.add.siteModel(bxml, paste("siteModel_CLU", clu,sep=""), paste("hky_CLU", clu,sep=""),
				substitutionModel.class='HKYModel',  gamma.idref=paste("site.alpha",sep=""), gammaCategories="4", verbose=1)
	#	add treeLikelihood
	for(clu in seq.select[, unique(CLU_ID)])
		beast.add.treeLikelihood(bxml, paste("treeLikelihood_CLU",clu,sep=""),
				paste("patterns_CLU",clu,sep=''),
				paste("treeModel_CLU",clu,sep=''),
				paste('siteModel_CLU',clu,sep=''),
				paste("branchRates_CLU",clu,sep=''), useAmbiguities='false', verbose=1)
	#	add compound likelihood
	tmp					<- as.vector(sapply(seq.select[, unique(CLU_ID)], function(clu) paste("treeLikelihood_CLU",clu,sep="") ))
	beast.add.likelihood(bxml, 'likelihood', tmp)
	#	add prior element
	beast.add.priors(bxml, 'prior')
	#	add priors	https://code.google.com/p/beast-mcmc/wiki/ParameterPriors
	beast.add.logNormalPrior(bxml, "hky.kappa", 1, 1.25, 0)		#default
	beast.add.uniformPrior(bxml, "ucld.mean", 0.0001, 0.006)
	beast.add.exponentialPrior(bxml, "ucld.stdev", 0.1, 0)
	beast.add.exponentialPrior(bxml, "site.alpha", 0.5, 0)
	for(clu in seq.select[, unique(CLU_ID)])
		beast.add.uniformPrior(bxml, paste("treeModel.rootHeight_CLU",clu,sep=''), 0, 80)
	beast.add.gammaPrior(bxml, "skygrid.precision", 0.001, 1000, 0)
	beast.add.gmrfSkyGridPrior(bxml, 'skygrid')
	#	add mcmc element
	beast.add.mcmc(bxml, 	mcmc.id='mcmc', posterior.id='posterior', prior.idref='prior', likelihood.idref='likelihood',
			mcmc.chainLength=20e6, mcmc.autoOptimize='true', mcmc.operatorAnalysis= paste(file.name.base, '.ops',sep=''))
	#	add operator element
	beast.add.operators(bxml, 'operators')
	#	add scale operators
	tmp					<- data.table(	ID= c("hky.kappa","site.alpha","ucld.mean","ucld.stdev","skygrid.precision", sapply( seq.select[, unique(CLU_ID)], function(clu) paste("treeModel.rootHeight_CLU",clu,sep='') )), SF=0.75, W=1)
	set(tmp, tmp[,which(grepl('ucld',ID))], 'W', 5)
	for(i in seq_len(nrow(tmp)))
		beast.add.scaleOperator(bxml, tmp[i,ID], tmp[i,SF], tmp[i,W])
	#	add internal node height operators
	for(clu in seq.select[, unique(CLU_ID)])
		beast.add.uniformOperator(bxml, paste("treeModel.internalNodeHeights_CLU",clu,sep=''), weight=1)
	#	add UpDown operator on ucld.mean and root heights
	tmp		<- sapply( seq.select[, unique(CLU_ID)], function(clu) paste("treeModel.allInternalNodeHeights_CLU",clu,sep='') )
	beast.add.upDownOperator(bxml, "ucld.mean", tmp, scaleFactor=0.75, weight=1)
	#	add gmrf block operator
	beast.add.gmrfGridBlockUpdateOperator(bxml, 'skygrid', scaleFactor=2, weight=1)
	#	add branch rate category operators
	for(clu in seq.select[, unique(CLU_ID)])
	{
		beast.add.uniformIntegerOperator(bxml, paste("branchRates.categories_CLU",clu,sep=''), 1)
		beast.add.swapOperator(bxml, paste("branchRates.categories_CLU",clu,sep=''), size=1, weight=1, autoOptimize="false")
	}
	#	add screen log
	beast.add.screenLog(bxml, log.id='screenLog', posterior.idref='posterior', likelihood.idref="likelihood", prior.idref="prior", logEvery=2e4)
	#	add file log
	beast.add.fileLog(bxml, log.id='fileLog', posterior.idref='posterior', likelihood.idref="likelihood", prior.idref="prior", logEvery=2e4,
			log.fileName=paste(file.name.base, '.log',sep=''), log.overwrite='false',
			select.id=c('rootHeight', 'ucld', 'hky', 'alpha', 'skygrid','branchRatesStat'), deselect.id=c('frequencies','categories'))
	#	add tree log for every transmission cluster
	if(with.logTree)
		for(clu in seq.select[, unique(CLU_ID)])
		{
			beast.add.logTree(bxml, paste("logTree_CLU",clu,sep=''), paste("treeModel_CLU",clu,sep=''), paste("branchRates_CLU",clu,sep=''), 'posterior',
					logEvery=2e4, nexusFormat='true', sortTranslationTable='true',
					fileName=paste(file.name.base,'_CLU', clu, '.trees', sep=''), verbose=1)
		}
	bxml
}
######################################################################################
#' @title Generate multilocus XML file with HKY model and fixed starting tree
#' @description See Examples.
# import XML phytools ape data.table reshape2 ggplot2
#' @export
#' @example example/ex.beastxml.multilocus.hky.fixed.tree.R
#' @author Oliver Ratmann
beastxml.multilocus.hky.fixed.tree<- function( file.name, seq.select, phd, verbose=1)
{
	stopifnot(c("CLU_ID", "TAXON_ID", "TAXON_NAME", "SEQ", "SAMPLINGTIME")%in%names(seq.select))
	file.name.base	<- gsub('.xml','',rev(strsplit(file.name,'/')[[1]])[1])
	#	write XML file with new sequences
	bxml			<- newXMLDoc(addFinalizer=T)
	bxml.beast		<- newXMLNode("beast", doc=bxml, addFinalizer=T)
	tmp				<- newXMLCommentNode(text=paste("Generated by beastxml.multilocus.hky.fixed.tree"), parent=bxml.beast, doc=bxml, addFinalizer=T)
	#	add new taxa
	beast.add.taxa(bxml, seq.select, beast.date.direction= "forwards", beast.date.units= "years", verbose=1)
	#	add alignment for every transmission cluster
	for(clu in seq.select[, unique(CLU_ID)])
	{
		beast.add.alignment(bxml, subset(seq.select, CLU_ID==clu), df.seqfield='SEQ', beast.alignment.id=paste("alignment_CLU",clu,sep=''), beast.alignment.dataType= "nucleotide", verbose=1)
	}
	#	add starting tree for every transmission cluster
	for(clu in names(phd))
	{
		beast.add.startingtree(bxml, phd[[clu]], beast.usingDates="true", beast.newickid=paste("startingTree_CLU",clu,sep=''), beast.brlunits="years", verbose=1)
	}
	#	add CODON patterns for every alignment
	for(clu in seq.select[, unique(CLU_ID)])
	{
		beast.add.patterns(bxml, paste("patterns_CLU",clu,sep=''), paste("alignment_CLU",clu,sep=''), 1, beast.patterns.every=NA, beast.patterns.strip='false', verbose=1)
	}
	#	add tree model for every alignment
	for(clu in seq.select[, unique(CLU_ID)])
	{
		beast.add.treemodel(bxml, paste("treeModel_CLU",clu,sep=''), paste("treeModel.rootHeight_CLU",clu,sep=''), paste("treeModel.internalNodeHeights_CLU",clu,sep=''), paste("treeModel.allInternalNodeHeights_CLU",clu,sep=''), newick.id=paste("startingTree_CLU",clu,sep=''), internalNodes='true', rootNode='true', verbose=1)
	}
	#	add skygrid
	tmp					<- sapply( seq.select[, unique(CLU_ID)], function(x) paste("treeModel_CLU",x,sep='') )
	grid.height			<- max(sapply(phd, function(ph) max(node.depth.edgelength(ph))))
	beast.add.gmrfskygrid(bxml, "skygrid", "skygrid.logPopSize", "skygrid.precision", "skygrid.numGridPoints", "skygrid.cutOff", tmp,
			80, -2.396895772465287, 1.0, 0.0, grid.height, verbose=1)
	#	add uncorrelated relaxed clock for first cluster
	tmp					<- seq.select[, unique(CLU_ID)]
	clu					<- tmp[1]
	beast.add.discretizedBranchRates(bxml, paste("branchRates_CLU",clu,sep=''), paste("treeModel_CLU",clu,sep=''), paste("branchRates.categories_CLU",clu,sep=''),
			mean.id="ucld.mean", mean.value="0.0025", mean.lower="0.0", sd.id="ucld.stdev", sd.value="0.3333333333333333", sd.lower="0.0", meanInRealSpace="true",
			rateCategories.dimension= subset(seq.select, CLU_ID==clu)[, 2*(length(unique(TAXON_ID))-1)], verbose=1)
	#	link uncorrelated relaxed clock to other clusters
	for(clu in tmp[-1])
	{
		beast.add.discretizedBranchRates(bxml, paste("branchRates_CLU",clu,sep=''), paste("treeModel_CLU",clu,sep=''), paste("branchRates.categories_CLU",clu,sep=''),
				mean.idref="ucld.mean", sd.idref="ucld.stdev", meanInRealSpace="true",
				rateCategories.dimension= subset(seq.select, CLU_ID==clu)[, 2*(length(unique(TAXON_ID))-1)], verbose=1)
	}
	#	add rate statistics for all clusters
	for(clu in seq.select[, unique(CLU_ID)])
	{
		beast.add.rateStatistics(bxml, paste('branchRatesStat_CLU',clu, sep=''), paste('treeModel_CLU',clu, sep=''), paste('branchRates_CLU',clu, sep=''))
	}
	#	add HKY model for first cluster
	clu			<- seq.select[, unique(CLU_ID)][1]
	beast.add.hkyModel(bxml, paste("hky_CLU", clu,sep=""), paste("patterns_CLU",clu,sep=''), paste("hky.frequencies_CLU",clu,sep=''),
			rate.id=paste("hky.kappa",sep=""), rate.value="1.0", rate.lower="0.0",
			frequencies.dimension='4', frequencyModel.dataType='nucleotide')
	#	link HKY model to other clusters
	for(clu in seq.select[, unique(CLU_ID)][-1])
	{
		beast.add.hkyModel(bxml, paste("hky_CLU", clu,sep=""), paste("patterns_CLU",clu,sep=''), paste("hky.frequencies_CLU",clu,sep=''),
				rate.idref=paste("hky.kappa",sep=""),
				frequencies.dimension='4', frequencyModel.dataType='nucleotide')
	}
	#	add site model for first cluster
	clu			<- seq.select[, unique(CLU_ID)][1]
	beast.add.siteModel(bxml, paste("siteModel_CLU", clu,sep=""), paste("hky_CLU", clu,sep=""),
			substitutionModel.class='HKYModel', gamma.id=paste("site.alpha",sep=""), gamma.value="0.5", gamma.lower="0.0", gammaCategories="4", verbose=1)
	#	link site model to other clusters
	for(clu in seq.select[, unique(CLU_ID)][-1])
		beast.add.siteModel(bxml, paste("siteModel_CLU", clu,sep=""), paste("hky_CLU", clu,sep=""),
				substitutionModel.class='HKYModel',  gamma.idref=paste("site.alpha",sep=""), gammaCategories="4", verbose=1)
	#	add treeLikelihood
	for(clu in seq.select[, unique(CLU_ID)])
		beast.add.treeLikelihood(bxml, paste("treeLikelihood_CLU",clu,sep=""),
				paste("patterns_CLU",clu,sep=''),
				paste("treeModel_CLU",clu,sep=''),
				paste('siteModel_CLU',clu,sep=''),
				paste("branchRates_CLU",clu,sep=''), useAmbiguities='false', verbose=1)
	#	add compound likelihood
	tmp					<- as.vector(sapply(seq.select[, unique(CLU_ID)], function(clu) paste("treeLikelihood_CLU",clu,sep="") ))
	beast.add.likelihood(bxml, 'likelihood', tmp)
	#	add prior element
	beast.add.priors(bxml, 'prior')
	#	add priors	https://code.google.com/p/beast-mcmc/wiki/ParameterPriors
	beast.add.logNormalPrior(bxml, "hky.kappa", 1, 1.25, 0)		#default
	beast.add.uniformPrior(bxml, "ucld.mean", 0.0001, 0.006)
	beast.add.exponentialPrior(bxml, "ucld.stdev", 0.1, 0)
	beast.add.exponentialPrior(bxml, "site.alpha", 0.5, 0)
	beast.add.gammaPrior(bxml, "skygrid.precision", 0.001, 1000, 0)
	beast.add.gmrfSkyGridPrior(bxml, 'skygrid')
	#	add mcmc element
	beast.add.mcmc(bxml, 	mcmc.id='mcmc', posterior.id='posterior', prior.idref='prior', likelihood.idref='likelihood',
			mcmc.chainLength=20e6, mcmc.autoOptimize='true', mcmc.operatorAnalysis= paste(file.name.base, '.ops',sep=''))
	#	add operator element
	beast.add.operators(bxml, 'operators')
	#	add scale operators
	tmp					<- data.table(	ID= c("hky.kappa","site.alpha","ucld.mean","ucld.stdev","skygrid.precision"),
			SF=	0.75,
			W=	c(0.1,        0.1,                      3,           3,           0.1))
	for(i in seq_len(nrow(tmp)))
	{
		beast.add.scaleOperator(bxml, tmp[i,ID], tmp[i,SF], tmp[i,W])
	}
	#	add gmrf block operator
	beast.add.gmrfGridBlockUpdateOperator(bxml, 'skygrid', scaleFactor=2, weight=2)
	#	add branch rate category operators
	for(clu in seq.select[, unique(CLU_ID)])
	{
		beast.add.uniformIntegerOperator(bxml, paste("branchRates.categories_CLU",clu,sep=''), 10)
		beast.add.swapOperator(bxml, paste("branchRates.categories_CLU",clu,sep=''), size=1, weight=10, autoOptimize="false")
	}
	#	add screen log
	beast.add.screenLog(bxml, log.id='screenLog', posterior.idref='posterior', likelihood.idref="likelihood", prior.idref="prior", logEvery=2e4)
	#	add file log
	beast.add.fileLog(bxml, log.id='fileLog', posterior.idref='posterior', likelihood.idref="likelihood", prior.idref="prior", logEvery=2e4,
			log.fileName=paste(file.name.base, '.log',sep=''), log.overwrite='false',
			select.id=c('rootHeight', 'ucld', 'hky', 'alpha', 'skygrid','branchRatesStat'), deselect.id=c('frequencies','categories'))
	#	add tree log for every transmission cluster
	#for(clu in seq.select[, unique(CLU_ID)])
	#{
	#		beast.add.logTree(bxml, paste("logTree_CLU",clu,sep=''), paste("treeModel_CLU",clu,sep=''), paste("branchRates_CLU",clu,sep=''), 'posterior',
	#				logEvery=2e4, nexusFormat='true', sortTranslationTable='true',
	#				fileName=paste(file.name.base,'_CLU', clu, '.trees', sep=''), verbose=1)
	#}
	bxml
}
######################################################################################
#' @title Create BEAST XML file from template
#' @description See Examples.
#' @param btemplate					BEAST XML template or file name of BEAST XML template
#' @param df						Taxon data.table
#' @param start.tree				Starting tree containing the taxa in the taxon data.table (optional)
#' @param xml.taxonsets4clusters	If true, add taxon sets for each phylogenetic cluster in the taxon data.table
#' @param xml.monophyly4clusters	If true, add monophyly contraints to each phylogenetic cluster in the taxon data.table
#' @param verbose					If true, run in verbose mode
#' @example example/ex.beast.from.template.R
#' @export
#' @author Oliver Ratmann
beastxml.from.template<- function(	btemplate, df, start.tree=NULL, xml.taxonsets4clusters=1, xml.monophyly4clusters=1,  verbose=1)
{
	stopifnot(class(btemplate)[1]%in%c('character',"XMLInternalDocument"))
	if(class(btemplate)=='character')
	{
		#	load xml template file
		btemplate	<- xmlTreeParse(btemplate, useInternalNodes=TRUE, addFinalizer = TRUE)
	}
	bxml			<- newXMLDoc(addFinalizer=T)
	bxml.beast		<- newXMLNode("beast", doc=bxml, addFinalizer=T)
	newXMLCommentNode(text=paste("Generated by rBEAST from template"), parent=bxml.beast, doc=bxml, addFinalizer=T)
	#	add new taxa
	beast.add.taxa(bxml, df, beast.date.direction= "forwards", beast.date.units= "years", verbose=verbose)
	#	add alignment for every transmission cluster
	beast.add.alignment(bxml, df, beast.alignment.dataType= "nucleotide", verbose=verbose)
	#	add startingTree
	if(!is.null(start.tree))		#	add startingTree if provided
		beast.add.startingtree(bxml, start.tree, beast.newickid= "startingTree", beast.usingDates="true", beast.brlunits="years", verbose=verbose)
	#	copy everything else from template
	beast.add.from.template(bxml, btemplate, select.after.elt=NA, select.to.elt=NA, verbose=1)
	#
	#	for clusters, add taxon sets and tmrca statistics
	#
	if(xml.taxonsets4clusters)
	{
		tmp				<- length(getNodeSet(bxml, "//treeModel"))
		if(tmp==0)
			stop('Cannot find treeModel.')
		treeModel.id	<- unique(unlist(xpathApply(bxml, "//treeModel[@id]", xmlGetAttr, "id")))
		if(length(treeModel.id)>1)
			stop('More than one treeModel currently not supported.')
		taxonset.prefix	<- 'c'
		includeStem		<- "false"
		#	add taxon sets for each cluster
		beast.add.taxonsets.for.clusters(bxml, df, taxonset.prefix=taxonset.prefix)
		#	add tmrcaStatistic for each cluster
		beast.add.tmrcaStatistic.for.taxonsets(bxml,  treeModel.id=treeModel.id, taxonset.prefix=taxonset.prefix, includeStem=includeStem, verbose=verbose)
		#	add monophylyStatistic for each cluster
		if(xml.monophyly4clusters)
		{
			if(is.null(start.tree))
				warning('BEAST may have zero probabilitiy for initial topology if starting tree not specified and mononphyly constraints set')
			beast.add.monophylyStatistic.for.taxonsets(bxml, treeModel.id=treeModel.id, taxonset.prefix=taxonset.prefix)
		}
	}
	#
	#	if user-provided startingTree, reset <upgmaTree idref="startingTree"/> to <newick idref="startingTree"/>
	#
	if(!is.null(start.tree))
	{
		tmp					<- getNodeSet(bxml, "//*[@idref='startingTree']")
		if(length(tmp)==0)
			stop('Cannot find idref startingTree')
		if(length(tmp)>1)
			stop("More than one idref startingTree")
		xmlName(tmp[[1]])	<- "newick"
	}
	#
	bxml
}
olli0601/rBEAST documentation built on May 24, 2019, 12:53 p.m.