R/methods-phylo4d_ext.R

#-------------------------------------------------------
#  Methods 
#  -Are concerned with subnode stuff only.  
#   Higher-level tree modifications should be done
#  	with phylobase functions.
#-------------------------------------------------------

# Table of Contents:

## Generic Methods: (and overloads)
#

## SIMMAP Methods:
# get.nodenames -- internal; get names of nodes
# is.simmap	 --  check whether file or text string contains simmap formatted trees
# read.simmap  --  read simmap text (v1.0-1.1) to phylo4d object (with singletons)
# read.simmap.new  --  read simmap text (v1.5) to phylo4d_ext
# read.nexus.simmap  -- reads nexus file which contains simmap (all versions) trees
# expand.singles  --  experimental; expand singleton nodes into a bifurcation with one 0-length branch terminating at a dummy node
# collapse.to.singles  --  experimental;  collapse bifurcations which contain a 0-length branch connected to a leaf node into singletons
# collapse.singletons  --  remove singleton nodes from a tree
# collapse.subnodes  --  experimental;  collapse subnodes to singletons
# write.simmap (w/ two subroutines)  -- write a tree (and parts of it's data) to simmap format (vers=c(1.0,1.1,1.5))
# write.simmap.old  --  wrapper; calls write.simmap with v1.0 or v1.1 parameters
# write.simmap.new  --  wrapper; calls write.simmap with v1.5 parameters
# write.nexus.simmap  --  uses write.simmap and other internal functions to write a nexus file using write.simmap to output each tree.

## Subnode Methods:
# .edge.index  --  internal;
# nSubNodes  --  get number of subnodes
# hasSubNodes  --  does the tree have subnodes?
# hasData  --  does the tree contain any data? (is the data slot empty?)
# getSubNodeData  --  returns subnode data.frame
# getSubNodePosish  -- returns subnode positions
# getSubNodeEdgeInds  --  returns subnode edge indices
# getEmptyDataFrame  --  returns empty data frame in the same format as the data frame from the data slot
# addSubNode  --  add a subnode to a tree
# showSubNodes  --  print out a representation of the subnodes to stdout


## Generics
setGeneric("sndata", function(x, ...) { standardGeneric("sndata") })
setGeneric("sndata<-", function(x,datnames=NULL, value) { standardGeneric("sndata<-") })
setGeneric("snid", function(x, ...) { standardGeneric("snid") })
setGeneric("snposition", function(x, ...) { standardGeneric("snposition") })
setGeneric("snbranch", function(x, ...) { standardGeneric("snbranch") })
setGeneric("rmdata", function(x,index,subindex) { standardGeneric("rmdata")} )
setGeneric("weight", function(x) { standardGeneric("weight")} )
setGeneric("weight<-", function(x,value) { standardGeneric("weight<-")} )
setGeneric("hasWeight",function(x,strict=TRUE) { standardGeneric("hasWeight")} )

# subnode
setGeneric("hasSubNodes", function(x) { standardGeneric("hasSubNodes") })
setGeneric("hasData", function(x) { standardGeneric("hasData") })
setGeneric("hasDataColumn",function(x,index) { standardGeneric("hasDataColumn") })

# overloading phylobase methods:

#--------------------------
# SIMMAP Processing Methods		
#--------------------------

# internal:
# get node names from newick string
get.nodenames<-function(newick.txt)
{
	nnames = character(0)
	ttmp = newick.txt
	
	if(grepl("^.*;$",ttmp)) {
		# remove trailing semi-colon, otherwise it will mess up the regular expression matching
		ttmp = substring(ttmp,first=1,last=(nchar(ttmp)-1))
	}
	nname.pat="(\\(|,|\\))([a-zA-Z0-9'_\\.]{1,})(:|;)"
	junk = regexpr(nname.pat,ttmp)
	count = 0
	while(junk[1] != -1)
	{
		tmpname = substr(ttmp,(junk[1]+1),(junk[1]+attr(junk,"match.length")-2))
		nnames = append(nnames,tmpname)
		ttmp = sub(nname.pat,"",ttmp)	
		junk = regexpr(nname.pat,ttmp)
		count = count + 1
		stopifnot(count < 100000)
	}
	return(nnames)
}



# Check for evidence that this file contains simmap-formatted trees
# @param 'vers' is the simmap version that should be checked for
# @param 'quick' if TRUE, then the text string containing a tree is 
#		 only checked to see if the formatting looks right.  If FALSE,
#		 the method attempts to create a tree from the string.
# @param 'vers' can take the values (1.0,1.1,1.5); a meaningless result
#		 will be returned otherewise
#
is.simmap <- function(finput="",text=NULL,vers=c(1.1),quick=TRUE)
{
	if(!is.null(text)){
		# TODO: check text for newlines and split on them if they exist.
		rawtext=text
	} else {
		if(!file.exists(finput))
			stop("Assumed 'finput' was a filename and could not be fond")
		
		rawtext = scan(finput,what=character(0),strip.white=T,sep="\n")		
	}

	## TODO: split individual strings on ';' character
	
	# if there is only one string, then just check that,
	# otherwise assume the file is text from a nexus file
	if(length(rawtext)==1)
	{
		treesblock = rawtext
		treelines = 1
	} else {
		treesblock = read.nexus.block(txt=rawtext,block="trees",rm.comments=T)
		treelines = which(tolower(substr(treesblock,1,4))=="tree")
	}
	
	if(length(treesblock)==0)
	{
		warning("No trees in this file...")
		return(FALSE)
	}
	
	potentialsimmap = logical(length(treelines))
	count = 1
	for(linenumb in treelines)
	{
		# check for simmap styles
		if(vers == 1.1 || vers == 1.0)
		{
			# remove all comments from the string
			junk = gsub("\\[(.*?)\\]","",treesblock[linenumb])
			potentialsimmap[count] = grepl(":\\{.*?\\}",junk)
			if(potentialsimmap[count] && !quick)
			{
				tr = try(read.simmap(text=junk,vers=vers),silent=T)
				if(is(tr,"try-error"))
					potentialsimmap[count] = FALSE
			}
			
		} else {
			
			potentialsimmap[count] = grepl("\\[(.*?)\\]",treesblock[linenumb])
			if(potentialsimmap[count] && !quick)
			{
				junk = gsub("\\[(.*?)\\]","",treesblock[linenumb])
				
				# TODO: change this to read.simmap.new when that function is finished:
				tr = try(read.tree(text=junk),silent=T)
				if(is(tr,"try-error"))
					potentialsimmap[count] = FALSE
			}
		}
		
		count = count + 1
	}
	
	return(any(potentialsimmap))
}



# read modified newick file
# citation: 
# Bollback J. P. (2006) SIMMAP: Stochastic character mapping of 
# discrete traits on phylogenies. BMC Bioinformatics. 7:88
# 
# Assume that branch lengths are present (otherwise, why use SIMMAP?)
# Assume that the root node can only have one simmap state (or, only use the first)
#
# @param as.num - convert the data to numeric (by default, a factor data type is used)
# @param text - use text string instead of a file name
# @param vers - which version of simmap to use
# @param as.num - no idea
# @param add.root - should the tree be treated as unrooted?
#
read.simmap <- function(file="",text=NULL, vers=1.1, as.num=FALSE, add.root=TRUE, ...)
{
	
	if(is.null(text))
	{
		#stop("Need to have text for now")
		if(!file.exists(file))
			stop("Assumed 'file' was a filename and could not be fond.")
		
		text = scan(file,what=character(0),strip.white=T,sep="\n")
	}

	# clear whitespace	
	text = gsub("\\s","",text)
	
	# add semicolon to end
	if(substring(text,nchar(text))!=";")
		text = paste(text,";",sep="")
	
	if(TRUE)
	{
		# add root node and internal node names
		count = 1
		while(regexpr("):",text)[1] != -1)
		{
			text = sub( "):", paste(")Internal",sprintf("%0.7d",count),":",sep=""), text)
			count = count + 1
		}
			
		# Root
		if(add.root && regexpr(");",text)[1] != -1)
			text = sub( ");", ")Root:0;" , text)
		
	}
	
	## Poor replacement for regular expressions
	#
	edge.ind = integer(0)
	edge.state = character(0)
	edge.len = numeric(0)
	junk = strsplit(text,"")[[1]]
	sub.branches = cbind( which(junk=="{"), which(junk=="}") )
	br.lens = numeric(nrow(sub.branches))
	
	for(ii in seq(nrow(sub.branches)))
	{
		br.total = 0
		# get the internal part
		within = paste(junk[seq( sub.branches[ii,1]+1 , sub.branches[ii,2]-1 )],collapse="")
		splitchar = ifelse(vers==1.0,";",":")
		within.sub = strsplit(within,splitchar)[[1]]
		for(jj in within.sub)
		{
			slptmp = strsplit(jj,",")[[1]]
			edge.ind = append(edge.ind,ii)
			edge.state = append(edge.state,slptmp[1])
			edge.len = append(edge.len,slptmp[2])
			br.total = br.total + as.numeric(slptmp[2])
		}
		br.lens[ii] = br.total
	}

	
	# horrible way to put it back together:
	# switch to regular expressions
	#
	newick.str = paste(junk,collapse="")
	for(ii in seq(nrow(sub.branches)))
	{
		replen = diff(sub.branches[ii,])+1
		substr(newick.str, sub.branches[ii,1], sub.branches[ii,2]) <- paste(rep(" ",replen),collapse="")
		substr(newick.str, sub.branches[ii,1], sub.branches[ii,2]) <- as.character(br.lens[ii])
	}
	newick.str = gsub("\\s","",newick.str) # strip whitespace
	
	# convert to ape format:
	tr = read.tree(text=newick.str)
	
	# make singleton nodes
	#
	edge.len = as.numeric(edge.len)
	all.labels = c(tr$tip.label,tr$node.label)
	
	# all names except the root node (which ought to be named, otherwise this will break)
	#
	#nnames = head(get.nodenames(newick.str),-1)  # This should match up with edge.ind
	nnames = get.nodenames(newick.str)
	scount = 1
	nTips = length(tr$tip.label)
	nInts = length(tr$node.label)
	
	# node information
	dataVal = character(0)
	dataNode = integer(0)
		
	# loop through split branches
	for(kk in unique(edge.ind))
	{
		# this information shouldn't change:
		is.root = F
		edgeind = which(edge.ind == kk)
		trind = which(nnames[kk] == all.labels)
		esplice = which(tr$edge[,2] == trind) # which edge to chop
		anc = tr$edge[esplice,1]
		dec = tr$edge[esplice,2]
	
		# assume that this is the root (no way to check for it otherwise..)
		if( length(esplice) == 0 )
		{
			dec = trind
			is.root = T
		}
		
		# store data information:
		dataNode = append(dataNode,dec)
		dataVal = append(dataVal, edge.state[edgeind[1]])

		newsingles = length(edgeind)-1
		
		if(newsingles==0 || is.root)
			next
		
		# add new singleton nodes
		for(mm in seq(newsingles))
		{
			#cat("- adding a new single\n")
			### add new internal nodes to 'phylo' object:
			#
			tr$Nnode = tr$Nnode + 1; nInts = nInts + 1
			tr$node.label = append(tr$node.label,sprintf("Singleton%0.7d",scount))
			nodeid = nTips + nInts # should be the last one available
			tr$edge[esplice,1] = nodeid
			tr$edge = rbind(tr$edge,c(anc,nodeid))
			tr$edge.length[esplice] = edge.len[edgeind[mm]]
			tr$edge.length = append(tr$edge.length,edge.len[edgeind[(mm+1)]])
			
			# store data information:
			dataNode = append(dataNode,nodeid)
			dataVal = append(dataVal, edge.state[edgeind[(mm+1)]])
			
			
			# update info:
			esplice = length(tr$edge.length) # should be the last one
			dec = nodeid
			scount = scount + 1
		}		
	}
	

	# create phylo4d object
	new.labels = c(tr$tip.label,tr$node.label)
	if(as.num)
		dataVal = as.numeric(dataVal)
	tmpdf = data.frame("simmap_state"=dataVal, row.names=new.labels[dataNode])
	rettree = phylo4d(tr,all.data=tmpdf,missing.data="OK",rownamesAsLabels=TRUE)
	
	#write.nexus(tr,file="written.tree")
	return(rettree)
}


# helper functions for read.simmap.new (below)
#
match.order <- function(basevec,cmpvec)
{
	neworder = order(basevec)
	neworder[order(basevec)] <- order(cmpvec)
	stopifnot(all(cmpvec[neworder] == basevec))
	return(neworder)		
}


strip<-function(str,left=TRUE,right=TRUE)
{
	if(left)
		str = sub("^[ \t\n]+","",str)
	if(right)
		str = sub("[ \t\n]+$","",str)
	return(str)
}


# Read simmap v1.5 files
# ----------------------
# Notes:
# 1. all mutational (mapping) information is contained with a comment block (e.g., [&map={0,0.144863,1,0.453803,0}] ),
# 2. within this block the mutational information is contained within curly braces, { }, preceded by a map command, &map= { },
# 3. the mutational information is from the descendant node to the ancestor (i.e., tips to root assignment),
# 4. the information is STATE, LENGTH, STATE,... excluding the last length, e.g., two changes (0=>1=>0) might look like this, [&map={0,0.144863,1,0.453803,0}], with state 0 having a length of 0.144863, followed by state 1 with a length of 0.453803, with the final state 0 having a length of BRLEN - (0.144863 + 0.453803); this approach helps reduce the size of the file by not including redundant information,
# 5. if a branch has no changes then it will look like this, [&map={STATE}]LENGTH
#
# @param specialpatt indicates which variables should be treated in the SIMMAP way
#		 any variables found which are not in specialpatt are treated as containing 
#		 vector data.  By default, all variable are considered to be in the special
#		 format.
#
read.simmap.new <- function(file="",text=NULL, specialpatt=character(0), add.root=TRUE)
{
	
	str.has <- function(patt,token,lowercase=T){
		len = length(grep(sprintf("%s",patt),tolower(token)))
		if(len == 1)
			return(TRUE)
		return(FALSE)
	}
	
	if(is.null(text))
	{
		if(file.exists(file)){
			text = paste(readLines(file),collapse="")
		}else{
			stop("Could not find file ",file)
		}
	}

	# clear whitespace
	txt = gsub("\\s","",text)
	
	# add semicolon to end
	if(substring(txt,nchar(txt))!=";")
		txt = paste(txt,";",sep="")
	
	# check to see if it is actually a SIMMAP v1.5 formatted
	if(!is.simmap(text=txt,vers=1.5))
		stop("'text' does not seem to contain any simmap-formatted trees")
	
	
	# Extract comments from the tree string (which should contain the mapped data)
	comments.pos = gregexpr("\\[&.*?\\]",txt)[[1]]
	comments.len = attr(comments.pos,"match.length")
	comments = character(length(comments.pos))
	for(ii in seq(length(comments)))
	{
		comments[ii] = substr(txt,(comments.pos[ii]+2), ((comments.pos[ii] + comments.len[ii] - 2)))
	}
	
	length(comments)
	
	
	####
	# Process Trees
	# 1.) strip SIMMAP comments from the tree string
	#
	cleaned = gsub("\\[&.*?\\]","",txt)
	textstr = cleaned
	
	# 2.) add root node and internal node names
	count = 1
	while(regexpr("):",textstr)[1] != -1)
	{
		textstr = sub( "):", paste(")Internal",sprintf("%0.7d",count),":",sep=""), textstr)
		count = count + 1
	}
	
	# check if root has a value, if it does not then 
	# the program needs to know so that it can be set
	# when processing the comments (check if anc == root).
	#
	norootval=FALSE
	if(add.root && regexpr(");",textstr)[1] != -1){
		textstr = sub( ");", ")Root:0;" , textstr)
		norootval=TRUE
	}
	
	# 3.) create phylo4d object from tree string where comments have been removed (data is added next)		
	nodenames = get.nodenames(textstr)
	phy = as(read.tree(text=textstr),'phylo4d')
	#
	# End Process Trees
	####
	
	# Sanity check: (leave out root, if it did not come with an explicit state)
	#if(length(comments)!= (length(nodenames) - as.integer(norootval)) )
	#	stop("Node names does not match up with comments (and they should).")
	
	
	####
	# Process Comments:
	#
	# NOTE: Only one branch mapping is allowed at this point (TODO: fix this)
	#
	subnode.names = c("map","simmap_state",specialpatt)
	subnode.patt = sprintf("^(%s)$",paste(subnode.names,collapse="|"))
	subnode.pos = matrix(NA,nrow=0,ncol=2)
	subnode.branch = matrix(NA,nrow=0,ncol=2)
	subnode.data = character(0)
	subnode.commentid = integer(0)
	subnode.subind = integer(0)	

	phy.data = NULL
	for (kk in seq(length(comments)))
	{
		## Tokenize command comment:
		comment.tokens = strsplit(comments[kk],",")[[1]]
		
		# check if there is an assignment (shouldn't there always be?)
		ex.inds = (grepl("=",comment.tokens))
	
		# reconstitute tokens if they were broken up in comment.tokens
		curr = 1
		using = rep(TRUE,length(ex.inds))
		count = 1
		while(count <= length(ex.inds)){
			if(ex.inds[count]){
				curr=count
			} else {
				using[count] = FALSE
				comment.tokens[curr] = paste(comment.tokens[curr],comment.tokens[count],sep=",")
			}
			count = count + 1
		}
		comment.tokens = comment.tokens[using]
		## end Tokenize command comment
		
		# strip beginning / ending whitespace from each token
		# comment.tokens = strip(comment.tokens)
	
		## Process name/value pairs from comment tokens
		## NOTE: branch mapping is treated differently that other attributes
		nnames = character(0)
		vals = character(0)
		
		for(attr.str in comment.tokens)
		{
			# strip whitespace from tokens
			#
			tmp=strsplit(attr.str,"=")[[1]]
			metric.basename = strip(tmp[1])
			metric.value = strip(tmp[2])
			if(str.has("\\{",metric.value) && str.has("\\}",metric.value))
			{
				metric.value = sub("^\\{(.*?)\\}$","\\1",metric.value)
				metric.value = strip(strsplit(metric.value,",")[[1]])
			}
			
			if(!grepl(subnode.patt,metric.basename))
			{
				# process regular values
				# Expand basenames:
				if(length(metric.value) != 1)
					metric.basename = sprintf("%s_%04d",metric.basename,seq(length(metric.value)))
				
				nnames = append(nnames,metric.basename)
				vals = append(vals,metric.value)
				
			} else {
				
				# process map nodes:
				# values format: STATE,LENGTH,STATE,....
				# FROM desc to ancs (so, 1st STATE and LENGTH pair are the values for the current node)
				#
				if(length(metric.value) == 1){
					nnames = append(nnames,metric.basename)
					vals = append(vals,metric.value[1])
				} else {
					
					desc = unname(which(labels(phy)==nodenames[kk]))
					# check if desc is the root node (if it is then the ancestor is node 0 per phylobase's standard, I think).
					if(isRooted(phy)){
						anc = ifelse(rootNode(phy) == desc ,0 ,unname(ancestor(phy,desc)))
					} else {
						anc = unname(ancestor(phy,desc))
					}
					eind = .edge.index(phy,anc,desc)
					elen = unname(edgeLength(phy)[eind])
	
					# if one more length is needed:
					if( length(metric.value) %% 2 != 0 )
					{
						tonow = sum(as.numeric(metric.value[seq(2,length(metric.value),by=2)]))
						metric.value = c(metric.value,(elen-tonow))
					}
	
					
					# Separate states from lengths and make lengths fractions of BRLEN 
					# NOTE: fractions start at ANC(!) not DESC in the phyext format.  This may cause some confusion
					#
					states = rev(metric.value[seq(1,length(metric.value),by=2)])
					lens = rev(as.numeric(metric.value[seq(2,length(metric.value),by=2)]))
					lens = cumsum(lens) / elen
					
					# Assign data from DESC node
					nnames = append(nnames,metric.basename)
					vals = append(vals,tail(states,1)) # last one the DESC node state
					
					# Cache data from subnodes on this branch:
					for (subind in seq(length(states)-1))
					{
						subnode.branch = rbind(subnode.branch,c(anc,desc)) # which branch
						subnode.pos = rbind(subnode.pos,c(lens[subind],lens[subind]))  # which position on the branch
						tmpstate=states[subind]
						names(tmpstate) <- metric.basename
						subnode.data = append(subnode.data,tmpstate)  # TODO: make sure this works well
						subnode.commentid = append(subnode.commentid,kk)
						subnode.subind = append(subnode.subind, subind)
					}
				}
			}
		}
		
		stopifnot(length(nnames) == length(vals))
			
		if(is.null(phy.data))
		{
			phy.data = matrix(NA,nrow=length(nodenames),ncol=length(nnames))
			rownames(phy.data) <- nodenames
			colnames(phy.data) <- nnames
		} else {
	
			newcols = setdiff(nnames,colnames(phy.data))
			if(length(newcols) != 0){
				
				for(cname in newcols)
					phy.data = cbind(phy.data,NA)
				
				colnames(phy.data) <- c(head(colnames(phy.data),-length(newcols)),newcols)
			}
			
			unusedcols = setdiff(colnames(phy.data),nnames)
			if(length(unusedcols) != 0)
			{
				nnames <- append(nnames,unusedcols)
				vals <- append(vals,rep(NA,length(unusedcols)))
			}
		}
		
		phy.data[kk,] = vals[match.order(colnames(phy.data),nnames)]
		
	} # end kk comments loop
	
	# Guess as phy.data types:
	options(warn=-1)  # suppress warnings for this sections
	phy.data = data.frame(apply(phy.data,2,I),stringsAsFactors=FALSE)
	for(jj in seq(ncol(phy.data)))
	{
		exclude = is.na(phy.data[,jj])
		if( all(!is.na(as.numeric(phy.data[(!exclude),jj]))) )
			phy.data[,jj] = as.numeric(phy.data[,jj])
	}
	options(warn=0)
	#
	# End Process Comments
	#####
	

	###
	# Combine processed tree and processed comments:
	if(isRooted(phy)){
		phyd = addData(phy,all.data=phy.data,match.data=TRUE,rownamesAsLabels=TRUE)
	} else {
		phyd = addData(phy,all.data=phy.data,match.data=TRUE,rownamesAsLabels=TRUE,missing.data="OK")
	}
	
	if(length(subnode.data)!=0){
		# add subnode stuff:
		phyd = phyext(phyd)
		usubbranches = unique(subnode.commentid)
		
#		for(ii in seq(length(subnode.data)))
#		{
		for(ii in seq(length(usubbranches)))
		{
			cominds = which(subnode.commentid == usubbranches[ii])
			usubnodes = unique(subnode.subind[cominds])
			for(jj in seq(length(usubnodes)))
			{
				thisinds = cominds[which(subnode.subind[cominds] == usubnodes[jj])]
				tmpdat = subnode.data[thisinds]
				# sanity check:
				check = apply(subnode.pos[thisinds,,drop=FALSE],1,mean)
				stopifnot(all(check == check[1]))
				phyd = addSubNode(phyd,subnode.branch[thisinds[1],1],subnode.branch[thisinds[1],2],subnode.pos[thisinds[1]],tmpdat,pos.is.fraction=TRUE)
			}
		}
	}
	
	return(phyd)
}




# Read trees from a nexus file.  This function is only really necessary for 
# nexus files where trees have simmap formatting.  If they don't, then
# readNexus should be used instead.
#
# note: if type=="all", data is read in from the "characters" block
#		and it's rownames are assumed to match the taxa names.
#
read.nexus.simmap <- function(finput="", text=NULL, vers=NULL, type=c("all", "tree"),...)
{
	
	type <- match.arg(type)

	if (type == "all") {
		returnData <- TRUE
	} else {
		returnData <- FALSE
	}


	outtrees = NULL
	if(!is.null(text)){
		# TODO: check text for newlines and split on them if they exist.
		rawtext=text
	} else {
		if(!file.exists(finput))
			stop("Assuming finput is a file and could not find it")
		
		rawtext = scan(finput,what=character(0),strip.white=T,sep="\n")		
	}

	treesblock = read.nexus.block(txt=rawtext,block="trees",rm.comments=T)
	
	if(length(treesblock)==0)
	{
		warning("No trees in this file...")
		return(FALSE)
	}

	## TODO: split individual strings on ';' character into newlines
	
	treelines = which(tolower(substr(treesblock,1,4))=="tree")
	if(length(treelines)==0)
		return(NULL)

	
	# make sure the the translated table is used if found
	transstart = which(sapply(treesblock,function(i) tolower(i)=="translate",USE.NAMES=F))
	translated = length(transstart)!=0
	transend=integer(0)
	taxatrans=""
	index=integer(0)
	taxname=""
	
	# assume that trees are not translated otherwise:
	# these should be in order numerically.
	if(translated)
	{
		transend = seq(transstart,min(treelines)-1)[min(grep(";",treesblock[seq(transstart,min(treelines)-1)]))]
		taxatrans = treesblock[(transstart+1):transend]
		
		# should ALWAYS be the last token:
		if(any(taxatrans==";"))
			taxatrans = taxatrans[-which(taxatrans==";")]
		
		index = as.integer(sapply(strsplit(taxatrans,"\\s"),function(i) i[1]))  # I'm assuming that they are integers
		taxname = sapply(strsplit(taxatrans,"\\s"),function(i) sub("(,|;)","",i[2]))
	}

	count = 0
	for(linenumb in treelines)
	{
		
		cat("tree number",linenumb,"\n")
		# clearly, this will not work for non-simmap, non-newick trees (or simmap v1.5 strings)
		#tmpstr = tail(strsplit(treesblock[linenumb],"=")[[1]],1)
		# so, trying this instead:
		#tmpstr = sub("^tree.+?=(.+)$","\\1",treesblock[linenumb])
		
		# This might not be necessary:
		if(has.weights(text=treesblock[linenumb])){
			tmpstr = sub("\\[(.*?)\\]","",treesblock[linenumb])
		} else {
			tmpstr = treesblock[linenumb]
		}
		#tmpstr = sub("^tree.+?=(.+)$","\\1",tmpstr)
		#tmpstr = sub("^tree(.+?)=\\s{0,}\\[(.*?)\\](.+)$","\\3",tmpstr)
		#tmpstr = sub("^tree(.+?)=(.*?)(\\(.+)$","\\3",tmpstr)
		roottype = sub("^(TREE|tree)(.+?)=(.*?)(\\(.+)$","\\3",tmpstr)
		roottype = gsub("\\s","",roottype)
		tmpstr = sub("^(TREE|tree)(.+?)=(.*?)(\\(.+)$","\\4",tmpstr)
		
		addroot = !grepl("&U",roottype)
		# check for simmap style
		# remove first comment (should others be removed?
		if(is.simmap(text=tmpstr)){
			trtmp = phyext(unname(read.simmap(text=tmpstr,add.root=addroot)))
		} else if (is.simmap(text=tmpstr,vers=1.5)) {
			trtmp = phyext(unname(read.simmap.new(text=tmpstr,add.root=addroot,...)))
		} else if (is.simmap(text=tmpstr,vers=1.0)){
			trtmp = phyext(unname(read.simmap(text=tmpstr,vers=1.0,add.root=addroot)))
		} else {
			trtmp = phyext(read.tree(text=tmpstr))
		}
		
		
		if(!is(trtmp,'phylo4'))
			trtmp = as(trtmp,'phylo4')
		
		if(!inherits(trtmp,"phylo4")){
			cat("Trouble parsing line for trees:\n",treesblock[linenumb],"\n")
			stop("Wha?")
		}
		
		outtrees = append(outtrees,trtmp)
		count = count + 1
	}
	
	# if the trees have been translated, then
	# add the names back in.
	#
	if(translated)
	{
		for(treeind in seq(length(outtrees)))
		{
			tnames = as.integer(tipLabels(outtrees[[treeind]]))
			if(!any(is.na(tnames)))
			{
				tipLabels(outtrees[[treeind]]) <- taxname[tnames]
			} else {
				warning("Tree translations found, but could not be mapped")
			}
		}
	}
	
	# Find and add data:
	if(returnData && has.block(txt=rawtext,block="characters"))
	{
		data.part = read.characters2(txt=rawtext,blockname="characters")
		if(length(data.part) > 0)
		{
			for(treeind in seq(length(outtrees)))
			{
				outtrees[[treeind]] <- addData(outtrees[[treeind]],tip.data=data.part,rownamesAsLabels=TRUE)
			}
		}
	}
	
	
	return(outtrees)
}




# expand singleton nodes into bifurcating nodes with one junk node
# this function is mainly for plotting and saving files
# TODO: convert all explicit calls to @'slot' to their accessor counterpart (e.g. tree@edge.length goes to edgeLength(tree)
#
expand.singles <- function(tree,keep.data=FALSE)
{
	# note: tips should be indexed 1...N, where N is the number of tips
	if(!is(tree,"phylo4"))
		stop("tree needs to be of class phylo4")
	
	if(hasSingle(tree))
	{
		has.data = hasData(tree) && keep.data		

		# Need to add labels to trees if any are NA:
		# TODO: make sure labels are unique
		if(any( is.na(labels(tree)) ))
		{
			count = round(runif(1) * 1e6)
			for(ii in which(is.na(labels(tree))))
			{
				labels(tree)[ii] <- sprintf("Internal%07d",count)  # warning, these labels might not be unique
				count = round(runif(1) * 1e6)
			}
		}
		
		# rename 'Singleton' labels
		#labels(tree) = sub("^Singleton(.*)$","Internal\\1",labels(tree))

		tmptable=table(tree@edge[,1])
		snodes = as.integer(names(tmptable)[which(tmptable==1)])
		snodes = snodes[snodes!=0] # 0 is the root node (check on this...)
		
		nold = nrow(tree@edge)
		nnew = length(snodes)
		count = 1
		for(ii in snodes)
		{
			tree@label = append(sprintf("JUNK%0.7d",count),tree@label)
			tree@edge = rbind(tree@edge, c(ii, count))
			tree@edge.length = append(tree@edge.length,0)
			count = count + 1
		}
		
		# rearrange
		#tree@order = "unknown"
		rootind = which(tree@edge[,1] == 0)
		tree@edge[seq(nold),] = tree@edge[seq(nold),] + nnew
		tree@edge[seq(from=nold+1,to=nold+nnew),1] = tree@edge[seq(from=nold+1,to=nold+nnew),1] + nnew
		tree@edge[rootind,1] = 0
		
		# rename
		nnodes = nTips(tree) + nNodes(tree)
		names(tree@label) <- as.character(seq(1,nnodes))
		names(tree@edge.length) <- apply(tree@edge,1,paste,collapse="-")
		if(has.data)
		{
			rownames(tree@data) <- as.integer(rownames(tree@data)) + nnew
		}
	}
	
	if(!keep.data)
		tree = as(tree,"phylo4")
	
	return(tree)
}


# 
# sister function of expand.singles -> converts internal nodes with
# zero-length branches into singletons (in phylo4 format).  Will only work if
# the zero-len branch is connected to a tip
#
collapse.to.singles <- function(tree,by.name=NULL)
{
	if(is(tree,"phylo"))
		tree = as(tree,"phylo4")
	
	if(!is(tree,"phylo4"))
		stop("tree argument needs to be of class phylo4")

	# don't use the root node
	if(any(edgeLength(tree, seq(1,(nNodes(tree)+nTips(tree)))[nodeType(tree)!='root']) == 0))
	{
		# Remove zero-length branches and their tips
		zinds = which(edgeLength(tree)==0)
		torem = edges(tree)[zinds,]
		torem = torem[edges(tree)[zinds,1]!=0,][,2] # don't include root
		zinds = zinds[edges(tree)[zinds,1]!=0] 		# don't include root
		torem = torem[nodeType(tree)[torem]=="tip"]
		
		if(length(torem)!=0)
		{
			# remove excess tips
			rm.count = length(torem)
			tip.count = length(tipLabels(tree))
			total.count = length(labels(tree))
			
			tree@edge <- edges(tree)[-zinds,]  # TODO: add 'edges<-' to phylobase
			tree@edge.length = tree@edge.length[-zinds]
			tree@label = tree@label[-torem]
			
			## begin reindexing nodes
			# tips
			t.oldseq = seq(1, (tip.count))[-torem]
			t.newseq = seq(1,(tip.count-rm.count))
			t.replace.inds = which(tree@edge[,2] %in% t.oldseq)
			stopifnot(length(t.replace.inds) == length(t.oldseq))  # it should be a 1-1 relationship for tips
			tree@edge[,2] = replace(tree@edge[,2],t.replace.inds ,t.newseq)
			
			# internal
			n.oldseq = seq(tip.count+1,total.count)
			n.newseq = seq(tip.count+1-rm.count, total.count-rm.count)
			for(kk in seq(length(n.oldseq)))
			{
				replaceinds = which(tree@edge[,1] == n.oldseq[kk])
				if(length(replaceinds)!=0)
					tree@edge[replaceinds,1] = rep(n.newseq[kk],length(replaceinds))
					
				replaceinds = which(tree@edge[,2] == n.oldseq[kk])
				if(length(replaceinds)!=0)
					tree@edge[replaceinds,2] = rep(n.newseq[kk],length(replaceinds))
			}
			
			
			## rename
			names(tree@label) <- as.character(seq(1,length(labels(tree))))
			names(tree@edge.length) <- apply(tree@edge,1,paste,collapse="-")
		}
	} else {
		warning("No zero-length branches found to be removed")
	}
	return(tree)
}


# collapse singleton nodes using ape<->phylo4 conversion functions:
# TODO: make this less reliant on ape functions
#
collapse.singletons <- function(phy)
{
	rettree = phy
	
	# if singletons exist
	if(hasSingle(rettree))
	{
		if(is(rettree,"phylo4d"))
		{
			tab=table(edges(rettree)[,1])
			snodeid = as.integer(names(tab)[which(tab==1)])
			snodeid = snodeid[snodeid!=0]  # 0 is a dummy node
			
			# not sure why this is used:
			#ancs =  sapply(snodeid,function(i) which(edges(rettree)[,1] == i)) # where the subnode is the ancestor
			#decs =  sapply(snodeid,function(i) which(edges(rettree)[,2] == i)) # where the subnode is the descendant
			#newdata = data.frame(tdata(rettree,"all")[-decs,],row.names=labels(rettree)[-decs])
			newdata = tdata(rettree)[-snodeid,,drop=F]
			#colnames(newdata) <- colnames(tdata(rettree))
			
			# hack it for now:
			rettree = as(rettree,"phylo") # convert to ape
			rettree <- collapse.singles(rettree) # collapse singles
			if(is.rooted(rettree)) {
				rettree = phylo4d(rettree,all.data=newdata,rownamesAsLabels=TRUE) # create new phylo4d object
			} else {
				rettree = phylo4d(rettree,all.data=newdata,rownamesAsLabels=TRUE,missing.data="OK") # 
			}
		} else {
			rettree = as(rettree,"phylo") # convert to ape
			rettree <- collapse.singles(rettree) # collapse singles
			rettree = phylo4(rettree)
		}
	}
	
	return(rettree)
}


# turn subnodes into singletons
# x - phylobase tree
# rm.ex.data - "remove 'extra' data":
#  should data frames unique to tdat be removed (TRUE)
#  or should extra columns be added to sdat (FALSE)
#  WARNING: rm.ex.data = TRUE could lead to a loss of data (obviously)
#
collapse.subnodes <- function(x,rm.ex.data = TRUE)
{
	
	if(!inherits(x,"phylo4d_ext") || !hasSubNodes(x))
	{
		warning("No subnodes to collapse")
		return(x)
	}
	
	# Step 1: cache data from x and setup variables to hold info for the new tree
	
	## a.) Handle data:
	tdat = tdata(x)
	sdat = sndata(x)
	col.diffs = setdiff(colnames(tdat),colnames(sdat))
	if(length(col.diffs) > 0)
	{
		if(rm.ex.data){
			tdat = tdat[,-which(colnames(tdat) %in% col.diffs)]
		} else {
			tmpnames = colnames(sdat)
			sdat = cbind(sdat,matrix(NA,nrow=nrow(sdat),ncol=length(col.diffs)))
			colnames(sdat) <- c(tmpnames,col.diffs)
		}
	}
	newdat = tdat
	
	# TODO: might need to reorder columns of tdat and sdat to match up....
	
	## b.) Subnode positions
	posmeans = snposition(x)
	if(!is.matrix(posmeans))
		posmeans = matrix(posmeans,nrow=1,ncol=2)
	posmeans = apply(posmeans,1,mean)
	
	## c.) Handle edges
	el = edges(x)
	newel = el
	elens = edgeLength(x)
	newelens = elens
	snedges.inds = getSubNodeEdgeInds(x)
	u.sninds = unique(snedges.inds)
	
	## d.) Handle node ids and labels
	nids = nodeId(x) # node ids
	lastnodeid = max(nids) # base from which to create new node ids
	nodelabs = nodeLabels(x)
	
	# Step 2.) Systematically convert subnodes into new sets of edges (with singletons)
	for(ii in seq(length(u.sninds)))
	{
		# get edge and subnode indices:
		eid = u.sninds[ii]
		sninds = which(snedges.inds == eid)
		curr.anc = el[eid,1]
		curr.desc = el[eid,2]	
		elen = elens[eid]
		newnodeids = lastnodeid + seq(length(sninds))
		lastnodeid = max(newnodeids)
		tmp = sprintf("Singleton%07d",newnodeids)
		names(tmp) <- newnodeids
	
		# Create new nodes and edges	
		# positions FROM anc TO desc:
		newlens = sort(posmeans[sninds] * elen) # absolute positions along the branch
		newlens = unname(c(newlens[1],diff(newlens),elen - max(newlens))) # convert to spacing btw subnodes
		newedges = matrix(NA,ncol=2,nrow=length(newlens))
		newedges[,1] <- unname(c(curr.anc,newnodeids))
		newedges[,2] <- unname(c(newnodeids,curr.desc))
	
		tmpdat = sdat[sninds,,drop=F]
		rownames(tmpdat) <- c(tmp)
		newdat = rbind(newdat,tmpdat)
		
		# Append data:	
		newel = rbind(newel,newedges)
		newelens = c(newelens,newlens)
	
		nodelabs = c(nodelabs,tmp)
	}
	
	# Step 3.) Create new tree with new edgelist,labels, and data
	
	# remove old indices:
	newel = newel[-u.sninds,]
	newelens = newelens[-u.sninds]
	
	# create new tree:
	newphy = phylo4(newel,newelens,tipLabels(x),nodelabs)
	# add data:
	newphy = addData(newphy,all.data=newdat)
	
	return(newphy)
}



# subroutine of write.simmap:
newlabels.v1x <- function(x,usestate,splitchar,write.nas=TRUE) 
{
	
	# Note: this version of simmap only handle one state
	if(is.null(usestate))
		usestate = colnames(tdata(x))
			
	usestate = usestate[1]
	tdat = tdata(x)[,usestate,drop=F]
	snodes.present = hasSubNodes(x)
	sdat = NULL; snedges.inds = NULL
	
	if(snodes.present){
		sdat = sndata(x)[,usestate,drop=F]
		snedges.inds = apply(snbranch(x),1,function(i) .edge.index(x,i[1],i[2]))
	}
		
	es = edges(x)[,2]
	if(!isRooted(x))
		es = union(es,edges(x)[,1])  # added 3/8 to accomodate unrooted trees:
	
	elens = edgeLength(x); elens[is.na(elens)] <- 0.0
	newlenlab=character(length(es))
	
	for(ii in seq(length(es)))
	{
		nodeid=es[ii]
		if(!(ii %in% snedges.inds))
		{
			# added 3/8 to accomodate unrooted trees:
			if(is.na(elens[ii])){
				newlenlab[ii] = ""
			} else if(!is.na(tdat[nodeid,1]) || write.nas){
				newlenlab[ii] = paste("{",tdat[nodeid,1] ,",", elens[ii],"}",sep="")
			} else {
				newlenlab[ii] = as.character(elens[ii])
			}
		} else {
			
			snind = which(snedges.inds == ii)
			snlens = snposition(x)[snind,] * elens[ii]
			snstates = sdat[snind,1]
			if(!is.matrix(snlens))
				snlens = matrix(snlens,nrow=1)
			
			# reorder
			snpos = apply(snlens,1,mean)
			neword = order(snpos,decreasing=T)
			snpos = snpos[neword]
			snstates = snstates[neword]
			
			newlenlab[ii] = paste("{",tdat[nodeid,1],",",(elens[ii]-max(snpos)),sep="")
			
			if(length(snpos)>1)
				for(jj in seq(length(snpos)-1))
					newlenlab[ii] = paste(newlenlab[ii],splitchar,snstates[jj],",",(snpos[jj]-snpos[(jj+1)]),sep="")
			
			newlenlab[ii] = paste(newlenlab[ii],splitchar,tail(snstates,1),",",tail(snpos,1),"}",sep="")
			
		}
	}
	
	oldlabs = labels(x)[es]
	names(newlenlab) <- oldlabs
	oldlabs[which(is.na(oldlabs))] <- ""
	newlab = paste(oldlabs,":", newlenlab ,sep="")
	newlab[which(newlenlab=="")] <- ""  # remove any <NA> data

	return(newlab)
}


# subroutine of write.simmap:
newlabels.v15 <- function(x,usestate,splitchar)
{
	
	if(is.null(usestate))
		usestate = colnames(tdata(x))
	else {
		if(is.numeric(usestate))
		{
			usestate = colnames(tdata(x))[usestate]
		}
	}
	
	# get tip and subnode data:
	tdat = tdata(x)[,usestate,drop=F]
	snodes.present = hasSubNodes(x)
	sdat = NULL; snedges.inds = NULL
	
	if(snodes.present){
		sdat = sndata(x)[,usestate,drop=F]
		snedges.inds = apply(snbranch(x),1,function(i) .edge.index(x,i[1],i[2]))
	}
	
	es = edges(x)[,2]
	if(!isRooted(x))
		es = union(es,edges(x)[,1])  # added 3/8 to accomodate unrooted trees:

	elens = edgeLength(x); elens[is.na(elens)] <- 0.0
	newlenlab=character(length(es)) # new "length" labels for each node
	
	# Setup SIMMAP comments to hold data values
	for(ii in seq(length(es)))
	{
		nodeid=es[ii]
		
		if(!(ii %in% snedges.inds))
		{ # if there is no subnode on this branch:
			if(is.na(elens[ii])) {
				newlenlab[ii] = ""  # added to deal with unrooted trees
			} else {		
				topper = "[&"
				backer = paste("]",elens[ii],sep="")
				middle = ""
				for(colind in seq(length(usestate)))
					if(!is.na(tdat[nodeid,usestate[colind]]))
						middle = paste(middle,sprintf("%s={%s}%s",usestate[colind],tdat[nodeid,usestate[colind]],ifelse(colind==length(usestate),"",",")),sep="")
			
				if(middle==""){
					newlenlab[ii] = as.character(elens[ii])
				} else {
					newlenlab[ii] = paste(topper,middle,backer,sep="")
				}
			}
		} else {
			
			topper = "[&"
			backer = paste("]",elens[ii],sep="")
			middle = ""

			# retrieve subnode data:
			snind = which(snedges.inds == ii)
			snlens = snposition(x)[snind,] * elens[ii]
			
			for(colind in seq(length(usestate)))
			{
				tmp = ""
				snstates = sdat[snind,usestate[colind]]
				if(!is.matrix(snlens))
					snlens = matrix(snlens,nrow=1)
				
				snpos = apply(snlens,1,mean)
				neword = order(snpos,decreasing=T)
				
				snpos = snpos[neword]
				snstates = snstates[neword]
				
				# first, write the DESC state:
				tmp = paste(sprintf("%s={%s", usestate[colind],tdat[nodeid,usestate[colind]]),(elens[ii]-max(snpos)),sep=",")
				#newlenlab[ii] = paste(sprintf("[&%s={%s",usestate[colind],tdat[nodeid,usestate[colind]]),(elens[ii]-max(snpos)),sep=",")
				
				# intermediates, from DESC -> ANC
				if(length(snpos)>1)
					for(jj in seq(length(snpos)-1))
						tmp = paste(tmp,splitchar,snstates[jj],",",(snpos[jj]-snpos[(jj+1)]),sep="")
						#newlenlab[ii] = paste(newlenlab[ii],splitchar,snstates[jj],",",(snpos[jj]-snpos[(jj+1)]),sep="")
				
				# last subnode
				#newlenlab[ii] = paste(newlenlab[ii],splitchar,tail(snstates,1),ifelse(colind==length(usestate),"",","),sep="")
				tmp = paste(tmp,splitchar,tail(snstates,1),"}",sep="")
				middle = paste(middle,tmp,ifelse(colind==length(usestate),"",","),sep="")
			}
			
			if(middle==""){
				newlenlab[ii] = as.character(elens[ii])
			} else {
				newlenlab[ii] = paste(topper,middle,backer,sep="")
			}
			
		}
	}
	# End setup SIMMAP comments
	
	
	# Substitute SIMMAP comments into tree
	oldlabs = labels(x)[es]
	names(newlenlab) <- oldlabs
	oldlabs[which(is.na(oldlabs))] <- ""
	#newlab = paste(oldlabs,":{", newlenlab ,"}",sep="")
	newlab = paste(oldlabs,newlenlab,sep=":")
	newlab[which(newlenlab=="")] <- ""  # remove any <NA> data
	
	# DEBUG: at this point, oldlabs and newlabs look like they have been lined up correctly
	
	return(newlab)		
}


# write modified newick file:
# -This is kind of a hack: it basically writes subnodes and lengths to a new character label and then
#  creates a new 'phylo' class using only those labels (without edge lengths).  Then it uses
#  APEs algorithm to write those labels to a newick string.  
#
# -If any of the SIMMAP datatypes are is.na, then they are left out!  TODO:  Look into changing this in the future
#
write.simmap <- function(x,usestate=NULL,file="",vers=1.1,...)
{
	splitchar = ifelse(vers==1.0,";",":")
	
	if(is.null(usestate))
		usestate = colnames(tdata(x))
	
	#if(hasSubNodes(x))
	if(hasData(x) && hasDataColumn(x,usestate))
	{
		newlab=NULL
		if(vers == 1.5){
			newlab <- newlabels.v15(x,usestate,",")
		} else {
			newlab <- newlabels.v1x(x,usestate,splitchar)
		}
		
		# reorder:
		es = edges(x)[,2]
		if(!isRooted(x))
			es = union(es,edges(x)[,1])  # added 3/8 to accomodate unrooted trees:
	
		altorder = order(es)
		newlab = newlab[altorder]
		ntype = nodeType(x)[es[altorder]]
		phy = as(x,'phylo')
		newedges = edges(x)
		newphy = list(edge=newedges,
						tip.label=newlab[which(ntype=="tip")],
						node.label=newlab[which(ntype!="tip")],
						Nnode=length(which(ntype!="tip"))
					)
		
		class(newphy) <- "phylo"
		
		################################################################
		# borrowed code from APE (write.tree.R):
		#
		output.tree.names=FALSE
		append = FALSE
		digits = 10
		brl <- !is.null(newphy$edge.length)
		nodelab <- !is.null(newphy$node.label)
		f.d <- paste("%.", digits, "g", sep = "")
		
		## Helper functions
		cp <- function(s) STRING <<- paste(STRING, s, sep = "")

		add.internal <- function(i) {
		    cp("(")
		    br <- which(newphy$edge[, 1] == i)
		    for (j in br) {
		        desc <- newphy$edge[j, 2]
		        if (desc > n) add.internal(desc)
		        else add.terminal(j)
		        if (j != br[length(br)])  cp(",")
		    }
		    cp(")")
		    if (nodelab) cp(newphy$node.label[i - n])
		    if (brl) {
		        cp(":")
		        cp(sprintf(f.d, newphy$edge.length[which(newphy$edge[, 2] == i)]))
		    }
		}
		
		add.terminal <- function(i) {
		    cp(newphy$tip.label[newphy$edge[i, 2]])
		    if (brl) {
		        cp(":")
		        cp(sprintf(f.d, newphy$edge.length[i]))
		    }
		}
		## End helper functions

		n <- length(newphy$tip.label)
		STRING <- if (output.tree.names) paste("ERROR!", "(", sep = "") else "("
		br <- which(newphy$edge[, 1] == n + 1)
		for (j in br) {
		    desc <- newphy$edge[j, 2]
		    if (desc > n) add.internal(desc)
		    else add.terminal(j)
		    if (j != br[length(br)]) cp(",")
		}
		if (is.null(newphy$root.edge)) {
		    cp(")")
		    if (nodelab) cp(newphy$node.label[1])
		    cp(";")
		} else {
		    cp(")")
		    if (nodelab) cp(newphy$node.label[1])
		    cp(":")
		    cp(sprintf(f.d, newphy$root.edge))
		    cp(";")
		}
		if (file == "") return(STRING)
	    cat(STRING, file = file, append = append, sep = "\n")
	    
		################################################################		
	} else {
		phy = as(x,'phylo')
		write.tree(phy,file,...)
	}
}

	
# write simmap version 1.1 strings
#
write.simmap.old <- function(x,usestate=NULL,file="",...)
{
	write.simmap(x,usestate,file=file,vers=1.1,...)
}

# write simmap version 1.5 strings
#
write.simmap.new <- function(x,usestate=NULL,file="",...)
{
	write.simmap(x,usestate,file=file,vers=1.5,...)
}



# This is the main write function for phylo4d_ext
# Mainly ripped from APE write.nexus function
#
# @param obj a list or single phylo4d_ext objects
# @param file the file to write to
# @param translate should the tree names be moved to a special section
# @param vers which SIMMAP version should be used (can be vers=c(1.1, 1.0, 1.5))
# @param usestates if NULL, then all data columns are used
# @param dtype for compatibility with older simmap versions
#  	   anything in 'usestate' will be written into simmap comments
#	   anything left over that is compatible with dtype will be written as 
#	   a character.
#				
#
write.nexus.simmap <- function(obj, file = "", translate = TRUE, dtype=c(noneData(),contData(),discData()), ...)
{
	
	if(!is.list(obj))
	{
		if(!is(obj,"phylo4d_ext"))
			stop("This function is only made to work with phylo4d_ext objects. Use write.nexus() instead.")
		
		obj <- list(obj)
	} else {
		if(!all(sapply(obj,is,'phylo4d_ext')))
			stop("This function is only made to work with phylo4d_ext objects or lists of such. Use write.nexus() instead.")
	}
	ntree <- length(obj)

	dtype = match.arg(dtype)
	dat = NULL
	if(hasData(obj[[1]]))
		dat = tdata(obj[[1]],"tip")
	
    cat("#NEXUS\n", file = file)
    cat(paste("[R-package APE, ", date(), "]\n\n", sep = ""),file = file, append = TRUE)
	
    #N <- length(obj[[1]]$tip.label)
	N <- length(tipLabels(obj[[1]]))

        cat("BEGIN TAXA;\n", file = file, append = TRUE)
        cat(paste("\tDIMENSIONS NTAX = ", N, ";\n", sep = ""),file = file, append = TRUE)
        cat("\tTAXLABELS\n", file = file, append = TRUE)
        cat(paste("\t\t", tipLabels(obj[[1]]), sep = ""),sep = "\n", file = file, append = TRUE)
        cat("\t;\n", file = file, append = TRUE)
        cat("END;\n", file = file, append = TRUE)
   	
    cat("BEGIN TREES;\n", file = file, append = TRUE)
    if (translate) {
        ## We take arbitrarily the labels of the first tree, and
        ## translate them as "1", "2", "3", ...
        cat("\tTRANSLATE\n", file = file, append = TRUE)
        tmp <- checkLabel(tipLabels(obj[[1]]))
        X <- paste("\t\t", 1:N, "\t", tmp, ",", sep = "")
        ## We remove the last comma:
        X[length(X)] <- gsub(",", "", X[length(X)])
        cat(X, file = file, append = TRUE, sep = "\n")
        cat("\t;\n", file = file, append = TRUE)
        token <- as.character(1:N)
        names(token) <- tipLabels(obj[[1]])
        tipLabels(obj[[1]]) <- token
        if (ntree > 1) {
            for (i in 2:ntree)
                tipLabels(obj[[i]]) <- token[tipLabels(obj[[i]])]
            class(obj) <- NULL
        }
    } else {
        for (i in 1:ntree)
          tipLabels(obj[[i]]) <- checkLabel(tipLabels(obj[[i]]))
    }
    for (i in 1:ntree) {
	    tprefix = "\tTREE * UNTITLED"
	    #weights
	    if(hasWeight(obj[[i]])){
		    tprefix = sprintf("%s [&W %f]",tprefix,weight(obj[[i]]))
	    }
	    #is rooted
        if (isRooted(obj[[i]])){
        	#cat("\tTREE * UNTITLED = [&R] ", file = file, append = TRUE)
        	tprefix = paste(tprefix," = [&R] ",sep="")
    	}else{
	    	#cat("\tTREE * UNTITLED = [&U] ", file = file, append = TRUE)
	    	tprefix = paste(tprefix," = [&U] ",sep="")
		}
		cat(tprefix, file = file, append = TRUE)
		
		# NOTE (2/13) - write.simmap does this step now, and better:
		#if(is.null(usestates))
		#	usestates = colnames(tdata(obj[[i]]))
		
        cat(write.simmap(obj[[i]], file="",...),"\n", sep = "", file = file, append = TRUE)
    }
    cat("END;\n", file = file, append = TRUE)

	
	# begin characters block write:
	if( dtype != noneData() && !is.null(dat) )
	{
		datnames = colnames(dat)

		treeargs <- list(...)
		if( "usestate" %in% names(treeargs) ){
			
			exclude = integer(0)
			if(!is.character(treeargs$usestate)) {
				exclude = as.integer(treeargs$usestate)
			} else {
				exclude = which(datnames == treeargs$usestate)
			}
			
			if(length(exclude) > 0)
				dat = dat[,-exclude,drop=FALSE]
		}
		
		dat = dat[,which(guess.datatype(dat) == dtype),drop=FALSE]
		
		if(ncol(dat) != 0)
		{
			cat( write.characters2(dat,dtype=dtype), sep = "\n", file= file, append=TRUE)
		} else {
			warning("Could not find any suitable data to write.")
		}
	}
 	# end characters block write

	
}








#------------------------------------------
# Modifying phylo4d-extension objects
# 
#------------------------------------------

# Internal:
# NOTE: labels are case-sensitive
# NOTE: this function needs to deal with NAs
.edge.index <- function(tree,anc,dec)
{
	if(!is(tree,'phylo4'))
		stop("Tree needs to inherit from class phylo4")
	
	# convert from characters to indices
	if(is.character(c(anc,dec))){
		
		if(is.na(anc)){
			anc = which(is.na(labels(tree)))
		} else {
			anc = which(labels(tree) == anc)
		}
		
		if(is.na(dec)){
			dec = which(is.na(labels(tree)))
		} else {
			dec = which(labels(tree) == dec)
		}
		
	}
			
	if(is.na(anc) || is.na(dec) || is.null(anc) || is.null(dec))
		stop("Must specify valid ancestor and decendent: ",anc,"-",dec)
	
	#eind = which(edges(tree)[,1] == anc & edges(tree)[,2] == dec)
	eind = which(edges(tree)[,1] %in% anc & edges(tree)[,2] %in% dec)

	if(length(eind) != 1){
		warning("No connection between ",anc," and ",dec,"\n")
		return(NA)
	}
	
	return(eind)
}


nSubNodes <- function(x)
{
	return (length(x@subnode.id))
}

setMethod("hasSubNodes", signature(x="list"),
  function(x) {
	return(sapply(x,hasSubNodes))
})


setMethod("hasSubNodes", signature(x="phylo4d_ext"),
  function(x) {
	return(nSubNodes(x)!=0)
})


setMethod("hasSubNodes", signature(x="phylo4"),
  function(x) {
	return(FALSE)
})

setMethod("hasSubNodes", signature(x="phylo"),
  function(x) {
	return(FALSE)
})

setMethod("hasData", signature(x="phylo"),
  function(x) {
	return(FALSE)
})

setMethod("hasData", signature(x="phylo4"),
  function(x) {
	return(FALSE)
})

setMethod("hasData", signature(x="phylo4d"),
  function(x) {
	return(ncol(tdata(x))!=0)
})

setMethod("hasData", signature(x="list"),
	function(x) {
		return(sapply(x,hasData))
})

setMethod("hasDataColumn", signature("phylo",index="ANY"),
	function(x,index){
	return(FALSE)
})

setMethod("hasDataColumn", signature("phylo4",index="ANY"),
	function(x,index){
	return(FALSE)
})

setMethod("hasDataColumn", signature("phylo4d",index="numeric"),
	function(x,index){
	retval = FALSE
	if( hasData(x) && length(index) != 0 && all(index <= ncol(tdata(x))) )
	{
		retval=TRUE
	}
	return(retval)
})

setMethod("hasDataColumn", signature("phylo4d",index="character"),
	function(x,index){

	uinds = unique(index)
	ind = which(colnames(tdata(x)) %in% uinds)
	
	if(length(ind) != length(uinds))
	{
		return(FALSE)
	} else {
		return(hasDataColumn(x,ind))
	}
})



getSubNodeData <- function(x,colname)
{
	if(missing(colname))
		return(x@subnode.data)
	
	return(x@subnode.data[colname])
}

getSubNodePosish <- function(x)
{
	return(x@subnode.pos)
}


getSubNodeEdgeInds <- function(x)
{
	edgeinds = integer(nSubNodes(x))
	if(length(edgeinds)!=0)
	{
		for(ii in seq(nrow(x@subnode.branch)))
			edgeinds[ii] = .edge.index(x,x@subnode.branch[ii,1],x@subnode.branch[ii,2])
	}
	return (edgeinds)
}

# return empty data.frame styled like
# @data slot
getEmptyDataFrame <- function(x,...)
{
	tmpdf = data.frame(x@data[0,],...)
	colnames(tmpdf) <- colnames(tdata(x))
	return(tmpdf)
}


# TODO: overload tdata here
# 
addSubNode <- function(x,anc,dec,position,dataf,pos.is.fraction=FALSE)
{
	
	if(!is(x,'phylo4d_ext')){
		warning("Converting x to an extended phylo4d object")
		x = phyext(x)
	}
	
	if(missing(dataf))
		stop("dataf needs to contain something")

	eind = .edge.index(x,anc,dec)
	if(is.na(eind))
		stop("Failure to find edge from ",anc, " to ",dec,"\n")
	
	elen = edgeLength(x)[eind]
	if(elen == 0)
		stop("Cannot place subnode on a zero length branch")
	
	if(position > ifelse(pos.is.fraction,1,elen))
		stop("Position: ",position,", is greater than allowed value of ",ifelse(pos.is.fraction,1,elen),"\n")
	
	# TODO: check for overlapping subnodes
	# TODO: check to see if every col in data.frame has 
	#		an associated col in tdata() AND that col is  
	#		defined for internal and tip nodes

	# construct data frame:
	newdf = getEmptyDataFrame(x)
	if(is.data.frame(dataf))
	{
		if( all(names(dataf) == names(newdf)) ){
			newdf = rbind(newdf, dataf)
		} else {
			# only use cols which exist in tdata(x)
			colinds = which(names(dataf) %in% names(newdf)) 
			if( length(colinds) == 0 )
				stop("Could not find any common columns between dataf and tdata(x)")
			
			newdf = merge(newdf,dataf[,colinds,drop=F],all=T)
		}
		
	} else {
		


		newdf = getEmptyDataFrame(x,stringsAsFactors=FALSE)  # ??

		# special case (if lengths are the same, then assume that they are in order):
		if( length(dataf) == ncol(newdf) && is.null(names(dataf)) )
			names(dataf) <- names(newdf)

		newdf[1,] <- rep(NA,ncol(newdf))
		# try to match up names
		ndf = names(dataf)
		count = 1
		
		for(nm in ndf){
			nmind = which(names(newdf)==nm)
			if(!is.null(nmind) && length(nmind) != 0)
			{
				if(is.factor(newdf[,nmind])  && !(dataf[count] %in% levels(newdf[,nmind])) )
				{
					# need to add more levels if this column is a factor
					newlevs = c(levels(newdf[,nmind]),unname(dataf[count]))
					levels(newdf[,nmind]) <- newlevs
					levels(tdata(x)[,nmind]) <- newlevs	# TODO: TEST THIS!
				}
				newdf[1,nmind] = dataf[count]
			}
			count = count + 1
		}

		
	}
	
	if(!pos.is.fraction) 
		position = position / elen
	
	x@subnode.id = append(x@subnode.id, as.integer(nTips(x) + nNodes(x) + nSubNodes(x) + 1))
	x@subnode.data = rbind(x@subnode.data, newdf)
	x@subnode.branch = rbind(x@subnode.branch, edges(x)[eind,])
	x@subnode.pos = rbind(x@subnode.pos, rep(position,2))
	
	return (x)
}


#
showSubNodes <- function(x)
{
	if(hasSubNodes(x))
	{
		charlen = 80
		brchar = "-"
		terminalchar = "-"
		regchar="0"
		overlapchar = "*"
		# use them all by default:
		
		einds = apply(x@subnode.branch,1,function(i) .edge.index(x,i[1],i[2]))
		
		for(eind in unique(einds))
		{
			snid = which(einds == eind)
			anc = x@subnode.branch[snid,1]
			dec = x@subnode.branch[snid,2]
			
			# setup output string
			tmpstr = rep(brchar,charlen)
			tmpstr[1] = terminalchar
			tmpstr[charlen] = terminalchar
			
			# get relative positions of nodes:
			elen = edgeLength(x)[eind]
			breaksize = elen / charlen
			snpos = x@subnode.pos[snid,] * elen 
			if(!is.matrix(snpos)) 
				snpos = matrix(snpos,nrow=1)
			
			snmeans = apply(snpos,1,mean)
			
			for(xx in seq(length(snid)))
			{
				nbreaks = max(1, floor(diff(snpos[xx,]) / breaksize) )
				from = floor(snpos[xx,] / breaksize)
				tmpstr[seq(from,length.out=nbreaks)][tmpstr[seq(from,length.out=nbreaks)]==regchar] = overlapchar
				tmpstr[seq(from,length.out=nbreaks)][tmpstr[seq(from,length.out=nbreaks)]==brchar] = regchar
				#tmpstr[seq(from,length.out=nbreaks)] = rep(regchar,nbreaks)
			}
			
			# collapse and print:
			tmpstr = paste(tmpstr,collapse="")
			if(length(snid)==1){
				names(tmpstr) <- sprintf("Subnode at ~%0.2f on branch: '%s' to '%s' (brlen=%0.2f)",snpos[1]+diff(snpos[1,])/2,labels(x)[anc],labels(x)[dec],elen)
			} else {
				names(tmpstr) <- sprintf("%d Subnodes on branch: %d to %d (bren=%0.2f)",length(snid),anc[1],dec[1],elen)
			}
			print(tmpstr)
			cat("\n")
		}
		print("0 indicates a subnode; * indicates 2+ subnodes overlapping. Positions are relative.")
	}
}



#-----------------------------
# Set Generics
#-----------------------------

# show
setMethod("show","phylo4d_ext", function(object){ printphylo4(object); showSubNodes(object)})

setMethod("snid", signature(x="phylo4d_ext"),
  function(x) {
	return(x@subnode.id)
})

setMethod("snid", signature(x="list"),
	function(x) {
		return(x[[1]]@subnode.id)
})

setMethod("sndata", signature(x="phylo4d_ext"),
  function(x) {
	return(x@subnode.data)
})

setMethod("sndata", signature(x="list"),
  function(x) {
	return(x[[1]]@subnode.data)
})


# this adds data
# TODO: Change 'adds' behavior to 'replaces'
setReplaceMethod("sndata", signature(x="phylo4d_ext"),
  function(x,datnames=NULL,value) {
	
	if(!is.data.frame(value) && (length(value) != nSubNodes(x)))
	{
	  warning("Can only add vectors of data if they are the same length as the number of subnodes")
	  return(x)
	}
	if(!is.data.frame(value)){
		value = data.frame(value)
	}
	if(!is.null(datnames))
	{
		if(is.character(datnames) && length(datnames)==ncol(value))
		{
			names(value) <- datnames
		} else {
			warning("Not using specified names")
		}
	}
	x@subnode.data = cbind(x@subnode.data,value)
	return(x)
})


setReplaceMethod("sndata",signature(x="list"),
	function(x,datnames=NULL,value) {
		for(ii in seq(length(x)))
			sndata(x[[ii]],datnames=datnames) <- value
			
	return(x)
})


# begin rmdata:
# 'rmdata' method removes data columns from the 'data' and 'subnode.data' slots
#

setMethod("rmdata", signature(x="phylo4d_ext",index="numeric",subindex="numeric"),
  function(x,index,subindex) {
	
	if(length(index)>0 && abs(index) <= ncol(tdata(x)) && index != 0)
	{
		x@data = x@data[,-index,drop=F]
	} else {
		warning("The data column to be removed could not be found in tdata(x)")
	}

	if( hasSubNodes(x) && length(subindex) >0 && abs(subindex) <= ncol(sndata(x)) && subindex != 0)
	{
		x@subnode.data = x@subnode.data[,-index,drop=F]
	}
	
	return(x)
})
	

setMethod("rmdata", signature(x="phylo4d_ext",index="numeric",subindex="missing"),
  function(x,index) {
	return( rmdata(x,index=index,subindex=index) )
})

setMethod("rmdata", signature(x="phylo4d_ext",index="character",subindex="missing"),
  function(x,index) {
	ind = which(colnames(tdata(x)) %in% index)
	if(hasSubNodes(x)){
		subind = which(colnames(sndata(x)) %in% index)
	} else {
		subind = numeric(0)
	}
	return( rmdata(x,index=ind,subindex=subind) )
})


setMethod("rmdata", signature(x="phylo4d",index="numeric",subindex="missing"),
  function(x,index) {
	  if(length(index)>0 && abs(index) <= ncol(tdata(x)) && index != 0)
		x@data = x@data[,-index,drop=F]
	
	return(x)
})

setMethod("rmdata", signature(x="phylo4d",index="character",subindex="missing"),
  function(x,index) {
	inds = which(names(tdata(x)) %in% index)
	return(rmdata(x,inds))
})


setMethod("rmdata", signature(x="list",index="ANY",subindex="missing"),
	function(x,index) {
		x = sapply(x,rmdata,index)
	return(x)
})
# end rmdata


setMethod("snposition", signature(x="phylo4d_ext"),
  function(x) {
	return(x@subnode.pos)
})

setMethod("snposition", signature(x="list"),
  function(x) {
	return(x[[1]]@subnode.pos)
})

setMethod("snbranch", signature(x="phylo4d_ext"),
  function(x) {
	return(x@subnode.branch)
})

setMethod("snbranch", signature(x="list"),
  function(x) {
	return(x[[1]]@subnode.branch)
})


#---------------
## WEIGHT:
#
#---------------
setMethod("weight", signature(x="phylo4d_ext"),
  function(x) {
	return(x@weight)
})

setMethod("weight", signature(x="list"),
	function(x) {
		if(hasWeight(x))
			return( sapply(x,weight) )
		
		return(numeric(0))
})

setReplaceMethod("weight", signature(x="phylo4d_ext"),
  function(x,value) {
	x@weight = value
	return(x)
})

setReplaceMethod("weight",signature(x="list"),
	function(x,value) {
	
	if(length(x) != length(value))
		stop("Replacement values need to be same length as the list")
	
	for(ii in seq(length(x)))
		weight(x[[ii]]) <- value[ii]
	
	return(x)
})

setMethod("hasWeight",signature(x="phylo4d_ext"),
	function(x,strict=TRUE){
		return( length(x@weight)!=0 )
})


setMethod("hasWeight",signature(x="list"),
	function(x,strict=TRUE){
		retbool=ifelse(strict,TRUE,FALSE)
		for(ii in seq(length(x))){
			tmpbool = hasWeight(x[[ii]])
			if(strict && !tmpbool)
				return(FALSE)
			
			if(!strict && tmpbool)
				return(TRUE)
			
		}
		return(retbool)
})

# 
# setMethod("addData", signature(x="phylo4d_ext"),
# 	function(x,...,snode.data=NULL) {
# 		
# 		oldcols = names(tdata(x))
# 		# Add data the normal way:
# 		x = getMethod("addData","phylo4d")(x,...)
# 		
# 		newcols = setdiff(names(tdata(x)),names(sndata(x)))
# 		supercols = setdiff(oldcols,names(tdata(x)))
# 		
# 		cat("newcols: ", newcols,"\n")
# 		
# 		# Add data to subedges:
# 		if(is.null(snode.data))
# 		{
# 			if(length(newcols)!=0)
# 			{
# 				newdat = data.frame(matrix(NA,nrow=nSubNodes(x),ncol=length(newcols)))
# 				names(newdat) <- newcols
# 				x@subnode.data = cbind(x@subnode.data,newdat)
# 			}
# 			
# 			# Remove superfluous columns
# 			if(length(supercols)!=0)
# 			{
# 				for(scol in supercols)
# 					x = rmdata(x,scol)
# 			}
# 		}else{
# 			# add this data:
# 			if(length(newcols)!=0)
# 			{
# 				print("adding this way")
# 				sndata(x,datnames=newcols) <- snode.data
# 			}
# 		}
# 		
# 		return(x)
# })
# 
# 

Try the phyext package in your browser

Any scripts or data that you put into this service are public.

phyext documentation built on May 2, 2019, 5:19 p.m.