treestructure: Detect Population Structure Within Phylogenetic Trees

Documented in plot.TreeStructure treestructure.test trestruct

#~ Treestructure
#~     Copyright (C) 2019  Erik Volz
#~     This program is free software: you can redistribute it and/or modify
#~     it under the terms of the GNU General Public License as published by
#~     the Free Software Foundation, either version 3 of the License, or
#~     (at your option) any later version.


.get_anc <- function(edge, u ){
	edge[ edge[,2]==u ,1]
}
.get_dgtrs <- function(edge, a){
	edge[ edge[,1] == a , 2 ]
}

.get_rootnode <- function(edge){
	setdiff( as.vector(edge), edge[,2] )
}

cocluster_accuracy <- function( x, y ){
# Statistic which measures overlap between equal length factors x and y
 n <- length(x) 
 a <- matrix( FALSE, nrow =	n , ncol = n  )
 b <- matrix( FALSE, nrow =	n , ncol = n  )
 sx <- split( 1:n, x )
 sy <- split( 1:n, y )
 for ( z in sx )
   a[z,z] <- TRUE
 for ( z in sy )
   b[z,z] <- TRUE
  
  (sum( a == b ) - n ) / (n^2-n)
}




# DISSIMILARITY FUNCTIONS
# test statistic comparing two internals  

# req ancestors 
.ispara <- function(u, v){
	(v %in% ancestors[[u]]) | (u %in% ancestors[[v]])
}


# requres tre 
.ismonomono_cl <- function( cl1, cl2){
	t1 <- intersect( 1:(ape::Ntip(tre)), cl1 )
	t2 <- intersect( 1:(ape::Ntip(tre)), cl2 )
	u <- ape::getMRCA(tre,  t1 )
	v <- ape::getMRCA(tre,  t2 )
	if ( is.null( u ) | is.null(v) ) 
	  return(FALSE)
	!.ispara( u, v)
}

# req nhs 
.rank.sum <- function( uinternals, vinternals ) {
	x =rbind( 
	 cbind(  nhs[ uinternals ], 1 )
	 , cbind(  nhs[ vinternals ], 0 )
	)
	x <- as.vector( x[ order(x[,1] ) , 2] )
	sum( x * (1:length(x)) )
}

# test stat null distribution 
# E_i : event type (cf paper). 
# 1: as wiuf and donelly; u is mono relative to v; mono/mono or mono/para
# 2: mono / mono 
# 3: mono/mono OR mono/para OR para/mono
# requres nhs , tre , inodes 
.uv.diss <- function(uset, vset, nsim , Ei = NA, returnabs=TRUE, detailedOut = FALSE){
	# 1 sample u
	# 0 co 
	# -1 sample v 
	
	if (is.na( Ei)){
		#nested = ifelse( .ismonomono( u, v), 1, 0 )
		Ei = ifelse( .ismonomono_cl( uset, vset ), 2, 3 )
	}
	
	uinternals <-  intersect( uset, inodes)
	vinternals <- setdiff(  intersect( vset, inodes), uinternals )
	utips <- intersect( uset, 1:n)
	vtips <- intersect( vset, 1:n)
	if ( (length( utips )==0) | (length(vtips) == 0) ){
		if ( detailedOut )
			return( list( statistic = 0 , null = c() , Z = x) )
		else
			return( 0 )
	}
	x <- rbind( 
	 cbind(    nhs[ uinternals ], 0 )
	 , cbind(  nhs[ vinternals ], 0 )
	 , cbind(  nhs[ utips ], 1 )
	 , cbind(  nhs[ vtips ], -1 )
	)
	x <- as.vector( x[ order(x[,1] ) , 2] )
	nd <- Cuv_ranksum_nulldist(x, nsim, Ei  )
	
	rsuv <- .rank.sum( uinternals, vinternals)
	
	m_nd <- mean(nd)
	sd_nd <- stats::sd(nd)
	if ( returnabs )
		x = ( abs( (rsuv - m_nd) / sd_nd ) )
	else
		x = ( (rsuv - m_nd) / sd_nd )
	
	if ( detailedOut ){
		return( list ( statistic = unname( rsuv ), null = nd , Z = x )  )
	}
	return ( x )
	
	if (FALSE){ # alternative empirical measure: 
		s <- sum(rsuv > nd )
		p <- min( s / nsim , (nsim - s) / nsim )
		
		p <- sum(rsuv > nd ) / nsim 
		abs( qnorm( p )  )	
	}
}


#


.tredat <- function ( tre ){
	n <- ape::Ntip( tre )
	nnode <- ape::Nnode(tre )
	rootnode <- .get_rootnode( tre$edge )
	inodes = iin <- (n+1):(n+nnode)
	
	D <- ape::node.depth.edgelength( tre )
	rh <- max( D[1:n] )
	sts <- D[1:n]
	shs <- rh - sts 
	nhs = nodeheights <- rh - D
	
	poedges <- tre$edge[ ape::postorder( tre ), ]
	preedges <- tre$edge[ rev( ape::postorder( tre )), ]

	# PRECOMPUTE for each node 
	# descendants; note also counts mrca node 
	descendants <- lapply( 1:(n+nnode), function(u) u )
	for (ie in 1:nrow(poedges)){
		a <- poedges[ie,1]
		u <- poedges[ie,2]
		descendants[[a]] <- c( descendants[[a]], descendants[[u]] )
	}
	
	
	# descendant tips
	descendantTips <- lapply( 1:(n+nnode), function(u) c() )
	for (u in 1:n)
	  descendantTips[[u]] <- u 
	for (ie in 1:nrow(poedges)){
		a <- poedges[ie,1]
		u <- poedges[ie,2]
		descendantTips[[a]] <- c( descendantTips[[a]], descendantTips[[u]] )
	}
	
	
	# descendant internal
	descInternal <- lapply( 1:(n+nnode), function(u) c() )
	for (u in (n+1):(n+nnode))
	  descInternal[[u]] <- u 
	for (ie in 1:nrow(poedges)){
		a <- poedges[ie,1]
		u <- poedges[ie,2]
		descInternal[[a]] <- c( descInternal[[a]], descInternal[[u]] )
	}

	# ancestors
	ancestors <- lapply( 1:(n+nnode), function(u) c() )
	for (ie in 1:nrow(preedges)){
		a <- preedges[ie,1]
		u <- preedges[ie,2]
		ancestors[[u]] <- c( ancestors[[a]], a)
	}

	# number tips desc 
	ndesc <- sapply( 1:(n+nnode), function(u) length( descendantTips[[u]] ) )
	
	# max dist to tip 
	maxDistToTip <- rep( 0, n + nnode )
	for (ie in 1:nrow(poedges)){
		a <- poedges[ie,1]
		u <- poedges[ie,2]
		maxDistToTip[a] <- max( maxDistToTip[a] , maxDistToTip[u] + 1)
	}
	
	# vector heights of all descending 
	descHeights <- lapply( 1:(n+nnode), function(u) nhs[ descendants[[u]] ] )
	descInternalHeights <- lapply( 1:(n+nnode), function(u) nhs[ descInternal[[u]] ] )

	# time range of each clade 
	timerange <- t( sapply( 1:(n+nnode), function(u) range( descHeights[[u]]  ) ))

	# vector heights of tips descending
	descendantTipHeights <- lapply( 1:(n+nnode), function(u) nhs[ descendantTips[[u]] ] ) 

	prenodes <- unique( preedges[,1] )

	dgtrMat <- matrix( NA, nrow = ape::Nnode(tre)+ape::Ntip(tre), ncol =2)
	for (ie in 1:nrow(poedges))
	{
		if (is.na( dgtrMat[ poedges[ie,1] , 1] ) )
		  dgtrMat[ poedges[ie,1], 1] <- poedges[ie,2]
		else 
		  dgtrMat[ poedges[ie,1], 2] <- poedges[ie,2]
	}
	
	# map node to branch length connecting to ancestor (useful for filtering multifurcations)
	node2edgelength <- rep(NA, n + nnode ) 
	node2edgelength[ tre$edge[,2] ] <-  tre$edge.length 

	list (n = n , nnode = nnode , rootnode = rootnode, inodes = inodes 
	  , D = D, rh = rh, sts = sts, shs = shs , nhs = nhs 
	  , poedges = poedges, preedges = preedges, descendants = descendants
	  , descendantTips = descendantTips , descInternal = descInternal
	  , ancestors = ancestors
	  , ndesc = ndesc 
	  , descHeights = descHeights
	  , descInternalHeights = descInternalHeights 
	  , timerange = timerange 
	  , descendantTipHeights = descendantTipHeights 
	  , prenodes = prenodes 
	  , dgtrMat = dgtrMat 
	  , maxDistToTip = maxDistToTip
	  , node2edgelength = node2edgelength 
	  , tre = tre 
	)
}

#' Test the hypothesis that two clades within a tree were generated by the same coalescent process. 
#'
#' @param tre An ape::phylo tree, must be binary and rooted 
#' @param x A character vector of tip labels or numeric node numbers. If numeric, can include internal node numbers.  
#' @param y as x, but must be disjoint with x 
#' @param nsim Number of simulations (larger = slower and more accurate)
#' @export 
treestructure.test <- function( tre, x, y, nsim = 1e4 )
{
	stopifnot( ape::is.rooted(tre))
	stopifnot( ape::is.binary(tre))
	stopifnot( length( intersect( x, y )) == 0 )
	if ( any(is.na(tre$tip.label)) | anyDuplicated(tre$tip.label)){
		message('Tree has NA or duplicated tip labels. Adding a unique id.')
		tre$tip.label <- paste0( paste0( 1:(ape::Ntip(tre)), '_' ), tre$tip.label )
	}
	tredat = .tredat( tre )
	attach( tredat ) 
	
	if ( is.numeric( x ))
		uset = x 
	else 
		uset = match( x, tre$tip.label )
	if ( is.numeric(y))
		vset = y 
	else 
		vset = match( y, tre$tip.label )
	
	if ( any( is.na( uset ) ) | any (is.na( vset )) )
		stop ( 'Some tip labels in x or y could not be matched tre$tip.label. Check inputs.' )
	
	Uset = unique( c(uset,  do.call( c, ancestors[uset] ) ) )
	Uset <- setdiff( Uset, ancestors[[ ape::getMRCA( tre, uset ) ]] )
	Vset = unique( c( vset, do.call( c, ancestors[vset] )) )
	Vset = setdiff( Vset , ancestors[[ ape::getMRCA( tre, vset ) ]] )
	uvd = .uv.diss(Uset, Vset, nsim = nsim, returnabs=FALSE, detailedOut = TRUE)
	
	res = structure( list( 
	  statistic = uvd$statistic 
	  , p.value = with (uvd,  min( mean(statistic < null), mean( statistic> null) ) )
	  , estimate = with (uvd,  mean( statistic> null))
	  , std.err = sd ( uvd$nd )
	  , conf.int =  quantile( uvd$null, c(0.025 , 0.975) ) 
	  , null.value = median( uvd$null )
	  , alternative='Alternative hypothesis: Rank sum differs from coalescent distribution'
	  , method = 'Two tailed simulation quantiles'
	  , data.name = 'tre'
	  , data = tre 
	  , nsim = nsim 
	)
	, class = 'treestructure.htest' 
	)
	
	detach( tredat )
	res$null = uvd$null 
	invisible(res)
}


#' @export 
print.treestructure.htest <- function(x, ... ){
	stopifnot( inherits( x, 'treestructure.htest' ))
	#~ 		One Sample t-test

	#~ data:  rnorm(10)
	#~ t = -0.01619, df = 9, p-value = 0.9874
	#~ alternative hypothesis: true mean is not equal to 0
	#~ 95 percent confidence interval:
	#~  -0.6089170  0.6002629
	#~ sample estimates:
	#~    mean of x 
	#~ -0.004327037 

	cat( '     Treestructure rank-sum test\n'  )
	cat( '     \n'  )
	cat( 'data: '  )
	print( x$data )
	if ( x$p.value > 0 )
		cat( sprintf( 'Rank sum = %g, p-value = %g\n', x$statistic, x$p.value ))
	else
		cat( sprintf( 'Rank sum = %g, p-value < %g\n', x$statistic, 1/x$nsim ))
	cat( x$alternative ); cat('\n' )
	cat( '95 percent confidence interval:\n')
	cat( sprintf( '  %g  %g\n', x$conf.int[1], x$conf.int[2] ))
invisible(x)
}


#' Detect cryptic population structure in time trees 
#'
#' @details 
#' Estimates a partition of a time-scaled tree by contrasting coalescent patterns. 
#' The algorithm is premised on a Kingman coalescent null hypothesis for the ordering of node heights when contrasting two clades, and a test statistic is formulated based on the rank sum of node times in the tree.
#' If node support values are available (as computed by bootstrap procedures), the method can optionally exclude designation of structure on poorly supported nodes. 
#' The method will not designate structure on nodes with zero branch length relative to their immediate ancestor. 
#' The significance level for detecting significant partitions of the tree can be provided, or a range of values can be examined. The CH index (\url{https://en.wikipedia.org/wiki/Calinski-Harabasz_index}) based on within- and between-cluster variance in node heights can be used to select a significance level if none is provided. 
#' 
#' @section References:
#' E.M. Volz, Wiuf, C., Grad, Y., Frost, S., Dennis, A., Didelot, X.D.  (2020) Identification of hidden population structure in time-scaled phylogenies.
#'
#' @author Erik M Volz <erik.volz@gmail.com>
#' 
#' @param tre A tree of type ape::phylo. Must be rooted. If the tree has multifurcations, it will be converted to a binary tree before processing.  
#' @param minCladeSize All clusters within partition must have at least this many tips. 
#' @param minOverlap Threshold time overlap required to find splits in a clade  
#' @param nodeSupportValues Node support values such as produced by bootrap or Bayesian credibility scores. Must be logical or vector with length equal to number of internal nodes in the tree. If numeric, these values should be between 0 and 100.   
#' @param nodeSupportThreshold Threshold node support value between 0 and 100. Nodes with support lower than this threshold will not be tested.  
#' @param nsim Number of simulations for computing null distribution of test statistics 
#' @param level Significance level for finding new split within a set of tips. Can also be NULL, in which case the optimal level is found according to the CH index
#' @param ncpu If >1 will compute statistics in parallel using multiple CPUs 
#' @param verbosity If > 0 will print information about progress of the algorithm 
#' @param debugLevel If > 0 will produce additional data in return value 
#' @param levellb If optimising the `level` parameter, this is the lower bound for the search 
#' @param levelub If optimising the `level` parameter, this is the upper bound for the search 
#' @param res If optimising the `level` parameter, this is the number of values to test
#' @return A TreeStructure object which includes cluster and partition assignment for each tip of the tree. 
#' @examples 
#' tree <- ape::rcoal(50)
#' struct <-  trestruct( tree )
#' 
#' @export 
trestruct <- function( tre, minCladeSize = 25, minOverlap = -Inf, nodeSupportValues = FALSE, nodeSupportThreshold = 95, nsim = 1e4, level = .01, ncpu = 1, verbosity = 1, debugLevel=0
	, levellb = 1e-3, levelub = 1e-1, res = 11)
{
	stopifnot( ape::is.rooted(tre))
	# stopifnot( ape::is.binary(tre))  
	if ( minOverlap >= minCladeSize){
		stop('*minOverlap* should be < *minCladeSize*.')
	}
	if ( any(is.na(tre$tip.label)) | anyDuplicated(tre$tip.label)){
		if ( verbosity > 0 )
			cat('Tree has NA or duplicated tip labels. Adding a unique id.\n')
		else
			message('Tree has NA or duplicated tip labels. Adding a unique id.')
		tre$tip.label <- paste0( paste0( 1:(ape::Ntip(tre)), '_' ), tre$tip.label )
	}
	
	useNodeSupport <- FALSE 
	if (!is.logical(nodeSupportValues) & is.vector(nodeSupportValues)) {
		# rewriting node.label here because if tree is multifurcating, support values would need to apply to most ancestral node in a multifurcation
		stopifnot( length(nodeSupportValues) == ape::Nnode( tre ) )
		stopifnot( is.numeric( nodeSupportValues ) )
		stopifnot( all( nodeSupportValues >= 0 ) )
		stopifnot( all( nodeSupportValues <= 100 ) )
		useNodeSupport <- TRUE 
		tre$node.label <- as.character( nodeSupportValues )
		nodeSupportValues <- TRUE 
	}
	if (!is.binary(tre)){# must be done after node labels are finalised 
		tre <- multi2di( tre, random=FALSE) # maintain node order in case node labels present 
	}
	if (is.logical(nodeSupportValues) & length(nodeSupportValues)==1){
		if ( nodeSupportValues ) {
			# stop('Not Implemented: Tree parsing node support values' ) 
			nodeSupportValues <- tre$node.label 
			if ( is.null( nodeSupportValues ) )
				stop('*tre* must have node labels with numeric node support values.')
			nodeSupportValues <- as.numeric( nodeSupportValues )
			if ( all( is.na( nodeSupportValues)))
				stop('Failure to parse tree node labels as node support values. These should be provided as numbers between 0 and 100.')
			nodeSupportValues[ is.na( nodeSupportValues ) ] <- 0 
			if ( max( nodeSupportValues ) < 1 )
				nodeSupportValues <- round( 100 * nodeSupportValues )
			useNodeSupport <- TRUE 
			nodeSupportValues <- c( rep(NA, ape::Ntip(tre)), nodeSupportValues )
		}
	} 
	if ( is.logical(nodeSupportValues) & length(nodeSupportValues)!=1)
		stop('Failure to parse node support. Must be a single logical or a numeric vector with length equal to number of internal nodes in the tree. If numeric, these values should be between 0 and 100. If logical, node support should be included in labeled nodes of the tree.')
	
	tredat = .tredat( tre )
	stopifnot( is.null(level) | is.numeric(level) )
	if( !is.null( level ) & is.numeric(level))
		return( .trestruct( tre, minCladeSize, minOverlap , nodeSupportValues , nodeSupportThreshold , nsim , level[1] , ncpu , verbosity , debugLevel
		, useNodeSupport, tredat) )
	if (is.null(level))
	{
		levels <- seq( levellb, levelub, length.out=res )
		tss <- lapply( levels, function(l)
		{
			message( paste('Running treestructure with significance level', round(l,3) ))
			message('')
			.trestruct( tre, minCladeSize, minOverlap , nodeSupportValues , nodeSupportThreshold , nsim , l, ncpu , verbosity , debugLevel, useNodeSupport, tredat)
		})
		chs <- sapply( tss, .ch )
		chdf <- data.frame( level = levels, CH = chs, optimal= '' )
		chdf$optimal[ which.max( chs ) ] <- '***' 
		ts <-  tss[[ which.max(chs) ]] 
		ts$chdf <- chdf 
		return(ts)
	}
}

.trestruct <- function( tre, minCladeSize = 25, minOverlap = -Inf, nodeSupportValues = FALSE, nodeSupportThreshold = 95, nsim = 1e3, level = .01, ncpu = 1, verbosity = 1, debugLevel=0 
	, useNodeSupport, tredat)
{
	attach( tredat ) 
	
	.ismonomono <- function( u, v){
		# NOTE if u is direct decendant of v or vice versa than the two tip sets are mono/mono related 
		if ( utils::tail(ancestors[[u]],1) == v){
			return(TRUE)
		}
		if ( utils::tail(ancestors[[v]],1) == u){
			return(TRUE)
		}
		!((v %in% ancestors[[u]]) | (u %in% ancestors[[v]] ))
	}
	
	# DISSIMILARITY FUNCTIONS
	# do clades u and v overlap in time? 
	.au.overlap <- function(a,u) {
		a_range <- range( nhs[ setdiff( descendants[[a]], descendants[[u]] ) ] )
		nu <- sum( (descInternalHeights[[u]]  > a_range[1]) & (descInternalHeights[[u]]  <= a_range[2]))
		( nu > minOverlap )
	}
	.overlap <- function( uset, vset){
		uhs <- nhs[ intersect( uset, inodes) ]
		vhs <- nhs[ intersect(vset, inodes) ]
		nu <- sum( (uhs > min(vhs)) & (uhs < max(vhs)))
		nv <- sum( (vhs > min(uhs)) & (vhs < max(uhs)))
		rv = ( min(nu,nv) > minOverlap )
		rv
	}
	# /DISSIMILARITY FUNCTIONS 

	# FIND OUTLIERS
	debugdf = NULL 
	zstar  <- stats::qnorm( 1-min(1,level)/2 )
	node2nodeset <- descendants 
	shouldDig <- rep(FALSE, n + nnode )
	shouldDig[ rootnode ] <- TRUE 

	# compute z score for u descendened from claderoot 
	.calc.z <- function(u, v, Ei = 1, returnabs = TRUE){ 
		if ( u <= n ) 
		  return(0)
		if ( v <= n ) 
		  return(0)
		if ( ndesc[u] < minCladeSize )
		  return(0 )
		if ( ndesc[v] < minCladeSize )
		  return(0 )
		if ( u==v )
		  return(0)
		if ( is.na( node2edgelength[ u ] ) )
			return(0)
		if ( node2edgelength[ u ] == 0 )
			return(0)
		if ( useNodeSupport ){
			if (!is.na( nodeSupportValues[u] ) ){
				if ( nodeSupportValues[u] < nodeSupportThreshold ){
					return(0)
				}
			}
		}
		if (is.na( Ei)){
			#nested = ifelse( .ismonomono( u, v), 1, 0 )
			if ( all( node2nodeset[[u]] %in% node2nodeset[[v]] ))
			{
				nsu <- node2nodeset[[u]]
				nsv <- setdiff( node2nodeset[[v]] , node2nodeset[[u]] )
			} else if ( all( node2nodeset[[v]] %in% node2nodeset[[u]]) ){
				nsv <- node2nodeset[[v]]
				nsu <- setdiff( node2nodeset[[u]] , node2nodeset[[v]] )
			} else{
					nsu <-  setdiff( node2nodeset[[u]], node2nodeset[[v]] )
					nsv <- setdiff( node2nodeset[[v]], node2nodeset[[u]])
			}
			Ei = ifelse( .ismonomono_cl(nsu, nsv ), 2,1 )
		}
		if (Ei==1) {
			if (!.au.overlap(v, u) ){
				return(0)
			}
		} else {
			if (!.overlap( node2nodeset[[u]], node2nodeset[[v]]) ){
				return(0)
			}
		}
		if (Ei==1){ # u under v
			nsu <-   intersect(  node2nodeset[[u]], node2nodeset[[v]] )
			nsv <- setdiff( node2nodeset[[v]], node2nodeset[[u]])
			vtips <- intersect( nsv, 1:(ape::Ntip(tre)))
			utips <- intersect( nsu, 1:(ape::Ntip(tre)))
			if (length( vtips ) < minCladeSize)
			  return(0)
			if (length( utips ) < minCladeSize)
			  return(0)
			# v clade must include tips not in u
			if ( length( vtips ) == 0){
				return( 0 )
			}
			return( .uv.diss( nsu , nsv, nsim = nsim , Ei=Ei, returnabs = returnabs ) ) 
		} else{
			nsu <- setdiff( node2nodeset[[u]], node2nodeset[[v]] )
			nsv <- setdiff( node2nodeset[[v]], node2nodeset[[u]])
			.uv.diss( nsu, nsv, nsim = nsim  , Ei=Ei , returnabs = returnabs)
		}
	}
	# find biggest outlier descend from a not counting rest of tree 
	.find.biggest.outlier <- function(a, Ei = 1 ){
		zs <- if ( ncpu  > 1 ){
			unlist( parallel::mclapply(  node2nodeset[[a]], function(u) .calc.z( u, a, Ei = Ei ) 
			 , mc.cores = ncpu ) )
		} else{
			sapply( node2nodeset[[a]], function(u)  .calc.z( u, a, Ei = Ei ) )
		}
		wm <- which.max( zs )
		ustar <- node2nodeset[[a]][ wm ]
		if ( zs[wm] <= zstar )
		  return(NULL)
		ustar 
	}
	# return ustar,  new exclude[a] 
	.dig.clade <- function(a){
		u <- .find.biggest.outlier( a )
		if (is.null(u)) 
		  return(NULL)
		list( ustar = u
		  , newnodeset_a = setdiff( node2nodeset[[a]], descendants[[u]] )
		)
	}

	.init.nodeset <- function(u, digging){
		setdiff( descendants[[u]], do.call( c, lapply( digging, function(v) node2nodeset[[v]] ) ) )
	}
	
	if ( debugLevel > 0 ){
		# store z for each node compared to root 
		debugdf = data.frame( z = sapply( node2nodeset[[rootnode]], function(u)  .calc.z( u, rootnode, Ei =1 , returnabs = FALSE) )
		 , cladeSize = sapply( node2nodeset[[rootnode]], function(u)  ndesc[u] )
		 , node = node2nodeset[[rootnode]]
		 )
	}

	node2cl <- c()
	clusterlist <- list()
	while( any(shouldDig) ){
		todig <- which( shouldDig )
		for (a in todig ){
			dc = .dig.clade( a )  # returns only u with highest z  
			if ( is.null( dc )){
				shouldDig[a] <- FALSE
				if ( length( node2nodeset[[a]] ) > 0 ){
					#  test  if tips in nodeset before adding to clusterlist 
					ans <- node2nodeset[[a]] 
					atips <- setdiff( ans , 1:(ape::Ntip(tre)))
					if ( length( atips ) > 0 ){
						no <- length( clusterlist)
						clusterlist[[ no + 1 ]] <- node2nodeset[[a]] 
						node2cl <- c( node2cl, a )
					}
				}
			}else{
				node2nodeset[[dc$ustar]] <- .init.nodeset( dc$ustar, setdiff(union( todig, node2cl ), a)  )
				if ( length(node2nodeset[[dc$ustar]]) > 0)
					shouldDig[ dc$ustar ] <- TRUE 
				node2nodeset[[a]] <- dc$newnodeset_a
			}
		}
		if (verbosity > 0 ){
			cat(paste( 'Finding splits under nodes:',   paste(todig, collapse = ' ') , '\n') )
		}
	}
	# /OUTLIERS

	names( clusterlist ) <- node2cl 
	no <- length(clusterlist)
	x <- setdiff( c( 1:n, inodes), do.call( c, clusterlist )) # remainder 
	if (length(x) > 0){
		clusterlist[[no + 1]] <- x
	}
	
	
	
	# DISTANCE
	nc <- length( clusterlist )
	D <- matrix( NA, nrow = nc, ncol = nc )
	diag(D) <- 0
	# 1) fill in D where ancestry/size/overlap req's are met 
	if ( nc  > 1){
		for (iu in 1:(nc-1)){
			for (iv in (iu+1):nc){
				# overlap
				if ( .overlap( clusterlist[[iu]] , clusterlist[[iv]] )) # 
				{
					D[iu,iv] <- .uv.diss ( clusterlist[[iu]] , clusterlist[[iv]], nsim = nsim,  Ei = 3)	
					D[iv, iu] <- D[iu, iv] 
				}
			}
		} 
	}
	# 2) impute any missing 
	# impute and remaining missingness using weighted mean; weights based on tree distance 
	.D <- D 
	.D[ is.na(D) | is.infinite(D)] <- 0  #
	rownames(.D)  = colnames(.D) <- 1:nc
	D <- stats::as.dist(.D)
	# /DISTANCE 



	# RETURN
	# for each tip: 
	stats::setNames( rep(NA, ape::Ntip(tre)), tre$tip.label) -> clustervec 
	for (k in 1:nc){
		cl <- clusterlist[[k]]
		itip <- intersect( 1:(ape::Ntip(tre)), cl)
		clustervec[ itip ] <- k 
	}
	
	if ( nc  >  1){
		h <- ape::as.phylo( stats::hclust( D ) ) 
		#~ 	h$edge.length[ h$edge.length <= zstar ] <- 0
		partinds <- stats::cutree( stats::hclust( stats::as.dist( ape::cophenetic.phylo( h ) ) ), h = zstar)
		partition <- as.factor( stats::setNames( partinds[ clustervec ] , tre$tip.label ) )
		clustering <- as.factor( clustervec )
		clusters <- split( tre$tip.label, clustervec )
		partitionSets <- split( tre$tip.label, partition )
		
	} else{
		clustering <- stats::setNames(  as.factor( rep(1, n )), tre$tip.label )
		partition <- stats::setNames( as.factor( rep(1, n )), tre$tip.label )
		clusters <- list( tre$tip.label )
		partitionSets <- list( tre$tip.label )
		remainderClade <- NULL 
	}
	
	rv = list(
	  clustering = clustering 
	  , partition = partition 
	  , clusterSets = clusters
	  , partitionSets = partitionSets
	  , D = D 
	  , clusterList = clusterlist 
	  , tree = tre
	  , level = level
	  , zstar = zstar 
	  , cluster_mrca  = node2cl 
	  , call = match.call() 
	  , data =  data.frame( taxon = tre$tip.label
		   # , cluster = clustering
		   , cluster = clustering
		   , partition = partition
	     , row.names = 1:ape::Ntip(tre)
	     , stringsAsFactors=FALSE
		)
	  , debugdf = debugdf 
	)
	class(rv) <- 'TreeStructure'
	detach( tredat )
	rv
}


.ch <- function(trstr)
{
	cldf <- trstr$data # .computeclusters( tre, node2z, zth, rescale=FALSE )
	ints <- node.depth.edgelength( trstr$tree )[(ape::Ntip(trstr$tree)+1):(ape::Ntip(trstr$tree)+ape::Nnode(trstr$tree))] # internalnode times
	clnts <- lapply( split( cldf, cldf$cluster), function(d){
		tr1 <- ape::keep.tip( trstr$tree, d$taxon )
		ints1 <- node.depth.edgelength( tr1 )[(ape::Ntip(tr1)+1):(ape::Ntip(tr1)+ape::Nnode(tr1))] # internalnode times
	})
	cln <- sapply( clnts , length ) 
	clmeans <- sapply( clnts, mean )
	oz <- mean( ints )
	bcss <- sum( cln * (clmeans - oz )^2 )
	wcss <- sapply( clnts, function(x) mean( (x-mean(x))^2 ) ) |> sum() 
	n <- sum( cln )
	k <- length( clnts ) 
	(bcss/(k-1)) / (wcss/(n-k))
}

.plot.TreeStructure.ggtree <- function(x, ... ){
	stopifnot(  'ggtree' %in% utils::installed.packages()[,1] ) 
	stopifnot( inherits( x, 'TreeStructure') )
	tre <- x$tree 
	d <- x$data 
	d$shape <-  rep('circle', ape::Ntip(tre))
	
	tre <- ggtree::groupOTU( tre, x$clusterSets) 
	pl <- ggtree::`%<+%`( ggtree::ggtree( tre, ggplot2::aes_(color=~group), ... ) ,  d  )
	pl <- pl +  ggtree::geom_tippoint(ggplot2::aes_( color=~partition, shape=~shape, show.legend=TRUE), size = 2 )	
	pl
}

#' Plot TreeStructure tree with cluster and partition variables 
#' @param x  A TreeStructure object 
#' @param use_ggtree Toggle ggtree or ape plotting behaviour 
#' @param ... Additional arguments passed to ggtree or ape::plot.phylo
#' @export 
plot.TreeStructure <- function(x, use_ggtree = TRUE , ... )
{
	stopifnot( inherits( x, 'TreeStructure') )
	if ( use_ggtree ){
		return( .plot.TreeStructure.ggtree (x , ... ) )
	} else{
		tr <- x$tree 
		tr$tip.label <- as.character( x$partition  )
		ape::plot.phylo( tr , ... ) 
		ape::nodelabels( '', node = x$cluster_mrca , pch = 8, cex = 3, frame = 'none') 
		ape::tiplabels( '', pch = 15, col   =  as.vector( x$partition ) , frame='none')
	}
	
	invisible(x)
}


#' @export 
print.TreeStructure <- function(x, rows = 0, ...)
{
	stopifnot( inherits( x, 'TreeStructure') )
	
	npart <- nlevels( x$partition)
	nc <- nlevels( x$clustering )
	tre <- x$tree 
	cat( 'Call: \r\n' )
	print( x$call )
	cat('\r\n')
	cat ( paste( 'Significance level:', x$level, '\n' ) )
	
	cat ( paste( 'Number of clusters:', nc , '\n') )
	cat ( paste( 'Number of partitions:', npart, '\n' ) )
	
	cat( 'Number of taxa in each cluster:\n' )
	print( table( x$clustering) )
	cat( 'Number of taxa in each partition:\n' )
	print( table( x$partition ))
	
	if ( rows > 0 ){
		cat ('Cluster and partition assignment: \n')
		print( x$data[1:min(nrow(x$data),rows), ] )
	}

	if (!is.null( x$chdf ))
	{
		cat('\r\n')
		cat( ' CH index ' )
		cat('\r\n')
		print( x$chdf )
	}
	
	cat( '...\r\n') 
	cat( 'For complete data, use `as.data.frame(...)` \n' )
	
	invisible(x)
}

#' @export 
as.data.frame.TreeStructure <- function(x, row.names=NULL, optional=FALSE, ...){
	stopifnot( inherits( x, 'TreeStructure') )
	if ( !is.null(row.names))
		rownames(x$data) <- row.names
	x$data
}

#' @export 
coef.TreeStructure <- function(object, ... )
{
	stopifnot( inherits( object, 'TreeStructure') )
	object$partition
}
emvolz-phylodynamics/treestructure documentation built on March 1, 2025, 3:47 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
emvolz-phylodynamics/treestructure
Detect Population Structure Within Phylogenetic Trees

R/m13.R
In emvolz-phylodynamics/treestructure: Detect Population Structure Within Phylogenetic Trees

Defines functions coef.TreeStructure as.data.frame.TreeStructure print.TreeStructure plot.TreeStructure .plot.TreeStructure.ggtree .ch .trestruct trestruct print.treestructure.htest treestructure.test .uv.diss .rank.sum .ismonomono_cl .ispara cocluster_accuracy .get_rootnode .get_dgtrs .get_anc

Documented in plot.TreeStructure treestructure.test trestruct

R Package Documentation

Browse R Packages

We want your feedback!

emvolz-phylodynamics/treestructure Detect Population Structure Within Phylogenetic Trees

R/m13.R In emvolz-phylodynamics/treestructure: Detect Population Structure Within Phylogenetic Trees

Defines functions coef.TreeStructure as.data.frame.TreeStructure print.TreeStructure plot.TreeStructure .plot.TreeStructure.ggtree .ch .trestruct trestruct print.treestructure.htest treestructure.test .uv.diss .rank.sum .ismonomono_cl .ispara cocluster_accuracy .get_rootnode .get_dgtrs .get_anc

Documented in plot.TreeStructure treestructure.test trestruct

R Package Documentation

Browse R Packages

We want your feedback!

emvolz-phylodynamics/treestructure
Detect Population Structure Within Phylogenetic Trees

R/m13.R
In emvolz-phylodynamics/treestructure: Detect Population Structure Within Phylogenetic Trees