R/IdClusters.R
In DECIPHER: Tools for curating, analyzing, and manipulating biological sequences

Documented in IdClusters

# below function modified from stats package
to.dendrogram <- function(object, states=NULL) {
	
	z <- list()
	oHgts <- object$lengths
	oHgt <- object$height
	nMerge <- length(oHgt)
	if (nMerge != nrow(object$merge))
		stop("'merge' and 'height' do not fit!")
	hMax <- oHgt[nMerge]
	
	count <- 1L
	one <- 1L
	two <- 2L
	for (k in seq_len(nMerge)) {
		x <- as.integer(object$merge[k, ])
		neg <- x < 0
		if (all(neg)) { # two leaves
			zk <- as.list(-x)
			attr(zk, "members") <- two
			objlabels <- object$labels[-x]
			attr(zk[[1L]], "label") <- objlabels[1L]
			attr(zk[[2L]], "label") <- objlabels[2L]
			attr(zk[[1L]], "members") <- attr(zk[[2L]], "members") <- one
			attr(zk[[1L]], "height") <- oHgt[k] - oHgts[k, 1]
			attr(zk[[2L]], "height") <- oHgt[k] - oHgts[k, 2]
			attr(zk[[1L]], "leaf") <- attr(zk[[2L]], "leaf") <- TRUE
		} else if (any(neg)) { # one leaf, one node
			X <- as.character(x)
			isL <- x[1L] < 0 # is leaf left?
			zk <-
			if (isL) {
				list(-x[1L], z[[X[2L]]])
			} else {
				list(z[[X[1L]]], -x[2L])
			}
			attr(zk, "members") <- attr(z[[X[1 + isL]]], "members") + one
			attr(zk[[2 - isL]], "members") <- one
			attr(zk[[2 - isL]], "height") <- oHgt[k] - oHgts[k, 2 - isL]
			attr(zk[[2 - isL]], "label") <- object$labels[-x[2 - isL]]
			attr(zk[[2 - isL]], "leaf") <- TRUE
			attr(zk[[1 + isL]], "state") <- states[count]
			count <- count + 1L
		} else { # two nodes
			x <- as.character(x)
			zk <- list(z[[x[1L]]], z[[x[2L]]])
			attr(zk, "members") <- attr(z[[x[1L]]], "members") + attr(z[[x[2L]]], "members")
			attr(zk[[1L]], "state") <- states[count]
			count <- count + 1L
			attr(zk[[2L]], "state") <- states[count]
			count <- count + 1L
		}
		attr(zk, "height") <- oHgt[k]
		k <- as.character(k)
		z[[k]] <- zk
	}
	z <- z[[k]]
	attr(z, "state") <- states[count]
	class(z) <- "dendrogram"
	z
}

.collapse <- function(dend, collapse, dim) {
	# initialize a stack of maximum length (dim)
	stack <- vector("list", dim)
	visit <- logical(dim) # node already visited
	parent <- integer(dim) # index of parent node
	index <- integer(dim) # index in parent node
	pos <- 1L # current position in the stack
	stack[[pos]] <- dend
	while (pos > 0L) { # more nodes to visit
		if (visit[pos]) { # ascending tree
			visit[pos] <- FALSE # reset visit
			
			if (!is.leaf(stack[[pos]][[1]])) {
				h1 <- attr(stack[[pos]][[1]], "height")
			} else {
				h1 <- -Inf
			}
			if (!is.leaf(stack[[pos]][[2]])) {
				h2 <- attr(stack[[pos]][[2]], "height")
			} else {
				h2 <- -Inf
			}
			
			h <- attr(stack[[pos]], "height")
			
			if ((h - h1) <= collapse || (h - h2) <= collapse) {
				# make multifurcating
				m1 <- attr(stack[[pos]][[1]], "members")
				m2 <- attr(stack[[pos]][[2]], "members")
				states <- c(attr(stack[[pos]][[1]], "state"),
					attr(stack[[pos]][[2]], "state"))
				m <- m1 + m2
				if ((h - h1) <= collapse && (h - h2) <= collapse) {
					l1 <- length(stack[[pos]][[1]])
					l2 <- length(stack[[pos]][[2]])
					x <- vector("list", l1 + l2)
					for (i in seq_len(l1))
						x[i] <- stack[[pos]][[1]][i]
					for (i in seq_len(l2))
						x[i + l1] <- stack[[pos]][[2]][i]
				} else if ((h - h1) <= collapse) {
					l <- length(stack[[pos]][[1]])
					x <- vector("list", l + 1)
					for (i in seq_len(l))
						x[i] <- stack[[pos]][[1]][i]
					x[l + 1] <- stack[[pos]][-1]
				} else if ((h - h2) <= collapse) {
					l <- length(stack[[pos]][[2]])
					x <- vector("list", l + 1)
					x[1] <- stack[[pos]][-2]
					for (i in seq_len(l))
						x[i + 1] <- stack[[pos]][[2]][i]
				}
				stack[[pos]] <- x
				
				attr(stack[[pos]], "height") <- h
				attr(stack[[pos]], "members") <- m
				attr(stack[[pos]], "state") <- unique(states)
				
				class(stack[[pos]]) <- "dendrogram"
			}
			
			# replace self in parent
			if (parent[pos] > 0)
				stack[[parent[pos]]][[index[pos]]] <- stack[[pos]]
			pos <- pos - 1L # pop off of stack
		} else { # descending tree
			visit[pos] <- TRUE
			p <- pos
			for (i in seq_along(stack[[p]])) {
				if (!is.leaf(stack[[p]][[i]])) {
					# push subtree onto stack
					pos <- pos + 1L
					stack[[pos]] <- stack[[p]][[i]]
					parent[[pos]] <- p
					index[[pos]] <- i
				}
			}
		}
	}
	return(stack[[1L]])
}

.organizeClusters <- function(myClusters,
	dNames,
	o) {
	
	l <- length(dNames)
	clusters <- data.frame(cluster=integer(l),
		row.names=dNames)
	w <- which(myClusters[, 7] < 0)
	if (length(w) > 0)
		clusters$cluster[-1*myClusters[w, 7]] <- as.integer(myClusters[w, 9])
	w <- which(myClusters[, 8] < 0)
	if (length(w) > 0)
		clusters$cluster[-1*myClusters[w, 8]] <- as.integer(myClusters[w, 10])
	
	# order the cluster numbers to match
	# the order of the dendrogram
	temp <- 0
	l <- max(clusters$cluster)
	clustersTemp <- clusters
	v <- vector(mode="numeric",length=l)
	j <- 0
	for (i in 1:length(o)) {
		if (clusters$cluster[o[i]] != temp &
			length(which(v==clusters$cluster[o[i]]))==0) {
			temp <- clusters$cluster[o[i]]
			j <- j + 1
			v[j] <- temp
		}
	}
	for (k in 1:l) {
		w <- which(clusters$cluster == v[k])
		clustersTemp$cluster[w] <- k
	}
	clusters <- clustersTemp
	
	return(clusters)
}

.organizeClustersFast <- function(myClusters,
	dNames) {
	
	l <- length(dNames)
	clusters <- data.frame(cluster=integer(l),
		row.names=dNames)
	w <- which(myClusters[, 7] < 0)
	if (length(w) > 0)
		clusters$cluster[-1*myClusters[w, 7]] <- as.integer(myClusters[w, 9])
	w <- which(myClusters[, 8] < 0)
	if (length(w) > 0)
		clusters$cluster[-1*myClusters[w, 8]] <- as.integer(myClusters[w, 10])
	
	return(clusters)
}

.rates <- function(alpha, nBins) {
	
	# Determine rates based on alpha and the number of bins
	# bins roots normalized to 1 of the General Laguerre Quadrature
	# first nBins elements are rates with mean 1
	# second nBins elements are probabilities with sum 1
	
	findRoots <- function(alpha, nBins) {
		
		# Determine rates based on Gamma's alpha and the number of bins
		# bins roots normalized to 1 of the General Laguerre Polynomial (GLP)
		
		coeff  <- integer(nBins + 1)
		for (i in 0:nBins) {
			n <- nBins + alpha
			k <- nBins - i
			coeff[i + 1] <- (-1)^i*choose(nBins + alpha, nBins - i)/factorial(i)
		}
		
		return(sort(Re(polyroot(coeff))))
	}
	
	roots <- findRoots(alpha - 1, nBins)
	
	Laguerre <- function(x, alpha, degree) {
		y <- 0
		for (i in 0:degree) {
			y <- y + (-1)^i*choose(degree + alpha, degree - i)*x^i/factorial(i)
		}
		return(y)
	}
	
	weights <- numeric(nBins)
	f <- prod(1 + (alpha - 1)/(1:nBins))
	
	for (i in 1:nBins) {
		weights[i] <- f*roots[i]/((nBins + 1)^2*Laguerre(roots[i],
			alpha - 1,
			nBins + 1)^2)
	}
	
	roots <- roots/alpha
	
	return(c(roots, weights))
}

.optimizeModel <- function(myClusters,
	model,
	myDNAStringSet,
	N,
	processors=1) {
	
	rates <- as.integer(sub("([^+]*)(\\+G(\\d+))?", "\\3", model))
	model <- sub("([^+]*)(\\+G(\\d+))?", "\\1", model)
	
	if (model=="JC69" && is.na(rates)) {
		LnL <- .Call("clusterML",
			myClusters,
			myDNAStringSet,
			c(0.25, 0.25, 0.25, 0.25, 1, 1, 1, 1),
			integer(),
			numeric(),
			0,
			1L,
			processors,
			PACKAGE="DECIPHER")
		K <- 2*dim(myClusters)[1] - 1
		AICc <- 2*K + 2*LnL + 2*K*(K + 1)/(N - K - 1)
		BIC <- 2*LnL + K*log(N)
		return(c(NA, NA, NA, NA, NA, NA, NA, LnL, AICc, BIC))
	} else if (model=="JC69") { # rates is an integer
		f <- function(params) {
			LnL <- .Call("clusterML",
				myClusters,
				myDNAStringSet,
				c(0.25, 0.25, 0.25, 0.25, 1, 1, .rates(params, rates)),
				integer(),
				numeric(),
				0,
				1L,
				processors,
				PACKAGE="DECIPHER")
		}
		o <- optimize(f, c(0.001, 500), tol=1e-4)
		K <- 2*dim(myClusters)[1]
		AICc <- 2*K + 2*o$objective + 2*K*(K + 1)/(N - K - 1)
		BIC <- 2*o$objective + K*log(N)
		return(c(NA, NA, NA, NA, NA, NA, o$minimum, o$objective, AICc, BIC))
	} else if (model=="K80") {
		f <- function(params) {
			LnL <- .Call("clusterML",
				myClusters,
				myDNAStringSet,
				c(0.25, 0.25, 0.25, 0.25, params, params, 1, 1),
				integer(),
				numeric(),
				0,
				1L,
				processors,
				PACKAGE="DECIPHER")
		}
		o <- optimize(f, c(0, 10), tol=1e-4)
		
		if (!is.na(rates)) {
			f <- function(params) {
				LnL <- .Call("clusterML",
					myClusters,
					myDNAStringSet,
					c(0.25, 0.25, 0.25, 0.25, o$minimum, o$minimum, .rates(params, rates)),
					integer(),
					numeric(),
					0,
					1L,
					processors,
					PACKAGE="DECIPHER")
			}
			a <- optimize(f, c(0.001, 500), tol=1e-4)
			LnL <- a$objective
			a <- a$minimum
			K <- 2*dim(myClusters)[1] + 1
		} else {
			a <- NA
			LnL <- o$objective
			K <- 2*dim(myClusters)[1]
		}
		
		AICc <- 2*K + 2*LnL + 2*K*(K + 1)/(N - K - 1)
		BIC <- 2*LnL + K*log(N)
		return(c(NA, NA, NA, NA, rep(o$minimum, 2), a, LnL, AICc, BIC))
	} else if (model=="T92") {
		baseFreqs <- alphabetFrequency(myDNAStringSet,
			baseOnly=TRUE,
			as.prob=TRUE,
			collapse=TRUE)[1:4]
		baseFreqs <- ifelse(baseFreqs < 0.01, 0.01, baseFreqs)
		baseFreqs <- baseFreqs/sum(baseFreqs)
		baseFreqs <- c((baseFreqs[1] + baseFreqs[4])/(2*sum(baseFreqs)),
			(baseFreqs[2] + baseFreqs[3])/(2*sum(baseFreqs)))
		baseFreqs <- c(baseFreqs[1], baseFreqs[2], baseFreqs[2], baseFreqs[1])
#		f <- function(params) {
#			if (params[1] > 0.49)
#				return(Inf)
#			LnL <- .Call("clusterML",
#				myClusters,
#				myDNAStringSet,
#				c(params, rep((1 - 2*params)/2, 2), params, 1, 1, 1, 1),
#				integer(),
#				numeric(),
#				0,
#				1L,
#				processors,
#				PACKAGE="DECIPHER")
#		}
#		o <- nlminb(0.25,
#			f,
#			upper=0.49,
#			lower=0.01,
#			control=list(rel.tol=1e-4))
#		baseFreqs <- c(o$par, rep((1 - 2*o$par)/2, 2), o$par)
		
		f <- function(params) {
			LnL <- .Call("clusterML",
				myClusters,
				myDNAStringSet,
				c(baseFreqs, params, params, 1, 1),
				integer(),
				numeric(),
				0,
				1L,
				processors,
				PACKAGE="DECIPHER")
		}
		o <- nlminb(1,
			f,
			upper=10,
			lower=0.01,
			control=list(rel.tol=1e-4))
		
		if (!is.na(rates)) {
			f <- function(params) {
				LnL <- .Call("clusterML",
					myClusters,
					myDNAStringSet,
					c(baseFreqs, o$par[1], o$par[1], .rates(params, rates)),
					integer(),
					numeric(),
					0,
					1L,
					processors,
					PACKAGE="DECIPHER")
			}
			a <- optimize(f, c(0.001, 500), tol=1e-4)
			LnL <- a$objective
			a <- a$minimum
			K <- 2*dim(myClusters)[1] + 2
		} else {
			a <- NA
			LnL <- o$objective
			K <- 2*dim(myClusters)[1] + 1
		}
		
		AICc <- 2*K + 2*LnL + 2*K*(K + 1)/(N - K - 1)
		BIC <- 2*LnL + K*log(N)
		return(c(baseFreqs, o$par[1], o$par[1], a, LnL, AICc, BIC))
	} else {
		baseFreqs <- alphabetFrequency(myDNAStringSet,
			baseOnly=TRUE,
			as.prob=TRUE,
			collapse=TRUE)[1:4]
		baseFreqs <- ifelse(baseFreqs < 0.01, 0.01, baseFreqs)
		baseFreqs <- baseFreqs/sum(baseFreqs)
#	f <- function(params) {
#			if (any(params > 0.8) ||
#				any(params < 0.05) ||
#				sum(params) > 0.95)
#				return(Inf)
#			LnL <- .Call("clusterML",
#				myClusters,
#				myDNAStringSet,
#				c(params[1], params[2], params[3], 1 - sum(params), 1, 1, 1, 1),
#				integer(),
#				numeric(),
#				0,
#				1L,
#				processors,
#				PACKAGE="DECIPHER")
#		}
#		o <- optim(baseFreqs[1:3],
#			f,
#			control=list(reltol=1e-3))
#		baseFreqs[1:3] <- o$par
#		baseFreqs[4] <- 1 - sum(baseFreqs[1:3])
		
		if (model=="F81") {
			if (!is.na(rates)) {
				f <- function(params) {
					LnL <- .Call("clusterML",
						myClusters,
						myDNAStringSet,
						c(baseFreqs, 1, 1, .rates(params, rates)),
						integer(),
						numeric(),
						0,
						1L,
						processors,
						PACKAGE="DECIPHER")
				}
				a <- optimize(f, c(0.001, 500), tol=1e-4)
				LnL <- a$objective
				a <- a$minimum
				K <- 2*dim(myClusters)[1] + 3
			} else {
				K <- 2*dim(myClusters)[1] + 2
				a <- NA
				LnL <- .Call("clusterML",
					myClusters,
					myDNAStringSet,
					c(baseFreqs, 1, 1, 1, 1),
					integer(),
					numeric(),
					0,
					1L,
					processors,
					PACKAGE="DECIPHER")
			}
			
			AICc <- 2*K + 2*LnL + 2*K*(K + 1)/(N - K - 1)
			BIC <- 2*LnL + K*log(N)
			return(c(baseFreqs, NA, NA, a, LnL, AICc, BIC))
		} else if (model=="HKY85") {
			f <- function(params) {
				LnL <- .Call("clusterML",
					myClusters,
					myDNAStringSet,
					c(baseFreqs, params, params, 1, 1),
					integer(),
					numeric(),
					0,
					1L,
					processors,
					PACKAGE="DECIPHER")
			}
			o <- nlminb(1,
				f,
				upper=10,
				lower=0.01,
				control=list(rel.tol=1e-4))
			
			if (!is.na(rates)) {
				f <- function(params) {
					LnL <- .Call("clusterML",
						myClusters,
						myDNAStringSet,
						c(baseFreqs, o$par[1], o$par[1], .rates(params, rates)),
						integer(),
						numeric(),
						0,
						1L,
						processors,
						PACKAGE="DECIPHER")
				}
				a <- optimize(f, c(0.001, 500), tol=1e-4)
				LnL <- a$objective
				a <- a$minimum
				K <- 2*dim(myClusters)[1] + 4
			} else {
				a <- NA
				LnL <- o$objective
				K <- 2*dim(myClusters)[1] + 3
			}
			
			AICc <- 2*K + 2*LnL + 2*K*(K + 1)/(N - K - 1)
			BIC <- 2*LnL + K*log(N)
			return(c(baseFreqs, o$par[1], o$par[1], a, LnL, AICc, BIC))
		} else if (model=="TN93") {
			f <- function(params) {
				LnL <- .Call("clusterML",
					myClusters,
					myDNAStringSet,
					c(baseFreqs, params[1], params[2], 1, 1),
					integer(),
					numeric(),
					0,
					1L,
					processors,
					PACKAGE="DECIPHER")
			}
			o <- nlminb(c(1, 1),
				f,
				upper=c(10, 10),
				lower=c(0.01, 0.01),
				control=list(rel.tol=1e-4))
			
			if (!is.na(rates)) {
				f <- function(params) {
					LnL <- .Call("clusterML",
						myClusters,
						myDNAStringSet,
						c(baseFreqs, o$par[1], o$par[2], .rates(params, rates)),
						integer(),
						numeric(),
						0,
						1L,
						processors,
						PACKAGE="DECIPHER")
				}
				a <- optimize(f, c(0.001, 500), tol=1e-4)
				LnL <- a$objective
				a <- a$minimum
				K <- 2*dim(myClusters)[1] + 5
			} else {
				a <- NA
				LnL <- o$objective
				K <- 2*dim(myClusters)[1] + 4
			}
			
			AICc <- 2*K + 2*LnL + 2*K*(K + 1)/(N - K - 1)
			BIC <- 2*LnL + K*log(N)
			return(c(baseFreqs, o$par[1], o$par[2], a, LnL, AICc, BIC))
		}
	}
}

.simultaneousBrent <- function(f, # scalar function
	a, # lower bounds
	b, # best guesses
	c, # upper bounds
	tol=1e-5, # accuracy of params change in x
	overallTol=1e0, # accuracy of overall scalar
	relTol=1e-2) { # accuracy of params change in y
	
	# performs optimization using Brent's method
	# simultaneously optimizes nearly-independent parameters
	
	phi <- (3 - sqrt(5))/2
	
	x <- w <- v <- b
	fx <- numeric(length(x))
	baseline <- f(x)
	fx[] <- baseline
	fw <- fv <- fx
	
	b <- ifelse(a > c, a, c)
	
	W <- (1:length(x))[(c - a) > tol]
	if (length(W)==0)
		return(x)
	
	e <- xm <- tol1 <- tol2 <- numeric(length(W))
	q <- r <- p <- numeric(length(W))
	
	iteration <- 0L
	maxIterations <- 1000L
	delta <- 0
	f_delta <- rep(Inf, length(W))
	while(iteration < maxIterations &&
		(iteration < 3 || delta > overallTol)) {
		iteration <- iteration + 1L
		
		xm[W] <- (a[W] + b[W])/2
		tol1[W] <- tol*abs(x[W]) + 1e-10
		tol2[W] <- 2*(tol1[W])
		
		w1 <- which(abs(x[W] - xm[W]) <= (tol2[W] - (b[W] - a[W])/2) |
			f_delta < relTol)
		if (length(w1) > 0) {
			W <- W[-w1]
			if (length(W)==0)
				break
		}
		
		w1 <- which(abs(e[W]) > tol1[W])
		w2 <- which(!(1:length(W) %in% w1))
		d <- numeric(length(W))
		
		if (length(w1) > 0) {
			r[W[w1]] <- (x[W[w1]] - w[W[w1]])*(fx[W[w1]] - fv[W[w1]])
			q[W[w1]] <- (x[W[w1]] - v[W[w1]])*(fx[W[w1]] - fw[W[w1]])
			p[W[w1]] <- (x[W[w1]] - v[W[w1]])*q[W[w1]] - (x[W[w1]] - w[W[w1]])*r[W[w1]]
			q[W[w1]] <- 2*(q[W[w1]] - r[W[w1]])
			p[W[w1]] <- ifelse(q[W[w1]] > 0,  p[W[w1]], -p[W[w1]])
			q[W[w1]] <- abs(q[W[w1]])
			
			etemp <- e[W[w1]]
			e[W[w1]] <- d[W[w1]]
			
			w3 <- which(abs(p[W[w1]]) >= abs(q[W[w1]]*etemp/2) |
				p[W[w1]] <= q[W[w1]]*(a[W[w1]] - x[W[w1]]) |
				p[W[w1]] >= q[W[w1]]*(b[W[w1]] - x[W[w1]]))
			w4 <- which(!(1:length(w1) %in% w3))
			if (length(w3) > 0) {
				e[W[w1[w3]]] <- ifelse(x[W[w1[w3]]] >= xm[W[w1[w3]]],
					a[W[w1[w3]]] - x[W[w1[w3]]],
					b[W[w1[w3]]] - x[W[w1[w3]]])
				d[W[w1[w3]]] <- phi*e[W[w1[w3]]]
			}
			if (length(w4) > 0) {
				d[W[w1[w4]]] <- p[W[w1[w4]]]/q[W[w1[w4]]]
				u <- x[W[w1[w4]]] + d[W[w1[w4]]]
				d[W[w1[w4]]] <- ifelse(u - a[W[w1[w4]]] < tol2[W[w1[w4]]] |
						b[W[w1[w4]]] - u < tol2[W[w1[w4]]],
					ifelse(xm[W[w1[w4]]] - x[W[w1[w4]]] >= 0,
						abs(tol1[W[w1[w4]]]),
						-abs(tol1[W[w1[w4]]])),
					d[W[w1[w4]]])
			}
		}
		if (length(w2) > 0) {
			e[W[w2]] <- ifelse(x[W[w2]] >= xm[W[w2]],
				a[W[w2]] - x[W[w2]],
				b[W[w2]] - x[W[w2]])
			d[W[w2]] <- phi*e[W[w2]]
		}
		
		# perform one function call per iteration
		u <- ifelse(abs(d[W]) >= tol1[W],
			x[W] + d[W],
			x[W] + ifelse(d[W] > 0,
				abs(tol1[W]),
				-abs(tol1[W])))
		fu <- f(x, W, u) # provide all alternative parameters
		
		newBaseline <- fu[1]
		fu <- fu[2:length(fu)]
		f_delta <- abs(fu - fx[W])
		offset <- fx[W] - newBaseline
		w1 <- which(offset > 0)
		if (length(w1) > 0) {
			fx[W[w1]] <- fx[W[w1]] - offset[w1]
			fw[W[w1]] <- fw[W[w1]] - offset[w1]
			fv[W[w1]] <- fv[W[w1]] - offset[w1]
		}
		delta <- abs(baseline - newBaseline)
		baseline <- newBaseline
		
		w1 <- which(fu <= fx[W])
		w2 <- which(!(1:length(W) %in% w1))
		if (length(w1) > 0){
			w3 <- which(u[w1] >= x[W[w1]])
			if (length(w3) > 0)
				a[W[w1[w3]]] <- x[W[w1[w3]]]
			w4 <- which(u[w1] < x[W[w1]])
			if (length(w4) > 0)
				b[W[w1[w4]]] <- x[W[w1[w4]]]
			v[W[w1]] <- w[W[w1]]
			w[W[w1]] <- x[W[w1]]
			x[W[w1]] <- u[w1]
			fv[W[w1]] <- fw[W[w1]]
			fw[W[w1]] <- fx[W[w1]]
			fx[W[w1]] <- fu[w1]
		}
		if (length(w2) > 0) {
			w3 <- which(u[w2] < x[W[w2]])
			if (length(w3) > 0)
				a[W[w2[w3]]] <- u[w2[w3]]
			w4 <- which(u[w2] >= x[W[w2]])
			if (length(w4) > 0)
				b[W[w2[w4]]] <- u[w2[w4]]
			
			w3 <- which(fu[w2] <= fw[W[w2]] | w[W[w2]]==x[W[w2]])
			if (length(w3) > 0) {
				v[W[w2[w3]]] <- w[W[w2[w3]]]
				w[W[w2[w3]]] <- u[w2[w3]]
				fv[W[w2[w3]]] <- fw[W[w2[w3]]]
				fw[W[w2[w3]]] <- fu[w2[w3]]
			}
			
			w4 <- which(fu[w2] <= fv[W[w2]] | v[W[w2]]==x[W[w2]] | v[W[w2]]==w[W[w2]])
			if (length(w4) > 0) {
				v[W[w2[w4]]] <- u[w2[w4]]
				fv[W[w2[w4]]] <- fu[w2[w4]]
			}
		}
	}
	
	return(x)
}

.reorderClusters <- function(myClusters, all=FALSE) {
	
	# order clusters by branching pattern
	repeat {
		a <- apply(as.matrix(myClusters[, 7:8], ncol=2), 1, max)
		w <- which(a > 1:dim(myClusters)[1])[1]
		if (is.na(w))
			break
		temp <- myClusters[w, c(4, 5, 7, 8)]
		myClusters[w:(a[w] - 1), c(4, 5, 7, 8)] <- myClusters[(w + 1):a[w], c(4, 5, 7, 8)]
		myClusters[a[w], c(4, 5, 7, 8)] <- temp
		w1 <- which(myClusters[w:dim(myClusters)[1], 7:8] %in% (w + 1):a[w])
		w2 <- which(myClusters[(a[w] + 1):dim(myClusters)[1], 7:8] %in% w)
		if (length(w1) > 0)
			myClusters[w:dim(myClusters)[1], 7:8][w1] <- myClusters[w:dim(myClusters)[1], 7:8][w1] - 1
		if (length(w2) > 0)
			myClusters[(a[w] + 1):dim(myClusters)[1], 7:8][w2] <- a[w]
	}
	
	if (all) { # also renumber columns 1 to 3
		count <- 0L
		myClusters[, 1:2] <- myClusters[, 7:8]
		for (i in 1:dim(myClusters)[1]) {
			if (myClusters[i, 7] > 0 && myClusters[i, 8] > 0) {
				myClusters[i, 1] <- myClusters[myClusters[i, 7], 3]
				myClusters[i, 2] <- myClusters[myClusters[i, 8], 3]
				count <- count + 1L
				myClusters[i, 3] <- count
			} else if (myClusters[i, 7] > 0) {
				myClusters[i, 1] <- myClusters[myClusters[i, 7], 3]
				myClusters[i, 3] <- myClusters[i, 1]
			} else if (myClusters[i, 8] > 0) {
				myClusters[i, 2] <- myClusters[myClusters[i, 8], 3]
				myClusters[i, 3] <- myClusters[i, 2]
			} else {
				count <- count + 1L
				myClusters[i, 3] <- count
			}
		}
	}
	
	return(myClusters)
}

.swapBranches <- function(myClusters, r1, c1, r2, c2) {
	
	# swap branch [r1, c1] with [r2, c2]
	temp <- myClusters[r1, c1]
	myClusters[r1, c1] <- myClusters[r2, c2]
	myClusters[r2, c2] <- temp
	temp <- myClusters[r1, c1 - 3]
	myClusters[r1, c1 - 3] <- myClusters[r2, c2 - 3]
	myClusters[r2, c2 - 3] <- temp
	
	myClusters <- .reorderClusters(myClusters)
	
	return(myClusters)
}

.NNI <- function(myClusters, bestLnL, NNIs, maximizeLikelihood, tol=1e-1) {
	
	if (dim(myClusters)[1]==1)
		return(myClusters)
	
	# perform rounds of nearest-neighbor interchanges
	W <- dim(myClusters)[1]:2
	while (length(W) > 0) {
		i <- W[1]
		W <- W[-1]
		
		if (i==dim(myClusters)[1]) { # swap nodes
			if (all(myClusters[i, 7:8] > 0)) { # two nodes
				myClustersTemp1 <- .swapBranches(myClusters,
					myClusters[i, 8], 7,
					myClusters[i, 7], 7)
				tempLnL1 <- maximizeLikelihood(myClustersTemp1, NNIs + 1, tol)
				myClustersTemp2 <- .swapBranches(myClusters,
					myClusters[i, 8], 7,
					myClusters[i, 7], 8)
				tempLnL2 <- maximizeLikelihood(myClustersTemp2, NNIs + 1, tol)
				if (tempLnL1 < bestLnL - tol &&
					tempLnL1 < tempLnL2) {
					W <- c(W, i)
					NNIs <- NNIs + 1
					myClusters <- myClustersTemp1
					bestLnL <- tempLnL1
				} else if (tempLnL2 < bestLnL - tol) {
					W <- c(W, i)
					NNIs <- NNIs + 1
					myClusters <- myClustersTemp2
					bestLnL <- tempLnL2
				}
			}
		} else {
			w <- which(myClusters[i, 7:8] > 0)
			if (length(w)==2) {
				myClustersTemp1 <- .swapBranches(myClusters,
					i, 8,
					myClusters[i, 7], 7)
				tempLnL1 <- maximizeLikelihood(myClustersTemp1, NNIs + 1, tol)
				myClustersTemp2 <- .swapBranches(myClusters,
					i, 8,
					myClusters[i, 7], 8)
				tempLnL2 <- maximizeLikelihood(myClustersTemp2, NNIs + 1, tol)
				myClustersTemp3 <- .swapBranches(myClusters,
					i, 7,
					myClusters[i, 8], 7)
				tempLnL3 <- maximizeLikelihood(myClustersTemp3, NNIs + 1, tol)
				myClustersTemp4 <- .swapBranches(myClusters,
					i, 7,
					myClusters[i, 8], 8)
				tempLnL4 <- maximizeLikelihood(myClustersTemp4, NNIs + 1, tol)
				if (tempLnL1 < bestLnL - tol &&
					tempLnL1 < tempLnL2 &&
					tempLnL1 < tempLnL3 &&
					tempLnL1 < tempLnL4) {
					W <- c(W, i)
					NNIs <- NNIs + 1
					myClusters <- myClustersTemp1
					bestLnL <- tempLnL1
				} else if (tempLnL2 < bestLnL - tol &&
					tempLnL2 < tempLnL3 &&
					tempLnL2 < tempLnL4) {
					W <- c(W, i)
					NNIs <- NNIs + 1
					myClusters <- myClustersTemp2
					bestLnL <- tempLnL2
				} else if (tempLnL3 < bestLnL - tol &&
					tempLnL3 < tempLnL4) {
					W <- c(W, i)
					NNIs <- NNIs + 1
					myClusters <- myClustersTemp3
					bestLnL <- tempLnL3
				} else if (tempLnL4 < bestLnL - tol) {
					W <- c(W, i)
					NNIs <- NNIs + 1
					myClusters <- myClustersTemp4
					bestLnL <- tempLnL4
				}
			} else if (length(w)==1) {
				myClustersTemp1 <- .swapBranches(myClusters,
					i, ifelse(w==1, 8, 7),
					myClusters[i, w + 6], 7)
				tempLnL1 <- maximizeLikelihood(myClustersTemp1, NNIs + 1, tol)
				myClustersTemp2 <- .swapBranches(myClusters,
					i, ifelse(w==1, 8, 7),
					myClusters[i, w + 6], 8)
				tempLnL2 <- maximizeLikelihood(myClustersTemp2, NNIs + 1, tol)
				if (tempLnL1 < bestLnL - tol &&
					tempLnL1 < tempLnL2) {
					W <- c(W, i)
					NNIs <- NNIs + 1
					myClusters <- myClustersTemp1
					bestLnL <- tempLnL1
				} else if (tempLnL2 < bestLnL - tol) {
					W <- c(W, i)
					NNIs <- NNIs + 1
					myClusters <- myClustersTemp2
					bestLnL <- tempLnL2
				}
			}
		}
	}
	
	# return the highest likelihood tree
	return(myClusters)
}

MODELS <- c("JC69",
	"JC69+G4",
	"K80",
	"K80+G4",
	"F81",
	"F81+G4",
	"HKY85",
	"HKY85+G4",
	"T92",
	"T92+G4",
	"TN93",
	"TN93+G4")

.splitClusters <- function(x, y) {
	
	clusterNum <- 0L
	X <- integer(length(x))
	u.y <- unique(y)
	for (i in u.y) {
		w.y <- which(y==i)
		u.x <- unique(x[w.y])
		for (j in u.x) {
			clusterNum <- clusterNum + 1L
			w.x <- which(x[w.y]==j)
			X[w.y[w.x]] <- clusterNum
		}
	}
	return(X)
}

.root <- function(x1, root) {
	
	# if root is zero then midpoint root the tree
	# otherwise outgroup root based on root index
	# (note: output columns 1:3 are uncorrected)
	
	n <- nrow(x1)
	if (root==0) { # midpoint root
		# find the leaf at minimum height
		r1 <- which(x1[, 7] < 0)
		h1 <- x1[r1, 6] - x1[r1, 4]
		r2 <- which(x1[, 8] < 0)
		h2 <- x1[r2, 6] - x1[r2, 5]
		r <- c(r1, r2) # row number of leaf
		z <- rep(c(7L, 8L), c(length(r1), length(r2)))
		h <- c(h1, h2) # height of leaf
		
		# reorder by sequence number
		o <- order(x1[cbind(r, z)],
			decreasing=TRUE)
		h <- h[o]
		minH <- which.min(h) # index of lowest leaf
	} else { # outgroup root
		w <- which(x1[n, 7:8]==-root)
		if (length(w) > 0) { # already outgroup rooted
			# extend the root node
			x1[n, 6] <- x1[n, 6] + x1[n, 3 + w]
			x1[n, 3 + w] <- 0
			return(x1)
		}
		minH <- root # index of root
	}
	
	# find most distant leaf from minH
	longest <- numeric(n) # length of longest path
	merged <- logical(n) # whether merged yet
	index <- numeric(n) # column back to minH
	for (i in seq_len(n)) {
		b1 <- x1[i, 7]
		if (b1 < 0) { # merged with leaf
			if (b1==-minH) {
				merged[i] <- TRUE
				b1 <- NA_real_
			} else {
				l1 <- x1[i, 4]
			}
		} else { # merged with node
			if (merged[b1]) {
				merged[i] <- TRUE
				b1 <- NA_real_
			} else {
				l1 <- longest[b1] + x1[i, 4]
			}
		}
		
		b2 <- x1[i, 8]
		if (b2 < 0) { # merged with leaf
			if (b2==-minH) {
				merged[i] <- TRUE
				b2 <- NA_real_
			} else {
				l2 <- x1[i, 5]
			}
		} else { # merged with node
			if (merged[b2]) {
				merged[i] <- TRUE
				b2 <- NA_real_
			} else {
				l2 <- longest[b2] + x1[i, 5]
			}
		}
		
		if (is.na(b1)) { # b1 contains minH
			longest[i] <- l2
			# leave index[i] at zero
		} else if (is.na(b2)) { # b2 contains minH
			longest[i] <- l1
			index[i] <- 1
		} else if (l1 >= l2) {
			longest[i] <- l1
			# index[i] not needed
		} else { # l2 > l1
			longest[i] <- l2
			# index[i] not needed
		}
	}
	
	if (root==0) { # determine height of the midpoint
		w <- which(merged)
		longest <- longest + x1[, 6] - h[minH]
		m <- w[which.max(longest[w])]
		midH <- longest[m]/2
		if (isTRUE(all.equal(x1[n, 6], midH)))
			return(x1) # already midpoint rooted
		
		# find the edge containing the midpoint
		lowH <- x1[m, 6] - x1[m, 4 + index[m]]
		while (lowH > midH) { # descend the tree
			m <- x1[m, 7 + index[m]]
			lowH <- x1[m, 6] - x1[m, 4 + index[m]]
		}
	} else { # root at tip of outgroup
		w <- which(x1[, 7:8]==-root, arr.ind=TRUE)
		midH <- x1[w[1], 6] - x1[w[1], 3 + w[2]]
		m <- w[1]
	}
	
	# invert and lower nodes above rotation point
	.dropH <- function(i, delta) {
		stack <- integer(n)
		pos <- 1L
		stack[pos] <- i
		while (pos > 0) {
			i <- stack[pos]
			x1[i, 6] <<- x1[i, 6] - delta
			pos <- pos - 1L
			if (x1[i, 7] > 0) {
				pos <- pos + 1L
				stack[pos] <- x1[i, 7]
			}
			if (x1[i, 8] > 0) {
				pos <- pos + 1L
				stack[pos] <- x1[i, 8]
			}
		}
	}
	up <- integer(n) # pointers up tree
	w <- which(x1[, 7:8] > 0, arr.ind=TRUE)
	up[x1[, 7:8][w]] <- w[, "row"]
	remove <- logical(n) # replaced nodes
	x2 <- x1 # new rooted tree
	count <- n # row in x2
	# make new root node
	delta <- x1[m, 6] - midH
	x2[count, 4:10] <- c(x1[m, 4 + index[m]] - delta,
		delta,
		midH,
		x1[m, 7 + index[m]],
		count - 1,
		x1[m, 9 + index[m]],
		-1)
	if (up[m]) {
		while (up[m]) {
			count <- count - 1
			delta <- x1[m, 6] - midH
			remove[m] <- TRUE
			x2[count, 4:10] <- c(x1[m, 5 - index[m]],
				x1[up[m], 4 + index[up[m]]],
				midH - delta,
				x1[m, 8 - index[m]],
				count - 1,
				x1[m, 10 - index[m]],
				-1)
			if (x1[m, 8 - index[m]] > 0)
				.dropH(x1[m, 8 - index[m]], 2*delta)
			m <- up[m]
		}
		delta <- x1[m, 6] - midH
		x2[count, 5] <- sum(x1[m, 4:5])
	}
	remove[m] <- TRUE
	keep <- which(!remove)
	x2[count, 8] <- x1[m, 8 - index[m]]
	if (x2[count, 8] > 0)
		x2[count, 8] <- match(x2[count, 8], keep)
	x2[count, 10] <- x1[m, 10 - index[m]]
	if (x1[m, 8 - index[m]] > 0)
		.dropH(x1[m, 8 - index[m]], 2*delta)
	if (length(keep) > 0) {
		x2[1:(count - 1),] <- x1[keep,]
		w <- which(x2[1:(count - 1), 7:8] > 0)
		if (length(w) > 0)
			x2[1:(count - 1), 7:8][w] <- match(x2[1:(count - 1), 7:8][w], keep)
		w <- which(x2[n:count, 7] %in% keep)
		x2[n:count, 7][w] <- match(x2[n:count, 7][w], keep)
	}
	
	if (root > 0) {
		x2[, 6] <- x2[, 6] - min(x2[, 6] - x2[, 4], x2[, 6] - x2[, 5])
	}
	
	return(x2)
}

.applyMidpoints <- function(dend, dim) {
	# initialize a stack of maximum length (dim)
	stack <- vector("list", dim)
	visit <- logical(dim) # node already visited
	parent <- integer(dim) # index of parent node
	index <- integer(dim) # index in parent node
	pos <- 1L # current position in the stack
	stack[[pos]] <- dend
	while (pos > 0L) { # more nodes to visit
		if (visit[pos]) { # ascending tree
			visit[pos] <- FALSE # reset visit
			
			members <- sapply(stack[[pos]],
				function(x) {
					m <- attr(x, "members")
					if (is.null(m)) {
						return(1L)
					} else {
						return(m)
					}
				})
			
			l <- length(stack[[pos]])
			if (is.leaf(stack[[pos]][[1]]) && is.leaf(stack[[pos]][[l]])) {
				attr(stack[[pos]], "midpoint") <- (sum(members) - 1)/2
			} else if (is.leaf(stack[[pos]][[1]])) {
				attr(stack[[pos]], "midpoint") <- (sum(members[-l]) + attr(stack[[pos]][[l]], "midpoint"))/2
			} else if (is.leaf(stack[[pos]][[l]])) {
				attr(stack[[pos]], "midpoint") <- (attr(stack[[pos]][[1]], "midpoint") + sum(members[-l]))/2
			} else {
				attr(stack[[pos]], "midpoint") <- (sum(members[-l]) + attr(stack[[pos]][[1]], "midpoint") + attr(stack[[pos]][[l]], "midpoint"))/2
			}
			
			# replace self in parent
			if (parent[pos] > 0)
				stack[[parent[pos]]][[index[pos]]] <- stack[[pos]]
			pos <- pos - 1L # pop off of stack
		} else { # descending tree
			visit[pos] <- TRUE
			p <- pos
			for (i in seq_along(stack[[p]])) {
				if (!is.leaf(stack[[p]][[i]])) {
					# push subtree onto stack
					pos <- pos + 1L
					stack[[pos]] <- stack[[p]][[i]]
					parent[[pos]] <- p
					index[[pos]] <- i
				}
			}
		}
	}
	return(stack[[1L]])
}

IdClusters <- function(myDistMatrix=NULL,
	method="UPGMA",
	cutoff=-Inf,
	showPlot=FALSE,
	type="clusters",
	myXStringSet=NULL,
	model=MODELS,
	collapse=0,
	reconstruct=FALSE,
	root=0,
	processors=1,
	verbose=TRUE) {
	
	# initialize variables
	time.1 <- Sys.time()
	
	# error checking
	METHODS <- c("NJ","UPGMA", "ML", "complete", "single", "WPGMA", "inexact")
	method <- pmatch(method, METHODS)
	if (is.na(method) && !is.null(myDistMatrix)) {
		stop("Invalid method.  Choose either ML, NJ, complete, single, WPGMA, or UPGMA.")
	} else if (is.na(method)) {
		stop("Invalid method.")
	}
	if (method==-1 && !is.null(myDistMatrix)) {
		stop("Ambiguous method.  Choose either ML, NJ, complete, single, WPGMA, or UPGMA.")
	} else if (method==-1) {
		stop("Ambiguous method.")
	}
	if (!is.logical(reconstruct) &&
		!is.numeric(reconstruct))
		stop("reconstruct must be a logical or numeric.")
	if (is.numeric(reconstruct)) {
		if (reconstruct <= 0)
			stop("reconstruct must be be greater than zero.")
		if (reconstruct > 1)
			stop("reconstruct can be at most one.")
	}
	if (method==3 ||
		(type > 1 && reconstruct)) {
		if (length(model) < 1)
			stop("No model(s) specified.")
		if (!is.character(model))
			stop("model must be a character vector.")
		w <- which(!(model %in% MODELS))
		if (length(w) > 0) {
			submodels <- sub("([^+]*)(\\+G(\\d+))?", "\\1", model[w])
			if (!all(submodels %in% sub("\\+G(\\d+)$", "", MODELS)))
				stop(paste("Available models are:",
					paste(MODELS, collapse=", "),
					collapse=" "))
			rates <- as.integer(sub("([^+]*)(\\+G(\\d+))?", "\\3", model[w]))
			if (any(is.na(rates)) || any(floor(rates)!=rates))
				stop("The number rates in the discrete Gamma distribution (i.e., +G4) should be an integer value.")
			if (any(rates > 10))
				stop("Up to 10 rates are allowed in the discrete Gamma distribution (i.e., +G10).")
			if (any(rates < 2))
				stop("A minimum of two rates are required for the discrete Gamma distribution (i.e., +G2).")
		}
		model <- unique(model)
	}
	if (!is.numeric(cutoff))
		stop("cutoff must be a numeric.")
	if (is.integer(cutoff))
		cutoff <- as.numeric(cutoff)
	if (!is.logical(showPlot))
		stop("showPlot must be a logical.")
	TYPES <- c("clusters", "dendrogram", "both")
	type <- pmatch(type[1], TYPES)
	if (is.na(type))
		stop("Invalid type.")
	if (type == -1)
		stop("Ambiguous type.")
	if (method==7 && showPlot)
		stop("showPlot must be FALSE if method is 'inexact'")
	if (method==7 && type > 1)
		stop("type must be 'clusters' when method is 'inexact'")
	if (length(cutoff) > 1 && type > 1)
		warning("More than one cutoff specified when type is ", TYPES[type], ".")
	if (method==7 && any(cutoff < 0))
		stop("cutoff must be at least zero when method is 'inexact'.")
	if (method==7 && any(cutoff >= 1))
		stop("cutoff must be less than one when method is 'inexact'.")
	ASC <- TRUE
	if (is.unsorted(cutoff)) {
		if (is.unsorted(rev(cutoff))) {
			stop("cutoff must be sorted.")
		} else {
			ASC <- FALSE
		}
	}
	if (!is.numeric(collapse))
		stop("collapse must be a numeric.")
	if (!is.logical(verbose))
		stop("verbose must be a logical.")
	if (!is.null(processors) && !is.numeric(processors))
		stop("processors must be a numeric.")
	if (!is.null(processors) && floor(processors)!=processors)
		stop("processors must be a whole number.")
	if (!is.null(processors) && processors < 1)
		stop("processors must be at least 1.")
	if (is.null(processors)) {
		processors <- detectCores()
	} else {
		processors <- as.integer(processors)
	}
	if (method != 7) {
		if (is(myDistMatrix, "matrix")) {
			dim <- dim(myDistMatrix)
			if (dim[2]!=dim[1])
				stop("myDistMatrix is not square.")
			dim <- dim[1]
		} else if (is(myDistMatrix, "dist")) {
			dim <- attr(myDistMatrix, "Size")
		} else {
			stop(paste("myDistMatrix must be a matrix for method '", METHODS[method], "'.", sep=""))
		}
		if (dim < 2)
			stop("myDistMatrix is too small.")
		if (typeof(myDistMatrix)=="integer")
			myDistMatrix[] <- as.numeric(myDistMatrix)
		
		if (!is.numeric(root))
			stop("root must be a numeric.")
		if (length(root) != 1)
			stop("root must be a single numeric.")
		if (floor(root) != root)
			stop("root must be an integer.")
		if (root < 0)
			stop("root must be at least 0.")
		if (root > dim)
			stop(paste("root cannot be greater than ", dim, ".", sep=""))
	}
	if (method == 3 ||
		(reconstruct && type > 1)) {
		if (is(myXStringSet, "DNAStringSet")) {
			typeX <- 1L
		} else if (is(myXStringSet, "RNAStringSet")) {
			typeX <- 2L
		} else {
			stop("myXStringSet must be a DNAStringSet or RNAStringSet.")
		}
		if (length(myXStringSet)!=dim)
			stop("myDistMatrix must have as many rows as the number of sequences.")
		if (length(unique(width(myXStringSet))) != 1)
			stop("All sequences in myXStringSet must be the same width (aligned).")
		if (method==3 &&
			!is.null(attr(myDistMatrix, "correction")) &&
			attr(myDistMatrix, "correction") == "none")
			warning('myDistMatrix should probably have a correction for method="ML".')
	}
	
	if (method == 7) {
		if (is(myXStringSet, "DNAStringSet")) {
			typeX <- 1L
		} else if (is(myXStringSet, "RNAStringSet")) {
			typeX <- 2L
		} else if (is(myXStringSet, "AAStringSet")) {
			typeX <- 3L
		} else {
			stop("myXStringSet must be an AAStringSet, DNAStringSet, or RNAStringSet.")
		}
		a <- vcountPattern("-", myXStringSet)
		if (any(a > 0))
			stop("Gap characters ('-') must be removed before inexact clustering.")
		a <- vcountPattern("+", myXStringSet)
		if (any(a > 0))
			stop("Mask characters ('+') must be removed before inexact clustering.")
		a <- vcountPattern(".", myXStringSet)
		if (any(a > 0))
			stop("Unknown characters ('.') must be removed before inexact clustering.")
		if (all(width(myXStringSet)==0L))
			stop("All sequences in myXStringSet are zero width.")
		
		if (verbose) {
			lastValue <- 0
			pBar <- txtProgressBar(style=ifelse(interactive(), 3, 1))
		}
		if (typeX==3L) { # AAStringSet
			wordSize <- ceiling(log(500*quantile(width(myXStringSet), 0.99),
				.Call("alphabetSizeReducedAA",
					myXStringSet,
					0:19,
					PACKAGE="DECIPHER")))
			if (wordSize > 7)
				wordSize <- 7
			if (wordSize < 1)
				wordSize <- 1
			words <- 20^wordSize
		} else { # DNAStringSet or RNAStringSet
			wordSize <- ceiling(log(500*quantile(width(myXStringSet), 0.99),
				.Call("alphabetSize",
					myXStringSet,
					PACKAGE="DECIPHER")))
			if (wordSize > 15)
				wordSize <- 15
			if (wordSize < 1)
				wordSize <- 1
			words <- 4^wordSize
		}
		
		l <- length(myXStringSet)
		if (l==0)
			stop("myXStringSet contains no sequences.")
		lc <- length(cutoff)
		if (is.null(names(myXStringSet))) {
			dNames <- 1:l
		} else {
			dNames <- names(myXStringSet)
			w <- which(duplicated(dNames))
			if (length(w) > 0) {
				warning("Duplicated names of myXStringSet appended with index.")
				dNames[w] <- paste(dNames[w],
					w,
					sep="_")
			}
		}
		if (lc > 1) {
			cNames <- paste("cluster",
				gsub("\\.", "_", cutoff),
				METHODS[method],
				sep="")
		} else {
			cNames <- "cluster"
		}
		c <- matrix(0L,
			nrow=l,
			ncol=lc,
			dimnames=list(dNames,
				cNames))
		
		# identify duplicated sequences
		x <- selfmatch(myXStringSet)
		u <- unique(x)
		l <- length(u)
		t <- tabulate(x, length(x))[u]
		
		cutoff <- cutoff/2 # cluster radius is half the diameter
		threshold <- (1 - cutoff)^wordSize # probability of k-mer match
		for (i in seq_len(lc)) {
			if (!ASC && i > 1) {
				o <- u[order(c[u, i - 1],
					width(myXStringSet)[u],
					t, # frequency
					decreasing=TRUE)]
			} else {
				o <- u[order(width(myXStringSet)[u],
					t, # frequency
					decreasing=TRUE)]
			}
			
			if (typeX==3L) { # AAStringSet
				v <- .Call("enumerateSequenceAA",
					.subset(myXStringSet, o),
					wordSize,
					PACKAGE="DECIPHER")
			} else { # DNAStringSet or RNAStringSet
				v <- .Call("enumerateSequence",
					.subset(myXStringSet, o),
					wordSize,
					PACKAGE="DECIPHER")
			}
			v <- lapply(v,
				sort,
				na.last=NA)
			
			cluster_num <- 1L
			offset <- 0L
			c[o[1L], i] <- 1L
			seeds.index <- integer(l)
			seeds.index[1] <- 1L
			
			for (j in seq_along(o)[-1]) {
				if (verbose) {
					value <- round(((i - 1)*l + j)/lc/l, 2)
					if (value > lastValue) {
						lastValue <- value
						setTxtProgressBar(pBar, value)
					}
				}
				
				if (!ASC && i > 1) {
					if (c[o[j], i - 1] != c[o[j - 1], i - 1]) {
						# different clusters in last cutoff
						offset <- offset + cluster_num
						cluster_num <- 1L
						c[o[j], i] <- cluster_num + offset
						seeds.index[1] <- j
						next
					} else if (c[o[j], i] > 0) { # cluster pre-assigned
						c[o[j], i] <- c[c[o[j], i], i]
						next
					}
				}
				
				m <- .Call("matchListsDual",
					v[j],
					v[seeds.index[seq_len(cluster_num)]],
					FALSE, # verbose
					NULL, # pBar
					processors,
					PACKAGE="DECIPHER")
				w <- which.max(m)
				
				if (length(w)==0 ||
					m[w] < threshold[i]) { # form a new group
					cluster_num <- cluster_num + 1L
					c[o[j], i] <- cluster_num
					seeds.index[cluster_num] <- j
				} else { # part of an existing group
					c[o[j], i] <- w + offset
					if (!ASC && i < lc) {
						cols <- (i + 1):lc
						cols <- cols[which(m[w] >= threshold[cols])]
						if (length(cols) > 0) # assign forward
							c[o[j], cols] <- o[seeds.index[w]]
					}
				}
			}
			
			c[, i] <- c[x, i]
		}
		myClusters <- as.data.frame(c)
	} else {
		# initialize a progress bar
		if (verbose) {
			if (method==3) {
				cat("Constructing initial neighbor-joining tree:\n")
				flush.console()
			}
			pBar <- txtProgressBar(min=0, max=100, initial=0, style=ifelse(interactive(), 3, 1))
		} else {
			pBar <- NULL
		}
		
		myClusters <- .Call("cluster",
			myDistMatrix,
			ifelse(method==3, -Inf, cutoff[1]),
			ifelse(method==3, 1L, method),
			ifelse(is(myDistMatrix, "matrix"),
				dim,
				-dim),
			verbose,
			pBar,
			processors,
			PACKAGE="DECIPHER")
		if (verbose) {
			setTxtProgressBar(pBar, 100)
			close(pBar)
		}
		
		if (method==3 ||
			(reconstruct && type > 1)) {
			myClusters <- .reorderClusters(myClusters,
				all=method != 3)
			
			m <- matrix(NA,
				nrow=length(model),
				ncol=10,
				dimnames=list(model,
					c("FreqA", "FreqC", "FreqG", "FreqT",
						"A2G", "C2T", "alpha",
						"-LnL", "AICc", "BIC")))
			
			if (!(length(model)==1 && model=="JC69")) {
				a <- apply(consensusMatrix(myXStringSet),
					2,
					function(x)
						sum(x[1:15] > 0)) > 1
				N <- sum(a)
				
				for (i in seq_along(model)) {
					m[model[i],] <- .optimizeModel(myClusters,
						model[i],
						myXStringSet,
						N,
						processors=processors)
					if (verbose)
						cat("\n", model[i],
							":", paste(rep(" ",
									max(nchar(rownames(m))) - nchar(model[i]) + 1),
								collapse=""),
							"-ln(L)=", round(m[model[i], "-LnL"], 0),
							", AICc=", round(m[model[i], "AICc"], 0),
							", BIC=", round(m[model[i], "BIC"], 0),
							sep="")
				}
			}
			
			if (length(model) > 1) { # choose the best model
				w <- which.min(m[,"BIC"])
				m <- m[w,, drop=FALSE]
				model <- model[w]
				if (verbose)
					cat("\n\nThe selected model was:  ",
						model,
						"\n",
						sep="")
			}
			
			.giveParams <- function(model_params) {
				w <- which(is.na(model_params))
				if (length(w) > 0) {
					model_params[w] <- c(0.25, 0.25, 0.25, 0.25, 1, 1, NA)[w]
				}
				if (is.na(model_params[7])) {
					model_params <- c(model_params[1:6], 1, 1)
				} else {
					model_params <- c(model_params[1:6], .rates(model_params[7], as.integer(sub("([^+]*)(\\+G(\\d+))?", "\\3", rownames(m)))))
				}
			}
			model_params <- .giveParams(as.numeric(m[1:7]))
			
			if (method==3) {
				# given myClusters return adjusted heights
				adjustTreeHeights <- function(myClusters) {
					cumHeight <- numeric(max(myClusters[, 3]))
					for (i in 1:dim(myClusters)[1]) {
						if (myClusters[i, 1] < 0 && myClusters[i, 2] < 0) {
							cumHeight[myClusters[i, 3]] <- max(myClusters[i, 4], myClusters[i, 5])
							myClusters[i, 6] <- cumHeight[myClusters[i, 3]]
						} else if (myClusters[i, 1] > 0 && myClusters[i, 2] > 0) {
							cumHeight[myClusters[i, 3]] <- max(myClusters[i, 4] + cumHeight[myClusters[i, 1]],
								myClusters[i, 5] + cumHeight[myClusters[i, 2]])
							myClusters[i, 6] <- cumHeight[myClusters[i, 3]]
						} else if (myClusters[i, 1] > 0) {
							cumHeight[myClusters[i, 3]] <- cumHeight[myClusters[i, 1]] + myClusters[i, 4]
							if (myClusters[i, 5] > cumHeight[myClusters[i, 3]])
								cumHeight[myClusters[i, 3]] <- myClusters[i, 5]
							myClusters[i, 6] <- cumHeight[myClusters[i, 3]]
						} else {
							cumHeight[myClusters[i, 3]] <- cumHeight[myClusters[i, 2]] + myClusters[i, 5]
							if (myClusters[i, 4] > cumHeight[myClusters[i, 3]])
								cumHeight[myClusters[i, 3]] <- myClusters[i, 4]
							myClusters[i, 6] <- cumHeight[myClusters[i, 3]]
						}
					}
					
					myClusters <- .Call("adjustHeights",
						myClusters,
						PACKAGE="DECIPHER")
					return(myClusters)
				}
				
				# print progress of likelihood maximization
				.startLnL <- Inf
				.bestLnL <- Inf
				.NNIs <- 0L
				if (verbose) {
					cat("\nMaximizing Likelihood of Tree:\n")
					flush.console()
					printLine <- function(value, percentComplete) {
						cat("\r-ln(Likelihood) = ",
							formatC(round(value, 0),
								digits=0,
								format="f"),
							" (",
							formatC(abs(round(100*(value - .startLnL)/.startLnL,
									2)),
								digits=2,
								format="f"),
							"% improvement), ",
							.NNIs,
							ifelse(.NNIs==1, " NNI  ", " NNIs "),
							sep="")
						flush.console()
						invisible(value)
					}
				}
				
				# given branch lengths return -LnL
				maximizeLikelihood <- function(x, branches=integer(), lengths=numeric()) {
					myClusters[, 4:5] <- x
					LnL <- .Call("clusterML",
						myClusters,
						myXStringSet,
						model_params,
						branches,
						lengths,
						0,
						1L,
						processors,
						PACKAGE="DECIPHER")
					
					w <- which.min(LnL)
					if (LnL[w] < .bestLnL) {
						if (w > 1) {
							x[branches[w - 1]] <- lengths[w - 1]
							params <<- x
						} else {
							params <<- x
						}
						.bestLnL <<- LnL[w]
						if (verbose) {
							if (is.infinite(.startLnL))
								.startLnL <<- LnL[1]
							printLine(LnL[w])
						}
					}
					
					return(LnL)
				}
				
				maximizeLikelihood2 <- function(myClusters, NNIs, tol) {
					LnL <- .Call("clusterML",
						myClusters,
						myXStringSet,
						model_params,
						integer(),
						numeric(),
						0,
						1L,
						processors,
						PACKAGE="DECIPHER")
					
					if (LnL < .bestLnL - tol) {
						.bestLnL <<- LnL
						if (verbose) {
							.NNIs <<- NNIs
							if (is.infinite(.startLnL))
								.startLnL <<- LnL
							printLine(LnL)
						}
					}
					
					return(LnL)
				}
				
				# maximize likelihood of tree
				index <- rep(TRUE, 2*dim(myClusters)[1])
				repeat {
					currentLnL <- .bestLnL
					currentNNIs <- .NNIs
					params <- as.numeric(myClusters[, 4:5])
					tempClusters <- paste(myClusters[, 7], myClusters[, 8])
					.simultaneousBrent(maximizeLikelihood,
						ifelse(index, 0, params),
						params,
						ifelse(index, 10*params, params))
					myClusters[, 4:5] <- params
					
					myClusters <- .NNI(myClusters,
						.bestLnL,
						.NNIs,
						maximizeLikelihood2)
					
					if (abs(.bestLnL - currentLnL) < 1e0 &&
						currentNNIs==.NNIs) {
						temp_params <- model_params
					} else if (model != "JC69") {
						m[1,] <- .optimizeModel(myClusters,
							rownames(m),
							myXStringSet,
							N,
							processors=processors)
						temp_params <- .giveParams(as.numeric(m[1:7]))
					} else {
						temp_params <- .giveParams(as.numeric(m[1:7]))
					}
					
					if ((abs(.bestLnL - currentLnL) < 1e0 || # negligible improvement in likelihood
						currentNNIs==.NNIs) && # no new nearest neighbor interchanges (NNIs)
						all(abs(temp_params - model_params) < 0.01)) # negligible change in model_params
						break
					model_params <- temp_params
					index <- rep(!(paste(myClusters[, 7], myClusters[, 8]) %in% tempClusters), 2)
				}
				
				myClusters <- .reorderClusters(myClusters, all=TRUE)
				myClusters <- adjustTreeHeights(myClusters)
				myClusters <- .Call("reclusterNJ",
					myClusters,
					cutoff[1],
					PACKAGE="DECIPHER")
				
				if (verbose) {
					printLine(.bestLnL, 100)
					cat("\n")
					flush.console()
				}
			}
			
			if (verbose) {
				params <- formatC(round(m, 3),
					digits=3,
					format="f")
				cat(ifelse(grepl("NA", params[1], fixed=TRUE),
						"",
						"\nModel parameters:"),
					ifelse(grepl("NA", params[1], fixed=TRUE),
						"",
						ifelse(grepl("T92", rownames(m), fixed=TRUE),
							paste("\nFrequency(A) = Frequency(T) = ", params[1],
								"\nFrequency(C) = Frequency(G) = ", params[2],
								sep=""),
							paste("\nFrequency(A) = ", params[1],
								"\nFrequency(C) = ", params[2],
								"\nFrequency(G) = ", params[3],
								"\nFrequency(T) = ", params[4],
								sep=""))),
					ifelse(grepl("NA", params[5], fixed=TRUE),
						"",
						ifelse(grepl("TN93", rownames(m), fixed=TRUE),
							paste("\nRate A <-> G = ", params[5],
								"\nRate C <-> T = ", params[6],
								"\nTransversion rates = 1",
								sep=""),
							paste("\nTransition rates = ", params[5],
								"\nTransversion rates = 1",
								sep=""))),
					ifelse(grepl("NA", params[7], fixed=TRUE),
						"",
						paste("\nAlpha = ", params[7], sep="")),
					ifelse(grepl("NA", params[1], fixed=TRUE),
						"",
						"\n"),
					sep="")
			}
			params <- m[, 1:7]
		} else {
			model <- NULL
			params <- NULL
		}
		
		if (showPlot || type > 1) {
			if (method==1 ||
				method==3 ||
				root > 0)
				myClusters <- .root(myClusters, root)
			
			# create a dendrogram object
			myClustersList <- list()
			dNames <- labels(myDistMatrix)
			if (is.null(dNames)) {
				myClustersList$labels <- 1:(dim(myClusters)[1] + 1)
			} else {
				if (is(dNames, "list")) {
					myClustersList$labels <- dNames[[1]]
				} else {
					myClustersList$labels <- dNames
				}
				
				w <- which(duplicated(myClustersList$labels))
				if (length(w) > 0) {
					warning("Duplicated labels in dendrogram appended with index.")
					myClustersList$labels[w] <- paste(myClustersList$labels[w],
						w,
						sep="_")
				}
			}
			
			myClustersList$merge <- matrix(myClusters[,7:8], ncol=2)
			myClustersList$height <- matrix(myClusters[,6], ncol=1)
			myClustersList$lengths <- matrix(myClusters[,4:5], ncol=2)
			if (dim > 100) {
				fontSize <- 0.6
			} else if (dim > 70) {
				fontSize <- 0.7
			} else if (dim > 40) {
				fontSize <- 0.8
			} else {
				fontSize <- 0.9
			}
			if (dim > 300) {
				leaves <- "none"
			} else {
				leaves <- "perpendicular"
			}
			
			if (reconstruct && type > 1) {
				# perform ancestral state reconstruction
				states <- .Call("clusterML",
					myClusters,
					myXStringSet,
					model_params,
					integer(),
					numeric(),
					reconstruct,
					typeX,
					processors,
					PACKAGE="DECIPHER")
				d <- to.dendrogram(myClustersList,
					states)
				myXStringSet <- unname(as.character(myXStringSet))
				d <- rapply(d,
					function(x) {
						attr(x, "state") <- myXStringSet[x]
						x
					},
					how="replace")
			} else {
				d <- to.dendrogram(myClustersList)
			}
			
			# convert bifurcating tree to multifurcating
			if (collapse >= 0)
				d <- .collapse(d, collapse, dim)
			
			attr(d, "method") <- METHODS[method]
			attr(d, "model") <- model
			attr(d, "parameters") <- params
			
			# specify the order of clusters that
			# will match the plotted dendrogram
			orderDendrogram <- order.dendrogram(d)
			
			c <- .organizeClusters(myClusters, myClustersList$labels, orderDendrogram)
			
			if (is.finite(cutoff[1])) {
				# create a visibily different vector of colors
				cl <- colors()
				v1 <- c(117,254,73,69,152,51,26,450,503,596,652,610,563,552,97)
				r <- cl[v1]
				
				# color edges by cluster
				.colEdge <- function(dend) {
					# initialize a stack of maximum length (dim)
					stack <- vector("list", dim)
					visit <- logical(dim) # node already visited
					parent <- integer(dim) # index of parent node
					index <- integer(dim) # index in parent node
					pos <- 1L # current position in the stack
					stack[[pos]] <- dend
					while (pos > 0L) { # more nodes to visit
						if (visit[pos]) { # ascending tree
							visit[pos] <- FALSE # reset visit
							
							for (i in seq_along(stack[[pos]])) {
								if (is.leaf(stack[[pos]][[i]])) {
									a <- attributes(stack[[pos]][[i]])
									num <- c$cluster[which(myClustersList$labels==as.character(a$label))]
									attr(stack[[pos]][[i]], "edgePar") <- list(col=r[num %% 15 + 1])
								}
							}
							
							# replace self in parent
							if (parent[pos] > 0)
								stack[[parent[pos]]][[index[pos]]] <- stack[[pos]]
							pos <- pos - 1L # pop off of stack
						} else { # descending tree
							visit[pos] <- TRUE
							p <- pos
							for (i in seq_along(stack[[p]])) {
								if (!is.leaf(stack[[p]][[i]])) {
									# push subtree onto stack
									pos <- pos + 1L
									stack[[pos]] <- stack[[p]][[i]]
									parent[[pos]] <- p
									index[[pos]] <- i
								}
							}
						}
					}
					return(stack[[1L]])
				}
				d <- .colEdge(d)
				
				.reorder <- function(dend) {
					# initialize a stack of maximum length (dim)
					stack <- vector("list", dim)
					visit <- logical(dim) # node already visited
					parent <- integer(dim) # index of parent node
					index <- integer(dim) # index in parent node
					pos <- 1L # current position in the stack
					stack[[pos]] <- dend
					while (pos > 0L) { # more nodes to visit
						if (visit[pos]) { # ascending tree
							visit[pos] <- FALSE # reset visit
							
							members <- lapply(stack[[pos]], unlist)
							# sort tree by ascending cluster number
							o <- sort.list(sapply(members,
									function(x)
										min(c[x, 1])))
							stack[[pos]][] <- stack[[pos]][o]
							
							# replace self in parent
							if (parent[pos] > 0)
								stack[[parent[pos]]][[index[pos]]] <- stack[[pos]]
							pos <- pos - 1L # pop off of stack
						} else { # descending tree
							visit[pos] <- TRUE
							p <- pos
							for (i in seq_along(stack[[p]])) {
								if (!is.leaf(stack[[p]][[i]])) {
									# push subtree onto stack
									pos <- pos + 1L
									stack[[pos]] <- stack[[p]][[i]]
									parent[[pos]] <- p
									index[[pos]] <- i
								}
							}
						}
					}
					return(stack[[1L]])
				}
				d <- .reorder(d)
			}
			
			# add midpoints to the tree
			d <- .applyMidpoints(d, dim)
		}
		if (type==1 || type==3) {
			dNames <- labels(myDistMatrix)
			if (is.null(dNames)) {
				dNames <- 1:(dim(myClusters)[1] + 1)
			} else {
				if (is(dNames, "list"))
					dNames <- dNames[[1]]
				
				w <- which(duplicated(dNames))
				if (length(w) > 0) {
					warning("Duplicated labels in myDistMatrix appended with index.")
					dNames[w] <- paste(dNames[w],
						w,
						sep="_")
				}
			}
			
			if (type==1) # do not number clusters by order of appearance
				c <- .organizeClustersFast(myClusters, dNames)
			
			if (length(cutoff) > 1) {
				names(c) <- paste("cluster",
					gsub("\\.", "_", cutoff[1]),
					METHODS[method],
					sep="")
				for (i in 2:length(cutoff)) {
					if (method==1 ||
						method==3 ||
						root > 0) {
						myClusters <- .Call("reclusterNJ",
							myClusters,
							cutoff[i],
							PACKAGE="DECIPHER")
					} else { # ultrametric
						myClusters <- .Call("reclusterUPGMA",
							myClusters,
							cutoff[i],
							PACKAGE="DECIPHER")
					}
					x <- .organizeClustersFast(myClusters, dNames)
					if ((method==1 ||
						method==3 ||
						root > 0) &&
						!ASC) # ensure clusters are subsets
						x[, 1] <- .splitClusters(x[, 1], c[, dim(c)[2]])
					names(x) <- paste("cluster",
						gsub("\\.", "_", cutoff[i]),
						METHODS[method],
						sep="")
					c <- cbind(c, x)
				}
			}
			myClusters <- c
		}
		
		if (showPlot)
			plot(d,
				horiz=FALSE,
				leaflab=leaves,	
				nodePar=list(lab.cex=fontSize, pch = NA))
	}
	
	if (verbose) {
		time.2 <- Sys.time()
		cat("\n")
		print(round(difftime(time.2,
			time.1,
			units='secs'),
			digits=2))
		cat("\n")
	}
	
	if (type==1) {
		return(myClusters)
	} else if (type==2) {
		return(d)
	} else {
		return(list(myClusters, d))
	}
}
Any scripts or data that you put into this service are public.
DECIPHER documentation built on Nov. 8, 2020, 8:30 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
DECIPHER
Tools for curating, analyzing, and manipulating biological sequences

R/IdClusters.R
In DECIPHER: Tools for curating, analyzing, and manipulating biological sequences

Defines functions IdClusters .applyMidpoints .root .splitClusters .NNI .swapBranches .reorderClusters .simultaneousBrent .optimizeModel .rates .organizeClustersFast .organizeClusters .collapse to.dendrogram

Documented in IdClusters

Try the DECIPHER package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

DECIPHER Tools for curating, analyzing, and manipulating biological sequences

R/IdClusters.R In DECIPHER: Tools for curating, analyzing, and manipulating biological sequences

Defines functions IdClusters .applyMidpoints .root .splitClusters .NNI .swapBranches .reorderClusters .simultaneousBrent .optimizeModel .rates .organizeClustersFast .organizeClusters .collapse to.dendrogram

Documented in IdClusters

Try the DECIPHER package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

DECIPHER
Tools for curating, analyzing, and manipulating biological sequences

R/IdClusters.R
In DECIPHER: Tools for curating, analyzing, and manipulating biological sequences