R/splits.R
In phangorn: Phylogenetic Reconstruction and Analysis

Documented in as.bitsplits.splits as.Matrix as.matrix.splits as.Matrix.splits as.phylo.splits as.prop.part.splits as.splits as.splits.bitsplits as.splits.multiPhylo as.splits.networx as.splits.phylo compatible c.splits distinct.splits matchSplits print.splits unique.splits

#' Splits representation of graphs and trees.
#'
#' \code{as.splits} produces a list of splits or bipartitions.
#'
#' @aliases splits as.Matrix distinct.splits as.phylo.splits
#' addTrivialSplits removeTrivialSplits matchSplits
#' @param x An object of class phylo or multiPhylo.
#' @param maxp integer, default from \code{options(max.print)}, influences how
#' many entries of large matrices are printed at all.
#' @param zero.print character which should be printed for zeros.
#' @param one.print character which should be printed for ones.
#' @param incomparables	only for compatibility so far.
#' @param unrooted todo.
#' @param \dots Further arguments passed to or from other methods.
#' @param recursive	logical. If recursive = TRUE, the function recursively
#' descends through lists (and pairlists) combining all their elements into a
#' vector.
#' @param obj1,obj2 an object of class splits.
#' @param k number of taxa.
#' @param labels names of taxa.
#' @return \code{as.splits} returns an object of class splits, which is mainly
#' a list of splits and some attributes. Often a \code{splits} object will
#' contain attributes \code{confidences} for bootstrap or Bayesian support
#' values and \code{weight} storing edge weights.
#' \code{compatible} return a lower triangular matrix where an 1 indicates that
#' two splits are incompatible.
#' @note The internal representation is likely to change.
#' @author Klaus Schliep \email{klaus.schliep@@gmail.com}
#' @seealso \code{\link[ape]{prop.part}}, \code{\link{lento}},
#' \code{\link{as.networx}}, \code{\link{distanceHadamard}},
#' \code{\link{read.nexus.splits}}
#' @keywords cluster
#' @examples
#'
#' (sp <- as.splits(rtree(5)))
#' write.nexus.splits(sp)
#' spl <- allCircularSplits(5)
#' plot(as.networx(spl))
#'
#' @rdname as.splits
#' @export
as.splits <- function(x, ...) {
  if (inherits(x, "splits")) return(x)
  UseMethod("as.splits")
}


#' @export
as.Matrix <- function(x, ...) {
  if (inherits(x, "Matrix")) return(x)
  UseMethod("as.Matrix")
}


#' @rdname as.splits
#' @method as.matrix splits
#' @export
as.matrix.splits <- function(x, zero.print = 0L, one.print = 1L, ...) {
  m <- length(x)
  labels <- attr(x, "labels")
  n <- length(labels)
  res <- matrix(zero.print, m, n)
  for (i in 1:m) res[i, x[[i]]] <- one.print
  dimnames(res) <- list(names(x), labels)
  res
}


#' @rdname as.splits
#' @importFrom Matrix sparseMatrix
#' @method as.Matrix splits
#' @export
as.Matrix.splits <- function(x, ...) {
  labels <- attr(x, "labels")
  l <- length(x)
  j <- unlist(x)
  i <- rep(1:l, lengths(x))
  sparseMatrix(i, j, x = rep(1L, length(i)), dimnames = list(NULL, labels))
  # included x und labels
}


#' @rdname as.splits
#' @export
print.splits <- function(x, maxp = getOption("max.print"),
                         zero.print = ".", one.print = "|", ...) {
  x.orig <- x
  cx <- as.matrix(x, zero.print = zero.print, one.print = one.print)
  print(cx, quote = FALSE, right = TRUE, max = maxp)
  invisible(x.orig)
}


#' @export
"[.splits" <- function(x, i) {
  tmp <- attributes(x)
  result <- unclass(x)[i]
  if (!is.null(tmp$weights)) tmp$weights <- tmp$weights[i]
  if (!is.null(tmp$confidences)) tmp$confidences <- tmp$confidences[i]
  if (!is.null(tmp$intervals)) tmp$intervals <- tmp$intervals[i]
  if (!is.null(tmp$data)) tmp$data <- tmp$data[i, , drop = FALSE]
  attributes(result) <- tmp
  result
}


changeOrder <- function(x, labels) {
  oldL <- attr(x, "labels")
  if(identical(oldL, labels)) return(x)
  ind <- match(oldL, labels)
  for (i in seq_along(x))
    x[[i]] <- sort(ind[x[[i]]])
  if (!is.null(attr(x, "cycle")))
    attr(x, "cycle") <- ind[attr(x, "cycle")]
  attr(x, "labels") <- labels
  x
}



## @rdname as.splits
#' @export
matchSplits <- function(x, y, as.in = TRUE) {
  tiplabel <- attr(x, "labels")
  if (any(is.na(match(tiplabel, attr(y, "labels")))))
    stop("x and y have different labels!")
  nTips <- length(tiplabel)
  y <- changeOrder(y, tiplabel)
  y <- SHORTwise(y) #, nTips)
  if (as.in) return(match(SHORTwise(x), y, nomatch = 0L) > 0L)
  match(SHORTwise(x), y)
}



countCycles <- function(splits, ord = NULL) {
  M <- as.matrix(splits)
  if(is.null(ord)){
    ord <- attr(splits, "cycle")
    if(is.null(ord)) ord <- seq_along(attr(splits, "labels"))
  }
  res <- countCycle2_cpp(M[, ord])
  res
}


#' @rdname as.splits
#' @method c splits
#' @export
c.splits <- function(..., recursive = FALSE) {
  x <- list(...)
  if (length(x) == 1 && !inherits(x[[1]], "splits")) x <- x[[1]]
  n <- length(x)
  match.names <- function(a, b) {
    if (any(!(a %in% b)))
      stop("names do not match previous names")
  }
  if (n == 1)
    return(x[[1]])

  labels <- attr(x[[1]], "labels")
  cycle <- attr(x[[1]], "cycle")
  for (i in 2:n) {
    match.names(labels, attr(x[[i]], "labels"))
    x[[i]] <- changeOrder(x[[i]], labels)
  }
  w <- as.vector(unlist(lapply(x, attr, "weights")))
  x <- lapply(x, unclass)
  res <- structure(do.call("c", x), class = c("splits", "prop.part"))
  names(res) <- NULL
  attr(res, "labels") <- labels
  attr(res, "weights") <- w
  attr(res, "cycle") <- cycle
  res
}


#' @rdname as.splits
#' @method unique splits
#' @export
unique.splits <- function(x, incomparables = FALSE, unrooted = TRUE, ...) {
  nTips <- length(attr(x, "labels"))
  x <- SHORTwise(x)
  x[!duplicated(x)]
}


#' @export distinct.splits
distinct.splits <- function(...) {
  tmp <- c(...)
  res <- unique(tmp)
  attributes(res) <-  attributes(tmp)
  attr(res, "weights") <- tabulate(match(tmp, res))
  res
}



# computes splits from phylo
#' @rdname as.splits
#' @method as.splits phylo
#' @export
as.splits.phylo <- function(x, ...) {
  if (hasArg(as.is))
    as.is <- list(...)$as.is
  else as.is <- TRUE
  result <- bip(x)
  if (!is.null(x$edge.length)) {
    edge.weights <- numeric(max(x$edge))
    edge.weights[x$edge[, 2]] <- x$edge.length
    attr(result, "weights") <- edge.weights
  }
  if (!is.null(x$node.label)) {
    conf <- x$node.label
    if (is.character(conf)) as.is <- TRUE
      #conf <- as.numeric(conf)
    if (!as.is) if (max(na.omit(conf)) > (1 + 1e-8)) conf <- conf / 100
    attr(result, "confidences") <- c(rep(NA_real_, length(x$tip.label)), conf)
  }
  attr(result, "labels") <- x$tip.label
  class(result) <- c("splits", "prop.part")
  result
}


# computes splits from multiPhylo object (e.g. bootstrap, MCMC etc.)
#' @rdname as.splits
#' @method as.splits multiPhylo
#' @export
as.splits.multiPhylo <- function(x, ...) {
  if (hasArg(trivial))
    trivial <- list(...)$trivial
  else trivial <- TRUE
  lx <-  length(x)
  x <- .uncompressTipLabel(x)
  x <- unroot(x)
  splits <- prop.part(x)
  splits <- postprocess.prop.part(splits, method="SHORTwise")
  class(splits) <- "list"
  weights <- attr(splits, "number")
  lab <- attr(splits, "labels")
  attr(splits, "labels") <- attr(splits, "number") <- NULL
  l <- length(lab)
  if(trivial){
    splitTips <- vector("list", l)
    for (i in 1:l) splitTips[[i]] <- i
    result <- c(splitTips, splits)
    attr(result, "weights") <- c(rep(lx, l), weights)
  }
  else attr(result, "weights") <- weights
  attr(result, "confidences") <- attr(result, "weights") / lx
  attr(result, "summary") <- list(confidences = "ratio", ntrees = lx,
                                  clades = FALSE)
  attr(result, "labels") <- lab
  class(result) <- c("splits", "prop.part")
  result
}


#' @export
as.splits.prop.part <- function(x, ...) {
  if (is.null(attr(x, "number")))
    attr(x, "weights") <- rep(1, length(x))
  else {
    attr(x, "weights") <- attr(x, "number")
    if( is.integer(attr(x, "number")) )
      attr(x, "confidences") <- attr(x, "number") / attr(x, "number")[1]
  }
  class(x) <- c("splits", "prop.part")
  x
}


#' @rdname as.splits
#' @method as.splits networx
#' @export
as.splits.networx <- function(x, ...) {
  if (!is.null(x$splits)) x$splits
  else warning("No split object included!")
}


#' @rdname as.splits
#' @method as.prop.part splits
#' @export
as.prop.part.splits <- function(x, ...) {
  attr(x, "number") <- attr(x, "weights")
  attr(x, "weights") <- NULL
  attr(x, "confidences") <- NULL
  class(x) <- c("prop.part")
  x
}

## as.splits.phylo
## @rdname as.splits
## @method as.phylo splits
#' @export
as.phylo.splits <- function(x, check=TRUE, rooted=FALSE, ...){
  if(check) x <- compatibleSplits(x)
  phy <- splits2phylo(x, rooted=rooted)
  spl <- as.splits(phy)[phy$edge[,2]]
  ind <- matchSplits(spl, x, FALSE)
  phy$edge.length <- attr(x, "weights")[ind]
  phy
}


splits2phylo <- function(x, rooted=FALSE){
  labels <- attr(x, "labels")
  nTips <- length(labels)
  if(!rooted) x <- SHORTwise(x)
  x <- x[lengths(x) > 1]
  x <- x[lengths(x) < nTips]
  # unique?
  l <- lengths(x)
  x <- x[order(l)]
  nNodes <- length(x) + 1L
  node_i <- as.integer( nNodes + nTips )
  edge <- matrix(0L, node_i - 1L, 2L)
  y <- seq_len(nTips)
  x <- unclass(x)
  m <- 0
  for(i in seq_along(x)){
    tmp <- x[[i]]
    kids <- unique( y[ tmp ] )
    k <- length(kids)
    y[ tmp ] <- node_i
    edge[m + 1:k ,1] <- node_i
    edge[m + 1:k ,2] <- kids
    m <- m + k
    node_i <- node_i - 1L
  }
  kids <- unique( y )
  k <- length(kids)
  edge[m + 1:k ,1] <- node_i
  edge[m + 1:k ,2] <- kids
  phy <- structure(list(edge, labels, nNodes),
                   .Names = c("edge", "tip.label", "Nnode"),
                   class = "phylo", order = "postorder")
  phy
}

compatibleSplits <- function(x) {
  x <- postprocess.splits(x)
  labels <- attr(x, "labels")
  nTips <- length(labels)
  x <- SHORTwise(x)
#  x <- x[lengths(x)>1]
  dm <- as.matrix(compatible(x))
  rs <- rowSums(dm)
  ind <- which(rs == 0)
  if (any(rs > 0)) {
    tmp <- which(rs > 0)
    candidates <- tmp[order(rs[tmp])]
    for (i in candidates) {
      if (sum(dm[ind, i]) == 0)
        ind <- c(ind, i)
    }
  }
  x[ind]
}


postprocess.splits <- function (x)
{
  #  w <- attr(x, "number")
  tmp <- attributes(x)
  labels <- attr(x, "labels")
  x <- SHORTwise(x)
  drop <- duplicated(x)
  if (any(drop)) {
    W <- ifelse (is.null(tmp$weights), FALSE, TRUE)
    CONF <- ifelse (is.null(tmp$confidences), FALSE, TRUE)
    if(W) w <- tmp$weights
    if (CONF) conf <- tmp$confidences
    class(x) <- NULL
    attributes(x) <- NULL
    y <- x[drop]
    ind1 <- match(y, x)
    ind2 <- which(drop)
    for (i in seq_along(ind2)) {
      if(W) w[ind1[i]] <- w[ind1[i]] + w[ind2[i]]
      if(CONF) conf[ind1[i]] <- conf[ind1[i]] + conf[ind2[i]]
    }
    x <- x[!drop]
    w <- w[!drop]
    if(CONF) conf <- conf[!drop]
    attr(x, "weights") <- w
    if(CONF) attr(x, "confidences") <- conf
    attr(x, "labels") <- labels
    class(x) <- c("splits", "prop.part")
  }
  x
}


#' @rdname as.splits
#' @method as.bitsplits splits
#' @export
as.bitsplits.splits <- function(x) {
  foo <- function(vect, RAWVECT) {
    res <- RAWVECT
    for (y in vect) {
      i <- ceiling(y / 8)
      res[i] <- res[i] | as.raw(2^(8 - ((y - 1) %% 8) - 1))
    }
    res
  }
  N <- length(x)
  n <- length(attr(x, "labels"))
  nr <- ceiling(n / 8)
  mat <- raw(N * nr)
  dim(mat) <- c(nr, N)
  RAWVECT <- raw(nr)
  for (i in 1:N) mat[, i] <- foo(x[[i]], RAWVECT)
  freq <- attr(x, "weights")
  if (is.null(freq)) freq <- rep(1, N)
  structure(list(matsplit = mat, labels = attr(x, "labels"),
    freq = freq), class = "bitsplits")
}


#' @rdname as.splits
#' @method as.splits bitsplits
#' @export
as.splits.bitsplits <- function(x, ...){
  as.splits(as.prop.part(x))
}


# computes compatible splits
#' @rdname as.splits
#' @export
compatible <- function(obj1, obj2 = NULL) {
  if (!inherits(obj1, "splits"))
    stop("obj must be of class splits")
  labels <- attr(obj1, "labels")
  l <- length(labels)
  n <- length(obj1)
  bp1 <- as.matrix(obj1)
  bp1[bp1[, 1] == 0L, ] <- 1L - bp1[bp1[, 1] == 0L, ]
  if (!is.null(obj2)) {
    m <- length(obj2)
    bp2 <- as.matrix(obj2)
    labels2 <- attr(obj2, "labels")
    bp2 <- bp2[, match(labels2, labels), drop = FALSE]
    bp2[bp2[, 1] == 0L, ] <- 1L - bp2[bp2[, 1] == 0L, ]
  }
  else bp2 <- bp1

  if (is.null(obj2)) res <- matrix(0L, n, n)
  else res <- matrix(0L, n, m)

  tmp1 <- tcrossprod(bp1, bp2)
  tmp2 <- tcrossprod(1L - bp1, 1L - bp2)
  tmp3 <- tcrossprod(bp1, 1L - bp2)
  tmp4 <- tcrossprod(1L - bp1, bp2)
  res[(tmp1 * tmp2 * tmp3 * tmp4) > 0] <- 1L
  if (is.null(obj2)) {
    res <- res[lower.tri(res)]
    attr(res, "Size") <- n
    attr(res, "Diag") <- FALSE
    attr(res, "Upper") <- FALSE
    class(res) <- "dist"
  }
  return(res)
}

# in clanistic.R ??
compatible3 <- function(x, y = NULL) {
  if (!inherits(x, "splits"))
    stop("x must be of class splits")
  if (is.null(y)) y <- x
  if (!inherits(y, "splits"))
    stop("y must be of class splits")
  xlabels <- attr(x, "labels")
  ylabels <- attr(y, "labels")
  if (identical(xlabels, ylabels)) labels <- xlabels
  else labels <- intersect(xlabels, ylabels)
  nx <- length(x)
  ny <- length(y)
  bp1 <- as.matrix(x)[, labels, drop = FALSE]
  bp2 <- as.matrix(y)[, labels, drop = FALSE]
  rs1 <- rowSums(bp1)
  rs2 <- rowSums(bp2)
  res <- matrix(0L, nx, ny)
  tmp1 <- tcrossprod(bp1, bp2)
  res <- matrix(0L, nx, ny)
  for (i in 1:nx) {
    for (j in 1:ny) {
      if (tmp1[i, j] == rs1[i]) res[i, j] <- 1
      if (tmp1[i, j] == rs2[j]) res[i, j] <- 2
      if (tmp1[i, j] == rs1[i] & tmp1[i, j] == rs2[j]) res[i, j] <- 3
    }
  }
  if (is.null(y)) {
    res <- res[lower.tri(res)]
    attr(res, "Size") <- length(x)
    attr(res, "Diag") <- FALSE
    attr(res, "Upper") <- FALSE
    class(res) <- "dist"
  }
  return(res)
}



compatible_2 <- function(obj1, obj2) {
  ntaxa <- length(obj1$labels)
  m1 <- obj1$matsplit
  m2 <- obj2$matsplit
  n1 <- ncol(m1)
  n2 <- ncol(m2)
  res <- rep(TRUE, n1)
  for (i in 1:n1) {
    j <- 1
    while (j <= n2) {
      if (!ape::arecompatible(m1[, i], m2[, j], ntaxa)) {
        res[i] <- FALSE
        break()
      }
      j <- j + 1L
    }
  }
  res
}