phangorn: Phylogenetic Reconstruction and Analysis

Documented in ancestral.pars ancestral.pml anc_pars anc_pml

#' Ancestral character reconstruction.
#'
#' Marginal reconstruction of the ancestral character states.
#'
#' The argument "type" defines the criterion to assign the internal nodes. For
#' \code{ancestral.pml} so far "ml and marginal (empirical) "bayes" and for
#' \code{ancestral.pars} "MPR" and "ACCTRAN" are possible.
#'
#' The function return a list containing the tree with node labels, the original
#' alignment as an \code{phyDat} object, a data.frame containing the
#' probabilities belonging to a state for all (internal nodes) and the most
#' likely state. For parsimony and nucleotide data the most likely state might
#' be ambiguous. For ML this is very unlikely to be the case.
#'
#' If the input tree does not contain unique node labels the function
#' \code{ape::MakeNodeLabel} is used to create them.
#'
#' With parsimony reconstruction one has to keep in mind that there will be
#' often no unique solution.
#'
#' The functions use the node labels of the provided tree (also if part of the
#' \code{pml} object) if these are unique. Otherwise the function
#' \code{ape::MakeNodeLabel} is used to create them.
#'
#' For further details see vignette("Ancestral").
#'
#' @param object an object of class pml
#' @param tree a tree, i.e. an object of class pml
#' @param data an object of class phyDat
#' @param type method used to assign characters to internal nodes, see details.
#' @param cost A cost matrix for the transitions between two states.
#' @param return return a \code{phyDat} object or matrix of probabilities.
##  @param x an object of class ancestral.
#' @param \dots Further arguments passed to or from other methods.
#' @return An object of class ancestral. This is a list containing the tree with
#' node labels, the original alignment as an \code{phyDat} object, a
#' \code{data.frame} containing the probabilities belonging to a state for all
#' (internal nodes) and the most likely state.
## For \code{return="phyDat"} an object  of class "phyDat", containing
## the ancestral states of all nodes. For nucleotide data this can contain
## ambiguous states. Apart from fitch parsimony the most likely states are
## returned.
#' @author Klaus Schliep \email{klaus.schliep@@gmail.com}
#' @seealso \code{\link{pml}}, \code{\link{parsimony}}, \code{\link[ape]{ace}},
#' \code{\link{plotAnc}}, \code{\link{latag2n.phyDat}},
#' \code{\link[ape]{latag2n}}, \code{\link{gap_as_state}},
#' \code{\link[ape]{root}}, \code{\link[ape]{makeNodeLabel}}
#' @references Felsenstein, J. (2004). \emph{Inferring Phylogenies}. Sinauer
#' Associates, Sunderland.
#'
#' Swofford, D.L., Maddison, W.P. (1987) Reconstructing ancestral character
#' states under Wagner parsimony. \emph{Math. Biosci.} \bold{87}: 199--229
#'
#' Yang, Z. (2006). \emph{Computational Molecular evolution}. Oxford University
#' Press, Oxford.
#' @keywords cluster
#' @examples
#'
#' example(NJ)
#' # generate node labels to ensure plotting will work
#' tree <- makeNodeLabel(tree)
#' fit <- pml(tree, Laurasiatherian)
#' anc.ml <- anc_pml(fit)
#' anc.p <- anc_pars(tree, Laurasiatherian)
#' # plot ancestral sequences at the root
#' plotSeqLogo( anc.ml, 48, 1, 20)
#' plotSeqLogo( anc.p, 48, 1, 20)
#' # plot the first character
#' plotAnc(anc.ml)
#' # plot the third character
#' plotAnc(anc.ml, 3)
#'
#' @rdname ancestral.pml
#' @export
ancestral.pml <- function(object, type = "marginal", return = "prob", ...) {
  assert_pml(object)
  call <- match.call()
  pt <- match.arg(type, c("marginal", "ml", "bayes", "lhood")) # "joint",
  rt <- match.arg(return, c("prob", "phyDat", "ancestral"))
  tree <- object$tree
  INV <- object$INV
  inv <- object$inv
  data <- getCols(object$data, tree$tip.label)
  data_type <- attr(data, "type")
  if (is.null(attr(tree, "order")) || attr(tree, "order") != "postorder") {
    tree <- reorder(tree, "postorder")
  }
  nTips <- length(tree$tip.label)
  node <- tree$edge[, 1]
  edge <- tree$edge[, 2]
  nNode <- Nnode(tree)
  m <- length(edge) + 1 # max(edge)
  w <- object$w
  g <- object$g
  l <- length(w)
  nr <- attr(data, "nr")
  nc <- attr(data, "nc")
  dat <- vector(mode = "list", length = m * l)
  result <- vector(mode = "list", length = nNode)
  result2 <- vector(mode = "list", length = nNode)
  dim(dat) <- c(l, m)
  node_label <- makeAncNodeLabel(tree, ...)
  tree$node.label <- node_label
  object$tree <- tree
  tmp <- length(data)

  joint <- TRUE
  if(length(w) > 1 || object$inv > 0) joint <- FALSE

  eig <- object$eig

  bf <- object$bf
  el <- tree$edge.length
  P <- getP(el, eig, g)
  nr <- as.integer(attr(data, "nr"))
  nc <- as.integer(attr(data, "nc"))
  node <- as.integer(node - min(node))
  edge <- as.integer(edge - 1)
  nTips <- as.integer(length(tree$tip.label))
  mNodes <- as.integer(max(node) + 1)
  contrast <- attr(data, "contrast")
  # proper format
  eps <- 1.0e-5
  attrib <- attributes(data)
  pos <- match(attrib$levels, attrib$allLevels)
  nco <- as.integer(dim(contrast)[1])
  for (i in 1:l) dat[i, (nTips + 1):m] <- .Call('LogLik2', data, P[i, ], nr, nc,
                                      node, edge, nTips, mNodes, contrast, nco)
  parent <- tree$edge[, 1]
  child <- tree$edge[, 2]
  nTips <- min(parent) - 1
  # in C with scaling
  for (i in 1:l) {
    for (j in (m - 1):1) {
      if (child[j] > nTips) {
        tmp2 <- (dat[[i, parent[j]]] / (dat[[i, child[j]]] %*% P[[i, j]]))
        dat[[i, child[j]]] <- (tmp2 %*% P[[i, j]]) * dat[[i, child[j]]]
      }
    }
  }
  for (j in unique(parent)) {
    tmp <- matrix(0, nr, nc)
    if (inv > 0) tmp <- as.matrix(INV) * inv
    for (i in 1:l) {
      # scaling!!!
      tmp <- tmp + w[i] * dat[[i, j]]
    }
    if ((pt == "bayes") || (pt == "marginal")) tmp <- tmp * rep(bf, each = nr)
    #KBH: don't normalize if partial lhoods desired
    if (pt != "lhood") tmp <- tmp / rowSums(tmp)
    if (data_type == "DNA") {
      tmp_max <- p2dna(tmp)
      tmp_max  <- fitchCoding2ambiguous(tmp_max)
    }
    else {
      tmp_max  <- pos[max.col(tmp)]
    }
    result[[j - nTips]] <- tmp
    result2[[j - nTips]] <- tmp_max
  }
#  ind <- identical_sites(data)
#  if(length(ind)>0){
#    for(k in seq_len(nNode)){
#      result[[k]][ind,] <- contrast[data[[1]][ind],]
#      result2[[k]][ind] <- data[[1]][ind]
#    }
#  }
#  browser()
  attrib$names <- node_label
  attributes(result2) <- attrib
  if(rt == "phyDat") return(rbind(data, result2))
  if(rt == "prob") {
    tmp <- c(unclass(new2old.phyDat(data)), result)
    attrib$names <- c(tree$tip.label, node_label)
    attributes(tmp) <- attrib
    return(tmp)
  }
  attributes(result) <- attrib
  if(joint) result2 <- joint_pml(object)
  erg <- list(tree=tree, data=data, prob=result, state=result2)
  class(erg) <- "ancestral"
  erg
}


#' @rdname ancestral.pml
#' @export
anc_pml <- function(object, type = "marginal", ...) {
  call <- c(object$call, match.call())
  res <- ancestral.pml(object, type=type, return="ancestral")
  res$call <- call
  res
}


fitchCoding2ambiguous <- function(x, type = "DNA") {
  y <- c(1L, 2L, 4L, 8L, 8L, 3L, 5L, 9L, 6L, 10L, 12L, 7L, 11L, 13L,
    14L, 15L, 15L, 15L)
  fmatch(x, y)
}



#' @rdname ancestral.pml
#' @export
ancestral.pars <- function(tree, data, type = c("MPR", "ACCTRAN", "POSTORDER"),
                           cost = NULL, return = "prob", ...) {
  call <- match.call()
  if (hasArg(tips)) tips <- list(...)$tips
  else tips <- TRUE
  type <- match.arg(type)
  assert_phylo(tree)
  assert_phyDat(data)
  tree$nodel.label <- makeAncNodeLabel(tree)
  if (type == "ACCTRAN" || type=="POSTORDER") {
    res <- ptree(tree, data, return = return, acctran=(type == "ACCTRAN"),
                 tips=tips)
    attr(res, "call") <- call
  }
  if (type == "MPR") {
    res <- mpr(tree, data, cost = cost, return = return, tips=tips)
    attr(res, "call") <- call
  }
  res
}



#' @rdname ancestral.pml
#' @export
anc_pars <- function(tree, data, type = c("MPR", "ACCTRAN", "POSTORDER"),
                           cost = NULL, ...) {
  #
  call <- match.call()
  type <- match.arg(type)
  assert_phylo(tree)
  assert_phyDat(data)
  tree$node.label <- makeAncNodeLabel(tree, ...)
  contrast <- attr(data, "contrast")
  data <- data[tree$tip.label,]

  prob <- ancestral.pars(tree, data, type, cost, tips=FALSE)
  joint <- joint_sankoff(tree, data, cost)
  ind <- identical_sites(data)
  if(length(ind)>0){
    for(k in seq_len(Nnode(tree))){
      prob[[k]][ind,] <- contrast[data[[1]][ind],]
      joint[[k]][ind] <- data[[1]][ind]
    }
  }
  erg <- list(tree=tree, data=data, prob=prob, state=joint, call=call)
  class(erg) <- "ancestral"
  erg
}



#' @rdname ancestral.pml
#' @export
pace <- ancestral.pars


mpr.help <- function(tree, data, cost = NULL) {
  tree <- reorder(tree, "postorder")
  levels <- attr(data, "levels")
  l <- length(levels)
  if (is.null(cost)) {
    cost <- matrix(1, l, l)
    cost <- cost - diag(l)
  }
  dat <- prepareDataSankoff(data)
  datp <- pnodes(tree, dat, cost)
  nr <- attr(data, "nr")
  nc <- attr(data, "nc")
  node <- as.integer(tree$edge[, 1] - 1L)
  edge <- as.integer(tree$edge[, 2] - 1L)
  res <- .Call('sankoffMPR', datp, as.numeric(cost), as.integer(nr),
    as.integer(nc), node, edge, as.integer(Nnode(tree)))
  root <- getRoot(tree)
  res[[root]] <- datp[[root]]
  res
}


mpr <- function(tree, data, cost = NULL, return="prob", tips=FALSE, ...) {
  data <- subset(data, tree$tip.label)
  att <- attributes(data)
  if(tips) att$names <- c(tree$tip.label, tree$node.label)
  else att$names <- tree$node.label
  type <- att$type
  nr <- att$nr
  nc <- att$nc
  res <- mpr.help(tree, data, cost)
  l <- length(tree$tip.label)
  m <- length(res)
  nNode <- Nnode(tree)
  ntips <- length(tree$tip.label)
  contrast <- att$contrast
  eps <- 5e-6
  rm <- apply(res[[ntips + 1]], 1, min)
  RM <- matrix(rm, nr, nc) + eps

  fun <- function(X) {
    rs <- rowSums(X) # apply(X, 1, sum)
    X / rs
  }
#  for (i in 1:ntips) res[[i]] <- contrast[data[[i]], , drop = FALSE]
  for (i in (ntips + 1):m) res[[i]][] <- as.numeric(res[[i]] < RM)
  if(tips)for(i in seq_len(ntips)) res[[i]] <- contrast[data[[i]],,drop=FALSE]
  else  res <- res[(ntips + 1):m]
  res_prob <- lapply(res, fun)
  attributes(res_prob) <- att
  if(return=="prob") return(res_prob)
#    class(res_prob) <- c("ancestral", "phyDat")
  #    else res[1:ntips] <- data[1:ntips]
  #fun2 <- function(x) {
  #  x <- p2dna(x)
  #  fitchCoding2ambiguous(x)
  #}
#  if (return != "prob") {
#  if (type == "DNA") {
#    res_state <- lapply(res, fun2)
#    attributes(res_state) <- att
#  }
#  else {
  attributes(res) <- att
  res_state <- highest_state(res)
  attributes(res_state) <- att
  if(tips) res_state[seq_len(ntips)] <- data
#  }
#    res[1:ntips] <- data
#  }
  res_state # list(res_prob, res_state)
}


ptree <- function(tree, data, acctran=TRUE, return = "prob", tips=FALSE, ...) {
  tree <- reorder(tree, "postorder")
  data <- subset(data, tree$tip.label)
  edge <- tree$edge
  att <- attributes(data)
  att$names <- c(tree$tip.label, tree$node.label)
  #else att$names <- tree$node.label
  nr <- att$nr
  type <- att$type
  m <- max(edge)
  nNode <- Nnode(tree)
  nTip <- Ntip(tree)
  f <- init_fitch(data, FALSE, FALSE, m=2L)
  f$traverse(edge)
  tmp <- reorder(tree)$edge
  tmp <- tmp[tmp[,2]>Ntip(tree),]
  if(length(tmp)>0 && acctran)f$acctran_traverse(tmp)
  res <- res_state <- vector("list", nNode)
  res <- vector("list", m)
  att$names <- c(tree$tip.label, tree$node.label) #makeAncNodeLabel(tree, ...)
#  else {
    fun <- function(X) {
      rs <- rowSums(X)
      X / rs
    }
    contrast <- att$contrast
    for(i in seq_len(nTip)) res[[i]] <- contrast[data[[i]], , drop=FALSE]
#    for(i in (nTip+1):m) res[[i]] <- f$getAnc(i)[1:nr, , drop=FALSE]
    for(i in seq_len(nNode)) {
      res[[i+nTip]] <- f$getAnc(i+nTip)[seq_len(nr), , drop=FALSE]
    }
    res <- lapply(res, fun)
    attributes(res) <- att
#    class(res) <- c("ancestral", "phyDat")
#  }
  if(!tips) res <- res[tree$node.label]

  if(return=="prob"){
#    if(tips)
    return(res)
#    else return(res[tree$node.label])
  }

#  if(type=="DNA"){
#    indx <- c(1, 2, 6, 3, 7, 9, 12, 4, 8, 10, 13, 11, 14, 15, 16)
#    res_state[seq_len(nTip)] <- data
#    for(i in (nTip+1):m)
#      res_state[[i]] <- indx[f$getAncAmb(i+nTip)[1:nr]]
#    attributes(res_state) <- att
#    #    return(res)
 # } else{
    res_state <- highest_state(res)
    attributes(res_state) <-  attributes(res)
#    class(res_state) <- "phyDat"
#  }
#  if(tips) return(res_state)
#  res_state[tree$node.label]
    res_state
}


makeAncNodeLabel <- function(tree, ...){
  if(!is.null(tree$node.label)){
    node_label <- tree$node.label
    if(length(unique(node_label)) == Nnode(tree)) return(node_label)
    else message("Node labels are not unique, used makeNodeLabel(tree, ...) to create them!")
  }
  tree <- makeNodeLabel(tree, ...)
  tree$node.label
}


identical_sites <- function(x){
  res <- rep(TRUE, attr(x, "nr"))
  for(i in seq_along(x)) res <- res & (x[[i]] == x[[1]])
  which(res)
}