R/read.tree.R

## read.tree.R (2010-09-27)

##   Read Tree Files in Parenthetic Format

## Copyright 2002-2010 Emmanuel Paradis, Daniel Lawson and Klaus Schliep

## This file is part of the R-package `ape'.
## See the file ../COPYING for licensing issues.

tree.build <- function(tp)
{
    add.internal <- function() {
        edge[j, 1] <<- current.node
        edge[j, 2] <<- current.node <<- node <<- node + 1L
        index[node] <<- j # set index
        j <<- j + 1L
    }
    add.terminal <- function() {
        edge[j, 1] <<- current.node
        edge[j, 2] <<- tip
        index[tip] <<- j # set index
        X <- unlist(strsplit(tpc[k], ":"))
        tip.label[tip] <<- X[1]
        edge.length[j] <<- as.numeric(X[2])
        k <<- k + 1L
        tip <<- tip + 1L
        j <<- j + 1L
    }
    go.down <- function() {
        l <- index[current.node]
        X <- unlist(strsplit(tpc[k], ":"))
        node.label[current.node - nb.tip] <<- X[1]
        edge.length[l] <<- as.numeric(X[2])
        k <<- k + 1L
        current.node <<- edge[l, 1]
    }
    if (!length(grep(",", tp))) {
        obj <- list(edge = matrix(c(2L, 1L), 1, 2))
        tp <- unlist(strsplit(tp, "[\\(\\):;]"))
        obj$edge.length <- as.numeric(tp[3])
        obj$Nnode <- 1L
        obj$tip.label <- tp[2]
        if (tp[4] != "") obj$node.label <- tp[4]
        class(obj) <- "phylo"
        return(obj)
    }

    tpc <- unlist(strsplit(tp, "[\\(\\),;]"))
    tpc <- tpc[nzchar(tpc)]
    ## the following 2 lines are (slightly) faster than using gsub()
    tsp <- unlist(strsplit(tp, NULL))
    skeleton <- tsp[tsp %in% c("(", ")", ",", ";")]
    nsk <- length(skeleton)
    nb.node <- sum(skeleton == ")")
    nb.tip <- sum(skeleton == ",") + 1
    ## We will assume there is an edge at the root;
    ## if so, it will be removed and put into a vector
    nb.edge <- nb.node + nb.tip
    node.label <- character(nb.node)
    tip.label <- character(nb.tip)

    edge.length <- numeric(nb.edge)
    edge <- matrix(0L, nb.edge, 2)
    current.node <- node <- as.integer(nb.tip + 1) # node number
    edge[nb.edge, 2] <- node
    index <- numeric(nb.edge + 1) # hash index to avoid which
    index[node] <- nb.edge

    ## j: index of the line number of edge
    ## k: index of the line number of tpc
    ## tip: tip number
    j <- k <- tip <- 1L

    for (i in 2:nsk) {
        if (skeleton[i] == "(") add.internal() # add an internal branch (on top)
        if (skeleton[i] == ",") {
            if (skeleton[i - 1] != ")") add.terminal() # add a terminal branch
        }
        if (skeleton[i] == ")") {
            if (skeleton[i - 1] != ")") { # add a terminal branch and go down one level
                add.terminal()
                go.down()
            }
            if (skeleton[i - 1] == ")") go.down() # go down one level
        }
    }

    edge <- edge[-nb.edge, ]
    obj <- list(edge = edge, Nnode = nb.node, tip.label = tip.label)
    root.edge <- edge.length[nb.edge]
    edge.length <- edge.length[-nb.edge]
    if (!all(is.na(edge.length))) # added 2005-08-18
        obj$edge.length <- edge.length
    if (is.na(node.label[1])) node.label[1] <- ""
    if (any(nzchar(node.label))) obj$node.label <- node.label
    if (!is.na(root.edge)) obj$root.edge <- root.edge
    class(obj) <- "phylo"
    obj
}

read.tree <- function(file = "", text = NULL, tree.names = NULL, skip = 0,
    comment.char = "#", keep.multi = FALSE, ...)
{
    unname <- function(treetext) {
        nc <- nchar(treetext)
	tstart <- 1
	while (substr(treetext, tstart, tstart) != "(" && tstart <= nc)
            tstart <- tstart + 1
	if (tstart > 1)
            return(c(substr(treetext, 1, tstart - 1),
                     substr(treetext, tstart, nc)))
	return(c("", treetext))
    }
    if (!is.null(text)) {
        if (!is.character(text))
          stop("argument `text' must be of mode character")
        tree <- text
    } else {
        tree <- scan(file = file, what = "", sep = "\n", quiet = TRUE,
                     skip = skip, comment.char = comment.char, ...)
    }
    ## Suggestion from Eric Durand and Nicolas Bortolussi (added 2005-08-17):
    if (identical(tree, character(0))) {
        warning("empty character string.")
        return(NULL)
    }
    tree <- gsub("[ \t]", "", tree)
    tree <- unlist(strsplit(tree, NULL))
    y <- which(tree == ";")
    Ntree <- length(y)
    x <- c(1, y[-Ntree] + 1)
    ## Suggestion from Olivier François (added 2006-07-15):
    if (is.na(y[1])) return(NULL)
    STRING <- character(Ntree)
    for (i in 1:Ntree)
        STRING[i] <- paste(tree[x[i]:y[i]], sep = "", collapse = "")

    tmp <- unlist(lapply(STRING, unname))
    tmpnames <- tmp[c(TRUE, FALSE)]
    STRING <- tmp[c(FALSE, TRUE)]
    if (is.null(tree.names) && any(nzchar(tmpnames)))
        tree.names <- tmpnames

    colon <- grep(":", STRING)
    if (!length(colon)) {
        obj <- lapply(STRING, clado.build)
    } else if (length(colon) == Ntree) {
        obj <- lapply(STRING, tree.build)
    } else {
        obj <- vector("list", Ntree)
        obj[colon] <- lapply(STRING[colon], tree.build)
        nocolon <- (1:Ntree)[!1:Ntree %in% colon]
        obj[nocolon] <- lapply(STRING[nocolon], clado.build)
    }
    for (i in 1:Ntree) {
        ## Check here that the root edge is not incorrectly represented
        ## in the object of class "phylo" by simply checking that there
        ## is a bifurcation at the root
        ROOT <- length(obj[[i]]$tip.label) + 1
        if(sum(obj[[i]]$edge[, 1] == ROOT) == 1 && dim(obj[[i]]$edge)[1] > 1)
            stop(paste("There is apparently two root edges in your file: cannot read tree file.\n  Reading Newick file aborted at tree no.", i, sep = ""))
    }
    if (Ntree == 1 && !keep.multi) obj <- obj[[1]] else {
        if (!is.null(tree.names)) names(obj) <- tree.names
        class(obj) <- "multiPhylo"
    }
    obj
}
gjuggler/ape documentation built on May 17, 2019, 6:03 a.m.