R/getLevel.R

Defines functions getLevel

Documented in getLevel

#' Search for a target level on the tree via a specified score
#'
#' Search for the target level of the tree via a specified score. The score
#' value needs to be provided for each node of the tree.
#'
#' @author Ruizhu Huang
#' @export
#'
#' @param tree A \code{phylo} object.
#' @param score_data A \code{data.frame} providing scores for all nodes in the
#'     tree. The data frame should have at least 2 columns, one with information
#'     about nodes (the node number) and the other with the score for each node.
#' @param drop A logical expression indicating elements or rows to keep.
#'     Missing values are taken as \code{FALSE}.
#' @param score_column The name of the column of \code{score_data} that
#'     contains the original scores of the nodes.
#' @param node_column The name of the column of \code{score_data} that contains
#'     the node numbers.
#' @param get_max A logical scalar. If \code{TRUE}, search for nodes
#'     that has higher score value than its descendants; otherwise, search for
#'     nodes that has lower score value than its descendants.
#' @param parent_first A logical scalar. If \code{TRUE}, the parent
#'     node is selected if tied values occur on the parent node and some of the
#'     children nodes.
#' @param message A logical scalar indicating whether progress messages should
#'     be printed.
#'
#' @returns A \code{data.frame} similar to \code{score_data} but with an
#' additional column named \code{keep} indicating which nodes to retain.
#'
#' @importFrom methods is
#' @importFrom utils txtProgressBar setTxtProgressBar
#' @importFrom TreeSummarizedExperiment findDescendant printNode
#'
#' @examples
#' suppressPackageStartupMessages({
#'     library(TreeSummarizedExperiment)
#'     library(ggtree)
#' })
#'
#' data(tinyTree)
#' ggtree(tinyTree, branch.length = "none") +
#'     geom_text2(aes(label = node), color = "darkblue",
#'            hjust = -0.5, vjust = 0.7) +
#'     geom_text2(aes(label = label), color = "darkorange",
#'                hjust = -0.1, vjust = -0.7) +
#'     geom_hilight(node = 13, fill = "blue", alpha = 0.3) +
#'     geom_hilight(node = 16, fill = "orange", alpha = 0.3)
#'
#' ## Generate score for each node
#' pv <- rep(0.1, 19)
#' pv[c(16, 13, 17)] <- c(0.01, 0.05, 0.005)
#' out <- data.frame(node = 1:19, pvalue = pv)
#'
#' ## Search nodes
#' final <- getLevel(tree = tinyTree,
#'                   score_data = out,
#'                   drop =  pvalue > 0.05,
#'                   score_column = "pvalue",
#'                   node_column = "node",
#'                   get_max = FALSE,
#'                   parent_first = TRUE,
#'                   message = FALSE)
#'
#' ## Nodes to keep
#' final$node[final$keep]
#'
getLevel <- function(tree, score_data, drop, score_column,
                      node_column, get_max, parent_first = TRUE,
                      message = FALSE) {

    ## Check arguments
    ## -------------------------------------------------------------------------
    .assertVector(x = tree, type = "phylo")
    if (!methods::is(score_data, "data.frame")) {
        if (methods::is(score_data, "DataFrame")) {
            score_data <- data.frame(score_data)
        } else {
            stop("score_data should be a data.frame")
        }
    }
    .assertScalar(x = score_column, type = "character",
                  validValues = colnames(score_data))
    .assertScalar(x = node_column, type = "character",
                  validValues = colnames(score_data))
    .assertScalar(x = get_max, type = "logical")
    .assertScalar(x = parent_first, type = "logical")
    .assertScalar(x = message, type = "logical")

    ## Check that there is not already a column named 'keep' (will be added
    ## to hold the result of the search)
    if (any(colnames(score_data) == "keep")) {
        stop("The result will be output in the 'keep' column;
             Please use other name for the current 'keep' column.")
    }

    ## Filter nodes
    ## -------------------------------------------------------------------------
    if (message) {
        message("Preparing data ... ")
    }
    score_data$keep <- TRUE

    ## Drop nodes using the specified standard
    if (message) {
        message("Dropping nodes ... ")
    }
    if (missing(drop)) {
        r <- FALSE
    } else {
        e <- substitute(drop)
        r <- eval(expr = e, envir = score_data, enclos = parent.frame())

        if (!is.logical(r)) {
            stop("'drop' must be or evaluate to logical")
        }
        r <- r & !is.na(r)
    }
    score_data$keep[r] <- FALSE
    if (message) {
        message(sum(!score_data$keep), " nodes are dropped...")
    }

    ## Drop nodes without available score
    score_data$keep[is.na(score_data[, score_column])] <- FALSE
    if (message) {
        message(sum(is.na(score_data[, score_column])),
                " nodes with missing score are dropped...")
    }

    ## Search nodes
    ## -------------------------------------------------------------------------
    # This works only on nodes that are not dropped.
    # It compares the score on an internal node with those on the descendant
    # nodes.
    # 1. get_max = TRUE & parent_first = TRUE
    #   An internal node is selected if its score is not lower than the max
    #   score of its descendants.
    # 2. get_max = TRUE & parent_first = FALSE
    #   An internal node is selected if its score is above the max score
    #   of its descendants.
    # 3. get_max = FALSE & parent_first = TRUE
    #   An internal node is selected if its score is lower or equal to the min
    #   score of its descendants.
    # 4. get_max = FALSE & parent_first = FALSE
    #   An internal node is selected if its score is lower than the min score of
    #   its descendants.

    if (message) {
        message("Searching candidate nodes... ")
    }

    nodeKeep <- score_data[[node_column]][score_data$keep]
    treeData <- TreeSummarizedExperiment::printNode(tree = tree, type = "all")
    nodeIn <- treeData$nodeNum[!treeData$isLeaf]
    nodeI <- intersect(nodeKeep, nodeIn)

    if (message) {
        message("Searching the descendant nodes of the candidate nodes... ")
    }

    descI <- TreeSummarizedExperiment::findDescendant(
        tree = tree, node = nodeI, only.leaf = FALSE,
        self.include = FALSE, use.alias = TRUE)
    chlI <- findChild(tree = tree, node = nodeI)

    if (message) {
        message("Comparing nodes... ")
        if (length(descI)) {
            pb <- utils::txtProgressBar(0, length(descI), style = 3)
        }
    }

    for (i in seq_along(descI)) {
        node.i <- nodeI[i]
        child.i <- chlI[[i]]
        desc.i <- descI[[i]]

        row.p <- match(node.i, score_data[, node_column])
        row.c <- match(child.i, score_data[, node_column])
        row.d <- match(desc.i, score_data[, node_column])

        ## Extract the values on the parent and on the children
        score.p <- score_data[row.p, score_column]
        score.c <- score_data[row.c, score_column]
        score.d <- score_data[row.d, score_column]

        ## If more than half of the direct child nodes has NA score, don't take
        ## the parent node
        na_c <- sum(is.na(score.c))/length(score.c)
        na_d <- sum(is.na(score.d))/length(score.d)
        if (na_c >= 0.5 & na_d != 1) {
            score_data$keep[row.p] <- FALSE
        } else {
            ## If the direct child nodes all have score values, do the following
            ## comparison
            if (get_max) {
                score.p[is.na(score.p)] <- -Inf
                score.d[is.na(score.d)] <- -Inf
                if (parent_first) {
                    isKeep <- all(score.p >= score.d)
                } else {
                    isKeep <- all(score.p > score.d)
                }
            } else {
                score.p[is.na(score.p)] <- Inf
                score.d[is.na(score.d)] <- Inf
                if (parent_first) {
                    isKeep <- all(score.p <= score.d)
                } else {
                    isKeep <- all(score.p < score.d)
                }
            }

            if (isKeep) {
                score_data$keep[row.d] <- FALSE
            } else {
                score_data$keep[row.p] <- FALSE
            }
        }
        if (message) {
            utils::setTxtProgressBar(pb, i)
        }
    }
    return(score_data)
}
fionarhuang/treeclimbR documentation built on Aug. 7, 2024, 12:44 p.m.