R/as.treedata.table.R

Defines functions as.treedata.table

Documented in as.treedata.table

#' Combine tree (or set of trees) and data.frame into a single treedata.table
#' object
#'
#' This function takes as input a tree of class `phylo` or `multiPhylo` and a
#' `data.frame` and combines them into a treedata.table. If a `multiPhylo` is
#' provided, all trees must have the same tip.labels. `treedata.table` object is
#' sorted such that the rows in the data.table are matched to the tip.labels
#' of the phylogeny. Tip.labels on the tree must match a column of tip
#' names in the input data.frame. The output of this function will be a
#' treedata.table, which can be manipulated as a data.table.
#'
#' @importFrom data.table as.data.table
#' @param tree A tree of class `phylo` or multiple trees of class `multiPhylo`
#' @param data A dataset in format `data.frame`
#' @param name_column A character indicating the name of taxa in `data.frame`.
#' If set to `detect` (default) `as treedata.table` will auto-detect this
#' column
#' @return An object of type `treedata.table` containing the tree and data.table
#' @importFrom ape drop.tip
#' @importFrom geiger name.check
#' @examples
#'
#' data(anolis)
#' anolis2 <- anolis$phy
#' anolis2$tip.label[1] <- "NAA"
#' anolis1 <- anolis$phy
#' anolis1$tip.label[1] <- "NAA"
#' trees <- list(anolis1, anolis2)
#' class(trees) <- "multiPhylo"
#' treesFM <- list(anolis$phy, anolis$phy)
#' class(treesFM) <- "multiPhylo"
#'
#' # A phylo object that fully matches the data
#' td <- as.treedata.table(tree = anolis$phy, data = anolis$dat)
#' # A multiphylo object that fully matches the data
#' td <- as.treedata.table(tree = treesFM, data = anolis$dat)
#' # A phylo object that partially matches the data
#' td <- as.treedata.table(tree = anolis1, data = anolis$dat)
#' # A multiphylo object that partially matches the data
#' td <- as.treedata.table(tree = trees, data = anolis$dat)
#' @export

as.treedata.table <- function(tree, data, name_column = "detect") {
  if (!inherits(tree, c("multiPhylo", "phylo"))) {
    stop("Please use a class 'phylo' or 'multiPhylo' tree \n")
  }
  if (inherits(tree, "multiPhylo")) {
    equal_T <- length(unique(lapply(seq_along(tree), function(x) {
      sort(tree[[x]]$tip.label)
    }))) == 1
    if (!equal_T) {
      stop("Tip labels must be equivalent across trees in multiPhylo object")
    }
  }

  if (!inherits(data, "data.frame")) {
    stop("Your data MUST be of class data.frame")
  }
  # if(dim(data)[2] < 2){
  #  stop("Your data MUST have at least two columns (tip.names; nstates)")
  #  }
  if (is.vector(data)) {
    data <- data.frame(as.matrix(data))
    colnames(data) <- "trait"
  }
  if (is.null(colnames(data))) {
    colnames(data) <- paste("trait", seq_along(data), sep = "")
  }
  coln <- colnames(data)
  if (name_column == "detect") {
    if (is.null(rownames(data))) {
      tmp.df <- data.frame(data)
      offset <- 0
    } else {
      tmp.df <- data.frame(rownames(data), data)
      offset <- 1
    }

    matches <- vapply(tmp.df, function(x) {
      sum(x %in% if (inherits(tree, "phylo")) {
        tree$tip.label
      } else {
        tree[[1]]$tip.label
      })
    }, integer(1))

    if (all(matches == 0)) stop("No matching names found between data and tree")
    name_column <- which(matches == max(matches)) - offset
    message("Tip labels detected in column: ", colnames(data)[name_column])
  } else {
    if (is.character(name_column)) {
      name_column <- which(name_column == coln)[1]
    }
  }


  if (inherits(tree, "phylo")) {
    message("Phylo object detected \n")
    if (geiger::name.check(tree, data.names = data[, name_column])[1] != "OK") {
      data_not_tree <- setdiff(
        as.character(data[, name_column]),
        tree$tip.label
      )
      tree_not_data <- setdiff(tree$tip.label, data[, name_column])
      message(
        length(c(tree_not_data)), " tip(s) dropped from the original tree",
        "\n", length(c(data_not_tree)),
        " row(s) dropped from the original dataset"
      )
      tree <- ape::drop.tip(tree, tree_not_data)

      data<-if( length(data_not_tree) != 0){
        data[!as.character(data[, name_column]) ==
               data_not_tree, ] }else{
                 data
               }

    } else {
      message("All tips from original tree/dataset were preserved")
      data_not_tree <- "OK"
      tree_not_data <- "OK"
    }
  } else {
    message("Multiphylo object detected \n")

    if (geiger::name.check(tree[[1]],
      data.names = data[, name_column]
    )[1] != "OK") {
      data_not_tree <- setdiff(
        as.character(data[, name_column]),
        tree[[1]]$tip.label
      )
      tree_not_data <- setdiff(tree[[1]]$tip.label, data[, name_column])

      tree <- lapply(tree, ape::drop.tip, tip = tree_not_data)
      class(tree) <- "multiPhylo"
      data<-if( length(data_not_tree) != 0){
        data[!as.character(data[, name_column]) ==
               data_not_tree, ] }else{
                 data
               }
      message(
        length(c(tree_not_data)), " tip(s) dropped from ", length(tree),
        " trees", "\n", length(c(data_not_tree)), " row(s) dropped from the",
                                                    " original dataset"
      )
    } else {
      message("All tips from original tree/dataset were preserved")
      data_not_tree <- "OK"
      tree_not_data <- "OK"
    }
  }

  i <- vapply(data, is.factor, logical(1))
  data[i] <- lapply(data[i], as.character)

  data <- if (inherits(tree, "phylo")) {
    data[match(tree$tip.label, data[, name_column]), ]
  } else {
    data[match(tree[[1]]$tip.label, data[, name_column]), ]
  }


  colnames(data)[name_column] <- "tip.label"
  dr <- which(tree$tip.label %in% c(tree_not_data, data_not_tree))

  tree <- if (inherits(tree, "phylo")) {
    ape::drop.tip(tree, dr)
  } else {
    nt <- lapply(tree, ape::drop.tip, tip = dr)
    class(nt) <- "multiPhylo"
    nt
  }
  data <- data.table::as.data.table(data)[!dr]
  comb <- list(phy = tree, dat = data)
  attr(comb, "data_not_tree") <- data_not_tree
  attr(comb, "tree_not_data") <- tree_not_data
  class(comb) <- "treedata.table"
  return(comb)
}
ropensci/treedata.table documentation built on Sept. 12, 2021, 6:23 p.m.