phurl: Phylogenetic modelling Using Regularized Linear models

Documented in prl_prepare_data

#' Prepare data for analysing with phurl models
#'
#' @param tree Phylogenetic tree of class \code{\link[ape]{phylo}} or \code{\link[tibble:tbl_df-class]{tibble}}
#' (e.g. using \code{\link[tidytree]{as_tibble}})
#' @param traits Trait data. Can be a named \code{vector}, a \code{matrix} or \code{data.frame} with named rows, where the rownames
#' are the species names, or a \code{data.frame} or \code{\link[tibble:tbl_df-class]{tibble}} with a column specifying the
#' species name. If species names are not referenced in either the rownames or a column, data will be matched
#' to the tree using order only (e.g. assuming data rows are in the same order as \code{tree$tip.label}). Can also
#' be \code{NULL} in which case only the predictor variables derived from the phylogeny are generated. This can
#' be useful to feed into the simulation functions, or to use in a custom model somewhere else.
#' @param species_column Character value specifying which column of \code{traits} refers to the species
#' names, if they are not specified in the rownames. If not \code{NULL}, this will override any rownames in
#' \code{traits}
#' @param scale_edges Should the edges be scaled to a maximum of 1? This is highly recommended for regularised
#' models. Default TRUE.
#' @param centre Should the resulting edge path predictor matrix be centred to so that each column has a mean
#' of 0? This is recommended to improve convergence of the models. Default TRUE.
#' @param transform_traits List of transformations to apply to trait values. If named, will be matched to column
#' names, else they will be applied in order, and recycled if necessary.
#' @param standardise_traits Should traits values be standardised by subtracting the mean and dividing by
#' them standard deviation? Recommended. Default TRUE.
#'
#' @return A object of class \code{\link{phurl}}
#' @export
#'
#' @examples
prl_prepare_data <- function(tree, traits = NULL, species_column = NULL,
                               scale_edges = TRUE, centre = TRUE,
                               transform_traits = NULL,
                               standardise_traits = TRUE) {

  ## generate x data from tree
  temp_tree <- tree
  max_depth <- max(ape::node.depth.edgelength(temp_tree))
  if(scale_edges) {
    temp_tree$edge.length <- temp_tree$edge.length / max_depth
  }
  temp_tree$root.edge <- 1
  #tip_names <- dplyr::tibble(tip_name = temp_tree$tip.label, node = 1:length(temp_tree$tip.label))
  tree_dat <- tidytree::as_tibble(temp_tree) %>%
    dplyr::filter(node <= ape::Ntip(temp_tree))
  
  tip_labels <- tree_dat$node %>%
    as.data.frame() %>%
    unlist()
  names(tip_labels) <- tree_dat$label
  
  temp_tree2 <- temp_tree
  temp_tree2$tip.label[temp_tree2$tip.label %in% names(tip_labels)] <- tip_labels[temp_tree2$tip.label[temp_tree2$tip.label %in% names(tip_labels)]]

  edge_mat <- RRphylo::makeL(temp_tree2)
  tip_df <- t(apply(edge_mat, 1, function(x) {x[x != 0][-1] <- rev(cumsum(rev(x[x != 0][-1]))); x})) %>%
    dplyr::as_tibble() %>%
    stats::setNames(paste0("node_", names(.))) %>%
    dplyr::mutate(species = names(tip_labels)) %>%
    dplyr::select(species, dplyr::everything())

  root_node <- list(root_value = dplyr::sym(paste0("node_", length(temp_tree$tip.label) + 1)))

  first_splits_edges <- temp_tree$edge[temp_tree$edge[ , 1] == (length(temp_tree$tip.label) + 1), 2]
  first_splits_name <- paste0("first_split_", first_splits_edges)
  first_splits = list()
  first_splits[[first_splits_name[1]]] <- dplyr::sym(paste0("node_", first_splits_edges[1]))
  first_splits[[first_splits_name[2]]] <- dplyr::sym(paste0("node_", first_splits_edges[2]))

  tip_df <- tip_df %>%
    dplyr::rename(!!!root_node) %>%
    dplyr::rename(!!!first_splits)


  node_mat <- RRphylo::makeL1(temp_tree)
  node_df <- t(apply(node_mat, 1, function(x) {x[x != 0][-1] <- rev(cumsum(rev(x[x != 0][-1]))); x})) %>%
    dplyr::as_tibble() %>%
    stats::setNames(paste0("node_", names(.))) %>%
    dplyr::mutate(node = paste0("node_", rownames(node_mat))) %>%
    dplyr::select(node, dplyr::everything())

  first_splits_node <- first_splits[first_splits %in% colnames(node_df)]
  
  node_df <- node_df %>%
    dplyr::rename(!!!root_node) %>%
    dplyr::rename(!!!first_splits_node)

  if(centre) {
    tip_df <- tip_df %>%
      dplyr::mutate_at(dplyr::vars(-species, -root_value),
                       ~ (. - mean(., na.rm = TRUE)))

    node_df <- node_df %>%
      dplyr::mutate_at(dplyr::vars(-node, -root_value),
                       ~ (. - mean(., na.rm = TRUE)))
  }


  nodes_on_path_to_tips <- apply(edge_mat, 1, function(x) paste0("node_", names(which(x > 0))))
  names(nodes_on_path_to_tips) <- paste0("node_", names(nodes_on_path_to_tips))
  nodes_on_path_to_nodes <- apply(node_mat, 1, function(x) paste0("node_", names(which(x > 0))))
  names(nodes_on_path_to_nodes) <- paste0("node_", names(nodes_on_path_to_nodes))

  ## generate y data from traits
  if(!is.null(traits)){
    if(!is.null(species_column)){
      tree_df <- tree_dat %>%
        dplyr::left_join(traits, by = c("label" = species_column)) %>%
        dplyr::rename(species = label)
    } else {
      if(is.vector(traits)) {
        if(is.null(names(traits))) {
          trait_df <- dplyr::tibble(species = temp_tree$tip.label,
                                    trait = traits)
          warning("traits object has no names, assuming same order as tree$tip.label")
        } else {
          trait_df <- dplyr::tibble(species = names(traits),
                                    trait = traits)
        }
      } else {
         if(!is.null(rownames(traits))) {
          trait_df <- traits %>%
            as.data.frame() %>%
            tibble::rownames_to_column("species")
         } else {
           trait_df <- traits %>%
             as.data.frame() %>%
             dplyr::mutate(species = temp_tree$tip.label) %>%
             dplyr::select(species, dplyr::everything())
            warning("traits object has no rownames, assuming same order as tree$tip.label")
          }
      }
      tree_df <- tree_dat %>%
        dplyr::left_join(trait_df, by = c("label" = "species")) %>%
        dplyr::rename(species = label)
    }

    if(standardise_traits) {
      mean_sd <- tree_df %>%
        dplyr::summarise_at(dplyr::vars(-species, -node, -branch.length, -parent),
                            list(mean = ~mean(., na.rm = TRUE), sd = ~sd(., na.rm = TRUE)))
      tree_df <- tree_df %>%
        dplyr::mutate_at(dplyr::vars(-species, -node, -branch.length, -parent),
                         ~ (. - mean(., na.rm = TRUE)) / sd(., na.rm = TRUE))
    } else {
      tree_df <- tree_df
      mean_sd <- NULL
    }
  } else {
    tree_df <- NULL
    mean_sd <- NULL
  }

  phurl_ob <- list(phylo = temp_tree,
                   y_data = tree_df,
                   x_data_tips = tip_df,
                   x_data_nodes = node_df,
                   nodes_on_path_to_tips = nodes_on_path_to_tips,
                   nodes_on_path_to_nodes = nodes_on_path_to_nodes,
                   model = NULL,
                   plots = NULL,
                   phylo_unmodified = tree,
                   edge_scaled = scale_edges,
                   edge_centred = centre,
                   y_standardisation = mean_sd,
                   first_splits = first_splits,
                   root_node = root_node)
  class(phurl_ob) <- "phurl"
  phurl_ob
}