phylopath: Perform Phylogenetic Path Analysis

Documented in phylo_path

#' Compare causal models in a phylogenetic context.
#'
#' Continuous variables are modeled using [phylolm::phylolm], while binary
#' traits are modeled using [phylolm::phyloglm].
#'
#' *Parallel processing*: From v1.2, `phylopath` uses the `future` framework
#' for parallel processing. This is compatible with the parallel computation
#' within the underlying `phylolm`, making it easy to enable parallel
#' processing of multiple models, and of bootstrap replicates. To enable,
#' simply set a parallel `plan()` using the `future` package. Typically, you'll
#' want to run `future::plan("multisession", workers = n)`, where `n` is the
#' number of cores. Now parallel processing is enabled. Return to sequential
#' processing using `future::plan("sequential")`
#'
#' @param model_set A list of directed acyclic graphs. These are matrices,
#'   typically created with \code{define_model_set}.
#' @param data A \code{data.frame} with data. If you have binary variables, make
#'   sure they are either character values or factors!
#' @param tree A phylogenetic tree of class \code{phylo}.
#' @param model The evolutionary model used for the regressions on continuous
#'   variables. See [phylolm::phylolm] for options and details. Defaults to
#'   Pagel's lambda model
#' @param method The estimation method for the binary models. See
#'   [phylolm::phyloglm] for options and details. Defaults to logistic MPLE.
#' @param order Causal order of the included variable, given as a character
#'   vector. This is used to determine which variable should be the dependent
#'   in the dsep regression equations. If left unspecified, the order will be
#'   automatically determined. If the combination of all included models is
#'   itself a DAG, then the ordering of that full model is used. Otherwise,
#'   the most common ordering between each pair of variables is used to create
#'   a general ordering.
#' @param parallel *Superseded* From v1.2 `phylopath` uses the `future` package
#'   for all parallel processing, see details.
#' @param na.rm Should rows that contain missing values be dropped from the data
#'   as necessary (with a message)?
#' @param ...
#'   Arguments passed on to `phylolm`:
#'
#'   `lower.bound`: optional lower bound for the optimization of the phylogenetic model parameter.
#'
#'   `upper.bound`: optional upper bound for the optimization of the phylogenetic model parameter.
#'
#'   `starting.value`: optional starting value for the optimization of the phylogenetic model parameter.
#'
#'   `measurement_error`: a logical value indicating whether there is measurement error sigma2_error (see Details).
#'
#'   Arguments passed on to `phyloglm`:
#'
#'   `btol`: bound on the linear predictor to bound the searching space.
#'
#'   `log.alpha.bound`: bound for the log of the parameter alpha.
#'
#'   `start.beta`: starting values for beta coefficients.
#'
#'   `start.alpha`: starting values for alpha (phylogenetic correlation).
#'
#'
#' @return A phylopath object, with the following components:
#'  \describe{
#'   \item{d_sep}{for each model a table with separation statements and statistics.}
#'   \item{model_set}{the DAGs}
#'   \item{data}{the supplied data}
#'   \item{tree}{the supplied tree}
#'   \item{model}{the employed model of evolution in `phylolm`}
#'   \item{method}{the employed method in `phyloglm`}
#'   \item{dots}{any additional arguments given, these are passed on to downstream functions}
#'   \item{warnings}{any warnings generated by the models}
#'   }
#' @export
#' @examples
#'   #see vignette('intro_to_phylopath') for more details
#'   candidates <- define_model_set(
#'     A = NL ~ BM,
#'     B = NL ~ LS,
#'     .common = c(LS ~ BM, DD ~ NL)
#'   )
#'   p <- phylo_path(candidates, rhino, rhino_tree)
#'
#'   # Printing p gives some general information:
#'   p
#'   # And the summary gives statistics to compare the models:
#'   summary(p)
#'
phylo_path <- function(model_set, data, tree, model = 'lambda', method = 'logistic_MPLE',
                       order = NULL, parallel = NULL, na.rm = TRUE, ...) {
  # Always coerce to data.frame, as tibbles and data.tables do NOT play nice.
  data <- as.data.frame(data)
  dots <- list(...)

  # check for users passing boot as an optional parameter, which has been deprecated
  if ('boot' %in% names(dots)) {
    warning('Using `boot` has been deprecated here, pass to choice(), best(), average() or est_DAG() instead.')
    dots <- dots[names(dots) != 'boot']
  }

  tmp <- check_models_data_tree(model_set, data, tree, na.rm)
  model_set <- tmp$model_set
  data <- tmp$data
  tree <- tmp$tree

  if (!is.null(order)) {
    # check for duplications in the order vector, when manually supplied
    if (length(order) > length(unique(order))) {
      stop('The supplied `order` argument must contain each variable exactly once.')
    }
    if (length(order) != ncol(data)) {
      stop('Not all variables are included in `order`, missing variables: ',
           setdiff(colnames(data), order))
    }
  } else {
    order <- find_consensus_order(model_set)
  }

  formulas <- lapply(model_set, find_formulas, order)
  formulas <- purrr::map(
    formulas,
    ~purrr::map(.x, ~{attr(., ".Environment") <- NULL; .})
  )
  f_list <- unique(unlist(formulas))
  if (!is.null(parallel)) {
    warning('Use of the `parallel` argument has been superseded by the use of `future`. See documentation for details.')
  }
  dsep_models_runs <- future.apply::future_lapply(
    f_list, phylo_g_lm,
    data = data, tree = tree, model = model, method = method, dots = dots,
    future.seed = TRUE
  )
  # Produce appropriate error if needed
  errors <- purrr::map(dsep_models_runs, 'error')
  purrr::map2(
    errors, f_list,
    ~if(!is.null(.x))
      stop(paste(
        'Fitting the following model:\n   ',
        Reduce(paste, deparse(.y)),
        '\nproduced this error:\n   ',
        .x
      ), call. = FALSE)
  )
  # Collect warnings as well, but save those for later.
  warnings <- purrr::map(dsep_models_runs, 'warning')
  warnings <- purrr::map2(
    warnings, f_list,
    ~if(!is.null(.x))
      paste(
        'Fitting the following model:\n   ',
        Reduce(paste, deparse(.y)),
        '\nproduced this/these warning(s):\n   ',
        .x, '\n'
      )
  )
  warnings <- warnings[!sapply(warnings, is.null)]
  if (length(warnings) > 1) {
    warning('Some models produced warnings. Use `show_warnings()` to view them.\n')
  }

  # Collect models.
  dsep_models <- purrr::map(dsep_models_runs, 'result')
  dsep_models <- purrr::map(formulas, ~dsep_models[match(.x, f_list)])

  d_sep <- purrr::map2(
    formulas,
    dsep_models,
    ~tibble::tibble(
      d_sep = as.character(.x),
      p = purrr::map_dbl(.y, get_p),
      phylo_par = purrr::map_dbl(.y, get_phylo_param),
      model = .y
    )
  )

  out <- list(
    d_sep = d_sep, model_set = model_set, data = data, tree = tree, model = model, method = method,
    dots = dots, warnings = warnings
  )
  class(out) <- 'phylopath'
  return(out)
}

#' @export
summary.phylopath <- function(object, ...) {
  phylopath <- object
  stopifnot(inherits(phylopath, 'phylopath'))
  n <- nrow(phylopath$data)
  k <- sapply(phylopath$d_sep, nrow)
  q <- sapply(phylopath$model_set, function(m) nrow(m) + sum(m))
  C <- sapply(phylopath$d_sep, function(x) C_stat(x$p))
  p <- C_p(C, k)
  IC <- CICc(C, q, n)

  if (n <= max(q) + 1) {
    IC[n <= q] <- NA
    warning('CICc was not calculated for causal models where the number of parameters is equal to',
            'or larger than the number of species.')
  }

  d <- data.frame(
    model = names(phylopath$model_set), k = k, q = q, C = C, p = p,
    CICc = IC, stringsAsFactors = FALSE
  )
  d <- d[order(d$CICc), ]
  d$delta_CICc <- d$CICc - d$CICc[1]
  d$l <- l(d$delta_CICc)
  d$w <- w(d$l)
  class(d) <- c('phylopath_summary', 'data.frame')
  return(d)
}

#' Extract and estimate the best supported model from a phylogenetic path
#' analysis.
#'
#' @param phylopath An object of class \code{phylopath}.
#' @param ... Arguments to pass to [phylolm::phylolm] and [phylolm::phyloglm]. Provide `boot = K`
#'   parameter to enable bootstrapping, where `K` is the number of bootstrap replicates. If you
#'   specified other options in the original [phylo_path] call you don't need to specify them again.
#'
#' @return An object of class \code{fitted_DAG}.
#' @export
#'
#' @examples
#'   candidates <- define_model_set(
#'     A = NL ~ BM,
#'     B = NL ~ LS,
#'     .common = c(LS ~ BM, DD ~ NL)
#'   )
#'   p <- phylo_path(candidates, rhino, rhino_tree)
#'   best_model <- best(p)
#'   # Print the best model to see coefficients, se and ci:
#'   best_model
#'   # Plot to show the weighted graph:
#'   plot(best_model)
#'
best <- function(phylopath, ...) {
  stopifnot(inherits(phylopath, 'phylopath'))
  dots <- combine_dots(phylopath$dots, ...)

  b <- summary(phylopath)[1, 'model']
  best_model <- phylopath$model_set[[b]]
  do.call(
    est_DAG,
    c(list(best_model, phylopath$data, phylopath$tree, phylopath$model, phylopath$method), dots)
  )
}

#' Extract and estimate an arbitrary model from a phylogenetic path analysis.
#'
#' @param phylopath An object of class \code{phylopath}.
#' @param choice A character string of the name of the model to be chosen, or
#'   the index in \code{model_set}.
#' @param ... Arguments to pass to [phylolm::phylolm] and [phylolm::phyloglm]. Provide `boot = K`
#'   parameter to enable bootstrapping, where `K` is the number of bootstrap replicates. If you
#'   specified other options in the original [phylo_path] call you don't need to specify them again.
#'
#' @return An object of class \code{fitted_DAG}.
#' @export
#'
#' @examples
#'   candidates <- define_model_set(
#'     A = NL ~ BM,
#'     B = NL ~ LS,
#'     .common = c(LS ~ BM, DD ~ NL)
#'   )
#'   p <- phylo_path(candidates, rhino, rhino_tree)
#'   my_model <- choice(p, "B")
#'   # Print the best model to see coefficients, se and ci:
#'   my_model
#'   # Plot to show the weighted graph:
#'   plot(my_model)
#'
choice <- function(phylopath, choice, ...) {
  stopifnot(inherits(phylopath, 'phylopath'))
  dots <- combine_dots(phylopath$dots, ...)

  do.call(
    est_DAG,
    c(
      list(
        phylopath$model_set[[choice]], phylopath$data, phylopath$tree, phylopath$model,
        phylopath$method),
      dots
    )
  )
}

#' Extract and average the best supported models from a phylogenetic path
#' analysis.
#'
#' @param phylopath An object of class `phylopath`.
#' @param cut_off The CICc cut-off used to select the best models. Use
#'   `Inf` to average over all models. Use the [best()] function to
#'   only use the top model, or [choice()] to select any single model.
#' @param ... Arguments to pass to [phylolm::phylolm] and [phylolm::phyloglm]. Provide `boot = K`
#'   parameter to enable bootstrapping, where `K` is the number of bootstrap replicates. If you
#'   specified other options in the original [phylo_path] call you don't need to specify them again.
#'
#' @inheritParams average_DAGs
#'
#' @return An object of class `fitted_DAG`.
#' @export
#'
#' @examples
#'   candidates <- define_model_set(
#'     A = NL ~ RS,
#'     B = RS ~ NL + BM,
#'     .common = c(LS ~ BM, DD ~ NL, NL ~ BM)
#'   )
#'   p <- phylo_path(candidates, rhino, rhino_tree)
#'   summary(p)
#'
#'   # Models A and B have similar support, so we may decide to take
#'   # their average.
#'
#'   avg_model <- average(p)
#'   # Print the average model to see coefficients, se and ci:
#'   avg_model
#'
#'   \dontrun{
#'   # Plot to show the weighted graph:
#'   plot(avg_model)
#'
#'   # One can see that an averaged model is not necessarily a DAG itself.
#'   # This model actually has a path in two directions.
#'
#'   # Note that coefficients that only occur in one of the models become much
#'   # smaller when we use full averaging:
#'
#'   coef_plot(avg_model)
#'   coef_plot(average(p, method = 'full'))
#'   }
#'
average <- function(phylopath, cut_off = 2, avg_method = 'conditional', ...) {
  stopifnot(inherits(phylopath, 'phylopath'))
  dots <- combine_dots(phylopath$dots, ...)

  d <- summary(phylopath)
  b <- d[d$delta_CICc < cut_off, ]

  best_models <- lapply(
    phylopath$model_set[b$model],
    function(x) {
      do.call(
        est_DAG,
        c(
          list(
            DAG = x, data = phylopath$data, tree = phylopath$tree, model = phylopath$model,
            method = phylopath$method),
          dots
        )
      )
    }
  )
  average <- average_DAGs(best_models, b$w, avg_method)
  class(average$coef) <- c('matrix', 'DAG')
  return(average)
}