R/mcpfit_methods.R


# ABOUT: These are non-plotting functions that take an mcpfit as the first argument
# -----------------

#' Class `mcpfit` of models fitted with the \pkg{mcp} package
#'
#' Models fitted with the \code{\link[mcp:mcp]{mcp}} function are represented as
#' an `mcpfit` object which contains the user input (model, data, family),
#' derived model characteristics (prior, parameter names, and jags code), and
#' the fit (prior and/or posterior mcmc samples).
#'
#' @name mcpfit-class
#' @aliases mcpfit
#' @docType class
#'
#' @details
#' See `methods(class = "mcpfit")` for an overview of available methods.
#'
#' User-provided information (see \code{\link{mcp}} for more details):
#' @slot model A list of formulas, making up the model.
#'   Provided by user. See \code{\link{mcp}} for more details.
#' @slot data A data frame.
#'   Provided by user. See \code{\link{mcp}} for more details.
#' @slot family An `mcpfamily` object.
#'   Provided by user. See \code{\link{mcp}} for more details.
#' @slot prior A named list.
#'   Provided by user. See \code{\link{mcp}} for more details.
#' @slot mcmc_post An \code{\link[coda]{mcmc.list}} object with posterior samples.
#' @slot mcmc_prior An \code{\link[coda]{mcmc.list}} object with prior samples.
#' @slot mcmc_loglik An \code{\link[coda]{mcmc.list}} object with samples of log-likelihood.
#' @slot pars A list of character vectors of model parameter names.
#' @slot jags_code A string with jags code. Use `cat(fit$jags_code)` to show it.
#' @slot simulate A method to simulate and predict data.
#' @slot .other Information that is used internally by mcp.
#'
NULL
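
# A minimal, illustrative sketch of inspecting these slots (assumes the bundled
# `demo_fit` object used in the package examples). Wrapped in `if (FALSE)` so it is
# never executed; it is not part of the package API.
if (FALSE) {
  demo_fit$model                  # list of segment formulas
  demo_fit$family$family          # e.g. "gaussian"
  cat(demo_fit$jags_code)         # the generated JAGS model code
  head(demo_fit$mcmc_post[[1]])   # first chain of posterior samples (an mcmc matrix)
}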


#' Internal function for summary.mcpfit, fixef.mcpfit, and ranef.mcpfit
#'
#' @aliases get_summary get_summary.mcpfit
#' @keywords internal
#' @inheritParams summary.mcpfit
#' @param fit An \code{\link{mcpfit}} object.
#' @param varying Boolean. Get results for varying (TRUE) or population (FALSE)?
#' @return A data.frame with summaries for each model parameter.
#' @importFrom magrittr %>%
#' @importFrom dplyr .data
#' @encoding UTF-8
#' @author Jonas Kristoffer Lindeløv \email{jonas@@lindeloev.dk}
get_summary = function(fit, width, varying = FALSE, prior = FALSE) {
  # Check arguments
  assert_mcpfit(fit)
  assert_numeric(width, lower = 0, upper = 1)
  assert_logical(varying)
  assert_logical(prior)

  # Get posterior/prior samples
  # TO DO: transition this to tidy_samples at some point if rhat and n.eff
  #    can be computed on this data. Would avoid some hacky solutions in the code below.
  samples = mcmclist_samples(fit, prior = prior)

  # Select only varying or only population-level columns in data
  if (varying == FALSE) {
    regex_pars = paste0(paste0("^", fit$pars$population, "$"), collapse = "|")
  } else {
    regex_pars = paste0("^", paste(fit$pars$varying, collapse="|^"))
    if (regex_pars == "^")
      stop("There were no matching parameters in the model.")
  }

  # HACK: If there is just one matching parameter, subsetting would drop the matrix structure. Include cp_0 and cp_1 temporarily so the code below still gets a matrix.
  all_cols = colnames(samples[[1]])
  get_cols = all_cols[stringr::str_detect(all_cols, regex_pars)]
  if (!stringr::str_detect(regex_pars, "\\|") && varying == FALSE) {
    samples = lapply(samples, function(x) x[, c(get_cols, "cp_0", "cp_1")])
  } else {
    samples = lapply(samples, function(x) x[, get_cols])
  }
  class(samples) = "mcmc.list"  # Return to original class

  # Get parameter estimates
  estimates = samples %>%
    # Get ready to compute stuff for each parameter
    tidybayes::tidy_draws() %>%
    tidyr::pivot_longer(-tidyselect::starts_with(".")) %>%
    dplyr::group_by(.data$name) %>%

    # Compute mean and HDI intervals and name appropriately
    tidybayes::mean_hdci(.data$value, .width = width) %>%
    dplyr::rename(mean = .data$value,
                  lower = .data$.lower,
                  upper = .data$.upper) %>%

    # Remove unneeded columns
    dplyr::select(-tidyselect::starts_with("."))

  # Revert HACK and continue
  if (!stringr::str_detect(regex_pars, "\\|") && varying == FALSE) {
    estimates = dplyr::filter(estimates, !.data$name %in% c("cp_0", "cp_1"))
    samples = lapply(samples, function(x) x[, get_cols])
  }

  # Diagnostics: Gelman-Rubin and effective sample size
  Rhat = try(coda::gelman.diag(samples, multivariate = FALSE)$psrf[, 1], TRUE)
  if (!is.numeric(Rhat)) {
    warning("Rhat computation failed: ", Rhat)
    Rhat = rep(NA, nrow(estimates))
  }
  diagnostics = data.frame(
    Rhat = Rhat,
    n.eff = round(coda::effectiveSize(samples))
  )

  # Add simulation parameters if the data is simulated
  sim_list = attr(fit$data[, fit$pars$y], "simulated")
  if(!is.null(sim_list)) {
    simulated = as.list(sim_list)  # Coerce to a proper list
    simulated = simulated[sapply(simulated, is.numeric)]  # Remove non-numeric

    # Handle varying effects: find the matching labels
    for (this_varying in fit$pars$varying) {
      if (!is.null(simulated[[this_varying]])) {
        # Find the needed values and labels
        value = simulated[[this_varying]]  # Extract simulation values
        which_label_col = which(fit$.other$ST$cp_group == this_varying)
        label_col = fit$.other$ST$cp_group_col[which_label_col]  # Which column in the data holds the labels
        labs = fit$data[[label_col]]  # Find the labels. Same length as `value`
        if (length(value) != length(labs)) {
          warning("This is simulated data, but the labels for varying effect '", label_col, "' in data does not have the same length as the numeric params used for simulation.")
          next
        }

        # Name like the MCMC columns and use only unique combinations (assuming identical value for each level)
        value = value[!duplicated(value)]
        names(value) = unique(paste0(this_varying, "[", labs, "]"))

        # Delete the simulation vector and add the new label-value pairs to list
        simulated[[this_varying]] = NULL
        simulated = c(simulated, as.list(value))
      }
    }

    # Now unpack the whole bunch to a left_join() friendly data.frame.
    simulated = unlist(simulated)  # as named vector
    simulated = data.frame(
      name = names(simulated),
      sim = as.numeric(simulated),  # without row names
      stringsAsFactors = FALSE
    )

    # Join the simulation values onto the estimates and flag whether they fall inside the HDI
    estimates = estimates %>%
      dplyr::left_join(simulated, by = "name", relationship = "one-to-one") %>%
      dplyr::mutate(match = ifelse(.data$sim > .data$lower & .data$sim < .data$upper, yes = "OK", no = "")) %>%
      dplyr::select("name", "match", "sim", "mean", "lower", "upper")
  }

  # Merge them and return
  result = dplyr::bind_cols(estimates, diagnostics)
  data.frame(result, row.names = NULL)
}
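
# Usage sketch (assumes the bundled `demo_fit`). `get_summary()` is the internal
# workhorse behind `summary()`, `fixef()`, and `ranef()`. Wrapped in `if (FALSE)`
# so it never runs.
if (FALSE) {
  pop = get_summary(demo_fit, width = 0.95, varying = FALSE)
  pop[pop$Rhat > 1.1, ]  # flag parameters with questionable convergence
}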


#' Summarise mcpfit objects
#'
#' Summarise parameter estimates and model diagnostics.
#'
#' @aliases summary summary.mcpfit
#' @param object An \code{\link{mcpfit}} object.
#' @param width Float. The width of the highest posterior density interval
#'   (between 0 and 1).
#' @param digits Non-negative integer. The minimum number of significant digits
#'   to print for the estimates (passed on to `print.data.frame()`). Defaults to 2.
#' @param prior TRUE/FALSE. Summarise prior instead of posterior?
#' @param ... Currently ignored
#'
#' @return A data frame with parameter estimates and MCMC diagnostics.
#'   Note: The change point distributions are often neither unimodal nor symmetric, so
#'   the intervals can be deceiving. Plot them using `plot_pars(fit)`.
#'
#'   * `mean` is the posterior mean
#'   * `lower` is the lower quantile of the highest-density interval (HDI) given in `width`.
#'   * `upper` is the upper quantile.
#'   * `Rhat` is the Gelman-Rubin convergence diagnostic which is often taken to
#'     be acceptable if < 1.1. It is computed using \code{\link[coda]{gelman.diag}}.
#'   * `n.eff` is the effective sample size computed using \code{\link[coda]{effectiveSize}}.
#'     Low effective sample sizes are also obvious as poor mixing in trace plots
#'     (see `plot_pars(fit)`). Read how to deal with such problems [here](https://lindeloev.github.io/mcp/articles/tips.html).
#'   * `ts_err` is the time-series error, taking autoregressive correlation
#'     into account. It is computed using \code{\link[coda]{spectrum0.ar}}.
#'
#'  For simulated data, the summary contains two additional columns so that it
#'  is easy to inspect whether the model can recover the parameters. Run
#'  simulation and summary multiple times to get a sense of the robustness.
#'
#'   * `sim` is the value used to generate the data.
#'   * `match` is `"OK"` if `sim` is contained in the HDI interval (`lower` to
#'     `upper`).
#'
#' @encoding UTF-8
#' @author Jonas Kristoffer Lindeløv \email{jonas@@lindeloev.dk}
#' @export
#' @examples
#' # Typical usage
#' summary(demo_fit)
#' summary(demo_fit, width = 0.8, digits = 4)  # Set HDI width
#'
#' # Get the results as a data frame
#' results = summary(demo_fit)
#'
#' # Varying (random) effects
#' # ranef(my_fit)
#'
#' # Summarise prior
#' summary(demo_fit, prior = TRUE)
#'
summary.mcpfit = function(object, width = 0.95, digits = 2, prior = FALSE, ...) {
  fit = object  # Standard name in mcp
  assert_mcpfit(fit)
  assert_numeric(width, lower = 0, upper = 1)
  assert_integer(digits, lower = 0)
  assert_logical(prior)
  assert_ellipsis(...)

  samples = mcmclist_samples(fit, prior = prior, error = FALSE)

  # Model info
  cat("Family: ", fit$family$family, "(link = '", fit$family$link, "')\n", sep = "")
  if (!is.null(samples))
    cat("Iterations: ", nrow(samples[[1]]) * length(samples), " from ", length(samples), " chains.\n", sep="")
  cat("Segments:\n")
  for (i in 1:length(fit$model)) {
    cat("  ", i, ": ", format(fit$model[i]), "\n", sep = "")
  }

  # Data
  if (!is.null(samples)) {
    # Print and return invisibly
    cat("\nPopulation-level parameters:\n")
    result = get_summary(fit, width, varying = FALSE, prior = prior)
    print(data.frame(result), digits = digits, row.names = FALSE)

    if (!is.null(fit$pars$varying))
      cat("\nUse `ranef(fit)` to summarise the varying effect(s):", paste0(fit$pars$varying, collapse = ", "))

    return(invisible(result))
  }
  else {
    cat("\nNo samples. Nothing to summarise.")
    return(invisible(NULL))
  }
}


#' @aliases fixef fixef.mcpfit fixed.effects
#' @describeIn summary.mcpfit Get population-level ("fixed") effects of an \code{\link{mcpfit}} object.
#' @export
fixef = function(object, width = 0.95, prior = FALSE, ...) {
  assert_ellipsis(...)
  get_summary(object, width, varying = FALSE, prior = prior)
}


#' @aliases ranef ranef.mcpfit random.effects
#' @describeIn summary.mcpfit Get varying ("random") effects of an \code{\link{mcpfit}} object.
#' @export
ranef = function(object, width = 0.95, prior = FALSE, ...) {
  assert_ellipsis(...)
  get_summary(object, width, varying = TRUE, prior = prior)
}


#' @aliases print print.mcpfit
#' @describeIn summary.mcpfit Print the posterior summary of an \code{\link{mcpfit}} object.
#' @param x An \code{\link{mcpfit}} object.
#' @export
print.mcpfit = function(x, ...) {
  summary(x, ...)
}


#' Checks if argument is an `mcpfit` object
#'
#' @aliases is.mcpfit
#' @param x An `R` object.
#' @export
#'
is.mcpfit = function(x) {
  inherits(x, "mcpfit")
}
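
# Quick illustration (assumes the bundled `demo_fit`); wrapped in `if (FALSE)` so it never runs.
if (FALSE) {
  is.mcpfit(demo_fit)  # TRUE
  is.mcpfit(list())    # FALSE
}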


#' Internal function to get samples.
#'
#' Returns posterior samples, if available. If not, then prior samples. If not,
#' then throws an informative error. This is useful for summary and plotting
#' functions, which work on both.
#'
#' @aliases mcmclist_samples mcmclist_samples.mcpfit
#' @keywords internal
#' @inheritParams summary.mcpfit
#' @param fit An \code{\link{mcpfit}} object
#' @param message TRUE: give a message when returning prior samples. FALSE: no message.
#' @param error TRUE: raise an error if there are no samples. FALSE: return NULL.
mcmclist_samples = function(fit, prior = FALSE, message = TRUE, error = TRUE) {
  if (prior == TRUE) {
    if (coda::is.mcmc.list(fit$mcmc_prior)) {
      return(fit$mcmc_prior)
    } else {
      stop("Prior requested but the prior was not sampled.")
    }
  }

  if (coda::is.mcmc.list(fit$mcmc_post)) {
    return(fit$mcmc_post)
  } else if (coda::is.mcmc.list(fit$mcmc_prior)) {
    message("Posterior was not sampled. Using prior samples.")
    return(fit$mcmc_prior)
  } else if (error == TRUE) {
    stop("This mcpfit contains no posterior or prior samples.")
  }

  return(NULL)
}
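
# Sketch of the fallback logic (assumes the bundled `demo_fit`); wrapped in `if (FALSE)`
# so it never runs. Posterior samples are preferred; prior samples are returned (with a
# message) if only the prior was sampled; otherwise it errs or returns NULL.
if (FALSE) {
  samples = mcmclist_samples(demo_fit)      # an mcmc.list of posterior samples
  coda::varnames(samples)                   # parameter names
  mcmclist_samples(demo_fit, prior = TRUE)  # errs if the prior was not sampled
}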



#' Get relevant info about varying parameters
#'
#' Returns parameters, data columns, and implicated segments given parameter name(s) or column(s).
#'
#' @aliases unpack_varying unpack_varying.mcpfit
#' @keywords internal
#' @param pars `NULL`/`FALSE` for nothing. `TRUE` for all. A vector of varying parameter names for specifics.
#' @param cols `NULL`/`FALSE` for nothing. `TRUE` for all. A vector of varying column names for specifics. Usually provided via "facet_by" argument in other functions.
#' @return A list. See details.
#'
#' @details
#' Returns a list with
#' @slot pars Character vector of parameter names. `NULL` if empty.
#' @slot cols Character vector of data column names. `NULL` if empty.
#' @slot indices Logical vector indicating which rows of the segment table contain the varying effect.
#'
unpack_varying = function(fit, pars = NULL, cols = NULL) {
  assert_types(pars, "null", "logical", "character")
  assert_types(cols, "null", "logical", "character")

  # If everything is NULL, just return NULLs
  if ((is.null(pars) && is.null(cols))) {
    return(list(
      pars = NULL,
      cols = NULL,
      indices = rep(FALSE, nrow(fit$.other$ST))
    ))
  } else if (!is.null(pars) && !is.null(cols)) {
    stop("One of `pars` and `cols` must be NULL.")
  }


  if (!is.null(pars)) {
    if (all(pars == FALSE)) {
      # Select no varying effects
      use_varying = NULL
    } else if (all(pars == TRUE)) {
      # Select all varying effects
      use_varying = !is.na(fit$.other$ST$cp_group)
    } else if (is.character(pars)) {
      if (!all(pars %in% fit$pars$varying))
        stop("Not all `pars` are varying parameters (see fit$pars$varying).")
      # Select only specified varying effects
      use_varying = fit$.other$ST$cp_group %in% pars
    }
  } else if (!is.null(cols)) {
    use_varying = tidyr::replace_na(fit$.other$ST$cp_group_col == cols, FALSE)
  }

  return(list(
    pars = fit$.other$ST$cp_group[use_varying],
    cols = fit$.other$ST$cp_group_col[use_varying],
    indices = use_varying
  ))
}
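
# Illustrative sketch for a hypothetical fit with a varying change point grouped by an
# `id` column (the names here are made up); wrapped in `if (FALSE)` so it never runs.
if (FALSE) {
  info = unpack_varying(fit, pars = TRUE)
  info$pars     # e.g. "cp_1_id"
  info$cols     # e.g. "id"
  info$indices  # logical over rows of the segment table (fit$.other$ST)
}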



#' Get tidy samples with or without varying effects
#'
#' Returns samples in a format useful for `fit$simulate()`: population parameters in wide format
#' and varying effects in long format (the number of rows will be `nsamples * n_levels_in_varying`).
#'
#' @aliases tidy_samples tidy_samples.mcpfit
#' @keywords internal
#' @inheritParams mcmclist_samples
#' @inheritParams pp_eval
#' @param population
#'   * `TRUE` All population effects. Same as `fit$pars$population`.
#'   * `FALSE` No population effects. Same as `c()`.
#'   * Character vector: Only include specified population parameters - see `fit$pars$population`.
#' @param varying One of:
#'   * `TRUE` All varying effects (`fit$pars$varying`).
#'   * `FALSE` No varying effects (`c()`).
#'   * Character vector: Only include specified varying parameters - see `fit$pars$varying`.
#' @param absolute
#'   * `TRUE` Returns the absolute location of all varying change points.
#'   * `FALSE` Just returns the varying effects.
#'   * Character vector: Only do absolute transform for these varying parameters - see `fit$pars$varying`.
#'
#'   Note: This currently only applies to varying change points. It is not implemented for `rel()` regressors yet.
#' @return `tibble` of posterior draws in `tidybayes` format.
#' @encoding UTF-8
#' @author Jonas Kristoffer Lindeløv \email{jonas@@lindeloev.dk}
#'
tidy_samples = function(
  fit,
  population = TRUE,
  varying = TRUE,
  absolute = FALSE,
  prior = FALSE,
  nsamples = NULL
) {
  # General argument checks
  assert_mcpfit(fit)
  assert_types(population, "logical", "character")
  assert_types(varying, "null", "logical", "character")
  assert_types(absolute, "null", "logical", "character")
  assert_logical(prior)
  if (!is.null(nsamples))
    assert_integer(nsamples, lower = 1)

  if (all(population == FALSE) && all(varying == FALSE))
    stop("At least one TRUE or one parameter must be provided through either the `varying` or the `population` arguments.")


  # ----- IDENTIFY PARAMETERS -----
  # Varying parameters. Result is `terms_varying`
  varying_info = unpack_varying(fit, pars = varying)
  terms_varying = paste0(varying_info$pars, "[", varying_info$cols, "]")  # for tidybayes
  if (all(terms_varying == "[]")) terms_varying = ""  # quick fix

  # Population parameters. Result is `pars_population`.
  if (all(population == FALSE)) {
    pars_population = c()  # No population parameters requested
  } else if (all(population == TRUE)) {
    pars_population = fit$pars$population
  } else if (is.character(population)) {
    if (!all(population %in% fit$pars$population))
      stop("Not all `population` are population parameters (see fit$pars$population).")

    pars_population = population
  }

  # Absolute effects. Results are `absolute_cps` and `absolute` (recoded to varying cp names)
  if (all(absolute == TRUE)) {
    absolute = fit$.other$ST$cp_group[varying_info$indices]
    absolute_cps = fit$.other$ST$cp_name[varying_info$indices]
  } else if (all(absolute == FALSE)) {
    absolute_cps = NULL
  } else if (is.character(absolute)) {
    # Check
    is_in_varying = absolute %in% varying_info$pars
    if (any(!is_in_varying))
      stop("The following parameter names in `absolute` are not in `varying`:", absolute[is_in_varying])

    use_absolute = fit$.other$ST$cp_group %in% absolute
    absolute_cps = fit$.other$ST$cp_name[use_absolute]
  }

  # ----- GET THESE PARAMETERS AS TIDY DRAWS -----
  # Select posterior/prior samples
  samples = mcmclist_samples(fit, prior = prior)

  # Build code for tidybayes::spread_draws() and execute it
  all_terms = unique(c(pars_population, terms_varying, absolute_cps))
  code = paste0("tidybayes::spread_draws(samples, ", paste0(all_terms, collapse = ", "), ", ndraws = nsamples)")
  samples = eval(parse(text = code))

  # Make varying columns factor if they are factors in fit$data
  if (length(varying_info$cols) > 0) {
    is_factor = lapply(fit$data, is.factor)[varying_info$cols]
    cols_to_factorize = varying_info$cols[as.logical(is_factor)]
    samples = dplyr::mutate_at(samples, cols_to_factorize, as.factor)
  }

  # Add population cp to varying and delete population cols only included for this purpose.
  if (length(absolute_cps) > 0) {
    samples[, absolute] = samples[, absolute] + samples[, absolute_cps]
    samples = dplyr::select(samples, -!!dplyr::setdiff(absolute_cps, pars_population))
  }

  # Return with chain etc. first
  samples = dplyr::select(samples, ".chain", ".iteration", ".draw", dplyr::everything())
  return(samples)
}
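
# Usage sketch (assumes the bundled `demo_fit`); wrapped in `if (FALSE)` so it never runs.
if (FALSE) {
  # 50 random draws of all parameters in tidybayes format
  draws = tidy_samples(demo_fit, nsamples = 50)
  head(draws)  # .chain, .iteration, .draw, then one column per parameter
}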




#' Fits and predictions from samples and newdata
#'
#' @aliases pp_eval pp_eval.mcpfit
#' @keywords internal
#' @inheritParams tidy_samples
#' @param object An `mcpfit` object.
#' @param newdata A `tibble` or a `data.frame` containing predictors in the model. If `NULL` (default),
#'   the original data is used.
#' @param summary TRUE/FALSE. Summarise the posterior for each row in `newdata` (TRUE) or return all samples (FALSE)?
#' @param type One of:
#'   - "fitted": return fitted values. See also `fitted()`
#'   - "predict": return predicted values, using random dispersion around the central tendency
#'     (e.g., `y_predict = rnorm(N, y_fitted, sigma_fitted)` for `family = gaussian()`).
#'     See also `predict()`.
#'   - "residuals": same as "predict" but the observed y-values are subtracted. See also `residuals()`
#' @param probs Vector of quantiles. Only in effect when `summary == TRUE`.
#' @param rate Boolean. For binomial models, plot on raw data (`rate = FALSE`) or
#'   response divided by number of trials (`rate = TRUE`). If FALSE, linear
#'   interpolation on trial number is used to infer trials at a particular x.
#' @param prior TRUE/FALSE. Plot using prior samples? Useful for `mcp(..., sample = "both")`
#' @param which_y What to plot on the y-axis. One of
#'
#'   * `"ct"`: The central tendency which is often the mean after applying the
#'     link function.
#'   * `"sigma"`: The variance
#'   * `"ar1"`, `"ar2"`, etc. depending on which order of the autoregressive
#'     effects you want to plot.
#' @param arma Whether to include autoregressive effects.
#'   * `TRUE` Compute autoregressive residuals. Requires the response variable in `newdata`.
#'   * `FALSE` Disregard the autoregressive effects. For `family = gaussian()`, `predict()` just uses `sigma` for residuals.
#' @param nsamples Integer or `NULL`. Number of samples to return/summarise.
#'   If there are varying effects, this is the number of samples from each varying group.
#'   `NULL` means "all". Ignored if both are `FALSE`. More samples trade speed for accuracy.
#' @param samples_format One of "tidy" or "matrix". Controls the output format when `summary == FALSE`.
#'   See more under "Value".
#' @param scale One of
#'   * "response": return on the observed scale, i.e., after applying the inverse link function.
#'   * "linear": return on the parameter scale (where the linear trends are modelled).
#' @param ... Currently unused
#' @return
#'   * If `summary = TRUE`: A `tibble` with the posterior mean for each row in `newdata`.
#'     If `newdata` is `NULL`, the data in `fit$data` is used.
#'
#'   * If `summary = FALSE` and `samples_format = "tidy"`: A `tidybayes` `tibble` with all the posterior
#'     samples (`Ns`) evaluated at each row in `newdata` (`Nn`), i.e., with `Ns x Nn` rows. If there are
#'     varying effects, the returned data is expanded with the relevant levels for each row.
#'
#'     The return columns are:
#'
#'      - Predictors from `newdata`.
#'      - Sample descriptors: ".chain", ".iteration", ".draw" (see the `tidybayes` package for more), and "data_row" (`newdata` row number)
#'      - Sample values: one column for each parameter in the model.
#'      - The estimate. Either "predict" or "fitted", i.e., the name of the `type` argument.
#'
#'   * If `summary = FALSE` and `samples_format = "matrix"`: An `N_draws` X `nrow(newdata)` matrix with fitted/predicted
#'       values (depending on `type`). This format is used by `brms` and it's useful as `yrep` in
#'      `bayesplot::ppc_*` functions.
#' @importFrom magrittr %>%
#' @importFrom dplyr .data
#' @seealso \code{\link{fitted.mcpfit}} \code{\link{predict.mcpfit}} \code{\link{residuals.mcpfit}}
#' @encoding UTF-8
#' @author Jonas Kristoffer Lindeløv \email{jonas@@lindeloev.dk}
#'
pp_eval = function(
  object,
  newdata = NULL,
  summary = TRUE,
  type = "fitted",
  probs = TRUE,
  rate = TRUE,
  prior = FALSE,
  which_y = "ct",
  varying = TRUE,
  arma = TRUE,
  nsamples = NULL,
  samples_format = "tidy",
  scale = 'response',
  ...
) {
  # Recodings
  fit = object
  assert_mcpfit(fit)
  xvar = rlang::sym(fit$pars$x)
  yvar = rlang::sym(fit$pars$y)
  returnvar = rlang::sym(type)
  is_arma = length(fit$pars$arma) > 0

  # What varying cols to use (varying = TRUE keeps them all as is)
  varying_info = unpack_varying(fit, pars = varying)
  required_cols = unique(c(fit$pars$x, fit$pars$trials, varying_info$cols))  # May be amended later

  # R CMD Check wants a global definition of ".". The formal way of doing it is
  # if(getRversion() >= "2.15.1") utils::globalVariables(".")
  # but that makes the tests fail.
  . = "ugly fix to please R CMD check"


  ########################
  # ASSERTS AND RECODING #
  ########################
  assert_logical(summary)
  assert_value(type, allowed = c("fitted", "predict", "residuals"))
  assert_types(probs, "logical", "numeric")
  if (is.numeric(probs))
    assert_numeric(probs, lower = 0, upper = 1)
  if (all(probs == TRUE))
    probs = c(0.025, 0.975)
  assert_logical(rate)
  assert_logical(prior)
  assert_logical(arma)
  assert_types(nsamples, "null", "numeric")
  if (!is.null(nsamples))
    assert_integer(nsamples, lower = 1)
  assert_value(samples_format, allowed = c("tidy", "matrix"))

  if (type == "residuals") {
    if (!is.null(newdata) && !(fit$pars$y %in% colnames(newdata)))
      stop("`newdata` must contain a response column named '", fit$pars$y, "' when `type = 'residuals'`")

    if (!(fit$pars$y %in% required_cols))
      required_cols = c(required_cols, fit$pars$y)
  }
  if (scale == "linear" && (type != "fitted"))
    stop("`scale = 'linear'` is only meaningful when `type = 'fitted'`.")


  # Check and build stuff related to newdata
  if (!is.null(newdata)) {
    assert_types(newdata, "data.frame", "tibble")

    # Add response column to required_cols for ARMA models
    if (is_arma == TRUE && arma == TRUE) {
      if (fit$pars$y %in% colnames(newdata) && !(fit$pars$y %in% required_cols)) {
        required_cols = c(required_cols, fit$pars$y)
      } else if (".ydata" %in% colnames(newdata)) {
        required_cols = c(required_cols, ".ydata")
      }
    }

    cols_in_newdata = required_cols %in% colnames(newdata)
    if (any(cols_in_newdata == FALSE))
      stop("Missing columns '", paste0(required_cols[!cols_in_newdata], collapse = "', '"), "' in `newdata`.")
  } else {
    # Use original data. Remember the response variable too for ARMA
    newdata = fit$data
    if (is_arma == TRUE && arma == TRUE && !(fit$pars$y %in% required_cols))
      required_cols = c(required_cols, fit$pars$y)
  }
  newdata = data.frame(newdata[, required_cols])
  colnames(newdata) = required_cols  # Special case for when there's only one predictor
  newdata$data_row = 1:nrow(newdata)  # to maintain order in the output when summary == TRUE


  ########################
  # GET FITS/PREDICTIONS #
  ########################
  type_for_simulate = ifelse(type == "residuals", yes = "fitted", no = type)
  if (length(varying_info$cols) > 0) {
    # If there are varying effects: use varying-matching samples for each row of data
    samples = dplyr::left_join(
      newdata,
      tidy_samples(fit, population = TRUE, varying = varying, prior = prior, nsamples = nsamples),
      by = unique(varying_info$cols),
      relationship = "many-to-many"
    ) %>%
      dplyr::mutate(!!returnvar := rlang::exec(fit$simulate, !!!., type = type_for_simulate, rate = rate, which_y = which_y, arma = arma, add_attr = FALSE, scale = scale))
  } else {
    # No varying effects: use all samples for each row of data
    samples = tidy_samples(fit, population = TRUE, varying = varying, prior = prior, nsamples = nsamples) %>%
      tidyr::expand_grid(newdata) %>%
      dplyr::mutate(!!returnvar := rlang::exec(fit$simulate, !!!., type = type_for_simulate, rate = rate, which_y = which_y, arma = arma, add_attr = FALSE, scale = scale))
  }

  # Optionally compute residuals
  if (type == "residuals")
    samples = dplyr::mutate(samples, !!returnvar := !!returnvar - !!yvar)  # returnvar should be "residuals" in this case

  # Optionally summarise
  if (summary == TRUE) {
    df_return = samples %>%
      # Summarise for each row in newdata
      dplyr::group_by(.data$data_row) %>%
      dplyr::summarise(.groups = "drop",
                       error = stats::sd(!!returnvar),
                       !!returnvar := mean(!!returnvar)
      ) %>%

      # Apply original order and put newdata as the first columns
      dplyr::arrange(.data$data_row) %>%
      dplyr::left_join(newdata, by = "data_row", relationship = "one-to-one") %>%
      dplyr::select(dplyr::one_of(colnames(newdata)), !!returnvar, "error", -"data_row")


    # Quantiles
    if (all(probs != FALSE)) {
      quantiles_fit = samples %>%
        get_quantiles(probs, xvar, returnvar) %>%
        dplyr::mutate(quantile = 100 * .data$quantile) %>%
        tidyr::pivot_wider(names_from = "quantile", names_prefix = "Q", values_from = "y")

      df_return = dplyr::left_join(df_return, quantiles_fit, by = as.character(xvar), relationship = "many-to-one")
    }
    return(data.frame(df_return))
  } else if (samples_format == "tidy") {
    return(samples)
  } else if (samples_format == "matrix") {
    df_return = tidy_to_matrix(samples, type)
    return(df_return)
  }
}
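
# Usage sketch (assumes the bundled `demo_fit`, whose predictor is `time`). `fitted()`,
# `predict()`, and `residuals()` are thin wrappers around `pp_eval()`. Wrapped in
# `if (FALSE)` so it never runs.
if (FALSE) {
  # Summarised fitted values at three new x-values
  pp_eval(demo_fit, newdata = data.frame(time = c(10, 50, 90)), type = "fitted")

  # All draws as an N_draws x nrow(newdata) matrix, e.g. for bayesplot::ppc_* functions
  yrep = pp_eval(demo_fit, type = "predict", summary = FALSE, samples_format = "matrix")
}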


#' Samples from the Posterior Predictive Distribution
#'
#' @aliases predict predict.mcpfit
#' @inheritParams pp_eval
#' @inherit pp_eval return
#' @seealso \code{\link{pp_eval}} \code{\link{fitted.mcpfit}} \code{\link{residuals.mcpfit}}
#' @encoding UTF-8
#' @author Jonas Kristoffer Lindeløv \email{jonas@@lindeloev.dk}
#' @export
#' @examples
#' \donttest{
#' predict(demo_fit)  # Evaluate at each demo_fit$data
#' predict(demo_fit, probs = c(0.1, 0.5, 0.9))  # With median and 80% credible interval.
#' predict(demo_fit, summary = FALSE)  # Samples instead of summary.
#' predict(
#'   demo_fit,
#'   newdata = data.frame(time = c(-5, 20, 300)),  # Evaluate
#'   probs = c(0.025, 0.5, 0.975)
#' )
#'}
#'
predict.mcpfit = function(
  object,
  newdata = NULL,
  summary = TRUE,
  probs = TRUE,
  rate = TRUE,
  prior = FALSE,
  which_y = "ct",
  varying = TRUE,
  arma = TRUE,
  nsamples = NULL,
  samples_format = "tidy",
  ...
) {
  assert_ellipsis(...)
  pp_eval(
    object,
    newdata = newdata,
    summary = summary,
    type = "predict",
    probs = probs,
    rate = rate,
    prior = prior,
    which_y = which_y,
    varying = varying,
    arma = arma,
    nsamples = nsamples,
    samples_format = samples_format
  )
}


#' Expected Values from the Posterior Predictive Distribution
#'
#' @aliases fitted fitted.mcpfit
#' @inheritParams pp_eval
#' @inherit pp_eval return
#' @seealso \code{\link{pp_eval}} \code{\link{predict.mcpfit}} \code{\link{residuals.mcpfit}}
#' @encoding UTF-8
#' @author Jonas Kristoffer Lindeløv \email{jonas@@lindeloev.dk}
#' @export
#' @examples
#' \donttest{
#' fitted(demo_fit)
#' fitted(demo_fit, probs = c(0.1, 0.5, 0.9))  # With median and 80% credible interval.
#' fitted(demo_fit, summary = FALSE)  # Samples instead of summary.
#' fitted(demo_fit,
#'        newdata = data.frame(time = c(-5, 20, 300)),  # New data
#'        probs = c(0.025, 0.5, 0.975))
#'}
#'
fitted.mcpfit = function(
  object,
  newdata = NULL,
  summary = TRUE,
  probs = TRUE,
  rate = TRUE,
  prior = FALSE,
  which_y = "ct",
  varying = TRUE,
  arma = TRUE,
  nsamples = NULL,
  samples_format = "tidy",
  scale = "response",
  ...
) {
  assert_ellipsis(...)
  pp_eval(
    object,
    newdata = newdata,
    summary = summary,
    type = "fitted",
    probs = probs,
    rate = rate,
    prior = prior,
    which_y = which_y,
    varying = varying,
    arma = arma,
    nsamples = nsamples,
    samples_format = samples_format,
    scale = scale
  )
}


#' Compute Residuals From Mcpfit Objects
#'
#' Equivalent to `fitted(fit, ...) - fit$data[, fit$pars$y]` (or `fitted(fit, ...) - newdata[, fit$pars$y]`),
#' but with fixed arguments for `fitted`: `rate = FALSE, which_y = 'ct', samples_format = 'tidy'`.
#'
#' @aliases residuals residuals.mcpfit resid resid.mcpfit
#' @inheritParams pp_eval
#' @importFrom magrittr %>%
#' @importFrom dplyr .data
#' @seealso \code{\link{pp_eval}} \code{\link{fitted.mcpfit}} \code{\link{predict.mcpfit}}
#' @encoding UTF-8
#' @author Jonas Kristoffer Lindeløv \email{jonas@@lindeloev.dk}
#' @export
#' @examples
#' \donttest{
#' residuals(demo_fit)
#' residuals(demo_fit, probs = c(0.1, 0.5, 0.9))  # With median and 80% credible interval.
#' residuals(demo_fit, summary = FALSE)  # Samples instead of summary.
#'}
#'
residuals.mcpfit = function(
  object,
  newdata = NULL,
  summary = TRUE,
  probs = TRUE,
  prior = FALSE,
  varying = TRUE,
  arma = TRUE,
  nsamples = NULL,
  ...
) {
  assert_ellipsis(...)
  pp_eval(
    object,
    newdata = newdata,
    summary = summary,
    type = "residuals",
    probs = probs,
    rate = FALSE,
    prior = prior,
    which_y = "ct",
    varying = varying,
    arma = arma,
    nsamples = nsamples,
    samples_format = "tidy"
  )
}


#' Convert from tidy to matrix
#'
#' Converts from the output of `tidy_samples()` or `pp_eval(fit, samples_format = "tidy")`
#' to an `N_draws` X `nrow(newdata)` matrix with fitted/predicted values. This format is
#' used by `brms` and it's useful as `yrep` in `bayesplot::ppc_*` functions.
#'
#' @aliases tidy_to_matrix
#' @keywords internal
#' @param samples Samples in tidy format
#' @param returnvar A character string naming the value column (converted to a symbol internally).
#' @return An `N_draws` X `nrow(newdata)` matrix.
#' @importFrom magrittr %>%
#' @importFrom dplyr .data
#' @encoding UTF-8
#' @author Jonas Kristoffer Lindeløv \email{jonas@@lindeloev.dk}
#'
tidy_to_matrix = function(samples, returnvar) {
  returnvar = rlang::sym(returnvar)
  samples %>%
    dplyr::select(".draw", "data_row", !!returnvar) %>%
    tidyr::pivot_wider(names_from = "data_row", values_from = !!returnvar) %>%
    dplyr::select(-".draw") %>%
    as.matrix()
}
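
# Usage sketch (assumes the bundled `demo_fit` and that the bayesplot package is
# installed); wrapped in `if (FALSE)` so it never runs.
if (FALSE) {
  tidy = pp_eval(demo_fit, type = "predict", summary = FALSE, samples_format = "tidy")
  yrep = tidy_to_matrix(tidy, "predict")
  dim(yrep)  # N_draws x nrow(demo_fit$data)
  bayesplot::ppc_dens_overlay(demo_fit$data[[demo_fit$pars$y]], yrep[1:50, ])
}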


#' Add loo if not already present
#'
#' @aliases with_loo
#' @keywords internal
#' @param fit An mcpfit object
#' @param save_psis Logical. See the documentation of `loo::loo()`.
#' @param info Optional message shown when loo is computed.
#' @return An mcpfit object with loo.
#' @encoding UTF-8
#' @author Jonas Kristoffer Lindeløv \email{jonas@@lindeloev.dk}
#'
with_loo = function(fit, save_psis = FALSE, info = NULL) {
  assert_mcpfit(fit)

  # Add loo if absent or needs psis
  if (is.null(fit$loo) || (save_psis == TRUE && loo::is.loo(fit$loo) && is.null(fit$loo$psis_object))) {
    if (is.character(info))
      message(info)
    fit$loo = loo(fit, save_psis = save_psis)
  }

  return(fit)
}
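
# Usage sketch (assumes the bundled `demo_fit`, which includes log-likelihood samples);
# wrapped in `if (FALSE)` so it never runs.
if (FALSE) {
  fit = with_loo(demo_fit, save_psis = TRUE, info = "Computing loo...")
  fit$loo  # a `loo` object; reused on subsequent calls instead of being recomputed
}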
