R/sample_mallows.R
In BayesMallows: Bayesian Preference Learning with the Mallows Rank Model

Documented in sample_mallows

#' Random Samples from the Mallows Rank Model
#'
#' Generate random samples from the Mallows Rank Model
#' \insertCite{mallows1957}{BayesMallows} with consensus ranking \eqn{\rho} and
#' scale parameter \eqn{\alpha}. The samples are obtained by running the
#' Metropolis-Hastings algorithm described in Appendix C of
#' \insertCite{vitelli2018;textual}{BayesMallows}.
#'
#' @param rho0 Vector specifying the latent consensus ranking in the Mallows
#'   rank model.
#' @param alpha0 Scalar specifying the scale parameter in the Mallows rank
#'   model.
#' @param n_samples Integer specifying the number of random samples to generate.
#'   When `diagnostic = TRUE`, this number must be larger than 1.
#' @param leap_size Integer specifying the step size of the leap-and-shift
#'   proposal distribution.
#' @param metric Character string specifying the distance measure to use.
#'   Available options are `"footrule"` (default), `"spearman"`,
#'   `"cayley"`, `"hamming"`, `"kendall"`, and `"ulam"`.
#'   See also the `rmm` function in the `PerMallows` package
#'   \insertCite{irurozki2016}{BayesMallows} for sampling from the Mallows
#'   model with Cayley, Hamming, Kendall, and Ulam distances.
#' @param diagnostic Logical specifying whether to output convergence
#'   diagnostics. If `TRUE`, a diagnostic plot is printed, together with
#'   the returned samples.
#' @param burnin Integer specifying the number of iterations to discard as
#'   burn-in. Defaults to 1000 when `diagnostic = FALSE`, else to 0.
#' @param thinning Integer specifying the number of MCMC iterations to perform
#'   between each time a random rank vector is sampled. Defaults to 1000 when
#'   `diagnostic = FALSE`, else to 1.
#' @param items_to_plot Integer vector used if `diagnostic = TRUE`, in
#'   order to specify the items to plot in the diagnostic output. If not
#'   provided, 5 items are picked at random.
#' @param max_lag Integer specifying the maximum lag to use in the computation
#'   of autocorrelation. Defaults to 1000L. This argument is passed to
#'   `stats::acf`. Only used when `diagnostic = TRUE`.
#'
#' @references \insertAllCited{}
#'
#' @export
#'
#' @example /inst/examples/sample_mallows_example.R
#' @family rank functions
#'
sample_mallows <- function(rho0, alpha0, n_samples,
                           leap_size = max(1L, floor(n_items / 5)),
                           metric = "footrule",
                           diagnostic = FALSE,
                           burnin = ifelse(diagnostic, 0, 1000),
                           thinning = ifelse(diagnostic, 1, 1000),
                           items_to_plot = NULL,
                           max_lag = 1000L) {
  if (!(validate_permutation(rho0) && sum(is.na(rho0)) == 0)) {
    stop("rho0 must be a proper ranking with no missing values.")
  }

  if (diagnostic && n_samples == 1) {
    stop("Must have more than one samples to create diagnostic plots")
  } else if (n_samples <= 0) {
    stop("n_samples must be positive.")
  }

  n_items <- length(rho0)

  if (diagnostic) {
    internal_burnin <- 0
    internal_thinning <- 1
    internal_n_samples <- burnin + n_samples * thinning
  } else {
    internal_burnin <- burnin
    internal_thinning <- thinning
    internal_n_samples <- n_samples
  }

  samples <- t(rmallows(
    rho0 = rho0,
    alpha0 = alpha0,
    n_samples = internal_n_samples,
    burnin = internal_burnin,
    thinning = internal_thinning,
    leap_size = leap_size,
    metric = metric
  ))

  if (diagnostic) {
    if (is.null(items_to_plot) && n_items > 5) {
      message("Items not provided by user. Picking 5 at random.")
      items_to_plot <- sample.int(n_items, 5)
    } else {
      items_to_plot <- seq(from = 1, to = n_items, by = 1)
    }

    # Compute the autocorrelation in the samples
    autocorr <- apply(samples[, items_to_plot, drop = FALSE], 2, stats::acf,
      lag.max = max_lag, plot = FALSE, demean = TRUE
    )
    names(autocorr) <- items_to_plot

    autocorr <- do.call(rbind, Map(function(x, xnm) {
      data.frame(
        item = xnm,
        acf = x$acf[, 1, 1],
        lag = x$lag[, 1, 1]
      )
    }, x = autocorr, xnm = names(autocorr)))

    autocorr$item <- as.factor(as.integer(autocorr$item))

    ac_plot <- ggplot2::ggplot(
      autocorr,
      ggplot2::aes(x = .data$lag, y = .data$acf, color = .data$item)
    ) +
      ggplot2::geom_line() +
      ggplot2::theme(legend.title = ggplot2::element_blank()) +
      ggplot2::xlab("Lag") +
      ggplot2::ylab("Autocorrelation") +
      ggplot2::ggtitle("Autocorrelation of Rank Values")

    print(ac_plot)
    con <- getOption("ask_opts.con", stdin())
    print("Press [enter] to see the next plot")
    response <- readLines(con = con, n = 1)
    colnames(samples) <- seq(from = 1, to = n_items, by = 1)
    diagnostic <- as.data.frame(samples)
    diagnostic$iteration <- seq_len(nrow(diagnostic))

    diagnostic <- stats::reshape(diagnostic,
      direction = "long",
      varying = setdiff(names(diagnostic), "iteration"),
      v.names = "value",
      timevar = "item",
      times = setdiff(names(diagnostic), "iteration"),
      idvar = "iteration",
      ids = diagnostic$iteration
    )

    diagnostic <- diagnostic[diagnostic$item %in% items_to_plot, , drop = FALSE]
    diagnostic$item <- as.factor(as.integer(diagnostic$item))

    rho_plot <- ggplot2::ggplot(
      diagnostic,
      ggplot2::aes(x = .data$iteration, y = .data$value, color = .data$item)
    ) +
      ggplot2::geom_line() +
      ggplot2::theme(legend.title = ggplot2::element_blank()) +
      ggplot2::xlab("Iteration") +
      ggplot2::ylab("Rank value") +
      ggplot2::ggtitle("Trace Plot of Rank Values")

    print(rho_plot)

    samples <- samples[seq(from = burnin + 1, by = thinning, length.out = n_samples), ]
  }
  return(samples)
}