Setting a prior

mcp takes priors in the form of a named list. The names are the parameter names, and the values are JAGS code. Here is a fairly complicated example, just to get enough priors to demonstrate the various ways priors can be used:

model = list(
  y ~ 1 + x,  # int_1 and x_1
  ~ 1 + x,  # cp_1, int_2, and x_2
  ~ 1 + x  # cp_2, int_3, and x_3
)

prior = list(
  int_1 = "dnorm(0, 5) T(, 10)",  # Intercept; less than 10
  x_1 = "dbeta(2, 5)",  # slope: beta with right skew
  cp_1 = "dunif(MINX, cp_2)",  # change point between smallest x and cp_2
  x_2 = "dt(0, 1, 3) T(x_1, )",  # slope 2 > slope 1 and t-distributed
  cp_2 = 80,  # A constant (set; not estimated)
  x_3 = "x_2"  # continue same slope
  # int_2 and int_3 not specified. Use default.
)

The values are JAGS code, so all JAGS distributions are allowed, including the gamma, t, and Cauchy distributions and many others; see the JAGS user manual for more details. The parameterization of the distributions is identical to standard R: use an SD when you specify priors for dnorm, dt, dlogis, etc. mcp converts it to the precision expected by JAGS under the hood via the sd_to_prec() function (prec = 1 / sd^2), so you don't have to worry about it. You can see the effect of this conversion by inspecting the difference between fit$prior (using SD) and fit$jags_code (using precision).
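As a quick illustration of what this conversion amounts to (a sketch; the exact string formatting in fit$jags_code may differ):

# The SD-to-precision conversion applied by sd_to_prec():
sd = 5
prec = 1 / sd^2  # = 0.04
# A prior written as "dnorm(0, 5)" in fit$prior therefore appears in
# fit$jags_code with precision 0.04 in the second argument.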


Let us see the priors after running them through mcp and compare them to the default priors:

library(mcp)
options(mc.cores = 3)  # Speed up sampling
set.seed(42)  # Make the script deterministic

empty_manual = mcp(model, prior = prior, sample = FALSE)
empty_default = mcp(model, sample = FALSE)

# Look at fit$prior and show them side-by-side
cbind(manual = empty_manual$prior, default = empty_default$prior)

Now, let's simulate some data from the model. The manual priors above are "at odds" with these data, so as to show their effect.

df = data.frame(x = runif(200, 0, 100))  # 200 datapoints between 0 and 100
df$y = empty_default$simulate(df$x, 
    int_1 = 20, int_2 = 30, int_3 = 30,  # intercepts
    x_1 = -0.5, x_2 = 0.5, x_3 = 0,  # slopes
    cp_1 = 35, cp_2 = 70,  # change points
    sigma = 5)

head(df)

Sample the prior and posterior. We increase the adaptation period (adapt = 10000), since it is harder to find the right posterior under these unusual prior constraints (well-chosen priors will usually improve sampling efficiency).

fit_manual = mcp(model, data = df, sample = "both", adapt = 10000, prior = prior)
fit_default = mcp(model, data = df, sample = "both", adapt = 10000)

First, let's look at the priors side by side. Notice the use of prior = TRUE to show prior samples. This works in plot(), plot_pars(), and summary(), among others.

library(ggplot2)
library(patchwork)  # enables combining ggplots with +
pp_default = plot_pars(fit_default, type = "dens_overlay", prior = TRUE) + 
  ggtitle("Default priors")

pp_manual = plot_pars(fit_manual, type = "dens_overlay", prior = TRUE) +
  ggtitle("Manual priors")

pp_default + pp_manual
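
As noted above, prior = TRUE also works in summary(), so you can inspect the prior samples numerically as well:

summary(fit_manual, prior = TRUE)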

Here are the resulting posterior fits:

plot_default = plot(fit_default) + ggtitle("Default priors")
plot_manual = plot(fit_manual) + ggtitle("Manual priors")

plot_default + plot_manual

We see the effects of the manual priors: the first slope (x_1) is forced to be positive by its dbeta prior, cp_2 sits at the fixed value of 80 even though the data were simulated with cp_2 = 70, and the second and third segments share a slope (x_3 = x_2).

Of course, it will usually be the other way around: setting priors manually will often improve sampling of the "correct" posterior.

Default priors on change points {#cp_prior}

The following are treated more formally in the mcp paper.

Change points have to be ordered from left (cp_1) to right (cp_2 and beyond). This order restriction is enforced through the priors, and this is not trivial. mcp currently offers two "packages" of change point priors that achieve different goals: the default t-tail prior and a Dirichlet prior, both described below.

The two "packages" are identical for a single change point, though the default still samples more effectively.

df_dummy = data.frame(x = c(50, 100, 150), y = rnorm(3))

# Sample priors for two default change points
model_two = list(y ~ x, ~ 1, ~ 1)
fit_two = mcp(model_two, data = df_dummy, sample = "prior", chains = 1, iter = 300000)

# Sample priors for five default change points
model_five = list(y ~ x, ~ 1, ~ 1, ~ 1, ~ 1, ~ 1)
fit_five = mcp(model_five, data = df_dummy, sample = "prior", chains = 1, iter = 300000)
par(mfrow = c(2,2))
MINX = min(df_dummy$x)
MAXX = max(df_dummy$x)


###############
# DIRICH TWO #
###############

# Initiate for cp_1
curve(dbeta(x, 1, 2) / 2, 
      ylab = "density",
      xlab = NA,
      main = "Dirichlet two")

text(0.1, 0.7, labels = "cp_1")

# Add others
curve(dbeta(x, 2, 1) / 2, add = T); text(0.9, 0.7, labels = "cp_2")

# The sum
curve(dunif(x, 0, 1), lty = 2, add = T); text(0.5, 0.95, labels = "sum")

# Add extra x-axis
axis(1, at = seq(0, 1, by = 0.2), labels = seq(MINX, MAXX, by = 20), line=3)
mtext("Beta", 1, line = 1, at = -0.2)
mtext("Data", 1, line = 3, at = -0.2)


###############
# DIRICH FIVE #
###############

# Initiate for cp_1
curve(dbeta(x, 1, 5) / 5, 
      ylab = "density",
      xlab = NA,
      main = "Dirichlet five")

text(0.1, 0.8, labels = "cp_1")

# Add others
curve(dbeta(x, 2, 4) / 5, add = T); text(0.25, 0.48, labels = "cp_2")
curve(dbeta(x, 3, 3) / 5, add = T); text(0.5, 0.43, labels = "cp_3") 
curve(dbeta(x, 4, 2) / 5, add = T); text(0.75, 0.48, labels = "cp_4") 
curve(dbeta(x, 5, 1) / 5, add = T); text(0.9, 0.8, labels = "cp_5") 

# The sum
curve(dunif(x, 0, 1), lty = 2, add = T); text(0.5, 0.95, labels = "sum")

# Add extra x-axis
axis(1, at = seq(0, 1, by = 0.2), labels = seq(MINX, MAXX, by = 20), line=3)
mtext("Beta", 1, line = 1, at = -0.2)
mtext("Data", 1, line = 3, at = -0.2)


###############
# DEFAULT TWO #
###############
y1 = density(fit_two$mcmc_prior[[1]][, "cp_1"], from = MINX, to = MAXX)
y2 = density(fit_two$mcmc_prior[[1]][, "cp_2"], from = MINX, to = MAXX)
plot(y1$x, y1$y + y2$y, 
     type = "l", lty = 2, 
     xlab = NA, ylab = "Density", main = "t-tail two",
     xlim = c(MINX, MAXX), ylim = c(0, 0.024))

lines(y1)
lines(y2)

text(100, 0.018, labels = "sum")
text(65, 0.011, labels = "cp_1")
text(130, 0.011, labels = "cp_2")



################
# DEFAULT FIVE #
################
z1 = density(fit_five$mcmc_prior[[1]][, "cp_1"], from = MINX, to = MAXX)
z2 = density(fit_five$mcmc_prior[[1]][, "cp_2"], from = MINX, to = MAXX)
z3 = density(fit_five$mcmc_prior[[1]][, "cp_3"], from = MINX, to = MAXX)
z4 = density(fit_five$mcmc_prior[[1]][, "cp_4"], from = MINX, to = MAXX)
z5 = density(fit_five$mcmc_prior[[1]][, "cp_5"], from = MINX, to = MAXX)

plot(z1$x, (z1$y + z2$y + z3$y + z4$y + z5$y), 
     type = "l", lty = 2, 
     xlab = NA, ylab = "Density", main = "t-tail five",
     xlim = c(MINX, MAXX), ylim = c(0, 0.07))
text(100, 0.043, labels = "sum")

lines(z5); text(137, 0.03, labels = "cp_5")
lines(z4); text(115, 0.020, labels = "cp_4")
lines(z3); text(94, 0.023, labels = "cp_3")
lines(z2); text(75, 0.027, labels = "cp_2")
lines(z1); text(62, 0.041, labels = "cp_1")

par(mfrow = c(1,1))

The t-tail prior on 2+ change points (default)

A single change point defaults to cp_1 = dunif(MINX, MAXX). In other words, the change point has to happen within the observed range of x, but it is equally probable across this range. This is identical to the Dirichlet prior.
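You can verify this by creating an "empty" one-change-point fit, analogous to the empty fits earlier in this article (model_one and empty_one are just illustrative names):

model_one = list(y ~ 1, ~ 1)
empty_one = mcp(model_one, sample = FALSE)
empty_one$prior  # includes cp_1 = "dunif(MINX, MAXX)"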

For 2+ change points, the default (on all change points) is cp_i = dt(MINX, (MAXX - MINX) / N_CP, N_CP - 1) T(cp_i-1, MAXX). This is not as complicated as it looks: the t-distribution is centered on the left-most data point (MINX), its scale is the average spacing the change points would have if they divided the observed x range evenly, and it has N_CP - 1 degrees of freedom. The truncation T(cp_i-1, MAXX) restricts each change point to lie between the previous one and the right edge of the data, which is what enforces the left-to-right ordering.
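As a worked example with an assumed data range: if x is observed on [0, 100] and the model has two change points, then MINX = 0, MAXX = 100, and N_CP = 2, so the defaults amount to (the lower truncation point of cp_1 is an assumption here):

cp_1 = "dt(0, 50, 1) T(0, 100)"     # truncated below by MINX
cp_2 = "dt(0, 50, 1) T(cp_1, 100)"  # truncated below by cp_1, enforcing the order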

One side effect of the truncation is that later change points have greater prior probability density towards the right side of the x-axis. In practice, this "bias" is so weak that it takes a combination of many change points and few data points to impact the posterior in any noticeable way.

Dirichlet-based prior on change points

The Dirichlet distribution is a multivariate generalization of the beta distribution; its components jointly form a simplex, meaning that they are all positive and sum to one. Because they live in the interval $[0, 1]$, they are shifted and scaled to $[min(x), max(x)]$. The Dirichlet prior has two nice properties: (1) the order restriction and boundedness are inherent to the distribution, and (2) it represents a uniform prior that a change happens at any $x$, i.e., it is maximally uninformative. It also underlies the modeling of monotonic effects in brms (Bürkner & Charpentier, 2019).
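To build intuition, here is a base-R sketch of the construction: draw a simplex from a Dirichlet via normalized gammas, cumulate it, and rescale to the x range (the variable names and data range are illustrative; mcp's internal implementation may differ):

set.seed(1)
alpha = c(1, 1, 1)  # one alpha per segment
g = rgamma(length(alpha), shape = alpha)
simplex = g / sum(g)  # positive and sums to one
minx = 0; maxx = 100  # assumed data range
cps = minx + cumsum(simplex)[1:2] * (maxx - minx)  # cp_1 < cp_2 by construction
cps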

To use the Dirichlet prior, you need to specify it for all or none of the change points. E.g.,

prior_dirichlet = list(
  cp_1 = "dirichlet(1)",
  cp_2 = "dirichlet(1)",
  cp_3 = "dirichlet(1)"
)

The number in the parentheses is the $\alpha$ parameter, so you could also specify cp_1 = "dirichlet(3)" if you want to push credence for that and later change points rightwards while pushing earlier change points leftwards.
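For example (prior_dirichlet_skewed is just an illustrative name):

prior_dirichlet_skewed = list(
  cp_1 = "dirichlet(3)",  # more prior credence to a later first change point
  cp_2 = "dirichlet(1)",
  cp_3 = "dirichlet(1)"
)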

Manual priors on change points

You can easily change or modify change point priors, just as we did in the initial example. But beware that the nature of a prior changes when truncation is applied. Use plot_pars(fit, prior = TRUE) to check the resulting prior.

If you want a more informed prior on a change point location, e.g., cp_2 = "dnorm(40, 10)", mcp adds the order restriction for you, turning it into cp_2 = "dnorm(40, 10) T(cp_1, MAXX)". You can avoid this behavior by explicitly writing an "empty" truncation yourself, e.g., cp_2 = "dnorm(40, 10) T(, )". However, the model may then fail to sample the correct posterior whenever the order restriction is violated.
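To make the two variants concrete (prior_informed and prior_unrestricted are just illustrative names):

prior_informed = list(
  cp_2 = "dnorm(40, 10)"  # mcp expands this to "dnorm(40, 10) T(cp_1, MAXX)"
)
prior_unrestricted = list(
  cp_2 = "dnorm(40, 10) T(, )"  # explicit empty truncation: no order restriction
)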

Default priors on linear predictors

Note: These priors are likely to change in versions beyond mcp 0.3, though not drastically.

You can see the default priors for the gaussian() family in the previous example. They are similar to the brms default priors, i.e., t-distributed around mean = 0 with a standard deviation that scales with the data.

This means that there will be some "shrinkage" towards a mean and SD of zero for all parameters, especially for parameters with a large mean and a small SD.

The slope priors are scaled as if y changed plus/minus one SD over the entire x-axis. This, too, will be insufficient for very steep slopes, e.g., if there are many change points on x.
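As rough intuition only (this is not mcp's exact formula), such a scale corresponds to something like:

slope_scale = sd(df$y) / (max(df$x) - min(df$x))  # y changing ~1 SD across all of x
slope_scale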

See the family-specific articles for more information about the priors for other families.

Default priors on varying effects

See varying change points with mcp.

Prior predictive checks

Prior predictive checks are a great way to ensure that the priors are meaningful. Simply set sample = "prior". Let us do this for the two sets of priors defined previously in this article to see their different prior predictive spaces.

# Sample priors 
fit_pp_manual = mcp(model, data = df, prior = prior, sample = "prior")
fit_pp_default = mcp(model, data = df, sample = "prior")

# Plot it
plot_pp_manual = plot(fit_pp_manual, lines = 100) + ylim(c(-400, 400)) + ggtitle("Manual prior")
plot_pp_default = plot(fit_pp_default, lines = 100) + ylim(c(-400, 400)) + ggtitle("Default prior")
plot_pp_manual + plot_pp_default  # using patchwork

You can see how the manual priors put more density to the left and produce the "concerted" change at x = 80, where cp_2 was fixed.

JAGS code

Here is the JAGS code for fit_manual:

fit_manual$jags_code

