knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
cpal <- paste0('#', c('003E51', '007DBA', 'FFFFFF', 'D6D2C4', 'B78B20'))

Along with this package, this vignette uses the gratia and mgcv packages for comparsons. I'll start with the first example from mgcv.

# library(curvish)
library(mgcv)
library(gratia)
library(ggplot2)


#Below from ?gam
set.seed(2) ## simulate some data... 
d <- gamSim(1,n=200,dist="normal",scale=2)
fit <- gam(y~s(x0)+s(x1)+s(x2)+s(x3),data=d, method = 'REML')
summary(fit)
par(mar=c(1,1,1,1))
plot(fit,pages=1,residuals=TRUE)  ## show partial residuals
plot(fit,pages=1,seWithMean=TRUE) ## `with intercept' CIs
## run some basic model checks, including checking
## smoothing basis dimensions...
gam.check(fit)

Derivatives

eps = 1e-07
res = 200
fit <- gam(y~s(x2),data=d, method = 'REML')
deriv_su <- gratia::derivatives(fit, term = 'x2', n = res, eps = eps, level = .95, 
                                interval = 'simultaneous', unconditional = TRUE, n_sim = 10000)
deriv_sc <- gratia::derivatives(fit, term = 'x2', n = res, eps = eps, level = .95, 
                                interval = 'simultaneous', unconditional = FALSE, n_sim = 10000)
deriv_cu <- gratia::derivatives(fit, term = 'x2', n = res, eps = eps, level = .95, 
                                interval = 'confidence', unconditional = TRUE)
deriv_cc <- gratia::derivatives(fit, term = 'x2', n = res, eps = eps, level = .95, 
                                interval = 'confidence', unconditional = FALSE)

Sources of uncertainty

I'll start off by considering the important issue of simultaneous confidence intervals described in this blog post by the author of the gratia package. Essentially, if we are to compare, point by point, the confidence interval, we need to correct for multiple comparisons. See below for how this changes the width of the interval around the first derivative plot. Also note how making the interval unconditional on the smooth terms slightly widens it. This is to add in the uncertainty about the exact shape of the smooth.

The Bayesian method does not face this particular multiple comparison problem. I use the posterior of the derivative to compute the posterior for the difference between the derivative at a given age to the derivative at the age where the median posterior derivative is at its maximum (i.e., at r sprintf('[INSERT HERE]') years). I then compute the 95% credible interval for that statistic at each age. If at each age 95% of the posterior falls in a certain range, then we can also be sure that 95% of the posterior across all ages falls in this range, so we are not inflating the probability that we make a mistake in deciding that this range includes the true region of steepest slope.

In the plots below, you will notice that the more sources of uncertainty we account for in the ML models (that is, the non-Bayesian models), the wider the confidence regions are. The widest interval is when we account for the multiple tests (i.e., a test at each age we're interested in examining the derivative), and when we account for uncertainty in the exact shape of the spline. "Simul" refers to the use of simultaneous confidence intervals that account for multiple testing, whereas "Conf" refers to traditional confidence intervals. "Uncon" refers to intervals that are unconditional on the choice of shape of the spline and "Cond" refers to intervals that are conditional on the exact trajectory as estimated.

The derivative of the Bayesian model falls somewhere in the middle, and has a slightly different shape.

ggplot(deriv_su, aes(x = data, y = derivative)) + 
  geom_ribbon(aes(ymin = lower, ymax = upper, color = 'Simul/Uncond', fill = 'Simul/Uncond'), alpha = .25) + 
  geom_ribbon(data = deriv_sc, aes(ymin = lower, ymax = upper, color = 'Simul/Cond', fill = 'Simul/Cond'), alpha = .25) +
  geom_ribbon(data = deriv_cu, aes(ymin = lower, ymax = upper, color = 'Conf/Uncond', fill = 'Conf/Uncond'), alpha = .25) +
  geom_ribbon(data = deriv_cc, aes(ymin = lower, ymax = upper, color = 'Conf/Cond', fill = 'Conf/Cond'), alpha = .25) +
  scale_color_manual(breaks = c('Simul/Uncond', 'Simul/Cond', 'Conf/Uncond', 'Conf/Cond', 'Bayes'),
                     values = c(cpal[c(1:2, 4:5)], '#000000'),
                     name = 'Type of interval') + 
  scale_fill_manual(breaks = c('Simul/Uncond', 'Simul/Cond', 'Conf/Uncond', 'Conf/Cond', 'Bayes'),
                     values = c(cpal[c(1:2, 4:5)], '#000000'),
                     name = 'Type of interval') + 
  labs(x = 'x2', y = 'y') + 
  geom_line() + 
  theme_minimal()

Bayesian GAM and derivative

Using brms.

library(brms)
fit_b <- brms::brm(brms::bf(y ~ s(x0) + s(x1) + s(x2) + s(x3)), 
                   data=d,
                   chains = 4, cores = 4,
                   iter = 4500, warmup = 2000,
                   control = list(adapt_delta = .999, max_treedepth = 20),
                   file = 'fit_b')
summary(fit_b)
plot(fit_b, ask = FALSE)
plot(brms::conditional_smooths(fit_b, smooths = 's(x2)'))

deriv_b <- curvish::derivatives(fit_b, term = 'x2', n = res, eps = eps)

ggplot(deriv_su, aes(x = data, y = derivative)) + 
  geom_ribbon(aes(ymin = lower, ymax = upper, color = 'Simul/Uncond', fill = 'Simul/Uncond'), alpha = .15) + 
  geom_ribbon(data = deriv_sc, aes(ymin = lower, ymax = upper, color = 'Simul/Cond', fill = 'Simul/Cond'), alpha = .15) +
  geom_ribbon(data = deriv_cu, aes(ymin = lower, ymax = upper, color = 'Conf/Uncond', fill = 'Conf/Uncond'), alpha = .15) +
  geom_ribbon(data = deriv_cc, aes(ymin = lower, ymax = upper, color = 'Conf/Cond', fill = 'Conf/Cond'), alpha = .15) +
  geom_ribbon(data = deriv_b$deriv_posterior_summary, aes(x = x2, y = mean, ymin = l, ymax = u, color = 'Bayes', fill = 'Bayes'), 
              alpha = .7) + 
  geom_line(data = deriv_b$deriv_posterior_summary, aes (x = x2, y = mean)) + 
  scale_color_manual(breaks = c('Simul/Uncond', 'Simul/Cond', 'Conf/Uncond', 'Conf/Cond', 'Bayes'),
                     values = c('#aaaaaa', cpal[c(1:2, 4:5)]),
                     name = 'Type of interval') + 
  scale_fill_manual(breaks = c('Simul/Uncond', 'Simul/Cond', 'Conf/Uncond', 'Conf/Cond', 'Bayes'),
                    values = c('#222222', cpal[c(1:2, 4:5)]),
                    name = 'Type of interval') + 
  labs(x = 'x2', y = 'y') + 
  theme_minimal()

Plot method

The package can plot the derivative and the smooth.

plot(deriv_b, robust = TRUE, deriv = TRUE)
plot(deriv_b, robust = TRUE, deriv = FALSE)

Finding regions of difference

There are ways to do this using the gratia package and fequentist inference. An example is here. This vignette will focus on the Bayesian approach implemented in curvish.

X value for slope property

For example, you might want to find the most probably X value for where the slope is steepest (positive or negative), or closest to 0.

Max

library(patchwork)
max_p <- curvish::posterior_x_at_maxy(object = fit_b, term = 'x2', 
                                      n = res, eps = eps, 
                                      multimodal = TRUE, prob = .95, 
                                      summary_only = FALSE)
max_p$param_posterior_sum
plot(deriv_b, deriv = FALSE) + 
plot(max_p) + 
  patchwork::plot_layout(design = "
A
B", guides = "collect")

Two other views

You can set a constrained range, and wether to use the mode or the median within each chunk of the HPDI.

plot(max_p, range = c(0, .25)) + 
plot(max_p, range = c(0, .15), mode = FALSE) + 
  patchwork::plot_layout(guides = 'collect')

Min

min_p <- curvish::posterior_x_at_miny(object = fit_b, term = 'x2', 
                                       n = res, eps = eps, 
                                       multimodal = FALSE, prob = .95, 
                                       summary_only = FALSE)
min_p$param_posterior_sum
plot(deriv_b, deriv = FALSE) + 
  coord_cartesian(x = c(0, 1)) + 
plot(min_p, adjust = 1.5) + 
  coord_cartesian(x = c(0, 1)) + 
  patchwork::plot_layout(design = "
A
B", guides = "collect")

Other view

plot(min_p, range = c(0, .4), adjust = 1.75)
plot(min_p, range = c(0, .4), adjust = 1.75, robust = TRUE)

Near 0

zero_p <- curvish::posterior_x_at_yequalto(object = fit_b, term = 'x2', 
                                           n = 200, eps = eps, 
                                           value = 0, 
                                           multimodal = TRUE, prob = .95, 
                                           summary_only = FALSE)
zero_p$param_posterior_sum
plot(deriv_b, deriv = FALSE) + 
  coord_cartesian(x = c(0, 1)) + 
plot(zero_p, mode = FALSE) +
  coord_cartesian(x = c(0, 1)) + 
  patchwork::plot_layout(design = "
A
B", guides = "collect")

The question has to make sense.

set.seed(9)
linear_data <- data.frame(x = rnorm(200))
linear_data$y <- linear_data$x*.5 + rnorm(200)
plot(linear_data$x, linear_data$y)
fit_lin <- brms::brm(brms::bf(y ~ s(x)), 
                   data=linear_data,
                   chains = 4, cores = 4,
                   iter = 4500, warmup = 2000,
                   control = list(adapt_delta = .999, max_treedepth = 20),
                   file = 'fit_lin')
summary(fit_lin)
plot(fit_lin, ask = FALSE)
plot(conditional_smooths(fit_lin, smooths = 's(x)'))
deriv_lin <- curvish::derivatives(fit_lin, term = 's(x)', n = res, eps = eps)
plot(deriv_lin)
plot(deriv_lin, robust = TRUE)
plot(deriv_lin, deriv = FALSE)

What's the maximum slope along a line?

max_lin <- curvish::posterior_x_at_maxy(fit_lin, term = 'x', n = res, eps = eps, adjust = 1)

plot(deriv_lin, deriv = FALSE) + 
  coord_cartesian(x = range(linear_data$x)) + 
plot(max_lin) + 
  coord_cartesian(x = range(linear_data$x)) + 
  patchwork::plot_layout(ncol = 1)

plot(max_lin) + 
  coord_cartesian(x = range(linear_data$x)) + 
plot(max_lin, adjust = 2) + plot(max_lin, adjust = .5) + plot(max_lin, adjust = .1) + 
  patchwork::plot_layout(guides = 'collect', ncol = 1)

Clearly the correct answer is that the entire curve is of maximum slope, but the posterior we derive incorrectly excludes a portion, by necessity. This can be understood, perhaps, as an instance of model misspecification. One should never expect to get the right answer from the wrong question.

zero_lin <- curvish::posterior_x_at_yequalto(fit_lin, term = 'x', n = res, eps = eps, value = 0)
plot(deriv_lin, deriv = FALSE) + 
  coord_cartesian(x = range(fit_lin$data$x)) + 
plot(zero_lin) + 
  coord_cartesian(x = range(fit_lin$data$x)) + 
  patchwork::plot_layout(ncol = 1)

Interestingly we can see that it is most likely to be 0 where we have the least data, and least likely where we have the most data.

Finally, consider using the histogram overlay to check the density plots.

max_lin_uni <- curvish::posterior_x_at_maxy(fit_lin, term = 'x', n = res, eps = eps, multimodal = FALSE)
plot(max_lin_uni, histogram = TRUE, adjust = 1) + 
  plot(max_lin_uni, histogram = TRUE, adjust = .5) + 
  plot(max_lin_uni, histogram = TRUE, adjust = .1) + 
  patchwork::plot_layout(ncol = 1)

AUC

We can find the area under the curve of the derivative as a summary of total change.

auc_b <- curvish::auc(deriv_b, multimodal = FALSE, prob = .95)
print(auc_b$param_posterior_sum)
plot(auc_b)

If you want a single summary statistic, you can compute these from the posterior directly.

mean(auc_b$param_posterior)
median(auc_b$param_posterior)

This is just a way to recapitulate the quantity you would find by taking the difference between the posterior estimates end of the curve and the beginning of the curve.

Second derivative

#Taking the second derivative gets noisy if eps is very small.
deriv_b <- curvish::derivatives(fit_b, term = 'x2', n = res, eps = 1e-6, order = 2)
library(patchwork)
plot(deriv_b, deriv = 0) + 
plot(deriv_b, deriv = 1) +
plot(deriv_b, deriv = 2) + plot_layout(ncol = 1)

How wiggly is it? In other words how much does the slope change? If the slope is constant the second derivative is 0, so any absolute deviation from that indicates some wiggle.

dim(deriv_b$deriv2_posterior)
#should turn this into a function
mean_deriv2_post <- apply(deriv_b$deriv2_posterior, 1, function(x) mean(abs(x)))
bayesplot::mcmc_areas(array(mean_deriv2_post, dim = c(length(mean_deriv2_post),1), dimnames = list(NULL, 'mean second deriv')), point_est = 'median',
                      prob = .95, prob_outer = .99)

We can also apply the AUC method to the second derivative (see below). Both of these can be applied with and without taking the absolute value of each contributing estimate of the area.

auc_b_d2 <- curvish::auc(deriv_b, multimodal = FALSE, prob = .95, deriv = 2, abs = TRUE)
print(auc_b_d2$param_posterior_sum)
plot(auc_b_d2)

Works in progress

Regions along the slope

deriv_b2 <- curvish::derivatives(fit_b, term = 'x2', n = res, eps = eps, deriv_posterior = TRUE, prob = .95, prob_outer = .999)
plot(deriv_b2, deriv = TRUE, outer = TRUE)
ap <- deriv_b2$deriv_posterior

onoff_mat <- apply(ap, 2, function(x) {
  ps <- c(pGT = mean(x > 0), pLT = mean(x < 0))
  C <- c(1, -1)[which(ps == max(ps))] # is it confidently above or below 0
  S <- min(ps)
  nS <- min(c(sum(x > 0), sum(x < 0)))
  c(ps, C = C, S = S, nS = nS)
})

total_type_S_prob <- sum(onoff_mat['nS',])/length(ap)
confident_type_S_prob <- sum(onoff_mat['nS', onoff_mat['S',] < .05])/length(ap[, onoff_mat['S',] < .05])

confidence_regions <- onoff_mat['C',]
confidence_regions[onoff_mat['S', ] > .05] <- 0

region_data <- cbind(deriv_b2$deriv_posterior_summary, 
                     region = confidence_regions,
                     `Confidence Region` = factor(confidence_regions, levels = c(1, 0, -1), labels = c('GT', '-', 'LT')))
blocks <- curvish:::contiguous_zeros(region_data, colname = 'region', indexcol = 'x2')
region_data$region_group <- blocks$index

plot(deriv_b2, deriv = TRUE, outer = TRUE) + 
  geom_line(data = region_data,
            aes(color = `Confidence Region`, group = region_group)) + 
  scale_color_manual(breaks = c('GT', '-', 'LT'), values = curvish:::a_pal()[c(2,1,5)])
message(sprintf('P(Type-S) = %0.1f%% in confidence regions.', confident_type_S_prob*100))
blocks$block_lengths

Differences along the curve

#Find differences along the curve from a particular point on that curve. For
#example, what's the difference in slope between where it is most credibly at
#its maximum (i.e., the maximum of either the mean or median posterior
#derivative) and the rest of the slope?
#
#You should be able to pass this function a single X value, a function that
#returns a single Y value which will map to a single X value. You should also be
#able to pass it a curvish.param object.
object = deriv_b2
# comparison = min_p
comparison = max
# comparison = 5
robust = TRUE
prob = .95
prob_outer = .99
probs <- sort(c(.5, unlist(lapply((1-c(prob, prob_outer))/2,
                                    function(x) c(0, 1) + c(1, -1)*x))))

if(is.null(deriv_b2$deriv_posterior)){
  stop('Object must include full posterior. Please rerun curvish::derivatives with deriv_posterior = TRUE.')
}
if(robust){
  central_func <- 'median'
} else {
  central_func <- 'mean'
}
data_term <- attr(object, 'term')$data_term
if(is.numeric(comparison)){
  X <- comparison
} else if(is.function(comparison)){
  Y <- comparison(object$deriv_posterior_summary[central_func])
  which_X <- which(object$deriv_posterior_summary[central_func] == Y)
  X <- object$deriv_posterior_summary[[data_term]][which_X]
} else if(is.curvish.param(comparison)){
  if(!attr(comparison, 'multimodal')){
    if(is.null(comparison$param_posterior)){
      stop("Can't use curvish.param object as comparitor without posterior.")
    }
    X <- get(central_func)(comparison$param_posterior)
  } else {
    stop('Cannot handle multimodal comparison objects yet')
  }
}

#This may be unnecessary except for the is.numeric case
all_X <- object$deriv_posterior_summary[[data_term]]
which_closest_X <- which(abs(all_X - X) == min(abs(all_X - X)))

deriv_at_X <- object$deriv_posterior[, which_closest_X]
diff_deriv <- object$deriv_posterior - deriv_at_X
diff_deriv_summary <- do.call(rbind, apply(diff_deriv, 2, function(x){
  mn <- mean(x)
  qq <- quantile(x, probs)
  return(data.frame(mean = mn, median = qq[[3]], ll = qq[[1]], l = qq[[2]], u = qq[[4]], uu = qq[[5]]))
}))
diff_deriv_summary[[data_term]] <- object$deriv_posterior_summary[[data_term]]
new_object <- list(deriv_posterior_summary = diff_deriv_summary, 
                   smooth_posterior_summary = object$smooth_posterior_summary,
                   deriv_posterior = diff_deriv)
mostattributes(new_object) <- c(list(names = names(new_object)), attributes(object)[-1])
attr(new_object, 'class') <- c(class(object), 'curvish.diff')
attr(new_object, 'comparison') <- comparison
library(patchwork)
plot(new_object, deriv = 1) + 
plot(deriv_b2, deriv = 1) + plot_layout(ncol = 1)

And more

Find the posterior of differences from various points in the curve, or credible intervals for differences along arbitrary values.

Also, combine objects like posteriors for max slopes and the slopes themselves to make plots of regions of the curve that have certain properties.

deriv1_age_at_max_post <- c_smooths_1_sample_data[, list('max_age' = Age[which(deriv1 == max(deriv1))]),
                                                  by = 'sample__']
age_at_max_med_deriv <- deriv1_post_summary[deriv1 == max(deriv1), Age]
deriv1_diff_from_max_post <- c_smooths_1_sample_data[, diff_from_max := deriv1 - deriv1[Age == age_at_max_med_deriv],
                                                     by = 'sample__']
deriv1_diff_from_max_post_summary <- deriv1_diff_from_max_post[, list('diff_from_max' = median(diff_from_max), 
                                                                      'lower' = quantile(diff_from_max, probs = .025), 
                                                                      'upper' = quantile(diff_from_max, probs = .975)),
                                                               by = 'Age']
deriv1_diff_from_max_post_summary[, compared_to_max := case_when(upper < 0 ~ 'less_steep',
                                                                 lower > 0 ~ 'steeper',
                                                                 TRUE ~ 'as_steep')]

deriv1_post_summary <- deriv1_post_summary[deriv1_diff_from_max_post_summary[, .(Age, compared_to_max)], on = 'Age']


jflournoy/curvish documentation built on May 4, 2023, 3:57 a.m.