# ithresh and internal helper functions for extreme value threshold selection.
# =========================== ithresh ===========================
#
#' Threshold selection in the i.i.d. case (peaks over threshold)
#'
#' Produces a diagnostic plot to assist in the selection of an extreme value
#' threshold in the case where the data can be treated as independent and
#' identically distributed (i.i.d.) observations. For example, it could be
#' that these observations are the cluster maxima resulting from the
#' declustering of time series data. The predictive ability of models
#' fitted using each of a user-supplied set of thresholds is assessed using
#' leave-one-out cross-validation in a Bayesian setup.
#' These models are based on a Generalized Pareto (GP) distribution for
#' threshold excesses and a binomial model for the probability of threshold
#' exceedance. See Northrop et al. (2017) for details.
#'
#' @param data A numeric vector of observations. Any missing values will
#' be removed. The argument \code{npy} (see below) may be supplied
#' as an attribute of \code{data} using \code{attr(data, "npy") <- value},
#' where \code{value} is the value of \code{npy} (see \code{\link{attr}}).
#' If \code{npy} is supplied twice, as both \code{attr(data, "npy")}
#' and using the \code{npy} argument, then the former is used.
#' @param u_vec A numeric vector. A vector of \emph{training} thresholds
#' at which inferences are made from a binomial-GP model. These could be
#' set at sample quantiles of \code{data} using
#' \code{\link[stats]{quantile}}. Any duplicated values will be removed.
#' @param ... Further (optional) arguments to be passed to the
#' \code{\link[revdbayes]{revdbayes}} function
#' \code{\link[revdbayes]{rpost_rcpp}} (or \code{\link[revdbayes]{rpost}}),
#' which use the generalized ratio-of-uniforms method to simulate from
#' extreme value posterior distributions.
#' In particular:
#' \itemize{
#' \item {\code{n}} {The size of the posterior sample used to perform
#' predictive inference. Default: \code{n = 1000}.}
#' \item {\code{prior}} {A prior for GP parameters to be passed to the
#' \strong{revdbayes} function \code{\link[revdbayes]{set_prior}}.
#' Can be either a character scalar that chooses an in-built prior,
#' or a user-supplied R function or pointer to a compiled C++ function.
#' See the \code{\link[revdbayes]{set_prior}} documentation for details
#' of the in-built priors.
#' See the \strong{revdbayes} vignette
#' \href{https://cran.r-project.org/package=revdbayes}{Faster simulation
#' using revdbayes} for information about creating
#' a pointer to a C++ function. See also the \strong{Examples} section.
#'
#' If the user supplies an R function then \code{\link{rpost}} will be
#' used for posterior simulation, rather than (the faster)
#' \code{\link{rpost_rcpp}}, regardless of the input value of
#' \code{use_rcpp}.
#'
#' Default: \code{prior = "mdi"} with \code{a = 0.6} and \code{min_xi = -1}.
#' This particular prior is studied in Northrop et al. (2017).}
#' \item {\code{h_prior}} {A \emph{list} of further arguments
#' (hyperparameters) for the GP prior specified in \code{prior}.}
#' \item {\code{bin_prior}} {A prior for the threshold exceedance probability
#' \eqn{p} to be passed to the \strong{revdbayes} function
#' \code{\link[revdbayes]{set_bin_prior}}.
#' Can either be a character scalar that chooses an in-built prior,
#' or a user-supplied R function.
#'
#' Default: \code{bin_prior = "jeffreys"}, i.e. Beta(1/2, 1/2).}
#' \item {\code{h_bin_prior}} {A \emph{list} of further arguments
#' (hyperparameters) for the binomial prior specified in \code{bin_prior}.
#' See the \code{\link[revdbayes]{set_bin_prior}} documentation for details
#' of the in-built priors.}
#' \item {\code{trans}} {A character scalar: either \code{"none"} or
#' \code{"BC"}. See \code{\link{rpost_rcpp}} for details.
#' The default is \code{"none"}, which is usually faster than \code{"BC"}.
#' However, if there are very few threshold excesses then using
#' \code{trans = "BC"} can make the optimizations involved in the
#' generalized ratio-of-uniforms algorithm more stable. If using
#' \code{trans = "none"} produces an error for a particular posterior
#' simulation then \code{trans = "BC"} is used instead.}
#' }
#' @param n_v A numeric scalar.
#' Each of the \code{n_v} largest values in \code{u_vec} will be used
#' (separately) as a \emph{validation} threshold for the training thresholds
#' in \code{u_vec} that lie at or below that validation threshold.
#' If \code{n_v = 1} then all the training thresholds are used with
#' validation performed using the threshold \code{u_vec[length(u_vec)]}.
#' If \code{n_v = 2} then, in addition, the assessment is performed using
#' \code{u_vec[1], ..., u_vec[length(u_vec) - 1]} with
#' validation threshold \code{u_vec[length(u_vec) - 1]},
#' and so on.
#' @param npy A numeric scalar. The mean number of observations per year
#' of data, after excluding any missing values, i.e. the number of
#' non-missing observations divided by total number of years of non-missing
#' data. May be supplied as an attribute \code{attr(data, "npy")}
#' of \code{data} instead.
#'
#' The value of \code{npy} does not affect any calculation in
#' \code{ithresh}, it only affects subsequent extreme value inferences using
#' \code{predict.ithresh}. However, setting \code{npy} in the call to
#' \code{rpost}, or as an attribute of \code{data} avoids the need to
#' supply \code{npy} when calling \code{predict.ithresh}.
#' @param use_rcpp A logical scalar. If \code{TRUE} (the default) the
#' revdbayes function \code{\link[revdbayes]{rpost_rcpp}} is used for
#' posterior simulation. If \code{FALSE}, or if the user supplies an R
#' function to set the prior for GP parameters,
#' the (slower) function \code{\link[revdbayes]{rpost}} is used.
#' @details For a given threshold in \code{u_vec}:
#' \itemize{
#' \item {the number of values in \code{data} that exceed the threshold,
#' and the amounts (the \emph{threshold excesses}) by which these values
#' exceed the threshold are calculated;}
#' \item {\code{\link[revdbayes]{rpost_rcpp}}
#' (or \code{\link[revdbayes]{rpost}}) is used to sample from the posterior
#' distributions of the parameters of a GP model for the threshold
#' excesses and a binomial model for the probability of threshold
#' exceedance;}
#' \item {the ability of this binomial-GP model to predict data
#' thresholded at the validation threshold(s) specified by \code{n_v} is
#' assessed using leave-one-out cross-validation (the measure of
#' this is given in equation (7) of Northrop et al. (2017)).}
#' }
#' See Northrop et al. (2017) and the introductory threshr vignette for
#' further details and examples.
#' @return An object (list) of class \code{"ithresh"}, containing the
#' components
#' \itemize{
#' \item{\code{pred_perf}:} A numeric matrix with \code{length(u_vec)}
#' rows and \code{n_v} columns. Each column contains the values of
#' the measure of predictive performance. Entries corresponding
#' to cases where the training threshold is above the validation
#' threshold will be \code{NA}.
#' \item{\code{u_vec}:} The argument \code{u_vec} to \code{ithresh}.
#' \item{\code{v_vec}:} A numeric vector. The validation thresholds
#' implied by the argument \code{n_v} to \code{ithresh}.
#' \item{\code{u_ps}:} A numeric vector. The approximate levels of the
#' sample quantiles to which the values in \code{u_vec} correspond,
#' i.e. the approximate percentage of the data that lie at or below
#' each element in \code{u_vec}.
#' \item{\code{v_ps}:} A numeric vector. The values in \code{u_ps}
#' that correspond to the validation thresholds.
#' \item{\code{sim_vals}:} A numeric matrix with 4 columns and
#' \code{n} x \code{length(u_vec)} rows. The \eqn{j}th block of
#' \code{n} rows contains in columns 1-3 the posterior samples of
#' the threshold exceedance probability, the GP scale
#' parameter and the GP shape parameter respectively,
#' based on training threshold \code{u_vec[j]},
#' and in column 4 the value of \eqn{j}.
#' \item{\code{n}:} A numeric scalar. The value of \code{n}.
#' \item{\code{npy}:} A numeric scalar. The value of \code{npy}.
#' \item{\code{data}:} The argument \code{data} to \code{ithresh}
#' detailed above, with any missing values removed.
#' \item{\code{use_rcpp}:} A logical scalar indicating whether
#' \code{\link[revdbayes]{rpost_rcpp}} (\code{use_rcpp = TRUE}) or
#' \code{\link[revdbayes]{rpost}} (\code{use_rcpp = FALSE})
#' was used for posterior simulation.
#' \item{\code{for_post}:} A list containing arguments with which
#' \code{\link[revdbayes]{rpost_rcpp}}
#' (or \code{\link[revdbayes]{rpost}}) was called, including
#' any user-supplied arguments to these functions.
#' \item{\code{call:}} The call to \code{ithresh}.
#' }
#' @seealso \code{\link{plot.ithresh}} for the S3 plot method for objects of
#' class \code{ithresh}.
#' @seealso \code{\link{summary.ithresh}} Summarizing measures of threshold
#' predictive performance.
#' @seealso \code{\link{predict.ithresh}} for predictive inference for the
#' largest value observed in N years.
#' @seealso \code{\link[revdbayes]{rpost}} in the
#' \code{\link[revdbayes]{revdbayes}} package for details of the arguments
#' that can be passed to
#' \code{\link[revdbayes]{rpost_rcpp}}/\code{\link[revdbayes]{rpost}}.
#' @seealso \code{\link[revdbayes]{set_prior}} and
#' \code{\link[revdbayes]{set_bin_prior}} in the
#' \code{\link[revdbayes]{revdbayes}} package for details of how to set a
#' prior distributions for GP parameters and for the exceedance probability
#' \eqn{p}.
#' @seealso \code{\link[stats]{quantile}}.
#' @examples
#' # Note:
#' # 1. Smoother plots result from making n larger than the default n = 1000.
#' # 2. In some examples below validation thresholds rather higher than is
#' # advisable have been used, with far fewer excesses than the minimum of
#' # 50 suggested by Jonathan and Ewans (2013).
#'
#' ## North Sea significant wave heights, default prior -----------------------
#' # A plot akin to the top left of Figure 7 in Northrop et al. (2017)
#' # ... but with fewer training thresholds
#'
#' u_vec_ns <- quantile(ns, probs = seq(0.1, 0.9, by = 0.1))
#' ns_cv <- ithresh(data = ns, u_vec = u_vec_ns, n_v = 2)
#' plot(ns_cv, lwd = 2, add_legend = TRUE, legend_pos = "topright")
#' mtext("significant wave height / m", side = 3, line = 2.5)
#'
#' ## Gulf of Mexico significant wave heights, default prior ------------------
#'
#' u_vec_gom <- quantile(gom, probs = seq(0.2, 0.9, by = 0.1))
#' # Setting a prior using its name and parameter value(s) --------------------
#' # This example gives the same prior as the default
#' gom_cv <- ithresh(data = gom, u_vec = u_vec_gom, n_v = 2, prior = "mdi",
#' h_prior = list(a = 0.6))
#'
#' ## Setting a user-defined (log-)prior R function ---------------------------
#' # This example also gives the same prior as the default
#' # (It will take longer to run than the example above because ithresh detects
#' # that the prior is an R function and sets use_rcpp to FALSE.)
#' \donttest{
#' user_prior <- function(pars, a, min_xi = -1) {
#' if (pars[1] <= 0 | pars[2] < min_xi) {
#' return(-Inf)
#' }
#' return(-log(pars[1]) - a * pars[2])
#' }
#' user_bin_prior <- function(p, ab) {
#' return(stats::dbeta(p, shape1 = ab[1], shape2 = ab[2], log = TRUE))
#' }
#' gom_cv <- ithresh(data = gom, u_vec = u_vec_gom, n_v = 2, prior = user_prior,
#' h_prior = list(a = 0.6), bin_prior = user_bin_prior,
#' h_bin_prior = list(ab = c(1 / 2, 1 / 2)))
#' }
#' ## Setting a user-defined (log-)prior (pointer to a) C++ function ----------
#' # We make use of a C++ function and function create_prior_xptr() to create
#' # the required pointer from the revdbayes package
#'
#' prior_ptr <- revdbayes::create_prior_xptr("gp_flat")
#' gom_cv <- ithresh(data = gom, u_vec = u_vec_gom, n_v = 2, prior = prior_ptr,
#' h_prior = list(min_xi = -1))
#' @references Northrop, P.J. and Attalides, N. (2016) Posterior propriety in
#' Bayesian extreme value analyses using reference priors
#' \emph{Statistica Sinica}, \strong{26}(2), 721--743
#' \doi{10.5705/ss.2014.034}.
#' @references Northrop, P. J., Attalides, N. and Jonathan, P. (2017)
#' Cross-validatory extreme value threshold selection and uncertainty
#' with application to ocean storm severity.
#' \emph{Journal of the Royal Statistical Society Series C: Applied
#' Statistics}, \strong{66}(1), 93-120.
#' \doi{10.1111/rssc.12159}
#' @references Jonathan, P. and Ewans, K. (2013) Statistical modelling
#' of extreme ocean environments for marine design : a review.
#' \emph{Ocean Engineering}, \strong{62}, 91-109.
#' \doi{10.1016/j.oceaneng.2013.01.004}
#' @export
ithresh <- function(data, u_vec, ..., n_v = 1, npy = NULL, use_rcpp = TRUE) {
  # Record the call for later use (returned in the "call" component)
  Call <- match.call()
  # Determine the value of npy to return.  An attribute attr(data, "npy")
  # takes precedence over the npy argument; warn if both are supplied.
  attr_npy <- attr(data, "npy")
  if (!is.null(attr_npy)) {
    return_npy <- attr_npy
    if (!is.null(npy)) {
      warning(paste("Two values of npy supplied. attr(data, ''npy'') = ",
                    return_npy, " will be returned.", sep = ""),
              immediate. = TRUE)
    }
  } else {
    return_npy <- npy  # may be NULL: then no npy component is returned
  }
  # Remove missing values from data.
  data <- as.numeric(stats::na.omit(data))
  # Put thresholds in ascending order and remove any repeated values.
  u_vec <- unique(sort(u_vec))
  n_u <- length(u_vec)
  # Every training threshold must have at least one threshold excess.
  if (u_vec[n_u] >= max(data)) {
    stop("max(u_vec) must be less than max(data)")
  }
  # There cannot be more validation thresholds than training thresholds.
  if (n_v > n_u) {
    n_v <- n_u
    warning("n_v has been set to length(u_vec)")
  }
  # The validation thresholds are the n_v largest training thresholds.
  v_vec <- u_vec[(n_u - n_v + 1):n_u]
  temp <- cv_fn(data = data, u_vec = u_vec, v_vec = v_vec, n_u = n_u,
                n_v = n_v, use_rcpp = use_rcpp, ...)
  # Approximate sample quantile levels (percentages) of the thresholds.
  # If the thresholds carry names like "95%" (e.g. from stats::quantile())
  # then strip the final character; otherwise estimate the levels from data.
  percent_levels <- function(thresholds) {
    u_names <- names(thresholds)
    if (is.null(u_names)) {
      round(100 * vapply(thresholds, function(x) mean(data < x), 0))
    } else {
      as.numeric(substr(u_names, 1, nchar(u_names, type = "c") - 1))
    }
  }
  temp$u_ps <- percent_levels(u_vec)
  temp$v_ps <- percent_levels(v_vec)
  if (!is.null(return_npy)) {
    temp$npy <- return_npy
  }
  temp$data <- data
  temp$call <- Call
  class(temp) <- "ithresh"
  return(temp)
}
# =========================== cv_fn ===========================

# Internal workhorse for ithresh().
#
# For each training threshold in u_vec, simulates from the binomial-GP
# posterior distribution twice (using all the data and using the data with
# one of the largest observations removed) and computes the leave-one-out
# cross-validation measure of predictive performance at each validation
# threshold in v_vec, via bloocv().
#
# Arguments mirror those of ithresh(); ... contains optional arguments for
# revdbayes::rpost_rcpp()/rpost(), e.g. n, prior, h_prior, bin_prior,
# h_bin_prior, trans.
#
# Returns a list: pred_perf (n_u by n_v matrix, NA where the training
# threshold exceeds the validation threshold), u_vec, v_vec, sim_vals
# (posterior samples, one block of n rows per training threshold, with the
# threshold index in column 4), n, for_post (the arguments with which the
# posterior simulation function was called) and use_rcpp.
cv_fn <- function(data, u_vec, v_vec, n_u, n_v, use_rcpp, ...) {
  # Extract arguments for passing to revdbayes function rpost -----------------
  # GP prior.
  cv_control <- list(...)
  # A (user-supplied) R prior function cannot be called from compiled code,
  # so force use of revdbayes::rpost() in that case.
  if (is.function(cv_control$prior)) {
    use_rcpp <- FALSE
  }
  # Default GP prior: MDI with a = 0.6 and min_xi = -1 (Northrop et al. 2017).
  if (is.null(cv_control$prior)) {
    cv_control$prior <- "mdi"
  }
  if (is.null(cv_control$h_prior$min_xi)) {
    cv_control$h_prior$min_xi <- -1
  }
  if (is.character(cv_control$prior)) {
    if (cv_control$prior == "mdi" && is.null(cv_control$h_prior$a)) {
      cv_control$h_prior$a <- 0.6
    }
  }
  for_set_prior <- c(list(prior = cv_control$prior, model = "gp"),
                     cv_control$h_prior)
  gp_prior <- do.call(revdbayes::set_prior, for_set_prior)
  cv_control$prior <- NULL
  cv_control$h_prior <- NULL
  # Binomial prior.  Default: Jeffreys' prior, i.e. Beta(1/2, 1/2).
  if (is.null(cv_control$bin_prior)) {
    cv_control$bin_prior <- "jeffreys"
  }
  for_set_bin_prior <- c(list(prior = cv_control$bin_prior),
                         cv_control$h_bin_prior)
  bin_prior <- do.call(revdbayes::set_bin_prior, for_set_bin_prior)
  cv_control$bin_prior <- NULL
  cv_control$h_bin_prior <- NULL
  # Size of samples from posterior distributions.
  n <- if (is.null(cv_control$n)) 1000 else cv_control$n
  cv_control$n <- NULL
  # Warn about, and drop, arguments passed in ... that are not formal
  # arguments of rpost_rcpp/rpost or ru_rcpp/ru.
  in_rpost <- names(cv_control) %in% methods::formalArgs(revdbayes::rpost_rcpp)
  in_ru <- names(cv_control) %in% methods::formalArgs(rust::ru_rcpp)
  rogue_names <- !in_rpost & !in_ru
  rogue_args <- cv_control[rogue_names]
  if (length(rogue_args) > 0) {
    warning("The following arguments have been ignored", immediate. = TRUE)
    print(rogue_args)
    cv_control[rogue_names] <- NULL
  }
  # Set up quantities for passing to revdbayes::rpost().
  # Locate the largest observation(s) in the data.
  j_max <- which(data == max(data))
  data_max <- data[j_max]
  # Remove (one of) the largest observation(s): bloocv() handles the
  # contribution from the maximum separately.
  data_rm <- data[-j_max[1]]
  # A matrix to store the measures of predictive performance.
  pred_perf <- matrix(NA, nrow = n_u, ncol = n_v)
  # Set the revdbayes posterior simulation function.
  gp_postsim <- if (use_rcpp) revdbayes::rpost_rcpp else revdbayes::rpost
  for_post <- c(list(n = n, model = "bingp", prior = gp_prior,
                     bin_prior = bin_prior), cv_control)
  # A matrix to store the posterior samples for each threshold.
  sim_vals <- matrix(NA, ncol = 4, nrow = n * n_u)
  # Loop over the training thresholds ----------------------------------------
  for (i in seq_len(n_u)) {
    u <- u_vec[i]
    # If posterior simulation fails (this can happen if there are few
    # excesses) then toggle trans between "none" and "BC" and retry: the
    # Box-Cox transformation towards normality can improve the stability of
    # the generalized ratio-of-uniforms optimizations.  Note that a toggled
    # value of trans deliberately persists for subsequent thresholds and is
    # returned in for_post.
    #
    # Simulate from the (full) bin-GP posterior.
    temp <- try(do.call(gp_postsim, c(for_post, list(data = data,
                                                     thresh = u))),
                silent = TRUE)
    if (inherits(temp, "try-error")) {
      if (is.null(for_post$trans) || for_post$trans == "none") {
        for_post$trans <- "BC"
      } else {
        for_post$trans <- "none"
      }
      temp <- do.call(gp_postsim, c(for_post, list(data = data, thresh = u)))
    }
    # Simulate from the bin-GP posterior after removal of the maximum value.
    temp_rm <- try(do.call(gp_postsim, c(for_post, list(data = data_rm,
                                                        thresh = u))),
                   silent = TRUE)
    if (inherits(temp_rm, "try-error")) {
      if (is.null(for_post$trans) || for_post$trans == "none") {
        for_post$trans <- "BC"
      } else {
        for_post$trans <- "none"
      }
      temp_rm <- do.call(gp_postsim, c(for_post, list(data = data_rm,
                                                      thresh = u)))
    }
    # Combine binomial and GP posterior simulated values: (p, sigma, xi).
    theta <- cbind(temp$bin_sim_vals, temp$sim_vals)
    theta_rm <- cbind(temp_rm$bin_sim_vals, temp_rm$sim_vals)
    # Carry out leave-one-out Bayesian cross-validation -----------------------
    # Only validation thresholds at or above the training threshold are used.
    which_v <- which(v_vec >= u)
    v_vals <- v_vec[which_v]
    pred_perf[i, which_v] <- bloocv(z = data, theta = theta,
                                    theta_rm = theta_rm, u1 = u,
                                    u2_vec = v_vals, z_max = data_max,
                                    z_rm = data_rm, n = n)
    # Save the posterior sample, labelled with the threshold index i.
    which_rows <- (1 + (i - 1) * n):(i * n)
    sim_vals[which_rows, ] <- cbind(theta, i)
  }
  return(list(pred_perf = pred_perf, u_vec = u_vec, v_vec = v_vec,
              sim_vals = sim_vals, n = n, for_post = for_post,
              use_rcpp = use_rcpp))
}
# =========================== bloocv ===========================

# Leave-one-out Bayesian cross-validation measure of predictive performance
# for a binomial-GP model, i.e. equation (7) of Northrop et al. (2017).
#
# Arguments:
#   z        the full data.  NOTE(review): z is not referenced in this
#            function body; only z_rm and z_max are used.
#   theta    n by 3 matrix of posterior samples (columns: exceedance
#            probability p, GP scale sigma, GP shape xi) based on training
#            threshold u1 and all the data.
#   theta_rm as theta, but based on the data with (one of) the largest
#            observation(s) removed.
#   u1       a numeric scalar: the training threshold.
#   u2_vec   numeric vector of validation thresholds.
#   z_max    the largest observation(s) in the data.
#   z_rm     the data with one of the largest observations removed.
#   n        the posterior sample size, i.e. nrow(theta).
#
# Returns a numeric vector of length length(u2_vec): the estimated
# cross-validation predictive performance at each validation threshold.
bloocv <- function(z, theta, theta_rm, u1, u2_vec, z_max, z_rm, n){
  #
  n1 <- sum(z_rm <= u1) # number of values that do not exceed u1
  z_gt_u1 <- z_rm[z_rm > u1] # data greater than u1 (not including maximum)
  n2 <- length(u2_vec) # number of validation thresholds considered
  n_max <- length(z_max) # cardinality of max(z)
  #
  # Posterior samples of (p,sigma,xi) at training threshold u1 ...
  p1 <- theta[, 1] # p.u from posterior sample
  sigma_1 <- theta[, 2]
  xi <- theta[, 3]
  # Flag sampled values of xi so close to 0 that (1 + xi * xx) ^ (-1 / xi)
  # is numerically unstable; for these a second-order Taylor expansion in
  # xi of the log of the GP survival function/density is used instead.
  xi_near_0 <- abs(xi) < 1e-6
  pow1 <- -1 / xi # exponent in the GP survival function
  pow2 <- -1 + pow1 # exponent in the GP density
  mp1 <- 1 / (1 - p1)
  # Probability of exceeding validation threshold u2, for each u2
  p_gt_u2 <- function(x) {
    xx <- (x - u1) / sigma_1
    ifelse(xi_near_0, exp(-xx + xi * xx ^ 2 / 2), (1 + xi * xx) ^ pow1)
  }
  templ <- numeric(n)
  # n by n2 matrix: posterior samples of P(Z > u2) = p * P(Z > u2 | Z > u1)
  p_2_vec <- p1 * vapply(u2_vec, p_gt_u2, templ)
  # n.sim by (1+length(u2_vec)) matrix
  # bin-GP density at each observation greater than u1
  f_gt_u1 <- function(x) {
    xx <- (x - u1) / sigma_1
    ifelse(xi_near_0, exp(-xx + xi * xx * (xx - 2) / 2), (1 + xi * xx) ^ pow2)
  }
  # n by length(z_gt_u1) matrix of GP densities.  In the product below,
  # p1 (length n) recycles down the columns of fgp_r.
  fgp_r <- vapply(z_gt_u1, f_gt_u1, templ) / sigma_1
  #
  temp <- p1 * fgp_r
  p2 <- p_2_vec
  #
  #----------------------- Function pred_u2 ---------------------------#
  # Log of the estimated leave-one-out predictive density of the data
  # censored at validation threshold u2_vec[k].  Each term is a ratio of
  # posterior means, as in equation (7) of Northrop et al. (2017).
  pred_u2 <- function(k) {
    # Contribution for each z that is less than or equal to u1
    t1 <- 1
    if (n1 > 0) {
      t1 <- mean((1 - p2[, k]) * mp1) / mean(mp1)
    }
    # Contributions from zs that are in (u1,u2]
    # (if z_gt_u1 is empty then min() is Inf, with a warning, and t2 stays 1)
    t2 <- 1
    if (min(z_gt_u1) <= u2_vec[k]) {
      w2 <- which(z_gt_u1 <= u2_vec[k])
      fr2 <- temp[, w2, drop = FALSE]
      mp2 <- 1 / fr2
      t2 <- colMeans((1 - p2[, k]) * mp2) / colMeans(mp2)
    }
    # Contributions from zs that are greater than u2
    t3 <- 1
    if (max(z_gt_u1) > u2_vec[k]) {
      w3 <- which(z_gt_u1 > u2_vec[k])
      fr3 <- temp[, w3, drop = FALSE]
      t3 <- 1 / colMeans(1 / fr3)
    }
    #
    return(n1 * log(t1) + sum(log(t2)) + sum(log(t3)))
  }
  #--------------------- End of function pred_u2 ----------------------#
  #
  t123 <- vapply(1:n2, pred_u2, 0)
  #
  # Contribution from z_max
  # Use the posterior sample based on the data with the maximum removed
  p1 <- theta_rm[, 1]
  sigma_1 <- theta_rm[, 2]
  xi <- theta_rm[, 3]
  xi_near_0 <- abs(xi) < 1e-6
  pow2 <- -(1 + 1 / xi)
  # bin-GP density at max(z)
  xx <- (z_max - u1) / sigma_1
  lin_max <- 1 + xi * xx
  # The GP density is zero where 1 + xi * xx <= 0 (outside the support)
  t4 <- ifelse(lin_max > 0,
               ifelse(xi_near_0, exp(-xx + xi * xx * (xx - 2) / 2),
                      lin_max ^ pow2),
               0)
  t4 <- mean(p1 * t4 / sigma_1)
  #
  # The maximum contributes at every validation threshold, because it
  # exceeds all of them (ithresh() requires max(u_vec) < max(data))
  return(t123 + n_max * log(t4))
}
# Any scripts or data that you put into this service are public.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.