exdex: Estimation of the Extremal Index

# ================================= kgaps_mle =================================
#
#' Maximum likelihood estimation for the K-gaps model
#'
#' Calculates maximum likelihood estimates of the extremal index \eqn{\theta}
#' based on the K-gaps model for threshold inter-exceedances times of
#' Suveges and Davison (2010).
#'
#' @param data A numeric vector of raw data.  No missing values are allowed.
#' @param thresh A numeric scalar.  Extreme value threshold applied to data.
#' @param k A numeric scalar.  Run parameter \eqn{K}, as defined in Suveges and
#'   Davison (2010).  Threshold inter-exceedances times that are not larger
#'   than \code{k} units are assigned to the same cluster, resulting in a
#'   \eqn{K}-gap equal to zero.  Specifically, the \eqn{K}-gap \eqn{S}
#'   corresponding to an inter-exceedance time of \eqn{T} is given by
#'   \eqn{S = max(T - K, 0)}.
#' @param inc_cens A logical scalar indicating whether or not to include
#'   contributions from censored inter-exceedance times relating to the
#'   first and last observation.  See Attalides (2015) for details.
#' @param conf  A numeric scalar.  If \code{conf} is supplied then a
#'   \code{conf}\% likelihood-based confidence interval for \eqn{\theta} is
#'   estimated.
#' @details The maximum likelihood estimate of the extremal index \eqn{\theta}
#'   under the K-gaps model of Suveges and Davison (2010) is calculated.
#'   If \code{inc_cens = TRUE} then information from censored inter-exceedance
#'   times is included in the likelihood to be maximised, following
#'   Attalides (2015).  The form of the log-likelihood is given in the
#'   \strong{Details} section of \code{\link{kgaps_stats}}.
#' @references Suveges, M. and Davison, A. C. (2010) Model
#'   misspecification in peaks over threshold analysis, \emph{The Annals of
#'   Applied Statistics}, \strong{4}(1), 203-221.
#'   \url{http://dx.doi.org/10.1214/09-AOAS292"}
#' @references Attalides, N. (2015) Threshold-based extreme value modelling,
#'   PhD thesis, University College London.
#'   \url{http://discovery.ucl.ac.uk/1471121/1/Nicolas_Attalides_Thesis.pdf}
#' @return A list containing
#'   \itemize{
#'     \item {\code{theta_mle} : } {The maximum likelihood estimate (MLE) of
#'       \eqn{\theta}.}
#'     \item {\code{theta_se} : } {The estimated standard error of the MLE.}
#'     \item {\code{theta_ci} : } {(If \code{conf} is supplied) a numeric
#'       vector of length two giving lower and upper confidence limits for
#'       \eqn{\theta}.}
#'     \item {\code{ss} : } {The list of summary statistics returned from
#'       \code{\link{kgaps_stats}}.}
#'   }
#' @seealso \code{\link{kgaps_stats}} for the calculation of sufficient
#'   statistics for the K-gaps model.
#' @examples
#' thresh <- quantile(newlyn, probs = 0.90)
#' # MLE and SE only
#' kgaps_mle(newlyn, thresh)
#' # MLE, SE and 95% confidence interval
#' kgaps_mle(newlyn, thresh, conf = 95)
#' @export
kgaps_mle <- function(data, thresh, k = 1, inc_cens = FALSE, conf = NULL) {
  # Calculate sufficient statistics
  ss <- kgaps_stats(data, thresh, k, inc_cens)
  # If N0 = 0 then all exceedances occur singly (all K-gaps are positive)
  # and the likelihood is maximised at theta = 1.
  N0 <- ss$N0
  # If N1 = 0 then we are in the degenerate case where there is one cluster
  # (all K-gaps are zero) and the likelihood is maximised at theta = 0.
  N1 <- ss$N1
  if (N1 == 0) {
    theta_mle <- 0L
  } else if (N0 == 0) {
    theta_mle <- 1L
  } else {
    sum_qs <- ss$sum_qs
    theta_mle <- kgaps_quad_solve(N0, N1, sum_qs)
  }
  # Estimate standard error
  obs_info <- 0
  if (N0 > 0) {
    obs_info <- obs_info + N0 / (1 - theta_mle) ^ 2
  }
  if (N1 > 0) {
    obs_info <- obs_info + 2 * N1 / theta_mle ^ 2
  }
  theta_se <- sqrt(1 / obs_info)
  if (is.null(conf)) {
    return(list(theta_mle = theta_mle, theta_se = theta_se, ss = ss))
  }
  conf_int <- kgaps_conf_int(theta_mle, ss, conf = conf)
  return(list(theta_mle = theta_mle, theta_se = theta_se, theta_ci = conf_int,
              ss = ss))
}

# ================================ kgaps_stats ================================

#' Sufficient statistics for the K-gaps model
#'
#' Calculates sufficient statistics for the K-gaps model for the extremal index
#' \eqn{\theta}.
#'
#' @param data A numeric vector of raw data.  No missing values are allowed.
#' @param thresh A numeric scalar.  Extreme value threshold applied to data.
#' @param k A numeric scalar.  Run parameter \eqn{K}, as defined in Suveges and
#'   Davison (2010).  Threshold inter-exceedances times that are not larger
#'   than \code{k} units are assigned to the same cluster, resulting in a
#'   \eqn{K}-gap equal to zero.  Specifically, the \eqn{K}-gap \eqn{S}
#'   corresponding to an inter-exceedance time of \eqn{T} is given by
#'   \eqn{S = max(T - K, 0)}.
#' @param inc_cens A logical scalar indicating whether or not to include
#'   contributions from censored inter-exceedance times relating to the
#'   first and last observation.  See Attalides (2015) for details.
#' @details The sample K-gaps are \eqn{S_0, S_1, ..., S_(N-1),
#'   S_N}, where \eqn{S_1, ..., S_(N-1)} are uncensored and \eqn{S_0} and
#'   \eqn{S_N} are censored.  Under the assumption that the K-gaps are
#'   independent, the log-likelihood of the K-gaps model is given by
#'   \deqn{l(\theta; S_0, ..., S_N) = N_0 log(1 - \theta) + 2 N_1 log \theta -
#'     \theta q (S_0 + ... + S_N),}
#'    where \eqn{q} is the threshold exceedance probability,
#'    \eqn{N_0} is the number of sample K-gaps that are equal to zero and
#'    (apart from an adjustment for the contributions of \eqn{S_0} and
#'    \eqn{S_N}) \eqn{N_1} is the number of positive sample K-gaps.
#'    Specifically, \eqn{N_1} is equal to the number of \eqn{S_1, ..., S_(N-1)}
#'    that are positive plus \eqn{(I_0 + I_N) / 2}, where \eqn{I_0 = 1} if
#'    \eqn{S_0} is greater than zero and similarly for \eqn{I_N}.
#'    The differing treatment of uncensored and censored K-gaps reflects
#'    differing contributions to the likelihood.
#'    For full details see Suveges and Davison (2010) and Attalides (2015).
#' @return A list containing the sufficient statistics, with components
#'   \itemize{
#'     \item {\code{N0} : } {the number of zero K-gaps}
#'     \item {\code{N1} : } {contribution from non-zero K-gaps (see
#'       \strong{Details})}
#'     \item {\code{sum_qs} : } {the sum of the (scaled) K-gaps, i.e.
#'       \eqn{q (S_0 + ... + S_N)}, where \eqn{q} is estimated by the
#'       proportion of threshold exceedances.}
#'   }
#' @references Suveges, M. and Davison, A. C. (2010) Model
#'   misspecification in peaks over threshold analysis, \emph{The Annals of
#'   Applied Statistics}, \strong{4}(1), 203-221.
#'   \url{http://dx.doi.org/10.1214/09-AOAS292"}
#' @references Attalides, N. (2015) Threshold-based extreme value modelling,
#'   PhD thesis, University College London.
#'   \url{http://discovery.ucl.ac.uk/1471121/1/Nicolas_Attalides_Thesis.pdf}
#' @seealso \code{\link{kgaps_mle}} for maximum likelihood estimation of the
#'   extremal index \eqn{\theta} using the K-gaps model.
#' @examples
#' u <- quantile(newlyn, probs = 0.90)
#' kgaps_stats(newlyn, u)
#' @export
kgaps_stats <- function(data, thresh, k = 1, inc_cens = FALSE) {
  if (any(is.na(data))) {
    stop("No missing values are allowed in ''data''")
  }
  # Sample size, positions, number and proportion of exceedances
  nx <- length(data)
  exc_u <- (1:nx)[data > thresh]
  N_u <- length(exc_u)
  q_u <- N_u / nx
  # Inter-exceedances times and K-gaps
  T_u <- diff(exc_u)
  S_k <- pmax(T_u - k, 0)
  # N0, N1, sum of scaled K-gaps
  N1 <- sum(S_k > 0)
  N0 <- N_u - 1 - N1
  sum_qs <- sum(q_u * S_k)
  # Include censored inter-exceedance times?
  if (inc_cens) {
    # censored inter-exceedance times and K-gaps
    T_u_cens <- c(exc_u[1] - 1, nx - exc_u[N_u])
    S_k_cens <- pmax(T_u_cens - k, 0)
    # N0, N1, sum of scaled K-gaps
    N1_cens <- sum(S_k_cens > 0)
    sum_s_cens <- sum(q_u * S_k_cens)
    # Add contributions.
    # Note: we divide N1_cens by two because a censored non-zero K-gap S_c
    # contributes theta exp(-theta q_u S_c) to the K-gaps likelihood,
    # whereas a non-censored non-zero K-gap contributes
    # theta^2 exp(-theta q_u S_c).
    # See equation (4.3) of Attalides (2015)
    N1 <- N1 + N1_cens / 2
    sum_qs <- sum_qs + sum_s_cens
  }
  return(list(N0 = N0, N1 = N1, sum_qs = sum_qs))
}

# =============================== kgaps_loglik ================================

kgaps_loglik <- function(theta, N0, N1, sum_qs){
  if (theta < 0 || theta > 1) {
    return(-Inf)
  }
  loglik <- 0
  if (N1 > 0) {
    loglik <- loglik + 2 * N1 * log(theta) - sum_qs * theta
  }
  if (N0 > 0) {
    loglik <- loglik + N0 * log(1 - theta)
  }
  return(loglik)
}

# ============================== kgaps_conf_int ===============================

kgaps_conf_int <- function(theta_mle, ss, conf = 95) {
  cutoff <- stats::qchisq(conf / 100, df = 1)
  theta_list <- c(list(theta = theta_mle), ss)
  max_loglik <- do.call(kgaps_loglik, theta_list)
  ob_fn <- function(theta) {
    theta_list$theta <- theta
    loglik <- do.call(kgaps_loglik, theta_list)
    return(2 * (max_loglik - loglik) - cutoff)
  }
  ci_low <- 0
  ci_up <- 1
  if (ss$N1 > 0) {
    ci_low <- stats::uniroot(ob_fn, c(0, theta_mle))$root
  }
  if (ss$N0 > 0) {
    ci_up <- stats::uniroot(ob_fn, c(theta_mle, 1))$root
  }
  return(c(ci_low, ci_up))
}

# ============================== kgaps_quad_solve =============================

kgaps_quad_solve <- function(N0, N1, sum_qs) {
  aa <- sum_qs
  bb <- - (N0 + 2 * N1 + sum_qs)
  cc <- 2 * N1
  qq <- - (bb - sqrt(bb ^ 2 - 4 * aa * cc)) / 2
  theta_mle <- cc / qq
  return(theta_mle)
}