NOT_CRAN <- identical(Sys.getenv("NOT_CRAN"), "true") knitr::opts_chunk$set( collapse = TRUE, comment = "#>", eval = NOT_CRAN, echo = TRUE )
library(ExactVaRTest)
Let ({X_t}_{t=1}^n) be the (0/1) hit sequence.
Under the null (H_0 : X_t \stackrel{\text{i.i.d.}}{\sim} \mathrm{Bernoulli}(\alpha)),
define the cell counts
[ T_{00}=\sum_{t=2}^{n}! \mathbf 1{X_{t-1}=0,\;X_t=0},\qquad T_{01},\;T_{10},\;T_{11}\hspace{2pt}\text{analogously},\qquad T_0=T_{00}+T_{10},\;T_1=T_{01}+T_{11}. ]
The likelihood-ratio statistic for independence is
[ \mathrm{LR}{\text{ind}} = -2!\left[ T_0\log(1-\hat p)+T_1\log\hat p \;-\; T{00}\log(1-\pi_{01})-T_{01}\log\pi_{01} \;-\; T_{10}\log(1-\pi_{11})-T_{11}\log\pi_{11} \right], ]
where (\displaystyle \hat p = \frac{T_1}{n-1},\qquad \pi_{01} = \frac{T_{01}}{T_{00}+T_{01}},\qquad \pi_{11} = \frac{T_{11}}{T_{10}+T_{11}}. )
Note.
All logarithms are evaluated with the safe function
[ \operatorname{safe_log}(x)=\log\bigl(\max{x,10^{-15}}\bigr), ]
which prevents floating-point underflow when (x) is extremely small.
At time (k\;(1\le k\le n)) we compress every partial path into the state
[ s = \bigl(\,\ell,\;c_{00},c_{10},c_{01},c_{11}\bigr),\qquad \ell\in{0,1}, ]
where (\ell) is the last hit and (c_{xy}) are the running counts of
((X_{t-1}=x,\;X_t=y)) up to time (k).
The forward probability attached to (s) is the summed mass of all paths that
lead to this state.
With transition probabilities
[ P(X_k=0\mid H_0)=1-\alpha,\qquad P(X_k=1\mid H_0)=\alpha, ]
each state produces two offspring:
[ \begin{aligned} \textbf{0-step}:&\; (\ell,c_{00},c_{10},c_{01},c_{11};\,p) \;\longrightarrow\; (0,c_{00}!+!\mathbf1_{{\ell=0}},c_{10}!+!\mathbf1_{{\ell=1}}, c_{01},c_{11};\,p(1-\alpha)),\[6pt] \textbf{1-step}:&\; (\ell,c_{00},c_{10},c_{01},c_{11};\,p) \;\longrightarrow\; (1,c_{00},c_{10}, c_{01}!+!\mathbf1_{{\ell=0}},c_{11}!+!\mathbf1_{{\ell=1}};\,p\alpha). \end{aligned} ]
If multiple paths arrive at the same offspring state, their probabilities are summed (state aggregation).
States with probability mass below a fixed threshold (\tau\;(=10^{-15}\text{ by default})) are discarded at each step:
[ \text{keep } s \iff p_s \ge \tau. ]
Empirically this leaves the exact distribution unchanged while reducing the state space by several orders of magnitude.
Christoffersen’s conditional-coverage test adds an unconditional-coverage term
[ \mathrm{LR}_{\text{uc}} = -2!\Bigl[ c_1\log\alpha + (n-c_1)\log(1-\alpha) -c_1\log\hat\alpha - (n-c_1)\log(1-\hat\alpha) \Bigr], \qquad \hat\alpha=\frac{c_1}{n}, ]
to the independence part, so that
[ \mathrm{LR}{\text{cc}} = \mathrm{LR}{\text{uc}} + \mathrm{LR}_{\text{ind}}. ]
To adapt the algorithm, we augment each state with the running total of exceptions (c_1=\sum_{t=1}^{k} X_t):
[ s = (\ell,\,c_1,\,c_{00},c_{10},c_{01},c_{11}). ]
A 1-step transition increments (c_1); a 0-step leaves it unchanged.
All other mechanics (expansion, aggregation, pruning) are identical to the
independence algorithm.
At termination we compute (\mathrm{LR}{\text{uc}}) and
(\mathrm{LR}{\text{ind}}) for every state, sum them to obtain
(\mathrm{LR}_{\text{cc}}), and aggregate identical values as before.
The table below benchmarks how long the package needs to produce the exact finite-sample distribution for a range of (n) and (\alpha).
library(bench) library(dplyr) library(tidyr) library(purrr) library(knitr) n_vec <- c(50, 100, 250, 500, 750, 1000) alpha_vec <- c(0.01, 0.025, 0.05) grid <- expand.grid(n = n_vec, alpha = alpha_vec, KEEP.OUT.ATTRS = FALSE) bench_one <- function(n, alpha, fun) { bm <- bench::mark( fun(n = n, alpha = alpha), iterations = 3, check = FALSE ) tibble( n = n, alpha = alpha, median_s = as.numeric(bm$median) ) } timings_ind <- pmap_dfr(grid, bench_one, fun = lr_ind_dist) timings_cc <- pmap_dfr(grid, bench_one, fun = lr_cc_dist) fmt_wide <- function(df) { df %>% mutate(alpha = sprintf("alpha = %.3f", alpha)) %>% pivot_wider(names_from = alpha, values_from = median_s) %>% arrange(n) } table_ind <- fmt_wide(timings_ind) table_cc <- fmt_wide(timings_cc) kable( table_ind, digits = 3, col.names = c("n", "α = 0.010", "α = 0.025", "α = 0.050"), caption = "Median run-time (seconds) for lr_ind_dist()" ) kable( table_cc, digits = 3, col.names = c("n", "α = 0.010", "α = 0.025", "α = 0.050"), caption = "Median run-time (seconds) for lr_cc_dist()" )
Here it measures the end-to-end cost of a single backtest_lr()
call on a synthetic 0/1 series.
library(xts) alpha <- 0.01 window <- 250 local_file <- "inst/extdata/GSPC_close.rds" file_path <- if (file.exists(local_file)) local_file else system.file("extdata", "GSPC_close.rds", package = "ExactVaRTest") px <- readRDS(file_path) ret <- diff(log(px), lag = 1) VaR <- rollapply( ret, window, function(r) quantile(r, alpha, na.rm = TRUE), by.column = FALSE, align = "right" ) keep <- complete.cases(ret, VaR) ret <- ret[keep] VaR <- coredata(VaR[keep]) x <- ifelse(coredata(ret) < VaR, 1L, 0L) cat("Series length :", length(x), "\n") cat("Exception rate:", mean(x), "\n\n") bench_res <- bench::mark( LR_ind = backtest_lr(x, alpha, "ind"), LR_cc = backtest_lr(x, alpha, "cc"), iterations = 10, check = FALSE ) suppressWarnings( print(bench_res[, c("expression", "median")]) ) res_ind <- backtest_lr(x, alpha, "ind") res_cc <- backtest_lr(x, alpha, "cc") cat("\n--- Independence test ---\n") print(res_ind) cat("\n--- Conditional-coverage test ---\n") print(res_cc)
The following figure shows the exact finite-sample CDF with the usual (\chi^2) approximation for (n) = 250, (\alpha) = 1%.
n <- 250 alpha <- 0.01 d_ind <- lr_ind_dist(n, alpha) d_cc <- lr_cc_dist(n, alpha) d_cc$LR <- d_cc$LR_cc d_cc$prob <- d_cc$prob_cc oldpar <- par(mfrow = c(1, 2), mar = c(4, 4, 2, 1)) # LR_ind vs χ²(1) curve(pchisq(x, df = 1), 0, 20, lty = 2, ylab = "CDF", xlab = "LR_ind statistic", main = "LR_ind") lines(stepfun(d_ind$LR, c(0, cumsum(d_ind$prob))), pch = 19) legend("bottomright", c("Chi-sq(1)", "Exact"), lty = c(2, 1), bty = "n") # LR_cc vs χ²(2) curve(pchisq(x, df = 2), 0, 30, lty = 2, ylab = "CDF", xlab = "LR_cc statistic", main = "LR_cc") lines(stepfun(d_cc$LR, c(0, cumsum(d_cc$prob))), pch = 19) legend("bottomright", c("Chi-sq(2)", "Exact"), lty = c(2, 1), bty = "n") par(oldpar)
A quick Monte-Carlo shows how often each method rejects under the null when (n) = 250 and (\alpha) = 1%.
n <- 250 alpha <- 0.01 set.seed(1) # LR_cc p.chi.cc <- replicate( 1000, ExactVaRTest::lr_cc_stat(rbinom(n, 1, alpha), alpha) > qchisq(.95, 2) ) p.exact.cc <- replicate( 1000, { x <- rbinom(n, 1, alpha) ExactVaRTest::backtest_lr(x, alpha, "cc")$pval < 0.05 } ) # LR_ind p.chi.ind <- replicate( 1000, ExactVaRTest::lr_ind_stat(rbinom(n, 1, alpha), alpha) > qchisq(.95, 1) ) p.exact.ind <- replicate( 1000, { x <- rbinom(n, 1, alpha) ExactVaRTest::backtest_lr(x, alpha, "ind")$pval < 0.05 } ) data.frame( Test = c("LR_cc Chi^2", "LR_cc Exact", "LR_ind Chi^2", "LR_ind Exact"), Size = c(mean(p.chi.cc), mean(p.exact.cc), mean(p.chi.ind), mean(p.exact.ind)) )
We plot 250-day rolling p-values ((\alpha) = 1%) for LRind and LRcc.
library(ggplot2) library(dplyr) library(tidyr) alpha <- 0.01 win <- 250 local_file <- "inst/extdata/GSPC_close.rds" file_path <- if (file.exists(local_file)) local_file else system.file("extdata", "GSPC_close.rds", package = "ExactVaRTest") px <- readRDS(file_path) px <- px[index(px) >= "2012-01-01"] ret <- diff(log(px)) VaR <- rollapply( ret, win, function(r) quantile(r, alpha, na.rm = TRUE), by.column = FALSE, align = "right" ) keep <- complete.cases(ret, VaR) r <- coredata(ret[keep]) v <- coredata(VaR[keep]) exc <- ifelse(r < v, 1L, 0L) n_seg <- length(exc) - win + 1 ind_exact <- ind_chi <- cc_exact <- cc_chi <- numeric(n_seg) for (i in seq_len(n_seg)) { seg <- exc[i:(i + win - 1)] ind_stat <- ExactVaRTest::lr_ind_stat(seg, alpha) cc_stat <- ExactVaRTest::lr_cc_stat(seg, alpha) ind_exact[i] <- ExactVaRTest::pval_lr_ind(ind_stat, win, alpha) cc_exact[i] <- ExactVaRTest::pval_lr_cc(cc_stat, win, alpha) ind_chi[i] <- pchisq(ind_stat, df = 1, lower.tail = FALSE) cc_chi[i] <- pchisq(cc_stat, df = 2, lower.tail = FALSE) } df <- tibble( idx = seq_len(n_seg), ind_exact, ind_chi, cc_exact, cc_chi ) %>% pivot_longer(cols = -idx, names_to = c("test", "method"), names_pattern = "(ind|cc)_(.*)", values_to = "pval") %>% mutate(test = ifelse(test == "ind", "LRind", "LRcc"), method = ifelse(method == "exact", "exact", "chi-sq")) ggplot(df, aes(idx, pval, colour = method)) + geom_step() + geom_hline(yintercept = 0.05, linetype = 2, colour = "red") + facet_wrap(~ test, ncol = 1, scales = "free_x") + scale_colour_manual(values = c("chi-sq" = "red", "exact" = "cyan4")) + labs(x = "window start index", y = "p-value", title = "Rolling 250-day p-values (α = 1%)") + theme_bw()
library(ExactVaRTest) n_set <- c(250, 500, 750, 1000) alpha_set <- c(0.005, 0.01, 0.025, 0.05) gamma_set <- c(0.90, 0.95, 0.99) q_lr <- function(d, g) d$LR[which(cumsum(d$prob) >= g)[1L]] tbl <- expand.grid(n = n_set, alpha = alpha_set, gamma = gamma_set, KEEP.OUT.ATTRS = FALSE, stringsAsFactors = FALSE) tbl$crit_ind <- mapply(function(n, a, g) q_lr(lr_ind_dist(n, a, prune_threshold = 1e-15), g), tbl$n, tbl$alpha, tbl$gamma, SIMPLIFY = TRUE) tbl$crit_cc <- mapply(function(n, a, g) { d <- lr_cc_dist(n, a, prune_threshold = 1e-15) d$LR <- d$LR_cc d$prob <- d$prob_cc q_lr(d, g) }, tbl$n, tbl$alpha, tbl$gamma, SIMPLIFY = TRUE) print(tbl, digits = 6)
( p = \sum_{k=0}^{P} \Pr!\bigl(P' = k\bigr)\,\Pr!\bigl(LR_{\text{uc/ind}} \ge \text{obs}\mid P' = k\bigr) )
library(ExactVaRTest) set.seed(123) P <- 250 alpha <- 0.05 alpha_prime <- 0.10 inst_flag <- rbinom(P, 1, alpha_prime) sys_flag <- if (sum(inst_flag)) rbinom(sum(inst_flag), 1, alpha) else integer(0) lr_uc <- lr_uc_stat(sys_flag, alpha) lr_ind <- lr_ind_stat(sys_flag, alpha) mix_tail <- function(lr_obs, P, alpha, alpha_prime, type = c("uc", "ind"), prune = 1e-15) { type <- match.arg(type) w <- dbinom(0:P, P, alpha_prime) tail_prob <- function(k) { if (type == "uc") { if (!k) return(as.numeric(lr_obs <= 0)) d <- lr_uc_dist(k, alpha) sum(d$prob[d$LR >= lr_obs]) } else { if (k < 2) return(as.numeric(lr_obs <= 0)) d <- lr_ind_dist(k, alpha, prune) sum(d$prob[d$LR >= lr_obs]) } } sum(vapply(0:P, tail_prob, numeric(1)) * w) } p_uc <- mix_tail(lr_uc, P, alpha, alpha_prime, "uc") p_ind <- mix_tail(lr_ind, P, alpha, alpha_prime, "ind") data.frame( test = c("UC", "IND"), stat = c(lr_uc, lr_ind), p = c(p_uc, p_ind) )
sessionInfo()
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.