################################################################################
# ChangePointTests.R
################################################################################
# 2018-08-10
# Curtis Miller
################################################################################
# Definition of all change point statistical tests.
################################################################################
################################################################################
# GENERAL STATISTICAL FUNCTIONS
################################################################################
#' Variance Estimation Consistent Under Change
#'
#' Estimate the variance (using the sum of squared errors) with an estimator
#' that is consistent when the mean changes at a known point.
#'
#' This is the estimator
#'
#' \deqn{\hat{\sigma}^2_{T,t} = T^{-1}\left(\sum_{s = 1}^t \left(X_s -
#' \bar{X}_t\right)^2 + \sum_{s = t + 1}^{T}\left(X_s - \tilde{X}_{T - t}
#' \right)^2\right)}
#'
#' where \eqn{\bar{X}_t = t^{-1}\sum_{s = 1}^t X_s} and \eqn{\tilde{X}_{T - t} =
#' (T - t)^{-1} \sum_{s = t + 1}^{T} X_s}. In this implementation, \eqn{T} is
#' computed automatically as \code{length(x)} and \code{k} corresponds to
#' \eqn{t}, a potential change point.
#'
#' @param x A numeric vector for the data set
#' @param k The potential change point at which the data set is split
#' @import stats
#' @return The estimated change-consistent variance
#' @examples
#' CPAT:::cpt_consistent_var(c(rnorm(500, mean = 0), rnorm(500, mean = 1)), k = 500)
cpt_consistent_var <- function(x, k) {
n <- length(x)
if (n < k | k < 0) {stop("k must be an integer between 1 and length(x)")}
x1 <- x[1:k]
x2 <- x[(k + 1):n]
mu1 <- mean(x1)
mu2 <- mean(x2)
sse1 <- sum((x1 - mu1)^2)
sse2 <- sum((x2 - mu2)^2)
(sse1 + sse2)/n
}
#' Weights for Long-Run Variance
#'
#' Compute some weights for long-run variance. This code comes directly from the
#' source code of \pkg{cointReg}; see \code{\link[cointReg]{getLongRunWeights}}.
#'
#' @param n Length of weights' vector
#' @param bandwidth A number for the bandwidth
#' @param kernel The kernel function; see \code{\link[cointReg]{getLongRunVar}}
#' for possible values
#' @return List with components \code{w} containing the vector of weights and
#' \code{upper}, the index of the largest non-zero entry in \code{w}
#' @examples
#' CPAT:::getLongRunWeights(10, 1)
getLongRunWeights <- function(n, bandwidth, kernel = "ba") {
w <- numeric(n - 1)
bw<- bandwidth
if (kernel == "tr") {
w <- w + 1
upper <- min(bw, n - 1)
}
else if (kernel == "ba") {
upper <- ceiling(bw) - 1
if (upper > 0) {
j <- 1:upper
}
else {
j <- 1
}
w[j] <- 1 - j/bw
}
else if (kernel == "pa") {
upper1 <- floor(bw/2)
if (upper1 > 0) {
j <- 1:upper1
}
else {
j <- 1
}
jj <- j/bw
w[j] <- 1 - 6 * jj^2 + 6 * jj^3
j2 <- (floor(bw/2) + 1):bw
jj2 <- j2/bw
w[j2] <- 2 * (1 - jj2)^3
upper <- ceiling(bw) - 1
}
else if (kernel == "bo") {
upper <- ceiling(bw) - 1
if (upper > 0) {
j <- 1:upper
}
else {
j <- 1
}
jj <- j/bw
w[j] <- (1 - jj) * cos(pi * jj) + sin(pi * jj)/pi
}
else if (kernel == "da") {
upper <- n - 1
j <- 1:upper
w[j] <- sin(pi * j/bw)/(pi * j/bw)
}
else if (kernel == "qs") {
sc <- 1.2 * pi
upper <- n - 1
j <- 1:upper
jj <- j/bw
w[j] <- 25/(12 * pi^2 * jj^2) * (sin(sc * jj)/(sc * jj) -
cos(sc * jj))
}
if (upper <= 0)
upper <- 1
list(w = w, upper = upper)
}
#' Long-Run Variance Estimation With Possible Change Points
#'
#' Computes the estimates of the long-run variance in a change point context, as
#' described in \insertCite{horvathricemiller19}{CPAT}. By default it uses
#' kernel and bandwidth selection as used in the package \pkg{cointReg}, though
#' changing the parameters \code{kernel} and \code{bandwidth} can change this
#' behavior. If \pkg{cointReg} is not installed, the Bartlett internal (defined
#' internally) will be used and the bandwidth will be the square root of the
#' sample size.
#'
#' @param dat The data vector
#' @param kernel If character, the identifier of the kernel function as used in
#' \pkg{cointReg} (see \code{\link[cointReg]{getLongRunVar}}); if
#' function, the kernel function to be used for long-run variance
#' estimation (default is the Bartlett kernel in \pkg{cointReg})
#' @param bandwidth If character, the identifier for how to compute the
#' bandwidth as defined in \pkg{cointReg} (see
#' \code{\link[cointReg]{getBandwidth}}); if function, a function
#' to use for computing the bandwidth; if numeric, the bandwidth
#' value to use (the default is to use Andrews' method, as used in
#' \pkg{cointReg})
#' @return A vector of estimates of the long-run variance
#' @references
#' \insertAllCited{}
#' @examples
#' x <- rnorm(1000)
#' CPAT:::get_lrv_vec(x)
#' CPAT:::get_lrv_vec(x, kernel = "pa", bandwidth = "nw")
get_lrv_vec <- function(dat, kernel = "ba", bandwidth = "and") {
has_cointreg <- requireNamespace("cointReg", quietly = TRUE)
n <- length(dat)
if (is.character(bandwidth)) {
if (!has_cointreg) {
warning("cointReg is not installed! Defaulting to sqrt.")
bandwidth <- sqrt
} else {
if (is.character(kernel) &&
kernel %in% c("ba", "pa", "qs", "th", "tr")) {
kervar <- kernel
} else {
kervar = "ba"
}
h <- cointReg::getBandwidth(dat, bandwidth = bandwidth, kernel = kervar)
}
} else if (is.numeric(bandwidth)) {
if (bandwidth <= 0) {
stop("Bandwidth must be greater than zero.")
} else {
h <- bandwidth
}
} else if (!is.function(bandwidth)) {
stop(paste("bandwidth must be a function, a valid character string, or a",
"non-negative number."))
} else {
h <- bandwidth(n)
}
if (is.character(kernel)) {
kern_vals <- c(1, getLongRunWeights(n, kernel = kernel,
bandwidth = h)$w[-n])
} else if (!is.function(kernel)) {
stop("kernel must be a function or a valid character string.")
} else {
kern_vals <- sapply(1:(n - 1)/h, kernel)
}
# The maximum lag that needs to be checked
max_l <- max(which(kern_vals != 0))
# dat <- rnorm(100)
#dat <- 1:10
dat_mat <- matrix(rep(dat, times = n), byrow = FALSE, nrow = n)
dat_lower <- dat_mat; dat_upper <- dat_mat;
dat_lower[lower.tri(dat_mat, diag = FALSE)] <- NA
dat_upper[!lower.tri(dat_mat, diag = FALSE)] <- NA
x_bar <- colMeans(dat_lower, na.rm = TRUE)
x_tilda <- colMeans(dat_upper, na.rm = TRUE)
mean_mat <- matrix(rep(x_bar, times = n), byrow = TRUE, nrow = n)
mean_mat[lower.tri(diag(n), diag = FALSE)] <- matrix(rep(x_tilda, times = n),
byrow = TRUE,
nrow = n)[lower.tri(
diag(n), diag = FALSE)]
# l increases by row (starts at 0), t by column
y <- dat_mat - mean_mat
# Function implemented in Rcpp for speed
# It accepts the matrix y and the evaluation of the kernel function stored
# in kern_vals, and returns a vector containing estimated long-run variances
# at points t
sigma <- get_lrv_vec_cpp(y, kern_vals, max_l)
# "Equivalent" R code (slower)
# covs <- sapply(1:n, function(t) {
# sapply(0:(n - 1), function(l) {
# mean(y[1:(n - l),t]*y[(1 + l):n,t])
# })
# })
#
# sigma <- sapply(2:(n - 2), function(t) {
# covs[0 + 1, t] + 2 * sum(vkernel(1:(n - 1)/h) * covs[1:(n - 1), t])
# })
if (any(sigma < 0)) {
warning(paste("A negative variance was computed! This may be due to a bad",
"kernel being chosen."))
}
sigma
}
################################################################################
# TEST STATISTIC COMPUTATION FUNCTIONS
################################################################################
#' Compute the CUSUM Statistic
#'
#' This function computes the CUSUM statistic (and can compute weighted/trimmed
#' variants, depending on the values of \code{kn} and \code{tau}).
#'
#' The definition of the statistic is
#'
#' \deqn{T^{-1/2} \max_{1 \leq t \leq T} \hat{\sigma}_{t,T}^{-1} \left|
#' \sum_{s = 1}^t X_s - \frac{t}{T}\sum_{s = 1}^T \right|}
#'
#' A more general version is
#'
#' \deqn{T^{-1/2} \max_{t_T \leq t \leq T - t_T} \hat{\sigma}_{t,T}^{-1}
#' \left(\frac{t}{T}
#' \left(\frac{T - t}{T}\right)\right)^{\tau} \left| \sum_{s = 1}^t X_s -
#' \frac{t}{T}\sum_{s = 1}^T \right|}
#'
#' The parameter \code{kn} corresponds to the trimming parameter \eqn{t_T} and
#' the parameter \code{tau} corresponds to \eqn{\tau}.
#'
#' See \insertCite{horvathricemiller19}{CPAT} for more details.
#'
#' @param dat The data vector
#' @param kn A function corresponding to the trimming parameter \eqn{t_T} in the
#' trimmed CUSUM variant; by default, is a function returning 1 (for
#' no trimming)
#' @param tau The weighting parameter \eqn{\tau} for the weighted CUSUM
#' statistic; by default, is 0 (for no weighting)
#' @param estimate Set to \code{TRUE} to return the estimated location of the
#' change point
#' @param use_kernel_var Set to \code{TRUE} to use kernel methods for long-run
#' variance estimation (typically used when the data is
#' believed to be correlated); if \code{FALSE}, then the
#' long-run variance is estimated using
#' \eqn{\hat{\sigma}^2_{T,t} = T^{-1}\left(
#' \sum_{s = 1}^t \left(X_s - \bar{X}_t\right)^2 +
#' \sum_{s = t + 1}^{T}\left(X_s -
#' \tilde{X}_{T - t}\right)^2\right)}, where
#' \eqn{\bar{X}_t = t^{-1}\sum_{s = 1}^t X_s} and
#' \eqn{\tilde{X}_{T - t} = (T - t)^{-1}
#' \sum_{s = t + 1}^{T} X_s}
#' @param custom_var Can be a vector the same length as \code{dat} consisting of
#' variance-like numbers at each potential change point (so
#' each entry of the vector would be the "best estimate" of
#' the long-run variance if that location were where the
#' change point occured) or a function taking two parameters
#' \code{x} and \code{k} that can be used to generate this
#' vector, with \code{x} representing the data vector and
#' \code{k} the position of a potential change point; if
#' \code{NULL}, this argument is ignored
#' @param kernel If character, the identifier of the kernel function as used in
#' \pkg{cointReg} (see \code{\link[cointReg]{getLongRunVar}}); if
#' function, the kernel function to be used for long-run variance
#' estimation (default is the Bartlett kernel in \pkg{cointReg})
#' @param bandwidth If character, the identifier for how to compute the
#' bandwidth as defined in \pkg{cointReg} (see
#' \code{\link[cointReg]{getBandwidth}}); if function, a function
#' to use for computing the bandwidth; if numeric, the bandwidth
#' value to use (the default is to use Andrews' method, as used in
#' \pkg{cointReg})
#' @param get_all_vals If \code{TRUE}, return all values for the statistic at
#' every tested point in the data set
#' @return If both \code{estimate} and \code{get_all_vals} are \code{FALSE}, the
#' value of the test statistic; otherwise, a list that contains the test
#' statistic and the other values requested (if both are \code{TRUE},
#' the test statistic is in the first position and the estimated change
#' point in the second)
#' @references
#' \insertAllCited{}
#' @examples
#' CPAT:::stat_Vn(rnorm(1000))
#' CPAT:::stat_Vn(rnorm(1000), kn = function(n) {0.1 * n}, tau = 1/2)
#' CPAT:::stat_Vn(rnorm(1000), use_kernel_var = TRUE, bandwidth = "nw", kernel = "bo")
stat_Vn <- function(dat, kn = function(n) {1}, tau = 0, estimate = FALSE,
use_kernel_var = FALSE, custom_var = NULL, kernel = "ba",
bandwidth = "and", get_all_vals = FALSE) {
# Formerly named statVn()
# Here is equivalent (slow) R code
# n = length(dat)
# return(n^(-1/2)*max(sapply(
# floor(max(kn(n),1)):min(n - floor(kn(n)),n-1), function(k)
# (k/n*(1 - k/n))^(-tau)/
# sqrt((sum((dat[1:k] -
# mean(dat[1:k]))^2)+sum((dat[(k+1):n] -
# mean(dat[(k+1):n]))^2))/n)*
# abs(sum(dat[1:k]) - k/n*sum(dat))
# )))
if (use_kernel_var) {
lrv <- get_lrv_vec(dat, kernel, bandwidth)
} else if (!is.null(custom_var)) {
use_kernel_var <- TRUE # Otherwise stat_Zn_cpp() will ignore lrv
if (is.function(custom_var)) {
# This may seem silly, but this is so that error codes refer to
# custom_var, and we don't want recursion either
custom_var_temp <- custom_var
custom_var <- purrr::partial(custom_var_temp, x = dat, .lazy = FALSE)
custom_var_vec <- vapply(1:length(dat), custom_var,
FUN.VALUE = numeric(1))
} else if (is.numeric(custom_var)) {
if (length(custom_var) < length(dat)) stop("custom_var must have" %s%
"length at least" %s%
length(dat) %s0% ", the" %s%
"length of the data set")
custom_var_vec <- custom_var
} else {
stop("Don't know how to handle custom_var of class" %s% class(custom_var))
}
if (any(custom_var_vec < 0)) stop("custom_var suggests a negative" %s%
"variance, which is impossible.")
lrv <- custom_var_vec
} else {
# A vector must be passed to stat_Zn_cpp, so we'll pass one with an
# impossible value
lrv <- c(-1)
}
res <- stat_Vn_cpp(dat, kn(length(dat)), tau, use_kernel_var, lrv,
get_all_vals)
res[[2]] <- as.integer(res[[2]])
if (!estimate & !get_all_vals) {
return(res[[1]])
} else {
return(res[c(TRUE, estimate, get_all_vals)])
}
}
#' Compute the Darling-Erdös Statistic
#'
#' This function computes the Darling-Erdös statistic.
#'
#' If \eqn{\bar{A}_T(\tau, t_T)} is the weighted and trimmed CUSUM statistic
#' with weighting parameter \eqn{\tau} and trimming parameter \eqn{t_T} (see
#' \code{\link{stat_Vn}}), then the Darling-Erdös statistic is
#'
#' \deqn{l(a_T) \bar{A}_T(1/2, 1) - u(b_T)}
#'
#' with \eqn{l(x) = \sqrt{2 \log x}} and \eqn{u(x) = 2 \log x + \frac{1}{2} \log
#' \log x - \frac{1}{2} \log \pi} (\eqn{\log x} is the natural logarithm of
#' \eqn{x}). The parameter \code{a} corresponds to \eqn{a_T} and \code{b} to
#' \eqn{b_T}; these are both \code{log} by default.
#'
#' See \insertCite{horvathricemiller19}{CPAT} to learn more.
#'
#' @param dat The data vector
#' @param a The function that will be composed with
#' \eqn{l(x) = (2 \log x)^{1/2}}
#' @param b The function that will be composed with
#' \eqn{u(x) = 2 \log x + \frac{1}{2} \log \log x - \frac{1}{2} \log
#' \pi}
#' @param estimate Set to \code{TRUE} to return the estimated location of the
#' change point
#' @param use_kernel_var Set to \code{TRUE} to use kernel methods for long-run
#' variance estimation (typically used when the data is
#' believed to be correlated); if \code{FALSE}, then the
#' long-run variance is estimated using
#' \eqn{\hat{\sigma}^2_{T,t} = T^{-1}\left(
#' \sum_{s = 1}^t \left(X_s - \bar{X}_t\right)^2 +
#' \sum_{s = t + 1}^{T}\left(X_s -
#' \tilde{X}_{T - t}\right)^2\right)}, where
#' \eqn{\bar{X}_t = t^{-1}\sum_{s = 1}^t X_s} and
#' \eqn{\tilde{X}_{T - t} = (T - t)^{-1}
#' \sum_{s = t + 1}^{T} X_s}
#' @param kernel If character, the identifier of the kernel function as used in
#' \pkg{cointReg} (see \code{\link[cointReg]{getLongRunVar}}); if
#' function, the kernel function to be used for long-run variance
#' estimation (default is the Bartlett kernel in \pkg{cointReg})
#' @param bandwidth If character, the identifier for how to compute the
#' bandwidth as defined in \pkg{cointReg} (see
#' \code{\link[cointReg]{getBandwidth}}); if function, a function
#' to use for computing the bandwidth; if numeric, the bandwidth
#' value to use (the default is to use Andrews' method, as used in
#' \pkg{cointReg})
#' @param custom_var Can be a vector the same length as \code{dat} consisting of
#' variance-like numbers at each potential change point (so
#' each entry of the vector would be the "best estimate" of
#' the long-run variance if that location were where the
#' change point occured) or a function taking two parameters
#' \code{x} and \code{k} that can be used to generate this
#' vector, with \code{x} representing the data vector and
#' \code{k} the position of a potential change point; if
#' \code{NULL}, this argument is ignored
#' @param get_all_vals If \code{TRUE}, return all values for the statistic at
#' every tested point in the data set
#' @return If both \code{estimate} and \code{get_all_vals} are \code{FALSE}, the
#' value of the test statistic; otherwise, a list that contains the test
#' statistic and the other values requested (if both are \code{TRUE},
#' the test statistic is in the first position and the estimated changg
#' point in the second)
#' @references
#' \insertAllCited{}
#' @examples
#' CPAT:::stat_de(rnorm(1000))
#' CPAT:::stat_de(rnorm(1000), use_kernel_var = TRUE, bandwidth = "nw", kernel = "bo")
stat_de <- function(dat, a = log, b = log, estimate = FALSE,
use_kernel_var = FALSE, custom_var = NULL, kernel = "ba",
bandwidth = "and", get_all_vals = FALSE) {
# Formerly known as statDE()
n <- length(dat)
l <- function(x) {sqrt(2*log(x))}
u <- function(x) {2*log(x) + 1/2*log(log(x)) - 1/2*log(pi)}
res <- stat_Vn(dat, kn = function(n) {1}, tau = 1/2, estimate = TRUE,
use_kernel_var = use_kernel_var, kernel = kernel,
bandwidth = bandwidth, get_all_vals = get_all_vals,
custom_var = custom_var)
res[[2]] <- as.integer(res[[2]])
res[[1]] <- l(a(n)) + res[[1]] - u(b(n))
if (get_all_vals) {
res[[3]] <- l(a(n)) + res[[3]] - u(b(n))
}
if (!estimate & !get_all_vals) {
return(res[[1]])
} else {
return(res[c(TRUE, estimate, get_all_vals)])
}
}
#' Compute the Hidalgo-Seo Statistic
#'
#' This function computes the Hidalgo-Seo statistic for a change in mean model.
#'
#' For a data set \eqn{x_t} with \eqn{n} observations, the test statistic is
#'
#' \deqn{\max_{1 \leq s \leq n - 1} (\mathcal{LM}(s) - B_n)/A_n}
#'
#' where \eqn{\hat{u}_t = x_t - \bar{x}} (\eqn{\bar{x}} is the sample mean),
#' \eqn{a_n = (2 \log \log n)^{1/2}}, \eqn{b_n = a_n^2 - \frac{1}{2} \log \log
#' \log n - \log \Gamma (1/2)}, \eqn{A_n = b_n / a_n^2}, \eqn{B_n =
#' b_n^2/a_n^2}, \eqn{\hat{\Delta} = \hat{\sigma}^2 = n^{-1} \sum_{t = 1}^{n}
#' \hat{u}_t^2}, and \eqn{\mathcal{LM}(s) = n (n - s)^{-1} s^{-1}
#' \hat{\Delta}^{-1} \left( \sum_{t = 1}^{s} \hat{u}_t\right)^2}.
#'
#' If \code{corr} is \code{FALSE}, then the residuals are assumed to be
#' uncorrelated. Otherwise, the residuals are assumed to be correlated and
#' \eqn{\hat{\Delta} = \hat{\gamma}(0) + 2 \sum_{j = 1}^{\lfloor \sqrt{n}
#' \rfloor} (1 - \frac{j}{\sqrt{n}}) \hat{\gamma}(j)} with \eqn{\hat{\gamma}(j)
#' = \frac{1}{n}\sum_{t = 1}^{n - j} \hat{u}_t \hat{u}_{t + j}}.
#'
#' This statistic was presented in \insertCite{hidalgoseo13}{CPAT}.
#'
#' @param dat The data vector
#' @param estimate Set to \code{TRUE} to return the estimated location of the
#' change point
#' @param corr If \code{TRUE}, the long-run variance will be computed under the
#' assumption of correlated residuals; ignored if \code{custom_var}
#' is not \code{NULL} or \code{use_kernel_var} is \code{TRUE}
#' @param get_all_vals If \code{TRUE}, return all values for the statistic at
#' every tested point in the data set
#' @param custom_var Can be a vector the same length as \code{dat} consisting of
#' variance-like numbers at each potential change point (so
#' each entry of the vector would be the "best estimate" of
#' the long-run variance if that location were where the
#' change point occured) or a function taking two parameters
#' \code{x} and \code{k} that can be used to generate this
#' vector, with \code{x} representing the data vector and
#' \code{k} the position of a potential change point; if
#' \code{NULL}, this argument is ignored
#' @param use_kernel_var Set to \code{TRUE} to use kernel methods for long-run
#' variance estimation (typically used when the data is
#' believed to be correlated); if \code{FALSE}, then the
#' long-run variance is estimated using
#' \eqn{\hat{\sigma}^2_{T,t} = T^{-1}\left(
#' \sum_{s = 1}^t \left(X_s - \bar{X}_t\right)^2 +
#' \sum_{s = t + 1}^{T}\left(X_s -
#' \tilde{X}_{T - t}\right)^2\right)}, where
#' \eqn{\bar{X}_t = t^{-1}\sum_{s = 1}^t X_s} and
#' \eqn{\tilde{X}_{T - t} = (T - t)^{-1}
#' \sum_{s = t + 1}^{T} X_s}; if \code{custom_var} is not
#' \code{NULL}, this argument is ignored
#' @param kernel If character, the identifier of the kernel function as used in
#' \pkg{cointReg} (see \code{\link[cointReg]{getLongRunVar}}); if
#' function, the kernel function to be used for long-run variance
#' estimation (default is the Bartlett kernel in \pkg{cointReg})
#' @param bandwidth If character, the identifier for how to compute the
#' bandwidth as defined in \pkg{cointReg} (see
#' \code{\link[cointReg]{getBandwidth}}); if function, a function
#' to use for computing the bandwidth; if numeric, the bandwidth
#' value to use (the default is to use Andrews' method, as used in
#' \pkg{cointReg})
#' @return If both \code{estimate} and \code{get_all_vals} are \code{FALSE}, the
#' value of the test statistic; otherwise, a list that contains the test
#' statistic and the other values requested (if both are \code{TRUE},
#' the test statistic is in the first position and the estimated change
#' point in the second)
#' @references
#' \insertAllCited{}
#' @examples
#' CPAT:::stat_hs(rnorm(1000))
#' CPAT:::stat_hs(rnorm(1000), corr = FALSE)
stat_hs <- function(dat, estimate = FALSE, corr = TRUE, get_all_vals = FALSE,
custom_var = NULL, use_kernel_var = FALSE, kernel = "ba",
bandwidth = "and") {
# Formerly named statHS()
n <- length(dat)
mu <- mean(dat)
u <- dat - mu
if (corr) {
m <- sqrt(n)
Delta <- sum(u^2) / n + sum(sapply(1:m, function(j) {
2 * (1 - j/m) * sum(u[(j + 1):n] * u[1:(n - j)])/n
}))
} else {
# Delta will be the analog of lrv in stat_Zn, but this variable was already
# defined and the naming is in alignment with Hidalgo's paper
Delta <- ((n-1)/n) * var(u)
}
# To implement kernel LRV estimation that is consistent under H_A, I play
# around with custom_var; this will be the avenue through which that
# functionality is implemented.
if (is.null(custom_var)) {
if (use_kernel_var) {
Delta <- get_lrv_vec(dat, kernel, bandwidth)
} else {
Delta <- rep(Delta, length(dat))
}
} else {
if (is.function(custom_var)) {
# To allow greater flexibility, Delta will be made a vector so that Delta
# can take values at different possible change points; this allows for
# greater experimentation on our part
# This may seem silly, but this is so that error codes refer to
# custom_var, and we don't want recursion either
custom_var_temp <- custom_var
custom_var <- purrr::partial(custom_var_temp, x = dat, .lazy = FALSE)
custom_var_vec <- vapply(1:length(dat), custom_var,
FUN.VALUE = numeric(1))
} else if (is.numeric(custom_var)) {
if (length(custom_var) < length(dat)) stop("custom_var must have" %s%
"length at least" %s%
length(dat) %s0% ", the" %s%
"length of the data set")
custom_var_vec <- custom_var
} else {
stop("Don't know how to handle custom_var of class" %s% class(custom_var))
}
if (any(custom_var_vec < 0, na.rm = TRUE)) {
stop("custom_var suggests a negative variance, which is impossible.")
}
Delta <- custom_var_vec
}
la_mu <- sapply(1:(n - 1), function(s) {
n/(n - s) * 1/s * sum(u[1:s])^2/Delta[s]
})
a_n <- sqrt(2 * log(log(n)))
b_n <- 2 * log(log(n)) - log(log(log(n)))/2 - log(pi/4)/2
A_n <- b_n/a_n^2 # Matching notation in Hidalgo and Seo (2013)
B_n <- A_n * b_n
stat <- (max(la_mu) - B_n)/A_n
est <- which.max(la_mu)
res <- list("statistic" = stat,
"estimate" = est,
"stat_vals" = (la_mu - B_n)/A_n)
if (!estimate & !get_all_vals) {
return(res[[1]])
} else {
return(res[c(TRUE, estimate, get_all_vals)])
}
}
#' Univariate Andrews Test for End-of-Sample Structural Change
#'
#' This implements Andrews' test for end-of-sample change, as described by
#' \insertCite{andrews03;textual}{CPAT}. This test was derived for detecting a
#' change in univariate data. See \insertCite{andrews03}{CPAT} for
#' a description of the test.
#'
#' @param x Vector of the data to test
#' @param M Numeric index of the location of the first potential change point
#' @param pval If \code{TRUE}, return a p-value
#' @param stat If \code{TRUE}, return a test statistic
#' @return If both \code{pval} and \code{stat} are \code{TRUE}, a list
#' containing both; otherwise, a number for one or the other, depending
#' on which is \code{TRUE}
#' @references
#' \insertAllCited{}
#' @examples
#' CPAT:::andrews_test(rnorm(1000), M = 900)
andrews_test <- function(x, M, pval = TRUE, stat = TRUE) {
mu <- mean(x)
u <- x - mu
m <- length(x) - M # Deriving m and n as described in Andrews (2003)
n <- length(x) - m
Sigma <- Reduce("+", lapply(1:(n + 1), function(j)
{u[j:(j + m - 1)] %*% t(u[j:(j + m - 1)])})) /
(n + 1)
S <- (t(u[(n + 1):(n + m)]) %*% solve(Sigma) %*% u[(n + 1):(n + m)])[1,1]
submu <- sapply(1:(n - m + 1),
function(j) {
(sum(x[1:(j - 1)]) + sum(u[(j + ceiling(m/2)):n]))/
(n - ceiling(m/2))})
Sj <- sapply(1:(n - m + 1), function(j) {
uj <- x - submu[j]
(t(uj[j:(j + m - 1)]) %*% solve(Sigma) %*%
uj[j:(j + m - 1)])[1,1]
})
res <- list("pval" = mean(S <= Sj), "stat" = S)[c(pval, stat)]
if (length(res) == 1) {
return(res[[1]])
} else {
return(res)
}
}
#' Compute the Rényi-Type Statistic
#'
#' This function computes the Rényi-type statistic.
#'
#' The definition of the statistic is
#'
#' \deqn{\max_{t_T \leq t \leq T - t_T} \hat{\sigma}_{t,T}^{-1}
#' \left|t^{-1}\sum_{s = 1}^{t}X_s - (T - t)^{-1}\sum_{s = t + 1}^{T}
#' X_s \right|}
#'
#' The parameter \code{kn} corresponds to the trimming parameter \eqn{t_T}.
#'
#' @param dat The data vector
#' @param kn A function corresponding to the trimming parameter \eqn{t_T}; by
#' default, the square root function
#' @param estimate Set to \code{TRUE} to return the estimated location of the
#' change point
#' @param use_kernel_var Set to \code{TRUE} to use kernel methods for long-run
#' variance estimation (typically used when the data is
#' believed to be correlated); if \code{FALSE}, then the
#' long-run variance is estimated using
#' \eqn{\hat{\sigma}^2_{T,t} = T^{-1}\left(
#' \sum_{s = 1}^t \left(X_s - \bar{X}_t\right)^2 +
#' \sum_{s = t + 1}^{T}\left(X_s -
#' \tilde{X}_{T - t}\right)^2\right)}, where
#' \eqn{\bar{X}_t = t^{-1}\sum_{s = 1}^t X_s} and
#' \eqn{\tilde{X}_{T - t} = (T - t)^{-1}
#' \sum_{s = t + 1}^{T} X_s}; if \code{custom_var} is not
#' \code{NULL}, this argument is ignored
#' @param custom_var Can be a vector the same length as \code{dat} consisting of
#' variance-like numbers at each potential change point (so
#' each entry of the vector would be the "best estimate" of
#' the long-run variance if that location were where the
#' change point occured) or a function taking two parameters
#' \code{x} and \code{k} that can be used to generate this
#' vector, with \code{x} representing the data vector and
#' \code{k} the position of a potential change point; if
#' \code{NULL}, this argument is ignored
#' @param kernel If character, the identifier of the kernel function as used in
#' \pkg{cointReg} (see \code{\link[cointReg]{getLongRunVar}}); if
#' function, the kernel function to be used for long-run variance
#' estimation (default is the Bartlett kernel in \pkg{cointReg})
#' @param bandwidth If character, the identifier for how to compute the
#' bandwidth as defined in \pkg{cointReg} (see
#' \code{\link[cointReg]{getBandwidth}}); if function, a function
#' to use for computing the bandwidth; if numeric, the bandwidth
#' value to use (the default is to use Andrews' method, as used in
#' \pkg{cointReg})
#' @param get_all_vals If \code{TRUE}, return all values for the statistic at
#' every tested point in the data set
#' @return If both \code{estimate} and \code{get_all_vals} are \code{FALSE}, the
#' value of the test statistic; otherwise, a list that contains the test
#' statistic and the other values requested (if both are \code{TRUE},
#' the test statistic is in the first position and the estimated change
#' point in the second)
#' @examples
#' CPAT:::stat_Zn(rnorm(1000))
#' CPAT:::stat_Zn(rnorm(1000), kn = function(n) {floor(log(n))})
#' CPAT:::stat_Zn(rnorm(1000), use_kernel_var = TRUE, bandwidth = "nw",
#' kernel = "bo")
stat_Zn <- function(dat, kn = function(n) {floor(sqrt(n))}, estimate = FALSE,
use_kernel_var = FALSE, custom_var = NULL, kernel = "ba",
bandwidth = "and", get_all_vals = FALSE) {
# Formerly known as statZn()
if (use_kernel_var) {
lrv <- get_lrv_vec(dat, kernel, bandwidth)
} else if (!is.null(custom_var)) {
use_kernel_var <- TRUE # Otherwise stat_Zn_cpp() will ignore lrv
if (is.function(custom_var)) {
# This may seem silly, but this is so that error codes refer to
# custom_var, and we don't want recursion either
custom_var_temp <- custom_var
custom_var <- purrr::partial(custom_var_temp, x = dat, .lazy = FALSE)
custom_var_vec <- vapply(1:length(dat), custom_var,
FUN.VALUE = numeric(1))
} else if (is.numeric(custom_var)) {
if (length(custom_var) < length(dat)) stop("custom_var must have" %s%
"length at least" %s%
length(dat) %s0% ", the" %s%
"length of the data set")
custom_var_vec <- custom_var
} else {
stop("Don't know how to handle custom_var of class" %s% class(custom_var))
}
if (any(custom_var_vec < 0)) stop("custom_var suggests a negative" %s%
"variance, which is impossible.")
lrv <- custom_var_vec
} else {
# A vector must be passed to stat_Zn_cpp, so we'll pass one with an
# impossible value
lrv <- c(-1)
}
res <- stat_Zn_cpp(dat, kn(length(dat)), use_kernel_var, lrv, get_all_vals)
res[[2]] <- as.integer(res[[2]])
if (!estimate & !get_all_vals) {
return(res[[1]])
} else {
return(res[c(TRUE, estimate, get_all_vals)])
}
# Here is equivalen (slow) R code
#n = length(dat)
#return(sqrt(kn(n))*max(sapply(
# floor(kn(n)):(n - floor(kn(n))), function(k)
# abs(1/k*sum(dat[1:k]) -
# 1/(n-k)*sum(dat[(k+1):n]))/sqrt((sum((dat[1:k] -
# mean(dat[1:k]))^2)+sum((dat[(k+1):n] -
# mean(dat[(k+1):n]))^2))/n))))
}
#' Multivariate Andrews' Test for End-of-Sample Structural Change
#'
#' This implements Andrews' test for end-of-sample change, as described by
#' \insertCite{andrews03;textual}{CPAT}. This test was derived for detecting a
#' change in multivarate data, aso originally described. See
#' \insertCite{andrews03}{CPAT} for a description of the test.
#'
#' @param formula The regression formula, which will be passed to
#' \code{\link[stats]{lm}}
#' @param data \code{data.frame} containing the data
#' @inheritParams andrews_test
#' @return If both \code{pval} and \code{stat} are \code{TRUE}, a list
#' containing both; otherwise, a number for one or the other, depending
#' on which is \code{TRUE}
#' @references
#' \insertAllCited{}
#' @examples
#' x <- rnorm(1000)
#' y <- 1 + 2 * x + rnorm(1000)
#' df <- data.frame(x, y)
#' CPAT:::andrews_test_reg(y ~ x, data = df, M = 900)
andrews_test_reg <- function(formula, data, M, pval = TRUE, stat = TRUE) {
if (!methods::is(formula, "formula")) stop("Bad formula passed to" %s%
"argument \"formula\"")
fit <- lm(formula = formula, data = data)
beta <- coefficients(fit)
d <- length(beta)
X <- model.matrix(fit)
u <- residuals(fit)
y <- fit$model[[1]]
m <- nrow(X) - M # Deriving m and n as described in Andrews (2003)
n <- nrow(X) - m
Sigma <- Reduce("+", lapply(1:(n + 1), function(j)
{u[j:(j + m - 1)] %*% t(u[j:(j + m - 1)])})) /
(n + 1)
if (d <= m) {
V <- t(X[(n + 1):(n + m),]) %*% solve(Sigma) %*% X[(n + 1):(n + m),]
A <- t(X[(n + 1):(n + m),]) %*% solve(Sigma) %*% u[(n + 1):(n + m)]
S <- (t(A) %*% solve(V) %*% A)[1,1]
} else {
S <- (t(u[(n + 1):(n + m)]) %*% solve(Sigma) %*% u[(n + 1):(n + m)])[1,1]
}
subbeta <- lapply(1:(n - m + 1),
function(j) {
coefficients(lm(formula = formula,
data = data[c(1:(j - 1),
(j + ceiling(m/2)):n),]))
})
Sj <- sapply(1:(n - m + 1), function(j) {
yj <- y[j:(j + m - 1)]
Xj <- X[j:(j + m - 1),]
uj <- yj - Xj %*% subbeta[[j]]
Vj <- t(Xj) %*%
solve(Sigma) %*% Xj
Aj <- t(Xj) %*%
solve(Sigma) %*% uj
(t(Aj) %*% solve(Vj) %*% A)[1,1] # Sj
})
res <- list("pval" = mean(S <= Sj), "stat" = S)[c(pval, stat)]
if (length(res) == 1) {
return(res[[1]])
} else {
return(res)
}
}
################################################################################
# STATISTICAL TEST INTERFACES
################################################################################
#' CUSUM Test
#'
#' Performs the (univariate) CUSUM test for change in mean, as described in
#' \insertCite{horvathricemiller19}{CPAT}. This is effectively an interface to
#' \code{\link{stat_Vn}}; see its documentation for more details. p-values are
#' computed using \code{\link{pkolmogorov}}, which represents the limiting
#' distribution of the statistic under the null hypothesis.
#'
#' @param x Data to test for change in mean
#' @param stat_plot Whether to create a plot of the values of the statistic at
#' all potential change points
#' @inheritParams stat_Vn
#' @return A \code{htest}-class object containing the results of the test
#' @references
#' \insertAllCited{}
#' @examples
#' CUSUM.test(rnorm(1000))
#' CUSUM.test(rnorm(1000), use_kernel_var = TRUE, kernel = "bo",
#' bandwidth = "nw")
#' @export
CUSUM.test <- function(x, use_kernel_var = FALSE, stat_plot = FALSE,
kernel = "ba", bandwidth = "and") {
testobj <- list()
testobj$method <- "CUSUM Test for Change in Mean"
testobj$data.name <- deparse(substitute(x))
res <- stat_Vn(x,
estimate = TRUE,
use_kernel_var = use_kernel_var,
kernel = kernel,
bandwidth = bandwidth,
get_all_vals = stat_plot)
stat <- res[[1]]
est <- res[[2]]
if (stat_plot) {
plot.ts(res[[3]], main = "Value of Test Statistic", ylab = "Statistic")
}
attr(stat, "names") <- "A"
attr(est, "names") <- "t*"
testobj$p.value <- 1 - pkolmogorov(res[[1]])
testobj$estimate <- est
testobj$statistic <- stat
class(testobj) <- "htest"
testobj
}
#' Darling-Erdös Test
#'
#' Performs the (univariate) Darling-Erdös test for change in mean, as described
#' in \insertCite{horvathricemiller19}{CPAT}. This is effectively an interface
#' to \code{\link{stat_de}}; see its documentation for more details. p-values
#' are computed using \code{\link{pdarling_erdos}}, which represents the
#' limiting distribution of the test statistic under the null hypothesis when
#' \code{a} and \code{b} are chosen appropriately. (Change those parameters at
#' your own risk!)
#'
#' @param x Data to test for change in mean
#' @param stat_plot Whether to create a plot of the values of the statistic at
#' all potential change points
#' @inheritParams stat_de
#' @return A \code{htest}-class object containing the results of the test
#' @references
#' \insertAllCited{}
#' @examples
#' DE.test(rnorm(1000))
#' DE.test(rnorm(1000), use_kernel_var = TRUE, kernel = "bo", bandwidth = "nw")
#' @export
DE.test <- function(x, a = log, b = log, use_kernel_var = FALSE,
stat_plot = FALSE, kernel = "ba", bandwidth = "and") {
l <- function(x) {sqrt(2*log(x))}
u <- function(x) {2*log(x) + 1/2*log(log(x)) - 1/2*log(pi)}
testobj <- list()
testobj$method <- "Darling-Erdos Test for Change in Mean"
testobj$data.name <- deparse(substitute(x))
params <- c(l(a(length(x))), u(b(length(x))))
names(params) <- c("a(" %s0% deparse(substitute(a)) %s0% "(T))", "b(" %s0%
deparse(substitute(b)) %s0% "(T))")
testobj$parameter <- params
res <- stat_de(x,
estimate = TRUE,
a = a,
b = b,
use_kernel_var = use_kernel_var,
kernel = kernel,
bandwidth = bandwidth,
get_all_vals = stat_plot)
stat <- res[[1]]
est <- res[[2]]
if (stat_plot) {
plot.ts(res[[3]], main = "Value of Test Statistic", ylab = "Statistic")
}
attr(stat, "names") <- "A"
attr(est, "names") <- "t*"
testobj$p.value <- 1 - pdarling_erdos(res[[1]])
testobj$estimate <- est
testobj$statistic <- stat
class(testobj) <- "htest"
testobj
}
#' Rényi-Type Test
#'
#' Performs the (univariate) Rényi-type test for change in mean, as described in
#' \insertCite{horvathricemiller19}{CPAT}. This is effectively an interface to
#' \code{\link{stat_Zn}}; see its documentation for more details. p-values are
#' computed using \code{\link{pZn}}, which represents the limiting distribution
#' of the test statistic under the null hypothesis, which represents the
#' limiting distribution of the test statistic under the null hypothesis when
#' \code{kn} represents a sequence \eqn{t_T} satisfying \eqn{t_T \to \infty}
#' and \eqn{t_T/T \to 0} as \eqn{T \to \infty}. (\code{\link[base]{log}} and
#' \code{\link[base]{sqrt}} should be good choices.)
#'
#' @param x Data to test for change in mean
#' @param stat_plot Whether to create a plot of the values of the statistic at
#' all potential change points
#' @inheritParams stat_Zn
#' @return A \code{htest}-class object containing the results of the test
#' @references
#' \insertAllCited{}
#' @examples
#' HR.test(rnorm(1000))
#' HR.test(rnorm(1000), use_kernel_var = TRUE, kernel = "bo", bandwidth = "nw")
#' @export
HR.test <- function(x, kn = log, use_kernel_var = FALSE, stat_plot = FALSE,
kernel = "ba", bandwidth = "and") {
testobj <- list()
testobj$method <- "Horvath-Rice Test for Change in Mean"
testobj$data.name <- deparse(substitute(x))
res <- stat_Zn(x,
kn = kn,
estimate = TRUE,
use_kernel_var = use_kernel_var,
kernel = kernel,
bandwidth = bandwidth,
get_all_vals = stat_plot)
stat <- res[[1]]
est <- res[[2]]
if (stat_plot) {
series <- ts(res[[3]], start = ceiling(kn(length(x))))
plot.ts(res[[3]], main = "Value of Test Statistic", ylab = "Statistic")
}
kn_val <- kn(length(x))
attr(kn_val, "names") <- deparse(substitute(kn)) %s0% "(T)"
attr(stat, "names") <- "D"
attr(est, "names") <- "t*"
testobj$parameters <- kn_val
testobj$p.value <- 1 - pZn(res[[1]])
testobj$estimate <- est
testobj$statistic <- stat
class(testobj) <- "htest"
testobj
}
#' Hidalgo-Seo Test
#'
#' Performs the (univariate) Hidalgo-Seo test for change in mean, as described
#' in \insertCite{horvathricemiller19}{CPAT}. This is effectively an interface
#' to \code{\link{stat_hs}}; see its documentation for more details. p-values
#' are computed using \code{\link{phidalgo_seo}}, which represents the limiting
#' distribution of the test statistic when the null hypothesis is true.
#'
#' @param x Data to test for change in mean
#' @param stat_plot Whether to create a plot of the values of the statistic at
#' all potential change points
#' @inheritParams stat_hs
#' @return A \code{htest}-class object containing the results of the test
#' @references
#' \insertAllCited{}
#' @examples
#' HS.test(rnorm(1000))
#' HS.test(rnorm(1000), corr = FALSE)
#' @export
HS.test <- function(x, corr = TRUE, stat_plot = FALSE) {
testobj <- list()
testobj$method <- "Hidalgo-Seo Test for Change in Mean"
testobj$data.name <- deparse(substitute(x))
params <- c(corr)
names(params) <- c("Correlated Residuals")
testobj$parameter <- params
res <- stat_hs(x, estimate = TRUE, corr = corr, get_all_vals = stat_plot)
stat <- res[[1]]
est <- res[[2]]
if (stat_plot) {
plot.ts(res[[3]], main = "Value of Test Statistic", ylab = "Statistic")
}
attr(stat, "names") <- "A"
attr(est, "names") <- "t*"
testobj$p.value <- 1 - phidalgo_seo(res[[1]])
testobj$estimate <- est
testobj$statistic <- stat
class(testobj) <- "htest"
testobj
}
#' Andrews' Test for End-of-Sample Structural Change
#'
#' Performs Andrews' test for end-of-sample structural change, as described in
#' \insertCite{andrews03}{CPAT}. This function works for both univariate and
#' multivariate data depending on the nature of \code{x} and whether
#' \code{formula} is specified. This function is thus an interface to
#' \code{\link{andrews_test}} and \code{\link{andrews_test_reg}}; see the
#' documentation of those functions for more details.
#'
#' @param x Data to test for change in mean (either a vector or
#' \code{data.frame})
#' @inheritParams andrews_test_reg
#' @return A \code{htest}-class object containing the results of the test
#' @references
#' \insertAllCited{}
#' @examples
#' Andrews.test(rnorm(1000), M = 900)
#' x <- rnorm(1000)
#' y <- 1 + 2 * x + rnorm(1000)
#' df <- data.frame(x, y)
#' Andrews.test(df, y ~ x, M = 900)
#' @export
Andrews.test <- function(x, M, formula = NULL) {
testobj <- list()
testobj$method <- "Andrews' Test for Structural Change"
testobj$data.name <- deparse(substitute(x))
if (is.numeric(x)) {
mchange <- length(x) - M
res <- andrews_test(x, M, pval = TRUE, stat = TRUE)
} else if (is.data.frame(x)) {
mchange <- nrow(x) - M
res <- andrews_test_reg(formula, x, M, pval = TRUE, stat = TRUE)
} else {
stop("x must be vector-like or a data frame")
}
stat <- res[["stat"]]
attr(mchange, "names") <- "m"
attr(stat, "names") <- "S"
testobj$p.value <- res[["pval"]]
testobj$parameters <- mchange
testobj$statistic <- stat
class(testobj) <- "htest"
testobj
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.