################################################################################
# SimulationUtils.R
################################################################################
# 2018-08-27
# Curtis Miller
################################################################################
# Utilities for performing and managing statistical tests and simulations used
# in our paper (Horváth et. al. 2019).
################################################################################
################################################################################
# POWER SIMULATION OBJECT AND FILE MANAGEMENT
################################################################################
#' Convert Rényi-Type Statistic Power Simulation Save List to Data Frame
#'
#' This function will convert the power simulation data generated in a list in
#' our simulation scripts to a \code{data.frame}. Given such a list and a
#' critical value to determine whether the null hypothesis should be rejected,
#' the function will return a \code{data.frame} with columns \code{power},
#' \code{stat}, \code{dist}, \code{kn}, \code{n}, \code{cpt}, and \code{delta},
#' which correspond to: the empirical power of the statistic; the identifier of
#' the statistic; the generating distribution of the statistic was computed on;
#' the \code{kn} parameter; the identifier of how change points were computed;
#' and the size of the change.
#'
#' @param obj A list containing simulated statistic values
#' @param crit The critical value determining whether a statistic should lead to
#' the rejection of the null hypothesis
#' @return A \code{data.frame} summarizing the results of the data stored in
#' \code{obj}
#' @examples
#' saveobj <- list("norm" = list(
#' "log" = list(
#' "n50" = list(
#' "c4rt" = list(
#' "d_0" = c(1.551, 1.276, 1.348, 1.982, 1.423)
#' )))))
#' CPAT:::power_sim_Zn_to_df(saveobj, CPAT:::qZn(.95))
power_sim_Zn_to_df <- function(obj, crit) {
# Formerly known as powerSim_Zn_to_df()
return_obj <- list(power = c(), stat = c(), dist = c(),
kn = c(), n = c(), cpt = c(),
delta = c())
for (dist in names(obj)) {
for (kn in names(obj[[dist]])) {
for (n in names(obj[[dist]][[kn]])) {
for (cpt in names(obj[[dist]][[kn]][[n]])) {
newprops <- sapply(names(
obj[[dist]][[kn]][[n]][[cpt]]),
function(delta) {
return(mean(
obj[[dist]][[kn]][[n]][[cpt]][[
delta]] >= crit))})
return_obj$power <- c(return_obj$power,
newprops)
return_obj$dist <- c(return_obj$dist,
rep(dist,
times = length(newprops)))
return_obj$kn <- c(return_obj$kn,
rep(kn,times =
length(newprops)))
return_obj$n <- c(return_obj$n,
rep(n, times = length(newprops)))
return_obj$cpt <- c(return_obj$cpt,
rep(cpt,
times = length(newprops)))
return_obj$delta <- c(return_obj$delta,
names(
obj[[dist]][[kn]][[n]][[
cpt]]))
}
}
}
}
return_obj$stat <- rep("Zn",
times = length(return_obj$power))
as.data.frame(return_obj)
}
#' Convert CUSUM-Type Statistic Power Simulation Save List to Data Frame
#'
#' This function will convert the power simulation data generated in a list in
#' our simulation scripts to a \code{data.frame}. Given such a list and a
#' critical value to determine whether the null hypothesis should be rejected,
#' the function will return a \code{data.frame} with columns \code{power},
#' \code{stat}, \code{dist}, \code{n}, \code{cpt}, and \code{delta},
#' which correspond to: the empirical power of the statistic; the identifier of
#' the statistic; the generating distribution of the statistic was computed on;
#' the identifier of how change points were computed; and the size of the
#' change.
#'
#' @inheritParams power_sim_Zn_to_df
#' @return A \code{data.frame} summarizing the results of the data stored in
#' \code{obj}
#' @examples
#' saveobj <- list("norm" = list(
#' "n50" = list(
#' "c4rt" = list(
#' "d_0" = c(1.551, 1.276, 1.348, 1.982, 1.423)
#' ))))
#' CPAT:::power_sim_Vn_to_df(saveobj, CPAT:::qkolmogorov(.95))
power_sim_Vn_to_df <- function(obj, crit) {
# Formerly known as powerSim_Vn_to_df()
return_obj <- list(power = c(), stat = c(), dist = c(),
n = c(), cpt = c(), delta = c())
for (dist in names(obj)) {
for (n in names(obj[[dist]])) {
for (cpt in names(obj[[dist]][[n]])) {
newprops <- sapply(names(obj[[dist]][[n]][[cpt]]),
function(delta) {return(
mean(obj[[dist]][[n]][[cpt]][[
delta]] >= crit))})
return_obj$power <- c(return_obj$power, newprops)
return_obj$dist <- c(return_obj$dist,
rep(dist,
times = length(newprops)))
return_obj$n <- c(return_obj$n,
rep(n, times = length(newprops)))
return_obj$cpt <- c(return_obj$cpt,
rep(cpt,
times = length(newprops)))
return_obj$delta <- c(return_obj$delta,
names(obj[[dist]][[n]][[cpt]]))
}
}
}
return_obj$stat <- rep("Vn",
times = length(return_obj$power))
as.data.frame(return_obj)
}
#' Create Power Simulation Results Data Frame
#'
#' Creates a \code{data.frame} that contains power simulation results from files
#' containing power simulations. This function should automate the use of
#' \code{\link{power_sim_Zn_to_df}} and \code{\link{power_sim_Vn_to_df}} for
#' collecting power simulation data. It takes two \code{CSV} files, one passed
#' (as a character string) to \code{file_meta} and the other to
#' \code{stat_meta}, describing how the files (named and described in
#' \code{file_meta}) should be handled.
#'
#' @param file_meta The location of a \code{CSV} file that contains file names
#' and the statistics that those files correspond to
#' @param stat_meta The location of a \code{CSV} file that contains statistic
#' (\code{stat}) labels (used in \code{file_meta}). the name
#' of the variable for the statistic, and the name of the
#' function that converts a file (mentioned in
#' \code{file_meta}) to a \code{data.frame} of power data
#' @param prefix Character string representing a prefix for file names mentioned
#' in \code{file_meta}; could be used for adding path information
#' to those names, in case the files are not in the working
#' directory and there is no desire to edit \code{file_meta}'s
#' data
#' @param alpha Numeric for level of significance used in power calculations
#' @return A data frame containing the power simulation data
#' @import utils
#' @examples
#' \dontrun{
#' power_sim_stat_df_creator("FileStatMeta.csv", "StatMeta.csv")
#' }
power_sim_stat_df_creator <- function(file_meta, stat_meta, prefix = "",
alpha = 0.05) {
stat_meta <- read.csv(stat_meta, stringsAsFactors = FALSE)
file_meta <- read.csv(file_meta, stringsAsFactors = FALSE)
file_meta$file <- paste0(prefix, file_meta$file)
stats <- unique(file_meta$statistic)
perc_Zn_theo <- qZn(1 - alpha)
perc_Vn_theo <- qkolmogorov(1 - alpha)
perc_de_theo <- qdarling_erdos(1 - alpha)
perc_hs_theo <- qhidalgo_seo(1 - alpha)
Zn_df <- bind_power_sim_objs(with(file_meta, file[statistic == "Zn"]),
with(stat_meta, eval(as.name(
crit_value[statistic == "Zn"]))),
with(stat_meta, eval(as.name(
conv_func[statistic == "Zn"]))),
"Zn")
df <- Zn_df
kn <- unique(Zn_df$kn)
for (s in stats[which(stats != "Zn")]) {
temp <- bind_power_sim_objs(with(file_meta, file[statistic == s]),
with(stat_meta, eval(as.name(
crit_value[statistic == s]))),
with(stat_meta, eval(as.name(
conv_func[statistic == s]))),
s)
for (k in kn) {
temp$kn <- k
df <- rbind(df, temp)
}
}
df$delta <- as.character(df$delta)
df$delta <- as.numeric(substr(df$delta, 3, nchar(df$delta)))
df$n <- as.character(df$n)
df$n <- as.integer(substr(df$n, 2, nchar(df$n)))
df[with(df, order(stat, dist, kn, n, cpt, delta)),]
}
#' Power Result Data Frame Creation
#'
#' Creates a \code{data.frame} containing power simulation results. Effectively
#' a better, higher-level interface to \code{\link{power_sim_Zn_to_df}} and
#' \code{\link{power_sim_Vn_to_df}}.
#'
#' @param files A character vector of file names
#' @param crit_value The critical value against which to compare a test
#' statistic
#' @param conv_func The function responsible for converting a list containing
#' simulated statistic values under different conditions to a
#' \code{data.frame}
#' @param stat_name The label of the statistic
#' @return A \code{data.frame} containing power levels
#' @examples
#' \dontrun{
#' filenames <- c("powerSimulations_sdest_norm_DE.rda",
#' "powerSimulations_sdest_ar1_0.5_DE.rda")
#' bind_power_sim_objs(filenames, crit_value = qdarling_erdos(.95),
#' conv_func = power_sim_Vn_to_df, stat_name = "de")
#' }
bind_power_sim_objs <- function(files, crit_value, conv_func, stat_name) {
# Formerly known as bind_powerSim_objs()
load(files[1])
df <- conv_func(saveobj, crit_value)
df$stat <- stat_name
# print(head(df))
# cat("\n\n")
for (f in files[-1]) {
load(f)
temp <- conv_func(saveobj, crit_value)
temp$stat <- stat_name
# print(head(temp))
# cat("\n\n")
df <- rbind(df, temp)
# print(tail(df))
# cat("\n\n")
}
df
}
################################################################################
# CHANGING WINDOWN P-VALUE COMPUTATION
################################################################################
#' Expanding Window p-Values
#'
#' Gets p-values for the CUSUM, Darling-Erdös, Hidalgo-Seo, Andrews, and
#' Rényi-type tests when applied to an expanding window of data.
#'
#' @param dat The dataset for which to test for change in mean
#' @param m The location of the first potential change point for Andrews' test
#' @return A matrix containing p-values for an expanding sample size, with each
#' row corresponding to one observation larger; columns are labeled for
#' each statistic
#' @examples
#' if (require("foreach") & require("doParallel")) {
#' CPAT:::get_expanding_window_pvals(rnorm(1000), m = 900)
#' }
get_expanding_window_pvals <- function(dat, m = Inf) {
has_parallel <- requireNamespace("foreach", quietly = TRUE) &&
requireNamespace("doParallel", quietly = TRUE)
if (!has_parallel) {stop("The foreach and doParallel packages are needed" %s%
"to use this function")}
`%dopar%` <- foreach::`%dopar%`
foreach <- foreach::foreach
test_p_vals <- foreach(n = 3:length(dat), .combine = rbind) %dopar% {
cusum <- CUSUM.test(dat[1:n])
de <- DE.test(dat[1:n])
hr <- HR.test(dat[1:n])
hs <- HS.test(dat[1:n], corr = FALSE)
cusum_ker <- CUSUM.test(dat[1:n], use_kernel_var = TRUE)
de_ker <- DE.test(dat[1:n], use_kernel_var = TRUE)
hr_ker <- HR.test(dat[1:n], use_kernel_var = TRUE)
hs_ker <- HS.test(dat[1:n], corr = TRUE)
if (n > m) {
andrew <- Andrews.test(dat[1:n], M = m)$p.value[[1]]
} else {
andrew <- NA
}
c("n" = n,
"cusum" = cusum$p.value[[1]],
"de" = de$p.value[[1]],
"hr" = hr$p.value[[1]],
"hs" = hs$p.value[[1]],
"cusum_ker" = cusum_ker$p.value[[1]],
"de_ker" = de_ker$p.value[[1]],
"hr_ker" = hr_ker$p.value[[1]],
"hs_ker" = hs_ker$p.value[[1]],
"andrew" = andrew)
}
rownames(test_p_vals) <- NULL
test_p_vals
}
#' Expanding Window p-Values for Regression Models
#'
#' Gets p-values for the CUSUM, Darling-Erdös, Hidalgo-Seo, Andrews, and
#' Rényi-type tests when applied to an expanding window of data for a regression
#' model.
#'
#' @param data A \code{data.frame}, the dataset for which to test for
#' structural change
#' @param formula The regression model formula, which will be passed to
#' \code{\link[stats]{lm}}
#' @param min_n An integer; the minimum sample size
#' @param m The location of the first potential change point for Andrews' test
#' @param verbose If \code{TRUE}, send messages to output
#' @return A matrix containing p-values for an expanding sample size, with each
#' row corresponding to one observation larger; columns are labeled for
#' each statistic
#' @examples
#' x <- rnorm(1000)
#' y <- 1 + 2 * x + rnorm(1000)
#' df <- data.frame(x, y)
#' if (require("foreach") & require("doParallel")) {
#' CPAT:::get_expanding_window_pvals_reg(y ~ x, data = df, min_n = 4, m = 900)
#' }
get_expanding_window_pvals_reg <- function(formula, data, min_n = 3, m = Inf,
verbose = FALSE) {
has_parallel <- requireNamespace("foreach", quietly = TRUE) &&
requireNamespace("doParallel", quietly = TRUE)
if (!has_parallel) {stop("The foreach and doParallel packages are needed" %s%
"to use this function")}
`%dopar%` <- foreach::`%dopar%`
foreach <- foreach::foreach
if (requireNamespace("cointReg", quietly = TRUE)) {
pkgs <- c("stats", "cointReg")
} else {
pkgs <- c("stats")
}
test_p_vals <- foreach(n = min_n:nrow(data), .combine = rbind,
.packages = pkgs) %dopar% {
if (verbose) {
cat("Working on n = ", n, "out of ", nrow(data), "\n")
}
vec <- residuals(lm(formula = formula, data = data))
cusum <- CUSUM.test(vec[1:n])
de <- DE.test(vec[1:n])
hr <- HR.test(vec[1:n])
hs <- HS.test(vec[1:n], corr = FALSE)
cusum_ker <- CUSUM.test(vec[1:n], use_kernel_var = TRUE)
de_ker <- DE.test(vec[1:n], use_kernel_var = TRUE)
hr_ker <- HR.test(vec[1:n], use_kernel_var = TRUE)
hs_ker <- HS.test(vec[1:n], corr = TRUE)
if (n > m) {
andrew <- tryCatch(Andrews.test(x = data[1:n,], formula = formula,
M = m)$p.value[[1]],
error = function(e) {return(NA)})
} else {
andrew <- NA
}
c("n" = n,
"cusum" = cusum$p.value[[1]],
"de" = de$p.value[[1]],
"hr" = hr$p.value[[1]],
"hs" = hs$p.value[[1]],
"cusum_ker" = cusum_ker$p.value[[1]],
"de_ker" = de_ker$p.value[[1]],
"hr_ker" = hr_ker$p.value[[1]],
"hs_ker" = hs_ker$p.value[[1]],
"andrew" = andrew)
}
rownames(test_p_vals) <- rownames(data[test_p_vals[, "n"],])
test_p_vals
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.