### Title: Create the test-statistic under strong identification by
### thresholding the residuals to create the "dual" variables
###
### Description: Under the null hypothesis that beta_D,J and beta_X,J is a
### specified vector, we can create a test statistic under the assumption of
### homoskedasticity or heteroskedasticity. If |J| + |K| = 1, then we have a
### test statistic that is a linear form. If |J| + |K| > 1, then we have a test
### statistic that is a quadratic form. The test statistic, in a nutshell,
### evaluates the dual feasibility constraint constructed with objects that are
### the result of a short IQR problem where we concentrate out D_J and X_K from
### Y with the null values of beta_D,J and beta_X,J.
### Note: unlike `R/test_stat.R`, we define the \hat{a} vector (i.e., the dual
### variables used to define the \hat{b} in the numerator of the
### test-statistic) by thresholding the residuals from the short-IQR.
###
### Author: Omkar A. Katta
### test_stat_threshold -------------------------
#' Compute the test statistic and p-value under strong identification by
#' thresholding the residuals
#'
#' Under user-specified null hypotheses (multivariate or univariate) on the
#' endogeneous and exogeneous variables' coefficients, compute the test
#' statistic and p-value associated with a specified \code{alpha}-level under
#' homoskedasticity or heteroskedasticity. (Note: we threshold the residuals to
#' create the \eqn{\hat{b}} in the numerator)
#'
#' For example, if \code{beta_D_null = c(NA, 0, NA)} and \code{beta_X_null =
#' c(1, NA)}, then \eqn{\beta_{D,2} = 0} and \eqn{\beta_{X,1} = 1} is the null
#' hypothesis.
#' Note that in this example, p_D is 3 and p_X is 2, which corresponds to the
#' lengths of the coefficient vectors.
#'
#' Based on \code{beta_D_null}, we find the vector J, which contains the
#' indices of the coefficients specified under the null.
#' We can then define D_J and Phi_J as the columns of D and Phi whose indices
#' are in J. Similarly, we can define D_J_minus and Phi_J_minus as the columns
#' of D and Phi whose indices are not in J. If J is the empty set, then D_J
#' and Phi_J should be thought of as being empty matrices without any
#' dimensions. If J specified all indices from 1 to p_D, then D_J_minus and
#' Phi_J_minus should be thought of as being empty. These conventions apply to
#' the vector K, which contains the indices of the coefficients on the
#' endogeneous variable specified uner the null.
#'
#' If the test is univariate (i.e., |J| + |K| = 1), then the test statistic
#' has a linear form. Otherwise, the test statistic has a quadratic form.
#'
#' @param beta_D_null Vector of coefficients on the endogeneous variable under
#' the null hypothesis; if a coefficient is not specified under the null, let
#' the corresponding entry be NA (vector of length p_D with |J| non-zero
#' entries)
#' @param beta_X_null Vector of coefficients on the covariates under
#' the null hypothesis; if a coefficient is not specified under the null, let
#' the corresponding entry be NA (vector of length p_X with |K| non-zero
#' entries)
#' @param alpha Alpha level of the test; defaults to 0.1; only used when
#' \code{homoskedasticity} is FALSE (numeric between 0 and 1)
#' @param Y Dependent variable (vector of length n)
#' @param X Exogenous variable (including constant vector) (n by p_X matrix)
#' @param D Endogenous variable (n by p_D matrix)
#' @param Z Instrumental variable (n by p_Z matrix)
#' @param Phi Transformation of X and Z to be used in the program;
#' defaults to the linear projection of D on X and Z (matrix with n rows)
#' @param tau Quantile (number between 0 and 1)
#' @param B Matrix that enters numerator of test statistic
#' (n by |J| + |K| matrix); If NULL (default), this matrix is (Phi_J, X_K),
#' where Phi_J and X_K are the columns of Phi and X with indices J and K;
#' @param orthogonalize_statistic If TRUE, \eqn{\tilde{B}} will be used in
#' numerator of test statistic; defaults to FALSE; for advanced users only
#' @param homoskedasticity If TRUE, assume density of error at 0 is constant;
#' defaults to FALSE (boolean)
#' @param kernel Only active if \code{homoskedasticity} is FALSE; either
#' "Powell" (default) to use the Powell estimator or
#' "Gaussian" to use a Gaussian kernel; only used when
#' \code{homoskedasticity} is FALSE
#' @param a_hat Vector (n by 1) of dual variables; if NULL (default), use
#' dual-variables from short-iqr regression
#' @param residuals Residuals from IQR MILP program; if NULL (default), use
#' residuals from short-iqr regression
#' @param show_progress If TRUE (default), sends progress messages during
#' execution (boolean); also passed to \code{preprocess_iqr_milp}
#' @param print_results If TRUE (default), print the test-statistic, p-value,
#' and alpha level (boolean)
#' @param ... Arguments passed to \code{preprocess_iqr_milp}
test_stat_threshold <- function(beta_D_null,
beta_X_null,
alpha = 0.1,
Y,
X,
D,
Z,
Phi = linear_projection(D, X, Z),
tau,
B = NULL,
orthogonalize_statistic = FALSE,
homoskedasticity = FALSE,
a_hat = NULL,
residuals = NULL,
kernel = "Powell",
show_progress = TRUE,
print_results = TRUE,
# FUN = preprocess_iqr_milp,
...) {
# Start clock
clock_start <- Sys.time()
msg <- paste("Clock started:", clock_start)
send_note_if(msg, show_progress, message)
out <- list() # Initialize list of results to return
# Check arguments
# if (!identical(FUN, preprocess_iqr_milp) & !identical(FUN, iqr_milp)) {
# stop("`FUN` should either be `preprocess_iqr_milp` or `iqr_milp`.")
# }
kernel <- tolower(kernel)
if (kernel != "powell" & kernel != "gaussian" & !homoskedasticity) {
stop(paste0("`kernel` should either be 'Powell' or 'Gaussian', not ",
kernel))
}
# Get dimensions of data
n <- length(Y)
p_D <- ncol(D)
p_X <- ncol(X)
# Check dimensions of vectors under null hypothesis
stopifnot(length(beta_D_null) == p_D)
stopifnot(length(beta_X_null) == p_X)
stopifnot(alpha < 1 & alpha > 0)
out$beta_D_null <- beta_D_null
out$beta_X_null <- beta_X_null
out$alpha <- alpha
send_note_if("Obtained data", show_progress, message)
# Get indices as a boolean vector: TRUE => specified under null, FALSE => o/w
J <- !is.na(beta_D_null)
K <- !is.na(beta_X_null)
cardinality_J <- sum(J)
cardinality_K <- sum(K)
beta_D_null_nontrivial <- beta_D_null[J]
beta_X_null_nontrivial <- beta_X_null[K]
if (cardinality_J + cardinality_K == 0) {
stop("Invalid null: beta_D_null and beta_X_null cannot be all NA values.")
}
# Construct matrices
D_J <- D[, J, drop = FALSE]
D_J_minus <- D[, !J, drop = FALSE]
X_K <- X[, K, drop = FALSE]
X_K_minus <- X[, !K, drop = FALSE]
# Z_J <- Z[, J, drop = FALSE] # not used
Z_J_minus <- Z[, !J, drop = FALSE]
Phi_J <- Phi[, J, drop = FALSE]
Phi_J_minus <- Phi[, !J, drop = FALSE]
send_note_if("Constructed basic matrices", show_progress, message)
# Concentrate out D_J and X_K
tmp <- Y
if (cardinality_J > 0) {
tmp <- tmp - D_J %*% beta_D_null_nontrivial
}
if (cardinality_K > 0) {
tmp <- tmp - X_K %*% beta_X_null_nontrivial
}
Y_tilde <- tmp
send_note_if("Concentrated out Y", show_progress, message)
# Obtain \hat{a} and residuals via short-iqr regression
if (is.null(a_hat) | is.null(residuals)) {
if (ncol(D_J_minus) == 0) {
# If there are no endogeneous variables, return quantile regression results:
msg <- paste("p_D is 0 -- running QR instead of IQR MILP...")
send_note_if(msg, TRUE, warning)
qr <- quantreg::rq(Y_tilde ~ X_K_minus - 1, tau = tau)
short_iqr <- qr
FUN <- "qr"
} else {
short_iqr <- preprocess_iqr_milp(
Y = Y_tilde,
X = X_K_minus,
D = D_J_minus,
Z = Z_J_minus, # not really important since we specify Phi
Phi = Phi_J_minus,
tau = tau,
show_progress = show_progress,
...
)
FUN <- "preprocess_iqr_milp"
}
send_note_if("Computed short-IQR solution", show_progress, message)
if (FUN == "preprocess_iqr_milp") {
short_iqr_result <- short_iqr$final_fit
} else if (FUN == "qr") {
short_iqr_result <- short_iqr
}
out$short_iqr <- short_iqr_result
}
# residuals for estimating variance matrix aka Psi
# short-iqr residuals are used to construct a_hat (and consequently b_hat)
# `residuals` argument is used to construct Psi matrix (aka variance of residuals)
# If the `residuals` argument is not provided, we use the short-iqr residuals.
if (is.null(residuals)) {
msg <- "`residuals` not provided for variance estimation; using short-iqr residuals instead"
warning(msg)
resid <- short_iqr_result$resid
out$resid <- resid
} else {
resid <- residuals
out$resid <- resid
}
# dual variables for computing test-stat
# NOTE: differs from 'R/test_stat.R'
if (is.null(a_hat)) {
# threshold the residuals from the short-iqr to get a_hat
# To get residuals under full-vector inference (i.e., cardinality_J == p_D),
# we run a quantile regression, not the usual IQR MILP procedure.
# Hence, we don't need to print "status" or "objval" if we are in
# the full-vector setting because quantile regression doesn't return such
# results.
if (cardinality_J != p_D) {
if (short_iqr_result$status != "OPTIMAL") {
warning(paste("Short IQR Status:", short_iqr_result$status))
}
if (short_iqr_result$status == "TIME_LIMIT" || short_iqr_result$objval != 0) {
message(paste("Short IQR Objective Value:", short_iqr_result$objval))
out$ended_early <- TRUE
return(out)
}
}
a_hat <- as.numeric(short_iqr_result$resid > 0)
out$a_hat <- a_hat
} else {
# user-specified a_hat vector
out$a_hat <- a_hat
}
# Obtain \hat{b}
b_hat <- a_hat - (1 - tau)
# Create B
if (is.null(B)) {
B <- cbind(Phi_J, X_K)
}
stopifnot(nrow(B) == n)
stopifnot(ncol(B) == cardinality_J + cardinality_K)
out$B <- B
send_note_if("Created `B`", show_progress, message)
# Create \tilde{B} depending on homoskedasticity or heteroskedasticity
B_minus <- cbind(Phi_J_minus, X_K_minus)
stopifnot(nrow(B_minus) == n)
stopifnot(ncol(B_minus) == p_D - cardinality_J + p_X - cardinality_K)
C_minus <- cbind(D_J_minus, X_K_minus)
stopifnot(nrow(C_minus) == n)
stopifnot(ncol(C_minus) == p_D - cardinality_J + p_X - cardinality_K)
if (homoskedasticity) {
Psi <- diag(1, nrow = n)
} else {
# Hall and Sheather (1988) bandwidth
tmp_a <- n ^ (1 / 3)
tmp_b <- stats::qnorm(1 - 0.5 * alpha) ^ (2 / 3)
tmp_c <- 1.5 * (stats::dnorm(stats::qnorm(tau)) ^ 2)
tmp_d <- 2 * (stats::qnorm(tau) ^ 2) + 1
hs <- tmp_a * tmp_b * ((tmp_c / tmp_d) ^ (1 / 3))
out$hs <- hs
if (kernel == "powell") {
bw <- hs
# Note that the 1 / (2 * n * bw) is negated in the formula for B_tilde
Psi <- diag(as.numeric(abs(resid) < bw), nrow = n, ncol = n)
} else if (kernel == "gaussian") {
bw <- hs
# Note that the 1 / (n * bw) is negated in the formula for B_tilde
Psi <- diag(stats::dnorm(resid / bw), nrow = n, ncol = n)
} else {
stop(
"Let `homoskedasticity` be TRUE or choose an appropriate `kernel`."
)
}
}
G_minus <- Psi %*% C_minus
B_tilde <- B - B_minus %*% solve(t(G_minus) %*% B_minus) %*% t(G_minus) %*% B
send_note_if("Created `B_tilde`", show_progress, message)
out$Psi <- Psi
out$B_tilde <- B_tilde
out$homoskedasticity <- homoskedasticity
out$kernel <- ifelse(homoskedasticity, "homoskedasticity", kernel)
# Construct S_n with orthogonalize_statistic
if (orthogonalize_statistic) {
S_n <- n ^ (-1 / 2) * t(B_tilde) %*% b_hat
} else {
S_n <- n ^ (-1 / 2) * t(B) %*% b_hat
}
stopifnot(nrow(S_n) == cardinality_J + cardinality_K)
stopifnot(ncol(S_n) == 1)
out$S_n <- S_n
send_note_if("Created `S_n`", show_progress, message)
# Construct L_n or Q_n depending on |J| + |K| == or != 1
# Compute p-value
if (cardinality_J + cardinality_K == 1) {
denom <- sqrt(tau * (1 - tau) * t(B_tilde) %*% B_tilde / n)
out$denom <- denom
test_stat <- S_n / denom
p_val <- 2 * (1 - stats::pnorm(abs(test_stat)))
} else {
M_n <- (1 / n) * t(B_tilde) %*% B_tilde
out$M_n <- M_n
test_stat <- t(S_n) %*% solve(M_n) %*% S_n / (tau * (1 - tau))
p_val <- 1 - stats::pchisq(test_stat, df = cardinality_J + cardinality_K)
}
send_note_if("Computed test statistic and p-value", show_progress, message)
if (print_results) {
print(paste("Alpha:", alpha))
print(paste("Test Statistic:", test_stat))
print(paste("p-value:", p_val))
}
stopifnot(nrow(test_stat) == 1)
stopifnot(ncol(test_stat) == 1)
out$test_stat <- as.numeric(test_stat)
out$p_val <- p_val
# Stop the clock
clock_end <- Sys.time()
elapsed_time <- difftime(clock_end, clock_start, units = "mins")
out$time_elapsed <- elapsed_time
if (print_results) {
print(paste("Time Elapsed (mins):", elapsed_time))
}
return(out)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.