### Meta -------------------------
###
### Title: Compute IQR estimator by preprocessing MILP
###
### Description: The IQR estimator is computed by solving a MILP.
### This function pre-processes the data and fixes the sign of the residuals of
### outliers to speed up the procedure.
###
### Author: Omkar A. Katta
###
### preprocess_iqr_milp -------------------------
#' Compute IQR estimator by preprocessing MILP
#'
#' Fix the sign of outliers' residuals to solve the IQR MILP more quickly
#'
#' Obtain the residuals from a quantile regression of \code{Y} on \code{D} and
#' \code{X}. Use these residuals to guide which ones are the outliers, and fix
#' their dual variables. In particular, fix the sign of the dual variables
#' associated with residuals whose magnitude is larger than the
#' \eqn{\alpha}-percentile of the absolute residuals.
#' (Note that the initial value of \eqn{\alpha} is given by
#' \code{prop_alpha_initial}.)
#' Then, solve the preprocessed MILP.
#'
#' If the minimized objective of the preprocessed MILP is not 0,
#' then too many dual variables have been fixed.
#' So, relax the number of fixed dual variables by multiplying the bandwidth
#' \eqn{alpha} by \code{r}. Note that \code{r} should be at least 1 to ensure
#' we are relaxing our preprocessing constraints. We then solve the relaxed
#' preprocessed MILP.
#'
#' We repeatedly relax our preprocessing until the minimized objective is
#' finally 0.
#'
#' @param Y Dependent variable (vector of length n)
#' @param X Exogenous variable (including constant vector) (n by p_X matrix)
#' @param D Endogenous variable (n by p_D matrix)
#' @param Z Instrumental variable (n by p_Z matrix)
#' @param Phi Transformation of X and Z to be used in the program;
#' defaults to the linear projection of D on X and Z (matrix with n rows)
#' @param tau Quantile (number strictly between 0 and 1)
#' @param M A large number that bounds the absolute value of the residuals
#' (a positive number); defaults to 2 times the largest absolute residual from
#' quantile regression of Y on X and D
#' @param prop_alpha_initial Initial value of the bandwidth \eqn{alpha};
#' @param TimeLimit Maximum time (in seconds) spent on a linear program;
#' defaults to heuristic (numeric)
#' @param globalTimeLimit Maximum time (in seconds) spent on the entire
#' preprocessing function; defaults to Inf (i.e., no time limit) (numeric)
#' thus, 1 - \code{prop_alpha_initial} is the proportion of observations whose
#' dual variables (i.e., sign of the residuals) are fixed at the start
#' (number between 0 and 1)
#' @param r Rate at which we increase the bandwidth (\eqn{alpha}) at every
#' iteration (number greater than 1)
#' @param show_iterations If TRUE, print the iteration number to the console;
#' defaults to FALSE (boolean)
#' @param LogFileExt Extension of Gurobi log file; If \code{LogFileName} is
#' empty, then Gurobi log won't be saved and this argument will be ignored;
#' defaults to "log" (string)
#' @param ... Arguments that will be passed to \code{\link{iqr_milp}}
#'
#' @return A named list of
#' \enumerate{
#' \item final_fit: output of \code{\link{iqr_milp}} from the final
#' iteration
#' \item time: time elapsed for this function to run
#' \item iteration: number of iterations before objective is 0
#' \item O_neg: indices for residuals that are fixed to be negative
#' by the final iteration
#' \item O_pos: indices for residuals that are fixed to be positive
#' by the final iteration
#' }
#'
#' @export
preprocess_iqr_milp <- function(Y,
D,
X,
Z,
Phi = linear_projection(D, X, Z),
tau,
M = NULL,
TimeLimit = NULL,
globalTimeLimit = Inf,
prop_alpha_initial = 0.7,
r = 1.25,
show_iterations = FALSE,
LogFileExt = ".log",
...) {
out <- list() # Initialize list of results to return
# Start the clock
clock_start <- Sys.time()
# Get dimensions of data
n <- length(Y)
n_D <- nrow(D)
n_X <- nrow(X)
n_Z <- nrow(Z)
p_D <- ncol(D)
p_X <- ncol(X)
p_Z <- ncol(Z)
# if (!is.null(start)) {
# msg <- "No warm-starting allowed with preprocessing"
# send_note_if(msg, TRUE, warning)
# start <- NULL
# }
# If there are no endogeneous variables, return quantile regression results:
if (p_D == 0) {
msg <- paste("p_D is 0 -- running QR instead of IQR MILP...")
send_note_if(msg, TRUE, warning)
qr <- quantreg::rq(Y ~ X - 1, tau = tau)
out$final_fit <- qr
return(out)
}
# Determine preliminary residuals
if (p_X == 0) {
resid <- quantreg::rq(Y ~ D - 1, tau = tau)$residuals
} else if (p_D == 0) {
resid <- quantreg::rq(Y ~ X - 1, tau = tau)$residuals
} else {
resid <- quantreg::rq(Y ~ X + D - 1, tau = tau)$residuals
}
# Determine initial residual bounds
alpha_initial <- stats::quantile(abs(resid), prop_alpha_initial)
# Determine M before iqr_milp to avoid rerunning quantreg::rq
if (is.null(M)) {
# by default, M = 2 * max(resid from QR of Y on X and D)
# TODO: update heuristic for choosing M
# TODO: update documentation with default M
max_qr <- max(abs(resid))
M <- 2 * max_qr
}
# Start the while loop
alphawidth <- alpha_initial
status <- "TIME_LIMIT"
num_fixed_vars_per_iteration <- c()
time_limit_per_iteration <- c()
time_elapsed_per_iteration <- c()
final_objective_per_iteration <- c()
# Continue the while loop if the program took too long to solve or if
# the objective (i.e., absolute value of beta_Z) is not 0. Note that
# if the program is infeasible, i.e., the objective was NULL, we also
# continue the while loop (we mechanically set the objective to be nonzero
# later in the code).
counter <- 0
while (status == "TIME_LIMIT" || obj != 0) {
counter <- counter + 1
send_note_if(paste("Iteration", counter), show_iterations, message)
while_start_time <- Sys.time()
# TODO: Is it possible for two iterations of the while loop to have the same number of fixed variables?
# Fix the most negative and most positive residuals
O_neg <- which(resid < -1 * alphawidth)
O_pos <- which(resid > alphawidth)
O <- c(O_neg, O_pos)
send_note_if(paste("Alpha:", alphawidth), show_iterations, message)
# TODO: are we fixing the "dual" variables? I think we can improve the message below
send_note_if(paste("Number of Fixed Dual Variables:", length(O)), show_iterations, message)
num_fixed_vars_per_iteration <- c(num_fixed_vars_per_iteration, length(O))
# Heuristic for time limit
if (length(O) == 0) {
TT <- Inf
} else if (is.null(TimeLimit)) {
num_free <- n - length(O)
TT <- exp(num_free / 200 + p_D / 5 + num_free * p_D / 1000) * 4
} else {
TT <- TimeLimit
}
if (TT > globalTimeLimit) {
TT <- globalTimeLimit
} # TODO: check if globalTimeLimit is less than TimeLimit
send_note_if(paste("TT:", TT), show_iterations, message)
time_limit_per_iteration <- c(time_limit_per_iteration, TT)
# IQR
fit <- iqr_milp(Y = Y,
X = X,
D = D,
Z = Z,
Phi = Phi,
tau = tau,
O_neg = O_neg,
O_pos = O_pos,
TimeLimit = TT,
M = M, # by default, M = 2 * max(resid from QR of Y on X and D)
LogFileExt = paste0("_", counter, LogFileExt),
...)
final_objective_per_iteration <- c(final_objective_per_iteration,
ifelse(is.null(fit$objval),
"NULL",
as.character(round(fit$objval, 3))))
if (is.null(fit$objval)) {
obj <- 0.5
} else {
obj <- fit$objval
}
status <- fit$status
alphawidth <- alphawidth * r # TODO: add check to make sure alphawidth is not 0; otherwise, infeasibility will repeat ad infinitum and we won't leave while loop
if (show_iterations) {
print(paste("Iteration", counter, "complete"))
}
if (TT == Inf & obj != 0) {
warning("Nonzero Coefficients on Instruments")
break # exit while loop
}
current <- Sys.time()
while_elapsed <- difftime(current, while_start_time, units = "secs")
time_elapsed_per_iteration <- c(time_elapsed_per_iteration, while_elapsed)
elapsed_time <- difftime(current, clock_start, units = "secs")
if (as.numeric(elapsed_time) > globalTimeLimit) {
warning(paste("Global Time Limit of", globalTimeLimit, "reached."))
break # exit while loop
}
}
# Stop the clock
clock_end <- Sys.time()
elapsed_time <- difftime(clock_end, clock_start, units = "mins")
# Return results
out$status <- fit$status
out$final_fit <- fit
out$time <- elapsed_time # mins
out$O_neg <- O_neg
out$O_pos <- O_pos
out$iterations <- counter
out$num_fixed_vars_per_iteration <- num_fixed_vars_per_iteration
out$time_limit_per_iteration <- time_limit_per_iteration
out$time_elapsed_per_iteration <- time_elapsed_per_iteration
out$final_objective_per_iteration <- final_objective_per_iteration
return(out)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.