#Depends: AER, sandwich
library(AER);
library(sandwich);
#'The Confidence Interval Method for Selecting Valid Instrumental Variables.
#'
#'Implements the confidence interval (CI) downward testing procedure based on
#'the Sargan/Hansen-J test for overidentifying restrictions. The algorithm
#'selects the valid instrumental variables from a set of potential instruments
#'that may contain invalid ones. It also provides post-selection IV estimation
#'results using the selected valid instruments as IV and controlling for the
#'selected invalid instruments as explanatory variables.
#'
#'@param Y A numeric vector of outcomes.
#'@param D A numeric vector of exposures/treatments.
#'@param Z A numeric matrix of instrumental variables, with each column
#' referring to one instrument.
#'@param X An optional numeric matrix of exogenous explanatory variables, with
#' each column referring to one variable.
#'@param alpha A numeric scalar between 0 and 1 specifying the significance
#' level for the confidence interval for the causal effect estimate (default =
#' 0.05).
#'@param tuning A numeric scalar specifiying the threshold p-value for the
#' Saran/Hansen-J test (default = 0.1/log(n)).
#'@param robust Logical. If robust = TRUE, the linear model is robust to
#' heteroskedasticity (default = TRUE).
#'@param firststage Logical. If firststage = TRUE, a first-stage thresholding is
#' implemented to select the relevant instrument variables (default = FALSE).
#'@param firsttuning A numeric scalar specifiying the threshold critical value for the
#' first stage t-test (default = sqrt(2.01*log(pz))).
#'@return Valid instruments:
#' Identities of the valid instrumental variables
#' selected by the algorithm.
#'@return Number of Valid Instruments:
#' The number of the selected valid instrumental variables.
#'@return Relevant instruments:
#' Identities of the relevant instrumental variables
#' selected by the first stage thresholding if firststage = TRUE.
#'@return Number of Relevant Instruments:
#' The number of the selected relevant instrumental variables
#' if firststage = TRUE.
#'@return Coefficients:
#' The matrix for the post-selection IV estimation results
#' for the coefficients of the exposure/treatment variable and exogenous
#' explanatory variables using the selected valid instruments as IV and
#' controlling for the selected invalid instruments. The first two columns are
#' 2SLS estimates and their standard errors. If robust = TRUE, the second
#' column is heteroskedasticity-robust stand errors. The two-step GMM estimates
#' and their standard errors are also reported. If intercept = TRUE, the
#' estimates for the intercept are reported in the last row.
#'@return Confidence Interval:
#' The confidence interval for the 2SLS estimates for
#' the coefficient of the exposure/treatment variable with significance level
#' specified by alpha (default = 0.05). If robust = TRUE, the confidence
#' interval for the two-step GMM estimate is also reported.
#'@return p-value of Sargan:
#' The p-value for the Sargan overidentifying test for
#' the selected valid instruments. If robust = TRUE, the p-value for the
#' Hansen-J test is reported.
#'@examples
#'library(AER); library(sandwich)
#'# the MASS package is only needed to
#'run the working example
#'library(MASS)
#'#Generate data
#'n = 2000; L = 10; s = 3
#'pi = c(rep(3,s),rep(0,L-s)); beta = 1; gamma = c(0, rep(1,(L-2)), 0)
#'epsilonSigma = matrix(c(1,0.8,0.8,1),2,2)
#'epsilon = mvrnorm(n,rep(0,2),epsilonSigma)
#'Z = matrix(rnorm(n*L),n,L)
#'D = 0.5 + Z %*% gamma + epsilon[,1]
#'Y = -0.5 + Z %*% pi + D * beta + epsilon[,2]
#'result = CIIV(Y,D,Z,robust=FALSE, firststage=TRUE)
#'result
#' @export
CIIV <- function(Y, D, Z, X, alpha = 0.05, tuning = 0.1/log(length(Y)), robust = TRUE, firststage = FALSE, firsttuning = sqrt(2.01*log(ncol(Z)))) {
#Check Input
#Check Data
if(is.data.frame(Y)){Y <- as.matrix(Y)}
if(!is.vector(Y) && !is.matrix(Y) | !is.numeric(Y) | ncol(as.matrix(Y))!=1)
stop("Y must be a numeric vector.");
Y = as.numeric(Y);
if(!is.null(colnames(D))){Dname = colnames(D)}else{Dname = "D"};
if(is.data.frame(D)){D <- as.matrix(D)}
if(!is.vector(D) && !is.matrix(D) | !is.numeric(D) | ncol(as.matrix(D))!=1)
stop("D must be a numeric vector.");
D = as.numeric(D);
if(is.data.frame(Z)){Z <- as.matrix(Z)}
if(!is.matrix(Z) | !is.numeric(Z))
stop("Z must be a numerical matrix.");
if(ncol(Z)<2)
stop("The number of instruments must be greater than 1.");
if( ncol(Z) > nrow(Z))
stop("The number of instruments must be smaller than the sample size.");
if(!missing(X) && is.data.frame(X)){X <- as.matrix(X)}
if(!missing(X) && (!is.matrix(X) | !is.numeric(X)))
stop("X must be a numerical matrix.");
stopifnot(length(Y) == length(D),length(Y) == nrow(Z));
#Other Arguments
stopifnot(is.logical(firststage), is.logical(robust));
stopifnot(is.numeric(alpha), length(alpha) == 1,alpha <= 1,alpha >= 0);
stopifnot(is.numeric(tuning), length(tuning) == 1);
# Define Constants
n <- length(Y);
pz <- ncol(Z);
# Preserve Data
d = D;
y = Y;
z = Z;
if (!missing(X)){
X <- cbind(1, X);
if(is.null(colnames(X))){colnames(X) = c("intercept", paste0('X', 1:ncol(X)))};
}else{
X <- matrix(1, n, 1);
colnames(X) <- "intercept";
};
Covariates = cbind(D,X);
Exogenous = cbind(Z,X);
# Variable Names
colnames(Covariates)[1] <- Dname;
CovariatesNames <- colnames(Covariates);
if(!is.null(colnames(Z))){InstrumentNames = colnames(Z)}else{InstrumentNames = paste0('Z', 1:ncol(Z))};
# Centralization
D <- qr.resid(qr(X), D);
Z <- scale(qr.resid(qr(X), Z), center = FALSE, scale = TRUE);
# First Stage
if (firststage){
lm_first <- lm(D ~ Z - 1);
coef_first <- coef(lm_first);
sd_first <- sqrt(diag(if(robust) vcovHC(lm_first, "HC0") else vcov(lm_first)));
first_index <- as.numeric(abs(coef_first/sd_first) >= firsttuning);
relevant_index <- relevant_instruments <- which(first_index == 1);
Nr_relevant <- length(relevant_instruments);
if (sum(first_index) <= 1){
warning("Less than two IVs are individually relevant, treat all IVs as strong");
relevant_instruments <- c(1:pz);
}
if (length(relevant_instruments) < pz){
X <- cbind(X, z[, -relevant_instruments]);
Z <- as.matrix(z[, relevant_instruments], nrow = n, ncol = Nr_relevant);
pz <- ncol(Z);
D <- qr.resid(qr(X), d);
Z <- scale(qr.resid(qr(X), Z), center = FALSE, scale = TRUE);
}
}else{
relevant_index <- relevant_instruments <- c(1:pz);
Nr_relevant <- pz;
};
Y <- qr.resid(qr(X), Y);
# OLS Estimation for the IV-Specific Estimates and Standard Error
lm_Reduced <- lm(cbind(Y, D) ~ Z - 1);
# IV-Specific Estimates
gamma_Y <- coef(lm_Reduced)[, 1];
gamma_D <- coef(lm_Reduced)[, 2];
betaIV <- gamma_Y / gamma_D;
# Variances by Delta Method
U <- solve(crossprod(Z));
Jacobian_betaIV <- cbind(
diag(c(1 / gamma_D)), diag(c(-gamma_Y / gamma_D^2))
);
Covmat_gamma <- if(robust) vcovHC(lm_Reduced, "HC0") else vcov(lm_Reduced);
sdIV <- sqrt(diag(Jacobian_betaIV %*% Covmat_gamma %*% t(Jacobian_betaIV)));
# CI Downward Testing Sargen/Hansen-J Procedure and Post-Selection Estimation
CIIV.TestingSelectionEstimation(
Y, D, Z, U,
Covariates,
Exogenous,
y, z,
CovariatesNames,
InstrumentNames,
betaIV = betaIV,
sdIV = sdIV,
alpha = alpha,
tuning = tuning,
robust = robust,
gamma_D = gamma_D,
Covmat_gamma = Covmat_gamma,
firststage = firststage,
relevant_instruments = relevant_instruments,
Nr_relevant = Nr_relevant,
relevant_index = relevant_index
);
}
#Selection by the Sargan/Hansen Downward Testing Procedure
#'Internal CIIV Functions
#'
#'These are not to be called by the user.
CIIV.TestingSelectionEstimation <- function(
Y, D, Z, U, Covariates, Exogenous, y, z, CovariatesNames, InstrumentNames, betaIV, sdIV, alpha = 0.05, tuning = 0.1/log(length(Y)), robust = TRUE,
gamma_D, Covmat_gamma, firststage = FALSE, relevant_instruments, Nr_relevant, relevant_index) {
# Function for Sargan Test
non_robust_sar_CI <- function(res_CI, Z, U, n) {
(t(res_CI) %*% Z %*% U %*% t(Z) %*% res_CI) /
(t(res_CI) %*% res_CI / n)
}
# Function for Two Step GMM and Hansen-J Test
CIM.HansenJTest <- function(Y,X,Z){
res_FirstStep <- residuals(AER::ivreg(Y ~ X - 1 | Z));
Weight_SecondStep <- crossprod(res_FirstStep * Z);
Coef_SecondStep <- solve(
t(X) %*% Z %*% solve(Weight_SecondStep) %*%t(Z) %*% X
) %*% t(X) %*% Z %*% solve(Weight_SecondStep) %*% t(Z) %*% Y;
res_SecondStep <- as.vector(Y - X %*% Coef_SecondStep);
sd_SecondStep <- sqrt(diag(solve(
t(X) %*% Z %*% solve(Weight_SecondStep) %*%t(Z) %*% X
) %*% t(X) %*% Z %*% solve(Weight_SecondStep)%*%crossprod(res_SecondStep * Z)%*%t(
solve(
t(X) %*% Z %*% solve(Weight_SecondStep) %*%t(Z) %*% X
) %*% t(X) %*% Z %*% solve(Weight_SecondStep)
)));
HansenJ_Stat <- t(res_SecondStep) %*% Z %*% solve(Weight_SecondStep) %*%
t(Z) %*% res_SecondStep;
list(HansenJ_Stat, Coef_SecondStep,sd_SecondStep)
}
# Define Constants
n <- length(Y);
pz <- ncol(Z);
colnames(Z) = paste0(relevant_instruments);
colnames(z) = paste0(c(1:ncol(z)));
# Break Points
crit <- matrix(0, pz, pz);
rownames(crit) = colnames(crit) <- paste0(relevant_instruments);
for (i in 1:pz) {
for (j in 1:pz) {
crit[i, j] <- abs(betaIV[i] - betaIV[j]) / (sdIV[i] + sdIV[j]);
}
}
# The Downward Testing Procedure
maxs <- pz;
Psar <- 0;
epsi <- 10^(-7);
Psi <- max(crit) + epsi;
while (maxs > 0 && Psar < tuning) {
# The confidence intervals
ciIV <- matrix(0, pz, 3);
ciIV[, 1] <- betaIV - Psi * sdIV;
ciIV[, 2] <- betaIV + Psi * sdIV;
ciIV[, 3] <- relevant_instruments;
# Ordering the Confidence Intervals by the Lower Ends
ciIV <- ciIV[order(ciIV[, 1]), ];
# Check Overlap
grid_CI <- matrix(0, pz, pz)
for (i in 2:pz) {
for (j in 1:i) {
grid_CI[i, j] <- as.numeric(ciIV[i, 1] <= ciIV[j, 2]);
}
}
sumCI <- apply(grid_CI, 1, sum);
selCI <- as.matrix(grid_CI[which(sumCI == max(sumCI)), ]);
maxs <- max(sumCI);
#Selection
if(maxs >= 2 && length(selCI) < pz + 1) { # No tie
selCI <- cbind(ciIV[, 3], selCI);
selVec <- sort(selCI[selCI[, 2] == 1, 1]);
Zv <- Z[,!colnames(Z) %in% paste0(selVec)];
if(robust) {
sar_CI <- CIM.HansenJTest(Y, cbind(D, Zv), Z)[[1]];
} else {
res_CI <- resid(AER::ivreg(Y ~ cbind(D, Zv) - 1 | Z));
sar_CI <- non_robust_sar_CI(res_CI, Z, U, n);
}
Psi <- max(crit[paste0(selVec), paste0(selVec)]) - epsi; # updated psi
Psar <- pchisq(sar_CI, maxs - 1, lower.tail = FALSE);
} else if(maxs >= 2) { # Tie
selCI <- cbind(ciIV[, 3], t(selCI));
selVec <- matrix(0, maxs, ncol(selCI) - 1);
Psar <- Psi <- rep(0, ncol(selCI) - 1);
for (i in 1:(ncol(selCI) - 1)) {
selVec[, i] <- sort(selCI[selCI[, i+1] == 1, 1]);
if(robust){
sar_CI <- CIM.HansenJTest(Y, cbind(D, Z[,!colnames(Z) %in% paste0(selVec[, i])]), Z)[[1]];
} else{
res_CI <- resid(AER::ivreg(Y ~ D + Z[,!colnames(Z) %in% paste0(selVec[, i])] - 1 | Z));
sar_CI <- non_robust_sar_CI(res_CI, Z, U, n);
}
Psar[i] <- pchisq(sar_CI, maxs - 1, lower.tail = FALSE);
Psi[i] <- max(crit[paste0(selVec[, i]), paste0(selVec[, i])]);
}
index.tie <- match(max(Psar), Psar);
selVec <- selVec[,index.tie];
Psar <- Psar[index.tie];
Psi <- min(Psi) - epsi; # updated psi
} else { # None Valid
maxs <- 0;
Psar <- 0;
selVec <- NULL;
}
}
Nr_valid <- length(selVec);
# Post-Selection Estimation
if (Nr_valid == 0) {
print("None of the instruments is selected as valid, do OLS.")
regressor_CIM_temp = regressor_CIM <- cbind(Covariates, z);
length_regressor <- ncol(regressor_CIM)-ncol(z);
Coefficients_CIM <- qr.coef(qr(regressor_CIM), y)[1:length_regressor];
res_CIM <- qr.resid(qr(regressor_CIM), y);
} else {
# At least one of the IVs is selected as valid.
z_invalid <- matrix(z[,!colnames(z) %in% paste0(selVec)], ncol = (ncol(z) - Nr_valid), nrow = n);
regressor_CIM_temp <- cbind(Covariates, z_invalid);
regressor_CIM <- cbind(fitted(lm(Covariates[,1] ~ Exogenous)), Covariates[,-1], z_invalid)
length_regressor <- ncol(regressor_CIM_temp) - ncol(z_invalid);
iv_CIM <- AER::ivreg(y ~ regressor_CIM_temp - 1 | Exogenous);
Coefficients_CIM <- coef(iv_CIM)[1:length_regressor];
if(robust){
Coefficients_CIM_GMM <- (CIM.HansenJTest(y, regressor_CIM_temp, Exogenous)[[2]])[1:length_regressor];
}
res_CIM <- resid(iv_CIM);
}
# Standard Error and Confidence Interval
if (robust) {
sd_CIM <- sqrt(diag(
solve(crossprod(regressor_CIM)) %*%
crossprod(res_CIM * regressor_CIM) %*%
solve(crossprod(regressor_CIM))
))[1:length_regressor];
ci_CIM <- c(
Coefficients_CIM[1] - qnorm(1-alpha/2) * sd_CIM[1],
Coefficients_CIM[1] + qnorm(1-alpha/2) * sd_CIM[1]
);
if (Nr_valid == 0){
Coefficients_CIM_GMM <- NA;
sd_CIM_GMM <- NA;
ci_CIM_GMM <- NA;
}else{
sd_CIM_GMM <- (CIM.HansenJTest(y, regressor_CIM_temp, Exogenous)[[3]])[1:length_regressor];
ci_CIM_GMM <- c(
Coefficients_CIM_GMM[1] - qnorm(1-alpha/2) * sd_CIM_GMM[1],
Coefficients_CIM_GMM[1] + qnorm(1-alpha/2) * sd_CIM_GMM[1]
);
}
} else {
sd_CIM <- sqrt(diag(
mean(res_CIM^2) * solve(crossprod(regressor_CIM))
))[1:length_regressor];
ci_CIM <- c(
Coefficients_CIM[1] - qnorm(1-alpha/2) * sd_CIM[1],
Coefficients_CIM[1] + qnorm(1-alpha/2) * sd_CIM[1]
);
}
# Results
if(robust){
object <- list(
robust = robust,
if(is.null(selVec)){
Valid_Instruments = paste0("None instruments selected as valid.");
}else{
Valid_Instruments = InstrumentNames[c(1:length(z))[selVec]];
},
Valid_Instruments = Valid_Instruments,
Nr_valid = Nr_valid,
if(firststage){
if(Nr_relevant == 0){
Relevant_Instruments = paste0("None instruments selected as relevant.")
}else{
Relevant_Instruments = InstrumentNames[c(1:length(z))[relevant_index]];
};
}else{
Relevant_Instruments = paste0("No first stage selection.");
Nr_relevant = paste0("No first stage selection.");
},
Relevant_Instruments = Relevant_Instruments,
Nr_relevant = Nr_relevant,
Covariate_Names = CovariatesNames,
Coefficients_CIM = Coefficients_CIM,
sd_CIM = sd_CIM,
Coefficients_CIM_GMM = Coefficients_CIM_GMM,
sd_CIM_GMM = sd_CIM_GMM,
ci_CIM = ci_CIM,
ci_CIM_GMM = ci_CIM_GMM,
HansenJ_CIM = Psar
)
}else{
object <- list(
robust = robust,
if(is.null(selVec)){
Valid_Instruments = paste0("None instruments selected as valid.");
}else{
Valid_Instruments = InstrumentNames[c(1:ncol(z))[selVec]];
},
Valid_Instruments = Valid_Instruments,
Nr_valid = Nr_valid,
if(firststage){
if(Nr_relevant == 0){
Relevant_Instruments = paste0("None instruments selected as relevant.")
}else{
Relevant_Instruments = InstrumentNames[c(1:length(z))[relevant_index]];
};
}else{
Relevant_Instruments = paste0("No first stage selection.");
Nr_relevant = paste0("No first stage selection.");
},
Relevant_Instruments = Relevant_Instruments,
Nr_relevant = Nr_relevant,
Covariate_Names = CovariatesNames,
Coefficients_CIM = Coefficients_CIM,
sd_CIM = sd_CIM,
ci_CIM = ci_CIM,
Sargan_CIM = Psar
)
}
class(object) <- "CIIV"
object
}
#'Internal CIIV Functions
#'
#'These are not to be called by the user.
print.CIIV <- function(object,robust = object$robust,...){
cat("\nValid Instruments:\n", object$Valid_Instruments, "\n","\nNumber of Valid Instruments:\n", object$Nr_valid,"\n");
cat("\nRelevant Instruments:\n", object$Relevant_Instruments, "\n","\nNumber of Relevant Instruments:\n", object$Nr_relevant,"\n");
cat("\nCoefficients:\n")
names(object$Coefficients_CIM) = names(object$sd_CIM) = names(object$ci_CIM) = NULL;
if(robust){
names(object$Coefficients_CIM_GMM) = names(object$sd_CIM_GMM) = names(object$ci_CIM_GMM) = NULL;
coef_cim <- cbind(object$Coefficients_CIM, object$sd_CIM, object$Coefficients_CIM_GMM,object$sd_CIM_GMM);
colnames(coef_cim) = c("2SLS Estimate", "2SLS Std. Error", "GMM Estimate", "GMM Std. Error");
rownames(coef_cim) = object$Covariate_Names;
print(coef_cim, quote = FALSE);
cat("\nConfidence Interval 2SLS: [", object$ci_CIM[1], ",", object$ci_CIM[2], "]", "\n", sep = '');
cat("\nConfidence Interval GMM: [", object$ci_CIM_GMM[1], ",", object$ci_CIM_GMM[2], "]", "\n", sep = '');
cat("\np-value of Hansen-J: ", object$HansenJ_CIM, sep = '');
}else{
coef_cim <- cbind(object$Coefficients_CIM, object$sd_CIM);
colnames(coef_cim) = c("2SLS Estimate", "Std. Error");
rownames(coef_cim) = object$Covariate_Names;
print(coef_cim, quote = FALSE);
cat("\nConfidence Interval: [", object$ci_CIM[1], ",", object$ci_CIM[2], "]", "\n", sep = '');
cat("\np-value of Sargan: ", object$Sargan_CIM, sep = '');
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.