#' @title Endogeneity test in high dimensions
#' @description Conduct the endogeneity test with high dimensional and possibly invalid instrumental variables.
#'
#' @param Y The outcome observation, a vector of length \eqn{n}.
#' @param D The treatment observation, a vector of length \eqn{n}.
#' @param Z The instrument observation of dimension \eqn{n \times p_z}.
#' @param X The covariates observation of dimension \eqn{n \times p_x}.
#' @param intercept Whether the intercept is included. (default = \code{TRUE})
#' @param invalid If \code{TRUE}, the method is robust to the presence of possibly invalid IVs; If \code{FALSE}, the method assumes all IVs to be valid. (default = \code{FALSE})
#' @param method The method used to estimate the reduced form parameters. \code{"OLS"} stands for ordinary least squares, \code{"DeLasso"} stands for the debiased Lasso estimator, and \code{"Fast.DeLasso"} stands for the debiased Lasso estimator with fast algorithm. (default = \code{"Fast.DeLasso"})
#' @param voting The voting option used to estimate valid IVs. \code{'MP'} stnads for majority and plurality voting, \code{'MaxClique'} stands for maximum clique in the IV voting matrix. (default = \code{'MP'})
#' @param alpha The significance level for the confidence interval. (default = \code{0.05})
#' @param tuning.1st The tuning parameter used in the 1st stage to select relevant instruments. If \code{NULL}, it will be generated data-dependently, see Details. (default=\code{NULL})
#' @param tuning.2nd The tuning parameter used in the 2nd stage to select valid instruments. If \code{NULL}, it will be generated data-dependently, see Details. (default=\code{NULL})
#'
#' @details
#' When \code{voting = MaxClique} and there are multiple maximum cliques, the null hypothesis is rejected if one of maximum clique rejects the null.
#' As for tuning parameter in the 1st stage and 2nd stage, if do not specify, for method "OLS" we adopt \eqn{\sqrt{\log n}} for both tuning parameters, and for other methods
#' we adopt \eqn{\max{(\sqrt{2.01 \log p_z}, \sqrt{\log n})}} for both tuning parameters.
#'
#' @return
#' \code{endo.test} returns an object of class "endotest", which is a list containing the following components:
#' \item{\code{Q}}{The test statistic.}
#' \item{\code{Sigma12}}{The estimated covaraince of the regression errors.}
#' \item{\code{VHat}}{The set of selected vaild IVs.}
#' \item{\code{p.value}}{The p-value of the endogeneity test.}
#' \item{\code{check}}{The indicator that \eqn{H_0:\Sigma_{12}=0} is rejected.}
#' @export
#'
#' @examples
#'
#' n = 500; L = 11; s = 3; k = 10; px = 10;
#' alpha = c(rep(3,s),rep(0,L-s)); beta = 1; gamma = c(rep(1,k),rep(0,L-k))
#' phi<-(1/px)*seq(1,px)+0.5; psi<-(1/px)*seq(1,px)+1
#' epsilonSigma = matrix(c(1,0.8,0.8,1),2,2)
#' Z = matrix(rnorm(n*L),n,L)
#' X = matrix(rnorm(n*px),n,px)
#' epsilon = MASS::mvrnorm(n,rep(0,2),epsilonSigma)
#' D = 0.5 + Z %*% gamma + X %*% psi + epsilon[,1]
#' Y = -0.5 + Z %*% alpha + D * beta + X %*% phi + epsilon[,2]
#' endo.test.model <- endo.test(Y,D,Z,X,invalid = TRUE)
#' summary(endo.test.model)
#'
#'
#' @references {
#' Guo, Z., Kang, H., Tony Cai, T. and Small, D.S. (2018), Testing endogeneity with high dimensional covariates, \emph{Journal of Econometrics}, Elsevier, vol. 207(1), pages 175-187. \cr
#' }
#'
#'
#'
endo.test <- function(Y,D,Z,X,intercept=TRUE,invalid=FALSE, method=c("Fast.DeLasso","DeLasso","OLS"),
voting = c('MP','MaxClique'), alpha=0.05,tuning.1st=NULL, tuning.2nd=NULL){
# Check and Clean Input Type #
# Check Y
method = match.arg(method)
voting = match.arg(voting)
stopifnot(!missing(Y),(is.numeric(Y) || is.logical(Y)),is.vector(Y)||(is.matrix(Y) || is.data.frame(Y)) && ncol(Y) == 1)
stopifnot(all(!is.na(Y)))
if (is.vector(Y)) {
Y <- cbind(Y)
}
Y = as.numeric(Y)
# Check D
stopifnot(!missing(D),(is.numeric(D) || is.logical(D)),is.vector(D)||(is.matrix(D) || is.data.frame(D)) && ncol(D) == 1)
stopifnot(all(!is.na(D)))
if (is.vector(D)) {
D <- cbind(D)
}
D = as.numeric(D)
# Check Z
stopifnot(!missing(Z),(is.numeric(Z) || is.logical(Z)),(is.vector(Z) || is.matrix(Z)))
stopifnot(all(!is.na(Z)))
if (is.vector(Z)) {
Z <- cbind(Z)
}
# Check dimesions
stopifnot(nrow(Y) == nrow(D), nrow(Y) == nrow(Z))
# Check X, if present
if(!missing(X)) {
stopifnot((is.numeric(X) || is.logical(X)),(is.vector(X))||(is.matrix(X) && nrow(X) == nrow(Z)))
stopifnot(all(!is.na(X)))
if (is.vector(X)) {
X <- cbind(X)
}
W = cbind(Z,X)
} else {
W = Z
}
# centralize W
W = scale(W, center = T, scale = F)
# All the other argument
stopifnot(is.logical(intercept))
stopifnot(is.numeric(alpha),length(alpha) == 1,alpha <= 1,alpha >= 0)
stopifnot(method=='OLS' | method=='DeLasso' | method =="Fast.DeLasso")
stopifnot(voting=='MaxClique' | voting=='MP')
# Derive Inputs for Endogeneity test
n = length(Y); pz=ncol(Z)
if(method == "OLS") {
inputs = TSHT.OLS(Y,D,W,pz,intercept) # recommend when n>>p
} else if (method == "Fast.DeLasso"){
inputs = TSHT.DeLasso(Y,D,W,pz,intercept)
} else if(method == "DeLasso"){
inputs = TSHT.SIHR(Y, D, W, pz, intercept=intercept)
}
ITT_Y = inputs$ITT_Y;
ITT_D = inputs$ITT_D;
WUMat = inputs$WUMat;
SigmaSqD = inputs$SigmaSqD;
SigmaSqY = inputs$SigmaSqY;
SigmaYD=inputs$SigmaYD;
V.Gamma = SigmaSqY * t(WUMat)%*%WUMat / n
V.gamma = SigmaSqD * t(WUMat)%*%WUMat / n
C = SigmaYD * t(WUMat)%*%WUMat / n
# Estimate Relevant IVs
voting = match.arg(voting)
if (invalid) {
SetHats = TSHT.VHat(n, ITT_Y, ITT_D, V.Gamma, V.gamma, C, voting, method=method,tuning.1st=tuning.1st, tuning.2nd=tuning.2nd)
Set = SetHats$VHat
} else {
SetHats <- endo.SHat(n,ITT_D,V.gamma,method=method,tuning.1st=tuning.1st, tuning.2nd=tuning.2nd)
Set = SetHats
}
if(typeof(Set)=="list"){
Q = Sigma12 = p.value = check = vector("list", length(Set))
names(Set) =names(Q) = names(Sigma12) = names(p.value) = names(check) = paste0("MaxClique",1:length(Set))
for(i.VHat in 1:length(Set)){
VHat.i = Set[[i.VHat]]
betaHat.i = as.numeric((t(ITT_Y[VHat.i]) %*% ITT_D[VHat.i]) / (t(ITT_D[VHat.i]) %*% ITT_D[VHat.i]))
Sigma11.i = inputs$SigmaSqY + betaHat.i^2 * inputs$SigmaSqD - 2*betaHat.i * inputs$SigmaYD # Sigma11.hat
Sigma12.i = inputs$SigmaYD-betaHat.i*inputs$SigmaSqD # Sigma12.hat
Var1.i = Sigma11.i *(t(ITT_D[VHat.i]) %*% ((t(WUMat) %*% WUMat/ n)[VHat.i,VHat.i]) %*% ITT_D[VHat.i]) / (t(ITT_D[VHat.i]) %*% ITT_D[VHat.i])^2
Var2.i = SigmaSqY*SigmaSqD-SigmaYD^2+2*(betaHat.i*SigmaSqD-SigmaYD)^2 # VarThetahat
VarSig12.i = inputs$SigmaSqD^2*Var1.i+Var2.i
Q.i = sqrt(n)*Sigma12.i/sqrt(VarSig12.i) # our test statistic
p.value.i <- 2*(1-pnorm(abs(Q.i)))
check.i <- (abs(Q.i)>qnorm(1-alpha/2))
# store
Q[[i.VHat]] = Q.i
Sigma12[[i.VHat]] = Sigma12.i
p.value[[i.VHat]] = p.value.i
check[[i.VHat]] = check.i
}
if(!is.null(colnames(Z))){
Set = lapply(Set, FUN=function(x) colnames(Z)[x])
}
}else {
# Obtain point est and our test statistic Q
betaHat = t(inputs$ITT_Y[Set]) %*% inputs$ITT_D[Set] / (t(inputs$ITT_D[Set]) %*% inputs$ITT_D[Set])
Sigma11 = inputs$SigmaSqY + betaHat^2 * inputs$SigmaSqD - 2*betaHat * inputs$SigmaYD # Sigma11.hat
Sigma12 = inputs$SigmaYD-betaHat*inputs$SigmaSqD # Sigma12.hat
Var1 = Sigma11 *(t(inputs$ITT_D[Set]) %*% ((t(inputs$WUMat) %*% inputs$WUMat/ n)[Set,Set]) %*% inputs$ITT_D[Set]) / (t(inputs$ITT_D[Set]) %*% inputs$ITT_D[Set])^2
Var2 = inputs$SigmaSqY*inputs$SigmaSqD-inputs$SigmaYD^2+2*(betaHat*inputs$SigmaSqD-inputs$SigmaYD)^2 # VarThetahat
VarSig12 = inputs$SigmaSqD^2*Var1+Var2
Q = sqrt(n)*Sigma12/sqrt(VarSig12) # our test statistic
if (!is.null(colnames(Z))) {
Set = colnames(Z)[Set]
}
p.value <- 2*(1-pnorm(abs(Q)))
check <- (abs(Q)>qnorm(1-alpha/2))
}
endo.test.model <- list(Q=Q,Sigma12=Sigma12,VHat=Set,p.value = p.value,check = check,alpha = alpha)
class(endo.test.model) <- "endotest"
return(endo.test.model)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.