# $Id: condfstat.R 1943 2016-04-07 23:08:38Z sgaure $
ivbootstrap <- function(z, x, y, quantiles=0.95, N=100L, cluster=NULL) {
# estimate bias as E((z' x)^{-1} z' eps)
N <- max(N)
if(!is.null(cluster)) {
if(length(cluster) > 1)
warning('Only a single cluster is supported for IV bootstrap, using ',
names(cluster)[[1]])
clu <- cluster[[1]]
iclu <- as.integer(clu)
}
# zortho <- orthonormalize(z)
pint <- getOption('lfe.pint')
start <- last <- Sys.time()
# hmm, can we reduce the number of instruments?
n <- 0
bias <- replicate(N,{
n <<- n + 1
now <- Sys.time()
if(is.numeric(pint) && pint > 0 && now-last > pint) {
message(date(), ' Iteration ', n , ' of ', N , ' in IV bootstrap')
last <<- now
}
if(is.null(cluster)) {
# resampling observations for indep residuals
s <- sample(nrow(z),replace=TRUE)
} else {
# resample entire levels
cl <- sort(sample(nlevels(clu), replace=TRUE))
# find a faster way to do this:
# s <- sort(unlist(sapply(cl, function(ll) which(clu==ll))))
s <- NULL
while(length(cl) > 0) {
s <- c(s,which(iclu %in% cl))
cl <- cl[c(1L,diff(cl)) == 0]
}
}
# draw new instruments
zortho <- orthonormalize(z[s,,drop=FALSE])
# draw new X, and project it
zX <- crossprod(zortho, x[s,,drop=FALSE])
tryCatch(solve(crossprod(zX),
crossprod(zX, crossprod(zortho, y[s,,drop=FALSE]))),
error=function(e) {warning(e);NULL})
})
if(start != last) cat('\n')
if(is.list(bias)) {
# some of them returned NULL, so replicate returned a list
# discard those NULLs
bias <- simplify2array(bias[!sapply(bias,is.null)])
N <- dim(bias)[3]
}
# estimate quantiles of the bias
if(is.null(quantiles)) {
res <- bias
res <- aperm(res, c(1,(3:length(dim(res))),2))
} else {
qname <- paste(100*round(quantiles,3), '%',sep='')
dm <- 1:(length(dim(bias))-1)
res <- apply(bias,dm,function(s) quantile(s,probs=quantiles,na.rm=TRUE,type=4))
if(length(quantiles) == 1) {
dmn <- dimnames(res)
dim(res) <- c(1,dim(res))
dimnames(res) <- c(list(paste(round(quantiles,3),'%',sep='')),dmn)
}
res <- aperm(res,c(2,1,3:length(dim(res))))
}
structure(res, q=quantiles, samples=N)
}
# From "A weak instrument F-test in linear iv models with multiple
# " endogenous variables", Sanderson & Windmeijer, Disc. Paper 14/644 U of Bristol, 2014
# pp 22-23. There's an error in how tilde delta is computed at p. 23.
# it should be \hat X_{-j}, not X_{-j}. I.e. coefs for \hat X_{-j} when regressing
# x_j. But we should predict with with X_{-j}
# I.e. estimate delta in x_j = \hat X_{-j} * delta + eps
# Regress x_j - X_{-j}*delta = Z * kappa + xi
# wald test on kappa, with df = max(1,kz-#endog+1)
#' Compute conditional F statistic for weak instruments in an IV-estimation
#' with multiple endogenous variables
#'
#' When using multiple instruments for multiple endogenous variables, the
#' ordinary individual t-tests for the instruments in the first stage do not
#' always reveal a weak set of instruments. Conditional F statistics can be
#' used for such testing.
#'
#'
#' IV coefficient estimates are not normally distributed, in particular they do
#' not have the right expectation. They follow a quite complicated
#' distribution which is fairly close to normal if the instruments are good.
#' The conditional F-statistic is a measure of how good the instruments are.
#' If the F is large, the instruments are good, and any bias due to the
#' instruments is small compared to the estimated standard errors, and also
#' small relative to the bias in OLS. See \cite{Sanderson and Windmeijer
#' (2014)} and \cite{Stock and Yogo (2004)}. If F is small, the bias can be
#' large compared to the standard error.
#'
#' If \code{any(quantiles > 0.0)}, a bootstrap with \code{bN} samples will be
#' performed to estimate quantiles of the endogenous parameters which includes
#' the variance both from the 1st and 2nd stage. The result is returned in an
#' array attribute \code{quantiles} of the value returned by \code{condfstat}.
#' The argument \code{quantiles} can be a vector to estimate more than one
#' quantile at once. If \code{quantiles=NULL}, the bootstrapped estimates
#' themselves are returned. The bootstrap is normally much faster than running
#' \code{felm} over and over again. This is so because all exogenous variables
#' are projected out of the equations before doing the bootstrap.
#'
#' @param object object of class \code{"felm"}, a result of a call to
#' \code{\link{felm}}.
#' @param type character. Error structure. Passed to \code{\link{waldtest}}. If
#' \code{NULL}, both iid and robust Fs are returned.
#' @param quantiles numeric. Quantiles for bootstrap.
#' @param bN integer. Number of bootstrap samples.
#' @return A p x k matrix, where k is the number of endogenous variables. Each
#' row are the conditional F statistics on a residual equation as described in
#' \cite{Sanderson and Windmeijer (2014)}, for a certain error structure. The
#' default is to use iid, or cluster if a cluster was specified to
#' \code{\link{felm}}. The third choice is \code{'robust'}, for heteroskedastic
#' errors. If \code{type=NULL}, iid and robust Fs are returned, and cluster, if
#' that was specified to \code{felm}.
#'
#' Note that for these F statistics it is not the p-value that matters, it is
#' the F statistic itself which (coincidentally) pops up in the denominator for
#' the asymptotic bias of the IV estimates, and thus a large F is beneficial.
#' @note Please note that \code{condfstat} does not work with the old syntax
#' for IV in \code{\link{felm}(...,iv=)}. The new multipart syntax must be
#' used.
#' @references Sanderson, E. and F. Windmeijer (2014) \cite{A weak instrument
#' F-test in linear IV models with multiple endogenous variables}, Journal of
#' Econometrics, 2015.
#' \url{http://www.sciencedirect.com/science/article/pii/S0304407615001736}
#'
#' Stock, J.H. and M. Yogo (2004) \cite{Testing for weak instruments in linear
#' IV regression}, \url{http://ssrn.com/abstract=1734933} in
#' \cite{Identification and inference for econometric models: Essays in honor
#' of Thomas Rothenberg}, 2005.
#' @examples
#'
#' z1 <- rnorm(4000)
#' z2 <- rnorm(length(z1))
#' u <- rnorm(length(z1))
#' # make x1, x2 correlated with errors u
#'
#' x1 <- z1 + z2 + 0.2*u + rnorm(length(z1))
#' x2 <- z1 + 0.94*z2 - 0.3*u + rnorm(length(z1))
#' y <- x1 + x2 + u
#' est <- felm(y ~ 1 | 0 | (x1 | x2 ~ z1 + z2))
#' summary(est)
#' \dontrun{
#' summary(est$stage1, lhs='x1')
#' summary(est$stage1, lhs='x2')
#' }
#'
#' # the joint significance of the instruments in both the first stages are ok:
#' t(sapply(est$stage1$lhs, function(lh) waldtest(est$stage1, ~z1|z2, lhs=lh)))
#' # everything above looks fine, t-tests for instruments,
#' # as well as F-tests for excluded instruments in the 1st stages.
#' # The conditional F-test reveals that the instruments are jointly weak
#' # (it's close to being only one instrument, z1+z2, for both x1 and x2)
#' condfstat(est, quantiles=c(0.05, 0.95))
#'
#' @export condfstat
condfstat <- function(object, type='default', quantiles=0.0, bN=100L) {
est <- object
st1 <- est$stage1
if(is.null(st1))
stop('Conditional F statistic only makes sense for iv-estimation')
if(is.null(type)) {
types <- c('iid','robust')
if(!is.null(est$clustervar)) types <- c(types,'cluster')
} else {
if(identical(type,'default'))
types <- if(is.null(est$clustervar)) 'iid' else 'cluster'
else
types <- type
}
if(length(st1$lhs) == 1) {
# only a single endogenous variable
# reduce to ordinary F-test
df1 <- nrow(st1$coefficients)
result <- as.matrix(sapply(types, function(typ) {
waldtest(st1,st1$instruments, df1=df1, type=typ)['F']
}))
dimnames(result) <- list(st1$lhs,paste(types,'F'))
return(structure(t(result),df1=df1))
}
# first, transform away all the exogenous variables from
# instruments, endogenous variables and predicted endogenous variables
# there may be an intercept in ivx, we remove it
keep <- !(colnames(st1$ivx) %in% '(Intercept)')
if(all(keep)) ivx <- st1$ivx else ivx <- st1$ivx[,keep, drop=FALSE]
inames <- colnames(ivx)
y <- cbind(st1$ivy, ivx, st1$c.fitted.values, est$c.response)
fitnames <- makefitnames(colnames(st1$c.fitted.values))
setdimnames(y, list(NULL, c(colnames(st1$ivy), inames, fitnames, colnames(est$c.response))))
mm <- list(y=y, x=st1$centred.exo)
tvars <- newols(mm, nostats=TRUE)$residuals
rm(y,mm)
# should we estimate the relative bias?
if(any(quantiles > 0) || is.null(quantiles)) {
# use the predicted variables as instruments?
z <- tvars[,colnames(ivx),drop=FALSE]
# z <- xhat
x <- tvars[,colnames(st1$ivy),drop=FALSE]
y <- tvars[,colnames(est$c.response), drop=FALSE]
bias <- ivbootstrap(z,x,y,
quantiles=quantiles,N=bN,cluster=est$clustervar)
rm(z,x,y)
# Now, bias contains the bias distribution
# we subtract it from the estimate in object$coefficients
quant <- bias
# cf <- object$coefficients[fitnames,,drop=FALSE]
# quant <- sapply(seq_len(ncol(cf)), function(i) cf[,i] + bias[,,i,drop=FALSE])
# attributes(quant) <- attributes(bias)
quant <- drop(quant)
} else {
quant <- NULL
}
resid <- sapply(st1$lhs, function(ev) {
# do an ols on each
evrest <- st1$lhs[!(st1$lhs %in% ev)]
hatev <- makefitnames(ev)
hatevrest <- makefitnames(evrest)
Xlhs <- tvars[,ev,drop=FALSE]
Xhatrest <- tvars[,hatevrest,drop=FALSE]
Xrest <- tvars[,evrest,drop=FALSE]
Xlhs - Xrest %*% solve(crossprod(Xhatrest), t(Xhatrest) %*% Xlhs)
})
setdimnames(resid, list(NULL, st1$lhs))
# then regress on the instruments
mm <- list(y = resid, x=tvars[,inames], cluster=est$clustervar)
z <- newols(mm, nostats=FALSE)
df1 <- nrow(z$coefficients)-length(z$lhs)+1
result <- as.matrix(sapply(types, function(typ) {
sapply(z$lhs, function(lh) waldtest(z,inames,lhs=lh, df1=df1, type=typ)['F'])
}))
dimnames(result) <- list(z$lhs,paste(types,'F'))
structure(t(result),df1=df1, quantiles=quant)
}
oldcondfstat <- function(object, type='default') {
est <- object
if(is.null(est$stage1))
stop('Conditional F statistic only makes sense for iv-estimation')
# for each endogenous variable x_j, we move it to the lhs, and estimate the
# residuals with the other endogenous vars as explanatory variables.
# We put each of these residuals on the lhs in the 1st stage, and estimate
# the effects of the instruments. We do a Wald test on these estimates to obtain
# a conditional test for each endogenous variable.
st1 <- est$stage1
if(length(st1$lhs) == 1) {
# only a single endogenous variable
# reduce to ordinary F-test
W <- waldtest(st1,st1$instruments)
return(structure(W['F'],df1=W['df1']))
}
cl1 <- st1$call
newcl <- cl <- est$call
Form <- as.Formula(cl[['formula']])
baseform <- as.Formula(formula(Form,lhs=0,rhs=1:2))
ivForm <- as.Formula(formula(Form,lhs=0,rhs=3)[[2]][[2]])
endogForm <- formula(ivForm, lhs=NULL, rhs=0)[[2]]
endogvars <- all.vars(endogForm)
instrvars <- st1$instruments #all.vars(formula(ivForm, lhs=0, rhs=NULL)[[2]])
fitvars <- st1$endogvars
# Now, the residuals should be put into the lhs of the 1st stage
# The 1st stage formula is already ok, but we should make sure
# the lhs is picked up from the residuals we create.
# We create a new environment based on the old, but our new
# residuals into the new environment
fitenv <- environment(est$st2call[['formula']])
# an environment to store the new residual variables
# we can get rid of the exogeneous residuals by
# replacing the instruments and endogeneous variables
# by their residuals when regressing with the exogeneous variables
# If K is the number of endogenous variables, we do K+1 centerings
# of the exogenous variables below. If we project them out first,
# we will just do 1 centering.
# Collect the endogenous variables and instruments and their projections on the rhs
# Only the exogenous variables on the rhs. Store residuals in noexo environment.
noexo <- new.env()
# put endogenous variables, their predictions from 1st stage, and the
# instruments on the lhs, with the exogenous variables on the rhs
cvars <- c(endogvars, fitvars, instrvars)
cForm <- as.Formula(formula(paste(paste(cvars, collapse='|'),'~0')))
cForm <- as.Formula(update(cForm, as.formula(substitute(. ~ X,
list(X=baseform[[2]])))))
projcall <- cl
environment(cForm) <- fitenv
projcall[['formula']] <- cForm
projcall[['nostats']] <- TRUE
projest <- eval(projcall, envir=est$parent.frame)
# store residuals in environment under name 'x..(noexo)..
# We store the residuals we need for the next regression
# i.e. the x_j - X_{-j} \delta, under the name
# of the endogenous variable. That's all we need.
# and the projected instruments.
lhsvars <- colnames(st1$residuals)
endogfitvars <- paste(lhsvars,'(fit)',sep='')
for(ev in seq_along(lhsvars)) {
# do an OLS to compute delta, but implement just with crossprod on the right matrix
evnam <- lhsvars[ev]
restfitnam <- endogfitvars[-ev]
restnam <- lhsvars[-ev]
orig <- projest$residuals[,evnam,drop=FALSE]
Xjfit <- projest$residuals[,restfitnam,drop=FALSE]
Xj <- projest$residuals[,restnam,drop=FALSE]
delta <- solve(crossprod(Xjfit), t(Xjfit) %*% orig)
resid <- orig - Xj %*% delta
# these are the left hand sides in the stage 2 regression below
resnam <- paste(evnam,'..(noexo)..',sep='')
assign(resnam, resid, envir=noexo)
}
# store the residual instruments in noexo also
for(instr in instrvars) {
resnam <- paste(instr,'..(noexo)..',sep='')
assign(resnam, projest$residuals[,instr,drop=FALSE], envir=noexo)
}
# ditch projest to save memory, we don't need it anymore
rm(projest)
# compute concentration parameter matrix mu
# From Stock, Wright, Yogo section 4.2 p .522
# do this when I have the time to do it.
# Then do the regression on the endog. residuals in resenv
# on the lhs, and projected instruments from
# projest on the rhs
# make a formula
# Should we do clustering?
resendog <- paste('`',lhsvars,'..(noexo)..`',sep='')
resinst <- paste('`',instrvars,'..(noexo)..`',sep='')
if(length(Form)[2] == 4) {
cluform <- formula(formula(Form,lhs=0,rhs=4))[[2]]
F1 <- as.Formula(formula(paste(paste(resendog,collapse='|'),'~',
paste(resinst,collapse='+'),'+0|0|0|0')))
newform <- update(F1,as.formula(substitute(. ~ . |.|.|C, list(C=cluform))))
} else {
newform <- as.Formula(formula(paste(paste(resendog,collapse='|'),'~',
paste(resinst,collapse='+'),'+0')))
}
# everything except the cluster var is in noexo
# the cluster var is still in the original data frame, or in the parent
# of noexo. We must call felm with the original data. Hopefully our
# constructed names ..(residual).. are not present there
environment(newform) <- noexo
m <- match(c('formula','data'), names(cl),0L)
newcl <- cl[c(1L,m)]
newcl[['formula']] <- newform
e1 <- eval(newcl,est$parent.frame)
df1 <- length(instrvars)-length(lhsvars)+1
res <- sapply(e1$lhs, function(lh) {
waldtest(e1,resinst,lhs=lh, df1=df1, type=type)['F']
})
attr(res,'df1') <- df1
names(res) <- lhsvars
res
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.