R/condfstat.R
In lfe: Linear Group Fixed Effects

Documented in condfstat

# $Id: condfstat.R 1943 2016-04-07 23:08:38Z sgaure $
ivbootstrap <- function(z, x, y, quantiles = 0.95, N = 100L, cluster = NULL) {
  # estimate bias as E((z' x)^{-1} z' eps)
  N <- max(N)
  if (!is.null(cluster)) {
    if (length(cluster) > 1) {
      warning(
        "Only a single cluster is supported for IV bootstrap, using ",
        names(cluster)[[1]]
      )
    }
    clu <- cluster[[1]]
    iclu <- as.integer(clu)
  }
  #  zortho <- orthonormalize(z)

  pint <- getOption("lfe.pint")
  start <- last <- Sys.time()
  # hmm, can we reduce the number of instruments?

  n <- 0
  bias <- replicate(N, {
    n_temp <- n + 1
    assign("n", n_temp, inherits = TRUE) ## replaces n <<- n+1
    now <- Sys.time()
    if (is.numeric(pint) && pint > 0 && now - last > pint) {
      message(date(), " Iteration ", n, " of ", N, " in IV bootstrap")
      assign("last", now, inherits = TRUE) # replaces: last <<- now
    }
    if (is.null(cluster)) {
      # resampling observations for indep residuals
      s <- sample(nrow(z), replace = TRUE)
    } else {
      # resample entire levels
      cl <- sort(sample(nlevels(clu), replace = TRUE))
      # find a faster way to do this:
      # s <- sort(unlist(sapply(cl, function(ll) which(clu==ll))))
      s <- NULL
      while (length(cl) > 0) {
        s <- c(s, which(iclu %in% cl))
        cl <- cl[c(1L, diff(cl)) == 0]
      }
    }

    # draw new instruments
    zortho <- orthonormalize(z[s, , drop = FALSE])
    # draw new X, and project it
    zX <- crossprod(zortho, x[s, , drop = FALSE])
    tryCatch(
      solve(
        crossprod(zX),
        crossprod(zX, crossprod(zortho, y[s, , drop = FALSE]))
      ),
      error = function(e) {
        warning(e)
        NULL
      }
    )
  })
  if (start != last) cat("\n")

  if (is.list(bias)) {
    # some of them returned NULL, so replicate returned a list
    # discard those NULLs
    bias <- simplify2array(bias[!sapply(bias, is.null)])
    N <- dim(bias)[3]
  }
  # estimate quantiles of the bias

  if (is.null(quantiles)) {
    res <- bias
    res <- aperm(res, c(1, (3:length(dim(res))), 2))
  } else {
    qname <- paste(100 * round(quantiles, 3), "%", sep = "")
    dm <- 1:(length(dim(bias)) - 1)
    res <- apply(bias, dm, function(s) quantile(s, probs = quantiles, na.rm = TRUE, type = 4))
    if (length(quantiles) == 1) {
      dmn <- dimnames(res)
      dim(res) <- c(1, dim(res))
      dimnames(res) <- c(list(paste(round(quantiles, 3), "%", sep = "")), dmn)
    }
    res <- aperm(res, c(2, 1, 3:length(dim(res))))
  }

  structure(res, q = quantiles, samples = N)
}

# From "A weak instrument F-test in linear iv models with multiple
# " endogenous variables", Sanderson & Windmeijer, Disc. Paper 14/644 U of Bristol, 2014
# pp 22-23.  There's an error in how tilde delta is computed at p. 23.
# it should be \hat X_{-j}, not X_{-j}.  I.e. coefs for \hat X_{-j} when regressing
# x_j.  But we should predict with with X_{-j}
# I.e. estimate delta in x_j = \hat X_{-j} * delta + eps
# Regress x_j - X_{-j}*delta = Z * kappa + xi
# wald test on kappa, with df = max(1,kz-#endog+1)






#' Compute conditional F statistic for weak instruments in an IV-estimation
#' with multiple endogenous variables
#'
#' When using multiple instruments for multiple endogenous variables, the
#' ordinary individual t-tests for the instruments in the first stage do not
#' always reveal a weak set of instruments.  Conditional F statistics can be
#' used for such testing.
#'
#'
#' IV coefficient estimates are not normally distributed, in particular they do
#' not have the right expectation.  They follow a quite complicated
#' distribution which is fairly close to normal if the instruments are good.
#' The conditional F-statistic is a measure of how good the instruments are.
#' If the F is large, the instruments are good, and any bias due to the
#' instruments is small compared to the estimated standard errors, and also
#' small relative to the bias in OLS. See \cite{Sanderson and Windmeijer
#' (2014)} and \cite{Stock and Yogo (2004)}.  If F is small, the bias can be
#' large compared to the standard error.
#'
#' If `any(quantiles > 0.0)`, a bootstrap with `bN` samples will be
#' performed to estimate quantiles of the endogenous parameters which includes
#' the variance both from the 1st and 2nd stage.  The result is returned in an
#' array attribute `quantiles` of the value returned by `condfstat`.
#' The argument `quantiles` can be a vector to estimate more than one
#' quantile at once.  If `quantiles=NULL`, the bootstrapped estimates
#' themselves are returned.  The bootstrap is normally much faster than running
#' `felm` over and over again. This is so because all exogenous variables
#' are projected out of the equations before doing the bootstrap.
#'
#' @param object object of class `"felm"`, a result of a call to
#' [felm()].
#' @param type character. Error structure. Passed to [waldtest()]. If
#' `NULL`, both iid and robust Fs are returned.
#' @param quantiles numeric. Quantiles for bootstrap.
#' @param bN integer. Number of bootstrap samples.
#' @return A p x k matrix, where k is the number of endogenous variables. Each
#' row are the conditional F statistics on a residual equation as described in
#' \cite{Sanderson and Windmeijer (2014)}, for a certain error structure.  The
#' default is to use iid, or cluster if a cluster was specified to
#' [felm()]. The third choice is `'robust'`, for heteroskedastic
#' errors. If `type=NULL`, iid and robust Fs are returned, and cluster, if
#' that was specified to `felm`.
#'
#' Note that for these F statistics it is not the p-value that matters, it is
#' the F statistic itself which (coincidentally) pops up in the denominator for
#' the asymptotic bias of the IV estimates, and thus a large F is beneficial.
#' @note Please note that `condfstat` does not work with the old syntax
#' for IV in `[felm](...,iv=)`. The new multipart syntax must be
#' used.
#' @references Sanderson, E. and F. Windmeijer (2014) \cite{A weak instrument
#' F-test in linear IV models with multiple endogenous variables}, Journal of
#' Econometrics, 2015. <doi:10.1016/j.jeconom.2015.06.004>
#'
#' Stock, J.H. and M. Yogo (2004) \cite{Testing for weak instruments in linear
#' IV regression}, <doi:10.3386/t0284 > in
#' \cite{Identification and inference for econometric models: Essays in honor
#' of Thomas Rothenberg}, 2005.
#' @examples
#'
#' z1 <- rnorm(4000)
#' z2 <- rnorm(length(z1))
#' u <- rnorm(length(z1))
#' # make x1, x2 correlated with errors u
#'
#' x1 <- z1 + z2 + 0.2 * u + rnorm(length(z1))
#' x2 <- z1 + 0.94 * z2 - 0.3 * u + rnorm(length(z1))
#' y <- x1 + x2 + u
#' est <- felm(y ~ 1 | 0 | (x1 | x2 ~ z1 + z2))
#' summary(est)
#' \dontrun{
#' summary(est$stage1, lhs = "x1")
#' summary(est$stage1, lhs = "x2")
#' }
#'
#' # the joint significance of the instruments in both the first stages are ok:
#' t(sapply(est$stage1$lhs, function(lh) waldtest(est$stage1, ~ z1 | z2, lhs = lh)))
#' # everything above looks fine, t-tests for instruments,
#' # as well as F-tests for excluded instruments in the 1st stages.
#' # The conditional F-test reveals that the instruments are jointly weak
#' # (it's close to being only one instrument, z1+z2, for both x1 and x2)
#' condfstat(est, quantiles = c(0.05, 0.95))
#'
#' @export condfstat
condfstat <- function(object, type = "default", quantiles = 0.0, bN = 100L) {
  est <- object
  st1 <- est$stage1
  if (is.null(st1)) {
    stop("Conditional F statistic only makes sense for iv-estimation")
  }


  if (is.null(type)) {
    types <- c("iid", "robust")
    if (!is.null(est$clustervar)) types <- c(types, "cluster")
  } else {
    if (identical(type, "default")) {
      types <- if (is.null(est$clustervar)) "iid" else "cluster"
    } else {
      types <- type
    }
  }

  if (length(st1$lhs) == 1) {
    # only a single endogenous variable
    # reduce to ordinary F-test
    df1 <- nrow(st1$coefficients)
    result <- as.matrix(sapply(types, function(typ) {
      waldtest(st1, st1$instruments, df1 = df1, type = typ)["F"]
    }))
    dimnames(result) <- list(st1$lhs, paste(types, "F"))
    return(structure(t(result), df1 = df1))
  }


  # first, transform away all the exogenous variables from
  # instruments, endogenous variables and predicted endogenous variables
  # there may be an intercept in ivx, we remove it
  keep <- !(colnames(st1$ivx) %in% "(Intercept)")
  if (all(keep)) ivx <- st1$ivx else ivx <- st1$ivx[, keep, drop = FALSE]
  inames <- colnames(ivx)
  y <- cbind(st1$ivy, ivx, st1$c.fitted.values, est$c.response)
  fitnames <- makefitnames(colnames(st1$c.fitted.values))
  setdimnames(y, list(NULL, c(colnames(st1$ivy), inames, fitnames, colnames(est$c.response))))
  mm <- list(y = y, x = st1$centred.exo)
  tvars <- newols(mm, nostats = TRUE)$residuals
  rm(y, mm)
  # should we estimate the relative bias?
  if (any(quantiles > 0) || is.null(quantiles)) {
    # use the predicted variables as instruments?
    z <- tvars[, colnames(ivx), drop = FALSE]
    #    z <- xhat
    x <- tvars[, colnames(st1$ivy), drop = FALSE]
    y <- tvars[, colnames(est$c.response), drop = FALSE]
    bias <- ivbootstrap(z, x, y,
      quantiles = quantiles, N = bN, cluster = est$clustervar
    )
    rm(z, x, y)

    # Now, bias contains the bias distribution
    # we subtract it from the estimate in object$coefficients
    quant <- bias
    #    cf <- object$coefficients[fitnames,,drop=FALSE]
    #    quant <- sapply(seq_len(ncol(cf)), function(i) cf[,i] + bias[,,i,drop=FALSE])
    #    attributes(quant) <- attributes(bias)

    quant <- drop(quant)
  } else {
    quant <- NULL
  }

  resid <- sapply(st1$lhs, function(ev) {
    # do an ols on each
    evrest <- st1$lhs[!(st1$lhs %in% ev)]
    hatev <- makefitnames(ev)
    hatevrest <- makefitnames(evrest)
    Xlhs <- tvars[, ev, drop = FALSE]
    Xhatrest <- tvars[, hatevrest, drop = FALSE]
    Xrest <- tvars[, evrest, drop = FALSE]
    Xlhs - Xrest %*% solve(crossprod(Xhatrest), t(Xhatrest) %*% Xlhs)
  })

  setdimnames(resid, list(NULL, st1$lhs))


  # then regress on the instruments
  mm <- list(y = resid, x = tvars[, inames], cluster = est$clustervar)
  z <- newols(mm, nostats = FALSE)

  df1 <- nrow(z$coefficients) - length(z$lhs) + 1
  result <- as.matrix(sapply(types, function(typ) {
    sapply(z$lhs, function(lh) waldtest(z, inames, lhs = lh, df1 = df1, type = typ)["F"])
  }))
  dimnames(result) <- list(z$lhs, paste(types, "F"))
  structure(t(result), df1 = df1, quantiles = quant)
}

oldcondfstat <- function(object, type = "default") {
  est <- object
  if (is.null(est$stage1)) {
    stop("Conditional F statistic only makes sense for iv-estimation")
  }

  # for each endogenous variable x_j, we move it to the lhs, and estimate the
  # residuals with the other endogenous vars as explanatory variables.

  # We put each of these residuals on the lhs in the 1st stage, and estimate
  # the effects of the instruments.  We do a Wald test on these estimates to obtain
  # a conditional test for each endogenous variable.

  st1 <- est$stage1
  if (length(st1$lhs) == 1) {
    # only a single endogenous variable
    # reduce to ordinary F-test
    W <- waldtest(st1, st1$instruments)
    return(structure(W["F"], df1 = W["df1"]))
  }

  cl1 <- st1$call

  newcl <- cl <- est$call
  Form <- as.Formula(cl[["formula"]])
  baseform <- as.Formula(formula(Form, lhs = 0, rhs = 1:2))
  ivForm <- as.Formula(formula(Form, lhs = 0, rhs = 3)[[2]][[2]])
  endogForm <- formula(ivForm, lhs = NULL, rhs = 0)[[2]]
  endogvars <- all.vars(endogForm)
  instrvars <- st1$instruments # all.vars(formula(ivForm, lhs=0, rhs=NULL)[[2]])

  fitvars <- st1$endogvars

  # Now, the residuals should be put into the lhs of the 1st stage
  # The 1st stage formula is already ok, but we should make sure
  # the lhs is picked up from the residuals we create.
  # We create a new environment based on the old, but our new
  # residuals into the new environment

  fitenv <- environment(est$st2call[["formula"]])
  # an environment to store the new residual variables

  # we can get rid of the exogeneous residuals by
  # replacing the instruments and endogeneous variables
  # by their residuals when regressing with the exogeneous variables
  # If K is the number of endogenous variables, we do K+1 centerings
  # of the exogenous variables below.  If we project them out first,
  # we will just do 1 centering.
  # Collect the endogenous variables and instruments and their projections on the rhs
  # Only the exogenous variables on the rhs. Store residuals in noexo environment.

  noexo <- new.env()

  # put endogenous variables, their predictions from 1st stage, and the
  # instruments on the lhs, with the exogenous variables on the rhs
  cvars <- c(endogvars, fitvars, instrvars)
  cForm <- as.Formula(formula(paste(paste(cvars, collapse = "|"), "~0")))

  cForm <- as.Formula(update(cForm, as.formula(substitute(
    . ~ X,
    list(X = baseform[[2]])
  ))))
  projcall <- cl
  environment(cForm) <- fitenv
  projcall[["formula"]] <- cForm
  projcall[["nostats"]] <- TRUE
  projest <- eval(projcall, envir = est$parent.frame)

  # store residuals in environment under name 'x..(noexo)..
  # We store the residuals we need for the next regression
  # i.e. the x_j - X_{-j} \delta, under the name
  # of the endogenous variable.  That's all we need.
  # and the projected instruments.

  lhsvars <- colnames(st1$residuals)
  endogfitvars <- paste(lhsvars, "(fit)", sep = "")
  for (ev in seq_along(lhsvars)) {
    # do an OLS to compute delta, but implement just with crossprod on the right matrix
    evnam <- lhsvars[ev]
    restfitnam <- endogfitvars[-ev]
    restnam <- lhsvars[-ev]
    orig <- projest$residuals[, evnam, drop = FALSE]
    Xjfit <- projest$residuals[, restfitnam, drop = FALSE]
    Xj <- projest$residuals[, restnam, drop = FALSE]
    delta <- solve(crossprod(Xjfit), t(Xjfit) %*% orig)
    resid <- orig - Xj %*% delta
    # these are the left hand sides in the stage 2 regression below
    resnam <- paste(evnam, "..(noexo)..", sep = "")
    assign(resnam, resid, envir = noexo)
  }
  # store the residual instruments in noexo also

  for (instr in instrvars) {
    resnam <- paste(instr, "..(noexo)..", sep = "")
    assign(resnam, projest$residuals[, instr, drop = FALSE], envir = noexo)
  }
  # ditch projest to save memory, we don't need it anymore
  rm(projest)

  # compute concentration parameter matrix mu
  # From Stock, Wright, Yogo section 4.2 p .522
  # do this when I have the time to do it.

  # Then do the regression on the endog. residuals in resenv
  # on the lhs, and projected instruments from
  # projest on the rhs
  # make a formula
  # Should we do clustering?
  resendog <- paste("`", lhsvars, "..(noexo)..`", sep = "")
  resinst <- paste("`", instrvars, "..(noexo)..`", sep = "")
  if (length(Form)[2] == 4) {
    cluform <- formula(formula(Form, lhs = 0, rhs = 4))[[2]]
    F1 <- as.Formula(formula(paste(
      paste(resendog, collapse = "|"), "~",
      paste(resinst, collapse = "+"), "+0|0|0|0"
    )))
    newform <- update(F1, as.formula(substitute(. ~ . | . | . | C, list(C = cluform))))
  } else {
    newform <- as.Formula(formula(paste(
      paste(resendog, collapse = "|"), "~",
      paste(resinst, collapse = "+"), "+0"
    )))
  }

  # everything except the cluster var is in noexo
  # the cluster var is still in the original data frame, or in the parent
  # of noexo. We must call felm with the original data. Hopefully our
  # constructed names ..(residual).. are not present there
  environment(newform) <- noexo
  m <- match(c("formula", "data"), names(cl), 0L)
  newcl <- cl[c(1L, m)]
  newcl[["formula"]] <- newform
  e1 <- eval(newcl, est$parent.frame)

  df1 <- length(instrvars) - length(lhsvars) + 1
  res <- sapply(e1$lhs, function(lh) {
    waldtest(e1, resinst, lhs = lh, df1 = df1, type = type)["F"]
  })
  attr(res, "df1") <- df1
  names(res) <- lhsvars
  res
}