# R/bccorr.R

# $Id: bccorr.R 2004 2016-04-21 10:31:20Z sgaure $
# rowsum() keeps a row for observations whose group is NA; drop that row.
narowsum <- function(x, group) {
  # opt <- options(warn=-1) ## CRAN request to rather use suppressWarnings()
  res <- suppressWarnings(try(rowsum(x, group), silent = TRUE))
  # options(opt)
  narow <- is.na(rownames(res))
  if (any(narow)) {
    res[!narow, , drop = FALSE]
  } else {
    res
  }
}
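
# A quick illustration of the NA handling (hypothetical input; note the
# plain-rowsum override just below):
if (FALSE) {
  rowsum(matrix(1:3), c("a", NA, "a")) # keeps a row with rowname <NA>
  narowsum(matrix(1:3), c("a", NA, "a")) # drops the <NA> row
}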

# try without reference (i.e. use plain rowsum, bypassing the NA handling above)
narowsum <- rowsum
# to estimate the bias of the D-variance we need to find tr(D' P_1 D (D' P_{X,F} D)^{-1}),
# where P_1 projects out the mean and P_{X,F} projects out X and F. Let's make a function
# which applies the matrix inside the trace to a vector.

# Now, note that for a matrix x, D %*% x is the same as x[f1, ],
# whereas t(D) %*% x is the same as rowsum(x, f1). Similarly with F and f2.
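
# A quick sanity check of these identities (hypothetical data, not run):
if (FALSE) {
  f1 <- factor(c("a", "b", "a", "c"))
  D <- model.matrix(~ f1 - 1) # dummy encoding of f1
  x <- matrix(rnorm(nlevels(f1) * 2), nlevels(f1)) # one row per level
  stopifnot(all.equal(unname(D %*% x), unname(x[f1, ])))
  y <- matrix(rnorm(length(f1) * 2), length(f1)) # one row per observation
  stopifnot(all.equal(unname(t(D) %*% y), unname(rowsum(y, f1))))
}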

# bias corrected variances

# Note that this file uses the scalecols() function which scales
# its argument in-place, and returns it.  This is a violation of R's
# immutable semantics. Use with caution: a <- scalecols(b, r) will scale a, but also change b!
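
# A minimal sketch of the aliasing hazard (hypothetical values; scalecols
# is internal to the package):
if (FALSE) {
  b <- matrix(1, 3, 2)
  a <- scalecols(b, c(2, 3, 4)) # scales in place and returns its argument
  identical(a, b) # TRUE: b has been modified as well
}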


#' Compute limited mobility bias corrected correlation between fixed effects
#' @concept Limited Mobility Bias
#'
#' @description
#' With a model like \eqn{y = X\beta + D\theta + F\psi + \epsilon}, where \eqn{D}
#' and \eqn{F} are matrices with dummy encoded factors, one application of \pkg{lfe} is to
#' study the correlation \eqn{cor(D\theta, F\psi)}.  However, if we use
#' estimates for \eqn{\theta} and \eqn{\psi}, the resulting correlation is biased.
#' The function `bccorr` computes a bias corrected correlation
#' as described in \cite{Gaure (2014)}.
#' @param est an object of class '"felm"', the result of a call to
#'  `[felm](keepX=TRUE)`.
#' @param alpha a data frame, the result of a call to [getfe()].
#' @param corrfactors integer or character vector of length 2. The factors to
#'    correlate. The default is fine if there are only two factors in the model.
#' @param nocovar logical. Assume no other covariates than the two
#'  factors are present, or that they are uncorrelated with them.
#' @param tol The absolute tolerance for the bias-corrected correlation.
#' @param maxsamples Maximum number of samples for the trace sample-mean estimates.
#' @param lhs character. Name of left hand side if multiple left hand sides.
#'
#' @return
#'  `bccorr` returns a named numeric vector with the following fields:
#'
#'  \item{corr}{the bias corrected correlation.}
#'  \item{v1}{the bias corrected variance for the first factor specified
#'  by `corrfactors`.}
#'  \item{v2}{the bias corrected variance for the second factor.}
#'  \item{cov}{the bias corrected covariance between the two factors.}
#'  \item{d1}{the bias correction for the first factor.}
#'  \item{d2}{the bias correction for the second factor.}
#'  \item{d12}{the bias correction for covariance.}
#'
#'  The bias corrections have been subtracted from the biased estimates.
#'  E.g. v2 = v2' - d2, where v2' is the biased variance.
#'
#' @details
#' The bias expressions from \cite{Andrews et al.} are of the form \eqn{tr(AB^{-1}C)}
#' where \eqn{A}, \eqn{B}, and \eqn{C} are  matrices too large to be handled
#' directly. `bccorr` estimates the trace by using the formula \eqn{tr(M) = E(x^t M x)}
#' where x is a vector with coordinates drawn uniformly from the set \eqn{\{-1,1\}}.
#' More specifically, the expectation is estimated by
#' sample means, i.e. in each sample a vector x is drawn, the
#' equation \eqn{Bv = Cx} is solved by a conjugate gradient method, and the
#' real number \eqn{x^t Av} is computed.
#'
#' There are three bias corrections, for the variances of \eqn{D\theta} (`vD`) and
#' \eqn{F\psi} (`vF`), and their covariance (`vDF`). The correlation is computed as
#' `rho <- vDF/sqrt(vD*vF)`.  The variances are estimated to a
#' relative tolerance specified by the argument `tol`. The covariance
#' bias is estimated to an absolute tolerance in the correlation `rho`
#' (conditional on the already bias corrected `vD` and `vF`) specified by
#' `tol`.  The CG algorithm does not need to be exceedingly precise;
#' it is terminated when the solution reaches a precision which is
#' sufficient for the chosen precision in `vD, vF, vDF`.
#'
#' If `est` is the result of a weighted [felm()] estimation,
#' the variances and correlations are weighted too.
#'
#' @note
#' Bias correction for IV estimates is not supported as of now.
#'
#' Note that if `est` is the result of a call to [felm()]
#' with `keepX=FALSE` (the default), the correlation will be computed
#' as if the covariates X are independent of the two factors. This will be
#' faster (typically by a factor of approx. 4), and possibly wronger.
#'
#' Note also that the computations performed by this function are
#' non-trivial; they may take quite some time.  It would be wise to start
#' out with quite liberal tolerances, e.g. `tol = 0.1`, to
#' get an idea of the time requirements.
#'
#' The algorithm used is not very well suited for small datasets with only
#' a few thousand levels in the factors.
#'
#' @seealso [fevcov()]
#'
#' @examples
#' x <- rnorm(500)
#' x2 <- rnorm(length(x))
#'
#' ## create individual and firm
#' id <- factor(sample(40, length(x), replace = TRUE))
#' firm <- factor(sample(30, length(x), replace = TRUE, prob = c(2, rep(1, 29))))
#' foo <- factor(sample(20, length(x), replace = TRUE))
#' ## effects
#' id.eff <- rnorm(nlevels(id))
#' firm.eff <- rnorm(nlevels(firm))
#' foo.eff <- rnorm(nlevels(foo))
#' ## left hand side
#' y <- x + 0.25 * x2 + id.eff[id] + firm.eff[firm] + foo.eff[foo] + rnorm(length(x))
#'
#' # make a data frame
#' fr <- data.frame(y, x, x2, id, firm, foo)
#' ## estimate and print result
#' est <- felm(y ~ x + x2 | id + firm + foo, data = fr, keepX = TRUE)
#' # find bias corrections
#' bccorr(est)
#'
#' @references
#'  Gaure, S. (2014), \cite{Correlation bias correction in two-way
#'  fixed-effects linear regression}, Stat 3(1):379-390, 2014.
#'  \doi{10.1002/sta4.68}
#' @export
bccorr <- function(est, alpha = getfe(est), corrfactors = 1L:2L,
                   nocovar = (length(est$X) == 0) && length(est$fe) == 2,
                   tol = 0.01, maxsamples = Inf, lhs = NULL) {
  if (nlevels(est$cfactor) > 1) stop("Data should have just a single connected component")
  if (length(est$fe) == 2 && nlevels(est$cfactor) != 1) stop("Bias correction only makes sense on data with 1 component")
  if (length(est$fe) < 2) stop("Bias correction only makes sense for two factors")
  if (length(corrfactors) != 2) stop("corrfactors must have length 2")
  if (is.character(corrfactors)) corrfactors <- match(corrfactors, names(est$fe))
  if (min(corrfactors) < 1 || max(corrfactors) > length(est$fe)) {
    stop("corrfactors specifies too small or large index")
  }
  if (!("fe" %in% colnames(alpha))) stop('alpha must have an "fe" column')
  #  if(!is.null(est$weights)) warning("Bias correction with weights not yet fully correct")

  #  fe <- est$fe
  f1 <- est$fe[[corrfactors[[1]]]]
  f2 <- est$fe[[corrfactors[[2]]]]
  nf1 <- names(est$fe)[[corrfactors[[1]]]]
  nf2 <- names(est$fe)[[corrfactors[[2]]]]
  if (!is.null(attr(f1, "x", exact = TRUE))) stop('Interacted factors "', nf1, '" are not supported')
  if (!is.null(attr(f2, "x", exact = TRUE))) stop('Interacted factors "', nf2, '" are not supported')
  effnam <- "effect"
  if (length(est$lhs) == 1) lhs <- est$lhs
  if (!("effect" %in% colnames(alpha))) {
    if (is.null(lhs)) {
      stop("Please specify lhs=[one of ", paste(est$lhs, collapse = ","), "]")
    }
    effnam <- paste("effect", lhs, sep = ".")
    if (!(effnam %in% colnames(alpha))) {
      stop("Can't find effect-column in alpha")
    }
  }
  resid <- est$residuals[, lhs]
  d1 <- alpha[alpha["fe"] == nf1, effnam][f1]
  d2 <- alpha[alpha["fe"] == nf2, effnam][f2]
  w <- if (is.null(est$weights)) 1.0 else est$weights
  if (is.null(est$weights)) {
    var1 <- var(d1)
    var2 <- var(d2)
    cov12 <- cov(d1, d2)
  } else {
    var1 <- wvar(d1, w^2)
    var2 <- wvar(d2, w^2)
    cov12 <- wcov(d1, d2, w^2)
  }
  delta1 <- varbias(corrfactors[[1]], est, tol, var1, maxsamples, resid = resid, weights = est$weights)
  if (is.null(est$weights)) {
    epsvar <- sum(resid^2) / est$df
  } else {
    epsvar <- sum(w^2 * resid^2) / est$df / sum(w^2)
  }

  if (nocovar) {
    f1 <- est$fe[[corrfactors[[1]]]]
    f2 <- est$fe[[corrfactors[[2]]]]
    N <- length(f1)
    delta2 <- delta1 - epsvar * (nlevels(f1) - nlevels(f2)) / N
    delta12 <- epsvar * nlevels(f1) / N - delta1
  } else {
    delta2 <- varbias(corrfactors[[2]], est, tol, var2, maxsamples, resid = resid, weights = est$weights)
    eps <- -tol * sqrt((var1 - delta1) * (var2 - delta2))
    delta12 <- covbias(corrfactors, est, eps, maxsamples = maxsamples, resid = resid, weights = est$weights)
  }

  vartheta <- var1 - delta1
  varpsi <- var2 - delta2
  covtp <- cov12 - delta12

  c(
    corr = covtp / sqrt(vartheta * varpsi),
    v1 = vartheta,
    v2 = varpsi,
    cov = covtp,
    d1 = delta1,
    d2 = delta2,
    d12 = delta12
  )
}
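
# The bias terms above are estimated with the stochastic trace estimator
# tr(M) = E(x' M x), with the coordinates of x drawn uniformly from {-1, 1}
# (see the @details of bccorr). A minimal standalone sketch of the idea, on a
# hypothetical small matrix, not part of the package machinery:
if (FALSE) {
  set.seed(1)
  M <- crossprod(matrix(rnorm(100), 10, 10)) # a symmetric test matrix
  est <- mean(replicate(1000, {
    x <- sample(c(-1, 1), nrow(M), replace = TRUE)
    drop(crossprod(x, M %*% x)) # one sample of x' M x
  }))
  c(estimate = est, exact = sum(diag(M)))
}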

halftrace <- function(x, f, restf, MFX, tol, lmean, name, weights = NULL) {
  w <- weights
  if (is.null(w)) {
    ww <- ww2 <- 1.0
  } else {
    ww2 <- w^2
    ww <- w
  }

  if (is.null(MFX)) {
    invfun <- function(v) {
      crowsum(ww2 * demeanlist(v[f, ], restf, weights = w), f)
    }
  } else {
    invfun <- function(v) {
      crowsum(ww2 * demeanlist(demeanlist(ww * v[f, ], MFX) / ww, restf, weights = w), f)
    }
  }

  DtM1x <- crowsum(ww * demeanlist(x, lmean, weights = weights, scale = FALSE), f)
  # we use absolute tolerance, mctrace will give us a trtol.
  # however, we square our result:  (Rx + err)^2 = Rx^2 + 2*err*Rx + err^2
  # so to get 2*err*Rx < tol, we must have err < tol/(2*Rx), i.e.
  # we should use a relative tolerance
  #  tol1 <- -tol/sqrt(colSums(DtM1x^2))
  tol1 <- tol
  #  message('cgsolve: tol1 ',tol1, ' tol ',tol)
  v <- cgsolve(invfun, DtM1x, eps = tol1, name = name)
  if (!is.null(MFX)) {
    Rx <- ww2 * demeanlist(demeanlist(ww * v[f, ], MFX) / ww, restf, weights = w)
  } else {
    Rx <- ww * demeanlist(v[f, ], restf, weights = weights)
  }

  #  message("|v| = ",mean(sqrt(colSums(v^2))), '|Rx|=',mean(sqrt(colSums(Rx^2))))
  Rx
}

# In case we're weighted, the weighted variance of x is
# c * x' W M_W W x, not x' M_1 x,
# where W is the diagonal matrix of square roots of the weights, and
# c = sum(w)/(sum(w)^2 - sum(w^2))
# (w being the weights themselves, not their square roots).
# That is, the bias is similar to before, but use M_W instead of M_1
# and Wx instead of x (i.e. WDtheta instead of Dtheta).
# Remember that the weights stored in the felm object are the square roots.
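
# A quick check of the weighted-variance identity above (hypothetical data;
# here w denotes the raw weights, not their square roots):
if (FALSE) {
  set.seed(2)
  x <- rnorm(10)
  w <- runif(10)
  cc <- sum(w) / (sum(w)^2 - sum(w^2))
  xbar <- sum(w * x) / sum(w)
  v1 <- cc * sum(w * (x - xbar)^2) # direct weighted variance
  # the same thing written as cc * (Wx)' M_W (Wx) with W = diag(sqrt(w)),
  # where M_W projects out the direction sqrt(w):
  Wx <- sqrt(w) * x
  MWx <- Wx - sqrt(w) * sum(sqrt(w) * Wx) / sum(w)
  v2 <- cc * sum(Wx * MWx)
  all.equal(v1, v2) # TRUE
}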
varbias <- function(index, est, tol = 0.01, bvar, maxsamples = Inf,
                    robust = !is.null(est$clustervar), resid, weights = NULL, dfadj = 1) {
  if (length(index) != 1) stop("index must have length 1")
  if (!is.null(est$stage1)) stop("Bias correction with IV not supported yet")
  #  on.exit({rm(list=ls()); gc()})
  if (length(tol) == 1) {
    tracetol <- tol
    cgtf <- 1
  } else {
    tracetol <- tol[[1]]
    cgtf <- tol[[2]]
  }
  X <- est$X
  f <- est$fe[[index]]
  restf <- est$fe[-index]
  name <- paste("var(", names(est$fe)[[index]], ")", sep = "")
  N <- length(f)
  nlev <- nlevels(f)
  w <- weights
  if (is.null(w)) {
    wc <- ww <- ww2 <- 1.0
  } else {
    w <- w / sqrt(sum(w^2))
    ww2 <- w^2
    ww <- w
    wc <- 1 / (1 - sum(ww2^2))
  }

  # First, make a factor list for projecting out mean.
  fmean <- factor(rep(1, N))
  lmean <- list(fmean)

  # project out X and F, i.e. everything but f1
  # use M_{F,X} = M_F M_{M_F X}
  # precompute and orthonormalize M_F X

  if (length(X) > 0) {
    # Find NA's in the coefficients, remove corresponding variables from X
    bad <- apply(est$coefficients, 1, anyNA)
    if (any(bad)) X <- X[, !bad, drop = FALSE]
    MFX <- list(structure(fmean, x = orthonormalize(demeanlist(X, restf,
      weights = w, scale = c(TRUE, FALSE)
    ))))
    invfun <- function(v) {
      crowsum(ww * demeanlist(demeanlist(ww * v[f, ], MFX), restf, weights = w, scale = FALSE), f)
    }
  } else {
    MFX <- NULL
    invfun <- function(v) {
      crowsum(ww * demeanlist(v[f, ], restf, weights = w, scale = c(TRUE, FALSE)), f)
    }
  }

  if (is.null(w)) {
    epsvar <- sum(resid^2) / est$df
  } else {
    epsvar <- sum(ww2 * resid^2) * N / est$df
  }

  vvfoo <- epsvar
  if (robust) {
    # residuals present, do cluster robust correction
    # create all the cluster interactions, so we don't have to do
    # it each time in the iteration
    docluster <- !is.null(est$clustervar)
    toladj <- sqrt(sum((ww * resid)^2))
    if (docluster) {
      d <- length(est$clustervar)
      cia <- list()
      for (i in 1:(2^d - 1)) {
        # Find out which ones to interact
        iac <- as.logical(intToBits(i))[1:d]
        # interact the factors
        cia[[i]] <- factor(do.call(paste, c(est$clustervar[iac], sep = "\004")))
      }
    }
    wwres <- ww * resid
    trfun <- function(x, trtol) {
      # return crude estimate of the trace
      if (trtol == 0) {
        return(abs(nlev))
      }
      #      on.exit({rm(list=ls()); gc()})
      # trtol is an absolute tolerance, but the halftrace result is squared,
      # so we should really use a relative tolerance; we use the square root
      # of trtol scaled by toladj.
      #      message('trtol=',trtol, ' N ',N,' toladj ',toladj)
      Rx <- halftrace(x, f, restf, MFX, -sqrt(trtol / toladj), lmean, name, weights = w)
      #      Rx <- halftrace(x, f, restf, MFX, sqrt(trtol)/2, lmean, name, weights=w)
      #      Rx <- halftrace(x, f, restf, MFX, trtol, lmean, name, weights=w)

      # now apply cluster stuff
      # first, scale with (weighted) residuals
      #
      .Call(C_scalecols, Rx, wwres)
      if (!docluster) {
        # It's heteroscedastic
        return(colSums(Rx * Rx))
      } else {
        # it's one or more clusters, do the Cameron et al detour
        result <- vector("numeric", ncol(Rx))
        for (i in 1:(2^d - 1)) {
          ia <- cia[[i]]
          b <- crowsum(Rx, ia)
          # odd number is positive, even is negative
          sgn <- 2 * (sum(as.logical(intToBits(i))[1:d]) %% 2) - 1
          adj <- sgn * dfadj * nlevels(ia) / (nlevels(ia) - 1)
          result <- result + adj * colSums(b * b)
          rm(b)
          #          result <- result + vvfoo*adj* colSums(Rx * Rx)
        }
        return(result)
      }
    }
    epsvar <- 1 # now incorporated in the robust variance matrix, so don't scale
  } else {
    trfun <- function(x, trtol) {
      # return crude estimate of the trace
      if (trtol == 0) {
        return(abs(nlev))
      }
      #      on.exit({rm(list=ls()); gc()})
      DtM1x <- crowsum(ww * demeanlist(unnamed(x), lmean, weights = w, scale = FALSE), f)
      # we use absolute tolerance; mctrace will give us a trtol.
      # we divide by the L2-norm of DtM1x, since we take the
      # inner product with this afterwards

      tol1 <- -trtol / cgtf / sqrt(colSums(DtM1x^2)) / 2
      v <- cgsolve(invfun, DtM1x, eps = tol1, name = name)
      colSums(DtM1x * v)
    }
  }
  attr(trfun, "IP") <- TRUE
  epsvar <- epsvar * wc

  # If we have weights, we should not divide the trace by N

  # epsvar*trace/N is the bias estimate
  # We want precision in the final estimate to be, say, 1%
  # Since we subtract the bias from the biased estimate, the precision
  # in the trace computation depends on the current value of the trace
  # i.e. absolute precision should be 0.01*(bvar*N - epsvar*tr)/epsvar
  # where bvar is the biased variance

  epsfun <- function(tr) {
    aa <- N * bvar - epsvar * tr
    if (aa < 0) {
      -abs(tracetol) * 0.5 * N * bvar / epsvar
    } else {
      -abs(tracetol) * aa / epsvar
    }
  }
  # The tolerance is a problem before mctrace has got a clue about where
  # we are. If the bias is very large compared to the variance, we will
  # be in trouble.
  res <- epsvar * mctrace(trfun,
    N = N, tol = epsfun, trname = name,
    maxsamples = maxsamples
  ) / N
}
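
# varbias enumerates all non-empty subsets of the cluster factors via
# intToBits, with inclusion-exclusion signs as in the Cameron et al. multiway
# clustering detour. A standalone sketch of the enumeration (hypothetical d):
if (FALSE) {
  d <- 3
  for (i in 1:(2^d - 1)) {
    iac <- as.logical(intToBits(i))[1:d] # which cluster factors to interact
    sgn <- 2 * (sum(iac) %% 2) - 1 # +1 for odd-sized subsets, -1 for even
    cat("i =", i, " subset:", which(iac), " sign:", sgn, "\n")
  }
}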


# If the tolerance is positive, it is relative to the bias-corrected
# covariance; in that case, the biased covariance (bcov) and residual
# variance (epsvar) must be specified. A negative tolerance is an
# absolute tolerance.
covbias <- function(index, est, tol = 0.01, maxsamples = Inf, resid, weights = NULL,
                    robust = !is.null(est$clustervar)) {
  if (length(index) != 2) stop("index must have length 2")
  if (!is.null(est$stage1)) stop("Bias correction with IV not supported yet")
  if (length(est$fe) < 2) stop("fe must have length >= 2")
  #  on.exit({rm(list=ls()); gc()})
  w <- weights
  if (is.null(w)) {
    wc <- ww2 <- ww <- 1
  } else {
    w <- w / sqrt(sum(w^2))
    ww <- w
    ww2 <- w^2
    wc <- sum(w^2) / (sum(w^2)^2 - sum(w^4))
  }
  X <- est$X
  f1 <- est$fe[[index[[1]]]]
  f2 <- est$fe[[index[[2]]]]
  nlev1 <- nlevels(f1)
  nlev2 <- nlevels(f2)
  N <- length(f1)
  name <- paste("cov(", paste(names(est$fe)[index], collapse = ","), ")", sep = "")
  name1 <- paste(name, names(est$fe)[index[[1]]], sep = ".")
  name2 <- paste(name, names(est$fe)[index[[2]]], sep = ".")
  no2list <- est$fe[-index[[2]]]
  no1list <- est$fe[-index[[1]]]

  restf <- est$fe[-index]
  fmean <- factor(rep(1, N))
  lmean <- list(fmean)
  if (is.null(w)) {
    epsvar <- sum(resid^2) / est$df
  } else {
    epsvar <- sum(ww2 * resid^2) * N / est$df
  }

  if (length(X) > 0) {
    bad <- apply(est$coefficients, 1, anyNA)
    if (any(bad)) X <- X[, !bad, drop = FALSE]
  }
  MDX <- MFX <- NULL

  if (length(X) > 0) {
    MDX <- list(structure(fmean, x = orthonormalize(demeanlist(X,
      no2list,
      weights = w, scale = c(TRUE, FALSE)
    ))))
    MFX <- list(structure(fmean, x = orthonormalize(demeanlist(X,
      no1list,
      weights = w, scale = c(TRUE, FALSE)
    ))))

    invfun <- function(v) {
      crowsum(ww * demeanlist(demeanlist(ww * v[f2, ], MDX), no2list,
        weights = w, scale = FALSE
      ), f2)
    }
    if (length(restf) > 0) {
      MX <- list(structure(fmean, x = orthonormalize(demeanlist(X,
        restf,
        weights = w, scale = c(TRUE, FALSE)
      ))))
      MXfun <- function(v) {
        ww * demeanlist(demeanlist(ww * v[f1, ], MX), restf,
          weights = w,
          scale = FALSE
        )
      }
    } else {
      MX <- list(structure(fmean, x = orthonormalize(ww * X)))
      MXfun <- function(v) ww * demeanlist(ww * v[f1, ], MX)
    }
  } else {
    invfun <- function(v) {
      crowsum(ww2 * demeanlist(v[f2, ], no2list, weights = w), f2)
    }
    MXfun <- function(v) ww2 * demeanlist(v[f1, ], restf, weights = w)
  }

  invfunX <- function(v) {
    crowsum(MXfun(v), f1)
  }

  if (robust) {
    # residuals present, do cluster robust correction
    # create all the cluster interactions, so we don't have to do
    # it each time in the iteration
    tolmod <- 1
    docluster <- !is.null(est$clustervar)
    if (docluster) {
      tolmod <- 0
      d <- length(est$clustervar)
      cia <- list()
      for (i in 1:(2^d - 1)) {
        # Find out which ones to interact
        iac <- as.logical(intToBits(i))[1:d]
        # interact the factors
        cia[[i]] <- factor(do.call(paste, c(est$clustervar[iac], sep = "\004")))
        tolmod <- tolmod + nlevels(cia[[i]])
      }
    }
    toladj <- sqrt(sum((ww * resid)^2))
    wwres <- ww * resid
    trfun <- function(x, trtol) {
      # return crude estimate of the trace
      if (trtol == 0) {
        return(abs(nlev1 - nlev2))
      }
      #      on.exit({rm(list=ls()); gc()})
      # trtol is an absolute tolerance, but the halftrace results are squared,
      # so we should really use a relative tolerance; we use the square root
      # of trtol scaled by toladj.
      #      message('cov trtol=',trtol, ' tolmod ',tolmod, ' N ',N)
      Lx <- halftrace(x, f1, no1list, MFX, -sqrt(trtol / toladj), lmean, name1, weights = w)
      Rx <- halftrace(x, f2, no2list, MDX, -sqrt(trtol / toladj), lmean, name2, weights = w)
      #      Rx <- halftrace(x, f, restf, MFX, sqrt(trtol)/2, lmean, name, weights=w)
      #      Rx <- halftrace(x, f, restf, MFX, trtol, lmean, name, weights=w)

      # now apply cluster stuff
      # first, scale with (weighted) residuals
      .Call(C_scalecols, Rx, wwres)
      .Call(C_scalecols, Lx, wwres)
      if (!docluster) {
        # It's heteroscedastic
        return(colSums(Lx * Rx))
      } else {
        # it's one or more clusters, do the Cameron et al detour
        result <- vector("numeric", ncol(Rx))
        for (i in 1:(2^d - 1)) {
          Lb <- crowsum(Lx, cia[[i]])
          Rb <- crowsum(Rx, cia[[i]])
          # odd number is positive, even is negative
          sgn <- 2 * (sum(as.logical(intToBits(i))[1:d]) %% 2) - 1
          result <- result + sgn * colSums(Lb * Rb)
        }
        return(result)
      }
    }
    epsvar <- 1 # now incorporated in the robust variance matrix, so don't scale
  } else {
    trfun <- function(x, trtol) {
      # return crude estimate of the trace
      if (trtol == 0) {
        return(-abs(nlev1 - nlev2))
      }
      #      on.exit({rm(list=ls()); gc()})
      M1x <- ww * demeanlist(unnamed(x), lmean, weights = w, scale = FALSE)
      DtM1x <- crowsum(M1x, f1)
      FtM1x <- crowsum(M1x, f2)
      d1 <- colSums(DtM1x^2)
      d2 <- colSums(FtM1x^2)
      v <- cgsolve(invfunX, DtM1x, eps = -trtol / sqrt(d1 + d2) / 3, name = name)
      MXv <- crowsum(MXfun(v), f2)
      sol <- cgsolve(invfun, FtM1x, eps = -trtol / sqrt(colSums(MXv^2)) / 3, name = name)
      -colSums(sol * MXv)
    }
  }
  # our function does the inner product, not just matrix application. Signal to mctrace.
  attr(trfun, "IP") <- TRUE
  epsvar <- epsvar * wc

  # absolute precision, scale by N and epsvar since it's trace level precision

  eps <- -abs(tol) * N / epsvar

  epsvar * mctrace(trfun,
    N = N, tol = eps, trname = name,
    maxsamples = maxsamples
  ) / N
}

# compute the variance of the biased variance estimate
# using the formula for the variance of a quadratic form under normality:
# var(x^t A x) = 2 tr(AVAV) + 4 mu^t AVA mu,
# where V is the covariance matrix of x, assumed to be sigma^2 I, and mu is the
# expectation of x (i.e. Dtheta).
varvar <- function(index, fe, X, pointest, resvar, tol = 0.01,
                   biascorrect = FALSE, weights = NULL) {
  w <- weights
  #  w <- NULL
  if (is.null(w)) {
    wc <- ww <- ww2 <- 1.0
  } else {
    w <- w / sqrt(sum(w^2))
    ww2 <- w^2
    ww <- w
    wc <- sum(w^2) / (sum(w^2)^2 - sum(w^4))
  }
  if (!is.null(w) && biascorrect) warning("bias corrected varvars with weights not tested")
  f <- fe[[index]]
  N <- length(f)
  lmean <- list(factor(rep(1, N)))
  name <- paste("varvar(", names(fe)[[index]], ")", sep = "")
  if (length(X) == 0) {
    MFX <- fe[-index]
    invfun <- function(x) {
      crowsum(ww2 * demeanlist(x[f, ], MFX, weights = w), f)
    }
  } else {
    #    M_{F,X} = M_F M_{M_F X}
    restf <- fe[-index]
    MFX <- list(structure(factor(rep(1, N)),
      x = orthonormalize(ww * demeanlist(X, restf, weights = w))
    ))
    invfun <- function(x) {
      crowsum(ww2 * demeanlist(demeanlist(ww * x[f, ], MFX) / ww, restf, weights = w), f)
    }
  }


  Dtheta <- pointest[f]
  DtM1D <- crowsum(ww2 * demeanlist(Dtheta, lmean, weights = w), f)
  v <- cgsolve(invfun, DtM1D, eps = tol / 4 / resvar / sqrt(sum(DtM1D^2)), name = name)
  meanpart <- 4 * resvar * sum(DtM1D * v)
  if (!biascorrect) {
    return(meanpart / N^2)
  }
  # the mean part is biased upwards. We should correct it.
  # It turns out that we can do this by changing the sign of the
  # trace term; the bias is the same expression as the trace part
  #  message('mean part=',meanpart/N^2)
  mytol <- meanpart / 10
  trfun <- function(x, trtol) {
    v <- ww * demeanlist(cgsolve(invfun, crowsum(ww2 * demeanlist(x, lmean, weights = w), f),
      eps = -mytol^2 / resvar^2 / 2, name = name
    )[f, ], lmean, weights = w)
    colSums(v * x)
  }
  attr(trfun, "IP") <- TRUE
  trpart <- 2 * resvar^2 * mctrace(trfun, N = length(f), trname = name, tol = -mytol / resvar / 2)
  if (!is.null(w)) trpart <- trpart / N
  #  message('mean part=', meanpart, ' trpart=',trpart)
  (meanpart - trpart) / N^2
}
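
# A Monte Carlo check of the quadratic-form variance formula above,
# var(x' A x) = 2 tr(AVAV) + 4 mu' AVA mu, with hypothetical values:
if (FALSE) {
  set.seed(3)
  n <- 5
  A <- crossprod(matrix(rnorm(n * n), n)) # symmetric A
  mu <- rnorm(n)
  sig2 <- 0.5
  V <- diag(sig2, n) # V = sigma^2 I, as assumed above
  q <- replicate(1e5, {
    x <- mu + rnorm(n, sd = sqrt(sig2))
    drop(crossprod(x, A %*% x))
  })
  exact <- 2 * sum(diag(A %*% V %*% A %*% V)) +
    4 * drop(t(mu) %*% A %*% V %*% A %*% mu)
  c(mc = var(q), exact = exact)
}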




#' Compute the variance of the fixed effect variance estimate
#'
#' @details
#' With a model like \eqn{y = X\beta + D\theta + F\psi + \epsilon}, where \eqn{D} and
#' \eqn{F} are matrices with dummy encoded factors, one application of \pkg{lfe} is
#' to study the variances \eqn{var(D\theta)}, \eqn{var(F\psi)} and covariances
#' \eqn{cov(D\theta, F\psi)}. The function [fevcov()] computes bias corrected
#' variances and covariances.  However, these variance estimates are still
#' random variables for which [fevcov()] only estimate the
#' expectation. The function `varvars` estimates the variance of these
#' estimates.
#'
#' This function returns valid results only for normally distributed residuals.
#' Note that the estimates for the fixed effect variances from
#' [fevcov()] are not normally distributed, but a sum of chi-square
#' distributions which depends on the eigenvalues of certain large matrices. We
#' do not compute that distribution. The variances returned by `varvars`
#' can therefore *not* be used directly to estimate confidence intervals,
#' other than through coarse methods like the Chebyshev inequality. These
#' estimates only serve as a rough guideline as to how wrong the variance
#' estimates from [fevcov()] might be.
#'
#' Like the fixed effect variances themselves, their variances are also biased
#' upwards.  Correcting this bias can be costly, and is therefore by default
#' switched off.
#'
#' The variances tend to zero with increasing number of observations. Thus, for
#' large datasets they will be quite small.
#'
#' @param est an object of class '"felm"', the result of a call to
#' `[felm](keepX=TRUE)`.
#' @param alpha a data frame, the result of a call to [getfe()].
#' @param tol numeric. The absolute tolerance for the bias-corrected
#' correlation.
#' @param biascorrect logical. Should the estimates be bias corrected?
#' @param lhs character. Name of left hand side if multiple left hand sides.
#' @return `varvars` returns a vector with a variance estimate for each
#' fixed effect variance.  I.e. for the diagonal returned by
#' [fevcov()].
#' @note The `tol` argument specifies the tolerance as in
#' [fevcov()].  Note that if `est` is the result of a call to
#' [felm()] with `keepX=FALSE` (the default), the variances will
#' be estimated as if the covariates X are independent of the factors.  There
#' is currently no function available for estimating the variance of the
#' covariance estimates from [fevcov()].
#'
#' The cited paper does not contain the expressions for the variances computed
#' by `varvars` (there's a 10 page limit in that journal), though they can
#' be derived in the same fashion as in the paper, with the formula for the
#' variance of a quadratic form.
#' @seealso [bccorr()] [fevcov()]
#' @references Gaure, S. (2014), \cite{Correlation bias correction in two-way
#' fixed-effects linear regression}, Stat 3(1):379-390, 2014.
#' \doi{10.1002/sta4.68}
#' @examples
#'
#' x <- rnorm(500)
#' x2 <- rnorm(length(x))
#'
#' ## create individual and firm
#' id <- factor(sample(40, length(x), replace = TRUE))
#' firm <- factor(sample(30, length(x), replace = TRUE, prob = c(2, rep(1, 29))))
#' foo <- factor(sample(20, length(x), replace = TRUE))
#' ## effects
#' id.eff <- rnorm(nlevels(id))
#' firm.eff <- rnorm(nlevels(firm))
#' foo.eff <- rnorm(nlevels(foo))
#' ## left hand side
#' id.m <- id.eff[id]
#' firm.m <- 2 * firm.eff[firm]
#' foo.m <- 3 * foo.eff[foo]
#' y <- x + 0.25 * x2 + id.m + firm.m + foo.m + rnorm(length(x))
#'
#' # make a data frame
#' fr <- data.frame(y, x, x2, id, firm, foo)
#' ## estimate and print result
#' est <- felm(y ~ x + x2 | id + firm + foo, data = fr, keepX = TRUE)
#' alpha <- getfe(est)
#' # estimate the covariance matrix of the fixed effects
#' fevcov(est, alpha)
#' # estimate variances of the diagonal
#' varvars(est, alpha)
#'
#' @export varvars
varvars <- function(est, alpha = getfe(est), tol = 0.01, biascorrect = FALSE, lhs = NULL) {
  if (nlevels(est$cfactor) > 1) stop("Data should have just a single connected component")

  fe <- est$fe
  e <- length(fe)
  if (length(tol) == 1) tol <- rep(tol, e)

  if (length(est$lhs) > 1 && is.null(lhs)) {
    stop("Please specify lhs=[one of ", paste(est$lhs, collapse = ","), "]")
  }
  if (length(est$lhs) == 1) lhs <- est$lhs
  effnam <- "effect"
  if (!("effect" %in% colnames(alpha))) {
    effnam <- paste("effect", lhs, sep = ".")
    if (!(effnam %in% colnames(alpha))) {
      stop("Can't find effect-column in alpha")
    }
  }

  effs <- lapply(names(fe), function(nm) alpha[alpha[, "fe"] == nm, effnam])
  w2 <- if (is.null(est$weights)) 1 else est$weights^2
  resvar <- sum(w2 * est$residuals[, lhs]^2) * length(w2) / est$df

  sapply(1:e, function(index) {
    varvar(index, fe, est$X, effs[[index]], resvar, tol[index], biascorrect, est$weights)
  })
}



# a function for computing the covariance matrix between
# all the fixed effects






#' Compute limited mobility bias corrected covariance matrix between fixed
#' effects
#'
#' With a model like \eqn{y = X\beta + D\theta + F\psi + \epsilon}, where
#' \eqn{D} and \eqn{F} are matrices with dummy encoded factors, one application
#' of \pkg{lfe} is to study the variances \eqn{var(D\theta)}, \eqn{var(F\psi)}
#' and covariances \eqn{cov(D\theta, F\psi)}. However, if we use estimates for
#' \eqn{\theta} and \eqn{\psi}, the resulting variances are biased. The
#' function `fevcov` computes a bias corrected covariance matrix as
#' described in \cite{Gaure (2014)}.
#'
#' The `tol` argument specifies the tolerance. The tolerance is relative
#' for the variances, i.e. the diagonal of the output.  For the covariances,
#' the tolerance is relative to the square root of the product of the
#' variances, i.e. an absolute tolerance for the correlation.  If a numeric of
#' length 1, `tol` specifies the same tolerance for all
#' variances/covariances.  If it is of length 2, `tol[1]` specifies the
#' variance tolerance, and `tol[2]` the covariance tolerance.  `tol`
#' can also be a square matrix of size `length(est$fe)`, in which case the
#' tolerance for each variance and covariance is specified individually.
#'
#' The function performs no checks for estimability. If the fixed effects are
#' not estimable, the result of a call to `fevcov` is not useable.
#' Moreover, there should be just a single connected component among the fixed
#' effects.
#'
#' `alpha` must contain a full set of coefficients, and contain columns
#' `'fe'` and `'effect'` like the default estimable functions from
#' [efactory()].
#'
#' In the case that the [felm()]-estimation has weights, it is the
#' weighted variances and covariance which are bias corrected.
#'
#' @param est an object of class '"felm"', the result of a call to
#' `[felm](keepX=TRUE)`.
#' @param alpha a data frame, the result of a call to [getfe()].
#' @param tol numeric. The absolute tolerance for the bias-corrected
#' correlation.
#' @param robust logical. Should robust (heteroskedastic or cluster) residuals
#' be used, rather than i.i.d.?
#' @param maxsamples integer. Maximum number of samples for expectation
#' estimates.
#' @param lhs character. Name of left hand side if multiple left hand sides.
#' @return `fevcov` returns a square matrix with the bias corrected
#' covariances. An attribute `'bias'` contains the biases.  The bias
#' corrections have been subtracted from the biased estimates.  I.e. vc = vc' -
#' b, where vc' is the biased variance and b is the bias.
#' @note Bias correction for IV estimates is not supported as of now.
#'
#' Note that if `est` is the result of a call to [felm()] with
#' `keepX=FALSE` (the default), the biases will be computed as if the
#' covariates X are independent of the factors. This will be faster (typically
#' by a factor of approx. 4), and possibly wronger.  Note also that the
#' computations performed by this function are non-trivial; they may take quite
#' some time.  It would be wise to start out with quite liberal tolerances,
#' e.g. `tol = 0.1`, to get an idea of the time requirements.
#'
#' If there are only two fixed effects, `fevcov` returns the same
#' information as [bccorr()], though in a slightly different format.
#' @seealso [varvars()] [bccorr()]
#' @references Gaure, S. (2014), \cite{Correlation bias correction in two-way
#' fixed-effects linear regression}, Stat 3(1):379-390, 2014.
#' \doi{10.1002/sta4.68}
#' @examples
#'
#' x <- rnorm(5000)
#' x2 <- rnorm(length(x))
#'
#' ## create individual and firm
#' id <- factor(sample(40, length(x), replace = TRUE))
#' firm <- factor(sample(30, length(x), replace = TRUE, prob = c(2, rep(1, 29))))
#' foo <- factor(sample(20, length(x), replace = TRUE))
#' ## effects
#' id.eff <- rnorm(nlevels(id))
#' firm.eff <- runif(nlevels(firm))
#' foo.eff <- rchisq(nlevels(foo), df = 1)
#' ## left hand side
#' id.m <- id.eff[id]
#' firm.m <- firm.eff[firm]
#' foo.m <- foo.eff[foo]
#' # normalize them
#' id.m <- id.m / sd(id.m)
#' firm.m <- firm.m / sd(firm.m)
#' foo.m <- foo.m / sd(foo.m)
#' y <- x + 0.25 * x2 + id.m + firm.m + foo.m + rnorm(length(x), sd = 2)
#' z <- x + 0.5 * x2 + 0.7 * id.m + 0.5 * firm.m + 0.3 * foo.m + rnorm(length(x), sd = 2)
#' # make a data frame
#' fr <- data.frame(y, z, x, x2, id, firm, foo)
#' ## estimate and print result
#' est <- felm(y | z ~ x + x2 | id + firm + foo, data = fr, keepX = TRUE)
#' # find bias corrections, there's little bias in this example
#' print(yv <- fevcov(est, lhs = "y"))
#' ## Here's how to compute the unbiased correlation matrix:
#' cm <- cov2cor(yv)
#' structure(cm, bias = NULL)
#'
#' @export fevcov
fevcov <- function(est, alpha = getfe(est), tol = 0.01, robust = !is.null(est$clustervar),
                   maxsamples = Inf, lhs = NULL) {
  if (nlevels(est$cfactor) > 1) stop("Data should have just a single connected component")
  iaf <- sapply(est$fe, function(f) !is.null(attr(f, "x", exact = TRUE)))
  if (any(iaf)) {
    stop("Bias correction for interacted factor ", paste(names(est$fe)[iaf], collapse = ", "), " not supported")
  }
  if (length(est$lhs) > 1 && is.null(lhs)) {
    stop("Please specify lhs=[one of ", paste(est$lhs, collapse = ","), "]")
  }
  if (length(est$lhs) == 1) lhs <- est$lhs
  if (is.null(lhs)) lhs <- 1
  effnam <- "effect"
  if (!("effect" %in% colnames(alpha))) {
    effnam <- paste("effect", lhs, sep = ".")
    if (!(effnam %in% colnames(alpha))) {
      stop("Can't find effect-column in alpha")
    }
  }
  if (is.na(match("fe", colnames(alpha)))) {
    stop("alpha should contain columns 'fe' and 'effect'")
  }

  #  if(!is.null(est$weights)) warning("Bias correction with weights not yet fully correct")

  if (length(tol) == 1) tol <- c(tol, -abs(tol))
  if (length(tol) != 2 && !is.matrix(tol)) {
    stop("tol must be either a matrix or of length 1 or 2")
  }
  K <- length(est$fe)
  if (is.matrix(tol) && (ncol(tol) != K || nrow(tol) != K)) {
    stop("tol matrix must be square, with size the number of fixed effects: ", K)
  }
  if (!is.matrix(tol)) {
    tmp <- tol
    tol <- matrix(0, K, K)
    diag(tol) <- tmp[1]
    tol[col(tol) != row(tol)] <- tmp[2]
  }

  # compute the biased variances
  fe <- est$fe
  effs <- lapply(names(fe), function(nm) alpha[alpha[, "fe"] == nm, effnam][fe[[nm]]])
  names(effs) <- names(fe)
  bvcv <- matrix(0, K, K)
  colnames(bvcv) <- rownames(bvcv) <- names(fe)
  diag(bvcv) <- sapply(effs, var)
  for (i in 1:K) {
    for (j in i:K) {
      if (is.null(est$weights)) {
        bvcv[i, j] <- bvcv[j, i] <- cov(effs[[i]], effs[[j]])
      } else {
        bvcv[i, j] <- bvcv[j, i] <- wcov(effs[[i]], effs[[j]], est$weights^2)
      }
    }
  }
  # compute the variances
  bias <- matrix(0, K, K)
  colnames(bias) <- rownames(bias) <- names(fe)
  resid <- est$residuals[, lhs]
  cgtf <- rep(1, K)
  diag(bias) <- sapply(1:K, function(i) {
    varbias(i, est, tol[i, i], bvcv[i, i],
      maxsamples,
      resid = resid, weights = est$weights,
      dfadj = (est$N - 1) / est$df
    )
  })

  wtf <- diag(bias) > diag(bvcv)
  # update off-diagonal tolerances by the variances
  offdiag <- col(tol) != row(tol)
  if (any(wtf)) {
    message("Some variance biases are larger than the variances. Setting them equal:")
    print(cbind(variance = diag(bvcv), bias = diag(bias)))
    diag(bias)[wtf] <- 0.999 * diag(bvcv)[wtf]
    tol[offdiag] <- -(abs(tol) * sqrt(abs(tcrossprod(diag(bvcv) - 0.9 * diag(bias)))))[offdiag]
  } else {
    tol[offdiag] <- -(abs(tol) * sqrt(abs(tcrossprod(diag(bvcv) - diag(bias)))))[offdiag]
  }
  # compute the covariances
  if (K > 1) {
    for (i in 1:(K - 1)) {
      for (j in (i + 1):K) {
        bias[i, j] <- bias[j, i] <- covbias(c(i, j), est, tol[i, j], maxsamples,
          resid = resid, weights = est$weights
        )
      }
    }
  }
  structure(bvcv - bias, bias = bias)
}
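
# The tol argument of fevcov() may also be a full matrix; a hypothetical
# sketch of per-entry tolerances (est as in the examples above):
if (FALSE) {
  K <- length(est$fe)
  tol <- matrix(-0.05, K, K) # negative: absolute tolerance for the covariances
  diag(tol) <- 0.01 # positive: relative tolerance for the variances
  fevcov(est, getfe(est), tol = tol)
}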
