#' Bootstrap standard errors for the group fixed effects
#'
#' Bootstrap standard errors for the group fixed effects which were swept out
#' during an estimation with \code{\link{felm}}.
#'
#' The bootstrapping is done in parallel if \code{threads > 1}.
#' \code{\link{btrap}} is run automatically from \code{\link{getfe}} if
#' \code{se=TRUE} is specified. To save some overhead, the individual
#' iterations are grouped together, the memory available for this grouping is
#' fetched with \code{getOption('lfe.bootmem')}, which is initialized upon
#' loading of \pkg{lfe} to \code{options(lfe.bootmem=500)} (MB).
#'
#' If \code{robust=TRUE}, heteroskedastic robust standard errors are estimated.
#' If \code{robust=FALSE} and \code{cluster=TRUE}, clustered standard errors
#' with the cluster specified to \code{felm()} are estimated. If \code{cluster}
#' is a factor, it is used for the cluster definition. \code{cluster may} also
#' be a list of factors.
#'
#' @param alpha data frame returned from \code{\link{getfe}}
#' @param obj object of class \code{"felm"}, usually, a result of a call to
#' \code{\link{felm}}
#' @param N integer. The number of bootstrap iterations
#' @param ef function. An estimable function such as in \code{\link{getfe}}.
#' The default is to use the one used on \code{alpha}
#' @param eps double. Tolerance for centering, as in getfe
#' @param threads integer. The number of threads to use
#' @param robust logical. Should heteroskedastic standard errors be estimated?
#' @param cluster logical or factor. Estimate clustered standard errors.
#' @param lhs character vector. Specify which left hand side if \code{obj} has
#' multiple lhs.
#' @return A data-frame of the same size as alpha is returned, with standard
#' errors filled in.
#' @examples
#'
#' oldopts <- options(lfe.threads=2)
#' ## create covariates
#' x <- rnorm(3000)
#' x2 <- rnorm(length(x))
#'
#' ## create individual and firm
#' id <- factor(sample(700,length(x),replace=TRUE))
#' firm <- factor(sample(300,length(x),replace=TRUE))
#'
#' ## effects
#' id.eff <- rlnorm(nlevels(id))
#' firm.eff <- rexp(nlevels(firm))
#'
#' ## left hand side
#' y <- x + 0.25*x2 + id.eff[id] + firm.eff[firm] + rnorm(length(x))
#'
#' ## estimate and print result
#' est <- felm(y ~ x+x2 | id + firm)
#' summary(est)
#' ## extract the group effects
#' alpha <- getfe(est)
#' head(alpha)
#' ## bootstrap standard errors
#' head(btrap(alpha,est))
#'
#' ## bootstrap some differences
#' ef <- function(v,addnames) {
#' w <- c(v[2]-v[1],v[3]-v[2],v[3]-v[1])
#' if(addnames) {
#' names(w) <-c('id2-id1','id3-id2','id3-id1')
#' attr(w,'extra') <- list(note=c('line1','line2','line3'))
#' }
#' w
#' }
#' # check that it's estimable
#' is.estimable(ef,est$fe)
#'
#' head(btrap(alpha,est,ef=ef))
#' options(oldopts)
#'
#' @export btrap
btrap <- function(alpha,obj,N=100,ef=NULL,eps=getOption('lfe.eps'),
threads=getOption('lfe.threads'), robust=FALSE,
cluster=NULL, lhs=NULL) {
# bootstrap the stuff. The name 'btrap' is chosen to instill a feeling of being trapped
# (in some long running stuff which will never complete)
# bootstrapping is really to draw residuals over again, i.e. to change
# the outcome. Predictions of the estimated system are adjusted by
# drawing from the residuals. We need a new r.h.s to replace
# the current (I-P)(Y-Xbeta), i.e. (I-P)(Y-Xbeta+delta) where delta is resampled from
# the residuals PY-PXbeta. Then we adjust for degrees of freedom, which is component specific.
if(is.logical(cluster)) {
if(cluster) cluster <- obj$clustervar else cluster <- NULL
} else if(!is.null(cluster)) {
if(is.list(cluster))
cluster <- lapply(cluster,factor)
else
cluster <- list(factor(cluster))
}
if(is.null(ef)) {
# use the one used with alpha
ef <- attr(alpha,'ef')
} else {
if(is.character(ef)) ef <- efactory(obj,opt=ef)
# redo the point estimates
v <- ef(alpha[,'effect'],TRUE)
alpha <- data.frame(effect=v)
rownames(alpha) <- names(v)
if(!is.null(attr(v,'extra'))) alpha <- cbind(alpha,attr(v,'extra'))
}
if(is.null(lhs)) {
R <- obj$r.residuals-obj$residuals
smpdraw <- as.vector(obj$residuals)
} else {
R <- obj$r.residuals[,lhs]-obj$residuals[,lhs]
smpdraw <- as.vector(obj$residuals[,lhs])
}
w <- obj$weights
# if there are weights, smpdraw should be weighted
if(!is.null(w)) smpdraw <- smpdraw * w
# Now, we want to do everything in parallel, so we should allocate up a set
# of vectors, but we don't want to blow the memory. Stick to allocating two
# vectors per thread. The threaded stuff can't be interrupted, so this is
# an opportunity to control-c too.
# hmm, up to 500 MB of vectors, we say, but no less than two per thread
# (one per thread is bad for balance, if time to completion varies)
# divide by two because we use a copy in the demeanlist step.
maxB <- getOption('lfe.bootmem')*1e6/2
vpt <- max(2,as.integer(min(maxB/(length(R)*8),N)/threads))
vpb <- vpt*threads
blks <- as.integer(ceiling(N / vpb))
newN <- blks*vpb
vsum <- 0
vsq <- 0
start <- last <- as.integer(Sys.time())
gc()
if(is.null(lhs))
predy <- obj$fitted.values
else
predy <- obj$fitted.values[,lhs]
if(!is.null(cluster)) {
# now, what about multiway clustering?
# try the Cameron-Gelbach-Miller stuff.
# make the interacted factors, with sign
iac <- list()
d <- length(cluster)
for(i in 1:(2^d-1)) {
# Find out which ones to interact
iab <- as.logical(intToBits(i))[1:d]
# odd number is positive, even is negative
sgn <- 2*(sum(iab) %% 2) - 1
iac[[i]] <- list(sgn=sgn/choose(d,sum(iab)),ib=iab)
}
}
X <- model.matrix(obj,centred=NA)
cX <- attr(X,'cX')
for(i in 1:blks) {
if(robust) {
# robust residuals, variance is each squared residual
# rsamp <- rnorm(vpb*length(smpdraw))*abs(smpdraw)
rsamp <- lapply(1:vpb,function(i) rnorm(length(smpdraw))*abs(smpdraw))
} else if(!is.null(cluster)) {
# Wild bootstrap with Rademacher distribution
rsamp <- lapply(1:vpb, function (i) {
if(length(cluster) == 1) {
smpdraw*sample(c(-1,1),nlevels(cluster[[1]]), replace=TRUE)[cluster[[1]]]
} else {
# draw a Rademacher dist for each single cluster
rad <- lapply(cluster,function(f) sample(c(-1,1),nlevels(f),replace=TRUE)[f])
Reduce('+',lapply(iac, function(ia) ia$sgn*smpdraw * Reduce('*',rad[ia$ib])))
}
})
} else {
# IID residuals
rsamp <- lapply(1:vpb, function(i) sample(smpdraw, replace=TRUE))
}
if(!is.null(w)) rsamp <- lapply(rsamp, function(x) x/w)
if(length(cX) > 0) {
newr <- lapply(rsamp, function(rs) {
newy <- predy + rs
newbeta <- obj$inv %*% crossprod(cX, newy)
newbeta[is.na(newbeta)] <- 0
as.vector(newy - X %*% newbeta)
})
} else {
newr <- lapply(rsamp, function(rs) as.vector(predy) + rs)
}
rm(rsamp)
v <- kaczmarz(obj$fe, demeanlist(unnamed(newr), obj$fe, eps=eps, threads=threads,
means=TRUE, weights=w),
eps=eps, threads=threads)
rm(newr)
efv <- lapply(v,ef,addnames=FALSE)
vsum <- vsum + Reduce('+',efv)
vsq <- vsq + Reduce('+',Map(function(i) i^2, efv))
now <- as.integer(Sys.time())
if(now-last > 300) {
cat('...finished',i*vpb,'of',newN,'vectors in',now-start,'seconds\n')
last <- now
}
}
if(robust)
sename <- 'robustse'
else if(!is.null(cluster))
sename <- 'clusterse'
else
sename <- 'se'
fSEname <- SEname <- 'se'
fsename <- sename
if(!is.null(lhs)) {fsename <- paste(sename,lhs,sep='.'); fSEname <- paste(SEname,lhs,sep='.');}
alpha[,fsename] <- sqrt(vsq/newN - (vsum/newN)**2)/(1-0.75/newN-7/32/newN**2-9/128/newN**3)
if(sename != 'se') alpha[,fSEname] <- alpha[,fsename]
return(structure(alpha,sename=sename))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.