# $Id: oldfelm.R 1942 2016-04-07 21:25:18Z sgaure $
# Author: Simen Gaure
# Copyright: 2011, Simen Gaure
# Licence: Artistic 2.0
# Some things in this file is done in a weird way.
# In some cases there are efficiency reasons for this, e.g. because
# the "standard" way of doing things may result in a copy which is costly
# when the problem is *large*.
# In other cases it may simply be due to the author's unfamiliarity with how
# things should be done in R
# parse our formula
oldparseformula <- function(formula,data) {
trm <- terms(formula,specials=c('G'))
feidx <- attr(trm,'specials')$G+1
va <- attr(trm,'variables')
festr <- paste(sapply(feidx,function(i) deparse(va[[i]])),collapse='+')
if(festr != '') {
.Deprecated(msg="The G() syntax is deprecated, please use multipart formulas instead")
# remove the G-terms from formula
formula <- update(formula,paste('. ~ . -(',festr,') - 1'))
# then make a list of them, and find their names
felist <- parse(text=paste('list(',gsub('+',',',festr,fixed=TRUE),')',sep=''))
nm <- eval(felist,list(G=function(arg) deparse(substitute(arg))))
# replace G with factor, eval with this, and the parent frame, or with data
# allow interaction factors with '*' (dropped, never documented, use ':')
Gfunc <- function(f) if(is.null(attr(f,'xnam'))) factor(f) else f
Ginfunc <- function(x,f) {
if(is.factor(x)) {
structure(interaction(factor(f),factor(x),drop=TRUE),xnam=deparse(substitute(x)),fnam=deparse(substitute(f)))
} else {
structure(factor(f),x=x,xnam=deparse(substitute(x)), fnam=deparse(substitute(f)))
}
}
if(is.environment(data)) {
fl <- eval(felist,list(G=Gfunc, ':'=Ginfunc),data)
} else {
fl <- local({eval(felist,data)},list(G=Gfunc, ':'=Ginfunc))
}
names(fl) <- nm
gpart <- eval(parse(text=paste('~',paste(nm,collapse='+'))))
if(is.null(names(fl))) names(fl) <- paste('fe',1:length(fl),sep='')
} else {
fl <- NULL
gpart <- ~0
}
return(list(formula=formula, fl=fl, gpart=gpart,ivpart=~0,cpart=~0))
}
# parse
# use 2-part Formulas without G() syntax, like
# y ~ x1 + x2 | f1+f2
# or 3-part or more Formulas with iv-specification like
# y ~ x1 + x2 | f1+f2 | (q|w ~ x3+x4) | c1+c2
# returns a list containing
# formula=y~x1+x2
# fl = list(f1,f2)
# ivpart = list(q ~x3+x4, w ~x3+x4)
# cluster=list(c1,c2)
nopart <- function(x) length(all.vars(x))==0
parseformula <- function(form, data, noexpand=FALSE) {
f <- as.Formula(form)
len <- length(f)[[2]]
if(len == 1) return(oldparseformula(form,data))
opart <- formula(f,lhs=NULL,rhs=1)
if(len == 1) return(list(formula=opart,gpart=~0,ivpart=~0,cpart=~0))
# the factor part
gpart <- formula(f,lhs=0, rhs=2)
if(!nopart(gpart)) {
tm <- terms(gpart, keep.order=TRUE)
ft <- attr(tm,'factors')
var <- eval(attr(tm,'variables'),data)
varnames <- rownames(ft)
names(var) <- varnames
fl <- apply(ft, 2, function(v) {
nonz <- sum(v > 0)
vnam <- varnames[which(v > 0)]
if(nonz > 2) stop('Interaction only supported for two variables')
if(nonz == 1) {
# if(!is.factor(var[[vnam]])) warning('non-factor ',vnam, ' coerced to factor')
res <- list(factor(var[[vnam]]))
names(res) <- vnam
} else {
xnam <- vnam[[1]]
fnam <- vnam[[2]]
x <- var[[xnam]]
f <- var[[fnam]]
if(!is.factor(f) && !is.factor(x)) {
stop('interaction between ', xnam, ' and ', fnam, ', none of which are factors')
}
if(!is.factor(f) && is.factor(x)) {
tmp <- x
x <- f
f <- tmp
tmp <- xnam
xnam <- fnam
fnam <- tmp
}
if(is.factor(x)) {
res <- list(structure(interaction(factor(f),factor(x),drop=TRUE),xnam=xnam,fnam=fnam))
} else {
res <- list(structure(factor(f),x=x,xnam=xnam, fnam=fnam))
}
names(res) <- paste(xnam,fnam,sep=':')
}
res
})
nm <- names(fl)
fl <- unlist(fl, recursive=FALSE)
names(fl) <- nm
} else {
fl <- NULL
}
if(len == 2) return(list(formula=opart,fl=fl,gpart=gpart,ivpart=~0,cpart=~0))
# Then the iv-part
ivparts <- formula(f,lhs=0,rhs=3, drop=TRUE)
if(!nopart(ivparts) && length(ivparts[[2]])>1 && ivparts[[2]][[1]]=='(') {
# Now, make a list of the iv-formulas where we split the lhs in each
# to obtain q ~ x3+x4, w ~x3+x4
ivspec <- as.Formula(ivparts[[2]][[2]]) # it's now q|w ~ x3+x4
lhs <- formula(ivspec,rhs=0)
ivpart <- lapply(seq_along(all.vars(lhs)),function(i) formula(ivspec,lhs=i))
} else {
ivpart <- NULL
}
if(len == 3 && !is.null(ivpart)) return(list(formula=opart,fl=fl,iv=ivpart,gpart=gpart,ivpart=ivparts,cpart=~0))
# The cluster part, this could be the third part if there are no parentheses
if(len == 3 && is.null(ivpart)) {
cpart <- ivparts
ivparts <- NULL
} else {
cpart <- formula(f,lhs=0,rhs=4,drop=TRUE)
}
if(!nopart(cpart)) {
# handle the same way as the factors, but without the covariate interaction
tm <- terms(cpart, keep.order=TRUE)
nm <- parts <- attr(tm,'term.labels')
clist <- lapply(paste('factor(',parts,')',sep=''),function(e) parse(text=e))
cluster <- lapply(clist,eval,data)
names(cluster) <- nm
} else {
cluster <- NULL
}
list(formula=opart,fl=fl,iv=ivpart,cluster=cluster,gpart=gpart,ivpart=ivparts,cpart=cpart)
}
# ivresid is optional, used in 2. stage of 2sls to pass
# the difference between the original endogenous variable and the prediction
# for the purpose of computing sum of square residuals
doprojols <- function(psys, ivresid=NULL, exactDOF=FALSE, keepX=FALSE, nostats=FALSE) {
if(is.numeric(exactDOF)) {
df <- exactDOF
totvar <- length(psys$y) - df
} else {
# numrefs is also used later
numrefs <- nrefs(psys$fl, compfactor(psys$fl), exactDOF)
totvar <- totalpvar(psys$fl)-numrefs
df <- length(psys$y)-totvar
}
if(is.null(psys$yxz$x)) {
# No covariates
z <- list(N=psys$N, r.residuals=psys$y,fe=psys$fl,p=totvar,Pp=0,cfactor=compfactor(psys$fl),
na.action=psys$na.action, contrasts=psys$contrasts,
fitted.values=psys$y - psys$yxz$y,
coefficients=matrix(double(0),psys$N,0),
df=df,
nostats=FALSE,
model.assign=psys$assign,
model.labels=psys$model.labels,
residuals=psys$yxz$y,clustervar=psys$clustervar, call=match.call())
z$df.residual <- z$df
class(z) <- 'felm'
return(z)
}
yz <- psys$yxz$y
xz <- psys$yxz$x
y <- psys$y
x <- psys$x
fl <- psys$fl
icpt <- psys$icpt
# here we just do an lm.fit, however lm.fit is quite slow since
# it doesn't use blas (in particular it can't use e.g. threaded blas in acml)
# so we have rolled our own.
# we really don't return an 'lm' object or other similar stuff, so
# we should consider using more elementary operations which map to blas-3
# eg. solve(crossprod(xz),t(xz) %*% yz)
# Or, even invert by solve(crossprod(xz)) since we need
# the diagonal for standard errors. We could use the cholesky inversion
# chol2inv(chol(crossprod(xz)))
cp <- crossprod(xz)
b <- crossprod(xz,yz)
ch <- cholx(cp)
# ch <- chol(cp)
# beta <- drop(inv %*% (t(xz) %*% yz))
# remove multicollinearities
badvars <- attr(ch,'badvars')
if(is.null(badvars)) {
beta <- backsolve(ch,backsolve(ch,b,transpose=TRUE))
# beta <- as.vector(beta)
# beta <- as.vector(backsolve(ch,backsolve(ch,b,transpose=TRUE)))
if(!nostats) inv <- chol2inv(ch)
} else {
beta <- matrix(NaN, nrow(cp), ncol(b))
# beta <- rep(NaN,nrow(cp))
beta[-badvars,] <- backsolve(ch,backsolve(ch,b[-badvars,], transpose=TRUE))
if(!nostats) {
inv <- matrix(NA,nrow(cp),ncol(cp))
inv[-badvars,-badvars] <- chol2inv(ch)
}
}
rm(ch, b, cp)
if(length(fl) > 0 && icpt > 0)
rownames(beta) <- colnames(x)[-icpt] else rownames(beta) <- colnames(x)
colnames(beta) <- colnames(y)
if(ncol(beta) == 1) names(beta) <- rownames(beta)
z <- list(coefficients=beta,badconv=psys$badconv,Pp=ncol(xz))
z$N <- nrow(xz)
z$p <- ncol(xz) - length(badvars)
if(!nostats) {
z$inv <- inv
inv <- nazero(inv)
}
# how well would we fit with all the dummies?
# the residuals of the centered model equals the residuals
# of the full model, thus we may compute the fitted values
# resulting from the full model.
# for the 2. step in the 2sls, we should replace
# the instrumented variable with the real ones (the difference is in ivresid)
# when predicting, but only for the purpose of computing
# residuals.
nabeta <- nazero(beta)
zfit <- xz %*% nabeta
zresid <- yz - zfit
z$beta <- beta
z$response <- y
z$fitted.values <- y - zresid
z$residuals <- zresid
z$contrasts <- psys$contrasts
z$model.assign <- psys$model.assign
z$model.labels <- psys$model.labels
if(length(fl) > 0) {
# insert a zero at the intercept position (x may have an intercept, whereas xz has not)
# if(icpt > 0) ibeta <- append(beta,0,after=icpt-1) else ibeta <- beta
if(icpt > 0) {
pre <- seq_len(icpt-1)
post <- setdiff(seq_len(nrow(beta)), pre)
ibeta <- rbind(beta[pre,,drop=FALSE], 0, beta[post,,drop=FALSE])
} else {
ibeta <- beta
}
pred <- x %*% ifelse(is.na(ibeta),0,ibeta)
z$r.residuals <- y - pred
} else {
z$r.residuals <- zresid
}
rm(x)
z$lhs <- colnames(beta)
# the residuals should be the residuals from the original endogenous variables, not the predicted ones
# the difference are the ivresid, which we must multiply by beta and subtract.
# the residuals from the 2nd stage are in iv.residuals
# hmm, what about the r.residuals? We modify them as well. They are used in kaczmarz().
if(!is.null(ivresid)) {
if(!is.matrix(ivresid)) {
nm <- names(ivresid)
ivresid <- matrix(unlist(ivresid),z$N)
colnames(ivresid) <- nm
}
z$ivresid <- ivresid %*% nabeta[colnames(ivresid),,drop=FALSE]
z$iv.residuals <- z$residuals
z$residuals <- z$residuals - z$ivresid
z$r.iv.residuals <- z$r.residuals
z$r.residuals <- z$r.residuals - z$ivresid
}
z$terms <- psys$terms
z$cfactor <- compfactor(fl)
totlev <- totalpvar(fl)
if(is.numeric(exactDOF)) {
z$df <- exactDOF
numdum <- z$N - z$p - z$df
z$numrefs <- totlev - numdum
} else {
numdum <- totlev - numrefs
z$numrefs <- numrefs
z$df <- z$N - z$p - numdum
}
z$df.residual <- z$df
z$rank <- z$N - z$df
z$exactDOF <- exactDOF
z$fe <- fl
# should we subtract 1 for an intercept?
# a similar adjustment is done in summary.felm when computing rdf
z$p <- z$p + numdum - 1
z$xp <- z$p
z$na.action <- psys$na.action
class(z) <- 'felm'
cluster <- psys$clustervar
z$clustervar <- cluster
if(nostats) {
z$nostats <- TRUE
return(z)
}
z$nostats <- FALSE
# then we go about creating the covariance matrices and tests
# if there is a single lhs, they are just stored as matrices etc
# in z. If there are multiple lhs, these quantities are inserted
# in a list z$STATS indexed by z$lhs
# indexed by the name of the lhs
vcvnames <- list(rownames(beta), rownames(beta))
Ncoef <- nrow(beta)
singlelhs <- length(z$lhs) == 1
if(!singlelhs) z$STATS <- list()
for(lhs in z$lhs) {
res <- z$residuals[,lhs]
# if(!is.null(ivresid)) res <- res - z$ivresid
vcvfactor <- sum(res**2)/z$df
# when multiple lhs, vcvfactor is a vector
# we need a list of vcvs in this case
if(singlelhs) {
z$vcv <- z$inv * vcvfactor
setdimnames(z$vcv, vcvnames)
} else {
z$STATS[[lhs]] <- list()
z$STATS[[lhs]]$vcv <- z$inv * vcvfactor
setdimnames(z$STATS[[lhs]]$vcv, vcvnames)
}
# dimnames(z$vcv) <- list(names(beta),names(beta))
# We should make the robust covariance matrix too.
# it's inv * sum (X_i' u_i u_i' X_i) * inv
# where u_i are the (full) residuals (Wooldridge, 10.5.4 (10.59))
# i.e. inv * sum(u_i^2 X_i' X_i) * inv
# for large datasets the sum is probably best computed by a series of scaled
# rank k updates, i.e. the dsyrk blas routine, we make an R-version of it.
# need to check this computation, the SE's are slightly numerically different from Stata's.
# it seems stata does not do the small-sample adjustment
dfadj <- z$N/z$df
# Now, here's an optimzation for very large xz. If we use the wcrossprod and ccrossprod
# functions, we can't get rid of xz, we end up with a copy of it which blows away memory.
# we need to scale xz with the residuals in xz, but we don't want to expand res to a full matrix,
# and even get a copy in the result.
# Thus we modify it in place with a .Call. The scaled variant is also used in the cluster computation.
# z$robustvcv <- dfadj * inv %*% wcrossprod(xz,res) %*% inv
rscale <- ifelse(res==0,1e-40,res)
.Call(C_scalecols, xz, rscale)
if(singlelhs) {
z$robustvcv <- dfadj * inv %*% crossprod(xz) %*% inv
setdimnames(z$robustvcv, vcvnames)
} else {
z$STATS[[lhs]]$robustvcv <- dfadj * inv %*% crossprod(xz) %*% inv
setdimnames(z$STATS[[lhs]]$robustvcv, vcvnames)
}
# then the clustered covariance matrix
if(!is.null(cluster)) {
method <- attr(cluster,'method')
if(is.null(method)) method <- 'cgm'
dfadj <- (z$N-1)/z$df
d <- length(cluster)
if(method == 'cgm') {
# meat <- matrix(0,nrow(z$vcv),ncol(z$vcv))
meat <- matrix(0,Ncoef,Ncoef)
for(i in 1:(2^d-1)) {
# Find out which ones to interact
iac <- as.logical(intToBits(i))[1:d]
# odd number is positive, even is negative
sgn <- 2*(sum(iac) %% 2) - 1
# interact the factors
ia <- factor(do.call(paste,c(cluster[iac],sep='\004')))
adj <- sgn*dfadj*nlevels(ia)/(nlevels(ia)-1)
.Call(C_dsyrk,1,meat,adj,rowsum(xz,ia))
}
if(singlelhs) {
z$clustervcv <- inv %*% meat %*% inv
setdimnames(z$clustervcv, vcvnames)
} else {
z$STATS[[lhs]]$clustervcv <- inv %*% meat %*% inv
setdimnames(z$STATS[[lhs]]$clustervcv, vcvnames)
}
rm(meat)
## } else if(method == 'gaure') {
## .Call(C_scalecols, xz, 1/rscale)
## meat <- matrix(0,nrow(z$vcv),ncol(z$vcv))
## dm.res <- demeanlist(res,cluster)
## skel <- lapply(cluster, function(f) rep(0,nlevels(f)))
## means <- relist(kaczmarz(cluster,res-dm.res), skel)
## scale <- ifelse(dm.res==0,1e-40, dm.res)
## .Call(C_scalecols, xz, scale)
## .Call(C_dsyrk, 1, meat, dfadj, xz)
## .Call(C_scalecols, xz, 1/scale)
## for(i in seq_along(cluster)) {
## rs <- rowsum(xz, cluster[[i]])
## adj <- nlevels(cluster[[i]])/(nlevels(cluster[[i]])-1)
## .Call(C_scalecols, rs, means[[i]])
## .Call(C_dsyrk, 1, meat, dfadj*adj, rs)
## }
## rm(xz,rs)
## z$clustervcv <- inv %*% meat %*% inv
## rm(meat)
} else {
stop('unknown multi way cluster algorithm:',method)
}
if(singlelhs) {
z$cse <- sqrt(diag(z$clustervcv))
z$ctval <- coef(z)/z$cse
z$cpval <- 2*pt(abs(z$ctval),z$df,lower.tail=FALSE)
} else {
z$STATS[[lhs]]$cse <- sqrt(diag(z$STATS[[lhs]]$clustervcv))
z$STATS[[lhs]]$ctval <- z$coefficients[,lhs]/z$STATS[[lhs]]$cse
z$STATS[[lhs]]$cpval <- 2*pt(abs(z$STATS[[lhs]]$ctval),z$df,lower.tail=FALSE)
}
}
if(singlelhs) {
z$se <- sqrt(diag(z$vcv))
z$tval <- z$coefficients/z$se
z$pval <- 2*pt(abs(z$tval),z$df,lower.tail=FALSE)
z$rse <- sqrt(diag(z$robustvcv))
z$rtval <- coef(z)/z$rse
z$rpval <- 2*pt(abs(z$rtval),z$df,lower.tail=FALSE)
} else {
z$STATS[[lhs]]$se <- sqrt(diag(z$STATS[[lhs]]$vcv))
z$STATS[[lhs]]$tval <- z$coefficients[,lhs]/z$STATS[[lhs]]$se
z$STATS[[lhs]]$pval <- 2*pt(abs(z$STATS[[lhs]]$tval),z$df,lower.tail=FALSE)
z$STATS[[lhs]]$rse <- sqrt(diag(z$STATS[[lhs]]$robustvcv))
z$STATS[[lhs]]$rtval <- z$coefficients[,lhs]/z$STATS[[lhs]]$rse
z$STATS[[lhs]]$rpval <- 2*pt(abs(z$STATS[[lhs]]$rtval),z$df,lower.tail=FALSE)
}
# reset this for next lhs
if(!singlelhs) .Call(C_scalecols, xz, 1/rscale)
}
z
}
project <- function(mf,fl,data,contrasts,clustervar=NULL,pf=parent.frame()) {
m <- match(c("formula", "data", "subset", "na.action"), names(mf), 0L)
mf <- mf[c(1L, m)]
mf$drop.unused.levels <- TRUE
mf[[1L]] <- quote(model.frame)
subspec <- mf[['subset']]
# we should handle multiple lhs
# but how? model.frame() doesn't handle it, but we need
# model.frame for subsetting and na.action, with the left hand side
# included. We create an artifical single lhs by summing the left hand
# sides, just to get hold of the rhs. Then we extract the left hand side
Form <- as.Formula(mf[['formula']])
mf[['formula']] <- Form
mf <- eval(mf, pf)
mt <- attr(mf,'terms')
naact <- attr(mf,'na.action')
if(!is.null(naact))
naclass <- class(naact)
fullN <- nrow(mf) + length(naact)
# then obtain the response matrix through Formula::model.part
response <- as.matrix(model.part(Form, mf, lhs=NULL,rhs=0))
cmethod <- attr(clustervar,'method')
if(!is.null(clustervar)) {
if(is.character(clustervar)) clustervar <- as.list(clustervar)
if(!is.list(clustervar)) clustervar <- list(clustervar)
clustervar <- lapply(clustervar, function(cv) {
if(!is.character(cv)) factor(cv) else factor(data[,cv])
})
}
# we need to change clustervar and factor list to reflect
# subsetting and na.action. na.action is ok, it's set as an attribute in mf
# but subset must be done manually. It's done before na handling
if(!is.null(subspec)) {
subs <- eval(subspec,pf)
if(!is.null(clustervar)) clustervar <- lapply(clustervar,function(cv) cv[subs])
fl <- lapply(fl, function(fac) {
f <- factor(fac[subs])
x <- attr(f,'x')
if(is.null(x)) return(f)
structure(f,x=x[subs])
})
}
if(!is.null(naact)) {
if(!is.null(clustervar)) clustervar <- lapply(clustervar, function(cv) cv[-naact])
fl <- lapply(fl,function(fac) {
f <- factor(fac[-naact])
x <- attr(f,'x')
if(is.null(x)) return(f)
structure(f,x=x[-naact])
})
}
attr(clustervar,'method') <- cmethod
# ret <- list(fl=fl, na.action=naact,terms=mt,clustervar=clustervar, y=model.response(mf,'numeric'))
ret <- list(fl=fl, na.action=naact, fullN=fullN, terms=mt,clustervar=clustervar, y=response)
rm(mt,clustervar,naact)
lapply(ret$clustervar, function(f)
if(length(f) != nrow(ret$y)) stop('cluster factors are not the same length as data ',
length(f),'!=',nrow(ret$y)))
# in case of cluster factor specified with the clustervar argument:
# try a sparse model matrix to save memory when removing intercept
# though, demeanlist must be full. Ah, no, not much to save because
# it won't be sparse after centering
# we should rather let demeanlist remove the intercept, this
# will save memory by not copying. But we need to remove it below in x %*% beta
# (or should we extend beta with a zero at the right place, it's only
# a vector, eh, is it, do we not allow matrix lhs? No.)
# we make some effort to avoid copying the data matrix below
# this includes assigning to lists in steps, with gc() here and there.
# It's done for R 3.0.2. The copy semantics could be changed in later versions.
# ret$x <- model.matrix(ret$terms,mf,contrasts)
ret$x <- model.matrix(Form, mf, contrasts)
rm(mf)
ret$contrasts <- attr(ret$x,'contrasts')
ret$model.assign <- attr(ret$x, 'assign')
ret$model.labels <- attr(terms(Form[-2]), 'term.labels') # ditch lhs when finding terms
icpt <- attr(ret$x,'assign') == 0
if(!any(icpt)) icpt <- 0 else icpt <- which(icpt)
ret$icpt <- icpt
ncov <- ncol(ret$x) - (icpt > 0)
if(ncov == 0) {
ret$x <- NULL
ret$yxz <- list(y=demeanlist(ret$y,fl))
ret$Pp <- 0
ret$N <- length(ret$y)
ret$yx <- list(y=ret$y)
return(ret)
}
# here we need to demean things
# we take some care so that unexpected copies don't occur
# hmm, the list() copies the stuff. How can we avoid a copy
# and still enable parallelization over y and x in demeanlist? A vararg demeanlist?
# I.e. an .External version? Yes.
# yx <- list(y=ret$y, x=ret$x)
# gc()
# ret$yxz <- demeanlist(yx,fl,icpt)
# rm(fl,yx); gc()
# ret$yxz <- edemeanlist(y=ret$y,x=ret$x,fl=fl,icpt=c(0,icpt))
ret$yxz <- demeanlist(list(y=ret$y,x=ret$x),fl=fl,icpt=c(0,icpt))
ret$badconv <- attr(ret$yxz$x,'badconv') + attr(ret$yxz$y,'badconv')
# use our homebrewn setdimnames instead of colnames. colnames copies.
if(length(fl) > 0) {
if(icpt == 0)
setdimnames(ret$yxz$x, list(NULL,colnames(ret$x)))
else
setdimnames(ret$yxz$x, list(NULL,colnames(ret$x)[-icpt]))
}
ret
}
#' @export
..oldfelm <- function(formula, data, exactDOF=FALSE, subset, na.action, contrasts=NULL,...) {
knownargs <- c('iv', 'clustervar', 'cmethod', 'keepX', 'nostats')
keepX <- FALSE
cmethod <- 'cgm'
iv <- NULL
clustervar <- NULL
nostats <- FALSE
deprec <- c('iv', 'clustervar')
# sc <- names(sys.call())[-1]
# named <- knownargs[pmatch(sc,knownargs)]
# for(arg in c('iv', 'clustervar')) {
# if(!is.null(eval(as.name(arg))) && !(arg %in% named)) {
# warning("Please specify the '",arg,"' argument by name, or use a multi part formula. Its position in the argument list will change in a later version")
# }
# }
mf <- match.call(expand.dots = TRUE)
# Currently there shouldn't be any ... arguments
# check that the list is empty
# if(length(mf[['...']]) > 0) stop('unknown argument ',mf['...'])
# When moved to the ... list, we use this:
# we do it right away, iv and clustervar can't possibly end up in ... yet, not with normal users
args <- list(...)
ka <- knownargs[pmatch(names(args),knownargs, duplicates.ok=FALSE)]
names(args)[!is.na(ka)] <- ka[!is.na(ka)]
dpr <- deprec[match(ka, deprec)]
if(any(!is.na(dpr))) {
bad <- dpr[which(!is.na(dpr))]
warning('Argument(s) ',paste(bad,collapse=','), ' are deprecated and will be removed, use multipart formula instead')
}
env <- environment()
lapply(intersect(knownargs,ka), function(arg) assign(arg,args[[arg]], pos=env))
if(!(cmethod %in% c('cgm','gaure'))) stop('Unknown cmethod: ',cmethod)
# also implement a check for unknown arguments
unk <- setdiff(names(args), knownargs)
if(length(unk) > 0) stop('unknown arguments ',paste(unk, collapse=' '))
if(missing(data)) mf$data <- data <- environment(formula)
pf <- parent.frame()
pform <- parseformula(formula,data)
if(!is.null(iv) && !is.null(pform[['iv']])) stop("Specify EITHER iv argument(deprecated) OR multipart terms, not both")
if(!is.null(pform[['cluster']]) && !is.null(clustervar)) stop("Specify EITHER clustervar(deprecated) OR multipart terms, not both")
if(!is.null(pform[['cluster']])) clustervar <- structure(pform[['cluster']], method=cmethod)
if(is.null(iv) && is.null(pform[['iv']])) {
# no iv, just do the thing
fl <- pform[['fl']]
formula <- pform[['formula']]
mf[['formula']] <- formula
psys <- project(mf,fl,data,contrasts,clustervar,pf)
z <- doprojols(psys,exactDOF=exactDOF, nostats=nostats[1])
if(keepX) z$X <- if(psys$icpt > 0) psys$x[,-psys$icpt] else psys$x
rm(psys)
z$parent.frame <- pf
z$call <- match.call()
return(z)
}
# IV. Clean up formulas, set up for 1st stages
if(!is.null(iv)) {
# warning("argument iv is deprecated, use multipart formula instead")
if(!is.list(iv)) iv <- list(iv)
form <- pform[['formula']]
# Old syntax, the IV-variables are also in the main equation, remove them
for(ivv in iv) {
ivnam <- ivv[[2]]
# create the new formula by removing the IV lhs.
form <- update(form, substitute(. ~ . - Z,list(Z=ivnam)))
}
pform[['formula']] <- form
mf[['iv']] <- NULL
ivpart <- NULL
} else {
iv <- pform[['iv']]
ivpart <- as.Formula(pform[['ivpart']])
}
if(is.environment(data)) {
ivenv <- new.env(parent=data)
} else {
ivenv <- new.env(parent=pf)
}
if(!is.null(ivpart)) {
# ivpart is something like ~(P|Q ~ x+x2)
# strip ~ and ()
ivpart <- as.Formula(ivpart[[2]][[2]])
lhs <- formula(ivpart, lhs=NULL, rhs=0)
rhs <- as.Formula(formula(ivpart, lhs=0, rhs=1))
if(length(ivpart)[[2]] > 1) {
stop("Instruments can't be projected out: ", ivpart)
rhsg <- as.Formula(formula(ivpart, lhs=0, rhs=2))
} else {
rhsg <- list(NULL,NULL)
}
Form <- as.Formula(formula)
if(length(Form)[2] == 4) {
cluform <- formula(Form,lhs=0,rhs=4)[[2]]
step1form <- as.Formula(substitute(L ~ B + ivO | G|0|C,
list(L=lhs[[2]], B=pform[['formula']][[3]],
ivO=rhs[[2]],
G=pform[['gpart']][[2]],
C=cluform)))
} else {
step1form <- as.Formula(substitute(L ~ B + ivO | G,
list(L=lhs[[2]], B=pform[['formula']][[3]],
ivO=rhs[[2]],
G=pform[['gpart']][[2]])))
}
environment(step1form) <- environment(formula)
ivform <- parseformula(step1form,data)
fl <- ivform[['fl']]
mf[['formula']] <- ivform[['formula']]
environment(mf[['formula']]) <- environment(formula)
psys <- project(mf,fl,data,contrasts,clustervar,pf)
if(length(nostats) == 2)
nost <- nostats[2]
else
nost <- nostats[1]
z1 <- doprojols(psys, exactDOF=exactDOF, nostats=nost)
# put the fitted values in ivenv
for(n in colnames(z1$fitted.values)) {
nn <- paste(n,'(fit)',sep='')
assign(nn,z1$fitted.values[,n],envir=ivenv)
}
z1$endogvars <- paste('`',colnames(z1$fitted.values),'(fit)`',sep='')
# we should not use all.vars(rhs), to find the instruments, but
# pick up things from z1 somehow, in case there are expanded
# factors as instrumental variables
# in z1 there are model.assign and model.labels.
# we locate the instrument names (rhs) in model.labels, and extract the coefficient names from model.assign
cname <- rownames(z1$coefficients)
ivnam <- all.vars(rhs)
asgn <- z1$model.assign
if(length(asgn) != length(cname)) asgn <- asgn[asgn!=0]
# now assume there's a factor f among the instruments, with levels f2-f4 (1 is a reference)
# we look up 'f' in lab, it has position 5, then everything with assign == 5 belong to this factor
ivars <- cname[asgn %in% which(z1$model.labels %in% ivnam)]
z1$instruments <- ivars
# Store any dummy instruments in the ivenv, for possible later use in condfstat
# There's a psys$x which is the model matrix
# we may as well store all the instruments there
for(ivar in ivars) {
if(!(ivar %in% ivnam))
assign(ivar,psys$x[,ivar],envir=ivenv)
}
if(!nost) {
z1$iv1fstat <- lapply(z1$lhs, function(lh) waldtest(z1,ivars, lhs=lh))
names(z1$iv1fstat) <- z1$lhs
z1$rob.iv1fstat <- lapply(z1$lhs, function(lh) waldtest(z1,ivars,type='robust', lhs=lh))
names(z1$rob.iv1fstat) <- z1$lhs
}
z1$call <- match.call()
environment(step1form) <- environment(formula)
z1$call[['formula']] <- step1form
naact <- psys$na.action
FIT <- as.formula(paste('~',paste(z1$endogvars,sep='', collapse='+'),sep=''))
step2form <- as.Formula(substitute(y ~ B + FIT | G,
list(y=pform[['formula']][[2]],
B=pform[['formula']][[3]],
FIT=FIT[[2]],
G=pform[['gpart']][[2]])))
# do the first step
environment(step2form) <- environment(formula)
form2 <- parseformula(step2form, data)
fl <- form2[['fl']]
formula <- form2[['formula']]
environment(formula) <- ivenv
mf$formula <- formula
if(is.environment(mf$data)) mf$data <- ivenv
# remove the naact from 1st step
# any missings in the exogenous variables will be missing in both the 1st and 2nd stage
# If there are missing instruments or endogenous variables they will be missing in 1st stage, but not in the 2nd
# so we must make them missing in the 2nd stage as well. We implement that as a subset.
if(!is.null(naact)) {
# naact is a vector of observations to remove
# numbered after a subset has been taken.
# if there's no subset in mf, we create one
subset <- mf[['subset']]
if(is.null(subset)) {
mf[['subset']] <- -naact
} else {
# subset is an indexing vector for the rows in the data frame
# naact specifies rows to remove after subsetting.
# we simulate this process. Let's make an integer vector for the
# rows of the data. The total length can be found
mf[['subset']] <- seq_len(psys$fullN)[subset][-naact]
}
}
psys <- project(mf=mf,fl=fl,data=data,contrasts=contrasts,clustervar=clustervar,pf=pf)
ivres <- z1$residuals
colnames(ivres) <- as.character(sapply(all.vars(FIT),as.name))
z <- doprojols(psys, ivresid=ivres, exactDOF=exactDOF, nostats=nostats)
z$stage1 <- z1
z$st2call <- mf
# backwards compatibility
z$step1 <- lapply(z1$lhs, function(lh) {
# warning('Use stage1 instead of step1')
foo <- z1
foo$lhs <- lh
foo$beta <- z1$beta[,lh,drop=FALSE]
foo$coefficients <- foo$beta
foo$response <- z1$response[,lh,drop=FALSE]
foo$fitted.values <- z1$fitted.values[,lh,drop=FALSE]
foo$residuals <- z1$residuals[,lh,drop=FALSE]
foo$r.residuals <- z1$r.residuals[,lh,drop=FALSE]
if(!is.null(z1$iv1fstat)) {
foo$iv1fstat <- z1$iv1fstat[lh]
foo$rob.iv1fstat <- z1$rob.iv1fstat[lh]
}
if(!is.null(z1$STATS))
foo[names(z1$STATS[[lh]])] <- z1$STATS[[lh]]
foo[['STATS']] <- NULL
foo})
z$endovars <- z1$endogvars
z$parent.frame <- pf
rm(psys)
z$call <- match.call()
return(z)
}
# parse the IV-formulas, they may contain factor-parts
iv <- lapply(iv,parseformula,data)
# Now, insert the rhs of the IV in the formula
# find the ordinary and the factor part
# It may contain a factor part
# this is the template for the step1 formula, just insert a left hand side
step1form <- formula(as.Formula(substitute(~B +ivO | G + ivG,
list(B=pform[['formula']][[3]],G=pform[['gpart']][[2]]))))
# this is the template for the second stage, it is updated with the IV variables
step2form <- formula(as.Formula(substitute(y~B | G,
list(y=pform[['formula']][[2]],B=pform[['formula']][[3]],
G=pform[['gpart']][[2]]))))
nullbase <- formula(as.Formula(substitute(~B | G,
list(B=pform[['formula']][[3]],G=pform[['gpart']][[2]]))))
# we must do the 1. step for each instrumented variable
# collect the instrumented variables and remove them from origform
# then do the sequence of 1. steps
# we put the instrumented variable on the lhs, and add in the formula for it on the rhs
# A problem with this approach is that na.action is run independently between the first stages and
# the second stage. This may result in different number of observations. This isn't any good.
# I haven't figured out a good solution for this, except for creating the full model.matrix for
# all of the steps. This will blow our memory on large datasets.
ivarg <- list()
vars <- NULL
step1 <- list()
endolist <- c()
for(ivv in iv) {
# Now, make the full instrumental formula, i.e. with the rhs expanded with the
# instruments, and the lhs equal to the instrumented variable
ivlhs <- ivv[['formula']][[2]]
rhsivo <- formula(as.Formula(ivv[['formula']]),lhs=0)[[2]]
if(nopart(ivv[['gpart']]))
rhsivg <- 0
else
rhsivg <- formula(ivv[['gpart']],lhs=0,rhs=2)[[2]]
fformula <- substitute(Z ~ R, list(Z=ivlhs, R=step1form[[2]]))
fformula <- do.call(substitute, list(fformula,list(ivO=rhsivo,ivG=rhsivg)))
ivform <- parseformula(fformula,data)
fl <- ivform[['fl']]
mf[['formula']] <- ivform[['formula']]
# note that if there are no G() terms among the instrument variables,
# all the other covariates should only be centered once, not in every first stage and the
# second stage separately. We should rewrite and optimize for this.
psys <- project(mf,fl,data,contrasts,clustervar,pf)
z <- doprojols(psys, exactDOF=exactDOF)
mf[['formula']] <- fformula
z$call <- mf
rm(psys)
# now, we need an ftest between the first step with and without the instruments(null-model)
# We need the residuals with and without the
# instruments. We have them with the instruments, but must do another estimation
# for the null-model
mfnull <- mf
nullform <- substitute(Z ~ R,list(Z=ivlhs,R=nullbase[[2]]))
pformnull <- parseformula(nullform, data)
mfnull[['formula']] <- pformnull[['formula']]
znull <- doprojols(project(mfnull, pformnull[['fl']], data, contrasts, clustervar, pf),
exactDOF=exactDOF)
z$iv1fstat <- ftest(z,znull)
z$rob.iv1fstat <- ftest(z,znull,vcov=z$robustvcv)
if(!is.null(clustervar))
z$clu.iv1fstat <- ftest(z,znull,vcov=z$clustervcv)
step1 <- c(step1,list(z))
# then we lift the fitted variable and create a new name
ivz <- z
evar <- deparse(ivlhs)
new.var <- paste(evar,'(fit)',sep='')
# store them in an environment
assign(new.var,ivz$fitted.values,envir=ivenv)
# data[[new.var]] <- ivz$fitted
# save these, with the backtick for later use
vars <- c(vars,paste('`',new.var,'`',sep=''))
# keep the residuals, they are needed to reconstruct the residuals for the
# original variables in the 2. stage
ivarg[[paste('`',new.var,'`',sep='')]] <- ivz$residuals
# and add it to the equation
step2form <- update(as.Formula(step2form),as.formula(substitute(. ~ . + FIT | .,
list(FIT=as.name(new.var)))))
}
names(step1) <- names(iv)
# now we have a formula in step2form with all the iv-variables
# it's just to project it
pform <- parseformula(step2form,data)
fl <- pform[['fl']]
formula <- pform[['formula']]
environment(formula) <- ivenv
mf$formula <- formula
if(is.environment(mf$data)) mf$data <- ivenv
psys <- project(mf=mf,fl=fl,data=data,contrasts=contrasts,clustervar=clustervar,pf=pf)
z <- doprojols(psys,ivresid=ivarg,exactDOF=exactDOF)
z$step1 <- step1
z$endovars <- vars
rm(psys)
z$call <- match.call()
return(z)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.