eivreg <- function(formula, data, subset, weights, na.action, method = "qr", model = TRUE, x = FALSE, y = FALSE, qr = TRUE, singular.ok = FALSE, contrasts = NULL, reliability = NULL, Sigma_error = NULL, cluster_varname = NULL, df_adj = FALSE, stderr = TRUE, offset, ...) {
###############################################################
## errors in variable regression. code adapted from lm().
##
## function fits both the standard (uncorrected) linear model and the
## corrected version
###############################################################
if(singular.ok) {
stop("singular EIV models not currently implemented")
}
if( (is.null(reliability) && is.null(Sigma_error)) || (!is.null(reliability) && !is.null(Sigma_error)) ){
stop("exactly one of 'reliability' or 'Sigma_error' must be specified")
}
## check vector of reliabilities
if( !is.null(reliability) ){
if( !is.numeric(reliability) || !is.vector(reliability) || !is.null(dim(reliability)) ){
stop("reliability must be numeric vector")
}
if(any(is.na(reliability))){
stop("reliability cannot have missing values")
}
relnames <- names(reliability)
if(is.null(relnames)){
stop("reliability vector must be named")
}
if(any(duplicated(relnames))){
stop("reliability names not unique")
}
if(any(reliability <= 0) || any(reliability > 1)){
stop("reliability must take values in (0,1]")
}
numep <- length(reliability) # number of error-prone covariates
}
## check error covariance matrix
if( !is.null(Sigma_error) ){
if( !is.numeric(Sigma_error) || !is.matrix(Sigma_error) || (nrow(Sigma_error) != ncol(Sigma_error)) ){
stop("Sigma_error must be square numeric matrix")
}
if(any(is.na(Sigma_error))){
stop("Sigma_error cannot have missing values")
}
if(is.null(dimnames(Sigma_error))){
stop("Sigma_error must have dimnames")
}
numep <- nrow(Sigma_error)
.colnames <- colnames(Sigma_error)
.rownames <- rownames(Sigma_error)
if(!is.null(.colnames) && !is.null(.rownames) && any(.colnames != .rownames)){
stop("measurement error covariance matrix has mismatched row and column names")
}
if(!is.null(.colnames)){
relnames <- rownames(Sigma_error) <- .colnames
} else {
relnames <- colnames(Sigma_error) <- .rownames
}
if(any(duplicated(relnames))){
stop("Sigma_error names not unique")
}
if( (max(abs(Sigma_error - t(Sigma_error))) > 1e-10) || any(eigen(Sigma_error)$values < 0) ){
stop("Sigma_error must be symmetric and positive semi-definite")
}
}
## basic checks on cluster_varname argument (more checks on content later)
if( !is.null(cluster_varname) ){
if( !is.character(cluster_varname) ){
stop("cluster_varname must be a character string providing the name of the clustering variable")
}
if( !is.data.frame(data) ){
stop("cluster option requires data argument")
}
if( !(cluster_varname %in% names(data)) ){
stop("cluster_varname is not in data")
}
}
###############################################################
## begin code taken from lm()
###############################################################
ret.x <- x
ret.y <- y
cl <- match.call()
mf <- match.call(expand.dots = FALSE)
m <- match(c("formula", "data", "subset", "weights", "na.action", "offset"), names(mf), 0L)
mf <- mf[c(1L, m)]
mf$drop.unused.levels <- TRUE
mf[[1L]] <- quote(stats::model.frame)
mf <- eval(mf, parent.frame())
if (method == "model.frame"){
return(mf)
} else if (method != "qr") {
warning(gettextf("method = '%s' is not supported. Using 'qr'", method), domain = NA)
}
mt <- attr(mf, "terms")
y <- model.response(mf, "numeric")
w <- as.vector(model.weights(mf))
if (!is.null(w) && !is.numeric(w)) {
stop("'weights' must be a numeric vector")
}
## NOTE: added to ensure that weights are positive and sum to the number of
## observations, which is the only case we are considering for EIV regression.
## Also note that cases with missing weights are dropped prior to this, just
## like cases with missing covariates.
if ( !is.null(w) ){
if( any(w <= 0) ){
stop("weights must be positive")
}
w <- length(w) * (w / sum(w))
}
offset <- as.vector(model.offset(mf))
if (!is.null(offset)) {
if (length(offset) != NROW(y)) {
stop(gettextf("number of offsets is %d, should equal %d (number of observations)", length(offset), NROW(y)), domain = NA)
}
}
if (is.empty.model(mt)) {
x <- NULL
z <- list(coefficients = if (is.matrix(y)) matrix(, 0,
3) else numeric(), residuals = y, fitted.values = 0 *
y, weights = w, rank = 0L, df.residual = if (!is.null(w)) sum(w !=
0) else if (is.matrix(y)) nrow(y) else length(y))
if (!is.null(offset)) {
z$fitted.values <- offset
z$residuals <- y - offset
}
}
else {
x <- model.matrix(mt, mf, contrasts)
z <- if (is.null(w))
lm.fit(x, y, offset = offset, singular.ok = singular.ok,
...)
else lm.wfit(x, y, w, offset = offset, singular.ok = singular.ok,
...)
}
class(z) <- c(if (is.matrix(y)) "mlm", "lm")
z$na.action <- attr(mf, "na.action")
z$offset <- offset
z$contrasts <- attr(x, "contrasts")
z$xlevels <- .getXlevels(mt, mf)
z$call <- cl
z$terms <- mt
if (model)
z$model <- mf
if (ret.x)
z$x <- x
if (ret.y)
z$y <- y
## NOTE: QR needed later, so this code moved down
## if (!qr)
## z$qr <- NULL
###############################################################
## end code taken from lm()
###############################################################
class(z) <- c("eivlm")
if(is.null(x)){
return(z)
}
if(NCOL(y) > 1){
stop("EIV models with matrix response not currently implemented")
}
###############################################################
## deal with offset:
## define as 0s if it doesn't exist and use it throughout to simplify code
###############################################################
z$N <- .N <- nrow(x)
if(is.null(offset)){
offset <- rep(0, .N)
}
ymo <- y - offset
###############################################################
## deal with weights:
## define as 1s if it doesn't exist and use it throughout to simplify code.
###############################################################
if( !is.null(w) ){
wgt <- z$weights
} else {
wgt <- rep(1.0, .N)
}
###############################################################
## deal with cluster:
## if cases with missing values were dropped during fit, need to align
###############################################################
if( !is.null(cluster_varname) ){
z$cluster_varname <- cluster_varname
z$cluster_values <- data[,cluster_varname]
if( !is.null(attr(mf, "na.action")) ){
z$cluster_values <- z$cluster_values[ setdiff(1:nrow(data), as.vector(attr(mf, "na.action"))) ]
stopifnot(length(z$cluster_values) == .N)
}
if( any(is.na(z$cluster_values)) ){
stop("clustering variable cannot have missing values among cases used in estimation")
}
z$cluster_num <- length(unique(z$cluster_values))
if( z$cluster_num < 2 ){
stop("clustering variable must take on at least two distinct values")
}
}
###############################################################
## test that all error-prone variables are numeric
## and show up only as main effects
###############################################################
v <- attributes(mt)$term.labels
if( !all(relnames %in% v) ){
stop("there do not appear to be main effects for all error-prone variables")
}
if( !all(attributes(mt)$dataClasses[relnames] == "numeric") ){
stop("error-prone variables must be numeric")
}
v <- make.names(v)
ok <- sapply(relnames, function(n){
length(grep(paste("^",n,"\\.+|\\.+",n,"\\.+|\\.+",n,"$",sep=""), v)) == 0
})
if( !all(ok) ){
stop("at least one error-prone variable is used in a term that is not a main effect\n Not currently implemented")
}
###############################################################
## get cross-product matrix and correction matrix "S"
###############################################################
tmp <- cov.wt( cbind(ymo, x), wt = wgt, method = "ML")
meanYWZ <- tmp$center
varYWZ <- tmp$cov
rm(tmp); gc()
XpX <- crossprod(qr.R(z$qr)) ## == crossprod(x * sqrt(wgt))
S <- matrix(0.0, ncol = ncol(XpX), nrow = nrow(XpX))
dimnames(S) <- dimnames(XpX)
posep <- match(relnames, colnames(S)) ## positions of error-prone covariates
stopifnot(all(relnames == colnames(x)[posep]))
if( !is.null(reliability) ){
diag(S)[posep] <- .N * (1.0 - reliability) * diag(varYWZ)[1 + posep]
} else {
for(.i in 1:numep){
for(.j in 1:numep){
S[posep[.i],posep[.j]] <- .N * Sigma_error[.i,.j]
}
}
}
###############################################################
## check that estimated covariance matrix of (Y,X,Z) is valid
##
## NOTE: Stata manual notes that reliability for each variable must be greater
## than or equal to the R^2 of the regression of that variable on all others
## plus the outcome. This condition appears to be equivalent to testing that
## the estimated covariance matrix of (Y,X,Z) is positive semi-definite. Need
## to be careful because the presence of an intercept among (X,Z), or columns
## that add up to an intercept (e.g. mutually exclusive and exhaustive
## grouping indicators) leads varYXZ to have an eigenvalue of zero. There
## should be no more than one such eigenvalue because we are not allowing
## singular.ok=TRUE. Numerically, such an eigenvalue may evaluate to a number
## that is nearly zero, but still negative. So we have to invoke a threshold
## to decide that a negative eigenvalue is sufficiently negative to indicate a
## problem.
##
## NOTE: adjusted regression coefficients could be computed directly from varYXZ
##############################################################
varYXZ <- varYWZ - (rbind( rep(0, ncol(x) + 1), cbind( rep(0, ncol(x)), S)) /.N)
if(any(eigen(varYXZ)$values < -1e-8)){
stop("reliability is too small (or measurement error variances are too large)")
}
gc(verbose=FALSE)
###############################################################
## get adjusted coefficients and build other pieces
###############################################################
z$unadj_coefficients <- z$coefficients
z$coefficients <- bhat <- drop(solve(XpX - S, crossprod(x * wgt, ymo)))
z$unadj_fitted.values <- z$fitted.values
z$fitted.values <- drop(x %*% bhat) + offset
z$unadj_residuals <- z$residuals
z$residuals <- y - z$fitted.values
z$unadj_effects <- z$effects
z$effects <- NULL
if(!qr){
z$qr <- NULL
} else {
z$unadj_qr <- z$qr
z$qr <- NULL
}
z$reliability <- reliability
z$Sigma_error <- Sigma_error
z$relnames <- relnames
z$XpX_adj <- XpX - S
z$varYXZ <- varYXZ
z$latent_resvar <- (sum(wgt * (ymo^2)) - as.numeric( matrix(bhat,nrow=1) %*% z$XpX_adj %*% matrix(bhat,ncol=1) )) / z$df.residual
###############################################################
## Sandwich SEs
## NOTE: loop over observations rather than use lapply to reduce
## storage problems with large dim models
##
###############################################################
if(stderr){
## Fork on whether we are in the known reliability or known error
## variance case to define function that computes psi and psidot
if( !is.null(reliability) ){
get_psi_and_psidot <- function(i, theta){
## i = observation index in 1:.N
## theta = (means of error-prone covariates, MLE variances of error-prone covariates, beta)
## NOTE: references (reliability, ymo, x, posep, numep) in parent environment
stopifnot(length(theta) == (2*numep + ncol(x)))
vhat <- rep(0.0, ncol(x))
vhat[posep] <- (1.0 - reliability) * theta[numep + 1:numep]
qi <- matrix(x[i,], ncol=1)
qqprime_adj <- qi %*% t(qi)
diag(qqprime_adj) <- diag(qqprime_adj) - vhat
psi <- as.vector(c( qi[posep,] - theta[1:numep],
(qi[posep,] - theta[1:numep])^2 - theta[numep + 1:numep],
ymo[i]*qi - (qqprime_adj %*% theta[2*numep + 1:ncol(x)])))
row1 <- cbind(-1.0*diag(numep),
matrix(0.0, nrow=numep, ncol=numep),
matrix(0.0, nrow=numep, ncol=ncol(x)))
row2 <- cbind(-2.0*diag(as.vector(qi[posep,] - theta[1:numep]),nrow=numep, ncol=numep),
-1.0 * diag(numep),
matrix(0.0, nrow=numep, ncol=ncol(x)))
## tricky piece: derivatives of beta equations w.r.t. error-prone variable latent variances
tmp <- matrix(0.0, nrow=ncol(x), ncol=numep)
for(j in 1:numep){
tmp[posep[j],j] <- (1.0 - reliability[j]) * theta[2*numep + posep[j]]
}
row3 <- cbind(matrix(0.0, nrow=ncol(x), ncol=numep), tmp, -1.0 * qqprime_adj)
psidot <- rbind(row1, row2, row3)
stopifnot( (nrow(psidot) == ncol(psidot)) && (nrow(psidot) == (2*numep + ncol(x))) )
return(list(psi = psi, psidot = psidot))
}
} else {
get_psi_and_psidot <- function(i, theta){
## i = observation index in 1:.N
## theta = beta
## NOTE: references (S, ymo, x, posep) in parent environment
stopifnot(length(theta) == ncol(x))
qi <- matrix(x[i,], ncol=1)
qqprime_adj <- qi %*% t(qi)
qqprime_adj <- qqprime_adj - (S / .N)
return(list(psi = as.vector(ymo[i]*qi - (qqprime_adj %*% theta)),
psidot = -1.0 * qqprime_adj))
}
}
##############################################
## compute sandwich variance/covariance matrix
##############################################
## 1) define estimated parameter
if( !is.null(reliability) ){
thetahat <- c(meanYWZ[1 + posep], diag(varYWZ)[1 + posep], bhat)
} else {
thetahat <- bhat
}
## 2) get psi and psidot at solution and check that psi satisfies appropriate constraint
psi <- matrix(0.0, nrow = .N, ncol = length(thetahat))
psidot <- matrix(0.0, nrow = length(thetahat), ncol = length(thetahat))
for(i in 1:.N){
tmp <- get_psi_and_psidot(i, theta = thetahat)
psi[i,] <- tmp$psi
psidot <- psidot + (wgt[i] * tmp$psidot)
}
psidot <- psidot / .N
if(max(abs(apply(psi, 2, FUN = weighted.mean, w = wgt))) > 1e-9){
stop("EIV solution does not satisfy estimating equations - consider changing scale of data")
}
gc(verbose=FALSE)
## 3) compute variance, restricting to regression coeffs in known reliability case
## NOTE: B is either the sum of individual-level outer products, or the sum across
## clusters of the outer products of the cluster sums.
if( is.null(cluster_varname) ){
B <- crossprod(psi * wgt)
} else {
grpsums <- apply(psi * wgt, 2, function(x){ tapply(x, z$cluster_values, sum) })
B <- Reduce("+",lapply(1:nrow(grpsums), function(i){ tcrossprod(grpsums[i,]) }))
}
Ainv <- solve(psidot)
fudge <- ifelse(df_adj, (.N / z$df.residual), 1.0) *
ifelse(!is.null(z$cluster_varname), (z$cluster_num / (z$cluster_num -1)), 1.0)
z$vcov <- fudge * (Ainv %*% (B / .N^2) %*% t(Ainv))
if( !is.null(reliability) ){
z$vcov <- z$vcov[2*numep + 1:ncol(x),2*numep + 1:ncol(x)]
}
rownames(z$vcov) <- colnames(z$vcov) <- colnames(x)
rm(psi, psidot)
gc(verbose=FALSE)
}
z
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.