Nothing
#' Extract the first and second stage regressors of ivreg formula
#'
#' \code{extract_variables} takes a formula object for [ivreg::ivreg()], i.e.
#' in a format of \code{y ~ x1 + x2 | x1 + z2} and extracts the different
#' elements in a list.
#'
#' @param formula A formula for the [ivreg::ivreg] function, i.e. in format
#' \code{y ~ x1 + x2 | z1 + z2}.
#'
#' @return \code{extract_variables} returns a list with three components:
#' \code{$yvar} stores the name of the dependent variable, \code{$first} the
#' names of the regressors of the first stage and \code{$second} the names of
#' the second stage regressors.
#'
#' @export
extract_variables <- function(formula) {
# convert formula to character vector of length 1
# use Reduce() to avoid length > 1 if formula is too long
fml <- Reduce(paste, deparse(formula))
# check that formula contains both "~" and "|" symbols
if (!(grepl("~", fml) && grepl("\\|", fml))) {
stop(strwrap("The `formula` is not of the required format since it does not
include both symbols `~` and `|`", prefix = " ", initial = ""))
}
# split formula into its three party: y, x, z
fml_split <- strsplit(fml, "~|\\|")
# ensure that the formula contains three parts
if (length(fml_split[[1]]) != 3) {
stop(strwrap("The `formula` does not consist of three parts as in
y ~ x1 + x2 | x1 + z2", prefix = " ", initial = ""))
}
# delete symbols and leading & trailing spaces, collect in character vector
yvar <- trimws(fml_split[[1]][1])
second <- fml_split[[1]][2]
second <- trimws(strsplit(second, "\\+|\\*")[[1]])
first <- fml_split[[1]][3]
first <- trimws(strsplit(first, "\\+|\\*")[[1]])
if (yvar == "") {
stop(strwrap("The `formula` does not specify any dependent variable",
prefix = " ", initial = ""))
}
vars <- list(yvar = yvar, first = first, second = second)
return(vars)
}
#' Takes ivreg formula and returns formula compatible with model selection
#'
#' \code{new_formula} takes a formula object for [ivreg::ivreg()], i.e. in a
#' format of \code{y ~ x1 + x2 | x1 + z2}, and returns a list with element
#' suitable for model selection. For example, it updates the data by creating
#' an intercept if specified in the formula, checks for collinearity among the
#' regressors, and updates the formula accordingly.
#'
#' @inheritParams extract_variables
#' @inheritParams ivgets
#' @param data A data frame.
#'
#' @return A list with several named elements. Component \code{$fml} stores the
#' new baseline formula that will be used for model selection. Components
#' \code{y}, \code{x}, and \code{z} store the data of the dependent variable,
#' structural regressors, and excluded instruments. The entries
#' \code{$depvar}, \code{$x1}, \code{$x2}, \code{$z1}, and \code{$z2} contain
#' the names of the dependent variable, endogenous and exogenous regressors,
#' included and excluded instruments. \code{$dx1}, \code{$dx2}, \code{$dz1},
#' \code{$dz2} store the dimensions of the respective variables. Finally,
#' \code{$keep} and \code{$keep.names} contain the indices and names of the
#' regressors that will not be selected over.
#'
#' @export
new_formula <- function(formula, data, keep_exog) {
if (!(is.numeric(keep_exog) | is.character(keep_exog) | is.null(keep_exog))) {
stop("Argument 'keep_exog' must either be NULL, numeric, or a character.")
}
keep_intercept <- FALSE
# keep_exog = selection of exogenous regressors that should not select over
if (is.numeric(keep_exog)) {
keep_intercept <- (0 %in% keep_exog)
if (length(setdiff(keep_exog, 0)) == 0) { # only keep intercept
keep_names <- NULL
} else {
keep_names <- colnames(data)[setdiff(keep_exog, 0)]
}
} else if (is.character(keep_exog)) {
keep_intercept <- ("Intercept" %in% keep_exog)
if (length(setdiff(keep_exog, "Intercept")) == 0) { # only keep intercept
keep_names <- NULL
} else {
keep_names <- setdiff(keep_exog, "Intercept")
if (any(!(keep_names %in% colnames(data)))) {
stop("Argument 'keep_exog' specifies names that cannot be found in the data frame.")
}
}
} else {
keep_names <- NULL
}
# keep_intercept is TRUE if keep_exog had index 0 or "Intercept"
# keep_names contains the names of variables excluding any intercept name
# keep_names is NULL if no variables should be kept (except intercept maybe)
# check formula: extract regressors and check presence of intercept
vars <- extract_variables(formula = formula)
if (any(c("-1", "0") %in% vars$first)) {
intercept1 <- FALSE
} else {
intercept1 <- TRUE
}
if (any(c("-1", "0") %in% vars$second)) {
intercept2 <- FALSE
} else {
intercept2 <- TRUE
}
# throw errors for two specific cases; require (no) intercept to be specified
# symmetrically for the first and second stage at the moment
if (intercept2 == TRUE & intercept1 == FALSE) {
stop("If have intercept in the structural equation, it should also be in the first stage.")
}
if (intercept2 == FALSE & intercept1 == TRUE) {
stop("Using intercept as excluded instrument in the first stage is currently not supported.")
}
# extract the base variables (this excludes any intercept specifications)
reg1_base <- setdiff(vars$first, c("-1", "0", "1"))
reg2_base <- setdiff(vars$second, c("-1", "0", "1"))
x2_base <- setdiff(reg2_base, reg1_base)
x1_base <- setdiff(reg2_base, x2_base)
z1_base <- x1_base
z2_base <- setdiff(reg1_base, x1_base)
# extract the base vectors/matrices of regressand, regressors, instruments
y <- as.vector(data[, vars$yvar])
x <- as.matrix(data[, c(x1_base, x2_base), drop = FALSE], drop = FALSE)
z <- as.matrix(data[, z2_base, drop = FALSE], drop = FALSE)
# check for perfect multicollinearity issues
# multicollinearity in original matrix
if (length(colnames(x)) > length(colnames(gets::dropvar(x, silent = TRUE)))) {
stop("Original formula specification has perfect collinearity. Please adjust regressors.")
}
# create new structural equation regressor matrix
if (intercept2) {
xnames <- colnames(x)
# check whether regressor named "Intercept" already present in formula
if ("Intercept" %in% xnames) {
stop("Formula specification creates an intercept but regressor named \"Intercept\" is also included in formula. Please specify \"-1\" in the formula.")
}
# add an intercept
x <- cbind(1, x)
colnames(x)[1] <- "Intercept"
xnew <- gets::dropvar(x, silent = TRUE)
# returns matrix as before but now has a column with constant 1 called
# "Intercept". If had intercept present before then the other one is
# dropped.
dropnames <- setdiff(xnames, colnames(xnew))
# rank deficient by more than 1 (should not happen)
if (length(dropnames) > 1) { # nocov start
stop("Original structural equation had multicollinearity issues. Please re-specify the initial model such that no multicollinearity arises.")
} # nocov end
# dropnames can only have dropped a regressor that was perfectly collinear
# with the intercept; so can delete that regressor from the exogenous regr.
# but add the regressor Intercept
if (length(dropnames) == 1) { # rank deficient by 1
warning("Intercept has introduced collinearity. Drop one regressor: ", dropnames)
}
if (any(dropnames %in% keep_names)) {
stop("A regressor that was specified in 'keep_exog' has been dropped due to multicollinearity. Please re-specify model.")
}
x <- xnew
x1_base <- c("Intercept", setdiff(x1_base, dropnames))
z1_base <- x1_base
}
# find the indices of the regressors that should be kept, indices corr. to x
if (keep_intercept) {
keep_names <- union("Intercept", keep_names)
}
# also need to keep all endogenous regressors
keep <- union(keep_names, x2_base)
keep_names <- keep
keep <- which(colnames(x) %in% keep_names)
# in practice, keep should never be empty unless we are really in OLS case
# nocov start
if (length(keep) == 0) { keep <- NULL }
# nocov end
# save number of the types of regressors
dx1 <- length(x1_base)
dx2 <- length(x2_base)
dz1 <- length(z1_base)
dz2 <- length(z2_base)
# sanity check
if (!identical(dx1, dz1)) { # nocov start
stop("Something went wrong. dx1 should be equal to dz1.")
} # nocov end
# create new baseline formula
baseline_x <- paste(c("-1", x1_base, x2_base), sep = "", collapse = "+")
baseline_z <- paste(c("-1", z1_base, z2_base), sep = "", collapse = "+")
baseline <- paste(c(vars$yvar, baseline_x), sep = "", collapse = " ~ ")
baseline <- paste(c(baseline, baseline_z), sep = "", collapse = " | ")
out <- list(fml = baseline, y = y, x = x, z = z, depvar = vars$yvar,
x1 = x1_base, x2 = x2_base, z1 = z1_base, z2 = z2_base, dx1 = dx1,
dx2 = dx2, dz1 = dz1, dz2 = dz2, keep = keep,
keep.names = keep_names)
return(out)
}
#' Function factory for creating indicators from their names
#'
#' \code{factory_indicators} creates a function that takes the name of an
#' indicator and returns the corresponding indicator to be used in a regression.
#' For user-specified indicators, it extracts the corresponding column from the
#' uis matrix.
#'
#' @param n An integer specifying the length of the indicators.
#'
#' @return \code{factory_indicators} returns a function called \code{creator()}.
#'
#' @details Argument \code{n} should equal the number of observations in the
#' data set which will be augmented with the indicators.
#'
#' The created function takes a name of an indicator and the original uis
#' argument that was used in indicator saturation and returns the indicator.
#'
#' @export
factory_indicators <- function(n) {
if (!is.numeric(n)) {
stop("Argument 'n' must be a single numeric value (integer).")
}
if (!identical(length(n), 1L)) {
stop("Argument 'n' must have length 1.")
}
if (!(n %% 1 == 0)) {
stop("Argument 'n' must be an integer.")
}
if (n < 1) {
stop("Argument 'n' must equal the sample size, so cannot be 0 or negative.")
}
creator <- function(name, uis) {
type <- stringr::str_extract(string = name, pattern = "^[[:alpha:]]+")
no <- as.numeric(stringr::str_extract(string = name,
pattern = "[1-9]([0-9]+)?"))
indicator <- NULL # initialise as NULL
if (type %in% c("iis", "sis", "tis") & (no > n)) {
stop("Specified iis, sis, or tis of length larger than sample size.")
}
if (identical(type, "iis")) {
indicator <- c(rep(0, times = (no - 1)), 1, rep(0, times = (n - no)))
indicator <- as.matrix(indicator, ncol = 1, nrow = n)
colnames(indicator) <- name
} else if (identical(type, "sis")) {
indicator <- c(rep(0, times = (no - 1)), rep(1, times = (n - no + 1)))
indicator <- as.matrix(indicator, ncol = 1, nrow = n)
colnames(indicator) <- name
} else if (identical(type, "tis")) {
m <- matrix(0,n,n)
v1n <- seq(1,n)
loop.indx <- 1:n
tmp <- function(i){
m[c(i:n),i] <<- v1n[1:c(n-i+1)]
}
sapply(loop.indx,tmp)
indicator <- m[, no]
indicator <- as.matrix(indicator, ncol = 1, nrow = n)
colnames(indicator) <- name
} else if (is.matrix(uis)) { # uis is a matrix; could be named or not
if (identical(type, "uisxreg")) { # indicator from unnamed matrix uis
indicator <- uis[, no, drop = FALSE] # retrieve by indicator index
} else { # indicator from named matrix uis
indicator <- uis[, name, drop = FALSE] # retrieve by indicator name
} # end if uis is matrix
} else if (is.list(uis)) { # uis is a list of matrices; cols must be named
for (matx in seq_along(uis)) {
if (name %in% colnames(uis[[matx]])) { # found
indicator <- uis[[matx]][, name, drop = FALSE] # retrieve by ind name
break
} # end if found
} # end for through all matrices in uis
} # end if uis is list
if (is.null(indicator)) {
stop("Retained indicator could not be created or found.")
} else {
return(indicator)
} # check that indicator has really been found
}
return(creator) # return a function
}
#' 2SLS estimator
#'
#' 2SLS estimator that does not allow for weights, offset, other methods than
#' 2SLS, and does not calculate influence statistics. Supposedly faster but
#' returns little output.
#'
#' @inheritParams ivgets
#'
#' @return \code{twosls()} returns a list with eight names elements:
#' \code{$coefficients} stores the coefficient estimates of the second stage,
#' \code{$residuals} the residuals of the structural equation (i.e. using X
#' and not Xhat), \code{$fitted.values} the fitted values of the second stage,
#' \code{$n} and \code{$nobs} the sample size, \code{$k} the number of
#' regressors in the structural equation, \code{$cov.unscaled} the unscaled
#' variance-covariance matrix, and \code{$sigma} the degrees-of-freedom
#' adjusted equation standard error.
#'
#' @section WARNING:
#' The return value is given class [ivreg::ivreg()] but it is not a true
#' \code{"ivreg"} object. This does not pose any problems for internal use but
#' should not be used outside of its usage in its current form. The class
#' assignment is likely to change in the future.
#'
#' @importFrom stats delete.response lm.fit model.matrix model.response terms
#' @keywords internal
twosls <- function(formula, data) {
# can only use this function when "Formula" is installed
if (!requireNamespace("Formula", quietly = TRUE)) { # nocov start
stop("Package 'Formula' must be installed to use this function.", .call = FALSE)
} # nocov end
# capture function call
mf <- match.call()
mf$drop.unused.levels <- TRUE
formula <- Formula::as.Formula(formula)
mf$formula <- formula
# convert formula to model frame
mf[[1]] <- as.name("model.frame")
mf <- eval(mf, parent.frame())
Y <- model.response(mf, "numeric") # extract depvar
mtX <- terms(formula, data= data, rhs = 1)
X <- model.matrix(mtX, mf, contrasts.arg = NULL) # extract X
mtZ <- delete.response(terms(formula, data = data, rhs = 2))
Z <- model.matrix(mtZ, mf, contrasts.arg = NULL) # extract Z
n <- NROW(Y)
k <- ncol(X)
stopifnot(n == nrow(X))
stopifnot(n == nrow(Z))
stopifnot(ncol(Z) >= ncol(X))
fs <- lm.fit(Z, X) # run first stage
Xhat <- as.matrix(fs$fitted.values) # first stage fitted values
colnames(Xhat) <- colnames(X)
ss <- lm.fit(Xhat, Y) # run second stage
notna <- which(!is.na(ss$coefficients)) # regressors for which coefficient is not NA
Yhat <- drop(X[, notna, drop = FALSE] %*% ss$coefficients[notna]) # fitted values of second stage using actual x values
names(Yhat) <- names(Y)
res <- Y - Yhat # residuals of second stage
varcov <- chol2inv(ss$qr$qr[1:length(notna), 1:length(notna), drop = FALSE])
colnames(varcov) <- rownames(varcov) <- names(ss$coefficients[notna])
sigma <- sqrt(sum(res^2)/ss$df.residual)
rval <- list(coefficients = ss$coefficients, residuals = res,
fitted.values = Yhat, n = n, nobs = n, k = k,
cov.unscaled = varcov, sigma = sigma)
# give class "ivreg" so that can use methods for ivreg objects
class(rval) <- "ivreg"
return(rval)
}
#' 2SLS estimator alternative
#'
#' Test whether is faster than [ivgets::twosls()] but this does not seem to be
#' the case.
#'
#' @inheritParams twosls
#' @inheritSection twosls WARNING
#'
#' @importFrom stats delete.response lm.fit model.matrix model.response terms
#' @keywords internal
twosls.alt <- function(formula, data) { # nocov start
# can only use this function when "Formula" is installed
if (!requireNamespace("Formula", quietly = TRUE)) { # nocov start
stop("Package 'Formula' must be installed to use this function.", .call = FALSE)
} # nocov end
# capture function call
mf <- match.call()
mf$drop.unused.levels <- TRUE
formula <- Formula::as.Formula(formula)
mf$formula <- formula
# convert formula to model frame
mf[[1]] <- as.name("model.frame")
mf <- eval(mf, parent.frame())
Y <- model.response(mf, "numeric") # extract depvar
mtX <- terms(formula, data= data, rhs = 1)
X <- model.matrix(mtX, mf, contrasts.arg = NULL) # extract X
mtZ <- delete.response(terms(formula, data = data, rhs = 2))
Z <- model.matrix(mtZ, mf, contrasts.arg = NULL) # extract Z
n <- NROW(Y)
k <- ncol(X)
stopifnot(n == nrow(X))
stopifnot(n == nrow(Z))
stopifnot(ncol(Z) >= ncol(X))
# first stage
qZ <- qr(Z, tol = 1e-07, LAPACK = FALSE)
fscoef <- solve.qr(qZ, X, tol = 1e-07)
fsfit <- (Z %*% fscoef)
colnames(fsfit) <- colnames(X)
# second stage
qXhat <- qr(fsfit, tol = 1e-07, LAPACK = FALSE)
sscoef <- solve.qr(qXhat, Y, tol = 1e-07)
notna <- which(!is.na(sscoef)) # regressors for which coefficient is not NA
ssfit <- X[, notna, drop = FALSE] %*% sscoef[notna] # fitted values of second stage using actual x values
names(ssfit) <- names(Y)
ssres <- Y - ssfit
varcov <- chol2inv(qXhat$qr[1:length(notna), 1:length(notna), drop = FALSE])
colnames(varcov) <- rownames(varcov) <- names(sscoef[notna])
sigma <- sqrt(sum(ssres^2)/(n-k))
rval <- list(coefficients = sscoef, residuals = ssres,
fitted.values = ssfit, n = n, nobs = n, k = k,
cov.unscaled = varcov, sigma = sigma)
# give class "ivreg" so that can use methods for ivreg objects
class(rval) <- "ivreg"
return(rval)
} # nocov end
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.