Nothing
#' Estimation of limited dependent variable models
#'
#' mhurdle fits a large set of models relevant when the dependent variable is 0
#' for a part of the sample.
#'
#'
#' \code{mhurdle} fits models for which the dependent variable is zero for a
#' part of the sample. Null values of the dependent variable may occurs because
#' of one or several mechanisms : good rejection, lack of ressources and
#' purchase infrequency. The model is described using a three-parts formula :
#' the first part describes the selection process if any, the second part the
#' regression equation and the third part the purchase infrequency process.
#' \code{y ~ 0 | x1 + x2 | z1 + z2} means that there is no selection process.
#' \code{y ~ w1 + w2 | x1 + x2 | 0} and \code{y ~ w1 + w2 | x1 + x2} describe
#' the same model with no purchase infrequency process. The second part is
#' mandatory, it explains the positive values of the dependant variable. The
#' \code{dist} argument indicates the distribution of the error term. If
#' \code{dist = "n"}, the error term is normal and (at least part of) the zero
#' observations are also explained by the second part as the result of a corner
#' solution. Several models described in the litterature are obtained as
#' special cases :
#'
#' A model with a formula like \code{y~0|x1+x2} and \code{dist="n"} is the
#' Tobit model proposed by Tobin (1958).
#'
#' \code{y~w1+w2|x1+x2} and \code{dist="l"} or \code{dist="t"} is the single
#' hurdle model proposed by Cragg (1971). With \code{dist="n"}, the double
#' hurdle model also proposed by Cragg (1971) is obtained. With
#' \code{corr="h1"} we get the correlated version of this model described by
#' Blundell (1987).
#'
#' \code{y~0|x1+x2|z1+z2} is the P-Tobit model of Deaton and Irish (1984),
#' which can be a single hurdle model if \code{dist="t"} or \code{dist="l"} or
#' a double hurdle model if \code{dist="n"}.
#'
#' @aliases mhurdle coef.mhurdle vcov.mhurdle logLik.mhurdle print.mhurdle
#' summary.mhurdle print.summary.mhurdle predict.mhurdle update.mhurdle
#' fitted.mhurdle effects.mhurdle
#' @param formula a symbolic description of the model to be fitted,
#' @param data a \code{data.frame},
#' @param newdata a \code{data.frame} for which the predictions should be
#' computed,
#' @param subset see \code{\link{lm}},
#' @param weights see \code{\link{lm}},
#' @param na.action see \code{\link{lm}},
#' @param start starting values,
#' @param dist the distribution of the error of the consumption equation: one
#' of \code{"n"} (normal), \code{"ln"} (log-normal) \code{"bc"} (box-cox
#' normal) and \code{"ihs"} (inverse hyperbolic sinus transformation),
#' @param h2 if \code{TRUE} the second hurdle is effective, it is not
#' otherwise,
#' @param scaled if \code{TRUE}, the dependent variable is divided by its
#' geometric mean,
#' @param corr a boolean indicating whether the errors of the different
#' equations are correlated or not,
#' @param robust transformation of the structural parameters in order to avoid
#' numerical problems,
#' @param check.grad if \code{TRUE}, a matrix containing the analytical and the
#' numerical gradient for the starting values are returned,
#' @param naive a boolean, it \code{TRUE}, the likelihood of the naive model is
#' returned,
#' @param object,x an object of class \code{"mhurdle"},
#' @param new an updated formula for the \code{update} method,
#' @param digits see \code{\link{print}},
#' @param width see \code{\link{print}},
#' @param which which coefficients or covariances should be extracted ? Those
#' of the selection (\code{"h1"}), consumption (\code{"h2"}) or purchase
#' (\code{"h3"}) equation, the other coefficients \code{"other"} (the standard
#' error and the coefficient of corr), the standard error (\code{"sigma"}) or
#' the coefficient of correlation (\code{"rho"}),
#' @param covariate the covariate for which the effect has to be computed,
#' @param reflevel for the computation of effects for a factor, the reference
#' level,
#' @param mean if \code{TRUE}, the mean of the effects is returned,
#' @param \dots further arguments.
#' @return
#'
#' an object of class \code{c("mhurdle", "maxLik")}.
#'
#' A \code{"mhurdle"} object has the following elements :
#'
#' \describe{ \item{coefficients}{the vector of coefficients,} \item{vcov}{the
#' covariance matrix of the coefficients,} \item{fitted.values}{a matrix of
#' fitted.values, the first column being the probability of 0 and the second
#' one the mean values for the positive observations,} \item{logLik}{the
#' log-likelihood,} \item{gradient}{the gradient at convergence,}
#' \item{model}{a data.frame containing the variables used for the estimation,}
#' \item{coef.names}{a list containing the names of the coefficients in the
#' selection equation, the regression equation, the infrequency of purchase
#' equation and the other coefficients (the standard deviation of the error
#' term and the coefficient of correlation if \code{corr = TRUE}),}
#' \item{formula}{the model formula, an object of class \code{Formula},}
#' \item{call}{the call,} \item{rho}{the lagrange multiplier test of no
#' correlation.}
#'
#' }
#' @references
#'
#' Blundell R, Meghir C (1987). Bivariate Alternatives to the Tobit Model.
#' Journal of Econometrics, 34, 179-200.
#'
#' Cragg JG (1971). Some Statistical Models for Limited Dependent Variables
#' with Applications for the Demand for Durable Goods. Econometrica, 39(5),
#' 829-44.
#'
#' Deaton A, Irish M (1984). A Statistical Model for Zero Expenditures in
#' Household Budgets. Journal of Public Economics, 23, 59-80.
#'
#' Tobin J (1958). Estimation of Relationships for Limited Dependent Variables.
#' Econometrica, 26(1), 24-36.
#' @keywords regression
#' @examples
#'
#'
#' data("Interview", package = "mhurdle")
#'
#' # independent double hurdle model
#' idhm <- mhurdle(vacations ~ car + size | linc + linc2 | 0, Interview,
#' dist = "ln", h2 = TRUE, method = "bfgs")
#'
#' # dependent double hurdle model
#' ddhm <- mhurdle(vacations ~ car + size | linc + linc2 | 0, Interview,
#' dist = "ln", h2 = TRUE, method = "bfgs", corr = TRUE)
#'
#' # a double hurdle p-tobit model
#' ptm <- mhurdle(vacations ~ 0 | linc + linc2 | car + size, Interview,
#' dist = "ln", h2 = TRUE, method = "bfgs", corr = TRUE)
#'
#'
mhurdle <- function(formula, data, subset, weights, na.action,
start = NULL, dist = c("ln", "n", "bc", "ihs"), h2 = FALSE,
scaled = TRUE, corr = FALSE, robust = TRUE,
check.grad = FALSE, ...){
fitted = TRUE
dots <- list(...)
oldoptions <- options(warn = -1)
on.exit(options(oldoptions))
cl <- match.call()
posT <- as.list(cl) == "T" ; posF <- as.list(cl) == "F"
cl[posT] <- TRUE ; cl[posF] <- FALSE
dist <- match.arg(dist)
if (dist == "ln" & h2) dist <- "ln2"
if (dist == "n" & ! h2) dist <- "tn"
if (dist == "bc" & h2) dist <- "bc2"
# 1. Compute the model.frame and the model.matrix
if (! inherits(formula, "Formula")) formula <- Formula(formula)
if (length(formula)[2] > 4) stop("at most 4 rhs should be provided in the formula")
mf <- match.call(expand.dots = FALSE)
mc <- match.call(expand.dots = TRUE)
m <- match(c("formula", "data", "subset", "na.action", "weights"),
names(mc), 0L)
mf <- mc[c(1L, m)]
mf$drop.unused.levels <- TRUE
mf[[1L]] <- as.name("model.frame")
mf$formula <- formula
mf <- eval(mf, parent.frame())
X1 <- model.matrix(formula, data = mf, rhs = 1)
X2 <- model.matrix(formula, data = mf, rhs = 2)
X3 <- model.matrix(formula, data = mf, rhs = 3)
X4 <- model.matrix(formula, data = mf, rhs = 4)
# Remove the intercept if any for the heteroscedastic model
if (ncol(X4)){
int4 <- attr(terms(formula, data = mf, rhs = 4), "intercept")
X4 <- X4[, - 1, drop = FALSE]
}
y <- model.response(mf)
if (scaled){
geomean <- exp(mean(log(y[y > 0])))
y <- y / geomean
attr(y, "geomean") <- attr(mf, "geomean") <- geomean
}
N <- length(y)
if (length(X1) == 0) X1 <- NULL
if (length(X2) == 0) stop("the second hurdle (consumption equation) is mandatory")
if (length(X3) == 0) X3 <- NULL
if (length(X4) == 0) X4 <- NULL
h1 <- ! is.null(X1)
h3 <- ! is.null(X3)
# 2. One equation models
if (! h1 && ! h3){
result <- onequation.mhurdle(X2, y, dist)
result$naive <- NULL#naive
result$call <- cl
result$model <- mf
result$formula <- formula
result$R2 <- c(null = NA, positive = NA)
return(result)
}
# 3. Selection single hurdle models without correlation that can
# be estimated simply in two parts using seperate.mhurdle()
if (h1 && !h3 && ! corr && !h2){
result <- seperate.mhurdle(X1, X2, y, dist = dist)
result$naive <- NULL#naive
result$call <- cl
result$model <- mf
result$formula <- formula
result$R2 <- c(null = NA, positive = NA)
return(result)
}
# 5. Compute the starting values if not provided (use the linear
# specification as the starting value for ihs and the log-linear
# specification for Box-Cox)
if (is.null(start)){
dist.start <- dist
if (dist %in% c("bc", "bc2", "ln2")) dist.start <- "ln"
if (dist == "ihs") dist.start <- "n"
start <- start.mhurdle(X1, X2, X3, y, dist.start)
# in case of heteroscedasctic model, add K4 zeros to the start
# vector and the intercept should be ln(sigma_o) (not sigma_o)
# because of the exp form
sd.pos <- getindex(X1, X2, X3, X4, corr, dist, which = "sd")
if (robust) start[sd.pos] <- log(start[sd.pos])
if (! is.null(X4)) start <- c(start[1:sd.pos], rep(0, ncol(X4)))
# add shape and/or scale parameters
if (corr){
if (robust) rhoinit <- tan(0.1 * pi / 2) else rhoinit <- 0.1
if (h1 + h3 == 2) start <- c(start, rho12 = rhoinit, rho13 = rhoinit, rho23 = rhoinit)
else start <- c(start, rhoinit)
}
if (dist %in% c("bc", "bc2", "ihs")) start <- c(start, tr = -0.1)
if (dist %in% c("bc2", "ln2")) start <- c(start, pos = 1)
}
else{
if (robust){
sd.pos <- getindex(X1, X2, X3, X4, corr, dist, which = "sd")
mu.pos <- getindex(X1, X2, X3, X4, corr, dist, which = "pos")
rho.pos <- getindex(X1, X2, X3, X4, corr, dist, which = "corr")
start[c(sd.pos, mu.pos)] <- log(start[c(sd.pos, mu.pos)])
start[rho.pos] <- tan(start[rho.pos] * pi / 2)
}
}
result <- mhurdle.fit(start, X1, X2, X3, X4, y, gradient = TRUE,
dist = dist, corr = corr, robust = robust,
fitted = fitted, check.grad = check.grad,
...)
if (check.grad) return(result)
# 3. Compute the naive model
Pnull <- mean(y == 0)
dist.naive <- dist
if (dist %in% c("ihs")) dist.naive <- "n"
if (dist %in% c("bc", "bc2", "ln2")) dist.naive <- "ln"
if (dist.naive != "ln"){
Ec <- mean(y[y > 0])
Vc <- var(y[y > 0])}
else{
Ec <- mean(log(y[y > 0]))
Vc <- var(log(y[y > 0]))
}
start.naive <- c(rep(0.1, 1 + h1 + h3), 1)
moments <- c(Pnull, Ec, Vc)
naive <- maxLik(lnl.naive, start = start.naive,
dist = dist.naive, moments = moments,
h1 = h1, h3 = h3)
coef.naive <- naive$est
parts <- attr(naive$max, "parts") * c(Pnull, 1 - Pnull, 1 - Pnull) * N
logLik.naive <- structure(as.numeric(naive$max) * N, nobs = length(y),
parts = parts,
df = length(coef.naive), class = "logLik")
naive <- list(coefficients = coef.naive, logLik = logLik.naive, code = naive$code)
result$naive <- naive
result$call <- cl
result$formula <- formula
result$model <- mf
lnL1c <- attr( naive$logLik, "parts")[1] + attr( naive$logLik, "parts")[2]
lnL1u <- attr(result$logLik, "parts")[1] + attr(result$logLik, "parts")[2]
lnL2c <- attr( naive$logLik, "parts")[3] - attr( naive$logLik, "parts")[2]
lnL2u <- attr(result$logLik, "parts")[3] - attr(result$logLik, "parts")[2]
R1 <- 1 - (lnL1c / lnL1u) ^ (2 / N * lnL1c)
R2 <- 1 - (exp(lnL2c) / exp(lnL2u)) ^ (2 / (N * (1 - Pnull)))
result$R2 <- c(null = R1, positive = R2)
result
}
mhurdle.fit <- function(start, X1, X2, X3, X4, y, gradient = FALSE, fit = FALSE,
dist = c("ln", "n", "tn", "bc", "ihs", "bc2", "ln2"),
corr = FALSE, robust = TRUE, fitted = FALSE,
check.grad = FALSE, ...){
start.time <- proc.time()
h1 <- ! is.null(X1)
h3 <- ! is.null(X3)
h4 <- ! is.null(X4)
# fancy coefficients names
sd.names <- "sd"
if (corr){
if (h1 & h3) rho.names <- c("corr12", "corr13", "corr23")
else rho.names <- ifelse(h1, "corr12", "corr23")
}
else rho.names <- NULL
if (dist %in% c("bc", "bc2", "ihs")) tr.names <- "tr" else tr.names <- NULL
if (dist %in% c("ln2", "bc2")) mu.names <- "pos" else mu.names <- NULL
coef.names <- list(h1 = colnames(X1),
h2 = colnames(X2),
h3 = colnames(X3),
sd = sd.names,
h4 = colnames(X4),
corr = rho.names,
tr = tr.names,
pos = mu.names)
start.names <- coef.names
if (h1) start.names$h1 <- paste("h1", start.names$h1, sep = ".")
start.names$h2 <- paste("h2", start.names$h2, sep = ".")
if (h3) start.names$h3 <- paste("h3", start.names$h3, sep = ".")
if (h4) start.names$h4 <- paste("h4", start.names$h4, sep = ".")
names(start) <- Reduce("c", start.names)
f <- function(param) mhurdle.lnl(param, X1 = X1, X2 = X2, X3 = X3, X4 = X4, y = y,
gradient = TRUE, fitted = FALSE,
dist = dist, corr = corr,
robust = robust)
if (check.grad){
ngrad <- c()
oparam <- start
fo <- f(start)
agrad <- apply(attr(fo, "gradient"), 2, sum)
eps <- 1E-05
for (i in 1:length(start)){
oparam[i] <- oparam[i] + eps
ngrad <- c(ngrad, sum((as.numeric(f(oparam)) - fo) / eps))
oparam <- start
}
return(cbind(start, agrad, ngrad))
}
maxl <- maxLik(f, start = start, control = list(lambdatol = 1E-20), ...)
nb.iter <- maxl$iterations
convergence.OK <- maxl$code <= 2
coefficients <- maxl$estimate
if (fitted) fitted.values <- attr(mhurdle.lnl(coefficients, X1 = X1, X2 = X2, X3 = X3, X4 = X4, y = y,
gradient = FALSE, fitted = TRUE, robust = robust,
dist = dist, corr = corr), "fitted")
else fitted.values <- NULL
# contribution of every single observation to the likelihood and
# its gradient (as an attribute)
logLik <- f(coefficients)
gradi <- attr(logLik, "gradi")
logLik <- structure(as.numeric(logLik), df = length(coefficients),
parts = attr(logLik, "parts"),
nobs = length(y), class = "logLik")
hessian <- maxl$hessian
elaps.time <- proc.time() - start.time
eps <- with(maxl, gradient %*% solve(- hessian) %*% gradient)
est.stat <- list(elaps.time = elaps.time,
nb.iter = nb.iter,
eps = eps,
method = maxl$type,
message = maxl$message
)
class(est.stat) <- "est.stat"
gtheta <- rep(1, length(coefficients))
if (robust){
if (corr){
poscor <- sub.mhurdle(coef.names, "corr")
gtheta[poscor] <- 2 / pi / (1 + coefficients[poscor] ^ 2)
coefficients[poscor] <- atan(coefficients[poscor]) * 2 / pi
}
if (dist %in% c("bc2", "ln2")){
posmu <- sub.mhurdle(coef.names, "pos")
gtheta[posmu] <- exp(coefficients[posmu])
coefficients[posmu] <- exp(coefficients[posmu])
}
possd <- sub.mhurdle(coef.names, "sd")
gtheta[possd] <- exp(coefficients[possd])
coefficients[possd] <- exp(coefficients[possd])
}
result <- list(coefficients = coefficients,
vcov = diag(gtheta) %*% (- solve(maxl$hessian) ) %*% diag(gtheta),
fitted.values = fitted.values,
logLik = logLik,
gradient = gradi,
hessian = hessian,
formula = NULL,
model = NULL,
coef.names = coef.names,
call = NULL,
est.stat = est.stat,
naive = NULL
)
class(result) <- c("mhurdle", class(result))
result
}
sanitize <- function(x, myeps = 1E-07, mymax = 1E02, string = c("", ""), replace = TRUE, verbal = TRUE){
string <- paste("of", string[1], "in", string[2])
if (replace){
if (any(is.na(x))){
if (verbal) cat(paste(sum(is.na(x)), "NA values", string, "replaced by 0\n"))
x[is.na(x)] <- 0
}
if ( any(x > 0 & x < myeps)){
if (verbal) cat(paste(sum(x > 0 & x < myeps), "values", string, "lower than", myeps,"replaced by", myeps, "\n"))
x[x > 0 & x < myeps] <- myeps
}
if (any(x < - mymax)){
if (verbal) cat(paste(sum(x < - mymax), "values", string, "lower than", - mymax, "replaced by", - mymax, "\n"))
x[x < - mymax] <- - mymax
}
if (any(x > mymax)){
if (verbal) cat(paste(sum(x > mymax), "values", string, "greater than", mymax, "replaced by", mymax, "\n"))
x[x > mymax] <- mymax
}
}
x
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.