Nothing
#' Estimation of limited dependent variable models
#'
#' mhurdle fits a large set of models relevant when the dependent variable is 0
#' for a part of the sample.
#'
#'
#' `mhurdle` fits models for which the dependent variable is zero for
#' a part of the sample. Null values of the dependent variable may
#' occurs because of one or several mechanisms : good rejection, lack
#' of ressources and purchase infrequency. The model is described
#' using a three-parts formula : the first part describes the
#' selection process if any, the second part the regression equation
#' and the third part the purchase infrequency process. `y ~ 0 | x1 +
#' x2 | z1 + z2` means that there is no selection process. `y ~ w1 +
#' w2 | x1 + x2 | 0` and `y ~ w1 + w2 | x1 + x2` describe the same
#' model with no purchase infrequency process. The second part is
#' mandatory, it explains the positive values of the dependant
#' variable. The `dist` argument indicates the distribution of the
#' error term. If `dist = "n"`, the error term is normal and (at least
#' part of) the zero observations are also explained by the second
#' part as the result of a corner solution. Several models described
#' in the litterature are obtained as special cases :
#'
#' A model with a formula like `y~0|x1+x2` and `dist="n"` is the Tobit
#' model proposed by \insertCite{TOBIN/58}{mhurdle}.
#'
#' `y~w1+w2|x1+x2` and `dist="l"` or `dist="t"` is the single hurdle
#' model proposed by \insertCite{CRAGG/71}{mhurdle}. With `dist="n"`,
#' the double hurdle model also proposed by
#' \insertCite{CRAGG/71}{mhurdle} is obtained. With `corr="h1"` we get
#' the correlated version of this model described by
#' \insertCite{BLUNDELL/87}{mhurdle}.
#'
#' `y~0|x1+x2|z1+z2` is the P-Tobit model of
#' \insertCite{DEATO/IRISH/84}{mhurdle}, which can be a single hurdle
#' model if `dist="t"` or `dist="l"` or a double hurdle model if
#' `dist="n"`.
#'
#' @name mhurdle
#' @aliases mhurdle
#' @param formula a symbolic description of the model to be fitted,
#' @param data a `data.frame`,
#' @param subset see [stats::lm()],
#' @param weights see [stats::lm()],
#' @param na.action see [stats::lm()],
#' @param start starting values,
#' @param dist the distribution of the error of the consumption
#' equation: one of `"n"` (normal), `"ln"` (log-normal) `"bc"`
#' (box-cox normal) and `"ihs"` (inverse hyperbolic sinus
#' transformation),
#' @param h2 if `TRUE` the second hurdle is effective, it is not
#' otherwise,
#' @param scaled if `TRUE`, the dependent variable is divided by its
#' geometric mean,
#' @param corr a boolean indicating whether the errors of the
#' different equations are correlated or not,
#' @param robust transformation of the structural parameters in order
#' to avoid numerical problems,
#' @param check_gradient if `TRUE`, a matrix containing the analytical and
#' the numerical gradient for the starting values are returned,
#' @param \dots further arguments.
#' @return
#' #' an object of class `c("mhurdle", "maxLik")`.
#'
#' A `mhurdle` object has the following elements :
#'
#' - coefficients: the vector of coefficients,
#' - vcov: the covariance matrix of the coefficients,
#' - fitted.values: a matrix of fitted.values, the first column being
#' the probability of 0 and the second one the mean values for the
#' positive observations,
#' - logLik: the log-likelihood,
#' - gradient: the gradient at convergence,
#' - model: a data.frame containing the variables used for the estimation,
#' - coef.names: a list containing the names of the coefficients in the
#' selection equation, the regression equation, the infrequency of purchase
#' equation and the other coefficients (the standard deviation of the error
#' term and the coefficient of correlation if `corr = TRUE`,
#' - formula: the model formula, an object of class `Formula`
#' - call: the call,
#' - rho: the lagrange multiplier test of no correlation.
#'
#' @references
#'
#' \insertRef{BLUNDELL/87}{mhurdle}
#'
#' \insertRef{CRAGG/71}{mhurdle}
#'
#' \insertRef{DEATO/IRISH/84}{mhurdle}
#'
#' \insertRef{TOBIN/58}{mhurdle}
#'
#' @keywords regression
#' @examples
#'
#' data("Interview", package = "mhurdle")
#'
#' # independent double hurdle model
#' idhm <- mhurdle(vacations ~ car + size | linc + linc2 | 0, Interview,
#' dist = "ln", h2 = TRUE, method = "bfgs")
#'
#' # dependent double hurdle model
#' ddhm <- mhurdle(vacations ~ car + size | linc + linc2 | 0, Interview,
#' dist = "ln", h2 = TRUE, method = "bfgs", corr = TRUE)
#'
#' # a double hurdle p-tobit model
#' ptm <- mhurdle(vacations ~ 0 | linc + linc2 | car + size, Interview,
#' dist = "ln", h2 = TRUE, method = "bfgs", corr = TRUE)
#' @importFrom Formula Formula
#' @importFrom survival survreg Surv
#' @importFrom truncreg truncreg
#' @importFrom stats binomial cor df.residual dnorm ecdf formula glm
#' integrate lm lm.fit model.frame model.matrix model.response
#' optimize pchisq pnorm printCoefmat qnorm quantile rnorm terms
#' uniroot var .getXlevels deviance
#' @importFrom maxLik maxLik activePar
#' @export
mhurdle <- function(formula, data, subset, weights, na.action,
start = NULL, dist = c("ln", "n", "bc", "ihs"), h2 = FALSE,
scaled = TRUE, corr = FALSE, robust = TRUE,
check_gradient = FALSE, ...){
fitted = TRUE
dots <- list(...)
oldoptions <- options(warn = -1)
on.exit(options(oldoptions))
cl <- match.call()
posT <- as.list(cl) == "T" ; posF <- as.list(cl) == "F"
cl[posT] <- TRUE ; cl[posF] <- FALSE
dist <- match.arg(dist)
if (dist == "ln" & h2) dist <- "ln2"
if (dist == "n" & ! h2) dist <- "tn"
if (dist == "bc" & h2) dist <- "bc2"
# 1. Compute the model.frame and the model.matrix
if (! inherits(formula, "Formula")) formula <- Formula(formula)
if (length(formula)[2] > 4) stop("at most 4 rhs should be provided in the formula")
mf <- match.call(expand.dots = FALSE)
mc <- match.call(expand.dots = TRUE)
m <- match(c("formula", "data", "subset", "na.action", "weights"),
names(mc), 0L)
mf <- mc[c(1L, m)]
mf$drop.unused.levels <- TRUE
mf[[1L]] <- as.name("model.frame")
mf$formula <- formula
mf <- eval(mf, parent.frame())
.terms <- terms(mf)
X1 <- model.matrix(formula, data = mf, rhs = 1)
X2 <- model.matrix(formula, data = mf, rhs = 2)
X3 <- model.matrix(formula, data = mf, rhs = 3)
X4 <- model.matrix(formula, data = mf, rhs = 4)
# Remove the intercept if any for the heteroscedastic model
if (ncol(X4)){
int4 <- attr(terms(formula, data = mf, rhs = 4), "intercept")
X4 <- X4[, - 1, drop = FALSE]
}
y <- model.response(mf)
if (scaled){
geomean <- exp(mean(log(y[y > 0])))
y <- y / geomean
attr(y, "geomean") <- attr(mf, "geomean") <- geomean
}
N <- length(y)
if (length(X1) == 0) X1 <- NULL
if (length(X2) == 0) stop("the second hurdle (consumption equation) is mandatory")
if (length(X3) == 0) X3 <- NULL
if (length(X4) == 0) X4 <- NULL
h1 <- ! is.null(X1)
h3 <- ! is.null(X3)
# 2. One equation models
if (! h1 && ! h3 && is.null(X4)){
result <- one_equation_model(X2, y, dist, check_gradient = check_gradient, start = start)
if (check_gradient){
result$call <- cl
return(result)
}
result$naive <- NULL#naive
result$call <- cl
result$model <- mf
result$formula <- formula
result$terms <- .terms
result$R2 <- c(null = NA, positive = NA)
return(result)
}
# 3. Selection single hurdle models without correlation that can
# be estimated simply in two parts using seperate.mhurdle()
if (h1 && !h3 && ! corr && !h2){
result <- two_parts_model(X1, X2, y, dist = dist, check_gradient = check_gradient, start = start)
if (check_gradient){
result$call <- cl
return(result)
}
P0 <- mean(y == 0)
m <- mean(log(y[y > 0]))
s2 <- mean( log(y[y > 0]) ^ 2) - m ^ 2
parts <- c(log(P0), log(1 - P0), log(1 - P0) - m - 1 / 2 * log(s2) - 1 / 2 * log(2 * pi) - 1 /2)
parts <- parts * c(P0, 1 - P0, 1 - P0) * N
logLik.naive <- structure((parts[1] + parts[3]), nobs = length(y),
parts = parts,
df = 3, class = "logLik")
naive <- list(logLik = logLik.naive)
lnL1c <- attr( naive$logLik, "parts")[1] + attr( naive$logLik, "parts")[2]
lnL1u <- attr(result$logLik, "parts")[1] + attr(result$logLik, "parts")[2]
lnL2c <- attr( naive$logLik, "parts")[3] - attr( naive$logLik, "parts")[2]
lnL2u <- attr(result$logLik, "parts")[3] - attr(result$logLik, "parts")[2]
R1 <- 1 - (lnL1c / lnL1u) ^ (2 / N * lnL1c)
R2old <- 1 - (exp(lnL2c) / exp(lnL2u)) ^ (2 / (N * (1 - P0)))
R2 <- 1 - exp(- 2 * (lnL2u - lnL2c) / (N * (1 - P0)))
result$R2 <- c(null = unname(R1), positive = unname(R2), old = unname(R2old))
result$naive <- naive
result$call <- cl
result$model <- mf
result$formula <- formula
result$terms <- .terms
return(result)
}
# 5. Compute the starting values if not provided (use the linear
# specification as the starting value for ihs and the log-linear
# specification for Box-Cox)
if (is.null(start)){
dist.start <- dist
if (dist %in% c("bc", "bc2", "ln2")) dist.start <- "ln"
if (dist == "ihs") dist.start <- "n"
start <- start.mhurdle(X1, X2, X3, y, dist.start)
# in case of heteroscedasctic model, add K4 zeros to the start
# vector and the intercept should be ln(sigma_o) (not sigma_o)
# because of the exp form
sd.pos <- getindex(X1, X2, X3, X4, corr, dist, which = "sd")
if (robust) start[sd.pos] <- log(start[sd.pos])
if (! is.null(X4)) start <- c(start[1:sd.pos], rep(0, ncol(X4)))
# add shape and/or scale parameters
if (corr){
if (robust) rhoinit <- tan(0.1 * pi / 2) else rhoinit <- 0.1
if (h1 + h3 == 2) start <- c(start, rho12 = rhoinit, rho13 = rhoinit, rho23 = rhoinit)
else start <- c(start, rho = rhoinit)
}
if (dist %in% c("bc", "bc2", "ihs")) start <- c(start, tr = 0.1)
if (dist %in% c("bc2", "ln2")){
pos = 1
if (robust) pos <- max(log(pos), -100)
start <- c(start, pos = pos)
}
}
else{
if (robust){
sd.pos <- getindex(X1, X2, X3, X4, corr, dist, which = "sd")
if (dist %in% c("ln2", "bc2")) mu.pos <- getindex(X1, X2, X3, X4, corr, dist, which = "pos")
rho.pos <- getindex(X1, X2, X3, X4, corr, dist, which = "corr")
start[sd.pos] <- log(start[sd.pos])
if (dist %in% c("ln2", "bc2")) start[mu.pos] <- max(log(start[mu.pos]), - 50)
if (corr) start[rho.pos] <- tan(start[rho.pos] * pi / 2)
}
}
result <- mhurdle.fit(start, X1, X2, X3, X4, y, gradient = TRUE,
dist = dist, corr = corr, robust = robust,
fitted = fitted,
check_gradient = check_gradient,
...)
if (check_gradient){
result$call <- cl
return(result)
}
# 3. Compute the naive model
P0 <- mean(y == 0)
dist.naive <- dist
if (dist %in% c("ihs")) dist.naive <- "n"
if (dist %in% c("bc", "bc2", "ln2")) dist.naive <- "ln"
if (dist.naive != "ln"){
Ec <- mean(y[y > 0])
Vc <- var(y[y > 0])}
else{
Ec <- mean(log(y[y > 0]))
Vc <- var(log(y[y > 0]))
}
start.naive <- c(rep(0.1, 1 + h1 + h3), 1)
moments <- c(P0, Ec, Vc)
naive <- maxLik::maxLik(lnl.naive, start = start.naive,
dist = dist.naive, moments = moments,
h1 = h1, h3 = h3)
coef.naive <- naive$est
parts <- attr(naive$max, "parts") * c(P0, 1 - P0, 1 - P0) * N
logLik.naive <- structure(as.numeric(naive$max) * N, nobs = length(y),
parts = parts,
df = length(coef.naive), class = "logLik")
naive <- list(logLik = logLik.naive)
# QDF for log-normal models
# cat("sans h2\n")
if (dist == "ln"){
P0 <- mean(y == 0)
m <- mean(log(y[y > 0]))
s2 <- mean( log(y[y > 0]) ^ 2) - m ^ 2
parts <- c(log(P0), log(1 - P0), log(1 - P0) - m - 1 / 2 * log(s2) - 1 / 2 * log(2 * pi) - 1 /2)
parts <- parts * c(P0, 1 - P0, 1 - P0) * N
logLik.naive <- structure((parts[1] + parts[3]), nobs = length(y),
parts = parts,
df = 3, class = "logLik")
naive <- list(logLik = logLik.naive)
}
## cat("avec h2\n")
## param <- c(Np / N, mean(log(y[y > 0])), sd(log(y[y > 0])), 0.5)
## No <- sum(y == 0)
## Np <- sum(y > 0)
## f <- function(param){
## Phio <- param[1]
## m <- param[2]
## sig <- param[3]
## a <- param[4]
## v <- No * log(1 - Phio * pnorm((m - log(a)) / sig)) + Np * (log(Phio) - log(sig) - 1 / 2 * log(2 * pi)) -
## sum( log(y[y > 0] + a) + (log(y[y > 0] + a) - m) ^ 2 / (2 * sig ^ 2))
## v
## }
## print(maxLik::maxLik(f, start = param, method = "bfgs"))
## g <- function(param){
## m <- param[1]
## sig <- param[2]
## a <- param[3]
## Pp <- Np / N
## v <- No * log(1 - Pp) + Np * log(Pp) +
## Np * (- pnorm((m - log(a)) / sig, log.p = TRUE) - log(sig) - 1 / 2 * log(2 * pi)) -
## sum( log(y[y > 0] + a) + (log(y[y > 0] + a) - m) ^ 2 / (2 * sig ^ 2))
## v
## }
## print(maxLik::maxLik(g, start = param[-1], method = "bfgs"))
## a <- 0.00000001
## h <- function(param){
## m <- param[1]
## sig <- param[2]
## Pp <- Np / N
## v <- No * log(1 - Pp) + Np * log(Pp) +
## Np * (- pnorm((m - log(a)) / sig, log.p = TRUE) - log(sig) - 1 / 2 * log(2 * pi)) -
## sum( log(y[y > 0] + a) + (log(y[y > 0] + a) - m) ^ 2 / (2 * sig ^ 2))
## v
## }
## print(maxLik::maxLik(h, start = param[2:3], method = "bfgs"))
## stop()
result$naive <- naive
result$call <- cl
result$formula <- formula
result$model <- mf
result$terms <- .terms
result$xlevels = .getXlevels(.terms, mf)
lnL1c <- attr( naive$logLik, "parts")[1] + attr( naive$logLik, "parts")[2]
lnL1u <- attr(result$logLik, "parts")[1] + attr(result$logLik, "parts")[2]
lnL2c <- attr( naive$logLik, "parts")[3] - attr( naive$logLik, "parts")[2]
lnL2u <- attr(result$logLik, "parts")[3] - attr(result$logLik, "parts")[2]
R1 <- 1 - (lnL1c / lnL1u) ^ (2 / N * lnL1c)
R2old <- 1 - (exp(lnL2c) / exp(lnL2u)) ^ (2 / (N * (1 - P0)))
R2 <- 1 - exp(- 2 * (lnL2u - lnL2c) / (N * (1 - P0)))
result$R2 <- c(null = unname(R1), positive = unname(R2), old = unname(R2old))
if (result$type_hessian == "neg_definite")
result$dpar <- - sum(diag(solve(result$hessian, crossprod(result$gradient))))
if (robust){
result <- update(result, start = coef(result), iterlim = 0, robust = FALSE)
result$call$robust <- TRUE
result$call$iterlim <- NULL
result$call$start <- NULL
}
result
}
mhurdle.fit <- function(start, X1, X2, X3, X4, y, gradient = FALSE, fit = FALSE,
dist = c("ln", "n", "tn", "bc", "ihs", "bc2", "ln2"),
corr = FALSE, robust = TRUE, fitted = FALSE,
check_gradient = FALSE, ...){
start.time <- proc.time()
h1 <- ! is.null(X1)
h3 <- ! is.null(X3)
h4 <- ! is.null(X4)
# fancy coefficients names
sd.names <- "sd"
if (corr){
if (h1 & h3) rho.names <- c("corr12", "corr13", "corr23")
else rho.names <- ifelse(h1, "corr12", "corr23")
}
else rho.names <- NULL
if (dist %in% c("bc", "bc2", "ihs")) tr.names <- "tr" else tr.names <- NULL
if (dist %in% c("ln2", "bc2")) mu.names <- "pos" else mu.names <- NULL
coef.names <- list(h1 = colnames(X1),
h2 = colnames(X2),
h3 = colnames(X3),
sd = sd.names,
h4 = colnames(X4),
corr = rho.names,
tr = tr.names,
pos = mu.names)
start.names <- coef.names
if (h1) start.names$h1 <- paste("h1", start.names$h1, sep = ".")
start.names$h2 <- paste("h2", start.names$h2, sep = ".")
if (h3) start.names$h3 <- paste("h3", start.names$h3, sep = ".")
if (h4) start.names$h4 <- paste("h4", start.names$h4, sep = ".")
names(start) <- Reduce("c", start.names)
f <- function(param) mhurdle.lnl(param, X1 = X1, X2 = X2, X3 = X3, X4 = X4, y = y,
gradient = TRUE, fitted = FALSE,
dist = dist, corr = corr,
robust = robust)
if (check_gradient) return(compare_gradient(f, start))
maxl <- maxLik::maxLik(f, start = start, control = list(lambdatol = 1E-20), ...)
nb.iter <- maxl$iterations
convergence.OK <- maxl$code <= 2
coefficients <- maxl$estimate
h <- function(param) sum(mhurdle.lnl(param, X1 = X1, X2 = X2, X3 = X3, X4 = X4, y = y,
gradient = FALSE, fitted = FALSE,
dist = dist, corr = corr,
robust = FALSE))
#print(robust)
# H <- numDeriv::hessian(h, coefficients)
if (fitted) fitted.values <- attr(mhurdle.lnl(coefficients, X1 = X1, X2 = X2, X3 = X3, X4 = X4, y = y,
gradient = FALSE, fitted = TRUE, robust = robust,
dist = dist, corr = corr), "fitted")
else fitted.values <- NULL
# contribution of every single observation to the likelihood and
# its gradient (as an attribute)
logLik <- f(coefficients)
gradi <- attr(logLik, "gradi")
logLik <- structure(as.numeric(logLik), df = length(coefficients),
parts = attr(logLik, "parts"),
nobs = length(y), class = "logLik")
hessian <- maxl$hessian
ev <- eigen(- hessian)$values
type_hessian <- "neg_definite"
if (any(is.complex(ev))){
type_hessian <- "complex"
} else {
if (any(ev < 0)){
type_hessian <- "not_neg_definite"
} else {
if (any(ev < 1E-07)){
type_hessian <- "singular"
}
}
}
opg <- crossprod(gradi)
ev_opg <- eigen(opg)$values
elaps.time <- proc.time() - start.time
if (type_hessian == "neg_definite") information <- - hessian else information <- opg
# eps <- drop(crossprod(maxl$gradient, solve(information, maxl$gradient)))
eps <- NA
est.stat <- list(elaps.time = elaps.time,
nb.iter = nb.iter,
eps = eps,
method = maxl$type,
message = maxl$message
)
class(est.stat) <- "est.stat"
gtheta <- rep(1, length(coefficients))
## if (robust){
## if (corr){
## poscor <- sub.mhurdle(coef.names, "corr")
## gtheta[poscor] <- 2 / pi / (1 + coefficients[poscor] ^ 2)
## coefficients[poscor] <- atan(coefficients[poscor]) * 2 / pi
## }
## if (dist %in% c("bc2", "ln2")){
## posmu <- sub.mhurdle(coef.names, "pos")
## gtheta[posmu] <- exp(coefficients[posmu])
## coefficients[posmu] <- exp(coefficients[posmu])
## }
## possd <- sub.mhurdle(coef.names, "sd")
## gtheta[possd] <- exp(coefficients[possd])
## coefficients[possd] <- exp(coefficients[possd])
## print(gtheta)
## g1 <- apply(gradi, 2, sum)
## gradi <- t(t(gradi) / gtheta)
## g2 <- apply(gradi, 2, sum)
## print(cbind(g1, g2))
## }
if (robust){
if (corr){
poscor <- sub.mhurdle(coef.names, "corr")
coefficients[poscor] <- atan(coefficients[poscor]) * 2 / pi
}
if (dist %in% c("bc2", "ln2")){
posmu <- sub.mhurdle(coef.names, "pos")
coefficients[posmu] <- exp(coefficients[posmu])
}
possd <- sub.mhurdle(coef.names, "sd")
coefficients[possd] <- exp(coefficients[possd])
}
result <- list(coefficients = coefficients,
fitted.values = fitted.values,
logLik = logLik,
gradient = gradi,
hessian = hessian,
formula = NULL,
terms = NULL,
xlevels = NULL,
model = NULL,
coef.names = coef.names,
call = NULL,
est.stat = est.stat,
naive = NULL,
type_hessian = type_hessian
)
class(result) <- c("mhurdle", class(result))
result
}
sanitize <- function(x, myeps = 1E-07, mymax = 1E02, string = c("", ""), replace = TRUE, verbal = TRUE){
string <- paste("of", string[1], "in", string[2])
if (replace){
if (any(is.na(x))){
if (verbal) cat(paste(sum(is.na(x)), "NA values", string, "replaced by 0\n"))
x[is.na(x)] <- 0
}
if ( any(x > 0 & x < myeps)){
if (verbal) cat(paste(sum(x > 0 & x < myeps), "values", string, "lower than", myeps,"replaced by", myeps, "\n"))
x[x > 0 & x < myeps] <- myeps
}
if (any(x < - mymax)){
if (verbal) cat(paste(sum(x < - mymax), "values", string, "lower than", - mymax, "replaced by", - mymax, "\n"))
x[x < - mymax] <- - mymax
}
if (any(x > mymax)){
if (verbal) cat(paste(sum(x > mymax), "values", string, "greater than", mymax, "replaced by", mymax, "\n"))
x[x > mymax] <- mymax
}
}
x
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.