gravity: Gravity model with large scale data

Documented in fastzeroinfl

#' Fast implementation for zero-inflated Count Data Regression
#' 
#' @inheritParams pscl::zeroinfl
#' @import pscl
#' @export

fastzeroinfl <- function(formula, data, subset, na.action, weights, offset, 
                         dist = c("poisson", "negbin", "geometric"), link = c("logit", 
                                                                              "probit", "cloglog", "cauchit", "log"), control = pscl::zeroinfl.control(...), 
                         model = TRUE, y = TRUE, x = FALSE, ...) 
{
  ziPoisson <- function(parms) {
    mu <- as.vector(exp(X %*% parms[1:kx] + offsetx))
    phi <- as.vector(linkinv(Z %*% parms[(kx + 1):(kx + 
                                                     kz)] + offsetz))
    loglik0 <- log(phi + exp(log(1 - phi) - mu))
    loglik1 <- log(1 - phi) + dpois(Y, lambda = mu, log = TRUE)
    loglik <- sum(weights[Y0] * loglik0[Y0]) + sum(weights[Y1] * 
                                                     loglik1[Y1])
    loglik
  }
  ziNegBin <- function(parms) {
    mu <- as.vector(exp(X %*% parms[1:kx] + offsetx))
    phi <- as.vector(linkinv(Z %*% parms[(kx + 1):(kx + 
                                                     kz)] + offsetz))
    theta <- exp(parms[(kx + kz) + 1])
    loglik0 <- log(phi + exp(log(1 - phi) + suppressWarnings(dnbinom(0, 
                                                                     size = theta, mu = mu, log = TRUE))))
    loglik1 <- log(1 - phi) + suppressWarnings(dnbinom(Y, 
                                                       size = theta, mu = mu, log = TRUE))
    loglik <- sum(weights[Y0] * loglik0[Y0]) + sum(weights[Y1] * 
                                                     loglik1[Y1])
    loglik
  }
  ziGeom <- function(parms) ziNegBin(c(parms, 0))
  gradPoisson <- function(parms) {
    eta <- as.vector(X %*% parms[1:kx] + offsetx)
    mu <- exp(eta)
    etaz <- as.vector(Z %*% parms[(kx + 1):(kx + kz)] + 
                        offsetz)
    muz <- linkinv(etaz)
    clogdens0 <- -mu
    dens0 <- muz * (1 - as.numeric(Y1)) + exp(log(1 - muz) + 
                                                clogdens0)
    wres_count <- ifelse(Y1, Y - mu, -exp(-log(dens0) + 
                                            log(1 - muz) + clogdens0 + log(mu)))
    wres_zero <- ifelse(Y1, -1/(1 - muz) * linkobj$mu.eta(etaz), 
                        (linkobj$mu.eta(etaz) - exp(clogdens0) * linkobj$mu.eta(etaz))/dens0)
    colSums(cbind(wres_count * weights * X, wres_zero * 
                    weights * Z))
  }
  gradGeom <- function(parms) {
    eta <- as.vector(X %*% parms[1:kx] + offsetx)
    mu <- exp(eta)
    etaz <- as.vector(Z %*% parms[(kx + 1):(kx + kz)] + 
                        offsetz)
    muz <- linkinv(etaz)
    clogdens0 <- dnbinom(0, size = 1, mu = mu, log = TRUE)
    dens0 <- muz * (1 - as.numeric(Y1)) + exp(log(1 - muz) + 
                                                clogdens0)
    wres_count <- ifelse(Y1, Y - mu * (Y + 1)/(mu + 1), 
                         -exp(-log(dens0) + log(1 - muz) + clogdens0 - log(mu + 
                                                                             1) + log(mu)))
    wres_zero <- ifelse(Y1, -1/(1 - muz) * linkobj$mu.eta(etaz), 
                        (linkobj$mu.eta(etaz) - exp(clogdens0) * linkobj$mu.eta(etaz))/dens0)
    colSums(cbind(wres_count * weights * X, wres_zero * 
                    weights * Z))
  }
  gradNegBin <- function(parms) {
    eta <- as.vector(X %*% parms[1:kx] + offsetx)
    mu <- exp(eta)
    etaz <- as.vector(Z %*% parms[(kx + 1):(kx + kz)] + 
                        offsetz)
    muz <- linkinv(etaz)
    theta <- exp(parms[(kx + kz) + 1])
    clogdens0 <- dnbinom(0, size = theta, mu = mu, log = TRUE)
    dens0 <- muz * (1 - as.numeric(Y1)) + exp(log(1 - muz) + 
                                                clogdens0)
    wres_count <- ifelse(Y1, Y - mu * (Y + theta)/(mu + 
                                                     theta), -exp(-log(dens0) + log(1 - muz) + clogdens0 + 
                                                                    log(theta) - log(mu + theta) + log(mu)))
    wres_zero <- ifelse(Y1, -1/(1 - muz) * linkobj$mu.eta(etaz), 
                        (linkobj$mu.eta(etaz) - exp(clogdens0) * linkobj$mu.eta(etaz))/dens0)
    wres_theta <- theta * ifelse(Y1, digamma(Y + theta) - 
                                   digamma(theta) + log(theta) - log(mu + theta) + 
                                   1 - (Y + theta)/(mu + theta), exp(-log(dens0) + 
                                                                       log(1 - muz) + clogdens0) * (log(theta) - log(mu + 
                                                                                                                       theta) + 1 - theta/(mu + theta)))
    colSums(cbind(wres_count * weights * X, wres_zero * 
                    weights * Z, wres_theta))
  }
  dist <- match.arg(dist)
  loglikfun <- switch(dist, poisson = ziPoisson, geometric = ziGeom, 
                      negbin = ziNegBin)
  gradfun <- switch(dist, poisson = gradPoisson, geometric = gradGeom, 
                    negbin = gradNegBin)
  linkstr <- match.arg(link)
  linkobj <- make.link(linkstr)
  linkinv <- linkobj$linkinv
  if (control$trace) 
    cat("Zero-inflated Count Model\n", paste("count model:", 
                                             dist, "with log link\n"), paste("zero-inflation model: binomial with", 
                                                                             linkstr, "link\n"), sep = "")
  cl <- match.call()
  if (missing(data)) 
    data <- environment(formula)
  mf <- match.call(expand.dots = FALSE)
  m <- match(c("formula", "data", "subset", "na.action", "weights", 
               "offset"), names(mf), 0)
  mf <- mf[c(1, m)]
  mf$drop.unused.levels <- TRUE
  
  formula = as.formula(formula)
  
  if (length(formula[[3]]) > 1 && identical(formula[[3]][[1]], 
                                            as.name("|"))) {
    ff <- formula
    formula[[3]][1] <- call("+")
    mf$formula <- formula
    ffc <- . ~ .
    ffz <- ~.
    ffc[[2]] <- ff[[2]]
    ffc[[3]] <- ff[[3]][[2]]
    ffz[[3]] <- ff[[3]][[3]]
    ffz[[2]] <- NULL
  } else {
    ffz <- ffc <- ff <- formula
    ffz[[2]] <- NULL
  }
  
  
  if (inherits(try(terms(ffz), silent = TRUE), "try-error")) {
    ffz <- eval(parse(text = sprintf(paste("%s -", deparse(ffc[[2]])), 
                                     deparse(ffz))))
  }
  
  mf[[1]] <- as.name("model.frame")
  mf <- eval(mf, parent.frame())
  mt <- attr(mf, "terms")
  mtX <- terms(ffc, data = data)
  X <- model.matrix(mtX, mf)
  mtZ <- terms(ffz, data = data)
  mtZ <- terms(update(mtZ, ~.), data = data)
  Z <- model.matrix(mtZ, mf)
  Y <- model.response(mf, "numeric")
  if (length(Y) < 1) stop("empty model")
  if (all(Y > 0)) stop("invalid dependent variable, minimum count is not zero")
  if (!isTRUE(all.equal(as.vector(Y), as.integer(round(Y + 
                                                       0.001))))) 
    stop("invalid dependent variable, non-integer values")
  Y <- as.integer(round(Y + 0.001))
  if (any(Y < 0)) 
    stop("invalid dependent variable, negative counts")
  if (control$trace) {
    cat("dependent variable:\n")
    tab <- table(factor(Y, levels = 0:max(Y)), exclude = NULL)
    names(dimnames(tab)) <- NULL
    print(tab)
  }
  n <- length(Y)
  kx <- NCOL(X)
  kz <- NCOL(Z)
  Y0 <- Y <= 0
  Y1 <- Y > 0
  weights <- model.weights(mf)
  if (is.null(weights)) 
    weights <- 1
  if (length(weights) == 1) 
    weights <- rep.int(weights, n)
  weights <- as.vector(weights)
  names(weights) <- rownames(mf)
  offsetx <- model_offset_2(mf, terms = mtX, offset = TRUE)
  if (is.null(offsetx)) 
    offsetx <- 0
  if (length(offsetx) == 1) 
    offsetx <- rep.int(offsetx, n)
  offsetx <- as.vector(offsetx)
  offsetz <- model_offset_2(mf, terms = mtZ, offset = FALSE)
  if (is.null(offsetz)) 
    offsetz <- 0
  if (length(offsetz) == 1) 
    offsetz <- rep.int(offsetz, n)
  offsetz <- as.vector(offsetz)
  start <- control$start
  if (!is.null(start)) {
    valid <- TRUE
    if (!("count" %in% names(start))) {
      valid <- FALSE
      warning("invalid starting values, count model coefficients not specified")
      start$count <- rep.int(0, kx)
    }
    if (!("zero" %in% names(start))) {
      valid <- FALSE
      warning("invalid starting values, zero-inflation model coefficients not specified")
      start$zero <- rep.int(0, kz)
    }
    if (length(start$count) != kx) {
      valid <- FALSE
      warning("invalid starting values, wrong number of count model coefficients")
    }
    if (length(start$zero) != kz) {
      valid <- FALSE
      warning("invalid starting values, wrong number of zero-inflation model coefficients")
    }
    if (dist == "negbin") {
      if (!("theta" %in% names(start))) 
        start$theta <- 1
      start <- list(count = start$count, zero = start$zero, 
                    theta = as.vector(start$theta[1]))
    }
    else {
      start <- list(count = start$count, zero = start$zero)
    }
    if (!valid) 
      start <- NULL
  }
  
  if (is.null(start)) {
    if (control$trace) 
      cat("generating starting values...")
    
    # model_count <- glm.fit(X, Y, family = poisson(), weights = weights, 
    #                        offset = offsetx)
    model_count <- fastglm::fastglm(
      x = X, y = Y, family = poisson(), weights = weights,
      offset = offsetx,
      method = 2L
    )
    
    # model_zero <- glm.fit(Z, as.integer(Y0), weights = weights, 
    #                       family = binomial(link = linkstr), offset = offsetz)
    
    model_zero <- fastglm::fastglm(
      x = Z, y = as.integer(Y0), family = binomial(link = linkstr), weights = weights,
      offset = offsetz,
      method = 2L
    )
    
    start <- list(count = model_count$coefficients, zero = model_zero$coefficients)
    if (dist == "negbin") start$theta <- 1
    
    if (control$EM & dist == "poisson") {
      mui <- model_count$fitted
      probi <- model_zero$fitted
      probi <- probi/(probi + (1 - probi) * dpois(0, mui))
      probi[Y1] <- 0
      ll_new <- loglikfun(c(start$count, start$zero))
      ll_old <- 2 * ll_new
      while (abs((ll_old - ll_new)/ll_old) > control$reltol) {
        ll_old <- ll_new
        # model_count <- glm.fit(X, Y, weights = weights * 
        #                          (1 - probi), offset = offsetx, family = poisson(), 
        #                        start = start$count)
        model_count <- fastglm::fastglm(
          x = X, y = Y, weights = weights *  (1 - probi), offset = offsetx,
          family = poisson(), start = start$count,
          method = 2L
        )        
        # model_zero <- suppressWarnings(glm.fit(Z, probi, 
        #                                        weights = weights, offset = offsetz, family = binomial(link = linkstr), 
        #                                        start = start$zero))
        model_zero <- suppressWarnings(fastglm::fastglm(
          x = Z, y = probi, weights = weights, offset = offsetz,
          family = binomial(link = linkstr), start = start$count,
          method = 2L
        ))        
        
        mui <- model_count$fitted
        probi <- model_zero$fitted
        probi <- probi/(probi + (1 - probi) * dpois(0, 
                                                    mui))
        probi[Y1] <- 0
        start <- list(count = model_count$coefficients, 
                      zero = model_zero$coefficients)
        ll_new <- loglikfun(c(start$count, start$zero))
      }
    }
    if (control$EM & dist == "geometric") {
      mui <- model_count$fitted
      probi <- model_zero$fitted
      probi <- probi/(probi + (1 - probi) * dnbinom(0, 
                                                    size = 1, mu = mui))
      probi[Y1] <- 0
      ll_new <- loglikfun(c(start$count, start$zero))
      ll_old <- 2 * ll_new
      while (abs((ll_old - ll_new)/ll_old) > control$reltol) {
        ll_old <- ll_new
        # model_count <- suppressWarnings(glm.fit(X, Y, 
        #                                         weights = weights * (1 - probi), offset = offsetx, 
        #                                         family = MASS::negative.binomial(1), start = start$count))
        model_count <- suppressWarnings(fastglm::fastglm(
          x = X, y = Y, weights = weights *  (1 - probi), offset = offsetx,
          family = MASS::negative.binomial(1), start = start$count,
          method = 2L
        ))        
        
        # model_zero <- suppressWarnings(glm.fit(Z, probi, 
        #                                        weights = weights, offset = offsetz, family = binomial(link = linkstr), 
        #                                        start = start$zero))
        model_zero <- suppressWarnings(fastglm::fastglm(
          x = Z, y = probi, weights = weights, offset = offsetz,
          family = binomial(link = linkstr), start = start$count,
          method = 2L
        ))        
        
        start <- list(count = model_count$coefficients, 
                      zero = model_zero$coefficients)
        mui <- model_count$fitted
        probi <- model_zero$fitted
        probi <- probi/(probi + (1 - probi) * dnbinom(0, 
                                                      size = 1, mu = mui))
        probi[Y1] <- 0
        ll_new <- loglikfun(c(start$count, start$zero))
      }
    }
    if (control$EM & dist == "negbin") {
      mui <- model_count$fitted
      probi <- model_zero$fitted
      probi <- probi/(probi + (1 - probi) * dnbinom(0, 
                                                    size = start$theta, mu = mui))
      probi[Y1] <- 0
      ll_new <- loglikfun(c(start$count, start$zero, log(start$theta)))
      ll_old <- 2 * ll_new
      offset <- offsetx
      while (abs((ll_old - ll_new)/ll_old) > control$reltol) {
        ll_old <- ll_new
        
        model_count <- suppressWarnings(
          fastglm.nb(formula = Y ~ 0 + X + offset(offset),
                     weights = weights * (1 -  probi),
                     start = start$count,
                     init.theta = start$theta
          )
        )
        
        model_zero <- suppressWarnings(
          fastglm::fastglm(
            x = Z, y = probi, weights = weights, offset = offsetz,
            family = binomial(link = linkstr), start = start$zero,
            method = 2L)
        )   
        
        start <- list(count = model_count$coefficients, 
                      zero = model_zero$coefficients, theta = model_count$theta)
        mui <- model_count$fitted
        probi <- model_zero$fitted
        probi <- probi/(probi + (1 - probi) * dnbinom(0, 
                                                      size = start$theta, mu = mui))
        probi[Y1] <- 0
        ll_new <- loglikfun(c(start$count, start$zero, 
                              log(start$theta)))
      }
    }
    if (control$trace) 
      cat("done\n")
  }
  
  if (control$trace)  cat("calling optim() for ML estimation:\n")
  
  method <- control$method
  hessian <- control$hessian
  ocontrol <- control
  
  control$method <- control$hessian <- control$EM <- control$start <- NULL
  
  fit <- optim(fn = loglikfun, gr = gradfun, par = c(start$count, 
                                                     start$zero, if (dist == "negbin") log(start$theta) else NULL), 
               method = method, hessian = hessian, control = control)
  if (fit$convergence > 0) 
    warning("optimization failed to converge")
  coefc <- fit$par[1:kx]
  names(coefc) <- names(start$count) <- colnames(X)
  coefz <- fit$par[(kx + 1):(kx + kz)]
  names(coefz) <- names(start$zero) <- colnames(Z)
  vc <- -solve(as.matrix(fit$hessian))
  if (dist == "negbin") {
    np <- kx + kz + 1
    theta <- as.vector(exp(fit$par[np]))
    SE.logtheta <- as.vector(sqrt(diag(vc)[np]))
    vc <- vc[-np, -np, drop = FALSE]
  }
  else {
    theta <- NULL
    SE.logtheta <- NULL
  }
  colnames(vc) <- rownames(vc) <- c(paste("count", colnames(X), 
                                          sep = "_"), paste("zero", colnames(Z), sep = "_"))
  mu <- exp(X %*% coefc + offsetx)[, 1]
  phi <- linkinv(Z %*% coefz + offsetz)[, 1]
  Yhat <- (1 - phi) * mu
  res <- sqrt(weights) * (Y - Yhat)
  nobs <- sum(weights > 0)
  rval <- list(coefficients = list(count = coefc, zero = coefz), 
               residuals = res, fitted.values = Yhat, optim = fit, 
               method = method, control = ocontrol, start = start, 
               weights = if (identical(as.vector(weights), rep.int(1L, 
                                                                   n))) NULL else weights, offset = list(count = if (identical(offsetx, 
                                                                                                                               rep.int(0, n))) NULL else offsetx, zero = if (identical(offsetz, 
                                                                                                                                                                                       rep.int(0, n))) NULL else offsetz), n = nobs, df.null = nobs - 
                 2, df.residual = nobs - (kx + kz + (dist == "negbin")), 
               terms = list(count = mtX, zero = mtZ, full = mt), theta = theta, 
               SE.logtheta = SE.logtheta, loglik = fit$value, vcov = vc, 
               dist = dist, link = linkstr, linkinv = linkinv, converged = fit$convergence < 
                 1, call = cl, formula = ff, levels = .getXlevels(mt, 
                                                                  mf), contrasts = list(count = attr(X, "contrasts"), 
                                                                                        zero = attr(Z, "contrasts")))
  if (model) 
    rval$model <- mf
  if (y) 
    rval$y <- Y
  if (x) 
    rval$x <- list(count = X, zero = Z)
  class(rval) <- c("fastzeroinfl", "zeroinfl")
  return(rval)
}