R/cmultinom.R


#' Fit Multinomial Log-linear Models with standard errors clustered at the municipality level
#' @param formula a formula expression as for regression models, of pooled form \code{cbind(y1, y2, y3) ~ .}
#' @param data a \code{data.frame} in which to interpret the variables occurring in \code{formula}
#' @param formulaMatch a formula used to compute propensity scores. Leave \code{NULL} if you don't want to 
#'   use matching. 
#' @param dataMatch a \code{data.frame} used for matching; must contain all corrupt/innocent municipalities. 
#' @param munCol column in \code{data} and \code{dataMatch} (if doing matching) containing the municipality id, syntax as in \code{dplyr}
#' @param yearCol column in \code{data} containing the year, syntax as in \code{dplyr}
#' @param stateCol column in \code{data} containing the state, syntax as in \code{dplyr}
#' @param treatCol column in \code{dataMatch} containing treatment status, syntax as in \code{dplyr}
#' @param pool should \code{data} be pooled automatically? 
#' @param nSamples number of bootstrap replicates. If \code{NULL}, uses analytical standard errors instead
#' @param parallel should the bootstrap be parallelized? If \code{TRUE}, the \code{cl} argument should not be \code{NULL}
#' @param seed optional seed
#' @param cl cluster used for parallelization
#' @param ... additional parameters passed on to \code{\link[nnet]{multinom}}
#' @details See the \link{cmultinommethods} man page for related methods, 
#'   and the \link{riskCurve} man page for how to compute a parametric risk curve. 
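#' @examples
#' \dontrun{
#' # Hypothetical sketch only: `panel` and `audits` are placeholder data frames
#' # (not shipped with the package); variable names are illustrative.
#' fit <- cmultinom(
#'   cbind(y1, y2, y3) ~ x1 + x2,
#'   data = panel,
#'   munCol = j, yearCol = year, stateCol = state
#' )
#' summary(fit)
#'
#' # With propensity-score matching and a municipality-level bootstrap
#' fitMatch <- cmultinom(
#'   cbind(y1, y2, y3) ~ x1 + x2,
#'   data = panel,
#'   formulaMatch = treat ~ z1 + z2,
#'   dataMatch = audits,
#'   nSamples = 200
#' )
#' }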
#' @export
cmultinom <- function(
  formula, data, 
  formulaMatch = NULL, dataMatch = NULL, 
  munCol = j, yearCol = year, stateCol = state, treatCol = treat, 
  pool = T, nSamples = NULL, parallel = F, seed = NULL, cl = NULL, 
  ...
) {
  munCol <- rlang::enquo(munCol)
  yearCol <- rlang::enquo(yearCol)
  stateCol <- rlang::enquo(stateCol)
  weights <- NULL
  matchMuns <- NULL
  # handle missing data
  nas <- model.frame(formula, data)
  nas <- attr(nas, "na.action")
  if(!is.null(nas)) data <- data[-nas,]
  muns <- dplyr::pull(data, !!munCol)
  # deal with matching
  if(!is.null(formulaMatch)) {
    treatCol <- rlang::enquo(treatCol)
    # remove obs for which propensity score can't be computed
    nas <- model.frame(delete.response(terms(formulaMatch)), dataMatch)
    nas <- attr(nas, "na.action")
    if(!is.null(nas)) dataMatch <- dataMatch[-nas,]
    # compute propensity scores and remake dataMatch so it contains only relevant columns (idMun, treat, propensity)
    modPropensity <- stats::glm(formulaMatch, data = dataMatch, family = "binomial")
    propensity <- predict(modPropensity, newdata = dataMatch, type = "response")
    dataMatch <- dplyr::select(dataMatch, !!munCol, treat = !!treatCol) %>% dplyr::mutate(propensity = propensity)
    # adjust data frames such that they contain the same municipalities
    matchMuns <- dplyr::pull(dataMatch, !!munCol)
    keepMuns <- base::intersect(muns, matchMuns)
    data <- dplyr::filter(data, !!munCol %in% keepMuns)
    dataMatch <- dplyr::filter(dataMatch, !!munCol %in% keepMuns)
    muns <- dplyr::pull(data, !!munCol)
    matchMuns <- dplyr::pull(dataMatch, !!munCol)
  }
  # main fit
  mod <- c(list(...), list(formula = formula, data = data, formulaMatch = formulaMatch, dataMatch = dataMatch, munCol = munCol, pool = pool))
  mod$Hess <- T
  mod <- do.call(cmultinomFit, mod)
  matchObj <- mod$MatchIt
  mod <- mod$mod
  nObs <- getNObs(formula, data, !!munCol, !!yearCol)
  coefs <- linCoefs(mod)
  coefNames <- names(coefs)
  # vcov bootstrap
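  # Cluster bootstrap: getBoots() receives the distinct municipality/state pairs and
  # returns resampled sets of municipality ids; the model is refit on each resample
  # and vcov is taken as the empirical covariance of the bootstrap coefficients.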
  if(!is.null(nSamples)) {
    if(is.null(seed)) seed <- sample.int(1e6, size = 1)
    set.seed(seed)
    seeds <- sample.int(1e6, nSamples, replace = F)
    bootSrc <- dplyr::select(data, mun = !!munCol, state = !!stateCol) %>% dplyr::distinct()
    boots <- getBoots(bootSrc, seeds)
    bootFUN <- function(boot, formula, data, formulaMatch, dataMatch, munCol, pool, coefNames, ...) {
      dataBoot <- unlist(sapply(boot, function(b) which(dplyr::pull(data, !!munCol) == b)))
      data <- data[dataBoot,]
      if(!is.null(dataMatch)) {
        dataMatchBoot <- unlist(sapply(boot, function(b) which(dplyr::pull(dataMatch, !!munCol) == b)))
        dataMatch <- dataMatch[dataMatchBoot,]
      }
      mod <- c(list(...), list(formula = formula, data = data, formulaMatch = formulaMatch, dataMatch = dataMatch, munCol = munCol, pool = pool))
      mod$trace <- F
      mod <- do.call(cmultinomFit, mod)
      mod <- mod$mod
      bootCoefAdjust(linCoefs(mod), coefNames)
    }
    if(!parallel) {
      boots <- lapply(
        boots, bootFUN, 
        formula = formula, data = data, 
        formulaMatch = formulaMatch, dataMatch = dataMatch, 
        munCol = munCol, pool = pool, coefNames = coefNames, ... = ...
      )
    } else {
      boots <- parallel::parLapplyLB(
        cl = cl, boots, bootFUN, 
        formula = formula, data = data, 
        formulaMatch = formulaMatch, dataMatch = dataMatch, 
        munCol = munCol, pool = pool, coefNames = coefNames, ... = ...
      )
    }
    boots <- do.call(rbind, boots)
    colnames(boots) <- coefNames
    vc <- cov(boots)
  } else {
    X <- model.matrix(delete.response(terms(formula, data = data)), data)
    Y <- model.response(model.frame(formula, data))
    vc <- vcov(mod)
    vc <- mycluster.vcov.multinom(X, Y, beta = t(coef(mod)), rank = mod$rank, vcov = vc, cluster = muns, df_correction = TRUE)
    dimnames(vc) <- list(coefNames, coefNames)
  }
  
  out <- list(
    formula = formula, 
    coef = coefs, 
    model = mod, 
    AIC = mod$AIC,
    deviance = mod$deviance, 
    df = mod$edf, 
    logLik = stats::logLik(mod), 
    nSpell = nObs$nSpell, 
    nObs = nObs$nObs,
    nMun = nObs$nMun,
    munCol = munCol, 
    yearCol = yearCol, 
    stateCol = stateCol, 
    vcov = vc, 
    call = match.call(), 
    convergence = mod$convergence == 0, 
    nSamples = nSamples
  )
  if(!is.null(nSamples)) {
    out$seed <- seed
    out$boots <- boots
  }
  if(!is.null(formulaMatch)) {
    out$formulaMatch <- formulaMatch
    out$MatchIt <- matchObj
    out$modPropensity <- modPropensity
    out$dataMatch <- dataMatch
    out$treatCol <- treatCol
  }
  class(out) <- "cmultinom"
  out
}

cmultinomFit <- function(formula, data, formulaMatch, dataMatch, munCol, pool, ...) {
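  # If a matching formula was supplied: derive full-matching weights from the
  # precomputed propensity scores in dataMatch (MatchIt::matchit, method = "full"),
  # join them onto `data` by municipality, then optionally pool the data (the weights
  # are passed to poolDf) and fit nnet::multinom.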
  weights <- NULL
  if(!is.null(formulaMatch)) {
    matchMod <- suppressWarnings(MatchIt::matchit(treat ~ 1, data = dataMatch, method = "full", distance = dataMatch$propensity))
    wMatch <- dplyr::select(dataMatch, !!munCol) %>% dplyr::mutate(weights = matchMod$weights)
    joinCol <- rlang::as_name(munCol)
    names(joinCol) <- joinCol
    data <- dplyr::inner_join(data, wMatch, by = joinCol)
    weights <- data$weights
  }
  if(pool) data <- poolDf(formula, data, weights)
  out <- list()
  out$mod <- nnet::multinom(formula = formula, data = data, ... = ...)
  if(!is.null(formulaMatch)) out$MatchIt <- matchMod
  out
}

#' @exportClass cmultinom
setClass("cmultinom")

#' @export
#' @rdname cmultinommethods
extract.cmultinom <- function(
  model, 
  include.aic = TRUE, include.loglik = TRUE, include.nobs = TRUE, include.nind = T, include.nmun = T, 
  level = .95, use.ci = T
) {
  cl <- list(
    coef.names = names(coef(model)), 
    coef = coef(model),
    se = sqrt(diag(vcov(model))), 
    pvalues = 2 * pnorm(abs(coef(model)), sd = sqrt(diag(vcov(model))), lower.tail = F), 
    gof.names = character(0), 
    gof = numeric(0), 
    gof.decimal = logical(0)
  )
  if(use.ci) {
    cl$ci.low <- confint(model, level = level)[,1]
    cl$ci.up <- confint(model, level = level)[,2]
  }
  
  if(include.loglik) {
    cl$gof.names <- c(cl$gof.names, "Log Likelihood")
    cl$gof <- c(cl$gof, as.numeric(model$logLik))
    cl$gof.decimal <- c(cl$gof.decimal, T)
  }
  if(include.aic) {
    cl$gof.names <- c(cl$gof.names, "AIC")
    cl$gof <- c(cl$gof, as.numeric(model$AIC))
    cl$gof.decimal <- c(cl$gof.decimal, T)
  }
  if(include.nobs) {
    cl$gof.names <- c(cl$gof.names, "Num. obs.")
    cl$gof <- c(cl$gof, as.numeric(model$nSpell))
    cl$gof.decimal <- c(cl$gof.decimal, F)
  }
  if(include.nind) {
    cl$gof.names <- c(cl$gof.names, "Num. ind.")
    cl$gof <- c(cl$gof, as.numeric(model$nObs))
    cl$gof.decimal <- c(cl$gof.decimal, F)
  }
  if(include.nmun) {
    cl$gof.names <- c(cl$gof.names, "Num. mun.")
    cl$gof <- c(cl$gof, as.numeric(model$nMun))
    cl$gof.decimal <- c(cl$gof.decimal, F)
  }
  do.call(texreg::createTexreg, cl)
}
setMethod("extract", signature = "cmultinom", definition = extract.cmultinom)



#' Methods for \code{\link{cmultinom}} objects
#' @param object,model a \code{cmultinom} object, as returned by \code{\link{cmultinom}}
#' @param ... used to select variables, as in \code{\link[dplyr]{select}}
#' @param level levels for confidence intervals
#' @param digits number of digits to be displayed
#' @param newdata a \code{data.frame} in which to look for variables with which to predict
#' @param type the type of prediction required. The default is on the scale of the linear predictors; the alternative \code{"exp-link"} 
#'   exponentiates the linear predictors (e.g. to get odds ratios); \code{"response"} returns the predicted probability of belonging to each class; 
#'   \code{"class"} returns the class with the highest predicted probability. 
#' @param ci.level if not \code{NULL}, the level(s) of the confidence intervals returned with predictions. Only used when \code{type} is \code{"link"} or
#'   \code{"exp-link"}. If not \code{NULL}, then \code{predict} returns a \code{tibble} with \code{2 * nrow(newdata)} rows containing predictions and 
#'   confidence intervals for, in turn, \code{beta1} and \code{beta2}. 
#' @param nSamples if \code{ci.level} is not \code{NULL}, number of Monte Carlo samples to be drawn 
#'   to compute the confidence intervals around predictions.
#' @param intercept0 should predictions set the intercept to 0? 
#' @param include.aic Should AIC be reported in \code{\link[texreg]{texreg}} output? 
#' @param include.loglik Should log-likelihood be reported in \code{\link[texreg]{texreg}} output? 
#' @param include.nobs Should number of observations be reported in \code{\link[texreg]{texreg}} output?
#' @param include.nind Should number of individuals be reported in \code{\link[texreg]{texreg}} output? 
#' @param include.nmun Should number of municipalities be reported in \code{\link[texreg]{texreg}} output?
#' @param use.ci Should confidence intervals be reported in \code{\link[texreg]{texreg}} output?
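#' @examples
#' \dontrun{
#' # Hypothetical sketch; `fit` is an object returned by cmultinom() and `nd` a
#' # user-supplied data.frame of covariate values.
#' coef(fit)
#' confint(fit, level = 0.95)
#' summary(fit)
#' # odds ratios with Monte Carlo confidence intervals
#' predict(fit, newdata = nd, type = "exp-link", ci.level = 0.95)
#' }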
#' @name cmultinommethods
NULL

#' @export
#' @rdname cmultinommethods
coef.cmultinom <- function(object, ...) {
  if(...length() == 0) {
    object$coef
  } else {
    df <- tibble::as_tibble(t(as.matrix(object$coef)))
    df <- dplyr::select(df, ...)
    cnames <- colnames(df)
    df <- as.numeric(t(as.matrix(df)))
    names(df) <- cnames
    df
  }
}

#' @export
#' @rdname cmultinommethods
vcov.cmultinom <- function(object, ...) {
  object$vcov
}

#' @export
#' @rdname cmultinommethods
summary.cmultinom <- function(
  object, 
  ..., 
  digits = 1) {
  out <- list(
    coefficients = coef(object, ... = ...), 
    call = object$call, 
    nObs = object$nObs, 
    nMun = object$nMun, 
    nSpell = object$nSpell, 
    digits = digits, 
    k = length(coef(object)), 
    convergence = object$convergence, 
    AIC = object$AIC, 
    deviance = object$deviance, 
    df = object$df, 
    boot = !is.null(object$nSamples), 
    nSamples = object$nSamples
  )
  
  out$sd <- se(object, ... = ...)
  out$z <- out$coefficients / out$sd
  out$pval <- 2 * pnorm(abs(out$coefficients), sd = out$sd, lower.tail = F)
  out$kselect <- length(out$coefficients)
  
  class(out) <- "summary.cmultinom"
  out
}

#' @export
#' @rdname cmultinommethods
confint.cmultinom <- function(object, ..., level = c(.9, .95)) {
  alpha <- (1-level)/2
  alpha <- sort(unique(c(alpha, 1-alpha)))
  coefs <- coef(object, ... = ...)
  se <- se(object, ... = ...)
  out <- t(mapply(qnorm, mean = coefs, sd = se, MoreArgs = list(p = alpha)))
  colnames(out) <- stats:::format.perc(alpha, 3)
  rownames(out) <- names(coefs)
  out
}

#' @export
#' @rdname cmultinommethods
predict.cmultinom <- function(object, newdata, 
                             type = c("link", "exp-link", "response", "class"), 
                             ci.level = c(.9, .95), nSamples = 1e3, intercept0 = F, ...) {
  ter <- terms(formula(object))
  hasIntercept <- attr(ter, "intercept") == 1
  x <- model.matrix(delete.response(ter), newdata)
  if(intercept0 & hasIntercept) x <- x[, -1, drop = F]
  type <- match.arg(type)
  beta <- matrix(coef(object), ncol = 2)
  if(intercept0 & hasIntercept) beta <- beta[-1, , drop = F]
  if(type %in% c("link", "exp-link")) {
    if(type == "exp-link") {
      tr <- exp
    } else {
      tr <- function(x) x
    }
    y <- tr(x %*% beta)
    if(!is.null(ci.level)) {
      alpha <- (1-ci.level)/2
      alpha <- sort(unique(c(alpha, 1-alpha)))
      betas <- MASS::mvrnorm(nSamples, coef(object), vcov(object))
      k <- ncol(betas)/2
      betas1 <- t(betas[,1:k])
      betas2 <- t(betas[,(k+1):ncol(betas)])
      if(intercept0 & hasIntercept) betas1 <- betas1[-1, , drop = F]
      if(intercept0 & hasIntercept) betas2 <- betas2[-1, , drop = F]
      ci <- dplyr::bind_rows(
        tibble::as_tibble(tr(t(apply(x %*% betas1, 1, quantile, probs = alpha)))), 
        tibble::as_tibble(tr(t(apply(x %*% betas2, 1, quantile, probs = alpha))))
      )
      y <- dplyr::bind_rows(
        tibble::tibble(param = "beta1", estimate = y[,1]), 
        tibble::tibble(param = "beta2", estimate = y[,2])
      )
      y <- dplyr::bind_cols(y, ci)
    }
  } else {
    y <- x %*% cbind(0,beta)
    y <- exp(y) / rowSums(exp(y))
    if(type == "class") y <- apply(y, 1, which.max)
  }
  y
}

#' @export
print.summary.cmultinom <- function(x, ...) {
  if(!x$convergence) warning("The model did not converge. Try to increase the maxit parameter.")
  cat("Call:\n")
  print(x$call)
  cat("\nModel Info: \n")
  sprintf("  %-13s %i spells, %i individuals, %i municipalities\n", 
          "observations:", x$nSpell, x$nObs, x$nMun) %>% cat()
  if(x$boot) {
    sprintf("  %-13s bootsrap (%i samples)\n", 
            "SE:", x$nSamples) %>% cat()
  } else {
    sprintf("  %-13s analytical\n", 
            "SE:") %>% cat()
  }
  sprintf("  %-13s %i\n\n", 
          "predictors:", x$k) %>% cat()
  if(x$k == x$kselect) {
    cat("Estimates: \n")
  } else {
    sprintf("Estimates: (showing %i/%i predictors)\n", x$kselect, x$k) %>% cat()
  }
  comat <- cbind(x$coefficients, x$sd, x$z, x$pval)
  colnames(comat) <- c("Estimate", "Std. Error", "z value", "Pr(>|z|)")
  printCoefmat(comat, has.Pvalue = T, P.values = T, digits = x$digits)
  cat("\nFit: \n")
  sprintf("  Residual deviance: %.*f on %i degrees of freedom\n", x$digits, x$deviance, x$df) %>% cat()
  sprintf("  AIC: %.*f", x$digits, x$AIC) %>% cat()
}



mycluster.vcov.multinom <- function(X, Y, beta, rank, vcov, cluster, df_correction = TRUE) {
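  # Cluster-robust sandwich: bread B = n * vcov (model-based covariance rescaled by
  # the number of individual observations n = sum(Y)); meat A = dfc * sum_g u_g' u_g / n,
  # where u_g sums the per-row scores within cluster g; the function returns
  # (1/n) * B %*% A %*% B. dfc = M/(M - 1) * (n - 1)/(n - K) is the usual Stata-style
  # small-sample correction, with M clusters and K the supplied rank.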
  n <- sum(Y)
  cluster <- match(cluster, unique(cluster))
  vcov_sign <- 1
  esttmp <- myestfun.multinom(X, Y, beta)
  df <- data.frame(
    M = length(unique(cluster)), 
    N = n, # number of individual observations (sum of counts), not the number of pooled rows
    K = rank
  )
  if (df_correction == TRUE) {
    df$dfc <- (df$M/(df$M - 1)) * ((df$N - 1)/(df$N - df$K))
  }
  else if (is.numeric(df_correction) == TRUE) {
    df$dfc <- df_correction
  }
  else {
    df$dfc <- 1
  }
  uj <- crossprod(rowsum(esttmp, cluster))
  meat <- vcov_sign * df$dfc * (uj/df$N)
  bread <- n * vcov # model-based covariance rescaled to the individual-observation count
  sandwich <- 1/n * (bread %*% meat %*% bread) # cluster-robust sandwich on the same scale
  sandwich
}


myestfun.multinom <- function(X, Y, beta) {
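  # Score of the baseline-category multinomial logit with count responses: for each
  # non-baseline category k, d logLik / d beta_k = (n_k - N * p_k) * x, where n_k is
  # the count in category k, N the row total, and p_k the fitted probability.
  # Columns are stacked equation-wise (category-2 block, then category-3 block).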
  N <- rowSums(Y)
  Y <- Y[,-1] # drop the baseline category; remaining columns are categories 2 and 3
  probs <- X %*% cbind(0, beta)
  probs <- exp(probs) / rowSums(exp(probs))
  probs <- probs[,-1] # drop the baseline probability so columns line up with Y
  
  out <- cbind(
    Y[,1] * X - N * probs[,1] * X, 
    Y[,2] * X - N * probs[,2] * X
  )
  out
}