#' Estimate a mixture model using an EM algorithm
#' 
#' @param formula a formula expression as for regression models, of pooled form \code{cbind(y1, y2, y3) ~ .}
#' @param data a \code{data.frame} in which to interpret the variables occurring in \code{formula}
#' @param penalization a scalar between 0 and 1 to penalize observations whose type is unobserved. 
#'   1 discards those observations, 0 gives them weight equal to observations whose type is observed. 
#' @param munCol column in \code{data} containing the municipality id, syntax as in \code{dplyr}
#' @param zCol column in \code{data} containing the (possibly missing) municipality type, 
#'   syntax as in \code{dplyr}
#' @param yearCol column in \code{data} containing the year, syntax as in \code{dplyr}
#' @param stateCol column in \code{data} containing the state, syntax as in \code{dplyr}
#' @param tol threshold used to determine convergence
#' @param mixTrace report progress every \code{mixTrace} iterations. Leave \code{NULL} for no reporting. 
#' @param mixMaxit maximum number of iterations
#' @param pool should \code{data} be pooled automatically? 
#' @param k rounding parameter for numerical stability: terms whose log value falls more than \code{k} 
#'   below the maximum are treated as zero (i.e., rounding at the order of \code{exp(-k)})
#' @param nSamples number of bootstrap replicates. If \code{NULL}, standard errors are not estimated. 
#' @param parallel should the bootstrap be parallelized? If \code{TRUE}, the \code{cl} argument should not be \code{NULL}
#' @param seed optional seed
#' @param cl cluster used for parallelization
#' @param ... additional parameters passed on to \code{\link[nnet]{multinom}}
#' @details Make sure to check the \link[=EMmethods]{methods} man page 
#' to see related methods, as well as the \link{posteriorprobs} man page 
#' for how to deal with predicted probabilities. 
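#' @examples 
#' \dontrun{
#' # a minimal sketch, assuming a data frame `df` with hypothetical columns: counts y1, y2, y3,
#' # a covariate x, municipality id j, a partially observed type Z, and year/state identifiers
#' fit <- EM(cbind(y1, y2, y3) ~ x, data = df, penalization = 0,
#'           nSamples = 100, trace = FALSE)
#' summary(fit)
#' }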
#' @export
EM <- function(formula, data, penalization = 0, 
               munCol = j, zCol = Z, yearCol = year, stateCol = state,
               tol = 1e-3, mixTrace = 1, mixMaxit = 1000,
               pool = T, k = 10, nSamples = NULL, parallel = F, seed = NULL, cl = NULL,
               ...) {
  munCol <- rlang::enquo(munCol)
  zCol <- rlang::enquo(zCol)
  yearCol <- rlang::enquo(yearCol)
  stateCol <- rlang::enquo(stateCol)

  data <- dplyr::rename(data, j = !!munCol) %>%
    dplyr::arrange(j) %>%
    dplyr::mutate(obs = as.integer(!is.na(!!zCol)))
  
  if(is.null(seed)) {
    seed <- sample.int(1e6, 1)
  }
  set.seed(seed)
  coefNames <- colnames(model.matrix(delete.response(terms(formula)), data))
  coefNames <- c(paste0("beta1-", coefNames), paste0("beta2-", coefNames))
  
  out <- EMfit(formula, data, 
               tol = tol, mixTrace = mixTrace, mixMaxit = mixMaxit, 
               seed = seed, pool = pool, penalization = penalization, 
               zCol = zCol, coefNames = coefNames, k = k, ...)
  
  replace0 <- list(0)
  names(replace0) <- rlang::as_name(zCol)
  nObs <- getNObs(formula, tidyr::replace_na(data, replace0), j, !!yearCol)
  
  if(!is.null(nSamples)) {
    if(is.null(seed)) seed <- sample.int(1e6, size = 1)
    set.seed(seed)
    seeds <- sample.int(1e6, nSamples, replace = F)
    seeds2 <- sample.int(1e6, nSamples, replace = F)
    bootSrc <- dplyr::select(data, mun = j, state = !!stateCol) %>% dplyr::distinct()
    boots <- getBoots(bootSrc, seeds)
    boots <- purrr::map2(boots, seeds2, ~ list(boot = .x, seed = .y))
    bootFUN <- function(boot, formula, data, 
                        tol, mixMaxit, 
                        pool, zCol, k, penalization, 
                        coefNames, munCol, ...) {
      dataBoot <- unlist(sapply(boot$boot, function(b) which(data$j == b)))
      data <- data[dataBoot,]
      mod <- c(
        list(...), 
        list(formula = formula, data = data, tol = tol, penalization = penalization, 
             mixTrace = NULL, mixMaxit = mixMaxit, seed = boot$seed, pool = pool, 
             zCol = zCol, coefNames = coefNames, k = k)
      )
      mod$trace <- F
      mod <- tryCatch(do.call(EMfit, mod), error = function(e) e)
      if(inherits(mod, "EMfit")) {
        list(coef = bootCoefAdjust(mod$coef, c("gamma", coefNames)), 
             convergence = mod$convergence, 
             trace = mod$trace, 
             error = ifelse(!is.null(mod$error), mod$error, NA))
      } else {
        coef <- rep(NA, length(coefNames)+1)
        names(coef) <- c("gamma", coefNames)
        list(coef = coef, 
             convergence = F, 
             trace = NULL, 
             error = paste(as.character(unlist(mod)), collapse = " "))
      }
    }
    if(!parallel) {
      boots <- lapply(boots, bootFUN, formula = formula, data = data, tol = tol, 
          mixMaxit = mixMaxit, pool = pool, munCol = munCol, penalization = penalization, 
          zCol = zCol, coefNames = coefNames, k = k, ... = ...
        )
    } else {
      boots <- parallel::parLapplyLB(
        cl = cl, boots, bootFUN, 
        formula = formula, data = data, tol = tol, penalization = penalization, 
        mixMaxit = mixMaxit, pool = pool, munCol = munCol, 
        zCol = zCol, coefNames = coefNames, k = k, ... = ...
      )
    }
    bootCoefs <- do.call(rbind, purrr::map(boots, ~ .$coef))
    colnames(bootCoefs) <- c("gamma", coefNames)
    
    bootsDiag <- list(
      convergence = tibble::tibble(
        sample = 1:nSamples, 
        convergence = purrr::map_lgl(boots, ~ .$convergence), 
        error = purrr::map_chr(boots, ~ .$error)
      ), 
      traces = purrr::map_dfr(boots, ~ .$trace, .id = "sample") %>% 
        dplyr::mutate(sample = as.integer(sample))
    )
    vc <- cov(bootCoefs, use = "pairwise.complete.obs")
    out$boots <- bootCoefs
    out$vcov <- vc
    out$bootsDiag <- bootsDiag
  }
  out <- c(out, list(
    formula = formula, 
    nSpell = nObs$nSpell, 
    nObs = nObs$nObs,
    nMun = nObs$nMun,
    munCol = munCol, 
    yearCol = yearCol, 
    stateCol = stateCol, 
    zCol = zCol, 
    call = match.call(), 
    nSamples = nSamples, 
    df = length(out$coef), 
    penalization = penalization, 
    k = k
  ))
  out$deviance <- -2*out$logLik
  out$AIC <- 2*(out$df - out$logLik)
  out$weights <- dplyr::rename(out$weights, !!munCol := j, !!zCol := Z)

  class(out) <- "EM"
  out
}

EMfit <- function(formula, data, seed, tol, mixTrace, mixMaxit, zCol, coefNames, k, pool, penalization, ...) {
  # some data prep
  set.seed(seed)
  dataMun <- dplyr::select(data, j, !!zCol, obs) %>% dplyr::distinct()
  keyJ <- tibble::tibble(oJ = dataMun$j, nJ = match(dataMun$j, dataMun$j))
  data$j <- match(data$j, dataMun$j)
  dataMun$j <- 1:nrow(dataMun)
  
  replace0 <- list(0)
  replace1 <- list(1)
  names(replace0) <- names(replace1) <- rlang::as_name(zCol)
  
  dataAug <- dplyr::bind_rows(
    dplyr::mutate(data, trueZ = !!zCol, !!zCol := 0), 
    dplyr::mutate(data, trueZ = !!zCol, !!zCol := 1)
  ) %>% dplyr::arrange(!!zCol, j)

  Y <- model.response(model.frame(formula, data = dataAug))
  X <- model.matrix(delete.response(terms(formula)), data = dataAug)
  Z <- w <- dplyr::pull(dataMun, !!zCol)

  # initialization
  if(!is.null(mixTrace)) message("initial calibration")
  calData <- dplyr::filter(data, obs == 1)
  if(pool) calData <- poolDf(formula, calData)
  params <- list(
    gamma = mean(w, na.rm = T),
    mod = nnet::multinom(formula, data = calData, ...)
  )
  initBeta <- linCoefs(params$mod)
  initBeta <- bootCoefAdjust(initBeta, coefNames)
  initBeta[is.na(initBeta)] <- 0
  initBeta <- t(matrix(initBeta, ncol = 2))
  params$beta <- initBeta
  w[is.na(w)] <- mean(w, na.rm = T)

  # EM
  diff <- tol + 1
  LLOld <- NULL
  i <- 0
  LLs <- c()
  diffs <- c()
  while(diff >= tol & i < mixMaxit) {
    multLik <- getMultLik(params, dataAug, Y, X, k, zCol)
    w <- getWeights(multLik, k)
    params <- estimate(formula, w, Z, dataAug, params$beta, pool, zCol, penalization, ...)
    if(is.null(LLOld)) {
      LL <- logLik(multLik, k, zCol, penalization)
      LLOld <- LL - (tol + 1)
    } else {
      LLOld <- LL
      LL <- logLik(multLik, k, zCol, penalization)
    }
    diff <- LL - LLOld
    LLs <- c(LLs, LL)
    diffs <- c(diffs, diff)
    i <- i + 1
    if(!is.null(mixTrace)) {
      if(i %% mixTrace == 0) message(sprintf("Iteration %s: Delta = %s", i, round(diff, 4)))
      if(diff < 0) warning(sprintf("/!\\ Iteration %s: Delta = %s", i, round(diff, 4)))
    }
  }

  coefs <- c(gamma = params$gamma, linCoefs(params$mod))
  convergence <- TRUE
  error <- NULL
  if(diff < 0) {
    convergence <- FALSE 
    error <- "decreasing LL"
  } 
  if(diff > tol) {
    convergence <- FALSE 
    error <- "reached max. number of iterations"
  }
  out <- list(
    coef = coefs,
    weights = tibble::tibble(j = keyJ$oJ, Z = Z, prob = w),
    model = params$mod,
    logLik = LL,
    convergence = convergence,
    error = error, 
    nIter = i,
    seed = seed,
    trace = tibble::tibble(
      iteration = 1:i,
      LL = LLs,
      diff = diffs
    )
  )
  class(out) <- "EMfit"
  out
}



# translate coefs from nnet::multinom to Wts (param for starting weights in nnet::nnet)
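# i.e., the (K-1) x p coefficient matrix is padded with a zero column for the reference category
# and a zero top row, then flattened column-major into the vector format expected by nnet's Wts argument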
coefToWts <- function(co) {
  co <- cbind(0, t(co))
  co <- rbind(0, co)
  as.numeric(co)
}

# (log) density of a multinomial draw given the coefficients of a multinomial logit:
# log f(y) = lgamma(N+1) - sum_i lgamma(y_i+1) + sum_i y_i * log p_i, with the baseline category's
# linear predictor fixed at 0 -- computed on the log scale to avoid exp(log(...)) round trips
mdmultinom <- function(Y, X, beta, log = F, k) {
  Xbeta <- X %*% t(beta)
  # contribution of the linear predictors (baseline category has Xbeta = 0) and the y_i! terms
  icomp <- rowSums(Y[,-1] * Xbeta) - rowSums(lgamma(Y + 1))
  N <- rowSums(Y)
  # normalizing constant: lgamma(N+1) minus N times the log denominator of the softmax probabilities
  gcomp <- lgamma(N+1) - N * apply(cbind(0, Xbeta), 1, LSE, k = k)
  out <- icomp + gcomp
  if(!log) out <- exp(out)
  out
}

# log-sum-exp trick
# with smoothing as in http://u.cs.biu.ac.il/~shey/919-2011/index.files/underflow%20and%20smoothing%20in%20EM.pdf
LSE <- function(x, k) {
  m <- max(x)
  x <- x - m
  x <- x[x > -k]
  m + log(sum(exp(x)))
}
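# for intuition: LSE(c(-1000, -1001), k = 10) returns -1000 + log(1 + exp(-1)), while
# LSE(c(-1000, -1020), k = 10) drops the second term (more than k below the max) and returns -1000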

# function for M-step
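# gamma is updated as a weighted mean of the known types (weight 1) and the posterior probabilities
# of the unobserved ones (weight 1 - penalization); beta is re-estimated by a weighted multinomial
# logit on the augmented data, each municipality-type row weighted by the posterior probability of
# that type (or the 0/1 indicator of the true type when observed), with unobserved rows further
# down-weighted by 1 - penalization, starting from the previous coefficients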
estimate <- function(formula, w, Z, dfAug, start, pool, zCol, penalization, ...) {
  wGamma <- ifelse(is.na(Z), 1 - penalization, 1)
  gamma <- weighted.mean(ifelse(is.na(Z), w, Z), wGamma)
  
  zTmp <- dplyr::pull(dfAug, !!zCol)
  dfAug$wgt <- w[dfAug$j]
  dfAug$wgt <- ifelse(zTmp == 1, dfAug$wgt, 1-dfAug$wgt)
  dfAug$wgt <- ifelse(dfAug$obs == 1, as.integer(zTmp == dfAug$trueZ), dfAug$wgt)
  dfAug$wgt <- ifelse(dfAug$obs == 1, dfAug$wgt, (1-penalization) * dfAug$wgt)
  
  if(pool) {
    dfEst <- poolDf(formula, dfAug, dfAug$wgt)
    mod <- nnet::multinom(formula, data = dfEst, Wts = coefToWts(start), ...)
  } else {
    mod <- nnet::multinom(formula, data = dfAug, weights = wgt, Wts = coefToWts(start), ...)
  }
  
  list(beta = coef(mod), gamma = gamma, mod = mod)
}

# function for E-step
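# the posterior probability that municipality j is of type 1 is
#   w_j = gamma * f(y_j | Z = 1) / [gamma * f(y_j | Z = 1) + (1 - gamma) * f(y_j | Z = 0)];
# pi + pr below is the log numerator for each type, normalized with the same truncation rule as LSE()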
getWeights <- function(multLik, k) {
  w <- matrix(multLik$pi + multLik$pr, ncol = 2)
  # smoothing as in http://u.cs.biu.ac.il/~shey/919-2011/index.files/underflow%20and%20smoothing%20in%20EM.pdf
  apply(w, 1, function(v) {
    m <- max(v)
    v <- v - m
    if(v[2] < -k) {
      return(0)
    } 
    if(v[1] < -k) {
      return(1)
    }
    v <- exp(v)
    return(v[2] / sum(v))
  })
}

# intermediate object: per municipality and type, the log prior (pi) and the summed log likelihood (pr)
getMultLik <- function(params, dfAug, Y, X, k, zCol) {
  dplyr::mutate(dfAug, pr = mdmultinom(Y, X, coef(params$mod), log = T, k = k)) %>% 
    dplyr::group_by(j, !!zCol, trueZ, obs) %>% 
    dplyr::summarize(pr = sum(pr)) %>% 
    dplyr::ungroup() %>% 
    dplyr::mutate(pi = ifelse(!!zCol == 1, log(params$gamma), log(1 - params$gamma))) %>%
    dplyr::select(j, Z = !!zCol, trueZ, obs, pr, pi) %>%
    dplyr::arrange(Z, j)
}

# compute incomplete likelihood
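# observed municipalities contribute log(pi_z) + log f(y_j | z_j); unobserved ones contribute
# log[(1 - gamma) * f(y_j | Z = 0) + gamma * f(y_j | Z = 1)], down-weighted by 1 - penalization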
logLik <- function(multLik, k, zCol, penalization) {
  llk <-  dplyr::filter(multLik, obs == 1) %>% 
    dplyr::filter(trueZ == Z)
  llu <- dplyr::filter(multLik, obs == 0)
  llu <- cbind(
    llu$pi[llu$Z == 0] + llu$pr[llu$Z == 0], 
    llu$pi[llu$Z == 1] + llu$pr[llu$Z == 1]
  )
  ll <- sum(llk$pi + llk$pr) + 
    (1-penalization) * sum(apply(llu,1,LSE, k = k))
  ll
}

#### methods ####

#' @exportClass EM
setClass("EM")

#' @export
#' @rdname EMmethods
extract.EM <- function(
  model, 
  include.aic = TRUE, include.loglik = TRUE, include.nobs = TRUE, include.nind = T, include.nmun = T, 
  level = .95, use.ci = T
) {
  cl <- list(
    coef.names = names(coef(model)), 
    coef = coef(model),
    se = sqrt(diag(vcov(model))), 
    pvalues = 2 * pnorm(abs(coef(model)), sd = sqrt(diag(vcov(model))), lower.tail = F), 
    gof.names = character(0), 
    gof = numeric(0), 
    gof.decimal = logical(0)
  )
  if(use.ci) {
    cl$ci.low <- confint(model, level = level)[,1]
    cl$ci.up <- confint(model, level = level)[,2]
  }
  
  if(include.loglik) {
    cl$gof.names <- c(cl$gof.names, "Log Likelihood")
    cl$gof <- c(cl$gof, as.numeric(model$logLik))
    cl$gof.decimal <- c(cl$gof.decimal, T)
  }
  if(include.aic) {
    cl$gof.names <- c(cl$gof.names, "AIC")
    cl$gof <- c(cl$gof, as.numeric(model$AIC))
    cl$gof.decimal <- c(cl$gof.decimal, T)
  }
  if(include.nobs) {
    cl$gof.names <- c(cl$gof.names, "Num. obs.")
    cl$gof <- c(cl$gof, as.numeric(model$nSpell))
    cl$gof.decimal <- c(cl$gof.decimal, F)
  }
  if(include.nind) {
    cl$gof.names <- c(cl$gof.names, "Num. ind.")
    cl$gof <- c(cl$gof, as.numeric(model$nObs))
    cl$gof.decimal <- c(cl$gof.decimal, F)
  }
  if(include.nmun) {
    cl$gof.names <- c(cl$gof.names, "Num. mun.")
    cl$gof <- c(cl$gof, as.numeric(model$nMun))
    cl$gof.decimal <- c(cl$gof.decimal, F)
  }
  do.call(texreg::createTexreg, cl)
}
setMethod("extract", signature = "EM", definition = extract.EM)

#' Methods for \code{\link{EM}} objects
#' @param object,model an object of class \code{EM}
#' @param ... use to select variables a la \code{\link[dplyr]{select}}
#' @param level level for confidence intervals
#' @param digits number of digits to be displayed
#' @param newdata a \code{data.frame} in which to look for variables with which to predict
#' @param type the type of prediction required. \code{"posterior"} returns the posterior probability of having Z = 1.
#' @param k rounding parameter for numerical stability: terms whose log value falls more than \code{k} 
#'   below the maximum are treated as zero (i.e., rounding at the order of \code{exp(-k)})
#' @param include.aic Should AIC be reported in \code{\link[texreg]{texreg}} output? 
#' @param include.loglik Should log-likelihood be reported in \code{\link[texreg]{texreg}} output? 
#' @param include.nobs Should number of observations be reported in \code{\link[texreg]{texreg}} output?
#' @param include.nind Should number of individuals be reported in \code{\link[texreg]{texreg}} output? 
#' @param include.nmun Should number of municipalities be reported in \code{\link[texreg]{texreg}} output?
#' @param use.ci Should confidence intervals be reported in \code{\link[texreg]{texreg}} output?
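#' @examples
#' \dontrun{
#' # a minimal sketch, assuming `fit` is an object returned by EM() and `df` the data used to fit it
#' coef(fit)
#' vcov(fit)
#' summary(fit, digits = 2)
#' predict(fit, newdata = df, type = "posterior")
#' }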
#' @name EMmethods
NULL


#' @export
#' @rdname EMmethods
coef.EM <- function(object, ...) {
  if(...length() == 0) {
    object$coef
  } else {
    df <- tibble::as_tibble(t(as.matrix(object$coef)))
    df <- dplyr::select(df, ...)
    cnames <- colnames(df)
    df <- as.numeric(t(as.matrix(df)))
    names(df) <- cnames
    df
  }
}

#' @export
#' @rdname EMmethods
vcov.EM <- function(object, ...) {
  object$vcov
}

#' @export
#' @rdname EMmethods
summary.EM <- function(
  object, 
  ..., 
  digits = 1) {
  out <- list(
    coefficients = coef(object, ... = ...), 
    call = object$call, 
    nObs = object$nObs, 
    nMun = object$nMun, 
    nSpell = object$nSpell, 
    digits = digits, 
    k = length(coef(object)), 
    convergence = object$convergence, 
    error = object$error, 
    AIC = object$AIC, 
    deviance = object$deviance, 
    df = object$df, 
    boot = !is.null(object$nSamples), 
    nSamples = object$nSamples, 
    nIter = object$nIter
  )
  
  out$sd <- se(object, ... = ...)
  if(!is.null(out$sd)) {
    out$z <- out$coefficients / out$sd
    out$pval <- 2 * pnorm(abs(out$coefficients), sd = out$sd, lower.tail = F)
  } else {
    out$sd <- out$pval <- out$z <- rep(NA, length(out$coefficients))
  }
  if(!out$convergence) {
    if(rev(object$trace$diff)[1] < 0) out$diff <- rev(object$trace$diff)[1]
  }
  out$kselect <- length(out$coefficients)
  if(out$boot) {
    out$nConverge <- sum(object$bootsDiag$convergence$convergence)
    if(out$nConverge != out$nSamples) {
      diag <- table(object$bootsDiag$convergence$error[!object$bootsDiag$convergence$convergence])
      out$diag <- matrix(diag, ncol = 1, dimnames = list(names(diag), "N"))
      meanDiff <- object$bootsDiag$traces$diff
      meanDiff <- meanDiff[meanDiff < 0]
      if(length(meanDiff) > 0) out$meanDiff <- meanDiff
    }
  }
  class(out) <- "summary.EM"
  out
}

#' @export
print.summary.EM <- function(x, ...) {
  cat("Call:\n")
  print(x$call)
  cat("\nModel Info: \n")
  sprintf("  %-13s %i spells, %i individuals, %i municipalities\n", 
          "observations:", x$nSpell, x$nObs, x$nMun) %>% cat()
  sprintf("  iterations: %i\n", x$nIter) %>% cat()
  if(x$boot) {
    sprintf("  %-13s bootsrap (%i samples)\n", 
            "  SE:", x$nSamples) %>% cat()
  } else {
    cat("  SE: none\n")
  }
  sprintf("  %-13s %i\n\n", 
          "predictors:", x$k) %>% cat()
  if(x$k == x$kselect) {
    cat("Estimates: \n")
  } else {
    sprintf("Estimates: (showing %i/%i predictors)\n", x$kselect, x$k) %>% cat()
  }
  comat <- cbind(x$coefficients, x$sd, x$z, x$pval)
  colnames(comat) <- c("Estimate", "Std. Error", "z value", "Pr(>|z|)")
  printCoefmat(comat, has.Pvalue = T, P.values = T, digits = x$digits)
  cat("\nFit: \n")
  sprintf("  Residual deviance: %.*f on %i degrees of freedom\n", x$digits, x$deviance, x$df) %>% cat()
  sprintf("  AIC: %.*f\n", x$digits, x$AIC) %>% cat()
  if(x$boot) {
    cat("\n Diagnostics: \n")
    if(!x$convergence) {
      sprintf("  The model did not converge.\n  Error: %s", x$error) %>% cat()
      if(!is.null(x$diff)) sprintf(", Delta last iteration: %.*g", x$digits, x$diff) %>% cat()
      cat("\n")
    } 
    sprintf("  %i/%i bootsrap samples converged.\n", x$nConverge, x$nSamples) %>% cat()
    if(!is.null(x$diag)) {
      cat("  Summary of errors in bootstrap samples: \n")
      print(x$diag)
    }
    if(!is.null(x$meanDiff)) {
      cat("  Decreasing LL errors in bootsrap samples:\n  Delta last iteration\n")
      print(summary(x$meanDiff))
    }
    
  }
}

#' @export
#' @rdname EMmethods
predict.EM <- function(object, newdata, type = c("posterior"), k = object$k, ...) {
  type <- type[1]
  if(type == "posterior") {
    co <- coef(object)
    gamma <- co[1]
    betas <- t(matrix(co[-1], ncol = 2))
    zCol <- object$zCol
    munCol <- object$munCol
    newdata <- dplyr::arrange(newdata, !!munCol) %>% 
      dplyr::rename(j = !!munCol)
    muns <- dplyr::pull(newdata, j)
    data0 <- dplyr::mutate(newdata, !!zCol := 0)
    data1 <- dplyr::mutate(newdata, !!zCol := 1)
    y <- model.response(model.frame(formula(object), data0))
    x0 <- model.matrix(delete.response(terms(formula(object))), data0)
    x1 <- model.matrix(delete.response(terms(formula(object))), data1)
    ll0 <- log(1-gamma) + rowsum(mdmultinom(y, x0, betas, T, k), muns)
    ll1 <- log(gamma) + rowsum(mdmultinom(y, x1, betas, T, k), muns)
    w <- cbind(ll0, ll1)
    w <- apply(w, 1, function(v) {
      m <- max(v)
      v <- v - m
      if(v[2] < -k) {
        return(0)
      } 
      if(v[1] < -k) {
        return(1)
      }
      v <- exp(v)
      return(v[2] / sum(v))
    })
    names(w) <- unique(muns)
    w
  }
}

#' @rdname posteriorprobs
#' @export
posterior_probs.EM <- function(object) object$weights

#' @rdname posteriorprobs
#' @export
confusion_table.EM <- function(object, cutoff = .5) {
  pr <- posterior_probs(object) %>% na.omit() %>% 
    dplyr::mutate(prob = as.integer(prob > cutoff))
  table(predicted = pr$prob, true = dplyr::pull(pr, !!object$zCol))
}