rogali: A series of functions for the corrupt_audit project

Documented in riskCurve

#' Fit a parametric risk curve
#' @param object a fitted model object
#' @param dataSample a \code{data.frame} on which the curve is to be calibrated against. Must contain all predictors of \code{object}
#' @param timeInvariant if not \code{NULL}, a 1 row \code{data.frame} setting time invariant variables used of the fit. 
#' @param timeVarying if not \code{NULL}, a \code{data.frame} with a column indicating years as in \code{object} 
#'   as well as other time-varying covariates. 
#' @param weights optional vector of weights of length \code{nrow(dataSample)} (useful if \code{object} was fitted using a matched sample)
#' @param level if not \code{NULL} levels for confidence intervals. 
#' @param nSamples if \code{level} is not \code{NULL}, number of Monte Carlo samples to be drawn 
#'   to compute the confidence intervals around predictions.
#' @details A risk curve is the cumulative probability of an event up to some time point. For instance, it would be the probability of having been fired 
#'   up to year 2010. Confidence intervals around each event denote the probability of this event as opposed to other events. As such, confidence 
#'   intervals for all events do not sum to 1. The risk curve is calibrated at mean sample values, using the data in \code{dataSample}. 
#' @return If \code{level} is \code{NULL}, a \code{length(years)} rows, 3 columns \code{tibble} with year, and probability for 2 of the three outcomes. If 
#'   \code{level} is not \code{NULL}, a \code{2 * length(years)} rows \code{tibble} with year and probability for 2 of the three outcomes, and as many columns 
#'   as necessary for confidence intervals. 
#' @export

riskCurve <- function(object, dataSample, timeInvariant = NULL, timeVarying = NULL, weights = NULL, level = c(.9, .95), nSamples = 1e3) {
  if(is.null(weights) & (!is.null(object$formulaMatch))) {
    warning(
      "The model was estimated using a matched sample, but you're not using weights to fit the survival curve. Are you sure? (weights are in object$MatchIt$weights)"
    )
  }
  yearCol <- object$yearCol
  years <- dplyr::pull(dataSample, !!yearCol) %>% unique()
  minYear <- min(as.numeric(as.character(years)))
  if(is.null(timeVarying)) timeVarying <- tibble::tibble(!!yearCol := years)
  if(!all(years %in% dplyr::pull(timeVarying, !!yearCol))) stop("timeVarying does not contain all years")
  if(nrow(timeVarying) != length(years)) stop("timeVarying does not contain one row per year")
  
  mf <- dplyr::filter(dataSample, as.character(!!yearCol) == as.character(minYear))
  y <- model.response(model.frame(formula(object), mf))
  if(!is.null(weights)) y <- y * weights
  mf <- model.frame(delete.response(terms(formula(object))), mf)
  mf <- cbind(y = rowSums(y), mf)
  cInvariant <- colnames(timeInvariant)
  cVarying <- colnames(timeVarying)
  cols <- c(cInvariant, cVarying)
  mf <- dplyr::select(mf, -cols)
  mf <- dplyr::group_by_at(mf, -1) %>% 
    dplyr::summarize(y = sum(y)) %>% 
    dplyr::ungroup()
  mf <- dplyr::bind_cols(mf, timeInvariant[rep(1,nrow(mf)),])
  mf <- dplyr::inner_join(
    dplyr::mutate(mf, joinCol = 1), 
    dplyr::mutate(timeVarying, joinCol = 1), 
    by = "joinCol") %>% 
    dplyr::select(-joinCol)
  
  probs <- predict(object, mf, type = "response")
  pe <- tibble::as_tibble(makeSurv(mf, probs, yearCol)) %>% 
    dplyr::mutate(year = sort(as.numeric(as.character(years)))) %>% 
    dplyr::select(year, outcome_1, outcome_2)
  
  if(!is.null(level)) {
    alpha <- (1-level)/2
    alpha <- sort(unique(c(alpha, 1-alpha)))
    betas <- MASS::mvrnorm(nSamples, coef(object), vcov(object))
    x <- model.matrix(delete.response(terms(formula(object))), mf)
    probs <- as.numeric(apply(betas, 1, predictSurv, x = x))
    probs <- array(probs, c(nrow(x), 3, nrow(betas)))
    survs <- as.numeric(apply(probs, 3, makeSurv, mf = mf, yearCol = yearCol))
    survs <- array(survs, c(length(years), 2, nrow(betas)))
    ci <- dplyr::bind_rows(
      tibble::as_tibble(t(apply(survs[,1,], 1, quantile, probs = alpha))), 
      tibble::as_tibble(t(apply(survs[,2,], 1, quantile, probs = alpha)))
    )
    pe <- tidyr::gather(pe, outcome, value, -year) %>% 
      dplyr::bind_cols(ci)
  }
  pe
}


predictSurv <- function(x, vBeta) {
  vBeta <- cbind(0, matrix(vBeta, ncol = 2))
  probs <- exp(x %*% vBeta)
  probs / rowSums(probs)
}


makeSurv <- function(mf, probs, yearCol) {
  colnames(probs) <- sprintf("outcome_%s", 0:2)
  probs <- tibble::as_tibble(probs)
  mf <- dplyr::bind_cols(
    dplyr::select(mf, year = !!yearCol, y), 
    probs
  ) %>% dplyr::group_by(year) %>% 
    dplyr::summarize(
      outcome_0 = stats::weighted.mean(outcome_0, y), 
      outcome_1 = stats::weighted.mean(outcome_1, y), 
      outcome_2 = stats::weighted.mean(outcome_2, y)
    ) %>% 
    dplyr::ungroup() %>% 
    dplyr::mutate(year = as.numeric(as.character(year))) %>% 
    dplyr::arrange(year)
  
  mf %>% 
    dplyr::mutate(
      outcome_0 = cumprod(outcome_0), 
      S = dplyr::lag(outcome_0)
    ) %>% 
    tidyr::replace_na(list(S = 1)) %>% 
    dplyr::mutate(
      outcome_1 = cumsum(outcome_1 * S), 
      outcome_2 = cumsum(outcome_2 * S)
    ) %>% 
    dplyr::select(outcome_1, outcome_2) %>% 
    as.matrix()
}