R/perfect_sensitivity_EM.R
In COMBO: Correcting Misclassified Binary Outcomes in Association Studies

Documented in perfect_sensitivity_EM

#' EM-Algorithm Estimation of the Binary Outcome Misclassification Model while Assuming Perfect Sensitivity
#'
#' Code is adapted by the SAMBA R package from Lauren Beesley and Bhramar Mukherjee.
#'
#' @param Ystar A numeric vector of indicator variables (1, 0) for the observed
#'   outcome \code{Y*}. The reference category is 0.
#' @param Z A numeric matrix of covariates in the true outcome mechanism.
#'   \code{Z} should not contain an intercept.
#' @param X A numeric matrix of covariates in the observation mechanism.
#'   \code{X} should not contain an intercept.
#' @param start Numeric vector of starting values for parameters in the true
#'   outcome mechanism (\eqn{\theta}) and the observation mechanism (\eqn{\beta}), respectively.
#' @param beta0_fixed Optional numeric vector of values of the observation mechanism
#'   intercept to profile over. If a single value is entered, this corresponds to
#'   fixing the intercept at the specified value. The default is \code{NULL}.
#' @param weights Optional vector of row-specific weights used for selection bias
#'   adjustment. The default is \code{NULL}.
#' @param expected A logical value indicating whether or not to calculate the
#'   covariance matrix via the expected Fisher information matrix. The default is \code{TRUE}.
#' @param tolerance A numeric value specifying when to stop estimation, based on
#'   the difference of subsequent log-likelihood estimates. The default is \code{1e-7}.
#' @param max_em_iterations An integer specifying the maximum number of
#'   iterations of the EM algorithm. The default is \code{1500}.
#'
#' @return \code{perfect_sensitivity_EM} returns a list containing nine elements.
#'   The elements are detailed in \code{?SAMBA::obsloglikEM} documentation. Code
#'   is adapted from the \code{SAMBA::obsloglikEM} function.
#'
#' @references Beesley, L. and Mukherjee, B. (2020).
#'   Statistical inference for association studies using electronic health records:
#'   Handling both selection bias and outcome misclassification.
#'   Biometrics, 78, 214-226.
#'
#' @include expit.R
#'
#' @importFrom stats rnorm rgamma rmultinom glm predict coef
#'
perfect_sensitivity_EM <- function(Ystar, Z, X, start, beta0_fixed = NULL,
                                   weights = NULL, expected = TRUE,
                                   tolerance = 1e-7, max_em_iterations = 1500)
{

  obsloglik_var <- utils::getFromNamespace("obsloglik_var", "SAMBA")
  obsloglik_var_weighted <- utils::getFromNamespace("obsloglik_var_weighted", "SAMBA")

  if (is.data.frame(Z))
    Z <- as.matrix(Z)
  if (!is.numeric(Z))
    stop("'Z' should be a numeric matrix.")

  if (is.vector(Z))
    Z <- as.matrix(Z)
  if (!is.matrix(Z))
    stop("'Z' should be a matrix or data.frame.")

  if (!is.null(X)) {
    if (is.data.frame(X))
      X <- as.matrix(X)
    if (!is.numeric(X))
      stop("'X' must be numeric.")
    if (is.vector(X))
      X <- as.matrix(X)
    if (!is.matrix(X))
      stop("'X' must be a data.frame or matrix.")
  }

  if (!is.numeric(Ystar) || !is.vector(Ystar))
    stop("'Ystar' must be a numeric vector.")
  if (length(setdiff(0:1, unique(Ystar))) != 0)
    stop("'Ystar' must be coded 0/1.")

  n <- length(Ystar)
  if (nrow(Z) != n)
    stop("The number of rows of 'Z' must match the length of 'Ystar'.")
  if (!is.null(X) && nrow(X) != n)
    stop("The number of rows of 'X' must match the length of 'Ystar'.")

  if (!is.logical(expected) || length(expected) > 1)
    stop("'expected' must be a length one logical.")

  # initialise p for EM
  theta <- start[1:(1 + ncol(Z))]
  beta  <- start[-(1:(1 + ncol(Z)))]
  pred1 <- expit(cbind(1, Z) %*% theta)
  pred2 <- expit(cbind(1, X) %*% beta)

  calculate.p <- function(pred1, pred2)
  {
   (Ystar) * (pred1 / (pred1 + (pred2 * (1 - pred1))))
  }
  p <- calculate.p(pred1, pred2)

  it <- 1
  converged <- F

  w <- 1

  param.seq  <- matrix(c(theta, beta), 1)
  loglik.seq <- -10 ^ 9
  while (!converged && it < max_em_iterations) {
    if (is.null(beta0_fixed)) {
      suppressWarnings({
        fit.beta <- stats::glm(Ystar ~ X, weights = (1 - p) * w,
                               family = stats::binomial())
      })
    } else {
      suppressWarnings({
        fit.beta <- stats::glm(Ystar ~ 0 + X, weights = (1 - p) * w,
                               offset = rep(beta0_fixed, length(p)),
                               family = stats::binomial())
      })
    }
    suppressWarnings({
      fit.theta <- stats::glm(p ~ Z, family = stats::binomial(),
                              weights = weights)
    })
    pred1 <- stats::predict(fit.theta, type = 'response')
    pred2 <- stats::predict(fit.beta, type = 'response')
    p     <- calculate.p(pred1, pred2)

    loglik <- sum(w * Ystar * log(pred1 + (pred2 * (1 - pred1))) +
                    w * (1 - Ystar) * log((1 - pred1) * (1 - pred2)))
    loglik.seq <- c(loglik.seq, loglik)

    it <- it + 1
    if (abs(loglik.seq[it] - loglik.seq[it - 1]) < tolerance)
      converged <- TRUE

    par <- c(stats::coef(fit.theta), beta0_fixed, stats::coef(fit.beta))
    param.seq <- rbind(param.seq, par)
  }

  param <- c(stats::coef(fit.theta), beta0_fixed, stats::coef(fit.beta))
  theta <- param[1:(ncol(Z) + 1)]
  beta  <- param[-(1:(ncol(Z) + 1))]

  if (is.null(weights)) {
    var <- obsloglik_var(Ystar, Z, X, theta, beta, beta0_fixed, expected)
  } else {
    var <- obsloglik_var_weighted(Ystar, Z, X, theta, beta, beta0_fixed,
                                  weights, expected)
  }

  results <- list(param = param, variance = var, param.seq = param.seq,
                  loglik.seq = loglik.seq, Ystar = Ystar, X = X, Z = Z,
                  weights = weights, beta0_fixed = beta0_fixed)

  return(results)
}