R/emyxmiss.R
In glmfitmiss: Fitting GLMs with Missing Data in Both Responses and Covariates

Documented in emyxmiss

#----------------------------- Fitting GLM with Missing Values ------------------------
# Implementation is for the logit link, but should work for other link.
# Missing data could be present in the response variable y or in x variables
# Missing values in the covriate are assumed to be missing at random (MAR)
# Missing values in the response variable y are assumed to be nonignorable
# ------------------------------------------------- Missing -------------------------

#' Fitting generalized linear models with Incomplete data
#'
#' @description This function enables users to fit generalized linear models when handling incomplete data in both the response variable and categorical covariates. The missing responses are assumed to be nonignorable, while missing categorical covariates are assumed to be missing at random. The model is fitted using a novel likelihood-based method proposed by Pradhan, Nychka, and Bandyopadhyay (2025).
#' @param formula a formula expression as for regression models, of the form \code{response ~ predictors}. The response should be a numeric binary variable with missing values, and predictors can be any variables. A predictor with categorical values with missing can be used in the model. See the documentation of formula for other details.
#' @param data an optional data frame in which to interpret the variables occurring in formula.
#' @param adtnlCovforR an optional list of covariates to be used to fit the logistic regression \code{logit(R) ~ response+predictors+adtnlCovforR}. \code{adtnlCovforR} has to be supplied as a vector. Default is \code{NULL}.
#' @param eps0 arguments to be used to for the convergence criteria of the maximum likelihood computation of the joint likelihood function. The default is 1e-3.
#' @param maxit arguments to be used to for the maximization of the joint likelihood function. The default is 50.
#' @param family A character string specifying the type of model family.
#' @param method a method="brglmFit" or method="glm.fit" will be used for fitting model. The method="brglmFit" fits generalized linear models using bias reduction methods (Kosmidis, 2014), and other penalized maximum likelihood methods.

#' @details The `family` parameter in the `emyxmmiss` function allows you to specify the probability distribution and link function for the response variable in the linear model. It determines the nature of the relationship between the predictors and the response variable.
#' The `family` argument is particularly important when working with binary data, where the response variable has only two possible outcomes. In such cases, you typically want to fit a logistic regression model.
#'
#' The following commonly used families are supported for binary data:
#'
#' - "binomial" for a binomial distribution, suitable for binary or dichotomous response variables.
#'
#' You can also specify different link functions within each family. For the "binomial" family, the default link function is the logit function, which models the log-odds of success. Other available link functions include:
#'
#' - "probit" for the probit link function, which models the cumulative standard normal distribution.
#'
#' - "cloglog" for the complementary log-log link function, which models the complementary log-log of the survival function.
#'
#' It is important to choose the appropriate `family` and `link` function based on the specific characteristics and assumptions of your binary data. The default "binomial" family with the logit link function is often a good starting point, but alternative link functions might be more appropriate depending on the research question and the nature of the data.
#'
#' @return return the generalized linear model estimates
#' @importFrom stats terms glm coef fitted binomial
#' @importFrom dplyr %>%
#' @export
#'
#' @references
#' Firth, D. (1993). Bias reduction of maximum likelihood estimates, Biometrika, 80, 27-38. doi:10.2307/2336755.
#'
#' Ibrahim, J. G. (1990). Incomplete data in generalized linear models. Journal of the American Statistical Association 85, 765–769.
#'
#' Ibrahim, J. G., and Lipsitz, S. R. (1996). Parameter Estimation from Incom- plete Data in Binomial Regression when the Missing Data Mechanism is Nonignorable, Biometrics, 52, 1071–1078.
#'
#' Kosmidis, I., Firth, D. (2021). Jeffreys-prior penalty, finiteness and shrinkage in binomial-response generalized linear models. Biometrika, 108, 71-82. doi:10.1093/biomet/asaa052.
#'
#' Louis, T. A. (1982). Finding the observed information when using the EM algorithm. Proceedings of the Royal Statistical Society, Ser B, 44, 226-233.
#'
#' Maiti, T., Pradhan, V. (2009). Bias reduction and a solution of separation of logistic regression with missing covariates. Biometrics, 65, 1262-1269.
#'
#' Pradhan V, Nychka DW, Bandyopadhyay S (2025).  Addressing Missing Responses and Categorical Covariates in Binary Regression Modeling: An Integrated Framework  (to be submitted).
#'

#' @examples
#' \donttest{
#' data(testyxm) # testyxm is a list called dt
#' dataWithMiss <- testyxm$dataMissing
#' # Binary regression with link=logit
#' fit_yx <- emyxmiss(Wheeze ~ city + soc + cond,
#'                    data = dataWithMiss,
#'                    adtnlCovforR = c("age"),
#'                    family = binomial(link = "logit"),
#'                    method = "brglmFit")
#' fit_yx

#'
#' # Binary regression with link=probit
#' fit_yx <- emyxmiss(Wheeze ~ city + soc + cond,
#'                    data = dataWithMiss,
#'                    adtnlCovforR = c("age"),
#'                    family = binomial(link = "probit"))
#' fit_yx
#'
#'
#' # Firth correction and link=probit
#' fit_yx <- emyxmiss(Wheeze ~ city + soc + cond,
#'                    data = dataWithMiss,
#'                    adtnlCovforR = c("age"),
#'                    family = binomial(link = "probit"),
#'                    method = "brglmFit")
#' fit_yx
#'
#' # on simulated data
#' demo_df <- simulateCovariateData(50, nCov=6)
#' simulated_df <- simulateData(demo_df)
#' testMissData <- simulated_df$dataMissing
#' fit_yx <- emyxmiss(y~x2+x3+x4,
#'                    data=testMissData,
#'                    adtnlCovforR=c("x1"),
#'                    family=binomial,
#'                    method="glm.fit")
#' fit_yx
#' summary(fit_yx$fit_y)
#'}
#' @importFrom utils globalVariables
#' @importFrom stats glm
#' @importFrom brglm2 brglmFit
#' @importFrom abind abind
#' @importFrom dplyr arrange group_by mutate summarise syms distinct
emyxmiss <- function(formula, data, adtnlCovforR = NULL, eps0 = 1e-3, maxit = 75, family = "binomial", method = "glm.fit") {

  # suppressMessages(library("dplyr"))
  # suppressMessages(require("abind"))
  # suppressMessages(library("MASS"))
  # suppressMessages(library("brglm2"))

  required_packages <- c("dplyr", "abind", "brglm2", "MASS")
  for (pkg in required_packages) {
    if (!requireNamespace(pkg, quietly = TRUE)) {
      stop(sprintf("Package '%s' needed for this function to work. Please install it.", pkg), call. = FALSE)
    }
  }

  formula_vars <- all.vars(formula)
  fn_data <- if (length(adtnlCovforR) > 0) {
    subset(data, select = c(formula_vars, adtnlCovforR))
  } else {
    subset(data, select = formula_vars)
  }

  resp <- all.vars(formula)[1]
  predictor_vars <- attr(terms(formula), "term.labels")
  VarWithMissingVal <- predictor_vars[colSums(is.na(data[, predictor_vars])) > 0]

  augmented <- dataAugmentation(fn_data, formula, adtnlCovforR = adtnlCovforR)
  df_augmented <- augmented$augData
  dfn <- augmented$distptrn

  theta <- rep(1 / dfn, dfn)
  beta <- rep(0.5, length(predictor_vars) + 1)

  data_all <- df_augmented
  xname <- predictor_vars
  xname_new <- if (length(adtnlCovforR) > 0) c(xname, adtnlCovforR) else xname
  yname_new <- resp
  rname_new <- "R"
  form_r <- form_gen(rname_new, c(yname_new, xname_new))
  N <- nrow(data_all)
  data_all$ry_weight <- 1

  fit_r <- suppressWarnings(stats::glm(form_r, data = data_all, family = stats::binomial(link = "logit"), method = if (method == "brglmFit") brglm2::brglmFit else method, weights = ry_weight))
  if (fit_r$converged) {
    oldalpha <- initial_coef_alpha <- fit_r$coefficients
    initial_se_alpha <- coef(summary(fit_r))[, 2]
    prob_r_1 <- fitted(fit_r)
    prob_r_0 <- rep(1, N) - prob_r_1
    prob_r <- sapply(seq_len(N), function(i) ifelse(data_all$R[i] == 0, prob_r_0[i], prob_r_1[i]))
  } else {
    return(list(converged = FALSE))
  }

  fit_y <- suppressWarnings(stats::glm(formula, data = data, family = family, method = if (method == "brglmFit") brglm2::brglmFit else method))
  initial_coef_beta <- fit_y$coefficients
  initial_se_beta <- coef(summary(fit_y))[, 2]
  init_omit <- length(fit_y$na.action)

  fit_y <- emforbeta(formula, data = data_all, family = family, method = method,
                     augmented = augmented, theta=theta,
                     VarWithMissingVal = VarWithMissingVal)
  if (fit_y$converged) {
    oldbeta <- c(fit_y$beta, fit_y$alpha)
    prob_y <- fit_y$fitem
  } else {
    return(list(converged = FALSE))
  }

  data_all$prod <- prob_r * prob_y
  data_all <- dplyr::arrange(data_all, grp)
  tt.wgt <- data_all %>% dplyr::group_by(R, grp) %>% dplyr::mutate(wgt = prod / sum(prod))
  data_all$ry_weight <- ifelse(tt.wgt$R == 0, 1, tt.wgt$wgt)
  data_all$prod <- NULL
  oldpar <- c(oldalpha, oldbeta)

  eps <- 1
  i <- 0
  while (eps > eps0 && i <= maxit) {
    fit_yem <- emforbeta(formula, data = data_all, family = family, method = method, augmented = augmented, theta=theta,
                         VarWithMissingVal = VarWithMissingVal)
    weight <- fit_yem$wgt * data_all$ry_weight
    data_all$y_weight <- weight
    fit_y <- suppressWarnings(stats::glm(formula, data = data_all, family = family, method = if (method == "brglmFit") brglm2::brglmFit else method, weights = y_weight))
    data_all$ttl <- colSums(data_all)["y_weight"]
    a.a <- data_all %>% dplyr::group_by(!!!syms(VarWithMissingVal)) %>% dplyr::summarise(wt = mean(sum(y_weight) / ttl))
    theta <- as.vector(t(a.a$wt))

    if (fit_y$converged) {
      newbeta <- c(fit_y$coefficients, theta)
      prob_y_1 <- fitted(fit_y)
      prob_y_0 <- rep(1, N) - prob_y_1
      prob_y <- sapply(seq_len(N), function(i) ifelse(eval(parse(text = paste0("data_all$", resp, "[i]"))) == 0, prob_y_0[i], prob_y_1[i]))
    } else {
      return(list(converged = FALSE))
    }

    fit_r <- suppressWarnings(stats::glm(form_r, data = data_all, family = stats::binomial(link = "logit"), method = if (method == "brglmFit") brglm2::brglmFit else method, weights = ry_weight))
    if (fit_r$converged) {
      newalpha <- fit_r$coefficients
      prob_r_1 <- fitted(fit_r)
      prob_r_0 <- rep(1, N) - prob_r_1
      prob_r <- sapply(seq_len(N), function(i) ifelse(data_all$R[i] == 0, prob_r_0[i], prob_r_1[i]))
    } else {
      return(list(converged = FALSE))
    }

    data_all$prod <- prob_r * prob_y
    data_all <- dplyr::arrange(data_all, grp)
    tt.wgt <- data_all %>% dplyr::group_by(R, grp) %>% dplyr::mutate(wgt = prod / sum(prod))
    data_all$ry_weight <- ifelse(tt.wgt$R == 0, 1, tt.wgt$wgt)
    data_all$prod <- NULL

    newpar <- c(newalpha, newbeta)
    diff <- abs(newpar - oldpar)
    eps <- sum(diff)
    oldpar <- newpar
    i <- i + 1
  }

  fit_y <- suppressWarnings(stats::glm(formula, family = family, data = data_all, method = if (method == "brglmFit") brglm2::brglmFit else method, weights = y_weight))
  newbeta.1 <- fit_y$coefficients

  alpha <- data.frame(initial_coef_alpha, newalpha)
  names(alpha) <- c("Initial Est", "Final Est")
  beta <- data.frame(initial_coef_beta, newbeta.1)
  names(beta) <- c("Initial Est", "Final Est")

  temp_data_all <- data_all
  temp_data_all$wgt <- data_all$y_weight
  vcov_beta <- louisvcov(formula, data = temp_data_all)
  se_beta_em <- sqrt(diag(vcov_beta))

  temp_data_all <- data_all
  temp_data_all$wgt <- data_all$ry_weight
  vcov_alpha <- louisvcov(form_r, data = temp_data_all)
  se_alpha_em <- sqrt(diag(vcov_alpha))

  conv_em_se_beta <- coef(summary(fit_y))[, 2]
  conv_em_se_alpha <- coef(summary(fit_r))[, 2]

  se_beta <- cbind(initial_se_beta, se_beta_em, conv_em_se_beta)
  colnames(se_beta) <- c("se_beta_ini", "se_beta_em", "se_beta_conv")

  se_alpha <- cbind(initial_se_alpha, se_alpha_em, conv_em_se_alpha)
  colnames(se_alpha) <- c("se_alpha_ini", "se_alpha_em", "se_alpha_conv")

  if (eps <= eps0 && i <= maxit) {
    return (list(fit_r = fit_r, fit_y = fit_y, alpha = alpha, beta = beta, se_alpha = se_alpha, se_beta = se_beta, vcov_alpha = vcov_alpha, vcov_beta = vcov_beta, iter = i, eps = eps, converged = TRUE, namit = init_omit))
  } else {
    return(list(converged = FALSE))
  }
}