R/logRegMAR.R
In glmfitmiss: Fitting GLMs with Missing Data in Both Responses and Covariates

Documented in logRegMAR

#' Fitting binary regression with missing categorical covariates using new likelihood based method
#'@description This function allows users to fit logistic regression models with incomplete predictors that are categorical. The model is fitted using a new likelihood-based method, which ensures reliable parameter estimation even when dealing with missing data. For more information on the underlying methodology, please refer to Pradhan, Nychka, and Bandyopadhyay (2024).
#'
#' @param formula A formula expression as for regression models, of the form \code{response ~ predictors}. The response should be a numeric binary variable with missing values, and predictors can be any variables. A predictor with categorical values with missing can be used in the model. See the documentation of formula for other details.
#' @param data Input data for fitting the model
#' @param conflev Confidence level, the default is 0.95
#' @param correctn a TRUE or FALSE value, by default it is TRUE.
#' @param verbose a TRUE or FALSE value, default is verbose = TRUE
#'
#' @return return the logistic regression estimates
#' @export
#'
#' @examples
#' \donttest{
#' # -----------------Example 1: Metastatic Melanoma --------------------------
#'
#' est1 <- logRegMAR (failcens ~ size+type+nodal+age+sex+trt,
#'                    data = metastmelanoma, conflev = 0.95, correctn = FALSE)
#'
#' est1
#' # -----------------Bias reduced estimates due to Firth (1993) --------------
#' est2 <- logRegMAR (failcens ~ size+type+nodal+age+sex+trt,
#'                    data = metastmelanoma, conflev = 0.95, correctn = TRUE)
#'
#' est2

#' # -----------------Bias reduced estimates due to Firth (1993) --------------

#' est2 <- logRegMAR (CaseCntrl ~ Numnill+Numsleep+Smoke+Set+Reftime,
#'                    data=meningitis, conflev = 0.95, correctn = TRUE)
#' est2
#' }

#' @references
#' Firth, D. (1993). Bias reduction of maximum likelihood estimates, Biometrika, 80, 27-38. doi:10.2307/2336755.
#'
#' Kosmidis, I., Firth, D. (2021). Jeffreys-prior penalty, finiteness and shrinkage in binomial-response generalized linear models. Biometrika, 108, 71-82. doi:10.1093/biomet/asaa052.
#'
#' Pradhan, V., Nychka, D. and Bandyopadhyay, S. (2025). Bridging Gaps in Logistic Regression: Tackling Missing Categorical Covariates with a New Likelihood Method (to be submitted).
#'
#' Pradhan, V., Nychka, D. and Bandyopadhyay, S. (2025). glmFitMiss: Binary Regression with Missing Data in R (to be submitted)
#'
#' @importFrom stats optim
logRegMAR <- function(formula, data, conflev=0.95, correctn=TRUE, verbose = TRUE){

  # Extract the response variable and covariates from the formula
  response_var <- all.vars(formula)[1]
  covariates <- all.vars(formula)[-1]

  # Check for missing values in the covariates
  if (!any(sapply(data[covariates], function(x) any(is.na(x))))) {
    if (verbose) message("This function is not applicable as there are no missing values in the covariates.\n")
    return(NULL)
  }

  # Check for missing values in the response variable
  if (any(is.na(data[[response_var]]))) {
    if (verbose) message("This function is not applicable as there are a missing values in the response variable.\n")
    return(NULL)
  }

  # Extract variable names from the formula
  term_labels <- attr(terms(formula), "term.labels")

  # Add the intercept name
  param_names <- c("(Intercept)", term_labels)

  zalpha <- abs(qnorm((1-conflev)/2))
  original_df <- data
  augData <- dataAugmentationDWN(subset(original_df, select = all.vars(formula)))
  k <- augData$distptrn

  # ----initialize the beta and theta parameters-------------
  beta = matrix(rep(1e-4), length(all.vars(formula)), 1)
  initial_theta <- matrix(1/k, k, 1)
  initial_par <- c(as.vector(beta), initial_theta)

  # Optimization call with additional arguments
  optim_result <- optim(par = initial_par, fn = llkmiss, data = original_df,
                        formula = formula, augData = augData, method = "BFGS",
                        biasCorr=correctn, hessian = TRUE)

  # Now, calculate the standard errors
  hessian_matrix <- optim_result$hessian
  hessian_submatrix_beta <- hessian_matrix[1:length(all.vars(formula)), 1:length(all.vars(formula))]
  if(is.null(hessian_submatrix_beta)){
    message("Hessian was not returned. Ensure that 'hessian=TRUE' and the method supports it")
  } else {
    hessian_inv <- solve(hessian_submatrix_beta)  # Inverse of the Hessian
    std_errors <- sqrt(diag(hessian_inv))  # Standard errors are sqrt of diagonal elements
  }

  # Extract beta estimates
  beta <- optim_result$par[1:length(beta)]
  # Set the names of the estimates
  names(beta) <- param_names

  se_beta_em <- std_errors

  # Compute the z-values from the betas and standard errors
  z_values <- beta / se_beta_em

  # Calculate the p-values using the standard normal distribution
  p_values <- 2 * (1 - pnorm(abs(z_values)))

  LowerCI <- beta -zalpha*se_beta_em
  UpperCI <- beta +zalpha*se_beta_em
  lkest <- data.frame(beta, se_beta_em, LowerCI, UpperCI, p_values)

  # rownames(lkest) <- c("Intercept", "x1", "x2", "x3")
  colnames(lkest) <- c("Estimate", "StdError", "LowerCI", "UpperCI","Pr(>|z|)")
  return (lkest)
}