R/final_prog_svyglm.R

Defines functions print.svyCausal final_prog_svyglm

Documented in final_prog_svyglm print.svyCausal

# ========================================
# final_prog_svyglm: Prognostic-weighted survey GLM
# ========================================
#' Prognostic-weighted survey GLM
#' Prognostic-weighted survey GLM
#'
#' Fits a survey-weighted logistic regression model using stabilized
#' prognostic score weights derived from a model predicting the outcome
#' conditional on baseline covariates and excluding the exposure effect.
#' The function supports design-based inference under complex survey
#' sampling while adjusting for confounding through prognostic weighting
#' @param data Data frame
#' @param dep_var Character; binary outcome
#' @param exposure Character; exposure variable
#' @param covariates Character vector; adjustment variables
#' @param id_var Character; PSU
#' @param strata_var Character; strata
#' @param weight_var Character; survey weight
#' @param outcome_covariates Character vector; optional covariates for final model
#' @param level Numeric; CI level
#' @param ... Additional args to svyglm
#' @examples
#' set.seed(123)
#' n <- 1000
#' dat <- data.frame(
#'   psu = sample(1:10, n, replace = TRUE),
#'   strata = sample(1:5, n, replace = TRUE),
#'   weight = runif(n, 0.5, 2),
#'   age = rnorm(n, 50, 10),
#'   sex = factor(sample(c("Male", "Female"), n, replace = TRUE)),
#'   exposure = rbinom(n, 1, 0.5)
#' )
#' dat$outcome <- rbinom(n, 1, plogis(-2 + 0.03*dat$age + 0.5*dat$exposure))
#'fit<-final_prog_svyglm(data = dat,
#'  dep_var = "outcome",
#'  exposure="exposure",
#'  covariates = c("age", "sex"),
#'  id_var = "psu",
#'  strata_var = "strata",
#'  weight_var = "weight",
#'  level = 0.95
#' )
#' names(fit)
#' fit$OR_table
#' @return A list with:
#' \itemize{
#'   \item \code{prog_model}: Prognostic svyglm.
#'   \item \code{final_model}: Weighted outcome svyglm.
#'   \item \code{OR_table}: Odds ratios with CI.
#'   \item \code{AUC}: Weighted AUC.
#'   \item \code{data}: Data with prognostic weights.
#' }
#' @importFrom survey svydesign svyglm
#' @importFrom stats coef confint predict reformulate quasibinomial as.formula
#' @export
#--------------------------------------------------------------------------
final_prog_svyglm <- function(
    data,
    dep_var,
    exposure,
    covariates,
    id_var,
    strata_var,
    weight_var,
    outcome_covariates = NULL,
    level = 0.95,
    ...
) {
  # ---- Safety Check: Remove rows with missing critical variables ----
  required_vars <- c(dep_var, exposure, covariates, outcome_covariates)
  required_vars <- required_vars[!sapply(required_vars, is.null)]  # exclude NULL
  missing_rows <- !complete.cases(data[, required_vars, drop = FALSE])

  if (any(missing_rows)) {
    warning(sprintf("Removing %d rows with missing values in outcome, exposure, or covariates", sum(missing_rows)))
    data <- data[!missing_rows, , drop = FALSE]
  }
  # Prognostic model
  prog_formula <- stats::reformulate(c(exposure, covariates), response=dep_var)
  design0 <- survey::svydesign(
    id = stats::as.formula(paste0("~", id_var)),
    strata = stats::as.formula(paste0("~", strata_var)),
    weights =stats::as.formula(paste0("~", weight_var)),
    data = data,
    nest = TRUE
  )

  prog_fit <- survey::svyglm(prog_formula, design=design0, family=stats::quasibinomial(), ...)

  # Linear predictor minus exposure
  lp0 <-as.numeric(stats::predict(prog_fit, type="link"))
  coef_fit <- stats::coef(prog_fit)
  lp_adj <- lp0
  if(is.factor(data[[exposure]])) {
    ref <- levels(data[[exposure]])[1]
    for(lv in levels(data[[exposure]])) {
      if(lv==ref) next
      coef_name <- paste0(exposure, lv)
      if(coef_name %in% names(coef_fit)) {
        lp_adj[data[[exposure]]==lv] <- lp_adj[data[[exposure]]==lv] - coef_fit[coef_name]
      }
    }
  } else {
    if(exposure %in% names(coef_fit)) lp_adj <- lp_adj - coef_fit[exposure] * data[[exposure]]
  }

  eps <- 1e-6
  data$var_pg_sw <- pmin(pmax(stats::plogis(lp_adj), eps), 1 - eps)
  mean_y <- mean(data[[dep_var]], na.rm=TRUE)
  data$ipgtwt_pg <- (data[[dep_var]] * mean_y / data$var_pg_sw) + ((1-data[[dep_var]])*(1-mean_y)/(1-data$var_pg_sw))
  data$new_weight_pg <- data$ipgtwt_pg * data[[weight_var]]

  design_pg <- survey::svydesign(
    id = stats::as.formula(paste0("~", id_var)),
    strata = stats::as.formula(paste0("~", strata_var)),
    weights = ~new_weight_pg,
    data = data,
    nest = TRUE
  )
  # ---- Safety Check: Ensure outcome has both 0 and 1 ----
  outcome_table <- table(data[[dep_var]])
  if (length(outcome_table) < 2) {
    stop(sprintf("Outcome '%s' does not contain both 0 and 1. Cannot fit logistic model.", dep_var))
  }

  if (any(outcome_table < 5)) {
    warning(sprintf("Outcome '%s' has very few events (%d) or non-events (%d). Estimates may be unstable.",
                    dep_var, outcome_table[1], outcome_table[2]))
  }
  ##########################################
  # Final model
  if(is.null(outcome_covariates)) {
    final_form <- stats::reformulate(exposure, response=dep_var)
  } else {
    final_form <- stats::reformulate(c(exposure, outcome_covariates), response=dep_var)
  }

  final_fit <- survey::svyglm(final_form, design=design_pg, family=stats::quasibinomial(), ...)

  # OR table
  coef_fit <- stats::coef(final_fit)
  ci <- stats::confint(final_fit, level=level)
  OR_table <- data.frame(
    Variable = names(coef_fit),
    OR = exp(coef_fit),
    CI_low = exp(ci[,1]),
    CI_high = exp(ci[,2]),
    p_value = summary(final_fit)$coefficients[, "Pr(>|t|)"],
    row.names = NULL
  )

  # AUC
  pred <- stats::predict(final_fit, type="response")
  pred_class <-ifelse(pred >= 0.5, 1L, 0L)

  out<-list(
    prognostic_model = prog_fit,
    final_model = final_fit,
    OR_table = OR_table,
    outcome = data[[dep_var]],
    final_weights= as.numeric(data$new_weight_pg),
    predictions=as.numeric(pred)
  )
  class(out) <- "svyCausal"
  return(invisible(out))
}
#'@exportS3Method
# Add this small helper function OUTSIDE your main function
# This tells R: "If someone looks at this object, only show the OR Table"
print.svyCausal <- function(x, ...) {
  cat("\n--- svyCausalGLM Results ---\n")
  print(x$OR_table)
}

Try the svyCausalGLM package in your browser

Any scripts or data that you put into this service are public.

svyCausalGLM documentation built on March 3, 2026, 5:08 p.m.