R/rm_mvsum2.R

Defines functions combine_uv_mv get_model_args rm_mvsum

Documented in rm_mvsum

#'Format a regression model nicely for 'Rmarkdown'
#'
#'Multivariable (or univariate) regression models are re-formatted for reporting
#'and a global p-value is added for the evaluation of factor variables.
#'
#'Global p-values are likelihood ratio tests for lm, glm and polr models. For
#'lme models an attempt is made to re-fit the model using ML and if successful
#'LRT is used to obtain a global p-value. For lmer models (lme4), if the
#'lmerTest package is installed, Satterthwaite-based p-values and F-test global
#'p-values are used; otherwise Wald z-based p-values and chi-squared LRT global
#'p-values are returned. For glmer models (lme4), Wald z-based p-values are
#'used with chi-squared LRT global p-values. Estimates are exponentiated for
#'binomial (OR) and poisson/negative binomial (RR) families. For coxph models
#'the model is re-run without robust variances with and without each variable
#'and a LRT is presented. If unsuccessful a Wald p-value is returned. For GEE
#'and CRR models Wald global p-values are returned. For negative binomial
#'models a deviance test is used.
#'
#'If the variance inflation factor is requested (VIF=TRUE, default) then a
#'generalised VIF will be calculated in the same manner as the car package.
#'
#'As of version 0.1.1 if global p-values are requested they will be included in
#'the p-value column.
#'
#' As of R 4.4.0 profile likelihood confidence intervals will
#'be calculated automatically and there is no longer an option to force Wald
#'tests.
#'
#'The number of decimals places to display the statistics can be changed with
#'digits, but this will not change the display of p-values. If more significant
#'digits are required for p-values then use tableOnly=TRUE and format as
#'desired.
#'@param model model fit
#'@param data `r lifecycle::badge("deprecated")` data is no longer required as
#'  it is extracted from the model. This argument will be removed in the future
#'@param digits number of digits to round estimates to, does not affect p-values
#'@param covTitle character with the names of the covariate (predictor) column.
#'  The default is to leave this empty for output or, for table only output to
#'  use the column name 'Covariate'.
#'@param showN boolean indicating sample sizes should be shown for each
#'  comparison, can be useful for interactions
#'@param showEvent boolean indicating if number of events should be shown. Only
#'  available for logistic.
#'@param CIwidth width for confidence intervals, defaults to 0.95
#'@param vif boolean indicating if the variance inflation factor should be
#'  included. See details
#'@param whichp string indicating whether you want to display p-values for
#'  levels within categorical data ("levels"), global p values ("global"), or
#'  both ("both"). Irrelevant for continuous predictors.
#'@param caption table caption
#'@param tableOnly boolean indicating if unformatted table should be returned
#'@param p.adjust p-adjustments to be performed. Uses the [p.adjust] function
#'  from base R
#'@param unformattedp boolean indicating if you would like the p-value to be
#'  returned unformatted (ie not rounded or prefixed with '<'). Should be used
#'  in conjunction with the digits argument.
#'@param nicenames boolean indicating if you want to replace . and _ in strings
#'  with a space
#' @param include_unadjusted Logical. If TRUE, includes univariate estimates
#'   alongside multivariable estimates. Default is FALSE.
#'@param chunk_label Deprecated, previously used in Word to allow
#'  cross-referencing, this should now be done at the chunk level.
#'@param fontsize PDF/HTML output only, manually set the table fontsize
#'@return A character vector of the table source code, unless tableOnly=TRUE in
#'  which case a data frame is returned
#'@export
#'@references John Fox & Georges Monette (1992) Generalized Collinearity
#'  Diagnostics, Journal of the American Statistical Association, 87:417,
#'  178-183, \doi{10.1080/01621459.1992.10475190}
#'@references  John Fox and Sanford Weisberg (2019). An {R} Companion to Applied
#'  Regression, Third Edition. Thousand Oaks CA: Sage.
#' @examples
#' data("pembrolizumab")
#' glm_fit = glm(change_ctdna_group~sex:age+baseline_ctdna+l_size,
#' data=pembrolizumab,family = 'binomial')
#' rm_mvsum(glm_fit)
#'
#' #linear model with p-value adjustment
#' lm_fit=lm(baseline_ctdna~age+sex+l_size+tmb,data=pembrolizumab)
#' rm_mvsum(lm_fit,p.adjust = "bonferroni")
#' #Coxph
#' require(survival)
#' res.cox <- coxph(Surv(os_time, os_status) ~ sex+age+l_size+tmb, data = pembrolizumab)
#' rm_mvsum(res.cox, vif=TRUE)
#'
#' # lmer (lme4 mixed effects model) - single random intercept
#' \donttest{
#' if (require(lme4)){
#' lmer_fit <- lme4::lmer(age ~ sex + pdl1 + (1|cohort), data = pembrolizumab)
#' rm_mvsum(lmer_fit)
#' }
#' }
#'
#' # lmer with multiple random effects and global p-values
#' \donttest{
#' if (require(lme4) && require(geepack)){
#' data(dietox, package = "geepack")
#' dietox$Cu <- as.factor(dietox$Cu)
#' lmer_fit2 <- lme4::lmer(Weight ~ Cu + Time + (1|Pig) + (1|Litter), data = dietox)
#' rm_mvsum(lmer_fit2, whichp = "both")
#' }
#' }
#'
#' # glmer (binomial mixed effects model) - odds ratios
#' \donttest{
#' if (require(lme4)){
#' data(cbpp, package = "lme4")
#' glmer_fit <- lme4::glmer(cbind(incidence, size - incidence) ~ period + (1|herd),
#'   data = cbpp, family = binomial)
#' rm_mvsum(glmer_fit)
#' }
#' }
#'
#' # glmer.nb (negative binomial mixed effects model) - rate ratios
#' \donttest{
#' if (require(lme4) && require(geepack)){
#' data(dietox, package = "geepack")
#' dietox$Cu <- as.factor(dietox$Cu)
#' nb_fit <- lme4::glmer.nb(Weight ~ Cu + Time + (1|Pig), data = dietox)
#' rm_mvsum(nb_fit, whichp = "both")
#' }
#' }
rm_mvsum <- function(model, data, digits=getOption("reportRmd.digits",2),covTitle='',showN=TRUE,showEvent=TRUE,CIwidth=0.95, vif=TRUE,
                     whichp=c("levels","global","both"),
                     caption=NULL,tableOnly=FALSE,p.adjust='none',unformattedp=FALSE,nicenames = TRUE,include_unadjusted=FALSE,
                     chunk_label, fontsize){
  if (unformattedp) formatp <- function(x) {as.numeric(x)}
  whichp <- match.arg(whichp)

  # Handle multiply imputed (mira) models
  is_mira <- inherits(model, "mira")
  if (is_mira) {
    if (!requireNamespace("mice", quietly = TRUE))
      stop("The mice package is required for multiply imputed model summaries.")
    fit1 <- model$analyses[[1]]
  }

  if (!missing(data)) lifecycle::deprecate_soft("0.1.1","rm_mvsum(data)")
  if (!missing(chunk_label)) lifecycle::deprecate_soft("0.1.1","rm_mvsum(chunk_label)")
  model_coef <- get_model_coef(model)
  if (any(is.na(model_coef))) warning(paste0('The following model coefficients could not be estimated and are excluded from the table:\n',
                                             paste(names(model_coef)[is.na(model_coef)],collapse = ", "),
                                             "\nConsider re-fitting the model or running droplevels."))
  # get the table
  tab <- m_summary(model, CIwidth = CIwidth, digits = digits, vif = vif, whichp = whichp, for_plot = FALSE)
  if (include_unadjusted && is_mira) {
    message("Unadjusted estimates are not supported for multiply imputed models.")
    include_unadjusted <- FALSE
  }
  if (include_unadjusted) {
    m_sum <- m_summary(model, CIwidth = CIwidth, digits = digits, vif = vif, whichp = whichp, for_plot = TRUE)
    ma <- get_model_args(model)

    # For univariate models, we need the original data with individual columns
    # not the model frame which may have Surv() objects for survival models
    uv_data <- if (!missing(data)) data else get_model_data(model)

    tabUV <- rm_uvsum(response = ma$response, covs = ma$predictors, data = uv_data,
                      tableOnly = TRUE, nicenames = FALSE, unformattedp = unformattedp)
    tab <- combine_uv_mv(tabUV, m_sum, tab)
  }
  if (!showN) {
    rmc <- grep("^N(\\s|$)", names(tab))
    if (length(rmc)>0) tab <- tab[,-rmc ]
  }
  if (!showEvent) {
    rmc <- grep("^Event(\\s|$)", names(tab))
    if (length(rmc)>0) tab <- tab[,-rmc ]
  }
  att_tab <- attributes(tab)
  model_terms <- if (is_mira) fit1$terms else tryCatch(model$terms, error = function(e) terms(model))
  to_indent <- which(!(tab[["Variable"]] %in% attr(model_terms, "term.labels")))
  bold_cells <- cbind(which(tab[["Variable"]] %in% attr(model_terms, "term.labels")), rep(1, length(which(tab[["Variable"]] %in% attr(model_terms, "term.labels")))))
  attr(tab, "to_indent") <- to_indent
  attr(tab, "bold_cells") <- bold_cells

  # Find all estimate columns (contain "95%CI")
  est_cols <- grep("\\(95%CI\\)", names(tab), value = TRUE)

  # Temporarily remove already-formatted unadjusted p-values before
  # format_bold_pvalues, which would corrupt string values like "<0.001"
  unadj_p_col <- NULL
  if (include_unadjusted && "Unadjusted p-value" %in% names(tab)) {
    unadj_p_col <- tab[["Unadjusted p-value"]]
    tab[["Unadjusted p-value"]] <- NULL
  }

  # Format and bold p-values
  pv <- format_bold_pvalues(tab, bold_cells,
                            unformattedp = unformattedp, p.adjust = p.adjust)
  tab <- pv$tab; bold_cells <- pv$bold_cells

  # Restore unadjusted p-values in the correct position (after unadjusted estimate)
  if (!is.null(unadj_p_col)) {
    unadj_est_col <- grep("^Unadjusted.*\\(95%CI\\)", names(tab), value = TRUE)[1]
    insert_pos <- if (!is.na(unadj_est_col)) which(names(tab) == unadj_est_col) + 1 else 2
    tab <- data.frame(
      tab[, seq_len(insert_pos - 1), drop = FALSE],
      `Unadjusted p-value` = unadj_p_col,
      tab[, insert_pos:ncol(tab), drop = FALSE],
      check.names = FALSE,
      stringsAsFactors = FALSE
    )
  }

  # changing UB to Inf, LB to 0 for all estimate columns
  for (ecol in est_cols) {
    tab[[ecol]] <- sapply(tab[[ecol]], process_ci)
  }

  if (nicenames){
    attr(tab,"termnames") <- tab$Variable
    md <- try(get_model_data(if (is_mira) fit1 else model))
    if (inherits(md, "try-error")) {
      warning("Unable to extract data from model, using variable names")
    } else {
      tab$Variable <- replaceLbl(md, tab$Variable)
    }
  }

  names(tab)[1] <- covTitle
  for (a in setdiff(names(att_tab),names(attributes(tab)))) attr(tab,a) <- att_tab[[a]]
  if (tableOnly){
    if (names(tab)[1]=='') names(tab)[1] <- 'Covariate'
    attr(tab, 'to_indent') <- to_indent
    attr(tab,'bold_cells') <- bold_cells
    attr(tab,'dimchk') <- dim(tab)
    return(tab)
  }
  argL <- list(tab=tab,to_indent=to_indent,bold_cells = bold_cells,
               caption=caption, digits = digits,
               chunk_label=ifelse(missing(chunk_label),'NOLABELTOADD',chunk_label))
  if (!missing(fontsize)) argL[['fontsize']] <- fontsize
  do.call(outTable, argL)
}

get_model_args <- function(model) {
  av <- as.character(model$call)
  av_f <- av[which(grepl("~",av))]
  f <- as.formula(av_f)

  #  response variables (strip functions like Surv)
  response_vars <- all.vars(f[[2]])

  #  predictor variables (ignore interactions)
  term_labels <- attr(terms(f), "term.labels")

  predictor_vars <- unique(unlist(
    strsplit(term_labels, "[:*]")
  ))

  list(
    response = response_vars,
    predictors = predictor_vars
  )
}


combine_uv_mv <- function(tabUV, m_sum, tabMV) {

  # Detect estimate column name dynamically
  est_col_uv <- grep("\\(95%CI\\)", names(tabUV), value = TRUE)[1]
  est_col_mv <- grep("\\(95%CI\\)", names(tabMV), value = TRUE)[1]

  if (is.na(est_col_uv)) stop("Cannot find estimate column in tabUV")
  if (is.na(est_col_mv)) stop("Cannot find estimate column in tabMV")

  # Standardize column names for internal processing
  tabUV_work <- tabUV
  names(tabUV_work)[names(tabUV_work) == est_col_uv] <- "Est_CI"

  tabMV_work <- tabMV
  names(tabMV_work)[names(tabMV_work) == est_col_mv] <- "Est_CI"

  # 1. Reconstruct (var, lvl) for tabUV using its structure
  tabUV2 <- tabUV_work |>
    dplyr::mutate(
      is_header = (is.na(Est_CI) | Est_CI == "<NA>") &
                  (is.na(`p-value`) | `p-value` == "<NA>"),
      var = dplyr::if_else(is_header, Covariate, NA_character_),
      lvl = dplyr::if_else(!is_header, Covariate, NA_character_)
    ) |>
    tidyr::fill(var, .direction = "down")

  # 2. Attach (var, lvl) to MV table and merge with UV
  # Preserve original tabMV row order
  out <- tabMV_work |>
    dplyr::mutate(
      mv_order = dplyr::row_number(),
      is_header = is.na(Est_CI) | Est_CI == "<NA>",
      var = ifelse(is_header, Variable, NA_character_)
    ) |>
    tidyr::fill(var, .direction = "down") |>
    dplyr::mutate(
      lvl = ifelse(is_header, NA_character_, Variable)
    ) |>
    dplyr::left_join(
      tabUV2 |>
        dplyr::select(dplyr::any_of(c("var", "lvl", "Est_CI", "p-value"))) |>
        dplyr::rename(
          `Unadjusted Est_CI` = Est_CI,
          `Unadjusted p-value` = `p-value`
        ),
      by = c("var", "lvl")
    ) |>
    dplyr::rename(
      `Adjusted Est_CI` = Est_CI,
      `Adjusted p-value` = `p-value`
    ) |>
    dplyr::arrange(mv_order) |>
    dplyr::select(dplyr::any_of(c(
      "Variable", "var",
      "Unadjusted Est_CI", "Unadjusted p-value",
      "Adjusted Est_CI", "Adjusted p-value",
      "N", "Event", "VIF"
    )))

  # 3. Identify UV-only main effects from interactions
  mv_main_vars <- unique(out$var[!grepl(":", out$var)])
  uv_only_vars <- c()

  # Find all interaction terms in MV model and extract their components
  interaction_terms <- unique(out$var[grepl(":", out$var)])
  for (int_term in interaction_terms) {
    components <- unlist(strsplit(int_term, ":"))
    for (comp in components) {
      if (!(comp %in% mv_main_vars) && !(comp %in% uv_only_vars)) {
        uv_only_vars <- c(uv_only_vars, comp)
      }
    }
  }

  # Remove 'var' column before appending UV-only rows
  out <- out |> dplyr::select(-var)
  out_cols <- names(out)

  # 4. Append UV-only main effects at the end
  if (length(uv_only_vars) > 0) {
    uv_rows_to_add <- list()

    for (v in uv_only_vars) {
      uv_rows <- tabUV2 |> dplyr::filter(var == v)
      if (nrow(uv_rows) == 0) next

      has_header <- any(uv_rows$is_header)

      if (has_header) {
        # Categorical: add header then levels
        header_row <- uv_rows |> dplyr::filter(is_header)
        level_rows <- uv_rows |> dplyr::filter(!is_header)

        new_header <- stats::setNames(as.list(rep(NA, length(out_cols))), out_cols)
        new_header$Variable <- v
        new_header$N <- header_row$N[1]
        uv_rows_to_add[[length(uv_rows_to_add) + 1]] <- as.data.frame(new_header, check.names = FALSE, stringsAsFactors = FALSE)

        for (k in seq_len(nrow(level_rows))) {
          lv_row <- level_rows[k, ]
          new_row <- stats::setNames(as.list(rep(NA, length(out_cols))), out_cols)
          new_row$Variable <- lv_row$lvl
          new_row$`Unadjusted Est_CI` <- lv_row$Est_CI
          new_row$`Unadjusted p-value` <- lv_row$`p-value`
          new_row$N <- lv_row$N
          if ("Event" %in% out_cols && "Event" %in% names(lv_row)) new_row$Event <- lv_row$Event
          uv_rows_to_add[[length(uv_rows_to_add) + 1]] <- as.data.frame(new_row, check.names = FALSE, stringsAsFactors = FALSE)
        }
      } else {
        # Continuous: single row
        uv_row <- uv_rows[1, ]
        new_row <- stats::setNames(as.list(rep(NA, length(out_cols))), out_cols)
        new_row$Variable <- v
        new_row$`Unadjusted Est_CI` <- uv_row$Est_CI
        new_row$`Unadjusted p-value` <- uv_row$`p-value`
        new_row$N <- uv_row$N
        if ("Event" %in% out_cols && "Event" %in% names(uv_row)) new_row$Event <- uv_row$Event
        uv_rows_to_add[[length(uv_rows_to_add) + 1]] <- as.data.frame(new_row, check.names = FALSE, stringsAsFactors = FALSE)
      }
    }

    if (length(uv_rows_to_add) > 0) {
      out <- dplyr::bind_rows(out, uv_rows_to_add)
    }
  }

  # Rename columns back to original format
  names(out)[names(out) == "Unadjusted Est_CI"] <-
    paste0("Unadjusted ", est_col_mv)
  names(out)[names(out) == "Adjusted Est_CI"] <-
    paste0("Adjusted ", est_col_mv)

  out
}

Try the reportRmd package in your browser

Any scripts or data that you put into this service are public.

reportRmd documentation built on March 20, 2026, 5:06 p.m.