R/modelsummary_rms.R
In rmsMD: Output Results from 'rms' Models for Medical Journals

Documented in modelsummary_rms

#' @title Create model summary for rms models
#'
#' @description The `modelsummary_rms` function processes the output from models fitted using the `rms` package and generates a summarized dataframe of the results.
#' This summary is tailored for publication in medical journals, presenting effect estimates, confidence intervals, and p-values.
#'
#' @param modelfit The output from an rms model.
#' @param combine_ci If `TRUE`, combines the effect estimates and 95% confidence intervals into a single column. Default is `TRUE`.
#' @param round_dp_coef Specifies the number of decimal places to display for the effect estimates. Default is `3`.
#' @param round_dp_p Specifies the number of decimal places to display for P values. Default is `3`.
#' @param rcs_overallp If `TRUE`, provides an overall P value for Restricted Cubic Spline (RCS) terms, sourced from `anova(modelfit)`. Automatically selects appropriate test (LR, F or Wald)
#' @param MI_lrt If `TRUE` then overall p-values for RCS terms from models with multiple imputed data from `fit.mult.impute` will represent likelihood ratio chi-square tests from `rms::processMI()`, rather than Wald tests.
#' @param hide_rcs_coef If `TRUE`, hides the individual coefficients for Restricted Cubic Spline (RCS) variables.
#' @param exp_coef If `TRUE`, outputs the exponentiated coefficients (`exp(coef)`) as the effect estimates. Applicable only for model types other than `ols`, `lrm`, or `cph`. If `NULL`, no exponentiation is performed. Default is `NULL`.
#' @param fullmodel If `TRUE`, includes all intermediate steps in the summary, allowing users to verify and compare with standard model outputs.
#'
#' @return Returns a dataframe of results. This can easily be outputted to word using
#' packages such as flextable and officer.
#'
#' @import rms
#' @importFrom stats anova coef pnorm qnorm terms vcov
#'
#' @examples
#' # For detailed examples please see the provided vignettes
#'
#' @export
#'
#'
modelsummary_rms <- function(modelfit,
                             combine_ci = TRUE,
                             round_dp_coef = 3,
                             round_dp_p = 3,
                             rcs_overallp = TRUE,
                             hide_rcs_coef = TRUE,
                             exp_coef = NULL,
                             fullmodel = FALSE,
                             MI_lrt = FALSE) {

  # Warning if modelfit isn't an rms object and no exp_coef given to specify if coef or exp(coef) should be given
  if (!inherits(modelfit, "rms") && !"exp_coef" %in% names(match.call())) {
    stop(
      "The model fit does not belong to the 'rms' class.\nYou must specify the exp_coef argument to determine table output."
    )
  }

  # note the categorical variable ref groups won't pull if datadist not set
  if (inherits(modelfit, "rms") && is.null(getOption("datadist"))) {
    warning(
      "Please ensure data distribution was set before the rms model was fit\n",
      "using:\n  dd <- datadist(data)\n  options(datadist = 'dd')"
    )
  }

  ########## defining arguments based on model class ##########

  if (inherits(modelfit, "ols")) {
    exp_coef <- FALSE
  } else if (inherits(modelfit, "lrm")) {
    exp_coef <- TRUE
    exp_coef_name <- "OR"
  } else if (inherits(modelfit, "cph")) {
    exp_coef <- TRUE
    exp_coef_name <- "HR"
  } else if(is.null(exp_coef)){
    stop("Model not ols, lrm or cph.\nYou must specify the exp_coef argument to determine table output.")
  }

  # this checks if rcs_overallp is set to TRUE by default or if the user specifically set it to TRUE
  user_set_rcs <- "rcs_overallp" %in% names(match.call())

  if (!inherits(modelfit, "rms")) {
    rcs_overallp <- FALSE
    hide_rcs_coef <- FALSE
  } else {
    no_rcs <- all(unlist(modelfit$Design$nonlinear) == FALSE)
    if (no_rcs) {
      if (user_set_rcs && rcs_overallp) {
        warning(
          "rcs_overallp was set to TRUE by the user, but no RCS terms were found in the model fit.\nSetting rcs_overallp to FALSE."
        )
      }
      #set defaults to false if model happpens to have no RCS
      rcs_overallp <- FALSE
      hide_rcs_coef <- FALSE
    }
  }

  # Ensure rcs_overallp is TRUE if hide_rcs_coef is TRUE
  if (hide_rcs_coef && !rcs_overallp) {
    warning(
      "When hiding RCS coefficients (hide_rcs_coef = TRUE),\n",
      "the overall RCS p-value should be provided (rcs_overallp = TRUE).\n",
      "We recommend you set rcs_overallp = TRUE if hide_rcs_coef = TRUE."
    )
  }

  ########## Extract coefficients and standard errors ##########
  coef_values <- coef(modelfit)
  se_values <- sqrt(diag(vcov(modelfit)))
  # Ensure standard errors are in the same order as coefficients
  se_values <- se_values[names(coef_values)]

  ########## Handle RCS terms (if applicable) ##########
  if (rcs_overallp) {
    # Extract indices and terms associated with RCS from the model design
    nonlinear_res <- modelfit$Design$nonlinear
    int_spline_indices <- sapply(nonlinear_res, any)
    any_rcs_coef_index <- unlist(modelfit$assign[int_spline_indices])  # Indices of coefficients with spline components

    # Get the names of terms with nonlinear components
    rcs_terms_incl_interaction <- modelfit$Design$name[int_spline_indices]

    # Remove RCS coefficients if hide_rcs_coef is TRUE
    if (hide_rcs_coef) {
      coef_values <- coef_values[-any_rcs_coef_index]
      se_values <- se_values[-any_rcs_coef_index]
    }
  }

  ########## make coef and SE into a dataframe ##########

  output_df <- data.frame(
    variable = names(coef_values), # Names of the variables
    coef = coef_values, # Rounded coefficients
    SE = se_values, # Standard errors
    stringsAsFactors = FALSE # Do not treat character columns as factors
  )

  ############### p-values and CI   ###############

  # note, don't want to round here, as still have to add in anova p vals and deal with exp(coef)
  output_df$p_values_raw <- 2 * (1 - pnorm(abs(output_df$coef / output_df$SE)))
  output_df$coef_lower95 <- output_df$coef + qnorm(0.025) * output_df$SE
  output_df$coef_upper95 <- output_df$coef + qnorm(0.975) * output_df$SE

  if(exp_coef){
    for(var in c("coef", "coef_lower95", "coef_upper95")){
      output_df[[paste0("exp_",var)]] <- exp(output_df[[var]])
    }
  }

  ########## Add in anova results for RCS terms (if applicable) ##########

  if (rcs_overallp) {


    # first get the anova_result
    # deals with all combos of MI vs complete case, and ols/lrm/cph. if none of those, defaults to anova.rms default

    test_arg <- NULL

    if(!MI_lrt){
      if(!inherits(modelfit, "fit.mult.impute")){
        if(inherits(modelfit, "ols")){
          # complete case and ols
          test_arg <- "F"
        }
        if(inherits(modelfit, c("lrm", "cph"))) {
          # complete case and lrm/cph
          # attempt to give LR, but if not able then give wald and warning
          if(!is.null(modelfit$x) && !is.null(modelfit$y)){
            test_arg <- "LR"
          } else {
            test_arg <- "Chisq"
            message(
              "RCS overall p-values displayed are from Wald tests.\n",
              "To use the recommended test for this model type (LR test),\n",
              "please set 'x = TRUE, y = TRUE' when fitting the model."
            )
          }
        }
      } else {
        # for fit.mult.impute model fit objects
        # nb if MI_lrt for LR tests this is handled below in a seperate block
        if(inherits(modelfit, "ols")){
          # MI and ols
          test_arg <- "Chisq"
        }
        if(inherits(modelfit, c("lrm", "cph"))) {
          # MI and lrm/cph, but MI_lrt not TRUE
          test_arg <- "Chisq"
          message(
            "RCS overall p-values displayed are from Wald tests.\n",
            "To use LR test set `MI_lrt = TRUE` in modelsummary_rms(), and set ",
            "`lrt = TRUE` in fit.mult.impute() when fitting the model."
          )
        }

      }

      # making anova result based on test_arg
      # if no test_arg then it does the anova.rms defaults
      # note do.call etc all break with large ols models if you try define subset of variables
      if(!is.null(test_arg)){
        anova_result <- anova(modelfit, india = FALSE, indnl = FALSE, test = test_arg)
      } else {
        # use anova.rms default test as back-up
        anova_result <- anova(modelfit, india = FALSE, indnl = FALSE)
      }


    } else {
      # uses processMI for when MI_lrt is TRUE

      if (!inherits(modelfit, c("cph","lrm"))) {
        stop("MI_lrt = TRUE is currently available for lrm() and cph() `rms` models only.")
      }

      if (!inherits(modelfit, "fit.mult.impute")) {
        stop(
          "MI_lrt = TRUE was set, but the model object is not a fit.mult.impute() object.\n",
          "MI_lrt is only applicable to fit.mult.impute() objects."
        )
      }

      if(modelfit$fmimethod == "ordinary"){
        stop("MI_lrt = TRUE was set, but when fitting the model with fit.mult.impute(), \n`lrt = TRUE` was not used.")
      }

      anova_result <- processMI(modelfit, "anova")

      # just for labelling later
      test_arg <- "LR"

    }

    anova_rows <- rownames(anova_result)

    rows_keep <- vapply(anova_rows, function(row) {
      any(vapply(rcs_terms_incl_interaction, function(term) {
        grepl(paste0("^", term, "\\b"), row)
      }, logical(1)))
    }, logical(1))

    filtered_anova_result <- anova_result[rows_keep, , drop = FALSE]

    # Extract the relevant rows and p-values for RCS terms
    anova_df <- data.frame(
      variable = paste0("RCSoverallP: ", rownames(filtered_anova_result)),
      p_values_raw = filtered_anova_result[, "P"],
      stringsAsFactors = FALSE
    )

    # Add missing columns to match `output_df` structure
    output_columns <- colnames(output_df)
    for (col in output_columns) {
      if (!col %in% colnames(anova_df)) {
        anova_df[[col]] <- NA
      }
    }

    # Merge the filtered ANOVA results with the main output
    output_df <- rbind(output_df, anova_df[, output_columns])

  }


  ########## Format p-values ##########
  threshold <- 10^(-round_dp_p)
  spr_text_p <- paste("%.", round_dp_p, "f", sep = "")
  output_df$Pvalue <- ifelse(output_df$p_values_raw < threshold,
                             paste0("<", format(threshold, scientific = FALSE)),
                             sprintf(spr_text_p, output_df$p_values_raw))

  ############# make list of effect estimate columns to output ###########
  # key_vars are those that will be formatted with round_dp_coef and
  # combined with CI

  if(exp_coef){
    key_vars <- c("exp_coef","exp_coef_lower95","exp_coef_upper95")
  } else {
    key_vars <- c("coef","coef_lower95","coef_upper95")
  }


  ########## format effect estimates and confidence intervals ############
  # dynamic based on key vars

  spr_text_coef <- paste0("%.", round_dp_coef, "f")

  for(var in key_vars){
    output_df[[var]] <- ifelse(is.na(output_df[[var]]),
                               NA,
                               sprintf(spr_text_coef, output_df[[var]]))
  }

  ##################### make a combined CI column ###############################
  # dynamic based on key_vars

  if(combine_ci){
    effect_est <- key_vars[1]
    lower95 <- key_vars[2]
    upper95 <- key_vars[3]

    column_name <- paste0(effect_est,"_95CI")

    if(rcs_overallp){
      rcs_test <- if(test_arg == "LR"){
        "LR test"
      } else if(test_arg == "Chisq"){
        "Wald test"
      } else if(test_arg == "F"){
        "F test"
      } else{
        "RCS terms"
      }
    }

    output_df[[column_name]] <- ifelse(is.na(output_df[[effect_est]]),
                                       rcs_test,
                                       paste(output_df[[effect_est]],
                                             " (", output_df[[lower95]],
                                             " to ", output_df[[upper95]], ")", sep = ""))
  }

  ########## adding ref levels for categorical variables ##########

  # nb relies on model structure, so only for main model types
  # nb the combined column and p values already done and formatted
  if (inherits(modelfit, "rms") && any(class(modelfit) %in% c("ols", "lrm", "cph"))){

    # indexes and variables that are categorical
    index_categorical <- modelfit$Design$assume == "category"
    vars_cat <- modelfit$Design$name[index_categorical]

    # getting the reference levels and adding them in
    limits <- modelfit$Design$limits

    for (i in seq_along(vars_cat)) {
      cat_var <- vars_cat[[i]]
      ref_group <- levels(limits[[cat_var]])[1]

      # find first index
      row_index <- which(grepl(paste0("^", cat_var, "="), output_df$variable))[1]

      if (!is.na(row_index)) {
        ref_row <- output_df[1, ]
        ref_row[] <- "Ref"
        ref_row$variable <- paste0(cat_var, "=", ref_group)
        ref_row$Pvalue <- "-"
        output_df <- rbind(
          output_df[seq_len(row_index - 1), ],
          ref_row,
          output_df[seq(row_index, nrow(output_df)), ]
        )
      }

    }


  }

  ########## Return final output ##########
  # helper function to format the final output
  final_output <- rmsMD_format_final_output(output_df, fullmodel,
                                            combine_ci, exp_coef, key_vars)

  if(exp_coef & exists("exp_coef_name")){
    colnames(final_output) <- gsub("exp_coef", exp_coef_name, colnames(final_output))
  }

  # ensure no rownames
  rownames(final_output) <- NULL

  # Return the formatted output
  return(final_output)
}