R/fit.R

Defines functions print.fit_result fit

Documented in fit print.fit_result

#' Fit Regression Model with Publication-Ready Output
#'
#' Provides a unified interface for fitting various types of regression models 
#' with automatic formatting of results for publication. Supports generalized 
#' linear models, linear models, survival models, and mixed-effects models with 
#' consistent syntax and output formatting. Handles both univariable and 
#' multivariable models automatically.
#' 
#' @param data Data frame or data.table containing the analysis dataset. 
#'   Required for formula-based workflow; optional for model-based workflow
#'   (extracted from model if not provided).
#'   
#' @param outcome Character string specifying the outcome variable name. For 
#'   survival analysis, use \code{Surv()} syntax from the \pkg{survival} package 
#'   (\emph{e.g.,} \code{"Surv(time, status)"} or \code{"Surv(os_months, os_status)"}).
#'   Required for formula-based workflow; ignored if \code{model} is provided.
#'   
#' @param predictors Character vector of predictor variable names to include in 
#'   the model. All predictors are included simultaneously (multivariable model). 
#'   For univariable models, provide a single predictor. Can include continuous, 
#'   categorical (factor), or binary variables. Required for formula-based
#'   workflow; ignored if \code{model} is provided.
#'
#' @param model Optional pre-fitted model object to format. When provided,
#'   \code{outcome} and \code{predictors} are ignored and the model is formatted
#'   directly. Supported model classes include:
#'   \itemize{
#'     \item \code{glm} - Generalized linear models
#'     \item \code{negbin} - Negative binomial models from \code{MASS::glm.nb()}
#'     \item \code{lm} - Linear models
#'     \item \code{coxph} - Cox proportional hazards models
#'     \item \code{lmerMod}, \code{glmerMod} - Mixed-effects models from \pkg{lme4}
#'     \item \code{coxme} - Mixed-effects Cox models
#'   }
#'   
#' @param model_type Character string specifying the type of regression model. 
#'   Ignored if \code{model} is provided. Options include:
#'   \itemize{
#'     \item \code{"glm"} - Generalized linear model (default). Supports multiple 
#'       distributions via the \code{family} parameter including logistic, Poisson, 
#'       Gamma, Gaussian, and quasi-likelihood models.
#'     \item \code{"lm"} - Linear regression for continuous outcomes with normally 
#'       distributed errors. Equivalent to \code{glm} with \code{family = "gaussian"}.
#'     \item \code{"coxph"} - Cox proportional hazards model for time-to-event 
#'       survival analysis. Requires \code{Surv()} outcome syntax.
#'     \item \code{"clogit"} - Conditional logistic regression for matched 
#'       case-control studies or stratified analyses.
#'     \item \code{"negbin"} - Negative binomial regression for overdispersed count 
#'       data (requires \pkg{MASS} package). Estimates an additional dispersion parameter 
#'       compared to Poisson regression.
#'     \item \code{"lmer"} - Linear mixed-effects model for hierarchical or clustered 
#'       data with continuous outcomes (requires \pkg{lme4} package).
#'     \item \code{"glmer"} - Generalized linear mixed-effects model for hierarchical 
#'       or clustered data with non-normal outcomes (requires \pkg{lme4} package).
#'     \item \code{"coxme"} - Cox mixed-effects model for clustered survival data 
#'       (requires coxme package).
#'   }
#'   
#' @param family For GLM and GLMER models, specifies the error distribution and link 
#'   function. Can be a character string, a family function, or a family object.
#'   Ignored for non-GLM/GLMER models.
#'   
#'   \strong{Binary/Binomial outcomes:}
#'   \itemize{
#'     \item \code{"binomial"} or \code{binomial()} - Logistic regression for binary 
#'       outcomes (0/1, TRUE/FALSE). Returns odds ratios (OR). Default.
#'     \item \code{"quasibinomial"} or \code{quasibinomial()} - Logistic regression 
#'       with overdispersion. Use when residual deviance >> residual df.
#'     \item \code{binomial(link = "probit")} - Probit regression (normal CDF link).
#'     \item \code{binomial(link = "cloglog")} - Complementary log-log link for 
#'       asymmetric binary outcomes.
#'   }
#'   
#'   \strong{Count outcomes:}
#'   \itemize{
#'     \item \code{"poisson"} or \code{poisson()} - Poisson regression for count 
#'       data. Returns rate ratios (RR). Assumes mean = variance.
#'     \item \code{"quasipoisson"} or \code{quasipoisson()} - Poisson regression 
#'       with overdispersion. Use when variance > mean.
#'   }
#'   
#'   \strong{Continuous outcomes:}
#'   \itemize{
#'     \item \code{"gaussian"} or \code{gaussian()} - Normal/Gaussian distribution 
#'       for continuous outcomes. Equivalent to linear regression.
#'     \item \code{gaussian(link = "log")} - Log-linear model for positive continuous 
#'       outcomes. Returns multiplicative effects.
#'     \item \code{gaussian(link = "inverse")} - Inverse link for specific applications.
#'   }
#'   
#'   \strong{Positive continuous outcomes:}
#'   \itemize{
#'     \item \code{"Gamma"} or \code{Gamma()} - Gamma distribution for positive, 
#'       right-skewed continuous data (\emph{e.g.,} costs, lengths of stay). When
#'       passed as a string, resolves to log link for interpretable multiplicative
#'       effects. The canonical inverse link can be specified explicitly with
#'       \code{Gamma(link = "inverse")}.
#'     \item \code{Gamma(link = "log")} - Gamma with log link (equivalent to
#'       \code{"Gamma"} string shorthand).
#'     \item \code{Gamma(link = "inverse")} - Gamma with inverse (canonical) link.
#'     \item \code{Gamma(link = "identity")} - Gamma with identity link for additive 
#'       effects on positive outcomes.
#'     \item \code{"inverse.gaussian"} or \code{inverse.gaussian()} - Inverse Gaussian 
#'       for positive, highly right-skewed data.
#'   }
#'   
#'   For negative binomial regression (overdispersed counts), use 
#'   \code{model_type = "negbin"} instead of the \code{family} parameter.
#'   
#'   See \code{\link[stats]{family}} for additional details and options.
#'
#' @param random Character string specifying the random-effects formula for 
#'   mixed-effects models (\code{glmer}, \code{lmer}, \code{coxme}). Use standard
#'   \pkg{lme4}/\pkg{coxme} syntax, \emph{e.g.,} \code{"(1|site)"} for random
#'   intercepts by site, \code{"(1|site/patient)"} for nested random effects, or 
#'   \code{"(1|site) + (1|doctor)"} for crossed random effects. Required when 
#'   \code{model_type} is a mixed-effects model type. Alternatively, random 
#'   effects can be included directly in the \code{predictors} vector using the
#'   same syntax. Default is \code{NULL}.
#'   
#' @param interactions Character vector of interaction terms using colon 
#'   notation (\emph{e.g.,} \code{c("age:sex", "treatment:stage")}). Interaction terms 
#'   are added to the model in addition to main effects. Default is \code{NULL} 
#'   (no interactions).
#'   
#' @param strata For Cox or conditional logistic models, character string naming 
#'   the stratification variable. Creates separate baseline hazards for each 
#'   stratum level without estimating stratum effects. Default is \code{NULL}.
#'   
#' @param cluster For Cox models, character string naming the variable for 
#'   robust clustered standard errors. Accounts for within-cluster correlation 
#'   (\emph{e.g.,} patients within hospitals). Default is \code{NULL}.
#'   
#' @param weights Character string naming the weights variable in \code{data}. 
#'   The specified column should contain numeric values for observation weights. 
#'   Used for weighted regression, survey data, or inverse probability weighting. 
#'   Default is \code{NULL}.
#'   
#' @param conf_level Numeric confidence level for confidence intervals. Must be 
#'   between 0 and 1. Default is 0.95 (95\% confidence intervals).
#'   
#' @param reference_rows Logical. If \code{TRUE}, adds rows for reference 
#'   categories of factor variables with baseline values (OR/HR/RR = 1, 
#'   coefficient = 0). Default is \code{TRUE}.
#'   
#' @param show_n Logical. If \code{TRUE}, includes the sample size column in 
#'   the output. Default is \code{TRUE}.
#'   
#' @param show_events Logical. If \code{TRUE}, includes the events column in 
#'   the output (for survival and logistic regression). Default is \code{TRUE}.
#'   
#' @param digits Integer specifying the number of decimal places for effect 
#'   estimates (OR, HR, RR, coefficients). Default is 2.
#'   
#' @param p_digits Integer specifying the number of decimal places for
#'   \emph{p}-values. Values smaller than \code{10^(-p_digits)} are displayed
#'   as \code{"< 0.001"} (for \code{p_digits = 3}), \code{"< 0.0001"} (for
#'   \code{p_digits = 4}), etc. Default is 3.
#'
#' @param labels Named character vector or list providing custom display 
#'   labels for variables. Names should match variable names, values are display 
#'   labels. Default is \code{NULL}.
#'   
#' @param keep_qc_stats Logical. If \code{TRUE}, includes model quality statistics 
#'   (AIC, BIC, R-squared, concordance, \emph{etc.}) in the raw data attribute for 
#'   model diagnostics and comparison. Default is \code{TRUE}.
#'   
#' @param exponentiate Logical. Whether to exponentiate coefficients. Default 
#'   is \code{NULL}, which automatically exponentiates for logistic, Poisson, 
#'   and Cox models, and displays raw coefficients for linear models. Set to 
#'   \code{TRUE} to force exponentiation or \code{FALSE} to force coefficients.
#'
#' @param conf_method Character string controlling the confidence interval method.
#'   If \code{NULL} (default), uses \code{getOption("summata.conf_method", "profile")}.
#'   \itemize{
#'     \item \code{"profile"} - Profile likelihood intervals for GLM and negative
#'       binomial models (via \code{MASS::confint.glm()}), exact \emph{t}-distribution
#'       intervals for linear models. Falls back to Wald on profiling failure.
#'       Quasi-likelihood families always use Wald (no true likelihood).
#'     \item \code{"wald"} - Wald intervals (coefficient \eqn{\pm} \emph{z}
#'       \eqn{\times} SE) for all model types. Faster but less accurate near
#'       boundary conditions or with small subgroups.
#'   }
#'   Cox and mixed-effects models use Wald intervals regardless of this setting.
#'
#' @param number_format Character string or two-element character vector
#'   controlling thousand and decimal separators in formatted output. Named
#'   presets:
#'   \itemize{
#'     \item \code{"us"} - Comma thousands, period decimal: \code{1,234.56} [default]
#'     \item \code{"eu"} - Period thousands, comma decimal: \code{1.234,56}
#'     \item \code{"space"} - Thin-space thousands, period decimal: \code{1 234.56}
#'       (SI/ISO 31-0)
#'     \item \code{"none"} - No thousands separator: \code{1234.56}
#'   }
#'   Or provide a custom two-element vector \code{c(big.mark, decimal.mark)},
#'   \emph{e.g.}, \code{c("'", ".")} for Swiss-style: \verb{1'234.56}.
#'
#'   When \code{NULL} (default), uses
#'   \code{getOption("summata.number_format", "us")}. Set the global option
#'   once per session to avoid passing this argument repeatedly:
#'   \preformatted{
#'     options(summata.number_format = "eu")
#'   }
#'
#' @param verbose Logical. If \code{TRUE}, displays model fitting warnings
#'   (\emph{e.g.,} singular fit, convergence issues). If \code{FALSE} (default),
#'   routine fitting messages are suppressed while unexpected warnings are
#'   preserved. When \code{NULL}, uses
#'   \code{getOption("summata.verbose", FALSE)}.
#'
#' @param ... Additional arguments passed to the underlying model fitting 
#'   function (\code{\link[stats]{glm}}, \code{\link[stats]{lm}}, 
#'   \code{\link[survival]{coxph}}, \code{\link[lme4]{glmer}}, \emph{etc.}). Common 
#'   options include \code{subset}, \code{na.action}, and model-specific control 
#'   parameters.
#'
#' @return A data.table with S3 class \code{"fit_result"} containing formatted 
#'   regression results. The table structure includes:
#'   \describe{
#'     \item{Variable}{Character. Predictor name or custom label}
#'     \item{Group}{Character. For factor variables: category level. For 
#'       interactions: interaction term. For continuous: typically empty}
#'     \item{n}{Integer. Total sample size (if \code{show_n = TRUE})}
#'     \item{n_group}{Integer. Sample size for this factor level}
#'     \item{events}{Integer. Total number of events (if \code{show_events = TRUE})}
#'     \item{events_group}{Integer. Events for this factor level}
#'     \item{OR/HR/RR/Coefficient or aOR/aHR/aRR/Adj. Coefficient (95\% CI)}{Character. 
#'       Formatted effect estimate with confidence interval. Column name depends on 
#'       model type and scope. Univariable models use: OR, HR, RR, Coefficient.
#'       Multivariable models use adjusted notation: aOR, aHR, aRR, Adj. Coefficient}
#'     \item{\emph{p}-value}{Character. Formatted \emph{p}-value from Wald test}
#'   }
#'   
#'   The returned object includes the following attributes accessible via \code{attr()}:
#'   \describe{
#'     \item{model}{The fitted model object (glm, lm, coxph, \emph{etc.}). Access for 
#'       diagnostics, predictions, or further analysis}
#'     \item{raw_data}{data.table. Unformatted numeric results with columns for 
#'       coefficients, standard errors, confidence bounds, quality statistics, \emph{etc.}}
#'     \item{outcome}{Character. The outcome variable name}
#'     \item{predictors}{Character vector. The predictor variable names}
#'     \item{formula_str}{Character. The complete model formula as a string}
#'     \item{model_scope}{Character. "Univariable" (one predictor) or 
#'       "Multivariable" (multiple predictors)}
#'     \item{model_type}{Character. The regression model type used}
#'     \item{interactions}{Character vector (if interactions specified). 
#'       The interaction terms included}
#'     \item{strata}{Character (if stratification used). The stratification variable}
#'     \item{cluster}{Character (if clustering used). The cluster variable}
#'     \item{weights}{Character (if weighting used). The weights variable}
#'     \item{significant}{Character vector. Names of predictors with \emph{p}-value
#'       below 0.05, suitable for downstream variable selection workflows}
#'   }
#'
#' @details
#' \strong{Model Scope Detection:}
#' 
#' The function automatically detects whether the model is:
#' \itemize{
#'   \item \strong{Univariable}: Single predictor (\emph{e.g.,} \code{predictors = "age"}). 
#'       Effect estimates are labeled as unadjusted ("OR", "HR", \emph{etc.}), representing
#'       crude (unadjusted) association
#'   \item \strong{Multivariable}: Multiple predictors (\emph{e.g.,} 
#'     \code{predictors = c("age", "sex", "treatment")}) 
#'       Effect estimates are labeled as adjusted ("aOR", "aHR", \emph{etc.}), representing
#'       associations adjusted for confounding
#' }
#' 
#' \strong{Interaction Terms:}
#' 
#' Interactions are specified using colon notation and added to the model:
#' \itemize{
#'   \item \code{interactions = c("age:treatment")} creates interaction 
#'     between age and treatment
#'   \item Main effects for both variables are automatically included
#'   \item Multiple interactions can be specified: 
#'     \code{c("age:sex", "treatment:stage")}
#'   \item For interactions between categorical variables, separate terms are 
#'     created for each combination of levels
#' }
#' 
#' \strong{Stratification (Cox/Conditional Logistic):}
#' 
#' The \code{strata} parameter creates separate baseline hazards:
#' \itemize{
#'   \item Allows baseline hazard to vary across strata without estimating 
#'     stratum effects
#'   \item Useful when proportional hazards assumption violated across strata
#'   \item Example: \code{strata = "center"} for multicenter studies
#'   \item Stratification variable is not included as a predictor
#' }
#' 
#' \strong{Clustering (Cox Models):}
#' 
#' The \code{cluster} parameter computes robust standard errors:
#' \itemize{
#'   \item Accounts for within-cluster correlation (\emph{e.g.,} multiple observations 
#'     per patient)
#'   \item Uses sandwich variance estimator
#'   \item Does not change point estimates, only standard errors and \emph{p}-values
#' }
#' 
#' \strong{Weighting:}
#' 
#' The \code{weights} parameter enables weighted regression:
#' \itemize{
#'   \item For survey data with sampling weights
#'   \item Inverse probability weighting for causal inference
#'   \item Frequency weights for aggregated data
#'   \item Weights should be in a column of \code{data}
#' }
#' 
#' \strong{Mixed-Effects Models (lmer/glmer/coxme):}
#' 
#' Mixed effects models handle hierarchical or clustered data:
#' \itemize{
#'   \item Use \code{model_type = "lmer"} for continuous/normal outcomes
#'   \item Use \code{model_type = "glmer"} with appropriate \code{family} for GLM outcomes
#'   \item Use \code{model_type = "coxme"} for survival outcomes with clustering
#'   \item Random effects are specified in predictors using lme4 syntax:
#'     \itemize{
#'       \item \code{"(1|site)"} - Random intercepts by site
#'       \item \code{"(treatment|site)"} - Random slopes for treatment by site
#'       \item \code{"(1 + treatment|site)"} - Both random intercepts and slopes
#'     }
#'   \item Include random effects as part of the predictors vector
#'   \item Example: \code{predictors = c("age", "treatment", "(1|site)")}
#' }
#' 
#' \strong{Effect Measures by Model Type:}
#' \itemize{
#'   \item \strong{Logistic} (\code{family = "binomial"/"quasibinomial"}): Odds ratios (OR/aOR)
#'   \item \strong{Cox} (\code{model_type = "coxph"}): Hazard ratios (HR/aHR)
#'   \item \strong{Poisson/Count} (\code{family = "poisson"/"quasipoisson"}): Rate ratios (RR/aRR)
#'   \item \strong{Negative binomial} (\code{model_type = "negbin"}): Rate ratios (RR/aRR)
#'   \item \strong{Gamma/Log-link}: Ratios (multiplicative effects)
#'   \item \strong{Linear/Gaussian}: Raw coefficient estimates (additive effects)
#' }
#' 
#' \strong{Confidence Intervals:}
#' 
#' Confidence interval computation is tailored to each model class using the
#' best available method:
#' \itemize{
#'   \item \strong{GLM and negative binomial}: Profile likelihood intervals via
#'     \code{MASS::confint.glm()}, which invert the profile deviance and account
#'     for asymmetry in the likelihood surface. More accurate than the Wald
#'     approximation when subgroup sizes are small or estimates are near boundary
#'     values. Quasi-likelihood families (\code{quasibinomial}, \code{quasipoisson})
#'     fall back to Wald intervals because they lack a true likelihood function.
#'   \item \strong{Linear models}: Exact \emph{t}-distribution intervals via
#'     \code{confint.lm()}, based on the known sampling distribution under
#'     normality.
#'   \item \strong{Cox proportional hazards}: Wald intervals (\emph{i.e.,}
#'     coefficient \eqn{\pm} \emph{z} \eqn{\times} SE), the standard approach in
#'     the survival analysis literature.
#'   \item \strong{Mixed-effects models} (lmer, glmer, coxme): Wald intervals.
#'     Profile likelihood is available for \pkg{lme4} models via
#'     \code{confint(model, method = "profile")} but can be prohibitively slow
#'     for complex random-effects structures and is not used by default.
#' }
#' 
#' If profile likelihood computation fails for any reason (\emph{e.g.,}
#' non-convergence during profiling), the function falls back silently to Wald
#' intervals.
#'
#' @seealso 
#' \code{\link{uniscreen}} for univariable screening of multiple predictors,
#' \code{\link{fullfit}} for complete univariable-to-multivariable workflow,
#' \code{\link{compfit}} for comparing multiple models,
#' \code{\link{m2dt}} for model-to-table conversion
#'
#' @examples
#' # Load example data
#' data(clintrial)
#' data(clintrial_labels)
#' library(survival)
#' 
#' # Example 1: Univariable logistic regression
#' uni_model <- fit(
#'     data = clintrial,
#'     outcome = "os_status",
#'     predictors = "age"
#' )
#' print(uni_model)
#' # Labeled as "Univariable OR"
#'
#' \donttest{
#' 
#' # Example 2: Multivariable logistic regression
#' multi_model <- fit(
#'     data = clintrial,
#'     outcome = "os_status",
#'     predictors = c("age", "sex", "bmi", "treatment"),
#'     labels = clintrial_labels
#' )
#' print(multi_model)
#' 
#' # Example 3: Cox proportional hazards model
#' cox_model <- fit(
#'     data = clintrial,
#'     outcome = "Surv(os_months, os_status)",
#'     predictors = c("age", "sex", "treatment", "stage"),
#'     model_type = "coxph",
#'     labels = clintrial_labels
#' )
#' print(cox_model)
#' 
#' # Example 4: Model with interaction terms
#' interact_model <- fit(
#'     data = clintrial,
#'     outcome = "os_status",
#'     predictors = c("age", "treatment", "sex"),
#'     interactions = c("age:treatment"),
#'     labels = clintrial_labels
#' )
#' print(interact_model)
#' 
#' # Example 5: Cox model with stratification
#' strat_model <- fit(
#'     data = clintrial,
#'     outcome = "Surv(os_months, os_status)",
#'     predictors = c("age", "sex", "treatment"),
#'     model_type = "coxph",
#'     strata = "site",  # Separate baseline hazards by site
#'     labels = clintrial_labels
#' )
#' print(strat_model)
#' 
#' # Example 6: Cox model with clustering
#' cluster_model <- fit(
#'     data = clintrial,
#'     outcome = "Surv(os_months, os_status)",
#'     predictors = c("age", "treatment"),
#'     model_type = "coxph",
#'     cluster = "site",  # Robust SEs accounting for site clustering
#'     labels = clintrial_labels
#' )
#' print(cluster_model)
#' 
#' # Example 7: Linear regression
#' linear_model <- fit(
#'     data = clintrial,
#'     outcome = "bmi",
#'     predictors = c("age", "sex", "smoking"),
#'     model_type = "lm",
#'     labels = clintrial_labels
#' )
#' print(linear_model)
#' 
#' # Example 8: Poisson regression for equidispersed count data
#' # fu_count has variance ~= mean, appropriate for standard Poisson
#' poisson_model <- fit(
#'     data = clintrial,
#'     outcome = "fu_count",
#'     predictors = c("age", "stage", "treatment", "surgery"),
#'     model_type = "glm",
#'     family = "poisson",
#'     labels = clintrial_labels
#' )
#' print(poisson_model)
#' # Returns rate ratios (RR/aRR)
#' 
#' # Example 9: Negative binomial regression for overdispersed counts
#' # ae_count has variance > mean (overdispersed), use negbin or quasipoisson
#' if (requireNamespace("MASS", quietly = TRUE)) {
#'     nb_result <- fit(
#'         data = clintrial,
#'         outcome = "ae_count",
#'         predictors = c("age", "treatment", "diabetes", "surgery"),
#'         model_type = "negbin",
#'         labels = clintrial_labels
#'     )
#'     print(nb_result)
#' }
#' 
#' # Example 10: Gamma regression for positive continuous outcomes
#' gamma_model <- fit(
#'     data = clintrial,
#'     outcome = "los_days",
#'     predictors = c("age", "treatment", "surgery"),
#'     model_type = "glm",
#'     family = Gamma(link = "log"),
#'     labels = clintrial_labels
#' )
#' print(gamma_model)
#' 
#' # Example 11: Access the underlying fitted model
#' result <- fit(
#'     data = clintrial,
#'     outcome = "os_status",
#'     predictors = c("age", "sex", "bmi")
#' )
#' 
#' # Get the model object
#' model_obj <- attr(result, "model")
#' summary(model_obj)
#' 
#' # Model diagnostics
#' plot(model_obj)
#' 
#' # Predictions
#' preds <- predict(model_obj, type = "response")
#' 
#' # Example 12: Access raw numeric data
#' raw_data <- attr(result, "raw_data")
#' print(raw_data)
#' # Contains unformatted coefficients, SEs, CIs, AIC, BIC, etc.
#' 
#' # Example 13: Multiple interactions
#' complex_model <- fit(
#'     data = clintrial,
#'     outcome = "os_status",
#'     predictors = c("age", "sex", "treatment", "bmi"),
#'     interactions = c("age:treatment", "sex:bmi"),
#'     labels = clintrial_labels
#' )
#' print(complex_model)
#' 
#' # Example 14: Customize output columns
#' minimal <- fit(
#'     data = clintrial,
#'     outcome = "os_status",
#'     predictors = c("age", "sex", "treatment"),
#'     show_n = FALSE,
#'     show_events = FALSE,
#'     reference_rows = FALSE
#' )
#' print(minimal)
#' 
#' # Example 15: Different confidence levels
#' ci90 <- fit(
#'     data = clintrial,
#'     outcome = "os_status",
#'     predictors = c("age", "treatment"),
#'     conf_level = 0.90  # 90% confidence intervals
#' )
#' print(ci90)
#' 
#' # Example 16: Force coefficient display instead of OR
#' coef_model <- fit(
#'     data = clintrial,
#'     outcome = "os_status",
#'     predictors = c("age", "bmi"),
#'     exponentiate = FALSE  # Show log odds instead of OR
#' )
#' print(coef_model)
#' 
#' # Example 17: Confidence interval method
#' # Default: profile likelihood CIs for GLM (more accurate)
#' profile_result <- fit(
#'     data = clintrial,
#'     outcome = "os_status",
#'     predictors = c("age", "treatment"),
#'     p_digits = 4,
#'     conf_method = "profile"
#' )
#' print(profile_result)
#'
#' # Wald CIs (faster, suitable for simulation or exploratory work)
#' wald_result <- fit(
#'     data = clintrial,
#'     outcome = "os_status",
#'     predictors = c("age", "treatment"),
#'     p_digits = 4,
#'     conf_method = "wald"
#' )
#' print(wald_result)
#' 
#' # Example 18: Check model quality statistics
#' result <- fit(
#'     data = clintrial,
#'     outcome = "os_status",
#'     predictors = c("age", "sex", "treatment", "stage"),
#'     keep_qc_stats = TRUE
#' )
#' 
#' raw <- attr(result, "raw_data")
#' cat("AIC:", raw$AIC[1], "\n")
#' cat("BIC:", raw$BIC[1], "\n")
#' cat("C-statistic:", raw$c_statistic[1], "\n")
#' 
#' # Example 19: Interaction effects - treatment effect modified by stage
#' interaction_model <- fit(
#'     data = clintrial,
#'     outcome = "Surv(os_months, os_status)",
#'     predictors = c("age", "treatment", "stage"),
#'     interactions = c("treatment:stage"),
#'     model_type = "coxph",
#'     labels = clintrial_labels
#' )
#' print(interaction_model)
#' # Shows main effects plus all treatment×stage interaction terms
#' 
#' # Example 20: Multiple interactions in logistic regression
#' multi_interaction <- fit(
#'     data = clintrial,
#'     outcome = "readmission_30d",
#'     predictors = c("age", "sex", "surgery", "diabetes"),
#'     interactions = c("surgery:diabetes", "age:sex"),
#'     labels = clintrial_labels
#' )
#' print(multi_interaction)
#' 
#' # Example 21: Quasipoisson for overdispersed count data
#' # Alternative to negative binomial when MASS not available
#' quasi_model <- fit(
#'     data = clintrial,
#'     outcome = "ae_count",
#'     predictors = c("age", "treatment", "diabetes", "surgery"),
#'     model_type = "glm",
#'     family = "quasipoisson",
#'     labels = clintrial_labels
#' )
#' print(quasi_model)
#' # Adjusts standard errors for overdispersion
#' 
#' # Example 22: Quasibinomial for overdispersed binary data
#' quasi_logistic <- fit(
#'     data = clintrial,
#'     outcome = "any_complication",
#'     predictors = c("age", "bmi", "diabetes", "surgery"),
#'     model_type = "glm",
#'     family = "quasibinomial",
#'     labels = clintrial_labels
#' )
#' print(quasi_logistic)
#' 
#' # Example 23: Gamma regression with identity link for additive effects
#' gamma_identity <- fit(
#'     data = clintrial,
#'     outcome = "los_days",
#'     predictors = c("age", "treatment", "surgery", "any_complication"),
#'     model_type = "glm",
#'     family = Gamma(link = "identity"),
#'     labels = clintrial_labels
#' )
#' print(gamma_identity)
#' # Shows additive effects (coefficients) instead of multiplicative (ratios)
#' 
#' # Example 24: Inverse Gaussian regression for highly skewed data
#' inverse_gaussian <- fit(
#'     data = clintrial,
#'     outcome = "recovery_days",
#'     predictors = c("age", "surgery", "pain_score"),
#'     model_type = "glm",
#'     family = inverse.gaussian(link = "log"),
#'     labels = clintrial_labels
#' )
#' print(inverse_gaussian)
#' 
#' # Example 25: Linear mixed effects with random intercepts
#' # Accounts for clustering of patients within sites
#' if (requireNamespace("lme4", quietly = TRUE)) {
#'     lmer_model <- fit(
#'         data = clintrial,
#'         outcome = "los_days",
#'         predictors = c("age", "treatment", "stage", "(1|site)"),
#'         model_type = "lmer",
#'         labels = clintrial_labels
#'     )
#'     print(lmer_model)
#' }
#' 
#' # Example 26: Generalized linear mixed effects (logistic with random effects)
#' if (requireNamespace("lme4", quietly = TRUE)) {
#'     glmer_model <- fit(
#'         data = clintrial,
#'         outcome = "readmission_30d",
#'         predictors = c("age", "surgery", "los_days", "(1|site)"),
#'         model_type = "glmer",
#'         family = "binomial",
#'         labels = clintrial_labels
#'     )
#'     print(glmer_model)
#' }
#' 
#' # Example 27: Cox mixed effects for clustered survival data
#' if (requireNamespace("coxme", quietly = TRUE)) {
#'     coxme_model <- fit(
#'         data = clintrial,
#'         outcome = "Surv(os_months, os_status)",
#'         predictors = c("age", "treatment", "stage", "(1|site)"),
#'         model_type = "coxme",
#'         labels = clintrial_labels
#'     )
#'     print(coxme_model)
#' }
#' 
#' # Example 28: Random slopes - treatment effect varies by site
#' if (requireNamespace("lme4", quietly = TRUE)) {
#'     random_slopes <- fit(
#'         data = clintrial,
#'         outcome = "los_days",
#'         predictors = c("age", "treatment", "stage", "(treatment|site)"),
#'         model_type = "lmer",
#'         labels = clintrial_labels
#'     )
#'     print(random_slopes)
#' }
#' 
#' # Example 29: Format a pre-fitted model (model-based workflow)
#' # Useful for models fitted outside of fit()
#' pre_fitted <- glm(os_status ~ age + sex + treatment,
#'                   family = binomial, data = clintrial)
#' result <- fit(model = pre_fitted, 
#'               data = clintrial,
#'               labels = clintrial_labels)
#' print(result)
#'
#' }
#' 
#' @family regression functions
#' @export
fit <- function(data = NULL, 
                outcome = NULL,
                predictors = NULL,
                model = NULL,
                model_type = "glm",
                family = "binomial",
                random = NULL,
                interactions = NULL,
                strata = NULL,
                cluster = NULL,
                weights = NULL,
                conf_level = 0.95,
                reference_rows = TRUE,
                show_n = TRUE,
                show_events = TRUE,
                digits = 2,
                p_digits = 3,
                labels = NULL,
                keep_qc_stats = TRUE,
                exponentiate = NULL,
                conf_method = NULL,
                number_format = NULL,
                verbose = NULL,
                ...) {
    
    ## Check if we're using model-based workflow
    
    ## Resolve verbose setting
    verbose <- if (is.null(verbose)) getOption("summata.verbose", FALSE) else verbose
    
    ## Resolve number formatting marks
    validate_number_format(number_format)
    marks <- resolve_number_marks(number_format)
    
    if (!is.null(model)) {
        ## Model-based workflow: format existing model
        
        ## Get data from model if not provided
        if (is.null(data)) {
            data <- get_model_data(model)
            if (is.null(data)) {
                stop("Cannot extract data from model. Please provide 'data' parameter.")
            }
        }
        
        ## Convert to data.table
        if (!data.table::is.data.table(data)) {
            data <- data.table::as.data.table(data)
        }
        
        ## Store data in model for m2dt
        if (isS4(model)) {
            attr(model, "data") <- data
        } else {
            model$data <- data
            attr(model, "data") <- data
        }
        
        ## Convert to readable format using m2dt
        raw_data <- m2dt(data = data,
                         model = model, 
                         conf_level = conf_level,
                         keep_qc_stats = keep_qc_stats,
                         include_intercept = FALSE,
                         reference_rows = reference_rows,
                         skip_counts = (!show_n && !show_events),
                         conf_method = conf_method)

        ## Propagate cached confint from m2dt to model for downstream reuse
        cached_ci <- attr(raw_data, "cached_confint")
        if (!is.null(cached_ci)) {
            attr(model, "cached_confint") <- cached_ci
            attr(model, "cached_confint_level") <- attr(raw_data, "cached_confint_level")
        }

        ## Format results for publication
        formatted <- format_model_table(raw_data,
                                        show_n = show_n,
                                        show_events = show_events,
                                        digits = digits,
                                        p_digits = p_digits,
                                        labels = labels,
                                        exponentiate = exponentiate,
                                        conf_level = conf_level,
                                        marks = marks)
        ## Extract formula and predictors from model
        formula_str <- deparse(stats::formula(model))
        
        ## Attach metadata
        data.table::setattr(formatted, "model", model)
        data.table::setattr(formatted, "raw_data", raw_data)
        data.table::setattr(formatted, "formula_str", formula_str)
        data.table::setattr(formatted, "model_scope", raw_data$model_scope[1])
        data.table::setattr(formatted, "model_type", raw_data$model_type[1])
        
        ## Extract significant predictors (p < 0.05)
        if ("p_value" %in% names(raw_data) && "variable" %in% names(raw_data)) {
            sig_vars <- unique(
                raw_data[!is.na(p_value) & p_value < 0.05]$variable
            )
            data.table::setattr(formatted, "significant", sig_vars)
        }
        
        class(formatted) <- c("fit_result", class(formatted))
        
        return(formatted)
    }
    
    ## Formula-based workflow: fit new model
    
    ## Validate required parameters for formula-based workflow
    if (is.null(data)) {
        stop("'data' parameter is required when not providing a pre-fitted model.")
    }
    if (is.null(outcome)) {
        stop("'outcome' parameter is required when not providing a pre-fitted model.")
    }
    if (is.null(predictors)) {
        stop("'predictors' parameter is required when not providing a pre-fitted model.")
    }
    
    ## Store original data name for reference
    data_name <- deparse(substitute(data))
    
    ## Convert to data.table once at the start
    if (!data.table::is.data.table(data)) {
        data <- data.table::as.data.table(data)
    }
    
    ## Validate inputs and auto-correct model type if needed
    validation <- validate_fit_inputs(
        data = data,
        outcome = outcome,
        predictors = predictors,
        model_type = model_type,
        family = if (model_type %in% c("glm", "glmer")) family else NULL,
        conf_level = conf_level,
        digits = digits,
        p_digits = p_digits,
        auto_correct_model = TRUE
    )
    
    ## Apply any auto-corrections
    if (validation$auto_corrected) {
        model_type <- validation$model_type
    }

    ## Resolve Gamma family string to log link
    ## The canonical link for Gamma is inverse (1/μ), which produces
    ## uninterpretable coefficients. The log link yields exp(β) = ratio,
    ## consistent with the multiplicative effect interpretation used for
    ## OR, HR, and RR throughout summata.
    if (model_type %in% c("glm", "glmer") && is.character(family) &&
        tolower(family) == "gamma") {
        family <- stats::Gamma(link = "log")
    }
    
    ## Validate random effects for mixed-effects models
    ## Support both explicit 'random' parameter AND random effects embedded in predictors
    mixed_types <- c("glmer", "lmer", "coxme")
    has_random_in_predictors <- any(grepl("\\|", predictors))
    
    if (model_type %in% mixed_types && is.null(random) && !has_random_in_predictors) {
        stop("'random' parameter is required for mixed-effects models (", model_type, ").\n",
             "Example: random = \"(1|site)\" for random intercepts by site.\n",
             "Alternatively, include random effects in 'predictors' using the same syntax.")
    }
    if (!is.null(random) && !model_type %in% mixed_types) {
        warning("'random' parameter is ignored for non-mixed-effects models (", model_type, ").")
        random <- NULL
    }
    
    ## Extract random effects from predictors if present (for mixed-effects models)
    ## The random effects get added to the formula but aren't treated as regular predictors
    if (model_type %in% mixed_types && has_random_in_predictors) {
        random_mask <- grepl("\\|", predictors)
        random_in_preds <- predictors[random_mask]
        fixed_predictors <- predictors[!random_mask]
        
        ## Combine explicit random with embedded random (if both provided)
        if (!is.null(random)) {
            message("Note: Random effects specified in both 'random' parameter and 'predictors'. ",
                    "Combining: ", random, " + ", paste(random_in_preds, collapse = " + "))
            random <- paste(c(random, random_in_preds), collapse = " + ")
        } else {
            random <- paste(random_in_preds, collapse = " + ")
        }
        
        ## Update predictors to only include fixed effects
        predictors <- fixed_predictors
    }
    
    ## Build formula string
    if (!is.null(interactions)) {
        formula_str <- paste(outcome, "~", 
                             paste(c(predictors, interactions), collapse = " + "))
    } else {
        formula_str <- paste(outcome, "~", paste(predictors, collapse = " + "))
    }
    
    ## Add random effects if provided (for lmer/glmer - coxme uses same formula syntax)
    if (!is.null(random) && model_type %in% c("glmer", "lmer", "coxme")) {
        formula_str <- paste(formula_str, "+", random)
    }
    
    ## Add strata if provided (for Cox models)
    if (!is.null(strata) && model_type %in% c("coxph", "clogit")) {
        formula_str <- paste(formula_str, "+ strata(", strata, ")")
    }
    
    formula <- stats::as.formula(formula_str)
    
    ## Fit model based on type - use 'data' directly
    model <- quiet_fit(quote({
        if (model_type == "glm") {
            if (!is.null(weights)) {
                stats::glm(formula, data = data, family = family, 
                           weights = data[[weights]], ...)
            } else {
                stats::glm(formula, data = data, family = family, ...)
            }
            
        } else if (model_type == "negbin") {
            if (!requireNamespace("MASS", quietly = TRUE))
                stop("Package 'MASS' required for negative binomial models")
            if (!is.null(weights)) {
                MASS::glm.nb(formula, data = data, 
                              weights = data[[weights]], ...)
            } else {
                MASS::glm.nb(formula, data = data, ...)
            }
            
        } else if (model_type == "lm") {
            if (!is.null(weights)) {
                stats::lm(formula, data = data, weights = data[[weights]], ...)
            } else {
                stats::lm(formula, data = data, ...)
            }
            
        } else if (model_type == "coxph") {
            if (!requireNamespace("survival", quietly = TRUE)) 
                stop("Package 'survival' required for Cox models")
            
            if (!is.null(cluster)) {
                survival::coxph(formula, data = data, 
                                cluster = data[[cluster]], ...)
            } else {
                survival::coxph(formula, data = data, ...)
            }
            
        } else if (model_type == "clogit") {
            if (!requireNamespace("survival", quietly = TRUE)) 
                stop("Package 'survival' required for conditional logistic regression")
            survival::clogit(formula, data = data, ...)
            
        } else if (model_type == "coxme") {
            if (!requireNamespace("coxme", quietly = TRUE))
                stop("Package 'coxme' required for mixed effects Cox models")
            coxme::coxme(formula, data = data, ...)
            
        } else if (model_type == "lmer") {
            if (!requireNamespace("lme4", quietly = TRUE))
                stop("Package 'lme4' required for linear mixed effects models")
            lme4::lmer(formula, data = data, ...)
            
        } else if (model_type == "glmer") {
            if (!requireNamespace("lme4", quietly = TRUE))
                stop("Package 'lme4' required for mixed effects models")
            lme4::glmer(formula, data = data, family = family, ...)
            
        } else {
            stop("Unsupported model type: ", model_type)
        }
    }), verbose = verbose)
    
    ## Store the data directly in the model
    if (isS4(model)) {
        attr(model, "data") <- data
    } else {
        model$data <- data
        attr(model, "data") <- data
    }
    
    ## Attach metadata as attributes
    data.table::setattr(model, "formula_str", formula_str)
    data.table::setattr(model, "predictors", predictors)
    data.table::setattr(model, "model_type", model_type)
    data.table::setattr(model, "data_name", data_name)
    
    if (!is.null(interactions)) {
        data.table::setattr(model, "interactions", interactions)
    }
    if (!is.null(random)) {
        data.table::setattr(model, "random", random)
    }
    if (!is.null(strata)) {
        data.table::setattr(model, "strata", strata)
    }
    if (!is.null(cluster)) {
        data.table::setattr(model, "cluster", cluster)
    }
    if (!is.null(weights)) {
        data.table::setattr(model, "weights", weights)
    }
    
    ## Convert to readable format using m2dt
    raw_data <- m2dt(data = data,
                     model = model, 
                     conf_level = conf_level,
                     keep_qc_stats = keep_qc_stats,
                     include_intercept = FALSE,
                     reference_rows = reference_rows,
                     skip_counts = (!show_n && !show_events),
                     conf_method = conf_method)

    ## Propagate cached confint from m2dt to model for downstream reuse
    ## (avoids redundant profiling in forest plot functions)
    cached_ci <- attr(raw_data, "cached_confint")
    if (!is.null(cached_ci)) {
        attr(model, "cached_confint") <- cached_ci
        attr(model, "cached_confint_level") <- attr(raw_data, "cached_confint_level")
    }

    ## Format results for publication
    formatted <- format_model_table(raw_data,
                                    show_n = show_n,
                                    show_events = show_events,
                                    digits = digits,
                                    p_digits = p_digits,
                                    labels = labels,
                                    exponentiate = exponentiate,
                                    conf_level = conf_level,
                                    marks = marks)

    ## Attach metadata
    data.table::setattr(formatted, "model", model)
    data.table::setattr(formatted, "raw_data", raw_data)
    data.table::setattr(formatted, "outcome", outcome)
    data.table::setattr(formatted, "predictors", predictors)
    data.table::setattr(formatted, "formula_str", formula_str)
    data.table::setattr(formatted, "model_scope", raw_data$model_scope[1])
    data.table::setattr(formatted, "model_type", raw_data$model_type[1])

    ## Extract significant predictors (p < 0.05)
    if ("p_value" %in% names(raw_data) && "variable" %in% names(raw_data)) {
        sig_vars <- unique(
            raw_data[!is.na(p_value) & p_value < 0.05]$variable
        )
        data.table::setattr(formatted, "significant", sig_vars)
    }

    ## Add metadata from model fitting
    if (!is.null(interactions)) {
        data.table::setattr(formatted, "interactions", interactions)
    }
    if (!is.null(random)) {
        data.table::setattr(formatted, "random", random)
    }
    if (!is.null(strata)) {
        data.table::setattr(formatted, "strata", strata)
    }
    if (!is.null(cluster)) {
        data.table::setattr(formatted, "cluster", cluster)
    }
    if (!is.null(weights)) {
        data.table::setattr(formatted, "weights", weights)
    }

    class(formatted) <- c("fit_result", class(formatted))

    return(formatted)
}

#' Print method for fit results
#' 
#' Displays a summary header with model scope (Univariable/Multivariable),
#' model type, formula, sample size, and event count before printing the
#' formatted results table.
#' 
#' @param x fit_result object.
#' @param ... Additional arguments passed to print methods.
#' @return Invisibly returns the input object \code{x}. Called for its
#'   side effect of printing a formatted summary to the console.
#' @keywords internal
#' @family regression functions
#' @export
print.fit_result <- function(x, ...) {
    cat("\n", attr(x, "model_scope"), " ", attr(x, "model_type"), " Model\n", sep = "")
    cat("Formula: ", attr(x, "formula_str"), "\n", sep = "")
    
    ## Get sample size from raw results
    raw <- attr(x, "raw_data")
    if (!is.null(raw)) {
        cat("n = ", raw$n[1], sep = "")
        if (!is.na(raw$events[1])) cat(", Events = ", raw$events[1], sep = "")
        cat("\n")
    }
    cat("\n")
    
    NextMethod("print", x)
    invisible(x)
}

Try the summata package in your browser

Any scripts or data that you put into this service are public.

summata documentation built on May 7, 2026, 5:07 p.m.