R/uniscreen.R

Defines functions print.uniscreen_result uniscreen

Documented in print.uniscreen_result uniscreen

#' Univariable Screening for Multiple Predictors
#'
#' Performs comprehensive univariable (unadjusted) regression analyses by fitting 
#' separate models for each predictor against a single outcome. This function is 
#' designed for initial variable screening, hypothesis generation, and understanding 
#' crude associations before multivariable modeling. Returns publication-ready 
#' formatted results with optional \emph{p}-value filtering.
#'
#' @param data Data frame or data.table containing the analysis dataset. The 
#'   function automatically converts data frames to data.tables for efficient 
#'   processing.
#'   
#' @param outcome Character string specifying the outcome variable name. For 
#'   survival analysis, use \code{Surv()} syntax from the \pkg{survival} package 
#'   (\emph{e.g.,} \code{"Surv(time, status)"} or \code{"Surv(os_months, os_status)"}).
#'   
#' @param predictors Character vector of predictor variable names to screen. Each 
#'   predictor is tested independently in its own univariable model. Can include 
#'   continuous, categorical (factor), or binary variables.
#'   
#' @param model_type Character string specifying the type of regression model to 
#'   fit. Options include:
#'   \itemize{
#'     \item \code{"glm"} - Generalized linear model (default). Supports multiple 
#'       distributions via the \code{family} parameter including logistic, Poisson, 
#'       Gamma, Gaussian, and quasi-likelihood models.
#'     \item \code{"lm"} - Linear regression for continuous outcomes with normally 
#'       distributed errors. Equivalent to \code{glm} with \code{family = "gaussian"}.
#'     \item \code{"coxph"} - Cox proportional hazards model for time-to-event 
#'       survival analysis. Requires \code{Surv()} outcome syntax.
#'     \item \code{"clogit"} - Conditional logistic regression for matched 
#'       case-control studies or stratified analyses.
#'     \item \code{"negbin"} - Negative binomial regression for overdispersed count 
#'       data (requires MASS package). Estimates an additional dispersion parameter 
#'       compared to Poisson regression.
#'     \item \code{"glmer"} - Generalized linear mixed-effects model for hierarchical 
#'       or clustered data with non-normal outcomes (requires \pkg{lme4} package and 
#'       \code{random} parameter).
#'     \item \code{"lmer"} - Linear mixed-effects model for hierarchical or clustered 
#'       data with continuous outcomes (requires \pkg{lme4} package and \code{random} 
#'       parameter).
#'     \item \code{"coxme"} - Cox mixed-effects model for clustered survival data 
#'       (requires coxme package and \code{random} parameter).
#'   }
#'
#' @param random Character string specifying the random-effects formula for 
#'   mixed-effects models (\code{glmer}, \code{lmer}, \code{coxme}). Use standard
#'   \pkg{lme4}/\pkg{coxme} syntax, \emph{e.g.,} \code{"(1|site)"} for random
#'   intercepts by site, \code{"(1|site/patient)"} for nested random effects.
#'   Required when \code{model_type} is a mixed-effects model type unless random
#'   effects are included in the \code{predictors} vector. Alternatively, random
#'   effects can be included directly in the \code{predictors} vector using the same 
#'   syntax (\emph{e.g.,} \code{predictors = c("age", "sex", "(1|site)")}), though 
#'   they will not be iterated over as predictors. Default is \code{NULL}.
#'   
#' @param family For GLM and GLMER models, specifies the error distribution and link 
#'   function. Can be a character string, a family function, or a family object.
#'   Ignored for non-GLM/GLMER models.
#'   
#'   \strong{Binary/Binomial outcomes:}
#'   \itemize{
#'     \item \code{"binomial"} or \code{binomial()} - Logistic regression for binary 
#'       outcomes (0/1, TRUE/FALSE). Returns odds ratios (OR). Default.
#'     \item \code{"quasibinomial"} or \code{quasibinomial()} - Logistic regression 
#'       with overdispersion. Use when residual deviance >> residual df.
#'     \item \code{binomial(link = "probit")} - Probit regression (normal CDF link).
#'     \item \code{binomial(link = "cloglog")} - Complementary log-log link for 
#'       asymmetric binary outcomes.
#'   }
#'   
#'   \strong{Count outcomes:}
#'   \itemize{
#'     \item \code{"poisson"} or \code{poisson()} - Poisson regression for count 
#'       data. Returns rate ratios (RR). Assumes mean = variance.
#'     \item \code{"quasipoisson"} or \code{quasipoisson()} - Poisson regression 
#'       with overdispersion. Use when variance > mean.
#'   }
#'   
#'   \strong{Continuous outcomes:}
#'   \itemize{
#'     \item \code{"gaussian"} or \code{gaussian()} - Normal/Gaussian distribution 
#'       for continuous outcomes. Equivalent to linear regression.
#'     \item \code{gaussian(link = "log")} - Log-linear model for positive continuous 
#'       outcomes. Returns multiplicative effects.
#'     \item \code{gaussian(link = "inverse")} - Inverse link for specific applications.
#'   }
#'   
#'   \strong{Positive continuous outcomes:}
#'   \itemize{
#'     \item \code{"Gamma"} or \code{Gamma()} - Gamma distribution for positive, 
#'       right-skewed continuous data (\emph{e.g.,} costs, lengths of stay). When
#'       passed as a string, resolves to log link for interpretable multiplicative
#'       effects.
#'     \item \code{Gamma(link = "inverse")} - Gamma with inverse (canonical) link.
#'     \item \code{Gamma(link = "identity")} - Gamma with identity link for additive 
#'       effects on positive outcomes.
#'     \item \code{"inverse.gaussian"} or \code{inverse.gaussian()} - Inverse Gaussian 

#'       for positive, highly right-skewed data.
#'   }
#'   
#'   For negative binomial regression (overdispersed counts), use 
#'   \code{model_type = "negbin"} instead of the \code{family} parameter.
#'   
#'   See \code{\link[stats]{family}} for additional details and options.
#'   
#' @param p_threshold Numeric value between 0 and 1 specifying the \emph{p}-value threshold 
#'   used to count significant predictors in the printed summary. All predictors
#'   are always included in the output table. Default is 0.05.
#'   
#' @param conf_level Numeric confidence level for confidence intervals. Must be 
#'   between 0 and 1. Default is 0.95 (95\% confidence intervals).
#'   
#' @param reference_rows Logical. If \code{TRUE}, adds rows for reference 
#'   categories of factor variables with baseline values (OR/HR/RR = 1, 
#'   coefficient = 0). Makes tables complete and easier to interpret. 
#'   Default is \code{TRUE}.
#'   
#' @param show_n Logical. If \code{TRUE}, includes the sample size column in 
#'   the output table. Default is \code{TRUE}.
#'   
#' @param show_events Logical. If \code{TRUE}, includes the events column in the 
#'   output table (relevant for survival and logistic regression). Default is 
#'   \code{TRUE}.
#'   
#' @param digits Integer specifying the number of decimal places for effect 
#'   estimates (OR, HR, RR, coefficients). Default is 2.
#'   
#' @param p_digits Integer specifying the number of decimal places for
#'   \emph{p}-values. Values smaller than \code{10^(-p_digits)} are displayed
#'   as \code{"< 0.001"} (for \code{p_digits = 3}), \code{"< 0.0001"} (for
#'   \code{p_digits = 4}), etc. Default is 3.
#'
#' @param labels Named character vector or list providing custom display 
#'   labels for variables. Names should match predictor names, values are the 
#'   display labels. Predictors not in \code{labels} use their original names. 
#'   Default is \code{NULL}.
#'   
#' @param keep_models Logical. If \code{TRUE}, stores all fitted model objects 
#'   in the output as an attribute. This allows access to models for diagnostics, 
#'   predictions, or further analysis, but can consume significant memory for 
#'   large datasets or many predictors. Models are accessible via 
#'   \code{attr(result, "models")}. Default is \code{FALSE}.
#'   
#' @param exponentiate Logical. Whether to exponentiate coefficients (display 
#'   OR/HR/RR instead of log odds/log hazards). Default is \code{NULL}, which 
#'   automatically exponentiates for logistic, Poisson, and Cox models, and 
#'   displays raw coefficients for linear models and other GLM families. Set 
#'   to \code{TRUE} to force exponentiation or \code{FALSE} to force coefficients.
#'
#' @param conf_method Character string controlling the confidence interval method.
#'   If \code{NULL} (default), uses \code{getOption("summata.conf_method", "profile")}.
#'   \itemize{
#'     \item \code{"profile"} - Profile likelihood intervals for GLM and negative
#'       binomial models (via \code{MASS::confint.glm()}), exact \emph{t}-distribution
#'       intervals for linear models. Falls back to Wald on profiling failure.
#'       Quasi-likelihood families always use Wald (no true likelihood).
#'     \item \code{"wald"} - Wald intervals (coefficient \eqn{\pm} \emph{z}
#'       \eqn{\times} SE) for all model types. Faster but less accurate near
#'       boundary conditions or with small subgroups.
#'   }
#'   Cox and mixed-effects models use Wald intervals regardless of this setting.
#'   Set globally with \code{options(summata.conf_method = "wald")} to use Wald
#'   throughout a session.
#'
#' @param parallel Logical. If \code{TRUE} (default), fits models in parallel 
#'   using multiple CPU cores for improved performance with many predictors. 
#'   On Unix/Mac systems, uses fork-based parallelism via \code{mclapply}; 
#'   on Windows, uses socket clusters via \code{parLapply}. Set to \code{FALSE} 
#'   for sequential processing.
#'
#' @param n_cores Integer specifying the number of CPU cores to use for parallel 
#'   processing. Default is \code{NULL}, which automatically detects available 
#'   cores and uses \code{detectCores() - 1}. During R CMD check, the number 
#'   of cores is automatically limited to 2 per CRAN policy. Ignored when 
#'   \code{parallel = FALSE}.
#'
#' @param number_format Character string or two-element character vector
#'   controlling thousand and decimal separators in formatted output. Named
#'   presets:
#'   \itemize{
#'     \item \code{"us"} - Comma thousands, period decimal: \code{1,234.56} [default]
#'     \item \code{"eu"} - Period thousands, comma decimal: \code{1.234,56}
#'     \item \code{"space"} - Thin-space thousands, period decimal: \code{1 234.56}
#'       (SI/ISO 31-0)
#'     \item \code{"none"} - No thousands separator: \code{1234.56}
#'   }
#'   Or provide a custom two-element vector \code{c(big.mark, decimal.mark)},
#'   \emph{e.g.}, \code{c("'", ".")} for Swiss-style: \verb{1'234.56}.
#'
#'   When \code{NULL} (default), uses
#'   \code{getOption("summata.number_format", "us")}. Set the global option
#'   once per session to avoid passing this argument repeatedly:
#'   \preformatted{
#'     options(summata.number_format = "eu")
#'   }
#'
#' @param verbose Logical. If \code{TRUE}, displays model fitting warnings
#'   (\emph{e.g.,} singular fit, convergence issues). If \code{FALSE} (default),
#'   routine fitting messages are suppressed while unexpected warnings are
#'   preserved. When \code{NULL}, uses
#'   \code{getOption("summata.verbose", FALSE)}.
#'
#' @param ... Additional arguments passed to the underlying model fitting functions 
#'   (\code{\link[stats]{glm}}, \code{\link[stats]{lm}}, 
#'   \code{\link[survival]{coxph}}, \emph{etc.}). Common options include \code{weights}, 
#'   \code{subset}, \code{na.action}, and model-specific control parameters.
#'
#' @return A data.table with S3 class \code{"uniscreen_result"} containing formatted 
#'   univariable screening results. The table structure includes:
#'   \describe{
#'     \item{Variable}{Character. Predictor name or custom label (from \code{labels})}
#'     \item{Group}{Character. For factor variables: category level. For continuous 
#'       variables: typically empty or descriptive statistic label}
#'     \item{n}{Integer. Sample size used in the model (if \code{show_n = TRUE})}
#'     \item{n_group}{Integer. Sample size for this specific factor level 
#'       (factor variables only)}
#'     \item{events}{Integer. Total number of events in the model for survival 
#'       or logistic regression (if \code{show_events = TRUE})}
#'     \item{events_group}{Integer. Number of events for this specific factor 
#'       level (factor variables only)}
#'     \item{OR/HR/RR/Coefficient (95\% CI)}{Character. Formatted effect 
#'       estimate with confidence interval. Column name depends on model type:
#'       "OR (95\% CI)" for logistic, "HR (95\% CI)" for survival, 
#'       "RR (95\% CI)" for counts, "Coefficient (95\% CI)" for linear models}
#'     \item{\emph{p}-value}{Character. Formatted \emph{p}-value from the Wald test}
#'   }
#'   
#'   The returned object includes the following attributes accessible via \code{attr()}:
#'   \describe{
#'     \item{raw_data}{data.table. Unformatted numeric results with separate 
#'       columns for coefficients, standard errors, confidence interval bounds, 
#'       \emph{etc.} Suitable for further statistical analysis or custom formatting}
#'     \item{models}{List (if \code{keep_models = TRUE}). Named list of fitted 
#'       model objects, with predictor names as list names. Access specific models 
#'       via \code{attr(result, "models")[["predictor_name"]]}}
#'     \item{outcome}{Character. The outcome variable name used}
#'     \item{model_type}{Character. The regression model type used}
#'     \item{model_scope}{Character. Always "Univariable" for screening results}
#'     \item{screening_type}{Character. Always "univariable" to identify the 
#'       analysis type}
#'     \item{p_threshold}{Numeric. The \emph{p}-value threshold used for significance}
#'     \item{significant}{Character vector. Names of predictors with \emph{p}-value 
#'       below the screening threshold, suitable for passing directly to 
#'       downstream modeling functions}
#'   }
#'
#' @details
#' \strong{Analysis Approach:}
#' 
#' The function implements a comprehensive univariable screening workflow:
#' \enumerate{
#'   \item For each predictor in \code{predictors}, fits a separate model: 
#'     \code{outcome ~ predictor}
#'   \item Extracts coefficients, confidence intervals, and \emph{p}-values from each model
#'   \item Combines results into a single table for easy comparison
#'   \item Formats output for publication with appropriate effect measures
#' }
#' 
#' Each predictor is tested \emph{independently} - these are crude (unadjusted) 
#' associations that do not account for confounding or interaction effects.
#' 
#' \strong{When to Use Univariable Screening:}
#' \itemize{
#'   \item \strong{Initial variable selection}: Identify predictors associated 
#'     with the outcome before building multivariable models
#'   \item \strong{Hypothesis generation}: Explore potential associations in 
#'     exploratory analyses
#'   \item \strong{Understanding crude associations}: Report unadjusted effects 
#'     alongside adjusted estimates
#'   \item \strong{Variable reduction}: Use \emph{p}-value thresholds (\emph{e.g.,}
#'     \emph{p} < 0.20) to identify candidates for multivariable modeling
#'   \item \strong{Checking multicollinearity}: Compare univariable and 
#'     multivariable effects to identify potential collinearity
#' }
#' 
#' \strong{Threshold for \emph{p}-values:}
#' 
#' The \code{p_threshold} parameter controls the significance threshold used 
#' in the printed summary to count how many predictors are significant. All
#' predictors are always included in the output table regardless of this setting.
#' 
#' \strong{Effect Measures by Model Type:}
#' \itemize{
#'   \item \strong{Logistic regression} (\code{model_type = "glm"}, 
#'     \code{family = "binomial"}): Odds ratios (OR)
#'   \item \strong{Cox regression} (\code{model_type = "coxph"}): Hazard ratios (HR)
#'   \item \strong{Poisson regression} (\code{model_type = "glm"}, 
#'     \code{family = "poisson"}): Rate/risk ratios (RR)
#'   \item \strong{Negative binomial} (\code{model_type = "negbin"}): Rate ratios (RR)
#'   \item \strong{Linear regression} (\code{model_type = "lm"} or GLM with 
#'     identity link): Raw coefficient estimates
#'   \item \strong{Gamma regression} (\code{model_type = "glm"}, 
#'     \code{family = "Gamma"}): Multiplicative effects (with default log link)
#' }
#' 
#' \strong{Memory Considerations:}
#' 
#' When \code{keep_models = FALSE} (default), fitted models are discarded after 
#' extracting results to conserve memory. Set \code{keep_models = TRUE} only when 
#' the following are needed:
#' \itemize{
#'   \item Model diagnostic plots
#'   \item Predictions from individual models
#'   \item Additional model statistics not extracted by default
#'   \item Further analysis of specific models
#' }
#'
#' @seealso 
#' \code{\link{fit}} for fitting a single multivariable model,
#' \code{\link{fullfit}} for complete univariable-to-multivariable workflow,
#' \code{\link{compfit}} for comparing multiple models,
#' \code{\link{m2dt}} for converting individual models to tables
#'
#' @examples
#' # Load example data
#' data(clintrial)
#' data(clintrial_labels)
#' 
#' # Example 1: Basic logistic regression screening
#' screen1 <- uniscreen(
#'     data = clintrial,
#'     outcome = "os_status",
#'     predictors = c("age", "sex", "bmi", "smoking", "hypertension"),
#'     model_type = "glm",
#'     family = "binomial",
#'     parallel = FALSE
#' )
#' print(screen1)
#'
#' \donttest{
#' 
#' # Example 2: With custom variable labels
#' screen2 <- uniscreen(
#'     data = clintrial,
#'     outcome = "os_status",
#'     predictors = c("age", "sex", "bmi", "treatment"),
#'     labels = clintrial_labels,
#'     parallel = FALSE
#' )
#' print(screen2)
#' 
#' # Example 3: Filter by p-value threshold
#' # Only keep predictors with p < 0.20 (common for screening)
#' screen3 <- uniscreen(
#'     data = clintrial,
#'     outcome = "os_status",
#'     predictors = c("age", "sex", "bmi", "smoking", "hypertension", 
#'                   "diabetes", "stage"),
#'     p_threshold = 0.20,
#'     labels = clintrial_labels,
#'     parallel = FALSE
#' )
#' print(screen3)
#' # Only significant predictors are shown
#' 
#' # Example 4: Cox proportional hazards screening
#' library(survival)
#' cox_screen <- uniscreen(
#'     data = clintrial,
#'     outcome = "Surv(os_months, os_status)",
#'     predictors = c("age", "sex", "treatment", "stage", "grade"),
#'     model_type = "coxph",
#'     labels = clintrial_labels,
#'     parallel = FALSE
#' )
#' print(cox_screen)
#' # Returns hazard ratios (HR) instead of odds ratios
#' 
#' # Example 5: Keep models for diagnostics
#' screen5 <- uniscreen(
#'     data = clintrial,
#'     outcome = "os_status",
#'     predictors = c("age", "bmi", "creatinine"),
#'     keep_models = TRUE,
#'     parallel = FALSE
#' )
#' 
#' # Access stored models
#' models <- attr(screen5, "models")
#' summary(models[["age"]])
#' plot(models[["age"]])  # Diagnostic plots
#' 
#' # Example 6: Linear regression screening
#' linear_screen <- uniscreen(
#'     data = clintrial,
#'     outcome = "bmi",
#'     predictors = c("age", "sex", "smoking", "creatinine", "hemoglobin"),
#'     model_type = "lm",
#'     labels = clintrial_labels,
#'     parallel = FALSE
#' )
#' print(linear_screen)
#' 
#' # Example 7: Poisson regression for equidispersed count outcomes
#' # fu_count has variance ~= mean, appropriate for standard Poisson
#' poisson_screen <- uniscreen(
#'     data = clintrial,
#'     outcome = "fu_count",
#'     predictors = c("age", "stage", "treatment", "surgery"),
#'     model_type = "glm",
#'     family = "poisson",
#'     labels = clintrial_labels,
#'     parallel = FALSE
#' )
#' print(poisson_screen)
#' # Returns rate ratios (RR)
#' 
#' # Example 8: Negative binomial for overdispersed counts
#' # ae_count has variance > mean (overdispersed), use negbin
#' if (requireNamespace("MASS", quietly = TRUE)) {
#'     nb_screen <- uniscreen(
#'         data = clintrial,
#'         outcome = "ae_count",
#'         predictors = c("age", "treatment", "diabetes", "surgery"),
#'         model_type = "negbin",
#'         labels = clintrial_labels,
#'         parallel = FALSE
#'     )
#'     print(nb_screen)
#' }
#' 
#' # Example 9: Gamma regression for positive continuous outcomes (\emph{e.g.,} costs)
#' gamma_screen <- uniscreen(
#'     data = clintrial,
#'     outcome = "los_days",
#'     predictors = c("age", "sex", "treatment", "surgery"),
#'     model_type = "glm",
#'     family = Gamma(link = "log"),
#'     labels = clintrial_labels,
#'     parallel = FALSE
#' )
#' print(gamma_screen)
#' 
#' # Example 10: Hide reference rows for factor variables
#' screen10 <- uniscreen(
#'     data = clintrial,
#'     outcome = "os_status",
#'     predictors = c("treatment", "stage", "grade"),
#'     reference_rows = FALSE,
#'     parallel = FALSE
#' )
#' print(screen10)
#' # Reference categories not shown
#' 
#' # Example 11: Customize decimal places
#' screen11 <- uniscreen(
#'     data = clintrial,
#'     outcome = "os_status",
#'     predictors = c("age", "bmi", "creatinine"),
#'     digits = 3,      # 3 decimal places for OR
#'     p_digits = 4     # 4 decimal places for p-values
#' )
#' print(screen11)
#' 
#' # Example 12: Hide sample size and event columns
#' screen12 <- uniscreen(
#'     data = clintrial,
#'     outcome = "os_status",
#'     predictors = c("age", "sex", "bmi"),
#'     show_n = FALSE,
#'     show_events = FALSE,
#'     parallel = FALSE
#' )
#' print(screen12)
#' 
#' # Example 13: Access raw numeric data
#' screen13 <- uniscreen(
#'     data = clintrial,
#'     outcome = "os_status",
#'     predictors = c("age", "sex", "treatment"),
#'     parallel = FALSE
#' )
#' raw_data <- attr(screen13, "raw_data")
#' print(raw_data)
#' # Contains unformatted coefficients, SEs, CIs, etc.
#' 
#' # Example 14: Force coefficient display instead of OR
#' screen14 <- uniscreen(
#'     data = clintrial,
#'     outcome = "os_status",
#'     predictors = c("age", "bmi"),
#'     model_type = "glm",
#'     family = "binomial",
#'     parallel = FALSE,
#'     exponentiate = FALSE  # Show log odds instead of OR
#' )
#' print(screen14)
#' 
#' # Example 15: Screening with weights
#' screen15 <- uniscreen(
#'     data = clintrial,
#'     outcome = "Surv(os_months, os_status)",
#'     predictors = c("age", "sex", "bmi"),
#'     model_type = "coxph",
#'     weights = runif(nrow(clintrial), min = 0.5, max = 2),  # Random numbers for example
#'     parallel = FALSE
#' )
#' 
#' # Example 16: Strict significance filter (p < 0.05)
#' sig_only <- uniscreen(
#'     data = clintrial,
#'     outcome = "os_status",
#'     predictors = c("age", "sex", "bmi", "smoking", "hypertension", 
#'                   "diabetes", "ecog", "treatment", "stage", "grade"),
#'     p_threshold = 0.05,
#'     labels = clintrial_labels,
#'     parallel = FALSE
#' )
#' 
#' # Check how many predictors passed the filter
#' n_significant <- length(unique(sig_only$Variable[sig_only$Variable != ""]))
#' cat("Significant predictors:", n_significant, "\n")
#' 
#' # Example 17: Complete workflow - screen then use in multivariable
#' # Step 1: Screen with liberal threshold
#' candidates <- uniscreen(
#'     data = clintrial,
#'     outcome = "os_status",
#'     predictors = c("age", "sex", "bmi", "smoking", "hypertension",
#'                   "diabetes", "treatment", "stage", "grade"),
#'     p_threshold = 0.20,
#'     parallel = FALSE
#' )
#' 
#' # Step 2: Extract significant predictor names
#' sig_predictors <- attr(candidates, "significant")
#' 
#' # Step 3: Fit multivariable model with selected predictors
#' multi_model <- fit(
#'     data = clintrial,
#'     outcome = "os_status",
#'     predictors = sig_predictors,
#'     labels = clintrial_labels
#' )
#' print(multi_model)
#'
#' # Example 18: Mixed-effects logistic regression (glmer)
#' # Accounts for clustering by site
#' if (requireNamespace("lme4", quietly = TRUE)) {
#'     glmer_screen <- uniscreen(
#'         data = clintrial,
#'         outcome = "os_status",
#'         predictors = c("age", "sex", "treatment", "stage"),
#'         model_type = "glmer",
#'         random = "(1|site)",
#'         family = "binomial",
#'         labels = clintrial_labels,
#'         parallel = FALSE
#'     )
#'     print(glmer_screen)
#' }
#'
#' # Example 19: Mixed-effects linear regression (lmer)
#' if (requireNamespace("lme4", quietly = TRUE)) {
#'     lmer_screen <- uniscreen(
#'         data = clintrial,
#'         outcome = "biomarker_x",
#'         predictors = c("age", "sex", "treatment", "smoking"),
#'         model_type = "lmer",
#'         random = "(1|site)",
#'         labels = clintrial_labels,
#'         parallel = FALSE
#'     )
#'     print(lmer_screen)
#' }
#'
#' # Example 20: Mixed-effects Cox model (coxme)
#' if (requireNamespace("coxme", quietly = TRUE)) {
#'     coxme_screen <- uniscreen(
#'         data = clintrial,
#'         outcome = "Surv(os_months, os_status)",
#'         predictors = c("age", "sex", "treatment", "stage"),
#'         model_type = "coxme",
#'         random = "(1|site)",
#'         labels = clintrial_labels,
#'         parallel = FALSE
#'     )
#'     print(coxme_screen)
#' }
#'
#' # Example 21: Mixed-effects with nested random effects
#' # Patients nested within sites
#' if (requireNamespace("lme4", quietly = TRUE)) {
#'     nested_screen <- uniscreen(
#'         data = clintrial,
#'         outcome = "os_status",
#'         predictors = c("age", "treatment"),
#'         model_type = "glmer",
#'         random = "(1|site/patient_id)",
#'         family = "binomial",
#'         parallel = FALSE
#'     )
#' }
#' 
#' # Example 22: Quasipoisson for overdispersed count data
#' # Alternative to negative binomial when MASS not available
#' quasi_screen <- uniscreen(
#'     data = clintrial,
#'     outcome = "ae_count",
#'     predictors = c("age", "treatment", "diabetes", "surgery", "stage"),
#'     model_type = "glm",
#'     family = "quasipoisson",
#'     labels = clintrial_labels,
#'     parallel = FALSE
#' )
#' print(quasi_screen)
#' # Adjusts standard errors for overdispersion
#' 
#' # Example 23: Quasibinomial for overdispersed binary data
#' quasibin_screen <- uniscreen(
#'     data = clintrial,
#'     outcome = "any_complication",
#'     predictors = c("age", "bmi", "diabetes", "surgery", "ecog"),
#'     model_type = "glm",
#'     family = "quasibinomial",
#'     labels = clintrial_labels,
#'     parallel = FALSE
#' )
#' print(quasibin_screen)
#' 
#' # Example 24: Inverse Gaussian for highly skewed positive data
#' invgauss_screen <- uniscreen(
#'     data = clintrial,
#'     outcome = "recovery_days",
#'     predictors = c("age", "surgery", "pain_score", "los_days"),
#'     model_type = "glm",
#'     family = inverse.gaussian(link = "log"),
#'     labels = clintrial_labels,
#'     parallel = FALSE
#' )
#' print(invgauss_screen)
#'
#' }
#' 
#' @family regression functions
#' @export
uniscreen <- function(data,
                      outcome,
                      predictors,
                      model_type = "glm",
                      family = "binomial",
                      random = NULL,
                      p_threshold = 0.05,
                      conf_level = 0.95,
                      reference_rows = TRUE,
                      show_n = TRUE,
                      show_events = TRUE,
                      digits = 2,
                      p_digits = 3,
                      labels = NULL,
                      keep_models = FALSE,
                      exponentiate = NULL,
                      conf_method = NULL,
                      parallel = TRUE,
                      n_cores = NULL,
                      number_format = NULL,
                      verbose = NULL,
                      ...) {
    
    ## Convert to data.table once at the start
    if (!data.table::is.data.table(data)) {
        data <- data.table::as.data.table(data)
    }
    
    ## Resolve verbose setting
    verbose <- if (is.null(verbose)) getOption("summata.verbose", FALSE) else verbose
    
    ## Validate inputs and auto-correct model type if needed
    validation <- validate_fit_inputs(
        data = data,
        outcome = outcome,
        predictors = predictors,
        model_type = model_type,
        family = if (model_type %in% c("glm", "glmer")) family else NULL,
        conf_level = conf_level,
        digits = digits,
        p_digits = p_digits,
        p_threshold = p_threshold,
        auto_correct_model = TRUE
    )
    
    ## Apply any auto-corrections
    if (validation$auto_corrected) {
        model_type <- validation$model_type
    }
    
    ## Resolve Gamma family string to log link (see fit.R for rationale)
    if (model_type %in% c("glm", "glmer") && is.character(family) &&
        tolower(family) == "gamma") {
        family <- stats::Gamma(link = "log")
    }

    ## Validate random effects for mixed-effects models
    ## Support both explicit 'random' parameter AND random effects embedded in predictors
    mixed_types <- c("glmer", "lmer", "coxme")
    has_random_in_predictors <- any(grepl("\\|", predictors))
    
    if (model_type %in% mixed_types && is.null(random) && !has_random_in_predictors) {
        stop("'random' parameter is required for mixed-effects models (", model_type, ").\n",
             "Example: random = \"(1|site)\" for random intercepts by site.\n",
             "Alternatively, include random effects in 'predictors' using the same syntax.")
    }
    if (!is.null(random) && !model_type %in% mixed_types) {
        warning("'random' parameter is ignored for non-mixed-effects models (", model_type, ").")
        random <- NULL
    }
    
    ## Extract random effects from predictors if present (for mixed-effects models)
    ## These will be used in formula building but not iterated over as predictors
    random_in_preds <- NULL
    fixed_predictors <- predictors
    if (model_type %in% mixed_types && has_random_in_predictors) {
        random_mask <- grepl("\\|", predictors)
        random_in_preds <- predictors[random_mask]
        fixed_predictors <- predictors[!random_mask]
        
        ## Combine explicit random with embedded random (if both provided)
        if (!is.null(random)) {
            ## User provided both - combine them
            message("Note: Random effects specified in both 'random' parameter and 'predictors'. ",
                    "Combining: ", random, " + ", paste(random_in_preds, collapse = " + "))
            all_random <- paste(c(random, random_in_preds), collapse = " + ")
            random <- all_random
        } else {
            ## Only embedded random effects
            random <- paste(random_in_preds, collapse = " + ")
        }
        
        ## Update predictors to only include fixed effects for iteration
        predictors <- fixed_predictors
        
        if (length(predictors) == 0) {
            stop("No fixed-effect predictors specified. Random effects should be specified ",
                 "via the 'random' parameter or alongside fixed-effect predictors.")
        }
    }
    
    ## Define model fitting function (used by lapply/mclapply)
    fit_one_predictor <- function(pred) {
        ## Build formula - for mixed-effects, append random effects
        formula_str <- paste(outcome, "~", pred)
        if (model_type %in% c("glmer", "lmer")) {
            formula_str <- paste(formula_str, "+", random)
        }
        formula <- stats::as.formula(formula_str)
        
        ## Fit model based on type
        ## Use model=FALSE for glm/lm when not keeping models to save memory
        model <- quiet_fit(quote(
            switch(model_type,
                   "glm" = stats::glm(formula, data = data, family = family, 
                                      model = keep_models, x = FALSE, y = TRUE, ...),
                   "negbin" = {
                       if (!requireNamespace("MASS", quietly = TRUE))
                           stop("Package 'MASS' required for negative binomial models")
                       MASS::glm.nb(formula, data = data, 
                                     model = keep_models, x = FALSE, y = TRUE, ...)
                   },
                   "lm" = stats::lm(formula, data = data, 
                                    model = keep_models, x = FALSE, y = TRUE, ...),
                   "coxph" = {
                       if (!requireNamespace("survival", quietly = TRUE)) 
                           stop("Package 'survival' required for Cox models")
                       survival::coxph(formula, data = data, 
                                       model = keep_models, x = FALSE, y = TRUE, ...)
                   },
                   "clogit" = {
                       if (!requireNamespace("survival", quietly = TRUE))
                           stop("Package 'survival' required for conditional logistic regression")
                       survival::clogit(formula, data = data, ...)
                   },
                   "glmer" = {
                       if (!requireNamespace("lme4", quietly = TRUE))
                           stop("Package 'lme4' required for glmer models")
                       lme4::glmer(formula, data = data, family = family, ...)
                   },
                   "lmer" = {
                       if (!requireNamespace("lme4", quietly = TRUE))
                           stop("Package 'lme4' required for lmer models")
                       lme4::lmer(formula, data = data, ...)
                   },
                   "coxme" = {
                       if (!requireNamespace("coxme", quietly = TRUE))
                           stop("Package 'coxme' required for coxme models")
                       if (!requireNamespace("survival", quietly = TRUE))
                           stop("Package 'survival' required for coxme models")
                       ## coxme uses different formula syntax - random in formula
                       coxme_formula <- stats::as.formula(paste(outcome, "~", pred, "+", random))
                       coxme::coxme(coxme_formula, data = data, ...)
                   },
                   stop("Unsupported model type: ", model_type)
                   )
        ), verbose = verbose)
        
        raw_result <- m2dt(
            data = data,
            model = model,
            conf_level = conf_level,
            keep_qc_stats = FALSE,
            include_intercept = FALSE,
            reference_rows = reference_rows,
            skip_counts = (!show_n && !show_events),
            conf_method = conf_method
        )

        ## Add predictor name for tracking
        raw_result[, predictor := pred]
        
        ## Return both raw result and optionally the model
        list(
            raw = raw_result,
            model = if (keep_models) model else NULL
        )
    }
    
    ## Fit models (parallel or sequential)
    can_use_windows_parallel <- .Platform$OS.type == "windows" && 
        interactive() && 
        !nzchar(Sys.getenv("_R_CHECK_PACKAGE_NAME_", ""))
    
    use_parallel <- parallel && 
        length(predictors) > 1 && 
        (.Platform$OS.type != "windows" || can_use_windows_parallel)
    
    if (use_parallel) {
        ## Determine number of cores
        if (is.null(n_cores)) {
            n_cores <- max(1L, parallel::detectCores() - 1L)
        }
        ## CRAN policy: respect _R_CHECK_LIMIT_CORES_ environment variable
        chk <- tolower(Sys.getenv("_R_CHECK_LIMIT_CORES_", ""))
        if (nzchar(chk) && chk == "true") {
            n_cores <- min(n_cores, 2L)
        }
        
        if (.Platform$OS.type == "windows") {
            ## Windows: use parLapply() with PSOCK cluster
            ## Only reaches here in interactive sessions with package installed
            cl <- parallel::makeCluster(n_cores)
            on.exit(parallel::stopCluster(cl), add = TRUE)
            
            ## Export the fit_one_predictor closure and all required local objects
            parallel::clusterExport(cl, 
                                    varlist = c("fit_one_predictor", "data", "outcome", 
                                                "model_type", "family", "random", 
                                                "conf_level", "reference_rows", 
                                                "keep_models", "show_n", "show_events",
                                                "conf_method"),
                                    envir = environment()
                                    )
            
            ## Export m2dt from summata namespace
            parallel::clusterExport(cl, 
                                    varlist = "m2dt",
                                    envir = asNamespace("summata")
                                    )
            
            ## Load required packages on workers
            parallel::clusterEvalQ(cl, library(data.table))
            
            ## Load survival if needed
            if (model_type %in% c("coxph", "clogit", "coxme")) {
                parallel::clusterEvalQ(cl, library(survival))
            }
            
            ## Load lme4 if needed
            if (model_type %in% c("glmer", "lmer")) {
                parallel::clusterEvalQ(cl, library(lme4))
            }
            
            ## Load coxme if needed
            if (model_type == "coxme") {
                parallel::clusterEvalQ(cl, library(coxme))
            }
            
            results <- parallel::parLapply(cl, predictors, fit_one_predictor)
            
        } else {
            ## Unix/Mac: use mclapply (fork-based, shares memory so no export needed)
            results <- parallel::mclapply(
                                     predictors, 
                                     fit_one_predictor,
                                     mc.cores = n_cores
                                 )
        }
    } else {
        ## Sequential processing with lapply()
        ## Used when: parallel = FALSE, single predictor, or Windows in non-interactive context
        results <- lapply(predictors, fit_one_predictor)
    }
    
    ## Extract raw results and combine
    raw_results <- lapply(results, `[[`, "raw")
    combined_raw <- data.table::rbindlist(raw_results, fill = TRUE)
    
    ## Extract models if kept
    if (keep_models) {
        models <- lapply(results, `[[`, "model")
        names(models) <- predictors
        models <- models[!vapply(models, is.null, logical(1))]
    }
    
    ## Note: p_threshold is used for reporting "Significant" count, not filtering
    
    ## Format results
    validate_number_format(number_format)
    marks <- resolve_number_marks(number_format)
    
    formatted <- format_model_table(
        combined_raw,
        show_n = show_n,
        show_events = show_events,
        digits = digits,
        p_digits = p_digits,
        labels = labels,
        exponentiate = exponentiate,
        conf_level = conf_level,
        marks = marks
    )
    
    ## Attach attributes
    data.table::setattr(formatted, "raw_data", combined_raw)
    
    if (keep_models) {
        data.table::setattr(formatted, "models", models)
    }
    
    data.table::setattr(formatted, "outcome", outcome)
    data.table::setattr(formatted, "model_type", unique(combined_raw$model_type)[1])
    data.table::setattr(formatted, "model_scope", "Univariable")
    data.table::setattr(formatted, "screening_type", "univariable")
    data.table::setattr(formatted, "p_threshold", p_threshold)
    
    ## Extract significant predictor names
    if ("p_value" %in% names(combined_raw)) {
        sig_predictors <- unique(
            combined_raw[!is.na(p_value) & p_value < p_threshold]$predictor
        )
        data.table::setattr(formatted, "significant", sig_predictors)
    }
    
    class(formatted) <- c("uniscreen_result", class(formatted))
    
    return(formatted)
}


#' Print method for uniscreen results
#'
#' @param x Object of class \code{uniscreen_result}.
#' @param ... Additional arguments passed to print methods.
#' @return Invisibly returns the input object \code{x}. Called for its
#'   side effect of printing a formatted summary to the console.
#' @keywords internal
#' @family regression functions
#' @export
print.uniscreen_result <- function(x, ...) {
    cat("\nUnivariable Screening Results\n")
    cat("Outcome: ", attr(x, "outcome"), "\n", sep = "")
    cat("Model Type: ", attr(x, "model_type"), "\n", sep = "")
    
    n_predictors <- length(unique(x$Variable[x$Variable != ""]))
    cat("Predictors Screened: ", n_predictors, "\n", sep = "")
    
    raw <- attr(x, "raw_data")
    p_thresh <- attr(x, "p_threshold")
    if (is.null(p_thresh)) p_thresh <- 0.05
    
    if (!is.null(raw) && "p_value" %in% names(raw)) {
        sig_predictors <- unique(raw[p_value < p_thresh]$predictor)
        n_sig <- length(sig_predictors)
        cat("Significant (p < ", p_thresh, "): ", n_sig, "\n", sep = "")
    }
    
    if (!is.null(attr(x, "models"))) {
        cat("Models stored: Yes (", length(attr(x, "models")), ")\n", sep = "")
    }
    
    cat("\n")
    NextMethod("print", x)
    invisible(x)
}

Try the summata package in your browser

Any scripts or data that you put into this service are public.

summata documentation built on May 7, 2026, 5:07 p.m.