tests/testthat/test_fit.R

#' Test Suite for fit()
#' 
#' Comprehensive tests covering all model types, output formatting,
#' interactions, stratification, clustering, weights, and edge cases.
#' 
#' @details Run with testthat::test_file("tests/testthat/test-fit.R")

library(testthat)
library(data.table)
library(summata)

## Load survival package for Surv() and strata() functions in formula evaluation
if (requireNamespace("survival", quietly = TRUE)) {
    library(survival)
}

## ============================================================================
## Setup: Create test data and helper functions
## ============================================================================

## Use package's built-in clinical trial data
data(clintrial)
data(clintrial_labels)

## Create a complete-case subset for tests requiring no missing data
clintrial_complete <- na.omit(clintrial)

## Create binary outcome for logistic regression tests
## IMPORTANT: Add to BOTH datasets
clintrial$response <- as.integer(clintrial$os_status == 1 & clintrial$os_months < 24)
clintrial_complete$response <- as.integer(clintrial_complete$os_status == 1 & clintrial_complete$os_months < 24)

## Create integer count outcome for Poisson/quasipoisson/negative binomial tests
## IMPORTANT: Add to BOTH datasets
set.seed(123)
clintrial$count_outcome <- as.integer(ceiling(clintrial$los_days))
clintrial_complete$count_outcome <- as.integer(ceiling(clintrial_complete$los_days))

## Create a weights column for weighted regression tests
set.seed(42)
clintrial$weight <- runif(nrow(clintrial), 0.5, 2.0)
clintrial_complete$weight <- runif(nrow(clintrial_complete), 0.5, 2.0)

## Helper function to check fit_result structure
expect_fit_result <- function(result) {
    expect_s3_class(result, "fit_result")
    expect_s3_class(result, "data.table")
    
    ## Required columns
    expect_true("Variable" %in% names(result))
    expect_true("Group" %in% names(result))
    
    ## Required attributes
    expect_true(!is.null(attr(result, "model")))
    expect_true(!is.null(attr(result, "raw_data")))
    expect_true(!is.null(attr(result, "outcome")))
    expect_true(!is.null(attr(result, "predictors")))
    expect_true(!is.null(attr(result, "formula_str")))
    expect_true(!is.null(attr(result, "model_scope")))
    expect_true(!is.null(attr(result, "model_type")))
}

## Helper to check effect column exists and is properly formatted
expect_effect_column <- function(result, effect_type = NULL) {
    col_names <- names(result)
    
    ## Find effect column (contains OR, HR, RR, Coefficient, or Estimate with CI)
    effect_cols <- grep("(OR|HR|RR|Coefficient|Estimate).*CI", col_names, value = TRUE)
    expect_true(length(effect_cols) >= 1, 
                info = paste("Expected effect column, found:", paste(col_names, collapse = ", ")))
    
    if (!is.null(effect_type)) {
        expect_true(any(grepl(effect_type, effect_cols)),
                    info = paste("Expected", effect_type, "in columns, found:", 
                                 paste(effect_cols, collapse = ", ")))
    }
}

## Helper to check p-value column
expect_pvalue_column <- function(result) {
    expect_true("p-value" %in% names(result))
    
    ## Check p-values are properly formatted (numeric string, < 0.001, or -)
    pvals <- result$`p-value`
    valid_pvals <- grepl("^[0-9]\\.[0-9]+$|^< 0\\.001$|^-$", pvals)
    expect_true(all(valid_pvals), 
                info = paste("Invalid p-values found:", 
                             paste(pvals[!valid_pvals], collapse = ", ")))
}

## Helper to verify reference rows
expect_reference_rows <- function(result, vars_with_refs) {
    raw <- attr(result, "raw_data")
    for (v in vars_with_refs) {
        var_rows <- raw[variable == v]
        if (nrow(var_rows) > 1) {
            ## Factor variables should have a reference row
            has_ref <- any(var_rows$reference == "reference", na.rm = TRUE)
            expect_true(has_ref, 
                        info = paste("Expected reference row for factor:", v))
        }
    }
}


## ============================================================================
## SECTION 1: Basic GLM (Logistic Regression) Tests
## ============================================================================

test_that("fit works with basic logistic regression - univariable", {
    
    result <- fit(
        data = clintrial,
        outcome = "response",
        predictors = "age",
        model_type = "glm",
        family = "binomial"
    )
    
    expect_fit_result(result)
    expect_equal(attr(result, "model_scope"), "Univariable")
    expect_effect_column(result, "OR")
    expect_pvalue_column(result)
    
    ## Should have single predictor
    expect_equal(length(attr(result, "predictors")), 1)
    
    ## Effect column should say "OR (95% CI)" for univariable
    col_names <- names(result)
    expect_true(any(grepl("^OR \\(95% CI\\)$", col_names)))
})


test_that("fit works with basic logistic regression - multivariable", {
    
    result <- fit(
        data = clintrial,
        outcome = "response",
        predictors = c("age", "sex", "smoking"),
        model_type = "glm",
        family = "binomial"
    )
    
    expect_fit_result(result)
    expect_equal(attr(result, "model_scope"), "Multivariable")
    expect_effect_column(result, "aOR")
    expect_pvalue_column(result)
    
    ## Should have multiple predictors
    expect_equal(length(attr(result, "predictors")), 3)
    
    ## Effect column should say "aOR (95% CI)" for multivariable
    col_names <- names(result)
    expect_true(any(grepl("^aOR \\(95% CI\\)$", col_names)))
})


test_that("fit handles factor variables correctly in GLM", {
    
    result <- fit(
        data = clintrial,
        outcome = "response",
        predictors = c("age", "stage"),
        model_type = "glm",
        family = "binomial",
        reference_rows = TRUE
    )
    
    expect_fit_result(result)
    
    ## Stage is a factor - should have multiple rows
    raw <- attr(result, "raw_data")
    stage_rows <- raw[variable == "stage"]
    expect_true(nrow(stage_rows) >= 2)  # At least reference + one level
    
    ## Check reference row exists
    has_ref <- any(stage_rows$reference == "reference", na.rm = TRUE)
    expect_true(has_ref)
})


test_that("fit reference_rows = FALSE excludes reference rows", {
    
    result_with_ref <- fit(
        data = clintrial,
        outcome = "response",
        predictors = c("age", "stage"),
        model_type = "glm",
        reference_rows = TRUE
    )
    
    result_no_ref <- fit(
        data = clintrial,
        outcome = "response",
        predictors = c("age", "stage"),
        model_type = "glm",
        reference_rows = FALSE
    )
    
    ## Without reference rows should have fewer rows
    expect_true(nrow(result_no_ref) < nrow(result_with_ref))
    
    ## Check no reference markers in raw data
    raw_no_ref <- attr(result_no_ref, "raw_data")
    if ("reference" %in% names(raw_no_ref)) {
        has_refs <- sum(raw_no_ref$reference == "reference", na.rm = TRUE)
        expect_equal(has_refs, 0)
    }
})


test_that("fit shows n and events columns correctly for GLM", {
    
    ## With n and events
    result_full <- fit(
        data = clintrial,
        outcome = "response",
        predictors = c("age", "sex"),
        model_type = "glm",
        show_n = TRUE,
        show_events = TRUE
    )
    expect_true("n" %in% names(result_full))
    expect_true("Events" %in% names(result_full))
    
    ## Without n
    result_no_n <- fit(
        data = clintrial,
        outcome = "response",
        predictors = c("age", "sex"),
        model_type = "glm",
        show_n = FALSE,
        show_events = TRUE
    )
    expect_false("n" %in% names(result_no_n))
    expect_true("Events" %in% names(result_no_n))
    
    ## Without events
    result_no_events <- fit(
        data = clintrial,
        outcome = "response",
        predictors = c("age", "sex"),
        model_type = "glm",
        show_n = TRUE,
        show_events = FALSE
    )
    expect_true("n" %in% names(result_no_events))
    expect_false("Events" %in% names(result_no_events))
})


test_that("fit respects digits parameter for GLM", {
    
    result_2dig <- fit(
        data = clintrial,
        outcome = "response",
        predictors = "age",
        model_type = "glm",
        digits = 2
    )
    
    result_3dig <- fit(
        data = clintrial,
        outcome = "response",
        predictors = "age",
        model_type = "glm",
        digits = 3
    )
    
    ## Extract effect columns
    effect_col_2 <- grep("OR.*CI", names(result_2dig), value = TRUE)[1]
    effect_col_3 <- grep("OR.*CI", names(result_3dig), value = TRUE)[1]
    
    ## 2-digit result should have pattern like "1.02 (0.99-1.05)"
    ## 3-digit result should have pattern like "1.023 (0.993-1.053)"
    val_2 <- result_2dig[[effect_col_2]][1]
    val_3 <- result_3dig[[effect_col_3]][1]
    
    ## Count decimal places in the first number
    first_num_2 <- sub(" .*", "", val_2)
    first_num_3 <- sub(" .*", "", val_3)
    
    decimals_2 <- nchar(sub(".*\\.", "", first_num_2))
    decimals_3 <- nchar(sub(".*\\.", "", first_num_3))
    
    expect_equal(decimals_2, 2)
    expect_equal(decimals_3, 3)
})


test_that("fit respects p_digits parameter", {
    
    result <- fit(
        data = clintrial,
        outcome = "response",
        predictors = c("age", "sex"),
        model_type = "glm",
        p_digits = 4
    )
    
    ## P-values should have up to 4 decimal places
    pvals <- result$`p-value`
    numeric_pvals <- pvals[grepl("^0\\.", pvals)]
    
    if (length(numeric_pvals) > 0) {
        ## Check that numeric p-values have 4 decimal places
        decimals <- sapply(numeric_pvals, function(p) {
            nchar(sub("0\\.", "", p))
        })
        expect_true(all(decimals == 4))
    }
})


## ============================================================================
## SECTION 2: Linear Model Tests
## ============================================================================

test_that("fit works with linear regression - univariable", {
    
    result <- fit(
        data = clintrial,
        outcome = "los_days",
        predictors = "age",
        model_type = "lm"
    )
    
    expect_fit_result(result)
    expect_equal(attr(result, "model_scope"), "Univariable")
    expect_equal(attr(result, "model_type"), "Linear")
    
    ## Linear models use Coefficient, not OR
    expect_effect_column(result, "Coefficient")
    
    ## Should NOT have Events column
    expect_false("Events" %in% names(result))
})


test_that("fit works with linear regression - multivariable", {
    
    result <- fit(
        data = clintrial,
        outcome = "los_days",
        predictors = c("age", "sex", "stage"),
        model_type = "lm"
    )
    
    expect_fit_result(result)
    expect_equal(attr(result, "model_scope"), "Multivariable")
    expect_effect_column(result, "Coefficient")
})


test_that("fit linear model show_events is ignored", {
    
    ## Even if show_events = TRUE, linear models shouldn't show events
    result <- fit(
        data = clintrial,
        outcome = "los_days",
        predictors = c("age", "sex"),
        model_type = "lm",
        show_events = TRUE
    )
    
    expect_false("Events" %in% names(result))
})


test_that("fit linear model QC stats are captured", {
    
    result <- fit(
        data = clintrial,
        outcome = "los_days",
        predictors = c("age", "sex", "stage"),
        model_type = "lm",
        keep_qc_stats = TRUE
    )
    
    raw <- attr(result, "raw_data")
    
    ## Linear models should have R-squared and other stats
    expect_true("AIC" %in% names(raw))
    expect_true("BIC" %in% names(raw))
})


## ============================================================================
## SECTION 3: Cox Proportional Hazards Tests
## ============================================================================

test_that("fit works with Cox regression - univariable", {
    
    skip_if_not_installed("survival")
    
    result <- fit(
        data = clintrial,
        outcome = "Surv(os_months, os_status)",
        predictors = "age",
        model_type = "coxph"
    )
    
    expect_fit_result(result)
    expect_equal(attr(result, "model_scope"), "Univariable")
    expect_effect_column(result, "HR")
    
    ## Should have Events column
    expect_true("Events" %in% names(result))
})


test_that("fit works with Cox regression - multivariable", {
    
    skip_if_not_installed("survival")
    
    result <- fit(
        data = clintrial,
        outcome = "Surv(os_months, os_status)",
        predictors = c("age", "sex", "stage", "treatment"),
        model_type = "coxph"
    )
    
    expect_fit_result(result)
    expect_equal(attr(result, "model_scope"), "Multivariable")
    
    ## Multivariable Cox should show aHR
    col_names <- names(result)
    expect_true(any(grepl("aHR", col_names)))
})


test_that("fit Cox model with stratification", {
    
    skip_if_not_installed("survival")
    
    result <- fit(
        data = clintrial,
        outcome = "Surv(os_months, os_status)",
        predictors = c("age", "treatment"),
        model_type = "coxph",
        strata = "sex"
    )
    
    expect_fit_result(result)
    
    ## Check strata is stored in attributes
    expect_equal(attr(result, "strata"), "sex")
    
    ## Formula should include strata()
    formula_str <- attr(result, "formula_str")
    expect_true(grepl("strata\\(.*sex.*\\)", formula_str))
    
    ## Sex should NOT appear as a predictor in the output
    ## (it's used for stratification, not estimation)
    raw <- attr(result, "raw_data")
    expect_false("sex" %in% raw$variable)
})


test_that("fit Cox model with clustering", {
    
    skip_if_not_installed("survival")
    
    result <- fit(
        data = clintrial,
        outcome = "Surv(os_months, os_status)",
        predictors = c("age", "sex", "treatment"),
        model_type = "coxph",
        cluster = "site"
    )
    
    expect_fit_result(result)
    
    ## Check cluster is stored in attributes
    expect_equal(attr(result, "cluster"), "site")
    
    ## Model should have robust standard errors
    model <- attr(result, "model")
    expect_true(!is.null(model$naive.var) || !is.null(attr(model, "cluster")))
})


test_that("fit Cox model with both strata and cluster", {
    
    skip_if_not_installed("survival")
    
    ## Create a second grouping variable for stratification
    clintrial_test <- data.table::as.data.table(clintrial)
    clintrial_test[, region := sample(c("North", "South"), .N, replace = TRUE)]
    
    result <- fit(
        data = clintrial_test,
        outcome = "Surv(os_months, os_status)",
        predictors = c("age", "treatment"),
        model_type = "coxph",
        strata = "region",
        cluster = "site"
    )
    
    expect_fit_result(result)
    expect_equal(attr(result, "strata"), "region")
    expect_equal(attr(result, "cluster"), "site")
})


## ============================================================================
## SECTION 4: Poisson Regression Tests
## ============================================================================

test_that("fit works with Poisson regression", {
    
    ## Create count outcome
    clintrial_test <- data.table::as.data.table(clintrial)
    clintrial_test[, complications := rpois(.N, lambda = 2)]
    
    result <- fit(
        data = clintrial_test,
        outcome = "complications",
        predictors = c("age", "sex", "stage"),
        model_type = "glm",
        family = "poisson"
    )
    
    expect_fit_result(result)
    
    ## Poisson models should show RR (rate ratio)
    expect_effect_column(result, "RR")
})


## ============================================================================
## SECTION 5: Interaction Terms Tests
## ============================================================================

test_that("fit handles basic interaction terms", {
    
    result <- fit(
        data = clintrial,
        outcome = "response",
        predictors = c("age", "sex"),
        interactions = c("age:sex"),
        model_type = "glm"
    )
    
    expect_fit_result(result)
    
    ## Check interactions stored in attributes
    expect_equal(attr(result, "interactions"), c("age:sex"))
    
    ## Formula should include interaction
    formula_str <- attr(result, "formula_str")
    expect_true(grepl("age:sex", formula_str))
    
    ## Raw data should have interaction term rows
    raw <- attr(result, "raw_data")
    interaction_rows <- raw[grepl(":", variable)]
    expect_true(nrow(interaction_rows) >= 1)
})


test_that("fit handles multiple interaction terms", {
    
    result <- fit(
        data = clintrial,
        outcome = "response",
        predictors = c("age", "sex", "treatment"),
        interactions = c("age:sex", "age:treatment"),
        model_type = "glm"
    )
    
    expect_fit_result(result)
    
    formula_str <- attr(result, "formula_str")
    expect_true(grepl("age:sex", formula_str))
    expect_true(grepl("age:treatment", formula_str))
})

## ============================================================================
## SECTION 6: Weighted Regression Tests
## ============================================================================

test_that("fit handles weights in GLM", {
    
    ## Suppress expected "non-integer #successes" warning from weighted binomial
    result <- suppressWarnings(fit(
        data = clintrial,
        outcome = "response",
        predictors = c("age", "sex"),
        model_type = "glm",
        weights = "weight"
    ))
    
    expect_fit_result(result)
    expect_equal(attr(result, "weights"), "weight")
    
    ## Model should have weights
    model <- attr(result, "model")
    expect_true(!is.null(model$prior.weights))
})


test_that("fit handles weights in linear model", {
    
    result <- fit(
        data = clintrial,
        outcome = "los_days",
        predictors = c("age", "sex"),
        model_type = "lm",
        weights = "weight"
    )
    
    expect_fit_result(result)
    expect_equal(attr(result, "weights"), "weight")
})


test_that("weighted vs unweighted models produce different results", {
    
    result_unweighted <- fit(
        data = clintrial,
        outcome = "response",
        predictors = c("age", "sex"),
        model_type = "glm"
    )
    
    ## Suppress expected "non-integer #successes" warning from weighted binomial
    result_weighted <- suppressWarnings(fit(
        data = clintrial,
        outcome = "response",
        predictors = c("age", "sex"),
        model_type = "glm",
        weights = "weight"
    ))
    
    ## Get coefficients from models
    coef_unweighted <- coef(attr(result_unweighted, "model"))
    coef_weighted <- coef(attr(result_weighted, "model"))
    
    ## Coefficients should be different (not identical)
    expect_false(all(abs(coef_unweighted - coef_weighted) < 1e-10))
})


## ============================================================================
## SECTION 7: Custom Labels Tests
## ============================================================================

test_that("fit applies custom labels to variables", {
    
    custom_labels <- c(
        age = "Age (years)",
        sex = "Sex",
        stage = "Cancer Stage"
    )
    
    result <- fit(
        data = clintrial,
        outcome = "response",
        predictors = c("age", "sex", "stage"),
        model_type = "glm",
        labels = custom_labels
    )
    
    ## Check that labels are applied
    ## Variable column should contain labeled names
    variables <- unique(result$Variable[result$Variable != ""])
    
    expect_true("Age (years)" %in% variables || any(grepl("Age", variables)))
    expect_true("Sex" %in% variables || any(grepl("Sex", variables)))
    expect_true("Cancer Stage" %in% variables || any(grepl("Stage", variables)))
})


test_that("fit applies labels to interaction terms", {
    
    custom_labels <- c(
        age = "Age",
        sex = "Sex"
    )
    
    result <- fit(
        data = clintrial,
        outcome = "response",
        predictors = c("age", "sex"),
        interactions = c("age:sex"),
        model_type = "glm",
        labels = custom_labels
    )
    
    ## Interaction should show labeled variables with " * " separator
    variables <- result$Variable
    
    ## Should contain interaction with labeled names
    has_labeled_interaction <- any(grepl("Age.*Sex|Sex.*Age", variables))
    expect_true(has_labeled_interaction)
})


test_that("fit labels work with clintrial_labels dataset", {
    
    result <- fit(
        data = clintrial,
        outcome = "response",
        predictors = c("age", "sex", "stage", "treatment"),
        model_type = "glm",
        labels = clintrial_labels
    )
    
    expect_fit_result(result)
    
    ## Labels should be applied
    variables <- result$Variable[result$Variable != ""]
    
    ## Check some expected labels exist
    expect_true(length(variables) > 0)
})


## ============================================================================
## SECTION 8: Confidence Level Tests
## ============================================================================

test_that("fit respects conf_level parameter", {
    
    result_95 <- fit(
        data = clintrial,
        outcome = "response",
        predictors = "age",
        model_type = "glm",
        conf_level = 0.95
    )
    
    result_90 <- fit(
        data = clintrial,
        outcome = "response",
        predictors = "age",
        model_type = "glm",
        conf_level = 0.90
    )
    
    ## Get raw data to compare CI widths
    raw_95 <- attr(result_95, "raw_data")
    raw_90 <- attr(result_90, "raw_data")
    
    ## 90% CI should be narrower than 95% CI
    ci_width_95 <- raw_95$exp_upper[1] - raw_95$exp_lower[1]
    ci_width_90 <- raw_90$exp_upper[1] - raw_90$exp_lower[1]
    
    expect_true(ci_width_90 < ci_width_95)
})


test_that("fit works with 99% confidence interval", {
    
    result <- fit(
        data = clintrial,
        outcome = "response",
        predictors = c("age", "sex"),
        model_type = "glm",
        conf_level = 0.99
    )
    
    expect_fit_result(result)
    
    ## 99% CI column should be named appropriately or at least exist
    col_names <- names(result)
    expect_true(any(grepl("CI", col_names)))
})


## ============================================================================
## SECTION 9: Exponentiate Parameter Tests
## ============================================================================

test_that("fit auto-exponentiates for logistic regression", {
    
    result <- fit(
        data = clintrial,
        outcome = "response",
        predictors = "age",
        model_type = "glm",
        family = "binomial"
    )
    
    ## Should show OR, not raw coefficients
    col_names <- names(result)
    expect_true(any(grepl("OR", col_names)))
})


test_that("fit exponentiate = FALSE shows raw coefficients", {
    
    result <- fit(
        data = clintrial,
        outcome = "response",
        predictors = "age",
        model_type = "glm",
        family = "binomial",
        exponentiate = FALSE
    )
    
    ## Should show Coefficient, not OR
    col_names <- names(result)
    expect_true(any(grepl("Coefficient", col_names)))
    expect_false(any(grepl("\\bOR\\b", col_names)))
})


test_that("fit exponentiate = TRUE forces OR even for linear model", {
    
    result <- fit(
        data = clintrial,
        outcome = "los_days",
        predictors = "age",
        model_type = "lm",
        exponentiate = TRUE
    )
    
    ## Should show exponentiated values
    raw <- attr(result, "raw_data")
    
    ## exp_coef should exist in raw data
    expect_true("exp_coef" %in% names(raw))
})


## ============================================================================
## SECTION 10: Mixed Effects Models Tests
## ============================================================================

test_that("fit works with lmer (linear mixed effects)", {
    
    skip_if_not_installed("lme4")
    skip_on_cran()  # lme4 can segfault in test environments
    
    gc()  # Force garbage collection before lme4
    
    ## Use smaller dataset for stability
    test_data <- clintrial_complete[1:200, ]
    
    result <- fit(
        data = test_data,
        outcome = "los_days",
        predictors = c("age", "sex", "(1|site)"),
        model_type = "lmer"
    )
    
    expect_fit_result(result)
    expect_equal(attr(result, "model_type"), "Linear Mixed")
    
    ## Model should be lmerMod
    model <- attr(result, "model")
    expect_true(inherits(model, "lmerMod"))
})


test_that("fit works with glmer (generalized linear mixed effects)", {
    
    skip_if_not_installed("lme4")
    skip_on_cran()
    
    gc()
    
    test_data <- clintrial_complete[1:200, ]
    
    result <- fit(
        data = test_data,
        outcome = "response",
        predictors = c("age", "sex", "(1|site)"),
        model_type = "glmer",
        family = "binomial"
    )
    
    expect_fit_result(result)
    
    ## Model should be glmerMod
    model <- attr(result, "model")
    expect_true(inherits(model, "glmerMod"))
})


test_that("fit works with coxme (mixed effects Cox)", {
    
    skip_if_not_installed("coxme")
    skip_if_not_installed("survival")
    skip_on_cran()
    
    gc()
    
    test_data <- clintrial_complete[1:200, ]
    
    result <- fit(
        data = test_data,
        outcome = "Surv(os_months, os_status)",
        predictors = c("age", "sex", "(1|site)"),
        model_type = "coxme"
    )
    
    expect_fit_result(result)
    
    ## Model should be coxme
    model <- attr(result, "model")
    expect_true(inherits(model, "coxme"))
})


## ============================================================================
## SECTION 11: Conditional Logistic Regression Tests
## ============================================================================

test_that("fit works with conditional logistic regression", {
    
    skip_if_not_installed("survival")
    
    ## Create matched case-control data
    set.seed(123)
    n_strata <- 50
    matched_data <- data.table(
        stratum = rep(1:n_strata, each = 3),
        case = rep(c(1, 0, 0), n_strata),
        age = rnorm(n_strata * 3, 60, 10),
        exposure = rbinom(n_strata * 3, 1, 0.3)
    )
    
    result <- fit(
        data = matched_data,
        outcome = "case",
        predictors = c("age", "exposure"),
        model_type = "clogit",
        strata = "stratum"
    )
    
    expect_fit_result(result)
    expect_equal(attr(result, "strata"), "stratum")
    
    ## Conditional logistic shows effect estimates (could be OR or HR depending on implementation)
    expect_effect_column(result)
})


## ============================================================================
## SECTION 12: QC Stats and Raw Data Tests
## ============================================================================

test_that("fit keeps QC stats when requested", {
    
    result <- fit(
        data = clintrial,
        outcome = "response",
        predictors = c("age", "sex", "stage"),
        model_type = "glm",
        keep_qc_stats = TRUE
    )
    
    raw <- attr(result, "raw_data")
    
    ## Should have QC statistics
    expect_true("AIC" %in% names(raw))
    expect_true("BIC" %in% names(raw))
    expect_true("n" %in% names(raw))
    expect_true("events" %in% names(raw))
})


test_that("fit raw_data has correct structure", {
    
    result <- fit(
        data = clintrial,
        outcome = "response",
        predictors = c("age", "sex"),
        model_type = "glm"
    )
    
    raw <- attr(result, "raw_data")
    
    ## Check required columns in raw data
    expect_true("variable" %in% names(raw))
    expect_true("coef" %in% names(raw))
    expect_true("se" %in% names(raw))
    expect_true("p_value" %in% names(raw))
    expect_true("exp_coef" %in% names(raw))  # For GLM
})


test_that("fit model object is accessible and valid", {
    
    result <- fit(
        data = clintrial,
        outcome = "response",
        predictors = c("age", "sex"),
        model_type = "glm"
    )
    
    model <- attr(result, "model")
    
    ## Model should be a valid glm object
    expect_s3_class(model, "glm")
    
    ## Should be able to extract coefficients
    expect_true(length(coef(model)) > 0)
    
    ## Should be able to get predictions
    expect_true(length(predict(model)) > 0)
})


## ============================================================================
## SECTION 13: Edge Cases and Error Handling
## ============================================================================

test_that("fit handles missing data appropriately", {
    
    ## clintrial has some missing values
    result <- fit(
        data = clintrial,
        outcome = "response",
        predictors = c("age", "sex", "smoking"),  # smoking may have NAs
        model_type = "glm"
    )
    
    expect_fit_result(result)
    
    ## N should reflect complete cases
    raw <- attr(result, "raw_data")
    expect_true(raw$n[1] > 0)
    expect_true(raw$n[1] <= nrow(clintrial))
})


test_that("fit works with single-level factors appropriately", {
    
    ## Create data with a near-single-level factor
    test_data <- data.table::as.data.table(clintrial)
    test_data[, constant_var := factor("A")]
    
    ## This should either work or give an informative error
    ## depending on how the model handles it
    result_or_error <- tryCatch({
        fit(
            data = test_data,
            outcome = "response",
            predictors = c("age", "constant_var"),
            model_type = "glm"
        )
    }, error = function(e) e)
    
    ## Either succeeds or gives an error (both are acceptable)
    expect_true(inherits(result_or_error, "fit_result") || 
                inherits(result_or_error, "error"))
})


test_that("fit errors on invalid model_type", {
    
    expect_error(
        fit(
            data = clintrial,
            outcome = "response",
            predictors = "age",
            model_type = "invalid_model_type"
        ),
        regexp = "[Uu]nsupported model type"
    )
})


test_that("fit errors when required package missing for coxph", {
    
    ## This test verifies the error message when survival isn't loaded
    ## but since it's typically available, we just verify the check exists
    
    result <- fit(
        data = clintrial,
        outcome = "Surv(os_months, os_status)",
        predictors = "age",
        model_type = "coxph"
    )
    
    expect_fit_result(result)
})


test_that("fit handles empty predictors appropriately", {
    
    ## Empty predictors should cause an error
    expect_error(
        fit(
            data = clintrial,
            outcome = "response",
            predictors = character(0),
            model_type = "glm"
        )
    )
})


## ============================================================================
## SECTION 13B: Input Validation Tests
## ============================================================================

test_that("fit auto-corrects Surv outcome with GLM model_type", {
    
    ## Should auto-switch to coxph and emit a message
    expect_message(
        result <- fit(
            data = clintrial,
            outcome = "Surv(os_months, os_status)",
            predictors = c("age", "sex"),
            model_type = "glm"
        ),
        regexp = "[Ss]witch|[Ss]urvival"
    )
    
    expect_fit_result(result)
    ## Model type attribute stores display name ("Cox PH"), not input parameter
    expect_true(grepl("[Cc]ox", attr(result, "model_type")))
})


test_that("fit auto-corrects Surv outcome with lm model_type", {
    
    ## Should auto-switch to coxph and emit a message
    expect_message(
        result <- fit(
            data = clintrial,
            outcome = "Surv(os_months, os_status)",
            predictors = c("age", "sex"),
            model_type = "lm"
        ),
        regexp = "[Ss]witch|[Ss]urvival"
    )
    
    expect_fit_result(result)
    expect_true(grepl("[Cc]ox", attr(result, "model_type")))
})


test_that("fit errors when coxph used without Surv outcome", {
    
    expect_error(
        fit(
            data = clintrial,
            outcome = "response",
            predictors = c("age", "sex"),
            model_type = "coxph"
        ),
        regexp = "[Ss]urv|[Ss]urvival"
    )
})


test_that("fit errors when coxme used without Surv outcome", {
    
    skip_if_not_installed("coxme")
    
    expect_error(
        fit(
            data = clintrial,
            outcome = "response",
            predictors = c("age", "sex", "(1|site)"),
            model_type = "coxme"
        ),
        regexp = "[Ss]urv|[Ss]urvival"
    )
})


test_that("fit errors with continuous outcome and binomial family", {
    
    expect_error(
        fit(
            data = clintrial,
            outcome = "los_days",
            predictors = c("age", "sex"),
            model_type = "glm",
            family = "binomial"
        ),
        regexp = "[Cc]ontinuous|binomial"
    )
})


test_that("fit warns with binary outcome and gaussian family", {
    
    expect_warning(
        result <- fit(
            data = clintrial,
            outcome = "response",
            predictors = c("age", "sex"),
            model_type = "glm",
            family = "gaussian"
        ),
        regexp = "[Bb]inary|binomial|gaussian"
    )
    
    expect_fit_result(result)
})


test_that("fit warns with binary outcome and lm model_type", {
    
    expect_warning(
        result <- fit(
            data = clintrial,
            outcome = "response",
            predictors = c("age", "sex"),
            model_type = "lm"
        ),
        regexp = "[Bb]inary|logistic|binomial"
    )
    
    expect_fit_result(result)
})


test_that("fit errors when outcome variable not found", {
    
    expect_error(
        fit(
            data = clintrial,
            outcome = "nonexistent_variable",
            predictors = c("age", "sex"),
            model_type = "glm"
        ),
        regexp = "not found|[Oo]utcome"
    )
})


test_that("fit errors when Surv variables not found", {
    
    expect_error(
        fit(
            data = clintrial,
            outcome = "Surv(nonexistent_time, nonexistent_status)",
            predictors = c("age", "sex"),
            model_type = "coxph"
        ),
        regexp = "not found|[Ss]urvival"
    )
})


test_that("fit errors when predictor variable not found", {
    
    expect_error(
        fit(
            data = clintrial,
            outcome = "response",
            predictors = c("age", "nonexistent_predictor"),
            model_type = "glm"
        ),
        regexp = "not found|[Pp]redictor"
    )
})


test_that("fit errors with invalid conf_level", {
    
    expect_error(
        fit(
            data = clintrial,
            outcome = "response",
            predictors = c("age", "sex"),
            model_type = "glm",
            conf_level = 1.5
        ),
        regexp = "conf_level|between 0 and 1"
    )
    
    expect_error(
        fit(
            data = clintrial,
            outcome = "response",
            predictors = c("age", "sex"),
            model_type = "glm",
            conf_level = 0
        ),
        regexp = "conf_level|between 0 and 1"
    )
})


test_that("fit errors with invalid digits parameter", {
    
    expect_error(
        fit(
            data = clintrial,
            outcome = "response",
            predictors = c("age", "sex"),
            model_type = "glm",
            digits = -1
        ),
        regexp = "digits|non-negative"
    )
})


test_that("fit errors with empty data", {
    
    empty_data <- clintrial[0, ]
    
    expect_error(
        fit(
            data = empty_data,
            outcome = "response",
            predictors = c("age", "sex"),
            model_type = "glm"
        ),
        regexp = "empty|no rows|non-empty"
    )
})


test_that("fit handles very long predictor lists", {
    
    ## Test with many predictors
    many_predictors <- c("age", "sex", "smoking", "diabetes", "hypertension",
                         "stage", "grade", "ecog", "treatment", "surgery")
    
    result <- fit(
        data = clintrial,
        outcome = "response",
        predictors = many_predictors,
        model_type = "glm"
    )
    
    expect_fit_result(result)
    expect_equal(length(attr(result, "predictors")), length(many_predictors))
})


## ============================================================================
## SECTION 14: Print Method Tests
## ============================================================================

test_that("print.fit_result produces output", {
    
    result <- fit(
        data = clintrial,
        outcome = "response",
        predictors = c("age", "sex"),
        model_type = "glm"
    )
    
    ## Capture print output
    output <- capture.output(print(result))
    
    ## Should contain model information
    expect_true(any(grepl("Multivariable|Univariable", output)))
    expect_true(any(grepl("Formula", output)))
    expect_true(any(grepl("n =", output)))
})


test_that("print.fit_result shows events for survival models", {
    
    skip_if_not_installed("survival")
    
    result <- fit(
        data = clintrial,
        outcome = "Surv(os_months, os_status)",
        predictors = c("age", "sex"),
        model_type = "coxph"
    )
    
    output <- capture.output(print(result))
    
    ## Should show Events
    expect_true(any(grepl("Events", output)))
})


## ============================================================================
## SECTION 15: Integration and Consistency Tests
## ============================================================================

test_that("fit produces consistent results across runs", {
    
    result1 <- fit(
        data = clintrial,
        outcome = "response",
        predictors = c("age", "sex"),
        model_type = "glm"
    )
    
    result2 <- fit(
        data = clintrial,
        outcome = "response",
        predictors = c("age", "sex"),
        model_type = "glm"
    )
    
    ## Results should be identical
    expect_equal(attr(result1, "raw_data")$coef, attr(result2, "raw_data")$coef)
    expect_equal(attr(result1, "raw_data")$p_value, attr(result2, "raw_data")$p_value)
})


test_that("fit model coefficients match direct model fitting", {
    
    ## Fit using fit()
    result <- fit(
        data = clintrial,
        outcome = "response",
        predictors = c("age", "sex"),
        model_type = "glm",
        family = "binomial"
    )
    
    ## Fit directly
    direct_model <- glm(response ~ age + sex, data = clintrial, family = binomial)
    
    ## Coefficients should match
    fit_coefs <- coef(attr(result, "model"))
    direct_coefs <- coef(direct_model)
    
    expect_equal(fit_coefs, direct_coefs, tolerance = 1e-10)
})


test_that("fit works with data.frame input (not just data.table)", {
    
    df <- as.data.frame(clintrial)
    
    result <- fit(
        data = df,
        outcome = "response",
        predictors = c("age", "sex"),
        model_type = "glm"
    )
    
    expect_fit_result(result)
})


test_that("fit preserves factor level ordering", {
    
    ## Create data with ordered factor
    test_data <- data.table::as.data.table(clintrial)
    test_data[, stage_ordered := factor(stage, levels = c("I", "II", "III", "IV"), 
                                        ordered = TRUE)]
    
    result <- fit(
        data = test_data,
        outcome = "response",
        predictors = c("age", "stage_ordered"),
        model_type = "glm"
    )
    
    expect_fit_result(result)
    
    ## Check that stage levels appear in correct order in output
    raw <- attr(result, "raw_data")
    stage_rows <- raw[variable == "stage_ordered"]
    
    if (nrow(stage_rows) > 1) {
        ## Groups should be in factor order
        groups <- stage_rows$group[!is.na(stage_rows$group) & stage_rows$group != ""]
        
        ## Reference should be first level (I), so we should see II, III, IV
        expect_true(length(groups) >= 1)
    }
})


## ============================================================================
## SECTION 16: Different GLM Families Tests
## ============================================================================

## ----------------------------------------------------------------------------
## 16a: Gaussian Family Tests
## ----------------------------------------------------------------------------

test_that("fit works with gaussian family", {
    
    result <- fit(
        data = clintrial,
        outcome = "los_days",
        predictors = c("age", "sex"),
        model_type = "glm",
        family = "gaussian"
    )
    
    expect_fit_result(result)
    
    ## Should produce coefficients (not exponentiated)
    col_names <- names(result)
    expect_true(any(grepl("Coefficient.*CI|Estimate.*CI", col_names)),
                info = paste("Expected Coefficient column for gaussian, found:", 
                             paste(col_names, collapse = ", ")))
})


test_that("fit gaussian coefficients match direct glm", {
    
    result <- fit(
        data = clintrial,
        outcome = "los_days",
        predictors = c("age", "sex"),
        model_type = "glm",
        family = "gaussian"
    )
    
    ## Fit directly
    direct_model <- glm(los_days ~ age + sex, data = clintrial, family = gaussian)
    
    ## Coefficients should match
    fit_coefs <- coef(attr(result, "model"))
    direct_coefs <- coef(direct_model)
    
    expect_equal(fit_coefs, direct_coefs, tolerance = 1e-10)
})


test_that("fit gaussian matches lm results", {
    
    result_glm <- fit(
        data = clintrial,
        outcome = "los_days",
        predictors = c("age", "sex"),
        model_type = "glm",
        family = "gaussian"
    )
    
    result_lm <- fit(
        data = clintrial,
        outcome = "los_days",
        predictors = c("age", "sex"),
        model_type = "lm"
    )
    
    ## Coefficients should be essentially identical
    coefs_glm <- coef(attr(result_glm, "model"))
    coefs_lm <- coef(attr(result_lm, "model"))
    
    expect_equal(coefs_glm, coefs_lm, tolerance = 1e-10)
})


## ----------------------------------------------------------------------------
## 16b: Quasibinomial Family Tests (overdispersed binary)
## ----------------------------------------------------------------------------

test_that("fit works with quasibinomial family", {
    
    result <- fit(
        data = clintrial,
        outcome = "response",
        predictors = c("age", "sex", "treatment"),
        model_type = "glm",
        family = "quasibinomial"
    )
    
    expect_fit_result(result)
    
    ## Should produce odds ratios like binomial
    col_names <- names(result)
    expect_true(any(grepl("OR.*CI", col_names)),
                info = paste("Expected OR column for quasibinomial, found:", 
                             paste(col_names, collapse = ", ")))
})


test_that("fit quasibinomial coefficients match direct glm", {
    
    result <- fit(
        data = clintrial,
        outcome = "response",
        predictors = c("age", "sex"),
        model_type = "glm",
        family = "quasibinomial"
    )
    
    ## Fit directly
    direct_model <- glm(response ~ age + sex, data = clintrial, family = quasibinomial)
    
    ## Coefficients should match
    fit_coefs <- coef(attr(result, "model"))
    direct_coefs <- coef(direct_model)
    
    expect_equal(fit_coefs, direct_coefs, tolerance = 1e-10)
})


test_that("fit quasibinomial point estimates match binomial", {
    
    result_quasi <- fit(
        data = clintrial,
        outcome = "response",
        predictors = c("age", "sex"),
        model_type = "glm",
        family = "quasibinomial"
    )
    
    result_binom <- fit(
        data = clintrial,
        outcome = "response",
        predictors = c("age", "sex"),
        model_type = "glm",
        family = "binomial"
    )
    
    ## Point estimates should be identical (SEs differ due to dispersion)
    coefs_quasi <- coef(attr(result_quasi, "model"))
    coefs_binom <- coef(attr(result_binom, "model"))
    
    expect_equal(coefs_quasi, coefs_binom, tolerance = 1e-10)
})


test_that("fit quasibinomial handles overdispersion", {
    
    ## Quasibinomial allows dispersion != 1
    result <- fit(
        data = clintrial,
        outcome = "response",
        predictors = c("age", "sex", "treatment", "stage"),
        model_type = "glm",
        family = "quasibinomial"
    )
    
    model <- attr(result, "model")
    
    ## Dispersion parameter should be estimated
    summ <- summary(model)
    expect_true(!is.null(summ$dispersion))
})


## ----------------------------------------------------------------------------
## 16c: Quasipoisson Family Tests (overdispersed counts)
## ----------------------------------------------------------------------------

test_that("fit works with quasipoisson family", {
    
    result <- fit(
        data = clintrial,
        outcome = "count_outcome",
        predictors = c("age", "sex", "treatment"),
        model_type = "glm",
        family = "quasipoisson"
    )
    
    expect_fit_result(result)
    
    ## Should produce rate ratios like poisson
    col_names <- names(result)
    expect_true(any(grepl("RR.*CI", col_names)),
                info = paste("Expected RR column for quasipoisson, found:", 
                             paste(col_names, collapse = ", ")))
})


test_that("fit quasipoisson coefficients match direct glm", {
    
    result <- fit(
        data = clintrial,
        outcome = "count_outcome",
        predictors = c("age", "sex"),
        model_type = "glm",
        family = "quasipoisson"
    )
    
    ## Fit directly
    direct_model <- glm(count_outcome ~ age + sex, data = clintrial, family = quasipoisson)
    
    ## Coefficients should match
    fit_coefs <- coef(attr(result, "model"))
    direct_coefs <- coef(direct_model)
    
    expect_equal(fit_coefs, direct_coefs, tolerance = 1e-10)
})


test_that("fit quasipoisson point estimates match poisson", {
    
    result_quasi <- fit(
        data = clintrial,
        outcome = "count_outcome",
        predictors = c("age", "sex"),
        model_type = "glm",
        family = "quasipoisson"
    )
    
    result_pois <- fit(
        data = clintrial,
        outcome = "count_outcome",
        predictors = c("age", "sex"),
        model_type = "glm",
        family = "poisson"
    )
    
    ## Point estimates should be identical (SEs differ due to dispersion)
    coefs_quasi <- coef(attr(result_quasi, "model"))
    coefs_pois <- coef(attr(result_pois, "model"))
    
    expect_equal(coefs_quasi, coefs_pois, tolerance = 1e-10)
})


test_that("fit quasipoisson handles overdispersion", {
    
    result <- fit(
        data = clintrial,
        outcome = "count_outcome",
        predictors = c("age", "sex", "treatment", "stage"),
        model_type = "glm",
        family = "quasipoisson"
    )
    
    model <- attr(result, "model")
    
    ## Dispersion parameter should be estimated
    summ <- summary(model)
    expect_true(!is.null(summ$dispersion))
    expect_true(summ$dispersion != 1)  ## Should differ from 1 for overdispersed data
})


## ----------------------------------------------------------------------------
## 16d: Gamma Family Tests (positive continuous)
## ----------------------------------------------------------------------------

test_that("fit works with Gamma family (log link)", {
    
    ## Create positive outcome
    test_data <- data.table::as.data.table(clintrial)
    test_data <- test_data[los_days > 0]
    
    result <- suppressMessages(fit(
        data = test_data,
        outcome = "los_days",
        predictors = c("age", "sex"),
        model_type = "glm",
        family = Gamma(link = "log")
    ))
    
    expect_fit_result(result)
    
    ## With log link, should produce ratios (exponentiated)
    col_names <- names(result)
    expect_true(any(grepl("Ratio.*CI|Coefficient.*CI", col_names)),
                info = paste("Expected Ratio or Coefficient column for Gamma, found:", 
                             paste(col_names, collapse = ", ")))
})


test_that("fit works with Gamma family (inverse link - default)", {
    
    ## Create positive outcome
    test_data <- data.table::as.data.table(clintrial)
    test_data <- test_data[los_days > 0]
    
    result <- suppressMessages(fit(
        data = test_data,
        outcome = "los_days",
        predictors = c("age", "sex"),
        model_type = "glm",
        family = Gamma()  ## Default inverse link
    ))
    
    expect_fit_result(result)
})


test_that("fit Gamma coefficients match direct glm", {
    
    ## Create positive outcome
    test_data <- data.table::as.data.table(clintrial)
    test_data <- test_data[los_days > 0]
    
    result <- suppressMessages(fit(
        data = test_data,
        outcome = "los_days",
        predictors = c("age", "sex"),
        model_type = "glm",
        family = Gamma(link = "log")
    ))
    
    ## Fit directly
    direct_model <- glm(los_days ~ age + sex, data = test_data, family = Gamma(link = "log"))
    
    ## Coefficients should match
    fit_coefs <- coef(attr(result, "model"))
    direct_coefs <- coef(direct_model)
    
    expect_equal(fit_coefs, direct_coefs, tolerance = 1e-10)
})


test_that("fit Gamma handles factor variables", {
    
    ## Create positive outcome
    test_data <- data.table::as.data.table(clintrial)
    test_data <- test_data[los_days > 0]
    
    result <- suppressMessages(fit(
        data = test_data,
        outcome = "los_days",
        predictors = c("age", "stage"),
        model_type = "glm",
        family = Gamma(link = "log"),
        reference_rows = TRUE
    ))
    
    expect_fit_result(result)
    
    ## Stage should have reference row
    raw <- attr(result, "raw_data")
    stage_rows <- raw[variable == "stage"]
    expect_true(nrow(stage_rows) >= 2)
})


## ----------------------------------------------------------------------------
## 16e: Inverse Gaussian Family Tests (positive, right-skewed)
## ----------------------------------------------------------------------------

test_that("fit works with inverse.gaussian family", {
    
    ## Create positive outcome
    test_data <- data.table::as.data.table(clintrial)
    test_data <- test_data[los_days > 0]
    
    result <- suppressMessages(fit(
        data = test_data,
        outcome = "los_days",
        predictors = c("age", "sex"),
        model_type = "glm",
        family = inverse.gaussian()
    ))
    
    expect_fit_result(result)
})


test_that("fit inverse.gaussian with log link works", {
    
    ## Create positive outcome
    test_data <- data.table::as.data.table(clintrial)
    test_data <- test_data[los_days > 0]
    
    result <- suppressMessages(fit(
        data = test_data,
        outcome = "los_days",
        predictors = c("age", "sex"),
        model_type = "glm",
        family = inverse.gaussian(link = "log")
    ))
    
    expect_fit_result(result)
    
    ## With log link, should produce ratios
    col_names <- names(result)
    expect_true(any(grepl("Ratio.*CI|Coefficient.*CI", col_names)),
                info = paste("Expected Ratio or Coefficient column, found:", 
                             paste(col_names, collapse = ", ")))
})


test_that("fit inverse.gaussian coefficients match direct glm", {
    
    ## Create positive outcome
    test_data <- data.table::as.data.table(clintrial)
    test_data <- test_data[los_days > 0]
    
    result <- suppressMessages(fit(
        data = test_data,
        outcome = "los_days",
        predictors = c("age", "sex"),
        model_type = "glm",
        family = inverse.gaussian(link = "log")
    ))
    
    ## Fit directly
    direct_model <- glm(los_days ~ age + sex, data = test_data, 
                        family = inverse.gaussian(link = "log"))
    
    ## Coefficients should match
    fit_coefs <- coef(attr(result, "model"))
    direct_coefs <- coef(direct_model)
    
    expect_equal(fit_coefs, direct_coefs, tolerance = 1e-10)
})


## ----------------------------------------------------------------------------
## 16f: Negative Binomial (negbin) Tests
## ----------------------------------------------------------------------------

test_that("fit works with negative binomial regression (negbin)", {
    
    skip_if_not_installed("MASS")
    
    result <- suppressWarnings(fit(
        data = clintrial,
        outcome = "count_outcome",
        predictors = c("age", "sex", "treatment"),
        model_type = "negbin"
    ))
    
    expect_fit_result(result)
    
    ## Should produce rate ratios
    col_names <- names(result)
    expect_true(any(grepl("RR.*CI", col_names)),
                info = paste("Expected RR column for negative binomial, found:", 
                             paste(col_names, collapse = ", ")))
    
    ## Model should be of class negbin
    model <- attr(result, "model")
    expect_true(inherits(model, "negbin"))
})


test_that("fit negbin produces correct effect estimates", {
    
    skip_if_not_installed("MASS")
    
    ## Fit using fit()
    result <- suppressWarnings(fit(
        data = clintrial,
        outcome = "count_outcome",
        predictors = c("age", "sex"),
        model_type = "negbin"
    ))
    
    ## Fit directly with MASS
    direct_model <- suppressWarnings(MASS::glm.nb(count_outcome ~ age + sex, data = clintrial))
    
    ## Coefficients should match
    fit_coefs <- coef(attr(result, "model"))
    direct_coefs <- coef(direct_model)
    
    expect_equal(fit_coefs, direct_coefs, tolerance = 1e-10)
})


test_that("fit negbin works with multivariable models", {
    
    skip_if_not_installed("MASS")
    
    result <- suppressWarnings(fit(
        data = clintrial,
        outcome = "count_outcome",
        predictors = c("age", "sex", "treatment", "stage"),
        model_type = "negbin"
    ))
    
    expect_fit_result(result)
    expect_equal(attr(result, "model_scope"), "Multivariable")
    
    ## Should have aRR column for multivariable
    col_names <- names(result)
    expect_true(any(grepl("aRR.*CI", col_names)),
                info = paste("Expected aRR column for multivariable negbin, found:", 
                             paste(col_names, collapse = ", ")))
})


test_that("fit negbin handles factor variables correctly", {
    
    skip_if_not_installed("MASS")
    
    result <- suppressWarnings(fit(
        data = clintrial,
        outcome = "count_outcome",
        predictors = c("age", "stage"),
        model_type = "negbin",
        reference_rows = TRUE
    ))
    
    expect_fit_result(result)
    
    ## Stage is a factor - should have multiple rows with reference
    raw <- attr(result, "raw_data")
    stage_rows <- raw[variable == "stage"]
    expect_true(nrow(stage_rows) >= 2)
    
    ## Check reference row exists
    has_ref <- any(stage_rows$reference == "reference", na.rm = TRUE)
    expect_true(has_ref)
})


test_that("fit negbin respects formatting parameters", {
    
    skip_if_not_installed("MASS")
    
    result <- suppressWarnings(fit(
        data = clintrial,
        outcome = "count_outcome",
        predictors = c("age", "sex"),
        model_type = "negbin",
        digits = 3,
        p_digits = 4,
        show_n = TRUE,
        show_events = FALSE
    ))
    
    expect_fit_result(result)
    expect_true("n" %in% names(result))
    expect_false("Events" %in% names(result))
})


test_that("fit negbin works with labels", {
    
    skip_if_not_installed("MASS")
    
    result <- suppressWarnings(fit(
        data = clintrial,
        outcome = "count_outcome",
        predictors = c("age", "sex", "treatment"),
        model_type = "negbin",
        labels = clintrial_labels
    ))
    
    expect_fit_result(result)
    
    ## Labels should be applied
    expect_true(any(grepl("Age|Sex|Treatment", result$Variable)))
})


test_that("fit negbin works with interactions", {
    
    skip_if_not_installed("MASS")
    
    result <- suppressWarnings(fit(
        data = clintrial,
        outcome = "count_outcome",
        predictors = c("age", "sex", "treatment"),
        interactions = c("age:sex"),
        model_type = "negbin"
    ))
    
    expect_fit_result(result)
    
    ## Interaction should be in formula
    formula_str <- attr(result, "formula_str")
    expect_true(grepl("age:sex", formula_str))
})


test_that("fit with pre-fitted model works for negbin", {
    
    skip_if_not_installed("MASS")
    
    ## Fit model directly with MASS
    nb_model <- suppressWarnings(MASS::glm.nb(count_outcome ~ age + sex + treatment, data = clintrial))
    
    ## Pass to fit()
    result <- fit(
        model = nb_model,
        data = clintrial,
        labels = clintrial_labels
    )
    
    ## Check basic structure (without outcome/predictors which aren't available for pre-fitted models)
    expect_s3_class(result, "fit_result")
    expect_s3_class(result, "data.table")
    expect_true("Variable" %in% names(result))
    expect_true("Group" %in% names(result))
    expect_true(!is.null(attr(result, "model")))
    expect_true(!is.null(attr(result, "raw_data")))
    
    ## Should recognize it as negative binomial
    model_type <- attr(result, "model_type")
    expect_true(grepl("Negative Binomial", model_type))
})


## ============================================================================
## SECTION 17: Model Attribute Preservation Tests
## ============================================================================

test_that("fit preserves all specified attributes", {
    
    result <- fit(
        data = clintrial,
        outcome = "response",
        predictors = c("age", "sex", "stage"),
        interactions = c("age:sex"),
        model_type = "glm",
        family = "binomial"
    )
    
    ## Check all attributes are preserved
    expect_equal(attr(result, "outcome"), "response")
    expect_equal(attr(result, "predictors"), c("age", "sex", "stage"))
    expect_equal(attr(result, "interactions"), c("age:sex"))
    
    ## Formula should be correct
    formula_str <- attr(result, "formula_str")
    expect_true(grepl("response", formula_str))
    expect_true(grepl("age", formula_str))
    expect_true(grepl("sex", formula_str))
    expect_true(grepl("stage", formula_str))
    expect_true(grepl("age:sex", formula_str))
})


test_that("fit stores data in model object", {
    
    result <- fit(
        data = clintrial,
        outcome = "response",
        predictors = c("age", "sex"),
        model_type = "glm"
    )
    
    model <- attr(result, "model")
    
    ## Data should be accessible from model
    model_data <- attr(model, "data")
    expect_true(!is.null(model_data) || !is.null(model$data))
})


## ============================================================================
## SECTION 18: Performance and Large Data Tests
## ============================================================================

test_that("fit handles moderately large datasets", {
    
    ## Create larger dataset by replicating
    large_data <- rbindlist(rep(list(clintrial), 10))
    large_data[, id := .I]  # Add unique ID
    
    result <- fit(
        data = large_data,
        outcome = "response",
        predictors = c("age", "sex"),
        model_type = "glm"
    )
    
    expect_fit_result(result)
    
    ## N should reflect larger dataset
    raw <- attr(result, "raw_data")
    expect_true(raw$n[1] > nrow(clintrial))
})


## ============================================================================
## SECTION 19: Numeric Precision Tests
## ============================================================================

test_that("fit handles extreme coefficient values", {
    
    ## Create data that might produce extreme coefficients
    test_data <- data.table::as.data.table(clintrial)
    test_data[, tiny_var := age / 10000]  # Very small scale
    
    result <- fit(
        data = test_data,
        outcome = "response",
        predictors = c("tiny_var", "sex"),
        model_type = "glm"
    )
    
    expect_fit_result(result)
    
    ## Should still produce valid output
    raw <- attr(result, "raw_data")
    expect_true(all(!is.na(raw$coef)))
})


test_that("fit handles perfect separation gracefully",
{
    ## Create data with near-perfect separation
    test_data <- data.table(
        outcome = c(rep(0, 50), rep(1, 50)),
        predictor = c(rep(0, 50), rep(1, 50)),
        noise = rnorm(100)
    )
    
    ## This might produce warnings about convergence
    result <- suppressWarnings(
        fit(
            data = test_data,
            outcome = "outcome",
            predictors = c("predictor", "noise"),
            model_type = "glm",
            family = "binomial"
        )
    )
    
    ## Should still return a valid result
    expect_fit_result(result)
})


## ============================================================================
## SECTION 20: Formula String Validation Tests
## ============================================================================

test_that("fit formula_str is valid R formula", {
    
    result <- fit(
        data = clintrial,
        outcome = "response",
        predictors = c("age", "sex", "stage"),
        interactions = c("age:sex"),
        model_type = "glm"
    )
    
    formula_str <- attr(result, "formula_str")
    
    ## Should be parseable as formula
    parsed_formula <- as.formula(formula_str)
    expect_s3_class(parsed_formula, "formula")
})


test_that("fit formula includes all components correctly", {
    
    result <- fit(
        data = clintrial,
        outcome = "Surv(os_months, os_status)",
        predictors = c("age", "treatment"),
        model_type = "coxph",
        strata = "sex"
    )
    
    formula_str <- attr(result, "formula_str")
    
    ## Check all components present
    expect_true(grepl("Surv\\(os_months, os_status\\)", formula_str))
    expect_true(grepl("age", formula_str))
    expect_true(grepl("treatment", formula_str))
    expect_true(grepl("strata\\(.*sex.*\\)", formula_str))
})


## ============================================================================
## SECTION 22: Profile Likelihood CI and Caching
## ============================================================================

test_that("fit propagates cached confint to model attribute", {
    
    result <- fit(
        data = clintrial,
        outcome = "response",
        predictors = c("age", "sex"),
        model_type = "glm"
    )
    
    model <- attr(result, "model")
    
    ## Model should carry cached confint from m2dt
    cached <- attr(model, "cached_confint")
    expect_true(!is.null(cached),
                info = "fit() should propagate cached_confint to model object")
    expect_true(is.matrix(cached))
    expect_equal(attr(model, "cached_confint_level"), 0.95)
})


test_that("fit cached confint matches stats::confint output", {
    
    result <- fit(
        data = clintrial,
        outcome = "response",
        predictors = c("age", "sex"),
        model_type = "glm"
    )
    
    model <- attr(result, "model")
    cached <- attr(model, "cached_confint")
    
    ## Compute confint directly for comparison
    direct_ci <- suppressMessages(suppressWarnings(
        stats::confint(model, level = 0.95)
    ))
    
    ## Cached values should match direct computation
    ## (non-NA rows; intercept row may be NA if skipped)
    shared_rows <- intersect(rownames(cached)[!is.na(cached[, 1])],
                             rownames(direct_ci))
    expect_true(length(shared_rows) > 0)
    
    for (rn in shared_rows) {
        expect_equal(cached[rn, 1], direct_ci[rn, 1], tolerance = 1e-10,
                     info = paste("Cached CI lower mismatch for", rn))
        expect_equal(cached[rn, 2], direct_ci[rn, 2], tolerance = 1e-10,
                     info = paste("Cached CI upper mismatch for", rn))
    }
})


test_that("fit GLM profile CIs differ from Wald", {
    
    result <- fit(
        data = clintrial,
        outcome = "response",
        predictors = c("age", "sex", "stage"),
        model_type = "glm"
    )
    
    raw <- attr(result, "raw_data")
    non_ref <- raw[is.na(reference) | reference == ""]
    
    ## Compute Wald CIs
    model <- attr(result, "model")
    coefs <- coef(model)[-1]
    ses <- summary(model)$coefficients[-1, "Std. Error"]
    z <- qnorm(0.975)
    wald_lower <- coefs - z * ses
    
    ## Profile should differ from Wald for at least one term
    any_differs <- any(abs(non_ref$coef_lower - wald_lower) > 1e-6)
    expect_true(any_differs,
                info = "fit() GLM CIs should use profile likelihood, not Wald")
})


test_that("fit lm CIs use exact t-distribution", {
    
    result <- fit(
        data = clintrial,
        outcome = "los_days",
        predictors = c("age", "sex"),
        model_type = "lm"
    )
    
    raw <- attr(result, "raw_data")
    non_ref <- raw[is.na(reference) | reference == ""]
    
    ## Exact t CIs should match confint.lm() output
    model <- attr(result, "model")
    direct_ci <- confint(model, level = 0.95)
    ## Remove intercept
    direct_ci <- direct_ci[rownames(direct_ci) != "(Intercept)", , drop = FALSE]
    
    expect_equal(unname(non_ref$coef_lower), unname(direct_ci[, 1]), tolerance = 1e-10,
                 info = "lm CIs should match confint.lm()")
})

Try the summata package in your browser

Any scripts or data that you put into this service are public.

summata documentation built on May 7, 2026, 5:07 p.m.