R/data.R
In summata: Publication-Ready Summary Tables and Forest Plots

#' Simulated Clinical Trial Dataset
#'
#' A simulated dataset from a hypothetical multi-center oncology clinical trial
#' comparing two experimental drugs against control. Designed to demonstrate
#' the full capabilities of descriptive and regression analysis functions.
#'
#' @format A data frame with 850 observations and 32 variables:
#' \describe{
#'   \item{patient_id}{Unique patient identifier (character)}
#'   \item{age}{Age at enrollment in years (numeric: 18-90)}
#'   \item{sex}{Biological sex (factor: Female, Male)}
#'   \item{race}{Self-reported race (factor: White, Black, Asian, Other)}
#'   \item{ethnicity}{Hispanic ethnicity (factor: Non-Hispanic, Hispanic)}
#'   \item{bmi}{Body mass index in kg/m\eqn{^2} (numeric)}
#'   \item{smoking}{Smoking history (factor: Never, Former, Current)}
#'   \item{hypertension}{Hypertension diagnosis (factor: No, Yes)}
#'   \item{diabetes}{Diabetes diagnosis (factor: No, Yes)}
#'   \item{ecog}{ECOG performance status (factor: 0, 1, 2, 3)}
#'   \item{creatinine}{Baseline creatinine in mg/dL (numeric)}
#'   \item{hemoglobin}{Baseline hemoglobin in g/dL (numeric)}
#'   \item{biomarker_x}{Serum biomarker A in ng/mL (numeric)}
#'   \item{biomarker_y}{Serum biomarker B in U/L (numeric)}
#'   \item{site}{Enrolling site (factor: Site Alpha through Site Kappa)}
#'   \item{grade}{Tumor grade (factor: Well/Moderately/Poorly differentiated)}
#'   \item{stage}{Disease stage at diagnosis (factor: I, II, III, IV)}
#'   \item{treatment}{Randomized treatment (factor: Control, Drug A, Drug B)}
#'   \item{surgery}{Surgical resection (factor: No, Yes)}
#'   \item{any_complication}{Any post-operative complication (factor: No, Yes)}
#'   \item{wound_infection}{Post-operative wound infection (factor: No, Yes)}
#'   \item{icu_admission}{ICU admission required (factor: No, Yes)}
#'   \item{readmission_30d}{Hospital readmission within 30 days (factor: No, Yes)}
#'   \item{pain_score}{Pain score at discharge (numeric: 0-10)}
#'   \item{recovery_days}{Days to functional recovery (numeric)}
#'   \item{los_days}{Hospital length of stay in days (numeric)}
#'   \item{ae_count}{Adverse event count (integer). Overdispersed count suitable
#'     for negative binomial or quasipoisson regression.}
#'   \item{fu_count}{Follow-up visit count (integer). Equidispersed count
#'     suitable for standard Poisson regression.}
#'   \item{pfs_months}{Progression-Free Survival Time (months)}
#'   \item{pfs_status}{Progression or Death Event}
#'   \item{os_months}{Overall survival time in months (numeric)}
#'   \item{os_status}{Death indicator (numeric: 0=censored, 1=death)}
#' }
#' 
#' @details
#' This dataset includes realistic correlations between variables:
#' - Survival is worse with higher stage, ECOG, age, and biomarker_x
#' - Treatment effects show Drug B > Drug A > Control
#' - \code{ae_count} is overdispersed (variance > mean) for negative binomial demos
#' - \code{fu_count} is equidispersed (variance \eqn{\approx} mean) for Poisson demos
#' - Approximately 2\% of values are missing at random
#' - Median follow-up is approximately 30 months
#' 
#' @source Simulated data for demonstration purposes
#' @family sample data
#' 
#' @examples
#' data(clintrial)
#' data(clintrial_labels)
#' 
#' # Descriptive statistics by treatment arm
#' desctable(clintrial,
#'         by = "treatment", 
#'         variables = c("age", "sex", "stage", "ecog", 
#'                      "biomarker_x", "Surv(os_months, os_status)"),
#'         labels = clintrial_labels)
#'
#' \donttest{
#' # Poisson regression for equidispersed counts
#' fit(clintrial,
#'     outcome = "fu_count",
#'     predictors = c("age", "stage", "treatment"),
#'     model_type = "glm",
#'     family = "poisson",
#'     labels = clintrial_labels)
#' 
#' # Negative binomial for overdispersed counts
#' fit(clintrial,
#'     outcome = "ae_count",
#'     predictors = c("age", "treatment", "diabetes"),
#'     model_type = "negbin",
#'     labels = clintrial_labels)
#' 
#' # Complete analysis pipeline
#' fullfit(clintrial,
#'         outcome = "Surv(os_months, os_status)",
#'         predictors = c("age", "sex", "stage", "grade", "ecog",
#'                       "smoking", "biomarker_x", "biomarker_y", "treatment"),
#'         method = "screen",
#'         p_threshold = 0.20,
#'         model_type = "coxph",
#'         labels = clintrial_labels)
#' }
#'         
"clintrial"

#' Variable Labels for Clinical Trial Dataset  
#'
#' A named character vector providing descriptive labels for all variables
#' in the clinical_trial dataset. Use with labels parameter in functions.
#'
#' @format Named character vector with 24 elements
#' @family sample data
#' @seealso \code{\link{clintrial}}
"clintrial_labels"