knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width = 7, fig.height = 5 )
The crrstep package provides stepwise variable selection for the Fine-Gray proportional subdistribution hazards model for competing risks data. This vignette demonstrates how to use the package with both simple and complex formula specifications.
install.packages('crrstep')
library(crrstep) library(cmprsk)
set.seed(123) n <- 400 age <- rnorm(n, mean = 60, sd = 10) sex <- factor(sample(c('Male', 'Female'), n, replace = TRUE)) treatment <- factor(sample(c('Control', 'Treatment'), n, replace = TRUE)) biomarker <- rnorm(n, mean = 100, sd = 20) linpred <- 0.03 * (age - 60) + 0.5 * (sex == 'Male') - 0.4 * (treatment == 'Treatment') + 0.01 * biomarker base_hazard <- 0.1 true_time <- rexp(n, rate = base_hazard * exp(linpred)) p_cause1 <- plogis(linpred) cause <- ifelse(runif(n) < p_cause1, 1, 2) censor_time <- rexp(n, rate = 0.05) ftime <- pmin(true_time, censor_time) fstatus <- ifelse(ftime == censor_time, 0, cause) sim_data <- data.frame(ftime, fstatus, age, sex, treatment, biomarker) table(sim_data$fstatus)
result_back <- crrstep( formula = ftime ~ age + sex + treatment + biomarker, scope.min = ~1, etype = fstatus, data = sim_data, failcode = 1, direction = 'backward', criterion = 'AIC', trace = TRUE ) print(result_back)
result_fwd <- crrstep( formula = ftime ~ age + sex + treatment + biomarker, scope.min = ~1, etype = fstatus, data = sim_data, failcode = 1, direction = 'forward', criterion = 'AIC', trace = TRUE ) print(result_fwd)
result_bic <- crrstep( formula = ftime ~ age + sex + treatment + biomarker, scope.min = ~1, etype = fstatus, data = sim_data, failcode = 1, direction = 'backward', criterion = 'BIC', trace = FALSE ) cat('AIC selected variables:', nrow(result_back$coefficients), '\\n') cat('BIC selected variables:', nrow(result_bic$coefficients), '\\n')
result_min <- crrstep( formula = ftime ~ age + sex + treatment + biomarker, scope.min = ~ age + sex, etype = fstatus, data = sim_data, failcode = 1, direction = 'backward', criterion = 'AIC', trace = TRUE ) print(result_min)
set.seed(456) sim_data$stage <- factor(sample(c('I', 'II', 'III', 'IV'), n, replace = TRUE)) sim_data$region <- factor(sample(c('North', 'South', 'East', 'West'), n, replace = TRUE)) result_factors <- crrstep( formula = ftime ~ age + sex + stage + region, scope.min = ~1, etype = fstatus, data = sim_data, failcode = 1, direction = 'backward', criterion = 'BIC', trace = TRUE ) summary(result_factors) coef(result_factors) logLik(result_factors) BIC(result_factors)
result_interact <- crrstep( formula = ftime ~ age + sex + treatment + age:sex + sex:treatment, scope.min = ~ age + sex, etype = fstatus, data = sim_data, failcode = 1, direction = 'backward', criterion = 'AIC', trace = TRUE ) print(result_interact)
sim_data$biomarker_c <- scale(sim_data$biomarker, scale = FALSE)[,1] result_poly <- crrstep( formula = ftime ~ age + sex + biomarker_c + I(biomarker_c^2), scope.min = ~1, etype = fstatus, data = sim_data, failcode = 1, direction = 'backward', criterion = 'AIC', trace = TRUE ) print(result_poly)
result_complex <- crrstep( formula = ftime ~ age + treatment + biomarker_c + I(biomarker_c^2) + biomarker_c:treatment + I(biomarker_c^2):treatment, scope.min = ~ age, etype = fstatus, data = sim_data, failcode = 1, direction = 'backward', criterion = 'BIC', trace = TRUE ) print(result_complex)
sim_data$biomarker_pos <- sim_data$biomarker - min(sim_data$biomarker) + 1 result_log <- crrstep( formula = ftime ~ age + sex + log(biomarker_pos) + log(biomarker_pos):sex, scope.min = ~1, etype = fstatus, data = sim_data, failcode = 1, direction = 'backward', criterion = 'AIC', trace = TRUE ) print(result_log)
result_threeway <- crrstep( formula = ftime ~ age + sex + treatment + stage + sex:treatment + sex:stage + treatment:stage + sex:treatment:stage, scope.min = ~ age, etype = fstatus, data = sim_data, failcode = 1, direction = 'backward', criterion = 'BIC', trace = TRUE ) print(result_threeway)
fit_full <- crrstep( formula = ftime ~ age + sex + treatment + biomarker, scope.min = ~1, etype = fstatus, data = sim_data, failcode = 1, direction = 'backward', criterion = 'AIC', trace = FALSE, crr.object = TRUE ) cat('Coefficients:\\n') print(fit_full$coef) cat('\\nLog-likelihood:\\n') print(fit_full$loglik)
sim_data_miss <- sim_data sim_data_miss$age[sample(1:n, 20)] <- NA sim_data_miss$biomarker[sample(1:n, 15)] <- NA result_miss <- crrstep( formula = ftime ~ age + sex + treatment + biomarker, scope.min = ~1, etype = fstatus, data = sim_data_miss, failcode = 1, direction = 'backward', criterion = 'AIC', trace = TRUE ) print(result_miss)
result_subset <- crrstep( formula = ftime ~ age + treatment + biomarker, scope.min = ~1, etype = fstatus, subset = sim_data$sex == 'Female', data = sim_data, failcode = 1, direction = 'backward', criterion = 'AIC', trace = FALSE ) print(result_subset)
Convergence issues: Center/scale variables, remove correlated variables Singular matrix errors: Remove redundant variables, combine sparse factor levels
Kuk, D. and Varadhan, R. (2013). Model selection in competing risks regression. Statistics in Medicine.
Fine, J.P. and Gray, R.J. (1999). A proportional hazards model for the subdistribution of a competing risk. Journal of the American Statistical Association, 94(446), 496-509.
sessionInfo()
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.