knitr::opts_chunk$set(echo = T, warning = F, message = F, cache = F, eval = T) set.seed(1) options(scipen = 999)
The xgboost package survival model returns predictions on the hazard ratio scale (i.e., as $HR = exp(marginal_prediction) in the proportional hazard function $h(t) = h0(t) * HR$. This quantity is equivalent to the type = "risk" in coxph. This package provides a thin wrapper that enables using the xgboost package to perform full survival curve estimation.
See also discussion in stackoverflow
Below is a short usage demo.
First, prepare the data:
library(survXgboost) library(survival) library(xgboost) data("lung") lung <- lung[complete.cases(lung), ] # doesn't handle missing values at the moment lung$status <- lung$status - 1 # format status variable correctly such that 1 is event/death and 0 is censored/alive label <- ifelse(lung$status == 1, lung$time, -lung$time) val_ind <- sample.int(nrow(lung), 0.1 * nrow(lung)) x_train <- as.matrix(lung[-val_ind, !names(lung) %in% c("time", "status")]) x_label <- label[-val_ind] x_val <- xgb.DMatrix(as.matrix(lung[val_ind, !names(lung) %in% c("time", "status")]), label = label[val_ind])
Below we train an xgboost survival model using the function from the survXgboost package rather than the xgboost package:
# train surv_xgboost surv_xgboost_model <- xgb.train.surv( params = list( objective = "survival:cox", eval_metric = "cox-nloglik", eta = 0.05 # larger eta leads to algorithm not converging, resulting in NaN predictions ), data = x_train, label = x_label, watchlist = list(val2 = x_val), nrounds = 1000, early_stopping_rounds = 30 )
Next we can predict full survival curves:
# predict survival curves times <- seq(10, 1000, 50) survival_curves <- predict(object = surv_xgboost_model, newdata = x_train, type = "surv", times = times) matplot(times, t(survival_curves[1:5, ]), type = "l")
We can also predict the risk scores like in the original xgboost package:
# predict risk score risk_scores <- predict(object = surv_xgboost_model, newdata = x_train, type = "risk") hist(risk_scores)
We can see the package can produce survival estimates that are well calibrated
if(require(riskRegression)){ data(cancer, package="survival") status_mgus2 <- ifelse(mgus2$death == 0, -mgus2$futime, mgus2$futime) # We use na.pass since XGBoost can predict for missing data formula_mgus2 <- ~ age + sex + dxyr + hgb + mspike - 1 x_mgus2 <- model.matrix( formula_mgus2, model.frame(formula_mgus2, mgus2, na.action = "na.pass") ) # Note: this model is likely overfitting horribly, but that helps demonstrate calibration mgus2_model <- xgb.train.surv( params = list( objective = "survival:cox", eval_metric = "cox-nloglik", eta = 0.2 ), data = x_mgus2, label = status_mgus2, nrounds = 10 ) surv_predictions <- predict(mgus2_model, x_mgus2, type = "surv", times= 60) score <- Score(list(model1=1-surv_predictions),Surv(futime,death) ~1 ,data=mgus2, times=60,plots="cal") plotCalibration(score, rug = TRUE) }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.