Nothing
## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(
fig.align = "center",
fig.width = 7,
fig.height = 5,
message = FALSE,
warning = FALSE,
comment = "#>"
)
options(mc.cores = 1, rf.cores = 1)
## ----packages-----------------------------------------------------------------
library(ggplot2)
library(dplyr)
library(tidyr)
library(randomForestSRC)
library(survival)
if (requireNamespace("ggRandomForests", quietly = TRUE)) {
library(ggRandomForests)
} else if (requireNamespace("pkgload", quietly = TRUE)) {
pkgload::load_all(export_all = FALSE, helpers = FALSE, attach_testthat = FALSE)
} else {
stop("Install ggRandomForests (or pkgload for dev builds) to render this vignette.")
}
theme_set(theme_bw())
# Plotting constants
event_marks <- c(1, 4)
event_labels <- c("Censored", "Death")
event_colors <- c("steelblue", "firebrick")
## ----data-load----------------------------------------------------------------
data("pbc", package = "randomForestSRC")
## ----data-clean---------------------------------------------------------------
pbc <- pbc |>
mutate(
years = days / 365.25,
age = age / 365.25,
treatment = factor(
ifelse(treatment == 1, "DPCA",
ifelse(treatment == 2, "Placebo", NA)),
levels = c("DPCA", "Placebo")
)
) |>
select(-days)
# Low-cardinality numerics (including binary 0/1) to factor.
# NOTE: do NOT convert to logical — partial.rfsrc() fails with logical
# predictors in survival forests (randomForestSRC <= 3.5.1).
# Exclude the response columns (status, years) from conversion.
resp_cols <- c("status", "years")
for (nm in setdiff(names(pbc), resp_cols)) {
v <- pbc[[nm]]
if (is.numeric(v) && !is.factor(v) && length(unique(v[!is.na(v)])) <= 5) {
pbc[[nm]] <- factor(v)
}
}
## ----variable-labels----------------------------------------------------------
# Human-readable labels for plot axes
st_labs <- c(
status = "Death Event",
treatment = "Treatment",
age = "Age (years)",
sex = "Female",
ascites = "Ascites",
hepato = "Hepatomegaly",
spiders = "Spiders",
edema = "Edema (0, 0.5, 1)",
bili = "Serum Bilirubin (mg/dl)",
chol = "Serum Cholesterol (mg/dl)",
albumin = "Albumin (gm/dl)",
copper = "Urine Copper (ug/day)",
alk.phos = "Alkaline Phosphatase (U/liter)",
ast = "SGOT (U/ml)",
trig = "Triglycerides (mg/dl)",
platelet = "Platelets (per cubic ml/1000)",
prothrombin = "Prothrombin Time (sec)",
stage = "Histologic Stage",
years = "Follow-up Time (years)"
)
## ----eda-categorical, fig.height=4, fig.cap="EDA for categorical variables. Bar color indicates factor level; white = missing."----
cls <- sapply(pbc, class)
cnt_idx <- which(cls %in% c("numeric", "integer"))
fct_idx <- setdiff(seq_along(pbc), cnt_idx)
fct_idx <- union(fct_idx, which(names(pbc) == "years"))
dta_cat <- suppressWarnings(
pbc[, fct_idx] |>
pivot_longer(-years, names_to = "variable", values_to = "value",
values_transform = list(value = as.character))
)
ggplot(dta_cat, aes(x = years, fill = value)) +
geom_histogram(color = "black", binwidth = 1) +
labs(y = "", x = st_labs["years"]) +
scale_fill_brewer(palette = "RdBu", na.value = "white") +
facet_wrap(~variable, scales = "free_y", nrow = 2) +
theme(legend.position = "none")
## ----eda-continuous, fig.height=4, fig.cap="EDA for continuous variables. Points colored by death event; rug marks indicate missing values."----
cnt_idx <- union(cnt_idx, which(names(pbc) == "status"))
dta_cont <- pbc[, cnt_idx] |>
pivot_longer(c(-years, -status),
names_to = "variable", values_to = "value")
ggplot(dta_cont |> filter(!is.na(value)),
aes(x = years, y = value, color = factor(status), shape = factor(status))) +
geom_point(alpha = 0.4) +
geom_rug(data = dta_cont |> filter(is.na(value)), color = "grey50") +
labs(y = "", x = st_labs["years"], color = "Death", shape = "Death") +
scale_color_manual(values = event_colors) +
scale_shape_manual(values = event_marks) +
facet_wrap(~variable, scales = "free_y", ncol = 4) +
theme(legend.position = c(0.8, 0.2))
## ----km-survival, fig.cap="KM survival estimates by treatment group with 95% confidence bands."----
pbc_trial <- pbc |> filter(!is.na(treatment))
pbc_test <- pbc |> filter(is.na(treatment))
gg_km <- gg_survival(interval = "years", censor = "status",
by = "treatment", data = pbc_trial,
conf.int = 0.95)
plot(gg_km) +
labs(y = "Survival Probability", x = "Time (years)",
color = "Treatment", fill = "Treatment") +
theme(legend.position = c(0.2, 0.2)) +
coord_cartesian(ylim = c(0, 1.01))
## ----km-cumhaz, fig.cap="KM cumulative hazard estimates by treatment group."----
plot(gg_km, type = "cum_haz") +
labs(y = "Cumulative Hazard", x = "Time (years)",
color = "Treatment", fill = "Treatment") +
theme(legend.position = c(0.2, 0.8)) +
coord_cartesian(ylim = c(-0.02, 1.22))
## ----km-bili, fig.width=6, fig.cap="KM survival stratified by bilirubin groups."----
pbc_bili <- pbc_trial |>
mutate(bili_grp = cut(bili, breaks = c(0, 0.8, 1.3, 3.4, 29)))
plot(gg_survival(interval = "years", censor = "status",
by = "bili_grp", data = pbc_bili),
error = "none") +
labs(y = "Survival Probability", x = "Time (years)", color = "Bilirubin")
## ----rfsrc-fit----------------------------------------------------------------
# Step 1: impute missing values via random forest proximity
pbc_imputed <- impute.rfsrc(Surv(years, status) ~ .,
data = pbc_trial,
ntree = 500,
nimpute = 2)
# Step 2: grow the survival forest on the complete imputed data
rfsrc_pbc <- rfsrc(Surv(years, status) ~ .,
data = pbc_imputed,
nsplit = 10,
tree.err = TRUE,
importance = TRUE)
## ----error-plot, fig.height=4, fig.cap="OOB prediction error vs. number of trees."----
plot(gg_error(rfsrc_pbc))
## ----rfsrc-predicted, fig.cap="OOB predicted survival curves. Blue = censored, red = death events."----
gg_rsf <- plot(gg_rfsrc(rfsrc_pbc), alpha = 0.2) +
scale_color_manual(values = event_colors) +
theme(legend.position = "none") +
labs(y = "Survival Probability", x = "Time (years)") +
coord_cartesian(ylim = c(-0.01, 1.01))
gg_rsf
## ----rfsrc-by-treatment, fig.cap="Median predicted survival by treatment group with 95% confidence bands."----
plot(gg_rfsrc(rfsrc_pbc, by = "treatment")) +
theme(legend.position = c(0.2, 0.2)) +
labs(y = "Survival Probability", x = "Time (years)") +
coord_cartesian(ylim = c(-0.01, 1.01))
## ----predict-test, fig.cap="Predicted survival for non-trial patients (test set)."----
rfsrc_pbc_test <- predict(rfsrc_pbc, newdata = pbc_test,
na.action = "na.impute",
importance = TRUE)
plot(gg_rfsrc(rfsrc_pbc_test), alpha = 0.2) +
scale_color_manual(values = event_colors) +
theme(legend.position = "none") +
labs(y = "Survival Probability", x = "Time (years)") +
coord_cartesian(ylim = c(-0.01, 1.01))
## ----vimp-plot, fig.width=6, fig.cap="Variable importance ranking. Blue = positive VIMP, red = negative."----
plot(gg_vimp(rfsrc_pbc), lbls = st_labs) +
theme(legend.position = c(0.8, 0.2)) +
labs(fill = "VIMP > 0")
## ----varsel-------------------------------------------------------------------
md_pbc <- max.subtree(rfsrc_pbc)
## ----select-vars--------------------------------------------------------------
xvar <- c("bili", "albumin", "copper", "prothrombin", "age")
xvar_cat <- "edema"
xvar_all <- c(xvar, xvar_cat)
## ----vardep-bili, fig.height=4, fig.cap="Variable dependence of survival at 1 and 3 years on bilirubin."----
gg_v <- gg_variable(rfsrc_pbc, time = c(1, 3),
time.labels = c("1 Year", "3 Years"))
plot(gg_v, xvar = "bili", alpha = 0.4) +
labs(y = "Survival", x = st_labs["bili"]) +
theme(legend.position = "none") +
scale_color_manual(values = event_colors, labels = event_labels) +
scale_shape_manual(values = event_marks, labels = event_labels) +
coord_cartesian(ylim = c(-0.01, 1.01))
## ----vardep-panel, fig.height=5, fig.cap="Variable dependence at 1 and 3 years for continuous predictors."----
plot(gg_v, xvar = xvar[-1], panel = TRUE, alpha = 0.4) +
labs(y = "Survival") +
theme(legend.position = "none") +
scale_color_manual(values = event_colors, labels = event_labels) +
scale_shape_manual(values = event_marks, labels = event_labels) +
coord_cartesian(ylim = c(-0.05, 1.05))
## ----vardep-categorical, fig.height=4, fig.cap="Variable dependence on edema (categorical)."----
plot(gg_v, xvar = xvar_cat, alpha = 0.4) +
labs(y = "Survival") +
theme(legend.position = "none") +
scale_color_manual(values = event_colors, labels = event_labels) +
scale_shape_manual(values = event_marks, labels = event_labels) +
coord_cartesian(ylim = c(-0.01, 1.02))
## ----partial-dep, error=TRUE, fig.height=5, fig.cap="Partial dependence of survival at approximately 1 and 3 years on continuous predictors."----
try({
# partial.rfsrc() requires times that match the model's time.interest grid;
# gg_partial_rfsrc() snaps the requested values to the nearest observed times.
ti <- rfsrc_pbc$time.interest
t1yr <- ti[which.min(abs(ti - 1))]
t3yr <- ti[which.min(abs(ti - 3))]
pd <- gg_partial_rfsrc(rfsrc_pbc, xvar.names = xvar,
partial.time = c(t1yr, t3yr))
# Quick S3 plot — survival forests produce time-series curves per predictor value
plot(pd)
})
## ----partial-dep-custom, error=TRUE, fig.height=5, fig.cap="Partial dependence (custom styling)."----
try({
ggplot(pd$continuous, aes(x = x, y = yhat,
color = factor(round(time, 2)),
group = factor(time))) +
geom_line(linewidth = 1) +
facet_wrap(~name, scales = "free_x") +
labs(y = "Predicted Survival", x = "", color = "Year") +
scale_color_manual(values = setNames(c("steelblue", "firebrick"),
as.character(round(c(t1yr, t3yr), 2)))) +
theme_bw()
})
## ----coplot-edema, fig.width=7, fig.height=4, fig.cap="Variable dependence on bilirubin, conditional on edema group."----
gg_v1 <- gg_variable(rfsrc_pbc, time = 1)
gg_v1$edema <- paste("edema =", gg_v1$edema)
plot(gg_v1, xvar = "bili", alpha = 0.5) +
labs(y = "Survival at 1 Year", x = st_labs["bili"]) +
theme(legend.position = "none") +
scale_color_manual(values = event_colors, labels = event_labels) +
scale_shape_manual(values = event_marks, labels = event_labels) +
facet_grid(~edema) +
coord_cartesian(ylim = c(-0.01, 1.01))
## ----coplot-albumin, fig.width=7, fig.height=5, fig.cap="Variable dependence on bilirubin, conditional on albumin groups."----
albumin_cts <- quantile_pts(gg_v1$albumin, groups = 6, intervals = TRUE)
gg_v1$albumin_grp <- cut(gg_v1$albumin, breaks = albumin_cts)
levels(gg_v1$albumin_grp) <- paste("albumin =",
levels(gg_v1$albumin_grp))
plot(gg_v1, xvar = "bili", alpha = 0.5) +
labs(y = "Survival at 1 Year", x = st_labs["bili"]) +
theme(legend.position = "none") +
scale_color_manual(values = event_colors, labels = event_labels) +
scale_shape_manual(values = event_marks, labels = event_labels) +
facet_wrap(~albumin_grp) +
coord_cartesian(ylim = c(-0.01, 1.01))
## ----surface-data, error=TRUE, cache=FALSE------------------------------------
try({
# Create grid of albumin values
alb_grid <- quantile_pts(pbc_trial$albumin, groups = 25)
# For each albumin value, compute partial dependence on bili at ~1 year
surface_list <- lapply(alb_grid, function(alb_val) {
newx <- pbc_trial[, rfsrc_pbc$xvar.names, drop = FALSE]
newx$albumin <- alb_val
pd_alb <- gg_partial_rfsrc(rfsrc_pbc, xvar.names = "bili",
newx = newx, partial.time = t1yr)
df <- pd_alb$continuous
df$albumin <- alb_val
df
})
surface_df <- bind_rows(surface_list)
})
## ----plotly-surface, error=TRUE, fig.cap="Interactive partial dependence surface: survival as a function of bilirubin and albumin."----
try({
if (!exists("surface_df")) {
message("surface_df not available — skipping plotly surface (see surface-data chunk error above).")
} else if (requireNamespace("plotly", quietly = TRUE)) {
# Reshape for surface
library(plotly)
surface_wide <- surface_df |>
select(bili = x, albumin, survival = yhat) |>
arrange(albumin, bili)
# Create matrix form
bili_vals <- sort(unique(surface_wide$bili))
alb_vals <- sort(unique(surface_wide$albumin))
z_matrix <- matrix(surface_wide$survival,
nrow = length(alb_vals),
ncol = length(bili_vals),
byrow = TRUE)
plot_ly(x = bili_vals, y = alb_vals, z = z_matrix) |>
add_surface(colorscale = "Viridis", showscale = TRUE) |>
layout(
scene = list(
xaxis = list(title = "Bilirubin"),
yaxis = list(title = "Albumin"),
zaxis = list(title = "Survival")
)
)
} else {
message("Install the plotly package for interactive 3D surfaces.")
# Fallback: contour plot with ggplot2
ggplot(surface_df, aes(x = x, y = albumin, fill = yhat)) +
geom_tile() +
scale_fill_viridis_c(name = "Survival") +
labs(x = "Bilirubin", y = "Albumin") +
theme_bw()
}
})
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.