inst/doc/ggRandomForests-regression.R

## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(
  fig.align = "center",
  fig.width = 7,
  fig.height = 5,
  message = FALSE,
  warning = FALSE,
  comment = "#>"
)
options(mc.cores = 1, rf.cores = 1)


## ----packages-----------------------------------------------------------------
library(ggplot2)
library(dplyr)
library(tidyr)
library(randomForestSRC)

if (requireNamespace("ggRandomForests", quietly = TRUE)) {
  library(ggRandomForests)
} else if (requireNamespace("pkgload", quietly = TRUE)) {
  pkgload::load_all(export_all = FALSE, helpers = FALSE, attach_testthat = FALSE)
} else {
  stop("Install ggRandomForests (or pkgload for dev builds) to render this vignette.")
}

theme_set(theme_bw())


## ----data-load----------------------------------------------------------------
data(Boston, package = "MASS")
Boston$chas <- as.logical(Boston$chas) # nolint: object_name_linter


## ----variable-labels----------------------------------------------------------
st_labs <- c(
  crim    = "Crime rate by town",
  zn      = "Residential land zoned > 25k sq ft (%)",
  indus   = "Non-retail business acres (%)",
  chas    = "Borders Charles River",
  nox     = "Nitrogen oxides (10 ppm)",
  rm      = "Rooms per dwelling",
  age     = "Units built before 1940 (%)",
  dis     = "Distance to employment centers",
  rad     = "Highway accessibility index",
  tax     = "Property tax rate per $10,000",
  ptratio = "Pupil-teacher ratio",
  black   = "Proportion of Black residents",
  lstat   = "Lower status population (%)",
  medv    = "Median home value ($1000s)"
)


## ----eda, fig.cap="EDA: each predictor vs. median home value, colored by Charles River indicator."----
dta <- Boston |>
  pivot_longer(c(-medv, -chas), names_to = "variable", values_to = "value")

ggplot(dta, aes(x = medv, y = value, color = chas)) +
  geom_point(alpha = 0.4) +
  geom_smooth(aes(x = medv, y = value), color = "grey30",
              inherit.aes = FALSE, se = FALSE) +
  labs(y = "", x = st_labs["medv"]) +
  scale_color_brewer(palette = "Set2") +
  facet_wrap(~variable, scales = "free_y", ncol = 3)


## ----rfsrc-fit----------------------------------------------------------------
rfsrc_Boston <- rfsrc(medv ~ ., data = Boston, # nolint: object_name_linter
                      importance = TRUE, err.block = 5)
rfsrc_Boston


## ----error-plot, fig.height=4, fig.cap="OOB mean squared error vs. number of trees."----
gg_e <- gg_error(rfsrc_Boston)
gg_e <- gg_e |> filter(!is.na(error))
class(gg_e) <- c("gg_error", class(gg_e))
plot(gg_e)


## ----rfsrc-predicted, fig.height=4, fig.cap="OOB predicted median home values. Points are jittered; boxplot shows the distribution."----
plot(gg_rfsrc(rfsrc_Boston), alpha = 0.5) +
  coord_cartesian(ylim = c(5, 49))


## ----vimp-plot, fig.cap="VIMP ranking. Longer blue bars indicate more important variables."----
plot(gg_vimp(rfsrc_Boston), lbls = st_labs)


## ----varsel-------------------------------------------------------------------
md_Boston <- max.subtree(rfsrc_Boston) # nolint: object_name_linter


## ----select-vars--------------------------------------------------------------
xvar <- md_Boston$topvars


## ----vardep-panel, fig.cap="Variable dependence for top predictors (minimal depth rank order)."----
gg_v <- gg_variable(rfsrc_Boston)

plot(gg_v, xvar = xvar, panel = TRUE, alpha = 0.5) +
  labs(y = st_labs["medv"], x = "")


## ----vardep-chas, fig.height=4, fig.cap="Variable dependence for Charles River (categorical)."----
plot(gg_v, xvar = "chas", alpha = 0.4) +
  labs(y = st_labs["medv"])


## ----partial-dep, fig.cap="Partial dependence for top predictors."------------
pd <- gg_partial_rfsrc(rfsrc_Boston, xvar.names = xvar)

# Quick S3 plot — works out of the box for the standard regression case
plot(pd)


## ----partial-dep-custom, fig.cap="Partial dependence (custom styling)."-------
ggplot(pd$continuous, aes(x = x, y = yhat)) +
  geom_line(color = "steelblue", linewidth = 1) +
  facet_wrap(~name, scales = "free_x") +
  labs(y = st_labs["medv"], x = "") +
  theme_bw()


## ----coplot-chas, fig.height=4, fig.cap="Variable dependence on lstat, conditional on Charles River."----
gg_v$chas_label <- ifelse(gg_v$chas, "Borders Charles River",
                          "Does not border")

plot(gg_v, xvar = "lstat", alpha = 0.5) +
  labs(y = st_labs["medv"], x = st_labs["lstat"]) +
  theme(legend.position = "none") +
  facet_wrap(~chas_label)


## ----coplot-rm, fig.cap="Predicted medv vs. lstat, conditional on rooms-per-dwelling groups."----
rm_pts <- quantile_pts(rfsrc_Boston$xvar$rm, groups = 6, intervals = TRUE)
gg_v$rm_grp <- cut(rfsrc_Boston$xvar$rm, breaks = rm_pts)
levels(gg_v$rm_grp) <- paste("rm in", levels(gg_v$rm_grp))

plot(gg_v, xvar = "lstat", alpha = 0.5) +
  labs(y = st_labs["medv"], x = st_labs["lstat"]) +
  theme(legend.position = "none") +
  scale_color_brewer(palette = "Set3") +
  facet_wrap(~rm_grp)


## ----coplot-lstat, fig.cap="Predicted medv vs. rooms, conditional on lower-status groups."----
lstat_pts <- quantile_pts(rfsrc_Boston$xvar$lstat, groups = 6,
                          intervals = TRUE)
gg_v$lstat_grp <- cut(rfsrc_Boston$xvar$lstat, breaks = lstat_pts)
levels(gg_v$lstat_grp) <- paste("lstat in", levels(gg_v$lstat_grp))

plot(gg_v, xvar = "rm", alpha = 0.5) +
  labs(y = st_labs["medv"], x = st_labs["rm"]) +
  theme(legend.position = "none") +
  scale_color_brewer(palette = "Set3") +
  facet_wrap(~lstat_grp)


## ----surface-data, cache=TRUE-------------------------------------------------
rm_grid <- quantile_pts(rfsrc_Boston$xvar$rm, groups = 25)

surface_list <- lapply(rm_grid, function(rm_val) {
  newx <- rfsrc_Boston$xvar
  newx$rm <- rm_val
  pd_rm <- gg_partial_rfsrc(rfsrc_Boston, xvar.names = "lstat", newx = newx)
  df <- pd_rm$continuous
  df$rm <- rm_val
  df
})

surface_df <- bind_rows(surface_list)


## ----plotly-surface, fig.cap="Interactive partial dependence surface: median home value as a function of lstat and rm."----
if (requireNamespace("plotly", quietly = TRUE)) {
  library(plotly)

  surface_wide <- surface_df |>
    select(lstat = x, rm, medv = yhat) |>
    arrange(rm, lstat)

  lstat_vals <- sort(unique(surface_wide$lstat))
  rm_vals    <- sort(unique(surface_wide$rm))
  z_matrix   <- matrix(surface_wide$medv,
                        nrow = length(rm_vals),
                        ncol = length(lstat_vals),
                        byrow = TRUE)

  plot_ly(x = lstat_vals, y = rm_vals, z = z_matrix) |>
    add_surface(colorscale = "Viridis", showscale = TRUE) |>
    layout(
      scene = list(
        xaxis = list(title = "Lower Status (%)"),
        yaxis = list(title = "Rooms per Dwelling"),
        zaxis = list(title = "Median Value ($1000s)")
      )
    )
} else {
  message("Install the plotly package for interactive 3D surfaces.")
  ggplot(surface_df, aes(x = x, y = rm, fill = yhat)) +
    geom_tile() +
    scale_fill_viridis_c(name = "Median Value") +
    labs(x = "Lower Status (%)", y = "Rooms per Dwelling") +
    theme_bw()
}

Try the ggRandomForests package in your browser

Any scripts or data that you put into this service are public.

ggRandomForests documentation built on May 2, 2026, 5:06 p.m.