knitr::opts_chunk$set(
  fig.width=10,
  fig.height=6)
data.table::setDTthreads(1)
## output: rmarkdown::html_vignette above creates html where figures are limited to 700px wide.
## Above CSS from https://stackoverflow.com/questions/34906002/increase-width-of-entire-html-rmarkdown-output main-container is for html_document, body is for html_vignette
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
mlr3resampling::ResamplingSameOtherCV$new()

The goal of this vignette is to explain the older resamplers: ResamplingVariableSizeTrainCV and ResamplingSameOtherCV, which output some data which are useful for visualizing the train/test splits. If you do not want to visualize the train/test splits, then it is recommended to instead use the newer resampler, ResamplingSameOtherSizesCV (see other vignette).

Same/Other/All resampler

The goal of thie section is to explain how to quantify the extent to which it is possible to train on one data subset, and predict on another data subset. This kind of problem occurs frequently in many different problem domains:

The ideas are similar to my previous blog posts about how to do this in python and R. Below we explain how to use mlr3resampling for this purpose, in simulated regression and classification problems. To use this method in real data, the important sections to read below are named "Benchmark: computing test error," which show how to create these cross-validation experiments using mlr3 code.

Simulated regression problems

We begin by generating some data which can be used with regression algorithms. Assume there is a data set with some rows from one person, some rows from another,

N <- 300
library(data.table)
set.seed(1)
abs.x <- 2
reg.dt <- data.table(
  x=runif(N, -abs.x, abs.x),
  person=rep(1:2, each=0.5*N))
reg.pattern.list <- list(
  easy=function(x, person)x^2,
  impossible=function(x, person)(x^2+person*3)*(-1)^person)
reg.task.list <- list()
for(task_id in names(reg.pattern.list)){
  f <- reg.pattern.list[[task_id]]
  yname <- paste0("y_",task_id)
  reg.dt[, (yname) := f(x,person)+rnorm(N)][]
  task.dt <- reg.dt[, c("x","person",yname), with=FALSE]
  reg.task <- mlr3::TaskRegr$new(
    task_id, task.dt, target=yname)
  reg.task$col_roles$subset <- "person"
  reg.task$col_roles$stratum <- "person"
  reg.task$col_roles$feature <- "x"
  reg.task.list[[task_id]] <- reg.task
}
reg.dt

The table above shows some simulated data for two regression problems:

Static visualization of simulated data

First we reshape the data using the code below,

(reg.tall <- nc::capture_melt_single(
  reg.dt,
  task_id="easy|impossible",
  value.name="y"))

The table above is a more convenient form for the visualization which we create using the code below,

if(require(animint2)){
  my_theme <- theme_bw(20)
  theme_set(my_theme)
  ggplot()+
    geom_point(aes(
      x, y),
      data=reg.tall)+
    facet_grid(
      task_id ~ person,
      labeller=label_both,
      space="free",
      scales="free")+
    scale_y_continuous(
      breaks=seq(-100, 100, by=2))
}

In the simulated data above, we can see that

Benchmark: computing test error

In the code below, we define a K-fold cross-validation experiment.

(reg_same_other <- mlr3resampling::ResamplingSameOtherCV$new())

In the code below, we define two learners to compare,

(reg.learner.list <- list(
  if(requireNamespace("rpart"))mlr3::LearnerRegrRpart$new(),
  mlr3::LearnerRegrFeatureless$new()))

In the code below, we define the benchmark grid, which is all combinations of tasks (easy and impossible), learners (rpart and featureless), and the one resampling method.

(reg.bench.grid <- mlr3::benchmark_grid(
  reg.task.list,
  reg.learner.list,
  reg_same_other))

In the code below, we execute the benchmark experiment (in parallel using the multisession future plan).

if(FALSE){#for CRAN.
  if(require(future))plan("multisession")
}
if(require(lgr))get_logger("mlr3")$set_threshold("warn")
(reg.bench.result <- mlr3::benchmark(
  reg.bench.grid, store_models = TRUE))

The code below computes the test error for each split,

reg.bench.score <- mlr3resampling::score(reg.bench.result)
reg.bench.score[1]

The code below visualizes the resulting test accuracy numbers.

if(require(animint2)){
  ggplot()+
    scale_x_log10()+
    geom_point(aes(
      regr.mse, train.subsets, color=algorithm),
      shape=1,
      data=reg.bench.score)+
    facet_grid(
      task_id ~ person,
      labeller=label_both,
      scales="free")
}

It is clear from the plot above that

Interactive visualization of data, test error, and splits

The code below can be used to create an interactive data visualization which allows exploring how different functions are learned during different splits.

inst <- reg.bench.score$resampling[[1]]$instance
rect.expand <- 0.3
grid.dt <- data.table(x=seq(-abs.x, abs.x, l=101), y=0)
grid.task <- mlr3::TaskRegr$new("grid", grid.dt, target="y")
pred.dt.list <- list()
point.dt.list <- list()
for(score.i in 1:nrow(reg.bench.score)){
  reg.bench.row <- reg.bench.score[score.i]
  task.dt <- data.table(
    reg.bench.row$task[[1]]$data(),
    reg.bench.row$resampling[[1]]$instance$id.dt)
  names(task.dt)[1] <- "y"
  set.ids <- data.table(
    set.name=c("test","train")
  )[
  , data.table(row_id=reg.bench.row[[set.name]][[1]])
  , by=set.name]
  i.points <- set.ids[
    task.dt, on="row_id"
  ][
    is.na(set.name), set.name := "unused"
  ]
  point.dt.list[[score.i]] <- data.table(
    reg.bench.row[, .(task_id, iteration)],
    i.points)
  i.learner <- reg.bench.row$learner[[1]]
  pred.dt.list[[score.i]] <- data.table(
    reg.bench.row[, .(
      task_id, iteration, algorithm
    )],
    as.data.table(
      i.learner$predict(grid.task)
    )[, .(x=grid.dt$x, y=response)]
  )
}
(pred.dt <- rbindlist(pred.dt.list))
(point.dt <- rbindlist(point.dt.list))
set.colors <- c(
  train="#1B9E77",
  test="#D95F02",
  unused="white")
algo.colors <- c(
  featureless="blue",
  rpart="red")
make_person_subset <- function(DT){
  DT[, "person/subset" := person]
}
make_person_subset(point.dt)
make_person_subset(reg.bench.score)

if(require(animint2)){
  viz <- animint(
    title="SOAK algorithm: train/predict on subsets, regression",
    video="https://vimeo.com/1053413000",
    pred=ggplot()+
      ggtitle("Predictions for selected train/test split")+
      theme_animint(height=400)+
      scale_fill_manual(values=set.colors)+
      geom_point(aes(
        x, y, fill=set.name),
        showSelected="iteration",
        size=3,
        help="One dot for each train/test/unused data point.",
        shape=21,
        data=point.dt)+
      scale_color_manual(values=algo.colors)+
      geom_line(aes(
        x, y, color=algorithm,
        group=paste(algorithm, iteration)),
        help="One line for each learned prediction function.",
        showSelected="iteration",
        data=pred.dt)+
      facet_grid(
        task_id ~ `person/subset`,
        labeller=label_both,
        space="free",
        scales="free")+
      scale_x_continuous(
        "x = input/feature in regression")+
      scale_y_continuous(
        "y = output to predict in regression",
        breaks=seq(-100, 100, by=2)),
    err=ggplot()+
      ggtitle("Test error for each split")+
      theme_animint(height=400, width=350)+
      guides(fill="none")+
      scale_y_log10(
        "Mean squared error on test set")+
      scale_fill_manual(values=algo.colors)+
      scale_x_discrete(
        "People/subsets in train set")+
      geom_point(aes(
        train.subsets, regr.mse, fill=algorithm),
        help="One dot per test set and learning algorithm.",
        shape=1,
        size=5,
        stroke=2,
        color="black",
        color_off=NA,
        showSelected="algorithm",
        clickSelects="iteration",
        data=reg.bench.score)+
      facet_grid(
        task_id ~ `person/subset`,
        labeller=label_both,
        scales="free"),
    diagram=ggplot()+
      ggtitle("Select train/test split")+
      theme_animint(height=400, width=300)+
      facet_grid(
        . ~ train.subsets,
        scales="free",
        space="free")+
      scale_size_manual(values=c(subset=3, fold=1))+
      scale_color_manual(values=c(subset="orange", fold="grey50"))+
      geom_rect(aes(
        xmin=-Inf, xmax=Inf,
        color=rows,
        size=rows,
        ymin=display_row, ymax=display_end),
        help="One rect per chunk of data with common fold (grey) and subset (gold).",
        fill=NA,
        data=inst$viz.rect.dt)+
      scale_fill_manual(values=set.colors)+
      geom_text(aes(
        x=ifelse(rows=="subset", Inf, -Inf),
        y=(display_row+display_end)/2,
        hjust=ifelse(rows=="subset", 1, 0),
        label=paste0(rows, "=", ifelse(rows=="subset", subset, fold))),
        help="Text labels indicate chunks of data with common fold (grey) and subset (gold).",
        showSelected="rows",
        data=data.table(train.name="same", inst$viz.rect.dt))+
      geom_rect(aes(
        xmin=iteration-rect.expand, ymin=display_row,
        xmax=iteration+rect.expand, ymax=display_end,
        fill=set.name),
        help="One rect per chunk of data assigned to train/test set in cross-validation.",
        alpha=0.5,
        alpha_off=0.5,
        color="black",
        color_off=NA,
        clickSelects="iteration",
        data=inst$viz.set.dt)+
      scale_x_continuous(
        "Split number",
        breaks=c(1,6, 7,12, 13,18))+
      scale_y_continuous(
        "Row number"),
    source="https://github.com/tdhock/mlr3resampling/blob/main/vignettes/Older_resamplers.Rmd")
}
if(FALSE){
  animint2pages(viz, "2023-12-13-train-predict-subsets-regression")
}

If you are viewing this in an installed package or on CRAN, then there will be no data viz on this page, but you can view it on: https://tdhock.github.io/2023-12-13-train-predict-subsets-regression/

Simulated classification problems

The previous section investigated a simulated regression problem, whereas in this section we simulate a binary classification problem. Assume there is a data set with some rows from one person, some rows from another,

N <- 200
library(data.table)
(full.dt <- data.table(
  label=factor(rep(c("spam","not spam"), l=N)),
  person=rep(1:2, each=0.5*N)
)[, signal := ifelse(label=="not spam", 0, 3)][])

Above each row has an person ID between 1 and 2. We can imagine a spam filtering system, that has training data for multiple people (here just two). Each row in the table above represents a message which has been labeled as spam or not, by one of the two people. Can we train on one person, and accurately predict on the other person? To do that we will need some features, which we generate/simulate below:

set.seed(1)
n.people <- length(unique(full.dt$person))
for(person.i in 1:n.people){
  use.signal.vec <- list(
    easy=rep(if(person.i==1)TRUE else FALSE, N),
    impossible=full.dt$person==person.i)
  for(task_id in names(use.signal.vec)){
    use.signal <- use.signal.vec[[task_id]]
    full.dt[
    , paste0("x",person.i,"_",task_id) := ifelse(
      use.signal, signal, 0
    )+rnorm(N)][]
  }
}
full.dt

In the table above, there are two sets of two features:

Static visualization of simulated data

Below we reshape the data to a table which is more suitable for visualization:

(scatter.dt <- nc::capture_melt_multiple(
  full.dt,
  column="x[12]",
  "_",
  task_id="easy|impossible"))

Below we visualize the pattern for each person and feature type:

if(require(animint2)){
  ggplot()+
    geom_point(aes(
      x1, x2, color=label),
      shape=1,
      data=scatter.dt)+
    facet_grid(
      task_id ~ person,
      labeller=label_both)
}

In the plot above, it is apparent that

Benchmark: computing test error

We use the code below to create a list of classification tasks, for use in the mlr3 framework.

class.task.list <- list()
for(task_id in c("easy","impossible")){
  feature.names <- grep(task_id, names(full.dt), value=TRUE)
  task.col.names <- c(feature.names, "label", "person")
  task.dt <- full.dt[, task.col.names, with=FALSE]
  this.task <- mlr3::TaskClassif$new(
    task_id, task.dt, target="label")
  this.task$col_roles$subset <- "person"
  this.task$col_roles$stratum <- c("person","label")
  this.task$col_roles$feature <- setdiff(names(task.dt), this.task$col_roles$stratum)
  class.task.list[[task_id]] <- this.task
}
class.task.list

Note in the code above that person is assigned roles subset and stratum, whereas label is assigned roles target and stratum. When adapting the code above to real data, the important part is the mlr3::TaskClassif line which tells mlr3 what data set to use, and what columns should be used for target/subset/stratum.

The code below is used to define a K-fold cross-validation experiment,

(class_same_other <- mlr3resampling::ResamplingSameOtherCV$new())

The code below is used to define the learning algorithms to test,

(class.learner.list <- list(
  if(requireNamespace("rpart"))mlr3::LearnerClassifRpart$new(),
  mlr3::LearnerClassifFeatureless$new()))

The code below defines the grid of tasks, learners, and resamplings.

(class.bench.grid <- mlr3::benchmark_grid(
  class.task.list,
  class.learner.list,
  class_same_other))

The code below runs the benchmark experiment grid. Note that each iteration can be parallelized by declaring a future plan.

if(FALSE){
  if(require(future))plan("multisession")
}
if(require(lgr))get_logger("mlr3")$set_threshold("warn")
(class.bench.result <- mlr3::benchmark(
  class.bench.grid, store_models = TRUE))

Below we compute scores (test error) for each resampling iteration, and show the first row of the result.

class.bench.score <- mlr3resampling::score(class.bench.result)
class.bench.score[1]

Finally we plot the test error values below.

if(require(animint2)){
  ggplot()+
    geom_point(aes(
      classif.ce, train.subsets, color=algorithm),
      shape=1,
      data=class.bench.score)+
    facet_grid(
      person ~ task_id,
      labeller=label_both,
      scales="free")
}

It is clear from the plot above that

Interactive visualization of data, test error, and splits

The code below can be used to create an interactive data visualization which allows exploring how different functions are learned during different splits.

inst <- class.bench.score$resampling[[1]]$instance
rect.expand <- 0.3
grid.value.dt <- scatter.dt[
, lapply(.SD, function(x)do.call(seq, c(as.list(range(x)), l=21)))
, .SDcols=c("x1","x2")]
grid.class.dt <- data.table(
  label=full.dt$label[1],
  do.call(
    CJ, grid.value.dt
  )
)
class.pred.dt.list <- list()
class.point.dt.list <- list()
for(score.i in 1:nrow(class.bench.score)){
  class.bench.row <- class.bench.score[score.i]
  task.dt <- data.table(
    class.bench.row$task[[1]]$data(),
    class.bench.row$resampling[[1]]$instance$id.dt)
  names(task.dt)[2:3] <- c("x1","x2")
  set.ids <- data.table(
    set.name=c("test","train")
  )[
  , data.table(row_id=class.bench.row[[set.name]][[1]])
  , by=set.name]
  i.points <- set.ids[
    task.dt, on="row_id"
  ][
    is.na(set.name), set.name := "unused"
  ][]
  class.point.dt.list[[score.i]] <- data.table(
    class.bench.row[, .(task_id, iteration)],
    i.points)
  if(class.bench.row$algorithm!="featureless"){
    i.learner <- class.bench.row$learner[[1]]
    i.learner$predict_type <- "prob"
    i.task <- class.bench.row$task[[1]]
    setnames(grid.class.dt, names(i.task$data()))
    grid.class.task <- mlr3::TaskClassif$new(
      "grid", grid.class.dt, target="label")
    pred.grid <- as.data.table(
      i.learner$predict(grid.class.task)
    )[, data.table(grid.class.dt, prob.spam)]
    names(pred.grid)[2:3] <- c("x1","x2")
    pred.wide <- dcast(pred.grid, x1 ~ x2, value.var="prob.spam")
    prob.mat <- as.matrix(pred.wide[,-1])
    contour.list <- contourLines(
      grid.value.dt$x1, grid.value.dt$x2, prob.mat, levels=0.5)
    class.pred.dt.list[[score.i]] <- data.table(
      class.bench.row[, .(
        task_id, iteration, algorithm
      )],
      data.table(contour.i=seq_along(contour.list))[, {
        do.call(data.table, contour.list[[contour.i]])[, .(level, x1=x, x2=y)]
      }, by=contour.i]
    )
  }
}
(class.pred.dt <- rbindlist(class.pred.dt.list))
(class.point.dt <- rbindlist(class.point.dt.list))

set.colors <- c(
  train="#1B9E77",
  test="#D95F02",
  unused="white")
algo.colors <- c(
  featureless="blue",
  rpart="red")
make_person_subset <- function(DT){
  DT[, "person/subset" := person]
}
make_person_subset(class.point.dt)
make_person_subset(class.bench.score)
if(require(animint2)){
  viz <- animint(
    title="SOAK algorithm: train/predict on subsets, classification",
    video="https://vimeo.com/manage/videos/1053464329",
    pred=ggplot()+
      ggtitle("Predictions for selected train/test split")+
      theme_animint(height=350, width=350)+
      scale_fill_manual(values=set.colors)+
      scale_color_manual(values=c(spam="black","not spam"="white"))+
      geom_point(aes(
        x1, x2, color=label, fill=set.name),
        showSelected="iteration",
        size=3,
        help="One dot for each train/test/unused data point.",
        stroke=2,
        shape=21,
        data=class.point.dt)+
      geom_path(aes(
        x1, x2, 
        group=paste(algorithm, iteration, contour.i)),
        showSelected=c("iteration","algorithm"),
        help="Red path represents decision boundary of rpart decision tree learning algorithm.",
        color=algo.colors[["rpart"]],
        data=class.pred.dt)+
      facet_grid(
        task_id ~ `person/subset`,
        labeller=label_both,
        space="free",
        scales="free")+
      scale_y_continuous(
        breaks=seq(-100, 100, by=2)),
    err=ggplot()+
      ggtitle("Test error for each split")+
      theme_animint(height=350, width=350)+
      theme(panel.margin=grid::unit(1, "lines"))+
      scale_y_continuous(
        "Classification error on test set",
        breaks=seq(0, 1, by=0.25))+
      scale_fill_manual(values=algo.colors)+
      scale_x_discrete(
        "People/subsets in train set")+
      geom_hline(aes(
        yintercept=yint),
        help="Horizontal lines highlight baseline error rate of 50%.",
        data=data.table(yint=0.5),
        color="grey50")+
      geom_point(aes(
        train.subsets, classif.ce, fill=algorithm),
        help="One dot per test set and learning algorithm.",
        shape=1,
        size=5,
        stroke=2,
        color="black",
        color_off=NA,
        clickSelects="iteration",
        data=class.bench.score)+
      facet_grid(
        task_id ~ `person/subset`,
        labeller=label_both),
    diagram=ggplot()+
      ggtitle("Select train/test split")+
      theme_animint(height=350, width=300)+
      facet_grid(
        . ~ train.subsets,
        scales="free",
        space="free")+
      scale_size_manual(values=c(subset=3, fold=1))+
      scale_color_manual(values=c(subset="orange", fold="grey50"))+
      geom_rect(aes(
        xmin=-Inf, xmax=Inf,
        color=rows,
        size=rows,
        ymin=display_row, ymax=display_end),
        help="One rect per chunk of data with common fold (grey) and subset (gold).",
        fill=NA,
        data=inst$viz.rect.dt)+
      scale_fill_manual(values=set.colors)+
      geom_text(aes(
        x=ifelse(rows=="subset", Inf, -Inf),
        y=(display_row+display_end)/2,
        hjust=ifelse(rows=="subset", 1, 0),
        label=paste0(rows, "=", ifelse(rows=="subset", subset, fold))),
        help="Text labels indicate chunks of data with common fold (grey) and subset (gold).",
        showSelected="rows",
        data=data.table(train.name="same", inst$viz.rect.dt))+
      geom_rect(aes(
        xmin=iteration-rect.expand, ymin=display_row,
        xmax=iteration+rect.expand, ymax=display_end,
        fill=set.name),
        help="One rect per chunk of data assigned to train/test set in cross-validation.",
        alpha=0.5,
        alpha_off=0.5,
        color="black",
        color_off=NA,
        clickSelects="iteration",
        data=inst$viz.set.dt)+
      scale_x_continuous(
        "Split number / cross-validation iteration",
        breaks=c(1,6, 7,12, 13,18))+
      scale_y_continuous(
        "Row number"),
    source="https://github.com/tdhock/mlr3resampling/blob/main/vignettes/Older_resamplers.Rmd")
}
if(FALSE){
  animint2pages(viz, "2023-12-13-train-predict-subsets-classification")
}

If you are viewing this in an installed package or on CRAN, then there will be no data viz on this page, but you can view it on: https://tdhock.github.io/2023-12-13-train-predict-subsets-classification/

Conclusion

In this section we have shown how to use mlr3resampling for comparing test error of models trained on same/all/other subsets.

Variable size train resampler

The goal of this section is to explain how to ResamplingVariableSizeTrainCV, which can be used to determine how many train data are necessary to provide accurate predictions on a given test set.

Simulated regression problems

The code below creates data for simulated regression problems. First we define a vector of input values,

N <- 300
abs.x <- 10
set.seed(1)
x.vec <- runif(N, -abs.x, abs.x)
str(x.vec)

Below we define a list of two true regression functions (tasks in mlr3 terminology) for our simulated data,

reg.pattern.list <- list(
  sin=sin,
  constant=function(x)0)

The constant function represents a regression problem which can be solved by always predicting the mean value of outputs (featureless is the best possible learning algorithm). The sin function will be used to generate data with a non-linear pattern that will need to be learned. Below we use a for loop over these two functions/tasks, to simulate the data which will be used as input to the learning algorithms:

library(data.table)
reg.task.list <- list()
reg.data.list <- list()
for(task_id in names(reg.pattern.list)){
  f <- reg.pattern.list[[task_id]]
  task.dt <- data.table(
    x=x.vec,
    y = f(x.vec)+rnorm(N,sd=0.5))
  reg.data.list[[task_id]] <- data.table(task_id, task.dt)
  reg.task.list[[task_id]] <- mlr3::TaskRegr$new(
    task_id, task.dt, target="y"
  )
}
(reg.data <- rbindlist(reg.data.list))

In the table above, the input is x, and the output is y. Below we visualize these data, with one task in each facet/panel:

if(require(animint2)){
  ggplot()+
    geom_point(aes(
      x, y),
      data=reg.data)+
    facet_grid(task_id ~ ., labeller=label_both)
}

In the plot above we can see two different simulated data sets (constant and sin). Note that the code above used the animint2 package, which provides interactive extensions to the static graphics of the ggplot2 package (see below section Interactive data viz).

Visualizing instance table

In the code below, we define a K-fold cross-validation experiment, with K=3 folds.

reg_size_cv <- mlr3resampling::ResamplingVariableSizeTrainCV$new()
reg_size_cv$param_set$values$train_sizes <- 6
reg_size_cv

In the output above we can see the parameters of the resampling object, all of which should be integer scalars:

Below we instantiate the resampling on one of the tasks:

reg_size_cv$instantiate(reg.task.list[["sin"]])
reg_size_cv$instance

Above we see the instance, which need not be examined by the user, but for informational purposes, it contains the following data:

Benchmark: computing test error

In the code below, we define two learners to compare,

(reg.learner.list <- list(
  if(requireNamespace("rpart"))mlr3::LearnerRegrRpart$new(),
  mlr3::LearnerRegrFeatureless$new()))

The code above defines

In the code below, we define the benchmark grid, which is all combinations of tasks (constant and sin), learners (rpart and featureless), and the one resampling method.

(reg.bench.grid <- mlr3::benchmark_grid(
  reg.task.list,
  reg.learner.list,
  reg_size_cv))

In the code below, we execute the benchmark experiment (optionally in parallel using the multisession future plan).

if(FALSE){
  if(require(future))plan("multisession")
}
if(require(lgr))get_logger("mlr3")$set_threshold("warn")
(reg.bench.result <- mlr3::benchmark(
  reg.bench.grid, store_models = TRUE))

The code below computes the test error for each split, and visualizes the information stored in the first row of the result:

reg.bench.score <- mlr3resampling::score(reg.bench.result)
reg.bench.score[1]

The output above contains all of the results related to a particular train/test split. In particular for our purposes, the interesting columns are:

The code below visualizes the resulting test accuracy numbers.

train_size_vec <- unique(reg.bench.score$train_size)
if(require(animint2)){
  ggplot()+
    scale_x_log10(
      breaks=train_size_vec)+
    scale_y_log10()+
    geom_line(aes(
      train_size, regr.mse,
      group=paste(algorithm, seed),
      color=algorithm),
      shape=1,
      data=reg.bench.score)+
    geom_point(aes(
      train_size, regr.mse, color=algorithm),
      shape=1,
      data=reg.bench.score)+
    facet_grid(
      test.fold~task_id,
      labeller=label_both,
      scales="free")
}

Above we plot the test error for each fold and train set size. There is a different panel for each task and test fold. Each line represents a random seed (ordering of data in train set), and each dot represents a specific train set size. So the plot above shows that some variation in test error, for a given test fold, is due to the random ordering of the train data.

Below we summarize each train set size, by taking the mean and standard deviation over each random seed.

reg.mean.dt <- dcast(
  reg.bench.score,
  task_id + train_size + test.fold + algorithm ~ .,
  list(mean, sd),
  value.var="regr.mse")
if(require(animint2)){
  ggplot()+
    scale_x_log10(
      breaks=train_size_vec)+
    scale_y_log10()+
    geom_ribbon(aes(
      train_size,
      ymin=regr.mse_mean-regr.mse_sd,
      ymax=regr.mse_mean+regr.mse_sd,
      fill=algorithm),
      alpha=0.5,
      data=reg.mean.dt)+
    geom_line(aes(
      train_size, regr.mse_mean, color=algorithm),
      shape=1,
      data=reg.mean.dt)+
    facet_grid(
      test.fold~task_id,
      labeller=label_both,
      scales="free")
}

The plot above shows a line for the mean, and a ribbon for the standard deviation, over the three random seeds. It is clear from the plot above that

Interactive data viz

The code below can be used to create an interactive data visualization which allows exploring how different functions are learned during different splits.

grid.dt <- data.table(x=seq(-abs.x, abs.x, l=101), y=0)
grid.task <- mlr3::TaskRegr$new("grid", grid.dt, target="y")
pred.dt.list <- list()
point.dt.list <- list()
for(score.i in 1:nrow(reg.bench.score)){
  reg.bench.row <- reg.bench.score[score.i]
  task.dt <- data.table(
    reg.bench.row$task[[1]]$data(),
    reg.bench.row$resampling[[1]]$instance$id.dt)
  set.ids <- data.table(
    set.name=c("test","train")
  )[
  , data.table(row_id=reg.bench.row[[set.name]][[1]])
  , by=set.name]
  i.points <- set.ids[
    task.dt, on="row_id"
  ][
    is.na(set.name), set.name := "unused"
  ]
  point.dt.list[[score.i]] <- data.table(
    reg.bench.row[, .(task_id, iteration)],
    i.points)
  i.learner <- reg.bench.row$learner[[1]]
  pred.dt.list[[score.i]] <- data.table(
    reg.bench.row[, .(
      task_id, iteration, algorithm
    )],
    as.data.table(
      i.learner$predict(grid.task)
    )[, .(x=grid.dt$x, y=response)]
  )
}
(pred.dt <- rbindlist(pred.dt.list))
(point.dt <- rbindlist(point.dt.list))
set.colors <- c(
  train="#1B9E77",
  test="#D95F02",
  unused="white")
algo.colors <- c(
  featureless="blue",
  rpart="red")
if(require(animint2)){
  viz <- animint(
    title="Variable size train set, regression",
    pred=ggplot()+
      ggtitle("Predictions for selected train/test split")+
      theme_animint(height=400)+
      scale_fill_manual(values=set.colors)+
      geom_point(aes(
        x, y, fill=set.name),
        help="One dot per sample in train/test/unused set.",
        showSelected="iteration",
        size=3,
        shape=21,
        data=point.dt)+
      scale_size_manual(values=c(
        featureless=3,
        rpart=2))+
      scale_color_manual(values=algo.colors)+
      geom_line(aes(
        x, y,
        color=algorithm,
        size=algorithm,
        group=paste(algorithm, iteration)),
        help="One line per learned prediction function.",
        showSelected="iteration",
        data=pred.dt)+
      facet_grid(
        task_id ~ .,
        labeller=label_both),
    err=ggplot()+
      ggtitle("Test error for each split")+
      theme_animint(width=500)+
      theme(
        panel.margin=grid::unit(1, "lines"),
        legend.position="none")+
      scale_y_log10(
        "Mean squared error on test set")+
      scale_color_manual(values=algo.colors)+
      scale_x_log10(
        "Train set size",
        breaks=train_size_vec)+
      geom_line(aes(
        train_size, regr.mse,
        group=paste(algorithm, seed),
        color=algorithm),
        help="One line per algorithm and random seed used to order train set.",
        clickSelects="seed",
        alpha_off=0.2,
        showSelected="algorithm",
        size=4,
        data=reg.bench.score)+
      facet_grid(
        test.fold~task_id,
        labeller=label_both,
        scales="free")+
      geom_point(aes(
        train_size, regr.mse,
        color=algorithm),
        help="One point per algorithm and train set size, for the selected random ordering.",
        size=5,
        stroke=3,
        fill="black",
        fill_off=NA,
        showSelected=c("algorithm","seed"),
        clickSelects="iteration",
        data=reg.bench.score),
    video="https://vimeo.com/manage/videos/1053467310",
    source="https://github.com/tdhock/mlr3resampling/blob/main/vignettes/Older_resamplers.Rmd")
}
if(FALSE){
  animint2pages(viz, "2023-12-26-train-sizes-regression")
}

If you are viewing this in an installed package or on CRAN, then there will be no data viz on this page, but you can view it on: https://tdhock.github.io/2023-12-26-train-sizes-regression/

The interactive data viz consists of two plots:

Simulated classification problems

Whereas in the section above, we focused on regression (output is a real number), in this section we simulate a binary classification problem (output if a factor with two levels).

class.N <- 900
class.abs.x <- 1
rclass <- function(){
  runif(class.N, -class.abs.x, class.abs.x)
}
library(data.table)
set.seed(1)
class.x.dt <- data.table(x1=rclass(), x2=rclass())
class.fun.list <- list(
  constant=function(...)0.5,
  xor=function(x1, x2)xor(x1>0, x2>0))
class.data.list <- list()
class.task.list <- list()
for(task_id in names(class.fun.list)){
  class.fun <- class.fun.list[[task_id]]
  y <- factor(ifelse(
    class.x.dt[, class.fun(x1, x2)+rnorm(class.N, sd=0.5)]>0.5,
    "spam", "not"))
  task.dt <- data.table(class.x.dt, y)
  this.task <- mlr3::TaskClassif$new(
    task_id, task.dt, target="y")
  this.task$col_roles$stratum <- "y"
  class.task.list[[task_id]] <- this.task
  class.data.list[[task_id]] <- data.table(task_id, task.dt)
}
(class.data <- rbindlist(class.data.list))

The simulated data table above consists of two input features (x1 and x2) along with an output/label to predict (y). Below we count the number of times each label appears in each task:

class.data[, .(count=.N), by=.(task_id, y)]

The table above shows that the spam label is the minority class (not is majority, so that will be the prediction of the featureless baseline). Below we visualize the data in the feature space:

if(require(animint2)){
  ggplot()+
    geom_point(aes(
      x1, x2, color=y),
      shape=1,
      data=class.data)+
    facet_grid(. ~ task_id, labeller=label_both)+
    coord_equal()
}

The plot above shows how the output y is related to the two inputs x1 and x2, for the two tasks.

In the mlr3 code below, we define a list of learners, our resampling method, and a benchmark grid:

class.learner.list <- list(
  if(requireNamespace("rpart"))mlr3::LearnerClassifRpart$new(),
  mlr3::LearnerClassifFeatureless$new())
size_cv <- mlr3resampling::ResamplingVariableSizeTrainCV$new()
(class.bench.grid <- mlr3::benchmark_grid(
  class.task.list,
  class.learner.list,
  size_cv))

Below we run the learning algorithm for each of the train/test splits defined by our benchmark grid:

if(FALSE){
  if(require(future))plan("multisession")
}
if(require(lgr))get_logger("mlr3")$set_threshold("warn")
(class.bench.result <- mlr3::benchmark(
  class.bench.grid, store_models = TRUE))

Below we compute scores (test error) for each resampling iteration, and show the first row of the result.

class.bench.score <- mlr3resampling::score(class.bench.result)
class.bench.score[1]

The output above has columns which are very similar to the regression example in the previous section. The main difference is the classif.ce column, which is the classification error on the test set.

Finally we plot the test error values below.

if(require(animint2)){
  ggplot()+
    geom_line(aes(
      train_size, classif.ce,
      group=paste(algorithm, seed),
      color=algorithm),
      shape=1,
      data=class.bench.score)+
    geom_point(aes(
      train_size, classif.ce, color=algorithm),
      shape=1,
      data=class.bench.score)+
    facet_grid(
      task_id ~ test.fold,
      labeller=label_both)+
    scale_x_log10(
      breaks=unique(class.bench.score$train_size))+
    scale_y_continuous(
      "Test error rate",
      limits=c(0.1,0.6),
      breaks=seq(0.1,0.6,by=0.1))
}

It is clear from the plot above that

Exercise for the reader: compute and plot mean and SD for these classification tasks, similar to the plot for the regression tasks in the previous section.

Interactive visualization of data, test error, and splits

The code below can be used to create an interactive data visualization which allows exploring how different functions are learned during different splits.

class.grid.vec <- seq(-class.abs.x, class.abs.x, l=21)
class.grid.dt <- CJ(x1=class.grid.vec, x2=class.grid.vec)
class.pred.dt.list <- list()
class.point.dt.list <- list()
for(score.i in 1:nrow(class.bench.score)){
  class.bench.row <- class.bench.score[score.i]
  task.dt <- data.table(
    class.bench.row$task[[1]]$data(),
    class.bench.row$resampling[[1]]$instance$id.dt)
  set.ids <- data.table(
    set.name=c("test","train")
  )[
  , data.table(row_id=class.bench.row[[set.name]][[1]])
  , by=set.name]
  i.points <- set.ids[
    task.dt, on="row_id"
  ][
    is.na(set.name), set.name := "unused"
  ][]
  class.point.dt.list[[score.i]] <- data.table(
    class.bench.row[, .(task_id, iteration)],
    i.points)
  if(class.bench.row$algorithm!="featureless"){
    i.learner <- class.bench.row$learner[[1]]
    i.learner$predict_type <- "prob"
    i.task <- class.bench.row$task[[1]]
    grid.class.task <- mlr3::TaskClassif$new(
      "grid", class.grid.dt[, label:=factor(NA,levels(task.dt$y))], target="label")
    pred.grid <- as.data.table(
      i.learner$predict(grid.class.task)
    )[, data.table(class.grid.dt, prob.spam)]
    pred.wide <- dcast(pred.grid, x1 ~ x2, value.var="prob.spam")
    prob.mat <- as.matrix(pred.wide[,-1])
    if(length(table(prob.mat))>1){
      contour.list <- contourLines(
        class.grid.vec, class.grid.vec, prob.mat, levels=0.5)
      class.pred.dt.list[[score.i]] <- data.table(
        class.bench.row[, .(
          task_id, iteration, algorithm
        )],
        data.table(contour.i=seq_along(contour.list))[, {
          do.call(data.table, contour.list[[contour.i]])[, .(level, x1=x, x2=y)]
        }, by=contour.i]
      )
    }
  }
}
(class.pred.dt <- rbindlist(class.pred.dt.list))
(class.point.dt <- rbindlist(class.point.dt.list))

set.colors <- c(
  train="#1B9E77",
  test="#D95F02",
  unused="white")
algo.colors <- c(
  featureless="blue",
  rpart="red")
if(require(animint2)){
  viz <- animint(
    title="Variable size train sets, classification",
    pred=ggplot()+
      ggtitle("Predictions for selected train/test split")+
      theme(panel.margin=grid::unit(1, "lines"))+
      theme_animint(width=600)+
      coord_equal()+
      scale_fill_manual(values=set.colors)+
      scale_color_manual(values=c(spam="black","not spam"="white"))+
      geom_point(aes(
        x1, x2, color=y, fill=set.name),
        showSelected="iteration",
        help="One dot per data sample in the train/test/unused set.",
        size=3,
        stroke=2,
        shape=21,
        data=class.point.dt)+
      geom_path(aes(
        x1, x2, 
        group=paste(algorithm, iteration, contour.i)),
        showSelected=c("iteration","algorithm"),
        help="Red path represents decision boundary of rpart decision tree learning algorithm.",
        color=algo.colors[["rpart"]],
        data=class.pred.dt)+
      facet_grid(
        . ~ task_id,
        labeller=label_both,
        space="free",
        scales="free"),
    err=ggplot()+
      ggtitle("Test error for each split")+
      theme_animint(height=400)+
      theme(panel.margin=grid::unit(1, "lines"))+
      scale_y_continuous(
        "Classification error on test set",
        limits=c(0.1,0.6),
        breaks=seq(0.1,0.6,by=0.1))+
      scale_color_manual(values=algo.colors)+
      scale_x_log10(
        "Train set size",
        breaks=unique(class.bench.score$train_size))+
      geom_line(aes(
        train_size, classif.ce,
        group=paste(algorithm, seed),
        color=algorithm),
        help="One line per algorithm and random seed used to order train set.",
        clickSelects="seed",
        alpha_off=0.2,
        showSelected="algorithm",
        size=4,
        data=class.bench.score)+
      facet_grid(
        test.fold~task_id,
        labeller=label_both,
        scales="free")+
      geom_point(aes(
        train_size, classif.ce,
        color=algorithm),
        size=5,
        stroke=3,
        fill="black",
        fill_off=NA,
        help="One point per algorithm and train set size, for the selected random ordering.",
        showSelected=c("algorithm","seed"),
        clickSelects="iteration",
        data=class.bench.score),
    video="https://vimeo.com/1053477025",
    source="https://github.com/tdhock/mlr3resampling/blob/main/vignettes/Older_resamplers.Rmd")
}
if(FALSE){
  animint2pages(viz, "2023-12-27-train-sizes-classification")
}

If you are viewing this in an installed package or on CRAN, then there will be no data viz on this page, but you can view it on: https://tdhock.github.io/2023-12-27-train-sizes-classification/

The interactive data viz consists of two plots

Conclusion

In this section we have shown how to use mlr3resampling for comparing test error of models trained on different sized train sets.

Session info

sessionInfo()


Try the mlr3resampling package in your browser

Any scripts or data that you put into this service are public.

mlr3resampling documentation built on June 23, 2025, 5:08 p.m.