# R/LearnerClassifCompboost.R

getMinDF = function(fn, dat, df) {
  checkmate::assertDataFrame(dat)
  checkmate::assertChoice(fn, colnames(dat))
  checkmate::assertNumber(df, lower = 1)
  x = dat[[fn]]
  if (is.numeric(x))
    stop(sprintf("Feature '%s' must be a character or factor, not numeric.", fn))
  ncat = length(unique(x))
  dfout = df
  if (df > ncat) {
    message(sprintf("Number of groups in '%s' is smaller than the degrees of freedom %s. Setting 'df = %s' for feature '%s'.", fn, df, ncat, fn))
    dfout = ncat
  }
  return(dfout)
}
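
# Illustrative sketch (not run): getMinDF caps the requested degrees of freedom
# at the number of categories of a categorical feature. The data frame 'd' is
# hypothetical.
if (FALSE) {
  d = data.frame(grp = factor(c("a", "b", "a")))
  getMinDF("grp", d, df = 4) # messages about the cap and returns 2
}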


#' @title CompBoost Classification Learner
#'
#' @name mlr_learners_classif.compboost
#'
#' @description
#' Componentwise (model-based) boosting, fitted in up to three stages: univariate
#' base learners, tensor-product spline interactions of the top detected feature
#' pairs, and (optionally) deeper interactions via boosted trees.
#' Calls [compboost::Compboost()] from package \CRANpkg{compboost}.
#'
#' @template section_dictionary_learner
#' @templateVar id classif.compboost
#'
#' @export
#' @template seealso_learner
#' @template example
LearnerClassifCompboost = R6Class("LearnerClassifCompboost",
  inherit = LearnerClassif,
  public = list(

    #' @description
    #' Create a `LearnerClassifCompboost` object.
    initialize = function() {
      cores_max = parallel::detectCores()
      ps = ParamSet$new(
        params = list(
          ParamDbl$new(id = "df", default = 4, lower = 1, tags = "train"),
          ParamDbl$new(id = "df_cat", default = 4, lower = 1, tags = "train"),
          ParamInt$new(id = "iters_max_univariate", default = 10000L, lower = 1L, tags = "train"),
          ParamInt$new(id = "iters_max_interactions", default = 10000L, lower = 1L, tags = "train"),
          # ParamDbl$new(id = "learning_rate_univariate", default = 0.01, lower = 0),
          # ParamDbl$new(id = "learning_rate_interactions", default = 0.1, lower = 0),
          ParamDbl$new(id = "n_knots_univariate", default = 20L, lower = 4, tags = "train"),
          ParamDbl$new(id = "n_knots_interactions", default = 10L, lower = 4, tags = "train"),
          ParamLgl$new(id = "use_components", default = TRUE, tags = "train"),
          ParamInt$new(id = "ncores", default = 1L, lower = 1L, upper = parallel::detectCores() - 1L, tags = c("train", "test")),
          ParamDbl$new(id = "stop_epsylon_for_break", default = 0.00001, lower = 0, upper = 1, tags = "train"),
          ParamDbl$new(id = "stop_patience", default = 10L, lower = 1L, tags = "train"),
          ParamDbl$new(id = "val_fraction", default = 0.33, lower = 0, upper = 1, tags = "train"),
          ParamDbl$new(id = "top_interactions", default = 0.02, lower = 0.01, upper = 1, tags = "train"),
          ParamDbl$new(id = "n_min_interactions", default = 10L, lower = 0, tags = "train"),
          ParamLgl$new(id = "use_early_stopping", default = TRUE, tags = "train"),
          ParamLgl$new(id = "show_output", default = FALSE, tags = "train"),
          ParamLgl$new(id = "just_univariate", default = FALSE, tags = "train"),
          ParamLgl$new(id = "add_deeper_interactions", default = FALSE, tags = "train"),
          ParamInt$new(id = "iters_deeper_interactions", default = 500, lower = 0, tags = "train"),
          #ParamDbl$new(id = "learning_rate_deeper_interactions", default = 0.2, lower = 0, upper = 1),
          ParamInt$new(id = "train_time_total", default = 0, lower = 0, tags = "train"),
          ParamInt$new(id = "n_threshold_binning", default = 4900, lower = 0, tags = "train"),
          ParamDbl$new(id = "learning_rate", default = 0.01, lower = 0, tags = "train")
        ))
      ps$values = list(
        # General pars:
        df = 6,
        n_threshold_binning = 4000L,
        use_components = TRUE,
        show_output = FALSE,
        learning_rate = 0.01,
        ncores = parallel::detectCores() - 1L,

        # Univariate model:
        #learning_rate_univariate = 0.1,
        n_knots_univariate = 15,
        iters_max_univariate = 10000L,

        # Interaction model (tensor splines):
        just_univariate = FALSE,
        top_interactions = 0.02,
        #learning_rate_interactions = 0.15,
        n_knots_interactions = 8,
        iters_max_interactions = 10000L,

        # Control deeper interactions (trees):
        add_deeper_interactions = TRUE,
        iters_deeper_interactions = 500L,
        #learning_rate_deeper_interactions = 0.15,

        # Control early stopping:
        use_early_stopping = TRUE,
        stop_patience = 10L,
        stop_epsylon_for_break = 1e-6,
        train_time_total = 120 # minutes, i.e. restrict the total training time to 2 hours
      )

      super$initialize(
        id = "classif.compboost",
        packages = "compboost",
        feature_types = c("numeric", "factor", "integer", "character"),
        predict_types = c("response", "prob"),
        param_set = ps,
        properties = c("twoclass")
      )
    },

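    #' @description
    #' Summarize the logged risk per training stage (featureless, univariate,
    #' pairwise interactions, deep interactions): the final risk value, the risk
    #' reduction each stage explains, and the number of iterations used.
    #'
    #' @param log_entry (`character(1)`)
    #'   Name of the logged risk column to extract, e.g. `"train_risk"` or `"oob_risk"`.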
    getRiskStages = function(log_entry = "train_risk") {
      if (is.null(self$model))
        stop("Train learner first to extract risk values.")

      out = data.frame(stage = c("featureless", "univariate", "pairwise-interactions", "deep-interactions"),
        value = rep(NA_real_, 4L), explained = rep(NA_real_, 4L), percentage = rep(NA_real_, 4L),
        iterations = rep(NA_integer_, 4L))

      if ("univariate" %in% names(self$model)) {
        logs = self$model$univariate$getLoggerData()
        if (! log_entry %in% names(logs))
          stop(sprintf("No log entry '%s' found in compboost log.", log_entry))
        out$value[1] = logs[[log_entry]][logs$baselearner == "intercept"]
        out$value[2] = tail(logs[[log_entry]], 1)
        out$iterations[1] = 0
        out$iterations[2] = max(logs[["_iterations"]])
      } else {
        stop("At least univariate model must be given!")
      }
      if ("interactions" %in% names(self$model)) {
        logs = self$model$interactions$getLoggerData()
        if (! log_entry %in% names(logs))
          stop(sprintf("No log entry '%s' found in compboost log.", log_entry))
        out$value[3] = tail(logs[[log_entry]], 1)
        out$iterations[3] = max(logs[["_iterations"]])
      }
      if ("deeper_interactions" %in% names(self$model)) {
        vals = vapply(X = self$model$deeper_interactions$trees, FUN = function(tree) {
          if (! log_entry %in% names(tree))
            stop(sprintf("Cannot find log entry '%s' in tree.", log_entry))
          return(tree[[log_entry]])
        }, FUN.VALUE = numeric(1L))
        if (length(vals) == 0)
          out$value[4] = NA_real_
        else
          out$value[4] = tail(vals, 1L)

        out$iterations[4] = length(vals)
      }
      out$explained = c(0, -diff(out$value))
      out$percentage = out$explained / sum(out$explained, na.rm = TRUE)
      return(out)
    }
  ),

  private = list(
    .train = function(task) {

      time0 = proc.time()

      # TODO: Use the loss/risk function of the loss object in cboost:
      logLoss = function(truth, pred) log(1 + exp(-2 * truth * pred))
      logRisk = function(truth, pred) mean(logLoss(truth, pred))
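      # Note: these helpers mirror the binomial loss used below
      # (compboost::LossBinomial with y in {-1, 1}); logRisk is only passed to
      # the deeper-interaction booster for early stopping.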

      ### Set params with default and user defined ones:
      pdefaults = self$param_set$default
      pars      = self$param_set$values

      self$param_set$values = mlr3misc::insert_named(pdefaults, pars)

      ### Use binning only for larger tasks. Candidate bin roots are 2, 1.67 and
      ### 1.33 (decreasingly aggressive), with 0 (no binning) as the last resort:
      if (task$nrow >= self$param_set$values$n_threshold_binning)
        bin_roots = c(seq(2, 1, length.out = 4)[-4], 0)
      else
        bin_roots = 0L


      ### Get stop arguments and train/test indices:
      stop_args = list(eps_for_break = self$param_set$values$stop_epsylon_for_break,
        patience = self$param_set$values$stop_patience)

      if (self$param_set$values$use_early_stopping)
        test_idx  = as.integer(sample(seq_len(task$nrow), trunc(task$nrow * self$param_set$values$val_fraction)))
      else
        test_idx = NULL

      train_idx = setdiff(seq_len(task$nrow), test_idx)

      ## Too aggressive binning can reduce the effective data size so much that
      ## the maths fails. We try to catch that by lowering the bin root whenever
      ## such an error occurs:
      for (bin_root in bin_roots) {

        optimizer = OptimizerCoordinateDescent$new(self$param_set$values$ncores)

        ### Define compboost model for univariate features:
        loss = compboost::LossBinomial$new()
        cboost_uni = Compboost$new(data = task$data(), target = task$target_names,
          loss = loss, learning_rate = self$param_set$values$learning_rate,
          #loss = loss, learning_rate = self$param_set$values$learning_rate_univariate,
          optimizer = optimizer, idx_oob = test_idx, stop_args = stop_args,
          early_stop = self$param_set$values$use_early_stopping
        )

        ### If a maximum time is given, the logger is used for stopping. Otherwise the time is just logged:
        if (self$param_set$values$train_time_total > 0)
          cboost_uni$addLogger(LoggerTime, TRUE, "minutes", self$param_set$values$train_time_total, "minutes")
        else
          cboost_uni$addLogger(LoggerTime, FALSE, "minutes", 0, "minutes")

        # Try to train a model, if this fails, try with a smaller binning root:
        e = try({
          nuisance = lapply(task$feature_names, function(nm) {
            einternal = try({
              ### Add base-learner/components (linear + centered spline):
              if (is.numeric(task$data()[[nm]])) {
                if (self$param_set$values$use_components) {
                  cboost_uni$addComponents(nm, n_knots = self$param_set$values$n_knots_univariate,
                    df = self$param_set$values$df, bin_root = bin_root)
                } else {
                  cboost_uni$addBaselearner(nm, "spline", BaselearnerPSpline,
                    n_knots = self$param_set$values$n_knots_univariate,
                    df = self$param_set$values$df, bin_root = bin_root)
                }
              } else {
                dfcat = getMinDF(nm, cboost_uni$data, self$param_set$values$df_cat)
                cboost_uni$addBaselearner(nm, "category", BaselearnerCategoricalRidge, df = dfcat)
              }
            }, silent = TRUE)
            if (inherits(einternal, "try-error")) {
              stop(sprintf("Error while processing feature '%s': %s", nm, attr(einternal, "condition")$message))
            }
          })

          ### Train model:
          if (self$param_set$values$show_output)
            cboost_uni$train(self$param_set$values$iters_max_univariate)
          else
            nuisance = capture.output(cboost_uni$train(self$param_set$values$iters_max_univariate))
        }, silent = TRUE)

        if (inherits(e, "try-error")) {
          ecatch = FALSE
          if (grepl("chol()", attr(e, "condition")) && (bin_root > 0)) {
            i = which(bin_root == bin_roots)
            msg = paste0("Trying to catch Cholesky decomposition error.",
              "This may appear due to too aggressive binning with a root of ",
              round(bin_root, 2))
            if (i == (length(bin_roots) - 1)) {
              msg = paste0(msg, ". Trying to fit model without binning.")
            } else {
              msg = paste0(msg, ". Now trying with a smaller root of ", round(bin_roots[i+1]), ".")
            }
            warning(msg)
            ecatch = TRUE
          }
          if (grepl("toms748", attr(e, "condition")) && (bin_root > 0)) {
            i = which(bin_root == bin_roots)
            msg = sprintf("Trying to catch optimization error with toms748.
              This may appear due to too aggressive binning with a root of %s",
              round(bin_root, 2))
            if (i == (length(bin_roots) - 1)) {
              msg = paste0(msg, ". Trying to fit model without binning.")
            } else {
              msg = paste0(msg, ". Now trying with a smaller root of ", round(bin_roots[i+1]), ".")
            }
            stop(sprintf("%s: This most likely occurred because of degrees of freedom bigger than the number of groups or unique values in a numerical features.", attr(e, "condition")$message))
            ecatch = TRUE
          }
          if (! ecatch) {
            stop(attr(e, "condition"))
          }
        } else {
          break
        }
      }

      out = list()
      out[["univariate"]] = cboost_uni

      if (self$param_set$values$just_univariate) return(out)

      ### Create new task for interaction detection:
      df_new = task$data()
      df_new[[task$target_names]] = NULL

      # Access predictions:
      bin_response = ifelse(task$data()[[task$target_names]] == cboost_uni$response$getPositiveClass(), 1, -1)
      pred_uni     = cboost_uni$predict(task$data())
      pseudo_uni   = as.vector(loss$calculatePseudoResiduals(cbind(bin_response), pred_uni))
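      # Note (our reading of the loss above): for L(y, f) = log(1 + exp(-2yf))
      # with y in {-1, 1}, the pseudo residuals are the negative gradient
      # r = 2y / (1 + exp(2yf)); here they only serve as the regression target.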

      # Define new task with 'residuals' as target
      df_new$residuals = pseudo_uni
      tsk_new = TaskRegr$new(id = "residuals", backend = df_new, target = "residuals")

      ### Extract interactions based on random forest:
      extracted_interactions = na.omit(po("extract_interactions", degree = 2)$train(list(tsk_new))$output)

      ninteractions = nrow(extracted_interactions)
      ntopinteractions = ceiling(ninteractions * self$param_set$values$top_interactions)
      if (ntopinteractions < self$param_set$values$n_min_interactions)
        ntopinteractions = min(self$param_set$values$n_min_interactions, ninteractions)

      ### STOP if no interactions were detected:
      if (ntopinteractions == 0) {
        if (self$param_set$values$show_output)
          warning("No interactions were selected! Algorithm is now stopped!")
        return(out)
      }

      ### Use only the top interactions (user-defined; too many interactions make the model too slow):
      top_interactions = seq_len(ntopinteractions)

      ### Define optimizer and loss with predictions as offset to continue training instead of
      ### start from all over again:
      optimizer_int  = compboost::OptimizerCoordinateDescent$new(self$param_set$values$ncores)
      loss_int_inbag = compboost::LossBinomial$new(cbind(pred_uni[train_idx]), TRUE)

      if (self$param_set$values$use_early_stopping)
        loss_int_oob = compboost::LossBinomial$new(cbind(pred_uni[test_idx]), TRUE)
      else
        loss_int_oob = compboost::LossBinomial$new()

      ### Define interaction model based on tensor splines:
      cboost_int = Compboost$new(data = task$data(), target = task$target_names, loss = loss_int_inbag,
        #learning_rate = self$param_set$values$learning_rate_interactions, optimizer = optimizer_int,
        learning_rate = self$param_set$values$learning_rate, optimizer = optimizer_int,
        idx_oob = test_idx, stop_args = c(stop_args, loss_oob = loss_int_oob),
        early_stop = self$param_set$values$use_early_stopping)

      nuisance = lapply(top_interactions, function(i) {
        e = try({
          f1 = extracted_interactions$feat1[i]
          f2 = extracted_interactions$feat2[i]

          df1 = df2 = self$param_set$values$df
          df_cat1 = df_cat2 = self$param_set$values$df_cat

          if (! is.numeric(cboost_int$data[[f1]])) {
            df1 = getMinDF(f1, cboost_int$data, df_cat1)
          }
          if (! is.numeric(cboost_int$data[[f2]])) {
            df2 = getMinDF(f2, cboost_int$data, df_cat2)
          }

          cboost_int$addTensor(feature1 = f1, feature2 = f2,
            df1 = df1, df2 = df2, isotrop = TRUE)
        }, silent = TRUE)
      })

      ### Check whether a total training time was given. If so, compute the minutes used so far:
      if (self$param_set$values$train_time_total > 0)
        tint = (proc.time() - time0)[3] / 60
      else
        tint = -Inf

      ### STOP if the time budget is exhausted:
      if (self$param_set$values$train_time_total <= tint) return(out)

      if (self$param_set$values$train_time_total > 0) {
        tintuse = ceiling(tint)
        cboost_int$addLogger(LoggerTime, TRUE, "minutes", self$param_set$values$train_time_total - tintuse, "minutes")
      } else {
        cboost_int$addLogger(LoggerTime, FALSE, "minutes", 0, "minutes")
        tintuse = Inf
      }

      if (length(cboost_int$getBaselearnerNames()) > 0) {
        ### Train interaction model:
        if (self$param_set$values$show_output)
          cboost_int$train(self$param_set$values$iters_max_interactions)
        else
          nuisance = capture.output(cboost_int$train(self$param_set$values$iters_max_interactions))

        ### Post check that the model was really trained on the same data:
        ch1 = all.equal(cboost_uni$data, cboost_int$data)
        if (! isTRUE(ch1)) stop("Check failed! Data for both models is not equal!")

        if (self$param_set$values$use_early_stopping) {
          ch2 = all.equal(cboost_uni$response_oob$getResponse(), cboost_int$response_oob$getResponse())
          if (! isTRUE(ch2)) stop("Check failed! Response for both models is not equal!")
        }
        out[["interactions"]] = cboost_int
      } else {
        warning("No interactions were included! Cannot train interaction model.")
      }

      ### STOP if no deeper interactions are specified:
      if (! self$param_set$values$add_deeper_interactions) return(out)

      # Calculate pseudo residuals from the fitted univariate (and, if trained, interaction) model:
      pred_int = cboost_uni$predict(task$data())
      if (! is.null(cboost_int$model))
        pred_int = pred_int + cboost_int$predict(task$data())

      pseudo_int = as.vector(loss$calculatePseudoResiduals(cbind(bin_response), pred_int))

      # Define new task with 'residuals' as target
      df_new$residuals = pseudo_int
      tsk_new = TaskRegr$new(id = "residuals", backend = df_new, target = "residuals")

      #residual_booster = boostRpart(tsk_new, lr = self$param_set$values$learning_rate_deeper_interactions,
      residual_booster = boostRpart(tsk_new, lr = self$param_set$values$learning_rate,
        iters = self$param_set$values$iters_deeper_interactions,
        patience = stop_args$patience, eps_for_break = stop_args$eps_for_break,
        use_es = self$param_set$values$use_early_stopping, idx_train = train_idx,
        idx_test = test_idx, logRisk = logRisk, binary_response = bin_response,
        prediction_offset = pred_int, pseudoResiduals = loss$calculatePseudoResiduals,
        max_time = self$param_set$values$train_time_total - (proc.time() - time0)[3] / 60)

      out[["deeper_interactions"]] = residual_booster
      return(out)
    },

    .predict = function(task) {
      newdata = task$data(cols = task$feature_names)

      lin_pred = self$model$univariate$predict(newdata)
      if (! is.null(self$model$interactions))
        lin_pred = lin_pred + self$model$interactions$predict(newdata)

      if (! is.null(self$model$deeper_interactions)) {
        df_new = task$data()
        df_new$residuals = 0
        tsk_new = TaskRegr$new(id = "residuals", backend = df_new, target = "residuals")
        lin_pred = lin_pred + predict(self$model$deeper_interactions, tsk_new)
        #lin_pred = lin_pred + self$model$deeper_interactions$lrn$predict(tsk_new)$response
      }

      probs = 1 / (1 + exp(-lin_pred))

      pos = self$model$univariate$response$getPositiveClass()
      neg = setdiff(names(self$model$univariate$response$getClassTable()), pos)
      pmat = matrix(c(probs, 1 - probs), ncol = 2L, nrow = length(probs))
      colnames(pmat) = c(pos, neg)

      if (self$predict_type == "prob")
        list(prob = pmat)

      if (self$predict_type == "response")
        list(response = ifelse(probs > self$model$univariate$response$getThreshold(), pos, neg))
      else
        list(prob = pmat)
    }
  )
)



if (FALSE) {

devtools::load_all()

#library(mlr3extralearners)
#devtools::install_github("zeehio/facetscales")
#devtools::install("~/repos/compboost")

cboost_pars = list("classif.compboost",
  predict_type = "prob", df = 6, show_output = TRUE, top_interactions = 0.02,
  learning_rate = 0.01,
  train_time_total = 5,
  iters_max_univariate = 2500L, iters_max_interactions = 2500L,
  n_knots_univariate = 10, n_knots_interactions = 10,
  use_early_stopping = TRUE, stop_patience = 10L, stop_epsylon_for_break = 1e-6,
  ncores = 4)

task = tsk("spam")

lr = lrn("classif.compboost", id = "cboost",
  predict_type = "prob", df = 6, show_output = TRUE, top_interactions = 0.02,
  learning_rate = 0.01,
  train_time_total = 5,
  iters_max_univariate = 50000L, iters_max_interactions = 50000L,
  n_knots_univariate = 10, n_knots_interactions = 10,
  use_early_stopping = TRUE, stop_patience = 6L, stop_epsylon_for_break = 1e-6,
  ncores = 6)
lr$train(task)

lr = do.call(lrn, c(cboost_pars, id = "cboost"))
lr_wrf = do.call(lrn, c(cboost_pars, add_deeper_interactions = TRUE, id = "cboost with rf"))
lr_uni = do.call(lrn, c(cboost_pars, just_univariate = TRUE, id = "cboost univariate"))

options("mlr3.debug" = TRUE)

#task = tsk("oml", task_id = 359994)
grid2 = benchmark_grid(task,
  list(
    lr_uni, lr, lr_wrf,
    lrn("classif.ranger", predict_type = "prob", id = "ranger"),
    lrn("classif.log_reg", predict_type = "prob", id = "logistic regression"),
    lrn("classif.gamboost", predict_type = "prob", mstop = 5000, id = "gamboost"),
    lrn("classif.cv_glmnet", predict_type = "prob", id = "glmnet")
  ), rsmp("cv", folds = 10L))
bm = benchmark(grid2)
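
# A quick aggregate view of the benchmark (sketch; same measures as in vizBMR below):
bm$aggregate(msrs(c("classif.auc", "time_train")))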


vizBMR = function(bm) {
  library(ggplot2)
  library(dplyr)
  library(tidyr)

  scales_y = list(
    `time_train` = scale_y_continuous(trans = "log2"),
    `classif.auc` = scale_y_continuous()
  )
  bm$score(msrs(c("classif.auc", "time_train"))) %>%
    select(learner_id, classif.auc, time_train) %>%
    pivot_longer(cols = c("classif.auc", "time_train"), names_to = "Measure", values_to = "value") %>%
    ggplot(aes(x = learner_id, y = value, color = learner_id, fill = learner_id)) +
      geom_boxplot(alpha = 0.2) +
      ggsci::scale_color_uchicago() +
      ggsci::scale_fill_uchicago() +
      #scale_color_brewer(palette = "Set1") +
      #scale_fill_brewer(palette = "Set1") +
      labs(color = "Learner", fill = "Learner") +
      xlab("") +
      ylab("") +
      scale_x_discrete(guide = guide_axis(n.dodge = 2)) +
      facetscales::facet_grid_sc(vars(Measure), scales = list(y = scales_y))
}

ggsave(vizBMR(bm), filename = paste0(here::here(), "/temp/fig-test-bmr.pdf"))




devtools::load_all()

cboost_pars = list("classif.compboost",
  predict_type = "prob", df = 6, show_output = TRUE, top_interactions = 0.02,
  n_min_interactions = 5L, train_time_total = 5, use_components = FALSE,
  learning_rate = 0.01,
  iters_max_univariate = 50000L, iters_max_interactions = 50000L,
  n_knots_univariate = 10, n_knots_interactions = 10,
  use_early_stopping = TRUE, stop_patience = 10L, stop_epsylon_for_break = 1e-8)

task = tsk("sonar")

#lr_wrf = do.call(lrn, c(cboost_pars, add_deeper_interactions = TRUE, id = "cboost with rf"))
#lr_wrf$train(task)

mstop = 5000L
lrc = lrn("classif.compboost", id = "cboost components", predict_type = "prob", just_univariate = TRUE,
  use_early_stopping = FALSE, iters_max_univariate = mstop, df = 4, n_knots_univariate = 20L, show_output = TRUE,
  learning_rate = 0.1, ncores = 1)
#lrc$train(task)

lrc_mc = lrn("classif.compboost", id = "cboost components multicore", predict_type = "prob", just_univariate = TRUE,
  use_early_stopping = FALSE, iters_max_univariate = mstop, df = 4, n_knots_univariate = 20L, show_output = TRUE,
  learning_rate = 0.1, ncores = 7)
lrc_mc$train(task)

lrs = lrn("classif.compboost", id = "cboost splines", predict_type = "prob", use_components = FALSE,
  just_univariate = TRUE, use_early_stopping = FALSE, iters_max_univariate = mstop, df = 4,
  n_knots_univariate = 20L, show_output = TRUE, learning_rate = 0.1, ncores = 1)
lrs$train(task)

lrs_mc = lrn("classif.compboost", id = "cboost splines multicore", predict_type = "prob", use_components = FALSE,
  just_univariate = TRUE, use_early_stopping = FALSE, iters_max_univariate = mstop, df = 4,
  n_knots_univariate = 20L, show_output = TRUE, learning_rate = 0.1, ncores = 7)
lrs_mc$train(task)

library(mlr3extralearners)
lr_gb = lrn("classif.gamboost", predict_type = "prob", mstop = mstop, id = "gamboost")

options("mlr3.debug" = TRUE)
grid = benchmark_grid(task, list(lrc, lrc_mc, lrs, lrs_mc, lr_gb), rsmp("subsampling", repeats = 3, ratio = 0.9))
bm = benchmark(grid)
vizBMR(bm)



## Restart the R session before the next experiment:
devtools::load_all()

lr_wrf = lrn("classif.compboost", id = "cboost",
  predict_type = "prob", df = 6, show_output = TRUE, top_interactions = 0.02,
  learning_rate = 0.01,
  train_time_total = 5, add_deeper_interactions = TRUE,
  iters_max_univariate = 50000L, iters_max_interactions = 50000L,
  n_knots_univariate = 10, n_knots_interactions = 8,
  use_early_stopping = TRUE, stop_patience = 4L, stop_epsylon_for_break = 1e-7,
  ncores = 4, just_univariate = FALSE,
  iters_deeper_interactions = 1000L, n_threshold_binning = 100)

task = tsk("sonar")
lr_wrf$train(task)
pred = lr_wrf$predict(task)
pred$score(msr("classif.auc"))
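
# Stage-wise risk summary via the method defined above:
lr_wrf$getRiskStages("train_risk")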

inbag1 = lr_wrf$model$univariate$getInbagRisk()
inbag2 = lr_wrf$model$interactions$getInbagRisk()
inbag3 = sapply(lr_wrf$model$deeper_interactions$trees, function(x) x$train_risk)
if (length(inbag3) == 0) inbag3 = NA

oob1   = lr_wrf$model$univariate$getLoggerData()$oob_risk
oob2   = lr_wrf$model$interactions$getLoggerData()$oob_risk
oob3 = sapply(lr_wrf$model$deeper_interactions$trees, function(x) x$test_risk)
if (length(oob3) == 0) oob3 = NA

cutoff = length(inbag1)
cutoff2 = cutoff + length(inbag2)

rall = na.omit(c(inbag1, inbag2, inbag3, oob1, oob2, oob3))
yrg = c(min(rall), max(rall))
xrg = c(0, length(c(oob1, oob2, oob3)))

plot(c(inbag1, inbag2, inbag3), type = "l", col = "red", ylim = yrg, xlim = xrg)
lines(c(oob1, oob2, oob3), col = "blue")
abline(v = c(cutoff, cutoff2), lty = 2, col = "darkgrey")
legend("bottomleft", lty = 1, col = c("red", "blue"), legend = c("Train risk", "Validation risk"))
text(x = c(cutoff, cutoff2) + 0.01 * (length(c(inbag1, inbag2))), y = max(c(inbag1, oob1)) - 0.05 * (yrg[2] - yrg[1]),
  labels = c("Switch from univariate\nto interaction model", "Switch to deep\ninteractions"), adj = c(0, 0))


## Restart the R session before the next experiment:
#devtools::install("~/repos/compboost")
devtools::load_all()

#task = tsk("spam")
task = tsk("oml", task_id = 359994)

mstop = 100L

lr_bin = lrn("classif.compboost", predict_type = "prob", just_univariate = TRUE,
  use_components = TRUE, n_threshold_binning = 0, show_output = TRUE,
  iters_max_univariate = mstop, use_early_stopping = FALSE, ncores = 4)
#lr_bin$train(task)

lr_nobin = lrn("classif.compboost", predict_type = "prob", just_univariate = TRUE,
  use_components = TRUE, n_threshold_binning = 40000000, show_output = TRUE,
  iters_max_univariate = mstop, use_early_stopping = FALSE, ncores = 4)
#lr_nobin$train(task)

mb = microbenchmark::microbenchmark(lr_bin$train(task), lr_nobin$train(task), times = 3L)
mb

}