R/compboost.R

#' @title Component-wise boosting
#'
#' @description
#' Fit a component-wise boosting model (`r mlr3misc::cite_bib("buhlmann2003boosting")`).
#' This class wraps the `S4` class system with [Compboost_internal]
#' as internal model representation exposed by `Rcpp`.
#' The two convenience wrappers [boostLinear()] and [boostSplines()] also
#' create objects of this class.
#'
#' To visualize the internals, see [plotBaselearnerTraces()], [plotBaselearner()], [plotFeatureImportance()],
#' [plotPEUni()], [plotTensor()], and [plotRisk()]. To visualize the contribution for
#' one new observation, see [plotIndividualContribution()].
#'
#' @references
#' `r mlr3misc::format_bib("buhlmann2003boosting")`
#'
#' @examples
#' cboost = Compboost$new(mtcars, "mpg", loss = LossQuadratic$new(), oob_fraction = 0.3)
#' cboost$addBaselearner("hp", "spline", BaselearnerPSpline, degree = 3,
#'   n_knots = 10, df = 3, differences = 2)
#' cboost$addBaselearner("wt", "spline", BaselearnerPSpline)
#' cboost$train(1000, 0)
#'
#' table(cboost$getSelectedBaselearner())
#' head(cboost$logs)
#' names(cboost$baselearner_list)
#'
#' # Access information about a base learner in the list:
#' cboost$baselearner_list$hp_spline$factory$getDF()
#' cboost$baselearner_list$hp_spline$factory$getPenalty()
#' plotBaselearner(cboost, "hp_spline")
#' @export
Compboost = R6::R6Class("Compboost",
  public = list(

    #' @field data (`data.frame()`)\cr
    #' The data used for training the model. Note: If `oob_fraction` is set, the
    #' input data is split into `data` and `data_oob`. Hence, `data` contains a
    #' subset of the input data to train the model.
    data = NULL,

    #' @field data_oob (`data.frame()`)\cr
    #' An out-of-bag data set used for risk logging or early stopping. `data_oob`
    #' is split from the input data (see the `data` field).
    data_oob = NULL,

    #' @field oob_fraction (`numeric(1)`)\cr
    #' The fraction of `nrow(input data)` defining the number of observations in
    #' `data_oob`.
    oob_fraction = NULL,

    #' @field response ([ResponseRegr] | [ResponseBinaryClassif])\cr
    #' An `S4` response object. See `?ResponseRegr` or `?ResponseBinaryClassif` for help.
    #' This object holds the current prediction, pseudo residuals and functions to
    #' transform scores. Note: This response corresponds to the `data` field and holds
    #' the predictions for that `data.frame`.
    response = NULL,

    #' @field response_oob ([ResponseRegr] | [ResponseBinaryClassif])\cr
    #' An `S4` response object. See `?ResponseRegr` or `?ResponseBinaryClassif` for help.
    #' Same as `response` but for `data_oob`.
    response_oob = NULL,

    #' @field target (`character(1)`)\cr
    #' Name of the target variable in `data`.
    target = NULL,

    #' @field id (`character(1)`)\cr
    #' Name of the data object defined in `$new(data, ...)`.
    id = NULL,

    #' @template field-optimizer
    optimizer = NULL,

    #' @template field-loss
    loss = NULL,

    #' @field learning_rate (`numeric(1)`)\cr
    #' The learning rate of the model. Note: Some optimizers dynamically vary the learning rate.
    learning_rate = NULL,

    #' @field model ([Compboost_internal])\cr
    #' The internal Compboost object exported from `Rcpp`. See `?Compboost_internal` for details.
    model = NULL,

    #' @field bl_factory_list ([BlearnerFactoryList])\cr
    #' A container with all base learners. See `?BlearnerFactoryList` for details.
    bl_factory_list = NULL,

    #' @field positive (`character(1)`)\cr
    #' The positive class in the case of binary classification.
    positive = NULL,

    #' @field stop_all (`logical(1)`)\cr
    #' Indicator whether all stoppers must return `TRUE` to early stop the algorithm.
    #' Comparable to `all()` if `stop_all = TRUE` and `any()` if `stop_all = FALSE`.
    stop_all = FALSE,

    #' @field early_stop (`logical(1)`)\cr
    #' Indicator whether early stopping is used or not.
    early_stop = FALSE,

    #' @description
    #' Creates a new instance of this [R6][R6::R6Class] class.
    #'
    #' @param data (`data.frame`)\cr
    #' The data set used to build the object. Note: The full data set is used for training
    #' if `idx_oob` and `oob_fraction` are both `NULL`. Otherwise, the data set is split into
    #' `data = data[idx_train, ]` and `data_oob = data[idx_oob, ]`.
    #' @param target (`character(1)`)\cr
    #' Character indicating the name of the target variable.
    #' @template param-optimizer
    #' @template param-loss
    #' @param learning_rate (`numeric(1)`)\cr
    #' Learning rate of the model (default is `0.05`).
    #' @param positive (`character(1)`)\cr
    #' The name of the positive class (in the case of binary classification).
    #' @param oob_fraction (`numeric(1)`)\cr
    #' The fraction of `nrow(input data)` defining the number of observations in
    #' `data_oob`. This argument is ignored if `idx_oob` is set.
    #' @param early_stop (`logical(1)`)\cr
    #' Indicator whether early stopping should be used or not.
    #' @template param-idx_oob
    #' @param stop_args (`list()`)\cr
    #' `list` containing the two elements `patience` and `eps_for_break` used for early stopping.
    #' Optionally, an initialized loss can be passed as element `loss_oob` to compute the oob risk.
    #' @param file (`character(1)`)\cr
    #' File from which a model should be loaded. If `NULL`, `data` and `target` must be defined.
    initialize = function(data = NULL, target = NULL, optimizer = NULL, loss = NULL,
      learning_rate = 0.05, positive = NULL, oob_fraction = NULL, early_stop = FALSE,
      idx_oob = NULL, stop_args = list(eps_for_break = 0, patience = 10L), file = NULL) {

      if (all(is.null(file), is.null(data), is.null(target))) {
        stop("Make sure to specify `data` and `target` or load from a file with `file = [filename].json`")
      }

      if (is.null(file)) {

        # CODE TO CREATE COMPBOOST OBJECT FROM ARGUMENTS:

        if (is.null(data)) {
          stop("Data is a required argument if no file is given.")
        } else {

          # CHECKS:
          checkmate::assertDataFrame(data, any.missing = FALSE, min.rows = 1)

        }
        if (is.null(target)) {
          stop("target is a required argument if no file is given.")
        } else {

          # CHECKS:
          if (! is.null(positive)) {
            x = data[[target]]
            ct = class(data[[target]])
            if (! inherits(x, c("character", "factor")))
              stop("Target must be of class `character` or `factor` if `positive` is specified. Target class is: ", ct)

            nux = length(unique(x))
            if (nux != 2)
              stop("Just binary classification is supported. The target has ", nux, " classes.")
          }
          checkmate::assertChoice(positive, unique(data[[target]]), null.ok = TRUE)

          if (! isRcppClass(target, "Response")) {
            if (! target %in% names(data)) {
              stop("The target ", target, " is not present within the data")
            }
          }
        }
        if (is.null(optimizer)) {
          optimizer = OptimizerCoordinateDescent$new()
        }
        if (is.null(loss)) {
          if (isRcppClass(target, "Response")) {
            tname = target$getTargetName()
          } else {
            tname = target
          }
          linit = FALSE
          if (is.numeric(data[[tname]])) {
            loss = LossQuadratic$new()
            linit = TRUE
          }
          if (is.character(data[[tname]]) || is.factor(data[[tname]])) {
            loss = LossBinomial$new()
            linit = TRUE
          }
          if (! linit) {
            stop("Was not able to automatically guess a loss class.")
          }
        }

        checkmate::assertNumeric(learning_rate, lower = 0, upper = 1, any.missing = FALSE, len = 1)
        checkmate::assertNumeric(oob_fraction, lower = 0, upper = 1, any.missing = FALSE, len = 1, null.ok = TRUE)
        checkmate::assertLogical(early_stop, any.missing = FALSE, len = 1L)
        checkmate::assertInteger(idx_oob, null.ok = TRUE, upper = nrow(data), unique = TRUE, any.missing = FALSE)

        if (inherits(loss, "C++Class")) {
          stop("Loss should be an initialized loss object by calling the constructor: ",
            deparse(substitute(loss)), "$new()")
        }

        if (! "eps_for_break" %in% names(stop_args)) stop_args[["eps_for_break"]] = 0
        if (! "patience" %in% names(stop_args)) stop_args[["patience"]] = 10L

        self$id = deparse(substitute(data))
        data = droplevels(as.data.frame(data))

        if ((! is.null(idx_oob)) || (! is.null(oob_fraction))) {
          if (is.null(idx_oob)) {
            private$p_idx_oob = sample(x = seq_len(nrow(data)), size = floor(oob_fraction * nrow(data)), replace = FALSE)
          } else {
            private$p_idx_oob = idx_oob
          }
          if ((! is.null(idx_oob)) && (! is.null(oob_fraction))) {
            warning("`oob_fraction` is ignored when a specific test index is given.")
          }
        }
        private$p_idx_train = setdiff(seq_len(nrow(data)), private$p_idx_oob)

        if (is.character(target)) {
          checkmate::assertCharacter(target)
          if (! target %in% names(data))
            stop("The target ", target, " is not present within the data")

          # With vectorToResponse we are very restricted to the task types.
          # We can just guess regression or classification. For every
          # other task one should use the Response interface!
          self$response = vectorToResponse(data[[target]][private$p_idx_train], target, positive)
        } else {
          assertRcppClass(target, "Response")
          if (nrow(target$getResponse()) != nrow(data))
            stop("Response must have same number of observations as the given dataset")
          self$response = target
        }

        self$oob_fraction = oob_fraction
        self$early_stop = early_stop
        self$target = self$response$getTargetName()
        self$data = data[private$p_idx_train, !colnames(data) %in% self$target, drop = FALSE]
        self$optimizer = optimizer
        self$loss = loss
        self$learning_rate = learning_rate

        if (self$early_stop || (! is.null(self$oob_fraction) || (! is.null(idx_oob)))) {
          self$data_oob = data[private$p_idx_oob, !colnames(data) %in% target, drop = FALSE]
          self$response_oob = data[private$p_idx_oob, self$target]
          self$response_oob = self$prepareResponse(self$response_oob)
        }

        # Initialize new base-learner factory list. All factories which are defined in
        # `addBaselearner` are registered here:
        self$bl_factory_list = BlearnerFactoryList$new()

        # Check and set stop args:
        scount = 0
        if (! is.null(stop_args$loss_oob)) scount = 1
        if (length(stop_args) > scount) {
          for (nm in c("patience", "eps_for_break")) {
            if (! nm %in% names(stop_args)) stop("Cannot find ", nm, " in 'stop_args'")
          }
          checkmate::assertCount(stop_args$patience, positive = TRUE)
          checkmate::assertNumeric(stop_args$eps_for_break, len = 1L)
        }
        private$p_stop_args = stop_args

      } else {

        # LOAD COMPBOOST FROM FILE:
        private$loadFromJson(file)

      }
    },

    #' @description
    #' Add a logger to the model.
    #'
    #' @param logger ([LoggerIteration] | [LoggerTime] | [LoggerInbagRisk] | [LoggerOobRisk])\cr
    #' The uninitialized logger.
    #' @param use_as_stopper (`logical(1)`)\cr
    #' Indicator defining whether the logger is also used as a stopper, i.e., considered for early stopping.
    #' @param logger_id (`character(1)`)\cr
    #' The id of the logger. This allows defining two loggers of the same type (e.g., risk logging) but with different arguments.
    #' @param ... Additional arguments passed to `logger$new(logger_id, use_as_stopper, ...)`.
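    #'
    #' @examples
    #' # A minimal sketch of adding a time logger before training. The arguments
    #' # `max_time` and `time_unit` are assumptions about the [LoggerTime]
    #' # constructor; see `?LoggerTime` for the authoritative signature.
    #' cboost = Compboost$new(mtcars, "mpg", loss = LossQuadratic$new())
    #' cboost$addBaselearner("hp", "spline", BaselearnerPSpline)
    #' cboost$addLogger(LoggerTime, use_as_stopper = FALSE, logger_id = "time",
    #'   max_time = 120, time_unit = "seconds")
    #' cboost$train(100, 0)
    #' head(cboost$getLoggerData())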
    addLogger = function(logger, use_as_stopper = FALSE, logger_id, ...) {
      if (! is.null(self$model)) {
        stop("Logger can not be added after training was started")
      }
      private$p_l_list[[logger_id]] = logger$new(logger_id, use_as_stopper = use_as_stopper, ...)
    },

    #' @description
    #' Get the number of the current iteration.
    #'
    #' @return
    #' `integer(1)` value.
    getCurrentIteration = function() {
      if (!is.null(self$model) && self$model$isTrained()) {
        return(length(self$model$getSelectedBaselearner()))
      } else {
        return(0)
      }
    },

    #' @description
    #' This function adds a base learner that adjusts the intercept (if selected).
    #' Adding an intercept base learner may be necessary, e.g., when adding linear effects
    #' without intercept.
    #'
    #' @param id (`character(1)`)\cr
    #' The id of the base learner (default is `"intercept"`).
    #' @template param-data_source
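    #'
    #' @examples
    #' # Sketch: combine an explicit intercept with a linear effect fitted
    #' # without intercept (constructor arguments mirror the internal call):
    #' cboost = Compboost$new(mtcars, "mpg", loss = LossQuadratic$new())
    #' cboost$addIntercept()
    #' cboost$addBaselearner("hp", "linear", BaselearnerPolynomial,
    #'   degree = 1, intercept = FALSE)
    #' cboost$train(100, 0)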
    addIntercept = function(id = "intercept", data_source = InMemoryData) {
      id_int = paste0(id, "_")
      private$p_boost_intercept = TRUE
      private$p_bl_list[[id_int]] = list()
      private$p_bl_list[[id_int]]$source = data_source$new(as.matrix(rep(1, nrow(self$data))), "intercept")
      private$p_bl_list[[id_int]]$feature = "intercept"
      private$p_bl_list[[id_int]]$factory = BaselearnerPolynomial$new(private$p_bl_list[[id_int]]$source, "",
        list(degree = 1, intercept = FALSE))

      self$bl_factory_list$registerFactory(private$p_bl_list[[id_int]]$factory)
      private$p_bl_list[[id_int]]$source = NULL
    },

    #' @description
    #' Add a base learner of one feature to the model that is considered in each iteration.
    #' Using `$addBaselearner()` only allows including univariate features. See `$addTensor()` for
    #' bivariate effect modelling and `$addComponents()` for an effect decomposition.
    #'
    #' @template param-feature
    #' @param id (`character(1)`)\cr
    #' The name of the base learner.
    #' @template param-bl_factory
    #' @template param-data_source
    #' @param ... Further arguments passed to the `$new(...)` constructor of `bl_factory`.
    addBaselearner = function(feature, id, bl_factory, data_source = InMemoryData, ...) {
      if (!is.null(self$model)) {
        stop("No base-learners can be added after training is started")
      }

      # Clear base-learners which are within the bl_list but not registered:
      idx_remove = ! names(private$p_bl_list) %in% self$bl_factory_list$getRegisteredFactoryNames()
      if (any(idx_remove)) {
        private$p_bl_list[idx_remove] = NULL
      }

      data_columns = self$data[, feature, drop = FALSE]

      if (ncol(data_columns) == 1 && !is.numeric(data_columns[, 1])) {
        e = try(
          private$addSingleCatBl(data_columns, feature, id, bl_factory, data_source, ...),
          silent = TRUE
        )
      } else {
        e = try(
          private$addSingleNumericBl(data_columns, feature, id, bl_factory, data_source, ...),
          silent = TRUE
        )
      }
      ## Remove list element if factory was not created.
      if (inherits(e, "try-error")) {
        idx_missing_factory = vapply(private$p_bl_list, function(bl) ! "factory" %in% names(bl), logical(1))
        private$p_bl_list[idx_missing_factory] = NULL
        df0 = NULL
        if ("df" %in% names(list(...))) df0 = list(...)$df
        stop(catchInternalException(e, self$data[[feature]], feature, df0))
      }
    },

    #' @description
    #' Remove a base learner from the model.
    #'
    #' @param blname (`character(1)`)\cr
    #' Name of the base learner that should be removed. Must be an element of `$getBaselearnerNames()`.
    rmBaselearner = function(blname) {
      #checkmate::assertChoice(blname, choices = names(private$p_bl_list))
      checkmate::assertChoice(blname, choices = self$bl_factory_list$getRegisteredFactoryNames())

      self$bl_factory_list$rmBaselearnerFactory(blname)
    },

    #' @description
    #' Add a row-wise tensor product of two features. Note: The base learners are pre-defined
    #' by the feature types. Numerical features use a [BaselearnerPSpline] while categorical
    #' features are included via a [BaselearnerCategoricalRidge] base learner.
    #' Including an arbitrary tensor product requires the `S4` API, i.e. using
    #' [BaselearnerTensor] on two base learners of any type.
    #'
    #' @param feature1 (`character(1)`)\cr
    #' Name of the first feature. Must be an element of `names(data)`.
    #' @param feature2 (`character(1)`)\cr
    #' Name of the second feature. Must be an element of `names(data)`.
    #' @param df (`numeric(1)`)\cr
    #' The degrees of freedom used for both base learners (this parameter overwrites `df1` and `df2`).
    #' @param df1 (`numeric(1)`)\cr
    #' The degrees of freedom used for the first base learner.
    #' @param df2 (`numeric(1)`)\cr
    #' The degrees of freedom used for the second base learner.
    #' @param isotrop (`logical(1)`)\cr
    #' Indicator defining how the two penalties are combined. If `isotrop = TRUE`,
    #' the total degrees of freedom are distributed uniformly over the dimensions, while
    #' `isotrop = FALSE` allows to penalize each of the two dimensions individually.
    #' @param ... Additional arguments passed to the `$new()` constructor of the [BaselearnerPSpline] class.
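    #'
    #' @examples
    #' # Sketch: a tensor product of two numerical features, i.e. two P-splines
    #' # combined row-wise:
    #' cboost = Compboost$new(mtcars, "mpg", loss = LossQuadratic$new())
    #' cboost$addTensor("hp", "wt", df = 4)
    #' cboost$train(100, 0)
    #' cboost$getBaselearnerNames()  # contains "hp_wt_tensor"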
    addTensor = function(feature1, feature2, df = NULL, df1 = NULL, df2 = NULL, isotrop = FALSE, ...) {
      if (!is.null(self$model)) {
        stop("No base-learners can be added after training is started")
      }
      checkmate::assertChoice(feature1, choices = names(self$data))
      checkmate::assertChoice(feature2, choices = names(self$data))

      # Clear base-learners which are within the bl_list but not registered:
      idx_remove = ! names(private$p_bl_list) %in% self$bl_factory_list$getRegisteredFactoryNames()
      if (any(idx_remove)) {
        private$p_bl_list[idx_remove] = NULL
      }

      args = list(...)
      if ("df" %in% names(args))
        warning("'df' were specified in '...', please use `df1` and `df2` to specify the degrees of freedom. Alternative: Use `df` to set `df1 = df`, and `df2 = df` to an equal value.")

      if (! is.null(df)) {
        if ((! is.null(df1)) || (! is.null(df2))) {
          warning("`df` overwrites the parmaeter `df1` and `df2`")
        }
        df1 = df
        df2 = df
      }

      args1 = args2 = args
      if (! is.null(df1)) {
        args1$df = df1
        argc1 = list(df = df1)
      } else {
        argc1 = list()
      }

      if (! is.null(df2)) {
        args2$df = df2
        argc2 = list(df = df2)
      } else {
        argc2 = list()
      }

      x1 = self$data[[feature1]]
      #checkmate::assertNumeric(x1)
      if (is.numeric(x1)) {
        ds1 = InMemoryData$new(cbind(x1), feature1)
        fac1 = BaselearnerPSpline$new(ds1, "spline", args1)
      } else {
        ds1 = CategoricalDataRaw$new(x1, feature1)
        fac1 = BaselearnerCategoricalRidge$new(ds1, "categorical", argc1)
      }

      x2 = self$data[[feature2]]
      #checkmate::assertNumeric(x2)
      if (is.numeric(x2)) {
        ds2 = InMemoryData$new(cbind(x2), feature2)
        fac2 = BaselearnerPSpline$new(ds2, "spline", args2)
      } else {
        ds2 = CategoricalDataRaw$new(x2, feature2)
        fac2 = BaselearnerCategoricalRidge$new(ds2, "categorical", argc2)
      }

      tensor = BaselearnerTensor$new(fac1, fac2, "tensor", isotrop)

      # Register tensor:
      id = paste0(feature1, "_", feature2, "_tensor")
      private$p_bl_list[[id]] = list()
      private$p_bl_list[[id]]$feature = c(feature1, feature2)
      private$p_bl_list[[id]]$factory = tensor
      self$bl_factory_list$registerFactory(private$p_bl_list[[id]]$factory)
    },

    #' @description
    #' Add an effect with individual components. A linear term is added as well as
    #' a non-linear term without the linear effect. This ensures that the linear
    #' component is selected prior to the non-linear effect. The non-linear effect
    #' is only included if a deviation from a linear effect is required.
    #'
    #' Note: Internally, a [BaselearnerPolynomial] with degree one and a [BaselearnerCentered] are added.
    #' Centering a base learner makes the design matrix dense and hence fills memory very fast.
    #' Binning may be an option to reduce the memory consumption.
    #'
    #' @template param-feature
    #' @param ... Additional arguments passed to the `$new()` constructor of the [BaselearnerPSpline] class.
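    #'
    #' @examples
    #' # Sketch: decompose the effect of `hp` into a linear part and a centered
    #' # non-linear deviation:
    #' cboost = Compboost$new(mtcars, "mpg", loss = LossQuadratic$new())
    #' cboost$addComponents("hp", df = 4)
    #' cboost$getBaselearnerNames()
    #' cboost$train(100, 0)
    #' table(cboost$getSelectedBaselearner())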
    addComponents = function(feature, ...) {
      if (!is.null(self$model)) {
        stop("No base-learners can be added after training is started")
      }

      checkmate::assertChoice(feature, choices = names(self$data))

      # Clear base-learners which are within the bl_list but not registered:
      idx_remove = ! names(private$p_bl_list) %in% self$bl_factory_list$getRegisteredFactoryNames()
      if (any(idx_remove)) {
        private$p_bl_list[idx_remove] = NULL
      }
      x = self$data[[feature]]
      checkmate::assertNumeric(x)

      pars = list(...)
      if ("bin_root" %in% names(pars))
        broot = pars[["bin_root"]]
      else
        broot = 0

      ds1 = InMemoryData$new(cbind(x), feature)
      fac1 = BaselearnerPolynomial$new(ds1, "linear", list(degree = 1, bin_root = broot))
      fac2 = BaselearnerPSpline$new(ds1, "spline", pars)
      f2cen = BaselearnerCentered$new(fac2, fac1, "spline_centered")

      # Register linear factory:
      id_lin = fac1$getBaselearnerId()
      private$p_bl_list[[id_lin]] = list()
      private$p_bl_list[[id_lin]]$feature = feature
      private$p_bl_list[[id_lin]]$factory = fac1

      self$bl_factory_list$registerFactory(private$p_bl_list[[id_lin]]$factory)

      # Register centered spline:
      id_sp = f2cen$getBaselearnerId()
      private$p_bl_list[[id_sp]] = list()
      private$p_bl_list[[id_sp]]$feature = feature
      private$p_bl_list[[id_sp]]$factory = f2cen

      self$bl_factory_list$registerFactory(private$p_bl_list[[id_sp]]$factory)
    },

    #' @description
    #' Start fitting a model.
    #'
    #' @param iteration (`integer(1)`)\cr
    #' The maximal number of iterations. The algorithm can be stopped earlier
    #' if early stopping is active.
    #' @param trace (`integer(1)`)\cr
    #' The number of iterations after which the fitting status is printed to the screen.
    #' The default `trace = -1` internally uses `trace = round(iteration / 40)`.
    #' To silently fit the model use `trace = 0`.
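    #'
    #' @examples
    #' cboost = Compboost$new(mtcars, "mpg", loss = LossQuadratic$new())
    #' cboost$addBaselearner("wt", "spline", BaselearnerPSpline)
    #' cboost$train(200, trace = 0)  # train silently
    #' cboost$train(100)             # set the model back to iteration 100
    #' cboost$getCurrentIteration()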
    train = function(iteration = 100, trace = -1) {

      if (self$bl_factory_list$getNumberOfRegisteredFactories() == 0) {
        stop("Could not train without any registered base-learner.")
      }

      checkmate::assertCount(iteration, positive = TRUE, null.ok = TRUE)
      checkmate::assertIntegerish(trace, lower = -1, upper = iteration, len = 1, null.ok = FALSE)

      if (trace == -1) {
        trace = round(iteration / 40)
      }

      # Check if it is necessary to add an initial iteration logger. This is not the case
      # when the user has already added one by calling `addLogger`:
      if (is.null(self$model)) {
        # If iteration is NULL, no new iteration logger is defined. This could be
        # used, for example, to train the algorithm and break it after a defined number of
        # hours or minutes.
        if (! is.null(iteration)) {
          # Add new logger in the case that there isn't already a custom defined one:
          if ("Rcpp_LoggerIteration" %in% vapply(private$p_l_list, class, character(1))) {
            warning("Training iterations are ignored since custom iteration logger is already defined")
          } else {
            self$addLogger(LoggerIteration, TRUE, logger_id = "_iterations", iter.max = iteration)
          }
        }
        if (self$early_stop || (! is.null(self$oob_fraction) || (! is.null(private$p_idx_oob)))) private$addOobLogger()
        # After calling `initializeModel` it isn't possible to add base-learner or logger.
        private$initializeModel()
      }
      # Just call train for the initial fitting process. If the model is already trained,
      # we use `setToIteration` to set it to a lower iteration or to continue training.
      if (! self$model$isTrained())
        self$model$train(trace)
      else {
        self$model$setToIteration(iteration, trace)
      }
      return(invisible(NULL))
    },

    #' @description
    #' Internally, each base learner is built on an [InMemoryData] object. Some
    #' methods (e.g. adding a [LoggerOobRisk]) require passing the data as
    #' `list(InMemoryData | CategoricalDataRaw)` with data objects as elements.
    #' This function converts a given `data.frame` into that format.
    #'
    #' @template param-newdata
    #'
    #' @return
    #' `list(InMemoryData | CategoricalDataRaw)` with data container as elements.
    #' Numeric features are wrapped by [InMemoryData] while categorical features
    #' are included with [CategoricalDataRaw].
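    #'
    #' @examples
    #' cboost = Compboost$new(mtcars, "mpg", loss = LossQuadratic$new())
    #' cboost$addBaselearner("hp", "spline", BaselearnerPSpline)
    #' # One data container per feature used by a base learner:
    #' str(cboost$prepareData(mtcars[1:5, ]), max.level = 1)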
    prepareData = function(newdata) {
      bl_features = unique(unlist(lapply(private$p_bl_list, function(x) x$feature)))

      if (private$p_boost_intercept)
        newdata = cbind(newdata, intercept = 1)

      nuisance = lapply(bl_features, function(blf) {
        if (! blf %in% names(newdata))
          warning("Missing feature ", blf, " in newdata. Note that this feature will be ignored in predictions.")
      })
      blf_in_newdata = bl_features[bl_features %in% names(newdata)]
      out = lapply(blf_in_newdata, function(blf) {
        if (! is.numeric(newdata[[blf]])) {
          return(CategoricalDataRaw$new(newdata[[blf]], blf))
        } else {
          return(InMemoryData$new(cbind(newdata[[blf]]), blf))
        }
      })
      names(out) = blf_in_newdata
      return(out)
    },

    #' @description
    #' Same as `$prepareData()`, but for the response. Internally, `vectorToResponse()` is
    #' used to generate a [ResponseRegr] or [ResponseBinaryClassif] object.
    #'
    #' @param response (`vector()`)\cr
    #' A `numeric` or categorical vector that is transformed into a
    #' response object.
    #'
    #' @return
    #' [ResponseRegr] | [ResponseBinaryClassif] object.
    prepareResponse = function(response) {
      pos_class = NULL
      if (grepl(pattern = "ResponseBinaryClassif", x = class(self$response)))
        pos_class = self$response$getPositiveClass()

      return(vectorToResponse(vec = response, target = self$target, pos_class = pos_class))
    },

    #' @description
    #' Calculate predictions.
    #'
    #' @template param-newdata
    #' @param as_response (`logical(1)`)\cr
    #' In the case of binary classification, `as_response = TRUE` returns predictions as
    #' response, i.e. classes.
    #'
    #' @return
    #' Vector of predictions.
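    #'
    #' @examples
    #' cboost = Compboost$new(mtcars, "mpg", loss = LossQuadratic$new())
    #' cboost$addBaselearner("hp", "spline", BaselearnerPSpline)
    #' cboost$train(100, 0)
    #' head(cboost$predict())        # predictions on the training data
    #' head(cboost$predict(mtcars))  # predictions for new data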
    predict = function(newdata = NULL, as_response = FALSE) {
      checkmate::assertDataFrame(newdata, null.ok = TRUE, min.rows = 1)
      if (is.null(newdata)) {
        return(self$model$getPrediction(as_response))
      } else {
        return(self$model$predict(self$prepareData(newdata), as_response))
      }
    },

    #' @description
    #' While `$predict()` returns the sum of all base learner predictions, this function
    #' returns a `list` with the predictions for each base learner.
    #'
    #' @template param-newdata
    #'
    #' @return
    #' Named `list()` with the included base learner names as names and the base learner
    #' predictions as elements.
    predictIndividual = function(newdata) {
      checkmate::assertDataFrame(newdata, null.ok = FALSE, min.rows = 1)
      return(self$model$predictIndividual(self$prepareData(newdata)))
    },

    #' @description
    #' Get design matrices of all (or a subset) base learners for a new `data.frame`.
    #'
    #' @template param-newdata
    #' @param blnames (`character()`)\cr
    #' Names of the base learners for which the design matrices are returned. If
    #' `is.null(blnames)`, compboost tries to guess all base learners that were
    #' constructed based on the feature names of `newdata`.
    #'
    #' @return
    #' `list(matrix | Matrix::Matrix)` matrices as elements.
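    #'
    #' @examples
    #' cboost = Compboost$new(mtcars, "mpg", loss = LossQuadratic$new())
    #' cboost$addBaselearner("hp", "spline", BaselearnerPSpline)
    #' cboost$train(100, 0)
    #' # Design matrix of the spline base learner for the first observations:
    #' str(cboost$transformData(mtcars[1:5, "hp", drop = FALSE]))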
    transformData = function(newdata, blnames = NULL) {
      if (is.null(self$model)) stop("Model must be trained first.")

      nnew = names(newdata)
      ndat = setdiff(names(self$data), self$target)
      checkmate::assertCharacter(blnames, null.ok = TRUE)
      if (! is.null(blnames)) {
        nuisance = lapply(blnames, checkmate::assertChoice, choices = names(private$p_bl_list))
      } else {
        blnames = names(private$p_bl_list)
      }
      unused_cols = nnew[! nnew %in% ndat]
      if (length(unused_cols) > 0) {
        warning(sprintf("Unused features '%s' in 'newdata' are ignored.", paste(unused_cols, collapse = ", ")))
        newdata = newdata[, setdiff(nnew, unused_cols), drop = FALSE]
        nnew = names(newdata)
      }

      nuisance = lapply(nnew, checkmate::assertChoice, choices = ndat)

      ndat = self$prepareData(newdata)
      lout = lapply(names(private$p_bl_list), function(bln) {
        bl = private$p_bl_list[[bln]]
        if (all(bl$feature %in% nnew)) {
          if (bln %in% blnames) {
            return(bl$factory$transformData(ndat)$design)
          } else {
            return(NULL)
          }
        } else {
          return(NULL)
        }
      })
      names(lout) = names(private$p_bl_list)
      lout[vapply(lout, is.null, logical(1))] = NULL
      return(lout)
    },

    #' @description
    #' Return the training risk of each iteration.
    #'
    #' @return
    #' `numeric()` vector of risk values or `NULL` if `$train()` was not called previously.
    getInbagRisk = function() {
      if (! is.null(self$model)) {
        # The risk vector also contains the risk of the initial (offset) model,
        # hence current iteration + 1 elements are returned:
        return(self$model$getRiskVector()[seq_len(self$getCurrentIteration() + 1)])
      }
      return(NULL)
    },

    #' @description
    #' Get a vector with the name of the selected base learner of each iteration.
    #'
    #' @return
    #' `character()` vector of base learner names.
    getSelectedBaselearner = function() {
      if (!is.null(self$model))
        return(self$model$getSelectedBaselearner())
      return(NULL)
    },

    #' @description
    #' Printer of the object.
    #'
    #' @return
    #' Invisibly returns the object.
    print = function() {
      p = sprintf(paste("\n",
          "Component-Wise Gradient Boosting\n",
          "Target variable: %s",
          "Number of base-learners: %s",
          "Learning rate: %s",
          "Iterations: %s\n", sep = "\n"),
        self$target, self$bl_factory_list$getNumberOfRegisteredFactories(),
        self$learning_rate, self$getCurrentIteration())

      if (! is.null(self$positive))
        p = paste0(p, sprintf("\nPositive class: %s", self$positive))

      if (! is.null(self$model)){
        offset = round(self$model$getOffset(), 4)
        if (length(offset) == 1)
          p = paste0(p, sprintf("\nOffset: %s", offset))
      }
      cat(p)
      cat("\n\n")
      print(self$loss)

      return(invisible(self))
    },

    #' @description
    #' Get the estimated coefficients.
    #'
    #' @return
    #' `list(pars, offset)` with estimated coefficients/parameters and intercept/offset.
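    #'
    #' @examples
    #' cboost = Compboost$new(mtcars, "mpg", loss = LossQuadratic$new())
    #' cboost$addBaselearner("hp", "spline", BaselearnerPSpline)
    #' cboost$train(100, 0)
    #' str(cboost$getCoef(), max.level = 1)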
    getCoef = function() {
      bl_classes = vapply(private$p_bl_list, function(bl) class(bl$factory), character(1L))
      bl_cat = bl_classes[grepl("CategoricalRidge", bl_classes)]
      if (! is.null(self$model)) {
        pars = self$model$getEstimatedParameter()
        for (blc in intersect(names(bl_cat), names(pars))) {
          dict = private$p_bl_list[[blc]]$factory$getDictionary()
          rownames(pars[[blc]]) = names(sort(dict))
        }
        for (i in seq_along(pars)) {
          bln = names(pars)[i]
          attr(pars[[bln]], "blclass") = unname(bl_classes[bln])
        }
        return(c(pars, offset = self$model$getOffset()))
      }
      return(NULL)
    },
    #' @description
    #' DEPRECATED, use `$getCoef()` instead.
    #' Get the estimated coefficients.
    #'
    #' @return
    #' `list(pars, offset)` with estimated coefficients/parameters and intercept/offset.
    getEstimatedCoef = function() {
      message("Depricated, use `$getCoef()` instead.")
      bl_classes = vapply(private$p_bl_list, function(bl) class(bl$factory), character(1L))
      bl_cat = bl_classes[grepl("Categorical", bl_classes)]
      if (! is.null(self$model)) {
        pars = self$model$getEstimatedParameter()
        for (blc in intersect(names(bl_cat), names(pars))) {
          dict = private$p_bl_list[[blc]]$factory$getDictionary()
          rownames(pars[[blc]]) = names(sort(dict))
        }
        for (i in seq_along(pars)) {
          bln = names(pars)[i]
          attr(pars[[bln]], "blclass") = unname(bl_classes[bln])
        }
        return(c(pars, offset = self$model$getOffset()))
      }
      return(NULL)
    },

    #' @description
    #' Get the names of the registered base learners.
    #'
    #' @return
    #' `character()` of base learner names.
    getBaselearnerNames = function() {
      return(names(private$p_bl_list))
    },

    #' @description
    #' Get the logged information.
    #'
    #' @return
    #' `data.frame` of logging information.
    getLoggerData = function() {
      checkModelPlotAvailability(self, check_ggplot = FALSE)

      out_list = self$model$getLoggerData()
      out_mat = out_list[[2]]
      colnames(out_mat) = out_list[[1]]

      private$p_logs = as.data.frame(out_mat)
      private$p_logs$baselearner = self$getSelectedBaselearner()
      if (! "train_risk" %in% names(private$p_logs)) {
        private$p_logs = rbind(NA, private$p_logs)
        private$p_logs$train_risk = self$getInbagRisk()
        private$p_logs$baselearner[1] = "intercept"
        if ("_iterations" %in% names(private$p_logs))
          private$p_logs[["_iterations"]][1] = 0
      }
      return(private$p_logs)
    },

    #' @description
    #' Calculate the feature importance based on the training risk. Note that early
    #' stopping should be used to get adequate importance measures.
    #'
    #' @param num_feats (`integer(1)`)\cr
    #' The number of features to consider; the names and importance values of the
    #' `num_feats` most important features are returned. If `num_feats = NULL`, all features are considered.
    #' @param aggregate_bl_by_feat (`logical(1)`)\cr
    #' Indicator whether the importance is aggregated at the feature level. For example,
    #' adding components includes two different base learners for the same feature. If
    #' `aggregate_bl_by_feat = TRUE`, the importance of these two base learners is aggregated
    #' instead of considering them individually.
    #'
    #' @return
    #' Named `numeric()` vector of length `num_feats` (if at least `num_feats` were selected)
    #' with importance values as elements.
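    #'
    #' @examples
    #' cboost = Compboost$new(mtcars, "mpg", loss = LossQuadratic$new())
    #' cboost$addBaselearner("hp", "spline", BaselearnerPSpline)
    #' cboost$addBaselearner("wt", "spline", BaselearnerPSpline)
    #' cboost$train(200, 0)
    #' cboost$calculateFeatureImportance()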
    calculateFeatureImportance = function(num_feats = NULL, aggregate_bl_by_feat = FALSE) {
      checkModelPlotAvailability(self, check_ggplot = FALSE)

      inbag_risk_differences = abs(diff(self$getInbagRisk()))
      selected_learner = self$getSelectedBaselearner()
      fcol = "baselearner"
      if (aggregate_bl_by_feat) {
        feats = vapply(private$p_bl_list, function(bl) paste(unique(bl$feature), collapse = "_"), character(1L))
        selected_learner = feats[selected_learner]
        fcol = "feature"
      }

      max_feats = length(unique(selected_learner))
      checkmate::assert_integerish(x = num_feats, lower = 1, upper = max_feats,
        any.missing = FALSE, len = 1L, null.ok = TRUE)

      if (is.null(num_feats)) {
        num_feats = max_feats
        #if (num_feats > 15L) num_feats = 15L
      }

      blearner_sums = aggregate(inbag_risk_differences, by = list(selected_learner), FUN = sum)
      colnames(blearner_sums) = c(fcol, "risk_reduction")

      out = blearner_sums[order(blearner_sums[["risk_reduction"]], decreasing = TRUE)[seq_len(num_feats)], ]
      return(out)
    },

    #' @description
    #' Save a [Compboost] object to a JSON file. Because of the underlying \code{C++} objects,
    #' it is not possible to use \code{R}'s native load and save methods.
    #'
    #' @param file (`character(1)`)\cr
    #'   Name/path to the file.
    #' @param rm_data (`logical(1)`)\cr
    #'   Remove all data from the model. This applies to the training data, response, as well as
    #'   the test data and response used for the test risk logging. Note: If data is removed, no
    #'   continuation of the training is possible after reloading. Also, everything related to
    #'   predictions based on the training data throws an error.
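    #'
    #' @examples
    #' # Sketch: store the trained model as JSON and restore it via the `file`
    #' # argument of the constructor:
    #' cboost = Compboost$new(mtcars, "mpg", loss = LossQuadratic$new())
    #' cboost$addBaselearner("hp", "spline", BaselearnerPSpline)
    #' cboost$train(100, 0)
    #' fjson = file.path(tempdir(), "cboost.json")
    #' cboost$saveToJson(fjson)
    #' cboost_new = Compboost$new(file = fjson)
    #' file.remove(fjson)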
    saveToJson = function(file, rm_data = FALSE) {
      checkmate::assertString(file)
      checkmate::assertLogical(rm_data, len = 1)
      ext = tools::file_ext(file)
      checkmate::assertChoice(ext, c("json", "JSON", "Json"))
      self$model$saveJson(file, rm_data)
    }

  ), # end public

  active = list(

    #' @field offset (`numeric()`)\cr
    #' Offset of the estimated model.
    offset = function(x) {
      if (! missing(x)) stop("`offset` is read only.")
      if (is.null(self$model)) return(NULL)
      return(self$model$getOffset())
    },

    #' @field baselearner_list (`list()`)\cr
    #' Named `list` with names `$getBaselearnerNames()`. Each element contains
    #'
    #' * `"feature"` (`character(1)`): The name of the feature from `data`.
    #' * `"factory"` (`Baselearner*`): The raw base learner `factory` object. See `?Baselearner*` for details.
    baselearner_list = function(x) {
      if (! missing(x)) stop("`baselearner_list` is read only.")
      return(private$p_bl_list)
    },

    #' @field boost_intercept (`logical(1)`)\cr
    #' Logical value indicating whether an intercept base learner was added with `$addIntercept()` or not.
    boost_intercept = function(x) {
      if (! missing(x)) stop("`boost_intercept` is read only.")
      return(private$p_boost_intercept)
    },

    #' @field logs (`data.frame`)\cr
    #' Basic information such as risk, selected base learner etc. about each iteration.
    #' If `data_oob` is set, further information about the validation/oob risk is also logged.
    #' The same applies for time logging etc. Note: The field `logs` is set and updated internally
    #' after each call to `$getLoggerData()`. Hence, it caches the logged data instead of
    #' recalculating it as `$getLoggerData()` does.
    logs = function(x) {
      if (! missing(x)) stop("`logs` is read only.")
      if (is.null(self$model)) {
        stop("Logger data can only be returned after the model has been trained.")
      }
      if (is.null(private$p_logs)) {
        private$p_logs = self$getLoggerData()
      }
      return(private$p_logs)
    },

    #' @template field-idx_oob
    idx_oob = function(x) {
      if (! missing(x)) stop("`idx_oob` is read only.")
      return(private$p_idx_oob)
    },

    #' @template field-idx_train
    idx_train = function(x) {
      if (! missing(x)) stop("`idx_train` is read only.")
      return(private$p_idx_train)
    }
  ), # end active

  private = list(
    # @field p_l_list (`list()`)\cr
    # A `list` containing the uninitialized raw logger classes, e.g. [LoggerIteration], [LoggerInbagRisk], etc.
    p_l_list = list(),

    # @field p_bl_list (`list()`)\cr
    # Named `list` with names `$getBaselearnerNames()`. Each element contains
    #
    # * `"feature"` (`character(1)`): The name of the feature from `data`.
    # * `"factory"` (`Baselearner*`): The raw base learner `factory` object. See `?Baselearner*` for details.
    p_bl_list = list(),

    # @field p_logger_list ([LoggerList])\cr
    # The raw [LoggerList] object (see `?LoggerList` for details). Used to manage
    # the loggers.
    p_logger_list = list(),

    # @field p_stop_args (`list()`)\cr
    # List of the arguments used to early stop the algorithm. The possible elements are:
    # * `"eps_for_break"`: See `?LoggerOobRisk` for details.
    # * `"patience"`: Number of consecutive iterations after which is stopped if the loss didn't get better.
    # * `"loss_oob"`: Initialized loss object (see the `loss` field for details).
    p_stop_args = list(),

    # @field p_idx_oob (see field `idx_oob`).
    p_idx_oob = NULL,

    # @field p_idx_train (see field `idx_train`).
    p_idx_train = NULL,

    # @field p_boost_intercept (see field `boost_intercept`).
    p_boost_intercept = FALSE,

    # @field p_logs (see field `logs`).
    p_logs = NULL,

    # @description
    # Initialize the model by calling the `$new()` constructor of the [Compboost_internal] object.
    initializeModel = function() {

      private$p_logger_list = LoggerList$new()
      lapply(private$p_l_list, function(logger) private$p_logger_list$registerLogger(logger))

      self$model = Compboost_internal$new(self$response, self$learning_rate,
        self$stop_all, self$bl_factory_list, self$loss,
        private$p_logger_list, self$optimizer)
    },

    # @description
    # Wrapper to add a logger for tracking the validation risk.
    addOobLogger = function() {
      if ("loss_oob" %in% names(private$p_stop_args)) {
        loss_oob = private$p_stop_args$loss_oob
        assertRcppClass(loss_oob, class(self$loss))
        control = "loss_oob"
      } else {
        loss_oob = eval(parse(text = paste0(gsub("Rcpp_", "", class(self$loss)), "$new()")))
        control = "new loss"
      }
      if (self$early_stop || (! is.null(self$oob_fraction)) || (! is.null(private$p_idx_oob))) {
        self$addLogger(logger = LoggerOobRisk, use_as_stopper = self$early_stop, logger_id = "oob_risk",
          used.loss = loss_oob, eps.for.break = private$p_stop_args$eps_for_break, patience = private$p_stop_args$patience,
          oob_data = self$prepareData(self$data_oob), oob.response = self$response_oob)
      }
    },

    # @description
    # Wrapper to add a numerical base learner.
    #
    # @param data_columns (`matrix()`)\cr
    # The raw data columns from `data` as matrix.
    # @param feature (`character()`)\cr
    # The feature names of the columns in `data_columns`.
    # @param id_fac (`character(1)`)\cr
    # The identifier of the base learner used to define the raw factory.
    # @template param-bl_factory
    # @template param-data_source
    # @param ... Additional arguments passed to the `$new(...)` call of `bl_factory`.
    addSingleNumericBl = function(data_columns, feature, id_fac, bl_factory, data_source, ...) {

      dsource = data_source$new(as.matrix(data_columns), paste(feature, collapse = "_"))
      factory = bl_factory$new(dsource, id_fac, list(...))
      id_insert = factory$getBaselearnerId()
      private$p_bl_list[[id_insert]] = list()
      private$p_bl_list[[id_insert]]$feature = feature
      private$p_bl_list[[id_insert]]$factory = factory

      self$bl_factory_list$registerFactory(private$p_bl_list[[id_insert]]$factory)
    },

    # @description
    # Wrapper to add a categorical base learner.
    #
    # @param data_column (`data.frame()`)\cr
    # The raw data column from `data`.
    # @param feature (`character(1)`)\cr
    # The feature name of the column in `data_column`.
    # @param id_fac (`character(1)`)\cr
    # The identifier of the base learner used to define the raw factory.
    # @template param-bl_factory
    # @template param-data_source
    # @param ... Additional arguments passed to the `$new(...)` call of `bl_factory`.
    addSingleCatBl = function(data_column, feature, id_fac, bl_factory, data_source, ...) {
      raw_dsource = CategoricalDataRaw$new(as.character(data_column[[feature]]), feature)
      if (bl_factory@.Data == "Rcpp_BaselearnerCategoricalRidge") {

        factory = BaselearnerCategoricalRidge$new(raw_dsource, id_fac, list(...))
        id_insert = factory$getBaselearnerId()

        private$p_bl_list[[id_insert]] = list()
        private$p_bl_list[[id_insert]]$feature = feature
        private$p_bl_list[[id_insert]]$factory = factory

        self$bl_factory_list$registerFactory(private$p_bl_list[[id_insert]]$factory)
      }
      if (bl_factory@.Data == "Rcpp_BaselearnerCategoricalBinary") {
        lvls = unlist(unique(data_column))
        # Create dummy variable for each category and use that vector as data matrix. Hence,
        # if a categorical feature has 3 groups, then these 3 groups are added as 3 different
        # base learners.
        for (lvl in lvls) {

          factory = bl_factory$new(raw_dsource, lvl, id_fac)
          cat_feat_id = factory$getBaselearnerId()

          private$p_bl_list[[cat_feat_id]] = list()
          private$p_bl_list[[cat_feat_id]]$feature = feature
          private$p_bl_list[[cat_feat_id]]$factory = factory

          self$bl_factory_list$registerFactory(private$p_bl_list[[cat_feat_id]]$factory)
        }
      }
      if (! bl_factory@.Data %in% c("Rcpp_BaselearnerCategoricalBinary", "Rcpp_BaselearnerCategoricalRidge")) {
        stop("Use `bl_factory = BaselearnerCategoricalRidge` or `bl_factory = BaselearnerCategoricalBinary` for categorical features.")
      }
    },

    # @description
    # Load a [Compboost] object from a JSON file. Because of the underlying \code{C++} objects,
    # it is not possible to use \code{R}'s native load and save methods.
    #
    # @param file (`character(1)`)\cr
    #   Name/path to the file.
    loadFromJson = function(file) {
      checkmate::assertFile(file, extension = c("json", "JSON", "Json"))

      self$model = Compboost_internal$new(file)
      self$learning_rate = self$model$getLearningRate()
      self$stop_all = self$model$useGlobalStopping()

      # RESPONSE:
      self$response = extractResponse(self$model$getResponse())
      self$positive = attr(self$response, "positive")
      self$target = self$response$getTargetName()

      # OPTIMIZER:
      self$optimizer = extractOptimizer(self$model$getOptimizer())

      # LOSS:
      self$loss = extractLoss(self$model$getLoss())

      # BASELEARNERLIST:
      self$bl_factory_list = self$model$getBaselearnerList()

      private$p_boost_intercept = "intercept" %in% self$bl_factory_list$getDataNames()
      private$p_bl_list = lapply(self$model$getFactoryMap(), function(f) {
        out = list(feature = f$getFeatureName(), factory = extractBaselearnerFactory(f))
      })

      # make active binding?
      dtmp = lapply(self$model$getDataMap(), extractData)
      self$data = do.call(data.frame, lapply(dtmp, function(d) {
        if (d$getDataType() == "in_memory") return(d$getData())
        if (d$getDataType() == "categorical") return(d$getRawData())
      }))
    }
  ) # end private
) # end Compboost