R/StackedLearner.R

Defines functions getPseudoData rowiseRatio makeSuperLearnerTask getResponse compressBaseLearners hillclimbBaseLearners stackCV stackNoCV averageBaseLearners setPredictType.StackedLearner predictLearner.StackedLearner trainLearner.StackedLearner getStackedBaseLearnerPredictions makeStackedLearner

Documented in getStackedBaseLearnerPredictions makeStackedLearner

#' @title Create a stacked learner object.
#'
#' @description A stacked learner uses predictions of several base learners and
#'   fits a super learner using these predictions as features in order to
#'   predict the outcome. The following stacking methods are available:
#'
#'   - `average`\cr Averaging of base learner predictions without weights.
#'   - `stack.nocv`\cr Fits the super learner, where in-sample predictions of
#'   the base learners are used.
#'   - `stack.cv`\cr Fits the super learner, where the base learner predictions
#'   are computed by cross-validated predictions (the resampling strategy can be
#'   set via the `resampling` argument).
#'   - `hill.climb`\cr Select a subset of base learner predictions by hill
#'   climbing algorithm.
#'   - `compress`\cr Train a neural network to compress the model from a
#'   collection of base learners.
#'
#' @param base.learners ((list of) [Learner])\cr
#'   A list of learners created with `makeLearner`.
#' @param super.learner ([Learner] | character(1))\cr
#'   The super learner that makes the final prediction based on the base
#'   learners. If you pass a string, the super learner will be created via
#'   `makeLearner`. Not used for `method = 'average'`. Default is `NULL`.
#' @param predict.type (`character(1)`)\cr
#'   Sets the type of the final prediction for `method = 'average'`. For other
#'   methods, the predict type should be set within `super.learner`. If the type
#'   of the base learner prediction, which is set up within `base.learners`, is
#'
#'    - `"prob"`\cr then `predict.type = 'prob'` will use the average of all
#'    base learner predictions and `predict.type = 'response'` will use the
#'    class with highest probability as final prediction.
#'    - `"response"`\cr then, for classification tasks with `predict.type =
#'    'prob'`, the final prediction will be the relative frequency based on the
#'    predicted base learner classes and classification tasks with `predict.type
#'    = 'response'` will use majority vote of the base learner predictions to
#'    determine the final prediction. For regression tasks, the final prediction
#'    will be the average of the base learner predictions.
#'
#' @param method (`character(1)`)\cr
#'   \dQuote{average} for averaging the predictions of the base learners,
#'   \dQuote{stack.nocv} for building a super learner using the predictions of
#'   the base learners,
#'   \dQuote{stack.cv} for building a super learner using cross-validated
#'   predictions of the base learners.
#'   \dQuote{hill.climb} for averaging the predictions of the base learners,
#'   with the weights learned from hill climbing algorithm and
#'   \dQuote{compress} for compressing the model to mimic the predictions of a
#'   collection of base learners while speeding up the predictions and reducing
#'   the size of the model. Default is \dQuote{stack.nocv},
#' @param use.feat (`logical(1)`)\cr
#'   Whether the original features should also be passed to the super learner.
#'   Not used for `method = 'average'`.
#'   Default is `FALSE`.
#' @param resampling ([ResampleDesc])\cr
#'   Resampling strategy for `method = 'stack.cv'`.
#'   Currently only CV is allowed for resampling.
#'   The default `NULL` uses 5-fold CV.
#' @param parset the parameters for `hill.climb` method, including
#'   - `replace`\cr Whether a base learner can be selected more than once.
#'   - `init`\cr Number of best models being included before the selection algorithm.
#'   - `bagprob`\cr The proportion of models being considered in one round of selection.
#'   - `bagtime`\cr The number of rounds of the bagging selection.
#'   - `metric`\cr The result evaluation metric function taking two parameters
#'   `pred` and `true`, the smaller the score the better.
#'
#' the parameters for `compress` method, including
#'
#'    - k\cr the size multiplier of the generated data
#'    - prob\cr the probability to exchange values
#'    - s\cr the standard deviation of each numerical feature
#' @examples
#' \dontshow{ if (requireNamespace("rpart")) \{ }
#' \dontshow{ if (requireNamespace("rpart")) \{ }
#' \dontshow{ if (requireNamespace("MASS")) \{ }
#' \dontshow{ if (requireNamespace("rpart")) \{ }
#' \dontshow{ if (requireNamespace("e1071")) \{ }
#' # Classification
#' data(iris)
#' tsk = makeClassifTask(data = iris, target = "Species")
#' base = c("classif.rpart", "classif.lda", "classif.svm")
#' lrns = lapply(base, makeLearner)
#' lrns = lapply(lrns, setPredictType, "prob")
#' m = makeStackedLearner(base.learners = lrns,
#'   predict.type = "prob", method = "hill.climb")
#' tmp = train(m, tsk)
#' res = predict(tmp, tsk)
#'
#' # Regression
#' data(BostonHousing, package = "mlbench")
#' tsk = makeRegrTask(data = BostonHousing, target = "medv")
#' base = c("regr.rpart", "regr.svm")
#' lrns = lapply(base, makeLearner)
#' m = makeStackedLearner(base.learners = lrns,
#'   predict.type = "response", method = "compress")
#' tmp = train(m, tsk)
#' res = predict(tmp, tsk)
#' \dontshow{ \} }
#' \dontshow{ \} }
#' \dontshow{ \} }
#' \dontshow{ \} }
#' \dontshow{ \} }
#' @export
makeStackedLearner = function(base.learners, super.learner = NULL, predict.type = NULL,
  method = "stack.nocv", use.feat = FALSE, resampling = NULL, parset = list()) {

  if (is.character(base.learners)) base.learners = lapply(base.learners, checkLearner)
  if (is.null(super.learner) && method == "compress") {
    super.learner = makeLearner(stri_paste(base.learners[[1]]$type, ".nnet"))
  }
  if (!is.null(super.learner)) {
    super.learner = checkLearner(super.learner)
    if (!is.null(predict.type)) super.learner = setPredictType(super.learner, predict.type)
  }

  base.type = unique(extractSubList(base.learners, "type"))
  if (!is.null(resampling) & method != "stack.cv") {
    stop("No resampling needed for this method")
  }
  if (is.null(resampling)) {
    resampling = makeResampleDesc("CV", iters = 5L,
      stratify = ifelse(base.type == "classif", TRUE, FALSE))
  }
  assertChoice(method, c("average", "stack.nocv", "stack.cv", "hill.climb", "compress"))
  assertClass(resampling, "ResampleDesc")

  pts = unique(extractSubList(base.learners, "predict.type"))
  if ("se" %in% pts || (!is.null(predict.type) && predict.type == "se") ||
    (!is.null(super.learner) && super.learner$predict.type == "se")) {
    stop("Predicting standard errors currently not supported.")
  }
  if (length(pts) > 1L) {
    stop("Base learner must all have the same predict type!")
  }
  if ((method == "average" || method == "hill.climb") & (!is.null(super.learner) || is.null(predict.type))) {
    stop("No super learner needed for this method or the 'predict.type' is not specified.")
  }
  if (method != "average" & method != "hill.climb" & is.null(super.learner)) {
    stop("You have to specify a super learner for this method.")
  }
  # if (method != "average" & !is.null(predict.type))
  #  stop("Predict type has to be specified within the super learner.")
  if ((method == "average" || method == "hill.climb") & use.feat) {
    stop("The original features can not be used for this method")
  }
  if (!inherits(resampling, "CVDesc")) {
    stop("Currently only CV is allowed for resampling!")
  }

  # lrn$predict.type is "response" by default change it using setPredictType
  lrn = makeBaseEnsemble(
    id = "stack",
    base.learners = base.learners,
    cl = "StackedLearner"
  )

  # get predict.type from super learner or from predict.type
  if (!is.null(super.learner)) {
    lrn = setPredictType(lrn, predict.type = super.learner$predict.type)
  } else {
    lrn = setPredictType(lrn, predict.type = predict.type)
  }

  lrn$fix.factors.prediction = TRUE
  lrn$use.feat = use.feat

  lrn$method = method
  lrn$super.learner = super.learner
  lrn$resampling = resampling
  lrn$parset = parset
  return(lrn)
}

# FIXME: see FIXME in predict.StackedLearner I don't know how to make it better.
#'
#' @title Returns the predictions for each base learner.
#'
#' @description Returns the predictions for each base learner.
#'
#' @param model ([WrappedModel])\cr Wrapped model, result of train.
#' @param newdata ([data.frame])\cr
#' New observations, for which the predictions using the specified base learners should be returned.
#' Default is `NULL` and extracts the base learner predictions that were made during the training.
#'
#' @details None.
#'
#' @export
getStackedBaseLearnerPredictions = function(model, newdata = NULL) {
  # get base learner and predict type
  bms = model$learner.model$base.models
  method = model$learner.model$method

  if (is.null(newdata)) {
    probs = model$learner.model$pred.train
  } else {
    # if (model == "stack.cv") warning("Crossvalidated predictions for new data is not possible for this method.")
    # predict prob vectors with each base model
    probs = vector("list", length(bms))
    for (i in seq_along(bms)) {
      pred = predict(bms[[i]], newdata = newdata)
      probs[[i]] = getResponse(pred, full.matrix = ifelse(method %in% c("average", "hill.climb"), TRUE, FALSE))
    }

    names(probs) = sapply(bms, function(X) X$learner$id) # names(.learner$base.learners)
  }
  return(probs)
}

#' @export
trainLearner.StackedLearner = function(.learner, .task, .subset, ...) {
  # reduce to subset we want to train ensemble on
  .task = subsetTask(.task, subset = .subset)
  switch(.learner$method,
    average = averageBaseLearners(.learner, .task),
    stack.nocv = stackNoCV(.learner, .task),
    stack.cv = stackCV(.learner, .task),
    # hill.climb = hillclimbBaseLearners(.learner, .task, ...)
    hill.climb = do.call(hillclimbBaseLearners, c(list(.learner, .task), .learner$parset)),
    compress = compressBaseLearners(.learner, .task, .learner$parset)
  )
}

# FIXME: if newdata is the same data that was also used by training, then getBaseLearnerPrediction
# won't use the crossvalidated predictions (for method = "stack.cv").
#' @export
predictLearner.StackedLearner = function(.learner, .model, .newdata, ...) {

  use.feat = .model$learner$use.feat

  # get predict.type from learner and super model (if available)
  sm.pt = .model$learner$predict.type
  sm = .model$learner.model$super.model

  # get base learner and predict type
  bms.pt = unique(extractSubList(.model$learner$base.learners, "predict.type"))

  # get task information (classif)
  td = .model$task.desc
  type = ifelse(td$type == "regr", "regr",
    ifelse(length(td$class.levels) == 2L, "classif", "multiclassif"))

  # predict prob vectors with each base model
  if (.learner$method != "compress") {
    probs = getStackedBaseLearnerPredictions(model = .model, newdata = .newdata)
  } else {
    probs = .newdata
  }

  if (.learner$method %in% c("average", "hill.climb")) {
    if (.learner$method == "hill.climb") {
      model.weight = .model$learner.model$weights
    } else {
      model.weight = rep(1 / length(probs), length(probs))
    }
    if (bms.pt == "prob") {
      # if base learner predictions are probabilities for classification
      for (i in seq_along(probs)) {
        probs[[i]] = probs[[i]] * model.weight[i]
      }
      prob = Reduce("+", probs)
      if (sm.pt == "prob") {
        # if super learner predictions should be probabilities
        return(as.matrix(prob))
      } else {
        # if super learner predictions should be responses
        return(factor(colnames(prob)[max.col(prob)], td$class.levels))
      }
    } else {
      probs = as.data.frame(probs)
      # if base learner predictions are responses
      if (type == "classif" || type == "multiclassif") {
        # if base learner predictions are responses for classification
        if (sm.pt == "prob") {
          # if super learner predictions should be probabilities, iter over rows to get proportions
          # FIXME: this is very slow + CUMBERSOME. we also do it in more places
          # we need a bbmisc fun for counting proportions in rows or cols
          # probs = apply(probs, 1L, function(x) (table(factor(x, td$class.levels))/length(x)))
          # return(setColNames(t(probs), td$class.levels))
          probs = rowiseRatio(probs, td$class.levels, model.weight)
          return(probs)
        } else {
          # if super learner predictions should be responses
          return(factor(apply(probs, 1L, computeMode), td$class.levels))
        }
      }
      if (type == "regr") {
        # if base learner predictions are responses for regression
        prob = Reduce("+", probs) / length(probs) # rowMeans(probs)
        return(prob)
      }
    }
  } else if (.learner$method == "compress") {
    probs = as.data.frame(probs)
    pred = predict(sm, newdata = probs)
    if (sm.pt == "prob") {
      return(as.matrix(getPredictionProbabilities(pred, cl = td$class.levels)))
    } else {
      return(pred$data$response)
    }
  } else {
    probs = as.data.frame(probs)
    # feed probs into super model and we are done
    feat = .newdata[, colnames(.newdata) %nin% td$target, drop = FALSE]

    if (use.feat) {
      pred.data = cbind(probs, feat)
    } else {
      pred.data = probs
    }

    pred = predict(sm, newdata = pred.data)
    if (sm.pt == "prob") {
      return(as.matrix(getPredictionProbabilities(pred, cl = td$class.levels)))
    } else {
      return(pred$data$response)
    }
  }
}

# Sets the predict.type for the super learner of a stacked learner
#' @export
setPredictType.StackedLearner = function(learner, predict.type) {
  lrn = setPredictType.Learner(learner, predict.type)
  lrn$predict.type = predict.type
  if ("super.learner" %in% names(lrn)) lrn$super.learner$predict.type = predict.type
  return(lrn)
}

### helpers to implement different ensemble types ###

# super simple averaging of base-learner predictions without weights. we should beat this
averageBaseLearners = function(learner, task) {
  bls = learner$base.learners
  base.models = probs = vector("list", length(bls))
  for (i in seq_along(bls)) {
    bl = bls[[i]]
    model = train(bl, task)
    base.models[[i]] = model
    #
    pred = predict(model, task = task)
    probs[[i]] = getResponse(pred, full.matrix = TRUE)
  }
  names(probs) = names(bls)
  list(method = "average", base.models = base.models, super.model = NULL,
    pred.train = probs)
}

# stacking where we predict the training set in-sample, then super-learn on that
stackNoCV = function(learner, task) {

  td = getTaskDesc(task)
  type = ifelse(td$type == "regr", "regr",
    ifelse(length(td$class.levels) == 2L, "classif", "multiclassif"))
  bls = learner$base.learners
  use.feat = learner$use.feat
  base.models = probs = vector("list", length(bls))
  for (i in seq_along(bls)) {
    bl = bls[[i]]
    model = train(bl, task)
    base.models[[i]] = model
    pred = predict(model, task = task)
    probs[[i]] = getResponse(pred, full.matrix = FALSE)
  }
  names(probs) = names(bls)

  pred.train = probs

  if (type == "regr" || type == "classif") {
    probs = as.data.frame(probs)
  } else {
    probs = as.data.frame(lapply(probs, function(X) X)) # X[, -ncol(X)]))
  }

  # now fit the super learner for predicted_probs --> target
  probs[[td$target]] = getTaskTargets(task)
  if (use.feat) {
    # add data with normal features
    feat = getTaskData(task)
    feat = feat[, colnames(feat) %nin% td$target, drop = FALSE]
    probs = cbind(probs, feat)
    super.task = makeSuperLearnerTask(learner, data = probs,
      target = td$target)
  } else {
    super.task = makeSuperLearnerTask(learner, data = probs, target = td$target)
  }
  super.model = train(learner$super.learner, super.task)
  list(method = "stack.no.cv", base.models = base.models,
    super.model = super.model, pred.train = pred.train)
}

# stacking where we crossval the training set with the base learners, then super-learn on that
stackCV = function(learner, task) {

  td = getTaskDesc(task)
  type = ifelse(td$type == "regr", "regr",
    ifelse(length(td$class.levels) == 2L, "classif", "multiclassif"))
  bls = learner$base.learners
  use.feat = learner$use.feat
  # cross-validate all base learners and get a prob vector for the whole dataset for each learner
  base.models = probs = vector("list", length(bls))
  rin = makeResampleInstance(learner$resampling, task = task)
  for (i in seq_along(bls)) {
    bl = bls[[i]]
    r = resample(bl, task, rin, show.info = FALSE)
    probs[[i]] = getResponse(r$pred, full.matrix = FALSE)
    # also fit all base models again on the complete original data set
    base.models[[i]] = train(bl, task)
  }
  names(probs) = names(bls)

  if (type == "regr" || type == "classif") {
    probs = as.data.frame(probs)
  } else {
    probs = as.data.frame(lapply(probs, function(X) X)) # X[, -ncol(X)]))
  }

  # add true target column IN CORRECT ORDER
  tn = getTaskTargetNames(task)
  test.inds = unlist(rin$test.inds)

  pred.train = as.list(probs[order(test.inds), , drop = FALSE])

  probs[[tn]] = getTaskTargets(task)[test.inds]

  # now fit the super learner for predicted_probs --> target
  probs = probs[order(test.inds), , drop = FALSE]
  if (use.feat) {
    # add data with normal features IN CORRECT ORDER
    feat = getTaskData(task) # [test.inds, ]
    feat = feat[, !colnames(feat) %in% tn, drop = FALSE]
    pred.data = cbind(probs, feat)
    super.task = makeSuperLearnerTask(learner, data = pred.data, target = tn)
  } else {
    super.task = makeSuperLearnerTask(learner, data = probs, target = tn)
  }
  super.model = train(learner$super.learner, super.task)
  list(method = "stack.cv", base.models = base.models,
    super.model = super.model, pred.train = pred.train)
}

hillclimbBaseLearners = function(learner, task, replace = TRUE, init = 0, bagprob = 1, bagtime = 1,
  metric = NULL, ...) {

  assertFlag(replace)
  assertInt(init, lower = 0)
  assertNumber(bagprob, lower = 0, upper = 1)
  assertInt(bagtime, lower = 1)

  td = getTaskDesc(task)
  type = ifelse(td$type == "regr", "regr",
    ifelse(length(td$class.levels) == 2L, "classif", "multiclassif"))
  if (is.null(metric)) {
    if (type == "regr") {
      metric = function(pred, true) mean((pred - true)^2)
    } else {
      metric = function(pred, true) {
        pred = colnames(pred)[max.col(pred)]
        tb = table(pred, true)
        return(1 - sum(diag(tb)) / sum(tb))
      }
    }
  }
  assertFunction(metric)

  bls = learner$base.learners
  if (type != "regr") {
    for (i in seq_along(bls)) {
      if (bls[[i]]$predict.type == "response") {
        stop("Hill climbing algorithm only takes probability predict type for classification.")
      }
    }
  }
  # cross-validate all base learners and get a prob vector for the whole dataset for each learner
  base.models = probs = vector("list", length(bls))
  rin = makeResampleInstance(learner$resampling, task = task)
  for (i in seq_along(bls)) {
    bl = bls[[i]]
    r = resample(bl, task, rin, show.info = FALSE)
    if (type == "regr") {
      probs[[i]] = matrix(getResponse(r$pred), ncol = 1)
    } else {
      probs[[i]] = getResponse(r$pred, full.matrix = TRUE)
      colnames(probs[[i]]) = task$task.desc$class.levels
    }
    # also fit all base models again on the complete original data set
    base.models[[i]] = train(bl, task)
  }
  names(probs) = names(bls)

  # add true target column IN CORRECT ORDER
  tn = getTaskTargetNames(task)
  test.inds = unlist(rin$test.inds)

  # now start the hill climbing
  probs = lapply(probs, function(x) x[order(test.inds), , drop = FALSE])
  probs[[tn]] = getTaskTargets(task)[test.inds]
  probs[[tn]] = probs[[tn]][order(test.inds)]
  # probs = probs[order(test.inds), , drop = FALSE]
  m = length(bls)
  weights = rep(0, m)
  flag = TRUE
  for (bagind in 1:bagtime) {
    # bagging of models
    bagsize = ceiling(m * bagprob)
    bagmodel = sample(1:m, bagsize)
    weight = rep(0, bagsize)

    # Initial selection of strongest learners
    inds = NULL
    if (init > 0) {
      score = rep(Inf, bagsize)
      for (i in bagmodel) {
        score[i] = metric(probs[[i]], probs[[tn]])
      }
      inds = order(score)[1:init]
      weight[inds] = 1
    }

    selection.size = init
    selection.ind = inds
    # current.prob = rep(0, nrow(probs))
    current.prob = matrix(0, nrow(probs[[1]]), ncol(probs[[1]]))
    old.score = Inf
    if (selection.size > 0) {
      current.prob = Reduce("+", probs[selection.ind])
      old.score = metric(current.prob / selection.size, probs[[tn]])
    }
    flag = TRUE

    while (flag) {
      score = rep(Inf, bagsize)
      for (i in bagmodel) {
        score[i] = metric((probs[[i]] + current.prob) / (selection.size + 1), probs[[tn]])
      }
      inds = order(score)
      if (!replace) {
        ind = setdiff(inds, selection.ind)[1]
      } else {
        ind = inds[1]
      }

      new.score = score[ind]
      if (old.score - new.score < 1e-8) {
        flag = FALSE
      } else {
        current.prob = current.prob + probs[[ind]]
        weights[ind] = weights[ind] + 1
        selection.ind = c(selection.ind, ind)
        selection.size = selection.size + 1
        old.score = new.score
      }
    }
    weights[bagmodel] = weights[bagmodel] + weight
  }
  weights = weights / sum(weights)

  list(method = "hill.climb", base.models = base.models, super.model = NULL,
    pred.train = probs, weights = weights)
}

compressBaseLearners = function(learner, task, parset = list()) {

  lrn = learner
  lrn$method = "hill.climb"
  ensemble.model = train(lrn, task)

  data = getTaskData(task, target.extra = TRUE)
  data = data[[1]]

  pseudo.data = do.call(getPseudoData, c(list(data), parset))
  pseudo.target = predict(ensemble.model, newdata = pseudo.data)
  pseudo.data = data.frame(pseudo.data, target = pseudo.target$data$response)

  td = ensemble.model$task.desc
  type = ifelse(td$type == "regr", "regr",
    ifelse(length(td$class.levels) == 2L, "classif", "multiclassif"))

  if (type == "regr") {
    new.task = makeRegrTask(data = pseudo.data, target = "target")
    if (is.null(learner$super.learner)) {
      m = makeLearner("regr.nnet", predict.type = ) # nolint
    } else {
      m = learner$super.learner
    }
  } else {
    new.task = makeClassifTask(data = pseudo.data, target = "target")
    if (is.null(learner$super.learner)) {
      m = makeLearner("classif.nnet", predict.type = "")
    } else {
      m = learner$super.learner
    }
  }

  super.model = train(m, new.task)

  list(method = "compress", base.learners = lrn$base.learners, super.model = super.model,
    pred.train = pseudo.data)
}

### other helpers ###

# Returns response for correct usage in stackNoCV and stackCV and for predictions
getResponse = function(pred, full.matrix = TRUE) {
  # if classification with probabilities
  if (pred$predict.type == "prob") {
    if (full.matrix) {
      # return matrix of probabilities
      td = pred$task.desc
      pred.return = pred$data[, stri_paste("prob", td$class.levels, sep = ".")]
      colnames(pred.return) = td$class.levels
      return(pred.return)
    } else {
      # return only vector of probabilities for binary classification
      return(getPredictionProbabilities(pred))
    }
  } else {
    # if regression task
    pred$data$response
  }
}

# Create a super learner task
makeSuperLearnerTask = function(learner, data, target) {
  if (learner$super.learner$type == "classif") {
    makeClassifTask(data = data, target = target)
  } else {
    makeRegrTask(data = data, target = target)
  }
}

# Count the ratio
rowiseRatio = function(probs, levels, model.weight = NULL) {
  m = length(levels)
  p = ncol(probs)
  if (is.null(model.weight)) {
    model.weight = rep(1 / p, p)
  }
  mat = matrix(0, nrow(probs), m)
  for (i in 1:m) {
    ids = matrix(probs == levels[i], nrow(probs), p)
    for (j in 1:p) {
      ids[, j] = ids[, j] * model.weight[j]
    }
    mat[, i] = rowSums(ids)
  }
  colnames(mat) = levels
  return(mat)
}

getPseudoData = function(.data, k = 3, prob = 0.1, s = NULL, ...) {

  res = NULL
  n = nrow(.data)
  ori.names = names(.data)
  feat.class = sapply(.data, class)
  ind2 = which(feat.class == "factor")
  ind1 = setdiff(seq_len(ncol(.data)), ind2)
  if (length(ind2) > 0) {
    ori.labels = lapply(.data[[ind2]], levels)
  }
  .data = lapply(.data, as.numeric)
  .data = as.data.frame(.data)
  # Normalization
  mn = rep(0, ncol(.data))
  mx = rep(0, ncol(.data))
  for (i in ind1) {
    mn[i] = min(.data[, i])
    mx[i] = max(.data[, i])
    .data[, i] = (.data[, i] - mn[i]) / (mx[i] - mn[i])
  }
  if (is.null(s)) {
    s = rep(0, ncol(.data))
    for (i in ind1) {
      s[i] = sd(.data[, i])
    }
  }
  testNumeric(s, len = ncol(.data), lower = 0)

  # Func to calc dist
  hamming = function(mat) {
    n = nrow(mat)
    m = ncol(mat)
    res = matrix(0, n, n)
    for (j in 1:m) {
      unq = unique(mat[, j])
      p = length(unq)
      for (i in 1:p) {
        ind = which(mat[, j] == unq[i])
        res[ind, -ind] = res[ind, -ind] + 1
      }
    }
    return(res)
  }

  one.nn = function(mat, ind1, ind2) {
    n = nrow(mat)
    dist.mat.1 = matrix(0, n, n)
    dist.mat.2 = matrix(0, n, n)
    if (length(ind1) > 0) {
      dist.mat.1 = as.matrix(stats::dist(mat[, ind1, drop = FALSE]))
    }
    if (length(ind2) > 0) {
      dist.mat.2 = hamming(mat[, ind2, drop = FALSE])
    }
    dist.mat = dist.mat.1 + dist.mat.2
    neighbour = max.col(-dist.mat - diag(Inf, n))
    return(neighbour)
  }

  # Get the neighbour
  neighbour = one.nn(.data, ind1, ind2)

  # Start the loop
  p = ncol(.data)
  for (loop in 1:k) {
    data = .data
    prob.mat = matrix(sample(c(0, 1), n * p, replace = TRUE, prob = c(prob, 1 - prob)), n, p)
    prob.mat = prob.mat == 0
    for (i in 1:n) {
      e = as.numeric(data[i, ])
      ee = as.numeric(data[neighbour[i], ])

      # continuous
      for (j in ind1) {
        if (prob.mat[i, j]) {
          current.sd = abs(e[j] - ee[j]) / s[j]
          tmp1 = rnorm(1, ee[j], current.sd)
          tmp2 = rnorm(1, e[j], current.sd)
          e[j] = tmp1
          ee[j] = tmp2
        }
      }
      for (j in ind2) {
        if (prob.mat[i, j]) {
          tmp = e[j]
          e[j] = ee[j]
          ee[j] = tmp
        }
      }

      data[i, ] = ee
      data[neighbour[i], ] = e
    }
    res = rbind(res, data)
  }
  for (i in ind1) {
    res[, i] = res[, i] * (mx[i] - mn[i]) + mn[i]
  }
  res = data.frame(res)
  names(res) = ori.names
  for (i in ind2) {
    res[[i]] = factor(res[[i]], labels = ori.labels[[i]])
  }
  return(res)
}

# FIXMEs:
# - document + test + export
# - benchmark stuff on openml
# - allow base.learners to be character of learners (not only list of learners)
# - rename 'probs' in code into 'preds'
# - allow option to remove predictions for one class in multiclass tasks (to avoid collinearity)
# - DONE: return predictions from each single base learner
# - DONE: allow predict.type = "response" for classif using majority vote (for super learner predict type "response")
#   and using average for super learner predict type "prob".
# - DONE: add option to use normal features in super learner
# - DONE: super learner can also return predicted probabilites
# - DONE: allow regression as well

Try the mlr package in your browser

Any scripts or data that you put into this service are public.

mlr documentation built on June 22, 2024, 10:51 a.m.