# automlr: Automatic Machine Learning in R

#' @include mlrLearners.R

# File-level switch read by amsetup.ammbo(): when TRUE, the search space is
# passed through simplifyParams() and each evaluated point is mapped back with
# complicateParams(); when FALSE, missing values are removed from the point
# instead.
mboSaveMode = TRUE

#' @title mbo backend configuration
#'
#' @description
#' Create an \code{AutomlrBackendConfig} object that can be fed to
#' \code{\link{automlr}} to perform optimization with the "mbo" backend.
#'
#' @param focussearch.restarts [\code{integer(1)}]\cr
#'   number of restarts to perform in focussearch surrogate model optimizer
#' @param focussearch.maxit [\code{integer(1)}]\cr
#'   number of iterations for one focussearch round
#' @param focussearch.points [\code{integer(1)}]\cr
#'   number of points to sample in focussearch.
#' @param mbo.save.mode [\code{logical(1)}]\cr
#'   Simplify search space for mbo backend. You should probably not change the
#'   default.
#' @param resampling [\code{ResampleDesc}]\cr
#'   resampling to evaluate model performance.
#' @return [\code{AutomlrBackendConfig}]\cr
#'   the backend configuration holding the validated arguments.
#' @export
makeBackendconfMbo = registerBackend("mbo",
    function(focussearch.restarts = 1, focussearch.maxit = 5,
        focussearch.points = 1000, mbo.save.mode = TRUE, resampling = hout) {
      # Validate every argument before it is captured; no local variables are
      # created here so that argsToList() only sees the formal arguments.
      assertCount(focussearch.restarts)
      assertCount(focussearch.maxit)
      assertCount(focussearch.points)
      assertFlag(mbo.save.mode)
      assertClass(resampling, "ResampleDesc")
      argsToList()
    })

# Prior hook of the "mbo" backend: both arguments are ignored and NULL is
# returned.
amaddprior.ammbo = function(env, prior) NULL

# Prior getter of the "mbo" backend: the argument is ignored and NULL is
# returned.
amgetprior.ammbo = function(env) NULL

# Set up the "mbo" backend: build the smoof objective function, the mlrMBO
# control object and the surrogate learner, then call mbo() once and store the
# resulting optimization state in 'env'.
#
# Arguments:
#   env        environment holding the backend state. This function writes
#              zeroWalltime, zeroEvals, budget, hardTimeout and opt.state.
#   opt        backend configuration; focussearch.points, focussearch.maxit,
#              focussearch.restarts, resampling and mbo.save.mode are read.
#   prior      not used by this backend.
#   learner    learner whose search space (getSearchspace()) is optimized.
#   task       task on which each configuration is resampled.
#   measure    performance measure; its 'minimize' field sets the direction.
#   verbosity  verbosity setting, queried through verbosity.traceout().
amsetup.ammbo = function(env, opt, prior, learner, task, measure, verbosity) {
  requirePackages("mlrMBO", why = "optMBO", default.method = "load")
  requirePackages("smoof", why = "optMBO", default.method = "load")
  # FIXME things that could be variable:
  #  infill control: focussearch, something else? how many points?

  # Honor the documented 'mbo.save.mode' backend option. Fall back to the
  # file-level default if the option is absent from 'opt'.
  saveMode = opt$mbo.save.mode
  if (is.null(saveMode)) {
    saveMode = mboSaveMode
  }

  env$zeroWalltime = 0
  env$zeroEvals = 0

  # The budget must be set before mbo() is called below: mbo() creates the
  # initial design, and the termination condition reads env$budget.
  env$budget = 0

  env$hardTimeout = Inf  # for the init evaluations

  resDesc = opt$resampling

  isOutOfBudget = function(opt.state) {
    stopcondition(env$budget, spentBudget(opt.state, env))
  }

  # Evaluate one point of the search space by resampling the configured
  # learner; returns the aggregated performance.
  objectiveFun = function(x) {
    origx = x
    if (saveMode) {
      # map the point from the simplified space back to the original one
      x = complicateParams(x, getSearchspace(learner))
    } else {
      x = removeMissingValues(x)
    }
    l = setHyperPars(learner, par.vals = x)

    # env$hardTimeout is an absolute proc.time() value; convert it to the
    # time remaining for this evaluation.
    hardTimeoutRemaining = env$hardTimeout - proc.time()[3]

    if (verbosity.traceout(verbosity)) {
      cat("Evaluating function:\n")
      outlist = removeMissingValues(origx)
      for (n in names(outlist)) {
        catf("%s: %s; ", n, outlist[[n]])
      }
      cat("\n")
    }

    rwt = runWithTimeout(
        resample(l, task, resDesc, list(measure), show.info = FALSE)$aggr,
        hardTimeoutRemaining, throwError = TRUE)
    rwt$result
  }

  usedParset = getSearchspace(learner)
  if (saveMode) {
    usedParset = simplifyParams(usedParset)
  }

  objective = smoof::makeSingleObjectiveFunction(
      name = "automlr learner optimization",
      id = "automlr.objective",
      has.simple.signature = FALSE,
      vectorized = FALSE,
      noisy = TRUE,
      minimize = measure$minimize,
      par.set = usedParset,
      fn = objectiveFun)

  # A single precomputed value is used as impute.y.fun, regardless of the
  # point at which it is requested.
  imputeval = generateRealisticImputeVal(measure, learner, task)
  imputefun = function(x, y, opt.path) imputeval

  control = mlrMBO::makeMBOControl(impute.y.fun = imputefun)
  control = mlrMBO::setMBOControlInfill(control, opt = "focussearch",
      opt.focussearch.points = opt$focussearch.points,
      opt.focussearch.maxit = opt$focussearch.maxit,
      opt.restarts = opt$focussearch.restarts)
  # Termination is driven only by the automlr budget (see isOutOfBudget).
  control = mlrMBO::setMBOControlTermination(control, iters = NULL,
      more.termination.conds = list(function(opt.state) {
            if (isOutOfBudget(opt.state)) {
              list(term = TRUE, message = "automlr term", code = "iter")
            } else {
              list(term = FALSE, message = NA_character_, code = "iter")
            }
          }))

  # NOTE(review): checkLearner() is an mlrMBO internal; called with NULL it
  # presumably returns mlrMBO's default surrogate learner -- confirm.
  mboLearner = mlrMBO:::checkLearner(NULL, usedParset, control, objective)
  mboLearner$config = list(on.learner.error = "stop",
      on.learner.warning = "warn",
      show.learner.output = verbosity.traceout(verbosity))
  # Wrap the surrogate learner in a preprocessing pipeline. If it handles
  # factors, the selected learner column is split and factors are imputed;
  # otherwise the features are dummy encoded.
  if (any(c("factors", "ordered") %in% getLearnerProperties(mboLearner))) {
    mboLearner = cpoFixFactors() %>>%
        selectedLearnerSplitter() %>>%
        cpoDropConstants(id = "predrop", ignore.na = TRUE) %>>%
        cpoImputeHist(affect.type = "numeric", id = "numimp") %>>%
        cpoImputeConstFact(affect.type = c("ordered", "factor")) %>>%
        cpoDropConstants(id = "postdrop") %>>%
        mboLearner
  } else {
    mboLearner = cpoFixFactors() %>>%
        cpoDropConstants(id = "predrop", ignore.na = TRUE) %>>%
        cpoImputeHist(affect.type = "numeric", id = "numimp") %>>%
        cpoDummyEncode(TRUE) %>>%
        cpoDropConstants(id = "postdrop") %>>%
        mboLearner
  }

  # Run a copy of mlrMBO::mbo whose enclosing environment defines
  # mboFinalize2 as identity. NOTE(review): presumably this makes mbo()
  # return the raw optimization state instead of the finalized result, which
  # amresult.ammbo() later produces via mlrMBO:::mboFinalize2 -- confirm.
  myMBO = mlrMBO::mbo
  environment(myMBO) = new.env(parent = asNamespace("mlrMBO"))
  environment(myMBO)$mboFinalize2 = identity
  env$opt.state = myMBO(objective, learner = mboLearner, control = control,
      show.info = verbosity.traceout(verbosity))
  # NOTE(review): detaches the opt path environment from its enclosing
  # environment; the reason is not visible here.
  parent.env(env$opt.state$opt.path$env) = emptyenv()

  # Time and evaluations used so far become the baseline that
  # spentBudget() subtracts in later calls.
  env$zeroWalltime = as.numeric(env$opt.state$time.used, units = "secs")
  env$zeroEvals = getOptPathLength(env$opt.state$opt.path)
}

# Reverse lookup for 'learnercats': for every category name, each member of
# that category is mapped to the category name. The per-category lists are
# flattened by one level into a single list keyed by member name.
reverselearnercats = unlist(
  lapply(names(learnercats), function(category) {
    members = learnercats[[category]]
    namedList(members, category)
  }),
  recursive = FALSE)

# Replace the 'selected.learner' column of 'data' by a category column
# ('selected.learner.cat') plus one factor column per entry of 'learnercats'
# (prefixed with 'selected'), followed by all remaining columns. Data without
# a 'selected.learner' column is returned unchanged.
SLSplit = function(data) {
  chosen = data$selected.learner
  if (is.null(chosen)) {
    return(data)
  }

  # One factor per category, restricted to that category's levels; values
  # outside the category become NA.
  perCategory = lapply(learnercats, function(members) {
    factor(chosen, levels = members)
  })
  indicators = do.call(data.frame, perCategory)

  # Category of each selected learner, looked up by learner name.
  categoryOf = reverselearnercats[as.character(chosen)]
  categoryCol = data.frame(
    selected.learner.cat = unlist(categoryOf, use.names = FALSE))

  remaining = dropNamed(data, "selected.learner")
  cbind(categoryCol, selected = indicators, remaining)
}


# CPO that runs SLSplit() on the data in both its trafo and its retrafo step,
# so the same expansion of the 'selected.learner' column is applied in both.
# It is declared stateless, i.e. no state is handed from trafo to retrafo.
# NOTE(review): "missings" is declared as a needed property, presumably
# because the per-category factor columns built by SLSplit() contain NA for
# rows whose learner lies outside the category -- confirm.
selectedLearnerSplitter = makeCPO("selectedLearnerSplitter",
  .properties.needed = "missings", .datasplit = "target",
  .stateless = TRUE, cpo.trafo = { data = SLSplit(data) },
  cpo.retrafo = { data = SLSplit(data) })


# Collect the result of the "mbo" backend: finalize the stored optimization
# state and return the learner, the best point (with missing values removed),
# its objective value, the optimization path and the full mbo result.
amresult.ammbo = function(env) {
  finalized = mlrMBO:::mboFinalize2(env$opt.state)
  bestPoint = removeMissingValues(finalized$x)
  list(
    learner = env$learner,
    opt.point = bestPoint,
    opt.val = finalized$y,
    opt.path = finalized$opt.path,
    result = finalized
  )
}

# Continue the mbo run stored in env$opt.state until 'stepbudget' is spent or
# 'deadline' seconds have elapsed.
#
# Arguments:
#   env         environment holding the backend state set up by
#               amsetup.ammbo(); budget, hardTimeout, zeroWalltime and
#               zeroEvals are updated.
#   stepbudget  budget for this call, stored in env$budget.
#   verbosity   not used in this function.
#   deadline    time limit in seconds, relative to the start of this call.
#
# Returns the numeric vector produced by spentBudget() ("walltime", "evals"),
# with "walltime" replaced by the elapsed time measured here.
amoptimize.ammbo = function(env, stepbudget, verbosity, deadline) {
  # initialize for spent budget computation
  starttime = proc.time()[3]

  # FIXME: right now, the infill crit optimization does not respect the
  # deadline. It is possible to change this, by changing the termination
  # criterion of the mbo run so that only one iteration gets performed per
  # call, and additionally creating backups of the opt.state before each call.
  # I will choose the elegant (= quick) over the correct solution here though.
  env$hardTimeout = starttime + deadline

  env$budget = stepbudget

  runWithTimeout(withCallingHandlers(
          mlrMBO:::mboTemplate.OptState(env$opt.state),
          warning = function(w) {
            # Inspect only the warning message (not the deparsed call) and
            # match it literally.
            if (grepl("Empty factor levels were dropped for columns",
                conditionMessage(w), fixed = TRUE)) {
              invokeRestart("muffleWarning")
            }
          }), deadline, backend = "native")
  spent = spentBudget(env$opt.state, env)
  if ("walltime" %in% names(spent)) {
    spent["walltime"] = proc.time()[3] - starttime  # b/c of possible timeout
  }
  # NOTE(review): zeroWalltime is advanced by the elapsed time measured here,
  # while spentBudget() subtracts it from opt.state$time.used; confirm that
  # these two clocks stay in sync across repeated calls.
  env$zeroWalltime %+=% spent["walltime"]
  env$zeroEvals %+=% spent["evals"]
  spent
}

# Budget spent relative to a baseline: returns a numeric vector with the
# entries "walltime" (seconds of opt.state$time.used beyond
# zero$zeroWalltime) and "evals" (opt path length beyond zero$zeroEvals).
spentBudget = function(opt.state, zero) {
  secondsUsed = as.numeric(opt.state$time.used, units = "secs")
  pathLength = getOptPathLength(opt.state$opt.path)

  # Entries are filled by name so that any names carried by the baseline
  # values do not leak into the result.
  spent = c(walltime = NA_real_, evals = NA_real_)
  spent["walltime"] = secondsUsed - zero$zeroWalltime
  spent["evals"] = pathLength - zero$zeroEvals
  spent
}