R/utils.R

Defines functions filterDuplicated makeECRResult makeFitnessMatrix getNumberOfParentsNeededForMating getNumberOfChildren fixDesignFactors listToDf setMosmafsVectorized heuristicHasSimpleSig getPopulations getStatistics naiveHoldoutDomHV unbiasedHoldoutDomHV initSelector collectResult availableAttributes popAggregate clonelog deepclone

deepclone <- function(obj) {
  unserialize(serialize(obj, NULL))
}

clonelog <- function(log) {
  log$env <- as.environment(as.list(log$env))
  log
}

#' @title Aggregate Population Results
#'
#' @description
#' Extract attributes saved for individuums in a
#' log object to a more accessible matrix or data.frame.
#'
#' @param log `[ecr_logger]` ecr log object.
#' @param extract `[character]` names of attributes to extract, currently 
#' "names", "fitness", "runtime", "fitness.holdout" and "fidelity" (if used) are supported.
#' @param simplify `[logical(1)]` whether to create a
#'   `matrix`/`data.frame` for each generation (default).
#'    Otherwise a list is returned for each generation containing
#'    the value (if `length(extract) == 1`) or a named list of values.
#' @param data.frame `[logical(1)]` whether to return a `data.frame`
#'   with rows for each individuum (if `TRUE`) or to return a `matrix`
#'   with columns for each individuum compatible as fitness matrix
#'   with various ecr tools (if `FALSE`, default). Only effective if
#'  `simplify` is `TRUE`.
#' @return `[matrix]` if `simplify` is `TRUE`, `[list]` otherwise.
#' @export
popAggregate <- function(log, extract, simplify = TRUE, data.frame = FALSE) {
  assertCharacter(extract, min.len = 1, any.missing = FALSE)
  assertFlag(simplify)
  assertFlag(data.frame)
  pop <- getPopulations(log)
  if (!(all(extract %in% availableAttributes(log)))) {
    stop("Assertion on 'extract' failed: Must be name(s) of attributes of population")
  }
  lapply(pop, function(generation) {
    aggr <- sapply(generation$population, function(individuum) {
      info <- attributes(individuum)[extract]
      if (length(extract) == 1) {
        info[[1]]
      } else if (simplify) {
        unlist(info, recursive = FALSE)
      } else {
        info
      }
    }, simplify = simplify)
    if (simplify && data.frame) {
      if (is.matrix(aggr)) {
        as.data.frame(t(aggr))
      } else {
        df <- data.frame(aggr)
        colnames(df) <- extract
        df
      }
    } else {
      aggr
    }
  })
}

#' @title  List Attributes that can be Aggregated
#'
#' @description
#' Looks at the first individuum in the first generation and
#' returns its attributes. If `check` is TRUE it checks that the
#' log object is consistent and throws an error if not. "Consistent"
#' here means that all individuals in all generations have the same
#' attributes.
#' @param log `[ecr_logger]` ecr log object.
#' @param check `[logical(1)]` whether to check consistency.
#' @return `[character]` attributes of individuals in population.
#' @export
availableAttributes <- function(log, check = FALSE) {
  pop <- getPopulations(log)
  if (length(pop) == 0) return(character(0))
  if (length(pop[[1]]$population) == 0) {
    stop("population of size 0 not allowed")
  }
  attrs <- names(attributes(pop[[1]]$population[[1]]))
  if (check) {
    lapply(pop, function(gen) lapply(gen$population, function(ind)
      assertSetEqual(names(attributes(ind)), attrs)))
  }
  attrs
}

#' @title Collect Result Information
#'
#' @description
#' Merge both `log` and `log.newinds` data for a complete `data.frame`
#' with information about progress (both on training data and on
#' holdout data) and ressource usage.
#'
#' @param ecr.object `[MosmafsResult]` `slickEcr()` result to analyse.
#' @param aggregate.perresult `[list]` list of functions
#'   to apply to fitness and holdout fitness. Every entry must either be
#'   a `character(1)` naming the function to use, or a function, in which
#'   that entry must have a name. Each function must return exactly one
#'   numeric value when fed a fitness matrix of one generation. This is
#'   ignored for single-objective runs.
#' @param aggregate.perobjective `[list]` list of functions
#'   to apply to fitness and holdout fitness matrix rows, formatted like
#'   `aggregate.perresult`. Each function must return exactly one numeric
#'   value when fed a fitness vector.
#' @param ref.point `[numeric]` reference point to use for HV computation.
#' @param cor.fun `[function]` function to use for calculation of
#'   correlation between objective and holdout. Must take two `numeric`
#'   arguments and return a `numeric(1)`.
#' @return `data.frame` 
#' @examples
#' library(mlrCPO)
#' 
#' # Setup of optimization problem 
#' ps.simple <- pSS(
#'  a: numeric [0, 10],
#'  selector.selection: logical^10)
#'  
#' mutator.simple <- combine.operators(ps.simple,
#'  a = mutGauss,
#'  selector.selection = mutBitflipCHW)
#'  
#' crossover.simple <- combine.operators(ps.simple,
#'  a = recSBX,
#'  selector.selection = recPCrossover)
#'
#' initials <- sampleValues(ps.simple, 30, discrete.names = TRUE)
#' 
#' fitness.fun <- smoof::makeMultiObjectiveFunction(
#'  sprintf("simple test"),
#'  has.simple.signature = FALSE, par.set = ps.simple, n.objectives = 2, 
#'  noisy = TRUE,
#'  ref.point = c(10, 1),
#'  fn = function(args, fidelity = NULL, holdout = FALSE) {
#'    pfeat <- mean(args$selector.selection)
#'    c(perform = args$a, pfeat = pfeat)
#'  })
#' fitness.fun.single <- smoof::makeMultiObjectiveFunction(
#'  sprintf("simple test"),
#'  has.simple.signature = FALSE, par.set = ps.simple, n.objectives = 1, 
#'  noisy = TRUE,
#'  ref.point = c(10),
#'  fn = function(args, fidelity = NULL, holdout = FALSE) {
#'    propfeat <- mean(args$selector.selection)
#'    c(propfeat = propfeat)
#'  })
#'
#' # Run NSGA-II
#' results <- slickEcr(fitness.fun = fitness.fun, lambda = 10, population = initials, 
#'  mutator = mutator.simple, recombinator = crossover.simple, generations = 10)
#'
#' # Collect results
#' colres <- collectResult(results)
#' print(colres)
#'  
#' @export
collectResult <- function(ecr.object, aggregate.perresult = list(domHV = function(x) computeHV(x, ref.point)), aggregate.perobjective = list("min", "mean", "max"), ref.point = smoof::getRefPoint(ecr.object$control$task$fitness.fun), cor.fun = cor) {
  assertClass(ecr.object, "MosmafsResult")

  nobj <- ecr.object$task$n.objectives
  normalize.funlist <- function(fl) {
    assertList(fl, any.missing = FALSE, types = c("function", "character"))
    
    charentries <- vlapply(fl, is.character)
    names(fl)[charentries] <- ifelse(is.na(names2(fl)[charentries]),
      unlist(fl[charentries], recursive = FALSE),
      names2(fl)[charentries])
    fl[charentries] <- lapply(fl[charentries], get,
      envir = .GlobalEnv, mode = "function")
    assertList(fl, any.missing = FALSE, types = "function", names = "unique")
  }
  
  aggregate.perresult <- normalize.funlist(aggregate.perresult)
  aggregate.perobjective <- normalize.funlist(aggregate.perobjective)

  if (nobj > 1) {
    assertNumeric(ref.point, any.missing = FALSE, finite = TRUE,
      len = ecr.object$task$n.objectives)
  }

  assertFunction(cor.fun)
  
  aggregate.fitness <- function(fitness) {
    resmat <- sapply(fitness, function(fit) {
      if (ecr.object$task$n.objectives == 1) {
        vnapply(aggregate.perobjective, function(f) f(fit))
      } else {
        if (is.null(rownames(fit))) {
          rownames(fit) <- paste0("obj.", seq_len(nrow(fit)))
        }
        c(
          unlist(apply(fit, 1, function(frow) {
            as.list(vnapply(aggregate.perobjective, function(f) f(frow)))
          })),
          vnapply(aggregate.perresult, function(f) f(fit))
        )
      }
    })
    as.data.frame(t(resmat))
  }
  
  
  fitnesses <- popAggregate(ecr.object$log, "fitness")
  
  stats <- getStatistics(ecr.object$log)
  stats.newinds <- getStatistics(ecr.object$log.newinds)
  
  no.fid <- is.null(stats.newinds$fidelity)
  if (no.fid) {
    stats.newinds$fidelity.sum <- 0
  } else {
    reevals <- stats.newinds$gen[stats.newinds$population == "fidelity.reeval"]
  }
  
  resdf <- with(stats.newinds, data.frame(
    gen,
    runtime = cumsum(runtime.sum),
    evals = cumsum(size),
    cum.fid = cumsum(fidelity.sum)))
  resdf <- resdf[!rev(duplicated(rev(resdf$gen))), ]
  assertTRUE(all.equal(resdf$gen, seq_len(nrow(resdf)) - 1))
  assertTRUE(all.equal(resdf$gen, stats$gen))
  if (no.fid) {
    resdf$cum.fid <- NULL
  } else {
    resdf$fid.reeval <- resdf$gen %in% reevals
  }
  resdf <- cbind(resdf,
    eval = aggregate.fitness(fitnesses))
  
  hofitnesses <- popAggregate(ecr.object$log, "fitness.holdout")
  
  if (any(vlapply(hofitnesses, function(x) any(is.finite(x))))) {
    
    if (nobj == 1) {
      corcols <- mapply(function(eval.fit, hout.fit) {
        suppressWarnings(cor.fun(eval.fit, hout.fit))
      }, fitnesses, hofitnesses)
      names(corcols) <- "obj.1"
      true.hout.domHV <- NA
      naive.hout.domHV <- NA
      resdf <- cbind(resdf, hout = aggregate.fitness(hofitnesses),
        cor = corcols)
    } else {
      corcols <- lapply(seq_len(ecr.object$task$n.objectives), function(idx) {
        mapply(function(eval.fit, hout.fit) {
          suppressWarnings(cor.fun(eval.fit[idx, ], hout.fit[idx, ]))
        }, fitnesses, hofitnesses)
      })
      names(corcols) <- rownames(fitnesses[[1]]) %??% paste0("obj.", seq_len(corcols))
      
      true.hout.domHV <- mapply(function(eval.fit, hout.fit) {
        unbiasedHoldoutDomHV(eval.fit, hout.fit, ref.point)
      }, fitnesses, hofitnesses)
      
      naive.hout.domHV <- mapply(function(eval.fit, hout.fit) {
        naiveHoldoutDomHV(eval.fit, hout.fit, ref.point)
      }, fitnesses, hofitnesses)
      resdf <- cbind(resdf, hout = aggregate.fitness(hofitnesses),
        true.hout.domHV, naive.hout.domHV,
        cor = corcols)
    }
  }
  resdf
}

#' @title Initialize Selector
#'
#' @description
#' Sample the `vector.name` variable such that the number of ones
#' has a given distribution.
#'
#' @param individuals `[list of named lists]` the individuals to initialize.
#' @param vector.name `[character(1)]` the variable name, whose entries are 
#' sampled.
#' @param distribution `[function]` function that returns a random integer
#'   from 0 to the length of each individual's `vector.name` slot.
#'   Defaults to the uniform distribution from 1 to `length()`.
#' @param soften.op `[ecr_mutator]` an optional mutator to apply to the
#'   `vector.name` variable.
#' @param soften.op.strategy `function` an optional function that can set
#'   the `soften.op`'s parameters. See [`combine.operators`] strategy parameters.
#'   Ignored if `soften.op` is not given.
#' @param soften.op.repeat `[integer(1)]` how often to repeat `soften.op`
#'   application. Ignored if `soften.op` is not given.
#' @param reject.condition `[function | NULL]` reject condition as a function applied
#' to newly generated values of `vector.name`. If set to NULL, no rejection is done. 
#' @return `list of named lists` the individuals with initialized
#'   `[[vector.name]]`.
#' @examples
#' library(mlrCPO)
#' 
#' # Initialize parameter set and sample candidates
#' ps <- pSS(
#'  maxdepth: integer[1, 30],
#'  minsplit: integer[2, 30],
#'  cp: numeric[0.001, 0.999], 
#'  selector.selection: logical^5)
#' 
#' initials <- sampleValues(ps, 15, discrete.names = TRUE)
#' 
#' # Resample logical vector selector.selection of initials 
#' # with binomial distribution 
#' initSelector(initials, distribution = function() rbinom(n = 5, size = 5, 
#'  prob = 0.5))
#' 
#' @export
initSelector <- function(individuals, vector.name = "selector.selection", distribution = function() floor(runif(1, 0, length(individuals[[1]][[vector.name]]) + 1)), soften.op = NULL, soften.op.strategy = NULL, soften.op.repeat = 1, reject.condition = function(x) !any(x)) {
  
  ilen <- length(individuals[[1]][[vector.name]])
  iint <- is.numeric(individuals[[1]][[vector.name]])
  assertList(individuals, types = "list", min.len = 1)
  assertTRUE(all(viapply(individuals, function(x) {
    length(x[[vector.name]])
  }) == ilen))
  
  if (!is.null(soften.op)) {
    assertClass(soften.op, "ecr_mutator")
    assertTRUE("binary" %in%
        getSupportedRepresentations(soften.op))
    assertFunction(soften.op.strategy, null.ok = TRUE)
    assertInt(soften.op.repeat, lower = 0)
  }
  assertFunction(reject.condition, null.ok = TRUE)
  
  lapply(individuals, function(ind) {
    repeat {  # repeat when rejecting 0s
      ind.new <- ind
      new.selection <- sample(ilen) <= distribution()
      if (!is.null(soften.op)) {
        new.selection <- as.numeric(new.selection)
        for (rp in seq_len(soften.op.repeat)) {
          opargs <- list(new.selection)
          if (!is.null(soften.op.strategy)) {
            strat.args <- soften.op.strategy(ind)
            assertList(strat.args, names = "unique")
            opargs <- c(opargs, strat.args)
          }
          new.selection <- do.call(soften.op, opargs)
          assertIntegerish(new.selection, len = ilen, any.missing = FALSE)
        }
        new.selection <- new.selection > 0.5
      }
      if (is.null(reject.condition) || !reject.condition(new.selection)) {
        if (iint) {
          new.selection <- as.integer(new.selection)
        }
        ind.new[[vector.name]] <- new.selection
        break
      }
    }
    ind.new
  })
}

#' @title Unbiased Dominated Hypervolume on Holdout Data
#'
#' @description
#' Calculate dominated hypervolume on holdout data. The result is
#' unbiased with respect to (uncorrelated with respect to objectives) noise in holdout data
#' performance, but it is *not* an estimate of real "dominated hypervolume".
#'
#' Only works on two-objective performance matrices.
#' @param fitness `[matrix]` fitness matrix of training data.
#' @param holdout `[matrix]` fitness matrix of holdout data.
#' @param refpoint `[numeric]` reference point.
#' @return `numeric`
#' @export
unbiasedHoldoutDomHV <- function(fitness, holdout, refpoint) {
  assertMatrix(fitness, nrows = 2)
  assertMatrix(holdout, nrows = 2, ncols = ncol(fitness))
  assertNumeric(refpoint, finite = TRUE, len = 2, any.missing = FALSE)
  ordering <- order(fitness[2, ], decreasing = TRUE)
  ordering <- intersect(ordering, which.nondominated(fitness))
  fitness <- fitness[, ordering, drop = FALSE]
  holdout <- holdout[, ordering, drop = FALSE]

  # only first objective of last point is interesting
  holdout <- cbind(holdout, c(refpoint[1], 0))

  last.point <- c(-Inf, refpoint[2])
  area <- 0
  for (idx in seq_len(ncol(holdout))) {
    this.point <- holdout[, idx]
    area <- area + (last.point[2] - this.point[2]) * (refpoint[1] - this.point[1])
    last.point <- this.point
  }
  area
}

#' @title Naive Hypervolume on Holdout Data
#'
#' @description
#' Calculate dominated hypervolume on holdout data. The result is
#' biased depending on noise in holdout data performance.
#'
#' @param fitness `[matrix]` fitness matrix of training data.
#' @param holdout `[matrix]` fitness matrix of holdout data.
#' @param refpoint `[numeric]` reference point.
#' @return `numeric`
#' @export
naiveHoldoutDomHV <- function(fitness, holdout, refpoint) {
  assertMatrix(holdout, nrows = nrow(fitness), ncols = ncol(fitness))
  assertNumeric(refpoint, finite = TRUE, len = nrow(fitness), any.missing = FALSE)
  computeHV(holdout[, nondominated(fitness), drop = FALSE], ref.point = refpoint)
}


#' @title Get Statistics 
#' @description
#' Get statistics from ecr_logger. Replaces [`ecr::getStatistics`] 
#' because original is buggy.
#' @param log `[ecr_logger]` ecr log object
#' @return `data.frame` of logged statistics.
#' @export
getStatistics <- function(log) {
  assertClass(log, "ecr_logger")
  log$env$stats[seq_len(log$env$cur.line - 1), , drop = FALSE]
}

#' @title Get Populations
#' @description 
#' Get populations from ecr_logger. Replaces ecr::getPopulation because 
#' original is buggy.
#' 
#' @param log `[ecr_logger]` ecr log object
#' @return `list` of populations. 
#' @export
getPopulations <- function(log) {
  assertClass(log, "ecr_logger")
  log$env$pop[seq_len(log$env$cur.line - 1)]
}


heuristicHasSimpleSig <- function(fn) { 
  identical(names(environment(fn)), "fn")
}


#' @title Set mosmafs.vectorize
#' @description 
#' Set or change attribute `mosmafs.vectorized` in fitness function. 
#' @param fn `smoof_multi_objective_function` fitness function. 
#' @param vectorize `[logical(1)]` whether to force `slickEvaluateFitness` to 
#' pass candidates to fitness function as `data.frame` or not.
#' @return `smoof_multi_objective_function`.
#' @export
setMosmafsVectorized <- function(fn, vectorize = TRUE) { 
  if (vectorize && heuristicHasSimpleSig(fn)) {
    warningf("attribute has.simple.signature of fn was set to TRUE, correct fitness evaluation can not be guaranteed")  ## schreiben wir sind uns nicht sicher weeil wir nur eine heuristik haben, aber wahrscheinlich macht der user was falsch
  }
  attr(fn, "mosmafs.vectorize") = vectorize
  fn
}


#' @title List to data.frame
#' 
#' @description 
#' Converts a list to a data.frame based on given parameter set.
#' 
#' List elements must have the correct type with respect to 
#' parameter set. Exceptions are discrete parameters, whose values should be 
#' factors, only characters are accepted and factors are returned.           
#' 
#' Returned `data.frame` has column names equal to parameter ids. In case 
#' of vector parameters column names will be numbered. 
#' 
#' @param list.object `[list]` list of individuals, each with elements named 
#' by parameter ids.
#' @param par.set `[ParamSet]` parameter set.
#' @return `[data.frame]`
#' @examples 
#' library(mlrCPO)
#' 
#' # Create parameter set
#' temp <- c("a", "b", "c")
#' ps.simple <- pSS(
#'  num: numeric [0, 10],
#'  int: integer[0, 10] [[trafo = function(x) x / 10]],
#'  char: discrete [temp], 
#'  selector.selection: logical^10)
#'  
#' # Sample values as list and convert list to data frame
#' init.list <- sampleValues(ps.simple, 5, discrete.names = TRUE)
#' result <- listToDf(init.list, ps.simple)
#' result
#' 
#' @export
listToDf = function(list.object, par.set) {
  assertList(list.object)
  checkClass <- function(l) {
    if(any(lapply(l, class) %in% "factor")) {
      stop("list elements are not allowed to be of type factor.")
    } 
  }
  lapply(list.object, checkClass)
  df = lapply(list.object, function(x) {
    Map(function(p, v) {
      if (isScalarNA(v)) 
        v = as.data.frame(t(rep(v, p$len)))
      else as.data.frame(t(v))
    }, par.set$pars, x)
  })
  df = lapply(df, do.call, what = cbind)
  df = as.data.frame(do.call(rbind, df))
  colnames(df) = getParamIds(par.set, repeated = TRUE, with.nr = TRUE)
  fixDesignFactors(df, par.set)
}

fixDesignFactors <- function(df, ps) {
  coltypes <- getParamTypes(ps, df.cols = TRUE)
  psvals <- rep(getValues(ps)[getParamIds(ps)], getParamLengths(ps))
  for (col in seq_along(df)) {
    if (coltypes[col] == "factor") {
      df[[col]] <- factor(df[[col]], levels = names(psvals[[col]]))
    }
  }
  return(df)
}


getNumberOfChildren <- function(recombinator) {
  attr(recombinator, "n.children")
}

getNumberOfParentsNeededForMating <- function(recombinator) {
  attr(recombinator, "n.parents")
}


makeFitnessMatrix <- function(fitness, control) {
  fitness <- addClasses(fitness, "ecr_fitness_matrix")
  attr(fitness, "minimize") <- control$task$minimize
  fitness
}


makeECRResult = function(control, log, population, fitness, stop.object, ...) {
  n.obj <- control$task$n.objectives
  if (n.obj == 1L) {
    return(
      makeS3Obj(
        task = control$task,
        best.x = log$env$best.x,
        best.y = log$env$best.y,
        log = log,
        last.population = population,
        last.fitness = as.numeric(fitness),
        message = stop.object$message,
        classes = c("ecr_single_objective_result", "ecr_result")
        )
    )
  }
  pareto.idx <- which.nondominated(fitness)
  pareto.front <- as.data.frame(t(fitness[, pareto.idx, drop = FALSE]))
  colnames(pareto.front) <- control$task$objective.names
  mo.res <- makeS3Obj(
    task = control$task,
    log = log,
    pareto.idx = pareto.idx,
    pareto.front = pareto.front,
    pareto.set = population[pareto.idx],
    last.population = population,
    message = stop.object$message,
    classes = c("ecr_multi_objective_result", "ecr_result")
  )
  mo.res <- filterDuplicated(mo.res)
}

filterDuplicated <- function(result, ...) {
  id.dup = duplicated(result$pareto.front)
  result$pareto.front = result$pareto.front[!id.dup, , drop = FALSE]
  result$pareto.set = result$pareto.set[!id.dup]
  return(result)
}
compstat-lmu/mosmafs documentation built on Nov. 2, 2022, 12:18 a.m.