R/stats.R

Defines functions pairwise.test.DataSetList pairwise.test.list pairwise.test bootstrap_RT SP

Documented in bootstrap_RT pairwise.test pairwise.test.DataSetList pairwise.test.list SP

#' Estimator 'SP' for the Expected Running Time (ERT)
#'
#' @param data A dataframe or matrix. Each row stores the runtime sample points from
#' several runs
#' @param max_runtime The budget to use for calculating ERT. If this is a vector, the largest value is taken.
#' Using this as a vector is being deprecated, and will be removed in a future update
#'
#' @return A list containing ERTs, number of succesfull runs and the succes rate
#' @export
#' @examples
#' SP(dsl[[1]]$RT, max(dsl[[1]]$RT))
SP <- function(data, max_runtime) {
  N <- ncol(data)
  if (length(max_runtime) > 1) {
    max_runtime <- max(max_runtime)
  }
  idx <- is.na(data) | data > max_runtime
  succ <- N - rowSums(idx)
  succ_rate <- succ / N
  data[idx] <- max_runtime

  list(ERT = rowSums(data) / succ,
       runs = succ,
       succ_rate = succ_rate)
}

#' Bootstrapping for running time samples
#'
#' @param x A numeric vector. A sample of the running time.
#' @param max_eval A numeric vector, containing the maximal running time in
#' each run. It should have the same size as x
#' @param bootstrap.size integer, the size of the bootstrapped sample
#'
#' @return A numeric vector of the bootstrapped running time sample
#' @export
#' @examples
#' ds <- dsl[[1]]
#' x <- get_RT_sample(ds, ftarget = 16, output = 'long')
#' max_eval <- get_maxRT(dsl, output = 'long')
#' bootstrap_RT(x$RT, max_eval$maxRT, bootstrap.size = 30)
bootstrap_RT <- function(x, max_eval, bootstrap.size) {
  x_succ <- x[!is.na(x)]
  x_unsucc <- max_eval[is.na(x)]
  n_succ <- length(x_succ)
  n_unsucc <- length(x_unsucc)

  p <- n_succ / length(x)
  N <- rgeom(bootstrap.size, p)

  if (n_succ == 0) {
    return (rep(Inf, bootstrap.size))
  }

  sapply(N,
         function(size) {
           if (size > 0)
             x <- sum(sample(x_unsucc, size, replace = T))
           else
             x <- 0
           x <- x + sample(x_succ, 1, replace = T)
         })
}

# TODO: remove the bootstrapping part as it does not make much sense here...
#' Performs a pairwise Kolmogorov-Smirnov test on the bootstrapped running times
#' among a data set
#'
#' @description This function performs a Kolmogorov-Smirnov test on each pair of
#' algorithms in the input x to determine which algorithm gives a significantly
#' smaller running time. The resulting p-values are arranged in a matrix, where
#' each cell (i, j) contains a p-value from the test with alternative hypothesis:
#' the running time of algorithm i is smaller (thus better) than that of j.
#'
#' @param x either a list that contains running time sample for each algorithm as
#' sub-lists, or a DataSetList object
#' @param bootstrap.size integer, the size of the bootstrapped sample. Set to 0 to disable bootstrapping
#' @param ... all other options
#' @return A matrix containing p-values of the test
#' @export
#' @examples
#' pairwise.test(subset(dsl, funcId == 1), 16)
pairwise.test <- function(x, ...)
  UseMethod('pairwise.test', x)

#' @param max_eval list that contains the maximal running time for each algorithm
#' as sub-lists
#' @export
#' @rdname pairwise.test
pairwise.test.list <-
  function(x, max_eval, bootstrap.size = 30, ...) {
    if ("DataSetList" %in% class(x)) {
      class(x) <- rev(class(x))
      pairwise.test.DataSetList(x)
    }

    N <- length(x)
    p.value <- matrix(NA, N, N)

    for (i in seq(1, N - 1)) {
      for (j in seq(i + 1, N)) {
        if (bootstrap.size == 0) {
          x1 <- x[[i]]
          x2 <- x[[j]]
        } else {
          x1 <- bootstrap_RT(x[[i]], max_eval[[i]], bootstrap.size)
          x2 <- bootstrap_RT(x[[j]], max_eval[[j]], bootstrap.size)
        }
        if (all(is.na(x1))) {
          if (all(is.na(x2))) {
            next
          }
          else {
            p.value[i, j] <- 1
            p.value[j, i] <- 0
          }
        }
        else if (all(is.na(x2))) {
          p.value[i, j] <- 0
          p.value[j, i] <- 1
        }
        else {
          options(warn = -1)
          p.value[i, j] <-
            ks.test(x1, x2, alternative = 'greater', exact = F)$p.value
          p.value[j, i] <-
            ks.test(x1, x2, alternative = 'less', exact = F)$p.value
          options(warn = 0)
        }
      }
    }

    p.value.adjust <- p.adjust(as.vector(p.value), method = 'holm')
    p.value <- matrix(p.value.adjust, N, N)
    colnames(p.value) <- rownames(p.value) <- convert_string_symbols(names(x))
    p.value
  }

#' @param ftarget float, the target value used to determine the running / hitting
#' @param which wheter to do fixed-target ('by_FV') or fixed-budget ('by_RT') comparison
#' time
#' @export
#' @rdname pairwise.test
pairwise.test.DataSetList <-
  function(x,
           ftarget,
           bootstrap.size = 0,
           which = 'by_FV',
           ...) {
    if (which == 'by_FV') {
      dt <- get_RT_sample(x, ftarget, output = 'long')
      maxRT <- get_maxRT(x, output = 'long')
      maxRT <- split(maxRT$maxRT, maxRT$ID)
      s <- split(dt$RT, dt$ID)
    }
    else if (which == 'by_RT') {
      dt <- get_FV_sample(x, ftarget, output = 'long')
      maxRT <- NULL
      if (bootstrap.size > 0)
        warning("Bootstrapping is currently not supported for
                                    fixed-budget statistics.")
      bootstrap.size = 0
      s <- split(dt$`f(x)`, dt$ID)
    }
    else
      stop("Unsupported argument 'which'. Available options are 'by_FV' and 'by_RT'")

    return(pairwise.test.list(s, maxRT, bootstrap.size))
  }


#' Replace greek symbols in text to avoid conversion problem in igraph-plots
#'
#' @param X Vector of strings in which to replace values
#' @noRd
convert_string_symbols <- function(x){
  symbols <- c("\u03b1",
               "\u03b2",
               "\u03b3",
               "\u03b4",
               "\u03b5",
               "\u03bb",
               "\u03bc")
  names <-  c("alpha", "beta", "gamma",
              "delta", "epsilon", "lambda", "mu")
  return(stri_replace_all_fixed(x, symbols, names, vectorize_all = F))
}

# TODO: move those two functions to a separate file
# TODO: review / re-write this function
#' Function for generating sequences of function values
#'
#' @param FV A list of function values
#' @param from Starting function value. Will be replaced by min(FV) if it is NULL or too small
#' @param to Stopping function value. Will be replaced by max(FV) if it is NULL or too large
#' @param by Stepsize of the sequence. Will be replaced if it is too small
#' @param length.out Number of values in the sequence.
#'   'by' takes preference if both it and length.out are provided.
#' @param scale Scaling of the sequence. Can be either 'linear' or 'log', indicating a
#'   linear or log-linear spacing respectively. If NULL, the scale will be predicted
#'   based on FV
#' @param force_limits Whether the from and to values are hard, or should be modified
#' based on detected FV values (default False)
#'
#' @return A sequence of function values
#' @export
#' @examples
#' FVall <- get_runtimes(dsl)
#' seq_FV(FVall, 10, 16, 1, scale='linear')
seq_FV <-
  function(FV,
           from = NULL,
           to = NULL,
           by = NULL,
           length.out = NULL,
           scale = NULL,
           force_limits = FALSE) {
    if (is.null(from) || !force_limits) {
      from <- max(from, min(FV))
    }
    if (is.null(to) || !force_limits) {
      to <- min(to, max(FV))
    }

    rev_trans <- function(x)
      x

    # Auto detect scaling
    # TODO: Improve this detection (based on FV?). Currently very arbitrary
    if (is.null(scale)) {
      if (to < 0 || from < 0)
        scale <- 'linear'
      else if (abs(log10(mean(FV)) - log10(median(FV))) > 1)
        scale <- 'log'
      else
        scale <- 'linear'
    }

    if (scale == 'log') {
      trans <- log10
      rev_trans <- function(x)
        10 ^ x
      # TODO: Better way to deal with negative values
      #       set lowest possible target globally instead of arbitrary 1e-12
      from <- max(1e-12, from[from > 0])
      to <- max(1e-12 , to[to > 0])
      from <- trans(from)
      to <- trans(to)
    }

    #Avoid generating too many samples
    if (!is.null(by)) {
      nr_samples_generated <- (to - from) / by
      if (nr_samples_generated > getOption("IOHanalyzer.max_samples", default = 100)) {
        by <- NULL
        if (is.null(length.out))
          length.out <-
            getOption("IOHanalyzer.max_samples", default = 100)
      }
    }

    if (is.null(by) || by > to - from) {
      if (is.null(length.out)) {
        length.out <- 10
        args <-
          list(
            from = from,
            to = to,
            by = (to - from) / (length.out - 1)
          )
      } else
        args <- list(from = from,
                     to = to,
                     length.out = length.out)
    } else
      args <- list(from = from, to = to, by = by)

    # tryCatch({
    do.call(seq, args) %>%
      c(from, ., to) %>%    # always include the starting / ending value
      unique %>%
      rev_trans
    # }, error = function(e) {
    # c()
    # })
  }

# TODO: review / re-write this function
#' Function for generating sequences of runtime values
#'
#' @param RT A list of runtime values
#' @param from Starting runtime value. Will be replaced by min(RT) if it is NULL or too small
#' @param to Stopping runtime value. Will be replaced by max(RT) if it is NULL or too large
#' @param by Stepsize of the sequence. Will be replaced if it is too small
#' @param length.out Number of values in the sequence.
#'   'by' takes preference if both it and length.out are provided.
#' @param scale Scaling of the sequence. Can be either 'linear' or 'log', indicating a
#'   linear or log-linear spacing respectively.
#'
#' @return A sequence of runtime values
#' @export
#' @examples
#' RTall <- get_runtimes(dsl)
#' seq_RT(RTall, 0, 500, length.out=10, scale='log')
seq_RT <-
  function(RT,
           from = NULL,
           to = NULL,
           by = NULL,
           length.out = NULL,
           scale = 'linear') {
    rev_trans <- function(x)
      x

    # Do this first to avoid the log-values being overwritten.
    from <- max(from, min(RT))
    to <- min(to, max(RT))

    if (scale == 'log') {
      RT <- log10(RT)
      rev_trans <- function(x)
        10 ^ x
      if (!is.null(from))
        from <- log10(from)
      if (!is.null(to))
        to <- log10(to)
      if (!is.null(by))
        by <- log10(by)
    }

    #Avoid generating too many samples
    if (!is.null(by)) {
      nr_samples_generated <- (to - from) / by
      if (nr_samples_generated > getOption("IOHanalyzer.max_samples", default = 100)) {
        by <- NULL
        if (is.null(length.out))
          length.out <-
            getOption("IOHanalyzer.max_samples", default = 100)
      }
    }

    # Also reset by if it is too large
    if (is.null(by) || by > to - from) {
      if (is.null(length.out)) {
        length.out <- 10
        args <-
          list(
            from = from,
            to = to,
            by = (to - from) / (length.out - 1)
          )
      } else
        args <- list(from = from,
                     to = to,
                     length.out = length.out)
    } else
      args <- list(from = from, to = to, by = by)

    do.call(seq, args) %>%
      c(from, ., to) %>%    # always include the starting / ending value
      unique %>%
      rev_trans %>%
      round
  }

# TODO: implement the empirical p.m.f. for runtime
EPMF <- function() {

}

#' Empirical Cumulative Dsitribution Function of Runtime of a single data set
#'
#' @param ds A DataSet or DataSetList object.
#' @param ftarget A Numerical vector. Function values at which runtime values are consumed
#' @param ... Arguments passed to other methods
#'
#' @return a object of type 'ECDF'
#' @export
#' @examples
#' ECDF(dsl,c(12,14))
#' ECDF(dsl[[1]],c(12,14))
ECDF <- function(ds, ftarget, ...)
  UseMethod("ECDF", ds)

# TODO: also implement the ecdf functions for function values and parameters
#' @rdname ECDF
#' @export
ECDF.DataSet <- function(ds, ftarget, ...) {
  runtime <- get_RT_sample(ds, ftarget, output = 'long')$RT
  runtime <- runtime[!is.na(runtime)]
  if (length(runtime) == 0)
    fun <- ecdf(Inf)
  else
    fun <- ecdf(runtime)

  class(fun)[1] <- 'ECDF'
  attr(fun, 'min') <- min(runtime)
  attr(fun, 'samples') <- length(runtime)
  attr(fun, 'max') <-
    max(runtime)  # the sample can be retrieved by knots(fun)
  fun
}

# TODO: review / re-write this function
#' @rdname ECDF
#' @export
ECDF.DataSetList <- function(ds, ftarget, ...) {
  if (length(ds) == 0)
    return(NULL)

  dims <- unique(get_dim(ds))
  funcs <- unique(get_funcId(ds))

  if (is.data.table(ftarget)) {
    runtime <- sapply(seq(nrow(ftarget)), function(i) {
      if (length(dims) > 1 && length(funcs) > 1) {
        names_temp <- ftarget[i][[1]] %>%
          strsplit(., ';')
        FuncId <- names_temp[[1]][[1]]
        Dim <- names_temp[[1]][[2]]
      }
      else if (length(dims) > 1) {
        FuncId <- funcs[[1]]
        Dim <- ftarget[i][[1]]
      }
      else if (length(funcs) > 1) {
        FuncId <- ftarget[i][[1]]
        Dim <- dims[[1]]
      }
      data <- subset(ds, funcId == FuncId, DIM == Dim)
      if (length(data) == 0)
        return(NA)
      temp_targets <- ftarget[i] %>%
        unlist %>%
        as.numeric
      names(temp_targets) <- NULL
      res <-
        get_RT_sample(data, temp_targets[2:11], output = 'long')$RT
      res[is.na(res)] <- Inf
      res
    }) %>%
      unlist
  } else if (is.list(ftarget)) {
    runtime <- sapply(seq_along(ftarget), function(i) {
      if (length(dims) > 1 && length(funcs) > 1) {
        names_temp <- names(ftarget[i])[[1]] %>%
          strsplit(., ';')
        FuncId <- names_temp[[1]][[1]]
        Dim <- names_temp[[1]][[2]]
      }
      else if (length(dims) > 1) {
        FuncId <- funcs[[1]]
        Dim <- names(ftarget[i])[[1]]
      }
      else if (length(funcs) > 1) {
        FuncId <- names(ftarget[i])[[1]]
        Dim <- dims[[1]]
      } else {
        FuncId <- funcs[[1]]
        Dim <- dims[[1]]
      }
      data <- subset(ds, funcId == FuncId, DIM == Dim)
      if (length(data) == 0)
        return(NA)
      res <- get_RT_sample(data, ftarget[[i]], output = 'long')$RT
      res[is.na(res)] <- Inf
      res
    }) %>%
      unlist
  } else {
    runtime <- get_RT_sample(ds, ftarget, output = 'long')$RT
    runtime[is.na(runtime)] <- Inf
  }


  if (length(runtime) == 0)
    return(NULL)

  fun <- ecdf(runtime)
  class(fun)[1] <- 'ECDF'
  attr(fun, 'min') <- min(runtime)
  attr(fun, 'max') <-
    max(runtime)  # the sample can be retrieved by knots(fun)
  fun
}

#' Area Under Curve (Empirical Cumulative Dsitribution Function)
#'
#' @param fun A ECDF object.
#' @param from double. Starting point of the area on x-axis
#' @param to   double. Ending point of the area on x-axis
#'
#' @return a object of type 'ECDF'
#' @export
#' @examples
#' ecdf <- ECDF(dsl,c(12,14))
#' AUC(ecdf, 0, 100)
AUC <- function(fun, from = NULL, to = NULL)
  UseMethod('AUC', fun)

#' @rdname AUC
#' @export
AUC.ECDF <- function(fun, from = NULL, to = NULL) {
  if (is.null(from))
    from <- attr(fun, 'min')
  if (is.null(to))
    to <- attr(fun, 'max')

  if (is.null(fun))
    0
  else
    integrate(fun,
              lower = from,
              upper = to,
              subdivisions = 1e3L)$value / (to - from)
}

#' Generate datatables of runtime or function value targets for a DataSetList
#'
#' Only one target is generated per (function, dimension)-pair, as opposed to the
#' function `get_default_ECDF_targets`, which generates multiple targets.
#'
#' @param dsList A DataSetList
#' @param which Whether to generate fixed-target ('by_FV') or fixed-budget ('by_RT') targets
#'
#' @return a data.table of targets
#' @export
#' @examples
#' get_target_dt(dsl)
get_target_dt <- function(dsList, which = 'by_RT') {
  vals <- c()
  funcs <- get_funcId(dsList)
  dims <- get_dim(dsList)
  dt <- as.data.table(expand.grid(funcs, dims))
  colnames(dt) <- c("funcId", "DIM")
  if (which == 'by_FV') {
    target_func <- get_target_FV
  }
  else if (which == 'by_RT') {
    target_func <- get_target_RT
  }
  else
    stop("Invalid argument for `which`; can only be `by_FV` or `by_RT`.")
  targets <- apply(dt, 1,
                   function(x) {
                     dsl_sub <- subset(dsList, funcId == x[[1]], DIM == x[[2]])
                     if (length(dsl_sub) > 0)
                       target_func(dsl_sub)
                     else
                       NULL
                   })
  dt[, target := targets]

  return(dt[!sapply(dt[['target']], is.null)])
}

#' Helper function for `get_target_dt`
#' @noRd
get_target_FV <- function(dsList) {
  vals <-
    get_FV_summary(dsList, Inf)[[paste0(100 * getOption("IOHanalyzer.quantiles", 0.02)[[1]], '%')]]
  if (is.null(attr(dsList, 'maximization')) ||
      attr(dsList, 'maximization')) {
    return(max(vals))
  }
  else {
    return(min(vals))
  }
}

#' Helper function for `get_target_dt`
#' @noRd
get_target_RT <- function(dsList) {
  return(max(get_runtimes(dsList)))
}

#' Glicko2 raning of algorithms
#'
#' This procedure ranks algorithms based on a glicko2-procedure.
#' Every round (total nr_rounds), for every function and dimension of the datasetlist,
#' each pair of algorithms competes. This competition samples a random runtime for the
#' provided target (defaults to best achieved target). Whichever algorithm has the lower
#' runtime wins the game. Then, from these games, the glicko2-rating is determined.
#'
#' @param dsl The DataSetList, can contain multiple functions and dimensions, but should have the
#' same algorithms for all of them
#' @param nr_rounds The number of rounds to run. More rounds leads to a more accurate ranking.
#' @param which Whether to use fixed-target ('by_FV') or fixed-budget ('by_RT') perspective
#' @param target_dt Custom data.table target value to use. When NULL, this is selected automatically.
#' @return A dataframe containing the glicko2-ratings and some additional info
#'
#' @export
#' @examples
#' glicko2_ranking(dsl, nr_round = 25)
#' glicko2_ranking(dsl, nr_round = 25, which = 'by_RT')
glicko2_ranking <-
  function(dsl,
           nr_rounds = 100,
           which = 'by_FV',
           target_dt = NULL) {
    if (!requireNamespace("PlayerRatings", quietly = TRUE)) {
      stop(
        "Package \"PlayerRatings\" is needed for this function to work. Please install it.",
        call. = FALSE
      )
    }
    req(length(get_id(dsl)) > 1)

    if (!is.null(target_dt) &&
        !('data.table' %in% class(target_dt))) {
      warning("Provided `target_dt` argument is not a data.table")
      target_dt <- NULL
    }

    if (is.null(target_dt))
      target_dt <- get_target_dt(dsl, which)

    if (!(which %in% c('by_FV', 'by_RT')))
      stop("Invalid argument: 'which' can only be 'by_FV' or 'by_RT'")
    p1 <- NULL
    p2 <- NULL
    scores <- NULL
    weeks <- NULL
    get_dims <- function() {
      dims <- get_dim(dsl)
      if (length(dims) > 1) {
        dims <- sample(dims)
      }
      dims
    }
    get_funcs <- function() {
      funcs <- get_funcId(dsl)
      if (length(funcs) > 1) {
        funcs <- sample(funcs)
      }
      funcs
    }
    n_algs = length(get_id(dsl))
    alg_names <- NULL
    for (k in seq(1, nr_rounds)) {
      for (dim in get_dims()) {
        targets_temp <- target_dt[target_dt$DIM == dim]
        for (fid in get_funcs()) {
          dsl_s <- subset(dsl, funcId == fid && DIM == dim)
          if (length(dsl_s) == 0)
            next
          if (which == 'by_FV') {
            target <- targets_temp[targets_temp$funcId == fid]$target
            x_arr <- get_RT_sample(dsl_s, target)
            win_operator <- `<`
          }
          else {
            target <- targets_temp[targets_temp$funcId == fid]$target
            x_arr <- get_FV_sample(dsl_s, target)
            win_operator <-
              ifelse(attr(dsl, 'maximization'), `>`, `<`)
          }
          if (is.null(alg_names))
            alg_names <- x_arr[, 3]
          vals = array(dim = c(n_algs, ncol(x_arr) - 4))
          for (i in seq(1, n_algs)) {
            ds <- subset(dsl_s, algId == alg_names[i][[1]])
            n_valid <- length(attr(ds[[1]], 'instance'))

            z <- x_arr[i]
            y <- as.numeric(z[, 5:ncol(x_arr)])


            vals[i, ] = sample(y[1:n_valid], length(vals[i, ]), replace = TRUE)
          }

          for (i in seq(1, n_algs)) {
            for (j in seq(i, n_algs)) {
              if (i == j)
                next
              weeks <- c(weeks, k)
              s1 <- sample(vals[i, ], 1)
              s2 <- sample(vals[j, ], 1)
              if (is.na(s1)) {
                if (is.na(s2)) {
                  won <- 0.5
                }
                else{
                  won <- 0
                }
              }
              else{
                if (is.na(s2)) {
                  won <- 1
                }
                else if (s1 == s2) {
                  won <- 0.5 #Tie
                }
                else {
                  won <- win_operator(s1, s2)
                }
              }
              p1 <- c(p1, i)
              p2 <- c(p2, j)
              scores <- c(scores, won)

            }
          }
        }
      }
    }
    # weeks <- seq(1,1,length.out = length(p1))
    games <-
      data.frame(
        Weeks = weeks,
        Player1 = p1,
        Player2 = p2,
        Scores = as.numeric(scores)
      )
    lout <- PlayerRatings::glicko2(games, init =  c(1500, 350, 0.06))
    lout$ratings$Player <- alg_names[lout$ratings$Player]
    lout
  }


#' Verify that the credentials for DSCtool have been set
#'
#' This uses the keyring package to store and load credentials.
#' If the keyring package does not exists, it will default to look for
#' a config-file in the 'repository'-folder, under your home directory.
#' This can be changed by setting the option IOHprofiler.config_dir
#' If you already have an account, please call `set_DSC_credentials`
#' with the corresponding username and password.
#' If you don't have an account, you can register for one using `register_DSC`
#'
#'
#'
#' @export
#' @examples
#' check_dsc_configured()
check_dsc_configured <- function() {
  if (!requireNamespace("keyring", quietly = TRUE)) {
    warning(
      "It is recommended to have the 'keyring' package installed to store
            DSCtool settings. Since this package is not found, we default
            to a local settings-file instead."
    )
    repo_dir <- paste0(file.path(Sys.getenv('HOME'), 'repository'))
    if (!is.null(getOption("IOHprofiler.repo_dir"))) {
      repo_dir <- getOption("IOHprofiler.repo_dir")
    }
    if (!file.exists(paste0(repo_dir, "/config.rds"))) {
      return(FALSE)
    }
    data <- readRDS(paste0(repo_dir, "/config.rds"))
    if (is.null(data$DSCusername) || is.null(data$DSCpassword)) {
      return(FALSE)
    }
    return(TRUE)
  }
  tryCatch({
    username <- keyring::key_get("DSCtool_name")
    pwd <- keyring::key_get("DSCtool")
    return(TRUE)
  },
  error = function(e) {
    print(
      "DSCtool connection not yet set. Please use the function `register_DSC` to
            create an account to use the DSC tool locally (or use 'add_DSC_credentials'
            to use an existing account)"
    )
    return(FALSE)
  })
}

#' Register an account to the DSCtool API
#'
#' This uses the keyring package to store and load credentials.
#' If you already have an account, please call `set_DSC_credentials` instead
#'
#' @param name Your name
#' @param username A usename to be identified with. Will be stored on keyring under 'DSCtool_name'
#' @param affiliation Your affiliation (university / company)
#' @param email Your email adress
#' @param password The password to use. If NULL, this will be generated at random.
#' Will be stored on keyring under 'DSCtool'
#'
#' @export
#' @examples
#' \dontrun{
#' register_DSC('John Doe', 'jdoe', 'Sample University', "j.doe.sample.com")
#' }
register_DSC <-
  function(name,
           username,
           affiliation,
           email,
           password = NULL) {
    if (is.null(password))
      password <- stri_rand_strings(1, 10)
    url <- "https://ws.ijs.si:8443/dsc-1.5/service/manage/user"
    result_json <-
      POST(
        url,
        add_headers(
          .headers = c('Content-Type' = "application/json",
                       'Accept' = "application/json")
        ),
        body = list(
          name = name,
          affiliation = affiliation,
          email = email,
          username = username,
          password = password
        ),
        encode = "json"
      )
    if (result_json$status_code == 200) {
      if (!requireNamespace("keyring", quietly = TRUE)) {
        repo_dir <- paste0(file.path(Sys.getenv('HOME'), 'repository'))
        if (!is.null(getOption("IOHprofiler.repo_dir"))) {
          repo_dir <- getOption("IOHprofiler.repo_dir")
        }
        saveRDS(
          list(DSCusername = username, DSCpassword = password),
          paste0(repo_dir, "/config.rds")
        )
        return(TRUE)
      }
      else {
        keyring::key_set_with_value("DSCtool", password = password)
        keyring::key_set_with_value("DSCtool_name", password = username)
        return(TRUE)
      }
    }
    else {
      stop("Something went wrong in registering for DSCtool")
    }
  }

#' Register an account to the DSCtool API
#'
#' This uses the keyring package to store and load credentials.
#' If you already have an account, please call `add_DSC_credentials` instead
#'
#' @param username The usename you use on DSCtool. Will be stored on keyring under 'DSCtool_name'
#' @param password The password you use on DSCtool. Will be stored on keyring under 'DSCtool'
#'
#' @export
#' @examples
#' \dontrun{set_DSC_credentials('jdoe', 'monkey123')}
set_DSC_credentials <- function(username, password) {
  if (!requireNamespace("keyring", quietly = TRUE)) {
    repo_dir <- paste0(file.path(Sys.getenv('HOME'), 'repository'))
    if (!is.null(getOption("IOHprofiler.repo_dir"))) {
      repo_dir <- getOption("IOHprofiler.repo_dir")
    }
    saveRDS(
      list(DSCusername = username, DSCpassword = password),
      paste0(repo_dir, "/config.rds")
    )
  }
  else {
    keyring::keyring_unlock(password = "")
    keyring::key_set_with_value("DSCtool", password = password)
    keyring::key_set_with_value("DSCtool_name", password = username)
  }
}

#' Load stored credentials for DSCtool
#'
#'
#' @noRd
#' @examples
#' \dontrun{
#' get_DSC_credentials()
#' }
get_DSC_credentials <- function() {
  if (!requireNamespace("keyring", quietly = TRUE)) {
    repo_dir <- paste0(file.path(Sys.getenv('HOME'), 'repository'))
    if (!is.null(getOption("IOHprofiler.repo_dir"))) {
      repo_dir <- getOption("IOHprofiler.repo_dir")
    }
    data <- readRDS(paste0(repo_dir, "/config.rds"))
    return(list(name = data$DSCusername, pwd = data$DSCpassword))
  }
  else {
    keyring::keyring_unlock(password = "")
    return(list(
      name = keyring::key_get("DSCtool_name"),
      pwd = keyring::key_get("DSCtool")
    ))
  }
}

#' Convert a DataSetList to the format needed for the DSCtool
#'
#'
#' @param dsList The DataSetList object
#' @param targets Optional list of target values (Runtime or target value)
#' @param which Whether to use a fixed-target 'by_RT' perspective or fixed-budget 'by_FV'
#' @param na.correction How to deal with missing values. Only used in fixed-target perspective.
#'
#' @noRd
#' @examples
#' convert_to_dsc_compliant(dsl)
convert_to_dsc_compliant <-
  function(dsList,
           targets = NULL,
           which = 'by_RT',
           na.correction = NULL) {
    maximize <- attr(dsList, 'maximization')
    variable <-
      fid <- value <- NULL #Set local binding to remove warnings
    by_rt <- which == 'by_RT'

    if (is.null(targets)) {
      targets <- get_target_dt(dsList, which)
    }


    final_list <- lapply(get_id(dsList), function(id) {
      dsl_sub <- subset(dsList, ID == id)
      problems <- lapply(dsl_sub, function(ds) {
        target <-
          targets[funcId == attr(ds, 'funcId') &
                    DIM == attr(ds, 'DIM'), 'target']
        if (!is.numeric(target)) {
          #TODO: more robust solution
          target <- target[[1]][[1]]
        }
        if (by_rt) {
          data <- get_RT_sample(ds, target, output = 'long')$RT
          if (any(is.na(data)) & !is.null(na.correction)) {
            if (na.correction == 'PAR-1') {
              budgets <- attr(ds, 'maxRT')
              data[is.na(data)] <- budgets[is.na(data)]
            }
            else if (na.correction == 'PAR-10') {
              budgets <- attr(ds, 'maxRT')
              data[is.na(data)] <- 10 * budgets[is.na(data)]
            }
            else if (na.correction == 'ERT') {
              ert <- as.numeric(get_ERT(ds, target)$ERT)
              if (is.infinite(ert)) {
                ert <- sum(attr(ds, 'maxRT'))
              }
              data[is.na(data)] <- ert
            }
            else if (na.correction == 'Remove-na') {
              data <- data[!is.na(data)]
            }
            else {
              stop('This value for `na.correction` is not supported')
            }
          }
        }
        else {
          data <- get_FV_sample(ds, target, output = 'long')$`f(x)`
          if (attr(ds, 'maximization'))
            data <- -1  * data
        }
        if (any(is.na(data))) {
          warning(
            "NA-values detected in data preparation for DSCtool. This will likely result in an
                  error. Please verify the provided DataSetList."
          )
        }
        return(list(
          name = paste0("F", attr(ds, 'funcId'), "_", attr(ds, 'DIM'), "D"),
          data = data
        ))
      })
      return(list(algorithm = id, problems = problems))
    })


    return(final_list)
  }

#' Get the matrix of rankings using the DSCtool api for a DataSetList
#'
#'
#' @param dsList The DataSetList object
#' @param targets Optional list of target values (Runtime or target value)
#' @param which Whether to use a fixed-target 'by_RT' perspective or fixed-budget 'by_FV'
#' @param test_type Either 'AD' for Anderson-Darling or KS for Kolmogorov-Smirnov tests
#' @param alpha Threshold value for statistical significance
#' @param epsilon Minimum threshold to have practical difference between algorithms (eDSC)
#' @param monte_carlo_iterations How many monte-carlo-simulations to perform
#' (set to 0 to use regular DSC)
#' @param na.correction How to deal with missing values. Only used in fixed-target perspective.
#' Options are:
#' - 'NULL': No correction is done. This will likely result in an error, as the DSCtool
#' does not allow for na values
#' - 'PAR-1' Replace missing values with Budget (budget taken from relevant DataSet)
#' - 'PAR-10' Replace missing values with 10*Budget (budget taken from relevant DataSet)
#' - 'ERT' Replace NA values with the Expected Running Time. If all values are NA, this
#' reverts to nr_runs * budget
#' - 'Remove-na' Removes all NA values
#'
#' @return A named list containing a ranked-matrix which has the rankin of each algorithm
#' on each problem, as well as a list of which omnibus tests can be used to further process
#' this data. This can be further analyzed using `get_dsc_omnibus`
#'
#' @export
#' @examples
#' get_dsc_rank(dsl, na.correction = 'PAR-10')
get_dsc_rank <-
  function(dsList,
           targets = NULL,
           which = 'by_RT',
           test_type = "AD",
           alpha = 0.05,
           epsilon = 0,
           monte_carlo_iterations = 0,
           na.correction = NULL) {
    if (!check_dsc_configured())
      return(NULL)
    url <- "https://ws.ijs.si:8443/dsc-1.5/service/rank"
    dsc_list <-
      convert_to_dsc_compliant(dsList, targets, which, na.correction = na.correction)
    json_list <-
      list(
        method = list(name = test_type, alpha = alpha),
        epsilon = epsilon,
        data = dsc_list,
        monte_carlo_iterations = monte_carlo_iterations
      )

    result_json <- POST(
      url,
      authenticate(get_DSC_credentials()$name, get_DSC_credentials()$pwd),
      add_headers(
        .headers = c('Content-Type' = "application/json",
                     'Accept' = "application/json")
      ),
      body = json_list,
      encode = "json"
    )

    return(content(result_json)$result)
  }


#' Perform omnibus statistical tests on the matrix of rankings from the DSCtool api
#'
#'
#' @param res The result of a call to the `get_dsc_rank`
#' @param method Which method to use to do the tests.
#' Has be be one of the allowed ones in `res$valid_methods`.
#' When NULL, the first valid option is chosen by default
#' @param alpha Threshold value for statistical significance
#'
#' @return A named list containing the algorithm means
#'
#' @export
#' @examples
#' get_dsc_omnibus(get_dsc_rank(dsl, na.correction = 'PAR-10'))
get_dsc_omnibus <- function(res, method = NULL, alpha = 0.05) {
  if (!check_dsc_configured())
    return(NULL)
  if (is.null(method))
    method <- res$valid_methods[[1]]
  else
    req(method %in% res$valid_methods)
  url <- "https://ws.ijs.si:8443/dsc-1.5/service/omnibus"
  new_json <- list(
    method = list(name = method, alpha = alpha),
    ranked_matrix = res$ranked_matrix,
    number_algorithms = res$number_algorithms,
    parametric_tests = res$parametric_tests
  )
  result_json <- POST(
    url,
    authenticate(get_DSC_credentials()$name, get_DSC_credentials()$pwd),
    add_headers(
      .headers = c('Content-Type' = "application/json",
                   'Accept' = "application/json")
    ),
    body = new_json,
    encode = "json"
  )

  return(content(result_json)$result)
}

#' Perform post-hoc processing on data from DSCtool
#'
#'
#' @param omni_res The result from a call to `get_dsc_omnibus`
#' @param nr_algs The number of algorithms present in `omni_res`
#' @param nr_problems The number of problems present in `omni_res`
#' @param base_algorithm The base algorithm to which the other are compared.
#' This has to be present in `omni_res$algorithm_means` as an `algorithm` property
#' @param method Either 'friedman' or 'friedman-aligned-rank'
#' @param alpha Threshold value for statistical significance
#'
#' @return A named list containing 4 types of analyses:
#' * Zvalue
#' * UnadjustedPValue
#' * Holm
#' * Hochberg
#'
#' @export
#' @examples
#' get_dsc_posthoc(get_dsc_omnibus(get_dsc_rank(dsl, na.correction = 'PAR-10')), 2, 2)
get_dsc_posthoc <-
  function(omni_res,
           nr_algs,
           nr_problems,
           base_algorithm = NULL,
           method = "friedman",
           alpha = 0.05) {
    if (!check_dsc_configured())
      return(NULL)
    if (is.null(base_algorithm))
      base_algorithm <- omni_res$algorithm_means[[1]]$algorithm
    else
      req(base_algorithm %in% unlist(omni_res$algorithm_means))
    url <- "https://ws.ijs.si:8443/dsc-1.5/service/posthoc"
    new_json <- list(
      method = list(name = method, alpha = alpha),
      algorithm_means = omni_res$algorithm_means,
      n = nr_problems,
      k = nr_algs,
      base_algorithm = base_algorithm
    )
    result_json <- POST(
      url,
      authenticate(get_DSC_credentials()$name, get_DSC_credentials()$pwd),
      add_headers(
        .headers = c('Content-Type' = "application/json",
                     'Accept' = "application/json")
      ),
      body = new_json,
      encode = "json"
    )

    return(content(result_json)$result)
  }


#' Get the marginal contribution of an algorithm to a portfolio
#'
#' Based on the contribution to the ECDF-curve of the VBS of the portfolio
#'
#' @param id The id for which to get the contribution
#' @param perm The permutation of algorithms to which is being contributed
#' @param j At which point in the permutation the contribution should be measured
#' @param dt The datatable in which the raw ecdf-values are stored (see `generate_data.ECDF_raw`)
#'
#' @export
#' @examples
#' dt <- generate_data.ECDF_raw(dsl, get_ECDF_targets(dsl))
#' get_marg_contrib_ecdf(get_id(dsl)[[1]], get_id(dsl), 1, dt)
get_marg_contrib_ecdf <- function(id, perm, j, dt) {
  hit <- NULL #Bind to avoid notes
  ids <- perm[0:j]
  v_pre <-
    sum(dt[ID %in% ids, list(hit = max(hit)), by = c('funcId', 'DIM', 'target', 'rt')][['hit']])
  ids <- perm[0:(j - 1)]
  if (j == 1) {
    v_post <- 0
  }
  else
    v_post <-
    sum(dt[ID %in% ids, list(hit = max(hit)), by = c('funcId', 'DIM', 'target', 'rt')][['hit']])
  v_pre - v_post
}


#' Get the shapley-values of a portfolio of algorithms
#'
#' Based on the contribution to the ECDF-curve of the VBS of the portfolio
#'
#' @param dsList The DataSetList object
#' @param targets A list or data.table containing the targets per function / dimension. If this is
#' a data.table, it needs columns 'target', 'DIM' and 'funcId'
#' @param scale.log Whether to use logarithmic scaling for the runtimes at which the ecdf will be sampled or not
#' @param group_size How many permutation groups will be considered
#' @param max_perm_size The maximum limit for permutations to be considered
#' @param normalize Whether or not to ensure the resulting values will be in [0,1]
#'
#' @export
#' @examples
#' \dontshow{data.table::setDTthreads(1)}
#' dsl_sub <- subset(dsl, funcId == 1)
#' get_shapley_values(dsl_sub, get_ECDF_targets(dsl_sub), group_size = 2)
get_shapley_values <-
  function(dsList,
           targets,
           scale.log = T,
           group_size = 5,
           max_perm_size = 10,
           normalize = T) {
    hit <- NULL #Bind to avoid notes
    ids_full <- get_id(dsList)

    dt <-
      generate_data.ECDF_raw(dsList, targets, scale_log = scale.log)

    nr_players <- length(ids_full)

    perms <-
      lapply(seq_len(nr_players * group_size), function(i) {
        sample(ids_full)
      })

    max_val <-
      sum(dt[, list(hit = max(hit)), by = c('funcId', 'DIM', 'target', 'rt')][['hit']])
    ### For each algorithm, calculate shapley value
    shapleys <- lapply(ids_full, function(id) {
      ### For each group of permutations
      temp <- mean(unlist(lapply(seq_len(max_perm_size), function(j) {
        perm_sub <- perms[((group_size * (j - 1)) + 1):(group_size * (j))]
        ###
        mean(unlist(lapply(perm_sub, function(perm) {
          perm <- replace(perm, c(j, which(perm == id)), c(id, perm[[j]]))
          get_marg_contrib_ecdf(id, perm, j, dt)
        })))

      })))
      if (normalize)
        temp <-
          temp / max_val #Scale so Shapleys always fall between 0 and 1
      temp
    })


    data.table(ID = ids_full, shapley = shapleys)
  }

#' Get the list of available options for data from the OPTION ontology
#'
#'
#' @param varname The variable for which to get the options. Restricted to [Fid, Iid, DIM, AlgId, Suite]
#' @param datasource The datasource for which to get the attributes. Either BBOB
#' or Nevergrad, or NULL if looking at a specific `study` argument
#' @param study Which study to load the requested variables for (NULL if no study is considered)
#' @param algs Which algorithms to get the requested variables for. Required for varnames in [Fid, Iid, DIM]
#'
#' @param ... Additional arguments to the OPTION call. Currently only supports 'Suite' for nevergrad.
#'
#' @return the options of varname given the specified datasource
#'
#' @export
#' @examples
#' get_ontology_var("Fid", "BBOB")
get_ontology_var <-
  function(varname,
           datasource = NULL,
           study = NULL,
           algs = NULL,
           ...) {
    url_base <- "http://semanticannotations.ijs.si:8080/"

    if (is.null(datasource)) {
      parameters_list <- list()
    } else if (datasource == "BBOB")
      parameters_list <- list(testbed = "COCO-BBOB")
    else{
      dot_args = list(...)
      parameters_list <-
        list(testbed = "Nevergrad", nevergradTestbed = dot_args[['suite']])
    }

    if (!is.null(study))
      parameters_list['study'] <- study

    if (!is.null(algs)) {
      temp <- lapply(algs, function(x) {
        return(paste0('\"', x, '\"'))
      })
      parameters_list['algorithms'] <- paste0(temp, collapse = ',')
    }

    url_appendix <- switch(
      varname,
      "Fid" = "functions",
      "DIM" = "dimensions",
      "Iid" = "instance",
      "Suite" = "nevergrad_testbeds",
      "AlgId" = "algorithms",
      "Study" = "studies",
      "Repo" = "studyDataSource"
    )

    url <- paste0(url_base, url_appendix)

    status <- try({
      resp <- GET(url, query = parameters_list)
      res <- content(resp)$results
    }, silent = TRUE)

    if (inherits(status, "try-error")) {
      return(NULL)
    } else {
      return(res)
    }
  }

#' Get the list of available options for data from the OPTION ontology
#'
#'
#' @param datasource The datasource: either BBOB or Nevergrad
#' @param fids The function names as given by `get_ontology_var`
#' @param dims The dimensionalities as given by `get_ontology_var`
#' @param algs The algorithm names as given by `get_ontology_var`
#' @param iids The instances as given by `get_ontology_var` (only for BBOB data)
#' @param funcsuites The function suite as given by `get_ontology_var` (only for Nevergrad data)
#' @param min_target The minimum target value for which to return data
#' @param max_target The maximum target value for which to return data
#' @param min_budget The minimum budget value for which to return data
#' @param max_budget The maximum budget value for which to return data
#'
#' @return a DataSetList object matching the selected attributes.
#'
#' @export
#' @examples
#' get_ontology_data("BBOB", "f5", 5, "IPOP400D", 1)
get_ontology_data <-
  function(datasource,
           fids,
           dims,
           algs,
           iids = NULL,
           funcsuites = NULL,
           min_target = NULL,
           max_target = NULL,
           min_budget = NULL,
           max_budget = NULL) {
    url_base <- "http://semanticannotations.ijs.si:8080/query"

    if (datasource == "BBOB")
      parameters_list <- list(testbed = "COCO-BBOB")
    else
      parameters_list <- list(testbed = "Nevergrad")

    parameters_list$functions <- paste0(fids, collapse = ',')
    parameters_list$dimensions  <- paste0(dims, collapse = ',')
    parameters_list$algorithms  <-
      paste0('\"', algs, '\"', collapse = ',')
    if (!is.null(iids))
      parameters_list$instances  <- paste0(iids, collapse = ',')
    if (!is.null(funcsuites))
      parameters_list$nevergradTestbed  <- funcsuites

    if (!is.null(min_target) & !is.null(max_target)) {
      parameters_list$target <- paste0(min_target, '-', max_target)
    } else if (!is.null(min_target)) {
      parameters_list$target <- paste0('>=', min_target)
    } else if (!is.null(max_target)) {
      parameters_list$target <- paste0('<=', max_target)
    }

    if (!is.null(min_budget) & !is.null(max_budget)) {
      parameters_list$budget <- paste0(min_budget, '-', max_budget)
    } else if (!is.null(min_budget)) {
      parameters_list$budget <- paste0('>=', min_budget)
    } else if (!is.null(max_budget)) {
      parameters_list$budget <- paste0('<=', max_budget)
    }


    status <- try({
      resp <- POST(url_base, body = parameters_list, encode = "form")
      results <- content(resp)
    }, silent = TRUE)

    if (inherits(status, "try-error")) {
      return(NULL)
    } else {
      dt_temp <- rbindlist(results$results)
      res <- convert_from_OPTION(dt_temp, datasource)
      return(res)
    }
  }
IOHprofiler/IOHanalyzer documentation built on Feb. 1, 2024, 11:35 a.m.