aitoaEvaluate: An R Package for Evaluating the aitoa Results

Documented in aitoa.load.log.file

#' @importFrom bit64 as.integer64
.int64.max.dbl.int <- as.integer64((2 ^ 53) - 1);
.int64.0 <- as.integer64(0L);


#' @title Load a Single Log File
#' @description Load a log file and return the results as a data frame. The data
#'   frame will have (at most) three columns: \describe{\item{\code{t}}{contains
#'   the time in milliseconds consumed since the start of the optimization
#'   process and is monotonously increasing} \item{\code{fes}}{contains the
#'   number of objective function evaluations since the start of the
#'   optimization process and is strictly monotonously increasing}
#'   \item{\code{f}}{contains the objective value and is strictly monotonously
#'   decreasing, with the except of the last point which might have the same
#'   objective value as the second-to-last point.}}
#'
#'   The data frame will be enriched with four attributes:
#'   \describe{\item{file}{the normalized absolute file path to the log file}
#'   \item{algorithm}{the id of the algorithm used to generate the log file}
#'   \item{instance}{the problem instance to which the algorithm was applied}
#'   \item{seed}{the hexadecimal string representation of the random seed}}
#'
#'   Time and FEs have a resolution of 1 and a maximum value of \code{2^53 - 1},
#'   as this is the highest integer value that can represented precisely with a
#'   double. If a higher value is encountered in any of the two columns,
#'   \link[base]{stop} will be invoked, i.e., higher values are not permitted.
#'   In order to test for this, we use the 64 bit integers from the \code{bit64}
#'   package, since \code{R} does not support 64 bit integers natively.
#' @param file the log file to load
#' @param keep.columns the columns to keep, any vector containing elements
#'   \code{"t"} (for time), \code{"f"} (for the objective value), and
#'   \code{"fes"} (for the consumed FEs)
#' @param f.must.be.improving \code{true} if the logged objective values must be
#'   strictly improving? This is the default way logs are generated by aitoa.
#'   However, you can also create a log where every single sampled solution is
#'   logged, so then you must set \code{f.must.be.improving=FALSE} to load the
#'   data.
#' @param make.time.unique should we make the time indices unique (except maybe
#'   for the first and last point)? This makes sense when we want to plot
#'   diagrams over a time axis, as we then have removed redundant points right
#'   away. If \code{make.time.unique==FALSE}, then there may be multiple
#'   improvements at the same time index due to the resolution of the computer
#'   clock (while each improvement will definitely have a unique FE).
#' @return a data frame with the columns \code{t} (time in ms), \code{fes}
#'   (function evaluations), and \code{f} (objective value), all of which are
#'   numeric or integer valued, with integer type being preferred if it can be
#'   used without loss of precision
#' @seealso \link{aitoa.parse.file.name}
#' @importFrom bit64 as.integer64
#' @export aitoa.load.log.file
#' @include parse_file_name.R
#' @include utils.R
aitoa.load.log.file <- function(file,
                                keep.columns = c("fes", "t", "f"),
                                make.time.unique=FALSE,
                                f.must.be.improving=TRUE) {
  old.options <- options(warn=2);

  stopifnot(!is.null(file),
            is.character(file),
            length(file) == 1L,
            !is.na(file),
            !is.null(keep.columns),
            is.character(keep.columns),
            length(keep.columns) > 0L,
            !any(is.na(keep.columns)),
            !is.null(make.time.unique),
            is.logical(make.time.unique),
            length(make.time.unique) == 1L,
            isTRUE(make.time.unique) || isFALSE(make.time.unique),
            !is.null(f.must.be.improving),
            is.logical(f.must.be.improving),
            length(f.must.be.improving) == 1L,
            isTRUE(f.must.be.improving) || isFALSE(f.must.be.improving));

  keep.columns <- unique(keep.columns);
  stopifnot(length(keep.columns) > 0L,
            all(keep.columns %in% c("fes", "t", "f")));

  file <- .file.exists(file);

  # read file as text file, one line = one element
  data <- readLines(con=file, warn=FALSE);
  data <- force(data);

  # detect consumed FEs
  consumedFEs <- grep("# CONSUMED_FES:", data, fixed = TRUE);
  consumedFEs <- force(consumedFEs);
  stopifnot(length(consumedFEs) == 1L);
  consumedFEs <- as.integer64(trimws(strsplit(data[consumedFEs[[1L]]],":", fixed=TRUE)[[1L]][2L]));
  consumedFEs <- force(consumedFEs);
  stopifnot(is.finite(consumedFEs),
            consumedFEs <= .int64.max.dbl.int,
            consumedFEs > .int64.0);
  consumedFEs <- as.numeric(consumedFEs);
  if(consumedFEs <= .Machine$integer.max) {
    .fes <- as.integer;
    consumedFEs <- as.integer(consumedFEs);
  } else {
    .fes <- as.numeric;
  }

  # detect consumed time
  consumedTime <- grep("# CONSUMED_TIME:", data, fixed = TRUE);
  consumedTime <- force(consumedTime);
  stopifnot(length(consumedTime) == 1L);
  consumedTime <- as.integer64(trimws(strsplit(data[consumedTime[[1L]]],":", fixed=TRUE)[[1L]][2L]));
  consumedTime <- force(consumedTime);
  stopifnot(is.finite(consumedTime),
            consumedTime <= .int64.max.dbl.int,
            consumedTime >= .int64.0);
  consumedTime <- as.numeric(consumedTime);
  if(consumedTime <= .Machine$integer.max) {
    .time <- as.integer;
    consumedTime <- as.integer(consumedTime);
  } else {
    .time <- as.numeric;
  }

  # extract the correct lines
  start <- grep("# BEGIN_LOG", data, fixed=TRUE)[[1L]];
  start <- force(start);
  end <- grep("# END_OF_LOG", data, fixed=TRUE)[[1L]];
  end <- force(end);
  stopifnot(start > 0L,
            end > (start + 2L),
            is.finite(start),
            is.finite(end));

  # load data as CSV
  data <- strsplit(data[(start+2L):(end-1L)], ";", fixed=TRUE);
  data <- force(data);
  f   <- as.numeric(vapply(data, `[[`, "", 1L));
  stopifnot(all(is.finite(f)));
  f   <- force(f);

  fes <- .fes(vapply(data, `[[`, "", 2L));
  fes <- force(fes);
  stopifnot(all(is.finite(fes)),
            all(fes > 0),
            all(ceiling(fes) == floor(fes)),
            all(fes <= consumedFEs));

  t   <- .time(vapply(data, `[[`, "", 3L));
  t   <- force(t);
  stopifnot(all(is.finite(t)),
            all(t >= 0),
            all(ceiling(t) == floor(t)),
            all(t <= consumedTime));
  rm("data");

  stopifnot(length(unique(fes)) == length(fes));

  startPointAdded <- FALSE;
  if(isTRUE(make.time.unique)) {
    # take care of re-occuring time values
    t.unique.indexes <- findInterval(unique(t), t);
    if(t.unique.indexes[1L] != 1L) {
      t.unique.indexes <- c(1L, t.unique.indexes);
      startPointAdded <- TRUE;
    }
    t <- t[t.unique.indexes];
    t <- force(t);
    f <- f[t.unique.indexes];
    f <- force(f);
    fes <- fes[t.unique.indexes];
    fes <- force(fes);
  }

  l <- length(t);
  stopifnot(l >= 1L);

  # add an end point if necessary
  endPointAdded <- FALSE;
  if(fes[l] < consumedFEs) {
    stopifnot(consumedTime >= t[l]);
    l <- l + 1L;
    f[l]   <- f[l - 1L];
    fes[l] <- consumedFEs;
    t[l]   <- consumedTime;
    endPointAdded <- TRUE;
  } else {
    stopifnot(t[l] <= consumedTime);
    consumedTime <- t[l];
  }

  stopifnot(fes[l] == consumedFEs,
              t[l] == consumedTime);

  minLength <- 1L;
  if(startPointAdded) { minLength <- minLength + 1L; }
  if(endPointAdded) { minLength <- minLength + 1L; }
  stopifnot(length(t) >= minLength,
            length(t) == length(f),
            length(f) == length(fes),
            length(t) == l,
            all(is.finite(t)),
            all(is.finite(f)),
            all(is.finite(fes)),
            all(is.numeric(t)),
            all(is.numeric(f)),
            all(is.numeric(fes)),
            all(fes > 0),
            all(fes <= consumedFEs),
            all(t >= 0),
            all(t <= consumedTime));

# check how the objective values progress: always decreasing
  if(l > 1L) {
    before <- 1L:(l-1L);
    after  <- 2L:l;
    stopifnot(all(fes[before] <= fes[after]),
              all(t[before] <= t[after]));
    if(f.must.be.improving) {
      stopifnot(all(f[before] >= f[after]));
    }
  }

  if(endPointAdded) {
    if(l > 2L) {
      stopifnot(fes[1L:(l-2L)] < fes[2L:(l-1L)]);
    }
    if(f.must.be.improving) {
      stopifnot(f[l] == f[l-1L]);
    }
  }
  if(startPointAdded) {
    stopifnot(t[1L] == t[2L]);
    if(f.must.be.improving) {
      stopifnot(f[1L] >= f[2L]);
    }
  }

  # OK, we should have valid data
  t <- force(t);
  fes <- force(fes);
  f <- force(f);

  # try to convert the data to the cheapest format
  if(all((f >= (-.Machine$integer.max)) &
         (f <= .Machine$integer.max))) {
    .ft <- as.integer(f);
    if(all(.ft == f)) {
      f <- .ft;
    }
  }

  # finished converting
  data <- unique(data.frame(t=t, fes=fes, f=f)[keep.columns]);
  rm("f");
  rm("fes");
  rm("t");

  data <- force(data);
  data <- do.call(force, list(x=data));
  stopifnot(is.data.frame(data),
            nrow(data) > 0L,
            ncol(data) == length(keep.columns),
            names(data) == keep.columns);

  attr(data, "file") <- file;
  info <- unname(unlist(aitoa.parse.file.name(file)));
  stopifnot(length(info) == 3L,
            is.character(info),
            all(nchar(info) > 0L));
  attr(data, "algorithm") <- info[[1L]];
  attr(data, "instance") <- info[[2L]];
  attr(data, "seed") <- info[[3L]];
  options(old.options);

  return(data);
}