R/output_interpreter.R

#file = "C:/Users/M/Documents/r_seminar/2015-12-21_standard_BBOB_run/CMAES_output_1_1.txt"
#read file
#data = read.table(file, skip = 1, fill = TRUE)

#name columns
#when x1 and x2 are not given
#colnames only for the original BBOB output which we do not use anymore (we use our own framework)
#colnames(data) = c("FE", "fitness_minus_Fopt", "best_fitness_minus_Fopt", 
#              "measured_fitness", "best_measured_fitness")
#when x1 and x2 are given

#colnames(data) = c("FE", "fitness_minus_Fopt", "best_fitness_minus_Fopt", 
#                   "measured_fitness", "best_measured_fitness", "x1", "x2")

#convert everything to integer
#values that are no integers are indicative of a new function run
#dataframe should be separated at these points

#if (!"BBmisc" %in% rownames(installed.packages())) install.packages("BBmisc")
#if (!"snow" %in% rownames(installed.packages())) install.packages("snow")
#if (!"parallel" %in% rownames(installed.packages())) install.packages("parallel")
#require(BBmisc)
#require(snow)
#require(parallel)

#' @title Interpretation of BBoB Data
#' @description
#' \code{readOutput} is used to read and interpret an output file generated by the benchmarking functions
#' \code{\link{bbob_custom}} or \code{\link{bbob_custom_parallel}}.
#' @details
#' There are several functions for reading and interpreting the output generated by \code{\link{bbob_custom}} 
#' or \code{\link{bbob_custom_parallel}}. \code{readOutput} reads one single output file and returns a list of
#' summary statistics computed from that data (see the value section). The user might call, for example,
#' \code{\link{readOutput}}(\code{output_function1}).
#' @param file
#' the path to the bbob output file to be read and interpreted.
#' @return \code{readOutput} returns an object of the class \code{single_bbob_result}. A list of those can be passed to
#' \code{\link{aggregateResults}}. Each \code{single_bbob_result} is a list that contains the following components:
#'   \item{allBest}{best fitness value of each processed function instance.}
#'   \item{avgBest}{average best fitness value of all processed function instances.}
#'   \item{overallBest}{overall best fitness value of all processed function instances.}
#'   \item{overallWorst}{overall worst fitness of all processed function instances.}
#'   \item{sdBest}{standard deviation of all best fitness values (\code{sd(allBest)}).}
#'   \item{allRuns}{number of iterations of each processed function instance that has been required for optimization.}
#'   \item{longestRun}{overall highest required number of iterations of all function instances.}
#'   \item{shortestRun}{overall lowest required number of iterations of all function instances.}
#'   \item{avgRun}{average required iterations of all function instances.}
#'   \item{sdRuns}{standard deviation of required iterations of all function instances (\code{sd(allRuns)}).}
#'   \item{allRunsEval}{number of function evaluations of each processed function instance that has been required for optimization.}
#'   \item{longestRunEval}{overall highest required number of function evaluations of all function instances.}
#'   \item{shortestRunEval}{overall lowest required number of function evaluations of all function instances.}
#'   \item{avgRunEval}{average required function evaluations of all function instances.}
#'   \item{sdRunsEval}{standard deviation of required function evaluations of all function instances (\code{sd(allRunsEval)}).}
#'   \item{allStagnations}{number of final iterations without improvement in the fitness value.}
#'   \item{longestStagnation}{highest number of iterations without improvement in the fitness value of all function instances.}
#'   \item{shortestStagnation}{lowest number of iterations without improvement in the fitness value of all function instances.}
#'   \item{avgStagnation}{average number of final iterations without improvement in the fitness value.}
#'   \item{sdStagnations}{standard deviation of the number of final iterations 
#'   without improvement in the fitness value of all function instances (\code{sd(allStagnations)}).}
#'   \item{allConvergence}{a \code{\link{data.frame}} that stores, in ticks of 100 function evaluations, the gap between the
#'   fitness value and the global optimum for each processed instance.}
#'   \item{avgConvergence}{the gaps of \code{allConvergence} averaged over all processed instances at each tick.}
#'   \item{allRestarts}{number of restarts that occurred while optimizing each function instance.}
#'   \item{t_test_termination}{number of t-test terminations while using OCD as a stopping condition.}
#'   \item{chi_test_termination}{number of chi-squared-test terminations while using OCD as a stopping condition.}
#'   \item{functionID}{the identifier of the bbob function that has been optimized.}
#'   \item{dimension}{the problem dimensions for the optimization.}
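#' @examples
#' \dontrun{
#' # a minimal usage sketch; the file name below is hypothetical and has to point to an
#' # output file produced by bbob_custom or bbob_custom_parallel
#' result = readOutput("CMAES_output_1_2.txt")
#' result$avgBest
#' result$allRuns
#' }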
#' @export
######################################################################
#for own output (CMAESr)
readOutput = function(file) {
  data = read.table(file, skip = 1, fill = TRUE)
  
  data = suppressWarnings(apply(data, 2, as.double))
  
  
  #get number of restarts
  #indicated by a -1 value in the first column
  allRestarts = data[which(data[,1] == -1),2]
  #remove restart rows
  #check if no restarts were logged (backward-compatibility)
  if (length(which(data[,1] == -1)) > 0) data = data[-which(data[,1] == -1),]
  
  #get type of test that caused termination of cmaes (only for OCD)
  # chi-squared-test termination is indicated by a -3 value in the first column
  # t-test termination is indicated by a -2 value in the first column
  chi_test_termination = data[which(data[,1] == -3),2]
  t_test_termination = data[which(data[,1] == -2),2]
  # remove rows of termination conditions
  if (length(which(data[,1] == -2)) > 0) data = data[-which(data[,1] == -2),]
  if (length(which(data[,1] == -3)) > 0) data = data[-which(data[,1] == -3),]
  
  #get separate runs
  #get split points (NAs) and increment run counter accordingly
  data = as.data.frame(cbind(data, run_id = integer(nrow(data))))
  
  #faster version of run_ids
  #get breaks (where one run stopped)
  breaks = which(is.na(data[,1]))
  
  #if there is only one run logged (e.g. for the random search output)
  if (length(breaks) < 2) {
    data$run_id = 1
  }
  else {
    #remove every second break (because there are two lines separating different runs, except for the last run)
    breaks = breaks[-seq(from = 1, to = length(breaks)-2, by = 2)]
    #add one break in front
    breaks = c(0, breaks)
    for (i in 1:(length(breaks)-1)) {
      data$run_id[breaks[i]:breaks[i+1]] = i
    }
  }
  allRunIDs = unique(data$run_id)
  
  ##################################
  #old
  #save all run ids for later use
  #allRunIDs = 1
  #inBreakPoint = FALSE
  #for (i in 1:nrow(data)) {
  #  data$run_id[i] = run
  #  if (is.na(data[i,1]) && inBreakPoint == FALSE) {
  #    run = run + 1
  #    allRunIDs = c(allRunIDs, run)
  #    inBreakPoint = TRUE
  #  }
  #  if (!is.na(data[i,1]) && inBreakPoint == TRUE) inBreakPoint = FALSE
  #}
  #remove all run ids overlap (if there are 20 runs, the above code actually detects 21)
  #allRunIDs = allRunIDs[1:(length(allRunIDs)-1)]
  
  #remove NA rows
  data = data[!is.na(data[,1]),]
  
  #clean fitness values
  #due to rounding errors in cmaesr there might be fitness values below zero 
  #(i.e. smaller than the global optimum), which, of course, does not make sense
  data[,3] = ifelse(data[,3] < 0, 0, data[,3])

  #get data for fitness
  
  #get overall best fitness
  overallBest = min(data[,3])
  
  #get average best
  allBest = double()
  for (i in allRunIDs) {
    allBest = c(allBest, min(data[which(data$run_id %in% i), 3]))
  }
  avgBest = mean(allBest)
  
  #get sd
  sdBest = sd(allBest)
  
  #get worst best fitness
  overallWorst = max(allBest)
  
  #get data for iterations
  longestRun = max(data[,1])
  allRuns = integer()
  for (i in allRunIDs) {
    allRuns = c(allRuns, max(data[which(data$run_id %in% i), 1]))
  }
  shortestRun = min(allRuns)
  avgRun = mean(allRuns)
  sdRuns = sd(allRuns)
  
  #get data for function evaluations
  longestRunEval = max(data[,2])
  allRunsEval = double()
  for (i in allRunIDs) {
    allRunsEval = c(allRunsEval, max(data[which(data$run_id %in% i), 2]))
  }
  shortestRunEval = min(allRunsEval)
  avgRunEval = mean(allRunsEval)
  sdRunsEval = sd(allRunsEval)
  
  #get number of final iterations without improvement
  allStagnations = integer(0)
  for (i in seq_along(allRunIDs)) {
    allStagnations = c(allStagnations, sum(data[which(data$run_id == allRunIDs[i]), 3] == allBest[i]))
  }
  longestStagnation = max(allStagnations)
  shortestStagnation = min(allStagnations)
  avgStagnation = mean(allStagnations)
  sdStagnations = sd(allStagnations)
  
  #analyze convergence behavior 
  #get average convergence
  #for this purpose pad all runs that are shorter than the longest run with their last best found value
  #this is desired in order to average the convergence over all instances
  #since results are logged per iteration rather than per function evaluation, instances with different
  #population sizes reach different FE counts in the same row
  #therefore, the average convergence cannot simply be aggregated over the rows
  #instead, store the convergence at ticks of 100 FEs and use the logged value closest to each tick
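  #a minimal illustration of the tick mapping (hypothetical FE counts): for logged FE counts c(50, 150, 250),
  #findInterval(c(1, 101, 201), c(50, 150, 250)) returns c(0, 1, 2), i.e. for each tick the last logged row
  #whose FE count does not exceed it (0 marks ticks before the first logged row)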
  allConvergenceTicks = seq(from = 1, to = max(data[,2]), by = 100)
  allConvergence = NULL
  for (i in allRunIDs) {
    #get the data for the current run id
    tempData = data[which(data$run_id == i),]
    #get iteration that corresponds closest to the convergence ticks
    iterations = findInterval(allConvergenceTicks, tempData[,2])
    #findInterval returns 0 for the first element(s), replace by 1
    iterations[which(iterations == 0)] = 1
    allConvergence = as.data.frame(cbind(allConvergence, tempData[iterations,3]))
  }
  allConvergence = as.data.frame(cbind(allConvergenceTicks, allConvergence))
  #if there is only one run, no need to average anything
  if (ncol(allConvergence) > 2) {
    avgConvergence = apply(allConvergence[,-1], 1, mean)
    avgConvergence = cbind(allConvergenceTicks, avgConvergence)
  }
  else avgConvergence = allConvergence
  
  #save information about the function and the dimension
  #extract from the file name
  #this is somewhat dirty, but works as long as the files are not renamed
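  #e.g., assuming the default naming convention of bbob_custom, "CMAES_output_12_20.txt"
  #yields functionID = 12 and dimension = 20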
  filePart = substr(file, gregexpr("output", file)[[1]][length(gregexpr("output", file)[[1]])], nchar(file))
  split = strsplit(filePart, "_")
  functionID = as.numeric(split[[1]][2])
  dimension = as.numeric(strsplit(split[[1]][3], "\\.")[[1]][1])
  #format return value
  result = list(allBest = allBest, avgBest = avgBest, overallBest = overallBest, overallWorst = overallWorst,
                sdBest = sdBest, 
                allRuns = allRuns, longestRun = longestRun, shortestRun = shortestRun, avgRun = avgRun, 
                sdRuns = sdRuns, allRunsEval = allRunsEval, longestRunEval = longestRunEval, 
                shortestRunEval = shortestRunEval, avgRunEval = avgRunEval, sdRunsEval = sdRunsEval, 
                allStagnations = allStagnations, longestStagnation = longestStagnation,
                shortestStagnation = shortestStagnation, avgStagnation = avgStagnation, 
                sdStagnations = sdStagnations, allConvergence = allConvergence, avgConvergence = avgConvergence,
                allRestarts = allRestarts, t_test_termination = t_test_termination, chi_test_termination=chi_test_termination,
                functionID = functionID, dimension = dimension)
  class(result) = "single_bbob_result"
  return(result)
}

#' @title Aggregation of Single BBoB Results
#' @description
#' \code{aggregateResults} is used to aggregate the results generated by \code{\link{readOutput}}.
#' 
#' @details 
#' The function \code{aggregateResults} takes a number of result objects, produced by applying
#' \code{readOutput} on single data files, and aggregates those results. 
#' For example, to aggregate the results of two function optimization runs, the user first interprets each single output file by
#' calling \code{\link{readOutput}} (e.g. \code{result1 = readOutput(output_function1)},
#' \code{result2 = readOutput(output_function2)}).
#' Then, these results are passed to \code{aggregateResults} as a list
#' (e.g. \code{aggregateResults(list(result1, result2))}).
#' @param allResults
#' The user has to pass a list of result objects produced by applying \code{\link{readOutput}}.
#' @return \code{aggregateResults} returns a list that contains the following components aggregated over all passed single bbob results
#' (see \code{\link{readOutput}} for information on the non aggregated single bbob results):
#'   \item{aggregatedAllBest}{vector of all best fitness values.}
#'   \item{aggregatedAvgBest}{aggregated average best fitness values.}
#'   \item{aggregatedOverallBest}{overall best fitness values.}
#'   \item{aggregatedOverallWorst}{overall worst fitness values.}
#'   \item{aggregatedSDBests}{standard deviations of all best fitness values.}
#'   \item{aggregatedAllRuns}{vector of the number of iterations.}
#'   \item{aggregatedLongestRun}{overall highest required number of iterations.}
#'   \item{aggregatedShortestRun}{overall lowest required number of iterations.}
#'   \item{aggregatedAvgRun}{average required iterations.}
#'   \item{aggregatedSDRuns}{standard deviation of required iterations (\code{sd(aggregatedAllRuns)}).}
#'   \item{aggregatedAllRunsEval}{vector of the number of function evaluations.}
#'   \item{aggregatedLongestRunEval}{overall highest required number of function evaluations.}
#'   \item{aggregatedShortestRunEval}{overall lowest required number of function evaluations.}
#'   \item{aggregatedAvgRunEval}{average required function evaluations.}
#'   \item{aggregatedSDRunsEval}{standard deviation of required function evaluations (\code{sd(aggregatedAllRunsEval)}).}
#'   \item{aggregatedAllStagnation}{vector of the final iterations without improvement.}
#'   \item{aggregatedLongestStagnation}{overall highest number of iterations without improvement.}
#'   \item{aggregatedShortestStagnation}{overall lowest number of iterations without improvement.}
#'   \item{aggregatedAvgStagnation}{average number of final iterations without improvement in the fitness value.}
#'   \item{aggregatedSDStagnation}{standard deviation of the number of final iterations without improvement 
#'   (sd of \code{aggregatedAllStagnation}).}
#'   \item{aggregatedAllConvergence}{a \code{\link{data.frame}} that stores the average convergence of every single bbob result.}
#'   \item{aggregatedAvgConvergence}{the convergence of all single bbob results averaged at each tick (row means of \code{aggregatedAllConvergence}).}
#'   \item{aggregatedAllRestarts}{vector of the number of restarts.}
#'   \item{aggregated_t_test_termination}{vector of the number of t-test terminations while using OCD as a stopping condition.}
#'   \item{aggregated_chi_test_termination}{vector of the number of chi-squared-test terminations while using OCD as a stopping condition.}
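#' @examples
#' \dontrun{
#' # a hedged sketch; the file names are hypothetical and have to point to output files
#' # produced by bbob_custom or bbob_custom_parallel
#' result1 = readOutput("CMAES_output_1_2.txt")
#' result2 = readOutput("CMAES_output_2_2.txt")
#' aggregated = aggregateResults(list(result1, result2))
#' aggregated$aggregatedAvgBest
#' }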
#' @export
aggregateResults = function(allResults) {
  #do some input checks
  if (!is.list(allResults)) stop("input must be of type list")
  for (i in 1:length(allResults)) {
    if (!inherits(allResults[[i]], "single_bbob_result")) stop("all elements of result list must be of type single_bbob_result")
  }
  
  #aggregate results from input single_bbob_results
  #aggregate best fitness values
  aggregatedAllBest = numeric(0)
  for (i in 1:length(allResults)) {
    aggregatedAllBest = c(aggregatedAllBest, allResults[[i]]$allBest)
  }
  aggregatedAvgBest = mean(aggregatedAllBest)
  aggregatedOverallBest = min(aggregatedAllBest)
  aggregatedOverallWorst = max(aggregatedAllBest)
  aggregatedSDBests = sd(aggregatedAllBest)
  
  #aggregate runtimes
  aggregatedAllRuns = integer(0)
  for (i in 1:length(allResults)) {
    aggregatedAllRuns = c(aggregatedAllRuns, allResults[[i]]$allRuns)
  }
  aggregatedLongestRun = max(aggregatedAllRuns)
  aggregatedShortestRun = min(aggregatedAllRuns)
  aggregatedAvgRun = mean(aggregatedAllRuns)
  aggregatedSDRuns = sd(aggregatedAllRuns)
  
  #aggregate runtimes by function evaluations
  aggregatedAllRunsEval = integer(0)
  for (i in 1:length(allResults)) {
    aggregatedAllRunsEval = c(aggregatedAllRunsEval, allResults[[i]]$allRunsEval)
  }
  aggregatedLongestRunEval = max(aggregatedAllRunsEval)
  aggregatedShortestRunEval = min(aggregatedAllRunsEval)
  aggregatedAvgRunEval = mean(aggregatedAllRunsEval)
  aggregatedSDRunsEval = sd(aggregatedAllRunsEval)
  
  #aggregate stagnation
  aggregatedAllStagnation = integer(0)
  for (i in 1:length(allResults)) {
    aggregatedAllStagnation = c(aggregatedAllStagnation, allResults[[i]]$allStagnations)
  }
  aggregatedLongestStagnation = max(aggregatedAllStagnation)
  aggregatedShortestStagnation = min(aggregatedAllStagnation)
  aggregatedAvgStagnation = mean(aggregatedAllStagnation)
  aggregatedSDStagnation = sd(aggregatedAllStagnation)
  
  #aggregate convergence
  #follows the same logic as the single convergence aggregation
  #except for that entries already correspond to ticks, so we just need to take the row
  #corresponding to the current tick
  allConvergenceTicks = seq(from = 1, to = aggregatedLongestRunEval, by = 100)
  aggregatedAllConvergence = matrix(nrow = length(allConvergenceTicks), ncol = length(allResults))

  for (i in 1:length(allResults)) {
    currentConvergence = allResults[[i]]$avgConvergence
    #find all ticks that are included in the current convergence, pad the rest
    currentConvergenceTicks = c(currentConvergence[,1], rep(currentConvergence[nrow(currentConvergence),1], 
                                                            times = (length(allConvergenceTicks) - nrow(currentConvergence))))
    #convert to indexes
    currentIndexes = ceiling(as.numeric(currentConvergenceTicks)/100)
    aggregatedAllConvergence[,i] = currentConvergence[currentIndexes,2]
  }
  aggregatedAvgConvergence = apply(aggregatedAllConvergence, 1, mean)
  aggregatedAvgConvergence = cbind(allConvergenceTicks, aggregatedAvgConvergence)
  aggregatedAllConvergence = cbind(allConvergenceTicks, aggregatedAllConvergence)
  
  #aggregate restarts
  aggregatedAllRestarts = integer(0)
  for (i in 1:length(allResults)) {
    aggregatedAllRestarts = c(aggregatedAllRestarts, allResults[[i]]$allRestarts)
  }  
  
  #aggregate t_test_termination
  aggregated_t_test_termination = integer(0)
  for (i in 1:length(allResults)) {
    aggregated_t_test_termination = c(aggregated_t_test_termination, allResults[[i]]$t_test_termination)
  }
  
  #aggregate chi_test_termination
  aggregated_chi_test_termination = integer(0)
  for (i in 1:length(allResults)) {
    aggregated_chi_test_termination = c(aggregated_chi_test_termination, allResults[[i]]$chi_test_termination)
  }
  
  #format return value
  result = list(aggregatedAllBest = aggregatedAllBest, aggregatedAvgBest = aggregatedAvgBest,
                aggregatedOverallBest = aggregatedOverallBest, aggregatedOverallWorst = aggregatedOverallWorst,
                aggregatedSDBests = aggregatedSDBests, aggregatedAllRuns = aggregatedAllRuns, 
                aggregatedLongestRun = aggregatedLongestRun, aggregatedShortestRun = aggregatedShortestRun,
                aggregatedAvgRun = aggregatedAvgRun, aggregatedSDRuns = aggregatedSDRuns, 
                aggregatedAllRunsEval = aggregatedAllRunsEval, aggregatedLongestRunEval = aggregatedLongestRunEval,
                aggregatedShortestRunEval = aggregatedShortestRunEval, aggregatedAvgRunEval = aggregatedAvgRunEval,
                aggregatedSDRunsEval = aggregatedSDRunsEval, aggregatedAllStagnation = aggregatedAllStagnation,
                aggregatedLongestStagnation = aggregatedLongestStagnation, 
                aggregatedShortestStagnation = aggregatedShortestStagnation,
                aggregatedAvgStagnation = aggregatedAvgStagnation,
                aggregatedSDStagnation = aggregatedSDStagnation, 
                aggregatedAllConvergence = aggregatedAllConvergence,
                aggregatedAvgConvergence = aggregatedAvgConvergence,
                aggregatedAllRestarts = aggregatedAllRestarts,
                aggregated_t_test_termination = aggregated_t_test_termination,
                aggregated_chi_test_termination = aggregated_chi_test_termination)
  return(result)
}

#' @title Empirical Cumulative Distribution Function of BBoB data
#' @description 
#' \code{extractECDFofFunctions} returns an empirical cumulative distribution function of the fraction of functions that were
#' solved within the desired fitness gap, indexed by the number of function evaluations this took.
#' @details 
#' \code{extractECDFofFunctions} can be used to observe which fraction of functions has been solved within the desired fitness gap
#' passed to the function. For example, a bbob experiment could terminate an optimization run after 100000 function evaluations.
#' If the specified fitness gap is not reached after this number of function evaluations, the corresponding function remains unsolved
#' with respect to that specific gap.
#' @param results
#' \code{results} must be a return object of \code{\link{aggregateResults}}, i.e. the aggregated results of several function optimizations.
#' @param fitnessGap
#' the fitness gap (distance to the global optimum) below which a function instance counts as solved (default \code{1e-08}).
#' @return 
#' \code{extractECDFofFunctions} returns an empirical cumulative distribution function based on bbob data which can be plotted directly.
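#' @examples
#' \dontrun{
#' # a hedged sketch; 'aggregated' is assumed to be a return object of aggregateResults
#' ecdfData = extractECDFofFunctions(aggregated, fitnessGap = 1e-08)
#' plot(ecdfData, type = "s", xlab = "function evaluations", ylab = "fraction of solved functions")
#' }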
#' @export
extractECDFofFunctions = function(results, fitnessGap = 1e-08) {
  allConvergence = results$aggregatedAllConvergence[,-1]
  thresholds = integer(0)
  for (i in 1:ncol(allConvergence)) {
    if (!length(which(allConvergence[,i]<fitnessGap)) == 0) {
      thresholds = c(thresholds, ((min(which(allConvergence[,i]<fitnessGap))-1) * 100 + 1))
    }
  }
  #sort thresholds ascending
  thresholds = sort(thresholds)
  #make % values for cumulative distribution function
  breaks = seq(from = 1/ncol(allConvergence), to = 1, length.out = ncol(allConvergence))
  #remove % values that are not reached (these functions did not reach the desired value)
  breaks = breaks[1:length(thresholds)]
  #add a point (all iterations,max(breaks)) with the maximum number of FEs in order to show the stagnation in the plot
  #max 100001 are evaluated if there are 100000 FEs
  breaks = c(breaks, max(breaks))
  thresholds = c(thresholds, (nrow(allConvergence)-1) * 100 + 1)
  #add (0,0) for better plots and return
  return(rbind(c(0,0), cbind(thresholds, breaks)))
}

#' @name load_results
#' @aliases loadAllResults
#' @aliases loadAllResultsParallel
#' @title Load All Single BBoB Results From a Folder
#' @description 
#' \code{loadAllResults} loads the bbob results of a number of single bbob output files generated by \code{\link{bbob_custom}} or 
#' \code{\link{bbob_custom_parallel}}.
#' @details 
#' \code{loadAllResults} detects and loads the files to be loaded/interpreted, 
#' i.e. \code{\link{readOutput}} is applied to every single file specified by \code{usedFunctions, usedDimensions, path, algorithmName}.
#' For parallelisation of the loading process, the user might invoke \code{loadAllResultsParallel} with the same parameter setup.
#' @param usedFunctions
#' the function identifiers for which bbob data exists and interpretation is desired.
#' @param usedDimensions
#' the problem dimensions that have been used for optimization.
#' @param path
#' the path of the folder containing the output data of a bbob experiment.
#' @param algorithmName
#' the identifier of the optimizer used for detecting single output files in the specified folder.
#' @return 
#' \code{loadAllResults} returns an object \code{allResults}, which is a list of single bbob results as generated by \code{readOutput}
#' (see \code{\link{readOutput}} for details on the lists' elements).
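#' @examples
#' \dontrun{
#' # a hedged sketch; path and algorithm name are hypothetical and have to match the
#' # naming convention <algorithmName>_output_<functionID>_<dimension>.txt
#' allResults = loadAllResults(usedFunctions = 1:24, usedDimensions = c(2, 5, 10, 20),
#'                             path = "2015-12-21_standard_BBOB_run", algorithmName = "CMAES")
#' aggregated = aggregateResults(allResults)
#' }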
#' @importFrom BBmisc makeProgressBar
#' @export
#loads all results that correspond to the naming conventions used by bbob_custom
loadAllResults = function(usedFunctions, usedDimensions, path, algorithmName) {
  allResults = NULL
  pbar = makeProgressBar(min = 0, max = length(usedFunctions)*length(usedDimensions))
  pbar$set(0)
  for (i in 1:length(usedFunctions)) {
    for (j in 1:length(usedDimensions)) {
      file = paste(path, "/", algorithmName, "_output_", usedFunctions[i], "_", usedDimensions[j], ".txt", sep = "")
      result = readOutput(file)
      if (is.null(allResults)) allResults = list(result)
      else allResults = c(allResults, list(result))
      pbar$set((i-1)*length(usedDimensions)+j)
    }
  }
  return(allResults)
}

#' @rdname load_results
#' @importFrom parallel detectCores
#' @importFrom snow makeCluster stopCluster clusterApply clusterExport
#' @export
#loads all results that correspond to the naming conventions used by bbob_custom (parallel version)
#gets more efficient the more different functions were used
loadAllResultsParallel = function(usedFunctions, usedDimensions, path, algorithmName) {
  allResults = NULL
  pbar = makeProgressBar(min = 0, max = length(usedDimensions))
  pbar$set(0)
  nCores = parallel::detectCores()
  cluster = snow::makeCluster(nCores, type = "SOCK")
  #export all environment functions
  ex = Filter(function(x) is.function(get(x, .GlobalEnv)), ls(.GlobalEnv))
  snow::clusterExport(cluster, ex)
  for (i in 1:length(usedDimensions)) {
    results = snow::clusterApply(cl = cluster, x = usedFunctions, function(x) readOutput(
      paste(path, "/", algorithmName, "_output_", x, "_", usedDimensions[i], ".txt", sep = "")
    ))
    if (is.null(allResults)) allResults = results
    else allResults = c(allResults, results)
    pbar$set(i)
  }
  snow::stopCluster(cluster)
  #now order the results, as they might be out of order due to the parallel jobs
  sortedResults = NULL
  for (i in usedFunctions) {
    for (j in usedDimensions) {
      for (k in 1:length(allResults)) {
        if (allResults[[k]]$functionID == i && allResults[[k]]$dimension == j) {
          if (is.null(sortedResults)) sortedResults = allResults[k]
          else sortedResults = c(sortedResults, allResults[k])
        }
      }
    }
  }
  return(sortedResults)
}

#' @name allresults_processing
#' @aliases getAggregatedConvergenceFunctions
#' @aliases getAvgBestPerFunctionAndDimension
#' @aliases getAvgBestPerFunction
#' @aliases getAvgBestPerDimension
#' @title Interpretation of Aggregated BBoB Results
#' @description 
#' \code{allresults_processing} is a collection of functions for interpreting aggregated bbob results 
#' (i.e. a return object of \code{\link{aggregateResults}}).
#' @details
#' The functions as defined above serve the following purposes:
#' \describe{
#'   \item{\code{getAggregatedConvergenceFunctions}}{A function that averages the convergence 
#'   for each optimized function over all dimensions specified.}
#'   \item{\code{getAvgBestPerFunctionAndDimension}}{A function that averages the best results
#'   for each function and all dimensions specified.}
#'   \item{\code{getAvgBestPerFunction}}{A function that averages the best results
#'   for each function over all dimensions specified.}
#'   \item{\code{getAvgBestPerDimension}}{A function that averages the best results
#'   for each dimension over all functions specified.}
#'   }
#' @param results
#' \code{results} must be a return object of \code{\link{aggregateResults}}, i.e. the aggregated results of several function optimizations.
#' @param nFunctions
#' the number of functions for which data exists and results should be computed.
#' @param nDimensions
#' the number of dimensions for which data exists and results should be computed (nDimensions is the total number of logged dimensions,
#' i.e. \code{nDimensions} has to be a counting value, not the actual dimensionality)
#' @return The return objects are function specific and as follows:
#'   \item{aggregatedConvergenceFunctions}{A matrix of the fitness gaps for each function averaged over all dimensions.}
#'   \item{getAvgBestPerFunctionAndDimension}{A vector that stores the average best value (mean over all instances)
#'   for each function and each dimension of that function.}
#'   \item{getAvgBestPerFunction}{A vector that stores the average best value (mean over all instances) for each function over
#'   all dimensions (dimensions are not considered separately).}
#'   \item{getAvgBestPerDimension}{A vector that stores the average best value
#'   for each dimension over all functions (functions are not considered separately).}
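#' @examples
#' \dontrun{
#' # a hedged sketch; 'aggregated' is assumed to be the aggregateResults output of an
#' # experiment with 24 functions logged in 4 dimensions each
#' convPerFunction = getAggregatedConvergenceFunctions(aggregated, nFunctions = 24, nDimensions = 4)
#' bestPerFunction = getAvgBestPerFunction(aggregated, nFunctions = 24, nDimensions = 4)
#' bestPerDimension = getAvgBestPerDimension(aggregated, nFunctions = 24, nDimensions = 4)
#' }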
#' @export
#get convergence averaged per function (over all dimensions)
getAggregatedConvergenceFunctions = function(results, nFunctions, nDimensions) {
  allConvergence = results$aggregatedAllConvergence
  ticks = allConvergence[,1]
  allConvergence = allConvergence[,-1]
  aggregatedConvergenceFunctions = matrix(nrow = nrow(allConvergence), ncol = nFunctions, data = 0)
  for (i in 1:nFunctions) {
    aggregatedConvergenceFunctions[,i] = apply(allConvergence[,((i-1) * nDimensions + 1):(i * nDimensions)], 
                                               1, mean)
  }
  aggregatedConvergenceFunctions = cbind(ticks, aggregatedConvergenceFunctions)
  return(aggregatedConvergenceFunctions)
}

#' @rdname allresults_processing
#' @export
#get best results averaged per function and dimension
getAvgBestPerFunctionAndDimension = function(results, nFunctions, nDimensions) {
  avgBest = double(0)
  nInstances = length(results$aggregatedAllBest)/nFunctions/nDimensions
  for (i in 1:(nFunctions*nDimensions)) {
    avgBest = c(avgBest, mean(results$aggregatedAllBest[((i-1)*nInstances+1):(i*nInstances)]))
  }
  return(avgBest)
}

#' @rdname allresults_processing
#' @export
#get best results averaged per function
getAvgBestPerFunction = function(results, nFunctions, nDimensions) {
  avgBest = double(0)
  nInstances = length(results$aggregatedAllBest)/nFunctions/nDimensions
  for (i in 1:nFunctions) {
    avgBest = c(avgBest, mean(results$aggregatedAllBest[((i-1)*nInstances*nDimensions+1):(i*nInstances*nDimensions)]))
  }
  return(avgBest)
}

#' @rdname allresults_processing
#' @export
#get best results averaged per dimension
getAvgBestPerDimension = function(results, nFunctions, nDimensions) {
  avgBest = double(0)
  nInstances = length(results$aggregatedAllBest)/nFunctions/nDimensions
  for (i in 1:nDimensions) {
    currentAvg = double(0)
    for (j in 1:nFunctions) {
      indexes = (((j-1)*nDimensions*nInstances+1+(i-1)*nInstances):((j-1)*nDimensions*nInstances+i*nInstances))
      currentAvg = c(currentAvg, mean(results$aggregatedAllBest[indexes]))
    }
    currentAvg = mean(currentAvg)
    avgBest = c(avgBest, currentAvg)
  }
  return(avgBest)
}

#' @title 
#' Get the Number of Active Functions in an Iteration
#' @description 
#' \code{getActiveFunctions} returns, for each tick of 100 function evaluations, the number of runs that are not yet stopped.
#' Functions stop, e.g., when a certain solution quality is reached.
#' @param results
#' \code{results} must be a return object of \code{\link{aggregateResults}}, i.e. the aggregated results of several function optimizations.
#' @return
#' \code{getActiveFunctions} returns a vector that contains, for each tick of 100 function evaluations, the number of runs 
#' (instances of all functions) that are not yet stopped.
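#' @examples
#' \dontrun{
#' # a hedged sketch; 'aggregated' is assumed to be a return object of aggregateResults
#' activeFunctions = getActiveFunctions(aggregated)
#' plot(activeFunctions, type = "l", xlab = "tick (100 function evaluations each)", ylab = "active runs")
#' }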
#' @export
getActiveFunctions = function(results) {
  notConverged = integer(0)
  allRunsEval = results$aggregatedAllRunsEval
  ticks = seq(from = 1, to = results$aggregatedLongestRunEval, by = 100)
  for (i in ticks) {
    currentlyNotConverged = 0
    #track runs that no longer satisfy the condition below and remove them from the vector
    #(they will never satisfy it again at a later tick)
    removeVector = integer(0)
    for (j in 1:length(allRunsEval)) {
      if (allRunsEval[j] > i) currentlyNotConverged = currentlyNotConverged + 1
      else removeVector = c(removeVector, j)
    }
    if (length(removeVector) > 0) allRunsEval = allRunsEval[-removeVector]
    notConverged = c(notConverged, currentlyNotConverged)
  }
  return(notConverged)
}

#' @title Average Convergence per Function and/or per Dimension
#' @description
#' \code{averageConvergence} returns a matrix with the convergence values per function or per dimension or per a combination of both.
#' @param allConvergence
#' matrix of convergence values
#' @param nDimensions 
#' the number of dimensions for which data exists and results should be computed (nDimensions is the total number of logged dimensions,
#' i.e. \code{nDimensions} has to be a counting value, not the actual dimensionality)
#' @param includedFunctions
#' functions for which the average convergence should be computed
#' @param includedDimensions
#' dimensions for which the average convergence should be computed
#' @return
#' \code{averageConvergence} returns the average of the convergence values for the functions and/or dimensions specified when calling the function.
#average convergence per function or per dimension or per a combination of both
#nDimensions is the total number of logged dimensions, not only of the included ones
#included dimensions has to be a counting value, not the actual dimensionality
averageConvergence = function(allConvergence, includedFunctions, includedDimensions, nDimensions) {
  avgConvergence = numeric(nrow(allConvergence))
  tempConvergence = allConvergence[,-1]
  for (i in includedFunctions) {
    for (j in includedDimensions) {
      avgConvergence = avgConvergence + tempConvergence[,i*nDimensions-nDimensions+j]
    }
  }
  avgConvergence = avgConvergence / (length(includedFunctions)*length(includedDimensions))
  avgConvergence = cbind(allConvergence[,1], avgConvergence)
  return(avgConvergence)
}


#checks whether all required logs for the R file output_analysis.R exist
#' @export
checkLogCompleteness = function(usedFunctions = 1:24, usedDimensions = c(2, 5, 10, 20), nInstances = 15) {
  checkSuccessful = TRUE
  #get all directories in current working directory
  allDirs = dir()[file.info(dir())$isdir]
  allDirs = c(allDirs, paste("OCD_parametrization/", dir("./OCD_parametrization"), sep = ""))
  requiredDirs = c("CMAES_default_with_restart", "CMAES_OCD_no_restarts", "CMAES_only_default", "GA_default",
                   "GA_OCD", "OCD_disp", "OCD_disp_fit", "OCD_evo", "OCD_evo_disp", "OCD_evo_disp_fit", 
                   "OCD_evo_fit", "OCD_fit", "Random_Search_100000", "OCD_parametrization/OCD_RUN_0.01_10", 
                   "OCD_parametrization/OCD_RUN_0.01_100", "OCD_parametrization/OCD_RUN_0.01_1000", "OCD_parametrization/OCD_RUN_0.001_10",
                   "OCD_parametrization/OCD_RUN_0.001_100", "OCD_parametrization/OCD_RUN_0.001_1000", 
                   "OCD_parametrization/OCD_RUN_0.0001_10", "OCD_parametrization/OCD_RUN_0.0001_100", 
                   "OCD_parametrization/OCD_RUN_0.0001_1000", "OCD_parametrization/OCD_RUN_0.00001_10",
                   "OCD_parametrization/OCD_RUN_0.00001_100","OCD_parametrization/OCD_RUN_0.00001_1000",
                   "CMAES_default_with_restart2", 
                   "GA_default2", "OCD_evo_disp2", "GA_OCD2")
  pbar = makeProgressBar(min = 1, max = length(requiredDirs))
  for (i in 1:length(requiredDirs)) {
    if (length(grep(requiredDirs[i], allDirs)) == 0) {
      print(paste("Required directory", requiredDirs[i], "is missing."))
      checkSuccessful = FALSE
    }
  }
  #list all names of the algorithms, the dimensions and functions to check for names
  algorithmNames = c("cmaes", "CMAES_OCD", "GA", "random search")
  #match algorithm names to dirs
  dirAlgorithmMatch = c(1, 2, 1, 3, 3, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 3, 2, 3)
  #check if all required .txt files exist
  for (i in 1:length(requiredDirs)) {
    pbar$set(i)
    currentDir = requiredDirs[i]
    currentFiles = dir(currentDir)
    for (j in 1:length(usedFunctions)) {
      for (k in 1:length(usedDimensions)) {
        currentFile = paste(algorithmNames[dirAlgorithmMatch[i]], "_output_", usedFunctions[j], "_", usedDimensions[k], ".txt", 
                            sep = "")
        if (length(grep(currentFile, currentFiles, ignore.case = TRUE)) == 0) {
          print(paste("Required file", currentFile, "in directory", currentDir, "is missing"))
          checkSuccessful = FALSE
        }
      }
    }
  }
  #separate test for the restart run test
  if (length(grep("CMAES_restart_test", allDirs)) == 0) {
    print("Required directory CMAES_restart_test is missing.")
    checkSuccessful = FALSE
  }
  else {
    currentFiles = dir("CMAES_restart_test")
    #check the six restart-test logs (cmaes1 to cmaes6) in one loop
    for (i in 1:6) {
      currentFile = paste("cmaes", i, "_output_12_20.txt", sep = "")
      if (length(grep(currentFile, currentFiles)) == 0) {
        print(paste("Required file", currentFile, "in directory CMAES_restart_test is missing."))
        checkSuccessful = FALSE
      }
    }
  }
  
  if(checkSuccessful) print("Syntax check revealed no anomalies. Proceed to generate output")
  return(checkSuccessful)
}