# R/readFiles.R
#
# Defines functions: read_single_file_SOS, read_index_file__IOH,
# read_index_file__json, read_index_file, scan_index_file, limit.data
#
# Documented in: limit.data, read_index_file, scan_index_file

# sourceCpp('src/align.cc')
# sourceCpp('src/read.cc')

#' Reduce the size of the data set by evenly subsampling the records
#'
#' @param df The data to subsample
#' @param n The amount of samples
#' @return A smaller data.frame
limit.data <- function(df, n) {
  total <- nrow(df)
  # Nothing to do when the data already fits within the requested size
  if (total <= n) {
    return(df)
  }
  # Evenly spaced row indices; the first and last record are always retained
  keep <- unique(c(1, seq(1, total, length.out = n), total))
  df[keep, ]
}

#' Scan *.info files for IOHProfiler or COCO
#'
#' @param folder The folder containing the .info or .json files
#' @return The paths to all found .info and .json-files
#' @export
#' @note This automatically filters out files of size 0
#' @examples
#' path <- system.file("extdata", "ONE_PLUS_LAMDA_EA", package="IOHanalyzer")
#' scan_index_file(path)
scan_index_file <- function(folder) {
  folder <- trimws(folder)
  # `pattern` is a regex: escape the dot so that e.g. 'xinfo' does not match;
  # only genuine .info / .json files are returned
  files <- list.files(folder, pattern = '\\.(info|json)$',
                      recursive = TRUE, full.names = TRUE)
  # Empty files carry no usable data, drop them up front
  files[file.size(files) > 0]
}

#' Read .info files and extract information
#'
#' Dispatches on the file extension: .json files go to the json reader,
#' anything else is first tried as an IOHprofiler .info file with a
#' fallback to the single-objective COCO layout.
#'
#' @param fname The path to the .info file
#' @return The data contained in the .info file
#' @export
#' @examples
#' path <- system.file("extdata", "ONE_PLUS_LAMDA_EA", package="IOHanalyzer")
#' info <- read_index_file(file.path(path,"IOHprofiler_f1_i1.info"))
read_index_file <- function(fname) {
  format <- tools::file_ext(fname)
  if (format == 'json') {
    read_index_file__json(fname)
  } else {
    # Bug fix: the previous `finally = function(e) stop(...)` was a no-op --
    # `finally` takes an expression (not a handler), so the anonymous
    # function was merely created and discarded. On any warning or error
    # from the IOH reader we fall back to the COCO reader instead.
    tryCatch(
      read_index_file__IOH(fname),
      warning = function(e) read_index_file__COCO(fname),
      error = function(e) read_index_file__COCO(fname)
    )
  }
}

#' Read IOHprofiler-based .json files and extract information
#'
#' @param fname The path to the json info-file
#' @return The data contained in the json info-file: one list per scenario,
#'   combining experiment-level fields, per-run statistics and any declared
#'   run-level attributes
#' @noRd
read_index_file__json <- function(fname) {

  json_data <- fromJSON(file = fname)
  base_dir <- dirname(fname)
  exp_attrs <- sapply(json_data$experiment_attributes, function(x) {x})

  data <- list()
  # NOTE(review): the error handler only returns NULL from the handler
  # itself, it does not abort this function -- confirm this is intended.
  tryCatch({
    fid <- json_data$function_id
    fname <- json_data$function_name
    suite <- json_data$suite
    maximization <- json_data$maximization
    algid <- json_data$algorithm$name
    attributes <- json_data$attributes
    version <- json_data$version
  }, error = function(e) {return(NULL)})

  data <- lapply(json_data$scenarios, function(scenario) {

    run_attrs <- list()

    # Bug fix: run attributes are now stored as proper list elements.
    # Previously `attr(run_attrs, run_attr) <- ...` stored them as
    # attributes, which the `c()` below silently drops, and
    # `x$run_attr` looked up a literal field called "run_attr" instead
    # of the attribute named by the loop variable.
    for (run_attr in json_data$run_attributes) {
      run_attrs[[run_attr]] <- sapply(scenario$runs, function(x) x[[run_attr]])
    }

    datafile <- file.path(base_dir, scenario$path)

    temp <- c(list(
      funcId = fid,
      funcName = fname,
      suite = suite,
      maximization = maximization,
      algId = algid,
      DIM = scenario$dimension,
      attributes = attributes,
      version = version,
      datafile = datafile,
      instance = sapply(scenario$runs, function(x) x$instance),
      maxRT = sapply(scenario$runs, function(x) x$evals),
      finalFV = sapply(scenario$runs, function(x) x$best$y),
      final_pos = sapply(scenario$runs, function(x) x$best$x)
    ), run_attrs,
    exp_attrs)

  })
  data
}

#' Read IOHprofiler-based .info files and extract information
#'
#' The index file is consumed in chunks of three lines: a 'key = value'
#' header line, a free-form comment line, and a record line listing the
#' data file plus per-run statistics. One list per chunk is produced;
#' chunks referencing the same data file are merged via merge_indexinfo().
#'
#' @param fname The path to the .info file
#' @return The data contained in the .info file
#' @noRd
read_index_file__IOH <- function(fname) {
  f <- file(fname, 'r')
  path <- dirname(fname)  # data files are referenced relative to the index file
  data <- list()
  i <- 1

  while (TRUE) {
    # Each logical entry spans exactly three lines (header, comment, records)
    # TODO: remove suppressWarnings later
    lines <- suppressWarnings(readLines(f, n = 3))
    if (length(lines) == 0)
      break

    # Split the comma-separated header; read.csv also handles the quoting.
    # NOTE(review): read.csv expects `quote` to be a single string of quote
    # characters; passing c("\"","'") relies on how scan() handles a vector
    # here -- confirm both quote styles are actually honored.
    # TODO: make this quote symbol ' or " as the configurable parameter
    # TODO: Fix this
    name_value <- read.csv(text = lines[1], header = F, quote = c("\"","'")) %>%
      as.list %>%
      unlist %>%
      as.vector

    # Turn the 'key = value' fragments into a named list; values that parse
    # as numeric (or logical for maximization/constrained) are converted,
    # everything else stays a de-quoted string.
    header <- name_value %>%
      trimws %>% {
        regmatches(., regexpr("=", .), invert = T) # match the first appearance of '='
      } %>%
      unlist %>%
      trimws %>%
      matrix(nrow = 2) %>% {
        ans <- as.list(.[2, ])
        names(ans) <- .[1, ]
        for (name in .[1, ]) {
          value <- ans[[name]]
          ans[[name]] <- gsub("'", '', value)  # strip remaining single quotes

          if (name == 'maximization' || name == 'constrained')
            value <- as.logical(value)
          else
            value <- suppressWarnings(as.numeric(value)) # convert quoted numeric values to numeric

          if (!is.na(value)) ans[[name]] <- value  # keep the string when conversion failed
        }
        ans
      }

    # Record line format: 'datafile, instance:maxRT|finalFV, ...'
    record <- trimws(strsplit(lines[3], ',')[[1]])
    if (length(record) == 1){
      # No run records present; skip this chunk (all 3 lines were consumed)
      next
    }

    has_dynattr <- !is.null(header$dynamicAttribute)

    # TODO: this must also be removed...
    if (record[2] == "") {
      warning(sprintf('File %s is incomplete!', fname))
      # NOTE(review): when the file is incomplete AND a dynamicAttribute is
      # declared, `dynamic_attrs` is never defined and its use below errors.
      finalFVs <- NULL
      instances <- NULL
      maxRTs <- NULL
    } else {
      res <- matrix(unlist(strsplit(record[-1], ':')), nrow = 2)
      info <- matrix(unlist(strsplit(res[2, ], '\\|')), nrow = 2)
      #Check for incorrect usages of reset_problem and remove them
      # (runs with non-positive evaluation counts are dropped)
      maxRTs <- as.numeric(info[1,])
      idx_correct <- which(maxRTs > 0)
      if (has_dynattr){
        # finalFV and the dynamic attribute share a field, separated by ';'
        info_split <- strsplit(info[2,], ';')
        finalFVs <- as.numeric(sapply(info_split, `[[`, 1))[idx_correct]

        dynamic_attrs <- sapply(info_split, `[[`, 2)[idx_correct]
        # dynamic_attrs <- dynamic_attrs[idx_correct
      }
      else {
        finalFVs <- as.numeric(info[2,])[idx_correct]
      }
      instances <- as.numeric(res[1,])[idx_correct]
      maxRTs <- maxRTs[idx_correct]
    }

    # Normalize Windows-style path separators before joining
    record[1] <- gsub("\\\\", "/", record[1])
    datafile <- file.path(path, record[1])

    attr_list = list(
      comment = lines[2],
      datafile = datafile,
      instance = instances,
      maxRT = maxRTs,
      finalFV = finalFVs
    )



    if (has_dynattr){
      attr_list[[header$dynamicAttribute]] <- dynamic_attrs
    }

    # TODO: Make this code more readable
    data[[i]] <- c(
      header,
      attr_list
    )

    i <- i + 1
  }
  close(f)
  # Merge entries pointing at the same data file, preserving original order
  datafiles <- unlist(lapply(data, function(x) x$datafile))
  if (length(datafiles) > length(unique(datafiles)))
    return(merge_indexinfo(data))
  else
    return(data)
}

#' Process IOHprofiler-based .info files if they contain multiple references
#' to a single data-file
#'
#' This is needed to assure that the meta-information is concatenated properly
#' and no datafile is processed more often than necessary
#'
#' @param indexInfo The info-list to reduce
#' @return a reduced version of the provided indexInfo, preserving original order
#' @noRd
merge_indexinfo <- function(indexInfo) {
  files <- unlist(lapply(indexInfo, function(info) info$datafile))
  lapply(unique(files), function(current_file) {
    # All index entries that reference this particular data file
    selected <- indexInfo[files == current_file]
    total_runs <- length(unlist(lapply(selected, function(info) info$instance)))
    merged <- list()
    for (attr_name in names(selected[[1]])) {
      combined <- unlist(lapply(selected, function(info) info[[attr_name]]))
      # Per-run fields keep one entry per run; shared fields are deduplicated
      merged[[attr_name]] <- if (length(combined) == total_runs) {
        combined
      } else {
        unique(combined)
      }
    }
    merged
  })
}

#' Read single-objective COCO-based .info files and extract information
#'
#' Like the IOHprofiler reader, the file is consumed in chunks of three
#' lines (header, comment, record line); one list per chunk is returned.
#'
#' @param fname The path to the .info file
#' @return The data contained in the .info file
#' @noRd
read_index_file__COCO <- function(fname) {
  f <- file(fname, 'r')
  path <- dirname(fname)  # data files are referenced relative to the index file
  data <- list()
  i <- 1
  while (TRUE) {

    lines <- suppressWarnings(readLines(f, n = 3))  # read header and comments
    if (length(lines) < 3) {
      break
    }
    comment <- lines[2]
    name_value <- as.vector(unlist(as.list(read.csv(text = lines[1], header = F, quote = "'"))))

    # Parse 'key = value' pairs into a named list; numeric-looking values
    # are converted, everything else stays a de-quoted string.
    header <- trimws(name_value) %>% {
      regmatches(., regexpr("=", .), invert = T)  # match the first appearance of '='
    } %>%
      unlist %>%
      trimws %>%
      matrix(nrow = 2) %>% {
        ans <- as.list(.[2, ])
        names(ans) <- .[1, ]
        for (name in .[1, ]) {
          value <- ans[[name]]
          ans[[name]] <- gsub("'", '', value)
          value <- suppressWarnings(as.numeric(value)) # convert quoted numeric values to numeric
          if (!is.na(value))
            ans[[name]] <- value
        }
        ans
      }

    # COCO calls the algorithm field 'algorithm'; rename to the internal 'algId'
    names(header) <- gsub('algorithm', 'algId', names(header))

    record <- strsplit(lines[3], ',')[[1]] %>% trimws

    if (length(record) < 2) {
      warning(sprintf('File %s is incomplete!', fname))
      res <- NULL
      info <- NULL
    } else {
      # Remaining entries look like 'instance:maxRT|finalFV'
      res <- matrix(unlist(strsplit(record[-c(1)], ':')), nrow = 2)
      info <- matrix(as.numeric(unlist(strsplit(res[2, ], '\\|'))), nrow = 2)
    }

    # Normalize Windows-style path separators before joining
    record[1] <-  gsub("\\\\", "/", record[1])
    if ('folder' %in% names(header))
      datafile <- file.path(path, header$folder, record[1])
    else
      datafile <- file.path(path, record[1])


    # TODO: check the name of the attributes and fix them!
    data[[i]] <- c(
      header,
      list(
        comment = comment,
        datafile = datafile,
        instance = as.numeric(res[1, ]),
        maxRT = info[1, ],
        finalFV = info[2, ]
      )
    )
    i <- i + 1
  }
  close(f)
  data
}

#' Read bi-objective COCO-based .info files and extract information
#'
#' A single header/comment pair at the top of the file is followed by one
#' record line per (function, dimension) entry.
#'
#' @param fname The path to the .info file
#' @return The data contained in the .info file
#' @noRd
read_index_file__BIOBJ_COCO <- function(fname) {
  f <- file(fname, 'r')
  path <- dirname(fname)
  data <- list()
  i <- 1

  lines <- suppressWarnings(readLines(f, n = 2))  # read header and comments
  comment <- lines[2]
  name_value <- as.vector(unlist(as.list(read.csv(text = lines[1], header = F, quote = "'"))))

  # Parse 'key = value' pairs; numeric-looking values are converted
  header <- trimws(name_value) %>% {
      regmatches(., regexpr("=", .), invert = T)  # match the first appearance of '='
    } %>%
    unlist %>%
    trimws %>%
    matrix(nrow = 2) %>% {
      ans <- as.list(.[2, ])
      names(ans) <- .[1, ]
      for (name in .[1, ]) {
        value <- ans[[name]]
        ans[[name]] <- gsub("'", '', value)
        value <- suppressWarnings(as.numeric(value)) # convert quoted numeric values to numeric
        if (!is.na(value))
          ans[[name]] <- value
      }
      ans
    }

  # COCO calls the algorithm field 'algorithm'; rename to the internal 'algId'
  names(header) <- gsub('algorithm', 'algId', names(header))
  while (TRUE) {
    # TODO: remove suppressWarnings later
    lines <- suppressWarnings(readLines(f, n = 1))
    if (length(lines) == 0)
      break

    record <- strsplit(lines[1], ',')[[1]] %>% trimws

    # TODO: this must also be removed...
    if (record[4] == "") {
      warning(sprintf('File %s is incomplete!', fname))
      res <- NULL
      info <- NULL
    } else {
      # Entries after funcId/DIM/datafile look like 'instance:maxRT|finalFV'
      res <- matrix(unlist(strsplit(record[-c(1, 2, 3)], ':')), nrow = 2)
      info <- matrix(as.numeric(unlist(strsplit(res[2, ], '\\|'))), nrow = 2)
    }

    # Normalize Windows-style path separators before joining
    record[3] <-  gsub("\\\\", "/", record[3])
    if ('folder' %in% names(header))
      datafile <- file.path(path, header$folder, record[3])
    else
      datafile <- file.path(path, record[3])

    # Prefer the integer form of the function id when it parses cleanly and
    # lies in a sane range. (Bug fix: this previously read
    # `as.integer(funcId.int)`, referencing the not-yet-defined variable
    # instead of `funcId`, which raised an error at runtime.)
    funcId <- trimws(strsplit(record[1], '=')[[1]][2])
    funcId.int <- suppressWarnings(as.integer(funcId))
    if(!any(is.na(funcId.int))) {
      if(all((funcId.int >= 0L) & (funcId.int <= 1000000000L))) {
        funcId <- funcId.int
      }
    }

    DIM <- as.numeric(trimws(strsplit(record[2], '=')[[1]][2]))

    # TODO: check the name of the attributes and fix them!
    data[[i]] <- c(
      header,
      list(
        comment = comment,
        funcId = funcId,
        DIM = DIM,
        datafile = datafile,
        instance = as.numeric(res[1, ]),
        maxRT = info[1, ],
        finalFV = info[2, ]
      )
    )
    i <- i + 1
  }
  close(f)
  data
}

#' Check the format of data
#'
#' Throws a warning when multiple formats are found in the same folder.
#'
#' @param path The path to the folder to check
#' @return The format of the data in the given folder. Either 'COCO', 'IOHprofiler',
#' 'NEVERGRAD' or 'SOS'.
#' @export
#' @examples
#' path <- system.file("extdata", "ONE_PLUS_LAMDA_EA", package = "IOHanalyzer")
#' check_format(path)
check_format <- function(path) {
  # A direct file path is classified by its extension first
  if (sub('[^\\.]*\\.', '', basename(path), perl = T) == "csv")
    return(NEVERGRAD)

  if (sub('[^\\.]*\\.', '', basename(path), perl = T) == "rds")
    return("RDS")

  index_files <- scan_index_file(path)
  # No .info/.json index files at all: assume the SOS layout
  if (length(index_files) == 0)
    return(SOS)

  info <- unlist(lapply(index_files, read_index_file), recursive = F)
  # Index files that declare version >= 0.3.3 are unambiguously IOHprofiler
  if (all(unlist(lapply(info, function(x) {
      return (ifelse(is.null(x$version), F, (compareVersion(x$version, "0.3.3") >= 0)))
    })))) {
    return(IOHprofiler)
  }

  datafile <- sapply(info, function(item) item$datafile)

  # Otherwise sniff the first line of each referenced data file
  format <- lapply(datafile, function(file) {
    tryCatch({
      # Fall back to the .cdat/.tdat companion when the .dat itself is absent
      if (!file.exists(file)) {
        cdatfile <- stri_replace(file, ".cdat", fixed = ".dat")
        tdatfile <- stri_replace(file, ".tdat", fixed = ".dat")
        if (file.exists(cdatfile)) file <- cdatfile
        else file <- tdatfile
      }
      first_line <- scan(file, what = 'character', sep = '\n', n = 1, quiet = T)
    }, error = function(e) {
      stop("Error detecting data files specified in .info, please verify the
             integrity of the provided files.")
    })

    if (startsWith(first_line, '% function') || startsWith(first_line, '% f evaluations'))
      COCO
    else if (startsWith(first_line, '\"function')) {
      # IOHprofiler data with only two columns uses the reduced TWO_COL layout
      n_col <- ncol(fread(file, header = FALSE, sep = ' ',
                          colClasses = 'character', fill = T, nrows = 1))
      if (n_col == 2)
        TWO_COL
      else
        IOHprofiler
    }
    else if (first_line == '%')  # Bi-objective COCO format...
      BIBOJ_COCO
    else {
      stop("Error detecting file format of file ", file, "; Please verify
           the integrity of this file.")
    }
  }) %>%
    unlist %>%
    unique

  # Loose .csv / .txt files indicate additional NEVERGRAD / SOS data
  csv_files <- file.path(path, list.files(path, pattern = '.csv', recursive = T))
  if (length(csv_files) > 0)
    format <- c(format, NEVERGRAD)

  txt_files <- file.path(path, list.files(path, pattern = '.txt', recursive = T))
  if (length(txt_files) > 0)
    format <- c(format, SOS)

  # Mixed formats cannot be processed together
  if (length(format) > 1) {
    stop(
      paste(
        path,
        'contains multiple data formats. This is not allowed for data processing.
        Please check the returned dataframe for more information.'
      )
    )
  } else
    format
}

#' Read IOHProfiler *.dat files
#'
#' @param fname The path to the .dat file
#' @param subsampling Whether to subsample the data or not
#' @noRd
#' @return A list of data.frames, one per block of records
read_dat <- function(fname, subsampling = FALSE) {
  # TODO: use the same data loading method as in read_dat__COCO
  df <- fread(fname, header = FALSE, sep = ' ', colClasses = 'character', fill = T)
  colnames(df) <- as.character(df[1, ])
  # Header rows repeat inside the file; they are the rows whose first field
  # is not numeric
  idx <- which(!grepl('\\d+', df[[1]], perl = T))

  # Check for data consistency: keep only the columns present in every header
  header_len <- min(apply(df[idx, ] != "", 1, sum))
  idx <- c(idx, nrow(df) + 1)
  df <- df[, 1:header_len]

  # Coerce everything to numeric (header rows become NA). suppressWarnings()
  # keeps the coercion warnings quiet without clobbering the caller's global
  # `warn` option -- the previous options(warn = -1) / options(warn = 0)
  # pattern reset warn to 0 even if the caller had set it differently.
  # TODO: this operation is the bottleneck
  df <- suppressWarnings(sapply(df, function(c) {class(c) <- 'numeric'; c}))

  # Slice the table into one block per header row
  res <- lapply(seq(length(idx) - 1), function(i) {
    i1 <- idx[i] + 1
    i2 <- idx[i + 1] - 1
    ans <- df[i1:i2, ]
    if (i1 == i2)
      ans <- as.matrix(t(ans))  # keep single-row blocks as 1-row matrices

    # TODO: determine the number of records in the 'efficient mode'
    if (subsampling)
      limit.data(ans, n = 500)
    else
      ans
  })
  res
}

# TODO: this method is deprecated. Remove it later
# TODO: maybe not subsampling for COCO data
#' read COCO '.dat'-like file
#'
#' @param fname The path to the .dat file
#' @param subsampling Whether to subsample the data or not (currently ignored)
#' @noRd
#' @return A list of data.frames
read_dat__COCO_ <- function(fname, subsampling = FALSE) {
  # Delegates to the compiled reader: 7 data columns, '%' as comment marker.
  # NOTE(review): `subsampling` is accepted but never forwarded.
  c_read_dat(path.expand(fname), 7, '%')
}

#' read COCO '.dat'-like file directly in R
#'
#' @param fname The path to the .dat file
#' @param subsampling Whether to subsample the data or not (currently ignored)
#' @noRd
#' @return A list of matrices, one per block of records
read_dat__COCO <- function(fname, subsampling = FALSE) {
  select <- seq(5)  # only the first 5 columns carry data in this layout
  # read the file as a character vector (one string per row)
  X <- fread(fname, header = FALSE, sep = '\n', colClasses = 'character')[[1]]
  idx <- which(startsWith(X, '%'))  # '%' lines separate blocks
  X <- gsub('\\s+|\\t', ' ', X, perl = T)

  df <- fread(text = X[-idx], header = F, sep = ' ', select = select, fill = T)
  # Map each comment line's raw position onto the comment-stripped table:
  # the k-th comment at raw line idx[k] is followed by data starting at row
  # (idx[k] + 1) - k of df; nrow(df) closes the final block.
  # NOTE(review): the last block ends at nrow(df) - 1 (via `i2` below), so
  # the final data row appears to be dropped -- confirm this is intended.
  idx <- c((idx + 1) - seq_along(idx), nrow(df))

  lapply(seq(length(idx) - 1),
         function(i) {
           i1 <- idx[i]
           i2 <- idx[i + 1] - 1
           as.matrix(df[i1:i2, ])
         })
}

#' Read bi-objective COCO '.dat'/'.tdat' files
#'
#' @param fname The path to the .dat or .tdat file
#' @param subsampling Whether to subsample the data or not (currently ignored)
#' @noRd
#' @return A list of matrices, one per recorded block
read_dat__BIOBJ_COCO <- function(fname, subsampling = FALSE) {
  if (endsWith(fname, '.dat'))
    select <- seq(3)
  else if (endsWith(fname, '.tdat'))
    select <- seq(2)
  else
    # Bug fix: previously fell through with `select` undefined, producing an
    # obscure downstream error from fread; fail fast with a clear message.
    stop("Unsupported file extension for bi-objective COCO data: ", fname)

  # read the file as a character vector (one string per row)
  X <- fread(fname, header = FALSE, sep = '\n', colClasses = 'character')[[1]]
  idx <- which(startsWith(X, '%'))
  X <- gsub('\\s+|\\t', ' ', X, perl = T)

  df <- fread(text = X[-idx], header = F, sep = ' ', select = select, fill = T)
  # Each '% function' comment starts a new block; every block is preceded by
  # 4 comment lines, hence the `* 4` offset when mapping raw-line positions
  # onto the comment-stripped table.
  idx <- which(startsWith(X, '% function'))
  idx <- c((idx + 1) - seq_along(idx) * 4, nrow(df))

  lapply(seq(length(idx) - 1), function(i) {
    i1 <- idx[i]
    i2 <- idx[i + 1] - 1
    as.matrix(df[i1:i2, ])
  })
}

# Global defaults for the alignment functions below:
#   idxEvals      -- column holding the evaluation counter
#   idxTarget     -- column holding the function/target value
#   n_data_column -- number of fixed data columns before any recorded parameters
idxEvals <- 1
idxTarget <- 3
n_data_column <- 5

# Test whether every run's evaluation counter forms the contiguous sequence
# 1..N, i.e. the first entry is 1 and the last entry equals the row count.
check_contiguous <- function(data) {
  flags <- sapply(data,
                  function(run) {
                    evals <- run[, idxEvals]
                    n <- length(evals)
                    evals[1] == 1 && evals[n] == n
                  })
  all(flags)
}

# Align contiguous runs on a common evaluation grid by padding each run's
# selected column with its last observed value up to length(rownames).
align_contiguous <- function(data, idx, rownames) {
  n_runs <- length(data)
  n_rows <- length(rownames)
  columns <- lapply(data,
                    function(run) {
                      values <- run[, idx]
                      pad <- n_rows - length(values)
                      if (pad > 0) {
                        # Carry the last observation forward for shorter runs
                        values <- c(values, rep(values[length(values)], pad))
                      }
                      values
                    })
  aligned <- matrix(unlist(columns), nrow = n_rows, ncol = n_runs)
  rownames(aligned) <- rownames
  aligned
}

# Align non-contiguous runs on a common evaluation grid via the compiled
# imputation routine (one column per run).
align_non_contiguous <- function(data, idx, rownames) {
  n_runs <- length(data)
  n_rows <- length(rownames)
  columns <- lapply(data,
                    function(run) {
                      evals <- run[, idxEvals]
                      # NA evaluation counts (no improvements in a run) would
                      # break the imputation; treat them as 'never reached'
                      evals[is.na(evals)] <- Inf
                      c_impute(run[, idx], evals, rownames)
                    })
  aligned <- matrix(unlist(columns), nrow = n_rows, ncol = n_runs)
  rownames(aligned) <- rownames
  aligned
}

#' Align data by runtimes
#' @param data The data to align
#' @param format Whether the data is from IOHprofiler or COCO
#' @param include_param Whether to include the recorded parameters in the alignment
#' @param maximization Whether the objective is maximized (controls target sort order)
#' @noRd
#' @return Data aligned by the running time
align_running_time <- function(data, format = IOHprofiler, include_param = TRUE,
                               maximization = TRUE) {
  # Pick the column layout per format; these assignments shadow the
  # file-level defaults (idxTarget / n_data_column) locally.
  # NOTE(review): the IOHprofiler and COCO branches set the same idxTarget;
  # confirm whether the duplication is intentional.
  if (format == IOHprofiler)
    idxTarget <- 3
  else if (format == COCO)
    idxTarget <- 3
  else if (format == BIBOJ_COCO) {
    n_data_column <- 3
    idxTarget <- 2
  }
  else if (format == TWO_COL) {
    n_data_column <- 2
    idxTarget <- 2
  }

  # All distinct target values across runs, sorted best-first
  FV <- sort(unique(unlist(lapply(data, function(x) x[, idxTarget]))),
             decreasing = !maximization)
  n_column <- unique(sapply(data, ncol))

  if (format == COCO) {
    n_param <- 0
    idxValue <- idxEvals
    param_names <- NULL
  }
  else if (format == IOHprofiler) {
    # Columns beyond the fixed data columns are recorded parameters
    n_param <- n_column - n_data_column
    if (include_param && n_param > 0) {
      param_names <- colnames(data[[1]])[(n_data_column + 1):n_column]
      idxValue <- c(idxEvals, (n_data_column + 1):n_column)
    }
    else {
      param_names <- NULL
      idxValue <- idxEvals
    }
  }
  else {
    param_names <- NULL
    idxValue <- idxEvals
  }

  # The compiled routine expects 0-based column indices
  res <- c_align_running_time(data, FV, idxValue - 1, maximization, idxTarget - 1)
  names(res) <- c('RT', param_names)
  res
}

#' Align data by function values
#' @param data The data to align
#' @param format Whether the data is from IOHprofiler or COCO.
#' @param include_param Whether to include the recorded parameters in the alignment
#' @noRd
#' @return Data aligned by the function value
align_function_value <- function(data, include_param = TRUE, format = IOHprofiler) {
  n_column <- unique(sapply(data, ncol))
  stopifnot(length(n_column) == 1)  # all runs must share the same column layout

  # Per-format column layout; assignments shadow the file-level defaults.
  if (format == COCO) {
    idxTarget <- 3
    n_param <- 0
  }
  else if (format == IOHprofiler) {
    idxTarget <- 3
    n_param <- n_column - n_data_column
  }
  else if (format == BIBOJ_COCO) {  # bi-objective COCO format
    idxTarget <- 2
    n_data_column <- 2
    n_param <- 0                   # no parameter is allowed in this case
  }
  else if (format == TWO_COL) {
    # NOTE(review): n_data_column is not set here, so the file-level default
    # (5) applies below; harmless since n_param stays 0 -- confirm.
    idxTarget <- 2
    n_param <- 0
  }

  # Contiguous runs (evaluations 1..N) can be aligned by simple padding;
  # otherwise align on the union of all observed evaluation counts.
  if (check_contiguous(data)) {
    nrow <- sapply(data, nrow) %>% max
    runtime <- seq(nrow)
    align_func <- align_contiguous
  } else {
    runtime <- sort(unique(unlist(lapply(data, function(x) x[, idxEvals]))))
    nrow <- length(runtime)
    align_func <- align_non_contiguous
  }

  FV <- align_func(data, idxTarget, runtime)
  include_param <- include_param && (n_param > 0)

  # Align each recorded parameter column on the same runtime grid
  if (include_param) {
    param_names <- colnames(data[[1]])[(n_data_column + 1):n_column]
    param <- list()
    for (i in seq(n_param)) {
      name <- param_names[i]
      param[[name]] <- align_func(data, i + n_data_column, runtime)
    }
  }

  if (include_param) {
    c(list(FV = FV), param)
  } else {
    list(FV = FV)
  }
}


#' Read Nevergrad data
#'
#' Read .csv files in nevergrad format and extract information as a DataSetList
#'
#' @param path The path to the .csv file
#' @return The DataSetList extracted from the .csv file provided
#' @noRd
read_nevergrad <- function(path){
  dt <- fread(path)

  # Older exports use 'function_class' instead of 'name'
  if (!'name' %in% colnames(dt)) {
    dt[, name := function_class]
  }

  # Every (optimizer, dimension, function) triplet becomes one DataSet
  triplets <- unique(dt[, .(optimizer_name, dimension, name)])
  algIds <- unique(triplets$optimizer_name)
  DIMs <- unique(triplets$dimension)
  funcIds <- unique(triplets$name)

  res <- list()

  idx <- 1

  for (i in seq(nrow(triplets))) {
    algId <- triplets$optimizer_name[i]
    DIM <- triplets$dimension[i]
    funcId <- triplets$name[i]

    # Normalize the rescaling column: 'transform' is an older alias, and
    # the column may be missing entirely.
    # NOTE(review): `rescale_name` is assigned but never used.
    rescale_name <- 'rescale'
    if ( !('rescale' %in% colnames(dt))) {
      if ( 'transform' %in% colnames(dt))
        colnames(dt)[colnames(dt) == "transform"] <- "rescale"
      else{
        dt$rescale <- NA
      }
    }

    data <- dt[optimizer_name == algId & dimension == DIM & name == funcId,
               .(budget, loss, rescale)]

    # Rescaled and plain runs of the same function become separate DataSets
    # (the rescaled one gets a '_rescaled' suffix on its funcId)
    for (scaled in unique(data$rescale)) {
      if (!is.na(scaled)) {
        data_reduced <- data[rescale == scaled, .(budget, loss)]
      }
      else {
        data_reduced <- data[is.na(rescale), .(budget, loss)]
      }

      if (!is.na(scaled) && scaled) {
        funcId_name <- paste0(funcId, '_rescaled')
      }
      else {
        funcId_name <- funcId
      }

      # Build the FV matrix: one row per budget, one column per repetition
      rows <- unique(data_reduced$budget) %>% sort
      FV <- lapply(rows,
             function(b) {
               data_reduced[budget == b, loss]
             }
          ) %>%
        do.call(rbind, .) %>%
        set_rownames(rows)

      RT <- list()  # nevergrad exports carry no runtime alignment data

      ds <-  structure(list(RT = RT, FV = FV),
                       class = c('DataSet', 'list'),
                       maxRT = max(rows),
                       finalFV = min(FV),
                       format = 'NEVERGRAD',
                       maximization = FALSE,
                       algId = algId,
                       funcId = funcId_name,
                       DIM = DIM)
      res[[idx]] <- ds
      idx <- idx + 1
    }
  }
  class(res) %<>% c('DataSetList')
  attr(res, 'DIM') <- DIMs
  attr(res, 'funcId') <- funcIds
  attr(res, 'algId') <- algIds
  attr(res, 'suite') <- 'NEVERGRAD'
  attr(res, 'maximization') <- F
  res

}

#' Read single DataSet of SOS-based data
#'
#' Read single .txt files in SOS format and extract information as a DataSet
#'
#' @param file The path to the .txt file
#' @return The DataSet extracted from the .txt file provided
#' @noRd
read_single_file_SOS <- function(file) {
  V1 <- NULL #Local binding to remove CRAN warnings

  # Algorithm name = file name up to (excluding) the last 'D',
  # e.g. 'myAlgD30...' -> 'myAlg'
  algId <- substr(basename(file), 1,  stringi::stri_locate_last(basename(file), fixed = 'D')[[1]] - 1)

  dt <- fread(file, header = F)
  # First line is a whitespace-separated header: '<marker> key1 val1 key2 val2 ...'
  header <- scan(file, what = 'character', sep = '\n', n = 1, quiet = T)
  splitted <- header %>% trimws %>% strsplit("\\s+") %>% .[[1]] %>% .[2:length(.)]
  info <- list(algId = algId)
  # Collect key/value pairs, renaming to the internal attribute names
  for (i in seq_len(length(splitted) / 2)) {
    temp <- splitted[[2*i]]
    name <- splitted[[2*i - 1]]
    if (name == 'function') name <- 'funcId'
    if (name == 'dim') name <- 'DIM'
    names(temp) <- name
    info <- c(info, temp)
  }

  dim <- as.numeric(info$DIM)
  #Hardcoded fix for SB-related data
  if (is.null(dim) || length(dim) == 0) {
    warning("Dimension not explicitly defined, setting as 30 by default")
    dim <- 30
    info$DIM <- dim
  }

  # NOTE(review): RT takes column (ncol - dim - 1) with names from column
  # (ncol - dim - 2), while FV does the exact reverse -- confirm this
  # asymmetry is intentional and not a copy/paste slip.
  RT_raw <- dt[[colnames(dt)[[ncol(dt) - dim - 1]]]]
  names(RT_raw) <- dt[[colnames(dt)[[ncol(dt) - dim - 2]]]]
  RT <- as.matrix(RT_raw)
  mode(RT) <- 'integer'

  FV_raw <- dt[[colnames(dt)[[ncol(dt) - dim - 2]]]]
  names(FV_raw) <- dt[[colnames(dt)[[ncol(dt) - dim - 1]]]]
  FV <- as.matrix(FV_raw)


  # The trailing `dim` columns hold the candidate position
  pos <- dt[, (ncol(dt) - dim + 1):ncol(dt)]
  colnames(pos) <- as.character(seq_len(dim))

  maxRT <- max(RT)
  finalFV <- min(FV)

  # Reconstruct the final population: V1 holds candidate ids, V6 the ids of
  # replaced candidates.
  # NOTE(review): 'V6' is hard-coded -- confirm it is stable across SOS files.
  idxs_avail <- dt[['V1']]
  idxs_replaced <- dt[['V6']]

  idxs_final <- setdiff(idxs_avail, idxs_replaced)

  # Best surviving candidate supplies the final position
  idx_final_best <- idxs_final[[which.min(FV[idxs_final])]]
  final_pos <- as.numeric(pos[idx_final_best, ])
  # if (sum(FV == finalFV) > 1) {
  #   #Reconstruct population to determine which best solution is final position
  #   ids_min <- dt[FV_raw == finalFV, V1]
  #   replaced_idxs <- dt[[colnames(dt)[[ncol(dt) - dim]]]]
  #   #If none, take the last one added
  #   pos_idx <- max(ids_min)
  #   for (i in ids_min) {
  #     if (all(replaced_idxs != i)) {
  #       #If multiple, take the first one added
  #       pos_idx <- i
  #       break
  #     }
  #   }
  #   final_pos <- as.numeric(pos[pos_idx, ])
  # }
  # else {
  #   final_pos <- as.numeric(pos[which.min(FV), ])
  # }

  PAR <- list(
    # 'position' = list(pos),
    'final_position' = list(final_pos),
    'by_FV' = NULL,
    'by_RT' = NULL
  )



  # Assemble the DataSet object; header key/value pairs become attributes
  object <- list()
  class(object) <- c('DataSet', class(object))
  object$RT <- RT
  object$FV <- FV
  object$PAR <- PAR
  attr(object, 'maxRT') <- maxRT
  attr(object, 'finalFV') <- finalFV
  attr(object, 'format') <- "SOS"
  attr(object, 'maximization') <- F
  attr(object, 'suite') <- "SOS"
  for (i in seq_along(info)) {
    attr(object, names(info)[[i]]) <- type.convert(info[[i]], as.is = T)
  }
  attr(object, 'ID') <- attr(object, 'algId')
  object
}


#' Read DataSetList of SOS-based data
#'
#' Read directory containing .txt files in SOS format and extract information as a DataSetList
#'
#' @param dir The path to the directory file
#' @param corrections_files Files containing boundary-correction ratios for the files in `dir`
#' @return The DataSetList extracted from the directory provided
#' @noRd
read_datasetlist_SOS <- function(dir, corrections_files = NULL) {
  V1 <- V3 <- V4 <- NULL #Local binding to remove CRAN warnings
  res <- list()
  dims <- list()
  funcIds <- list()
  algIds <- list()
  suites <- list()
  maximizations <- list()

  idx <- 1

  corrs <- as.data.table(rbindlist(lapply(corrections_files, fread)))

  # Bug fix: '\\.txt$' (escaped dot, anchored) instead of the glob-style
  # '*.txt', which as a regex also matched names merely containing 'txt'.
  for (f in list.files(dir, recursive = TRUE, pattern = "\\.txt$", full.names = TRUE)) {
    if (f %in% corrections_files) next  # corrections files are not data sets
    ds <- read_single_file_SOS(f)

    dims[[idx]] <- attr(ds, 'DIM')
    funcIds[[idx]] <- attr(ds, 'funcId')
    algIds[[idx]] <- attr(ds, 'algId')
    suites[[idx]] <- attr(ds, 'suite')
    maximizations[[idx]] <- attr(ds, 'maximization')

    # Attach the boundary-correction ratio recorded for this file, if any
    if (nrow(corrs) > 0) {
      fn <- substr(basename(f), 1, nchar(basename(f)) - 4)
      corr_opts <- corrs[V1 == fn, ]
      if (stri_detect_fixed(fn, "DE")) {
        # DE runs are additionally keyed by their F (V3) and CR (V4) values
        corr <- corr_opts[V3 == attr(ds, 'F'), ][V4 == attr(ds, 'CR'), 'V2'][['V2']]
      }
      else if (stri_detect_fixed(fn, "RIS")) {
        corr <- corr_opts[['V2']]
      }
      else {
        warning("Unknown algorithm, so skipping lookup of boundary corrections ratio")
        corr <- NULL
      }
      if (length(corr) == 1)
        ds$PAR$'corrections' <- corr[[1]]
      else
        warning(paste0("No boundary corrections ratio found for ", fn))
    }

    res[[idx]] <- ds
    idx <- idx + 1
  }
  class(res) %<>% c('DataSetList')
  attr(res, 'DIM') <- dims
  attr(res, 'funcId') <- funcIds
  attr(res, 'algId') <- algIds
  attr(res, 'ID_attributes') <- c('algId')

  suite <- unique(suites)
  maximization <- unique(maximizations)
  if (length(suite) != 1 || length(maximization) != 1) {
    warning("Multiple different suites detected!")
  }

  attr(res, 'suite') <- suite
  attr(res, 'maximization') <- maximization
  # Bug fix: a dangling `res` statement used to precede this call (dead
  # code); the cleaned list is the actual return value.
  clean_DataSetList(res)
}

#' Find corrections-files in SOS-based folder
#'
#' Read directory containing .txt files in SOS format and extract the corrections-files
#'
#' @param path The path to the directory file
#' @return The relative paths to the correction files
#' @noRd
locate_corrections_files <- function(path) {
  # Bug fix: `pattern` is a regular expression, so escape the dot and anchor
  # at the end. The previous glob-style '*.txt' also matched names that
  # merely contained 'txt'.
  files <- list.files(path, recursive = TRUE, pattern = "\\.txt$", full.names = TRUE)
  # Correction files are identified by 'corrections' in their path
  files[stri_detect_fixed(files, 'corrections')]
}

#' Read DataSetList of OPTION-based data
#'
#' Processes the data.table object created from the OPTION response into a DataSetList object
#'
#' @param dt The data.table object created from the OPTION request
#' @param source The type of data which is loaded, currently either BBOB or Nevergrad
#' @param ... Additional parameters to add to each DataSet object (e.g. function suite of nevergrad data)
#'
#' @return The DataSetList extracted from the data.table provided
#' @noRd
convert_from_OPTION <- function(dt, source, ...) {
  #' Convert a raw OPTION-format data.table into a DataSetList.
  #'
  #' `dt` is expected to hold one row per recorded evaluation, with columns
  #' algorithm_name, dimensionality, benchmark_problem, precision_value and,
  #' depending on `source`, either (instance_id, num_experiment_run,
  #' num_function_run) for "BBOB" or (elapsed_budget, rotated, noise_level)
  #' for Nevergrad-style data.
  #'
  #' @param dt The data.table to convert
  #' @param source Either "BBOB" or a Nevergrad-style source identifier
  #' @return A cleaned DataSetList (minimization assumed throughout)
  #Initialize variables used in data.table to avoid CRAN-check notes.
  algorithm_name <- dimensionality <- benchmark_problem <- NULL
  instance_id <- num_experiment_run <- num_function_run <- NULL
  precision_value <- elapsed_budget <- rotated <- noise_level <- NULL

  triplets <- unique(dt[, .(algorithm_name, dimensionality, benchmark_problem)])
  algIds <- list()
  DIMs <- list()
  funcIds <- list()

  res <- list()

  idx <- 1

  for (i in seq_len(nrow(triplets))) {
    algId <- triplets$algorithm_name[i]
    DIM <- as.numeric(triplets$dimensionality[i])
    funcId <- triplets$benchmark_problem[i]

    if (source == "BBOB") {
      data <- dt[algorithm_name == algId & dimensionality == DIM & benchmark_problem == funcId,
                 .(instance_id, num_experiment_run, num_function_run, precision_value)]

      # BBOB function ids are stored as e.g. "f12"; strip the leading 'f'.
      funcId_no_f <- as.numeric(stri_sub(funcId, 2))

      for (iid in unique(data$instance_id)) {
        for (rep in unique(data$num_experiment_run)) {
          data_reduced <- data[instance_id == iid & num_experiment_run == rep,
                               .(num_function_run = as.numeric(num_function_run),
                                 precision_value =  as.numeric(precision_value))]

          rows <- unique(data_reduced$num_function_run) %>% sort
          # One row of FV per observed budget, with the budgets as row names.
          FV <- lapply(rows,
                       function(b) {
                         data_reduced[num_function_run == b, precision_value]
                       }
          ) %>%
            do.call(rbind, .) %>%
            set_rownames(rows)

          data_twocol <- as.matrix(data_reduced[order(num_function_run)])
          RT <- align_running_time(list(data_twocol), TWO_COL, maximization = FALSE,
                                   include_param = FALSE)

          ds <-  structure(list(RT = RT$RT, FV = FV),
                           class = c('DataSet', 'list'),
                           maxRT = max(rows),
                           finalFV = min(FV),
                           format = 'OPTION',
                           suite = COCO,
                           maximization = FALSE,
                           algId = algId,
                           instance = iid,
                           funcId = funcId_no_f,
                           DIM = DIM,
                           ID = algId)
          res[[idx]] <- ds
          idx <- idx + 1
          algIds <- c(algIds, algId)
          funcIds <- c(funcIds, funcId_no_f)
          DIMs <- c(DIMs, DIM)
        }
      }
    }
    else {
      data <- dt[algorithm_name == algId & dimensionality == DIM & benchmark_problem == funcId,
                 .(elapsed_budget, precision_value, rotated, noise_level)]

      for (rotation in unique(data$rotated)) {
        # NOTE: the loop variable must NOT be named 'noise_level' — inside the
        # data.table filter the column of that name masks the loop variable,
        # turning the condition into a column-vs-itself comparison that never
        # restricts by noise level.
        for (nl in unique(data$noise_level)) {
          data_reduced <- data[rotated == rotation & noise_level == nl,
                               .(num_function_run = as.numeric(elapsed_budget),
                                 precision_value =  as.numeric(precision_value))]

          rows <- unique(data_reduced$num_function_run) %>% sort
          FV <- lapply(rows,
                       function(b) {
                         data_reduced[num_function_run == b, precision_value]
                       }
          ) %>%
            do.call(rbind, .) %>%
            set_rownames(rows)

          data_twocol <- as.matrix(data_reduced[order(num_function_run)])
          RT <- align_running_time(list(data_twocol), TWO_COL, maximization = FALSE)

          ds <-  structure(list(RT = RT$RT, FV = FV),
                           class = c('DataSet', 'list'),
                           maxRT = max(rows),
                           finalFV = min(FV),
                           format = 'OPTION',
                           suite = NEVERGRAD,
                           maximization = FALSE,
                           algId = algId,
                           funcId = as.character(funcId),
                           DIM = DIM,
                           ID = algId,
                           rotated = rotation,
                           noise_level = nl)
          res[[idx]] <- ds
          idx <- idx + 1
          algIds <- c(algIds, algId)
          funcIds <- c(funcIds, as.character(funcId))
          DIMs <- c(DIMs, DIM)
        }
      }
    }
  }
  class(res) %<>% c('DataSetList')
  attr(res, 'DIM') <- DIMs
  attr(res, 'funcId') <- funcIds
  attr(res, 'algId') <- algIds
  #To be enabled when merged with version 1.6
  # attr(res, 'ID') <- algIds
  # attr(res, 'ID_attributes') <- c('algId')
  attr(res, 'suite') <- source
  attr(res, 'maximization') <- FALSE
  clean_DataSetList(res)
}


#' Read .csv files in arbitrary format
#'
#' Reads a flat .csv file and converts it into a DataSetList, one DataSet per
#' (function, dimension, algorithm) combination found in the file.
#'
#' @param path The path to the .csv file
#' @param neval_name The name of the column to use for the evaluation count.
#' If NULL, will be assumed to be sequential
#' @param fval_name The name of the column to use for the function values
#' @param fname_name The name of the column to use for the function name
#' @param algname_name The name of the column to use for the algorithm name
#' @param dim_name The name of the column to use for the dimension
#' @param run_name The name of the column to use for the run number
#' @param maximization Boolean indicating whether the data is resulting from maximization or minimization
#' @param static_attrs Named list containing the static values for missing columns.
#' When a parameter is not present in the csv file, its name-parameter should
#' be set to NULL, and the static value should be added to this static_attrs list.
#'
#' @return The DataSetList extracted from the .csv file provided
#' @export
read_pure_csv <- function(path, neval_name, fval_name, fname_name,
                          algname_name, dim_name, run_name, maximization = FALSE,
                          static_attrs = NULL){
  fname <- algname <- neval <- fval <- NULL #Avoid CRAN-check notes for data.table columns
  dt <- fread(path)

  # When no evaluation-count column is given, evaluation numbers are imputed
  # later as the row order within each run.
  impute_evalnrs <- is.null(neval_name)

  if (!fval_name %in% colnames(dt)) {
    warning(paste0("The function value column named ", fval_name, " does not exist
            in the provided file!"))
    return(NULL)
  }

  # Validate the remaining user-specified column names up front (NULL entries
  # drop out of c()); otherwise a typo would silently leave a column unrenamed
  # and corrupt the grouping below.
  specified <- c(neval_name, run_name, fname_name, algname_name, dim_name)
  missing_cols <- specified[!specified %in% colnames(dt)]
  if (length(missing_cols) > 0) {
    warning(paste0("The following specified columns do not exist in the provided file: ",
                   paste(missing_cols, collapse = ", ")))
    return(NULL)
  }

  colnames(dt)[colnames(dt) == fval_name] <- "fval"

  if (!impute_evalnrs) {
    colnames(dt)[colnames(dt) == neval_name] <- "neval"
  }

  # For each optional column: rename it when present, otherwise fall back to
  # a constant taken from `static_attrs` (run defaults to 1).
  if (is.null(run_name)) {
    dt$run <- 1
  } else {
    colnames(dt)[colnames(dt) == run_name] <- "run"
  }

  if (is.null(fname_name)) {
    dt$fname <- static_attrs$fname
  } else {
    colnames(dt)[colnames(dt) == fname_name] <- "fname"
  }

  if (is.null(algname_name)) {
    dt$algname <- static_attrs$algname
  } else {
    colnames(dt)[colnames(dt) == algname_name] <- "algname"
  }

  if (is.null(dim_name)) {
    dt$dim <- static_attrs$dim
  } else {
    colnames(dt)[colnames(dt) == dim_name] <- "dim"
  }

  triplets <- unique(dt[, .(fname, dim, algname)])

  algIds <- unique(triplets$algname)
  DIMs <- unique(triplets$dim)
  funcIds <- unique(triplets$fname)

  res <- list()

  idx <- 1

  for (i in seq_len(nrow(triplets))) {
    algId <- triplets$algname[i]
    DIM <- triplets$dim[i]
    funcId <- triplets$fname[i]

    data <- dt[algname == algId & dim == DIM & fname == funcId,]

    if (impute_evalnrs) {
      # Number the evaluations 1..k within each run, in file order.
      data$neval <- ave(data$fval, data$run, FUN = seq_along)
    }

    # One column per run, indexed by evaluation number.
    dt_for_allign <- dcast(data, neval ~ run, value.var = 'fval')

    FV_mat <- as.matrix(dt_for_allign[,2:ncol(dt_for_allign)])
    runtimes <- dt_for_allign$neval

    # Running best-so-far per run; NA gaps are filled with the run's best value.
    if (maximization) {
      FV <- do.call(cbind, lapply(seq(ncol(FV_mat)), function(x) cummax(FV_mat[,x])))
      FV <- apply(FV, 2, function(x) {x[is.na(x)] <- max(x, na.rm = TRUE); x})
    }
    else {
      FV <- do.call(cbind, lapply(seq(ncol(FV_mat)), function(x) cummin(FV_mat[,x])))
      FV <- apply(FV, 2, function(x) {x[is.na(x)] <- min(x, na.rm = TRUE); x})
    }
    rownames(FV) <- runtimes

    FV_temp <- sort(unique(FV_mat), decreasing = !maximization)
    index <- as.numeric(runtimes)
    RT <- c_align_running_time_matrix(FV_mat, FV_temp, as.numeric(index), maximization)
    rownames(RT) <- FV_temp
    RT[RT < 1] <- NA #Avoids weird values from impossible imputes at the end

    # NOTE(review): finalFV uses min(FV) even under maximization — confirm
    # this is the intended convention for the 'finalFV' attribute.
    ds <-  structure(list(RT = RT, FV = FV),
                     class = c('DataSet', 'list'),
                     maxRT = max(runtimes),
                     finalFV = min(FV),
                     format = 'Custom',
                     maximization = maximization,
                     algId = algId,
                     funcId = funcId,
                     DIM = DIM,
                     ID = algId)
    res[[idx]] <- ds
    idx <- idx + 1
  }
  class(res) %<>% c('DataSetList')
  attr(res, 'DIM') <- DIMs
  attr(res, 'funcId') <- funcIds
  attr(res, 'algId') <- algIds
  attr(res, 'suite') <- 'Custom'
  attr(res, 'maximization') <- maximization
  res
}

#' Read IOHprofiler (version 1+) data files
#'
#' Parses a single IOHprofiler .dat-style data file (whitespace-separated,
#' one header line per run) into a DataSet.
#'
#' @param info A List containing all meta-data about the dataset to create
#' @param full_sampling Logical. Whether the raw (unaligned) FV matrix should be stored.
#' Currently only useful when a correlation plot between function values and parameters should be made
#'
#' @return The DataSet extracted from the data file referenced in `info`
#' @export
read_IOH_v1plus <- function(info, full_sampling = FALSE){

  df <- fread(info$datafile, header = FALSE, sep = ' ', colClasses = 'character', fill = TRUE)
  colnames(df) <- as.character(df[1, ])
  # Header lines are those whose first field is not numeric; each marks the
  # start of a new run.
  idx <- which(!grepl('\\d+', df[[1]], perl = TRUE))

  # check for data consistency: keep only the columns present in every header
  header_len <- min(apply(df[idx, ] != "", 1, sum))
  idx <- c(idx, nrow(df) + 1)
  df <- df[, 1:header_len]

  # Coerce all columns to numeric; the repeated header rows become NA.
  # Suppress the coercion warnings locally rather than toggling the global
  # 'warn' option, which would clobber a user-set value.
  # TODO: this operation is the bottleneck
  df <- suppressWarnings(sapply(df, function(c) {class(c) <- 'numeric'; c}))

  # Stack the per-run segments (between consecutive header rows) into one
  # long table with a run number column.
  data <- rbindlist(lapply(seq_len(length(idx) - 1), function(i) {
    i1 <- idx[i] + 1
    i2 <- idx[i + 1] - 1
    ans <- df[i1:i2, ]
    if (i1 == i2)
      ans <- as.matrix(t(ans))   # single-row segment: keep it a 1-row matrix
    data.table(ans, runnr = i)
  }))

  runnr <- evaluations <- raw_y <- NULL #Avoid CRAN-check notes for data.table columns

  # When the file has no 'evaluations' column, evaluation numbers are imputed
  # as the row order within each run.
  impute_evalnrs <- !('evaluations' %in% info$attributes)

  algId <- info$algId
  DIM <- info$dim
  funcId <- attr(info, getOption('IOHanalyzer.function_representation', 'funcId'))

  if (impute_evalnrs) {
    data$evaluations <- ave(data$raw_y, data$runnr, FUN = seq_along)
  }

  # One column per run, indexed by evaluation number.
  dt_for_allign <- dcast(data, evaluations ~ runnr, value.var = 'raw_y')

  FV_mat <- as.matrix(dt_for_allign[,2:ncol(dt_for_allign)])
  runtimes <- dt_for_allign$evaluations

  # Fill NA gaps with the run's worst value, then take the running best-so-far.
  if (info$maximization) {
    FV_mat <- apply(FV_mat, 2, function(x) {x[is.na(x)] <- min(x, na.rm = TRUE); x})
    FV <- do.call(cbind, lapply(seq(ncol(FV_mat)), function(x) cummax(FV_mat[,x])))
  }
  else {
    FV_mat <- apply(FV_mat, 2, function(x) {x[is.na(x)] <- max(x, na.rm = TRUE); x})
    FV <- do.call(cbind, lapply(seq(ncol(FV_mat)), function(x) cummin(FV_mat[,x])))
  }
  rownames(FV) <- runtimes

  FV_temp <- unique(sort(FV_mat, decreasing = !info$maximization))
  index <- as.numeric(runtimes)
  RT <- c_align_running_time_matrix(FV_mat, FV_temp, as.numeric(index), info$maximization)
  rownames(RT) <- FV_temp
  RT[RT < 1] <- NA #Avoids weird values from impossible imputes at the end

  # Every recorded attribute that is not the evaluation count or the raw
  # objective value is treated as a tracked parameter, aligned by runtime.
  paramnames <- info$attributes[!info$attributes %in% c("evaluations", "raw_y")]

  PAR <- list('by_RT' = lapply(paramnames, function(parname) {
      dt_for_allign <- dcast(data, evaluations ~ runnr, value.var = parname)

      mat_temp <- as.matrix(dt_for_allign[,2:ncol(dt_for_allign)])
      rownames(mat_temp) <- runtimes
      mat_temp
    }))
  names(PAR$by_RT) <- paramnames

  # NOTE(review): finalFV uses min(FV) even under maximization — confirm
  # this is the intended convention for the 'finalFV' attribute.
  ds <- do.call(
    function(...)
      structure(list(RT = RT, FV = FV, PAR = PAR), class = c('DataSet', 'list'), ...),
    c(info, list(maxRT = max(runtimes), finalFV = min(FV), format = IOHprofiler,
                 ID = info$algId))
  )
  if (getOption('IOHanalyzer.function_representation', 'funcId') == 'funcName') {
    attr(ds, 'funcId') <- attr(ds, 'funcName')
  }

  # The raw (unaligned) FV matrix is only kept when explicitly requested or
  # when constraint-violation data is present.
  if (full_sampling || 'violation' %in% info$attributes) {
    rownames(FV_mat) <- runtimes
    ds$FV_raw_mat <- FV_mat
    attr(ds, 'contains_full_FV') <- TRUE
  }

  if ('violation' %in% info$attributes) {
    attr(ds, 'constrained') <- TRUE
  }

  return(ds)

}

# NOTE: the lines below are rdrr.io page-footer text accidentally captured with
# the source; commented out so this file remains valid R.
# Try the IOHanalyzer package in your browser
# Any scripts or data that you put into this service are public.
# IOHanalyzer documentation built on Sept. 20, 2023, 5:07 p.m.