R/darwin_data.R

Defines functions zero_pad_last update.darwin_data init_penalties summarise_fitness_penalties_by_key_models summarise_fitness_penalties_by_iteration summarise_fitness_by_iteration get_value_from_messages get_search_overview get_messages get_fitness_function_text get_search_algorithm_parameters_text get_crash_value get_penalties darwin_data

Documented in darwin_data summarise_fitness_by_iteration summarise_fitness_penalties_by_iteration

#' @import ggplot2
#' @import dplyr
#' @import xpose
#' @import Certara.Xpose.NLME
#'
NULL

`%>%`<- dplyr::`%>%`

#' Initialize darwin data structure.
#'
#' @param project_dir Directory containing input files for pyDarwin (e.g., options.json).
#' @param working_dir Directory containing misc results folders generated from a pyDarwin search.
#' This is the default location of the key_models, output, and temp folders.
#' @param output_dir Directory containing output files such as "results.csv" and final control files.
#' Default location is inside \code{working_dir/output}.
#' @param key_models_dir Directory of the key_models folder. Default location is inside \code{working_dir/key_models}.
#' Note, key models are not imported if argument is \code{NULL}, explicitly specify \code{key_models_dir} to import files for \code{\link{darwinReportUI}}.
#' @param ... Additional args.
#' @details
#' If \code{working_dir} and \code{output_dir} are sub directories of \code{project_dir}, these arguments can be ignored.
#' The \code{key_models_dir} is not required to initialize the \code{darwin_data} object. If specified, however, key models data will
#' be imported which may take time given the number of key models and size of output tables. See \code{\link{import_key_models}}.
#'
#'
#'
#' @return Object of class \code{darwin_data}.
#' @export
#'
#'
darwin_data <- function(project_dir, working_dir = NULL, output_dir = NULL, key_models_dir = NULL, ...) {

  args <- list(...)
  stopifnot(dir.exists(project_dir))

  if (!is.null(working_dir) && working_dir != "") {
    stopifnot(dir.exists(working_dir))
  } else {
    working_dir <- project_dir
  }

  if (!is.null(output_dir) && output_dir != "") {
    stopifnot(dir.exists(output_dir))
    results_file <- file.path(output_dir, "results.csv")
  } else {
    output_dir <- file.path(working_dir, "output")
    results_file <- file.path(output_dir, "results.csv")
  }
  stopifnot(file.exists(results_file))

  options_file <- file.path(project_dir, "options.json")

  stopifnot(file.exists(options_file))

  options <- jsonlite::fromJSON(options_file)

  if (!is.null(options$algorithm) && grepl("^ex", options$algorithm , ignore.case = TRUE)) {
    stop("Must select a machine learning algorithm. Certara.DarwinReporter is not compatible with an EXHAUSTIVE search.")
  }

  if (!is.null(options$engine_adapter) && options$engine_adapter == "nlme") {
    software <- "nlme"
  } else {
    software <- "nonmem"
  }

  if (requireNamespace("data.table", quietly = TRUE)) {
    results <- data.table::fread(
      file = results_file,
      data.table = FALSE
    )
  } else if (requireNamespace("readr", quietly = TRUE)) {
    results <- readr::read_csv(
      results_file
    )
  } else {
    results <- utils::read.csv(
      results_file,
      numerals = "no.loss"
      )
  }
  results <- results[,1:19]
  colnames(results) <-
    c(
      "iteration",
      "model number",
      "run directory",
      "ref run",
      "status",
      "fitness",
      "model",
      "ofv",
      "success",
      "covariance",
      "correlation",
      "ntheta",
      "nomega",
      "nsigm",
      "condition num",
      "r penalty",
      "python penalty",
      "translation messages",
      "runtime errors"
    )

  messages <- get_messages(working_dir)
  if (!is.null(messages)) {
    search_overview <- tryCatch({
      get_search_overview(messages)
    }, error = function(e)
      return(NULL))
    if (is.null(search_overview)) {
      search_overview <- tryCatch({
        get_search_overview(messages, sep = "=")
      }, error = function(e)
        return(NULL))
    }
  } else {
    search_overview <- NULL
  }

  if (!is.null(key_models_dir) && dir.exists(key_models_dir)) {
    key_models <- import_key_models(darwin_data = NULL,
                                    dir = key_models_dir,
                                    software = software)
  } else {
    key_models <- NULL
  }

  list(project_dir = project_dir,
       working_dir = working_dir,
       output_dir = output_dir,
       key_models_dir = key_models_dir,
       software = software,
       options = options,
       search_overview = search_overview,
       results = results,
       messages = messages,
       key_models = key_models) %>%
    structure(class = "darwin_data")
}

get_penalties <- function(darwin_data) {

  stopifnot(inherits(darwin_data, "darwin_data"))

  default_penalties <- init_penalties()
  user_penalties <- darwin_data$options$penalty

  if (is.null(user_penalties)) {
    penalties <- default_penalties
  } else {
    penalties <-
      c(user_penalties, default_penalties[!(names(default_penalties) %in% names(user_penalties))])
  }

  return(penalties)

}

get_crash_value <- function(darwin_data) {

  stopifnot(inherits(darwin_data, "darwin_data"))

  crash_value <- darwin_data$options$crash_value

  if (is.null(crash_value)) {
    crash_value <- 99999999
  }

  return(crash_value)

}

get_search_algorithm_parameters_text <- function(darwin_data) {
  options <- darwin_data$options
  algorithm <- options$algorithm
  num_generations <- options$num_generations
  population_size <- options$population_size
  total_iterations <- unique(darwin_data$results$iteration)

  text <-
    paste0(
      "The search algorithm used was ",
      algorithm,
      ". There were a total of ",
      num_generations,
      " iterations performed in the machine learning optimization stage. The number of models in each iteration was ",
      population_size,
      ".\n"
    )

# Periodic Downhill
  if (!is.null(options$downhill_period) && options$downhill_period > 0) {
    downhill_iterations <-
      total_iterations[grepl("FN", total_iterations)]
    downhill_period_iterations <-
      total_iterations[grepl("D", total_iterations)]
    downhill_period_iterations <- setdiff(downhill_period_iterations, downhill_iterations)
    downhill_period_search_text <-
      paste0(
        "A periodic downhill search was performed at an interval of every ",
        options$downhill_period,
        " iterations, resulting in an additional ",
        length(downhill_period_iterations),
        " iterations in the search. "
      )
    text <- paste0(text, downhill_period_search_text)
  }
# Final Downhill
  if (!is.null(options$final_downhill_search) && options$final_downhill_search) {
    downhill_iterations <-
      total_iterations[grepl("FN", total_iterations)]
    downhill_search_text <-
      paste0(
        "A final downhill search was performed with an additional ",
        length(downhill_iterations),
        " iterations."
      )

    text <- paste0(text, downhill_search_text)
  }

  text <- paste0(text, paste0("\nA total of ", length(total_iterations), " iterations were performed in the search.\n"))

  return(text)
}

get_fitness_function_text <- function(darwin_data) {
  text <- "The fitness function was defined as the sum of:\n"

  penalties <- get_penalties(darwin_data)

  penalties_text <- paste0("* ", penalties$theta, " points for each estimated THETA parameter\n",
                           "* ", penalties$omega, " points for each estimated OMEGA parameter\n",
                           "* ", penalties$sigma, " points for each estimated SIGMA parameter\n",
                           "* ", penalties$convergence, " points for failing to converge\n",
                           "* ", penalties$covariance, " points for failing the covariance step\n",
                           "* ", penalties$correlation, " points if the absolute value of any off diagonal of the estimation correlation matrix is > 0.95\n",
                           "* ", penalties$condition_number, " points for a condition number > 1000\n"
  )

  text <- c(text, penalties_text)

  options <- darwin_data$options
  if (!is.null(options$postprocess$use_r) && isTRUE(options$postprocess$use_r)) {
    text <- c(text,
    "A penalty for <DESCRIBE PENALTY> calculated by the R code included in appendix.\n"
    )
  }
  if (!is.null(options$postprocess$use_python) && isTRUE(options$postprocess$use_python)) {
    text <- c(text,
              "A penalty for <DESCRIBE PENALTY> calculated by the Python code included in appendix."
    )
  }
  return(text)
}




get_messages <- function(working_dir) {
  messages_file <- file.path(working_dir, "messages.txt")
  if(file.exists(messages_file)) {
  messages <- readLines(messages_file)
  } else {
    warning("cannot create search_overview, messages.txt not found in working directory")
    messages <- NULL
  }

  return(messages)
}

get_search_overview <- function(messages, sep = ":") {

  num_models_in_search_space <- get_value_from_messages(messages,
                                                        type = "numeric",
                                                        pattern = paste0("Search space size", sep, " (\\d+)"))

  num_considered_models <- get_value_from_messages(messages,
                                                   type = "numeric",
                                                   pattern = paste0("Number of considered models", sep, " (\\d+)"))
  num_models_run_in_search <- get_value_from_messages(messages,
                                                      type = "numeric",
                                                      pattern = paste0("Number of models that were run during the search", sep, " (\\d+)"))

  num_unique_models_to_best_model <- get_value_from_messages(messages,
                                                      type = "numeric",
                                                      pattern = paste0("Number of unique models to best model", sep, " (\\d+)"))

  time_to_best_model <- get_value_from_messages(messages,
                                                type = "character",
                                                pattern = paste0("Time to best model", sep, " ([0-9.]+ \\w+)"))
  best_overall_fitness <- strsplit(
    get_value_from_messages(rev(messages),
                            type = "character",
                            pattern = paste0("Best overall fitness", sep, " (.+)$")),
    split = ","
  )
  best_overall_fitness <- sapply(best_overall_fitness[[1]], trimws)

  best_overall_fitness <- sapply(best_overall_fitness, function(x) {
    gsub(".* ", "", x)
  })

  best_overall_fitness <- list(
    "Fitness" = as.numeric(best_overall_fitness[[1]]),
    "Iteration" = best_overall_fitness[[2]],
    "Model" = best_overall_fitness[[3]]
  )

  elapsed_time <- get_value_from_messages(messages,
                                          type = "character",
                                          pattern = paste0("Elapsed time", sep, " ([0-9.]+ \\w+)"))

  search_overview <- list(best_overall_fitness = best_overall_fitness,
                          num_models_in_search_space = num_models_in_search_space,
                          num_considered_models = num_considered_models,
                          num_models_run_in_search = num_models_run_in_search,
                          num_unique_models_to_best_model = num_unique_models_to_best_model,
                          time_to_best_model = time_to_best_model,
                          elapsed_time = elapsed_time
  )

  return(search_overview)
}

get_value_from_messages <- function(messages, pattern, type = c("numeric", "character")) {
  matches <- gregexpr(pattern, messages)
  extracted_values <- regmatches(messages, matches)
  extracted_values <- Filter(length, extracted_values)

  # Extract the numeric value from the first match (if exists)
  if (length(extracted_values) > 0) {
    extracted_value <- gsub(pattern, "\\1", extracted_values[[1]])
    # Convert to numeric if needed
    if (type == "numeric") {
    extracted_value <- as.numeric(extracted_value)
    }
  } else {
    extracted_value <- NULL
  }

  if (is.null(extracted_value)) {
    index <- grep(pattern, messages)
    if (length(index) > 0) {
      relevant_string <- messages[index]
      extracted_value <- sub(paste0(".*", pattern, ".*", collapse = ""), "\\1", relevant_string)
      if (type == "numeric") {
        extracted_value <- as.numeric(extracted_value)
      }
    }
  }

  return(extracted_value)
}


#' Summarise fitness by iteration
#'
#' Summarise minimum, cumulative minimum, and mean fitness values by pyDarwin search iteration/generation.
#'
#' @param darwin_data Object of class \code{darwin_data}.
#'
#' @return \code{data.frame} with columns \code{iteration},
#' \code{min_fitness}, \code{mean_fitness}, and \code{min_cum_fitness}
#' @export
#'
summarise_fitness_by_iteration <- function(darwin_data) {

  stopifnot(inherits(darwin_data, "darwin_data"))

  crash_value <- get_crash_value(darwin_data)

  fitness_by_iteration <- darwin_data$results %>%
    filter(fitness < crash_value) %>%
    group_by(iteration)

  iteration_number <- fitness_by_iteration %>%
    select(iteration) %>%
    distinct() %>%
    ungroup() %>%
    mutate(n = row_number())

  fitness_by_iteration <- fitness_by_iteration %>%
    summarise(min_fitness = min(fitness),
              mean_fitness = mean(fitness)) %>%
    left_join(iteration_number, by = "iteration") %>%
    arrange(n) %>%
    mutate(iteration = factor(iteration, levels = unique(iteration))) %>%
    select(-n)  %>%
    mutate(min_cum_fitness = cummin(min_fitness))

  return(fitness_by_iteration)
}

#' Summarize minimum fitness and penalty values by iteration
#'
#' Summarise minimum fitness, ofv, and penalty values used in calculation of overall fitness values by pyDarwin search iteration/generation.
#'
#' @param darwin_data Object of class \code{darwin_data}.
#' @param group_penalties Logical. Set to \code{TRUE} to group penalties.
#'
#' @return \code{data.frame} of columns \code{"iteration", "fitness", "ofv"} and corresponding penalty columns.
#' @export
#'
summarise_fitness_penalties_by_iteration <- function(darwin_data, group_penalties = FALSE) {

  stopifnot(inherits(darwin_data, "darwin_data"))

  crash_value <- get_crash_value(darwin_data)

  penalties <- get_penalties(darwin_data)

  fitness_penalties_by_iteration <- darwin_data$results %>%
    filter(fitness < crash_value) %>%
    group_by(iteration) %>%
    filter(fitness == min(fitness)) %>%
    dplyr::distinct(iteration, fitness, .keep_all = TRUE) %>%
    ungroup() %>%
    mutate(penalty_ntheta = ntheta * penalties$theta,
           penalty_nomega = nomega * penalties$omega,
           penalty_nsigma = nsigm * penalties$sigma,
           penalty_success = ifelse(success, 0, penalties$convergence),
           penalty_covar = ifelse(covariance, 0, penalties$covariance),
           penalty_corr = ifelse(`correlation`, 0, penalties$correlation),
           penalty_condition = ifelse(`condition num` < 1000, 0, penalties$condition_number),
           penalty_r = `r penalty`,
           penalty_python = `python penalty`,
           iteration = factor(iteration, levels = unique(iteration))
    ) %>%
    select(iteration,
           fitness,
           ofv,
           penalty_ntheta,
           penalty_nomega,
           penalty_nsigma,
           penalty_success,
           penalty_covar,
           penalty_corr,
           penalty_condition,
           penalty_r,
           penalty_python)

  if (group_penalties) {
    fitness_penalties_by_iteration <- fitness_penalties_by_iteration %>%
      mutate(
        penalties_nparams = penalty_ntheta + penalty_nomega + penalty_nsigma,
        penalties_covariance_step = penalty_covar + penalty_corr + penalty_condition
      ) %>%
      select(
        iteration,
        fitness,
        ofv,
        penalties_nparams,
        penalties_covariance_step,
        penalty_success,
        penalty_r,
        penalty_python
      )

  }

  # only select penalties that have non-zero values
  fitness_penalties_by_iteration <- fitness_penalties_by_iteration %>%
    select_if(function(col) any(col != 0))

  return(fitness_penalties_by_iteration)

}

summarise_fitness_penalties_by_key_models <- function(darwin_data, group_penalties = FALSE) {

  stopifnot(inherits(darwin_data, "darwin_data"))
  stopifnot(!is.null(darwin_data$key_models))

  crash_value <- get_crash_value(darwin_data)

  penalties <- get_penalties(darwin_data)

  if (darwin_data$software == "nlme") {
    model_prefix <- "NLME"
  } else {
    model_prefix <- "NM"
  }

  results <- darwin_data$results %>%
    dplyr::mutate(model_iteration = paste(model_prefix,
                                           iteration,
                                           sprintf("%02d", `model number`),
                                           sep = "_"))

  fitness_penalties_by_key_models <-
    data.frame(model_iteration = names(darwin_data$key_models$run_dirs),
               run_dir = unlist(darwin_data$key_models$run_dirs))

  fitness_penalties_by_key_models$model_iteration <-
    unlist(
      lapply(
        fitness_penalties_by_key_models$model_iteration,
        zero_pad_last,
        num_zeros = 2
      )
    )

  fitness_penalties_by_key_models <- fitness_penalties_by_key_models %>%
    dplyr::left_join(results, by = "model_iteration") %>%
    dplyr::mutate(penalty_ntheta = ntheta * penalties$theta,
           penalty_nomega = nomega * penalties$omega,
           penalty_nsigma = nsigm * penalties$sigma,
           penalty_success = ifelse(success, 0, penalties$convergence),
           penalty_covar = ifelse(covariance, 0, penalties$covariance),
           penalty_corr = ifelse(`correlation`, 0, penalties$correlation),
           penalty_condition = ifelse(`condition num` < 1000, 0, penalties$condition_number),
           penalty_r = `r penalty`,
           penalty_python = `python penalty`,
           iteration = factor(iteration, levels = unique(iteration))
    ) %>%
    dplyr::select(iteration,
           model_iteration,
           run_dir,
           fitness,
           ofv,
           penalty_ntheta,
           penalty_nomega,
           penalty_nsigma,
           penalty_success,
           penalty_covar,
           penalty_corr,
           penalty_condition,
           penalty_r,
           penalty_python)

  if (group_penalties) {
    fitness_penalties_by_key_models <- fitness_penalties_by_key_models %>%
      mutate(
        penalties_nparams = penalty_ntheta + penalty_nomega + penalty_nsigma,
        penalties_covariance_step = penalty_covar + penalty_corr + penalty_condition
      ) %>%
      select(
        iteration,
        model_iteration,
        run_dir,
        fitness,
        ofv,
        penalties_nparams,
        penalties_covariance_step,
        penalty_success,
        penalty_r,
        penalty_python
      )

  }

  # only select penalties that have non-zero values
  fitness_penalties_by_key_models <- fitness_penalties_by_key_models %>%
    select_if(function(col) any(col != 0))

  return(fitness_penalties_by_key_models)
}


init_penalties <- function() {
  list(
    condition_number = 100,
    convergence = 100,
    correlation = 100,
    covariance = 100,
    non_influential_tokens = 1e-05,
    omega = 10,
    sigma = 10,
    theta = 10
  )
}

#' update method for darwin_data
#'
#' @param object class darwin_dat
#' @param ... additional args
#'
#' @noRd
#' @keywords internal
update.darwin_data <- function(object, ...) {
  args <- list(...)
  for (i in names(args)) {
    object[[i]] <- args[[i, exact=TRUE]]
  }
  object
}

#ensure model number has correct leading zeroes
zero_pad_last <- function(x, num_zeros = 2) {
  parts <- strsplit(x, "_")[[1]]
  last_part <- parts[length(parts)]
  new_last_part <- sprintf(paste0("%0", num_zeros, "d"), as.integer(last_part))
  parts[length(parts)] <- new_last_part
  return(paste(parts, collapse = "_"))
}

Try the Certara.DarwinReporter package in your browser

Any scripts or data that you put into this service are public.

Certara.DarwinReporter documentation built on April 4, 2025, 2:22 a.m.