R/darwin_data.R
In Certara.DarwinReporter: Data Visualization Utilities for 'pyDarwin' Machine Learning Pharmacometric Model Development

Documented in darwin_data summarise_fitness_by_iteration summarise_fitness_penalties_by_iteration

#' @import ggplot2
#' @import dplyr
#' @import xpose
#' @import Certara.Xpose.NLME
#'
NULL

`%>%`<- dplyr::`%>%`

#' Initialize darwin data structure.
#'
#' @param project_dir Directory containing input files for pyDarwin (e.g., options.json).
#' @param working_dir Directory containing misc results folders generated from a pyDarwin search.
#' This is the default location of the key_models, output, and temp folders.
#' @param output_dir Directory containing output files such as "results.csv" and final control files.
#' Default location is inside \code{working_dir/output}.
#' @param key_models_dir Directory of the key_models folder. Default location is inside \code{working_dir/key_models}.
#' Note, key models are not imported if argument is \code{NULL}, explicitly specify \code{key_models_dir} to import files for \code{\link{darwinReportUI}}.
#' @param ... Additional args.
#' @details
#' If \code{working_dir} and \code{output_dir} are sub directories of \code{project_dir}, these arguments can be ignored.
#' The \code{key_models_dir} is not required to initialize the \code{darwin_data} object. If specified, however, key models data will
#' be imported which may take time given the number of key models and size of output tables. See \code{\link{import_key_models}}.
#'
#'
#'
#' @return Object of class \code{darwin_data}.
#' @export
#'
#'
darwin_data <- function(project_dir, working_dir = NULL, output_dir = NULL, key_models_dir = NULL, ...) {

  args <- list(...)
  stopifnot(dir.exists(project_dir))

  if (!is.null(working_dir) && working_dir != "") {
    stopifnot(dir.exists(working_dir))
  } else {
    working_dir <- project_dir
  }

  if (!is.null(output_dir) && output_dir != "") {
    stopifnot(dir.exists(output_dir))
    results_file <- file.path(output_dir, "results.csv")
  } else {
    output_dir <- file.path(working_dir, "output")
    results_file <- file.path(output_dir, "results.csv")
  }
  stopifnot(file.exists(results_file))

  options_file <- file.path(project_dir, "options.json")

  stopifnot(file.exists(options_file))

  options <- jsonlite::fromJSON(options_file)

  if (!is.null(options$algorithm) && grepl("^ex", options$algorithm , ignore.case = TRUE)) {
    stop("Must select a machine learning algorithm. Certara.DarwinReporter is not compatible with an EXHAUSTIVE search.")
  }

  if (!is.null(options$engine_adapter) && options$engine_adapter == "nlme") {
    software <- "nlme"
  } else {
    software <- "nonmem"
  }

  if (requireNamespace("data.table", quietly = TRUE)) {
    results <- data.table::fread(
      file = results_file,
      data.table = FALSE
    )
  } else if (requireNamespace("readr", quietly = TRUE)) {
    results <- readr::read_csv(
      results_file
    )
  } else {
    results <- utils::read.csv(
      results_file,
      numerals = "no.loss"
      )
  }
  results <- results[,1:19]
  colnames(results) <-
    c(
      "iteration",
      "model number",
      "run directory",
      "ref run",
      "status",
      "fitness",
      "model",
      "ofv",
      "success",
      "covariance",
      "correlation",
      "ntheta",
      "nomega",
      "nsigm",
      "condition num",
      "r penalty",
      "python penalty",
      "translation messages",
      "runtime errors"
    )

  messages <- get_messages(working_dir)
  if (!is.null(messages)) {
    search_overview <- tryCatch({
      get_search_overview(messages)
    }, error = function(e)
      return(NULL))
    if (is.null(search_overview)) {
      search_overview <- tryCatch({
        get_search_overview(messages, sep = "=")
      }, error = function(e)
        return(NULL))
    }
  } else {
    search_overview <- NULL
  }

  if (!is.null(key_models_dir) && dir.exists(key_models_dir)) {
    key_models <- import_key_models(darwin_data = NULL,
                                    dir = key_models_dir,
                                    software = software)
  } else {
    key_models <- NULL
  }

  list(project_dir = project_dir,
       working_dir = working_dir,
       output_dir = output_dir,
       key_models_dir = key_models_dir,
       software = software,
       options = options,
       search_overview = search_overview,
       results = results,
       messages = messages,
       key_models = key_models) %>%
    structure(class = "darwin_data")
}

get_penalties <- function(darwin_data) {

  stopifnot(inherits(darwin_data, "darwin_data"))

  default_penalties <- init_penalties()
  user_penalties <- darwin_data$options$penalty

  if (is.null(user_penalties)) {
    penalties <- default_penalties
  } else {
    penalties <-
      c(user_penalties, default_penalties[!(names(default_penalties) %in% names(user_penalties))])
  }

  return(penalties)

}

get_crash_value <- function(darwin_data) {

  stopifnot(inherits(darwin_data, "darwin_data"))

  crash_value <- darwin_data$options$crash_value

  if (is.null(crash_value)) {
    crash_value <- 99999999
  }

  return(crash_value)

}

get_search_algorithm_parameters_text <- function(darwin_data) {
  options <- darwin_data$options
  algorithm <- options$algorithm
  num_generations <- options$num_generations
  population_size <- options$population_size
  total_iterations <- unique(darwin_data$results$iteration)

  text <-
    paste0(
      "The search algorithm used was ",
      algorithm,
      ". There were a total of ",
      num_generations,
      " iterations performed in the machine learning optimization stage. The number of models in each iteration was ",
      population_size,
      ".\n"
    )

# Periodic Downhill
  if (!is.null(options$downhill_period) && options$downhill_period > 0) {
    downhill_iterations <-
      total_iterations[grepl("FN", total_iterations)]
    downhill_period_iterations <-
      total_iterations[grepl("D", total_iterations)]
    downhill_period_iterations <- setdiff(downhill_period_iterations, downhill_iterations)
    downhill_period_search_text <-
      paste0(
        "A periodic downhill search was performed at an interval of every ",
        options$downhill_period,
        " iterations, resulting in an additional ",
        length(downhill_period_iterations),
        " iterations in the search. "
      )
    text <- paste0(text, downhill_period_search_text)
  }
# Final Downhill
  if (!is.null(options$final_downhill_search) && options$final_downhill_search) {
    downhill_iterations <-
      total_iterations[grepl("FN", total_iterations)]
    downhill_search_text <-
      paste0(
        "A final downhill search was performed with an additional ",
        length(downhill_iterations),
        " iterations."
      )

    text <- paste0(text, downhill_search_text)
  }

  text <- paste0(text, paste0("\nA total of ", length(total_iterations), " iterations were performed in the search.\n"))

  return(text)
}

get_fitness_function_text <- function(darwin_data) {
  text <- "The fitness function was defined as the sum of:\n"

  penalties <- get_penalties(darwin_data)

  penalties_text <- paste0("* ", penalties$theta, " points for each estimated THETA parameter\n",
                           "* ", penalties$omega, " points for each estimated OMEGA parameter\n",
                           "* ", penalties$sigma, " points for each estimated SIGMA parameter\n",
                           "* ", penalties$convergence, " points for failing to converge\n",
                           "* ", penalties$covariance, " points for failing the covariance step\n",
                           "* ", penalties$correlation, " points if the absolute value of any off diagonal of the estimation correlation matrix is > 0.95\n",
                           "* ", penalties$condition_number, " points for a condition number > 1000\n"
  )

  text <- c(text, penalties_text)

  options <- darwin_data$options
  if (!is.null(options$postprocess$use_r) && isTRUE(options$postprocess$use_r)) {
    text <- c(text,
    "A penalty for <DESCRIBE PENALTY> calculated by the R code included in appendix.\n"
    )
  }
  if (!is.null(options$postprocess$use_python) && isTRUE(options$postprocess$use_python)) {
    text <- c(text,
              "A penalty for <DESCRIBE PENALTY> calculated by the Python code included in appendix."
    )
  }
  return(text)
}




get_messages <- function(working_dir) {
  messages_file <- file.path(working_dir, "messages.txt")
  if(file.exists(messages_file)) {
  messages <- readLines(messages_file)
  } else {
    warning("cannot create search_overview, messages.txt not found in working directory")
    messages <- NULL
  }

  return(messages)
}

get_search_overview <- function(messages, sep = ":") {

  num_models_in_search_space <- get_value_from_messages(messages,
                                                        type = "numeric",
                                                        pattern = paste0("Search space size", sep, " (\\d+)"))

  num_considered_models <- get_value_from_messages(messages,
                                                   type = "numeric",
                                                   pattern = paste0("Number of considered models", sep, " (\\d+)"))
  num_models_run_in_search <- get_value_from_messages(messages,
                                                      type = "numeric",
                                                      pattern = paste0("Number of models that were run during the search", sep, " (\\d+)"))

  num_unique_models_to_best_model <- get_value_from_messages(messages,
                                                      type = "numeric",
                                                      pattern = paste0("Number of unique models to best model", sep, " (\\d+)"))

  time_to_best_model <- get_value_from_messages(messages,
                                                type = "character",
                                                pattern = paste0("Time to best model", sep, " ([0-9.]+ \\w+)"))
  best_overall_fitness <- strsplit(
    get_value_from_messages(rev(messages),
                            type = "character",
                            pattern = paste0("Best overall fitness", sep, " (.+)$")),
    split = ","
  )
  best_overall_fitness <- sapply(best_overall_fitness[[1]], trimws)

  best_overall_fitness <- sapply(best_overall_fitness, function(x) {
    gsub(".* ", "", x)
  })

  best_overall_fitness <- list(
    "Fitness" = as.numeric(best_overall_fitness[[1]]),
    "Iteration" = best_overall_fitness[[2]],
    "Model" = best_overall_fitness[[3]]
  )

  elapsed_time <- get_value_from_messages(messages,
                                          type = "character",
                                          pattern = paste0("Elapsed time", sep, " ([0-9.]+ \\w+)"))

  search_overview <- list(best_overall_fitness = best_overall_fitness,
                          num_models_in_search_space = num_models_in_search_space,
                          num_considered_models = num_considered_models,
                          num_models_run_in_search = num_models_run_in_search,
                          num_unique_models_to_best_model = num_unique_models_to_best_model,
                          time_to_best_model = time_to_best_model,
                          elapsed_time = elapsed_time
  )

  return(search_overview)
}

get_value_from_messages <- function(messages, pattern, type = c("numeric", "character")) {
  matches <- gregexpr(pattern, messages)
  extracted_values <- regmatches(messages, matches)
  extracted_values <- Filter(length, extracted_values)

  # Extract the numeric value from the first match (if exists)
  if (length(extracted_values) > 0) {
    extracted_value <- gsub(pattern, "\\1", extracted_values[[1]])
    # Convert to numeric if needed
    if (type == "numeric") {
    extracted_value <- as.numeric(extracted_value)
    }
  } else {
    extracted_value <- NULL
  }

  if (is.null(extracted_value)) {
    index <- grep(pattern, messages)
    if (length(index) > 0) {
      relevant_string <- messages[index]
      extracted_value <- sub(paste0(".*", pattern, ".*", collapse = ""), "\\1", relevant_string)
      if (type == "numeric") {
        extracted_value <- as.numeric(extracted_value)
      }
    }
  }

  return(extracted_value)
}


#' Summarise fitness by iteration
#'
#' Summarise minimum, cumulative minimum, and mean fitness values by pyDarwin search iteration/generation.
#'
#' @param darwin_data Object of class \code{darwin_data}.
#'
#' @return \code{data.frame} with columns \code{iteration},
#' \code{min_fitness}, \code{mean_fitness}, and \code{min_cum_fitness}
#' @export
#'
summarise_fitness_by_iteration <- function(darwin_data) {

  stopifnot(inherits(darwin_data, "darwin_data"))

  crash_value <- get_crash_value(darwin_data)

  fitness_by_iteration <- darwin_data$results %>%
    filter(fitness < crash_value) %>%
    group_by(iteration)

  iteration_number <- fitness_by_iteration %>%
    select(iteration) %>%
    distinct() %>%
    ungroup() %>%
    mutate(n = row_number())

  fitness_by_iteration <- fitness_by_iteration %>%
    summarise(min_fitness = min(fitness),
              mean_fitness = mean(fitness)) %>%
    left_join(iteration_number, by = "iteration") %>%
    arrange(n) %>%
    mutate(iteration = factor(iteration, levels = unique(iteration))) %>%
    select(-n)  %>%
    mutate(min_cum_fitness = cummin(min_fitness))

  return(fitness_by_iteration)
}

#' Summarize minimum fitness and penalty values by iteration
#'
#' Summarise minimum fitness, ofv, and penalty values used in calculation of overall fitness values by pyDarwin search iteration/generation.
#'
#' @param darwin_data Object of class \code{darwin_data}.
#' @param group_penalties Logical. Set to \code{TRUE} to group penalties.
#'
#' @return \code{data.frame} of columns \code{"iteration", "fitness", "ofv"} and corresponding penalty columns.
#' @export
#'
summarise_fitness_penalties_by_iteration <- function(darwin_data, group_penalties = FALSE) {

  stopifnot(inherits(darwin_data, "darwin_data"))

  crash_value <- get_crash_value(darwin_data)

  penalties <- get_penalties(darwin_data)

  fitness_penalties_by_iteration <- darwin_data$results %>%
    filter(fitness < crash_value) %>%
    group_by(iteration) %>%
    filter(fitness == min(fitness)) %>%
    dplyr::distinct(iteration, fitness, .keep_all = TRUE) %>%
    ungroup() %>%
    mutate(penalty_ntheta = ntheta * penalties$theta,
           penalty_nomega = nomega * penalties$omega,
           penalty_nsigma = nsigm * penalties$sigma,
           penalty_success = ifelse(success, 0, penalties$convergence),
           penalty_covar = ifelse(covariance, 0, penalties$covariance),
           penalty_corr = ifelse(`correlation`, 0, penalties$correlation),
           penalty_condition = ifelse(`condition num` < 1000, 0, penalties$condition_number),
           penalty_r = `r penalty`,
           penalty_python = `python penalty`,
           iteration = factor(iteration, levels = unique(iteration))
    ) %>%
    select(iteration,
           fitness,
           ofv,
           penalty_ntheta,
           penalty_nomega,
           penalty_nsigma,
           penalty_success,
           penalty_covar,
           penalty_corr,
           penalty_condition,
           penalty_r,
           penalty_python)

  if (group_penalties) {
    fitness_penalties_by_iteration <- fitness_penalties_by_iteration %>%
      mutate(
        penalties_nparams = penalty_ntheta + penalty_nomega + penalty_nsigma,
        penalties_covariance_step = penalty_covar + penalty_corr + penalty_condition
      ) %>%
      select(
        iteration,
        fitness,
        ofv,
        penalties_nparams,
        penalties_covariance_step,
        penalty_success,
        penalty_r,
        penalty_python
      )

  }

  # only select penalties that have non-zero values
  fitness_penalties_by_iteration <- fitness_penalties_by_iteration %>%
    select_if(function(col) any(col != 0))

  return(fitness_penalties_by_iteration)

}

summarise_fitness_penalties_by_key_models <- function(darwin_data, group_penalties = FALSE) {

  stopifnot(inherits(darwin_data, "darwin_data"))
  stopifnot(!is.null(darwin_data$key_models))

  crash_value <- get_crash_value(darwin_data)

  penalties <- get_penalties(darwin_data)

  if (darwin_data$software == "nlme") {
    model_prefix <- "NLME"
  } else {
    model_prefix <- "NM"
  }

  results <- darwin_data$results %>%
    dplyr::mutate(model_iteration = paste(model_prefix,
                                           iteration,
                                           sprintf("%02d", `model number`),
                                           sep = "_"))

  fitness_penalties_by_key_models <-
    data.frame(model_iteration = names(darwin_data$key_models$run_dirs),
               run_dir = unlist(darwin_data$key_models$run_dirs))

  fitness_penalties_by_key_models$model_iteration <-
    unlist(
      lapply(
        fitness_penalties_by_key_models$model_iteration,
        zero_pad_last,
        num_zeros = 2
      )
    )

  fitness_penalties_by_key_models <- fitness_penalties_by_key_models %>%
    dplyr::left_join(results, by = "model_iteration") %>%
    dplyr::mutate(penalty_ntheta = ntheta * penalties$theta,
           penalty_nomega = nomega * penalties$omega,
           penalty_nsigma = nsigm * penalties$sigma,
           penalty_success = ifelse(success, 0, penalties$convergence),
           penalty_covar = ifelse(covariance, 0, penalties$covariance),
           penalty_corr = ifelse(`correlation`, 0, penalties$correlation),
           penalty_condition = ifelse(`condition num` < 1000, 0, penalties$condition_number),
           penalty_r = `r penalty`,
           penalty_python = `python penalty`,
           iteration = factor(iteration, levels = unique(iteration))
    ) %>%
    dplyr::select(iteration,
           model_iteration,
           run_dir,
           fitness,
           ofv,
           penalty_ntheta,
           penalty_nomega,
           penalty_nsigma,
           penalty_success,
           penalty_covar,
           penalty_corr,
           penalty_condition,
           penalty_r,
           penalty_python)

  if (group_penalties) {
    fitness_penalties_by_key_models <- fitness_penalties_by_key_models %>%
      mutate(
        penalties_nparams = penalty_ntheta + penalty_nomega + penalty_nsigma,
        penalties_covariance_step = penalty_covar + penalty_corr + penalty_condition
      ) %>%
      select(
        iteration,
        model_iteration,
        run_dir,
        fitness,
        ofv,
        penalties_nparams,
        penalties_covariance_step,
        penalty_success,
        penalty_r,
        penalty_python
      )

  }

  # only select penalties that have non-zero values
  fitness_penalties_by_key_models <- fitness_penalties_by_key_models %>%
    select_if(function(col) any(col != 0))

  return(fitness_penalties_by_key_models)
}


init_penalties <- function() {
  list(
    condition_number = 100,
    convergence = 100,
    correlation = 100,
    covariance = 100,
    non_influential_tokens = 1e-05,
    omega = 10,
    sigma = 10,
    theta = 10
  )
}

#' update method for darwin_data
#'
#' @param object class darwin_dat
#' @param ... additional args
#'
#' @noRd
#' @keywords internal
update.darwin_data <- function(object, ...) {
  args <- list(...)
  for (i in names(args)) {
    object[[i]] <- args[[i, exact=TRUE]]
  }
  object
}

#ensure model number has correct leading zeroes
zero_pad_last <- function(x, num_zeros = 2) {
  parts <- strsplit(x, "_")[[1]]
  last_part <- parts[length(parts)]
  new_last_part <- sprintf(paste0("%0", num_zeros, "d"), as.integer(last_part))
  parts[length(parts)] <- new_last_part
  return(paste(parts, collapse = "_"))
}
Any scripts or data that you put into this service are public.
Certara.DarwinReporter documentation built on April 4, 2025, 2:22 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
Certara.DarwinReporter
Data Visualization Utilities for 'pyDarwin' Machine Learning Pharmacometric Model Development

R/darwin_data.R
In Certara.DarwinReporter: Data Visualization Utilities for 'pyDarwin' Machine Learning Pharmacometric Model Development

Documented in darwin_data summarise_fitness_by_iteration summarise_fitness_penalties_by_iteration

Try the Certara.DarwinReporter package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

Certara.DarwinReporter Data Visualization Utilities for 'pyDarwin' Machine Learning Pharmacometric Model Development

R/darwin_data.R In Certara.DarwinReporter: Data Visualization Utilities for 'pyDarwin' Machine Learning Pharmacometric Model Development

Documented in darwin_data summarise_fitness_by_iteration summarise_fitness_penalties_by_iteration

Try the Certara.DarwinReporter package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

Certara.DarwinReporter
Data Visualization Utilities for 'pyDarwin' Machine Learning Pharmacometric Model Development

R/darwin_data.R
In Certara.DarwinReporter: Data Visualization Utilities for 'pyDarwin' Machine Learning Pharmacometric Model Development