Nothing
#' @import ggplot2
#' @import dplyr
#' @import xpose
#' @import Certara.Xpose.NLME
#'
NULL
`%>%`<- dplyr::`%>%`
#' Initialize darwin data structure.
#'
#' @param project_dir Directory containing input files for pyDarwin (e.g., options.json).
#' @param working_dir Directory containing misc results folders generated from a pyDarwin search.
#' This is the default location of the key_models, output, and temp folders.
#' @param output_dir Directory containing output files such as "results.csv" and final control files.
#' Default location is inside \code{working_dir/output}.
#' @param key_models_dir Directory of the key_models folder. Default location is inside \code{working_dir/key_models}.
#' Note, key models are not imported if argument is \code{NULL}, explicitly specify \code{key_models_dir} to import files for \code{\link{darwinReportUI}}.
#' @param ... Additional args.
#' @details
#' If \code{working_dir} and \code{output_dir} are sub directories of \code{project_dir}, these arguments can be ignored.
#' The \code{key_models_dir} is not required to initialize the \code{darwin_data} object. If specified, however, key models data will
#' be imported which may take time given the number of key models and size of output tables. See \code{\link{import_key_models}}.
#'
#'
#'
#' @return Object of class \code{darwin_data}.
#' @export
#'
#'
darwin_data <- function(project_dir, working_dir = NULL, output_dir = NULL, key_models_dir = NULL, ...) {
args <- list(...)
stopifnot(dir.exists(project_dir))
if (!is.null(working_dir) && working_dir != "") {
stopifnot(dir.exists(working_dir))
} else {
working_dir <- project_dir
}
if (!is.null(output_dir) && output_dir != "") {
stopifnot(dir.exists(output_dir))
results_file <- file.path(output_dir, "results.csv")
} else {
output_dir <- file.path(working_dir, "output")
results_file <- file.path(output_dir, "results.csv")
}
stopifnot(file.exists(results_file))
options_file <- file.path(project_dir, "options.json")
stopifnot(file.exists(options_file))
options <- jsonlite::fromJSON(options_file)
if (!is.null(options$algorithm) && grepl("^ex", options$algorithm , ignore.case = TRUE)) {
stop("Must select a machine learning algorithm. Certara.DarwinReporter is not compatible with an EXHAUSTIVE search.")
}
if (!is.null(options$engine_adapter) && options$engine_adapter == "nlme") {
software <- "nlme"
} else {
software <- "nonmem"
}
if (requireNamespace("data.table", quietly = TRUE)) {
results <- data.table::fread(
file = results_file,
data.table = FALSE
)
} else if (requireNamespace("readr", quietly = TRUE)) {
results <- readr::read_csv(
results_file
)
} else {
results <- utils::read.csv(
results_file,
numerals = "no.loss"
)
}
results <- results[,1:19]
colnames(results) <-
c(
"iteration",
"model number",
"run directory",
"ref run",
"status",
"fitness",
"model",
"ofv",
"success",
"covariance",
"correlation",
"ntheta",
"nomega",
"nsigm",
"condition num",
"r penalty",
"python penalty",
"translation messages",
"runtime errors"
)
messages <- get_messages(working_dir)
if (!is.null(messages)) {
search_overview <- tryCatch({
get_search_overview(messages)
}, error = function(e)
return(NULL))
if (is.null(search_overview)) {
search_overview <- tryCatch({
get_search_overview(messages, sep = "=")
}, error = function(e)
return(NULL))
}
} else {
search_overview <- NULL
}
if (!is.null(key_models_dir) && dir.exists(key_models_dir)) {
key_models <- import_key_models(darwin_data = NULL,
dir = key_models_dir,
software = software)
} else {
key_models <- NULL
}
list(project_dir = project_dir,
working_dir = working_dir,
output_dir = output_dir,
key_models_dir = key_models_dir,
software = software,
options = options,
search_overview = search_overview,
results = results,
messages = messages,
key_models = key_models) %>%
structure(class = "darwin_data")
}
get_penalties <- function(darwin_data) {
stopifnot(inherits(darwin_data, "darwin_data"))
default_penalties <- init_penalties()
user_penalties <- darwin_data$options$penalty
if (is.null(user_penalties)) {
penalties <- default_penalties
} else {
penalties <-
c(user_penalties, default_penalties[!(names(default_penalties) %in% names(user_penalties))])
}
return(penalties)
}
get_crash_value <- function(darwin_data) {
stopifnot(inherits(darwin_data, "darwin_data"))
crash_value <- darwin_data$options$crash_value
if (is.null(crash_value)) {
crash_value <- 99999999
}
return(crash_value)
}
get_search_algorithm_parameters_text <- function(darwin_data) {
options <- darwin_data$options
algorithm <- options$algorithm
num_generations <- options$num_generations
population_size <- options$population_size
total_iterations <- unique(darwin_data$results$iteration)
text <-
paste0(
"The search algorithm used was ",
algorithm,
". There were a total of ",
num_generations,
" iterations performed in the machine learning optimization stage. The number of models in each iteration was ",
population_size,
".\n"
)
# Periodic Downhill
if (!is.null(options$downhill_period) && options$downhill_period > 0) {
downhill_iterations <-
total_iterations[grepl("FN", total_iterations)]
downhill_period_iterations <-
total_iterations[grepl("D", total_iterations)]
downhill_period_iterations <- setdiff(downhill_period_iterations, downhill_iterations)
downhill_period_search_text <-
paste0(
"A periodic downhill search was performed at an interval of every ",
options$downhill_period,
" iterations, resulting in an additional ",
length(downhill_period_iterations),
" iterations in the search. "
)
text <- paste0(text, downhill_period_search_text)
}
# Final Downhill
if (!is.null(options$final_downhill_search) && options$final_downhill_search) {
downhill_iterations <-
total_iterations[grepl("FN", total_iterations)]
downhill_search_text <-
paste0(
"A final downhill search was performed with an additional ",
length(downhill_iterations),
" iterations."
)
text <- paste0(text, downhill_search_text)
}
text <- paste0(text, paste0("\nA total of ", length(total_iterations), " iterations were performed in the search.\n"))
return(text)
}
get_fitness_function_text <- function(darwin_data) {
text <- "The fitness function was defined as the sum of:\n"
penalties <- get_penalties(darwin_data)
penalties_text <- paste0("* ", penalties$theta, " points for each estimated THETA parameter\n",
"* ", penalties$omega, " points for each estimated OMEGA parameter\n",
"* ", penalties$sigma, " points for each estimated SIGMA parameter\n",
"* ", penalties$convergence, " points for failing to converge\n",
"* ", penalties$covariance, " points for failing the covariance step\n",
"* ", penalties$correlation, " points if the absolute value of any off diagonal of the estimation correlation matrix is > 0.95\n",
"* ", penalties$condition_number, " points for a condition number > 1000\n"
)
text <- c(text, penalties_text)
options <- darwin_data$options
if (!is.null(options$postprocess$use_r) && isTRUE(options$postprocess$use_r)) {
text <- c(text,
"A penalty for <DESCRIBE PENALTY> calculated by the R code included in appendix.\n"
)
}
if (!is.null(options$postprocess$use_python) && isTRUE(options$postprocess$use_python)) {
text <- c(text,
"A penalty for <DESCRIBE PENALTY> calculated by the Python code included in appendix."
)
}
return(text)
}
get_messages <- function(working_dir) {
messages_file <- file.path(working_dir, "messages.txt")
if(file.exists(messages_file)) {
messages <- readLines(messages_file)
} else {
warning("cannot create search_overview, messages.txt not found in working directory")
messages <- NULL
}
return(messages)
}
get_search_overview <- function(messages, sep = ":") {
num_models_in_search_space <- get_value_from_messages(messages,
type = "numeric",
pattern = paste0("Search space size", sep, " (\\d+)"))
num_considered_models <- get_value_from_messages(messages,
type = "numeric",
pattern = paste0("Number of considered models", sep, " (\\d+)"))
num_models_run_in_search <- get_value_from_messages(messages,
type = "numeric",
pattern = paste0("Number of models that were run during the search", sep, " (\\d+)"))
num_unique_models_to_best_model <- get_value_from_messages(messages,
type = "numeric",
pattern = paste0("Number of unique models to best model", sep, " (\\d+)"))
time_to_best_model <- get_value_from_messages(messages,
type = "character",
pattern = paste0("Time to best model", sep, " ([0-9.]+ \\w+)"))
best_overall_fitness <- strsplit(
get_value_from_messages(rev(messages),
type = "character",
pattern = paste0("Best overall fitness", sep, " (.+)$")),
split = ","
)
best_overall_fitness <- sapply(best_overall_fitness[[1]], trimws)
best_overall_fitness <- sapply(best_overall_fitness, function(x) {
gsub(".* ", "", x)
})
best_overall_fitness <- list(
"Fitness" = as.numeric(best_overall_fitness[[1]]),
"Iteration" = best_overall_fitness[[2]],
"Model" = best_overall_fitness[[3]]
)
elapsed_time <- get_value_from_messages(messages,
type = "character",
pattern = paste0("Elapsed time", sep, " ([0-9.]+ \\w+)"))
search_overview <- list(best_overall_fitness = best_overall_fitness,
num_models_in_search_space = num_models_in_search_space,
num_considered_models = num_considered_models,
num_models_run_in_search = num_models_run_in_search,
num_unique_models_to_best_model = num_unique_models_to_best_model,
time_to_best_model = time_to_best_model,
elapsed_time = elapsed_time
)
return(search_overview)
}
get_value_from_messages <- function(messages, pattern, type = c("numeric", "character")) {
matches <- gregexpr(pattern, messages)
extracted_values <- regmatches(messages, matches)
extracted_values <- Filter(length, extracted_values)
# Extract the numeric value from the first match (if exists)
if (length(extracted_values) > 0) {
extracted_value <- gsub(pattern, "\\1", extracted_values[[1]])
# Convert to numeric if needed
if (type == "numeric") {
extracted_value <- as.numeric(extracted_value)
}
} else {
extracted_value <- NULL
}
if (is.null(extracted_value)) {
index <- grep(pattern, messages)
if (length(index) > 0) {
relevant_string <- messages[index]
extracted_value <- sub(paste0(".*", pattern, ".*", collapse = ""), "\\1", relevant_string)
if (type == "numeric") {
extracted_value <- as.numeric(extracted_value)
}
}
}
return(extracted_value)
}
#' Summarise fitness by iteration
#'
#' Summarise minimum, cumulative minimum, and mean fitness values by pyDarwin search iteration/generation.
#'
#' @param darwin_data Object of class \code{darwin_data}.
#'
#' @return \code{data.frame} with columns \code{iteration},
#' \code{min_fitness}, \code{mean_fitness}, and \code{min_cum_fitness}
#' @export
#'
summarise_fitness_by_iteration <- function(darwin_data) {
stopifnot(inherits(darwin_data, "darwin_data"))
crash_value <- get_crash_value(darwin_data)
fitness_by_iteration <- darwin_data$results %>%
filter(fitness < crash_value) %>%
group_by(iteration)
iteration_number <- fitness_by_iteration %>%
select(iteration) %>%
distinct() %>%
ungroup() %>%
mutate(n = row_number())
fitness_by_iteration <- fitness_by_iteration %>%
summarise(min_fitness = min(fitness),
mean_fitness = mean(fitness)) %>%
left_join(iteration_number, by = "iteration") %>%
arrange(n) %>%
mutate(iteration = factor(iteration, levels = unique(iteration))) %>%
select(-n) %>%
mutate(min_cum_fitness = cummin(min_fitness))
return(fitness_by_iteration)
}
#' Summarize minimum fitness and penalty values by iteration
#'
#' Summarise minimum fitness, ofv, and penalty values used in calculation of overall fitness values by pyDarwin search iteration/generation.
#'
#' @param darwin_data Object of class \code{darwin_data}.
#' @param group_penalties Logical. Set to \code{TRUE} to group penalties.
#'
#' @return \code{data.frame} of columns \code{"iteration", "fitness", "ofv"} and corresponding penalty columns.
#' @export
#'
summarise_fitness_penalties_by_iteration <- function(darwin_data, group_penalties = FALSE) {
stopifnot(inherits(darwin_data, "darwin_data"))
crash_value <- get_crash_value(darwin_data)
penalties <- get_penalties(darwin_data)
fitness_penalties_by_iteration <- darwin_data$results %>%
filter(fitness < crash_value) %>%
group_by(iteration) %>%
filter(fitness == min(fitness)) %>%
dplyr::distinct(iteration, fitness, .keep_all = TRUE) %>%
ungroup() %>%
mutate(penalty_ntheta = ntheta * penalties$theta,
penalty_nomega = nomega * penalties$omega,
penalty_nsigma = nsigm * penalties$sigma,
penalty_success = ifelse(success, 0, penalties$convergence),
penalty_covar = ifelse(covariance, 0, penalties$covariance),
penalty_corr = ifelse(`correlation`, 0, penalties$correlation),
penalty_condition = ifelse(`condition num` < 1000, 0, penalties$condition_number),
penalty_r = `r penalty`,
penalty_python = `python penalty`,
iteration = factor(iteration, levels = unique(iteration))
) %>%
select(iteration,
fitness,
ofv,
penalty_ntheta,
penalty_nomega,
penalty_nsigma,
penalty_success,
penalty_covar,
penalty_corr,
penalty_condition,
penalty_r,
penalty_python)
if (group_penalties) {
fitness_penalties_by_iteration <- fitness_penalties_by_iteration %>%
mutate(
penalties_nparams = penalty_ntheta + penalty_nomega + penalty_nsigma,
penalties_covariance_step = penalty_covar + penalty_corr + penalty_condition
) %>%
select(
iteration,
fitness,
ofv,
penalties_nparams,
penalties_covariance_step,
penalty_success,
penalty_r,
penalty_python
)
}
# only select penalties that have non-zero values
fitness_penalties_by_iteration <- fitness_penalties_by_iteration %>%
select_if(function(col) any(col != 0))
return(fitness_penalties_by_iteration)
}
summarise_fitness_penalties_by_key_models <- function(darwin_data, group_penalties = FALSE) {
stopifnot(inherits(darwin_data, "darwin_data"))
stopifnot(!is.null(darwin_data$key_models))
crash_value <- get_crash_value(darwin_data)
penalties <- get_penalties(darwin_data)
if (darwin_data$software == "nlme") {
model_prefix <- "NLME"
} else {
model_prefix <- "NM"
}
results <- darwin_data$results %>%
dplyr::mutate(model_iteration = paste(model_prefix,
iteration,
sprintf("%02d", `model number`),
sep = "_"))
fitness_penalties_by_key_models <-
data.frame(model_iteration = names(darwin_data$key_models$run_dirs),
run_dir = unlist(darwin_data$key_models$run_dirs))
fitness_penalties_by_key_models$model_iteration <-
unlist(
lapply(
fitness_penalties_by_key_models$model_iteration,
zero_pad_last,
num_zeros = 2
)
)
fitness_penalties_by_key_models <- fitness_penalties_by_key_models %>%
dplyr::left_join(results, by = "model_iteration") %>%
dplyr::mutate(penalty_ntheta = ntheta * penalties$theta,
penalty_nomega = nomega * penalties$omega,
penalty_nsigma = nsigm * penalties$sigma,
penalty_success = ifelse(success, 0, penalties$convergence),
penalty_covar = ifelse(covariance, 0, penalties$covariance),
penalty_corr = ifelse(`correlation`, 0, penalties$correlation),
penalty_condition = ifelse(`condition num` < 1000, 0, penalties$condition_number),
penalty_r = `r penalty`,
penalty_python = `python penalty`,
iteration = factor(iteration, levels = unique(iteration))
) %>%
dplyr::select(iteration,
model_iteration,
run_dir,
fitness,
ofv,
penalty_ntheta,
penalty_nomega,
penalty_nsigma,
penalty_success,
penalty_covar,
penalty_corr,
penalty_condition,
penalty_r,
penalty_python)
if (group_penalties) {
fitness_penalties_by_key_models <- fitness_penalties_by_key_models %>%
mutate(
penalties_nparams = penalty_ntheta + penalty_nomega + penalty_nsigma,
penalties_covariance_step = penalty_covar + penalty_corr + penalty_condition
) %>%
select(
iteration,
model_iteration,
run_dir,
fitness,
ofv,
penalties_nparams,
penalties_covariance_step,
penalty_success,
penalty_r,
penalty_python
)
}
# only select penalties that have non-zero values
fitness_penalties_by_key_models <- fitness_penalties_by_key_models %>%
select_if(function(col) any(col != 0))
return(fitness_penalties_by_key_models)
}
init_penalties <- function() {
list(
condition_number = 100,
convergence = 100,
correlation = 100,
covariance = 100,
non_influential_tokens = 1e-05,
omega = 10,
sigma = 10,
theta = 10
)
}
#' update method for darwin_data
#'
#' @param object class darwin_dat
#' @param ... additional args
#'
#' @noRd
#' @keywords internal
update.darwin_data <- function(object, ...) {
args <- list(...)
for (i in names(args)) {
object[[i]] <- args[[i, exact=TRUE]]
}
object
}
#ensure model number has correct leading zeroes
zero_pad_last <- function(x, num_zeros = 2) {
parts <- strsplit(x, "_")[[1]]
last_part <- parts[length(parts)]
new_last_part <- sprintf(paste0("%0", num_zeros, "d"), as.integer(last_part))
parts[length(parts)] <- new_last_part
return(paste(parts, collapse = "_"))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.