#' Checks if jobs were run with the same seed
#'
#' Reads the first five lines of every log file inside a folder and extracts
#' the data name, model name, array index and seed recorded there.
#' Returns the entries whose seed appears in more than one log file.
#' Sub-directories of `logs_path` are ignored.
#'
#' @inheritParams default_params_doc
#'
#' @return A data frame with four columns. Each row contains the information
#' of one log file whose seed is shared with at least one other log file.
#' The data frame has zero rows if there are no duplicated seeds in any of
#' the logs.
#' Columns are as follows:
#' * `Data`: A character vector with the name of the data set where duplicates
#' were found.
#' * `Models`: A character vector with the name of the corresponding model.
#' * `Seeds`: A numeric with the corresponding seed that was duplicated.
#' * `Array_indices`: A numeric with the corresponding array index.
#' @export
#' @author Pedro Santos Neves
#'
#' @note
#' This function is the preferred method for checking for the presence of
#' repeated seeds.
#' However, it will fail if the log files were generated by older versions of
#' the package, as it expects that the data name, model name, array index and
#' seed are always on lines 2 to 5 of each log file, in that order. If you
#' encounter issues, try running [check_rep_seeds_depr()] instead. For all
#' other cases, give preference to this function as better optimization for
#' log output parsing is possible.
#'
#' @seealso [check_rep_seeds_depr()] for deprecated checking compatible
#' with older log files.
#'
#' @examples
#' \dontrun{
#' repeated_seeds <- check_rep_seeds(logs_path = "/logs/")
#' }
check_rep_seeds <- function(logs_path) {
  testit::assert(fact = "Folder exists", dir.exists(logs_path))
  logfiles <- list.files(logs_path, full.names = TRUE)
  # A non-recursive list.files() also returns sub-directories, which
  # readLines() cannot read, so drop them before looking for log files.
  logfiles <- logfiles[!dir.exists(logfiles)]
  testit::assert(fact = "Folder has logfiles", length(logfiles) >= 1)

  # Only the header is needed: lines 2 to 5 hold the fields of interest.
  log_heads <- lapply(logfiles, readLines, n = 5)

  n_logs <- length(log_heads)
  data_names <- character(n_logs)
  model_names <- character(n_logs)
  array_indices <- character(n_logs)
  seeds <- character(n_logs)

  for (i in seq_along(log_heads)) {
    data_line <- log_heads[[i]][2]
    model_line <- log_heads[[i]][3]
    array_line <- log_heads[[i]][4]
    seed_line <- log_heads[[i]][5]

    # Logs shorter than five lines yield NA here; grepl() then returns FALSE
    # and the matching assertion fails with a message naming the missing line.
    testit::assert(
      fact = "Data line exists. Try check_rep_seeds_depr() if this fails",
      grepl("Data name:", data_line)
    )
    testit::assert(
      fact = "Model line exists. Try check_rep_seeds_depr() if this fails",
      grepl("Model name:", model_line)
    )
    testit::assert(
      fact = "Array line exists. Try check_rep_seeds_depr() if this fails",
      grepl("Running analysis with array index:", array_line)
    )
    testit::assert(
      fact = "Seed line exists. Try check_rep_seeds_depr() if this fails",
      grepl("Running analysis with seed:", seed_line)
    )

    # Keep only what follows the last ": " on each line.
    data_names[i] <- sub(".*: ", "", data_line)
    model_names[i] <- sub(".*: ", "", model_line)
    array_indices[i] <- sub(".*: ", "", array_line)
    seeds[i] <- sub(".*: ", "", seed_line)
  }

  # Flag every occurrence of a repeated seed, including the first one.
  is_duplicated <- seeds %in% seeds[duplicated(seeds)]

  data.frame(
    "Data" = data_names[is_duplicated],
    "Models" = model_names[is_duplicated],
    "Seeds" = as.numeric(seeds[is_duplicated]),
    "Array_indices" = as.numeric(array_indices[is_duplicated])
  )
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.