# R/check_rep_seeds.R

# Defines functions: check_rep_seeds

# Documented in: check_rep_seeds

#' Checks if jobs were run with the same seed
#'
#' Reads the first five lines of every log file inside a folder and extracts
#' the recorded data name, model name, array index and seed from them.
#' Returns every job whose seed was also used by at least one other job.
#'
#' @inheritParams default_params_doc
#'
#' @return A data frame with four columns. Each row contains the information
#'   of one result with a duplicated seed. There will be no rows if there are
#'   no duplicated seeds in any of the logs.
#'   Columns are as follows:
#'   * `Data`: A character vector with the name of the data set where
#'     duplicates were found.
#'   * `Models`: A character vector with the name of the corresponding model.
#'   * `Seeds`: A numeric with the corresponding seed that was duplicated.
#'   * `Array_indices`: A numeric with the corresponding array index.
#' @export
#' @author Pedro Santos Neves
#'
#' @note
#' This function is the preferred method for checking for the presence of
#' repeated seeds.
#' However, it will fail if the log files were generated by older versions of
#' the package, as it expects that the data name, model name, array index and
#' seed are always found on lines 2 to 5 of each log file, in that order.
#' If you encounter issues, try running [check_rep_seeds_depr()] instead.
#' For all other cases, give preference to this function as better
#' optimization for log output parsing is possible.
#'
#' Sub-directories inside `logs_path` are ignored; only files directly inside
#' the folder are read.
#'
#' @seealso [check_rep_seeds_depr()] for deprecated checking compatible
#' with older log files.
#'
#' @examples
#' \dontrun{
#' repeated_seeds <- check_rep_seeds(logs_path = "/logs/")
#' }
check_rep_seeds <- function(logs_path) {
  testit::assert(fact = "Folder exists", dir.exists(logs_path))

  logfiles <- list.files(logs_path, full.names = TRUE)
  # A non-recursive list.files() also returns sub-directories, which
  # readLines() cannot open. Keep regular entries only.
  logfiles <- logfiles[!dir.exists(logfiles)]
  testit::assert(fact = "Folder has logfiles", length(logfiles) >= 1)

  log_heads <- lapply(logfiles, readLines, n = 5)

  # Extract the same line from every log head. Files that are too short yield
  # NA, which grepl() reports as FALSE, so the assertions below catch them.
  get_line <- function(line_number) {
    vapply(
      log_heads,
      function(log_head) log_head[line_number],
      character(1)
    )
  }
  data_lines <- get_line(2L)
  model_lines <- get_line(3L)
  array_lines <- get_line(4L)
  seed_lines <- get_line(5L)

  testit::assert(
    fact = "Data line exists. Try check_rep_seeds_depr() if this fails",
    grepl("Data name:", data_lines)
  )
  testit::assert(
    fact = "Model line exists. Try check_rep_seeds_depr() if this fails",
    grepl("Model name:", model_lines)
  )
  testit::assert(
    fact = "Array line exists. Try check_rep_seeds_depr() if this fails",
    grepl("Running analysis with array index:", array_lines)
  )
  testit::assert(
    fact = "Seed line exists. Try check_rep_seeds_depr() if this fails",
    grepl("Running analysis with seed:", seed_lines)
  )

  # Keep only what follows the last ": " on each line.
  strip_label <- function(lines) {
    sub(".*: ", "", lines)
  }
  data_names <- strip_label(data_lines)
  model_names <- strip_label(model_lines)
  array_indices <- strip_label(array_lines)
  seeds <- strip_label(seed_lines)

  # Flag every occurrence of a seed that appears more than once, including
  # the first one. Seeds are compared as the strings found in the logs.
  is_duplicated <- seeds %in% seeds[duplicated(seeds)]

  data.frame(
    "Data" = data_names[is_duplicated],
    "Models" = model_names[is_duplicated],
    "Seeds" = as.numeric(seeds[is_duplicated]),
    "Array_indices" = as.numeric(array_indices[is_duplicated])
  )
}
# tece-lab/DAISIEutils documentation built on Jan. 31, 2024, 12:09 p.m.