R/ExperimentSetup.R

Defines functions .check_experimental_design_section_table .report_experimental_design .complete_experimental_design_section_table .get_available_subsample_methods .get_experimental_design_section_table .experimental_design_is_file extract_experimental_setup

Documented in extract_experimental_setup

#' Parse experimental design
#'
#' @param message_indent Spacing inserted before messages.
#' @param verbose Sets verbosity.
#' @inheritParams .parse_experiment_settings
#'
#' @details This function converts the experimental_design string
#'
#' @return data.table with subsampler information at different levels of the
#' experimental design.
#'
#' @md
#' @keywords internal
extract_experimental_setup <- function(
    experimental_design,
    file_dir,
    message_indent = 0L,
    verbose = TRUE) {
  
  if (.experimental_design_is_file(
    file_dir = file_dir,
    experimental_design = experimental_design)) {
    
    return(waiver())
  }
  
  # Remove all whitespace
  experimental_design <- gsub(
    pattern = " ",
    replacement = "",
    x = experimental_design)
  
  # Generate a section table
  section_table <- .get_experimental_design_section_table(
    experimental_design = experimental_design)
  
  # Identify the subsampler algorithms
  section_table <- .complete_experimental_design_section_table(
    section_table = section_table,
    experimental_design = experimental_design)
  
  # Check consistency of the table, e.g. feature selection should only appear
  # once, etc.
  .check_experimental_design_section_table(
    section_table = section_table)
  
  # Report experimental design to the user.
  .report_experimental_design(
    section_table = section_table,
    message_indent = message_indent,
    verbose = verbose)
  
  return(section_table)
}



.experimental_design_is_file <- function(
    file_dir,
    experimental_design) {
  # Check if the experimental design argument is actually a path to a file.
  
  if (is.null(file_dir)) return(FALSE)
  
  # Check if the file exists at all.
  if (!file.exists(file.path(file_dir, experimental_design)) &&
      !file.exists(experimental_design)) {
    return(FALSE)
  }
  
  # Check whether the file is an RDS file.
  return(tolower(.file_extension(basename(experimental_design))) == "rds")
}



.get_experimental_design_section_table <- function(experimental_design) {
  
  # Determine the number of experimental levels
  # First we locate the position of parentheses
  left_parenthesis <- gregexpr(
    pattern = "(",
    text = experimental_design,
    fixed = TRUE)[[1]]
  right_parenthesis <- gregexpr(
    pattern = ")",
    text = experimental_design,
    fixed = TRUE)[[1]]
  
  if (left_parenthesis[1] == -1) left_parenthesis <- integer(0)
  if (right_parenthesis[1] == -1) right_parenthesis <- integer(0)
  
  # Subsequently generate the corresponding experimental levels
  experiment_levels <- integer(nchar(experimental_design))
  for (ii in right_parenthesis) {
    experiment_levels[1:ii] <- experiment_levels[1:ii] + 1
  }
  
  for (ii in left_parenthesis) {
    experiment_levels[1L:(ii - 1L)] <- experiment_levels[1L:(ii - 1L)] - 1
  }
  
  # Generate setup sections
  sections <- rle(experiment_levels)
  sections$end <- cumsum(sections$lengths)
  sections$start <- sections$end - sections$lengths + 1L
  class(sections) <- NULL
  
  # Set up data table with experimental sections
  section_table <- data.table::as.data.table(sections)
  section_table[, "lengths" := NULL]
  data.table::setnames(
    x = section_table,
    old = c("values", "end", "start"),
    new = c("exp_level_id", "sect_end", "sect_start"))
  
  # Set up columns to be filled
  section_table[, ":="(
    "ref_data_id" = 0,
    "main_data_id" = 0,
    "feat_sel" = FALSE,
    "model_building" = FALSE,
    "external_validation" = FALSE,
    "perturb_method" = "none",
    "perturb_n_rep" = 0,
    "perturb_n_folds" = 0)]
  
  return(section_table)
}

.get_available_subsample_methods <- function() {
  return(c(
    "main",
    "limited_bootstrap",
    "full_bootstrap",
    "cross_val",
    "loocv",
    "imbalance_partition"))
}


.complete_experimental_design_section_table <- function(
    section_table,
    experimental_design) {
  
  # Suppress NOTES due to non-standard evaluation in data.table
  exp_level_id <- sect_start <- perturb_method <- NULL
  
  # Iterator
  main_data_id_iter <- 1
  
  #Identify samplers------------------------------------------------------------
  # Iterate over sections to set main_data_id
  for (ii in seq_len(nrow(section_table))) {
    
    # Check if ip, bt, lv or cv preceeds the current section
    if (section_table$sect_start[ii] > 2) {
      sampler_str <- substr(
        x = experimental_design,
        start = section_table$sect_start[ii] - 2L,
        stop = section_table$sect_start[ii] - 1L)
      
      # Check for imbalance partition (ip), limited bootstrap (bt), full
      # bootstrap (bs), cross-validation (cv) and leave-one-out-cross-validation
      # (lv)
      if (sampler_str == "bt") {
        section_table$perturb_method[ii] <- "limited_bootstrap"
        section_table$main_data_id[ii] <- main_data_id_iter
        main_data_id_iter <- main_data_id_iter + 1
      
      } else if (sampler_str == "bs") {
        section_table$perturb_method[ii] <- "full_bootstrap"
        section_table$main_data_id[ii] <- main_data_id_iter
        main_data_id_iter <- main_data_id_iter + 1
        
      } else if (sampler_str == "cv") {
        section_table$perturb_method[ii] <- "cross_val"
        section_table$main_data_id[ii] <- main_data_id_iter
        main_data_id_iter <- main_data_id_iter + 1
        
      } else if (sampler_str == "lv") {
        section_table$perturb_method[ii] <- "loocv"
        section_table$main_data_id[ii] <- main_data_id_iter
        main_data_id_iter <- main_data_id_iter + 1
        
      } else if (sampler_str == "ip") {
        section_table$perturb_method[ii] <- "imbalance_partition"
        section_table$main_data_id[ii] <- main_data_id_iter
        main_data_id_iter <- main_data_id_iter + 1
      }
      
      rm(sampler_str)
    } else {
      section_table$perturb_method[ii] <- "main"
      section_table$main_data_id[ii] <- main_data_id_iter
      main_data_id_iter <- main_data_id_iter + 1
    }
  }
  
  rm(main_data_id_iter)
  
  
  # Add main_data_id and ref_data_id -------------------------------------------
  # Iterate over sections to set main_data_id (where missing) and ref_data_id
  for (ii in seq_len(nrow(section_table))) {
    
    if (section_table$main_data_id[ii] == 0) {
      # Make subselection of data at the same level and select only preceding sections
      dt_sub <- section_table[
        exp_level_id == section_table$exp_level_id[ii] &
          sect_start < section_table$sect_start[ii], ]
      
      # Set data id from nearest preceding lower level section
      section_table$main_data_id[ii] <- dt_sub$main_data_id[nrow(dt_sub)]
      
      rm(dt_sub)
    }
    
    # Set reference data id
    if (section_table$exp_level_id[ii] == 0) {
      # At the lowest level (main) there is no reference
      section_table$ref_data_id[ii] <- 0
      
    } else {
      # Make subselection of data one level higher and select only preceding sections
      dt_sub <- section_table[
        exp_level_id == section_table$exp_level_id[ii] - 1 &
          sect_start < section_table$sect_start[ii], ]
      
      # Set data id from nearest preceding lower level section
      section_table$ref_data_id[ii] <- dt_sub$main_data_id[nrow(dt_sub)]
      
      rm(dt_sub)
    }
  }
  
  # Complete details------------------------------------------------------------
  # Add details for perturbations and other sections
  for (ii in seq_len(nrow(section_table))) {
    
    if (section_table$perturb_method[ii] %in% .get_available_subsample_methods()) {
      
      # Create readable string for current data id
      curr_data_id_str <- NULL
      for (jj in which(section_table$main_data_id == section_table$main_data_id[ii])) {
        curr_data_id_str <- c(
          curr_data_id_str,
          substr(
            x = experimental_design,
            start = section_table$sect_start[jj],
            stop = section_table$sect_end[jj]))
      }
      
      curr_data_id_str <- paste0(curr_data_id_str, collapse = "")
      
      # Drop parentheses and split string by comma
      curr_data_id_str <- gsub(
        pattern = "\\(|\\)",
        replacement = "",
        x = curr_data_id_str)
      curr_data_id_str <- strsplit(
        x = curr_data_id_str,
        split = ",",
        fixed = TRUE)[[1]]

      # Check if feature selection is included in the current section
      if (grepl(pattern = "fs", x = curr_data_id_str[1])) {
        section_table$feat_sel[ii] <- TRUE
      }
      
      # Check if model building is included in the current section
      if (grepl(pattern = "mb", x = curr_data_id_str[1])) {
        section_table$model_building[ii] <- TRUE
      }
      
      # Check if external validation is included in the current section
      if (grepl(pattern = "ev", x = curr_data_id_str[1])) {
        section_table$external_validation[ii] <- TRUE
      }
      
      # Read bootstrap data
      if (section_table$perturb_method[ii] %in% c("limited_bootstrap", "full_bootstrap")) {
        if (length(curr_data_id_str) < 2) {
          stop(paste0(
            "The number of bootstraps should be indicated when using the bt ",
            "(bootstrap) subsampler. None was found."))
        }
        
        # Determine the number of bootstraps
        n_reps <- .perform_type_conversion(
          x = curr_data_id_str[2],
          to_type = "integer",
          var_name = "The number of bootstraps",
          req_length = 1L)
        
        # Check whether the number of bootstraps is at least 1
        .check_number_in_valid_range(
          x = n_reps,
          var_name = "The number of bootstraps",
          range = c(1L, Inf))

        # Add the number of bootstraps to the section table
        section_table$perturb_n_rep[ii] <- n_reps
      }
      
      # Read cross-validation settings
      if (section_table$perturb_method[ii] == "cross_val") {
        if (length(curr_data_id_str) < 2L) {
          stop(paste0(
            "The number of folds should be indicated when using the cv ",
            "(cross-validation) subsampler. None was found."))
        }
        
        # Determine the number of folds
        n_folds <- .perform_type_conversion(
          x = curr_data_id_str[2],
          to_type = "integer",
          var_name = "The number of cross-validation folds",
          req_length = 1L)
        
        # Check whether the number of folds is at least 2
        .check_number_in_valid_range(
          x = n_folds,
          var_name = "The number of cross-validations folds",
          range = c(2L, Inf))
        
        # Add number of folds to the section_table
        section_table$perturb_n_folds[ii] <- n_folds
        
        # Check the number of repetitions
        if (length(curr_data_id_str) >= 3) {
          
          n_reps <- .perform_type_conversion(
            x = curr_data_id_str[3],
            to_type = "integer",
            var_name = "The number of cross-validation repetitions",
            req_length = 1L)
          
          # Check whether the number of CV repetitions is at least 1
          .check_number_in_valid_range(
            x = n_folds,
            var_name = "The number of cross-validations repetitions",
            range = c(1L, Inf))
          
          # Add number of repetitions to the section table
          section_table$perturb_n_rep[ii] <- n_reps
          
        } else {
          # Set the number of repetitions to one
          section_table$perturb_n_rep[ii] <- 1L
        }
      }
      
      # Read leave-one-out-cross-validation settings
      if (section_table$perturb_method[ii] == "loocv") {
        section_table$perturb_n_folds[ii] <- -1L
        section_table$perturb_n_rep[ii] <- 1L
      }
      
      rm(curr_data_id_str, jj)
    }
  }
  
  # Remove unnessary rows and columns
  section_table <- section_table[perturb_method != "none", ]
  section_table[, ":="(
    "exp_level_id" = NULL,
    "sect_end" = NULL,
    "sect_start" = NULL)]
  
  return(section_table)
}



.report_experimental_design <- function(
    section_table,
    message_indent = 0L,
    verbose = TRUE) {
  # Suppress NOTES due to non-standard evaluation in data.table
  feat_sel <- model_building <- main_data_id <- NULL
  
  # Report on validation data:
  if (any(section_table$external_validation)) {
    logger_message(
      "Setup report: Validation is external.",
      indent = message_indent,
      verbose = verbose)
    
  } else {
    logger_message(
      "Setup report: Validation is internal only.",
      indent = message_indent,
      verbose = verbose)
  }
  
  # Report on model building and feature selection
  if (any(section_table$feat_sel * section_table$model_building)) {
    main_message <- "Setup report: Feature selection and model building on"
    
    # Iteratively append message
    dt_sub <- section_table[feat_sel == TRUE & model_building == TRUE, ]
    curr_ref_data_id <- dt_sub$main_data_id[1]
    while (curr_ref_data_id > 0) {
      dt_sub <- section_table[main_data_id == curr_ref_data_id, ]
      
      if (dt_sub$perturb_method[1] == "main") {
        main_message <- c(
          main_message,
          "the training data.")
        
      } else if (dt_sub$perturb_method[1] %in% c("limited_bootstrap", "full_bootstrap")) {
        main_message <- c(
          main_message,
          paste0(dt_sub$perturb_n_rep[1], " bootstraps of"))
        
      } else if (dt_sub$perturb_method[1] == "cross_val") {
        main_message <- c(
          main_message,
          paste0(
            dt_sub$perturb_n_rep[1], " repetitions of ",
            dt_sub$perturb_n_folds, "-fold cross validation of"))
      
      } else if (dt_sub$perturb_method[1] == "loocv") {
        main_message <- c(
          main_message,
          "folds of leave-one-out-cross-validation of")
        
      } else if (dt_sub$perturb_method[1] == "imbalance_partition") {
        main_message <- c(
          main_message,
          "class-balanced partitions of")
      } 
      
      curr_ref_data_id <- dt_sub$ref_data_id[1]
    }
    
    logger_message(
      paste0(main_message, collapse = " "),
      indent = message_indent,
      verbose = verbose)
    
  } else {
    # Feature selection first
    main_message <- "Setup report: Feature selection on"
    
    # Iteratively append message
    dt_sub <- section_table[feat_sel == TRUE, ]
    curr_ref_data_id <- dt_sub$main_data_id[1]
    
    while (curr_ref_data_id > 0) {
      
      dt_sub <- section_table[main_data_id == curr_ref_data_id, ]
      
      if (dt_sub$perturb_method[1] == "main") {
        main_message <- c(
          main_message,
          "the training data.")
        
      } else if (dt_sub$perturb_method[1] %in% c("limited_bootstrap", "full_bootstrap")) {
        main_message <- c(
          main_message,
          paste0(dt_sub$perturb_n_rep[1], " bootstraps of"))
        
      } else if (dt_sub$perturb_method[1] == "cross_val") {
        main_message <- c(
          main_message,
          paste0(
            dt_sub$perturb_n_rep[1], " repetitions of ",
            dt_sub$perturb_n_folds, "-fold cross validation of"))
        
      } else if (dt_sub$perturb_method[1] == "loocv") {
        main_message <- c(
          main_message,
          "folds of leave-one-out-cross-validation of")
        
      } else if (dt_sub$perturb_method[1] == "imbalance_partition") {
        main_message <- c(
          main_message,
          "class-balanced partitions of")
      }
      
      curr_ref_data_id <- dt_sub$ref_data_id[1]
    }
    
    logger_message(
      paste0(main_message, collapse = " "),
      indent = message_indent,
      verbose = verbose)
    
    # Model building second
    main_message <- "Setup report: Model building on"
    
    # Iteratively append message
    dt_sub <- section_table[model_building == TRUE, ]
    curr_ref_data_id <- dt_sub$main_data_id[1]
    
    while (curr_ref_data_id > 0) {
      
      dt_sub <- section_table[main_data_id == curr_ref_data_id, ]
      
      if (dt_sub$perturb_method[1] == "main") {
        main_message <- c(
          main_message,
          "the training data.")
        
      } else if (dt_sub$perturb_method[1] %in% c("limited_bootstrap", "full_bootstrap")) {
        main_message <- c(
          main_message,
          paste0(dt_sub$perturb_n_rep[1], " bootstraps of"))
        
      } else if (dt_sub$perturb_method[1] == "cross_val") {
        main_message <- c(
          main_message,
          paste0(
            dt_sub$perturb_n_rep[1], " repetitions of ",
            dt_sub$perturb_n_folds, "-fold cross validation of"))
        
      } else if (dt_sub$perturb_method[1] == "loocv") {
        main_message <- c(
          main_message,
          "folds of leave-one-out-cross-validation of")
        
      } else if (dt_sub$perturb_method[1] == "imbalance_partition") {
        main_message <- c(
          main_message,
          "class-balanced partitions of")
      }
      
      curr_ref_data_id <- dt_sub$ref_data_id[1]
    }
    
    logger_message(
      paste0(main_message, collapse = " "),
      indent = message_indent,
      verbose = verbose)
  }
}


.check_experimental_design_section_table <- function(section_table) {
  if (sum(section_table$feat_sel) > 1) {
    stop(paste0(
      "The fs component for feature selection may only be used once ",
      "in the experimental design."))
  }
  
  if (sum(section_table$feat_sel) == 0) {
    stop(paste0(
      "The fs component for feature selection must appear in the ",
      "experimental design. It was not found."))
  }
  
  if (sum(section_table$model_building) > 1) {
    stop(paste0(
      "The mb component for model building may only be used once ",
      "in the experimental design."))
  }
  
  if (sum(section_table$model_building) == 0) {
    stop(paste0(
      "The mb component for model building must appear in the ",
      "experimental design. It was not found."))
  }
  
  if (sum(section_table$external_validation) > 1) {
    stop(paste0(
      "The ev component for external validation can only appear once ",
      "in the experimental design."))
  }
  
  return(invisible(TRUE))
}

Try the familiar package in your browser

Any scripts or data that you put into this service are public.

familiar documentation built on Sept. 30, 2024, 9:18 a.m.