R/Familiar.R


#' familiar: Fully Automated Machine Learning with Interpretable Analysis of Results
#'
#' End-to-end, automated machine learning package for creating
#' trustworthy and interpretable machine learning models. Familiar supports
#' modelling of regression, categorical and time-to-event (survival) outcomes.
#' Models created using familiar are self-contained, and their use does not
#' require additional information such as baseline survival, feature
#' clustering, or feature transformation and normalisation parameters. In
#' addition, a novelty or out-of-distribution detector is trained
#' simultaneously and included with every model. Model performance,
#' calibration, risk group stratification, (permutation) variable importance,
#' individual conditional expectation, partial dependence, and more, are
#' assessed automatically as part of the evaluation process, exported in
#' tabular format and plotted. They may also be computed manually using the
#' export and plot functions. Where possible, metrics and values obtained
#' during the evaluation process come with confidence intervals.
#'
#' @docType package
#' @name familiar
#' @import data.table
#' @import methods
#' @importClassesFrom rstream rstream.runif
#' @importFrom stats predict coef vcov
#' @importFrom survival Surv coxph survreg
#' @importFrom utils head tail
#' @importFrom rlang quo quos enquo enquos sym syms ensym ensyms parse_expr parse_exprs
"_PACKAGE"



#' Perform end-to-end machine learning and data analysis
#'
#' @param formula An R formula. The formula can only contain feature names and
#'   dot (`.`). The `*` and `+1` operators are not supported as these refer to
#'   columns that are not present in the data set.
#'   
#'   Use of the formula interface is optional.
#'
#' @param data A `data.table` object, a `data.frame` object, a list containing
#'   multiple `data.table` or `data.frame` objects, or paths to data files.
#'
#'   `data` should be provided if no file paths are provided to the `data_files`
#'   argument. If both are provided, only `data` will be used.
#'
#'   All data is expected to be in wide format, and ideally has a sample
#'   identifier (see `sample_id_column`), batch identifier (see `cohort_column`)
#'   and outcome columns (see `outcome_column`).
#'
#'   In case paths are provided, the data should be stored as `csv`, `rds` or
#'   `RData` files. See documentation for the `data_files` argument for more
#'   information.
#'   
#' @param experiment_data Experimental data may be provided in the form of an
#'   `experimentData` object, as created by the `precompute_data_assignment`,
#'   `precompute_feature_info` and `precompute_vimp` functions. Such objects
#'   are used to warm-start an experiment with previously computed data
#'   assignment, feature information and variable importance.
#'
#' @param cl Cluster created using the `parallel` package. This cluster is then
#'   used to speed up computation through parallelisation. When a cluster is not
#'   provided, parallelisation is performed by setting up a cluster on the local
#'   machine.
#'
#'   This parameter has no effect if the `parallel` argument is set to `FALSE`.
#' @param config List containing configuration parameters, or path to an `xml`
#'   file containing these parameters. An empty configuration file can be
#'   obtained using the `get_xml_config` function.
#'
#'   All parameters can also be set programmatically. These supersede any
#'   arguments derived from the configuration list.
#'
#' @param config_id Identifier for the configuration in case the list or `xml`
#'   table indicated by `config` contains more than one set of configurations.
#' @param verbose Indicates verbosity of the results. Default is `TRUE`, and
#'   all messages and warnings are returned.
#' @param .stop_after Variable for internal use.
#'
#' @inheritDotParams .parse_file_paths -config -verbose
#' @inheritDotParams .parse_experiment_settings -config
#' @inheritDotParams .parse_setup_settings -config
#' @inheritDotParams .parse_preprocessing_settings -config -data -parallel -outcome_type
#' @inheritDotParams .parse_feature_selection_settings -config -data -parallel -outcome_type
#' @inheritDotParams .parse_model_development_settings -config -data -parallel -outcome_type
#' @inheritDotParams .parse_hyperparameter_optimisation_settings -config -parallel -outcome_type
#' @inheritDotParams .parse_evaluation_settings -config -data -parallel -outcome_type -hpo_metric -development_batch_id -vimp_aggregation_rank_threshold -vimp_aggregation_method -prep_cluster_method -prep_cluster_linkage_method -prep_cluster_cut_method -prep_cluster_similarity_threshold -prep_cluster_similarity_metric
#'
#' @return Nothing. All output is written to the experiment directory. If the 
#'   experiment directory is in a temporary location, a list with all
#'   familiarModel, familiarEnsemble, familiarData and familiarCollection
#'   objects will be returned.
#'
#' @inherit .parse_general_settings references
#'
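#' @examples
#' \dontrun{
#' # A minimal sketch of an end-to-end experiment; the feature selection
#' # method is illustrative -- see the vignettes for the available methods
#' # and learners. Because no experiment directory is set, output is written
#' # to a temporary location and the familiar objects are returned as a list.
#' data <- iris[iris$Species %in% c("setosa", "versicolor"), ]
#' data$Species <- droplevels(data$Species)
#'
#' results <- summon_familiar(
#'   data = data,
#'   outcome_column = "Species",
#'   outcome_type = "binomial",
#'   experimental_design = "fs+mb",
#'   fs_method = "mrmr",
#'   learner = "glm_logistic",
#'   parallel = FALSE)
#' }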
#' @export
#' @md
summon_familiar <- function(
    formula = NULL,
    data = NULL,
    experiment_data = NULL,
    cl = NULL,
    config = NULL,
    config_id = 1L,
    verbose = TRUE,
    .stop_after = "evaluation",
    ...) {
  
  # Set options.
  # Disable randomForestSRC OpenMP core use.
  options(rf.cores = 1L)
  on.exit(options(rf.cores = -1L), add = TRUE)
  
  # Disable multithreading on data.table to prevent reduced performance due to
  # resource collisions with familiar parallelisation.
  data.table::setDTthreads(1L)
  on.exit(data.table::setDTthreads(0L), add = TRUE)
  
  # Check .stop_after
  .check_parameter_value_is_valid(
    x = .stop_after,
    var_name = ".stop_after",
    values = c("setup", "preprocessing", "vimp", "training", "evaluation"))
  
  
  # Load configuration file ----------------------------------------------------
  config <- .load_configuration_file(
    config = config,
    config_id = config_id)
  
  
  # Test arguments provided by ... and config ----------------------------------
  .check_dots_is_parameter(dots = names(list(...)))
  .check_configuration_tag_is_parameter(config = config)
  
  
  # File paths -----------------------------------------------------------------
  # Parse file paths
  file_paths <- do.call(
    .parse_file_paths,
    args = c(
      list("config" = config, "verbose" = verbose),
      list(...)))
  
  # Set paths to data
  if (!is.null(file_paths$data) && is.null(data)) {
    data <- file_paths$data
  } 
  
  # Make sure to remove the directory if it is on the temporary path.
  if (file_paths$is_temporary) {
    on.exit(
      unlink(file_paths$experiment_dir, recursive = TRUE),
      add = TRUE)
  } 
  
  # Load data ------------------------------------------------------------------
  dots <- list(...)
  
  # Add placeholder for experimental_design in case experiment_data is set.
  if (is.null(dots$experimental_design) && !is.null(experiment_data)) {
    dots$experimental_design <- "fs+mb"
  }
  
  # Parse experiment and data settings
  settings <- do.call(
    .parse_initial_settings,
    args = c(list("config" = config), dots))
  
  if (is(data, "dataObject")) {
    # Reconstitute settings from the data.
    new_settings <- extract_settings_from_data(data)
    
    # Replace settings with new settings.
    settings$data[names(new_settings$data)] <- new_settings$data
    
    # Extract data as a data.table.
    data <- data@data
    
  } else {
    # Load data.
    data <- do.call(
      .load_data,
      args = c(list("data" = data), settings$data))
    
    # Update settings
    settings <- .update_initial_settings(
      formula = formula,
      data = data,
      settings = settings)
  }
  
  # Parse data
  data <- .finish_data_preparation(
    data = data,
    sample_id_column = settings$data$sample_col,
    batch_id_column = settings$data$batch_col,
    series_id_column = settings$data$series_col,
    outcome_column = settings$data$outcome_col,
    outcome_type = settings$data$outcome_type,
    include_features = settings$data$include_features,
    class_levels = settings$data$class_levels,
    censoring_indicator = settings$data$censoring_indicator,
    event_indicator = settings$data$event_indicator,
    competing_risk_indicator = settings$data$competing_risk_indicator,
    reference_method = settings$data$reference_method)
  
  
  # Load experiment data -------------------------------------------------------
  if (!is.null(experiment_data)) {
    
    # Write experiment data to the file system
    experiment_data <- load_experiment_data(
      experiment_data,
      file_paths = file_paths)
    
    # Force the use of existing experiment data.
    settings$data$exp_design <- .get_iteration_file_name(
      project_id = experiment_data@project_id,
      file_paths = file_paths)
  }
  
  rm(experiment_data)
  
  # Experimental setup and settings --------------------------------------------
  
  # Derive experimental design
  experiment_setup <- extract_experimental_setup(
    experimental_design = settings$data$exp_design,
    file_dir = file_paths$iterations_dir,
    verbose = verbose)
  
  # Check experiment settings
  settings <- .update_experimental_design_settings(
    section_table = experiment_setup,
    data = data,
    settings = settings)
  
  # Import remaining settings
  settings <- .parse_general_settings(
    config = config,
    data = data,
    settings = settings,
    ...)
  
  # Create a generic outcome object
  outcome_info <- create_outcome_info(settings = settings)
  
  # Define iterations etc.
  project_info <- .get_iteration_data(
    file_paths = file_paths,
    data = data,
    experiment_setup = experiment_setup,
    settings = settings,
    verbose = verbose)
  
  # In case the iterations are loaded from an iterations file provided by the
  # user, perform some checks on the experimental design given the current
  # data set.
  if (is.waive(experiment_setup)) {
    # Replace experiment_setup with the setup that was loaded from the
    # iterations file.
    experiment_setup <- project_info$experiment_setup
    
    # Check experiment settings for the current dataset.
    settings <- .update_experimental_design_settings(
      section_table = experiment_setup,
      data = data,
      settings = settings)
  }
  
  # Backend and parallelisation ------------------------------------------------
  
  # Identify if an external cluster is provided, and required.
  if (settings$run$parallel) {
    is_external_cluster <- inherits(cl, "cluster")
    
  } else {
    is_external_cluster <- FALSE
  }
  
  # Assign objects that should be accessible everywhere to the familiar global
  # environment. Note that .assign_data_to_backend will also start backend
  # server processes.
  .assign_settings_to_global(settings = settings)
  .assign_file_paths_to_global(file_paths = file_paths)
  .assign_project_info_to_global(project_info = project_info)
  .assign_outcome_info_to_global(outcome_info = outcome_info)
  .assign_backend_options_to_global(
    backend_type = settings$run$backend_type,
    server_port = settings$run$server_port)
  .assign_data_to_backend(
    data = data,
    backend_type = settings$run$backend_type,
    server_port = settings$run$server_port)

  .assign_parallel_options_to_global(
    is_external_cluster = is_external_cluster,
    restart_cluster = settings$run$restart_cluster,
    n_cores = settings$run$parallel_nr_cores,
    cluster_type = settings$run$cluster_type)
  
  # Make sure that the backend server will close after the process finishes.
  on.exit(
    shutdown_backend_server(
      backend_type = settings$run$backend_type,
      server_port = settings$run$server_port),
    add = TRUE)
  
  if (settings$run$parallel &&
      !settings$run$restart_cluster &&
      !is_external_cluster) {
    # Start local cluster in the overall process.
    cl <- .restart_cluster(cl = NULL, assign = "all")
    on.exit(.terminate_cluster(cl), add = TRUE)
    
  } else if (settings$run$parallel &&
             settings$run$restart_cluster &&
             !is_external_cluster) {
    # Start processes locally.
    cl <- waiver()
    
  } else if (settings$run$parallel && is_external_cluster) {
    # Make sure that everything is present on the external cluster.
    cl <- .update_cluster(cl = cl, assign = "all")
    
  } else if (!settings$run$parallel) {
    # No cluster is created when parallel processing is disabled.
    cl <- NULL
  }
  
  # Clean familiar environment on exit. This is run last to avoid cleaning up
  # the familiar environment prior to shutting down the socket server process.
  on.exit(.clean_familiar_environment(), add = TRUE)
  
  if (.stop_after %in% c("setup")) {
    return(create_experiment_data(
      project_id = project_info$project_id,
      experiment_setup = experiment_setup,
      iteration_list = project_info$iter_list))
  }
  
  # Pre-processing -------------------------------------------------------------
  feature_info <- NULL
  
  # Start pre-processing
  run_preprocessing(
    cl = cl,
    feature_info_list = feature_info,
    project_info = project_info,
    settings = settings,
    file_paths = file_paths,
    verbose = verbose)
  
  # Check if the process should be stopped at this point.
  if (.stop_after %in% c("preprocessing")) {
    return(create_experiment_data(
      project_id = project_info$project_id,
      experiment_setup = experiment_setup,
      iteration_list = project_info$iter_list,
      feature_info = get_feature_info_from_backend(
        data_id = waiver(),
        run_id = waiver())))
  }
  
  # Variable importance --------------------------------------------------------
  
  # Start feature selection
  run_feature_selection(
    cl = cl,
    project_list = project_info,
    settings = settings,
    file_paths = file_paths,
    verbose = verbose)
  
  # Check if the process should be stopped at this point.
  if (.stop_after %in% c("vimp")) {
    return(create_experiment_data(
      project_id = project_info$project_id,
      experiment_setup = experiment_setup,
      iteration_list = project_info$iter_list,
      feature_info = get_feature_info_from_backend(
        data_id = waiver(),
        run_id = waiver()),
      vimp_table_list = .retrieve_feature_selection_data(
        fs_method = settings$fs$fs_method,
        project_list = project_info,
        file_paths = file_paths)))
  }
  
  # Training -------------------------------------------------------------------
  
  # Start model building
  run_model_development(
    cl = cl,
    project_list = project_info,
    settings = settings,
    file_paths = file_paths,
    verbose = verbose)
  
  # Check if the process should be stopped at this point.
  if (.stop_after %in% c("training")) {
    return(.import_all_familiar_objects(file_paths = file_paths)$familiarModel)
  }
  
  # Explanation and evaluation -------------------------------------------------
  
  # Start evaluation
  run_evaluation(
    cl = cl,
    project_list = project_info,
    settings = settings,
    file_paths = file_paths,
    verbose = verbose)
  
  if (file_paths$is_temporary) {
    # Collect all familiarModels, familiarEnsemble, familiarData and
    # familiarCollection objects.
    familiar_list <- .import_all_familiar_objects(file_paths = file_paths)
    
    # Return list with objects
    return(familiar_list)
    
  } else {
    return(invisible(TRUE))
  }
}



#' Pre-compute data assignment
#'
#' @description Creates data assignment.
#'
#' @param experimental_design (**required**) Defines what the experiment looks
#'   like, e.g. `cv(bt(fs,20)+mb,3,2)` for 2 times repeated 3-fold
#'   cross-validation with nested feature selection on 20 bootstraps and
#'   model-building. The basic workflow components are:
#'
#'   * `fs`: (required) feature selection step.
#'
#'   * `mb`: (required) model building step.
#'
#'   * `ev`: (optional) external validation. If validation batches or cohorts
#'   are present in the dataset (`data`), these should be indicated in the
#'   `validation_batch_id` argument.
#'
#'   The different components are linked using `+`.
#'
#'   Different subsampling methods can be used in conjunction with the basic
#'   workflow components:
#'
#'   * `bs(x,n)`: (stratified) .632 bootstrap, with `n` the number of
#'   bootstraps. In contrast to `bt`, feature pre-processing parameters and
#'   hyperparameter optimisation are conducted on individual bootstraps.
#'
#'   * `bt(x,n)`: (stratified) .632 bootstrap, with `n` the number of
#'   bootstraps. Unlike `bs` and other subsampling methods, no separate
#'   pre-processing parameters or optimised hyperparameters will be determined
#'   for each bootstrap.
#'
#'   * `cv(x,n,p)`: (stratified) `n`-fold cross-validation, repeated `p` times.
#'   Pre-processing parameters are determined for each iteration.
#'
#'   * `lv(x)`: leave-one-out-cross-validation. Pre-processing parameters are
#'   determined for each iteration.
#'
#'   * `ip(x)`: imbalance partitioning for addressing class imbalances on the
#'   data set. Pre-processing parameters are determined for each partition. The
#'   number of partitions generated depends on the imbalance correction method
#'   (see the `imbalance_correction_method` parameter).
#'
#'   As shown in the example above, sampling algorithms can be nested.
#'
#'   Though variable importance is not determined and models are not learned
#'   within `precompute_data_assignment`, the corresponding `fs` and `mb`
#'   elements are still required to prevent issues when using the resulting
#'   `experimentData` object to warm-start the experiments.
#'
#'   The simplest valid experimental design is `fs+mb`. This is the default in
#'   `precompute_data_assignment`, and will simply assign all instances to the
#'   training set.
#'
#' @inheritParams summon_familiar
#' @inheritParams .parse_feature_selection_settings
#' @inheritDotParams .parse_experiment_settings -config
#' @inheritDotParams .parse_setup_settings -config
#' @inheritDotParams .parse_preprocessing_settings -config -data -parallel
#'   -outcome_type
#'
#' @details This is a thin wrapper around `summon_familiar`, and functions like
#'   it, but automatically skips computation of variable importance, learning
#'   and subsequent evaluation steps.
#'
#'   The function returns an `experimentData` object, which can be used to
#'   warm-start other experiments by providing it to the `experiment_data`
#'   argument.
#'
#' @return An `experimentData` object.
#'
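#' @examples
#' \dontrun{
#' # A minimal sketch; the data object and column names are placeholders.
#' # The resulting experimentData object can be used to warm-start other
#' # experiments through their experiment_data argument.
#' assignment_data <- precompute_data_assignment(
#'   data = data,
#'   experimental_design = "bs(fs+mb,20)",
#'   outcome_column = "outcome",
#'   outcome_type = "binomial",
#'   parallel = FALSE)
#'
#' summon_familiar(
#'   data = data,
#'   experiment_data = assignment_data,
#'   fs_method = "mrmr",
#'   learner = "glm_logistic",
#'   parallel = FALSE)
#' }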
#' @export
precompute_data_assignment <- function(
    formula = NULL,
    data = NULL,
    experiment_data = NULL,
    cl = NULL,
    experimental_design = "fs+mb",
    verbose = TRUE,
    ...) {
  
  # Isolate dots.
  dots <- list(...)
  
  # Drop skip_evaluation_elements, fs_method and learner if present.
  dots$skip_evaluation_elements <- NULL
  dots$fs_method <- NULL
  dots$learner <- NULL
  
  # Summon a familiar and compute everything up to the data assignment.
  experiment_data <- do.call(
    summon_familiar,
    args = c(
      list(
        "formula" = formula,
        "data" = data,
        "experiment_data" = experiment_data,
        "cl" = cl,
        "experimental_design" = experimental_design,
        "fs_method" = "none",
        "learner" = "glm",
        "skip_evaluation_elements" = "all",
        "verbose" = verbose,
        ".stop_after" = "setup"),
      dots))
  
  # Return the experimentData object.
  return(experiment_data)
}



#' Pre-compute feature information
#'
#' @description Creates data assignment and subsequently extracts feature
#'   information such as normalisation and clustering parameters.
#'
#' @param experimental_design (**required**) Defines what the experiment looks
#'   like, e.g. `cv(bt(fs,20)+mb,3,2)` for 2 times repeated 3-fold
#'   cross-validation with nested feature selection on 20 bootstraps and
#'   model-building. The basic workflow components are:
#'
#'   * `fs`: (required) feature selection step.
#'
#'   * `mb`: (required) model building step.
#'
#'   * `ev`: (optional) external validation. If validation batches or cohorts
#'   are present in the dataset (`data`), these should be indicated in the
#'   `validation_batch_id` argument.
#'
#'   The different components are linked using `+`.
#'
#'   Different subsampling methods can be used in conjunction with the basic
#'   workflow components:
#'
#'   * `bs(x,n)`: (stratified) .632 bootstrap, with `n` the number of
#'   bootstraps. In contrast to `bt`, feature pre-processing parameters and
#'   hyperparameter optimisation are conducted on individual bootstraps.
#'
#'   * `bt(x,n)`: (stratified) .632 bootstrap, with `n` the number of
#'   bootstraps. Unlike `bs` and other subsampling methods, no separate
#'   pre-processing parameters or optimised hyperparameters will be determined
#'   for each bootstrap.
#'
#'   * `cv(x,n,p)`: (stratified) `n`-fold cross-validation, repeated `p` times.
#'   Pre-processing parameters are determined for each iteration.
#'
#'   * `lv(x)`: leave-one-out-cross-validation. Pre-processing parameters are
#'   determined for each iteration.
#'
#'   * `ip(x)`: imbalance partitioning for addressing class imbalances on the
#'   data set. Pre-processing parameters are determined for each partition. The
#'   number of partitions generated depends on the imbalance correction method
#'   (see the `imbalance_correction_method` parameter).
#'
#'   As shown in the example above, sampling algorithms can be nested.
#'
#'   Though variable importance is not determined and models are not learned
#'   within `precompute_feature_info`, the corresponding `fs` and `mb`
#'   elements are still required to prevent issues when using the resulting
#'   `experimentData` object to warm-start the experiments.
#'
#'   The simplest valid experimental design is `fs+mb`. This is the default in
#'   `precompute_feature_info`, and will determine feature parameters over the
#'   entire dataset.
#'   
#'   This argument is ignored if the `experiment_data` argument is set.
#'
#' @inheritParams summon_familiar
#' @inheritParams .parse_feature_selection_settings
#' @inheritDotParams .parse_experiment_settings -config
#' @inheritDotParams .parse_setup_settings -config
#' @inheritDotParams .parse_preprocessing_settings -config -data -parallel
#'   -outcome_type
#'
#' @details This is a thin wrapper around `summon_familiar`, and functions like
#'   it, but automatically skips computation of variable importance, learning
#'   and subsequent evaluation steps.
#'
#'   The function returns an `experimentData` object, which can be used to
#'   warm-start other experiments by providing it to the `experiment_data`
#'   argument.
#'
#' @return An `experimentData` object.
#'
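#' @examples
#' \dontrun{
#' # A minimal sketch; the data object and column names are placeholders.
#' # With the default "fs+mb" design, feature parameters (e.g. normalisation
#' # and clustering) are determined over the entire dataset.
#' feature_info_data <- precompute_feature_info(
#'   data = data,
#'   experimental_design = "fs+mb",
#'   outcome_column = "outcome",
#'   outcome_type = "binomial",
#'   parallel = FALSE)
#'
#' # Warm-start a full experiment with the precomputed feature information.
#' summon_familiar(
#'   data = data,
#'   experiment_data = feature_info_data,
#'   fs_method = "mrmr",
#'   learner = "glm_logistic",
#'   parallel = FALSE)
#' }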
#' @export
precompute_feature_info <- function(
    formula = NULL,
    data = NULL,
    experiment_data = NULL,
    cl = NULL,
    experimental_design = "fs+mb",
    verbose = TRUE,
    ...) {
  
  # Isolate dots.
  dots <- list(...)
  
  # Drop skip_evaluation_elements, fs_method and learner if present.
  dots$skip_evaluation_elements <- NULL
  dots$fs_method <- NULL
  dots$learner <- NULL
  
  # Summon a familiar and compute everything up to feature information.
  experiment_data <- do.call(
    summon_familiar,
    args = c(
      list(
        "formula" = formula,
        "data" = data,
        "experiment_data" = experiment_data,
        "cl" = cl,
        "experimental_design" = experimental_design,
        "fs_method" = "none",
        "learner" = "glm",
        "skip_evaluation_elements" = "all",
        "verbose" = verbose,
        ".stop_after" = "preprocessing"),
      dots))
  
  # Return the experimentData object.
  return(experiment_data)
}



#' Pre-compute variable importance
#'
#' @description Creates data assignment, extracts feature information and
#'   subsequently computes variable importance.
#'
#' @param experimental_design (**required**) Defines what the experiment looks
#'   like, e.g. `cv(bt(fs,20)+mb,3,2)` for 2 times repeated 3-fold
#'   cross-validation with nested feature selection on 20 bootstraps and
#'   model-building. The basic workflow components are:
#'
#'   * `fs`: (required) feature selection step.
#'
#'   * `mb`: (required) model building step. Though models are not learned by
#'   `precompute_vimp`, this element is still required to prevent issues when
#'   using the resulting `experimentData` object to warm-start the experiments.
#'
#'   * `ev`: (optional) external validation. If validation batches or cohorts
#'   are present in the dataset (`data`), these should be indicated in the
#'   `validation_batch_id` argument.
#'
#'   The different components are linked using `+`.
#'
#'   Different subsampling methods can be used in conjunction with the basic
#'   workflow components:
#'
#'   * `bs(x,n)`: (stratified) .632 bootstrap, with `n` the number of
#'   bootstraps. In contrast to `bt`, feature pre-processing parameters and
#'   hyperparameter optimisation are conducted on individual bootstraps.
#'
#'   * `bt(x,n)`: (stratified) .632 bootstrap, with `n` the number of
#'   bootstraps. Unlike `bs` and other subsampling methods, no separate
#'   pre-processing parameters or optimised hyperparameters will be determined
#'   for each bootstrap.
#'
#'   * `cv(x,n,p)`: (stratified) `n`-fold cross-validation, repeated `p` times.
#'   Pre-processing parameters are determined for each iteration.
#'
#'   * `lv(x)`: leave-one-out-cross-validation. Pre-processing parameters are
#'   determined for each iteration.
#'
#'   * `ip(x)`: imbalance partitioning for addressing class imbalances on the
#'   data set. Pre-processing parameters are determined for each partition. The
#'   number of partitions generated depends on the imbalance correction method
#'   (see the `imbalance_correction_method` parameter).
#'
#'   As shown in the example above, sampling algorithms can be nested.
#'
#'   The simplest valid experimental design is `fs+mb`. This is the default in
#'   `precompute_vimp`, and will compute variable importance over the entire
#'   dataset.
#'   
#'   This argument is ignored if the `experiment_data` argument is set.
#'
#' @inheritParams summon_familiar
#' @inheritParams .parse_feature_selection_settings
#' @inheritDotParams .parse_experiment_settings -config
#' @inheritDotParams .parse_setup_settings -config
#' @inheritDotParams .parse_preprocessing_settings -config -data -parallel
#'   -outcome_type
#' @inheritDotParams .parse_feature_selection_settings
#'   parallel_feature_selection
#'
#' @details This is a thin wrapper around `summon_familiar`, and functions like
#'   it, but automatically skips learning and subsequent evaluation steps.
#'
#'   The function returns an `experimentData` object, which can be used to
#'   warm-start other experiments by providing it to the `experiment_data`
#'   argument. Variable importance may be retrieved from this object using the
#'   `get_vimp_table` and `aggregate_vimp_table` methods.
#'
#' @return An `experimentData` object.
#'
#' @seealso \code{\link{get_vimp_table}}, \code{\link{aggregate_vimp_table}}
#'
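#' @examples
#' \dontrun{
#' # A minimal sketch; the data object, column names and feature selection
#' # method are placeholders.
#' vimp_data <- precompute_vimp(
#'   data = data,
#'   experimental_design = "fs+mb",
#'   fs_method = "mrmr",
#'   outcome_column = "outcome",
#'   outcome_type = "binomial",
#'   parallel = FALSE)
#'
#' # Variable importance can be retrieved from the resulting object.
#' vimp_table <- get_vimp_table(vimp_data)
#' }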
#' @export
precompute_vimp <- function(
    formula = NULL,
    data = NULL,
    experiment_data = NULL,
    cl = NULL,
    experimental_design = "fs+mb",
    fs_method = NULL,
    fs_method_parameter = NULL,
    verbose = TRUE,
    ...) {
  
  # Check that one or more feature selection methods are specified.
  fs_method <- .parse_arg(
    x_config = NULL,
    x_var = fs_method,
    var_name = "fs_method",
    type = "character_list",
    optional = FALSE)
  
  # Parameters may be interpreted as belonging to the specified feature
  # selection method.
  fs_method_parameter <- .parse_arg(
    x_config = NULL,
    x_var = fs_method_parameter,
    var_name = "fs_method_parameter",
    type = "list",
    optional = TRUE,
    default = list())
  
  # Encode fs_method_parameter as expected by parsing it to a nested list.
  if (length(fs_method_parameter) > 0 && length(fs_method) == 1) {
    if (is.null(fs_method_parameter[[fs_method]])) {
      fs_method_parameter_list <- list()
      fs_method_parameter_list[[fs_method]] <- fs_method_parameter
      fs_method_parameter <- fs_method_parameter_list
    }
  }
  
  # Isolate dots.
  dots <- list(...)
  
  # Drop skip_evaluation_elements, and learner if present.
  dots$skip_evaluation_elements <- NULL
  dots$learner <- NULL
  
  # Summon a familiar and compute everything up to variable importance data.
  experiment_data <- do.call(
    summon_familiar,
    args = c(
      list(
        "formula" = formula,
        "data" = data,
        "experiment_data" = experiment_data,
        "cl" = cl,
        "experimental_design" = experimental_design,
        "fs_method" = fs_method,
        "fs_method_parameter" = fs_method_parameter,
        "learner" = "glm",
        "skip_evaluation_elements" = "all",
        "verbose" = verbose,
        ".stop_after" = "vimp"),
      dots))
  
  # Return the experimentData object.
  return(experiment_data)
}



#' Create models using end-to-end machine learning
#'
#' @description Train models using familiar. Evaluation is not performed.
#'
#' @param experimental_design (**required**) Defines what the experiment looks
#'   like, e.g. `cv(bt(fs,20)+mb,3,2)` for 2 times repeated 3-fold
#'   cross-validation with nested feature selection on 20 bootstraps and
#'   model-building. The basic workflow components are:
#'
#'   * `fs`: (required) feature selection step.
#'
#'   * `mb`: (required) model building step.
#'
#'   * `ev`: (optional) external validation. Setting this is not required for
#'   `train_familiar`, but if validation batches or cohorts are present in the
#'   dataset (`data`), these should be indicated in the `validation_batch_id`
#'   argument.
#'
#'   The different components are linked using `+`.
#'
#'   Different subsampling methods can be used in conjunction with the basic
#'   workflow components:
#'
#'   * `bs(x,n)`: (stratified) .632 bootstrap, with `n` the number of
#'   bootstraps. In contrast to `bt`, feature pre-processing parameters and
#'   hyperparameter optimisation are conducted on individual bootstraps.
#'
#'   * `bt(x,n)`: (stratified) .632 bootstrap, with `n` the number of
#'   bootstraps. Unlike `bs` and other subsampling methods, no separate
#'   pre-processing parameters or optimised hyperparameters will be determined
#'   for each bootstrap.
#'
#'   * `cv(x,n,p)`: (stratified) `n`-fold cross-validation, repeated `p` times.
#'   Pre-processing parameters are determined for each iteration.
#'
#'   * `lv(x)`: leave-one-out-cross-validation. Pre-processing parameters are
#'   determined for each iteration.
#'
#'   * `ip(x)`: imbalance partitioning for addressing class imbalances on the
#'   data set. Pre-processing parameters are determined for each partition. The
#'   number of partitions generated depends on the imbalance correction method
#'   (see the `imbalance_correction_method` parameter).
#'
#'   As shown in the example above, sampling algorithms can be nested.
#'
#'   The simplest valid experimental design is `fs+mb`. This is the default in
#'   `train_familiar`, and will create one model for each feature selection
#'   method in `fs_method`. To create more models, a subsampling method should
#'   be introduced, e.g. `bs(fs+mb,20)` to create 20 models based on bootstraps
#'   of the data.
#'   
#'   This argument is ignored if the `experiment_data` argument is set.
#'
#' @param learner (**required**) Name of the learner used to develop a model. A
#'   sizeable number of learners is supported in `familiar`. Please see the
#'   vignette on learners for more information concerning the available
#'   learners. Unlike the `summon_familiar` function, `train_familiar` only
#'   allows for a single learner.
#' @param hyperparameter (*optional*) List, or nested list containing
#'   hyperparameters for learners. If a nested list is provided, each sublist
#'   should have the name of the learner method it corresponds to, with list
#'   elements being named after the intended hyperparameter, e.g.
#'   \code{"glm_logistic"=list("sign_size"=3)}
#'
#'   All learners have hyperparameters. Please refer to the vignette on learners
#'   for more details. If no parameters are provided, sequential model-based
#'   optimisation is used to determine optimal hyperparameters.
#'
#'   Hyperparameters provided by the user are never optimised. However, if more
#'   than one value is provided for a single hyperparameter, optimisation will
#'   be conducted using these values.
#'
#' @inheritParams summon_familiar
#' @inheritDotParams .parse_experiment_settings -config
#' @inheritDotParams .parse_setup_settings -config
#' @inheritDotParams .parse_preprocessing_settings -config -data -parallel -outcome_type
#' @inheritDotParams .parse_feature_selection_settings -config -data -parallel -outcome_type
#' @inheritDotParams .parse_model_development_settings -config -data -parallel -outcome_type
#' @inheritDotParams .parse_hyperparameter_optimisation_settings -config -parallel -outcome_type
#'
#' @details This is a thin wrapper around `summon_familiar`, and functions like
#'   it, but automatically skips all evaluation steps. Only a single learner is
#'   allowed.
#'
#' @return One or more familiarModel objects.
#'   
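#' @examples
#' \dontrun{
#' # A minimal sketch; the data object, column names and method names are
#' # placeholders. The bs(fs+mb,20) design creates 20 models on bootstraps
#' # of the data. Providing sign_size fixes this hyperparameter instead of
#' # optimising it.
#' models <- train_familiar(
#'   data = data,
#'   experimental_design = "bs(fs+mb,20)",
#'   fs_method = "mrmr",
#'   learner = "glm_logistic",
#'   hyperparameter = list("sign_size" = 5),
#'   outcome_column = "outcome",
#'   outcome_type = "binomial",
#'   parallel = FALSE)
#' }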
#' @export
train_familiar <- function(
    formula = NULL,
    data = NULL,
    experiment_data = NULL,
    cl = NULL,
    experimental_design = "fs+mb",
    learner = NULL,
    hyperparameter = NULL,
    verbose = TRUE,
    ...) {
  
  # Check that a single learner is present.
  learner <- .parse_arg(
    x_config = NULL,
    x_var = learner,
    var_name = "learner",
    type = "character",
    optional = FALSE)
  
  # Hyperparameters may be interpreted as belonging to the specified learner.
  hyperparameter <- .parse_arg(
    x_config = NULL,
    x_var = hyperparameter,
    var_name = "hyperparameter",
    type = "list",
    optional = TRUE,
    default = list())
  
  # Encode hyperparameter as expected by parsing it to a nested list.
  if (length(hyperparameter) > 0 && is.null(hyperparameter[[learner]])) {
    hyperparameter_list <- list()
    hyperparameter_list[[learner]] <- hyperparameter
    hyperparameter <- hyperparameter_list
  }
  
  # Isolate dots.
  dots <- list(...)
  
  # Drop skip_evaluation_elements, project_dir, and experiment_dir if present.
  dots$skip_evaluation_elements <- NULL
  dots$project_dir <- NULL
  dots$experiment_dir <- NULL
  
  # Summon a familiar.
  familiar_models <- do.call(
    summon_familiar,
    args = c(
      list(
        "formula" = formula,
        "data" = data,
        "experiment_data" = experiment_data,
        "cl" = cl,
        "experimental_design" = experimental_design,
        "learner" = learner,
        "hyperparameter" = hyperparameter,
        "experiment_dir" = NULL,
        "project_dir" = NULL,
        "skip_evaluation_elements" = "all",
        "verbose" = verbose,
        ".stop_after" = "training"),
      dots))
  
  # Extract familiar models.
  return(familiar_models)
}



.is_absolute_path <- function(x) {
  # Heuristic check: a path is treated as absolute if its first component
  # (e.g. "" on unix-like systems, yielding "/", or a drive such as "C:" on
  # Windows) exists as a directory.
  return(dir.exists(paste0(
    unlist(strsplit(x, split = .Platform$file.sep))[1],
    .Platform$file.sep)))
}



.load_configuration_file <- function(config, config_id = 1) {
  
  if (!is.null(config)) {
    if (is.character(config)) {
      if (length(config) > 1) {
        stop(paste0(
          "Configuration: the path to the configuration file is expected ",
          "to be a single character string. Multiple strings were found."))
      }
      
      # Normalise file paths
      config <- normalizePath(config)
      
      # Check that the xml2 package is installed
      require_package("xml2", "to configure familiar using a configuration file")
      
      # Read xml file, parse to list and remove comments
      config <- xml2::as_list(xml2::read_xml(config))[[1]][[config_id]]
      config <- .clean_configuration_comments(config = config)
      
    } else {
      if (!is.list(config) || !is.recursive(config)) {
        stop("Configuration: the input configuration data is not a list of lists.")
      }
    }
  } else {
    config <- NULL
  }
  
  return(config)
}



.clean_configuration_comments <- function(config) {
  
  cleaning_cycle <- function(conf_list) {
    # Nested function for iterating through configuration list
    
    # If the current position is no longer a list, skip the following steps
    if (is.list(conf_list)) {
      
      # Retain only those list entries that are not comments "[ comment ]".
      conf_list <- Filter(
        function(x) !(is.character(x[1]) && x[1] == "[ comment ]"),
        conf_list)
      
      # Go one level deeper by applying this function to the list entries of the current list
      conf_list <- lapply(conf_list, cleaning_cycle)
    }
    
    return(conf_list)
  }
  
  # Remove comments by running the recursive cleaning_cycle function.
  config_new <- cleaning_cycle(conf_list = config)
  
  return(config_new)
}



#' Create an empty xml configuration file
#'
#' This function creates an empty configuration xml file in the directory
#' specified by `dir_path`. This provides an alternative to the use of input
#' arguments for familiar.
#'
#' @param dir_path Path to the directory where the configuration file should be
#'  created. The directory should exist, and no file named `config.xml` should
#'  be present.
#'
#' @return Nothing. A file named `config.xml` is created in the directory
#' indicated by `dir_path`.
#' @export
#'
#' @examples
#' \dontrun{
#' # Creates a config.xml file in the working directory
#' get_xml_config(dir_path = getwd())
#' }
#' @md
#' @keywords IO
get_xml_config <- function(dir_path) {
  
  file.copy(
    from = system.file("config.xml", package = "familiar"),
    to = dir_path,
    overwrite = FALSE)
  
  return(invisible(TRUE))
}



.assign_settings_to_global <- function(settings) {
  # Assign settings to the familiar global environment.
  assign(
    x = "settings",
    value = settings,
    envir = familiar_global_env)
  
  return(invisible(TRUE))
}



get_settings <- function() {
  
  # Retrieve settings from the backend
  if (exists("familiar_global_env")) {
    if (exists("settings", where = familiar_global_env)) {
      data_env <- familiar_global_env
      
    } else if (exists("settings", where = .GlobalEnv)) {
      data_env <- .GlobalEnv
      
    } else {
      stop("Settings not found in backend.")
    }
    
  } else if (exists("settings", where = .GlobalEnv)) {
    data_env <- .GlobalEnv
    
  } else {
    stop("Settings not found in backend.")
  }
  
  return(get("settings", envir = data_env))
}



.assign_file_paths_to_global <- function(file_paths) {
  # Put file_paths into the familiar environment
  assign(
    x = "file_paths",
    value = file_paths,
    envir = familiar_global_env)
  
  return(invisible(TRUE))
}



get_file_paths <- function() {
  
  # Retrieve the paths to files and directories
  if (exists("familiar_global_env")) {
    if (exists("file_paths", where = familiar_global_env)) {
      data_env <- familiar_global_env
      
    } else if (exists("file_paths", where = .GlobalEnv)) {
      data_env <- .GlobalEnv
      
    } else {
      stop("File paths were not found in backend.")
    }
    
  } else if (exists("file_paths", where = .GlobalEnv)) {
    data_env <- .GlobalEnv
    
  } else {
    stop("File paths were not found in backend.")
  }
  
  return(get("file_paths", envir = data_env))
}



.assign_project_info_to_global <- function(project_info) {
  # Put project_info_list into the familiar environment
  assign(
    x = "project_info_list",
    value = project_info,
    envir = familiar_global_env)
  
  return(invisible(TRUE))
}



get_project_list <- function() {
  
  # Retrieve the project list
  if (exists("familiar_global_env")) {
    if (exists("project_info_list", where = familiar_global_env)) {
      data_env <- familiar_global_env
      
    } else if (exists("project_info_list", where = .GlobalEnv)) {
      data_env <- .GlobalEnv
      
    } else {
      stop("Project list not found in backend.")
    }
  } else if (exists("project_info_list", where = .GlobalEnv)) {
    data_env <- .GlobalEnv
    
  } else {
    stop("Project list not found in backend.")
  }
  
  return(get("project_info_list", envir = data_env))
}


.import_all_familiar_objects <- function(file_paths) {
  familiar_list <- list()
  
  # Find familiarModel files
  model_files <- list.files(
    path = file_paths$mb_dir,
    pattern = "model.RDS",
    recursive = TRUE)
  model_files <- sapply(
    model_files,
    function(x, dir_path) (file.path(dir_path, x)),
    dir_path = file_paths$mb_dir)
  
  # Load familiarModel files and add to list
  familiar_list$familiarModel <- load_familiar_object(model_files)
  
  # Find familiarEnsemble files
  ensemble_files <- list.files(
    path = file_paths$mb_dir,
    pattern = "ensemble.RDS",
    recursive = TRUE)
  ensemble_files <- sapply(
    ensemble_files,
    function(x, dir_path) (file.path(dir_path, x)),
    dir_path = file_paths$mb_dir)
  
  # Load familiarEnsemble files and add to list
  familiar_list$familiarEnsemble <- load_familiar_object(ensemble_files)
  
  # Find familiarData files
  data_files <- list.files(
    path = file_paths$fam_data_dir,
    pattern = "data.RDS")
  data_files <- sapply(
    data_files,
    function(x, dir_path) (file.path(dir_path, x)),
    dir_path = file_paths$fam_data_dir)
  
  # Load familiarData files and add to list
  familiar_list$familiarData <- load_familiar_object(data_files)
  
  # Find familiarCollection files
  coll_files <- list.files(
    path = file_paths$fam_coll_dir,
    pattern = "ensemble_data_|pooled_data.RDS")
  coll_files <- sapply(
    coll_files,
    function(x, dir_path) (file.path(dir_path, x)),
    dir_path = file_paths$fam_coll_dir)
  
  # Load familiarCollection files and add to list
  familiar_list$familiarCollection <- load_familiar_object(coll_files)
  
  return(familiar_list)
}



.clean_familiar_environment <- function() {
  # Cleans all objects assigned to the familiar global environment.
  if (exists("familiar_global_env")) {
    rm(
      list = ls(envir = familiar_global_env),
      envir = familiar_global_env)
  }
  
  return(invisible(TRUE))
}
