# R/DataObject.R

# Defines functions create_data_column_info .get_required_features .process_input_data .preprocess_data ..set_identifier_column

#' @include FamiliarS4Generics.R
#' @include FamiliarS4Classes.R
#' @include LearnerS4Naive.R
NULL


# as_data_object methods -------------------------------------------------------

#'@title Creates a valid data object from input data.
#'
#'@description Creates a `dataObject` object from input data. Input data can be
#'  a `data.frame` or `data.table`, a path to such tables on a local or network
#'  drive, or a path to tabular data that may be converted to these formats.
#'
#'  In addition, a `familiarEnsemble` or `familiarModel` object can be passed
#'  along to check whether the data are formatted correctly, e.g. by checking
#'  the levels of categorical features, whether all expected columns are
#'  present, etc.
#'
#'@param data A `data.frame` or `data.table`, a path to such tables on a local
#'  or network drive, or a path to tabular data that may be converted to these
#'  formats.
#'
#'@param object A `familiarEnsemble` or `familiarModel` object that is used to
#'  check consistency of these objects.
#'
#'@param check_stringency Specifies stringency of various checks. This is mostly:
#'
#'  * `strict`: default value used for `summon_familiar`. Thoroughly checks
#'  input data. Used internally for checking development data.
#'  
#'  * `external_warn`: value used for `extract_data` and related methods. Less
#'  stringent checks, but will warn for possible issues. Used internally for
#'  checking data for evaluation and explanation.
#'  
#'  * `external`: value used for external methods such as `predict`. Less
#'  stringent checks, particularly for identifier and outcome columns, which may
#'  be completely absent. Used internally for `predict`.
#'
#'@inheritParams .parse_experiment_settings
#'
#'@details You can specify settings for your data manually, e.g. the column for
#'  sample identifiers (`sample_id_column`). This prevents you from having to
#'  change the column name externally. In the case you provide a `familiarModel`
#'  or `familiarEnsemble` for the `object` argument, any parameters you provide
#'  take precedence over parameters specified by the object.
#'
#'@return A `dataObject` object.
#'@exportMethod as_data_object
#'@md
#'@rdname as_data_object-methods



## as_data_object (generic) ----------------------------------------------------
# Generic for converting assorted inputs into a dataObject.
setGeneric(
  "as_data_object",
  function(data, ...) {
    standardGeneric("as_data_object")
  }
)



## as_data_object (dataObject) -------------------------------------------------
#'@rdname as_data_object-methods
setMethod(
  "as_data_object",
  signature(data = "dataObject"),
  function(data, object = NULL, ...) {
    # Input is already a dataObject; pass it through unchanged.
    return(data)
  }
)



## as_data_object (data.table) -------------------------------------------------
#'@rdname as_data_object-methods
setMethod(
  "as_data_object",
  signature(data = "data.table"),
  function(
    data,
    object = NULL,
    sample_id_column = waiver(),
    batch_id_column = waiver(),
    series_id_column = waiver(),
    development_batch_id = waiver(),
    validation_batch_id = waiver(),
    outcome_name = waiver(),
    outcome_column = waiver(),
    outcome_type = waiver(),
    event_indicator = waiver(),
    censoring_indicator = waiver(),
    competing_risk_indicator = waiver(),
    class_levels = waiver(),
    exclude_features = waiver(),
    include_features = waiver(),
    reference_method = waiver(),
    check_stringency = "strict",
    ...) {
    # Converts a data.table to a dataObject. Arguments left as waiver() are,
    # where possible, filled in from a familiarModel, familiarEnsemble or
    # familiarNoveltyDetector passed as `object`; user-provided values always
    # take precedence over values stored with the model.
    
    # Suppress NOTES due to non-standard evaluation in data.table
    type <- NULL
    
    # Determine whether the object contains data concerning columns, and
    # outcome. Note that user-provided names always take precedence.
    has_model_object <- is(object, "familiarModel") ||
      is(object, "familiarEnsemble") ||
      is(object, "familiarNoveltyDetector")
    
    # Check whether a model potentially has outcome information.
    if (has_model_object) {
      has_outcome_info_slot <- methods::.hasSlot(object, "outcome_info")
      
    } else {
      has_outcome_info_slot <- FALSE
    }
    
    # Less stringent checks require column and outcome information stored with
    # a model or ensemble object.
    if (check_stringency != "strict") {
      if (!has_model_object) stop("Dummy columns cannot be set without a model or ensemble object.")
    } 
    
    # Attempt to identify a sample identifier column.
    if (is.waive(sample_id_column)) {
      if (has_model_object) {
        if (!is_empty(object@data_column_info)) {
          # Find the sample id column stored with the model.
          model_sample_id_column <- object@data_column_info[type == "sample_id_column"]$external
          
          # Check that the model actually has a column name (not character(0))
          # that is not NA, and set this column name.
          if (length(model_sample_id_column) > 0) {
            if (!is.na(model_sample_id_column)) sample_id_column <- model_sample_id_column
          }
        }
      }
    }
    
    # Attempt to identify a batch identifier column.
    if (is.waive(batch_id_column)) {
      if (has_model_object) {
        if (!is_empty(object@data_column_info)) {
          # Find the batch id column stored with the model.
          model_batch_id_column <- object@data_column_info[type == "batch_id_column"]$external
          
          # Check that the model actually has a column name (not character(0))
          # that is not NA, and set this column name.
          if (length(model_batch_id_column) > 0) {
            if (!is.na(model_batch_id_column)) batch_id_column <- model_batch_id_column
          }
        }
      }
    }
    
    # Attempt to identify a series identifier column.
    if (is.waive(series_id_column)) {
      if (has_model_object) {
        if (!is_empty(object@data_column_info)) {
          # Find the series id column stored with the model.
          model_series_id_column <- object@data_column_info[type == "series_id_column"]$external
          
          # Check that the model actually has a column name (not character(0))
          # that is not NA, and set this column name.
          if (length(model_series_id_column) > 0) {
            if (!is.na(model_series_id_column)) series_id_column <- model_series_id_column
          }
        }
      }
    }
    
    # Development and validation batch ids are not incorporated into
    # familiarModel or familiarEnsemble objects.
    
    # Attempt to identify the name of the outcome. Note: scalar `&&` is used
    # here (and below) for short-circuiting in `if`, consistent with the rest
    # of this method.
    if (is.waive(outcome_name)) {
      if (has_model_object && has_outcome_info_slot) {
        if (is(object@outcome_info, "outcomeInfo")) {
          
          # Check that the outcome name is not empty.
          if (length(object@outcome_info@name) >= 1) outcome_name <- object@outcome_info@name
        }
      }
    }
    
    # Attempt to identify the outcome columns.
    if (is.waive(outcome_column)) {
      if (has_model_object && has_outcome_info_slot) {
        if (!is_empty(object@data_column_info)) {
          # Find the model columns.
          outcome_column <- object@data_column_info[type == "outcome_column"]$external
        }
      }
    }
    
    # Attempt to identify the type of outcome.
    if (is.waive(outcome_type)) {
      if (is(object, "familiarNoveltyDetector")) {
        outcome_type <- "unsupervised"
        
      } else if (has_model_object) {
        outcome_type <- object@outcome_type
      }
    }
    
    # Attempt to identify the event indicator.
    if (is.waive(event_indicator)) {
      if (has_model_object && has_outcome_info_slot) {
        if (is(object@outcome_info, "outcomeInfo")) {
          if (length(object@outcome_info@event) > 0) {
            if (!is.na(object@outcome_info@event)) {
              event_indicator <- object@outcome_info@event
            } 
          }
        }
      }
    }
    
    # Attempt to identify the censoring indicator.
    if (is.waive(censoring_indicator)) {
      if (has_model_object && has_outcome_info_slot) {
        if (is(object@outcome_info, "outcomeInfo")) {
          if (length(object@outcome_info@censored) > 0) {
            if (!is.na(object@outcome_info@censored)) {
              censoring_indicator <- object@outcome_info@censored
            } 
          }
        }
      }
    }
    
    # Attempt to identify the competing risk indicator.
    if (is.waive(competing_risk_indicator)) {
      if (has_model_object && has_outcome_info_slot) {
        if (is(object@outcome_info, "outcomeInfo")) {
          if (length(object@outcome_info@competing_risk) > 0) {
            if (!is.na(object@outcome_info@competing_risk)) {
              competing_risk_indicator <- object@outcome_info@competing_risk
            } 
          }
        }
      }
    }
    
    # Attempt to identify class levels of the outcome.
    if (is.waive(class_levels)) {
      if (has_model_object && has_outcome_info_slot) {
        if (is(object@outcome_info, "outcomeInfo")) {
          if (length(object@outcome_info@levels) > 0) {
            class_levels <- object@outcome_info@levels
          }
        }
      }
    }
    
    # Load settings from input.
    settings <- do.call(
      .parse_initial_settings,
      args = c(
        list(
          "experimental_design" = "fs+mb",
          "sample_id_column" = sample_id_column,
          "batch_id_column" = batch_id_column,
          "series_id_column" = series_id_column,
          "development_batch_id" = development_batch_id,
          "validation_batch_id" = validation_batch_id,
          "outcome_name" = outcome_name,
          "outcome_column" = outcome_column,
          "outcome_type" = outcome_type,
          "event_indicator" = event_indicator,
          "censoring_indicator" = censoring_indicator,
          "competing_risk_indicator" = competing_risk_indicator,
          "class_levels" = class_levels,
          "exclude_features" = exclude_features,
          "include_features" = include_features,
          "reference_method" = reference_method),
        list(...)))
    
    # Prepare data.table.
    data <- .load_data(
      data = data,
      sample_id_column = settings$data$sample_col,
      batch_id_column = settings$data$batch_col,
      series_id_column = settings$data$series_col)
    
    # Update settings
    settings <- .update_initial_settings(
      data = data,
      settings = settings,
      check_stringency = check_stringency)
    
    # Parse data
    data <- .finish_data_preparation(
      data = data,
      sample_id_column = settings$data$sample_col,
      batch_id_column = settings$data$batch_col,
      series_id_column = settings$data$series_col,
      outcome_column = settings$data$outcome_col,
      outcome_type = settings$data$outcome_type,
      include_features = settings$data$include_features,
      class_levels = settings$data$class_levels,
      censoring_indicator = settings$data$censoring_indicator,
      event_indicator = settings$data$event_indicator,
      competing_risk_indicator = settings$data$competing_risk_indicator,
      check_stringency = check_stringency,
      reference_method = settings$data$reference_method
    )
    
    # Update the dataset according to the feature info list.
    if (has_model_object) data <- update_data_set(data = data, object = object)
    
    # Add outcome information, preferentially from the familiarModel or
    # familiarEnsemble, as it is more complete.
    if (has_model_object && has_outcome_info_slot) {
      outcome_info <- object@outcome_info
      
    } else {
      outcome_info <- create_outcome_info(settings = settings)
    }
    
    # Use the column information stored with the model when available;
    # otherwise derive it from settings.
    if (has_model_object) {
      if (!is_empty(object@data_column_info)) {
        data_info <- object@data_column_info
        
      } else {
        data_info <- create_data_column_info(settings = settings)
      }
      
    } else {
      data_info <- create_data_column_info(settings = settings)
    }
    
    # Convert to dataObject
    data <- methods::new(
      "dataObject",
      data = data,
      preprocessing_level = "none",
      outcome_type = settings$data$outcome_type,
      outcome_info = outcome_info,
      data_column_info = data_info)
    
    return(data)
  }
)



## as_data_object (ANY) --------------------------------------------------------
#'@rdname as_data_object-methods
setMethod(
  "as_data_object",
  signature(data = "ANY"),
  function(
    data,
    object = NULL,
    sample_id_column = waiver(),
    batch_id_column = waiver(),
    series_id_column = waiver(),
    ...) {
    # Converts arbitrary input (e.g. a data.frame or a path to a tabular
    # file) into a data.table and then dispatches to the data.table method.
    
    # Suppress NOTES due to non-standard evaluation in data.table
    type <- NULL
    
    # Determine whether the object contains data concerning columns. Note that
    # user-provided names always take precedence.
    has_model_object <- FALSE
    if (is(object, "familiarModel") || is(object, "familiarEnsemble")) {
      if (!is_empty(object@data_column_info)) has_model_object <- TRUE
    }
    
    # Resolve the identifier column name passed on to .load_data: a
    # user-provided name is used as-is; a waived name falls back to the column
    # name stored with the model (when present and not NA), or NULL otherwise.
    ..resolve_id_column <- function(provided, column_type) {
      # User-provided names take precedence.
      if (!is.waive(provided)) return(provided)
      
      if (!has_model_object) return(NULL)
      
      # Find the identifier column stored with the model.
      model_column <- object@data_column_info[type == column_type]$external
      
      # Check that the model actually has a column name (not character(0))
      # that is not NA.
      if (length(model_column) > 0) {
        if (!is.na(model_column)) return(model_column)
      }
      
      return(NULL)
    }
    
    # Create local copies of the identifier columns to pass on to .load_data.
    sample_id_column_local <- ..resolve_id_column(sample_id_column, "sample_id_column")
    batch_id_column_local <- ..resolve_id_column(batch_id_column, "batch_id_column")
    series_id_column_local <- ..resolve_id_column(series_id_column, "series_id_column")
    
    # Load data and convert to data.table
    data <- .load_data(
      data = data,
      sample_id_column = sample_id_column_local,
      batch_id_column = batch_id_column_local,
      series_id_column = series_id_column_local)
    
    # Pass on to data.table method. Note that the original (possibly waived)
    # column arguments are forwarded, not the locally resolved copies.
    return(do.call(
      as_data_object,
      args = c(
        list(
          "data" = data,
          "object" = object,
          "sample_id_column" = sample_id_column,
          "batch_id_column" = batch_id_column,
          "series_id_column" = series_id_column),
        list(...))))
  }
)



# extract_settings_from_data ---------------------------------------------------
# extract_settings_from_data ---------------------------------------------------
setMethod(
  "extract_settings_from_data",
  signature(data = "dataObject"),
  function(
    data,
    settings = NULL,
    signature = NULL) {
    # Derives a settings list (identifier columns, outcome information and
    # feature set) from a dataObject so that data can be re-parsed with
    # consistent settings.
    #
    # data: dataObject to extract settings from.
    # settings: existing settings list to update; a fresh list is created
    #   when NULL.
    # signature: optional signature feature set, stored as-is.
    
    # Suppress NOTES due to non-standard evaluation in data.table
    type <- NULL
    
    if (is.null(settings)) settings <- list("data" = list())
    
    # Placeholders
    sample_id_column <- batch_id_column <- series_id_column <- outcome_columns <- NULL
    
    if (!is_empty(data@data_column_info)) {
      # Sample identifier
      sample_id_column <- ..set_identifier_column(
        current = sample_id_column,
        data = data@data,
        internal = data@data_column_info[type == "sample_id_column"]$internal,
        external = data@data_column_info[type == "sample_id_column"]$external)
      
      # Batch identifier
      batch_id_column <- ..set_identifier_column(
        current = batch_id_column,
        data = data@data,
        internal = data@data_column_info[type == "batch_id_column"]$internal,
        external = data@data_column_info[type == "batch_id_column"]$external)
      
      # Series identifier
      series_id_column <- ..set_identifier_column(
        current = series_id_column,
        data = data@data,
        internal = data@data_column_info[type == "series_id_column"]$internal,
        external = data@data_column_info[type == "series_id_column"]$external)
      
      # Outcome columns
      outcome_columns <- ..set_identifier_column(
        current = outcome_columns,
        data = data@data,
        internal = data@data_column_info[type == "outcome_column"]$internal,
        external = data@data_column_info[type == "outcome_column"]$external)
    }
    
    # Fall back to the outcome columns defined by the data object. Note: the
    # previous version called get_outcome_columns() without assigning its
    # result, silently discarding the fallback.
    if (is.null(outcome_columns)) outcome_columns <- get_outcome_columns(data)
    
    # Sample identifier column
    settings$data$sample_col <- sample_id_column
    settings$data$batch_col <- batch_id_column
    settings$data$series_col <- series_id_column
    settings$data$outcome_col <- outcome_columns
    settings$data$outcome_type <- data@outcome_type
    settings$data$outcome_name <- get_outcome_name(data@outcome_info)
    settings$data$class_levels <- get_outcome_class_levels(data@outcome_info)
    settings$data$event_indicator <- data@outcome_info@event
    settings$data$censoring_indicator <- data@outcome_info@censored
    settings$data$competing_risk_indicator <- data@outcome_info@competing_risk
    settings$data$signature <- signature
    settings$data$include_features <- get_feature_columns(data)
    
    return(settings)
  }
)



..set_identifier_column <- function(
    current = NULL,
    data = NULL,
    internal,
    external) {
  # Resolves which column name(s) to report for an identifier (sample, batch,
  # series, or outcome) when extracting settings from a dataObject.
  #
  # current: previously resolved column name(s); returned unchanged when
  #   already set (i.e. neither NULL nor a waiver).
  # data: data.table holding the actual data, or NULL. Used to test which
  #   candidate names are actually present as columns.
  # internal: internal (standardised) column name(s) from data_column_info.
  # external: external (user-provided) column name(s) from data_column_info;
  #   may be character(0) or NA when never set.
  #
  # Returns the resolved column name(s), or `current` when nothing resolves.
  
  if (!(is.waive(current) || is.null(current))) return(current)
  
  temporary <- NULL
  
  # Prefer external before internal, as long as it is present in data.
  if (all(sapply(external, length) > 0)) {
    if (!any(sapply(external, is.na))) {
      if (data.table::is.data.table(data)) {
        if (all(external %in% colnames(data))) temporary <- external
      }
    }
  }
  
  # Prefer internal if it is present in data.
  if (data.table::is.data.table(data) && is.null(temporary)) {
    if (all(internal %in% colnames(data))) temporary <- internal
  }
  
  # Neither candidate is present in data (or data is absent): fall back as
  # long as the external name is set and not NA.
  # NOTE(review): the original comment said "use external", but the code
  # assigns `internal` -- confirm which is intended.
  if (all(sapply(external, length) > 0) && is.null(temporary)) {
    if (!any(sapply(external, is.na))) temporary <- internal
  }
  
  if (is.null(temporary)) {
    return(current)
    
  } else {
    return(temporary)
  }
}


# load_delayed_data methods ----------------------------------------------------

## load_delayed_data (model) ---------------------------------------------------
setMethod(
  "load_delayed_data",
  signature(
    data = "dataObject",
    object = "ANY"),
  function(
    data,
    object,
    stop_at,
    keep_novelty = FALSE) {
    # Loads data from internal memory (the data backend) for a dataObject
    # whose loading was delayed, then pre-processes it up to `stop_at`.
    #
    # data: dataObject with delay_loading set; carries perturb_level and
    #   load_validation flags that select the samples to load.
    # object: familiarModel, familiarVimpMethod or familiarNoveltyDetector
    #   that provides required_features, the run table and feature info.
    # stop_at: pre-processing level to stop at (e.g. "clustering").
    # keep_novelty: whether novelty features are kept during pre-processing.
    
    # Guard: only the object types listed above carry the slots used below.
    if (!(is(object, "familiarModel") ||
         is(object, "familiarVimpMethod") ||
         is(object, "familiarNoveltyDetector"))) {
      ..error_reached_unreachable_code(paste0(
        "load_delayed_data: object is expected to be a familiarModel, ",
        "familiarVimpMethod or familiarNoveltyDetector."))
    }
    
    # Check if loading was actually delayed; otherwise the data are already
    # in place and can be returned as-is.
    if (!data@delay_loading) return(data)
    
    # Read project list and settings
    iteration_list <- get_project_list()$iter_list
    
    # Read required features
    required_features <- object@required_features
    
    # Get columns in data frame which are not features, but identifiers and
    # outcome instead.
    non_feature_cols <- get_non_feature_columns(x = object)
    
    # Find the identifiers for the current run.
    run_id_list <- .get_iteration_identifiers(
      run = list("run_table" = object@run_table),
      perturb_level = data@perturb_level)
    
    # Derive sample identifiers based on the selected iteration data.
    sample_identifiers <- .get_sample_identifiers(
      iteration_list = iteration_list,
      data_id = run_id_list$data,
      run_id = run_id_list$run,
      train_or_validate = ifelse(data@load_validation, "valid", "train"))
    
    # Currently select only unique samples from the backend. Duplicates (e.g.
    # from bootstraps) are re-introduced after pre-processing, below.
    if (!is_empty(sample_identifiers)) {
      unique_sample_identifiers <- unique(sample_identifiers)
      
    } else {
      # Return an updated data object, but without data
      return(methods::new(
        "dataObject",
        data = NULL,
        preprocessing_level = "none",
        outcome_type = data@outcome_type,
        aggregate_on_load = data@aggregate_on_load))
    }
    
    # Prepare a new data object, pulling only the identifier/outcome columns
    # plus the features required by the object from the backend.
    new_data <- methods::new(
      "dataObject",
      data = get_data_from_backend(
        sample_identifiers = unique_sample_identifiers,
        column_names = c(non_feature_cols, required_features)),
      preprocessing_level = "none",
      outcome_type = data@outcome_type,
      delay_loading = FALSE,
      perturb_level = NA_integer_,
      load_validation = data@load_validation,
      aggregate_on_load = data@aggregate_on_load,
      sample_set_on_load = data@sample_set_on_load)
    
    # Preprocess data
    new_data <- preprocess_data(
      data = new_data,
      object = object,
      stop_at = stop_at,
      keep_novelty = keep_novelty)
    
    # Recreate iteration. Note that we here also use duplicate samples to
    # recreate e.g. bootstraps.
    new_data <- select_data_from_samples(
      data = new_data,
      samples = sample_identifiers)

    if (new_data@aggregate_on_load) {
      
      # Aggregate data if required
      new_data <- aggregate_data(data = new_data)
      
      # Reset flag to FALSE, as data has been loaded
      new_data@aggregate_on_load <- FALSE
    }
    
    return(new_data)
  }
)



## load_delayed_data (ensemble) ------------------------------------------------
setMethod(
  "load_delayed_data",
  signature(
    data = "dataObject",
    object = "familiarEnsemble"),
  function(
    data,
    object,
    stop_at = "clustering",
    keep_novelty = FALSE) {
    # Loads data from internal memory -- for familiarEnsemble objects. Unlike
    # the single-model method, the sample set is derived from the combined run
    # tables of all models in the ensemble.
    #
    # data: dataObject with delay_loading set.
    # object: familiarEnsemble providing required_features, run tables and
    #   feature info.
    # stop_at: pre-processing level to stop at (default "clustering").
    # keep_novelty: whether novelty features are kept during pre-processing.
    
    # Suppress NOTES due to non-standard evaluation in data.table
    perturb_level <- NULL
    
    # Check if loading was actually delayed
    if (!data@delay_loading) return(data)
    
    # Read project list
    iteration_list <- get_project_list()$iter_list

    # Read required features
    required_features <- object@required_features
    
    # Get columns in data frame which are not features, but identifiers and outcome instead
    non_feature_cols <- get_non_feature_columns(x = object)
    
    # Join run tables to identify the runs that should be evaluated. Each
    # model's run table is filtered to the perturbation level of the data.
    combined_run_table <- lapply(
      object@run_table$run_table,
      function(model_run_table, data_perturb_level) {
        return(model_run_table[perturb_level == data_perturb_level])
      },
      data_perturb_level = data@perturb_level)
    
    # Merge to single table
    combined_run_table <- data.table::rbindlist(combined_run_table)
    
    # Remove duplicate rows
    combined_run_table <- unique(combined_run_table)
    
    # Check length and extract sample identifiers.
    if (nrow(combined_run_table) == 1) {
      sample_identifiers <- .get_sample_identifiers(
        iteration_list = iteration_list,
        data_id = combined_run_table$data_id,
        run_id = combined_run_table$run_id,
        train_or_validate = ifelse(data@load_validation, "valid", "train"))
      
    } else {
      # Extract all sample identifiers. This happens if the data is pooled.
      sample_identifiers <- data.table::rbindlist(lapply(
        seq_len(nrow(combined_run_table)),
        function(ii, run_table, iteration_list, train_or_validate) {
        
          sample_identifiers <- .get_sample_identifiers(
            iteration_list = iteration_list,
            data_id = run_table$data_id[ii],
            run_id = run_table$run_id[ii],
            train_or_validate = train_or_validate)
          
          return(sample_identifiers)
        },
        run_table = combined_run_table,
        iteration_list = iteration_list,
        train_or_validate = ifelse(data@load_validation, "valid", "train")))
      
      # Select only unique sample identifiers.
      sample_identifiers <- unique(sample_identifiers)
    }
    
    # Currently select only unique samples from the backend. Duplicates (e.g.
    # from bootstraps) are re-introduced after pre-processing, below.
    if (!is_empty(sample_identifiers)) {
      unique_sample_identifiers <- unique(sample_identifiers)
      
    } else {
      # Return an updated data object, but without data
      return(methods::new(
        "dataObject",
        data = NULL,
        preprocessing_level = "none",
        outcome_type = data@outcome_type,
        aggregate_on_load = data@aggregate_on_load))
    }
    
    # Prepare a new data object, pulling only the identifier/outcome columns
    # plus the features required by the ensemble from the backend.
    new_data <- methods::new(
      "dataObject",
      data = get_data_from_backend(
        sample_identifiers = unique_sample_identifiers,
        column_names = c(non_feature_cols, required_features)),
      preprocessing_level = "none",
      outcome_type = data@outcome_type,
      delay_loading = FALSE,
      perturb_level = NA_integer_,
      load_validation = data@load_validation,
      aggregate_on_load = data@aggregate_on_load,
      sample_set_on_load = data@sample_set_on_load)
    
    # Preprocess data
    new_data <- preprocess_data(
      data = new_data,
      object = object,
      stop_at = stop_at,
      keep_novelty = keep_novelty)
    
    # Recreate iteration. Note that we here also use duplicate samples
    # to recreate e.g. bootstraps.
    new_data <- select_data_from_samples(
      data = new_data,
      samples = sample_identifiers)
    
    # Aggregate data if required
    if (new_data@aggregate_on_load) {
      
      # Aggregate
      new_data <- aggregate_data(data = new_data)
      
      # Reset flag to FALSE, as data has been loaded
      new_data@aggregate_on_load <- FALSE
    }
    
    return(new_data)
  }
)



# preprocess_data methods ------------------------------------------------------

## preprocess_data (vimp method) -----------------------------------------------
setMethod(
  "preprocess_data",
  signature(
    data = "dataObject",
    object = "familiarVimpMethod"),
  function(
    data,
    object,
    stop_at = "clustering",
    keep_novelty = FALSE, ...) {
    
    # Variable importance methods only require the shared pre-processing
    # chain; no post-processing step is applied.
    return(.preprocess_data(
      data = data,
      object = object,
      stop_at = stop_at,
      keep_novelty = keep_novelty))
  }
)



## preprocess_data (model) -----------------------------------------------------
setMethod(
  "preprocess_data",
  signature(
    data = "dataObject",
    object = "familiarModel"),
  function(
    data,
    object,
    stop_at = "clustering",
    keep_novelty = FALSE,
    ...) {
    
    # First run the shared pre-processing chain up to the requested level.
    preprocessed_data <- .preprocess_data(
      data = data,
      object = object,
      stop_at = stop_at,
      keep_novelty = keep_novelty)
    
    # Then post-process to select the feature and identifier set expected by
    # the model.
    return(postprocess_data(
      data = preprocessed_data,
      object = object,
      stop_at = stop_at,
      keep_novelty = keep_novelty,
      ...))
  }
)



## preprocess_data (familiarNaiveModel) ----------------------------------------
setMethod(
  "preprocess_data",
  signature(
    data = "dataObject",
    object = "familiarNaiveModel"),
  function(
    data,
    object,
    stop_at = "clustering",
    keep_novelty = FALSE,
    ...) {
    
    # Compare attained and requested pre-processing levels as ordinals.
    current_level <- .as_preprocessing_level(data@preprocessing_level)
    requested_level <- .as_preprocessing_level(stop_at)
    
    if (current_level > requested_level) {
      ..error_reached_unreachable_code(paste0(
        "preprocess_data,dataObject,familiarNaiveModel: data were preprocessed ",
        "at a higher level than required by stop_at."))
    }
    
    # familiarNaiveModels do not use any features, so all features are
    # dropped and the data are marked as processed up to the requested level.
    data <- select_features(data = data, features = NULL)
    data@preprocessing_level <- stop_at
    
    return(data)
  }
)



## preprocess_data (novelty detector) ------------------------------------------
setMethod(
  "preprocess_data",
  signature(
    data = "dataObject",
    object = "familiarNoveltyDetector"),
  function(
    data,
    object,
    stop_at = "clustering",
    ...) {
    
    # Note that keep_novelty is always FALSE here to prevent reading the
    # novelty_features slot. Novelty features are stored in the
    # model_features slot of familiarNoveltyDetector objects.
    
    # Assign "unsupervised" outcome type to the data so that no outcome
    # columns are selected.
    data@outcome_type <- "unsupervised"
    
    # Run the shared pre-processing chain up to the requested level.
    preprocessed_data <- .preprocess_data(
      data = data,
      object = object,
      stop_at = stop_at,
      keep_novelty = FALSE)
    
    # Select the feature and identifier set expected by the detector.
    return(postprocess_data(
      data = preprocessed_data,
      object = object,
      stop_at = stop_at,
      ...))
  }
)


## preprocess_data (ensemble) --------------------------------------------------
setMethod(
  "preprocess_data",
  signature(
    data = "dataObject",
    object = "familiarEnsemble"),
  function(
    data,
    object,
    stop_at = "clustering",
    keep_novelty = FALSE,
    ...) {
    
    # Ensembles only require the shared pre-processing chain; member models
    # handle their own post-processing.
    return(.preprocess_data(
      data = data,
      object = object,
      stop_at = stop_at,
      keep_novelty = keep_novelty))
  }
)



.preprocess_data <- function(
    data,
    object,
    stop_at,
    keep_novelty = FALSE) {
  # Shared pre-processing chain: advances a dataObject from its current
  # preprocessing_level up to `stop_at`, applying in order: signature/feature
  # selection, transformation, normalisation, batch normalisation, imputation
  # and clustering. Each step uses the feature info stored with `object`.
  #
  # data: dataObject to pre-process.
  # object: model-like object providing required_features, model_features,
  #   novelty_features and feature_info.
  # stop_at: pre-processing level to stop at.
  # keep_novelty: when TRUE, novelty features are kept alongside model
  #   features at the "signature" stage.
  #
  # Returns the pre-processed dataObject.
  
  # Convert the preprocessing_level attained and the requested stopping level to
  # ordinals.
  # NOTE(review): the comparisons against strings below ("signature",
  # "transformation", ...) assume .as_preprocessing_level returns an ordered
  # factor (or similar) for which such comparisons are defined -- confirm.
  preprocessing_level_attained <- .as_preprocessing_level(data@preprocessing_level)
  stop_at <- .as_preprocessing_level(stop_at)
  
  # Check whether pre-processing is required
  if (preprocessing_level_attained == stop_at) {
    
    return(data)
    
  } else if (preprocessing_level_attained > stop_at) {
    # Pre-processing cannot be undone; reaching this point is a programming
    # error.
    ..error_reached_unreachable_code(paste0(
      "preprocess_data,dataObject,ANY: data were preprocessed at a higher level", 
      " than required by stop_at."))
  }
  
  if (preprocessing_level_attained < "signature" && stop_at >= "signature") { 
    # Apply the signature.
    data <- select_features(
      data = data,
      features = object@required_features)
    
    # Update pre-processing level externally as it is not limited to
    # pre-processing per se.
    data@preprocessing_level <- "signature"
    
  } else if (preprocessing_level_attained == "signature" && stop_at >= "signature") {
    # Data already at signature level: narrow the feature set to what the
    # object actually uses (optionally including novelty features).
    
    required_features <- object@required_features
    selected_features <- object@model_features
    if (keep_novelty) selected_features <- union(selected_features, object@novelty_features)
    
    if (length(required_features) > 0 && length(selected_features) > 0 && has_feature_data(data)) {
      
      # Select available features specific to the object. Prefer the full
      # required set; fall back to the (smaller) selected set.
      if (all(required_features %in% get_feature_columns(data))) {
        data <- select_features(
          data = data,
          features = required_features)
        
      } else if (all(selected_features %in% get_feature_columns(data))) {
        data <- select_features(
          data = data,
          features = selected_features)
        
      } else {
        ..error_reached_unreachable_code(
          ".preprocess_data: could not identify overlapping features")
      }
    }
  }
  
  if (preprocessing_level_attained < "transformation" && stop_at >= "transformation") {
    # Transform the features.
    data <- transform_features(
      data = data,
      feature_info_list = object@feature_info)
  }
  
  if (preprocessing_level_attained < "normalisation" && stop_at >= "normalisation") {
    # Normalise feature values.
    data <- normalise_features(
      data = data,
      feature_info_list = object@feature_info)
  }
  
  if (preprocessing_level_attained < "batch_normalisation" && stop_at >= "batch_normalisation") {
    # Batch-normalise feature values
    data <- batch_normalise_features(
      data = data,
      feature_info_list = object@feature_info)
  }
  
  if (preprocessing_level_attained < "imputation" && stop_at >= "imputation") {
    # Impute missing values
    data  <- impute_features(
      data = data,
      feature_info_list = object@feature_info)
  }
  
  if (preprocessing_level_attained < "clustering" && stop_at >= "clustering") {
    # Cluster features
    data <- cluster_features(
      data = data,
      feature_info_list = object@feature_info)
  }
  
  return(data)
}


# postprocess_data methods -----------------------------------------------------

## postprocess_data (model)-----------------------------------------------------
setMethod(
  "postprocess_data",
  signature(
    data = "dataObject",
    object = "familiarModel"),
  function(
    data,
    object,
    stop_at = "clustering",
    keep_novelty = FALSE,
    force_check = FALSE,
    ...) {
    # Restricts fully pre-processed data to the features used by the model.
    
    # Express both the attained and the requested level as ordinals.
    attained_level <- .as_preprocessing_level(data@preprocessing_level)
    stop_at <- .as_preprocessing_level(stop_at)
    
    # Post-processing only applies to fully clustered data.
    if (attained_level < "clustering" || stop_at < "clustering") return(data)
    
    # Collect the model features, optionally together with novelty features.
    feature_set <- object@model_features
    if (keep_novelty) feature_set <- union(feature_set, object@novelty_features)
    
    # Without any features there is nothing to select, unless a check is
    # explicitly forced.
    if (!force_check && length(feature_set) == 0) return(data)
    
    # Map features to their post-clustering representation.
    feature_set <- features_after_clustering(
      features = feature_set,
      feature_info_list = object@feature_info)
    
    # Slice the data down to the selected feature set.
    return(select_features(
      data = data,
      features = feature_set))
  }
)



## postprocess_data (novelty detector)------------------------------------------
setMethod(
  "postprocess_data",
  signature(
    data = "dataObject",
    object = "familiarNoveltyDetector"),
  function(
    data,
    object,
    stop_at = "clustering",
    force_check = FALSE,
    ...) {
    # Restricts fully pre-processed data to the features used by the novelty
    # detector.
    
    # Express both the attained and the requested level as ordinals.
    attained_level <- .as_preprocessing_level(data@preprocessing_level)
    stop_at <- .as_preprocessing_level(stop_at)
    
    # Post-processing only applies to fully clustered data.
    if (attained_level < "clustering" || stop_at < "clustering") return(data)
    
    # Collect the detector's features.
    feature_set <- object@model_features
    
    # Without any features there is nothing to select, unless a check is
    # explicitly forced.
    if (!force_check && length(feature_set) == 0) return(data)
    
    # Map features to their post-clustering representation.
    feature_set <- features_after_clustering(
      features = feature_set,
      feature_info_list = object@feature_info)
    
    # Slice the data down to the selected feature set.
    return(select_features(
      data = data,
      features = feature_set))
  }
)



## process_input_data (vimp method) --------------------------------------------
setMethod(
  "process_input_data",
  signature(
    object = "familiarVimpMethod",
    data = "ANY"),
  function(
    object,
    data,
    is_pre_processed = FALSE,
    stop_at = "clustering",
    keep_novelty = FALSE,
    ...) {
    # Prepares input data for a variable importance method by delegating
    # directly to the internal workhorse routine.
    return(.process_input_data(
      object = object,
      data = data,
      is_pre_processed = is_pre_processed,
      stop_at = stop_at,
      keep_novelty = keep_novelty,
      ...))
  }
)

## process_input_data (novelty detector) ---------------------------------------
setMethod(
  "process_input_data",
  signature(
    object = "familiarNoveltyDetector",
    data = "ANY"),
  function(
    object,
    data,
    is_pre_processed = FALSE, 
    stop_at = "clustering",
    ...) {
    # Prepares input data for a novelty detector.
    
    # Novelty detection is unsupervised: outcome columns are not needed, so
    # mark the data accordingly before processing.
    data@outcome_type <- "unsupervised"
    
    # Delegate to the internal workhorse routine. Novelty features are never
    # kept separately for novelty detectors.
    return(.process_input_data(
      object = object,
      data = data,
      is_pre_processed = is_pre_processed,
      stop_at = stop_at,
      keep_novelty = FALSE,
      ...))
  }
)



## process_input_data (model) --------------------------------------------------
setMethod(
  "process_input_data",
  signature(
    object = "familiarModel",
    data = "ANY"),
  function(
    object,
    data,
    is_pre_processed = FALSE,
    stop_at = "clustering",
    keep_novelty = FALSE,
    ...) {
    # Prepares input data for a familiar model by delegating directly to the
    # internal workhorse routine.
    return(.process_input_data(
      object = object,
      data = data,
      is_pre_processed = is_pre_processed,
      stop_at = stop_at,
      keep_novelty = keep_novelty,
      ...))
  }
)



## process_input_data (ensemble) -----------------------------------------------
setMethod(
  "process_input_data",
  signature(
    object = "familiarEnsemble",
    data = "ANY"),
  function(
    object,
    data,
    is_pre_processed = FALSE,
    stop_at = "clustering",
    keep_novelty = FALSE,
    ...) {
    # Prepares input data for an ensemble of familiar models.
    #
    # Consistency fix: the sibling methods (familiarModel, familiarVimpMethod,
    # familiarNoveltyDetector) accept and forward additional arguments via
    # `...`; this method previously dropped them. Accepting `...` here is
    # backward-compatible and forwards extra arguments to preprocess_data.
    data <- .process_input_data(
      object = object,
      data = data,
      is_pre_processed = is_pre_processed,
      stop_at = stop_at,
      keep_novelty = keep_novelty,
      ...)
    
    return(data)
  }
)



.process_input_data <- function(
    object,
    data,
    is_pre_processed,
    stop_at,
    keep_novelty = FALSE,
    ...) {
  # Internal workhorse for the process_input_data methods: wraps raw input in
  # a dataObject if needed, resolves delayed loading, and pre-processes the
  # data up to the requested level.
  #
  # Args:
  #   object: Object (familiarModel, familiarEnsemble, familiarVimpMethod or
  #     familiarNoveltyDetector) that provides feature information.
  #   data: A dataObject, or raw input convertible through as_data_object.
  #   is_pre_processed: If TRUE, freshly wrapped raw data are marked as fully
  #     pre-processed ("clustering" level); otherwise as "none".
  #   stop_at: Pre-processing level at which to stop.
  #   keep_novelty: If TRUE, novelty features are retained.
  #   ...: Passed on to preprocess_data.
  #
  # Returns:
  #   A pre-processed dataObject.
  
  # Check whether data is a dataObject, and create one otherwise
  if (!is(data, "dataObject")) {
    data <- as_data_object(
      data = data,
      object = object)
    
    # Set pre-processing level. Use a scalar if/else instead of ifelse():
    # the condition is scalar, and ifelse() is slower and strips attributes.
    data@preprocessing_level <- if (is_pre_processed) "clustering" else "none"
  }
  
  # Load data from internal memory, if not provided otherwise
  if (data@delay_loading) {
    data <- load_delayed_data(
      data = data,
      object = object,
      stop_at = stop_at,
      keep_novelty = keep_novelty)
  }
  
  # Pre-process data in case it has not been pre-processed
  data <- preprocess_data(
    data = data,
    object = object,
    stop_at = stop_at,
    keep_novelty = keep_novelty,
    ...)
  
  # Return data
  return(data)
}



# select_data_from_samples -----------------------------------------------------
setMethod(
  "select_data_from_samples",
  signature(
    data = "dataObject",
    samples = "ANY"),
  function(data, samples = NULL) {
    # Subsets the data to the provided samples. When loading is delayed, the
    # sample set is stored and applied after the data are loaded.
    
    # Check if data is loaded
    if (data@delay_loading) {
      
      # Store samples until the data is loaded.
      data@sample_set_on_load <- samples
      
    } else {
      
      # Determine the names of the id-columns, up to the series level.
      id_columns <- get_id_columns(id_depth = "series")
      
      if (is_empty(samples) && is.null(data@sample_set_on_load)) {
        # Return an empty data set if no samples are provided
        data@data <- head(data@data, n = 0)
        
      } else if (is_empty(samples) && !is.null(data@sample_set_on_load)) {
        # Use samples in the sample_set_on_load attribute.
        data@data <- merge(
          x = data@sample_set_on_load,
          y = data@data,
          by = id_columns,
          all = FALSE,
          allow.cartesian = TRUE)
        
      } else if (!is_empty(samples) && is.null(data@sample_set_on_load)) {
        # Use samples from the samples function argument. allow.cartesian is set
        # to true to allow use with repeated measurements. Note: scalar &&
        # replaces the elementwise & that was used here previously.
        if (all(id_columns %in% colnames(samples))) {
          data@data <- merge(
            x = samples,
            y = data@data,
            by = id_columns,
            all = FALSE,
            allow.cartesian = TRUE)
          
        } else {
          # Series identifiers are absent; merge at the sample level instead.
          data@data <- merge(
            x = samples,
            y = data@data,
            by = get_id_columns(id_depth = "sample"),
            all = FALSE,
            allow.cartesian = TRUE)
        }
        
      } else {
        # Use samples that appear both as function argument and within the
        # sample_set_on_load attribute. The sample_set_on_load attribute is used
        # as a filter. allow.cartesian is set to true to allow use with repeated
        # measurements.
        samples <- data.table::fintersect(samples, data@sample_set_on_load)
        
        if (is_empty(samples)) {
          # Return an empty data set if no samples are left.
          data@data <- head(data@data, n = 0)
          
        } else {
          # Check if series identifiers are present. They may be absent if
          # samples were generated using fam_sample.
          #
          # BUG FIX: this previously read colnames(sample), referencing the
          # base R function `sample` instead of the local `samples`. Since
          # colnames() of a function is NULL, the condition was always FALSE
          # and the series-level merge branch was unreachable.
          if (all(id_columns %in% colnames(samples))) {
            data@data <- merge(
              x = samples,
              y = data@data,
              by = id_columns,
              all = FALSE,
              allow.cartesian = TRUE)
            
          } else {
            data@data <- merge(
              x = samples,
              y = data@data,
              by = get_id_columns(id_depth = "sample"),
              all = FALSE,
              allow.cartesian = TRUE)
          }
        }
      }
    }
    
    return(data)
  }
)



# aggregate_data ---------------------------------------------------------------
setMethod(
  "aggregate_data",
  signature(data = "dataObject"),
  function(data) {
    # Aggregates repeated measurements (multiple repetition entries per
    # series) into a single entry per series: numeric features via their
    # median, categorical features via get_mode.
    
    # Check if loading of the data object was delayed
    if (data@delay_loading) {
      # Mark for future aggregation after loading the data
      data@aggregate_on_load <- TRUE
      return(data)
      
    } else {
      # Set aggregation flag to FALSE and continue
      data@aggregate_on_load <- FALSE
    }
    
    # Check if the data is empty
    if (is_empty(data)) return(data)
    
    # Drop any duplicates (e.g. from bootstraps).
    data@data <- unique(data@data)
    
    # Identify the columns containing outcome, series, sample, and batch
    # identifiers.
    id_cols <- get_non_feature_columns(x = data, id_depth = "series")
    
    # Determine the number of different entries
    
    # Nothing to aggregate when every entry is a single (first) repetition.
    if (all(data@data$repetition_id == 1)) return(data)

    # Identify feature columns for repeated measurement data.
    feature_columns <- get_feature_columns(x = data)
    
    # Identify class of features
    column_class <- lapply(
      feature_columns,
      function(ii, data) (class(data[[ii]])[1]),
      data = data@data)
    
    # Determine numerical features
    numeric_features <- sapply(
      column_class,
      function(selected_column_class) (any(selected_column_class %in% c("numeric", "integer"))))
    
    numeric_features <- feature_columns[numeric_features]
    
    # Determine categorical features
    categorical_features <- sapply(
      column_class,
      function(selected_column_class) (any(selected_column_class %in% c("logical", "character", "factor"))))
    
    categorical_features <- feature_columns[categorical_features]
    
    # Find non-duplicate id columns
    aggregated_data <- unique(data@data[, mget(id_cols)])
    
    # Add aggregated numeric columns
    if (length(numeric_features) > 0) {
      # Aggregate numeric features by their median value per series.
      numeric_data <- data@data[, lapply(
        .SD, stats::median),
        by = id_cols,
        .SDcols = numeric_features]
      
      aggregated_data <- merge(
        x = aggregated_data,
        y = numeric_data,
        by = id_cols)
    }
    
    # Add aggregated factor columns
    if (length(categorical_features) > 0) {
      # Aggregate categorical features per series using get_mode.
      categorical_data <- data@data[, lapply(
        .SD, get_mode),
        by = id_cols,
        .SDcols = categorical_features]
      
      aggregated_data <- merge(
        x = aggregated_data,
        y = categorical_data,
        by = id_cols)
    }
    
    # Add in repetition_id column again. The value -1 marks aggregated
    # entries.
    aggregated_data[, "repetition_id" := -1]
    
    # Reorder columns so that it matches the input
    data.table::setcolorder(
      aggregated_data,
      neworder = colnames(data@data))
    
    data@data <- aggregated_data
    
    return(data)
  }
)


# filter_features --------------------------------------------------------------
setMethod(
  "filter_features",
  signature(data = "dataObject"),
  function(
    data,
    remove_features = NULL,
    available_features = NULL) {
    # Removes features from a data set
    #
    # Either remove_features (features to drop) or available_features
    # (features to keep) may be provided. When both are given,
    # remove_features takes precedence.
    
    # If both are provided, use remove_features
    if (!is.null(remove_features) && !is.null(available_features)) {
      available_features <- NULL
    }
    
    # Do not do anything if both are null
    if (is.null(remove_features) && is.null(available_features)) {
      return(data)
      
      # Based on remove_features input
    } else if (!is.null(remove_features)) {
      
      # Determine which remove_features are actually present in data
      remove_features <- intersect(remove_features, get_feature_columns(x = data))
      
      # Based on available_features input
    } else if (!is.null(available_features)) {
      
      # Skip if length equals 0
      if (length(available_features) == 0) return(data)
      
      # Determine which features should be removed
      remove_features <- setdiff(get_feature_columns(x = data), available_features)
      
      # Only keep features that are in data.
      available_features <- intersect(get_feature_columns(x = data), available_features)
      
    } else {
      ..error_reached_unreachable_code(
        "This point should never be reachable. Check for inconsistencies if it does.")
    }
    
    # Make a copy to prevent updating by reference.
    data@data <- data.table::copy(data@data)
    
    # Remove features from data if there is 1 or more feature to remove.
    # Assigning NULL via := deletes the columns in place (on the copy).
    if (length(remove_features) > 0) data@data[, (remove_features) := NULL]
    
    # Make sure that the column order is the same as available_features
    if (!is.null(available_features)) {
      data.table::setcolorder(
        x = data@data,
        neworder = c(get_non_feature_columns(x = data), available_features))
    }
    
    return(data)
  }
)


# filter_missing_outcome ------------------------------------------------------
setMethod(
  "filter_missing_outcome",
  signature(data = "dataObject"),
  function(data, is_validation = FALSE) {
    # Removes samples that lack valid outcome data.
    
    # Nothing to filter in an empty data set.
    if (is_empty(data)) return(data)
    
    # Determine per-row outcome validity, depending on the outcome type.
    if (data@outcome_type == "survival") {
      valid_rows <- is_valid_data(data@data[["outcome_time"]]) &
        is_valid_data(data@data[["outcome_event"]])
      
    } else if (data@outcome_type %in% c("binomial", "multinomial", "continuous", "count")) {
      valid_rows <- is_valid_data(data@data[["outcome"]])
      
    } else {
      stop(paste0("Implementation for outcome_type ", data@outcome_type, " is missing."))
    }
    
    # For validation data, an entirely missing outcome may indicate a
    # prospective study; keep all samples in that case.
    if (is_validation && all(!valid_rows)) valid_rows <- !valid_rows
    
    # Retain only the rows with valid outcome data.
    data@data <- data@data[(valid_rows), ]
    
    return(data)
  }
)



# filter_bad_samples -----------------------------------------------------------
setMethod(
  "filter_bad_samples",
  signature(data = "dataObject"),
  function(data, threshold) {
    # Removes samples that exceed the allowed fraction of missing feature
    # values.
    
    # An empty data set requires no filtering.
    if (is_empty(data)) return(data)
    
    # Identify the feature columns.
    feature_columns <- get_feature_columns(x = data)
    
    # Count missing values per row, one feature column at a time to keep the
    # memory footprint small.
    missing_count <- numeric(nrow(data@data))
    for (current_feature in feature_columns) {
      missing_count <- missing_count + !is_valid_data(data@data[[current_feature]])
    }
    
    # threshold is a fraction; express it as an absolute count of features.
    max_missing <- threshold * length(feature_columns)
    
    # Keep rows at or below the allowed number of missing values.
    data@data <- data@data[(missing_count <= max_missing)]
    
    return(data)
  }
)


# transform_features methods ---------------------------------------------------

## transform_features (dataObject) ---------------------------------------------
setMethod(
  "transform_features",
  signature(data = "dataObject"),
  function(
    data,
    feature_info_list,
    invert = FALSE) {
    # Applies (or inverts) feature transformations on a dataObject.
    
    # Guard against transforming data twice.
    if (!invert && .as_preprocessing_level(data) >= "transformation") {
      ..error_reached_unreachable_code(
        "transform_features,dataObject: attempting to transform data that are already transformed.")
    }
    
    # Record the new preprocessing level: a forward pass reaches
    # "transformation", inversion falls back to "signature".
    data@preprocessing_level <- if (invert) "signature" else "transformation"
    
    # Empty data require no work.
    if (is_empty(data)) return(data)
    
    # Dispatch to the data.table method for the actual transformation of the
    # feature columns.
    data@data <- transform_features(
      data = data@data,
      feature_info_list = feature_info_list,
      features = get_feature_columns(x = data),
      invert = invert)
    
    return(data)
  }
)


## transform_features (data.table) ---------------------------------------------
setMethod(
  "transform_features",
  signature(data = "data.table"),
  function(
    data,
    feature_info_list,
    features,
    invert = FALSE) {
    # Applies (or inverts) transformation parameters to feature columns of a
    # data.table.
    
    # Empty data require no work.
    if (is_empty(data)) return(data)
    
    # Compute the transformed column for every feature; the result is a list
    # named by feature.
    feature_names <- stats::setNames(features, features)
    replacement_values <- lapply(
      feature_names,
      function(feature) {
        apply_feature_info_parameters(
          object = feature_info_list[[feature]]@transformation_parameters,
          data = data[[feature]],
          invert = invert)
      })
    
    # Substitute the transformed columns into the data.
    return(update_with_replacement(
      data = data,
      replacement_list = replacement_values))
  }
)


# normalise_features methods ---------------------------------------------------

## normalise_features (dataObject) ---------------------------------------------
setMethod(
  "normalise_features",
  signature(data = "dataObject"),
  function(
    data,
    feature_info_list,
    invert = FALSE) {
    # Applies (or inverts) global normalisation of feature values on a
    # dataObject.
    #
    # Fix: the scalar guard conditions below used the elementwise & operator;
    # they now use the scalar, short-circuiting && operator, consistent with
    # transform_features and batch_normalise_features.
    
    # Check if normalisation was already performed.
    if (!invert && .as_preprocessing_level(data) >= "normalisation") {
      ..error_reached_unreachable_code(
        "normalise_features,dataObject: attempting to normalise data that are already normalised.")
    }
    
    # Check if the previous step (transformation) was conducted.
    if (!invert && .as_preprocessing_level(data) < "transformation") {
      ..error_reached_unreachable_code(
        "normalise_features,dataObject: data should be transformed prior to normalisation.")
    }
    
    # Update the preprocessing_level: forward to "normalisation", or back to
    # "transformation" on inversion.
    if (!invert) data@preprocessing_level <- "normalisation"
    if (invert) data@preprocessing_level <- "transformation"
    
    # Check if data is empty
    if (is_empty(data)) return(data)
    
    # Find the columns containing features
    feature_columns <- get_feature_columns(x = data)
    
    # Apply normalisation via the data.table method.
    data@data <- normalise_features(
      data = data@data,
      feature_info_list = feature_info_list,
      features = feature_columns,
      invert = invert)
    
    return(data)
  }
)


## normalise_features (data.table) ---------------------------------------------
setMethod(
  "normalise_features",
  signature(data = "data.table"),
  function(
    data,
    feature_info_list,
    features,
    invert = FALSE) {
    # Applies (or inverts) normalisation parameters to feature columns of a
    # data.table.
    
    # Empty data require no work.
    if (is_empty(data)) return(data)
    
    # Compute the normalised column for every feature; the result is a list
    # named by feature.
    feature_names <- stats::setNames(features, features)
    replacement_values <- lapply(
      feature_names,
      function(feature) {
        apply_feature_info_parameters(
          object = feature_info_list[[feature]]@normalisation_parameters,
          data = data[[feature]],
          invert = invert)
      })
    
    # Substitute the normalised columns into the data.
    return(update_with_replacement(
      data = data,
      replacement_list = replacement_values))
  }
)


# batch_normalise_features -----------------------------------------------------
setMethod(
  "batch_normalise_features",
  signature(data = "dataObject"),
  function(
    data,
    feature_info_list,
    cl = NULL,
    invert = FALSE) {
    # Applies (or inverts) batch normalisation of feature values using
    # batch-specific parameters from feature_info_list.
    # Note: cl is accepted for interface consistency but is not used in this
    # method body.
    
    # Check if batch normalisation was already performed.
    if (!invert && .as_preprocessing_level(data) >= "batch_normalisation") {
      ..error_reached_unreachable_code(paste0(
        "batch_normalise_features,dataObject: attempting to batch normalise data ",
        "that are already batch normalised."))
    }
    
    # Check if the previous step (normalisation) was conducted.
    if (!invert && .as_preprocessing_level(data) < "normalisation") {
      ..error_reached_unreachable_code(paste0(
        "batch_normalise_features,dataObject: data should be normalised globally ",
        "prior to batch normalisation."))
    }
    
    # Update the preprocessing_level.
    if (!invert) data@preprocessing_level <- "batch_normalisation"
    if (invert) data@preprocessing_level <- "normalisation"
    
    # Check if data is empty
    if (is_empty(data)) return(data)
    
    # Find the columns containing features
    feature_columns <- get_feature_columns(x = data)
    
    # Update feature_info_list by adding info for missing batches
    feature_info_list <- add_batch_normalisation_parameters(
      feature_info_list = feature_info_list[feature_columns],
      data = data)
    
    # Apply batch-normalisation. Note that the full dataObject is passed in,
    # because the batch identifier column is needed alongside each feature
    # column.
    batch_normalised_list <- lapply(
      feature_columns,
      function(ii, data, feature_info_list, invert) {
        
        # Dispatch to apply-method.
        x <- apply_feature_info_parameters(
          object = feature_info_list[[ii]]@batch_normalisation_parameters,
          data = data@data[, mget(c(ii, get_id_columns("batch")))],
          invert = invert)
        
        return(x)
      },
      data = data,
      feature_info_list = feature_info_list,
      invert = invert)
    
    # Update name of data in columns
    names(batch_normalised_list) <- feature_columns
    
    # Update with replacement in the data object
    data <- update_with_replacement(
      data = data,
      replacement_list = batch_normalised_list)
    
    return(data)
  }
)



# impute_features --------------------------------------------------------------
setMethod(
  "impute_features",
  signature(data = "dataObject"),
  function(
    data,
    feature_info_list,
    cl = NULL) {
    # Imputes missing feature values in two passes: a univariate pass to set
    # initial estimates, followed by a multivariate pass.
    
    # Guard against imputing data twice.
    if (.as_preprocessing_level(data) >= "imputation") {
      ..error_reached_unreachable_code(
        "impute_features,dataObject: attempting to impute data that already have been imputed.")
    }
    
    # Imputation requires batch-normalised data.
    if (.as_preprocessing_level(data) < "batch_normalisation") {
      ..error_reached_unreachable_code(
        "impute_features,dataObject: data should be batch normalised prior to imputation.")
    }
    
    # Record the newly attained preprocessing level.
    data@preprocessing_level <- "imputation"
    
    # Empty data require no imputation.
    if (is_empty(data)) return(data)
    
    # Data without features require no imputation either.
    if (!has_feature_data(x = data)) return(data)
    
    # First pass: univariate imputation provides initial estimates.
    imputed_data <- .impute_features(
      data = data,
      feature_info_list = feature_info_list,
      initial_imputation = TRUE)
    
    # Second pass: multivariate imputation, with the original data serving as
    # the mask of genuinely missing values.
    return(.impute_features(
      data = imputed_data,
      feature_info_list = feature_info_list,
      initial_imputation = FALSE,
      mask_data = data))
  }
)


# cluster_features -------------------------------------------------------------
setMethod(
  "cluster_features",
  signature(data = "dataObject"),
  function(data, feature_info_list) {
    # Replaces individual features by their cluster representations, as
    # defined by the clustering table derived from feature_info_list.
    
    # Suppress NOTES due to non-standard evaluation in data.table
    feature_required <- feature_name <- NULL
    
    # Guard against clustering data twice.
    if (.as_preprocessing_level(data) >= "clustering") {
      ..error_reached_unreachable_code(
        "cluster_features,dataObject: attempting to cluster data that already have been clustered.")
    }
    
    # Check if the previous step (imputation) was conducted.
    if (.as_preprocessing_level(data) < "imputation") {
      ..error_reached_unreachable_code(
        "cluster_features,dataObject: data should be imputed prior to clustering.")
    }
    
    # Update the attained processing level.
    data@preprocessing_level <- "clustering"
    
    # Check if data is empty
    if (is_empty(data)) return(data)
    
    # Check if data has features
    if (!has_feature_data(x = data)) return(data)
    
    # Find the columns containing features
    feature_columns <- get_feature_columns(x = data)
    
    # Derive clustering table.
    cluster_table <- .create_clustering_table(
      feature_info_list = feature_info_list,
      selected_features = feature_columns)
    
    # Update data using the clustering table. Note that only features
    # that are both required and present are processed. The table is split by
    # cluster, and each cluster is processed by set_clustered_data.
    clustered_data <- lapply(
      split(
        cluster_table[feature_required == TRUE & feature_name %in% feature_columns],
        by = "cluster_name"),
      set_clustered_data,
      data = data,
      feature_info_list = feature_info_list)
    
    # Attach the clustered data: non-feature columns are kept, and the list
    # of clustered columns is converted to a data.table via setDT.
    data@data <- cbind(
      data@data[, mget(get_non_feature_columns(data))],
      data.table::setDT(clustered_data))
    
    return(data)
  }
)



# update_with_replacement methods ----------------------------------------------

## update_with_replacement (dataObject) ----------------------------------------
setMethod(
  "update_with_replacement",
  signature(data = "dataObject"),
  function(data, replacement_list) {
    # Replaces columns in the underlying table of a dataObject by dispatching
    # to the data.table method.
    updated_table <- update_with_replacement(
      data = data@data,
      replacement_list = replacement_list)
    
    # Store the updated table back into the dataObject.
    data@data <- updated_table
    
    return(data)
  }
)



## update_with_replacement (data.table) ----------------------------------------
setMethod(
  "update_with_replacement",
  signature(data = "data.table"),
  function(data, replacement_list) {
    # Replaces columns of a data.table with the entries of a named
    # replacement list. A copy is made first so the input table is not
    # modified by reference.
    updated_table <- data.table::copy(data)
    
    # Assign each replacement column by name.
    for (column_name in names(replacement_list)) {
      updated_table[, (column_name) := replacement_list[[column_name]]]
    }
    
    return(updated_table)
  })


# select_features --------------------------------------------------------------
setMethod(
  "select_features",
  signature(data = "dataObject"),
  function(data, features) {
    # Slices the data so that only the requested features, plus all
    # non-feature columns, remain.
    
    # Identifier and outcome columns are always retained.
    retained_columns <- get_non_feature_columns(x = data)
    
    # Every requested feature must exist as a column.
    if (length(features) > 0 && !all(features %in% colnames(data@data))) {
      logger_stop("Not all features were found in the data set.")
    }
    
    # Combine the retained columns with the requested features.
    selected_columns <- unique(c(retained_columns, features))
    
    # Only copy and subset when the selection actually differs from the
    # current set of columns.
    if (!setequal(selected_columns, colnames(data@data))) {
      data@data <- data.table::copy(data@data[, mget(selected_columns)])
    }
    
    return(data)
  }
)



# get_required_features methods ------------------------------------------------

## get_required_features (dataObject) ------------------------------------------
setMethod(
  "get_required_features",
  signature(x = "dataObject"),
  function(
    x,
    feature_info_list,
    features = NULL,
    exclude_signature = FALSE,
    exclude_novelty = FALSE,
    ...) {
    # Determines the features required for a dataObject.
    
    # Features passed through the argument take precedence and are treated as
    # externally provided.
    is_external <- !is.null(features)
    
    if (!is_external) {
      if (x@delay_loading) {
        # Data are not loaded yet: derive features from the feature info
        # list.
        features <- get_available_features(
          feature_info_list = feature_info_list,
          exclude_signature = exclude_signature,
          exclude_novelty = exclude_novelty)
        
      } else {
        # Read features directly from the loaded data.
        features <- get_feature_columns(x)
      }
    }
    
    # Delegate to the internal routine.
    return(.get_required_features(
      features = features,
      feature_info_list = feature_info_list,
      is_clustered = .as_preprocessing_level(x@preprocessing_level) == "clustering",
      is_external = is_external,
      exclude_signature = exclude_signature,
      exclude_novelty = exclude_novelty))
  }
)



## get_required_features (character) -------------------------------------------
setMethod(
  "get_required_features",
  signature(x = "character"),
  function(
    x,
    feature_info_list,
    is_clustered,
    exclude_signature = FALSE,
    exclude_novelty = FALSE,
    ...) {
    # Determines required features from feature names provided directly.
    
    # No feature names means nothing is required.
    if (length(x) == 0) return(NULL)
    
    # Directly provided feature names are always treated as external.
    return(.get_required_features(
      features = x,
      feature_info_list = feature_info_list,
      is_clustered = is_clustered,
      is_external = TRUE,
      exclude_signature = exclude_signature,
      exclude_novelty = exclude_novelty))
  }
)



## get_required_features (list) ------------------------------------------------
setMethod(
  "get_required_features",
  signature(x = "list"),
  function(
    x,
    exclude_signature = FALSE,
    exclude_novelty = FALSE,
    ...) {
    # Determines required features from a list of featureInfo objects.
    
    # An empty list yields no required features.
    if (is_empty(x)) return(NULL)
    
    # Sanity check: x should be a list of featureInfo objects.
    if (!all(sapply(x, is, "featureInfo"))) {
      ..error_reached_unreachable_code(
        "get_required_features,list: expected a list of featureInfo objects.")
    }
    
    # Derive the available features from the featureInfo objects themselves.
    features <- get_available_features(
      feature_info_list = x,
      exclude_signature = exclude_signature,
      exclude_novelty = exclude_novelty)
    
    # Delegate to the internal routine. Features derived from featureInfo
    # objects are internal and not clustered.
    return(.get_required_features(
      features = features,
      feature_info_list = x,
      is_clustered = FALSE,
      is_external = FALSE,
      exclude_signature = exclude_signature,
      exclude_novelty = exclude_novelty))
  }
)



## get_required_features (NULL) ------------------------------------------------
setMethod(
  "get_required_features",
  signature(x = "NULL"),
  function(x, ...) {
    # Without any input, no features are required.
    return(NULL)
  }
)



.get_required_features <- function(
    features,
    feature_info_list,
    is_clustered,
    is_external,
    exclude_signature = FALSE,
    exclude_novelty = FALSE,
    exclude_imputation = FALSE,
    ...) {
  # Determines the set of features that are actually required.
  #
  # What are required features? Required features are features that are:
  #
  # 1. Directly used for clustering (representative features).
  #
  # 2. Used for imputation of features mentioned in 1).
  #
  # Arguments:
  #   features: character vector of (cluster or feature) names to resolve.
  #   feature_info_list: list of featureInfo objects.
  #   is_clustered: whether `features` contains cluster names (TRUE) or
  #     plain feature names (FALSE).
  #   is_external: whether `features` were provided externally; enables
  #     additional sanity checks.
  #   exclude_signature, exclude_novelty: passed on when determining
  #     available features.
  #   exclude_imputation: if TRUE, features only needed for imputation are
  #     not added.
  #
  # Returns a character vector of required feature names, or NULL.
  
  # Suppress NOTES due to non-standard evaluation in data.table
  cluster_name <- feature_name <- feature_required <- NULL
  
  # If the features are empty.
  if (is_empty(features) && is_external) return(NULL)
  
  # Generate a cluster table.
  cluster_table <- .create_clustering_table(feature_info_list = feature_info_list)
  
  # Identify which features are required to form clusters. The matching
  # column depends on whether the data are already clustered.
  if (is_clustered) {
    # Data are clustered: match on cluster names.
    required_features <- cluster_table[
      cluster_name %in% features & feature_required == TRUE]$feature_name
    matchable_features <- cluster_table$cluster_name
    
  } else {
    # Data are not yet clustered: match on feature names.
    required_features <- cluster_table[
      feature_name %in% features & feature_required == TRUE]$feature_name
    matchable_features <- cluster_table$feature_name
  }
  
  # Sanity check. All externally provided features should be present in the
  # cluster table.
  if (is_external) {
    if (!all(features %in% matchable_features)) {
      ..error_reached_unreachable_code(paste0(
        ".get_required_features: some features do not appear in the ",
        "cluster table. Potentially, an incomplete feature_info_list was passed."))
    }
  }
  
  # Determine available features from feature info list.
  available_features <- get_available_features(
    feature_info_list = feature_info_list,
    exclude_signature = exclude_signature,
    exclude_novelty = exclude_novelty)
  
  # Sanity check. All features that we provide externally should be
  # present in the available features.
  if (is_external) {
    if (length(required_features) == 0) {
      ..error_reached_unreachable_code(paste0(
        ".get_required_features: there are no required features, ",
        "whereas at least one is expected."))
    }
    
    if (!all(required_features %in% available_features)) {
      ..error_reached_unreachable_code(paste0(
        ".get_required_features: there are required features for clustering ",
        "that are not available."))
    }
  }
  
  # Determine the intersect of the available features and features.
  required_features <- intersect(available_features, required_features)
  
  # Return NULL if no features are required.
  if (length(required_features) == 0) return(NULL)
  
  if (!exclude_imputation) {
    # Now, additionally select which features are required for imputation.
    required_features <- unique(unlist(lapply(
      feature_info_list[required_features],
      function(x) x@required_features)))
  }
  
  return(required_features)
}

# get_model_features methods ---------------------------------------------------

## get_model_features (dataObject) ---------------------------------------------
setMethod(
  "get_model_features",
  signature(x = "dataObject"),
  function(
    x,
    feature_info_list,
    features = NULL,
    exclude_signature = FALSE,
    exclude_novelty = FALSE,
    ...) {
    # Determines model features for a dataObject. Features may be supplied
    # externally through the `features` argument; otherwise they are derived
    # from the dataset itself.
    
    # Check if features are provided externally.
    is_external <- !is.null(features)
    
    # Create features from columns in the dataset, if unset.
    if (!is_external && x@delay_loading) {
      # Get features from the feature info list.
      # NOTE(review): `x` (a dataObject) is passed as feature_info_list here,
      # not the `feature_info_list` argument — confirm get_available_features
      # dispatches on / accepts dataObject.
      features <- get_available_features(
        feature_info_list = x,
        exclude_signature = exclude_signature,
        exclude_novelty = exclude_novelty)
      
    } else if (!is_external) {
      # Get features directly from the columns present in the data.
      features <- get_feature_columns(x)
    }
    
    # Pass to underlying function. feature_info_list is named for consistency
    # with the sibling methods.
    return(.get_required_features(
      features = features,
      feature_info_list = feature_info_list,
      is_clustered = .as_preprocessing_level(x@preprocessing_level) == "clustering",
      is_external = is_external,
      exclude_signature = exclude_signature,
      exclude_novelty = exclude_novelty,
      exclude_imputation = TRUE,
      ...))
  }
)



## get_model_features (character) ----------------------------------------------
setMethod(
  "get_model_features",
  signature(x = "character"),
  function(
    x,
    feature_info_list,
    is_clustered,
    exclude_signature = FALSE,
    exclude_novelty = FALSE,
    ...) {
    # Method for feature names supplied directly as a character vector.
    # These are treated as external; imputation-only features are excluded.
    
    model_features <- .get_required_features(
      features = x,
      feature_info_list = feature_info_list,
      is_clustered = is_clustered,
      is_external = TRUE,
      exclude_signature = exclude_signature,
      exclude_novelty = exclude_novelty,
      exclude_imputation = TRUE,
      ...)
    
    return(model_features)
  }
)



## get_model_features (NULL) ---------------------------------------------------
setMethod(
  "get_model_features",
  signature(x = "NULL"),
  function(x, ...) {
    # Absent input yields no model features.
    NULL
  }
)



create_data_column_info <- function(settings) {
  # Creates a table mapping internal column names (identifier and outcome
  # columns) to the external, user-provided column names found in `settings`.
  #
  # Arguments:
  #   settings: settings list; reads settings$data$sample_col, batch_col,
  #     series_col, outcome_type and outcome_col.
  #
  # Returns a data.table with columns "type", "internal" and "external".
  
  # Read from settings. If not set, these will be NULL.
  sample_id_column <- settings$data$sample_col
  batch_id_column <- settings$data$batch_col
  series_id_column <- settings$data$series_col
  
  # Replace any missing column names by NA.
  if (is.null(sample_id_column)) sample_id_column <- NA_character_
  if (is.null(batch_id_column)) batch_id_column <- NA_character_
  if (is.null(series_id_column)) series_id_column <- NA_character_
  
  # Repetition column ids are only internal.
  repetition_id_column <- NA_character_
  
  # Create table with identifier columns. The order matches get_id_columns().
  data_info_table <- data.table::data.table(
    "type" = c(
      "batch_id_column",
      "sample_id_column",
      "series_id_column",
      "repetition_id_column"),
    "internal" = 
      get_id_columns(),
    "external" = c(
      batch_id_column,
      sample_id_column,
      series_id_column,
      repetition_id_column))
  
  outcome_type <- settings$data$outcome_type
  
  if (outcome_type %in% c(
    "survival", "competing_risk",
    "binomial", "multinomial", "continuous", "count")) {
    
    # Find internal and external outcome column names. Survival-type outcomes
    # have two internal columns (time and event); the others have one.
    internal_outcome_columns <- get_outcome_columns(outcome_type)
    external_outcome_columns <- settings$data$outcome_col
    
    # If no external outcome columns were provided, fill with as many NA
    # values as there are internal outcome columns.
    if (is.null(external_outcome_columns)) {
      external_outcome_columns <- rep_len(
        NA_character_, length(internal_outcome_columns))
    }
    
    # Add to table. The "type" value is recycled to the number of columns.
    outcome_info_table <- data.table::data.table(
      "type" = "outcome_column",
      "internal" = internal_outcome_columns,
      "external" = external_outcome_columns)
    
  } else if (outcome_type %in% c("unsupervised")) {
    
    # There is no outcome for unsupervised learners.
    outcome_info_table <- NULL
    
  } else {
    ..error_no_known_outcome_type(outcome_type = outcome_type)
  }
  
  # Combine into one table and return. rbind with NULL drops the NULL part.
  return(rbind(data_info_table, outcome_info_table))
}

Try the familiar package in your browser

Any scripts or data that you put into this service are public.

familiar documentation built on Sept. 30, 2024, 9:18 a.m.