# R/FamiliarDataComputationICE.R
#
# Defines functions .check_anchor_value ..anchor_ice_values_2D ..anchor_ice_values_1D ..restrict_ice_samples ..update_ice_and_pd_output .update_ice_and_pd_output .create_pd_object .create_ice_and_pd_objects .create_feature_range ...extract_ice_data ..extract_ice_data .extract_ice

#' @include FamiliarS4Generics.R
#' @include FamiliarS4Classes.R
NULL

# familiarDataElementIndividualConditionalExpectation object -------------------
# Data element holding individual conditional expectation (ICE) curve data.
# Inherits all slots (data, identifiers, value_column, ...) from
# familiarDataElement without adding new ones.
setClass(
  "familiarDataElementIndividualConditionalExpectation",
  contains = "familiarDataElement")



# familiarDataElementPartialDependence object ----------------------------------
# Data element holding partial dependence (PD) data, i.e. ICE curves averaged
# over samples. Inherits all slots from familiarDataElement.
setClass(
  "familiarDataElementPartialDependence",
  contains = "familiarDataElement")



# extract_ice (generic) --------------------------------------------------------

#'@title Internal function to extract data for individual conditional
#'  expectation plots.
#'
#'@description Computes data for individual conditional expectation plots and
#'  partial dependence plots for the model(s) in a `familiarEnsemble` object.
#'
#'@param features Names of the feature or features (2) assessed simultaneously.
#'  By default `NULL`, which means that all features are assessed one-by-one.
#'@param feature_x_range When one or two features are defined using `features`,
#'  `feature_x_range` can be used to set the range of values for the first
#'  feature. For numeric features, a vector of two values is assumed to indicate
#'  a range from which `n_sample_points` are uniformly sampled. A vector of more
#'  than two values is interpreted as is, i.e. these represent the values to be
#'  sampled. For categorical features, values should represent a (sub)set of
#'  available levels.
#'@param feature_y_range As `feature_x_range`, but for the second feature in
#'  case two features are defined.
#'@param n_sample_points Number of points used to sample continuous features.
#'@inheritParams extract_data
#'
#'@return A data.table containing individual conditional expectation plot data.
#'@md
#'@keywords internal
setGeneric(
  "extract_ice",
  function(
    object,
    data,
    cl = NULL,
    features = NULL,
    feature_x_range = NULL,
    feature_y_range = NULL,
    n_sample_points = 50L,
    ensemble_method = waiver(),
    evaluation_times = waiver(),
    sample_limit = waiver(),
    detail_level = waiver(),
    estimation_type = waiver(),
    aggregate_results = waiver(),
    confidence_level = waiver(),
    bootstrap_ci_method = waiver(),
    is_pre_processed = FALSE,
    message_indent = 0L,
    verbose = FALSE,
    ...) {
    # S4 generic; the actual work is done in class-specific methods, e.g. the
    # familiarEnsemble method defined below. waiver() defaults signal that the
    # value should be taken from the object's stored settings.
    standardGeneric("extract_ice")
  }
)



# extract_ice (familiarEnsemble) -----------------------------------------------
# extract_ice (familiarEnsemble) -----------------------------------------------

# Computes ICE and PD data for the models in a familiarEnsemble.
#
# Workflow: resolve waiver() arguments from the ensemble's stored settings,
# validate all inputs, then hand the work to extract_dispatcher, which calls
# .extract_ice once per underlying model.
setMethod(
  "extract_ice",
  signature(object = "familiarEnsemble"),
  function(
    object,
    data,
    cl = NULL,
    features = NULL,
    feature_x_range = NULL,
    feature_y_range = NULL,
    n_sample_points = 50L,
    ensemble_method = waiver(),
    evaluation_times = waiver(),
    sample_limit = waiver(),
    detail_level = waiver(),
    estimation_type = waiver(),
    aggregate_results = waiver(),
    confidence_level = waiver(),
    bootstrap_ci_method = waiver(),
    is_pre_processed = FALSE,
    message_indent = 0L,
    verbose = FALSE,
    ...) {
    
    # Message extraction start
    if (is.null(features)) {
      logger_message(
        paste0(
          "Computing individual conditional expectation and partial dependence ",
          "data for features in the dataset."),
        indent = message_indent,
        verbose = verbose)
      
    } else {
      logger_message(
        paste0(
          "Computing individual conditional expectation and partial dependence ",
          "data for the selected features."),
        indent = message_indent,
        verbose = verbose)
    }
    
    # Load evaluation_times from the object settings attribute, if it is not provided.
    if (is.waive(evaluation_times)) {
      evaluation_times <- object@settings$eval_times
    }
    
    # Check evaluation_times argument: times must be strictly positive for
    # survival outcomes.
    if (object@outcome_type %in% c("survival")) {
      sapply(
        evaluation_times,
        .check_number_in_valid_range,
        var_name = "evaluation_times",
        range = c(0.0, Inf),
        closed = c(FALSE, TRUE))
    }
    
    # Check n_sample_points argument
    .check_number_in_valid_range(
      x = n_sample_points,
      var_name = "n_sample_points",
      range = c(1, Inf))
    
    # Obtain ensemble method from stored settings, if required.
    if (is.waive(ensemble_method)) {
      ensemble_method <- object@settings$ensemble_method
    }
    
    # Check ensemble_method argument
    .check_parameter_value_is_valid(
      x = ensemble_method,
      var_name = "ensemble_method",
      values = .get_available_ensemble_prediction_methods())
    
    # Load confidence alpha from object settings attribute if not provided
    # externally.
    if (is.waive(confidence_level)) {
      confidence_level <- object@settings$confidence_level
    }
    
    # Check confidence_level input argument: must lie in the open (0, 1)
    # interval.
    .check_number_in_valid_range(
      x = confidence_level,
      var_name = "confidence_level",
      range = c(0.0, 1.0),
      closed = c(FALSE, FALSE))
    
    # Load the bootstrap method
    if (is.waive(bootstrap_ci_method)) {
      bootstrap_ci_method <- object@settings$bootstrap_ci_method
    }
    
    .check_parameter_value_is_valid(
      x = bootstrap_ci_method,
      var_name = "bootstrap_ci_method",
      values = .get_available_bootstrap_confidence_interval_methods())
    
    # Check the sample limit.
    sample_limit <- .parse_sample_limit(
      x = sample_limit,
      object = object,
      default = Inf,
      data_element = "ice_data")
    
    # Check the level detail.
    detail_level <- .parse_detail_level(
      x = detail_level,
      object = object,
      default = "hybrid",
      data_element = "ice_data")
    
    # Check the estimation type.
    estimation_type <- .parse_estimation_type(
      x = estimation_type,
      object = object,
      default = "bootstrap_confidence_interval",
      data_element = "ice_data",
      detail_level = detail_level,
      has_internal_bootstrap = FALSE)
    
    # Check whether results should be aggregated.
    aggregate_results <- .parse_aggregate_results(
      x = aggregate_results,
      object = object,
      default = TRUE,
      data_element = "ice_data")
    
    # Test if models are properly loaded
    if (!is_model_loaded(object = object)) ..error_ensemble_models_not_loaded()
    
    # Test if any model in the ensemble was successfully trained.
    if (!model_is_trained(object = object)) return(NULL)
    
    # Generate a prototype data element that carries the shared settings; each
    # model/feature combination derives its elements from this prototype.
    proto_data_element <- new(
      "familiarDataElementIndividualConditionalExpectation",
      detail_level = detail_level,
      estimation_type = estimation_type,
      confidence_level = confidence_level,
      bootstrap_ci_method = bootstrap_ci_method)
    
    # Generate elements to send to dispatch.
    ice_data <- extract_dispatcher(
      FUN = .extract_ice,
      has_internal_bootstrap = FALSE,
      cl = cl,
      object = object,
      data = data,
      features = features,
      feature_x_range = feature_x_range,
      feature_y_range = feature_y_range,
      sample_limit = sample_limit,
      n_sample_points = n_sample_points,
      proto_data_element = proto_data_element,
      is_pre_processed = is_pre_processed,
      ensemble_method = ensemble_method,
      evaluation_times = evaluation_times,
      # Fix: pass the parsed aggregate_results value instead of a hard-coded
      # TRUE, so the user-provided / stored setting is actually respected.
      # Previously .parse_aggregate_results was computed but never used.
      aggregate_results = aggregate_results,
      message_indent = message_indent + 1L,
      verbose = verbose)
    
    return(ice_data)
  }
)



# Computes ICE / PD data elements for a single model within the ensemble.
# Called by extract_dispatcher from the extract_ice method.
#
# Arguments:
#   object              - a (possibly detached) familiarModel; loaded here.
#   data                - input dataset; pre-processed within this function.
#   proto_data_element  - prototype ICE data element carrying shared settings.
#   evaluation_times    - time points for survival outcomes (may be NULL).
#   features            - NULL (all model features, one-by-one), one feature,
#                         or a pair of features assessed simultaneously.
#   sample_limit        - maximum number of samples to use.
#   aggregate_results   - whether to compute estimates from the raw elements.
#   n_models            - number of models handled by the dispatcher; used to
#                         decide whether per-feature messages are shown.
#
# Returns a list of merged data elements, or NULL if no usable data remain.
.extract_ice <- function(
    object,
    data,
    proto_data_element,
    evaluation_times = NULL,
    features = NULL,
    sample_limit,
    aggregate_results,
    n_models,
    is_pre_processed = FALSE,
    cl,
    message_indent = 0L,
    verbose = FALSE,
    progress_bar = FALSE,
    ...) {
  
  # Ensure that the object is loaded
  object <- load_familiar_object(object)
  
  # Add model name.
  proto_data_element <- add_model_name(proto_data_element, object = object)
  
  # Retrieve input data.
  data <- process_input_data(
    object = object,
    data = data,
    stop_at = "signature",
    is_pre_processed = is_pre_processed)
  
  # Maintain only important features. The current set is based on the
  # required features.
  data <- filter_features(
    data = data,
    available_features = object@model_features)
  
  # Check if the input data is not empty
  if (is_empty(data)) return(NULL)
  
  # Select samples up to sample_limit. A fixed seed keeps the subsample
  # reproducible across calls.
  data <- get_subsample(
    data = data,
    size = sample_limit,
    seed = 0L)
  
  # Aggregate data.
  data <- aggregate_data(data)
  
  # Add evaluation time as a identifier to the data element, creating one
  # element per time point for survival outcomes.
  if (length(evaluation_times) > 0 && object@outcome_type == "survival") {
    data_elements <- add_data_element_identifier(
      x = proto_data_element, 
      evaluation_time = evaluation_times)
    
  } else {
    data_elements <- list(proto_data_element)
  }
  
  if (!is.null(features)) {
    
    # Check that the features exist in the data set.
    # NOTE(review): unknown features are warned about but not removed from
    # `features`; they are still added as identifiers below and would raise a
    # hard error later in .create_feature_range. Confirm whether they should
    # be dropped here instead.
    if (!all(features %in% c(object@model_features))) {
      warning(paste0(
        "Data for individual conditional expectation or partial dependence plots ",
        "could not be computed for ",
        paste_s(setdiff(features, object@model_features)),
        " feature(s) as they are not used by the model."))
    }
    
    # Add features as identifier.
    if (length(features) == 1) {
      data_elements <- add_data_element_identifier(
        x = data_elements,
        feature_x = features)
      
    } else if (length(features) == 2) {
      
      if (length(unique(features)) != 2) {
        stop(paste0(
          "Data for individual conditional expectation or partial dependence plots ",
          "could not be computed as the provided features are not unique: ",
          paste_s(features), "."))
      }
      
      # 2D ICE: first feature becomes feature_x, second feature_y.
      data_elements <- add_data_element_identifier(
        x = data_elements,
        feature_x = features[1])
      data_elements <- add_data_element_identifier(
        x = data_elements,
        feature_y = features[2])
      
    } else {
      stop(paste0(
        "Data for individual conditional expectation or partial dependence plots cannot ",
        "be computed for more than 2 features simultaneously. Found: ",
        paste_s(features), "."))
    }
    
  } else {
    # Add features as identifier: one element per model feature (1D ICE).
    data_elements <- add_data_element_identifier(
      x = data_elements,
      feature_x = object@model_features)
  }
  
  # Iterate over elements. Verbose messaging is suppressed when a progress bar
  # is shown or multiple models are processed.
  data_elements <- fam_mapply(
    cl = cl,
    assign = NULL,
    FUN = ..extract_ice_data,
    data_element = data_elements,
    MoreArgs = c(
      list(
        "data" = data,
        "object" = object,
        "verbose" = verbose && !progress_bar && n_models == 1,
        "message_indent" = message_indent),
      list(...)),
    progress_bar = progress_bar,
    chopchop = TRUE)
  
  # Flatten list of data elements.
  data_elements <- unlist(data_elements)
  if (!is.list(data_elements)) data_elements <- list(data_elements)
  
  # Merge data elements
  data_elements <- merge_data_elements(data_elements)
  
  if (aggregate_results) data_elements <- .compute_data_element_estimates(x = data_elements)
  
  return(data_elements)
}



# Expands a single ICE data element over the sampled feature grid and computes
# predictions for every grid point. Returns a list of data elements (ICE and
# PD pairs, as produced by ...extract_ice_data).
..extract_ice_data <- function(
    cl = NULL,
    data_element,
    data,
    object,
    aggregate_results,
    feature_x_range = NULL,
    feature_y_range = NULL,
    n_sample_points,
    ensemble_method,
    verbose = FALSE,
    message_indent,
    ...) {
  
  # Features attached to this element.
  feature_x <- data_element@identifiers$feature_x
  feature_y <- data_element@identifiers$feature_y
  
  # Determine the values at which the x-feature is probed.
  feature_x_range <- .create_feature_range(
    feature_info = object@feature_info,
    feature = feature_x,
    column_type = class(data@data[[feature_x]]),
    feature_range = feature_x_range,
    n = n_sample_points)
  
  # Expand to one element per x-value.
  expanded_elements <- add_data_element_identifier(
    x = data_element,
    feature_x_value = feature_x_range)
  
  # Assemble the status message piecewise.
  message_parts <- paste0(
    "Computing ICE / PD curves for \"", feature_x, "\"")
  
  if (!is.null(feature_y)) {
    # 2D ICE: also probe the y-feature and expand over its values.
    feature_y_range <- .create_feature_range(
      feature_info = object@feature_info,
      feature = feature_y,
      column_type = class(data@data[[feature_y]]),
      feature_range = feature_y_range,
      n = n_sample_points)
    
    expanded_elements <- add_data_element_identifier(
      x = expanded_elements,
      feature_y_value = feature_y_range)
    
    message_parts <- c(
      message_parts,
      paste0(" and \"", feature_y, "\""))
  }
  
  # Close the message, mentioning the evaluation time when present.
  if (length(data_element@identifiers$evaluation_time) > 0) {
    message_parts <- c(
      message_parts,
      paste0(" at time ", data_element@identifiers$evaluation_time, "."))
    
  } else {
    message_parts <- c(message_parts, ".")
  }
  
  logger_message(
    paste0(message_parts, collapse = ""),
    indent = message_indent,
    verbose = verbose)
  
  # Compute predictions for every grid point.
  return(lapply(
    expanded_elements,
    ...extract_ice_data,
    data = data,
    object = object,
    ensemble_method = ensemble_method))
}



# Computes predictions for a single grid point of an ICE curve: the feature(s)
# of interest are overwritten with the fixed grid value(s) for all samples, and
# the model then predicts outcomes and novelty for the modified dataset.
#
# Returns a list of ICE / PD data elements, or NULL if no valid predictions
# could be generated.
...extract_ice_data <- function(
    data_element,
    data,
    object,
    ensemble_method) {
  
  # Make a local copy of the data, so the in-place := assignments below do not
  # modify the caller's data.table.
  data@data <- data.table::copy(data@data)
  
  # Replace the feature indicated by feature_x by the value in feature_x_value.
  data@data[, (data_element@identifiers$feature_x) := data_element@identifiers$feature_x_value]
  
  # Replace the feature indicated by feature_y by the value in feature_y_value.
  if (!is.null(data_element@identifiers$feature_y)) {
    data@data[, (data_element@identifiers$feature_y) := data_element@identifiers$feature_y_value]
  }
  
  # Predict both the outcome-specific quantity and novelty.
  if (object@outcome_type %in% c("survival", "competing_risk")) {
    type <- c("survival_probability", "novelty")
    
  } else {
    type <- c("default", "novelty")
  }
  
  # Compute performance data.
  prediction_data <- .predict(
    object = object,
    data = data,
    ensemble_method = ensemble_method,
    time = data_element@identifiers$evaluation_time,
    type = type,
    aggregate_results = TRUE)
  
  # Check that valid prediction data were generated.
  if (!any_predictions_valid(
    prediction_data,
    outcome_type = object@outcome_type)) {
    return(NULL)
  }
  
  # Select prediction columns based on outcome type.
  if (object@outcome_type %in% c("survival", "competing_risk")) {
    prediction_columns <- c("survival_probability", "novelty")
    
  } else if (object@outcome_type %in% c("binomial", "multinomial")) {
    prediction_columns <- c(
      get_class_probability_name(object), "novelty")
    
  } else if (object@outcome_type %in% c("count", "continuous")) {
    prediction_columns <- c("predicted_outcome", "novelty")
    
  } else {
    ..error_no_known_outcome_type(object@outcome_type)
  }
  
  # Select only the prediction columns.
  ice_data <- prediction_data[, mget(prediction_columns)]
  
  # Create unique row names for samples and insert. These identify individual
  # ICE curves across grid points.
  ice_data[, "sample" := get_unique_row_names(x = data)]

  # Remove data with missing predictions.
  ice_data <- remove_nonvalid_predictions(
    ice_data,
    outcome_type = object@outcome_type)
  
  # Check if removing invalid predictions leaves any data.
  if (is_empty(ice_data)) return(NULL)
  
  if (object@outcome_type %in% c("binomial", "multinomial")) {
    
    # Determine class levels. For binomial outcomes only the second (positive)
    # class is kept.
    class_levels <- get_outcome_class_levels(object)
    if (object@outcome_type == "binomial") class_levels <- class_levels[2]
    
    # Find probability names.
    probability_names <- get_class_probability_name(class_levels)
    
    # Convert table from wide to long format: one row per sample and class.
    ice_data <- data.table::melt(
      data = ice_data,
      measure.vars = probability_names,
      variable.name = "positive_class",
      value.name = "probability")
    
    # Replace labels by class names.
    ice_data$positive_class <- factor(
      x = ice_data$positive_class,
      levels = probability_names,
      labels = class_levels)
    
    # Add positive class as identifier, creating one element per class.
    data_elements <- add_data_element_identifier(
      x = data_element,
      positive_class = class_levels)
    
    # Create ice and pd plot data.
    data_elements <- lapply(
      data_elements,
      .create_ice_and_pd_objects,
      data = ice_data,
      outcome_type = object@outcome_type,
      value_columns = c("probability", "novelty"))
    
   } else {
     # Create ice and pd plot data.
     data_elements <- .create_ice_and_pd_objects(
       data_element,
       data = ice_data,
       outcome_type = object@outcome_type,
       value_columns = prediction_columns)
   }
  
  return(data_elements)
}



# Determines the values at which a feature is probed for ICE / PD curves.
#
# Arguments:
#   feature_info  - named list of feature information objects.
#   feature       - name of the feature to probe.
#   feature_range - user-provided range (NULL to derive from training data).
#                   For numeric features, exactly two values are interpreted
#                   as a range to sample uniformly; more values are used as-is.
#                   For categorical features, a (sub)set of levels.
#   n             - number of sample points for numeric features.
#   column_type   - class of the feature column; integer columns are coerced.
#
# Returns a vector (factor or numeric) of unique probe values.
.create_feature_range <- function(
    feature_info,
    feature, 
    feature_range, 
    n, 
    column_type) {
  
  # Find the feature information associated with the feature.
  feature_info <- feature_info[[feature]]
  
  # Check that the feature info is present.
  if (is.null(feature_info)) {
    stop(paste0("Feature information could not be found for the ", feature, " feature."))
  }
  
  # Determine if the feature is categorical or numerical.
  if (feature_info@feature_type == "factor") {
    
    # Get the levels.
    feature_levels <- feature_info@levels
    
    if (is.null(feature_range)) {
      # Default: probe all levels.
      feature_range <- factor(feature_levels, levels = feature_levels)
      
    } else if (!all(feature_range %in% feature_levels)) {
      stop(paste0(
        "One or more levels defined in the feature range for creating ",
        "individual conditional expectation and partial dependence plots do not ",
        "match levels found in training data: ",
        paste_s(setdiff(feature_range, feature_levels)),
        ". Check for misspelled levels."))
    }
    # If not null, and no mismatches occur, use feature_range directly.
    
  } else if (feature_info@feature_type == "numeric") {
    
    if (!is.null(feature_range)) {
      # Check that values are numeric.
      if (!is.numeric(feature_range)) {
        stop(paste0(
          "Numeric values are required to define the feature range for creating ",
          "individual conditional expectation and partial dependence plots."))
      } 
      
      # Check that all values are finite. (Message fixed: previously contained
      # a duplicated space between "dependence" and "plots".)
      if (any(!is.finite(feature_range))) {
        stop(paste0(
          "Numeric values for creating individual conditional expectation and ",
          "partial dependence plots should be finite. ",
          "NA and infinite values are not allowed."))
      }
        
      # Sort values.
      feature_range <- sort(unique(feature_range))
      
      # If two values are defined, interpret as range, and sample from it.
      # If not, use feature_range directly.
      if (length(feature_range) == 2) {
        feature_range <- stats::approx(
          x = c(0.00, 1.00),
          y = feature_range,
          n = n,
          method = "linear")$y
      }
      
    } else {
      # Create the range of values from the feature distribution.
      if (n == 1) {
        # Single point: use the median from the five-number summary.
        feature_range <- as.numeric(feature_info@distribution$fivenum)[3]
        
      } else {
        # Sample the five-number summary using a monotone (Hyman-filtered)
        # spline over the quartile positions.
        feature_range <- stats::spline(
          x = c(0.00, 0.25, 0.50, 0.75, 1.00),
          y = as.numeric(feature_info@distribution$fivenum),
          n = n,
          method = "hyman")$y
      }
    }
    
    # Convert to integer if required.
    if (any(column_type == "integer")) {
      feature_range <- as.integer(feature_range)
    }
    
    # Select unique values (integer coercion may create duplicates).
    feature_range <- unique(feature_range)
    
  } else {
    ..error_reached_unreachable_code(paste0(
      ".create_feature_range: encountered unknown feature type (",
      feature_info@feature_type, ") for the ", feature, " feature."))
  }
  
  return(feature_range)
}



# Packs prediction data into an ICE data element and derives the matching PD
# element. Returns a list of the two elements.
.create_ice_and_pd_objects <- function(
    data_element,
    data,
    outcome_type,
    value_columns) {
  
  # Suppress NOTES due to non-standard evaluation in data.table
  positive_class <- NULL
  
  # Work on a local copy to avoid mutating the caller's table.
  plot_data <- data.table::copy(data)
  
  if (outcome_type %in% c("binomial", "multinomial")) {
    # Keep rows for the positive class of this element, then drop the column.
    plot_data <- plot_data[positive_class == data_element@identifiers$positive_class]
    plot_data[, "positive_class" := NULL]
  }
  
  # Build the ICE element: one curve per sample.
  ice_element <- data_element
  ice_element@grouping_column <- "sample"
  ice_element@data <- plot_data
  ice_element@value_column <- value_columns
  
  # The PD element averages the ICE curves over samples.
  pd_element <- .create_pd_object(ice_element)
  
  return(list(ice_element, pd_element))
}
  


# Derives a partial dependence element from an ICE element by averaging the
# value columns over samples (within any remaining grouping columns).
.create_pd_object <- function(ice_data_element) {
  
  # Start from a PD element that copies all slots from the ICE element.
  pd_element <- methods::new(
    "familiarDataElementPartialDependence",
    ice_data_element)
  
  value_columns <- ice_data_element@value_column
  
  # Averaging removes the per-sample dimension from the grouping.
  grouping_columns <- setdiff(ice_data_element@grouping_column, "sample")
  if (length(grouping_columns) == 0) grouping_columns <- NULL
  
  pd_element@grouping_column <- grouping_columns
  
  if (is.null(grouping_columns)) {
    # No groups remain: a single averaged row.
    pd_element@data <- pd_element@data[
      , lapply(.SD, mean, na.rm = TRUE),
      .SDcols = value_columns]
    
  } else {
    # Average within each remaining group.
    pd_element@data <- pd_element@data[
      , lapply(.SD, mean, na.rm = TRUE),
      .SDcols = value_columns,
      by = c(grouping_columns)]
  }
  
  return(pd_element)
}



# Post-processes a matched ICE / PD element pair for export or plotting:
# normalises value column names to "value" (plus CI columns), optionally
# anchors curves at user-provided anchor values, and optionally limits the
# number of ICE samples per data set.
#
# Returns a list with entries "ice_data" and "pd_data".
.update_ice_and_pd_output <- function(
    ice_data,
    pd_data,
    outcome_type,
    anchor_values = NULL,
    n_samples = NULL,
    seed) {
  
  if (is_empty(ice_data)) {
    return(list(
      "ice_data" = ice_data,
      "pd_data" = pd_data))
  } 
  
  # Find anchor value for the x-feature. It will be NULL if the current feature
  # does not appear in anchor_values.
  x_anchor <- tryCatch(
    anchor_values[[ice_data@identifiers$feature_x]],
    error = function(err) (return(NULL)))
  
  if (!is.null(x_anchor)) {
    if (length(x_anchor) > 1) {
      stop(paste0(
        "Only a single value can be provided as an anchor value for the ",
        ice_data@identifiers$feature_x, " feature."))
    } 
  }
  
  # Find anchor value for the y-feature. It will be NULL if the current feature
  # does not appear in anchor_values, or there is no y-feature associated
  # with the data.
  y_anchor <- tryCatch(
    anchor_values[[ice_data@identifiers$feature_y]],
    error = function(err) (return(NULL)))
  
  if (!is.null(y_anchor)) {
    if (length(y_anchor) > 1) {
      stop(paste0(
        "Only a single value can be provided as an anchor value for the ",
        ice_data@identifiers$feature_y, " feature."))
    }
  }
  
  # Rename main value column to a consistent name, depending on outcome type.
  if (outcome_type %in% c("binomial", "multinomial")) {
    old_value_column <- "probability"
    
  } else if (outcome_type %in% c("continuous", "count")) {
    old_value_column <- "predicted_outcome"
    
  } else if (outcome_type %in% c("survival")) {
    old_value_column <- "survival_probability"
    
  } else {
    ..error_outcome_type_not_implemented(outcome_type)
  }
  
  # Update confidence interval names. Bootstrap estimates carry additional
  # CI-bound columns that are renamed alongside the point estimate.
  if (ice_data@estimation_type %in% c("bci", "bootstrap_confidence_interval")) {
    old_value_column <- paste0(old_value_column, c("", "_ci_low", "_ci_up"))
    new_value_column <- c("value", "value_ci_low", "value_ci_up")
    
  } else {
    new_value_column <- "value"
  }
  
  # Replace column names in both the ICE and PD tables.
  data.table::setnames(
    x = ice_data@data,
    old = old_value_column,
    new = new_value_column)
  data.table::setnames(
    x = pd_data@data, 
    old = old_value_column,
    new = new_value_column)
  
  # Update value column attributes to match the renamed columns.
  ice_data@value_column <- c(
    new_value_column,
    setdiff(ice_data@value_column, old_value_column))
  pd_data@value_column <- c(
    new_value_column,
    setdiff(pd_data@value_column, old_value_column))
  
  if (!is.null(x_anchor) || !is.null(y_anchor)) {
    
    # Update individual conditional expectation data by subtracting the
    # anchor value from each curve.
    ice_data <- ..update_ice_and_pd_output(
      x = ice_data,
      x_anchor = x_anchor,
      y_anchor = y_anchor,
      value_column = new_value_column,
      outcome_type = outcome_type,
      anchor_values = anchor_values)
    
    # Update partial dependence data by re-averaging the anchored curves.
    pd_data <- .create_pd_object(ice_data)
  }
  
  if (!is.null(n_samples)) {
    # Select up to n_samples. These samples are random between experiments, but
    # fixed within one, as we explicitly set the random seed to be used.
    cropped_ice_data <- lapply(
      split(ice_data@data, by = "data_set", drop = TRUE),
      ..restrict_ice_samples,
      n_samples = n_samples,
      seed = seed)
    
    # Replace data attribute with the limited sample list.
    ice_data@data <- data.table::rbindlist(
      cropped_ice_data,
      use.names = TRUE)
  }
  
  return(list(
    "ice_data" = ice_data,
    "pd_data" = pd_data))
}



# Anchors ICE values by subtracting, for each curve, the value observed (or
# interpolated) at the anchor point(s). The data.table in x@data is modified
# by reference via `:=`.
#
# Arguments:
#   x             - ICE data element whose value columns are anchored.
#   x_anchor      - anchor value for the x-feature, or NULL.
#   y_anchor      - anchor value for the y-feature, or NULL.
#   value_column  - name(s) of value columns to anchor.
#   outcome_type  - outcome type (currently unused here; kept for interface).
#   anchor_values - full anchor list (currently unused here).
#
# Returns the updated data element; unchanged if no anchors are set.
..update_ice_and_pd_output <- function(
    x,
    x_anchor = NULL,
    y_anchor = NULL, 
    value_column, 
    outcome_type, 
    anchor_values = NULL) {
  
  # Suppress NOTES due to non-standard evaluation in data.table.
  # (Fixed: feature_x_value was previously bound twice.)
  feature_x_value <- feature_y_value <- value <- NULL
  
  # Subtract anchor values.
  if (!is.null(x_anchor) && !is.null(y_anchor)) {
  
    # Set grouping columns: everything except the feature-value columns, so
    # that each ICE curve is anchored independently.
    grouping_columns <- setdiff(
      x@grouping_column,
      c("feature_x_value", "feature_y_value"))
    
    # Subtract anchor value for 2D plots with x and y anchors.
    if (length(grouping_columns) > 0) {
      x@data[, (value_column) := lapply(
        .SD,
        ..anchor_ice_values_2D,
        x = feature_x_value,
        x_anchor = x_anchor,
        y = feature_y_value,
        y_anchor = y_anchor,
        value_offset = value,
        x_name = x@identifiers$feature_x,
        y_name = x@identifiers$feature_y),
        by = c(grouping_columns),
        .SDcols = value_column]
      
    } else {
      # Fix: this branch previously called ..anchor_ice_values_1D with 2D
      # arguments (y, y_anchor, x_name, y_name) that are not part of its
      # signature, which would raise an "unused arguments" error at runtime.
      # The 2D anchoring function is the correct target here.
      x@data[, (value_column) := lapply(
        .SD,
        ..anchor_ice_values_2D,
        x = feature_x_value,
        x_anchor = x_anchor,
        y = feature_y_value,
        y_anchor = y_anchor,
        value_offset = value,
        x_name = x@identifiers$feature_x,
        y_name = x@identifiers$feature_y),
        .SDcols = value_column]
    }
    
  } else if (!is.null(x_anchor)) {
    # Set grouping columns.
    grouping_columns <- setdiff(x@grouping_column, c("feature_x_value"))
    
    # Subtract anchor value for x anchor.
    if (length(grouping_columns) > 0) {
      x@data[, (value_column) := lapply(
        .SD,
        ..anchor_ice_values_1D,
        x = feature_x_value,
        x_anchor = x_anchor,
        value_offset = value,
        name = x@identifiers$feature_x),
        by = c(grouping_columns),
        .SDcols = value_column]
      
    } else {
      x@data[, (value_column) := lapply(
        .SD,
        ..anchor_ice_values_1D,
        x = feature_x_value,
        x_anchor = x_anchor,
        value_offset = value,
        name = x@identifiers$feature_x),
        .SDcols = value_column]
    }
    
  } else if (!is.null(y_anchor)) {
    # Set grouping columns.
    grouping_columns <- setdiff(x@grouping_column, c("feature_y_value"))
    
    # Subtract anchor value for y anchor.
    if (length(grouping_columns) > 0) {
      x@data[, (value_column) := lapply(
        .SD,
        ..anchor_ice_values_1D,
        x = feature_y_value,
        x_anchor = y_anchor,
        value_offset = value,
        name = x@identifiers$feature_y),
        by = c(grouping_columns),
        .SDcols = value_column]
      
    } else {
      x@data[, (value_column) := lapply(
        .SD,
        ..anchor_ice_values_1D,
        x = feature_y_value,
        x_anchor = y_anchor,
        value_offset = value,
        name = x@identifiers$feature_y),
        .SDcols = value_column]
    }
    
  } else {
    # No anchor values were set.
    return(x)
  }
  
  return(x)
}



# Limits ICE data to at most n_samples distinct samples, drawn reproducibly
# with the given seed. Returns x unchanged when no limit applies.
..restrict_ice_samples <- function(x, n_samples, seed = NULL) {
  
  # Suppress NOTES due to non-standard evaluation in data.table
  sample <- NULL
  
  # No limit requested.
  if (is.null(n_samples)) return(x)

  # Identify the distinct samples present in the data.
  available_samples <- unique(x$sample)
  
  # Nothing to restrict if fewer samples are available than requested.
  if (length(available_samples) < n_samples) return(x)
  
  # Draw n_samples samples without replacement, using a fixed seed so the
  # selection is reproducible within an experiment.
  kept_samples <- fam_sample(
    x = available_samples,
    size = n_samples,
    replace = FALSE,
    seed = seed)
  
  return(x[sample %in% kept_samples])
}



# Shifts ICE values so the curve passes through zero at the anchor point.
#
# Arguments:
#   value_in     - value column being anchored.
#   x            - feature values along the curve.
#   x_anchor     - anchor value (validated / clamped here).
#   value_offset - reference values used to determine the baseline.
#   name         - feature name, used in messages.
#
# Returns value_in minus the baseline value at the anchor.
..anchor_ice_values_1D <- function(
    value_in, 
    x, 
    x_anchor, 
    value_offset, 
    name) {
  
  # Validate the anchor (may clamp out-of-range numeric anchors).
  x_anchor <- .check_anchor_value(
    x = x,
    x_anchor = x_anchor,
    name = name)
  
  if (!is.numeric(x)) {
    # Categorical feature: take the value at the anchor level.
    baseline <- value_offset[x == x_anchor]
    
  } else if (x_anchor %in% x) {
    # Anchor coincides with a grid point: use its value directly.
    baseline <- value_offset[x == x_anchor]
    
  } else {
    # Anchor falls between grid points: spline-interpolate the baseline.
    baseline <- stats::spline(
      x = x,
      y = value_offset,
      xout = x_anchor)$y
  }
  
  # Subtract the baseline so the curve is zero at the anchor.
  return(value_in - baseline)
}



# Shifts 2D ICE values so the surface passes through zero at the (x, y) anchor
# point. The baseline is looked up directly when the anchor coincides with
# grid points, spline-interpolated along one axis when only one coordinate
# matches, and otherwise estimated with a small random forest.
..anchor_ice_values_2D <- function(
    value_in, 
    x,
    x_anchor, 
    y, 
    y_anchor, 
    value_offset, 
    x_name, 
    y_name) {
  # This is a bit more tricky since we need to do a surface interpolation which
  # R does not support out of the box. The akima package has a weird licence.
  
  # Check anchor values (may clamp out-of-range numeric anchors).
  x_anchor <- .check_anchor_value(
    x = x,
    x_anchor = x_anchor, 
    name = x_name)
  y_anchor <- .check_anchor_value(
    x = y, 
    x_anchor = y_anchor, 
    name = y_name)
  
  if (x_anchor %in% x && y_anchor %in% y) {
    # Simple - no interpolation needed: anchor lies on a grid point.
    anchor_value <- value_offset[x == x_anchor & y == y_anchor]
    
  } else if (x_anchor %in% x) {
    # Interpolation in y, along the grid line where x equals the anchor.
    anchor_value <- stats::spline(
      x = y[x == x_anchor],
      y = value_offset[x == x_anchor],
      xout = y_anchor)$y
    
  } else if (y_anchor %in% y) {
    # Interpolation in x, along the grid line where y equals the anchor.
    anchor_value <- stats::spline(
      x = x[y == y_anchor],
      y = value_offset[y == y_anchor],
      xout = x_anchor)$y
    
  } else {
    # Neither coordinate matches a grid point: fall back to a random forest
    # as a generic surface interpolator. ranger is an optional dependency.
    require_package(
      x = "ranger",
      purpose = "to anchor ICE/PD plot curves")
    
    # Interpolation using random forest. Fixed seed and single thread keep
    # the estimate deterministic.
    model <- ranger::ranger(
      value ~ x + y,
      data = data.table::data.table(
        "x" = x,
        "y" = y,
        "value" = value_offset),
      num.trees = 100,
      num.threads = 1L,
      seed = 1L)
    
    anchor_value <- predict(
      model,
      data.table::data.table(
        "x" = x_anchor,
        "y" = y_anchor),
      num.threads = 1L)$predictions
  }
  
  # Subtract the anchor value so the surface is zero at the anchor.
  return(value_in - anchor_value)
}



# Validates an anchor value against the computed feature values and updates
# it if needed.
#
# Numeric anchors are clamped to the computed range (with a warning); factor
# anchors must match a computed level; any other type must match one of the
# available values. Returns the (possibly clamped) anchor value.
.check_anchor_value <- function(x, x_anchor, name) {
  
  if (is.numeric(x)) {
    if (!is.numeric(x_anchor)) {
      stop(paste0("Anchor value of the ", name, " feature should be numeric."))
    }
    
    lower_bound <- min(x)
    upper_bound <- max(x)
    
    # Clamp out-of-range anchors to the nearest computed value, and warn.
    if (x_anchor < lower_bound || x_anchor > upper_bound) {
      requested_anchor <- x_anchor
      x_anchor <- if (x_anchor < lower_bound) lower_bound else upper_bound
      
      warning(paste0(
        "Anchor value (", requested_anchor, ") of the ", name,
        " feature lies outside computed range. ",
        "The nearest value (", x_anchor, ") is used instead."))
    }
    
  } else if (is.factor(x)) {
    # The anchor must be one of the computed levels.
    if (!x_anchor %in% levels(x)) {
      stop(paste0(
        "Anchor value (", x_anchor, ") of the ", name,
        " feature was not found among the computed levels (",
        paste_s(levels(x)), ")."))
    }
    
  } else if (!x_anchor %in% x) {
    # Any other type: the anchor must appear among the available values.
    stop(paste0(
      "Anchor value (", x_anchor, ") of the ", name,
      " feature was not found among the available values."))
  }
  
  return(x_anchor)
}



# export_ice_data (generic) ----------------------------------------------------

#'@title Extract and export individual conditional expectation data.
#'
#'@description Extract and export individual conditional expectation data.
#'
#'@inheritParams export_all
#'@inheritParams export_univariate_analysis_data
#'
#'@inheritDotParams extract_ice
#'@inheritDotParams as_familiar_collection
#'
#'@details Data is usually collected from a `familiarCollection` object.
#'  However, you can also provide one or more `familiarData` objects, that will
#'  be internally converted to a `familiarCollection` object. It is also
#'  possible to provide a `familiarEnsemble` or one or more `familiarModel`
#'  objects together with the data from which data is computed prior to export.
#'  Paths to the previous files can also be provided.
#'
#'  All parameters aside from `object` and `dir_path` are only used if `object`
#'  is not a `familiarCollection` object, or a path to one.
#'
#'@return A list of data.tables (if `dir_path` is not provided), or nothing, as
#'  all data is exported to `csv` files.
#'@exportMethod export_ice_data
#'@md
#'@rdname export_ice_data-methods
# Generic for extracting and exporting individual conditional expectation data.
setGeneric(
  name = "export_ice_data",
  def = function(
      object,
      dir_path = NULL,
      aggregate_results = TRUE,
      export_collection = FALSE,
      ...) {
    standardGeneric("export_ice_data")
  })



# export_ice_data (collection) -------------------------------------------------

#'@rdname export_ice_data-methods
setMethod(
  "export_ice_data",
  signature(object = "familiarCollection"),
  function(
    object,
    dir_path = NULL,
    aggregate_results = TRUE,
    export_collection = FALSE,
    ...) {
    
    # Ensure that the collection object is up to date with the current
    # version of familiar.
    object <- update_object(object = object)
    
    # Export individual conditional expectation data from the collection.
    .export(
      x = object,
      data_slot = "ice_data",
      dir_path = dir_path,
      aggregate_results = aggregate_results,
      type = "explanation",
      subtype = "ice",
      object_class = "familiarDataElementIndividualConditionalExpectation",
      export_collection = export_collection)
  }
)



# export_ice_data (general) ----------------------------------------------------

#'@rdname export_ice_data-methods
setMethod(
  "export_ice_data",
  signature(object = "ANY"),
  function(
    object,
    dir_path = NULL,
    aggregate_results = TRUE,
    export_collection = FALSE,
    ...) {
    
    # Coerce the input to a familiarCollection object, computing ICE data
    # where required. Additional arguments in ... are forwarded.
    conversion_args <- c(
      list(
        "object" = object,
        "data_element" = "ice_data",
        "aggregate_results" = aggregate_results),
      list(...))
    
    object <- do.call(as_familiar_collection, args = conversion_args)
    
    # Dispatch to the familiarCollection method for the actual export.
    export_args <- c(
      list(
        "object" = object,
        "dir_path" = dir_path,
        "aggregate_results" = aggregate_results,
        "export_collection" = export_collection),
      list(...))
    
    return(do.call(export_ice_data, args = export_args))
  }
)



# .export (familiarDataElementIndividualConditionalExpectation) ----------------
setMethod(
  ".export",
  signature(x = "familiarDataElementIndividualConditionalExpectation"),
  function(
    x,
    x_list,
    aggregate_results = FALSE,
    ...) {
    
    # Aggregate the data elements prior to export, if requested.
    if (aggregate_results) {
      x_list <- .compute_data_element_estimates(x_list)
    }
    
    # Exclude the feature_x and feature_y identifiers from merging: the values
    # of the x and y features may have different types (e.g. numeric and
    # factor), and merging on them would combine feature values incorrectly.
    merging_identifiers <- setdiff(
      names(x@identifiers),
      c("feature_x", "feature_y"))
    
    # Merge the data elements into a single data.table and return it.
    return(merge_data_elements(
      x = x_list,
      as_data = merging_identifiers,
      as_grouping_column = TRUE,
      force_data_table = TRUE))
  }
)



# export_partial_dependence_data (generic) -------------------------------------

#'@title Extract and export partial dependence data.
#'
#'@description Extract and export partial dependence data.
#'
#'@inheritParams export_all
#'@inheritParams export_univariate_analysis_data
#'
#'@inheritDotParams extract_ice
#'@inheritDotParams as_familiar_collection
#'
#'@details Data is usually collected from a `familiarCollection` object.
#'  However, you can also provide one or more `familiarData` objects, that will
#'  be internally converted to a `familiarCollection` object. It is also
#'  possible to provide a `familiarEnsemble` or one or more `familiarModel`
#'  objects together with the data from which data is computed prior to export.
#'  Paths to the previous files can also be provided.
#'
#'  All parameters aside from `object` and `dir_path` are only used if `object`
#'  is not a `familiarCollection` object, or a path to one.
#'
#'@return A list of data.tables (if `dir_path` is not provided), or nothing, as
#'  all data is exported to `csv` files.
#'@exportMethod export_partial_dependence_data
#'@md
#'@rdname export_partial_dependence_data-methods
# Generic for extracting and exporting partial dependence data.
setGeneric(
  name = "export_partial_dependence_data",
  def = function(
      object,
      dir_path = NULL,
      aggregate_results = TRUE,
      export_collection = FALSE,
      ...) {
    standardGeneric("export_partial_dependence_data")
  })



# export_partial_dependence_data (collection) ----------------------------------

#'@rdname export_partial_dependence_data-methods
setMethod(
  "export_partial_dependence_data",
  signature(object = "familiarCollection"),
  function(
    object,
    dir_path = NULL,
    aggregate_results = TRUE,
    export_collection = FALSE,
    ...) {
    
    # Ensure that the collection object is up to date with the current
    # version of familiar.
    object <- update_object(object = object)
    
    # Export partial dependence data from the collection.
    .export(
      x = object,
      data_slot = "ice_data",
      dir_path = dir_path,
      aggregate_results = aggregate_results,
      type = "explanation",
      subtype = "pd",
      object_class = "familiarDataElementPartialDependence",
      export_collection = export_collection)
  }
)



# export_partial_dependence_data (general) -------------------------------------

#'@rdname export_partial_dependence_data-methods
setMethod(
  "export_partial_dependence_data",
  signature(object = "ANY"),
  function(
    object,
    dir_path = NULL,
    aggregate_results = TRUE,
    export_collection = FALSE,
    ...) {
    
    # Coerce the input to a familiarCollection object, computing ICE/PD data
    # where required. Additional arguments in ... are forwarded.
    conversion_args <- c(
      list(
        "object" = object,
        "data_element" = "ice_data",
        "aggregate_results" = aggregate_results),
      list(...))
    
    object <- do.call(as_familiar_collection, args = conversion_args)
    
    # Dispatch to the familiarCollection method for the actual export.
    export_args <- c(
      list(
        "object" = object,
        "dir_path" = dir_path,
        "aggregate_results" = aggregate_results,
        "export_collection" = export_collection),
      list(...))
    
    return(do.call(export_partial_dependence_data, args = export_args))
  }
)


# .export (familiarDataElementPartialDependence) -------------------------------
setMethod(
  ".export",
  signature(x = "familiarDataElementPartialDependence"),
  function(
    x,
    x_list,
    aggregate_results = FALSE,
    ...) {
    
    # Aggregate the data elements prior to export, if requested.
    if (aggregate_results) {
      x_list <- .compute_data_element_estimates(x_list)
    }
    
    # Exclude the feature_x and feature_y identifiers from merging: the values
    # of the x and y features may have different types (e.g. numeric and
    # factor), and merging on them would combine feature values incorrectly.
    merging_identifiers <- setdiff(
      names(x@identifiers),
      c("feature_x", "feature_y"))
    
    # Merge the data elements into a single data.table and return it.
    return(merge_data_elements(
      x = x_list,
      as_data = merging_identifiers,
      as_grouping_column = TRUE,
      force_data_table = TRUE))
  }
)

# Try the familiar package in your browser
#
# Any scripts or data that you put into this service are public.
#
# familiar documentation built on Sept. 30, 2024, 9:18 a.m.