R/FamiliarDataComputationFeatureSimilarity.R

Defines functions .create_feature_similarity_cluster_method_object .compute_feature_similarity_cluster_table .append_feature_similarity_clustering .append_feature_similarity_clusters .append_feature_similarity_dendrogram ..extract_feature_similarity .extract_feature_similarity

#' @include FamiliarS4Generics.R
#' @include FamiliarS4Classes.R
NULL

# familiarDataElementFeatureSimilarity object ----------------------------------
setClass(
  "familiarDataElementFeatureSimilarity",
  contains = "familiarDataElement",
  slots = list(
    "similarity_metric" = "character",
    "cluster_method" = "character",
    "linkage_method" = "character",
    "cluster_cut_method" = "character",
    "similarity_threshold" = "ANY",
    "dendrogram" = "ANY"),
  prototype = methods::prototype(
    detail_level = "ensemble",
    similarity_metric = NA_character_,
    cluster_method = NA_character_,
    linkage_method = NA_character_,
    cluster_cut_method = NA_character_,
    similarity_threshold = NULL,
    dendrogram = NULL,
    value_column = "value",
    grouping_column = c("feature_name_1", "feature_name_2")))


# extract_feature_similarity (generic) -----------------------------------------

#'@title Internal function to extract the feature distance table.
#'
#'@description Computes and extracts the feature distance table for features
#'  used in a `familiarEnsemble` object. This table can be used to cluster
#'  features, and is exported directly by `export_feature_similarity`.
#'
#'@inheritParams extract_data
#'
#'@return A data.table containing pairwise distance between features. This data
#'  is only the upper triangular of the complete matrix (i.e. the sparse
#'  unitriangular representation). Diagonals will always be 0.0 and the lower
#'  triangular is mirrored.
#'@md
#'@keywords internal
setGeneric(
  "extract_feature_similarity",
  function(
    object,
    data,
    cl = NULL,
    estimation_type = waiver(),
    aggregate_results = waiver(),
    confidence_level = waiver(),
    bootstrap_ci_method = waiver(),
    is_pre_processed = FALSE,
    feature_cluster_method = waiver(),
    feature_linkage_method = waiver(),
    feature_cluster_cut_method = waiver(),
    feature_similarity_threshold = waiver(),
    feature_similarity_metric = waiver(),
    verbose = FALSE,
    message_indent = 0L,
    ...) {
    standardGeneric("extract_feature_similarity")
  }
)


# extract_feature_similarity (familiarEnsemble) --------------------------------
setMethod(
  "extract_feature_similarity",
  signature(object = "familiarEnsemble"),
  function(
    object,
    data,
    cl = NULL,
    estimation_type = waiver(),
    aggregate_results = waiver(),
    confidence_level = waiver(),
    bootstrap_ci_method = waiver(),
    is_pre_processed = FALSE,
    feature_cluster_method = waiver(),
    feature_linkage_method = waiver(),
    feature_cluster_cut_method = waiver(),
    feature_similarity_threshold = waiver(),
    feature_similarity_metric = waiver(),
    verbose = FALSE,
    message_indent = 0L,
    ...) {
    
    # Message extraction start
    logger_message(
      paste0("Computing pairwise similarity between features."),
      indent = message_indent,
      verbose = verbose)
    
    # Obtain cluster method from stored settings, if required.
    if (is.waive(feature_cluster_method)) {
      feature_cluster_method <- object@settings$feature_cluster_method
    }
    
    # Obtain linkage function from stored settings, if required.
    if (is.waive(feature_linkage_method)) {
      feature_linkage_method <- object@settings$feature_linkage_method
    } 
    
    # Obtain feature cluster cut method from stored settings, if required.
    if (is.waive(feature_cluster_cut_method)) {
      feature_cluster_cut_method <- object@settings$feature_cluster_cut_method
    } 
    
    # Obtain cluster similarity threshold from stored settings, if required.
    if (is.waive(feature_similarity_threshold)) {
      feature_similarity_threshold <- object@settings$feature_similarity_threshold
    }
    
    # Obtain similarity metric from stored settings, if required.
    if (is.waive(feature_similarity_metric)) {
      feature_similarity_metric <- object@settings$feature_similarity_metric
    }
    
    # Replace feature cluster method == "none" with "hclust"
    if (feature_cluster_method == "none") {
      feature_cluster_method <- "hclust"
    } 
    
    .check_cluster_parameters(
      cluster_method = feature_cluster_method,
      cluster_linkage = feature_linkage_method,
      cluster_cut_method = feature_cluster_cut_method,
      cluster_similarity_threshold = feature_similarity_threshold,
      cluster_similarity_metric = feature_similarity_metric,
      data_type = "feature")
    
    # Obtain confidence level from the settings file stored with the
    # familiarEnsemble object.
    if (is.waive(confidence_level)) {
      confidence_level <- object@settings$confidence_level
    }
    
    # Check alpha
    .check_number_in_valid_range(
      x = confidence_level,
      var_name = "confidence_level",
      range = c(0.0, 1.0),
      closed = c(FALSE, FALSE))
    
    # Load the bootstrap method
    if (is.waive(bootstrap_ci_method)) {
      bootstrap_ci_method <- object@settings$bootstrap_ci_method
    }
    
    .check_parameter_value_is_valid(
      x = bootstrap_ci_method,
      var_name = "bootstrap_ci_methpd",
      values = .get_available_bootstrap_confidence_interval_methods())
    
    # Check the estimation type.
    estimation_type <- .parse_estimation_type(
      x = estimation_type,
      object = object,
      default = "point",
      data_element = "feature_similarity",
      detail_level = "ensemble",
      has_internal_bootstrap = TRUE)
    
    # Check whether results should be aggregated.
    aggregate_results <- .parse_aggregate_results(
      x = aggregate_results,
      object = object,
      default = TRUE,
      data_element = "feature_similarity")
    
    # Generate a prototype data element.
    proto_data_element <- new(
      "familiarDataElementFeatureSimilarity",
      estimation_type = estimation_type,
      confidence_level = confidence_level,
      bootstrap_ci_method = bootstrap_ci_method,
      similarity_metric = feature_similarity_metric,
      cluster_method = feature_cluster_method,
      linkage_method = feature_linkage_method,
      cluster_cut_method = feature_cluster_cut_method,
      similarity_threshold = feature_similarity_threshold)
    
    # Generate elements to send to dispatch.
    similarity_data <- extract_dispatcher(
      FUN = .extract_feature_similarity,
      has_internal_bootstrap = TRUE,
      cl = cl,
      object = object,
      data = data,
      proto_data_element = proto_data_element,
      is_pre_processed = is_pre_processed,
      aggregate_results = aggregate_results,
      message_indent = message_indent + 1L,
      verbose = verbose)
    
    return(similarity_data)
  }
)



.extract_feature_similarity <- function(
    object,
    data,
    proto_data_element,
    cl = NULL,
    is_pre_processed,
    progress_bar = FALSE,
    aggregate_results,
    n_bootstraps,
    message_indent = 0L,
    verbose = FALSE,
    ...) {
  
  # Add the name of the ensemble model
  data_element <- add_model_name(data = proto_data_element, object = object)
  
  # Retrieve input data.
  data <- process_input_data(
    object = object,
    data = data,
    stop_at = "imputation",
    is_pre_processed = is_pre_processed)
  
  # Check if the input data is not empty
  if (is_empty(data)) return(NULL)
  
  # Check if the number of samples is sufficient (>5), and return an empty table
  # if not.
  if (data.table::uniqueN(
    data@data,
    by = get_id_columns(id_depth = "series")) <= 5) {
    return(data_element)
  }
  
  # Maintain only important features. The current set is based on the required
  # features.
  data <- filter_features(
    data = data,
    available_features = object@model_features)
  
  # Identify eligible columns.
  feature_columns <- get_feature_columns(x = data)
  
  # Break if there are not at least 2 features present between which correlation
  # can be compared.
  if (length(feature_columns) < 2) return(data_element)
  
  # Add bootstrap data.
  bootstrap_data <- add_data_element_bootstrap(
    x = data_element,
    n_bootstraps = n_bootstraps,
    ...)
  
  # Iterate over elements.
  if (n_bootstraps > 1) {
    data_elements <- fam_mapply(
      cl = cl,
      assign = NULL,
      FUN = ..extract_feature_similarity,
      data_element = bootstrap_data$data_element,
      bootstrap = bootstrap_data$bootstrap,
      bootstrap_seed = bootstrap_data$seed,
      MoreArgs = list(
        "data" = data,
        "feature_info_list" = object@feature_info),
      progress_bar = progress_bar,
      chopchop = TRUE)
    
  } else {
    data_elements <- fam_mapply(
      cl = NULL,
      assign = NULL,
      FUN = ..extract_feature_similarity,
      data_element = bootstrap_data$data_element,
      bootstrap = bootstrap_data$bootstrap,
      bootstrap_seed = bootstrap_data$seed,
      MoreArgs = list(
        "data" = data,
        "feature_info_list" = object@feature_info,
        "cl" = cl,
        "verbose" = verbose,
        "message_indent" = message_indent),
      progress_bar = FALSE)
    
  }
  
  # Merge data elements
  data_elements <- merge_data_elements(data_elements)
  
  if (aggregate_results) data_elements <- .compute_data_element_estimates(x = data_elements)
  
  return(data_elements)
}



..extract_feature_similarity <- function(
    cl = NULL,
    data_element,
    data,
    feature_info_list,
    bootstrap,
    bootstrap_seed,
    message_indent = 0L,
    verbose = FALSE) {
  
  # Bootstrap the data.
  if (bootstrap) {
    data <- get_bootstrap_sample(
      data = data,
      seed = bootstrap_seed)
  }
  
  # Check if the number of samples is sufficient (>5), and return an
  # empty table if not.
  if (data.table::uniqueN(
    data@data, 
    by = get_id_columns(id_depth = "series")) <= 5) {
    return(NULL)
  }
  
  # Identify eligible columns.
  feature_columns <- get_feature_columns(x = data)
  
  # Compute the similarity table
  data_element@data <- set_similarity_table(
    data = data,
    feature_info_list = feature_info_list[feature_columns],
    similarity_metric = data_element@similarity_metric,
    data_type = "feature",
    cl = cl,
    message_indent = message_indent + 1L,
    verbose = verbose)
  
  return(data_element)
}            



.append_feature_similarity_dendrogram <- function(x) {
  
  if (is_empty(x)) return(x)
  
  # Create a cluster method object using data stored in x.
  cluster_method_object <- .create_feature_similarity_cluster_method_object(x = x)
  
  if (is.null(cluster_method_object)) return(x)
  
  # Create the cluster object.
  object <- apply_cluster_method(object = cluster_method_object)
  
  # Attach to data element.
  x@dendrogram <- object@object
  
  return(x)
}



.append_feature_similarity_clusters <- function(x) {
  
  if (is_empty(x)) return(x)
  
  # Generate the clustering table.
  cluster_table <- .compute_feature_similarity_cluster_table(x = x)
  
  # Check for empty cluster tables.
  if (is_empty(cluster_table)) return(x)
  
  # Keep only name and cluster_id columns
  cluster_table <- cluster_table[, mget(c("name", "cluster_id"))]
  
  # Compute cluster size
  cluster_table[, "cluster_size" := .N, by = "cluster_id"]
  
  # Rename name column to 
  data.table::setnames(
    x = cluster_table,
    old = "name",
    new = "feature")
  
  # Set cluster info as data.
  x@data <- cluster_table
  
  # Reset value and grouping columns.
  x@value_column <- c("cluster_id", "cluster_size")
  x@grouping_column <- "feature"
  
  return(x)
}



.append_feature_similarity_clustering <- function(x) {
  
  if (is_empty(x)) return(x)
  
  # Generate the clustering table.
  cluster_table <- .compute_feature_similarity_cluster_table(x = x)
  
  # Check for empty cluster tables.
  if (is_empty(cluster_table)) return(x)
  
  # Keep only name and label_order columns,
  cluster_table <- cluster_table[, mget(c("name", "label_order"))]
  
  # Merge ordering into feature_similarity_table. The table is first
  # merged on feature_name_1 and then on feature_name_2.
  mutual_correlation_table <- data.table::copy(x@data)
  mutual_correlation_table <- merge(
    x = mutual_correlation_table,
    y = cluster_table,
    by.x = "feature_name_1",
    by.y = "name",
    all.x = TRUE,
    all.y = FALSE)
  
  mutual_correlation_table <- merge(
    x = mutual_correlation_table,
    y = cluster_table,
    by.x = "feature_name_2",
    by.y = "name",
    all.x = TRUE,
    all.y = FALSE)
  
  # Rename columns
  data.table::setnames(
    x = mutual_correlation_table,
    old = c("label_order.x", "label_order.y"),
    new = c("label_order_1", "label_order_2"))
  
  # Add to data element.
  x@data <- mutual_correlation_table
  
  # Add grouping column.
  x@grouping_column <- c(x@grouping_column, "label_order_1", "label_order_2")
  
  return(x)
}



.compute_feature_similarity_cluster_table <- function(x) {
  # Computes the feature similarity cluster table from the similarity table in
  # x.
  
  # Create a cluster method object using data stored in x.
  cluster_method_object <- .create_feature_similarity_cluster_method_object(x = x)
  
  if (is.null(x)) return(NULL)
  
  # Compute the cluster table.
  cluster_table <- create_clusters(
    object = cluster_method_object,
    as_cluster_object = FALSE)
  
  return(cluster_table)
}



.create_feature_similarity_cluster_method_object <- function(x) {
  
  if (is_empty(x)) return(NULL)
  
  if (length(x@similarity_threshold) > 1) {
    # Remove 1.0 because that does not yield clustering info.
    available_thresholds <- setdiff(x@similarity_threshold, 1.0)
    
    # Select the maximum threshold.
    x@similarity_threshold <- max(available_thresholds)
  }
  
  # Create cluster method object.
  cluster_method_object <- create_cluster_method_object(
    cluster_method = x@cluster_method,
    data_type = "feature",
    cluster_linkage = x@linkage_method,
    cluster_cut_method = x@cluster_cut_method,
    cluster_similarity_threshold = x@similarity_threshold,
    cluster_similarity_metric = x@similarity_metric,
    cluster_representation_method = "none")
  
  # Attach the similarity table to the cluster_method_object.
  cluster_method_object@similarity_table <- methods::new(
    "similarityTable",
    data = x@data[, mget(c("feature_name_1", "feature_name_2", "value"))],
    similarity_metric = x@similarity_metric,
    data_type = cluster_method_object@data_type)
  
  return(cluster_method_object)
}



# export_feature_similarity (generic) ------------------------------------------

#'@title Extract and export mutual correlation between features.
#'
#'@description Extract and export mutual correlation between features in a
#'  familiarCollection.
#'
#'@param export_dendrogram Add dendrogram in the data element objects.
#'@param export_ordered_data Add feature label ordering to data in the data
#'  element objects.
#'@param export_clustering Add clustering information to data.
#'
#'@inheritParams export_all
#'@inheritParams extract_data
#'@inheritParams plot_univariate_importance
#'
#'@inheritDotParams as_familiar_collection
#'
#'@details Data is usually collected from a `familiarCollection` object.
#'  However, you can also provide one or more `familiarData` objects, that will
#'  be internally converted to a `familiarCollection` object. It is also
#'  possible to provide a `familiarEnsemble` or one or more `familiarModel`
#'  objects together with the data from which data is computed prior to export.
#'  Paths to the previous files can also be provided.
#'
#'  All parameters aside from `object` and `dir_path` are only used if `object`
#'  is not a `familiarCollection` object, or a path to one.
#'
#'@return A list containing a data.table (if `dir_path` is not provided), or
#'  nothing, as all data is exported to `csv` files.
#'@exportMethod export_feature_similarity
#'@md
#'@rdname export_feature_similarity-methods
setGeneric(
  "export_feature_similarity",
  function(
    object,
    dir_path = NULL,
    aggregate_results = TRUE,
    feature_cluster_method = waiver(),
    feature_linkage_method = waiver(),
    feature_cluster_cut_method = waiver(),
    feature_similarity_threshold = waiver(),
    export_dendrogram = FALSE,
    export_ordered_data = FALSE,
    export_clustering = FALSE,
    export_collection = FALSE,
    ...) {
    standardGeneric("export_feature_similarity")
  } 
)

# export_feature_similarity (collection) ---------------------------------------

#'@rdname export_feature_similarity-methods
setMethod(
  "export_feature_similarity",
  signature(object = "familiarCollection"),
  function(
    object,
    dir_path = NULL,
    aggregate_results = TRUE,
    feature_cluster_method = waiver(),
    feature_linkage_method = waiver(),
    feature_cluster_cut_method = waiver(),
    feature_similarity_threshold = waiver(),
    export_dendrogram = FALSE,
    export_ordered_data = FALSE,
    export_clustering = FALSE,
    export_collection = FALSE,
    ...) {
    
    # Make sure the collection object is updated.
    object <- update_object(object = object)
    
    # Extract data.
    x <- object@feature_similarity
    
    if (export_ordered_data && export_clustering) {
      stop("Cannot simultaneously export cluster information and ordering of features.")
    }
    
    # Check that the data are not empty.
    if (is_empty(x)) return(NULL)
    
    if (!is.waive(feature_cluster_method)) {
      
      # Set clustering method.
      x <- lapply(
        x,
        function(x, feature_cluster_method) {
          x@cluster_method <- feature_cluster_method
          return(x)
        },
        feature_cluster_method = feature_cluster_method)
    }
    
    if (!is.waive(feature_linkage_method)) {
      
      # Set linkage method.
      x <- lapply(
        x,
        function(x, feature_linkage_method) {
          x@linkage_method <- feature_linkage_method
          return(x)
        },
        feature_linkage_method = feature_linkage_method)
    }
    
    if (!is.waive(feature_cluster_cut_method)) {
      
      # Set cut method
      x <- lapply(
        x,
        function(x, feature_cluster_cut_method) {
          x@cluster_cut_method <- feature_cluster_cut_method
          return(x)
        },
        feature_cluster_cut_method = feature_cluster_cut_method)
    }
    
    if (!is.waive(feature_similarity_threshold)) {
      
      # Set similarity threshold.
      x <- lapply(
        x,
        function(x, feature_similarity_threshold) {
          x@similarity_threshold <- feature_similarity_threshold
          return(x)
        },
        feature_similarity_threshold = feature_similarity_threshold)
    }
    
    # Check whether the input parameters are valid and create a cluster object.
    .check_cluster_parameters(
      cluster_method = x[[1]]@cluster_method,
      data_type = "feature",
      cluster_linkage = x[[1]]@linkage_method,
      cluster_cut_method = x[[1]]@cluster_cut_method,
      cluster_similarity_threshold = x[[1]]@similarity_threshold,
      cluster_similarity_metric = x[[1]]@similarity_metric,
      cluster_representation_method = "none")
    
    if (aggregate_results ||
        export_dendrogram ||
        export_ordered_data ||
        export_clustering) {
      x <- .compute_data_element_estimates(x)
      
      if (export_dendrogram || export_ordered_data || export_clustering) {
        # Add dendrogram and other cluster objects.
        x <- lapply(x, .append_feature_similarity_dendrogram)
      }
      
      if (export_clustering) {
        x <- lapply(x, .append_feature_similarity_clusters)
      }
      
      if (export_ordered_data) {
        # Add clustering information.
        x <- lapply(x, .append_feature_similarity_clustering)
      }
    }
    
    return(.export(
      x = object,
      data_elements = x,
      dir_path = dir_path,
      aggregate_results = aggregate_results,
      type = "feature_similarity",
      subtype = x[[1]]@similarity_metric,
      export_dendrogram = export_dendrogram,
      export_ordered_data = export_ordered_data,
      export_collection = export_collection))
  }
)



# export_feature_similarity (general) ------------------------------------------

#'@rdname export_feature_similarity-methods
setMethod(
  "export_feature_similarity",
  signature(object = "ANY"),
  function(
    object,
    dir_path = NULL,
    aggregate_results = TRUE,
    feature_cluster_method = waiver(),
    feature_linkage_method = waiver(),
    feature_cluster_cut_method = waiver(),
    feature_similarity_threshold = waiver(),
    export_collection = FALSE,
    ...) {
    
    # Attempt conversion to familiarCollection object.
    object <- do.call(
      as_familiar_collection,
      args = c(
        list(
          "object" = object,
          "data_element" = "feature_similarity",
          "aggregate_results" = aggregate_results,
          "feature_cluster_method" = feature_cluster_method,
          "feature_linkage_method" = feature_linkage_method,
          "feature_cluster_cut_method" = feature_cluster_cut_method,
          "feature_similarity_threshold" = feature_similarity_threshold),
        list(...)))
    
    return(do.call(
      export_feature_similarity,
      args = c(
        list(
          "object" = object,
          "dir_path" = dir_path,
          "aggregate_results" = aggregate_results,
          "feature_cluster_method" = feature_cluster_method,
          "feature_linkage_method" = feature_linkage_method,
          "feature_cluster_cut_method" = feature_cluster_cut_method,
          "feature_similarity_threshold" = feature_similarity_threshold,
          "export_collection" = export_collection),
        list(...))))
  }
)



#.export (familiarDataElementFeatureSimilarity)---------------------------------
setMethod(
  ".export",
  signature(x = "familiarDataElementFeatureSimilarity"),
  function(
    x,
    x_list,
    aggregate_results = FALSE,
    export_dendrogram,
    export_ordered_data, 
    ...) {
    # This is like .export,familiarDataElement, but the elements are merged
    # prior to computing estimates.
    
    # Only merge if dendrograms are missing for all entries.
    if (!export_dendrogram && !export_ordered_data) {
      # Merge data elements.
      x <- merge_data_elements(
        x = x_list,
        as_data = "all",
        as_grouping_column = TRUE,
        force_data_table = TRUE)
      
    } else {
      x <- x_list
    }
    
    if (aggregate_results) {
      x <- .compute_data_element_estimates(x)
    }
    
    return(x)
  }
)

Try the familiar package in your browser

Any scripts or data that you put into this service are public.

familiar documentation built on Sept. 30, 2024, 9:18 a.m.