#' @title Get Feature Importances
#' @description This function extracts feature importances from a Scikit-learn pipeline
#' that has a trained classifier as the final step.
#' @param pipeline A Scikit-learn pipeline object with a trained classifier
#' as the final step.
#' @param X_train A DataFrame containing the training data.
#' @param pipeline_name A string (a name from the selected_pipelines list) identifying the pipeline that was constructed for feature selection.
#' @param iter An integer indicating the current iteration of the train-test split.
#' @return A dataframe containing the selected feature names and their importances, ranked by importance, or NULL
#' if the classifier has neither a coef_ nor a feature_importances_ attribute, or the feature selector
#' has neither the get_support() method nor the support_ attribute. Each row represents a feature, with columns for
#' feature names, importances, and ranks.
#' @importFrom reticulate py_has_attr py_to_r
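#' @examples
#' \dontrun{
#' # A minimal usage sketch (not run): assumes 'fitted_pipeline' is a fitted
#' # Scikit-learn pipeline with steps named 'feature_selector' and 'classifier',
#' # and 'X_train' is a pandas DataFrame created via reticulate.
#' imp <- get_feature_importances(fitted_pipeline, X_train,
#'                                pipeline_name = 'Lasso', iter = 1)
#' head(imp)
#' }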
get_feature_importances <- function(pipeline, X_train, pipeline_name, iter) {
  classifier <- pipeline$named_steps[['classifier']]
  # Extract importances from the fitted classifier: coef_ for linear models,
  # feature_importances_ for tree-based models
  if (reticulate::py_has_attr(classifier, "coef_")) {
    feature_importances <- classifier$coef_
    if (dim(feature_importances)[1] == 1) {
      feature_importances <- feature_importances[1, ]
    }
  } else if (reticulate::py_has_attr(classifier, "feature_importances_")) {
    feature_importances <- classifier$feature_importances_
  } else {
    message("Classifier doesn't have coef_ or feature_importances_ attributes")
    return(NULL)
  }
  feature_selector <- pipeline$named_steps[["feature_selector"]]
  original_feature_names <- colnames(reticulate::py_to_r(X_train))
  if (reticulate::py_has_attr(feature_selector, "get_support")) {
    selected_indices <- which(feature_selector$get_support())
  } else if (reticulate::py_has_attr(feature_selector, "support_")) {
    selected_indices <- which(feature_selector$support_)
  } else {
    message("Feature selector doesn't have get_support() or support_ attribute")
    return(NULL)
  }
  selected_feature_names <- original_feature_names[selected_indices]
  # The classifier is fitted on the selector's output, so its importances are
  # already aligned with the selected features
  importances <- data.frame(feature = selected_feature_names, importance = feature_importances)
  importances <- importances[order(-importances$importance), ]
  importances$rank <- seq_len(nrow(importances))
  column_name <- as.character(glue::glue('rank_{pipeline_name}_split_{iter}'))
  colnames(importances)[colnames(importances) == 'rank'] <- column_name
  return(importances)
}
#' @title Calculate Permutation Feature Importance
#' @description This function calculates permutation feature importance for a Scikit-learn
#' pipeline with a trained classifier as the final step.
#' @param pipeline A Scikit-learn pipeline object with a trained classifier as the final step.
#' @param X_train A DataFrame containing the training data.
#' @param y_train A DataFrame containing the training labels.
#' @param n_repeats An integer specifying the number of times to permute each feature.
#' @param random_state An integer specifying the seed for the random number generator.
#' @param njobs An integer specifying the number of cores to use; set by the main GeneSelectR function.
#' @param pipeline_name A string (a name from the selected_pipelines list) identifying the pipeline that was constructed for feature selection.
#' @param iter An integer indicating the current iteration of the train-test split.
#' @return A dataframe containing the feature names and their permutation importance scores, ranked by importance.
#' Each row represents a feature, with columns for feature names, importances, and ranks.
#' @importFrom reticulate import py_to_r
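#' @examples
#' \dontrun{
#' # A minimal usage sketch (not run): assumes 'fitted_pipeline' is a fitted
#' # Scikit-learn pipeline and 'X_train'/'y_train' are Python objects prepared
#' # via reticulate; 'boruta' is just an illustrative pipeline name.
#' perm_imp <- calculate_permutation_feature_importance(
#'   fitted_pipeline, X_train, y_train,
#'   n_repeats = 10L, random_state = 0L, njobs = 1L,
#'   pipeline_name = 'boruta', iter = 1
#' )
#' head(perm_imp)
#' }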
calculate_permutation_feature_importance <- function(
    pipeline,
    X_train,
    y_train,
    n_repeats = 10L,
    random_state = 0L,
    njobs = 1L,
    pipeline_name,
    iter) {
  # Import the required function
  permutation_importance <- reticulate::import("sklearn.inspection", convert = FALSE)$permutation_importance
  # Compute the permutation feature importance
  perm_importance <- permutation_importance(pipeline, X_train, y_train,
                                            n_repeats = n_repeats,
                                            random_state = random_state,
                                            n_jobs = njobs)
  # Extract the importances and feature names
  importances <- reticulate::py_to_r(perm_importance$importances_mean)
  feature_names <- colnames(reticulate::py_to_r(X_train))
  # Create a data frame
  importance_df <- data.frame(feature = feature_names, importance = importances)
  importance_df <- importance_df[order(-importance_df$importance), ]
  # Calculate the rank of the feature importances
  importance_df$rank <- seq_len(nrow(importance_df))
  # Get the pipeline name and append it to the rank column name
  column_name <- as.character(glue::glue('rank_{pipeline_name}_split_{iter}'))
  colnames(importance_df)[colnames(importance_df) == 'rank'] <- column_name
  return(importance_df)
}
#' @title Create Pipelines
#' @description This function creates a list of Scikit-learn pipelines using the specified feature selection methods, preprocessing steps, and classifier.
#' @param feature_selection_methods A list of feature selection methods to use for the pipelines.
#' @param preprocessing_steps A list of preprocessing steps to use for the pipelines.
#' @param selected_methods A vector of names of feature selection methods to use from the default set.
#' @param classifier A Scikit-learn classifier to use as the final step in the pipelines.
#' @param fs_param_grids A named list of parameter settings for the feature selection methods; names must match those in feature_selection_methods.
#' @return A list of Scikit-learn pipeline objects. Each pipeline is constructed based on the provided
#' feature selection methods, preprocessing steps, and classifier. The list is named by feature selection methods.
#' @importFrom reticulate import tuple py_bool
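#' @examples
#' \dontrun{
#' # A minimal sketch (not run), assuming scikit-learn is available via reticulate.
#' sklearn <- reticulate::import('sklearn')
#' fs_methods <- list('univariate' = sklearn$feature_selection$SelectKBest())
#' preproc <- list('scaler' = sklearn$preprocessing$StandardScaler())
#' clf <- sklearn$ensemble$RandomForestClassifier()
#' pipelines <- create_pipelines(fs_methods, preproc,
#'                               selected_methods = names(fs_methods),
#'                               classifier = clf,
#'                               fs_param_grids = list())
#' names(pipelines)
#' }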
create_pipelines <- function(feature_selection_methods, preprocessing_steps, selected_methods, classifier, fs_param_grids) {
  sklearn <- reticulate::import('sklearn')
  pipeline <- sklearn$pipeline$Pipeline
  named_pipelines <- list()
  # Build one pipeline per provided feature selection method
  selected_methods <- names(feature_selection_methods)
  for (feature_selector_name in selected_methods) {
    if (feature_selector_name %in% names(feature_selection_methods)) {
      feature_selector_method <- feature_selection_methods[[feature_selector_name]]
      base_model <- classifier
      steps <- c(preprocessing_steps, list("feature_selector" = feature_selector_method))
      steps <- c(steps, list("classifier" = base_model))
      tuple_steps <- steps_to_tuples(steps)
      # Add feature selection parameters to the pipeline if they are provided
      if (feature_selector_name %in% names(fs_param_grids)) {
        fs_params <- fs_param_grids[[feature_selector_name]]
        # Incorporate the parameters from fs_params into the appropriate estimator objects
        for (i in seq_along(tuple_steps)) {
          if (reticulate::py_bool(tuple_steps[[i]][[1]] == "feature_selector")) {
            tuple_steps[[i]][[2]] <- do.call(tuple_steps[[i]][[2]], fs_params)
          }
        }
      }
      named_pipelines[[feature_selector_name]] <- pipeline(steps = tuple_steps)
    } else {
      message("Warning: Feature selection method ", feature_selector_name, " not found.\n")
    }
  }
  return(named_pipelines)
}
#' @title Convert Steps to Tuples
#' @description This function converts a list of steps to tuples for use in a Scikit-learn pipeline.
#' @param steps A list of steps to convert to tuples.
#' @return A list of tuples, where each tuple represents a step in a Scikit-learn pipeline.
#' The tuple contains the name of the step and the corresponding step object.
#' @importFrom reticulate tuple
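#' @examples
#' \dontrun{
#' # A minimal sketch (not run), assuming scikit-learn is available via reticulate.
#' sklearn <- reticulate::import('sklearn')
#' steps <- list('scaler' = sklearn$preprocessing$StandardScaler(),
#'               'classifier' = sklearn$linear_model$LogisticRegression())
#' steps_to_tuples(steps)
#' }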
steps_to_tuples <- function(steps) {
  tuple_steps <- c()
  for (step_name in names(steps)) {
    step_obj <- steps[[step_name]]
    tuple_steps <- c(tuple_steps, reticulate::tuple(step_name, step_obj))
  }
  return(tuple_steps)
}
#' Aggregate Feature Importances
#'
#' This function aggregates the feature importances for each method across all splits.
#'
#' @param selected_features A list of selected features. Each element of the list represents a split and should be a named list where the names are the methods and the values are data frames containing the feature importances for that method in that split.
#'
#' @return A list containing aggregated feature importances for each feature selection method.
#' Each element in the list is a dataframe with mean and standard deviation of the feature importances
#' for a particular method across all splits. The dataframe includes columns for feature names, mean importances,
#' standard deviations, and ranks.
#' @importFrom magrittr %>%
#' @importFrom dplyr group_by summarize filter
#' @importFrom stats sd
#' @importFrom rlang .data
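#' @examples
#' \dontrun{
#' # A minimal sketch (not run): 'selected_features' is assumed to be a list with
#' # one element per train-test split, where each element is a named list of
#' # data frames as returned by get_feature_importances() for every pipeline.
#' aggregated <- aggregate_feature_importances(selected_features)
#' head(aggregated[['univariate']])
#' }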
aggregate_feature_importances <- function(selected_features) {
  aggregated_importances <- list()
  for (method in names(selected_features[[1]])) {
    feature_importances <- lapply(selected_features, function(split) {
      # Reshape the data from wide format to long format
      split_df <- tidyr::pivot_longer(split[[method]],
                                      cols = dplyr::starts_with("rank_"),
                                      names_to = "method",
                                      values_to = "rank")
      as.data.frame(split_df)
    })
    combined_importances <- do.call(rbind, feature_importances)
    importances_df <- combined_importances %>%
      dplyr::group_by(.data$feature) %>%
      dplyr::summarize(mean_importance = mean(.data$importance, na.rm = TRUE),
                       std = stats::sd(.data$importance, na.rm = TRUE))
    # Add rank columns back
    rank_df <- combined_importances %>%
      dplyr::select(.data$feature, .data$method, .data$rank)
    # Join importances_df with rank_df
    importances_df <- dplyr::left_join(importances_df, rank_df, by = "feature")
    # Reshape back to wide format
    importances_df <- tidyr::pivot_wider(importances_df,
                                         names_from = "method",
                                         values_from = "rank")
    importances_df <- importances_df %>%
      dplyr::filter(.data$mean_importance > 0)
    # Add the aggregated importances for the current method to the results list
    aggregated_importances[[method]] <- importances_df
  }
  return(aggregated_importances)
}