#' Calculate Quantiles of Feature Importance
#'
#' This function computes the mean, lower, and upper quantiles of permuted feature importance scores
#' and compares them with the observed importance scores from the true values. It is used to
#' assess the significance of feature importances from random forest models by comparing them
#' against a distribution of importances obtained through permutation.
#'
#' @param truevalues A data frame containing true feature importance data.
#' It should have columns: feature_rank, feature_importance, and log_feature_importance.
#' This is typically obtained from the `calculate_feature_importances` function. (It is feat_importances$true_importances).
#' @param permutedvalues A data frame containing permuted feature importance data.
#' It should have the same structure as `truevalues` and is typically obtained from the `calculate_feature_importances` function. (It is feat_importances$permuted_importances).
#' @param alpha Numeric value representing the significance level used to calculate the upper and lower quantiles of the permuted importances.
#' It must be between 0 and 1, exclusive. Default is 0.05.
#' @return A tibble/data frame with columns for feature rank, mean permuted importance, lower and upper quantile bounds,
#' observed true importance, and their respective logarithmic transformations.
#' This data frame provides a comprehensive view of where true importances lie in relation to the distribution of permuted importances.
#' @import dplyr
#' @importFrom stats quantile
#' @importFrom stats aggregate
#' @export
#' @examples
#' quantile_results <- calculate_quantiles(feat_importances$true_importances, feat_importances$permuted_importances, alpha=0.05)
#'
calculate_quantiles<-function(truevalues,permutedvalues,alpha=0.05){
if (!is.data.frame(truevalues) || !is.data.frame(permutedvalues)) {
stop("Inputs must be data frames.")
}
if (alpha <= 0 || alpha >= 1) {
stop("Alpha must be between 0 and 1.")
}
# Calculate the mean, lower and upper quantiles for permuted values
dm <- permutedvalues %>% group_by(feature_rank) %>%
summarise(mean = mean(feature_importance),
lower = stats::quantile(feature_importance, prob=alpha),
upper = stats::quantile(feature_importance, prob=1-alpha)) %>%
mutate(observed = truevalues$feature_importance) %>%
mutate(logmean=log(mean)) %>%
mutate(logupper=log(upper)) %>%
mutate(loglower=log(lower)) %>%
mutate(logobserved=log(observed))
return(dm)
}
#' Calculate Proportion-Based P-Value for Entire Feature Set
#'
#' This function calculates the proportion-based p-value for the entire feature set by comparing
#' the observed sum of absolute deviations from the mean feature importance to those from permuted data.
#' It is particularly useful in permutation tests to assess the statistical significance of the observed data.
#'
#' @param permutedvalues A data frame containing the permuted feature importances.
#' It should include columns for feature importance and permutation number.
#' Typically, this data frame is generated by a function that performs permutation tests.
#' @param quantiledata A data frame containing quantile information of feature importances.
#' This data frame should include columns for feature rank, mean permuted importance, observed true importance,
#' and optionally their logarithmic transformations. This is usually output from `calculateQuantiles` function.
#' @return A data frame with a single column "p_val_set".
#' This column contains the calculated p-value, which quantifies the probability of observing a sum of absolute deviations
#' as extreme as the one observed, under the null hypothesis. If the p-value is extremely low (below the resolution of the
#' number of permutations), it is reported as less than the reciprocal of the number of permutations.
#' @import dplyr
#' @examples
#' pvalue_set <- calculate_full_set_pvalue(feat_importances$permuted_importances, quantile_data)
#' @export
calculate_full_set_pvalue <- function(permutedvalues, quantiledata) {
d <- permutedvalues
numberofpermutations <- max(d$permutation)
# Calculate mean for each permuted set
d <- d %>%
ungroup() %>%
mutate(permuted_mean = rep(quantiledata$mean, times = numberofpermutations))
# Calculate deviations
d <- d %>%
dplyr::mutate(Mean = rep(quantiledata$mean, times = numberofpermutations)) %>%
dplyr::mutate(permuted_std_dev = feature_importance - permuted_mean)
# Sum of absolute deviations for permuted sets
pi_permuted <- d %>%
group_by(permutation) %>%
dplyr::summarise(sum_abs_deviations = sum(abs(permuted_std_dev)))
# Sum of absolute deviations for observed data
pi_obs <- sum(abs(quantiledata$observed - quantiledata$mean))
# Calculate p-value
pvalpi <- mean(pi_permuted$sum_abs_deviations >= pi_obs)
# Handle zero p-values
if (pvalpi == 0) {
pvalpi <- paste0("< ", 1 / numberofpermutations)
}
outdat <- data.frame("p_val_set" = pvalpi)
return(outdat)
}
#' Calculate p-values for Each Feature Rank
#'
#' This function calculates p-values for each feature rank in the dataset by comparing
#' the observed feature importances against the distribution of importances in the permuted data.
#' It returns ranks and their respective p-values and proportions up to a specified alpha threshold.
#' Note: P-values returned as 0 actually represent p-values less than the smallest detectable limit.
#' With 1,000 permutations, the smallest detectable p-value is 0.001 (1/n_permutations). Therefore,
#' a reported p-value of 0 should be interpreted as p < 0.001.
#' @param truevalues A data frame containing true feature importances and ranks.
#' @param permutedvalues A data frame containing permuted feature importances and ranks.
#' @param alpha Significance level for determining the cutoff in the one-tailed test (default 0.05).
#' @return A data frame with feature ranks, feature name (name of the feature at that rank in true model set), feature importance, log importance, proportion of observations from permuted set with importances greater than the true set importance value for that rank, and proportion calculated p-values for each rank.
#' Includes ranks up to the first one where the p-value exceeds the alpha threshold. Results sorted by Rank.
#' @import dplyr
#' @importFrom stats aggregate
#' @examples
#' pvalues_ranks <- calculate_ranked_based_pvalues(feat_importances$true_importances, feat_importances$permuted_importances, alpha=0.05)
#' @export
calculate_ranked_based_pvalues <- function(truevalues, permutedvalues, alpha = 0.05) {
# Error checks
if (!is.data.frame(truevalues) || !is.data.frame(permutedvalues)) {
stop("truevalues and permutedvalues must be data frames.")
}
if (!all(c("feature_importance", "feature_rank") %in% colnames(truevalues)) ||
!all(c("feature_importance", "feature_rank") %in% colnames(permutedvalues))) {
stop("Both truevalues and permutedvalues must contain 'feature_importance' and 'feature_rank' columns.")
}
if (!is.numeric(alpha) || alpha <= 0 || alpha >= 1) {
stop("alpha must be a numeric value between 0 and 1.")
}
numberofpermutations <- max(permutedvalues$permutation)
# Aggregate permuted feature importances
fweights <- stats::aggregate(feature_importance ~ feature_rank, permutedvalues, c)
# Calculate observed mean and proportions
fweights <- fweights %>%
mutate(observedmean = truevalues$feature_importance[match(feature_rank, truevalues$feature_rank)]) %>%
rowwise() %>%
dplyr::mutate(
proportion = if_else(observedmean > 0, sum(feature_importance > observedmean, na.rm = TRUE), NA_integer_),
pvalue = if_else(is.na(proportion), NA_real_, proportion / numberofpermutations)
)
# Prepare final data frame
truevalues$permutation <- NULL
fweightPvals <- dplyr::select(fweights, feature_rank, proportion, pvalue)
fweightPvals <- left_join(truevalues, fweightPvals, by = "feature_rank")
# Apply alpha cutoff
if(any(fweightPvals$pvalue >= alpha, na.rm = TRUE)) {
cutoff_index <- which(fweightPvals$pvalue >= alpha, arr.ind = TRUE)[1]
fweightPvals <- fweightPvals[1:(cutoff_index - 1), ]
}
# P-value interpretation warning
message("Note: P-values returned as 0 actually represent p-values less than the smallest detectable limit.\n",
"With 1,000 permutations, the smallest detectable p-value is 0.001 (1/n_permutations). Therefore, ",
"a reported p-value of 0 should be interpreted as p < 0.001.")
return(fweightPvals)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.