R/calculatePvalues.R

Defines functions calculate_ranked_based_pvalues calculate_full_set_pvalue calculate_quantiles

Documented in calculate_full_set_pvalue calculate_quantiles calculate_ranked_based_pvalues

#' Calculate Quantiles of Feature Importance
#'
#' This function computes the mean, lower, and upper quantiles of permuted feature importance scores
#' and compares them with the observed importance scores from the true values. It is used to
#' assess the significance of feature importances from random forest models by comparing them
#' against a distribution of importances obtained through permutation.
#'
#' @param truevalues A data frame containing true feature importance data.
#'   It should have columns: feature_rank, feature_importance, and log_feature_importance.
#'   This is typically obtained from the `calculate_feature_importances` function. (It is feat_importances$true_importances).
#' @param permutedvalues A data frame containing permuted feature importance data.
#'   It should have the same structure as `truevalues` and is typically obtained from the `calculate_feature_importances` function. (It is feat_importances$permuted_importances).
#' @param alpha Numeric value representing the significance level used to calculate the upper and lower quantiles of the permuted importances.
#'   It must be between 0 and 1, exclusive. Default is 0.05.
#' @return A tibble/data frame with columns for feature rank, mean permuted importance, lower and upper quantile bounds,
#'   observed true importance, and their respective logarithmic transformations.
#'   This data frame provides a comprehensive view of where true importances lie in relation to the distribution of permuted importances.
#' @import dplyr
#' @importFrom stats quantile
#' @importFrom stats aggregate
#' @export
#' @examples
#' quantile_results <- calculate_quantiles(feat_importances$true_importances, feat_importances$permuted_importances, alpha=0.05)
#'
calculate_quantiles<-function(truevalues,permutedvalues,alpha=0.05){
  if (!is.data.frame(truevalues) || !is.data.frame(permutedvalues)) {
    stop("Inputs must be data frames.")
  }

  if (alpha <= 0 || alpha >= 1) {
    stop("Alpha must be between 0 and 1.")
  }

  # Calculate the mean, lower and upper quantiles for permuted values
  dm <- permutedvalues %>% group_by(feature_rank) %>%
  summarise(mean = mean(feature_importance),
            lower = stats::quantile(feature_importance, prob=alpha),
            upper = stats::quantile(feature_importance, prob=1-alpha)) %>%
            mutate(observed = truevalues$feature_importance) %>%
            mutate(logmean=log(mean)) %>%
            mutate(logupper=log(upper)) %>%
            mutate(loglower=log(lower)) %>%
            mutate(logobserved=log(observed))
  return(dm)
}


#' Calculate Proportion-Based P-Value for Entire Feature Set
#'
#' This function calculates the proportion-based p-value for the entire feature set by comparing
#' the observed sum of absolute deviations from the mean feature importance to those from permuted data.
#' It is particularly useful in permutation tests to assess the statistical significance of the observed data.
#'
#' @param permutedvalues A data frame containing the permuted feature importances.
#'   It should include columns for feature importance and permutation number.
#'   Typically, this data frame is generated by a function that performs permutation tests.
#' @param quantiledata A data frame containing quantile information of feature importances.
#'   This data frame should include columns for feature rank, mean permuted importance, observed true importance,
#'   and optionally their logarithmic transformations. This is usually output from `calculateQuantiles` function.
#' @return A data frame with a single column "p_val_set".
#'   This column contains the calculated p-value, which quantifies the probability of observing a sum of absolute deviations
#'   as extreme as the one observed, under the null hypothesis. If the p-value is extremely low (below the resolution of the
#'   number of permutations), it is reported as less than the reciprocal of the number of permutations.
#' @import dplyr
#' @examples
#' pvalue_set <- calculate_full_set_pvalue(feat_importances$permuted_importances, quantile_data)
#' @export
calculate_full_set_pvalue <- function(permutedvalues, quantiledata) {
  d <- permutedvalues
  numberofpermutations <- max(d$permutation)

  # Calculate mean for each permuted set
  d <- d %>%
    ungroup() %>%
    mutate(permuted_mean = rep(quantiledata$mean, times = numberofpermutations))

  # Calculate deviations
  d <- d %>%
    dplyr::mutate(Mean = rep(quantiledata$mean, times = numberofpermutations)) %>%
    dplyr::mutate(permuted_std_dev = feature_importance - permuted_mean)

  # Sum of absolute deviations for permuted sets
  pi_permuted <- d %>%
    group_by(permutation) %>%
    dplyr::summarise(sum_abs_deviations = sum(abs(permuted_std_dev)))

  # Sum of absolute deviations for observed data
  pi_obs <- sum(abs(quantiledata$observed - quantiledata$mean))

  # Calculate p-value
  pvalpi <- mean(pi_permuted$sum_abs_deviations >= pi_obs)

  # Handle zero p-values
  if (pvalpi == 0) {
    pvalpi <- paste0("< ", 1 / numberofpermutations)
  }

  outdat <- data.frame("p_val_set" = pvalpi)

  return(outdat)
}


#' Calculate p-values for Each Feature Rank
#'
#' This function calculates p-values for each feature rank in the dataset by comparing
#' the observed feature importances against the distribution of importances in the permuted data.
#' It returns ranks and their respective p-values and proportions up to a specified alpha threshold.
#' Note: P-values returned as 0 actually represent p-values less than the smallest detectable limit.
#' With 1,000 permutations, the smallest detectable p-value is 0.001 (1/n_permutations). Therefore,
#' a reported p-value of 0 should be interpreted as p < 0.001.
#' @param truevalues A data frame containing true feature importances and ranks.
#' @param permutedvalues A data frame containing permuted feature importances and ranks.
#' @param alpha Significance level for determining the cutoff in the one-tailed test (default 0.05).
#' @return A data frame with feature ranks, feature name (name of the feature at that rank in true model set), feature importance, log importance, proportion of observations from permuted set with importances greater than the true set importance value for that rank, and proportion calculated p-values for each rank.
#'   Includes ranks up to the first one where the p-value exceeds the alpha threshold. Results sorted by Rank.
#' @import dplyr
#' @importFrom stats aggregate
#' @examples
#' pvalues_ranks <- calculate_ranked_based_pvalues(feat_importances$true_importances, feat_importances$permuted_importances, alpha=0.05)
#' @export
calculate_ranked_based_pvalues <- function(truevalues, permutedvalues, alpha = 0.05) {
  # Error checks
  if (!is.data.frame(truevalues) || !is.data.frame(permutedvalues)) {
    stop("truevalues and permutedvalues must be data frames.")
  }
  if (!all(c("feature_importance", "feature_rank") %in% colnames(truevalues)) ||
      !all(c("feature_importance", "feature_rank") %in% colnames(permutedvalues))) {
    stop("Both truevalues and permutedvalues must contain 'feature_importance' and 'feature_rank' columns.")
  }
  if (!is.numeric(alpha) || alpha <= 0 || alpha >= 1) {
    stop("alpha must be a numeric value between 0 and 1.")
  }

  numberofpermutations <- max(permutedvalues$permutation)

  # Aggregate permuted feature importances
  fweights <- stats::aggregate(feature_importance ~ feature_rank, permutedvalues, c)

  # Calculate observed mean and proportions
  fweights <- fweights %>%
    mutate(observedmean = truevalues$feature_importance[match(feature_rank, truevalues$feature_rank)]) %>%
    rowwise() %>%
    dplyr::mutate(
      proportion = if_else(observedmean > 0, sum(feature_importance > observedmean, na.rm = TRUE), NA_integer_),
      pvalue = if_else(is.na(proportion), NA_real_, proportion / numberofpermutations)
    )

  # Prepare final data frame
  truevalues$permutation <- NULL
  fweightPvals <- dplyr::select(fweights, feature_rank, proportion, pvalue)
  fweightPvals <- left_join(truevalues, fweightPvals, by = "feature_rank")

  # Apply alpha cutoff
  if(any(fweightPvals$pvalue >= alpha, na.rm = TRUE)) {
    cutoff_index <- which(fweightPvals$pvalue >= alpha, arr.ind = TRUE)[1]
    fweightPvals <- fweightPvals[1:(cutoff_index - 1), ]
  }

  # P-value interpretation warning
  message("Note: P-values returned as 0 actually represent p-values less than the smallest detectable limit.\n",
          "With 1,000 permutations, the smallest detectable p-value is 0.001 (1/n_permutations). Therefore, ",
          "a reported p-value of 0 should be interpreted as p < 0.001.")

  return(fweightPvals)
}
tkolisnik/Rf2pval documentation built on Feb. 20, 2024, 5:39 a.m.