R/pathway_errorbar_table.R
In ggpicrust2: Make 'PICRUSt2' Output Analysis and Visualization Easier

Documented in pathway_errorbar_table

#' Generate Abundance Statistics Table for Pathway Analysis
#'
#' This function generates a table containing mean relative abundance, 
#' standard deviation, and log2 fold change statistics for pathways, 
#' similar to the data used in pathway_errorbar plots but returned as a 
#' data frame instead of a plot.
#'
#' @param abundance A data frame or matrix containing predicted functional 
#'        pathway abundance, with pathways/features as rows and samples as 
#'        columns. The column names should match the sample names in metadata.
#' @param daa_results_df A data frame containing differential abundance 
#'        analysis results from pathway_daa function. Must contain columns: 
#'        feature, group1, group2, p_adjust.
#' @param Group A vector containing group assignments for each sample in the
#'        same order as the columns in abundance matrix. Alternatively, if
#'        metadata is provided, this should match the order of samples in metadata.
#' @param ko_to_kegg Logical value indicating whether to use KO to KEGG 
#'        conversion. Default is FALSE.
#' @param p_values_threshold Numeric value for p-value threshold to filter 
#'        significant features. Default is 0.05.
#' @param select Character vector of specific features to include. If NULL, 
#'        all significant features are included.
#' @param max_features Maximum number of features to include in the table.
#'        Default is 30.
#' @param metadata Optional data frame containing sample metadata. If provided,
#'        the Group vector will be reordered to match the abundance column order.
#' @param sample_col Character string specifying the column name in metadata
#'        that contains sample identifiers. Default is "sample_name".
#'
#' @return A data frame containing the following columns:
#' \itemize{
#'   \item \code{feature}: Feature/pathway identifier
#'   \item \code{group1}: Reference group name
#'   \item \code{group2}: Comparison group name
#'   \item \code{mean_rel_abundance_group1}: Mean relative abundance for group1
#'   \item \code{sd_rel_abundance_group1}: Standard deviation of relative
#'         abundance for group1
#'   \item \code{mean_rel_abundance_group2}: Mean relative abundance for group2
#'   \item \code{sd_rel_abundance_group2}: Standard deviation of relative
#'         abundance for group2
#'   \item \code{log2_fold_change}: Log2 fold change (group2/group1)
#'   \item \code{p_adjust}: Adjusted p-value from differential analysis
#'   \item Additional annotation columns (e.g., \code{description},
#'         \code{pathway_name}, \code{pathway_class}) if present in the input
#'         daa_results_df
#' }
#'
#' @examples
#' \dontrun{
#' # Load example data
#' data("ko_abundance")
#' data("metadata")
#' 
#' # Convert KO abundance to KEGG pathways
#' kegg_abundance <- ko2kegg_abundance(data = ko_abundance)
#' 
#' # Perform differential abundance analysis
#' daa_results_df <- pathway_daa(
#'   abundance = kegg_abundance,
#'   metadata = metadata,
#'   group = "Environment",
#'   daa_method = "ALDEx2"
#' )
#' 
#' # Filter for specific method
#' daa_sub_method_results_df <- daa_results_df[
#'   daa_results_df$method == "ALDEx2_Welch's t test", 
#' ]
#' 
#' # Annotate results
#' daa_annotated_sub_method_results_df <- pathway_annotation(
#'   pathway = "KO",
#'   daa_results_df = daa_sub_method_results_df,
#'   ko_to_kegg = TRUE
#' )
#' 
#' # Generate abundance statistics table
#' abundance_stats_table <- pathway_errorbar_table(
#'   abundance = kegg_abundance,
#'   daa_results_df = daa_annotated_sub_method_results_df,
#'   Group = metadata$Environment,
#'   ko_to_kegg = TRUE,
#'   p_values_threshold = 0.05
#' )
#' 
#' # View the results
#' head(abundance_stats_table)
#' }
#'
#' @importFrom stats setNames
#' @export
pathway_errorbar_table <- function(abundance,
                                  daa_results_df,
                                  Group,
                                  ko_to_kegg = FALSE,
                                  p_values_threshold = 0.05,
                                  select = NULL,
                                  max_features = 30,
                                  metadata = NULL,
                                  sample_col = "sample_name") {
  
  # Input validation
  if (!is.matrix(abundance) && !is.data.frame(abundance)) {
    stop("'abundance' must be a matrix or data frame")
  }

  if (!is.data.frame(daa_results_df)) {
    stop("'daa_results_df' must be a data frame")
  }

  # Handle Group vector ordering
  if (!is.null(metadata)) {
    # If metadata is provided, reorder Group to match abundance column order
    if (!sample_col %in% colnames(metadata)) {
      stop("Sample column '", sample_col, "' not found in metadata")
    }

    # Create a mapping from sample to group
    sample_to_group <- setNames(Group, metadata[[sample_col]])

    # Reorder Group to match abundance column order
    abundance_samples <- colnames(abundance)
    Group <- sample_to_group[abundance_samples]

    # Check for missing samples
    if (any(is.na(Group))) {
      missing_samples <- abundance_samples[is.na(Group)]
      stop("Some samples in abundance data not found in metadata: ",
           paste(missing_samples, collapse = ", "))
    }
  }
  
  required_cols <- c("feature", "group1", "group2", "p_adjust")
  missing_cols <- setdiff(required_cols, colnames(daa_results_df))
  if (length(missing_cols) > 0) {
    stop("Missing required columns in daa_results_df: ", 
         paste(missing_cols, collapse = ", "))
  }
  
  if (length(Group) != ncol(abundance)) {
    stop("Length of Group must match number of columns in abundance matrix")
  }
  
  # Check for single method
  if (nlevels(factor(daa_results_df$method)) != 1) {
    stop("The 'method' column in the 'daa_results_df' data frame contains ",
         "more than one method. Please filter it to contain only one method.")
  }
  
  if (nlevels(factor(daa_results_df$group1)) != 1 || 
      nlevels(factor(daa_results_df$group2)) != 1) {
    stop("The 'group1' or 'group2' column in the 'daa_results_df' data frame ",
         "contains more than one group. Please filter each to contain only ",
         "one group.")
  }
  
  # Filter for significant features
  daa_results_filtered_df <- daa_results_df[
    daa_results_df$p_adjust < p_values_threshold, 
  ]
  
  if (!is.null(select)) {
    daa_results_filtered_sub_df <- daa_results_filtered_df[
      daa_results_filtered_df$feature %in% select, 
    ]
  } else {
    daa_results_filtered_sub_df <- daa_results_filtered_df
  }
  
  if (nrow(daa_results_filtered_sub_df) > max_features) {
    warning("The number of features with statistical significance exceeds ", 
            max_features, ". Consider using 'select' parameter or increasing ",
            "'max_features' to include all features.")
  }
  
  if (nrow(daa_results_filtered_sub_df) == 0) {
    stop("No features with statistical significance found. ",
         "Consider adjusting p_values_threshold.")
  }
  
  # Get group names
  group1_name <- unique(daa_results_filtered_sub_df$group1)[1]
  group2_name <- unique(daa_results_filtered_sub_df$group2)[1]
  
  # Calculate abundance statistics using the helper function
  # Create metadata that matches the abundance column order
  # The Group vector should be in the same order as colnames(abundance)
  abundance_metadata <- data.frame(
    sample = colnames(abundance),
    group_col = Group,
    stringsAsFactors = FALSE
  )

  # Ensure the Group vector length matches the number of samples
  if (length(Group) != ncol(abundance)) {
    stop("Length of Group vector (", length(Group), ") does not match number of samples in abundance data (", ncol(abundance), ")")
  }

  abundance_stats <- calculate_abundance_stats(
    abundance = abundance,
    metadata = abundance_metadata,
    group = "group_col",
    features = daa_results_filtered_sub_df$feature,
    group1 = group1_name,
    group2 = group2_name
  )
  
  # Identify annotation columns to include (excluding columns already in abundance_stats)
  abundance_stats_cols <- colnames(abundance_stats)
  annotation_cols <- setdiff(colnames(daa_results_filtered_sub_df),
                             c(abundance_stats_cols, "method", "p_values"))

  # Include p_adjust and annotation columns for merging
  merge_cols <- c("feature", "p_adjust", annotation_cols)
  merge_cols <- intersect(merge_cols, colnames(daa_results_filtered_sub_df))

  # Merge with DAA results to include p-values and other information
  result_table <- merge(
    abundance_stats,
    daa_results_filtered_sub_df[, merge_cols, drop = FALSE],
    by = "feature",
    all.x = TRUE
  )

  # Reorder columns for better readability
  base_column_order <- c(
    "feature", "group1", "group2",
    "mean_rel_abundance_group1", "sd_rel_abundance_group1",
    "mean_rel_abundance_group2", "sd_rel_abundance_group2",
    "log2_fold_change", "p_adjust"
  )

  # Add annotation columns at the end
  # Recalculate annotation columns from the actual result table
  result_annotation_cols <- setdiff(colnames(result_table), base_column_order)
  column_order <- c(base_column_order, result_annotation_cols)

  # Only select columns that actually exist in the result table
  column_order <- intersect(column_order, colnames(result_table))

  result_table <- result_table[, column_order, drop = FALSE]

  # Order by p-value (most significant first)
  result_table <- result_table[order(result_table$p_adjust), ]

  # Reset row names
  rownames(result_table) <- NULL

  return(result_table)
}