test-cross_pathway_consistency.R
In ggpicrust2: Make 'PICRUSt2' Output Analysis and Visualization Easier

# Comprehensive Cross-Pathway Type Consistency Tests
# Testing KEGG, MetaCyc, and GO pathway integration and consistency

library(testthat)
library(ggpicrust2)

# Test Data Generators for Cross-Pathway Analysis
create_cross_pathway_test_data <- function(n_samples = 24, effect_size = 1.5) {
  set.seed(42)
  
  # Create comprehensive KO abundance data (for KEGG and GO)
  ko_features <- c(
    # Glycolysis pathway
    "K00844", "K12407", "K00845", "K00886", "K08074", "K01810", "K00134",
    # TCA cycle
    "K00239", "K00240", "K00241", "K00242", "K01902", "K01903", "K00031",
    # Pentose phosphate pathway
    "K00016", "K00018", "K00128", "K01595", "K01596", "K00036",
    # Fatty acid metabolism
    "K01623", "K01624", "K11645", "K01803", "K15633", "K00059",
    # Additional random KOs
    paste0("K", sprintf("%05d", sample(1000:9999, 30)))
  )
  
  ko_abundance <- matrix(0, nrow = length(ko_features), ncol = n_samples)
  rownames(ko_abundance) <- ko_features
  colnames(ko_abundance) <- paste0("Sample", 1:n_samples)
  
  # Create realistic log-normal abundance with group differences
  for (i in 1:length(ko_features)) {
    # Base abundance
    base_abundance <- rlnorm(n_samples, meanlog = runif(1, 3, 7), sdlog = 0.8)
    
    # Add group-specific effects for some features
    if (i <= 20) {  # First 20 features show group differences
      group_effect <- c(rep(-effect_size/2, n_samples/2), rep(effect_size/2, n_samples/2))
      base_abundance <- base_abundance * exp(group_effect + rnorm(n_samples, 0, 0.2))
    }
    
    ko_abundance[i, ] <- base_abundance
  }
  
  # Create EC abundance data (for MetaCyc) - similar structure but EC numbers
  ec_features <- c(
    # Core metabolic ECs
    "EC:2.7.1.1", "EC:2.7.1.2", "EC:4.1.2.13", "EC:1.2.1.12", "EC:2.3.1.12",
    "EC:1.1.1.27", "EC:4.2.1.11", "EC:1.8.1.4", "EC:6.2.1.1", "EC:1.3.1.9",
    "EC:1.1.1.35", "EC:2.8.3.5", "EC:1.1.1.40", "EC:4.1.1.31", "EC:2.3.1.16",
    # Additional random ECs
    paste0("EC:", sample(1:6, 25, replace = TRUE), ".", 
           sample(1:20, 25, replace = TRUE), ".", 
           sample(1:50, 25, replace = TRUE), ".",
           sample(1:100, 25, replace = TRUE))
  )
  
  ec_abundance <- matrix(0, nrow = length(ec_features), ncol = n_samples)
  rownames(ec_abundance) <- ec_features
  colnames(ec_abundance) <- paste0("Sample", 1:n_samples)
  
  for (i in 1:length(ec_features)) {
    base_abundance <- rlnorm(n_samples, meanlog = runif(1, 3, 7), sdlog = 0.8)
    if (i <= 15) {  # First 15 features show group differences
      group_effect <- c(rep(-effect_size/2, n_samples/2), rep(effect_size/2, n_samples/2))
      base_abundance <- base_abundance * exp(group_effect + rnorm(n_samples, 0, 0.2))
    }
    ec_abundance[i, ] <- base_abundance
  }
  
  # Create metadata
  metadata <- data.frame(
    sample_name = paste0("Sample", 1:n_samples),
    Environment = factor(rep(c("Healthy", "Disease"), each = n_samples/2)),
    Batch = factor(rep(1:3, length.out = n_samples)),
    Age = runif(n_samples, 25, 75),
    BMI = rnorm(n_samples, 25, 4)
  )
  rownames(metadata) <- metadata$sample_name
  
  return(list(
    ko_abundance = ko_abundance,
    ec_abundance = ec_abundance,
    metadata = metadata
  ))
}

create_mock_gene_sets <- function(pathway_type, gene_names) {
  set.seed(123)
  
  if (pathway_type == "KEGG") {
    pathways <- paste0("ko", sprintf("%05d", 10:25))
    gene_sets <- list()
    for (i in seq_along(pathways)) {
      pathway_size <- sample(8:25, 1)
      genes <- sample(gene_names, min(pathway_size, length(gene_names)))
      gene_sets[[pathways[i]]] <- genes
    }
  } else if (pathway_type == "MetaCyc") {
    pathways <- paste0("PWY-", sample(1000:9999, 16))
    gene_sets <- list()
    for (i in seq_along(pathways)) {
      pathway_size <- sample(5:20, 1)
      genes <- sample(gene_names, min(pathway_size, length(gene_names)))
      gene_sets[[pathways[i]]] <- genes
    }
  } else if (pathway_type == "GO") {
    pathways <- paste0("GO:", sprintf("%07d", sample(1:9999999, 14)))
    gene_sets <- list()
    for (i in seq_along(pathways)) {
      pathway_size <- sample(10:30, 1)
      genes <- sample(gene_names, min(pathway_size, length(gene_names)))
      gene_sets[[pathways[i]]] <- genes
    }
  }
  
  return(gene_sets)
}

# API Consistency Tests
test_that("all pathway types have identical function signatures", {
  # Test that pathway_gsea accepts all pathway types with same parameters
  test_data <- create_cross_pathway_test_data()
  
  # Create base parameters
  base_params <- list(
    metadata = test_data$metadata,
    group = "Environment",
    method = "fgsea",
    rank_method = "signal2noise",
    nperm = 100,
    min_size = 5,
    max_size = 200,
    p.adjust = "BH",
    seed = 123
  )
  
  pathway_types <- c("KEGG", "MetaCyc", "GO")
  
  for (pathway_type in pathway_types) {
    # Mock prepare_gene_sets for each pathway type
    if (pathway_type == "KEGG" || pathway_type == "GO") {
      abundance_data <- test_data$ko_abundance
      gene_names <- rownames(abundance_data)
    } else {
      abundance_data <- test_data$ec_abundance
      gene_names <- rownames(abundance_data)
    }
    
    mock_gene_sets <- create_mock_gene_sets(pathway_type, gene_names)
    mockery::stub(pathway_gsea, "prepare_gene_sets", function(...) mock_gene_sets)
    
    # Mock fgsea result
    mock_result <- data.frame(
      pathway_id = names(mock_gene_sets),
      pathway_name = names(mock_gene_sets),
      size = sapply(mock_gene_sets, length),
      ES = runif(length(mock_gene_sets), -2, 2),
      NES = runif(length(mock_gene_sets), -3, 3),
      pvalue = runif(length(mock_gene_sets), 0.001, 0.5),
      p.adjust = runif(length(mock_gene_sets), 0.005, 0.8),
      leading_edge = sapply(mock_gene_sets, function(x) paste(sample(x, min(5, length(x))), collapse = ";")),
      stringsAsFactors = FALSE
    )
    
    mockery::stub(pathway_gsea, "run_fgsea", function(...) mock_result)
    
    # Test function call
    params <- c(list(abundance = abundance_data, pathway_type = pathway_type), base_params)
    
    expect_no_error({
      result <- do.call(pathway_gsea, params)
    }, info = paste("pathway_gsea should accept", pathway_type, "pathway type"))
    
    # Check result structure is identical
    expected_columns <- c("pathway_id", "pathway_name", "size", "ES", "NES", 
                         "pvalue", "p.adjust", "leading_edge", "method")
    expect_named(result, expected_columns,
                 info = paste("Result structure should be consistent for", pathway_type))
  }
})

test_that("return data structures are identical across pathway types", {
  test_data <- create_cross_pathway_test_data()
  
  # Test each pathway type and collect results
  results <- list()
  pathway_types <- c("KEGG", "MetaCyc", "GO")
  
  for (pathway_type in pathway_types) {
    if (pathway_type == "KEGG" || pathway_type == "GO") {
      abundance_data <- test_data$ko_abundance
      gene_names <- rownames(abundance_data)
    } else {
      abundance_data <- test_data$ec_abundance
      gene_names <- rownames(abundance_data)
    }
    
    mock_gene_sets <- create_mock_gene_sets(pathway_type, gene_names)
    mockery::stub(pathway_gsea, "prepare_gene_sets", function(...) mock_gene_sets)
    
    # Create standardized mock result
    mock_result <- data.frame(
      pathway_id = names(mock_gene_sets),
      pathway_name = names(mock_gene_sets),
      size = rep(15, length(mock_gene_sets)),  # Same size for comparison
      ES = rep(c(-1.5, 1.2, -0.8, 2.1), length.out = length(mock_gene_sets)),
      NES = rep(c(-2.3, 1.8, -1.2, 3.1), length.out = length(mock_gene_sets)),
      pvalue = rep(c(0.01, 0.05, 0.2, 0.001), length.out = length(mock_gene_sets)),
      p.adjust = rep(c(0.05, 0.1, 0.3, 0.01), length.out = length(mock_gene_sets)),
      leading_edge = rep("gene1;gene2;gene3", length(mock_gene_sets)),
      stringsAsFactors = FALSE
    )
    
    mockery::stub(pathway_gsea, "run_fgsea", function(...) mock_result)
    
    results[[pathway_type]] <- pathway_gsea(
      abundance = abundance_data,
      metadata = test_data$metadata,
      group = "Environment",
      pathway_type = pathway_type,
      method = "fgsea",
      seed = 123
    )
  }
  
  # Compare column names and data types across all pathway types
  for (i in 1:(length(results)-1)) {
    for (j in (i+1):length(results)) {
      type1 <- names(results)[i]
      type2 <- names(results)[j]
      
      expect_equal(names(results[[i]]), names(results[[j]]),
                   info = paste("Column names should match between", type1, "and", type2))
      
      expect_equal(sapply(results[[i]], class), sapply(results[[j]], class),
                   info = paste("Column types should match between", type1, "and", type2))
    }
  }
})

# Statistical Consistency Tests
test_that("statistical calculations are consistent across pathway types", {
  skip_if_not_installed("fgsea")
  
  test_data <- create_cross_pathway_test_data()
  
  # Use same ranking method and compare statistical properties
  ranking_methods <- c("signal2noise", "t_test", "log2_ratio", "diff_abundance")
  pathway_types <- c("KEGG", "MetaCyc", "GO")
  
  for (rank_method in ranking_methods) {
    rank_results <- list()
    
    for (pathway_type in pathway_types) {
      if (pathway_type == "KEGG" || pathway_type == "GO") {
        abundance_data <- test_data$ko_abundance
      } else {
        abundance_data <- test_data$ec_abundance
      }
      
      # Calculate ranking metric directly
      rank_metric <- calculate_rank_metric(
        abundance = abundance_data,
        metadata = test_data$metadata,
        group = "Environment",
        method = rank_method
      )
      
      rank_results[[pathway_type]] <- rank_metric
    }
    
    # Compare statistical properties of ranking metrics
    rank_stats <- lapply(rank_results, function(x) {
      list(
        mean = mean(x, na.rm = TRUE),
        sd = sd(x, na.rm = TRUE),
        min = min(x, na.rm = TRUE),
        max = max(x, na.rm = TRUE),
        n_finite = sum(is.finite(x))
      )
    })
    
    # All ranking methods should produce valid statistics
    for (pathway_type in pathway_types) {
      expect_true(is.finite(rank_stats[[pathway_type]]$mean),
                   info = paste(rank_method, "should produce finite mean for", pathway_type))
      expect_true(rank_stats[[pathway_type]]$sd > 0,
                   info = paste(rank_method, "should produce positive SD for", pathway_type))
      expect_true(rank_stats[[pathway_type]]$n_finite > 0,
                   info = paste(rank_method, "should produce finite values for", pathway_type))
    }
  }
})

test_that("p-value distributions are appropriate for all pathway types", {
  skip_if_not_installed("fgsea")
  
  test_data <- create_cross_pathway_test_data()
  pathway_types <- c("KEGG", "MetaCyc", "GO")
  
  pvalue_distributions <- list()
  
  for (pathway_type in pathway_types) {
    if (pathway_type == "KEGG" || pathway_type == "GO") {
      abundance_data <- test_data$ko_abundance
      gene_names <- rownames(abundance_data)
    } else {
      abundance_data <- test_data$ec_abundance
      gene_names <- rownames(abundance_data)
    }
    
    mock_gene_sets <- create_mock_gene_sets(pathway_type, gene_names)
    mockery::stub(pathway_gsea, "prepare_gene_sets", function(...) mock_gene_sets)
    
    # Create realistic p-value distribution
    n_pathways <- length(mock_gene_sets)
    # Mix of significant and non-significant p-values
    pvalues <- c(
      runif(n_pathways * 0.2, 0.001, 0.05),  # 20% significant
      runif(n_pathways * 0.8, 0.05, 1.0)     # 80% non-significant
    )[1:n_pathways]
    
    mock_result <- data.frame(
      pathway_id = names(mock_gene_sets),
      pathway_name = names(mock_gene_sets),
      size = sapply(mock_gene_sets, length),
      ES = runif(n_pathways, -2, 2),
      NES = runif(n_pathways, -3, 3),
      pvalue = pvalues,
      p.adjust = p.adjust(pvalues, method = "BH"),
      leading_edge = sapply(mock_gene_sets, function(x) paste(sample(x, min(3, length(x))), collapse = ";")),
      stringsAsFactors = FALSE
    )
    
    mockery::stub(pathway_gsea, "run_fgsea", function(...) mock_result)
    
    result <- pathway_gsea(
      abundance = abundance_data,
      metadata = test_data$metadata,
      group = "Environment",
      pathway_type = pathway_type,
      method = "fgsea",
      seed = 123
    )
    
    pvalue_distributions[[pathway_type]] <- result$pvalue
  }
  
  # Test p-value distribution properties
  for (pathway_type in pathway_types) {
    pvals <- pvalue_distributions[[pathway_type]]
    
    # P-values should be between 0 and 1
    expect_true(all(pvals >= 0 & pvals <= 1),
                info = paste("P-values should be valid for", pathway_type))
    
    # Should have reasonable distribution (not all 0 or 1)
    expect_true(mean(pvals) > 0.01 & mean(pvals) < 0.99,
                info = paste("P-value distribution should be reasonable for", pathway_type))
    
    # Should have some variation
    expect_true(sd(pvals) > 0.01,
                info = paste("P-values should show variation for", pathway_type))
  }
})

# Integration Workflow Tests
test_that("switching between pathway types in same session works", {
  skip_if_not_installed("fgsea")
  
  test_data <- create_cross_pathway_test_data()
  
  # Simulate a workflow where user switches between pathway types
  workflow_results <- list()
  
  # KEGG analysis
  kegg_gene_sets <- create_mock_gene_sets("KEGG", rownames(test_data$ko_abundance))
  mockery::stub(pathway_gsea, "prepare_gene_sets", function(pathway_type, ...) {
    if (pathway_type == "KEGG") return(kegg_gene_sets)
    else return(list())
  })
  
  kegg_mock <- data.frame(
    pathway_id = names(kegg_gene_sets)[1:10],
    pathway_name = names(kegg_gene_sets)[1:10],
    size = rep(15, 10),
    ES = runif(10, -2, 2),
    NES = runif(10, -3, 3),
    pvalue = runif(10, 0.01, 0.3),
    p.adjust = runif(10, 0.05, 0.4),
    leading_edge = rep("K00001;K00002", 10),
    stringsAsFactors = FALSE
  )
  
  mockery::stub(pathway_gsea, "run_fgsea", function(...) kegg_mock)
  
  workflow_results$kegg <- pathway_gsea(
    abundance = test_data$ko_abundance,
    metadata = test_data$metadata,
    group = "Environment",
    pathway_type = "KEGG",
    method = "fgsea",
    seed = 123
  )
  
  # Switch to MetaCyc analysis
  metacyc_gene_sets <- create_mock_gene_sets("MetaCyc", rownames(test_data$ec_abundance))
  mockery::stub(pathway_gsea, "prepare_gene_sets", function(pathway_type, ...) {
    if (pathway_type == "MetaCyc") return(metacyc_gene_sets)
    else return(list())
  })
  
  metacyc_mock <- data.frame(
    pathway_id = names(metacyc_gene_sets)[1:10],
    pathway_name = names(metacyc_gene_sets)[1:10],
    size = rep(12, 10),
    ES = runif(10, -1.5, 1.5),
    NES = runif(10, -2.5, 2.5),
    pvalue = runif(10, 0.02, 0.4),
    p.adjust = runif(10, 0.06, 0.5),
    leading_edge = rep("EC:1.1.1.1;EC:2.2.2.2", 10),
    stringsAsFactors = FALSE
  )
  
  mockery::stub(pathway_gsea, "run_fgsea", function(...) metacyc_mock)
  
  workflow_results$metacyc <- pathway_gsea(
    abundance = test_data$ec_abundance,
    metadata = test_data$metadata,
    group = "Environment",
    pathway_type = "MetaCyc",
    method = "fgsea",
    seed = 123
  )
  
  # Switch to GO analysis  
  go_gene_sets <- create_mock_gene_sets("GO", rownames(test_data$ko_abundance))
  mockery::stub(pathway_gsea, "prepare_gene_sets", function(pathway_type, ...) {
    if (pathway_type == "GO") return(go_gene_sets)
    else return(list())
  })
  
  go_mock <- data.frame(
    pathway_id = names(go_gene_sets)[1:10],
    pathway_name = names(go_gene_sets)[1:10],
    size = rep(18, 10),
    ES = runif(10, -1.8, 1.8),
    NES = runif(10, -2.8, 2.8),
    pvalue = runif(10, 0.005, 0.25),
    p.adjust = runif(10, 0.02, 0.35),
    leading_edge = rep("K00100;K00200", 10),
    stringsAsFactors = FALSE
  )
  
  mockery::stub(pathway_gsea, "run_fgsea", function(...) go_mock)
  
  workflow_results$go <- pathway_gsea(
    abundance = test_data$ko_abundance,
    metadata = test_data$metadata,
    group = "Environment",
    pathway_type = "GO",
    method = "fgsea",
    seed = 123
  )
  
  # Verify all analyses completed successfully
  for (pathway_type in names(workflow_results)) {
    expect_s3_class(workflow_results[[pathway_type]], "data.frame",
                    info = paste("Workflow result should be data frame for", pathway_type))
    expect_true(nrow(workflow_results[[pathway_type]]) == 10,
                info = paste("Workflow should return expected number of pathways for", pathway_type))
  }
  
  # Verify no contamination between analyses
  expect_true(all(grepl("^ko", workflow_results$kegg$pathway_id)),
              info = "KEGG results should contain KEGG pathway IDs")
  expect_true(all(grepl("^PWY-", workflow_results$metacyc$pathway_id)),
              info = "MetaCyc results should contain MetaCyc pathway IDs")
  expect_true(all(grepl("^GO:", workflow_results$go$pathway_id)),
              info = "GO results should contain GO pathway IDs")
})

test_that("data format compatibility across pathway transitions", {
  test_data <- create_cross_pathway_test_data()
  
  # Test that user can easily switch between analyses
  pathway_types <- c("KEGG", "MetaCyc", "GO")
  results <- list()
  
  for (pathway_type in pathway_types) {
    if (pathway_type == "KEGG" || pathway_type == "GO") {
      abundance_data <- test_data$ko_abundance
      gene_names <- rownames(abundance_data)
    } else {
      abundance_data <- test_data$ec_abundance  
      gene_names <- rownames(abundance_data)
    }
    
    mock_gene_sets <- create_mock_gene_sets(pathway_type, gene_names)
    mockery::stub(pathway_gsea, "prepare_gene_sets", function(...) mock_gene_sets)
    
    mock_result <- data.frame(
      pathway_id = names(mock_gene_sets)[1:8],
      pathway_name = names(mock_gene_sets)[1:8],
      size = rep(12, 8),
      ES = runif(8, -1, 1),
      NES = runif(8, -2, 2),
      pvalue = runif(8, 0.01, 0.3),
      p.adjust = runif(8, 0.05, 0.4),
      leading_edge = rep("gene1;gene2", 8),
      stringsAsFactors = FALSE
    )
    
    mockery::stub(pathway_gsea, "run_fgsea", function(...) mock_result)
    
    results[[pathway_type]] <- pathway_gsea(
      abundance = abundance_data,
      metadata = test_data$metadata,
      group = "Environment",
      pathway_type = pathway_type,
      method = "fgsea",
      seed = 123
    )
  }
  
  # Test that results can be combined for comparison
  combined_results <- do.call(rbind, lapply(names(results), function(type) {
    df <- results[[type]]
    df$pathway_type <- type
    return(df)
  }))
  
  expect_s3_class(combined_results, "data.frame")
  expect_equal(nrow(combined_results), 24)  # 8 pathways × 3 types
  expect_true("pathway_type" %in% colnames(combined_results))
  
  # Test that each pathway type is represented
  type_counts <- table(combined_results$pathway_type)
  expect_equal(as.numeric(type_counts), rep(8, 3))
  expect_equal(names(type_counts), c("GO", "KEGG", "MetaCyc"))
})

# Visualization Consistency Tests  
test_that("visualize_gsea works consistently across pathway types", {
  skip_if_not_installed("ggplot2")
  
  test_data <- create_cross_pathway_test_data()
  pathway_types <- c("KEGG", "MetaCyc", "GO")
  
  for (pathway_type in pathway_types) {
    if (pathway_type == "KEGG" || pathway_type == "GO") {
      gene_names <- rownames(test_data$ko_abundance)
    } else {
      gene_names <- rownames(test_data$ec_abundance)
    }
    
    # Create mock GSEA results
    mock_gene_sets <- create_mock_gene_sets(pathway_type, gene_names)
    gsea_results <- data.frame(
      pathway_id = names(mock_gene_sets)[1:5],
      pathway_name = names(mock_gene_sets)[1:5],
      size = rep(15, 5),
      ES = c(-1.5, 1.2, -0.8, 2.1, -1.1),
      NES = c(-2.3, 1.8, -1.2, 3.1, -1.7),
      pvalue = c(0.01, 0.05, 0.2, 0.001, 0.08),
      p.adjust = c(0.05, 0.1, 0.3, 0.01, 0.15),
      leading_edge = rep("gene1;gene2;gene3", 5),
      method = "fgsea",
      stringsAsFactors = FALSE
    )
    
    # Test different plot types
    plot_types <- c("enrichment_plot", "dotplot", "barplot")
    
    for (plot_type in plot_types) {
      expect_no_error({
        plot <- visualize_gsea(
          gsea_results = gsea_results,
          plot_type = plot_type,
          n_pathways = 5
        )
      }, info = paste("visualize_gsea should work with", pathway_type, plot_type))
      
      expect_s3_class(plot, "ggplot",
                       info = paste("Should return ggplot object for", pathway_type, plot_type))
    }
  }
})

test_that("pathway annotation system consistency", {
  test_data <- create_cross_pathway_test_data()
  pathway_types <- c("KEGG", "MetaCyc", "GO")
  
  for (pathway_type in pathway_types) {
    # Create mock GSEA results
    if (pathway_type == "KEGG") {
      pathway_ids <- paste0("ko", sprintf("%05d", 10:15))
    } else if (pathway_type == "MetaCyc") {
      pathway_ids <- paste0("PWY-", 1000:1005)
    } else {
      pathway_ids <- paste0("GO:", sprintf("%07d", 1:6))
    }
    
    gsea_results <- data.frame(
      pathway_id = pathway_ids,
      pathway_name = pathway_ids,  # Will be updated by annotation
      size = rep(15, 6),
      ES = runif(6, -2, 2),
      NES = runif(6, -3, 3),
      pvalue = runif(6, 0.01, 0.3),
      p.adjust = runif(6, 0.05, 0.4),
      leading_edge = rep("gene1;gene2", 6),
      method = "fgsea",
      stringsAsFactors = FALSE
    )
    
    # Mock annotation references
    if (pathway_type == "KEGG") {
      mock_reference <- data.frame(
        pathway = pathway_ids,
        pathway_name = paste("KEGG pathway", 1:6),
        class = rep("Metabolism", 6),
        stringsAsFactors = FALSE
      )
      mockery::stub(gsea_pathway_annotation, "load", function(file) {
        kegg_reference <<- mock_reference
      })
    } else if (pathway_type == "MetaCyc") {
      mock_reference <- data.frame(
        V1 = pathway_ids,
        V2 = paste("MetaCyc pathway", 1:6),
        stringsAsFactors = FALSE
      )
      mockery::stub(gsea_pathway_annotation, "load", function(file) {
        MetaCyc_reference <<- mock_reference
      })
    } else {
      mock_reference <- data.frame(
        go_id = pathway_ids,
        go_name = paste("GO term", 1:6),
        category = rep("BP", 6),
        ko_members = rep("K00001;K00002", 6),
        stringsAsFactors = FALSE
      )
      mockery::stub(gsea_pathway_annotation, "ko_to_go_reference", mock_reference)
    }
    
    # Test annotation function
    expect_no_error({
      annotated_results <- gsea_pathway_annotation(
        gsea_results = gsea_results,
        pathway_type = pathway_type
      )
    }, info = paste("Annotation should work for", pathway_type))
    
    # Check that pathway names were added
    expect_true("pathway_name" %in% colnames(annotated_results),
                info = paste("Annotated results should have pathway_name column for", pathway_type))
    
    # Check that some annotations were applied
    expect_true(any(annotated_results$pathway_name != annotated_results$pathway_id),
                info = paste("Some pathway names should be different from IDs for", pathway_type))
  }
})

# Performance Consistency Tests
test_that("performance is comparable across pathway types", {
  skip_if_not_installed("fgsea")
  
  test_data <- create_cross_pathway_test_data(n_samples = 40)
  pathway_types <- c("KEGG", "MetaCyc", "GO")
  execution_times <- list()
  
  for (pathway_type in pathway_types) {
    if (pathway_type == "KEGG" || pathway_type == "GO") {
      abundance_data <- test_data$ko_abundance
      gene_names <- rownames(abundance_data)
    } else {
      abundance_data <- test_data$ec_abundance
      gene_names <- rownames(abundance_data)
    }
    
    # Create larger gene sets for performance testing
    mock_gene_sets <- create_mock_gene_sets(pathway_type, gene_names)
    # Expand to more pathways
    for (i in 16:50) {
      if (pathway_type == "KEGG") {
        pathway_id <- paste0("ko", sprintf("%05d", i))
      } else if (pathway_type == "MetaCyc") {
        pathway_id <- paste0("PWY-", 1000 + i)
      } else {
        pathway_id <- paste0("GO:", sprintf("%07d", i))
      }
      pathway_size <- sample(8:25, 1)
      genes <- sample(gene_names, min(pathway_size, length(gene_names)))
      mock_gene_sets[[pathway_id]] <- genes
    }
    
    mockery::stub(pathway_gsea, "prepare_gene_sets", function(...) mock_gene_sets)
    
    # Create comprehensive mock result
    mock_result <- data.frame(
      pathway_id = names(mock_gene_sets),
      pathway_name = names(mock_gene_sets),
      size = sapply(mock_gene_sets, length),
      ES = runif(length(mock_gene_sets), -2, 2),
      NES = runif(length(mock_gene_sets), -3, 3),
      pvalue = runif(length(mock_gene_sets), 0.001, 0.5),
      p.adjust = runif(length(mock_gene_sets), 0.005, 0.8),
      leading_edge = sapply(mock_gene_sets, function(x) paste(sample(x, min(5, length(x))), collapse = ";")),
      stringsAsFactors = FALSE
    )
    
    mockery::stub(pathway_gsea, "run_fgsea", function(...) mock_result)
    
    # Measure execution time
    start_time <- Sys.time()
    
    result <- pathway_gsea(
      abundance = abundance_data,
      metadata = test_data$metadata,
      group = "Environment",
      pathway_type = pathway_type,
      method = "fgsea",
      nperm = 100,
      seed = 123
    )
    
    end_time <- Sys.time()
    execution_times[[pathway_type]] <- as.numeric(end_time - start_time, units = "secs")
    
    # Verify result completeness
    expect_s3_class(result, "data.frame")
    expect_true(nrow(result) > 30)
    expect_true(all(!is.na(result$pvalue)))
  }
  
  # Compare execution times - should be reasonably similar
  times <- unlist(execution_times)
  expect_true(all(times < 5), "All pathway types should complete within 5 seconds")
  
  # No pathway type should be more than 3x slower than the fastest
  time_ratio <- max(times) / min(times)
  expect_true(time_ratio < 3, 
              paste("Performance ratio should be reasonable. Actual ratio:", round(time_ratio, 2)))
})

# Comparative Analysis Tests
test_that("pathway overlap analysis between types works", {
  test_data <- create_cross_pathway_test_data()
  
  # Create overlapping gene sets between pathway types
  # Some KOs appear in both KEGG and GO pathways
  common_kos <- rownames(test_data$ko_abundance)[1:20]
  
  kegg_sets <- list(
    "ko00010" = common_kos[1:10],
    "ko00020" = common_kos[5:15],
    "ko00030" = common_kos[10:20]
  )
  
  go_sets <- list(
    "GO:0006096" = common_kos[1:8],   # Overlap with ko00010
    "GO:0006099" = common_kos[12:18], # Overlap with ko00030
    "GO:0008152" = common_kos[1:5]    # Overlap with ko00010
  )
  
  metacyc_sets <- list(
    "PWY-1001" = rownames(test_data$ec_abundance)[1:8],
    "PWY-1002" = rownames(test_data$ec_abundance)[5:12],
    "PWY-1003" = rownames(test_data$ec_abundance)[10:15]
  )
  
  # Mock results for each pathway type
  mock_kegg <- data.frame(
    pathway_id = names(kegg_sets),
    pathway_name = c("Glycolysis", "Citrate cycle", "Pentose phosphate"),
    size = c(10, 11, 11),
    ES = c(-1.5, 1.2, -0.8),
    NES = c(-2.3, 1.8, -1.2),
    pvalue = c(0.01, 0.05, 0.15),
    p.adjust = c(0.03, 0.1, 0.2),
    leading_edge = c("K00001;K00002", "K00010;K00011", "K00020"),
    method = "fgsea",
    stringsAsFactors = FALSE
  )
  
  mock_go <- data.frame(
    pathway_id = names(go_sets),
    pathway_name = c("Glycolytic process", "TCA cycle", "Metabolic process"),
    size = c(8, 7, 5),
    ES = c(-1.3, 1.0, -1.1),
    NES = c(-2.0, 1.5, -1.8),
    pvalue = c(0.02, 0.08, 0.12),
    p.adjust = c(0.06, 0.15, 0.18),
    leading_edge = c("K00001;K00003", "K00015;K00016", "K00002"),
    method = "fgsea",
    stringsAsFactors = FALSE
  )
  
  mock_metacyc <- data.frame(
    pathway_id = names(metacyc_sets),
    pathway_name = c("Glucose degradation", "Fatty acid oxidation", "Amino acid biosynthesis"),
    size = c(8, 8, 6),
    ES = c(-1.0, 0.9, 1.3),
    NES = c(-1.7, 1.4, 2.0),
    pvalue = c(0.04, 0.06, 0.03),
    p.adjust = c(0.08, 0.12, 0.06),
    leading_edge = c("EC:1.1.1.1;EC:2.2.2.2", "EC:3.3.3.3", "EC:4.4.4.4;EC:5.5.5.5"),
    method = "fgsea",
    stringsAsFactors = FALSE
  )
  
  # Test that we can identify complementary biological insights
  expect_true(nrow(mock_kegg) > 0, "KEGG analysis should find pathways")
  expect_true(nrow(mock_go) > 0, "GO analysis should find pathways")
  expect_true(nrow(mock_metacyc) > 0, "MetaCyc analysis should find pathways")
  
  # Test pathway name similarity analysis
  kegg_names <- tolower(mock_kegg$pathway_name)
  go_names <- tolower(mock_go$pathway_name)
  
  # Should find some conceptual overlap
  name_similarity <- outer(kegg_names, go_names, function(x, y) {
    sapply(seq_along(x), function(i) {
      sum(strsplit(x[i], " ")[[1]] %in% strsplit(y[i], " ")[[1]])
    })
  })
  
  expect_true(any(name_similarity > 0), 
              "Should find some conceptual overlap between KEGG and GO pathway names")
})

test_that("biological interpretation consistency check", {
  # Test that the three pathway types provide complementary insights
  # This is more of a sanity check for biological coherence
  
  # Mock results representing typical microbiome analysis outcomes
  kegg_results <- data.frame(
    pathway_id = c("ko00010", "ko00020", "ko00030", "ko00230", "ko00260"),
    pathway_name = c("Glycolysis", "Citrate cycle", "Pentose phosphate", "Purine metabolism", "Glycine metabolism"),
    NES = c(-2.1, 1.5, -1.2, 2.3, -1.8),
    pvalue = c(0.001, 0.02, 0.15, 0.005, 0.03),
    p.adjust = c(0.005, 0.04, 0.2, 0.015, 0.06),
    pathway_type = "KEGG",
    stringsAsFactors = FALSE
  )
  
  go_results <- data.frame(
    pathway_id = c("GO:0006096", "GO:0006099", "GO:0008152", "GO:0009058", "GO:0006520"),
    pathway_name = c("Glycolytic process", "TCA cycle", "Metabolic process", "Biosynthetic process", "Amino acid metabolism"),
    NES = c(-1.9, 1.3, -1.5, 2.0, -1.6),
    pvalue = c(0.002, 0.03, 0.08, 0.01, 0.04),
    p.adjust = c(0.01, 0.06, 0.12, 0.02, 0.08),
    pathway_type = "GO",
    stringsAsFactors = FALSE
  )
  
  metacyc_results <- data.frame(
    pathway_id = c("PWY-1001", "PWY-1002", "PWY-1003", "PWY-1004", "PWY-1005"),
    pathway_name = c("Glucose degradation", "TCA cycle", "Glycogen biosynthesis", "Fatty acid oxidation", "Leucine biosynthesis"),
    NES = c(-2.0, 1.4, 1.7, -1.3, 2.2),
    pvalue = c(0.001, 0.025, 0.015, 0.12, 0.008),
    p.adjust = c(0.005, 0.05, 0.03, 0.15, 0.02),
    pathway_type = "MetaCyc",
    stringsAsFactors = FALSE
  )
  
  # Test biological coherence
  # 1. Similar pathways should show similar enrichment directions
  glycolysis_kegg <- kegg_results[kegg_results$pathway_name == "Glycolysis", "NES"]
  glycolysis_go <- go_results[go_results$pathway_name == "Glycolytic process", "NES"] 
  glycolysis_metacyc <- metacyc_results[metacyc_results$pathway_name == "Glucose degradation", "NES"]
  
  # All should be negative (depleted) - consistent with each other
  expect_true(sign(glycolysis_kegg) == sign(glycolysis_go),
              "KEGG and GO glycolysis pathways should have same enrichment direction")
  expect_true(sign(glycolysis_kegg) == sign(glycolysis_metacyc),
              "KEGG and MetaCyc glucose degradation should have same enrichment direction")
  
  # 2. Test that we have both positive and negative enrichments (realistic biology)
  for (results in list(kegg_results, go_results, metacyc_results)) {
    expect_true(any(results$NES > 0) && any(results$NES < 0),
                "Should have both upregulated and downregulated pathways")
    
    expect_true(any(results$pvalue < 0.05),
                "Should have some significant pathways")
  }
  
  # 3. P-value distributions should be reasonable
  all_pvals <- c(kegg_results$pvalue, go_results$pvalue, metacyc_results$pvalue)
  expect_true(mean(all_pvals) > 0.01 && mean(all_pvals) < 0.8,
              "P-value distribution should be reasonable across all pathway types")
})

# Summary test for overall consistency
test_that("overall cross-pathway consistency summary", {
  # This test summarizes that all major consistency checks pass
  
  test_data <- create_cross_pathway_test_data()
  pathway_types <- c("KEGG", "MetaCyc", "GO")
  
  # Test that all pathway types can be analyzed successfully
  success_count <- 0
  
  for (pathway_type in pathway_types) {
    tryCatch({
      if (pathway_type == "KEGG" || pathway_type == "GO") {
        abundance_data <- test_data$ko_abundance
        gene_names <- rownames(abundance_data)
      } else {
        abundance_data <- test_data$ec_abundance
        gene_names <- rownames(abundance_data)
      }
      
      mock_gene_sets <- create_mock_gene_sets(pathway_type, gene_names)
      mockery::stub(pathway_gsea, "prepare_gene_sets", function(...) mock_gene_sets)
      
      mock_result <- data.frame(
        pathway_id = names(mock_gene_sets)[1:5],
        pathway_name = names(mock_gene_sets)[1:5],
        size = rep(15, 5),
        ES = runif(5, -2, 2),
        NES = runif(5, -3, 3),
        pvalue = runif(5, 0.01, 0.3),
        p.adjust = runif(5, 0.05, 0.4),
        leading_edge = rep("gene1;gene2", 5),
        stringsAsFactors = FALSE
      )
      
      mockery::stub(pathway_gsea, "run_fgsea", function(...) mock_result)
      
      result <- pathway_gsea(
        abundance = abundance_data,
        metadata = test_data$metadata,
        group = "Environment",
        pathway_type = pathway_type,
        method = "fgsea",
        seed = 123
      )
      
      # Basic validation
      if (is.data.frame(result) && nrow(result) == 5 && 
          all(c("pathway_id", "NES", "pvalue") %in% colnames(result))) {
        success_count <- success_count + 1
      }
      
    }, error = function(e) {
      # Test should not error
    })
  }
  
  expect_equal(success_count, 3)
  
  # Print summary message
  message("Cross-pathway consistency tests completed successfully!")
  message("✓ API consistency across KEGG, MetaCyc, and GO")
  message("✓ Statistical consistency in calculations")
  message("✓ Visualization compatibility")  
  message("✓ Annotation system uniformity")
  message("✓ Performance parity")
  message("✓ Integration workflow reliability")
})
Any scripts or data that you put into this service are public.
ggpicrust2 documentation built on Aug. 26, 2025, 1:07 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
ggpicrust2
Make 'PICRUSt2' Output Analysis and Visualization Easier

tests/testthat/test-cross_pathway_consistency.R
In ggpicrust2: Make 'PICRUSt2' Output Analysis and Visualization Easier

Try the ggpicrust2 package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

ggpicrust2 Make 'PICRUSt2' Output Analysis and Visualization Easier

tests/testthat/test-cross_pathway_consistency.R In ggpicrust2: Make 'PICRUSt2' Output Analysis and Visualization Easier

Try the ggpicrust2 package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

ggpicrust2
Make 'PICRUSt2' Output Analysis and Visualization Easier

tests/testthat/test-cross_pathway_consistency.R
In ggpicrust2: Make 'PICRUSt2' Output Analysis and Visualization Easier