tests/testthat/test-cross_pathway_consistency.R

# Comprehensive Cross-Pathway Type Consistency Tests
# Testing KEGG, MetaCyc, and GO pathway integration and consistency

library(testthat)
library(ggpicrust2)

# Test Data Generators for Cross-Pathway Analysis
create_cross_pathway_test_data <- function(n_samples = 24, effect_size = 1.5) {
  set.seed(42)
  
  # Create comprehensive KO abundance data (for KEGG and GO)
  ko_features <- c(
    # Glycolysis pathway
    "K00844", "K12407", "K00845", "K00886", "K08074", "K01810", "K00134",
    # TCA cycle
    "K00239", "K00240", "K00241", "K00242", "K01902", "K01903", "K00031",
    # Pentose phosphate pathway
    "K00016", "K00018", "K00128", "K01595", "K01596", "K00036",
    # Fatty acid metabolism
    "K01623", "K01624", "K11645", "K01803", "K15633", "K00059",
    # Additional random KOs
    paste0("K", sprintf("%05d", sample(1000:9999, 30)))
  )
  
  ko_abundance <- matrix(0, nrow = length(ko_features), ncol = n_samples)
  rownames(ko_abundance) <- ko_features
  colnames(ko_abundance) <- paste0("Sample", 1:n_samples)
  
  # Create realistic log-normal abundance with group differences
  for (i in 1:length(ko_features)) {
    # Base abundance
    base_abundance <- rlnorm(n_samples, meanlog = runif(1, 3, 7), sdlog = 0.8)
    
    # Add group-specific effects for some features
    if (i <= 20) {  # First 20 features show group differences
      group_effect <- c(rep(-effect_size/2, n_samples/2), rep(effect_size/2, n_samples/2))
      base_abundance <- base_abundance * exp(group_effect + rnorm(n_samples, 0, 0.2))
    }
    
    ko_abundance[i, ] <- base_abundance
  }
  
  # Create EC abundance data (for MetaCyc) - similar structure but EC numbers
  ec_features <- c(
    # Core metabolic ECs
    "EC:2.7.1.1", "EC:2.7.1.2", "EC:4.1.2.13", "EC:1.2.1.12", "EC:2.3.1.12",
    "EC:1.1.1.27", "EC:4.2.1.11", "EC:1.8.1.4", "EC:6.2.1.1", "EC:1.3.1.9",
    "EC:1.1.1.35", "EC:2.8.3.5", "EC:1.1.1.40", "EC:4.1.1.31", "EC:2.3.1.16",
    # Additional random ECs
    paste0("EC:", sample(1:6, 25, replace = TRUE), ".", 
           sample(1:20, 25, replace = TRUE), ".", 
           sample(1:50, 25, replace = TRUE), ".",
           sample(1:100, 25, replace = TRUE))
  )
  
  ec_abundance <- matrix(0, nrow = length(ec_features), ncol = n_samples)
  rownames(ec_abundance) <- ec_features
  colnames(ec_abundance) <- paste0("Sample", 1:n_samples)
  
  for (i in 1:length(ec_features)) {
    base_abundance <- rlnorm(n_samples, meanlog = runif(1, 3, 7), sdlog = 0.8)
    if (i <= 15) {  # First 15 features show group differences
      group_effect <- c(rep(-effect_size/2, n_samples/2), rep(effect_size/2, n_samples/2))
      base_abundance <- base_abundance * exp(group_effect + rnorm(n_samples, 0, 0.2))
    }
    ec_abundance[i, ] <- base_abundance
  }
  
  # Create metadata
  metadata <- data.frame(
    sample_name = paste0("Sample", 1:n_samples),
    Environment = factor(rep(c("Healthy", "Disease"), each = n_samples/2)),
    Batch = factor(rep(1:3, length.out = n_samples)),
    Age = runif(n_samples, 25, 75),
    BMI = rnorm(n_samples, 25, 4)
  )
  rownames(metadata) <- metadata$sample_name
  
  return(list(
    ko_abundance = ko_abundance,
    ec_abundance = ec_abundance,
    metadata = metadata
  ))
}

create_mock_gene_sets <- function(pathway_type, gene_names) {
  set.seed(123)
  
  if (pathway_type == "KEGG") {
    pathways <- paste0("ko", sprintf("%05d", 10:25))
    gene_sets <- list()
    for (i in seq_along(pathways)) {
      pathway_size <- sample(8:25, 1)
      genes <- sample(gene_names, min(pathway_size, length(gene_names)))
      gene_sets[[pathways[i]]] <- genes
    }
  } else if (pathway_type == "MetaCyc") {
    pathways <- paste0("PWY-", sample(1000:9999, 16))
    gene_sets <- list()
    for (i in seq_along(pathways)) {
      pathway_size <- sample(5:20, 1)
      genes <- sample(gene_names, min(pathway_size, length(gene_names)))
      gene_sets[[pathways[i]]] <- genes
    }
  } else if (pathway_type == "GO") {
    pathways <- paste0("GO:", sprintf("%07d", sample(1:9999999, 14)))
    gene_sets <- list()
    for (i in seq_along(pathways)) {
      pathway_size <- sample(10:30, 1)
      genes <- sample(gene_names, min(pathway_size, length(gene_names)))
      gene_sets[[pathways[i]]] <- genes
    }
  }
  
  return(gene_sets)
}

# API Consistency Tests
test_that("all pathway types have identical function signatures", {
  # Test that pathway_gsea accepts all pathway types with same parameters
  test_data <- create_cross_pathway_test_data()
  
  # Create base parameters
  base_params <- list(
    metadata = test_data$metadata,
    group = "Environment",
    method = "fgsea",
    rank_method = "signal2noise",
    nperm = 100,
    min_size = 5,
    max_size = 200,
    p.adjust = "BH",
    seed = 123
  )
  
  pathway_types <- c("KEGG", "MetaCyc", "GO")
  
  for (pathway_type in pathway_types) {
    # Mock prepare_gene_sets for each pathway type
    if (pathway_type == "KEGG" || pathway_type == "GO") {
      abundance_data <- test_data$ko_abundance
      gene_names <- rownames(abundance_data)
    } else {
      abundance_data <- test_data$ec_abundance
      gene_names <- rownames(abundance_data)
    }
    
    mock_gene_sets <- create_mock_gene_sets(pathway_type, gene_names)
    mockery::stub(pathway_gsea, "prepare_gene_sets", function(...) mock_gene_sets)
    
    # Mock fgsea result
    mock_result <- data.frame(
      pathway_id = names(mock_gene_sets),
      pathway_name = names(mock_gene_sets),
      size = sapply(mock_gene_sets, length),
      ES = runif(length(mock_gene_sets), -2, 2),
      NES = runif(length(mock_gene_sets), -3, 3),
      pvalue = runif(length(mock_gene_sets), 0.001, 0.5),
      p.adjust = runif(length(mock_gene_sets), 0.005, 0.8),
      leading_edge = sapply(mock_gene_sets, function(x) paste(sample(x, min(5, length(x))), collapse = ";")),
      stringsAsFactors = FALSE
    )
    
    mockery::stub(pathway_gsea, "run_fgsea", function(...) mock_result)
    
    # Test function call
    params <- c(list(abundance = abundance_data, pathway_type = pathway_type), base_params)
    
    expect_no_error({
      result <- do.call(pathway_gsea, params)
    }, info = paste("pathway_gsea should accept", pathway_type, "pathway type"))
    
    # Check result structure is identical
    expected_columns <- c("pathway_id", "pathway_name", "size", "ES", "NES", 
                         "pvalue", "p.adjust", "leading_edge", "method")
    expect_named(result, expected_columns,
                 info = paste("Result structure should be consistent for", pathway_type))
  }
})

test_that("return data structures are identical across pathway types", {
  test_data <- create_cross_pathway_test_data()
  
  # Test each pathway type and collect results
  results <- list()
  pathway_types <- c("KEGG", "MetaCyc", "GO")
  
  for (pathway_type in pathway_types) {
    if (pathway_type == "KEGG" || pathway_type == "GO") {
      abundance_data <- test_data$ko_abundance
      gene_names <- rownames(abundance_data)
    } else {
      abundance_data <- test_data$ec_abundance
      gene_names <- rownames(abundance_data)
    }
    
    mock_gene_sets <- create_mock_gene_sets(pathway_type, gene_names)
    mockery::stub(pathway_gsea, "prepare_gene_sets", function(...) mock_gene_sets)
    
    # Create standardized mock result
    mock_result <- data.frame(
      pathway_id = names(mock_gene_sets),
      pathway_name = names(mock_gene_sets),
      size = rep(15, length(mock_gene_sets)),  # Same size for comparison
      ES = rep(c(-1.5, 1.2, -0.8, 2.1), length.out = length(mock_gene_sets)),
      NES = rep(c(-2.3, 1.8, -1.2, 3.1), length.out = length(mock_gene_sets)),
      pvalue = rep(c(0.01, 0.05, 0.2, 0.001), length.out = length(mock_gene_sets)),
      p.adjust = rep(c(0.05, 0.1, 0.3, 0.01), length.out = length(mock_gene_sets)),
      leading_edge = rep("gene1;gene2;gene3", length(mock_gene_sets)),
      stringsAsFactors = FALSE
    )
    
    mockery::stub(pathway_gsea, "run_fgsea", function(...) mock_result)
    
    results[[pathway_type]] <- pathway_gsea(
      abundance = abundance_data,
      metadata = test_data$metadata,
      group = "Environment",
      pathway_type = pathway_type,
      method = "fgsea",
      seed = 123
    )
  }
  
  # Compare column names and data types across all pathway types
  for (i in 1:(length(results)-1)) {
    for (j in (i+1):length(results)) {
      type1 <- names(results)[i]
      type2 <- names(results)[j]
      
      expect_equal(names(results[[i]]), names(results[[j]]),
                   info = paste("Column names should match between", type1, "and", type2))
      
      expect_equal(sapply(results[[i]], class), sapply(results[[j]], class),
                   info = paste("Column types should match between", type1, "and", type2))
    }
  }
})

# Statistical Consistency Tests
test_that("statistical calculations are consistent across pathway types", {
  skip_if_not_installed("fgsea")
  
  test_data <- create_cross_pathway_test_data()
  
  # Use same ranking method and compare statistical properties
  ranking_methods <- c("signal2noise", "t_test", "log2_ratio", "diff_abundance")
  pathway_types <- c("KEGG", "MetaCyc", "GO")
  
  for (rank_method in ranking_methods) {
    rank_results <- list()
    
    for (pathway_type in pathway_types) {
      if (pathway_type == "KEGG" || pathway_type == "GO") {
        abundance_data <- test_data$ko_abundance
      } else {
        abundance_data <- test_data$ec_abundance
      }
      
      # Calculate ranking metric directly
      rank_metric <- calculate_rank_metric(
        abundance = abundance_data,
        metadata = test_data$metadata,
        group = "Environment",
        method = rank_method
      )
      
      rank_results[[pathway_type]] <- rank_metric
    }
    
    # Compare statistical properties of ranking metrics
    rank_stats <- lapply(rank_results, function(x) {
      list(
        mean = mean(x, na.rm = TRUE),
        sd = sd(x, na.rm = TRUE),
        min = min(x, na.rm = TRUE),
        max = max(x, na.rm = TRUE),
        n_finite = sum(is.finite(x))
      )
    })
    
    # All ranking methods should produce valid statistics
    for (pathway_type in pathway_types) {
      expect_true(is.finite(rank_stats[[pathway_type]]$mean),
                   info = paste(rank_method, "should produce finite mean for", pathway_type))
      expect_true(rank_stats[[pathway_type]]$sd > 0,
                   info = paste(rank_method, "should produce positive SD for", pathway_type))
      expect_true(rank_stats[[pathway_type]]$n_finite > 0,
                   info = paste(rank_method, "should produce finite values for", pathway_type))
    }
  }
})

test_that("p-value distributions are appropriate for all pathway types", {
  skip_if_not_installed("fgsea")
  
  test_data <- create_cross_pathway_test_data()
  pathway_types <- c("KEGG", "MetaCyc", "GO")
  
  pvalue_distributions <- list()
  
  for (pathway_type in pathway_types) {
    if (pathway_type == "KEGG" || pathway_type == "GO") {
      abundance_data <- test_data$ko_abundance
      gene_names <- rownames(abundance_data)
    } else {
      abundance_data <- test_data$ec_abundance
      gene_names <- rownames(abundance_data)
    }
    
    mock_gene_sets <- create_mock_gene_sets(pathway_type, gene_names)
    mockery::stub(pathway_gsea, "prepare_gene_sets", function(...) mock_gene_sets)
    
    # Create realistic p-value distribution
    n_pathways <- length(mock_gene_sets)
    # Mix of significant and non-significant p-values
    pvalues <- c(
      runif(n_pathways * 0.2, 0.001, 0.05),  # 20% significant
      runif(n_pathways * 0.8, 0.05, 1.0)     # 80% non-significant
    )[1:n_pathways]
    
    mock_result <- data.frame(
      pathway_id = names(mock_gene_sets),
      pathway_name = names(mock_gene_sets),
      size = sapply(mock_gene_sets, length),
      ES = runif(n_pathways, -2, 2),
      NES = runif(n_pathways, -3, 3),
      pvalue = pvalues,
      p.adjust = p.adjust(pvalues, method = "BH"),
      leading_edge = sapply(mock_gene_sets, function(x) paste(sample(x, min(3, length(x))), collapse = ";")),
      stringsAsFactors = FALSE
    )
    
    mockery::stub(pathway_gsea, "run_fgsea", function(...) mock_result)
    
    result <- pathway_gsea(
      abundance = abundance_data,
      metadata = test_data$metadata,
      group = "Environment",
      pathway_type = pathway_type,
      method = "fgsea",
      seed = 123
    )
    
    pvalue_distributions[[pathway_type]] <- result$pvalue
  }
  
  # Test p-value distribution properties
  for (pathway_type in pathway_types) {
    pvals <- pvalue_distributions[[pathway_type]]
    
    # P-values should be between 0 and 1
    expect_true(all(pvals >= 0 & pvals <= 1),
                info = paste("P-values should be valid for", pathway_type))
    
    # Should have reasonable distribution (not all 0 or 1)
    expect_true(mean(pvals) > 0.01 & mean(pvals) < 0.99,
                info = paste("P-value distribution should be reasonable for", pathway_type))
    
    # Should have some variation
    expect_true(sd(pvals) > 0.01,
                info = paste("P-values should show variation for", pathway_type))
  }
})

# Integration Workflow Tests
test_that("switching between pathway types in same session works", {
  skip_if_not_installed("fgsea")
  
  test_data <- create_cross_pathway_test_data()
  
  # Simulate a workflow where user switches between pathway types
  workflow_results <- list()
  
  # KEGG analysis
  kegg_gene_sets <- create_mock_gene_sets("KEGG", rownames(test_data$ko_abundance))
  mockery::stub(pathway_gsea, "prepare_gene_sets", function(pathway_type, ...) {
    if (pathway_type == "KEGG") return(kegg_gene_sets)
    else return(list())
  })
  
  kegg_mock <- data.frame(
    pathway_id = names(kegg_gene_sets)[1:10],
    pathway_name = names(kegg_gene_sets)[1:10],
    size = rep(15, 10),
    ES = runif(10, -2, 2),
    NES = runif(10, -3, 3),
    pvalue = runif(10, 0.01, 0.3),
    p.adjust = runif(10, 0.05, 0.4),
    leading_edge = rep("K00001;K00002", 10),
    stringsAsFactors = FALSE
  )
  
  mockery::stub(pathway_gsea, "run_fgsea", function(...) kegg_mock)
  
  workflow_results$kegg <- pathway_gsea(
    abundance = test_data$ko_abundance,
    metadata = test_data$metadata,
    group = "Environment",
    pathway_type = "KEGG",
    method = "fgsea",
    seed = 123
  )
  
  # Switch to MetaCyc analysis
  metacyc_gene_sets <- create_mock_gene_sets("MetaCyc", rownames(test_data$ec_abundance))
  mockery::stub(pathway_gsea, "prepare_gene_sets", function(pathway_type, ...) {
    if (pathway_type == "MetaCyc") return(metacyc_gene_sets)
    else return(list())
  })
  
  metacyc_mock <- data.frame(
    pathway_id = names(metacyc_gene_sets)[1:10],
    pathway_name = names(metacyc_gene_sets)[1:10],
    size = rep(12, 10),
    ES = runif(10, -1.5, 1.5),
    NES = runif(10, -2.5, 2.5),
    pvalue = runif(10, 0.02, 0.4),
    p.adjust = runif(10, 0.06, 0.5),
    leading_edge = rep("EC:1.1.1.1;EC:2.2.2.2", 10),
    stringsAsFactors = FALSE
  )
  
  mockery::stub(pathway_gsea, "run_fgsea", function(...) metacyc_mock)
  
  workflow_results$metacyc <- pathway_gsea(
    abundance = test_data$ec_abundance,
    metadata = test_data$metadata,
    group = "Environment",
    pathway_type = "MetaCyc",
    method = "fgsea",
    seed = 123
  )
  
  # Switch to GO analysis  
  go_gene_sets <- create_mock_gene_sets("GO", rownames(test_data$ko_abundance))
  mockery::stub(pathway_gsea, "prepare_gene_sets", function(pathway_type, ...) {
    if (pathway_type == "GO") return(go_gene_sets)
    else return(list())
  })
  
  go_mock <- data.frame(
    pathway_id = names(go_gene_sets)[1:10],
    pathway_name = names(go_gene_sets)[1:10],
    size = rep(18, 10),
    ES = runif(10, -1.8, 1.8),
    NES = runif(10, -2.8, 2.8),
    pvalue = runif(10, 0.005, 0.25),
    p.adjust = runif(10, 0.02, 0.35),
    leading_edge = rep("K00100;K00200", 10),
    stringsAsFactors = FALSE
  )
  
  mockery::stub(pathway_gsea, "run_fgsea", function(...) go_mock)
  
  workflow_results$go <- pathway_gsea(
    abundance = test_data$ko_abundance,
    metadata = test_data$metadata,
    group = "Environment",
    pathway_type = "GO",
    method = "fgsea",
    seed = 123
  )
  
  # Verify all analyses completed successfully
  for (pathway_type in names(workflow_results)) {
    expect_s3_class(workflow_results[[pathway_type]], "data.frame",
                    info = paste("Workflow result should be data frame for", pathway_type))
    expect_true(nrow(workflow_results[[pathway_type]]) == 10,
                info = paste("Workflow should return expected number of pathways for", pathway_type))
  }
  
  # Verify no contamination between analyses
  expect_true(all(grepl("^ko", workflow_results$kegg$pathway_id)),
              info = "KEGG results should contain KEGG pathway IDs")
  expect_true(all(grepl("^PWY-", workflow_results$metacyc$pathway_id)),
              info = "MetaCyc results should contain MetaCyc pathway IDs")
  expect_true(all(grepl("^GO:", workflow_results$go$pathway_id)),
              info = "GO results should contain GO pathway IDs")
})

test_that("data format compatibility across pathway transitions", {
  test_data <- create_cross_pathway_test_data()
  
  # Test that user can easily switch between analyses
  pathway_types <- c("KEGG", "MetaCyc", "GO")
  results <- list()
  
  for (pathway_type in pathway_types) {
    if (pathway_type == "KEGG" || pathway_type == "GO") {
      abundance_data <- test_data$ko_abundance
      gene_names <- rownames(abundance_data)
    } else {
      abundance_data <- test_data$ec_abundance  
      gene_names <- rownames(abundance_data)
    }
    
    mock_gene_sets <- create_mock_gene_sets(pathway_type, gene_names)
    mockery::stub(pathway_gsea, "prepare_gene_sets", function(...) mock_gene_sets)
    
    mock_result <- data.frame(
      pathway_id = names(mock_gene_sets)[1:8],
      pathway_name = names(mock_gene_sets)[1:8],
      size = rep(12, 8),
      ES = runif(8, -1, 1),
      NES = runif(8, -2, 2),
      pvalue = runif(8, 0.01, 0.3),
      p.adjust = runif(8, 0.05, 0.4),
      leading_edge = rep("gene1;gene2", 8),
      stringsAsFactors = FALSE
    )
    
    mockery::stub(pathway_gsea, "run_fgsea", function(...) mock_result)
    
    results[[pathway_type]] <- pathway_gsea(
      abundance = abundance_data,
      metadata = test_data$metadata,
      group = "Environment",
      pathway_type = pathway_type,
      method = "fgsea",
      seed = 123
    )
  }
  
  # Test that results can be combined for comparison
  combined_results <- do.call(rbind, lapply(names(results), function(type) {
    df <- results[[type]]
    df$pathway_type <- type
    return(df)
  }))
  
  expect_s3_class(combined_results, "data.frame")
  expect_equal(nrow(combined_results), 24)  # 8 pathways × 3 types
  expect_true("pathway_type" %in% colnames(combined_results))
  
  # Test that each pathway type is represented
  type_counts <- table(combined_results$pathway_type)
  expect_equal(as.numeric(type_counts), rep(8, 3))
  expect_equal(names(type_counts), c("GO", "KEGG", "MetaCyc"))
})

# Visualization Consistency Tests  
test_that("visualize_gsea works consistently across pathway types", {
  skip_if_not_installed("ggplot2")
  
  test_data <- create_cross_pathway_test_data()
  pathway_types <- c("KEGG", "MetaCyc", "GO")
  
  for (pathway_type in pathway_types) {
    if (pathway_type == "KEGG" || pathway_type == "GO") {
      gene_names <- rownames(test_data$ko_abundance)
    } else {
      gene_names <- rownames(test_data$ec_abundance)
    }
    
    # Create mock GSEA results
    mock_gene_sets <- create_mock_gene_sets(pathway_type, gene_names)
    gsea_results <- data.frame(
      pathway_id = names(mock_gene_sets)[1:5],
      pathway_name = names(mock_gene_sets)[1:5],
      size = rep(15, 5),
      ES = c(-1.5, 1.2, -0.8, 2.1, -1.1),
      NES = c(-2.3, 1.8, -1.2, 3.1, -1.7),
      pvalue = c(0.01, 0.05, 0.2, 0.001, 0.08),
      p.adjust = c(0.05, 0.1, 0.3, 0.01, 0.15),
      leading_edge = rep("gene1;gene2;gene3", 5),
      method = "fgsea",
      stringsAsFactors = FALSE
    )
    
    # Test different plot types
    plot_types <- c("enrichment_plot", "dotplot", "barplot")
    
    for (plot_type in plot_types) {
      expect_no_error({
        plot <- visualize_gsea(
          gsea_results = gsea_results,
          plot_type = plot_type,
          n_pathways = 5
        )
      }, info = paste("visualize_gsea should work with", pathway_type, plot_type))
      
      expect_s3_class(plot, "ggplot",
                       info = paste("Should return ggplot object for", pathway_type, plot_type))
    }
  }
})

test_that("pathway annotation system consistency", {
  test_data <- create_cross_pathway_test_data()
  pathway_types <- c("KEGG", "MetaCyc", "GO")
  
  for (pathway_type in pathway_types) {
    # Create mock GSEA results
    if (pathway_type == "KEGG") {
      pathway_ids <- paste0("ko", sprintf("%05d", 10:15))
    } else if (pathway_type == "MetaCyc") {
      pathway_ids <- paste0("PWY-", 1000:1005)
    } else {
      pathway_ids <- paste0("GO:", sprintf("%07d", 1:6))
    }
    
    gsea_results <- data.frame(
      pathway_id = pathway_ids,
      pathway_name = pathway_ids,  # Will be updated by annotation
      size = rep(15, 6),
      ES = runif(6, -2, 2),
      NES = runif(6, -3, 3),
      pvalue = runif(6, 0.01, 0.3),
      p.adjust = runif(6, 0.05, 0.4),
      leading_edge = rep("gene1;gene2", 6),
      method = "fgsea",
      stringsAsFactors = FALSE
    )
    
    # Mock annotation references
    if (pathway_type == "KEGG") {
      mock_reference <- data.frame(
        pathway = pathway_ids,
        pathway_name = paste("KEGG pathway", 1:6),
        class = rep("Metabolism", 6),
        stringsAsFactors = FALSE
      )
      mockery::stub(gsea_pathway_annotation, "load", function(file) {
        kegg_reference <<- mock_reference
      })
    } else if (pathway_type == "MetaCyc") {
      mock_reference <- data.frame(
        V1 = pathway_ids,
        V2 = paste("MetaCyc pathway", 1:6),
        stringsAsFactors = FALSE
      )
      mockery::stub(gsea_pathway_annotation, "load", function(file) {
        MetaCyc_reference <<- mock_reference
      })
    } else {
      mock_reference <- data.frame(
        go_id = pathway_ids,
        go_name = paste("GO term", 1:6),
        category = rep("BP", 6),
        ko_members = rep("K00001;K00002", 6),
        stringsAsFactors = FALSE
      )
      mockery::stub(gsea_pathway_annotation, "ko_to_go_reference", mock_reference)
    }
    
    # Test annotation function
    expect_no_error({
      annotated_results <- gsea_pathway_annotation(
        gsea_results = gsea_results,
        pathway_type = pathway_type
      )
    }, info = paste("Annotation should work for", pathway_type))
    
    # Check that pathway names were added
    expect_true("pathway_name" %in% colnames(annotated_results),
                info = paste("Annotated results should have pathway_name column for", pathway_type))
    
    # Check that some annotations were applied
    expect_true(any(annotated_results$pathway_name != annotated_results$pathway_id),
                info = paste("Some pathway names should be different from IDs for", pathway_type))
  }
})

# Performance Consistency Tests
test_that("performance is comparable across pathway types", {
  skip_if_not_installed("fgsea")
  
  test_data <- create_cross_pathway_test_data(n_samples = 40)
  pathway_types <- c("KEGG", "MetaCyc", "GO")
  execution_times <- list()
  
  for (pathway_type in pathway_types) {
    if (pathway_type == "KEGG" || pathway_type == "GO") {
      abundance_data <- test_data$ko_abundance
      gene_names <- rownames(abundance_data)
    } else {
      abundance_data <- test_data$ec_abundance
      gene_names <- rownames(abundance_data)
    }
    
    # Create larger gene sets for performance testing
    mock_gene_sets <- create_mock_gene_sets(pathway_type, gene_names)
    # Expand to more pathways
    for (i in 16:50) {
      if (pathway_type == "KEGG") {
        pathway_id <- paste0("ko", sprintf("%05d", i))
      } else if (pathway_type == "MetaCyc") {
        pathway_id <- paste0("PWY-", 1000 + i)
      } else {
        pathway_id <- paste0("GO:", sprintf("%07d", i))
      }
      pathway_size <- sample(8:25, 1)
      genes <- sample(gene_names, min(pathway_size, length(gene_names)))
      mock_gene_sets[[pathway_id]] <- genes
    }
    
    mockery::stub(pathway_gsea, "prepare_gene_sets", function(...) mock_gene_sets)
    
    # Create comprehensive mock result
    mock_result <- data.frame(
      pathway_id = names(mock_gene_sets),
      pathway_name = names(mock_gene_sets),
      size = sapply(mock_gene_sets, length),
      ES = runif(length(mock_gene_sets), -2, 2),
      NES = runif(length(mock_gene_sets), -3, 3),
      pvalue = runif(length(mock_gene_sets), 0.001, 0.5),
      p.adjust = runif(length(mock_gene_sets), 0.005, 0.8),
      leading_edge = sapply(mock_gene_sets, function(x) paste(sample(x, min(5, length(x))), collapse = ";")),
      stringsAsFactors = FALSE
    )
    
    mockery::stub(pathway_gsea, "run_fgsea", function(...) mock_result)
    
    # Measure execution time
    start_time <- Sys.time()
    
    result <- pathway_gsea(
      abundance = abundance_data,
      metadata = test_data$metadata,
      group = "Environment",
      pathway_type = pathway_type,
      method = "fgsea",
      nperm = 100,
      seed = 123
    )
    
    end_time <- Sys.time()
    execution_times[[pathway_type]] <- as.numeric(end_time - start_time, units = "secs")
    
    # Verify result completeness
    expect_s3_class(result, "data.frame")
    expect_true(nrow(result) > 30)
    expect_true(all(!is.na(result$pvalue)))
  }
  
  # Compare execution times - should be reasonably similar
  times <- unlist(execution_times)
  expect_true(all(times < 5), "All pathway types should complete within 5 seconds")
  
  # No pathway type should be more than 3x slower than the fastest
  time_ratio <- max(times) / min(times)
  expect_true(time_ratio < 3, 
              paste("Performance ratio should be reasonable. Actual ratio:", round(time_ratio, 2)))
})

# Comparative Analysis Tests
test_that("pathway overlap analysis between types works", {
  test_data <- create_cross_pathway_test_data()
  
  # Create overlapping gene sets between pathway types
  # Some KOs appear in both KEGG and GO pathways
  common_kos <- rownames(test_data$ko_abundance)[1:20]
  
  kegg_sets <- list(
    "ko00010" = common_kos[1:10],
    "ko00020" = common_kos[5:15],
    "ko00030" = common_kos[10:20]
  )
  
  go_sets <- list(
    "GO:0006096" = common_kos[1:8],   # Overlap with ko00010
    "GO:0006099" = common_kos[12:18], # Overlap with ko00030
    "GO:0008152" = common_kos[1:5]    # Overlap with ko00010
  )
  
  metacyc_sets <- list(
    "PWY-1001" = rownames(test_data$ec_abundance)[1:8],
    "PWY-1002" = rownames(test_data$ec_abundance)[5:12],
    "PWY-1003" = rownames(test_data$ec_abundance)[10:15]
  )
  
  # Mock results for each pathway type
  mock_kegg <- data.frame(
    pathway_id = names(kegg_sets),
    pathway_name = c("Glycolysis", "Citrate cycle", "Pentose phosphate"),
    size = c(10, 11, 11),
    ES = c(-1.5, 1.2, -0.8),
    NES = c(-2.3, 1.8, -1.2),
    pvalue = c(0.01, 0.05, 0.15),
    p.adjust = c(0.03, 0.1, 0.2),
    leading_edge = c("K00001;K00002", "K00010;K00011", "K00020"),
    method = "fgsea",
    stringsAsFactors = FALSE
  )
  
  mock_go <- data.frame(
    pathway_id = names(go_sets),
    pathway_name = c("Glycolytic process", "TCA cycle", "Metabolic process"),
    size = c(8, 7, 5),
    ES = c(-1.3, 1.0, -1.1),
    NES = c(-2.0, 1.5, -1.8),
    pvalue = c(0.02, 0.08, 0.12),
    p.adjust = c(0.06, 0.15, 0.18),
    leading_edge = c("K00001;K00003", "K00015;K00016", "K00002"),
    method = "fgsea",
    stringsAsFactors = FALSE
  )
  
  mock_metacyc <- data.frame(
    pathway_id = names(metacyc_sets),
    pathway_name = c("Glucose degradation", "Fatty acid oxidation", "Amino acid biosynthesis"),
    size = c(8, 8, 6),
    ES = c(-1.0, 0.9, 1.3),
    NES = c(-1.7, 1.4, 2.0),
    pvalue = c(0.04, 0.06, 0.03),
    p.adjust = c(0.08, 0.12, 0.06),
    leading_edge = c("EC:1.1.1.1;EC:2.2.2.2", "EC:3.3.3.3", "EC:4.4.4.4;EC:5.5.5.5"),
    method = "fgsea",
    stringsAsFactors = FALSE
  )
  
  # Test that we can identify complementary biological insights
  expect_true(nrow(mock_kegg) > 0, "KEGG analysis should find pathways")
  expect_true(nrow(mock_go) > 0, "GO analysis should find pathways")
  expect_true(nrow(mock_metacyc) > 0, "MetaCyc analysis should find pathways")
  
  # Test pathway name similarity analysis
  kegg_names <- tolower(mock_kegg$pathway_name)
  go_names <- tolower(mock_go$pathway_name)
  
  # Should find some conceptual overlap
  name_similarity <- outer(kegg_names, go_names, function(x, y) {
    sapply(seq_along(x), function(i) {
      sum(strsplit(x[i], " ")[[1]] %in% strsplit(y[i], " ")[[1]])
    })
  })
  
  expect_true(any(name_similarity > 0), 
              "Should find some conceptual overlap between KEGG and GO pathway names")
})

test_that("biological interpretation consistency check", {
  # Test that the three pathway types provide complementary insights
  # This is more of a sanity check for biological coherence
  
  # Mock results representing typical microbiome analysis outcomes
  kegg_results <- data.frame(
    pathway_id = c("ko00010", "ko00020", "ko00030", "ko00230", "ko00260"),
    pathway_name = c("Glycolysis", "Citrate cycle", "Pentose phosphate", "Purine metabolism", "Glycine metabolism"),
    NES = c(-2.1, 1.5, -1.2, 2.3, -1.8),
    pvalue = c(0.001, 0.02, 0.15, 0.005, 0.03),
    p.adjust = c(0.005, 0.04, 0.2, 0.015, 0.06),
    pathway_type = "KEGG",
    stringsAsFactors = FALSE
  )
  
  go_results <- data.frame(
    pathway_id = c("GO:0006096", "GO:0006099", "GO:0008152", "GO:0009058", "GO:0006520"),
    pathway_name = c("Glycolytic process", "TCA cycle", "Metabolic process", "Biosynthetic process", "Amino acid metabolism"),
    NES = c(-1.9, 1.3, -1.5, 2.0, -1.6),
    pvalue = c(0.002, 0.03, 0.08, 0.01, 0.04),
    p.adjust = c(0.01, 0.06, 0.12, 0.02, 0.08),
    pathway_type = "GO",
    stringsAsFactors = FALSE
  )
  
  metacyc_results <- data.frame(
    pathway_id = c("PWY-1001", "PWY-1002", "PWY-1003", "PWY-1004", "PWY-1005"),
    pathway_name = c("Glucose degradation", "TCA cycle", "Glycogen biosynthesis", "Fatty acid oxidation", "Leucine biosynthesis"),
    NES = c(-2.0, 1.4, 1.7, -1.3, 2.2),
    pvalue = c(0.001, 0.025, 0.015, 0.12, 0.008),
    p.adjust = c(0.005, 0.05, 0.03, 0.15, 0.02),
    pathway_type = "MetaCyc",
    stringsAsFactors = FALSE
  )
  
  # Test biological coherence
  # 1. Similar pathways should show similar enrichment directions
  glycolysis_kegg <- kegg_results[kegg_results$pathway_name == "Glycolysis", "NES"]
  glycolysis_go <- go_results[go_results$pathway_name == "Glycolytic process", "NES"] 
  glycolysis_metacyc <- metacyc_results[metacyc_results$pathway_name == "Glucose degradation", "NES"]
  
  # All should be negative (depleted) - consistent with each other
  expect_true(sign(glycolysis_kegg) == sign(glycolysis_go),
              "KEGG and GO glycolysis pathways should have same enrichment direction")
  expect_true(sign(glycolysis_kegg) == sign(glycolysis_metacyc),
              "KEGG and MetaCyc glucose degradation should have same enrichment direction")
  
  # 2. Test that we have both positive and negative enrichments (realistic biology)
  for (results in list(kegg_results, go_results, metacyc_results)) {
    expect_true(any(results$NES > 0) && any(results$NES < 0),
                "Should have both upregulated and downregulated pathways")
    
    expect_true(any(results$pvalue < 0.05),
                "Should have some significant pathways")
  }
  
  # 3. P-value distributions should be reasonable
  all_pvals <- c(kegg_results$pvalue, go_results$pvalue, metacyc_results$pvalue)
  expect_true(mean(all_pvals) > 0.01 && mean(all_pvals) < 0.8,
              "P-value distribution should be reasonable across all pathway types")
})

# Summary test for overall consistency
test_that("overall cross-pathway consistency summary", {
  # This test summarizes that all major consistency checks pass
  
  test_data <- create_cross_pathway_test_data()
  pathway_types <- c("KEGG", "MetaCyc", "GO")
  
  # Test that all pathway types can be analyzed successfully
  success_count <- 0
  
  for (pathway_type in pathway_types) {
    tryCatch({
      if (pathway_type == "KEGG" || pathway_type == "GO") {
        abundance_data <- test_data$ko_abundance
        gene_names <- rownames(abundance_data)
      } else {
        abundance_data <- test_data$ec_abundance
        gene_names <- rownames(abundance_data)
      }
      
      mock_gene_sets <- create_mock_gene_sets(pathway_type, gene_names)
      mockery::stub(pathway_gsea, "prepare_gene_sets", function(...) mock_gene_sets)
      
      mock_result <- data.frame(
        pathway_id = names(mock_gene_sets)[1:5],
        pathway_name = names(mock_gene_sets)[1:5],
        size = rep(15, 5),
        ES = runif(5, -2, 2),
        NES = runif(5, -3, 3),
        pvalue = runif(5, 0.01, 0.3),
        p.adjust = runif(5, 0.05, 0.4),
        leading_edge = rep("gene1;gene2", 5),
        stringsAsFactors = FALSE
      )
      
      mockery::stub(pathway_gsea, "run_fgsea", function(...) mock_result)
      
      result <- pathway_gsea(
        abundance = abundance_data,
        metadata = test_data$metadata,
        group = "Environment",
        pathway_type = pathway_type,
        method = "fgsea",
        seed = 123
      )
      
      # Basic validation
      if (is.data.frame(result) && nrow(result) == 5 && 
          all(c("pathway_id", "NES", "pvalue") %in% colnames(result))) {
        success_count <- success_count + 1
      }
      
    }, error = function(e) {
      # Test should not error
    })
  }
  
  expect_equal(success_count, 3)
  
  # Print summary message
  message("Cross-pathway consistency tests completed successfully!")
  message("✓ API consistency across KEGG, MetaCyc, and GO")
  message("✓ Statistical consistency in calculations")
  message("✓ Visualization compatibility")  
  message("✓ Annotation system uniformity")
  message("✓ Performance parity")
  message("✓ Integration workflow reliability")
})

Try the ggpicrust2 package in your browser

Any scripts or data that you put into this service are public.

ggpicrust2 documentation built on Aug. 26, 2025, 1:07 a.m.