# tests/testthat/test-gsea_pathway_annotation-comprehensive.R

# Comprehensive tests for gsea_pathway_annotation function
library(testthat)

# Build a synthetic GSEA result table for use in the tests below.
#
# Args:
#   n_pathways:   number of rows (pathways) to generate.
#   pathway_type: "KEGG" gives "koNNNNN" ids; "MetaCyc" gives ten fixed names
#                 followed by random "PWY-NNNN" ids; any other value gives
#                 "GO:NNNNNNN" ids.
#
# Returns:
#   A data.frame with columns pathway_id, pathway_name, size, ES, NES, pvalue,
#   p.adjust, leading_edge and method.
#
# The RNG is re-seeded with 123 on every call, so equal arguments always
# produce the same table (and the caller's RNG stream is reset as a side
# effect).
create_realistic_gsea_results <- function(n_pathways = 20, pathway_type = "KEGG") {
  set.seed(123)

  # sprintf() is used for every id format because, unlike paste0(), it returns
  # character(0) for zero-length input instead of a single stray string.
  if (pathway_type == "KEGG") {
    pathway_ids <- sprintf("ko%05d", sample(10:999, n_pathways))
  } else if (pathway_type == "MetaCyc") {
    named_pathways <- c(
      "GLYCOLYSIS", "TCA-CYCLE", "PENTOSE-P-PWY", "CALVIN-PWY",
      "FATTY-ACID-BIOSYNTHESIS", "AMINO-ACID-BIOSYN", "NUCLEOTIDE-BIOSYN",
      "METHANE-OXIDATION", "SULFATE-REDUCTION", "NITROGEN-FIXATION"
    )
    # Random PWY ids are only needed beyond the fixed names. Clamp at zero:
    # sample() rejects a negative size, which used to break every request
    # for fewer than ten MetaCyc pathways.
    n_random <- max(0, n_pathways - length(named_pathways))
    pathway_ids <- c(named_pathways, sprintf("PWY-%d", sample(1000:9999, n_random)))
    # seq_len() rather than 1:n so that n_pathways = 0 selects nothing.
    pathway_ids <- pathway_ids[seq_len(n_pathways)]
  } else {
    pathway_ids <- sprintf("GO:%07d", sample(1:9999999, n_pathways))
  }

  # One ";"-separated string of 3 to 15 random KO-style gene ids.
  make_leading_edge <- function(i) {
    n_genes <- sample(3:15, 1)
    paste(sprintf("K%05d", sample(1:10000, n_genes)), collapse = ";")
  }

  # The random columns are generated inline, in this order, so the values
  # drawn for a given seed stay the same as before.
  data.frame(
    pathway_id = pathway_ids,
    pathway_name = pathway_ids,  # Will be replaced by annotation
    size = sample(5:200, n_pathways, replace = TRUE),
    ES = runif(n_pathways, -1.0, 1.0),
    NES = rnorm(n_pathways, 0, 1.5),
    pvalue = runif(n_pathways, 0.001, 0.2),
    p.adjust = runif(n_pathways, 0.001, 0.3),
    # vapply() keeps this a character vector even when n_pathways is 0.
    leading_edge = vapply(seq_len(n_pathways), make_leading_edge, character(1)),
    method = rep("fgsea", n_pathways),
    stringsAsFactors = FALSE
  )
}

# Build a mock KEGG reference table for use in the tests below.
#
# Args:
#   pathways: character vector of pathway ids to include. When NULL, 30
#             random "koNNNNN" ids are generated. A zero-length vector
#             yields a zero-row table.
#
# Returns:
#   A data.frame with one row per pathway and columns pathway, pathway_name
#   ("KEGG <id> - <random label>"), pathway_class (random) and description.
create_mock_kegg_reference <- function(pathways = NULL) {
  if (is.null(pathways)) {
    pathways <- paste0("ko", sprintf("%05d", sample(10:999, 30)))
  }
  n_ref <- length(pathways)

  pathway_classes <- c("Metabolism", "Genetic Information Processing",
                       "Environmental Information Processing", "Cellular Processes",
                       "Human Diseases", "Drug Development")
  pathway_labels <- c("Glycolysis", "TCA cycle", "Fatty acid biosynthesis",
                      "Amino acid metabolism", "Nucleotide metabolism",
                      "Cell cycle", "DNA replication", "RNA transcription")

  # sprintf() instead of paste(): paste() turns zero-length input into a
  # single string, which made data.frame() fail with differing row counts
  # when an empty pathway vector was supplied.
  data.frame(
    pathway = pathways,
    pathway_name = sprintf("KEGG %s - %s", pathways,
                           sample(pathway_labels, n_ref, replace = TRUE)),
    pathway_class = sample(pathway_classes, n_ref, replace = TRUE),
    description = sprintf("Detailed description for pathway %s", pathways),
    stringsAsFactors = FALSE
  )
}

# Build a mock MetaCyc reference table for use in the tests below.
#
# Args:
#   pathways: character vector of pathway ids to include. When NULL, five
#             fixed names plus 20 random "PWY-NNNN" ids are used. A
#             zero-length vector yields a zero-row table.
#
# Returns:
#   A data.frame with columns X1 (pathway id) and X2
#   ("<id> - pathway description").
create_mock_metacyc_reference <- function(pathways = NULL) {
  if (is.null(pathways)) {
    pathways <- c("GLYCOLYSIS", "TCA-CYCLE", "PENTOSE-P-PWY", "CALVIN-PWY",
                  "FATTY-ACID-BIOSYNTHESIS", paste0("PWY-", sample(1000:9999, 20)))
  }

  # Columns are named X1/X2 to mirror the layout the original author observed
  # in the MetaCyc reference data.
  # sprintf() instead of paste(): it returns character(0) for empty input, so
  # an empty pathway vector gives a zero-row frame rather than an error.
  data.frame(
    X1 = pathways,
    X2 = sprintf("%s - pathway description", pathways),
    stringsAsFactors = FALSE
  )
}

test_that("gsea_pathway_annotation handles KEGG pathways with full annotation", {
  # Scenario: every pathway id in the GSEA table has an entry in the mocked
  # KEGG reference, so each row should receive a name, class and description.
  kegg_ids <- sprintf("ko%05d", c(10, 20, 30, 40, 50))
  gsea_input <- create_realistic_gsea_results(5, "KEGG")
  gsea_input$pathway_id <- kegg_ids
  reference_tbl <- create_mock_kegg_reference(kegg_ids)

  # Stand-ins for file.exists() and load(): report the reference file as
  # present and place the mock table in the frame that called load().
  reference_on_disk <- function(...) TRUE
  inject_reference <- function(file) {
    caller_env <- parent.frame()
    assign("kegg_reference", reference_tbl, envir = caller_env)
  }

  # NOTE(review): with_mock() is deprecated in recent testthat releases;
  # confirm these base-function mocks still take effect.
  with_mock(
    `file.exists` = reference_on_disk,
    `load` = inject_reference,
    {
      annotated <- gsea_pathway_annotation(gsea_input, pathway_type = "KEGG")
      result_cols <- colnames(annotated)

      # Shape: still a data frame with one row per input pathway.
      expect_s3_class(annotated, "data.frame")
      expect_equal(nrow(annotated), nrow(gsea_input))

      # No input column may be dropped.
      expect_true(all(colnames(gsea_input) %in% result_cols))

      # The annotation columns must be present.
      for (annotation_col in c("pathway_name", "pathway_class", "description")) {
        expect_true(annotation_col %in% result_cols)
      }

      # Every row was annotated from the reference (mock names contain "KEGG").
      expect_true(all(grepl("KEGG", annotated$pathway_name)))
      expect_false(anyNA(annotated$pathway_class))
    }
  )
})

test_that("gsea_pathway_annotation handles KEGG pathways with missing reference file", {
  gsea_input <- create_realistic_gsea_results(5, "KEGG")

  # Stand-in for file.exists() that reports every path as absent.
  reference_missing <- function(...) FALSE

  # NOTE(review): with_mock() is deprecated in recent testthat releases;
  # confirm this base-function mock still takes effect.
  with_mock(
    `file.exists` = reference_missing,
    {
      # With no reference file the annotation must fail with this message.
      expect_error(
        gsea_pathway_annotation(gsea_input, pathway_type = "KEGG"),
        regexp = "kegg_reference data file not found"
      )
    }
  )
})

test_that("gsea_pathway_annotation handles MetaCyc pathways correctly", {
  # Three fixed MetaCyc ids, all present in the mocked reference table.
  metacyc_ids <- c("GLYCOLYSIS", "TCA-CYCLE", "PENTOSE-P-PWY")
  gsea_input <- create_realistic_gsea_results(3, "MetaCyc")
  gsea_input$pathway_id <- metacyc_ids
  reference_tbl <- create_mock_metacyc_reference(metacyc_ids)

  # Stand-in for data(): when asked for "metacyc_reference", put the mock
  # table into the environment the caller supplied.
  # Original author's note: the actual data uses "metacyc_reference" but the
  # file has "MetaCyc_reference".
  fake_data <- function(x, package, envir) {
    if (x == "metacyc_reference") {
      assign("metacyc_reference", reference_tbl, envir = envir)
    }
  }

  # NOTE(review): with_mock() is deprecated in recent testthat releases;
  # confirm this mock of data() still takes effect.
  with_mock(
    `data` = fake_data,
    {
      annotated <- gsea_pathway_annotation(gsea_input, pathway_type = "MetaCyc")

      # Shape: data frame, one row per input pathway, with a name column.
      expect_s3_class(annotated, "data.frame")
      expect_equal(nrow(annotated), nrow(gsea_input))
      expect_true("pathway_name" %in% colnames(annotated))

      # Names must come from the mock reference, whose descriptions all
      # contain "pathway description".
      expect_true(all(grepl("pathway description", annotated$pathway_name)))
    }
  )
})

test_that("gsea_pathway_annotation handles partial matches in reference data", {
  # Five pathways in the GSEA table, but only the first three are known to
  # the mocked reference.
  kegg_ids <- sprintf("ko%05d", c(10, 20, 30, 40, 50))
  known_ids <- kegg_ids[1:3]
  unknown_ids <- kegg_ids[4:5]

  gsea_input <- create_realistic_gsea_results(5, "KEGG")
  gsea_input$pathway_id <- kegg_ids
  reference_tbl <- create_mock_kegg_reference(known_ids)

  # Stand-ins for file.exists() and load().
  reference_on_disk <- function(...) TRUE
  inject_reference <- function(file) {
    caller_env <- parent.frame()
    assign("kegg_reference", reference_tbl, envir = caller_env)
  }

  # NOTE(review): with_mock() is deprecated in recent testthat releases;
  # confirm these base-function mocks still take effect.
  with_mock(
    `file.exists` = reference_on_disk,
    `load` = inject_reference,
    {
      annotated <- gsea_pathway_annotation(gsea_input, pathway_type = "KEGG")

      # Unmatched pathways must not be dropped.
      expect_equal(nrow(annotated), 5)

      # Pathways absent from the reference fall back to their id as name.
      for (id in unknown_ids) {
        fallback_name <- annotated$pathway_name[annotated$pathway_id == id]
        expect_equal(fallback_name, id)
      }

      # Pathways present in the reference get the mock name ("KEGG ...").
      for (id in known_ids) {
        annotated_name <- annotated$pathway_name[annotated$pathway_id == id]
        expect_true(grepl("KEGG", annotated_name))
      }
    }
  )
})

test_that("gsea_pathway_annotation handles empty GSEA results", {
  # Zero-row GSEA table that still carries every expected column and type.
  no_text <- character()
  no_number <- numeric()
  empty_gsea <- data.frame(
    pathway_id = no_text,
    pathway_name = no_text,
    size = integer(),
    ES = no_number,
    NES = no_number,
    pvalue = no_number,
    p.adjust = no_number,
    leading_edge = no_text,
    method = no_text,
    stringsAsFactors = FALSE
  )

  # The reference is requested for zero pathways as well.
  reference_tbl <- create_mock_kegg_reference(character(0))

  # Stand-ins for file.exists() and load().
  reference_on_disk <- function(...) TRUE
  inject_reference <- function(file) {
    caller_env <- parent.frame()
    assign("kegg_reference", reference_tbl, envir = caller_env)
  }

  # NOTE(review): with_mock() is deprecated in recent testthat releases;
  # confirm these base-function mocks still take effect.
  with_mock(
    `file.exists` = reference_on_disk,
    `load` = inject_reference,
    {
      annotated <- gsea_pathway_annotation(empty_gsea, pathway_type = "KEGG")

      # Empty in, empty out, with the name column still present.
      expect_s3_class(annotated, "data.frame")
      expect_equal(nrow(annotated), 0)
      expect_true("pathway_name" %in% colnames(annotated))
    }
  )
})

test_that("gsea_pathway_annotation handles GO pathways (not implemented)", {
  go_input <- create_realistic_gsea_results(5, "GO")

  # GO annotation is expected to warn rather than fail; the result is kept
  # so it can be compared with the input afterwards.
  expect_warning(
    go_output <- gsea_pathway_annotation(go_input, pathway_type = "GO"),
    regexp = "GO pathway annotation not yet implemented"
  )

  # The input table must come back unchanged.
  expect_equal(go_output, go_input)
})

test_that("gsea_pathway_annotation validates input parameters", {
  gsea_results <- create_realistic_gsea_results(5)

  # gsea_results that is not a data frame is rejected.
  for (not_a_table in list(list(), "not_a_dataframe")) {
    expect_error(
      gsea_pathway_annotation(gsea_results = not_a_table),
      "'gsea_results' must be a data frame"
    )
  }

  # pathway_type must be a single one of the supported values.
  for (bad_type in list("INVALID", c("KEGG", "MetaCyc"))) {
    expect_error(
      gsea_pathway_annotation(gsea_results, pathway_type = bad_type),
      "pathway_type must be one of 'KEGG', 'MetaCyc', or 'GO'"
    )
  }

  # A table without the pathway_id column is rejected.
  kept_cols <- setdiff(names(gsea_results), "pathway_id")
  incomplete_gsea <- gsea_results[, kept_cols]
  expect_error(
    gsea_pathway_annotation(incomplete_gsea),
    "GSEA results missing required column: pathway_id"
  )
})

test_that("gsea_pathway_annotation handles various data types and edge cases", {
  base_gsea <- create_realistic_gsea_results(5)

  # Variant 1: pathway_id stored as a factor instead of character.
  factor_gsea <- base_gsea
  factor_gsea$pathway_id <- as.factor(factor_gsea$pathway_id)

  # The reference covers every id of the base table (as plain strings).
  reference_tbl <- create_mock_kegg_reference(as.character(factor_gsea$pathway_id))

  # Variant 2: first pathway_id is missing.
  na_gsea <- base_gsea
  na_gsea$pathway_id[1] <- NA

  # Stand-ins for file.exists() and load(), shared by both scenarios.
  reference_on_disk <- function(...) TRUE
  inject_reference <- function(file) {
    caller_env <- parent.frame()
    assign("kegg_reference", reference_tbl, envir = caller_env)
  }

  # NOTE(review): with_mock() is deprecated in recent testthat releases;
  # confirm these base-function mocks still take effect.

  # Factor ids: annotation still returns one row per input row.
  with_mock(
    `file.exists` = reference_on_disk,
    `load` = inject_reference,
    {
      annotated <- gsea_pathway_annotation(factor_gsea, pathway_type = "KEGG")
      expect_s3_class(annotated, "data.frame")
      expect_equal(nrow(annotated), 5)
    }
  )

  # NA id: the row with the missing id must not be dropped.
  with_mock(
    `file.exists` = reference_on_disk,
    `load` = inject_reference,
    {
      annotated <- gsea_pathway_annotation(na_gsea, pathway_type = "KEGG")
      expect_s3_class(annotated, "data.frame")
      expect_equal(nrow(annotated), 5)
    }
  )
})

test_that("gsea_pathway_annotation preserves original column order and types", {
  gsea_results <- create_realistic_gsea_results(3)
  original_names <- colnames(gsea_results)
  # lapply() rather than sapply(): the result is always a named list holding
  # each column's full class vector, whereas sapply() changes shape with the
  # data (character vector, list or matrix) and can break the `[[col]]`
  # lookup below.
  original_classes <- lapply(gsea_results, class)

  mock_kegg_ref <- create_mock_kegg_reference(gsea_results$pathway_id)

  # NOTE(review): with_mock() is deprecated in recent testthat releases;
  # confirm these base-function mocks still take effect.
  with_mock(
    `file.exists` = function(...) TRUE,
    `load` = function(file) {
      # Place the mock reference in the frame that called load().
      parent_env <- parent.frame()
      assign("kegg_reference", mock_kegg_ref, envir = parent_env)
    },
    {
      annotated <- gsea_pathway_annotation(gsea_results, pathway_type = "KEGG")

      # Check that all original columns are preserved
      # NOTE(review): only presence is checked here; column order, which the
      # test title mentions, is not asserted.
      expect_true(all(original_names %in% colnames(annotated)))

      # Check that original column types are preserved (for existing columns)
      for (col in original_names) {
        if (col != "pathway_name") {  # pathway_name gets replaced
          expect_equal(class(annotated[[col]]), original_classes[[col]])
        }
      }
    }
  )
})

test_that("gsea_pathway_annotation handles large datasets efficiently", {
  # 100 pathways, each with a matching entry in the mocked reference.
  big_gsea <- create_realistic_gsea_results(100, "KEGG")
  reference_tbl <- create_mock_kegg_reference(big_gsea$pathway_id)

  # Stand-ins for file.exists() and load().
  reference_on_disk <- function(...) TRUE
  inject_reference <- function(file) {
    caller_env <- parent.frame()
    assign("kegg_reference", reference_tbl, envir = caller_env)
  }

  # NOTE(review): with_mock() is deprecated in recent testthat releases;
  # confirm these base-function mocks still take effect.
  with_mock(
    `file.exists` = reference_on_disk,
    `load` = inject_reference,
    {
      started_at <- Sys.time()
      annotated <- gsea_pathway_annotation(big_gsea, pathway_type = "KEGG")
      elapsed_secs <- as.numeric(difftime(Sys.time(), started_at, units = "secs"))

      # Wall-clock budget: the annotation must finish in under 5 seconds.
      expect_lt(elapsed_secs, 5)

      # The result keeps every row and carries the annotation columns.
      expect_s3_class(annotated, "data.frame")
      expect_equal(nrow(annotated), 100)
      expect_true(all(c("pathway_name", "pathway_class") %in% colnames(annotated)))
    }
  )
})

test_that("gsea_pathway_annotation handles reference data conversion correctly", {
  gsea_input <- create_realistic_gsea_results(3, "KEGG")

  # Supply the reference as a character matrix instead of a data frame; the
  # function under test is expected to cope with that representation.
  reference_df <- create_mock_kegg_reference(gsea_input$pathway_id)
  reference_matrix <- as.matrix(reference_df)

  # Stand-ins for file.exists() and load().
  reference_on_disk <- function(...) TRUE
  inject_reference <- function(file) {
    caller_env <- parent.frame()
    assign("kegg_reference", reference_matrix, envir = caller_env)
  }

  # NOTE(review): with_mock() is deprecated in recent testthat releases;
  # confirm these base-function mocks still take effect.
  with_mock(
    `file.exists` = reference_on_disk,
    `load` = inject_reference,
    {
      annotated <- gsea_pathway_annotation(gsea_input, pathway_type = "KEGG")

      # Result is still a data frame with one row per input pathway.
      expect_s3_class(annotated, "data.frame")
      expect_equal(nrow(annotated), 3)
    }
  )
})

# Try the ggpicrust2 package in your browser
#
# Any scripts or data that you put into this service are public.
#
# ggpicrust2 documentation built on Aug. 26, 2025, 1:07 a.m.