# tests/testthat/test-metacyc_reference_data_loading.R

# MetaCyc Reference Data Loading System Tests
# Testing data loading mechanisms, backup functions, and error recovery
# Following Linus principles: robust data handling, clear error messages, reliable fallbacks

test_that("MetaCyc reference data file existence and accessibility", {
  # Test that the reference data file exists and is accessible
  metacyc_ref_path <- system.file("extdata", "metacyc_to_ec_reference.RData", package = "ggpicrust2")
  
  # Basic file system checks
  expect_true(file.exists(metacyc_ref_path), 
              info = "MetaCyc reference data file should exist in inst/extdata/")
  expect_gt(file.size(metacyc_ref_path), 0, 
            info = "MetaCyc reference data file should not be empty")
  
  # Test file is readable
  expect_no_error({
    load(metacyc_ref_path, envir = environment())
  }, info = "MetaCyc reference data file should be loadable without errors")
  
  # Verify loaded object structure
  expect_true(exists("metacyc_to_ec_reference", envir = environment()))
  expect_s3_class(metacyc_to_ec_reference, "data.frame")
  expect_gt(nrow(metacyc_to_ec_reference), 0)
  expect_equal(ncol(metacyc_to_ec_reference), 2)
  expect_equal(colnames(metacyc_to_ec_reference), c("pathway", "ec_numbers"))
})

test_that("MetaCyc reference data loading in prepare_gene_sets function", {
  # Test the actual loading mechanism used in prepare_gene_sets
  skip_if_not_installed("fgsea")
  
  # Clear environment to test fresh loading
  if (exists("metacyc_to_ec_reference")) {
    rm(metacyc_to_ec_reference)
  }
  
  # Test loading through prepare_gene_sets
  expect_no_error({
    gene_sets <- prepare_gene_sets(pathway_type = "MetaCyc")
  }, info = "prepare_gene_sets should load MetaCyc reference data without errors")
  
  # Validate that gene sets were created successfully
  expect_type(gene_sets, "list")
  expect_gt(length(gene_sets), 0)
  
  # Test that the loading creates proper gene set structure
  for (pathway_id in names(gene_sets)[1:min(5, length(gene_sets))]) {
    expect_type(gene_sets[[pathway_id]], "character")
    expect_gt(length(gene_sets[[pathway_id]]), 0)
    # All should be EC numbers with proper format
    for (ec in gene_sets[[pathway_id]]) {
      expect_true(startsWith(ec, "EC:"))
    }
  }
})

test_that("MetaCyc reference data loading error handling", {
  # Test behavior when reference data is missing or corrupted
  skip_if_not_installed("fgsea")
  
  # This test verifies graceful error handling when data file is not found
  # We can't easily simulate missing file, but we can test the error path
  
  # Test with invalid pathway_type (should not reach MetaCyc loading code)
  expect_error({
    prepare_gene_sets(pathway_type = "InvalidPathwayType")
  })
  
  # Test the specific error message structure for MetaCyc loading failure
  # This tests the tryCatch block in prepare_gene_sets for MetaCyc
  
  # Mock a scenario where the loaded data is malformed
  # Create a temporary environment to test loading behavior
  test_env <- new.env()
  
  # Test that function handles empty or malformed reference data gracefully
  # If the reference data was somehow corrupted to be empty
  test_env$metacyc_to_ec_reference <- data.frame(
    pathway = character(0),
    ec_numbers = character(0)
  )
  
  # Test handling of empty reference data
  expect_no_error({
    # This should not crash even with empty reference data
    gene_sets_empty <- list()
    for (i in 1:nrow(test_env$metacyc_to_ec_reference)) {
      pathway_id <- test_env$metacyc_to_ec_reference[i, "pathway"]
      ec_string <- as.character(test_env$metacyc_to_ec_reference[i, "ec_numbers"])
      
      if (is.na(ec_string) || ec_string == "" || ec_string == "NA") {
        next
      }
      
      ec_numbers <- strsplit(ec_string, ";")[[1]]
      ec_numbers <- trimws(ec_numbers)
      ec_numbers <- ec_numbers[ec_numbers != ""]
      
      if (length(ec_numbers) > 0) {
        ec_numbers <- ifelse(grepl("^EC:", ec_numbers), ec_numbers, paste0("EC:", ec_numbers))
        gene_sets_empty[[pathway_id]] <- ec_numbers
      }
    }
  })
})

test_that("MetaCyc data format validation during loading", {
  # Test that loaded reference data has correct format and structure
  metacyc_ref_path <- system.file("extdata", "metacyc_to_ec_reference.RData", package = "ggpicrust2")
  load(metacyc_ref_path, envir = environment())
  
  # Validate data frame structure
  expect_s3_class(metacyc_to_ec_reference, "data.frame")
  expect_equal(ncol(metacyc_to_ec_reference), 2)
  expect_true("pathway" %in% colnames(metacyc_to_ec_reference))
  expect_true("ec_numbers" %in% colnames(metacyc_to_ec_reference))
  
  # Validate data types
  expect_type(metacyc_to_ec_reference$pathway, "character")
  expect_type(metacyc_to_ec_reference$ec_numbers, "character")
  
  # Validate pathway IDs are reasonable
  pathway_ids <- metacyc_to_ec_reference$pathway
  expect_true(all(!is.na(pathway_ids)))
  expect_true(all(nchar(pathway_ids) > 0))
  expect_true(all(nchar(pathway_ids) <= 50))  # Reasonable length limit
  
  # Validate EC number strings
  for (i in 1:min(20, nrow(metacyc_to_ec_reference))) {
    ec_string <- metacyc_to_ec_reference[i, "ec_numbers"]
    if (!is.na(ec_string) && ec_string != "" && ec_string != "NA") {
      # Should be semicolon-separated EC numbers
      expect_true(grepl("^\\d+\\.\\d+\\.\\d+\\.\\d+(;\\d+\\.\\d+\\.\\d+\\.\\d+)*$", ec_string) ||
                  grepl("^EC:\\d+\\.\\d+\\.\\d+\\.\\d+(;EC:\\d+\\.\\d+\\.\\d+\\.\\d+)*$", ec_string))
    }
  }
  
  # Test uniqueness of pathway IDs
  expect_equal(length(pathway_ids), length(unique(pathway_ids)),
               info = "Pathway IDs should be unique in reference data")
})

test_that("MetaCyc gene set creation from reference data", {
  # Test the specific logic that converts reference data to gene sets
  metacyc_ref_path <- system.file("extdata", "metacyc_to_ec_reference.RData", package = "ggpicrust2")
  load(metacyc_ref_path, envir = environment())
  
  # Manually replicate the gene set creation logic to test it
  gene_sets_manual <- list()
  
  for (i in 1:nrow(metacyc_to_ec_reference)) {
    pathway_id <- metacyc_to_ec_reference[i, "pathway"]
    ec_string <- as.character(metacyc_to_ec_reference[i, "ec_numbers"])
    
    # Skip pathways with no EC mappings
    if (is.na(ec_string) || ec_string == "" || ec_string == "NA") {
      next
    }
    
    # Split EC numbers by semicolon
    ec_numbers <- strsplit(ec_string, ";")[[1]]
    ec_numbers <- trimws(ec_numbers)  # Remove whitespace
    ec_numbers <- ec_numbers[ec_numbers != ""]  # Remove empty strings
    
    if (length(ec_numbers) > 0) {
      # Add EC: prefix if not present for consistency
      ec_numbers <- ifelse(grepl("^EC:", ec_numbers), ec_numbers, paste0("EC:", ec_numbers))
      gene_sets_manual[[pathway_id]] <- ec_numbers
    }
  }
  
  # Validate manual gene set creation
  expect_type(gene_sets_manual, "list")
  expect_gt(length(gene_sets_manual), 0)
  
  # Compare with prepare_gene_sets output
  skip_if_not_installed("fgsea")
  gene_sets_function <- prepare_gene_sets(pathway_type = "MetaCyc")
  
  # Should have same pathways (allowing for potential filtering differences)
  common_pathways <- intersect(names(gene_sets_manual), names(gene_sets_function))
  expect_gt(length(common_pathways), length(gene_sets_manual) * 0.8)  # At least 80% overlap
  
  # For common pathways, EC numbers should match
  for (pathway in common_pathways[1:min(10, length(common_pathways))]) {
    manual_ecs <- sort(gene_sets_manual[[pathway]])
    function_ecs <- sort(gene_sets_function[[pathway]])
    expect_equal(manual_ecs, function_ecs,
                info = paste("EC numbers should match for pathway:", pathway))
  }
})

test_that("MetaCyc reference data consistency across versions", {
  # Test that the reference data maintains consistency and expected properties
  metacyc_ref_path <- system.file("extdata", "metacyc_to_ec_reference.RData", package = "ggpicrust2")
  load(metacyc_ref_path, envir = environment())
  
  # Basic consistency checks
  expect_gte(nrow(metacyc_to_ec_reference), 50)   # Minimum reasonable coverage
  expect_lte(nrow(metacyc_to_ec_reference), 1000) # Maximum reasonable coverage
  
  # Check for expected key MetaCyc pathways
  expected_pathways <- c("1CMET2-PWY", "ANAGLYCOLYSIS-PWY", "ARG+POLYAMINE-SYN")
  pathway_ids <- metacyc_to_ec_reference$pathway
  
  for (expected in expected_pathways) {
    if (expected %in% pathway_ids) {
      # If pathway exists, it should have EC mappings
      row_idx <- which(pathway_ids == expected)
      ec_string <- metacyc_to_ec_reference[row_idx, "ec_numbers"]
      expect_true(!is.na(ec_string) && ec_string != "",
                  info = paste("Expected pathway", expected, "should have EC mappings"))
    }
  }
  
  # Test data completeness
  non_empty_mappings <- sum(!is.na(metacyc_to_ec_reference$ec_numbers) & 
                           metacyc_to_ec_reference$ec_numbers != "" &
                           metacyc_to_ec_reference$ec_numbers != "NA")
  completeness_rate <- non_empty_mappings / nrow(metacyc_to_ec_reference)
  expect_gt(completeness_rate, 0.7,  # At least 70% should have mappings
           info = paste("Data completeness rate:", round(completeness_rate * 100, 2), "%"))
})

test_that("MetaCyc reference data memory efficiency", {
  # Test memory usage and loading efficiency
  metacyc_ref_path <- system.file("extdata", "metacyc_to_ec_reference.RData", package = "ggpicrust2")
  
  # Test file size is reasonable
  file_size_mb <- file.size(metacyc_ref_path) / (1024^2)
  expect_lt(file_size_mb, 10,  # Should be less than 10MB
           info = paste("Reference data file size:", round(file_size_mb, 2), "MB"))
  
  # Test loading time
  start_time <- Sys.time()
  load(metacyc_ref_path, envir = new.env())
  end_time <- Sys.time()
  loading_time <- as.numeric(difftime(end_time, start_time, units = "secs"))
  
  expect_lt(loading_time, 2,  # Should load within 2 seconds
           info = paste("Loading time:", round(loading_time, 3), "seconds"))
  
  # Test memory footprint of loaded data
  load(metacyc_ref_path, envir = environment())
  data_size_mb <- as.numeric(object.size(metacyc_to_ec_reference)) / (1024^2)
  expect_lt(data_size_mb, 5,  # Should use less than 5MB in memory
           info = paste("In-memory size:", round(data_size_mb, 2), "MB"))
})

test_that("MetaCyc backup function validation", {
  # Test if there's a backup/fallback function for creating MetaCyc mappings
  # This would be similar to create_basic_go_mapping() for GO pathways
  
  # Check if there's a create_basic_metacyc_mapping function or similar
  # If it exists, test it; if not, document that it should be implemented
  
  # First check if such a function exists
  backup_function_exists <- exists("create_basic_metacyc_mapping", mode = "function")
  
  if (backup_function_exists) {
    # Test the backup function
    expect_no_error({
      backup_mapping <- create_basic_metacyc_mapping()
    })
    
    expect_s3_class(backup_mapping, "data.frame")
    expect_true("pathway" %in% colnames(backup_mapping))
    expect_true("ec_numbers" %in% colnames(backup_mapping))
    expect_gt(nrow(backup_mapping), 0)
    
  } else {
    # Document that backup function should be implemented
    skip("create_basic_metacyc_mapping function not implemented - consider adding for robustness")
  }
})

test_that("MetaCyc data loading integration with GSEA workflow", {
  # Test that the loaded reference data integrates properly with GSEA workflow
  skip_if_not_installed("fgsea")
  
  # Test complete workflow from data loading to gene set preparation
  expect_no_error({
    gene_sets <- prepare_gene_sets(pathway_type = "MetaCyc")
  })
  
  # Test that gene sets can be used in fgsea
  if (length(gene_sets) > 0) {
    # Create minimal test data
    test_genes <- unique(unlist(gene_sets))[1:min(20, length(unique(unlist(gene_sets))))]
    test_stats <- rnorm(length(test_genes))
    names(test_stats) <- test_genes
    
    expect_no_error({
      # This tests that gene sets have correct format for fgsea
      fgsea_result <- fgsea::fgsea(
        pathways = gene_sets[1:min(5, length(gene_sets))],  # Test subset
        stats = test_stats,
        nperm = 10,  # Minimal for testing
        minSize = 1
      )
    })
  }
})

test_that("MetaCyc reference data update compatibility", {
  # Test that the current loading system would handle updated reference data
  metacyc_ref_path <- system.file("extdata", "metacyc_to_ec_reference.RData", package = "ggpicrust2")
  load(metacyc_ref_path, envir = environment())
  
  # Current data structure requirements
  required_columns <- c("pathway", "ec_numbers")
  expect_true(all(required_columns %in% colnames(metacyc_to_ec_reference)))
  
  # Test that additional columns would be handled gracefully
  extended_data <- metacyc_to_ec_reference
  extended_data$description <- paste("Description for", extended_data$pathway)
  extended_data$category <- sample(c("Metabolism", "Biosynthesis", "Degradation"), 
                                  nrow(extended_data), replace = TRUE)
  
  # Simulate loading extended data (the current loading code should handle extra columns)
  expect_no_error({
    gene_sets_extended <- list()
    
    for (i in 1:min(10, nrow(extended_data))) {
      pathway_id <- extended_data[i, "pathway"]
      ec_string <- as.character(extended_data[i, "ec_numbers"])
      
      if (!is.na(ec_string) && ec_string != "" && ec_string != "NA") {
        ec_numbers <- strsplit(ec_string, ";")[[1]]
        ec_numbers <- trimws(ec_numbers)
        ec_numbers <- ec_numbers[ec_numbers != ""]
        
        if (length(ec_numbers) > 0) {
          ec_numbers <- ifelse(grepl("^EC:", ec_numbers), ec_numbers, paste0("EC:", ec_numbers))
          gene_sets_extended[[pathway_id]] <- ec_numbers
        }
      }
    }
  })
  
  expect_type(gene_sets_extended, "list")
})

# Try the ggpicrust2 package in your browser
#
# Any scripts or data that you put into this service are public.
#
# ggpicrust2 documentation built on Aug. 26, 2025, 1:07 a.m.