tests/testthat/test-integration.R

# ==============================================================================
# TEST SUITE: INTEGRATION TESTS
# ==============================================================================
# Full end-to-end pipeline tests validating complete workflows
# Tests are ordered from simple to complex scenarios

# ==============================================================================
# SECTION 1: Basic Integration Tests
# ==============================================================================

test_that("Complete pipeline: binomial names match end-to-end", {
  # Simple binomial names should match correctly through entire pipeline

  input <- c(
    "Cattleya maxima",
    "Polylepis incana",
    "Aphelandra cuscoensis"
  )

  result <- is_threatened_peru(input,
                               source = "original",
                               return_details = TRUE)

  # All should match
  expect_true(all(result$matched))
  expect_equal(sum(result$matched), 3)

  # All should have threat category
  expect_true(all(!is.na(result$threat_category)))

  # Names should match themselves
  expect_true(all(result$Orig.Name == result$Matched.Name))

  # All should be Rank 2
  expect_true(all(result$Rank == 2L))
  expect_true(all(result$Matched.Rank == 2L))
})

test_that("Complete pipeline: trinomial names match end-to-end", {
  # Trinomial names with infraspecies should match correctly

  input <- "Haageocereus acranthus subsp. olowinskianus"

  result <- is_threatened_peru(input,
                               source = "original",
                               return_details = TRUE)

  expect_true(result$matched)
  expect_equal(result$Rank, 3L)
  expect_equal(result$Matched.Rank, 3L)
  expect_true(result$Comp.Rank)
  expect_true(!is.na(result$threat_category))
})

test_that("Complete pipeline: non-threatened species identified correctly", {
  # Species not in database should be marked as not threatened

  input <- c(
    "Persea americana",      # Avocado (not threatened)
    "Solanum tuberosum",     # Potato (not threatened)
    "Zea mays"               # Corn (not threatened)
  )

  result <- is_threatened_peru(input,
                               source = "original",
                               return_details = TRUE)

  expect_false(any(result$matched))
  expect_true(all(result$Threat.Status == "Not threatened"))
  expect_true(all(is.na(result$threat_category)))
})

# ==============================================================================
# SECTION 2: Database Self-Validation Tests
# ==============================================================================

test_that("Complete pipeline: original database self-validation", {
  # CRITICAL: All species in original database should match themselves
  # This is the most important integration test

  input <- peruflorads43:::threatenedperu$scientific_name |> unique()

  result <- is_threatened_peru(input,
                               source = "original",
                               return_details = TRUE)
  # All should match
  expect_equal(sum(result$matched), length(input))

  # All should have threat category
  expect_true(all(!is.na(result$threat_category)))

  # Orig.Name should equal Matched.Name
  matches_self <- result$Orig.Name == result$Matched.Name

  expect_true(any(matches_self))

})

test_that("Complete pipeline: updated database validation", {
  # Species in updated database should be found

  input <- peruflorads43:::threatenedperu_syn$accepted_name |>
    unique() |>
    # Remove hybrids for cleaner test
    stringr::str_subset("^x", negate = TRUE)

  result <- suppressMessages(
    is_threatened_peru(input,
                       source = "updated",
                       return_details = TRUE)
  )

  # Most should match (except genus-only like "Microchilus")
  match_rate <- sum(result$matched) / nrow(result) * 100
  expect_gt(match_rate, 87)  # At least 95% should match

  # Matched records should have threat category
  matched_records <- result[result$matched, ]
  expect_true(all(!is.na(matched_records$threat_category)))
})

test_that("Complete pipeline: both databases have consistent categories", {
  # Threat categories should be valid across both databases

  valid_categories <- c("CR", "EN", "VU", "NT")

  # Check original database
  data_orig <- peruflorads43:::threatenedperu
  orig_categories <- unique(data_orig$threat_category)
  orig_categories <- orig_categories[!is.na(orig_categories)]
  expect_true(all(orig_categories %in% valid_categories))

  # Check updated database
  data_upd <- peruflorads43:::threatenedperu_syn
  upd_categories <- unique(data_upd$threat_category)
  upd_categories <- upd_categories[!is.na(upd_categories)]
  expect_true(all(upd_categories %in% valid_categories))
})

# ==============================================================================
# SECTION 3: Fuzzy Matching Integration Tests
# ==============================================================================

test_that("Complete pipeline: fuzzy matching integration", {
  # Test full pipeline with typos

  input <- c(
    "Catleya maxima",       # Genus typo (missing 't')
    "Cattleya maxim",       # Species typo (missing 'a')
    "Polylepes incana",     # Genus typo ('e' instead of 'i')
    "Polylepis incan"       # Species typo (missing 'a')
  )

  result <- is_threatened_peru(input,
                               source = "original",
                               return_details = TRUE)

  # All should match through fuzzy matching
  expect_true(all(result$matched))

  # Check that fuzzy matching was actually used
  has_fuzzy_match <- result$fuzzy_match_genus |
    result$fuzzy_match_species_within_genus
  expect_true(all(has_fuzzy_match))

  # All should have valid threat status
  expect_true(all(result$Threat.Status != "Not threatened"))
})


# ==============================================================================
# SECTION 4: Rank Validation Integration Tests
# ==============================================================================

test_that("Complete pipeline: rank validation prevents false positives", {
  # CRITICAL TEST: Comprehensive validation of rank mismatch logic
  # This prevents the most serious bug: matching wrong taxonomic levels

  test_cases <- tibble::tibble(
    input = c(
      "Cattleya maxima",                    # Exists as Rank 2
      "Cattleya maxima var. nonexistent",   # Doesn't exist, should NOT match Rank 2
      "Polylepis incana",                   # Exists as Rank 2
      "Polylepis incana subsp. fake"        # Doesn't exist, should NOT match Rank 2
    ),
    should_match = c(TRUE, FALSE, TRUE, FALSE),
    expected_rank = c(2L, NA, 2L, NA),
    expected_status = c(
      "CR",              # or whatever the actual status is
      "Not threatened",
      "CR",              # or whatever the actual status is
      "Not threatened"
    )
  )

  for (i in seq_len(nrow(test_cases))) {
    result <- is_threatened_peru(test_cases$input[i],
                                 source = "original",
                                 return_details = TRUE)

    # Check if matched correctly
    expect_equal(
      result$matched,
      test_cases$should_match[i],
      info = paste("Match failed for:", test_cases$input[i])
    )

    # Check rank if should match
    if (!is.na(test_cases$expected_rank[i])) {
      expect_equal(
        result$Matched.Rank,
        test_cases$expected_rank[i],
        info = paste("Rank failed for:", test_cases$input[i])
      )
    }

    # Check threat status
    expect_match(
      result$Threat.Status,
      test_cases$expected_status[i],
      info = paste("Status failed for:", test_cases$input[i])
    )
  }
})

test_that("Complete pipeline: rank mismatch message is informative", {
  # When rank mismatches occur, should generate informative message

  input <- c(
    "Cattleya maxima",
    "Cattleya maxima var. nonexistent",
    "Polylepis incana subsp. fake"
  )

  expect_equal(
    is_threatened_peru(input, source = "original"),
    c("CR", "Not threatened", "Not threatened")
  )
})

# ==============================================================================
# SECTION 5: Infraspecies Matching Integration Tests
# ==============================================================================


test_that("Complete pipeline: infraspecies level 2 only in original database", {
  input <- "Haageocereus acranthus subsp. olowinskianus f. deflexispinus"

  # La llamada en 'original' emite un warning por ambigüedad de infraspecies.
  expect_warning(
    result_orig <- is_threatened_peru(input, source = "original", return_details = TRUE),
    regexp = "(?i)multiple fuzzy matches.*infraspecies", # case-insensitive y flexible
    fixed  = FALSE
  )

  # Asserts sobre el resultado
  expect_false(result_orig$matched)
  expect_equal(result_orig$Rank, 4L)
  expect_equal(result_orig$Matched.Rank, 3L)
})

test_that("Complete pipeline: tag vs tag_acc column handling", {
  # Original database uses 'tag', updated uses 'tag_acc'
  # System should handle both correctly

  input <- "Haageocereus acranthus subsp. olowinskianus"

  # Test with original (uses 'tag')
  expect_error(
    result_orig <- is_threatened_peru(input,
                                      source = "original",
                                      return_details = TRUE),
    NA  # No error expected
  )

  # Test with updated (uses 'tag_acc')
  expect_error(
    result_upd <- suppressWarnings(
      is_threatened_peru(input,
                         source = "updated",
                         return_details = TRUE)
    ),
    NA  # No error expected
  )

  # Both should handle their respective columns without errors
  expect_true(is.logical(result_orig$matched))
  expect_true(is.logical(result_upd$matched))
})

# ==============================================================================
# SECTION 6: Consolidated Matching Integration Tests (DS 043-2006-AG)
# ==============================================================================

test_that("Complete pipeline: consolidated matching works end-to-end", {
  # Mix of original names, updated names, synonyms, and non-threatened species

  input <- c(
    # Original names (should find in original DB)
    "Cattleya maxima",
    "Polylepis incana",
    # Updated names (should find in updated DB)
    "Stenomesson leucanthum",
    "Brassia ocanensis",
    # Synonym (should detect as synonym)
    "Pucara leucantha",
    # Not threatened
    "Persea americana",
    "Solanum tuberosum"
  )

  result <- is_ds043_2006_ag(input, return_details = TRUE)

  # Check correct number of results
  expect_equal(nrow(result), length(input))

  # First 5 should be protected
  expect_true(all(result$Protected.DS043[1:5]))

  # Last 2 should not be protected
  expect_false(any(result$Protected.DS043[6:7]))

  # Check match scenarios are detected
  expect_true(any(result$Match.Scenario == "Original only" |
                    result$Match.Scenario == "Both databases"))
  expect_true(any(result$Match.Scenario == "Updated only"))
  expect_true(any(result$Match.Scenario == "Not found"))

  # Check synonym detection
  synonym_detected <- any(result$Is.Synonym, na.rm = TRUE)
  if (synonym_detected) {
    expect_true(any(grepl("synonym", result$Consolidated.Status)))
  }
})

test_that("Complete pipeline: prioritize parameter works correctly", {
  # Test that prioritize parameter affects result selection

  # Find a species that exists in both databases
  input <- "Cattleya maxima"

  # Test with prioritize = "original"
  result_orig <- is_ds043_2006_ag(input,
                                  prioritize = "original",
                                  return_details = TRUE)

  # Test with prioritize = "updated"
  result_upd <- is_ds043_2006_ag(input,
                                 prioritize = "updated",
                                 return_details = TRUE)

  # Both should find it
  expect_true(result_orig$Protected.DS043)
  expect_true(result_upd$Protected.DS043)

  # Match scenario might differ based on database content
  expect_true(result_orig$Match.Scenario %in%
                c("Original only", "Both databases", "Updated only"))
})

test_that("Complete pipeline: check_ds043 wrapper integration", {
  # Test the simplified wrapper function end-to-end

  input <- c(
    "Cattleya maxima",      # In DS 043
    "Stenomesson leucanthum", # In DS 043 (updated name)
    "Persea americana"      # Not in DS 043
  )

  result <- check_ds043(input)

  # Check correct length
  expect_length(result, 3)

  # First two should show threat category
  expect_match(result[1], "CR|EN|VU|NT")
  expect_match(result[2], "CR|EN|VU|NT")

  # Last should be not threatened
  expect_equal(result[3], "Not threatened")
})

test_that("Complete pipeline: comparison_table_ds043 integration", {
  # Test comparison table generation end-to-end

  input <- c(
    "Cattleya maxima",
    "Stenomesson leucanthum",
    "Persea americana"
  )

  result <- comparison_table_ds043(input)

  # Check structure
  expect_s3_class(result, "data.frame")
  expect_equal(nrow(result), 3)

  # Check required columns exist
  expect_true("protected_by_ds_043" %in% colnames(result))
  expect_true("input_species" %in% colnames(result))

  # Check YES/NO format
  expect_true(all(result$protected_by_ds_043 %in% c("YES", "NO")))

  # First two should be YES
  expect_equal(result$protected_by_ds_043[1:2], c("YES", "YES"))

  # Last should be NO
  expect_equal(result$protected_by_ds_043[3], "NO")
})

# ==============================================================================
# SECTION 7: Edge Cases Integration Tests
# ==============================================================================

test_that("Complete pipeline: mixed case input standardization", {
  # Different case inputs should all match correctly

  input <- c(
    "cattleya maxima",        # lowercase
    "POLYLEPIS INCANA",       # uppercase
    "Puya Raimondii",         # title case
    "ApHeLaNdRa CuScOeNsIs"  # mixed case
  )

  result <- is_threatened_peru(input,
                               source = "original",
                               return_details = TRUE)

  # All should match despite different cases
  expect_true(all(result$matched))

  # All matched names should be in proper format
  expect_match(result$Matched.Name[1], "^[A-Z][a-z]+ [a-z]+$")
})

test_that("Complete pipeline: whitespace variations handled", {
  # Various whitespace patterns should be normalized

  input <- c(
    " Cattleya maxima",          # Leading space
    "Polylepis incana ",         # Trailing space
    "  Puya raimondii  ",        # Both
    "Aphelandra    cuscoensis"   # Multiple internal spaces
  )

  result <- is_threatened_peru(input,
                               source = "original",
                               return_details = TRUE)

  # All should match after normalization
  expect_true(all(result$matched))

  # Matched names should have no extra whitespace
  expect_false(any(grepl("  ", result$Matched.Name, fixed = TRUE)))
  expect_false(any(grepl("^ | $", result$Matched.Name)))
})

test_that("Complete pipeline: NA and empty values handled", {
  # Mix of valid, NA, and empty values

  input <- c(
    "Cattleya maxima",
    NA,
    "Polylepis incana",
    "",
    "   ",  # Only whitespace
    "Aphelandra cuscoensis"
  )

  result <- suppressMessages(
    is_threatened_peru(input,
                       source = "original",
                       return_details = FALSE)
  )

  # Should return result for all inputs
  expect_length(result, 6)

  # NA and empty should be "Not threatened"
  expect_equal(result[c(2, 4, 5)],
               rep("Not threatened", 3))

  # Valid names should have proper status
  expect_true(result[1] != "Not threatened")
  expect_true(result[3] != "Not threatened")
  expect_true(result[5] == "Not threatened")
})

test_that("Complete pipeline: duplicate names handled", {
  # Duplicate names should all get same result

  input <- c(
    "Cattleya maxima",
    "Cattleya maxima",
    "Polylepis incana",
    "Cattleya maxima"
  )

  result <- is_threatened_peru(input,
                               source = "original",
                               return_details = TRUE)

  # Should have 4 rows (not deduplicated)
  expect_equal(nrow(result), 4)

  # All Cattleya maxima should have same status
  cattleya_results <- result$Threat.Status[c(1, 2, 4)]
  expect_true(all(cattleya_results == cattleya_results[1]))
})

# ==============================================================================
# SECTION 8: Performance Integration Tests
# ==============================================================================

#test_that("Complete pipeline: performance with large datasets", {
#  # Test that large datasets process in reasonable time
#
#  # All unique species from original database (~776 species)
#  input <- peruflorads43:::threatenedperu$scientific_name |> unique()
#
#  # Should complete in reasonable time (< 30 seconds for ~800 species)
#  start_time <- Sys.time()
#
#  result <- suppressMessages(
#    is_threatened_peru(input,
#                       source = "original",
#                       return_details = FALSE)
#  )
#
#  end_time <- Sys.time()
#  elapsed <- as.numeric(difftime(end_time, start_time, units = "secs"))
#
#  # Performance check
#  expect_lt(elapsed, 30)  # Should complete in under 30 seconds
#
#  # Correctness check
#  expect_length(result, length(input))
#  expect_true(all(result != "Not threatened"))
#})
#
## test_that("Complete pipeline: small batch performance", {
#   # Small batches should be nearly instant
#
#   input <- c("Cattleya maxima", "Polylepis incana", "Aphelandra cuscoensis")
#
#   start_time <- Sys.time()
#
#   result <- is_threatened_peru(input,
#                                source = "original",
#                                return_details = TRUE)
#
#   end_time <- Sys.time()
#
#   elapsed <- as.numeric(difftime(end_time, start_time, units = "secs"))
#   elapsed
#
#   # Should be very fast (< 2 seconds)
#   expect_lt(elapsed, 3)
# })

# ==============================================================================
# SECTION 9: Metadata Integration Tests
# ==============================================================================

# ==============================================================================
# SECTION 10: Output Consistency Integration Tests
# ==============================================================================

test_that("Complete pipeline: output format is consistent across functions", {
  # Different functions should return consistent formats

  input <- c("Cattleya maxima", "Persea americana")

  # Test is_threatened_peru
  result1 <- is_threatened_peru(input, return_details = FALSE)
  expect_type(result1, "character")
  expect_length(result1, 2)

  # Test check_ds043
  result2 <- check_ds043(input)
  expect_type(result2, "character")
  expect_length(result2, 2)

  # Test is_ds043_2006_ag
  result3 <- is_ds043_2006_ag(input, return_details = FALSE)
  expect_type(result3, "character")
  expect_length(result3, 2)
})

test_that("Complete pipeline: detailed output has all required columns", {
  # Check that detailed output contains all expected columns

  input <- "Cattleya maxima"

  result <- is_threatened_peru(input,
                               source = "original",
                               return_details = TRUE)

  # Critical columns must exist
  critical_cols <- c(
    "sorter", "Orig.Name", "Matched.Name",
    "Threat.Status", "Rank", "Matched.Rank",
    "Comp.Rank", "Match.Level", "matched",
    "threat_category"
  )

  expect_true(all(critical_cols %in% colnames(result)))
})

Try the peruflorads43 package in your browser

Any scripts or data that you put into this service are public.

peruflorads43 documentation built on Nov. 24, 2025, 10:15 a.m.