# tests/testthat/test-semantic_search.R

test_that("semantic_search generates correct n-grams for single word query", {
  skip_if_not_installed("dplyr")

  # Minimal stand-in for a census vector list: two of the three `details`
  # strings contain the word "population".
  detail_text <- c(
    "Total population count",
    "Male population aged 15",
    "Female income median"
  )
  mock_vectors <- data.frame(
    vector = paste0("v_TEST_", 1:3),
    details = detail_text,
    label = c("Population", "Male", "Female"),
    stringsAsFactors = FALSE
  )

  result <- cancensus:::semantic_search("population", mock_vectors)

  # A single-word query must come back non-NULL and with at least one row ...
  expect_false(is.null(result))
  expect_gt(nrow(result), 0)

  # ... and at least one returned row must mention the query term, whatever
  # its letter case.
  mentions_query <- grepl("population", result$details, ignore.case = TRUE)
  expect_true(any(mentions_query))
})

test_that("semantic_search generates correct n-grams for multi-word query", {
  skip_if_not_installed("dplyr")

  # Three population-related descriptions plus one unrelated income row.
  detail_text <- c(
    "Total population count by age",
    "Male population aged 15 to 24",
    "Female population aged 25 to 34",
    "Income median for households"
  )
  mock_vectors <- data.frame(
    vector = paste0("v_TEST_", 1:4),
    details = detail_text,
    label = c("Pop1", "Pop2", "Pop3", "Income"),
    stringsAsFactors = FALSE
  )

  # Two-word query.
  result <- cancensus:::semantic_search("male population", mock_vectors)

  expect_false(is.null(result))

  # The content check only runs when rows came back; an empty result passes
  # this test without being inspected. The pattern is a case-insensitive
  # substring match, so a "Female ..." row also satisfies it.
  has_rows <- nrow(result) > 0
  if (has_rows) {
    mentions_male <- grepl("male", result$details, ignore.case = TRUE)
    expect_true(any(mentions_male))
  }
})

test_that("semantic_search handles empty vectors", {
  skip_if_not_installed("dplyr")

  # Zero-row input that still carries the three expected character columns.
  no_values <- character()
  mock_vectors <- data.frame(
    vector = no_values,
    details = no_values,
    label = no_values,
    stringsAsFactors = FALSE
  )

  # Edge case unlikely to occur in real usage: the test pins down that
  # searching an empty vector list raises an error (any error message).
  expect_error(
    cancensus:::semantic_search("population", mock_vectors)
  )
})

test_that("semantic_search handles vectors with punctuation", {
  skip_if_not_installed("dplyr")

  # Descriptions containing colons, commas, parentheses, and dashes.
  detail_text <- c(
    "Population: total, all ages (2021)",
    "Income - median household income"
  )
  mock_vectors <- data.frame(
    vector = paste0("v_TEST_", 1:2),
    details = detail_text,
    label = c("Pop", "Income"),
    stringsAsFactors = FALSE
  )

  # The query words are separated by ": " in the first description.
  result <- cancensus:::semantic_search("population total", mock_vectors)

  expect_false(is.null(result))

  # The content check only runs when rows came back; an empty result passes
  # this test without being inspected.
  has_rows <- nrow(result) > 0
  if (has_rows) {
    mentions_query <- grepl("population", result$details, ignore.case = TRUE)
    expect_true(any(mentions_query))
  }
})

test_that("semantic_search handles case insensitivity", {
  skip_if_not_installed("dplyr")

  # The same phrase stored once in upper case and once in lower case.
  phrase <- "population total"
  mock_vectors <- data.frame(
    vector = paste0("v_TEST_", 1:2),
    details = c(toupper(phrase), phrase),
    label = c("Pop1", "Pop2"),
    stringsAsFactors = FALSE
  )

  # Run every spelling of the query first, then check each outcome.
  query_spellings <- c("POPULATION", "population", "Population")
  results <- lapply(
    query_spellings,
    function(query) cancensus:::semantic_search(query, mock_vectors)
  )

  # Each spelling must yield a non-NULL result; the contents are not compared.
  for (found in results) {
    expect_false(is.null(found))
  }
})

test_that("semantic_search with no close matches warns user", {
  skip_if_not_installed("dplyr")

  detail_text <- c(
    "Population total count",
    "Income median value"
  )
  mock_vectors <- data.frame(
    vector = paste0("v_TEST_", 1:2),
    details = detail_text,
    label = c("Pop", "Income"),
    stringsAsFactors = FALSE
  )

  # A nonsense query resembles nothing in `details`; the call is expected to
  # raise a warning whose message matches the pattern below.
  nonsense_query <- "zzzzxxxxxqqqqq"
  expect_warning(
    cancensus:::semantic_search(nonsense_query, mock_vectors),
    regexp = "No close matches found"
  )
})

test_that("semantic_search handles short sentences correctly", {
  skip_if_not_installed("dplyr")

  # Descriptions of very different lengths: two single words and one long
  # sentence.
  detail_text <- c(
    "Total",
    "Population",
    "Total population count by age groups"
  )
  mock_vectors <- data.frame(
    vector = paste0("v_TEST_", 1:3),
    details = detail_text,
    label = c("T", "P", "TPC"),
    stringsAsFactors = FALSE
  )

  result <- cancensus:::semantic_search("total", mock_vectors)

  expect_false(is.null(result))

  # The content check only runs when rows came back; an empty result passes
  # this test without being inspected.
  has_rows <- nrow(result) > 0
  if (has_rows) {
    mentions_query <- grepl("total", result$details, ignore.case = TRUE)
    expect_true(any(mentions_query))
  }
})

test_that("semantic_search n-gram optimization produces identical results", {
  skip_if_not_installed("dplyr")

  # 50 mock vectors: ten distinct descriptions followed by 40 identical
  # filler rows.
  distinct_details <- c(
    "Total population by age groups and gender",
    "Male population aged 0 to 14 years",
    "Female population aged 15 to 24 years",
    "Total households by family composition",
    "Median household income after tax",
    "Average household income before tax",
    "Total dwelling units by structure type",
    "Population density per square kilometer",
    "Labour force participation rate by age",
    "Unemployment rate for all ages"
  )
  filler_details <- rep("Other census variable details", 40)

  mock_vectors <- data.frame(
    vector = paste0("v_TEST_", 1:50),
    details = c(distinct_details, filler_details),
    label = paste0("Label_", 1:50),
    stringsAsFactors = FALSE
  )

  # NOTE(review): despite the test name, nothing here compares the output
  # against a pre-optimization baseline. The test only checks that each call
  # completes and, for the asserted queries, returns a non-NULL value.
  queries <- c("population", "household income", "aged 15", "total")

  # "aged 15" is executed so that an error would fail the test, but no
  # expectation is placed on its return value.
  unasserted_queries <- "aged 15"

  for (query in queries) {
    result <- cancensus:::semantic_search(query, mock_vectors)

    if (!(query %in% unasserted_queries)) {
      # The original condition was
      # `!is.null(result) || inherits(result, "data.frame")`; NULL never
      # inherits from "data.frame", so the second clause could not change the
      # outcome. `info` names the failing query in the failure report.
      expect_false(is.null(result), info = paste("query:", query))
    }
  }
})

# Try the cancensus package in your browser

# Any scripts or data that you put into this service are public.

# cancensus documentation built on June 19, 2026, 9:07 a.m.