test-data.R
In babynamesIL: Israel Baby Names 1949-2024

# Snapshot tests for data integrity
test_that("data integrity - snapshots", {
    old <- options(tibble.print_min = 20)
    on.exit(options(old))
    expect_snapshot(babynamesIL[c(1:20, (nrow(babynamesIL) - 19):nrow(babynamesIL)), ])
    expect_snapshot(babynamesIL_totals[c(1:20, (nrow(babynamesIL_totals) - 19):nrow(babynamesIL_totals)), ])
})

# Structural tests for main dataset
test_that("babynamesIL has correct structure", {
    expect_equal(ncol(babynamesIL), 6)
    expect_true(all(c("sector", "year", "sex", "name", "n", "prop") %in% names(babynamesIL)))

    # Check types
    expect_type(babynamesIL$sector, "character")
    expect_type(babynamesIL$year, "double")
    expect_type(babynamesIL$sex, "character")
    expect_type(babynamesIL$name, "character")
    expect_type(babynamesIL$n, "integer")
    expect_type(babynamesIL$prop, "double")
})

test_that("babynamesIL has correct sectors and sex values", {
    expect_setequal(unique(babynamesIL$sector), c("Jewish", "Muslim", "Christian-Arab", "Druze"))
    expect_setequal(unique(babynamesIL$sex), c("F", "M"))
})

test_that("babynamesIL has correct year range", {
    expect_equal(min(babynamesIL$year), 1949)
    expect_equal(max(babynamesIL$year), 2024)
})

test_that("babynamesIL proportions are valid", {
    expect_true(all(babynamesIL$prop >= 0 & babynamesIL$prop <= 1))

    # Check proportions sum to ~1 for each year/sector/sex group
    prop_sums <- babynamesIL %>%
        dplyr::group_by(sector, year, sex) %>%
        dplyr::summarise(prop_sum = sum(prop), .groups = "drop")
    expect_true(all(abs(prop_sums$prop_sum - 1) < 0.01))
})

test_that("high-volume names do not have long recent gaps", {
    gap_check <- babynamesIL %>%
        dplyr::group_by(sector, sex, name) %>%
        dplyr::summarise(
            years = list(sort(unique(year))),
            max_n = max(n),
            .groups = "drop"
        ) %>%
        dplyr::mutate(
            gap_info = purrr::map(
                years,
                ~ {
                    yrs <- .x
                    if (length(yrs) < 2) {
                        return(tibble::tibble(prev_year = numeric(), next_year = numeric(), gap = numeric()))
                    }
                    tibble::tibble(
                        prev_year = yrs[-length(yrs)],
                        next_year = yrs[-1],
                        gap = diff(yrs) - 1
                    )
                }
            )
        ) %>%
        dplyr::select(-years) %>%
        tidyr::unnest(gap_info) %>%
        dplyr::filter(max_n >= 1000, prev_year >= 2005, gap >= 3)

    offenders <- unique(gap_check$name)
    expect_equal(
        nrow(gap_check),
        0,
        info = if (length(offenders) == 0) "" else paste("Names with gaps:", paste(offenders, collapse = ", "))
    )
})

# Structural tests for totals dataset
test_that("babynamesIL_totals has correct structure", {
    expect_equal(ncol(babynamesIL_totals), 4)
    expect_true(all(c("sector", "sex", "name", "total") %in% names(babynamesIL_totals)))

    expect_setequal(unique(babynamesIL_totals$sector), c("Jewish", "Muslim", "Christian-Arab", "Druze"))
    expect_setequal(unique(babynamesIL_totals$sex), c("F", "M"))
})

test_that("totals and yearly data are consistent", {
    # Every name in yearly data must have a total
    yearly_names <- babynamesIL %>%
        dplyr::distinct(sector, sex, name)
    matched <- dplyr::inner_join(yearly_names, babynamesIL_totals, by = c("sector", "sex", "name"))
    expect_equal(nrow(matched), nrow(yearly_names))

    # CBS totals must be >= sum of filtered yearly data
    yearly_sums <- babynamesIL %>%
        dplyr::group_by(sector, sex, name) %>%
        dplyr::summarise(yearly_sum = sum(n), .groups = "drop") %>%
        dplyr::inner_join(babynamesIL_totals, by = c("sector", "sex", "name"))
    expect_true(all(yearly_sums$total >= yearly_sums$yearly_sum))

    # Totals may include names not in yearly data (sub-threshold names)
    expect_gte(nrow(babynamesIL_totals), nrow(yearly_names))
})

# Tests for archive datasets
test_that("babynamesIL_1948 exists and has correct structure", {
    expect_true(exists("babynamesIL_1948"))
    expect_equal(ncol(babynamesIL_1948), 6)
    expect_true(all(c("sector", "year", "sex", "name", "n", "prop") %in% names(babynamesIL_1948)))

    # All rows should be from 1948
    expect_equal(unique(babynamesIL_1948$year), 1948)

    # 1948 data uses old "Christian" naming
    expect_true("Christian" %in% unique(babynamesIL_1948$sector))
})

test_that("babynamesIL_other exists and has correct structure", {
    expect_true(exists("babynamesIL_other"))
    expect_equal(ncol(babynamesIL_other), 6)
    expect_true(all(c("sector", "year", "sex", "name", "n", "prop") %in% names(babynamesIL_other)))

    # All rows should be from "Other" sector
    expect_equal(unique(babynamesIL_other$sector), "Other")

    # Year range should be 1985-2021
    expect_equal(min(babynamesIL_other$year), 1985)
    expect_equal(max(babynamesIL_other$year), 2021)
})