tests/testthat/test-clean.R

# Helper to create minimal survey for testing
create_test_survey <- function(participants, contacts = NULL) {
  if (is.null(contacts)) {
    contacts <- data.frame(
      part_id = participants$part_id[1],
      cnt_age_exact = 25
    )
  }
  new_contact_survey(participants, contacts)
}

# Test 1: Country name normalisation
test_that("clean() normalises 2-letter ISO country codes", {
  participants <- data.frame(
    part_id = 1:3,
    part_age = c(20, 30, 40),
    country = c("GB", "DE", "FR"),
    stringsAsFactors = FALSE
  )
  survey <- create_test_survey(participants)
  cleaned <- clean(survey)

  expect_identical(
    as.character(cleaned$participants$country),
    c("United Kingdom", "Germany", "France")
  )
})

test_that("clean() normalises 3-letter ISO country codes", {
  participants <- data.frame(
    part_id = 1:3,
    part_age = c(20, 30, 40),
    country = c("GBR", "DEU", "FRA"),
    stringsAsFactors = FALSE
  )
  survey <- create_test_survey(participants)
  cleaned <- clean(survey)

  expect_identical(
    as.character(cleaned$participants$country),
    c("United Kingdom", "Germany", "France")
  )
})

test_that("clean() normalises full country names", {
  participants <- data.frame(
    part_id = 1:3,
    part_age = c(20, 30, 40),
    country = c("United Kingdom", "Germany", "France"),
    stringsAsFactors = FALSE
  )
  survey <- create_test_survey(participants)
  cleaned <- clean(survey)

  expect_identical(
    as.character(cleaned$participants$country),
    c("United Kingdom", "Germany", "France")
  )
})

test_that("clean() preserves completely unrecognised country names", {
  # When countrycode returns NA, the original value is preserved
  participants <- data.frame(
    part_id = 1:2,
    part_age = c(20, 30),
    country = c("Germany", "XYZABC123"),
    stringsAsFactors = FALSE
  )
  survey <- create_test_survey(participants)
  cleaned <- clean(survey)

  expect_identical(
    as.character(cleaned$participants$country),
    c("Germany", "XYZABC123")
  )
})

# Test 2: Non-numeric age entries become NA
test_that("clean() sets non-numeric age entries to NA", {
  participants <- data.frame(
    part_id = 1:4,
    part_age = c("25", "unknown", "N/A", "thirty"),
    stringsAsFactors = FALSE
  )

  survey <- create_test_survey(participants)
  cleaned <- clean(survey)

  expect_identical(cleaned$participants$part_age_exact[1], 25L)
  expect_true(is.na(cleaned$participants$part_age_exact[2]))
  expect_true(is.na(cleaned$participants$part_age_exact[3]))
  expect_true(is.na(cleaned$participants$part_age_exact[4]))
})

# Test 3: "Under X" format conversion
test_that("clean() converts 'Under X' format to '0-X'", {
  participants <- data.frame(
    part_id = 1:3,
    part_age = c("Under 1", "Under 5", "Under 18"),
    stringsAsFactors = FALSE
  )
  survey <- create_test_survey(participants)
  cleaned <- clean(survey)

  expect_identical(cleaned$participants$part_age_est_min, c(0, 0, 0))
  expect_identical(cleaned$participants$part_age_est_max, c(1, 5, 18))
})

# Test 4: Age range parsing
test_that("clean() creates est_min and est_max for age ranges", {
  survey <- polymod
  # Avoid mutating polymod by reference (participants is a data.table)
  survey$participants <- data.table::copy(survey$participants)

  # Replace exact ages with ranges
  survey$participants[, part_age := "20-30"]
  survey$participants[, part_age_exact := NULL]

  cleaned <- clean(survey)

  expect_true("part_age_est_min" %in% names(cleaned$participants))
  expect_true("part_age_est_max" %in% names(cleaned$participants))
  expect_equal(cleaned$participants$part_age_est_min[1], 20, tolerance = 0)
  expect_equal(cleaned$participants$part_age_est_max[1], 30, tolerance = 0)
})

test_that("clean() handles single age values (no range)", {
  participants <- data.frame(
    part_id = 1:2,
    part_age = c("25", "30"),
    stringsAsFactors = FALSE
  )
  survey <- create_test_survey(participants)
  cleaned <- clean(survey)

  # For single values, est_min and est_max should be the same
  expect_identical(cleaned$participants$part_age_est_min, c(25, 30))
  expect_identical(cleaned$participants$part_age_est_max, c(25, 30))
})

# Test 5: Exact age column creation
test_that("clean() creates part_age_exact from numeric ages", {
  participants <- data.frame(
    part_id = 1:3,
    part_age = c("25", "30", "45"),
    stringsAsFactors = FALSE
  )
  survey <- create_test_survey(participants)
  cleaned <- clean(survey)

  expect_true("part_age_exact" %in% names(cleaned$participants))
  expect_identical(cleaned$participants$part_age_exact, c(25L, 30L, 45L))
})

test_that("clean() sets part_age_exact to NA for age ranges", {
  participants <- data.frame(
    part_id = 1:2,
    part_age = c("25-30", "40-50"),
    stringsAsFactors = FALSE
  )
  survey <- create_test_survey(participants)
  cleaned <- clean(survey)

  # Age ranges should result in NA for exact age
  expect_true(is.na(cleaned$participants$part_age_exact[1]))
  expect_true(is.na(cleaned$participants$part_age_exact[2]))
})

# Test 6: Survey with no cleaning needed
test_that("clean() handles survey that doesn't need cleaning", {
  # polymod is already clean
  cleaned <- clean(polymod)

  # Should return without errors and maintain structure
  expect_s3_class(cleaned, "contact_survey")
  expect_true("participants" %in% names(cleaned))
  expect_true("contacts" %in% names(cleaned))
})

# Test 7: Empty survey
test_that("clean() handles empty participants", {
  participants <- data.frame(
    part_id = integer(0),
    part_age = character(0),
    stringsAsFactors = FALSE
  )
  contacts <- data.frame(
    part_id = integer(0),
    cnt_age_exact = integer(0)
  )
  survey <- new_contact_survey(participants, contacts)

  cleaned <- clean(survey)
  expect_identical(nrow(cleaned$participants), 0L)
})

# Test 8: Survey without country column
test_that("clean() handles survey without country column", {
  participants <- data.frame(
    part_id = 1:2,
    part_age = c("25", "30"),
    stringsAsFactors = FALSE
  )
  survey <- create_test_survey(participants)

  cleaned <- clean(survey)
  expect_false("country" %in% names(cleaned$participants))
})

# Test 9: Custom participant_age_column
test_that("clean() works with custom participant_age_column", {
  participants <- data.frame(
    part_id = 1:2,
    age = c("20-30", "40-50"),
    stringsAsFactors = FALSE
  )
  survey <- create_test_survey(participants)
  cleaned <- clean(survey, participant_age_column = "age")

  expect_true("age_est_min" %in% names(cleaned$participants))
  expect_true("age_est_max" %in% names(cleaned$participants))
  expect_identical(cleaned$participants$age_est_min, c(20, 40))
  expect_identical(cleaned$participants$age_est_max, c(30, 50))
})

# Test 10: Deprecated argument warning
test_that("clean() deprecated argument produces warning", {
  lifecycle::expect_deprecated(
    clean(polymod, participant.age.column = "part_age")
  )
})

# Test 11: Survey with numeric ages (no conversion needed)
test_that("clean() handles already numeric age column", {
  participants <- data.frame(
    part_id = 1:3,
    part_age = c(25, 30, 45)
  )
  survey <- create_test_survey(participants)
  cleaned <- clean(survey)

  # Numeric ages without NAs shouldn't trigger age cleaning
  expect_identical(cleaned$participants$part_age, c(25, 30, 45))
})

# Test 12: Age unit conversion (months to years)
test_that("clean() converts months to years", {
  participants <- data.frame(
    part_id = 1:2,
    part_age = c("6 months", "18 months"),
    stringsAsFactors = FALSE
  )
  survey <- create_test_survey(participants)
  cleaned <- clean(survey)

  expect_equal(cleaned$participants$part_age_est_min[1], 0.5, tolerance = 0.01)
  expect_equal(cleaned$participants$part_age_est_min[2], 1.5, tolerance = 0.01)
})

test_that("clean() converts weeks to years", {
  participants <- data.frame(
    part_id = 1:2,
    part_age = c("52 weeks", "26 weeks"),
    stringsAsFactors = FALSE
  )
  survey <- create_test_survey(participants)
  cleaned <- clean(survey)

  expect_equal(cleaned$participants$part_age_est_min[1], 1, tolerance = 0.02)
  expect_equal(cleaned$participants$part_age_est_min[2], 0.5, tolerance = 0.02)
})

# Test 13: Mixed NA and valid ages
test_that("clean() handles mix of valid and NA ages", {
  participants <- data.frame(
    part_id = 1:4,
    part_age = c(25, NA, 30, NA)
  )
  survey <- create_test_survey(participants)
  cleaned <- clean(survey)

  # NA values should trigger age cleaning but numeric values are preserved
  expect_true("part_age_est_min" %in% names(cleaned$participants))
  expect_identical(cleaned$participants$part_age_est_min[1], 25)
  expect_true(is.na(cleaned$participants$part_age_est_min[2]))
  expect_identical(cleaned$participants$part_age_est_min[3], 30)
  expect_true(is.na(cleaned$participants$part_age_est_min[4]))
})

Try the socialmixr package in your browser

Any scripts or data that you put into this service are public.

socialmixr documentation built on April 29, 2026, 9:07 a.m.