Nothing
library(SmokingHistoryGenerator)
library(glue)
library(testthat)
# Normalize CRLF / stray \\r for cross-platform comparison (Windows may emit CRLF in file output).
read_output_lines <- function(path) {
lines <- readLines(path, warn = FALSE, encoding = "UTF-8")
if (length(lines)) {
lines[1] <- sub("^\ufeff", "", lines[1])
}
gsub("\r", "", lines, fixed = TRUE)
}
# Locate the legacy XML <RUN>...</RUN> data block (tolerant of whitespace).
xml_run_bounds <- function(lines) {
trimmed <- trimws(lines)
run_start <- which(trimmed == "<RUN>")
run_end <- which(trimmed == "</RUN>")
list(start = run_start, end = run_end)
}
extract_tag <- function(vector, tag) {
# Find all occurrences of start and end tags
start_tag <- paste0("<", tag, ">")
end_tag <- paste0("</", tag, ">")
start_indices <- which(grepl(start_tag, vector, fixed = TRUE))
end_indices <- which(grepl(end_tag, vector, fixed = TRUE))
# Check if tags exist
if (length(start_indices) == 0 || length(end_indices) == 0) {
return(NULL)
}
# Handle single-line case
if (any(start_indices == end_indices)) {
single_line_idx <- intersect(start_indices, end_indices)[1]
line_content <- vector[single_line_idx]
content <- gsub(paste0(".*", start_tag, "(.+)", end_tag, ".*"), "\\1", line_content)
return(content)
}
# Handle multi-line case
first_start <- start_indices[1]+1
first_end <- end_indices[end_indices > first_start][1]-1
if (is.na(first_end)) {
return(NULL)
}
return(vector[first_start:first_end])
}
# Legacy XML goldens: compare only tags that define the simulation rows, not <RUNINFO>.
# <RUNINFO>/<DATAFILES> paths differ by machine, cwd, and devtools vs R CMD check; goldens
# use portable inst/extdata + tests/... strings. extract_tag(..., "RUN") ignores all of that.
get_run_details <- function(file_path) {
vector <- read_output_lines(file_path)
run <- extract_tag(vector, "RUN")
cessation <- extract_tag(vector, "CESSATION_YR")
return(list(run = run, cessation = cessation))
}
# Core version embedded in legacy XML goldens (CLI WriteRunInfoTag). When goldens are pinned to
# an older CLI (e.g. 6.4.0 + wide .txt inputs), R parity checks against the current engine skip.
read_fixture_core_ver <- function(file_path) {
v <- extract_tag(read_output_lines(file_path), "VERSION")
if (is.null(v) || !length(v)) {
return(NA_character_)
}
as.character(v[[1]])
}
write_input_file_from_template <- function(template_path, rng_strategy, yob, cessation_yr, data_folder, outputs_folder) {
# The main motivation to write custom config files was due to pathing discrepancies between devtools:test() and CMD Check
input_filepath <- test_path(glue("../inputs/test_input_{rng_strategy}_{yob}_{cessation_yr}.txt"))
formatted_input <- glue(paste(template_path, collapse = "\n"))
writeLines(as.character(formatted_input), con = input_filepath)
return(input_filepath)
}
generate_output <- function(rng_strategy, yob, cessation_yr, outputs_dir_abs) {
template_path <- readLines(test_path("../templates/test_input_example.txt"))
input_filepath <- write_input_file_from_template(
template_path, rng_strategy, yob, cessation_yr, data_folder, outputs_dir_abs
)
shg$LegacyRunWebVersion(input_filepath)
out_file <- file.path(outputs_dir_abs, glue("test_output_{rng_strategy}_{yob}_{cessation_yr}.txt"))
return(get_run_details(out_file))
}
clear_test_artifacts <- function(folder) {
folder <- test_path(folder)
if (dir.exists(folder)) {
files <- list.files(folder, full.names = TRUE)
file.remove(files)
file.remove(folder)
}
}
get_mean_from_column <- function(df, column) {
return(mean(df[[column]][df[[column]] >= 0]))
}
get_stats_from_df <- function(df) {
mean_initiation <- get_mean_from_column(df, "smoking_initiation_age")
mean_cessation <- get_mean_from_column(df, "smoking_cessation_age")
age_at_death <- get_mean_from_column(df, "age_at_death")
return(list(mean_initiation = mean_initiation, mean_cessation = mean_cessation, age_at_death = age_at_death))
}
# Integer race/sex/birth_cohort columns for runSimFromDataFrame (typed literals hidden here)
test_pop_df <- function(n, race = 0, sex = 0, birth_cohort = 1940) {
data.frame(
race = as.integer(rep(race, n)),
sex = as.integer(rep(sex, n)),
birth_cohort = as.integer(rep(birth_cohort, n))
)
}
# Tests
shg <- new(SHGInterface)
# Legacy XML fixtures were generated with ACM (all-cause) mortality tables
shg$mortality_filename <- "mort/acm.csv"
shg$num_threads <- 1
shg$number_of_segments <- 1
shg$rng_strategy <- "MersenneTwister"
N <- 10^4 # Individuals to simulate (REPEAT)
# TODO: maybe a better way to reference the input data folder in the package?
# Bundled CSV inputs install under system.file("extdata", "2018", package=...) (see inst/extdata in source).
# LegacyRunWebVersion ignores input_data_folder; config paths are cwd-relative or absolute.
data_folder <- system.file("extdata", "2018", package = "SmokingHistoryGenerator")
test_that("SHG extdata folder exists and contains bundled CSV inputs", {
expect_true(nzchar(data_folder) && dir.exists(data_folder))
expect_true(file.exists(file.path(data_folder, "smok", "initiation.csv")))
})
shg$input_data_folder <- data_folder
clear_test_artifacts("../inputs")
clear_test_artifacts("../outputs")
dir.create(test_path("../inputs"), recursive = TRUE, showWarnings = FALSE)
dir.create(test_path("../outputs"), recursive = TRUE, showWarnings = FALSE)
outputs_dir_abs <- normalizePath(test_path("../outputs"), winslash = "/", mustWork = FALSE)
outputs_folder <- outputs_dir_abs
MT_output_A <- generate_output("MersenneTwister", 1950, 0, outputs_folder)
MT_fixture_A <- get_run_details(test_path("../fixtures/2018/MT/yob_1950_cessation_0.txt"))
MT_output_B <- generate_output("MersenneTwister", 2010, 2050, outputs_folder)
MT_fixture_B <- get_run_details(test_path("../fixtures/2018/MT/yob_2010_cessation_2050.txt"))
# One canonical fixture per scenario: same config must produce the same <RUN> lines on every OS.
# If a platform diverges, fix determinism in the engine — do not maintain alternate goldens or relaxed checks.
# Path-agnostic: we pass only the <RUN>...</RUN> line vectors from get_run_details(), never whole-file XML.
compare_legacy_run_body <- function(actual_run, fixture_run) {
expect_equal(actual_run, fixture_run)
}
fixture_core <- read_fixture_core_ver(test_path("../fixtures/2018/MT/yob_1950_cessation_0.txt"))
pkg_core <- shg$get_shg_core_version()
fixture_skip_msg <- paste0(
"2018 XML goldens are core ", fixture_core, "; this build is ", pkg_core,
". Regenerate with tools/regenerate-legacy-xml-fixtures-cli.sh using a matching CLI, or upgrade goldens."
)
test_that("MersenneTwister simulation output in R does not differ from C++ fixtures", {
skip_if(!is.na(fixture_core) && !identical(fixture_core, pkg_core), fixture_skip_msg)
compare_legacy_run_body(MT_output_A$run, MT_fixture_A$run)
expect_equal(MT_output_A$cessation, "0")
expect_equal(MT_fixture_A$cessation, "0")
compare_legacy_run_body(MT_output_B$run, MT_fixture_B$run)
expect_equal(MT_output_B$cessation, "2050")
expect_equal(MT_fixture_B$cessation, "2050")
})
RS_output_A <- generate_output("RngStream", 1950, 0, outputs_folder)
RS_fixture_A <- get_run_details(test_path("../fixtures/2018/RS/yob_1950_cessation_0.txt"))
RS_output_B <- generate_output("RngStream", 2010, 2050, outputs_folder)
RS_fixture_B <- get_run_details(test_path("../fixtures/2018/RS/yob_2010_cessation_2050.txt"))
test_that("RngStream simulation output in R does not differ from C++ fixtures", {
skip_if(!is.na(fixture_core) && !identical(fixture_core, pkg_core), fixture_skip_msg)
compare_legacy_run_body(RS_output_A$run, RS_fixture_A$run)
expect_equal(RS_output_A$cessation, "0")
expect_equal(RS_fixture_A$cessation, "0")
compare_legacy_run_body(RS_output_B$run, RS_fixture_B$run)
expect_equal(RS_output_B$cessation, "2050")
expect_equal(RS_fixture_B$cessation, "2050")
})
test_that("MT vs RngStream and fixed vs dataframe entry points agree", {
shg$rng_strategy <- "MersenneTwister"
MT_SIM <- shg$runSimFromFixedValues(N, 0, 0, 1940)
shg$rng_strategy <- "RngStream"
RS_SIM <- shg$runSimFromFixedValues(N, 0, 0, 1940)
MT_STATS <- get_stats_from_df(MT_SIM)
RS_STATS <- get_stats_from_df(RS_SIM)
expect_equal(dim(RS_SIM), dim(MT_SIM))
expect_equal(MT_STATS$mean_initiation, RS_STATS$mean_initiation, tolerance = 0.01)
# MT vs RngStream can differ more on cessation means for sparse older cohorts at N=1e4
expect_equal(MT_STATS$mean_cessation, RS_STATS$mean_cessation, tolerance = 1)
expect_equal(MT_STATS$age_at_death, RS_STATS$age_at_death, tolerance = 0.01)
# If MT_STATS and RS_STATS are equal, it would indicate there is a problem with the RNG
# Results should be very similar but *not* identical
expect_false(isTRUE(all.equal(MT_STATS, RS_STATS)))
pop <- list(
race = rep(0, N),
sex = rep(0, N),
birth_cohort = rep(1940, N)
)
shg$rng_strategy <- "MersenneTwister"
MT_SIM_POP <- shg$runSimFromDataFrame(pop)
shg$rng_strategy <- "RngStream"
RS_SIM_POP <- shg$runSimFromDataFrame(pop)
MT_STATS_POP <- get_stats_from_df(MT_SIM_POP)
RS_STATS_POP <- get_stats_from_df(RS_SIM_POP)
expect_equal(MT_STATS_POP$mean_initiation, RS_STATS_POP$mean_initiation, tolerance = 0.01)
expect_equal(MT_STATS_POP$mean_cessation, RS_STATS_POP$mean_cessation, tolerance = 1)
expect_equal(MT_STATS_POP$age_at_death, RS_STATS_POP$age_at_death, tolerance = 0.01)
expect_identical(MT_STATS_POP$mean_initiation, MT_STATS$mean_initiation)
expect_identical(MT_STATS_POP$mean_cessation, MT_STATS$mean_cessation)
expect_identical(MT_STATS_POP$age_at_death, MT_STATS$age_at_death)
expect_identical(RS_STATS_POP$mean_initiation, RS_STATS$mean_initiation)
expect_identical(RS_STATS_POP$mean_cessation, RS_STATS$mean_cessation)
expect_identical(RS_STATS_POP$age_at_death, RS_STATS$age_at_death)
})
test_that("Invalid input configuration path fails with proper error message", {
input_filepath <- "file_does_not_exist.txt"
expect_error(shg$LegacyRunWebVersion(input_filepath), "The specified input file 'file_does_not_exist.txt' could not be opened for reading.")
})
test_that("Invalid input parameter path (eg initiation) records error in legacy error file", {
# RunWebVersion catches SimException and writes <ERROR>...</ERROR> to ERRORFILE; do not rely on
# Rcpp::warning() alone (Windows/GHA can surface warnings inconsistently vs expect_warning).
template_path <- readLines(test_path("../templates/test_input_example_incorrect_init_path.txt"))
input_filepath <- write_input_file_from_template(template_path, "MersenneTwister", 1950, 0, data_folder, outputs_folder)
err_path <- file.path(outputs_folder, "test_errors_MersenneTwister_1950_0.txt")
unlink(err_path)
suppressWarnings(shg$LegacyRunWebVersion(input_filepath))
expect_true(file.exists(err_path))
err_txt <- paste(read_output_lines(err_path), collapse = "\n")
expect_match(err_txt, "[Tt]he specified input file")
expect_match(err_txt, "initiation_does_not_exist")
expect_match(err_txt, "LoadProbabilityData")
})
test_that("Invalid output path fails with proper error message", {
template_path <- readLines(test_path("../templates/test_input_example.txt"))
outputs_folder <- "folder_does_not_exist"
input_filepath <- write_input_file_from_template(template_path, "MersenneTwister", 1950, 0, data_folder, outputs_folder)
# C++ may emit Windows path separators in the error string on Win builders
expect_error(
shg$LegacyRunWebVersion(input_filepath),
regexp = "Specified error file: '.*folder_does_not_exist.*test_errors_MersenneTwister_1950_0\\.txt' could not be opened for writing"
)
})
# TODO: Compare Legacy tests with runSimFromFixedValues(): requires parsing of results
# Tests for configuration management
test_that("getConfig() with no arguments works (R does not apply C++ default args)", {
shg <- new(SHGInterface)
cfg <- shg$getConfig()
expect_type(cfg, "list")
expect_equal(cfg$config_version, "1.0")
cfg_named <- shg$getConfig(debug = FALSE)
expect_equal(sort(names(cfg)), sort(names(cfg_named)))
})
test_that("factory default mortality_filename is mortality/acm.csv", {
shg <- new(SHGInterface)
expect_equal(shg$mortality_filename, "mort/acm.csv")
shg$mortality_filename <- "mort/ocm-excl-lung-cancer.csv"
shg$reset_to_factory_defaults()
expect_equal(shg$mortality_filename, "mort/acm.csv")
})
test_that("getConfig() returns correct structure with config_version", {
shg_test <- new(SHGInterface)
config <- shg_test$getConfig(debug = FALSE)
expect_true(is.list(config))
expect_equal(config$config_version, "1.0")
expect_true("rng_strategy" %in% names(config))
expect_true("number_of_segments" %in% names(config))
expect_true("num_threads" %in% names(config))
expect_true("seeds" %in% names(config))
expect_true("input_data_folder" %in% names(config))
expect_true("mortality_filename" %in% names(config))
expect_true("smok_params_source" %in% names(config))
expect_true("mort_params_source" %in% names(config))
expect_true("mort_params_type" %in% names(config))
expect_true("cohort_year" %in% names(config))
expect_true("repeat" %in% names(config))
expect_true("race" %in% names(config))
expect_true("sex" %in% names(config))
expect_true("timestamp" %in% names(config))
})
test_that("getConfig() returns concrete default seeds", {
shg_rs <- new(SHGInterface)
shg_rs$rng_strategy <- "RngStream"
cfg_rs <- shg_rs$getConfig(debug = FALSE)
expect_equal(cfg_rs$seeds, c(12345, 12345, 12345, 12345, 12345, 12345))
shg_mt <- new(SHGInterface)
shg_mt$rng_strategy <- "MersenneTwister"
cfg_mt <- shg_mt$getConfig(debug = FALSE)
expect_equal(cfg_mt$seeds, c(1898587603, 1468371936, 1551308340, 1590227640))
})
test_that("getConfig() keeps intent values after simulation", {
shg_rt <- new(SHGInterface)
shg_rt$input_data_folder <- data_folder
shg_rt$rng_strategy <- "RngStream"
shg_rt$number_of_segments <- -1
shg_rt$num_threads <- -1
shg_rt$runSimFromFixedValues(5000, 0, 0, 1950)
cfg <- shg_rt$getConfig(debug = FALSE)
expect_equal(cfg$number_of_segments, -1)
expect_equal(cfg$num_threads, -1)
})
test_that("getReproConfig() captures effective segments but omits num_threads", {
shg_rt <- new(SHGInterface)
shg_rt$input_data_folder <- data_folder
shg_rt$rng_strategy <- "RngStream"
shg_rt$number_of_segments <- -1
shg_rt$num_threads <- -1
shg_rt$runSimFromFixedValues(5000, 0, 0, 1950)
cfg <- shg_rt$getReproConfig(debug = FALSE)
expect_true(cfg$number_of_segments >= 1)
expect_false(identical(cfg$number_of_segments, -1))
expect_false("num_threads" %in% names(cfg))
expect_equal(shg_rt$num_threads, -1)
})
test_that("getReproConfig() errors before any completed simulation", {
shg_rt <- new(SHGInterface)
expect_error(
shg_rt$getReproConfig(),
"No completed simulation is available"
)
})
test_that("getConfig() records cohort_year for single-cohort runs", {
shg_rt <- new(SHGInterface)
shg_rt$input_data_folder <- data_folder
shg_rt$runSimFromFixedValues(500, 0, 0, 1950)
cfg <- shg_rt$getConfig(debug = FALSE)
expect_equal(cfg$cohort_year, 1950)
})
test_that("getConfig() records repeat/race/sex after runSimFromFixedValues", {
shg_rt <- new(SHGInterface)
shg_rt$input_data_folder <- data_folder
shape <- shg_rt$get_data_shape()
race_value <- as.integer(max(0, shape$num_races - 1))
shg_rt$runSimFromFixedValues(500, race_value, 0, 1950)
cfg <- shg_rt$getConfig(debug = FALSE)
expect_equal(cfg[["repeat"]], 500)
expect_equal(cfg[["race"]], race_value)
expect_equal(cfg[["sex"]], 0)
})
test_that("runSimFromFixedValues errors clearly when cohort/race/sex not available", {
shg_rt <- new(SHGInterface)
shg_rt$input_data_folder <- data_folder
shape <- shg_rt$get_data_shape()
starts <- as.integer(shape$cohort_start_years)
ends <- as.integer(shape$cohort_end_years)
min_y <- min(starts, na.rm = TRUE)
max_y <- max(ends, na.rm = TRUE)
years <- min_y:max_y
covered <- rep(FALSE, length(years))
for (i in seq_along(starts)) {
covered[years >= starts[i] & years <= ends[i]] <- TRUE
}
missing_years <- years[!covered]
skip_if_not(length(missing_years) > 0, "No cohort gaps in current parameter set.")
bad_cohort <- as.integer(missing_years[[1]])
bad_race <- as.integer(shape$num_races)
bad_sex <- as.integer(shape$num_sexes)
valid_cohort <- as.integer(starts[[1]])
out <- "not-assigned"
expect_error(
out <- shg_rt$runSimFromFixedValues(10, 0, 0, bad_cohort),
"Requested cohort_year .* not available"
)
expect_identical(out, "not-assigned")
expect_error(
shg_rt$runSimFromFixedValues(10, bad_race, 0, valid_cohort),
"Requested race value .* not available"
)
expect_error(
shg_rt$runSimFromFixedValues(10, 0, bad_sex, valid_cohort),
"Requested sex value .* not available"
)
})
test_that("getConfig(debug=TRUE) includes debug info", {
shg_test <- new(SHGInterface)
config <- shg_test$getConfig(debug = TRUE)
expect_true("rng_state_fingerprint" %in% names(config))
expect_true("package_version" %in% names(config))
expect_true("package_source" %in% names(config))
expect_true("r_version" %in% names(config))
expect_true("platform" %in% names(config))
expect_true("memory_usage" %in% names(config))
expect_true(nchar(config$package_version) > 0)
expect_true(nchar(config$r_version) > 0)
})
test_that("useConfig() correctly configures instance", {
shg1 <- new(SHGInterface)
shg1$rng_strategy <- "RngStream"
shg1$number_of_segments <- 4
shg1$num_threads <- -1 # auto multi-threaded
shg1$input_data_folder <- "/test/path"
shg1$immediate_cessation_year <- 2025
config <- shg1$getConfig(debug = FALSE)
shg2 <- new(SHGInterface)
shg2$useConfig(config)
expect_equal(shg2$rng_strategy, shg1$rng_strategy)
expect_equal(shg2$number_of_segments, shg1$number_of_segments)
expect_equal(shg2$num_threads, shg1$num_threads)
expect_equal(shg2$input_data_folder, shg1$input_data_folder)
expect_equal(shg2$immediate_cessation_year, shg1$immediate_cessation_year)
})
test_that("useConfig() clears stale effective runtime cache", {
shg_rt <- new(SHGInterface)
shg_rt$input_data_folder <- data_folder
shg_rt$rng_strategy <- "RngStream"
shg_rt$number_of_segments <- -1
shg_rt$num_threads <- -1
shg_rt$runSimFromFixedValues(500, 0, 0, 1950)
cfg_repro <- shg_rt$getReproConfig()
expect_true(cfg_repro$number_of_segments >= 1)
shg_rt$useConfig(list(
config_version = "1.0",
number_of_segments = -1,
num_threads = -1,
rng_strategy = "RngStream"
))
cfg_intent <- shg_rt$getConfig()
expect_equal(cfg_intent$number_of_segments, -1)
expect_equal(cfg_intent$num_threads, -1)
expect_error(
shg_rt$getReproConfig(),
"No completed simulation is available"
)
})
test_that("useConfig() validates config_version", {
shg_test <- new(SHGInterface)
# Missing config_version should still work (assume current format)
config_no_version <- list(rng_strategy = "RngStream")
expect_silent(shg_test$useConfig(config_no_version))
# Unsupported version should warn
config_bad_version <- list(config_version = "2.0", rng_strategy = "RngStream")
expect_warning(shg_test$useConfig(config_bad_version), "may not be fully supported")
})
test_that("useConfig() warns on unknown fields", {
shg_test <- new(SHGInterface)
config <- list(
config_version = "1.0",
rng_strategy = "RngStream",
unknown_field = "test"
)
expect_warning(shg_test$useConfig(config), "Unknown config field")
})
test_that("useConfig() clears stale params provenance when input paths change without bundle keys", {
shg <- new(SHGInterface)
shg$smok_params_source <- "https://example.invalid/smok.zip"
shg$mort_params_source <- "https://example.invalid/mort.zip"
shg$mort_params_type <- "ocm"
shg$useConfig(list(
config_version = "1.0",
rng_strategy = "RngStream",
input_data_folder = data_folder
))
cfg <- shg$getConfig()
expect_true(is.na(cfg$smok_params_source))
expect_true(is.na(cfg$mort_params_source))
expect_true(is.na(cfg$mort_params_type))
})
test_that("Round-trip: getConfig() -> useConfig() -> verify", {
shg1 <- new(SHGInterface)
shg1$num_threads <- 1
shg1$number_of_segments <- 1
shg1$rng_strategy <- "MersenneTwister"
shg1$immediate_cessation_year <- 2020
config <- shg1$getConfig(debug = FALSE)
# Save and reload simulation
temp_file <- tempfile(fileext = ".rds")
saveRDS(config, temp_file)
config_loaded <- readRDS(temp_file)
shg2 <- new(SHGInterface)
shg2$useConfig(config_loaded)
cfg2 <- shg2$getConfig(debug = FALSE)
expect_equal(shg2$rng_strategy, shg1$rng_strategy)
expect_equal(shg2$number_of_segments, shg1$number_of_segments)
expect_equal(shg2$num_threads, shg1$num_threads)
expect_equal(shg2$immediate_cessation_year, shg1$immediate_cessation_year)
expect_equal(cfg2$cohort_year, config_loaded$cohort_year)
expect_equal(cfg2[["repeat"]], config_loaded[["repeat"]])
expect_equal(cfg2[["race"]], config_loaded[["race"]])
expect_equal(cfg2[["sex"]], config_loaded[["sex"]])
unlink(temp_file)
})
test_that("Constructor with config parameter works", {
config <- list(
config_version = "1.0",
rng_strategy = "RngStream",
number_of_segments = 4,
num_threads = -1, # auto multi-threaded
immediate_cessation_year = 2025
)
shg <- new(SHGInterface, config = config)
expect_equal(shg$rng_strategy, "RngStream")
expect_equal(shg$number_of_segments, 4)
expect_equal(shg$num_threads, -1)
expect_equal(shg$immediate_cessation_year, 2025)
})
test_that("Constructor with empty config works", {
shg <- new(SHGInterface, config = list())
# Should use defaults
expect_equal(shg$rng_strategy, "RngStream")
expect_equal(shg$number_of_segments, -1) # default is -1 (auto)
})
# Tests for MersenneTwister restrictions
test_that("MersenneTwister cannot be used with multiple segments", {
shg_test <- new(SHGInterface)
suppressMessages(shg_test$rng_strategy <- "MersenneTwister")
expect_error(shg_test$number_of_segments <- 2, "MersenneTwister RNG cannot maintain IID properties with multiple segments")
})
test_that("MersenneTwister cannot be used with multi-threading", {
shg_test <- new(SHGInterface)
suppressMessages(shg_test$rng_strategy <- "MersenneTwister")
expect_error(shg_test$num_threads <- -1, "MersenneTwister RNG requires single-threaded execution")
expect_error(shg_test$num_threads <- 4, "MersenneTwister RNG requires single-threaded execution")
})
test_that("Switching to MersenneTwister resets segments and threads to valid values", {
shg_test <- new(SHGInterface)
shg_test$rng_strategy <- "RngStream"
shg_test$number_of_segments <- 4
shg_test$num_threads <- -1 # auto multi-threaded
expect_message(
expect_message(
shg_test$rng_strategy <- "MersenneTwister",
"Resetting number_of_segments to 1"
),
"Resetting num_threads to 1"
)
expect_equal(shg_test$number_of_segments, 1)
expect_equal(shg_test$num_threads, 1)
})
test_that("Multi-threading with single segment warns but allows", {
shg_test <- new(SHGInterface)
shg_test$number_of_segments <- 1
expect_warning(shg_test$num_threads <- -1, "num_threads > 1 or -1 \\(auto\\) has no effect when number_of_segments is 1")
})
test_that("MersenneTwister with multiple segments is reset to 1 segment", {
shg_test <- new(SHGInterface)
shg_test$input_data_folder <- data_folder
# Use RngStream then switch to MersenneTwister
shg_test$rng_strategy <- "RngStream"
shg_test$number_of_segments <- 2
expect_message(
expect_message(
shg_test$rng_strategy <- "MersenneTwister",
"Resetting number_of_segments to 1"
),
"Resetting num_threads to 1"
)
expect_equal(shg_test$number_of_segments, 1)
# Should work fine with 1 segment
pop <- list(
race = rep(0, 100),
sex = rep(0, 100),
birth_cohort = rep(1940, 100)
)
result <- shg_test$runSimFromDataFrame(pop)
expect_equal(nrow(result), 100)
})
test_that("RngStream allows multiple segments and multi-threading", {
shg_test <- new(SHGInterface)
shg_test$rng_strategy <- "RngStream"
shg_test$number_of_segments <- 4
shg_test$num_threads <- -1 # auto multi-threaded
expect_equal(shg_test$rng_strategy, "RngStream")
expect_equal(shg_test$number_of_segments, 4)
expect_equal(shg_test$num_threads, -1)
})
test_that("MersenneTwister: custom seeds produce different results and reverting to defaults restores original results", {
N <- 1000
shg_mt <- new(SHGInterface)
shg_mt$input_data_folder <- data_folder
shg_mt$num_threads <- 1
shg_mt$number_of_segments <- 1
shg_mt$rng_strategy <- "MersenneTwister"
# Baseline: run with default seeds (no seeds set)
baseline <- shg_mt$runSimFromFixedValues(N, 0, 0, 1940)
baseline_stats <- get_stats_from_df(baseline)
# Run with custom seeds
shg_mt$mt_seeds <- c(1111111111, 2222222222, 3333333333, 4444444444)
custom_seed_run <- shg_mt$runSimFromFixedValues(N, 0, 0, 1940)
custom_stats <- get_stats_from_df(custom_seed_run)
# Verify custom seeds produce different results
expect_false(isTRUE(all.equal(baseline_stats, custom_stats)))
# Run with different custom seeds
shg_mt$mt_seeds <- c(9999999999, 8888888888, 7777777777, 6666666666)
different_seed_run <- shg_mt$runSimFromFixedValues(N, 0, 0, 1940)
different_stats <- get_stats_from_df(different_seed_run)
# Verify different seeds produce different results
expect_false(isTRUE(all.equal(custom_stats, different_stats)))
# Revert to defaults by manually setting default seeds
# Default MT seeds: 1898587603, 1468371936, 1551308340, 1590227640
shg_mt$mt_seeds <- c(1898587603, 1468371936, 1551308340, 1590227640)
reset_run <- shg_mt$runSimFromFixedValues(N, 0, 0, 1940)
reset_stats <- get_stats_from_df(reset_run)
# Verify reverting to defaults produces same results as baseline
expect_equal(baseline_stats$mean_initiation, reset_stats$mean_initiation)
expect_equal(baseline_stats$mean_cessation, reset_stats$mean_cessation)
expect_equal(baseline_stats$age_at_death, reset_stats$age_at_death)
})
test_that("RngStream: custom seed produces different results and reverting to defaults restores original results", {
N <- 1000
shg_rs <- new(SHGInterface)
shg_rs$input_data_folder <- data_folder
shg_rs$rng_strategy <- "RngStream"
shg_rs$number_of_segments <- 1
shg_rs$num_threads <- 1 # single-threaded
# Baseline: run with default seed (no seed set)
baseline <- shg_rs$runSimFromFixedValues(N, 0, 0, 1940)
baseline_stats <- get_stats_from_df(baseline)
# Run with custom seed
shg_rs$rngstream_seed <- c(11111, 22222, 33333, 44444, 55555, 66666)
custom_seed_run <- shg_rs$runSimFromFixedValues(N, 0, 0, 1940)
custom_stats <- get_stats_from_df(custom_seed_run)
# Verify custom seed produces different results
expect_false(isTRUE(all.equal(baseline_stats, custom_stats)))
# Run with different custom seed
shg_rs$rngstream_seed <- c(99999, 88888, 77777, 66666, 55555, 44444)
different_seed_run <- shg_rs$runSimFromFixedValues(N, 0, 0, 1940)
different_stats <- get_stats_from_df(different_seed_run)
# Verify different seed produces different results
expect_false(isTRUE(all.equal(custom_stats, different_stats)))
# Revert to defaults by manually setting default seed
# Default RngStream seed: c(12345, 12345, 12345, 12345, 12345, 12345)
shg_rs$rngstream_seed <- c(12345, 12345, 12345, 12345, 12345, 12345)
reset_run <- shg_rs$runSimFromFixedValues(N, 0, 0, 1940)
reset_stats <- get_stats_from_df(reset_run)
# Verify reverting to defaults produces same results as baseline
expect_equal(baseline_stats$mean_initiation, reset_stats$mean_initiation)
expect_equal(baseline_stats$mean_cessation, reset_stats$mean_cessation)
expect_equal(baseline_stats$age_at_death, reset_stats$age_at_death)
})
test_that("get_current_seeds() returns correct seeds based on RNG strategy", {
shg_mt <- new(SHGInterface)
shg_mt$num_threads <- 1
shg_mt$number_of_segments <- 1
shg_mt$rng_strategy <- "MersenneTwister"
shg_mt$mt_seeds <- c(1111111111, 2222222222, 3333333333, 4444444444)
# Should return MT seeds when MT strategy is selected
current_seeds <- shg_mt$get_current_seeds()
expect_equal(length(current_seeds), 4)
expect_equal(current_seeds, c(1111111111, 2222222222, 3333333333, 4444444444))
shg_rs <- new(SHGInterface)
shg_rs$rng_strategy <- "RngStream"
shg_rs$rngstream_seed <- c(11111, 22222, 33333, 44444, 55555, 66666)
# Should return RngStream seed when RngStream strategy is selected
current_seeds_rs <- shg_rs$get_current_seeds()
expect_equal(length(current_seeds_rs), 6)
expect_equal(current_seeds_rs, c(11111, 22222, 33333, 44444, 55555, 66666))
})
test_that("reset_seeds_to_defaults() resets seeds to default values", {
N <- 1000
shg_mt <- new(SHGInterface)
shg_mt$input_data_folder <- data_folder
shg_mt$num_threads <- 1
shg_mt$number_of_segments <- 1
shg_mt$rng_strategy <- "MersenneTwister"
# Set custom seeds
shg_mt$mt_seeds <- c(1111111111, 2222222222, 3333333333, 4444444444)
custom_run <- shg_mt$runSimFromFixedValues(N, 0, 0, 1940)
custom_stats <- get_stats_from_df(custom_run)
# Reset to defaults using the method
shg_mt$reset_seeds_to_defaults()
# Verify seeds were reset
current_seeds <- shg_mt$get_current_seeds()
expect_equal(current_seeds, c(1898587603, 1468371936, 1551308340, 1590227640))
# Verify reset produces same results as baseline
baseline <- shg_mt$runSimFromFixedValues(N, 0, 0, 1940)
baseline_stats <- get_stats_from_df(baseline)
# Create a fresh instance for comparison
shg_baseline <- new(SHGInterface)
shg_baseline$input_data_folder <- data_folder
shg_baseline$num_threads <- 1
shg_baseline$number_of_segments <- 1
shg_baseline$rng_strategy <- "MersenneTwister"
# Don't set seeds, so defaults will be used
baseline_fresh <- shg_baseline$runSimFromFixedValues(N, 0, 0, 1940)
baseline_fresh_stats <- get_stats_from_df(baseline_fresh)
expect_equal(baseline_stats$mean_initiation, baseline_fresh_stats$mean_initiation)
expect_equal(baseline_stats$mean_cessation, baseline_fresh_stats$mean_cessation)
expect_equal(baseline_stats$age_at_death, baseline_fresh_stats$age_at_death)
# Test RngStream reset
shg_rs <- new(SHGInterface)
shg_rs$input_data_folder <- data_folder
shg_rs$rng_strategy <- "RngStream"
shg_rs$number_of_segments <- 1
shg_rs$num_threads <- 1 # single-threaded
# Set custom seed
shg_rs$rngstream_seed <- c(11111, 22222, 33333, 44444, 55555, 66666)
# Reset to defaults using the method
shg_rs$reset_seeds_to_defaults()
# Verify seeds were reset
current_seeds_rs <- shg_rs$get_current_seeds()
expect_equal(current_seeds_rs, c(12345, 12345, 12345, 12345, 12345, 12345))
})
test_that("get_rng_state_fingerprint() verifies seeds actually affect RNG internal state", {
shg_mt1 <- new(SHGInterface)
shg_mt1$input_data_folder <- data_folder
shg_mt1$num_threads <- 1
shg_mt1$number_of_segments <- 1
shg_mt1$rng_strategy <- "MersenneTwister"
shg_mt1$mt_seeds <- c(1111111111, 2222222222, 3333333333, 4444444444)
# Get fingerprint with custom seeds
fingerprint1 <- shg_mt1$get_rng_state_fingerprint()
expect_equal(length(fingerprint1), 12) # MT returns 12 values (3 from each of 4 streams)
# Set different seeds
shg_mt1$mt_seeds <- c(9999999999, 8888888888, 7777777777, 6666666666)
fingerprint2 <- shg_mt1$get_rng_state_fingerprint()
# Verify different seeds produce different fingerprints
expect_false(isTRUE(all.equal(fingerprint1, fingerprint2)))
# Reset to defaults
shg_mt1$reset_seeds_to_defaults()
fingerprint_default <- shg_mt1$get_rng_state_fingerprint()
# Verify default seeds produce different fingerprint than custom seeds
expect_false(isTRUE(all.equal(fingerprint1, fingerprint_default)))
expect_false(isTRUE(all.equal(fingerprint2, fingerprint_default)))
# Test RngStream
shg_rs1 <- new(SHGInterface)
shg_rs1$input_data_folder <- data_folder
shg_rs1$rng_strategy <- "RngStream"
shg_rs1$rngstream_seed <- c(11111, 22222, 33333, 44444, 55555, 66666)
# Get fingerprint with custom seed
fingerprint_rs1 <- shg_rs1$get_rng_state_fingerprint()
expect_equal(length(fingerprint_rs1), 24) # RngStream returns 24 values (6 from each of 4 streams)
# Set different seed
shg_rs1$rngstream_seed <- c(99999, 88888, 77777, 66666, 55555, 44444)
fingerprint_rs2 <- shg_rs1$get_rng_state_fingerprint()
# Verify different seeds produce different fingerprints
expect_false(isTRUE(all.equal(fingerprint_rs1, fingerprint_rs2)))
# Reset to defaults
shg_rs1$reset_seeds_to_defaults()
fingerprint_rs_default <- shg_rs1$get_rng_state_fingerprint()
# Verify default seed produces different fingerprint than custom seeds
expect_false(isTRUE(all.equal(fingerprint_rs1, fingerprint_rs_default)))
expect_false(isTRUE(all.equal(fingerprint_rs2, fingerprint_rs_default)))
# Verify that same seeds produce same fingerprints (for RngStream, which returns actual state)
shg_rs2 <- new(SHGInterface)
shg_rs2$input_data_folder <- data_folder
shg_rs2$rng_strategy <- "RngStream"
shg_rs2$rngstream_seed <- c(11111, 22222, 33333, 44444, 55555, 66666)
fingerprint_rs2_same <- shg_rs2$get_rng_state_fingerprint()
# RngStream should produce identical fingerprints for same seed
expect_equal(fingerprint_rs1, fingerprint_rs2_same)
})
# TODO: Compare Legacy tests with runSimFromFixedValues(): requires parsing of results
# ============================================================
# CPD Format Tests
# ============================================================
test_that("cpd_format = none produces no CPD column", {
shg <- new(SHGInterface)
shg$input_data_folder <- data_folder
shg$cpd_format <- "none"
shg$rng_strategy <- "RngStream"
shg$rngstream_seed <- c(12345, 12345, 12345, 12345, 12345, 12345)
shg$number_of_segments <- 1
shg$num_threads <- 1
N <- 100
df <- test_pop_df(N)
result <- shg$runSimFromDataFrame(df)
expect_equal(nrow(result), N)
expect_true("smoking_initiation_age" %in% names(result))
expect_false("cigarettes_per_day" %in% names(result))
})
test_that("cpd_format = sparse produces compact CPD", {
shg <- new(SHGInterface)
shg$input_data_folder <- data_folder
shg$cpd_format <- "sparse"
shg$rng_strategy <- "RngStream"
shg$rngstream_seed <- c(12345, 12345, 12345, 12345, 12345, 12345)
shg$number_of_segments <- 1
shg$num_threads <- 1
N <- 100
df <- test_pop_df(N)
result <- shg$runSimFromDataFrame(df)
expect_true("cigarettes_per_day" %in% names(result))
# Sparse format should NOT have parentheses (no age info)
smoker_idx <- which(result$smoking_initiation_age != -999)[1]
if (!is.na(smoker_idx)) {
cpd <- result$cigarettes_per_day[smoker_idx]
expect_false(grepl("\\(", cpd), info = "Sparse format should not contain parentheses")
}
})
test_that("cpd_format = legacy produces age-cpd format", {
shg <- new(SHGInterface)
shg$input_data_folder <- data_folder
shg$cpd_format <- "legacy"
shg$rng_strategy <- "RngStream"
shg$rngstream_seed <- c(12345, 12345, 12345, 12345, 12345, 12345)
shg$number_of_segments <- 1
shg$num_threads <- 1
N <- 100
df <- test_pop_df(N)
result <- shg$runSimFromDataFrame(df)
expect_true("cigarettes_per_day" %in% names(result))
# Legacy format should have parentheses (with age info)
smoker_idx <- which(result$smoking_initiation_age != -999)[1]
if (!is.na(smoker_idx)) {
cpd <- result$cigarettes_per_day[smoker_idx]
expect_true(grepl("\\(", cpd), info = "Full format should contain parentheses")
}
})
test_that("cpd_format validation rejects invalid values", {
shg <- new(SHGInterface)
expect_error(shg$cpd_format <- "invalid", "cpd_format must be")
})
test_that("sparse and legacy produce equivalent CPD values", {
N <- 100
df <- test_pop_df(N)
# Sparse format
shg_sparse <- new(SHGInterface)
shg_sparse$input_data_folder <- data_folder
shg_sparse$cpd_format <- "sparse"
shg_sparse$rng_strategy <- "RngStream"
shg_sparse$rngstream_seed <- c(12345, 12345, 12345, 12345, 12345, 12345)
shg_sparse$number_of_segments <- 1
shg_sparse$num_threads <- 1
result_sparse <- shg_sparse$runSimFromDataFrame(df)
# Legacy format (same seed)
shg_legacy <- new(SHGInterface)
shg_legacy$input_data_folder <- data_folder
shg_legacy$cpd_format <- "legacy"
shg_legacy$rng_strategy <- "RngStream"
shg_legacy$rngstream_seed <- c(12345, 12345, 12345, 12345, 12345, 12345)
shg_legacy$number_of_segments <- 1
shg_legacy$num_threads <- 1
result_legacy <- shg_legacy$runSimFromDataFrame(df)
# Non-CPD columns should match exactly
expect_equal(result_sparse$smoking_initiation_age, result_legacy$smoking_initiation_age)
expect_equal(result_sparse$smoking_cessation_age, result_legacy$smoking_cessation_age)
expect_equal(result_sparse$age_at_death, result_legacy$age_at_death)
# Extract CPD values from both formats and compare
for (i in 1:min(10, N)) {
if (result_sparse$smoking_initiation_age[i] != -999) {
sparse_vals <- as.numeric(strsplit(result_sparse$cigarettes_per_day[i], ", ")[[1]])
legacy_vals <- as.numeric(gsub(".*\\(([0-9.]+)\\)", "\\1",
strsplit(result_legacy$cigarettes_per_day[i], ", ")[[1]]))
expect_equal(sparse_vals, legacy_vals, info = paste("Individual", i))
}
}
})
# ============================================================
# File Output Mode Tests
test_that("Windows: disk output + multi-thread fails before simulation (no output file)", {
skip_if_not(.Platform$OS.type == "windows")
output_path <- tempfile(fileext = ".csv")
on.exit(unlink(output_path), add = TRUE)
shg <- new(SHGInterface)
shg$input_data_folder <- data_folder
shg$rng_strategy <- "RngStream"
shg$rngstream_seed <- c(12345, 12345, 12345, 12345, 12345, 12345)
shg$number_of_segments <- 1
suppressWarnings(shg$num_threads <- -1) # set_num_threads warns when segments==1 (unused threads)
shg$output_file <- output_path
df <- test_pop_df(1)
expect_error(
shg$runSimFromDataFrame(df),
"cannot be used with multi-threaded execution"
)
expect_false(file.exists(output_path), info = "must stop before creating output")
})
test_that("output_file writes results to disk", {
output_path <- tempfile(fileext = ".csv")
shg <- new(SHGInterface)
shg$input_data_folder <- data_folder
shg$rng_strategy <- "RngStream"
shg$rngstream_seed <- c(12345, 12345, 12345, 12345, 12345, 12345)
shg$number_of_segments <- 2
# Windows forbids disk + num_threads != 1; Unix test still exercises 2 segments / 2 threads.
if (.Platform$OS.type == "windows") {
shg$num_threads <- 1
} else {
shg$num_threads <- 2
}
shg$output_file <- output_path
N <- 1000
df <- test_pop_df(N)
result <- shg$runSimFromDataFrame(df)
# Should return info DataFrame
expect_true(grepl("file", result$info, ignore.case = TRUE))
expect_equal(result$rows, N)
# File should exist
expect_true(file.exists(output_path))
lines <- read_output_lines(output_path)
# Find data section (between <RUN> and </RUN>)
rb <- xml_run_bounds(lines)
run_start <- rb$start
run_end <- rb$end
expect_true(length(run_start) > 0 && length(run_end) > 0, "File should have <RUN> tags")
data_lines <- lines[(run_start[1]+1):(run_end[1]-1)]
expect_equal(length(data_lines), N) # N data lines
# Header should have expected XML structure
expect_true(any(grepl("<VERSION>", lines)))
unlink(output_path)
})
test_that("runSimFromDataFrame output_file argument writes file without mutating property", {
output_path <- tempfile(fileext = ".csv")
on.exit(unlink(output_path), add = TRUE)
shg <- new(SHGInterface)
shg$input_data_folder <- data_folder
shg$rng_strategy <- "RngStream"
shg$rngstream_seed <- c(12345, 12345, 12345, 12345, 12345, 12345)
shg$number_of_segments <- 1
shg$num_threads <- 1
shg$output_file <- ""
N <- 200
df <- test_pop_df(N)
result <- shg$runSimFromDataFrame(df, output_path)
expect_true(file.exists(output_path))
expect_true(grepl("file", result$info, ignore.case = TRUE))
expect_equal(result$rows, N)
expect_equal(shg$output_file, "")
})
test_that("output_file parallel execution works (disk + multi-thread, non-Windows)", {
skip_on_os("windows")
output_path <- tempfile(fileext = ".csv")
on.exit(unlink(output_path), add = TRUE)
shg <- new(SHGInterface)
shg$input_data_folder <- data_folder
shg$rng_strategy <- "RngStream"
shg$rngstream_seed <- c(12345, 12345, 12345, 12345, 12345, 12345)
shg$number_of_segments <- 10
shg$num_threads <- -1
shg$output_file <- output_path
N <- 10000
df <- test_pop_df(N)
shg$runSimFromDataFrame(df)
lines <- read_output_lines(output_path)
rb <- xml_run_bounds(lines)
run_start <- rb$start
run_end <- rb$end
expect_true(length(run_start) > 0 && length(run_end) > 0)
data_lines <- lines[(run_start[1] + 1):(run_end[1] - 1)]
expect_equal(length(data_lines), N)
})
test_that("output_file produces same init/cess/ocd as memory mode", {
output_path <- tempfile(fileext = ".csv")
N <- 500
df <- test_pop_df(N)
# Memory mode
shg_mem <- new(SHGInterface)
shg_mem$input_data_folder <- data_folder
shg_mem$rng_strategy <- "RngStream"
shg_mem$rngstream_seed <- c(12345, 12345, 12345, 12345, 12345, 12345)
shg_mem$cpd_format <- "legacy"
shg_mem$number_of_segments <- 1
shg_mem$num_threads <- 1
result_mem <- shg_mem$runSimFromDataFrame(df)
# File mode (same seed)
shg_file <- new(SHGInterface)
shg_file$input_data_folder <- data_folder
shg_file$rng_strategy <- "RngStream"
shg_file$rngstream_seed <- c(12345, 12345, 12345, 12345, 12345, 12345)
shg_file$number_of_segments <- 1
shg_file$num_threads <- 1
shg_file$output_file <- output_path
shg_file$runSimFromDataFrame(df)
# Parse file and compare (skip XML header, find <RUN> tag to get data lines)
lines <- read_output_lines(output_path)
rb <- xml_run_bounds(lines)
run_start <- rb$start
run_end <- rb$end
if (length(run_start) > 0 && length(run_end) > 0) {
data_lines <- lines[(run_start[1]+1):(run_end[1]-1)]
} else {
# Fallback for old format (simple header)
data_lines <- lines[-1]
}
file_init_ages <- sapply(data_lines, function(line) {
as.integer(strsplit(line, ";")[[1]][4])
})
expect_equal(unname(file_init_ages), result_mem$smoking_initiation_age)
unlink(output_path)
})
test_that("NHIS csv-partial bundle lists expected filenames (checked in test-nhis-partial-inputs.R)", {
nh <- test_path("../testdata/2018/csv-partial")
skip_if_not(dir.exists(nh), "NHIS csv-partial fixtures missing (not in CRAN tarball; use full git checkout)")
req <- c(
file.path("smok", "initiation.csv"),
file.path("smok", "cessation.csv"),
file.path("smok", "cpd.csv"),
file.path("mort", "acm.csv"),
file.path("mort", "ocm-excl-lung-cancer.csv")
)
for (f in req) {
expect_true(file.exists(file.path(nh, f)), info = f)
}
})
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.