Nothing
#!/usr/bin/env Rscript
# Cohort sweep benchmark: time runSimFromFixedValues per birth cohort, write CSV + HTML chart.
# Spec: tmp/benchmark.md (and legacy SHG_COHORT_BENCH_* env vars).
#
# Usage (from repo root):
# Rscript tools/benchmark-cohort-variance.R
# Rscript tools/benchmark-cohort-variance.R --n=100000 --step=10 --runs=2
# SHG_COHORT_BENCH_CSV=out.csv SHG_COHORT_BENCH_HTML=out.html Rscript tools/benchmark-cohort-variance.R
#
# Re-render HTML only from a saved CSV (no simulations, no SmokingHistoryGenerator load):
# Rscript tools/benchmark-cohort-variance.R --from-csv=docs/cohort-benchmark-latest.csv
# Rscript tools/benchmark-cohort-variance.R --from-csv=run.csv --html=out.html --render-input-dir=/path/to/data
#
# Optional: SHG_BENCHMARK_USE_PKGLOAD=1 to pkgload::load_all() from source instead of library().
#
# Charts use Highcharts bundled at docs/vendor/highcharts.js (copied beside the HTML output
# so file:// works offline). Refresh the bundle: see docs/vendor/README.txt
repo_root <- (function() {
ca <- commandArgs(trailingOnly = FALSE)
ff <- sub("^--file=", "", ca[grepl("^--file=", ca)][1])
if (nzchar(ff) && file.exists(ff)) {
return(normalizePath(file.path(dirname(normalizePath(ff)), ".."), winslash = "/"))
}
normalizePath(getwd(), winslash = "/")
})()
parse_cli <- function() {
a <- commandArgs(trailingOnly = TRUE)
out <- list(
n = 1e6,
step = 5L,
runs_per_cohort = 3L,
subdir = "csv-complete",
input_root = NA_character_,
year_min = NA_integer_,
year_max = NA_integer_,
immediate = 0L,
cpd_format = "sparse",
rng_strategy = "RngStream",
num_threads = -1L,
number_of_segments = -1L,
csv = Sys.getenv("SHG_COHORT_BENCH_CSV", ""),
html = Sys.getenv("SHG_COHORT_BENCH_HTML", ""),
compare_immediate = Sys.getenv("SHG_COHORT_BENCH_COMPARE_IMMEDIATE_CESSATION", ""),
from_csv = "",
render_input_dir = "",
cpd_none_sweep = "",
nocpd_sweep = "",
nocpd_last_n = 0L,
chart_wall_iter = FALSE,
help = FALSE,
with_nocpd = FALSE
)
for (x in a) {
if (x %in% c("-h", "--help")) {
out$help <- TRUE
next
}
if (!grepl("^--", x)) next
kv <- sub("^--", "", x)
if (!grepl("=", kv, fixed = TRUE)) next
sp <- regexpr("=", kv, fixed = TRUE)
key <- substr(kv, 1L, sp - 1L)
val <- substr(kv, sp + 1L, nchar(kv))
switch(key,
n = ,
N = out$n <- as.numeric(val),
step = out$step <- as.integer(val),
runs = ,
runs_per_cohort = out$runs_per_cohort <- as.integer(val),
subdir = out$subdir <- val,
input_root = out$input_root <- val,
year_min = out$year_min <- if (toupper(trimws(val)) %in% c("", "NA")) NA_integer_ else suppressWarnings(as.integer(val)),
year_max = out$year_max <- if (toupper(trimws(val)) %in% c("", "NA")) NA_integer_ else suppressWarnings(as.integer(val)),
immediate = out$immediate <- as.integer(val),
cpd_format = out$cpd_format <- val,
rng_strategy = out$rng_strategy <- val,
num_threads = out$num_threads <- as.integer(val),
number_of_segments = out$number_of_segments <- as.integer(val),
csv = out$csv <- val,
html = out$html <- val,
compare_immediate = out$compare_immediate <- val,
from_csv = out$from_csv <- val,
`from-csv` = out$from_csv <- val,
render_input_dir = out$render_input_dir <- val,
`render-input-dir` = out$render_input_dir <- val,
cpd_none_sweep = out$cpd_none_sweep <- val,
`cpd-none-sweep` = out$cpd_none_sweep <- val,
nocpd_sweep = out$nocpd_sweep <- val,
`nocpd-sweep` = out$nocpd_sweep <- val,
nocpd_last_n = out$nocpd_last_n <- suppressWarnings(as.integer(val)),
`nocpd-last-n` = out$nocpd_last_n <- suppressWarnings(as.integer(val)),
chart_wall_iter = out$chart_wall_iter <- (tolower(trimws(val)) %in% c("1", "true", "yes")),
`chart-wall-iter` = out$chart_wall_iter <- (tolower(trimws(val)) %in% c("1", "true", "yes")),
with_nocpd = out$with_nocpd <- (tolower(trimws(val)) %in% c("1", "true", "yes")),
`with-nocpd` = out$with_nocpd <- (tolower(trimws(val)) %in% c("1", "true", "yes")),
{}
)
}
out
}
cohort_years_from_initiation <- function(init_path) {
line1 <- readLines(init_path, n = 1L, warn = FALSE)
cols <- strsplit(line1, ",", fixed = TRUE)[[1L]]
yr <- suppressWarnings(as.integer(cols))
yr <- yr[!is.na(yr) & yr >= 1800L & yr <= 2200L]
sort(unique(yr))
}
filter_years <- function(years, step, year_min, year_max) {
y <- years[years %% step == 0L]
if (!is.na(year_min)) y <- y[y >= year_min]
if (!is.na(year_max)) y <- y[y <= year_max]
y
}
resolve_input_dir <- function(repo_root, input_root, subdir) {
candidates <- character()
if (!is.na(input_root) && nzchar(input_root)) {
candidates <- c(candidates, normalizePath(file.path(input_root, subdir), winslash = "/", mustWork = FALSE))
}
candidates <- c(
candidates,
normalizePath(file.path(repo_root, "tests/testdata/NHIS-1965-2018", subdir), winslash = "/", mustWork = FALSE),
normalizePath(file.path(repo_root, "tests/testdata/NHIS-1965-2018", "csv-partial"), winslash = "/", mustWork = FALSE)
)
for (p in candidates) {
if (file.exists(file.path(p, "initiation.csv"))) return(p)
}
stop(
"Could not find initiation.csv under any of:\n",
paste0(" ", candidates, collapse = "\n"),
"\nSet --input_root=... to the folder that contains csv-complete/ and csv-partial/."
)
}
load_shg_package <- function(repo_root) {
if (nzchar(Sys.getenv("SHG_BENCHMARK_USE_PKGLOAD", ""))) {
if (!requireNamespace("pkgload", quietly = TRUE)) {
stop("Install pkgload or unset SHG_BENCHMARK_USE_PKGLOAD", call. = FALSE)
}
pkgload::load_all(repo_root, quiet = TRUE)
} else {
library(SmokingHistoryGenerator)
}
}
run_one_cohort <- function(shg_template, cohort_year, n, runs, cpd_format_override = NULL) {
secs <- numeric(runs)
for (r in seq_len(runs)) {
shg <- new(SHGInterface)
shg$input_data_folder <- shg_template$input_data_folder
shg$initiation_filename <- shg_template$initiation_filename
shg$cessation_filename <- shg_template$cessation_filename
shg$cpd_filename <- shg_template$cpd_filename
shg$mortality_filename <- shg_template$mortality_filename
shg$cpd_format <- if (!is.null(cpd_format_override)) {
cpd_format_override
} else {
shg_template$cpd_format
}
shg$rng_strategy <- shg_template$rng_strategy
shg$num_threads <- shg_template$num_threads
shg$number_of_segments <- shg_template$number_of_segments
shg$immediate_cessation_year <- shg_template$immediate_cessation_year
gc()
t <- system.time({
sim_out <- shg$runSimFromFixedValues(as.integer(n), 0L, 0L, as.integer(cohort_year))
rm(sim_out)
})
secs[r] <- unname(t["elapsed"])
}
sdv <- if (runs > 1L) stats::sd(secs) else 0
c(mean = mean(secs), sd = sdv, min = min(secs), max = max(secs))
}
default_html_path <- function(repo_root) {
d <- file.path(repo_root, "docs")
if (!dir.exists(d)) dir.create(d, recursive = TRUE)
file.path(d, "cohort-benchmark-latest.html")
}
default_csv_path <- function(repo_root) {
d <- file.path(repo_root, "docs")
if (!dir.exists(d)) dir.create(d, recursive = TRUE)
file.path(d, "cohort-benchmark-latest.csv")
}
highcharts_vendor_repo_path <- function(repo_root) {
file.path(repo_root, "docs", "vendor", "highcharts.js")
}
# Copy bundled Highcharts next to the HTML file so relative src=vendor/highcharts.js works
# for file:// and offline (no CDN).
ensure_highcharts_vendor_for_html <- function(repo_root, html_path) {
src <- normalizePath(highcharts_vendor_repo_path(repo_root), winslash = "/", mustWork = FALSE)
if (!isTRUE(file.exists(src))) {
stop(
"Bundled Highcharts not found at:\n ", src, "\n",
"See docs/vendor/README.txt",
call. = FALSE
)
}
html_path <- normalizePath(html_path, winslash = "/", mustWork = FALSE)
dest_dir <- file.path(dirname(html_path), "vendor")
dest <- file.path(dest_dir, "highcharts.js")
if (!dir.exists(dest_dir)) dir.create(dest_dir, recursive = TRUE)
src <- normalizePath(src, winslash = "/", mustWork = TRUE)
dest <- normalizePath(dest, winslash = "/", mustWork = FALSE)
if (src != dest) file.copy(src, dest, overwrite = TRUE)
invisible(dest)
}
merge_preserve_cpd_none_from_csv <- function(results, csv_path) {
none_nm <- c("mean_sec_none", "sd_sec_none", "min_sec_none", "max_sec_none")
for (nm in none_nm) {
if (!nm %in% names(results)) results[[nm]] <- NA_real_
}
if (!file.exists(csv_path)) return(results)
old <- utils::read.csv(csv_path, stringsAsFactors = FALSE, check.names = FALSE)
if (!any(none_nm %in% names(old))) return(results)
for (i in seq_len(nrow(results))) {
cy <- results$cohort_year[i]
imm <- results$immediate_cessation_year[i]
j <- which(old$cohort_year == cy & old$immediate_cessation_year == imm)
if (length(j) != 1L) next
for (nm in none_nm) {
if (nm %in% names(old)) results[i, nm] <- old[j[1L], nm]
}
}
results
}
merge_preserve_nocpd_from_csv <- function(results, csv_path) {
noc_nm <- c("mean_sec_nocpd", "sd_sec_nocpd", "min_sec_nocpd", "max_sec_nocpd")
for (nm in noc_nm) {
if (!nm %in% names(results)) results[[nm]] <- NA_real_
}
if (!file.exists(csv_path)) return(results)
old <- utils::read.csv(csv_path, stringsAsFactors = FALSE, check.names = FALSE)
if (!any(noc_nm %in% names(old))) return(results)
for (i in seq_len(nrow(results))) {
cy <- results$cohort_year[i]
imm <- results$immediate_cessation_year[i]
j <- which(old$cohort_year == cy & old$immediate_cessation_year == imm)
if (length(j) != 1L) next
for (nm in noc_nm) {
if (nm %in% names(old)) results[i, nm] <- old[j[1L], nm]
}
}
results
}
merge_preserve_bench_iter_from_csv <- function(results, csv_path) {
if (!"bench_iter_sec" %in% names(results)) results$bench_iter_sec <- NA_real_
if (!file.exists(csv_path)) return(results)
old <- utils::read.csv(csv_path, stringsAsFactors = FALSE, check.names = FALSE)
if (!"bench_iter_sec" %in% names(old)) return(results)
for (i in seq_len(nrow(results))) {
cy <- results$cohort_year[i]
imm <- results$immediate_cessation_year[i]
j <- which(old$cohort_year == cy & old$immediate_cessation_year == imm)
if (length(j) != 1L) next
results[i, "bench_iter_sec"] <- old[j[1L], "bench_iter_sec"]
}
results
}
normalize_benchmark_csv <- function(df) {
names(df) <- trimws(names(df))
req <- c("cohort_year", "mean_sec")
miss <- setdiff(req, names(df))
if (length(miss)) {
stop("CSV missing required column(s): ", paste(miss, collapse = ", "), call. = FALSE)
}
if (!"min_sec" %in% names(df)) df$min_sec <- df$mean_sec
if (!"max_sec" %in% names(df)) df$max_sec <- df$mean_sec
if (!"sd_sec" %in% names(df)) df$sd_sec <- NA_real_
if (!"N" %in% names(df)) df$N <- NA_real_
if (!"runs_per_cohort" %in% names(df)) df$runs_per_cohort <- 1L
if (!"immediate_cessation_year" %in% names(df)) df$immediate_cessation_year <- 0L
df$cohort_year <- as.integer(df$cohort_year)
df$mean_sec <- as.numeric(df$mean_sec)
df$sd_sec <- as.numeric(df$sd_sec)
df$min_sec <- as.numeric(df$min_sec)
df$max_sec <- as.numeric(df$max_sec)
df$N <- as.numeric(df$N)
df$runs_per_cohort <- as.integer(df$runs_per_cohort)
df$immediate_cessation_year <- as.integer(df$immediate_cessation_year)
none_nm <- c("mean_sec_none", "sd_sec_none", "min_sec_none", "max_sec_none")
for (nm in none_nm) {
if (!nm %in% names(df)) df[[nm]] <- NA_real_
}
for (nm in none_nm) df[[nm]] <- as.numeric(df[[nm]])
noc_nm <- c("mean_sec_nocpd", "sd_sec_nocpd", "min_sec_nocpd", "max_sec_nocpd")
for (nm in noc_nm) {
if (!nm %in% names(df)) df[[nm]] <- NA_real_
}
for (nm in noc_nm) df[[nm]] <- as.numeric(df[[nm]])
if (!"bench_iter_sec" %in% names(df)) df$bench_iter_sec <- NA_real_
df$bench_iter_sec <- as.numeric(df$bench_iter_sec)
df
}
build_meta_from_csv_results <- function(results, opt) {
n_med <- suppressWarnings(stats::median(results$N, na.rm = TRUE))
if (!is.finite(n_med)) n_med <- opt$n
rp_med <- suppressWarnings(stats::median(results$runs_per_cohort, na.rm = TRUE))
if (!is.finite(rp_med)) rp_med <- opt$runs_per_cohort
inp <- opt$render_input_dir
if (!nzchar(inp)) inp <- "(not recorded in CSV; pass --render-input-dir= to annotate)"
list(
when = format(Sys.time(), usetz = TRUE),
source_note = "Chart/table rendered from existing CSV (simulations not re-run).",
r_version = R.version.string,
platform = R.version$platform,
n = n_med,
runs_per_cohort = as.integer(rp_med),
input_dir = inp,
show_bench_iter = isTRUE(opt$chart_wall_iter)
)
}
render_from_csv <- function(opt) {
src <- opt$from_csv
if (!nzchar(src)) stop("Internal: empty from_csv", call. = FALSE)
path <- normalizePath(src, winslash = "/", mustWork = TRUE)
raw <- utils::read.csv(path, stringsAsFactors = FALSE, check.names = FALSE)
results <- normalize_benchmark_csv(raw)
rownames(results) <- NULL
html_path <- opt$html
if (!nzchar(html_path)) html_path <- default_html_path(repo_root)
ensure_highcharts_vendor_for_html(repo_root, html_path)
meta <- build_meta_from_csv_results(results, opt)
writeLines(build_html(repo_root, results, meta), html_path, useBytes = TRUE)
message("Wrote HTML from CSV:\n ", html_path, "\nSource: ", path)
invisible(list(html = html_path, results = results, csv_source = path))
}
abbr_n_display <- function(n) {
n <- as.numeric(n)
if (length(n) != 1L || is.na(n) || !is.finite(n)) return("-")
ni <- suppressWarnings(as.integer(round(n)))
if (!is.na(ni) && ni == 1000000L) return("1M")
if (!is.na(ni) && ni == 100000L) return("100k")
if (n >= 1e6 && abs(n %% 1e6) < 1e-6) return(sprintf("%gM", n / 1e6))
if (n >= 1e3 && abs(n %% 1e3) < 1e-6) return(sprintf("%gk", n / 1e3))
format(n, scientific = FALSE, trim = TRUE)
}
# JSON string for embedding in <script> (avoids closing </script> in HTML)
json_string_for_script <- function(s) {
s <- enc2utf8(as.character(s)[1L])
s <- gsub("\\", "\\\\", s, fixed = TRUE)
s <- gsub("\"", "\\\"", s, fixed = TRUE)
s <- gsub("\n", "\\n", s, fixed = TRUE)
s <- gsub("\r", "\\r", s, fixed = TRUE)
s <- gsub("\t", "\\t", s, fixed = TRUE)
s <- gsub("<", "\\u003c", s, fixed = TRUE)
paste0("\"", s, "\"")
}
hc_js_num <- function(x) {
format(as.numeric(x), scientific = FALSE, trim = TRUE, digits = 14)
}
# Per birth-cohort column in initiation/cessation.csv (RACE=0, SEX=0): sum probabilities
# then log1p(sum_init) - log1p(sum_cess). Higher ≈ more initiation mass vs cessation mass
# in the bundled NHIS tables (exploratory; meant to correlate with sim work).
cohort_input_driver_by_year <- function(input_dir) {
if (!nzchar(input_dir) || !dir.exists(input_dir)) return(NULL)
ip <- file.path(input_dir, "initiation.csv")
cp <- file.path(input_dir, "cessation.csv")
if (!file.exists(ip) || !file.exists(cp)) return(NULL)
init <- utils::read.csv(ip, check.names = FALSE, stringsAsFactors = FALSE)
cess <- utils::read.csv(cp, check.names = FALSE, stringsAsFactors = FALSE)
nm <- toupper(names(init))
ir <- match("RACE", nm)
isx <- match("SEX", nm)
if (is.na(ir) || is.na(isx)) return(NULL)
init_sub <- init[init[[ir]] == 0 & init[[isx]] == 0, , drop = FALSE]
cess_sub <- cess[cess[[ir]] == 0 & cess[[isx]] == 0, , drop = FALSE]
nc <- ncol(init)
if (nc < 4L) return(NULL)
out <- numeric()
labs <- character()
for (j in 4L:nc) {
cn <- names(init)[j]
cy <- suppressWarnings(as.integer(cn))
if (is.na(cy)) next
if (!cn %in% names(cess_sub)) next
isum <- sum(suppressWarnings(as.numeric(init_sub[[cn]])), na.rm = TRUE)
csum <- sum(suppressWarnings(as.numeric(cess_sub[[cn]])), na.rm = TRUE)
out <- c(out, log1p(isum) - log1p(csum))
labs <- c(labs, cn)
}
if (!length(out)) return(NULL)
stats::setNames(out, labs)
}
# Highcharts: vendor/highcharts.js next to the HTML (see ensure_highcharts_vendor_for_html).
# Optional column mean_sec_none enables a second series (CPD none vs sparse baseline mean_sec).
# input_dir: when initiation.csv + cessation.csv exist, adds right-axis driver line (input tables only).
# show_bench_iter: if TRUE, plot bench_iter_sec (CSV/HTML step timing); default FALSE so 3rd curve can be nocpd.
highcharts_cohort_chart <- function(chart_id, df, title, avg_sec, input_dir = "", show_bench_iter = FALSE) {
df <- df[order(df$cohort_year), , drop = FALSE]
if (!nrow(df)) return("<p>No data.</p>")
xmin <- min(df$cohort_year)
xmax <- max(df$cohort_year)
has_none_pts <- "mean_sec_none" %in% names(df) &&
any(is.finite(suppressWarnings(as.numeric(df$mean_sec_none))))
avg_none_sec <- if (has_none_pts) {
mean(suppressWarnings(as.numeric(df$mean_sec_none)), na.rm = TRUE)
} else {
NA_real_
}
has_nocpd_pts <- "mean_sec_nocpd" %in% names(df) &&
any(is.finite(suppressWarnings(as.numeric(df$mean_sec_nocpd))))
avg_nocpd_sec <- if (has_nocpd_pts) {
mean(suppressWarnings(as.numeric(df$mean_sec_nocpd)), na.rm = TRUE)
} else {
NA_real_
}
has_bench_data <- "bench_iter_sec" %in% names(df) &&
any(is.finite(suppressWarnings(as.numeric(df$bench_iter_sec))))
has_wall_pts <- isTRUE(show_bench_iter) && has_bench_data
drv_all <- cohort_input_driver_by_year(input_dir)
drv_vals <- if (!is.null(drv_all)) {
idx <- match(as.character(df$cohort_year), names(drv_all))
unname(drv_all[idx])
} else {
rep(NA_real_, nrow(df))
}
has_driver <- any(is.finite(drv_vals))
ymax <- max(c(df$mean_sec, avg_sec, 1), na.rm = TRUE)
if (has_none_pts) {
mn <- suppressWarnings(as.numeric(df$mean_sec_none))
ymax <- max(c(ymax, mn[is.finite(mn)], avg_none_sec), na.rm = TRUE)
}
if (has_nocpd_pts) {
mn2 <- suppressWarnings(as.numeric(df$mean_sec_nocpd))
ymax <- max(c(ymax, mn2[is.finite(mn2)], avg_nocpd_sec), na.rm = TRUE)
}
if (has_wall_pts) {
bw <- suppressWarnings(as.numeric(df$bench_iter_sec))
ymax <- max(c(ymax, bw[is.finite(bw)]), na.rm = TRUE)
}
ymax <- ymax * 1.08
if (!is.finite(ymax) || ymax <= 0) ymax <- 1
ymax_drv <- if (has_driver) {
dv <- drv_vals[is.finite(drv_vals)]
if (!length(dv)) {
1
} else {
r <- range(dv)
pad <- (r[2] - r[1]) * 0.08 + 1e-6
r[2] + pad
}
} else {
1
}
ymin_drv <- if (has_driver) {
dv <- drv_vals[is.finite(drv_vals)]
if (!length(dv)) {
0
} else {
r <- range(dv)
pad <- (r[2] - r[1]) * 0.08 + 1e-6
r[1] - pad
}
} else {
0
}
pairs_sparse <- paste(
sprintf("[%s,%s]", df$cohort_year, hc_js_num(df$mean_sec)),
collapse = ","
)
data_sparse_js <- paste0("[", pairs_sparse, "]")
title_js <- json_string_for_script(title)
leg_left <- if (has_none_pts || has_nocpd_pts) {
paste0(
"Blue = sparse CPD",
if (has_none_pts) "; amber = cpd_format none (full CPD calc; omit column)" else "",
if (has_nocpd_pts) "; cyan = nocpd (skip CPD calculation in engine)" else "",
".",
if (has_driver) {
" Right axis: input driver = log1p(sum initiation) - log1p(sum cessation), RACE=0 SEX=0 (exploratory)."
} else {
""
}
)
} else if (has_driver) {
"Right axis: input driver = log1p(sum initiation) - log1p(sum cessation), RACE=0 SEX=0 (exploratory)."
} else {
""
}
if (has_wall_pts) {
leg_left <- paste0(
leg_left,
if (nzchar(leg_left)) " " else "",
"Slate = wall-clock per cohort step (timed simulation(s) + write CSV + refresh HTML)."
)
}
sub_txt <- if (nzchar(leg_left)) {
paste0("Left y-axis: wall-clock seconds (mean of timed runs). ", leg_left)
} else {
"Wall-clock seconds per cohort (mean of timed runs)"
}
subtitle_js <- json_string_for_script(sub_txt)
cid <- html_escape(chart_id)
sparse_series <- paste0(
"{\n",
" type: \"line\",\n",
" yAxis: 0,\n",
if (has_none_pts || has_nocpd_pts) {
" name: \"CPD sparse (default)\",\n"
} else {
" name: \"Mean (wall-clock s)\",\n"
},
" data: ", data_sparse_js, ",\n",
" color: \"#2563eb\",\n",
" lineWidth: 2.5,\n",
" marker: {\n",
" enabled: true, radius: 4.5,\n",
" lineWidth: 2, lineColor: \"#1d4ed8\", fillColor: \"#ffffff\"\n",
" }\n",
" }"
)
none_series <- ""
if (has_none_pts) {
idx <- which(is.finite(suppressWarnings(as.numeric(df$mean_sec_none))))
pairs_none <- paste(
sprintf("[%s,%s]", df$cohort_year[idx], hc_js_num(df$mean_sec_none[idx])),
collapse = ","
)
data_none_js <- paste0("[", pairs_none, "]")
none_series <- paste0(
", {\n",
" type: \"line\",\n",
" yAxis: 0,\n",
" name: \"CPD none (omit column, full calc)\",\n",
" data: ", data_none_js, ",\n",
" color: \"#d97706\",\n",
" lineWidth: 2.5,\n",
" marker: {\n",
" enabled: true, radius: 4.5,\n",
" lineWidth: 2, lineColor: \"#b45309\", fillColor: \"#ffffff\"\n",
" }\n",
" }"
)
}
nocpd_series <- ""
if (has_nocpd_pts) {
idxn <- which(is.finite(suppressWarnings(as.numeric(df$mean_sec_nocpd))))
pairs_nocpd <- paste(
sprintf("[%s,%s]", df$cohort_year[idxn], hc_js_num(df$mean_sec_nocpd[idxn])),
collapse = ","
)
data_nocpd_js <- paste0("[", pairs_nocpd, "]")
nocpd_series <- paste0(
", {\n",
" type: \"line\",\n",
" yAxis: 0,\n",
" name: \"CPD nocpd (skip CPD calc)\",\n",
" data: ", data_nocpd_js, ",\n",
" color: \"#0891b2\",\n",
" lineWidth: 2.5,\n",
" marker: {\n",
" enabled: true, radius: 4.5,\n",
" lineWidth: 2, lineColor: \"#0e7490\", fillColor: \"#ffffff\"\n",
" }\n",
" }"
)
}
wall_series <- ""
if (has_wall_pts) {
idxw <- which(is.finite(suppressWarnings(as.numeric(df$bench_iter_sec))))
pairs_wall <- paste(
sprintf("[%s,%s]", df$cohort_year[idxw], hc_js_num(df$bench_iter_sec[idxw])),
collapse = ","
)
data_wall_js <- paste0("[", pairs_wall, "]")
wall_series <- paste0(
", {\n",
" type: \"line\",\n",
" yAxis: 0,\n",
" name: \"Wall clock (cohort step)\",\n",
" data: ", data_wall_js, ",\n",
" color: \"#475569\",\n",
" dashStyle: \"ShortDash\",\n",
" lineWidth: 2,\n",
" marker: {\n",
" enabled: true, radius: 3,\n",
" lineWidth: 1.5, lineColor: \"#334155\", fillColor: \"#ffffff\"\n",
" }\n",
" }"
)
}
driver_series <- ""
if (has_driver) {
pairs_drv <- paste(
sprintf("[%s,%s]", df$cohort_year, hc_js_num(drv_vals)),
collapse = ","
)
data_drv_js <- paste0("[", pairs_drv, "]")
driver_series <- paste0(
", {\n",
" type: \"line\",\n",
" yAxis: 1,\n",
" name: \"Input driver (init vs cess mass)\",\n",
" data: ", data_drv_js, ",\n",
" color: \"#7c3aed\",\n",
" dashStyle: \"ShortDot\",\n",
" lineWidth: 2,\n",
" marker: { enabled: false },\n",
" tooltip: { valueDecimals: 3 }\n",
" }"
)
}
# Left-axis plotLines: sparse avg, optional none avg, 1 s reference
pl_sparse <- paste0(
" {\n",
" value: avg, color: \"#16a34a\", dashStyle: \"Dash\", width: 2, zIndex: 3,\n",
" label: {\n",
" text: ", json_string_for_script(sprintf("sparse avg %.2fs", avg_sec)), ",\n",
" align: \"right\", style: { color: \"#16a34a\", fontWeight: \"600\" }\n",
" }\n",
" }"
)
pl_none <- if (has_none_pts && is.finite(avg_none_sec)) {
paste0(
",\n",
" {\n",
" value: avgNone, color: \"#d97706\", dashStyle: \"Dash\", width: 2, zIndex: 3,\n",
" label: {\n",
" text: ", json_string_for_script(sprintf("none avg %.2fs", avg_none_sec)), ",\n",
" align: \"right\", style: { color: \"#d97706\", fontWeight: \"600\" }\n",
" }\n",
" }"
)
} else {
""
}
pl_nocpd <- if (has_nocpd_pts && is.finite(avg_nocpd_sec)) {
paste0(
",\n",
" {\n",
" value: avgNocpd, color: \"#0891b2\", dashStyle: \"Dash\", width: 2, zIndex: 3,\n",
" label: {\n",
" text: ", json_string_for_script(sprintf("nocpd avg %.2fs", avg_nocpd_sec)), ",\n",
" align: \"right\", style: { color: \"#0891b2\", fontWeight: \"600\" }\n",
" }\n",
" }"
)
} else {
""
}
pl_one <- paste0(
",\n",
" {\n",
" value: 1, color: \"#64748b\", dashStyle: \"ShortDash\", width: 1.5, zIndex: 3,\n",
" label: { text: \"1 s\", align: \"right\", style: { color: \"#64748b\" } }\n",
" }"
)
plot_lines_left <- paste0(pl_sparse, pl_none, pl_nocpd, pl_one)
yaxis_block <- if (has_driver) {
paste0(
" yAxis: [\n",
" {\n",
" title: { text: \"Wall-clock seconds\" },\n",
" min: 0,\n",
" max: yMax,\n",
" gridLineColor: \"#e2e8f0\",\n",
" plotLines: [\n",
plot_lines_left,
"\n",
" ]\n",
" },\n",
" {\n",
" title: { text: \"Input driver (log mass gap)\", style: { color: \"#7c3aed\" } },\n",
" opposite: true,\n",
" min: drvMin,\n",
" max: drvMax,\n",
" gridLineWidth: 0,\n",
" labels: { style: { color: \"#7c3aed\" } }\n",
" }\n",
" ],\n"
)
} else {
paste0(
" yAxis: {\n",
" title: { text: \"Wall-clock seconds\" },\n",
" min: 0,\n",
" max: yMax,\n",
" gridLineColor: \"#e2e8f0\",\n",
" plotLines: [\n",
plot_lines_left,
"\n",
" ]\n",
" },\n"
)
}
paste0(
'<div id="', cid, '" class="shg-hc-chart" style="width:100%;max-width:920px;height:460px;margin:12px 0;',
'border:1px solid #e5e7eb;background:#fafafa"></div>\n',
"<script>\n",
"(function(){\n",
"var avg = ", hc_js_num(avg_sec), ";\n",
if (has_none_pts && is.finite(avg_none_sec)) {
paste0("var avgNone = ", hc_js_num(avg_none_sec), ";\n")
} else {
""
},
if (has_nocpd_pts && is.finite(avg_nocpd_sec)) {
paste0("var avgNocpd = ", hc_js_num(avg_nocpd_sec), ";\n")
} else {
""
},
"var yMax = ", hc_js_num(ymax), ";\n",
if (has_driver) {
paste0(
"var drvMin = ", hc_js_num(ymin_drv), ", drvMax = ", hc_js_num(ymax_drv), ";\n"
)
} else {
""
},
"var xMin = ", as.integer(xmin), ", xMax = ", as.integer(xmax), ";\n",
"var titleText = ", title_js, ";\n",
"var subtitleText = ", subtitle_js, ";\n",
"if (typeof Highcharts === \"undefined\") {\n",
" document.getElementById(\"", cid, "\").innerHTML = ",
" '<p style=\"color:#b91c1c\">Highcharts failed to load. Ensure vendor/highcharts.js exists next to this HTML (re-run the benchmark script).</p>';\n",
" return;\n",
"}\n",
"Highcharts.chart(\"", cid, "\", {\n",
" chart: { type: \"line\", backgroundColor: \"#fafafa\" },\n",
" title: { text: titleText },\n",
" subtitle: { text: subtitleText, style: { color: \"#64748b\" } },\n",
" credits: { enabled: false },\n",
" legend: { enabled: true },\n",
" xAxis: {\n",
" title: { text: \"Birth cohort year\" },\n",
" min: xMin, max: xMax,\n",
" tickInterval: 5,\n",
" gridLineWidth: 1,\n",
" gridLineColor: \"#e2e8f0\"\n",
" },\n",
yaxis_block,
" series: [", sparse_series, none_series, nocpd_series, wall_series, driver_series, "],\n",
" tooltip: { shared: true, valueDecimals: 2 }\n",
"});\n",
"})();\n",
"</script>\n"
)
}
html_escape <- function(x) {
x <- gsub("&", "&", x, fixed = TRUE)
x <- gsub("<", "<", x, fixed = TRUE)
x <- gsub(">", ">", x, fixed = TRUE)
x <- gsub("\"", """, x, fixed = TRUE)
x
}
chart_sections_html <- function(results, meta) {
imm_vals <- sort(unique(results$immediate_cessation_year))
parts <- character(length(imm_vals))
multi_phase <- length(imm_vals) > 1L
for (ii in seq_along(imm_vals)) {
imm <- imm_vals[ii]
sub <- results[results$immediate_cessation_year == imm, , drop = FALSE]
avg_sec <- mean(sub$mean_sec)
tit <- if (!multi_phase) {
sprintf("N=%s | %d timed run(s) averaged per cohort", abbr_n_display(meta$n), meta$runs_per_cohort)
} else {
sprintf(
"Policy cessation year %s | N=%s | %d run(s)/cohort",
imm, abbr_n_display(meta$n), meta$runs_per_cohort
)
}
head_html <- if (multi_phase) paste0("<h3>", html_escape(as.character(imm)), "</h3>") else ""
chart_cols <- c("cohort_year", "mean_sec")
if ("mean_sec_none" %in% names(sub)) chart_cols <- c(chart_cols, "mean_sec_none")
if ("mean_sec_nocpd" %in% names(sub)) chart_cols <- c(chart_cols, "mean_sec_nocpd")
if ("bench_iter_sec" %in% names(sub) && isTRUE(meta$show_bench_iter)) {
chart_cols <- c(chart_cols, "bench_iter_sec")
}
idir <- if (is.null(meta$input_dir)) "" else as.character(meta$input_dir)[1L]
parts[ii] <- paste0(
head_html,
highcharts_cohort_chart(
sprintf("shg-cohort-chart-%d", ii),
sub[, chart_cols, drop = FALSE],
tit,
avg_sec,
if (nzchar(idir) && !grepl("^\\(not recorded", idir)) idir else "",
isTRUE(meta$show_bench_iter)
)
)
}
paste(parts, collapse = "\n")
}
build_html <- function(repo_root, results, meta) {
nu <- length(unique(results$cohort_year))
has_none_summary <- "mean_sec_none" %in% names(results) &&
any(is.finite(suppressWarnings(as.numeric(results$mean_sec_none))))
has_nocpd_summary <- "mean_sec_nocpd" %in% names(results) &&
any(is.finite(suppressWarnings(as.numeric(results$mean_sec_nocpd))))
avg_line <- if (has_none_summary || has_nocpd_summary) {
parts <- list(sprintf(
"Sparse CPD — avg cohort mean: <strong>%.2f</strong> s (min %.2f, max %.2f)",
mean(results$mean_sec),
min(results$mean_sec),
max(results$mean_sec)
))
if (has_none_summary) {
mn <- suppressWarnings(as.numeric(results$mean_sec_none))
parts[[length(parts) + 1L]] <- sprintf(
"CPD none — avg: <strong>%.2f</strong> s (min %.2f, max %.2f)",
mean(mn, na.rm = TRUE),
suppressWarnings(min(mn, na.rm = TRUE)),
suppressWarnings(max(mn, na.rm = TRUE))
)
}
if (has_nocpd_summary) {
m2 <- suppressWarnings(as.numeric(results$mean_sec_nocpd))
parts[[length(parts) + 1L]] <- sprintf(
"nocpd (skip CPD trajectory) — avg: <strong>%.2f</strong> s (min %.2f, max %.2f)",
mean(m2, na.rm = TRUE),
suppressWarnings(min(m2, na.rm = TRUE)),
suppressWarnings(max(m2, na.rm = TRUE))
)
}
paste0(
paste(unlist(parts), collapse = " | "),
sprintf(" | distinct cohort years: %d | N=%s", nu, abbr_n_display(meta$n))
)
} else {
sprintf(
paste0(
"Average across cohort means: <strong>%.2f</strong> s | min mean: %.2f s | ",
"max mean: %.2f s | distinct cohort years: %d | N=%s"
),
mean(results$mean_sec),
min(results$mean_sec),
max(results$mean_sec),
nu,
abbr_n_display(meta$n)
)
}
cpd_table <- '
<h2>Reference: CPD format (1M individuals, ~12 cores)</h2>
<table border="1" cellpadding="6" cellspacing="0" style="border-collapse:collapse;font-family:system-ui,sans-serif;font-size:14px;">
<thead><tr><th>Format</th><th>Time</th><th>Notes</th></tr></thead>
<tbody>
<tr><td><code>none</code></td><td>vs sparse</td><td>Full CPD trajectory in C++; no <code>cigarettes_per_day</code> column (output only)</td></tr>
<tr><td><code>nocpd</code></td><td>faster</td><td>Skips CPD trajectory in engine (benchmark third line vs sparse/none)</td></tr>
<tr><td><code>sparse</code></td><td>~1.3s</td><td>Default, recommended</td></tr>
<tr><td><code>legacy</code></td><td>~1.5s</td><td>Backwards compatibility</td></tr>
</tbody></table>
<p style="color:#64748b;font-size:13px;">Numbers are indicative; this run's hardware and <code>num_threads</code> dominate.</p>
'
chart_block <- chart_sections_html(results, meta)
show_imm <- length(unique(results$immediate_cessation_year)) > 1L
base_cn <- if (show_imm) {
c("immediate_cessation_year", "cohort_year", "N", "runs_per_cohort",
"mean_sec", "sd_sec", "min_sec", "max_sec")
} else {
c("cohort_year", "N", "runs_per_cohort", "mean_sec", "sd_sec", "min_sec", "max_sec")
}
none_cn <- intersect(
c("mean_sec_none", "sd_sec_none", "min_sec_none", "max_sec_none"),
names(results)
)
nocpd_cn <- intersect(
c("mean_sec_nocpd", "sd_sec_nocpd", "min_sec_nocpd", "max_sec_nocpd"),
names(results)
)
bench_cn <- intersect(c("bench_iter_sec"), names(results))
col_names <- c(base_cn, none_cn, nocpd_cn, bench_cn)
num_cols <- intersect(
col_names,
c(
"cohort_year", "N", "runs_per_cohort", "mean_sec", "sd_sec", "min_sec", "max_sec",
"mean_sec_none", "sd_sec_none", "min_sec_none", "max_sec_none",
"mean_sec_nocpd", "sd_sec_nocpd", "min_sec_nocpd", "max_sec_nocpd",
"bench_iter_sec"
)
)
fmt_tbl_cell <- function(nm, v) {
if (nm == "N") return(abbr_n_display(v))
if (nm %in% c("runs_per_cohort", "cohort_year", "immediate_cessation_year")) {
vn <- suppressWarnings(as.numeric(v))
if (length(vn) != 1L || is.na(vn) || !is.finite(vn)) return("-")
return(as.character(as.integer(round(vn))))
}
vn <- suppressWarnings(as.numeric(v))
if (length(vn) != 1L || is.na(vn) || !is.finite(vn)) return("-")
sprintf("%.2f", vn)
}
th_align <- function(nm) {
if (nm %in% num_cols) " style=\"text-align:right\"" else ""
}
td_align <- function(nm) {
if (nm %in% num_cols) " style=\"text-align:right\"" else ""
}
thead <- paste(vapply(col_names, function(nm) {
paste0("<th", th_align(nm), ">", nm, "</th>")
}, character(1L)), collapse = "")
tbody <- paste(vapply(seq_len(nrow(results)), function(i) {
row <- results[i, , drop = TRUE]
tds <- vapply(col_names, function(nm) {
paste0("<td", td_align(nm), ">", html_escape(fmt_tbl_cell(nm, row[[nm]])), "</td>")
}, character(1L))
paste0("<tr>", paste(tds, collapse = ""), "</tr>")
}, character(1L)), collapse = "")
tbl <- paste0(
"<table border=\"1\" cellpadding=\"6\" cellspacing=\"0\" ",
"style=\"border-collapse:collapse;font-family:system-ui,sans-serif;font-size:13px;width:100%%\">",
"<thead><tr>", thead, "</tr></thead><tbody>",
tbody,
"</tbody></table>"
)
paste0(
"<!DOCTYPE html><html><head><meta charset=\"utf-8\"><title>SHG cohort benchmark</title>",
"<style>body{font-family:system-ui,sans-serif;margin:24px;max-width:1100px;line-height:1.45}</style>",
"<script src=\"vendor/highcharts.js\"></script>",
"</head><body>",
"<h1>SmokingHistoryGenerator - cohort sweep benchmark</h1>",
"<p>", html_escape(meta$when), " | ", html_escape(meta$r_version), " | ",
html_escape(meta$platform), "</p>",
if (isTRUE(nzchar(meta$source_note))) {
paste0("<p><em>", html_escape(meta$source_note), "</em></p>")
} else "",
"<p>", avg_line, "</p>",
"<p><code>input_data_folder</code>: ", html_escape(meta$input_dir), "</p>",
cpd_table,
"<h2>Chart (mean seconds by cohort year)</h2>",
chart_block,
"<h2>Per-cohort table</h2>",
tbl,
"</body></html>"
)
}
main_cpd_none_sweep <- function(opt) {
load_shg_package(repo_root)
csv_path <- opt$csv
if (!nzchar(csv_path)) csv_path <- default_csv_path(repo_root)
html_path <- opt$html
if (!nzchar(html_path)) html_path <- default_html_path(repo_root)
if (!file.exists(csv_path)) {
stop("CPD none sweep: CSV not found:\n ", csv_path, call. = FALSE)
}
baseline <- utils::read.csv(csv_path, stringsAsFactors = FALSE, check.names = FALSE)
names(baseline) <- trimws(names(baseline))
none_nm <- c("mean_sec_none", "sd_sec_none", "min_sec_none", "max_sec_none")
for (nm in none_nm) {
if (!nm %in% names(baseline)) baseline[[nm]] <- NA_real_
}
if (!"bench_iter_sec" %in% names(baseline)) baseline$bench_iter_sec <- NA_real_
input_dir <- resolve_input_dir(repo_root, opt$input_root, opt$subdir)
imm <- opt$immediate
rows <- baseline$immediate_cessation_year == imm
years <- sort(unique(as.integer(baseline$cohort_year[rows])))
if (!length(years)) {
stop("CPD none sweep: no cohort_year rows for immediate_cessation_year=", imm, call. = FALSE)
}
ensure_highcharts_vendor_for_html(repo_root, html_path)
bench_started <- Sys.time()
shg_template <- new(SHGInterface)
shg_template$input_data_folder <- input_dir
shg_template$initiation_filename <- "initiation.csv"
shg_template$cessation_filename <- "cessation.csv"
shg_template$cpd_filename <- "cpd.csv"
shg_template$mortality_filename <- "acm.csv"
shg_template$cpd_format <- "none"
shg_template$rng_strategy <- opt$rng_strategy
shg_template$num_threads <- opt$num_threads
shg_template$number_of_segments <- opt$number_of_segments
shg_template$immediate_cessation_year <- imm
message(
"CPD=none sweep: ",
length(years),
" cohort(s), ",
opt$runs_per_cohort,
" run(s)/cohort — baseline sparse line preserved in mean_sec"
)
for (cy in years) {
message(" cohort_year=", cy, " (cpd_format=none) ... ", appendLF = FALSE)
st <- run_one_cohort(shg_template, cy, opt$n, opt$runs_per_cohort)
message(sprintf("%.3f s (mean of %d runs)", st["mean"], opt$runs_per_cohort))
idx <- which(
baseline$cohort_year == cy & baseline$immediate_cessation_year == imm
)
if (!length(idx)) {
stop("CPD none sweep: cohort_year ", cy, " not in baseline CSV", call. = FALSE)
}
meta_partial <- list(
when = format(Sys.time(), usetz = TRUE),
source_note = sprintf(
paste0(
"Second line (amber): cpd_format=none sweep in progress (started %s). ",
"Blue line unchanged (sparse CPD). Refresh to update."
),
format(bench_started, usetz = TRUE)
),
r_version = R.version.string,
platform = R.version$platform,
n = opt$n,
runs_per_cohort = opt$runs_per_cohort,
input_dir = input_dir,
show_bench_iter = isTRUE(opt$chart_wall_iter)
)
tw <- system.time({
baseline[idx[1L], "mean_sec_none"] <- st["mean"]
baseline[idx[1L], "sd_sec_none"] <- st["sd"]
baseline[idx[1L], "min_sec_none"] <- st["min"]
baseline[idx[1L], "max_sec_none"] <- st["max"]
utils::write.csv(baseline, csv_path, row.names = FALSE)
writeLines(build_html(repo_root, baseline, meta_partial), html_path, useBytes = TRUE)
})
baseline[idx[1L], "bench_iter_sec"] <- unname(tw["elapsed"])
utils::write.csv(baseline, csv_path, row.names = FALSE)
writeLines(build_html(repo_root, baseline, meta_partial), html_path, useBytes = TRUE)
message(
" → refreshed ",
basename(csv_path),
" + ",
basename(html_path),
" (",
sum(is.finite(as.numeric(baseline$mean_sec_none))),
" none rows); bench_iter_sec=",
sprintf("%.3f", baseline[idx[1L], "bench_iter_sec"])
)
}
meta <- list(
when = format(Sys.time(), usetz = TRUE),
source_note = NA_character_,
r_version = R.version.string,
platform = R.version$platform,
n = opt$n,
runs_per_cohort = opt$runs_per_cohort,
input_dir = input_dir,
show_bench_iter = isTRUE(opt$chart_wall_iter)
)
writeLines(build_html(repo_root, baseline, meta), html_path, useBytes = TRUE)
message("\nCPD none sweep complete.\nFinal CSV: ", csv_path, "\nFinal HTML: ", html_path)
invisible(list(csv = csv_path, html = html_path, results = baseline))
}
main_nocpd_sweep <- function(opt) {
load_shg_package(repo_root)
csv_path <- opt$csv
if (!nzchar(csv_path)) csv_path <- default_csv_path(repo_root)
html_path <- opt$html
if (!nzchar(html_path)) html_path <- default_html_path(repo_root)
if (!file.exists(csv_path)) {
stop("nocpd sweep: CSV not found:\n ", csv_path, call. = FALSE)
}
baseline <- utils::read.csv(csv_path, stringsAsFactors = FALSE, check.names = FALSE)
names(baseline) <- trimws(names(baseline))
noc_nm <- c("mean_sec_nocpd", "sd_sec_nocpd", "min_sec_nocpd", "max_sec_nocpd")
for (nm in noc_nm) {
if (!nm %in% names(baseline)) baseline[[nm]] <- NA_real_
}
if (!"bench_iter_sec" %in% names(baseline)) baseline$bench_iter_sec <- NA_real_
input_dir <- resolve_input_dir(repo_root, opt$input_root, opt$subdir)
imm <- opt$immediate
rows <- baseline$immediate_cessation_year == imm
years <- sort(unique(as.integer(baseline$cohort_year[rows])))
if (!length(years)) {
stop("nocpd sweep: no cohort_year rows for immediate_cessation_year=", imm, call. = FALSE)
}
nlim <- as.integer(opt$nocpd_last_n)[1L]
if (length(nlim) == 1L && !is.na(nlim) && nlim > 0L) {
if (length(years) > nlim) {
years <- years[(length(years) - nlim + 1L):length(years)]
}
message(
"nocpd sweep: limiting to last ", length(years), " cohort year(s) (--nocpd-last-n=", nlim, ")"
)
}
ensure_highcharts_vendor_for_html(repo_root, html_path)
bench_started <- Sys.time()
shg_template <- new(SHGInterface)
shg_template$input_data_folder <- input_dir
shg_template$initiation_filename <- "initiation.csv"
shg_template$cessation_filename <- "cessation.csv"
shg_template$cpd_filename <- "cpd.csv"
shg_template$mortality_filename <- "acm.csv"
shg_template$cpd_format <- "nocpd"
shg_template$rng_strategy <- opt$rng_strategy
shg_template$num_threads <- opt$num_threads
shg_template$number_of_segments <- opt$number_of_segments
shg_template$immediate_cessation_year <- imm
message(
"CPD=nocpd sweep: ",
length(years),
" cohort(s), ",
opt$runs_per_cohort,
" run(s)/cohort — sparse + none lines unchanged in CSV"
)
for (cy in years) {
message(" cohort_year=", cy, " (cpd_format=nocpd) ... ", appendLF = FALSE)
st <- run_one_cohort(shg_template, cy, opt$n, opt$runs_per_cohort)
message(sprintf("%.3f s (mean of %d runs)", st["mean"], opt$runs_per_cohort))
idx <- which(
baseline$cohort_year == cy & baseline$immediate_cessation_year == imm
)
if (!length(idx)) {
stop("nocpd sweep: cohort_year ", cy, " not in baseline CSV", call. = FALSE)
}
meta_partial <- list(
when = format(Sys.time(), usetz = TRUE),
source_note = sprintf(
paste0(
"Third line (cyan): cpd_format=nocpd sweep in progress (started %s). ",
"Sparse + none columns unchanged. Refresh to update."
),
format(bench_started, usetz = TRUE)
),
r_version = R.version.string,
platform = R.version$platform,
n = opt$n,
runs_per_cohort = opt$runs_per_cohort,
input_dir = input_dir,
show_bench_iter = isTRUE(opt$chart_wall_iter)
)
tw <- system.time({
baseline[idx[1L], "mean_sec_nocpd"] <- st["mean"]
baseline[idx[1L], "sd_sec_nocpd"] <- st["sd"]
baseline[idx[1L], "min_sec_nocpd"] <- st["min"]
baseline[idx[1L], "max_sec_nocpd"] <- st["max"]
utils::write.csv(baseline, csv_path, row.names = FALSE)
writeLines(build_html(repo_root, baseline, meta_partial), html_path, useBytes = TRUE)
})
baseline[idx[1L], "bench_iter_sec"] <- unname(tw["elapsed"])
utils::write.csv(baseline, csv_path, row.names = FALSE)
writeLines(build_html(repo_root, baseline, meta_partial), html_path, useBytes = TRUE)
message(
" → refreshed ",
basename(csv_path),
" + ",
basename(html_path),
" (",
sum(is.finite(as.numeric(baseline$mean_sec_nocpd))),
" nocpd rows); bench_iter_sec=",
sprintf("%.3f", baseline[idx[1L], "bench_iter_sec"])
)
}
meta <- list(
when = format(Sys.time(), usetz = TRUE),
source_note = NA_character_,
r_version = R.version.string,
platform = R.version$platform,
n = opt$n,
runs_per_cohort = opt$runs_per_cohort,
input_dir = input_dir,
show_bench_iter = isTRUE(opt$chart_wall_iter)
)
writeLines(build_html(repo_root, baseline, meta), html_path, useBytes = TRUE)
message("\nnocpd sweep complete.\nFinal CSV: ", csv_path, "\nFinal HTML: ", html_path)
invisible(list(csv = csv_path, html = html_path, results = baseline))
}
main <- function() {
opt <- parse_cli()
if (nzchar(opt$from_csv)) {
return(render_from_csv(opt))
}
if (nzchar(opt$nocpd_sweep) &&
tolower(trimws(opt$nocpd_sweep)) %in% c("1", "true", "yes")) {
return(main_nocpd_sweep(opt))
}
if (nzchar(opt$cpd_none_sweep) &&
tolower(trimws(opt$cpd_none_sweep)) %in% c("1", "true", "yes")) {
return(main_cpd_none_sweep(opt))
}
load_shg_package(repo_root)
input_dir <- resolve_input_dir(repo_root, opt$input_root, opt$subdir)
init_path <- file.path(input_dir, "initiation.csv")
years_all <- cohort_years_from_initiation(init_path)
ym <- opt$year_min
yx <- opt$year_max
if (length(ym) != 1L) ym <- NA_integer_
if (length(yx) != 1L) yx <- NA_integer_
years <- filter_years(years_all, opt$step, ym, yx)
if (!length(years)) stop("No cohort years after filtering; relax step or year bounds.")
phases <- list(list(immediate = opt$immediate, label = "primary"))
if (nzchar(opt$compare_immediate)) {
ci <- suppressWarnings(as.integer(opt$compare_immediate))
if (is.na(ci)) stop("'compare_immediate' must be an integer year.", call. = FALSE)
phases <- c(phases, list(list(
immediate = ci,
label = paste0("compare_", ci)
)))
}
csv_path <- opt$csv
if (!nzchar(csv_path)) csv_path <- default_csv_path(repo_root)
html_path <- opt$html
if (!nzchar(html_path)) html_path <- default_html_path(repo_root)
ensure_highcharts_vendor_for_html(repo_root, html_path)
bench_started <- Sys.time()
results_list <- list()
for (ph in phases) {
shg_template <- new(SHGInterface)
shg_template$input_data_folder <- input_dir
shg_template$initiation_filename <- "initiation.csv"
shg_template$cessation_filename <- "cessation.csv"
shg_template$cpd_filename <- "cpd.csv"
shg_template$mortality_filename <- "acm.csv"
shg_template$cpd_format <- opt$cpd_format
shg_template$rng_strategy <- opt$rng_strategy
shg_template$num_threads <- opt$num_threads
shg_template$number_of_segments <- opt$number_of_segments
shg_template$immediate_cessation_year <- ph$immediate
message("Phase immediate_cessation_year=", ph$immediate, " (", ph$label, ")")
for (cy in years) {
message(" cohort_year=", cy, " ... ", appendLF = FALSE)
st <- run_one_cohort(shg_template, cy, opt$n, opt$runs_per_cohort)
message(sprintf("%.3f s (mean of %d runs)", st["mean"], opt$runs_per_cohort))
row <- data.frame(
immediate_cessation_year = ph$immediate,
cohort_year = cy,
N = as.numeric(opt$n),
runs_per_cohort = opt$runs_per_cohort,
mean_sec = st["mean"],
sd_sec = st["sd"],
min_sec = st["min"],
max_sec = st["max"],
bench_iter_sec = NA_real_,
stringsAsFactors = FALSE
)
if (isTRUE(opt$with_nocpd)) {
message(" cpd_format=nocpd (skip CPD trajectory) ... ", appendLF = FALSE)
st_n <- run_one_cohort(shg_template, cy, opt$n, opt$runs_per_cohort, "nocpd")
message(sprintf("%.3f s (mean of %d runs)", st_n["mean"], opt$runs_per_cohort))
row$mean_sec_nocpd <- st_n["mean"]
row$sd_sec_nocpd <- st_n["sd"]
row$min_sec_nocpd <- st_n["min"]
row$max_sec_nocpd <- st_n["max"]
}
tw <- system.time({
results_list[[length(results_list) + 1L]] <- row
results_partial <- do.call(rbind, results_list)
rownames(results_partial) <- NULL
results_partial <- merge_preserve_cpd_none_from_csv(results_partial, csv_path)
results_partial <- merge_preserve_nocpd_from_csv(results_partial, csv_path)
results_partial <- merge_preserve_bench_iter_from_csv(results_partial, csv_path)
})
ni <- nrow(results_partial)
results_partial$bench_iter_sec[ni] <- unname(tw["elapsed"])
results_list[[ni]]$bench_iter_sec <- results_partial$bench_iter_sec[ni]
utils::write.csv(results_partial, csv_path, row.names = FALSE)
meta_partial <- list(
when = format(Sys.time(), usetz = TRUE),
source_note = sprintf(
"Benchmark in progress (started %s). Partial results — refresh the page to update.",
format(bench_started, usetz = TRUE)
),
r_version = R.version.string,
platform = R.version$platform,
n = opt$n,
runs_per_cohort = opt$runs_per_cohort,
input_dir = input_dir,
show_bench_iter = isTRUE(opt$chart_wall_iter)
)
writeLines(build_html(repo_root, results_partial, meta_partial), html_path, useBytes = TRUE)
message(sprintf(
" → refreshed %s + %s (%d row%s); bench_iter_sec=%.3f",
basename(csv_path), basename(html_path), nrow(results_partial),
if (nrow(results_partial) == 1L) "" else "s",
results_partial$bench_iter_sec[ni]
))
}
}
results <- do.call(rbind, results_list)
rownames(results) <- NULL
results <- merge_preserve_cpd_none_from_csv(results, csv_path)
results <- merge_preserve_nocpd_from_csv(results, csv_path)
results <- merge_preserve_bench_iter_from_csv(results, csv_path)
write.csv(results, csv_path, row.names = FALSE)
message("\nFinal CSV: ", csv_path)
meta <- list(
when = format(Sys.time(), usetz = TRUE),
source_note = NA_character_,
r_version = R.version.string,
platform = R.version$platform,
n = opt$n,
runs_per_cohort = opt$runs_per_cohort,
input_dir = input_dir,
show_bench_iter = isTRUE(opt$chart_wall_iter)
)
writeLines(build_html(repo_root, results, meta), html_path, useBytes = TRUE)
message("Final HTML: ", html_path)
invisible(list(csv = csv_path, html = html_path, results = results))
}
.help_text <- function() {
paste(c(
"benchmark-cohort-variance.R - cohort sweep timing + CSV + HTML chart",
"",
"Usage: Rscript tools/benchmark-cohort-variance.R [options]",
"",
"Options (also SHG_COHORT_BENCH_CSV / SHG_COHORT_BENCH_HTML):",
" --n=N individuals per cohort (default 1e6)",
" --step=YEAR keep cohort columns where year %% step == 0 (default 5)",
" --runs=N timed repeats per cohort, mean reported (default 3)",
" --subdir=NAME under tests/testdata/NHIS-1965-2018/ (default csv-complete)",
" --input_root=PATH override parent of subdir (folder containing csv-complete/)",
" --year_min= / --year_max= optional cohort bounds",
" --immediate=N immediate_cessation_year (default 0)",
" --compare_immediate=N second sweep with this immediate year (or env SHG_COHORT_BENCH_COMPARE_IMMEDIATE_CESSATION)",
" --cpd_format=sparse|none|legacy|nocpd (none=omit CPD column; nocpd=skip CPD calc in engine)",
" --with-nocpd=1 also time cpd_format=nocpd (3rd line: no CPD calc; use with cpd-none-sweep for 2nd line=none)",
" --rng_strategy=RngStream|MersenneTwister",
" --num_threads=-1 SHGInterface default",
" --number_of_segments=-1",
" --csv=PATH --html=PATH output paths",
" (during a full sweep, CSV/HTML refresh after each cohort)",
"",
"Render HTML only from a prior CSV (no sims; does not load SmokingHistoryGenerator):",
" --from-csv=PATH required columns: cohort_year, mean_sec;",
" optional: min_sec, max_sec, sd_sec, N, runs_per_cohort, immediate_cessation_year",
" --render-input-dir=PATH shown in HTML as input_data_folder when re-rendering",
"",
" --cpd-none-sweep=1 re-time every cohort in existing CSV with cpd_format=none;",
" fills mean_sec_none / sd_sec_none / min/max_none; keeps sparse line;",
" refreshes CSV/HTML after each cohort (amber second line);",
" records bench_iter_sec = wall-clock for sim + CSV + HTML each step",
" --nocpd-sweep=1 re-time every cohort with cpd_format=nocpd (skip CPD calc);",
" fills mean_sec_nocpd / sd/min/max_nocpd; keeps sparse + none columns;",
" --nocpd-last-n=N with nocpd-sweep only: last N cohort years (sorted); omit or 0 = all.",
" After CSV exists, --from-csv re-renders HTML without re-running sims.",
" --chart-wall-iter=1 include slate bench_iter_sec line on chart (default off;",
" csv column still stored for reference)",
"",
"SHG_BENCHMARK_USE_PKGLOAD=1 load package from source via pkgload::load_all(repo root).",
"",
"Charts load docs/vendor/highcharts.js (copied beside the HTML). Offline/file:// safe."
), collapse = "\n")
}
# Fix help: getSrcFilename fails in Rscript; use simple help
if ("--help" %in% commandArgs(trailingOnly = TRUE) || "-h" %in% commandArgs(trailingOnly = TRUE)) {
writeLines(.help_text())
quit(status = 0)
}
tryCatch(main(), error = function(e) {
message(conditionMessage(e))
quit(status = 1)
})
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.