tools/benchmark-cohort-variance.R

#!/usr/bin/env Rscript
# Cohort sweep benchmark: time runSimFromFixedValues per birth cohort, write CSV + HTML chart.
# Spec: tmp/benchmark.md (and legacy SHG_COHORT_BENCH_* env vars).
#
# Usage (from repo root):
#   Rscript tools/benchmark-cohort-variance.R
#   Rscript tools/benchmark-cohort-variance.R --n=100000 --step=10 --runs=2
#   SHG_COHORT_BENCH_CSV=out.csv SHG_COHORT_BENCH_HTML=out.html Rscript tools/benchmark-cohort-variance.R
#
# Re-render HTML only from a saved CSV (no simulations, no SmokingHistoryGenerator load):
#   Rscript tools/benchmark-cohort-variance.R --from-csv=docs/cohort-benchmark-latest.csv
#   Rscript tools/benchmark-cohort-variance.R --from-csv=run.csv --html=out.html --render-input-dir=/path/to/data
#
# Optional: SHG_BENCHMARK_USE_PKGLOAD=1 to pkgload::load_all() from source instead of library().
#
# Charts use Highcharts bundled at docs/vendor/highcharts.js (copied beside the HTML output
# so file:// works offline). Refresh the bundle: see docs/vendor/README.txt

repo_root <- (function() {
  ca <- commandArgs(trailingOnly = FALSE)
  ff <- sub("^--file=", "", ca[grepl("^--file=", ca)][1])
  if (nzchar(ff) && file.exists(ff)) {
    return(normalizePath(file.path(dirname(normalizePath(ff)), ".."), winslash = "/"))
  }
  normalizePath(getwd(), winslash = "/")
})()

parse_cli <- function() {
  a <- commandArgs(trailingOnly = TRUE)
  out <- list(
    n = 1e6,
    step = 5L,
    runs_per_cohort = 3L,
    subdir = "csv-complete",
    input_root = NA_character_,
    year_min = NA_integer_,
    year_max = NA_integer_,
    immediate = 0L,
    cpd_format = "sparse",
    rng_strategy = "RngStream",
    num_threads = -1L,
    number_of_segments = -1L,
    csv = Sys.getenv("SHG_COHORT_BENCH_CSV", ""),
    html = Sys.getenv("SHG_COHORT_BENCH_HTML", ""),
    compare_immediate = Sys.getenv("SHG_COHORT_BENCH_COMPARE_IMMEDIATE_CESSATION", ""),
    from_csv = "",
    render_input_dir = "",
    cpd_none_sweep = "",
    nocpd_sweep = "",
    nocpd_last_n = 0L,
    chart_wall_iter = FALSE,
    help = FALSE,
    with_nocpd = FALSE
  )
  for (x in a) {
    if (x %in% c("-h", "--help")) {
      out$help <- TRUE
      next
    }
    if (!grepl("^--", x)) next
    kv <- sub("^--", "", x)
    if (!grepl("=", kv, fixed = TRUE)) next
    sp <- regexpr("=", kv, fixed = TRUE)
    key <- substr(kv, 1L, sp - 1L)
    val <- substr(kv, sp + 1L, nchar(kv))
    switch(key,
      n = ,
      N = out$n <- as.numeric(val),
      step = out$step <- as.integer(val),
      runs = ,
      runs_per_cohort = out$runs_per_cohort <- as.integer(val),
      subdir = out$subdir <- val,
      input_root = out$input_root <- val,
      year_min = out$year_min <- if (toupper(trimws(val)) %in% c("", "NA")) NA_integer_ else suppressWarnings(as.integer(val)),
      year_max = out$year_max <- if (toupper(trimws(val)) %in% c("", "NA")) NA_integer_ else suppressWarnings(as.integer(val)),
      immediate = out$immediate <- as.integer(val),
      cpd_format = out$cpd_format <- val,
      rng_strategy = out$rng_strategy <- val,
      num_threads = out$num_threads <- as.integer(val),
      number_of_segments = out$number_of_segments <- as.integer(val),
      csv = out$csv <- val,
      html = out$html <- val,
      compare_immediate = out$compare_immediate <- val,
      from_csv = out$from_csv <- val,
      `from-csv` = out$from_csv <- val,
      render_input_dir = out$render_input_dir <- val,
      `render-input-dir` = out$render_input_dir <- val,
      cpd_none_sweep = out$cpd_none_sweep <- val,
      `cpd-none-sweep` = out$cpd_none_sweep <- val,
      nocpd_sweep = out$nocpd_sweep <- val,
      `nocpd-sweep` = out$nocpd_sweep <- val,
      nocpd_last_n = out$nocpd_last_n <- suppressWarnings(as.integer(val)),
      `nocpd-last-n` = out$nocpd_last_n <- suppressWarnings(as.integer(val)),
      chart_wall_iter = out$chart_wall_iter <- (tolower(trimws(val)) %in% c("1", "true", "yes")),
      `chart-wall-iter` = out$chart_wall_iter <- (tolower(trimws(val)) %in% c("1", "true", "yes")),
      with_nocpd = out$with_nocpd <- (tolower(trimws(val)) %in% c("1", "true", "yes")),
      `with-nocpd` = out$with_nocpd <- (tolower(trimws(val)) %in% c("1", "true", "yes")),
      {}
    )
  }
  out
}

cohort_years_from_initiation <- function(init_path) {
  line1 <- readLines(init_path, n = 1L, warn = FALSE)
  cols <- strsplit(line1, ",", fixed = TRUE)[[1L]]
  yr <- suppressWarnings(as.integer(cols))
  yr <- yr[!is.na(yr) & yr >= 1800L & yr <= 2200L]
  sort(unique(yr))
}

filter_years <- function(years, step, year_min, year_max) {
  y <- years[years %% step == 0L]
  if (!is.na(year_min)) y <- y[y >= year_min]
  if (!is.na(year_max)) y <- y[y <= year_max]
  y
}

resolve_input_dir <- function(repo_root, input_root, subdir) {
  candidates <- character()
  if (!is.na(input_root) && nzchar(input_root)) {
    candidates <- c(candidates, normalizePath(file.path(input_root, subdir), winslash = "/", mustWork = FALSE))
  }
  candidates <- c(
    candidates,
    normalizePath(file.path(repo_root, "tests/testdata/NHIS-1965-2018", subdir), winslash = "/", mustWork = FALSE),
    normalizePath(file.path(repo_root, "tests/testdata/NHIS-1965-2018", "csv-partial"), winslash = "/", mustWork = FALSE)
  )
  for (p in candidates) {
    if (file.exists(file.path(p, "initiation.csv"))) return(p)
  }
  stop(
    "Could not find initiation.csv under any of:\n",
    paste0("  ", candidates, collapse = "\n"),
    "\nSet --input_root=... to the folder that contains csv-complete/ and csv-partial/."
  )
}

load_shg_package <- function(repo_root) {
  if (nzchar(Sys.getenv("SHG_BENCHMARK_USE_PKGLOAD", ""))) {
    if (!requireNamespace("pkgload", quietly = TRUE)) {
      stop("Install pkgload or unset SHG_BENCHMARK_USE_PKGLOAD", call. = FALSE)
    }
    pkgload::load_all(repo_root, quiet = TRUE)
  } else {
    library(SmokingHistoryGenerator)
  }
}

run_one_cohort <- function(shg_template, cohort_year, n, runs, cpd_format_override = NULL) {
  secs <- numeric(runs)
  for (r in seq_len(runs)) {
    shg <- new(SHGInterface)
    shg$input_data_folder <- shg_template$input_data_folder
    shg$initiation_filename <- shg_template$initiation_filename
    shg$cessation_filename <- shg_template$cessation_filename
    shg$cpd_filename <- shg_template$cpd_filename
    shg$mortality_filename <- shg_template$mortality_filename
    shg$cpd_format <- if (!is.null(cpd_format_override)) {
      cpd_format_override
    } else {
      shg_template$cpd_format
    }
    shg$rng_strategy <- shg_template$rng_strategy
    shg$num_threads <- shg_template$num_threads
    shg$number_of_segments <- shg_template$number_of_segments
    shg$immediate_cessation_year <- shg_template$immediate_cessation_year
    gc()
    t <- system.time({
      sim_out <- shg$runSimFromFixedValues(as.integer(n), 0L, 0L, as.integer(cohort_year))
      rm(sim_out)
    })
    secs[r] <- unname(t["elapsed"])
  }
  sdv <- if (runs > 1L) stats::sd(secs) else 0
  c(mean = mean(secs), sd = sdv, min = min(secs), max = max(secs))
}

default_html_path <- function(repo_root) {
  d <- file.path(repo_root, "docs")
  if (!dir.exists(d)) dir.create(d, recursive = TRUE)
  file.path(d, "cohort-benchmark-latest.html")
}

default_csv_path <- function(repo_root) {
  d <- file.path(repo_root, "docs")
  if (!dir.exists(d)) dir.create(d, recursive = TRUE)
  file.path(d, "cohort-benchmark-latest.csv")
}

highcharts_vendor_repo_path <- function(repo_root) {
  file.path(repo_root, "docs", "vendor", "highcharts.js")
}

# Copy bundled Highcharts next to the HTML file so relative src=vendor/highcharts.js works
# for file:// and offline (no CDN).
ensure_highcharts_vendor_for_html <- function(repo_root, html_path) {
  src <- normalizePath(highcharts_vendor_repo_path(repo_root), winslash = "/", mustWork = FALSE)
  if (!isTRUE(file.exists(src))) {
    stop(
      "Bundled Highcharts not found at:\n  ", src, "\n",
      "See docs/vendor/README.txt",
      call. = FALSE
    )
  }
  html_path <- normalizePath(html_path, winslash = "/", mustWork = FALSE)
  dest_dir <- file.path(dirname(html_path), "vendor")
  dest <- file.path(dest_dir, "highcharts.js")
  if (!dir.exists(dest_dir)) dir.create(dest_dir, recursive = TRUE)
  src <- normalizePath(src, winslash = "/", mustWork = TRUE)
  dest <- normalizePath(dest, winslash = "/", mustWork = FALSE)
  if (src != dest) file.copy(src, dest, overwrite = TRUE)
  invisible(dest)
}

merge_preserve_cpd_none_from_csv <- function(results, csv_path) {
  none_nm <- c("mean_sec_none", "sd_sec_none", "min_sec_none", "max_sec_none")
  for (nm in none_nm) {
    if (!nm %in% names(results)) results[[nm]] <- NA_real_
  }
  if (!file.exists(csv_path)) return(results)
  old <- utils::read.csv(csv_path, stringsAsFactors = FALSE, check.names = FALSE)
  if (!any(none_nm %in% names(old))) return(results)
  for (i in seq_len(nrow(results))) {
    cy <- results$cohort_year[i]
    imm <- results$immediate_cessation_year[i]
    j <- which(old$cohort_year == cy & old$immediate_cessation_year == imm)
    if (length(j) != 1L) next
    for (nm in none_nm) {
      if (nm %in% names(old)) results[i, nm] <- old[j[1L], nm]
    }
  }
  results
}

merge_preserve_nocpd_from_csv <- function(results, csv_path) {
  noc_nm <- c("mean_sec_nocpd", "sd_sec_nocpd", "min_sec_nocpd", "max_sec_nocpd")
  for (nm in noc_nm) {
    if (!nm %in% names(results)) results[[nm]] <- NA_real_
  }
  if (!file.exists(csv_path)) return(results)
  old <- utils::read.csv(csv_path, stringsAsFactors = FALSE, check.names = FALSE)
  if (!any(noc_nm %in% names(old))) return(results)
  for (i in seq_len(nrow(results))) {
    cy <- results$cohort_year[i]
    imm <- results$immediate_cessation_year[i]
    j <- which(old$cohort_year == cy & old$immediate_cessation_year == imm)
    if (length(j) != 1L) next
    for (nm in noc_nm) {
      if (nm %in% names(old)) results[i, nm] <- old[j[1L], nm]
    }
  }
  results
}

merge_preserve_bench_iter_from_csv <- function(results, csv_path) {
  if (!"bench_iter_sec" %in% names(results)) results$bench_iter_sec <- NA_real_
  if (!file.exists(csv_path)) return(results)
  old <- utils::read.csv(csv_path, stringsAsFactors = FALSE, check.names = FALSE)
  if (!"bench_iter_sec" %in% names(old)) return(results)
  for (i in seq_len(nrow(results))) {
    cy <- results$cohort_year[i]
    imm <- results$immediate_cessation_year[i]
    j <- which(old$cohort_year == cy & old$immediate_cessation_year == imm)
    if (length(j) != 1L) next
    results[i, "bench_iter_sec"] <- old[j[1L], "bench_iter_sec"]
  }
  results
}

normalize_benchmark_csv <- function(df) {
  names(df) <- trimws(names(df))
  req <- c("cohort_year", "mean_sec")
  miss <- setdiff(req, names(df))
  if (length(miss)) {
    stop("CSV missing required column(s): ", paste(miss, collapse = ", "), call. = FALSE)
  }
  if (!"min_sec" %in% names(df)) df$min_sec <- df$mean_sec
  if (!"max_sec" %in% names(df)) df$max_sec <- df$mean_sec
  if (!"sd_sec" %in% names(df)) df$sd_sec <- NA_real_
  if (!"N" %in% names(df)) df$N <- NA_real_
  if (!"runs_per_cohort" %in% names(df)) df$runs_per_cohort <- 1L
  if (!"immediate_cessation_year" %in% names(df)) df$immediate_cessation_year <- 0L
  df$cohort_year <- as.integer(df$cohort_year)
  df$mean_sec <- as.numeric(df$mean_sec)
  df$sd_sec <- as.numeric(df$sd_sec)
  df$min_sec <- as.numeric(df$min_sec)
  df$max_sec <- as.numeric(df$max_sec)
  df$N <- as.numeric(df$N)
  df$runs_per_cohort <- as.integer(df$runs_per_cohort)
  df$immediate_cessation_year <- as.integer(df$immediate_cessation_year)
  none_nm <- c("mean_sec_none", "sd_sec_none", "min_sec_none", "max_sec_none")
  for (nm in none_nm) {
    if (!nm %in% names(df)) df[[nm]] <- NA_real_
  }
  for (nm in none_nm) df[[nm]] <- as.numeric(df[[nm]])
  noc_nm <- c("mean_sec_nocpd", "sd_sec_nocpd", "min_sec_nocpd", "max_sec_nocpd")
  for (nm in noc_nm) {
    if (!nm %in% names(df)) df[[nm]] <- NA_real_
  }
  for (nm in noc_nm) df[[nm]] <- as.numeric(df[[nm]])
  if (!"bench_iter_sec" %in% names(df)) df$bench_iter_sec <- NA_real_
  df$bench_iter_sec <- as.numeric(df$bench_iter_sec)
  df
}

build_meta_from_csv_results <- function(results, opt) {
  n_med <- suppressWarnings(stats::median(results$N, na.rm = TRUE))
  if (!is.finite(n_med)) n_med <- opt$n
  rp_med <- suppressWarnings(stats::median(results$runs_per_cohort, na.rm = TRUE))
  if (!is.finite(rp_med)) rp_med <- opt$runs_per_cohort
  inp <- opt$render_input_dir
  if (!nzchar(inp)) inp <- "(not recorded in CSV; pass --render-input-dir= to annotate)"
  list(
    when = format(Sys.time(), usetz = TRUE),
    source_note = "Chart/table rendered from existing CSV (simulations not re-run).",
    r_version = R.version.string,
    platform = R.version$platform,
    n = n_med,
    runs_per_cohort = as.integer(rp_med),
    input_dir = inp,
    show_bench_iter = isTRUE(opt$chart_wall_iter)
  )
}

render_from_csv <- function(opt) {
  src <- opt$from_csv
  if (!nzchar(src)) stop("Internal: empty from_csv", call. = FALSE)
  path <- normalizePath(src, winslash = "/", mustWork = TRUE)
  raw <- utils::read.csv(path, stringsAsFactors = FALSE, check.names = FALSE)
  results <- normalize_benchmark_csv(raw)
  rownames(results) <- NULL

  html_path <- opt$html
  if (!nzchar(html_path)) html_path <- default_html_path(repo_root)

  ensure_highcharts_vendor_for_html(repo_root, html_path)
  meta <- build_meta_from_csv_results(results, opt)
  writeLines(build_html(repo_root, results, meta), html_path, useBytes = TRUE)
  message("Wrote HTML from CSV:\n  ", html_path, "\nSource: ", path)
  invisible(list(html = html_path, results = results, csv_source = path))
}

abbr_n_display <- function(n) {
  n <- as.numeric(n)
  if (length(n) != 1L || is.na(n) || !is.finite(n)) return("-")
  ni <- suppressWarnings(as.integer(round(n)))
  if (!is.na(ni) && ni == 1000000L) return("1M")
  if (!is.na(ni) && ni == 100000L) return("100k")
  if (n >= 1e6 && abs(n %% 1e6) < 1e-6) return(sprintf("%gM", n / 1e6))
  if (n >= 1e3 && abs(n %% 1e3) < 1e-6) return(sprintf("%gk", n / 1e3))
  format(n, scientific = FALSE, trim = TRUE)
}

# JSON string for embedding in <script> (avoids closing </script> in HTML)
json_string_for_script <- function(s) {
  s <- enc2utf8(as.character(s)[1L])
  s <- gsub("\\", "\\\\", s, fixed = TRUE)
  s <- gsub("\"", "\\\"", s, fixed = TRUE)
  s <- gsub("\n", "\\n", s, fixed = TRUE)
  s <- gsub("\r", "\\r", s, fixed = TRUE)
  s <- gsub("\t", "\\t", s, fixed = TRUE)
  s <- gsub("<", "\\u003c", s, fixed = TRUE)
  paste0("\"", s, "\"")
}

hc_js_num <- function(x) {
  format(as.numeric(x), scientific = FALSE, trim = TRUE, digits = 14)
}

# Per birth-cohort column in initiation/cessation.csv (RACE=0, SEX=0): sum probabilities
# then log1p(sum_init) - log1p(sum_cess). Higher ≈ more initiation mass vs cessation mass
# in the bundled NHIS tables (exploratory; meant to correlate with sim work).
cohort_input_driver_by_year <- function(input_dir) {
  if (!nzchar(input_dir) || !dir.exists(input_dir)) return(NULL)
  ip <- file.path(input_dir, "initiation.csv")
  cp <- file.path(input_dir, "cessation.csv")
  if (!file.exists(ip) || !file.exists(cp)) return(NULL)
  init <- utils::read.csv(ip, check.names = FALSE, stringsAsFactors = FALSE)
  cess <- utils::read.csv(cp, check.names = FALSE, stringsAsFactors = FALSE)
  nm <- toupper(names(init))
  ir <- match("RACE", nm)
  isx <- match("SEX", nm)
  if (is.na(ir) || is.na(isx)) return(NULL)
  init_sub <- init[init[[ir]] == 0 & init[[isx]] == 0, , drop = FALSE]
  cess_sub <- cess[cess[[ir]] == 0 & cess[[isx]] == 0, , drop = FALSE]
  nc <- ncol(init)
  if (nc < 4L) return(NULL)
  out <- numeric()
  labs <- character()
  for (j in 4L:nc) {
    cn <- names(init)[j]
    cy <- suppressWarnings(as.integer(cn))
    if (is.na(cy)) next
    if (!cn %in% names(cess_sub)) next
    isum <- sum(suppressWarnings(as.numeric(init_sub[[cn]])), na.rm = TRUE)
    csum <- sum(suppressWarnings(as.numeric(cess_sub[[cn]])), na.rm = TRUE)
    out <- c(out, log1p(isum) - log1p(csum))
    labs <- c(labs, cn)
  }
  if (!length(out)) return(NULL)
  stats::setNames(out, labs)
}

# Highcharts: vendor/highcharts.js next to the HTML (see ensure_highcharts_vendor_for_html).
# Optional column mean_sec_none enables a second series (CPD none vs sparse baseline mean_sec).
# input_dir: when initiation.csv + cessation.csv exist, adds right-axis driver line (input tables only).
# show_bench_iter: if TRUE, plot bench_iter_sec (CSV/HTML step timing); default FALSE so 3rd curve can be nocpd.
highcharts_cohort_chart <- function(chart_id, df, title, avg_sec, input_dir = "", show_bench_iter = FALSE) {
  df <- df[order(df$cohort_year), , drop = FALSE]
  if (!nrow(df)) return("<p>No data.</p>")

  xmin <- min(df$cohort_year)
  xmax <- max(df$cohort_year)
  has_none_pts <- "mean_sec_none" %in% names(df) &&
    any(is.finite(suppressWarnings(as.numeric(df$mean_sec_none))))

  avg_none_sec <- if (has_none_pts) {
    mean(suppressWarnings(as.numeric(df$mean_sec_none)), na.rm = TRUE)
  } else {
    NA_real_
  }

  has_nocpd_pts <- "mean_sec_nocpd" %in% names(df) &&
    any(is.finite(suppressWarnings(as.numeric(df$mean_sec_nocpd))))

  avg_nocpd_sec <- if (has_nocpd_pts) {
    mean(suppressWarnings(as.numeric(df$mean_sec_nocpd)), na.rm = TRUE)
  } else {
    NA_real_
  }

  has_bench_data <- "bench_iter_sec" %in% names(df) &&
    any(is.finite(suppressWarnings(as.numeric(df$bench_iter_sec))))
  has_wall_pts <- isTRUE(show_bench_iter) && has_bench_data

  drv_all <- cohort_input_driver_by_year(input_dir)
  drv_vals <- if (!is.null(drv_all)) {
    idx <- match(as.character(df$cohort_year), names(drv_all))
    unname(drv_all[idx])
  } else {
    rep(NA_real_, nrow(df))
  }
  has_driver <- any(is.finite(drv_vals))

  ymax <- max(c(df$mean_sec, avg_sec, 1), na.rm = TRUE)
  if (has_none_pts) {
    mn <- suppressWarnings(as.numeric(df$mean_sec_none))
    ymax <- max(c(ymax, mn[is.finite(mn)], avg_none_sec), na.rm = TRUE)
  }
  if (has_nocpd_pts) {
    mn2 <- suppressWarnings(as.numeric(df$mean_sec_nocpd))
    ymax <- max(c(ymax, mn2[is.finite(mn2)], avg_nocpd_sec), na.rm = TRUE)
  }
  if (has_wall_pts) {
    bw <- suppressWarnings(as.numeric(df$bench_iter_sec))
    ymax <- max(c(ymax, bw[is.finite(bw)]), na.rm = TRUE)
  }
  ymax <- ymax * 1.08
  if (!is.finite(ymax) || ymax <= 0) ymax <- 1

  ymax_drv <- if (has_driver) {
    dv <- drv_vals[is.finite(drv_vals)]
    if (!length(dv)) {
      1
    } else {
      r <- range(dv)
      pad <- (r[2] - r[1]) * 0.08 + 1e-6
      r[2] + pad
    }
  } else {
    1
  }
  ymin_drv <- if (has_driver) {
    dv <- drv_vals[is.finite(drv_vals)]
    if (!length(dv)) {
      0
    } else {
      r <- range(dv)
      pad <- (r[2] - r[1]) * 0.08 + 1e-6
      r[1] - pad
    }
  } else {
    0
  }

  pairs_sparse <- paste(
    sprintf("[%s,%s]", df$cohort_year, hc_js_num(df$mean_sec)),
    collapse = ","
  )
  data_sparse_js <- paste0("[", pairs_sparse, "]")

  title_js <- json_string_for_script(title)
  leg_left <- if (has_none_pts || has_nocpd_pts) {
    paste0(
      "Blue = sparse CPD",
      if (has_none_pts) "; amber = cpd_format none (full CPD calc; omit column)" else "",
      if (has_nocpd_pts) "; cyan = nocpd (skip CPD calculation in engine)" else "",
      ".",
      if (has_driver) {
        " Right axis: input driver = log1p(sum initiation) - log1p(sum cessation), RACE=0 SEX=0 (exploratory)."
      } else {
        ""
      }
    )
  } else if (has_driver) {
    "Right axis: input driver = log1p(sum initiation) - log1p(sum cessation), RACE=0 SEX=0 (exploratory)."
  } else {
    ""
  }
  if (has_wall_pts) {
    leg_left <- paste0(
      leg_left,
      if (nzchar(leg_left)) " " else "",
      "Slate = wall-clock per cohort step (timed simulation(s) + write CSV + refresh HTML)."
    )
  }
  sub_txt <- if (nzchar(leg_left)) {
    paste0("Left y-axis: wall-clock seconds (mean of timed runs). ", leg_left)
  } else {
    "Wall-clock seconds per cohort (mean of timed runs)"
  }
  subtitle_js <- json_string_for_script(sub_txt)
  cid <- html_escape(chart_id)

  sparse_series <- paste0(
    "{\n",
    "    type: \"line\",\n",
    "    yAxis: 0,\n",
    if (has_none_pts || has_nocpd_pts) {
      "    name: \"CPD sparse (default)\",\n"
    } else {
      "    name: \"Mean (wall-clock s)\",\n"
    },
    "    data: ", data_sparse_js, ",\n",
    "    color: \"#2563eb\",\n",
    "    lineWidth: 2.5,\n",
    "    marker: {\n",
    "      enabled: true, radius: 4.5,\n",
    "      lineWidth: 2, lineColor: \"#1d4ed8\", fillColor: \"#ffffff\"\n",
    "    }\n",
    "  }"
  )

  none_series <- ""
  if (has_none_pts) {
    idx <- which(is.finite(suppressWarnings(as.numeric(df$mean_sec_none))))
    pairs_none <- paste(
      sprintf("[%s,%s]", df$cohort_year[idx], hc_js_num(df$mean_sec_none[idx])),
      collapse = ","
    )
    data_none_js <- paste0("[", pairs_none, "]")
    none_series <- paste0(
      ", {\n",
      "    type: \"line\",\n",
      "    yAxis: 0,\n",
      "    name: \"CPD none (omit column, full calc)\",\n",
      "    data: ", data_none_js, ",\n",
      "    color: \"#d97706\",\n",
      "    lineWidth: 2.5,\n",
      "    marker: {\n",
      "      enabled: true, radius: 4.5,\n",
      "      lineWidth: 2, lineColor: \"#b45309\", fillColor: \"#ffffff\"\n",
      "    }\n",
      "  }"
    )
  }

  nocpd_series <- ""
  if (has_nocpd_pts) {
    idxn <- which(is.finite(suppressWarnings(as.numeric(df$mean_sec_nocpd))))
    pairs_nocpd <- paste(
      sprintf("[%s,%s]", df$cohort_year[idxn], hc_js_num(df$mean_sec_nocpd[idxn])),
      collapse = ","
    )
    data_nocpd_js <- paste0("[", pairs_nocpd, "]")
    nocpd_series <- paste0(
      ", {\n",
      "    type: \"line\",\n",
      "    yAxis: 0,\n",
      "    name: \"CPD nocpd (skip CPD calc)\",\n",
      "    data: ", data_nocpd_js, ",\n",
      "    color: \"#0891b2\",\n",
      "    lineWidth: 2.5,\n",
      "    marker: {\n",
      "      enabled: true, radius: 4.5,\n",
      "      lineWidth: 2, lineColor: \"#0e7490\", fillColor: \"#ffffff\"\n",
      "    }\n",
      "  }"
    )
  }

  wall_series <- ""
  if (has_wall_pts) {
    idxw <- which(is.finite(suppressWarnings(as.numeric(df$bench_iter_sec))))
    pairs_wall <- paste(
      sprintf("[%s,%s]", df$cohort_year[idxw], hc_js_num(df$bench_iter_sec[idxw])),
      collapse = ","
    )
    data_wall_js <- paste0("[", pairs_wall, "]")
    wall_series <- paste0(
      ", {\n",
      "    type: \"line\",\n",
      "    yAxis: 0,\n",
      "    name: \"Wall clock (cohort step)\",\n",
      "    data: ", data_wall_js, ",\n",
      "    color: \"#475569\",\n",
      "    dashStyle: \"ShortDash\",\n",
      "    lineWidth: 2,\n",
      "    marker: {\n",
      "      enabled: true, radius: 3,\n",
      "      lineWidth: 1.5, lineColor: \"#334155\", fillColor: \"#ffffff\"\n",
      "    }\n",
      "  }"
    )
  }

  driver_series <- ""
  if (has_driver) {
    pairs_drv <- paste(
      sprintf("[%s,%s]", df$cohort_year, hc_js_num(drv_vals)),
      collapse = ","
    )
    data_drv_js <- paste0("[", pairs_drv, "]")
    driver_series <- paste0(
      ", {\n",
      "    type: \"line\",\n",
      "    yAxis: 1,\n",
      "    name: \"Input driver (init vs cess mass)\",\n",
      "    data: ", data_drv_js, ",\n",
      "    color: \"#7c3aed\",\n",
      "    dashStyle: \"ShortDot\",\n",
      "    lineWidth: 2,\n",
      "    marker: { enabled: false },\n",
      "    tooltip: { valueDecimals: 3 }\n",
      "  }"
    )
  }

  # Left-axis plotLines: sparse avg, optional none avg, 1 s reference
  pl_sparse <- paste0(
    "      {\n",
    "        value: avg, color: \"#16a34a\", dashStyle: \"Dash\", width: 2, zIndex: 3,\n",
    "        label: {\n",
    "          text: ", json_string_for_script(sprintf("sparse avg %.2fs", avg_sec)), ",\n",
    "          align: \"right\", style: { color: \"#16a34a\", fontWeight: \"600\" }\n",
    "        }\n",
    "      }"
  )
  pl_none <- if (has_none_pts && is.finite(avg_none_sec)) {
    paste0(
      ",\n",
      "      {\n",
      "        value: avgNone, color: \"#d97706\", dashStyle: \"Dash\", width: 2, zIndex: 3,\n",
      "        label: {\n",
      "          text: ", json_string_for_script(sprintf("none avg %.2fs", avg_none_sec)), ",\n",
      "          align: \"right\", style: { color: \"#d97706\", fontWeight: \"600\" }\n",
      "        }\n",
      "      }"
    )
  } else {
    ""
  }
  pl_nocpd <- if (has_nocpd_pts && is.finite(avg_nocpd_sec)) {
    paste0(
      ",\n",
      "      {\n",
      "        value: avgNocpd, color: \"#0891b2\", dashStyle: \"Dash\", width: 2, zIndex: 3,\n",
      "        label: {\n",
      "          text: ", json_string_for_script(sprintf("nocpd avg %.2fs", avg_nocpd_sec)), ",\n",
      "          align: \"right\", style: { color: \"#0891b2\", fontWeight: \"600\" }\n",
      "        }\n",
      "      }"
    )
  } else {
    ""
  }
  pl_one <- paste0(
    ",\n",
    "      {\n",
    "        value: 1, color: \"#64748b\", dashStyle: \"ShortDash\", width: 1.5, zIndex: 3,\n",
    "        label: { text: \"1 s\", align: \"right\", style: { color: \"#64748b\" } }\n",
    "      }"
  )
  plot_lines_left <- paste0(pl_sparse, pl_none, pl_nocpd, pl_one)

  yaxis_block <- if (has_driver) {
    paste0(
      "  yAxis: [\n",
      "    {\n",
      "      title: { text: \"Wall-clock seconds\" },\n",
      "      min: 0,\n",
      "      max: yMax,\n",
      "      gridLineColor: \"#e2e8f0\",\n",
      "      plotLines: [\n",
      plot_lines_left,
      "\n",
      "      ]\n",
      "    },\n",
      "    {\n",
      "      title: { text: \"Input driver (log mass gap)\", style: { color: \"#7c3aed\" } },\n",
      "      opposite: true,\n",
      "      min: drvMin,\n",
      "      max: drvMax,\n",
      "      gridLineWidth: 0,\n",
      "      labels: { style: { color: \"#7c3aed\" } }\n",
      "    }\n",
      "  ],\n"
    )
  } else {
    paste0(
      "  yAxis: {\n",
      "    title: { text: \"Wall-clock seconds\" },\n",
      "    min: 0,\n",
      "    max: yMax,\n",
      "    gridLineColor: \"#e2e8f0\",\n",
      "    plotLines: [\n",
      plot_lines_left,
      "\n",
      "    ]\n",
      "  },\n"
    )
  }

  paste0(
    '<div id="', cid, '" class="shg-hc-chart" style="width:100%;max-width:920px;height:460px;margin:12px 0;',
    'border:1px solid #e5e7eb;background:#fafafa"></div>\n',
    "<script>\n",
    "(function(){\n",
    "var avg = ", hc_js_num(avg_sec), ";\n",
    if (has_none_pts && is.finite(avg_none_sec)) {
      paste0("var avgNone = ", hc_js_num(avg_none_sec), ";\n")
    } else {
      ""
    },
    if (has_nocpd_pts && is.finite(avg_nocpd_sec)) {
      paste0("var avgNocpd = ", hc_js_num(avg_nocpd_sec), ";\n")
    } else {
      ""
    },
    "var yMax = ", hc_js_num(ymax), ";\n",
    if (has_driver) {
      paste0(
        "var drvMin = ", hc_js_num(ymin_drv), ", drvMax = ", hc_js_num(ymax_drv), ";\n"
      )
    } else {
      ""
    },
    "var xMin = ", as.integer(xmin), ", xMax = ", as.integer(xmax), ";\n",
    "var titleText = ", title_js, ";\n",
    "var subtitleText = ", subtitle_js, ";\n",
    "if (typeof Highcharts === \"undefined\") {\n",
    "  document.getElementById(\"", cid, "\").innerHTML = ",
    "    '<p style=\"color:#b91c1c\">Highcharts failed to load. Ensure vendor/highcharts.js exists next to this HTML (re-run the benchmark script).</p>';\n",
    "  return;\n",
    "}\n",
    "Highcharts.chart(\"", cid, "\", {\n",
    "  chart: { type: \"line\", backgroundColor: \"#fafafa\" },\n",
    "  title: { text: titleText },\n",
    "  subtitle: { text: subtitleText, style: { color: \"#64748b\" } },\n",
    "  credits: { enabled: false },\n",
    "  legend: { enabled: true },\n",
    "  xAxis: {\n",
    "    title: { text: \"Birth cohort year\" },\n",
    "    min: xMin, max: xMax,\n",
    "    tickInterval: 5,\n",
    "    gridLineWidth: 1,\n",
    "    gridLineColor: \"#e2e8f0\"\n",
    "  },\n",
    yaxis_block,
    "  series: [", sparse_series, none_series, nocpd_series, wall_series, driver_series, "],\n",
    "  tooltip: { shared: true, valueDecimals: 2 }\n",
    "});\n",
    "})();\n",
    "</script>\n"
  )
}

html_escape <- function(x) {
  x <- gsub("&", "&amp;", x, fixed = TRUE)
  x <- gsub("<", "&lt;", x, fixed = TRUE)
  x <- gsub(">", "&gt;", x, fixed = TRUE)
  x <- gsub("\"", "&quot;", x, fixed = TRUE)
  x
}

chart_sections_html <- function(results, meta) {
  imm_vals <- sort(unique(results$immediate_cessation_year))
  parts <- character(length(imm_vals))
  multi_phase <- length(imm_vals) > 1L
  for (ii in seq_along(imm_vals)) {
    imm <- imm_vals[ii]
    sub <- results[results$immediate_cessation_year == imm, , drop = FALSE]
    avg_sec <- mean(sub$mean_sec)
    tit <- if (!multi_phase) {
      sprintf("N=%s | %d timed run(s) averaged per cohort", abbr_n_display(meta$n), meta$runs_per_cohort)
    } else {
      sprintf(
        "Policy cessation year %s | N=%s | %d run(s)/cohort",
        imm, abbr_n_display(meta$n), meta$runs_per_cohort
      )
    }
    head_html <- if (multi_phase) paste0("<h3>", html_escape(as.character(imm)), "</h3>") else ""
    chart_cols <- c("cohort_year", "mean_sec")
    if ("mean_sec_none" %in% names(sub)) chart_cols <- c(chart_cols, "mean_sec_none")
    if ("mean_sec_nocpd" %in% names(sub)) chart_cols <- c(chart_cols, "mean_sec_nocpd")
    if ("bench_iter_sec" %in% names(sub) && isTRUE(meta$show_bench_iter)) {
      chart_cols <- c(chart_cols, "bench_iter_sec")
    }
    idir <- if (is.null(meta$input_dir)) "" else as.character(meta$input_dir)[1L]
    parts[ii] <- paste0(
      head_html,
      highcharts_cohort_chart(
        sprintf("shg-cohort-chart-%d", ii),
        sub[, chart_cols, drop = FALSE],
        tit,
        avg_sec,
        if (nzchar(idir) && !grepl("^\\(not recorded", idir)) idir else "",
        isTRUE(meta$show_bench_iter)
      )
    )
  }
  paste(parts, collapse = "\n")
}

build_html <- function(repo_root, results, meta) {
  nu <- length(unique(results$cohort_year))
  has_none_summary <- "mean_sec_none" %in% names(results) &&
    any(is.finite(suppressWarnings(as.numeric(results$mean_sec_none))))
  has_nocpd_summary <- "mean_sec_nocpd" %in% names(results) &&
    any(is.finite(suppressWarnings(as.numeric(results$mean_sec_nocpd))))
  avg_line <- if (has_none_summary || has_nocpd_summary) {
    parts <- list(sprintf(
      "Sparse CPD — avg cohort mean: <strong>%.2f</strong> s (min %.2f, max %.2f)",
      mean(results$mean_sec),
      min(results$mean_sec),
      max(results$mean_sec)
    ))
    if (has_none_summary) {
      mn <- suppressWarnings(as.numeric(results$mean_sec_none))
      parts[[length(parts) + 1L]] <- sprintf(
        "CPD none — avg: <strong>%.2f</strong> s (min %.2f, max %.2f)",
        mean(mn, na.rm = TRUE),
        suppressWarnings(min(mn, na.rm = TRUE)),
        suppressWarnings(max(mn, na.rm = TRUE))
      )
    }
    if (has_nocpd_summary) {
      m2 <- suppressWarnings(as.numeric(results$mean_sec_nocpd))
      parts[[length(parts) + 1L]] <- sprintf(
        "nocpd (skip CPD trajectory) — avg: <strong>%.2f</strong> s (min %.2f, max %.2f)",
        mean(m2, na.rm = TRUE),
        suppressWarnings(min(m2, na.rm = TRUE)),
        suppressWarnings(max(m2, na.rm = TRUE))
      )
    }
    paste0(
      paste(unlist(parts), collapse = " | "),
      sprintf(" | distinct cohort years: %d | N=%s", nu, abbr_n_display(meta$n))
    )
  } else {
    sprintf(
      paste0(
        "Average across cohort means: <strong>%.2f</strong> s | min mean: %.2f s | ",
        "max mean: %.2f s | distinct cohort years: %d | N=%s"
      ),
      mean(results$mean_sec),
      min(results$mean_sec),
      max(results$mean_sec),
      nu,
      abbr_n_display(meta$n)
    )
  }

  cpd_table <- '
<h2>Reference: CPD format (1M individuals, ~12 cores)</h2>
<table border="1" cellpadding="6" cellspacing="0" style="border-collapse:collapse;font-family:system-ui,sans-serif;font-size:14px;">
<thead><tr><th>Format</th><th>Time</th><th>Notes</th></tr></thead>
<tbody>
<tr><td><code>none</code></td><td>vs sparse</td><td>Full CPD trajectory in C++; no <code>cigarettes_per_day</code> column (output only)</td></tr>
<tr><td><code>nocpd</code></td><td>faster</td><td>Skips CPD trajectory in engine (benchmark third line vs sparse/none)</td></tr>
<tr><td><code>sparse</code></td><td>~1.3s</td><td>Default, recommended</td></tr>
<tr><td><code>legacy</code></td><td>~1.5s</td><td>Backwards compatibility</td></tr>
</tbody></table>
<p style="color:#64748b;font-size:13px;">Numbers are indicative; this run&apos;s hardware and <code>num_threads</code> dominate.</p>
'

  chart_block <- chart_sections_html(results, meta)

  show_imm <- length(unique(results$immediate_cessation_year)) > 1L
  base_cn <- if (show_imm) {
    c("immediate_cessation_year", "cohort_year", "N", "runs_per_cohort",
      "mean_sec", "sd_sec", "min_sec", "max_sec")
  } else {
    c("cohort_year", "N", "runs_per_cohort", "mean_sec", "sd_sec", "min_sec", "max_sec")
  }
  none_cn <- intersect(
    c("mean_sec_none", "sd_sec_none", "min_sec_none", "max_sec_none"),
    names(results)
  )
  nocpd_cn <- intersect(
    c("mean_sec_nocpd", "sd_sec_nocpd", "min_sec_nocpd", "max_sec_nocpd"),
    names(results)
  )
  bench_cn <- intersect(c("bench_iter_sec"), names(results))
  col_names <- c(base_cn, none_cn, nocpd_cn, bench_cn)
  num_cols <- intersect(
    col_names,
    c(
      "cohort_year", "N", "runs_per_cohort", "mean_sec", "sd_sec", "min_sec", "max_sec",
      "mean_sec_none", "sd_sec_none", "min_sec_none", "max_sec_none",
      "mean_sec_nocpd", "sd_sec_nocpd", "min_sec_nocpd", "max_sec_nocpd",
      "bench_iter_sec"
    )
  )

  fmt_tbl_cell <- function(nm, v) {
    if (nm == "N") return(abbr_n_display(v))
    if (nm %in% c("runs_per_cohort", "cohort_year", "immediate_cessation_year")) {
      vn <- suppressWarnings(as.numeric(v))
      if (length(vn) != 1L || is.na(vn) || !is.finite(vn)) return("-")
      return(as.character(as.integer(round(vn))))
    }
    vn <- suppressWarnings(as.numeric(v))
    if (length(vn) != 1L || is.na(vn) || !is.finite(vn)) return("-")
    sprintf("%.2f", vn)
  }

  th_align <- function(nm) {
    if (nm %in% num_cols) " style=\"text-align:right\"" else ""
  }
  td_align <- function(nm) {
    if (nm %in% num_cols) " style=\"text-align:right\"" else ""
  }

  thead <- paste(vapply(col_names, function(nm) {
    paste0("<th", th_align(nm), ">", nm, "</th>")
  }, character(1L)), collapse = "")

  tbody <- paste(vapply(seq_len(nrow(results)), function(i) {
    row <- results[i, , drop = TRUE]
    tds <- vapply(col_names, function(nm) {
      paste0("<td", td_align(nm), ">", html_escape(fmt_tbl_cell(nm, row[[nm]])), "</td>")
    }, character(1L))
    paste0("<tr>", paste(tds, collapse = ""), "</tr>")
  }, character(1L)), collapse = "")

  tbl <- paste0(
    "<table border=\"1\" cellpadding=\"6\" cellspacing=\"0\" ",
    "style=\"border-collapse:collapse;font-family:system-ui,sans-serif;font-size:13px;width:100%%\">",
    "<thead><tr>", thead, "</tr></thead><tbody>",
    tbody,
    "</tbody></table>"
  )

  paste0(
    "<!DOCTYPE html><html><head><meta charset=\"utf-8\"><title>SHG cohort benchmark</title>",
    "<style>body{font-family:system-ui,sans-serif;margin:24px;max-width:1100px;line-height:1.45}</style>",
    "<script src=\"vendor/highcharts.js\"></script>",
    "</head><body>",
    "<h1>SmokingHistoryGenerator - cohort sweep benchmark</h1>",
    "<p>", html_escape(meta$when), " | ", html_escape(meta$r_version), " | ",
    html_escape(meta$platform), "</p>",
    if (isTRUE(nzchar(meta$source_note))) {
      paste0("<p><em>", html_escape(meta$source_note), "</em></p>")
    } else "",
    "<p>", avg_line, "</p>",
    "<p><code>input_data_folder</code>: ", html_escape(meta$input_dir), "</p>",
    cpd_table,
    "<h2>Chart (mean seconds by cohort year)</h2>",
    chart_block,
    "<h2>Per-cohort table</h2>",
    tbl,
    "</body></html>"
  )
}

main_cpd_none_sweep <- function(opt) {
  load_shg_package(repo_root)

  csv_path <- opt$csv
  if (!nzchar(csv_path)) csv_path <- default_csv_path(repo_root)
  html_path <- opt$html
  if (!nzchar(html_path)) html_path <- default_html_path(repo_root)

  if (!file.exists(csv_path)) {
    stop("CPD none sweep: CSV not found:\n  ", csv_path, call. = FALSE)
  }
  baseline <- utils::read.csv(csv_path, stringsAsFactors = FALSE, check.names = FALSE)
  names(baseline) <- trimws(names(baseline))
  none_nm <- c("mean_sec_none", "sd_sec_none", "min_sec_none", "max_sec_none")
  for (nm in none_nm) {
    if (!nm %in% names(baseline)) baseline[[nm]] <- NA_real_
  }
  if (!"bench_iter_sec" %in% names(baseline)) baseline$bench_iter_sec <- NA_real_

  input_dir <- resolve_input_dir(repo_root, opt$input_root, opt$subdir)
  imm <- opt$immediate
  rows <- baseline$immediate_cessation_year == imm
  years <- sort(unique(as.integer(baseline$cohort_year[rows])))
  if (!length(years)) {
    stop("CPD none sweep: no cohort_year rows for immediate_cessation_year=", imm, call. = FALSE)
  }

  ensure_highcharts_vendor_for_html(repo_root, html_path)
  bench_started <- Sys.time()

  shg_template <- new(SHGInterface)
  shg_template$input_data_folder <- input_dir
  shg_template$initiation_filename <- "initiation.csv"
  shg_template$cessation_filename <- "cessation.csv"
  shg_template$cpd_filename <- "cpd.csv"
  shg_template$mortality_filename <- "acm.csv"
  shg_template$cpd_format <- "none"
  shg_template$rng_strategy <- opt$rng_strategy
  shg_template$num_threads <- opt$num_threads
  shg_template$number_of_segments <- opt$number_of_segments
  shg_template$immediate_cessation_year <- imm

  message(
    "CPD=none sweep: ",
    length(years),
    " cohort(s), ",
    opt$runs_per_cohort,
    " run(s)/cohort — baseline sparse line preserved in mean_sec"
  )

  for (cy in years) {
    message("  cohort_year=", cy, " (cpd_format=none) ... ", appendLF = FALSE)
    st <- run_one_cohort(shg_template, cy, opt$n, opt$runs_per_cohort)
    message(sprintf("%.3f s (mean of %d runs)", st["mean"], opt$runs_per_cohort))
    idx <- which(
      baseline$cohort_year == cy & baseline$immediate_cessation_year == imm
    )
    if (!length(idx)) {
      stop("CPD none sweep: cohort_year ", cy, " not in baseline CSV", call. = FALSE)
    }
    meta_partial <- list(
      when = format(Sys.time(), usetz = TRUE),
      source_note = sprintf(
        paste0(
          "Second line (amber): cpd_format=none sweep in progress (started %s). ",
          "Blue line unchanged (sparse CPD). Refresh to update."
        ),
        format(bench_started, usetz = TRUE)
      ),
      r_version = R.version.string,
      platform = R.version$platform,
      n = opt$n,
      runs_per_cohort = opt$runs_per_cohort,
      input_dir = input_dir,
      show_bench_iter = isTRUE(opt$chart_wall_iter)
    )
    tw <- system.time({
      baseline[idx[1L], "mean_sec_none"] <- st["mean"]
      baseline[idx[1L], "sd_sec_none"] <- st["sd"]
      baseline[idx[1L], "min_sec_none"] <- st["min"]
      baseline[idx[1L], "max_sec_none"] <- st["max"]
      utils::write.csv(baseline, csv_path, row.names = FALSE)
      writeLines(build_html(repo_root, baseline, meta_partial), html_path, useBytes = TRUE)
    })
    baseline[idx[1L], "bench_iter_sec"] <- unname(tw["elapsed"])
    utils::write.csv(baseline, csv_path, row.names = FALSE)
    writeLines(build_html(repo_root, baseline, meta_partial), html_path, useBytes = TRUE)
    message(
      "  → refreshed ",
      basename(csv_path),
      " + ",
      basename(html_path),
      " (",
      sum(is.finite(as.numeric(baseline$mean_sec_none))),
      " none rows); bench_iter_sec=",
      sprintf("%.3f", baseline[idx[1L], "bench_iter_sec"])
    )
  }

  meta <- list(
    when = format(Sys.time(), usetz = TRUE),
    source_note = NA_character_,
    r_version = R.version.string,
    platform = R.version$platform,
    n = opt$n,
    runs_per_cohort = opt$runs_per_cohort,
    input_dir = input_dir,
    show_bench_iter = isTRUE(opt$chart_wall_iter)
  )
  writeLines(build_html(repo_root, baseline, meta), html_path, useBytes = TRUE)
  message("\nCPD none sweep complete.\nFinal CSV: ", csv_path, "\nFinal HTML: ", html_path)
  invisible(list(csv = csv_path, html = html_path, results = baseline))
}

main_nocpd_sweep <- function(opt) {
  load_shg_package(repo_root)

  csv_path <- opt$csv
  if (!nzchar(csv_path)) csv_path <- default_csv_path(repo_root)
  html_path <- opt$html
  if (!nzchar(html_path)) html_path <- default_html_path(repo_root)

  if (!file.exists(csv_path)) {
    stop("nocpd sweep: CSV not found:\n  ", csv_path, call. = FALSE)
  }
  baseline <- utils::read.csv(csv_path, stringsAsFactors = FALSE, check.names = FALSE)
  names(baseline) <- trimws(names(baseline))
  noc_nm <- c("mean_sec_nocpd", "sd_sec_nocpd", "min_sec_nocpd", "max_sec_nocpd")
  for (nm in noc_nm) {
    if (!nm %in% names(baseline)) baseline[[nm]] <- NA_real_
  }
  if (!"bench_iter_sec" %in% names(baseline)) baseline$bench_iter_sec <- NA_real_

  input_dir <- resolve_input_dir(repo_root, opt$input_root, opt$subdir)
  imm <- opt$immediate
  rows <- baseline$immediate_cessation_year == imm
  years <- sort(unique(as.integer(baseline$cohort_year[rows])))
  if (!length(years)) {
    stop("nocpd sweep: no cohort_year rows for immediate_cessation_year=", imm, call. = FALSE)
  }
  nlim <- as.integer(opt$nocpd_last_n)[1L]
  if (length(nlim) == 1L && !is.na(nlim) && nlim > 0L) {
    if (length(years) > nlim) {
      years <- years[(length(years) - nlim + 1L):length(years)]
    }
    message(
      "nocpd sweep: limiting to last ", length(years), " cohort year(s) (--nocpd-last-n=", nlim, ")"
    )
  }

  ensure_highcharts_vendor_for_html(repo_root, html_path)
  bench_started <- Sys.time()

  shg_template <- new(SHGInterface)
  shg_template$input_data_folder <- input_dir
  shg_template$initiation_filename <- "initiation.csv"
  shg_template$cessation_filename <- "cessation.csv"
  shg_template$cpd_filename <- "cpd.csv"
  shg_template$mortality_filename <- "acm.csv"
  shg_template$cpd_format <- "nocpd"
  shg_template$rng_strategy <- opt$rng_strategy
  shg_template$num_threads <- opt$num_threads
  shg_template$number_of_segments <- opt$number_of_segments
  shg_template$immediate_cessation_year <- imm

  message(
    "CPD=nocpd sweep: ",
    length(years),
    " cohort(s), ",
    opt$runs_per_cohort,
    " run(s)/cohort — sparse + none lines unchanged in CSV"
  )

  for (cy in years) {
    message("  cohort_year=", cy, " (cpd_format=nocpd) ... ", appendLF = FALSE)
    st <- run_one_cohort(shg_template, cy, opt$n, opt$runs_per_cohort)
    message(sprintf("%.3f s (mean of %d runs)", st["mean"], opt$runs_per_cohort))
    idx <- which(
      baseline$cohort_year == cy & baseline$immediate_cessation_year == imm
    )
    if (!length(idx)) {
      stop("nocpd sweep: cohort_year ", cy, " not in baseline CSV", call. = FALSE)
    }
    meta_partial <- list(
      when = format(Sys.time(), usetz = TRUE),
      source_note = sprintf(
        paste0(
          "Third line (cyan): cpd_format=nocpd sweep in progress (started %s). ",
          "Sparse + none columns unchanged. Refresh to update."
        ),
        format(bench_started, usetz = TRUE)
      ),
      r_version = R.version.string,
      platform = R.version$platform,
      n = opt$n,
      runs_per_cohort = opt$runs_per_cohort,
      input_dir = input_dir,
      show_bench_iter = isTRUE(opt$chart_wall_iter)
    )
    tw <- system.time({
      baseline[idx[1L], "mean_sec_nocpd"] <- st["mean"]
      baseline[idx[1L], "sd_sec_nocpd"] <- st["sd"]
      baseline[idx[1L], "min_sec_nocpd"] <- st["min"]
      baseline[idx[1L], "max_sec_nocpd"] <- st["max"]
      utils::write.csv(baseline, csv_path, row.names = FALSE)
      writeLines(build_html(repo_root, baseline, meta_partial), html_path, useBytes = TRUE)
    })
    baseline[idx[1L], "bench_iter_sec"] <- unname(tw["elapsed"])
    utils::write.csv(baseline, csv_path, row.names = FALSE)
    writeLines(build_html(repo_root, baseline, meta_partial), html_path, useBytes = TRUE)
    message(
      "  → refreshed ",
      basename(csv_path),
      " + ",
      basename(html_path),
      " (",
      sum(is.finite(as.numeric(baseline$mean_sec_nocpd))),
      " nocpd rows); bench_iter_sec=",
      sprintf("%.3f", baseline[idx[1L], "bench_iter_sec"])
    )
  }

  meta <- list(
    when = format(Sys.time(), usetz = TRUE),
    source_note = NA_character_,
    r_version = R.version.string,
    platform = R.version$platform,
    n = opt$n,
    runs_per_cohort = opt$runs_per_cohort,
    input_dir = input_dir,
    show_bench_iter = isTRUE(opt$chart_wall_iter)
  )
  writeLines(build_html(repo_root, baseline, meta), html_path, useBytes = TRUE)
  message("\nnocpd sweep complete.\nFinal CSV: ", csv_path, "\nFinal HTML: ", html_path)
  invisible(list(csv = csv_path, html = html_path, results = baseline))
}

main <- function() {
  opt <- parse_cli()

  if (nzchar(opt$from_csv)) {
    return(render_from_csv(opt))
  }

  if (nzchar(opt$nocpd_sweep) &&
      tolower(trimws(opt$nocpd_sweep)) %in% c("1", "true", "yes")) {
    return(main_nocpd_sweep(opt))
  }

  if (nzchar(opt$cpd_none_sweep) &&
      tolower(trimws(opt$cpd_none_sweep)) %in% c("1", "true", "yes")) {
    return(main_cpd_none_sweep(opt))
  }

  load_shg_package(repo_root)

  input_dir <- resolve_input_dir(repo_root, opt$input_root, opt$subdir)
  init_path <- file.path(input_dir, "initiation.csv")
  years_all <- cohort_years_from_initiation(init_path)
  ym <- opt$year_min
  yx <- opt$year_max
  if (length(ym) != 1L) ym <- NA_integer_
  if (length(yx) != 1L) yx <- NA_integer_
  years <- filter_years(years_all, opt$step, ym, yx)
  if (!length(years)) stop("No cohort years after filtering; relax step or year bounds.")

  phases <- list(list(immediate = opt$immediate, label = "primary"))
  if (nzchar(opt$compare_immediate)) {
    ci <- suppressWarnings(as.integer(opt$compare_immediate))
    if (is.na(ci)) stop("'compare_immediate' must be an integer year.", call. = FALSE)
    phases <- c(phases, list(list(
      immediate = ci,
      label = paste0("compare_", ci)
    )))
  }

  csv_path <- opt$csv
  if (!nzchar(csv_path)) csv_path <- default_csv_path(repo_root)
  html_path <- opt$html
  if (!nzchar(html_path)) html_path <- default_html_path(repo_root)
  ensure_highcharts_vendor_for_html(repo_root, html_path)
  bench_started <- Sys.time()

  results_list <- list()
  for (ph in phases) {
    shg_template <- new(SHGInterface)
    shg_template$input_data_folder <- input_dir
    shg_template$initiation_filename <- "initiation.csv"
    shg_template$cessation_filename <- "cessation.csv"
    shg_template$cpd_filename <- "cpd.csv"
    shg_template$mortality_filename <- "acm.csv"
    shg_template$cpd_format <- opt$cpd_format
    shg_template$rng_strategy <- opt$rng_strategy
    shg_template$num_threads <- opt$num_threads
    shg_template$number_of_segments <- opt$number_of_segments
    shg_template$immediate_cessation_year <- ph$immediate

    message("Phase immediate_cessation_year=", ph$immediate, " (", ph$label, ")")
    for (cy in years) {
      message("  cohort_year=", cy, " ... ", appendLF = FALSE)
      st <- run_one_cohort(shg_template, cy, opt$n, opt$runs_per_cohort)
      message(sprintf("%.3f s (mean of %d runs)", st["mean"], opt$runs_per_cohort))
      row <- data.frame(
        immediate_cessation_year = ph$immediate,
        cohort_year = cy,
        N = as.numeric(opt$n),
        runs_per_cohort = opt$runs_per_cohort,
        mean_sec = st["mean"],
        sd_sec = st["sd"],
        min_sec = st["min"],
        max_sec = st["max"],
        bench_iter_sec = NA_real_,
        stringsAsFactors = FALSE
      )
      if (isTRUE(opt$with_nocpd)) {
        message("    cpd_format=nocpd (skip CPD trajectory) ... ", appendLF = FALSE)
        st_n <- run_one_cohort(shg_template, cy, opt$n, opt$runs_per_cohort, "nocpd")
        message(sprintf("%.3f s (mean of %d runs)", st_n["mean"], opt$runs_per_cohort))
        row$mean_sec_nocpd <- st_n["mean"]
        row$sd_sec_nocpd <- st_n["sd"]
        row$min_sec_nocpd <- st_n["min"]
        row$max_sec_nocpd <- st_n["max"]
      }
      tw <- system.time({
        results_list[[length(results_list) + 1L]] <- row
        results_partial <- do.call(rbind, results_list)
        rownames(results_partial) <- NULL
        results_partial <- merge_preserve_cpd_none_from_csv(results_partial, csv_path)
        results_partial <- merge_preserve_nocpd_from_csv(results_partial, csv_path)
        results_partial <- merge_preserve_bench_iter_from_csv(results_partial, csv_path)
      })
      ni <- nrow(results_partial)
      results_partial$bench_iter_sec[ni] <- unname(tw["elapsed"])
      results_list[[ni]]$bench_iter_sec <- results_partial$bench_iter_sec[ni]
      utils::write.csv(results_partial, csv_path, row.names = FALSE)
      meta_partial <- list(
        when = format(Sys.time(), usetz = TRUE),
        source_note = sprintf(
          "Benchmark in progress (started %s). Partial results — refresh the page to update.",
          format(bench_started, usetz = TRUE)
        ),
        r_version = R.version.string,
        platform = R.version$platform,
        n = opt$n,
        runs_per_cohort = opt$runs_per_cohort,
        input_dir = input_dir,
        show_bench_iter = isTRUE(opt$chart_wall_iter)
      )
      writeLines(build_html(repo_root, results_partial, meta_partial), html_path, useBytes = TRUE)
      message(sprintf(
        "  → refreshed %s + %s (%d row%s); bench_iter_sec=%.3f",
        basename(csv_path), basename(html_path), nrow(results_partial),
        if (nrow(results_partial) == 1L) "" else "s",
        results_partial$bench_iter_sec[ni]
      ))
    }
  }

  results <- do.call(rbind, results_list)
  rownames(results) <- NULL
  results <- merge_preserve_cpd_none_from_csv(results, csv_path)
  results <- merge_preserve_nocpd_from_csv(results, csv_path)
  results <- merge_preserve_bench_iter_from_csv(results, csv_path)

  write.csv(results, csv_path, row.names = FALSE)
  message("\nFinal CSV: ", csv_path)

  meta <- list(
    when = format(Sys.time(), usetz = TRUE),
    source_note = NA_character_,
    r_version = R.version.string,
    platform = R.version$platform,
    n = opt$n,
    runs_per_cohort = opt$runs_per_cohort,
    input_dir = input_dir,
    show_bench_iter = isTRUE(opt$chart_wall_iter)
  )
  writeLines(build_html(repo_root, results, meta), html_path, useBytes = TRUE)
  message("Final HTML: ", html_path)

  invisible(list(csv = csv_path, html = html_path, results = results))
}

.help_text <- function() {
  paste(c(
    "benchmark-cohort-variance.R - cohort sweep timing + CSV + HTML chart",
    "",
    "Usage: Rscript tools/benchmark-cohort-variance.R [options]",
    "",
    "Options (also SHG_COHORT_BENCH_CSV / SHG_COHORT_BENCH_HTML):",
    "  --n=N                  individuals per cohort (default 1e6)",
    "  --step=YEAR            keep cohort columns where year %% step == 0 (default 5)",
    "  --runs=N               timed repeats per cohort, mean reported (default 3)",
    "  --subdir=NAME          under tests/testdata/NHIS-1965-2018/ (default csv-complete)",
    "  --input_root=PATH      override parent of subdir (folder containing csv-complete/)",
    "  --year_min= / --year_max=  optional cohort bounds",
    "  --immediate=N          immediate_cessation_year (default 0)",
    "  --compare_immediate=N    second sweep with this immediate year (or env SHG_COHORT_BENCH_COMPARE_IMMEDIATE_CESSATION)",
    "  --cpd_format=sparse|none|legacy|nocpd   (none=omit CPD column; nocpd=skip CPD calc in engine)",
    "  --with-nocpd=1         also time cpd_format=nocpd (3rd line: no CPD calc; use with cpd-none-sweep for 2nd line=none)",
    "  --rng_strategy=RngStream|MersenneTwister",
    "  --num_threads=-1       SHGInterface default",
    "  --number_of_segments=-1",
    "  --csv=PATH --html=PATH output paths",
    "                         (during a full sweep, CSV/HTML refresh after each cohort)",
    "",
    "Render HTML only from a prior CSV (no sims; does not load SmokingHistoryGenerator):",
    "  --from-csv=PATH        required columns: cohort_year, mean_sec;",
    "                           optional: min_sec, max_sec, sd_sec, N, runs_per_cohort, immediate_cessation_year",
    "  --render-input-dir=PATH  shown in HTML as input_data_folder when re-rendering",
    "",
    "  --cpd-none-sweep=1      re-time every cohort in existing CSV with cpd_format=none;",
    "                           fills mean_sec_none / sd_sec_none / min/max_none; keeps sparse line;",
    "                           refreshes CSV/HTML after each cohort (amber second line);",
    "                           records bench_iter_sec = wall-clock for sim + CSV + HTML each step",
    "  --nocpd-sweep=1         re-time every cohort with cpd_format=nocpd (skip CPD calc);",
    "                           fills mean_sec_nocpd / sd/min/max_nocpd; keeps sparse + none columns;",
    "  --nocpd-last-n=N        with nocpd-sweep only: last N cohort years (sorted); omit or 0 = all.",
    "                           After CSV exists, --from-csv re-renders HTML without re-running sims.",
    "  --chart-wall-iter=1     include slate bench_iter_sec line on chart (default off;",
    "                           csv column still stored for reference)",
    "",
    "SHG_BENCHMARK_USE_PKGLOAD=1  load package from source via pkgload::load_all(repo root).",
    "",
    "Charts load docs/vendor/highcharts.js (copied beside the HTML). Offline/file:// safe."
  ), collapse = "\n")
}

# Fix help: getSrcFilename fails in Rscript; use simple help
if ("--help" %in% commandArgs(trailingOnly = TRUE) || "-h" %in% commandArgs(trailingOnly = TRUE)) {
  writeLines(.help_text())
  quit(status = 0)
}

tryCatch(main(), error = function(e) {
  message(conditionMessage(e))
  quit(status = 1)
})

Try the SmokingHistoryGenerator package in your browser

Any scripts or data that you put into this service are public.

SmokingHistoryGenerator documentation built on June 13, 2026, 1:08 a.m.