tools/analyze-cpd-trajectory-structure.R

#!/usr/bin/env Rscript
# Summarize sparse CPD trajectories for one cohort: change frequency, runs, first==last vs constant.
# Usage (repo root): SHG_BENCHMARK_USE_PKGLOAD=1 Rscript tools/analyze-cpd-trajectory-structure.R
#   [--n=50000] [--cohort-year=1950] [--subdir=csv-complete]
#
# Does not modify simulation code — exploratory statistics only.

repo_root <- normalizePath(file.path(dirname(normalizePath(sub("^--file=", "", grep("^--file=", commandArgs(trailingOnly = FALSE), value = TRUE)[1]))), ".."), winslash = "/")

parse_sparse_cpd <- function(s) {
  if (length(s) != 1L || is.na(s) || !nzchar(trimws(s))) return(integer())
  parts <- strsplit(gsub(",", " , ", s, fixed = TRUE), ",", fixed = FALSE)[[1L]]
  parts <- trimws(parts[nzchar(trimws(parts))])
  suppressWarnings(as.integer(parts))
}

argv <- commandArgs(trailingOnly = TRUE)
n_ind <- 50000L
cy <- 1950L
subdir <- "csv-complete"
for (a in argv) {
  if (grepl("^--n=", a)) n_ind <- as.integer(sub("^--n=", "", a))
  if (grepl("^--cohort-year=", a)) cy <- as.integer(sub("^--cohort-year=", "", a))
  if (grepl("^--subdir=", a)) subdir <- sub("^--subdir=", "", a)
}

if (nzchar(Sys.getenv("SHG_BENCHMARK_USE_PKGLOAD", ""))) {
  if (!requireNamespace("pkgload", quietly = TRUE)) stop("Install pkgload", call. = FALSE)
  pkgload::load_all(repo_root, quiet = TRUE)
} else {
  suppressPackageStartupMessages(library(SmokingHistoryGenerator))
}

input_dir <- normalizePath(
  file.path(repo_root, "tests/testdata/NHIS-1965-2018", subdir),
  winslash = "/",
  mustWork = TRUE
)

shg <- new(SHGInterface)
shg$input_data_folder <- input_dir
shg$initiation_filename <- "initiation.csv"
shg$cessation_filename <- "cessation.csv"
shg$cpd_filename <- "cpd.csv"
shg$mortality_filename <- "acm.csv"
shg$cpd_format <- "sparse"
shg$rng_strategy <- "RngStream"
shg$rngstream_seed <- rep(12345L, 6L)
shg$num_threads <- 1L
shg$number_of_segments <- 1L
shg$immediate_cessation_year <- 0L

message(
  "Running N=", n_ind, " race=0 sex=0 cohort_year=", cy,
  "\ninput_data_folder=", input_dir
)
tm <- system.time({
  df <- shg$runSimFromFixedValues(as.integer(n_ind), 0L, 0L, as.integer(cy))
})

stopifnot("cigarettes_per_day" %in% names(df))

sm <- df$smoking_initiation_age != -999L
idx <- which(sm)
message(sprintf("Smokers: %d / %d (%.1f%%)", length(idx), nrow(df), 100 * length(idx) / nrow(df)))
message(sprintf("Wall time: %.2f s\n", unname(tm["elapsed"])))

cpd_stats <- function(v) {
  L <- length(v)
  if (!L) {
    return(list(
      L = 0L, n_change = NA_integer_, n_runs = NA_integer_,
      constant = NA, first_last_same = NA,
      changes_per_year = NA_real_
    ))
  }
  d <- diff(v)
  n_change <- sum(d != 0L, na.rm = TRUE)
  n_runs <- length(rle(v)$lengths)
  list(
    L = L,
    n_change = as.integer(n_change),
    n_runs = as.integer(n_runs),
    constant = n_change == 0L,
    first_last_same = v[1L] == v[L],
    changes_per_year = if (L > 1L) n_change / (L - 1L) else 0
  )
}

rows <- lapply(df$cigarettes_per_day[idx], parse_sparse_cpd)
stats <- lapply(rows, cpd_stats)
L <- vapply(stats, function(s) s$L, integer(1))
n_change <- vapply(stats, function(s) s$n_change, integer(1))
n_runs <- vapply(stats, function(s) s$n_runs, integer(1))
constant <- vapply(stats, function(s) isTRUE(s$constant), logical(1))
first_last_same <- vapply(stats, function(s) isTRUE(s$first_last_same), logical(1))
 cpe <- vapply(stats, function(s) if (is.finite(s$changes_per_year)) s$changes_per_year else NA_real_, numeric(1))

ok <- L > 0L
message("\n=== Among smokers with non-empty CPD vector ===")
message(sprintf("Count: %d", sum(ok)))
message(sprintf(
  "Smoking-years length L:  median=%.0f  mean=%.1f  max=%d",
  stats::median(L[ok]),
  mean(L[ok]),
  max(L[ok])
))

message(sprintf(
  "\nCPD value changes (adjacent ages):  median=%d  mean=%.2f  max=%d",
  stats::median(n_change[ok]),
  mean(n_change[ok]),
  max(n_change[ok])
))
message(sprintf(
  "Contiguous constant runs (rle):    median=%d  mean=%.2f",
  stats::median(n_runs[ok]),
  mean(n_runs[ok])
))

message(sprintf(
  "\nShare with ZERO changes (flat CPD over all smoking years): %.2f%%",
  100 * mean(constant[ok])
))
message(sprintf(
  "Share with first CPD == last CPD:                          %.2f%%",
  100 * mean(first_last_same[ok])
))
fl_not_const <- ok & first_last_same & !constant
message(sprintf(
  "Share first==last but NOT constant (middle differs):      %.2f%% — endpoint equality does not imply flat middle",
  100 * mean(fl_not_const[ok])
))

message(sprintf(
  "\nChanges per year-of-smoking (among L>=2):  median=%.4f  mean=%.4f",
  stats::median(cpe[ok & L >= 2], na.rm = TRUE),
  mean(cpe[ok & L >= 2], na.rm = TRUE)
))

L10 <- ok & L >= 10L
message(sprintf(
  "\nAmong smokers with L>=10:  n=%d  |  changes<=2: %.1f%%  |  changes<=5: %.1f%%",
  sum(L10),
  100 * mean(n_change[L10] <= 2L),
  100 * mean(n_change[L10] <= 5L)
))

invisible(NULL)

Try the SmokingHistoryGenerator package in your browser

Any scripts or data that you put into this service are public.

SmokingHistoryGenerator documentation built on June 13, 2026, 1:08 a.m.