Nothing
#!/usr/bin/env Rscript
# Summarize sparse CPD trajectories for one cohort: change frequency, runs, first==last vs constant.
# Usage (repo root): SHG_BENCHMARK_USE_PKGLOAD=1 Rscript tools/analyze-cpd-trajectory-structure.R
# [--n=50000] [--cohort-year=1950] [--subdir=csv-complete]
#
# Does not modify simulation code — exploratory statistics only.
repo_root <- normalizePath(file.path(dirname(normalizePath(sub("^--file=", "", grep("^--file=", commandArgs(trailingOnly = FALSE), value = TRUE)[1]))), ".."), winslash = "/")
parse_sparse_cpd <- function(s) {
if (length(s) != 1L || is.na(s) || !nzchar(trimws(s))) return(integer())
parts <- strsplit(gsub(",", " , ", s, fixed = TRUE), ",", fixed = FALSE)[[1L]]
parts <- trimws(parts[nzchar(trimws(parts))])
suppressWarnings(as.integer(parts))
}
argv <- commandArgs(trailingOnly = TRUE)
n_ind <- 50000L
cy <- 1950L
subdir <- "csv-complete"
for (a in argv) {
if (grepl("^--n=", a)) n_ind <- as.integer(sub("^--n=", "", a))
if (grepl("^--cohort-year=", a)) cy <- as.integer(sub("^--cohort-year=", "", a))
if (grepl("^--subdir=", a)) subdir <- sub("^--subdir=", "", a)
}
if (nzchar(Sys.getenv("SHG_BENCHMARK_USE_PKGLOAD", ""))) {
if (!requireNamespace("pkgload", quietly = TRUE)) stop("Install pkgload", call. = FALSE)
pkgload::load_all(repo_root, quiet = TRUE)
} else {
suppressPackageStartupMessages(library(SmokingHistoryGenerator))
}
input_dir <- normalizePath(
file.path(repo_root, "tests/testdata/NHIS-1965-2018", subdir),
winslash = "/",
mustWork = TRUE
)
shg <- new(SHGInterface)
shg$input_data_folder <- input_dir
shg$initiation_filename <- "initiation.csv"
shg$cessation_filename <- "cessation.csv"
shg$cpd_filename <- "cpd.csv"
shg$mortality_filename <- "acm.csv"
shg$cpd_format <- "sparse"
shg$rng_strategy <- "RngStream"
shg$rngstream_seed <- rep(12345L, 6L)
shg$num_threads <- 1L
shg$number_of_segments <- 1L
shg$immediate_cessation_year <- 0L
message(
"Running N=", n_ind, " race=0 sex=0 cohort_year=", cy,
"\ninput_data_folder=", input_dir
)
tm <- system.time({
df <- shg$runSimFromFixedValues(as.integer(n_ind), 0L, 0L, as.integer(cy))
})
stopifnot("cigarettes_per_day" %in% names(df))
sm <- df$smoking_initiation_age != -999L
idx <- which(sm)
message(sprintf("Smokers: %d / %d (%.1f%%)", length(idx), nrow(df), 100 * length(idx) / nrow(df)))
message(sprintf("Wall time: %.2f s\n", unname(tm["elapsed"])))
cpd_stats <- function(v) {
L <- length(v)
if (!L) {
return(list(
L = 0L, n_change = NA_integer_, n_runs = NA_integer_,
constant = NA, first_last_same = NA,
changes_per_year = NA_real_
))
}
d <- diff(v)
n_change <- sum(d != 0L, na.rm = TRUE)
n_runs <- length(rle(v)$lengths)
list(
L = L,
n_change = as.integer(n_change),
n_runs = as.integer(n_runs),
constant = n_change == 0L,
first_last_same = v[1L] == v[L],
changes_per_year = if (L > 1L) n_change / (L - 1L) else 0
)
}
rows <- lapply(df$cigarettes_per_day[idx], parse_sparse_cpd)
stats <- lapply(rows, cpd_stats)
L <- vapply(stats, function(s) s$L, integer(1))
n_change <- vapply(stats, function(s) s$n_change, integer(1))
n_runs <- vapply(stats, function(s) s$n_runs, integer(1))
constant <- vapply(stats, function(s) isTRUE(s$constant), logical(1))
first_last_same <- vapply(stats, function(s) isTRUE(s$first_last_same), logical(1))
cpe <- vapply(stats, function(s) if (is.finite(s$changes_per_year)) s$changes_per_year else NA_real_, numeric(1))
ok <- L > 0L
message("\n=== Among smokers with non-empty CPD vector ===")
message(sprintf("Count: %d", sum(ok)))
message(sprintf(
"Smoking-years length L: median=%.0f mean=%.1f max=%d",
stats::median(L[ok]),
mean(L[ok]),
max(L[ok])
))
message(sprintf(
"\nCPD value changes (adjacent ages): median=%d mean=%.2f max=%d",
stats::median(n_change[ok]),
mean(n_change[ok]),
max(n_change[ok])
))
message(sprintf(
"Contiguous constant runs (rle): median=%d mean=%.2f",
stats::median(n_runs[ok]),
mean(n_runs[ok])
))
message(sprintf(
"\nShare with ZERO changes (flat CPD over all smoking years): %.2f%%",
100 * mean(constant[ok])
))
message(sprintf(
"Share with first CPD == last CPD: %.2f%%",
100 * mean(first_last_same[ok])
))
fl_not_const <- ok & first_last_same & !constant
message(sprintf(
"Share first==last but NOT constant (middle differs): %.2f%% — endpoint equality does not imply flat middle",
100 * mean(fl_not_const[ok])
))
message(sprintf(
"\nChanges per year-of-smoking (among L>=2): median=%.4f mean=%.4f",
stats::median(cpe[ok & L >= 2], na.rm = TRUE),
mean(cpe[ok & L >= 2], na.rm = TRUE)
))
L10 <- ok & L >= 10L
message(sprintf(
"\nAmong smokers with L>=10: n=%d | changes<=2: %.1f%% | changes<=5: %.1f%%",
sum(L10),
100 * mean(n_change[L10] <= 2L),
100 * mean(n_change[L10] <= 5L)
))
invisible(NULL)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.