Nothing
#!/usr/bin/env Rscript
# Populate tests/testdata/NHIS-1965-2018 from:
# - shg-params CSV release (smoking to 2018, mortality 2016 bundle)
# - shg-params wide legacy .txt drop (NHIS 2018 smoking tables + 2016 mortality text)
#
# Usage (from package root):
# Rscript tools/bootstrap-nhis-2018-testdata.R
# Optional args (positional):
# 1: path to usa-national@smok-2018-mort-2016 directory (CSV release root)
# 2: path to legacy ref folder (contains new_shg_*.txt and lbc_smokehist_* mortality)
args <- commandArgs(trailingOnly = TRUE)
repo_root <- normalizePath(".", winslash = "/", mustWork = TRUE)
default_release <- normalizePath(
file.path(repo_root, "..", "shg-params", "releases", "usa-national@smok-2018-mort-2016"),
winslash = "/", mustWork = FALSE
)
default_legacy_ref <- normalizePath(
file.path(
repo_root, "..", "shg-params", "tmp", "shg-data-raw-ref-files",
"2025-04-09-SHG_NHIS2018_SendToJohn_Jihyoun_04092025"
),
winslash = "/", mustWork = FALSE
)
release_dir <- if (length(args) >= 1) args[[1]] else default_release
legacy_ref <- if (length(args) >= 2) args[[2]] else default_legacy_ref
stopifnot(dir.exists(release_dir))
stopifnot(dir.exists(legacy_ref))
out_root <- file.path(repo_root, "tests", "testdata", "NHIS-1965-2018")
csv_complete <- file.path(out_root, "csv-complete")
csv_partial <- file.path(out_root, "csv-partial")
legacy_complete <- file.path(out_root, "legacy-complete")
legacy_partial <- file.path(out_root, "legacy-partial")
for (d in c(csv_complete, csv_partial, legacy_complete, legacy_partial)) {
dir.create(d, recursive = TRUE, showWarnings = FALSE)
}
file.copy(
file.path(release_dir, "smoking", "initiation.csv"),
file.path(csv_complete, "initiation.csv"),
overwrite = TRUE
)
file.copy(
file.path(release_dir, "smoking", "cessation.csv"),
file.path(csv_complete, "cessation.csv"),
overwrite = TRUE
)
file.copy(
file.path(release_dir, "smoking", "cpd.csv"),
file.path(csv_complete, "cpd.csv"),
overwrite = TRUE
)
file.copy(
file.path(release_dir, "mortality", "acm.csv"),
file.path(csv_complete, "acm.csv"),
overwrite = TRUE
)
file.copy(
file.path(release_dir, "mortality", "ocm-excl-lung-cancer.csv"),
file.path(csv_complete, "ocm-excl-lung-cancer.csv"),
overwrite = TRUE
)
file.copy(
file.path(legacy_ref, "new_shg_initiation.txt"),
file.path(legacy_complete, "initiation.txt"),
overwrite = TRUE
)
file.copy(
file.path(legacy_ref, "new_shg_cessation.txt"),
file.path(legacy_complete, "cessation.txt"),
overwrite = TRUE
)
file.copy(
file.path(legacy_ref, "new_shg_cpd.txt"),
file.path(legacy_complete, "cpd.txt"),
overwrite = TRUE
)
file.copy(
file.path(legacy_ref, "lbc_smokehist_ac_mortality_02212016.txt"),
file.path(legacy_complete, "acm.txt"),
overwrite = TRUE
)
file.copy(
file.path(legacy_ref, "lbc_smokehist_oc_mortality_02212016.txt"),
file.path(legacy_complete, "ocm-excl-lung-cancer.txt"),
overwrite = TRUE
)
cohort_cols <- c("1950", "2010")
read_csv_flex <- function(path) {
read.csv(path, stringsAsFactors = FALSE, check.names = FALSE)
}
trim_initiation_cessation <- function(df) {
cn <- names(df)
miss <- setdiff(cohort_cols, cn)
if (length(miss)) {
stop("Missing cohort columns in table: ", paste(miss, collapse = ", "))
}
df <- df[df$RACE == 0 & df$SEX == 0, , drop = FALSE]
df[, c("RACE", "SEX", "AGE", cohort_cols)]
}
init_df <- read_csv_flex(file.path(csv_complete, "initiation.csv"))
write.csv(
trim_initiation_cessation(init_df),
file.path(csv_partial, "initiation.csv"),
row.names = FALSE,
quote = FALSE
)
cess_df <- read_csv_flex(file.path(csv_complete, "cessation.csv"))
write.csv(
trim_initiation_cessation(cess_df),
file.path(csv_partial, "cessation.csv"),
row.names = FALSE,
quote = FALSE
)
cpd_full <- read_csv_flex(file.path(csv_complete, "cpd.csv"))
cpd_partial <- cpd_full[
cpd_full$RACE == 0 &
cpd_full$SEX == 0 &
cpd_full$START_YOB == cpd_full$END_YOB &
as.character(cpd_full$START_YOB) %in% cohort_cols,
]
write.csv(
cpd_partial,
file.path(csv_partial, "cpd.csv"),
row.names = FALSE,
quote = FALSE
)
mort_partial <- function(fname) {
m <- read_csv_flex(file.path(csv_complete, fname))
m <- m[m$RACE == 0 & m$SEX == 0, , drop = FALSE]
m[m$YOB %in% as.integer(cohort_cols), ]
}
write.csv(
mort_partial("acm.csv"),
file.path(csv_partial, "acm.csv"),
row.names = FALSE,
quote = FALSE
)
write.csv(
mort_partial("ocm-excl-lung-cancer.csv"),
file.path(csv_partial, "ocm-excl-lung-cancer.csv"),
row.names = FALSE,
quote = FALSE
)
write_legacy_partial_initcess <- function(csv_name, txt_name, title, n_preamble) {
df <- read_csv_flex(file.path(csv_partial, csv_name))
lines <- c(
as.character(n_preamble),
paste0("* ", title),
"* race=0 sex=0; cohort columns 1950, 2010",
"1,1,2,0,99",
"Race,Sex,Age,1950-1950,2010-2010"
)
for (i in seq_len(nrow(df))) {
lines <- c(lines, paste(
df$RACE[i], df$SEX[i], df$AGE[i],
df[["1950"]][i], df[["2010"]][i],
sep = ","
))
}
writeLines(lines, file.path(legacy_partial, txt_name))
}
write_legacy_partial_initcess(
"initiation.csv", "initiation.txt",
"SHG initiation (trimmed NHIS partial, test fixture)", 4
)
write_legacy_partial_initcess(
"cessation.csv", "cessation.txt",
"SHG cessation (trimmed NHIS partial, test fixture)", 4
)
mort_to_legacy_txt <- function(csv_name, txt_name, comment) {
df <- read_csv_flex(file.path(csv_partial, csv_name))
lines <- c(
"3",
paste0("* ", comment),
"1,1,1950,2010,0,99"
)
for (i in seq_len(nrow(df))) {
lines <- c(lines, paste(
c(
df$RACE[i], df$SEX[i], df$YOB[i], df$AGE[i],
as.numeric(df[i, grep("^NS$|^CS_", names(df))])
),
collapse = ","
))
}
writeLines(lines, file.path(legacy_partial, txt_name))
}
mort_to_legacy_txt("acm.csv", "acm.txt", "legacy mortality fixture from acm.csv")
mort_to_legacy_txt(
"ocm-excl-lung-cancer.csv",
"ocm-excl-lung-cancer.txt",
"legacy mortality fixture from ocm-excl-lung-cancer.csv"
)
write_legacy_partial_cpd <- function() {
df <- read_csv_flex(file.path(csv_partial, "cpd.csv"))
df <- df[
df$START_YOB == df$END_YOB &
as.character(df$START_YOB) %in% cohort_cols,
]
cat_cols <- grep("^CAT", names(df), value = TRUE)
if (length(cat_cols) == 0) {
stop("cpd.csv: expected CAT* columns")
}
lines <- c(
"5",
"* SHG CPD (trimmed NHIS partial, test fixture)",
"* race=0 sex=0; cohorts 1950, 2010",
"* 1=1-5, 2=6-15, 3=16-25, 4=26-35, 5=36-45, 6=46+ (CAT1-CAT6)",
paste("1,1,2,0,99", length(cat_cols), sep = ",")
)
for (i in seq_len(nrow(df))) {
lines <- c(lines, paste(
c(
df$RACE[i], df$SEX[i], df$START_YOB[i], df$END_YOB[i], df$AGE[i],
as.numeric(df[i, cat_cols])
),
collapse = ","
))
}
writeLines(lines, file.path(legacy_partial, "cpd.txt"))
}
write_legacy_partial_cpd()
message("Wrote ", out_root)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.