tools/bootstrap-nhis-2018-testdata.R

#!/usr/bin/env Rscript
# Populate tests/testdata/NHIS-1965-2018 from:
#   - shg-params CSV release (smoking to 2018, mortality 2016 bundle)
#   - shg-params wide legacy .txt drop (NHIS 2018 smoking tables + 2016 mortality text)
#
# Usage (from package root):
#   Rscript tools/bootstrap-nhis-2018-testdata.R
# Optional args (positional):
#   1: path to usa-national@smok-2018-mort-2016 directory (CSV release root)
#   2: path to legacy ref folder (contains new_shg_*.txt and lbc_smokehist_* mortality)

args <- commandArgs(trailingOnly = TRUE)
repo_root <- normalizePath(".", winslash = "/", mustWork = TRUE)
default_release <- normalizePath(
  file.path(repo_root, "..", "shg-params", "releases", "usa-national@smok-2018-mort-2016"),
  winslash = "/", mustWork = FALSE
)
default_legacy_ref <- normalizePath(
  file.path(
    repo_root, "..", "shg-params", "tmp", "shg-data-raw-ref-files",
    "2025-04-09-SHG_NHIS2018_SendToJohn_Jihyoun_04092025"
  ),
  winslash = "/", mustWork = FALSE
)

release_dir <- if (length(args) >= 1) args[[1]] else default_release
legacy_ref <- if (length(args) >= 2) args[[2]] else default_legacy_ref

stopifnot(dir.exists(release_dir))
stopifnot(dir.exists(legacy_ref))

out_root <- file.path(repo_root, "tests", "testdata", "NHIS-1965-2018")
csv_complete <- file.path(out_root, "csv-complete")
csv_partial <- file.path(out_root, "csv-partial")
legacy_complete <- file.path(out_root, "legacy-complete")
legacy_partial <- file.path(out_root, "legacy-partial")

for (d in c(csv_complete, csv_partial, legacy_complete, legacy_partial)) {
  dir.create(d, recursive = TRUE, showWarnings = FALSE)
}

file.copy(
  file.path(release_dir, "smoking", "initiation.csv"),
  file.path(csv_complete, "initiation.csv"),
  overwrite = TRUE
)
file.copy(
  file.path(release_dir, "smoking", "cessation.csv"),
  file.path(csv_complete, "cessation.csv"),
  overwrite = TRUE
)
file.copy(
  file.path(release_dir, "smoking", "cpd.csv"),
  file.path(csv_complete, "cpd.csv"),
  overwrite = TRUE
)
file.copy(
  file.path(release_dir, "mortality", "acm.csv"),
  file.path(csv_complete, "acm.csv"),
  overwrite = TRUE
)
file.copy(
  file.path(release_dir, "mortality", "ocm-excl-lung-cancer.csv"),
  file.path(csv_complete, "ocm-excl-lung-cancer.csv"),
  overwrite = TRUE
)

file.copy(
  file.path(legacy_ref, "new_shg_initiation.txt"),
  file.path(legacy_complete, "initiation.txt"),
  overwrite = TRUE
)
file.copy(
  file.path(legacy_ref, "new_shg_cessation.txt"),
  file.path(legacy_complete, "cessation.txt"),
  overwrite = TRUE
)
file.copy(
  file.path(legacy_ref, "new_shg_cpd.txt"),
  file.path(legacy_complete, "cpd.txt"),
  overwrite = TRUE
)
file.copy(
  file.path(legacy_ref, "lbc_smokehist_ac_mortality_02212016.txt"),
  file.path(legacy_complete, "acm.txt"),
  overwrite = TRUE
)
file.copy(
  file.path(legacy_ref, "lbc_smokehist_oc_mortality_02212016.txt"),
  file.path(legacy_complete, "ocm-excl-lung-cancer.txt"),
  overwrite = TRUE
)

cohort_cols <- c("1950", "2010")

read_csv_flex <- function(path) {
  read.csv(path, stringsAsFactors = FALSE, check.names = FALSE)
}

trim_initiation_cessation <- function(df) {
  cn <- names(df)
  miss <- setdiff(cohort_cols, cn)
  if (length(miss)) {
    stop("Missing cohort columns in table: ", paste(miss, collapse = ", "))
  }
  df <- df[df$RACE == 0 & df$SEX == 0, , drop = FALSE]
  df[, c("RACE", "SEX", "AGE", cohort_cols)]
}

init_df <- read_csv_flex(file.path(csv_complete, "initiation.csv"))
write.csv(
  trim_initiation_cessation(init_df),
  file.path(csv_partial, "initiation.csv"),
  row.names = FALSE,
  quote = FALSE
)

cess_df <- read_csv_flex(file.path(csv_complete, "cessation.csv"))
write.csv(
  trim_initiation_cessation(cess_df),
  file.path(csv_partial, "cessation.csv"),
  row.names = FALSE,
  quote = FALSE
)

cpd_full <- read_csv_flex(file.path(csv_complete, "cpd.csv"))
cpd_partial <- cpd_full[
  cpd_full$RACE == 0 &
    cpd_full$SEX == 0 &
    cpd_full$START_YOB == cpd_full$END_YOB &
    as.character(cpd_full$START_YOB) %in% cohort_cols,
]
write.csv(
  cpd_partial,
  file.path(csv_partial, "cpd.csv"),
  row.names = FALSE,
  quote = FALSE
)

mort_partial <- function(fname) {
  m <- read_csv_flex(file.path(csv_complete, fname))
  m <- m[m$RACE == 0 & m$SEX == 0, , drop = FALSE]
  m[m$YOB %in% as.integer(cohort_cols), ]
}

write.csv(
  mort_partial("acm.csv"),
  file.path(csv_partial, "acm.csv"),
  row.names = FALSE,
  quote = FALSE
)
write.csv(
  mort_partial("ocm-excl-lung-cancer.csv"),
  file.path(csv_partial, "ocm-excl-lung-cancer.csv"),
  row.names = FALSE,
  quote = FALSE
)

write_legacy_partial_initcess <- function(csv_name, txt_name, title, n_preamble) {
  df <- read_csv_flex(file.path(csv_partial, csv_name))
  lines <- c(
    as.character(n_preamble),
    paste0("* ", title),
    "* race=0 sex=0; cohort columns 1950, 2010",
    "1,1,2,0,99",
    "Race,Sex,Age,1950-1950,2010-2010"
  )
  for (i in seq_len(nrow(df))) {
    lines <- c(lines, paste(
      df$RACE[i], df$SEX[i], df$AGE[i],
      df[["1950"]][i], df[["2010"]][i],
      sep = ","
    ))
  }
  writeLines(lines, file.path(legacy_partial, txt_name))
}

write_legacy_partial_initcess(
  "initiation.csv", "initiation.txt",
  "SHG initiation (trimmed NHIS partial, test fixture)", 4
)
write_legacy_partial_initcess(
  "cessation.csv", "cessation.txt",
  "SHG cessation (trimmed NHIS partial, test fixture)", 4
)

mort_to_legacy_txt <- function(csv_name, txt_name, comment) {
  df <- read_csv_flex(file.path(csv_partial, csv_name))
  lines <- c(
    "3",
    paste0("* ", comment),
    "1,1,1950,2010,0,99"
  )
  for (i in seq_len(nrow(df))) {
    lines <- c(lines, paste(
      c(
        df$RACE[i], df$SEX[i], df$YOB[i], df$AGE[i],
        as.numeric(df[i, grep("^NS$|^CS_", names(df))])
      ),
      collapse = ","
    ))
  }
  writeLines(lines, file.path(legacy_partial, txt_name))
}

mort_to_legacy_txt("acm.csv", "acm.txt", "legacy mortality fixture from acm.csv")
mort_to_legacy_txt(
  "ocm-excl-lung-cancer.csv",
  "ocm-excl-lung-cancer.txt",
  "legacy mortality fixture from ocm-excl-lung-cancer.csv"
)

write_legacy_partial_cpd <- function() {
  df <- read_csv_flex(file.path(csv_partial, "cpd.csv"))
  df <- df[
    df$START_YOB == df$END_YOB &
      as.character(df$START_YOB) %in% cohort_cols,
  ]
  cat_cols <- grep("^CAT", names(df), value = TRUE)
  if (length(cat_cols) == 0) {
    stop("cpd.csv: expected CAT* columns")
  }
  lines <- c(
    "5",
    "* SHG CPD (trimmed NHIS partial, test fixture)",
    "* race=0 sex=0; cohorts 1950, 2010",
    "* 1=1-5, 2=6-15, 3=16-25, 4=26-35, 5=36-45, 6=46+ (CAT1-CAT6)",
    paste("1,1,2,0,99", length(cat_cols), sep = ",")
  )
  for (i in seq_len(nrow(df))) {
    lines <- c(lines, paste(
      c(
        df$RACE[i], df$SEX[i], df$START_YOB[i], df$END_YOB[i], df$AGE[i],
        as.numeric(df[i, cat_cols])
      ),
      collapse = ","
    ))
  }
  writeLines(lines, file.path(legacy_partial, "cpd.txt"))
}
write_legacy_partial_cpd()

message("Wrote ", out_root)

Try the SmokingHistoryGenerator package in your browser

Any scripts or data that you put into this service are public.

SmokingHistoryGenerator documentation built on June 13, 2026, 1:08 a.m.