# tools/check-initiation-csv-read.R

#!/usr/bin/env Rscript
# Read an NHIS-style initiation.csv and report how read.csv() treats cohort columns.
#
# - Default read.csv(..., header = TRUE) uses check.names = TRUE, so bare year
#   headers like 1950 become X1950 (syntactic names).
# - With check.names = FALSE, names stay as in the file (e.g. "1950").
#
# Usage:
#   Rscript tools/check-initiation-csv-read.R path/to/initiation.csv
#
# With no arguments, uses bundled test CSV if your working directory is the
# shg-r package root (also works when `source()`-ing this file from that root).
#
# Example (from shg-r root):
#   Rscript tools/check-initiation-csv-read.R tests/testdata/NHIS-1965-2018/csv-complete/initiation.csv

# Resolve which CSV to inspect: the first trailing command-line argument when
# one is given, otherwise the bundled test CSV under the working directory.
trailing <- commandArgs(trailingOnly = TRUE)
if (length(trailing) > 0L) {
  path <- trailing[[1L]]
} else {
  rel <- "tests/testdata/NHIS-1965-2018/csv-complete/initiation.csv"
  path <- file.path(getwd(), rel)
  if (file.exists(path)) {
    message("No path supplied; using default: ", path)
  } else {
    # No argument and no bundled file: explain both ways to run the script.
    stop(
      paste0(
        "No path argument and default not found.\n",
        "  Default tried: ", path, "\n",
        "  Usage: Rscript tools/check-initiation-csv-read.R path/to/initiation.csv\n",
        "  Or setwd() to the shg-r package root, then source() or re-run without args."
      ),
      call. = FALSE
    )
  }
}

# A user-supplied path has not been checked yet, so verify it here.
if (!file.exists(path)) {
  stop("File not found: ", path, call. = FALSE)
}

# Read only the first line of the file (the header row).
# readLines() returns a zero-length vector for a 0-byte file, so the length is
# tested before nzchar(): `if (!nzchar(character(0)))` would abort with
# "argument is of length zero" instead of the intended "Empty file" error.
raw_lines <- readLines(path, n = 1L, warn = FALSE)
if (length(raw_lines) == 0L || !nzchar(raw_lines[[1L]])) {
  stop("Empty file: ", path, call. = FALSE)
}

# First row = header; split like a naive CSV (no embedded commas in field names for SHG tables).
raw_fields <- strsplit(raw_lines[[1L]], ",", fixed = TRUE)[[1L]]
# Strip leading and trailing whitespace from every header field.
raw_fields <- gsub("^\\s+|\\s+$", "", raw_fields)
# A field counts as quoted when it both starts and ends with a double quote.
quoted_name <- grepl("^\".*\"$", raw_fields)
if (any(quoted_name)) {
  message("Some header fields appear quoted in the first line (count): ", sum(quoted_name))
} else {
  message("First-line header fields: no double-quote wrappers detected (typical SHG CSV).")
}

# Read the file twice, differing only in check.names, and keep the resulting
# column names of each read for comparison.

# Pass 1: check.names = TRUE (the read.csv default), which makes names syntactic.
d_default <- read.csv(
  path,
  header = TRUE,
  stringsAsFactors = FALSE,
  check.names = TRUE
)
n_default <- colnames(d_default)

# Pass 2: check.names = FALSE, so header names are kept as written in the file.
d_rawnames <- read.csv(
  path,
  header = TRUE,
  stringsAsFactors = FALSE,
  check.names = FALSE
)
n_raw <- colnames(d_rawnames)

# Cohort columns: year-like tokens (4 digits, not RACE/SEX/AGE)
# Vectorised test: TRUE for each element of `nm` that consists of exactly four
# digits (e.g. "1950"), FALSE otherwise.
is_year_token <- function(nm) {
  year_pattern <- "^[0-9]{4}$"
  grepl(pattern = year_pattern, x = nm)
}
# Vectorised test: TRUE for each element of `nm` that is an "X" followed by
# exactly four digits (e.g. "X1950"), FALSE otherwise.
is_x_year <- function(nm) {
  x_year_pattern <- "^X[0-9]{4}$"
  grepl(pattern = x_year_pattern, x = nm)
}

# Pick out the cohort-year names from each of the three views of the header:
# the raw first line, the default read, and the check.names = FALSE read.
raw_cohort <- raw_fields[is_year_token(raw_fields)]
default_cohort <- n_default[is_x_year(n_default)]
rawnames_cohort <- n_raw[is_year_token(n_raw)]

# Report the table dimensions and how many cohort columns each view contains,
# with up to five example names where applicable.
message("\n=== Summary ===")
dims_default <- dim(d_default)
message("Rows read: ", dims_default[[1L]], "  Columns: ", dims_default[[2L]])
message(
  "Cohort-year columns in raw header line (bare digits): ",
  length(raw_cohort),
  " (e.g. ", toString(head(raw_cohort, 5L)), ", ...)"
)
message(
  "After read.csv(check.names = TRUE): ",
  length(default_cohort),
  " names like XYYYY (R adds the X prefix for syntactic names)."
)
message(
  "After read.csv(check.names = FALSE): ",
  length(rawnames_cohort),
  " names still bare digits (e.g. ",
  toString(head(rawnames_cohort, 5L)), ", ...)."
)

# Show one concrete mapping, but only when the first raw year header really
# appears with an X prefix among the default column names.
if (length(raw_cohort) > 0L && length(default_cohort) > 0L) {
  y0 <- raw_cohort[[1L]]
  x0 <- paste0("X", y0)
  if (x0 %in% n_default) {
    message("\nExample: header '", y0, "' -> column name '", x0, "' with default read.csv().")
  }
}

invisible(TRUE)

# Try the SmokingHistoryGenerator package in your browser

# Any scripts or data that you put into this service are public.

# SmokingHistoryGenerator documentation built on June 13, 2026, 1:08 a.m.