knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) library(ksformat)
The ksformat package provides SAS PROC FORMAT-like functionality for R. This vignette walks through the most common use cases.
Create a format for gender codes (auto-stored in library as "sex"):
fnew( "M" = "Male", "F" = "Female", .missing = "Unknown", .other = "Other Gender", name = "sex" ) gender_codes <- c("M", "F", "M", NA, "X", "F") formatted_genders <- fput(gender_codes, "sex") data.frame( code = gender_codes, label = formatted_genders ) fprint("sex")
Define formats in SAS-like text (auto-registered):
fparse(text = ' VALUE age (numeric) [0, 18) = "Child" [18, 65) = "Adult" [65, HIGH] = "Senior" .missing = "Age Unknown" ; ') ages <- c(5, 15.3, 17.9, 18, 45, 64.99, 65, 85, NA) age_groups <- fputn(ages, "age") data.frame( age = ages, group = age_groups )
fparse(text = ' VALUE bmi (numeric) [0, 18.5) = "Underweight" [18.5, 25) = "Normal" [25, 30) = "Overweight" [30, HIGH] = "Obese" .missing = "No data" ; ') bmi_values <- c(16.2, 18.5, 22.7, 25, 29.9, 35.1, NA) bmi_labels <- fputn(bmi_values, "bmi") data.frame( bmi = bmi_values, category = bmi_labels )
fparse(text = ' VALUE score (numeric) (0, 50] = "Low" (50, 100] = "High" .other = "Out of range" ; ') scores <- c(0, 1, 50, 51, 100, 101) score_labels <- fputn(scores, "score") data.frame( score = scores, label = score_labels )
Invalues convert labels back to values. The default target_type is "numeric":
finput( "Male" = 1, "Female" = 2, name = "sex_inv" ) labels <- c("Male", "Female", "Male", "Unknown", "Female") codes <- finputn(labels, "sex_inv") data.frame( label = labels, code = codes )
fnew_bid() creates both a format and an invalue at once:
status_bi <- fnew_bid( "A" = "Active", "I" = "Inactive", "P" = "Pending", name = "status" ) # Forward: code -> label status_codes <- c("A", "I", "P", "A") status_labels <- fputc(status_codes, "status") data.frame(code = status_codes, label = status_labels) # Reverse: label -> code test_labels <- c("Active", "Pending", "Inactive") test_codes <- finputc(test_labels, "status_inv") data.frame(label = test_labels, code = test_codes)
fparse(text = ' // Study format definitions VALUE race (character) "W" = "White" "B" = "Black" "A" = "Asian" .missing = "Unknown" ; INVALUE race_inv "White" = 1 "Black" = 2 "Asian" = 3 ; ') flist() # character vector of names fprint()
bmi_fmt <- format_get("bmi") cat(fexport(bmi = bmi_fmt))
# fputn — apply numeric format by name fputn(c(5, 30, 70), "age") # fputc — apply character format by name fputc(c("M", "F"), "sex") # finputn — apply numeric invalue by name finputn(c("White", "Black"), "race_inv")
df <- data.frame( id = 1:6, sex = c("M", "F", "M", "F", NA, "X"), age = c(15, 25, 45, 70, 35, NA), stringsAsFactors = FALSE ) sex_f <- format_get("sex") age_f <- format_get("age") df_formatted <- fput_df( df, sex = sex_f, age = age_f, suffix = "_label" ) df_formatted
# With .missing label fput(c("M", "F", NA), "sex") # With keep_na = TRUE fput(c("M", "F", NA), sex_f, keep_na = TRUE) # is_missing() checks is_missing(NA) is_missing(NaN) is_missing("") # TRUE — empty strings are treated as missing
SAS date format names are auto-resolved — no pre-creation needed:
today <- Sys.Date() data.frame( format = c("DATE9.", "MMDDYY10.", "DDMMYY10.", "YYMMDD10.", "MONYY7.", "WORDDATE.", "YEAR4.", "QTR."), result = c( fputn(today, "DATE9."), fputn(today, "MMDDYY10."), fputn(today, "DDMMYY10."), fputn(today, "YYMMDD10."), fputn(today, "MONYY7."), fputn(today, "WORDDATE."), fputn(today, "YEAR4."), fputn(today, "QTR.") ) ) # Multiple dates dates <- as.Date(c("2020-01-15", "2020-06-30", "2020-12-25")) fputn(dates, "DATE9.")
r_days <- as.numeric(as.Date("2025-01-01")) r_days fputn(r_days, "DATE9.") fputn(r_days, "MMDDYY10.")
Time is represented as seconds since midnight:
seconds <- c(0, 3600, 45000, 86399) data.frame( seconds = seconds, TIME8 = fputn(seconds, "TIME8."), TIME5 = fputn(seconds, "TIME5."), HHMM = fputn(seconds, "HHMM.") )
now <- Sys.time() data.frame( format = c("DATETIME20.", "DATETIME13.", "DTDATE.", "DTYYMMDD."), result = c( fputn(now, "DATETIME20."), fputn(now, "DATETIME13."), fputn(now, "DTDATE."), fputn(now, "DTYYMMDD.") ) ) # From numeric R-epoch seconds r_secs <- as.numeric(as.POSIXct("2025-06-15 14:30:00", tz = "UTC")) fputn(r_secs, "DATETIME20.")
fnew_date()# SAS-named format fnew_date("DATE9.", name = "bday_fmt") birthdays <- as.Date(c("1990-03-25", "1985-11-03", "2000-07-14")) fput(birthdays, "bday_fmt") # Custom strftime pattern (e.g. DD.MM.YYYY) fnew_date("%d.%m.%Y", name = "ru_date", type = "date") fput(birthdays, "ru_date") # Custom pattern with missing label fnew_date("MMDDYY10.", name = "us_date", .missing = "NO DATE") mixed <- c(as.Date("2025-01-01"), NA, as.Date("2025-12-31")) fput(mixed, "us_date") fprint("bday_fmt")
patients <- data.frame( id = 1:4, visit_date = as.Date(c("2025-01-10", "2025-02-15", "2025-03-20", NA)), stringsAsFactors = FALSE ) visit_fmt <- fnew_date("DATE9.", name = "visit_fmt", .missing = "NOT RECORDED") fput_df(patients, visit_date = visit_fmt)
fparse(text = ' VALUE enrldt (date) pattern = "DATE9." .missing = "Not Enrolled" ; VALUE visit_time (time) pattern = "TIME8." ; VALUE stamp (datetime) pattern = "DATETIME20." ; ') fput(as.Date("2025-03-01"), "enrldt") fput(36000, "visit_time") fput(as.POSIXct("2025-03-01 10:00:00", tz = "UTC"), "stamp") # Export back to text enrl_obj <- format_get("enrldt") cat(fexport(enrldt = enrl_obj)) fclear()
With multilabel formats, a single value can match multiple labels:
fnew( "0,5,TRUE,TRUE" = "Infant", "6,11,TRUE,TRUE" = "Child", "12,17,TRUE,TRUE" = "Adolescent", "0,17,TRUE,TRUE" = "Pediatric", "18,64,TRUE,TRUE" = "Adult", "65,Inf,TRUE,TRUE" = "Elderly", "18,Inf,TRUE,TRUE" = "Non-Pediatric", name = "age_categories", type = "numeric", multilabel = TRUE ) ages <- c(3, 14, 25, 70) # fput returns first match only fput(ages, "age_categories") # fput_all returns ALL matching labels all_labels <- fput_all(ages, "age_categories") for (i in seq_along(ages)) { cat("Age", ages[i], "->", paste(all_labels[[i]], collapse = ", "), "\n") }
fnew( "0,100,TRUE,TRUE" = "Valid Score", "0,49,TRUE,TRUE" = "Below Average", "50,100,TRUE,TRUE" = "Above Average", "90,100,TRUE,TRUE" = "Excellent", .missing = "No Score", .other = "Out of Range", name = "score_ml", type = "numeric", multilabel = TRUE ) scores <- c(95, 45, NA, 150) ml_result <- fput_all(scores, "score_ml") for (i in seq_along(scores)) { cat("Score", ifelse(is.na(scores[i]), "NA", scores[i]), "->", paste(ml_result[[i]], collapse = ", "), "\n") }
fparse(text = ' VALUE risk (numeric, multilabel) [0, 3] = "Low Risk" [0, 7] = "Monitored" (3, 7] = "Medium Risk" (7, 10] = "High Risk" ; ') risk_scores <- c(2, 5, 9) risk_labels <- fput_all(risk_scores, "risk") for (i in seq_along(risk_scores)) { cat("Score", risk_scores[i], "->", paste(risk_labels[[i]], collapse = " | "), "\n") }
risk_obj <- format_get("risk") cat(fexport(risk = risk_obj)) fprint("risk")
fnew( "1,1,TRUE,TRUE" = "Mild", "2,2,TRUE,TRUE" = "Moderate", "3,3,TRUE,TRUE" = "Severe", "4,4,TRUE,TRUE" = "Life-threatening", "5,5,TRUE,TRUE" = "Fatal", "3,5,TRUE,TRUE" = "Serious", "1,2,TRUE,TRUE" = "Non-serious", name = "ae_grade", type = "numeric", multilabel = TRUE ) grades <- c(1, 2, 3, 4, 5) ae_labels <- fput_all(grades, "ae_grade") for (i in seq_along(grades)) { cat("Grade", grades[i], ":", paste(ae_labels[[i]], collapse = " + "), "\n") } fclear()
sex_nc <- fnew( "M" = "Male", "F" = "Female", .missing = "Unknown", name = "sex_nc", type = "character", ignore_case = TRUE ) input <- c("m", "F", "M", "f", NA) fput(input, sex_nc) # Note the [nocase] flag fprint("sex_nc") # Also works with fputc fputc("m", "sex_nc") fclear()
Expression labels contain .x1, .x2, etc., which reference extra arguments
passed to fput(). This lets you compute labels dynamically.
sprintf Expressionstat_fmt <- fnew( "n" = "sprintf('%s', .x1)", "pct" = "sprintf('%.1f%%', .x1 * 100)", name = "stat", type = "character" ) types <- c("n", "pct", "n", "pct") values <- c(42, 0.053, 100, 0.255) fput(types, stat_fmt, values)
.x1, .x2)ratio_fmt <- fnew( "ratio" = "sprintf('%s/%s', .x1, .x2)", name = "ratio", type = "character" ) fput("ratio", ratio_fmt, 3, 10) fput(c("ratio", "ratio"), ratio_fmt, c(3, 7), c(10, 20))
ifelse Expressionsign_fmt <- fnew( "val" = "ifelse(.x1 > 0, paste0('+', .x1), as.character(.x1))", name = "sign", type = "character" ) nums <- c(5, 0, -3) fput(rep("val", 3), sign_fmt, nums)
mixed_fmt <- fnew( "header" = "HEADER", "n" = "sprintf('N=%s', .x1)", "pct" = "sprintf('%.1f%%', .x1 * 100)", name = "mixed", type = "character" ) keys <- c("header", "n", "pct", "header", "n") vals <- c(0, 42, 0.15, 0, 100) fput(keys, mixed_fmt, vals)
.other Fallbackknown_fmt <- fnew( "ok" = "OK", .other = "sprintf('Error(%s)', .x1)", name = "err_fmt", type = "character" ) codes <- c("ok", "E01", "ok", "E99") details <- c("", "timeout", "", "overflow") fput(codes, known_fmt, details)
label_fmt <- fnew( "val" = "sprintf('%s (N=%s)', .x1, .x2)", name = "recycle", type = "character" ) fput(c("val", "val"), label_fmt, c(42, 55), 100)
A realistic clinical-trial example: e() marks labels as expressions evaluated
at apply-time, .x1 references the extra argument, and multiline dplyr::case_when
shows complex conditional formatting.
# Population counts used as denominators n.trt <- data.frame(pop = c("fas","pps","saf"), ntot = c(34, 30, 36)) get_n <- function(pop) { n.trt$ntot[n.trt$pop == pop] } fnew( "n_fas" = e("get_n('fas')"), "n_pps" = e("get_n('pps')"), "n_saf" = e("get_n('saf')"), "n" = "sprintf('%d', .x1)", "n_pct_fas" = "sprintf('%d (%5.1f%%)', .x1, .x1 * 100 / get_n('fas'))", "n_pct_pps" = "sprintf('%d (%5.1f%%)', .x1, .x1 * 100 / get_n('pps'))", "n_pct_saf" = "sprintf('%d (%5.1f%%)', .x1, .x1 * 100 / get_n('saf'))", "pct" = "dplyr::case_when( .x1>0 & .x1<0.1 ~ sprintf('%5s', ' <0.1%'), .x1>=0.1 | .x1==0 ~ sprintf(paste0('%5.', 1 ,'f%%'), .x1) )", "pval" = "dplyr::case_when( .x1>=0 & .x1<0.001 ~ sprintf('%s', '<0.001'), .x1>=0.001 & .x1<=0.999 ~ sprintf(paste0('%.', 3 ,'f'), .x1), .x1>0.999 ~ sprintf('%s', '>0.999'), .default = '--' )", name = "stat", type = "character" )
The same format can be created via fparse(). Note that multiline expressions
must be collapsed to single lines in the text block, and (eval) marks
evaluated labels:
fmt <- ' VALUE stat_01 (character) "n_fas" = "get_n(\'fas\')" (eval) "n_pps" = "get_n(\'pps\')" (eval) "n_saf" = "get_n(\'saf\')" (eval) "n" = "sprintf(\'%d\', .x1)" "pct" = "dplyr::case_when(.x1>0 & .x1<0.1 ~ sprintf(\'%5s\', \' <0.1%\'), .x1>=0.1 | .x1==0 ~ sprintf(paste0(\'%5.\', 1 ,\'f%%\'), .x1))" "n_pct_fas" = "sprintf(\'%d (%5.1f%%)\', .x1, .x1 * 100 / get_n(\'fas\'))" "n_pct_pps" = "sprintf(\'%d (%5.1f%%)\', .x1, .x1 * 100 / get_n(\'pps\'))" "n_pct_saf" = "sprintf(\'%d (%5.1f%%)\', .x1, .x1 * 100 / get_n(\'saf\'))" "pval" = "dplyr::case_when(.x1>=0 & .x1<0.001 ~ sprintf(\'%s\', \'<0.001\'), .x1>=0.001 & .x1<=0.999 ~ sprintf(paste0(\'%.\', 3 ,\'f\'), .x1), .x1>0.999 ~ sprintf(\'%s\', \'>0.999\'), .default = \'--\')" ;' fparse(fmt)
Both stat (via fnew) and stat_01 (via fparse) produce identical results:
df <- data.frame( types = c("n_fas", "n_pps", "n_saf", "n", "pct", "pct", "n", "pval", "pval", "n_pct_fas", "n_pct_pps", "n_pct_saf"), values = c(NA, NA, NA, 42, 0.053, 0.0008, 100, 0.255, 0.0003, 22, 22, 22) ) df$fmt <- fput(df$types, "stat", df$values) df$fmt_01 <- fput(df$types, "stat_01", df$values) print(df) fclear()
Each element can use a different format, determined by a vector of format names:
# Dispatch format: maps type code to format name fnew("1" = "groupx", "2" = "groupy", "3" = "groupz", name = "typefmt", type = "numeric") # Per-group character formats fnew("positive" = "agree", "negative" = "disagree", "neutral" = "notsure", name = "groupx", type = "character") fnew("positive" = "accept", "negative" = "reject", "neutral" = "possible", name = "groupy", type = "character") fnew("positive" = "pass", "negative" = "fail", "neutral" = "retest", name = "groupz", type = "character") type <- c(1, 1, 1, 2, 2, 2, 3, 3, 3) response <- c("positive", "negative", "neutral", "positive", "negative", "neutral", "positive", "negative", "neutral") # Step 1: map type -> format name respfmt <- fput(type, "typefmt") # Step 2: apply per-element format word <- fputc(response, respfmt) data.frame(type = type, response = response, respfmt = respfmt, word = word) fclear()
A SAS-style workflow where format names are looked up dynamically per observation:
# Format that maps key codes to date format names fnew("1" = "date9.", "2" = "mmddyy10.", name = "writfmt", type = "numeric") fnew_date("date9.") fnew_date("mmddyy10.") # Input data (R date numbers = days since 1970-01-01) number <- c(12103, 10899) key <- c(1, 2) # Look up format name per observation datefmt <- fputn(key, "writfmt") # Apply per-element date format date <- fputn(number, datefmt) data.frame(number = number, key = key, datefmt = datefmt, date = date) fclear()
The fimport() function reads a CSV file exported from a SAS format catalogue
(PROC FORMAT ... CNTLOUT=):
csv_path <- system.file("extdata", "test_cntlout.csv", package = "ksformat")
imported <- fimport(csv_path) names(imported) flist() fprint()
# Character format (GENDER) gender_codes <- c("M", "F", NA, "X") data.frame( code = gender_codes, label = fputc(gender_codes, "GENDER") ) # Numeric format (AGEGRP) ages <- c(5, 17, 18, 45, 65, 100, NA, -1) data.frame( age = ages, group = fputn(ages, "AGEGRP") ) # Numeric format (BMICAT) bmi_values <- c(15.0, 18.5, 22.3, 25.0, 28.7, 30.0, 35.5) data.frame( bmi = bmi_values, category = fputn(bmi_values, "BMICAT") ) # Invalue (RACEIN) race_labels <- c("White", "Black", "Asian", "Other") data.frame( label = race_labels, code = finputn(race_labels, "RACEIN") )
df <- data.frame( id = 1:5, sex = c("M", "F", "M", NA, "F"), age = c(10, 30, 70, NA, 50), stringsAsFactors = FALSE ) gender_fmt <- imported[["GENDER"]] age_fmt <- imported[["AGEGRP"]] fput_df(df, sex = gender_fmt, age = age_fmt, suffix = "_label")
cat(fexport(AGEGRP = age_fmt)) cat(fexport(GENDER = gender_fmt))
fclear() manual <- fimport(csv_path, register = FALSE) # Library should be empty flist() fprint() # Use directly from returned list fput(c("M", "F"), manual[["GENDER"]]) fclear()
Expression labels can select between languages at apply-time using an extra argument:
# Single format, language selected via .x1 extra argument sex_bi <- fnew( "M" = "ifelse(.x1 == 'en', 'Male', 'Homme')", "F" = "ifelse(.x1 == 'en', 'Female', 'Femme')", .missing = "Unknown", name = "sex_bi" ) # .x1 = language code per observation fput(c("M", "F", "M"), sex_bi, c("en", "fr", "en")) # -> "Male" "Femme" "Male" # Alternative: one format per language, selected at apply-time fnew("M" = "Male", "F" = "Female", .missing = "Unknown", name = "sex_en") fnew("M" = "Homme", "F" = "Femme", .missing = "Inconnu", name = "sex_fr") lang <- "fr" fput(c("M", "F", NA), paste0("sex_", lang)) # -> "Homme" "Femme" "Inconnu" fclear()
fputk()fputk() pastes multiple vectors into a composite key before format lookup.
This is useful when a format is keyed on the combination of several columns,
a common pattern in clinical data (e.g., looking up a visit date by
subject + visit number).
# Simulate a Subject Visits (SV) domain SV <- data.frame( USUBJID = c("SUBJ-001", "SUBJ-001", "SUBJ-001", "SUBJ-002", "SUBJ-002"), VISITNUM = c(1, 2, 3, 1, 2), SVSTDTC = c("2025-01-15", "2025-02-20", "2025-03-10", "2025-01-18", "2025-02-25"), stringsAsFactors = FALSE ) # Simulate a Questionnaires (QS) domain QS <- data.frame( USUBJID = c("SUBJ-001", "SUBJ-001", "SUBJ-002", "SUBJ-002", "SUBJ-002"), VISITNUM = c(1, 2, 1, 2, 3), QSTESTCD = c("SCORE1", "SCORE1", "SCORE1", "SCORE1", "SCORE1"), QSSTRESN = c(85, 90, 72, 78, NA), stringsAsFactors = FALSE ) SV QS
Register a format keyed on USUBJID|VISITNUM with values being the visit
start date (SVSTDTC) as character strings:
# Create composite key -> date string mapping from SV fnew( fmap(paste(SV$USUBJID, SV$VISITNUM, sep = "|"), SV$SVSTDTC), .other = "NOT FOUND", name = "svdtc", type = "character", ignore_case = TRUE ) fprint("svdtc")
Now look up visit dates in the QS domain using fputk():
QS$SVSTDTC <- fputk(QS$USUBJID, QS$VISITNUM, format = "svdtc") QS class(QS$SVSTDTC) # character fclear()
Using type = "Date", values are stored as native R Date objects and
fput()/fputk() return them directly — no string conversion needed:
# Create composite key -> Date mapping from SV fnew( fmap( paste(SV$USUBJID, SV$VISITNUM, sep = "|"), as.Date(SV$SVSTDTC, format = "%Y-%m-%d") ), .other = NA, name = "svdtn", type = "Date", ignore_case = TRUE ) fprint("svdtn")
QS$SVSTDTC_DT <- fputk(QS$USUBJID, QS$VISITNUM, format = "svdtn") QS class(QS$SVSTDTC_DT) # Date # Typed NA for unmatched keys (SUBJ-002 Visit 3 not in SV) is.na(QS$SVSTDTC_DT[5]) # Date arithmetic works directly QS$SVSTDTC_DT + 7 # add 7 days fclear()
fmap()When building formats from data (e.g., a data frame with 1000+ rows), you
need a named vector mapping keys to values. By default, fnew() treats
named vectors differently depending on the output type:
Date, POSIXct, logical): c(key = value) — natural
direction, no reversal.c(Label = "Code") — R convention, names and
values are swapped internally.This inconsistency is confusing for data-driven formats. The fmap() helper
solves it: fmap(keys, values) works identically for all types.
Suppose we have a demographics dataset and need two lookup formats from the same data — one returning Date objects, one returning character strings:
library(ksformat) dm <- data.frame( USUBJID = c("SUBJ-001", "SUBJ-002", "SUBJ-003"), SUBJID = c("001", "002", "003"), RFICDTC = c("2023-03-09T08:45", "2024-08-13T09:53", "2025-06-17T09:03"), stringsAsFactors = FALSE ) # Composite key for both formats keys <- paste(dm$USUBJID, dm$SUBJID, sep = "|")
fmap(keys, values) pattern for both typesBoth formats use the identical calling style — fmap(keys, values) where
keys are input lookup values and values are output objects:
# Date lookup fnew( fmap(keys, as.Date(dm$RFICDTC, format = "%Y-%m-%d")), .other = NA, type = "Date", ignore_case = TRUE, name = "icdtn" ) # Character lookup — same fmap(keys, values) pattern! fnew( fmap(keys, dm$RFICDTC), .other = "NOT FOUND", type = "character", ignore_case = TRUE, name = "icdtc" ) fprint("icdtn") fprint("icdtc")
# Both return the expected results fputk("SUBJ-001", "001", format = "icdtn") class(fputk("SUBJ-001", "001", format = "icdtn")) fputk("SUBJ-001", "001", format = "icdtc") class(fputk("SUBJ-001", "001", format = "icdtc")) fclear()
No extra parameters needed — fmap() tells fnew() to use the natural
direction for all types.
The default auto-reversal preserves the standard R convention
where c(Label = "Code") maps Code -> Label. This is natural for
hand-written formats:
# These are equivalent — both map "M" -> "Male" fmt_a <- fnew(c(Male = "M", Female = "F")) fmt_b <- fnew("M" = "Male", "F" = "Female") identical(fput(c("M", "F"), fmt_a), fput(c("M", "F"), fmt_b)) fclear()
| Use case | Style | Reversal |
|:--------------------------|:--------------------------------------|:----------|
| Data-driven (any type) | fmap(keys, values) | Suppressed |
| Hand-written (char/num) | c(Label = "Code") or "Code" = "Label" | Auto (default) |
| Value types (Date, etc.)| fmap(keys, values) or setNames(values, keys) | No reversal (default) |
fparse() and fputk()Examples 20–21 built composite-key formats programmatically with fnew() and
fmap(). When the mapping is small and known in advance (e.g., a
study-specific visit schedule), you can define the same lookup entirely in
text with fparse().
The simplest approach: store dates as character strings using a regular
character format.
fparse(text = ' VALUE svdtc (character, nocase) "SUBJ-001|1" = "2025-01-15" "SUBJ-001|2" = "2025-02-20" "SUBJ-001|3" = "2025-03-10" "SUBJ-002|1" = "2025-01-18" "SUBJ-002|2" = "2025-02-25" .other = "NOT FOUND" ; ') fprint("svdtc")
Apply with fputk() to look up visit dates from a questionnaire domain:
QS <- data.frame( USUBJID = c("SUBJ-001", "SUBJ-001", "SUBJ-002", "SUBJ-002", "SUBJ-002"), VISITNUM = c(1, 2, 1, 2, 3), QSSTRESN = c(85, 90, 72, 78, NA), stringsAsFactors = FALSE ) QS$SVSTDTC <- fputk(QS$USUBJID, QS$VISITNUM, format = "svdtc") QS fclear()
Use the Date value type with format: to store dates as native R Date
objects. The format: parameter tells fparse() how to parse the date
strings in the text block:
fparse(text = ' VALUE svdtn (Date, format: %Y-%m-%d, nocase) "SUBJ-001|1" = "2025-01-15" "SUBJ-001|2" = "2025-02-20" "SUBJ-001|3" = "2025-03-10" "SUBJ-002|1" = "2025-01-18" "SUBJ-002|2" = "2025-02-25" ; ') fprint("svdtn")
Now fputk() returns real Date objects — arithmetic and comparison work
directly:
QS$SVSTDTC_DT <- fputk(QS$USUBJID, QS$VISITNUM, format = "svdtn") QS class(QS$SVSTDTC_DT) # Date is.na(QS$SVSTDTC_DT[5]) # TRUE — no match for SUBJ-002 Visit 3 # Date arithmetic works directly QS$SVSTDTC_DT + 7
Formats created with fparse() can be exported back to text with fexport()
and re-parsed — useful for version-controlled format definitions:
fmt_obj <- format_get("svdtn") txt <- fexport(svdtn = fmt_obj) cat(txt)
# Re-parse the exported text fclear() fparse(text = txt) # Verify it still works fputk("SUBJ-001", 2, format = "svdtn") fclear()
franges()franges() extracts all range-based mappings from a format and returns them
as a tidy data.frame — useful for auditing, documentation, or downstream
processing.
fparse(text = ' VALUE age (numeric) [0, 18) = "Child" [18, 65) = "Adult" [65, HIGH] = "Senior" .missing = "Unknown" ; ') franges("age")
You can use the result like any data frame — filter, display, or feed into further calculations:
df <- franges("age") # Which ranges have a finite upper bound? df[is.finite(df$high), ]
franges() silently excludes discrete entries (.missing, .other, plain
string keys) — only range rows appear. It returns an empty data.frame with
the same columns when the format contains no ranges.
fnew("M" = "Male", "F" = "Female", .missing = "Unknown", name = "sex") franges("sex") # 0 rows
fclear()
fmap_to_ranges()When a range format stores numeric codes as its labels (e.g. visit
windows coded as weeks), fmap_to_ranges() turns a vector of those codes
back into the original [low, high] bounds — one row per input value.
fparse(text = ' VALUE visit_ther (numeric) [LOW, 1] = 0 [ 8, 22] = 2 [22, 36] = 4 [37, 50] = 6 [51, 63] = 8 [64, 78] = 10 [79, 91] = 12 ; ') coded_weeks <- c(0, 2, 4, 6, 8, 10, 12) fmap_to_ranges(coded_weeks, "visit_ther")
Unmatched values produce NA rows, making it safe to pass arbitrary vectors:
fmap_to_ranges(c(2, 99, 4), "visit_ther")
fclear()
date_range and datetime_range formats bucket Date or POSIXct input
into character labels using ISO date/datetime interval bounds.
They reuse the same range-table engine as numeric ranges, so the
findInterval() fast path is active for sorted, disjoint buckets.
fnew( "2023-01-01,2024-01-01,TRUE,FALSE" = "FY23", "2024-01-01,2025-01-01,TRUE,FALSE" = "FY24", "2025-01-01,2026-01-01,TRUE,FALSE" = "FY25", type = "date_range", name = "fiscal_year" ) dates <- as.Date(c("2023-06-15", "2024-03-01", "2024-12-31", "2025-07-04", "2022-01-01", NA)) data.frame( date = dates, fy = fput(dates, "fiscal_year") )
fparse()fparse(text = ' VALUE quarter (date_range) [2024-01-01, 2024-04-01) = "Q1-2024" [2024-04-01, 2024-07-01) = "Q2-2024" [2024-07-01, 2024-10-01) = "Q3-2024" [2024-10-01, 2025-01-01) = "Q4-2024" .other = "Outside 2024" ; ') sample_dates <- as.Date(c("2024-02-14", "2024-05-20", "2024-08-08", "2024-11-30", "2025-03-01")) data.frame( date = sample_dates, quarter = fput(sample_dates, "quarter") )
LOW / HIGH open-ended boundsLOW and HIGH represent $-\infty$ and $+\infty$ — any date before or
after a cutpoint falls in the open arm.
fparse(text = ' VALUE era (date_range) [LOW, 2000-01-01) = "Pre-2000" [2000-01-01, 2010-01-01) = "2000s" [2010-01-01, 2020-01-01) = "2010s" [2020-01-01, HIGH] = "2020+" ; ') event_dates <- as.Date(c("1985-07-04", "2005-12-25", "2015-06-01", "2023-11-11")) data.frame( date = event_dates, era = fput(event_dates, "era") )
Formats export with ISO date bounds and re-parse without loss:
q_obj <- format_get("quarter") cat(fexport(quarter = q_obj))
# Re-parse the exported text txt <- fexport(quarter = q_obj) fclear() fparse(text = txt) fput(as.Date(c("2024-02-14", "2024-08-08")), "quarter")
multilabel and fput_all()fparse(text = ' VALUE study_window (date_range, multilabel) [2024-01-01, 2024-07-01) = "First Half" [2024-04-01, 2024-10-01) = "Mid-Year" [2024-07-01, 2025-01-01) = "Second Half" ; ') checkup_dates <- as.Date(c("2024-02-15", "2024-05-20", "2024-09-01")) all_windows <- fput_all(checkup_dates, "study_window") for (i in seq_along(checkup_dates)) { cat(format(checkup_dates[i]), "->", paste(all_windows[[i]], collapse = " | "), "\n") }
When no explicit type is given, fparse() infers date_range from ISO date
bounds and datetime_range when bounds include a time component:
fparse(text = ' VALUE auto_fy [2024-01-01, 2025-01-01) = "2024" ; VALUE auto_shift [2024-01-15 08:00, 2024-01-15 16:00) = "Day shift" ; ') cat("auto_fy type :", format_get("auto_fy")$type, "\n") cat("auto_shift type:", format_get("auto_shift")$type, "\n")
datetime_range works identically to date_range but matches against
POSIXct values. Bounds are expressed as YYYY-MM-DD HH:MM[:SS] strings.
fparse(text = ' VALUE shift (datetime_range) [2024-01-15 00:00, 2024-01-15 08:00) = "Night" [2024-01-15 08:00, 2024-01-15 16:00) = "Day" [2024-01-15 16:00, 2024-01-16 00:00) = "Evening" ; ') timestamps <- as.POSIXct( c("2024-01-15 03:22:00", "2024-01-15 11:45:00", "2024-01-15 19:00:00"), tz = "UTC" ) data.frame( ts = format(timestamps, tz = "UTC"), shift = fput(timestamps, "shift") )
fclear()
fputk()The stratified_range type combines a discrete stratum (such as a study arm,
subject id, or any composite key) with a numeric / Date / POSIXct range. Each
stratum has its own bucket boundaries, and fputk() dispatches to the right
bucket for each row.
fmap_strata()visits <- fmap_strata( stratum = c("ARM_A", "ARM_A", "ARM_A", "ARM_B", "ARM_B"), low = c(0, 7, 28, 0, 14), high = c(7, 28, Inf, 14, Inf), label = c("Baseline", "Wk1-3", "Wk4+", "Baseline", "Wk2+"), inc_high = c(FALSE, FALSE, TRUE, FALSE, TRUE) ) fnew(visits, type = "stratified_range", ".other|ARM_A" = "A_outside", .other = "outside_window", name = "vw") df <- data.frame( arm = c("ARM_A", "ARM_A", "ARM_B", "ARM_B", "ARM_C"), day = c(3, 35, 5, 40, 10) ) df$visit <- fputk(df$arm, df$day, format = "vw") df
fparse()fparse(text = ' VALUE vw_text (stratified_range, range_subtype: numeric) "ARM_A"|[0, 7) = "Baseline" "ARM_A"|[7, 28) = "Wk1-3" "ARM_A"|[28, HIGH]= "Wk4+" "ARM_B"|[0, 14) = "Baseline" "ARM_B"|[14, HIGH]= "Wk2+" ".other|ARM_A" = "A_outside" .other = "outside_window" ; ') fputk(df$arm, df$day, format = "vw_text")
windows <- fmap_strata( stratum = c("S001", "S001", "S002", "S002"), low = as.Date(c("2024-01-01", "2024-01-15", "2024-02-01", "2024-02-20")), high = as.Date(c("2024-01-15", "2024-02-01", "2024-02-20", "2024-03-10")), label = c("Screen", "Treat", "Screen", "Treat") ) fnew(windows, type = "stratified_range", range_subtype = "date", .other = "off-window", name = "win") subj <- c("S001", "S001", "S002", "S002", "S003") visits <- as.Date(c("2024-01-05", "2024-01-20", "2024-02-10", "2024-03-01", "2024-01-01")) data.frame( subj = subj, date = visits, phase = fputk(subj, visits, format = "win") )
fexport() / fparse()txt <- fexport(format_get("vw")) cat(txt, "\n") fclear() fparse(text = txt) fputk(df$arm, df$day, format = "vw")
fclear()
fmap_ranges()For non-stratified numeric / Date ranges, fmap_ranges() saves you from
hand-crafting canonical \code{"low,high,inc_low,inc_high"} keys.
age_groups <- fmap_ranges( low = c(0, 18, 65), high = c(18, 65, Inf), label = c("Child", "Adult", "Senior"), inc_high = c(FALSE, FALSE, TRUE) ) fnew(age_groups, type = "numeric", name = "ag") fput(c(5, 25, 90), "ag") fclear()
na_as_string)When building a format from data using fmap(paste(..., sep = "|"), values),
base R's paste() converts any NA component to the literal string
"NA". The resulting composite key is therefore "CAT|TEST|NA", not a
missing value.
By default, fputk() restores NA_character_ wherever any component is
NA before the lookup — so the key "CAT|TEST|NA" is never reached and
the row falls through to .other / .missing.
Setting na_as_string = TRUE keeps paste()'s literal "NA", making the
round-trip consistent.
A common ADaM task: derive PARAMCD from a combination of
LBCAT, LBSPEC, LBTESTCD, and LBSTRESU, where some rows have
LBSTRESU = NA (dimensionless tests such as INR).
# Source lab mapping (as received from a specification) lb_map <- data.frame( LBCAT = c("BLOOD CHEMISTRY", "COAGULOGRAM", "COAGULATION PANEL", "COAGULOGRAM"), LBSPEC = c("BLOOD", "BLOOD", "BLOOD", "BLOOD"), LBTESTCD = c("ALB", "FIBRINO", "INR", "INR"), LBSTRESU = c("g/L", "g/L", NA, NA), PARAMCD = c("ALB", "FIBRINO", "INR", "INR"), stringsAsFactors = FALSE ) lb_map
Build the format with fmap(paste(...), PARAMCD).
paste() converts NA in LBSTRESU to "NA", so the stored keys for
INR rows are "COAGULATION PANEL|BLOOD|INR|NA" and "COAGULOGRAM|BLOOD|INR|NA".
with(lb_map, fmap(paste(LBCAT, LBSPEC, LBTESTCD, LBSTRESU, sep = "|"), PARAMCD) ) |> fnew(ignore_case = TRUE, .other = NA, type = "character", name = "lb_param") fprint("lb_param")
Now apply the format.
With the default na_as_string = FALSE, the INR rows get NA (no match):
lb_map$PARAMCD_default <- with(lb_map, fputk(LBCAT, LBSPEC, LBTESTCD, LBSTRESU, format = "lb_param") ) lb_map[, c("LBTESTCD", "LBSTRESU", "PARAMCD", "PARAMCD_default")]
With na_as_string = TRUE, paste() also converts the lookup-side NA to
"NA", so the keys match:
lb_map$PARAMCD_back <- with(lb_map, fputk(LBCAT, LBSPEC, LBTESTCD, LBSTRESU, format = "lb_param", na_as_string = TRUE) ) lb_map[, c("LBTESTCD", "LBSTRESU", "PARAMCD", "PARAMCD_back")]
fclear()
Use
na_as_string = TRUEwhenever the format was built with
fmap(paste(...), values)and any key column can containNA.
If the format keys were set by hand
(fnew("CAT|TEST|g/L" = "ALB", ...)), NA components should still go
through .missing — keep the default na_as_string = FALSE.
finputk()finputk() is the invalue-side mirror of fputk(): it pastes multiple
columns into a composite label and reverse-looks it up in a ks_invalue
format. The same na_as_string argument applies.
# Build an INVALUE from two-column composite labels finput( fmap(paste(c("BLOOD CHEMISTRY", "COAGULOGRAM", "COAGULATION PANEL"), c("ALB", "FIBRINO", "INR"), sep = "|"), c(1L, 2L, 3L)), target_type = "integer", name = "lb_code_inv" ) # Reverse lookup: two separate columns → integer code cat_vec <- c("BLOOD CHEMISTRY", "COAGULOGRAM", "COAGULATION PANEL", "OTHER") test_vec <- c("ALB", "FIBRINO", "INR", "X") finputk(cat_vec, test_vec, invalue_name = "lb_code_inv") # BLOOD CHEMISTRY|ALB → 1, COAGULOGRAM|FIBRINO → 2, # COAGULATION PANEL|INR → 3, OTHER|X → NA (no match → missing_value) fclear()
na_as_string = TRUE)When the INVALUE was built from data containing NA columns, use
na_as_string = TRUE on both the build side (fmap(paste(...), ...)) and
the lookup side (finputk(..., na_as_string = TRUE)).
# INVALUE where LBSTRESU can be NA (like INR) finput( fmap( paste(lb_map$LBCAT, lb_map$LBTESTCD, lb_map$LBSTRESU, sep = "|"), seq_len(nrow(lb_map)) ), target_type = "integer", name = "lb_row_inv" ) # Reconstruct lb_map row indices — works even when LBSTRESU is NA finputk(lb_map$LBCAT, lb_map$LBTESTCD, lb_map$LBSTRESU, invalue_name = "lb_row_inv", na_as_string = TRUE) fclear()
The output type is always determined by the stored invalue's target_type
(here integer). For character output create the invalue with
target_type = "character" and finputk() returns a character vector.
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.