inst/testdata/test_pharmaverse.R

#!/usr/bin/env Rscript
# =============================================================================
# clinCompare — Pharmaverse Integration Test
# =============================================================================
# Tests clinCompare against real-world CDISC datasets from the pharmaverse
# ecosystem (pharmaversesdtm + pharmaverseadam). These are publicly available
# CDISC-compliant datasets modeled on realistic clinical trial data.
#
# Prerequisites:
#   install.packages("pharmaversesdtm")
#   install.packages("pharmaverseadam")
#
# Usage:
#   devtools::load_all("~/Desktop/cowork/clinCompare")
#   source("inst/testdata/test_pharmaverse.R")
# =============================================================================

cat("\n")
cat("==============================================================\n")
cat("  clinCompare — Pharmaverse Integration Test\n")
cat("  Real-world CDISC data from pharmaversesdtm + pharmaverseadam\n")
cat("==============================================================\n\n")

# --- Check pharmaverse packages are available -----------------------------
if (!requireNamespace("pharmaversesdtm", quietly = TRUE)) {
  stop("pharmaversesdtm not installed. Run: install.packages('pharmaversesdtm')")
}
if (!requireNamespace("pharmaverseadam", quietly = TRUE)) {
  stop("pharmaverseadam not installed. Run: install.packages('pharmaverseadam')")
}

# --- Load SDTM data -------------------------------------------------------
cat("Loading pharmaverse SDTM datasets...\n")
dm_pharma   <- as.data.frame(pharmaversesdtm::dm)
ae_pharma   <- as.data.frame(pharmaversesdtm::ae)
lb_pharma   <- as.data.frame(pharmaversesdtm::lb)
vs_pharma   <- as.data.frame(pharmaversesdtm::vs)
ex_pharma   <- as.data.frame(pharmaversesdtm::ex)
cat(sprintf("  DM: %d rows x %d cols\n", nrow(dm_pharma), ncol(dm_pharma)))
cat(sprintf("  AE: %d rows x %d cols\n", nrow(ae_pharma), ncol(ae_pharma)))
cat(sprintf("  LB: %d rows x %d cols\n", nrow(lb_pharma), ncol(lb_pharma)))
cat(sprintf("  VS: %d rows x %d cols\n", nrow(vs_pharma), ncol(vs_pharma)))
cat(sprintf("  EX: %d rows x %d cols\n", nrow(ex_pharma), ncol(ex_pharma)))

# --- Load ADaM data -------------------------------------------------------
cat("\nLoading pharmaverse ADaM datasets...\n")
adsl_pharma <- as.data.frame(pharmaverseadam::adsl)
adae_pharma <- as.data.frame(pharmaverseadam::adae)
adlb_pharma <- as.data.frame(pharmaverseadam::adlb)
cat(sprintf("  ADSL: %d rows x %d cols\n", nrow(adsl_pharma), ncol(adsl_pharma)))
cat(sprintf("  ADAE: %d rows x %d cols\n", nrow(adae_pharma), ncol(adae_pharma)))
cat(sprintf("  ADLB: %d rows x %d cols\n", nrow(adlb_pharma), ncol(adlb_pharma)))


# =============================================================================
# PHARMA TEST 1: Domain auto-detection on real-world data
# =============================================================================
# ADaM detection should now correctly identify ADSL/ADAE/ADLB as ADaM
# (not SDTM) thanks to indicator column boosting
cat("\n\n--- PHARMA TEST 1: Domain auto-detection on real-world data ----------\n")
pharma_datasets <- list(
  DM = dm_pharma,
  AE = ae_pharma,
  LB = lb_pharma,
  VS = vs_pharma,
  EX = ex_pharma,
  ADSL = adsl_pharma,
  ADAE = adae_pharma,
  ADLB = adlb_pharma
)

for (name in names(pharma_datasets)) {
  det <- tryCatch(
    detect_cdisc_domain(pharma_datasets[[name]]),
    warning = function(w) {
      result <- suppressWarnings(detect_cdisc_domain(pharma_datasets[[name]]))
      result$ambiguity_warning <- conditionMessage(w)
      result
    }
  )
  warn <- if (!is.null(det$ambiguity_warning)) " [ambiguous]" else ""
  correct <- identical(det$domain, name)
  status <- if (correct) "OK" else sprintf("MISMATCH (got %s)", det$domain)
  cat(sprintf("  %-6s -> Detected: %-6s (%s) | Confidence: %.0f%% | %s%s\n",
              name, det$domain, det$standard, det$confidence * 100, status, warn))
}


# =============================================================================
# PHARMA TEST 2: Compare real DM with simulated edits
# =============================================================================
cat("\n\n--- PHARMA TEST 2: DM — compare with simulated corrections ----------\n")

# Create a "corrected" version of DM with realistic edits
dm_corrected <- dm_pharma
set.seed(42)
n_dm <- nrow(dm_corrected)

# 1. Correct some RACE values (common data correction scenario)
race_edits <- sample(seq_len(n_dm), 5)
dm_corrected$RACE[race_edits] <- "MULTIPLE"

# 2. Add a new column (ETHNIC was missing, now added)
if (!"ETHNIC" %in% names(dm_corrected)) {
  dm_corrected$ETHNIC <- sample(c("HISPANIC OR LATINO", "NOT HISPANIC OR LATINO"),
                                 n_dm, replace = TRUE, prob = c(0.15, 0.85))
}

# 3. Add a new subject (new enrollment)
new_subj <- dm_corrected[1, , drop = FALSE]
new_subj$USUBJID <- "AB12345-NEW-SUBJ-999"
new_subj$SUBJID <- "999"
dm_corrected <- rbind(dm_corrected, new_subj)

result <- cdisc_compare(dm_pharma, dm_corrected, domain = "DM", standard = "SDTM")
print(result)

# Key-based comparison for real analysis
cat("\n--- PHARMA TEST 2b: DM — key-based matching by USUBJID ---------------\n")
result_keyed <- cdisc_compare(dm_pharma, dm_corrected, domain = "DM",
                               standard = "SDTM", id_vars = "USUBJID")
print(result_keyed)

cat("\n--- PHARMA TEST 2c: cdisc_compare() with WHERE filter ----------------\n")
# Filter to just male subjects
dm_where_result <- cdisc_compare(dm_pharma, dm_corrected,
                                  domain = "DM", standard = "SDTM",
                                  where = "SEX == 'M'")
n_male_base <- sum(dm_pharma$SEX == "M")
cat(sprintf("  Males in base: %d\n", n_male_base))
cat(sprintf("  Rows compared: %d (should equal males in base)\n",
            dm_where_result$nrow_df1))

cat("\n--- PHARMA TEST 2d: cdisc_compare() with vars parameter ---------------\n")
dm_vars_result <- cdisc_compare(dm_pharma, dm_corrected,
                                 domain = "DM", standard = "SDTM",
                                 vars = c("RACE", "AGE"))
obs_diffs <- dm_vars_result$observation_comparison
diff_cols <- names(obs_diffs$discrepancies[obs_diffs$discrepancies > 0])
cat(sprintf("  Vars requested: RACE, AGE\n"))
cat(sprintf("  Cols with diffs: %s (should only be RACE and/or AGE)\n",
            paste(diff_cols, collapse = ", ")))


# =============================================================================
# PHARMA TEST 3: Tolerance stress test with floating-point rounding noise
# =============================================================================
cat("\n\n--- PHARMA TEST 3: Tolerance stress test (floating-point noise) ------\n")
cat("  Scenario: Simulate SAS→R numeric conversion rounding artifacts\n")
cat("  (SAS stores 8-byte doubles; R reads may differ at ~1e-10 precision)\n\n")

# Create two versions of LB that differ ONLY in floating-point noise
lb_base <- lb_pharma
lb_noisy <- lb_pharma

# Add realistic floating-point noise to LBSTRESN (the primary numeric result)
set.seed(123)
if ("LBSTRESN" %in% names(lb_noisy)) {
  n_lb <- nrow(lb_noisy)
  numeric_rows <- which(!is.na(lb_noisy$LBSTRESN))

  # Tier 1: Ultra-small noise (machine epsilon level, ~1e-14)
  tier1 <- sample(numeric_rows, min(200, length(numeric_rows)))
  lb_noisy$LBSTRESN[tier1] <- lb_noisy$LBSTRESN[tier1] +
    runif(length(tier1), -1e-13, 1e-13)

  # Tier 2: SAS→R rounding noise (~1e-10)
  tier2 <- sample(setdiff(numeric_rows, tier1), min(100, length(numeric_rows) - 200))
  lb_noisy$LBSTRESN[tier2] <- lb_noisy$LBSTRESN[tier2] +
    runif(length(tier2), -1e-9, 1e-9)

  # Tier 3: Small but real diffs (~0.01–0.05) — should always flag
  tier3 <- sample(setdiff(numeric_rows, c(tier1, tier2)),
                  min(10, length(numeric_rows) - 300))
  lb_noisy$LBSTRESN[tier3] <- lb_noisy$LBSTRESN[tier3] +
    runif(length(tier3), 0.01, 0.05)

  cat(sprintf("  Noise injected: %d tier-1 (~1e-14), %d tier-2 (~1e-10), %d tier-3 (>0.01)\n",
              length(tier1), length(tier2), length(tier3)))
}

# Without tolerance — catches everything
result_no_tol <- compare_datasets(lb_base, lb_noisy, tolerance = 0)
n_no_tol <- sum(result_no_tol$observation_comparison$discrepancies, na.rm = TRUE)
cat(sprintf("  Tolerance = 0:      %d differences (machine epsilon noise included)\n", n_no_tol))

# With tolerance = 1e-12 — filters out machine epsilon, keeps SAS rounding + real diffs
result_tol_12 <- compare_datasets(lb_base, lb_noisy, tolerance = 1e-12)
n_tol_12 <- sum(result_tol_12$observation_comparison$discrepancies, na.rm = TRUE)
cat(sprintf("  Tolerance = 1e-12:  %d differences (machine epsilon filtered)\n", n_tol_12))

# With tolerance = 1e-8 — filters out SAS rounding noise, keeps real diffs
result_tol_8 <- compare_datasets(lb_base, lb_noisy, tolerance = 1e-8)
n_tol_8 <- sum(result_tol_8$observation_comparison$discrepancies, na.rm = TRUE)
cat(sprintf("  Tolerance = 1e-8:   %d differences (SAS rounding noise filtered)\n", n_tol_8))

# With tolerance = 0.001 — only shows substantial diffs
result_tol_3 <- compare_datasets(lb_base, lb_noisy, tolerance = 0.001)
n_tol_3 <- sum(result_tol_3$observation_comparison$discrepancies, na.rm = TRUE)
cat(sprintf("  Tolerance = 0.001:  %d differences (only real changes)\n", n_tol_3))

cat(sprintf("\n  Impact: Tolerance=1e-8 eliminated %d false positives from SAS→R noise!\n",
            n_no_tol - n_tol_8))


# =============================================================================
# PHARMA TEST 4: ADLB comparison with key-based matching + tolerance
# =============================================================================
cat("\n\n--- PHARMA TEST 4: ADLB — key-based matching + tolerance -------------\n")

adlb_modified <- adlb_pharma
set.seed(456)
if ("AVAL" %in% names(adlb_modified)) {
  n_adlb <- nrow(adlb_modified)
  aval_rows <- which(!is.na(adlb_modified$AVAL))

  # Add both noise and real changes
  noise_rows <- sample(aval_rows, min(50, length(aval_rows)))
  adlb_modified$AVAL[noise_rows] <- adlb_modified$AVAL[noise_rows] +
    runif(length(noise_rows), -1e-10, 1e-10)

  real_rows <- sample(setdiff(aval_rows, noise_rows), min(15, length(aval_rows) - 50))
  adlb_modified$AVAL[real_rows] <- adlb_modified$AVAL[real_rows] *
    (1 + runif(length(real_rows), -0.03, 0.03))

  cat(sprintf("  Injected: %d rounding-noise rows + %d real-change rows\n",
              length(noise_rows), length(real_rows)))
}

# Determine which id_vars are available
potential_keys <- c("USUBJID", "PARAMCD", "AVISIT", "AVISITN", "ADT", "ADTM")
available_keys <- intersect(potential_keys, names(adlb_pharma))
cat(sprintf("  Available keys: %s\n", paste(available_keys, collapse = ", ")))

# Use USUBJID + PARAMCD + AVISIT for matching (standard ADaM keys)
id_cols <- intersect(c("USUBJID", "PARAMCD", "AVISIT"), available_keys)
if (length(id_cols) >= 2) {
  cat(sprintf("  Using id_vars: %s\n\n", paste(id_cols, collapse = ", ")))

  cat("  Without tolerance:\n")
  result_adlb_no_tol <- cdisc_compare(adlb_pharma, adlb_modified,
                                       domain = "ADLB", standard = "ADaM",
                                       id_vars = id_cols, tolerance = 0)
  n_no <- sum(result_adlb_no_tol$observation_comparison$discrepancies, na.rm = TRUE)
  cat(sprintf("    Differences: %d\n", n_no))

  cat("  With tolerance = 1e-8:\n")
  result_adlb_tol <- cdisc_compare(adlb_pharma, adlb_modified,
                                    domain = "ADLB", standard = "ADaM",
                                    id_vars = id_cols, tolerance = 1e-8)
  n_tol <- sum(result_adlb_tol$observation_comparison$discrepancies, na.rm = TRUE)
  cat(sprintf("    Differences: %d (eliminated %d false positives)\n",
              n_tol, n_no - n_tol))
  print(result_adlb_tol)
} else {
  cat("  Skipping key-based test (insufficient key columns)\n")
}


# =============================================================================
# PHARMA TEST 5: Full detail report on AE comparison
# =============================================================================
cat("\n\n--- PHARMA TEST 5: Full detail report (AE) -----------------------\n")

ae_modified <- ae_pharma
set.seed(789)
n_ae <- nrow(ae_modified)

# Modify multiple AE columns to exercise full detail report with multiple variables
if ("AESEV" %in% names(ae_modified)) {
  sev_rows <- sample(seq_len(n_ae), min(8, n_ae))
  ae_modified$AESEV[sev_rows] <- sample(c("MILD", "MODERATE", "SEVERE"),
                                         length(sev_rows), replace = TRUE)
}
if ("AEREL" %in% names(ae_modified)) {
  rel_rows <- sample(seq_len(n_ae), min(5, n_ae))
  ae_modified$AEREL[rel_rows] <- sample(c("RELATED", "NOT RELATED", "POSSIBLY RELATED"),
                                         length(rel_rows), replace = TRUE)
}
if ("AEOUT" %in% names(ae_modified)) {
  out_rows <- sample(seq_len(n_ae), min(3, n_ae))
  ae_modified$AEOUT[out_rows] <- "RECOVERED/RESOLVED"
}

ae_id_vars <- intersect(c("USUBJID", "AESEQ"), names(ae_pharma))
if (length(ae_id_vars) >= 1) {
  ae_result <- cdisc_compare(ae_pharma, ae_modified, domain = "AE",
                              standard = "SDTM", id_vars = ae_id_vars)

  # Show the full detail report
  cat("\n  --- generate_detailed_report() output: ---\n\n")
  generate_detailed_report(ae_result)
}


# =============================================================================
# PHARMA TEST 6: get_all_differences() — unified output
# =============================================================================
cat("\n\n--- PHARMA TEST 6: get_all_differences() on AE ---------------------\n")
if (exists("ae_result")) {
  all_diffs <- get_all_differences(ae_result)
  cat(sprintf("  Unified diff table: %d rows x %d cols\n", nrow(all_diffs), ncol(all_diffs)))
  cat(sprintf("  Columns: %s\n", paste(names(all_diffs), collapse = ", ")))
  cat(sprintf("  Variables with diffs: %s\n",
              paste(unique(all_diffs$Variable), collapse = ", ")))
  cat(sprintf("  Has 'Row' column: %s (should be FALSE for key-based)\n",
              "Row" %in% names(all_diffs)))
  if (nrow(all_diffs) > 0) {
    cat("\n  First 15 rows:\n")
    print(head(all_diffs, 15), row.names = FALSE)
  }
}


# =============================================================================
# PHARMA TEST 7: CDISC validation on real pharmaverse data
# =============================================================================
cat("\n\n--- PHARMA TEST 7: CDISC validation on pharmaverse SDTM DM ---------\n")
val_dm <- validate_cdisc(dm_pharma, domain = "DM", standard = "SDTM")
n_err <- sum(val_dm$severity == "ERROR")
n_warn <- sum(val_dm$severity == "WARNING")
n_info <- sum(val_dm$severity == "INFO")
cat(sprintf("  Errors: %d | Warnings: %d | Info: %d\n", n_err, n_warn, n_info))
if (n_err > 0) {
  cat("  ERRORS:\n")
  for (i in which(val_dm$severity == "ERROR")) {
    cat(sprintf("    %s: %s\n", val_dm$variable[i], val_dm$message[i]))
  }
}
if (n_warn > 0) {
  cat("  WARNINGS:\n")
  for (i in which(val_dm$severity == "WARNING")) {
    cat(sprintf("    %s: %s\n", val_dm$variable[i], val_dm$message[i]))
  }
}


# =============================================================================
# PHARMA TEST 8: Summary report on real-world comparison
# =============================================================================
cat("\n\n--- PHARMA TEST 8: Summary report (LB tolerance comparison) ---------\n")
if (exists("result_tol_8")) {
  generate_summary_report(result_tol_8)
}


# =============================================================================
# PHARMA TEST 9: VS comparison with explicit domain
# =============================================================================
cat("\n\n--- PHARMA TEST 9: VS vital signs comparison -------------------------\n")
vs_modified <- vs_pharma
set.seed(321)
if ("VSSTRESN" %in% names(vs_modified)) {
  vs_rows <- which(!is.na(vs_modified$VSSTRESN))
  edit_rows <- sample(vs_rows, min(20, length(vs_rows)))
  vs_modified$VSSTRESN[edit_rows] <- vs_modified$VSSTRESN[edit_rows] *
    (1 + runif(length(edit_rows), -0.02, 0.02))
}

vs_id_vars <- intersect(c("USUBJID", "VSSEQ"), names(vs_pharma))
if (length(vs_id_vars) >= 1) {
  vs_result <- cdisc_compare(vs_pharma, vs_modified, domain = "VS",
                              standard = "SDTM", id_vars = vs_id_vars)
  print(vs_result)
}


# =============================================================================
# SUMMARY
# =============================================================================
cat("\n\n==============================================================\n")
cat("  PHARMAVERSE INTEGRATION TESTS COMPLETE\n")
cat("==============================================================\n")
cat("  Data sources: pharmaversesdtm, pharmaverseadam\n")
cat("  Features tested:\n")
cat("    - Domain auto-detection on real CDISC data\n")
cat("    - Key-based matching (USUBJID, AESEQ, PARAMCD, etc.)\n")
cat("    - Tolerance at 4 levels (0, 1e-12, 1e-8, 0.001)\n")
cat("    - Floating-point noise elimination (SAS→R artifact)\n")
cat("    - Full detail report with multiple variables\n")
cat("    - get_all_differences() unified output (Row dropped for key-based)\n")
cat("    - CDISC validation on real-world SDTM DM\n")
cat("    - Summary report on tolerance comparison\n")
cat("    - VS vital signs comparison with numeric diffs\n")
cat("==============================================================\n\n")

Try the clinCompare package in your browser

Any scripts or data that you put into this service are public.

clinCompare documentation built on Feb. 19, 2026, 1:07 a.m.