In sbalci/ClinicoPathJamoviModule: Analysis for Clinicopathological Research

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 8,
  fig.height = 6
)

Introduction

This vignette covers advanced features of the Decision Panel Optimization module, including custom optimization functions, complex constraints, and programmatic access to results.

# Load required packages
library(meddecide)
library(dplyr)
library(ggplot2)
library(rpart)
library(rpart.plot)
library(knitr)
library(forcats)

# Test Datasets for Decision Panel Optimization Module

# Set seed for reproducibility
set.seed(42)

# Create data directory if it doesn't exist
if (!dir.exists("data")) {
  dir.create("data")
}

# ============================================================================
# DATASET 1: COVID-19 SCREENING
# ============================================================================

create_covid_data <- function(n = 1000, prevalence = 0.15) {
  # True disease status
  disease <- rbinom(n, 1, prevalence)

  # Rapid Antigen Test (RAT)
  # Sensitivity: 65%, Specificity: 98%
  rat_prob <- ifelse(disease == 1, 0.65, 0.02)
  rat_result <- rbinom(n, 1, rat_prob)

  # PCR Test
  # Sensitivity: 95%, Specificity: 99%
  pcr_prob <- ifelse(disease == 1, 0.95, 0.01)
  pcr_result <- rbinom(n, 1, pcr_prob)

  # Chest CT
  # Sensitivity: 90%, Specificity: 85%
  ct_prob <- ifelse(disease == 1, 0.90, 0.15)
  ct_result <- rbinom(n, 1, ct_prob)

  # Clinical symptoms score (0-10)
  # Higher in disease
  symptoms <- ifelse(disease == 1,
                     pmin(10, round(rnorm(sum(disease == 1), 7, 2))),
                     pmax(0, round(rnorm(sum(disease == 0), 3, 2))))

  # Create dataset
  covid_data <- data.frame(
    patient_id = 1:n,
    rapid_antigen = factor(rat_result, levels = c(0, 1), 
                          labels = c("Negative", "Positive")),
    pcr = factor(pcr_result, levels = c(0, 1), 
                labels = c("Negative", "Positive")),
    chest_ct = factor(ct_result, levels = c(0, 1), 
                     labels = c("Normal", "Abnormal")),
    symptom_score = symptoms,
    covid_status = factor(disease, levels = c(0, 1), 
                         labels = c("Negative", "Positive")),
    age = round(rnorm(n, 45, 15)),
    risk_group = factor(sample(c("Low", "Medium", "High"), n, 
                              replace = TRUE, prob = c(0.6, 0.3, 0.1)))
  )

  # Add some missing values realistically
  # PCR might be missing if rapid test is negative
  missing_pcr <- which(covid_data$rapid_antigen == "Negative" & 
                      runif(n) < 0.3)
  covid_data$pcr[missing_pcr] <- NA

  return(covid_data)
}

# Generate COVID dataset
covid_screening_data <- create_covid_data(n = 1000, prevalence = 0.15)

# ============================================================================
# DATASET 2: BREAST CANCER SCREENING
# ============================================================================

create_breast_cancer_data <- function(n = 2000, prevalence = 0.005) {
  # True disease status (low prevalence for screening)
  disease <- rbinom(n, 1, prevalence)

  # Clinical Breast Exam (CBE)
  # Sensitivity: 54%, Specificity: 94%
  cbe_prob <- ifelse(disease == 1, 0.54, 0.06)
  cbe_result <- rbinom(n, 1, cbe_prob)

  # Mammography
  # Sensitivity: 85%, Specificity: 95%
  mammo_prob <- ifelse(disease == 1, 0.85, 0.05)
  mammo_result <- rbinom(n, 1, mammo_prob)

  # Ultrasound
  # Sensitivity: 80%, Specificity: 90%
  us_prob <- ifelse(disease == 1, 0.80, 0.10)
  us_result <- rbinom(n, 1, us_prob)

  # MRI (for high-risk patients)
  # Sensitivity: 95%, Specificity: 85%
  mri_prob <- ifelse(disease == 1, 0.95, 0.15)
  mri_result <- rbinom(n, 1, mri_prob)

  # Create risk factors
  age <- round(rnorm(n, 55, 10))
  age[age < 40] <- 40
  age[age > 75] <- 75

  family_history <- rbinom(n, 1, 0.15)
  brca_status <- rbinom(n, 1, 0.02)

  # Create dataset
  breast_cancer_data <- data.frame(
    patient_id = 1:n,
    clinical_exam = factor(cbe_result, levels = c(0, 1), 
                          labels = c("Normal", "Abnormal")),
    mammography = factor(mammo_result, levels = c(0, 1), 
                        labels = c("BIRADS 1-2", "BIRADS 3-5")),
    ultrasound = factor(us_result, levels = c(0, 1), 
                       labels = c("Normal", "Suspicious")),
    mri = factor(mri_result, levels = c(0, 1), 
                labels = c("Normal", "Suspicious")),
    cancer_status = factor(disease, levels = c(0, 1), 
                          labels = c("No Cancer", "Cancer")),
    age = age,
    family_history = factor(family_history, levels = c(0, 1), 
                           labels = c("No", "Yes")),
    brca_mutation = factor(brca_status, levels = c(0, 1), 
                          labels = c("Negative", "Positive")),
    breast_density = factor(sample(c("A", "B", "C", "D"), n, 
                                  replace = TRUE, 
                                  prob = c(0.1, 0.4, 0.4, 0.1)))
  )

  # MRI typically only done for high-risk
  low_risk_idx <- which(breast_cancer_data$family_history == "No" & 
                       breast_cancer_data$brca_mutation == "Negative")
  breast_cancer_data$mri[sample(low_risk_idx, 
                               length(low_risk_idx) * 0.9)] <- NA

  return(breast_cancer_data)
}

# Generate breast cancer dataset
breast_cancer_data <- create_breast_cancer_data(n = 2000, prevalence = 0.005)

# ============================================================================
# DATASET 3: TUBERCULOSIS DIAGNOSIS
# ============================================================================

create_tb_data <- function(n = 1500, prevalence = 0.20) {
  # True disease status (high prevalence in TB clinic)
  disease <- rbinom(n, 1, prevalence)

  # Symptom screening (cough > 2 weeks, fever, weight loss, night sweats)
  # Sensitivity: 80%, Specificity: 60%
  symptom_prob <- ifelse(disease == 1, 0.80, 0.40)
  symptom_result <- rbinom(n, 1, symptom_prob)

  # Sputum smear microscopy
  # Sensitivity: 60%, Specificity: 98%
  smear_prob <- ifelse(disease == 1, 0.60, 0.02)
  smear_result <- rbinom(n, 1, smear_prob)

  # GeneXpert MTB/RIF
  # Sensitivity: 88%, Specificity: 98%
  genexpert_prob <- ifelse(disease == 1, 0.88, 0.02)
  genexpert_result <- rbinom(n, 1, genexpert_prob)

  # Culture (gold standard-ish)
  # Sensitivity: 95%, Specificity: 100%
  culture_prob <- ifelse(disease == 1, 0.95, 0.00)
  culture_result <- rbinom(n, 1, culture_prob)

  # Chest X-ray
  # Sensitivity: 85%, Specificity: 75%
  cxr_prob <- ifelse(disease == 1, 0.85, 0.25)
  cxr_result <- rbinom(n, 1, cxr_prob)

  # HIV status affects presentation
  hiv_status <- rbinom(n, 1, 0.25)

  # Create dataset
  tb_data <- data.frame(
    patient_id = 1:n,
    symptoms = factor(symptom_result, levels = c(0, 1), 
                     labels = c("No", "Yes")),
    sputum_smear = factor(smear_result, levels = c(0, 1), 
                         labels = c("Negative", "Positive")),
    genexpert = factor(genexpert_result, levels = c(0, 1), 
                      labels = c("MTB not detected", "MTB detected")),
    culture = factor(culture_result, levels = c(0, 1), 
                    labels = c("Negative", "Positive")),
    chest_xray = factor(cxr_result, levels = c(0, 1), 
                       labels = c("Normal", "Abnormal")),
    tb_status = factor(disease, levels = c(0, 1), 
                      labels = c("No TB", "TB")),
    hiv_status = factor(hiv_status, levels = c(0, 1), 
                       labels = c("Negative", "Positive")),
    age = round(rnorm(n, 35, 15)),
    contact_history = factor(rbinom(n, 1, 0.30), levels = c(0, 1), 
                           labels = c("No", "Yes"))
  )

  # Culture takes time, might not be done for all
  no_culture_idx <- which(tb_data$genexpert == "MTB not detected" & 
                         tb_data$symptoms == "No" & 
                         runif(n) < 0.4)
  tb_data$culture[no_culture_idx] <- NA

  return(tb_data)
}

# Generate TB dataset
tb_diagnosis_data <- create_tb_data(n = 1500, prevalence = 0.20)

# ============================================================================
# DATASET 4: MYOCARDIAL INFARCTION RULE-OUT (FIXED)
# ============================================================================

create_mi_data <- function(n = 800, prevalence = 0.10) {
  # True disease status
  disease <- rbinom(n, 1, prevalence)

  # ECG changes
  # Sensitivity: 55%, Specificity: 95%
  ecg_prob <- ifelse(disease == 1, 0.55, 0.05)
  ecg_result <- rbinom(n, 1, ecg_prob)

  # Initial troponin
  # Sensitivity: 85%, Specificity: 95%
  trop1_prob <- ifelse(disease == 1, 0.85, 0.05)
  trop1_result <- rbinom(n, 1, trop1_prob)

  # 3-hour troponin
  # Sensitivity: 98%, Specificity: 95%
  trop3_prob <- ifelse(disease == 1, 0.98, 0.05)
  trop3_result <- rbinom(n, 1, trop3_prob)

  # Make sure 3-hour is at least as positive as initial
  trop3_result <- pmax(trop1_result, trop3_result)

  # CT Angiography
  # Sensitivity: 95%, Specificity: 90%
  cta_prob <- ifelse(disease == 1, 0.95, 0.10)
  cta_result <- rbinom(n, 1, cta_prob)

  # Clinical risk score components
  age <- round(rnorm(n, 60, 15))
  age[age < 30] <- 30
  age[age > 90] <- 90

  # FIXED: Generate chest pain type for each individual based on their disease status
  chest_pain_type <- character(n)
  pain_types <- c("Typical", "Atypical", "Non-cardiac")

  for (i in 1:n) {
    if (disease[i] == 1) {
      # Disease present - more likely to have typical pain
      chest_pain_type[i] <- sample(pain_types, 1, prob = c(0.6, 0.3, 0.1))
    } else {
      # No disease - more likely to have non-cardiac pain
      chest_pain_type[i] <- sample(pain_types, 1, prob = c(0.1, 0.3, 0.6))
    }
  }

  # Create dataset
  mi_data <- data.frame(
    patient_id = 1:n,
    ecg = factor(ecg_result, levels = c(0, 1), 
                labels = c("Normal", "Ischemic changes")),
    troponin_initial = factor(trop1_result, levels = c(0, 1), 
                             labels = c("Normal", "Elevated")),
    troponin_3hr = factor(trop3_result, levels = c(0, 1), 
                         labels = c("Normal", "Elevated")),
    ct_angiography = factor(cta_result, levels = c(0, 1), 
                           labels = c("No stenosis", "Significant stenosis")),
    mi_status = factor(disease, levels = c(0, 1), 
                      labels = c("No MI", "MI")),
    age = age,
    chest_pain = factor(chest_pain_type),
    diabetes = factor(rbinom(n, 1, 0.25), levels = c(0, 1), 
                     labels = c("No", "Yes")),
    smoking = factor(rbinom(n, 1, 0.30), levels = c(0, 1), 
                    labels = c("No", "Yes")),
    prior_cad = factor(rbinom(n, 1, 0.20), levels = c(0, 1), 
                      labels = c("No", "Yes"))
  )

  # CTA typically only for intermediate risk
  low_risk_idx <- which(mi_data$chest_pain == "Non-cardiac" & 
                       mi_data$ecg == "Normal" & 
                       mi_data$troponin_initial == "Normal")
  if (length(low_risk_idx) > 0) {
    mi_data$ct_angiography[sample(low_risk_idx, 
                                 min(length(low_risk_idx), 
                                     floor(length(low_risk_idx) * 0.8)))] <- NA
  }

  return(mi_data)
}

# Generate MI dataset
mi_ruleout_data <- create_mi_data(n = 800, prevalence = 0.10)

# ============================================================================
# DATASET 5: THYROID NODULE EVALUATION
# ============================================================================

create_thyroid_data <- function(n = 600, prevalence = 0.05) {
  # True disease status (thyroid cancer)
  disease <- rbinom(n, 1, prevalence)

  # Ultrasound features (TI-RADS)
  # Sensitivity: 90%, Specificity: 70%
  us_prob <- ifelse(disease == 1, 0.90, 0.30)
  us_result <- rbinom(n, 1, us_prob)

  # Fine Needle Aspiration (FNA) cytology
  # Sensitivity: 95%, Specificity: 98%
  fna_prob <- ifelse(disease == 1, 0.95, 0.02)
  fna_result <- rbinom(n, 1, fna_prob)

  # Molecular testing (ThyroSeq/Afirma)
  # Sensitivity: 91%, Specificity: 85%
  molecular_prob <- ifelse(disease == 1, 0.91, 0.15)
  molecular_result <- rbinom(n, 1, molecular_prob)

  # Thyroglobulin levels
  # Sensitivity: 70%, Specificity: 80%
  tg_prob <- ifelse(disease == 1, 0.70, 0.20)
  tg_result <- rbinom(n, 1, tg_prob)

  # Create dataset
  thyroid_data <- data.frame(
    patient_id = 1:n,
    ultrasound = factor(us_result, levels = c(0, 1), 
                       labels = c("TI-RADS 1-3", "TI-RADS 4-5")),
    fna_cytology = factor(fna_result, levels = c(0, 1), 
                         labels = c("Benign/Indeterminate", "Suspicious/Malignant")),
    molecular_test = factor(molecular_result, levels = c(0, 1), 
                           labels = c("Benign", "Suspicious")),
    thyroglobulin = factor(tg_result, levels = c(0, 1), 
                          labels = c("Normal", "Elevated")),
    cancer_status = factor(disease, levels = c(0, 1), 
                          labels = c("Benign", "Malignant")),
    nodule_size = round(rlnorm(n, log(15), 0.5)),
    age = round(rnorm(n, 50, 15)),
    gender = factor(sample(c("Female", "Male"), n, 
                          replace = TRUE, prob = c(0.75, 0.25))),
    radiation_history = factor(rbinom(n, 1, 0.05), levels = c(0, 1), 
                             labels = c("No", "Yes"))
  )

  # Molecular testing only for indeterminate FNA
  molecular_not_done <- which(thyroid_data$fna_cytology != "Benign/Indeterminate" |
                             runif(n) < 0.5)
  thyroid_data$molecular_test[molecular_not_done] <- NA

  return(thyroid_data)
}

# Generate thyroid dataset
thyroid_nodule_data <- create_thyroid_data(n = 600, prevalence = 0.05)

# ============================================================================
# SAVE ALL DATASETS
# ============================================================================

# Save as RData file
save(covid_screening_data, 
     breast_cancer_data,
     tb_diagnosis_data,
     mi_ruleout_data,
     thyroid_nodule_data,
     file = "data/decision_panel_test_data.RData")

# Also save as CSV files for external use
write.csv(covid_screening_data, "data/covid_screening_data.csv", row.names = FALSE)
write.csv(breast_cancer_data, "data/breast_cancer_data.csv", row.names = FALSE)
write.csv(tb_diagnosis_data, "data/tb_diagnosis_data.csv", row.names = FALSE)
write.csv(mi_ruleout_data, "data/mi_ruleout_data.csv", row.names = FALSE)
write.csv(thyroid_nodule_data, "data/thyroid_nodule_data.csv", row.names = FALSE)

# Print confirmation
cat("✓ All datasets generated and saved successfully!\n")

# ============================================================================
# DATASET SUMMARIES FOR VERIFICATION
# ============================================================================

summarize_dataset <- function(data, disease_col, test_cols) {
  cat("\nDataset Summary:\n")
  cat("Total observations:", nrow(data), "\n")

  # Check if disease column exists and has the expected levels
  if (disease_col %in% names(data)) {
    disease_levels <- levels(factor(data[[disease_col]]))
    if (length(disease_levels) >= 2) {
      prevalence <- mean(data[[disease_col]] == disease_levels[2], na.rm = TRUE)
      cat("Disease prevalence:", round(prevalence * 100, 1), "%\n")
    } else {
      cat("Disease column found but insufficient levels\n")
    }
  } else {
    cat("Disease column not found\n")
  }

  cat("\nTest performance:\n")

  for (test in test_cols) {
    if (test %in% names(data) && disease_col %in% names(data)) {
      # Remove rows with NAs in either variable
      complete_data <- data[!is.na(data[[test]]) & !is.na(data[[disease_col]]), ]

      if (nrow(complete_data) > 0) {
        tab <- table(complete_data[[test]], complete_data[[disease_col]])

        if (nrow(tab) >= 2 && ncol(tab) >= 2) {
          # Assume positive is the second level (index 2)
          if (nrow(tab) == 2 && ncol(tab) == 2) {
            sens <- tab[2,2] / sum(tab[,2])
            spec <- tab[1,1] / sum(tab[,1])
            cat(sprintf("  %s: Sens=%.1f%%, Spec=%.1f%% (n=%d)\n", 
                        test, sens*100, spec*100, nrow(complete_data)))
          } else {
            cat(sprintf("  %s: Complex table structure (n=%d)\n", 
                        test, nrow(complete_data)))
          }
        } else {
          cat(sprintf("  %s: Insufficient data for analysis (n=%d)\n", 
                      test, nrow(complete_data)))
        }
      } else {
        cat(sprintf("  %s: No complete cases\n", test))
      }
    } else {
      cat(sprintf("  %s: Column not found\n", test))
    }
  }
}

# Print summaries for verification
cat("\n", paste(rep("=", 60), collapse = ""), "\n")
cat("DATASET SUMMARIES")
cat("\n", paste(rep("=", 60), collapse = ""), "\n")

cat("\n=== COVID-19 SCREENING DATA ===")
summarize_dataset(covid_screening_data, "covid_status", 
                 c("rapid_antigen", "pcr", "chest_ct"))

cat("\n=== BREAST CANCER SCREENING DATA ===")
summarize_dataset(breast_cancer_data, "cancer_status",
                 c("clinical_exam", "mammography", "ultrasound", "mri"))

cat("\n=== TUBERCULOSIS DIAGNOSIS DATA ===")
summarize_dataset(tb_diagnosis_data, "tb_status",
                 c("symptoms", "sputum_smear", "genexpert", "culture", "chest_xray"))

cat("\n=== MYOCARDIAL INFARCTION DATA ===")
summarize_dataset(mi_ruleout_data, "mi_status",
                 c("ecg", "troponin_initial", "troponin_3hr", "ct_angiography"))

cat("\n=== THYROID NODULE DATA ===")
summarize_dataset(thyroid_nodule_data, "cancer_status",
                 c("ultrasound", "fna_cytology", "molecular_test", "thyroglobulin"))

cat("\n", paste(rep("=", 60), collapse = ""), "\n")

# Set seed for reproducibility
set.seed(123)

Custom Optimization Functions

Defining Custom Utility Functions

The module allows custom utility functions that incorporate domain-specific knowledge:

# Define a custom utility function for COVID screening
# Prioritizes not missing cases while considering hospital capacity
covid_utility <- function(TP, FP, TN, FN, test_cost, hospital_capacity = 100) {
  # Base utilities
  u_TP <- 100    # Correctly identified case
  u_TN <- 10     # Correctly ruled out
  u_FP <- -20    # Unnecessary isolation
  u_FN <- -1000  # Missed case (high penalty)

  # Capacity penalty - increases FP cost when near capacity
  current_positives <- TP + FP
  capacity_factor <- ifelse(current_positives > hospital_capacity * 0.8,
                           (current_positives / hospital_capacity)^2,
                           1)
  u_FP_adjusted <- u_FP * capacity_factor

  # Calculate total utility
  total_utility <- (TP * u_TP + TN * u_TN + 
                   FP * u_FP_adjusted + FN * u_FN - test_cost)

  return(total_utility)
}

# Example calculation
n_total <- 1000
prevalence <- 0.15
test_cost <- 55  # Combined test cost

# Scenario 1: Low capacity
utility_low_capacity <- covid_utility(
  TP = 147,  # 98% sensitivity
  FP = 26,   # 97% specificity  
  TN = 824,
  FN = 3,
  test_cost = test_cost,
  hospital_capacity = 50
)

# Scenario 2: High capacity
utility_high_capacity <- covid_utility(
  TP = 147,
  FP = 26,
  TN = 824,
  FN = 3,
  test_cost = test_cost,
  hospital_capacity = 200
)

cat("Utility with low capacity:", utility_low_capacity, "\n")
cat("Utility with high capacity:", utility_high_capacity, "\n")

Implementing Multi-Objective Optimization

When multiple objectives conflict, use Pareto optimization:

# Generate test combinations and their performance
generate_pareto_data <- function(data, tests, gold, gold_positive) {
  # Get all possible test combinations
  all_combinations <- list()

  for (i in 1:length(tests)) {
    combos <- combn(tests, i, simplify = FALSE)
    all_combinations <- c(all_combinations, combos)
  }

  # Calculate metrics for each combination
  results <- data.frame()

  for (combo in all_combinations) {
    # Simulate parallel ANY rule
    test_positive <- rowSums(data[combo] == "Positive" | 
                           data[combo] == "Abnormal" | 
                           data[combo] == "MTB detected",
                           na.rm = TRUE) > 0

    truth <- data[[gold]] == gold_positive

    # Calculate metrics
    TP <- sum(test_positive & truth)
    FP <- sum(test_positive & !truth)
    TN <- sum(!test_positive & !truth)
    FN <- sum(!test_positive & truth)

    sensitivity <- TP / (TP + FN)
    specificity <- TN / (TN + FP)

    # Simulated costs
    test_costs <- c(rapid_antigen = 5, pcr = 50, chest_ct = 200)
    total_cost <- sum(test_costs[combo])

    results <- rbind(results, data.frame(
      tests = paste(combo, collapse = "+"),
      n_tests = length(combo),
      sensitivity = sensitivity,
      specificity = specificity,
      cost = total_cost
    ))
  }

  return(results)
}

# Generate Pareto frontier for COVID tests
pareto_data <- generate_pareto_data(
  covid_screening_data[1:500,],  # Use subset for speed
  tests = c("rapid_antigen", "pcr", "chest_ct"),
  gold = "covid_status",
  gold_positive = "Positive"
)

# Identify Pareto optimal solutions
is_pareto_optimal <- function(data, objectives) {
  n <- nrow(data)
  pareto <- rep(TRUE, n)

  for (i in 1:n) {
    for (j in 1:n) {
      if (i != j) {
        # Check if j dominates i
        dominates <- all(data[j, objectives] >= data[i, objectives]) &&
                    any(data[j, objectives] > data[i, objectives])
        if (dominates) {
          pareto[i] <- FALSE
          break
        }
      }
    }
  }

  return(pareto)
}

# For sensitivity and cost (cost should be minimized, so use negative)
pareto_data$neg_cost <- -pareto_data$cost
pareto_data$pareto_optimal <- is_pareto_optimal(
  pareto_data, 
  c("sensitivity", "neg_cost")
)

# Visualize Pareto frontier
ggplot(pareto_data, aes(x = cost, y = sensitivity * 100)) +
  geom_point(aes(color = pareto_optimal, size = n_tests), alpha = 0.7) +
  geom_line(data = pareto_data[pareto_data$pareto_optimal,] %>% arrange(cost),
            color = "red", size = 1) +
  geom_text(data = pareto_data[pareto_data$pareto_optimal,],
            aes(label = tests), vjust = -1, size = 3) +
  scale_color_manual(values = c("gray", "red")) +
  labs(
    title = "Pareto Frontier for Multi-Objective Optimization",
    x = "Total Cost ($)",
    y = "Sensitivity (%)",
    caption = "Red points and line show Pareto optimal solutions"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

Advanced Decision Trees

Cost-Sensitive Decision Trees

Implement decision trees that consider both accuracy and cost:

# Prepare data for decision tree
tree_data <- covid_screening_data %>%
  select(rapid_antigen, pcr, chest_ct, symptom_score, 
         age, risk_group, covid_status) %>%
  na.omit()

# Create cost matrix
# Rows: predicted, Columns: actual
# Cost of false negative is 10x cost of false positive
cost_matrix <- matrix(c(0, 1,     # Predict Negative
                       10, 0),    # Predict Positive
                     nrow = 2, byrow = TRUE)

# Build cost-sensitive tree
cost_tree <- rpart(
  covid_status ~ rapid_antigen + pcr + chest_ct + 
                 symptom_score + age + risk_group,
  data = tree_data,
  method = "class",
  parms = list(loss = cost_matrix),
  control = rpart.control(cp = 0.01, maxdepth = 4)
)

# Visualize tree
rpart.plot(cost_tree, 
           main = "Cost-Sensitive Decision Tree for COVID-19",
           extra = 104,  # Show probability and number
           under = TRUE,
           faclen = 0,
           cex = 0.8)

# Compare with standard tree
standard_tree <- rpart(
  covid_status ~ rapid_antigen + pcr + chest_ct + 
                 symptom_score + age + risk_group,
  data = tree_data,
  method = "class",
  control = rpart.control(cp = 0.01, maxdepth = 4)
)

# Performance comparison
tree_comparison <- data.frame(
  Model = c("Standard", "Cost-Sensitive"),
  Accuracy = c(
    sum(predict(standard_tree, type = "class") == tree_data$covid_status) / nrow(tree_data),
    sum(predict(cost_tree, type = "class") == tree_data$covid_status) / nrow(tree_data)
  ),
  Sensitivity = c(
    {
      pred <- predict(standard_tree, type = "class")
      sum(pred == "Positive" & tree_data$covid_status == "Positive") / 
        sum(tree_data$covid_status == "Positive")
    },
    {
      pred <- predict(cost_tree, type = "class")
      sum(pred == "Positive" & tree_data$covid_status == "Positive") / 
        sum(tree_data$covid_status == "Positive")
    }
  )
)

kable(tree_comparison, digits = 3,
      caption = "Performance Comparison: Standard vs Cost-Sensitive Trees")

Ensemble Decision Trees

Combine multiple trees for more robust decisions:

# Create bootstrap samples and build multiple trees
n_trees <- 10
trees <- list()
tree_weights <- numeric(n_trees)

for (i in 1:n_trees) {
  # Bootstrap sample
  boot_indices <- sample(nrow(tree_data), replace = TRUE)
  boot_data <- tree_data[boot_indices,]

  # Build tree with random feature subset
  features <- c("rapid_antigen", "pcr", "chest_ct", 
                "symptom_score", "age", "risk_group")
  selected_features <- sample(features, size = 4)

  formula <- as.formula(paste("covid_status ~", 
                             paste(selected_features, collapse = " + ")))

  trees[[i]] <- rpart(
    formula,
    data = boot_data,
    method = "class",
    control = rpart.control(cp = 0.02, maxdepth = 3)
  )

  # Calculate out-of-bag performance for weighting
  oob_indices <- setdiff(1:nrow(tree_data), unique(boot_indices))
  if (length(oob_indices) > 0) {
    oob_pred <- predict(trees[[i]], tree_data[oob_indices,], type = "class")
    tree_weights[i] <- sum(oob_pred == tree_data$covid_status[oob_indices]) / 
                      length(oob_indices)
  } else {
    tree_weights[i] <- 0.5
  }
}

# Normalize weights
tree_weights <- tree_weights / sum(tree_weights)

# Ensemble prediction function
ensemble_predict <- function(trees, weights, newdata) {
  # Get probability predictions from each tree
  prob_matrix <- matrix(0, nrow = nrow(newdata), ncol = 2)

  for (i in 1:length(trees)) {
    probs <- predict(trees[[i]], newdata, type = "prob")
    prob_matrix <- prob_matrix + probs * weights[i]
  }

  # Return class with highest probability
  classes <- levels(tree_data$covid_status)
  predicted_class <- classes[apply(prob_matrix, 1, which.max)]

  return(list(class = predicted_class, prob = prob_matrix))
}

# Test ensemble
ensemble_pred <- ensemble_predict(trees, tree_weights, tree_data)

# Compare performance
ensemble_comparison <- data.frame(
  Model = c("Single Tree", "Ensemble"),
  Accuracy = c(
    sum(predict(trees[[1]], type = "class") == tree_data$covid_status) / nrow(tree_data),
    sum(ensemble_pred$class == tree_data$covid_status) / nrow(tree_data)
  )
)

kable(ensemble_comparison, digits = 3,
      caption = "Single Tree vs Ensemble Performance")

Complex Constraints and Business Rules

Implementing Complex Constraints

Real-world scenarios often have complex constraints:

# Function to check if a test combination meets constraints
meets_constraints <- function(tests, constraints) {
  # Example constraints for TB testing

  # 1. If GeneXpert is used, must have sputum collection capability
  if ("genexpert" %in% tests && !("sputum_smear" %in% tests || 
                                   constraints$has_sputum_collection)) {
    return(FALSE)
  }

  # 2. Culture requires biosafety level 3 lab
  if ("culture" %in% tests && !constraints$has_bsl3_lab) {
    return(FALSE)
  }

  # 3. Maximum turnaround time constraint
  test_times <- c(symptoms = 0, sputum_smear = 0.5, genexpert = 0.1, 
                  culture = 21, chest_xray = 0.5)
  max_time <- max(test_times[tests])
  if (max_time > constraints$max_turnaround_days) {
    return(FALSE)
  }

  # 4. Budget constraint
  test_costs <- c(symptoms = 1, sputum_smear = 3, genexpert = 20, 
                  culture = 30, chest_xray = 10)
  total_cost <- sum(test_costs[tests])
  if (total_cost > constraints$budget_per_patient) {
    return(FALSE)
  }

  return(TRUE)
}

# Define facility-specific constraints
facility_constraints <- list(
  rural_clinic = list(
    has_sputum_collection = TRUE,
    has_bsl3_lab = FALSE,
    max_turnaround_days = 1,
    budget_per_patient = 15
  ),
  district_hospital = list(
    has_sputum_collection = TRUE,
    has_bsl3_lab = FALSE,
    max_turnaround_days = 7,
    budget_per_patient = 50
  ),
  reference_lab = list(
    has_sputum_collection = TRUE,
    has_bsl3_lab = TRUE,
    max_turnaround_days = 30,
    budget_per_patient = 100
  )
)

# Find valid combinations for each facility type
tb_tests <- c("symptoms", "sputum_smear", "genexpert", "culture", "chest_xray")

for (facility in names(facility_constraints)) {
  valid_combos <- list()

  # Check all combinations
  for (i in 1:length(tb_tests)) {
    combos <- combn(tb_tests, i, simplify = FALSE)
    for (combo in combos) {
      if (meets_constraints(combo, facility_constraints[[facility]])) {
        valid_combos <- c(valid_combos, list(combo))
      }
    }
  }

  cat("\n", facility, ": ", length(valid_combos), " valid combinations\n", sep = "")
  cat("Examples: \n")
  for (j in 1:min(3, length(valid_combos))) {
    cat("  -", paste(valid_combos[[j]], collapse = " + "), "\n")
  }
}

Time-Dependent Testing Strategies

Implement strategies that change based on time constraints:

# Time-dependent chest pain protocol
time_dependent_protocol <- function(patient_data, time_available_hours) {

  decisions <- data.frame(
    patient_id = patient_data$patient_id,
    protocol = character(nrow(patient_data)),
    tests_used = character(nrow(patient_data)),
    decision_time = numeric(nrow(patient_data)),
    stringsAsFactors = FALSE
  )

  for (i in 1:nrow(patient_data)) {
    patient <- patient_data[i,]

    if (time_available_hours >= 3) {
      # Full protocol available
      if (patient$troponin_initial == "Normal" && 
          patient$troponin_3hr == "Normal" &&
          patient$ecg == "Normal") {
        decisions$protocol[i] <- "Rule out"
        decisions$tests_used[i] <- "ECG + Serial troponins"
        decisions$decision_time[i] <- 3
      } else if (patient$troponin_3hr == "Elevated") {
        decisions$protocol[i] <- "Rule in"
        decisions$tests_used[i] <- "ECG + Serial troponins"
        decisions$decision_time[i] <- 3
      } else {
        decisions$protocol[i] <- "Further testing"
        decisions$tests_used[i] <- "ECG + Serial troponins + CTA"
        decisions$decision_time[i] <- 3.5
      }

    } else if (time_available_hours >= 1) {
      # Rapid protocol
      if (patient$troponin_initial == "Normal" && 
          patient$ecg == "Normal" &&
          patient$age < 40) {
        decisions$protocol[i] <- "Low risk discharge"
        decisions$tests_used[i] <- "ECG + Initial troponin"
        decisions$decision_time[i] <- 1
      } else {
        decisions$protocol[i] <- "Requires admission"
        decisions$tests_used[i] <- "ECG + Initial troponin"
        decisions$decision_time[i] <- 1
      }

    } else {
      # Ultra-rapid
      if (patient$ecg == "Ischemic changes") {
        decisions$protocol[i] <- "Immediate cath lab"
        decisions$tests_used[i] <- "ECG only"
        decisions$decision_time[i] <- 0.2
      } else {
        decisions$protocol[i] <- "Clinical decision"
        decisions$tests_used[i] <- "ECG only"
        decisions$decision_time[i] <- 0.2
      }
    }
  }

  return(decisions)
}

# Apply to sample patients
sample_mi <- mi_ruleout_data[1:20,]

# Different time scenarios
time_scenarios <- c(0.5, 1, 3, 6)

for (time_limit in time_scenarios) {
  results <- time_dependent_protocol(sample_mi, time_limit)

  cat("\nTime available:", time_limit, "hours\n")
  cat("Protocols used:\n")
  print(table(results$protocol))
  cat("Average decision time:", mean(results$decision_time), "hours\n")
}

Performance Optimization

Efficient Computation for Large Datasets

# Performance Optimization and Benchmarking
# This section demonstrates different approaches to optimize performance calculations

# Function to safely calculate performance metrics with NA handling
safe_performance_metrics <- function(predictions, actual, positive_class = "Positive") {
  # Handle missing values
  complete_cases <- !is.na(predictions) & !is.na(actual)

  if (sum(complete_cases) == 0) {
    return(list(
      accuracy = NA,
      sensitivity = NA,
      specificity = NA,
      ppv = NA,
      npv = NA,
      n_complete = 0
    ))
  }

  pred_clean <- predictions[complete_cases]
  actual_clean <- actual[complete_cases]

  # Convert to binary if needed
  pred_binary <- as.character(pred_clean) == positive_class
  actual_binary <- as.character(actual_clean) == positive_class

  # Calculate confusion matrix components
  tp <- sum(pred_binary & actual_binary, na.rm = TRUE)
  tn <- sum(!pred_binary & !actual_binary, na.rm = TRUE)
  fp <- sum(pred_binary & !actual_binary, na.rm = TRUE)
  fn <- sum(!pred_binary & actual_binary, na.rm = TRUE)

  # Calculate metrics with division by zero protection
  total <- tp + tn + fp + fn
  accuracy <- if (total > 0) (tp + tn) / total else NA

  sensitivity <- if ((tp + fn) > 0) tp / (tp + fn) else NA
  specificity <- if ((tn + fp) > 0) tn / (tn + fp) else NA
  ppv <- if ((tp + fp) > 0) tp / (tp + fp) else NA
  npv <- if ((tn + fn) > 0) tn / (tn + fn) else NA

  return(list(
    accuracy = accuracy,
    sensitivity = sensitivity,
    specificity = specificity,
    ppv = ppv,
    npv = npv,
    n_complete = sum(complete_cases)
  ))
}

# Optimized confusion matrix calculation
fast_confusion_matrix <- function(predictions, actual, positive_class = "Positive") {
  # Handle NAs upfront
  complete_cases <- !is.na(predictions) & !is.na(actual)

  if (sum(complete_cases) < 2) {
    return(matrix(c(0, 0, 0, 0), nrow = 2, 
                  dimnames = list(
                    Predicted = c("Negative", "Positive"),
                    Actual = c("Negative", "Positive")
                  )))
  }

  pred_clean <- predictions[complete_cases]
  actual_clean <- actual[complete_cases]

  # Use table for fast cross-tabulation
  conf_table <- table(
    Predicted = factor(pred_clean, levels = c(setdiff(unique(c(pred_clean, actual_clean)), positive_class), positive_class)),
    Actual = factor(actual_clean, levels = c(setdiff(unique(c(pred_clean, actual_clean)), positive_class), positive_class))
  )

  return(conf_table)
}

# Vectorized performance calculation
vectorized_metrics <- function(pred_vector, actual_vector, positive_class = "Positive") {
  # Remove NAs
  complete_idx <- !is.na(pred_vector) & !is.na(actual_vector)

  if (sum(complete_idx) == 0) {
    return(data.frame(
      method = "vectorized",
      accuracy = NA,
      sensitivity = NA,
      specificity = NA,
      n_obs = 0
    ))
  }

  pred <- pred_vector[complete_idx]
  actual <- actual_vector[complete_idx]

  # Vectorized operations
  pred_pos <- pred == positive_class
  actual_pos <- actual == positive_class

  tp <- sum(pred_pos & actual_pos)
  tn <- sum(!pred_pos & !actual_pos)
  fp <- sum(pred_pos & !actual_pos)
  fn <- sum(!pred_pos & actual_pos)

  n_total <- length(pred)
  n_pos <- sum(actual_pos)
  n_neg <- sum(!actual_pos)

  data.frame(
    method = "vectorized",
    accuracy = (tp + tn) / n_total,
    sensitivity = if (n_pos > 0) tp / n_pos else NA,
    specificity = if (n_neg > 0) tn / n_neg else NA,
    n_obs = n_total
  )
}

# Create test data for benchmarking (ensure no NAs in critical columns)
set.seed(123)
n_test <- 1000

# Create predictions with some realistic accuracy
actual_test <- factor(sample(c("Negative", "Positive"), n_test, 
                           replace = TRUE, prob = c(0.8, 0.2)))

# Create predictions that correlate with actual (realistic scenario)
pred_prob <- ifelse(actual_test == "Positive", 0.85, 0.15)
pred_test <- factor(ifelse(runif(n_test) < pred_prob, "Positive", "Negative"))

# Introduce some missing values (but not in the benchmarked functions)
missing_idx <- sample(n_test, size = floor(n_test * 0.05))
actual_test_with_na <- actual_test
pred_test_with_na <- pred_test
actual_test_with_na[missing_idx[1:length(missing_idx)/2]] <- NA
pred_test_with_na[missing_idx[(length(missing_idx)/2 + 1):length(missing_idx)]] <- NA

cat("Test data created:\n")
cat("Total observations:", n_test, "\n")
cat("Missing values in actual:", sum(is.na(actual_test_with_na)), "\n")
cat("Missing values in predictions:", sum(is.na(pred_test_with_na)), "\n")
cat("Complete cases:", sum(!is.na(actual_test_with_na) & !is.na(pred_test_with_na)), "\n")

# Test the functions with clean data first
cat("\nTesting functions with clean data:\n")
clean_metrics <- safe_performance_metrics(pred_test, actual_test)
print(clean_metrics)

clean_confusion <- fast_confusion_matrix(pred_test, actual_test)
print(clean_confusion)

# Test with data containing NAs
cat("\nTesting functions with NA values:\n")
na_metrics <- safe_performance_metrics(pred_test_with_na, actual_test_with_na)
print(na_metrics)

# Benchmark different approaches (using clean data to avoid NA issues in timing)
cat("\nPerformance benchmarking:\n")

# Only benchmark if microbenchmark is available
if (requireNamespace("microbenchmark", quietly = TRUE)) {
  tryCatch({
    benchmark_results <- microbenchmark::microbenchmark(
      "safe_metrics" = safe_performance_metrics(pred_test, actual_test),
      "fast_confusion" = fast_confusion_matrix(pred_test, actual_test),
      "vectorized" = vectorized_metrics(pred_test, actual_test),
      times = 10
    )

    print(benchmark_results)

    # Plot benchmark results if possible
    if (requireNamespace("ggplot2", quietly = TRUE)) {
      plot(benchmark_results)
    }

  }, error = function(e) {
    cat("Benchmark error (using fallback timing):", e$message, "\n")

    # Fallback timing method
    cat("Using system.time for performance measurement:\n")

    cat("Safe metrics timing:\n")
    print(system.time(for(i in 1:10) safe_performance_metrics(pred_test, actual_test)))

    cat("Fast confusion matrix timing:\n")
    print(system.time(for(i in 1:10) fast_confusion_matrix(pred_test, actual_test)))

    cat("Vectorized metrics timing:\n")
    print(system.time(for(i in 1:10) vectorized_metrics(pred_test, actual_test)))
  })
} else {
  cat("microbenchmark package not available, using system.time:\n")

  cat("Safe metrics timing:\n")
  print(system.time(replicate(10, safe_performance_metrics(pred_test, actual_test))))

  cat("Fast confusion matrix timing:\n")
  print(system.time(replicate(10, fast_confusion_matrix(pred_test, actual_test))))

  cat("Vectorized metrics timing:\n")
  print(system.time(replicate(10, vectorized_metrics(pred_test, actual_test))))
}

# Performance comparison table
performance_comparison <- data.frame(
  Method = c("Safe Metrics", "Fast Confusion Matrix", "Vectorized Metrics"),
  `Handles NAs` = c("Yes", "Yes", "Yes"),
  `Memory Efficient` = c("Medium", "High", "High"),
  `Speed` = c("Medium", "Fast", "Fastest"),
  `Use Case` = c("General purpose", "Detailed analysis", "Large datasets"),
  stringsAsFactors = FALSE
)

knitr::kable(performance_comparison, 
             caption = "Performance Optimization Comparison",
             align = 'c')

Caching and Memoization

# Create memoized function for expensive calculations
library(memoise)

# Original expensive function
calculate_test_performance <- function(test_data, gold_standard) {
  # Simulate expensive calculation
  Sys.sleep(0.1)  # Pretend this takes time

  conf_matrix <- table(test_data, gold_standard)
  sensitivity <- conf_matrix[2,2] / sum(conf_matrix[,2])
  specificity <- conf_matrix[1,1] / sum(conf_matrix[,1])

  return(list(sensitivity = sensitivity, specificity = specificity))
}

# Memoized version
calculate_test_performance_memo <- memoise(calculate_test_performance)

# Demonstration
test_vector <- as.numeric(covid_screening_data$rapid_antigen == "Positive")
gold_vector <- as.numeric(covid_screening_data$covid_status == "Positive")

# First call - slow
system.time({
  result1 <- calculate_test_performance_memo(test_vector[1:100], gold_vector[1:100])
})

# Second call with same data - fast (cached)
system.time({
  result2 <- calculate_test_performance_memo(test_vector[1:100], gold_vector[1:100])
})

cat("Results match:", identical(result1, result2), "\n")

Integration with External Systems

Exporting Results for Clinical Decision Support Systems

# Export Clinical Decision Support System Rules

# Safe function to export tree rules with proper error handling
export_tree_as_rules <- function(tree_model, data) {
  # Check if tree model exists and is valid
  if (is.null(tree_model) || !inherits(tree_model, "rpart")) {
    cat("Error: Invalid or missing tree model\n")
    return(NULL)
  }

  # Check if tree has any splits
  if (nrow(tree_model$frame) <= 1) {
    cat("Warning: Tree has no splits (single node)\n")
    return(data.frame(
      rule_id = 1,
      condition = "Always true",
      prediction = "Default",
      confidence = 1.0,
      n_cases = nrow(data)
    ))
  }

  tryCatch({
    # Get tree frame information
    frame <- tree_model$frame

    # Check if required columns exist
    required_cols <- c("var", "yval")
    if (!all(required_cols %in% names(frame))) {
      stop("Tree frame missing required columns")
    }

    # Extract node information safely
    node_info <- frame

    # Handle yval2 safely
    if ("yval2" %in% names(node_info) && !is.null(node_info$yval2)) {
      # Check dimensions before using rowSums
      yval2_data <- node_info$yval2

      if (is.matrix(yval2_data) && ncol(yval2_data) >= 2) {
        # Safe to use rowSums
        node_counts <- rowSums(yval2_data[, 1:min(2, ncol(yval2_data)), drop = FALSE])
      } else if (is.data.frame(yval2_data) && ncol(yval2_data) >= 2) {
        # Convert to matrix first
        yval2_matrix <- as.matrix(yval2_data[, 1:min(2, ncol(yval2_data))])
        node_counts <- rowSums(yval2_matrix)
      } else {
        # Fallback: use node$n if available
        node_counts <- if ("n" %in% names(node_info)) node_info$n else rep(1, nrow(node_info))
      }
    } else {
      # Fallback: use node$n or estimate
      node_counts <- if ("n" %in% names(node_info)) node_info$n else rep(nrow(data), nrow(node_info))
    }

    # Generate rules for leaf nodes
    leaf_nodes <- which(node_info$var == "<leaf>")

    if (length(leaf_nodes) == 0) {
      cat("Warning: No leaf nodes found\n")
      return(NULL)
    }

    rules_list <- list()

    for (i in seq_along(leaf_nodes)) {
      node_idx <- leaf_nodes[i]

      # Get the path to this leaf node
      node_path <- path.rpart(tree_model, nodes = as.numeric(rownames(node_info)[node_idx]))

      # Extract condition text
      if (length(node_path) > 0 && !is.null(node_path[[1]])) {
        condition_parts <- node_path[[1]]
        # Remove the root node (usually just "root")
        condition_parts <- condition_parts[condition_parts != "root"]

        if (length(condition_parts) > 0) {
          condition <- paste(condition_parts, collapse = " AND ")
        } else {
          condition <- "Always true (root node)"
        }
      } else {
        condition <- paste("Node", node_idx)
      }

      # Get prediction
      prediction <- as.character(node_info$yval[node_idx])

      # Calculate confidence (proportion of cases)
      n_cases <- node_counts[node_idx]
      confidence <- n_cases / sum(node_counts, na.rm = TRUE)

      rules_list[[i]] <- data.frame(
        rule_id = i,
        condition = condition,
        prediction = prediction,
        confidence = round(confidence, 3),
        n_cases = n_cases,
        stringsAsFactors = FALSE
      )
    }

    # Combine all rules
    if (length(rules_list) > 0) {
      rules_df <- do.call(rbind, rules_list)
      return(rules_df)
    } else {
      return(NULL)
    }

  }, error = function(e) {
    cat("Error in export_tree_as_rules:", e$message, "\n")
    cat("Tree structure:\n")
    if (exists("frame")) {
      print(str(frame))
    } else {
      print(str(tree_model))
    }
    return(NULL)
  })
}

# Alternative simple rule extraction function
simple_tree_rules <- function(tree_model, data) {
  if (is.null(tree_model) || !inherits(tree_model, "rpart")) {
    return("No valid tree model available")
  }

  # Use rpart's built-in text representation
  rules_text <- capture.output(print(tree_model))

  return(paste(rules_text, collapse = "\n"))
}

# Generate exportable rules
cat("Generating Clinical Decision Support Rules...\n")

# Check if we have a valid tree from previous chunks
if (exists("cost_tree") && !is.null(cost_tree)) {
  cat("Exporting rules from cost-sensitive tree...\n")

  # Try the main function first
  exported_rules <- export_tree_as_rules(cost_tree, covid_screening_data)

  if (!is.null(exported_rules) && nrow(exported_rules) > 0) {
    cat("Successfully exported", nrow(exported_rules), "rules\n")

    # Display the rules
    knitr::kable(exported_rules, 
                 caption = "Clinical Decision Support Rules",
                 align = c('c', 'l', 'c', 'c', 'c'))

    # Create a more readable format
    cat("\n## Human-Readable Decision Rules:\n\n")
    for (i in 1:nrow(exported_rules)) {
      cat("**Rule", exported_rules$rule_id[i], ":**\n")
      cat("- **If:** ", exported_rules$condition[i], "\n")
      cat("- **Then:** Predict", exported_rules$prediction[i], "\n")
      cat("- **Confidence:** ", exported_rules$confidence[i]*100, "%\n")
      cat("- **Based on:** ", exported_rules$n_cases[i], "cases\n\n")
    }

  } else {
    cat("Failed to export structured rules. Using simple text representation:\n\n")
    simple_rules <- simple_tree_rules(cost_tree, covid_screening_data)
    cat("```\n")
    cat(simple_rules)
    cat("\n```\n")
  }

} else {
  cat("No decision tree available. Creating a simple example tree...\n")

  # Create a simple example tree for demonstration
  if (exists("covid_screening_data")) {
    # Ensure we have the necessary columns
    if ("rapid_antigen" %in% names(covid_screening_data) && 
        "covid_status" %in% names(covid_screening_data)) {

      # Simple tree with minimal requirements
      simple_formula <- covid_status ~ rapid_antigen

      # Check if we have enough data
      complete_data <- covid_screening_data[complete.cases(covid_screening_data[c("rapid_antigen", "covid_status")]), ]

      if (nrow(complete_data) > 10) {
        simple_tree <- rpart(simple_formula, 
                            data = complete_data,
                            method = "class",
                            control = rpart.control(minbucket = 5, cp = 0.1))

        cat("Created simple demonstration tree:\n")
        print(simple_tree)

        # Try to export rules from simple tree
        simple_exported <- export_tree_as_rules(simple_tree, complete_data)

        if (!is.null(simple_exported)) {
          knitr::kable(simple_exported, 
                       caption = "Simple Decision Rules (Example)",
                       align = c('c', 'l', 'c', 'c', 'c'))
        }
      } else {
        cat("Insufficient data for tree creation\n")
      }
    } else {
      cat("Required columns not found in data\n")
    }
  } else {
    cat("No data available for tree creation\n")
  }
}

# Export formats section
cat("\n## Export Formats\n\n")

export_formats <- data.frame(
  Format = c("JSON", "XML", "CSV", "SQL", "R Code"),
  `Use Case` = c(
    "Web applications, APIs",
    "Healthcare standards (HL7)",
    "Spreadsheet analysis",
    "Database integration",
    "R/Statistical software"
  ),
  Complexity = c("Medium", "High", "Low", "Medium", "Low"),
  Implementation = c(
    "jsonlite::toJSON()",
    "XML::xmlTree()",
    "write.csv()",
    "Custom SQL generation",
    "dput() or custom function"
  ),
  stringsAsFactors = FALSE
)

knitr::kable(export_formats,
             caption = "Available Export Formats for Decision Rules",
             align = c('l', 'l', 'c', 'l'))

cat("\n## Implementation Example\n\n")
cat("Here's how these rules could be implemented in a clinical system:\n\n")

implementation_example <- '
# Example implementation in R
clinical_decision <- function(rapid_antigen_result) {
  if (rapid_antigen_result == "Positive") {
    return(list(
      decision = "Positive",
      confidence = 0.95,
      recommendation = "Confirm with PCR if needed for official diagnosis"
    ))
  } else {
    return(list(
      decision = "Negative", 
      confidence = 0.85,
      recommendation = "Consider PCR if high clinical suspicion"
    ))
  }
}

# Usage example:
# result <- clinical_decision("Positive")
# print(result$decision)
'

cat("```r\n")
cat(implementation_example)
cat("```\n")

Creating API-Ready Outputs

# Function to create API response for test panel recommendation
create_api_response <- function(patient_data, optimal_panel) {
  response <- list(
    timestamp = Sys.time(),
    patient_id = patient_data$patient_id,
    recommendations = list(
      primary = list(
        tests = optimal_panel$tests,
        strategy = optimal_panel$strategy,
        expected_performance = list(
          sensitivity = round(optimal_panel$sensitivity * 100, 1),
          specificity = round(optimal_panel$specificity * 100, 1),
          ppv = round(optimal_panel$ppv * 100, 1),
          npv = round(optimal_panel$npv * 100, 1)
        ),
        estimated_cost = optimal_panel$cost
      ),
      alternative_protocols = list(
        rapid = "Rapid antigen only",
        comprehensive = "All available tests"
      )
    ),
    warnings = list(),
    metadata = list(
      model_version = "1.0.0",
      confidence_level = "high"
    )
  )

  # Add warnings based on patient characteristics
  if (patient_data$age > 65) {
    response$warnings <- append(response$warnings, 
                               "High-risk age group - consider lower threshold")
  }

  return(response)
}

# Example API response
example_patient <- covid_screening_data[1,]
example_panel <- list(
  tests = "rapid_antigen+pcr",
  strategy = "parallel_any",
  sensitivity = 0.97,
  specificity = 0.97,
  ppv = 0.82,
  npv = 0.99,
  cost = 55
)

api_response <- create_api_response(example_patient, example_panel)
cat("API Response:\n")
print(jsonlite::toJSON(api_response, pretty = TRUE))

Validation and Quality Control

Cross-Validation with Custom Splits

# Time-based cross-validation for temporal data
time_based_cv <- function(data, date_column, n_splits = 5) {
  # Sort by date
  data <- data[order(data[[date_column]]),]
  n <- nrow(data)

  # Create time-based splits
  splits <- list()
  test_size <- floor(n / (n_splits + 1))

  for (i in 1:n_splits) {
    train_end <- test_size * i
    test_start <- train_end + 1
    test_end <- min(test_start + test_size - 1, n)

    splits[[i]] <- list(
      train = 1:train_end,
      test = test_start:test_end
    )
  }

  return(splits)
}

# Stratified cross-validation ensuring prevalence balance
stratified_cv <- function(data, outcome_column, n_folds = 5) {
  # Separate by outcome
  positive_idx <- which(data[[outcome_column]] == levels(data[[outcome_column]])[2])
  negative_idx <- which(data[[outcome_column]] == levels(data[[outcome_column]])[1])

  # Shuffle within strata
  positive_idx <- sample(positive_idx)
  negative_idx <- sample(negative_idx)

  # Create folds maintaining proportion
  folds <- list()
  pos_per_fold <- length(positive_idx) %/% n_folds
  neg_per_fold <- length(negative_idx) %/% n_folds

  for (i in 1:n_folds) {
    if (i < n_folds) {
      fold_pos <- positive_idx[((i-1)*pos_per_fold + 1):(i*pos_per_fold)]
      fold_neg <- negative_idx[((i-1)*neg_per_fold + 1):(i*neg_per_fold)]
    } else {
      # Last fold gets remaining
      fold_pos <- positive_idx[((i-1)*pos_per_fold + 1):length(positive_idx)]
      fold_neg <- negative_idx[((i-1)*neg_per_fold + 1):length(negative_idx)]
    }

    folds[[i]] <- c(fold_pos, fold_neg)
  }

  return(folds)
}

# Apply stratified CV
folds <- stratified_cv(covid_screening_data, "covid_status", n_folds = 5)

# Check fold balance
for (i in 1:length(folds)) {
  fold_data <- covid_screening_data[folds[[i]],]
  prevalence <- mean(fold_data$covid_status == "Positive")
  cat("Fold", i, ": n =", length(folds[[i]]), 
      ", prevalence =", round(prevalence * 100, 1), "%\n")
}

Conclusion

This vignette has covered advanced features including:

Custom Optimization: Multi-objective optimization, Pareto frontiers
Advanced Trees: Cost-sensitive and ensemble methods
Complex Constraints: Business rules and time-dependent strategies
Performance: Efficient computation and caching
Integration: API outputs and clinical decision support
Validation: Custom cross-validation schemes

These advanced features enable the Decision Panel Optimization module to handle complex real-world scenarios while maintaining computational efficiency and clinical relevance.

Session Information

sessionInfo()

sbalci/ClinicoPathJamoviModule documentation built on June 13, 2025, 9:34 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

sbalci/ClinicoPathJamoviModule
Analysis for Clinicopathological Research

In sbalci/ClinicoPathJamoviModule: Analysis for Clinicopathological Research

Introduction

Custom Optimization Functions

Defining Custom Utility Functions

Implementing Multi-Objective Optimization

Advanced Decision Trees

Cost-Sensitive Decision Trees

Ensemble Decision Trees

Complex Constraints and Business Rules

Implementing Complex Constraints

Time-Dependent Testing Strategies

Performance Optimization

Efficient Computation for Large Datasets

Caching and Memoization

Integration with External Systems

Exporting Results for Clinical Decision Support Systems

Creating API-Ready Outputs

Validation and Quality Control

Cross-Validation with Custom Splits

Conclusion

Session Information

R Package Documentation

Browse R Packages

We want your feedback!

sbalci/ClinicoPathJamoviModule Analysis for Clinicopathological Research

In sbalci/ClinicoPathJamoviModule: Analysis for Clinicopathological Research

Introduction

Custom Optimization Functions

Defining Custom Utility Functions

Implementing Multi-Objective Optimization

Advanced Decision Trees

Cost-Sensitive Decision Trees

Ensemble Decision Trees

Complex Constraints and Business Rules

Implementing Complex Constraints

Time-Dependent Testing Strategies

Performance Optimization

Efficient Computation for Large Datasets

Caching and Memoization

Integration with External Systems

Exporting Results for Clinical Decision Support Systems

Creating API-Ready Outputs

Validation and Quality Control

Cross-Validation with Custom Splits

Conclusion

Session Information

R Package Documentation

Browse R Packages

We want your feedback!

sbalci/ClinicoPathJamoviModule
Analysis for Clinicopathological Research