knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width = 8, fig.height = 6 )
This vignette covers advanced features of the Decision Panel Optimization module, including custom optimization functions, complex constraints, and programmatic access to results.
# Load required packages library(meddecide) library(dplyr) library(ggplot2) library(rpart) library(rpart.plot) library(knitr) library(forcats)
# Test Datasets for Decision Panel Optimization Module # Set seed for reproducibility set.seed(42) # Create data directory if it doesn't exist if (!dir.exists("data")) { dir.create("data") } # ============================================================================ # DATASET 1: COVID-19 SCREENING # ============================================================================ create_covid_data <- function(n = 1000, prevalence = 0.15) { # True disease status disease <- rbinom(n, 1, prevalence) # Rapid Antigen Test (RAT) # Sensitivity: 65%, Specificity: 98% rat_prob <- ifelse(disease == 1, 0.65, 0.02) rat_result <- rbinom(n, 1, rat_prob) # PCR Test # Sensitivity: 95%, Specificity: 99% pcr_prob <- ifelse(disease == 1, 0.95, 0.01) pcr_result <- rbinom(n, 1, pcr_prob) # Chest CT # Sensitivity: 90%, Specificity: 85% ct_prob <- ifelse(disease == 1, 0.90, 0.15) ct_result <- rbinom(n, 1, ct_prob) # Clinical symptoms score (0-10) # Higher in disease symptoms <- ifelse(disease == 1, pmin(10, round(rnorm(sum(disease == 1), 7, 2))), pmax(0, round(rnorm(sum(disease == 0), 3, 2)))) # Create dataset covid_data <- data.frame( patient_id = 1:n, rapid_antigen = factor(rat_result, levels = c(0, 1), labels = c("Negative", "Positive")), pcr = factor(pcr_result, levels = c(0, 1), labels = c("Negative", "Positive")), chest_ct = factor(ct_result, levels = c(0, 1), labels = c("Normal", "Abnormal")), symptom_score = symptoms, covid_status = factor(disease, levels = c(0, 1), labels = c("Negative", "Positive")), age = round(rnorm(n, 45, 15)), risk_group = factor(sample(c("Low", "Medium", "High"), n, replace = TRUE, prob = c(0.6, 0.3, 0.1))) ) # Add some missing values realistically # PCR might be missing if rapid test is negative missing_pcr <- which(covid_data$rapid_antigen == "Negative" & runif(n) < 0.3) covid_data$pcr[missing_pcr] <- NA return(covid_data) } # Generate COVID dataset covid_screening_data <- create_covid_data(n = 1000, prevalence = 0.15) # ============================================================================ # DATASET 2: BREAST CANCER SCREENING # ============================================================================ create_breast_cancer_data <- function(n = 2000, prevalence = 0.005) { # True disease status (low prevalence for screening) disease <- rbinom(n, 1, prevalence) # Clinical Breast Exam (CBE) # Sensitivity: 54%, Specificity: 94% cbe_prob <- ifelse(disease == 1, 0.54, 0.06) cbe_result <- rbinom(n, 1, cbe_prob) # Mammography # Sensitivity: 85%, Specificity: 95% mammo_prob <- ifelse(disease == 1, 0.85, 0.05) mammo_result <- rbinom(n, 1, mammo_prob) # Ultrasound # Sensitivity: 80%, Specificity: 90% us_prob <- ifelse(disease == 1, 0.80, 0.10) us_result <- rbinom(n, 1, us_prob) # MRI (for high-risk patients) # Sensitivity: 95%, Specificity: 85% mri_prob <- ifelse(disease == 1, 0.95, 0.15) mri_result <- rbinom(n, 1, mri_prob) # Create risk factors age <- round(rnorm(n, 55, 10)) age[age < 40] <- 40 age[age > 75] <- 75 family_history <- rbinom(n, 1, 0.15) brca_status <- rbinom(n, 1, 0.02) # Create dataset breast_cancer_data <- data.frame( patient_id = 1:n, clinical_exam = factor(cbe_result, levels = c(0, 1), labels = c("Normal", "Abnormal")), mammography = factor(mammo_result, levels = c(0, 1), labels = c("BIRADS 1-2", "BIRADS 3-5")), ultrasound = factor(us_result, levels = c(0, 1), labels = c("Normal", "Suspicious")), mri = factor(mri_result, levels = c(0, 1), labels = c("Normal", "Suspicious")), cancer_status = factor(disease, levels = c(0, 1), labels = c("No Cancer", "Cancer")), age = age, family_history = factor(family_history, levels = c(0, 1), labels = c("No", "Yes")), brca_mutation = factor(brca_status, levels = c(0, 1), labels = c("Negative", "Positive")), breast_density = factor(sample(c("A", "B", "C", "D"), n, replace = TRUE, prob = c(0.1, 0.4, 0.4, 0.1))) ) # MRI typically only done for high-risk low_risk_idx <- which(breast_cancer_data$family_history == "No" & breast_cancer_data$brca_mutation == "Negative") breast_cancer_data$mri[sample(low_risk_idx, length(low_risk_idx) * 0.9)] <- NA return(breast_cancer_data) } # Generate breast cancer dataset breast_cancer_data <- create_breast_cancer_data(n = 2000, prevalence = 0.005) # ============================================================================ # DATASET 3: TUBERCULOSIS DIAGNOSIS # ============================================================================ create_tb_data <- function(n = 1500, prevalence = 0.20) { # True disease status (high prevalence in TB clinic) disease <- rbinom(n, 1, prevalence) # Symptom screening (cough > 2 weeks, fever, weight loss, night sweats) # Sensitivity: 80%, Specificity: 60% symptom_prob <- ifelse(disease == 1, 0.80, 0.40) symptom_result <- rbinom(n, 1, symptom_prob) # Sputum smear microscopy # Sensitivity: 60%, Specificity: 98% smear_prob <- ifelse(disease == 1, 0.60, 0.02) smear_result <- rbinom(n, 1, smear_prob) # GeneXpert MTB/RIF # Sensitivity: 88%, Specificity: 98% genexpert_prob <- ifelse(disease == 1, 0.88, 0.02) genexpert_result <- rbinom(n, 1, genexpert_prob) # Culture (gold standard-ish) # Sensitivity: 95%, Specificity: 100% culture_prob <- ifelse(disease == 1, 0.95, 0.00) culture_result <- rbinom(n, 1, culture_prob) # Chest X-ray # Sensitivity: 85%, Specificity: 75% cxr_prob <- ifelse(disease == 1, 0.85, 0.25) cxr_result <- rbinom(n, 1, cxr_prob) # HIV status affects presentation hiv_status <- rbinom(n, 1, 0.25) # Create dataset tb_data <- data.frame( patient_id = 1:n, symptoms = factor(symptom_result, levels = c(0, 1), labels = c("No", "Yes")), sputum_smear = factor(smear_result, levels = c(0, 1), labels = c("Negative", "Positive")), genexpert = factor(genexpert_result, levels = c(0, 1), labels = c("MTB not detected", "MTB detected")), culture = factor(culture_result, levels = c(0, 1), labels = c("Negative", "Positive")), chest_xray = factor(cxr_result, levels = c(0, 1), labels = c("Normal", "Abnormal")), tb_status = factor(disease, levels = c(0, 1), labels = c("No TB", "TB")), hiv_status = factor(hiv_status, levels = c(0, 1), labels = c("Negative", "Positive")), age = round(rnorm(n, 35, 15)), contact_history = factor(rbinom(n, 1, 0.30), levels = c(0, 1), labels = c("No", "Yes")) ) # Culture takes time, might not be done for all no_culture_idx <- which(tb_data$genexpert == "MTB not detected" & tb_data$symptoms == "No" & runif(n) < 0.4) tb_data$culture[no_culture_idx] <- NA return(tb_data) } # Generate TB dataset tb_diagnosis_data <- create_tb_data(n = 1500, prevalence = 0.20) # ============================================================================ # DATASET 4: MYOCARDIAL INFARCTION RULE-OUT (FIXED) # ============================================================================ create_mi_data <- function(n = 800, prevalence = 0.10) { # True disease status disease <- rbinom(n, 1, prevalence) # ECG changes # Sensitivity: 55%, Specificity: 95% ecg_prob <- ifelse(disease == 1, 0.55, 0.05) ecg_result <- rbinom(n, 1, ecg_prob) # Initial troponin # Sensitivity: 85%, Specificity: 95% trop1_prob <- ifelse(disease == 1, 0.85, 0.05) trop1_result <- rbinom(n, 1, trop1_prob) # 3-hour troponin # Sensitivity: 98%, Specificity: 95% trop3_prob <- ifelse(disease == 1, 0.98, 0.05) trop3_result <- rbinom(n, 1, trop3_prob) # Make sure 3-hour is at least as positive as initial trop3_result <- pmax(trop1_result, trop3_result) # CT Angiography # Sensitivity: 95%, Specificity: 90% cta_prob <- ifelse(disease == 1, 0.95, 0.10) cta_result <- rbinom(n, 1, cta_prob) # Clinical risk score components age <- round(rnorm(n, 60, 15)) age[age < 30] <- 30 age[age > 90] <- 90 # FIXED: Generate chest pain type for each individual based on their disease status chest_pain_type <- character(n) pain_types <- c("Typical", "Atypical", "Non-cardiac") for (i in 1:n) { if (disease[i] == 1) { # Disease present - more likely to have typical pain chest_pain_type[i] <- sample(pain_types, 1, prob = c(0.6, 0.3, 0.1)) } else { # No disease - more likely to have non-cardiac pain chest_pain_type[i] <- sample(pain_types, 1, prob = c(0.1, 0.3, 0.6)) } } # Create dataset mi_data <- data.frame( patient_id = 1:n, ecg = factor(ecg_result, levels = c(0, 1), labels = c("Normal", "Ischemic changes")), troponin_initial = factor(trop1_result, levels = c(0, 1), labels = c("Normal", "Elevated")), troponin_3hr = factor(trop3_result, levels = c(0, 1), labels = c("Normal", "Elevated")), ct_angiography = factor(cta_result, levels = c(0, 1), labels = c("No stenosis", "Significant stenosis")), mi_status = factor(disease, levels = c(0, 1), labels = c("No MI", "MI")), age = age, chest_pain = factor(chest_pain_type), diabetes = factor(rbinom(n, 1, 0.25), levels = c(0, 1), labels = c("No", "Yes")), smoking = factor(rbinom(n, 1, 0.30), levels = c(0, 1), labels = c("No", "Yes")), prior_cad = factor(rbinom(n, 1, 0.20), levels = c(0, 1), labels = c("No", "Yes")) ) # CTA typically only for intermediate risk low_risk_idx <- which(mi_data$chest_pain == "Non-cardiac" & mi_data$ecg == "Normal" & mi_data$troponin_initial == "Normal") if (length(low_risk_idx) > 0) { mi_data$ct_angiography[sample(low_risk_idx, min(length(low_risk_idx), floor(length(low_risk_idx) * 0.8)))] <- NA } return(mi_data) } # Generate MI dataset mi_ruleout_data <- create_mi_data(n = 800, prevalence = 0.10) # ============================================================================ # DATASET 5: THYROID NODULE EVALUATION # ============================================================================ create_thyroid_data <- function(n = 600, prevalence = 0.05) { # True disease status (thyroid cancer) disease <- rbinom(n, 1, prevalence) # Ultrasound features (TI-RADS) # Sensitivity: 90%, Specificity: 70% us_prob <- ifelse(disease == 1, 0.90, 0.30) us_result <- rbinom(n, 1, us_prob) # Fine Needle Aspiration (FNA) cytology # Sensitivity: 95%, Specificity: 98% fna_prob <- ifelse(disease == 1, 0.95, 0.02) fna_result <- rbinom(n, 1, fna_prob) # Molecular testing (ThyroSeq/Afirma) # Sensitivity: 91%, Specificity: 85% molecular_prob <- ifelse(disease == 1, 0.91, 0.15) molecular_result <- rbinom(n, 1, molecular_prob) # Thyroglobulin levels # Sensitivity: 70%, Specificity: 80% tg_prob <- ifelse(disease == 1, 0.70, 0.20) tg_result <- rbinom(n, 1, tg_prob) # Create dataset thyroid_data <- data.frame( patient_id = 1:n, ultrasound = factor(us_result, levels = c(0, 1), labels = c("TI-RADS 1-3", "TI-RADS 4-5")), fna_cytology = factor(fna_result, levels = c(0, 1), labels = c("Benign/Indeterminate", "Suspicious/Malignant")), molecular_test = factor(molecular_result, levels = c(0, 1), labels = c("Benign", "Suspicious")), thyroglobulin = factor(tg_result, levels = c(0, 1), labels = c("Normal", "Elevated")), cancer_status = factor(disease, levels = c(0, 1), labels = c("Benign", "Malignant")), nodule_size = round(rlnorm(n, log(15), 0.5)), age = round(rnorm(n, 50, 15)), gender = factor(sample(c("Female", "Male"), n, replace = TRUE, prob = c(0.75, 0.25))), radiation_history = factor(rbinom(n, 1, 0.05), levels = c(0, 1), labels = c("No", "Yes")) ) # Molecular testing only for indeterminate FNA molecular_not_done <- which(thyroid_data$fna_cytology != "Benign/Indeterminate" | runif(n) < 0.5) thyroid_data$molecular_test[molecular_not_done] <- NA return(thyroid_data) } # Generate thyroid dataset thyroid_nodule_data <- create_thyroid_data(n = 600, prevalence = 0.05) # ============================================================================ # SAVE ALL DATASETS # ============================================================================ # Save as RData file save(covid_screening_data, breast_cancer_data, tb_diagnosis_data, mi_ruleout_data, thyroid_nodule_data, file = "data/decision_panel_test_data.RData") # Also save as CSV files for external use write.csv(covid_screening_data, "data/covid_screening_data.csv", row.names = FALSE) write.csv(breast_cancer_data, "data/breast_cancer_data.csv", row.names = FALSE) write.csv(tb_diagnosis_data, "data/tb_diagnosis_data.csv", row.names = FALSE) write.csv(mi_ruleout_data, "data/mi_ruleout_data.csv", row.names = FALSE) write.csv(thyroid_nodule_data, "data/thyroid_nodule_data.csv", row.names = FALSE) # Print confirmation cat("✓ All datasets generated and saved successfully!\n") # ============================================================================ # DATASET SUMMARIES FOR VERIFICATION # ============================================================================ summarize_dataset <- function(data, disease_col, test_cols) { cat("\nDataset Summary:\n") cat("Total observations:", nrow(data), "\n") # Check if disease column exists and has the expected levels if (disease_col %in% names(data)) { disease_levels <- levels(factor(data[[disease_col]])) if (length(disease_levels) >= 2) { prevalence <- mean(data[[disease_col]] == disease_levels[2], na.rm = TRUE) cat("Disease prevalence:", round(prevalence * 100, 1), "%\n") } else { cat("Disease column found but insufficient levels\n") } } else { cat("Disease column not found\n") } cat("\nTest performance:\n") for (test in test_cols) { if (test %in% names(data) && disease_col %in% names(data)) { # Remove rows with NAs in either variable complete_data <- data[!is.na(data[[test]]) & !is.na(data[[disease_col]]), ] if (nrow(complete_data) > 0) { tab <- table(complete_data[[test]], complete_data[[disease_col]]) if (nrow(tab) >= 2 && ncol(tab) >= 2) { # Assume positive is the second level (index 2) if (nrow(tab) == 2 && ncol(tab) == 2) { sens <- tab[2,2] / sum(tab[,2]) spec <- tab[1,1] / sum(tab[,1]) cat(sprintf(" %s: Sens=%.1f%%, Spec=%.1f%% (n=%d)\n", test, sens*100, spec*100, nrow(complete_data))) } else { cat(sprintf(" %s: Complex table structure (n=%d)\n", test, nrow(complete_data))) } } else { cat(sprintf(" %s: Insufficient data for analysis (n=%d)\n", test, nrow(complete_data))) } } else { cat(sprintf(" %s: No complete cases\n", test)) } } else { cat(sprintf(" %s: Column not found\n", test)) } } } # Print summaries for verification cat("\n", paste(rep("=", 60), collapse = ""), "\n") cat("DATASET SUMMARIES") cat("\n", paste(rep("=", 60), collapse = ""), "\n") cat("\n=== COVID-19 SCREENING DATA ===") summarize_dataset(covid_screening_data, "covid_status", c("rapid_antigen", "pcr", "chest_ct")) cat("\n=== BREAST CANCER SCREENING DATA ===") summarize_dataset(breast_cancer_data, "cancer_status", c("clinical_exam", "mammography", "ultrasound", "mri")) cat("\n=== TUBERCULOSIS DIAGNOSIS DATA ===") summarize_dataset(tb_diagnosis_data, "tb_status", c("symptoms", "sputum_smear", "genexpert", "culture", "chest_xray")) cat("\n=== MYOCARDIAL INFARCTION DATA ===") summarize_dataset(mi_ruleout_data, "mi_status", c("ecg", "troponin_initial", "troponin_3hr", "ct_angiography")) cat("\n=== THYROID NODULE DATA ===") summarize_dataset(thyroid_nodule_data, "cancer_status", c("ultrasound", "fna_cytology", "molecular_test", "thyroglobulin")) cat("\n", paste(rep("=", 60), collapse = ""), "\n")
# Set seed for reproducibility set.seed(123)
The module allows custom utility functions that incorporate domain-specific knowledge:
# Define a custom utility function for COVID screening # Prioritizes not missing cases while considering hospital capacity covid_utility <- function(TP, FP, TN, FN, test_cost, hospital_capacity = 100) { # Base utilities u_TP <- 100 # Correctly identified case u_TN <- 10 # Correctly ruled out u_FP <- -20 # Unnecessary isolation u_FN <- -1000 # Missed case (high penalty) # Capacity penalty - increases FP cost when near capacity current_positives <- TP + FP capacity_factor <- ifelse(current_positives > hospital_capacity * 0.8, (current_positives / hospital_capacity)^2, 1) u_FP_adjusted <- u_FP * capacity_factor # Calculate total utility total_utility <- (TP * u_TP + TN * u_TN + FP * u_FP_adjusted + FN * u_FN - test_cost) return(total_utility) } # Example calculation n_total <- 1000 prevalence <- 0.15 test_cost <- 55 # Combined test cost # Scenario 1: Low capacity utility_low_capacity <- covid_utility( TP = 147, # 98% sensitivity FP = 26, # 97% specificity TN = 824, FN = 3, test_cost = test_cost, hospital_capacity = 50 ) # Scenario 2: High capacity utility_high_capacity <- covid_utility( TP = 147, FP = 26, TN = 824, FN = 3, test_cost = test_cost, hospital_capacity = 200 ) cat("Utility with low capacity:", utility_low_capacity, "\n") cat("Utility with high capacity:", utility_high_capacity, "\n")
When multiple objectives conflict, use Pareto optimization:
# Generate test combinations and their performance generate_pareto_data <- function(data, tests, gold, gold_positive) { # Get all possible test combinations all_combinations <- list() for (i in 1:length(tests)) { combos <- combn(tests, i, simplify = FALSE) all_combinations <- c(all_combinations, combos) } # Calculate metrics for each combination results <- data.frame() for (combo in all_combinations) { # Simulate parallel ANY rule test_positive <- rowSums(data[combo] == "Positive" | data[combo] == "Abnormal" | data[combo] == "MTB detected", na.rm = TRUE) > 0 truth <- data[[gold]] == gold_positive # Calculate metrics TP <- sum(test_positive & truth) FP <- sum(test_positive & !truth) TN <- sum(!test_positive & !truth) FN <- sum(!test_positive & truth) sensitivity <- TP / (TP + FN) specificity <- TN / (TN + FP) # Simulated costs test_costs <- c(rapid_antigen = 5, pcr = 50, chest_ct = 200) total_cost <- sum(test_costs[combo]) results <- rbind(results, data.frame( tests = paste(combo, collapse = "+"), n_tests = length(combo), sensitivity = sensitivity, specificity = specificity, cost = total_cost )) } return(results) } # Generate Pareto frontier for COVID tests pareto_data <- generate_pareto_data( covid_screening_data[1:500,], # Use subset for speed tests = c("rapid_antigen", "pcr", "chest_ct"), gold = "covid_status", gold_positive = "Positive" ) # Identify Pareto optimal solutions is_pareto_optimal <- function(data, objectives) { n <- nrow(data) pareto <- rep(TRUE, n) for (i in 1:n) { for (j in 1:n) { if (i != j) { # Check if j dominates i dominates <- all(data[j, objectives] >= data[i, objectives]) && any(data[j, objectives] > data[i, objectives]) if (dominates) { pareto[i] <- FALSE break } } } } return(pareto) } # For sensitivity and cost (cost should be minimized, so use negative) pareto_data$neg_cost <- -pareto_data$cost pareto_data$pareto_optimal <- is_pareto_optimal( pareto_data, c("sensitivity", "neg_cost") ) # Visualize Pareto frontier ggplot(pareto_data, aes(x = cost, y = sensitivity * 100)) + geom_point(aes(color = pareto_optimal, size = n_tests), alpha = 0.7) + geom_line(data = pareto_data[pareto_data$pareto_optimal,] %>% arrange(cost), color = "red", size = 1) + geom_text(data = pareto_data[pareto_data$pareto_optimal,], aes(label = tests), vjust = -1, size = 3) + scale_color_manual(values = c("gray", "red")) + labs( title = "Pareto Frontier for Multi-Objective Optimization", x = "Total Cost ($)", y = "Sensitivity (%)", caption = "Red points and line show Pareto optimal solutions" ) + theme_minimal() + theme(legend.position = "none")
Implement decision trees that consider both accuracy and cost:
# Prepare data for decision tree tree_data <- covid_screening_data %>% select(rapid_antigen, pcr, chest_ct, symptom_score, age, risk_group, covid_status) %>% na.omit() # Create cost matrix # Rows: predicted, Columns: actual # Cost of false negative is 10x cost of false positive cost_matrix <- matrix(c(0, 1, # Predict Negative 10, 0), # Predict Positive nrow = 2, byrow = TRUE) # Build cost-sensitive tree cost_tree <- rpart( covid_status ~ rapid_antigen + pcr + chest_ct + symptom_score + age + risk_group, data = tree_data, method = "class", parms = list(loss = cost_matrix), control = rpart.control(cp = 0.01, maxdepth = 4) ) # Visualize tree rpart.plot(cost_tree, main = "Cost-Sensitive Decision Tree for COVID-19", extra = 104, # Show probability and number under = TRUE, faclen = 0, cex = 0.8) # Compare with standard tree standard_tree <- rpart( covid_status ~ rapid_antigen + pcr + chest_ct + symptom_score + age + risk_group, data = tree_data, method = "class", control = rpart.control(cp = 0.01, maxdepth = 4) ) # Performance comparison tree_comparison <- data.frame( Model = c("Standard", "Cost-Sensitive"), Accuracy = c( sum(predict(standard_tree, type = "class") == tree_data$covid_status) / nrow(tree_data), sum(predict(cost_tree, type = "class") == tree_data$covid_status) / nrow(tree_data) ), Sensitivity = c( { pred <- predict(standard_tree, type = "class") sum(pred == "Positive" & tree_data$covid_status == "Positive") / sum(tree_data$covid_status == "Positive") }, { pred <- predict(cost_tree, type = "class") sum(pred == "Positive" & tree_data$covid_status == "Positive") / sum(tree_data$covid_status == "Positive") } ) ) kable(tree_comparison, digits = 3, caption = "Performance Comparison: Standard vs Cost-Sensitive Trees")
Combine multiple trees for more robust decisions:
# Create bootstrap samples and build multiple trees n_trees <- 10 trees <- list() tree_weights <- numeric(n_trees) for (i in 1:n_trees) { # Bootstrap sample boot_indices <- sample(nrow(tree_data), replace = TRUE) boot_data <- tree_data[boot_indices,] # Build tree with random feature subset features <- c("rapid_antigen", "pcr", "chest_ct", "symptom_score", "age", "risk_group") selected_features <- sample(features, size = 4) formula <- as.formula(paste("covid_status ~", paste(selected_features, collapse = " + "))) trees[[i]] <- rpart( formula, data = boot_data, method = "class", control = rpart.control(cp = 0.02, maxdepth = 3) ) # Calculate out-of-bag performance for weighting oob_indices <- setdiff(1:nrow(tree_data), unique(boot_indices)) if (length(oob_indices) > 0) { oob_pred <- predict(trees[[i]], tree_data[oob_indices,], type = "class") tree_weights[i] <- sum(oob_pred == tree_data$covid_status[oob_indices]) / length(oob_indices) } else { tree_weights[i] <- 0.5 } } # Normalize weights tree_weights <- tree_weights / sum(tree_weights) # Ensemble prediction function ensemble_predict <- function(trees, weights, newdata) { # Get probability predictions from each tree prob_matrix <- matrix(0, nrow = nrow(newdata), ncol = 2) for (i in 1:length(trees)) { probs <- predict(trees[[i]], newdata, type = "prob") prob_matrix <- prob_matrix + probs * weights[i] } # Return class with highest probability classes <- levels(tree_data$covid_status) predicted_class <- classes[apply(prob_matrix, 1, which.max)] return(list(class = predicted_class, prob = prob_matrix)) } # Test ensemble ensemble_pred <- ensemble_predict(trees, tree_weights, tree_data) # Compare performance ensemble_comparison <- data.frame( Model = c("Single Tree", "Ensemble"), Accuracy = c( sum(predict(trees[[1]], type = "class") == tree_data$covid_status) / nrow(tree_data), sum(ensemble_pred$class == tree_data$covid_status) / nrow(tree_data) ) ) kable(ensemble_comparison, digits = 3, caption = "Single Tree vs Ensemble Performance")
Real-world scenarios often have complex constraints:
# Function to check if a test combination meets constraints meets_constraints <- function(tests, constraints) { # Example constraints for TB testing # 1. If GeneXpert is used, must have sputum collection capability if ("genexpert" %in% tests && !("sputum_smear" %in% tests || constraints$has_sputum_collection)) { return(FALSE) } # 2. Culture requires biosafety level 3 lab if ("culture" %in% tests && !constraints$has_bsl3_lab) { return(FALSE) } # 3. Maximum turnaround time constraint test_times <- c(symptoms = 0, sputum_smear = 0.5, genexpert = 0.1, culture = 21, chest_xray = 0.5) max_time <- max(test_times[tests]) if (max_time > constraints$max_turnaround_days) { return(FALSE) } # 4. Budget constraint test_costs <- c(symptoms = 1, sputum_smear = 3, genexpert = 20, culture = 30, chest_xray = 10) total_cost <- sum(test_costs[tests]) if (total_cost > constraints$budget_per_patient) { return(FALSE) } return(TRUE) } # Define facility-specific constraints facility_constraints <- list( rural_clinic = list( has_sputum_collection = TRUE, has_bsl3_lab = FALSE, max_turnaround_days = 1, budget_per_patient = 15 ), district_hospital = list( has_sputum_collection = TRUE, has_bsl3_lab = FALSE, max_turnaround_days = 7, budget_per_patient = 50 ), reference_lab = list( has_sputum_collection = TRUE, has_bsl3_lab = TRUE, max_turnaround_days = 30, budget_per_patient = 100 ) ) # Find valid combinations for each facility type tb_tests <- c("symptoms", "sputum_smear", "genexpert", "culture", "chest_xray") for (facility in names(facility_constraints)) { valid_combos <- list() # Check all combinations for (i in 1:length(tb_tests)) { combos <- combn(tb_tests, i, simplify = FALSE) for (combo in combos) { if (meets_constraints(combo, facility_constraints[[facility]])) { valid_combos <- c(valid_combos, list(combo)) } } } cat("\n", facility, ": ", length(valid_combos), " valid combinations\n", sep = "") cat("Examples: \n") for (j in 1:min(3, length(valid_combos))) { cat(" -", paste(valid_combos[[j]], collapse = " + "), "\n") } }
Implement strategies that change based on time constraints:
# Time-dependent chest pain protocol time_dependent_protocol <- function(patient_data, time_available_hours) { decisions <- data.frame( patient_id = patient_data$patient_id, protocol = character(nrow(patient_data)), tests_used = character(nrow(patient_data)), decision_time = numeric(nrow(patient_data)), stringsAsFactors = FALSE ) for (i in 1:nrow(patient_data)) { patient <- patient_data[i,] if (time_available_hours >= 3) { # Full protocol available if (patient$troponin_initial == "Normal" && patient$troponin_3hr == "Normal" && patient$ecg == "Normal") { decisions$protocol[i] <- "Rule out" decisions$tests_used[i] <- "ECG + Serial troponins" decisions$decision_time[i] <- 3 } else if (patient$troponin_3hr == "Elevated") { decisions$protocol[i] <- "Rule in" decisions$tests_used[i] <- "ECG + Serial troponins" decisions$decision_time[i] <- 3 } else { decisions$protocol[i] <- "Further testing" decisions$tests_used[i] <- "ECG + Serial troponins + CTA" decisions$decision_time[i] <- 3.5 } } else if (time_available_hours >= 1) { # Rapid protocol if (patient$troponin_initial == "Normal" && patient$ecg == "Normal" && patient$age < 40) { decisions$protocol[i] <- "Low risk discharge" decisions$tests_used[i] <- "ECG + Initial troponin" decisions$decision_time[i] <- 1 } else { decisions$protocol[i] <- "Requires admission" decisions$tests_used[i] <- "ECG + Initial troponin" decisions$decision_time[i] <- 1 } } else { # Ultra-rapid if (patient$ecg == "Ischemic changes") { decisions$protocol[i] <- "Immediate cath lab" decisions$tests_used[i] <- "ECG only" decisions$decision_time[i] <- 0.2 } else { decisions$protocol[i] <- "Clinical decision" decisions$tests_used[i] <- "ECG only" decisions$decision_time[i] <- 0.2 } } } return(decisions) } # Apply to sample patients sample_mi <- mi_ruleout_data[1:20,] # Different time scenarios time_scenarios <- c(0.5, 1, 3, 6) for (time_limit in time_scenarios) { results <- time_dependent_protocol(sample_mi, time_limit) cat("\nTime available:", time_limit, "hours\n") cat("Protocols used:\n") print(table(results$protocol)) cat("Average decision time:", mean(results$decision_time), "hours\n") }
# Performance Optimization and Benchmarking # This section demonstrates different approaches to optimize performance calculations # Function to safely calculate performance metrics with NA handling safe_performance_metrics <- function(predictions, actual, positive_class = "Positive") { # Handle missing values complete_cases <- !is.na(predictions) & !is.na(actual) if (sum(complete_cases) == 0) { return(list( accuracy = NA, sensitivity = NA, specificity = NA, ppv = NA, npv = NA, n_complete = 0 )) } pred_clean <- predictions[complete_cases] actual_clean <- actual[complete_cases] # Convert to binary if needed pred_binary <- as.character(pred_clean) == positive_class actual_binary <- as.character(actual_clean) == positive_class # Calculate confusion matrix components tp <- sum(pred_binary & actual_binary, na.rm = TRUE) tn <- sum(!pred_binary & !actual_binary, na.rm = TRUE) fp <- sum(pred_binary & !actual_binary, na.rm = TRUE) fn <- sum(!pred_binary & actual_binary, na.rm = TRUE) # Calculate metrics with division by zero protection total <- tp + tn + fp + fn accuracy <- if (total > 0) (tp + tn) / total else NA sensitivity <- if ((tp + fn) > 0) tp / (tp + fn) else NA specificity <- if ((tn + fp) > 0) tn / (tn + fp) else NA ppv <- if ((tp + fp) > 0) tp / (tp + fp) else NA npv <- if ((tn + fn) > 0) tn / (tn + fn) else NA return(list( accuracy = accuracy, sensitivity = sensitivity, specificity = specificity, ppv = ppv, npv = npv, n_complete = sum(complete_cases) )) } # Optimized confusion matrix calculation fast_confusion_matrix <- function(predictions, actual, positive_class = "Positive") { # Handle NAs upfront complete_cases <- !is.na(predictions) & !is.na(actual) if (sum(complete_cases) < 2) { return(matrix(c(0, 0, 0, 0), nrow = 2, dimnames = list( Predicted = c("Negative", "Positive"), Actual = c("Negative", "Positive") ))) } pred_clean <- predictions[complete_cases] actual_clean <- actual[complete_cases] # Use table for fast cross-tabulation conf_table <- table( Predicted = factor(pred_clean, levels = c(setdiff(unique(c(pred_clean, actual_clean)), positive_class), positive_class)), Actual = factor(actual_clean, levels = c(setdiff(unique(c(pred_clean, actual_clean)), positive_class), positive_class)) ) return(conf_table) } # Vectorized performance calculation vectorized_metrics <- function(pred_vector, actual_vector, positive_class = "Positive") { # Remove NAs complete_idx <- !is.na(pred_vector) & !is.na(actual_vector) if (sum(complete_idx) == 0) { return(data.frame( method = "vectorized", accuracy = NA, sensitivity = NA, specificity = NA, n_obs = 0 )) } pred <- pred_vector[complete_idx] actual <- actual_vector[complete_idx] # Vectorized operations pred_pos <- pred == positive_class actual_pos <- actual == positive_class tp <- sum(pred_pos & actual_pos) tn <- sum(!pred_pos & !actual_pos) fp <- sum(pred_pos & !actual_pos) fn <- sum(!pred_pos & actual_pos) n_total <- length(pred) n_pos <- sum(actual_pos) n_neg <- sum(!actual_pos) data.frame( method = "vectorized", accuracy = (tp + tn) / n_total, sensitivity = if (n_pos > 0) tp / n_pos else NA, specificity = if (n_neg > 0) tn / n_neg else NA, n_obs = n_total ) } # Create test data for benchmarking (ensure no NAs in critical columns) set.seed(123) n_test <- 1000 # Create predictions with some realistic accuracy actual_test <- factor(sample(c("Negative", "Positive"), n_test, replace = TRUE, prob = c(0.8, 0.2))) # Create predictions that correlate with actual (realistic scenario) pred_prob <- ifelse(actual_test == "Positive", 0.85, 0.15) pred_test <- factor(ifelse(runif(n_test) < pred_prob, "Positive", "Negative")) # Introduce some missing values (but not in the benchmarked functions) missing_idx <- sample(n_test, size = floor(n_test * 0.05)) actual_test_with_na <- actual_test pred_test_with_na <- pred_test actual_test_with_na[missing_idx[1:length(missing_idx)/2]] <- NA pred_test_with_na[missing_idx[(length(missing_idx)/2 + 1):length(missing_idx)]] <- NA cat("Test data created:\n") cat("Total observations:", n_test, "\n") cat("Missing values in actual:", sum(is.na(actual_test_with_na)), "\n") cat("Missing values in predictions:", sum(is.na(pred_test_with_na)), "\n") cat("Complete cases:", sum(!is.na(actual_test_with_na) & !is.na(pred_test_with_na)), "\n") # Test the functions with clean data first cat("\nTesting functions with clean data:\n") clean_metrics <- safe_performance_metrics(pred_test, actual_test) print(clean_metrics) clean_confusion <- fast_confusion_matrix(pred_test, actual_test) print(clean_confusion) # Test with data containing NAs cat("\nTesting functions with NA values:\n") na_metrics <- safe_performance_metrics(pred_test_with_na, actual_test_with_na) print(na_metrics) # Benchmark different approaches (using clean data to avoid NA issues in timing) cat("\nPerformance benchmarking:\n") # Only benchmark if microbenchmark is available if (requireNamespace("microbenchmark", quietly = TRUE)) { tryCatch({ benchmark_results <- microbenchmark::microbenchmark( "safe_metrics" = safe_performance_metrics(pred_test, actual_test), "fast_confusion" = fast_confusion_matrix(pred_test, actual_test), "vectorized" = vectorized_metrics(pred_test, actual_test), times = 10 ) print(benchmark_results) # Plot benchmark results if possible if (requireNamespace("ggplot2", quietly = TRUE)) { plot(benchmark_results) } }, error = function(e) { cat("Benchmark error (using fallback timing):", e$message, "\n") # Fallback timing method cat("Using system.time for performance measurement:\n") cat("Safe metrics timing:\n") print(system.time(for(i in 1:10) safe_performance_metrics(pred_test, actual_test))) cat("Fast confusion matrix timing:\n") print(system.time(for(i in 1:10) fast_confusion_matrix(pred_test, actual_test))) cat("Vectorized metrics timing:\n") print(system.time(for(i in 1:10) vectorized_metrics(pred_test, actual_test))) }) } else { cat("microbenchmark package not available, using system.time:\n") cat("Safe metrics timing:\n") print(system.time(replicate(10, safe_performance_metrics(pred_test, actual_test)))) cat("Fast confusion matrix timing:\n") print(system.time(replicate(10, fast_confusion_matrix(pred_test, actual_test)))) cat("Vectorized metrics timing:\n") print(system.time(replicate(10, vectorized_metrics(pred_test, actual_test)))) } # Performance comparison table performance_comparison <- data.frame( Method = c("Safe Metrics", "Fast Confusion Matrix", "Vectorized Metrics"), `Handles NAs` = c("Yes", "Yes", "Yes"), `Memory Efficient` = c("Medium", "High", "High"), `Speed` = c("Medium", "Fast", "Fastest"), `Use Case` = c("General purpose", "Detailed analysis", "Large datasets"), stringsAsFactors = FALSE ) knitr::kable(performance_comparison, caption = "Performance Optimization Comparison", align = 'c')
# Create memoized function for expensive calculations library(memoise) # Original expensive function calculate_test_performance <- function(test_data, gold_standard) { # Simulate expensive calculation Sys.sleep(0.1) # Pretend this takes time conf_matrix <- table(test_data, gold_standard) sensitivity <- conf_matrix[2,2] / sum(conf_matrix[,2]) specificity <- conf_matrix[1,1] / sum(conf_matrix[,1]) return(list(sensitivity = sensitivity, specificity = specificity)) } # Memoized version calculate_test_performance_memo <- memoise(calculate_test_performance) # Demonstration test_vector <- as.numeric(covid_screening_data$rapid_antigen == "Positive") gold_vector <- as.numeric(covid_screening_data$covid_status == "Positive") # First call - slow system.time({ result1 <- calculate_test_performance_memo(test_vector[1:100], gold_vector[1:100]) }) # Second call with same data - fast (cached) system.time({ result2 <- calculate_test_performance_memo(test_vector[1:100], gold_vector[1:100]) }) cat("Results match:", identical(result1, result2), "\n")
# Export Clinical Decision Support System Rules # Safe function to export tree rules with proper error handling export_tree_as_rules <- function(tree_model, data) { # Check if tree model exists and is valid if (is.null(tree_model) || !inherits(tree_model, "rpart")) { cat("Error: Invalid or missing tree model\n") return(NULL) } # Check if tree has any splits if (nrow(tree_model$frame) <= 1) { cat("Warning: Tree has no splits (single node)\n") return(data.frame( rule_id = 1, condition = "Always true", prediction = "Default", confidence = 1.0, n_cases = nrow(data) )) } tryCatch({ # Get tree frame information frame <- tree_model$frame # Check if required columns exist required_cols <- c("var", "yval") if (!all(required_cols %in% names(frame))) { stop("Tree frame missing required columns") } # Extract node information safely node_info <- frame # Handle yval2 safely if ("yval2" %in% names(node_info) && !is.null(node_info$yval2)) { # Check dimensions before using rowSums yval2_data <- node_info$yval2 if (is.matrix(yval2_data) && ncol(yval2_data) >= 2) { # Safe to use rowSums node_counts <- rowSums(yval2_data[, 1:min(2, ncol(yval2_data)), drop = FALSE]) } else if (is.data.frame(yval2_data) && ncol(yval2_data) >= 2) { # Convert to matrix first yval2_matrix <- as.matrix(yval2_data[, 1:min(2, ncol(yval2_data))]) node_counts <- rowSums(yval2_matrix) } else { # Fallback: use node$n if available node_counts <- if ("n" %in% names(node_info)) node_info$n else rep(1, nrow(node_info)) } } else { # Fallback: use node$n or estimate node_counts <- if ("n" %in% names(node_info)) node_info$n else rep(nrow(data), nrow(node_info)) } # Generate rules for leaf nodes leaf_nodes <- which(node_info$var == "<leaf>") if (length(leaf_nodes) == 0) { cat("Warning: No leaf nodes found\n") return(NULL) } rules_list <- list() for (i in seq_along(leaf_nodes)) { node_idx <- leaf_nodes[i] # Get the path to this leaf node node_path <- path.rpart(tree_model, nodes = as.numeric(rownames(node_info)[node_idx])) # Extract condition text if (length(node_path) > 0 && !is.null(node_path[[1]])) { condition_parts <- node_path[[1]] # Remove the root node (usually just "root") condition_parts <- condition_parts[condition_parts != "root"] if (length(condition_parts) > 0) { condition <- paste(condition_parts, collapse = " AND ") } else { condition <- "Always true (root node)" } } else { condition <- paste("Node", node_idx) } # Get prediction prediction <- as.character(node_info$yval[node_idx]) # Calculate confidence (proportion of cases) n_cases <- node_counts[node_idx] confidence <- n_cases / sum(node_counts, na.rm = TRUE) rules_list[[i]] <- data.frame( rule_id = i, condition = condition, prediction = prediction, confidence = round(confidence, 3), n_cases = n_cases, stringsAsFactors = FALSE ) } # Combine all rules if (length(rules_list) > 0) { rules_df <- do.call(rbind, rules_list) return(rules_df) } else { return(NULL) } }, error = function(e) { cat("Error in export_tree_as_rules:", e$message, "\n") cat("Tree structure:\n") if (exists("frame")) { print(str(frame)) } else { print(str(tree_model)) } return(NULL) }) } # Alternative simple rule extraction function simple_tree_rules <- function(tree_model, data) { if (is.null(tree_model) || !inherits(tree_model, "rpart")) { return("No valid tree model available") } # Use rpart's built-in text representation rules_text <- capture.output(print(tree_model)) return(paste(rules_text, collapse = "\n")) } # Generate exportable rules cat("Generating Clinical Decision Support Rules...\n") # Check if we have a valid tree from previous chunks if (exists("cost_tree") && !is.null(cost_tree)) { cat("Exporting rules from cost-sensitive tree...\n") # Try the main function first exported_rules <- export_tree_as_rules(cost_tree, covid_screening_data) if (!is.null(exported_rules) && nrow(exported_rules) > 0) { cat("Successfully exported", nrow(exported_rules), "rules\n") # Display the rules knitr::kable(exported_rules, caption = "Clinical Decision Support Rules", align = c('c', 'l', 'c', 'c', 'c')) # Create a more readable format cat("\n## Human-Readable Decision Rules:\n\n") for (i in 1:nrow(exported_rules)) { cat("**Rule", exported_rules$rule_id[i], ":**\n") cat("- **If:** ", exported_rules$condition[i], "\n") cat("- **Then:** Predict", exported_rules$prediction[i], "\n") cat("- **Confidence:** ", exported_rules$confidence[i]*100, "%\n") cat("- **Based on:** ", exported_rules$n_cases[i], "cases\n\n") } } else { cat("Failed to export structured rules. Using simple text representation:\n\n") simple_rules <- simple_tree_rules(cost_tree, covid_screening_data) cat("```\n") cat(simple_rules) cat("\n```\n") } } else { cat("No decision tree available. Creating a simple example tree...\n") # Create a simple example tree for demonstration if (exists("covid_screening_data")) { # Ensure we have the necessary columns if ("rapid_antigen" %in% names(covid_screening_data) && "covid_status" %in% names(covid_screening_data)) { # Simple tree with minimal requirements simple_formula <- covid_status ~ rapid_antigen # Check if we have enough data complete_data <- covid_screening_data[complete.cases(covid_screening_data[c("rapid_antigen", "covid_status")]), ] if (nrow(complete_data) > 10) { simple_tree <- rpart(simple_formula, data = complete_data, method = "class", control = rpart.control(minbucket = 5, cp = 0.1)) cat("Created simple demonstration tree:\n") print(simple_tree) # Try to export rules from simple tree simple_exported <- export_tree_as_rules(simple_tree, complete_data) if (!is.null(simple_exported)) { knitr::kable(simple_exported, caption = "Simple Decision Rules (Example)", align = c('c', 'l', 'c', 'c', 'c')) } } else { cat("Insufficient data for tree creation\n") } } else { cat("Required columns not found in data\n") } } else { cat("No data available for tree creation\n") } } # Export formats section cat("\n## Export Formats\n\n") export_formats <- data.frame( Format = c("JSON", "XML", "CSV", "SQL", "R Code"), `Use Case` = c( "Web applications, APIs", "Healthcare standards (HL7)", "Spreadsheet analysis", "Database integration", "R/Statistical software" ), Complexity = c("Medium", "High", "Low", "Medium", "Low"), Implementation = c( "jsonlite::toJSON()", "XML::xmlTree()", "write.csv()", "Custom SQL generation", "dput() or custom function" ), stringsAsFactors = FALSE ) knitr::kable(export_formats, caption = "Available Export Formats for Decision Rules", align = c('l', 'l', 'c', 'l')) cat("\n## Implementation Example\n\n") cat("Here's how these rules could be implemented in a clinical system:\n\n") implementation_example <- ' # Example implementation in R clinical_decision <- function(rapid_antigen_result) { if (rapid_antigen_result == "Positive") { return(list( decision = "Positive", confidence = 0.95, recommendation = "Confirm with PCR if needed for official diagnosis" )) } else { return(list( decision = "Negative", confidence = 0.85, recommendation = "Consider PCR if high clinical suspicion" )) } } # Usage example: # result <- clinical_decision("Positive") # print(result$decision) ' cat("```r\n") cat(implementation_example) cat("```\n")
# Function to create API response for test panel recommendation create_api_response <- function(patient_data, optimal_panel) { response <- list( timestamp = Sys.time(), patient_id = patient_data$patient_id, recommendations = list( primary = list( tests = optimal_panel$tests, strategy = optimal_panel$strategy, expected_performance = list( sensitivity = round(optimal_panel$sensitivity * 100, 1), specificity = round(optimal_panel$specificity * 100, 1), ppv = round(optimal_panel$ppv * 100, 1), npv = round(optimal_panel$npv * 100, 1) ), estimated_cost = optimal_panel$cost ), alternative_protocols = list( rapid = "Rapid antigen only", comprehensive = "All available tests" ) ), warnings = list(), metadata = list( model_version = "1.0.0", confidence_level = "high" ) ) # Add warnings based on patient characteristics if (patient_data$age > 65) { response$warnings <- append(response$warnings, "High-risk age group - consider lower threshold") } return(response) } # Example API response example_patient <- covid_screening_data[1,] example_panel <- list( tests = "rapid_antigen+pcr", strategy = "parallel_any", sensitivity = 0.97, specificity = 0.97, ppv = 0.82, npv = 0.99, cost = 55 ) api_response <- create_api_response(example_patient, example_panel) cat("API Response:\n") print(jsonlite::toJSON(api_response, pretty = TRUE))
# Time-based cross-validation for temporal data time_based_cv <- function(data, date_column, n_splits = 5) { # Sort by date data <- data[order(data[[date_column]]),] n <- nrow(data) # Create time-based splits splits <- list() test_size <- floor(n / (n_splits + 1)) for (i in 1:n_splits) { train_end <- test_size * i test_start <- train_end + 1 test_end <- min(test_start + test_size - 1, n) splits[[i]] <- list( train = 1:train_end, test = test_start:test_end ) } return(splits) } # Stratified cross-validation ensuring prevalence balance stratified_cv <- function(data, outcome_column, n_folds = 5) { # Separate by outcome positive_idx <- which(data[[outcome_column]] == levels(data[[outcome_column]])[2]) negative_idx <- which(data[[outcome_column]] == levels(data[[outcome_column]])[1]) # Shuffle within strata positive_idx <- sample(positive_idx) negative_idx <- sample(negative_idx) # Create folds maintaining proportion folds <- list() pos_per_fold <- length(positive_idx) %/% n_folds neg_per_fold <- length(negative_idx) %/% n_folds for (i in 1:n_folds) { if (i < n_folds) { fold_pos <- positive_idx[((i-1)*pos_per_fold + 1):(i*pos_per_fold)] fold_neg <- negative_idx[((i-1)*neg_per_fold + 1):(i*neg_per_fold)] } else { # Last fold gets remaining fold_pos <- positive_idx[((i-1)*pos_per_fold + 1):length(positive_idx)] fold_neg <- negative_idx[((i-1)*neg_per_fold + 1):length(negative_idx)] } folds[[i]] <- c(fold_pos, fold_neg) } return(folds) } # Apply stratified CV folds <- stratified_cv(covid_screening_data, "covid_status", n_folds = 5) # Check fold balance for (i in 1:length(folds)) { fold_data <- covid_screening_data[folds[[i]],] prevalence <- mean(fold_data$covid_status == "Positive") cat("Fold", i, ": n =", length(folds[[i]]), ", prevalence =", round(prevalence * 100, 1), "%\n") }
This vignette has covered advanced features including:
These advanced features enable the Decision Panel Optimization module to handle complex real-world scenarios while maintaining computational efficiency and clinical relevance.
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.