inst/doc/matching-workflows.R

## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 5
)
library(couplr)
library(dplyr)
library(ggplot2)

## ----basic-example------------------------------------------------------------
# Simulate observational data
set.seed(123)
n_left <- 100
n_right <- 150

# Treatment group (tends to be younger, higher income)
left_data <- tibble(
  id = 1:n_left,
  age = rnorm(n_left, mean = 45, sd = 10),
  income = rnorm(n_left, mean = 60000, sd = 15000),
  education = sample(c("HS", "BA", "MA"), n_left, replace = TRUE),
  group = "treatment"
)

# Control group (older, lower income on average)
right_data <- tibble(
  id = 1:n_right,
  age = rnorm(n_right, mean = 52, sd = 12),
  income = rnorm(n_right, mean = 50000, sd = 18000),
  education = sample(c("HS", "BA", "MA"), n_right, replace = TRUE),
  group = "control"
)

# Perform optimal matching
result <- match_couples(
  left = left_data,
  right = right_data,
  vars = c("age", "income"),
  auto_scale = TRUE,
  return_diagnostics = TRUE
)

# View matched pairs
head(result$pairs)

# Summary statistics
result$info

## ----inspect-output, fig.alt="Histogram showing the distribution of match distances, with most matches having low distances indicating good match quality"----
# How many matched?
cat("Matched pairs:", result$info$n_matched, "\n")
cat("Unmatched left:", nrow(result$left_unmatched), "\n")
cat("Unmatched right:", nrow(result$right_unmatched), "\n")

# Distribution of match distances
summary(result$pairs$distance)

# Visualize match quality
ggplot(result$pairs, aes(x = distance)) +
  geom_histogram(bins = 30, fill = "#29abe0", alpha = 0.7) +
  labs(
    title = "Distribution of Match Distances",
    x = "Euclidean Distance (scaled)",
    y = "Count"
  ) +
  theme_minimal() +
  theme(plot.background = element_rect(fill = "transparent", color = NA),
        panel.background = element_rect(fill = "transparent", color = NA),
        panel.grid = element_blank())

## ----preprocessing-demo-------------------------------------------------------
# Create data with scaling challenges
set.seed(456)
challenging_data <- tibble(
  id = 1:50,
  age = rnorm(50, 50, 10),                    # Years (reasonable scale)
  income = rnorm(50, 60000, 20000),           # Dollars (large scale)
  bmi = rnorm(50, 25, 5),                     # Ratio (small scale)
  smoker = sample(0:1, 50, replace = TRUE),   # Binary
  education = factor(
    sample(c("HS", "BA", "MA", "PhD"), 50, replace = TRUE),
    ordered = TRUE,
    levels = c("HS", "BA", "MA", "PhD")
  )
)

left_chal <- challenging_data[1:25, ]
right_chal <- challenging_data[26:50, ]

# Match WITHOUT auto-scaling (income dominates)
result_no_scale <- match_couples(
  left_chal, right_chal,
  vars = c("age", "income", "bmi"),
  auto_scale = FALSE
)

# Match WITH auto-scaling (all variables contribute)
result_scaled <- match_couples(
  left_chal, right_chal,
  vars = c("age", "income", "bmi"),
  auto_scale = TRUE,
  scale = "robust"  # Median/MAD scaling (robust to outliers)
)

# Compare match quality
cat("Without scaling - mean distance:", mean(result_no_scale$pairs$distance), "\n")
cat("With scaling - mean distance:", mean(result_scaled$pairs$distance), "\n")

## ----scaling-comparison-------------------------------------------------------
# Demonstrate scaling methods
demo_var <- c(10, 20, 25, 30, 100)  # Contains outlier (100)

# Function to show scaling
show_scaling <- function(x, method) {
  left_demo <- tibble(id = 1:3, var = x[1:3])
  right_demo <- tibble(id = 1:2, var = x[4:5])

  result <- match_couples(
    left_demo, right_demo,
    vars = "var",
    auto_scale = TRUE,
    scale = method,
    return_diagnostics = TRUE
  )

  # Extract scaled values from cost matrix
  cat(method, "scaling:\n")
  cat("  Original:", x, "\n")
  cat("  Distance matrix diagonal:", diag(result$cost_matrix)[1:2], "\n\n")
}

show_scaling(demo_var, "robust")
show_scaling(demo_var, "standardize")
show_scaling(demo_var, "range")

## ----health-checks, eval=FALSE------------------------------------------------
# # Create data with issues
# problematic_data <- tibble(
#   id = 1:100,
#   age = rnorm(100, 50, 10),
#   constant_var = 5,                           # No variation - will warn
#   mostly_missing = c(rnorm(20, 50, 10), rep(NA, 80)),  # >50% missing
#   extreme_skew = rexp(100, rate = 0.1)       # Very skewed
# )
# 
# # Attempt matching - will show warnings
# result <- match_couples(
#   problematic_data[1:50, ],
#   problematic_data[51:100, ],
#   vars = c("age", "constant_var", "mostly_missing", "extreme_skew"),
#   auto_scale = TRUE
# )
# 
# # Warning messages will indicate:
# # - "constant_var excluded (SD = 0)"
# # - "mostly_missing has 80% missing values"
# # - "extreme_skew has high skewness (3.2)"

## ----optimal-vs-greedy--------------------------------------------------------
# Create moderately large dataset
set.seed(789)
n <- 1000
large_left <- tibble(
  id = 1:n,
  x1 = rnorm(n),
  x2 = rnorm(n),
  x3 = rnorm(n)
)
large_right <- tibble(
  id = 1:n,
  x1 = rnorm(n),
  x2 = rnorm(n),
  x3 = rnorm(n)
)

# Optimal matching
time_optimal <- system.time({
  result_optimal <- match_couples(
    large_left, large_right,
    vars = c("x1", "x2", "x3"),
    method = "hungarian"
  )
})

# Greedy matching (row_best strategy)
time_greedy <- system.time({
  result_greedy <- greedy_couples(
    large_left, large_right,
    vars = c("x1", "x2", "x3"),
    strategy = "row_best"
  )
})

# Compare
cat("Optimal matching:\n")
cat("  Time:", round(time_optimal["elapsed"], 3), "seconds\n")
cat("  Mean distance:", round(mean(result_optimal$pairs$distance), 4), "\n\n")

cat("Greedy matching:\n")
cat("  Time:", round(time_greedy["elapsed"], 3), "seconds\n")
cat("  Mean distance:", round(mean(result_greedy$pairs$distance), 4), "\n")
cat("  Speedup:", round(time_optimal["elapsed"] / time_greedy["elapsed"], 1), "x\n")

## ----greedy-strategies--------------------------------------------------------
# Compare greedy strategies on same data
set.seed(101)
test_left <- tibble(id = 1:200, x = rnorm(200))
test_right <- tibble(id = 1:200, x = rnorm(200))

strategies <- c("sorted", "row_best", "pq")
results <- list()

for (strat in strategies) {
  time <- system.time({
    result <- greedy_couples(
      test_left, test_right,
      vars = "x",
      strategy = strat
    )
  })

  results[[strat]] <- list(
    time = time["elapsed"],
    mean_dist = mean(result$pairs$distance),
    total_dist = result$info$total_distance
  )
}

# Display comparison
comparison <- do.call(rbind, lapply(names(results), function(s) {
  data.frame(
    strategy = s,
    time_sec = round(results[[s]]$time, 4),
    mean_distance = round(results[[s]]$mean_dist, 4),
    total_distance = round(results[[s]]$total_dist, 2)
  )
}))

print(comparison)

## ----calipers, fig.alt="Overlapping histograms comparing match distances with and without caliper constraint, showing the caliper excludes distant matches"----
# Create data where some units are far apart
set.seed(202)
left_cal <- tibble(
  id = 1:50,
  x = c(rnorm(40, mean = 0, sd = 1), rnorm(10, mean = 5, sd = 0.5))  # Some outliers
)
right_cal <- tibble(
  id = 1:50,
  x = rnorm(50, mean = 0, sd = 1)
)

# Match without caliper - pairs everything
result_no_cal <- match_couples(
  left_cal, right_cal,
  vars = "x",
  auto_scale = FALSE
)

# Match with caliper - excludes poor matches
result_with_cal <- match_couples(
  left_cal, right_cal,
  vars = "x",
  max_distance = 1.5,  # Caliper: max distance = 1.5
  auto_scale = FALSE
)

cat("Without caliper:\n")
cat("  Matched:", result_no_cal$info$n_matched, "\n")
cat("  Mean distance:", round(mean(result_no_cal$pairs$distance), 3), "\n")
cat("  Max distance:", round(max(result_no_cal$pairs$distance), 3), "\n\n")

cat("With caliper (1.5):\n")
cat("  Matched:", result_with_cal$info$n_matched, "\n")
cat("  Mean distance:", round(mean(result_with_cal$pairs$distance), 3), "\n")
cat("  Max distance:", round(max(result_with_cal$pairs$distance), 3), "\n")

# Visualize caliper effect
ggplot(result_no_cal$pairs, aes(x = distance)) +
  geom_histogram(aes(fill = "No caliper"), bins = 30, alpha = 0.5) +
  geom_histogram(
    data = result_with_cal$pairs,
    aes(fill = "With caliper"),
    bins = 30,
    alpha = 0.5
  ) +
  geom_vline(xintercept = 1.5, linetype = "dashed", color = "red") +
  labs(
    title = "Caliper Effect on Match Distances",
    x = "Distance",
    y = "Count",
    fill = "Condition"
  ) +
  theme_minimal() +
  theme(plot.background = element_rect(fill = "transparent", color = NA),
        panel.background = element_rect(fill = "transparent", color = NA),
        legend.background = element_rect(fill = "transparent", color = NA),
        panel.grid = element_blank())

## ----caliper-sd, eval=FALSE---------------------------------------------------
# # Calculate pooled SD
# combined <- bind_rows(
#   left_data %>% mutate(group = "left"),
#   right_data %>% mutate(group = "right")
# )
# 
# pooled_sd <- sd(combined$age)  # For single variable
# caliper_width <- 0.2 * pooled_sd
# 
# result <- match_couples(
#   left_data, right_data,
#   vars = "age",
#   max_distance = caliper_width
# )

## ----caliper-empirical--------------------------------------------------------
# Fit all matches first
all_matches <- match_couples(left_cal, right_cal, vars = "x")

# Choose caliper at 90th percentile
caliper_90 <- quantile(all_matches$pairs$distance, 0.90)

# Refit with caliper
refined_matches <- match_couples(
  left_cal, right_cal,
  vars = "x",
  max_distance = caliper_90
)

cat("90th percentile caliper:", round(caliper_90, 3), "\n")
cat("Matches retained:",
    round(100 * refined_matches$info$n_matched / all_matches$info$n_matched, 1), "%\n")

## ----blocking-exact-----------------------------------------------------------
# Create multi-site data
set.seed(303)
multi_site <- bind_rows(
  tibble(
    id = 1:100,
    site = sample(c("A", "B", "C"), 100, replace = TRUE),
    age = rnorm(100, 50, 10),
    income = rnorm(100, 55000, 15000),
    group = "treatment"
  ),
  tibble(
    id = 101:250,
    site = sample(c("A", "B", "C"), 150, replace = TRUE),
    age = rnorm(150, 50, 10),
    income = rnorm(150, 55000, 15000),
    group = "control"
  )
)

left_site <- multi_site %>% filter(group == "treatment")
right_site <- multi_site %>% filter(group == "control")

# Create exact blocks by site
blocks <- matchmaker(
  left = left_site,
  right = right_site,
  block_type = "group",
  block_by = "site"
)

cat("Blocking structure:\n")
print(blocks$block_summary)

# Match within blocks
result_blocked <- match_couples(
  left = blocks$left,
  right = blocks$right,
  vars = c("age", "income"),
  block_id = "block_id",  # Use block IDs from matchmaker
  auto_scale = TRUE
)

# Verify exact site balance
result_blocked$pairs %>%
  mutate(left_id = as.integer(left_id), right_id = as.integer(right_id)) %>%
  left_join(left_site %>% select(id, site), by = c("left_id" = "id")) %>%
  left_join(right_site %>% select(id, site), by = c("right_id" = "id"), suffix = c("_left", "_right")) %>%
  count(site_left, site_right)

## ----blocking-cluster---------------------------------------------------------
# Create blocks based on age groups (data-driven)
cluster_blocks <- matchmaker(
  left = left_site,
  right = right_site,
  block_type = "cluster",
  block_vars = "age",
  n_blocks = 3
)

cat("Cluster-based blocks:\n")
print(cluster_blocks$block_summary)

# Match within clusters
result_clustered <- match_couples(
  left = cluster_blocks$left,
  right = cluster_blocks$right,
  vars = c("age", "income"),
  block_id = "block_id",
  auto_scale = TRUE
)

# Show age distribution by cluster
cluster_blocks$left %>%
  group_by(block_id) %>%
  summarise(
    n = n(),
    mean_age = mean(age),
    sd_age = sd(age)
  )

## ----balance-assessment-------------------------------------------------------
# Perform matching
match_result <- match_couples(
  left = left_data,
  right = right_data,
  vars = c("age", "income"),
  auto_scale = TRUE
)

# Get matched samples
matched_left <- left_data %>%
  filter(id %in% match_result$pairs$left_id)

matched_right <- right_data %>%
  filter(id %in% match_result$pairs$right_id)

# Compute balance diagnostics
balance <- balance_diagnostics(
  result = match_result,
  left = left_data,
  right = right_data,
  vars = c("age", "income")
)

# Print balance summary
print(balance)

# Extract balance table for reporting
balance_table(balance)

## ----balance-plots, fig.alt="Bar chart comparing standardized differences before and after matching, with threshold lines at 0.1 and 0.25 showing improved balance after matching"----
# Before-after balance plot
# (Requires creating pre-match balance for comparison)

# For pre-match comparison, just compute summary statistics directly
pre_match_stats <- tibble(
  variable = c("age", "income"),
  std_diff = c(
    (mean(left_data$age) - mean(right_data$age)) / sqrt((sd(left_data$age)^2 + sd(right_data$age)^2) / 2),
    (mean(left_data$income) - mean(right_data$income)) / sqrt((sd(left_data$income)^2 + sd(right_data$income)^2) / 2)
  ),
  when = "Before"
)

# Combine for plotting
balance_comparison <- bind_rows(
  pre_match_stats,
  balance$var_stats %>% select(variable, std_diff) %>% mutate(when = "After")
)

ggplot(balance_comparison, aes(x = variable, y = std_diff, fill = when)) +
  geom_hline(yintercept = c(-0.1, 0.1), linetype = "dashed", color = "#93c54b", linewidth = 0.8) +
  geom_hline(yintercept = c(-0.25, 0.25), linetype = "dashed", color = "#f47c3c", linewidth = 0.8) +
  geom_col(position = "dodge", width = 0.5) +
  geom_label(aes(x = 1.5, y = -0.1, label = "±0.1 excellent"),
             fill = "#93c54b", color = "white", inherit.aes = FALSE,
             size = 3, fontface = "bold", label.padding = unit(0.2, "lines")) +
  geom_label(aes(x = 1.5, y = 0.25, label = "±0.25 acceptable"),
             fill = "#f47c3c", color = "white", inherit.aes = FALSE,
             size = 3, fontface = "bold", label.padding = unit(0.2, "lines")) +
  labs(
    title = "Covariate Balance Before and After Matching",
    x = "Variable",
    y = "Standardized Difference",
    fill = "Timing"
  ) +
  theme_minimal() +
  theme(plot.background = element_rect(fill = "transparent", color = NA),
        panel.background = element_rect(fill = "transparent", color = NA),
        legend.background = element_rect(fill = "transparent", color = NA),
        panel.grid = element_blank()) +
  coord_flip()

## ----real-world-data----------------------------------------------------------
set.seed(404)

# Simulate realistic scenario with selection bias
# Program attracts younger, more educated, currently employed individuals
create_participant <- function(n, is_treatment) {
  if (is_treatment) {
    tibble(
      id = 1:n,
      age = rnorm(n, mean = 35, sd = 8),
      education_years = rnorm(n, mean = 14, sd = 2),
      prior_earnings = rnorm(n, mean = 35000, sd = 10000),
      employed = sample(c(0, 1), n, replace = TRUE, prob = c(0.3, 0.7)),
      treatment = 1
    )
  } else {
    tibble(
      id = (n+1):(n+500),
      age = rnorm(500, mean = 42, sd = 12),
      education_years = rnorm(500, mean = 12, sd = 3),
      prior_earnings = rnorm(500, mean = 30000, sd = 12000),
      employed = sample(c(0, 1), 500, replace = TRUE, prob = c(0.5, 0.5)),
      treatment = 0
    )
  }
}

treatment_group <- create_participant(200, TRUE)
control_group <- create_participant(500, FALSE)

# Simulate outcome (earnings) with treatment effect
# True effect: +$5,000, with heterogeneity
treatment_group <- treatment_group %>%
  mutate(
    earnings = prior_earnings +
      5000 +  # True treatment effect
      2000 * rnorm(n()) +  # Random variation
      100 * education_years  # Education effect
  )

control_group <- control_group %>%
  mutate(
    earnings = prior_earnings +
      2000 * rnorm(n()) +
      100 * education_years
  )

# Examine baseline imbalance
cat("Pre-matching differences:\n")
cat("Age diff:",
    mean(treatment_group$age) - mean(control_group$age), "\n")
cat("Education diff:",
    mean(treatment_group$education_years) - mean(control_group$education_years), "\n")
cat("Prior earnings diff:",
    mean(treatment_group$prior_earnings) - mean(control_group$prior_earnings), "\n")

## ----real-world-matching------------------------------------------------------
# Match on baseline covariates
job_match <- match_couples(
  left = treatment_group,
  right = control_group,
  vars = c("age", "education_years", "prior_earnings", "employed"),
  auto_scale = TRUE,
  scale = "robust",
  return_diagnostics = TRUE
)

cat("Matching summary:\n")
cat("  Treated units:", nrow(treatment_group), "\n")
cat("  Matched treated:", job_match$info$n_matched, "\n")
cat("  Match rate:",
    round(100 * job_match$info$n_matched / nrow(treatment_group), 1), "%\n")

## ----real-world-balance-------------------------------------------------------
# Extract matched samples
matched_treated <- treatment_group %>%
  filter(id %in% job_match$pairs$left_id)

matched_control <- control_group %>%
  filter(id %in% job_match$pairs$right_id)

# Compute balance
job_balance <- balance_diagnostics(
  result = job_match,
  left = treatment_group,
  right = control_group,
  vars = c("age", "education_years", "prior_earnings", "employed")
)

print(job_balance)

# Check overall balance quality
cat("\nOverall balance:\n")
cat("  Mean |std diff|:", round(job_balance$overall$mean_abs_std_diff, 3), "\n")
cat("  Max |std diff|:", round(job_balance$overall$max_abs_std_diff, 3), "\n")
cat("  % with |std diff| > 0.1:",
    round(job_balance$overall$pct_large_imbalance, 1), "%\n")

## ----real-world-ate-----------------------------------------------------------
# Naive estimate (without matching) - BIASED
naive_effect <- mean(treatment_group$earnings) - mean(control_group$earnings)

# Matched estimate - accounts for baseline differences
matched_effect <- mean(matched_treated$earnings) - mean(matched_control$earnings)

# Paired t-test for significance
paired_comparison <- tibble(
  treated = matched_treated$earnings,
  control = matched_control$earnings[match(
    matched_treated$id,
    job_match$pairs$left_id
  )]
)

t_test <- t.test(paired_comparison$treated, paired_comparison$control, paired = TRUE)

# Report results
cat("Treatment Effect Estimates:\n\n")
cat("Naive (unmatched):\n")
cat("  Difference: $", round(naive_effect, 0), "\n")
cat("  (Upward biased due to selection)\n\n")

cat("Matched estimate:\n")
cat("  Difference: $", round(matched_effect, 0), "\n")
cat("  95% CI: ($", round(t_test$conf.int[1], 0), ", $",
    round(t_test$conf.int[2], 0), ")\n")
cat("  P-value:", format.pval(t_test$p.value, digits = 3), "\n")
cat("  (Closer to true effect of $5,000)\n")

## ----real-world-table, eval=FALSE---------------------------------------------
# # Table 1: Balance table
# balance_publication <- balance_table(job_balance)
# print(balance_publication)
# 
# # Table 2: Sample characteristics
# sample_table <- bind_rows(
#   matched_treated %>%
#     summarise(
#       Group = "Treatment",
#       N = n(),
#       `Age (mean ± SD)` = sprintf("%.1f ± %.1f", mean(age), sd(age)),
#       `Education (years)` = sprintf("%.1f ± %.1f", mean(education_years), sd(education_years)),
#       `Prior Earnings` = sprintf("$%s ± %s",
#                                  format(round(mean(prior_earnings)), big.mark = ","),
#                                  format(round(sd(prior_earnings)), big.mark = ",")),
#       `Employed (%)` = sprintf("%.1f", 100 * mean(employed))
#     ),
#   matched_control %>%
#     summarise(
#       Group = "Control",
#       N = n(),
#       `Age (mean ± SD)` = sprintf("%.1f ± %.1f", mean(age), sd(age)),
#       `Education (years)` = sprintf("%.1f ± %.1f", mean(education_years), sd(education_years)),
#       `Prior Earnings` = sprintf("$%s ± %s",
#                                  format(round(mean(prior_earnings)), big.mark = ","),
#                                  format(round(sd(prior_earnings)), big.mark = ",")),
#       `Employed (%)` = sprintf("%.1f", 100 * mean(employed))
#     )
# )
# 
# print(sample_table)
# 
# # Table 3: Treatment effect
# effect_table <- tibble(
#   Method = c("Unmatched", "Matched"),
#   `N (Treated)` = c(nrow(treatment_group), nrow(matched_treated)),
#   `N (Control)` = c(nrow(control_group), nrow(matched_control)),
#   `Effect Estimate` = sprintf("$%s", format(round(c(naive_effect, matched_effect)), big.mark = ",")),
#   `95% CI` = c("--", sprintf("($%s, $%s)",
#                             format(round(t_test$conf.int[1]), big.mark = ","),
#                             format(round(t_test$conf.int[2]), big.mark = ",")))
# )
# 
# print(effect_table)

## ----optimization-blocking, eval=FALSE----------------------------------------
# # Instead of matching 10,000 × 10,000:
# # Create 10 blocks of ~1,000 × 1,000 each
# blocks <- matchmaker(
#   left_large, right_large,
#   block_type = "cluster",
#   cluster_vars = "age",
#   n_clusters = 10
# )
# 
# # Much faster: 10 * O(1000^3) << O(10000^3)
# result <- match_couples(
#   blocks$left, blocks$right,
#   vars = covariates,
#   block_id = "block_id"
# )

## ----optimization-greedy, eval=FALSE------------------------------------------
# # Quick greedy match for exploration
# quick <- greedy_couples(
#   left_data, right_data,
#   vars = covariates,
#   strategy = "row_best"
# )
# 
# # Assess balance
# balance_quick <- balance_diagnostics(quick, left_data, right_data, vars = covariates)
# 
# # If balance is acceptable, done!
# # If not, try optimal or add blocking

## ----optimization-caliper, eval=FALSE-----------------------------------------
# # Caliper removes distant pairs from cost matrix
# # Can dramatically reduce effective problem size
# result <- match_couples(
#   left_data, right_data,
#   vars = covariates,
#   max_distance = 0.25,  # Strict caliper
#   auto_scale = TRUE
# )

## ----poor-balance-fix, eval=FALSE---------------------------------------------
# # 1. Add more matching variables
# result <- match_couples(left, right,
#                         vars = c("age", "income", "education", "region"),  # Added!
#                         auto_scale = TRUE)
# 
# # 2. Tighten caliper (fewer but better matches)
# result <- match_couples(left, right, vars = vars,
#                         max_distance = 0.1)  # Was 0.5
# 
# # 3. Block on the problematic variable
# blocks <- matchmaker(left, right, block_type = "group", block_by = "region")
# result <- match_couples(blocks$left, blocks$right, vars = other_vars,
#                         block_id = "block_id")

## ----few-matches-diagnosis, eval=FALSE----------------------------------------
# # Check covariate overlap
# library(ggplot2)
# combined <- bind_rows(
#   left %>% mutate(group = "treatment"),
#   right %>% mutate(group = "control")
# )
# ggplot(combined, aes(x = age, fill = group)) +
#   geom_density(alpha = 0.5) +
#   labs(title = "Check for Overlap")

## ----slow-matching-fix, eval=FALSE--------------------------------------------
# # For n > 3000: use greedy
# result <- greedy_couples(left, right, vars = vars, strategy = "sorted")
# 
# # For n > 5000: add blocking
# blocks <- matchmaker(left, right, block_type = "cluster", n_blocks = 20)
# result <- match_couples(blocks$left, blocks$right, vars = vars,
#                         block_id = "block_id")

## ----workflow-diagram, fig.width=8, fig.height=6, echo=FALSE, fig.alt="Flowchart showing recommended matching workflow with iterative refinement loop"----
library(ggplot2)

# Define workflow nodes
nodes <- data.frame(
  label = c("1. Explore data\nIdentify confounders",
            "2. First match\nmatch_couples()",
            "3. Check balance\nbalance_diagnostics()",
            "Balance\nOK?",
            "4. Estimate\neffect",
            "Refine: caliper,\nblocking, more vars"),
  x = c(4, 4, 4, 4, 2, 6),
  y = c(6, 5, 4, 3, 2, 2),
  type = c("step", "step", "step", "decision", "step", "step")
)

# Define edges
edges <- data.frame(
  from_x = c(4, 4, 4, 4, 4, 6),
  from_y = c(6, 5, 4, 3, 3, 2),
  to_x = c(4, 4, 4, 2, 6, 4),
  to_y = c(5, 4, 3, 2, 2, 4),
  label = c("", "", "", "Yes", "No", ""),
  label_x = c(NA, NA, NA, 2.8, 5.2, 5.5),
  label_y = c(NA, NA, NA, 2.6, 2.6, 3)
)

ggplot() +
  # Draw straight edges (vertical arrows)
  geom_segment(data = edges[1:3, ],
               aes(x = from_x, y = from_y, xend = to_x, yend = to_y),
               arrow = arrow(length = unit(0.15, "cm"), type = "closed"),
               color = "#3e3f3a", linewidth = 0.7) +

  # Draw decision branches
  geom_segment(data = edges[4:5, ],
               aes(x = from_x, y = from_y, xend = to_x, yend = to_y),
               arrow = arrow(length = unit(0.15, "cm"), type = "closed"),
               color = "#3e3f3a", linewidth = 0.7) +
  # Draw return loop (curved)
  geom_curve(data = edges[6, ],
             aes(x = from_x, y = from_y, xend = to_x, yend = to_y),
             arrow = arrow(length = unit(0.15, "cm"), type = "closed"),
             color = "#3e3f3a", linewidth = 0.7, curvature = 0.3) +
  # Edge labels - no explicit color so CSS can control dark mode
  geom_text(data = edges[!is.na(edges$label_x), ],
            aes(x = label_x, y = label_y, label = label),
            size = 3, fontface = "italic") +
  # Step nodes (rectangles) - sandstone info color with white text
  geom_label(data = nodes[nodes$type == "step", ],
             aes(x = x, y = y, label = label), size = 3,
             fill = "#29abe0", color = "white",
             label.padding = unit(0.4, "lines"), label.r = unit(0.15, "lines"),
             lineheight = 0.9) +
  # Decision node (diamond) - sandstone warning color
  geom_point(data = nodes[nodes$type == "decision", ],
             aes(x = x, y = y), shape = 23, size = 20,
             fill = "#ffc107", color = "#f47c3c", stroke = 1.5) +
  # Decision text - use default color that adapts to dark mode
  geom_text(data = nodes[nodes$type == "decision", ],
            aes(x = x, y = y, label = label), size = 3, lineheight = 0.85) +
  # Title
  labs(title = "Matching Workflow") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 12),
        axis.text = element_blank(),
        axis.title = element_blank(),
        panel.grid = element_blank(),
        plot.background = element_rect(fill = "transparent", color = NA),
        panel.background = element_rect(fill = "transparent", color = NA)) +
  coord_cartesian(xlim = c(0, 8), ylim = c(1.5, 6.5))

Try the couplr package in your browser

Any scripts or data that you put into this service are public.

couplr documentation built on Jan. 20, 2026, 5:07 p.m.