Nothing
## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>",
fig.width = 7,
fig.height = 5,
error = TRUE
)
library(couplr)
library(dplyr)
## ----error=TRUE---------------------------------------------------------------
try({
# This fails: all assignments have Inf cost
cost <- matrix(c(1, Inf, Inf, Inf, Inf, Inf, Inf, 2, 3), nrow = 3, byrow = TRUE)
result <- lap_solve(cost)
})
## -----------------------------------------------------------------------------
# Check for infeasible rows
check_feasibility <- function(cost_matrix) {
finite_per_row <- rowSums(is.finite(cost_matrix))
infeasible_rows <- which(finite_per_row == 0)
if (length(infeasible_rows) > 0) {
cat("Infeasible rows (no finite costs):", infeasible_rows, "\n")
return(FALSE)
}
finite_per_col <- colSums(is.finite(cost_matrix))
infeasible_cols <- which(finite_per_col == 0)
if (length(infeasible_cols) > 0) {
cat("Infeasible columns (no finite costs):", infeasible_cols, "\n")
return(FALSE)
}
cat("Problem appears feasible\n")
return(TRUE)
}
# Check the problematic matrix
cost <- matrix(c(1, Inf, Inf, Inf, Inf, Inf, Inf, 2, 3), nrow = 3, byrow = TRUE)
check_feasibility(cost)
## -----------------------------------------------------------------------------
# Remove rows with no valid assignments
cost <- matrix(c(1, Inf, Inf, Inf, Inf, Inf, Inf, 2, 3), nrow = 3, byrow = TRUE)
valid_rows <- rowSums(is.finite(cost)) > 0
cost_valid <- cost[valid_rows, , drop = FALSE]
if (nrow(cost_valid) > 0) {
result <- lap_solve(cost_valid)
cat("Matched", nrow(result), "of", nrow(cost), "rows\n")
}
## -----------------------------------------------------------------------------
# Replace Inf with high (but finite) penalty
cost_with_fallback <- cost
cost_with_fallback[!is.finite(cost_with_fallback)] <- 1e6 # Large penalty
result <- lap_solve(cost_with_fallback)
print(result)
## ----fig.width=7, fig.height=4, fig.alt="Density plot showing overlap between treatment and control groups on the age variable"----
# Simulate poor overlap scenario
set.seed(123)
left <- tibble(id = 1:50, age = rnorm(50, mean = 25, sd = 3))
right <- tibble(id = 1:50, age = rnorm(50, mean = 55, sd = 3))
# Visualize overlap
library(ggplot2)
combined <- bind_rows(
left %>% mutate(group = "Left"),
right %>% mutate(group = "Right")
)
ggplot(combined, aes(x = age, fill = group)) +
geom_density(alpha = 0.5) +
labs(title = "Poor Covariate Overlap",
subtitle = "No overlap means no good matches possible") +
theme_minimal() +
theme(plot.background = element_rect(fill = "transparent", color = NA),
panel.background = element_rect(fill = "transparent", color = NA))
## -----------------------------------------------------------------------------
# Example of poor balance
set.seed(456)
left <- tibble(
id = 1:100,
age = rnorm(100, 30, 5),
income = rnorm(100, 80000, 20000) # Very different from right
)
right <- tibble(
id = 1:100,
age = rnorm(100, 32, 5),
income = rnorm(100, 40000, 10000) # Very different income
)
result <- match_couples(left, right, vars = c("age", "income"), auto_scale = TRUE)
balance <- balance_diagnostics(result, left, right, vars = c("age", "income"))
print(balance)
## ----eval=FALSE---------------------------------------------------------------
# # Include additional relevant variables
# result <- match_couples(
# left, right,
# vars = c("age", "income", "education", "region"), # More variables
# auto_scale = TRUE
# )
## -----------------------------------------------------------------------------
result_strict <- match_couples(
left, right,
vars = c("age", "income"),
max_distance = 0.3, # Stricter caliper
auto_scale = TRUE
)
cat("Original matches:", result$info$n_matched, "\n")
cat("With caliper:", result_strict$info$n_matched, "\n")
balance_strict <- balance_diagnostics(result_strict, left, right, vars = c("age", "income"))
print(balance_strict)
## ----eval=FALSE---------------------------------------------------------------
# # Block on income tertiles to ensure exact balance
# left$income_cat <- cut(left$income, breaks = 3, labels = c("low", "mid", "high"))
# right$income_cat <- cut(right$income, breaks = 3, labels = c("low", "mid", "high"))
#
# blocks <- matchmaker(left, right, block_type = "group", block_by = "income_cat")
# result_blocked <- match_couples(
# blocks$left, blocks$right,
# vars = c("age"), # Match on age within income blocks
# block_id = "block_id"
# )
## -----------------------------------------------------------------------------
# Compare scaling methods
for (scale_method in c("robust", "standardize", "range")) {
res <- match_couples(left, right, vars = c("age", "income"),
auto_scale = TRUE, scale = scale_method)
bal <- balance_diagnostics(res, left, right, vars = c("age", "income"))
cat(scale_method, "- max |std_diff|:",
round(bal$overall$max_abs_std_diff, 3), "\n")
}
## -----------------------------------------------------------------------------
# Estimate runtime
estimate_runtime <- function(n, seconds_per_billion = 1) {
ops <- n^3
time_sec <- ops / 1e9 * seconds_per_billion
if (time_sec < 60) {
sprintf("%.1f seconds", time_sec)
} else if (time_sec < 3600) {
sprintf("%.1f minutes", time_sec / 60)
} else {
sprintf("%.1f hours", time_sec / 3600)
}
}
cat("Estimated runtime for optimal matching:\n")
for (n in c(100, 500, 1000, 3000, 5000, 10000)) {
cat(sprintf(" n = %5d: %s\n", n, estimate_runtime(n)))
}
## -----------------------------------------------------------------------------
set.seed(789)
n <- 500
large_left <- tibble(id = 1:n, x1 = rnorm(n), x2 = rnorm(n))
large_right <- tibble(id = 1:n, x1 = rnorm(n), x2 = rnorm(n))
# Greedy is much faster
time_greedy <- system.time({
result_greedy <- greedy_couples(
large_left, large_right,
vars = c("x1", "x2"),
strategy = "row_best"
)
})
cat("Greedy matching (n=500):", round(time_greedy["elapsed"], 2), "seconds\n")
cat("Quality (mean distance):", round(mean(result_greedy$pairs$distance), 4), "\n")
## ----eval=FALSE---------------------------------------------------------------
# # Create clusters to match within
# blocks <- matchmaker(
# large_left, large_right,
# block_type = "cluster",
# block_vars = c("x1", "x2"),
# n_blocks = 10 # 10 blocks of ~200 each
# )
#
# # Match within blocks (10 x O(200^3) << O(2000^3))
# result_blocked <- match_couples(
# blocks$left, blocks$right,
# vars = c("x1", "x2"),
# block_id = "block_id"
# )
## ----eval=FALSE---------------------------------------------------------------
# # For n > 1000, auction algorithm often faster
# result <- match_couples(
# large_left, large_right,
# vars = c("x1", "x2"),
# method = "auction"
# )
#
# # For sparse problems (many forbidden pairs)
# result <- match_couples(
# left, right,
# vars = vars,
# max_distance = 0.5, # Creates sparsity
# method = "sap" # Sparse algorithm
# )
## -----------------------------------------------------------------------------
# Compute once, reuse multiple times
dist_cache <- compute_distances(
large_left, large_right,
vars = c("x1", "x2"),
scale = "robust"
)
# Fast: reuse cached distances
result1 <- match_couples(dist_cache, max_distance = 0.3)
result2 <- match_couples(dist_cache, max_distance = 0.5)
result3 <- match_couples(dist_cache, max_distance = 1.0)
## -----------------------------------------------------------------------------
# Memory requirements
memory_needed <- function(n) {
bytes <- 8 * n^2
if (bytes < 1e6) {
sprintf("%.1f KB", bytes / 1e3)
} else if (bytes < 1e9) {
sprintf("%.1f MB", bytes / 1e6)
} else {
sprintf("%.1f GB", bytes / 1e9)
}
}
cat("Memory for full distance matrix:\n")
for (n in c(1000, 5000, 10000, 20000, 50000)) {
cat(sprintf(" n = %5d: %s\n", n, memory_needed(n)))
}
## ----eval=FALSE---------------------------------------------------------------
# # Greedy computes distances on-the-fly
# result <- greedy_couples(
# left, right,
# vars = covariates,
# strategy = "row_best" # Most memory-efficient
# )
## ----eval=FALSE---------------------------------------------------------------
# # Each block is much smaller
# blocks <- matchmaker(left, right, block_type = "cluster", n_blocks = 20)
# result <- match_couples(blocks$left, blocks$right, vars = vars, block_id = "block_id")
## ----eval=FALSE---------------------------------------------------------------
# # Caliper excludes distant pairs (sparse representation)
# result <- match_couples(
# left, right,
# vars = covariates,
# max_distance = 0.5,
# method = "sap" # Sparse-optimized algorithm
# )
## ----eval=FALSE---------------------------------------------------------------
# # Increase to 16 GB (if available)
# memory.limit(size = 16000)
## -----------------------------------------------------------------------------
cost <- matrix(c(1, 2, 2, 2, 1, 2, 2, 2, 1), nrow = 3, byrow = TRUE)
result_jv <- lap_solve(cost, method = "jv")
result_hungarian <- lap_solve(cost, method = "hungarian")
cat("JV assignment: ", result_jv$target, "\n")
cat("Hungarian assignment:", result_hungarian$target, "\n")
cat("JV total cost: ", get_total_cost(result_jv), "\n")
cat("Hungarian total cost:", get_total_cost(result_hungarian), "\n")
## -----------------------------------------------------------------------------
# Check for ties
check_ties <- function(cost_matrix) {
n <- nrow(cost_matrix)
# Check if diagonal dominates (trivial ties)
diag_costs <- diag(cost_matrix)
if (length(unique(diag_costs)) < n) {
cat("Tied costs on diagonal - multiple optima likely\n")
}
# Check cost uniqueness
unique_costs <- length(unique(as.vector(cost_matrix)))
total_entries <- length(cost_matrix)
if (unique_costs < total_entries * 0.5) {
cat("Many repeated costs - ties possible\n")
}
}
check_ties(cost)
## -----------------------------------------------------------------------------
# Total cost should be identical
stopifnot(get_total_cost(result_jv) == get_total_cost(result_hungarian))
cat("Both methods found optimal solutions (same total cost)\n")
## -----------------------------------------------------------------------------
# Hungarian has consistent tie-breaking
result <- lap_solve(cost, method = "hungarian")
## -----------------------------------------------------------------------------
set.seed(42)
cost_perturbed <- cost + matrix(rnorm(9, 0, 1e-10), 3, 3)
result <- lap_solve(cost_perturbed)
## ----error=TRUE---------------------------------------------------------------
try({
left <- tibble(id = 1:5, age = c(25, 30, NA, 35, 40))
right <- tibble(id = 1:5, age = c(28, 32, 33, 36, 42))
# This may fail or give unexpected results
result <- match_couples(left, right, vars = "age")
})
## -----------------------------------------------------------------------------
left_clean <- left %>% filter(!is.na(age))
right_clean <- right %>% filter(!is.na(age))
result <- match_couples(left_clean, right_clean, vars = "age")
cat("Matched", result$info$n_matched, "pairs (excluded 1 left unit with NA)\n")
## -----------------------------------------------------------------------------
# Simple mean imputation
left_imputed <- left %>%
mutate(age = if_else(is.na(age), mean(age, na.rm = TRUE), age))
result <- match_couples(left_imputed, right, vars = "age")
cat("Matched", result$info$n_matched, "pairs (imputed 1 NA with mean)\n")
## -----------------------------------------------------------------------------
health <- preprocess_matching_vars(
left, right,
vars = "age"
)
print(health)
## ----eval=FALSE---------------------------------------------------------------
# # In R:
# remove.packages("couplr")
# install.packages("couplr")
#
# # Or from GitHub:
# devtools::install_github("gcol33/couplr", force = TRUE)
## ----eval=FALSE---------------------------------------------------------------
# devtools::clean_dll()
# devtools::load_all()
## -----------------------------------------------------------------------------
# Check if Rtools is properly configured
Sys.which("make")
Sys.which("g++")
## -----------------------------------------------------------------------------
cost <- matrix(c(1, NA, 3, 4), 2, 2)
cost[is.na(cost)] <- Inf # Mark as forbidden
result <- lap_solve(cost)
## -----------------------------------------------------------------------------
# Always verify data before matching
stopifnot(nrow(left) > 0, nrow(right) > 0)
## ----eval=FALSE---------------------------------------------------------------
# # Minimal example template
# library(couplr)
#
# # Minimal data that reproduces the issue
# set.seed(123)
# left <- tibble(id = 1:10, x = rnorm(10))
# right <- tibble(id = 1:10, x = rnorm(10))
#
# # Code that causes the error
# result <- match_couples(left, right, vars = "x")
#
# # Expected vs actual behavior
# # Expected: ...
# # Actual: [error message]
#
# # Session info
# sessionInfo()
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.