knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width = 7, fig.height = 5, error = TRUE ) library(couplr) library(dplyr)
This vignette provides solutions to common issues encountered when using couplr. Each section describes a problem, explains why it occurs, and provides step-by-step solutions.
Quick reference:
| Issue | Jump to Section | |-------|-----------------| | No valid pairs found | Infeasible Problems | | Poor balance after matching | Poor Balance | | Matching is too slow | Performance Issues | | Memory errors | Memory Errors | | Different results with different methods | Non-Determinism | | Unexpected NA in results | Missing Values | | C++ compilation errors | Installation Issues |
# This fails: all assignments have Inf cost cost <- matrix(c(1, Inf, Inf, Inf, Inf, Inf, Inf, 2, 3), nrow = 3, byrow = TRUE) result <- lap_solve(cost)
Error message: "No feasible assignment found" or total cost is Inf.
A feasible assignment requires that every row can be assigned to at least one column with finite cost. When rows have all-Inf or all-NA entries, no valid assignment exists.
# Check for infeasible rows check_feasibility <- function(cost_matrix) { finite_per_row <- rowSums(is.finite(cost_matrix)) infeasible_rows <- which(finite_per_row == 0) if (length(infeasible_rows) > 0) { cat("Infeasible rows (no finite costs):", infeasible_rows, "\n") return(FALSE) } finite_per_col <- colSums(is.finite(cost_matrix)) infeasible_cols <- which(finite_per_col == 0) if (length(infeasible_cols) > 0) { cat("Infeasible columns (no finite costs):", infeasible_cols, "\n") return(FALSE) } cat("Problem appears feasible\n") return(TRUE) } # Check the problematic matrix cost <- matrix(c(1, Inf, Inf, Inf, Inf, Inf, Inf, 2, 3), nrow = 3, byrow = TRUE) check_feasibility(cost)
1. Remove infeasible rows/columns:
# Remove rows with no valid assignments cost <- matrix(c(1, Inf, Inf, Inf, Inf, Inf, Inf, 2, 3), nrow = 3, byrow = TRUE) valid_rows <- rowSums(is.finite(cost)) > 0 cost_valid <- cost[valid_rows, , drop = FALSE] if (nrow(cost_valid) > 0) { result <- lap_solve(cost_valid) cat("Matched", nrow(result), "of", nrow(cost), "rows\n") }
2. Add fallback costs:
# Replace Inf with high (but finite) penalty cost_with_fallback <- cost cost_with_fallback[!is.finite(cost_with_fallback)] <- 1e6 # Large penalty result <- lap_solve(cost_with_fallback) print(result)
3. For matching: check covariate overlap:
# Simulate poor overlap scenario set.seed(123) left <- tibble(id = 1:50, age = rnorm(50, mean = 25, sd = 3)) right <- tibble(id = 1:50, age = rnorm(50, mean = 55, sd = 3)) # Visualize overlap library(ggplot2) combined <- bind_rows( left %>% mutate(group = "Left"), right %>% mutate(group = "Right") ) ggplot(combined, aes(x = age, fill = group)) + geom_density(alpha = 0.5) + labs(title = "Poor Covariate Overlap", subtitle = "No overlap means no good matches possible") + theme_minimal() + theme(plot.background = element_rect(fill = "transparent", color = NA), panel.background = element_rect(fill = "transparent", color = NA))
After matching, balance_diagnostics() shows |std_diff| > 0.25 for some variables.
# Example of poor balance set.seed(456) left <- tibble( id = 1:100, age = rnorm(100, 30, 5), income = rnorm(100, 80000, 20000) # Very different from right ) right <- tibble( id = 1:100, age = rnorm(100, 32, 5), income = rnorm(100, 40000, 10000) # Very different income ) result <- match_couples(left, right, vars = c("age", "income"), auto_scale = TRUE) balance <- balance_diagnostics(result, left, right, vars = c("age", "income")) print(balance)
1. Add more matching variables:
# Include additional relevant variables result <- match_couples( left, right, vars = c("age", "income", "education", "region"), # More variables auto_scale = TRUE )
2. Tighten caliper (fewer but better matches):
result_strict <- match_couples( left, right, vars = c("age", "income"), max_distance = 0.3, # Stricter caliper auto_scale = TRUE ) cat("Original matches:", result$info$n_matched, "\n") cat("With caliper:", result_strict$info$n_matched, "\n") balance_strict <- balance_diagnostics(result_strict, left, right, vars = c("age", "income")) print(balance_strict)
3. Use blocking on problematic variable:
# Block on income tertiles to ensure exact balance left$income_cat <- cut(left$income, breaks = 3, labels = c("low", "mid", "high")) right$income_cat <- cut(right$income, breaks = 3, labels = c("low", "mid", "high")) blocks <- matchmaker(left, right, block_type = "group", block_by = "income_cat") result_blocked <- match_couples( blocks$left, blocks$right, vars = c("age"), # Match on age within income blocks block_id = "block_id" )
4. Try different scaling:
# Compare scaling methods for (scale_method in c("robust", "standardize", "range")) { res <- match_couples(left, right, vars = c("age", "income"), auto_scale = TRUE, scale = scale_method) bal <- balance_diagnostics(res, left, right, vars = c("age", "income")) cat(scale_method, "- max |std_diff|:", round(bal$overall$max_abs_std_diff, 3), "\n") }
match_couples() or lap_solve() takes too long or doesn't complete.
Optimal matching has $O(n^3)$ complexity. For n = 5,000, this means ~125 billion operations.
# Estimate runtime estimate_runtime <- function(n, seconds_per_billion = 1) { ops <- n^3 time_sec <- ops / 1e9 * seconds_per_billion if (time_sec < 60) { sprintf("%.1f seconds", time_sec) } else if (time_sec < 3600) { sprintf("%.1f minutes", time_sec / 60) } else { sprintf("%.1f hours", time_sec / 3600) } } cat("Estimated runtime for optimal matching:\n") for (n in c(100, 500, 1000, 3000, 5000, 10000)) { cat(sprintf(" n = %5d: %s\n", n, estimate_runtime(n))) }
1. Use greedy matching for large problems:
set.seed(789) n <- 500 large_left <- tibble(id = 1:n, x1 = rnorm(n), x2 = rnorm(n)) large_right <- tibble(id = 1:n, x1 = rnorm(n), x2 = rnorm(n)) # Greedy is much faster time_greedy <- system.time({ result_greedy <- greedy_couples( large_left, large_right, vars = c("x1", "x2"), strategy = "row_best" ) }) cat("Greedy matching (n=500):", round(time_greedy["elapsed"], 2), "seconds\n") cat("Quality (mean distance):", round(mean(result_greedy$pairs$distance), 4), "\n")
2. Use blocking to divide the problem:
# Create clusters to match within blocks <- matchmaker( large_left, large_right, block_type = "cluster", block_vars = c("x1", "x2"), n_blocks = 10 # 10 blocks of ~200 each ) # Match within blocks (10 x O(200^3) << O(2000^3)) result_blocked <- match_couples( blocks$left, blocks$right, vars = c("x1", "x2"), block_id = "block_id" )
3. Choose a faster algorithm:
# For n > 1000, auction algorithm often faster result <- match_couples( large_left, large_right, vars = c("x1", "x2"), method = "auction" ) # For sparse problems (many forbidden pairs) result <- match_couples( left, right, vars = vars, max_distance = 0.5, # Creates sparsity method = "sap" # Sparse algorithm )
4. Pre-compute and cache distances:
# Compute once, reuse multiple times dist_cache <- compute_distances( large_left, large_right, vars = c("x1", "x2"), scale = "robust" ) # Fast: reuse cached distances result1 <- match_couples(dist_cache, max_distance = 0.3) result2 <- match_couples(dist_cache, max_distance = 0.5) result3 <- match_couples(dist_cache, max_distance = 1.0)
R crashes or shows: "Error: cannot allocate vector of size X GB"
A full distance matrix for n×n requires 8n² bytes:
# Memory requirements memory_needed <- function(n) { bytes <- 8 * n^2 if (bytes < 1e6) { sprintf("%.1f KB", bytes / 1e3) } else if (bytes < 1e9) { sprintf("%.1f MB", bytes / 1e6) } else { sprintf("%.1f GB", bytes / 1e9) } } cat("Memory for full distance matrix:\n") for (n in c(1000, 5000, 10000, 20000, 50000)) { cat(sprintf(" n = %5d: %s\n", n, memory_needed(n))) }
1. Use greedy matching (avoids full matrix):
# Greedy computes distances on-the-fly result <- greedy_couples( left, right, vars = covariates, strategy = "row_best" # Most memory-efficient )
2. Use blocking to create smaller subproblems:
# Each block is much smaller blocks <- matchmaker(left, right, block_type = "cluster", n_blocks = 20) result <- match_couples(blocks$left, blocks$right, vars = vars, block_id = "block_id")
3. Use caliper to create sparse matrix:
# Caliper excludes distant pairs (sparse representation) result <- match_couples( left, right, vars = covariates, max_distance = 0.5, method = "sap" # Sparse-optimized algorithm )
4. Increase R's memory limit (Windows):
# Increase to 16 GB (if available) memory.limit(size = 16000)
Different algorithms return different assignments:
cost <- matrix(c(1, 2, 2, 2, 1, 2, 2, 2, 1), nrow = 3, byrow = TRUE) result_jv <- lap_solve(cost, method = "jv") result_hungarian <- lap_solve(cost, method = "hungarian") cat("JV assignment: ", result_jv$target, "\n") cat("Hungarian assignment:", result_hungarian$target, "\n") cat("JV total cost: ", get_total_cost(result_jv), "\n") cat("Hungarian total cost:", get_total_cost(result_hungarian), "\n")
When multiple optimal solutions exist (tied costs), different algorithms may find different ones. The total cost should be the same; if not, report a bug.
# Check for ties check_ties <- function(cost_matrix) { n <- nrow(cost_matrix) # Check if diagonal dominates (trivial ties) diag_costs <- diag(cost_matrix) if (length(unique(diag_costs)) < n) { cat("Tied costs on diagonal - multiple optima likely\n") } # Check cost uniqueness unique_costs <- length(unique(as.vector(cost_matrix))) total_entries <- length(cost_matrix) if (unique_costs < total_entries * 0.5) { cat("Many repeated costs - ties possible\n") } } check_ties(cost)
1. Verify total costs match:
# Total cost should be identical stopifnot(get_total_cost(result_jv) == get_total_cost(result_hungarian)) cat("Both methods found optimal solutions (same total cost)\n")
2. Use Hungarian for deterministic tie-breaking:
# Hungarian has consistent tie-breaking result <- lap_solve(cost, method = "hungarian")
3. Add small noise to break ties:
set.seed(42) cost_perturbed <- cost + matrix(rnorm(9, 0, 1e-10), 3, 3) result <- lap_solve(cost_perturbed)
Matching fails or produces unexpected results due to NA values.
left <- tibble(id = 1:5, age = c(25, 30, NA, 35, 40)) right <- tibble(id = 1:5, age = c(28, 32, 33, 36, 42)) # This may fail or give unexpected results result <- match_couples(left, right, vars = "age")
couplr requires complete cases for distance computation. NA values in matching variables cause issues.
1. Remove rows with NA (before matching):
left_clean <- left %>% filter(!is.na(age)) right_clean <- right %>% filter(!is.na(age)) result <- match_couples(left_clean, right_clean, vars = "age") cat("Matched", result$info$n_matched, "pairs (excluded 1 left unit with NA)\n")
2. Impute missing values:
# Simple mean imputation left_imputed <- left %>% mutate(age = if_else(is.na(age), mean(age, na.rm = TRUE), age)) result <- match_couples(left_imputed, right, vars = "age") cat("Matched", result$info$n_matched, "pairs (imputed 1 NA with mean)\n")
3. Use preprocessing to diagnose:
health <- preprocess_matching_vars( left, right, vars = "age" ) print(health)
Error in .Call("_couplr_assignment_impl"): object not found
or
Error: package 'couplr' was built under R version X.X
C++ code not compiled properly, or object files are stale.
1. Clean and reinstall:
# In R: remove.packages("couplr") install.packages("couplr") # Or from GitHub: devtools::install_github("gcol33/couplr", force = TRUE)
2. Clean compiled files (development):
# PowerShell Remove-Item src\*.o, src\*.dll -Force
# Bash
rm src/*.o src/*.so
Then rebuild:
devtools::clean_dll() devtools::load_all()
3. Check Rtools/compiler (Windows):
# Check if Rtools is properly configured Sys.which("make") Sys.which("g++")
Make sure Rtools is installed and PATH is configured correctly.
Cause: Result object wasn't created properly.
Solution: Use lap_solve() instead of assignment() for the tidy interface.
Cause: NA or NaN values in cost matrix (not Inf).
Solution: Replace NA with Inf for forbidden assignments, or remove rows/cols.
cost <- matrix(c(1, NA, 3, 4), 2, 2) cost[is.na(cost)] <- Inf # Mark as forbidden result <- lap_solve(cost)
Cause: Caliper is too strict; all potential matches exceed threshold.
Solution: Increase max_distance or check for overlap issues.
Cause: Empty left or right data frame, or ID mismatch.
Solution: Check data dimensions and ID column consistency.
# Always verify data before matching stopifnot(nrow(left) > 0, nrow(right) > 0)
If you encounter an issue not covered here:
?match_couples, ?lap_solve# Minimal example template library(couplr) # Minimal data that reproduces the issue set.seed(123) left <- tibble(id = 1:10, x = rnorm(10)) right <- tibble(id = 1:10, x = rnorm(10)) # Code that causes the error result <- match_couples(left, right, vars = "x") # Expected vs actual behavior # Expected: ... # Actual: [error message] # Session info sessionInfo()
vignette("getting-started") - Basic usagevignette("matching-workflows") - Production matching pipelinesvignette("algorithms") - Algorithm selection guidevignette("comparison") - Comparison with other packagesAny scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.