tests/testthat/test-data-sim.R

test_that("sim_dgp_independent generates correct structure", {
	set.seed(42) # Fixed seed for reproducibility
	task <- sim_dgp_independent(n = 200)
	data <- task$data()

	# Basic structure tests
	expect_s3_class(task, "TaskRegr")
	expect_equal(nrow(data), 200)
	expect_equal(task$target_names, "y")
	expect_setequal(
		task$feature_names,
		c("important1", "important2", "important3", "unimportant1", "unimportant2")
	)
	expect_true(grepl("^independent_", task$id))

	# Check data types
	expect_true(all(sapply(data, is.numeric)))

	# Check approximate independence (correlations should be low)
	cor_matrix <- cor(data[, .(important1, important2, important3, unimportant1, unimportant2)])
	off_diagonal <- cor_matrix[upper.tri(cor_matrix)]
	expect_lt(mean(abs(off_diagonal)), 0.2) # Low correlations

	# Check that y has reasonable correlation with predictors
	cor_y <- cor(data$y, data[, .(important1, important2, important3)])
	# Check each correlation individually for better error messages
	expect_gt(abs(cor_y[1, 1]), 0.15) # important1 should have meaningful correlation
	expect_gt(abs(cor_y[1, 2]), 0.15) # important2 should have meaningful correlation
	expect_gt(abs(cor_y[1, 3]), 0.15) # important3 should have meaningful correlation

	# Noise features should have low correlation
	expect_lt(abs(cor(data$y, data$unimportant1)), 0.3)
	expect_lt(abs(cor(data$y, data$unimportant2)), 0.3)
})

test_that("sim_dgp_correlated generates correlated features", {
	task <- sim_dgp_correlated(n = 200, r = 0.9)
	data <- task$data()

	# Basic structure tests
	expect_s3_class(task, "TaskRegr")
	expect_equal(nrow(data), 200)
	expect_setequal(task$feature_names, c("x1", "x2", "x3", "x4"))
	expect_true(grepl("^correlated_", task$id))

	# Check high correlation between x1 and x2
	cor_x1_x2 <- cor(data$x1, data$x2)
	expect_gt(cor_x1_x2, 0.8) # Should be highly correlated

	# Check that x3 and x4 are less correlated with x1, x2
	expect_lt(abs(cor(data$x1, data$x3)), 0.3)
	expect_lt(abs(cor(data$x1, data$x4)), 0.3)

	# Check that x1 and x3 contribute to y (x2 is spurious, x4 is noise)
	lm_fit <- lm(y ~ x1 + x2 + x3 + x4, data = data)
	expect_gt(summary(lm_fit)$r.squared, 0.7) # Should explain most variance

	# Also check a model with just the true causal features
	lm_causal <- lm(y ~ x1 + x3, data = data)
	expect_gt(summary(lm_causal)$r.squared, 0.9) # Causal features alone should explain most variance
})

test_that("sim_dgp_correlated generates correlated features with different strength", {
	for (r in c(0.2, 0.5, 0.8)) {
		task <- sim_dgp_correlated(n = 1000, r = r)
		cor_x1_x2 <- cor(task$data(cols = c("x1", "x2")))[1, 2]
		expect_gt(cor_x1_x2, r - 0.1)
		expect_lt(cor_x1_x2, r + 0.1)
	}
})


test_that("sim_dgp_mediated generates mediation structure", {
	task <- sim_dgp_mediated(n = 150)
	data <- task$data()

	# Basic structure tests
	expect_s3_class(task, "TaskRegr")
	expect_equal(nrow(data), 150)
	expect_setequal(task$feature_names, c("exposure", "mediator", "direct", "noise"))
	expect_true(grepl("^mediated_", task$id))

	# Check mediation relationships
	# Exposure should correlate with mediator
	expect_gt(cor(data$exposure, data$mediator), 0.3)

	# Direct should correlate with both mediator and y
	expect_gt(cor(data$direct, data$mediator), 0.3)
	expect_gt(cor(data$direct, data$y), 0.3)

	# Mediator should strongly correlate with y
	expect_gt(cor(data$mediator, data$y), 0.5)

	# Noise should have low correlation with everything
	expect_lt(abs(cor(data$noise, data$y)), 0.2)
})

test_that("sim_dgp_confounded with hidden=TRUE generates correct structure", {
	task <- sim_dgp_confounded(n = 500, hidden = TRUE)
	data <- task$data()

	# Basic structure tests
	expect_s3_class(task, "TaskRegr")
	expect_equal(nrow(data), 500)
	expect_setequal(task$feature_names, c("x1", "proxy", "independent"))
	expect_true(grepl("^confounded_hidden_", task$id))
	expect_false("confounder" %in% task$feature_names) # Should be hidden

	# Check confounding structure through correlations
	# x1 and proxy should be correlated (due to hidden confounder)
	expect_gt(cor(data$x1, data$proxy), 0.4)

	# Independent should have low correlation with confounded variables
	expect_lt(abs(cor(data$independent, data$x1)), 0.3)
	expect_lt(abs(cor(data$independent, data$proxy)), 0.3)

	# All should correlate with y
	cor_y <- cor(data$y, data[, .(x1, proxy, independent)])
	# Check all correlations are meaningful
	for (i in 1:ncol(cor_y)) {
		expect_gt(abs(cor_y[1, i]), 0.2)
	}
})

test_that("sim_dgp_confounded with hidden=FALSE includes confounder", {
	task <- sim_dgp_confounded(n = 100, hidden = FALSE)
	data <- task$data()

	# Basic structure tests
	expect_s3_class(task, "TaskRegr")
	expect_equal(nrow(data), 100)
	expect_setequal(task$feature_names, c("x1", "confounder", "proxy", "independent"))
	expect_true(grepl("^confounded_observed_", task$id))
	expect_true("confounder" %in% task$feature_names) # Should be observable

	# Check that confounder is the source of correlations
	# Confounder should correlate highly with x1, proxy
	expect_gt(cor(data$confounder, data$x1), 0.6)
	expect_gt(cor(data$confounder, data$proxy), 0.6)
	expect_gt(cor(data$confounder, data$y), 0.6)

	# Independent should still be independent of confounder
	expect_lt(abs(cor(data$independent, data$confounder)), 0.3)
})

test_that("sim_dgp_interactions generates pure interaction effects", {
	task <- sim_dgp_interactions(n = 300)
	data <- task$data()

	# Basic structure tests
	expect_s3_class(task, "TaskRegr")
	expect_equal(nrow(data), 300)
	expect_setequal(task$feature_names, c("x1", "x2", "x3", "noise1", "noise2"))
	expect_true(grepl("^interactions_", task$id))

	# Check that x1 and x2 have low marginal correlations with y
	expect_lt(abs(cor(data$x1, data$y)), 0.3) # Should have low marginal correlation
	expect_lt(abs(cor(data$x2, data$y)), 0.3) # Should have low marginal correlation

	# But x3 should have strong correlation (main effect)
	expect_gt(abs(cor(data$x3, data$y)), 0.3)

	# Check interaction effect by fitting model
	lm_fit <- lm(y ~ x1 + x2 + x1:x2 + x3 + noise1 + noise2, data = data)
	coefs <- coef(lm_fit)

	# Main effects of x1, x2 should be near zero
	expect_lt(abs(coefs["x1"]), 0.5)
	expect_lt(abs(coefs["x2"]), 0.5)

	# Interaction effect should be significant and close to 2
	expect_gt(abs(coefs["x1:x2"]), 1.5)
	expect_lt(abs(coefs["x1:x2"]), 2.5)

	# x3 effect should be close to 1
	expect_gt(abs(coefs["x3"]), 0.5)
	expect_lt(abs(coefs["x3"]), 1.5)

	# Noise effects should be small
	expect_lt(abs(coefs["noise1"]), 0.5)
	expect_lt(abs(coefs["noise2"]), 0.5)
})

test_that("all simulation functions handle different sample sizes", {
	sample_sizes <- c(50, 500)

	for (n in sample_sizes) {
		# Test each function with different sample sizes
		task_ind <- sim_dgp_independent(n = n)
		expect_equal(nrow(task_ind$data()), n)

		task_cor <- sim_dgp_correlated(n = n)
		expect_equal(nrow(task_cor$data()), n)

		task_med <- sim_dgp_mediated(n = n)
		expect_equal(nrow(task_med$data()), n)

		task_conf_h <- sim_dgp_confounded(n = n, hidden = TRUE)
		expect_equal(nrow(task_conf_h$data()), n)

		task_conf_o <- sim_dgp_confounded(n = n, hidden = FALSE)
		expect_equal(nrow(task_conf_o$data()), n)

		task_int <- sim_dgp_interactions(n = n)
		expect_equal(nrow(task_int$data()), n)

		task_ewald <- sim_dgp_ewald(n = n)
		expect_equal(nrow(task_ewald$data()), n)
	}
})

test_that("simulation functions produce finite values", {
	# Test that all functions produce finite (no NaN, Inf, NA) values
	functions_to_test <- list(
		function() sim_dgp_independent(n = 50),
		function() sim_dgp_correlated(n = 50),
		function() sim_dgp_mediated(n = 50),
		function() sim_dgp_confounded(n = 50, hidden = TRUE),
		function() sim_dgp_confounded(n = 50, hidden = FALSE),
		function() sim_dgp_interactions(n = 50),
		function() sim_dgp_ewald(n = 50)
	)

	for (sim_fn in functions_to_test) {
		task <- sim_fn()
		data <- task$data()

		# Check for finite values
		expect_true(all(is.finite(as.matrix(data))))

		# Check for no missing values
		expect_true(all(!is.na(as.matrix(data))))
	}
})

test_that("simulation functions are reproducible with set.seed", {
	# Test reproducibility
	set.seed(123)
	task1 <- sim_dgp_independent(n = 100)
	data1 <- task1$data()

	set.seed(123)
	task2 <- sim_dgp_independent(n = 100)
	data2 <- task2$data()

	expect_equal(data1, data2)

	# Test with different function
	set.seed(456)
	task3 <- sim_dgp_interactions(n = 50)
	data3 <- task3$data()

	set.seed(456)
	task4 <- sim_dgp_interactions(n = 50)
	data4 <- task4$data()

	expect_equal(data3, data4)

	# Test with sim_dgp_ewald
	set.seed(789)
	task5 <- sim_dgp_ewald(n = 75)
	data5 <- task5$data()

	set.seed(789)
	task6 <- sim_dgp_ewald(n = 75)
	data6 <- task6$data()

	expect_equal(data5, data6)
})

test_that("sim_dgp_confounded default parameters work correctly", {
	# Test default parameters
	task_default <- sim_dgp_confounded()
	expect_equal(nrow(task_default$data()), 500) # Default n
	expect_false("confounder" %in% task_default$feature_names) # Default hidden=TRUE

	# Test explicit parameters match defaults
	task_explicit <- sim_dgp_confounded(n = 500L, hidden = TRUE)
	expect_equal(task_default$feature_names, task_explicit$feature_names)
	expect_equal(nrow(task_default$data()), nrow(task_explicit$data()))
})

test_that("sim_dgp_ewald generates correct structure", {
	task <- sim_dgp_ewald(n = 500)
	data <- task$data()

	# Basic structure tests
	expect_s3_class(task, "TaskRegr")
	expect_equal(nrow(data), 500)
	expect_equal(task$target_names, "y")
	expect_setequal(task$feature_names, c("x1", "x2", "x3", "x4", "x5"))
	expect_true(grepl("^Ewald_", task$id))

	# Check data types
	expect_true(all(sapply(data, is.numeric)))

	# Check relationships as specified in Ewald et al.
	# x2 should be highly correlated with x1 (noisy copy with sd=0.001)
	expect_gt(cor(data$x1, data$x2), 0.8)

	# x4 should be correlated with x3 (noisier copy with sd=0.1)
	expect_gt(cor(data$x3, data$x4), 0.8)

	# x1, x3, x5 should be relatively independent
	expect_lt(abs(cor(data$x1, data$x3)), 0.2)
	expect_lt(abs(cor(data$x1, data$x5)), 0.2)
	expect_lt(abs(cor(data$x3, data$x5)), 0.2)

	# Check that y depends on x4, x5, and their interaction
	# Simple check: both x4 and x5 should correlate with y
	expect_gt(abs(cor(data$x4, data$y)), 0.3)
	expect_gt(abs(cor(data$x5, data$y)), 0.3)

	# Fit a model to verify the interaction effect
	lm_fit <- lm(y ~ x4 + x5 + x4:x5, data = data)
	expect_gt(summary(lm_fit)$r.squared, 0.75) # Should explain most variance

	# All coefficients should be significant and close to 1
	coefs <- coef(lm_fit)
	expect_lt(abs(coefs["x4"] - 1), 0.3)
	expect_lt(abs(coefs["x5"] - 1), 0.3)
	expect_lt(abs(coefs["x4:x5"] - 1), 0.3)
})

Try the xplainfi package in your browser

Any scripts or data that you put into this service are public.

xplainfi documentation built on Feb. 27, 2026, 1:08 a.m.