Nothing
# sample_na_loc ----
test_that("returns a list of length n_reps, each a 2-col row/col matrix", {
m <- sim_mat(20, 10, perc_total_na = 0)$input
out <- sample_na_loc(m, n_cols = 3, n_rows = 2, n_reps = 4)
expect_type(out, "list")
expect_length(out, 4)
for (rep in out) {
expect_equal(nrow(rep), 3 * 2)
}
})
# n_cols / n_rows semantics
test_that("each rep uses exactly n_cols distinct columns, each with n_rows NAs", {
m <- sim_mat(20, 10, perc_total_na = 0)$input
out <- sample_na_loc(m, n_cols = 4, n_rows = 3, n_reps = 5)
for (rep in out) {
tab <- table(rep[, "col"])
expect_length(tab, 4) # exactly n_cols distinct columns
expect_true(all(tab == 3)) # each with exactly n_rows NAs
}
})
test_that("no duplicated (row, col) pairs within a rep", {
m <- sim_mat(20, 10, perc_total_na = 0)$input
out <- sample_na_loc(m, n_cols = 4, n_rows = 3, n_reps = 5)
for (rep in out) {
key <- paste(rep[, "row"], rep[, "col"], sep = ":")
expect_false(anyDuplicated(key) > 0)
}
})
# num_na distribution
test_that("num_na divisible by n_rows distributes evenly", {
m <- sim_mat(20, 10, perc_total_na = 0)$input
out <- sample_na_loc(m, num_na = 12, n_rows = 3, n_reps = 1)
rep <- out[[1]]
expect_equal(nrow(rep), 12)
tab <- table(rep[, "col"])
expect_length(tab, 4) # 12 / 3 = 4 columns
expect_true(all(tab == 3))
})
test_that("num_na not divisible by n_rows bumps last buckets by at least 1", {
m <- sim_mat(20, 10, perc_total_na = 0)$input
# num_na = 13, n_rows = 3 -> n_cols = 4, na_per_col = c(4, 3, 3, 3)
# (remainder = 1 column gets a +1)
out <- sample_na_loc(m, num_na = 13, n_rows = 3, n_reps = 1)
rep <- out[[1]]
expect_equal(nrow(rep), 13)
tab <- sort(as.integer(table(rep[, "col"])))
expect_equal(tab, c(3L, 3L, 3L, 4L))
})
# na_col_subset handling
test_that("numeric na_col_subset restricts columns to the pool", {
m <- sim_mat(p = 10, perc_total_na = 0)$input
pool <- c(2L, 4L, 6L, 8L)
out <- sample_na_loc(m, n_cols = 3, n_rows = 2, na_col_subset = pool, n_reps = 10)
used <- unique(unlist(lapply(out, function(r) r[, "col"])))
expect_true(all(used %in% pool))
})
test_that("character na_col_subset resolves via colnames", {
m <- sim_mat(p = 6, perc_total_na = 0)$input # colnames feature1 ... feature6
out <- sample_na_loc(m,
n_cols = 2, n_rows = 1,
na_col_subset = c("feature2", "feature5"), n_reps = 5
)
used <- unique(unlist(lapply(out, function(r) r[, "col"])))
expect_setequal(used, c(2L, 5L))
})
# zero-variance cols protection
test_that("columns keep >= 2 distinct observed values after injection", {
m <- sim_mat(20, p = 6, perc_total_na = 0)$input
out <- sample_na_loc(m, n_cols = 4, n_rows = 5, n_reps = 10)
for (rep in out) {
m2 <- apply_na(m, rep)
touched <- unique(rep[, "col"])
for (j in touched) {
obs <- m2[, j][!is.na(m2[, j])]
expect_gte(length(unique(obs)), 2L)
}
}
})
test_that("columns without enough eligible rows (after keeping 2 uniques) are skipped", {
m <- sim_mat(20, 5, perc_total_na = 0)$input
# Column 3 has only 4 observed values with exactly 2 uniques.
# After keeping 2 (one of each), left_over = 2 < needed = 3 -> must be skipped.
m[1:4, 3] <- c(1, 1, 2, 2)
m[5:20, 3] <- NA
out <- sample_na_loc(
m,
n_cols = 4,
n_rows = 3,
na_col_subset = 1:5,
n_reps = 10
)
for (rep in out) {
expect_false(
3L %in% rep[, "col"],
label = "column 3 should never be selected (not enough sacrificable rows)"
)
}
})
test_that("pre-check aborts on columns with zero variance", {
m <- sim_mat(20, 5, perc_total_na = 0)$input
m[, 3] <- 7 # truly zero variance
expect_error(
sample_na_loc(m, n_cols = 4, n_rows = 1, na_col_subset = 1:5),
"Some columns already have zero"
)
})
test_that("aborts when requested n_cols exceeds available pool", {
m <- sim_mat(20, 5, perc_total_na = 0)$input
expect_error(
sample_na_loc(m, n_cols = 10, n_rows = 1, na_col_subset = 1:5),
"Cannot place"
)
})
# row / col budget enforcement
test_that("resulting matrix respects colmax", {
m <- sim_mat(20, 8, perc_total_na = 0)$input
colmax <- 0.5
out <- sample_na_loc(m,
n_cols = 4, n_rows = 5,
colmax = colmax, n_reps = 10
)
cap <- floor(nrow(m) * colmax)
for (rep in out) {
m2 <- apply_na(m, rep)
col_miss <- colSums(is.na(m2))
expect_true(all(col_miss <= cap))
}
})
test_that("resulting matrix respects rowmax", {
m <- sim_mat(30, 10, perc_total_na = 0)$input
rowmax <- 0.4
out <- sample_na_loc(m,
n_cols = 5, n_rows = 4,
rowmax = rowmax, n_reps = 10
)
cap <- floor(ncol(m) * rowmax)
for (rep in out) {
m2 <- apply_na(m, rep)
row_miss <- rowSums(is.na(m2))
expect_true(all(row_miss <= cap))
}
})
# rep independence / reproducibility
test_that("reps are independently sampled (not identical)", {
m <- sim_mat(30, 10, perc_total_na = 0)$input
out <- sample_na_loc(m, n_cols = 4, n_rows = 3, n_reps = 5)
# Hash each rep; at least 2 distinct with high probability.
sigs <- vapply(out, function(r) {
paste(r[, "row"], r[, "col"], collapse = ",")
}, character(1))
expect_gt(length(unique(sigs)), 1)
})
test_that("set.seed makes sample_na_loc reproducible", {
m <- sim_mat(20, 10, perc_total_na = 0)$input
set.seed(42)
a <- sample_na_loc(m, n_cols = 3, n_rows = 2, n_reps = 3)
set.seed(42)
b <- sample_na_loc(m, n_cols = 3, n_rows = 2, n_reps = 3)
expect_identical(a, b)
})
# failure path
test_that("aborts when budgets make sampling infeasible", {
# 10 rows allows n_rows=3 under the "keep >=2 values" hard bound,
# but colmax=0.1 -> floor(10*0.1)=1 NA per column, so needing 3 is impossible.
m <- sim_mat(10, 6, perc_total_na = 0)$input
expect_error(
sample_na_loc(m,
n_cols = 2, n_rows = 3, colmax = 0.1,
max_attempts = 3
),
"Failed to sample NA locations"
)
})
# pre-existing NAs
test_that("sampled positions never collide with pre-existing NAs", {
set.seed(1)
m <- sim_mat(30, 10, perc_total_na = 0)$input
# scatter some NAs, keeping columns healthy
preset <- cbind(row = c(1, 2, 3, 4, 5), col = c(1, 2, 3, 4, 5))
m[preset] <- NA
out <- sample_na_loc(m, n_cols = 5, n_rows = 3, n_reps = 20)
for (rep in out) {
# none of the sampled (row, col) pairs should already be NA
expect_true(all(!is.na(m[rep])))
}
})
test_that("final colmax / rowmax account for pre-existing NAs", {
set.seed(2)
m <- sim_mat(20, 8, perc_total_na = 0)$input
# preload column 1 with 4 NAs and row 1 with 2 NAs (avoiding overlap with col 1)
m[1:4, 1] <- NA
m[1, 3:4] <- NA
colmax <- 0.5 # cap = floor(20 * 0.5) = 10
rowmax <- 0.5 # cap = floor(8 * 0.5) = 4
out <- sample_na_loc(
m,
n_cols = 4, n_rows = 3,
colmax = colmax, rowmax = rowmax,
n_reps = 20
)
col_cap <- floor(nrow(m) * colmax)
row_cap <- floor(ncol(m) * rowmax)
for (rep in out) {
m2 <- apply_na(m, rep)
expect_true(all(colSums(is.na(m2)) <= col_cap))
expect_true(all(rowSums(is.na(m2)) <= row_cap))
}
})
test_that("columns with exhausted col_room are skipped even if individually healthy", {
set.seed(3)
m <- sim_mat(20, 6, perc_total_na = 0)$input
colmax <- 0.5 # cap = 10
# column 2 already has 9 NAs -> col_room = 1, so needed = 3 can't fit
na_rows <- sample.int(20, 9)
m[na_rows, 2] <- NA
out <- sample_na_loc(
m,
n_cols = 4, n_rows = 3,
colmax = colmax, n_reps = 20
)
for (rep in out) {
expect_false(2L %in% rep[, "col"])
}
})
test_that("num_na with remainder > n_cols distributes via larger bumps", {
m <- sim_mat(20, 10, perc_total_na = 0)$input
# num_na = 5, n_rows = 3 -> n_cols = 1, one column takes all 5
out <- sample_na_loc(m, num_na = 5, n_rows = 3, n_reps = 1)
rep <- out[[1]]
expect_equal(nrow(rep), 5)
tab <- as.integer(table(rep[, "col"]))
expect_equal(tab, 5L)
})
test_that("num_na = 11 with n_rows = 3 yields c(3, 4, 4)", {
m <- sim_mat(20, 10, perc_total_na = 0)$input
out <- sample_na_loc(m, num_na = 11, n_rows = 3, n_reps = 1)
rep <- out[[1]]
expect_equal(nrow(rep), 11)
expect_equal(sort(as.integer(table(rep[, "col"]))), c(3L, 4L, 4L))
})
# tune imp ----
test_that("tune_imp works", {
slide_imp_par <- data.frame(
window_size = c(100, 100),
k = c(5, 10),
overlap_size = c(10, 10),
min_window_n = 20,
method = "euclidean",
post_imp = FALSE
)
set.seed(1234)
obj <- sim_mat(50, 1000, perc_col_na = 0.5)$input
expect_true(anyNA(obj))
location <- 1:ncol(obj)
# Check `slide_imp`
expect_no_error({
slide_imp_imp_res <- tune_imp(
obj,
slide_imp_par,
.f = "slide_imp",
location = location,
n_reps = 1,
num_na = 200
)
})
# `slide_imp` requires parameters
expect_error(
{
slide_imp_imp_res <- tune_imp(
obj,
.f = "slide_imp",
location = location,
n_reps = 1,
num_na = 200
)
},
regexp = "requires"
)
expect_true(
all(
vapply(
slide_imp_imp_res$result,
\(x) {
class(x$estimate)
},
character(1)
) == "numeric"
)
)
# Check `knn_imp`
knn_imp_par <- data.frame(
k = c(5, 10),
method = "euclidean",
post_imp = TRUE
)
expect_no_error({
knn_imp_res <- tune_imp(obj, knn_imp_par, .f = "knn_imp", n_reps = 1, num_na = 100)
})
expect_true(
all(
vapply(
knn_imp_res$result,
\(x) {
class(x$estimate)
},
character(1)
) == "numeric"
)
)
# Check `pca_imp`
pca_imp_par <- data.frame(ncp = 2, miniter = 2)
expect_no_error({
pca_imp_res <- tune_imp(obj, pca_imp_par, .f = "pca_imp", n_reps = 1, num_na = 100)
})
expect_true(
all(
vapply(
pca_imp_res$result,
\(x) {
class(x$estimate)
},
character(1)
) == "numeric"
)
)
# Check custom function
f1 <- function() {}
custom_fun <- function(obj, value) {
obj[is.na(obj)] <- value
f1()
return(obj)
}
custom_par <- data.frame(
value = c(0, 1)
)
expect_no_error({
custom_imp_res <- tune_imp(obj, custom_par, n_reps = 1, num_na = 100, .f = custom_fun)
})
expect_true(
all(
vapply(custom_imp_res$result, \(x) {
class(x$estimate)
}, character(1)) == "numeric"
)
)
})
test_that("tune_imp works when n_reps is a list of NA locations", {
# Create a complete matrix (no NAs) for testing
obj <- sim_mat(50, 200)$input
obj[is.na(obj)] <- 0 # Fill any existing NAs
# Create predefined NA location sets
# Each set has 10 locations, all within matrix bounds
set.seed(42)
na_loc_list <- list(
sample(1:length(obj), 10, replace = FALSE),
sample(1:length(obj), 10, replace = FALSE),
sample(1:length(obj), 10, replace = FALSE)
)
# Test with slide_imp
slide_imp_par <- data.frame(
window_size = 100,
k = 5,
overlap_size = 10,
method = "euclidean",
min_window_n = 10,
post_imp = FALSE
)
location <- 1:ncol(obj)
expect_no_error({
slide_imp_res <- tune_imp(
location = location,
obj,
slide_imp_par,
.f = "slide_imp",
na_loc = na_loc_list, # Using list instead of integer
)
})
# Check that we get 3 results (one for each NA location set)
expect_equal(nrow(slide_imp_res), 3)
# Check that each result has the correct number of estimates (10 each)
expect_true(
all(vapply(slide_imp_res$result, function(x) nrow(x) == 10, logical(1)))
)
# Verify the truth values match the original matrix values at those locations
for (i in 1:3) {
truth_values <- slide_imp_res$result[[i]]$truth
expected_truth <- obj[na_loc_list[[i]]]
expect_equal(truth_values, expected_truth)
}
# Test with knn_imp
knn_imp_par <- data.frame(
k = c(5, 10),
method = "euclidean",
post_imp = FALSE
)
expect_no_error({
knn_imp_res <- tune_imp(
obj,
knn_imp_par,
.f = "knn_imp",
na_loc = na_loc_list
)
})
# Should have 2 parameters × 3 repetitions = 6 rows
expect_equal(nrow(knn_imp_res), 6)
# Check that results contain numeric estimates
expect_true(
all(vapply(knn_imp_res$result, function(x) {
is.numeric(x$estimate) && is.numeric(x$truth)
}, logical(1)))
)
# Test with custom function
custom_fun <- function(obj, value) {
obj[is.na(obj)] <- value
return(obj)
}
custom_par <- data.frame(value = c(0.5, 1.5))
expect_no_error({
custom_res <- tune_imp(
obj,
custom_par,
na_loc = na_loc_list,
.f = custom_fun
)
})
# Should have 2 parameters × 3 repetitions = 6 rows
expect_equal(nrow(custom_res), 6)
# Verify custom function fills with the specified values
for (i in 1:nrow(custom_res)) {
expected_value <- custom_res$value[i]
estimates <- custom_res$result[[i]]$estimate
expect_true(all(estimates == expected_value))
}
# Test with different length NA location sets
varied_na_locs <- list(
sample(1:length(obj), 5, replace = FALSE),
sample(1:length(obj), 5, replace = FALSE)
)
location <- 1:ncol(obj)
expect_no_error({
varied_res <- tune_imp(
obj,
location = location,
slide_imp_par,
.f = "slide_imp",
na_loc = varied_na_locs
)
})
expect_equal(nrow(varied_res), 2)
expect_equal(nrow(varied_res$result[[1]]), 5)
expect_equal(nrow(varied_res$result[[2]]), 5)
})
test_that("tune_imp correctly uses provided NA locations from list", {
# Create a simple matrix for easier verification
set.seed(123)
obj <- matrix(1:100, nrow = 10, ncol = 10)
# Define specific NA locations
na_locations <- list(
c(1, 11, 21), # First column positions
c(10, 20, 30), # Last position of first 3 rows
c(50, 60, 70) # Middle positions
)
simple_imp <- function(obj, fill_value) {
obj[is.na(obj)] <- fill_value
return(obj)
}
params <- data.frame(fill_value = 42)
result <- tune_imp(
obj,
params,
na_loc = na_locations,
.f = simple_imp
)
# Verify each repetition used the correct NA locations
for (i in 1:3) {
res <- result$result[[i]]
# Check truth values match original matrix at specified locations
expected_truth <- obj[na_locations[[i]]]
expect_equal(res$truth, expected_truth)
# Check all estimates are the fill value
expect_true(all(res$estimate == 42))
# Check we have the right number of values
expect_equal(length(res$truth), length(na_locations[[i]]))
}
})
test_that("tune_imp handles mixed linear and 2D positions in list", {
set.seed(789)
obj <- matrix(1:100, nrow = 10, ncol = 10)
# mix of linear and 2D positions
na_locations_mixed <- list(
c(1, 11, 21), # linear
matrix(c(10, 10, 10, 1, 2, 3), ncol = 2), # 2D, row 10, column 1, 2, 3
c(45, 55, 65) # linear
)
simple_imp <- function(obj, fill_value) {
obj[is.na(obj)] <- fill_value
return(obj)
}
params <- data.frame(fill_value = 67)
result <- tune_imp(
obj,
params,
na_loc = na_locations_mixed,
.f = simple_imp
)
expected_linear <- list(
c(1, 11, 21),
c(10, 20, 30),
c(45, 55, 65)
)
for (i in 1:3) {
res <- result$result[[i]]
expected_truth <- obj[expected_linear[[i]]]
expect_equal(res$truth, expected_truth)
expect_true(all(res$estimate == 67))
}
})
test_that("compute_metrics works with slideimp_tune and data.frame", {
set.seed(123)
obj <- matrix(1:100, nrow = 10, ncol = 10)
simple_imp <- function(obj, mu) {
miss <- is.na(obj)
obj[miss] <- stats::rnorm(n = sum(miss), mean = mu)
return(obj)
}
params <- data.frame(mu = 42)
result_tune <- tune_imp(
obj,
params,
n_reps = 2,
num_na = 10,
.f = simple_imp
)
# slideimp_tune object
out_tune <- compute_metrics(result_tune)
expect_s3_class(out_tune, "data.frame")
expect_true(all(c(".metric", ".estimator", ".estimate", "n", "n_miss") %in% names(out_tune)))
expect_equal(sort(unique(out_tune$.metric)), c("mae", "rmse"))
# Plain data.frame
result_df <- as.data.frame(result_tune)
class(result_df) <- "data.frame"
out_df <- compute_metrics(result_df)
expect_s3_class(out_df, "data.frame")
expect_equal(out_df$.estimate, out_tune$.estimate)
})
test_that("compute_metrics correctly computes n and n_miss with NA estimates", {
set.seed(456)
obj <- matrix(1:100, nrow = 10, ncol = 10)
simple_imp <- function(obj, mu) {
miss <- is.na(obj)
obj[miss] <- rnorm(n = sum(miss), mean = mu)
return(obj)
}
params <- data.frame(mu = 42)
result <- tune_imp(
obj,
params,
n_reps = 2,
num_na = 10,
.f = simple_imp
)
# No NAs case: all estimates should be present
out_clean <- compute_metrics(result)
expect_true(all(out_clean$n == 10))
expect_true(all(out_clean$n_miss == 0))
# Inject NAs into the estimate column of each result element
result$result[[1]]$estimate[c(1, 3)] <- NA
result$result[[2]]$estimate[c(2, 5, 7)] <- NA
out_na <- compute_metrics(
result,
metrics = c("mae", "rmse", "mape", "bias", "rsq", "rsq_trad")
)
# Rep 1: 10 rows, 2 missing
rows_rep1 <- out_na[out_na$rep_id == 1, ]
expect_true(all(rows_rep1$n == 10))
expect_true(all(rows_rep1$n_miss == 2))
# Rep 2: 10 rows, 3 missing
rows_rep2 <- out_na[out_na$rep_id == 2, ]
expect_true(all(rows_rep2$n == 10))
expect_true(all(rows_rep2$n_miss == 3))
# n and n_miss are consistent across metrics within the same rep
for (r in unique(out_na$rep_id)) {
subset <- out_na[out_na$rep_id == r, ]
expect_length(unique(subset$n), 1)
expect_length(unique(subset$n_miss), 1)
}
})
test_that("compute_metrics.data.frame errors without required columns", {
bad_df <- data.frame(x = 1:3)
expect_error(compute_metrics(bad_df), "result")
bad_result <- data.frame(result = I(list(data.frame(a = 1, b = 2))))
expect_error(compute_metrics(bad_result), "truth.*estimate")
})
test_that("tune_imp works with custom function and list-column parameters", {
set.seed(42)
obj <- matrix(rnorm(200), nrow = 10, ncol = 20)
# custom function that takes a vector of weights per column and fills NAs
# with a weighted column mean
weighted_fill <- function(obj, weights) {
stopifnot(length(weights) == ncol(obj))
for (j in seq_len(ncol(obj))) {
col_mean <- mean(obj[, j], na.rm = TRUE)
obj[is.na(obj[, j]), j] <- col_mean * weights[j]
}
return(obj)
}
# parameters with a list column: each row holds a different weight vector
custom_par <- data.frame(
weights = I(list(
rep(1, 20),
rep(0.5, 20),
seq(0.1, 2, length.out = 20)
))
)
expect_no_error({
res <- tune_imp(
obj,
custom_par,
.f = weighted_fill,
n_reps = 2,
num_na = 15
)
})
# Should have 3 param sets * 2 reps = 6 rows
expect_equal(nrow(res), 6)
expect_true("result" %in% names(res))
expect_true(
all(
vapply(res$result, \(x) {
is.numeric(x$estimate) && nrow(x) > 0
}, logical(1))
)
)
# The weights list column should be preserved in the output
expect_true("weights" %in% names(res))
expect_true(is.list(res$weights))
})
test_that("tune_imp works with custom function and NULL parameters", {
set.seed(99)
obj <- matrix(rnorm(150), nrow = 10, ncol = 15)
# a function with only `obj` — fills NAs with 0
zero_fill <- function(obj) {
obj[is.na(obj)] <- 0
return(obj)
}
expect_no_error({
res <- tune_imp(
obj,
parameters = NULL,
.f = zero_fill,
n_reps = 3,
num_na = 10
)
})
# 1 param set * 3 reps = 3 rows
expect_equal(nrow(res), 3)
expect_true("result" %in% names(res))
# placeholder column should be stripped
expect_false(".placeholder" %in% names(res))
expect_true(
all(
vapply(res$result, \(x) {
is.numeric(x$estimate) && is.numeric(x$truth) && nrow(x) > 0
}, logical(1))
)
)
})
test_that("tune_imp with NULL parameters and a function that has defaults", {
set.seed(7)
obj <- matrix(rnorm(100), nrow = 5, ncol = 20)
fill_with_default <- function(obj, value = -999, scale = 1.0) {
obj[is.na(obj)] <- value * scale
return(obj)
}
# NULL parameters should run the function using its defaults
expect_no_error({
res_null <- tune_imp(
obj,
parameters = NULL,
.f = fill_with_default,
n_reps = 1,
num_na = 10
)
})
expect_equal(nrow(res_null), 1)
# all imputed values should be -999 (the defaults)
expect_true(all(res_null$result[[1]]$estimate == -999))
# compare with explicit parameters to make sure NULL truly uses defaults
explicit_par <- data.frame(value = -999, scale = 1.0)
expect_no_error({
res_explicit <- tune_imp(
obj,
parameters = explicit_par,
.f = fill_with_default,
n_reps = 1,
num_na = 10
)
})
expect_equal(res_null$result[[1]]$estimate, res_explicit$result[[1]]$estimate)
})
# test_that("grid_to_linear correctly converts 2D positions to linear indices", {
# n <- 10
# m <- 10
#
# pos_2d <- matrix(c(1, 1, 1, 2, 2, 1, 10, 10), ncol = 2, byrow = TRUE)
# pos_1d <- grid_to_linear(pos_2d, n, m)
# sim_dat <- matrix(rnorm(n * m), ncol = n, nrow = m)
# expect_identical(sim_dat[pos_2d], sim_dat[pos_1d])
# })
# Tests for sample_na_loc() / sample_each_rep()
#
# Focus: core sampling logic (shape, budgets, zero-variance protection,
# subset handling, num_na distribution, rep independence).
# Pre-condition validation (checkmate asserts, colmax/rowmax pre-injection
# checks) is intentionally not covered here.
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.