tests/testthat/test-knn_imp.R

test_that("`impute_knn_brute` and `impute_knn_mlpack` calculate the missing location correctly", {
  set.seed(1234)
  to_test <- sim_mat(20, 50, perc_total_na = 0.5, perc_col_na = 1)$input
  miss <- is.na(to_test)
  cmiss <- colSums(miss)
  miss_rate <- cmiss / nrow(to_test)
  # same preprocessing knn_imp() does before calling the low-level functions
  colmax <- 0.9
  eligible <- miss_rate < colmax
  pre_imp_cols <- to_test[, eligible, drop = FALSE]
  pre_imp_miss <- miss[, eligible, drop = FALSE]
  pre_imp_cols[pre_imp_miss] <- 0.0

  # local groups (0-based indices into pre_imp_cols)
  local_has_miss <- which(cmiss[eligible] > 0L)
  grp_impute <- as.integer(local_has_miss - 1L)

  # Expected missing positions *local to the submatrix pre_imp_cols* (1-based)
  expected_local <- unname(which(pre_imp_miss, arr.ind = TRUE))

  # brute-force path
  imputed_brute <- impute_knn_brute(
    obj = pre_imp_cols,
    nmiss = !pre_imp_miss,
    k = 5,
    grp_impute = grp_impute,
    grp_miss_no_imp = integer(0L),
    grp_complete = integer(0L),
    method = 0L,
    dist_pow = 1,
    cache = FALSE,
    cores = 1
  )

  # mlpack path
  imputed_mlpack <- impute_knn_mlpack(
    obj = mean_imp_col(pre_imp_cols),
    nmiss = !pre_imp_miss,
    k = 5,
    grp_impute = grp_impute,
    method = 0L,
    dist_pow = 1,
    cores = 1
  )

  # extract only the location columns (row, local_col_1based) that the C++ already returns
  idx_brute <- imputed_brute[, 1:2, drop = FALSE]
  idx_mlpack <- imputed_mlpack[, 1:2, drop = FALSE]

  # sort so the comparison is order-independent
  expected_sorted <- expected_local[order(expected_local[, 1], expected_local[, 2]), , drop = FALSE]
  brute_sorted <- idx_brute[order(idx_brute[, 1], idx_brute[, 2]), , drop = FALSE]
  mlpack_sorted <- idx_mlpack[order(idx_mlpack[, 1], idx_mlpack[, 2]), , drop = FALSE]

  expect_equal(brute_sorted, expected_sorted)
  expect_equal(mlpack_sorted, expected_sorted)
})

test_that("`knn_imp` works", {
  obj <- sim_mat(50, 100)$input
  expect_no_error(knn_imp(
    obj,
    k = 3,
    method = "euclidean"
  ))

  expect_no_error(knn_imp(
    obj,
    k = 3,
    method = "manhattan",
    tree = TRUE
  ))
})

test_that("`knn_imp` cache and non cache path is the same", {
  obj <- sim_mat(50, 100)$input
  expect_identical(
    knn_imp(obj, k = 10, method = "euclidean"),
    knn_imp(obj, k = 10, method = "euclidean", max_cache = 0)
  )
})

test_that("`knn_imp` tree and brute is the same for few missing values", {
  set.seed(1234)
  to_test <- sim_mat(20, n = 1000, perc_total_na = 0, perc_col_na = 0)$input
  to_test[1, 1] <- NA
  to_test[2, 2] <- NA

  expect_identical(
    knn_imp(to_test, k = 3, method = "euclidean"),
    knn_imp(to_test, k = 3, method = "euclidean", tree = TRUE)
  )
})

test_that("Exactly replicate `impute.knn`", {
  skip("Manual Testing Only")
  # library(impute)
  # set.seed(1234)
  # # post_imp behavior can cause differences
  # obj <- sim_mat(100, 100, perc_total_na = 0.05)$input
  #
  # # Check if the 'impute' package is installed
  #
  # # Perform imputation using knn_imp with method "impute.knn" on transposed data
  # r1 <- knn_imp(obj, k = 3, method = "euclidean", post_imp = FALSE)
  #
  # # Perform imputation using the original impute.knn function
  # # Transpose the result to match the orientation
  # r2 <- t(
  #   impute.knn(
  #     t(obj),
  #     k = 3,
  #     maxp = ncol(obj)
  #   )$data
  # )
  #
  # # Verify that the results from knn_imp match exactly with impute.knn
  # expect_equal(r1[, ], r2[, ])
})

test_that("`subset` feature of `knn_imp` works with post_imp = FALSE/TRUE", {
  set.seed(1234)
  to_test <- sim_mat(20, 50, perc_total_na = 0.2, perc_col_na = 1)$input
  # Impute just 3 columns
  ## Check subset using numeric index
  r1 <- knn_imp(to_test, k = 3, post_imp = FALSE, subset = c(1, 3, 5))
  expect_true(!anyNA(r1[, c(1, 3, 5)]))
  expect_equal(is.na(r1[, -c(1, 3, 5)]), is.na(to_test[, -c(1, 3, 5)]))
  ## Check subset using character vector
  r2 <- knn_imp(
    to_test,
    k = 3,
    post_imp = FALSE,
    subset = paste0("feature", c(1, 3, 5))
  )
  expect_equal(r1, r2)

  # Test with post_imp = TRUE and a column requiring post imputation
  to_test_post <- to_test
  # Column 5 will be colMeans if post_imp is TRUE
  to_test_post[2:nrow(to_test_post), 5] <- NA
  r3 <- knn_imp(to_test_post, k = 3, post_imp = TRUE, subset = c(1, 3, 5))
  expect_true(!anyNA(r3[, c(1, 3, 5)]))
  # Expect that only the subset columns are imputed. The rests are untouched
  expect_equal(is.na(r3[, -c(1, 3, 5)]), is.na(to_test_post[, -c(1, 3, 5)]))
  # Verify post_imp on column 5
  col5_mean <- mean(to_test_post[, 5], na.rm = TRUE)
  expect_equal(unname(r3[, 5]), rep(col5_mean, nrow(to_test_post)))
  r4 <- knn_imp(
    to_test_post,
    k = 3,
    post_imp = TRUE,
    subset = paste0("feature", c(1, 3, 5))
  )
  expect_equal(r3, r4)
})

test_that("Behavior with extreme missing columns and rows", {
  set.seed(1234)
  to_test <- sim_mat(20, 20, perc_total_na = 0.2, perc_col_na = 1)$input
  # row 1 is all NA
  to_test[1, ] <- NA
  expect_no_error(knn_imp(to_test, k = 3, post_imp = FALSE))
  expect_true(!anyNA(knn_imp(to_test, k = 3, post_imp = TRUE)))

  to_test[, 1] <- NA
  expect_error(knn_imp(to_test, k = 3, post_imp = FALSE), "All NA/Inf")

  # not admissible
  mat <- matrix(NA, nrow = 20, ncol = 20)
  diag(mat) <- rnorm(20)
  expect_error(knn_imp(mat, k = 2), "exceeds usable columns")
})

Try the slideimp package in your browser

Any scripts or data that you put into this service are public.

slideimp documentation built on April 17, 2026, 1:07 a.m.