tests/testthat/test-missRanger.R

iris2 <- generateNA(iris, p = 0.2, seed = 1L)

test_that("all missings are filled for multivariate and univariate imputation", {
  imp <- missRanger(iris2, verbose = 0L, seed = 1L, num.trees = 20)
  expect_true(!anyNA(imp))
  
  imp <- missRanger(iris2, . ~ 1, verbose = 0L, seed = 1L)
  expect_true(!anyNA(imp))
})

#===================================================
# TEST ARGUMENTS
#===================================================

imp1 <- missRanger(iris2, num.trees = 20L, verbose = 0L, seed = 1L)

test_that("pmm.k produces values present in the original data", {
  imp2 <- missRanger(iris2, num.trees = 20L, verbose = 0L, pmm.k = 3L, seed = 1L)
  
  expect_false(isTRUE(all.equal(imp1, imp2)))
  expect_true(all(imp2$Sepal.Length %in% iris2$Sepal.Length))
})

test_that("num.trees has an effect", {
  imp2 <- missRanger(iris2, num.trees = 10L, verbose = 0L, seed = 1L)
  expect_false(isTRUE(all.equal(imp1, imp2)))
})

test_that("mtry has an effect", {
  imp2 <- missRanger(iris2, num.trees = 20L, verbose = 0L, seed = 1L, mtry = 1)
  imp3 <- missRanger(
    iris2, num.trees = 20L, verbose = 0L, seed = 1L, mtry = function(m) min(m, 4)
  )
  
  expect_false(isTRUE(all.equal(imp1, imp2)))
  expect_false(isTRUE(all.equal(imp1, imp3)))
  expect_false(isTRUE(all.equal(imp2, imp3)))
})

test_that("min.node.size has an effect", {
  imp2 <- missRanger(
    iris2, num.trees = 20L, verbose = 0L, seed = 1L, min.node.size = 20
  )
  expect_false(isTRUE(all.equal(imp1, imp2)))
})

test_that("min.bucket has an effect", {
  imp2 <- missRanger(iris2, num.trees = 20L, verbose = 0L, seed = 1L, min.bucket = 20)
  expect_false(isTRUE(all.equal(imp1, imp2)))
})

test_that("max.depth has an effect", {
  imp2 <- missRanger(iris2, num.trees = 20L, verbose = 0L, seed = 1L, max.depth = 1)
  expect_false(isTRUE(all.equal(imp1, imp2)))
})

test_that("replace has an effect", {
  imp2 <- missRanger(iris2, num.trees = 20L, verbose = 0L, seed = 1L, replace = FALSE)
  expect_false(isTRUE(all.equal(imp1, imp2)))
})

test_that("sample.fraction has an effect", {
  imp2 <- missRanger(
    iris2, num.trees = 20L, verbose = 0L, seed = 1L, sample.fraction = 0.5
  )
  expect_false(isTRUE(all.equal(imp1, imp2)))
})

test_that("case.weights works", {
  imp2 <- missRanger(
    iris2, num.trees = 20L, verbose = 0L, seed = 1L, case.weights = rep(1, nrow(iris))
  )
  imp3 <- missRanger(
    iris2, num.trees = 20L, verbose = 0L, seed = 1L, case.weights = 1:nrow(iris)
  )
  
  expect_equal(imp1, imp2)
  expect_false(isTRUE(all.equal(imp1, imp3)))
  
  expect_error(missRanger(iris2, num.trees = 3L, verbose = 0L, case.weights = 1:7))
})

test_that("seed works", {
  imp2 <- missRanger(iris2, num.trees = 20L, verbose = 0L, seed = 1L)
  imp3 <- missRanger(iris2, num.trees = 20L, verbose = 0L, seed = 2L)
  
  expect_equal(imp1, imp2)
  expect_false(identical(imp1, imp3))
})

test_that("verbose can be suppressed", {
  expect_silent(missRanger(iris2, num.trees = 3L, verbose = 0L))
  capture_output(expect_message(missRanger(iris2, num.trees = 3L, verbose = 1L)))
  capture_output(expect_message(missRanger(iris2, num.trees = 3L, verbose = 2L)))
})

test_that("returnOOB works", {
  imp2 <- missRanger(iris2, num.trees = 20L, verbose = 0L, returnOOB = TRUE)
  
  expect_true(length(attributes(imp2)$oob) == ncol(iris))
  expect_null(attributes(imp1)$oob)
})

#===================================================
# DATA TYPE CONSISTENCY
#===================================================

n <- 200L

X <- data.frame(
  int = seq_len(n), 
  double = log(seq_len(n)), 
  char = rep(LETTERS[1:4], n %/% 4),
  fact = factor(rep(LETTERS[1:2], n %/% 2)),
  logi = seq_len(n) > n %/% 3,
  date = seq.Date(as.Date("2001-01-01"), length.out = n, by = "1 d")
)

X_NA <- generateNA(X[1:5], p = 0.2, seed = 1L)

test_that("Imputing retains data type (with/without PMM and univariately)", {
  imp1 <- missRanger(X_NA, num.trees = 20L, verbose = 0L, pmm.k = 0, seed = 1L)
  imp2 <- missRanger(X_NA, num.trees = 20L, verbose = 0L, pmm.k = 3, seed = 1L)
  imp3 <- missRanger(X_NA, . ~ 1, verbose = 0L, seed = 1L)

  for (imp in list(imp1, imp2, imp3)) {
    expect_true(is.numeric(imp$double))
    expect_true(is.numeric(imp$int))
    expect_true(is.logical(imp$logi))
    expect_true(is.character(imp$char))
   
    expect_true(is.factor(imp$fact))
    expect_equal(levels(imp$fact), levels(X$fact))
  }
})

# SELECTION OF "to_impute" and "impute_by"

# Are two vectors identical up to sorting?
.setequal <- function(x, y) {
  (length(x) == length(y)) && all(sort(x) == sort(y))
}

test_that("Only features with missings, but not all missings, are being imputed", {
  X2 <- generateNA(X, p = c(int = 1, double = 0.2), seed = 1L)
  imp <- missRanger(X2, num.trees = 20L, verbose = 0L, seed = 1L, data_only = FALSE)
  
  expect_equal(imp$to_impute, "double")
  expect_equal(unname(colSums(is.na(imp$data))), c(n, 0, 0, 0, 0, 0))
})

test_that("Date variables (and other non-standard variable types) are not imputed", {
  X2 <- generateNA(X, p = c(date = 0.3, int = 0.1), seed = 1L)
  imp <- missRanger(X2, num.trees = 20L, verbose = 0L, seed = 1L, data_only = FALSE)
  
  expect_equal(imp$to_impute, "int")
  expect_true(anyNA(imp$data$date))
})

test_that("Features with missings that are not imputed are not used for imputation", {
  X2 <- generateNA(X, p = c(int = 0.1, double = 0.2), seed = 1L)
  imp <- missRanger(
    X2, . - double ~ ., num.trees = 20L, verbose = 0L, seed = 1L, data_only = FALSE
  )
  
  expect_equal(imp$impute_by, setdiff(colnames(X2), "double"))
})

test_that("Constant features are not used for imputation", {
  X2 <- generateNA(X, p = c(int = 0.1, double = 0.2), seed = 1L) |> 
    transform(const = 1)
  imp <- missRanger(
    X2, num.trees = 20L, verbose = 0L, seed = 1L, data_only = FALSE
  )
  
  expect_equal(imp$impute_by, setdiff(colnames(X2), "const"))
})

test_that("list variables (and other strange types) cannot be used for imputation", {
  X2 <- generateNA(X, p = c(int = 0.1, double = 0.2), seed = 1L)
  X2$list <- as.list(replicate(n, NULL))
  imp <- missRanger(X2, num.trees = 20L, verbose = 0L, seed = 1L, data_only = FALSE)
  
  expect_equal(imp$impute_by, setdiff(colnames(X), "list"))
})

test_that("Multiple problems together", {
  X2 <- generateNA(X, p = 0.2, seed = 1L)
  X2$list <- as.list(replicate(n, NULL))
  X2$const <- 1
  
  suppressMessages(
    capture_output(
      imp <- missRanger(X2, . - double ~ ., num.trees = 20L, seed = 1L, data_only = FALSE) 
    )
  )
  
  xpected <- setdiff(colnames(X), c("double", "date"))  
  expect_equal(imp$impute_by, xpected)
  expect_true(.setequal(imp$to_impute, xpected))
})

# FORMULA PARSING

test_that("formula interface works with specified left and right side", {
  imp <- missRanger(
    X_NA,
    int ~ int + double,
    pmm = 3L,
    num.trees = 20L,
    verbose = 0L,
    seed = 1L,
    data_only = FALSE
  )
  na_per_col <- colSums(is.na(imp$data))
  
  expect_equal(unname(na_per_col["int"]), 0L)
  expect_true(all(na_per_col[-1L] >= 1L))
  expect_equal(imp$to_impute, "int")
  expect_equal(imp$impute_by, "int")
})

test_that("formula interface works with unspecified right side", {
  imp <- missRanger(
    X_NA,
    int + double ~ .,
    pmm = 3L,
    num.trees = 20L,
    verbose = 0L,
    seed = 1L,
    data_only = FALSE
  )
  na_per_col <- colSums(is.na(imp$data))
  
  expect_equal(unname(na_per_col[c("int", "double")]), c(0, 0))
  expect_true(all(na_per_col[-(1:2)] >= 1L))
  expect_true(.setequal(imp$to_impute, c("int", "double")))
  expect_equal(imp$impute_by, c("int", "double"))
})

test_that("formula interface works with unspecified left side", {
  imp <- missRanger(
    X_NA, . ~ int, pmm = 3L, num.trees = 20L, verbose = 0L, seed = 1L, data_only = FALSE
  )
  expect_true(!anyNA(imp$data))
  expect_true(.setequal(imp$to_impute, colnames(X_NA)))
  expect_equal(imp$impute_by, "int")
  
  # Prediction error is on its default 1 for int (issue #77)
  expect_equal(imp$pred_errors[, "int"], rep(1, length(imp$mean_pred_errors)))
})

test_that("dropping columns on left side leaves missing values", {
  imp <- missRanger(
    X_NA,
    . - int ~ .,
    pmm = 3L,
    num.trees = 20L,
    verbose = 0L,
    seed = 1L,
    data_only = FALSE
  )
  expect_equal(
    unname(colSums(is.na(imp$data)) > 0),
    c(TRUE, FALSE, FALSE, FALSE, FALSE)
  )
  xpected <- setdiff(colnames(X_NA), "int")
  expect_true(.setequal(imp$to_impute, xpected))
  expect_equal(imp$impute_by, xpected)
})

test_that("dropping columns on right side has an impact", {
  imp1 <- missRanger(
    X_NA, . ~ . - int, num.trees = 20L, verbose = 0L, seed = 1L, data_only = FALSE
  )
  imp2 <- missRanger(X_NA, num.trees = 20L, verbose = 0L, seed = 1L)
  
  expect_false(identical(imp1$data, imp2))
  expect_true(.setequal(imp1$to_impute, colnames(X_NA)))
  expect_equal(imp1$impute_by, setdiff(colnames(X_NA), "int"))
})

test_that("empty rhs equals univariate imputation", {
  imp1 <- missRanger(X_NA, . ~ 1, num.trees = 20L, verbose = 0L, seed = 1L)
  imp2 <- imputeUnivariate(X_NA, seed = 1L)

  imp3 <- missRanger(X_NA, int + char ~ 1, num.trees = 20L, verbose = 0L, seed = 1L)
  imp4 <- imputeUnivariate(X_NA, seed = 1L, v = c("int", "char"))
  
  expect_equal(imp1, imp2)
  expect_equal(imp3, imp4)
})

test_that("non-syntactic column names work with or without formula", {
  X_NA2 <- X_NA
  colnames(X_NA2)[1:2] <- c("1bad name", "2 also bad")
  
  imp1 <- missRanger(X_NA, num.trees = 20L, verbose = 0L, seed = 1L)
  imp2 <- missRanger(X_NA2, num.trees = 20L, verbose = 0L, seed = 1L)
  imp3 <- missRanger(
    . - `1bad name` ~ - ., data = X_NA2, num.trees = 20L, verbose = 0L, seed = 1L
  )
  
  expect_equal(colnames(X_NA2), colnames(imp2))
  expect_equal(imp1, setNames(imp2, colnames(imp1)))
  expect_false(anyNA(imp2))
  expect_equal(
    unname(colSums(is.na(imp3)) == 0L),
    c(FALSE, TRUE, TRUE, TRUE, TRUE)
  )
})

Try the missRanger package in your browser

Any scripts or data that you put into this service are public.

missRanger documentation built on Sept. 12, 2024, 7:15 a.m.