# test.fill.NAs.R
# In optmatch: Functions for Optimal Matching

################################################################################
# fill.NAs tests
################################################################################

# Declare the testthat context label under which the tests below are grouped.
context("fill.NAs")

test_that("Basic Tests", {
  # fill.NAs accepts a bare data frame and hands a data frame back.
  expect_is(fill.NAs(data.frame(1)), "data.frame")

  # A formula with no accompanying data is rejected.
  expect_error(fill.NAs(y ~ x))

  # Columns a and b are complete; c is NA in the third slot of every run of
  # five values.
  sample.df <- data.frame(a = 1:100, b = 100:1, c = rep(c(1, 2, NA, 3, 4), 20))

  # Formula plus data frame in, data frame out. Neither a nor b has missing
  # values, so only the two model columns are expected.
  result <- fill.NAs(a ~ b, sample.df)
  expect_is(result, "data.frame")
  expect_equal(length(result), 2)

  # One missingness-indicator column is added on top of the three originals,
  # with or without all.covs.
  expect_equal(dim(fill.NAs(sample.df))[2], 4)
  expect_equal(dim(fill.NAs(sample.df, all.covs = TRUE))[2], 4)

  result <- fill.NAs(sample.df)
  # The indicator (4th column) is TRUE exactly where c was NA, i.e. at the
  # third position of every block of five rows. TRUE/FALSE are spelled out
  # because T and F are ordinary, reassignable variables.
  expect_identical(result[[4]], rep(c(FALSE, FALSE, TRUE, FALSE, FALSE), 20))

  # The indicator column is named after its source column with a ".NA" suffix.
  expect_identical(colnames(result)[4], "c.NA")
})

test_that("Function expansion", {
  # For variables wrapped in function calls, only the underlying variable
  # should be expanded into an NA indicator column.
  sample.df <- data.frame(a = 1:100, b = 100:1, c = rep(c(1, 2, NA, 3, 4), 20))

  # Two transformations of the same variable still yield a single indicator:
  # a, log(c), sqrt(c) and c.NA make four columns. This check does not need
  # splines, so it runs before the skip gate below.
  imputed.fmla <- fill.NAs(a ~ log(c) + sqrt(c), data = sample.df)
  expect_equal(dim(imputed.fmla)[2], 4)

  # Everything below needs the splines package; report a skip (rather than a
  # silent pass) when it is unavailable.
  skip_if_not_installed("splines")

  result <- fill.NAs(a ~ splines::ns(c, df = 3), sample.df)
  expect_equal(length(result), 5)
  expect_equal(colnames(result)[1], "a")
})

test_that("Matrices are valid", {
  # The whole test relies on splines::ns(); report a skip (rather than a
  # silent pass) when the package is unavailable.
  skip_if_not_installed("splines")

  # Same data as the data-frame tests, but supplied as a numeric matrix.
  sample.df <- as.matrix(data.frame(a = 1:100, b = 100:1,
                                    c = rep(c(1, 2, NA, 3, 4), 20)))

  # A matrix input should produce the same shape of result as a data frame:
  # five columns with the response "a" first.
  result <- fill.NAs(a ~ splines::ns(c, df = 3), sample.df)
  expect_equal(length(result), 5)
  expect_equal(colnames(result)[1], "a")
})

test_that("Results pass to lm()", {
  sample.df <- data.frame(a = 1:100, c = rep(c(1, 2, NA, 3, 4), 20))

  # Two routes to a filled-in data set: via a formula, and via the raw frame.
  imputed.fmla <- fill.NAs(a ~ log(c), data = sample.df)
  imputed.frame <- fill.NAs(sample.df)

  # The formula-style result is handed to lm() directly; the frame-style
  # result needs the model spelled out, including the c.NA indicator.
  m1 <- lm(imputed.fmla)
  m2 <- lm(a ~ log(c) + c.NA, data = imputed.frame)

  # In m1 the log(c) term shows up back-quoted as `log(c)`. Strip the
  # back-quotes and require the coefficient names to match otherwise.
  # coef() is used instead of `$coef`, which only works through partial
  # matching of the `coefficients` component.
  expect_identical(gsub("`", "", names(coef(m1))), names(coef(m2)))

})

test_that("Response not imputed by default", {

  #### Do not impute response, only covariates
  # Y (response) and X (covariate) each contain one NA.
  naresponse.df <- data.frame(Y = c(1, 2, 3, NA, 5), X = c(10, 20, NA, 40, 50))
  imputed.response <- fill.NAs(Y ~ X, naresponse.df)
  expect_true(any(is.na(imputed.response$Y)))   # response left untouched
  expect_false(any(is.na(imputed.response$X)))  # covariate filled in

  #### Impute when all.covs = TRUE

  # formula style
  imputed.all <- fill.NAs(Y ~ X, naresponse.df, all.covs = TRUE)
  expect_false(any(is.na(imputed.all)))

  # model frame style
  imputed.all <- fill.NAs(naresponse.df, all.covs = TRUE)
  expect_false(any(is.na(imputed.all)))

})

test_that("Transform, then impute", {

  #### Transform, then impute ####
  #### The strategy noted by the original author is to use model.matrix
  #### before imputing. The expected values below are consistent with that:
  #### the X1:X2 column is filled in on its own rather than recomputed from
  #### the filled-in X1 and X2.
  raw <- data.frame(Y = c(1, 2, 3, 4, 5),
                    X1 = c(2, 2, 4, NA, 4),
                    X2 = c(NA, 10, 20, 30, NA))
  filled <- fill.NAs(Y ~ X1 * X2, data = raw)

  # Six columns are expected: Y, X1, X2, X1:X2, X1.NA and X2.NA.
  expect_equal(ncol(filled), 6)

  # Main effects: each NA is replaced by a single fill value per column.
  expect_identical(filled[["X1"]], c(2, 2, 4, 3, 4))
  expect_identical(filled[["X2"]], c(20, 10, 20, 30, 20))

  # Interaction: observed products are 20 and 80; the remaining rows get 50.
  expect_equal(filled[["X1:X2"]], c(50, 20, 80, 50, 50))

  # A single covariate with missingness gives Y, X1 and one indicator column.
  single.term <- fill.NAs(Y ~ X1, data = raw)
  expect_equal(length(single.term), 3)
})

test_that("response variables with complex names", {
  # Punch one hole in a response component and one in a covariate.
  data(nuclearplants)
  nuclearplants$cost[1] <- NA
  nuclearplants$cap[2] <- NA

  # The response column of this model frame is literally named `cost + t1`.
  fit.sum <- lm(cost + t1 ~ cap + pr, data = nuclearplants)
  frame.sum <- model.frame(fit.sum, na.action = na.pass)

  # Same frame with a plain response name, to check that the special
  # characters in the original name do not change the result.
  frame.plain <- frame.sum
  names(frame.plain)[1] <- "costplust1"

  filled.sum <- fill.NAs(frame.sum, all.covs = TRUE)
  filled.plain <- fill.NAs(frame.plain, all.covs = TRUE)
  expect_true(all(filled.sum == filled.plain))


  # Addressing issue #100: a cbind() response puts a matrix column in the frame.
  fit.cbind <- lm(cbind(cost, t1) ~ cap + pr, data = nuclearplants)
  frame.cbind <- model.frame(fit.cbind, na.action = na.pass)

  # Until #104 is fixed, this should fail with an informative message.
  expect_error(fill.NAs(frame.cbind), "matrix columns not supported")

  ## Disabled for now. See issue 104 for details on the probable solution.
  ## frame.cbind.renamed <- frame.cbind
  ## names(frame.cbind.renamed)[1] <- "cbind"
  ## expect_true(all(fill.NAs(frame.cbind, all.covs=TRUE) ==
  ##                 fill.NAs(frame.cbind.renamed, all.covs=TRUE)))

})

test_that("strata() function handling", {

  # Fixed seed: the random draws below (and their order) determine which
  # rows and strata levels are involved in every expectation that follows.
  set.seed(20150624)

  data.full <- data.frame(z = c(rep(1, 10), rep(0, 10)),
                          x = rnorm(20),
                          s = sample(c("A", "B", "C"), size = 20, replace = TRUE),
                          t = sample(c("UP", "DOWN"), size = 20, replace = TRUE))
  data.full$x[c(1, 2, 11)] <- NA

  # basic strata handling without NAs in the strata variables
  res1 <- fill.NAs(z ~ x + strata(s), data = data.full)
  expect_equal(dim(res1), c(20, 4)) # do not expand strata variable
  expect_false(any(is.na(res1)))

  res2 <- fill.NAs(z ~ x + strata(s) + strata(t), data = data.full)
  expect_equal(dim(res2), c(20, 5))
  expect_false(any(is.na(res2)))

  # now, let's knock out some strata levels
  data.NAs <- data.full
  data.NAs$s[sample(1:20, size = 3)] <- NA

  # Missing strata values are left as NA rather than filled in.
  res3 <- fill.NAs(z ~ x + strata(s), data = data.NAs)
  expect_equal(sum(is.na(res3$s)), 3)
  # The following fit should not error per #103. The result is captured and
  # checked so the intent is an explicit expectation, not a bare call.
  fit3 <- glm(z ~ x + x.NA + strata(s), family = binomial, data = res3)
  expect_is(fit3, "glm")

  # Our version of `strata` doesn't support na.group
  #res4 <- fill.NAs(z ~ x + strata(s, na.group = TRUE), data = data.NAs)
  #expect_false(any(is.na(res4$s)))
  # Again, this should not error per #103
  #glm(z ~ x + x.NA + strata(s, na.group=TRUE), family=binomial, data=res4)

  ## checking for terms attribute on the returned data.frame
  tt <- terms(res1)
  expect_false(is.null(attr(tt, "specials")$strata)) # the strata term is marked as such

  # if we spell things out, we should get a model on the imputed values
  xx <- glm(z ~ x + x.NA + strata(s), data = res1, family = binomial)
  expect_true(all(names(coef(xx))[1:3] %in% c("(Intercept)", "x", "x.NATRUE")))
  ## 4th coeff sometimes turns up as "strata(s)B", other times as "strata(s)s=s=B"
  ## the latter is less desirable, but no time to explore circumventing it.
  ## (Seen with: survival_2.37-7 ; R 3.1.2; x86_64-apple-darwin12.6.0, 64-bit.)
  ## Each pattern must match exactly one coefficient, at the stated position;
  ## expect_equal reports the actual positions if that is not the case.
  expect_equal(grep(glob2rx("strata*B"), names(coef(xx))), 4)
  expect_equal(grep(glob2rx("strata*C"), names(coef(xx))), 5)

  # does not work yet:
  # yy <- glm(res1)
  # expect_equivalent(xx, yy)

  ## imputation should be per stratum: the filled-in x must differ from what
  ## an unstratified call produces.
  expect_false(all(res1$x == (fill.NAs(z ~ x, data = data.full))$x))
})

test_that("Checking for fix to factors in fill.nas, mentioned in #103", {

  # Introduce a single missing value in t1, which enters the model through
  # factor().
  data(nuclearplants)
  nuclearplants$t1[1] <- NA

  # The filled-in result is passed to glm() as-is; getting a "glm" object
  # back is the whole check.
  filled <- fill.NAs(pr ~ cap + factor(t1), data = nuclearplants)
  fit <- glm(filled, family = binomial)
  expect_is(fit, "glm")

})