test-fit.mult.impute-C&A.R
In Greg: Regression Helper Functions

############################
# Tests for the            #
# fit.mult.impute          #
# in printCrudeAndAdjusted #
# issues                   #
############################

suppressMessages(library("rms"))

test_that("Check regular linear regression with getC&A", {
  set.seed(10)
  n <- 500
  ds <- data.frame(
    ftime = rexp(n),
    fstatus = sample(0:1, size = n, replace = TRUE),
    x1 = factor(sample(LETTERS[1:4], size = n, replace = TRUE)),
    x2 = rnorm(n, mean = 3, 2),
    x3 = factor(sample(letters[1:3], size = n, replace = TRUE)),
    subsetting = factor(sample(c(TRUE, FALSE), size = n, replace = TRUE))
  )

  ds$missing_var_1 <- factor(sample(letters[1:4], size = n, replace = TRUE))
  ds$missing_var_2 <- factor(sample(letters[1:4], size = n, replace = TRUE))
  ds$y <- rnorm(nrow(ds)) +
    (as.numeric(ds$x1) - 1) * 1 +
    (as.numeric(ds$missing_var_1) - 1) * 1 +
    (as.numeric(ds$missing_var_2) - 1) * .5

  # Create a messy missing variable
  non_random_missing <- sample(which(ds$missing_var_1 %in% c("b", "d")),
    size = 150, replace = FALSE
  )
  # Restrict the non-random number on the x2 variables
  non_random_missing <- non_random_missing[non_random_missing %in%
    which(ds$x2 > mean(ds$x2) * 1.5) &
    non_random_missing %in%
      which(ds$x2 > mean(ds$y))]
  ds$missing_var_1[non_random_missing] <- NA

  # Simple missing
  ds$missing_var_2[sample(1:nrow(ds), size = 50)] <- NA

  # Setup the rms environment
  ddist <<- datadist(ds)
  options(datadist = "ddist")

  fit_ols <- ols(y ~ x1 + x2 + x3 + missing_var_1 + missing_var_2, data = ds)
  impute_formula <- as.formula(paste("~",
                                     paste(all.vars(as.formula(fit_ols)),
                                           collapse = "+"
                                     )))

  sink(file = ifelse(Sys.info()["sysname"] == "Windows",
    "NUL",
    "/dev/null"
  ))
  imp_ds <- aregImpute(impute_formula, data = ds, n.impute = 10)

  fmult <- suppressWarnings(fit.mult.impute(formula(fit_ols),
    fitter = lm, xtrans = imp_ds, data = ds
  ))

  a <- suppressWarnings(getCrudeAndAdjustedModelData(fmult))
  sink()
  expect_true(sum(a[, "Adjusted"] - coef(fmult)) < .Machine$double.eps)

  sink(file = ifelse(Sys.info()["sysname"] == "Windows",
    "NUL",
    "/dev/null"
  ))
  fmult <- suppressWarnings(fit.mult.impute(formula(fmult),
    fitter = ols, xtrans = imp_ds, data = ds
  ))

  a <- suppressWarnings(getCrudeAndAdjustedModelData(fmult))
  sink()
  # Remove the intercept as the fitter was ols
  expect_true(sum(a[, "Adjusted"] - coef(fmult)[-1]) < .Machine$double.eps)

  sink(file = ifelse(Sys.info()["sysname"] == "Windows",
    "NUL",
    "/dev/null"
  ))
  single_fit <- suppressWarnings(fit.mult.impute(y ~ missing_var_1,
    fitter = ols, xtrans = imp_ds, data = ds
  ))
  sink()
  expect_true(sum(a[grep("missing_var_1", rownames(a)), "Crude"] -
    coef(single_fit)[grep(
      "missing_var_1",
      names(coef(single_fit))
    )]) <
    .Machine$double.eps)
})

test_that("Check regular linear regression with printC&A", {
  set.seed(10)
  n <- 500
  ds <- data.frame(
    ftime = rexp(n),
    fstatus = sample(0:1, size = n, replace = TRUE),
    x1 = factor(sample(LETTERS[1:4], size = n, replace = TRUE)),
    x2 = rnorm(n, mean = 3, 2),
    x3 = factor(sample(letters[1:3], size = n, replace = TRUE)),
    subsetting = factor(sample(c(TRUE, FALSE), size = n, replace = TRUE))
  )

  ds$missing_var_1 <- factor(sample(letters[1:4], size = n, replace = TRUE))
  ds$missing_var_2 <- factor(sample(letters[1:4], size = n, replace = TRUE))
  ds$y <- rnorm(nrow(ds)) +
    (as.numeric(ds$x1) - 1) * 1 +
    (as.numeric(ds$missing_var_1) - 1) * 1 +
    (as.numeric(ds$missing_var_2) - 1) * .5

  # Create a messy missing variable
  non_random_missing <- sample(which(ds$missing_var_1 %in% c("b", "d")),
    size = 150, replace = FALSE
  )
  # Restrict the non-random number on the x2 variables
  non_random_missing <- non_random_missing[non_random_missing %in%
    which(ds$x2 > mean(ds$x2) * 1.5) &
    non_random_missing %in%
      which(ds$x2 > mean(ds$y))]
  ds$missing_var_1[non_random_missing] <- NA

  # Simple missing
  ds$missing_var_2[sample(1:nrow(ds), size = 50)] <- NA

  # Setup the rms environment
  ddist <<- datadist(ds)
  options(datadist = "ddist")

  fit_ols <- ols(y ~ x1 + x2 + x3 + missing_var_1 + missing_var_2, data = ds)
  
  sink(file = ifelse(Sys.info()["sysname"] == "Windows",
    "NUL",
    "/dev/null"
  ))
  imp_ds <- aregImpute(~y + x1 + x2 + x3 + missing_var_1 + missing_var_2, data = ds, n.impute = 10)

  fmult <- suppressWarnings(fit.mult.impute(formula(fit_ols),
    lm, imp_ds,
    data = ds
  ))

  a <- suppressWarnings(printCrudeAndAdjustedModel(fmult))
  sink()
  expect_equivalent(tail(attr(a, "rgroup"), 1), "missing_var_2")

  expect_equivalent(tail(rownames(a), 4),
    c(
      tail(levels(ds$missing_var_1), 1),
      levels(ds$missing_var_2)[-1]
    ),
    info = paste(
      "The reference should not be included by default",
      "without specifying add_reference"
    )
  )

  expect_equivalent(ncol(a), 4)

  sink(file = ifelse(Sys.info()["sysname"] == "Windows",
    "NUL",
    "/dev/null"
  ))
  a <- suppressWarnings(printCrudeAndAdjustedModel(fmult, add_references = TRUE))
  sink()
  expect_equivalent(tail(rownames(a), 5),
    c(
      tail(levels(ds$missing_var_1), 1),
      levels(ds$missing_var_2)
    ),
    info = paste(
      "The reference should be included ",
      "if add_reference == TRUE"
    )
  )
  expect_equivalent(ncol(a), 4)

  sink(file = ifelse(Sys.info()["sysname"] == "Windows",
    "NUL",
    "/dev/null"
  ))
  coef_change_tst <-
    suppressWarnings(printCrudeAndAdjustedModel(fmult,
      add_references = TRUE,
      impute_args = list(coef_change = TRUE)
    ))
  coef_change_tst_name <-
    suppressWarnings(printCrudeAndAdjustedModel(fmult,
      add_references = TRUE,
      impute_args = list(coef_change = list(name = "cc test"))
    ))
  vi_infl_tst_name <-
    suppressWarnings(printCrudeAndAdjustedModel(fmult,
      add_references = TRUE,
      impute_args = list(variance.inflation = list(name = "vi test"))
    ))
  sink()
  expect_equivalent(ncol(coef_change_tst), 5,
    info = "When using fit.mult.impute additional columns should be added"
  )

  expect_equivalent(tail(colnames(coef_change_tst_name), 1),
    "cc test",
    info = "When using fit.mult.impute additional columns should be nameable"
  )

  expect_equivalent(tail(colnames(vi_infl_tst_name), 1),
    "vi test",
    info = "When using fit.mult.impute additional columns should be nameable"
  )
})

# test_that("Check logistic, survival, and poisson for antilog and more", {
#   set.seed(10)
#   n <- 500
#   ds <- data.frame(
#     ftime = rexp(n),
#     fstatus = sample(0:1, size = n, replace = TRUE),
#     x1 = factor(sample(LETTERS[1:4], size = n, replace = TRUE)),
#     x2 = rnorm(n, mean = 3, 2),
#     x3 = factor(sample(letters[1:3], size = n, replace = TRUE)),
#     subsetting = factor(sample(c(TRUE, FALSE), size = n, replace = TRUE))
#   )
# 
#   ds$missing_var_1 <- factor(sample(letters[1:4], size = n, replace = TRUE))
#   ds$missing_var_2 <- factor(sample(letters[1:4], size = n, replace = TRUE))
#   ds$y <- rnorm(nrow(ds)) +
#     (as.numeric(ds$x1) - 1) * 1 +
#     (as.numeric(ds$missing_var_1) - 1) * 1 +
#     (as.numeric(ds$missing_var_2) - 1) * .5
# 
#   # Create a messy missing variable
#   non_random_missing <- sample(which(ds$missing_var_1 %in% c("b", "d")),
#     size = 150, replace = FALSE
#   )
#   # Restrict the non-random number on the x2 variables
#   non_random_missing <- non_random_missing[non_random_missing %in%
#     which(ds$x2 > mean(ds$x2) * 1.5) &
#     non_random_missing %in%
#       which(ds$x2 > mean(ds$y))]
#   ds$missing_var_1[non_random_missing] <- NA
# 
#   # Simple missing
#   ds$missing_var_2[sample(1:nrow(ds), size = 50)] <- NA
# 
#   # Setup the rms environment
#   ddist <<- datadist(ds)
#   options(datadist = "ddist")
# 
#   s <- Surv(ds$ftime, ds$fstatus == 1)
#   sink(file = ifelse(Sys.info()["sysname"] == "Windows",
#     "NUL",
#     "/dev/null"
#   ))
# 
#   imp_ds <- aregImpute(as.formula(paste(
#     "~",
#     paste(colnames(ds),
#       collapse = "+"
#     )
#   )),
#   data = ds, n.impute = 10
#   )
# 
#   fit <- suppressWarnings(fit.mult.impute(s ~ x1 + x2 + strat(x3) + missing_var_1 + missing_var_2,
#   fitter = cph, data = ds, xtrans = imp_ds
#   ))
# 
#   fit <- coxph(s ~ x1 + x2 + strata(x3) +
#     missing_var_1 + missing_var_2, data = ds)
#   suppressWarnings(printCrudeAndAdjustedModel(fit))
#   sink()
# 
#   fit <- glm(fstatus ~ offset(log(ftime)) + x1 + x2 + x3, data = ds, family = binomial)
# 
#   # TODO: add tests
# })