# tests/testthat/test-mcee-missing-data-error.R

# Verifies that mcee() stops with an informative error when its input holds
# NA/NaN/Inf. Every scenario corrupts a single column of an otherwise clean
# simulated data set and matches the error text against a regex that expects
# the column name and the offending row indices.
test_that("mcee errors cleanly on missing/NaN/Inf in core, moderator, and control vars", {
    set.seed(42)

    # ---- clean base data: 6 subjects with unequal numbers of decision points ----
    Ti <- c(5, 7, 6, 6, 8, 5)
    id <- rep(seq_along(Ti), times = Ti)
    dp <- sequence(Ti)
    n_rows <- length(dp)
    # scale() returns a one-column matrix; flatten it so derived columns stay plain vectors
    dp_std <- as.numeric(scale(dp))
    I <- rbinom(n_rows, 1, 0.9)
    A <- rbinom(n_rows, 1, 0.6)
    M <- rbinom(n_rows, 1, plogis(-0.2 + 0.3 * A + 0.1 * dp_std))
    Ytmp <- 0.5 * A + 0.6 * M + 0.08 * dp_std + rnorm(n_rows, 0, 0.2)
    # the outcome is constant within subject: every row carries its subject mean
    Y <- ave(Ytmp, id, FUN = mean)
    # Extra covariate that is neither a core column nor the mediator. It is drawn
    # last so the random draws for I, A, M and Y above are unaffected.
    Z <- rnorm(n_rows)

    d0 <- data.frame(id, dp, I, A, M, Y, Z, check.names = FALSE)

    # nontrivial weights: increasing in dp within subject, normalised to sum to 1 per subject
    dp_frac <- d0$dp / ave(d0$dp, d0$id, FUN = max)
    w <- ave(0.3 + 0.7 * dp_frac, d0$id, FUN = function(v) v / sum(v))

    # 1) Missing in a core column (dp)
    d1 <- d0
    d1$dp[c(3, 10)] <- NA_integer_
    expect_error(
        mcee(
            data = d1, id = "id", dp = "dp", outcome = "Y", treatment = "A", mediator = "M",
            availability = "I",
            time_varying_effect_form = ~1,
            control_formula_with_mediator = ~ dp + M,
            control_reg_method = "glm",
            rand_prob = 0.5,
            weight_per_row = w,
            verbose = FALSE
        ),
        regexp = "Missing/NaN/Inf.*dp.*rows\\s+3,\\s+10.*does not support handling missing data",
        ignore.case = TRUE
    )

    # 2) Inf in a moderator variable (dp is used in time_varying_effect_form)
    d2 <- d0
    d2$dp[c(2, 9)] <- Inf
    expect_error(
        mcee(
            data = d2, id = "id", dp = "dp", outcome = "Y", treatment = "A", mediator = "M",
            availability = "I",
            time_varying_effect_form = ~dp,
            control_formula_with_mediator = ~ dp + M,
            control_reg_method = "glm",
            rand_prob = 0.5,
            weight_per_row = w,
            verbose = FALSE
        ),
        regexp = "Missing/NaN/Inf.*dp.*rows\\s+2,\\s+9",
        ignore.case = TRUE
    )

    # 3) Missing in the mediator M. M is also in the control formula, but because it
    #    is passed as `mediator` this scenario alone does not show that control
    #    variables are checked; scenario 4 covers that.
    d3 <- d0
    d3$M[c(4, 12, 25)] <- NA_real_
    expect_error(
        mcee(
            data = d3, id = "id", dp = "dp", outcome = "Y", treatment = "A", mediator = "M",
            availability = "I",
            time_varying_effect_form = ~1,
            control_formula_with_mediator = ~ dp + M,
            control_reg_method = "glm",
            rand_prob = 0.5,
            weight_per_row = w,
            verbose = FALSE
        ),
        regexp = "Missing/NaN/Inf.*M.*rows\\s+4,\\s+12,\\s+25",
        ignore.case = TRUE
    )

    # 4) Missing in a control-only variable: Z enters mcee() solely through the
    #    control formula, so only a check on control variables can flag it.
    d4 <- d0
    d4$Z[c(5, 14, 30)] <- NA_real_
    expect_error(
        mcee(
            data = d4, id = "id", dp = "dp", outcome = "Y", treatment = "A", mediator = "M",
            availability = "I",
            time_varying_effect_form = ~1,
            control_formula_with_mediator = ~ dp + M + Z,
            control_reg_method = "glm",
            rand_prob = 0.5,
            weight_per_row = w,
            verbose = FALSE
        ),
        regexp = "Missing/NaN/Inf.*Z.*rows\\s+5,\\s+14,\\s+30",
        ignore.case = TRUE
    )
})

# mcee_general() is expected to fail with a message naming the column and rows
# when NA appears in a variable that is referenced only by one of the
# nuisance-model config formulas.
test_that("mcee_general errors cleanly on missing in config formula variables", {
    set.seed(99)

    # Clean simulated data: one row per subject and decision point.
    obs_per_id <- c(5, 7, 6, 6, 8, 5)
    id <- rep(seq_along(obs_per_id), times = obs_per_id)
    dp <- sequence(obs_per_id)
    n_rows <- length(dp)
    dp_std <- as.numeric(scale(dp))
    I <- rbinom(n_rows, 1, 0.9)
    A <- rbinom(n_rows, 1, 0.6)
    M <- rbinom(n_rows, 1, plogis(-0.2 + 0.3 * A + 0.1 * dp_std))
    noisy_y <- 0.5 * A + 0.6 * M + 0.08 * dp_std + rnorm(n_rows, 0, 0.2)
    # every row of a subject carries that subject's mean outcome
    Y <- ave(noisy_y, id, FUN = mean)
    # additional covariate, used below only inside config_q
    Z <- rnorm(n_rows)

    clean_df <- data.frame(id, dp, I, A, M, Y, Z, check.names = FALSE)

    # Per-row weights: increasing in dp within subject, then normalised per subject.
    dp_frac <- clean_df$dp / ave(clean_df$dp, clean_df$id, FUN = max)
    row_wts <- ave(0.3 + 0.7 * dp_frac, clean_df$id, FUN = function(v) v / sum(v))

    # Corrupt Z, which no core argument refers to.
    df_na <- clean_df
    df_na$Z[c(6, 13)] <- NA_real_

    # All five nuisance configs use method "glm" and differ only in their formula.
    # NOTE(review): the model family is not set here; presumably mcee_general picks
    # it per nuisance model - confirm against its documentation.
    glm_cfg <- function(rhs) list(method = "glm", formula = rhs)

    expect_error(
        mcee_general(
            data = df_na,
            id = "id",
            dp = "dp",
            outcome = "Y",
            treatment = "A",
            mediator = "M",
            availability = "I",
            time_varying_effect_form = ~1,
            config_p = glm_cfg(~dp),
            config_q = glm_cfg(~ dp + Z), # the only formula that mentions Z
            config_eta = glm_cfg(~dp),
            config_mu = glm_cfg(~ dp + M),
            config_nu = glm_cfg(~dp),
            weight_per_row = row_wts,
            verbose = FALSE
        ),
        regexp = "Missing/NaN/Inf.*Z.*rows\\s+6,\\s+13",
        ignore.case = TRUE
    )
})

# mcee_userfit_nuisance() takes pre-computed nuisance predictions as vectors.
# With clean data, a non-finite entry in one of those vectors should raise an
# error that names the argument and the affected row indices.
test_that("mcee_userfit_nuisance errors cleanly on missing in supplied nuisance vectors", {
    set.seed(123)

    # Clean simulated data: one row per subject and decision point.
    obs_per_id <- c(5, 7, 6, 6, 8, 5)
    id <- rep(seq_along(obs_per_id), times = obs_per_id)
    dp <- sequence(obs_per_id)
    n_rows <- length(dp)
    dp_std <- as.numeric(scale(dp))
    I <- rbinom(n_rows, 1, 0.9)
    A <- rbinom(n_rows, 1, 0.6)
    M <- rbinom(n_rows, 1, plogis(-0.2 + 0.3 * A + 0.1 * dp_std))
    noisy_y <- 0.5 * A + 0.6 * M + 0.08 * dp_std + rnorm(n_rows, 0, 0.2)
    # every row of a subject carries that subject's mean outcome
    Y <- ave(noisy_y, id, FUN = mean)

    clean_df <- data.frame(id, dp, I, A, M, Y, check.names = FALSE)

    # Per-row weights: increasing in dp within subject, then normalised per subject.
    dp_frac <- clean_df$dp / ave(clean_df$dp, clean_df$id, FUN = max)
    row_wts <- ave(0.3 + 0.7 * dp_frac, clean_df$id, FUN = function(v) v / sum(v))

    # Well-behaved nuisance predictions: p1 and q1 go through plogis() and so lie
    # in (0, 1); the remaining six are linear in dp (and M for mu1/mu0).
    p1 <- plogis(-0.1 + 0.02 * dp)
    q1 <- plogis(-0.2 + 0.02 * dp + 0.3 * M)
    eta0 <- 0.3 + 0.05 * dp
    eta1 <- 0.4 + 0.1 * dp
    mu0 <- 0.3 + 0.10 * dp + 0.1 * M
    mu1 <- 0.4 + 0.15 * dp + 0.2 * M
    nu0 <- 0.25 + 0.08 * dp
    nu1 <- 0.35 + 0.12 * dp

    # Case A: NA entries in p1.
    p1_bad <- replace(p1, c(2, 11, 17), NA_real_)

    expect_error(
        mcee_userfit_nuisance(
            data = clean_df,
            id = "id",
            dp = "dp",
            outcome = "Y",
            treatment = "A",
            mediator = "M",
            availability = "I",
            time_varying_effect_form = ~dp,
            p1 = p1_bad,
            q1 = q1,
            eta1 = eta1,
            eta0 = eta0,
            mu1 = mu1,
            mu0 = mu0,
            nu1 = nu1,
            nu0 = nu0,
            weight_per_row = row_wts,
            verbose = FALSE
        ),
        regexp = "Missing/NaN/Inf detected in 'p1'.*rows\\s+2,\\s+11,\\s+17.*does not support handling missing data",
        ignore.case = TRUE
    )

    # Case B: Inf entries in nu0, all other vectors clean.
    nu0_bad <- replace(nu0, c(3, 9), Inf)

    expect_error(
        mcee_userfit_nuisance(
            data = clean_df,
            id = "id",
            dp = "dp",
            outcome = "Y",
            treatment = "A",
            mediator = "M",
            availability = "I",
            time_varying_effect_form = ~1,
            p1 = p1,
            q1 = q1,
            eta1 = eta1,
            eta0 = eta0,
            mu1 = mu1,
            mu0 = mu0,
            nu1 = nu1,
            nu0 = nu0_bad,
            weight_per_row = row_wts,
            verbose = FALSE
        ),
        regexp = "Missing/NaN/Inf detected in 'nu0'.*rows\\s+3,\\s+9",
        ignore.case = TRUE
    )
})

# When several columns contain missing values, the single error raised by mcee()
# should report every offending column together with its row indices.
test_that("mcee_* missing-data message aggregates multiple offenders", {
    set.seed(7)

    # Balanced design: 5 subjects x 5 decision points, always available.
    n <- 5
    Ti <- rep(5, n)
    id <- rep(seq_len(n), Ti)
    dp <- sequence(Ti)
    I <- rep(1, length(dp))
    A <- rbinom(length(dp), 1, 0.5)
    M <- rbinom(length(dp), 1, 0.5)
    # every row of a subject carries that subject's mean outcome
    Y <- ave(0.2 * A + 0.3 * M + rnorm(length(dp), 0, .1), id, FUN = mean)

    d0 <- data.frame(id, dp, I, A, M, Y)

    # Make both Y and M missing in a few rows (the message should list both variables)
    d1 <- d0
    d1$Y[c(4, 7)] <- NA_real_
    d1$M[c(3, 8)] <- NA_real_

    w <- rep(1, nrow(d1))

    # Capture the error once, then check each offender on its own. A single
    # regex of the form (A|B).*(A|B) would also accept a message that mentions the
    # same variable twice and never reports the other one.
    err <- expect_error(
        mcee(
            data = d1, id = "id", dp = "dp", outcome = "Y", treatment = "A", mediator = "M",
            availability = "I",
            time_varying_effect_form = ~1,
            control_formula_with_mediator = ~ dp + M,
            control_reg_method = "glm",
            rand_prob = 0.5,
            weight_per_row = w,
            verbose = FALSE
        )
    )
    msg <- conditionMessage(err)
    expect_match(msg, "Y.*rows\\s+4,\\s+7", ignore.case = TRUE)
    expect_match(msg, "M.*rows\\s+3,\\s+8", ignore.case = TRUE)
})

# Try the MRTAnalysis package in your browser
#
# Any scripts or data that you put into this service are public.
#
# MRTAnalysis documentation built on Sept. 9, 2025, 5:41 p.m.