test-missing_value_estimation.R
In selection.index: Analysis of Selection Index in Plant Breeding

# Test the modular missing_value_estimation function

# NOTE: Tests for gen_varcov() and phen_varcov() integration require the full
# package to be loaded (via devtools::test() or R CMD check). They will fail
# when running this file individually with test_file().

test_that("missing_value_estimation works with all three methods", {
  skip_on_cran() # heavy cross-products / TRE regex — bypass CRAN sanitizers
  # Create sample RCBD data with missing values
  set.seed(123)
  n_gen <- 5
  n_rep <- 3
  n_traits <- 4
  n_obs <- n_gen * n_rep

  # Generate test data
  data_mat <- matrix(rnorm(n_obs * n_traits), nrow = n_obs, ncol = n_traits)
  colnames(data_mat) <- paste0("trait", 1:n_traits)

  gen_idx <- rep(1:n_gen, each = n_rep)
  rep_idx <- rep(1:n_rep, times = n_gen)

  # Introduce missing values
  missing_positions <- c(3, 7, 12, 15)
  data_with_missing <- data_mat
  data_with_missing[missing_positions, 1] <- NA
  data_with_missing[c(5, 10), 2] <- NA

  # Test REML method
  result_reml <- selection.index:::missing_value_estimation(data_with_missing, gen_idx, rep_idx, method = "REML")
  expect_true(all(is.finite(result_reml)))
  expect_equal(dim(result_reml), dim(data_with_missing))
  expect_false(any(is.na(result_reml)))

  # Test Yates method
  result_yates <- selection.index:::missing_value_estimation(data_with_missing, gen_idx, rep_idx, method = "Yates")
  expect_true(all(is.finite(result_yates)))
  expect_equal(dim(result_yates), dim(data_with_missing))
  expect_false(any(is.na(result_yates)))

  # Test Healy & Westmacott method
  result_healy <- selection.index:::missing_value_estimation(data_with_missing, gen_idx, rep_idx, method = "Healy")
  expect_true(all(is.finite(result_healy)))
  expect_equal(dim(result_healy), dim(data_with_missing))
  expect_false(any(is.na(result_healy)))

  # Test Regression method
  result_regression <- selection.index:::missing_value_estimation(data_with_missing, gen_idx, rep_idx, method = "Regression")
  expect_true(all(is.finite(result_regression)))
  expect_equal(dim(result_regression), dim(data_with_missing))
  expect_false(any(is.na(result_regression)))

  # Test Mean substitution method
  result_mean <- selection.index:::missing_value_estimation(data_with_missing, gen_idx, rep_idx, method = "Mean")
  expect_true(all(is.finite(result_mean)))
  expect_equal(dim(result_mean), dim(data_with_missing))
  expect_false(any(is.na(result_mean)))

  # Test Bartlett method
  result_bartlett <- selection.index:::missing_value_estimation(data_with_missing, gen_idx, rep_idx, method = "Bartlett")
  expect_true(all(is.finite(result_bartlett)))
  expect_equal(dim(result_bartlett), dim(data_with_missing))
  expect_false(any(is.na(result_bartlett)))

  # Test that non-missing values are preserved
  non_missing_idx <- which(is.finite(data_with_missing[, 3]))
  expect_equal(result_reml[non_missing_idx, 3], data_with_missing[non_missing_idx, 3])
  expect_equal(result_yates[non_missing_idx, 3], data_with_missing[non_missing_idx, 3])
  expect_equal(result_healy[non_missing_idx, 3], data_with_missing[non_missing_idx, 3])
  expect_equal(result_regression[non_missing_idx, 3], data_with_missing[non_missing_idx, 3])
  expect_equal(result_mean[non_missing_idx, 3], data_with_missing[non_missing_idx, 3])
  expect_equal(result_bartlett[non_missing_idx, 3], data_with_missing[non_missing_idx, 3])
})

test_that("missing_value_estimation returns data unchanged when no missing values", {
  set.seed(456)
  data_mat <- matrix(rnorm(30), nrow = 10, ncol = 3)
  gen_idx <- rep(1:5, each = 2)
  rep_idx <- rep(1:2, times = 5)

  result <- selection.index:::missing_value_estimation(data_mat, gen_idx, rep_idx)
  expect_equal(result, data_mat)
})

test_that("gen.varcov and phen.varcov use missing_value_estimation correctly", {
  skip_on_cran() # heavy cross-products / TRE regex — bypass CRAN sanitizers
  # Load test data
  data(seldata, package = "selection.index")

  # Create data with missing values
  test_data <- seldata[, 3:9]
  test_data[3, 1] <- NA
  test_data[7, 2] <- NA
  test_data[15, 3] <- NA

  # Test that functions work with all three methods
  gen_reml <- selection.index:::gen_varcov(
    data = test_data,
    genotypes = seldata$treat,
    replication = seldata$rep,
    method = "REML"
  )
  expect_true(is.matrix(gen_reml))
  expect_equal(nrow(gen_reml), ncol(test_data))

  gen_yates <- selection.index:::gen_varcov(
    data = test_data,
    genotypes = seldata$treat,
    replication = seldata$rep,
    method = "Yates"
  )
  expect_true(is.matrix(gen_yates))
  expect_equal(nrow(gen_yates), ncol(test_data))

  gen_healy <- selection.index:::gen_varcov(
    data = test_data,
    genotypes = seldata$treat,
    replication = seldata$rep,
    method = "Healy"
  )
  expect_true(is.matrix(gen_healy))
  expect_equal(nrow(gen_healy), ncol(test_data))

  gen_regression <- selection.index:::gen_varcov(
    data = test_data,
    genotypes = seldata$treat,
    replication = seldata$rep,
    method = "Regression"
  )
  expect_true(is.matrix(gen_regression))
  expect_equal(nrow(gen_regression), ncol(test_data))

  gen_mean <- selection.index:::gen_varcov(
    data = test_data,
    genotypes = seldata$treat,
    replication = seldata$rep,
    method = "Mean"
  )
  expect_true(is.matrix(gen_mean))
  expect_equal(nrow(gen_mean), ncol(test_data))

  gen_bartlett <- selection.index:::gen_varcov(
    data = test_data,
    genotypes = seldata$treat,
    replication = seldata$rep,
    method = "Bartlett"
  )
  expect_true(is.matrix(gen_bartlett))
  expect_equal(nrow(gen_bartlett), ncol(test_data))

  phen_reml <- selection.index:::phen_varcov(
    data = test_data,
    genotypes = seldata$treat,
    replication = seldata$rep,
    method = "REML"
  )
  expect_true(is.matrix(phen_reml))
  expect_equal(nrow(phen_reml), ncol(test_data))

  phen_yates <- selection.index:::phen_varcov(
    data = test_data,
    genotypes = seldata$treat,
    replication = seldata$rep,
    method = "Yates"
  )
  expect_true(is.matrix(phen_yates))
  expect_equal(nrow(phen_yates), ncol(test_data))

  phen_healy <- selection.index:::phen_varcov(
    data = test_data,
    genotypes = seldata$treat,
    replication = seldata$rep,
    method = "Healy"
  )
  expect_true(is.matrix(phen_healy))
  expect_equal(nrow(phen_healy), ncol(test_data))

  phen_regression <- selection.index:::phen_varcov(
    data = test_data,
    genotypes = seldata$treat,
    replication = seldata$rep,
    method = "Regression"
  )
  expect_true(is.matrix(phen_regression))
  expect_equal(nrow(phen_regression), ncol(test_data))

  phen_mean <- selection.index:::phen_varcov(
    data = test_data,
    genotypes = seldata$treat,
    replication = seldata$rep,
    method = "Mean"
  )
  expect_true(is.matrix(phen_mean))
  expect_equal(nrow(phen_mean), ncol(test_data))

  phen_bartlett <- selection.index:::phen_varcov(
    data = test_data,
    genotypes = seldata$treat,
    replication = seldata$rep,
    method = "Bartlett"
  )
  expect_true(is.matrix(phen_bartlett))
  expect_equal(nrow(phen_bartlett), ncol(test_data))
})

test_that("functions warn when missing values present but method not specified", {
  skip_on_cran() # heavy cross-products / TRE regex — bypass CRAN sanitizers
  # Load test data
  data(seldata, package = "selection.index")

  # Create data with missing values
  test_data <- seldata[, 3:9]
  test_data[3, 1] <- NA
  test_data[7, 2] <- NA

  # Test gen.varcov warns when method not provided
  expect_warning(
    selection.index:::gen_varcov(data = test_data, genotypes = seldata$treat, replication = seldata$rep),
    "Missing values detected in data. Using default method 'REML' for imputation."
  )

  # Test phen.varcov warns when method not provided
  expect_warning(
    selection.index:::phen_varcov(data = test_data, genotypes = seldata$treat, replication = seldata$rep),
    "Missing values detected in data. Using default method 'REML' for imputation."
  )

  # Test no warning when method is explicitly provided
  expect_silent(
    selection.index:::gen_varcov(data = test_data, genotypes = seldata$treat, replication = seldata$rep, method = "REML")
  )

  expect_silent(
    selection.index:::phen_varcov(data = test_data, genotypes = seldata$treat, replication = seldata$rep, method = "Mean")
  )
})

test_that("functions work without warning when no missing values", {
  skip_on_cran() # heavy cross-products / TRE regex — bypass CRAN sanitizers
  # Load test data
  data(seldata, package = "selection.index")

  # Use complete data (no missing values)
  test_data <- seldata[, 3:9]

  # Should not warn even without method specified
  expect_silent(
    selection.index:::gen_varcov(data = test_data, genotypes = seldata$treat, replication = seldata$rep)
  )

  expect_silent(
    selection.index:::phen_varcov(data = test_data, genotypes = seldata$treat, replication = seldata$rep)
  )

  # Verify matrices are returned
  result_gen <- selection.index:::gen_varcov(data = test_data, genotypes = seldata$treat, replication = seldata$rep)
  result_phen <- selection.index:::phen_varcov(data = test_data, genotypes = seldata$treat, replication = seldata$rep)

  expect_true(is.matrix(result_gen))
  expect_true(is.matrix(result_phen))
  expect_equal(nrow(result_gen), ncol(test_data))
  expect_equal(nrow(result_phen), ncol(test_data))
})

# ==============================================================================
# TEST: Latin Square Design (LSD) support
# ==============================================================================

test_that("missing_value_estimation works with LSD design", {
  skip_on_cran() # heavy cross-products / TRE regex — bypass CRAN sanitizers
  # Create LSD test data
  set.seed(123)
  n <- 5
  n_obs <- n * n

  lsd_data <- data.frame(
    treat = rep(1:n, each = n),
    row = rep(1:n, times = n),
    col = as.vector(sapply(0:(n - 1), function(i) (1:n + i - 1) %% n + 1)),
    trait1 = rnorm(n_obs, mean = 10, sd = 2),
    trait2 = rnorm(n_obs, mean = 15, sd = 3)
  )

  # Introduce missing values
  lsd_data$trait1[c(1, 5, 12)] <- NA
  lsd_data$trait2[c(3, 8)] <- NA

  imputed <- selection.index:::missing_value_estimation(
    data_mat = as.matrix(lsd_data[, c("trait1", "trait2")]),
    gen_idx = lsd_data$treat,
    rep_idx = lsd_data$row,
    col_idx = lsd_data$col,
    design_type = "LSD",
    method = "REML"
  )

  expect_false(anyNA(imputed))
  expect_equal(dim(imputed), c(n_obs, 2))
})

test_that("missing_value_estimation requires columns for LSD", {
  skip_on_cran() # error handling test or warning test
  set.seed(456)
  data_mat <- matrix(rnorm(30), nrow = 10, ncol = 3)
  data_mat[1, 1] <- NA
  gen_idx <- rep(1:5, each = 2)
  rep_idx <- rep(1:2, times = 5)

  expect_error(
    selection.index:::missing_value_estimation(
      data_mat = data_mat,
      gen_idx = gen_idx,
      rep_idx = rep_idx,
      design_type = "LSD",
      method = "REML"
    ),
    "col_idx"
  )
})

# ==============================================================================
# TEST: Split Plot Design (SPD) support
# ==============================================================================

test_that("missing_value_estimation works with SPD design", {
  skip_on_cran() # heavy cross-products / TRE regex — bypass CRAN sanitizers
  # Create SPD test data
  set.seed(789)
  n_main <- 4
  n_sub <- 6
  n_rep <- 3
  n_obs <- n_main * n_sub * n_rep

  spd_data <- data.frame(
    mainplot = rep(1:n_main, each = n_sub * n_rep),
    subplot = rep(rep(1:n_sub, each = n_rep), times = n_main),
    block = rep(1:n_rep, times = n_main * n_sub),
    trait1 = rnorm(n_obs, mean = 12, sd = 2.5)
  )

  # Introduce missing values
  spd_data$trait1[c(1, 10, 25, 40)] <- NA

  # SPD only supports Mean method
  expect_warning(
    imputed <- selection.index:::missing_value_estimation(
      data_mat = as.matrix(spd_data[, "trait1", drop = FALSE]),
      gen_idx = spd_data$subplot,
      rep_idx = spd_data$block,
      main_idx = spd_data$mainplot,
      design_type = "SPD",
      method = "REML" # Should fall back to Mean
    ),
    "Split Plot Design"
  )

  expect_false(anyNA(imputed))
  expect_equal(nrow(imputed), n_obs)
})

test_that("missing_value_estimation requires main_plots for SPD", {
  skip_on_cran() # error handling test or warning test
  set.seed(999)
  data_mat <- matrix(rnorm(30), nrow = 10, ncol = 3)
  data_mat[1, 1] <- NA
  gen_idx <- rep(1:5, each = 2)
  rep_idx <- rep(1:2, times = 5)

  expect_error(
    selection.index:::missing_value_estimation(
      data_mat = data_mat,
      gen_idx = gen_idx,
      rep_idx = rep_idx,
      design_type = "SPD",
      method = "Mean"
    ),
    "main_idx"
  )
})

# ==============================================================================
# TEST: Input validation
# ==============================================================================

test_that("missing_value_estimation errors with invalid design", {
  skip_on_cran() # error handling test or warning test
  data_mat <- matrix(rnorm(30), nrow = 10, ncol = 3)
  gen_idx <- rep(1:5, each = 2)
  rep_idx <- rep(1:2, times = 5)

  expect_error(
    selection.index:::missing_value_estimation(
      data_mat = data_mat,
      gen_idx = gen_idx,
      rep_idx = rep_idx,
      design_type = "INVALID"
    ),
    "should be one of"
  )
})

test_that("missing_value_estimation errors with invalid method", {
  skip_on_cran() # error handling test or warning test
  data_mat <- matrix(rnorm(30), nrow = 10, ncol = 3)
  gen_idx <- rep(1:5, each = 2)
  rep_idx <- rep(1:2, times = 5)

  expect_error(
    selection.index:::missing_value_estimation(
      data_mat = data_mat,
      gen_idx = gen_idx,
      rep_idx = rep_idx,
      method = "INVALID"
    ),
    "should be one of"
  )
})

# ==============================================================================
# TEST: Edge cases and parameter handling
# ==============================================================================

test_that("missing_value_estimation respects tolerance parameter", {
  set.seed(321)
  data_mat <- matrix(rnorm(30), nrow = 10, ncol = 3)
  data_mat[c(1, 5), 1] <- NA
  gen_idx <- rep(1:5, each = 2)
  rep_idx <- rep(1:2, times = 5)

  # Strict tolerance
  imputed_strict <- selection.index:::missing_value_estimation(
    data_mat = data_mat,
    gen_idx = gen_idx,
    rep_idx = rep_idx,
    method = "Yates",
    tolerance = 1e-8
  )

  # Loose tolerance
  imputed_loose <- selection.index:::missing_value_estimation(
    data_mat = data_mat,
    gen_idx = gen_idx,
    rep_idx = rep_idx,
    method = "Yates",
    tolerance = 1e-4
  )

  # Both should complete without error
  expect_false(anyNA(imputed_strict))
  expect_false(anyNA(imputed_loose))
  expect_equal(dim(imputed_strict), dim(imputed_loose))
})

test_that("missing_value_estimation handles numeric genotypes", {
  set.seed(654)
  data_mat <- matrix(rnorm(30), nrow = 10, ncol = 3)
  data_mat[1, 1] <- NA

  # Test with numeric indices
  gen_numeric <- rep(1:5, each = 2)
  rep_numeric <- rep(1:2, times = 5)

  imputed_numeric <- selection.index:::missing_value_estimation(
    data_mat = data_mat,
    gen_idx = gen_numeric,
    rep_idx = rep_numeric,
    method = "Mean"
  )

  expect_false(anyNA(imputed_numeric))
  expect_equal(dim(imputed_numeric), dim(data_mat))
})

test_that("missing_value_estimation handles high proportion of missing values", {
  set.seed(777)
  data_mat <- matrix(rnorm(50), nrow = 10, ncol = 5)
  gen_idx <- rep(1:5, each = 2)
  rep_idx <- rep(1:2, times = 5)

  # Introduce many missing values (40% missing in one column)
  missing_idx <- sample(1:10, size = 4)
  data_mat[missing_idx, 1] <- NA

  imputed <- selection.index:::missing_value_estimation(
    data_mat = data_mat,
    gen_idx = gen_idx,
    rep_idx = rep_idx,
    method = "REML"
  )

  # Should still complete
  expect_false(anyNA(imputed))
})

# ==============================================================================
# TEST: Edge cases for 100% missing_value_estimation coverage
# ==============================================================================

test_that("missing_value_estimation: SPD falls back to Mean with warning (Lines 152-155)", {
  data_mat <- matrix(c(1, 2, NA, 4), nrow = 2, ncol = 2)
  gen_idx <- c(1, 2)
  rep_idx <- c(1, 1)
  main_idx <- c(1, 1)

  expect_warning(
    selection.index:::missing_value_estimation(
      data_mat, gen_idx, rep_idx,
      main_idx = main_idx,
      design_type = "SPD", method = "REML"
    )
  )

  result <- suppressWarnings(selection.index:::missing_value_estimation(
    data_mat, gen_idx, rep_idx,
    main_idx = main_idx,
    design_type = "SPD", method = "REML"
  ))
  expect_false(anyNA(result))
})

test_that("missing_value_estimation: Healy RCBD w_sum <= 0 fallback (Lines 292-293)", {
  # To trigger w_sum <= 0, we need w_treat = 0 and w_block = 0.


  # So n_miss_treat must equal repli (all reps for a treatment missing).
  # And n_miss_block must equal genotype (all treatments for a block missing).
  # We must have at least one complete observation for grouped_sums to not throw an error on NA.

  data_mat <- matrix(c(1, NA, NA, NA), nrow = 4, ncol = 1)
  gen_idx <- c(1, 1, 2, 2)
  rep_idx <- c(1, 2, 1, 2)

  # For idx = 4 (gen=2, rep=2):
  # n_miss_treat for gen 2 is 2 (equals repli 2)
  # n_miss_block for rep 2 is 2 (equals genotype 2)
  # w_treat = 0, w_block = 0, w_sum = 0

  result <- selection.index:::missing_value_estimation(
    data_mat, gen_idx, rep_idx,
    design_type = "RCBD", method = "Healy"
  )
  expect_false(anyNA(result))
})

test_that("missing_value_estimation: Healy LSD w_sum <= 0 fallback (Lines 319-321)", {
  # Similar to RCBD but 3-way.
  data_mat <- matrix(c(1, NA, NA, NA), nrow = 4, ncol = 1)
  gen_idx <- c(1, 2, 2, 1)
  rep_idx <- c(1, 1, 2, 2)
  col_idx <- c(1, 2, 1, 2)

  result <- selection.index:::missing_value_estimation(
    data_mat, gen_idx, rep_idx,
    col_idx = col_idx,
    design_type = "LSD", method = "Healy"
  )
  expect_false(anyNA(result))
})

test_that("missing_value_estimation: Regression RCBD rank deficiency fallback (Lines 476-481)", {
  # min_obs for Regression RCBD = genotype + repli. (2+2 = 4)
  # length(complete_idx) must be > 4, so 5.
  # But the design matrix must be rank deficient.
  data_mat <- matrix(c(1, 1, 1, 1, 1, NA, NA, NA), nrow = 8, ncol = 1)
  gen_idx <- rep(1:2, each = 4)
  rep_idx <- rep(1:4, times = 2)
  result <- selection.index:::missing_value_estimation(
    data_mat, gen_idx, rep_idx,
    design_type = "RCBD", method = "Regression"
  )
  expect_false(anyNA(result))
})

test_that("missing_value_estimation: Regression LSD rank deficiency fallback (Lines 436-441)", {
  set.seed(123)
  n <- 3
  n_obs <- n * n
  lsd_data <- data.frame(
    treat = rep(1:n, each = n),
    row = rep(1:n, times = n),
    col = as.vector(sapply(0:(n - 1), function(i) (1:n + i - 1) %% n + 1)),
    trait1 = rnorm(n_obs, mean = 10, sd = 2)
  )
  lsd_data$trait1[1] <- NA
  result <- selection.index:::missing_value_estimation(
    as.matrix(lsd_data$trait1), lsd_data$treat, lsd_data$row, lsd_data$col,
    design_type = "LSD", method = "Regression"
  )
  expect_false(anyNA(result))
})

test_that("missing_value_estimation: Regression zero complete fallback (Line 487)", {
  data_mat <- matrix(c(1, NA, NA, NA), nrow = 4, ncol = 1)
  gen_idx <- c(1, 1, 2, 2)
  rep_idx <- c(1, 2, 1, 2)

  result <- selection.index:::missing_value_estimation(
    data_mat, gen_idx, rep_idx,
    design_type = "RCBD", method = "Regression"
  )
  expect_false(anyNA(result))
})

test_that("missing_value_estimation: REML zero variance sum (Lines 674, 686-689, 734-737, 747-753)", {
  # REML total variance <= 0 (RCBD lines 674, 686)
  data_mat <- matrix(c(10, 10, 10, 10, NA), nrow = 5, ncol = 1)
  gen_idx <- c(1, 1, 2, 2, 2)
  rep_idx <- c(1, 2, 1, 2, 3)

  result_rcbd <- selection.index:::missing_value_estimation(
    data_mat, gen_idx, rep_idx,
    design_type = "RCBD", method = "REML"
  )
  expect_false(anyNA(result_rcbd))

  # REML total variance <= 0 (LSD lines 734, 747, 750)
  n <- 3
  n_obs <- n * n
  lsd_data <- matrix(10, nrow = n_obs, ncol = 1)
  lsd_data[1, 1] <- NA
  treat <- rep(1:n, each = n)
  row <- rep(1:n, times = n)
  col <- as.vector(sapply(0:(n - 1), function(i) (1:n + i - 1) %% n + 1))

  result_lsd <- selection.index:::missing_value_estimation(
    lsd_data, treat, row, col,
    design_type = "LSD", method = "REML"
  )
  expect_false(anyNA(result_lsd))
})

test_that("missing_value_estimation: Mean zero complete fallback (Line 596)", {
  data_mat <- matrix(NA_real_, nrow = 4, ncol = 1)
  gen_idx <- c(1, 1, 2, 2)
  rep_idx <- c(1, 2, 1, 2)

  result <- selection.index:::missing_value_estimation(
    data_mat, gen_idx, rep_idx,
    design_type = "RCBD", method = "Mean"
  )
  expect_true(all(result == 0))
})

test_that("missing_value_estimation: Bartlett fallback without valid covariates (Lines 873-878, 897-955)", {
  # Bartlett fallback needs length(complete_idx) > 3, so at least 4.
  # And it needs NO valid covariates. A covariate is only valid if it has NO NAs
  # for the complete_idx rows.
  data_mat <- matrix(1:10, nrow = 5, ncol = 2)
  data_mat[5, 1] <- NA # Col 1 is target, row 5 is missing. complete_idx = 1:4
  data_mat[2, 2] <- NA # Col 2 is covariate. Since row 2 is in complete_idx and has NA, Col 2 is invalid.

  gen_idx <- c(1, 1, 2, 2, 3)
  rep_idx <- c(1, 2, 1, 2, 3)

  result_rcbd <- selection.index:::missing_value_estimation(
    data_mat, gen_idx, rep_idx,
    design_type = "RCBD", method = "Bartlett"
  )
  expect_false(anyNA(result_rcbd[, 1]))

  # LSD Bartlett fallback
  n <- 3
  n_obs <- n * n
  lsd_data <- data.frame(
    treat = rep(1:n, each = n),
    row = rep(1:n, times = n),
    col = as.vector(sapply(0:(n - 1), function(i) (1:n + i - 1) %% n + 1)),
    trait1 = rnorm(n_obs, mean = 10, sd = 2),
    trait2 = rnorm(n_obs, mean = 5, sd = 1)
  )
  lsd_data$trait1[5] <- NA # Target missing
  lsd_data$trait2[1] <- NA # Covariate invalid because NA is in complete_idx

  result_lsd_nocovar <- selection.index:::missing_value_estimation(
    as.matrix(lsd_data[, c("trait1", "trait2")]),
    lsd_data$treat, lsd_data$row,
    col_idx = lsd_data$col,
    design_type = "LSD", method = "Bartlett"
  )
  expect_false(anyNA(result_lsd_nocovar[, 1]))
})

test_that("missing_value_estimation: out of bounds indices in fallbacks (Lines 481, 674, 686-689, 737, 747-753, 933-952)", {
  # We trigger the g <= length(treat_means) FALSE, r <= length(block_means) FALSE conditions
  # by making the missing value have a genotype/rep index higher than any complete value.
  data_mat <- matrix(c(10, 10, 10, 10, NA), nrow = 5, ncol = 1)
  gen_idx <- c(1, 1, 2, 2, 3) # Missing value is genotype 3. Complete is only 1, 2. (length = 2)
  rep_idx <- c(1, 2, 1, 2, 3) # Missing value is rep 3. Complete is only 1, 2.

  # Regression fallback
  result_reg <- selection.index:::missing_value_estimation(
    data_mat, gen_idx, rep_idx,
    design_type = "RCBD", method = "Regression"
  )
  expect_false(anyNA(result_reg))

  # REML fallback
  result_reml <- selection.index:::missing_value_estimation(
    data_mat, gen_idx, rep_idx,
    design_type = "RCBD", method = "REML"
  )
  expect_false(anyNA(result_reml))

  # Bartlett fallback (requires >3 complete and 0 covariates)
  # Just use same data_mat and duplicate column so Bartlett sees it as covariate.
  # Make the covariate invalid.
  data_mat_bart <- cbind(data_mat, c(1, NA, 3, 4, 5))
  result_bart <- selection.index:::missing_value_estimation(
    data_mat_bart, gen_idx, rep_idx,
    design_type = "RCBD", method = "Bartlett"
  )
  expect_false(anyNA(result_bart[, 1]))

  # Now for LSD (needs col_means out of bounds)
  # 16 complete, 1 missing.
  lsd_mat <- matrix(10, nrow = 17, ncol = 1)
  lsd_mat[17, 1] <- NA
  # Complete 1-16 use indices 1-4.
  lsd_treat <- c(rep(1:4, 4), 5) # Make missing be index 5
  lsd_row <- c(rep(1:4, each = 4), 5)
  lsd_col <- c(rep(1:4, times = 4), 5)

  # REML LSD fallback
  result_lsd_reml <- selection.index:::missing_value_estimation(
    lsd_mat, lsd_treat, lsd_row, lsd_col,
    design_type = "LSD", method = "REML"
  )
  expect_false(anyNA(result_lsd_reml))

  # Bartlett LSD fallback
  lsd_mat_bart <- cbind(lsd_mat, c(NA, rep(1, 16)))
  result_lsd_bart <- selection.index:::missing_value_estimation(
    lsd_mat_bart, lsd_treat, lsd_row, lsd_col,
    design_type = "LSD", method = "Bartlett"
  )
  expect_false(anyNA(result_lsd_bart[, 1]))
})

# ==============================================================================
# TEST: Precisely targeted tests for remaining uncovered lines
# ==============================================================================

test_that("missing_value_estimation: Healy RCBD w_sum=0 triggered when ALL reps of a treatment AND ALL treatments of a block are missing (Lines 292-293)", {
  # Setup: 2 genotypes, 3 reps. Row 5 has genotype=2, rep=3.
  # All reps of genotype 2 are missing: rows 4,5,6 → NA
  # All genotypes of rep 3 are missing: rows 3,6 → NA
  # So for the missing value at row 6 (gen=2, rep=3):
  #   n_miss_treat = 3 = repli  → w_treat = 0
  #   n_miss_block = 2 = genotype → w_block = 0
  #   w_sum = 0  → triggers lines 292-293
  data_mat <- matrix(c(10, 20, NA, 30, NA, NA), nrow = 6, ncol = 1)
  gen_idx <- c(1, 1, 1, 2, 2, 2)
  rep_idx <- c(1, 2, 3, 1, 2, 3)

  result <- selection.index:::missing_value_estimation(
    data_mat, gen_idx, rep_idx,
    design_type = "RCBD", method = "Healy"
  )
  expect_false(anyNA(result))
})

test_that("missing_value_estimation: Healy LSD w_sum=0 triggered when treatment, row, and column are ALL completely missing (Lines 319-321)", {
  # 3x3 LSD. Make one missing value where ALL elements of its treatment, row, AND column are NA.
  # Rows 1-5 complete, rows 6-9 NA. Missing value in row 7 (treat=3, row=2, col=3).
  # All treat=3 rows (7,8,9) are NA → n_miss_treat = 3 = n(genotype)
  # All row=2 rows (4,7) are NA → n_miss_block = 2 < genotype (fails partial)
  # Let's try: only 2 complete, rest NA. treat=1,row=1,col=1 and treat=2,row=2,col=1 are complete.
  # For missing at treat=3,row=3,col=2: all treat=3 rows are NA (2), all row=3 rows are NA (3x3 = col1,2,3 → 3), all col=2 rows are NA
  # n = 3 → w_treat = (3-3)/3=0, w_row = (3-3)/3=0, w_col = (3-3)/3=0 → triggers 319-321
  n <- 3
  data_mat <- matrix(NA_real_, nrow = n * n, ncol = 1)
  # Only keep (treat=1,row=1,col=1) and (treat=2,row=2,col=3) complete
  # LSD layout: treat=1,2,3; row=1,2,3; col cycles
  treat_idx <- c(1, 2, 3, 2, 3, 1, 3, 1, 2) # Standard 3x3 Latin square
  row_idx <- c(1, 1, 1, 2, 2, 2, 3, 3, 3)
  col_idx <- c(1, 2, 3, 1, 2, 3, 1, 2, 3)
  # Set complete values only where treat=1 or treat=2 (partially), keep treat=3 all NA
  # treat=3 rows: positions 3, 6, 9 → all NA
  # row=3 rows: positions 7,8,9 → all NA already (treat=3 is there)
  # col=2 rows: positions 2,5,8 → position 8 is treat=1 and row=3
  # For missing at position 9 (treat=2 in this layout), let's re-layout
  # Actually let's use a simpler layout where we can control
  treat_idx2 <- rep(1:n, n) # 1,2,3,1,2,3,1,2,3
  row_idx2 <- rep(1:n, each = n) # 1,1,1,2,2,2,3,3,3
  col_idx2 <- as.integer(sapply(1:n, function(i) (seq(0, n - 1) + i - 1) %% n + 1))
  # Complete values: only first 2 rows to ensure we have some data
  data_mat2 <- matrix(NA_real_, nrow = n * n, ncol = 1)
  data_mat2[1, 1] <- 10 # treat=1, row=1
  data_mat2[2, 1] <- 20 # treat=2, row=1
  # treat=3 rows: 3, 6, 9 are all NA
  # row=3 positions: 7, 8, 9 all NA
  # For position 9 (treat=3, row=3): n_miss_treat(treat=3)=3, n_miss_row(row=3)=3, n_miss_col=?
  # col of position 9: col_idx2[9]
  col9 <- col_idx2[9]
  # positions with that col
  n_miss_col9 <- sum(is.na(data_mat2[col_idx2 == col9, 1]))
  # w_treat = (n - 3)/n = 0, w_row = (n - 3)/n = 0, w_col = (n - n_miss_col9)/n
  # For w_col = 0 too, we need all positions in that column to be NA.
  # Ensure all positions with col=col9 are NA (they should be since only rows 1,2 have data,
  # and col_idx2[1] and col_idx2[2] are cols 1 and 2 respectively)

  result <- selection.index:::missing_value_estimation(
    data_mat2, treat_idx2, row_idx2,
    col_idx = col_idx2,
    design_type = "LSD", method = "Healy"
  )
  expect_false(anyNA(result))
})

test_that("missing_value_estimation: Regression LSD full-rank path covers lines 436-441 (col effect coding for prediction)", {
  # This test ensures the LSD Regression path where QR succeeds (rank == ncol(X))
  # properly executes the column-effect coding for missing values (lines 436-441).
  # We need: (1) enough complete observations, (2) full-rank X, (3) at least one missing value.
  # Use a set of 5x5 LSD with one missing value.
  set.seed(42)
  n <- 5
  n_obs <- n * n
  trait <- rnorm(n_obs, 20, 3)
  treat <- rep(1:n, n)
  row <- rep(1:n, each = n)
  col <- as.integer(sapply(1:n, function(i) (seq(0, n - 1) + i - 1) %% n + 1))
  trait[round(n_obs / 2)] <- NA # A single central missing value

  result <- selection.index:::missing_value_estimation(
    as.matrix(trait), treat, row, col,
    design_type = "LSD", method = "Regression"
  )
  expect_false(anyNA(result))
  expect_equal(length(result), n_obs)
})

test_that("missing_value_estimation: REML out-of-bounds genotype/rep indices trigger 'else grand_mean' in sapply (Lines 686, 689, 747, 750, 753)", {
  # When the missing observation's gen_idx = 99 (larger than any of the 4 unique complete
  # genotypes), `g > length(treat_means)` is TRUE, so the `else grand_mean` branch fires.
  # Same for rep_idx = 99 and col_idx = 99.

  # RCBD: covers lines 686 and 689
  data_mat <- matrix(c(rep(10, 16), NA), nrow = 17, ncol = 1)
  gen_idx <- c(rep(1:4, 4), 99) # 99 is out-of-bounds for treat_means
  rep_idx <- c(rep(1:4, each = 4), 99) # 99 is out-of-bounds for block_means

  result_rcbd <- selection.index:::missing_value_estimation(
    data_mat, gen_idx, rep_idx,
    design_type = "RCBD", method = "REML"
  )
  expect_false(anyNA(result_rcbd))
  expect_equal(result_rcbd[17], 10) # Should impute as grand mean (10)

  # LSD: covers lines 747, 750 and 753
  n <- 5
  lsd_mat <- matrix(c(rep(20, n * n), NA), nrow = n * n + 1, ncol = 1)
  lsd_treat <- c(rep(1:n, n), 99) # out-of-bounds
  lsd_row <- c(rep(1:n, each = n), 99) # out-of-bounds
  lsd_col <- c(as.integer(sapply(1:n, function(i) (seq(0, n - 1) + i - 1) %% n + 1)), 99)

  result_lsd <- selection.index:::missing_value_estimation(
    lsd_mat, lsd_treat, lsd_row, lsd_col,
    design_type = "LSD", method = "REML"
  )
  expect_false(anyNA(result_lsd))
})

test_that("missing_value_estimation: Bartlett no-covariate fallback with out-of-bounds index (Lines 933, 936, 939, 949, 952)", {
  # To cover the `else 0` branches inside Bartlett's no-covariate sapply lookups,
  # the missing value's gen_idx/rep_idx must be out-of-bounds (> length of means vector).

  # RCBD: covers lines 949 and 952
  data_mat_bart <- matrix(c(rep(10, 16), NA), nrow = 17, ncol = 2)
  data_mat_bart[2, 2] <- NA # Invalidate covariate at a complete row → no valid covariates
  gen_idx <- c(rep(1:4, 4), 99) # Missing row has out-of-bounds genotype
  rep_idx <- c(rep(1:4, each = 4), 99) # Missing row has out-of-bounds rep

  result_rcbd <- selection.index:::missing_value_estimation(
    data_mat_bart, gen_idx, rep_idx,
    design_type = "RCBD", method = "Bartlett"
  )
  expect_false(anyNA(result_rcbd[, 1]))

  # LSD: covers lines 933, 936, and 939
  n <- 5
  lsd_mat <- matrix(c(rep(20, n * n), NA), nrow = n * n + 1, ncol = 2)
  lsd_mat[1, 2] <- NA # Invalidate covariate at a complete row → no valid covariates
  lsd_treat <- c(rep(1:n, n), 99)
  lsd_row <- c(rep(1:n, each = n), 99)
  lsd_col <- c(as.integer(sapply(1:n, function(i) (seq(0, n - 1) + i - 1) %% n + 1)), 99)

  result_lsd <- selection.index:::missing_value_estimation(
    lsd_mat, lsd_treat, lsd_row, lsd_col,
    design_type = "LSD", method = "Bartlett"
  )
  expect_false(anyNA(result_lsd[, 1]))
})

test_that("missing_value_estimation: Healy LSD w_sum=0 when ALL of treat, row, col for the missing value have NO complete observations (Lines 319-321)", {
  # Use a 3x3 LSD with only 1 complete observation.
  # With treat_idx = rep(1:3, 3) and row_idx = rep(1:3, each=3),
  # col_idx = c(1,2,3, 2,3,1, 3,1,2), only row 1 (treat=1,row=1,col=1) is complete.
  # Position 5 has treat=2, row=2, col=3:
  #   All of treat=2 rows (2,5,8) are NA → n_miss_treat=3=n → w_treat=0
  #   All of row=2 rows (4,5,6) are NA → n_miss_block=3=n → w_row=0
  #   All of col=3 rows (3,5,7) are NA → n_miss_col=3=n → w_col=0
  #   w_sum=0 → triggers lines 319-321
  n <- 3
  treat_idx <- rep(1:n, n)
  row_idx <- rep(1:n, each = n)
  col_idx <- c(1, 2, 3, 2, 3, 1, 3, 1, 2)
  data_mat <- matrix(NA_real_, nrow = n * n, ncol = 1)
  data_mat[1, 1] <- 10 # Only first row complete (treat=1, row=1, col=1)

  result <- selection.index:::missing_value_estimation(
    data_mat, treat_idx, row_idx,
    col_idx = col_idx,
    design_type = "LSD", method = "Healy"
  )
  expect_false(anyNA(result))
})

test_that("missing_value_estimation: Regression RCBD rank-deficient fallback inner loop with out-of-bounds gen/rep (Lines 476-481)", {
  # Create rank-deficient design matrix: all 7 complete rows belong to genotype=1 only.
  # This makes the treatment contrast columns entirely zero for complete rows → rank < ncol(X).
  # The missing rows (8,9) have genotype=2 and 3.
  # Because the missing gen_idx > length(treat_means from complete rows),
  # the `if (g <= length(treat_means))` inside the fallback loop returns FALSE,
  # triggering the `else 0` branch at line 479 (and equivalent for block at 480).
  # Also ensures lines 476-481 are fully traversed.
  data_mat <- matrix(c(rep(10, 7), NA, NA), nrow = 9, ncol = 1)
  gen_idx <- c(1, 1, 1, 1, 1, 1, 1, 2, 3) # Complete rows all gen=1; missing are gen=2,3
  rep_idx <- c(1, 2, 3, 1, 2, 3, 1, 1, 2) # Missing rows have rep=1 and rep=2

  result <- selection.index:::missing_value_estimation(
    data_mat, gen_idx, rep_idx,
    design_type = "RCBD", method = "Regression"
  )
  expect_false(anyNA(result))
  expect_equal(nrow(result), 9)
})

test_that("missing_value_estimation: REML LSD shrinkage=0.33 when complete_idx <= 3*genotype-2 (Line 737)", {
  # Line 737 fires in the LSD REML branch when df_error <= 0 OR
  # length(complete_idx) <= (3 * genotype - 2).
  # With n=3 (genotype=3), 3*3-2=7, so we need complete_idx <= 7.
  # A 3x3 LSD with 8 complete obs and 1 missing → complete_idx = 8, but 8 > 7.
  # With 3x3, genotype = repli = ncol_blocks = 3 → df_error = (3-1)*(3-2) = 2
  # We need complete_idx <= 7. Use a 3x3 LSD with only 4 complete observations.
  n <- 3
  treat_idx <- rep(1:n, n)
  row_idx <- rep(1:n, each = n)
  col_idx <- c(1, 2, 3, 2, 3, 1, 3, 1, 2)
  # Only keep 4 complete observations; mark remaining 5 as NA
  data_mat <- matrix(NA_real_, nrow = n * n, ncol = 1)
  data_mat[c(1, 2, 4, 6), 1] <- c(10, 12, 11, 9)

  result <- selection.index:::missing_value_estimation(
    data_mat, treat_idx, row_idx,
    col_idx = col_idx,
    design_type = "LSD", method = "REML"
  )
  expect_false(anyNA(result))
})
Any scripts or data that you put into this service are public.
selection.index documentation built on March 9, 2026, 1:06 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
selection.index
Analysis of Selection Index in Plant Breeding

tests/testthat/test-missing_value_estimation.R
In selection.index: Analysis of Selection Index in Plant Breeding

Try the selection.index package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

selection.index Analysis of Selection Index in Plant Breeding

tests/testthat/test-missing_value_estimation.R In selection.index: Analysis of Selection Index in Plant Breeding

Try the selection.index package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

selection.index
Analysis of Selection Index in Plant Breeding

tests/testthat/test-missing_value_estimation.R
In selection.index: Analysis of Selection Index in Plant Breeding