tests/testthat/test-3-fastImpute.R

################################################################################

context("FAST_IMPUTE")

options(bigstatsr.check.parallel.blas = FALSE)

################################################################################

test_that("fast imputation (xgboost) works", {

  skip_if_not_installed("xgboost")
  skip_if(is_cran)

  suppressMessages({
    library(Matrix)
    library(foreach)
    library(magrittr)
    library(dplyr)
  })

  bigsnp <- snp_attachExtdata()
  G <- bigsnp$genotypes
  expect_equal(G$code256, CODE_012)
  corr <- snp_cor(G)
  ind <- which(apply(corr, 2, function(x) max(x[x < 1])) > 0.6)

  indNA <- foreach(j = ind, .combine = "rbind") %do% {
    cbind(sample(nrow(G), 100), j)
  }

  # Create a copy of our bigSNP example
  tmpfile <- tempfile()
  bigsnp.copy <- snp_writeBed(bigsnp, bedfile = paste0(tmpfile, ".bed")) %>%
    snp_readBed(backingfile = tmpfile) %>%
    snp_attach()
  # Fill some missing values
  GNA <- bigsnp.copy$genotypes
  GNA[indNA] <- as.raw(3)
  counts <- big_counts(GNA)
  expect_equal(sum(counts[4, ]), nrow(indNA))

  ################################################################################

  # Fast imputation
  time <- system.time(
    infos <- snp_fastImpute(GNA, infos.chr = bigsnp$map$chromosome, p.train = 0.6)
  )
  # expect_lt(time[3], 10)

  # Still NAs
  counts <- big_counts(GNA)
  expect_equal(sum(counts[4, ]), nrow(indNA))

  GNA$code256 <- CODE_IMPUTE_PRED
  infosNA <- tibble(
    col = indNA[, 2],
    error = (GNA[indNA] != G[indNA])
  ) %>%
    group_by(col) %>%
    summarise(nb_err = sum(error)) %>%
    mutate(nb_err_est = (infos[1, ] * infos[2, ] * nrow(G))[col])

  pval <- anova(lm(nb_err ~ nb_err_est - 1, data = infosNA))$`Pr(>F)`[[1]]
  expect_lt(pval, 2e-16)

  ################################################################################

  fake <- snp_fake(100, 200)
  G <- fake$genotypes
  G[] <- sample(as.raw(0:3), size = length(G), replace = TRUE)
  CHR <- fake$map$chromosome
  # 'infos.chr' needs to have the same length as the number of SNPs
  expect_error(snp_fastImpute(G, 1), "Incompatibility between dimensions.",
               fixed = TRUE)
  # you can't impute randomness, right?
  expect_gt(mean(snp_fastImpute(G, CHR)[2, ]), 0.5)

  ################################################################################

  fake <- snp_fake(100, 200)
  G <- fake$genotypes
  G[] <- sample(as.raw(0:3), size = length(G), replace = TRUE)
  CHR <- sort(sample(1:3, ncol(G), TRUE))
  G2 <- bigsnpr:::FBM_infos(G)
  ind <- 1:11 + 150
  G2[1, ind] <- seq(0, 1, 0.1)
  G2 <- snp_fastImpute(G, CHR)
  G3 <- G$copy(CODE_IMPUTE_PRED)
  nbNA <- colSums(is.na(G3[]))
  expect_true(all(nbNA[ind] > 0))
  expect_true(all(nbNA[-ind] == 0))

  ################################################################################

  bigsnp <- snp_attachExtdata()
  G <- bigsnp$genotypes
  CHR <- bigsnp$map$chromosome

  elemNA <- sample(length(G), size = 20)
  G2 <- big_copy(G); G2[elemNA] <- as.raw(3)  # NA
  G3 <- big_copy(G); G3[elemNA] <- as.raw(3)  # NA

  expect_equal(snp_fastImpute(G2, CHR, seed = 2)[],
               snp_fastImpute(G3, CHR, seed = 2, ncores = 2)[])
  expect_equal(G2[], G3[])

})

################################################################################

test_that("fast imputation (simple) works", {

  G <- snp_attachExtdata("example-missing.bed")$genotypes
  # cbind(colMeans(G[]), colMeans(G[], na.rm = TRUE))
  expect_error(snp_fastImputeSimple(G, "mean"), "should be one of")

  expect_warning(G1 <- snp_fastImputeSimple(G, "zero"), "deprecated")
  expect_equal(G[c(18, 72), 400], rep(NA_real_, 2))
  expect_equal(G1[c(18, 72), 400], rep(0, 2))

  G2 <- snp_fastImputeSimple(G, "mean0")
  expect_equal(G[c(18, 72), 400], rep(NA_real_, 2))
  expect_equal(G2[c(18, 72), 400], rep(1, 2))

  G3 <- snp_fastImputeSimple(G, "mean2")
  expect_equal(G3[c(18, 72), 400], rep(1.01, 2))

  G4 <- snp_fastImputeSimple(G, "mode")
  expect_equal(G4[c(4, 12), 1], rep(0, 2))
  expect_equal(G4[c(18, 72), 400], rep(1, 2))

  imp_val <- replicate(500, {
    G5 <- snp_fastImputeSimple(G, "random")
    G5[c(18, 72), 400]
  })
  p <- mean(G[, 400], na.rm = TRUE) / 2
  prob <- c((1 - p)^2, 2 * p * (1 - p), p^2)
  expect_gt(stats::chisq.test(table(imp_val), p = prob)$p.value, 1e-4)

  expect_equal(snp_fastImputeSimple(G, method = "mean2")[],
               snp_fastImputeSimple(G, method = "mean2", ncores = 2)[])
})

################################################################################
privefl/mypack documentation built on April 20, 2024, 1:51 a.m.