mypack: Analysis of Massive SNP Arrays

require(bigsnpr)
require(xgboost)

celiac <- AttachBigSNP("celiac_impute1_sub1")
# celiac <- AttachBigSNP("celiac_sub2_impute1",
#                        backingpath = "../thesis-celiac/backingfiles/")
X <- celiac$genotypes

ind <- which(celiac$fam$pheno == -1)
Chr1 <- X[, 1:3000]
storage.mode(Chr1) <- "double"

ind.train <- sort(sample(ind, 7000))
ind.test <- setdiff(ind, ind.train)

size <- 200
s <- setdiff(-size:size, 0)
size2 <- 50
s2 <- setdiff(-size2:size2, 0)


assess <- function(indNA, param, ...) {
  dtrain <- xgb.DMatrix(data = Chr1[ind.train, indNA + s],
                        label = (Chr1[ind.train, indNA]))
  dtest <- xgb.DMatrix(data = Chr1[ind.test, indNA + s],
                       label = (Chr1[ind.test, indNA]))
  watchlist <- list(train=dtrain, test=dtest)

  bst <- xgb.train(param, data = dtrain, nthread = 1,
                   watchlist=watchlist, ...)

  labels <- getinfo(dtrain, "label")
  m <- mean(labels)
  a <- 0.5 - m
  b <- 1.5 - m
  preds <- predict(bst, Chr1[ind.test, indNA + s])
  pred <- (preds > a) + (preds > b)
  true <- Chr1[ind.test, indNA]
  mean(round(pred) != true)
}

assess2 <- function(indNA, param, ...) {
  bst <- xgboost(params = param, data = Chr1[ind.train, indNA + s],
                 label = Chr1[ind.train, indNA],
                 nthread = 1, ...)
  pred <- predict(bst, Chr1[ind.test, indNA + s])
  true <- Chr1[ind.test, indNA]
  mean(round(pred) != true)
}

assess3 <- function(indNA, param, ...) {
  bst <- xgboost(params = param, data = Chr1[ind.train, indNA + s2],
                 label = Chr1[ind.train, indNA],
                 nthread = 1, ...)
  pred <- predict(bst, Chr1[ind.test, indNA + s2])
  true <- Chr1[ind.test, indNA]
  mean(round(pred) != true)
}

require(foreach)
res <- foreach(i = 1:20, .combine = 'cbind') %do% {
  indNA <- round(runif(1, 1200, 1800))
  time1 <- system.time(
    xgb1 <- assess3(indNA, param = list(max_depth=5, colsample_bylevel = 1),
                    verbose = 0, nrounds = 50)
  )[3]
  time2 <- system.time(
    xgb2 <- assess3(indNA, param = list(max_depth=5, colsample_bylevel = 0.5),
                    verbose = 0, nrounds = 50)
  )[3]
  c(xgb1, xgb2, time1, time2)
}
plot(t(res))
abline(0, 1, col = "red")
rowMeans(res)
# reg:linear faster and maybe better than multi:softmax
# booster = "gblinear" is faster but worst than the default gbtree
# alpha = 1000 is baaaad
# alpha = 10 is better and even alpha = 0 -> 0.031 vs 0.286
# max_depth = 5 is better
# eta = 0.3 is better than 1
# 0.0364 vs 0.0357 for nrounds = 50 vs 200, but 2 times slower
# 0.051 vs 0.062 for nrounds = 50 vs 10, but 4 times slower
# min_child_weight is OK
# subsample = 1 is OK

# print(system.time(
#   bst <- xgboost(params = list(), data = Chr1[ind.train, indNA + s],
#                  label = Chr1[ind.train, indNA],
#                  nthread = 1, nrounds = 100)
# ))
#
# lo <- c(TRUE, FALSE)
# print(system.time(
#   bst <- xgboost(params = list(), data = Chr1[ind.train, indNA + s][lo, ],
#                  label = Chr1[ind.train, indNA][lo],
#                  nthread = 1, nrounds = 100)
# )) # algo seems to be linear with nb of indiv

# print(system.time(print(
#   assess2(indNA, param = list(colsample_bylevel = 0.5),
#          verbose = 0, nrounds = 20)
# )))