# R/layerXGBoost.R

#' One layer of XGBoost fits on random subsets of the variables
#'
#' Randomly partitions the columns of \code{dat.train} into \code{n.subvar}
#' subsets, fits \code{customXGBoost()} in parallel on the first
#' \code{n.models} non-empty subsets, and combines the per-subset results
#' with \code{mergefromlists()}.
#'
#' @param y.train Training labels; passed as the first argument to
#'   \code{customXGBoost()}.
#' @param y.test Accepted for interface compatibility; not used by this
#'   function.
#' @param dat.train,dat.test,dat.testREAL Column-indexable data objects
#'   (matrix or data frame). The same column subset is taken from each and
#'   passed to \code{customXGBoost()}; the partition is derived from
#'   \code{ncol(dat.train)}, so all three are assumed to share the same
#'   column layout.
#' @param n.train,n.test,n.testREAL Passed unchanged to
#'   \code{mergefromlists()} (presumably the row counts of the three data
#'   sets; confirm against that function).
#' @param n.subvar Number of subsets the columns are partitioned into.
#' @param max.depth,num_parallel_tree,eta,gamma,nthread,nrounds,num_class,objective,colsample_bytree,colsample_bylevel,colsample_bynode,subsample
#'   Forwarded unchanged to \code{customXGBoost()}.
#' @param cores Number of worker processes in the parallel cluster.
#' @param rnd.seed Seed for the column partition; the fit on subset
#'   \code{k} is seeded with \code{741 * (rnd.seed + k)}.
#' @param layer Forwarded to \code{customXGBoost()}.
#' @param n.models Maximum number of column subsets that are actually
#'   fitted. Subsets are taken in order, and the count is capped at the
#'   number of non-empty subsets.
#' @return The value returned by \code{mergefromlists()} for the list of
#'   per-subset \code{customXGBoost()} results.
#' @export
layerXGBoost <- function(y.train, y.test,
                         dat.train, dat.test, dat.testREAL,
                         n.train, n.test, n.testREAL,
                         n.subvar = 2000,
                         max.depth = 100, num_parallel_tree = 100,
                         eta = 0.1, gamma = 0.1, nthread = 2, nrounds = 10,
                         num_class = 26, objective = "multi:softmax",
                         colsample_bytree = 0.3, colsample_bylevel = 0.5,
                         colsample_bynode = 1,
                         subsample = 0.75,
                         cores = 2, rnd.seed = 1, layer = 1,
                         n.models = 32) {

  # Validate before any worker process is started.
  n.col <- ncol(dat.train)
  if (is.null(n.col) || n.col < 1) {
    stop("`dat.train` must have at least one column.", call. = FALSE)
  }
  if (n.subvar < 1 || n.models < 1) {
    stop("`n.subvar` and `n.models` must both be at least 1.", call. = FALSE)
  }

  # Partition the shuffled column indices round-robin into n.subvar subsets.
  # The grouping factor is built at full length so split() does not warn when
  # the column count is not a multiple of n.subvar; the assignment is the same
  # as recycling 1:n.subvar.
  set.seed(rnd.seed)
  col.perm <- sample(n.col, n.col, replace = FALSE)
  fold.id <- factor(rep_len(seq_len(n.subvar), n.col),
                    levels = seq_len(n.subvar))
  list_folds <- split(col.perm, fold.id)

  # Only the first min(n.subvar, n.col) subsets are non-empty; never index
  # past them.
  n.fit <- min(n.models, n.subvar, n.col)

  # Parallelisation settings: register the cluster that is created here, and
  # guarantee it is stopped even if a worker fails. Stopping the cluster
  # closes its own connections, so other connections in the session are left
  # alone. The sequential backend is restored so no dead cluster stays
  # registered.
  CL <- makeCluster(cores)
  on.exit({
    stopCluster(CL)
    foreach::registerDoSEQ()
  }, add = TRUE)
  registerDoParallel(CL)

  list_xgboost <- foreach(k = seq_len(n.fit), .inorder = FALSE,
                          .packages = c("tcltk", "xgboost",
                                        "SMLpractical")) %dopar% {
    # Per-subset seed, offset from the base seed.
    set.seed(741 * (rnd.seed + k))

    # One progress bar per worker, created on the worker's first task.
    if (!exists("pb")) {
      pb <- tkProgressBar("Parallel XGBoost", min = 0, max = n.fit)
    }

    # Column subset for this fit; drop = FALSE keeps single-column subsets
    # two-dimensional.
    fold.cols <- list_folds[[k]]
    mat.train <- dat.train[, fold.cols, drop = FALSE]
    mat.test <- dat.test[, fold.cols, drop = FALSE]
    mat.testREAL <- dat.testREAL[, fold.cols, drop = FALSE]

    res <- customXGBoost(y.train, layer = layer,
                         mat.train, mat.test, mat.testREAL,
                         max.depth = max.depth,
                         num_parallel_tree = num_parallel_tree,
                         eta = eta, gamma = gamma,
                         nthread = nthread, nrounds = nrounds,
                         num_class = num_class,
                         objective = objective,
                         colsample_bytree = colsample_bytree,
                         colsample_bylevel = colsample_bylevel,
                         colsample_bynode = colsample_bynode,
                         subsample = subsample,
                         pass.object = FALSE)

    setTkProgressBar(pb, k)

    # Output
    res
  }

  mergefromlists(list_xgboost, num_expForests = 1,
                 n.train, n.test, n.testREAL)
}
# Source: thomaswiemann/SMLpractical (documentation built on May 28, 2019, 12:23 p.m.)