tests/testthat/test-06-SRxgboost_make_ensemble.R

# Setup -------------------------------------------------------------------
#
# empty memory and cache
rm(list = ls()); gc()
# load libraries
library(dplyr); library(testthat)
# set paths
path_output <- "output_temp/"
assign('path_output', path_output, envir = .GlobalEnv)
if (dir.exists("D:/Samuel/Analysen in R/Analysen/")) {
  path_to_data <- "D:/Samuel/Analysen in R/Analysen/99 ML Algorithm Benchmark/"
} else if (dir.exists("/home/sandbox/sandbox/z-Sonstige_Analysen/")) {
  path_to_data <- "/home/sandbox/sandbox/z-Sonstige_Analysen/99_ML_Algorithm_Benchmark/"
} else if (dir.exists("O:/Group/CC-Analytics/04_Organisatorisches/RHU/")) {
  path_to_data <- "O:/Group/CC-Analytics/04_Organisatorisches/RHU/AnalysenInR/99 ML Algorithm Benchmark/"
}
assign('path_to_data', path_to_data, envir = .GlobalEnv)
# get path and name of this file
this_file <- list.files()[1]
# try(this_file <- rstudioapi::getActiveDocumentContext()$path, T)
assign('this_file', this_file, envir = .GlobalEnv)




# Regression: read data --------------------------------------------------------------
#
house <- utils::read.csv(paste0(path_to_data,
                                "Regression/Kaggle - house prices/data/train.csv"))
assign('house', house, envir = .GlobalEnv)
id_unique_train <- house$Id
assign('id_unique_train', id_unique_train, envir = .GlobalEnv)
train <- house %>% dplyr::select(-Id)
assign('train', train, envir = .GlobalEnv)




# Regression: no_folds ----------------------------------------------------
#
test_that("regression", {
  skip('skip')
  # initialisation
  lauf <- "regression.csv"
  assign('lauf', lauf, envir = .GlobalEnv)
  cat(lauf, "\n")
  # prepare data and test
  SRxgboost_data_prep(yname = "SalePrice",
                      data_train = train,
                      no_folds = 5,
                      objective = "regression")
  # run models
  SRxgboost_run(nround = 1000, eta = 0.1, obj = "reg:squarederror", metric = "rmse",
                runs = 50, nfold = 5)
  # make ensemble
  SRxgboost_make_ensemble(name = gsub(".csv", "_ensemble.csv", lauf),
                          lauf = lauf, top_rank = 3)
  #
  # tests
  # no. of files
  testthat::expect_equal(length(list.files(
    paste0(path_output, gsub(".csv", "", lauf), "_ensemble/"))), 6)
  #
  # clean up
  suppressWarnings(rm(OOF_metrics, TEST_metrics, OOFforecast, TESTforecast,
                      y_OOF, y_TEST, SummaryCV_META))
  SRxgboost_cleanup()
})




# Regression: train/test --------------------------------------------------
#
test_that("regression train test", {
  skip('skip')
  # initialisation
  lauf <- "regression_train_test.csv"
  assign('lauf', lauf, envir = .GlobalEnv)
  cat(lauf, "\n")
  id_unique_train <- house$Id[seq(1, 1460, 2)]
  id_unique_test <- house$Id[seq(2, 1460, 2)]
  # prepare data and test
  SRxgboost_data_prep(yname = "SalePrice",
                      data_train = train[seq(1, 1460, 2), ],
                      data_test = train[seq(2, 1460, 2), ],
                      no_folds = 5,
                      objective = "regression")
  # run models
  SRxgboost_run(nround = 1000, eta = 0.1, obj = "reg:squarederror", metric = "rmse",
                runs = 50, nfold = 5)
  # make ensemble
  SRxgboost_make_ensemble(name = gsub(".csv", "_ensemble.csv", lauf),
                          lauf = lauf, top_rank = 3)
  #
  # tests
  # no. of files
  testthat::expect_equal(length(list.files(
    paste0(path_output, gsub(".csv", "", lauf), "_ensemble/"))), 10)
  #
  # clean up
  suppressWarnings(rm(OOF_metrics, TEST_metrics, OOFforecast, TESTforecast,
                      y_OOF, y_TEST, SummaryCV_META))
  SRxgboost_cleanup()
})




# Regression: 2 runs ------------------------------------------------------
#
test_that("regression train test", {
  skip('skip')
  #
  ## lauf 1
  # initialisation
  lauf <- "regression_train_test_1.csv"
  assign('lauf', lauf, envir = .GlobalEnv)
  cat(lauf, "\n")
  train_ <- train %>% select(1:60, SalePrice)
  id_unique_train <- house$Id[seq(1, 1460, 2)]
  id_unique_test <- house$Id[seq(2, 1460, 2)]
  # prepare data and test
  SRxgboost_data_prep(yname = "SalePrice",
                      data_train = train_[seq(1, 1460, 2), ],
                      data_test = train_[seq(2, 1460, 2), ],
                      no_folds = 5,
                      objective = "regression")
  # run models
  SRxgboost_run(nround = 1000, eta = 0.1, obj = "reg:squarederror", metric = "rmse",
                runs = 50, nfold = 5)
  # clean up
  SRxgboost_cleanup(); rm(train_)
  #
  ## lauf 2
  # initialisation
  lauf <- "regression_train_test_2.csv"
  assign('lauf', lauf, envir = .GlobalEnv)
  cat(lauf, "\n")
  train_ <- train %>% select(21:79, SalePrice)
  id_unique_train <- house$Id[seq(1, 1460, 2)]
  id_unique_test <- house$Id[seq(2, 1460, 2)]
  # prepare data and test
  SRxgboost_data_prep(yname = "SalePrice",
                      data_train = train[seq(1, 1460, 2), ],
                      data_test = train[seq(2, 1460, 2), ],
                      no_folds = 5,
                      objective = "regression")
  # run models
  SRxgboost_run(nround = 1000, eta = 0.1, obj = "reg:squarederror", metric = "rmse",
                runs = 50, nfold = 5)
  #
  # make ensemble
  SRxgboost_make_ensemble(name = "regression_train_test_1_2_ensemble",
                          lauf = c("regression_train_test_1.csv",
                                   "regression_train_test_2.csv"),
                          top_rank = c(3, 4))
  #
  # tests
  # no. of files
  testthat::expect_equal(length(list.files(
    paste0(path_output, "regression_train_test_1_2_ensemble/"))), 10)
  #
  # clean up
  suppressWarnings(rm(OOF_metrics, TEST_metrics, OOFforecast, TESTforecast,
                      y_OOF, y_TEST, SummaryCV_META, train_))
  SRxgboost_cleanup()
})




# Regression: clean up ----------------------------------------------------
#
suppressWarnings(rm(house, train, id_unique_train, id_unique_test))




# Classification: read data -----------------------------------------------
#
churn <- utils::read.csv(paste0(path_to_data,
                                "Classification/Telco Customer Churn/Telco-Customer-Churn.csv"))
assign('churn', churn, envir = .GlobalEnv)
id_unique_train <- churn$customerID
assign('id_unique_train', id_unique_train, envir = .GlobalEnv)
train <- churn %>%
  dplyr::mutate(Churn = dplyr::case_when(Churn == "No"  ~ 0,
                                         Churn == "Yes" ~ 1,
                                         T              ~ NA_real_)) %>%
  dplyr::select(-customerID)
assign('train', train, envir = .GlobalEnv)




# Classification ----------------------------------------------------------
#
test_that("classification", {
  skip('skip')
  # initialisation
  lauf <- "classification.csv"
  assign('lauf', lauf, envir = .GlobalEnv)
  cat(lauf, "\n")
  # prepare data and test
  SRxgboost_data_prep(yname = "Churn",
                      data_train = train,
                      no_folds = 5,
                      objective = "classification")
  # run models
  SRxgboost_run(nround = 1000, eta = 0.1, obj = "binary:logistic", metric = "auc",
                runs = 50, nfold = 5)
  # make ensemble
  SRxgboost_make_ensemble(name = gsub(".csv", "_ensemble.csv", lauf),
                          lauf = lauf, top_rank = 3)
  #
  # tests
  # no. of files
  testthat::expect_equal(length(list.files(
    paste0(path_output, gsub(".csv", "", lauf), "_ensemble/"))), 12)
  #
  # clean up
  suppressWarnings(rm(OOF_metrics, TEST_metrics, OOFforecast, TESTforecast,
                      y_OOF, y_TEST, SummaryCV_META))
  SRxgboost_cleanup()
})




# Classification: train/test --------------------------------------------------
#
test_that("classification train test", {
  skip('skip')
  # initialisation
  lauf <- "classification_train_test.csv"
  assign('lauf', lauf, envir = .GlobalEnv)
  cat(lauf, "\n")
  id_unique_train <- churn$customerID[seq(1, 7043, 2)]
  id_unique_test <- churn$customerID[seq(2, 7043, 2)]
  # prepare data and test
  SRxgboost_data_prep(yname = "Churn",
                      data_train = train[seq(1, 7043, 2), ],
                      data_test = train[seq(2, 7043, 2), ],
                      no_folds = 5,
                      objective = "classification")
  # run models
  SRxgboost_run(nround = 1000, eta = 0.1, obj = "binary:logistic", metric = "auc",
                runs = 50, nfold = 5)
  # make ensemble
  SRxgboost_make_ensemble(name = gsub(".csv", "_ensemble.csv", lauf),
                          lauf = lauf, top_rank = 3)
  #
  # tests
  # no. of files
  testthat::expect_equal(length(list.files(
    paste0(path_output, gsub(".csv", "", lauf), "_ensemble/"))), 20)
  #
  # clean up
  suppressWarnings(rm(OOF_metrics, TEST_metrics, OOFforecast, TESTforecast,
                      y_OOF, y_TEST, SummaryCV_META))
  SRxgboost_cleanup()
})




# Classification: clean up ----------------------------------------------------
#
suppressWarnings(rm(churn, train, id_unique_train, id_unique_test))




# Multilabel Classification: read data ------------------------------------
#
# WARNING: mlogloss CRASHES with xgboost v0.90.0.2 !!!
#
birds <- utils::read.csv(paste0(path_to_data,
                                "Multilabel Classification/Birds Bones and Living Habits/data.csv"))
assign('birds', birds, envir = .GlobalEnv)
id_unique_train <- birds$id
assign('id_unique_train', id_unique_train, envir = .GlobalEnv)
train <- birds %>%
  dplyr::mutate(type = as.numeric(as.factor(type)) - 1) %>%
  dplyr::select(-id)
assign('train', train, envir = .GlobalEnv)




# Multilabel Classification -----------------------------------------------
#
test_that("multilabel classification", {
  skip('skip')
  # initialisation
  lauf <- "mclass_softprob.csv"
  assign('lauf', lauf, envir = .GlobalEnv)
  cat(lauf, "\n")
  # prepare data and test
  SRxgboost_data_prep(yname = "type",
                      data_train = train,
                      no_folds = 5,
                      objective = "multilabel")
  # run models
  SRxgboost_run(nround = 1000, eta = 0.1, obj = "multi:softprob", metric = "mAUC",
                runs = 50, nfold = 5)
  # make ensemble
  SRxgboost_make_ensemble(name = gsub(".csv", "_ensemble.csv", lauf),
                          lauf = lauf, top_rank = 3)
  #
  # tests
  # no. of files
  testthat::expect_equal(length(list.files(
    paste0(path_output, gsub(".csv", "", lauf), "_ensemble/"))), 8)
  #
  # clean up
  suppressWarnings(rm(OOF_metrics, TEST_metrics, OOFforecast, TESTforecast,
                      y_OOF, y_TEST, SummaryCV_META))
  SRxgboost_cleanup()
})




# Multilabel Classification: train/test -----------------------------------
#
test_that("multilabel classification train test", {
  skip('skip')
  # initialisation
  lauf <- "mclass_softprob_train_test.csv"
  assign('lauf', lauf, envir = .GlobalEnv)
  cat(lauf, "\n")
  id_unique_train <- birds$id[seq(1, 420, 2)]
  id_unique_test <- birds$id[seq(2, 420, 2)]
  # prepare data and test
  SRxgboost_data_prep(yname = "type",
                      data_train = train[seq(1, 420, 2), ],
                      data_test = train[seq(2, 420, 2), ],
                      no_folds = 5,
                      objective = "multilabel")
  # run models
  SRxgboost_run(nround = 1000, eta = 0.1, obj = "multi:softprob", metric = "mAUC",
                runs = 50, nfold = 5)
  # make ensemble
  SRxgboost_make_ensemble(name = gsub(".csv", "_ensemble.csv", lauf),
                          lauf = lauf, top_rank = 3)
  #
  # tests
  # no. of files
  testthat::expect_equal(length(list.files(
    paste0(path_output, gsub(".csv", "", lauf), "_ensemble/"))), 14)
  #
  # clean up
  suppressWarnings(rm(OOF_metrics, TEST_metrics, OOFforecast, TESTforecast,
                      y_OOF, y_TEST, SummaryCV_META))
  SRxgboost_cleanup()
})




# Multilabel Classification: 2 runs -----------------------------------
#
test_that("multilabel classification train test", {
  skip('skip')
  #
  ## lauf 1
  # initialisation
  lauf <- "mclass_softprob_train_test_1.csv"
  assign('lauf', lauf, envir = .GlobalEnv)
  cat(lauf, "\n")
  train_ <- train %>% select(1:8, type)
  id_unique_train <- birds$id[seq(1, 420, 2)]
  id_unique_test <- birds$id[seq(2, 420, 2)]
  # prepare data and test
  SRxgboost_data_prep(yname = "type",
                      data_train = train_[seq(1, 420, 2), ],
                      data_test = train_[seq(2, 420, 2), ],
                      no_folds = 5,
                      objective = "multilabel")
  # run models
  SRxgboost_run(nround = 1000, eta = 0.1, obj = "multi:softprob", metric = "mAUC",
                runs = 50, nfold = 5)
  #
  ## lauf 2
  # initialisation
  lauf <- "mclass_softprob_train_test_2.csv"
  assign('lauf', lauf, envir = .GlobalEnv)
  cat(lauf, "\n")
  train_ <- train %>% select(4:10, type)
  id_unique_train <- birds$id[seq(1, 420, 2)]
  id_unique_test <- birds$id[seq(2, 420, 2)]
  # prepare data and test
  SRxgboost_data_prep(yname = "type",
                      data_train = train_[seq(1, 420, 2), ],
                      data_test = train_[seq(2, 420, 2), ],
                      no_folds = 5,
                      objective = "multilabel")
  # run models
  SRxgboost_run(nround = 1000, eta = 0.1, obj = "multi:softprob", metric = "mAUC",
                runs = 50, nfold = 5)
  #
  # make ensemble
  SRxgboost_make_ensemble(name = "mclass_softprob_train_test_1_2_ensemble",
                          lauf = c("mclass_softprob_train_test_1.csv",
                                   "mclass_softprob_train_test_2.csv"),
                          top_rank = c(3, 4))
  #
  # tests
  # no. of files
  testthat::expect_equal(length(list.files(
    paste0(path_output, "mclass_softprob_train_test_1_2_ensemble/"))), 14)
  #
  # clean up
  suppressWarnings(rm(OOF_metrics, TEST_metrics, OOFforecast, TESTforecast,
                      y_OOF, y_TEST, SummaryCV_META, train_))
  SRxgboost_cleanup()
})




# Multilabel Classification: clean up ----------------------------------------------------
#
suppressWarnings(rm(birds, train, id_unique_train, id_unique_test))




# Clean up ----------------------------------------------------------------
#
unlink(path_output, recursive = TRUE)
rm(path_output, path_to_data, this_file)
samuelreuther/SRxgboost documentation built on March 30, 2025, 12:48 a.m.