```r
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.path = "man/figures/README-",
  out.width = "100%"
)
```
The stackeR package implements the classical model stacking algorithm known as SuperLearner. It is built around resampleR and grideR and has as few dependencies as possible.
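The core idea of stacking: fit the base models on cross-validation folds, collect their out-of-fold predictions, and train a metamodel on those predictions. Below is a minimal, base-R-only sketch of that idea on the built-in `mtcars` data; it does not use the stackeR API, and all names in it (`base_learners`, `oof`, etc.) are illustrative only.

```r
# Conceptual sketch of stacking (base R only, not the stackeR API):
# 1. make CV folds, 2. collect out-of-fold predictions of the base models,
# 3. fit a metamodel (here: lm) on those predictions.
set.seed(42)
k <- 5
folds <- sample(rep(1:k, length.out = nrow(mtcars)))

# Two toy base learners: linear models on different feature subsets
base_learners <- list(
  m1 = function(train) lm(mpg ~ wt + hp, data = train),
  m2 = function(train) lm(mpg ~ disp + qsec, data = train)
)

# Out-of-fold predictions: each observation is predicted only by models
# that never saw it during training
oof <- sapply(base_learners, function(fit_fun) {
  preds <- numeric(nrow(mtcars))
  for (i in 1:k) {
    fit <- fit_fun(mtcars[folds != i, ])
    preds[folds == i] <- predict(fit, newdata = mtcars[folds == i, ])
  }
  preds
})

# Metamodel: a linear combination of the base model predictions
meta_data <- data.frame(mpg = mtcars$mpg, oof)
metamodel <- lm(mpg ~ m1 + m2, data = meta_data)
coef(metamodel)
```

stackeR automates the same workflow, with resampleR providing the resampling splits (see `resampleR::cv_base()` below).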
You can install the development version of stackeR from GitHub with:
# install.packages("devtools") devtools::install_github("statist-bhfz/stackeR")
We will use the `BostonHousing` data to predict the median value of owner-occupied homes in USD 1000's (`medv`):
```r
library(data.table)
library(grideR)
library(stackeR)
library(mlbench)
library(ggplot2)

# Input data
data(BostonHousing)
dt <- as.data.table(BostonHousing)
dt[, chas := as.numeric(chas) - 1]
set.seed(42)
dt <- dt[sample(1:.N, .N), ] # random shuffle

# Train/test split
train_test_split <- c(rep(0, 400), rep(1, 106)) # ~80%/20%
dt_train <- dt[train_test_split == 0, ]
dt_test <- dt[train_test_split == 1, ]
```
Let's fit a metamodel (a simple linear model) with two base models, xgboost and catboost:
```r
# data.table with resamples
splits <- resampleR::cv_base(dt_train, "medv")

# List of models
models <- list("xgboost" = xgb_fit,
               "catboost" = catboost_fit)

# Model parameters
xgb_params <- data.table(
  max_depth = 6,
  eta = 0.025,
  colsample_bytree = 0.9,
  subsample = 0.8,
  gamma = 0,
  min_child_weight = 5,
  alpha = 0,
  lambda = 1
)

xgb_args <- list(
  nrounds = 500,
  early_stopping_rounds = 10,
  booster = "gbtree",
  eval_metric = "rmse",
  objective = "reg:linear",
  verbose = 0
)

catboost_params <- data.table(
  iterations = 1000,
  learning_rate = 0.05,
  depth = 8,
  loss_function = "RMSE",
  eval_metric = "RMSE",
  random_seed = 42,
  od_type = "Iter",
  od_wait = 10,
  use_best_model = TRUE,
  logging_level = "Silent"
)

catboost_args <- NULL

model_params <- list(xgb_params, catboost_params)
model_args <- list(xgb_args, catboost_args)

# Dummy preprocessing function.
# A real function will contain imputation, feature engineering etc.,
# with all statistics computed on the train folds and applied to the validation fold
preproc_fun_example <- function(data) return(data[])

# List of preprocessing functions, one for each model
preproc_funs <- list(preproc_fun_example, preproc_fun_example)

metamodel_obj <- metamodel_fit(data = dt_train,
                               target = "medv",
                               split = splits,
                               models = models,
                               model_params = model_params,
                               model_args = model_args,
                               preproc_funs = preproc_funs,
                               metamodel = lm,
                               metamodel_params = list(),
                               metamodel_interface = "formula")

metamodel_obj$call$formula
metamodel_obj$coefficients
```
The final step is to fit the base models on the entire train dataset, get base model predictions on the test set, and get metamodel predictions using the first-level predictions as input:
```r
first_level_preds <- across_models(data = dt,
                                   target = "medv",
                                   split = train_test_split,
                                   models = models,
                                   model_params = model_params,
                                   model_args = model_args,
                                   preproc_funs = preproc_funs)
first_level_preds[, split := "split_1"] # to match lm formula

# Quick check to ensure that the correlation between base model predictions
# isn't too strong
ggplot(first_level_preds) +
  geom_point(aes(ground_truth, xgboost), color = "red") +
  geom_point(aes(ground_truth, catboost), color = "green")

test_preds <- data.table(
  ground_truth = dt_test[, medv],
  preds = predict.lm(metamodel_obj, newdata = first_level_preds)
)

ggplot(test_preds) +
  geom_point(aes(ground_truth, preds))

ggplot(test_preds) +
  geom_point(aes(ground_truth, preds), size = 3) +
  geom_point(aes(ground_truth, xgboost), first_level_preds,
             color = "red", alpha = 0.7) +
  geom_point(aes(ground_truth, catboost), first_level_preds,
             color = "green", alpha = 0.7) +
  geom_abline(intercept = 0, slope = 1)

rmse <- function(ground_truth, prediction) {
  sqrt(mean((ground_truth - prediction)^2))
}

rmse(first_level_preds$ground_truth, first_level_preds$xgboost)
rmse(first_level_preds$ground_truth, first_level_preds$catboost)
rmse(test_preds$ground_truth, test_preds$preds) # smallest rmse
```