knitr::opts_chunk$set( collapse = TRUE, comment = "#>" )
The purpose of this vignette is to show how to use tagi on an insurance regression problem with the Medical Cost dataset and to compare its results with those of the xgboost package. If the xgboost package is not installed, install it first with install.packages('xgboost').
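If you prefer an installation step that is safe to re-run, a minimal sketch is to make it conditional (this check is an illustration, not part of the original vignette code):

# Install xgboost only if it is not already available
if (!requireNamespace("xgboost", quietly = TRUE)) {
  install.packages("xgboost")
}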
Package loading:
require(tagi)
require(xgboost)
MedicalCost contains 1,338 rows and 10 variables.
data(MedicalCost, package = 'tagi')
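As a quick check of what was loaded, you can look at the dimensions and the first few rows (a simple sketch; the dataset is expected to have nine input covariates followed by the response in the tenth column, as used below):

# Dimensions and first observations of the dataset
dim(MedicalCost)
head(MedicalCost)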
First, set a seed so that the same results are obtained each time the code is run. Note that this step is not mandatory.
set.seed(100)
Define the neural network properties, such as the number of epochs, activation function, etc.
nobs <- nrow(MedicalCost)
ncvr <- 9
# Input features
x <- MedicalCost[, 1:ncvr]
# Output targets
y <- matrix(MedicalCost[, 10], ncol = 1)
nx <- ncol(x)
ny <- ncol(y)
NN <- list(
  "nx" = nx,                          # Number of input covariates
  "ny" = ny,                          # Number of output responses
  "batchSizeList" = c(1, 1, 1),       # Batch size [train, val, test]
  "nodes" = c(nx, 100, ny),           # Number of nodes for each layer
  "sx" = NULL,                        # Input standard deviation
  "sv" = 0.32 * matrix(1L, nrow = 1, ncol = ny),  # Observation standard deviation
  "maxEpoch" = 40,                    # Maximum number of learning epochs
  "hiddenLayerActivation" = "relu",   # Activation function for the hidden layer {'tanh','sigm','cdf','relu','softplus'}
  "outputActivation" = "linear",      # Activation function for the output layer {'linear','tanh','sigm','cdf','relu'}
  "ratio" = 0.8,                      # Ratio between training set and validation set
  "numSplits" = 20,                   # Number of splits
  "task" = "regression"               # Task: regression or classification
)
Next, set the factors used to initialize the weights and biases, as well as the indices for the split between the training and testing sets.
# Factor for initializing the biases
NN$factor4Bp <- 0.01 * matrix(1L, nrow = 1, ncol = length(NN$nodes) - 1)
# Factor for initializing the weights
NN$factor4Wp <- 0.25 * matrix(c(1 / NN$nodes[1], 1 / NN$nodes[2]), nrow = 1, ncol = 2)
trainIdx <- NULL
testIdx <- NULL
Run the neural network model and collect metrics at each epoch.
out_regression <- regression(NN, x, y, trainIdx, testIdx)
mp <- out_regression[[1]]      # Parameter means
Sp <- out_regression[[2]]      # Parameter covariances
metric <- out_regression[[3]]  # Metrics collected over the splits (RMSE, log-likelihood)
time <- out_regression[[4]]    # Computation time
Results
cat(sprintf("Average RMSE: %s +- %s", mean(metric[["RMSElist"]]), sd(metric[["RMSElist"]]))) cat(sprintf("Average LL: %s +- %s", mean(metric[["LLlist"]]), sd(metric[["LLlist"]])))
Now turn to the xgboost comparison. Split the data into input and response variables.
# Input features
x <- MedicalCost[, 1:9]
# Output targets
y <- matrix(MedicalCost[, 10], ncol = 1)
Next, set the number of splits and initialize the RMSE list.
Nsplit <- 20
RMSElist <- rep(0, Nsplit)
Run the extreme gradient boosting model with a tree booster and a fixed number of boosting iterations (40, matching the number of epochs used above), and collect the RMSE for each split.
for (s in 1:Nsplit) {
  out_split <- split(x, y, 0.8)
  # Prepare the xgb.DMatrix objects
  dtrain <- xgb.DMatrix(data = out_split[[1]], label = out_split[[2]])
  dtest <- xgb.DMatrix(data = out_split[[3]], label = out_split[[4]])
  # Initial parameters for gbtree
  params <- list(booster = "gbtree", objective = "reg:squarederror", eval_metric = "rmse",
                 eta = 0.3, gamma = 0, max_depth = 6, min_child_weight = 1,
                 subsample = 1, colsample_bytree = 1)
  # nrounds is not optimized; use 40 (as for the number of epochs)
  bst <- xgboost(params = params, data = dtrain, nrounds = 40, verbose = 0)
  pred <- predict(bst, dtest)
  # Evaluation
  RMSElist[s] <- computeError(out_split[[4]], pred)
  cat(sprintf("\nResults for Run # %s, RMSE: %s", s, RMSElist[s]))
}
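The error measure used above is tagi's computeError. Assuming it returns the root-mean-square error between observed and predicted values, an equivalent minimal sketch is:

# Root-mean-square error between observations y and predictions yhat
# (assumed to match what computeError reports for a regression task)
rmse <- function(y, yhat) {
  sqrt(mean((y - yhat)^2))
}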
Results
cat(sprintf("Average RMSE: %s +- %s", mean(RMSElist), sd(RMSElist)))
Run the extreme gradient boosting model with a tree booster and an optimized number of boosting iterations (selected by cross-validation), and collect the RMSE for each split.
for (s in 1:Nsplit) {
  out_split <- split(x, y, 0.8)
  # Prepare the xgb.DMatrix objects
  dtrain <- xgb.DMatrix(data = out_split[[1]], label = out_split[[2]])
  dtest <- xgb.DMatrix(data = out_split[[3]], label = out_split[[4]])
  # Initial parameters for gbtree
  params <- list(booster = "gbtree", objective = "reg:squarederror", eval_metric = "rmse",
                 eta = 0.3, gamma = 0, max_depth = 6, min_child_weight = 1,
                 subsample = 1, colsample_bytree = 1)
  # Find the best nrounds for this model by 5-fold cross-validation
  xgbcv <- xgb.cv(params = params, data = dtrain, nrounds = 100, nfold = 5,
                  showsd = TRUE, stratified = TRUE, print_every_n = 10,
                  early_stopping_rounds = 20, maximize = FALSE)
  bst <- xgboost(params = params, data = dtrain,
                 nrounds = which.min(xgbcv$evaluation_log$test_rmse_mean), verbose = 0)
  pred <- predict(bst, dtest)
  # Evaluation
  RMSElist[s] <- computeError(out_split[[4]], pred)
  cat(sprintf("\nResults for Run # %s, RMSE: %s", s, RMSElist[s]))
}
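To see how the number of rounds was selected, you can inspect the cross-validation log after a run (a quick sketch; the column names are the defaults produced by xgb.cv with eval_metric = "rmse"):

# Per-round cross-validation results (train and test RMSE averaged over folds)
head(xgbcv$evaluation_log)
# Round with the lowest cross-validated test RMSE
which.min(xgbcv$evaluation_log$test_rmse_mean)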
Results
cat(sprintf("Average RMSE: %s +- %s", mean(RMSElist), sd(RMSElist)))
Run the extreme gradient boosting model with a linear booster and a fixed number of boosting iterations (40, matching the number of epochs used above), and collect the RMSE for each split.
for (s in 1:Nsplit) {
  out_split <- split(x, y, 0.8)
  # Prepare the xgb.DMatrix objects
  dtrain <- xgb.DMatrix(data = out_split[[1]], label = out_split[[2]])
  dtest <- xgb.DMatrix(data = out_split[[3]], label = out_split[[4]])
  # Initial parameters for gblinear
  params <- list(booster = "gblinear", objective = "reg:linear")
  # nrounds is not optimized; use 40 (as for the number of epochs)
  bst <- xgboost(params = params, data = dtrain, nrounds = 40, verbose = 0)
  pred <- predict(bst, dtest)
  # Evaluation
  RMSElist[s] <- computeError(out_split[[4]], pred)
  cat(sprintf("\nResults for Run # %s, RMSE: %s", s, RMSElist[s]))
}
Results
cat(sprintf("Average RMSE: %s +- %s", mean(RMSElist), sd(RMSElist)))
Run the extreme gradient boosting model with a linear booster and an optimized number of boosting iterations (selected by cross-validation), and collect the RMSE for each split.
for (s in 1:Nsplit) {
  out_split <- split(x, y, 0.8)
  # Prepare the xgb.DMatrix objects
  dtrain <- xgb.DMatrix(data = out_split[[1]], label = out_split[[2]])
  dtest <- xgb.DMatrix(data = out_split[[3]], label = out_split[[4]])
  # Initial parameters for gblinear
  params <- list(booster = "gblinear", objective = "reg:linear")
  # Find the best nrounds for this model by 5-fold cross-validation
  xgbcv <- xgb.cv(params = params, data = dtrain, nrounds = 100, nfold = 5,
                  showsd = TRUE, stratified = TRUE, print_every_n = 10,
                  early_stopping_rounds = 20, maximize = FALSE)
  bst <- xgboost(params = params, data = dtrain,
                 nrounds = which.min(xgbcv$evaluation_log$test_rmse_mean), verbose = 0)
  pred <- predict(bst, dtest)
  # Evaluation
  RMSElist[s] <- computeError(out_split[[4]], pred)
  cat(sprintf("\nResults for Run # %s, RMSE: %s", s, RMSElist[s]))
}
Results
cat(sprintf("Average RMSE: %s +- %s", mean(RMSElist), sd(RMSElist)))