# Vignette setup: merge source and output of each chunk into a single block.
knitr::opts_chunk$set(collapse = TRUE)

# devtools::load_all()
library(compboost)

# Limit the width of printed output to 80 characters.
options(width = 80)

# Optional packages used by the vignette; `dependencies` is TRUE only if the
# namespace of every listed package can be loaded.
required.pcks = c("ggplot2")
dependencies = all(
  vapply(required.pcks, requireNamespace, logical(1), quietly = TRUE)
)
We use the titanic dataset for binary classification on `Survived`. First of all, we store the train and test data in two data frames and remove all rows that contain `NA`s:
# Load the Titanic train and test sets, dropping every row that contains an NA.
df.train = na.omit(titanic::titanic_train)
df.test = na.omit(titanic::titanic_test)

# Inspect the structure of the cleaned training data.
str(df.train)
In the next step we transform the response to a factor with more intuitive levels:
# Recode the response as a factor with the labels "no" and "yes".
df.train$Survived = factor(df.train$Survived, labels = c("no", "yes"))

# Train and evaluation split for training: 60 % of the row indices are sampled
# for fitting, all remaining indices are kept for evaluation. The seed makes
# the split reproducible.
set.seed(1111)
idx.train = sample(
  x = seq_len(nrow(df.train)),
  size = 0.6 * nrow(df.train)
)
idx.eval = setdiff(seq_len(nrow(df.train)), idx.train)
This split is used during training to calculate the out-of-bag risk.
Due to the `R6` API, it is necessary to create a new class object which gets the data, the target as a character string, and the loss to use. Note that it is important to pass an initialized loss object:
# Create the model object on the training rows with "Survived" as target.
# The loss is handed over as an already constructed object.
cboost = Compboost$new(
  data = df.train[idx.train, ],
  target = "Survived",
  loss = LossBinomial$new()
)
Using an initialized object for the loss makes it possible to use a loss initialized with a custom offset.
Adding new base-learners is also done by giving a character string to indicate the feature. As the second argument, it is important to name an identifier for the factory, since we can define multiple base-learners on the same source.
For instance, we can define a spline and a linear base-learner of the same feature:
# Spline base-learner of age, registered under the identifier "spline":
cboost$addBaselearner("Age", "spline", BaselearnerPSpline)

# Linear base-learner of age, registered under the identifier "linear"
# (degree = 1 with intercept is default):
cboost$addBaselearner("Age", "linear", BaselearnerPolynomial)
Additional arguments can be specified after naming the base-learner. For a complete list see the functionality at the project page:
# Spline base-learner of fare; the spline settings are passed as additional
# named arguments after the base-learner class:
cboost$addBaselearner(
  "Fare", "spline", BaselearnerPSpline,
  degree = 2,
  n.knots = 14,
  penalty = 10,
  differences = 2
)
When adding categorical features, each group is added as a single base-learner to avoid biased feature selection. Also note that we don't need an intercept here:
# Base-learner for the categorical feature Sex, registered under the
# identifier "categorical" and fitted without an intercept.
cboost$addBaselearner(
  "Sex", "categorical", BaselearnerPolynomial,
  intercept = FALSE
)
Finally, we can check what factories are registered:
# Print the names of the base-learner factories registered so far.
cboost$getBaselearnerNames()
This logger logs the elapsed time. The time unit can be one of `microseconds`, `seconds` or `minutes`. The logger stops if `max.time` is reached. However, we do not use this logger as a stopper here:
# Register a time logger under the id "time". It only records the elapsed time
# in microseconds and is not used as a stopper.
cboost$addLogger(
  logger = LoggerTime,
  use.as.stopper = FALSE,
  logger.id = "time",
  max.time = 0,
  time.unit = "microseconds"
)
The out-of-bag risk logger does basically the same as the inbag risk logger but calculates the empirical risk using another data source. Therefore, the new data object has to be a list with data sources containing the evaluation data. This is done automatically by the `prepareData()` member of `Compboost`:
# Register an out-of-bag risk logger under the id "oob"; it is not used as a
# stopper. The remaining arguments are passed positionally, in this order:
# an initialized binomial loss, the value 0.01, the evaluation rows converted
# by prepareData(), and the response of the evaluation rows.
cboost$addLogger(
  logger = LoggerOobRisk,
  use.as.stopper = FALSE,
  logger.id = "oob",
  LossBinomial$new(),
  0.01,
  cboost$prepareData(df.train[idx.eval, ]),
  df.train[["Survived"]][idx.eval]
)
After defining all objects, we can train the model:
# Train for 1000 iterations with trace = 50.
cboost$train(1000, trace = 50)

# Print the trained model object.
cboost
Objects of the `Compboost` class have member functions such as `getEstimatedCoef()`, `getInbagRisk()` or `predict()` to access the results:
# Show the structure of the estimated coefficients, the inbag risk and the
# predictions returned by predict() without new data.
str(cboost$getEstimatedCoef())
str(cboost$getInbagRisk())
str(cboost$predict())
To obtain a vector of the selected base-learners, just call `getSelectedBaselearner()`:
# Count how often each base-learner was selected.
table(cboost$getSelectedBaselearner())
We can also predict on new data to get estimated probabilities. To do so, we have to set `response = TRUE`:
# Predict on the evaluation rows with response = TRUE.
prob.newdata = cboost$predict(df.train[idx.eval, ], response = TRUE)

# Cross-tabulate the observed labels against the labels derived from a 0.5
# cut-off.
# NOTE(review): values above 0.5 are labelled "no" -- presumably the first
# factor level is treated as the positive class; confirm against the package.
table(
  df.train[idx.eval, "Survived"],
  ifelse(prob.newdata > 0.5, "no", "yes")
)
To set the whole model to another iteration, simply call `train()` with the new number of iterations:
# Move the model to iteration 1500.
cboost$train(1500)

# Inspect coefficients, inbag risk and base-learner selection counts again.
str(cboost$getEstimatedCoef())
str(cboost$getInbagRisk())
table(cboost$getSelectedBaselearner())

# Get confusion matrix:
# NOTE(review): values above 0.5 are labelled "no" -- presumably the first
# factor level is treated as the positive class; confirm against the package.
prob.newdata = cboost$predict(df.train[idx.eval, ], response = TRUE)
table(
  df.train[idx.eval, "Survived"],
  ifelse(prob.newdata > 0.5, "no", "yes")
)
To visualize a base-learner, it is important to use exactly one of the names returned by `getBaselearnerNames()`:
# Plot the "Age_spline" base-learner: once without an explicit `iters`
# argument and once for a set of selected iterations.
gg1 = cboost$plot("Age_spline")
gg2 = cboost$plot(
  "Age_spline",
  iters = c(50, 100, 500, 1000, 1500)
)
# Build both "Age_spline" plots and arrange them side by side.
gg1 = cboost$plot("Age_spline")
gg2 = cboost$plot(
  "Age_spline",
  iters = c(200, 500, 1000, 1500)
)

gridExtra::grid.arrange(gg1, gg2, ncol = 2)
# Plot the "Fare_spline" base-learner: once without an explicit `iters`
# argument and once for a set of selected iterations.
# The original chunk repeated the "Age_spline" plot calls verbatim although the
# figure that follows it shows "Fare_spline"; the feature name is aligned here.
gg1 = cboost$plot("Fare_spline")
gg2 = cboost$plot(
  "Fare_spline",
  iters = c(50, 100, 500, 1000, 1500)
)
# Build both "Fare_spline" plots and arrange them side by side.
gg1 = cboost$plot("Fare_spline")
gg2 = cboost$plot(
  "Fare_spline",
  iters = c(200, 500, 1000, 1500)
)

gridExtra::grid.arrange(gg1, gg2, ncol = 2)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.