#' Train an extreme gradient boosted tree model
#'
#' Function does some preprocessing and calls xgboost to train gradient boosted trees.
#'
#' See run_algorithms.R for some comments about tuning.
#'
#' @param data a data.table created with \code{\link{removeFaultyAndUncodableAnswers_And_PrepareForAnalysis}}
#' @param allowed.codes a vector containg all the labels of the codes (even those that are not in the data are possible.)
#' @param testCases If \code{NULL} (default) the model is trained as usual. If \code{testCases} is a logical vector equal to the length of \code{data}, the function splits data into a separate evaluation set (the elements with \code{testCases == TRUE}), prints missclassification error of training and evaluation dataset and returns the predictions from the evaluation dataset.
#' @param returnPredictions (only used if testCases are given.) If TRUE, a data.table with predictions for all testCases is returned. Otherwise the xgboost model is returned which can be used for diagnostics but not inside \code{\link{predictXgboost}} because the term-document matrix will be calculated in a different way.
#' @param coding_index a data.table with columns
#' \describe{
#' \item{title}{a character vector, a preprocessed character vector with entries from the coding index.}
#' \item{Code}{a character vector with associated classification codes.}
#' }
#' @param preprocessing a list with elements
#' \describe{
#' \item{stopwords}{a character vector, use \code{tm::stopwords("de")} for German stopwords.}
#' \item{stemming}{\code{NULL} for no stemming and \code{"de"} for stemming using the German porter stemmer.}
#' \item{countWords}{Set to TRUE if the predictor matrix should contain a column for answer length.}
#' }
#' @param tuning a list with elements that will be passed to \code{\link[xgboost]{xgb.train}} except for two parameters
#' \describe{
#' \item{early.stopping.max.diff}{If the sum of probabilities of the most probable category is by \code{early.stopping.max.diff} cases larger than the observed number of cases correctly predicted, return Infinity, a value that will support early stopping. Idea behind this: Once the sum of predicted probabilities becomes too large, it is unusual (impossible) to decrease it again. Thus, we have overfitting.}
#' \item{early.stopping.precision.digits}{Logloss is rounded with \code{early.stopping.precision.digits}. This leads to early stopping if improvements in logloss are actually to small.}
#' }
#'
#' @seealso \code{\link{predictXgboost}}, \code{\link[xgboost]{xgb.train}}
#'
#' @return If \code{testCases = NULL} (default) a xgboost model to be used with \code{\link{predictXgboost}}.
#' @import data.table
#' @export
#'
#' @examples
#' # set up data
#' data(occupations)
#' allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030")
#' allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks",
#' "Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work",
#' "Not precise enough for coding", "Student assistants")
#' proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles)
#' group <- sample(c(rep("test", n.test), rep("train", nrow(proc.occupations)-n.test)))
#' ##### Tune pararameters with verbose=1 output. We split the data into training and evaluation set of size n.test = 50
#' n.test <- 50
#'
#' # output test dataset with 'returnPredictions = TRUE
#' eval.dataset <- trainXgboost(proc.occupations, allowed.codes = allowed.codes, testCases = group == "test", returnPredictions = TRUE,
#' preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", countWords = FALSE),
#' tuning = list(eta = 0.5, lambda = 1e-4, alpha = 0,
#' max.depth = 20, gamma = 0.6,
#' min_child_weight = 0, max_delta_step = 1,
#' subsample = 0.75, colsample_bytree = 1, colsample_bylevel=1,
#' nrounds= 3, early_stopping_rounds = 1,
#' early.stopping.max.diff = n.test / 100, early.stopping.precision.digits = 3,
#' nthread = 8, verbose=1)
#' )
#' eval.dataset[, .SD[which.max(pred.prob), list(ans, true.code = code, pred.code, acc = code == pred.code)], by = id][, mean(acc)]
#' produceResults(expandPredictionResults(eval.dataset, allowed.codes = allowed.codes, method.name = "xgboost"), k = 1, n = n.test, num.codes = length(allowed.codes))
#'
#' # same as before but output the model
#' XGboostModel <- trainXgboost(proc.occupations, allowed.codes = allowed.codes, testCases = group == "test", returnPredictions = FALSE,
#' preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", countWords = FALSE),
#' tuning = list(eta = 0.5, lambda = 1e-4, alpha = 0,
#' max.depth = 20, gamma = 0.6,
#' min_child_weight = 0, max_delta_step = 1,
#' subsample = 0.75, colsample_bytree = 1, colsample_bylevel=1,
#' nrounds= 3, early_stopping_rounds = 1,
#' early.stopping.max.diff = n.test / 100, early.stopping.precision.digits = 3,
#' nthread = 8, verbose=1)
#' )
#'
#' # same as before, but without test data and without early stopping (not recommended because results can be worse)
#' XGboostModel <- trainXgboost(proc.occupations, allowed.codes = allowed.codes, testCases = NULL, returnPredictions = FALSE,
#' preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", countWords = FALSE),
#' tuning = list(eta = 0.5, lambda = 1e-4, alpha = 0,
#' max.depth = 20, gamma = 0.6,
#' min_child_weight = 0, max_delta_step = 1,
#' subsample = 0.75, colsample_bytree = 1, colsample_bylevel=1,
#' nrounds= 3, early_stopping_rounds = NULL,
#' early.stopping.max.diff = n.test / 100, early.stopping.precision.digits = 3,
#' nthread = 8, verbose=0)
#' )
#'
#' # same as before, now using the coding index
#' # point path_to_file to your local file
#' # path_to_file <- ".../Gesamtberufsliste_der_BA.xlsx"
#' # coding_index_excerpt <- prepare_German_coding_index_Gesamtberufsliste_der_BA(path_to_file, count.categories = FALSE)
#' XGboostModel <- trainXgboost(proc.occupations, allowed.codes = allowed.codes, testCases = NULL, returnPredictions = FALSE,
#' coding_index = coding_index_excerpt,
#' preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", countWords = FALSE),
#' tuning = list(eta = 0.5, lambda = 1e-4, alpha = 0,
#' max.depth = 20, gamma = 0.6,
#' min_child_weight = 0, max_delta_step = 1,
#' subsample = 0.75, colsample_bytree = 1, colsample_bylevel=1,
#' nrounds= 3, early_stopping_rounds = NULL,
#' early.stopping.max.diff = n.test / 100, early.stopping.precision.digits = 3,
#' nthread = 8, verbose=0)
#' )
trainXgboost <- function(data, allowed.codes, testCases = NULL, returnPredictions = FALSE,
coding_index = NULL,
preprocessing = list(stopwords = character(0), stemming = "de", countWords = TRUE),
tuning = list(eta = 0.5, lambda = 1e-4, alpha = 0,
max.depth = 20, gamma = 0.6,
min_child_weight = 0, max_delta_step = 1,
subsample = 0.75, colsample_bytree = 1, colsample_bylevel=1,
nrounds=40, early_stopping_rounds=1,
early.stopping.max.diff = sum(testCases) / 100, early.stopping.precision.digits = 3,
nthread = 1, verbose=1)) {
if (!requireNamespace("xgboost", quietly = TRUE)) {
stop("Package \"xgboost\" needed if you want to use this function. Please install it.",
call. = FALSE)
}
# add job titles from the coding index to the training data
if (!is.null(coding_index)) {
coding_index <- rbind(coding_index[, list(title = bezMale, code = Code, origin = "codingIndex")],
coding_index[, list(title = bezFemale, code = Code, origin = "codingIndex")])
data <- rbind(data[, list(ans, code = as.character(code), origin = "training")],
coding_index[, list(ans = title, code, origin)])
# our training data set increases in size -> make sure the new cases are not considered as test data
if (!is.null(testCases)) testCases <- c(testCases, rep(FALSE, nrow(coding_index)))
}
# and prepare codes
outcome <- factor(as.character(data[,code]), levels = unique(c(allowed.codes, as.character(data[,code])))) # include data[,code] because this may contain new codes from the coding index
outcome_dictionary <- data.table(num = 0:(nlevels(outcome) - 1), cat = levels(outcome))
# preprocessing
ans <- data[, stringPreprocessing(ans)]
# prepare text for efficient computation -> transform to sparse matrix
matrix <- asDocumentTermMatrix(ans, vect.vocab = NULL,
stopwords = preprocessing$stopwords,
stemming = preprocessing$stemming,
type = "dgCMatrix")
# include feature for number of words
if (preprocessing$countWords) {
ans_freq <- sapply(strsplit(ans, " "), length)
matrix$dtm <- cbind(matrix$dtm, ans_freq)
}
# include feature for origin of training data (0 if training, 1 if coding index)
if (!is.null(coding_index)) {
matrix$dtm <- cbind(matrix$dtm, origin = ifelse(data[, origin] == "training", 0, 1))
}
if (is.null(testCases)) { # standard if we want a model
dtrain <- xgboost::xgb.DMatrix(
data=matrix$dtm,
label=as.numeric(outcome) - 1)
trainingEmptyColumns <- FALSE # no columns from predictor matrix will be excluded during prediction
watchlist <- list() # not used
} else {
### Use the following lines to see live evaluation
### remove columns from predictor matrix if they are all zero in the training data
trainingEmptyColumns <- Matrix::colSums(matrix$dtm[!testCases, ]) == 0
dtrain <- xgboost::xgb.DMatrix(
data=matrix$dtm[!testCases, !trainingEmptyColumns],
label=as.numeric(outcome[!testCases]) - 1)
dTest <- xgboost::xgb.DMatrix(data = matrix$dtm[testCases, !trainingEmptyColumns],
label=as.numeric(outcome[testCases]) - 1)
watchlist <- list(train = dtrain, eval = dTest)
}
# user defined evaluation function, return a pair metric_name, result
# NOTE: when you do customized loss function, the default prediction value is margin (not sure if this is correct)
# this may make buildin evalution metric not function properly
# for example, we are doing logistic loss, the prediction is score before logistic transformation
# the buildin evaluation error assumes input is after logistic transformation
# Take this in mind when you use the customization, and maybe you need write customized evaluation function
# IDEA: we start with small probabilities that increase with the number of rounds. This means in the beginning we expect to predict more correct categories than what sum(probabilities) would suggest. Stop early if the probabilities grow too large (and the model overfits)
evalerror <- function(preds, dtrain) {
labels <- xgboost::getinfo(dtrain, "label")
pred2 <- matrix(preds, nrow = nrow(outcome_dictionary), byrow = FALSE) # one row per person
n <- ncol(pred2)
pred.cat <- apply(pred2, 2, which.max) # for each n, the index of the category with maximal probability
num.categories.expected <- sum(pred2[cbind(pred.cat, 1:n )])
num.categories.correct.predicted <- sum(labels == (pred.cat - 1))
logloss <- -mean(log(pred2[cbind(labels+1, 1:n)]))
err <- num.categories.correct.predicted - num.categories.expected
# we would like to stop as soon as err < -tuning$early.stopping.max.diff (results become oversharp) or if logloss actually increases. Workaround to achieve this:
err <- if (err < -tuning$early.stopping.max.diff) Inf else logloss
return(list(metric = paste("Logloss:", round(logloss, 4), "N.predict:", num.categories.correct.predicted, "(Acc:", round(num.categories.correct.predicted/n, 4), ")-N.expect:", round(num.categories.expected, 2)), value = round(as.numeric(err), tuning$early.stopping.precision.digits))) # if many errors in a row are smaller than -5 this seems like a good point to stop running more rounds
}
# estimate model
fit <- xgboost::xgb.train(params = list( # see https://xgboost.readthedocs.io/en/latest//parameter.html
objective="multi:softprob", # "multi:softmax"
# eval_metric = "merror", eval_metric = "mlogloss", # the latter will be used for early stopping.
eval_metric = evalerror,
num_class = nrow(outcome_dictionary),
booster="gbtree",
eta=tuning$eta, # Control the learning rate, related to nrounds
lambda=tuning$lambda, # shrinks weights of each leaf (i.e. values that will be passed through a sigmoid) towards zero with a quadratic penalty
alpha=tuning$alpha, # shrinks weights of each leaf (i.e. values that will be passed through a sigmoid) towards zero with a absolute penalty
tree.method = "exact", # because we have 0-1 variables, this should not impede speed to much. Looks like the time is still the same (and the results also)
max.depth=tuning$max.depth, # Maximum depth of the tree, we'd probably like that every possible job title can be picked up by a tree, so this is related to the number of meaningful job titles a tree can find.
subsample=tuning$subsample, # subsample ratio of the training instance, Rational: 1 seems reasonable. In every round a tree should be able to pick up the category for every word. 0.9 was a bit worse/or maybe just slower. However, the difference between mlogloss for eval and training becomes smaller.
colsample_bytree=tuning$colsample_bytree, # subsample ratio of columns when constructing each tree. rational is the same as before. Likewise, 0.9 makes this a bit worse.
colsample_bylevel = tuning$colsample_bylevel, # even a value at 0.4 did not change the results by much. usually not used because one could achieve the same with the above options?
gamma = tuning$gamma, # highly important, 0.9 looks quite good for PASS. minimum loss reduction required to make a further partition, closely related to "gain" in the model dump, meaningful words have large gains.
min_child_weight = tuning$min_child_weight, # should probably be 0, when large the algorithm will stop early "stop trying to split once you reach a certain degree of purity in a node and your model can fit it" (see https://stats.stackexchange.com/questions/317073/explanation-of-min-child-weight-in-xgboost-algorithm)
max_delta_step = tuning$max_delta_step), # very meaningful, 8 is too large, 0 too low. supposed to be helpful for the estimation of probabilities with imbalanced datasets with logistic regression, see https://xgboost.readthedocs.io/en/latest/how_to/param_tuning.html
data=dtrain,
nthread= tuning$nthread, # number of CPU threads
nrounds= tuning$nrounds, # 80, # number of trees
early_stopping_rounds = tuning$early_stopping_rounds, # early stopping if the performance keeps getting worse consecutively for k rounds
verbose=tuning$verbose, # 1 # do not show partial info
nfold=1, # number of CV folds
watchlist = watchlist,
# feval=rmpse, # custom evaluation metric
maximize=FALSE # the lower the evaluation score the better
)
## some diagnostics to understand xgboost
# plot the first of nrounds trees
# feature.names <- matrix$dtm@Dimnames[[2]][!trainingEmptyColumns]
# (dt <- xgboost::xgb.model.dt.tree(feature.names, fit))
# xgboost::xgb.plot.tree(feature_names = feature.names, model = fit, n_first_tree = 0) # shows only the first tree (the one to predict "-0004")
# xgboost::xgb.plot.tree(feature_names = feature.names, model = fit, n_first_tree = 1) # shows only the first two trees (that predict "-0004" and "01104")
## xgboost::xgb.importance(feature.names, model = fit)
# write.table(data.table(featureid = 0:(length(feature.names)-1), featurename = feature.names, type = "q"), file = "feature_map.txt", quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE)
#
#
# xgboost::xgb.dump(model = fit, fname = "test.txt", fmap = "feature_map.txt", with_stats = TRUE)
#
# How to match feature names of splits that are following a current 'Yes' branch:
# merge(dt, dt[, .(ID, Y.Feature=Feature)], by.x='Yes', by.y='ID', all.x=TRUE)[order(Tree,Node)]
# xgboost::xgb.plot.deepness(fit)
if (!is.null(testCases) & returnPredictions == TRUE) {
res <- data.table(id = rep(1:length(ans[testCases]), each = nrow(outcome_dictionary)),
ans = rep(ans[testCases], each = nrow(outcome_dictionary)),
code = rep(outcome[testCases], each = nrow(outcome_dictionary)),
pred.code = rep(outcome_dictionary$cat, times = length(ans[testCases])),
pred.prob = predict(fit, dTest, ntreelimit = fit$best_ntreelimit)) # use optimal number of iterations (as measured by mlogloss) to make predictions (one might also want to use the last iterations because it is not so clear what is better. Differences are usually small.)
class(res) <- c(class(res), "occupationalPredictions")
attr(res, "evaluation_log") <- fit$evaluation_log
return(res)
}
# and save preprocessing
fit$outcome_dictionary <- outcome_dictionary
fit$vect.vocab <- matrix$vect.vocab
fit$trainingEmptyColumns <- trainingEmptyColumns # in case we used a training process with test data and early stopping
fit$preprocessing <- preprocessing
fit$coding_index <- coding_index # we won't need this because it is already in the training data
return(fit)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.