trainXgboost | R Documentation |
Function does some preprocessing and calls xgboost to train gradient boosted trees.
trainXgboost(
data,
allowed.codes,
testCases = NULL,
returnPredictions = FALSE,
coding_index = NULL,
preprocessing = list(stopwords = character(0), stemming = "de", countWords = TRUE),
tuning = list(eta = 0.5, lambda = 1e-04, alpha = 0, max.depth = 20, gamma = 0.6,
min_child_weight = 0, max_delta_step = 1, subsample = 0.75, colsample_bytree = 1,
colsample_bylevel = 1, nrounds = 40, early_stopping_rounds = 1,
early.stopping.max.diff = sum(testCases)/100, early.stopping.precision.digits = 3,
nthread = 1, verbose = 1)
)
data |
a data.table created with removeFaultyAndUncodableAnswers_And_PrepareForAnalysis |
allowed.codes |
a vector containing all the labels of the codes (even codes that do not appear in the data are possible). |
testCases |
If not NULL, a logical vector with one entry per row of data; rows marked TRUE are held out as test cases (used for evaluation and early stopping). |
returnPredictions |
(only used if testCases are given.) If TRUE, a data.table with predictions for all testCases is returned. Otherwise the xgboost model is returned, which can be used for diagnostics but not inside predictXgboost. |
coding_index |
a data.table with columns
|
preprocessing |
a list with elements
|
tuning |
a list with elements that will be passed to
|
See run_algorithms.R for some comments about tuning.
If testCases = NULL (default), an xgboost model to be used with predictXgboost is returned.
predictXgboost
, xgb.train
# set up data
data(occupations)
allowed.codes <- c("71402", "71403", "63302", "83112", "83124", "83131", "83132", "83193", "83194", "-0004", "-0030")
allowed.codes.titles <- c("Office clerks and secretaries (without specialisation)-skilled tasks", "Office clerks and secretaries (without specialisation)-complex tasks", "Gastronomy occupations (without specialisation)-skilled tasks",
"Occupations in child care and child-rearing-skilled tasks", "Occupations in social work and social pedagogics-highly complex tasks", "Pedagogic specialists in social care work and special needs education-unskilled/semiskilled tasks", "Pedagogic specialists in social care work and special needs education-skilled tasks", "Supervisors in education and social work, and of pedagogic specialists in social care work", "Managers in education and social work, and of pedagogic specialists in social care work",
"Not precise enough for coding", "Student assistants")
proc.occupations <- removeFaultyAndUncodableAnswers_And_PrepareForAnalysis(occupations, colNames = c("orig_answer", "orig_code"), allowed.codes, allowed.codes.titles)
##### Tune parameters with verbose=1 output. We split the data into training and evaluation set of size n.test = 50
# NOTE: n.test must be defined BEFORE it is used to build the train/test split
# (the original example used n.test in sample() before assigning it).
n.test <- 50
group <- sample(c(rep("test", n.test), rep("train", nrow(proc.occupations) - n.test)))
# Output a test dataset of predictions with 'returnPredictions = TRUE'
eval.dataset <- trainXgboost(proc.occupations, allowed.codes = allowed.codes, testCases = group == "test", returnPredictions = TRUE,
preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", countWords = FALSE),
tuning = list(eta = 0.5, lambda = 1e-4, alpha = 0,
max.depth = 20, gamma = 0.6,
min_child_weight = 0, max_delta_step = 1,
subsample = 0.75, colsample_bytree = 1, colsample_bylevel=1,
nrounds= 3, early_stopping_rounds = 1,
early.stopping.max.diff = n.test / 100, early.stopping.precision.digits = 3,
nthread = 8, verbose=1)
)
# For each answer (id), keep the code with the highest predicted probability
# and compute the overall accuracy (share of test cases coded correctly).
eval.dataset[, .SD[which.max(pred.prob), list(ans, true.code = code, pred.code, acc = code == pred.code)], by = id][, mean(acc)]
# Summarize prediction quality on the n.test held-out cases.
produceResults(expandPredictionResults(eval.dataset, allowed.codes = allowed.codes, method.name = "xgboost"), k = 1, n = n.test, num.codes = length(allowed.codes))
# Same as before, but with 'returnPredictions = FALSE' the fitted xgboost
# model itself is returned instead of a prediction table.
XGboostModel <- trainXgboost(proc.occupations, allowed.codes = allowed.codes, testCases = group == "test", returnPredictions = FALSE,
preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", countWords = FALSE),
tuning = list(eta = 0.5, lambda = 1e-4, alpha = 0,
max.depth = 20, gamma = 0.6,
min_child_weight = 0, max_delta_step = 1,
subsample = 0.75, colsample_bytree = 1, colsample_bylevel=1,
nrounds= 3, early_stopping_rounds = 1,
early.stopping.max.diff = n.test / 100, early.stopping.precision.digits = 3,
nthread = 8, verbose=1)
)
# Same as before, but without test data and without early stopping
# (not recommended because results can be worse).
# NOTE(review): early.stopping.max.diff / early.stopping.precision.digits are
# still passed although early_stopping_rounds = NULL; presumably they are
# ignored in that case — confirm against trainXgboost's documentation.
XGboostModel <- trainXgboost(proc.occupations, allowed.codes = allowed.codes, testCases = NULL, returnPredictions = FALSE,
preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", countWords = FALSE),
tuning = list(eta = 0.5, lambda = 1e-4, alpha = 0,
max.depth = 20, gamma = 0.6,
min_child_weight = 0, max_delta_step = 1,
subsample = 0.75, colsample_bytree = 1, colsample_bylevel=1,
nrounds= 3, early_stopping_rounds = NULL,
early.stopping.max.diff = n.test / 100, early.stopping.precision.digits = 3,
nthread = 8, verbose=0)
)
# Same as before, now using the coding index as additional input.
# Point path_to_file to your local copy of the German coding index
# (Gesamtberufsliste der BA), then build the index with
# prepare_German_coding_index_Gesamtberufsliste_der_BA().
# path_to_file <- ".../Gesamtberufsliste_der_BA.xlsx"
# coding_index_excerpt <- prepare_German_coding_index_Gesamtberufsliste_der_BA(path_to_file, count.categories = FALSE)
XGboostModel <- trainXgboost(proc.occupations, allowed.codes = allowed.codes, testCases = NULL, returnPredictions = FALSE,
coding_index = coding_index_excerpt,
preprocessing = list(stopwords = tm::stopwords("de"), stemming = "de", countWords = FALSE),
tuning = list(eta = 0.5, lambda = 1e-4, alpha = 0,
max.depth = 20, gamma = 0.6,
min_child_weight = 0, max_delta_step = 1,
subsample = 0.75, colsample_bytree = 1, colsample_bylevel=1,
nrounds= 3, early_stopping_rounds = NULL,
early.stopping.max.diff = n.test / 100, early.stopping.precision.digits = 3,
nthread = 8, verbose=0)
)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.