machine_learning | R Documentation |
These functions can be used to create a machine learning model based on different 'engines' and to generalise predicting outcomes based on such models. These functions are wrappers around tidymodels packages (especially parsnip, recipes, rsample, tune, and yardstick) created by RStudio.
ml_xg_boost(
.data,
outcome,
predictors = everything(),
training_fraction = 0.75,
strata = NULL,
na_threshold = 0.01,
correlation_threshold = 0.9,
centre = TRUE,
scale = TRUE,
engine = "xgboost",
mode = c("classification", "regression", "unknown"),
trees = 15,
...
)
ml_decision_trees(
.data,
outcome,
predictors = everything(),
training_fraction = 0.75,
strata = NULL,
na_threshold = 0.01,
correlation_threshold = 0.9,
centre = TRUE,
scale = TRUE,
engine = "rpart",
mode = c("classification", "regression", "unknown"),
tree_depth = 30,
...
)
ml_random_forest(
.data,
outcome,
predictors = everything(),
training_fraction = 0.75,
strata = NULL,
na_threshold = 0.01,
correlation_threshold = 0.9,
centre = TRUE,
scale = TRUE,
engine = "ranger",
mode = c("classification", "regression", "unknown"),
trees = 500,
...
)
ml_neural_network(
.data,
outcome,
predictors = everything(),
training_fraction = 0.75,
strata = NULL,
na_threshold = 0.01,
correlation_threshold = 0.9,
centre = TRUE,
scale = TRUE,
engine = "nnet",
mode = c("classification", "regression", "unknown"),
penalty = 0,
epochs = 100,
...
)
ml_nearest_neighbour(
.data,
outcome,
predictors = everything(),
training_fraction = 0.75,
strata = NULL,
na_threshold = 0.01,
correlation_threshold = 0.9,
centre = TRUE,
scale = TRUE,
engine = "kknn",
mode = c("classification", "regression", "unknown"),
neighbors = 5,
weight_func = "triangular",
...
)
ml_linear_regression(
.data,
outcome,
predictors = everything(),
training_fraction = 0.75,
strata = NULL,
na_threshold = 0.01,
correlation_threshold = 0.9,
centre = TRUE,
scale = TRUE,
engine = "lm",
mode = "regression",
...
)
ml_logistic_regression(
.data,
outcome,
predictors = everything(),
training_fraction = 0.75,
strata = NULL,
na_threshold = 0.01,
correlation_threshold = 0.9,
centre = TRUE,
scale = TRUE,
engine = "glm",
mode = "classification",
penalty = 0.1,
...
)
## S3 method for class 'certestats_ml'
confusion_matrix(data, ...)
## S3 method for class 'certestats_ml'
predict(object, new_data, type = NULL, ...)
apply_model_to(
object,
new_data,
add_certainty = TRUE,
only_prediction = FALSE,
correct_mistakes = TRUE,
impute_algorithm = "mice",
...
)
feature_importances(object, ...)
feature_importance_plot(object, ...)
roc_plot(object, ...)
gain_plot(object, ...)
tree_plot(object, ...)
correlation_plot(
data,
add_values = TRUE,
cols = everything(),
correlation_threshold = 0.9
)
get_metrics(object)
get_accuracy(object)
get_kappa(object)
get_recipe(object)
get_specification(object)
get_rows_testing(object)
get_rows_training(object)
get_original_data(object)
get_roc_data(object)
get_coefficients(object)
get_model_variables(object)
get_variable_weights(object)
tune_parameters(object, ..., only_params_in_model = FALSE, levels = 5, k = 10)
check_testing_predictions(object)
## S3 method for class 'certestats_ml'
autoplot(object, plot_type = "roc", ...)
## S3 method for class 'certestats_feature_importances'
autoplot(object, ...)
## S3 method for class 'certestats_tuning'
autoplot(object, type = c("marginals", "parameters", "performance"), ...)
.data |
Data set to train |
outcome |
Outcome variable, also called the response variable or the dependent variable; the variable that must be predicted. The value will be evaluated in |
predictors |
Explanatory variables, also called the predictors or the independent variables; the variables that are used to predict outcome |
training_fraction |
Fraction of rows to be used for training, defaults to 75%. The rest will be used for testing. If given a number over 1, the number will be considered to be the required number of rows for training. |
strata |
A variable in |
na_threshold |
Maximum fraction of |
correlation_threshold |
A value (default 0.9) to indicate the correlation threshold. Predictors with a correlation higher than this value will be removed from the model, using recipes::step_corr() |
centre |
A logical to indicate whether the predictors should be centred, using recipes::step_center() |
scale |
A logical to indicate whether the predictors should be scaled, using recipes::step_scale() |
engine |
R package or function name to be used for the model, will be passed on to |
mode |
Type of predicted value - defaults to |
trees |
An integer for the number of trees contained in the ensemble. |
... |
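Arguments to be passed on to the parsnip functions listed under the model functions below. For the tune_parameters() function, these must be dials package calls, such as dials::trees() (see the examples). For predict(), these are arguments passed on to the underlying prediction method. |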
tree_depth |
An integer for maximum depth of the tree. |
penalty |
A non-negative number representing the total amount of regularization (specific engines only). |
epochs |
An integer for the number of training iterations. |
neighbors |
A single integer for the number of neighbors
to consider (often called k). |
weight_func |
A single character for the type of kernel function used
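to weight distances between samples. Valid choices are: "rectangular", "triangular", "epanechnikov", "biweight", "triweight", "cos", "inv", "gaussian", "rank", or "optimal". |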
object , data |
outcome of machine learning model |
new_data |
A rectangular data object, such as a data frame. |
type |
A single character value or NULL (the default). |
add_certainty |
a logical to indicate whether certainties should be added to the output data.frame |
only_prediction |
a logical to indicate whether predictions must be returned as vector, otherwise returns a data.frame |
correct_mistakes |
a logical to indicate whether missing variables and missing values should be added to |
impute_algorithm |
the algorithm to use in |
add_values |
a logical to indicate whether values must be printed in the tiles |
cols |
columns to use for correlation plot, defaults to everything() |
only_params_in_model |
a logical to indicate whether only parameters in the model should be tuned |
levels |
An integer for the number of values of each parameter to use
to make the regular grid. |
k |
The number of partitions of the data set |
plot_type |
the plot type, can be "roc" (default), "gain", "lift" or "pr" |
To predict regression (numeric values), the function ml_logistic_regression()
cannot be used.
To predict classifications (character values), the function ml_linear_regression()
cannot be used.
The workflow of the ml_*()
functions is basically like this (thus saving a lot of tidymodels
functions to type):
                      .data
                        |
             rsample::initial_split()
                    /        \
    rsample::training()   rsample::testing()
                    |                |
           recipes::recipe()         |
                    |                |
          recipes::step_corr()       |
                    |                |
         recipes::step_center()      |
                    |                |
          recipes::step_scale()      |
                    |                |
            recipes::prep()          |
               /         \           |
    recipes::bake()       recipes::bake()
               |                     |
       generics::fit()      yardstick::metrics()
               |                     |
            output           attributes(output)
The predict()
function can be used to apply a trained model to a new data set. Its wrapper apply_model_to()
works in the same way, but can also detect and fix missing variables, missing data points, and data type differences between the trained data and the input data.
Use feature_importances()
to get the importance of all features/variables. Use autoplot()
afterwards to plot the results. These two functions are combined in feature_importance_plot()
.
Use correlation_plot()
to plot the correlation between all variables, even characters. If the input is a certestats
ML model, the training data of the model will be plotted.
Use the get_model_variables()
function to return a zero-row data.frame with the variables that were used for training, even before the recipe steps.
Use the get_variable_weights()
function to determine the (rough) estimated weights of each variable in the model. This is not as reliable as retrieving coefficients, but it does work for any model. The weights are determined by running the model over all the highest and lowest values of each variable in the trained data. The function returns a data set with 1 row, of which the values sum up to 1.
Use the tune_parameters()
function to analyse the tuning parameters of any ml_*()
function. Without any parameters manually defined, it will try to tune all parameters of the underlying ML model. The tuning will be based on a K-fold cross-validation, of which the number of partitions can be set with k
. The number of levels
will be used to split the range of the parameters. For example, a range of 1-10 with levels = 2
will lead to [1, 10]
, while levels = 5
will lead to [1, 3, 5, 7, 9]
. The resulting data.frame will be sorted from best to worst. These results can also be plotted using autoplot()
.
The check_testing_predictions()
function combines the data used for testing from the original data with its predictions, so the original data can be reviewed per prediction.
Use autoplot()
on a model to plot the receiver operating characteristic (ROC) curve, the gain curve, the lift curve, or the precision-recall (PR) curve. For the ROC curve, the (overall) area under the curve (AUC) will be printed as subtitle.
A machine learning model of class certestats_ml
/ ... / model_fit
.
The ml_*()
functions return the following attributes:
properties
: a list with model properties: the ML function, engine package, training size, testing size, strata size, mode, and the different ML function-specific properties (such as tree_depth
in ml_decision_trees()
)
recipe
: a recipe as generated with recipes::prep()
, to be used for training and testing
data_original
: a data.frame containing the original data, possibly without invalid strata
data_structure
: a data.frame containing the original data structure (only trained variables) with zero rows
data_means
: a data.frame containing the means of the original data (only trained variables)
data_training
: a data.frame containing the training data of data_original
data_testing
: a data.frame containing the testing data of data_original
rows_training
: an integer vector of rows used for training in data_original
rows_testing
: an integer vector of rows used for testing in data_original
predictions
: a data.frame containing predicted values based on the testing data
metrics
: a data.frame with model metrics as returned by yardstick::metrics()
correlation_threshold
: a logical indicating whether recipes::step_corr()
has been applied
centre
: a logical indicating whether recipes::step_center()
has been applied
scale
: a logical indicating whether recipes::step_scale()
has been applied
These are the called functions from the parsnip
package. Arguments set in ...
will be passed on to these parsnip
functions:
ml_decision_trees
: parsnip::decision_tree()
ml_linear_regression
: parsnip::linear_reg()
ml_logistic_regression
: parsnip::logistic_reg()
ml_neural_network
: parsnip::mlp()
ml_nearest_neighbour
: parsnip::nearest_neighbor()
ml_random_forest
: parsnip::rand_forest()
ml_xg_boost
: parsnip::xgb_train()
# 'esbl_tests' is an included data set, see ?esbl_tests
print(esbl_tests, n = 5)
esbl_tests |> correlation_plot(add_values = FALSE) # red will be removed from model
# predict ESBL test outcome based on MICs using 2 different models
model1 <- esbl_tests |> ml_xg_boost(esbl, where(is.double))
model2 <- esbl_tests |> ml_decision_trees(esbl, where(is.double))
# Assessing A Model ----------------------------------------------------
model1 |> get_metrics()
model2 |> get_metrics()
model1 |> confusion_matrix()
# a correlation plot of a model shows the training data
model1 |> correlation_plot(add_values = FALSE)
model1 |> feature_importances()
model1 |> feature_importances() |> autoplot()
model2 |> feature_importance_plot()
# decision trees can also have a tree plot
model2 |> tree_plot()
# Applying A Model -----------------------------------------------------
# simply use base R `predict()` to apply a model:
model1 |> predict(esbl_tests)
# but apply_model_to() contains more info and can apply corrections:
model1 |> apply_model_to(esbl_tests)
model1 |> apply_model_to(esbl_tests[, 1:15])
esbl_tests2 <- esbl_tests
esbl_tests2[2, "CIP"] <- NA
esbl_tests2[5, "AMC"] <- NA
# with XGBoost, nothing will be changed (it can correct for missings):
model1 |> apply_model_to(esbl_tests2)
# with decision trees (or others), missings will be imputed:
model2 |> apply_model_to(esbl_tests2)
# Tuning A Model -------------------------------------------------------
# tune the parameters of a model (will take some time)
tuning <- model2 |>
tune_parameters(k = 5, levels = 3)
autoplot(tuning)
# tuning analysis by specifying (some) parameters
iris |>
ml_xg_boost(Species) |>
tune_parameters(mtry = dials::mtry(range = c(1, 3)),
trees = dials::trees())
# Practical Example #1 --------------------------------------------------
# this is what iris data set looks like:
head(iris)
# create a model to predict the species:
iris_model <- iris |> ml_xg_boost(Species)
iris_model_rf <- iris |> ml_random_forest(Species)
# is it a bit reliable?
get_metrics(iris_model)
# now try to predict species from an arbitrary data set:
to_predict <- data.frame(Sepal.Length = 5,
Sepal.Width = 3,
Petal.Length = 1.5,
Petal.Width = 0.5)
to_predict
# should be 'setosa' in the 'predicted' column with huge certainty:
iris_model |> apply_model_to(to_predict)
# which variables are generally important (only trained variables)?
iris_model |> feature_importances()
# how would the model do without the 'Sepal.Length' column?
to_predict <- to_predict[, c("Sepal.Width", "Petal.Width", "Petal.Length")]
to_predict
iris_model |> apply_model_to(to_predict)
# now compare that with a random forest model that requires imputation:
iris_model_rf |> apply_model_to(to_predict)
# the certainty is very different.
# Practical Example #2 -------------------------------------------------
# this example shows plotting methods for a model
# train model to predict genus based on MICs:
genus <- esbl_tests |> ml_xg_boost(genus, everything())
genus |> get_metrics()
genus |> feature_importance_plot()
genus |> autoplot()
genus |> autoplot(plot_type = "gain")
genus |> autoplot(plot_type = "pr")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.