Nothing
# This file is part of the R package "aifeducation".
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 3 as published by
# the Free Software Foundation.
#
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>
#' @title Text embedding classifier with a neural net
#' @description Abstract class for neural nets with 'keras'/'tensorflow' and ' pytorch'.
#'
#' @return Objects of this class are used for assigning texts to classes/categories. For the creation and training of a
#' classifier an object of class [EmbeddedText] or [LargeDataSetForTextEmbeddings] on the one hand and a [factor] on
#' the other hand are necessary.
#'
#' The object of class [EmbeddedText] or [LargeDataSetForTextEmbeddings] contains the numerical text representations
#' (text embeddings) of the raw texts generated by an object of class [TextEmbeddingModel]. For supporting large data
#' sets it is recommended to use [LargeDataSetForTextEmbeddings] instead of [EmbeddedText].
#'
#' The `factor` contains the classes/categories for every text. Missing values (unlabeled cases) are supported and can
#' be used for pseudo labeling.
#'
#' For predictions an object of class [EmbeddedText] or [LargeDataSetForTextEmbeddings] has to be used which was
#' created with the same [TextEmbeddingModel] as for training.
#'
#' @family Classification
#' @export
TEClassifierRegular <- R6::R6Class(
classname = "TEClassifierRegular",
inherit = AIFEBaseModel,
public = list(
#' @field feature_extractor ('list()')\cr
#' List for storing information and objects about the feature_extractor.
feature_extractor = list(),
#' @field reliability ('list()')\cr
#'
#' List for storing central reliability measures of the last training.
#'
#' * `reliability$test_metric`: Array containing the reliability measures for the test data for
#' every fold and step (in case of pseudo-labeling).
#' * `reliability$test_metric_mean`: Array containing the reliability measures for the test data.
#' The values represent the mean values for every fold.
#' * `reliability$raw_iota_objects`: List containing all iota_object generated with the package `iotarelr`
#' for every fold at the end of the last training for the test data.
#'
#'
#' * `reliability$raw_iota_objects$iota_objects_end`: List of objects with class `iotarelr_iota2` containing the
#' estimated iota reliability of the second generation for the final model for every fold for the test data.
#' * `reliability$raw_iota_objects$iota_objects_end_free`: List of objects with class `iotarelr_iota2` containing
#' the estimated iota reliability of the second generation for the final model for every fold for the test data.
#' Please note that the model is estimated without forcing the Assignment Error Matrix to be in line with the
#' assumption of weak superiority.
#' * `reliability$iota_object_end`: Object of class `iotarelr_iota2` as a mean of the individual objects
#' for every fold for the test data.
#' * `reliability$iota_object_end_free`: Object of class `iotarelr_iota2` as a mean of the individual objects
#' for every fold. Please note that the model is estimated without forcing the Assignment Error Matrix to be in
#' line with the assumption of weak superiority.
#'
#'
#' * `reliability$standard_measures_end`: Object of class `list` containing the final measures for precision,
#' recall, and f1 for every fold.
#' * `reliability$standard_measures_mean`: `matrix` containing the mean measures for precision, recall, and f1.
#'
reliability = list(
test_metric = NULL,
test_metric_mean = NULL,
raw_iota_objects = list(
iota_objects_end = NULL,
iota_objects_end_free = NULL
),
iota_object_end = NULL,
iota_object_end_free = NULL,
standard_measures_end = NULL,
standard_measures_mean = NULL
),
# New-----------------------------------------------------------------------
#' @description Creating a new instance of this class.
#' @param ml_framework `string` Framework to use for training and inference. `ml_framework="tensorflow"` for
#' 'tensorflow' and `ml_framework="pytorch"` for 'pytorch'
#' @param name `string` Name of the new classifier. Please refer to common name conventions. Free text can be used
#' with parameter `label`.
#' @param label `string` Label for the new classifier. Here you can use free text.
#' @param text_embeddings An object of class [EmbeddedText] or [LargeDataSetForTextEmbeddings].
#' @param feature_extractor Object of class [TEFeatureExtractor] which should be used in order to reduce the number
#' of dimensions of the text embeddings. If no feature extractor should be applied set `NULL`.
#' @param target_levels `vector` containing the levels (categories or classes) within the target data. Please not
#' that order matters. For ordinal data please ensure that the levels are sorted correctly with later levels
#' indicating a higher category/class. For nominal data the order does not matter.
#' @param dense_layers `int` Number of dense layers.
#' @param dense_size `int` Number of neurons for each dense layer.
#' @param rec_layers `int` Number of recurrent layers.
#' @param rec_size `int` Number of neurons for each recurrent layer.
#' @param rec_type `string` Type of the recurrent layers. `rec_type="gru"` for Gated Recurrent Unit and
#' `rec_type="lstm"` for Long Short-Term Memory.
#' @param rec_bidirectional `bool` If `TRUE` a bidirectional version of the recurrent layers is used.
#' @param attention_type `string` Choose the relevant attention type. Possible values are `fourier` and `multihead`. Please note
#' that you may see different values for a case for different input orders if you choose `fourier` on linux.
#' @param self_attention_heads `int` determining the number of attention heads for a self-attention layer. Only
#' relevant if `attention_type="multihead"`
#' @param repeat_encoder `int` determining how many times the encoder should be added to the network.
#' @param intermediate_size `int` determining the size of the projection layer within a each transformer encoder.
#' @param add_pos_embedding `bool` `TRUE` if positional embedding should be used.
#' @param encoder_dropout `int` ranging between 0 and lower 1, determining the dropout for the dense projection
#' within the encoder layers.
#' @param dense_dropout `int` ranging between 0 and lower 1, determining the dropout between dense layers.
#' @param rec_dropout `int` ranging between 0 and lower 1, determining the dropout between bidirectional recurrent
#' layers.
#' @param recurrent_dropout `int` ranging between 0 and lower 1, determining the recurrent dropout for each
#' recurrent layer. Only relevant for keras models.
#' @param optimizer `string` `"adam"` or `"rmsprop"` .
#' @return Returns an object of class [TEClassifierRegular] which is ready for training.
configure = function(ml_framework = "pytorch",
name = NULL,
label = NULL,
text_embeddings = NULL,
feature_extractor = NULL,
target_levels = NULL,
dense_size = 4,
dense_layers=0,
rec_size = 4,
rec_layers=2,
rec_type = "gru",
rec_bidirectional = FALSE,
self_attention_heads = 0,
intermediate_size = NULL,
attention_type = "fourier",
add_pos_embedding = TRUE,
rec_dropout = 0.1,
repeat_encoder = 1,
dense_dropout = 0.4,
recurrent_dropout = 0.4,
encoder_dropout = 0.1,
optimizer = "adam") {
#check if already configured
private$check_config_for_FALSE()
# Checking of parameters--------------------------------------------------
if ((ml_framework %in% c("tensorflow", "pytorch")) == FALSE) {
stop("ml_framework must be 'tensorflow' or 'pytorch'.")
}
check_type(name, type = "string", FALSE)
check_type(label, type = "string", FALSE)
check_type(target_levels, c("vector"), FALSE)
check_type(dense_size, type = "int", FALSE)
check_type(dense_layers, type = "int", FALSE)
if(dense_layers>0){
if(dense_size<1){
stop("Dense layers added. Size for dense layers must be at least 1.")
}
}
check_type(rec_size, type = "int", FALSE)
check_type(rec_layers, type = "int", FALSE)
if(rec_layers>0){
if(rec_size<1){
stop("Recurrent layers added. Size for recurrent layers must be at least 1.")
}
}
check_type(self_attention_heads, type = "int", FALSE)
if (optimizer %in% c("adam", "rmsprop") == FALSE) {
stop("Optimzier must be 'adam' oder 'rmsprop'.")
}
if (attention_type %in% c("fourier", "multihead") == FALSE) {
stop("Optimzier must be 'fourier' oder 'multihead'.")
}
if (repeat_encoder > 0 & attention_type == "multihead" & self_attention_heads <= 0) {
stop("Encoder layer is set to 'multihead'. This requires self_attention_heads>=1.")
}
private$check_embeddings_object_type(text_embeddings, strict = TRUE)
#------------------------------------------------------------------------
# Set ML framework
private$ml_framework <- ml_framework
# Setting Label and Name
private$set_model_info(
model_name_root = name,
model_id = generate_id(16),
label = label,
model_date = date()
)
# Set TextEmbeddingModel
private$set_text_embedding_model(
model_info = text_embeddings$get_model_info(),
feature_extractor_info = text_embeddings$get_feature_extractor_info(),
times = text_embeddings$get_times(),
features = text_embeddings$get_features()
)
# Saving Configuration
config <- list(
use_fe = FALSE,
features = private$text_embedding_model[["features"]],
times = private$text_embedding_model[["times"]],
dense_size = dense_size,
dense_layers=dense_layers,
rec_size = rec_size,
rec_layers=rec_layers,
rec_type = rec_type,
rec_bidirectional = rec_bidirectional,
intermediate_size = intermediate_size,
attention_type = attention_type,
repeat_encoder = repeat_encoder,
dense_dropout = dense_dropout,
rec_dropout = rec_dropout,
recurrent_dropout = recurrent_dropout,
encoder_dropout = encoder_dropout,
add_pos_embedding = add_pos_embedding,
optimizer = optimizer,
act_fct = "gelu",
rec_act_fct = "tanh",
self_attention_heads = self_attention_heads
)
# Basic Information of Input and Target Data
variable_name_order <- dimnames(text_embeddings$embeddings)[[3]]
config["target_levels"] <- list(target_levels)
config["n_categories"] <- list(length(target_levels))
config["input_variables"] <- list(variable_name_order)
if (length(target_levels) > 2) {
# Multi Class
config["act_fct_last"] <- "softmax"
config["err_fct"] <- "categorical_crossentropy"
config["metric"] <- "categorical_accuracy"
config["balanced_metric"] <- "balanced_accuracy"
} else {
# Binary Classification
config["act_fct_last"] <- "sigmoid"
config["err_fct"] <- "binary_crossentropy"
config["metric"] <- "binary_accuracy"
config["balanced_metric"] <- "balanced_accuracy"
}
if (ml_framework == "tensorflow") {
if (length(target_levels) > 2) {
config["require_one_hot"] <- list(TRUE)
} else {
config["require_one_hot"] <- list(FALSE)
}
} else {
config["require_one_hot"] <- list(TRUE)
}
if (rec_layers > 0 | repeat_encoder > 0) {
config["require_matrix_map"] <- list(FALSE)
} else {
config["require_matrix_map"] <- list(TRUE)
}
self$model_config <- config
# Adjust configuration
private$adjust_configuration()
# Set FeatureExtractor and adapt config
self$check_feature_extractor_object_type(feature_extractor)
private$set_feature_extractor(feature_extractor)
# Set package versions
private$set_package_versions()
# Finalize configuration
private$set_configuration_to_TRUE()
# Create_Model
private$create_reset_model()
},
#-------------------------------------------------------------------------
#' @description Method for training a neural net.
#'
#' Training includes a routine for early stopping. In the case that loss<0.0001
#' and Accuracy=1.00 and Average Iota=1.00 training stops. The history uses the values
#' of the last trained epoch for the remaining epochs.
#'
#' After training the model with the best values for Average Iota, Accuracy, and Loss
#' on the validation data set is used as the final model.
#'
#' @param data_embeddings Object of class [EmbeddedText] or [LargeDataSetForTextEmbeddings].
#' @param data_targets `factor` containing the labels for cases stored in `data_embeddings`. Factor must be named
#' and has to use the same names used in `data_embeddings`.
#' @param data_folds `int` determining the number of cross-fold samples.
#' @param data_val_size `double` between 0 and 1, indicating the proportion of cases of each class which should be
#' used for the validation sample during the estimation of the model. The remaining cases are part of the training
#' data.
#' @param balance_class_weights `bool` If `TRUE` class weights are generated based on the frequencies of the
#' training data with the method Inverse Class Frequency'. If `FALSE` each class has the weight 1.
#' @param balance_sequence_length `bool` If `TRUE` sample weights are generated for the length of sequences based on
#' the frequencies of the training data with the method Inverse Class Frequency'. If `FALSE` each sequences length
#' has the weight 1.
#' @param use_sc `bool` `TRUE` if the estimation should integrate synthetic cases. `FALSE` if not.
#' @param sc_method `vector` containing the method for generating synthetic cases. Possible are `sc_method="adas"`,
#' `sc_method="smote"`, and `sc_method="dbsmote"`.
#' @param sc_min_k `int` determining the minimal number of k which is used for creating synthetic units.
#' @param sc_max_k `int` determining the maximal number of k which is used for creating synthetic units.
#' @param use_pl `bool` `TRUE` if the estimation should integrate pseudo-labeling. `FALSE` if not.
#' @param pl_max_steps `int` determining the maximum number of steps during pseudo-labeling.
#' @param pl_anchor `double` between 0 and 1 indicating the reference point for sorting the new cases of every
#' label. See notes for more details.
#' @param pl_max `double` between 0 and 1, setting the maximal level of confidence for considering a case for
#' pseudo-labeling.
#' @param pl_min `double` between 0 and 1, setting the minimal level of confidence for considering a case for
#' pseudo-labeling.
#' @param sustain_track `bool` If `TRUE` energy consumption is tracked during training via the python library
#' 'codecarbon'.
#' @param sustain_iso_code `string` ISO code (Alpha-3-Code) for the country. This variable must be set if
#' sustainability should be tracked. A list can be found on Wikipedia:
#' <https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes>.
#' @param sustain_region Region within a country. Only available for USA and Canada See the documentation of
#' codecarbon for more information. <https://mlco2.github.io/codecarbon/parameters.html>
#' @param sustain_interval `int` Interval in seconds for measuring power usage.
#' @param epochs `int` Number of training epochs.
#' @param batch_size `int` Size of the batches for training.
#' @param dir_checkpoint `string` Path to the directory where the checkpoint during training should be saved. If the
#' directory does not exist, it is created.
#' @param log_dir `string` Path to the directory where the log files should be saved. If no logging is desired set
#' this argument to `NULL`.
#' @param log_write_interval `int` Time in seconds determining the interval in which the logger should try to update
#' the log files. Only relevant if `log_dir` is not `NULL`.
#' @param trace `bool` `TRUE`, if information about the estimation phase should be printed to the console.
#' @param ml_trace `int` `ml_trace=0` does not print any information about the training process from pytorch on the
#' console.
#' @param n_cores `int` Number of cores which should be used during the calculation of synthetic cases. Only relevant if
#' `use_sc=TRUE`.
#' @return Function does not return a value. It changes the object into a trained classifier.
#' @details
#'
#' * `sc_max_k`: All values from sc_min_k up to sc_max_k are successively used. If
#' the number of sc_max_k is too high, the value is reduced to a number that allows the calculating of synthetic
#' units.
#' * `pl_anchor`: With the help of this value, the new cases are sorted. For
#' this aim, the distance from the anchor is calculated and all cases are arranged into an ascending order.
#'
train = function(data_embeddings,
data_targets,
data_folds = 5,
data_val_size = 0.25,
balance_class_weights = TRUE,
balance_sequence_length = TRUE,
use_sc = TRUE,
sc_method = "dbsmote",
sc_min_k = 1,
sc_max_k = 10,
use_pl = TRUE,
pl_max_steps = 3,
pl_max = 1.00,
pl_anchor = 1.00,
pl_min = 0.00,
sustain_track = TRUE,
sustain_iso_code = NULL,
sustain_region = NULL,
sustain_interval = 15,
epochs = 40,
batch_size = 32,
dir_checkpoint,
trace = TRUE,
ml_trace = 1,
log_dir = NULL,
log_write_interval = 10,
n_cores=auto_n_cores()) {
# Checking Arguments------------------------------------------------------
check_type(data_folds, type = "int", FALSE)
check_type(data_val_size, type = "double", FALSE)
check_type(balance_class_weights, type = "bool", FALSE)
check_type(balance_sequence_length, type = "bool", FALSE)
check_type(use_sc, type = "bool", FALSE)
check_type(sc_method, type = "string", FALSE)
check_type(sc_min_k, type = "int", FALSE)
check_type(sc_max_k, type = "int", FALSE)
check_type(use_pl, type = "bool", FALSE)
check_type(pl_max_steps, type = "int", FALSE)
check_type(pl_max, type = "double", FALSE)
check_type(pl_anchor, type = "double", FALSE)
check_type(pl_min, type = "double", FALSE)
check_type(sustain_track, type = "bool", FALSE)
check_type(sustain_iso_code, type = "string", TRUE)
check_type(sustain_region, type = "string", TRUE)
check_type(sustain_interval, type = "int", FALSE)
check_type(epochs, type = "int", FALSE)
check_type(batch_size, type = "int", FALSE)
check_type(dir_checkpoint, type = "string", FALSE)
check_type(trace, type = "bool", FALSE)
check_type(n_cores, type = "int", FALSE)
check_class(data_embeddings, c("EmbeddedText", "LargeDataSetForTextEmbeddings"), FALSE)
self$check_embedding_model(data_embeddings, require_compressed = FALSE)
check_class(data_targets, c("factor"), FALSE)
data_targets <- private$check_and_adjust_target_levels(data_targets)
if (pl_anchor < pl_min) {
stop("pl_anchor must be at least pl_min.")
}
if (pl_anchor > pl_max) {
stop("pl_anchor must be lower or equal to pl_max.")
}
if (data_folds < 2) {
stop("data_folds must be at least 2.")
}
# Saving training configuration-------------------------------------------
self$last_training$config$balance_class_weights <- balance_class_weights
self$last_training$config$balance_sequence_length <- balance_sequence_length
self$last_training$config$data_val_size <- data_val_size
self$last_training$config$use_sc <- use_sc
self$last_training$config$sc_method <- sc_method
self$last_training$config$sc_min_k <- sc_min_k
self$last_training$config$sc_max_k <- sc_max_k
self$last_training$config$use_pl <- use_pl
self$last_training$config$pl_max_steps <- pl_max_steps
self$last_training$config$pl_max <- pl_max
self$last_training$config$pl_anchor <- pl_anchor
self$last_training$config$pl_min <- pl_min
self$last_training$config$sustain_track <- sustain_track
self$last_training$config$sustain_iso_code <- sustain_iso_code
self$last_training$config$sustain_region <- sustain_region
self$last_training$config$sustain_interval <- sustain_interval
self$last_training$config$epochs <- epochs
self$last_training$config$batch_size <- batch_size
self$last_training$config$dir_checkpoint <- dir_checkpoint
self$last_training$config$trace <- trace
self$last_training$config$ml_trace <- ml_trace
self$last_training$config$n_cores<-n_cores
private$log_config$log_dir <- log_dir
private$log_config$log_state_file <- paste0(private$log_config$log_dir, "/aifeducation_state.log")
private$log_config$log_write_interval <- log_write_interval
# Start-------------------------------------------------------------------
if (self$last_training$config$trace == TRUE) {
message(paste(
date(),
"Start"
))
}
# Set up data
if ("EmbeddedText" %in% class(data_embeddings)) {
data <- data_embeddings$convert_to_LargeDataSetForTextEmbeddings()
} else {
data <- data_embeddings
}
# Create DataManager------------------------------------------------------
if (self$model_config$use_fe == TRUE) {
compressed_embeddings <- self$feature_extractor$extract_features_large(
data_embeddings = data,
as.integer(self$last_training$config$batch_size),
trace = self$last_training$config$trace
)
data_manager <- DataManagerClassifier$new(
data_embeddings = compressed_embeddings,
data_targets = data_targets,
folds = data_folds,
val_size = self$last_training$config$data_val_size,
class_levels = self$model_config$target_levels,
one_hot_encoding = self$model_config$require_one_hot,
add_matrix_map = (self$model_config$require_matrix_map == TRUE | self$last_training$config$use_sc == TRUE),
sc_method = sc_method,
sc_min_k = sc_min_k,
sc_max_k = sc_max_k,
trace = trace,
n_cores=self$last_training$config$n_cores
)
} else {
data_manager <- DataManagerClassifier$new(
data_embeddings = data,
data_targets = data_targets,
folds = data_folds,
val_size = self$last_training$config$data_val_size,
class_levels = self$model_config$target_levels,
one_hot_encoding = self$model_config$require_one_hot,
add_matrix_map = (self$model_config$require_matrix_map == TRUE | self$last_training$config$use_sc == TRUE),
sc_method = sc_method,
sc_min_k = sc_min_k,
sc_max_k = sc_max_k,
trace = trace,
n_cores=self$last_training$config$n_cores
)
}
# Save Data Statistics
self$last_training$data <- data_manager$get_statistics()
# Save the number of folds
self$last_training$config$n_folds <- data_manager$get_n_folds()
# Init Training------------------------------------------------------------
private$init_train()
# config datasets
datasets$disable_progress_bars()
# datasets$disable_caching()
# Start Sustainability Tracking-------------------------------------------
if (sustain_track == TRUE) {
if (is.null(sustain_iso_code) == TRUE) {
stop("Sustainability tracking is activated but iso code for the
country is missing. Add iso code or deactivate tracking.")
}
tmp_code_carbon<-reticulate::import("codecarbon")
codecarbon_version=as.character(tmp_code_carbon["__version__"])
if(utils::compareVersion(codecarbon_version,"2.8.0")>=0){
path_look_file=codecarbon$lock$LOCKFILE
if(file.exists(path_look_file)){
unlink(path_look_file)
}
}
sustainability_tracker <- codecarbon$OfflineEmissionsTracker(
country_iso_code = sustain_iso_code,
region = sustain_region,
tracking_mode = "machine",
log_level = "warning",
measure_power_secs = sustain_interval,
save_to_file = FALSE,
save_to_api = FALSE
)
sustainability_tracker$start()
}
# Start Training----------------------------------------------------------
# Load Custom Model Scripts
private$load_reload_python_scripts()
# Start Loop inclusive final training
for (iter in 1:(self$last_training$config$n_folds + 1)) {
base::gc(verbose = FALSE, full = TRUE)
if (self$last_training$config$use_pl == FALSE) {
private$train_standard(
iteration = iter,
data_manager = data_manager,
inc_synthetic = self$last_training$config$use_sc
)
} else if (self$last_training$config$use_pl == TRUE) {
private$train_with_pseudo_labels(
init_train = TRUE,
iteration = iter,
data_manager = data_manager,
inc_synthetic = self$last_training$config$use_sc
)
}
# Calculate measures on categorical level
private$calculate_measures_on_categorical_level(
data_manager = data_manager,
iteration = iter
)
gc()
}
# Finalize Training
private$finalize_train()
# Stop sustainability tracking if requested
if (sustain_track == TRUE) {
sustainability_tracker$stop()
private$sustainability <- summarize_tracked_sustainability(sustainability_tracker)
} else {
private$sustainability <- list(
sustainability_tracked = FALSE,
date = NA,
sustainability_data = list(
duration_sec = NA,
co2eq_kg = NA,
cpu_energy_kwh = NA,
gpu_energy_kwh = NA,
ram_energy_kwh = NA,
total_energy_kwh = NA
)
)
}
if (self$last_training$config$trace == TRUE) {
message(paste(
date(),
"Training Complete"
))
}
},
#-------------------------------------------------------------------------
#' @description Method for predicting new data with a trained neural net.
#' @param newdata Object of class [TextEmbeddingModel] or [LargeDataSetForTextEmbeddings] for which predictions
#' should be made. In addition, this method allows to use objects of class `array` and
#' `datasets.arrow_dataset.Dataset`. However, these should be used only by developers.
#' @param ml_trace `int` `ml_trace=0` does not print any information on the process from the machine learning
#' framework.
#' @param batch_size `int` Size of batches.
#' @return Returns a `data.frame` containing the predictions and the probabilities of the different labels for each
#' case.
predict = function(newdata,
batch_size = 32,
ml_trace = 1) {
# Check arguments
check_type(batch_size, "int", FALSE)
check_type(ml_trace, "int", FALSE)
# Check if the embeddings must be compressed before passing to the model
requires_compression <- self$requires_compression(newdata)
# Check input for compatible text embedding models and feature extractors
if ("EmbeddedText" %in% class(newdata) |
"LargeDataSetForTextEmbeddings" %in% class(newdata)) {
self$check_embedding_model(text_embeddings = newdata, require_compressed = FALSE)
} else {
private$check_embeddings_object_type(newdata, strict = FALSE)
if (requires_compression == TRUE) {
stop("Objects of class datasets.arrow_dataset.Dataset must be provided in
compressed form.")
}
}
# Apply feature extractor if it is part of the model
if (requires_compression == TRUE) {
if ("EmbeddedText" %in% class(newdata)) {
newdata <- newdata$convert_to_LargeDataSetForTextEmbeddings()
} else {
newdata <- newdata
}
# Returns a data set
newdata <- self$feature_extractor$extract_features_large(
data_embeddings = newdata,
batch_size = as.integer(batch_size)
)
}
# Load Custom Model Scripts
private$load_reload_python_scripts()
# Check number of cases in the data
single_prediction <- private$check_single_prediction(newdata)
# Get current row names/name of the cases
current_row_names <- private$get_rownames_from_embeddings(newdata)
# If at least two cases are part of the data set---------------------------
if (single_prediction == FALSE) {
# Returns a data set object
prediction_data <- private$prepare_embeddings_as_dataset(newdata)
# Tensorflow----------------------------------------------------------
if (private$ml_framework == "tensorflow") {
# Prepare data set for tensorflow
prediction_data <- prediction_data$rename_column("input", "input_embeddings")
if ("labels" %in% prediction_data$column_names) {
prediction_data <- prediction_data$remove_columns("labels")
}
prediction_data$set_format("tf")
tf_dataset_predict <- prediction_data$to_tf_dataset(
columns = c("input_embeddings"),
batch_size = as.integer(batch_size),
shuffle = FALSE
)
if (length(self$model_config$target_levels) > 2) {
# Multi Class
predictions_prob <- self$model$predict(
x = tf_dataset_predict,
verbose = as.integer(ml_trace)
)
predictions <- max.col(predictions_prob) - 1
} else {
predictions_prob <- self$model$predict(
x = tf_dataset_predict,
verbose = as.integer(ml_trace)
)
# Add Column for the second characteristic
predictions <- vector(length = length(predictions_prob))
predictions_binary_prob <- matrix(
ncol = 2,
nrow = length(predictions_prob)
)
for (i in 1:length(predictions_prob)) {
if (predictions_prob[i] >= 0.5) {
predictions_binary_prob[i, 1] <- 1 - predictions_prob[i]
predictions_binary_prob[i, 2] <- predictions_prob[i]
predictions[i] <- 1
} else {
predictions_binary_prob[i, 1] <- 1 - predictions_prob[i]
predictions_binary_prob[i, 2] <- predictions_prob[i]
predictions[i] <- 0
}
}
predictions_prob <- predictions_binary_prob
}
# Pytorch----------------------------------------------------------
} else if (private$ml_framework == "pytorch") {
prediction_data$set_format("torch")
predictions_prob <- py$TeClassifierBatchPredict(
model = self$model,
dataset = prediction_data,
batch_size = as.integer(batch_size)
)
predictions_prob <- private$detach_tensors(predictions_prob)
predictions <- max.col(predictions_prob) - 1
}
# In the case the data has one single row-------------------------------
} else {
prediction_data <- private$prepare_embeddings_as_np_array(newdata)
# Tensorflow------------------------------------------------------------
if (private$ml_framework == "tensorflow") {
if (length(self$model_config$target_levels) > 2) {
# Multy Class
predictions_prob <- self$model$predict(
x = prediction_data,
batch_size = as.integer(batch_size),
verbose = as.integer(ml_trace)
)
predictions <- max.col(predictions_prob) - 1
} else {
predictions_prob <- self$model$predict(
x = prediction_data,
batch_size = as.integer(batch_size),
verbose = as.integer(ml_trace)
)
# Add Column for the second characteristic
predictions <- vector(length = length(predictions_prob))
predictions_binary_prob <- matrix(
ncol = 2,
nrow = length(predictions_prob)
)
for (i in 1:length(predictions_prob)) {
if (predictions_prob[i] >= 0.5) {
predictions_binary_prob[i, 1] <- 1 - predictions_prob[i]
predictions_binary_prob[i, 2] <- predictions_prob[i]
predictions[i] <- 1
} else {
predictions_binary_prob[i, 1] <- 1 - predictions_prob[i]
predictions_binary_prob[i, 2] <- predictions_prob[i]
predictions[i] <- 0
}
}
predictions_prob <- predictions_binary_prob
}
} else if (private$ml_framework == "pytorch") {
if (torch$cuda$is_available()) {
device <- "cuda"
dtype <- torch$double
self$model$to(device, dtype = dtype)
self$model$eval()
input <- torch$from_numpy(prediction_data)
predictions_prob <- self$model(input$to(device, dtype = dtype),
predication_mode = TRUE
)
predictions_prob <- private$detach_tensors(predictions_prob)
} else {
device <- "cpu"
dtype <- torch$float
self$model$to(device, dtype = dtype)
self$model$eval()
input <- torch$from_numpy(prediction_data)
predictions_prob <- self$model(input$to(device, dtype = dtype),
predication_mode = TRUE
)
predictions_prob <- private$detach_tensors(predictions_prob)
}
predictions <- max.col(predictions_prob) - 1
}
}
# Transforming predictions to target levels------------------------------
predictions <- as.character(as.vector(predictions))
for (i in 0:(length(self$model_config$target_levels) - 1)) {
predictions <- replace(
x = predictions,
predictions == as.character(i),
values = self$model_config$target_levels[i + 1]
)
}
# Transforming to a factor
predictions <- factor(predictions, levels = self$model_config$target_levels)
colnames(predictions_prob) <- self$model_config$target_levels
predictions_prob <- as.data.frame(predictions_prob)
predictions_prob$expected_category <- predictions
rownames(predictions_prob) <- current_row_names
return(predictions_prob)
},
# Check Embedding Model compatibility of the text embedding
#' @description Method for checking if the provided text embeddings are created with the same [TextEmbeddingModel]
#' as the classifier.
#' @param text_embeddings Object of class [EmbeddedText] or [LargeDataSetForTextEmbeddings].
#' @param require_compressed `TRUE` if a compressed version of the embeddings are necessary. Compressed embeddings
#' are created by an object of class [TEFeatureExtractor].
#' @return `TRUE` if the underlying [TextEmbeddingModel] is the same. `FALSE` if the models differ.
check_embedding_model = function(text_embeddings, require_compressed = FALSE) {
# Check Embeddings Object Type
private$check_embeddings_object_type(text_embeddings, strict = TRUE)
# Check original text embedding model.
embedding_model_config <- text_embeddings$get_model_info()
check <- c("model_name")
if (!is.null_or_na(embedding_model_config[[check]]) &
!is.null_or_na(private$text_embedding_model$model[[check]])) {
if (embedding_model_config[[check]] != private$text_embedding_model$model[[check]]) {
stop("The TextEmbeddingModel that generated the data_embeddings is not
the same as the TextEmbeddingModel when generating the classifier.")
}
}
# Check if a compressed version is necessary and if true if the feature extractor is
# compatible
feature_extractor_info <- text_embeddings$get_feature_extractor_info()
if (require_compressed == TRUE) {
if (!is.null(feature_extractor_info$model_name) & self$model_config$use_fe == FALSE) {
stop("Compressed embeddings provided but the classifier does not support
compressed embeddings.")
} else if (!is.null(feature_extractor_info$model_name) & self$model_config$use_fe == TRUE) {
if (private$text_embedding_model$feature_extractor$model_name != feature_extractor_info$model_name) {
stop("The feature extractor of the compressed embeddings is not the same as
the feature extractor during the creation of the classifier.")
}
}
} else {
if (!is.null(feature_extractor_info$model_name)) {
stop("Compressed embeddings are provided but uncompressed are needed.")
}
}
},
#--------------------------------------------------------------------------
#' @description Method for checking an object of class [TEFeatureExtractor].
#' @param feature_extractor Object of class [TEFeatureExtractor]
#' @return This method does nothing returns. It raises an error if
#'
#' * the object is `NULL`
#' * the object does not rely on the same machine learning framework as the classifier
#' * the object is not trained.
#'
check_feature_extractor_object_type = function(feature_extractor) {
if (!is.null(feature_extractor)) {
if ("TEFeatureExtractor" %in% class(feature_extractor) == FALSE) {
stop("Object passed to feature_extractor must be an object of class
TEFeatureExtractor or NULL.")
} else {
if (feature_extractor$get_ml_framework() != self$get_ml_framework()) {
stop("The machine learning framework of the feature extractior and
classifier do not match. Please provide a feature extractor
with the same machine learning framework as the classifier.")
} else {
if (feature_extractor$is_trained() == FALSE) {
stop("The supplied feature extractor is not trained. Please
provide trained feature extractor and try again.")
}
}
}
}
},
#--------------------------------------------------------------------------
#' @description Method for checking if provided text embeddings must be compressed via a [TEFeatureExtractor] before
#' processing.
#' @param text_embeddings Object of class [EmbeddedText], [LargeDataSetForTextEmbeddings], `array` or
#' `datasets.arrow_dataset.Dataset`.
#' @return Return `TRUE` if a compression is necessary and `FALSE` if not.
requires_compression = function(text_embeddings) {
# Check arguments
check_class(text_embeddings, c(
"EmbeddedText", "LargeDataSetForTextEmbeddings",
"array", "datasets.arrow_dataset.Dataset"
), FALSE)
if ("EmbeddedText" %in% class(text_embeddings) |
"LargeDataSetForTextEmbeddings" %in% class(text_embeddings)) {
if (self$model_config$use_fe == TRUE & text_embeddings$is_compressed() == FALSE) {
return(TRUE)
} else {
return(FALSE)
}
} else if ("array" %in% class(text_embeddings)) {
if (dim(text_embeddings)[3] > self$model_config$features) {
return(TRUE)
} else {
return(FALSE)
}
} else if ("datasets.arrow_dataset.Dataset" %in% class(text_embeddings)) {
text_embeddings$set_format("np")
tensors <- text_embeddings["input"][1, , , drop = FALSE]
if (dim(tensors)[3] > self$model_config$features) {
return(TRUE)
} else {
return(FALSE)
}
}
},
#-------------------------------------------------------------------------
#' @description Method for saving a model.
#' @param dir_path `string` Path of the directory where the model should be saved.
#' @param folder_name `string` Name of the folder that should be created within the directory.
#' @return Function does not return a value. It saves the model to disk.
save = function(dir_path, folder_name) {
# Save the classifier
super$save(
dir_path = dir_path,
folder_name = folder_name
)
# Save the feature extractor if necessary
if (self$model_config$use_fe == TRUE) {
save_to_disk(
object = self$feature_extractor,
dir_path = paste0(dir_path, "/", folder_name),
folder_name = "feature_extractor"
)
}
},
#--------------------------------------------------------------------------
#' @description loads an object from disk and updates the object to the current version of the package.
#' @param dir_path Path where the object set is stored.
#' @return Method does not return anything. It loads an object from disk.
load_from_disk = function(dir_path) {
# Call the core method which loads data common for all models
private$load_config_and_docs(dir_path = dir_path)
# Add classifier specific data
# Load R file
config_file <- load_R_config_state(dir_path)
# Set Reliability measures
self$reliability <- list(
test_metric = config_file$public$reliability$test_metric,
test_metric_mean = config_file$public$reliability$test_metric_mean,
raw_iota_objects = list(
iota_objects_end = config_file$public$reliability$raw_iota_objects$iota_objects_end,
iota_objects_end_free = config_file$public$reliability$raw_iota_objects$iota_objects_end_free
),
iota_object_end = config_file$public$reliability$iota_object_end,
iota_object_end_free = config_file$public$reliability$iota_object_end_free,
standard_measures_end = config_file$public$reliability$standard_measures_end,
standard_measures_mean = config_file$public$reliability$standard_measures_mean
)
# Set FeatureExtractor
if (self$model_config$use_fe == TRUE) {
feature_extractor <- TEFeatureExtractor$new()
feature_extractor$load_from_disk(paste0(dir_path, "/feature_extractor"))
self$feature_extractor <- feature_extractor
}
# Create and load AI model
private$create_reset_model()
self$load(dir_path = dir_path)
}
),
private = list(
#--------------------------------------------------------------------------
load_reload_python_scripts = function() {
if (private$ml_framework == "tensorflow") {
reticulate::py_run_file(system.file("python/keras_te_classifier.py",
package = "aifeducation"
))
reticulate::py_run_file(system.file("python/keras_callbacks.py",
package = "aifeducation"
))
} else if (private$ml_framework == "pytorch") {
reticulate::py_run_file(system.file("python/pytorch_te_classifier.py",
package = "aifeducation"
))
reticulate::py_run_file(system.file("python/pytorch_autoencoder.py",
package = "aifeducation"
))
reticulate::py_run_file(system.file("python/py_log.py",
package = "aifeducation"
))
}
},
#--------------------------------------------------------------------------
create_reset_model = function() {
private$check_config_for_TRUE()
if (private$ml_framework == "tensorflow") {
# Load custom layers
private$load_reload_python_scripts()
# Defining basic keras model
layer_list <- NULL
# Adding Input Layer
# if(n_rec>0 | self$model_config$repeat_encoder>0){
model_input <- keras$layers$Input(
shape = list(as.integer(self$model_config$times), as.integer(self$model_config$features)),
name = "input_embeddings"
)
# } #else {
# model_input<-keras$layers$Input(shape=as.integer(self$model_config$times*self$model_config$features),
# name="input_embeddings")
# }
layer_list[1] <- list(model_input)
# Adding a Mask-Layer
if (self$model_config$rec_layers > 0 | self$model_config$repeat_encoder > 0) {
masking_layer <- keras$layers$Masking(
mask_value = 0.0,
name = "masking_layer",
input_shape = c(self$model_config$times, self$model_config$features),
trainable = FALSE
)(layer_list[[length(layer_list)]])
layer_list[length(layer_list) + 1] <- list(masking_layer)
if (self$model_config$add_pos_embedding == TRUE) {
positional_embedding <- py$AddPositionalEmbedding(
sequence_length = as.integer(self$model_config$times),
name = "add_positional_embedding"
)(layer_list[[length(layer_list)]])
layer_list[length(layer_list) + 1] <- list(positional_embedding)
}
norm_layer <- keras$layers$LayerNormalization(
name = "normalizaion_layer"
)(layer_list[[length(layer_list)]])
layer_list[length(layer_list) + 1] <- list(norm_layer)
} # else {
# norm_layer<-keras$layers$BatchNormalization(
# name = "normalizaion_layer")(layer_list[[length(layer_list)]])
# layer_list[length(layer_list)+1]<-list(norm_layer)
# }
if (self$model_config$repeat_encoder > 0) {
for (r in 1:self$model_config$repeat_encoder) {
if (self$model_config$attention_type == "multihead") {
layer_list[length(layer_list) + 1] <- list(
py$TransformerEncoder(
embed_dim = as.integer(self$model_config$features),
dense_dim = as.integer(self$model_config$intermediate_size),
num_heads = as.integer(self$model_config$self_attention_heads),
dropout_rate = self$model_config$encoder_dropout,
name = paste0("encoder_", r)
)(layer_list[[length(layer_list)]])
)
} else if (self$model_config$attention_type == "fourier") {
layer_list[length(layer_list) + 1] <- list(
py$FourierEncoder(
dense_dim = as.integer(self$model_config$intermediate_size),
dropout_rate = self$model_config$encoder_dropout,
name = paste0("encoder_", r)
)(layer_list[[length(layer_list)]])
)
}
}
}
# Adding rec layer
if (self$model_config$rec_layers > 0) {
if (self$model_config$rec_bidirectional == TRUE) {
for (i in 1:self$model_config$rec_layers) {
if (self$model_config$rec_type == "gru") {
layer_list[length(layer_list) + 1] <- list(
keras$layers$Bidirectional(
layer = keras$layers$GRU(
units = as.integer(self$model_config$rec_size),
input_shape = list(self$model_config$times, self$model_config$features),
return_sequences = TRUE,
dropout = 0,
recurrent_dropout = self$model_config$recurrent_dropout,
activation = "tanh",
name = paste0("gru_", i)
),
name = paste0("bidirectional_", i)
)(layer_list[[length(layer_list)]])
)
if (i != self$model_config$rec_layers) {
layer_list[length(layer_list) + 1] <- list(
keras$layers$Dropout(
rate = self$model_config$rec_dropout,
name = paste0("gru_dropout_", i)
)(layer_list[[length(layer_list)]])
)
}
} else if (self$model_config$rec_type == "lstm") {
layer_list[length(layer_list) + 1] <- list(
keras$layers$Bidirectional(
layer = keras$layers$LSTM(
units = as.integer(self$model_config$rec_size),
input_shape = list(self$model_config$times, self$model_config$features),
return_sequences = TRUE,
dropout = 0,
recurrent_dropout = self$model_config$recurrent_dropout,
activation = "tanh",
name = paste0("lstm", i)
),
name = paste0("bidirectional_", i)
)(layer_list[[length(layer_list)]])
)
if (i != self$model_config$rec_layers) {
layer_list[length(layer_list) + 1] <- list(
keras$layers$Dropout(
rate = self$model_config$rec_dropout,
name = paste0("lstm_dropout_", i)
)(layer_list[[length(layer_list)]])
)
}
}
}
} else {
for (i in 1:self$model_config$rec_layers) {
if (self$model_config$rec_type == "gru") {
layer_list[length(layer_list) + 1] <- list(
layer = keras$layers$GRU(
units = as.integer(self$model_config$rec_size),
input_shape = list(self$model_config$times, self$model_config$features),
return_sequences = TRUE,
dropout = 0,
recurrent_dropout = self$model_config$recurrent_dropout,
activation = "tanh",
name = paste0("uni_directional_gru_", i)
)(layer_list[[length(layer_list)]])
)
if (i != self$model_config$rec_layers) {
layer_list[length(layer_list) + 1] <- list(
keras$layers$Dropout(
rate = self$model_config$rec_dropout,
name = paste0("gru_dropout_", i)
)(layer_list[[length(layer_list)]])
)
}
} else if (self$model_config$rec_type == "lstm") {
layer_list[length(layer_list) + 1] <- list(
layer = keras$layers$LSTM(
units = as.integer(self$model_config$rec_size),
input_shape = list(self$model_config$times, self$model_config$features),
return_sequences = TRUE,
dropout = 0,
recurrent_dropout = self$model_config$recurrent_dropout,
activation = "tanh",
name = paste0("unidirectional_lstm", i)
)(layer_list[[length(layer_list)]])
)
if (i != self$model_config$rec_layers) {
layer_list[length(layer_list) + 1] <- list(
keras$layers$Dropout(
rate = self$model_config$rec_dropout,
name = paste0("lstm_dropout_", i)
)(layer_list[[length(layer_list)]])
)
}
}
}
}
}
layer_list[length(layer_list) + 1] <- list(
keras$layers$GlobalAveragePooling1D(
name = "global_average_pooling"
)(layer_list[[length(layer_list)]])
)
# Adding standard layer
if (self$model_config$dense_layers > 0) {
for (i in 1:self$model_config$dense_layers) {
layer_list[length(layer_list) + 1] <- list(
keras$layers$Dense(
units = as.integer(self$model_config$dense_size),
activation = "gelu",
name = paste0("dense_", i)
)(layer_list[[length(layer_list)]])
)
if (i != self$model_config$dense_layers) {
# Add Dropout_Layer
layer_list[length(layer_list) + 1] <- list(
keras$layers$Dropout(
rate = self$model_config$dense_dropout,
name = paste0("dense_dropout_", i)
)(layer_list[[length(layer_list)]])
)
}
}
}
# Adding final Layer
if (length(self$model_config$target_levels) > 2) {
# Multi Class
layer_list[length(layer_list) + 1] <- list(
keras$layers$Dense(
units = as.integer(length(self$model_config$target_levels)),
activation = self$model_config$act_fct_last,
name = "output_categories"
)(layer_list[[length(layer_list)]])
)
} else {
# Binary Class
layer_list[length(layer_list) + 1] <- list(
keras$layers$Dense(
units = as.integer(1),
activation = self$model_config$act_fct_last,
name = "output_categories"
)(layer_list[[length(layer_list)]])
)
}
# Creating Model
model <- keras$Model(
inputs = model_input,
outputs = layer_list[length(layer_list)],
name = self$model_config$name
)
self$model <- model
} else {
#--------------------------------------------------------------------------
# Load Custom Pytorch Objects and Functions
private$load_reload_python_scripts()
self$model <- py$TextEmbeddingClassifier_PT(
features = as.integer(self$model_config$features),
times = as.integer(self$model_config$times),
dense_size=as.integer(self$model_config$dense_size),
dense_layers=as.integer(self$model_config$dense_layers),
rec_size=as.integer(self$model_config$rec_size),
rec_layers=as.integer(self$model_config$rec_layers),
rec_type = self$model_config$rec_type,
rec_bidirectional = self$model_config$rec_bidirectional,
intermediate_size = as.integer(self$model_config$intermediate_size),
attention_type = self$model_config$attention_type,
repeat_encoder = as.integer(self$model_config$repeat_encoder),
dense_dropout = self$model_config$dense_dropout,
rec_dropout = self$model_config$rec_dropout,
encoder_dropout = self$model_config$encoder_dropout,
add_pos_embedding = self$model_config$add_pos_embedding,
self_attention_heads = as.integer(self$model_config$self_attention_heads),
target_levels = self$model_config$target_levels
)
}
},
#--------------------------------------------------------------------------
init_train = function() {
# Setting a new ID for the classifier
private$model_info$model_name <- paste0(
private$model_info$model_name_root,
"_id_",
generate_id(16)
)
# Initializing Objects for Saving Performance
metric_names <- get_coder_metrics(
true_values = NULL,
predicted_values = NULL,
return_names_only = TRUE
)
self$reliability$test_metric <- matrix(
nrow = self$last_training$config$n_folds,
ncol = length(metric_names),
dimnames = list(
iterations = NULL,
metrics = metric_names
)
)
self$reliability$test_metric_mean <- NULL
self$reliability$iota_objects_end <- NULL
self$reliability$iota_objects_end_free <- NULL
self$reliability$iota_object_end <- NULL
self$reliability$iota_object_end_free <- NULL
standard_measures_mean_table <- matrix(
nrow = length(self$model_config$target_levels),
ncol = 3,
data = 0
)
colnames(standard_measures_mean_table) <- c("precision", "recall", "f1")
rownames(standard_measures_mean_table) <- self$model_config$target_levels
self$reliability$standard_measures_mean <- standard_measures_mean_table
# Save start time of training
self$last_training$start_time <- Sys.time()
},
#--------------------------------------------------------------------------
calculate_test_metric = function(test_data, iteration, type) {
test_predictions <- self$predict(
newdata = test_data,
ml_trace = self$last_training$config$ml_trace,
batch_size = self$last_training$config$batch_size
)
test_pred_cat <- test_predictions$expected_category
names(test_pred_cat) <- rownames(test_predictions)
test_pred_cat <- test_pred_cat[test_data["id"]]
test_data$set_format("np")
true_values <- factor(
x = test_data["labels"],
levels = 0:(length(self$model_config$target_levels) - 1),
labels = self$model_config$target_levels
)
names(true_values) <- test_data["id"]
test_res <- get_coder_metrics(
true_values = true_values,
predicted_values = test_pred_cat
)
# Save results
self$reliability$test_metric[iteration, ] <- test_res
},
#--------------------------------------------------------------------------
calculate_measures_on_categorical_level = function(data_manager, iteration) {
# Get test data
data_manager$set_state(
iteration = iteration,
step = NULL
)
test_data <- data_manager$get_test_dataset()
if (!is.null(test_data) == TRUE) {
# Predict labels
test_predictions <- self$predict(
newdata = test_data,
ml_trace = self$last_training$config$ml_trace,
batch_size = self$last_training$config$batch_size
)
test_pred_cat <- test_predictions$expected_category
names(test_pred_cat) <- rownames(test_predictions)
test_pred_cat <- test_pred_cat[test_data["id"]]
# Calculate standard measures
test_data$set_format("np")
true_values <- factor(
x = test_data["labels"],
levels = 0:(length(self$model_config$target_levels) - 1),
labels = self$model_config$target_levels
)
names(true_values) <- test_data["id"]
self$reliability$standard_measures_end[iteration] <- list(
calc_standard_classification_measures(
true_values = true_values,
predicted_values = test_pred_cat
)
)
# Calculate iota objects
self$reliability$iota_objects_end[iteration] <- list(iotarelr::check_new_rater(
true_values = factor(
x = test_data["labels"],
levels = 0:(length(self$model_config$target_levels) - 1),
labels = self$model_config$target_levels
),
assigned_values = test_pred_cat,
free_aem = FALSE
))
self$reliability$iota_objects_end_free[iteration] <- list(iotarelr::check_new_rater(
true_values = factor(
x = test_data["labels"],
levels = 0:(length(self$model_config$target_levels) - 1),
labels = self$model_config$target_levels
),
assigned_values = test_pred_cat,
free_aem = TRUE
))
} else if (iteration <= data_manager$get_n_folds()) {
warning("Unable to calculate test scores. There is no test data.")
}
},
#--------------------------------------------------------------------------
finalize_train = function() {
# Save Final Information
self$last_training$date <- date()
# Finalize measures from content analysis
test_metric_mean <- vector(length = ncol(self$reliability$test_metric))
test_metric_mean[] <- 0
names(test_metric_mean) <- colnames(self$reliability$test_metric)
n_mean <- vector(length = ncol(self$reliability$test_metric))
n_mean[] <- self$last_training$config$n_folds
for (i in 1:self$last_training$config$n_folds) {
for (j in 1:ncol(self$reliability$test_metric)) {
if (is.na(self$reliability$test_metric[i, j]) == FALSE) {
test_metric_mean[j] <- test_metric_mean[j] + self$reliability$test_metric[i, j]
} else {
n_mean[j] <- n_mean[j] - 1
}
}
}
test_metric_mean <- test_metric_mean / n_mean
test_metric_mean[is.nan(test_metric_mean)] <- NA
self$reliability$test_metric_mean <- test_metric_mean
self$last_training$learning_time <- as.numeric(
difftime(Sys.time(),
self$last_training$start_time,
units = "mins"
)
)
# Finalize iota objects
if (is.null(self$reliability$iota_objects_end) == FALSE) {
self$reliability$iota_object_end <- create_iota2_mean_object(
iota2_list = self$reliability$iota_objects_end,
original_cat_labels = self$model_config$target_levels,
free_aem = FALSE,
call = "aifeducation::te_classifier_neuralnet"
)
} else {
self$reliability$iota_objects_end <- NULL
}
if (is.null(self$reliability$iota_objects_end_free) == FALSE) {
self$reliability$iota_object_end_free <- create_iota2_mean_object(
iota2_list = self$reliability$iota_objects_end_free,
original_cat_labels = self$model_config$target_levels,
free_aem = TRUE,
call = "aifeducation::te_classifier_neuralnet"
)
} else {
self$reliability$iota_objects_end_free <- NULL
}
# Finalize standard measures
standard_measures <- self$reliability$standard_measures_mean
for (i in 1:self$last_training$config$n_folds) {
for (tmp_cat in self$model_config$target_levels) {
standard_measures[tmp_cat, "precision"] <- standard_measures[tmp_cat, "precision"] +
self$reliability$standard_measures_end[[i]][tmp_cat, "precision"]
standard_measures[tmp_cat, "recall"] <- standard_measures[tmp_cat, "recall"] +
self$reliability$standard_measures_end[[i]][tmp_cat, "recall"]
standard_measures[tmp_cat, "f1"] <- standard_measures[tmp_cat, "f1"] +
self$reliability$standard_measures_end[[i]][tmp_cat, "f1"]
}
}
self$reliability$standard_measures_mean <- standard_measures / self$last_training$config$n_folds
},
#--------------------------------------------------------------------------
train_standard = function(iteration = NULL,
data_manager = NULL,
inc_synthetic = FALSE) {
# Print status message to console
if (self$last_training$config$trace == TRUE) {
if (iteration <= self$last_training$config$n_folds) {
message(paste(
date(),
"|", "Iteration", iteration, "from", self$last_training$config$n_folds
))
} else {
message(paste(
date(),
"|", "Final training"
))
}
}
# Set the state of the DataManager
data_manager$set_state(
iteration = iteration,
step = NULL
)
# Generate syntetic cases if requested
if (inc_synthetic == TRUE) {
data_manager$create_synthetic(
trace = self$last_training$config$trace,
inc_pseudo_data = FALSE
)
}
# Get the different DataSets
train_data <- data_manager$get_dataset(
inc_labeled = TRUE,
inc_synthetic = inc_synthetic,
inc_pseudo_data = FALSE,
inc_unlabeled = FALSE
)
val_data <- data_manager$get_val_dataset()
if (iteration != "final") {
test_data <- data_manager$get_test_dataset()
} else {
test_data <- NULL
}
# Print status to console
if (self$last_training$config$trace == TRUE) {
if (iteration <= self$last_training$config$n_folds) {
message(paste(
date(),
"|", "Iteration", iteration, "from", self$last_training$config$n_folds,
"|", "Training"
))
} else {
message(paste(
date(),
"|", "Final training",
"|", "Training"
))
}
}
# Start training
train_history <- private$basic_train(
train_data = train_data,
val_data = val_data,
test_data = test_data,
reset_model = TRUE,
use_callback = TRUE,
log_dir = private$log_config$log_dir,
log_write_interval = private$log_config$log_write_interval,
log_top_value = iteration,
log_top_total = self$last_training$config$n_folds + 1,
log_top_message = "Overall"
)
# Save history
self$last_training$history[iteration] <- list(train_history)
# Calculate test metric
if (!is.null(test_data) == TRUE) {
private$calculate_test_metric(
test_data = test_data,
iteration = iteration,
type = (as.numeric(inc_synthetic)) + 1
)
}
},
#--------------------------------------------------------------------------
train_with_pseudo_labels = function(init_train = TRUE,
iteration = NULL,
data_manager = NULL,
inc_synthetic = FALSE) {
# If model is not trained than train for the first time
# Necessary for estimating pseudo labels
if (init_train == TRUE) {
private$train_standard(
iteration = iteration,
data_manager = data_manager,
inc_synthetic = inc_synthetic
)
}
# Get validation and test data for training loop
val_data <- data_manager$get_val_dataset()
if (iteration != "final") {
test_data <- data_manager$get_test_dataset()
} else {
test_data <- NULL
}
# Start training loop with pseudo labels
data_manager$set_state(
iteration = iteration,
step = NULL
)
# Create list for saving training histories per step
step_histories <- NULL
for (step in 1:self$last_training$config$pl_max_steps) {
# Print status message to console
if (self$last_training$config$trace == TRUE) {
if (iteration <= self$last_training$config$n_folds) {
message(paste(
date(),
"|", "Iteration", iteration, "from", self$last_training$config$n_folds,
"|", "Pseudo labeling", "step", step, "from", self$last_training$config$pl_max_steps
))
} else {
message(paste(
date(),
"|", "Final training",
"|", "Pseudo labeling", "step", step, "from", self$last_training$config$pl_max_steps
))
}
}
# Set correct state for the data_manager
data_manager$set_state(
iteration = iteration,
step = step
)
# Generate pseudo labels
pseudo_data <- private$estimate_pseudo_labels(
unlabeled_data = data_manager$get_unlabeled_data(),
val_data=val_data,
current_step = step
)
# Save pseudo labels in the data_manager
data_manager$add_replace_pseudo_data(
inputs = pseudo_data$input,
labels = pseudo_data$labels
)
# Remove old pseudo data
rm(pseudo_data)
# Generate synthetic data if requested
if (inc_synthetic == TRUE) {
data_manager$create_synthetic(
trace = self$last_training$config$trace,
inc_pseudo_data = TRUE
)
}
# Request training data
train_data <- data_manager$get_dataset(
inc_labeled = TRUE,
inc_synthetic = inc_synthetic,
inc_pseudo_data = TRUE,
inc_unlabeled = FALSE
)
# Print status to console
if (self$last_training$config$trace == TRUE) {
if (iteration <= self$last_training$config$n_folds) {
message(paste(
date(),
"|", "Iteration", iteration, "from", self$last_training$config$n_folds,
"|", "Training"
))
} else {
message(paste(
date(),
"|", "Final training",
"|", "Training"
))
}
}
# Start training
train_history <- private$basic_train(
train_data = train_data,
val_data = val_data,
test_data = test_data,
reset_model = TRUE,
use_callback = TRUE,
log_dir = private$log_config$log_state_file,
log_write_interval = private$log_config$log_write_interval,
log_top_value = iteration,
log_top_total = self$last_training$config$n_folds + 1,
log_top_message = "Overall"
)
# Save history
step_histories[step] <- list(train_history)
}
# Save the histories for the complete iteration
self$last_training$history[iteration] <- list(step_histories)
# Calculate test metric
if (!is.null(test_data) == TRUE) {
private$calculate_test_metric(
test_data = test_data,
iteration = iteration,
type = 3
)
}
},
#--------------------------------------------------------------------------
estimate_pseudo_labels = function(unlabeled_data,
val_data,
current_step) {
# Predict pseudo labels for unlabeled data
predicted_labels <- self$predict(
newdata = unlabeled_data,
ml_trace = self$last_training$config$ml_trace,
batch_size = self$last_training$config$batch_size
)
# Create Matrix for saving the results
new_categories <- matrix(
nrow = nrow(predicted_labels),
ncol = 2
)
rownames(new_categories) <- rownames(predicted_labels)
colnames(new_categories) <- c("cat", "prob")
#Correct probabilities for reliability on the validation data
predicted_labels<-private$pseudo_labels_correct_prob(
predictions=predicted_labels,
val_data=val_data
)
# Gather information for every case. That is the category with the
# highest probability and save both
for (i in 1:nrow(predicted_labels)) {
tmp_est_prob <- predicted_labels[i, 1:(ncol(predicted_labels) - 1)]
new_categories[i, 1] <- which.max(tmp_est_prob) - 1
new_categories[i, 2] <- max(tmp_est_prob)
}
new_categories <- as.data.frame(new_categories)
# Transforming the probabilities to an information index
new_categories[, 2] <- abs(
self$last_training$config$pl_anchor -
(as.numeric(new_categories[, 2]) - 1 / length(self$model_config$target_levels)) / (1 - 1 / length(self$model_config$target_levels))
)
new_categories <- as.data.frame(new_categories)
# Reducing the new categories to the desired range
condition <- (new_categories[, 2] >= self$last_training$config$pl_min &
new_categories[, 2] <= self$last_training$config$pl_max)
new_categories <- subset(new_categories, condition)
# Calculate number of cases to include
bpl_inc_ratio <- current_step / self$last_training$config$pl_max_steps
n_cases_to_include <- nrow(new_categories) * bpl_inc_ratio
# Order cases with increasing distance from maximal information
new_categories <- new_categories[order(new_categories$prob, decreasing = FALSE), ]
# Select the best cases
names_final_new_categories <- rownames(new_categories)[1:n_cases_to_include]
# Get the labels for these cases
targets_pseudo_labeled <- new_categories[names_final_new_categories, 1]
targets_pseudo_labeled <- as.numeric(targets_pseudo_labeled)
names(targets_pseudo_labeled) <- names_final_new_categories
# Transform pseudo labels to a factor
targets_pseudo_labeled <- factor(
x = targets_pseudo_labeled,
levels = 0:(length(self$model_config$target_levels) - 1),
labels = self$model_config$target_levels
)
# get the corresponding input
unlabeled_data$set_format("np")
embeddings <- unlabeled_data["input"]
rownames(embeddings) <- unlabeled_data["id"]
embeddings <- embeddings[names_final_new_categories, , ]
# Return results
pseudo_data <- list(
input = embeddings,
labels = targets_pseudo_labeled
)
return(pseudo_data)
},
#--------------------------------------------------------------------------
pseudo_labels_correct_prob=function(predictions,val_data){
#Predict on val data
val_predictions <- self$predict(
newdata = val_data,
ml_trace = self$last_training$config$ml_trace,
batch_size = self$last_training$config$batch_size
)
val_pred_cat <- val_predictions$expected_category
names(val_pred_cat) <- rownames(val_predictions)
val_pred_cat <- val_pred_cat[val_data["id"]]
#Calculate Assignment Error Matrix
val_data$set_format("np")
val_iota_object<-iotarelr::check_new_rater(
true_values = factor(
x = val_data["labels"],
levels = 0:(length(self$model_config$target_levels) - 1),
labels = self$model_config$target_levels
),
assigned_values = val_pred_cat,
free_aem = TRUE
)
#Estimate probability of each category
aem=val_iota_object$categorical_level$raw_estimates$assignment_error_matrix
class_sizes=val_iota_object$information$est_true_cat_sizes
p_cat=class_sizes%*%aem
#Estimate probability that the category is the true category
p_cat_true=class_sizes*diag(aem)/p_cat
p_cat_true=replace(p_cat_true,list=is.nan(p_cat_true),values=0)
#Correct probabilities
number_columns=ncol(predictions)
col=ncol(predictions)-1
for(i in 1:nrow(predictions)){
predictions[i,1:col]<-predictions[i,1:col]*p_cat_true/sum(predictions[i,1:col]*p_cat_true)
predictions[i,number_columns]<-self$model_config$target_levels[which.max(as.numeric(predictions[i,1:col]))]
}
return(predictions)
},
#--------------------------------------------------------------------------
basic_train = function(train_data = NULL,
val_data = NULL,
test_data = NULL,
reset_model = FALSE,
use_callback = TRUE,
log_dir = NULL,
log_write_interval = 10,
log_top_value = NULL,
log_top_total = NULL,
log_top_message = NULL) {
# Clear session to provide enough resources for computations
if (private$ml_framework == "tensorflow") {
keras$backend$clear_session()
} else if (private$ml_framework == "pytorch") {
if (torch$cuda$is_available()) {
torch$cuda$empty_cache()
}
}
# Generating class weights
if (self$last_training$config$balance_class_weights == TRUE) {
abs_freq_classes <- table(train_data["labels"])
class_weights <- as.vector(sum(abs_freq_classes) / (length(abs_freq_classes) * abs_freq_classes))
} else {
class_weights <- rep(x = 1, times = length(self$model_config$target_levels))
}
# Generating weights for sequence length
if (self$last_training$config$balance_sequence_length == TRUE) {
sequence_length <- train_data["length"]
abs_freq_length <- table(sequence_length)
sample_weight_per_sequence_length <- as.vector(sum(abs_freq_length) / (length(abs_freq_length) * abs_freq_length))
sequence_order <- names(abs_freq_length)
sample_weights <- vector(length = length(sequence_length))
for (i in 1:length(sample_weights)) {
idx <- which(sequence_length[i] == sequence_order)
sample_weights[i] <- sample_weight_per_sequence_length[idx]
}
} else {
sequence_length <- train_data["length"]
sample_weights <- rep.int(x = 1, times = length(sequence_length))
}
# Reset model if requested
if (reset_model == TRUE) {
private$create_reset_model()
}
# Set Optimizer
if (private$ml_framework == "tensorflow") {
balanced_metric <- py$BalancedAccuracy(n_classes = as.integer(length(self$model_config$target_levels)))
if (self$model_config$optimizer == "adam") {
self$model$compile(
loss = self$model_config$err_fct,
optimizer = keras$optimizers$Adam(),
metrics = c(self$model_config$metric, balanced_metric)
)
} else if (self$model_config$optimizer == "rmsprop") {
self$model$compile(
loss = self$model_config$err_fct,
optimizer = keras$optimizers$RMSprop(),
metrics = c(self$model_config$metric, balanced_metric)
)
}
} else if (private$ml_framework == "pytorch") {
loss_fct_name <- "CrossEntropyLoss"
if (self$model_config$optimizer == "adam") {
optimizer <- "adam"
} else if (self$model_config$optimizer == "rmsprop") {
optimizer <- "rmsprop"
}
}
# Check directory for checkpoints
create_dir(
dir_path = paste0(self$last_training$config$dir_checkpoint, "/checkpoints"),
trace = self$last_training$config$trace,
msg = "Creating Checkpoint Directory")
# Set target column
if (self$model_config$require_one_hot == FALSE) {
target_column <- "labels"
} else {
target_column <- "one_hot_encoding"
}
# Tensorflow - Callbacks and training
if (private$ml_framework == "tensorflow") {
if (use_callback == TRUE) {
callback <- keras$callbacks$ModelCheckpoint(
filepath = paste0(self$last_training$config$dir_checkpoint, "/checkpoints/best_weights.h5"),
monitor = paste0("val_", self$model_config$balanced_metric),
verbose = as.integer(min(self$last_training$config$ml_trace, 1)),
mode = "auto",
save_best_only = TRUE,
save_weights_only = TRUE
)
} else {
callback <- reticulate::py_none()
}
data_set_weights <- datasets$Dataset$from_dict(
reticulate::dict(list(
sample_weights = sample_weights
))
)
# inputs, targets, sample_weights
dataset_tf <- train_data$add_column("sample_weights", data_set_weights["sample_weights"])
dataset_tf <- dataset_tf$rename_column("input", "input_embeddings")
# Choose correct target column and rename
dataset_tf <- dataset_tf$rename_column(target_column, "targets")
dataset_tf$set_format("tf")
tf_dataset_train <- dataset_tf$to_tf_dataset(
columns = c("input_embeddings", "sample_weights"),
batch_size = as.integer(self$last_training$config$batch_size),
shuffle = TRUE,
label_cols = "targets"
)
# Add sample weights
tf_dataset_train <- tf_dataset_train$map(py$extract_sample_weight)
dataset_tf_val <- val_data$rename_column("input", "input_embeddings")
# Choose correct target column and rename
dataset_tf_val <- dataset_tf_val$rename_column(target_column, "targets")
tf_dataset_val <- dataset_tf_val$to_tf_dataset(
columns = c("input_embeddings"),
batch_size = as.integer(self$last_training$config$batch_size),
shuffle = FALSE,
label_cols = "targets"
)
history <- self$model$fit(
verbose = as.integer(self$last_training$config$ml_trace),
x = tf_dataset_train,
validation_data = tf_dataset_val,
epochs = as.integer(self$last_training$config$epochs),
callbacks = callback,
class_weight = reticulate::py_dict(keys = names(class_weights), values = class_weights)
)$history
if (self$model_config$n_categories == 2) {
history <- list(
loss = rbind(history$loss, history$val_loss),
accuracy = rbind(history$binary_accuracy, history$val_binary_accuracy),
balanced_accuracy = rbind(history$balanced_accuracy, history$val_balanced_accuracy)
)
} else {
history <- list(
loss = rbind(history$loss, history$val_loss),
accuracy = rbind(history$categorical_accuracy, history$val_categorical_accuracy),
balanced_accuracy = rbind(history$balanced_accuracy, history$val_balanced_accuracy)
)
}
if (use_callback == TRUE) {
self$model$load_weights(paste0(self$last_training$config$dir_checkpoint, "/checkpoints/best_weights.h5"))
}
# PyTorch - Callbacks and training
} else if (private$ml_framework == "pytorch") {
data_set_weights <- datasets$Dataset$from_dict(
reticulate::dict(list(
sample_weights = sample_weights
))
)
dataset_train <- train_data$add_column("sample_weights", data_set_weights["sample_weights"])
dataset_train <- dataset_train$select_columns(c("input", target_column, "sample_weights"))
if (self$model_config$require_one_hot == TRUE) {
dataset_train <- dataset_train$rename_column(target_column, "labels")
}
pytorch_train_data <- dataset_train$with_format("torch")
pytorch_val_data <- val_data$select_columns(c("input", target_column))
if (self$model_config$require_one_hot == TRUE) {
pytorch_val_data <- pytorch_val_data$rename_column(target_column, "labels")
}
pytorch_val_data <- pytorch_val_data$with_format("torch")
if (!is.null(test_data)) {
pytorch_test_data <- test_data$select_columns(c("input", target_column))
if (self$model_config$require_one_hot == TRUE) {
pytorch_test_data <- pytorch_test_data$rename_column(target_column, "labels")
}
pytorch_test_data <- pytorch_test_data$with_format("torch")
} else {
pytorch_test_data <- NULL
}
history <- py$TeClassifierTrain_PT_with_Datasets(
model = self$model,
loss_fct_name = loss_fct_name,
optimizer_method = self$model_config$optimizer,
epochs = as.integer(self$last_training$config$epochs),
trace = as.integer(self$last_training$config$ml_trace),
use_callback = use_callback,
batch_size = as.integer(self$last_training$config$batch_size),
train_data = pytorch_train_data,
val_data = pytorch_val_data,
test_data = pytorch_test_data,
filepath = paste0(self$last_training$config$dir_checkpoint, "/checkpoints/best_weights.pt"),
n_classes = as.integer(length(self$model_config$target_levels)),
class_weights = torch$tensor(np$array(class_weights)),
log_dir = log_dir,
log_write_interval = log_write_interval,
log_top_value = log_top_value,
log_top_total = log_top_total,
log_top_message = log_top_message
)
}
#provide rownames and replace -100
history<-private$prepare_history_data(history)
return(history)
},
#--------------------------------------------------------------------------
set_feature_extractor = function(feature_extractor) {
# Check
check_class(feature_extractor, "TEFeatureExtractor", TRUE)
if (!is.null(feature_extractor)) {
if (feature_extractor$get_ml_framework() != private$ml_framework) {
stop("The machine learning framework of the feature extractior and
classifier do not match. Please provide a feature extractor
with the same machine learning framework as the classifier.")
}
if (feature_extractor$is_trained() == FALSE) {
stop("The supplied feature extractor is not trained. Please
provide train and try again.")
}
self$model_config$use_fe <- TRUE
self$model_config$features <- feature_extractor$model_config$features
self$feature_extractor <- feature_extractor$clone(deep = TRUE)
} else {
self$model_config$use_fe <- FALSE
self$model_config$features <- private$text_embedding_model[["features"]]
}
},
#--------------------------------------------------------------------------
adjust_configuration = function() {
if (self$model_config$rec_layers!=0 & self$model_config$self_attention_heads > 0) {
if (self$model_config$features %% 2 != 0) {
stop("The number of features of the TextEmbeddingmodel is
not a multiple of 2.")
}
}
if (is.null(self$model_config$intermediate_size) == TRUE) {
if (self$model_config$attention_type == "fourier" & self$model_config$rec_layers > 0) {
self$model_config$intermediate_size <- 2 * self$model_config$rec_size
} else if (self$model_config$attention_type == "fourier" & self$model_config$rec_layers == 0) {
self$model_config$intermediate_size <- 2 * self$model_config$features
} else if (self$model_config$attention_type == "multihead" & self$model_config$rec_layers > 0 & self$model_config$self_attention_heads > 0) {
self$model_config$intermediate_size <- 2 * self$model_config$features
} else if (self$model_config$attention_type == "multihead" & self$model_config$rec_layers == 0 & self$model_config$self_attention_heads > 0) {
self$model_config$intermediate_size <- 2 * self$model_config$features
} else {
self$model_config$intermediate_size <- NULL
}
}
if(self$model_config$rec_layers<=1){
self$model_config$rec_dropout=0.0
}
if(self$model_config$rec_layers<=0){
self$model_config$rec_size=0
}
if(self$model_config$dense_layers<=1){
self$model_config$dense_dropout=0.0
}
if(self$model_config$dense_layers<=0){
self$model_config$dense_size=0
}
},
#--------------------------------------------------------------------------
check_and_adjust_target_levels = function(data_targets) {
if (sum(levels(data_targets) %in% self$model_config$target_levels) != self$model_config$n_categories) {
warning(
paste(
"data_targets contains levels that are not defined for the classifier",
"Defined levels are", self$model_config$target_levels, ".",
"Please check your data or create a new classifier and pass
all levels to the classifier's configuration."
)
)
}
tmp_data <- as.character(data_targets)
tmp_data <- factor(
x = tmp_data,
levels = self$model_config$target_levels,
ordered = TRUE
)
names(tmp_data) <- names(data_targets)
return(tmp_data)
}
)
)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.