# This file is part of the R package "aifeducation".
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 3 as published by
# the Free Software Foundation.
#
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>
#' @title Feature extractor for reducing the number of dimensions of text embeddings.
#'
#' @description Abstract class for autoencoders with 'pytorch'.
#'
#' @return Objects of this class are used for reducing the number of dimensions of text embeddings created by an object
#' of class [TextEmbeddingModel].
#'
#' For training, an object of class [EmbeddedText] or [LargeDataSetForTextEmbeddings] generated by an object of class
#' [TextEmbeddingModel] is necessary. Passing raw texts is not supported.
#'
#' For prediction, an object of class [EmbeddedText] or [LargeDataSetForTextEmbeddings] is necessary that was generated
#' with the same [TextEmbeddingModel] as during training. Prediction outputs a new object of class [EmbeddedText] or
#' [LargeDataSetForTextEmbeddings] which contains a text embedding with a lower number of dimensions.
#'
#' All models use tied weights for the encoder and decoder layers (except `method="lstm"`) and apply the estimation of
#' orthogonal weights. In addition, training encourages the model to produce uncorrelated features.
#'
#' Objects of class [TEFeatureExtractor] are designed to be used with classifiers such as [TEClassifierRegular] and
#' [TEClassifierProtoNet].
#'
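#' @examples
#' # A minimal workflow sketch. `my_embeddings` is a placeholder (not part of
#' # the package) for an object of class EmbeddedText or
#' # LargeDataSetForTextEmbeddings created with a TextEmbeddingModel.
#' \dontrun{
#' extractor <- TEFeatureExtractor$new()
#' extractor$configure(
#'   ml_framework = "pytorch",
#'   name = "example_extractor",
#'   label = "Example Feature Extractor",
#'   text_embeddings = my_embeddings,
#'   features = 128,
#'   method = "lstm"
#' )
#' extractor$train(
#'   data_embeddings = my_embeddings,
#'   sustain_track = FALSE,
#'   dir_checkpoint = tempdir()
#' )
#' reduced_embeddings <- extractor$extract_features(
#'   data_embeddings = my_embeddings,
#'   batch_size = 32
#' )
#' }
#'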
#' @family Text Embedding
#' @export
TEFeatureExtractor <- R6::R6Class(
classname = "TEFeatureExtractor",
inherit = AIFEBaseModel,
public = list(
# New-----------------------------------------------------------------------
#' @description Creating a new instance of this class.
#' @param ml_framework `string` Framework to use for training and inference. Currently only `ml_framework="pytorch"`
#' is supported.
#' @param name `string` Name of the new feature extractor. Please follow common naming conventions. Free text can be
#' used with the parameter `label`.
#' @param label `string` Label for the new feature extractor. Free text is allowed here.
#' @param text_embeddings An object of class [EmbeddedText] or [LargeDataSetForTextEmbeddings].
#' @param features `int` Number of dimensions to which the text embeddings should be reduced.
#' @param method `string` Method to use for the feature extraction. `"lstm"` for an extractor based on LSTM-layers or
#' `"dense"` for dense layers.
#' @param noise_factor `double` between 0 and 1 (exclusive) indicating how much noise should be added during the
#' training of the feature extractor.
#' @param optimizer `string` Optimizer for training. Possible values are `"adam"` and `"rmsprop"`.
#' @return Returns an object of class [TEFeatureExtractor] which is ready for training.
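#' @examples
#' # A configuration sketch using dense layers. `my_embeddings` is a
#' # placeholder for an EmbeddedText or LargeDataSetForTextEmbeddings object.
#' \dontrun{
#' extractor <- TEFeatureExtractor$new()
#' extractor$configure(
#'   ml_framework = "pytorch",
#'   name = "dense_extractor",
#'   label = "Dense Feature Extractor",
#'   text_embeddings = my_embeddings,
#'   features = 64,
#'   method = "dense",
#'   noise_factor = 0.2,
#'   optimizer = "adam"
#' )
#' }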
configure = function(ml_framework = "pytorch",
name = NULL,
label = NULL,
text_embeddings = NULL,
features = 128,
method = "lstm",
noise_factor = 0.2,
optimizer = "adam") {
# Checking of parameters--------------------------------------------------
check_type(ml_framework, "string", FALSE)
if ((ml_framework %in% c("pytorch")) == FALSE) {
stop("ml_framework must be 'pytorch'.")
}
check_type(name, "string", FALSE)
check_type(label, "string", FALSE)
check_type(optimizer, "string", FALSE)
if (optimizer %in% c("adam", "rmsprop") == FALSE) {
stop("Optimzier must be 'adam' oder 'rmsprop'.")
}
check_type(method, "string", FALSE)
if (method %in% c("lstm", "dense") == FALSE) {
stop("Method must be lstm, dense or conv. Please check.")
}
private$check_embeddings_object_type(text_embeddings, strict = TRUE)
# Set ML framework------------------------------------------------------------------------
private$ml_framework <- ml_framework
# Setting Label and Name-------------------------------------------------
private$set_model_info(
model_name_root = name,
model_id = generate_id(16),
label = label,
model_date = date()
)
# Set TextEmbeddingModel
private$set_text_embedding_model(
model_info = text_embeddings$get_model_info(),
feature_extractor_info = text_embeddings$get_feature_extractor_info(),
times = text_embeddings$get_times(),
features = text_embeddings$get_features()
)
# Saving Configuration
config <- list(
method = method,
noise_factor = noise_factor,
features = features,
times = private$text_embedding_model[["times"]],
optimizer = optimizer,
require_one_hot = FALSE,
require_matrix_map = FALSE
)
self$model_config <- config
# Set package versions
private$set_package_versions()
# Finalize configuration
private$set_configuration_to_TRUE()
# Create_Model
private$create_reset_model()
},
#-------------------------------------------------------------------------
#' @description Method for training a neural net.
#' @param data_embeddings Object of class [EmbeddedText] or [LargeDataSetForTextEmbeddings].
#' @param data_val_size `double` between 0 and 1, indicating the proportion of cases which should be used for the
#' validation sample.
#' @param sustain_track `bool` If `TRUE`, energy consumption is tracked during training via the python library
#' 'codecarbon'.
#' @param sustain_iso_code `string` ISO code (Alpha-3-Code) for the country. This variable must be set if
#' sustainability should be tracked. A list can be found on Wikipedia:
#' <https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes>.
#' @param sustain_region Region within a country. Only available for USA and Canada. See the documentation of
#' 'codecarbon' for more information: <https://mlco2.github.io/codecarbon/parameters.html>.
#' @param sustain_interval `int` Interval in seconds for measuring power usage.
#' @param epochs `int` Number of training epochs.
#' @param batch_size `int` Size of batches.
#' @param dir_checkpoint `string` Path to the directory where the checkpoint during training should be saved. If the
#' directory does not exist, it is created.
#' @param log_dir `string` Path to the directory where the log files should be saved. If no logging is desired set
#' this argument to `NULL`.
#' @param log_write_interval `int` Time in seconds determining the interval in which the logger should try to update
#' the log files. Only relevant if `log_dir` is not `NULL`.
#' @param trace `bool` If `TRUE`, information about the estimation phase is printed to the console.
#' @param ml_trace `int` `ml_trace=0` does not print any information about the training process from pytorch to
#' the console. `ml_trace=1` prints a progress bar.
#' @return Function does not return a value. It changes the object into a trained feature extractor.
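#' @examples
#' # A training sketch. Assumes `extractor` has been configured and that
#' # `my_embeddings` was created with the same TextEmbeddingModel used during
#' # configuration.
#' \dontrun{
#' extractor$train(
#'   data_embeddings = my_embeddings,
#'   data_val_size = 0.25,
#'   sustain_track = FALSE,
#'   epochs = 40,
#'   batch_size = 32,
#'   dir_checkpoint = tempdir()
#' )
#' }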
train = function(data_embeddings,
data_val_size = 0.25,
sustain_track = TRUE,
sustain_iso_code = NULL,
sustain_region = NULL,
sustain_interval = 15,
epochs = 40,
batch_size = 32,
dir_checkpoint,
trace = TRUE,
ml_trace = 1,
log_dir = NULL,
log_write_interval = 10) {
# Checking Arguments------------------------------------------------------
self$check_embedding_model(data_embeddings)
check_type(data_val_size, "double", FALSE)
check_type(sustain_track, "bool", FALSE)
check_type(sustain_iso_code, "string", TRUE)
check_type(sustain_region, "string", TRUE)
check_type(sustain_interval, "double", FALSE)
check_type(epochs, "int", FALSE)
check_type(batch_size, "int", FALSE)
check_type(dir_checkpoint, "string", FALSE)
check_type(trace, "bool", FALSE)
# Saving training configuration-------------------------------------------
self$last_training$config$data_val_size <- data_val_size
self$last_training$config$sustain_track <- sustain_track
self$last_training$config$sustain_iso_code <- sustain_iso_code
self$last_training$config$sustain_region <- sustain_region
self$last_training$config$sustain_interval <- sustain_interval
self$last_training$config$epochs <- epochs
self$last_training$config$batch_size <- batch_size
self$last_training$config$dir_checkpoint <- dir_checkpoint
self$last_training$config$trace <- trace
self$last_training$config$ml_trace <- ml_trace
private$log_config$log_dir <- log_dir
private$log_config$log_state_file <- paste0(private$log_config$log_dir, "/aifeducation_state.log")
private$log_config$log_write_interval <- log_write_interval
# Loading PY Scripts
private$load_reload_python_scripts()
# Start-------------------------------------------------------------------
if (self$last_training$config$trace == TRUE) {
message(paste(
date(),
"Start"
))
}
# Set up dataset
if ("EmbeddedText" %in% class(data_embeddings)) {
data <- data_embeddings$convert_to_LargeDataSetForTextEmbeddings()
data <- data$get_dataset()
} else {
data <- data_embeddings$get_dataset()
}
# Reduce to unique cases for training
data <- reduce_to_unique(data, "id")
# Copy input as label for training
extractor_dataset <- data$map(py$map_input_to_labels)
# Check directory for checkpoints
create_dir(
dir_path = self$last_training$config$dir_checkpoint,
trace = self$last_training$config$trace,
msg = "Creating Checkpoint Directory")
# Set up log file
log_top_value <- 0
log_top_total <- 1
log_top_message <- "Overall"
if (private$ml_framework == "pytorch") {
# Set format
extractor_dataset$set_format("torch")
# Split into train and validation data
extractor_dataset <- extractor_dataset$train_test_split(self$last_training$config$data_val_size)
self$last_training$history <- py$AutoencoderTrain_PT_with_Datasets(
model = self$model,
epochs = as.integer(self$last_training$config$epochs),
trace = as.integer(self$last_training$config$ml_trace),
batch_size = as.integer(self$last_training$config$batch_size),
train_data = extractor_dataset$train,
val_data = extractor_dataset$test,
filepath = paste0(self$last_training$config$dir_checkpoint, "/best_weights.pt"),
use_callback = TRUE,
log_dir = private$log_config$log_dir,
log_write_interval = log_write_interval,
log_top_value = log_top_value,
log_top_total = log_top_total,
log_top_message = log_top_message
)
#-----------------------------------------------------------------------
} else if (private$ml_framework == "tensorflow") {
# Set format
extractor_dataset$set_format("tf")
# Split into train and validation data
extractor_dataset <- extractor_dataset$train_test_split(self$last_training$config$data_val_size)
# Set Callback
callback <- keras$callbacks$ModelCheckpoint(
filepath = paste0(self$last_training$config$dir_checkpoint, "/best_weights.keras"),
monitor = "val_loss",
verbose = as.integer(min(self$last_training$config$ml_trace, 1)),
mode = "auto",
save_best_only = TRUE,
save_weights_only = TRUE
)
# Set optimizer
if (self$model_config$optimizer == "adam") {
self$model$compile(
loss = "MSE",
optimizer = keras$optimizers$Adam()
)
} else if (self$model_config$optimizer == "rmsprop") {
self$model$compile(
loss = "MSE",
optimizer = keras$optimizers$RMSprop()
)
}
tf_dataset_train <- extractor_dataset$train$to_tf_dataset(
columns = c("input"),
batch_size = as.integer(self$last_training$config$batch_size),
shuffle = TRUE,
label_cols = "labels"
)
tf_dataset_val <- extractor_dataset$test$to_tf_dataset(
columns = c("input"),
batch_size = as.integer(self$last_training$config$batch_size),
shuffle = FALSE,
label_cols = "labels"
)
history <- self$model$fit(
verbose = as.integer(self$last_training$config$ml_trace),
x = tf_dataset_train,
validation_data = tf_dataset_val,
epochs = as.integer(self$last_training$config$epochs),
callbacks = callback
)$history
history <- list(loss = rbind(history$loss, history$val_loss))
self$last_training$history <- history
self$model$load_weights(paste0(self$last_training$config$dir_checkpoint, "/best_weights.keras"))
}
rownames(self$last_training$history$loss) <- c("train", "val")
# Set training status value
private$trained <- TRUE
if (self$last_training$config$trace == TRUE) {
message(paste(date(), "Training finished"))
}
},
#--------------------------------------------------------------------------
#' @description Loads an object from disk and updates the object to the current version of the package.
#' @param dir_path Path where the object set is stored.
#' @return Method does not return anything. It loads an object from disk.
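#' @examples
#' # A loading sketch. The directory path is a placeholder for a folder to
#' # which a TEFeatureExtractor was previously saved.
#' \dontrun{
#' extractor <- TEFeatureExtractor$new()
#' extractor$load_from_disk(dir_path = "path/to/saved_extractor")
#' }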
load_from_disk = function(dir_path) {
# Call the core method which loads data common for all models
private$load_config_and_docs(dir_path = dir_path)
# Create and load AI model
private$create_reset_model()
self$load(dir_path = dir_path)
# Add FeatureExtractor specific data
# Load R file
config_file <- load_R_config_state(dir_path)
# Set training status
private$trained <- config_file$private$trained
},
#---------------------------------------------------------------------------
#' @description Method for extracting features. Applying this method reduces the number of dimensions of the text
#' embeddings. Please note that this method should only be used for a small number of cases since the data is loaded
#' completely into memory. For a large number of cases please use the method `extract_features_large`.
#' @param data_embeddings Object of class [EmbeddedText], [LargeDataSetForTextEmbeddings],
#' `datasets.arrow_dataset.Dataset` or `array` containing the text embeddings which should be reduced in their
#' dimensions.
#' @param batch_size `int` batch size.
#' @return Returns an object of class [EmbeddedText] containing the compressed embeddings.
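#' @examples
#' # An extraction sketch for a small number of cases. Assumes a trained
#' # `extractor` and embeddings created with the same TextEmbeddingModel as
#' # during training.
#' \dontrun{
#' reduced_embeddings <- extractor$extract_features(
#'   data_embeddings = my_embeddings,
#'   batch_size = 32
#' )
#' }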
extract_features = function(data_embeddings, batch_size) {
# Argument checking
check_type(batch_size, "int", FALSE)
# check data_embeddings object
if ("EmbeddedText" %in% class(data_embeddings) |
"LargeDataSetForTextEmbeddings" %in% class(data_embeddings)) {
self$check_embedding_model(text_embeddings = data_embeddings)
} else {
private$check_embeddings_object_type(data_embeddings, strict = FALSE)
}
# Load Custom Model Scripts
private$load_reload_python_scripts()
# Check number of cases in the data
single_prediction <- private$check_single_prediction(data_embeddings)
# Get current row names/name of the cases
current_row_names <- private$get_rownames_from_embeddings(data_embeddings)
# If at least two cases are part of the data set---------------------------
if (single_prediction == FALSE) {
prepared_embeddings <- private$prepare_embeddings_as_dataset(data_embeddings)
if (private$ml_framework == "pytorch") {
prepared_embeddings$set_format("torch")
reduced_tensors <- py$TeFeatureExtractorBatchExtract(
model = self$model,
dataset = prepared_embeddings,
batch_size = as.integer(batch_size)
)
reduced_embeddings <- private$detach_tensors(reduced_tensors)
} else if (private$ml_framework == "tensorflow") {
prepared_embeddings$set_format("tf")
prepared_embeddings_tf <- prepared_embeddings$to_tf_dataset(
columns = c("input"),
batch_size = as.integer(batch_size),
shuffle = FALSE
)
encoder_model <- tf$keras$Model(inputs = self$model$input, outputs = self$model$get_layer("latent_space_output")$output)
reduced_embeddings <- encoder_model$predict(prepared_embeddings_tf,
verbose = as.integer(0)
)
}
#---------------------------------------------------------------------
} else {
prepared_embeddings <- private$prepare_embeddings_as_np_array(data_embeddings)
if (private$ml_framework == "pytorch") {
# Select device and dtype depending on CUDA availability
if (torch$cuda$is_available()) {
device <- "cuda"
dtype <- torch$double
} else {
device <- "cpu"
dtype <- torch$float
}
self$model$to(device, dtype = dtype)
self$model$eval()
input <- torch$from_numpy(prepared_embeddings)
reduced_tensors <- self$model(input$to(device, dtype = dtype),
encoder_mode = TRUE
)
reduced_embeddings <- private$detach_tensors(reduced_tensors)
} else if (private$ml_framework == "tensorflow") {
encoder_model <- tf$keras$Model(inputs = self$model$input, outputs = self$model$get_layer("latent_space_output")$output)
reduced_embeddings <- encoder_model$predict(prepared_embeddings,
verbose = as.integer(0)
)
}
}
# Prepare output
rownames(reduced_embeddings) <- current_row_names
model_info <- self$get_text_embedding_model()
red_embedded_text <- EmbeddedText$new()
red_embedded_text$configure(
model_name = paste0("feature_extracted_", model_info$model$model_name),
model_label = model_info$model$model_label,
model_date = model_info$model$model_date,
model_method = model_info$model$model_method,
model_version = model_info$model$model_version,
model_language = model_info$model$model_language,
param_seq_length = model_info$model$param_seq_length,
param_features = dim(reduced_embeddings)[3],
param_chunks = model_info$model$param_chunks,
param_overlap = model_info$model$param_overlap,
param_emb_layer_min = model_info$model$param_emb_layer_min,
param_emb_layer_max = model_info$model$param_emb_layer_max,
param_emb_pool_type = model_info$model$param_emb_pool_type,
param_aggregation = model_info$model$param_aggregation,
embeddings = reduced_embeddings
)
red_embedded_text$add_feature_extractor_info(
model_name = private$model_info$model_name,
model_label = private$model_info$model_label,
features = self$model_config$features,
method = self$model_config$method,
noise_factor = self$model_config$noise_factor,
optimizer = self$model_config$optimizer
)
return(red_embedded_text)
},
#--------------------------------------------------------------------------
#' @description Method for extracting features from a large number of cases. Applying this method reduces the number
#' of dimensions of the text embeddings.
#' @param data_embeddings Object of class [EmbeddedText] or [LargeDataSetForTextEmbeddings] containing the text
#' embeddings which should be reduced in their dimensions.
#' @param batch_size `int` batch size.
#' @param trace `bool` If `TRUE`, information about the progress is printed to the console.
#' @return Returns an object of class [LargeDataSetForTextEmbeddings] containing the compressed embeddings.
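#' @examples
#' # A sketch for large data sets; the result is a LargeDataSetForTextEmbeddings
#' # instead of an in-memory EmbeddedText. `my_large_embeddings` is a
#' # placeholder for an existing object.
#' \dontrun{
#' reduced_embeddings <- extractor$extract_features_large(
#'   data_embeddings = my_large_embeddings,
#'   batch_size = 32,
#'   trace = TRUE
#' )
#' }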
extract_features_large = function(data_embeddings, batch_size, trace = FALSE) {
# Argument checking
check_class(data_embeddings, c("EmbeddedText", "LargeDataSetForTextEmbeddings"), FALSE)
check_type(batch_size, "int", FALSE)
check_type(trace, "bool", FALSE)
# Get total number of batches for the loop
total_number_of_batches <- ceiling(data_embeddings$n_rows() / batch_size)
# Get indices for every batch
batches_index <- get_batches_index(
number_rows = data_embeddings$n_rows(),
batch_size = batch_size,
zero_based = TRUE
)
# Process every batch
for (i in 1:total_number_of_batches) {
subset <- data_embeddings$select(as.integer(batches_index[[i]]))
embeddings <- self$extract_features(
data_embeddings = subset,
batch_size = batch_size
)
if (i == 1) {
# Create Large Dataset
model_info <- self$get_text_embedding_model()
embedded_texts_large <- LargeDataSetForTextEmbeddings$new()
embedded_texts_large$configure(
model_label = model_info$model$model_label,
model_date = model_info$model$model_date,
model_method = model_info$model$model_method,
model_version = model_info$model$model_version,
model_language = model_info$model$model_language,
param_seq_length = model_info$model$param_seq_length,
param_features = dim(embeddings)[3],
param_chunks = model_info$model$param_chunks,
param_overlap = model_info$model$param_overlap,
param_emb_layer_min = model_info$model$param_emb_layer_min,
param_emb_layer_max = model_info$model$param_emb_layer_max,
param_emb_pool_type = model_info$model$param_emb_pool_type,
param_aggregation = model_info$model$param_aggregation
)
embedded_texts_large$add_feature_extractor_info(
model_name = private$model_info$model_name,
model_label = private$model_info$model_label,
features = self$model_config$features,
method = self$model_config$method,
noise_factor = self$model_config$noise_factor,
optimizer = self$model_config$optimizer
)
}
# Add new data
embedded_texts_large$add_embeddings_from_EmbeddedText(embeddings)
if (trace == TRUE) {
cat(paste(
date(),
"Batch", i, "/", total_number_of_bachtes, "done", "\n"
))
}
gc()
}
return(embedded_texts_large)
},
#--------------------------------------------------------------------------
#' @description Check if the [TEFeatureExtractor] is trained.
#' @return Returns `TRUE` if the object is trained and `FALSE` if not.
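#' @examples
#' # A guard sketch: extract features only once the extractor is trained.
#' \dontrun{
#' if (extractor$is_trained()) {
#'   reduced_embeddings <- extractor$extract_features(my_embeddings, batch_size = 32)
#' }
#' }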
is_trained = function() {
return(private$trained)
}
),
private = list(
trained = FALSE,
#--------------------------------------------------------------------------
load_reload_python_scripts = function() {
reticulate::py_run_file(system.file("python/py_functions.py",
package = "aifeducation"
))
if (private$ml_framework == "tensorflow") {
reticulate::py_run_file(system.file("python/keras_autoencoder.py",
package = "aifeducation"
))
reticulate::py_run_file(system.file("python/keras_callbacks.py",
package = "aifeducation"
))
} else if (private$ml_framework == "pytorch") {
reticulate::py_run_file(system.file("python/pytorch_te_classifier.py",
package = "aifeducation"
))
reticulate::py_run_file(system.file("python/pytorch_autoencoder.py",
package = "aifeducation"
))
reticulate::py_run_file(system.file("python/py_log.py",
package = "aifeducation"
))
}
},
#--------------------------------------------------------------------------
create_reset_model = function() {
private$load_reload_python_scripts()
private$check_config_for_TRUE()
if (private$ml_framework == "pytorch") {
if (self$model_config$method == "lstm") {
self$model <- py$LSTMAutoencoder_with_Mask_PT(
times = as.integer(private$text_embedding_model["times"]),
features_in = as.integer(private$text_embedding_model["features"]),
features_out = as.integer(self$model_config$features),
noise_factor = self$model_config$noise_factor
)
} else if (self$model_config$method == "dense") {
self$model <- feature_extractor <- py$DenseAutoencoder_with_Mask_PT(
features_in = as.integer(private$text_embedding_model["features"]),
features_out = as.integer(self$model_config$features),
noise_factor = self$model_config$noise_factor
)
} else if (self$model_config$method == "conv") {
self$model <- feature_extractor <- py$ConvAutoencoder_with_Mask_PT(
features_in = as.integer(private$text_embedding_model["features"]),
features_out = as.integer(self$model_config$features),
noise_factor = self$model_config$noise_factor
)
}
} else if (private$ml_framework == "tensorflow") {
if (self$model_config$method == "lstm") {
self$model <- py$LSTMAutoencoder_with_Mask_TF(
times = as.integer(private$text_embedding_model["times"]),
features_in = as.integer(private$text_embedding_model["features"]),
features_out = as.integer(self$model_config$features),
noise_factor = self$model_config$noise_factor
)
}
}
},
#--------------------------------------------------------------------------
init_gui = function(data_manager) {
# Check for a running Shiny App and set the configuration
# The Gui functions must be set in the server function of shiny globally
if (requireNamespace("shiny", quietly = TRUE) & requireNamespace("shinyWidgets", quietly = TRUE)) {
if (shiny::isRunning()) {
private$gui$shiny_app_active <- TRUE
} else {
private$gui$shiny_app_active <- FALSE
}
} else {
private$gui$shiny_app_active <- FALSE
}
# SetUp Progressbar for UI
private$gui$pgr_value <- -1
private$gui$pgr_max_value <- data_manager$get_n_folds() + 1 +
(data_manager$get_n_folds() + 1) * self$last_training$config$use_pl * self$last_training$config$pl_max_steps
}
)
)