R/FeatureExtractor.R
In aifeducation: Artificial Intelligence for Education

# This file is part of the R package "aifeducation".
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 3 as published by
# the Free Software Foundation.
#
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>

#' @title Feature extractor for reducing the number for dimensions of text embeddings.
#'
#' @description Abstract class for auto encoders with 'pytorch'.
#'
#' @return Objects of this class are used for reducing the number of dimensions of text embeddings created by an object
#'   of class [TextEmbeddingModel].
#'
#'   For training an object of class [EmbeddedText] or [LargeDataSetForTextEmbeddings] generated by an object of class
#'   [TextEmbeddingModel] is necessary. Passing raw texts is not supported.
#'
#'   For prediction an ob object class [EmbeddedText] or [LargeDataSetForTextEmbeddings] is necessary that was generated
#'   with the same [TextEmbeddingModel] as during training. Prediction outputs a new object of class [EmbeddedText] or
#'   [LargeDataSetForTextEmbeddings] which contains a text embedding with a lower number of dimensions.
#'
#'   All models use tied weights for the encoder and decoder layers (except `method="lstm"`) and apply the estimation of
#'   orthogonal weights. In addition, training tries to train the model to achieve uncorrelated features.
#'
#'   Objects of class [TEFeatureExtractor] are designed to be used with classifiers such as [TEClassifierRegular] and
#'   [TEClassifierProtoNet].
#'
#' @family Text Embedding
#' @export
TEFeatureExtractor <- R6::R6Class(
  classname = "TEFeatureExtractor",
  inherit = AIFEBaseModel,
  public = list(
    # New-----------------------------------------------------------------------
    #' @description Creating a new instance of this class.
    #' @param ml_framework `string` Framework to use for training and inference. Currently only `ml_framework="pytorch"`
    #'   is supported.
    #' @param name `string` Name of the new classifier. Please refer to common name conventions. Free text can be used
    #'   with parameter `label`.
    #' @param label `string` Label for the new classifier. Here you can use free text.
    #' @param text_embeddings An object of class [EmbeddedText] or [LargeDataSetForTextEmbeddings].
    #' @param features `int` determining the number of dimensions to which the dimension of the text embedding should be
    #'   reduced.
    #' @param method `string` Method to use for the feature extraction. `"lstm"` for an extractor based on LSTM-layers or
    #'   `"dense"` for dense layers.
    #' @param noise_factor `double` between 0 and a value lower 1 indicating how much noise should be added for the
    #'   training of the feature extractor.
    #' @param optimizer `string` `"adam"` or `"rmsprop"` .
    #' @return Returns an object of class [TEFeatureExtractor] which is ready for training.
    configure = function(ml_framework = "pytorch",
                         name = NULL,
                         label = NULL,
                         text_embeddings = NULL,
                         features = 128,
                         method = "lstm",
                         noise_factor = 0.2,
                         optimizer = "adam") {
      # Checking of parameters--------------------------------------------------
      check_type(ml_framework, "string", FALSE)
      if ((ml_framework %in% c("pytorch")) == FALSE) {
        stop("ml_framework must be 'pytorch'.")
      }
      check_type(name, "string", FALSE)
      check_type(label, "string", FALSE)
      check_type(optimizer, "string", FALSE)
      if (optimizer %in% c("adam", "rmsprop") == FALSE) {
        stop("Optimzier must be 'adam' oder 'rmsprop'.")
      }
      check_type(method, "string", FALSE)
      if (method %in% c("lstm", "dense") == FALSE) {
        stop("Method must be lstm, dense or conv. Please check.")
      }
      private$check_embeddings_object_type(text_embeddings, strict = TRUE)

      # Set ML framework------------------------------------------------------------------------
      private$ml_framework <- ml_framework

      # Setting Label and Name-------------------------------------------------
      private$set_model_info(
        model_name_root = name,
        model_id = generate_id(16),
        label = label,
        model_date = date()
      )

      # Set TextEmbeddingModel
      private$set_text_embedding_model(
        model_info = text_embeddings$get_model_info(),
        feature_extractor_info = text_embeddings$get_feature_extractor_info(),
        times = text_embeddings$get_times(),
        features = text_embeddings$get_features()
      )

      # Saving Configuration
      config <- list(
        method = method,
        noise_factor = noise_factor,
        features = features,
        times = private$text_embedding_model[["times"]],
        optimizer = optimizer,
        require_one_hot = FALSE,
        require_matrix_map = FALSE
      )
      self$model_config <- config

      # Set package versions
      private$set_package_versions()

      # Finalize configuration
      private$set_configuration_to_TRUE()

      # Create_Model
      private$create_reset_model()
    },

    #-------------------------------------------------------------------------
    #' @description Method for training a neural net.
    #' @param data_embeddings Object of class [EmbeddedText] or [LargeDataSetForTextEmbeddings].
    #' @param data_val_size `double` between 0 and 1, indicating the proportion of cases which should be used for the
    #'   validation sample.
    #' @param sustain_track `bool` If `TRUE` energy consumption is tracked during training via the python library
    #'   'codecarbon'.
    #' @param sustain_iso_code `string` ISO code (Alpha-3-Code) for the country. This variable must be set if
    #'   sustainability should be tracked. A list can be found on Wikipedia:
    #'   <https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes>.
    #' @param sustain_region Region within a country. Only available for USA and Canada See the documentation of
    #'   'codecarbon' for more information. <https://mlco2.github.io/codecarbon/parameters.html>
    #' @param sustain_interval `int` Interval in seconds for measuring power usage.
    #' @param epochs `int` Number of training epochs.
    #' @param batch_size `int` Size of batches.
    #' @param dir_checkpoint `string` Path to the directory where the checkpoint during training should be saved. If the
    #'   directory does not exist, it is created.
    #' @param log_dir `string` Path to the directory where the log files should be saved. If no logging is desired set
    #'   this argument to `NULL`.
    #' @param log_write_interval `int` Time in seconds determining the interval in which the logger should try to update
    #'   the log files. Only relevant if `log_dir` is not `NULL`.
    #' @param trace `bool` `TRUE`, if information about the estimation phase should be printed to the console.
    #' @param ml_trace `int` \code{ml_trace=0} does not print any information about the training process from pytorch on
    #'   the console. \code{ml_trace=1} prints a progress bar.
    #' @return Function does not return a value. It changes the object into a trained classifier.
    train = function(data_embeddings,
                     data_val_size = 0.25,
                     sustain_track = TRUE,
                     sustain_iso_code = NULL,
                     sustain_region = NULL,
                     sustain_interval = 15,
                     epochs = 40,
                     batch_size = 32,
                     dir_checkpoint,
                     trace = TRUE,
                     ml_trace = 1,
                     log_dir = NULL,
                     log_write_interval = 10) {
      # Checking Arguments------------------------------------------------------
      self$check_embedding_model(data_embeddings)
      check_type(data_val_size, "double", FALSE)
      check_type(sustain_track, "bool", FALSE)
      check_type(sustain_iso_code, "string", TRUE)
      check_type(sustain_region, "string", TRUE)
      check_type(sustain_interval, "double", FALSE)
      check_type(epochs, "int", FALSE)
      check_type(batch_size, "int", FALSE)
      check_type(dir_checkpoint, "string", FALSE)
      check_type(trace, "bool", FALSE)

      # Saving training configuration-------------------------------------------
      self$last_training$config$data_val_size <- data_val_size
      self$last_training$config$sustain_track <- sustain_track
      self$last_training$config$sustain_iso_code <- sustain_iso_code
      self$last_training$config$sustain_region <- sustain_region
      self$last_training$config$sustain_interval <- sustain_interval
      self$last_training$config$epochs <- epochs
      self$last_training$config$batch_size <- batch_size
      self$last_training$config$dir_checkpoint <- dir_checkpoint
      self$last_training$config$trace <- trace
      self$last_training$config$ml_trace <- ml_trace

      private$log_config$log_dir <- log_dir
      private$log_config$log_state_file <- paste0(private$log_config$log_dir, "/aifeducation_state.log")
      private$log_config$log_write_interval <- log_write_interval

      # Loading PY Scripts
      private$load_reload_python_scripts()

      # Start-------------------------------------------------------------------
      if (self$last_training$config$trace == TRUE) {
        message(paste(
          date(),
          "Start"
        ))
      }

      # Set up dataset
      if ("EmbeddedText" %in% class(data_embeddings)) {
        data <- data_embeddings$convert_to_LargeDataSetForTextEmbeddings()
        data <- data$get_dataset()
      } else {
        data <- data_embeddings$get_dataset()
      }

      # Reduce to unique cases for training
      data <- reduce_to_unique(data, "id")

      # Copy input as label for training
      extractor_dataset <- data$map(py$map_input_to_labels)

      # Check directory for checkpoints
      create_dir(
        dir_path = self$last_training$config$dir_checkpoint,
        trace = self$last_training$config$trace,
        msg = "Creating Checkpoint Directory")

      # Set up log file
      log_top_value <- 0
      log_top_total <- 1
      log_top_message <- "Overall"

      if (private$ml_framework == "pytorch") {
        # Set format
        extractor_dataset$set_format("torch")

        # Split into train and validation data
        extractor_dataset <- extractor_dataset$train_test_split(self$last_training$config$data_val_size)

        # print(extractor_dataset$train)
        self$last_training$history <- py$AutoencoderTrain_PT_with_Datasets(
          model = self$model,
          epochs = as.integer(self$last_training$config$epochs),
          trace = as.integer(self$last_training$config$ml_trace),
          batch_size = as.integer(self$last_training$config$batch_size),
          train_data = extractor_dataset$train,
          val_data = extractor_dataset$test,
          filepath = paste0(self$last_training$config$dir_checkpoint, "/best_weights.pt"),
          use_callback = TRUE,
          log_dir = private$log_config$log_dir,
          log_write_interval = log_write_interval,
          log_top_value = log_top_value,
          log_top_total = log_top_total,
          log_top_message = log_top_message
        )
        #-----------------------------------------------------------------------
      } else if (private$ml_framework == "tensorflow") {
        # Set format
        extractor_dataset$set_format("torch")

        # Split into train and validation data
        extractor_dataset <- extractor_dataset$train_test_split(self$last_training$config$data_val_size)

        # Set Callback
        callback <- keras$callbacks$ModelCheckpoint(
          filepath = paste0(self$last_training$config$dir_checkpoint, "/best_weights.keras"),
          monitor = "val_loss",
          verbose = as.integer(min(self$last_training$config$ml_trace, 1)),
          mode = "auto",
          save_best_only = TRUE,
          save_weights_only = TRUE
        )

        # Set optimizer
        if (self$model_config$optimizer == "adam") {
          self$model$compile(
            loss = "MSE",
            optimizer = keras$optimizers$Adam()
          )
        } else if (self$model_config$optimizer == "rmsprop") {
          self$model$compile(
            loss = "MSE",
            optimizer = keras$optimizers$RMSprop()
          )
        }

        tf_dataset_train <- extractor_dataset$train$to_tf_dataset(
          columns = c("input"),
          batch_size = as.integer(self$last_training$config$batch_size),
          shuffle = TRUE,
          label_cols = "labels"
        )

        tf_dataset_val <- extractor_dataset$test$to_tf_dataset(
          columns = c("input"),
          batch_size = as.integer(self$last_training$config$batch_size),
          shuffle = FALSE,
          label_cols = "labels"
        )

        history <- self$model$fit(
          verbose = as.integer(self$last_training$config$ml_trace),
          x = tf_dataset_train,
          validation_data = tf_dataset_val,
          epochs = as.integer(self$last_training$config$epochs),
          callbacks = callback
        )$history

        history <- rbind(history$loss, history$val_loss)
        self$last_training$history <- history

        self$model$load_weights(paste0(self$last_training$config$dir_checkpoint, "/best_weights.keras"))
      }

      rownames(self$last_training$history$loss) <- c("train", "val")

      # Set training status value
      private$trained <- TRUE

      if (self$last_training$config$trace == TRUE) {
        message(paste(date(), "Training finished"))
      }
    },
    #--------------------------------------------------------------------------
    #' @description loads an object from disk and updates the object to the current version of the package.
    #' @param dir_path Path where the object set is stored.
    #' @return Method does not return anything. It loads an object from disk.
    load_from_disk = function(dir_path) {
      # Call the core method which loads data common for all models
      private$load_config_and_docs(dir_path = dir_path)

      # Create and load AI model
      private$create_reset_model()
      self$load(dir_path = dir_path)

      # Add FeatureExtractor specific data
      # Load R file
      config_file <- load_R_config_state(dir_path)

      # Set training status
      private$trained <- config_file$private$trained
    },
    #---------------------------------------------------------------------------
    #' @description Method for extracting features. Applying this method reduces the number of dimensions of the text
    #'   embeddings. Please note that this method should only be used if a small number of cases should be compressed
    #'   since the data is loaded completely into memory. For a high number of cases please use the method
    #'   `extract_features_large`.
    #' @param data_embeddings Object of class [EmbeddedText],[LargeDataSetForTextEmbeddings],
    #'   `datasets.arrow_dataset.Dataset` or `array` containing the text embeddings which should be reduced in their
    #'   dimensions.
    #' @param batch_size `int` batch size.
    #' @return Returns an object of class [EmbeddedText] containing the compressed embeddings.
    extract_features = function(data_embeddings, batch_size) {
      # Argument checking
      check_type(batch_size, "int", FALSE)
      # check data_embeddings object
      if ("EmbeddedText" %in% class(data_embeddings) |
        "LargeDataSetForTextEmbeddings" %in% class(data_embeddings)) {
        self$check_embedding_model(text_embeddings = data_embeddings)
      } else {
        private$check_embeddings_object_type(data_embeddings, strict = FALSE)
      }

      # Load Custom Model Scripts
      private$load_reload_python_scripts()

      # Check number of cases in the data
      single_prediction <- private$check_single_prediction(data_embeddings)

      # Get current row names/name of the cases
      current_row_names <- private$get_rownames_from_embeddings(data_embeddings)

      # If at least two cases are part of the data set---------------------------
      if (single_prediction == FALSE) {
        prepared_embeddings <- private$prepare_embeddings_as_dataset(data_embeddings)

        if (private$ml_framework == "pytorch") {
          prepared_embeddings$set_format("torch")
          reduced_tensors <- py$TeFeatureExtractorBatchExtract(
            model = self$model,
            dataset = prepared_embeddings,
            batch_size = as.integer(batch_size)
          )
          reduced_embeddings <- private$detach_tensors(reduced_tensors)
        } else if (private$ml_framework == "tensorflow") {
          prepared_embeddings$set_format("tf")
          prepared_embeddings_tf <- prepared_embeddings$to_tf_dataset(
            columns = c("input"),
            batch_size = as.integer(batch_size),
            shuffle = FALSE
          )

          encoder_model <- tf$keras$Model(inputs = self$model$input, outputs = self$model$get_layer("latent_space_output")$output)

          reduced_embeddings <- encoder_model$predict(prepared_embeddings_tf,
            verbose = as.integer(0)
          )
        }
        #---------------------------------------------------------------------
      } else {
        prepared_embeddings <- private$prepare_embeddings_as_np_array(data_embeddings)
        if (private$ml_framework == "pytorch") {
          if (torch$cuda$is_available()) {
            device <- "cuda"
            dtype <- torch$double
            self$model$to(device, dtype = dtype)
            self$model$eval()
            input <- torch$from_numpy(prepared_embeddings)
            reduced_tensors <- self$model(input$to(device, dtype = dtype),
              encoder_mode = TRUE
            )
            reduced_embeddings <- private$detach_tensors(reduced_tensors)
          } else {
            device <- "cpu"
            dtype <- torch$float
            self$model$to(device, dtype = dtype)
            self$model$eval()
            input <- torch$from_numpy(prepared_embeddings)
            reduced_tensors <- self$model(input$to(device, dtype = dtype),
              encoder_mode = TRUE
            )
            reduced_embeddings <- private$detach_tensors(reduced_tensors)
          }
        } else if (private$ml_framework == "tensorflow") {
          encoder_model <- tf$keras$Model(inputs = self$model$input, outputs = self$model$get_layer("latent_space_output")$output)
          reduced_embeddings <- encoder_model$predict(prepared_embeddings,
            verbose = as.integer(0)
          )
        }
      }

      # Prepare output
      rownames(reduced_embeddings) <- current_row_names

      model_info <- self$get_text_embedding_model()

      red_embedded_text <- EmbeddedText$new()
      red_embedded_text$configure(
        model_name = paste0("feature_extracted_", model_info$model_name),
        model_label = model_info$model$model_label,
        model_date = model_info$model$model_date,
        model_method = model_info$model$model_method,
        model_version = model_info$model$model_version,
        model_language = model_info$model$model_language,
        param_seq_length = model_info$model$param_seq_length,
        param_features = dim(reduced_embeddings)[3],
        param_chunks = model_info$model$param_chunks,
        param_overlap = model_info$model$param_overlap,
        param_emb_layer_min = model_info$model$param_emb_layer_min,
        param_emb_layer_max = model_info$model$param_emb_layer_max,
        param_emb_pool_type = model_info$model$param_emb_pool_type,
        param_aggregation = model_info$model$param_aggregation,
        embeddings = reduced_embeddings
      )

      red_embedded_text$add_feature_extractor_info(
        model_name = private$model_info$model_name,
        model_label = private$model_info$model_label,
        features = self$model_config$features,
        method = self$model_config$method,
        noise_factor = self$model_config$noise_factor,
        optimizer = self$model_config$optimizer
      )

      return(red_embedded_text)
    },
    #--------------------------------------------------------------------------
    #' @description Method for extracting features from a large number of cases. Applying this method reduces the number
    #'   of dimensions of the text embeddings.
    #' @param data_embeddings Object of class [EmbeddedText] or [LargeDataSetForTextEmbeddings] containing the text
    #'   embeddings which should be reduced in their dimensions.
    #' @param batch_size `int` batch size.
    #' @param trace `bool` If `TRUE` information about the progress is printed to the console.
    #' @return Returns an object of class [LargeDataSetForTextEmbeddings] containing the compressed embeddings.
    extract_features_large = function(data_embeddings, batch_size, trace = FALSE) {
      # Argument checking
      check_class(data_embeddings, c("EmbeddedText", "LargeDataSetForTextEmbeddings"), FALSE)
      check_type(batch_size, "int", FALSE)
      check_type(trace, "bool", FALSE)

      # Get total number of batches for the loop
      total_number_of_bachtes <- ceiling(data_embeddings$n_rows() / batch_size)

      # Get indices for every batch
      batches_index <- get_batches_index(
        number_rows = data_embeddings$n_rows(),
        batch_size = batch_size,
        zero_based = TRUE
      )
      # Process every batch
      for (i in 1:total_number_of_bachtes) {
        subset <- data_embeddings$select(as.integer(batches_index[[i]]))
        embeddings <- self$extract_features(
          data_embeddings = subset,
          batch_size = batch_size
        )
        if (i == 1) {
          # Create Large Dataset
          model_info <- self$get_text_embedding_model()

          embedded_texts_large <- LargeDataSetForTextEmbeddings$new()
          embedded_texts_large$configure(
            model_label = model_info$model_label,
            model_date = model_info$model_date,
            model_method = model_info$model_method,
            model_version = model_info$model_version,
            model_language = model_info$model_language,
            param_seq_length = model_info$param_seq_length,
            param_features = dim(embeddings)[3],
            param_chunks = model_info$model$param_chunks,
            param_overlap = model_info$model$param_overlap,
            param_emb_layer_min = model_info$model$param_emb_layer_min,
            param_emb_layer_max = model_info$model$param_emb_layer_max,
            param_emb_pool_type = model_info$model$param_emb_pool_type,
            param_aggregation = model_info$model$param_aggregation
          )
          embedded_texts_large$add_feature_extractor_info(
            model_name = private$model_info$model_name,
            model_label = private$model_info$model_label,
            features = self$model_config$features,
            method = self$model_config$method,
            noise_factor = self$model_config$noise_factor,
            optimizer = self$model_config$optimizer
          )

          # Add new data
          embedded_texts_large$add_embeddings_from_EmbeddedText(embeddings)
        } else {
          # Add new data
          embedded_texts_large$add_embeddings_from_EmbeddedText(embeddings)
        }
        if (trace == TRUE) {
          cat(paste(
            date(),
            "Batch", i, "/", total_number_of_bachtes, "done", "\n"
          ))
        }
        gc()
      }
      return(embedded_texts_large)
    },
    #--------------------------------------------------------------------------
    #' @description Check if the [TEFeatureExtractor] is trained.
    #' @return Returns `TRUE` if the object is trained and `FALSE` if not.
    is_trained = function() {
      return(private$trained)
    }
  ),
  private = list(
    trained = FALSE,
    #--------------------------------------------------------------------------
    load_reload_python_scripts = function() {
      reticulate::py_run_file(system.file("python/py_functions.py",
        package = "aifeducation"
      ))
      if (private$ml_framework == "tensorflow") {
        reticulate::py_run_file(system.file("python/keras_autoencoder.py",
          package = "aifeducation"
        ))
        reticulate::py_run_file(system.file("python/keras_callbacks.py",
          package = "aifeducation"
        ))
      } else if (private$ml_framework == "pytorch") {
        reticulate::py_run_file(system.file("python/pytorch_te_classifier.py",
          package = "aifeducation"
        ))
        reticulate::py_run_file(system.file("python/pytorch_autoencoder.py",
          package = "aifeducation"
        ))
        reticulate::py_run_file(system.file("python/py_log.py",
          package = "aifeducation"
        ))
      }
    },
    #--------------------------------------------------------------------------
    create_reset_model = function() {
      private$load_reload_python_scripts()
      private$check_config_for_TRUE()
      if (private$ml_framework == "pytorch") {
        if (self$model_config$method == "lstm") {
          self$model <- py$LSTMAutoencoder_with_Mask_PT(
            times = as.integer(private$text_embedding_model["times"]),
            features_in = as.integer(private$text_embedding_model["features"]),
            features_out = as.integer(self$model_config$features),
            noise_factor = self$model_config$noise_factor
          )
        } else if (self$model_config$method == "dense") {
          self$model <- feature_extractor <- py$DenseAutoencoder_with_Mask_PT(
            features_in = as.integer(private$text_embedding_model["features"]),
            features_out = as.integer(self$model_config$features),
            noise_factor = self$model_config$noise_factor
          )
        } else if (self$model_config$method == "conv") {
          self$model <- feature_extractor <- py$ConvAutoencoder_with_Mask_PT(
            features_in = as.integer(private$text_embedding_model["features"]),
            features_out = as.integer(self$model_config$features),
            noise_factor = self$model_config$noise_factor
          )
        }
      } else if (private$ml_framework == "tensorflow") {
        if (self$model_config$method == "lstm") {
          self$model <- py$LSTMAutoencoder_with_Mask_TF(
            times = as.integer(private$text_embedding_model["times"]),
            features_in = as.integer(private$text_embedding_model["features"]),
            features_out = as.integer(self$model_config$features),
            noise_factor = self$model_config$noise_factor
          )
        }
      }
    },
    #--------------------------------------------------------------------------
    init_gui = function(data_manager) {
      # Check for a running Shiny App and set the configuration
      # The Gui functions must be set in the server function of shiny globally
      if (requireNamespace("shiny", quietly = TRUE) & requireNamespace("shinyWidgets", quietly = TRUE)) {
        if (shiny::isRunning()) {
          private$gui$shiny_app_active <- TRUE
        } else {
          private$gui$shiny_app_active <- FALSE
        }
      } else {
        private$gui$shiny_app_active <- FALSE
      }

      # SetUp Progressbar for UI
      private$gui$pgr_value <- -1
      private$gui$pgr_max_value <- data_manager$get_n_folds() + 1 +
        (data_manager$get_n_folds() + 1) * self$last_training$config$use_pl * self$last_training$config$pl_max_steps
    }
  )
)