# R/utils_DataManager.R

# This file is part of the R package "aifeducation".
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 3 as published by
# the Free Software Foundation.
#
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>

#-----------------------------------------------------------------------------
#' @title Create synthetic cases for balancing training data
#' @description This function creates synthetic cases for balancing the training data for classifier models.
#' @param matrix_form Named `matrix` containing the text embeddings in a matrix form.
#' @param target Named `factor` containing the labels of the corresponding embeddings.
#' @param times `int` for the number of sequences/times.
#' @param features `int` for the number of features within each sequence.
#' @param sequence_length `vector` of `int` values containing the number of chunks/sequences for every case.
#' @param method `vector` containing strings of the requested methods for generating new cases. Currently
#'   "knnor" from this package is available.
#' @param min_k `int` The minimal number of nearest neighbors during the sampling process.
#' @param max_k `int` The maximum number of nearest neighbors during the sampling process.
#' @return `list` with the following components:
#'   * `syntetic_embeddings`: Named `data.frame` containing the text embeddings of the synthetic cases.
#'   * `syntetic_targets`: Named `factor` containing the labels of the corresponding synthetic cases.
#'   * `n_syntetic_units`: `table` showing the number of synthetic cases for every label/category.
#'
#' @family Utils Developers
#'
#' @export
#' @import foreach
#' @import doParallel
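#' @examples
#' # A minimal sketch (not run). The toy matrix, labels, and sequence lengths
#' # below are illustrative assumptions, not data shipped with the package.
#' \dontrun{
#' # 20 cases with times = 2 sequences of features = 3 -> 6 columns
#' emb <- matrix(rnorm(20 * 6), nrow = 20, ncol = 6)
#' rownames(emb) <- paste0("case_", 1:20)
#' labels <- factor(c(rep("a", 14), rep("b", 6)))
#' names(labels) <- rownames(emb)
#'
#' # %dopar% requires a registered parallel backend
#' doParallel::registerDoParallel(cores = 2)
#' syn <- get_synthetic_cases_from_matrix(
#'   matrix_form = emb,
#'   times = 2,
#'   features = 3,
#'   target = labels,
#'   sequence_length = rep(2, 20)
#' )
#' syn$n_syntetic_units
#' doParallel::stopImplicitCluster()
#' }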
get_synthetic_cases_from_matrix <- function(matrix_form,
                                            times,
                                            features,
                                            target,
                                            sequence_length,
                                            method = "knnor",
                                            min_k = 1L,
                                            max_k = 6L) {
  # get possible seq lengths in order to group the cases by sequence length
  seq_length_categories <- as.numeric(names(table(sequence_length)))

  index <- 1L
  input <- NULL

  # Create tasks for every group of sequence lengths
  for (current_seq_length in seq_length_categories) {
    condition <- (sequence_length == current_seq_length)
    idx <- which(condition)
    cat_freq <- table(target[idx])
    categories <- names(cat_freq)
    max_freq <- max(cat_freq)

    for (cat in categories) {
      # Check k and adjust if necessary
      n_neighbors <- cat_freq[cat] - 1L

      if (n_neighbors <= max_k) {
        max_k_final <- n_neighbors
        if (min_k > max_k_final) {
          min_k_final <- max_k_final
        } else {
          min_k_final <- min_k
        }
      } else {
        max_k_final <- max_k
        min_k_final <- min_k
      }

      max_k_final <- as.numeric(max_k_final)
      min_k_final <- as.numeric(min_k_final)

      # Adjust the k range according to the gap to the majority category:
      # if fewer cases are required than k values are available, shrink the range
      required_cases <- as.numeric(max_freq - cat_freq[cat])
      n_k <- length(min_k_final:max_k_final)
      if (required_cases < n_k) {
        difference <- n_k - required_cases
        min_k_final <- min_k_final + difference
      }

      if (cat_freq[cat] < max_freq && min_k > 0L && cat_freq[cat] > 3L && required_cases > 0L) {
        for (m in seq_len(length(method))) {
          for (k in min_k_final:max_k_final) {
            # Sanity checks for the adjusted k range
            if (length(max_k_final) > 1L) {
              stop("max_k_final must be a single value.")
            }
            if (max_k_final < 0L) {
              stop("max_k_final must not be negative.")
            }
            input[[index]] <- list(
              cat = as.character(cat),
              required_cases = as.numeric(required_cases),
              k = as.numeric(k),
              method = as.character(method[m]),
              selected_cases = idx,
              chunks = as.numeric(current_seq_length), # numeric: used below to compute the column window
              k_s = length(min_k_final:max_k_final),
              max_k = as.numeric(max_k_final)
            )
            index <- index + 1L
          }
        }
      }
    }
  }

  # Process every task in parallel; with .errorhandling = "pass", errors are
  # returned as elements of result_list instead of stopping the loop
  result_list <- foreach::foreach(
    index = seq_len(length(input)),
    .export = "create_synthetic_units_from_matrix",
    .errorhandling = "pass"
  ) %dopar% {
    create_synthetic_units_from_matrix(
      matrix_form = matrix_form[
        input[[index]]$selected_cases,
        c(1L:(input[[index]]$chunks * features))
      ],
      target = target[input[[index]]$selected_cases],
      required_cases = input[[index]]$required_cases,
      k = input[[index]]$k,
      method = input[[index]]$method,
      cat = input[[index]]$cat,
      k_s = input[[index]]$k_s,
      max_k = input[[index]]$max_k
    )
  }

  # get number of synthetic cases
  n_syn_cases <- 0L
  for (i in seq_len(length(result_list))) {
    if (!is.null(result_list[[i]]$syntetic_embeddings)) {
      n_syn_cases <- n_syn_cases + nrow(result_list[[i]]$syntetic_embeddings)
    }
  }

  syntetic_embeddings <- matrix(
    data = 0L,
    nrow = n_syn_cases,
    ncol = times * features
  )
  colnames(syntetic_embeddings) <- colnames(matrix_form)
  syntetic_embeddings <- as.data.frame(syntetic_embeddings)
  syntetic_targets <- NULL

  n_row <- 0L
  names_vector <- NULL
  for (i in seq_len(length(result_list))) {
    if (!is.null(result_list[[i]]$syntetic_embeddings)) {
      syntetic_embeddings[
        (n_row + 1L):(n_row + nrow(result_list[[i]]$syntetic_embeddings)),
        c(seq_len(ncol(result_list[[i]]$syntetic_embeddings)))
      ] <- result_list[[i]]$syntetic_embeddings[, c(seq_len(ncol(result_list[[i]]$syntetic_embeddings)))]
      syntetic_targets <- append(syntetic_targets, values = result_list[[i]]$syntetic_targets)
      n_row <- n_row + nrow(result_list[[i]]$syntetic_embeddings)
      names_vector <- append(
        x = names_vector,
        values = rownames(result_list[[i]]$syntetic_embeddings)
      )
    }
  }

  # Transform matrix back to array
  syntetic_embeddings <- matrix_to_array_c(
    matrix = as.matrix(syntetic_embeddings),
    times = times,
    features = features
  )
  rownames(syntetic_embeddings) <- names_vector

  n_syntetic_units <- table(syntetic_targets)

  results <- list(
    syntetic_embeddings = syntetic_embeddings,
    syntetic_targets = syntetic_targets,
    n_syntetic_units = n_syntetic_units
  )

  return(results)
}

#---------------------------------------------
#' @title Create synthetic units
#' @description Function for creating synthetic cases in order to balance the data for training with
#'   [TEClassifierRegular] or [TEClassifierProtoNet]. This is an auxiliary function for use with
#'   [get_synthetic_cases_from_matrix] to allow parallel computations.
#'
#' @param matrix_form Named `matrix` containing the text embeddings in matrix form. In most cases this object is taken
#'   from [EmbeddedText]$embeddings.
#' @param target Named `factor` containing the labels/categories of the corresponding cases.
#' @param required_cases `int` Number of cases necessary to fill the gap between the frequency of the class under
#'   investigation and the major class.
#' @param k `int` The number of nearest neighbors during the sampling process.
#' @param max_k `int` The maximum number of nearest neighbors during the sampling process.
#' @param k_s `int` Number of k values used in the complete generation process.
#' @param method `string` Method for generating the new cases. Currently only
#'   "knnor" from this package is available.
#' @param cat `string` The category for which new cases should be created.
#' @return Returns a `list` which contains the text embeddings of the new synthetic cases as a named `data.frame` and
#'   their labels as a named `factor`.
#'
#' @family Utils Developers
#'
#' @export
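#' @examples
#' # A minimal sketch (not run); the toy embedding matrix and labels are
#' # made-up assumptions for illustration.
#' \dontrun{
#' emb <- matrix(rnorm(10 * 4), nrow = 10, ncol = 4)
#' rownames(emb) <- paste0("case_", 1:10)
#' labels <- factor(c(rep("a", 7), rep("b", 3)))
#' names(labels) <- rownames(emb)
#'
#' # Create 4 synthetic cases for the minority class "b" with k = 2 neighbors
#' new_cases <- create_synthetic_units_from_matrix(
#'   matrix_form = emb,
#'   target = labels,
#'   required_cases = 4,
#'   k = 2,
#'   method = "knnor",
#'   cat = "b",
#'   k_s = 1,
#'   max_k = 2
#' )
#' str(new_cases$syntetic_embeddings)
#' }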
# TODO (Yuliia): k_s and max_k parameters can be removed
create_synthetic_units_from_matrix <- function(matrix_form,
                                               target,
                                               required_cases,
                                               k,
                                               method,
                                               cat,
                                               k_s,
                                               max_k) {
  # Transform to a binary problem
  tmp_target <- (target == cat)

  syn_data <- NULL
  if (method == "knnor") {
    syn_data <- try(
      knnor(
        dataset = list(
          embeddings = matrix_form,
          labels = tmp_target
        ),
        k = k,
        aug_num = required_cases
      ),
      silent = TRUE
    )
  }

  # Only proceed if knnor succeeded and produced at least one synthetic case
  if (
    !inherits(x = syn_data, what = "try-error") &&
      !is.null(syn_data) &&
      nrow(syn_data$syn_data) > 0L
  ) {
    n_cols_embedding <- ncol(matrix_form)
    tmp_data <- syn_data$syn_data[1L:required_cases, -ncol(syn_data$syn_data)]
    rownames(tmp_data) <- paste0(
      method, "_", cat, "_", k, "_", n_cols_embedding, "_",
      seq(from = 1L, to = nrow(tmp_data), by = 1L)
    )
    tmp_data <- as.data.frame(tmp_data)
    tmp_target <- rep(cat, times = nrow(tmp_data))
    names(tmp_target) <- rownames(tmp_data)

    results <- list(
      syntetic_embeddings = tmp_data,
      syntetic_targets = tmp_target
    )
  } else {
    results <- list(
      syntetic_embeddings = NULL,
      syntetic_targets = NULL
    )
  }

  return(results)
}

#------------------------------------------------------------------------------
#' @title Function for splitting data into a train and validation sample
#' @description This function creates a train and validation sample based on stratified random sampling. The relative
#'   frequencies of each category in the train and validation sample equal the relative frequencies of the initial data
#'   (proportional stratified sampling).
#'
#' @param embedding Object of class [EmbeddedText].
#' @param target Named `factor` containing the labels of every case.
#' @param val_size `double` Ratio between 0 and 1 indicating the relative frequency of cases which should be used as
#'   validation sample.
#' @return Returns a `list` with the following components:
#'   * `target_train`: Named `factor` containing the labels of the training sample.
#'   * `embeddings_train`: Object of class [EmbeddedText] containing the text embeddings for the training sample.
#'   * `target_test`: Named `factor` containing the labels of the validation sample.
#'   * `embeddings_test`: Object of class [EmbeddedText] containing the text embeddings for the validation sample.
#'
#' @family Utils Developers
#' @keywords internal
#' @noRd
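#' @examples
#' # Internal sketch (not run) with a made-up label vector; passing
#' # embedding = NULL splits only the labels.
#' \dontrun{
#' labels <- factor(rep(c("pos", "neg"), times = c(12, 8)))
#' names(labels) <- paste0("case_", seq_along(labels))
#' split <- get_train_test_split(embedding = NULL, target = labels, val_size = 0.25)
#' table(split$target_train)
#' table(split$target_test)
#' }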
get_train_test_split <- function(embedding = NULL,
                                 target,
                                 val_size) {
  categories <- names(table(target))
  val_sample <- NULL
  for (cat in categories) {
    tmp <- subset(target, target == cat)
    val_sample[cat] <- list(
      sample(names(tmp), size = max(1L, length(tmp) * val_size))
    )
  }
  val_data <- target[unlist(val_sample)]
  train_data <- target[setdiff(names(target), names(val_data))]

  if (!is.null(embedding)) {
    val_embeddings <- embedding$clone(deep = TRUE)
    val_embeddings$embeddings <- val_embeddings$embeddings[names(val_data), ]
    val_embeddings$embeddings <- na.omit(val_embeddings$embeddings)
    train_embeddings <- embedding$clone(deep = TRUE)
    train_embeddings$embeddings <- train_embeddings$embeddings[names(train_data), ]
    train_embeddings$embeddings <- na.omit(train_embeddings$embeddings)

    results <- list(
      target_train = train_data,
      embeddings_train = train_embeddings,
      target_test = val_data,
      embeddings_test = val_embeddings
    )
  } else {
    results <- list(
      target_train = train_data,
      embeddings_train = NA,
      target_test = val_data,
      embeddings_test = NA
    )
  }

  return(results)
}

#-----------------------------------------------------------------------------
#' @title Create cross-validation samples
#' @description Function creates cross-validation samples and ensures that the relative frequency for every
#'   category/label within a fold equals the relative frequency of the category/label within the initial data.
#'
#' @param target Named `factor` containing the relevant labels/categories. Missing cases should be declared with `NA`.
#' @param k_folds `int` number of folds.
#'
#' @return Return a `list` with the following components:
#'   * `val_sample`: `vector` of `strings` containing the names of cases of the validation sample.
#'   * `train_sample`: `vector` of `strings` containing the names of cases of the train sample.
#'   * `n_folds`: `int` Number of realized folds.
#'   * `unlabeled_cases`: `vector` of `strings` containing the names of the unlabeled cases.
#'
#' @note The parameter `target` allows cases with missing categories/labels. These should be declared with `NA`. All
#'   these cases are ignored for creating the different folds. Their names are saved within the component
#'   `unlabeled_cases`. These cases can be used for Pseudo Labeling.
#' @note The function checks the absolute frequencies of every category/label. If the absolute frequency is not
#'   sufficient to ensure at least one case in every fold, the number of folds is reduced. In these cases, a warning
#'   is printed to the console. At least four cases per fold are necessary to ensure that the training of
#'   [TEClassifierRegular] or [TEClassifierProtoNet] works well with all options turned on.
#' @family Utils Developers
#' @keywords internal
#' @noRd
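#' @examples
#' # Internal sketch (not run) with made-up labels; the two NA cases are
#' # returned in unlabeled_cases instead of being assigned to a fold.
#' \dontrun{
#' labels <- factor(c(rep("a", 10), rep("b", 10), NA, NA))
#' names(labels) <- paste0("case_", 1:22)
#' folds <- get_folds(target = labels, k_folds = 5)
#' folds$n_folds
#' lengths(folds$val_sample)
#' folds$unlabeled_cases
#' }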
get_folds <- function(target,
                      k_folds) {
  sample_target <- na.omit(target)
  freq_cat <- table(sample_target)
  categories <- names(freq_cat)
  min_freq <- min(freq_cat)

  if (min_freq / k_folds < 1L) {
    fin_k_folds <- min_freq
    warning(
      "Frequency of the smallest category/label is not sufficient to ensure at least one case per fold. ",
      "Adjusting the number of folds from ", k_folds, " to ", fin_k_folds, "."
    )
    if (fin_k_folds == 0L) {
      stop(
        "Frequency of the smallest category/label is too low. Please check your data. ",
        "Consider removing all categories/labels with a very low absolute frequency."
      )
    }
  } else {
    fin_k_folds <- k_folds
  }

  final_assignments <- NULL
  for (cat in categories) {
    condition <- (sample_target == cat)
    focused_targets <- subset(
      x = sample_target,
      subset = condition
    )
    n_cases <- length(focused_targets)

    cases_per_fold <- vector(length = fin_k_folds)
    cases_per_fold[] <- ceiling(n_cases / fin_k_folds)

    delta <- sum(cases_per_fold) - n_cases
    if (delta > 0L) {
      for (i in 1L:delta) {
        cases_per_fold[1L + (i - 1L) %% fin_k_folds] <- cases_per_fold[1L + (i - 1L) %% fin_k_folds] - 1L
      }
    }

    possible_assignments <- NULL
    for (i in seq_len(length(cases_per_fold))) {
      possible_assignments <- append(
        x = possible_assignments,
        values = rep.int(
          x = i,
          times = cases_per_fold[i]
        )
      )
    }

    assignments <- sample(
      x = possible_assignments,
      size = length(possible_assignments),
      replace = FALSE
    )
    names(assignments) <- names(focused_targets)
    final_assignments <- append(
      x = final_assignments,
      values = assignments
    )
  }

  val_sample <- NULL
  for (i in 1L:fin_k_folds) {
    condition <- (final_assignments == i)
    val_sample[i] <- list(names(subset(
      x = final_assignments,
      subset = condition
    )))
  }

  train_sample <- NULL
  for (i in 1L:fin_k_folds) {
    train_sample[i] <- list(setdiff(x = names(sample_target), y = val_sample[[i]]))
  }

  unlabeled_cases <- setdiff(x = names(target), y = c(val_sample[[1L]], train_sample[[1L]]))

  results <- list(
    val_sample = val_sample,
    train_sample = train_sample,
    n_folds = fin_k_folds,
    unlabeled_cases = unlabeled_cases
  )
  return(results)
}

#-------------------------------------------------------------------------------
#' @title Create a stratified random sample
#' @description This function creates a stratified random sample. The difference to `get_train_test_split` is that this
#'   function does not require text embeddings and does not split the text embeddings into a train and validation
#'   sample.
#'
#' @param targets Named `vector` containing the labels/categories for each case.
#' @param val_size `double` Value between 0 and 1 indicating how many cases of each label/category should be part of the
#'   validation sample.
#' @return `list` which contains the names of the cases belonging to the train sample and to the validation sample.
#' @family Utils Developers
#' @keywords internal
#' @noRd
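#' @examples
#' # Internal sketch (not run) with a made-up named label vector.
#' \dontrun{
#' labels <- c(rep("a", 8), rep("b", 4))
#' names(labels) <- paste0("case_", 1:12)
#' split <- get_stratified_train_test_split(targets = labels, val_size = 0.25)
#' split$test_sample
#' }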
get_stratified_train_test_split <- function(targets, val_size = 0.25) {
  test_sample <- NULL
  categories <- names(table(targets))

  for (cat in categories) {
    condition <- (targets == cat)
    tmp <- names(subset(
      x = targets,
      subset = condition
    ))
    test_sample[cat] <- list(
      sample(tmp, size = max(1L, length(tmp) * val_size))
    )
  }
  test_sample <- unlist(test_sample, use.names = FALSE)
  train_sample <- setdiff(names(targets), test_sample)

  results <- list(
    test_sample = test_sample,
    train_sample = train_sample
  )
  return(results)
}

#------------------------------------------------------------------------------
#' @title Get the number of chunks/sequences for each case
#' @description Function for calculating the number of chunks/sequences for every case.
#'
#' @param text_embeddings `data.frame` or `array` containing the text embeddings.
#' @param features `int` Number of features within each sequence.
#' @param times `int` Number of sequences.
#' @param pad_value `r get_param_doc_desc("pad_value")`
#' @return Named `vector` of integers representing the number of chunks/sequences for every case.
#'
#' @family Utils Developers
#'
#' @export
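#' @examples
#' # Toy matrix made up for illustration: two cases with times = 2 chunks of
#' # features = 3; the second chunk of case_2 consists only of the pad value.
#' emb <- matrix(0.5, nrow = 2, ncol = 6)
#' emb[2, 4:6] <- -100
#' rownames(emb) <- c("case_1", "case_2")
#' get_n_chunks(text_embeddings = emb, features = 3, times = 2)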
get_n_chunks <- function(text_embeddings, features, times, pad_value = -100L) {
  n_chunks <- vector(length = nrow(text_embeddings))
  n_chunks[] <- 0L

  if (length(dim(text_embeddings)) == 2L) {
    # Matrix input: chunk i occupies columns ((i - 1) * features + 1):(i * features)
    for (i in 1L:times) {
      window <- c(1L:features) + (i - 1L) * features
      sub_matrix <- text_embeddings[, window, drop = FALSE]
      tmp_sums <- rowSums(sub_matrix)
      # A fully padded chunk sums to features * pad_value
      n_chunks <- n_chunks + as.numeric(tmp_sums != features * pad_value)
    }
  } else if (length(dim(text_embeddings)) == 3L) {
    for (i in 1L:times) {
      sub_matrix <- text_embeddings[, i, , drop = FALSE]
      tmp_sums <- rowSums(sub_matrix)
      n_chunks <- n_chunks + as.numeric(tmp_sums != features * pad_value)
    }
  } else {
    stop("Dimensionality of text_embeddings must be 2 (matrix) or 3 (array).")
  }
  names(n_chunks) <- rownames(text_embeddings)
  return(n_chunks)
}
