R/RSLSMOTE.R
In SMOTEWB: Imbalanced Resampling using SMOTE with Boosting (SMOTEWB)

Documented in RSLSMOTE

#' @title  Relocating safe-level SMOTE with minority outcast handling
#'
#' @description The Relocating Safe-Level SMOTE (RSLS) algorithm improves the
#' quality of synthetic samples generated by Safe-Level SMOTE (SLS) by
#' relocating specific synthetic data points that are too close to the majority
#' class distribution towards the original minority class distribution in the
#' feature space.
#'
#' @param x feature matrix or data.frame.
#' @param y a factor class variable. Can have two or more classes.
#' @param k1 number of neighbors to link. Default is 5.
#' @param k2 number of neighbors to determine safe levels. Default is 5.
#' @param n_needed vector of desired number of synthetic samples for each class.
#' A vector of integers for each class. Default is NULL meaning full balance.
#'
#' @details
#' In Safe-level SMOTE (SLS), a safe-level threshold is used to control the number of synthetic
#' samples generated from each minority instance. This threshold is calculated
#' based on the number of minority and majority instances in the local
#' neighborhood of each minority instance. SLS generates synthetic samples that
#' are located closer to the original minority class distribution in the feature
#' space.
#'
#' In Relocating safe-level SMOTE (RSLS), after generating synthetic samples
#' using the SLS algorithm, the algorithm relocates specific synthetic data
#' points that are deemed to be too close to the majority class distribution in
#' the feature space. The relocation process moves these synthetic data points
#' towards the original minority class distribution in the feature space.
#'
#' This relocation process is performed by first identifying the synthetic data
#' points that are too close to the majority class distribution. Then, for each
#' identified synthetic data point, the algorithm calculates a relocation vector
#' based on the distance between the synthetic data point and its k nearest
#' minority class instances. This relocation vector is used to move the
#' synthetic data point towards the minority class distribution in the feature
#' space.
#'
#' Can work with more than two classes.
#'
#' @return a list with resampled dataset.
#'  \item{x_new}{Resampled feature matrix.}
#'  \item{y_new}{Resampled target variable.}
#'  \item{x_syn}{Generated synthetic data.}
#'  \item{y_syn}{Class labels of the generated synthetic data.}
#'  \item{C}{A list named by class. Each element holds the number of synthetic
#'  samples generated for each sample of that class.}
#'
#' @author Fatih Saglam, saglamf89@gmail.com
#'
#' @importFrom  Rfast Dist
#' @importFrom  FNN knnx.index
#' @importFrom  stats rnorm
#' @importFrom  stats sd
#'
#' @references
#' Siriseriwan, W., & Sinapiromsaran, K. (2016). The effective redistribution
#' for imbalance dataset: Relocating safe-level SMOTE with minority outcast
#' handling. Chiang Mai J. Sci, 43(1), 234-246.
#'
#' @examples
#'
#' set.seed(1)
#' x <- rbind(matrix(rnorm(2000, 3, 1), ncol = 2, nrow = 1000),
#'            matrix(rnorm(100, 5, 1), ncol = 2, nrow = 50))
#' y <- as.factor(c(rep("negative", 1000), rep("positive", 50)))
#'
#' plot(x, col = y)
#'
#' # resampling
#' m <- RSLSMOTE(x = x, y = y, k1 = 5, k2 = 5)
#'
#' plot(m$x_new, col = m$y_new)
#'
#' @rdname RSLSMOTE
#' @export

RSLSMOTE <- function(x, y, k1 = 5, k2 = 5, n_needed = NULL) {

  # ---- Input validation ----------------------------------------------------
  # `&&` is the scalar, short-circuiting operator intended for `if` conditions.
  if (!is.data.frame(x) && !is.matrix(x)) {
    stop("x must be a matrix or dataframe")
  }

  if (is.data.frame(x)) {
    x <- as.matrix(x)
  }

  if (!is.factor(y)) {
    stop("y must be a factor")
  }

  # A `y` shorter than `nrow(x)` would be silently recycled by the logical row
  # subsetting below and assign rows to the wrong class, so reject it up front.
  if (nrow(x) != length(y)) {
    stop("x and y must have the same number of observations")
  }

  # Missing labels would turn the class counts (and hence `n_needed`) into NA.
  if (anyNA(y)) {
    stop("y must not contain missing values")
  }

  if (!is.numeric(k1)) {
    stop("k1 must be numeric")
  }

  if (k1 < 1) {
    stop("k1 must be positive")
  }

  if (!is.numeric(k2)) {
    stop("k2 must be numeric")
  }

  if (k2 < 1) {
    stop("k2 must be positive")
  }

  # ---- Per-class bookkeeping -----------------------------------------------
  # `x` is already a matrix at this point (validated / converted above).
  var_names <- colnames(x)
  p <- ncol(x)

  class_names <- levels(y)
  k_class <- length(class_names)
  # Named integer vector of class sizes; vapply keeps the type stable.
  n_classes <- vapply(class_names, function(m) sum(y == m), integer(1))
  # Rows of `x` split by class, in the order of `class_names`.
  x_classes <- lapply(class_names, function(m) x[y == m, , drop = FALSE])

  # Default: bring every class up to the size of the largest one.
  if (is.null(n_needed)) {
    n_needed <- max(n_classes) - n_classes
  }
  if (length(n_needed) != k_class) {
    stop("n_needed must be an integer vector matching the number of classes.")
  }

  # ---- Synthetic sample generation (one-vs-rest) ---------------------------
  x_syn <- matrix(NA, nrow = 0, ncol = p)
  y_syn <- factor(character(0), levels = class_names)
  C <- list()

  # seq_len() is safe when there are no classes (1:0 would iterate twice).
  for (j in seq_len(k_class)) {
    # Class j plays the "positive" role; all remaining classes are pooled
    # together as the "negative" set.
    m_syn <- generateRSLSMOTE(
      x_pos = x_classes[[j]],
      x_neg = do.call(rbind, x_classes[-j]),
      n_syn = n_needed[j],
      k1 = k1,
      k2 = k2,
      class_pos = class_names[j],
      class_names = class_names
    )

    x_syn <- rbind(x_syn, m_syn$x_syn)
    y_syn <- c(y_syn, m_syn$y_syn)

    C[[j]] <- m_syn$C
  }

  # ---- Assemble the resampled data set -------------------------------------
  # Original observations come first, followed by the synthetic ones.
  x_new <- rbind(x, x_syn)
  y_new <- c(y, y_syn)
  colnames(x_new) <- var_names
  names(C) <- class_names

  list(
    x_new = x_new,
    y_new = y_new,
    x_syn = x_syn,
    y_syn = y_syn,
    C = C
  )
}