#' @title Safe-level Synthetic Minority Oversampling Technique
#'
#' @description \code{SLSMOTE()} generates synthetic samples by considering a
#' safe level of the nearest minority class examples.
#'
#' @param x feature matrix or data.frame.
#' @param y a factor class variable with two classes.
#' @param k1 number of neighbors to link. Default is 5.
#' @param k2 number of neighbors to determine safe levels. Default is 5.
#'
#' @details
#' SLSMOTE assigns each minority class sample a safe level and only
#' oversamples the samples that are safe. The safe level of a sample is the
#' number of minority class samples among its \code{k2} nearest neighbors in
#' the whole dataset. A minority class sample is considered safe to oversample
#' if its safe level is greater than zero; samples whose neighborhoods contain
#' no other minority class examples are treated as noise and are skipped.
#'
#' Restricting the oversampling process to the safe minority class samples
#' avoids generating noisy samples that can lead to overfitting. For each safe
#' minority class sample, SLSMOTE selects one of its \code{k1} nearest
#' minority class neighbors at random and generates a synthetic sample by
#' adding a random proportion of the difference between the sample and the
#' selected neighbor to the sample. The range of this proportion depends on
#' the safe levels of the sample and the neighbor, so that synthetic samples
#' are placed closer to the safer of the two.
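#'
#' As a sketch of the interpolation step (notation ours, following the code
#' below): for a safe sample \eqn{x_i} with safe level \eqn{sl_i} and a
#' selected neighbor \eqn{x_j} with safe level \eqn{sl_j}, the synthetic
#' sample is
#' \deqn{x_{syn} = x_i + r (x_j - x_i)}
#' where \eqn{r \sim U(0, 1)} if \eqn{sl_j = sl_i},
#' \eqn{r \sim U(0, sl_j/sl_i)} if \eqn{0 < sl_j < sl_i},
#' \eqn{r \sim U(1 - sl_i/sl_j, 1)} if \eqn{sl_j > sl_i}, and
#' \eqn{r = 0} (a duplicate of \eqn{x_i}) if \eqn{sl_j = 0}.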
#'
#' Note: Much faster than \code{smotefamily::SLS()}.
#'
#' @return a list with the resampled dataset.
#' \item{x_new}{Resampled feature matrix.}
#' \item{y_new}{Resampled target variable.}
#' \item{x_syn}{Generated synthetic data.}
#' \item{C}{Number of synthetic samples generated for each positive class sample.}
#'
#' @author Fatih Saglam, saglamf89@gmail.com
#'
#' @importFrom FNN knnx.index
#' @importFrom stats runif
#'
#' @references
#' Bunkhumpornpat, C., Sinapiromsaran, K., & Lursinsap, C. (2009).
#' Safe-level-smote: Safe-level-synthetic minority over-sampling technique for
#' handling the class imbalanced problem. In Advances in Knowledge Discovery
#' and Data Mining: 13th Pacific-Asia Conference, PAKDD 2009 Bangkok, Thailand,
#' April 27-30, 2009 Proceedings 13 (pp. 475-482). Springer Berlin Heidelberg.
#'
#' @examples
#'
#' set.seed(1)
#' x <- rbind(matrix(rnorm(2000, 3, 1), ncol = 2, nrow = 1000),
#'            matrix(rnorm(100, 5, 1), ncol = 2, nrow = 50))
#' y <- as.factor(c(rep("negative", 1000), rep("positive", 50)))
#'
#' plot(x, col = y)
#'
#' # resampling
#' m <- SLSMOTE(x = x, y = y, k1 = 5, k2 = 5)
#'
#' plot(m$x_new, col = m$y_new)
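#'
#' # class counts after resampling (classes are now balanced)
#' table(m$y_new)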
#'
#' @rdname SLSMOTE
#' @export
SLSMOTE <- function(x, y, k1 = 5, k2 = 5) {
  if (!is.data.frame(x) && !is.matrix(x)) {
    stop("x must be a matrix or data.frame")
  }
  if (is.data.frame(x)) {
    x <- as.matrix(x)
  }
  if (!is.factor(y)) {
    stop("y must be a factor")
  }
  if (!is.numeric(k1)) {
    stop("k1 must be numeric")
  }
  if (k1 < 1) {
    stop("k1 must be positive")
  }
  if (!is.numeric(k2)) {
    stop("k2 must be numeric")
  }
  if (k2 < 1) {
    stop("k2 must be positive")
  }
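  # identify the minority (positive) and majority (negative) classes and
  # split the data, placing minority samples first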
  var_names <- colnames(x)
  x <- as.matrix(x)
  p <- ncol(x)
  class_names <- as.character(unique(y))
  class_pos <- names(which.min(table(y)))
  class_neg <- class_names[class_names != class_pos]
  x_pos <- x[y == class_pos, , drop = FALSE]
  x_neg <- x[y == class_neg, , drop = FALSE]
  n_pos <- nrow(x_pos)
  n_neg <- nrow(x_neg)
  x <- rbind(x_pos, x_neg)
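  # k2 nearest neighbors in the full data determine the safe level of each
  # minority sample (count of minority neighbors); k1 nearest minority
  # neighbors are used for interpolation; the first neighbor (the sample
  # itself) is dropped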
  nn_pos2all <- FNN::knnx.index(data = x, query = x_pos, k = k2 + 1)[, -1]
  nn_pos2pos <- FNN::knnx.index(data = x_pos, query = x_pos, k = k1 + 1)[, -1]
  nn_pos2all_classcounts <- cbind(
    rowSums(nn_pos2all <= n_pos),
    rowSums(nn_pos2all > n_pos)
  )
  safe_levels <- nn_pos2all_classcounts[, 1]
  i_safe <- which(safe_levels > 0)
  x_pos_safe <- x_pos[i_safe, , drop = FALSE]
  n_safe <- nrow(x_pos_safe)
  if (n_safe == 0) {
    stop("no safe minority class samples found, consider increasing k2")
  }
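  # number of synthetic samples needed to balance the classes, distributed
  # as evenly as possible over the safe minority samples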
  n_syn <- n_neg - n_pos
  C <- rep(0, n_pos)
  C[i_safe] <- rep(ceiling(n_syn/n_safe) - 1, n_safe)
  n_diff <- n_syn - sum(C)
  if (n_diff > 0) {
    ii <- sample(seq_len(n_safe), size = n_diff)
    C[i_safe][ii] <- C[i_safe][ii] + 1
  }
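  # generate C[i] synthetic samples for each safe minority sample along the
  # segments to randomly chosen k1-nearest minority neighbors; the allowed
  # interpolation range depends on the relative safe levels (see Details)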
  x_syn <- matrix(nrow = 0, ncol = p)
  for (i in 1:n_pos) {
    if (safe_levels[i] > 0 && C[i] > 0) {
      i_k <- sample(1:k1, C[i], replace = TRUE)
      i_nn_pos2pos <- nn_pos2pos[i, i_k]
      k_safe_levels <- safe_levels[i_nn_pos2pos]
      r <- rep(0, C[i])
      for (j in 1:C[i]) {
        if (k_safe_levels[j] == 0) {
          r[j] <- 0
        } else if (k_safe_levels[j] == safe_levels[i]) {
          r[j] <- runif(1, 0, 1)
        } else if (k_safe_levels[j] < safe_levels[i]) {
          r[j] <- runif(1, 0, k_safe_levels[j]/safe_levels[i])
        } else {
          r[j] <- runif(1, 1 - safe_levels[i]/k_safe_levels[j], 1)
        }
      }
      x_pos_step <- x_pos[rep(i, C[i]), , drop = FALSE]
      x_pos_k <- x_pos[i_nn_pos2pos, , drop = FALSE]
      x_syn_step <- x_pos_step + (x_pos_k - x_pos_step) * r
      x_syn <- rbind(x_syn, x_syn_step)
    }
  }
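  # combine synthetic, minority and majority samples into the resampled data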
  x_new <- rbind(
    x_syn,
    x_pos,
    x_neg
  )
  y_new <- c(
    rep(class_pos, n_syn + n_pos),
    rep(class_neg, n_neg)
  )
  y_new <- factor(y_new, levels = levels(y), labels = levels(y))
  colnames(x_new) <- var_names
  return(list(
    x_new = x_new,
    y_new = y_new,
    x_syn = x_new[seq_len(n_syn), , drop = FALSE],
    C = C
  ))
}