# The ubUnder(), ubOver(), and ubSMOTE() functions were taken from the unbalanced package and modified.
# Unbalanced packages have not been upgraded since 2015.
# And it was removed from CRAN because it did not meet CRAN's criteria.
# So I take the core logic and use it.
ubUnder <- function (X, Y, perc = 50) {
stopifnot(all(unique(Y) %in% c(0, 1)))
N <- length(Y)
i_pos <- which(Y == 1)
n_pos <- length(i_pos)
i_neg <- which(Y == 0)
n_neg <- length(i_neg)
if (n_pos >= n_neg) {
stop("less 0s instances than 1s, the minority class has to be class 1")
}
stopifnot(perc >= (n_pos/N * 100), perc <= 50)
n_neg_sub <- floor(n_pos * (100 - perc)/perc)
w <- rep(1/n_neg, n_neg)
if (n_neg_sub <= n_neg) {
i_neg_sub <- sample(i_neg, n_neg_sub, prob = w)
} else {
stop("subset of majoirty instances bigger than orginal set of majoirty instances")
}
idx <- c(i_neg_sub, i_pos)
idx <- sort(idx)
if (is.vector(X) != TRUE) {
X <- X[idx, ]
} else {
X <- X[idx]
}
Y <- Y[idx]
return(list(X = X, Y = Y))
}
ubOver <- function (X, Y, k = 0, verbose = TRUE) {
stopifnot(k >= 0, class(verbose) == "logical", all(unique(Y) %in% c(0, 1)))
i_pos <- which(Y == 1)
n_pos <- length(i_pos)
i_neg <- which(Y == 0)
n_neg <- length(i_neg)
max_k <- floor(n_neg/n_pos)
if (k == 0) {
i_pos_over <- sample(i_pos, n_neg, replace = TRUE)
}
if (k > 0) {
n_pos_over <- n_pos * k
if (n_pos_over > n_neg) {
if (verbose)
cat("Max number of times allowed to replicate minority class is",
max_k, "\n taking as many samples as the majority class \n")
n_pos_over <- n_neg
}
i_pos_over <- sample(i_pos, n_pos_over, replace = TRUE)
}
idx = c(i_neg, i_pos_over)
idx <- sort(idx)
if (is.vector(X) != TRUE) {
X = X[idx, ]
} else {
X = X[idx]
}
Y = Y[idx]
return(list(X = X, Y = Y))
}
#' @importFrom stats runif
ubSmoteExs <- function (data, N = 200, k = 5) {
nomatr <- c()
n_row <- NROW(data)
n_col <- NCOL(data)
mat <- matrix(nrow = n_row, ncol = n_col - 1)
for (col in seq.int(NCOL(mat))) {
obs <- data[, col]
cl <- class(obs)
if (cl %in% c("Date", "POSIXct", "POSIXt"))
stop("cannot SMOTE variables of class Date, POSIXct or POSIXt")
if (cl %in% c("factor", "character")) {
mat[, col] <- as.integer(obs)
nomatr <- c(nomatr, col)
} else {
mat[, col] <- obs
}
}
if (N < 100) {
idx <- sample(1:n_row, as.integer((N/100) * n_row))
mat <- mat[idx, ]
N <- 100
}
p <- NCOL(mat)
nmat <- NROW(mat)
ranges <- apply(mat, 2, max) - apply(mat, 2, min)
nexs <- as.integer(N/100)
new <- matrix(nrow = nexs * nmat, ncol = p)
for (i in 1:nmat) {
xd <- scale(mat, mat[i, ], ranges)
for (a in nomatr) {
xd[, a] <- xd[, a] == 0
}
dd <- drop(xd^2 %*% rep(1, ncol(xd)))
kNNs <- order(dd)[2:(k + 1)]
for (n in 1:nexs) {
neig <- sample(1:k, 1)
ex <- vector(length = ncol(mat))
difs <- mat[kNNs[neig], ] - mat[i, ]
new[(i - 1) * nexs + n, ] <- mat[i, ] + stats::runif(1) * difs
for (a in nomatr) {
new[(i - 1) * nexs + n, a] <- c(mat[kNNs[neig], a], mat[i, a])[1 + round(runif(1), 0)]
}
}
}
newCases <- data.frame(new)
for (a in nomatr) {
newCases[, a] <- factor(newCases[, a],
levels = 1:nlevels(data[, a]),
labels = levels(data[, a]))
}
newCases[, "Y"] <- factor(rep(data[1, "Y"], nrow(newCases)),
levels = levels(data[, "Y"]))
colnames(newCases) <- colnames(data)
newCases
}
#' @importFrom dplyr bind_rows
ubSMOTE <- function (X, Y, perc.over = 200, k = 5, perc.under = 200) {
if (!is.factor(Y))
stop("Y has to be a factor")
if (is.vector(X))
stop("X cannot be a vector")
data <- cbind(X, Y)
idx_pos <- which(Y == 1)
newExs <- ubSmoteExs(data[idx_pos, ], perc.over, k)
is_narow <- apply(newExs, 1, function(x) any(is.na(x)))
if (any(is_narow)) {
newExs <- newExs[!is_narow, ]
colnames(newExs) <- colnames(data)
cat("WARNING: NAs generated by SMOmatE removed \n")
}
selMaj <- sample((1:NROW(data))[-idx_pos],
as.integer((perc.under / 100) * nrow(newExs)),
replace = TRUE)
newdataset <- dplyr::bind_rows(data[selMaj, ], data[idx_pos, ], newExs)
newdataset <- newdataset[sample(1:NROW(newdataset)), ]
X <- newdataset[, -ncol(newdataset)]
Y <- newdataset[, ncol(newdataset)]
return(list(X = X, Y = Y))
}
#' Extract the data to fit the model
#'
#' @description To solve the imbalanced class, perform sampling in the train set of split_df.
#' @details In order to solve the problem of imbalanced class, sampling is performed by under sampling,
#' over sampling, SMOTE method.
#' @section attributes of train_df class:
#' The attributes of the train_df class are as follows.:
#'
#' \itemize{
#' \item sample_seed : integer. random seed used for sampling
#' \item method : character. sampling methods.
#' \item perc : integer. perc argument value
#' \item k : integer. k argument value
#' \item perc.over : integer. perc.over argument value
#' \item perc.under : integer. perc.under argument value
#' \item binary : logical. whether the target variable is a binary class
#' \item target : character. target variable name
#' \item minority : character. the level of the minority class
#' \item majority : character. the level of the majority class
#' }
#'
#' @param .data an object of class "split_df", usually, a result of a call to split_df().
#' @param method character. sampling methods. "ubUnder" is under-sampling,
#' and "ubOver" is over-sampling, "ubSMOTE" is SMOTE(Synthetic Minority Over-sampling TEchnique).
#' @param seed integer. random seed used for sampling
#' @param perc integer. The percentage of positive class in the final dataset.
#' It is used only in under-sampling. The default is 50. perc can not exceed 50.
#' @param k integer. It is used only in over-sampling and SMOTE.
#' If over-sampling and if K=0: sample with replacement from the minority class until
#' we have the same number of instances in each class. under-sampling and if K>0:
#' sample with replacement from the minority class until we have k-times
#' the original number of minority instances.
#' If SMOTE, the number of neighbours to consider as the pool from where the new
#' examples are generated
#' @param perc.over integer. It is used only in SMOTE. per.over/100 is the number of new instances
#' generated for each rare instance. If perc.over < 100 a single instance is generated.
#' @param perc.under integer. It is used only in SMOTE. perc.under/100 is the number
#' of "normal" (majority class) instances that are randomly selected for each smoted
#' observation.
#' @return An object of train_df.
#' @export
#' @examples
#' library(dplyr)
#'
#' # Credit Card Default Data
#' head(ISLR::Default)
#'
#' # Generate data for the example
#' sb <- ISLR::Default %>%
#' split_by(default)
#'
#' # under-sampling with random seed
#' under <- sb %>%
#' sampling_target(seed = 1234L)
#'
#' under %>%
#' count(default)
#'
#' # under-sampling with random seed, and minority class frequency is 40%
#' under40 <- sb %>%
#' sampling_target(seed = 1234L, perc = 40)
#'
#' under40 %>%
#' count(default)
#'
#' # over-sampling with random seed
#' over <- sb %>%
#' sampling_target(method = "ubOver", seed = 1234L)
#'
#' over %>%
#' count(default)
#'
#' # over-sampling with random seed, and k = 10
#' over10 <- sb %>%
#' sampling_target(method = "ubOver", seed = 1234L, k = 10)
#'
#' over10 %>%
#' count(default)
#'
#' # SMOTE with random seed
#' smote <- sb %>%
#' sampling_target(method = "ubSMOTE", seed = 1234L)
#'
#' smote %>%
#' count(default)
#'
#' # SMOTE with random seed, and perc.under = 250
#' smote250 <- sb %>%
#' sampling_target(method = "ubSMOTE", seed = 1234L, perc.under = 250)
#'
#' smote250 %>%
#' count(default)
#'
#' @importFrom tidyselect vars_select
#' @importFrom rlang quos
#' @importFrom tibble as_tibble
#' @export
#'
sampling_target <- function(.data, method = c("ubUnder", "ubOver", "ubSMOTE"),
seed = NULL, perc = 50, k = ifelse(method == "ubSMOTE", 5, 0),
perc.over = 200, perc.under = 200) {
if(is(.data) != "split_df") {
stop("x is not split_df class")
}
target <- attr(.data, "target")
target_idx <- .data %>%
ungroup() %>%
dplyr::select(-split_flag) %>%
names(.) %in% target %>%
which
method <- match.arg(method)
minority <- attr(.data, "minority")
majority <- attr(.data, "majority")
Y <- .data %>%
filter(split_flag == "train") %>%
ungroup() %>%
dplyr::select(target = target_idx) %>%
mutate(target = ifelse(target == minority, "1", "0") %>%
factor(levels = c("1", "0"))) %>%
pull()
X <- .data %>%
filter(split_flag == "train") %>%
ungroup() %>%
dplyr::select(-target, -split_flag)
type_before <- sapply(X, function(x) is(x)[1])
if (method %in% c("ubSMOTE")) {
idx_ordered <- which(type_before == "ordered")
if (length(idx_ordered) > 0) {
for (i in idx_ordered) {
X[, i] <- factor(pull(X[, i]), ordered = FALSE)
}
}
idx_character <- which(type_before == "character")
if (length(idx_character) > 0) {
for (i in idx_character) {
X[, i] <- factor(pull(X[, i]))
}
}
}
if (is.null(seed))
seed <- sample(seq(1e5), size = 1)
set.seed(seed)
samples <- switch (method,
ubUnder = ubUnder(X, Y, perc = perc),
ubOver = ubOver(X, Y, k = k),
ubSMOTE = ubSMOTE(data.frame(X), Y, perc.over = perc.over, k = k,
perc.under = perc.under)
)
if (method == "ubUnder") {
result <- .data %>%
filter(split_flag == "train") %>%
ungroup() %>%
dplyr::select(-split_flag)
} else if(method %in% c("ubOver", "ubSMOTE")) {
Y <- factor(ifelse(samples$Y == 1, minority, majority))
if (method == "ubSMOTE") {
type_after <- sapply(samples$X, function(x) is(x)[1])
change_int <- which((type_after != type_before) & (type_before == "integer"))
if (length(change_int) > 0) {
for (i in change_int) {
samples$X[, i] <- as.integer(samples$X[, i])
}
}
if (length(idx_ordered) > 0) {
for (i in idx_ordered) {
samples$X[, i] <- factor(samples$X[, i], ordered = TRUE)
}
}
if (length(idx_character) > 0) {
for (i in idx_character) {
samples$X[, i] <- as.character(samples$X[, i])
}
}
}
smpl <- cbind(samples$X, Y)
names(smpl)[NCOL(smpl)] <- attr(.data, "target")
result <- smpl %>%
mutate_at(attr(.data, "target"), function(x) Y) %>%
tibble::as_tibble()
if (target_idx == 1) {
result <- result %>%
dplyr::select(NCOL(result), 1:NCOL(samples$X))
} else if (target_idx == NCOL(result)) {
result <- result %>%
dplyr::select(1:NCOL(samples$X), NCOL(result))
} else {
result <- result %>%
dplyr::select(1:(target_idx - 1), NCOL(result),
(target_idx + 1):NCOL(samples$X))
}
names(result)[target_idx] <- target
}
attr(result , "sample_seed") <- seed
attr(result , "method") <- method
attr(result , "perc") <- perc
attr(result , "k") <- k
attr(result , "perc.over") <- perc.over
attr(result , "perc.under") <- perc.under
attr(result , "binary") <- attr(.data , "binary")
attr(result , "target") <- attr(.data , "target")
attr(result , "minority") <- attr(.data , "minority")
attr(result , "majority") <- attr(.data , "majority")
class(result) <- append("train_df", class(result))
result
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.