# Nothing
#' Apply borderline-SMOTE Algorithm
#'
#' `step_bsmote()` creates a *specification* of a recipe step that generates
#' new examples of the minority class using nearest neighbors of these cases
#' in the border region between classes.
#'
#' @inheritParams recipes::step_center
#' @inheritParams step_upsample
#' @param ... One or more selector functions to choose which
#' variable is used to sample the data. See [selections()]
#' for more details. The selection should result in a _single
#' factor variable_. For the `tidy` method, these are not
#' currently used.
#' @param role Not used by this step since no new variables are
#' created.
#' @param column A character string of the variable name that will
#' be populated (eventually) by the `...` selectors.
#' @param neighbors An integer. Number of nearest neighbors that are used
#' to generate the new examples of the minority class.
#' @param all_neighbors A logical selecting which of the two borderline-SMOTE
#' methods is used. Defaults to `FALSE`. See details.
#' @param seed An integer that will be used as the seed when
#' smote-ing.
#' @return An updated version of `recipe` with the new step
#' added to the sequence of existing steps (if any). For the
#' `tidy` method, a tibble with columns `terms` which is
#' the variable used to sample.
#'
#' @details
#' This method works the same way as [step_smote()], except that instead of
#' generating points around every point of the minority class, each point is
#' first classified into the boxes "danger" and "not". For each point the
#' k nearest neighbors are calculated. If all the neighbors come from a
#' different class, the point is labeled noise and put into the "not" box. If
#' more than half of the neighbors come from a different class, it is labeled
#' "danger". Points will be generated around points labeled "danger".
#'
#' If all_neighbors = FALSE then points will be generated between nearest
#' neighbors in its own class. If all_neighbors = TRUE then points will be
#' generated between any nearest neighbors. See examples for visualization.
#'
#' The parameter `neighbors` controls the way the new examples are created.
#' For each currently existing minority class example X new examples will be
#' created (this is controlled by the parameter `over_ratio` as mentioned
#' above). These examples will be generated by using the information from the
#' `neighbors` nearest neighbors of each example of the minority class.
#' The parameter `neighbors` controls how many of these neighbors are used.
#'
#' All columns in the data are sampled and returned by [juice()]
#' and [bake()].
#'
#' All columns used in this step must be numeric with no missing data.
#'
#' When used in modeling, users should strongly consider using the
#' option `skip = TRUE` so that the extra sampling is _not_
#' conducted outside of the training set.
#'
#' # Tidying
#'
#' When you [`tidy()`][tidy.recipe()] this step, a tibble with columns `terms`
#' (the selectors or variables selected) will be returned.
#'
#' ```{r, echo = FALSE, results="asis"}
#' step <- "step_bsmote"
#' result <- knitr::knit_child("man/rmd/tunable-args.Rmd")
#' cat(result)
#' ```
#'
#' @template case-weights-not-supported
#'
#' @references Hui Han, Wen-Yuan Wang, and Bing-Huan Mao. Borderline-smote:
#' a new over-sampling method in imbalanced data sets learning. In
#' International Conference on Intelligent Computing, pages 878–887. Springer,
#' 2005.
#'
#' @seealso [bsmote()] for direct implementation
#' @family Steps for over-sampling
#'
#' @export
#' @examples
#' library(recipes)
#' library(modeldata)
#' data(hpc_data)
#'
#' hpc_data0 <- hpc_data %>%
#' select(-protocol, -day)
#'
#' orig <- count(hpc_data0, class, name = "orig")
#' orig
#'
#' up_rec <- recipe(class ~ ., data = hpc_data0) %>%
#' # Bring the minority levels up to about 1000 each
#' # 1000/2211 is approx 0.4523
#' step_bsmote(class, over_ratio = 0.4523) %>%
#' prep()
#'
#' training <- up_rec %>%
#' bake(new_data = NULL) %>%
#' count(class, name = "training")
#' training
#'
#' # Since `skip` defaults to TRUE, baking the step has no effect
#' baked <- up_rec %>%
#' bake(new_data = hpc_data0) %>%
#' count(class, name = "baked")
#' baked
#'
#' # Note that if the original data contained more rows than the
#' # target n (= ratio * majority_n), the data are left alone:
#' orig %>%
#' left_join(training, by = "class") %>%
#' left_join(baked, by = "class")
#'
#' library(ggplot2)
#'
#' ggplot(circle_example, aes(x, y, color = class)) +
#' geom_point() +
#' labs(title = "Without SMOTE")
#'
#' recipe(class ~ x + y, data = circle_example) %>%
#' step_bsmote(class, all_neighbors = FALSE) %>%
#' prep() %>%
#' bake(new_data = NULL) %>%
#' ggplot(aes(x, y, color = class)) +
#' geom_point() +
#' labs(title = "With borderline-SMOTE, all_neighbors = FALSE")
#'
#' recipe(class ~ x + y, data = circle_example) %>%
#' step_bsmote(class, all_neighbors = TRUE) %>%
#' prep() %>%
#' bake(new_data = NULL) %>%
#' ggplot(aes(x, y, color = class)) +
#' geom_point() +
#' labs(title = "With borderline-SMOTE, all_neighbors = TRUE")
step_bsmote <-
  function(recipe, ..., role = NA, trained = FALSE,
           column = NULL, over_ratio = 1, neighbors = 5, all_neighbors = FALSE,
           skip = TRUE, seed = sample.int(10^5, 1), id = rand_id("bsmote")) {
    # Capture the user's selectors as quosures; they are resolved against the
    # training data later, in prep().
    selectors <- enquos(...)

    # The step object is built inside the add_step() call on purpose: `seed`
    # and `id` have lazily evaluated defaults, so the nesting keeps the order
    # in which they are forced exactly as it was.
    add_step(
      recipe,
      step_bsmote_new(
        terms = selectors,
        role = role,
        trained = trained,
        column = column,
        over_ratio = over_ratio,
        neighbors = neighbors,
        all_neighbors = all_neighbors,
        # Predictors are unknown until the step is trained.
        predictors = NULL,
        skip = skip,
        seed = seed,
        id = id
      )
    )
  }
# Low-level constructor for a `step_bsmote` object.
#
# Forwards every argument to `step()` with `subclass = "bsmote"` and returns
# whatever `step()` returns. No validation is done here.
#
# Each field is passed exactly once. Previously `id` was supplied twice, so
# the resulting object carried two `id` entries.
step_bsmote_new <-
  function(terms, role, trained, column, over_ratio, neighbors, all_neighbors,
           predictors, skip, seed, id) {
    step(
      subclass = "bsmote",
      terms = terms,
      role = role,
      trained = trained,
      column = column,
      over_ratio = over_ratio,
      neighbors = neighbors,
      all_neighbors = all_neighbors,
      predictors = predictors,
      skip = skip,
      # NOTE(review): `id` is intentionally still listed before `seed`. Both
      # can be lazily evaluated defaults coming from step_bsmote(); assuming
      # step() forces its arguments in order, this keeps the order of those
      # evaluations unchanged - confirm against recipes::step().
      id = id,
      seed = seed
    )
  }
#' @export
prep.step_bsmote <- function(x, training, info = NULL, ...) {
  # Resolve the step's selectors against the training data.
  target_col <- recipes_eval_select(x$terms, training, info)
  n_selected <- length(target_col)

  # At most one column may be selected; when one is, it must be a factor.
  if (n_selected > 1) {
    rlang::abort("The selector should select at most a single variable")
  } else if (n_selected == 1) {
    check_column_factor(training, target_col)
  }

  # Every predictor other than the selected column must be numeric, and
  # neither the selected column nor the predictors may contain missing values.
  predictor_cols <- setdiff(get_from_info(info, "predictor"), target_col)
  check_type(training[, predictor_cols], types = c("double", "integer"))
  checked_cols <- c(target_col, predictor_cols)
  check_na(select(training, all_of(checked_cols)))

  # Return the trained copy of the step with the resolved column names stored.
  step_bsmote_new(
    terms = x$terms,
    role = x$role,
    trained = TRUE,
    column = target_col,
    over_ratio = x$over_ratio,
    neighbors = x$neighbors,
    all_neighbors = x$all_neighbors,
    predictors = predictor_cols,
    skip = x$skip,
    seed = x$seed,
    id = x$id
  )
}
#' @export
bake.step_bsmote <- function(object, new_data, ...) {
  # Columns the step needs: the stored predictors plus the sampled column.
  needed_cols <- unique(c(object$predictors, object$column))
  check_new_data(needed_cols, object, new_data)

  # Empty selection: nothing to oversample, hand the data back untouched.
  if (length(object$column) == 0L) {
    return(new_data)
  }

  new_data <- as.data.frame(new_data)
  smote_input <- new_data[, needed_cols]

  # Run borderline-SMOTE under the stored seed for reproducibility;
  # with_seed() hands back the value of `code`.
  synthetic_data <- with_seed(
    seed = object$seed,
    code = as_tibble(
      bsmote_impl(
        smote_input,
        object$column,
        k = object$neighbors,
        over_ratio = object$over_ratio,
        all_neighbors = object$all_neighbors
      )
    )
  )

  # Combine the generated rows with the original data.
  na_splice(new_data, synthetic_data, object)
}
#' @export
print.step_bsmote <-
  function(x, width = max(20, options()$width - 26), ...) {
    # Delegate the formatting to print_step(), passing the fixed title prefix
    # for this step, then return the step invisibly.
    print_step(
      x$column,
      x$terms,
      x$trained,
      "BorderlineSMOTE based on ",
      width
    )
    invisible(x)
  }
#' @rdname tidy.recipe
#' @param x A `step_bsmote` object.
#' @export
tidy.step_bsmote <- function(x, ...) {
  # A trained step reports the resolved column; an untrained one reports the
  # selectors converted to character.
  term_values <- if (is_trained(x)) x$column else sel2char(x$terms)

  out <- tibble(terms = unname(term_values))
  out$id <- x$id
  out
}
#' @export
#' @rdname tunable_themis
tunable.step_bsmote <- function(x, ...) {
  # Each tunable argument is paired with the dials function of the same name.
  arg_names <- c("over_ratio", "neighbors", "all_neighbors")
  dials_calls <- lapply(
    arg_names,
    function(arg) list(pkg = "dials", fun = arg)
  )

  tibble::tibble(
    name = arg_names,
    call_info = dials_calls,
    source = "recipe",
    component = "step_bsmote",
    component_id = x$id
  )
}
#' @rdname required_pkgs.step
#' @export
required_pkgs.step_bsmote <- function(x, ...) {
  # The only package reported for this step.
  "themis"
}
# Any scripts or data that you put into this service are public.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.