R/sampling.R
In alookr: Model Classifier for Binary Classification

Documented in sampling_target

# The ubUnder(), ubOver(), and ubSMOTE() functions were taken from the unbalanced package and modified.
# Unbalanced packages have not been upgraded since 2015. 
# And it was removed from CRAN because it did not meet CRAN's criteria. 
# So I take the core logic and use it.
ubUnder <- function (X, Y, perc = 50) {
  stopifnot(all(unique(Y) %in% c(0, 1)))
  
  N <- length(Y)
  
  i_pos <- which(Y == 1)
  n_pos <- length(i_pos)
  
  i_neg <- which(Y == 0)
  n_neg <- length(i_neg)
  
  if (n_pos >= n_neg) {
    stop("less 0s instances than 1s, the minority class has to be class 1")
  }
  
  stopifnot(perc >= (n_pos/N * 100), perc <= 50)
  n_neg_sub <- floor(n_pos * (100 - perc)/perc)
  
  w <- rep(1/n_neg, n_neg)
  
  if (n_neg_sub <= n_neg) {
    i_neg_sub <- sample(i_neg, n_neg_sub, prob = w)
  } else {
    stop("subset of majoirty instances bigger than orginal set of majoirty instances")
  }
  
  idx <- c(i_neg_sub, i_pos)
  idx <- sort(idx)
  
  if (is.vector(X) != TRUE) {
    X <- X[idx, ]
  } else {
    X <- X[idx]
  }
  
  Y <- Y[idx]
  
  return(list(X = X, Y = Y))
}


ubOver <- function (X, Y, k = 0, verbose = TRUE) {
  stopifnot(k >= 0, class(verbose) == "logical", all(unique(Y) %in% c(0, 1)))
  
  i_pos <- which(Y == 1)
  n_pos <- length(i_pos)
  i_neg <- which(Y == 0)
  n_neg <- length(i_neg)
  
  max_k <- floor(n_neg/n_pos)
  
  if (k == 0) {
    i_pos_over <- sample(i_pos, n_neg, replace = TRUE)
  }
  
  if (k > 0) {
    n_pos_over <- n_pos * k
    
    if (n_pos_over > n_neg) {
      if (verbose) 
        cat("Max number of times allowed to replicate minority class is", 
            max_k, "\n taking as many samples as the majority class \n")
      n_pos_over <- n_neg
    }
    
    i_pos_over <- sample(i_pos, n_pos_over, replace = TRUE)
  }
  
  idx = c(i_neg, i_pos_over)
  idx <- sort(idx)
  
  if (is.vector(X) != TRUE) {
    X = X[idx, ]
  } else {
    X = X[idx]
  }
  
  Y = Y[idx]
  
  return(list(X = X, Y = Y))
}

#' @importFrom stats runif
ubSmoteExs <- function (data, N = 200, k = 5) {
  nomatr <- c()
  
  n_row <- NROW(data)
  n_col <- NCOL(data)
  
  mat <- matrix(nrow = n_row, ncol = n_col - 1)
  
  for (col in seq.int(NCOL(mat))) {
    obs <- data[, col]
    cl <- class(obs)
    
    if (cl %in% c("Date", "POSIXct", "POSIXt")) 
      stop("cannot SMOTE variables of class Date, POSIXct or POSIXt")
    
    if (cl %in% c("factor", "character")) {
      mat[, col] <- as.integer(obs)
      nomatr <- c(nomatr, col)
    } else {
      mat[, col] <- obs
    }
  }
  
  if (N < 100) {
    idx <- sample(1:n_row, as.integer((N/100) * n_row))
    mat <- mat[idx, ]
    
    N <- 100
  }
  
  p <- NCOL(mat)
  nmat <- NROW(mat)
  
  ranges <- apply(mat, 2, max) - apply(mat, 2, min)
  nexs <- as.integer(N/100)
  
  new <- matrix(nrow = nexs * nmat, ncol = p)
  
  for (i in 1:nmat) {
    xd <- scale(mat, mat[i, ], ranges)
    
    for (a in nomatr) {
      xd[, a] <- xd[, a] == 0
    } 
    
    dd <- drop(xd^2 %*% rep(1, ncol(xd)))
    kNNs <- order(dd)[2:(k + 1)]
    
    for (n in 1:nexs) {
      neig <- sample(1:k, 1)
      ex <- vector(length = ncol(mat))
      difs <- mat[kNNs[neig], ] - mat[i, ]
      new[(i - 1) * nexs + n, ] <- mat[i, ] + stats::runif(1) * difs
      
      for (a in nomatr) {
        new[(i - 1) * nexs + n, a] <- c(mat[kNNs[neig], a], mat[i, a])[1 + round(runif(1), 0)]
      } 
    }
  }
  
  newCases <- data.frame(new)
  
  for (a in nomatr) {
    newCases[, a] <- factor(newCases[, a], 
                            levels = 1:nlevels(data[, a]), 
                            labels = levels(data[, a]))
  } 
  
  newCases[, "Y"] <- factor(rep(data[1, "Y"], nrow(newCases)), 
                            levels = levels(data[, "Y"]))
  colnames(newCases) <- colnames(data)
  
  newCases
}


#' @importFrom dplyr bind_rows
ubSMOTE <- function (X, Y, perc.over = 200, k = 5, perc.under = 200) {
  if (!is.factor(Y)) 
    stop("Y has to be a factor")
  
  if (is.vector(X)) 
    stop("X cannot be a vector")
  
  data <- cbind(X, Y)
  idx_pos <- which(Y == 1)
  
  newExs <- ubSmoteExs(data[idx_pos, ], perc.over, k)
  
  is_narow <- apply(newExs, 1, function(x) any(is.na(x)))
  
  if (any(is_narow)) {
    newExs <- newExs[!is_narow, ]
    colnames(newExs) <- colnames(data)
    cat("WARNING: NAs generated by SMOmatE removed \n")
  }
  
  selMaj <- sample((1:NROW(data))[-idx_pos], 
                   as.integer((perc.under / 100) * nrow(newExs)), 
                   replace = TRUE)
  
  newdataset <- dplyr::bind_rows(data[selMaj, ], data[idx_pos, ], newExs)
  newdataset <- newdataset[sample(1:NROW(newdataset)), ]
  
  X <- newdataset[, -ncol(newdataset)]
  Y <- newdataset[, ncol(newdataset)]
  
  return(list(X = X, Y = Y))
}


#' Extract the data to fit the model
#'
#' @description To solve the imbalanced class, perform sampling in the train set of split_df.
#' @details In order to solve the problem of imbalanced class, sampling is performed by under sampling,
#' over sampling, SMOTE method.
#' @section attributes of train_df class:
#' The attributes of the train_df class are as follows.:
#'
#' \itemize{
#' \item sample_seed : integer. random seed used for sampling
#' \item method : character. sampling methods.
#' \item perc : integer. perc argument value
#' \item k : integer. k argument value
#' \item perc.over : integer. perc.over argument value
#' \item perc.under : integer. perc.under argument value
#' \item binary : logical. whether the target variable is a binary class
#' \item target : character. target variable name
#' \item minority : character. the level of the minority class
#' \item majority : character. the level of the majority class
#' }
#'
#' @param .data an object of class "split_df", usually, a result of a call to split_df().
#' @param method character. sampling methods. "ubUnder" is under-sampling,
#' and "ubOver" is over-sampling, "ubSMOTE" is SMOTE(Synthetic Minority Over-sampling TEchnique).
#' @param seed integer. random seed used for sampling
#' @param perc integer. The percentage of positive class in the final dataset.
#' It is used only in under-sampling. The default is 50. perc can not exceed 50.
#' @param k integer. It is used only in over-sampling and SMOTE.
#' If over-sampling and if K=0: sample with replacement from the minority class until
#' we have the same number of instances in each class. under-sampling and if K>0:
#' sample with replacement from the minority class until we have k-times
#' the original number of minority instances.
#' If SMOTE, the number of neighbours to consider as the pool from where the new
#' examples are generated
#' @param perc.over integer. It is used only in SMOTE. per.over/100 is the number of new instances
#' generated for each rare instance. If perc.over < 100 a single instance is generated.
#' @param perc.under integer. It is used only in SMOTE. perc.under/100 is the number
#' of "normal" (majority class) instances that are randomly selected for each smoted
#' observation.
#' @return An object of train_df.
#' @export
#' @examples
#' library(dplyr)
#'
#' # Credit Card Default Data
#' head(ISLR::Default)
#'
#' # Generate data for the example
#' sb <- ISLR::Default %>%
#'   split_by(default)
#'
#' # under-sampling with random seed
#' under <- sb %>%
#'   sampling_target(seed = 1234L)
#'
#' under %>%
#'   count(default)
#'
#' # under-sampling with random seed, and minority class frequency is 40%
#' under40 <- sb %>%
#'   sampling_target(seed = 1234L, perc = 40)
#'
#' under40 %>%
#'   count(default)
#'
#' # over-sampling with random seed
#' over <- sb %>%
#'   sampling_target(method = "ubOver", seed = 1234L)
#'
#' over %>%
#'   count(default)
#'
#' # over-sampling with random seed, and k = 10
#' over10 <- sb %>%
#'   sampling_target(method = "ubOver", seed = 1234L, k = 10)
#'
#' over10 %>%
#'   count(default)
#'
#' # SMOTE with random seed
#' smote <- sb %>%
#'   sampling_target(method = "ubSMOTE", seed = 1234L)
#'
#' smote %>%
#'   count(default)
#'
#' # SMOTE with random seed, and perc.under = 250
#' smote250 <- sb %>%
#'   sampling_target(method = "ubSMOTE", seed = 1234L, perc.under = 250)
#'
#' smote250 %>%
#'   count(default)
#'
#' @importFrom tidyselect vars_select
#' @importFrom rlang quos
#' @importFrom tibble as_tibble
#' @export
#'
sampling_target <- function(.data, method = c("ubUnder", "ubOver", "ubSMOTE"),
  seed = NULL, perc = 50, k = ifelse(method == "ubSMOTE", 5, 0),
  perc.over = 200, perc.under = 200) {
  if(is(.data) != "split_df") {
    stop("x is not split_df class")
  }

  target <- attr(.data, "target")
  target_idx <- .data %>%
    ungroup() %>%
    dplyr::select(-split_flag) %>%
    names(.) %in% target %>%
    which

  method <- match.arg(method)

  minority <- attr(.data, "minority")
  majority <- attr(.data, "majority")

  Y <- .data %>%
    filter(split_flag == "train") %>%
    ungroup() %>%
    dplyr::select(target = target_idx) %>%
    mutate(target = ifelse(target == minority, "1", "0") %>%
        factor(levels = c("1", "0"))) %>%
    pull()

  X <- .data %>%
    filter(split_flag == "train") %>%
    ungroup() %>%
    dplyr::select(-target, -split_flag)

  type_before <- sapply(X, function(x) is(x)[1])

  if (method %in% c("ubSMOTE")) {
    idx_ordered <- which(type_before == "ordered")

    if (length(idx_ordered) > 0) {
      for (i in idx_ordered) {
        X[, i] <- factor(pull(X[, i]), ordered = FALSE)
      }
    }

    idx_character <- which(type_before == "character")

    if (length(idx_character) > 0) {
      for (i in idx_character) {
        X[, i] <- factor(pull(X[, i]))
      }
    }
  }

  if (is.null(seed))
    seed <- sample(seq(1e5), size = 1)

  set.seed(seed)

  samples <- switch (method,
    ubUnder = ubUnder(X, Y, perc = perc),
    ubOver  = ubOver(X, Y, k = k),
    ubSMOTE = ubSMOTE(data.frame(X), Y, perc.over = perc.over, k = k,
                      perc.under = perc.under)
  )

  if (method == "ubUnder") {
    result <- .data %>%
      filter(split_flag == "train") %>%
      ungroup() %>%
      dplyr::select(-split_flag)
  } else if(method %in% c("ubOver", "ubSMOTE")) {
    Y <- factor(ifelse(samples$Y == 1, minority, majority))

    if (method == "ubSMOTE") {
      type_after <- sapply(samples$X, function(x) is(x)[1])

      change_int <- which((type_after != type_before) & (type_before == "integer"))

      if (length(change_int) > 0) {
        for (i in change_int) {
          samples$X[, i] <- as.integer(samples$X[, i])
        }
      }

      if (length(idx_ordered) > 0) {
        for (i in idx_ordered) {
          samples$X[, i] <- factor(samples$X[, i], ordered = TRUE)
        }
      }

      if (length(idx_character) > 0) {
        for (i in idx_character) {
          samples$X[, i] <- as.character(samples$X[, i])
        }
      }
    }

    smpl <- cbind(samples$X, Y)
    names(smpl)[NCOL(smpl)] <- attr(.data, "target")

    result <- smpl %>%
      mutate_at(attr(.data, "target"), function(x) Y) %>%
      tibble::as_tibble()

    if (target_idx == 1) {
      result <- result %>%
        dplyr::select(NCOL(result), 1:NCOL(samples$X))
    } else if (target_idx == NCOL(result)) {
      result <- result %>%
        dplyr::select(1:NCOL(samples$X), NCOL(result))
    } else {
      result <- result %>%
        dplyr::select(1:(target_idx - 1), NCOL(result),
          (target_idx + 1):NCOL(samples$X))
    }

    names(result)[target_idx] <- target
  }

  attr(result , "sample_seed") <- seed
  attr(result , "method") <- method
  attr(result , "perc") <- perc
  attr(result , "k") <- k
  attr(result , "perc.over") <- perc.over
  attr(result , "perc.under") <- perc.under
  attr(result , "binary") <- attr(.data , "binary")
  attr(result , "target") <- attr(.data , "target")
  attr(result , "minority") <- attr(.data , "minority")
  attr(result , "majority") <- attr(.data , "majority")

  class(result) <- append("train_df", class(result))

  result
}