# R/ADASYNPara.R

#' Generate samples by ADASYN approach.
#' 
#' @param p minority class samples
#' @param n majority class samples
#' @param nTarget the targeted number of samples to achieve
#' @param k number of nearest neighbours in k-NN used by the ADASYN algorithm, with the default value of 5
#' @param m seeds from the positive class in m-NN used by the ADASYN algorithm, with the default value of 15
#' @return sampleADA, a matrix of the generated samples with one sample per column
#' @importFrom fields rdist 
#' @importFrom stats runif 
#' @importFrom parallel makeCluster stopCluster detectCores
#' @importFrom doParallel registerDoParallel
#' @importFrom foreach foreach %dopar%
#' @keywords internal

ADASYNPara <- function(p, n, nTarget, k = 5, m = 15) {
  # Generate samples by ADASYN, building the synthetic samples in parallel.
  #
  # Args:
  #   p:       The minority class samples, one sample per column.
  #   n:       The majority class samples. P and N must have the same feature dimension, greater than one,
  #            with no missing values.
  #   nTarget: The targeted number of samples to achieve.
  #   k:       k-NN used in the ADASYN algorithm, with the default value of 5.
  #   m:       m-NN used in ADASYN, finding seeds from the Positive Class, with the default value of 15.
  #
  # Returns:
  #   The ADASYN oversampled dataset sampleADA: a matrix with one generated
  #   sample per column.
  #
  # Raises:
  #   An error when p has no columns. A warning when k has to be reduced
  #   because there are fewer than k + 1 minority samples.
  nt <- ncol(p)  # NT is number of samples in P
  if (nt == 0) {
    stop ("The minority class is empty")
  }
  if (nt == 1) {
    # A single sample has no neighbour to interpolate with, so it is duplicated.
    return(kronecker(matrix(1, 1, nTarget), p))
  }

  if (k > nt - 1) {
    k <- nt - 1  # number of nearest neighbours can not be greater than nt-1
    warning ("The minority class instances is not enough. k is set to ", k)
  }
  numAtt <- nrow(p)  # Feature dimension
  ratio <- FindRatioPara(p, n, m)  # share of the new samples seeded by each positive sample
  no <- round(nTarget * ratio)  # number of new samples seeded by each positive sample

  # Rounding can make sum(no) miss nTarget; push the difference onto the
  # currently largest entry (clamped at 0) until the total matches.
  while (sum(no) != nTarget) {
    ind <- which.max(no)
    diff <- nTarget - sum(no)
    if (no[ind] + diff > 0) {
      no[ind] <- no[ind] + diff
    } else {
      no[ind] <- 0
    }
  }

  nlen <- length(no)  # number of positive samples
  # Local binding for the foreach iterator; presumably here to keep
  # R CMD check from reporting an undefined global variable.
  i <- 0

  # detectCores() may return NA, and "cores - 1" is 0 on a single-core host;
  # makeCluster() needs at least one worker. More workers than tasks is waste.
  nCores <- detectCores(logical = FALSE) - 1
  if (is.na(nCores) || nCores < 1) {
    nCores <- 1
  }
  cl <- makeCluster(max(1, min(nCores, nlen)))  # start parallel
  # Release the workers even when the parallel loop fails.
  on.exit(stopCluster(cl), add = TRUE)
  registerDoParallel(cl)
  # Do not leave a stopped cluster registered as the foreach backend.
  on.exit(foreach::registerDoSEQ(), add = TRUE)

  # .packages makes rdist() resolvable on the workers even if this function
  # is not being run from inside the package namespace.
  sampleADA <- foreach(i = seq_len(nlen), .combine = 'cbind',
                       .packages = "fields") %dopar% {
    if (no[i] != 0) {
      # Euclidean distance between positive sample i and every positive sample
      d <- rdist(t(p[, i]), t(p))
      d[i] <- Inf  # a sample must not be its own neighbour
      # Indices of the k closest samples; order() is stable, so ties are
      # resolved towards the lower index.
      minId <- order(d)[seq_len(k)]

      # no[i] random integers in 1..k, each choosing one of the k neighbours
      rn <- floor(runif(no[i], min = 0, max = k)) + 1
      id <- minId[rn]
      weight <- matrix(runif(numAtt * no[i]), nrow = numAtt, ncol = no[i], byrow = TRUE)
      kro <- kronecker(matrix(1, 1, no[i]), p[, i])  # seed repeated no[i] times

      # Move each copy of the seed a random fraction of the way towards its
      # chosen neighbour. Assigning through kro[] keeps kro's plain
      # numAtt x no[i] shape without picking up dimnames from p.
      kro[] <- kro + weight * (p[, id, drop = FALSE] - kro)
      kro
    } else {
      NULL  # nothing to generate from this seed; cbind ignores NULL
    }
  }
  sampleADA
}
# lweicdsor/OSTSC documentation built on May 8, 2019, 1:13 p.m.