# unbalanced: Racing for Unbalanced Methods Selection

# Documented in ubSMOTE

#' SMOTE
#'
#' Function that implements SMOTE (synthetic minority over-sampling technique).
#' Synthetic minority examples are generated from the instances with
#' \code{Y == 1}; every other instance is treated as majority class and is
#' randomly under-sampled (with replacement).
#'
#' @param X the input variables of the unbalanced dataset. It cannot be a plain vector.
#' @param Y the response variable of the unbalanced dataset. It must be a binary factor where the majority class is coded as 0 and the minority as 1.
#' @param perc.over perc.over/100 is the number of new instances generated for each rare instance. If perc.over < 100 a single instance is generated.
#' @param k the number of neighbours to consider as the pool from where the new examples are generated
#' @param perc.under perc.under/100 is the number of "normal" (majority class) instances that are randomly selected (with replacement) for each smoted observation.
#' @param verbose print extra information (TRUE/FALSE). Currently not used by this function; kept for backward compatibility.
#'
#' @return The function returns a list:
#'  \item{X}{input variables}
#'  \item{Y}{response variable}
#' The rows are the under-sampled majority instances, the original minority
#' instances and the synthetic instances, in random order. A warning is
#' signalled when synthetic instances containing NAs had to be removed.
#'
#' @references Chawla, Nitesh V., et al. "SMOTE: synthetic minority over-sampling technique." arXiv preprint arXiv:1106.1813 (2011).
#'
#' @note Original code from DMwR package
#'
#' @examples
#' library(unbalanced)
#' data(ubIonosphere)
#'
#' n<-ncol(ubIonosphere)
#' output<-ubIonosphere$Class
#' input<-ubIonosphere[ ,-n]
#' data<-ubSMOTE(X=input, Y=output)
#' newData<-cbind(data$X, data$Y)
#'
#' @export
ubSMOTE <-
function(X, Y, perc.over = 200, k = 5, perc.under = 200, verbose = TRUE) {

  if (!is.factor(Y))
    stop("Y has to be a factor")
  if (is.vector(X))
    stop("X cannot be a vector")

  # the response is appended as the last column, named "Y"
  data <- cbind(X, Y)
  id.1 <- which(Y == 1)

  # fail early with a clear message: there is nothing to over-sample from
  if (length(id.1) == 0L)
    stop("Y must contain at least one minority instance (Y == 1)")

  # generate synthetic cases from the minority instances
  newExs <- ubSmoteExs(data[id.1, , drop = FALSE], "Y", perc.over, k)

  # discard synthetic cases that contain missing values
  row.is.na <- rowSums(is.na(newExs)) > 0
  if (any(row.is.na)) {
    newExs <- newExs[!row.is.na, , drop = FALSE]
    colnames(newExs) <- colnames(data)
    warning("NAs generated by SMOTE removed", call. = FALSE)
  }

  # get the undersample of the "majority class" examples.
  # setdiff() is used instead of negative indexing, and the positions are
  # drawn with sample.int(): sample(x, ...) would silently sample from 1:x
  # when only one majority instance exists.
  id.maj <- setdiff(seq_len(NROW(data)), id.1)
  n.maj <- as.integer((perc.under / 100) * nrow(newExs))
  if (n.maj > 0L && length(id.maj) == 0L)
    stop("no majority class instances available for undersampling")
  selMaj <- id.maj[sample.int(length(id.maj), n.maj, replace = TRUE)]

  # the final data set (the undersample + the rare cases + the smoted exs)
  newdataset <- rbind(data[selMaj, , drop = FALSE],
                      data[id.1, , drop = FALSE],
                      newExs)
  # shuffle the order of instances
  newdataset <- newdataset[sample.int(NROW(newdataset)), , drop = FALSE]

  # split predictors and response again; drop = FALSE keeps X two-dimensional
  # even when there is a single input variable
  p <- ncol(newdataset)
  list(X = newdataset[, -p, drop = FALSE], Y = newdataset[, p])
}