#' SMOTE
#'
#' Function that implements SMOTE (synthetic minority over-sampling technique)
#'
#' @param X the input variables of the unbalanced dataset.
#' @param Y the response variable of the unbalanced dataset. It must be a binary factor where the majority class is coded as 0 and the minority as 1.
#' @param perc.over perc.over/100 is the number of new instances generated for each rare instance. If perc.over < 100 a single instance is generated.
#' @param k the number of neighbours to consider as the pool from where the new examples are generated
#' @param perc.under perc.under/100 is the number of "normal" (majority class) instances that are randomly selected for each smoted observation.
#' @param verbose print extra information (TRUE/FALSE). Currently has no effect on the output of this function.
#'
#'
#'
#' @return The function returns a list:
#' \item{X}{input variables}
#' \item{Y}{response variable}
#'
#'
#' @references Chawla, Nitesh V., et al. "SMOTE: synthetic minority over-sampling technique." arXiv preprint arXiv:1106.1813 (2011).
#'
#' @note Original code from DMwR package
#'
#' @examples
#' library(unbalanced)
#' data(ubIonosphere)
#'
#' n<-ncol(ubIonosphere)
#' output<-ubIonosphere$Class
#' input<-ubIonosphere[ ,-n]
#' data<-ubSMOTE(X=input, Y=output)
#' newData<-cbind(data$X, data$Y)
#'
#' @export
ubSMOTE <-
function(X,Y,perc.over=200,k=5,perc.under=200,verbose=TRUE){
  # Re-balance a binary data set:
  #   1. generate synthetic minority rows from the rows where Y == 1
  #      (delegated to ubSmoteExs),
  #   2. draw, with replacement, perc.under/100 majority rows per synthetic row,
  #   3. return the shuffled union of sampled majority rows, the original
  #      minority rows and the synthetic rows as list(X = ..., Y = ...).
  #
  # Raises an error if Y is not a factor, if X is a plain vector, or if
  # majority rows are requested but none exist.
  #
  # NOTE: `verbose` is kept for interface compatibility; nothing in this
  # function currently reads it.
  if (!is.factor(Y))
    stop("Y has to be a factor")
  if (is.vector(X))
    stop("X cannot be a vector")

  data <- cbind(X, Y)
  n.rows <- NROW(data)

  id.1 <- which(Y == 1)
  # setdiff() instead of negative indexing: x[-integer(0)] is empty, which
  # would wrongly report "no majority rows" when there are no minority rows.
  id.0 <- setdiff(seq_len(n.rows), id.1)

  # generate synthetic cases from the minority rows
  newExs <- ubSmoteExs(data[id.1, ], "Y", perc.over, k)

  # drop synthetic rows that contain at least one NA
  row.is.na <- rowSums(is.na(newExs)) > 0
  if (any(row.is.na)) {
    newExs <- newExs[!row.is.na, , drop = FALSE]
    colnames(newExs) <- colnames(data)
    cat("WARNING: NAs generated by SMOTE removed \n")
  }

  # get the undersample of the "majority class" examples
  n.maj <- as.integer((perc.under / 100) * nrow(newExs))
  if (n.maj > 0 && length(id.0) == 0)
    stop("no majority class instances available for under-sampling")
  # Index through sample.int() rather than calling sample(id.0, ...):
  # sample(x, ...) with a single number x draws from 1:x, so a data set with
  # exactly one majority row would otherwise return minority rows here.
  selMaj <- id.0[sample.int(length(id.0), n.maj, replace = TRUE)]

  # the final data set (the undersample + the rare cases + the smoted exs)
  newdataset <- rbind(data[selMaj, ], data[id.1, ], newExs)

  # shuffle the order of instances
  newdataset <- newdataset[sample.int(NROW(newdataset)), , drop = FALSE]

  # the response is the last column; drop = FALSE keeps X two-dimensional
  # (and keeps its column name) even when there is a single input variable
  y.col <- ncol(newdataset)
  list(X = newdataset[, -y.col, drop = FALSE],
       Y = newdataset[, y.col])
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.