# R/ubSmoteExs.R In unbalanced: Racing for Unbalanced Methods Selection

#### Documented in ubSmoteExs

```ubSmoteExs <-
function(data,tgt,N=200,k=5) {
# INPUTS:
# data are the rare cases (the minority "class" cases)
# tgt is the name of the target variable
# N is the percentage of over-sampling to carry out;
# and k is the number of nearest neighours to use for the generation
# OUTPUTS:
# The result of the function is a (N/100)*T set of generated
# examples with rare values on the target

nomatr <- c()
T <- matrix(nrow=dim(data)[1],ncol=dim(data)[2]-1)
for(col in seq.int(dim(T)[2])){
cl <- class(data[,col])

if (cl %in% c('Date','POSIXct','POSIXt'))
stop("cannot SMOTE variables of class Date, POSIXct or POSIXt")

if (cl %in% c('factor','character')) {
T[,col] <- as.integer(data[,col])
nomatr <- c(nomatr,col)
} else
T[,col] <- data[,col]
}

if (N < 100) { # only a percentage of the T cases will be SMOTEd
nT <- NROW(T)
idx <- sample(1:nT,as.integer((N/100)*nT))
T <- T[idx,]
N <- 100
}

p <- dim(T)[2]
nT <- dim(T)[1]

ranges <- apply(T,2,max)-apply(T,2,min)

nexs <-  as.integer(N/100) # this is the number of artificial exs generated
# for each member of T
new <- matrix(nrow=nexs*nT,ncol=p)    # the new cases

for(i in 1:nT) {

# the k NNs of case T[i,]
xd <- scale(T,T[i,],ranges)
for(a in nomatr)
xd[,a] <- xd[,a]==0
dd <- drop(xd^2 %*% rep(1, ncol(xd)))
kNNs <- order(dd)[2:(k+1)]

for(n in 1:nexs) {
# select randomly one of the k NNs
neig <- sample(1:k,1)

ex <- vector(length=ncol(T))

# the attribute values of the generated case
difs <- T[kNNs[neig],]-T[i,]
new[(i-1)*nexs+n,] <- T[i,]+runif(1)*difs
for(a in nomatr)
new[(i-1)*nexs+n,a] <- c(T[kNNs[neig],a],T[i,a])[1+round(runif(1),0)]

}
}
newCases <- data.frame(new)
for(a in nomatr)
newCases[,a] <- factor(newCases[,a],levels=1:nlevels(data[,a]),labels=levels(data[,a]))

newCases[,tgt] <- factor(rep(data[1,tgt],nrow(newCases)),levels=levels(data[,tgt]))
colnames(newCases) <- colnames(data)
newCases
}
```

## Try the unbalanced package in your browser

Any scripts or data that you put into this service are public.

unbalanced documentation built on May 29, 2017, 8:47 p.m.