#' Synthetic Minority Oversampling Technique (SMOTE)
#'
#' \code{smote} performs type of data augmentation for the selected (usually minority). In order to process continuous and
#' categorical risk factors simultaneously, Heterogeneity Euclidean Overlapping Metric (HEOM) is used in nearest neighbors
#' algorithm.
#'@param db Data set of risk factors and target variable.
#'@param target Name of target variable within \code{db} argument.
#'@param minority.class Value of minority class. It can be numeric or character value, but it has to exist in target variable.
#'@param osr Oversampling rate. It has to be numeric value greater than 0 (for example 0.2 for 20% oversampling).
#'@param ordinal.rf Character vector of ordinal risk factors. Default value is \code{NULL}.
#'@param num.rf.const Data frame with constrains for numeric risk factors. It has to contain the following columns:
#' \code{rf}(numeric risk factor names from \code{db}),
#' \code{lower} (lower bound of numeric risk factor),
#' \code{upper} (upper bound of numeric risk factor),
#' \code{type} (type of numeric risk factor - \code{"numeric"} or \code{"integer"}).
#' Constrains are used for correction of synthetic data for selected numeric risk factors.
#' Default value is \code{NULL} which means that no corrections are assumed.
#'@param k Number of nearest neighbors. Default value is 5.
#'@param seed Random seed needed for ensuring the result reproducibility. Default is 81000.
#'@return The command \code{smote} returns a data frame with added synthetic observations for selected minority class.
#' The data frame contains all variables from \code{db} data frame plus additional variable (\code{smote}) that serves as
#' indicator for distinguishing between original and synthetic observations.
#'@examples
#'suppressMessages(library(PDtoolkit))
#'data(loans)
#'#check numeric variables (note that one of variables is target not a risk factor)
#'names(loans)[sapply(loans, is.numeric)]
#'#define constains of numeric risk factors
#'num.rf.const <- data.frame(rf = c("Duration of Credit (month)", "Credit Amount", "Age (years)"),
#' lower = c(4, 250, 19),
#' upper = c(72, 20000, 75),
#' type = c("integer", "numeric", "integer"))
#'num.rf.const
#'
#'#loans$"Account Balance"[990:1000] <- NA
#'#loans$"Credit Amount"[900:920] <- NA
#'
#'loans.s <- smote(db = loans,
#' target = "Creditability",
#' minority.class = 1,
#' osr = 0.05,
#' ordinal.rf = NULL,
#' num.rf.const = num.rf.const,
#' k = 5,
#' seed = 81000)
#'str(loans.s)
#'table(loans.s$Creditability, loans.s$smote)
#'#select minority class
#'loans.mc <- loans.s[loans.s$Creditability%in%1, ]
#'head(loans.mc)
#'@importFrom stats runif
#'@export
smote <- function(db, target, minority.class, osr, ordinal.rf = NULL, num.rf.const = NULL, k = 5, seed = 81000) {
dfl.n <- c("rf", "lower", "upper", "type")
dfl.t <- c("integer", "numeric")
if (!is.data.frame(db)) {
stop("db is not a data frame.")
}
if (length(names(db)[!names(db)%in%target]) == 0) {
stop("only target supplied in db.")
}
if (!is.null(num.rf.const)) {
if (!is.data.frame(num.rf.const)) {
stop("num.rf.const is not a data frame.")
}
if (any(!dfl.n%in%names(num.rf.const))) {
stop(paste0("column names of num.rf.const data frame have to be the following: ",
paste(dfl.n, collapse = ", "), "."))
}
if (any(!unique(num.rf.const$type)%in%dfl.t)) {
stop("column type of num.rf.const has to be one of integer or numeric.")
}
if (any(!num.rf.const$rf%in%names(db))) {
stop("supplied risk factor(s) in num.rf.const do not exist in db.")
}
}
var.check <- c(target, ordinal.rf)
if (!sum(var.check%in%names(db)) == length(var.check)) {
stop("target and/or ordinal.rf do not exist in db.")
}
target.mod <- unique(db[, target])
excl.ind <- is.na(target.mod) | is.infinite(target.mod)
if (length(target.mod[!excl.ind]) == 0) {
stop("target contains only unavailable values")
}
if (!minority.class%in%target.mod[!excl.ind]) {
stop("minority.class does not exists in target inidicator.")
}
if (!is.numeric(osr) | !is.numeric(k)) {
stop("osr and k has to be of numeric type.")
}
if (k <= 0 | k > 50) {
stop("k has to be between 1 and 50.")
}
if (osr <= 0) {
stop("oversampling rate has to be greater than 0.")
}
db.c <- db
#ordinal to interval risk factors
ord.l <- length(ordinal.rf)
if (ord.l > 0) {
for (i in 1:ord.l) {
db.c[, ordinal.rf[i]] <- ord.to.int(x = db.c[, ordinal.rf[i]])
}
}
#normalization
rf.num.check <- sapply(db.c[, !names(db.c)%in%target], is.numeric)
rf.num <- names(rf.num.check)[rf.num.check]
if (length(rf.num) > 0) {
for (i in 1:length(rf.num)) {
rf.num.l <- rf.num[i]
db.c[, rf.num.l] <- db.c[, rf.num.l] / (max(db.c[, rf.num.l], na.rm = TRUE) -
min(db.c[, rf.num.l], na.rm = TRUE))
}
}
#index sampling
db.mc <- db[db[, target]%in%minority.class, ]
db.c.mc <- db.c[db.c[, target]%in%minority.class, ]
mc.num <- nrow(db.c.mc)
os.num <- round(mc.num * osr)
set.seed(seed)
indx <- sample(1:mc.num, os.num, replace = ifelse(os.num > mc.num, TRUE, FALSE))
#k check
if (k > mc.num) {k <- mc.num}
#find k nearest neighbors
res.knn <- heom(db = db.c.mc[, !names(db.c.mc)%in%target],
indx = indx,
k = k,
rfn = unname(which(rf.num.check)),
nc = length(rf.num.check),
db.nr = nrow(db.c.mc))
#create synthetic data
db.s <- create.synthetic.data(db = db.mc[, !names(db.mc)%in%target],
target = target,
minority.class = minority.class,
res.knn = res.knn,
seed = seed)
#correct for given constrains
if (!is.null(num.rf.const)) {
db.s <- synthetic.data.corr(db = db.s, num.rf.const = num.rf.const)
}
db.s <- cbind.data.frame(xxxxx = minority.class, db.s, smote = 1)
names(db.s)[1] <- target
db.smote <- bind_rows(cbind.data.frame(db, smote = 0), db.s)
return(db.smote)
}
ord.to.int <- function(x) {
x.u <- sort(unique(x), na.last = TRUE)
x.r <- 1:length(x.u[!is.na(x.u)])
names(x.r) <- x.u[!is.na(x.u)]
x.n <- unname(x.r[x])
return(x.n)
}
heom <- function(db, indx, k, rfn, nc, db.nr) {
indx.u <- sort(unique(indx))
indxl <- length(indx.u)
res <- data.frame(matrix(ncol = k + 2, nrow = indxl))
names(res) <- c("indx", "ss", paste0("k", 1:k))
for (i in 1:indxl) {
indx.l <- indx.u[i]
ss <- sum(indx%in%indx.l)
db.l <- db[indx.l, ]
res.l <- rep(NA, db.nr)
for (y in 1:db.nr) {
res.l[y] <- heom.aux(x = db.l, y = db[y, ], rfn = rfn, nc = nc)
}
dist.sorted <- sort(res.l, index.return = TRUE)$ix
find.k <- dist.sorted[!dist.sorted%in%indx.l][1:k]
res[i, ] <- c(indx.l, ss, find.k)
}
return(res)
}
heom.aux <- function(x, y, rfn, nc) {
res <- rep(NA, nc)
for (i in 1:nc) {
x.l <- x[1, i]
y.l <- y[1, i]
if (is.na(x.l) | is.infinite(x.l) | is.na(y.l) | is.infinite(y.l)) {
res[i] <- 1; next
}
if (i%in%rfn) {
res[i] <- abs(x.l - y.l)
} else {
res[i] <- ifelse(x.l == y.l, 0, 1)
}
}
distance <- sqrt(sum(res ^ 2))
return(distance)
}
create.synthetic.data <- function(db, target = target, minority.class, res.knn, seed) {
sd.r <- sum(res.knn$ss)
sd.c <- ncol(db)
rf.num <- unname(which(sapply(db, is.numeric)))
res <- data.frame(matrix(ncol = sd.c, nrow = sd.r))
names(res) <- names(db)
sd.r.cs <- c(0, cumsum(res.knn$ss))
for (i in 1:nrow(res.knn)) {
indx.l <- res.knn$indx[i]
ss <- res.knn$ss[i]
knn <- unname(c(res.knn[i, -c(1, 2)], recursive = TRUE))
seed <- seed + i
set.seed(seed)
nn.indx <- sample(knn, ss, replace = TRUE)
db.l <- db[indx.l, ]
db.nn <- db[nn.indx, ]
synt.d <- synthetic.data.aux(x = db.l,
y = db.nn,
nc = ncol(db.l),
rfn = rf.num,
seed = seed)
res[(sd.r.cs[i] + 1):sd.r.cs[i + 1], ] <- synt.d
}
return(res)
}
synthetic.data.aux <- function(x, y, nc, rfn, seed) {
res.l <- data.frame(matrix(ncol = nc, nrow = 1))
names(res.l) <- names(x)
ss <- nrow(y)
res <- vector("list", ss)
if (length(rfn) > 0) {
synth.exp <- "res.i[, rfn] <- x[, rfn] + rand.n * (y.l[, rfn] - x[, rfn]);
res.i[, -rfn] <- y.l[, -rfn]"
} else {
synth.exp <- "res.i <- y.l"
}
for (i in 1:ss) {
res.i <- res.l
y.l <- y[i, ]
set.seed(seed)
rand.n <- runif(1)
eval(parse(text = synth.exp))
res[[i]] <- res.i
}
res <- bind_rows(res)
return(res)
}
synthetic.data.corr <- function(db, num.rf.const) {
for (i in 1:nrow(num.rf.const)) {
rf.n <- num.rf.const$rf[i]
rf.l <- num.rf.const$lower[i]
rf.u <- num.rf.const$upper[i]
rf.t <- num.rf.const$type[i]
db[, rf.n] <- ifelse(db[, rf.n] < rf.l, rf.l, db[, rf.n])
db[, rf.n] <- ifelse(db[, rf.n] > rf.u, rf.u, db[, rf.n])
if (rf.t%in%"integer") {db[, rf.n] <- round(db[, rf.n])}
}
return(db)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.