# Names of the preset wrappers bundled for wracog
presetWrappers <- c("KNN", "C5.0")
#' Rapidly converging Gibbs algorithm.
#'
#' Allows you to treat imbalanced discrete numeric datasets by generating
#' synthetic minority examples, approximating their probability distribution.
#'
#' Approximates the minority distribution using a Gibbs Sampler. The dataset
#' must be discretized and numeric. In each iteration, it builds a new sample
#' by advancing a Markov chain. It discards the first \code{burnin} iterations,
#' and from then on, every \code{lag} iterations, it accepts the current
#' examples as new minority examples. It therefore generates
#' \eqn{d \cdot (iterations - burnin) / lag} examples, where \eqn{d} is the
#' number of minority examples.
#'
#' @param dataset \code{data.frame} to treat. All columns, except the
#' \code{classAttr} one, have to be numeric or coercible to numeric.
#' @param numInstances Integer. Number of new minority examples to generate.
#' @param burnin Integer. Number of initial iterations of the Markov chain
#' whose generated examples are discarded. By default, 100.
#' @param lag Integer. Number of iterations between two accepted batches of
#' generated examples. By default, 20.
#' @param classAttr \code{character}. Indicates the class attribute from
#' \code{dataset}. Must exist in it.
#'
#' @return A \code{data.frame} with the same structure as \code{dataset},
#' containing the generated synthetic examples.
#' @export
#'
#' @references
#'
#' Das, Barnan; Krishnan, Narayanan C.; Cook, Diane J. RACOG and wRACOG: Two
#' Probabilistic Oversampling Techniques. IEEE Transactions on Knowledge and
#' Data Engineering 27 (2015), no. 1, pp. 222–234.
#'
#' @examples
#' data(iris0)
#'
#' # Generates new minority examples
#'
#' newSamples <- racog(iris0, numInstances = 40, burnin = 20, lag = 10,
#'                     classAttr = "Class")
#' \donttest{
#' newSamples <- racog(iris0, numInstances = 100)
#' }
racog <- function(dataset, numInstances, burnin = 100, lag = 20, classAttr = "Class"){
  checkDataset(dataset)
  checkDatasetClass(dataset, classAttr)
  originalShape <- datasetStructure(dataset, classAttr)
  dataset <- toNumeric(dataset, exclude = classAttr)
  checkAllColumnsNumeric(dataset, exclude = classAttr)

  if(!is.numeric(burnin) || !is.numeric(lag) || !is.numeric(numInstances) ||
     burnin <= 0 || lag <= 0 || numInstances <= 0)
    stop("burnin, lag and numInstances must be positive integers")

  # Compute minority examples and their probability distribution
  minority <- selectMinority(dataset, classAttr)
  probDist <- genDistribution(minority)
  minority <- data.matrix(minority)
  newSamples <- data.frame(matrix(ncol = ncol(minority), nrow = 0))
  # Set iterations to generate at least numInstances new examples
  iterations <- ceiling(numInstances / nrow(minority)) * lag + burnin

  # Create new examples, approximating minority distribution with Gibbs sampler
  for(k in seq_len(iterations)){
    # Advance the Markov chain one step
    minority <- gibbsSampler(probDist, minority)
    # Past the burn-in period, accept the current batch every lag iterations
    if(k > burnin && k %% lag == 0)
      newSamples <- rbind(newSamples, minority)
  }

  newSamples <- selectSamples(newSamples, numInstances)
  # Shape the output as the original dataset
  normalizeNewSamples(originalShape, newSamples)
}
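
# Illustrative sketch (not executed by the package): which iterations the
# burn-in/lag schedule of racog() accepts. With iterations = 200,
# burnin = 100 and lag = 20, five batches are kept, each contributing
# nrow(minority) synthetic examples:
#
#   kept <- Filter(function(k) k > 100 && k %% 20 == 0, seq_len(200))
#   # kept == c(120, 140, 160, 180, 200)
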
#' Wrapper for rapidly converging Gibbs algorithm.
#'
#' Generates synthetic minority examples by approximating their probability
#' distribution until sensitivity of \code{wrapper} over \code{validation}
#' cannot be further improved. Works only on discrete numeric datasets.
#'
#' While the standard deviation of the sensitivities obtained by \code{wrapper}
#' over the \code{validation} dataset in the last \code{slideWin} executions
#' stays above \code{threshold}, the algorithm keeps generating samples using a
#' Gibbs Sampler, and adding the samples misclassified by the model trained in
#' the former iteration to the train dataset. The initial model is built on the
#' initial \code{train}.
#'
#' @param train \code{data.frame}. An initial dataset used to build the first
#' model. All columns, except the \code{classAttr} one, have to be numeric or
#' coercible to numeric.
#' @param validation \code{data.frame}. A dataset to compare the results of
#' consecutive classifiers. Must have the same structure as \code{train}.
#' @param wrapper An \code{S3} object. There must exist a method
#' \code{\link{trainWrapper}} implemented for the class of the object, and a
#' \code{\link[stats]{predict}} method implemented for the class of the model
#' returned by \code{trainWrapper}. Alternatively, it can be the name of one of
#' the wrappers distributed with the package, \code{"KNN"} or \code{"C5.0"}.
#' @param slideWin Number of last sensitivities to take into account for the
#' stopping criterion. By default, 10.
#' @param threshold Threshold the standard deviation of the last
#' \code{slideWin} sensitivities must fall below to stop. By default, 0.02.
#' @param classAttr \code{character}. Indicates the class attribute from
#' \code{train} and \code{validation}. Must exist in them.
#' @param ... further arguments for \code{wrapper}.
#'
#' @return A \code{data.frame} with the same structure as \code{train},
#' containing the generated synthetic examples.
#' @importFrom stats predict
#' @export
#'
#' @references
#'
#' Das, Barnan; Krishnan, Narayanan C.; Cook, Diane J. RACOG and wRACOG: Two
#' Probabilistic Oversampling Techniques. IEEE Transactions on Knowledge and
#' Data Engineering 27 (2015), no. 1, pp. 222–234.
#'
#' @examples
#' data(haberman)
#'
#' # Create train and validation partitions of haberman
#' trainFold <- sample(1:nrow(haberman), nrow(haberman)/2, FALSE)
#' trainSet <- haberman[trainFold, ]
#' validationSet <- haberman[-trainFold, ]
#'
#' # Define our own wrapper with a C5.0 tree
#' myWrapper <- structure(list(), class = "TestWrapper")
#' trainWrapper.TestWrapper <- function(wrapper, train, trainClass){
#'   C50::C5.0(train, trainClass)
#' }
#'
#' # Execute wRACOG with our own wrapper
#' newSamples <- wracog(trainSet, validationSet, myWrapper,
#'                      classAttr = "Class")
#'
#' # Execute wRACOG with the predefined wrappers "KNN" and "C5.0"
#' KNNSamples <- wracog(trainSet, validationSet, "KNN")
#' C50Samples <- wracog(trainSet, validationSet, "C5.0")
#'
#'
wracog <- function(train, validation, wrapper, slideWin = 10,
                   threshold = 0.02, classAttr = "Class", ...){
  wrapperByName <- is.character(wrapper)

  if(wrapperByName){
    wrapper <- match.arg(wrapper, presetWrappers)
    wrapper <- switch(wrapper, "KNN" = KNNWrapper, "C5.0" = C50Wrapper)
  }

  trainMethod <- paste("trainWrapper", class(wrapper), sep = ".")
  checkDataset(train)
  checkDataset(validation)
  checkDatasetClass(train, classAttr)
  checkSameShape(train, validation)
  # Extract shape of the dataset and convert both train and validation to numeric
  originalShape <- datasetStructure(train, classAttr)
  train <- toNumeric(train, exclude = classAttr)
  validation <- toNumeric(validation, exclude = classAttr)
  checkAllColumnsNumeric(train, exclude = classAttr)

  if(!is.numeric(slideWin) || !is.numeric(threshold) ||
     slideWin <= 0 || threshold <= 0 || threshold >= 1)
    stop("slideWin must be a positive integer \n threshold must be in ]0,1[")

  # Strip class column from both train and validation, compute
  # minority class and minority examples
  minority <- selectMinority(train, classAttr)
  minorityClass <- originalShape$minorityClass
  trainClass <- train[, classAttr]
  validationClass <- validation[, classAttr]
  train <- train[, names(train) != classAttr]
  validation <- validation[, names(validation) != classAttr]
  # Compute probability distribution of the minority class
  probDist <- genDistribution(minority)
  # Sliding window with the sensitivities of the last slideWin models
  lastSlides <- rep(Inf, slideWin)

  # For custom wrappers, check the needed S3 methods exist before training
  if(!wrapperByName && !trainMethod %in% utils::methods(trainWrapper))
    stop(paste("There doesn't exist a method", trainMethod))

  model <- trainWrapper(wrapper, train, trainClass, ...)
  predictMethod <- paste("predict", class(model), sep = ".")

  if(!wrapperByName && !any(predictMethod %in% utils::methods(predict)))
    stop(paste("There must exist a method predict.class where class\n",
               "is any class of the model returned by", trainMethod))

  newSamples <- data.frame(matrix(ncol = ncol(minority), nrow = 0))

  # Keep generating until the sensitivity of the last slideWin models
  # stabilizes, i.e. its standard deviation falls below threshold
  while(naReplace(stats::sd(lastSlides), Inf) >= threshold){
    minority <- gibbsSampler(probDist, minority)
    prediction <- predict(model, data.frame(minority), ...)
    misclassified <- minority[as.character(prediction) != as.character(minorityClass), ,
                              drop = FALSE]
    newSamples <- rbind(newSamples, misclassified)
    train <- rbind(train, misclassified)
    trainClass <- appendfactor(trainClass, rep(minorityClass, nrow(misclassified)))
    model <- trainWrapper(wrapper, train, trainClass, ...)
    prediction <- predict(model, validation, ...)
    # Sensitivity of the new model measures the quality of the new train set
    qMeasure <- sensitivity(prediction, validationClass, minorityClass)
    lastSlides <- c(qMeasure, lastSlides)[1:slideWin]
  }

  normalizeNewSamples(originalShape, newSamples)
}
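
# Illustrative sketch of the stopping criterion above, with hypothetical
# sensitivity values: with slideWin = 10 and threshold = 0.02, the loop exits
# once the last ten sensitivities are nearly constant:
#
#   lastSlides <- c(0.81, 0.80, 0.81, 0.80, 0.81, 0.80, 0.81, 0.80, 0.81, 0.80)
#   stats::sd(lastSlides) < 0.02  # TRUE, so wracog() would stop generating
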
#' Generic methods to train classifiers
#'
#' @param wrapper the wrapper instance
#' @param train \code{data.frame} of the train dataset without the class column
#' @param trainClass a vector containing the class column for \code{train}
#' @param ... further arguments for \code{wrapper}
#' @return A model on which \code{\link[stats]{predict}} can be called.
#' @seealso \code{\link[stats]{predict}}
#' @export
#'
#' @examples
#' myWrapper <- structure(list(), class="C50Wrapper")
#' trainWrapper.C50Wrapper <- function(wrapper, train, trainClass){
#'   C50::C5.0(train, trainClass)
#' }
trainWrapper <- function(wrapper, train, trainClass, ...){
UseMethod("trainWrapper")
}
# Preset wrappers for wRACOG: C5.0 and KNN
C50Wrapper <- structure(list(), class = "C50Wrapper")
KNNWrapper <- structure(list(), class = "KNNWrapper")

trainWrapper.C50Wrapper <- function(wrapper, train, trainClass, ...){
  C50::C5.0(train, trainClass, ...)
}

# KNN is a lazy learner: the "model" just stores the train data and labels,
# and prediction runs FNN::knn against the stored data
predict.KNN <- function(model, test, ...){
  FNN::knn(model$train, test, model$trainClass, ...)
}

trainWrapper.KNNWrapper <- function(wrapper, train, trainClass, ...){
  myKNN <- structure(list(), class = "KNN")
  myKNN$train <- train
  myKNN$trainClass <- trainClass
  myKNN
}
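
# A further wrapper following the same S3 pattern is easy to sketch. The
# definition below is illustrative only: NBWrapper is hypothetical, is not
# registered in presetWrappers, and assumes the e1071 package is available.
# Since e1071 already ships a predict.naiveBayes method, only the
# trainWrapper method is needed.
NBWrapper <- structure(list(), class = "NBWrapper")

trainWrapper.NBWrapper <- function(wrapper, train, trainClass, ...){
  e1071::naiveBayes(train, trainClass, ...)
}
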
#' Make a tree, represented as an nx2 matrix of edges, directed.
#'
#' Returns a directed tree built up from an undirected one, complying with the
#' condition that each non-root node has a single parent.
#'
#' @param tree An nx2 matrix whose rows denote the undirected arcs of a tree.
#'
#' @return A list with the matrix of directed arcs (\code{edges}), directed
#' from the first coordinate towards the second, and the \code{root} node.
#' @noRd
#'
#' @examples
#' DT <- bnlearn::chow.liu(iris[, names(iris) != "Species"])$arcs
#'
#' tree <- unname(DT[ seq(1, nrow(DT), 2), ])
#' makeDirected(tree)
#'
makeDirected <- function(tree){
  # Visit the two nodes of the first edge, marking them as parents
  parents <- tree[1, ]
  directed <- tree[1, , drop = FALSE]
  tree <- tree[-1, , drop = FALSE]

  # While there still are parents and edges to orient,
  # select a parent and orient the edges towards its children
  while(length(parents) > 0 && nrow(tree) > 0){
    current <- parents[1]
    parents <- parents[-1]
    indexes <- which(tree[, 1] == current | tree[, 2] == current)

    if(length(indexes) > 0){
      # Flip every incident edge so that current is its first coordinate
      newDirected <- t(apply(tree[indexes, , drop = FALSE], MARGIN = 1,
        function(edge){
          if(edge[2] == current){
            edge[c(2, 1)]
          } else{
            edge
          }
        }))

      tree <- tree[-indexes, , drop = FALSE]
      parents <- c(parents, newDirected[, 2])
      directed <- rbind(directed, newDirected)
    }
  }

  list(edges = directed, root = directed[1, 1])
}
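
# Worked trace (illustrative): for the undirected star written as rows
# (2,1), (3,2), (2,4), the first edge fixes 2 and 1 as the initial parents
# and (2,1) as the first directed arc; processing parent 2 then flips (3,2)
# into (2,3) and keeps (2,4), so makeDirected() returns the edges
# (2,1), (2,3), (2,4) with root 2.
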
#' Generate a joint distribution approximation using a Chow-Liu dependence tree
#'
#' @param samples A numerical data.frame of samples whose distribution to
#' approximate.
#'
#' @return An S3 object with class \code{probDistribution}
#' @noRd
genDistribution <- function(samples){
  attrs <- names(samples)
  DT <- bnlearn::chow.liu(samples)$arcs
  # Choose only one sense for the arcs (the odd ones) and make the tree directed
  tree <- unname(DT[ seq(1, nrow(DT), 2), ])
  # Translate attribute names into column indexes
  tree <- apply(tree, MARGIN = c(1, 2), function(attr){
    which(attrs == attr)
  })
  attrs <- seq_along(attrs)
  tree <- makeDirected(tree)
  edges <- tree$edges
  root <- tree$root

  # Conditional probability distributions: genProbs(i, j) returns a matrix
  # whose rows are indexed by the values of the conditioning attribute i,
  # so that row a holds P(X_j | X_i = a)
  genProbs <- function(i, j){
    contingency <- table(samples[, i], samples[, j])
    prop.table(contingency, margin = 1)
  }

  # Marginal probability distribution of each attribute
  absoluteProbs <- lapply(samples, function(col){
    prop.table(table(col))
  })

  # For each attribute, store the attribute it is conditioned to (its parent),
  # the attributes being conditioned by it (its children),
  # and the possible values it takes
  probDist <- lapply(attrs, function(attr){
    # Arcs in which attr is conditioned to another attribute
    conditioned <- which(edges[, 2] == attr)
    # Arcs in which attr is conditioning another attribute
    conditioning <- which(edges[, 1] == attr)
    conditionedProbs <- NULL
    conditioningProbs <- NULL
    values <- as.numeric(names(absoluteProbs[[attr]]))

    if(length(conditioned) > 0){
      conditioned <- edges[conditioned, ]
      conditionedProbs <- genProbs(conditioned[1], conditioned[2])
      conditioned <- conditioned[1]
    } else{
      conditioned <- NULL
    }

    if(length(conditioning) > 0){
      conditioning <- edges[conditioning, 2]
      conditioningProbs <- lapply(conditioning, function(conditioningAttr){
        genProbs(conditioningAttr, attr)
      })
    } else{
      conditioning <- NULL
    }

    list(conditionedProbs = conditionedProbs,
         conditioningProbs = conditioningProbs,
         absoluteProb = absoluteProbs[[attr]],
         conditioned = conditioned,
         conditioning = conditioning,
         values = values)
  })

  class(probDist) <- "probDistribution"
  attr(probDist, "attrs") <- attrs
  attr(probDist, "root") <- root
  probDist
}
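
# Illustrative sketch (not executed by the package): fitting the distribution
# of the minority class of the bundled iris0 dataset, mirroring the first
# steps of racog():
#
#   data(iris0)
#   minoritySamples <- selectMinority(iris0, "Class")
#   pd <- genDistribution(minoritySamples)
#   attr(pd, "root")       # root attribute of the Chow-Liu tree
#   pd[[1]]$absoluteProb   # marginal distribution of the first attribute
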
#' Gibbs Sampler generic S3 method
#'
#' @param probDist A probability distribution
#' @param samples A \code{matrix}. Samples for which to generate new ones.
#'
#' @noRd
gibbsSampler <- function(probDist, samples){
UseMethod("gibbsSampler")
}
#' Gibbs sampler for the probDistribution class
#' @inheritParams gibbsSampler
#'
#' @return A matrix of new samples, one per row of \code{samples}
#' @noRd
gibbsSampler.probDistribution <- function(probDist, samples){
  attrs <- attr(probDist, "attrs")

  t(apply(samples, MARGIN = 1, function(x){
    for(attr in attrs){
      # P(attr | parent), if attr has a parent in the dependence tree
      i <- probDist[[attr]]$conditioned
      first <- list()
      if(!is.null(i)){
        value <- toString(unlist(x[i]))
        first <- probDist[[attr]]$conditionedProbs[value, ]
      }

      # P(attr | child) / P(attr), for each child of attr
      valuesConditioning <- sapply(probDist[[attr]]$conditioning, function(i){
        toString(unlist(x[i]))
      })
      second <- mapply(function(dist, value){
        r <- dist[value, ]
        r / probDist[[attr]]$absoluteProb
      }, probDist[[attr]]$conditioningProbs, valuesConditioning)

      if(attr == attr(probDist, "root")){
        probVectors <- cbind(first, second, probDist[[attr]]$absoluteProb)
      } else{
        probVectors <- cbind(first, second)
      }

      # Prob of attr is the product of the probabilities from the dependence tree
      ithProb <- apply(probVectors, MARGIN = 1, function(r){
        prod(unlist(r))
      })

      # If all probabilities are zero, fall back to a uniform distribution
      if(!any(ithProb != 0))
        ithProb <- rep(1, length(ithProb))

      # Resample the attribute conditioned on the rest of the sample
      x[attr] <- sample(probDist[[attr]]$values, 1, prob = ithProb)
    }
    # Return new sample
    x
  }))
}
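
# Continuing the sketch after genDistribution() (illustrative only): a single
# Gibbs pass over the minority examples, as racog() performs on every
# iteration of its main loop:
#
#   minorityMatrix <- data.matrix(minoritySamples)
#   resampled <- gibbsSampler(pd, minorityMatrix)
#   # resampled has the same shape as minorityMatrix; each row is the old row
#   # with every attribute resampled from its full conditional distribution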