# Names of the preset wrappers bundled for wracog
presetWrappers <- c("KNN", "C5.0")
#' Rapidly converging Gibbs algorithm.
#'
#' Allows you to treat imbalanced discrete numeric datasets by generating
#' synthetic minority examples, approximating their probability distribution.
#'
#' Approximates the minority distribution using a Gibbs Sampler. The dataset
#' must be discretized and numeric. In each iteration, it builds a new sample
#' by advancing a Markov chain. It discards the first \code{burnin} iterations,
#' and from then on, every \code{lag} iterations, it accepts the current
#' examples as new minority examples. It therefore generates
#' \eqn{d \cdot (iterations - burnin) / lag} examples, where \eqn{d} is the
#' number of minority examples.
#'
#' @param dataset \code{data.frame} to treat. All columns, except the
#' \code{classAttr} one, have to be numeric or coercible to numeric.
#' @param numInstances Integer. Number of new minority examples to generate.
#' @param burnin Integer. Number of initial iterations of the Markov chain
#' whose generated examples are discarded. By default, 100.
#' @param lag Integer. Number of iterations between two accepted batches of
#' generated examples. By default, 20.
#' @param classAttr \code{character}. Indicates the class attribute from
#' \code{dataset}. Must exist in it.
#'
#' @return A \code{data.frame} with the same structure as \code{dataset},
#' containing the generated synthetic examples.
#' @export
#'
#' @references
#'
#' Das, Barnan; Krishnan, Narayanan C.; Cook, Diane J. RACOG and wRACOG: Two
#' Probabilistic Oversampling Techniques. IEEE Transactions on Knowledge and
#' Data Engineering 27 (2015), no. 1, pp. 222–234.
#'
#' @examples
#' data(iris0)
#'
#' # Generates new minority examples
#'
#' newSamples <- racog(iris0, numInstances = 40, burnin = 20, lag = 10,
#'                     classAttr = "Class")
#' \donttest{
#' newSamples <- racog(iris0, numInstances = 100)
#' }
racog <- function(dataset, numInstances, burnin = 100, lag = 20, classAttr = "Class"){
  checkDataset(dataset)
  checkDatasetClass(dataset, classAttr)
  originalShape <- datasetStructure(dataset, classAttr)
  dataset <- toNumeric(dataset, exclude = classAttr)
  checkAllColumnsNumeric(dataset, exclude = classAttr)

  if(!is.numeric(burnin) || !is.numeric(lag) || !is.numeric(numInstances) ||
     burnin <= 0 || lag <= 0 || numInstances <= 0)
    stop("burnin, lag and numInstances must be positive integers")

  # Compute minority examples and their probability distribution
  minority <- selectMinority(dataset, classAttr)
  probDist <- genDistribution(minority)
  minority <- data.matrix(minority)
  newSamples <- data.frame(matrix(ncol = ncol(minority), nrow = 0))
  # Set iterations to generate at least numInstances new examples
  iterations <- ceiling(numInstances / nrow(minority)) * lag + burnin

  # Create new examples, approximating minority distribution with Gibbs sampler
  for(k in seq_len(iterations)){
    # Advance the Markov chain one step
    minority <- gibbsSampler(probDist, minority)
    # Past the burn-in period, accept the current batch every lag iterations
    if(k > burnin && k %% lag == 0)
      newSamples <- rbind(newSamples, minority)
  }

  newSamples <- selectSamples(newSamples, numInstances)
  # Shape the output as the original dataset
  normalizeNewSamples(originalShape, newSamples)
}
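
# Illustrative sketch (not executed by the package): which iterations the
# burn-in/lag schedule of racog() accepts. With iterations = 200,
# burnin = 100 and lag = 20, five batches are kept, each contributing
# nrow(minority) synthetic examples:
#
#   kept <- Filter(function(k) k > 100 && k %% 20 == 0, seq_len(200))
#   # kept == c(120, 140, 160, 180, 200)
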
#' Wrapper for rapidly converging Gibbs algorithm.
#'
#' Generates synthetic minority examples by approximating their probability
#' distribution until sensitivity of \code{wrapper} over \code{validation}
#' cannot be further improved. Works only on discrete numeric datasets.
#'
#' While the standard deviation of the sensitivities obtained by \code{wrapper}
#' over the \code{validation} dataset in the last \code{slideWin} executions
#' stays above \code{threshold}, the algorithm keeps generating samples using a
#' Gibbs Sampler, and adding the samples misclassified by the model trained in
#' the former iteration to the train dataset. The initial model is built on the
#' initial \code{train}.
#'
#' @param train \code{data.frame}. An initial dataset used to build the first
#' model. All columns, except the \code{classAttr} one, have to be numeric or
#' coercible to numeric.
#' @param validation \code{data.frame}. A dataset to compare the results of
#' consecutive classifiers. Must have the same structure as \code{train}.
#' @param wrapper An \code{S3} object. There must exist a method
#' \code{\link{trainWrapper}} implemented for the class of the object, and a
#' \code{\link[stats]{predict}} method implemented for the class of the model
#' returned by \code{trainWrapper}. Alternatively, it can be the name of one of
#' the wrappers distributed with the package, \code{"KNN"} or \code{"C5.0"}.
#' @param slideWin Number of last sensitivities to take into account for the
#' stopping criterion. By default, 10.
#' @param threshold Threshold the standard deviation of the last
#' \code{slideWin} sensitivities must fall below to stop. By default, 0.02.
#' @param classAttr \code{character}. Indicates the class attribute from
#' \code{train} and \code{validation}. Must exist in them.
#' @param ... further arguments for \code{wrapper}.
#'
#' @return A \code{data.frame} with the same structure as \code{train},
#' containing the generated synthetic examples.
#' @importFrom stats predict
#' @export
#'
#' @references
#'
#' Das, Barnan; Krishnan, Narayanan C.; Cook, Diane J. RACOG and wRACOG: Two
#' Probabilistic Oversampling Techniques. IEEE Transactions on Knowledge and
#' Data Engineering 27 (2015), no. 1, pp. 222–234.
#'
#' @examples
#' data(haberman)
#'
#' # Create train and validation partitions of haberman
#' trainFold <- sample(1:nrow(haberman), nrow(haberman)/2, FALSE)
#' trainSet <- haberman[trainFold, ]
#' validationSet <- haberman[-trainFold, ]
#'
#' # Define our own wrapper with a C5.0 tree
#' myWrapper <- structure(list(), class = "TestWrapper")
#' trainWrapper.TestWrapper <- function(wrapper, train, trainClass){
#'   C50::C5.0(train, trainClass)
#' }
#'
#' # Execute wRACOG with our own wrapper
#' newSamples <- wracog(trainSet, validationSet, myWrapper,
#'                      classAttr = "Class")
#'
#' # Execute wRACOG with the predefined wrappers "KNN" and "C5.0"
#' KNNSamples <- wracog(trainSet, validationSet, "KNN")
#' C50Samples <- wracog(trainSet, validationSet, "C5.0")
#'
#'
wracog <- function(train, validation, wrapper, slideWin = 10,
                   threshold = 0.02, classAttr = "Class", ...){
  wrapperByName <- is.character(wrapper)

  if(wrapperByName){
    wrapper <- match.arg(wrapper, presetWrappers)
    wrapper <- switch(wrapper, "KNN" = KNNWrapper, "C5.0" = C50Wrapper)
  }

  trainMethod <- paste("trainWrapper", class(wrapper), sep = ".")
  checkDataset(train)
  checkDataset(validation)
  checkDatasetClass(train, classAttr)
  checkSameShape(train, validation)
  # Extract shape of the dataset and convert both train and validation to numeric
  originalShape <- datasetStructure(train, classAttr)
  train <- toNumeric(train, exclude = classAttr)
  validation <- toNumeric(validation, exclude = classAttr)
  checkAllColumnsNumeric(train, exclude = classAttr)

  if(!is.numeric(slideWin) || !is.numeric(threshold) ||
     slideWin <= 0 || threshold <= 0 || threshold >= 1)
    stop("slideWin must be a positive integer \n threshold must be in ]0,1[")

  # Strip class column from both train and validation, compute
  # minority class and minority examples
  minority <- selectMinority(train, classAttr)
  minorityClass <- originalShape$minorityClass
  trainClass <- train[, classAttr]
  validationClass <- validation[, classAttr]
  train <- train[, names(train) != classAttr]
  validation <- validation[, names(validation) != classAttr]
  # Compute probability distribution of the minority class
  probDist <- genDistribution(minority)
  # Sliding window with the sensitivities of the last slideWin models
  lastSlides <- rep(Inf, slideWin)

  # For custom wrappers, check the needed S3 methods exist before training
  if(!wrapperByName && !trainMethod %in% utils::methods(trainWrapper))
    stop(paste("There doesn't exist a method", trainMethod))

  model <- trainWrapper(wrapper, train, trainClass, ...)
  predictMethod <- paste("predict", class(model), sep = ".")

  if(!wrapperByName && !any(predictMethod %in% utils::methods(predict)))
    stop(paste("There must exist a method predict.class where class\n",
               "is any class of the model returned by", trainMethod))

  newSamples <- data.frame(matrix(ncol = ncol(minority), nrow = 0))

  # Keep generating until the sensitivity of the last slideWin models
  # stabilizes, i.e. its standard deviation falls below threshold
  while(naReplace(stats::sd(lastSlides), Inf) >= threshold){
    minority <- gibbsSampler(probDist, minority)
    prediction <- predict(model, data.frame(minority), ...)
    misclassified <- minority[as.character(prediction) != as.character(minorityClass), ,
                              drop = FALSE]
    newSamples <- rbind(newSamples, misclassified)
    train <- rbind(train, misclassified)
    trainClass <- appendfactor(trainClass, rep(minorityClass, nrow(misclassified)))
    model <- trainWrapper(wrapper, train, trainClass, ...)
    prediction <- predict(model, validation, ...)
    # Sensitivity of the new model measures the quality of the new train set
    qMeasure <- sensitivity(prediction, validationClass, minorityClass)
    lastSlides <- c(qMeasure, lastSlides)[1:slideWin]
  }

  normalizeNewSamples(originalShape, newSamples)
}
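
# Illustrative sketch of the stopping criterion above, with hypothetical
# sensitivity values: with slideWin = 10 and threshold = 0.02, the loop exits
# once the last ten sensitivities are nearly constant:
#
#   lastSlides <- c(0.81, 0.80, 0.81, 0.80, 0.81, 0.80, 0.81, 0.80, 0.81, 0.80)
#   stats::sd(lastSlides) < 0.02  # TRUE, so wracog() would stop generating
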
#' Generic methods to train classifiers
#'
#' @param wrapper the wrapper instance
#' @param train \code{data.frame} of the train dataset without the class column
#' @param trainClass a vector containing the class column for \code{train}
#' @param ... further arguments for \code{wrapper}
#' @return A model on which \code{\link[stats]{predict}} can be called.
#' @seealso \code{\link[stats]{predict}}
#' @export
#'
#' @examples
#' myWrapper <- structure(list(), class="C50Wrapper")
#' trainWrapper.C50Wrapper <- function(wrapper, train, trainClass){
#'   C50::C5.0(train, trainClass)
#' }
trainWrapper <- function(wrapper, train, trainClass, ...){
UseMethod("trainWrapper")
}
# Preset wrappers for wRACOG: C5.0 and KNN
C50Wrapper <- structure(list(), class = "C50Wrapper")
KNNWrapper <- structure(list(), class = "KNNWrapper")

trainWrapper.C50Wrapper <- function(wrapper, train, trainClass, ...){
  C50::C5.0(train, trainClass, ...)
}

# KNN is a lazy learner: the "model" just stores the train data and labels,
# and prediction runs FNN::knn against the stored data
predict.KNN <- function(model, test, ...){
  FNN::knn(model$train, test, model$trainClass, ...)
}

trainWrapper.KNNWrapper <- function(wrapper, train, trainClass, ...){
  myKNN <- structure(list(), class = "KNN")
  myKNN$train <- train
  myKNN$trainClass <- trainClass
  myKNN
}
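
# A further wrapper following the same S3 pattern is easy to sketch. The
# definition below is illustrative only: NBWrapper is hypothetical, is not
# registered in presetWrappers, and assumes the e1071 package is available.
# Since e1071 already ships a predict.naiveBayes method, only the
# trainWrapper method is needed.
NBWrapper <- structure(list(), class = "NBWrapper")

trainWrapper.NBWrapper <- function(wrapper, train, trainClass, ...){
  e1071::naiveBayes(train, trainClass, ...)
}
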
#' Make a tree, represented as an nx2 matrix of edges, directed.
#'
#' Returns a directed tree built up from an undirected one, complying with the
#' condition that each non-root node has a single parent.
#'
#' @param tree An nx2 matrix whose rows denote the undirected arcs of a tree.
#'
#' @return A list with the matrix of directed arcs (\code{edges}), directed
#' from the first coordinate towards the second, and the \code{root} node.
#' @noRd
#'
#' @examples
#' DT <- bnlearn::chow.liu(iris[, names(iris) != "Species"])$arcs
#'
#' tree <- unname(DT[ seq(1, nrow(DT), 2), ])
#' makeDirected(tree)
#'
makeDirected <- function(tree){
  # Visit the two nodes of the first edge, marking them as parents
  parents <- tree[1, ]
  directed <- tree[1, , drop = FALSE]
  tree <- tree[-1, , drop = FALSE]

  # While there still are parents and edges to orient,
  # select a parent and orient the edges towards its children
  while(length(parents) > 0 && nrow(tree) > 0){
    current <- parents[1]
    parents <- parents[-1]
    indexes <- which(tree[, 1] == current | tree[, 2] == current)

    if(length(indexes) > 0){
      # Flip every incident edge so that current is its first coordinate
      newDirected <- t(apply(tree[indexes, , drop = FALSE], MARGIN = 1,
        function(edge){
          if(edge[2] == current){
            edge[c(2, 1)]
          } else{
            edge
          }
        }))

      tree <- tree[-indexes, , drop = FALSE]
      parents <- c(parents, newDirected[, 2])
      directed <- rbind(directed, newDirected)
    }
  }

  list(edges = directed, root = directed[1, 1])
}
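
# Worked trace (illustrative): for the undirected star written as rows
# (2,1), (3,2), (2,4), the first edge fixes 2 and 1 as the initial parents
# and (2,1) as the first directed arc; processing parent 2 then flips (3,2)
# into (2,3) and keeps (2,4), so makeDirected() returns the edges
# (2,1), (2,3), (2,4) with root 2.
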
#' Generate a joint distribution approximation using a Chow-Liu dependence tree
#'
#' @param samples A numerical data.frame of samples whose distribution to
#' approximate.
#'
#' @return An S3 object with class \code{probDistribution}
#' @noRd
genDistribution <- function(samples){
  attrs <- names(samples)
  DT <- bnlearn::chow.liu(samples)$arcs
  # Choose only one sense for the arcs (the odd ones) and make the tree directed
  tree <- unname(DT[ seq(1, nrow(DT), 2), ])
  # Translate attribute names into column indexes
  tree <- apply(tree, MARGIN = c(1, 2), function(attr){
    which(attrs == attr)
  })
  attrs <- seq_along(attrs)
  tree <- makeDirected(tree)
  edges <- tree$edges
  root <- tree$root

  # Conditional probability distributions: genProbs(i, j) returns a matrix
  # whose rows are indexed by the values of the conditioning attribute i,
  # so that row a holds P(X_j | X_i = a)
  genProbs <- function(i, j){
    contingency <- table(samples[, i], samples[, j])
    prop.table(contingency, margin = 1)
  }

  # Marginal probability distribution of each attribute
  absoluteProbs <- lapply(samples, function(col){
    prop.table(table(col))
  })

  # For each attribute, store the attribute it is conditioned to (its parent),
  # the attributes being conditioned by it (its children),
  # and the possible values it takes
  probDist <- lapply(attrs, function(attr){
    # Arcs in which attr is conditioned to another attribute
    conditioned <- which(edges[, 2] == attr)
    # Arcs in which attr is conditioning another attribute
    conditioning <- which(edges[, 1] == attr)
    conditionedProbs <- NULL
    conditioningProbs <- NULL
    values <- as.numeric(names(absoluteProbs[[attr]]))

    if(length(conditioned) > 0){
      conditioned <- edges[conditioned, ]
      conditionedProbs <- genProbs(conditioned[1], conditioned[2])
      conditioned <- conditioned[1]
    } else{
      conditioned <- NULL
    }

    if(length(conditioning) > 0){
      conditioning <- edges[conditioning, 2]
      conditioningProbs <- lapply(conditioning, function(conditioningAttr){
        genProbs(conditioningAttr, attr)
      })
    } else{
      conditioning <- NULL
    }

    list(conditionedProbs = conditionedProbs,
         conditioningProbs = conditioningProbs,
         absoluteProb = absoluteProbs[[attr]],
         conditioned = conditioned,
         conditioning = conditioning,
         values = values)
  })

  class(probDist) <- "probDistribution"
  attr(probDist, "attrs") <- attrs
  attr(probDist, "root") <- root
  probDist
}
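
# Illustrative sketch (not executed by the package): fitting the distribution
# of the minority class of the bundled iris0 dataset, mirroring the first
# steps of racog():
#
#   data(iris0)
#   minoritySamples <- selectMinority(iris0, "Class")
#   pd <- genDistribution(minoritySamples)
#   attr(pd, "root")       # root attribute of the Chow-Liu tree
#   pd[[1]]$absoluteProb   # marginal distribution of the first attribute
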
#' Gibbs Sampler generic S3 method
#'
#' @param probDist A probability distribution
#' @param samples A \code{matrix}. Samples for which to generate new ones.
#'
#' @noRd
gibbsSampler <- function(probDist, samples){
UseMethod("gibbsSampler")
}
#' Gibbs sampler for the probDistribution class
#' @inheritParams gibbsSampler
#'
#' @return A matrix of new samples, one per row of \code{samples}
#' @noRd
gibbsSampler.probDistribution <- function(probDist, samples){
  attrs <- attr(probDist, "attrs")

  t(apply(samples, MARGIN = 1, function(x){
    for(attr in attrs){
      # P(attr | parent), if attr has a parent in the dependence tree
      i <- probDist[[attr]]$conditioned
      first <- list()
      if(!is.null(i)){
        value <- toString(unlist(x[i]))
        first <- probDist[[attr]]$conditionedProbs[value, ]
      }

      # P(attr | child) / P(attr), for each child of attr
      valuesConditioning <- sapply(probDist[[attr]]$conditioning, function(i){
        toString(unlist(x[i]))
      })
      second <- mapply(function(dist, value){
        r <- dist[value, ]
        r / probDist[[attr]]$absoluteProb
      }, probDist[[attr]]$conditioningProbs, valuesConditioning)

      if(attr == attr(probDist, "root")){
        probVectors <- cbind(first, second, probDist[[attr]]$absoluteProb)
      } else{
        probVectors <- cbind(first, second)
      }

      # Prob of attr is the product of the probabilities from the dependence tree
      ithProb <- apply(probVectors, MARGIN = 1, function(r){
        prod(unlist(r))
      })

      # If all probabilities are zero, fall back to a uniform distribution
      if(!any(ithProb != 0))
        ithProb <- rep(1, length(ithProb))

      # Resample the attribute conditioned on the rest of the sample
      x[attr] <- sample(probDist[[attr]]$values, 1, prob = ithProb)
    }
    # Return new sample
    x
  }))
}
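
# Continuing the sketch after genDistribution() (illustrative only): a single
# Gibbs pass over the minority examples, as racog() performs on every
# iteration of its main loop:
#
#   minorityMatrix <- data.matrix(minoritySamples)
#   resampled <- gibbsSampler(pd, minorityMatrix)
#   # resampled has the same shape as minorityMatrix; each row is the old row
#   # with every attribute resampled from its full conditional distribution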