# R/NaiveBayes.R

#' NaiveBayes
#'
#' This NaiveBayes package provides an efficient implementation of the popular Naive
#' Bayes classifier, which assumes independence between the feature variables. The core
#' classification function is written in Rcpp. Numeric variables are modeled with a
#' Gaussian distribution. Use \code{NaiveBayes(...)} for model fitting and
#' \code{predict(...)} to obtain the corresponding predictions.
#'
#' The generic function \bold{\code{NaiveBayes()}} detects the class of each feature in the
#' dataset and may assume a different distribution for each feature. The predict method
#' takes a fitted NaiveBayes model and a new data set and produces classifications, either
#' as the raw posterior probabilities generated by the model or as the predicted classes themselves.
#'
#' @import Rcpp stats
#' @importFrom Rcpp evalCpp
#' @importFrom Rcpp sourceCpp
#'
#' @param x matrix or data frame with categorical ( character / factor / logical ) or metric ( numeric ) predictors. Please make sure the data type of each column is correctly specified. No NAs are allowed.
#' @param y class vector ( character / factor / logical )
#' @param formula alternatively, data can be supplied via the NaiveBayes ( formula, data = ... ) interface, with a formula of the form "class ~ x1 + x2 + x3 ...". Interactions are not allowed.
#' @param data either a data frame of predictors ( categorical and/or numeric ) or a contingency table.
#' @param laplace value used for Laplace smoothing ( additive smoothing ). Defaults to 0 ( no Laplace smoothing )
#' @param object a fitted object of class "NaiveBayes"
#' @param newdata matrix or data frame with categorical ( character / factor / logical ) or metric ( numeric ) predictors. Note: if newdata contains features that were not encountered in the training data, these are omitted from the prediction.
#' @param type if "class", new data points are classified according to the highest posterior probabilities. If "raw", the posterior probabilities for each class are returned.
#' @param threshold value by which zero probabilities or probabilities within the epsilon-range corresponding to metric variables are replaced ( zero probabilities corresponding to categorical variables can be handled with Laplace ( additive ) smoothing ).
#' @param eps value that specifies an epsilon-range to replace zero or close to zero probabilities by \code{threshold}. It applies to metric variables.
#' @param ... not used
#' @return An object of class "NaiveBayes", which has five components:
#' \itemize{
#' \item \code{apriori} Class probabilities for the dependent variable
#' \item \code{results}  A list of tables, one for each predictor variable. For each categorical variable a table giving, for each attribute level, the conditional probabilities given the target class. For each numeric variable, a table giving, for each target class, mean and standard deviation of the variable.
#' \item \code{predictors}  The list of independent variables
#' \item \code{call}  The call that produced this object.
#' \item \code{levels}  Levels of the dependent variable
#' }
#'
#' @details 1. Numeric ( metric ) predictors are handled by assuming that, given the class label, they follow a Gaussian distribution; missing values are not included when constructing the tables. Logical variables are treated as categorical ( binary ) variables.
#' 2. The prediction function computes the conditional posterior probability of each class label using Bayes' rule under the assumption of independence of the predictors.
#' @note The class "numeric" contains "double" ( double precision floating point numbers ) and "integer".
#' Prior to model fitting, the classes of the columns in the data.frame "data" can be easily checked via:
#' \itemize{
#' \item \code{sapply(data, class)}
#' \item \code{sapply(data, is.numeric)}
#' \item \code{sapply(data, is.double)}
#' \item \code{sapply(data, is.integer)}
#' }
#'
#' @examples
#' x = as.matrix(iris[, 1:4])
#' mymodel1 = NaiveBayes(x, iris[, 5])
#' ## or, equivalently, via the formula interface
#' mymodel1_f = NaiveBayes(Species ~ ., data = iris)
#' predict(mymodel1, iris[, 1:4])
#'
#' x1 <- matrix(rpois(100 * 4, 5), ncol = 4)
#' x2 <- matrix(rpois(50 * 4, 10), ncol = 4)
#' x <- rbind(x1, x2)
#' ina <- c(rep(1, 100), rep(2, 50))
#' mymodel2 = NaiveBayes(x, ina)
#' predict(mymodel2, x)
#'
#' @useDynLib NaiveBayes, .registration = TRUE
#' @rdname NaiveBayes
#' @export NaiveBayes
NaiveBayes = function(x, ...)
  UseMethod("NaiveBayes", x)


#' @rdname NaiveBayes
#' @export
NaiveBayes.default = function(x, y, laplace = 0, ...){

  # organizing function input
  call = match.call()
  x = as.data.frame(x)
  Name_y = deparse(substitute(y))
  # record the class levels before y is coerced to character below
  lev = if (is.logical(y)) c(FALSE, TRUE) else levels(as.factor(y))
  y = as.character(y)

  # generate frequency / summary tables for each variable, implemented via Rcpp
  apriori = table(y)
  results = mean_sd(x, y, laplace)

  # formatting function output
  for (i in seq_along(results)){
    names(dimnames(results[[i]])) = c(Name_y, colnames(x)[i])
  }
  names(dimnames(apriori)) = Name_y

  structure(list(apriori = apriori / sum(apriori),
                 results = results,
                 levels = lev,
                 predictors = colnames(x),
                 call = call
  ),
  class = "NaiveBayes"
  )
}
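
# The heavy lifting above is done by the Rcpp helper mean_sd(). For reference,
# here is a minimal pure-R sketch of the per-predictor tables it is assumed to
# return, matching how predict.NaiveBayes() indexes object$results below: for
# a numeric predictor, a class-by-2 matrix of means and standard deviations;
# for a categorical predictor, a class-by-level table of Laplace-smoothed
# conditional probabilities. The name mean_sd_R and this exact layout are
# illustrative assumptions, not the package's actual Rcpp code.
mean_sd_R = function(x, y, laplace = 0) {
  lapply(x, function(col) {
    if (is.numeric(col)) {
      # class-conditional mean ( column 1 ) and standard deviation ( column 2 )
      t(sapply(split(col, y), function(v) c(mean(v), sd(v))))
    } else {
      # Laplace-smoothed conditional probabilities P( level | class )
      tab = table(y, col) + laplace
      tab / rowSums(tab)
    }
  })
}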

#' @rdname  NaiveBayes
#' @export
# for formula input
NaiveBayes.formula = function(formula, data, laplace = 0, ...) {

  # organizing function input
  call = match.call()
  fm = match.call(expand.dots = FALSE)
  fm$... = NULL
  fm$laplace = NULL
  fm[[1L]] = quote(stats::model.frame)
  fm = eval(fm, parent.frame())
  tms = attr(fm, "terms")
  Y = model.extract(fm, "response")
  X = fm[, -attr(tms, "response"), drop = FALSE]

  # send to .default function for data processing
  return(NaiveBayes(X, Y, laplace = laplace, ...))
}
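
# A small self-contained check ( assuming the built-in iris data, as in the
# @examples above ) that the formula interface and the default interface fit
# the same model; wrapped in a function so that nothing executes at package
# load time. The function name is illustrative only and is not exported.
formula_vs_default_demo = function() {
  fit_f = NaiveBayes(Species ~ ., data = iris)
  fit_d = NaiveBayes(iris[, 1:4], iris[, 5])
  # the fitted tables should agree; only the recorded calls and the
  # dimnames labels for y differ between the two interfaces
  all.equal(fit_f$results, fit_d$results, check.attributes = FALSE)
}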


#' @rdname NaiveBayes
#' @export
# output formatting
print.NaiveBayes = function(x, ...) {
  cat("\nNaive Bayes Classifier for Discrete Predictors\n\n")
  cat("Call:\n")
  print(x$call)
  cat("\nA-priori probabilities:\n")
  print(x$apriori)
  cat("\nPredictors:\n")
  print(x$predictors)
  cat("\nConditional probabilities:\n")
  for (i in x$results) {print(i); cat("\n")}
}



#' @rdname NaiveBayes
#' @export
predict.NaiveBayes = function(object, newdata, type = c("class", "raw"), threshold = 0.001, eps = 0, ...) {
  # organizing function input
  type = match.arg(type)
  newdata = as.data.frame(newdata)
  # map the training predictors onto the columns of newdata; features of
  # newdata that were not seen in training are thereby ignored
  neworder = match(object$predictors, colnames(newdata))
  probs = matrix(0, length(object$apriori), nrow(newdata))

  # generate the log-probability of each observation under each level of y;
  # for more explanation of the theory behind Naive Bayes, please refer to
  # the vignettes of this package
  for (j in seq_len(nrow(newdata))){
    # use logs to avoid numerical underflow
    probs[ , j] = rowSums(log(sapply(seq_along(neworder), function(index){
      # handle discrete and continuous predictors differently, indexing
      # newdata through neworder so that column order does not matter
      if (is.numeric(newdata[j, neworder[index]])){
        oneresult = object$results[[index]]
        # replace zero ( or near-zero ) standard deviations by the threshold
        oneresult[, 2][oneresult[, 2] <= eps] = threshold
        # if continuous, we assume a Gaussian distribution
        return(stats::dnorm(newdata[j, neworder[index]], oneresult[, 1], oneresult[, 2]))
      } else {
        prob = object$results[[index]][, as.character(newdata[j, neworder[index]])]
        prob[prob <= eps] = threshold
        return(prob)
      }})))
  }

  # convert the log scores back to the probability scale; subtracting each
  # observation's maximum log score first ( the log-sum-exp trick ) guards
  # against underflow and cancels out in the normalization below
  probs = sweep(probs, 2, apply(probs, 2, max))
  probs = t(exp(probs))
  apriori = object$apriori
  for (i in seq_along(apriori)) {
    probs[, i] = probs[, i] * apriori[i]
  }
  sums = rowSums(probs)
  sums[sums == 0] = 1
  probs = probs / sums
  colnames(probs) = rownames(object$results[[1]])

  # output "class" or "raw"
  if (type == "class") {
    class = as.factor(rownames(object$results[[1]])[max.col(probs, ties.method = "first")])
    return(class)
  } else {
    return(probs)
  }
}
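
# A minimal sketch, in plain R, of the per-observation posterior computation
# performed inside predict.NaiveBayes() above: `loglik` stands for one column
# of `probs` ( the summed log densities / log probabilities for each class )
# and `apriori` for the prior class probabilities. Shifting by the maximum
# before exponentiating is the same log-sum-exp guard against underflow used
# above. The function name is illustrative and is not part of the package API.
posterior_from_loglik = function(loglik, apriori) {
  logpost = log(apriori) + loglik     # log P( y ) + sum_k log P( x_k | y )
  logpost = logpost - max(logpost)    # stabilize before exponentiating
  post = exp(logpost)
  post / sum(post)                    # normalize to posterior probabilities
}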