#' NaiveBayes
#'
#' The NaiveBayes package provides an efficient implementation of the popular Naive
#' Bayes classifier, which assumes independence between the feature variables given
#' the class. The core classification function is written in Rcpp. A Gaussian
#' distribution is used for numeric variables. Use \code{NaiveBayes(...)} for model
#' fitting and \code{predict(...)} to obtain the corresponding predictions.
#'
#' The generic function \bold{\code{NaiveBayes()}} detects the class of each feature in the
#' dataset and fits a possibly different distribution to each feature. The predict function
#' takes a fitted NaiveBayes model and a new data set and returns either the raw posterior
#' probabilities generated by the model or the predicted classes themselves.
#'
#' @import Rcpp stats
#' @importFrom Rcpp evalCpp
#' @importFrom Rcpp sourceCpp
#'
#' @param x matrix or data frame with categorical ( character / factor / logical ) or metric ( numeric ) predictors. Please make sure the data type of each column is correctly specified. No NAs are allowed.
#' @param y class vector ( character / factor / logical )
#' @param formula a formula of the form \code{class ~ x1 + x2 + ...}; users can also supply their data via the \code{NaiveBayes(formula, data = ...)} interface. Interactions are not allowed.
#' @param data either a data frame of predictors ( categorical and/or numeric ) or a contingency table.
#' @param laplace value used for Laplace smoothing ( additive smoothing ). Defaults to 0 ( no Laplace smoothing )
#' @param object a fitted object of class "NaiveBayes"
#' @param newdata matrix or dataframe with categorical ( character / factor / logical ) or metric ( numeric ) predictors. Note: if NaiveBayes was used to create the model, then if newdata contains features that were not encountered in the training data, these are omitted from the prediction.
#' @param type if "class", new data points are classified according to the highest posterior probabilities. If "raw", the posterior probabilities for each class are returned.
#' @param threshold value by which zero probabilities or probabilities within the epsilon-range corresponding to metric variables are replaced ( zero probabilities corresponding to categorical variables can be handled with Laplace ( additive ) smoothing ).
#' @param eps value that specifies an epsilon-range to replace zero or close to zero probabilities by \code{threshold}. It applies to metric variables.
#' @param ... not used
#' @return An object of class "NaiveBayes", which has five components:
#' \itemize{
#' \item \code{apriori} Class probabilities for the dependent variable
#' \item \code{results} A list of tables, one for each predictor variable. For each categorical variable a table giving, for each attribute level, the conditional probabilities given the target class. For each numeric variable, a table giving, for each target class, mean and standard deviation of the variable.
#' \item \code{predictors} The list of independent variables
#' \item \code{call} The call that produced this object.
#' \item \code{levels} Levels of the dependent variable
#' }
#'
#' @details 1. Numeric ( metric ) predictors are handled by assuming that, given the class label, they follow a Gaussian distribution; missing values are not included when constructing the tables. Logical variables are treated as categorical ( binary ) variables.
#' 2. The prediction function computes the conditional posterior probability of each class label using Bayes' rule under the assumption of independence between predictors.
#' @note The class "numeric" contains "double" ( double precision floating point numbers ) and "integer".
#' Prior to model fitting, the classes of the columns in the data.frame "data" can easily be checked via:
#' \itemize{
#' \item \code{sapply(data, class)}
#' \item \code{sapply(data, is.numeric)}
#' \item \code{sapply(data, is.double)}
#' \item \code{sapply(data, is.integer)}
#' }
#' @examples
#' x = as.matrix(iris[, 1:4])
#' mymodel1 = NaiveBayes(x, iris[, 5])
#' ## or, equivalently, via the formula interface
#' mymodel1_f = NaiveBayes(Species ~ ., data = iris)
#' predict(mymodel1, iris[, 1:4])
#'
#' x1 <- matrix( rpois(100 * 4, 5), ncol = 4)
#' x2 <- matrix( rpois(50 * 4, 10), ncol = 4)
#' x <- rbind(x1, x2)
#' ina <- c( rep(1, 100), rep(2, 50) )
#' mymodel2 = NaiveBayes(x, ina)
#' predict(mymodel2, x)
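#' ## posterior probabilities instead of class labels ( type = "raw" and the
#' ## threshold / eps arguments are documented above )
#' predict(mymodel2, x, type = "raw")
#' predict(mymodel2, x, type = "raw", threshold = 0.01, eps = 1e-6)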
#' @useDynLib NaiveBayes, .registration = TRUE
#'
#' @rdname NaiveBayes
#' @export NaiveBayes
NaiveBayes = function(x, ...)
UseMethod("NaiveBayes", x)
#' @rdname NaiveBayes
#' @export
NaiveBayes.default = function(x, y, laplace = 0, ...){
  # organizing function input
  call = match.call()
  x = as.data.frame(x)
  Name_y = deparse(substitute(y))
  # record the class levels before y is coerced to character below
  levels_y = if (is.logical(y)) {
    c(FALSE, TRUE)
  } else if (is.factor(y)) {
    levels(y)
  } else {
    sort(unique(as.character(y)))
  }
  y = as.character(y)
  # generate the frequency tables for each variable, implemented via Rcpp
  apriori = table(y)
  results = mean_sd(x, y, laplace)
  # formatting function output
  for (i in seq_along(results)){
    names(dimnames(results[[i]])) = c(Name_y, colnames(x)[i])
  }
  names(dimnames(apriori)) = Name_y
  structure(list(apriori = apriori / sum(apriori),
                 results = results,
                 levels = levels_y,
                 predictors = colnames(x),
                 call = call),
            class = "NaiveBayes")
}
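# The heavy lifting above is done by the Rcpp routine mean_sd(). The pure-R
# sketch below documents the table layout that predict() assumes it produces
# ( an illustrative reference implementation inferred from that usage, not
# the exported routine ): for each predictor, either a class-conditional
# probability table ( categorical, with Laplace smoothing ) or a
# class-by-( mean, sd ) matrix ( numeric ).
mean_sd_sketch = function(x, y, laplace = 0) {
  lapply(x, function(col) {
    if (is.numeric(col)) {
      # one row per class; column 1 = mean, column 2 = standard deviation
      t(sapply(split(col, y), function(v) c(mean(v), stats::sd(v))))
    } else {
      # rows = classes, columns = attribute levels, smoothed and normalized
      tab = table(y, as.character(col)) + laplace
      tab / rowSums(tab)
    }
  })
}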
#' @rdname NaiveBayes
#' @export
# for formula input
NaiveBayes.formula = function(formula, data, laplace = 0, ...) {
  # organizing function input
  call = match.call()
  fm = match.call(expand.dots = FALSE)
  fm$... = NULL
  fm$laplace = NULL
  # rewrite the call into a stats::model.frame() call and evaluate it
  fm[[1L]] = quote(stats::model.frame)
  fm = eval(fm, parent.frame())
  tms = attr(fm, "terms")
  Y = model.extract(fm, "response")
  X = fm[, -attr(tms, "response"), drop = FALSE]
  # send to .default function for data processing
  return(NaiveBayes(X, Y, laplace = laplace, ...))
}
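# A quick equivalence sketch ( illustrative only, using base R's iris data ):
# since the formula method merely extracts the model frame and delegates to
# the default method, both interfaces should give identical predictions.
formula_equivalence_demo = function() {
  m_formula = NaiveBayes(Species ~ ., data = iris)
  m_default = NaiveBayes(iris[, 1:4], iris[, 5])
  identical(predict(m_formula, iris[, 1:4]),
            predict(m_default, iris[, 1:4]))  # expected: TRUE
}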
#' @rdname NaiveBayes
#' @export
# output formatting
print.NaiveBayes = function(x, ...) {
  cat("\nNaive Bayes Classifier\n\n")
  cat("Call:\n")
  print(x$call)
  cat("\nA-priori probabilities:\n")
  print(x$apriori)
  cat("\nPredictors:\n")
  print(x$predictors)
  cat("\nConditional probabilities:\n")
  for (i in x$results) { print(i); cat("\n") }
}
#' @rdname NaiveBayes
#' @export
predict.NaiveBayes = function(object, newdata, type = c("class", "raw"), threshold = 0.001, eps = 0, ...) {
  # organizing function input
  type = match.arg(type)
  newdata = as.data.frame(newdata)
  # map the training predictors onto the columns of newdata; predictors that
  # do not appear in newdata are omitted from the prediction
  neworder = match(object$predictors, colnames(newdata))
  found = which(!is.na(neworder))
  probs = matrix(0, length(object$apriori), nrow(newdata))
  # generate the probability of each observation given each level of y; for
  # the theoretical background of Naive Bayes, please refer to the vignettes
  # of this package
  for (j in 1:nrow(newdata)){
    # work on the log scale to mitigate numerical underflow
    probs[ , j] = rowSums(log(sapply(found, function(index){
      # discrete and continuous predictors are handled differently
      value = newdata[j, neworder[index]]
      if (is.numeric(value)){
        oneresult = object$results[[index]]
        # guard against zero variance before evaluating the Gaussian density
        oneresult[, 2][oneresult[, 2] <= eps] = threshold
        # continuous predictors are assumed to follow a Gaussian distribution
        prob = stats::dnorm(value, oneresult[, 1], oneresult[, 2])
      } else {
        prob = object$results[[index]][, as.character(value)]
      }
      # replace zero or near-zero probabilities by the threshold
      prob[prob <= eps] = threshold
      return(prob)
    })))
  }
  # convert the log-likelihoods back to the probability scale; subtracting the
  # per-observation maximum first avoids underflow ( the constant cancels in
  # the normalization below )
  probs = sweep(probs, 2, apply(probs, 2, max))
  probs = t(exp(probs))
  # weight by the a-priori class probabilities and normalize
  apriori = object$apriori
  for (i in 1:length(apriori)) {
    probs[, i] = probs[, i] * apriori[i]
  }
  sums = rowSums(probs)
  sums[sums == 0] = 1
  probs = probs / sums
  colnames(probs) = rownames(object$results[[1]])
  # output "class" or "raw"
  if (type == "class") {
    return(as.factor(colnames(probs)[max.col(probs, ties.method = "first")]))
  } else {
    return(probs)
  }
}
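# A minimal sketch of the computation predict() performs for one observation
# with numeric predictors only, written directly from Bayes' rule under the
# independence assumption ( an illustrative helper, not part of the API ):
posterior_sketch = function(object, row) {
  # class-conditional log-likelihoods, one column per predictor
  log_lik = sapply(seq_along(object$predictors), function(index) {
    tab = object$results[[index]]  # rows = classes; col 1 = mean, col 2 = sd
    log(stats::dnorm(row[[index]], tab[, 1], tab[, 2]))
  })
  # posterior is proportional to prior times the product of likelihoods
  unnorm = exp(rowSums(log_lik)) * as.numeric(object$apriori)
  unnorm / sum(unnorm)
}
# e.g. posterior_sketch(mymodel2, as.data.frame(x)[1, ]) should agree with
# predict(mymodel2, x, type = "raw")[1, ] up to the threshold / eps guards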