Nothing
# VALIDATION: Bernoulli and Categorical =======================================
#
# Benchmarking models:
# - e1071 and klaR for Bernoulli and categorical
#
library(e1071)
library(klaR)
#
# naive_bayes vs bernoulli_naive_bayes vs e1071::naiveBayes vs klaR::NaiveBayes
#
#
# CONTENT: =====================================================================
#
# 0) Simulated data
# 1) naive_bayes
# 1.1) Bernoulli
# naive_bayes vs bernoulli_naive_bayes vs e1071::naiveBayes vs klaR::NaiveBayes
# 1.1.1) Compared for laplace = 0, laplace > 0 and different priors
# * parameter estimates
# * posterior probabilities
# 1.1.2) NAs in training data or in test data
# * parameter estimates
# * posterior probabilities
# 1.1.3) Special cases
# 1.2) Categorical
# 1.2.1) Special cases
# (Everything works the same way, thus, only special cases are covered)
# DATA for naive_bayes: ========================================================
n <- 100
n_test <- 5
set.seed(3)
data <- data.frame(class = sample(c("classA", "classB"), n, TRUE),
bern = sample(LETTERS[1:2], n, TRUE),
bern2 = sample(LETTERS[3:4], n, TRUE),
cat = sample(letters[1:3], n, TRUE),
cat2 = sample(letters[4:7], n, TRUE),
norm = rnorm(n),
norm2 = rnorm(n, 100, 15),
count = rpois(n, lambda = 5),
count2 = rpois(n, lambda = 50))
ind_train <- 1:(n-n_test)
ind_test <- ((n-n_test)+1):n
train <- data[ind_train, ]
test <- data[ind_test, -1]
test_bern <- test[ c("bern", "bern2")]
# EQUIVALENT DATA for bernoulli_naive_bayes: ===================================
ytrain <- train$class
X_bern <- sapply(data[ ,c("bern", "bern2")], function(x) as.numeric(x) - 1)
X_bern_train <- X_bern[ind_train, ,drop = FALSE]
X_bern_test <- X_bern[ind_test, ,drop = FALSE]
# 1) naive_bayes: ==============================================================
# 1.1) Bernoulli: ==============================================================
#
# naive_bayes vs bernoulli_naive_bayes vs e1071::naiveBayes
#
# Overview: models are fitted based on two simulated factors each with two levels.
#
# 1.1.1) =======================================================================
# 0 = Additive smoothing: ======================================================
nb_bern <- naive_bayes(class ~ bern + bern2, train)
bnb <- bernoulli_naive_bayes(x = X_bern_train, y = ytrain)
e10_bern <- e1071::naiveBayes(class ~ bern + bern2, train)
# Compare parameter estimates for bern
tables(nb_bern, 1)
tables(bnb, 1)
t(e10_bern$tables$bern)
# Check for differences
sum(abs(tables(nb_bern, 1)[[1]] - tables(nb_bern, 1)[[1]]))
sum(abs(tables(nb_bern, 1)[[1]] - t(e10_bern$tables$bern)))
# Check if probabilities sum up to 1 column-wise
colSums(tables(nb_bern, 1)[[1]])
colSums(tables(bnb, 1)[[1]])
colSums(t(e10_bern$tables$bern))
# Compare parameter estimates for bern2
tables(nb_bern, 2)
tables(bnb, 2)
t(e10_bern$tables$bern2)
# Check for differences
sum(abs(tables(nb_bern, 2)[[1]] - tables(nb_bern, 2)[[1]]))
sum(abs(tables(nb_bern, 2)[[1]] - t(e10_bern$tables$bern2)))
# Check if probabilities sum up to 1 column-wise
colSums(tables(nb_bern, 2)[[1]])
colSums(tables(bnb, 2)[[1]])
colSums(t(e10_bern$tables$bern2))
# Check posterior probabilities
pred_nb_bern <- predict(nb_bern, newdata = test_bern, type = "prob")
pred_nb_bern2 <- nb_bern %prob% test_bern
pred_bnb <- predict(bnb, newdata = X_bern_test, type = "prob")
pred_e10_bern <- predict(e10_bern, newdata = test_bern, type = "raw")
pred_nb_bern
pred_nb_bern2
pred_bnb
pred_e10_bern
# Check for absolute differences
sum(abs(pred_nb_bern - pred_nb_bern2))
sum(abs(pred_nb_bern - pred_bnb))
sum(abs(pred_nb_bern - pred_e10_bern))
# 0 < Additive smoothing: ======================================================
smooth <- 0.5
nb_bern_smoothed <- naive_bayes(class ~ bern + bern2, train, laplace = smooth)
bnb_smoothed <- bernoulli_naive_bayes(x = X_bern_train, y = ytrain, laplace = smooth)
e10_bern_smoothed <- e1071::naiveBayes(class ~ bern + bern2, train, laplace = smooth)
# Compare parameter estimates for bern
tables(nb_bern_smoothed, 1)
tables(bnb_smoothed, 1)
t(e10_bern_smoothed$tables$bern)
# Check for differences
sum(abs(tables(nb_bern_smoothed, 1)[[1]] - tables(nb_bern_smoothed, 1)[[1]]))
sum(abs(tables(nb_bern_smoothed, 1)[[1]] - t(e10_bern_smoothed$tables$bern)))
# Check if probabilities sum up to 1 column-wise
colSums(tables(nb_bern_smoothed, 1)[[1]])
colSums(tables(bnb_smoothed, 1)[[1]])
colSums(t(e10_bern_smoothed$tables$bern))
# Compare parameter estimates for bern2
tables(nb_bern_smoothed, 2)
tables(bnb_smoothed, 2)
t(e10_bern_smoothed$tables$bern2)
# Check for differences
sum(abs(tables(nb_bern_smoothed, 2)[[1]] - tables(nb_bern_smoothed, 2)[[1]]))
sum(abs(tables(nb_bern_smoothed, 2)[[1]] - t(e10_bern_smoothed$tables$bern2)))
# Check if probabilities sum up to 1 column-wise
colSums(tables(nb_bern_smoothed, 2)[[1]])
colSums(tables(bnb_smoothed, 2)[[1]])
colSums(t(e10_bern_smoothed$tables$bern2))
# Check posterior probabilities
pred_nb_bern_smoothed <- predict(nb_bern_smoothed, newdata = test_bern, type = "prob")
pred_nb_bern2_smoothed <- nb_bern_smoothed %prob% test_bern
pred_bnb_smoothed <- predict(bnb_smoothed, newdata = X_bern_test, type = "prob")
pred_e10_bern_smoothed <- predict(e10_bern_smoothed, newdata = test_bern, type = "raw")
pred_nb_bern_smoothed
pred_nb_bern2_smoothed
pred_bnb_smoothed
pred_e10_bern_smoothed
# Check for absolute differences
sum(abs(pred_nb_bern_smoothed - pred_nb_bern2_smoothed))
sum(abs(pred_nb_bern_smoothed - pred_bnb_smoothed))
sum(abs(pred_nb_bern_smoothed - pred_e10_bern_smoothed))
# Custom priors: ===============================================================
# c(0,1) prior: ----------------------------------------------------------------
# Fix in 0.9.7 NaN when one class has prior probability == 0
prior1 <- c(0, 1)
nb_bern_prior1 <- naive_bayes(class ~ bern + bern2, train, prior = prior1)
bnb_prior1 <- bernoulli_naive_bayes(x = X_bern_train, y = ytrain, prior = prior1)
klar_bern_prior1 <- klaR::NaiveBayes(class ~ bern + bern2, train, prior = prior1)
# Check posterior probabilities
pred_nb_bern_prior1 <- predict(nb_bern_prior1, newdata = test_bern, type = "prob")
pred_nb_bern2_prior1 <- nb_bern_prior1 %prob% test_bern
pred_bnb_prior1 <- predict(bnb_prior1, newdata = X_bern_test, type = "prob")
pred_klar_bern_prior1 <- predict(klar_bern_prior1, newdata = test_bern)$posterior
pred_nb_bern_prior1
pred_nb_bern2_prior1
pred_bnb_prior1
pred_klar_bern_prior1
# c(0.0000001, 1-0.0000001) prior: ----------------------------------------------------------------
prior2 <- c(0.0000001, 1-0.0000001)
nb_bern_prior2 <- naive_bayes(class ~ bern + bern2, train, prior = prior2)
bnb_prior2 <- bernoulli_naive_bayes(x = X_bern_train, y = ytrain, prior = prior2)
klar_bern_prior2 <- klaR::NaiveBayes(class ~ bern + bern2, train, prior = prior2)
# Check posterior probabilities
pred_nb_bern_prior2 <- predict(nb_bern_prior2, newdata = test_bern, type = "prob")
pred_nb_bern2_prior2 <- nb_bern_prior2 %prob% test_bern
pred_bnb_prior2 <- predict(bnb_prior2, newdata = X_bern_test, type = "prob")
pred_klar_bern_prior2 <- predict(klar_bern_prior2, newdata = test_bern)$posterior
pred_nb_bern_prior2
pred_nb_bern2_prior2
pred_bnb_prior2
pred_klar_bern_prior2
# Check for absolute differences
sum(abs(pred_nb_bern_prior2 - pred_nb_bern2_prior2))
sum(abs(pred_nb_bern_prior2 - pred_bnb_prior2))
sum(abs(pred_nb_bern_prior2 - pred_klar_bern_prior2))
# 1.1.2) NAs ===================================================================
# Missing values in train data: ------------------------------------------------
train_na <- train[,c("class", "bern", "bern2")]
# Add NAs at random
N_na1 <- 5
N_na2 <- 5
N_nay <- 5
train_na[sample(1:length(train_na$bern), N_na1), "bern"] <- NA
train_na[sample(1:length(train_na$bern), N_na2), "bern2"] <- NA
train_na[sample(1:length(train_na$bern), N_na2), "class"] <- NA
ytrain_na <- train_na$class
X_bern_train_na <- sapply(train_na[ ,c("bern", "bern2")], function(x) as.numeric(x) - 1)
nb_bern_na <- naive_bayes(class ~ bern + bern2, train_na)
bnb_na <- bernoulli_naive_bayes(x = X_bern_train_na, y = ytrain_na)
e10_bern_na <- e1071::naiveBayes(class ~ bern + bern2, train_na)
# Compare parameter estimates for bern
tables(nb_bern_na, 1)
tables(bnb_na, 1)
t(e10_bern_na$tables$bern)
# Check for differences
sum(abs(tables(nb_bern_na, 1)[[1]] - tables(nb_bern_na, 1)[[1]]))
sum(abs(tables(nb_bern_na, 1)[[1]] - t(e10_bern_na$tables$bern)))
# Check if probabilities sum up to 1 column-wise
colSums(tables(nb_bern_na, 1)[[1]])
colSums(tables(bnb_na, 1)[[1]])
colSums(t(e10_bern_na$tables$bern))
# Compare parameter estimates for bern2
tables(nb_bern_na, 2)
tables(bnb_na, 2)
t(e10_bern_na$tables$bern2)
# Check for differences
sum(abs(tables(nb_bern_na, 2)[[1]] - tables(nb_bern_na, 2)[[1]]))
sum(abs(tables(nb_bern_na, 2)[[1]] - t(e10_bern_na$tables$bern2)))
# Check if probabilities sum up to 1 column-wise
colSums(tables(nb_bern_na, 2)[[1]])
colSums(tables(bnb_na, 2)[[1]])
colSums(t(e10_bern_na$tables$bern2))
# Check posterior probabilities
pred_nb_bern_na <- predict(nb_bern_na, newdata = test_bern, type = "prob")
pred_nb_bern2_na <- nb_bern_na %prob% test_bern
pred_bnb_na <- predict(bnb_na, newdata = X_bern_test, type = "prob")
pred_e10_bern_na <- predict(e10_bern_na, newdata = test_bern, type = "raw")
pred_nb_bern_na
pred_nb_bern2_na
pred_bnb_na
pred_e10_bern_na
# Check for absolute differences
sum(abs(pred_nb_bern_na - pred_nb_bern2_na))
sum(abs(pred_nb_bern_na - pred_bnb_na))
sum(abs(pred_nb_bern_na - pred_e10_bern_na))
# Missing values in test data: -------------------------------------------------
test_bern_na <- test_bern
X_bern_test_na <- X_bern_test
# Add two missing values
test_bern_na[1,"bern"] <- NA
X_bern_test_na[1,"bern"] <- NA
test_bern_na[4,"bern2"] <- NA
X_bern_test_na[4,"bern2"] <- NA
# Check posterior probabilities
pred_nb_bern_na_test <- predict(nb_bern_na, newdata = test_bern_na, type = "prob")
pred_nb_bern2_na_test <- nb_bern_na %prob% test_bern_na
pred_bnb_na_test <- predict(bnb_na, newdata = X_bern_test_na, type = "prob")
pred_e10_bern_na_test <- predict(e10_bern_na, newdata = test_bern_na, type = "raw")
pred_nb_bern_na_test
pred_nb_bern2_na_test
pred_bnb_na_test
pred_e10_bern_na_test
# Check for absolute differences
sum(abs(pred_nb_bern_na - pred_nb_bern2_na))
sum(abs(pred_nb_bern_na - pred_bnb_na))
sum(abs(pred_nb_bern_na - pred_e10_bern_na))
# SPECIAL CASES: ===============================================================
# - One variable and one observation in test
nb_bern <- naive_bayes(class ~ bern + bern2, train)
nb_bern %prob% test[1,1, drop = FALSE]
bnb <- bernoulli_naive_bayes(x = X_bern_train, y = ytrain)
bnb %prob% X_bern_test[1,1, drop = FALSE]
e10_bern <- e1071::naiveBayes(class ~ bern + bern2, train)
predict(e10_bern, newdata = test[1,1, drop = FALSE], type = "raw")
# 1.2) Categorical: ============================================================
train_cat <- train[ ,c("class", "cat", "cat2")]
test_cat <- test[ ,c("cat", "cat2")]
train_cat[1,"class"] <- NA
train_cat[5,"cat"] <- NA
train_cat[23,"cat2"] <- NA
test_cat[1,"cat"] <- NA
# Train models
nb_cat <- naive_bayes(class ~ cat + cat2, data = train_cat)
e10_cat <- e1071::naiveBayes(class ~ cat + cat2, data = train_cat)
tables(nb_cat, 1)
t(e10_cat$tables$cat)
# Check absolute differences of parameter estimates
sum(abs(tables(nb_cat, 1)[[1]] - t(e10_cat$tables$cat)))
# Check posterior probabilities
pred_nb_cat <- nb_cat %prob% test_cat
pred_e10_cat <- predict(e10_cat, newdata = test_cat, type = "raw")
pred_nb_cat
pred_e10_cat
# Check for absolute differences
sum(abs(pred_nb_cat - pred_e10_cat))
# 1.2.1) SPECIAL CASES: --------------------------------------------------------
# Check when there are too few/many levels in predictor variable in test data
# - Too many levels
test_cat_more_levels <- test_cat
levels(test_cat_more_levels$cat) <- c(levels(test_cat_more_levels$cat), "newlevel")
test_cat_more_levels$cat[2] <- "newlevel"
nb_cat %prob% test_cat_more_levels
# Informative error is correctly given (issues with the data that have to be
# resolved by the user)
# - Too many levels
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.