inst/doc/intro_naivebayes.R

## ----eval=FALSE---------------------------------------------------------------
#  install.packages("naivebayes")

## ----eval=FALSE---------------------------------------------------------------
#  install.packages("path_to_tar.gz", repos = NULL, type = "source")

## ----eval=FALSE---------------------------------------------------------------
#  library(naivebayes)

## ----fig.width=6,fig.height=6/1.618-------------------------------------------
# Section: General usage - Training with formula interface
library(naivebayes)

# Simulate data
n <- 100
set.seed(1)
data <- data.frame(class = sample(c("classA", "classB"), n, TRUE),
                   bern = sample(LETTERS[1:2], n, TRUE),
                   cat  = sample(letters[1:3], n, TRUE),
                   logical = sample(c(TRUE,FALSE), n, TRUE),
                   norm = rnorm(n),
                   count = rpois(n, lambda = c(5,15)))

# Split data into train and test sets
train <- data[1:95, ]
test <- data[96:100, -1]

# General usage via formula interface
nb <- naive_bayes(class ~ ., train, usepoisson = TRUE)

# Show summary of the model
summary(nb)

# Classification
predict(nb, test, type = "class") # nb %class% test

# Posterior probabilities
predict(nb, test, type = "prob") # nb %prob% test

# Tabular and visual summaries of fitted distributions for a given feature
tables(nb, which = "norm")
plot(nb, which = "norm")

# Get names of assigned class conditional distributions
get_cond_dist(nb)
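
# A quick sanity check, as a sketch: in-sample accuracy on the training set
# (a rough check only, not an estimate of out-of-sample performance)
pred_train <- predict(nb, train[-1])
mean(pred_train == train$class)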

## -----------------------------------------------------------------------------
# Section: Model fitting and classification using matrix/data.frame - vector interface
library(naivebayes)

# Simulate data
n_vars <- 10
n <- 1e6
y <- sample(x = c("a", "b"), size = n, replace = TRUE)

# Discrete features
X1 <- matrix(data = sample(letters[5:9], n * n_vars, TRUE),
             ncol = n_vars)
X1 <- as.data.frame(X1)

# Fit a Naive Bayes model using matrix/data.frame - vector interface
nb_cat <- naive_bayes(x = X1, y = y)

# Show summary of the model
summary(nb_cat)

# Classification
system.time(pred2 <- predict(nb_cat, X1))
head(pred2)
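
# As a sketch, posterior probabilities can be obtained in the same way;
# each row should sum to 1 across the two classes
system.time(post2 <- predict(nb_cat, X1, type = "prob"))
head(post2)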

## ----fig.width=6,fig.height=6/1.618-------------------------------------------
# Section: Model estimation through a specialized fitting function
library(naivebayes)

# Prepare data (the specialized fitting functions require a numeric matrix and a class vector)
data(iris)
M <- as.matrix(iris[, 1:4])
y <- iris$Species

# Train the Gaussian Naive Bayes
gnb <- gaussian_naive_bayes(x = M, y = y)
summary(gnb)

# Parameter estimates
coef(gnb)
coef(gnb)[c(TRUE, FALSE)] # show only means
tables(gnb, 1)

# Visualization of fitted distributions
plot(gnb, which = 1)

# Classification
head(predict(gnb, newdata = M, type = "class")) # head(gnb %class% M)

# Posterior probabilities
head(predict(gnb, newdata = M, type = "prob")) # head(gnb %prob% M)

# Equivalent calculation via naive_bayes
gnb2 <- naive_bayes(M, y)
head(predict(gnb2, newdata = M, type = "prob"))
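
# As a sketch of the equivalence above: the posterior probabilities from
# gaussian_naive_bayes and naive_bayes should agree up to numerical precision
# (attributes such as dimnames may differ)
all.equal(predict(gnb, newdata = M, type = "prob"),
          predict(gnb2, newdata = M, type = "prob"),
          check.attributes = FALSE)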

## ----eval=FALSE---------------------------------------------------------------
#  library(naivebayes)
#  
#  # Prepare data: --------------------------------------------------------
#  data(iris)
#  iris2 <- iris
#  N <- nrow(iris2)
#  n_new_factors <- 3
#  factor_names <- paste0("level", 1:n_new_factors)
#  
#  # Add a new categorical feature, where one level is very unlikely
#  set.seed(2)
#  iris2$new <- factor(sample(paste0("level", 1:n_new_factors),
#                             prob = c(0.005, 0.7, 0.295),
#                             size = 150,
#                             replace = TRUE), levels = factor_names)
#  
#  # Define class and feature levels: -------------------------------------
#  Ck <- "setosa"
#  level1 <- "level1"
#  level2 <- "level2"
#  level3 <- "level3"
#  
#  # level1 did not show up in the sample, but we know that it
#  # has a 0.5% probability of occurring.
#  table(iris2$new)
#  
#  # Parameter estimation: ------------------------------------------------
#  
#  # ML-estimates
#  ck_sub_sample <- table(iris2$new[iris2$Species == Ck])
#  ck_mle <- ck_sub_sample / sum(ck_sub_sample)
#  
#  # Bayesian estimation via a symmetric Dirichlet prior with concentration parameter 0.5
#  # (this corresponds to Jeffreys' uninformative prior)
#  
#  laplace <- 0.5
#  N1 <- sum(iris2$Species == Ck & iris2$new == level1) + laplace
#  N2 <- sum(iris2$Species == Ck & iris2$new == level2) + laplace
#  N3 <- sum(iris2$Species == Ck & iris2$new == level3) + laplace
#  N <-  sum(iris2$Species == Ck) + laplace * n_new_factors
#  ck_bayes <- c(N1, N2, N3) / N
#  
#  # Compare estimates
#  rbind(ck_mle, ck_bayes)
#  
#  # Unlike the MLE, the Bayesian estimate assigns a positive (though slightly
#  # overestimated) probability to level1, while the estimates for level2 and
#  # level3 are slightly shrunken compared to the MLE.
#  
#  # In general, the higher the value of laplace, the closer the resulting
#  # distribution gets to the uniform distribution.
#  # If laplace were set to infinity, the estimates for level1, level2 and
#  # level3 would all be 1/3.
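#  
#  # A quick sketch of this limiting behaviour: with a very large laplace
#  # value the fitted probabilities should be close to 1/3 each
#  nb_flat <- naive_bayes(Species ~ new, data = iris2, laplace = 1e6)
#  tables(nb_flat, which = "new")[[1]][ ,Ck]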
#  
#  # Comparison with estimates obtained with the naive_bayes function:
#  nb_mle <- naive_bayes(Species ~ new, data = iris2)
#  nb_bayes <- naive_bayes(Species ~ new, data = iris2, laplace = laplace)
#  
#  # MLE
#  rbind(ck_mle,
#        "nb_mle" = tables(nb_mle, which = "new")[[1]][ ,Ck])
#  
#  # Bayes
#  rbind(ck_bayes,
#        "nb_bayes" = tables(nb_bayes, which = "new")[[1]][ ,Ck])
#  
#  
#  # Impact of 0 probabilities on posterior probabilities:
#  new_data <- data.frame(new = c("level1", "level2", "level3"))
#  
#  # The posterior probabilities are NaN due to division by 0 during normalisation
#  predict(nb_mle, new_data, type = "prob", threshold = 0)
#  
#  # By default, this is remedied by replacing zero probabilities
#  # with a small number given by the threshold parameter.
#  # Here this leads to the posterior probabilities being equal to the prior probabilities
#  predict(nb_mle, new_data, type = "prob")
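#  
#  # For comparison, the prior probabilities stored in the fitted model
#  # (a sketch; the posteriors above should coincide with these):
#  nb_mle$prior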

## ----eval = FALSE-------------------------------------------------------------
#  # Prepare data: --------------------------------------------------------
#  data(iris)
#  
#  # Define the feature and class of interest
#  Xi <- "Petal.Width" # Selected feature
#  Ck <- "versicolor"  # Selected class
#  
#  # Build class sub-sample for the selected feature
#  Ck_Xi_subsample <- iris[iris$Species == Ck, Xi]
#  
#  # Maximum-Likelihood Estimation (MLE)
#  mle_norm <- cbind("mean" = mean(Ck_Xi_subsample),
#                    "sd" = sd(Ck_Xi_subsample))
#  
#  # MLE estimates obtained using the naive_bayes function
#  nb_mle <- naive_bayes(x = iris[Xi], y = iris[["Species"]])
#  rbind(mle_norm,
#        "nb_mle" = tables(nb_mle, which = Xi)[[Xi]][ ,Ck])

## ----eval=FALSE---------------------------------------------------------------
#  # Prepare data: --------------------------------------------------------
#  data(iris)
#  
#  # Selected feature
#  Xi <- "Sepal.Width"
#  
#  # Classes
#  C1 <- "setosa"
#  C2 <- "virginica"
#  C3 <- "versicolor"
#  
#  # Build class sub-samples for the selected feature
#  C1_Xi_subsample <- iris[iris$Species == C1, Xi]
#  C2_Xi_subsample <- iris[iris$Species == C2, Xi]
#  C3_Xi_subsample <- iris[iris$Species == C3, Xi]
#  
#  # Estimate class conditional densities for the selected feature
#  dens1 <- density(C1_Xi_subsample)
#  dens2 <- density(C2_Xi_subsample)
#  dens3 <- density(C3_Xi_subsample)
#  
#  # Visualisation: -------------------------------------------------------
#  plot(dens1, main = "", col = "blue", xlim = c(1.5, 5), ylim = c(0, 1.4))
#  lines(dens2, main = "", col = "red")
#  lines(dens3, main = "", col = "black")
#  legend("topleft", legend = c(C1, C2, C3),
#         col = c("blue", "red", "black"),
#         lty = 1, bty = "n")
#  
#  # Compare to the naive_bayes: ------------------------------------------
#  nb_kde <- naive_bayes(x = iris[Xi], y = iris[["Species"]], usekernel = TRUE)
#  plot(nb_kde, prob = "conditional")
#  
#  dens3
#  nb_kde$tables[[Xi]][[C3]]
#  tables(nb_kde, Xi)[[1]][[C3]]
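#  
#  # The objects printed above should describe the same kernel density
#  # estimate; a direct numerical check (a sketch):
#  all.equal(dens3$y, nb_kde$tables[[Xi]][[C3]]$y)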
#  
#  
#  # Use custom bandwidth selector: ---------------------------------------
#  ?bw.SJ
#  nb_kde_SJ_bw <- naive_bayes(x = iris[Xi], y = iris[["Species"]],
#                              usekernel = TRUE, bw = "SJ")
#  plot(nb_kde_SJ_bw, prob = "conditional")
#  
#  
#  # Visualize all available kernels: -------------------------------------
#  kernels <- c("gaussian", "epanechnikov", "rectangular", "triangular",
#               "biweight", "cosine", "optcosine")
#  iris3 <- iris
#  iris3$one <- 1
#  
#  sapply(kernels, function (ith_kernel) {
#      nb <- naive_bayes(formula = Species ~ one, data = iris3,
#                        usekernel = TRUE, kernel = ith_kernel)
#      plot(nb, arg.num = list(main = paste0("Kernel: ", ith_kernel),
#                              col = "black"), legend = FALSE)
#      invisible()
#  })

## ----eval = FALSE-------------------------------------------------------------
#  # Simulate data: -------------------------------------------------------
#  cols <- 2
#  rows <- 10
#  set.seed(11)
#  M <- matrix(rpois(rows * cols, lambda = c(0.1,1)), nrow = rows,
#              ncol = cols, dimnames = list(NULL, paste0("Var", seq_len(cols))))
#  y <- factor(sample(paste0("class", LETTERS[1:2]), rows, TRUE))
#  Xi <- M[ ,"Var1", drop = FALSE]
#  
#  # MLE: -----------------------------------------------------------------
#  # Estimate lambdas for each class
#  tapply(Xi, y, mean)
#  
#  # Compare with naive_bayes
#  pnb <- naive_bayes(x = Xi, y = y, usepoisson = TRUE)
#  tables(pnb, 1)
#  
#  # Adding pseudo-counts via the laplace parameter: ----------------------
#  laplace <- 1
#  Xi_pseudo <- Xi
#  Xi_pseudo[y == "classB",][1] <- Xi_pseudo[y == "classB",][1] + laplace
#  Xi_pseudo[y == "classA",][1] <- Xi_pseudo[y == "classA",][1] + laplace
#  
#  # Estimates
#  tapply(Xi_pseudo, y, mean)
#  
#  # Compare with naive_bayes
#  pnb <- naive_bayes(x = Xi, y = y, usepoisson = TRUE, laplace = laplace)
#  tables(pnb, 1)
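#  
#  # As a sketch, the fitted Poisson model can be used for prediction in the
#  # usual way
#  head(predict(pnb, newdata = Xi, type = "prob"))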

## ----eval = FALSE-------------------------------------------------------------
#  # Prepare data for an artificial example: --------------------------------------
#  set.seed(1)
#  cols <- 3       # words
#  rows <- 100     # all documents
#  rows_spam <- 10 # spam documents
#  
#  # Word probabilities given the non-spam class
#  prob_non_spam <- prop.table(runif(cols)) # C_1
#  
#  # Word probabilities given the spam class
#  prob_spam <- prop.table(runif(cols)) # C_2
#  
#  # Simulate counts of words according to the multinomial distributions
#  M1 <- t(rmultinom(rows - rows_spam, size = cols, prob = prob_non_spam))
#  M2 <- t(rmultinom(rows_spam,        size = cols, prob = prob_spam))
#  M <- rbind(M1, M2)
#  colnames(M) <- paste0("word", 1:cols) ; rownames(M) <- paste0("doc", 1:rows)
#  head(M)
#  
#  # Simulate response with spam/no-spam
#  y <- c(rep("non-spam", rows - rows_spam), rep("spam", rows_spam))
#  
#  # Additive smoothing
#  laplace <- 0.5
#  
#  # Estimate the multinomial probabilities p_{ik} (i is word, k is class)
#  # p1 = (p_11, p_21, p_31) (non-spam)
#  # p2 = (p_12, p_22, p_32) (spam)
#  
#  N_1 <- sum(M1)
#  N_i1 <- colSums(M1)
#  p1 <- (N_i1 + laplace) / (N_1 + cols * laplace)
#  
#  N_2 <- sum(M2)
#  N_i2 <- colSums(M2)
#  p2 <- (N_i2 + laplace) / (N_2 + cols * laplace)
#  
#  # Combine the estimated multinomial probabilities for each class
#  cbind("non-spam" = p1, "spam" = p2)
#  
#  # Compare to the multinomial_naive_bayes
#  mnb <- multinomial_naive_bayes(x = M, y = y, laplace = laplace)
#  coef(mnb)
#  # colSums(coef(mnb))
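#  
#  # As a sketch, classification of the simulated documents with the fitted
#  # multinomial model
#  head(predict(mnb, newdata = M, type = "class"))
#  table(predict(mnb, newdata = M), y)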
