## ----eval=FALSE---------------------------------------------------------------
# install.packages("naivebayes")
## ----eval=FALSE---------------------------------------------------------------
# install.packages("path_to_tar.gz", repos = NULL, type = "source")
## ----eval=FALSE---------------------------------------------------------------
# library(naivebayes)
## ----fig.width=6,fig.height=6/1.618-------------------------------------------
# Section: General usage - Training with formula interface
library(naivebayes)
# Simulate data
n <- 100
set.seed(1)
data <- data.frame(class = sample(c("classA", "classB"), n, TRUE),
                   bern = sample(LETTERS[1:2], n, TRUE),
                   cat = sample(letters[1:3], n, TRUE),
                   logical = sample(c(TRUE, FALSE), n, TRUE),
                   norm = rnorm(n),
                   count = rpois(n, lambda = c(5, 15)))
# Split data into train and test sets
train <- data[1:95, ]
test <- data[96:100, -1]
# General usage via formula interface
nb <- naive_bayes(class ~ ., train, usepoisson = TRUE)
# Show summary of the model
summary(nb)
# Classification
predict(nb, test, type = "class") # nb %class% test
# Posterior probabilities
predict(nb, test, type = "prob") # nb %prob% test
# Tabular and visual summaries of fitted distributions for a given feature
tables(nb, which = "norm")
plot(nb, which = "norm")
# Get names of assigned class conditional distributions
get_cond_dist(nb)
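# A small follow-up sketch (an addition, not from the original vignette):
# compare the predicted classes with the held-out labels of the test rows
true_labels <- data[96:100, "class"]
mean(predict(nb, test, type = "class") == true_labels)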
## -----------------------------------------------------------------------------
# Section: Model fitting and classification using matrix/data.frame - vector interface
library(naivebayes)
# Simulate data
n_vars <- 10
n <- 1e6
y <- sample(x = c("a", "b"), size = n, replace = TRUE)
# Discrete features
X1 <- matrix(data = sample(letters[5:9], n * n_vars, TRUE),
             ncol = n_vars)
X1 <- as.data.frame(X1)
# Fit a Naive Bayes model using matrix/data.frame - vector interface
nb_cat <- naive_bayes(x = X1, y = y)
# Show summary of the model
summary(nb_cat)
# Classification
system.time(pred2 <- predict(nb_cat, X1))
head(pred2)
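# Rough sanity check (an addition): y was simulated independently of X1,
# so the in-sample accuracy should hover around 0.5
mean(pred2 == y)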
## ----fig.width=6,fig.height=6/1.618-------------------------------------------
# Section: Model estimation through a specialized fitting function
library(naivebayes)
# Prepare data (the specialized fitting functions strictly require matrix and vector inputs)
data(iris)
M <- as.matrix(iris[, 1:4])
y <- iris$Species
# Train the Gaussian Naive Bayes
gnb <- gaussian_naive_bayes(x = M, y = y)
summary(gnb)
# Parameter estimates
coef(gnb)
coef(gnb)[c(TRUE, FALSE)] # show only means
tables(gnb, 1)
# Visualization of fitted distributions
plot(gnb, which = 1)
# Classification
head(predict(gnb, newdata = M, type = "class")) # head(gnb %class% M)
# Posterior probabilities
head(predict(gnb, newdata = M, type = "prob")) # head(gnb %prob% M)
# Equivalent calculation via naive_bayes
gnb2 <- naive_bayes(M, y)
head(predict(gnb2, newdata = M, type = "prob"))
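# Optional check (an addition): both interfaces should agree up to
# numerical tolerance, since naive_bayes() assigns Gaussian distributions
# to numeric features by default
all.equal(predict(gnb, newdata = M, type = "prob"),
          predict(gnb2, newdata = M, type = "prob"),
          check.attributes = FALSE)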
## ----eval=FALSE---------------------------------------------------------------
# library(naivebayes)
#
# # Prepare data: --------------------------------------------------------
# data(iris)
# iris2 <- iris
# N <- nrow(iris2)
# n_new_factors <- 3
# factor_names <- paste0("level", 1:n_new_factors)
#
# # Add a new categorical feature, where one level is very unlikely
# set.seed(2)
# iris2$new <- factor(sample(paste0("level", 1:n_new_factors),
#                            prob = c(0.005, 0.7, 0.295),
#                            size = 150,
#                            replace = TRUE), levels = factor_names)
#
# # Define class and feature levels: -------------------------------------
# Ck <- "setosa"
# level1 <- "level1"
# level2 <- "level2"
# level3 <- "level3"
#
# # level1 did not show up in the sample, but we know that it
# # has a 0.5% probability of occurring.
# table(iris2$new)
#
# # Parameter estimation: ------------------------------------------------
#
# # ML-estimates
# ck_sub_sample <- table(iris2$new[iris2$Species == Ck])
# ck_mle <- ck_sub_sample / sum(ck_sub_sample)
#
# # Bayesian estimation via a symmetric Dirichlet prior with concentration parameter 0.5
# # (corresponds to Jeffreys' uninformative prior)
#
# laplace <- 0.5
# N1 <- sum(iris2$Species == Ck & iris2$new == level1) + laplace
# N2 <- sum(iris2$Species == Ck & iris2$new == level2) + laplace
# N3 <- sum(iris2$Species == Ck & iris2$new == level3) + laplace
# N <- sum(iris2$Species == Ck) + laplace * n_new_factors
# ck_bayes <- c(N1, N2, N3) / N
#
# # Compare estimates
# rbind(ck_mle, ck_bayes)
#
# # Unlike the MLE, the Bayesian estimate assigns positive probability
# # to level1, although it slightly overestimates it. Compared to the MLE,
# # the estimates for level2 and level3 are slightly shrunk.
#
# # In general, the higher the value of laplace, the more the resulting
# # distribution tends toward the uniform distribution.
# # If laplace were set to infinity, the estimates for
# # level1, level2 and level3 would each be 1/3.
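#
# # Numerical illustration of this limit (an addition to the vignette):
# # with a very large laplace value the estimates become nearly uniform
# big_laplace <- 1e6
# counts <- table(iris2$new[iris2$Species == Ck])
# (counts + big_laplace) / (sum(counts) + big_laplace * n_new_factors)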
#
# # Comparison with estimates obtained with naive_bayes function:
# nb_mle <- naive_bayes(Species ~ new, data = iris2)
# nb_bayes <- naive_bayes(Species ~ new, data = iris2, laplace = laplace)
#
# # MLE
# rbind(ck_mle,
#       "nb_mle" = tables(nb_mle, which = "new")[[1]][ , Ck])
#
# # Bayes
# rbind(ck_bayes,
#       "nb_bayes" = tables(nb_bayes, which = "new")[[1]][ , Ck])
#
#
# # Impact of 0 probabilities on posterior probabilities:
# new_data <- data.frame(new = c("level1", "level2", "level3"))
#
# # The posterior probabilities are NaN due to division by 0 during normalisation
# predict(nb_mle, new_data, type = "prob", threshold = 0)
#
# # By default, this is remediated by replacing zero probabilities
# # with a small number given by the threshold argument.
# # As a result, the posterior probabilities equal the prior probabilities
# predict(nb_mle, new_data, type = "prob")
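#
# # Sketch (an addition): the replacement value can also be set explicitly
# # via the threshold argument used above
# predict(nb_mle, new_data, type = "prob", threshold = 0.01)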
## ----eval = FALSE-------------------------------------------------------------
# # Prepare data: --------------------------------------------------------
# data(iris)
#
# # Define the feature and class of interest
# Xi <- "Petal.Width" # Selected feature
# Ck <- "versicolor" # Selected class
#
# # Build class sub-sample for the selected feature
# Ck_Xi_subsample <- iris[iris$Species == Ck, Xi]
#
# # Maximum-Likelihood Estimation (MLE)
# mle_norm <- cbind("mean" = mean(Ck_Xi_subsample),
#                   "sd" = sd(Ck_Xi_subsample))
#
# # MLE estimates obtained using the naive_bayes function
# nb_mle <- naive_bayes(x = iris[Xi], y = iris[["Species"]])
# rbind(mle_norm,
#       "nb_mle" = tables(nb_mle, which = Xi)[[Xi]][ , Ck])
## ----eval=FALSE---------------------------------------------------------------
# # Prepare data: --------------------------------------------------------
# data(iris)
#
# # Selected feature
# Xi <- "Sepal.Width"
#
# # Classes
# C1 <- "setosa"
# C2 <- "virginica"
# C3 <- "versicolor"
#
# # Build class sub-samples for the selected feature
# C1_Xi_subsample <- iris[iris$Species == C1, Xi]
# C2_Xi_subsample <- iris[iris$Species == C2, Xi]
# C3_Xi_subsample <- iris[iris$Species == C3, Xi]
#
# # Estimate class conditional densities for the selected feature
# dens1 <- density(C1_Xi_subsample)
# dens2 <- density(C2_Xi_subsample)
# dens3 <- density(C3_Xi_subsample)
#
# # Visualisation: -------------------------------------------------------
# plot(dens1, main = "", col = "blue", xlim = c(1.5, 5), ylim = c(0, 1.4))
# lines(dens2, main = "", col = "red")
# lines(dens3, main = "", col = "black")
# legend("topleft", legend = c(C1, C2, C3),
# col = c("blue", "red", "black"),
# lty = 1, bty = "n")
#
# # Compare to the naive_bayes: ------------------------------------------
# nb_kde <- naive_bayes(x = iris[Xi], y = iris[["Species"]], usekernel = TRUE)
# plot(nb_kde, prob = "conditional")
#
# dens3
# nb_kde$tables[[Xi]][[C3]]
# tables(nb_kde, Xi)[[1]][[C3]]
#
#
# # Use custom bandwidth selector: ---------------------------------------
# ?bw.SJ
# nb_kde_SJ_bw <- naive_bayes(x = iris[Xi], y = iris[["Species"]],
#                             usekernel = TRUE, bw = "SJ")
# plot(nb_kde_SJ_bw, prob = "conditional")
#
#
# # Visualize all available kernels: -------------------------------------
# kernels <- c("gaussian", "epanechnikov", "rectangular", "triangular",
#              "biweight", "cosine", "optcosine")
# iris3 <- iris
# iris3$one <- 1
#
# sapply(kernels, function(ith_kernel) {
#     nb <- naive_bayes(formula = Species ~ one, data = iris3,
#                       usekernel = TRUE, kernel = ith_kernel)
#     plot(nb, arg.num = list(main = paste0("Kernel: ", ith_kernel),
#                             col = "black"), legend = FALSE)
#     invisible()
# })
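#
# # Sketch (an addition, assuming that, as with bw above, extra arguments
# # are forwarded to stats::density): adjust rescales the bandwidth,
# # giving a smoother (adjust > 1) or rougher (adjust < 1) estimate
# nb_kde_smooth <- naive_bayes(x = iris[Xi], y = iris[["Species"]],
#                              usekernel = TRUE, adjust = 2)
# plot(nb_kde_smooth, prob = "conditional")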
## ----eval = FALSE-------------------------------------------------------------
# # Simulate data: -------------------------------------------------------
# cols <- 2
# rows <- 10
# set.seed(11)
# M <- matrix(rpois(rows * cols, lambda = c(0.1, 1)), nrow = rows,
#             ncol = cols, dimnames = list(NULL, paste0("Var", seq_len(cols))))
# y <- factor(sample(paste0("class", LETTERS[1:2]), rows, TRUE))
# Xi <- M[ ,"Var1", drop = FALSE]
#
# # MLE: -----------------------------------------------------------------
# # Estimate lambdas for each class
# tapply(Xi, y, mean)
#
# # Compare with naive_bayes
# pnb <- naive_bayes(x = Xi, y = y, usepoisson = TRUE)
# tables(pnb, 1)
#
# # Adding pseudo-counts via laplace parameter: --------------------------
# laplace <- 1
# Xi_pseudo <- Xi
# Xi_pseudo[y == "classB",][1] <- Xi_pseudo[y == "classB",][1] + laplace
# Xi_pseudo[y == "classA",][1] <- Xi_pseudo[y == "classA",][1] + laplace
#
# # Estimates
# tapply(Xi_pseudo, y, mean)
#
# # Compare with naive_bayes
# pnb <- naive_bayes(x = Xi, y = y, usepoisson = TRUE, laplace = laplace)
# tables(pnb, 1)
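#
# # Sketch (an addition): posterior probabilities from the fitted
# # Poisson Naive Bayes model
# head(predict(pnb, newdata = Xi, type = "prob"))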
## ----eval = FALSE-------------------------------------------------------------
# # Prepare data for an artificial example: --------------------------------------
# set.seed(1)
# cols <- 3 # words
# rows <- 100 # all documents
# rows_spam <- 10 # spam documents
#
# # Word probabilities given the non-spam class
# prob_non_spam <- prop.table(runif(cols)) # C_1
#
# # Word probabilities given the spam class
# prob_spam <- prop.table(runif(cols)) # C_2
#
# # Simulate counts of words according to the multinomial distributions
# M1 <- t(rmultinom(rows - rows_spam, size = cols, prob = prob_non_spam))
# M2 <- t(rmultinom(rows_spam, size = cols, prob = prob_spam))
# M <- rbind(M1, M2)
# colnames(M) <- paste0("word", 1:cols) ; rownames(M) <- paste0("doc", 1:rows)
# head(M)
#
# # Simulate response with spam/no-spam
# y <- c(rep("non-spam", rows - rows_spam), rep("spam", rows_spam))
#
# # Additive smoothing
# laplace <- 0.5
#
# # Estimate the multinomial probabilities p_{ik} (i is word, k is class)
# # p1 = (p_11, p_21, p_31) (non-spam)
# # p2 = (p_12, p_22, p_32) (spam)
#
# N_1 <- sum(M1)
# N_i1 <- colSums(M1)
# p1 <- (N_i1 + laplace) / (N_1 + cols * laplace)
#
# N_2 <- sum(M2)
# N_i2 <- colSums(M2)
# p2 <- (N_i2 + laplace) / (N_2 + cols * laplace)
#
# # Combine the estimated multinomial probabilities for each class
# cbind("non-spam" = p1, "spam" = p2)
#
# # Compare to the multinomial_naive_bayes
# mnb <- multinomial_naive_bayes(x = M, y = y, laplace = laplace)
# coef(mnb)
# # colSums(coef(mnb))
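#
# # Sketch (an addition): uncommenting colSums(coef(mnb)) above verifies that
# # each class column sums to 1; the fitted model can also classify
# # the simulated documents directly
# head(predict(mnb, newdata = M, type = "class"))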