#' Naive Bayes
#'
#' An implementation of a Naive Bayes classifier adapted to autocorrelated time
#' series such as the type of nonlinear vocal phenomena in consecutive audio
#' frames. All predictors must be continuous, and the outcome must be
#' categorical. Cases with missing values are not deleted because the posterior
#' probabilities of each outcome class can be calculated from different
#' combinations of predictors on a case-by-case basis. Two optional
#' modifications of a standard Naive Bayes algorithm can be made: (1)
#' classifications can be "clumped" at the final stage, ensuring that every run
#' or "epoch" of a particular predicted class is at least \code{wlClumper} steps
#' long, and (2) priors can be continuously adapted based on the likelihood
#' function of the preceding \code{wlPrior} observations if \code{prior =
#' 'dynamic'}.
#'
#' @param formula model formula of the type outcome ~ predictor1 + predictor2 +
#' ... (no interactions)
#' @param train either the training dataframe or the output of
#' \code{\link{naiveBayes_train}}. This data is used to calculate
#' class-specific distributions of the predictors and prior class
#' probabilities
#' @param test the test dataframe. This data is used to make predictions - that
#' is, outcome class probabilities given the values of predictors
#' @param prior "flat" = all classes are equally likely a priori, "static" = use
#' class probabilities in the training dataset, "dynamic" = update prior
#' probabilities from weighted likelihoods of \code{wlPrior} preceding
#' observations
#' @param wlPrior the length of a Gaussian window used for updating dynamic
#' priors
#' @param wlClumper the minimum expected number of observations of the same
#' class before the class can change
#' @param runBack if TRUE, the dynamic prior is calculated both forward and
#' backward and averaged (only has an effect if \code{prior = 'dynamic'})
#' @param plot if TRUE, produces diagnostic plots
#' @export
#' @return Returns the \code{test} dataframe with new columns: "pr" = the
#' predicted class membership, "[outcome][class]" = posterior probabilities per
#' class, "like_[outcome][class]" = likelihoods (normalized per row),
#' "prior_[outcome][class]" = priors, "priorF_[outcome][class]" /
#' "priorB_[outcome][class]" = forward / backward priors per class (only
#' computed if \code{prior = 'dynamic'}).
#'
#' @examples
#' set.seed(151)
#' ## create some fake data
#' df = data.frame(group = rep(c(
#' rep('A', 150), rep('B', 50), rep('A', 120),
#' rep('A', 100), rep('B', 30), rep('A', 90)
#' ), 3))
#' df$group = as.factor(df$group)
#' df$x1 = rnorm(nrow(df), mean = ifelse(df$group == 'A', 3, 6), sd = 2)
#' df$x2 = rnorm(nrow(df), mean = ifelse(df$group == 'A', 2, -1), sd = 2)
#' boxplot(x1 ~ group, df)
#' boxplot(x2 ~ group, df)
#'
#' ## train the classifier
#' mod_train = naiveBayes_train(group ~ x1 + x2, data = df)
#' mod_train
#'
#' ## test on new data generated by the same process
#' test = data.frame(group = rep(c(
#' rep('A', 90), rep('B', 40), rep('A', 150),
#' rep('B', 40), rep('A', 130), rep('B', 30)
#' ), 2))
#' test$group = as.factor(test$group)
#' test$x1 = rnorm(nrow(test), mean = ifelse(test$group == 'A', 3, 6), sd = 2)
#' test$x2 = rnorm(nrow(test), mean = ifelse(test$group == 'A', 2, -1), sd = 2)
#'
#' # flat priors (same prior probability for each class)
#' nb_flat = naiveBayes(group ~ x1 + x2, train = mod_train, test = test,
#' prior = 'flat', plot = TRUE)
#' # same as passing 'train' directly to the model, w/o calling naiveBayes_train():
#' nb_flat = naiveBayes(group ~ x1 + x2, train = df, test = test, prior = 'flat')
#' table(nb_flat$group, nb_flat$pr)
#' mean(nb_flat$group == nb_flat$pr) # 84% correct
#'
#' # static priors (use original class proportions as prior class probabilities)
#' nb_static = naiveBayes(group ~ x1 + x2, train = mod_train, test = test,
#' prior = 'static', wlClumper = NULL, plot = TRUE)
#' table(nb_static$group, nb_static$pr)
#' mean(nb_static$group == nb_static$pr) # 87% correct
#'
#' # specify custom static priors
#' mod_train2 = mod_train
#' mod_train$table
#' mod_train2$table = list(A = .1, B = .9) # sum to 1
#' nb_static2 = naiveBayes(group ~ x1 + x2, train = mod_train2, test = test,
#' prior = 'static', wlClumper = NULL, plot = TRUE)
#' mean(nb_static2$group == nb_static2$pr) # 61% correct
#'
#' # if we expect autocorrelation, ie class X is more likely a priori if the
#' # last few observations were also likely to be class X, we can use dynamic
#' # priors and/or clump the predicted classes (the latter imposes strong
#' # constraints on the predictions, but may be worth it if the data is known to
#' # be strongly "clumped", ie if we know classes occur in longish runs)
#' nb1 = naiveBayes(group ~ x1 + x2, train = mod_train, test = test,
#' prior = 'dynamic', wlPrior = 10, plot = TRUE)
#' table(nb1$group, nb1$pr)
#' mean(nb1$group == nb1$pr) # 94% correct
#'
#' nb2 = naiveBayes(group ~ x1 + x2, train = mod_train, test = test,
#' prior = 'static', wlClumper = 10, plot = TRUE)
#' table(nb2$group, nb2$pr)
#' mean(nb2$group == nb2$pr) # 89% correct
#'
#' nb3 = naiveBayes(group ~ x1 + x2, train = mod_train, test = test,
#' prior = 'dynamic', wlPrior = 10, wlClumper = 10, plot = TRUE)
#' table(nb3$group, nb3$pr)
#' mean(nb3$group == nb3$pr) # 98% correct
#'
#' # NAs in the data are not a problem
#' test1 = test
#' test1$x1[sample(1:nrow(test1), 100)] = NA
#' test1$x2[sample(1:nrow(test1), 10)] = NA
#' summary(test1)
#'
#' nb4 = naiveBayes(group ~ x1 + x2, train = mod_train, test = test1,
#' prior = 'dynamic', wlPrior = 10, plot = TRUE)
#' table(nb4$group, nb4$pr)
#' mean(nb4$group == nb4$pr) # still ~94% correct
naiveBayes = function(
formula,
train,
test = train,
prior = c('flat', 'static', 'dynamic')[2],
wlPrior = 3,
wlClumper = NULL,
runBack = TRUE,
plot = FALSE) {
# parse model formula
mf = as.formula(as.list(environment())$formula)
vars = attr(terms(mf), 'variables')
vars_str = as.character(vars)[-1]
outcome = vars_str[1]
predictors = vars_str[2:length(vars_str)]
if (any(!predictors %in% colnames(test))) {
missing_preds = predictors[which(!predictors %in% colnames(test))]
warning(paste0('some predictor variables are missing in test: ',
paste0(missing_preds, collapse = ', ')))
predictors = predictors[which(predictors %in% colnames(test))]
}
# check whether train is a dataset or the output of naiveBayes_train()
if (!is.list(train) || !is.list(train[[1]]) || is.null(train$table)) {
mod_train = naiveBayes_train(formula = formula, data = train)
} else {
mod_train = train
}
if (any(!predictors %in% names(mod_train))) {
missing_preds = predictors[which(!predictors %in% names(mod_train))]
warning(paste0('some predictor variables are missing in train: ',
paste0(missing_preds, collapse = ', ')))
predictors = predictors[which(predictors %in% names(mod_train))]
}
nPredictors = length(predictors)
classes = names(mod_train$table)
nClasses = length(classes)
nObs = nrow(test)
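# names of the new columns that will hold per-class posteriors, likelihoods, and priors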
class_names = paste0(outcome, classes)
like_names = paste0('like_', outcome, classes)
prior_names = paste0('prior_', outcome, classes)
priorF_names = paste0('priorF_', outcome, classes)
priorB_names = paste0('priorB_', outcome, classes)
# get likelihoods
test[, c(class_names, like_names, prior_names, priorF_names, priorB_names)] = NA
test[, like_names] = naiveBayes_likelihood(
d = test, nObs = nObs, mod_train = mod_train,
class_names = class_names, nClasses = nClasses, like_names = like_names,
predictors = predictors, nPredictors = nPredictors)
## priors
if (prior == 'dynamic' &&
(is.null(wlPrior) || !is.finite(wlPrior) || wlPrior < 1)) prior = 'static'
if (prior == 'flat') {
# use only the likelihood - every class is considered equally common
test[, prior_names] = log(1 / nClasses)
} else if (prior == 'static') {
# use class proportions in the training set as priors
cat_props = log(as.numeric(mod_train$table))
test[, prior_names] = matrix(cat_props, nrow = nObs, ncol = nClasses, byrow = TRUE)
} else if (prior == 'dynamic') {
# forward
test[, priorF_names] = naiveBayes_dynamicPrior(
d = test, nObs = nObs, mod_train = mod_train, wl = wlPrior,
class_names = class_names, nClasses = nClasses, like_names = like_names,
prior_names = prior_names, predictors = predictors, nPredictors = nPredictors)
if (runBack) {
# backward
myseq = nObs:1
priorB = naiveBayes_dynamicPrior(
d = test[myseq, ], nObs = nObs, mod_train = mod_train, wl = wlPrior,
class_names = class_names, nClasses = nClasses, like_names = like_names,
prior_names = prior_names, predictors = predictors, nPredictors = nPredictors)
test[, priorB_names] = priorB[myseq, ]
# average the forward and backward priors
test[, prior_names] = (test[, priorF_names] + test[, priorB_names]) / 2
} else {
test[, prior_names] = test[, priorF_names]
}
}
## posterior
test[, class_names] = exp(test[, like_names] + test[, prior_names])
test[, class_names] = test[, class_names] /
rowSums(test[, class_names], na.rm = TRUE)
# predict the class
idx = as.numeric(apply(test[, class_names], 1, which.max))
test$pr = as.factor(classes[idx])
if (!is.null(wlClumper) && is.finite(wlClumper) && wlClumper > 1) {
test$pr = clumper(test$pr, minLength = wlClumper)
}
# from log to prob [0, 1] (just to return a neat table and for plotting)
test[, like_names] = exp(test[, like_names])
test[, like_names] = test[, like_names] / rowSums(test[, like_names], na.rm = TRUE)
test[, prior_names] = exp(test[, prior_names])
test[, prior_names] = test[, prior_names] / rowSums(test[, prior_names], na.rm = TRUE)
test[, priorF_names] = exp(test[, priorF_names])
test[, priorF_names] = test[, priorF_names] / rowSums(test[, priorF_names], na.rm = TRUE)
test[, priorB_names] = exp(test[, priorB_names])
test[, priorB_names] = test[, priorB_names] / rowSums(test[, priorB_names], na.rm = TRUE)
# plot
if (plot) {
par(mfrow = c(2, 2))
# likelihood
plot(as.numeric(test[, outcome]) - 1, type = 'l', main = 'Likelihood',
xlab = '', ylab = '')
points(1 - test[, like_names[1]], type = 'l', col = 'blue')
# priors
plot(as.numeric(test[, outcome]) - 1, type = 'l', main = 'Priors',
xlab = '', ylab = '')
points(1 - test[, prior_names[1]], type = 'l', col = 'green')
points(1 - test[, priorF_names[1]], type = 'l', col = 'blue')
points(1 - test[, priorB_names[1]], type = 'l', col = 'red')
# posterior
plot(as.numeric(test[, outcome]) - 1, type = 'l', main = 'Posterior',
xlab = '', ylab = '')
points(1 - test[, class_names[1]], type = 'l', col = 'blue')
# prediction
plot(as.numeric(test[, outcome]) - 1, type = 'l', main = 'Prediction',
xlab = '', ylab = '')
points(as.numeric(test$pr) - 1, type = 'l', lty = 2, col = 'blue')
par(mfrow = c(1, 1))
}
return(test)
}
#' Train a naive Bayes classifier
#'
#' Returns conditional means and standard deviations per class as well as a
#' table with the global proportions of each class in the dataset. This is
#' mostly useful because the output can be passed on to \code{\link{naiveBayes}}
#' to save time if naiveBayes() is called in a loop with the same training
#' dataset.
#'
#' @param formula outcome ~ predictor1 + predictor2 + ...
#' @param data training dataset
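#' @return Returns a list with one element per predictor - a dataframe with one
#' row per outcome class and columns "outcome", "mean", and "sd" - plus
#' \code{$table}, a named list with the proportion of each class in the
#' training data.
#' @examples
#' ## a minimal sketch with simulated data, just to show the output format
#' df = data.frame(group = factor(rep(c('A', 'B'), each = 100)))
#' df$x1 = rnorm(nrow(df), mean = ifelse(df$group == 'A', 3, 6), sd = 2)
#' mod = naiveBayes_train(group ~ x1, data = df)
#' mod$x1     # class-conditional mean and sd of x1
#' mod$table  # global class proportions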
#' @export
naiveBayes_train = function(formula, data) {
# parse model formula
mf = as.formula(as.list(environment())$formula)
vars = attr(terms(mf), 'variables')
vars_str = as.character(vars)[-1]
outcome = vars_str[1]
predictors = vars_str[2:length(vars_str)]
nPredictors = length(predictors)
out = vector('list', nPredictors)
names(out) = predictors
for (p in predictors) {
# skip columns with only NAs
if (any(!is.na(data[, p]))) {
# class-conditional distributions of each predictor
ag_p = as.data.frame(aggregate(data[, p] ~ data[, outcome], FUN = mean))
colnames(ag_p) = c('outcome', 'mean')
ag_p$sd = aggregate(data[, p] ~ data[, outcome], FUN = sd)[, 2]
out[[p]] = ag_p
}
}
out$table = table(data[, outcome])
out$table = as.list(out$table / sum(out$table))
return(out)
}
#' Naive Bayes likelihood
#'
#' Internal soundgen function
#'
#' A helper function called by \code{\link{naiveBayes}} to calculate the
#' likelihood of each observation. Algorithm: for each predictor and class, the
#' likelihood is dnorm(observation, mean_per_class, sd_per_class). I tried
#' non-Gaussian probability distributions (Student's t to accommodate outliers),
#' but Gaussian actually seems to be more robust.
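#'
#' In the notation of the code, the log-likelihood of class c for observation i
#' is the sum over predictors of \code{log(dnorm(d[i, pred_p], mean_p_c, sd_p_c))},
#' where \code{mean_p_c} and \code{sd_p_c} are the class-specific mean and
#' standard deviation of that predictor estimated from the training data;
#' predictors that are \code{NA} for a given observation are simply dropped
#' from the sum.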
#'
#' @param d dataframe containing the observations
#' @param nObs the number of observations
#' @param mod_train the output of naiveBayes_train()
#' @param class_names names of outcome classes
#' @param nClasses the number of outcome classes
#' @param like_names the names of variables holding likelihoods
#' @param predictors the names of predictor variables
#' @param nPredictors the number of predictor variables
#' @keywords internal
naiveBayes_likelihood = function(d,
nObs = nrow(d),
mod_train,
class_names,
nClasses = length(class_names),
like_names,
predictors,
nPredictors = length(predictors)) {
for (c in 1:nClasses) {
class_name = class_names[c]
like_name = like_names[c]
likes = matrix(NA, nrow = nObs, ncol = nPredictors)
for (p in 1:nPredictors) {
pred_p = predictors[p]
mean_p_c = mod_train[[pred_p]]$mean[c]
sd_p_c = mod_train[[pred_p]]$sd[c]
temp = suppressWarnings(try(
log(dnorm(as.numeric(d[, pred_p]), mean = mean_p_c, sd = sd_p_c)),
silent = TRUE))
if (!inherits(temp, 'try-error')) likes[, p] = temp
}
d[, like_name] = apply(likes, 1, sum, na.rm = TRUE)
}
# d[, like_names] = d[, like_names] / rowSums(d[, like_names], na.rm = TRUE)
return(d[, like_names])
}
#' Naive Bayes dynamic prior
#'
#' Internal soundgen function
#'
#' A helper function called by \code{\link{naiveBayes}} to calculate dynamic
#' priors. Algorithm: take a Gaussian-weighted sum of the log-likelihoods of
#' the \code{wl} preceding observations and add it to the log of the global
#' class proportions from the training data.
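#'
#' In the notation of the code, the (log) dynamic prior of class \code{c} at
#' observation \code{i > 1} is
#' \code{prior_c[i] = log(p_c) + sum(win * like_c[(i - wl):(i - 1)])}, where
#' \code{p_c} is the proportion of class \code{c} in the training data and
#' \code{win} is a Gaussian window renormalized to sum to 1 over each (possibly
#' truncated) window; the first observation receives just the global prior.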
#' @inheritParams naiveBayes_likelihood
#' @param wl window length, points
#' @param prior_names the names of prior variables
#' @keywords internal
naiveBayes_dynamicPrior = function(d,
nObs = nrow(d),
mod_train,
wl,
class_names,
nClasses = length(class_names),
like_names,
prior_names,
predictors,
nPredictors = length(predictors)) {
# set up a Gaussian window (renormalized below to sum to 1 within each window)
if (wl > 2) {
n1 = wl - 1
e12 = exp(-12)
win = (exp(-12 * (((0:n1)/n1) - 0.5)^2) - e12)/(1 - e12)
} else {
win = rep(1, wl)
}
# the first observation is just assigned the global prior
d[1, prior_names] = log(as.numeric(mod_train$table))
# then we start using the likelihood over some preceding window to update the
# prior for each successive point
for (c in 1:nClasses) {
like_c = d[, like_names[c]]
table_c = log(mod_train$table[[c]])
myseq = 2:nObs
d[myseq, prior_names[c]] = apply(as.matrix(myseq), 1, function(i) {
# absolute indices of frame in win
prior_win = max(1, i - wl) : (i - 1)
# windowing function over wl normalized to sum to 1
win_i = win[(wl - length(prior_win) + 1):wl]
win_i = win_i / sum(win_i)
# weighted sum of the likelihood of class c over win_i
weighted_sum = sum(win_i * like_c[prior_win])
# add weighted_sum to the static prior of class c, using
# logistic(logit(x+y)) to avoid ever exceeding 1
# logistic(sum(logit(c(table_c, weighted_sum))))
table_c + weighted_sum
})
}
# d[, prior_names] = log(d[, prior_names]) # / rowSums(d[, prior_names])
return(d[, prior_names])
}