R/naiveBayes.R

Defines functions naiveBayes_dynamicPrior naiveBayes_likelihood naiveBayes_train naiveBayes

Documented in naiveBayes naiveBayes_dynamicPrior naiveBayes_likelihood naiveBayes_train

#' Naive Bayes
#'
#' An implementation of a Naive Bayes classifier adapted to autocorrelated time
#' series such as the type of nonlinear vocal phenomena in consecutive audio
#' frames. All predictors must be continuous, and the outcome must be
#' categorical. Cases with missing values are not deleted because the posterior
#' probabilities of each outcome class can be calculated from different
#' combinations of predictors on a case-by-case basis. Two optional
#' modifications of a standard Naive Bayes algorithm can be made: (1)
#' classifications can be "clumped" at the final stage, ensuring that every run
#' or "epoch" of a particular predicted class is at least \code{minLength} steps
#' long, and (2) priors can be continuously adapted based on the likelihood
#' function of the preceding \code{wlPrior} observations if \code{prior =
#' 'dynamic'}.
#'
#' @param formula model formula of the type outcome ~ predictor1 + predictor2 +
#'   ... (no interactions)
#' @param train either the training dataframe or the output of
#'   \code{\link{naiveBayes_train}}. This data is used to calculate
#'   class-specific distributions of the predictors and prior class
#'   probabilities
#' @param test the test dataframe. This data is used to make predictions - that
#'   is, outcome class probabilities given the values of predictors
#' @param prior "flat" = all classes are equally likely a prior, "static" = use
#'   class probabilities in the training dataset, "dynamic" = update prior
#'   probabilities from weighted likelihoods of \code{wlPrior} preceding
#'   observations
#' @param wlPrior the length of a Gaussian window used for updating dynamic
#'   priors
#' @param wlClumper the minimum expected number of observations of the same
#'   class before the class can change
#' @param runBack if TRUE, the dynamic prior is calculated both forward and
#'   backward and averaged (only has an effect f \code{prior = 'dynamic'})
#' @param plot if TRUE, produces diagnostic plots
#' @export
#' @return Returns the \code{test} dataframe with new columns: "pr" = the
#'   predicted class membership, "[class]" = posterior probabilities per class,
#'   "like_[class]" = log-likelihoods, "prior_[class]" = log-priors,
#'   "priorF_[class]" / "priorB_[class]" = forward / backward log-priors per
#'   class.
#'
#' @examples
#' set.seed(151)
#' ## create some fake data
#' df = data.frame(group = rep(c(
#'     rep('A', 150), rep('B', 50), rep('A', 120),
#'     rep('A', 100), rep('B', 30), rep('A', 90)
#'   ), 3))
#' df$group = as.factor(df$group)
#' df$x1 = rnorm(nrow(df), mean = ifelse(df$group == 'A', 3, 6), sd = 2)
#' df$x2 = rnorm(nrow(df), mean = ifelse(df$group == 'A', 2, -1), sd = 2)
#' boxplot(x1 ~ group, df)
#' boxplot(x2 ~ group, df)
#'
#' ## train the classifier
#' mod_train = naiveBayes_train(group ~ x1 + x2, data = df)
#' mod_train
#'
#' ## test on new data generated by the same process
#' test = data.frame(group = rep(c(
#'   rep('A', 90), rep('B', 40), rep('A', 150),
#'   rep('B', 40), rep('A', 130), rep('B', 30)
#' ), 2))
#' test$group = as.factor(test$group)
#' test$x1 = rnorm(nrow(test), mean = ifelse(test$group == 'A', 3, 6), sd = 2)
#' test$x2 = rnorm(nrow(test), mean = ifelse(test$group == 'A', 2, -1), sd = 2)
#'
#' # flat priors (same prior probability for each class)
#' nb_flat = naiveBayes(group ~ x1 + x2, train = mod_train, test = test,
#'   prior = 'flat', plot = TRUE)
#' # same as passing 'train' directly to the model, w/o calling naiveBayes_train():
#' nb_flat = naiveBayes(group ~ x1 + x2, train = df, test = test, prior = 'flat')
#' table(nb_flat$group, nb_flat$pr)
#' mean(nb_flat$group == nb_flat$pr) # 84% correct
#'
#' # static priors (use original class proportions as prior class probabilities)
#' nb_static = naiveBayes(group ~ x1 + x2, train = mod_train, test = test,
#'   prior = 'static', wlClumper = NULL, plot = TRUE)
#' table(nb_static$group, nb_static$pr)
#' mean(nb_static$group == nb_static$pr) # 87% correct
#'
#' # specify custom static priors
#' mod_train2 = mod_train
#' mod_train$table
#' mod_train2$table = list(A = .1, B = .9)  # sum to 1
#' nb_static2 = naiveBayes(group ~ x1 + x2, train = mod_train2, test = test,
#'   prior = 'static', wlClumper = NULL, plot = TRUE)
#' mean(nb_static2$group == nb_static2$pr)  # 61% correct
#'
#' # if we expect autocorrelation, ie class X is more likely a priori if the
#' # last few observations were also likely to be class X, we can use dynamic
#' # priors and/or clumper the predicted classes (the latter imposes strong
#' # constraints on the predictions, but may be worth it if the data is known to
#' # be strongly "clumpered", ie if we know classes occur in long'ish runs)
#' nb1 = naiveBayes(group ~ x1 + x2, train = mod_train, test = test,
#'   prior = 'dynamic', wlPrior = 10, plot = TRUE)
#' table(nb1$group, nb1$pr)
#' mean(nb1$group == nb1$pr) # 94% correct
#'
#' nb2 = naiveBayes(group ~ x1 + x2, train = mod_train, test = test,
#'   prior = 'static', wlClumper = 10, plot = TRUE)
#' table(nb2$group, nb2$pr)
#' mean(nb2$group == nb2$pr) # 89% correct
#'
#' nb3 = naiveBayes(group ~ x1 + x2, train = mod_train, test = test,
#'   prior = 'dynamic', wlPrior = 10, wlClumper = 10, plot = TRUE)
#' table(nb3$group, nb3$pr)
#' mean(nb3$group == nb3$pr) # 98% correct
#'
#' # NAs in the data are not a problem
#' test1 = test
#' test1$x1[sample(1:nrow(test1), 100)] = NA
#' test1$x2[sample(1:nrow(test1), 10)] = NA
#' summary(test1)
#'
#' nb4 = naiveBayes(group ~ x1 + x2, train = mod_train, test = test,
#'   prior = 'dynamic', wlPrior = 10, plot = TRUE)
#' table(nb4$group, nb4$pr)
#' mean(nb4$group == nb4$pr)  # still 94% correct
naiveBayes = function(
    formula,
    train,
    test = train,
    prior = c('flat', 'static', 'dynamic')[2],
    wlPrior = 3,
    wlClumper = NULL,
    runBack = TRUE,
    plot = FALSE) {
  # parse model formula
  mf = as.formula(as.list(environment())$formula)
  vars = attr(terms(mf), 'variables')
  vars_str = as.character(vars)[-1]
  outcome = vars_str[1]
  predictors = vars_str[2:length(vars_str)]
  if (any(!predictors %in% colnames(test))) {
    missing_preds = predictors[which(!predictors %in% colnames(test))]
    warning(paste0('some predictor variables are missing in test: ',
                   paste0(missing_preds, collapse = ', ')))
    predictors = predictors[which(predictors %in% colnames(test))]
  }

  # check whether train is a dataset or the output of naiveBayes_train()
  if (!is.list(train) || !is.list(train[[1]]) || is.null(train$table)) {
    mod_train = naiveBayes_train(formula = formula, data = train)
  } else {
    mod_train = train
  }
  if (any(!predictors %in% names(mod_train))) {
    missing_preds = predictors[which(!predictors %in% names(mod_train))]
    warning(paste0('some predictor variables are missing in train: ',
                   paste0(missing_preds, collapse = ', ')))
    predictors = predictors[which(predictors %in% names(mod_train))]
  }

  nPredictors = length(predictors)
  classes = names(mod_train$table)
  nClasses = length(classes)
  nObs = nrow(test)
  class_names = paste0(outcome, classes)
  like_names = paste0('like_', outcome, classes)
  prior_names = paste0('prior_', outcome, classes)
  priorF_names = paste0('priorF_', outcome, classes)
  priorB_names = paste0('priorB_', outcome, classes)

  # get likelihoods
  test[, c(class_names, like_names, prior_names, priorF_names, priorB_names)] = NA
  test[, like_names] = naiveBayes_likelihood(
    d = test, nObs = nObs, mod_train = mod_train,
    class_names = class_names, nClasses = nClasses, like_names = like_names,
    predictors = predictors, nPredictors = nPredictors)

  ## priors
  if (prior == 'dynamic' && is.null(wlPrior) ||
      !is.finite(wlPrior) || wlPrior < 1) prior = 'static'
  if (prior == 'flat') {
    # use only the likelihood - every class is considered equally common
    test[, prior_names] = log(1 / nClasses)
  } else if (prior == 'static') {
    # use class proportions in the training set as priors
    cat_props = log(as.numeric(mod_train$table))
    test[, prior_names] = matrix(cat_props, nrow = nObs, ncol = nClasses, byrow = TRUE)
  } else if (prior == 'dynamic') {
    # forward
    test[, priorF_names] = naiveBayes_dynamicPrior(
      d = test, nObs = nObs, mod_train = mod_train, wl = wlPrior,
      class_names = class_names, nClasses = nClasses, like_names = like_names,
      prior_names = prior_names, predictors = predictors, nPredictors = nPredictors)

    if (runBack) {
      # backward
      myseq = nObs:1
      priorB = naiveBayes_dynamicPrior(
        d = test[myseq, ], nObs = nObs, mod_train = mod_train, wl = wlPrior,
        class_names = class_names, nClasses = nClasses, like_names = like_names,
        prior_names = prior_names, predictors = predictors, nPredictors = nPredictors)
      test[, priorB_names] = priorB[myseq, ]

      # average the forward and backward priors
      test[, prior_names] = (test[, priorF_names] + test[, priorB_names]) / 2
    } else {
      test[, prior_names] = test[, priorF_names]
    }
  }

  ## posterior
  test[, class_names] = exp(test[, like_names] + test[, prior_names])
  test[, class_names] = test[, class_names] /
    rowSums(test[, class_names], na.rm = TRUE)

  # predict the class
  idx = as.numeric(apply(test[, class_names], 1, which.max))
  test$pr = as.factor(classes[idx])

  if (!is.null(wlClumper) && is.finite(wlClumper) && wlClumper > 1) {
    test$pr = clumper(test$pr, minLength = wlClumper)
  }

  # from log to prob [0, 1] (just to return a neat table and for plotting)
  test[, like_names] = exp(test[, like_names])
  test[, like_names] = test[, like_names] / rowSums(test[, like_names], na.rm = TRUE)

  test[, prior_names] = exp(test[, prior_names])
  test[, prior_names] = test[, prior_names] / rowSums(test[, prior_names], na.rm = TRUE)

  test[, priorF_names] = exp(test[, priorF_names])
  test[, priorF_names] = test[, priorF_names] / rowSums(test[, priorF_names], na.rm = TRUE)

  test[, priorB_names] = exp(test[, priorB_names])
  test[, priorB_names] = test[, priorB_names] / rowSums(test[, priorB_names], na.rm = TRUE)

  # plot
  if (plot) {
    par(mfrow = c(2, 2))
    # likelihood
    plot(as.numeric(test[, outcome]) - 1, type = 'l', main = 'Likelihood',
         xlab = '', ylab = '')
    points(1 - test[, like_names[1]], type = 'l', col = 'blue')

    # priors
    plot(as.numeric(test$group) - 1, type = 'l', main = 'Priors',
         xlab = '', ylab = '')
    points(1 - test[, prior_names[1]], type = 'l', col = 'green')
    points(1 - test[, priorF_names[1]], type = 'l', col = 'blue')
    points(1 - test[, priorB_names[1]], type = 'l', col = 'red')

    # posterior
    plot(as.numeric(test$group) - 1, type = 'l', main = 'Posterior',
         xlab = '', ylab = '')
    points(1 - test[, class_names[1]], type = 'l', col = 'blue')

    # prediction
    plot(as.numeric(test[, outcome]) - 1, type = 'l', main = 'Prediction',
         xlab = '', ylab = '')
    points(as.numeric(test$pr) - 1, type = 'l', lty = 2, col = 'blue')
    par(mfrow = c(1, 1))
  }
  return(test)
}


#' Train a naive Bayes classifier
#'
#' Returns conditional means and standard deviations per class as well as a
#' table with the global proportions of each class in the dataset. This is
#' mostly useful because the output can be passed on to \code{\link{naiveBayes}}
#' to save time if naiveBayes() is called in a loop with the same training
#' dataset.
#'
#' @param formula outcome ~ predictor1 + predictor1 + ...
#' @param data training dataset
#' @export
naiveBayes_train = function(formula, data) {
  # parse model formula
  mf = as.formula(as.list(environment())$formula)
  vars = attr(terms(mf), 'variables')
  vars_str = as.character(vars)[-1]
  outcome = vars_str[1]
  predictors = vars_str[2:length(vars_str)]

  nPredictors = length(predictors)
  out = vector('list', nPredictors)
  names(out) = predictors

  for (p in predictors) {
    # skip columns with only NAs
    if (any(!is.na(data[, p]))) {
      # class-conditional distributions of each predictor
      ag_p = as.data.frame(aggregate(data[, p] ~ data[, outcome], FUN = mean))
      colnames(ag_p) = c('outcome', 'mean')
      ag_p$sd = aggregate(data[, p] ~ data[, outcome], FUN = sd)[, 2]
      out[[p]] = ag_p
    }
  }
  out$table = table(data[, outcome])
  out$table = as.list(out$table / sum(out$table))
  return(out)
}


#' Naive Bayes likelihood
#'
#' Internal soundgen function
#'
#' A Helper function called by \code{\link{naiveBayes}} to calculate the
#' likelihood of each observation. Algorithm: for each predictor and class, the
#' likelihood is dnorm(observation, mean_per_class, sd_per_class). I tried
#' non-Gaussian probability distributions (Student's t to accommodate outliers),
#' but Gaussian actually seems to be more robust.
#'
#' @param d dataframe containing the observations
#' @param nObs the number of observations
#' @param mod_train the output of naiveBayes_train()
#' @param class_names names of outcome classes
#' @param nClasses the number of outcome classes
#' @param like_names the names of variables holding likelihoods
#' @param predictors the names of predictor variables
#' @param nPredictors the number of predicto variables
#' @keywords internal
naiveBayes_likelihood = function(d,
                                 nObs = nrow(d),
                                 mod_train,
                                 class_names,
                                 nClasses = length(class_names),
                                 like_names,
                                 predictors,
                                 nPredictors = length(predictors)) {
  for (c in 1:nClasses) {
    class_name = class_names[c]
    like_name = like_names[c]
    likes = matrix(NA, nrow = nObs, ncol = nPredictors)
    for (p in 1:nPredictors) {
      pred_p = predictors[p]
      mean_p_c = mod_train[[pred_p]]$mean[c]
      sd_p_c = mod_train[[pred_p]]$sd[c]
      temp = suppressWarnings(try(
        log(dnorm(as.numeric(d[, pred_p]), mean = mean_p_c, sd = sd_p_c)),
        silent = TRUE))
      if (!inherits(temp, 'try-error')) likes[, p] = temp
    }
    d[, like_name] = apply(likes, 1, sum, na.rm = TRUE)
  }
  # d[, like_names] = d[, like_names] / rowSums(d[, like_names], na.rm = TRUE)
  return(d[, like_names])
}


#' Naive Bayes dynamic prior
#'
#' Internal soundgen function
#'
#' A Helper function called by \code{\link{naiveBayes}} to calculate dynamic
#' priors. Algorithm: average the likelihoods of wl preceding observations
#' weighted by a Gaussian function, so more recent observations have more
#' weight.
#' @inheritParams naiveBayes_likelihood
#' @param wl window length, points
#' @param prior_names the names of prior variables
#' @keywords internal
naiveBayes_dynamicPrior = function(d,
                                   nObs = nrow(d),
                                   mod_train,
                                   wl,
                                   class_names,
                                   nClasses = length(class_names),
                                   like_names,
                                   prior_names,
                                   predictors,
                                   nPredictors = length(predictors)) {
  # set up a gaussian window which sums to 1
  if (wl > 2) {
    n1 = wl - 1
    e12 = exp(-12)
    win = (exp(-12 * (((0:n1)/n1) - 0.5)^2) - e12)/(1 - e12)
  } else {
    win = rep(1, wl)
  }

  # the first observation is just assigned the global prior
  d[1, prior_names] = log(as.numeric(mod_train$table))

  # then we start using the likelihood over some preceding window to update the
  # prior for each successive point
  for (c in 1:nClasses) {
    like_c = d[, like_names[c]]
    table_c = log(mod_train$table[[c]])
    myseq = 2:nObs
    d[myseq, prior_names[c]] = apply(as.matrix(myseq), 1, function(i) {
      # absolute indices of frame in win
      prior_win = max(1, i - wl) : (i - 1)
      # windowing function over wl normalized to sum to 1
      win_i = win[(wl - length(prior_win) + 1):wl]
      win_i = win_i / sum(win_i)

      # weighted sum of the likelihood of class c over win_i
      weighted_sum = sum(win_i * like_c[prior_win])
      # add weighted_sum to the static prior of class c, using
      # logistic(logit(x+y)) to avoid ever exceeding 1
      # logistic(sum(logit(c(table_c, weighted_sum))))
      table_c + weighted_sum
    })
  }
  # d[, prior_names] = log(d[, prior_names]) #  / rowSums(d[, prior_names])
  return(d[, prior_names])
}

Try the soundgen package in your browser

Any scripts or data that you put into this service are public.

soundgen documentation built on Sept. 12, 2024, 6:29 a.m.