R/trainNB.R


#' Train Naive Bayes
#'
#' Trains a multiclass Naive Bayes classifier.
#' @param coding Numeric vector of training document codings
#' @param train_matrix A \code{quanteda} document-feature matrix with the number of rows equal to the length of \code{coding}
#' @param smoothing Type of Laplacian smoothing for term priors: one of
#'   'normalized' (the default), 'simple', 'parameterized', or 'none'. See 'Details'.
#' @param alpha Smoothing hyperparameter for 'parameterized' smoothing
#' @param beta Smoothing hyperparameter for 'parameterized' smoothing
#' @param custom.class.priors Optional named numeric vector of prior class
#'   probabilities, one entry per class, named by class label. If \code{NULL}
#'   (the default), class priors are estimated from class frequencies in the
#'   training data.
#'
#' @return A list with the elements
#' \item{w_0c}{Constant portion of NB classification probabilities.}
#' \item{w_jc}{Portion of NB classification probabilities that varies with test document word appearances.}
#' \item{nc}{Frequency of each category in training documents (named numeric vector).}
#' \item{theta_c}{Prior class probabilities (named numeric vector): training-set
#'   class frequencies, or \code{custom.class.priors} if supplied.}
#'
#' @details
#' Smoothing method defaults to 'normalized' using the system advocated by
#' Frank and Bouckaert (2006) for per-class word vector normalization.
#'
#' Using 'simple' will employ the simple version of Laplacian smoothing described
#' in Metsis et al. (2006). The prior probability of term appearance, given a
#' class, is the frequency of the term in the class plus one, divided by the
#' count of documents in the class plus two.
#'
#' Using 'parameterized' will use a version of smoothing mentioned in O'Neil &
#' Schutt (2013) for multiclass Naive Bayes. The prior probability of term
#' appearance, given a class, is the frequency of the term in the class plus
#' \code{alpha} minus one, divided by the count of documents in the class plus
#' \code{alpha} plus \code{beta} minus two.
#'
#' Using 'none' is inadvisable. In this case, the prior probability of term
#' appearance, given a class, is the frequency of the term in the class divided
#' by the count of documents in the class. This will likely generate zero
#' priors, which yield infinite log odds downstream.
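#'
#' For reference, the estimators as implemented are as follows. Let \eqn{n_{jc}}
#' be the number of training documents of class \eqn{c} containing term \eqn{j}
#' (document-feature counts are binarized before training), \eqn{n_c} the number
#' of training documents in class \eqn{c}, \eqn{J} the total number of terms,
#' and \eqn{N_c = \sum_j n_{jc}}. Then 'normalized' uses
#' \deqn{\theta_{jc} = (1 + n_{jc}) / (J + N_c - n_{jc}),}{theta_jc = (1 + n_jc) / (J + N_c - n_jc),}
#' 'simple' uses
#' \deqn{\theta_{jc} = (n_{jc} + 1) / (n_c + 2),}{theta_jc = (n_jc + 1) / (n_c + 2),}
#' 'parameterized' uses
#' \deqn{\theta_{jc} = (n_{jc} + \alpha - 1) / (n_c + \alpha + \beta - 2),}{theta_jc = (n_jc + alpha - 1) / (n_c + alpha + beta - 2),}
#' and 'none' uses
#' \deqn{\theta_{jc} = n_{jc} / n_c.}{theta_jc = n_jc / n_c.}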
#'
#' @author Matt W. Loftis
#'
#' @references
#' Frank, E. and Bouckaert, R.R. (2006) Naive Bayes for Text Classification with
#' Unbalanced Classes. Knowledge Discovery in Databases: PKDD 2006, 503-510.
#'
#' Metsis, V., Androutsopoulos, I., and Paliouras, G. (2006) Spam Filtering with
#' Naive Bayes -- Which Naive Bayes? CEAS 2006 - Third Conference on Email and
#' Anti-Spam, July 27-28, 2006, Mountain View, California USA.
#'
#' O'Neil, C. and Schutt, R. (2013) Doing Data Science: Straight Talk from the
#' Frontline. O'Reilly.
#'
#' @examples
#' ## Load data and create a document-feature matrix
#' train_corpus <- quanteda::corpus(x = training_agendas$text)
#' train_tokens <- quanteda::tokens(train_corpus, remove_numbers = FALSE)
#' train_matrix <- quanteda::dfm(train_tokens)
#' train_matrix <- quanteda::dfm_wordstem(train_matrix, language = "danish")
#'
#' est <- trainNB(training_agendas$coding, train_matrix)
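#'
#' ## Illustrative sketches of the other smoothing options (same data):
#' est_simple <- trainNB(training_agendas$coding, train_matrix,
#'                       smoothing = "simple")
#' est_param <- trainNB(training_agendas$coding, train_matrix,
#'                      smoothing = "parameterized", alpha = 2, beta = 10)
#'
#' ## Custom class priors: a named numeric vector with one entry per class,
#' ## here a hypothetical uniform prior over the observed classes
#' classes <- sort(unique(training_agendas$coding))
#' flat_priors <- stats::setNames(rep(1 / length(classes), length(classes)),
#'                                classes)
#' est_flat <- trainNB(training_agendas$coding, train_matrix,
#'                     custom.class.priors = flat_priors)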
#'
#' @rdname trainNB
#' @import Matrix quanteda
#' @export


trainNB <- function(coding,
                    train_matrix,
                    smoothing = c("normalized", "simple",
                                  "parameterized", "none"),
                    alpha = 2,
                    beta = 10,
                    custom.class.priors = NULL) { ##TRAIN CLASSIFIER
  smoothing <- match.arg(smoothing)

  ##Error catching and warnings
  if (length(coding) != nrow(train_matrix)) {
    stop('Number of codings does not equal number of documents in training document-feature matrix')
  }
  if (any(is.na(coding))){
    warning('Missing values present in coding. Removed observations with missing coding.')
    train_matrix <- train_matrix[!is.na(coding), ]
    coding <- coding[!is.na(coding)]
  }
  if (!quanteda::is.dfm(train_matrix)) stop('Must supply a quanteda dfm as train_matrix.')
  if (!is.numeric(coding)) stop('Coding is not numeric. agendacodeR currently requires numeric codings.')

  ##Preliminary items
  if (any(train_matrix@x > 1)) train_matrix@x[train_matrix@x > 1] <- 1 #binarize dfm: term presence/absence only
  c <- length(unique(coding)) #total categories (1 x 1)
  nc <- as.vector(table(coding)) #number of training obs per category (c x 1)
  names(nc) <- names(table(coding)) #naming nc vector with category names

  ##Calculate or accept simple class priors
  if (!is.null(custom.class.priors)) {
    if(!is.vector(custom.class.priors)) stop('Custom class priors must be a vector')
    if(!is.numeric(custom.class.priors)) stop('Custom class priors must be numeric')
    if(length(custom.class.priors) != length(nc)) stop('Incorrect number of custom class priors')
    if(is.null(names(custom.class.priors)) || any(names(custom.class.priors) != names(nc))) stop('Custom class priors must be named to match training codings')
    theta_c <- custom.class.priors
  } else {
    theta_c <- nc / nrow(train_matrix) #simple prior probs of categories (c x 1)
  }
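  ## e.g., coding = c(3, 3, 1, 2) gives nc = c(`1` = 1, `2` = 1, `3` = 2) and,
  ## absent custom priors, theta_c = c(`1` = 0.25, `2` = 0.25, `3` = 0.5)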

  ##Reordering these vectors to deal with the reference category problem
  ##If the reference category is the least common category, predictive accuracy is better
  nc <- nc[order(theta_c, decreasing = TRUE)]
  theta_c <- theta_c[order(theta_c, decreasing = TRUE)] #sorting nc and theta_c by theta_c

  ##Calculating priors over terms/categories
  njc <- matrix(NA, nrow = c, ncol = ncol(train_matrix)) #word frequencies by category (c x j)
  rownames(njc) <- names(nc) #apply category names and term names to dimensions
  colnames(njc) <- colnames(train_matrix) #apply category names and term names to dimensions
  for (cat in 1:c){ #loop over categories to count term appearances
    idx <- which(coding == rownames(njc)[cat]) #training docs in this category
    if (length(idx) > 1) {
      njc[cat, ] <- Matrix::colSums(train_matrix[idx, ])
    } else {
      njc[cat, ] <- as.vector(train_matrix[idx, ])
    }
  }
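  ## Because the dfm was binarized above, njc[cat, j] is the number of training
  ## documents in category cat that contain term j at least once.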

  ##Setting prior prob of words in docs by categories in the training set ################
  #NB: Priors set wrt total terms use in category proposed in Frank and Bouckaert 2006
  # if (smoothing == "normalized-corrected") {
  #   j <- ncol(train_matrix) #total number of terms (1 x 1)
  #   n_notj_c <- matrix(rep(apply(njc, 1, sum), ncol(njc)),
  #                      ncol = ncol(njc), nrow = nrow(njc)) - njc #freq of all j in c minus freq of j in c (c x j)
  #   # i.e. frequency of all NOT j in c
  #
  #   theta_jc <- 1 + (njc / n_notj_c)
  # }

  if (smoothing == "normalized") {
    j <- ncol(train_matrix) #total number of terms (1 x 1)
    n_notj_c <- matrix(rep(apply(njc, 1, sum), ncol(njc)),
                       ncol = ncol(njc), nrow = nrow(njc)) - njc #freq of all j in c minus freq of j in c (c x j)
    # i.e. frequency of all NOT j in c

    theta_jc <- (1 + njc) / (j + n_notj_c)
    # theta_jc[theta_jc > 1] <- .999999999999 #ceiling on odds greater than one (this was an unnecessary fix)
    if (any(theta_jc > 1)) stop("Some theta_jc greater than 1. 'normalized' smoothing is unsuitable for these data; try 'simple' or 'parameterized' smoothing.")
  }

  #NB: Laplacian priors proposed in Metsis et al 2006
  if (smoothing == "simple"){
    theta_jc <- (njc + 1) / (nc + 10)
  }

  #NB: With smoothing hyperparameters proposed in O'Neil & Schutt 2013
  if (smoothing == "parameterized"){
    theta_jc <- (njc + alpha - 1) / (nc + alpha + beta - 2)
  }

  #NB: This is inadvisable. In fact, don't do it. You'll almost certainly get zero priors.
  if (smoothing == "none"){
    theta_jc <- (njc) / (nc)
  }
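  ## In the 'simple', 'parameterized', and 'none' branches, the length-c vector
  ## nc recycles down the rows of the c x j matrix njc, so each row cat is
  ## divided by the document count of category cat.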

  ########################################################################################
  ## Main training steps
  foo <- log( (1 - t(theta_jc[-c, ]))/(1 - theta_jc[c, ]) ) #intermediate step to build baseline log odds of each category
  if (c == 2) { # transpose foo and add colnames when number of categories is 2
    foo <- t(foo)
    colnames(foo) <- rownames(theta_jc)[1]
  }
  foo <- apply(foo, 2, sum) #intermediate step to build baseline log odds of each category
  w_0c <- foo + log(theta_c[-c] / theta_c[c]) #baseline log odds of categories (invariant to words in vector x_i)
  w_jc <- log((theta_jc[-c, ] * (1 - theta_jc[c, ]) / (theta_jc[c, ] * (1 - theta_jc[-c, ])))) #variable portion of log odds, depending on words in vector x_i
  if (c == 2) { # convert w_jc from a vector into a matrix when number of categories is 2
    w_jc <- t(as.matrix(w_jc))
    rownames(w_jc) <- rownames(theta_jc)[1]
  }
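  ## At prediction time, the log odds of category k versus the reference
  ## category (the last, least-common one) for a binarized test-document
  ## vector x_i are w_0c[k] + sum_j w_jc[k, j] * x_i[j].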

  return( list( w_0c = w_0c, w_jc = w_jc, nc = nc, theta_c = theta_c ) ) #return stuff to make forecasts on new data
}