R/NaiveBayes.R

Defines functions NaiveBayes NaiveBayes.default NaiveBayes.formula print.NaiveBayes predict.NaiveBayes

Documented in NaiveBayes

#' Find first non-missing element
#'
#' Given a set of vectors, `coalesce()` finds the first non-missing value
#' at each position. This is inspired by the SQL `COALESCE` function
#' which does the same thing for `NULL`s.
#'
#' @param ... Vectors. All inputs should either be length 1, or the
#'   same length as the first argument.
#'
#'   These dots support [tidy dots][rlang::tidy-dots] features.
#' @return A vector the same length as the first `...` argument with
#'   missing values replaced by the first non-missing value.
#' @seealso [na_if()] to replace specified values with a `NA`.
#' [tidyr::replace_na()] to replace `NA` with a value
#' @export
#' @examples
#' # Use a single value to replace all missing values
#' x <- sample(c(1:5, NA, NA, NA))
#' coalesce(x, 0L)
#'
#' # Or match together a complete vector from missing pieces
#' y <- c(1, 2, NA, NA, 5)
#' z <- c(NA, NA, 3, 4, 5)
#' coalesce(y, z)
#'
#' # Supply lists by splicing them into dots:
#' vecs <- list(
#'   c(1, 2, NA, NA, 5),
#'   c(NA, NA, 3, 4, 5)
#' )
#' coalesce(!!!vecs)
#' @import Rcpp
NaiveBayes = function(x, ...)
  UseMethod("NaiveBayes")

NaiveBayes.default = function(x, y, laplace = 0, ...){
  call = match.call()
  x = as.data.frame(x)
  n_var = ncol(x)
  Name_y = deparse(substitute(y))
  y = as.character(y)
  laplace = laplace

  # for continuous variables, calculate their means and standard deviation under different y level
  # for categorical variables, calculate their frequency and relative frequency under different y

  cppFunction('
  List mean_sd(DataFrame x, CharacterVector y, double laplace = 0){

      int n_var = x.ncol();
      List m_s (x.ncol());
      int n_y = unique(y).size();
      CharacterVector level = unique(y);
      LogicalVector lc (x.nrow());

      for(int j = 0; j < n_var; j++){

       RObject column = x[j];
       Function rcpp_type( "rcpp_type" );
       String judge = rcpp_type(column);

       if(judge == "Numeric"){

        NumericMatrix group (n_y, 2);
         for(int i = 0; i < n_y; i++){
         for(int k = 0; k < x.nrow(); k++){
           lc(k) = (y(k) == level(i));
         }

         NumericVector selected = x[j];
         NumericVector m = selected[lc];
         group(i,_) = NumericVector::create(mean(m), sd(m));
         rownames(group) = level;
         m_s[j] = group;
         }
        } else {

         Function tableC( "tableC" );
         CharacterVector selected = x[j];
         NumericVector grouping = tableC(selected);
         CharacterVector groupnam = grouping.names();
         int group_column = grouping.size();
         NumericMatrix group (n_y, group_column);
         rownames(group) = level;
         colnames(group) = groupnam;

         for(int i = 0; i < n_y; i++){
         for(int k = 0; k < x.nrow(); k++){
           lc(k) = (y(k) == level(i));
         }

         CharacterVector m = selected[lc];
         NumericVector counts = tableC(m);
         CharacterVector xnames = counts.names();
         NumericVector rows (group_column);
         rows.names() = groupnam;

           for(CharacterVector::iterator q = groupnam.begin(); q != groupnam.end(); q++){
             String ind = *q;
             if (std::find(xnames.begin(), xnames.end(),ind) != xnames.end()){
             rows[ind] = (counts[ind] + laplace)/(m.size() + laplace * group_column);
             } else {
             }
           }
             group(i,_) = rows;
         }

           NumericVector xnames = tableC(selected);
           CharacterVector xnam = xnames.names();
           colnames(group) = xnam;
           m_s[j] = group;
         }
        }
      return m_s;
  }
  ')

  # format output
  apriori = table(y)
  results = mean_sd(x, y)

  for (i in 1:length(results)){
    names(dimnames(results[[i]])) = c(Name_y, colnames(x)[i])
  }
  names(dimnames(apriori)) = Name_y

  structure(list(apriori = apriori / sum(apriori),
                 results = results,
                 levels = if (is.logical(y)) c(FALSE, TRUE) else levels(y),
                 predictors = colnames(x),
                 call   = call
  ),
  class = "NaiveBayes"
  )
}


# for formula input
NaiveBayes.formula = function(formula, data, laplace = 0, ...) {
  call = match.call()
  fm = match.call(expand.dots = FALSE)
  fm$... = NULL
  fm$laplace = NULL
  fm[[1L]] = quote(stats::model.frame)
  fm = eval(fm, parent.frame())
  tms = attr(fm, "terms")
  Y = model.extract(fm, "response")
  X = fm[,-attr(tms, "response"), drop = FALSE]

  return(NaiveBayes(X, Y, laplace = laplace, ...))
}


# check data type
cppFunction('
String rcpp_type(RObject x){
  if(is<NumericVector>(x)){
    if(Rf_isMatrix(x))  return "Numeric";
    else return "Numeric";
  }
  else if(is<IntegerVector>(x)){
    if(Rf_isFactor(x)) return "Non-Numeric";
    else return "Numeric";
  }
  else if(is<CharacterVector>(x))
    return "Non-Numeric";
  else if(is<LogicalVector>(x))
    return "Non-Numeric";
  else if(x.isNULL())
    return "Non-Numeric";
  else{
    return "Non-Numeric";
  }
}
')

# generate a frequncy table of data
cppFunction('
std::map<String, int> tableC(CharacterVector x) {
  std::map<String, int> counts;
  int n = x.size();
  for (int i = 0; i < n; i++) {
    counts[x[i]]++;
  }
  return counts;
}
')

# output format for model fitting
print.NaiveBayes = function(x, ...) {
  cat("\nNaive Bayes Classifier for Discrete Predictors\n\n")
  cat("Call:\n")
  print(x$call)
  cat("\nA-priori probabilities:\n")
  print(x$apriori)
  cat("\nPredictors:\n")
  print(x$predictors)
  cat("\nConditional probabilities:\n")
  for (i in x$results) {print(i); cat("\n")}
}



predict.NaiveBayes <- function(object, newdata, type = c("class", "raw"), threshold = 0.001, eps = 0, ...) {
  type = match.arg(type)
  newdata = as.data.frame(newdata)
  neworder = match(object$predictors, colnames(newdata))
  probs = matrix(0, length(object$apriori), nrow(newdata))

  # generate probability of each observation given each level of y
  for (j in 1:nrow(newdata)){
    probs[ , j] = rowSums(log(sapply(neworder, function(index){
      if (is.numeric(newdata[j,index])){
        oneresult = object$results[[index]]
        oneresult[, 2][oneresult[, 2] <= eps] = threshold
        return(dnorm(newdata[j,index], oneresult[, 1], oneresult[, 2]))
      } else {
        prob = object$results[[index]][, newdata[j, index]]
        prob[prob <= eps] = threshold
        return(prob)
      }})))
  }

  # calculate the probability of each observation being categorized under different levels of y
  probs = t(exp(probs))
  apriori = object$apriori
  for (i in 1:length(apriori)) {
    probs[, i] = probs[, i] * apriori[i]
  }
  sums = rowSums(probs)
  sums[sums == 0] = 1
  probs = probs/sums
  colnames(probs) = rownames(object$results[[1]])

  # output "class" or "raw"
  if (type == "class") {
    class = as.factor(rownames(object$results[[1]])[max.col(probs, ties.method = "first")])
    return(class)
  } else {
    return(probs)
  }
  }
sidiwang/classifier documentation built on Nov. 23, 2019, 12:06 a.m.