#' Find first non-missing element
#'
#' Given a set of vectors, `coalesce()` finds the first non-missing value
#' at each position. This is inspired by the SQL `COALESCE` function
#' which does the same thing for `NULL`s.
#'
#' @param ... Vectors. All inputs should either be length 1, or the
#' same length as the first argument.
#'
#' These dots support [tidy dots][rlang::tidy-dots] features.
#' @return A vector the same length as the first `...` argument with
#' missing values replaced by the first non-missing value.
#' @seealso [na_if()] to replace specified values with a `NA`.
#' [tidyr::replace_na()] to replace `NA` with a value
#' @export
#' @examples
#' # Use a single value to replace all missing values
#' x <- sample(c(1:5, NA, NA, NA))
#' coalesce(x, 0L)
#'
#' # Or match together a complete vector from missing pieces
#' y <- c(1, 2, NA, NA, 5)
#' z <- c(NA, NA, 3, 4, 5)
#' coalesce(y, z)
#'
#' # Supply lists by splicing them into dots:
#' vecs <- list(
#' c(1, 2, NA, NA, 5),
#' c(NA, NA, 3, 4, 5)
#' )
#' coalesce(!!!vecs)
#' @import Rcpp
NaiveBayes = function(x, ...)
UseMethod("NaiveBayes")
NaiveBayes.default = function(x, y, laplace = 0, ...){
call = match.call()
x = as.data.frame(x)
n_var = ncol(x)
Name_y = deparse(substitute(y))
y = as.character(y)
laplace = laplace
# for continuous variables, calculate their means and standard deviation under different y level
# for categorical variables, calculate their frequency and relative frequency under different y
cppFunction('
List mean_sd(DataFrame x, CharacterVector y, double laplace = 0){
int n_var = x.ncol();
List m_s (x.ncol());
int n_y = unique(y).size();
CharacterVector level = unique(y);
LogicalVector lc (x.nrow());
for(int j = 0; j < n_var; j++){
RObject column = x[j];
Function rcpp_type( "rcpp_type" );
String judge = rcpp_type(column);
if(judge == "Numeric"){
NumericMatrix group (n_y, 2);
for(int i = 0; i < n_y; i++){
for(int k = 0; k < x.nrow(); k++){
lc(k) = (y(k) == level(i));
}
NumericVector selected = x[j];
NumericVector m = selected[lc];
group(i,_) = NumericVector::create(mean(m), sd(m));
rownames(group) = level;
m_s[j] = group;
}
} else {
Function tableC( "tableC" );
CharacterVector selected = x[j];
NumericVector grouping = tableC(selected);
CharacterVector groupnam = grouping.names();
int group_column = grouping.size();
NumericMatrix group (n_y, group_column);
rownames(group) = level;
colnames(group) = groupnam;
for(int i = 0; i < n_y; i++){
for(int k = 0; k < x.nrow(); k++){
lc(k) = (y(k) == level(i));
}
CharacterVector m = selected[lc];
NumericVector counts = tableC(m);
CharacterVector xnames = counts.names();
NumericVector rows (group_column);
rows.names() = groupnam;
for(CharacterVector::iterator q = groupnam.begin(); q != groupnam.end(); q++){
String ind = *q;
if (std::find(xnames.begin(), xnames.end(),ind) != xnames.end()){
rows[ind] = (counts[ind] + laplace)/(m.size() + laplace * group_column);
} else {
}
}
group(i,_) = rows;
}
NumericVector xnames = tableC(selected);
CharacterVector xnam = xnames.names();
colnames(group) = xnam;
m_s[j] = group;
}
}
return m_s;
}
')
# format output
apriori = table(y)
results = mean_sd(x, y)
for (i in 1:length(results)){
names(dimnames(results[[i]])) = c(Name_y, colnames(x)[i])
}
names(dimnames(apriori)) = Name_y
structure(list(apriori = apriori / sum(apriori),
results = results,
levels = if (is.logical(y)) c(FALSE, TRUE) else levels(y),
predictors = colnames(x),
call = call
),
class = "NaiveBayes"
)
}
# for formula input
NaiveBayes.formula = function(formula, data, laplace = 0, ...) {
call = match.call()
fm = match.call(expand.dots = FALSE)
fm$... = NULL
fm$laplace = NULL
fm[[1L]] = quote(stats::model.frame)
fm = eval(fm, parent.frame())
tms = attr(fm, "terms")
Y = model.extract(fm, "response")
X = fm[,-attr(tms, "response"), drop = FALSE]
return(NaiveBayes(X, Y, laplace = laplace, ...))
}
# check data type
cppFunction('
String rcpp_type(RObject x){
if(is<NumericVector>(x)){
if(Rf_isMatrix(x)) return "Numeric";
else return "Numeric";
}
else if(is<IntegerVector>(x)){
if(Rf_isFactor(x)) return "Non-Numeric";
else return "Numeric";
}
else if(is<CharacterVector>(x))
return "Non-Numeric";
else if(is<LogicalVector>(x))
return "Non-Numeric";
else if(x.isNULL())
return "Non-Numeric";
else{
return "Non-Numeric";
}
}
')
# generate a frequncy table of data
cppFunction('
std::map<String, int> tableC(CharacterVector x) {
std::map<String, int> counts;
int n = x.size();
for (int i = 0; i < n; i++) {
counts[x[i]]++;
}
return counts;
}
')
# output format for model fitting
print.NaiveBayes = function(x, ...) {
cat("\nNaive Bayes Classifier for Discrete Predictors\n\n")
cat("Call:\n")
print(x$call)
cat("\nA-priori probabilities:\n")
print(x$apriori)
cat("\nPredictors:\n")
print(x$predictors)
cat("\nConditional probabilities:\n")
for (i in x$results) {print(i); cat("\n")}
}
predict.NaiveBayes <- function(object, newdata, type = c("class", "raw"), threshold = 0.001, eps = 0, ...) {
type = match.arg(type)
newdata = as.data.frame(newdata)
neworder = match(object$predictors, colnames(newdata))
probs = matrix(0, length(object$apriori), nrow(newdata))
# generate probability of each observation given each level of y
for (j in 1:nrow(newdata)){
probs[ , j] = rowSums(log(sapply(neworder, function(index){
if (is.numeric(newdata[j,index])){
oneresult = object$results[[index]]
oneresult[, 2][oneresult[, 2] <= eps] = threshold
return(dnorm(newdata[j,index], oneresult[, 1], oneresult[, 2]))
} else {
prob = object$results[[index]][, newdata[j, index]]
prob[prob <= eps] = threshold
return(prob)
}})))
}
# calculate the probability of each observation being categorized under different levels of y
probs = t(exp(probs))
apriori = object$apriori
for (i in 1:length(apriori)) {
probs[, i] = probs[, i] * apriori[i]
}
sums = rowSums(probs)
sums[sums == 0] = 1
probs = probs/sums
colnames(probs) = rownames(object$results[[1]])
# output "class" or "raw"
if (type == "class") {
class = as.factor(rownames(object$results[[1]])[max.col(probs, ties.method = "first")])
return(class)
} else {
return(probs)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.