#This is a auxilar function to estimate parameters of naive Bayes model.
#Arguments:
#distr: is a string that specifies which distribution should be used for modelling the features
#by_var: is a list that contains the data separated by category.
#Output: is a list which contains the parameters estimates.
runPar = function(by_var){
lapply(by_var, function(y){
sapply(y, simplify=FALSE, function(w){
if( class(w) == "factor"){
est.par = prop.table(table(w, dnn=NULL))
#names(est.par) =paste(levels(w))
est.par
} else if(class(w) == "numeric") {
est.par = as.vector(c(mean(w),var(w)))
names(est.par) = c("mu","sigma2")
est.par
} else {
stop("Error")
}
})
})
}
#This is a auxilar function to classify a new observation
#Arguments:
#distr: is a string that specifies which distribution should be used for modelling the features
#par: is a list which contains informations about the parameters.
#xnew.quali: a vector which contains qualititative/discrete features of the new #observation.
#xnew.quant: a vector which contains quantitative features of the new observation.
#nam.prob: is a vector of characters with labels of probabilities posterior
#nCat: is the number of categories
#est.Pi: is a vector of prior probabilities
#Output: is a list which contains the posterior probabilities
runProb.mix = function(par,xnew.quali, xnew.quant,nCat,est.Pi){
param = par
if(missing(xnew.quali) || missing(xnew.quant)){
stop("Error: You must provide a new observation")
}
param.quali = lapply(param, function(x){
x[names(x) %in% names(xnew.quali)]
})
prXqualigivenZ = lapply(param.quali, function(w){
sapply(1:length(w), function(x){
s = w[[x]][xnew.quali[x]== names(w[[x]])]
#ifelse(s == 0, 0, log(s) )
ifelse(s == 0, log(s+1e-50), log(s) )
})
})
param.quant = lapply(param, function(x){
x[names(x) %in% names(xnew.quant)]
})
param.quant.aug = lapply(param.quant, function(x){
x = t(matrix(unlist(x),ncol=2,byrow=T))
rbind(x,xnew.quant)
})
prXquantgivenZ = lapply(param.quant.aug, function(w){
apply(w, 2, function(x){
dnorm(x[3], mean = x[1], sd = sqrt(x[2]), log=TRUE)
})
})
prXgivenZ = sapply(1:nCat, function(x){
c(prXquantgivenZ[[x]],prXqualigivenZ[[x]])
})
prXgivenZ = apply(prXgivenZ,2,sum)
if(any(est.Pi == 0) ){
prXZ = prXgivenZ + log(est.Pi+1e-50)
} else {
prXZ = prXgivenZ + log(est.Pi)
}
praddlog = addlog(prXZ[1],prXZ[2])
if(nCat > 2){
for(i in 3:nCat){
praddlog = addlog(praddlog,prXZ[i])
}
}
prRes = exp(prXZ-praddlog)
#names(prRes) = paste("Pr(Z==",nam.prob,"|X=x)",sep="")
prRes
}
#Arguments:
#distr: is a string that specifies which distribution should be used for modelling the features
#par: is a list which contains informations about the parameters.
#xnew.quant: a vector which contains quantitative features of the new observation.
#nam.prob: is a vector of characters with labels of probabilities posterior
#nCat: is the number of categories
#est.Pi: is a vector of prior probabilities
#Output: is a list which contains the posterior probabilities
runProb.quant = function(par, xnew.quant, nCat,est.Pi){
param = par
if( missing(xnew.quant)){
stop("Error: You must provide a new observation")
}
param.quant = lapply(param, function(x){
x[names(x) %in% names(xnew.quant)]
})
param.quant.aug = lapply(param.quant, function(x){
x = t(matrix(unlist(x),ncol=2,byrow=T))
rbind(x,xnew.quant)
})
prXquantgivenZ = lapply(param.quant.aug, function(w){
apply(w, 2, function(x){
dnorm(x[3], mean = x[1], sd = sqrt(x[2]), log=TRUE)
})
})
prXgivenZ = sapply(1:nCat, function(x){
c(prXquantgivenZ[[x]])
})
prXgivenZ = apply(prXgivenZ,2,sum)
if(any(est.Pi == 0) ){
prXZ = prXgivenZ + log(est.Pi+1e-50)
} else {
prXZ = prXgivenZ + log(est.Pi)
}
praddlog = addlog(prXZ[1],prXZ[2])
if(nCat > 2){
for(i in 3:nCat){
praddlog = addlog(praddlog,prXZ[i])
}
}
prRes = exp(prXZ-praddlog)
#names(prRes) = paste("Pr(Z==",nam.prob,"|X=x)",sep="")
prRes
}
#This is a auxilar function to classify a new observation
#Arguments:
#distr: is a string that specifies which distribution should be used for modelling the features
#par: is a list which contains informations about the parameters.
#xnew.quali: a vector which contains qualititative/discrete features of the new #observation.
#nam.prob: is a vector of characters with labels of probabilities posterior
#nCat: is the number of categories
#est.Pi: is a vector of prior probabilities
#Output: is a list which contains the posterior probabilities
runProb.quali = function(par,xnew.quali, nCat,est.Pi){
param = par
if(missing(xnew.quali) ){
stop("Error: You must provide a new observation")
}
param.quali = lapply(param, function(x){
x[names(x) %in% names(xnew.quali)]
})
prXqualigivenZ = lapply(param.quali, function(w){
sapply(1:length(w), function(x){
s = w[[x]][xnew.quali[x]== names(w[[x]])]
#ifelse(s == 0, 0, log(s) )
ifelse(s == 0, log(s+1e-50), log(s) )
})
})
prXgivenZ = sapply(1:nCat, function(x){
c(prXqualigivenZ[[x]])
})
prXgivenZ = apply(prXgivenZ,2,sum)
if(any(est.Pi == 0) ){
prXZ = prXgivenZ + log(est.Pi+1e-50)
} else {
prXZ = prXgivenZ + log(est.Pi)
}
praddlog = addlog(prXZ[1],prXZ[2])
if(nCat > 2){
for(i in 3:nCat){
praddlog = addlog(praddlog,prXZ[i])
}
}
prRes = exp(prXZ-praddlog)
#names(prRes) = paste("Pr(Z==",nam.prob,"|X=x)",sep="")
prRes
}
#This is a R code based on code written in C by Karl Broman (2018).
#Calculate addlog(a,b) = log[exp(a) + exp(b)]
#This makes use of the function log1p(x) = log(1+x) provided
#in R's math library.
addlog=function(a, b)
{
THRESH = 200
if(b > a + THRESH){
return(b)
} else if(a > b + THRESH){
return(a)
} else {
return(a + log1p(exp(b-a)))
}
}
#This is a function to estimate parameters of the naive Bayes model.
#Arguments:
#x: is a dataset which the categorical variable must be a factor
#ct: is a string of the name of categorical variable that classifies the instances into groups
#Output: a nb object with informations about the naive Bayes model:
#par: estimates of parameters of condtional probability density function
#multinomial and gaussian
#est.pi: mle estimate of the proportion of each category
#ct: is a string of the name of categorical variable that classifies the instances into groups
#nfeat: number of features
#ncat: number of categories
naive.bayes = function(x, ct){
#Dealing with exceptions
if(missing(x) || !is.data.frame(x)){
stop("You must provide the dataset as data.frame")
}
if(missing(ct)){
stop("You must provide the name of the categorical variables vector")
}
if(!is.factor(x[ct][,1])){
stop("The categorical variables vector must be factor")
}
type_columns = sapply(x[names(x)!=c(ct)],class)
if(any( type_columns == "logical" ) || any( type_columns == "character" ) ||
any( type_columns == "integer" )
){
stop("The qualitative/discrete features must be a factor and
the quantitative features must be a numeric")
}
res = list()
idx = x[ct][,1]
xi = x[,colnames(x)!=c(ct)]
#Prior Probabilities
est.pi = prop.table(table(idx,dnn=NULL))
#Parameters Estimates
#by_var = lapply(by_var, function(x){ xi[,colnames(x)!=c(ct)] })
by_var = split(xi,idx)
res$par = runPar(by_var)
#Final Results
res$est.pi = est.pi
res$nfeat = ncol(x[,colnames(x)!=c(ct)])
res$ct = ct
res$ncat = length(est.pi)
class(res) = "nb"
invisible(res)
}
#post.prob:
#This a function to compute posterior probabilities based on object of class nb
#Arguments:
#nb: is a object of class nb which contains information about the
#naive Bayes model
#xnew.quali: a vector which contains qualititative/discrete features of the new #observation.
#xnew.quant: a vector which contains quantitative features of the new observation.
#Output: a vector which contains the posterior probabilities
post.prob = function(object, xnew.quali, xnew.quant){
if(missing(xnew.quali) && missing(xnew.quant) ){
stop("You must provide the new observations")
}
#Number of categories
ncat = object$ncat
if((!missing(xnew.quali)) && (!missing(xnew.quant))){
runProb.mix(object$par,xnew.quali,xnew.quant,ncat,object$est.pi)
} else if((missing(xnew.quali)) && (!missing(xnew.quant))) {
runProb.quant(object$par,xnew.quant,ncat,object$est.pi)
} else {
runProb.quali(object$par,xnew.quali,ncat,object$est.pi)
}
}
#classify:
#This is a function which classifies the instances into categories
#Arguments:
#object: is a object of class nb
#xnew: is a vector of new observation
classify = function(object, xnew.quali, xnew.quant){
if(missing(xnew.quali) && missing(xnew.quant) ){
stop("You must provide the new observations")
}
if((!missing(xnew.quali)) && (!missing(xnew.quant))){
pred = post.prob(object,xnew.quali,xnew.quant)
} else if((missing(xnew.quali)) && (!missing(xnew.quant))) {
pred = post.prob(object,xnew.quant=xnew.quant)
} else {
pred = post.prob(object,xnew.quali)
}
names(pred)[which.max(pred)]
}
#confusion.matrix
#This is a function to estimate relative frequencies of the elements of
#confusion matrix based on k fold cross validation
#Arguments:
#object: is a nb object
#x: the dataset
#k: number of folds
#Output: Relative frequencies of the elements of confusion matrix
confusion.matrix = function(object, x, k=10){
if(k!=10 && k!=5){
stop("It is only allowed 5 and 10!")
}
#k folds Cross Validation
flds = createFolds(x[object$ct][,1], k = k, list = TRUE, returnTrain = FALSE)
res = lapply(1:k, function(w){
#Defining test sample
xtst = x[flds[[w]], names(x[colnames(x) != object$ct]) ]
#Defining training sample
xtrn = x[-flds[[w]],]
#Training the naive Bayes model
modtrn = naive.bayes(x=xtrn, ct=object$ct)
#Checking the class o variables
idclass = sapply(xtst,function(x){class(x)})
if(all(idclass == "factor")){
xtst = xtst[,idclass == "factor"]
pred = apply(xtst, 1, function(y){
classify(object = modtrn, xnew.quali=y)
})
} else if(all(idclass == "numeric")){
xtst = xtst[,idclass == "numeric"]
pred = apply(xtst, 1, function(y){
classify(object = modtrn, xnew.quant=y)
})
} else {
pred = unlist(sapply(1:nrow(xtst), simplify=FALSE, function(x){
x.quali = unlist(xtst[x,idclass=="factor"])
x.quant = unlist(xtst[x,idclass=="numeric"])
classify(object = modtrn, xnew.quali=x.quali,
xnew.quant=x.quant)
}) )
}
pred = factor(pred,levels(x[object$ct][,1]))
obs = factor(x[flds[[w]], colnames(x) == object$ct],levels(x[object$ct][,1]))
#Confusion matrix
conf.matrix = table(pred,obs)
#conf.matrix = prop.table(conf.matrix)
conf.matrix
})
nRow = nCol = length(levels(x[object$ct][,1]))
conf.array = array(unlist(res), dim = c(nRow, nCol , length(res)))
conf.matrix = apply(conf.array,c(1,2),sum)
#conf.matrix = (1/k)*conf.matrix
conf.matrix
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.