nb: Functions to run the naive Bayes classifier

#This is a auxilar function to estimate parameters of naive Bayes model.
#Arguments:
#distr: is a string that specifies which distribution should be used for modelling the features
#by_var: is a list that contains the data separated by category.
#Output: is a list which contains the parameters estimates.

runPar = function(by_var){
  lapply(by_var, function(y){
    sapply(y, simplify=FALSE, function(w){
      if( class(w) == "factor"){
        est.par = prop.table(table(w, dnn=NULL))
        #names(est.par) =paste(levels(w))
        est.par
      } else if(class(w) == "numeric")  {
        est.par = as.vector(c(mean(w),var(w)))
        names(est.par) = c("mu","sigma2")
        est.par
      } else {
        stop("Error")
      }
    })
  })
}



#This is a auxilar function to classify a new observation
#Arguments:
#distr: is a string that specifies which distribution should be used for modelling the features
#par: is a list which contains informations about the parameters.
#xnew.quali: a vector which contains qualititative/discrete features of the new #observation.
#xnew.quant: a vector which contains quantitative features of the new observation.
#nam.prob: is a vector of characters with labels of probabilities posterior
#nCat: is the number of categories
#est.Pi: is a vector of prior probabilities
#Output: is a list which contains the posterior probabilities


runProb.mix = function(par,xnew.quali, xnew.quant,nCat,est.Pi){
  param = par
  if(missing(xnew.quali) || missing(xnew.quant)){
    stop("Error: You must provide a new observation")
  }
  param.quali = lapply(param, function(x){
    x[names(x) %in% names(xnew.quali)]
  })
  prXqualigivenZ = lapply(param.quali, function(w){
    sapply(1:length(w), function(x){
      s = w[[x]][xnew.quali[x]== names(w[[x]])]
      #ifelse(s == 0, 0, log(s) )
      ifelse(s == 0, log(s+1e-50), log(s) )
    })
  })
  param.quant = lapply(param, function(x){
    x[names(x) %in% names(xnew.quant)]
  })
  param.quant.aug = lapply(param.quant, function(x){
    x = t(matrix(unlist(x),ncol=2,byrow=T))
    rbind(x,xnew.quant)
  })
  prXquantgivenZ = lapply(param.quant.aug, function(w){
    apply(w, 2, function(x){
      dnorm(x[3], mean = x[1], sd = sqrt(x[2]), log=TRUE)
    })
  })
  prXgivenZ = sapply(1:nCat, function(x){
    c(prXquantgivenZ[[x]],prXqualigivenZ[[x]])
  })
  prXgivenZ = apply(prXgivenZ,2,sum)
  if(any(est.Pi == 0) ){
    prXZ =  prXgivenZ + log(est.Pi+1e-50)
  } else {
    prXZ =  prXgivenZ + log(est.Pi)
  }
  praddlog = addlog(prXZ[1],prXZ[2])
  if(nCat > 2){
    for(i in 3:nCat){
      praddlog = addlog(praddlog,prXZ[i])
    }
  }
  prRes = exp(prXZ-praddlog)
  #names(prRes) = paste("Pr(Z==",nam.prob,"|X=x)",sep="")
  prRes
}

#Arguments:
#distr: is a string that specifies which distribution should be used for modelling the features
#par: is a list which contains informations about the parameters.
#xnew.quant: a vector which contains quantitative features of the new observation.
#nam.prob: is a vector of characters with labels of probabilities posterior
#nCat: is the number of categories
#est.Pi: is a vector of prior probabilities
#Output: is a list which contains the posterior probabilities


runProb.quant = function(par, xnew.quant, nCat,est.Pi){
  param = par
  if( missing(xnew.quant)){
    stop("Error: You must provide a new observation")
  }
  param.quant = lapply(param, function(x){
    x[names(x) %in% names(xnew.quant)]
  })
  param.quant.aug = lapply(param.quant, function(x){
    x = t(matrix(unlist(x),ncol=2,byrow=T))
    rbind(x,xnew.quant)
  })
  prXquantgivenZ = lapply(param.quant.aug, function(w){
    apply(w, 2, function(x){
      dnorm(x[3], mean = x[1], sd = sqrt(x[2]), log=TRUE)
    })
  })
  prXgivenZ = sapply(1:nCat, function(x){
    c(prXquantgivenZ[[x]])
  })
  prXgivenZ = apply(prXgivenZ,2,sum)
  if(any(est.Pi == 0) ){
    prXZ =  prXgivenZ + log(est.Pi+1e-50)
  } else {
    prXZ =  prXgivenZ + log(est.Pi)
  }
  praddlog = addlog(prXZ[1],prXZ[2])
  if(nCat > 2){
    for(i in 3:nCat){
      praddlog = addlog(praddlog,prXZ[i])
    }
  }
  prRes = exp(prXZ-praddlog)
  #names(prRes) = paste("Pr(Z==",nam.prob,"|X=x)",sep="")
  prRes
}

#This is a auxilar function to classify a new observation
#Arguments:
#distr: is a string that specifies which distribution should be used for modelling the features
#par: is a list which contains informations about the parameters.
#xnew.quali: a vector which contains qualititative/discrete features of the new #observation.
#nam.prob: is a vector of characters with labels of probabilities posterior
#nCat: is the number of categories
#est.Pi: is a vector of prior probabilities
#Output: is a list which contains the posterior probabilities

runProb.quali = function(par,xnew.quali, nCat,est.Pi){
  param = par
  if(missing(xnew.quali) ){
    stop("Error: You must provide a new observation")
  }
  param.quali = lapply(param, function(x){
    x[names(x) %in% names(xnew.quali)]
  })
  prXqualigivenZ = lapply(param.quali, function(w){
    sapply(1:length(w), function(x){
      s = w[[x]][xnew.quali[x]== names(w[[x]])]
      #ifelse(s == 0, 0, log(s) )
      ifelse(s == 0, log(s+1e-50), log(s) )
    })
  })
  prXgivenZ = sapply(1:nCat, function(x){
    c(prXqualigivenZ[[x]])
  })
  prXgivenZ = apply(prXgivenZ,2,sum)
  if(any(est.Pi == 0) ){
    prXZ =  prXgivenZ + log(est.Pi+1e-50)
  } else {
    prXZ =  prXgivenZ + log(est.Pi)
  }
  praddlog = addlog(prXZ[1],prXZ[2])
  if(nCat > 2){
    for(i in 3:nCat){
      praddlog = addlog(praddlog,prXZ[i])
    }
  }
  prRes = exp(prXZ-praddlog)
  #names(prRes) = paste("Pr(Z==",nam.prob,"|X=x)",sep="")
  prRes
}



#This is a R code based on code written in C by Karl Broman (2018).
#Calculate addlog(a,b) = log[exp(a) + exp(b)]
#This makes use of the function log1p(x) = log(1+x) provided
#in R's math library.

addlog=function(a, b)
{
  THRESH = 200
  if(b > a + THRESH){
    return(b)
  } else if(a > b + THRESH){
    return(a)
  } else {
    return(a + log1p(exp(b-a)))
  }
}


#This is a function to estimate parameters of the naive Bayes model.
#Arguments:
#x: is a dataset which the categorical variable must be a factor
#ct: is a string of the name of categorical variable that classifies the instances into groups
#Output: a nb object with informations about the naive Bayes model:
#par: estimates of parameters of condtional probability density function
#multinomial and gaussian
#est.pi: mle estimate of the  proportion of each category
#ct: is a string of the name of categorical variable that classifies the instances into groups
#nfeat: number of features
#ncat: number of categories


naive.bayes = function(x, ct){
  #Dealing with exceptions
  if(missing(x) || !is.data.frame(x)){
    stop("You must provide the dataset as data.frame")
  }
  if(missing(ct)){
    stop("You must provide the name of the categorical variables vector")
  }
  if(!is.factor(x[ct][,1])){
    stop("The categorical variables vector must be factor")
  }
  type_columns = sapply(x[names(x)!=c(ct)],class)
  if(any( type_columns == "logical" ) || any( type_columns == "character" ) ||
     any( type_columns == "integer" )
  ){
    stop("The qualitative/discrete features must be a factor and
         the quantitative features must be a numeric")
  }
  res = list()
  idx = x[ct][,1]
  xi = x[,colnames(x)!=c(ct)]
  #Prior Probabilities
  est.pi = prop.table(table(idx,dnn=NULL))
  #Parameters Estimates
  #by_var = lapply(by_var, function(x){  xi[,colnames(x)!=c(ct)] })
  by_var = split(xi,idx)
  res$par = runPar(by_var)
  #Final Results
  res$est.pi = est.pi
  res$nfeat = ncol(x[,colnames(x)!=c(ct)])
  res$ct = ct
  res$ncat = length(est.pi)
  class(res) = "nb"
  invisible(res)
  }


#post.prob:
#This a function to compute posterior probabilities based on object of class nb
#Arguments:
#nb: is a object of class nb which contains information about the
#naive Bayes model
#xnew.quali: a vector which contains qualititative/discrete features of the new #observation.
#xnew.quant: a vector which contains quantitative features of the new observation.
#Output: a vector which contains the posterior probabilities

post.prob = function(object, xnew.quali, xnew.quant){
  if(missing(xnew.quali)  && missing(xnew.quant)   ){
    stop("You must provide the new observations")
  }
  #Number of categories
  ncat = object$ncat
  if((!missing(xnew.quali))  && (!missing(xnew.quant))){
    runProb.mix(object$par,xnew.quali,xnew.quant,ncat,object$est.pi)
  } else  if((missing(xnew.quali))  && (!missing(xnew.quant))) {
    runProb.quant(object$par,xnew.quant,ncat,object$est.pi)
  } else {
    runProb.quali(object$par,xnew.quali,ncat,object$est.pi)
  }

}

#classify:
#This is a function which classifies the instances into categories
#Arguments:
#object: is a object of class nb
#xnew: is a vector of new observation

classify = function(object, xnew.quali, xnew.quant){
  if(missing(xnew.quali)  && missing(xnew.quant)   ){
    stop("You must provide the new observations")
  }
  if((!missing(xnew.quali))  && (!missing(xnew.quant))){
    pred = post.prob(object,xnew.quali,xnew.quant)
  } else  if((missing(xnew.quali))  && (!missing(xnew.quant))) {
    pred = post.prob(object,xnew.quant=xnew.quant)
  } else {
    pred = post.prob(object,xnew.quali)
  }
  names(pred)[which.max(pred)]
}

#confusion.matrix
#This is a function to estimate relative frequencies of the elements of
#confusion matrix based on k fold cross validation
#Arguments:
#object: is a nb object
#x: the dataset
#k: number of folds
#Output: Relative frequencies of the elements of confusion matrix

confusion.matrix = function(object, x, k=10){
  if(k!=10 && k!=5){
    stop("It is only allowed  5 and 10!")
  }
  #k folds Cross Validation
  flds = createFolds(x[object$ct][,1], k = k, list = TRUE, returnTrain = FALSE)
  res = lapply(1:k, function(w){
    #Defining test sample
    xtst = x[flds[[w]], names(x[colnames(x) != object$ct]) ]
    #Defining training sample
    xtrn = x[-flds[[w]],]
    #Training the naive Bayes model
    modtrn = naive.bayes(x=xtrn, ct=object$ct)
    #Checking the class o variables
    idclass = sapply(xtst,function(x){class(x)})
    if(all(idclass == "factor")){
      xtst = xtst[,idclass == "factor"]
      pred = apply(xtst, 1, function(y){
        classify(object = modtrn, xnew.quali=y)
      })
    } else if(all(idclass == "numeric")){
      xtst = xtst[,idclass == "numeric"]
      pred = apply(xtst, 1, function(y){
        classify(object = modtrn, xnew.quant=y)
      })
    } else {
      pred =  unlist(sapply(1:nrow(xtst), simplify=FALSE, function(x){
        x.quali = unlist(xtst[x,idclass=="factor"])
        x.quant = unlist(xtst[x,idclass=="numeric"])
        classify(object = modtrn, xnew.quali=x.quali,
                 xnew.quant=x.quant)
      }) )
    }
    pred = factor(pred,levels(x[object$ct][,1]))
    obs =  factor(x[flds[[w]], colnames(x) == object$ct],levels(x[object$ct][,1]))
    #Confusion matrix
    conf.matrix = table(pred,obs)
    #conf.matrix = prop.table(conf.matrix)
    conf.matrix
  })
  nRow = nCol =  length(levels(x[object$ct][,1]))
  conf.array = array(unlist(res), dim = c(nRow, nCol , length(res)))
  conf.matrix = apply(conf.array,c(1,2),sum)
  #conf.matrix = (1/k)*conf.matrix
  conf.matrix
}