R/predict_naive_bn.R

#' Naive Bayes Classifier
#'
#' A function of Naive Bayes Classifier for numeric only independents.
#'
#' @param fit a data.table class modelling dataset; only support numeric independents.
#' @param newdata a data.table class dataset with target variable; only support numeric independents.
#' @param threshold a positive numeric, usually small; if the variance of independent is 0, then use this value to replace it to avoid perfect division. 
#'
#' @return a list of modeling result. It contains the following components:
#'    predict: a prediction result table.
#'
#' @export

predict_naive_bn=function(fit,newdata,threshold){
  #newdata must be data.table class WITH TARGET VARIABLE
  #newdata=data
  #threshold=0.01 # replace 0 sd
  zero.prob=1e-4 # replace 0 prob
  
   newdata=newdata[,!fit$target,with=F]
  names.ind=names(newdata)
  n=ncol(newdata)
  level.target=fit$prior$target
  prob=vector("list",length(level.target))
  
  if(fit$method=="exp"){  
    for (i in 1:length(level.target)){
      #i=2
      prob.ind=vector("list",n)
      prior.target=fit$prior$prob[i]
      for (j in 1:length(names.ind)){
        #j=17
        tp=names.ind[j]
        if (fit$overall.mean[[tp]]!=0) lbd=1/fit$overall.mean[[tp]] else
          lbd=1/threshold  
        
        
        index=fit$group.mean[[fit$target]]==level.target[i]
        if (fit$group.mean[[tp]][index]!=0) lbd.cond=1/fit$group.mean[[tp]][index] else
          lbd.cond=1/threshold 
        test=dexp(newdata[[tp]],rate=lbd)
        if (sum(test==0)==0) {
          prob.ind[[j]]=dexp(newdata[[tp]],rate=lbd.cond)/test
        }else {
          test[test==0]=zero.prob
          prob.ind[[j]]=dexp(newdata[[tp]],rate=lbd.cond)/test
        }
      } 
      # compute target prob for level.target[i]
      prob.ind.matrix=do.call("rbind",prob.ind)
      prob[[i]]=apply(prob.ind.matrix,2,prod)*prior.target
    }
  }else if (fit$method=="normal") {
    for (i in 1:length(level.target)){
      #i=1
      prob.ind=vector("list",n)
      prior.target=fit$prior$prob[i]
      for (j in 1:length(names.ind)){
        #j=1
        tp=names.ind[j]
        if (fit$overall.sd[[tp]]!=0) sd.overall=fit$overall.sd[[tp]] else
          sd.overall=threshold 
        mean.overall=fit$overall.mean[[tp]]
        
        
        index=fit$group.mean[[fit$target]]==level.target[i]
        if (fit$group.sd[[tp]][index]!=0) sd.cond=fit$group.sd[[tp]][index] else
          sd.cond=threshold 
        mean.cond=fit$group.mean[[tp]][index]
        test=dnorm(newdata[[tp]],mean=mean.overall,sd=sd.overall)
        if(sum(test==0)==0){
          prob.ind[[j]]=dnorm(newdata[[tp]],mean=mean.cond,sd=sd.cond)/test
        } else {
          test[test==0]=zero.prob
          prob.ind[[j]]=dnorm(newdata[[tp]],mean=mean.cond,sd=sd.cond)/test
        }
      }
      
      # compute target prob for level.target[i]
      prob.ind.matrix=do.call("rbind",prob.ind)
      prob[[i]]=apply(prob.ind.matrix,2,prod)*prior.target
    }
  }else if (fit$method=="gamma"){
    for (i in 1:length(level.target)){
      #i=1
      prob.ind=vector("list",n)
      prior.target=fit$prior$prob[i]
      for (j in 1:length(names.ind)){
        #j=1
        tp=names.ind[j]
        if ((fit$overall.a[[tp]])^2!=0) a.overall=fit$overall.a[[tp]] else
          a.overall=sqrt(threshold) 
        k.overall=fit$overall.k[[tp]]
        
        
        index=fit$group.a[[fit$target]]==level.target[i]
        if ((fit$group.a[[tp]][index])^2!=0) a.cond=fit$group.a[[tp]][index] else
          a.cond=sqrt(threshold) 
        k.cond=fit$group.k[[tp]][index]
        test=dgamma(newdata[[tp]],shape=k.overall,scale=a.overall)
        if(sum(test==0)==0){
          prob.ind[[j]]=dgamma(newdata[[tp]],shape=k.cond,scale=a.cond)/test
        } else {
          test[test==0]=zero.prob
          prob.ind[[j]]=dgamma(newdata[[tp]],shape=k.cond,scale=a.cond)/test
        }
      }
      
      # compute target prob for level.target[i]
      prob.ind.matrix=do.call("rbind",prob.ind)
      prob[[i]]=apply(prob.ind.matrix,2,prod)*prior.target
    }
  }
  
  rm(prob.ind.matrix,prob.ind)
  prob.final=do.call("rbind",prob)
  rm(prob)
  prob.final.sum=apply(prob.final,2,sum)
  pred=data.frame(t(sweep(prob.final,2,prob.final.sum,"/")))
  rm(prob.final)
  names(pred)=level.target
  index=prob.final.sum %in% c(0,Inf) | is.nan(prob.final.sum)
  temp=data.frame(rep(fit$prior$prob[1],sum(index)),rep(fit$prior$prob[2],sum(index)))
  names(temp)=level.target
  pred[index,]=temp
  return(list(predict=pred))
}
xinzhou1023/nbcont documentation built on May 4, 2019, 1:07 p.m.