R/logforest.R

Defines functions logforest

Documented in logforest

#' Logic Forest
#'
#' Constructs an ensemble of logic regression models using bagging for classification and identification of important predictors and predictor interactions
#' @usage logforest(resp, Xs, nBSXVars, anneal.params, nBS=100, h=0.5, norm=TRUE, numout=5, nleaves)
#' @param resp numeric vector of binary response values
#' @param Xs matrix or dataframe of zeros and ones for all predictor variables
#' @param nBSXVars integer for the number of predictors used to construct each logic regression model.  The default value is all predictors in the data.
#' @param anneal.params a list containing the parameters for simulated annealing.  See the help file for the function \code{logreg.anneal.control} in the \code{LogicReg} package.  If missing, default annealing parameters are set at \code{start}=1, \code{end}=-2, and \code{iter}=50000.
#' @param nBS number of logic regression trees to be fit in the logic forest model.
#' @param h a number between 0 and 1 for the minimum proportion of trees in the logic forest that must predict a 1 for the prediction to be one.
#' @param norm logical.  If FALSE, predictor and interaction scores in model output are not normalized to range between zero and one.
#' @param numout number of predictors and interactions to be included in model output
#' @param nleaves the maximum number of end nodes generated for each tree
#'
#' @return An object of class \code{"logforest"} including a list of values

#' @export

logforest<-function(resp, Xs, nBSXVars, anneal.params, nBS=100, h=0.5, norm=TRUE, numout=5, nleaves)
{
  pred<-ncol(Xs)
  if (missing(anneal.params)) {anneal.params<-logreg.anneal.control(start=2, end=-1, iter=100000)}
  if (missing(nBSXVars)) {nBSXVars<-pred}
  n<-nrow(Xs)
  if (is.null(colnames(Xs))) {x.nms<-paste("X", 1:pred, sep="")}
  else {x.nms<-colnames(Xs)}
  fitlist<-vector("list", nBS)
  IBdata<-vector("list", nBS)
  OOBdata<-vector("list", nBS)
  OOBpred<-matrix(nrow=n, ncol=nBS)
  single.predimport<-vector("list",nBS)
  vimp.import<-vector("list", nBS)
  treepreds.list<-vector("list", nBS)
  IPImatrix<-matrix(NA, nrow=0, ncol=(1+pred*2))
  colnames(IPImatrix)<-c("treenum", x.nms, paste("!",x.nms, sep=""))
  if (length(which(resp %in% c(0, 1))) == length(resp))  {mtype="Classification"}
  if (length(which(resp %in% c(0, 1))) != length(resp))  {mtype="Linear Regression"}
  for(b in 1:nBS)
  {
    if(missing(nleaves)) {nleaves<-sample(2:8, 1, replace=FALSE)}
    BSindices<-sample(1:n, n, replace=TRUE)
    OOBindices<-(1:n)[!is.element(1:n, BSindices)]
    BS<-Xs[BSindices, ] #Selects the bootstrap sample corresponding to the chosen indices
    OOB<-Xs[OOBindices, ]  #Selects the out-of-bag sample
    BSY<-resp[BSindices]  #Response variable for bootstrap sample
    OOBY<-resp[OOBindices]  #Response variable for out-of-bag sample
    XVarIndices<-sort(sample(1:pred, nBSXVars, replace=FALSE))
    rsBSX<-BS[ ,XVarIndices]  #Bootstrap with selected X-vars
    rsOOBX<-OOB[,XVarIndices]  #Out-of-bag with selected X-vars
    FinalBS<-cbind(rsBSX, BSY)   #Final Bootstrap with Y
    FinalOOB<-cbind(rsOOBX, OOBY)   #Final out-of-bag with Y
    colnames(FinalBS)<-colnames(FinalOOB)<-c(x.nms[XVarIndices], "Y")
    if (mtype=="Classification")
    {
      fit <- logreg(resp = BSY, bin = FinalBS[,1:nBSXVars],
                    type = 1, select = 1, ntrees = 1, nleaves = nleaves,   anneal.control = anneal.params)
    }
    if (mtype=="Linear Regression")
    {
      fit <- logreg(resp = BSY, bin = FinalBS[,1:nBSXVars],
                    type = 2, select = 1, ntrees = 1, nleaves = nleaves,   anneal.control = anneal.params)
    }

    OOBpred[as.vector(OOBindices),b]<-predict.logreg(fit, newbin=as.matrix(FinalOOB[,1:nBSXVars]))
    if (sum(fit$model$tree[[1]]$trees[,3])!=0)
    {
      pred.import<-pimp.import(fit=fit, data=FinalBS, testdata=FinalOOB, BSpred=length(XVarIndices),
                               pred=pred, Xs=XVarIndices, mtype=mtype)
      vimp.import[[b]]<-pred.import$pimp.vimp
      single.predimport[[b]]<-pred.import$single.vimp
      treepreds.list[[b]]<-pred.import$Xids
      ipimat<-cbind(rep(b, nrow(pred.import$Ipimat)), pred.import$Ipimat)
      IPImatrix<-rbind(IPImatrix, ipimat)
    }
    else
    {
      single.predimport[[b]]= 0
      vimp.import[[b]]= 0
      treepreds.list[[b]]=0
      IPImatrix<-rbind(IPImatrix, c(b, rep(0, pred*2)))
    }
    fitlist[[b]]<-fit   #list of all models
    IBdata[[b]]<-BSindices
    OOBdata[[b]]<-OOBindices
  }
  if (mtype=="Classification")
  {
    OOB.pred<-matrix(0, nrow=n, ncol=2)
    for (i in 1:n)
    {
      pred.ids<-which(OOBpred[i,]==1|OOBpred[i,]==0)
      pred.vec<-OOBpred[i,c(pred.ids)]
      OOBprop<-sum(pred.vec)/length(pred.vec)
      OOBi.pred<-ifelse(OOBprop>h, 1, 0)
      OOB.pred[i,]<-c(OOBi.pred, OOBprop)
    }
    colnames(OOB.pred)<-c("predicted_class","proportion")
    OOBmiss<-sum(abs(OOB.pred[,1]-resp))/n
  }
  if (mtype=="Linear Regression")
  {
    OOB.pred<-matrix(0, nrow=n, ncol=2)
    for (i in 1:n)
    {
      pred.ids<-which(is.na(OOBpred[i,])==F)
      pred.vec<-OOBpred[i,c(pred.ids)]
      OOBval<-sum(pred.vec)/length(pred.vec)
      OOB.pred[i,]<-c(i, OOBval)
    }
    colnames(OOB.pred)<-c("sample id","mean prediction")
    OOBmiss<-sum((OOB.pred[,2]-resp)^2)/n
  }
  pred.importmat<-matrix(0, nrow=nBS, ncol=pred)
  colnames(pred.importmat)<-x.nms
  for (i in 1:nBS)
  {
    pred.ind<-treepreds.list[[i]]
    m<-length(pred.ind)
    for (j in 1:m)
    {
      col<-pred.ind[j]
      pred.importmat[i,col]<-single.predimport[[i]][j]
    }
  }
  pred.imp<-colSums(pred.importmat)
  names(pred.imp)<-x.nms
  freq.table<-table(names(unlist(single.predimport)))
  all.pimps<-unique(names(unlist(vimp.import)))
  npimps<-length(unique(names(unlist(vimp.import))))
  pimp.freqtable<-table(names(unlist(vimp.import)))
  pimptable<-matrix(0, nrow=nBS, ncol=npimps)
  colnames(pimptable)<-all.pimps
  for (i in 1:nBS)
  {
    npimps.tree<-length(vimp.import[[i]])
    for (j in 1:npimps.tree)
    {
      cpimp<-vimp.import[[i]][j]
      col.id<-which(colnames(pimptable)%in%names(cpimp))
      pimptable[i,col.id]<-cpimp
    }
  }
  pimpsum<-colSums(pimptable)  #importance of each interaction
  t5PIs<-names(sort(pimpsum, decreasing=TRUE)[1:5])
  ans<-list(AllFits=fitlist, Top5.PI=t5PIs, Predictor.importance=pred.imp, Predictor.frequency=freq.table,
            PI.frequency=pimp.freqtable, PI.importance=pimpsum,  ModelPI.import=vimp.import,
            IPImatrix=IPImatrix, OOBmiss=OOBmiss, OOBprediction=OOB.pred, IBdata=IBdata, OOBdata=OOBdata, norm=norm,
            numout=numout, predictors=pred, Xs=Xs, model.type=mtype)
  class(ans)<-"logforest"
  ans
}

Try the LogicForest package in your browser

Any scripts or data that you put into this service are public.

LogicForest documentation built on June 22, 2024, 7:05 p.m.