R/logforest.R
In LogicForest: Logic Forest

Documented in logforest

#' Logic Forest & Logic Survival Forest
#'
#' Constructs an ensemble of logic regression models using bagging for classification or regression, and identifies important predictors and interactions. Logic Forest (LF) efficiently searches the space of logical combinations of binary variables using simulated annealing. It has been extended to support linear and survival regression.
#' @importFrom LogicReg eval.logreg logreg logreg.anneal.control
#' @importFrom methods is
#'
#' @param resp.type String indicating regression type: \code{"bin"} for classification, \code{"lin"} for linear regression, \code{"exp_surv"} for exponential time-to-event, and \code{"cph_surv"} for Cox proportional hazards.
#' @param resp Numeric vector of response values (binary for classification/survival, continuous for linear regression). For time-to-event, indicates event/censoring status.
#' @param resp.time Numeric vector of event/censoring times (used only for survival models).
#' @param Xs Matrix or data frame of binary predictor variables (0/1 only).
#' @param nBSXVars Integer. Number of predictors sampled for each tree (default is all predictors).
#' @param anneal.params A list of parameters for simulated annealing (see \code{\link[LogicReg]{logreg.anneal.control}}). Defaults: \code{start = 1}, \code{end = -2}, \code{iter = 50000}.
#' @param nBS Number of trees to fit in the logic forest.
#' @param h Numeric. Minimum proportion of trees predicting "1" required to classify an observation as "1" (used for classification).
#' @param norm Logical. If \code{FALSE}, importance scores are not normalized.
#' @param numout Integer. Number of predictors and interactions to report.
#' @param nleaves Integer. Maximum number of leaves (end nodes) allowed per tree.
#'
#' @details
#' Logic Forest is designed to identify interactions between binary predictors without requiring their pre-specification. Using simulated annealing, it searches the space of all possible logical combinations (e.g., AND, OR, NOT) among predictors. Originally developed for binary outcomes in gene-environment interaction studies, it has since been extended to linear and time-to-event outcomes (Logic Survival Forest).
#'
#' @return A \code{logforest} object containing:
#' \describe{
#'   \item{Predictor.frequency}{Frequency of each predictor across trees.}
#'   \item{Predictor.importance}{Importance of each predictor.}
#'   \item{PI.frequency}{Frequency of each interaction across trees.}
#'   \item{PI.importance}{Importance of each interaction.}
#' }
#'
#' @references
#' Wolf BJ, Hill EG, Slate EH. (2010). Logic Forest: An ensemble classifier for discovering logical combinations of binary markers. \emph{Bioinformatics}, 26(17):2183–2189. \doi{10.1093/bioinformatics/btq354} \cr
#' Wolf BJ et al. (2012). LBoost: A boosting algorithm with application for epistasis discovery. \emph{PLoS One}, 7(11):e47281. \doi{10.1371/journal.pone.0047281} \cr
#' Hyer JM et al. (2019). Novel Machine Learning Approach to Identify Preoperative Risk Factors Associated With Super-Utilization of Medicare Expenditure Following Surgery. \emph{JAMA Surg}, 154(11):1014–1021. \doi{10.1001/jamasurg.2019.2979}
#'
#' @author
#' Bethany J. Wolf \email{wolfb@@musc.edu} \cr
#' J. Madison Hyer \email{madison.hyer@@osumc.edu}
#'
#' @note
#' Development of Logic Forest was supported by NIH/NCATS UL1RR029882. Logic Survival Forest development was supported by NIH/NIA R01AG082873.
#'
#' @seealso \code{\link{pimp.import}}, \code{\link[LogicReg]{logreg.anneal.control}}
#'
#' @examples
#' \dontrun{
#' set.seed(10051988)
#' N_c <- 50
#' N_r <- 200
#' init <- as.data.frame(matrix(0, nrow = N_r, ncol = N_c))
#' colnames(init) <- paste0("X", 1:N_c)
#' for(n in 1:N_c){
#'   p <- runif(1, min = 0.2, max = 0.6)
#'   init[,n] <- rbinom(N_r, 1, p)
#' }
#'
#' X3X4int <- as.numeric(init$X3 == init$X4)
#' X5X6int <- as.numeric(init$X5 == init$X6)
#' y_p <- -2.5 + init$X1 + init$X2 + 2 * X3X4int + 2 * X5X6int
#' p <- 1 / (1 + exp(-y_p))
#' init$Y.bin <- rbinom(N_r, 1, p)
#'
#' # Classification
#' LF.fit.bin <- logforest("bin", init$Y.bin, NULL, init[,1:N_c], nBS=10, nleaves=8, numout=10)
#' print(LF.fit.bin)
#'
#' # Continuous
#' init$Y.cont <- rnorm(N_r, mean = 0) + init$X1 + init$X2 + 5 * X3X4int + 5 * X5X6int
#' LF.fit.lin <- logforest("lin", init$Y.cont, NULL, init[,1:N_c], nBS=10, nleaves=8, numout=10)
#' print(LF.fit.lin)
#'
#' # Time-to-event
#' shape <- 1 - 0.05*init$X1 - 0.05*init$X2 - 0.2*init$X3*init$X4 - 0.2*init$X5*init$X6
#' scale <- 1.5 - 0.05*init$X1 - 0.05*init$X2 - 0.2*init$X3*init$X4 - 0.2*init$X5*init$X6
#' init$TIME_Y <- rgamma(N_r, shape = shape, scale = scale)
#' LF.fit.surv <- logforest("exp_surv", init$Y.bin, init$TIME_Y, init[,1:N_c],
#'   nBS=10, nleaves=8, numout=10)
#' print(LF.fit.surv)
#' }
#'
#' @export

logforest<-function(resp.type, resp, resp.time = data.frame(X = rep(1, nrow(resp))), Xs, nBSXVars, anneal.params, nBS=100, h=0.5, norm=TRUE, numout=5, nleaves)
{
  pred<-ncol(Xs) #Determines the number of predictors
  if (missing(anneal.params)) {anneal.params<-logreg.anneal.control(start=2, end=-1, iter=100000)} #Sets default annealing parameters
  if (missing(nBSXVars)) {nBSXVars<-pred} #Sets default number of variables for randomly selecting predictors
  n <- nrow(Xs) #Determines the number of rows
  if (is.null(colnames(Xs))) {x.nms<-paste("X", 1:pred, sep="")} #Sets default predictor names if missing
  else {x.nms<-colnames(Xs)} #Sets predictor names if non missing
  fitlist <- vector("list", nBS) #Initiates vector of fits
  IBdata <- vector("list", nBS) #Initiates in-bag data list
  OOBdata <- vector("list", nBS) #Initiates out-of-bag data list
  OOBpred <- matrix(nrow=n, ncol=nBS) #Initiates out-of-bag predictor list
  single.predimport <- vector("list",nBS) #Initiates list of predictor importance
  vimp.import <- vector("list", nBS) #Initiates list of interaction importance
  treepreds.list <- vector("list", nBS) #Initiates list of trees
  IPImatrix<-matrix(NA, nrow=0, ncol=(1+pred*2))
  colnames(IPImatrix)<-c("treenum", x.nms, paste("!",x.nms, sep=""))
  if (resp.type == "bin")  {mtype="Classification"}
  if (resp.type == "lin")  {mtype="Linear Regression"}
  if (resp.type == "exp_surv")  {mtype="Exp. Time-to-Event"}
  if (resp.type == "cph_surv")  {mtype="Cox-PH Time-to-Event"}
  if (mtype=="Classification") {mtyp=1}
  if (mtype=="Linear Regression") {mtyp=2}
  if (mtype=="Cox-PH Time-to-Event") {mtyp=3}
  if (mtype=="Exp. Time-to-Event") {mtyp=4}
  for(b in 1:nBS)
  {
    if(missing(nleaves)) {nleaves<-sample(2:8, 1, replace=FALSE)}
    BSindices <- sample(1:n, n, replace=TRUE)
    OOBindices <- (1:n)[!is.element(1:n, BSindices)]
    BS <- Xs[BSindices, ] #Selects the bootstrap sample corresponding to the chosen indices
    OOB <- Xs[OOBindices, ]  #Selects the out-of-bag sample
    BSY <- as.matrix(resp)[BSindices]  #Response variable for bootstrap sample
    OOBY <- as.matrix(resp)[OOBindices]  #Response variable for out-of-bag sample
    if(mtype == "Exp. Time-to-Event" | mtype == "Cox-PH Time-to-Event"){
      BST <- resp.time[BSindices]  #Response variable for bootstrap sample
      OOBT <- resp.time[OOBindices]  #Response variable for out-of-bag sample
    }
    XVarIndices <- sort(sample(1:pred, nBSXVars, replace=FALSE))
    rsBSX <- BS[ ,XVarIndices]  #Bootstrap with selected X-vars
    rsOOBX <- OOB[,XVarIndices]  #Out-of-bag with selected X-vars
    if (mtype=="Classification")
    {
      FinalBS<-cbind(rsBSX, BSY)   #Final Bootstrap with Y
      FinalOOB<-cbind(rsOOBX, OOBY)   #Final out-of-bag with Y
      colnames(FinalBS)<-colnames(FinalOOB)<-c(x.nms[XVarIndices], "Y")
      fit <- logreg(resp = BSY, bin = FinalBS[,1:nBSXVars],
                    type = 1, select = 1, ntrees = 1, nleaves = nleaves, anneal.control = anneal.params)
    }
    if (mtype=="Linear Regression")
    {
      FinalBS<-cbind(rsBSX, as.matrix(BSY))   #Final Bootstrap with Y
      FinalOOB<-cbind(rsOOBX, as.matrix(OOBY))   #Final out-of-bag with Y
      colnames(FinalBS)<-colnames(FinalOOB)<-c(x.nms[XVarIndices], "Y")
      fit <- logreg(resp = BSY, bin = FinalBS[,1:nBSXVars],
                    type = 2, select = 1, ntrees = 1, nleaves = nleaves, anneal.control = anneal.params)
    }
    if (mtype=="Cox-PH Time-to-Event")
    {
      FinalBS<-cbind(rsBSX, BSY, BST)   #Final Bootstrap with Y and Time
      FinalOOB<-cbind(rsOOBX, OOBY, OOBT)   #Final out-of-bag with Y and Time
      colnames(FinalBS)<-colnames(FinalOOB)<-c(x.nms[XVarIndices], "Y")
      fit <- logreg(resp = BST, cens = BSY, bin = FinalBS[,1:nBSXVars],
                    type = 4, select = 1, ntrees = 1, nleaves = nleaves, anneal.control = anneal.params)
    }
    if (mtype=="Exp. Time-to-Event")
    {
      FinalBS<-cbind(rsBSX, BSY, BST)   #Final Bootstrap with Y and Time
      FinalOOB<-cbind(rsOOBX, OOBY, OOBT)   #Final out-of-bag with Y and Time
      colnames(FinalBS)<-colnames(FinalOOB)<-c(x.nms[XVarIndices], "Y")
      fit <- logreg(resp = BST, cens = BSY, bin = FinalBS[,1:nBSXVars],
                    type = 5, select = 1, ntrees = 1, nleaves = nleaves, anneal.control = anneal.params)
    }

    if(mtyp %in% 3:4)
    {
      #OOBpred[as.vector(OOBindices),b]<-predict.logreg2(fit, newbin=as.matrix(FinalOOB[,1:nBSXVars]), newcens = OOBY)
      OOBi <- frame.logreg2(fit = fit, newbin = as.matrix(FinalOOB[,1:nBSXVars]), newcens = OOBY)
      OOBpred[as.vector(OOBindices),b] <- OOBi[, 3]
    } else {
      OOBpred[as.vector(OOBindices),b] <- predict.logreg2(fit, newbin=as.matrix(FinalOOB[,1:nBSXVars]))
    }

    if (sum(fit$model$tree[[1]]$trees[,3])!=0)
    {
      pred.import<-pimp.import(fit=fit, data=FinalBS, testdata=FinalOOB, BSpred=length(XVarIndices),
                               pred=pred, Xs=XVarIndices, mtype=mtype)

      vimp.import[[b]]<-pred.import$pimp.vimp
      single.predimport[[b]]<-pred.import$single.vimp
      treepreds.list[[b]]<-pred.import$Xids
      ipimat<-cbind(rep(b, nrow(pred.import$Ipimat)), pred.import$Ipimat)
      IPImatrix<-rbind(IPImatrix, ipimat) #data frame with row = one line per interaction and col = var and !var
    }
    else
    {
      single.predimport[[b]]= 0
      vimp.import[[b]]= 0
      treepreds.list[[b]]=0
      IPImatrix<-rbind(IPImatrix, c(b, rep(0, pred*2)))
    }
    fitlist[[b]]<-fit   #list of all models
    IBdata[[b]]<-BSindices #data frame of one vector per BS sample, listing each row in the IB sample
    OOBdata[[b]]<-OOBindices #data frame of one vector per BS sample, listing each row in the OOB sample
  }
  ###CLOSE  THE BOOTSTAP LOOP

  if (mtype=="Classification")
  {
    OOB.pred<-matrix(0, nrow=n, ncol=2)
    for (i in 1:n)
    {
      pred.ids<-which(OOBpred[i,]==1|OOBpred[i,]==0)
      pred.vec<-OOBpred[i,c(pred.ids)]
      OOBprop<-sum(pred.vec)/length(pred.vec)
      OOBi.pred<-ifelse(OOBprop>h, 1, 0)
      OOB.pred[i,]<-c(OOBi.pred, OOBprop)
    }
    colnames(OOB.pred)<-c("predicted_class","proportion")
    OOBmiss<-sum(abs(OOB.pred[,1]-resp))/n
  }
  if (mtype=="Linear Regression")
  {
    OOB.pred<-matrix(0, nrow=n, ncol=2)
    for (i in 1:n)
    {
      pred.ids<-which(is.na(OOBpred[i,])==F)
      pred.vec<-OOBpred[i,c(pred.ids)]
      OOBval<-sum(pred.vec)/length(pred.vec)
      OOB.pred[i,]<-c(i, OOBval)
    }
    colnames(OOB.pred)<-c("sample id","mean prediction")
    OOBmiss<-sum((OOB.pred[,2]-resp)^2)/n
  }
  if (mtype=="Cox-PH Time-to-Event")
  {
    OOB.pred<-matrix(0, nrow=n, ncol=2)
    for (i in 1:n)
    {
      pred.ids<-which(is.na(OOBpred[i,])==F)
      pred.vec<-OOBpred[i,c(pred.ids)]
      OOBval<-sum(pred.vec)/length(pred.vec)
      OOB.pred[i,]<-c(i, OOBval)
    }
    colnames(OOB.pred)<-c("sample id","mean prediction")
    OOBmiss<-sum((OOB.pred[,2]-resp)^2)/n
  }
  if (mtype=="Exp. Time-to-Event")
  {
    OOB.pred<-matrix(0, nrow=n, ncol=2)
    for (i in 1:n)
    {
      pred.ids<-which(is.na(OOBpred[i,])==F)
      pred.vec<-OOBpred[i,c(pred.ids)]
      OOBval<-sum(pred.vec)/length(pred.vec)
      OOB.pred[i,]<-c(i, OOBval)
    }
    colnames(OOB.pred) <- c("sample id","mean prediction")
    OOBmiss <- sum((OOB.pred[,2]-resp)^2)/n
  }
  pred.importmat<-matrix(0, nrow=nBS, ncol=pred)
  colnames(pred.importmat)<-x.nms
  for (i in 1:nBS)
  {
    pred.ind<-treepreds.list[[i]]
    m<-length(pred.ind)
    for (j in 1:m)
    {
      col<-pred.ind[j]
      pred.importmat[i,col]<-single.predimport[[i]][j]
    }
  }
  pred.imp<-colSums(pred.importmat)
  names(pred.imp)<-x.nms
  freq.table<-table(names(unlist(single.predimport)))
  all.pimps<-unique(names(unlist(vimp.import)))
  npimps<-length(unique(names(unlist(vimp.import))))
  pimp.freqtable<-table(names(unlist(vimp.import)))
  pimptable<-matrix(0, nrow=nBS, ncol=npimps)
  colnames(pimptable)<-all.pimps
  for (i in 1:nBS)
  {
    npimps.tree<-length(vimp.import[[i]])
    for (j in 1:npimps.tree)
    {
      cpimp<-vimp.import[[i]][j]
      col.id<-which(colnames(pimptable)%in%names(cpimp))
      pimptable[i,col.id]<-cpimp
    }
  }
  pimpsum<-colSums(pimptable)  #importance of each interaction
  t5PIs<-names(sort(pimpsum, decreasing=TRUE)[1:5])

  ans<-list(AllFits=fitlist, Top5.PI=t5PIs, Predictor.importance=pred.imp, Predictor.frequency=freq.table,
            PI.frequency=pimp.freqtable, PI.importance=pimpsum,  ModelPI.import=vimp.import,
            IPImatrix=IPImatrix, OOBmiss=OOBmiss, OOBprediction=OOB.pred, IBdata=IBdata, OOBdata=OOBdata, norm=norm,
            numout=numout, predictors=pred, Xs=Xs, model.type=mtype)
  class(ans)<-"logforest"
  ans
}