Nothing
#' Logic Forest & Logic Survival Forest
#'
#' Constructs an ensemble of logic regression models using bagging for classification or regression, and identifies important predictors and interactions. Logic Forest (LF) efficiently searches the space of logical combinations of binary variables using simulated annealing. It has been extended to support linear and survival regression.
#' @importFrom LogicReg eval.logreg logreg logreg.anneal.control
#' @importFrom methods is
#'
#' @param resp.type String indicating regression type: \code{"bin"} for classification, \code{"lin"} for linear regression, \code{"exp_surv"} for exponential time-to-event, and \code{"cph_surv"} for Cox proportional hazards.
#' @param resp Numeric vector of response values (binary for classification/survival, continuous for linear regression). For time-to-event, indicates event/censoring status.
#' @param resp.time Numeric vector of event/censoring times (used only for survival models).
#' @param Xs Matrix or data frame of binary predictor variables (0/1 only).
#' @param nBSXVars Integer. Number of predictors sampled for each tree (default is all predictors).
#' @param anneal.params A list of parameters for simulated annealing (see \code{\link[LogicReg]{logreg.anneal.control}}). Defaults: \code{start = 1}, \code{end = -2}, \code{iter = 50000}.
#' @param nBS Number of trees to fit in the logic forest.
#' @param h Numeric. Minimum proportion of trees predicting "1" required to classify an observation as "1" (used for classification).
#' @param norm Logical. If \code{FALSE}, importance scores are not normalized.
#' @param numout Integer. Number of predictors and interactions to report.
#' @param nleaves Integer. Maximum number of leaves (end nodes) allowed per tree.
#'
#' @details
#' Logic Forest is designed to identify interactions between binary predictors without requiring their pre-specification. Using simulated annealing, it searches the space of all possible logical combinations (e.g., AND, OR, NOT) among predictors. Originally developed for binary outcomes in gene-environment interaction studies, it has since been extended to linear and time-to-event outcomes (Logic Survival Forest).
#'
#' @return A \code{logforest} object containing:
#' \describe{
#' \item{Predictor.frequency}{Frequency of each predictor across trees.}
#' \item{Predictor.importance}{Importance of each predictor.}
#' \item{PI.frequency}{Frequency of each interaction across trees.}
#' \item{PI.importance}{Importance of each interaction.}
#' }
#'
#' @references
#' Wolf BJ, Hill EG, Slate EH. (2010). Logic Forest: An ensemble classifier for discovering logical combinations of binary markers. \emph{Bioinformatics}, 26(17):2183–2189. \doi{10.1093/bioinformatics/btq354} \cr
#' Wolf BJ et al. (2012). LBoost: A boosting algorithm with application for epistasis discovery. \emph{PLoS One}, 7(11):e47281. \doi{10.1371/journal.pone.0047281} \cr
#' Hyer JM et al. (2019). Novel Machine Learning Approach to Identify Preoperative Risk Factors Associated With Super-Utilization of Medicare Expenditure Following Surgery. \emph{JAMA Surg}, 154(11):1014–1021. \doi{10.1001/jamasurg.2019.2979}
#'
#' @author
#' Bethany J. Wolf \email{wolfb@@musc.edu} \cr
#' J. Madison Hyer \email{madison.hyer@@osumc.edu}
#'
#' @note
#' Development of Logic Forest was supported by NIH/NCATS UL1RR029882. Logic Survival Forest development was supported by NIH/NIA R01AG082873.
#'
#' @seealso \code{\link{pimp.import}}, \code{\link[LogicReg]{logreg.anneal.control}}
#'
#' @examples
#' \dontrun{
#' set.seed(10051988)
#' N_c <- 50
#' N_r <- 200
#' init <- as.data.frame(matrix(0, nrow = N_r, ncol = N_c))
#' colnames(init) <- paste0("X", 1:N_c)
#' for(n in 1:N_c){
#' p <- runif(1, min = 0.2, max = 0.6)
#' init[,n] <- rbinom(N_r, 1, p)
#' }
#'
#' X3X4int <- as.numeric(init$X3 == init$X4)
#' X5X6int <- as.numeric(init$X5 == init$X6)
#' y_p <- -2.5 + init$X1 + init$X2 + 2 * X3X4int + 2 * X5X6int
#' p <- 1 / (1 + exp(-y_p))
#' init$Y.bin <- rbinom(N_r, 1, p)
#'
#' # Classification
#' LF.fit.bin <- logforest("bin", init$Y.bin, NULL, init[,1:N_c], nBS=10, nleaves=8, numout=10)
#' print(LF.fit.bin)
#'
#' # Continuous
#' init$Y.cont <- rnorm(N_r, mean = 0) + init$X1 + init$X2 + 5 * X3X4int + 5 * X5X6int
#' LF.fit.lin <- logforest("lin", init$Y.cont, NULL, init[,1:N_c], nBS=10, nleaves=8, numout=10)
#' print(LF.fit.lin)
#'
#' # Time-to-event
#' shape <- 1 - 0.05*init$X1 - 0.05*init$X2 - 0.2*init$X3*init$X4 - 0.2*init$X5*init$X6
#' scale <- 1.5 - 0.05*init$X1 - 0.05*init$X2 - 0.2*init$X3*init$X4 - 0.2*init$X5*init$X6
#' init$TIME_Y <- rgamma(N_r, shape = shape, scale = scale)
#' LF.fit.surv <- logforest("exp_surv", init$Y.bin, init$TIME_Y, init[,1:N_c],
#' nBS=10, nleaves=8, numout=10)
#' print(LF.fit.surv)
#' }
#'
#' @export
logforest<-function(resp.type, resp, resp.time = data.frame(X = rep(1, nrow(resp))), Xs, nBSXVars, anneal.params, nBS=100, h=0.5, norm=TRUE, numout=5, nleaves)
{
pred<-ncol(Xs) #Determines the number of predictors
if (missing(anneal.params)) {anneal.params<-logreg.anneal.control(start=2, end=-1, iter=100000)} #Sets default annealing parameters
if (missing(nBSXVars)) {nBSXVars<-pred} #Sets default number of variables for randomly selecting predictors
n <- nrow(Xs) #Determines the number of rows
if (is.null(colnames(Xs))) {x.nms<-paste("X", 1:pred, sep="")} #Sets default predictor names if missing
else {x.nms<-colnames(Xs)} #Sets predictor names if non missing
fitlist <- vector("list", nBS) #Initiates vector of fits
IBdata <- vector("list", nBS) #Initiates in-bag data list
OOBdata <- vector("list", nBS) #Initiates out-of-bag data list
OOBpred <- matrix(nrow=n, ncol=nBS) #Initiates out-of-bag predictor list
single.predimport <- vector("list",nBS) #Initiates list of predictor importance
vimp.import <- vector("list", nBS) #Initiates list of interaction importance
treepreds.list <- vector("list", nBS) #Initiates list of trees
IPImatrix<-matrix(NA, nrow=0, ncol=(1+pred*2))
colnames(IPImatrix)<-c("treenum", x.nms, paste("!",x.nms, sep=""))
if (resp.type == "bin") {mtype="Classification"}
if (resp.type == "lin") {mtype="Linear Regression"}
if (resp.type == "exp_surv") {mtype="Exp. Time-to-Event"}
if (resp.type == "cph_surv") {mtype="Cox-PH Time-to-Event"}
if (mtype=="Classification") {mtyp=1}
if (mtype=="Linear Regression") {mtyp=2}
if (mtype=="Cox-PH Time-to-Event") {mtyp=3}
if (mtype=="Exp. Time-to-Event") {mtyp=4}
for(b in 1:nBS)
{
if(missing(nleaves)) {nleaves<-sample(2:8, 1, replace=FALSE)}
BSindices <- sample(1:n, n, replace=TRUE)
OOBindices <- (1:n)[!is.element(1:n, BSindices)]
BS <- Xs[BSindices, ] #Selects the bootstrap sample corresponding to the chosen indices
OOB <- Xs[OOBindices, ] #Selects the out-of-bag sample
BSY <- as.matrix(resp)[BSindices] #Response variable for bootstrap sample
OOBY <- as.matrix(resp)[OOBindices] #Response variable for out-of-bag sample
if(mtype == "Exp. Time-to-Event" | mtype == "Cox-PH Time-to-Event"){
BST <- resp.time[BSindices] #Response variable for bootstrap sample
OOBT <- resp.time[OOBindices] #Response variable for out-of-bag sample
}
XVarIndices <- sort(sample(1:pred, nBSXVars, replace=FALSE))
rsBSX <- BS[ ,XVarIndices] #Bootstrap with selected X-vars
rsOOBX <- OOB[,XVarIndices] #Out-of-bag with selected X-vars
if (mtype=="Classification")
{
FinalBS<-cbind(rsBSX, BSY) #Final Bootstrap with Y
FinalOOB<-cbind(rsOOBX, OOBY) #Final out-of-bag with Y
colnames(FinalBS)<-colnames(FinalOOB)<-c(x.nms[XVarIndices], "Y")
fit <- logreg(resp = BSY, bin = FinalBS[,1:nBSXVars],
type = 1, select = 1, ntrees = 1, nleaves = nleaves, anneal.control = anneal.params)
}
if (mtype=="Linear Regression")
{
FinalBS<-cbind(rsBSX, as.matrix(BSY)) #Final Bootstrap with Y
FinalOOB<-cbind(rsOOBX, as.matrix(OOBY)) #Final out-of-bag with Y
colnames(FinalBS)<-colnames(FinalOOB)<-c(x.nms[XVarIndices], "Y")
fit <- logreg(resp = BSY, bin = FinalBS[,1:nBSXVars],
type = 2, select = 1, ntrees = 1, nleaves = nleaves, anneal.control = anneal.params)
}
if (mtype=="Cox-PH Time-to-Event")
{
FinalBS<-cbind(rsBSX, BSY, BST) #Final Bootstrap with Y and Time
FinalOOB<-cbind(rsOOBX, OOBY, OOBT) #Final out-of-bag with Y and Time
colnames(FinalBS)<-colnames(FinalOOB)<-c(x.nms[XVarIndices], "Y")
fit <- logreg(resp = BST, cens = BSY, bin = FinalBS[,1:nBSXVars],
type = 4, select = 1, ntrees = 1, nleaves = nleaves, anneal.control = anneal.params)
}
if (mtype=="Exp. Time-to-Event")
{
FinalBS<-cbind(rsBSX, BSY, BST) #Final Bootstrap with Y and Time
FinalOOB<-cbind(rsOOBX, OOBY, OOBT) #Final out-of-bag with Y and Time
colnames(FinalBS)<-colnames(FinalOOB)<-c(x.nms[XVarIndices], "Y")
fit <- logreg(resp = BST, cens = BSY, bin = FinalBS[,1:nBSXVars],
type = 5, select = 1, ntrees = 1, nleaves = nleaves, anneal.control = anneal.params)
}
if(mtyp %in% 3:4)
{
#OOBpred[as.vector(OOBindices),b]<-predict.logreg2(fit, newbin=as.matrix(FinalOOB[,1:nBSXVars]), newcens = OOBY)
OOBi <- frame.logreg2(fit = fit, newbin = as.matrix(FinalOOB[,1:nBSXVars]), newcens = OOBY)
OOBpred[as.vector(OOBindices),b] <- OOBi[, 3]
} else {
OOBpred[as.vector(OOBindices),b] <- predict.logreg2(fit, newbin=as.matrix(FinalOOB[,1:nBSXVars]))
}
if (sum(fit$model$tree[[1]]$trees[,3])!=0)
{
pred.import<-pimp.import(fit=fit, data=FinalBS, testdata=FinalOOB, BSpred=length(XVarIndices),
pred=pred, Xs=XVarIndices, mtype=mtype)
vimp.import[[b]]<-pred.import$pimp.vimp
single.predimport[[b]]<-pred.import$single.vimp
treepreds.list[[b]]<-pred.import$Xids
ipimat<-cbind(rep(b, nrow(pred.import$Ipimat)), pred.import$Ipimat)
IPImatrix<-rbind(IPImatrix, ipimat) #data frame with row = one line per interaction and col = var and !var
}
else
{
single.predimport[[b]]= 0
vimp.import[[b]]= 0
treepreds.list[[b]]=0
IPImatrix<-rbind(IPImatrix, c(b, rep(0, pred*2)))
}
fitlist[[b]]<-fit #list of all models
IBdata[[b]]<-BSindices #data frame of one vector per BS sample, listing each row in the IB sample
OOBdata[[b]]<-OOBindices #data frame of one vector per BS sample, listing each row in the OOB sample
}
###CLOSE THE BOOTSTAP LOOP
if (mtype=="Classification")
{
OOB.pred<-matrix(0, nrow=n, ncol=2)
for (i in 1:n)
{
pred.ids<-which(OOBpred[i,]==1|OOBpred[i,]==0)
pred.vec<-OOBpred[i,c(pred.ids)]
OOBprop<-sum(pred.vec)/length(pred.vec)
OOBi.pred<-ifelse(OOBprop>h, 1, 0)
OOB.pred[i,]<-c(OOBi.pred, OOBprop)
}
colnames(OOB.pred)<-c("predicted_class","proportion")
OOBmiss<-sum(abs(OOB.pred[,1]-resp))/n
}
if (mtype=="Linear Regression")
{
OOB.pred<-matrix(0, nrow=n, ncol=2)
for (i in 1:n)
{
pred.ids<-which(is.na(OOBpred[i,])==F)
pred.vec<-OOBpred[i,c(pred.ids)]
OOBval<-sum(pred.vec)/length(pred.vec)
OOB.pred[i,]<-c(i, OOBval)
}
colnames(OOB.pred)<-c("sample id","mean prediction")
OOBmiss<-sum((OOB.pred[,2]-resp)^2)/n
}
if (mtype=="Cox-PH Time-to-Event")
{
OOB.pred<-matrix(0, nrow=n, ncol=2)
for (i in 1:n)
{
pred.ids<-which(is.na(OOBpred[i,])==F)
pred.vec<-OOBpred[i,c(pred.ids)]
OOBval<-sum(pred.vec)/length(pred.vec)
OOB.pred[i,]<-c(i, OOBval)
}
colnames(OOB.pred)<-c("sample id","mean prediction")
OOBmiss<-sum((OOB.pred[,2]-resp)^2)/n
}
if (mtype=="Exp. Time-to-Event")
{
OOB.pred<-matrix(0, nrow=n, ncol=2)
for (i in 1:n)
{
pred.ids<-which(is.na(OOBpred[i,])==F)
pred.vec<-OOBpred[i,c(pred.ids)]
OOBval<-sum(pred.vec)/length(pred.vec)
OOB.pred[i,]<-c(i, OOBval)
}
colnames(OOB.pred) <- c("sample id","mean prediction")
OOBmiss <- sum((OOB.pred[,2]-resp)^2)/n
}
pred.importmat<-matrix(0, nrow=nBS, ncol=pred)
colnames(pred.importmat)<-x.nms
for (i in 1:nBS)
{
pred.ind<-treepreds.list[[i]]
m<-length(pred.ind)
for (j in 1:m)
{
col<-pred.ind[j]
pred.importmat[i,col]<-single.predimport[[i]][j]
}
}
pred.imp<-colSums(pred.importmat)
names(pred.imp)<-x.nms
freq.table<-table(names(unlist(single.predimport)))
all.pimps<-unique(names(unlist(vimp.import)))
npimps<-length(unique(names(unlist(vimp.import))))
pimp.freqtable<-table(names(unlist(vimp.import)))
pimptable<-matrix(0, nrow=nBS, ncol=npimps)
colnames(pimptable)<-all.pimps
for (i in 1:nBS)
{
npimps.tree<-length(vimp.import[[i]])
for (j in 1:npimps.tree)
{
cpimp<-vimp.import[[i]][j]
col.id<-which(colnames(pimptable)%in%names(cpimp))
pimptable[i,col.id]<-cpimp
}
}
pimpsum<-colSums(pimptable) #importance of each interaction
t5PIs<-names(sort(pimpsum, decreasing=TRUE)[1:5])
ans<-list(AllFits=fitlist, Top5.PI=t5PIs, Predictor.importance=pred.imp, Predictor.frequency=freq.table,
PI.frequency=pimp.freqtable, PI.importance=pimpsum, ModelPI.import=vimp.import,
IPImatrix=IPImatrix, OOBmiss=OOBmiss, OOBprediction=OOB.pred, IBdata=IBdata, OOBdata=OOBdata, norm=norm,
numout=numout, predictors=pred, Xs=Xs, model.type=mtype)
class(ans)<-"logforest"
ans
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.