#' @title Approximate Best Subset Maximum Binary Prediction Rule (PRESCIENCE)
#'
#' @description
#' \code{select} employs the Approximate Best Subset Maximum Binary Prediction Rule
#' (PRESCIENCE) on the given variable selection problem.
#'
#' @param formula an object of class formula of the format: (binary dependent variable) ~ (normalized focus variable) + (remaining focus variables) + (auxiliary variables).
#' @param data a data frame containing the variables in the model.
#' @param nfoc integer. The number of focus variable(s) excluding the intercept.
#' @param q integer. The cardinality constraint for the covariate selection.
#' @param bound numeric. The maximum absolute value of the bounds for all variables.
#' @param beta0 integer. The coefficient taking value either 1 or -1 to normalize the scale for the first focus variable.
#' @param tol numeric. Tolerance level. If \code{NULL}, use the default tolerance level.
#' @param warmstart logical. If \code{TRUE}, use the warm start strategy.
#' @param tau the tuning parameter for enlarging the estimated bounds.
#' @param mio integer. 1 for MIO method 1 and 2 for method 2 in the paper.
#' @param tlim time limit (in seconds) specified for the MIO solver.
#' @import stats
#' @import slam
#' @import gurobi
#' @export
#' @return a list with 7 elements:
#' \item{tolerance}{tolerance level}
#' \item{status}{optimization status}
#' \item{score}{Gurobi score}
#' \item{gap}{the MIO optimization gap value in case of early termination (0 if optimal solution is found within the time limit)}
#' \item{rtime}{time used by the MIO solver in the estimation procedure}
#' \item{ncount}{the number of Branch-and-bound nodes}
#' \item{bhat}{maximum score estimates for coefficients}
#' @author Yankang (Bennie) Chen <yankang.chen@@columbia.edu>
#' @references \emph{Best Subset Binary Prediction} by Le-Yu Chen and Sokbae Lee (2018).
#' \url{https://arxiv.org/abs/1610.02738}
#' @examples
#' results <- select(auto ~ dcost + cars + dovtt + divtt,
#' data = transportation, nfoc = 1, q = 1, bound = 10)
#'
#' summary(results)
#' coef(results)
select <- function(formula, data, nfoc, q, bound, beta0 = 1, tol = NULL, warmstart = TRUE, tau = 1.5, mio = 1, tlim = 86400){
# Error checking
if (missing(formula) || class(formula) != "formula")
stop("'formula' is missing or incorrect.")
requireNamespace("gurobi")
gc()
results <- list()
df <- model.frame(formula, data)
varnames <- colnames(df)
varnames <- varnames[2:length(varnames)]
varnames <- c(varnames[1:nfoc], '(Intercept)', varnames[(nfoc+1):length(varnames)])
Y_tr <- as.matrix(df[1])
temp <- df[2:ncol(df)]
n_tr <- nrow(Y_tr)
x_std <- scale(temp)
x_foc <- cbind(as.matrix(x_std[,1:nfoc]), matrix(rep(1, n_tr), ncol = 1)) # [focus variables, intercept]
x_aux <- x_std[,(nfoc+1):ncol(x_std)] # [auxiliary variables]
remove(df, temp, x_std)
gc()
d <- ncol(x_aux)
bnd <- cbind(matrix(rep(-bound, nfoc+d), ncol = 1), matrix(rep(bound, nfoc+d), ncol = 1)) # set the initial parameter bounds
if (is.null(tol)) {
tol <- floor(sqrt(log(n_tr) * n_tr) / 2) # set the tolerance level value
results$tolerance <- tol/n_tr
} else {
results$tolerance <- tol
tol <- tol * n_tr
}
# Results
if (warmstart) { # warm start MIO
max_score <- warm_start_max_score(Y_tr, x_foc, x_aux, beta0, q, tlim, tol, bnd, mio, tau)
results$status <- max_score$status
bhat <- c(1, max_score$bhat)
results$score <- max_score$score/n_tr
results$gap <- max_score$gap/n_tr
results$rtime <- max_score$rtime
results$ncount <- max_score$ncount
} else { # cold start MIO
max_score <- max_score_constr_fn(Y_tr, x_foc, x_aux, beta0, q, tlim, tol, bnd, mio)
results$status <- max_score$status
bhat <- c(1, max_score$bhat)
results$score <- max_score$score/n_tr
results$gap <- max_score$gap/n_tr
results$rtime <- max_score$rtime
results$ncount <- max_score$ncount
}
bhat <- as.data.frame(bhat)
rownames(bhat) <- varnames
colnames(bhat) <- "Estimate"
results$bhat <- bhat
class(results) <- c("select", "list")
return(results)
}
miobnd_fn <- function(x, beta0, bnd){
n <- nrow(x)
k <- ncol(x) - 1
# Build model
model <- list()
model$modelsense <- "max"
model$sense <- ">"
model$lb <- bnd[,1]
model$ub <- bnd[,2]
# Set parameters
params <- list()
tol <- 1e-6
params$outputflag <- 0
params$OptimalityTol <- tol
params$FeasibilityTol <- tol
params$IntFeasTol <- tol
v <- rep(0, 2)
value <- rep(0, n)
for (i in 1:n) {
alpha <- beta0 * x[i,1]
model$obj <- x[i,2:(k+1)]
model$objcon <- alpha
model$A <- as.simple_triplet_matrix(matrix(x[i,2:(k+1)], nrow = 1))
model$rhs <- -alpha
try(result <- gurobi(model, params))
try(v[1] <- result$objval, silent = TRUE)
model$obj <- -x[i,2:(k+1)]
model$objcon <- -alpha
model$A <- as.simple_triplet_matrix(matrix(-x[i,2:(k+1)], nrow = 1))
model$rhs <- alpha
try(result <- gurobi(model, params))
try(v[2] <- result$objval, silent = TRUE)
value[i] <- max(v)
}
return(value)
}
get_bnd <- function(y, x, beta0, bnd){
k <- ncol(x) - 1
n <- k+1
logit <- matrix(coefficients(glm.fit(x, y, family = binomial(link = "logit"), intercept = FALSE)), ncol = 1)
p_hat <- 1/(1+exp(-x %*% logit))
constr <- matrix(rep(p_hat-0.5, n), ncol = n) * x
bound <- bnd
remove(logit, p_hat)
gc()
# Build model
model <- list()
model$sense <- ">"
model$A <- as.simple_triplet_matrix(constr[,2:n])
model$rhs <- -constr[,1]*beta0
model$lb = bound[,1]
model$ub = bound[,2]
# Set parameters
params <- list()
tol <- 1e-6
params$outputflag <- 0
params$OptimalityTol <- tol
params$FeasibilityTol <- tol
params$IntFeasTol <- tol
for (i in 1:k) {
objcoef <- rep(0, k)
objcoef[i] <- 1
model$obj <- objcoef
model$modelsense <- "min"
try(result <- gurobi(model, params))
try(bound[i,1] <- result$objval, silent = TRUE)
model$modelsense <- "max"
model$lb <- bound[,1]
try(result <- gurobi(model, params))
try(bound[i,2] <- result$objval, silent = TRUE)
model$ub = bound[,2]
}
return(bound)
}
max_score_constr_fn <- function(y, x_foc, x_aux, beta0, q, tlim, abgap, bnd, mio){
N <- nrow(y)
k <- ncol(x_foc) - 1
d <- ncol(x_aux)
bhat <- matrix(rep(0, k+d), ncol = 1) # (k+d) vector
gap <- 0
rtime <- 0
miobnd <- miobnd_fn(cbind(x_foc, x_aux), beta0, bnd) # (N) vector
# Build model
model <- list()
model$sense <- "<"
model$modelsense <- "max"
model$lb <- c(rep(0, N), bnd[,1], rep(0, d)) # (N+k+2*d) vector
model$ub <- c(rep(1, N), bnd[,2], rep(1, d)) # (N+k+2*d) vector
model$vtype <- c(rep("B", N), rep("C", k+d), rep("B", d)) # (N+k+2*d) vector
# Set parameters
params <- list()
tol <- 1e-6
params$outputflag <- 0
params$OptimalityTol <- tol
params$FeasibilityTol <- tol
params$IntFeasTol <- tol
if (tlim > 0) params$TimeLimit <- tlim
if (abgap > 0) params$MIPGapAbs <- abgap
ztemp1 <- matrix(rep(0, N*d), ncol = d) # N*d matrix
ztemp2 <- matrix(rep(0, N*(2*d+1)), ncol = N) # (2*d+1)*N matrix
htemp <- rbind(diag(d), -diag(d), matrix(rep(0, d), ncol = d)) # (2*d+1)*d matrix
etemp <- rbind(-diag(bnd[(k+1):(k+d), 2]), diag(bnd[(k+1):(k+d), 1]), matrix(rep(1, d), ncol = d)) # (2*d+1)*d matrix
mtemp1 <- cbind(ztemp2, matrix(rep(0, k*(2*d+1)), ncol = k), htemp, etemp) # (2*d+1)*(N+k+2*d) matrix
mtemp2 <- c(rep(0, 2*d), q) # (2*d+1) vector
if (mio == 1) {
model$obj <- c(2*y-1, rep(0, k+2*d)) # Method 1 formulation
model$objcon <- sum(1-y)
miobnd_bar <- miobnd+tol # (N) vector
mtemp3 <- cbind(diag(-miobnd_bar), x_foc[,2:(k+1)], x_aux, ztemp1) # N*(N+k+2*d) matrix
mtemp4 <- cbind(diag(miobnd), -x_foc[,2:(k+1)], -x_aux, ztemp1) # N*(N+k+2*d) matrix
model$A <- as.simple_triplet_matrix(rbind(mtemp4, mtemp3, mtemp1)) # (2*N+2*d+1)*(N+k+2*d) matrix
model$rhs <- c(miobnd*(1-tol)+beta0*x_foc[,1], -tol*miobnd_bar-beta0*x_foc[,1], mtemp2) # (2*N+2*d+1) vector
} else {
model$obj <- c(rep(1, N), rep(0, k+2*d)) # Method 2 formulation
temp2 <- 1-2*y # (N) vector
temp3 <- matrix(rep(temp2, k), ncol = k) # N*k matrix
temp4 <- matrix(rep(temp2, d), ncol = d) # N*d matrix
model$A <- as.simple_triplet_matrix(rbind(cbind(diag(miobnd), temp3*x_foc[,2:(k+1)], temp4*x_aux, ztemp1), mtemp1))
# (N+2*d+1)*(N+k+2*d) sparse matrix
model$rhs <- c(miobnd*(1-tol) - beta0*temp2*x_foc[,1], mtemp2) # (N+2*d+1) vector
}
remove(ztemp1, ztemp2, htemp, etemp, mtemp1, mtemp2)
gc()
results <- list()
try(result <- gurobi(model, params))
try(results$status <- result$status, silent = TRUE)
try(results$bhat <- result$x[(N+1):(N+k+d)], silent = TRUE)
try(results$score <- result$objval, silent = TRUE)
try(results$gap <- result$objbound - results$score, silent = TRUE)
try(results$rtime <- result$runtime, silent = TRUE)
try(results$ncount <- result$nodecount, silent = TRUE)
return(results)
}
warm_start_max_score <- function(y, x_foc, x_aux, beta0, q, tlim, tol, bnd, mio, tau){
bnd_h <- get_bnd(y, cbind(x_foc, x_aux), beta0, bnd) # (k-1+d)*2 matrix
bnd_abs <- tau * apply(abs(bnd_h), 1, max) # (k-1+d) vector
bnd0 <- cbind(apply(cbind(-bnd_abs, bnd[,1]), 1, max), apply(cbind(bnd_abs, bnd[,2]), 1, min)) # (k-1+d)*2 matrix
# this is the refind bound used for warm start MIO
max_score <- max_score_constr_fn(y, x_foc, x_aux, beta0, q, tlim, tol, bnd0, mio)
results <- list()
results$status <- max_score$status
results$bhat <- max_score$bhat
results$score <- max_score$score
results$gap <- max_score$gap
results$rtime <- max_score$rtime
results$ncount <- max_score$ncount
return(results)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.