library("SubBoost")
library("leaps")
library("glmnet")
library("MASS")
library("mvnfast")
library("mboost")
library("stabs")
library("bst")
library("parallel")
library("glmnetUtils")
library("xgboost")
External_LOOCV = function (i,data,...){
set.seed(i)
n=nrow(data$x)
p=ncol(data$x)
data.cur = list()
data.cur$x = data$x[-i,]
data.cur$y = data$y[-i]
data.adjXi = c(1,data$x[i,])
# AdaSubBoost
start.time <- Sys.time()
output = AdaSubBoost(data=data.cur, Iter=Iter, size.fixed=size.fixed, K=K, q=q, tau = tau, const=const, U_C=20, savings=1, nstop=nstop, s_max=s_max)
end.time <- Sys.time()
timeAdaSubBoost = as.double(end.time-start.time,units = "secs")
AdaSubBoost_beta = output$beta.cur
AdaSubBoost_model = which(AdaSubBoost_beta[-1]!=0)
linpred = data.adjXi %*% AdaSubBoost_beta
SqE_AdaSubBoost = as.numeric((linpred - data$y[i])^2)
modelsizes_AdaSubBoost = as.numeric(length(AdaSubBoost_model))
# RSubBoost (without adapting probabilities, adaptive=FALSE)
start.time <- Sys.time()
outputRSub = AdaSubBoost(data=data.cur, Iter=Iter, size.fixed=size.fixed, K=K, q=q, tau = tau, const=const, U_C=20, savings=1,
adaptive=FALSE, nstop=nstop, s_max=s_max)
end.time <- Sys.time()
timeRSubBoost = as.double(end.time-start.time,units = "secs")
RSubBoost_beta = outputRSub$beta.cur
RSubBoost_model = which(RSubBoost_beta[-1]!=0)
linpred = data.adjXi %*% RSubBoost_beta
SqE_RSubBoost = as.numeric((linpred - data$y[i])^2)
modelsizes_RSubBoost = as.numeric(length(RSubBoost_model))
# SubBoost (with full enumeration)
if (p<=20){
start.time <- Sys.time()
outputSub = SubBoost(data=data.cur,Iter=Iter,size.fixed=size.fixed, tau = tau, const=const, savings=1)
end.time <- Sys.time()
timeSubBoost = as.double(end.time-start.time,units = "secs")
SubBoost_beta = outputSub$beta.cur
SubBoost_model = which(SubBoost_beta[-1]!=0)
linpred = data.adjXi %*% SubBoost_beta
SqE_SubBoost = as.numeric((linpred - data$y[i])^2)
modelsizes_SubBoost = as.numeric(length(SubBoost_model))
}
#L2Boosting
start.time <- Sys.time()
mod <- glmboost(y = data.cur$y, x = data.cur$x, control = boost_control(mstop = mstop.max))
cv10f <- cv(model.weights(mod), type = "kfold")
cvm <-cvrisk(mod, papply = lapply, folds = cv10f)
m.stop = mstop(cvm)
end.time <- Sys.time()
timeBoosting = as.double(end.time-start.time,units = "secs")
mod.boost = mod[m.stop]
Boost_model = which(coef(mod.boost, which=1:p)!=0)
Boost_beta = coef(mod.boost, which=1:p)
data.adjXi = data$x[i,,drop=F]
linpred = predict(mod.boost,type = "response", newdata = data.adjXi)
SqE_Boosting = as.numeric((linpred - data$y[i])^2)
modelsizes_Boosting= as.numeric(length(Boost_model))
# Stability Selection for Boosting
mod <- glmboost(y = data.cur$y, x = data.cur$x, control = boost_control(mstop = 1000))
start.time <- Sys.time()
mod.stab = stabsel(mod, PFER = PFER, sampling.type = "SS", q = q_stabsel, papply=lapply)
end.time <- Sys.time()
timeStability = as.double(end.time-start.time, units = "secs")
Stability_model = as.vector(mod.stab$selected)
Stability_beta = beta.hat(data,Stability_model)
data.adjXi = c(1,data$x[i,])
linpred = data.adjXi %*% Stability_beta
SqE_Stability= as.numeric((linpred - data$y[i])^2)
modelsizes_Stability = as.numeric(length(Stability_model))
# Twin Boosting
mstop.max2 = 5000
start.time <- Sys.time()
cvm2 <- cv.bst(data.cur$x, data.cur$y, family="gaussian", ctrl = bst_control(twinboost = TRUE, twintype=1, coefir = Boost_beta ,
xselect.init = Boost_model, mstop = mstop.max2), learner="ls", n.cores=1)
mstop.optimal = which.min(cvm2$cv.error)
twinBoost = bst(data.cur$x, data.cur$y, family="gaussian", ctrl = bst_control(twinboost = TRUE, twintype=1, coefir = Boost_beta ,
xselect.init = Boost_model, mstop = mstop.optimal), learner="ls")
end.time <- Sys.time()
timeBoost2 = as.double(end.time-start.time,units = "secs")
timeTwinBoost = timeBoosting + timeBoost2
TwinBoost_model = twinBoost$xselect
data.adjXi = data$x[i,,drop=F]
linpred = predict(twinBoost,type = "response", newdata = data.adjXi)
SqE_TwinBoost = as.numeric((linpred - data$y[i])^2)
modelsizes_TwinBoost = as.numeric(length(TwinBoost_model))
## Lasso with cross-validation
start.time <- Sys.time()
mod_cv <- cv.glmnet(x=data.cur$x, y=data.cur$y, family="gaussian")
Lasso_beta = coef(mod_cv, mod_cv$lambda.min) #lambda.min
Lasso_model = which(Lasso_beta[-1] !=0)
end.time <- Sys.time()
timeLasso = as.double(end.time-start.time,units = "secs")
data.adjXi = c(1,data$x[i,])
linpred = data.adjXi %*% Lasso_beta
SqE_Lasso = as.numeric((linpred - data$y[i])^2)
modelsizes_Lasso = length(Lasso_model)
## Elastic net with cross-validation for both parameters (lambda and alpha)
start.time <- Sys.time()
mod_cv <- cva.glmnet(x=data.cur$x, y=data.cur$y, family="gaussian")
cv_errors_alpha <- rep(NA,length(mod_cv$alpha))
for (k in 1:length(mod_cv$alpha)) { # find minimal mean CV-error among alphas
cv_errors_alpha[k] <- min(mod_cv$modlist[[k]]$cvm)
}
alpha_min <- mod_cv$alpha[which.min(cv_errors_alpha)]
lambda_min <- sapply(mod_cv$modlist, `[[`, "lambda.min")[which.min(cv_errors_alpha)]
ENet_beta = coef(mod_cv, alpha=alpha_min, s=lambda_min) #coef(mod_cv, alpha=alpha_min)
ENet_model = which(ENet_beta[-1] !=0)
end.time <- Sys.time()
timeENet = as.double(end.time-start.time,units = "secs")
data.adjXi = c(1,data$x[i,])
linpred = data.adjXi %*% ENet_beta
SqE_ENet = as.numeric((linpred - data$y[i])^2)
modelsizes_ENet = length(ENet_model)
## Relaxed Lasso with cross-validation
start.time <- Sys.time()
mod_cv <- cv.glmnet(x=data.cur$x, y=data.cur$y, family="gaussian", relax=TRUE)
ReLasso_beta = coef(mod_cv, s="lambda.min", gamma="gamma.min") #lambda.min
ReLasso_model = which(ReLasso_beta[-1] !=0)
end.time <- Sys.time()
timeReLasso = as.double(end.time-start.time,units = "secs")
data.adjXi = c(1,data$x[i,])
linpred = data.adjXi %*% ReLasso_beta
SqE_ReLasso = as.numeric((linpred - data$y[i])^2)
modelsizes_ReLasso = length(ReLasso_model)
## Model selected by EBIC with least squares
start.time <- Sys.time()
s_max = s_max
# forward screening
regs=summary(regsubsets(as.matrix(data.cur$x),data.cur$y,intercept=TRUE,nvmax=s_max,method="forward"))
modelmatrix=as.matrix(regs$which[,-1,drop=F])
S = which(modelmatrix[s_max,])
regs=summary(regsubsets(as.matrix(data.cur$x[,S,drop=F]),data.cur$y,intercept=TRUE,nvmax=min(c(length(S),n-3)),method="exhaustive"))
modelmatrix=as.matrix(regs$which[,-1,drop=F])
vec=rep(0,nrow(modelmatrix)+1)
vec[nrow(modelmatrix)+1] = EBIC(data.cur,NULL,const)
for (j in 1:nrow(modelmatrix)) {
indices.help=S[as.vector(which(modelmatrix[j,]==TRUE))]
vec[j]=EBIC(data.cur,indices.help,const)
}
mini = which.min(vec)
if (mini<= nrow(modelmatrix)) { S=S[as.vector(which(modelmatrix[which.min(vec),]==TRUE))] } else { S = integer(0) }
EBIC_model <- S
EBIC_beta <- beta.hat(data.cur,S)
end.time <- Sys.time()
timeEBIC = as.double(end.time-start.time,units = "secs")
linpred = data.adjXi %*% EBIC_beta
SqE_EBIC = as.numeric((linpred - data$y[i])^2)
modelsizes_EBIC = length(EBIC_model)
#XGBoost with variable selection (component-wise boosting)
start.time <- Sys.time()
cv <- xgb.cv(data = data.cur$x ,
label = data.cur$y,
eta = 0.1,
nrounds = mstop.max,
nfold = 10,
booster = "gblinear",
objective = "reg:squarederror",
updater = "coord_descent",
feature_selector = "greedy",
base_score=0,
top_k = 1,
verbose=FALSE,
nthread=1,
early_stopping_rounds = 10)
iter.opt <- which.min(cv$evaluation_log$test_rmse_mean)
bst <- xgboost(data = data.cur$x , label = data.cur$y, eta = 0.1, nrounds = iter.opt, nthread = 1,
updater = 'coord_descent', feature_selector = 'greedy', top_k = 1, base_score=0, verbose=FALSE,
booster = "gblinear", objective = "reg:squarederror", callbacks = list(cb.gblinear.history()))
coef_path <- xgb.gblinear.history(bst)
XGBoostCD_beta <- coef_path[iter.opt,]
XGBoostCD_model <- which(coef_path[iter.opt,-1]!=0)
end.time <- Sys.time()
timeXGBoostCD = as.double(end.time-start.time,units = "secs")
linpred = data.adjXi %*% XGBoostCD_beta
SqE_XGBoostCD = as.numeric((linpred - data$y[i])^2)
modelsizes_XGBoostCD = length(XGBoostCD_model)
if (p<=20) {
return(list(SqE_AdaSubBoost=SqE_AdaSubBoost, SqE_RSubBoost=SqE_RSubBoost, SqE_SubBoost=SqE_SubBoost, SqE_Boosting = SqE_Boosting, SqE_TwinBoost = SqE_TwinBoost, SqE_Lasso = SqE_Lasso, SqE_ENet = SqE_ENet, SqE_Stability = SqE_Stability ,
SqE_ReLasso = SqE_ReLasso, SqE_EBIC = SqE_EBIC, SqE_XGBoostCD = SqE_XGBoostCD,
modelsizes_AdaSubBoost=modelsizes_AdaSubBoost, modelsizes_RSubBoost=modelsizes_RSubBoost, modelsizes_SubBoost=modelsizes_SubBoost, modelsizes_TwinBoost = modelsizes_TwinBoost, modelsizes_Boosting = modelsizes_Boosting, modelsizes_Lasso = modelsizes_Lasso, modelsizes_ENet = modelsizes_ENet, modelsizes_Stability = modelsizes_Stability,
modelsizes_ReLasso = modelsizes_ReLasso, modelsizes_EBIC = modelsizes_EBIC, modelsizes_XGBoostCD = modelsizes_XGBoostCD,
timeAdaSubBoost=timeAdaSubBoost, timeRSubBoost=timeRSubBoost, timeSubBoost=timeSubBoost, timeSubBoost=timeSubBoost, timeTwinBoost = timeTwinBoost, timeBoosting = timeBoosting, timeLasso = timeLasso, timeENet = timeENet, timeStability = timeStability,
timeReLasso = timeReLasso, timeEBIC = timeEBIC, timeXGBoostCD = timeXGBoostCD))
} else {
return(list(SqE_AdaSubBoost=SqE_AdaSubBoost, SqE_RSubBoost=SqE_RSubBoost, SqE_Boosting = SqE_Boosting, SqE_TwinBoost = SqE_TwinBoost, SqE_Lasso = SqE_Lasso, SqE_ENet = SqE_ENet, SqE_Stability = SqE_Stability ,
SqE_ReLasso = SqE_ReLasso, SqE_EBIC = SqE_EBIC, SqE_XGBoostCD = SqE_XGBoostCD,
modelsizes_AdaSubBoost=modelsizes_AdaSubBoost, modelsizes_RSubBoost=modelsizes_RSubBoost, modelsizes_TwinBoost = modelsizes_TwinBoost, modelsizes_Boosting = modelsizes_Boosting, modelsizes_Lasso = modelsizes_Lasso, modelsizes_ENet = modelsizes_ENet, modelsizes_Stability = modelsizes_Stability,
modelsizes_ReLasso = modelsizes_ReLasso, modelsizes_EBIC = modelsizes_EBIC, modelsizes_XGBoostCD = modelsizes_XGBoostCD,
timeAdaSubBoost=timeAdaSubBoost, timeRSubBoost=timeRSubBoost, timeTwinBoost = timeTwinBoost, timeBoosting = timeBoosting, timeLasso = timeLasso, timeENet = timeENet, timeStability = timeStability,
timeReLasso = timeReLasso, timeEBIC = timeEBIC, timeXGBoostCD = timeXGBoostCD))
}
}
##################################################
unAsIs <- function(X) {
if("AsIs" %in% class(X)) {
class(X) <- class(X)[-match("AsIs", class(X))]
}
X
}
##################################################
library("hdi")
data(riboflavin)
n = length(riboflavin$y)
p = dim(riboflavin$x)[2]
Xnames = colnames(riboflavin$x)
data = list()
data$x = as.matrix(riboflavin$x)
data$y = as.vector(riboflavin$y)
data$x = unAsIs(data$x)
data$y = unAsIs(data$y)
################################################
q = min(20,p/2)
size.fixed = NULL
tau = 0.01 # 0.01
Iter = 10000
const = 1 # gamma (Penalty in EBIC)
PFER = 2
mstop.max = 5000
nstop = p / 2
q_stabsel = min(15, p/2)
K = p/q
s_max = 15
#numCores <- detectCores()
numCores = 1 # Increase number of cores for parallel computing
RNGkind("L'Ecuyer-CMRG")
set.seed(1)
LOOCV_results = mclapply(1:n, External_LOOCV, mc.cores = numCores, data=data, mstop.max = mstop.max,Iter=Iter,K=K,q=q, tau = tau, const=const, U_C=20, savings=1, mc.set.seed = TRUE, mc.preschedule = FALSE, size.fixed=size.fixed, q_stabsel=q_stabsel, PFER=PFER, nstop=nstop, s_max = s_max)
save(LOOCV_results, file="LOOCV_Riboflavin_CV.RData")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.