This workflow is similar to the one here, but with much more hyper-parameter tuning. Notice that the extra tuning did not actually improve performance on the held-out data. Our advice is to not treat the number of variables as just another hyper-parameter, but instead to perform a simple initial filtering step (either with a simple linear model, or with a random-forest style permutation test); a sketch of such a filter appears after the score frame is computed below. At some point the hyper-parameter optimizer cannot tell apart two ways of improving the cross-validated training error: picking a genuinely better model, or leaking information through the cross-validation procedure.
library("wrapr")
library("vtreat")
library("glmnet")
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-16
library("ggplot2")
library("WVPlots")
library("doParallel")
## Loading required package: iterators
## Loading required package: parallel
# https://arxiv.org/abs/1703.03373
library("mlrMBO")
## Loading required package: mlr
## Loading required package: ParamHelpers
## Loading required package: smoof
## Loading required package: BBmisc
##
## Attaching package: 'BBmisc'
## The following object is masked from 'package:wrapr':
##
## coalesce
## The following object is masked from 'package:base':
##
## isFALSE
## Loading required package: checkmate
# set up a parallel cluster to speed up vtreat and cv.glmnet
ncore <- parallel::detectCores()
cl <- parallel::makeCluster(ncore)
registerDoParallel(cl)
# function to make practice data:
# n_var_cols weak-signal columns plus n_noise_cols pure-noise columns,
# each with injected missing values and a wild random per-column scaling
mk_data <- function(nrows, n_var_cols, n_noise_cols) {
  d <- data.frame(y = rnorm(nrows))
  for(i in seq_len(n_var_cols)) {
    vari <- paste0("var_", sprintf("%03g", i))
    d[[vari]] <- rnorm(nrows)
    # each signal column contributes a small amount to the outcome
    d$y <- d$y + (2/n_var_cols)*d[[vari]]
    # censor values above a random threshold to missing
    d[[vari]][d[[vari]] > abs(2*rnorm(nrows))] <- NA
    # re-scale the column by a random log-normal factor
    d[[vari]] <- rlnorm(1, meanlog = 10, sdlog = 10)*d[[vari]]
  }
  for(i in seq_len(n_noise_cols)) {
    vari <- paste0("noise_", sprintf("%03g", i))
    d[[vari]] <- rnorm(nrows)
    d[[vari]][d[[vari]] > abs(2*rnorm(nrows))] <- NA
    d[[vari]] <- rlnorm(1, meanlog = 10, sdlog = 10)*d[[vari]]
  }
  d
}
set.seed(2018)
d <- mk_data(10000, 10, 200)
# split into train and test sets
is_train <- runif(nrow(d)) <= 0.5
dTrain <- d[is_train, , drop = FALSE]
dTest <- d[!is_train, , drop = FALSE]
outcome_name <- "y"
vars <- setdiff(colnames(dTrain), outcome_name)
# design a treatment plan using cross-validation methods
ncross <- 5
cplan <- vtreat::kWayStratifiedY(
  nrow(dTrain), ncross, dTrain, dTrain[[outcome_name]])
cp <- vtreat::mkCrossFrameNExperiment(
  dTrain, vars, outcome_name,
  splitFunction = pre_comp_xval(nrow(dTrain), ncross, cplan),
  ncross = ncross,
  parallelCluster = cl)
## [1] "vtreat 1.3.3 start initial treatment design Mon Nov 26 08:33:42 2018"
## [1] " start cross frame work Mon Nov 26 08:33:44 2018"
## [1] " vtreat::mkCrossFrameNExperiment done Mon Nov 26 08:33:52 2018"
print(cp$method)
## [1] "kwaycrossystratified (pre-computed 5053 5 )"
# sort the derived variables by vtreat's estimated significance
sf <- cp$treatments$scoreFrame
orderedvars <- sf$varName[order(sf$sig)]
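The score frame's sig column is also what one would use for the initial filtering step recommended in the introduction. A minimal sketch of such a filter (not part of this workflow, which instead lets the optimizer choose a prefix of the significance-sorted list):
# keep only variables whose estimated significance beats a
# 1/(number of candidate variables) threshold
goodvars <- sf$varName[sf$sig < 1/nrow(sf)]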
# build a cross-validation strategy to help us
# search for a good alpha hyper-parameter value:
# convert the vtreat cross plan into cv.glmnet's foldid notation
foldid <- numeric(nrow(dTrain))
for(i in seq_along(cplan)) {
  cpi <- cplan[[i]]
  foldid[cpi$app] <- i
}
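A quick check (our addition, not in the original run) that every training row landed in exactly one application fold:
# each fold should hold roughly nrow(dTrain)/ncross rows;
# any row left at the initial value 0 was never assigned
table(foldid)
stopifnot(all(foldid >= 1))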
build_model <- function(nvars, alpha) {
  newvars <- orderedvars[seq_len(nvars)]
  # learn a centering and scaling from the cross-validated
  # training frame
  tfs <- scale(cp$crossFrame[, newvars, drop = FALSE],
               center = TRUE, scale = TRUE)
  centering <- attr(tfs, "scaled:center")
  scaling <- attr(tfs, "scaled:scale")
  # apply the centering and scaling to the cross-validated
  # training frame
  tfs <- scale(cp$crossFrame[, newvars, drop = FALSE],
               center = centering,
               scale = scaling)
  # look for the best lambda at this alpha
  model <- cv.glmnet(as.matrix(tfs),
                     cp$crossFrame[[outcome_name]],
                     alpha = alpha,
                     family = "gaussian",
                     standardize = FALSE,
                     foldid = foldid,
                     parallel = TRUE)
  index <- which(model$lambda == model$lambda.min)[[1]]
  score <- model$cvm[[index]]
  list(model = model, score = score)
}
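Before handing the objective to the optimizer, it can be worth a quick spot evaluation (a sanity check we add here; the particular nvars and alpha values are arbitrary):
# evaluate the objective once to confirm it runs and returns a loss
build_model(nvars = 10, alpha = 0.5)$score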
# https://cran.r-project.org/web/packages/mlrMBO/vignettes/mlrMBO.html
obj_fn <- makeSingleObjectiveFunction(
  name = "loss",
  fn = function(x) {
    scored_model <- build_model(nvars = x[[1]], alpha = x[[2]])
    scored_model$score
  },
  par.set = makeParamSet(
    makeIntegerVectorParam("nvars", len = 1L,
                           lower = 2L, upper = length(orderedvars)),
    makeNumericVectorParam("alpha", len = 1L,
                           lower = 0, upper = 1)
  ),
  minimize = TRUE
)
# build and evaluate an initial Latin hypercube design to seed the optimizer
des <- generateDesign(n = 20,
                      par.set = getParamSet(obj_fn),
                      fun = lhs::randomLHS)
des$y <- vapply(seq_len(nrow(des)),
                function(i) {
                  obj_fn(c(des$nvars[[i]], des$alpha[[i]]))
                }, numeric(1))
# Gaussian-process surrogate model and expected-improvement infill
surr_km <- makeLearner("regr.km", predict.type = "se",
                       covtype = "matern3_2",
                       control = list(trace = FALSE))
control <- makeMBOControl()
control <- setMBOControlTermination(control, iters = 100)
control <- setMBOControlInfill(control, crit = makeMBOInfillCritEI())
run <- mbo(obj_fn, design = des, learner = surr_km,
           control = control, show.info = TRUE)
## [mbo] 1: nvars=142; alpha=0.704 : y = 1.07 : 0.6 secs : infill_ei
## [mbo] 2: nvars=74; alpha=0.62 : y = 1.05 : 0.3 secs : infill_ei
## [mbo] 3: nvars=38; alpha=0.441 : y = 1.06 : 0.2 secs : infill_ei
## [mbo] 4: nvars=346; alpha=1 : y = 1.08 : 2.2 secs : infill_ei
## [mbo] 5: nvars=60; alpha=1 : y = 1.05 : 0.2 secs : infill_ei
## [mbo] 6: nvars=90; alpha=7.13e-05 : y = 1.05 : 0.4 secs : infill_ei
## [mbo] 7: nvars=31; alpha=6.37e-05 : y = 1.06 : 0.2 secs : infill_ei
## [mbo] 8: nvars=86; alpha=1 : y = 1.05 : 0.5 secs : infill_ei
## [mbo] 9: nvars=420; alpha=0.815 : y = 1.08 : 2.6 secs : infill_ei
## [mbo] 10: nvars=49; alpha=1 : y = 1.05 : 0.3 secs : infill_ei
## [mbo] 11: nvars=2; alpha=0.636 : y = 1.39 : 0.1 secs : infill_ei
## [mbo] 12: nvars=33; alpha=1 : y = 1.06 : 0.2 secs : infill_ei
## [mbo] 13: nvars=83; alpha=0.365 : y = 1.05 : 0.3 secs : infill_ei
## [mbo] 14: nvars=116; alpha=1 : y = 1.06 : 0.6 secs : infill_ei
## [mbo] 15: nvars=160; alpha=1 : y = 1.07 : 0.8 secs : infill_ei
## [mbo] 16: nvars=55; alpha=0.688 : y = 1.05 : 0.2 secs : infill_ei
## [mbo] 17: nvars=107; alpha=1.71e-05 : y = 1.06 : 0.5 secs : infill_ei
## [mbo] 18: nvars=227; alpha=1 : y = 1.08 : 1.2 secs : infill_ei
## [mbo] 19: nvars=63; alpha=2.06e-06 : y = 1.05 : 0.3 secs : infill_ei
## [mbo] 20: nvars=313; alpha=1 : y = 1.08 : 2.0 secs : infill_ei
## [mbo] 21: nvars=66; alpha=0.42 : y = 1.05 : 0.4 secs : infill_ei
## [mbo] 22: nvars=48; alpha=5.38e-05 : y = 1.05 : 0.3 secs : infill_ei
## [mbo] 23: nvars=69; alpha=0.998 : y = 1.05 : 0.3 secs : infill_ei
## [mbo] 24: nvars=318; alpha=1.46e-05 : y = 1.12 : 1.8 secs : infill_ei
## [mbo] 25: nvars=49; alpha=0.611 : y = 1.05 : 0.2 secs : infill_ei
## [mbo] 26: nvars=94; alpha=0.346 : y = 1.05 : 0.4 secs : infill_ei
## [mbo] 27: nvars=137; alpha=0.000819 : y = 1.06 : 0.6 secs : infill_ei
## [mbo] 28: nvars=58; alpha=0.0997 : y = 1.05 : 0.2 secs : infill_ei
## [mbo] 29: nvars=191; alpha=7.43e-05 : y = 1.08 : 0.9 secs : infill_ei
## [mbo] 30: nvars=78; alpha=0.885 : y = 1.05 : 0.3 secs : infill_ei
## [mbo] 31: nvars=52; alpha=0.842 : y = 1.05 : 0.2 secs : infill_ei
## [mbo] 32: nvars=53; alpha=0.000119 : y = 1.05 : 0.2 secs : infill_ei
## [mbo] 33: nvars=420; alpha=6.47e-06 : y = 1.16 : 2.8 secs : infill_ei
## [mbo] 34: nvars=405; alpha=1 : y = 1.08 : 2.9 secs : infill_ei
## [mbo] 35: nvars=90; alpha=0.732 : y = 1.05 : 0.3 secs : infill_ei
## [mbo] 36: nvars=173; alpha=0.532 : y = 1.07 : 0.7 secs : infill_ei
## [mbo] 37: nvars=45; alpha=0.769 : y = 1.05 : 0.2 secs : infill_ei
## [mbo] 38: nvars=51; alpha=0.131 : y = 1.05 : 0.2 secs : infill_ei
## [mbo] 39: nvars=78; alpha=2.35e-06 : y = 1.05 : 0.3 secs : infill_ei
## [mbo] 40: nvars=95; alpha=1 : y = 1.05 : 0.3 secs : infill_ei
## [mbo] 41: nvars=33; alpha=0.669 : y = 1.06 : 0.1 secs : infill_ei
## [mbo] 42: nvars=52; alpha=0.443 : y = 1.05 : 0.2 secs : infill_ei
## [mbo] 43: nvars=152; alpha=0.332 : y = 1.07 : 0.6 secs : infill_ei
## [mbo] 44: nvars=111; alpha=0.52 : y = 1.06 : 0.4 secs : infill_ei
## [mbo] 45: nvars=251; alpha=1 : y = 1.08 : 1.2 secs : infill_ei
## [mbo] 46: nvars=64; alpha=0.867 : y = 1.05 : 0.2 secs : infill_ei
## [mbo] 47: nvars=54; alpha=0.93 : y = 1.05 : 0.2 secs : infill_ei
## [mbo] 48: nvars=364; alpha=0.761 : y = 1.08 : 2.3 secs : infill_ei
## [mbo] 49: nvars=69; alpha=3.9e-05 : y = 1.05 : 0.3 secs : infill_ei
## [mbo] 50: nvars=51; alpha=0.351 : y = 1.05 : 0.2 secs : infill_ei
## [mbo] 51: nvars=52; alpha=0.0537 : y = 1.05 : 0.3 secs : infill_ei
## [mbo] 52: nvars=61; alpha=0.53 : y = 1.05 : 0.3 secs : infill_ei
## [mbo] 53: nvars=291; alpha=1 : y = 1.08 : 1.6 secs : infill_ei
## [mbo] 54: nvars=97; alpha=9.38e-05 : y = 1.05 : 0.4 secs : infill_ei
## [mbo] 55: nvars=55; alpha=0.000316 : y = 1.05 : 0.2 secs : infill_ei
## [mbo] 56: nvars=53; alpha=0.77 : y = 1.05 : 0.2 secs : infill_ei
## [mbo] 57: nvars=52; alpha=0.206 : y = 1.05 : 0.2 secs : infill_ei
## [mbo] 58: nvars=53; alpha=1 : y = 1.05 : 0.2 secs : infill_ei
## [mbo] 59: nvars=270; alpha=5.14e-05 : y = 1.11 : 1.7 secs : infill_ei
## [mbo] 60: nvars=81; alpha=0.101 : y = 1.05 : 0.3 secs : infill_ei
## [mbo] 61: nvars=76; alpha=0.999 : y = 1.05 : 0.3 secs : infill_ei
## [mbo] 62: nvars=51; alpha=0.000118 : y = 1.05 : 0.3 secs : infill_ei
## [mbo] 63: nvars=79; alpha=0.525 : y = 1.05 : 0.3 secs : infill_ei
## [mbo] 64: nvars=42; alpha=8.17e-05 : y = 1.06 : 0.3 secs : infill_ei
## [mbo] 65: nvars=30; alpha=0.311 : y = 1.06 : 0.2 secs : infill_ei
## [mbo] 66: nvars=57; alpha=0.819 : y = 1.05 : 0.2 secs : infill_ei
## [mbo] 67: nvars=135; alpha=1 : y = 1.06 : 0.5 secs : infill_ei
## [mbo] 68: nvars=53; alpha=0.548 : y = 1.05 : 0.3 secs : infill_ei
## [mbo] 69: nvars=108; alpha=1 : y = 1.06 : 0.4 secs : infill_ei
## [mbo] 70: nvars=52; alpha=0.958 : y = 1.05 : 0.3 secs : infill_ei
## [mbo] 71: nvars=85; alpha=0.000229 : y = 1.05 : 0.4 secs : infill_ei
## [mbo] 72: nvars=53; alpha=0.39 : y = 1.05 : 0.2 secs : infill_ei
## [mbo] 73: nvars=52; alpha=0.32 : y = 1.05 : 0.2 secs : infill_ei
## [mbo] 74: nvars=60; alpha=0.000125 : y = 1.05 : 0.3 secs : infill_ei
## [mbo] 75: nvars=81; alpha=1 : y = 1.05 : 0.3 secs : infill_ei
## [mbo] 76: nvars=65; alpha=0.0181 : y = 1.05 : 0.3 secs : infill_ei
## [mbo] 77: nvars=53; alpha=0.495 : y = 1.05 : 0.2 secs : infill_ei
## [mbo] 78: nvars=52; alpha=2.2e-05 : y = 1.05 : 0.2 secs : infill_ei
## [mbo] 79: nvars=53; alpha=0.0934 : y = 1.05 : 0.2 secs : infill_ei
## [mbo] 80: nvars=53; alpha=0.891 : y = 1.05 : 0.2 secs : infill_ei
## [mbo] 81: nvars=53; alpha=0.165 : y = 1.05 : 0.3 secs : infill_ei
## [mbo] 82: nvars=53; alpha=0.639 : y = 1.05 : 0.2 secs : infill_ei
## [mbo] 83: nvars=53; alpha=0.018 : y = 1.05 : 0.2 secs : infill_ei
## [mbo] 84: nvars=124; alpha=0.872 : y = 1.06 : 0.5 secs : infill_ei
## [mbo] 85: nvars=53; alpha=0.722 : y = 1.05 : 0.2 secs : infill_ei
## [mbo] 86: nvars=63; alpha=0.24 : y = 1.05 : 0.5 secs : infill_ei
## [mbo] 87: nvars=75; alpha=0.000222 : y = 1.05 : 0.4 secs : infill_ei
## [mbo] 88: nvars=100; alpha=0.204 : y = 1.06 : 0.8 secs : infill_ei
## [mbo] 89: nvars=214; alpha=0.635 : y = 1.08 : 1.2 secs : infill_ei
## [mbo] 90: nvars=53; alpha=0.592 : y = 1.05 : 0.2 secs : infill_ei
## [mbo] 91: nvars=67; alpha=0.733 : y = 1.05 : 0.2 secs : infill_ei
## [mbo] 92: nvars=72; alpha=0.812 : y = 1.05 : 0.2 secs : infill_ei
## [mbo] 93: nvars=93; alpha=0.0734 : y = 1.05 : 0.3 secs : infill_ei
## [mbo] 94: nvars=77; alpha=0.29 : y = 1.05 : 0.3 secs : infill_ei
## [mbo] 95: nvars=50; alpha=0.000182 : y = 1.05 : 0.2 secs : infill_ei
## [mbo] 96: nvars=41; alpha=0.999 : y = 1.06 : 0.4 secs : infill_ei
## [mbo] 97: nvars=53; alpha=0.348 : y = 1.05 : 0.3 secs : infill_ei
## [mbo] 98: nvars=114; alpha=6.36e-05 : y = 1.06 : 0.6 secs : infill_ei
## [mbo] 99: nvars=50; alpha=0.242 : y = 1.05 : 0.2 secs : infill_ei
## [mbo] 100: nvars=53; alpha=0.811 : y = 1.05 : 0.2 secs : infill_ei
opt <- run$x
print(opt)
## $nvars
## [1] 53
##
## $alpha
## [1] 0.0001189942
print(run$y)
## [1] 1.049697
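Note what the optimizer settled on: about 53 of the 400-plus derived variables (taken in significance order) and an alpha near 0, i.e. an essentially ridge-like penalty. The cross-validated loss of about 1.05 is also close to the values seen throughout the run, so most of the 100 iterations bought very little.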
alpha <- opt$alpha
nvars <- opt$nvars
# re-fit the model with the chosen alpha and nvars
m <- build_model(nvars, alpha)
newvars <- orderedvars[seq_len(nvars)]
# restrict the lambda path to values at or above the selected lambda.min,
# so the final fit retains the chosen lambda in its path
lambdas <- m$model$lambda
s <- m$model$lambda.min
lambdas <- lambdas[lambdas >= s]
print(nvars)
## [1] 53
print(alpha)
## [1] 0.0001189942
print(s)
## [1] 0.03029936
# learn a centering and scaling from the cross-validated
# training frame
tfs <- scale(cp$crossFrame[, newvars, drop = FALSE],
             center = TRUE, scale = TRUE)
centering <- attr(tfs, "scaled:center")
scaling <- attr(tfs, "scaled:scale")
# apply the centering and scaling to the cross-validated
# training frame
tfs <- scale(cp$crossFrame[, newvars, drop = FALSE],
             center = centering,
             scale = scaling)
# fit the final glmnet model on the restricted lambda path
model <- glmnet(as.matrix(tfs),
                cp$crossFrame[[outcome_name]],
                alpha = alpha,
                family = "gaussian",
                standardize = FALSE,
                lambda = lambdas)
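It can be useful to check how many coefficients the final fit actually keeps at the chosen lambda (a small check we add here; coef() on a glmnet fit returns a sparse matrix):
cf <- coef(model, s = s)
sum(cf != 0)  # count of nonzero coefficients (includes the intercept)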
# wrap data preparation, variable selection, scaling, and prediction
# into a single re-usable pipeline of partially applied functions
pipeline <-
  pkgfn("vtreat::prepare",
        arg_name = "dframe",
        args = list(treatmentplan = cp$treatments,
                    varRestriction = newvars)) %.>%
  pkgfn("subset",
        arg_name = "x",
        args = list(select = newvars)) %.>%
  pkgfn("scale",
        arg_name = "x",
        args = list(center = centering,
                    scale = scaling)) %.>%
  pkgfn("glmnet::predict.glmnet",
        arg_name = "newx",
        args = list(object = model,
                    s = s)) %.>%
  srcfn(".[, cname, drop = TRUE]",
        arg_name = ".",
        args = list(cname = "1"))
cat(format(pipeline))
## UnaryFnList(
## vtreat::prepare(dframe=., treatmentplan, varRestriction),
## base::subset(x=., select),
## base::scale(x=., center, scale),
## glmnet::predict.glmnet(newx=., object, s),
## SrcFunction{ .[, cname, drop = TRUE] }(.=., cname))
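Since the pipeline is an ordinary R object, it can be serialized and restored for later application (a sketch; the file name is arbitrary):
# save the fitted pipeline for later scoring jobs
saveRDS(pipeline, "pipeline.RDS")
# pipeline <- readRDS("pipeline.RDS")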
dTrain$prediction <- dTrain %.>% pipeline
WVPlots::ScatterHist(
  dTrain, "prediction", "y", "fit on training data",
  smoothmethod = "identity",
  estimate_sig = TRUE,
  point_alpha = 0.1,
  contour = TRUE)
dTest$prediction <- dTest %.>% pipeline
WVPlots::ScatterHist(
  dTest, "prediction", "y", "fit on test",
  smoothmethod = "identity",
  estimate_sig = TRUE,
  point_alpha = 0.1,
  contour = TRUE)
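To put a number on the held-out performance gap (an addition to the original write-up), compare root mean squared error on train and test:
rmse <- function(pred, truth) sqrt(mean((pred - truth)^2))
rmse(dTrain$prediction, dTrain$y)
rmse(dTest$prediction, dTest$y)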
parallel::stopCluster(cl)