Code to run examples for the Win-Vector blog article "When Cross-Validation is More Powerful than Regularization".
```r
knitr::opts_chunk$set(echo = TRUE)
```
Convenience functions.
```r
# function to calculate the rmse
rmse = function(ypred, y) {
  resid = y - ypred
  sqrt(mean(resid^2))
}

# function to calculate R-squared
rsquared = function(ypred, y) {
  null_variance = sum((y - mean(y))^2)
  resid_variance = sum((y - ypred)^2)
  1 - (resid_variance/null_variance)
}

compare_models = function(predframe) {
  predictions = setdiff(colnames(predframe), "y")
  data.frame(# pred_type = predictions,
             rmse = vapply(predframe[, predictions, drop = FALSE],
                           FUN = function(p) rmse(p, predframe$y),
                           numeric(1)),
             rsquared = vapply(predframe[, predictions, drop = FALSE],
                               FUN = function(p) rsquared(p, predframe$y),
                               numeric(1)))
}
```
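As a quick check that `compare_models` behaves as expected, here is an illustrative call on toy data (the column names and values below are made up for this example, not from the article):

```r
# illustrative usage on made-up data: a constant prediction should score
# near R-squared 0, an almost-exact prediction near R-squared 1
toy = data.frame(y = c(1, 2, 3, 4),
                 ypred_const = rep(2.5, 4),
                 ypred_close = c(1.1, 2.0, 2.9, 4.1))
compare_models(toy)
```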
Set up the data and split into training and test sets.
```r
set.seed(3453421)
Ndata = 500
nnoise = 100
nsig = 10

noise_levels = paste0("n_", sprintf('%02d', 1:nnoise))
signal_levels = paste0("s_", sprintf('%02d', 1:nsig))

sig_amps = 2*runif(1:nsig, min=-1, max=1)
names(sig_amps) = signal_levels
sig_amps = sig_amps - mean(sig_amps) # mean zero

x_s = sample(signal_levels, Ndata, replace=TRUE)
x_n = sample(noise_levels, Ndata, replace=TRUE)
y = sig_amps[x_s] + rnorm(Ndata) # a function of x_s but not x_n

df = data.frame(x_s=x_s, x_n=x_n, y=y, stringsAsFactors=FALSE)

library(zeallot)
c(dtest, dtrain) %<-% split(df, runif(Ndata) < 0.5) # false comes first

head(dtrain)

# for later - a frame to hold the test set predictions
pframe = data.frame(y = dtest$y)
```
For this simple example, you might try representing each variable as the expected value of `y - mean(y)` in the training data, conditioned on the variable's level. So the $i$th "coefficient" of the one-variable model would be given by:
$$ v_i = E[y | x = s_i] - E[y] $$
Where $s_i$ is the $i$th level.
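For concreteness, the formula above can be computed directly in base R (a minimal sketch, not the article's code; the article uses `rqdatatable` below, and the name `vs_base` here is just illustrative):

```r
# conditional mean of y per level of x_s, minus the grand mean
grandmean = mean(dtrain$y)
vs_base = tapply(dtrain$y, dtrain$x_s, mean) - grandmean
head(vs_base)
```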
"Fit" the one-variable model for x_s
.
```r
# build the maps of expected values
library(rqdatatable) # yes, you can use dplyr or base instead...
library(wrapr)

xs_means = dtrain %.>%
  extend(., delta := y - mean(y)) %.>%
  project(.,
          meany := mean(y),
          coeff := mean(delta),
          groupby = 'x_s') %.>%
  order_rows(., 'x_s') %.>%
  as.data.frame(.)

xs_means
```
"Fit" the one-variable model for x_n
and treat or "prepare" the data (we are using terminology that is consistent with vtreat
).
```r
xn_means = dtrain %.>%
  extend(., delta := y - mean(y)) %.>%
  project(.,
          meany := mean(delta),
          groupby = 'x_n') %.>%
  order_rows(., 'x_n') %.>%
  as.data.frame(.)

# the maps that convert categorical levels to numerical values
xs_map = with(xs_means, x_s := coeff)
xn_map = with(xn_means, x_n := meany)

prepare_manually = function(coefmap, xcol) {
  treated = coefmap[xcol]
  ifelse(is.na(treated), 0, treated) # levels not seen in training map to 0
}

# "prepare" the data
dtrain_treated = dtrain
dtrain_treated$vs = prepare_manually(xs_map, dtrain$x_s)
dtrain_treated$vn = prepare_manually(xn_map, dtrain$x_n)

head(dtrain_treated)
```
Now fit a linear model for `y` as a function of `vs` and `vn`.
```r
model_raw = lm(y ~ vs + vn, data=dtrain_treated)
summary(model_raw)
```
Apply the naive model to the test data.
```r
# apply to test data
dtest_treated = dtest
dtest_treated$vs = prepare_manually(xs_map, dtest$x_s)
dtest_treated$vn = prepare_manually(xn_map, dtest$x_n)

pframe$ypred_naive = predict(model_raw, newdata=dtest_treated)

# look at the predictions on holdout data
compare_models(pframe) %.>% knitr::kable(.)
```
Let's fit the correct nested model, using `vtreat`.
```r
library(vtreat)
library(wrapr)

xframeResults = mkCrossFrameNExperiment(dtrain,
                                        qc(x_s, x_n), "y",
                                        codeRestriction = qc(catN),
                                        verbose = FALSE)
# the plan uses the one-variable models to treat data
treatmentPlan = xframeResults$treatments
# the cross-frame
dtrain_treated = xframeResults$crossFrame

head(dtrain_treated)

variables = setdiff(colnames(dtrain_treated), "y")

model_X = lm(mk_formula("y", variables), data=dtrain_treated)
summary(model_X)
```
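For intuition, here is a rough sketch of what the cross-frame construction does (this illustrates the idea only and is not `vtreat`'s actual implementation; the function name and fold count are invented for the sketch). Each row's treated value comes from a one-variable model fit only on the folds that do not contain that row, so the downstream `lm()` never sees values memorized from the row itself.

```r
# sketch of the cross-frame idea (not vtreat's implementation):
# each row's value comes from level means estimated on the other folds
cross_frame_column = function(x, y, nfolds = 5) {
  fold = sample(rep(1:nfolds, length.out = length(y)))
  v = numeric(length(y))
  for(f in 1:nfolds) {
    out = (fold == f)
    # level coefficients estimated only on out-of-fold rows
    coefs = tapply(y[!out] - mean(y[!out]), x[!out], mean)
    treated = coefs[x[out]]
    v[out] = ifelse(is.na(treated), 0, treated) # unseen levels -> 0
  }
  v
}
```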
We can compare the performance of this model to the naive model on holdout data.
```r
dtest_treated = prepare(treatmentPlan, dtest)

pframe$ypred_crossval = predict(model_X, newdata=dtest_treated)

compare_models(pframe) %.>% knitr::kable(.)
```
The correct model has a much smaller root-mean-squared error and a much larger R-squared than the naive model when applied to new data.
For a one-variable model, L2-regularization is simply Laplace smoothing. Again, we'll represent each "coefficient" of the one-variable model as the Laplace-smoothed value minus the grand mean.
$$ v_i = \frac{\sum\nolimits_{j: \, x_j = s_i} y_j}{\text{count}_i + \lambda} - E[y] $$
Where $\text{count}_i$ is the frequency of $s_i$ in the training data, and $\lambda$ is the smoothing parameter (usually 1). If $\lambda = 1$, then the first term on the right is just adding one to the frequency of the level and then taking the "adjusted conditional mean" of `y`.
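To make the formula concrete, here is a minimal base-R sketch of the smoothed coefficients for `x_s` (illustrative only; the article's actual computation, using `rqdatatable`, follows below):

```r
# Laplace-smoothed coefficients: sum of y per level divided by
# (level count + lambda), minus the grand mean
lambda = 1
sums   = tapply(dtrain$y, dtrain$x_s, sum)
counts = tapply(dtrain$y, dtrain$x_s, length)
head(sums/(counts + lambda) - mean(dtrain$y))
```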
"Fit" a regularized model to x_s
.
```r
# build the coefficients
lambda = 1

xs_regmap = dtrain %.>%
  extend(., grandmean = mean(y)) %.>%
  project(.,
          sum_y := sum(y),
          count_y := n(),
          grandmean := mean(grandmean), # pseudo-aggregator
          groupby = 'x_s') %.>%
  extend(., vs := (sum_y/(count_y + lambda)) - grandmean) %.>%
  order_rows(., 'x_s') %.>%
  as.data.frame(.)

xs_regmap
```
"Fit" a regularized model to x_m
. Apply the one variable models for x_s
and x_n
to the data.
```r
xn_regmap = dtrain %.>%
  extend(., grandmean = mean(y)) %.>%
  project(.,
          sum_y := sum(y),
          count_y := n(),
          grandmean := mean(grandmean), # pseudo-aggregator
          groupby = 'x_n') %.>%
  extend(., vn := (sum_y/(count_y + lambda)) - grandmean) %.>%
  order_rows(., 'x_n') %.>%
  as.data.frame(.)

# the maps that convert categorical levels to numerical values
vs_map = xs_regmap$x_s := xs_regmap$vs
vn_map = xn_regmap$x_n := xn_regmap$vn

# "prepare" the data
dtrain_treated = dtrain
dtrain_treated$vs = prepare_manually(vs_map, dtrain$x_s)
dtrain_treated$vn = prepare_manually(vn_map, dtrain$x_n)

head(dtrain_treated)
```
Now fit the overall model:
```r
model_reg = lm(y ~ vs + vn, data=dtrain_treated)
summary(model_reg)
```
Compare the performance of the three models on holdout data.
```r
# apply to test data
dtest_treated = dtest
dtest_treated$vs = prepare_manually(vs_map, dtest$x_s)
dtest_treated$vn = prepare_manually(vn_map, dtest$x_n)

pframe$ypred_reg = predict(model_reg, newdata=dtest_treated)

# compare the predictions of each model
compare_models(pframe) %.>% knitr::kable(.)
```