knitr::opts_chunk$set(
  fig.retina = 3,
  fig.align = "center",
  fig.width = 6.93,
  fig.height = 6.13,
  out.width = "100%",
  echo = TRUE
)

R.utils::sourceDirectory("R")
library("drake")
library("mlr")
library("glmnet")
library("ggplot2")
library("magrittr")
library("plotmo")

options(crayon.enabled = TRUE, pillar.bold = TRUE, scipen = 999)
fansi::set_knit_hooks(knitr::knit_hooks)

# load drake objects
loadd(
  benchmark_models_new_penalized_mbo_buffer2,
  task,
  task_reduced_cor
)

Last update:

date()

General glmnet notes

Inspect fitted models during CV

Inspect the Ridge regression on the VI task in detail because the error is enormous.

First extract the models.

models <- benchmark_models_new_penalized_mbo_buffer2[[8]][["results"]][["vi_buffer2"]][["Ridge-MBO"]][["models"]]

Then look at the fold performances

benchmark_models_new_penalized_mbo_buffer2[[8]][["results"]][["vi_buffer2"]][["Ridge-MBO"]][["measures.test"]][["rmse"]]

We see a very high error on Fold 4 (= Laukiz 2). The errors on the other folds are also quite high but not "out of bounds".
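As a quick programmatic check, the worst fold and its error can also be extracted directly (a sketch reusing the rmse vector from above):

# Sketch: identify the fold with the largest test RMSE
rmse_folds <- benchmark_models_new_penalized_mbo_buffer2[[8]][["results"]][["vi_buffer2"]][["Ridge-MBO"]][["measures.test"]][["rmse"]]
which.max(rmse_folds)
max(rmse_folds)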

Let's look at the length of the fitted lambda sequence of each fold model:

purrr::map_int(models, ~ length(.x[["learner.model"]][["lambda"]]))

Interestingly, the lambda sequence of model 1 does not have the default length of 100 but only 5.
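To see what this truncated sequence looks like (a sketch based on the models object extracted above), the lambda values of model 1 can be printed directly:

# Sketch: print the (short) lambda sequence of fold model 1
models[[1]][["learner.model"]][["lambda"]]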

Train/predict via {glmnet} manually

To inspect further, let's refit a {glmnet} model directly on the training data of Fold 4 and inspect what glmnet::cv.glmnet estimates for the lambda sequence:

train_inds_fold4 <- benchmark_models_new_penalized_mbo_buffer2[[8]][["results"]][["vi_buffer2"]][["Ridge-MBO"]][["pred"]][["instance"]][["train.inds"]][[4]]

obs_train_f4 <- as.matrix(task[[2]]$env$data[train_inds_fold4, getTaskFeatureNames(task[[2]])])
target_f4 <- getTaskTargets(task[[2]])[train_inds_fold4]

Fit cv.glmnet

set.seed(1)
modf4 <- glmnet::cv.glmnet(obs_train_f4, target_f4, alpha = 0)

modf4$lambda.1se
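For context (a sketch, not part of the original output), lambda.min and the length of the estimated lambda path of this refit can be checked as well:

# Sketch: additional cv.glmnet diagnostics for the manual refit
modf4$lambda.min
length(modf4$lambda)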

Predict on Laukiz 2 now.

pred_inds_fold4 <- benchmark_models_new_penalized_mbo_buffer2[[8]][["results"]][["vi_buffer2"]][["Ridge-MBO"]][["pred"]][["instance"]][["test.inds"]][[4]]

obs_pred_f4 <- as.matrix(task[[2]]$env$data[pred_inds_fold4, getTaskFeatureNames(task[[2]])])

pred <- predict(modf4, newx = obs_pred_f4, s = modf4$lambda.1se)

Calculate the error

truth <- task[[2]]$env$data[pred_inds_fold4, "defoliation"]

mlr:::measureRMSE(truth, pred)

OK, an RMSE of 97073324139. This is most likely caused by a few observations which were predicted completely out of bounds.
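As a cross-check (a sketch, not part of the original output), the same RMSE can also be computed by hand from its definition:

# Sketch: RMSE computed directly from its definition
sqrt(mean((truth - as.numeric(pred))^2))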

qplot(pred, geom = "histogram")
pred[which(pred > 100), , drop = FALSE]

OK, it's one observation (row id = 737).
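To verify that this single observation drives the error, let's recompute the RMSE without the out-of-bounds prediction (a sketch, assuming the cutoff of 100 used above):

# Sketch: RMSE after excluding predictions above the (assumed) cutoff of 100
keep <- which(as.numeric(pred) <= 100)
mlr:::measureRMSE(truth[keep], as.numeric(pred)[keep])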

Let's have a look at the predictor values for this observation.

summary(obs_train_f4[737, ])

OK, how does this compare to the summaries of other observations? NB: Obs. 500-510 were chosen arbitrarily. The purpose here is to see whether anything in the predictors of specific observations looks abnormal.

# summarise the predictor values of observations 500 to 510
lapply(500:510, function(x) summary(obs_train_f4[x, ]))

Obs. 737 has some higher values but nothing that clearly stands out.
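A rough way to quantify this is to express the predictor values of this observation as z-scores relative to the training columns (a sketch using base R only, reusing the row indexing from above):

# Sketch: z-scores of this observation relative to the training feature columns
z <- (obs_train_f4[737, ] - colMeans(obs_train_f4)) / apply(obs_train_f4, 2, sd)
sort(abs(z), decreasing = TRUE)[1:5]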

Let's look at the model coefficients and Partial Dependence Plots (PDP):

coef(modf4)

Feature "bf2_PRI_norm" has a quite high value (-95).

plotres(modf4)
plot_glmnet(modf4$glmnet.fit)
plotmo(modf4$glmnet.fit)

Let's figure out which are the ten most important features and create PDPs for these:

top_ten_abs <- coef(modf4) %>%
  as.matrix() %>%
  as.data.frame() %>%
  dplyr::rename(coef = `1`) %>%
  dplyr::mutate(feature = rownames(coef(modf4))) %>%
  dplyr::slice(-1) %>%
  dplyr::mutate(coef_abs = abs(coef)) %>%
  dplyr::arrange(desc(coef_abs)) %>%
  dplyr::slice(1:10) %>%
  dplyr::pull(feature)

top_ten_abs

Partial Dependence Plots

For the PDPs we use a model trained with {mlr} and first check that it is equivalent to the direct {glmnet} fit.

lrn <- makeLearner("regr.cvglmnet", alpha = 0)
task_f4 <- subsetTask(task[[2]], train_inds_fold4)

set.seed(1)
mod_mlr <- train(lrn, task_f4)

Check lambda sequence and lambda.1se:

mod_mlr$learner.model$lambda
mod_mlr$learner.model$lambda.1se

Check for equality between the {mlr} fit and the direct {glmnet} fit:

all.equal(modf4$lambda.1se, mod_mlr$learner.model$lambda.1se)
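# Sketch (assumption): also compare the full lambda sequences, not only
# lambda.1se, to confirm that both fits follow the same regularization path
all.equal(modf4$lambda, mod_mlr$learner.model$lambda)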
pdp <- generatePartialDependenceData(mod_mlr, task_f4, features = top_ten_abs)
plotPartialDependence(pdp)

Individual PDP

pdp_ind <- generatePartialDependenceData(mod_mlr, task_f4,
  features = top_ten_abs,
  individual = TRUE
)
plotPartialDependence(pdp_ind)

Let's look at the x values for observation 737:

obs_train_f4[737, top_ten_abs]

Looks OK - they are all within a normal range with respect to the PDP estimates.
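Since the ridge model is linear in the features, let's decompose the prediction for this observation into per-feature contributions (a sketch, reusing the row indexing from above; not part of the original analysis):

# Sketch: per-feature contribution (coefficient * feature value) of the
# top-ten features to the linear predictor of this observation
cf <- coef(modf4)[top_ten_abs, 1]
sort(cf * obs_train_f4[737, top_ten_abs], decreasing = TRUE)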

Train/predict via {glmnet} manually on dataset with filtered feature correlation

To inspect further, let's repeat the manual refit on the training data of Fold 4, this time using the task with correlation-filtered features, and again inspect what glmnet::cv.glmnet estimates for the lambda sequence:

train_inds_fold4 <- benchmark_models_new_penalized_mbo_buffer2[[8]][["results"]][["vi_buffer2"]][["Ridge-MBO"]][["pred"]][["instance"]][["train.inds"]][[4]]

obs_train_f4 <- as.matrix(task_reduced_cor[[2]]$env$data[train_inds_fold4, getTaskFeatureNames(task_reduced_cor[[2]])])
target_f4 <- getTaskTargets(task_reduced_cor[[2]])[train_inds_fold4]

Fit cv.glmnet

set.seed(1)
modf4 <- glmnet::cv.glmnet(obs_train_f4, target_f4, alpha = 0)

modf4$lambda.1se

Predict on Laukiz 2 now.

pred_inds_fold4 <- benchmark_models_new_penalized_mbo_buffer2[[8]][["results"]][["vi_buffer2"]][["Ridge-MBO"]][["pred"]][["instance"]][["test.inds"]][[4]]

obs_pred_f4 <- as.matrix(task_reduced_cor[[2]]$env$data[pred_inds_fold4, getTaskFeatureNames(task_reduced_cor[[2]])])

pred <- predict(modf4, newx = obs_pred_f4, s = modf4$lambda.1se)

Calculate the error

truth <- task_reduced_cor[[2]]$env$data[pred_inds_fold4, "defoliation"]

mlr:::measureRMSE(truth, pred)
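Analogous to the check on the full feature set, let's inspect the distribution of the new predictions to see whether the out-of-bounds prediction persists with the correlation-filtered features (a sketch, not part of the original output):

# Sketch: distribution of predictions on the correlation-filtered task
qplot(as.numeric(pred), geom = "histogram")
as.numeric(pred)[as.numeric(pred) > 100]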

