knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  message = FALSE,
  warning = FALSE
)
library(risk3r)
library(broom)
library(dplyr)
library(ggplot2)

theme_set(
  theme_minimal(base_size = 21) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
  )

if (require(showtext)) {
  sysfonts::font_add_google("IBM Plex Sans", "plex")
  showtext::showtext_auto()
}

Raw Model

Let's fit a very raw model as a first check:

library(risk3r)
library(broom)
library(dplyr)
library(ggplot2)

data("credit_woe")

credit_woe <- select(credit_woe, -id_client_woe)
train_data <- head(credit_woe, 20000)
test_data  <- tail(credit_woe, 20000)

model_raw <- glm(bad ~ ., family = binomial, data = train_data)

gg_model_coef(model_raw) +
  labs(
    title = "Yikes!",
    subtitle = "Some parameter are negative, multicollinearity"
    )
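
Negative coefficients on WoE variables usually point to correlated predictors. As a quick check (a hedged aside: the car package is assumed to be installed and is not part of risk3r), variance inflation factors show the worst offenders:

sort(car::vif(model_raw), decreasing = TRUE)  # VIF per predictor, largest first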

Satellite Model

We'll use a simple random forest as a benchmark.

model_rf <- randomForest::randomForest(bad ~ .,
                                       data = train_data,
                                       do.trace = FALSE,
                                       ntree = 100)

metrics(train_data$bad, predict(model_rf))
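
Note that predict() without newdata returns the out-of-bag predictions. To benchmark on the same held-out data used later for the GLMs, we can score the test set too (same metrics() helper as above):

metrics(test_data$bad, predict(model_rf, newdata = test_data))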

Lasso or Elastic-net Regularization

This is a wrapper of `glmnet::cv.glmnet()`.

From https://glmnet.stanford.edu/articles/glmnet.html: there are two options for `S`, `lambda.min` and `lambda.1se`; with the latter you get a more regularized model.

This wrapper around the glmnet package takes a model as input and returns the model refitted with only the variables that have non-zero coefficients in `glmnet::cv.glmnet()`, according to the selected `S` option. The function reorders the variables in the order in which their coefficients become non-zero along the regularization path (check the plots produced when running this function).
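
For reference, here is a minimal sketch of what happens under the hood when calling glmnet directly (object names are illustrative, not part of risk3r):

x <- model.matrix(bad ~ ., data = train_data)[, -1]  # drop the intercept column
y <- train_data$bad

cvfit <- glmnet::cv.glmnet(x, y, family = "binomial")

plot(cvfit)                   # cross-validated error vs lambda
coef(cvfit, s = "lambda.1se") # sparse coefficients at the more regularized lambda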

model_fsglmnet <- featsel_glmnet(model_raw, S = "lambda.1se", trace.it = FALSE)

gg_model_coef(model_fsglmnet)

Stepwise Forward

This is a wrapper for `stats::step()`, but the starting point is the null model: `response ~ 1`.
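
Roughly, this corresponds to calling stats::step() by hand from the null model (a sketch; featsel_stepforward may differ in details):

null_model <- glm(bad ~ 1, family = binomial, data = train_data)

model_step_manual <- step(
  null_model,
  scope     = formula(terms(model_raw)),  # upper scope: the full model, dot expanded
  direction = "forward",
  trace     = 0
)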

model_fsstep <- featsel_stepforward(model_raw, trace = 0)

gg_model_coef(model_fsstep)

Repeated Rounds of Drop-Out Loss

From https://ema.drwhy.ai/featureImportance.html

This is a wrapper of `celavi::feature_selection()`.
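
The underlying idea, as a minimal sketch (assuming the pROC package for an AUC-based loss; celavi repeats this over several rounds and drops variables whose permutation barely hurts the loss):

loss_auc <- function(y, p) 1 - as.numeric(pROC::auc(y, p, quiet = TRUE))

base_loss <- loss_auc(train_data$bad, predict(model_raw, type = "response"))

drop_out_loss <- sapply(setdiff(names(train_data), "bad"), function(v) {
  d      <- train_data
  d[[v]] <- sample(d[[v]])  # permute one predictor, keep the model fixed
  loss_auc(d$bad, predict(model_raw, newdata = d, type = "response")) - base_loss
})

sort(drop_out_loss, decreasing = TRUE)  # bigger increase in loss = more important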

model_lss_prmt <- featsel_loss_function_permutations(model_raw)

gg_model_coef(model_lss_prmt)

Comparison

models <- list(
  `raw`      = model_raw,
  `glmnet`   = model_fsglmnet,
  `stepwise` = model_fsstep,
  `loss`     = model_lss_prmt
)

models |> 
  purrr::map_df(broom::tidy, .id = "model") |> 
  select(model, term) |> 
  mutate(value = 1) |> 
  tidyr::pivot_wider(names_from = model, values_from = value, values_fill = 0) |> 
  select(term, raw, stepwise, glmnet, loss) |> 
  arrange(desc(raw), desc(stepwise), desc(glmnet), desc(loss))


models |> 
  purrr::map_df(model_metrics, newdata = test_data, .id = "model")
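
A quick way to weigh parsimony against performance is to count the selected variables in each model (a small helper sketch, not a risk3r function):

purrr::map_int(models, ~ length(coef(.x)) - 1L)  # predictors per model, intercept excluded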

Manual

Let's suppose you have a nice model, but you want to reduce the number of variables.

Maybe you want to check:

model_partials(model_fsstep)

gg_model_partials(model_fsstep) +
    ggplot2::facet_wrap(
      ggplot2::vars(.data$key), ncol = 2,
      scales = "free_y"
      )
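
From there you can drop the weakest variable by hand and re-check the metrics. A sketch using base R's update() (the variable to drop is picked programmatically here, purely for illustration):

last_var    <- tail(attr(terms(model_fsstep), "term.labels"), 1)
model_small <- update(model_fsstep, as.formula(paste(". ~ . -", last_var)))

model_metrics(model_small, newdata = test_data)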

