```r
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  message = FALSE,
  warning = FALSE
)
```
```r
library(risk3r)
library(broom)
library(dplyr)
library(ggplot2)

theme_set(
  theme_minimal(base_size = 21) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
)

if (require(showtext)) {
  sysfonts::font_add_google("IBM Plex Sans", "plex")
  showtext::showtext_auto()
}
```
Let's fit a very raw model and check its coefficients:
```r
library(risk3r)
library(broom)
library(dplyr)
library(ggplot2)

data("credit_woe")

credit_woe <- select(credit_woe, -id_client_woe)

train_data <- head(credit_woe, 20000)
test_data  <- tail(credit_woe, 20000)

model_raw <- glm(bad ~ ., family = binomial, data = train_data)

gg_model_coef(model_raw) +
  labs(
    title = "Yikes!",
    subtitle = "Some parameters are negative: multicollinearity"
  )
```
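One way to confirm the multicollinearity hinted at by the sign flips is to look at variance inflation factors. A minimal sketch, assuming the `car` package is available (it is not used elsewhere in this vignette):

```r
# Variance inflation factors: values well above 5-10 suggest
# strongly correlated predictors in the raw model.
car::vif(model_raw) |>
  sort(decreasing = TRUE) |>
  head(10)
```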
We'll use a simple random forest as a benchmark.
```r
model_rf <- randomForest::randomForest(
  bad ~ .,
  data = train_data,
  do.trace = FALSE,
  ntree = 100
)

metrics(train_data$bad, predict(model_rf))
```
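To make the benchmark comparable with the models evaluated later on `test_data`, we can also score the hold-out sample. A minimal sketch, assuming `metrics()` takes the actual and predicted values exactly as in the chunk above:

```r
# Benchmark performance on the hold-out sample.
metrics(test_data$bad, predict(model_rf, newdata = test_data))
```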
This is a wrapper of `glmnet::cv.glmnet()`.

From https://glmnet.stanford.edu/articles/glmnet.html, there are two options for `S`: `lambda.min` and `lambda.1se`; with the latter you get a more regularized model.

This wrapper around the glmnet package takes a model as input and returns the model with the non-zero variables from the `glmnet::cv.glmnet()` fit, according to the selected `S` option. The function reorders the variables in the order in which their coefficients become non-zero along the glmnet path (check the plots when running this function).
```r
model_fsglmnet <- featsel_glmnet(model_raw, S = "lambda.1se", trace.it = FALSE)

gg_model_coef(model_fsglmnet)
```
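If you want to see what the wrapper does under the hood, the non-zero coefficients can be extracted directly from `glmnet::cv.glmnet()`. A minimal sketch on the same `train_data` split (not necessarily the exact call the wrapper makes):

```r
# Cross-validated lasso on the same design matrix the glm uses.
x_train <- model.matrix(bad ~ ., data = train_data)[, -1]
cvfit   <- glmnet::cv.glmnet(x_train, train_data$bad, family = "binomial")

# Coefficients at lambda.1se; the zero ones are the dropped variables
# (the intercept will always appear among the non-zero terms).
coefs <- coef(cvfit, s = "lambda.1se")
rownames(coefs)[as.vector(coefs != 0)]
```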
This is a wrapper for `stats::step()`, but the starting point is the null model: `response ~ 1`.
```r
model_fsstep <- featsel_stepforward(model_raw, trace = 0)

gg_model_coef(model_fsstep)
```
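For reference, a roughly equivalent call with `stats::step()` directly, starting from the null model (a sketch of the idea; the wrapper may use different defaults):

```r
# Forward selection by AIC, starting from the intercept-only model
# and allowing all the variables in the raw model.
model_null <- glm(bad ~ 1, family = binomial, data = train_data)

model_step_manual <- step(
  model_null,
  scope = formula(model_raw),
  direction = "forward",
  trace = 0
)
```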
This is a wrapper of `celavi::feature_selection()`, which selects variables using loss-function permutations; see https://ema.drwhy.ai/featureImportance.html for background.
```r
model_lss_prmt <- featsel_loss_function_permutations(model_raw)

gg_model_coef(model_lss_prmt)
```
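The underlying idea is permutation-based variable importance, as described in the EMA chapter linked above. A minimal sketch of that idea with `DALEX` (assuming it is installed; this is an illustration, not the implementation `featsel_loss_function_permutations()` uses):

```r
# Permutation importance: how much does the loss increase when a
# variable is shuffled?
explainer <- DALEX::explain(
  model_raw,
  data = select(train_data, -bad),
  y = train_data$bad,
  verbose = FALSE
)

vip <- DALEX::model_parts(explainer, B = 5)
plot(vip)
```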
```r
models <- list(
  `raw`      = model_raw,
  `glmnet`   = model_fsglmnet,
  `stepwise` = model_fsstep,
  `loss`     = model_lss_prmt
)

models |>
  purrr::map_df(broom::tidy, .id = "model") |>
  select(model, term) |>
  mutate(value = 1) |>
  tidyr::spread(model, value) |>
  mutate(across(where(is.numeric), ~ tidyr::replace_na(.x, 0))) |>
  select(term, raw, stepwise, glmnet, loss) |>
  arrange(desc(raw), desc(stepwise), desc(glmnet), desc(loss))

models |>
  purrr::map_df(model_metrics, newdata = test_data, .id = "model")
```
Let's suppose you have a nice model, but you want to reduce the number of variables. Maybe you want to check:
```r
model_partials(model_fsstep)

gg_model_partials(model_fsstep) +
  ggplot2::facet_wrap(
    ggplot2::vars(.data$key),
    ncol = 2,
    scales = "free_y"
  )
```
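Once the partial metrics point to a variable with little marginal contribution, you can refit without it using `stats::update()` and compare the metrics again. A sketch with a placeholder name (`some_variable_woe` is hypothetical, not a column of `credit_woe`):

```r
# Drop a low-contribution variable (placeholder name) and compare
# the full and reduced models on the hold-out sample.
model_reduced <- update(model_fsstep, . ~ . - some_variable_woe)

list(full = model_fsstep, reduced = model_reduced) |>
  purrr::map_df(model_metrics, newdata = test_data, .id = "model")
```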