# Global knitr chunk options: show code, hide warnings and messages in output.
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
# general
library(dplyr)
library(ggplot2)
library(patchwork)

# main
library(risk3r)
library(scorecard)

# descriptive & premodelling
library(skimr)
library(xray)
library(moments)
library(rsample)

# modelling
library(randomForest)

# post modelling
library(performance)
library(parameters)
library(broom)

data("credit")

# Optional custom ggplot2 theme + fonts, only when kunstomverse is installed.
if(require(kunstomverse)){
  library(extrafont)
  extrafont::loadfonts(quiet = TRUE)
  # THEME kept as a function so it can be added to individual plots later.
  THEME <- kunstomverse::theme_knst_segoe
  theme_set(kunstomverse::theme_knst_segoe())
}
# Central list of tuning parameters used throughout the analysis.
PARAMETERS <- list(
  SEED = 12345,             # RNG seed for reproducibility
  P_TRAIN = 0.7,            # share of observations in the training split
  P_CONCENTRATION = .95,    # near-zero-variance threshold (max category share)
  IV_MIN = 0.02,            # WEAK >= minimum information value to keep a variable
  COUNT_DISTR_LIMIT = 0.05, # minimum bin share passed to scorecard::woebin
  CORRELATION_LIMIT = 0.5,  # absolute correlation above which a variable is dropped
  POINTS0 = 600,            # scorecard scaling: base points
  PDO = 20,                 # scorecard scaling: points to double the odds
  ODDS0 = 1/50              # scorecard scaling: base odds
)
# Extend skimr's default summary with extra statistics per column type.
custom_skim <- skim_with(
  numeric = sfl(
    # partial() pins na.rm = TRUE so NAs never poison the summaries
    median = purrr::partial(median, na.rm = TRUE),
    skewness = purrr::partial(moments::skewness, na.rm = TRUE),
    kurtosis = purrr::partial(moments::kurtosis, na.rm = TRUE),
    count_distr_max = risk3r::count_distr_max,
    n_unique = skimr::n_unique
  ),
  factor = sfl(
    hhi = risk3r::hhi,
    # .dir = "forward": compute hhi first, then label it
    hhi_lbl = purrr::compose(risk3r::hhi, risk3r::hhi_label, .dir = "forward"),
    count_distr_max = risk3r::count_distr_max
  ),
  append = TRUE
)

# Quick smoke test:
# custom_skim(data %>% select(1:10))
# Select the input dataset. Every branch produces a tibble `data` with a
# factor response `bad_good` leveled c("good", "bad") and factor predictors.
OPTION <- 3

if(OPTION == 1) {
  # Private dataset from disk.
  data <- readRDS(here::here("data-raw/cd.rds"))
  # data <- data %>%
  #   select(1:300, where(is.character)) %>% sample_n(100000)
  # data %>% select(1:10) %>% glimpse()
  data <- data %>%
    mutate(
      bad_good = ifelse(bad_good == 1, "bad", "good"),
      bad_good = factor(bad_good, levels = c("good", "bad"))
    ) %>%
    mutate(across(where(is.character), as.factor))
} else if (OPTION == 2) {
  # German credit data shipped with {scorecard}.
  data <- scorecard::germancredit %>%
    as_tibble() %>%
    rename(bad_good = creditability) %>%
    select(bad_good, everything()) %>%
    mutate(bad_good = forcats::fct_relevel(bad_good, "good", "bad")) %>%
    mutate(across(where(is.character), as.factor)) %>%
    mutate(id = row_number(), .before = 1)
} else if (OPTION == 3) {
  # Home-equity (hmeq) data shipped with {risk3r}.
  data <- risk3r::hmeq %>%
    rename(bad_good = bad) %>%
    select(bad_good, everything()) %>%
    mutate(bad_good = ifelse(bad_good == 1, "bad", "good")) %>%
    mutate(bad_good = forcats::fct_relevel(bad_good, "good", "bad")) %>%
    mutate(across(where(is.character), as.factor)) %>%
    mutate(id = row_number(), .before = 1)
}

dim(data)
# Full descriptive summary of every column using the extended skimmer.
describe <- custom_skim(data)

# use as.list to export skimr (risk3r:::as.list.skim_df)
# writexl::write_xlsx(as.list(describe), here::here("data-raw/describe.xlsx"))
Remove variables with zero variance and near-zero variance.
# Collapse the per-type skim partitions into one tibble with just the
# columns needed for the variance screening.
describe_min <- skimr::partition(describe) %>%
  purrr::map_df(
    ~ select(.x, variable = skim_variable, n_unique, count_distr_max)
  ) %>%
  tibble::as_tibble()

# Zero-variance: a single distinct value.
vars_to_remove_unique_value <- describe_min %>%
  filter(n_unique == 1) %>%
  pull(variable)

# Near-zero-variance: the modal value concentrates at least P_CONCENTRATION.
vars_to_remove_near_0_var <- describe_min %>%
  filter(count_distr_max >= PARAMETERS$P_CONCENTRATION) %>%
  pull(variable)

vars_to_remove_unique_value

# Keep the two removal lists disjoint.
vars_to_remove_near_0_var <- setdiff(vars_to_remove_near_0_var, vars_to_remove_unique_value)
Or using scorecard:
# Same screening done with scorecard::var_filter, for comparison.
scrd_filter <- scorecard::var_filter(
  data,
  y = "bad_good",
  # iv_limit and missing_limit are required arguments even though we only
  # care about identical_limit here; these values make them no-ops.
  iv_limit = 0,
  missing_limit = 1,
  identical_limit = PARAMETERS$P_CONCENTRATION,
  return_rm_reason = TRUE
)

# Variables the filter would remove, and why.
scrd_filter$rm %>%
  as_tibble() %>%
  filter(!is.na(rm_reason)) %>%
  arrange(variable)
From https://github.com/sicarul/xray.
# Third opinion on data quality: xray flags NAs, blanks, infinites, etc.
describe_anomalies <- xray::anomalies(data)

as_tibble(describe_anomalies$variables)

as_tibble(describe_anomalies$problem_variables)
There are some differences versus the previous results because scorecard::var_filter
excludes NAs when calculating the identical limit.
We usually also want to remove other variables, such as operation/customer IDs or temporal/time variables.
# Administrative columns to drop regardless of predictive content; any_of()
# tolerates their absence in the current dataset.
vars_to_remove_specials <- c("time", "id")

# Drop zero-variance, near-zero-variance and special variables in one pass.
data <- data %>%
  select(
    -all_of(vars_to_remove_unique_value),
    -all_of(vars_to_remove_near_0_var),
    -any_of(vars_to_remove_specials)
  )
From: https://www.tidymodels.org/start/recipes/#data-split
# Fix the RNG seed so the split (and everything downstream that uses random
# numbers) is reproducible.
set.seed(PARAMETERS$SEED)

data_split <- rsample::initial_split(data, prop = PARAMETERS$P_TRAIN)

# Materialize the two partitions.
train_data <- rsample::training(data_split)
test_data <- rsample::testing(data_split)
We can do check:
# Check the class balance is similar across the two partitions.
# (The original left the proportion column unnamed; `prop` is clearer.)
train_data %>% count(bad_good) %>% mutate(prop = n / sum(n))
test_data %>% count(bad_good) %>% mutate(prop = n / sum(n))
We'll start with the default method = "tree"
# Supervised binning of every predictor against the response using the
# default recursive-partitioning ("tree") method.
bins <- scorecard::woebin(
  train_data,
  y = "bad_good",
  method = "tree",
  count_distr_limit = PARAMETERS$COUNT_DISTR_LIMIT
)
Some sanity checks:
# Sanity check: compare the NA count of a variable with how woebin binned
# its missing category (replace variable_with_missing with a real column).
# train_data$variable_with_missing %>% is.na() %>% table()
# bins$variable_with_missing
The risk3r package provides woebin_summary
to get the summary.
# One row per variable: IV, monotonicity, breaks, among other information.
dbiv <- woebin_summary(bins)

dbiv

glimpse(dbiv)
Note that the dbiv
object has a breaks
column, among other information.
In some cases, monotonic WoEs are required for continuous variables. With the
dbiv
object we can check which variables have high information value but are not monotonic
(excluding the missing category, of course).
# Numeric variables with at least weak predictive power whose WoE pattern is
# not monotonic — candidates for manual re-binning.
# (Uses PARAMETERS$IV_MIN instead of the hard-coded 0.02 for consistency.)
dbiv %>%
  filter(
    iv > PARAMETERS$IV_MIN, # above the weak IV threshold (0.02)
    !factor,                # is numeric
    !monotone               # is not monotonic
  )
risk3r::woebin2
— a wrapper around the main scorecard::woebin
function —
implements method = "ctree"
via the partykit package (we'll try to implement monotonic binning from the mob package https://github.com/statcompute/mob).
Having this we can iterate among all available methods:
# Bin every variable with each available method and stack the per-variable
# summaries into one data frame tagged by method.
# Fixes: removed the stale, misleading default `m = "chimerge"` (the methods
# iterated are "tree" and "ctree"); renamed the local result so it no longer
# shadows the global `bins` object.
binssums <- purrr::map_df(c("tree", "ctree"), function(m) {
  # message(m)
  m_bins <- risk3r::woebin2(
    train_data,
    y = "bad_good",
    count_distr_limit = PARAMETERS$COUNT_DISTR_LIMIT,
    method = m
  )
  # Tag each summary row with the method that produced it.
  woebin_summary(m_bins) %>% mutate(method = m, .before = 1)
})
We can check, for each variable: - Every method is generated for every variable - The methods give the same IV for some variables
# Each method should produce one summary row per variable.
binssums %>% count(method)

# How often the methods agree on a variable's IV (n = methods sharing it).
binssums %>% count(variable, iv) %>% count(n)
Get the best breaks, in the sense of maximizing the IV; then extract the breaks
and finally use scorecard::woebin
.
# For each variable keep the method that maximizes IV (slice(1) breaks ties).
binbest <- binssums %>%
  group_by(variable) %>%
  arrange(desc(iv)) %>%
  filter(iv == max(iv)) %>%
  slice(1) %>%
  ungroup()

# Which method won, per variable.
binbest %>% count(method)

# Named list variable -> breaks, the format woebin's breaks_list expects.
best_breaks <- binbest %>% select(variable, breaks) %>% tibble::deframe()

# Re-bin with the winning breaks. BUG FIX: the original assigned this to
# `bin` and then summarized the stale `bins`, so the optimized breaks were
# never used; assigning to `bins` makes the downstream woebin_ply() and
# scorecard() calls use the best binning.
bins <- scorecard::woebin(
  train_data,
  y = "bad_good",
  count_distr_limit = PARAMETERS$COUNT_DISTR_LIMIT,
  breaks_list = best_breaks
)

dbiv <- woebin_summary(bins)

dbiv

glimpse(dbiv)
Apply the WoE transformation to the train and test tables.
Just remove variables with low predictive power.
# Variables below the minimum information value threshold.
vars_to_remove_low_iv <- dbiv %>%
  filter(iv < PARAMETERS$IV_MIN) %>%
  pull(variable)
In risk3r we can get a data frame with correlations between WoEs and IVs to decide which variables to remove.
# upper = FALSE keeps the full (non-upper-triangular) correlation table;
# the filtering steps in the next chunk rely on having every pair twice.
dcors <- woebin_cor_iv(train_data, bins, upper = FALSE)

dcors
Now, we can get the variables which have high correlation
# Reduce the full n x n correlation table step by step; the commented
# nrow() identities document the expected size after each filter.
n <- length(bins)

vars_to_rm_correlation <- dcors
# nrow(vars_to_rm_correlation) == n * n

# remove self-pairs (the diagonal)
vars_to_rm_correlation <- vars_to_rm_correlation %>% filter(var1 != var2)
# nrow(vars_to_rm_correlation) == n * (n-1)

# keep only the upper triangle (each pair once, var1 ranked above var2 by IV)
vars_to_rm_correlation <- vars_to_rm_correlation %>% filter(var1_rank < var2_rank)
# nrow(vars_to_rm_correlation) == n * (n - 1) /2

# among highly correlated pairs, drop var2 — the variable with the lower IV
# with respect to its partner
vars_to_rm_correlation <- vars_to_rm_correlation %>%
  filter(abs(r) >= PARAMETERS$CORRELATION_LIMIT) %>%
  distinct(var2) %>%
  pull() %>%
  as.character()
# Never drop a variable twice: low-IV removals take precedence over the
# correlation-based ones.
vars_to_rm_correlation <- setdiff(vars_to_rm_correlation, vars_to_remove_low_iv)

# Remove both groups from the training set in one pass.
train_data <- train_data %>%
  select(
    -all_of(vars_to_remove_low_iv),
    -all_of(vars_to_rm_correlation)
  )
Just scorecard::woebin_ply
:)
# class before woes
# NOTE(review): purrr::map_chr(class) errors if any column carries more than
# one class (e.g. ordered factors, as in scorecard::germancredit) — consider
# map_chr(~ class(.x)[1]) for robustness.
train_data %>% purrr::map_chr(class) %>% table()

# Replace the raw predictor values with their WoE values.
train_data <- as_tibble(scorecard::woebin_ply(train_data, bins))
test_data <- as_tibble(scorecard::woebin_ply(test_data, bins))

# check all is numeric except response variable
train_data %>% purrr::map_chr(class) %>% table()
# NOTE(review): model_raw, model_fsstep and model_fsglmnet are fitted in a
# chunk not shown in this excerpt — presumably a full GLM plus stepwise and
# glmnet-based selections; confirm against the original document.
models <- list(
  model_raw,
  model_fsstep,
  model_fsglmnet
  # model_lss_prmt
)

# One row per model with its terms and train/test metrics.
dmodels <- tibble(model = models) %>%
  mutate(
    model_name = c("raw", "stepwise", "glmnet"),
    .before = 1
  ) %>%
  mutate(
    terms = purrr::map(model, broom::tidy),
    n_variables = purrr::map_dbl(terms, nrow),
    metrics = purrr::map(model, model_metrics),
    metrics_validation = purrr::map(model, model_metrics, newdata = test_data)
  )

dmodels

# Coefficient heatmap: which terms enter each model, and with what estimate.
dmodels %>%
  select(model_name, terms) %>%
  tidyr::unnest(cols = c(terms)) %>%
  select(model_name, term, estimate) %>%
  filter(term != "(Intercept)") %>%
  ggplot() +
  geom_tile(aes(term, model_name, fill = estimate)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Train vs test metrics, one facet per metric.
dmodels %>%
  select(model_name, metrics, metrics_validation) %>%
  tidyr::gather(sample, value, -model_name) %>%
  tidyr::unnest(cols = c(value)) %>%
  tidyr::gather(metric, value, -model_name, -sample) %>%
  mutate(sample = ifelse(sample == "metrics", "train", "test")) %>%
  ggplot() +
  geom_point(aes(model_name, value, color = sample), size = 2) +
  facet_wrap(vars(metric), scales = "free_y") +
  scale_color_viridis_d(begin = 0.3, end = 0.8) +
  scale_y_continuous(limits = c(0, NA), labels = scales::percent)
See if we can remove variables by inspecting predictive measures.
# Number of terms per model.
models %>% purrr::map(broom::tidy) %>% purrr::map(nrow)

models %>% purrr::map(broom::tidy) %>% purrr::map(head)

# Chosen final model: the glmnet-selected one.
model_final <- model_fsglmnet

# Cumulative metrics as variables enter the model, evaluated on the test set.
dfmetrics <- model_partials(model_final, newdata = test_data)
dfmetrics

dfmetrics %>%
  select(-gini, -iv) %>%
  risk3r:::plot.model_partials() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1, size = 5)) +
  ggplot2::ylim(0, NA)
{performance}
# Standard diagnostic plots for the fitted model.
# NOTE(review): the name "packagecheck" looks like the heading word "package"
# fused with "check" during extraction; the object is not reused, so it is
# harmless — confirm against the original Rmd.
packagecheck <- performance::check_model(model_final)

# Collinearity (VIF) check; THEME is defined only when kunstomverse is
# installed (see the setup chunk).
performance::check_collinearity(model_final) %>%
  plot() +
  THEME() +
  ggplot2::coord_flip()

car::vif(model_final)
# Scorecard scaling parameters, taken from the global PARAMETERS list so
# they cannot drift from the values declared at the top of the analysis
# (the original re-hardcoded 600 / 20 / 1/50 here).
POINTS0 <- PARAMETERS$POINTS0
PDO <- PARAMETERS$PDO
ODDS0 <- PARAMETERS$ODDS0

# Scorecard built from the fitted model and the final bins.
scrcrd <- scorecard::scorecard(
  bins = bins,
  model = model_final,
  points0 = POINTS0,
  pdo = PDO,
  odds0 = ODDS0,
  basepoints_eq0 = TRUE
)

scrcrd[[2]]

# Alternative: rebuild the scorecard from the raw training data, restricted
# to the variables that actually entered the final model (woe suffix removed
# to recover the original variable names).
scrcrd2 <- scorecard::scorecard2(
  dt = training(data_split),
  y = "bad_good",
  bins = bins,
  x = stringr::str_remove(names(coef(model_final))[-1], "_woe$"),
  points0 = POINTS0,
  pdo = PDO,
  odds0 = ODDS0,
  basepoints_eq0 = TRUE
)

scrcrd2[[2]]
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.