In MichaelGurkov/LearningMaterials: What the Package Does (One Line, Title Case)

knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message = FALSE)

library(tidyverse)

library(tidymodels)

data("GermanCredit",package = "caret")

raw_data = read_csv(paste0(Sys.getenv("USERPROFILE"),
                           "\\OneDrive - Bank Of Israel\\Data",
                           "\\applied_predictive_modeling",
                           "\\german_credit_data.csv"))

raw_data = raw_data %>% 
  mutate(job = recode(job,
                      `0` = "unskilled and non-resident",
                      `1` = "unskilled and resident",
                      `2` = "skilled",
                      `3` = "highly_skilled")) %>% 
  mutate(across(where(is.character),~str_replace_all(.,"\\s|/","_")))

Eda

raw_data %>% 
  skimr::skim_without_charts()

raw_data %>% 
  count(target) %>% 
  mutate(n = n / sum(n)) %>% 
  ggplot(aes(target,n)) + 
  geom_col(width = 0.5) + 
  scale_y_continuous(labels = scales::percent_format()) + 
  xlab(NULL) + ylab(NULL)

raw_data %>% 
  ggplot(aes(x = 0, y = duration)) + 
  geom_jitter() + 
  xlab(NULL) + ylab(NULL) + 
  theme(axis.text.x = element_blank())

Rare instances at large duration

Numeric features

raw_data %>% 
  select(where(is.numeric), target) %>% 
  pivot_longer(-target) %>% 
  ggplot(aes(x = target,value)) + 
  geom_jitter(aes(color = target)) + 
  facet_wrap(~name, scales = "free")

raw_data %>% 
  select(where(is.numeric), target) %>% 
  pivot_longer(-target) %>% 
  ggplot(aes(x = value, fill = target)) + 
  geom_density(alpha = 0.5) + 
  facet_wrap(~name, scales = "free")

Age: younger default more
Credit amount: small credit default less
Duration: short duration (less than 20) default less

Categoric features

raw_data %>% 
  select(where(is.character), target) %>% 
  pivot_longer(-target) %>% 
  count(target, name, value,name = "count") %>% 
  group_by(name, value) %>% 
  mutate(count = count / sum (count)) %>% 
  ungroup() %>% 
  filter(target == "bad") %>% 
  ggplot(aes(x = tidytext::reorder_within(value, count, name),y = count)) + 
  geom_hline(yintercept = 0.5, linetype = "dashed", color = "blue") + 
  geom_col(position = "dodge", width = 0.5) + 
  tidytext::scale_x_reordered() + 
  facet_wrap(~name, scales = "free") + 
  theme(axis.text.x = element_text(size = 7))

Modeling

data_split = initial_split(raw_data,prop = 0.8,strata = target)

train_set = training(data_split)

test_set = testing(data_split)

rm(data_split)

preproc_rec = recipe(target ~ .,data = train_set) %>% 
  step_impute_mode(all_nominal_predictors()) %>% 
    step_dummy(all_nominal_predictors())

# logit_model = logistic_reg()

svm_model = svm_rbf(cost = tune()) %>%
   set_mode("classification")

tune_grid = grid_regular(cost(range = c(-2,7),
                              trans = log2_trans()),levels = 10)

resample_folds = vfold_cv(train_set,repeats = 5)

svm_wf = workflow() %>% 
  add_recipe(preproc_rec) %>% 
  add_model(svm_model)

svm_rs = svm_wf %>% 
  tune_grid(resamples = resample_folds, grid = tune_grid)

svm_rs %>% 
  collect_metrics() %>% 
  filter(.metric == "accuracy") %>% 
  ggplot(aes(x = cost, y = mean)) + 
  geom_point() + 
  geom_line() +
  scale_x_continuous(trans = "log2") + 
  geom_errorbar(aes(x = cost, ymax = mean + std_err,
                    ymin = mean - std_err), width = 0.5)