knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message = FALSE)
library(tidyverse) library(tidymodels)
data("GermanCredit",package = "caret") raw_data = read_csv(paste0(Sys.getenv("USERPROFILE"), "\\OneDrive - Bank Of Israel\\Data", "\\applied_predictive_modeling", "\\german_credit_data.csv"))
raw_data = raw_data %>% mutate(job = recode(job, `0` = "unskilled and non-resident", `1` = "unskilled and resident", `2` = "skilled", `3` = "highly_skilled")) %>% mutate(across(where(is.character),~str_replace_all(.,"\\s|/","_")))
raw_data %>% skimr::skim_without_charts()
raw_data %>% count(target) %>% mutate(n = n / sum(n)) %>% ggplot(aes(target,n)) + geom_col(width = 0.5) + scale_y_continuous(labels = scales::percent_format()) + xlab(NULL) + ylab(NULL)
raw_data %>% ggplot(aes(x = 0, y = duration)) + geom_jitter() + xlab(NULL) + ylab(NULL) + theme(axis.text.x = element_blank())
Rare instances at large duration
raw_data %>% select(where(is.numeric), target) %>% pivot_longer(-target) %>% ggplot(aes(x = target,value)) + geom_jitter(aes(color = target)) + facet_wrap(~name, scales = "free")
raw_data %>% select(where(is.numeric), target) %>% pivot_longer(-target) %>% ggplot(aes(x = value, fill = target)) + geom_density(alpha = 0.5) + facet_wrap(~name, scales = "free")
Age: younger default more
Credit amount: small credit default less
Duration: short duration (less than 20) default less
raw_data %>% select(where(is.character), target) %>% pivot_longer(-target) %>% count(target, name, value,name = "count") %>% group_by(name, value) %>% mutate(count = count / sum (count)) %>% ungroup() %>% filter(target == "bad") %>% ggplot(aes(x = tidytext::reorder_within(value, count, name),y = count)) + geom_hline(yintercept = 0.5, linetype = "dashed", color = "blue") + geom_col(position = "dodge", width = 0.5) + tidytext::scale_x_reordered() + facet_wrap(~name, scales = "free") + theme(axis.text.x = element_text(size = 7))
data_split = initial_split(raw_data,prop = 0.8,strata = target) train_set = training(data_split) test_set = testing(data_split) rm(data_split)
preproc_rec = recipe(target ~ .,data = train_set) %>% step_impute_mode(all_nominal_predictors()) %>% step_dummy(all_nominal_predictors())
# logit_model = logistic_reg() svm_model = svm_rbf(cost = tune()) %>% set_mode("classification")
tune_grid = grid_regular(cost(range = c(-2,7), trans = log2_trans()),levels = 10) resample_folds = vfold_cv(train_set,repeats = 5)
svm_wf = workflow() %>% add_recipe(preproc_rec) %>% add_model(svm_model) svm_rs = svm_wf %>% tune_grid(resamples = resample_folds, grid = tune_grid)
svm_rs %>% collect_metrics() %>% filter(.metric == "accuracy") %>% ggplot(aes(x = cost, y = mean)) + geom_point() + geom_line() + scale_x_continuous(trans = "log2") + geom_errorbar(aes(x = cost, ymax = mean + std_err, ymin = mean - std_err), width = 0.5)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.