knitr::opts_chunk$set( collapse = TRUE, comment = "#>" )
```r knitr::opts_chunk$set(echo = TRUE, include =TRUE)
```r library(codeBase) library(tidyverse) library(checkmate) library(magrittr) library(data.table) library(testthat) library(knitr) library(xgboost)
sessionInfo()
Currently the master branch contains the following functions however we have many others as pull request
ls("package:codeBase")
Importing the packages inbuilt testing datasets
data(hh_train) data(hh_test)
This data is the Californian house prices dataset and has a target of r "SalePrice"
target <- 'SalePrice' hh_train %<>% select(all_of(c(intersect(names(hh_train), names(hh_test)), target))) hh_test %<>% select(intersect(names(hh_train), names(hh_test))) names(hh_train)
For each variable, find the number or percentage of NA's in a dataset
na_vars <- codeBase::expl_na(df = hh_train, na.strings = NULL, ignore.case = FALSE)
We can now see that we have some factors which are useless as they are mostly unknown:
head(dplyr::arrange(na_vars, -na)) %>% kable()
For each categorical variable, find the number of / percentage of each level
codeBase::expl_categorical(df = hh_train, char.level = 3, num.level =2) %>% kable()
This format can be easily plotted to show factor distributions for each factor (either in a loop or using ggplot2 facet functions)
Find the mean, standard deviation, length, and NA count of numeric and integer variables, with the option to group by other variables
codeBase::expl_summary(hh_train, summarise_vars = "SalePrice", group_vars = "YrSold") %>% kable()
Apply frequency encoding with the option to group the rarest levels
# Add some unknowns to show what it looks like hh_train$Neighborhood[5:10] <- NA
From the below output we can see that the factor Neighborhood has more levels than can usefully be modelled
hh_train %>% group_by(Neighborhood) %>% summarise(Volume = n()) %>% kable()
We decide to frequency encode the factor
nhood_freq <- codeBase::encode_freq(data = hh_train$Neighborhood, n_levels = 5, min_level_count = NULL, unknown_levels = NULL, unknown_treatment_method = 1)
which gives us with the transformed factor
hh_train$nhood_freq <- factor(x = nhood_freq$data, labels = nhood_freq$levels) hh_train %>% group_by(nhood_freq) %>% summarise(Volume = n()) %>% kable()
We build a model using standard R functions
hh_train %<>% mutate(GarageYrBlt = coalesce(GarageYrBlt, YearBuilt)) model_vars <- c('GarageYrBlt', 'nhood_freq', 'OverallCond', 'YearBuilt', 'YrSold', target) hh_train %<>% select(all_of(model_vars)) dataset <- sample(1:10, nrow(hh_train), replace = TRUE) dTrn <- hh_train[dataset[dataset<8],] dVal <- hh_train[dataset[dataset>7],] mTrn <- model.matrix(~., select(dTrn, -SalePrice)) mVal <- model.matrix(~., select(dVal, -SalePrice)) xTrn <- xgb.DMatrix(mTrn, label = dTrn$SalePrice) xVal <- xgb.DMatrix(mVal, label = dVal$SalePrice) params_xgb <- list(nthread = 4 ,alpha = 0 ,lambda = 1 ,booster = 'gbtree' ,objective = "reg:squarederror" ,eval_metric = 'rmse' ,maximize = FALSE ) model_xgb <- xgb.train(data = xTrn ,params = params_xgb ,nrounds = 100 ,early_stopping_rounds = 250 ,print_every_n = 20 ,watchlist = list(train = xTrn, eval = xVal)) dTrn$pred <- predict(object = model_xgb, newdata = xTrn) dVal$pred <- predict(object = model_xgb, newdata = xVal) xgboost::xgb.importance(feature_names = colnames(mTrn), model = model_xgb)
We can now plot PDP plots for any factor
codeBase::plot_PDP(data = as.data.frame(mTrn), model = model_xgb, explain_col = 'GarageYrBlt', n_bins = 20, use_plotly = FALSE)
We can also plot useful global model outputs such as lift curves
codeBase::plot_lift_curve(actual = dTrn$SalePrice, predicted = dTrn$pred, use_plotly = FALSE)
Some users may require plotly and others ggplot2 so all plots work with both engines. This can be controlled with the use_plotly
parameter of the plotting functions.
We have many metrics all of which have the same construction of arguments meaning they can be used by other functions or in loops
codeBase::metric_rmse(actual = dTrn$SalePrice, predicted = dTrn$pred) codeBase::metric_mae(actual = dTrn$SalePrice, predicted = dTrn$pred) codeBase::metric_deviance(actual = dTrn$SalePrice, predicted = dTrn$pred, family = "gaussian") codeBase::metric_nloglik(actual = dTrn$SalePrice, predicted = dTrn $pred, family = "gaussian")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.