# Global knitr chunk defaults: echo the source of every chunk and include
# each chunk in the rendered document.
knitr::opts_chunk$set(
  echo = TRUE,
  include = TRUE
)
# Use the machine-specific R 3.6.3 library only where it actually exists.
# Calling .libPaths() with a new path replaces the current library search
# path (keeping only the site and system libraries), so an unconditional call
# would drop the user library on machines without this directory.
local_lib <- "C:/Program Files/R/R-3.6.3/library"
if (dir.exists(local_lib)) {
  .libPaths(local_lib)
}

# One statement per line: several calls on a single line without a separator
# do not parse.
library(ADMR)
library(tidyverse)
library(checkmate)
library(magrittr)
library(data.table)
library(testthat)
# Record the R version and attached packages used to build this document.
utils::sessionInfo()
Currently the master branch contains the following functions; however, many others are available as open pull requests.
# List every object on the attached ADMR package's search-path entry.
ls(name = "package:ADMR")
Import the package's built-in testing datasets.
# Load the training and test datasets into the workspace. Each call goes on
# its own line; two calls on one line without a separator do not parse.
data(hh_train)
data(hh_test)
This data is the Californian house prices dataset; the target variable is "SalePrice".
# Name of the response column.
target <- "SalePrice"

# Columns present in both datasets, in the order they appear in hh_train.
# Computed once so both selections are guaranteed to use the same set.
shared_cols <- intersect(names(hh_train), names(hh_test))

# Training data keeps the shared columns plus the target; test data keeps
# only the shared columns.
hh_train %<>% select(all_of(c(shared_cols, target)))
hh_test %<>% select(all_of(shared_cols))

names(hh_train)
For each variable, find the number or percentage of NAs in a dataset
# Summarise missing values per variable of the training data.
na_vars <- ADMR::expl_na(
  df = hh_train,
  na.strings = NULL,
  ignore.case = FALSE
)
We can now see that we have some factors which are useless as they are mostly unknown
# Show the first rows after sorting by the `na` column, largest first.
na_vars %>%
  dplyr::arrange(-na) %>%
  head()
For each categorical variable, find the number of / percentage of each level
# Level counts / percentages for the categorical variables of the training
# data (char.level and num.level are passed through unchanged).
ADMR::expl_categorical(
  df = hh_train,
  char.level = 3,
  num.level = 2
)
This format can be easily plotted to show factor distributions for each factor (either in a loop or using ggplot2 facet functions)
Apply frequency encoding with the option to group the rarest levels
# Add some unknowns to show what it looks like.
# The assignment must sit on its own line: on the same line as the comment it
# is part of the comment and never runs.
hh_train$Neighborhood[5:10] <- NA
From the below output we can see that the factor Neighborhood has more levels than can usefully be modelled
# Count every Neighborhood level; exclude = NULL keeps NA in the tabulation.
table(hh_train[["Neighborhood"]], exclude = NULL)
We decide to frequency encode the factor
# Frequency-encode Neighborhood, requesting 5 levels; the remaining options
# are passed exactly as given.
nhood_freq <- ADMR::encode_freq(
  data = hh_train$Neighborhood,
  n_levels = 5,
  min_level_count = NULL,
  unknown_levels = NULL,
  unknown_treatment_method = 1
)
which leaves us with the new, transformed factor
# Turn the encoded values into a factor labelled with the encoder's levels
# (presumably `data` holds the codes and `levels` their labels -- confirm
# against the encode_freq documentation).
hh_train$nhood_freq <- factor(
  x = nhood_freq$data,
  labels = nhood_freq$levels
)

# Distribution of the new factor. This is a separate statement; two statements
# on one line without a separator do not parse.
table(hh_train$nhood_freq)
We build a model using standard R functions
# One statement per line: the original chained six statements on a single
# line, which does not parse.

# Fill missing garage build years with the year the house was built.
hh_train %<>% mutate(GarageYrBlt = coalesce(GarageYrBlt, YearBuilt))

# Restrict the training data to the modelled predictors plus the target.
model_vars <- c(
  "GarageYrBlt", "nhood_freq", "OverallCond", "ExterCond",
  target
)
hh_train %<>% select(all_of(model_vars))

# Gaussian GLM of SalePrice on every remaining column.
model <- stats::glm(
  formula = "SalePrice ~ .",
  family = "gaussian",
  data = hh_train
)
summary(model)

# In-sample predictions, stored alongside the training data.
hh_train$pred <- predict(object = model, newdata = hh_train)
We can now plot PDP plots for any factor
# PDP plot for a single variable, GarageYrBlt, with n_bins = 20.
ADMR::plot_PDP(
  data = hh_train,
  model = model,
  explain_col = "GarageYrBlt",
  n_bins = 20
)
We can also plot pairs of factors
# PDP plot for the pair GarageYrBlt / OverallCond, with n_bins = 10.
ADMR::plot_PDP(
  data = hh_train,
  model = model,
  explain_col = c("GarageYrBlt", "OverallCond"),
  n_bins = 10
)
Note: OverallCond rates the overall condition of the house on a scale from 10 (Very Excellent) to 1 (Very Poor).
We can now plot ALE plots for any factor
# ALE plot for GarageYrBlt, with n_bins = 20.
ADMR::plot_ALE(
  data = hh_train,
  model = model,
  explain_col = "GarageYrBlt",
  n_bins = 20
)
We can also plot useful global model outputs such as lift curves
# Lift curve of the in-sample predictions against the observed sale prices,
# using the function's default plotting engine.
ADMR::plot_lift_curve(
  actual = hh_train$SalePrice,
  predicted = hh_train$pred
)
Some users may require plotly and others ggplot2 so all plots work with both engines
# Same lift curve with plotly switched off (presumably falling back to
# ggplot2 -- confirm against the plot_lift_curve documentation).
ADMR::plot_lift_curve(
  actual = hh_train$SalePrice,
  predicted = hh_train$pred,
  use_plotly = FALSE
)
We can use the codebase to compute metrics
# One statement per line: three calls on a single line without a separator
# do not parse.
library(Metrics)

# Mean absolute error from ADMR, followed by the Metrics package's MAE on the
# same inputs for comparison.
ADMR::metric_mae(
  actual = hh_train$SalePrice,
  predicted = hh_train$pred,
  na.rm = TRUE
)
Metrics::mae(actual = hh_train$SalePrice, predicted = hh_train$pred)
We have many metrics, all of which share the same argument structure, meaning they can be used by other functions or in loops.
# Each metric takes the same actual / predicted arguments. The calls are on
# separate lines because four calls on one line without a separator do not
# parse.
ADMR::metric_rmse(actual = hh_train$SalePrice, predicted = hh_train$pred)
ADMR::metric_mae(actual = hh_train$SalePrice, predicted = hh_train$pred)
ADMR::metric_deviance(
  actual = hh_train$SalePrice,
  predicted = hh_train$pred,
  family = "gaussian"
)
ADMR::metric_nloglik(
  actual = hh_train$SalePrice,
  predicted = hh_train$pred,
  family = "gaussian"
)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.