### Don't overload CRAN servers
### https://stackoverflow.com/questions/28961431/computationally-heavy-r-vignettes
## 'is_check' is TRUE when "CheckExEnv" is attached to the search path, or
## when either of the two '_R_CHECK_*' environment variables is defined
## (presumably a sign that the code runs under a package check - confirm).
is_check <- if ("CheckExEnv" %in% search()) {
  TRUE
} else {
  any(is.element(
    c("_R_CHECK_TIMINGS_", "_R_CHECK_LICENSE_"),
    names(Sys.getenv())
  ))
}
This short notebook illustrates basic usage of the OutlierTree library for explainable outlier detection using the Titanic dataset. For more details, you can check the package's documentation at CRAN or through R's help (e.g. `?outliertree::outlier.tree`). For a more interesting and interactive example, see the documentation of the main function (`outlier.tree`), which uses a larger dataset.
The dataset is very popular and can be downloaded from different sources, such as Kaggle or many university webpages. This vignette took it from the following link: https://github.com/jbryer/CompStats/raw/master/Data/titanic3.csv
The data comes bundled in the package so there is no need to download it from the link above.
## Attach the packages used by the code below
library(data.table)
library(kableExtra)
library(outliertree)

## Load the 'titanic' data and show its first 5 rows as a styled table
data("titanic")
kable_styling(kable(head(titanic, 5)))
## Capitalize column names and some values for easier reading.
## 'capitalize' upper-cases the first word character of each string in 'x'
## and leaves the rest of the string untouched.
capitalize <- function(x) gsub("^(\\w)", "\\U\\1\\E", x, perl = TRUE)

## Work on a data.table so that the updates below can be done by reference
titanic <- as.data.table(titanic)
setnames(titanic, capitalize(names(titanic)))
## Plain capitalization gives 'Sibsp'; rename it to the usual 'SibSp'
setnames(titanic, "Sibsp", "SibSp")
titanic[, Sex := capitalize(Sex)]

## Convert 'survived' to yes/no for easier reading
titanic[, Survived := ifelse(Survived, "Yes", "No")]

## Some columns are not useful, such as name (an ID), ticket number
## (another ID), or destination (too many values, many non-repeated)
titanic[, c("Name", "Ticket", "Home.dest") := NULL]

## Ordinal columns need to be passed as ordered factors
cols_ord <- c("Pclass", "Parch", "SibSp")
titanic[
  ,
  (cols_ord) := lapply(.SD, function(x) factor(x, ordered = TRUE)),
  .SDcols = cols_ord
]

## A look at the processed data
titanic |>
  head(5) |>
  kable() |>
  kable_styling()
library(outliertree)

## Fit model with default hyperparameters.
## The model is fitted only once: when 'is_check' (computed at the top of
## this file) is TRUE the fit is restricted to a single thread, otherwise
## the package's default number of threads is used.
otree <- if (is_check) {
  outlier.tree(titanic, nthreads = 1)
} else {
  outlier.tree(titanic)
}
otree
## Look at the raw rows behind the last 2 outliers (rows 1147 and 1164)
titanic[c(1147, 1164)]
## Histogram of 'Fare' within the group from which those two outliers were
## flagged (3rd class, no siblings/spouses, embarked at "Q")
hist(
  titanic[Pclass == 3 & SibSp == 0 & Embarked == "Q", Fare],
  breaks = 100,
  col = "navy",
  xlab = "Fare",
  main = "Distribution of Fare within cluster"
)
## Obtain the outlier information as an R object instead of printed text
## ('outliers_print = 0'), and extract the entry for row 1147
predict(
  otree,
  titanic,
  outliers_print = 0
)[[1147]]
## To programmatically get all the outliers that were flagged
pred <- predict(otree, titanic, outliers_print = 0)

## Keep only the entries whose 'outlier_score' is not missing.
## 'vapply' is used instead of 'sapply' so that the result is always a
## numeric vector with one score per entry (it errors instead of silently
## returning a list if some entry does not hold a single score).
scores <- vapply(pred, function(x) x$outlier_score, numeric(1))
only_flagged <- pred[!is.na(scores)]
## Print the outlier information of chosen rows only (here: row 1147)
print(
  pred,
  only_these_rows = 1147
)
## In order to flag more outliers, one can also experiment
## with lowering the threshold hyperparameters.
## The model is fitted only once: a single thread is used when 'is_check'
## (computed at the top of this file) is TRUE, and the package's default
## number of threads otherwise.
if (is_check) {
  outlier.tree(titanic, z_outlier = 6., outliers_print = 5, nthreads = 1)
} else {
  outlier.tree(titanic, z_outlier = 6., outliers_print = 5)
}
## One can also lower the gain threshold, but this tends
## to result in more spurious outliers which come from
## not-so-good splits (not recommended).
## The model is fitted only once: a single thread is used when 'is_check'
## (computed at the top of this file) is TRUE, and the package's default
## number of threads otherwise.
if (is_check) {
  outlier.tree(
    titanic,
    z_outlier = 6.,
    min_gain = 1e-6,
    outliers_print = 5,
    nthreads = 1
  )
} else {
  outlier.tree(
    titanic,
    z_outlier = 6.,
    min_gain = 1e-6,
    outliers_print = 5
  )
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.