## Nothing
## ---- include = FALSE---------------------------------------------------------
### Don't overload CRAN servers
### https://stackoverflow.com/questions/28961431/computationally-heavy-r-vignettes
### TRUE when "CheckExEnv" is on the search path or when either of the
### _R_CHECK_TIMINGS_ / _R_CHECK_LICENSE_ environment variables is defined.
is_check <- is.element("CheckExEnv", search()) ||
  any(is.element(
    c("_R_CHECK_TIMINGS_", "_R_CHECK_LICENSE_"),
    names(Sys.getenv())
  ))
## ----message=FALSE------------------------------------------------------------
library(data.table)
library(kableExtra)
library(outliertree)
## Load the 'titanic' dataset and show its first 5 rows as a styled table
data("titanic")
kable_styling(kable(head(titanic, 5)))
## -----------------------------------------------------------------------------
## Helper used to capitalize column names and some values for easier reading:
## upper-cases the first character of each string when that character is a
## word character (\w); strings starting with anything else are left as-is.
capitalize <- function(x) {
  ## The pattern is anchored at the start, so at most one match exists
  sub("^(\\w)", "\\U\\1\\E", x, perl = TRUE)
}
## Work on a data.table so that the steps below can modify it by reference
titanic <- as.data.table(titanic)
## Capitalize every column name. setnames() renames by reference on the table
## itself, so there is no need to go through .SD inside `j` (which renames
## data.table's internal subset object and copies the table at every step).
setnames(titanic, names(titanic), capitalize(names(titanic)))
## capitalize() only upper-cases the first letter, so fix this name by hand
setnames(titanic, "Sibsp", "SibSp")
## Capitalize the values of 'Sex' as well (updated in place through `:=`)
titanic[, Sex := capitalize(Sex)]
## Recode 'Survived' as "Yes"/"No" labels for easier reading
titanic[, Survived := ifelse(Survived, "Yes", "No")]
## Drop the columns that are not useful: name (an ID), ticket number
## (another ID), and destination (too many values, many non-repeated)
titanic <- titanic[, !c("Name", "Ticket", "Home.dest")]
## Ordinal columns need to be passed as ordered factors: convert each of the
## listed columns in place, forwarding `ordered = TRUE` to factor()
cols_ord <- c("Pclass", "Parch", "SibSp")
titanic[
  , (cols_ord) := lapply(.SD, factor, ordered = TRUE),
  .SDcols = cols_ord
]
## First 5 rows of the processed data, rendered as a styled table
kable_styling(kable(head(titanic, 5)))
## ---- eval=FALSE--------------------------------------------------------------
# library(outliertree)
#
# ## Fit model with default hyperparameters
# otree <- outlier.tree(titanic)
# otree
## ---- echo=FALSE, comment=NA--------------------------------------------------
## Executed counterpart of the displayed (eval=FALSE) snippet above; the only
## difference in the call is the extra nthreads=1 argument.
library(outliertree)
## Fit model with default hyperparameters
## NOTE(review): nthreads=1 presumably restricts the fit to a single thread so
## that building the vignette stays light on check servers -- confirm.
otree <- outlier.tree(titanic, nthreads=1)
## Bare name at top level: prints the fitted model object
otree
## -----------------------------------------------------------------------------
## Double-check the data (last 2 outliers)
## Rows 1147 and 1164 are selected by position, with all columns kept
titanic[c(1147, 1164), ]
## -----------------------------------------------------------------------------
## Distribution of the group from which those two outliers were flagged:
## histogram of 'Fare' over the rows with Pclass 3, SibSp 0 and Embarked "Q"
hist(
  titanic[Pclass == 3 & SibSp == 0 & Embarked == "Q", Fare],
  breaks = 100,
  col = "navy",
  xlab = "Fare",
  main = "Distribution of Fare within cluster"
)
## ----comment=NA---------------------------------------------------------------
## Get the outliers in a manipulable format
## The result of predict() is indexed with [[1147]], the same index as one of
## the rows inspected earlier.
## NOTE(review): outliers_print = 0 presumably suppresses the printed report --
## confirm against the outliertree documentation.
predict(otree, titanic, outliers_print = 0)[[1147]]
## ----comment=NA---------------------------------------------------------------
## To programmatically get all the outliers that were flagged
pred <- predict(otree, titanic, outliers_print = 0)
## Keep only the entries whose outlier score is not NA. vapply() is used
## instead of sapply() so the scores always come back as a numeric vector with
## exactly one value per entry (and an error is raised otherwise), rather than
## a type that depends on the input.
only_flagged <- pred[
  !is.na(vapply(pred, function(x) x$outlier_score, numeric(1)))
]
## ----comment=NA---------------------------------------------------------------
## To print selected rows only
## `pred` is the prediction object computed in the previous chunk; a single
## row index (1147) is passed through only_these_rows.
print(pred, only_these_rows = 1147)
## ---- eval=FALSE--------------------------------------------------------------
# ## In order to flag more outliers, one can also experiment
# ## with lowering the threshold hyperparameters
# outlier.tree(titanic, z_outlier = 6., outliers_print = 5)
## ---- echo=FALSE, comment=NA--------------------------------------------------
## In order to flag more outliers, one can also experiment
## with lowering the threshold hyperparameters
## Executed counterpart of the displayed (eval=FALSE) snippet above; the call
## differs only by the extra nthreads=1 argument. The result is not assigned.
outlier.tree(titanic, z_outlier = 6., outliers_print = 5, nthreads=1)
## ---- eval=FALSE--------------------------------------------------------------
# ## One can also lower the gain threshold, but this tends
# ## to result in more spurious outliers which come from
# ## not-so-good splits (not recommended)
# outlier.tree(titanic, z_outlier = 6., min_gain = 1e-6, outliers_print = 5)
## ---- echo=FALSE, comment=NA--------------------------------------------------
## One can also lower the gain threshold, but this tends
## to result in more spurious outliers which come from
## not-so-good splits (not recommended)
## Executed counterpart of the displayed (eval=FALSE) snippet above; the call
## differs only by the extra nthreads=1 argument. The result is not assigned.
outlier.tree(titanic, z_outlier = 6., min_gain = 1e-6, outliers_print = 5, nthreads=1)
## Any scripts or data that you put into this service are public.
## Add the following code to your website.
## For more information on customizing the embed code, read Embedding Snippets.