knitr::opts_chunk$set( collapse = TRUE, comment = "#>" )
library(tidy.outliers)
The most central aspect of tidy.outliers is to combine it on a workflow to filter outliers out of your training dataset, and to consider the outlier removal process just one of your many dials controlled hyper parameters, you can even 'pool' the score out of multiple outlier methods!
library(tidymodels)
data(ames) data_split <- ames |> mutate(Sale_Price = log10(Sale_Price)) |> initial_split(strata = Sale_Price) ames_train <- training(data_split) ames_test <- testing(data_split)
ames_rec <- recipe(Sale_Price ~ Gr_Liv_Area + Longitude + Latitude, data = ames_train) |> step_log(Gr_Liv_Area, base = 10) |> step_ns(Longitude, deg_free = tune("long df")) |> step_ns(Latitude, deg_free = tune("lat df")) |> step_outliers_maha(all_numeric(), -all_outcomes()) |> step_outliers_lookout(all_numeric(),-contains(r"(.outliers)"),-all_outcomes()) |> step_outliers_remove(contains(r"(.outliers)"),score_dropout = tune("dropout"),aggregation_function = tune("aggregation"))
parameters(ames_rec)
ames_param <- ames_rec |> parameters() |> update( `long df` = spline_degree(), `lat df` = spline_degree(), dropout = dropout(range = c(0.75, 1)), aggregation = aggregation() |> value_set(c("mean","weighted_mean")) )
weighted_mean <- function(x) { x[[1]] * .75 + x[[2]] * .25 }
spline_grid <- grid_max_entropy(ames_param, size = 20) spline_grid
lin_mod <- linear_reg() |> set_engine("lm")
wf_tune <- workflow() |> add_recipe(ames_rec) |> add_model(lin_mod)
set.seed(2453) cv_splits <- vfold_cv(ames_train, v = 5, strata = Sale_Price)
ames_res <- tune_grid(wf_tune, resamples = cv_splits, grid = spline_grid)
estimates <- collect_metrics(ames_res) rmse_vals <- estimates |> dplyr::filter(.metric == "rmse") |> arrange(mean) rmse_vals
autoplot(ames_res,metric = "rmse")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.