knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

Set up packages and load data for analysis

Generally, you want to set a seed so that your analysis is reproducible.

If easytidymodels isn't installed, you can install it from GitHub with devtools.

If you want to use the dataset provided here, you will need to install modeldata.

Parallel computing is used through the doParallel library to speed up computations.

set.seed(24)

# install.packages("devtools")
# devtools::install_github("amanda-park/easytidymodels")
library(easytidymodels)
library(dplyr)
library(recipes)
library(doParallel)

data(ames, package = "modeldata")
#Use parallel compute to speed up processing time
cores <- parallel::detectCores(logical = FALSE)
registerDoParallel(cores = cores)

Prepare data for analysis with preprocessing

Define your response variable and save it as resp.

trainTestSplit is a wrapper around rsample's data-splitting functions. There is an option to split based on time dependency, and to stratify on the response if you aren't splitting based on time. A sketch of the kind of rsample call it wraps is shown below.
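This is a minimal sketch of the underlying rsample call, not the wrapper's actual internals; the split proportion here is an assumption, not a documented default of trainTestSplit:

#Sketch of the rsample split trainTestSplit plausibly wraps (proportion assumed)
rsampleSketch <- rsample::initial_split(ames, prop = 0.75, strata = Sale_Price)
head(rsample::training(rsampleSketch)) #training partition
head(rsample::testing(rsampleSketch))  #testing partition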

recipes define your model's preprocessing steps. The preprocessing you need varies with each data set you work with, so this portion of tidymodels has not been given a wrapper. The available preprocessing steps you can use are listed in the recipes package reference.

After splitting your data into training and testing sets and prepping your recipe on the training data, you can bake the recipe to apply its preprocessing steps to both sets.

Lastly, you can set up a cross-validation fold object through the function cvFolds (a comparable rsample call is sketched after the code chunk below).

These objects are all necessary for fitting the variety of models that tidymodels offers you.

#Define your response variable and formula object here
resp <- "Sale_Price"
formula <- stats::as.formula(paste0(resp, " ~ ."))

#Split data into training and testing sets
split <- trainTestSplit(ames,
                        stratifyOnResponse = TRUE,
                        responseVar = resp)

#Create the recipe for feature engineering; the steps vary based on the data you're working with
rec <- recipe(formula, data = split$train) %>%
  step_log(all_of(resp), base = 10) %>%
  step_YeoJohnson(Lot_Area, Gr_Liv_Area) %>%
  step_other(Neighborhood, threshold = .1)  %>%
  step_dummy(all_nominal()) %>%
  step_zv(all_predictors()) %>%
  step_nzv(all_predictors()) %>%
  step_corr(all_numeric(), -all_outcomes(), threshold = .8) %>%
  prep()

train_df <- bake(rec, split$train)
test_df <- bake(rec, split$test)

folds <- cvFolds(train_df)
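For comparison, a fold object like the one cvFolds returns can be built directly with rsample; the number of folds here is an assumption about cvFolds' default:

#Sketch of a comparable rsample call (v = 10 assumed)
foldsSketch <- rsample::vfold_cv(train_df, v = 10)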

About all regression models

General workflow they all follow: tune the model's hyperparameters across the cross-validation folds, select the best combination according to evalMetric, finalize the workflow, fit it on the training data, and return predictions and metrics for both the training and testing data.

Available evaluation metrics for evalMetric: the regression metrics from yardstick, including rmse (root mean squared error), rsq (R-squared), and mae (mean absolute error).

Fit a KNN Model

Uses library(kknn) to compute the model.

This is what the KNN model tunes: neighbors, the number of nearest data points used to make each prediction (a sketch of the implied parsnip specification follows).
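Below is a minimal sketch of the kind of parsnip specification knnRegress plausibly builds under the hood; the exact tuning setup inside the wrapper is an assumption:

library(parsnip)
library(tune)

#Hypothetical KNN spec: tune the number of neighbors with the kknn engine
knnSpec <- nearest_neighbor(neighbors = tune()) %>%
  set_engine("kknn") %>%
  set_mode("regression")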

knnReg <- knnRegress(
  recipe = rec,
  response = resp,
  folds = folds,
  train = train_df,
  test = test_df,
  evalMetric = "rmse"
)

#Visualize training data and its predictions
knnReg$trainPred %>% select(.pred, !!resp)

#View how model metrics for RMSE, R-Squared, and MAE look for training data
knnReg$trainScore

#Visualize testing data and its predictions
knnReg$testPred %>% select(.pred, !!resp)

#View how model metrics for RMSE, R-Squared, and MAE look for testing data
knnReg$testScore

#See the final model chosen by KNN based on optimizing for your chosen evaluation metric
knnReg$final

#See how model fit looks based on another evaluation metric
knnReg$tune %>% tune::show_best(metric = "mae")

Fit a Support Vector Machine (Radial Basis Kernel) Model

Uses library(kernlab) to compute SVM model.

What the model tunes: cost and rbf_sigma, the radial basis function kernel's penalty and kernel width (an assumption based on parsnip's svm_rbf() tuning parameters; a sketch follows).

All the same evaluation metrics available for KNN are also available for SVM.
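A comparable sketch of the implied specification, using the parsnip and tune packages loaded in the KNN sketch above; the tuned parameters are an assumption:

#Hypothetical SVM spec: tune cost and kernel width with the kernlab engine
svmSpec <- svm_rbf(cost = tune(), rbf_sigma = tune()) %>%
  set_engine("kernlab") %>%
  set_mode("regression")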

svmReg <- svmRegress(
  recipe = rec,
  response = resp,
  folds = folds,
  train = train_df,
  test = test_df,
  evalMetric = "rmse"
)

#Visualize training data and its predictions
svmReg$trainPred %>% select(.pred, !!resp)

#View how model metrics for RMSE, R-Squared, and MAE look for training data
svmReg$trainScore

#Visualize testing data and its predictions
svmReg$testPred %>% select(.pred, !!resp)

#View how model metrics for RMSE, R-Squared, and MAE look for testing data
svmReg$testScore

#See the final model chosen by svm based on optimizing for your chosen evaluation metric
svmReg$final

#See how model fit looks based on another evaluation metric
svmReg$tune %>% tune::show_best(metric = "mae")

Fit a Tuned Linear Regression Model

Uses library(glmnet) to compute a tuned linear regression model.

What the model tunes: penalty (the amount of regularization) and mixture (the balance between lasso and ridge penalties), parsnip's linear_reg() tuning parameters (a sketch follows).
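A sketch of the implied glmnet specification (parsnip and tune were loaded in the KNN sketch above); the tuned parameters are an assumption:

#Hypothetical tuned linear regression spec with the glmnet engine
linearSpec <- linear_reg(penalty = tune(), mixture = tune()) %>%
  set_engine("glmnet") %>%
  set_mode("regression")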

linearReg <- linearRegress(
  recipe = rec,
  response = resp,
  folds = folds,
  train = train_df,
  test = test_df,
  evalMetric = "rmse",
  tidyModelVersion = TRUE
)

#Visualize training data and its predictions
linearReg$trainPred %>% select(.pred, !!resp)

#View how model metrics for RMSE, R-Squared, and MAE look for training data
linearReg$trainScore

#Visualize testing data and its predictions
linearReg$testPred %>% select(.pred, !!resp)

#View how model metrics for RMSE, R-Squared, and MAE look for testing data
linearReg$testScore

#See the final model chosen by linear based on optimizing for your chosen evaluation metric
linearReg$final

#See how model fit looks based on another evaluation metric
linearReg$tune %>% tune::show_best(metric = "mae")

Fit a Random Forest Model

Uses library(ranger) to compute a random forest regression model.

What the model tunes: mtry (the number of predictors sampled at each split) and min_n (the minimum node size), as reflected in the stacking code later in this vignette (a sketch follows).

What you set specifically: calcFeatImp, whether to compute feature importance and return the featImpPlot and featImpVars outputs.
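A sketch of the implied ranger specification; the permutation importance engine argument is an assumption tied to calcFeatImp:

#Hypothetical random forest spec; importance = "permutation" supports calcFeatImp
rfSpec <- rand_forest(mtry = tune(), min_n = tune()) %>%
  set_engine("ranger", importance = "permutation") %>%
  set_mode("regression")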

rfReg <- rfRegress(
  recipe = rec,
  response = resp,
  folds = folds,
  train = train_df,
  test = test_df,
  calcFeatImp = TRUE,
  evalMetric = "mae"
)

#Visualize training data and its predictions
rfReg$trainPred %>% select(.pred, !!resp)

#View how model metrics for RMSE, R-Squared, and MAE look for training data
rfReg$trainScore

#Visualize testing data and its predictions
rfReg$testPred %>% select(.pred, !!resp)

#View how model metrics for RMSE, R-Squared, and MAE look for testing data
rfReg$testScore

#See the final model chosen by rf based on optimizing for your chosen evaluation metric
rfReg$final

#See how model fit looks based on another evaluation metric
rfReg$tune %>% tune::show_best(metric = "mae")

#See feature importance of model
rfReg$featImpPlot

#See numeric feature importance of model
rfReg$featImpVars

Fit an XGBoost Model

Uses library(xgboost) to compute a gradient-boosted tree regression model.

What the model tunes: the boost_tree() parameters from mtry through sample_size (likely mtry, min_n, tree_depth, learn_rate, loss_reduction, and sample_size), as reflected in the stacking code's select(mtry:sample_size) (a sketch follows).

What you set specifically: calcFeatImp, whether to compute feature importance and return the featImpPlot output.
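A sketch of the implied xgboost specification, matching the parameter range selected in the stacking code; whether trees is also tuned is not clear from this vignette:

#Hypothetical XGBoost spec tuning the boost_tree() parameters
xgSpec <- boost_tree(
  mtry = tune(), min_n = tune(), tree_depth = tune(),
  learn_rate = tune(), loss_reduction = tune(), sample_size = tune()
) %>%
  set_engine("xgboost") %>%
  set_mode("regression")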

xgReg <- xgRegress(
  recipe = rec,
  response = resp,
  folds = folds,
  train = train_df,
  test = test_df,
  calcFeatImp = TRUE,
  evalMetric = "mae"
)

#Visualize training data and its predictions
xgReg$trainPred %>% select(.pred, !!resp)

#View how model metrics for RMSE, R-Squared, and MAE look for training data
xgReg$trainScore

#Visualize testing data and its predictions
xgReg$testPred %>% select(.pred, !!resp)

#View how model metrics for RMSE, R-Squared, and MAE look for testing data
xgReg$testScore

#See the final model chosen by xg based on optimizing for your chosen evaluation metric
xgReg$final

#See how model fit looks based on another evaluation metric
xgReg$tune %>% tune::show_best(metric = "rmse")

#See feature importance of model
xgReg$featImpPlot

Fit a MARS Model

Uses library(earth) to compute a multivariate adaptive regression spline (MARS) model.

What the model tunes: num_terms (the number of retained model terms) and prod_degree (the highest allowed degree of interaction), parsnip's mars() tuning parameters (a sketch follows).
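A sketch of the implied earth specification; the tuned parameters are an assumption based on parsnip's tunable mars() arguments:

#Hypothetical MARS spec with the earth engine
marsSpec <- mars(num_terms = tune(), prod_degree = tune()) %>%
  set_engine("earth") %>%
  set_mode("regression")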

marsReg <- marsRegress(
  recipe = rec,
  response = resp,
  folds = folds,
  train = train_df,
  test = test_df,
  evalMetric = "rmse"
)

#Visualize training data and its predictions
marsReg$trainPred %>% select(.pred, !!resp)

#View how model metrics for RMSE, R-Squared, and MAE look for training data
marsReg$trainScore

#Visualize testing data and its predictions
marsReg$testPred %>% select(.pred, !!resp)

#View how model metrics for RMSE, R-Squared, and MAE look for testing data
marsReg$testScore

#See the final model chosen by mars based on optimizing for your chosen evaluation metric
marsReg$final

#See how model fit looks based on another evaluation metric
marsReg$tune %>% tune::show_best(metric = "mae")

Model Stacking Example

Stacking combines the out-of-fold predictions from each tuned model into a new data set, then fits a meta-learner (here, a glmnet lasso regression) on those predictions to weight the models against each other.

library(parsnip)
library(tune)

# Finalize parameters 

rfParam <- rfReg$tune %>% 
  tune::show_best(metric = "rmse", n = 1) %>% 
  select(mtry, min_n)

xgParam <- xgReg$tune %>% 
  tune::show_best(metric = "rmse", n = 1) %>% 
  select(mtry:sample_size)

# Collect model predictions to stack

xgStack <- xgReg$tune %>% 
  tune::collect_predictions() %>% 
  inner_join(xgParam) %>% 
  select(id, .row, !!resp, xgboost = .pred)

rfStack <- rfReg$tune %>% 
  tune::collect_predictions() %>% 
  inner_join(rfParam) %>% 
  select(id, .row, randomforest = .pred)

marsStack <- marsReg$tune %>%
  tune::collect_predictions() %>% 
  select(id, .row, mars = .pred)

knnStack <- knnReg$tune %>%
  tune::collect_predictions() %>% 
  select(id, .row, knn = .pred)

svmStack <- svmReg$tune %>%
  tune::collect_predictions() %>%
  select(id, .row, svm = .pred)

lmStack <- linearReg$tune %>%
  tune::collect_predictions() %>% 
  select(id, .row, lm = .pred)

stackDat <- xgStack %>% 
  left_join(rfStack) %>% 
  left_join(marsStack) %>%
  left_join(knnStack) %>%
  left_join(svmStack) %>%
  left_join(lmStack) %>%
  select(-id, -.row)

stackModel <- parsnip::linear_reg(penalty = .2, mixture = 1) %>% 
  parsnip::set_mode("regression") %>% 
  parsnip::set_engine("glmnet") %>% 
  parsnip::fit(formula, data = stackDat)

stackModel %>% tidy()

#Finalize model fits

xgFinal <- xgReg$final %>% last_fit(split$split)
rfFinal <- rfReg$final %>% last_fit(split$split)
marsFinal <- marsReg$final %>% last_fit(split$split)
knnFinal <- knnReg$final %>% last_fit(split$split)
svmFinal <- svmReg$final %>% last_fit(split$split)
lmFinal <- linearReg$final %>% last_fit(split$split)

stackFinal <- tibble(
  "model" = list(
    xgFinal, rfFinal, marsFinal, knnFinal, svmFinal, lmFinal
    ),
  "model_names" = c(
    "xgboost", "randomforest", "mars", "knn", "svm", "lm"
    )) %>% 
  mutate(pred = purrr::map(model, collect_predictions))

stackFinal <- stackFinal %>% 
  select(model_names, pred) %>% 
  tidyr::unnest(pred) %>% 
  tidyr::pivot_wider(names_from = model_names, values_from = .pred) %>% 
  select(-id, -.row, -.config)

predict(stackModel, stackFinal) %>% 
  bind_cols(stackFinal) %>% 
  rename("stack" = .pred) %>% 
  tidyr::pivot_longer(-!!resp) %>% 
  group_by(name) %>% 
  yardstick::rmse(truth = !!resp, estimate = value) %>% 
  ungroup() %>% 
  tidyr::pivot_wider(names_from = .metric, values_from = .estimate) %>% 
  arrange(rmse)

