# Global knitr chunk options: collapse source and output into a single block
# and prefix printed output lines with "#>".
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")

Set up package and load in data for analysis

Some things to note:

# Seed R's random number generator before any random steps run
set.seed(24)

# install.packages("devtools")
# devtools::install_github("amanda-park/easytidymodels")
library(easytidymodels)
library(recipes)
library(doParallel)
library(ggplot2)

# Load the penguins data set from the modeldata package
data(penguins, package = "modeldata")

# Use parallel compute to speed up processing time.
# detectCores() returns NA when the core count cannot be determined, so fall
# back to a single worker instead of registering NA cores.
cores <- parallel::detectCores(logical = FALSE)
if (is.na(cores) || cores < 1L) {
  cores <- 1L
}
registerDoParallel(cores = cores)

Prepare data for analysis with preprocessing

Define your response variable and save it as resp.

trainTestSplit is a wrapper for rsample's function that splits your data into training and testing sets. You can split based on time dependency, or stratify on the response if you aren't splitting based on time.

Recipes define your model's preprocessing steps. The level of preprocessing you need varies with each data set you work with, so this portion of tidymodels has not been given a wrapper. The available preprocessing steps that you can use in recipes are listed in the recipes package's function reference.

After your recipe is set up and prepped on the training split, you can bake it to apply the recipe's preprocessing steps to both the training and testing data.

Lastly, you can set up a cross-validation fold object through the function cvFolds.

These objects are all necessary for fitting the variety of models that tidymodels offers you.

# Name of the response column, and a formula using every other column as a predictor
resp <- "sex"
formula <- stats::as.formula(paste(resp, ".", sep = "~"))

# Partition the data into training and testing sets, stratifying on the response
split <- trainTestSplit(
  penguins,
  stratifyOnResponse = TRUE,
  responseVar = resp
)

# Feature-engineering recipe; the steps needed vary with the data set at hand
rec <- recipe(formula, data = split$train) %>%
  step_impute_knn(!!resp) %>%
  step_dummy(all_nominal(), -all_outcomes()) %>%
  step_impute_median(all_predictors()) %>%
  step_normalize(all_predictors()) %>%
  step_nzv(all_predictors()) %>%
  step_corr(all_numeric(), -all_outcomes(), threshold = 0.8)

# Estimate the recipe's steps on the training data it was defined with
rec <- prep(rec)

# Apply the prepped recipe to each partition
train_df <- bake(rec, new_data = split$train)
test_df <- bake(rec, new_data = split$test)
# df <- rbind(train_df, test_df)

# Cross-validation folds built from the processed training data
folds <- cvFolds(train_df)

About Classification Models

General workflow they all follow:

Available evaluation metrics for evalMetric:

Fit a KNN Model

Uses library(kknn) to compute the model.

This is what the KNN model tunes:

# Fit the KNN classifier on the prepared training/testing data and CV folds
knnClass <- knnClassif(recipe = rec, response = resp, folds = folds,
                       train = train_df, test = test_df)

# Training data: confusion matrix of the predictions
knnClass$trainConfMat

# Training data: accuracy and kappa
knnClass$trainScore

# Testing data: confusion matrix of the predictions
knnClass$testConfMat

# Testing data: accuracy and kappa
knnClass$testScore

# Final KNN model, chosen by optimizing the selected evaluation metric
knnClass$final

# Inspect the tuning results under a different evaluation metric
knnClass$tune %>%
  tune::show_best("roc_auc")

Fit a Logistic Regression Model

Uses library(glmnet) to compute tuned logistic regression model.

What the model tunes:

# Fit the binary logistic regression on the prepared training/testing data and CV folds
lr <- logRegBinary(
  recipe = rec,
  response = resp,
  folds = folds,
  train = train_df,
  test = test_df
)

# Training data: confusion matrix
lr$trainConfMat

# Training data: plot of the confusion matrix
lr$trainConfMatPlot

# Training data: score
lr$trainScore

# Testing data: confusion matrix
lr$testConfMat

# Testing data: plot of the confusion matrix
lr$testConfMatPlot

# Testing data: score
lr$testScore

# Final logistic regression model, chosen by optimizing the selected evaluation metric
lr$final

# Inspect the tuning results under a different evaluation metric
lr$tune %>%
  tune::show_best("roc_auc")

Fit a Support Vector Machine (Radial Basis Kernel) Model

Uses library(kernlab) to compute SVM model.

What the model tunes:

All the same evaluation methods for KNN are also available for SVM.

# Fit the SVM classifier, selecting the model by balanced accuracy
svmClass <- svmClassif(recipe = rec, response = resp, folds = folds,
                       train = train_df, test = test_df,
                       evalMetric = "bal_accuracy")

# Training data: confusion matrix of the predictions
svmClass$trainConfMat

# Training data: accuracy and kappa
svmClass$trainScore

# Testing data: confusion matrix of the predictions
svmClass$testConfMat

# Testing data: accuracy and kappa
svmClass$testScore

# Final SVM model, chosen by optimizing the selected evaluation metric
svmClass$final

# Inspect the tuning results under a different evaluation metric
svmClass$tune %>%
  tune::show_best("roc_auc")

Fit an XGBoost Model

Uses library(xgboost) to compute a gradient-boosted tree binary classification model.

What the model tunes:

What you set specifically:

# Fit the XGBoost binary classifier, selecting the model by ROC-AUC
xgClass <- xgBinaryClassif(
  recipe = rec,
  response = resp,
  folds = folds,
  train = train_df,
  test = test_df,
  evalMetric = "roc_auc"
)

# Training data: confusion matrix of the predictions
xgClass$trainConfMat

# Training data: accuracy and kappa
xgClass$trainScore

# Testing data: confusion matrix of the predictions
xgClass$testConfMat

# Testing data: accuracy and kappa
xgClass$testScore

# Final XGBoost model, chosen by optimizing the selected evaluation metric
xgClass$final

# Inspect the tuning results under a different evaluation metric
xgClass$tune %>%
  tune::show_best("bal_accuracy")

# ROC-AUC on CV folds example: one ROC curve per resample id.
# collect_predictions() and group_by() are namespace-qualified, like the other
# tune/yardstick/tidyr calls in this file, so the chunk does not rely on tune
# or dplyr happening to be attached.
xgClass$tune %>%
  tune::collect_predictions() %>%
  dplyr::group_by(id) %>%
  yardstick::roc_curve(!!resp, .pred_female) %>%
  ggplot(aes(1 - specificity, sensitivity, color = id)) +
  geom_abline(lty = 2, color = "gray80", size = 1.5) +
  geom_path(show.legend = TRUE, alpha = 0.6, size = 1.2) +
  coord_equal()

# Evaluate model parameters on a specific metric: take the ten best candidates
# by ROC-AUC and reshape the tuned parameters into long form for faceting
best_roc_auc <- tune::show_best(xgClass$tune, metric = "roc_auc", n = 10)
best_roc_auc_long <- tidyr::pivot_longer(
  best_roc_auc,
  mtry:sample_size,
  names_to = "variable",
  values_to = "value"
)

# One panel per parameter: parameter value against the mean metric
ggplot(best_roc_auc_long, aes(value, mean)) +
  geom_line(alpha = 1 / 2) +
  geom_point() +
  facet_wrap(~variable, scales = "free") +
  ggtitle("Best parameters for ROC-AUC")

# Feature importance plot
xgClass$featImpPlot

# Feature importance variables
xgClass$featImpVars


amanda-park/easytidymodels documentation built on Dec. 13, 2021, 11:28 a.m.