set.seed(24)

# install.packages("devtools")
# devtools::install_github("amanda-park/easytidymodels")
library(easytidymodels)
library(recipes)
library(doParallel)
library(ggplot2)

data(penguins, package = "modeldata")

# Use parallel compute to speed up processing time
cores <- parallel::detectCores(logical = FALSE)
registerDoParallel(cores = cores)

Some things to note:
Define your response variable and save it as resp.
trainTestSplit is a wrapper for rsample's function to split your training and testing data. There is the option to split based on time dependency and to stratify on the response if you aren't splitting based on time.
recipes are your model's preprocessing steps. The preprocessing you need varies for each data set you work with, so this portion of tidymodels has not been given a wrapper. The available preprocessing steps you can use are listed in the recipes package documentation.
After your recipe is set up, you can split your data into training and testing sets and then bake the recipe's preprocessing steps into each of them.
Lastly, you can set up a cross-validation fold object through the function cvFolds.
These objects are all necessary for fitting the variety of models that tidymodels offers you.
# Define your response variable and formula object here
resp <- "sex"
formula <- stats::as.formula(paste(resp, ".", sep = "~"))

# Split data into training and testing sets
split <- trainTestSplit(penguins,
                        stratifyOnResponse = TRUE,
                        responseVar = resp)

# Create recipe for feature engineering; this varies based on the data you are working with
rec <- recipe(formula, data = split$train) %>%
  step_impute_knn(!!resp) %>%
  step_dummy(all_nominal(), -all_outcomes()) %>%
  step_impute_median(all_predictors()) %>%
  step_normalize(all_predictors()) %>%
  step_nzv(all_predictors()) %>%
  step_corr(all_numeric(), -all_outcomes(), threshold = .8) %>%
  prep()

train_df <- bake(rec, split$train)
test_df <- bake(rec, split$test)
#df <- rbind(train_df, test_df)

folds <- cvFolds(train_df)
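For readers who want to see roughly what these wrappers correspond to in plain tidymodels, the sketch below builds a stratified split and cross-validation folds directly with rsample. This is an illustration only: the split proportion and number of folds shown are assumptions, and the exact internals of trainTestSplit and cvFolds may differ.

library(rsample)

# Drop rows with missing values first so stratifying on sex is well defined
penguins_cc <- na.omit(penguins)

# Stratified split, analogous to trainTestSplit() with stratifyOnResponse = TRUE
# (the 75/25 proportion here is an assumption, not necessarily the package default)
manual_split <- initial_split(penguins_cc, prop = 0.75, strata = sex)
manual_train <- training(manual_split)
manual_test <- testing(manual_split)

# 10-fold cross-validation, analogous to cvFolds() (the number of folds is an assumption)
manual_folds <- vfold_cv(manual_train, v = 10)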
Each model function follows the same general workflow: you pass in the recipe, response, cross-validation folds, and training and testing data, and the function tunes the model over the folds and returns its fitted results, confusion matrices, and scores. The evalMetric argument controls which metric the tuning optimizes; the examples below use roc_auc and bal_accuracy.
Uses library(kknn) to compute the model. The function tunes the KNN model's hyperparameters (most importantly the number of neighbors) over the cross-validation folds.
knnClass <- knnClassif(
  recipe = rec,
  response = resp,
  folds = folds,
  train = train_df,
  test = test_df
)

# Visualize training data and its predictions
knnClass$trainConfMat

# View model metrics for accuracy and kappa
knnClass$trainScore

# Visualize testing data and its predictions
knnClass$testConfMat

# View model metrics for accuracy and kappa
knnClass$testScore

# See the final model chosen by KNN based on optimizing for your chosen evaluation metric
knnClass$final

# See how the model fit looks based on another evaluation metric
knnClass$tune %>% tune::show_best("roc_auc")
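The $tune element used with tune::show_best() above behaves like a standard tune tuning result, so (assuming it is one) the other tune helpers should work on it as well. A minimal sketch:

# Best hyperparameter combination for a given metric
knnClass$tune %>% tune::select_best(metric = "roc_auc")

# All metrics averaged over the cross-validation folds
knnClass$tune %>% tune::collect_metrics()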
Uses library(glmnet) to compute a tuned logistic regression model. The function tunes the model's regularization hyperparameters (the amount and mixture of the penalty) over the cross-validation folds.
lr <- logRegBinary(
  recipe = rec,
  response = resp,
  folds = folds,
  train = train_df,
  test = test_df
)

# Confusion matrix for training data
lr$trainConfMat

# Plot of the training confusion matrix
lr$trainConfMatPlot

# Train score
lr$trainScore

# Confusion matrix for testing data
lr$testConfMat

# Plot of the testing confusion matrix
lr$testConfMatPlot

# Test score
lr$testScore

# See the final model chosen by logistic regression based on optimizing for your chosen evaluation metric
lr$final

# See how the model fit looks based on another evaluation metric
lr$tune %>% tune::show_best("roc_auc")
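As with KNN, the tuning results can be explored further. A minimal sketch, again assuming lr$tune is an ordinary tune result, that plots how the evaluation metrics vary across the tuned regularization values:

# Metric estimates across the tuning grid (autoplot method provided by tune)
lr$tune %>% ggplot2::autoplot()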
Uses library(kernlab) to compute the SVM model. The function tunes the SVM's hyperparameters (such as the cost parameter) over the cross-validation folds.
All the same evaluation methods for KNN are also available for SVM.
svmClass <- svmClassif(
  recipe = rec,
  response = resp,
  folds = folds,
  train = train_df,
  test = test_df,
  evalMetric = "bal_accuracy"
)

# Visualize training data and its predictions
svmClass$trainConfMat

# View model metrics for accuracy and kappa
svmClass$trainScore

# Visualize testing data and its predictions
svmClass$testConfMat

# View model metrics for accuracy and kappa
svmClass$testScore

# See the final model chosen by SVM based on optimizing for your chosen evaluation metric
svmClass$final

# See how the model fit looks based on another evaluation metric
svmClass$tune %>% tune::show_best("roc_auc")
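If predictions were saved during tuning (the XGBoost example below suggests they are, since it calls collect_predictions()), you can also pool the cross-validation predictions into a single confusion matrix. A hedged sketch; the truth column name follows the response defined earlier:

# Confusion matrix pooled over the cross-validation predictions
svmClass$tune %>%
  tune::collect_predictions() %>%
  yardstick::conf_mat(truth = sex, estimate = .pred_class)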
Uses library(xgboost) to compute a gradient-boosted tree classification model. The function tunes the boosted-tree hyperparameters over the cross-validation folds (the final plot in the example below sweeps the tuned parameters from mtry through sample_size); you choose the evaluation metric to optimize through evalMetric.
xgClass <- xgBinaryClassif(
  recipe = rec,
  response = resp,
  folds = folds,
  train = train_df,
  test = test_df,
  evalMetric = "roc_auc"
)

# Visualize training data and its predictions
xgClass$trainConfMat

# View model metrics for accuracy and kappa
xgClass$trainScore

# Visualize testing data and its predictions
xgClass$testConfMat

# View model metrics for accuracy and kappa
xgClass$testScore

# See the final model chosen by XGBoost based on optimizing for your chosen evaluation metric
xgClass$final

# See how the model fit looks based on another evaluation metric
xgClass$tune %>% tune::show_best("bal_accuracy")

# ROC-AUC on CV folds example
xgClass$tune %>%
  collect_predictions() %>%
  group_by(id) %>%
  yardstick::roc_curve(!!resp, .pred_female) %>%
  ggplot(aes(1 - specificity, sensitivity, color = id)) +
  geom_abline(lty = 2, color = "gray80", size = 1.5) +
  geom_path(show.legend = TRUE, alpha = 0.6, size = 1.2) +
  coord_equal()

# Evaluate model parameters on a specific metric
xgClass$tune %>%
  tune::show_best(metric = "roc_auc", n = 10) %>%
  tidyr::pivot_longer(mtry:sample_size, names_to = "variable", values_to = "value") %>%
  ggplot(aes(value, mean)) +
  geom_line(alpha = 1/2) +
  geom_point() +
  facet_wrap(~variable, scales = "free") +
  ggtitle("Best parameters for ROC-AUC")

# Feature importance plot
xgClass$featImpPlot

# Feature importance variables
xgClass$featImpVars
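One housekeeping note: the setup code registers a parallel backend with doParallel, so you may want to release those workers when you are finished. A minimal sketch:

# Release the parallel workers registered by registerDoParallel() above
doParallel::stopImplicitCluster()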