Generally, you want to set a seed so that your analysis is reproducible.
If easytidymodels isn't installed, you can install it from GitHub with devtools. If you want to use the dataset provided here, you will also need to install modeldata.
Parallel computing is used through the doParallel library to speed up computations.
set.seed(24)

# install.packages("devtools")
# devtools::install_github("amanda-park/easytidymodels")
library(easytidymodels)
library(recipes)
library(doParallel)

data(ames, package = "modeldata")

# Use parallel compute to speed up processing time
cores <- parallel::detectCores(logical = FALSE)
registerDoParallel(cores = cores)
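If you want to release the workers once your analysis is done, doParallel provides a cleanup helper for the implicit cluster that registerDoParallel(cores = ...) creates:

# Deregister the implicit parallel backend when finished
doParallel::stopImplicitCluster()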
Define your response variable and save it as resp.
trainTestSplit is a wrapper around rsample's function for splitting your data into training and testing sets. It offers the option to split based on time dependency, and to stratify on the response variable when you aren't splitting based on time.
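Before the full pipeline below, a quick sketch of what the returned object carries. This assumes trainTestSplit returns a list holding the training data, the testing data, and the underlying rsample split, which is how the later chunks use split$train, split$test, and split$split:

split <- trainTestSplit(ames,
                        stratifyOnResponse = TRUE,
                        responseVar = "Sale_Price")

# The pieces the rest of this vignette relies on
str(split, max.level = 1)   # expect $train, $test, and the underlying $split
nrow(split$train)
nrow(split$test)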
recipes hold your model's preprocessing steps. The level of preprocessing you need varies with each data set you work with, so this portion of tidymodels has not been given a wrapper. The preprocessing steps available in recipes are listed in that package's documentation.
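If you want to browse every step function your installed version of recipes exports, a quick base-R sketch:

# List all preprocessing steps exported by the recipes package
grep("^step_", getNamespaceExports("recipes"), value = TRUE)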
After your recipe is set up, you can split your data into training and testing sets and then bake the recipe's preprocessing steps into each one.
Lastly, you can set up a cross-validation fold object through the function cvFolds.
These objects are all necessary for fitting the variety of models that tidymodels offers you.
# Define your response variable and formula object here
resp <- "Sale_Price"
formula <- stats::as.formula(paste(resp, ".", sep = "~"))

# Split data into training and testing sets
split <- trainTestSplit(ames,
                        stratifyOnResponse = TRUE,
                        responseVar = resp)

# Create recipe for feature engineering; this varies based on the data you're working with
rec <- recipe(formula, data = split$train) %>%
  step_log(resp, base = 10) %>%
  step_YeoJohnson(Lot_Area, Gr_Liv_Area) %>%
  step_other(Neighborhood, threshold = .1) %>%
  step_dummy(all_nominal()) %>%
  step_zv(all_predictors()) %>%
  step_nzv(all_predictors()) %>%
  step_corr(all_numeric(), -all_outcomes(), threshold = .8) %>%
  prep()

train_df <- bake(rec, split$train)
test_df <- bake(rec, split$test)

folds <- cvFolds(train_df)
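It is worth sanity-checking the baked data before modeling; a minimal sketch using dplyr:

# Inspect the preprocessed training data: after step_dummy, all columns should be numeric
dplyr::glimpse(train_df)

# Confirm the recipe left no missing values behind
sum(is.na(train_df))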
General workflow all the model functions follow:

- Tune the model's hyperparameters across the cross-validation folds, optimizing for your chosen evalMetric.
- Return predictions and metrics on both the training and testing data (trainPred, trainScore, testPred, testScore).
- Return the tuning results (tune) and the finalized model (final) for further inspection.
Available evaluation metrics for evalMetric: the examples in this vignette tune on rmse and mae, and the score tables the functions return report RMSE, R-Squared, and MAE.
Uses library(kknn) to compute the KNN model.
This is what the KNN model tunes: its nearest-neighbor hyperparameters, which in parsnip's nearest_neighbor specification are typically neighbors, weight_func, and dist_power.
knnReg <- knnRegress(
  recipe = rec,
  response = resp,
  folds = folds,
  train = train_df,
  test = test_df,
  evalMetric = "rmse"
)

# Visualize training data and its predictions
knnReg$trainPred %>% select(.pred, !!resp)

# View how model metrics for RMSE, R-Squared, and MAE look for training data
knnReg$trainScore

# Visualize testing data and its predictions
knnReg$testPred %>% select(.pred, !!resp)

# View how model metrics for RMSE, R-Squared, and MAE look for testing data
knnReg$testScore

# See the final model chosen by KNN based on optimizing for your chosen evaluation metric
knnReg$final

# See how model fit looks based on another evaluation metric
knnReg$tune %>% tune::show_best("mae")
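If you want to see how every candidate in the tuning grid performed rather than just the best rows, the tuning results can be plotted directly. This assumes knnReg$tune is a standard tune_results object, which the show_best call above suggests:

# Plot performance across the tuned KNN hyperparameters
ggplot2::autoplot(knnReg$tune)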
Uses library(kernlab) to compute the SVM model.
What the model tunes: the SVM's kernel hyperparameters, which for parsnip's radial-basis svm_rbf specification are typically cost and rbf_sigma.
All the same evaluation methods shown for KNN are also available for SVM.
svmReg <- svmRegress(
  recipe = rec,
  response = resp,
  folds = folds,
  train = train_df,
  test = test_df,
  evalMetric = "rmse"
)

# Visualize training data and its predictions
svmReg$trainPred %>% select(.pred, !!resp)

# View how model metrics for RMSE, R-Squared, and MAE look for training data
svmReg$trainScore

# Visualize testing data and its predictions
svmReg$testPred %>% select(.pred, !!resp)

# View how model metrics for RMSE, R-Squared, and MAE look for testing data
svmReg$testScore

# See the final model chosen by SVM based on optimizing for your chosen evaluation metric
svmReg$final

# See how model fit looks based on another evaluation metric
svmReg$tune %>% tune::show_best("mae")
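Under the same assumption that the tune element is a standard tune_results object, you can pull the full table of resampled metrics instead of only the top rows:

# Mean metrics across folds for every tuned SVM candidate
svmReg$tune %>% tune::collect_metrics()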
Uses library(glmnet) to compute a tuned linear regression model.
What the model tunes: glmnet's regularization hyperparameters, penalty (the amount of regularization) and mixture (the lasso/ridge mix).
linearReg <- linearRegress(
  recipe = rec,
  response = resp,
  folds = folds,
  train = train_df,
  test = test_df,
  evalMetric = "rmse",
  tidyModelVersion = TRUE
)

# Visualize training data and its predictions
linearReg$trainPred %>% select(.pred, !!resp)

# View how model metrics for RMSE, R-Squared, and MAE look for training data
linearReg$trainScore

# Visualize testing data and its predictions
linearReg$testPred %>% select(.pred, !!resp)

# View how model metrics for RMSE, R-Squared, and MAE look for testing data
linearReg$testScore

# See the final model chosen by linear regression based on optimizing for your chosen evaluation metric
linearReg$final

# See how model fit looks based on another evaluation metric
linearReg$tune %>% tune::show_best("mae")
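show_best accepts any metric that was collected during tuning, so (assuming R-squared was collected, as the score tables suggest) you can rank the glmnet candidates by it instead:

# Top penalty/mixture combinations ranked by R-squared
linearReg$tune %>% tune::show_best("rsq")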
Uses library(ranger) to compute a random forest regression model.
What the model tunes: mtry (the number of predictors sampled at each split) and min_n (the minimum node size), the two parameters the stacking code at the end of this vignette pulls from the tuning results.
What you set specifically: options such as calcFeatImp, which controls whether feature importance is calculated (see the call below).
rfReg <- rfRegress(
  recipe = rec,
  response = resp,
  folds = folds,
  train = train_df,
  test = test_df,
  calcFeatImp = TRUE,
  evalMetric = "mae"
)

# Visualize training data and its predictions
rfReg$trainPred %>% select(.pred, !!resp)

# View how model metrics for RMSE, R-Squared, and MAE look for training data
rfReg$trainScore

# Visualize testing data and its predictions
rfReg$testPred %>% select(.pred, !!resp)

# View how model metrics for RMSE, R-Squared, and MAE look for testing data
rfReg$testScore

# See the final model chosen by random forest based on optimizing for your chosen evaluation metric
rfReg$final

# See how model fit looks based on another evaluation metric
rfReg$tune %>% tune::show_best("mae")

# See feature importance of model
rfReg$featImpPlot

# See numeric feature importance of model
rfReg$featImpVars
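A quick overfitting check is to line the training and testing scores up side by side; a sketch assuming trainScore and testScore are yardstick-style metric tibbles, as the chunks above suggest:

# Compare train vs. test metrics to gauge overfitting
dplyr::bind_rows(
  rfReg$trainScore %>% dplyr::mutate(data = "train"),
  rfReg$testScore %>% dplyr::mutate(data = "test")
)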
Uses library(xgboost) to compute an XGBoost regression model.
What the model tunes: the boosting hyperparameters; the stacking code at the end of this vignette pulls the columns from mtry through sample_size out of the tuning results.
What you set specifically: options such as calcFeatImp, which controls whether feature importance is calculated (see the call below).
xgReg <- xgRegress(
  recipe = rec,
  response = resp,
  folds = folds,
  train = train_df,
  test = test_df,
  calcFeatImp = TRUE,
  evalMetric = "mae"
)

# Visualize training data and its predictions
xgReg$trainPred %>% select(.pred, !!resp)

# View how model metrics for RMSE, R-Squared, and MAE look for training data
xgReg$trainScore

# Visualize testing data and its predictions
xgReg$testPred %>% select(.pred, !!resp)

# View how model metrics for RMSE, R-Squared, and MAE look for testing data
xgReg$testScore

# See the final model chosen by xgboost based on optimizing for your chosen evaluation metric
xgReg$final

# See how model fit looks based on another evaluation metric
xgReg$tune %>% tune::select_best("rmse")

# See feature importance of model
xgReg$featImpPlot
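select_best returns only the single winning parameter combination; to see how close the runners-up were, use show_best with a larger n:

# Top 5 xgboost candidates ranked by RMSE
xgReg$tune %>% tune::show_best("rmse", n = 5)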
Uses library(earth) to compute a multivariate adaptive regression splines (MARS) model.
What the model tunes: the MARS hyperparameters, which in parsnip's mars specification are typically num_terms and prod_degree.
marsReg <- marsRegress(
  recipe = rec,
  response = resp,
  folds = folds,
  train = train_df,
  test = test_df,
  evalMetric = "rmse"
)

# Visualize training data and its predictions
marsReg$trainPred %>% select(.pred, !!resp)

# View how model metrics for RMSE, R-Squared, and MAE look for training data
marsReg$trainScore

# Visualize testing data and its predictions
marsReg$testPred %>% select(.pred, !!resp)

# View how model metrics for RMSE, R-Squared, and MAE look for testing data
marsReg$testScore

# See the final model chosen by MARS based on optimizing for your chosen evaluation metric
marsReg$final

# See how model fit looks based on another evaluation metric
marsReg$tune %>% tune::show_best("mae")
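With six tuned models in hand, it helps to line up their test-set scores before deciding whether an ensemble is worthwhile; a sketch assuming each testScore is a metrics tibble:

# Collect every model's test-set metrics into one table
list(
  knn = knnReg$testScore,
  svm = svmReg$testScore,
  lm = linearReg$testScore,
  randomforest = rfReg$testScore,
  xgboost = xgReg$testScore,
  mars = marsReg$testScore
) %>%
  dplyr::bind_rows(.id = "model")

The chunk below goes a step further and stacks the models: it collects each model's cross-validated predictions, fits a lasso (glmnet) meta-model on top of them, refits every finalized model on the full split with last_fit, and compares the stack's test-set RMSE against each individual model.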
library(parsnip)
library(tune)

# Finalize parameters
rfParam <- rfReg$tune %>%
  tune::show_best("rmse", n = 1) %>%
  select(mtry, min_n)

xgParam <- xgReg$tune %>%
  tune::show_best("rmse", n = 1) %>%
  select(mtry:sample_size)

# Collect model predictions to stack
xgStack <- xgReg$tune %>%
  tune::collect_predictions() %>%
  inner_join(xgParam) %>%
  select(id, .row, !!resp, xgboost = .pred)

rfStack <- rfReg$tune %>%
  tune::collect_predictions() %>%
  inner_join(rfParam) %>%
  select(id, .row, randomforest = .pred)

marsStack <- marsReg$tune %>%
  tune::collect_predictions() %>%
  select(id, .row, mars = .pred)

knnStack <- knnReg$tune %>%
  tune::collect_predictions() %>%
  select(id, .row, knn = .pred)

svmStack <- svmReg$tune %>%
  tune::collect_predictions() %>%
  select(id, .row, svm = .pred)

lmStack <- linearReg$tune %>%
  tune::collect_predictions() %>%
  select(id, .row, lm = .pred)

# Join the per-fold predictions into one data frame
stackDat <- xgStack %>%
  left_join(rfStack) %>%
  left_join(marsStack) %>%
  left_join(knnStack) %>%
  left_join(svmStack) %>%
  left_join(lmStack) %>%
  select(-id, -.row)

# Fit a lasso meta-model on the stacked predictions
stackModel <- parsnip::linear_reg(penalty = .2, mixture = 1) %>%
  parsnip::set_mode("regression") %>%
  parsnip::set_engine("glmnet") %>%
  parsnip::fit(formula, data = stackDat)

stackModel %>% tidy()

# Finalize model fits
xgFinal <- xgReg$final %>% last_fit(split$split)
rfFinal <- rfReg$final %>% last_fit(split$split)
marsFinal <- marsReg$final %>% last_fit(split$split)
knnFinal <- knnReg$final %>% last_fit(split$split)
svmFinal <- svmReg$final %>% last_fit(split$split)
lmFinal <- linearReg$final %>% last_fit(split$split)

stackFinal <- tibble(
  "model" = list(xgFinal, rfFinal, marsFinal, knnFinal, svmFinal, lmFinal),
  "model_names" = c("xgboost", "randomforest", "mars", "knn", "svm", "lm")
) %>%
  mutate(pred = purrr::map(model, collect_predictions))

stackFinal <- stackFinal %>%
  select(model_names, pred) %>%
  tidyr::unnest(pred) %>%
  tidyr::pivot_wider(names_from = model_names, values_from = .pred) %>%
  select(-id, -.row, -.config)

# Compare the stacked model's test-set RMSE against each base model
predict(stackModel, stackFinal) %>%
  bind_cols(stackFinal) %>%
  rename("stack" = .pred) %>%
  tidyr::pivot_longer(-!!resp) %>%
  group_by(name) %>%
  yardstick::rmse(truth = !!resp, estimate = value) %>%
  ungroup() %>%
  tidyr::pivot_wider(names_from = .metric, values_from = .estimate) %>%
  arrange(rmse)
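Because the final table is arranged by RMSE in ascending order, the top row is the best performer on the test set. If the stack row does not beat the best single model, the lasso meta-model is not adding value and the simpler single model is the better choice.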