Supervised Learning with tidylearn

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 5
)
library(tidylearn)
library(dplyr)

Introduction

This vignette demonstrates supervised learning capabilities in tidylearn. All methods shown here wrap established R packages; the algorithms are unchanged, and tidylearn simply provides a consistent interface and tidy output.

The raw fitted object from the wrapped package is available via model$fit whenever you need package-specific functionality.
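
For example, once a model has been fitted you can pass the wrapped object to the usual generics of the underlying package. A minimal sketch, assuming the wrapped object supports summary() (true for most base-R model classes; model_logistic is fitted below in the Classification section):

# Inspect the underlying fitted object directly (package-specific output)
summary(model_logistic$fit)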

Classification

Binary Classification

Let's create a binary classification problem from the iris dataset:

# Create binary classification dataset
iris_binary <- iris %>%
  filter(Species %in% c("setosa", "versicolor")) %>%
  mutate(Species = droplevels(Species))

# Split data
split <- tl_split(iris_binary, prop = 0.7, stratify = "Species", seed = 123)

Logistic Regression

# Train logistic regression
model_logistic <- tl_model(split$train, Species ~ ., method = "logistic")
print(model_logistic)
# Predictions
preds_logistic <- predict(model_logistic, new_data = split$test)
head(preds_logistic)
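
A quick sanity check compares the predicted classes with the held-out labels. This assumes the predicted class is returned in the .pred column, as in the random forest example later in this vignette:

# Test-set accuracy for the logistic model
mean(preds_logistic$.pred == split$test$Species)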

Decision Trees

# Train decision tree
model_tree <- tl_model(split$train, Species ~ ., method = "tree")
print(model_tree)

# Predictions
preds_tree <- predict(model_tree, new_data = split$test)

Multi-class Classification

# Split full iris dataset
split_multi <- tl_split(iris, prop = 0.7, stratify = "Species", seed = 123)

Random Forest

# Train random forest
model_forest <- tl_model(split_multi$train, Species ~ ., method = "forest")
print(model_forest)
# Predictions
preds_forest <- predict(model_forest, new_data = split_multi$test)
head(preds_forest)
# Accuracy on test set
mean(preds_forest$.pred == split_multi$test$Species)
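
A per-class breakdown is often more informative than a single accuracy figure; base table() gives a quick confusion matrix:

# Confusion matrix: predicted classes vs. observed classes
table(predicted = preds_forest$.pred, actual = split_multi$test$Species)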

Support Vector Machines

# Train SVM
model_svm <- tl_model(split_multi$train, Species ~ ., method = "svm")
print(model_svm)

# Predictions
preds_svm <- predict(model_svm, new_data = split_multi$test)
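
As with the random forest, test-set accuracy gives a quick point of comparison between the two multi-class models:

# Accuracy of the SVM on the same test set
mean(preds_svm$.pred == split_multi$test$Species)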

Regression

Linear Regression

# Split mtcars data
split_reg <- tl_split(mtcars, prop = 0.7, seed = 123)

# Train linear model
model_lm <- tl_model(split_reg$train, mpg ~ wt + hp + disp, method = "linear")
print(model_lm)
# Predictions
preds_lm <- predict(model_lm, new_data = split_reg$test)
head(preds_lm)
# Calculate RMSE
rmse <- sqrt(mean((preds_lm$.pred - split_reg$test$mpg)^2))
cat("RMSE:", round(rmse, 2), "\n")
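
RMSE penalizes large errors more heavily than small ones; mean absolute error (MAE) is a useful companion metric and just as easy to compute:

# Mean absolute error on the same test set
mae <- mean(abs(preds_lm$.pred - split_reg$test$mpg))
cat("MAE:", round(mae, 2), "\n")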

Polynomial Regression

# Polynomial regression for non-linear relationships
model_poly <- tl_model(split_reg$train, mpg ~ wt, method = "polynomial", degree = 2)
print(model_poly)
# Predictions
preds_poly <- predict(model_poly, new_data = split_reg$test)

# RMSE
rmse_poly <- sqrt(mean((preds_poly$.pred - split_reg$test$mpg)^2))
cat("Polynomial RMSE:", round(rmse_poly, 2), "\n")
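
To see what the quadratic term adds, plot the fitted curve against the held-out observations. A minimal base-graphics sketch:

# Plot test observations and the fitted polynomial curve (sorted by wt)
ord <- order(split_reg$test$wt)
plot(split_reg$test$wt, split_reg$test$mpg, xlab = "wt", ylab = "mpg")
lines(split_reg$test$wt[ord], preds_poly$.pred[ord], col = "blue", lwd = 2)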

Random Forest Regression

# Train random forest for regression
model_rf_reg <- tl_model(split_reg$train, mpg ~ ., method = "forest")
print(model_rf_reg)
# Predictions
preds_rf <- predict(model_rf_reg, new_data = split_reg$test)

# RMSE
rmse_rf <- sqrt(mean((preds_rf$.pred - split_reg$test$mpg)^2))
cat("Random Forest RMSE:", round(rmse_rf, 2), "\n")

Regularized Regression

Regularization helps prevent overfitting by penalizing model complexity: ridge regression adds an L2 penalty on the squared coefficients, LASSO adds an L1 penalty on their absolute values (which can shrink some coefficients exactly to zero, effectively performing feature selection), and elastic net mixes the two penalties via the alpha parameter.

Ridge Regression

# Ridge regression (L2 regularization)
model_ridge <- tl_model(split_reg$train, mpg ~ ., method = "ridge")
print(model_ridge)

# Predictions
preds_ridge <- predict(model_ridge, new_data = split_reg$test)

LASSO

# LASSO (L1 regularization) - performs feature selection
model_lasso <- tl_model(split_reg$train, mpg ~ ., method = "lasso")
print(model_lasso)

# Predictions
preds_lasso <- predict(model_lasso, new_data = split_reg$test)

Elastic Net

# Elastic Net - combines L1 and L2 regularization
model_enet <- tl_model(split_reg$train, mpg ~ ., method = "elastic_net", alpha = 0.5)
print(model_enet)

# Predictions
preds_enet <- predict(model_enet, new_data = split_reg$test)

Model Comparison

# Compare multiple models
models <- list(
  linear = tl_model(split_reg$train, mpg ~ ., method = "linear"),
  tree = tl_model(split_reg$train, mpg ~ ., method = "tree"),
  forest = tl_model(split_reg$train, mpg ~ ., method = "forest")
)
# Calculate RMSE for each model
results <- data.frame(
  Model = character(),
  RMSE = numeric(),
  stringsAsFactors = FALSE
)

for (model_name in names(models)) {
  preds <- predict(models[[model_name]], new_data = split_reg$test)
  rmse <- sqrt(mean((preds$.pred - split_reg$test$mpg)^2))

  results <- rbind(results, data.frame(
    Model = model_name,
    RMSE = rmse
  ))
}

results <- results %>% arrange(RMSE)
print(results)
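
The same comparison can be written more compactly with sapply(), which avoids growing the results data frame inside the loop:

# Compact alternative: test RMSE for each fitted model in one pass
rmse_by_model <- sapply(models, function(m) {
  preds <- predict(m, new_data = split_reg$test)
  sqrt(mean((preds$.pred - split_reg$test$mpg)^2))
})
sort(rmse_by_model)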

Advanced Features

Using Preprocessed Data

# Preprocess data
processed <- tl_prepare_data(
  split_reg$train,
  mpg ~ .,
  scale_method = "standardize",
  remove_correlated = TRUE,
  correlation_cutoff = 0.9
)
# Train on preprocessed data
model_processed <- tl_model(processed$data, mpg ~ ., method = "linear")
print(model_processed)

Formula Variations

# Interaction terms
model_interact <- tl_model(split_reg$train, mpg ~ wt * hp, method = "linear")

# Polynomial terms using I()
model_poly_manual <- tl_model(split_reg$train, mpg ~ wt + I(wt^2), method = "linear")

# Subset of predictors
model_subset <- tl_model(split_reg$train, mpg ~ wt + hp + disp, method = "linear")
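
These models are evaluated exactly like any other; for example, the test RMSE of the interaction model:

# Test RMSE for the interaction model
preds_interact <- predict(model_interact, new_data = split_reg$test)
sqrt(mean((preds_interact$.pred - split_reg$test$mpg)^2))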

Handling Different Data Types

Categorical Predictors

# Create dataset with categorical variables
mtcars_cat <- mtcars %>%
  mutate(
    cyl = as.factor(cyl),
    gear = as.factor(gear),
    am = as.factor(am)
  )

split_cat <- tl_split(mtcars_cat, prop = 0.7, seed = 123)

# Model with categorical predictors
model_cat <- tl_model(split_cat$train, mpg ~ ., method = "forest")
print(model_cat)

Missing Values

# Create data with missing values
mtcars_missing <- mtcars
mtcars_missing[sample(1:nrow(mtcars_missing), 5), "hp"] <- NA
mtcars_missing[sample(1:nrow(mtcars_missing), 3), "wt"] <- NA

# Preprocess to handle missing values
processed_missing <- tl_prepare_data(
  mtcars_missing,
  mpg ~ .,
  impute_method = "mean",
  scale_method = "standardize"
)

# Train model
model_imputed <- tl_model(processed_missing$data, mpg ~ ., method = "linear")
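
A quick sanity check confirms that imputation removed all missing values before modeling (the completed data is in the $data element, as above):

# Confirm no missing values remain after imputation
anyNA(processed_missing$data)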

Best Practices

  1. Always split your data before training to properly evaluate performance
  2. Use stratified splitting for classification to maintain class proportions
  3. Preprocess your data for better model performance
  4. Compare multiple models to find the best approach
  5. Consider regularization when dealing with many predictors
  6. Use appropriate metrics - accuracy for classification, RMSE/MAE for regression (see the helper sketch below)
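
The metrics used informally throughout this vignette are easy to wrap in small helper functions. A minimal base-R sketch (the function names are illustrative, not part of tidylearn):

# Simple metric helpers
rmse_metric     <- function(actual, predicted) sqrt(mean((actual - predicted)^2))
mae_metric      <- function(actual, predicted) mean(abs(actual - predicted))
accuracy_metric <- function(actual, predicted) mean(actual == predicted)

# Example: recompute the linear model's test RMSE from earlier
rmse_metric(split_reg$test$mpg, preds_lm$.pred)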

Summary

tidylearn provides a unified interface for supervised learning:

# Complete workflow example
final_split <- tl_split(iris, prop = 0.7, stratify = "Species", seed = 42)
final_model <- tl_model(final_split$train, Species ~ ., method = "forest")
final_preds <- predict(final_model, new_data = final_split$test)
# Note: if tl_prepare_data() is used on the training data (e.g. to standardize
# or impute), apply the same preprocessing to new data before predicting

# Evaluate
accuracy <- mean(final_preds$.pred == final_split$test$Species)
cat("Test Accuracy:", round(accuracy * 100, 1), "%\n")

