Unsupervised Learning with tidylearn

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 5
)
library(tidylearn)
library(dplyr)
library(ggplot2)

Introduction

This vignette explores unsupervised learning in tidylearn. All methods wrap established R packages: the algorithms are unchanged; tidylearn simply provides a consistent interface and tidy output.

Wrapped packages include stats (PCA, MDS, k-means, hierarchical clustering), cluster (PAM, CLARA), and dbscan (DBSCAN).

The model$fit list holds tidy components (e.g. $clusters, $variance_explained) along with the raw fitted object at model$fit$model for package-specific functionality.
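
For example, assuming the PCA method wraps stats::prcomp (and, as in the chunks below, that the raw object is stored at $fit$model), the usual base R generics apply:

m <- tl_model(iris[, 1:4], method = "pca")
summary(m$fit$model)  # summary.prcomp on the underlying fit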

Dimensionality Reduction

Dimensionality reduction techniques help visualize high-dimensional data and extract key patterns.

Principal Component Analysis (PCA)

# Perform PCA on iris data (excluding species)
model_pca <- tl_model(iris[, 1:4], method = "pca")
print(model_pca)
# Extract variance explained
variance_explained <- model_pca$fit$variance_explained
print(variance_explained)
# Cumulative variance explained
cumsum(variance_explained$prop_variance)
# Transform data to principal components
pca_scores <- predict(model_pca)
head(pca_scores)
# Visualize first two components
pca_plot_data <- pca_scores %>%
  mutate(Species = iris$Species)

ggplot(pca_plot_data, aes(x = PC1, y = PC2, color = Species)) +
  geom_point(size = 3, alpha = 0.7) +
  labs(
    title = "PCA of Iris Dataset",
    x = paste0("PC1 (", round(variance_explained$prop_variance[1] * 100, 1), "%)"),
    y = paste0("PC2 (", round(variance_explained$prop_variance[2] * 100, 1), "%)")
  ) +
  theme_minimal()
# Examine loadings (variable contributions)
loadings <- model_pca$fit$loadings
print(loadings)
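
Component scores are just the centered data multiplied by the loadings. A base-R cross-check of that relationship, independent of tidylearn (prcomp centers but does not scale by default):

p <- prcomp(iris[, 1:4])
# subtract column means, then project onto the loadings (rotation matrix)
manual_scores <- as.matrix(sweep(iris[, 1:4], 2, colMeans(iris[, 1:4]))) %*% p$rotation
max(abs(manual_scores - p$x))  # effectively zero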

Multidimensional Scaling (MDS)

# Perform MDS
model_mds <- tl_model(iris[, 1:4], method = "mds", k = 2)
print(model_mds)
# Extract MDS coordinates
mds_points <- predict(model_mds)
head(mds_points)
# Visualize MDS
mds_plot_data <- mds_points %>%
  mutate(Species = iris$Species)

ggplot(mds_plot_data, aes(x = Dim1, y = Dim2, color = Species)) +
  geom_point(size = 3, alpha = 0.7) +
  labs(title = "MDS of Iris Dataset") +
  theme_minimal()
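
With Euclidean distances, classical MDS recovers the same configuration as PCA up to sign flips; a base-R sketch of the equivalence:

mds_base <- cmdscale(dist(iris[, 1:4]), k = 2)
pca_base <- prcomp(iris[, 1:4])$x[, 1:2]
# compare absolute values to allow for per-axis sign flips
max(abs(abs(mds_base) - abs(pca_base)))  # near zero: same configuration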

Clustering

Clustering algorithms group similar observations together without using labels.

K-means Clustering

# Perform k-means with k=3
model_kmeans <- tl_model(iris[, 1:4], method = "kmeans", k = 3)
print(model_kmeans)
# Extract cluster assignments
clusters <- model_kmeans$fit$clusters
head(clusters)
# Compare clusters with actual species
table(Cluster = clusters$cluster, Species = iris$Species)
# Visualize clusters using PCA
cluster_viz <- pca_scores %>%
  mutate(
    Cluster = as.factor(clusters$cluster),
    Species = iris$Species
  )

ggplot(cluster_viz, aes(x = PC1, y = PC2, color = Cluster, shape = Species)) +
  geom_point(size = 3, alpha = 0.7) +
  labs(title = "K-means Clusters vs True Species") +
  theme_minimal()
# Access cluster centers
centers <- model_kmeans$fit$centers
print(centers)
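
Each k-means center is simply the mean of the observations assigned to that cluster, which is easy to verify with base R:

# per-cluster feature means should match the centers above
aggregate(iris[, 1:4], by = list(cluster = clusters$cluster), FUN = mean)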

PAM (K-medoids)

PAM (Partitioning Around Medoids) uses actual observations as cluster centers (medoids), which makes it more robust to outliers than k-means:

# Perform PAM clustering
model_pam <- tl_model(iris[, 1:4], method = "pam", k = 3)
print(model_pam)

# Extract clusters
clusters_pam <- model_pam$fit$clusters
table(Cluster = clusters_pam$cluster, Species = iris$Species)
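
The robustness comes from PAM's centers being actual observations, whereas k-means centers are averages that an extreme point can pull away from the data. A sketch using stats::kmeans and cluster::pam directly:

library(cluster)
x_out <- rbind(iris[, 1:4], c(50, 50, 50, 50))  # append one extreme outlier
set.seed(1)
kmeans(x_out, centers = 3, nstart = 10)$centers  # means: inspect how the outlier pulls them
pam(x_out, k = 3)$medoids                        # medoids: always rows of the data itself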

Hierarchical Clustering

# Perform hierarchical clustering
model_hclust <- tl_model(iris[, 1:4], method = "hclust")
print(model_hclust)
# Plot dendrogram
plot(model_hclust$fit$model, labels = FALSE, main = "Hierarchical Clustering of Iris")
# Cut tree to get clusters
k <- 3
clusters_hc <- cutree(model_hclust$fit$model, k = k)
table(Cluster = clusters_hc, Species = iris$Species)
# Visualize hierarchical clusters
hc_viz <- pca_scores %>%
  mutate(
    Cluster = as.factor(clusters_hc),
    Species = iris$Species
  )

ggplot(hc_viz, aes(x = PC1, y = PC2, color = Cluster)) +
  geom_point(size = 3, alpha = 0.7) +
  labs(title = "Hierarchical Clustering Results") +
  theme_minimal()
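
The linkage method strongly affects the tree. The default linkage used by tl_model is not shown above, but you can compare linkages directly with stats::hclust:

d <- dist(iris[, 1:4])
for (linkage in c("complete", "average", "ward.D2")) {
  hc <- hclust(d, method = linkage)
  cat("\nLinkage:", linkage, "\n")
  print(table(Cluster = cutree(hc, k = 3), Species = iris$Species))
}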

DBSCAN (Density-Based Clustering)

DBSCAN can find arbitrarily shaped clusters and identify outliers:

# Perform DBSCAN
model_dbscan <- tl_model(iris[, 1:4], method = "dbscan", eps = 0.5, minPts = 5)
print(model_dbscan)

# Extract clusters (0 = noise/outliers)
clusters_dbscan <- model_dbscan$fit$clusters
table(clusters_dbscan$cluster)

# Compare with species
table(Cluster = clusters_dbscan$cluster, Species = iris$Species)
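
On iris, density-based clustering typically merges the overlapping versicolor and virginica groups. DBSCAN's real strength is non-convex shapes; a sketch on two synthetic rings, using the dbscan package directly (k-means could not separate these):

set.seed(1)
theta <- runif(300, 0, 2 * pi)
rings <- data.frame(
  x = c(cos(theta[1:150]), 3 * cos(theta[151:300])) + rnorm(300, sd = 0.1),
  y = c(sin(theta[1:150]), 3 * sin(theta[151:300])) + rnorm(300, sd = 0.1)
)
db <- dbscan::dbscan(as.matrix(rings), eps = 0.5, minPts = 5)
table(db$cluster)  # one cluster per ring; any 0s are noise points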

CLARA (for Large Datasets)

CLARA (Clustering Large Applications) scales PAM to large datasets by running it on random subsamples and keeping the best set of medoids:

# Create larger dataset
large_data <- iris[rep(1:nrow(iris), 10), 1:4]

# Perform CLARA
model_clara <- tl_model(large_data, method = "clara", k = 3, samples = 5)
print(model_clara)

# Extract clusters
clusters_clara <- model_clara$fit$clusters
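
The efficiency gain is easy to check with the cluster package directly (exact timings depend on your machine):

library(cluster)
system.time(pam(large_data, k = 3))                 # full PAM: all pairwise dissimilarities
system.time(clara(large_data, k = 3, samples = 5))  # PAM on small random subsamples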

Choosing the Number of Clusters

Elbow Method

# Try different values of k
k_values <- 2:8
within_ss <- numeric(length(k_values))

for (i in seq_along(k_values)) {
  k <- k_values[i]
  model <- tl_model(iris[, 1:4], method = "kmeans", k = k)
  within_ss[i] <- model$fit$model$tot.withinss
}

# Plot elbow curve
elbow_data <- data.frame(k = k_values, within_ss = within_ss)

ggplot(elbow_data, aes(x = k, y = within_ss)) +
  geom_line(linewidth = 1) +
  geom_point(size = 3) +
  labs(
    title = "Elbow Method for Optimal k",
    x = "Number of Clusters (k)",
    y = "Total Within-Cluster Sum of Squares"
  ) +
  theme_minimal()
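
Silhouette analysis is a common complement to the elbow method: choose the k with the highest average silhouette width. A sketch using cluster::silhouette directly:

library(cluster)
set.seed(1)
d <- dist(iris[, 1:4])
avg_sil <- sapply(2:8, function(k) {
  km <- kmeans(iris[, 1:4], centers = k, nstart = 10)
  mean(silhouette(km$cluster, d)[, "sil_width"])
})
data.frame(k = 2:8, avg_sil)  # pick the k with the highest average width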

Predicting on New Data

Clustering New Observations

# Train clustering model
model_train <- tl_model(iris[1:100, 1:4], method = "kmeans", k = 3)

# Predict cluster assignments for new data
new_data <- iris[101:150, 1:4]
new_clusters <- predict(model_train, new_data = new_data)

head(new_clusters)
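
For k-means, prediction amounts to assigning each new point to its nearest center. A manual cross-check, assuming the raw kmeans object is stored at $fit$model as in the elbow-method chunk above:

centers <- model_train$fit$model$centers
nearest <- apply(new_data, 1, function(row) {
  which.min(colSums((t(centers) - as.numeric(row))^2))  # squared distance to each center
})
head(nearest)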

Transforming New Data with PCA

# Train PCA model
pca_train <- tl_model(iris[1:100, 1:4], method = "pca")

# Transform new data
new_pca <- predict(pca_train, new_data = new_data)
head(new_pca)
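
Under the hood (assuming prcomp), new data is centered with the training means, not its own means, before projection onto the rotation matrix:

p <- prcomp(iris[1:100, 1:4])
head(predict(p, newdata = new_data))  # base R equivalent
head(scale(new_data, center = p$center, scale = FALSE) %*% p$rotation)  # by hand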

Combining Multiple Techniques

PCA followed by Clustering

# Reduce dimensions with PCA
pca_model <- tl_model(iris[, 1:4], method = "pca")
pca_data <- predict(pca_model)

# Select first 2 components
pca_reduced <- pca_data %>% select(PC1, PC2)

# Cluster in reduced space
kmeans_pca <- tl_model(pca_reduced, method = "kmeans", k = 3)
clusters_pca <- kmeans_pca$fit$clusters

# Visualize
viz_combined <- pca_data %>%
  mutate(
    Cluster = as.factor(clusters_pca$cluster),
    Species = iris$Species
  )

ggplot(viz_combined, aes(x = PC1, y = PC2, color = Cluster, shape = Species)) +
  geom_point(size = 3, alpha = 0.7) +
  labs(title = "Clustering in PCA Space") +
  theme_minimal()

Practical Applications

Customer Segmentation

# Simulate customer data
set.seed(42)
customers <- data.frame(
  age = rnorm(200, 40, 15),
  income = rnorm(200, 50000, 20000),
  spending_score = rnorm(200, 50, 25)
)

# Standardize features
customers_scaled <- scale(customers) %>% as.data.frame()

# Cluster customers
customer_segments <- tl_model(customers_scaled, method = "kmeans", k = 4)
customers$segment <- customer_segments$fit$clusters$cluster

# Visualize segments
ggplot(customers, aes(x = income, y = spending_score, color = as.factor(segment))) +
  geom_point(size = 3, alpha = 0.7) +
  labs(
    title = "Customer Segmentation",
    color = "Segment"
  ) +
  theme_minimal()

Feature Extraction

# Use PCA for feature extraction
pca_features <- tl_model(mtcars, method = "pca")

# Keep components explaining 90% of variance
var_exp <- pca_features$fit$variance_explained
cumulative_var <- cumsum(var_exp$prop_variance)
n_components <- which(cumulative_var >= 0.90)[1]

cat("Components needed for 90% variance:", n_components, "\n")
cat("Original features:", ncol(mtcars), "\n")
cat("Dimension reduction:", round((1 - n_components/ncol(mtcars)) * 100, 1), "%\n")

Best Practices

  1. Scale your data before clustering or PCA so that every feature contributes comparably (see the sketch after this list)
  2. Determine optimal k using elbow method or silhouette analysis
  3. Try multiple methods - different algorithms work better for different data
  4. Visualize results to understand cluster structure
  5. Consider domain knowledge when interpreting clusters
  6. Use PCA for visualization when data has more than 2-3 dimensions
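
As a quick illustration of point 1, clustering mtcars with and without standardization usually yields different partitions, because large-scale features such as disp and hp dominate raw Euclidean distances (base R sketch):

set.seed(1)
raw_fit    <- kmeans(mtcars, centers = 3, nstart = 10)
scaled_fit <- kmeans(scale(mtcars), centers = 3, nstart = 10)
table(raw = raw_fit$cluster, scaled = scaled_fit$cluster)  # cross-tabulate the partitions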

Summary

tidylearn provides comprehensive unsupervised learning tools:

# Complete unsupervised workflow
workflow_data <- iris[, 1:4]

# 1. Reduce dimensions
pca_final <- tl_model(workflow_data, method = "pca")

# 2. Cluster in reduced space
pca_coords <- predict(pca_final) %>% select(PC1, PC2)
clusters_final <- tl_model(pca_coords, method = "kmeans", k = 3)

# 3. Visualize
final_viz <- pca_coords %>%
  mutate(
    Cluster = as.factor(clusters_final$fit$clusters$cluster),
    Species = iris$Species
  )

ggplot(final_viz, aes(x = PC1, y = PC2, color = Cluster)) +
  geom_point(size = 3, alpha = 0.7) +
  labs(title = "Complete Unsupervised Workflow") +
  theme_minimal()

