```r
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 5
)
```

```r
library(tidylearn)
library(dplyr)
library(ggplot2)
```
This vignette explores unsupervised learning in tidylearn. All methods wrap established R packages: the algorithms are unchanged, and tidylearn simply provides a consistent interface and tidy output.
Wrapped packages include:

- stats: `prcomp()`, `kmeans()`, `hclust()`, `cmdscale()`
- cluster: `pam()`, `clara()`
- MASS: `isoMDS()`, `sammon()`
- dbscan: `dbscan()`

Access raw model objects via `model$fit` for package-specific functionality.
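For example, you can pull out the underlying fitted object and call any package-specific generic on it directly. A minimal sketch, assuming the PCA wrapper stores its raw `prcomp` object in the same `$fit$model` slot that the hclust and k-means examples below use:

```r
# Fit a model, then drop to the raw prcomp object for anything
# the tidy interface does not expose directly
pca_raw <- tl_model(iris[, 1:4], method = "pca")$fit$model
summary(pca_raw)
```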
Dimensionality reduction techniques help visualize high-dimensional data and extract key patterns.
```r
# Perform PCA on iris data (excluding species)
model_pca <- tl_model(iris[, 1:4], method = "pca")
print(model_pca)
```
```r
# Extract variance explained
variance_explained <- model_pca$fit$variance_explained
print(variance_explained)
```
```r
# Cumulative variance explained
cumsum(variance_explained$prop_variance)
```
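A scree plot of the same proportions often makes the cut-off easier to judge than the raw numbers:

```r
# Scree plot: variance explained by each component
scree_data <- data.frame(
  component = seq_along(variance_explained$prop_variance),
  prop_variance = variance_explained$prop_variance
)

ggplot(scree_data, aes(x = component, y = prop_variance)) +
  geom_col() +
  labs(
    title = "Scree Plot",
    x = "Principal Component",
    y = "Proportion of Variance Explained"
  ) +
  theme_minimal()
```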
```r
# Transform data to principal components
pca_scores <- predict(model_pca)
head(pca_scores)
```
```r
# Visualize first two components
pca_plot_data <- pca_scores %>%
  mutate(Species = iris$Species)

ggplot(pca_plot_data, aes(x = PC1, y = PC2, color = Species)) +
  geom_point(size = 3, alpha = 0.7) +
  labs(
    title = "PCA of Iris Dataset",
    x = paste0("PC1 (", round(variance_explained$prop_variance[1] * 100, 1), "%)"),
    y = paste0("PC2 (", round(variance_explained$prop_variance[2] * 100, 1), "%)")
  ) +
  theme_minimal()
```
```r
# Examine loadings (variable contributions)
loadings <- model_pca$fit$loadings
print(loadings)
```
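Loadings are easier to read overlaid on the score plot, biplot-style. A sketch, assuming `loadings` keeps the variables as row names with `PC1`/`PC2` columns like a `prcomp` rotation matrix (adjust if tidylearn returns a tidy data frame instead):

```r
# Draw each variable as a segment from the origin; the factor of 3
# is an arbitrary rescaling to make the segments visible
loading_df <- as.data.frame(loadings)
loading_df$variable <- rownames(loading_df)

ggplot(pca_plot_data, aes(x = PC1, y = PC2)) +
  geom_point(aes(color = Species), alpha = 0.5) +
  geom_segment(
    data = loading_df,
    aes(x = 0, y = 0, xend = PC1 * 3, yend = PC2 * 3)
  ) +
  geom_text(
    data = loading_df,
    aes(x = PC1 * 3, y = PC2 * 3, label = variable),
    vjust = -0.5
  ) +
  labs(title = "PCA Biplot (Scores and Loadings)") +
  theme_minimal()
```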
```r
# Perform MDS
model_mds <- tl_model(iris[, 1:4], method = "mds", k = 2)
print(model_mds)
```
```r
# Extract MDS coordinates
mds_points <- predict(model_mds)
head(mds_points)
```
```r
# Visualize MDS
mds_plot_data <- mds_points %>%
  mutate(Species = iris$Species)

ggplot(mds_plot_data, aes(x = Dim1, y = Dim2, color = Species)) +
  geom_point(size = 3, alpha = 0.7) +
  labs(title = "MDS of Iris Dataset") +
  theme_minimal()
```
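How faithful is the two-dimensional embedding? One quick check is to correlate the original four-dimensional distances with the distances in MDS space:

```r
# Classical MDS aims to preserve pairwise distances
d_original <- dist(iris[, 1:4])
d_embedded <- dist(mds_points[, c("Dim1", "Dim2")])
cor(as.vector(d_original), as.vector(d_embedded))
```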
Clustering algorithms group similar observations together without using labels.
```r
# Perform k-means with k = 3
model_kmeans <- tl_model(iris[, 1:4], method = "kmeans", k = 3)
print(model_kmeans)
```
```r
# Extract cluster assignments
clusters <- model_kmeans$fit$clusters
head(clusters)
```
```r
# Compare clusters with actual species
table(Cluster = clusters$cluster, Species = iris$Species)
```
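One way to summarize this table in a single number is cluster purity, the share of observations that belong to the majority species of their cluster:

```r
# Purity of 1 would mean every cluster maps to a single species
conf_mat <- table(clusters$cluster, iris$Species)
sum(apply(conf_mat, 1, max)) / sum(conf_mat)
```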
```r
# Visualize clusters using PCA
cluster_viz <- pca_scores %>%
  mutate(
    Cluster = as.factor(clusters$cluster),
    Species = iris$Species
  )

ggplot(cluster_viz, aes(x = PC1, y = PC2, color = Cluster, shape = Species)) +
  geom_point(size = 3, alpha = 0.7) +
  labs(title = "K-means Clusters vs True Species") +
  theme_minimal()
```
```r
# Access cluster centers
centers <- model_kmeans$fit$centers
print(centers)
```
PAM (Partitioning Around Medoids) is more robust to outliers than k-means:
```r
# Perform PAM clustering
model_pam <- tl_model(iris[, 1:4], method = "pam", k = 3)
print(model_pam)

# Extract clusters
clusters_pam <- model_pam$fit$clusters
table(Cluster = clusters_pam$cluster, Species = iris$Species)
```
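Unlike k-means centroids, PAM's centers are medoids, actual observations from the data, which is exactly why it resists outliers. A sketch, assuming the raw `cluster::pam` object sits in `$fit$model` like the hclust and k-means objects elsewhere in this vignette:

```r
# Each medoid is a real row of the input data, not an average
model_pam$fit$model$medoids
```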
```r
# Perform hierarchical clustering
model_hclust <- tl_model(iris[, 1:4], method = "hclust")
print(model_hclust)
```
```r
# Plot dendrogram
plot(model_hclust$fit$model, labels = FALSE, main = "Hierarchical Clustering of Iris")
```
```r
# Cut tree to get clusters
k <- 3
clusters_hc <- cutree(model_hclust$fit$model, k = k)
table(Cluster = clusters_hc, Species = iris$Species)
```
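To see where this cut falls, redraw the dendrogram and outline the three clusters with rect.hclust():

```r
# Outline the k = 3 clusters on the dendrogram
plot(model_hclust$fit$model, labels = FALSE, main = "Dendrogram with k = 3 Cut")
rect.hclust(model_hclust$fit$model, k = 3, border = "red")
```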
```r
# Visualize hierarchical clusters
hc_viz <- pca_scores %>%
  mutate(
    Cluster = as.factor(clusters_hc),
    Species = iris$Species
  )

ggplot(hc_viz, aes(x = PC1, y = PC2, color = Cluster)) +
  geom_point(size = 3, alpha = 0.7) +
  labs(title = "Hierarchical Clustering Results") +
  theme_minimal()
```
DBSCAN (Density-Based Spatial Clustering of Applications with Noise) can find arbitrarily shaped clusters and identify outliers:
```r
# Perform DBSCAN
model_dbscan <- tl_model(iris[, 1:4], method = "dbscan", eps = 0.5, minPts = 5)
print(model_dbscan)

# Extract clusters (0 = noise/outliers)
clusters_dbscan <- model_dbscan$fit$clusters
table(clusters_dbscan$cluster)

# Compare with species
table(Cluster = clusters_dbscan$cluster, Species = iris$Species)
```
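The iris clusters are roughly convex, so the example above does not really show off DBSCAN's strength. A sketch with simulated concentric rings, a shape k-means cannot separate, using the same tl_model() call:

```r
# Two noisy rings; density connects points within each ring,
# while the gap between rings is wider than eps
set.seed(123)
n <- 300
theta <- runif(n, 0, 2 * pi)
radius <- rep(c(1, 3), length.out = n)
rings <- data.frame(
  x = radius * cos(theta) + rnorm(n, sd = 0.1),
  y = radius * sin(theta) + rnorm(n, sd = 0.1)
)

model_rings <- tl_model(rings, method = "dbscan", eps = 0.5, minPts = 5)
rings$cluster <- as.factor(model_rings$fit$clusters$cluster)

ggplot(rings, aes(x = x, y = y, color = cluster)) +
  geom_point(alpha = 0.7) +
  labs(title = "DBSCAN on Concentric Rings") +
  theme_minimal()
```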
CLARA (Clustering Large Applications) applies PAM to subsamples of the data, making it efficient for large datasets:
```r
# Create larger dataset
large_data <- iris[rep(1:nrow(iris), 10), 1:4]

# Perform CLARA
model_clara <- tl_model(large_data, method = "clara", k = 3, samples = 5)
print(model_clara)

# Extract clusters
clusters_clara <- model_clara$fit$clusters
```
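To make the efficiency claim concrete, you can time CLARA against PAM on the replicated data. Results are machine-dependent; this is only a rough comparison:

```r
# PAM works from the full dissimilarity structure; CLARA subsamples
system.time(tl_model(large_data, method = "pam", k = 3))
system.time(tl_model(large_data, method = "clara", k = 3, samples = 5))
```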
A common way to choose k is the elbow method: fit k-means over a range of k and look for the point where the total within-cluster sum of squares stops dropping sharply.

```r
# Try different values of k
k_values <- 2:8
within_ss <- numeric(length(k_values))

for (i in seq_along(k_values)) {
  k <- k_values[i]
  model <- tl_model(iris[, 1:4], method = "kmeans", k = k)
  within_ss[i] <- model$fit$model$tot.withinss
}

# Plot elbow curve
elbow_data <- data.frame(k = k_values, within_ss = within_ss)

ggplot(elbow_data, aes(x = k, y = within_ss)) +
  geom_line(linewidth = 1) +
  geom_point(size = 3) +
  labs(
    title = "Elbow Method for Optimal k",
    x = "Number of Clusters (k)",
    y = "Total Within-Cluster Sum of Squares"
  ) +
  theme_minimal()
```
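The elbow is often ambiguous, so it helps to cross-check with another criterion such as average silhouette width (from the cluster package, which tidylearn already wraps). A sketch, assuming `clusters$cluster` is the integer vector returned by kmeans():

```r
library(cluster)

# Silhouette width near 1 means tight, well-separated clusters
sil <- silhouette(model_kmeans$fit$clusters$cluster, dist(iris[, 1:4]))
mean(sil[, "sil_width"])
```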
```r
# Train clustering model
model_train <- tl_model(iris[1:100, 1:4], method = "kmeans", k = 3)

# Predict cluster assignments for new data
new_data <- iris[101:150, 1:4]
new_clusters <- predict(model_train, new_data = new_data)
head(new_clusters)
```
```r
# Train PCA model
pca_train <- tl_model(iris[1:100, 1:4], method = "pca")

# Transform new data
new_pca <- predict(pca_train, new_data = new_data)
head(new_pca)
```
```r
# Reduce dimensions with PCA
pca_model <- tl_model(iris[, 1:4], method = "pca")
pca_data <- predict(pca_model)

# Select first 2 components
pca_reduced <- pca_data %>% select(PC1, PC2)

# Cluster in reduced space
kmeans_pca <- tl_model(pca_reduced, method = "kmeans", k = 3)
clusters_pca <- kmeans_pca$fit$clusters

# Visualize
viz_combined <- pca_data %>%
  mutate(
    Cluster = as.factor(clusters_pca$cluster),
    Species = iris$Species
  )

ggplot(viz_combined, aes(x = PC1, y = PC2, color = Cluster, shape = Species)) +
  geom_point(size = 3, alpha = 0.7) +
  labs(title = "Clustering in PCA Space") +
  theme_minimal()
```
```r
# Simulate customer data
set.seed(42)
customers <- data.frame(
  age = rnorm(200, 40, 15),
  income = rnorm(200, 50000, 20000),
  spending_score = rnorm(200, 50, 25)
)

# Standardize features
customers_scaled <- scale(customers) %>% as.data.frame()

# Cluster customers
customer_segments <- tl_model(customers_scaled, method = "kmeans", k = 4)
customers$segment <- customer_segments$fit$clusters$cluster

# Visualize segments
ggplot(customers, aes(x = income, y = spending_score, color = as.factor(segment))) +
  geom_point(size = 3, alpha = 0.7) +
  labs(
    title = "Customer Segmentation",
    color = "Segment"
  ) +
  theme_minimal()
```
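Averaging the original (unscaled) features by segment turns the cluster labels into interpretable profiles:

```r
# Profile each segment by its mean feature values
customers %>%
  group_by(segment) %>%
  summarise(across(c(age, income, spending_score), mean))
```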
```r
# Use PCA for feature extraction
pca_features <- tl_model(mtcars, method = "pca")

# Keep components explaining 90% of variance
var_exp <- pca_features$fit$variance_explained
cumulative_var <- cumsum(var_exp$prop_variance)
n_components <- which(cumulative_var >= 0.90)[1]

cat("Components needed for 90% variance:", n_components, "\n")
cat("Original features:", ncol(mtcars), "\n")
cat("Dimension reduction:", round((1 - n_components / ncol(mtcars)) * 100, 1), "%\n")
```
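The reduced feature set itself comes straight from predict(), keeping only the selected components:

```r
# Build the reduced feature matrix for downstream modeling
mtcars_reduced <- predict(pca_features)[, 1:n_components]
head(mtcars_reduced)
```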
tidylearn provides comprehensive unsupervised learning tools:

- A single `tl_model()` function for all methods
- Tidy output with a consistent `predict()` interface
- Raw model objects available via `model$fit`

```r
# Complete unsupervised workflow
workflow_data <- iris[, 1:4]

# 1. Reduce dimensions
pca_final <- tl_model(workflow_data, method = "pca")

# 2. Cluster in reduced space
pca_coords <- predict(pca_final) %>% select(PC1, PC2)
clusters_final <- tl_model(pca_coords, method = "kmeans", k = 3)

# 3. Visualize
final_viz <- pca_coords %>%
  mutate(
    Cluster = as.factor(clusters_final$fit$clusters$cluster),
    Species = iris$Species
  )

ggplot(final_viz, aes(x = PC1, y = PC2, color = Cluster)) +
  geom_point(size = 3, alpha = 0.7) +
  labs(title = "Complete Unsupervised Workflow") +
  theme_minimal()
```