knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width = 7, fig.height = 5 )
library(tidylearn) library(dplyr) library(ggplot2)
This vignette showcases tidylearn's capability to seamlessly integrate supervised and unsupervised learning in multi-step workflows.
Important: These integration functions orchestrate the wrapped packages
(stats, glmnet, randomForest, cluster, etc.) rather than implementing new
algorithms. tidylearn provides the workflow coordination, while the underlying
packages do the computational work. Access raw model objects via model$fit.
Use PCA or MDS to reduce feature space before supervised learning.
# Reduce dimensions before classification reduced <- tl_reduce_dimensions(iris, response = "Species", method = "pca", n_components = 3) # Inspect reduced data head(reduced$data)
# Train classifier on reduced features (remove .obs_id column first) reduced_data <- reduced$data %>% select(-starts_with(".obs")) model_reduced <- tl_model(reduced_data, Species ~ ., method = "logistic") print(model_reduced)
# Make predictions preds <- predict(model_reduced) accuracy <- mean(preds$.pred == iris$Species) cat("Accuracy with PCA features:", round(accuracy * 100, 1), "%\n")
# Split data for fair comparison split <- tl_split(iris, prop = 0.7, stratify = "Species", seed = 123) # Model with original features model_original <- tl_model(split$train, Species ~ ., method = "logistic") preds_original <- predict(model_original, new_data = split$test) acc_original <- mean(preds_original$.pred == split$test$Species) # Model with PCA features reduced_train <- tl_reduce_dimensions(split$train, response = "Species", method = "pca", n_components = 3) # Remove .obs_id column before modeling (it's just an identifier) reduced_train_data <- reduced_train$data %>% select(-starts_with(".obs")) model_pca <- tl_model(reduced_train_data, Species ~ ., method = "logistic") # Need to transform test data using same PCA test_predictors <- split$test %>% select(-Species) test_transformed <- predict(reduced_train$reduction_model, new_data = test_predictors) test_transformed$Species <- split$test$Species test_transformed <- test_transformed %>% select(-starts_with(".obs")) preds_pca <- predict(model_pca, new_data = test_transformed) acc_pca <- mean(preds_pca$.pred == split$test$Species) # Compare results cat("Original features (4):", round(acc_original * 100, 1), "%\n") cat("PCA features (3):", round(acc_pca * 100, 1), "%\n") cat("Feature reduction:", round((1 - 3/4) * 100, 1), "%\n")
Add cluster assignments as features to boost supervised learning performance.
# Add cluster features data_clustered <- tl_add_cluster_features(iris, response = "Species", method = "kmeans", k = 3) # Check new features names(data_clustered)
# Train model with cluster features model_cluster <- tl_model(data_clustered, Species ~ ., method = "forest") print(model_cluster)
# Compare models with and without cluster features split_comp <- tl_split(iris, prop = 0.7, stratify = "Species", seed = 42) # Without cluster features model_no_cluster <- tl_model(split_comp$train, Species ~ ., method = "forest") preds_no_cluster <- predict(model_no_cluster, new_data = split_comp$test) acc_no_cluster <- mean(preds_no_cluster$.pred == split_comp$test$Species) # With cluster features train_clustered <- tl_add_cluster_features(split_comp$train, response = "Species", method = "kmeans", k = 3) model_with_cluster <- tl_model(train_clustered, Species ~ ., method = "forest") # Need to get cluster model for test data cluster_model <- attr(train_clustered, "cluster_model") test_clusters <- predict(cluster_model, new_data = split_comp$test[, -5]) test_clustered <- split_comp$test test_clustered$cluster_kmeans <- as.factor(test_clusters$cluster) preds_with_cluster <- predict(model_with_cluster, new_data = test_clustered) acc_with_cluster <- mean(preds_with_cluster$.pred == split_comp$test$Species) cat("Without cluster features:", round(acc_no_cluster * 100, 1), "%\n") cat("With cluster features:", round(acc_with_cluster * 100, 1), "%\n")
Train models with limited labels using cluster-based label propagation.
# Use only 10% of labels set.seed(123) labeled_indices <- sample(nrow(iris), size = 15) # Only 15 out of 150 labeled! # Train semi-supervised model model_semi <- tl_semisupervised(iris, Species ~ ., labeled_indices = labeled_indices, cluster_method = "kmeans", supervised_method = "logistic") print(model_semi)
# Check how labels were propagated label_mapping <- model_semi$semisupervised_info$label_mapping print(label_mapping)
# Evaluate performance preds_semi <- predict(model_semi) accuracy_semi <- mean(preds_semi$.pred == iris$Species) cat("Accuracy with only", length(labeled_indices), "labels:", round(accuracy_semi * 100, 1), "%\n") cat("Proportion of data labeled:", round(length(labeled_indices)/nrow(iris) * 100, 1), "%\n")
# Fully supervised with same amount of data labeled_data <- iris[labeled_indices, ] model_full <- tl_model(labeled_data, Species ~ ., method = "logistic") preds_full <- predict(model_full, new_data = iris) accuracy_full <- mean(preds_full$.pred == iris$Species) cat("Fully supervised (15 samples):", round(accuracy_full * 100, 1), "%\n") cat("Semi-supervised (15 labels + propagation):", round(accuracy_semi * 100, 1), "%\n")
Detect and handle outliers before supervised learning.
# Flag anomalies as a feature model_anomaly_flag <- tl_anomaly_aware(iris, Species ~ ., response = "Species", anomaly_method = "dbscan", action = "flag", supervised_method = "logistic") # Check anomaly info cat("Anomalies detected:", model_anomaly_flag$anomaly_info$n_anomalies, "\n")
# Remove anomalies before training model_anomaly_remove <- tl_anomaly_aware(iris, Species ~ ., response = "Species", anomaly_method = "dbscan", action = "remove", supervised_method = "logistic") cat("Anomalies removed:", model_anomaly_remove$anomalies_removed, "\n")
Create cluster-specific models for heterogeneous data.
# Train separate models for different clusters stratified_models <- tl_stratified_models(mtcars, mpg ~ ., cluster_method = "kmeans", k = 3, supervised_method = "linear") # Check structure names(stratified_models) length(stratified_models$supervised_models)
# Predictions using stratified models preds_stratified <- predict(stratified_models) head(preds_stratified)
# Calculate RMSE rmse_stratified <- sqrt(mean((preds_stratified$.pred - mtcars$mpg)^2)) cat("Stratified Model RMSE:", round(rmse_stratified, 2), "\n") # Compare with single model model_single <- tl_model(mtcars, mpg ~ ., method = "linear") preds_single <- predict(model_single) rmse_single <- sqrt(mean((preds_single$.pred - mtcars$mpg)^2)) cat("Single Model RMSE:", round(rmse_single, 2), "\n")
Combining multiple integration techniques:
# Step 1: Split data workflow_split <- tl_split(iris, prop = 0.7, stratify = "Species", seed = 42) # Step 2: Reduce dimensions workflow_reduced <- tl_reduce_dimensions(workflow_split$train, response = "Species", method = "pca", n_components = 3) # Step 3: Add cluster features to reduced data (remove .obs_id first) workflow_reduced_clean <- workflow_reduced$data %>% select(-starts_with(".obs")) workflow_clustered <- tl_add_cluster_features(workflow_reduced_clean, response = "Species", method = "kmeans", k = 3) # Step 4: Train final model workflow_model <- tl_model(workflow_clustered, Species ~ ., method = "forest") print(workflow_model)
# Transform test data through same pipeline # 1. Apply PCA transformation test_pca <- predict(workflow_reduced$reduction_model, new_data = workflow_split$test[, -5]) test_pca$Species <- workflow_split$test$Species # 2. Get cluster assignments cluster_model_wf <- attr(workflow_clustered, "cluster_model") test_clusters_wf <- predict(cluster_model_wf, new_data = test_pca[, grep("PC", names(test_pca))]) test_pca$cluster_kmeans <- as.factor(test_clusters_wf$cluster) # 3. Predict workflow_preds <- predict(workflow_model, new_data = test_pca) workflow_accuracy <- mean(workflow_preds$.pred == workflow_split$test$Species) cat("Complete Workflow Accuracy:", round(workflow_accuracy * 100, 1), "%\n")
# Simulate credit data set.seed(42) n <- 500 credit_data <- data.frame( age = rnorm(n, 40, 12), income = rnorm(n, 50000, 20000), debt_ratio = runif(n, 0, 0.5), credit_score = rnorm(n, 700, 100), years_employed = rpois(n, 5) ) # Create target variable (default risk) credit_data$default <- factor( ifelse(credit_data$debt_ratio > 0.4 & credit_data$credit_score < 650, "Yes", "No") ) # Split data credit_split <- tl_split(credit_data, prop = 0.7, stratify = "default", seed = 123)
# Strategy 1: Add customer segments as features credit_clustered <- tl_add_cluster_features(credit_split$train, response = "default", method = "kmeans", k = 4) model_credit <- tl_model(credit_clustered, default ~ ., method = "forest") # Transform test data cluster_model_credit <- attr(credit_clustered, "cluster_model") test_clusters_credit <- predict(cluster_model_credit, new_data = credit_split$test[, -6]) test_credit <- credit_split$test test_credit$cluster_kmeans <- as.factor(test_clusters_credit$cluster) preds_credit <- predict(model_credit, new_data = test_credit) accuracy_credit <- mean(preds_credit$.pred == credit_split$test$default) cat("Credit Risk Model Accuracy:", round(accuracy_credit * 100, 1), "%\n")
tidylearn's integration functions provide powerful ways to combine supervised and unsupervised learning:
tl_reduce_dimensions(): Use PCA/MDS as preprocessingtl_add_cluster_features(): Engineer features from clusterstl_semisupervised(): Train with limited labelstl_anomaly_aware(): Handle outliers intelligentlytl_stratified_models(): Create cluster-specific modelsThese functions orchestrate the underlying packages (stats, cluster, glmnet, randomForest, etc.) to enable multi-step workflows with a consistent interface.
# Final integrated example final_data <- iris final_split <- tl_split(final_data, prop = 0.7, stratify = "Species", seed = 999) # Combine PCA + clustering final_reduced <- tl_reduce_dimensions(final_split$train, response = "Species", method = "pca", n_components = 3) # Remove .obs_id column before clustering final_reduced_clean <- final_reduced$data %>% select(-starts_with(".obs")) final_clustered <- tl_add_cluster_features(final_reduced_clean, response = "Species", method = "kmeans", k = 3) final_model <- tl_model(final_clustered, Species ~ ., method = "logistic") cat("Final integrated model created successfully!\n") print(final_model)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.