R/integration.R

Defines functions predict.tidylearn_stratified tl_stratified_models tl_anomaly_aware tl_semisupervised tl_add_cluster_features tl_reduce_dimensions

Documented in predict.tidylearn_stratified tl_add_cluster_features tl_anomaly_aware tl_reduce_dimensions tl_semisupervised tl_stratified_models

#' Integration Functions: Combining Supervised and Unsupervised Learning
#'
#' These functions demonstrate the power of tidylearn's unified approach by
#' seamlessly integrating supervised and unsupervised learning techniques.

#' Feature Engineering via Dimensionality Reduction
#'
#' Use PCA, MDS, or other dimensionality reduction as a preprocessing step
#' for supervised learning. This can improve model performance and interpretability.
#'
#' @param data A data frame
#' @param response Response variable name (will be preserved)
#' @param method Dimensionality reduction method: "pca", "mds"
#' @param n_components Number of components to retain
#' @param ... Additional arguments for the dimensionality reduction method
#' @return A list containing the transformed data and the reduction model
#' @export
#' @examples
#' \donttest{
#' # Reduce dimensions before classification
#' reduced <- tl_reduce_dimensions(iris, response = "Species", method = "pca", n_components = 3)
#' model <- tl_model(reduced$data, Species ~ ., method = "logistic")
#' }
tl_reduce_dimensions <- function(data,
                                 response = NULL,
                                 method = "pca",
                                 n_components = NULL,
                                 ...) {
  # Separate response if provided
  if (!is.null(response)) {
    if (!response %in% names(data)) {
      stop("Response variable '", response, "' not found in data", call. = FALSE)
    }
    response_data <- data[[response]]
    predictor_data <- data %>% dplyr::select(-dplyr::all_of(response))
  } else {
    response_data <- NULL
    predictor_data <- data
  }

  # Apply dimensionality reduction
  reduction_model <- tl_model(predictor_data, method = method, ...)

  # Transform data
  if (method == "pca") {
    transformed <- reduction_model$fit$scores

    # Select components
    if (!is.null(n_components)) {
      pc_cols <- paste0("PC", seq_len(n_components))
      transformed <- transformed %>% dplyr::select(.obs_id, dplyr::all_of(pc_cols))
    }

    # Add response back
    if (!is.null(response)) {
      transformed[[response]] <- response_data
    }

  } else if (method == "mds") {
    transformed <- reduction_model$fit$points

    # Select dimensions
    if (!is.null(n_components)) {
      dim_cols <- paste0("Dim", seq_len(n_components))
      transformed <- transformed %>% dplyr::select(.obs_id, dplyr::all_of(dim_cols))
    }

    # Add response back
    if (!is.null(response)) {
      transformed[[response]] <- response_data
    }
  }

  list(
    data = transformed,
    reduction_model = reduction_model,
    original_data = data,
    response = response
  )
}

#' Cluster-Based Features
#'
#' Add cluster assignments as features for supervised learning.
#' This semi-supervised approach can capture non-linear patterns.
#'
#' @param data A data frame
#' @param response Response variable name (will be excluded from clustering)
#' @param method Clustering method: "kmeans", "pam", "hclust", "dbscan"
#' @param ... Additional arguments for clustering
#' @return Original data with cluster assignment column(s) added
#' @export
#' @examples
#' \donttest{
#' # Add cluster features before supervised learning
#' data_with_clusters <- tl_add_cluster_features(iris, response = "Species",
#'                                                 method = "kmeans", k = 3)
#' model <- tl_model(data_with_clusters, Species ~ ., method = "forest")
#' }
tl_add_cluster_features <- function(data, response = NULL, method = "kmeans", ...) {
  # Separate response if provided
  if (!is.null(response)) {
    if (!response %in% names(data)) {
      stop("Response variable '", response, "' not found in data", call. = FALSE)
    }
    predictor_data <- data %>% dplyr::select(-dplyr::all_of(response))
  } else {
    predictor_data <- data
  }

  # Perform clustering
  cluster_model <- tl_model(predictor_data, method = method, ...)

  # Extract cluster assignments
  if (method %in% c("kmeans", "pam", "clara")) {
    clusters <- cluster_model$fit$clusters$cluster
  } else if (method == "hclust") {
    # For hclust, we need k to cut the tree
    if (!"k" %in% names(list(...))) {
      warning("k not specified for hclust, using k=3")
      k <- 3
    } else {
      k <- list(...)$k
    }
    clusters <- stats::cutree(cluster_model$fit$model, k = k)
  } else if (method == "dbscan") {
    clusters <- cluster_model$fit$clusters$cluster
  }

  # Add to original data
  data_augmented <- data %>%
    dplyr::mutate(
      !!paste0("cluster_", method) := as.factor(clusters)
    )

  attr(data_augmented, "cluster_model") <- cluster_model
  data_augmented
}

#' Semi-Supervised Learning via Clustering
#'
#' Train a supervised model with limited labels by first clustering the data
#' and propagating labels within clusters.
#'
#' @param data A data frame
#' @param formula Model formula
#' @param labeled_indices Indices of labeled observations
#' @param cluster_method Clustering method for label propagation
#' @param supervised_method Supervised learning method for final model
#' @param ... Additional arguments
#' @return A tidylearn model trained on pseudo-labeled data
#' @export
#' @examples
#' \donttest{
#' # Use only 10% of labels
#' labeled_idx <- sample(nrow(iris), size = 15)
#' model <- tl_semisupervised(iris, Species ~ ., labeled_indices = labeled_idx,
#'                            cluster_method = "kmeans", supervised_method = "logistic")
#' }
tl_semisupervised <- function(data, formula, labeled_indices,
                              cluster_method = "kmeans",
                              supervised_method = "logistic", ...) {
  # Extract response variable
  response_var <- all.vars(formula)[1]

  # Create training data with only labeled observations
  labeled_data <- data[labeled_indices, ]
  unlabeled_data <- data[-labeled_indices, ]

  # Cluster the full dataset (excluding response)
  predictor_data <- data %>% dplyr::select(-dplyr::all_of(response_var))

  # Determine k from labeled data
  k <- length(unique(labeled_data[[response_var]]))

  cluster_model <- tl_model(predictor_data, method = cluster_method, k = k, ...)

  # Get cluster assignments
  clusters <- cluster_model$fit$clusters$cluster

  # Propagate labels within clusters
  cluster_labels <- tibble::tibble(
    obs_id = seq_len(nrow(data)),
    cluster = clusters,
    label = data[[response_var]]
  )

  # For each cluster, find the most common label from labeled data
  label_mapping <- cluster_labels %>%
    dplyr::filter(obs_id %in% labeled_indices) %>%
    dplyr::group_by(cluster) %>%
    dplyr::summarize(
      cluster_label = names(which.max(table(label))),
      .groups = "drop"
    )

  # Assign pseudo-labels to unlabeled data
  pseudo_labeled <- cluster_labels %>%
    dplyr::left_join(label_mapping, by = "cluster") %>%
    dplyr::mutate(
      final_label = dplyr::if_else(obs_id %in% labeled_indices, as.character(label), cluster_label)
    )

  # Create pseudo-labeled dataset
  data_pseudo <- data
  data_pseudo[[response_var]] <- as.factor(pseudo_labeled$final_label)

  # Train supervised model on pseudo-labeled data
  model <- tl_model(data_pseudo, formula, method = supervised_method, ...)

  # Add metadata
  model$semisupervised_info <- list(
    labeled_indices = labeled_indices,
    cluster_model = cluster_model,
    label_mapping = label_mapping
  )

  class(model) <- c("tidylearn_semisupervised", class(model))
  model
}

#' Anomaly-Aware Supervised Learning
#'
#' Detect outliers using DBSCAN or other methods, then optionally
#' remove them or down-weight them before supervised learning.
#'
#' @param data A data frame
#' @param formula Model formula
#' @param response Response variable name
#' @param anomaly_method Method for anomaly detection: "dbscan", "isolation_forest"
#' @param action Action to take: "remove", "flag", "downweight"
#' @param supervised_method Supervised learning method
#' @param ... Additional arguments
#' @return A tidylearn model or list with model and anomaly info
#' @export
#' @examples
#' \donttest{
#' model <- tl_anomaly_aware(iris, Species ~ ., response = "Species",
#'                            anomaly_method = "dbscan", action = "flag")
#' }
tl_anomaly_aware <- function(data, formula, response,
                              anomaly_method = "dbscan",
                              action = "flag",
                              supervised_method = "logistic", ...) {
  # Separate predictors for anomaly detection
  predictor_data <- data %>% dplyr::select(-dplyr::all_of(response))

  # Detect anomalies
  if (anomaly_method == "dbscan") {
    anomaly_model <- tl_model(predictor_data, method = "dbscan", ...)
    is_anomaly <- anomaly_model$fit$clusters$cluster == 0  # DBSCAN noise points
  } else {
    stop("Unsupported anomaly method: ", anomaly_method, call. = FALSE)
  }

  # Take action based on anomalies
  if (action == "remove") {
    data_clean <- data[!is_anomaly, ]
    model <- tl_model(data_clean, formula, method = supervised_method)
    model$anomalies_removed <- sum(is_anomaly)
  } else if (action == "flag") {
    data_flagged <- data %>%
      dplyr::mutate(is_anomaly = is_anomaly)
    # Include anomaly flag as a feature - manually construct formula
    response_var <- all.vars(formula)[1]
    all_vars <- all.vars(formula)
    predictor_vars <- all_vars[-1]

    if (length(predictor_vars) == 1 && predictor_vars[1] == ".") {
      # Formula was response ~ .
      formula_updated <- as.formula(paste(response_var, "~ . + is_anomaly"))
    } else {
      # Explicit predictors
      preds_str <- paste(c(predictor_vars, "is_anomaly"), collapse = " + ")
      formula_updated <- as.formula(paste(response_var, "~", preds_str))
    }
    model <- tl_model(data_flagged, formula_updated, method = supervised_method)
  } else if (action == "downweight") {
    # Create weights (anomalies get lower weight)
    weights <- ifelse(is_anomaly, 0.1, 1.0)
    model <- tl_model(data, formula, method = supervised_method, weights = weights)
  }

  # Add anomaly detection info
  model$anomaly_info <- list(
    anomaly_model = anomaly_model,
    is_anomaly = is_anomaly,
    n_anomalies = sum(is_anomaly),
    action = action
  )

  class(model) <- c("tidylearn_anomaly_aware", class(model))
  model
}

#' Stratified Features via Clustering
#'
#' Create cluster-specific supervised models for heterogeneous data
#'
#' @param data A data frame
#' @param formula Model formula
#' @param cluster_method Clustering method
#' @param k Number of clusters
#' @param supervised_method Supervised learning method
#' @param ... Additional arguments
#' @return A list of models (one per cluster) plus cluster assignments
#' @export
#' @examples
#' \donttest{
#' models <- tl_stratified_models(mtcars, mpg ~ ., cluster_method = "kmeans",
#'                                 k = 3, supervised_method = "linear")
#' }
tl_stratified_models <- function(data, formula, cluster_method = "kmeans",
                                 k = 3, supervised_method = "linear", ...) {
  # Extract response variable
  response_var <- all.vars(formula)[1]

  # Cluster the predictors
  predictor_data <- data %>% dplyr::select(-dplyr::all_of(response_var))
  cluster_model <- tl_model(predictor_data, method = cluster_method, k = k, ...)

  # Get cluster assignments
  clusters <- cluster_model$fit$clusters$cluster

  # Train a model for each cluster
  cluster_models <- list()
  for (i in seq_len(k)) {
    cluster_data <- data[clusters == i, ]
    if (nrow(cluster_data) > 0) {
      cluster_models[[paste0("cluster_", i)]] <- tl_model(
        cluster_data, formula, method = supervised_method, ...
      )
    }
  }

  # Return stratified model object
  structure(
    list(
      cluster_model = cluster_model,
      supervised_models = cluster_models,
      formula = formula,
      data = data
    ),
    class = c("tidylearn_stratified", "list")
  )
}

#' Predict from stratified models
#' @param object A tidylearn_stratified model object
#' @param new_data New data for predictions
#' @param ... Additional arguments
#' @return A tibble of predictions with cluster assignments
#' @export
predict.tidylearn_stratified <- function(object, new_data = NULL, ...) {
  if (is.null(new_data)) {
    new_data <- object$data
  }

  # Get response variable
  response_var <- all.vars(object$formula)[1]

  # Assign new data to clusters
  predictor_data <- new_data %>% dplyr::select(-dplyr::all_of(response_var))
  new_clusters <- predict(object$cluster_model, new_data = predictor_data)

  # Predict from appropriate cluster model
  predictions <- vector("list", nrow(new_data))
  for (i in seq_len(nrow(new_data))) {
    cluster_id <- new_clusters$cluster[i]
    model_name <- paste0("cluster_", cluster_id)

    if (model_name %in% names(object$supervised_models)) {
      pred <- predict(object$supervised_models[[model_name]],
                     new_data = new_data[i, , drop = FALSE], ...)
      predictions[[i]] <- pred$.pred[1]
    } else {
      predictions[[i]] <- NA
    }
  }

  tibble::tibble(
    .pred = unlist(predictions),
    .cluster = new_clusters$cluster
  )
}

Try the tidylearn package in your browser

Any scripts or data that you put into this service are public.

tidylearn documentation built on Feb. 6, 2026, 5:07 p.m.