R/unsupervised-dbscan.R

Defines functions tl_fit_dbscan print.tidy_dbscan explore_dbscan_params augment_dbscan plot_knn_dist suggest_eps tidy_knn_dist tidy_dbscan

Documented in augment_dbscan explore_dbscan_params plot_knn_dist print.tidy_dbscan suggest_eps tidy_dbscan tidy_knn_dist

#' Tidy DBSCAN Clustering
#'
#' Performs density-based clustering with tidy output
#'
#' @param data A data frame, tibble, or distance matrix
#' @param eps Neighborhood radius (epsilon)
#' @param minPts Minimum number of points to form a dense region (default: 5)
#' @param cols Columns to include (tidy select). If NULL, uses all numeric columns.
#' @param distance Distance metric if data is not a dist object (default: "euclidean")
#'
#' @return A list of class "tidy_dbscan" containing:
#' \itemize{
#'   \item clusters: tibble with observation IDs and cluster assignments (0 = noise)
#'   \item core_points: logical vector indicating core points
#'   \item n_clusters: number of clusters (excluding noise)
#'   \item n_noise: number of noise points
#'   \item model: original dbscan object
#' }
#'
#' @examples
#' # Basic DBSCAN
#' db_result <- tidy_dbscan(iris, eps = 0.5, minPts = 5)
#'
#' # With suggested eps from k-NN distance plot
#' eps_suggestion <- suggest_eps(iris, minPts = 5)
#' db_result <- tidy_dbscan(iris, eps = eps_suggestion$eps, minPts = 5)
#'
#' @export
tidy_dbscan <- function(data, eps, minPts = 5, cols = NULL, distance = "euclidean") {

  # Handle distance matrix
  if (inherits(data, "dist")) {
    data_matrix <- as.matrix(data)
    data_orig <- NULL
  } else {
    # Select columns
    if (!is.null(cols)) {
      cols_enquo <- rlang::enquo(cols)
      data_selected <- data %>% dplyr::select(!!cols_enquo)
    } else {
      data_selected <- data %>% dplyr::select(where(is.numeric))
    }

    data_matrix <- as.matrix(data_selected)
    data_orig <- data_selected
  }

  # Perform DBSCAN
  db_model <- dbscan::dbscan(data_matrix, eps = eps, minPts = minPts)

  # Count clusters and noise
  n_clusters <- max(db_model$cluster)
  n_noise <- sum(db_model$cluster == 0)

  # Create clusters tibble
  if (!is.null(rownames(data))) {
    obs_ids <- rownames(data)
  } else {
    obs_ids <- paste0("obs_", seq_len(nrow(data_matrix)))
  }

  clusters_tbl <- tibble::tibble(
    .obs_id = obs_ids,
    cluster = as.integer(db_model$cluster),
    is_noise = cluster == 0,
    is_core = seq_along(cluster) %in% attr(db_model, "core")
  )

  # Create summary statistics
  cluster_summary <- clusters_tbl %>%
    dplyr::filter(!is_noise) %>%
    dplyr::group_by(cluster) %>%
    dplyr::summarise(
      size = dplyr::n(),
      n_core = sum(is_core),
      .groups = "drop"
    )

  # Return tidy object
  result <- list(
    clusters = clusters_tbl,
    summary = cluster_summary,
    n_clusters = n_clusters,
    n_noise = n_noise,
    eps = eps,
    minPts = minPts,
    model = db_model
  )

  class(result) <- c("tidy_dbscan", "list")
  result
}


#' Compute k-NN Distances
#'
#' Calculate distances to k-th nearest neighbor for each point
#'
#' @param data A data frame or matrix
#' @param k Number of nearest neighbors (default: 4)
#' @param cols Columns to include (tidy select). If NULL, uses all numeric columns.
#'
#' @return A tibble with observation IDs and k-NN distances
#' @export
tidy_knn_dist <- function(data, k = 4, cols = NULL) {

  # Select columns
  if (!is.null(cols)) {
    cols_enquo <- rlang::enquo(cols)
    data_selected <- data %>% dplyr::select(!!cols_enquo)
  } else {
    data_selected <- data %>% dplyr::select(where(is.numeric))
  }

  data_matrix <- as.matrix(data_selected)

  # Compute k-NN distances
  knn_distances <- dbscan::kNNdist(data_matrix, k = k)

  # Create tibble
  tibble::tibble(
    .obs_id = rownames(data) %||% paste0("obs_", seq_len(nrow(data))),
    knn_dist = as.numeric(knn_distances),
    rank = rank(knn_distances)
  )
}


#' Suggest eps Parameter for DBSCAN
#'
#' Use k-NN distance plot to suggest eps value
#'
#' @param data A data frame or matrix
#' @param minPts Minimum points parameter (used as k for k-NN)
#' @param method Method to suggest eps: "knee" (default), "percentile"
#' @param percentile If method="percentile", which percentile to use (default: 0.95)
#'
#' @return A list containing:
#' \itemize{
#'   \item eps: suggested epsilon value
#'   \item knn_distances: full tibble of k-NN distances
#'   \item method: method used
#' }
#'
#' @examples
#' eps_info <- suggest_eps(iris, minPts = 5)
#' eps_info$eps
#'
#' @export
suggest_eps <- function(data, minPts = 5, method = "percentile", percentile = 0.95) {

  # Compute k-NN distances
  knn_data <- tidy_knn_dist(data, k = minPts)

  # Suggest eps based on method
  if (method == "percentile") {
    eps_suggested <- stats::quantile(knn_data$knn_dist, percentile)

  } else if (method == "knee") {
    # Find knee/elbow in sorted k-NN distances
    sorted_dist <- sort(knn_data$knn_dist)
    n <- length(sorted_dist)

    # Calculate differences
    diffs <- diff(sorted_dist)

    # Find maximum jump
    max_jump_idx <- which.max(diffs)
    eps_suggested <- sorted_dist[max_jump_idx]

  } else {
    stop("method must be 'percentile' or 'knee'")
  }

  list(
    eps = as.numeric(eps_suggested),
    knn_distances = knn_data,
    method = method
  )
}


#' Plot k-NN Distance Plot
#'
#' Visualize k-NN distances to help choose eps
#'
#' @param data A data frame or tidy_knn_dist result
#' @param k If data is a data frame, k for k-NN (default: 4)
#' @param add_suggestion Add suggested eps line? (default: TRUE)
#' @param percentile Percentile for suggestion (default: 0.95)
#'
#' @return A ggplot object
#' @export
plot_knn_dist <- function(data, k = 4, add_suggestion = TRUE, percentile = 0.95) {

  # Get k-NN distances if needed
  if (inherits(data, "tbl_df") && "knn_dist" %in% names(data)) {
    knn_data <- data
  } else {
    knn_data <- tidy_knn_dist(data, k = k)
  }

  # Sort by distance
  knn_data <- knn_data %>% dplyr::arrange(knn_dist)

  # Create plot
  p <- ggplot2::ggplot(knn_data, ggplot2::aes(x = seq_along(knn_dist), y = knn_dist)) +
    ggplot2::geom_line(color = "steelblue", size = 1) +
    ggplot2::labs(
      title = paste0("k-NN Distance Plot (k = ", k, ")"),
      subtitle = "Look for 'elbow' or 'knee' to determine eps",
      x = "Points (sorted by distance)",
      y = paste0(k, "-NN Distance")
    ) +
    ggplot2::theme_minimal()

  # Add suggestion line
  if (add_suggestion) {
    eps_line <- stats::quantile(knn_data$knn_dist, percentile)
    p <- p +
      ggplot2::geom_hline(yintercept = eps_line, linetype = "dashed", color = "red") +
      ggplot2::annotate("text",
                       x = nrow(knn_data) * 0.7,
                       y = eps_line * 1.1,
                       label = sprintf("Suggested eps = %.3f\n(%d%% percentile)", eps_line, percentile * 100),
                       color = "red")
  }

  p
}


#' Augment Data with DBSCAN Cluster Assignments
#'
#' @param dbscan_obj A tidy_dbscan object
#' @param data Original data frame
#'
#' @return Original data with cluster information added
#' @export
augment_dbscan <- function(dbscan_obj, data) {

  if (!inherits(dbscan_obj, "tidy_dbscan")) {
    stop("dbscan_obj must be a tidy_dbscan object")
  }

  data %>%
    dplyr::bind_cols(
      tibble::tibble(
        cluster = as.factor(dbscan_obj$model$cluster),
        is_noise = dbscan_obj$clusters$is_noise,
        is_core = dbscan_obj$clusters$is_core
      )
    )
}


#' Explore DBSCAN Parameters
#'
#' Test multiple eps and minPts combinations
#'
#' @param data A data frame or matrix
#' @param eps_values Vector of eps values to test
#' @param minPts_values Vector of minPts values to test
#'
#' @return A tibble with parameter combinations and resulting cluster counts
#' @export
explore_dbscan_params <- function(data, eps_values, minPts_values) {

  data_numeric <- data %>% dplyr::select(where(is.numeric))

  # Create parameter grid
  param_grid <- expand.grid(
    eps = eps_values,
    minPts = minPts_values,
    stringsAsFactors = FALSE
  )

  # Test each combination
  results <- purrr::map2_dfr(param_grid$eps, param_grid$minPts, function(e, m) {
    db <- tidy_dbscan(data_numeric, eps = e, minPts = m)

    tibble::tibble(
      eps = e,
      minPts = m,
      n_clusters = db$n_clusters,
      n_noise = db$n_noise,
      prop_noise = db$n_noise / nrow(data_numeric)
    )
  })

  results
}


#' Print Method for tidy_dbscan
#'
#' @param x A tidy_dbscan object
#' @param ... Additional arguments (ignored)
#'
#' @return Invisibly returns the input object x
#' @export
print.tidy_dbscan <- function(x, ...) {
  cat("Tidy DBSCAN Clustering\n")
  cat("======================\n\n")
  cat("Parameters:\n")
  cat("  eps (neighborhood radius):", x$eps, "\n")
  cat("  minPts (minimum points):  ", x$minPts, "\n\n")

  cat("Results:\n")
  cat("  Number of clusters:", x$n_clusters, "\n")
  cat("  Number of noise points:", x$n_noise, "\n")
  cat("  Proportion noise:", sprintf("%.1f%%", (x$n_noise / nrow(x$clusters)) * 100), "\n\n")

  if (nrow(x$summary) > 0) {
    cat("Cluster Summary:\n")
    print(x$summary)
  }

  cat("\nUse augment_dbscan() to add cluster assignments to your data\n")

  invisible(x)
}


#' Fit DBSCAN for tidylearn models
#' @keywords internal
#' @noRd
tl_fit_dbscan <- function(data, formula = NULL, eps = 0.5, minPts = 5, ...) {
  tl_check_packages("dbscan")

  # Extract variables to use
  if (!is.null(formula)) {
    vars <- get_formula_vars(formula, data)
    data_for_db <- data[, vars, drop = FALSE]
  } else {
    data_for_db <- data %>% dplyr::select(where(is.numeric))
  }

  # Fit DBSCAN using tidy_dbscan
  db_result <- tidy_dbscan(data_for_db, eps = eps, minPts = minPts, ...)

  # Return in expected format
  list(
    clusters = db_result$clusters,
    core_points = db_result$core_points,
    n_clusters = db_result$n_clusters,
    n_noise = db_result$n_noise,
    model = db_result$model
  )
}

Try the tidylearn package in your browser

Any scripts or data that you put into this service are public.

tidylearn documentation built on Feb. 6, 2026, 5:07 p.m.