R/dormancy_scout.R

Defines functions print.dormancy_map compute_local_entropy compute_dormancy_potential dormancy_scout

Documented in compute_dormancy_potential compute_local_entropy dormancy_scout

#' @title Scout for Dormant Pattern Regions in Data
#' @name dormancy_scout
#' @description
#' Systematically scans the data space to identify regions where dormant patterns
#' might emerge. Unlike \code{\link{dormancy_detect}} which identifies specific patterns,
#' \code{dormancy_scout} maps the "terrain" of dormancy potential.
#'
#' @param data A numeric matrix or data frame.
#' @param grid_resolution Integer. Resolution of the scanning grid. Higher values
#'   give finer resolution but slower computation. Default is 20.
#' @param scout_method Character. Scanning method:
#'   \itemize{
#'     \item "density" - Identifies low-density regions where patterns might hide
#'     \item "variance" - Identifies high-variance regions with pattern potential
#'     \item "correlation" - Maps local correlation landscapes
#'     \item "entropy" - Identifies high-entropy regions with dormancy potential
#'   }
#'   Default is "density".
#' @param return_map Logical. Whether to return the full dormancy map. Default is TRUE.
#' @param verbose Logical. Whether to print progress messages. Default is FALSE.
#'
#' @return A list containing:
#' \itemize{
#'   \item \code{scout_results} - Data frame with coordinates and dormancy potential
#'   \item \code{hotspots} - Regions with highest dormancy potential
#'   \item \code{dormancy_map} - If \code{return_map = TRUE}, a matrix representing
#'     the dormancy landscape
#'   \item \code{summary} - Summary statistics of the scan
#' }
#'
#' @details
#' Scout analysis is useful for:
#'
#' \itemize{
#'   \item Identifying regions to monitor for future pattern emergence
#'   \item Understanding the "geography" of your data's pattern space
#'   \item Finding data regions that are underexplored or anomalous
#'   \item Planning targeted data collection in high-potential regions
#' }
#'
#' The scout creates a map of "dormancy potential" - not actual patterns, but
#' locations where patterns are more likely to exist or emerge.
#'
#' @examples
#' set.seed(42)
#' n <- 500
#' x <- rnorm(n)
#' y <- rnorm(n)
#' # Create a region with hidden pattern
#' z <- ifelse(x > 1 & y > 1, 0.9 * x + rnorm(sum(x > 1 & y > 1), 0, 0.1), y)
#' data <- data.frame(x = x, y = z)
#'
#' scout <- dormancy_scout(data, grid_resolution = 15)
#' print(scout)
#'
#' @export
dormancy_scout <- function(data, grid_resolution = 20, scout_method = "density",
                           return_map = TRUE, verbose = FALSE) {
  
  if (!is.data.frame(data) && !is.matrix(data)) {
    stop("data must be a data frame or matrix")
  }
  
  scout_method <- match.arg(scout_method,
                            c("density", "variance", "correlation", "entropy"))
  
  data <- as.data.frame(data)
  numeric_cols <- sapply(data, is.numeric)
  num_data <- data[, numeric_cols, drop = FALSE]
  num_data <- na.omit(num_data)
  
  if (ncol(num_data) < 2) {
    stop("Need at least 2 numeric variables for scouting")
  }
  
  if (verbose) message("Scouting data space using method: ", scout_method)
  
  # For simplicity, focus on 2D projections
  var_pairs <- combn(colnames(num_data), 2, simplify = FALSE)
  
  all_results <- list()
  all_maps <- list()
  
  for (pair in var_pairs) {
    v1 <- pair[1]
    v2 <- pair[2]
    
    x <- num_data[[v1]]
    y <- num_data[[v2]]
    
    # Create grid
    x_range <- range(x, na.rm = TRUE)
    y_range <- range(y, na.rm = TRUE)
    
    x_grid <- seq(x_range[1], x_range[2], length.out = grid_resolution)
    y_grid <- seq(y_range[1], y_range[2], length.out = grid_resolution)
    
    # Compute dormancy potential at each grid point
    dormancy_map <- matrix(0, nrow = grid_resolution, ncol = grid_resolution)
    
    for (i in seq_along(x_grid)) {
      for (j in seq_along(y_grid)) {
        dormancy_map[i, j] <- compute_dormancy_potential(
          x, y, x_grid[i], y_grid[j], scout_method
        )
      }
    }
    
    # Find hotspots
    threshold <- quantile(dormancy_map, 0.9, na.rm = TRUE)
    hotspot_idx <- which(dormancy_map > threshold, arr.ind = TRUE)
    
    hotspots <- data.frame(
      x_coord = x_grid[hotspot_idx[, 1]],
      y_coord = y_grid[hotspot_idx[, 2]],
      potential = dormancy_map[hotspot_idx]
    )
    
    all_results[[paste(v1, v2, sep = "~")]] <- list(
      variables = c(v1, v2),
      hotspots = hotspots,
      max_potential = max(dormancy_map, na.rm = TRUE),
      mean_potential = mean(dormancy_map, na.rm = TRUE)
    )
    
    if (return_map) {
      all_maps[[paste(v1, v2, sep = "~")]] <- list(
        map = dormancy_map,
        x_grid = x_grid,
        y_grid = y_grid,
        variables = c(v1, v2)
      )
    }
  }
  
  # Compile summary
  summary_df <- data.frame(
    variable_pair = names(all_results),
    max_potential = sapply(all_results, function(x) x$max_potential),
    mean_potential = sapply(all_results, function(x) x$mean_potential),
    n_hotspots = sapply(all_results, function(x) nrow(x$hotspots))
  )
  
  result <- list(
    scout_results = summary_df,
    detailed_results = all_results,
    hotspots = do.call(rbind, lapply(names(all_results), function(nm) {
      df <- all_results[[nm]]$hotspots
      if (nrow(df) > 0) {
        df$variable_pair <- nm
      }
      return(df)
    })),
    dormancy_map = all_maps,
    method = scout_method,
    grid_resolution = grid_resolution
  )
  
  class(result) <- "dormancy_map"
  return(result)
}


#' Compute dormancy potential at a grid point
#' @keywords internal
compute_dormancy_potential <- function(x, y, x_center, y_center, method) {
  
  # Find nearby points
  bandwidth_x <- diff(range(x, na.rm = TRUE)) / 10
  bandwidth_y <- diff(range(y, na.rm = TRUE)) / 10
  
  nearby_idx <- (abs(x - x_center) < bandwidth_x) & (abs(y - y_center) < bandwidth_y)
  n_nearby <- sum(nearby_idx)
  
  if (n_nearby < 5) {
    return(0.5)  # Unknown potential in sparse regions
  }
  
  x_near <- x[nearby_idx]
  y_near <- y[nearby_idx]
  
  if (method == "density") {
    # Low density regions have higher dormancy potential
    potential <- 1 - n_nearby / length(x)
  } else if (method == "variance") {
    # High variance regions have higher potential
    potential <- var(x_near, na.rm = TRUE) * var(y_near, na.rm = TRUE)
    potential <- min(potential, 1)
  } else if (method == "correlation") {
    # Weak correlation regions have higher potential
    local_cor <- cor(x_near, y_near, use = "complete.obs")
    potential <- 1 - abs(local_cor)
  } else {  # entropy
    # Higher entropy = higher potential
    potential <- compute_local_entropy(x_near, y_near)
  }
  
  return(potential)
}


#' Compute local entropy
#' @keywords internal
compute_local_entropy <- function(x, y) {
  
  # Discretize
  n_bins <- 5
  x_bins <- cut(x, breaks = n_bins, labels = FALSE, include.lowest = TRUE)
  y_bins <- cut(y, breaks = n_bins, labels = FALSE, include.lowest = TRUE)
  
  # Joint distribution
  joint_table <- table(x_bins, y_bins)
  joint_prob <- joint_table / sum(joint_table)
  
  # Entropy
  entropy <- -sum(joint_prob * log2(joint_prob + 1e-10))
  
  # Normalize
  max_entropy <- log2(n_bins^2)
  
  return(entropy / max_entropy)
}


#' @export
print.dormancy_map <- function(x, ...) {
  cat("Dormancy Scout Results\n")
  cat("======================\n\n")
  
  cat("Method:", x$method, "\n")
  cat("Grid resolution:", x$grid_resolution, "\n\n")
  
  cat("Variable Pair Analysis:\n")
  print(x$scout_results)
  
  if (nrow(x$hotspots) > 0) {
    cat("\nTop Dormancy Hotspots:\n")
    print(head(x$hotspots[order(-x$hotspots$potential), ], 10))
  }
  
  invisible(x)
}

Try the dormancy package in your browser

Any scripts or data that you put into this service are public.

dormancy documentation built on March 16, 2026, 5:09 p.m.