R/get_cluster_regions.R

Defines functions .find_zone .get_cluster_regions_singlepass get_cluster_regions.iterative_scan get_cluster_regions.default get_cluster_regions

Documented in get_cluster_regions get_cluster_regions.default get_cluster_regions.iterative_scan

#' Extract Cluster Membership for Each Region
#'
#' Returns a data.frame indicating which cluster (if any) each region
#' belongs to. This is the primary output for visualization — use it with
#' any mapping package (ggplot2, leaflet, tmap, etc.).
#'
#' This is a generic with methods for objects returned by
#' \code{\link{treespatial_scan}}, \code{\link{circular_scan}}, and
#' \code{\link{iterative_scan}}.
#'
#' @param result Either a \code{"treespatial_scan"}/\code{"circular_scan"}
#'   object (single-pass scan) or an \code{"iterative_scan"} object.
#' @param n_clusters Integer. (Single-pass methods only.) Number of
#'   clusters to extract. \code{1} returns only the most likely cluster.
#'   Values greater than 1 use \code{\link{filter_clusters}} internally
#'   to identify distinct secondary clusters. Default is \code{1}.
#'   Ignored for iterative-scan inputs (every iteration is returned).
#' @param overlap Logical. If \code{TRUE} (default), returns a facet-ready
#'   data.frame: all regions are replicated for each cluster panel, with a
#'   \code{panel} column for faceting and \code{cluster} marked only for
#'   member regions. If \code{FALSE}, returns one row per region assigned
#'   to its highest-ranked cluster (for single-panel maps).
#' @param ... Further arguments passed to methods.
#'
#' @return A \code{data.frame} with columns from \code{result$regions} plus:
#'   \describe{
#'     \item{cluster}{Integer cluster number (1 = most likely / first
#'       iteration, 2 = first secondary / second iteration, etc.), or
#'       \code{NA} if the region is not in the cluster.}
#'     \item{node_id}{The tree node of the cluster, or \code{NA}.}
#'     \item{llr}{The log-likelihood ratio of the cluster, or \code{NA}.}
#'     \item{pvalue}{The p-value of the cluster, or \code{NA}.}
#'     \item{pvalue_adjusted, significant}{(Iterative method only) The
#'       Holm-Bonferroni adjusted p-value and corresponding significance
#'       flag for the iteration.}
#'     \item{panel}{(Only when \code{overlap = TRUE}) A two-line label
#'       for \code{facet_wrap}, with the cluster identifier on the first
#'       line and the test statistic on the second. For single-pass
#'       scans the label looks like \code{"#1 P209\\n(LR=39.6)"}; for
#'       iterative scans it looks like
#'       \code{"Iter 1: P209\\n(LR=39.6, p_adj=0.005)"}. The newline
#'       keeps long node identifiers from overflowing the strip in
#'       multi-panel layouts.}
#'   }
#'
#' @seealso \code{\link{treespatial_scan}}, \code{\link{circular_scan}},
#'   \code{\link{iterative_scan}}, \code{\link{filter_clusters}}
#'
#' @examples
#' data(london_collisions); data(london_tree)
#' result <- treespatial_scan(
#'   cases       = london_collisions$cases,
#'   population  = london_collisions$population,
#'   region_id   = london_collisions$region_id,
#'   x           = london_collisions$x,
#'   y           = london_collisions$y,
#'   node_id     = london_collisions$node_id,
#'   tree        = london_tree,
#'   nsim        = 99, seed = 42
#' )
#'
#' # Long format suitable for merging with a polygon layer.
#' cr <- get_cluster_regions(result, n_clusters = 2L, overlap = TRUE)
#' head(cr)
#'
#' @export
get_cluster_regions <- function(result, n_clusters = 1L, overlap = TRUE,
                                 ...) {
  UseMethod("get_cluster_regions")
}


#' @rdname get_cluster_regions
#' @export
get_cluster_regions.default <- function(result, n_clusters = 1L,
                                          overlap = TRUE, ...) {

  if (!inherits(result, "treespatial_scan") &&
      !inherits(result, "circular_scan")) {
    stop("'result' must be a treespatial_scan, circular_scan, or ",
         "iterative_scan object.", call. = FALSE)
  }
  .get_cluster_regions_singlepass(result, n_clusters, overlap)
}


#' @rdname get_cluster_regions
#' @export
get_cluster_regions.iterative_scan <- function(result, n_clusters = 1L,
                                                 overlap = TRUE, ...) {

  iter <- result
  if (iter$n_iter == 0) {
    return(data.frame())
  }
  if (is.null(iter$regions)) {
    stop("Iterative scan has no $regions table (tree-only scan). ",
         "Mapping by region is not applicable.", call. = FALSE)
  }

  full_regions <- iter$regions
  parts <- vector("list", iter$n_iter)

  for (k in seq_len(iter$n_iter)) {
    cl_regs <- iter$clusters$region_ids[[k]]
    in_cl   <- full_regions$region_id %in% cl_regs

    cr_k <- cbind(
      full_regions,
      data.frame(
        cluster         = ifelse(in_cl, k, NA_integer_),
        node_id         = ifelse(in_cl,
                                  as.character(iter$clusters$node_id[k]),
                                  NA_character_),
        llr             = ifelse(in_cl, iter$clusters$llr[k], NA_real_),
        pvalue          = ifelse(in_cl, iter$clusters$pvalue[k], NA_real_),
        pvalue_adjusted = ifelse(in_cl, iter$clusters$pvalue_adjusted[k],
                                  NA_real_),
        significant     = ifelse(in_cl, iter$clusters$significant[k], NA),
        stringsAsFactors = FALSE
      )
    )

    if (overlap) {
      sig_marker <- if (isTRUE(iter$clusters$significant[k])) "" else " (n.s.)"
      # Split into two lines so facet_wrap shows the node name on
      # line 1 (which can already be long, e.g. a multi-segment ICD-10
      # path) and the test statistic / adjusted p-value on line 2.
      cr_k$panel <- paste0("Iter ", k, ": ", iter$clusters$node_id[k], "\n",
                           "(LR=", round(iter$clusters$llr[k], 1),
                           ", p_adj=",
                           format.pval(iter$clusters$pvalue_adjusted[k],
                                        digits = 2),
                           sig_marker, ")")
    }

    parts[[k]] <- cr_k
  }

  if (overlap) {
    do.call(rbind, parts)
  } else {
    # Non-overlap: one row per region, using the LOWEST iteration in which
    # it appears (mirrors single-pass non-overlap semantics).
    full <- do.call(rbind, parts)
    full <- full[order(full$cluster, na.last = TRUE), ]
    full[!duplicated(full$region_id), ]
  }
}


# Internal: original single-pass logic moved out of the generic
.get_cluster_regions_singlepass <- function(result, n_clusters, overlap) {

  regions <- result$regions
  n <- nrow(regions)

  # --- Collect all cluster region assignments ---
  cluster_list <- list()

  # Cluster 1: most likely
  mlc <- result$most_likely_cluster
  mlc_idx <- match(mlc$region_ids, regions$region_id)
  mlc_idx <- mlc_idx[!is.na(mlc_idx)]
  mlc_node <- if (!is.null(mlc$node_id)) as.character(mlc$node_id) else "Zone"
  cluster_list[[1]] <- list(
    idx     = mlc_idx,
    cluster = 1L,
    node_id = mlc_node,
    llr     = mlc$llr,
    pvalue  = result$pvalue
  )

  # Secondary clusters
  if (n_clusters > 1) {
    dc <- filter_clusters(result)
    if (!is.null(dc) && nrow(dc) > 1) {
      zones <- build_zones(regions, max_pop = result$total_population * 0.5)
      sec_rows <- dc[-1, , drop = FALSE]
      n_sec <- min(nrow(sec_rows), n_clusters - 1)
      has_node <- "node_id" %in% names(sec_rows)

      for (k in seq_len(n_sec)) {
        zi <- .find_zone(zones, sec_rows$center[k], sec_rows$n_regions[k])
        if (!is.null(zi)) {
          sec_node <- if (has_node) as.character(sec_rows$node_id[k]) else "Zone"
          cluster_list[[k + 1]] <- list(
            idx     = zones[[zi]]$region_idx,
            cluster = as.integer(k + 1),
            node_id = sec_node,
            llr     = sec_rows$llr[k],
            pvalue  = sec_rows$pvalue[k]
          )
        }
      }
    }
  }

  # --- Build output ---
  if (overlap) {
    # Facet-ready format: ALL regions replicated for each cluster panel.
    # In each panel, cluster members are marked, non-members have NA.
    # Ready for: merge(map, cr) then ggplot + facet_wrap(~ panel)
    rows <- list()

    for (cl in cluster_list) {
      # Skip empty slots that arise when .find_zone() could not locate a
      # secondary cluster's zone in the rebuilt zone family. Iterating over
      # a NULL slot would silently produce a chunk with character(0) panel.
      if (is.null(cl)) next

      chunk <- regions
      chunk$cluster <- NA_integer_
      chunk$node_id <- NA_character_
      chunk$llr     <- NA_real_
      chunk$pvalue  <- NA_real_
      chunk$panel   <- paste0("#", cl$cluster, " ", cl$node_id, "\n",
                              "(LR=", round(cl$llr, 1), ")")

      # Mark cluster members
      chunk$cluster[cl$idx] <- cl$cluster
      chunk$node_id[cl$idx] <- cl$node_id
      chunk$llr[cl$idx]     <- cl$llr
      chunk$pvalue[cl$idx]  <- cl$pvalue

      rows <- c(rows, list(chunk))
    }

    out <- do.call(rbind, rows)
    rownames(out) <- NULL

  } else {
    # Single-map format: one row per region, highest-ranked cluster wins.
    out <- regions
    out$cluster <- NA_integer_
    out$node_id <- NA_character_
    out$llr     <- NA_real_
    out$pvalue  <- NA_real_

    for (cl in cluster_list) {
      if (is.null(cl)) next
      free <- is.na(out$cluster[cl$idx])
      out$cluster[cl$idx[free]] <- cl$cluster
      out$node_id[cl$idx[free]] <- cl$node_id
      out$llr[cl$idx[free]]     <- cl$llr
      out$pvalue[cl$idx[free]]  <- cl$pvalue
    }
  }

  out
}

#' @keywords internal
.find_zone <- function(zones, center, n_regions) {
  for (i in seq_along(zones)) {
    if (zones[[i]]$center == center &&
        length(zones[[i]]$region_idx) == n_regions) {
      return(i)
    }
  }
  NULL
}

Try the treeSS package in your browser

Any scripts or data that you put into this service are public.

treeSS documentation built on May 16, 2026, 1:08 a.m.