Nothing
# ---- HYPA: Hypothesis Testing for Path Anomalies ----
#
# Implements HYPA (LaRock et al. 2020) for detecting anomalous paths in
# sequential data. Uses a multi-hypergeometric null model on k-th order
# De Bruijn graphs to identify over/under-represented paths.
# ---------------------------------------------------------------------------
# Internal: Fit Xi matrix (iterative proportional fitting)
# ---------------------------------------------------------------------------
#' Compute propensity matrix Xi from node strengths
#'
#' Xi_{vw} = s_out(v) * s_in(w) for edges present in the De Bruijn graph.
#' The product of strengths gives N = sum(Xi) >> m = sum(adj), ensuring a
#' non-degenerate hypergeometric null model. This follows the HYPA null
#' model where edge propensity is proportional to the product of endpoint
#' weighted degrees.
#'
#' @param adj Square adjacency matrix of the De Bruijn graph.
#' @return Matrix Xi with same dimensions as adj (sum(Xi) >> sum(adj)).
#' @noRd
.hypa_fit_xi <- function(adj) {
out_strength <- rowSums(adj)
in_strength <- colSums(adj)
mask <- adj > 0
# Xi_{vw} = s_out(v) * s_in(w) where edges exist in the De Bruijn graph
outer(out_strength, in_strength) * mask
}
# ---------------------------------------------------------------------------
# Internal: Compute HYPA scores
# ---------------------------------------------------------------------------
#' Compute hypergeometric p-values for each edge
#'
#' For each edge (v,w) with observed weight f, computes:
#' \code{HYPA(v,w) = P(X <= f)} where \code{X ~ Hypergeometric(N, K, n)},
#' \code{N = round(sum(Xi))}, \code{K = round(Xi[v,w])}, \code{n = sum(adj)}
#'
#' @param adj Adjacency matrix (edge weights = path frequencies).
#' @param xi Fitted propensity matrix.
#' @return Data frame with from, to, observed, expected, p_value, anomaly.
#' @noRd
.hypa_compute_scores <- function(adj, xi) {
n <- nrow(adj)
nodes <- rownames(adj)
# Total pool
N_total <- round(sum(xi))
n_draws <- sum(adj)
# Collect edges
edge_idx <- which(adj > 0, arr.ind = TRUE)
if (nrow(edge_idx) == 0L) {
return(data.frame(path = character(0L), from = character(0L),
to = character(0L), observed = integer(0L),
expected = numeric(0L), ratio = numeric(0L),
p_value = numeric(0L), anomaly = character(0L),
stringsAsFactors = FALSE))
}
results <- lapply(seq_len(nrow(edge_idx)), function(idx) {
i <- edge_idx[idx, 1L]
j <- edge_idx[idx, 2L]
f_obs <- adj[i, j]
K <- round(xi[i, j])
# Clamp parameters to valid range
K <- min(K, N_total)
K <- max(K, 0L)
n_clamp <- min(n_draws, N_total)
# Hypergeometric CDF: P(X <= f_obs)
p_value <- stats::phyper(f_obs, K, N_total - K, n_clamp)
# Expected value: n * K / N
expected <- if (N_total > 0) n_draws * K / N_total else 0
# Reconstruct full path; from = context, to = next state
from_parts <- strsplit(nodes[i], .HON_SEP, fixed = TRUE)[[1L]]
to_parts <- strsplit(nodes[j], .HON_SEP, fixed = TRUE)[[1L]]
next_state <- to_parts[length(to_parts)]
path <- paste(c(from_parts, next_state), collapse = " -> ")
ratio <- if (expected > 0) f_obs / expected else Inf
data.frame(
path = path,
from = paste(from_parts, collapse = " -> "),
to = next_state,
observed = as.integer(f_obs),
expected = expected,
ratio = ratio,
p_value = p_value,
stringsAsFactors = FALSE
)
})
result <- do.call(rbind, results)
# Classify anomalies
result$anomaly <- vapply(result$p_value, function(s) {
if (s < 0.05) "under" else if (s > 0.95) "over" else "normal"
}, character(1L))
result
}
# ---------------------------------------------------------------------------
# Main function: build_hypa
# ---------------------------------------------------------------------------
#' Detect Path Anomalies via HYPA
#'
#' Constructs a k-th order De Bruijn graph from sequential trajectory data and
#' uses a hypergeometric null model to detect paths with anomalous frequencies.
#' Paths occurring more or less often than expected under the null model are
#' flagged as over- or under-represented.
#'
#' @param data A data.frame (rows = trajectories), list of character vectors,
#' \code{tna} object, or \code{netobject} with sequence data. For
#' \code{tna}/\code{netobject}, numeric state IDs are automatically
#' converted to label names.
#' @param k Integer. Order of the De Bruijn graph (default 2). Detects
#' anomalies in paths of length k.
#' @param alpha Numeric. Significance threshold for anomaly classification
#' (default 0.05). Paths with HYPA score < alpha are under-represented;
#' paths with score > 1-alpha are over-represented.
#' @param min_count Integer. Minimum observed count for a path to be
#' classified as anomalous (default 2). Paths with fewer observations
#' are always classified as \code{"normal"} regardless of their
#' HYPA score, since single occurrences are unreliable.
#' @param p_adjust Character. Method for multiple testing correction of
#' p-values. Default \code{"BH"} (Benjamini-Hochberg FDR control).
#' Accepts any method from \code{\link[stats]{p.adjust.methods}} or
#' \code{"none"} to skip correction. Under- and over-representation
#' p-values are adjusted separately (two-sided testing).
#' @return An object of class \code{net_hypa} with components:
#' \describe{
#' \item{scores}{Data frame with path, from, to, observed, expected,
#' ratio, p_value, p_adjusted_under, p_adjusted_over, anomaly
#' columns. The \code{path} column shows the full state sequence
#' (e.g., "A -> B -> C"); \code{from} is the context (conditioning
#' states); \code{to} is the next state; \code{ratio} is
#' observed / expected; \code{p_value} is the raw hypergeometric
#' CDF value; \code{p_adjusted_under} and \code{p_adjusted_over}
#' are the corrected p-values for under- and over-representation
#' tests respectively.}
#' \item{adjacency}{Weighted adjacency matrix of the De Bruijn graph.}
#' \item{xi}{Fitted propensity matrix.}
#' \item{k}{Order of the De Bruijn graph.}
#' \item{alpha}{Significance threshold used.}
#' \item{p_adjust}{Multiple testing correction method used.}
#' \item{n_anomalous}{Number of anomalous paths detected.}
#' \item{n_over}{Number of over-represented paths.}
#' \item{n_under}{Number of under-represented paths.}
#' \item{n_edges}{Total number of edges.}
#' \item{nodes}{Node names in the De Bruijn graph.}
#' }
#'
#' @references
#' LaRock, T., Nanumyan, V., Scholtes, I., Casiraghi, G., Eliassi-Rad, T.,
#' & Schweitzer, F. (2020). HYPA: Efficient Detection of Path Anomalies in
#' Time Series Data on Networks. \emph{SDM 2020}, 460–468.
#'
#' @examples
#' seqs <- list(c("A","B","C"), c("B","C","A"), c("A","C","B"), c("A","B","C"))
#' hyp <- build_hypa(seqs, k = 2)
#'
#' \donttest{
#' trajs <- list(c("A","B","C"), c("A","B","C"), c("A","B","C"),
#' c("A","B","D"), c("C","B","D"), c("C","B","A"))
#' h <- build_hypa(trajs, k = 2)
#' print(h)
#' }
#'
#' @export
build_hypa <- function(data, k = 3L, alpha = 0.05, min_count = 5L,
p_adjust = "BH") {
# --- Coerce tna/netobject to labeled data.frame ---
data <- .coerce_sequence_input(data)
k <- as.integer(k)
min_count <- as.integer(min_count)
# Validate p_adjust method
valid_methods <- c(stats::p.adjust.methods, "none")
if (!is.character(p_adjust) || length(p_adjust) != 1L ||
!p_adjust %in% valid_methods) {
stop(sprintf("'p_adjust' must be one of: %s",
paste(valid_methods, collapse = ", ")),
call. = FALSE)
}
stopifnot(
"'data' must be a data.frame or list" =
is.data.frame(data) || is.list(data),
"'k' must be >= 1" = k >= 1L,
"'alpha' must be in (0, 0.5)" = alpha > 0 && alpha < 0.5,
"'min_count' must be >= 1" = min_count >= 1L
)
trajectories <- .hon_parse_input(data, collapse_repeats = FALSE)
if (length(trajectories) == 0L) {
stop("No valid trajectories (each must have at least 2 states)")
}
# Build k-th order De Bruijn graph (reuse MOGen infrastructure)
kg <- .mogen_count_kgrams(trajectories, k)
if (nrow(kg$edges) == 0L) {
stop(sprintf("No edges at order %d (paths too short or too few)", k))
}
# Build adjacency matrix (weighted, NOT normalized)
nodes <- kg$nodes
n <- length(nodes)
adj <- matrix(0, nrow = n, ncol = n, dimnames = list(nodes, nodes))
idx <- cbind(match(kg$edges$from, nodes), match(kg$edges$to, nodes))
adj[idx] <- kg$edges$weight
# Compute Xi (propensity matrix)
xi <- .hypa_fit_xi(adj)
# Compute HYPA scores
scores <- .hypa_compute_scores(adj, xi)
# Two-sided p-value correction: p_value = P(X <= obs) from hypergeometric CDF
if (p_adjust != "none" && nrow(scores) > 0L) {
scores$p_adjusted_under <- stats::p.adjust(scores$p_value,
method = p_adjust)
scores$p_adjusted_over <- stats::p.adjust(1 - scores$p_value,
method = p_adjust)
} else {
scores$p_adjusted_under <- scores$p_value
scores$p_adjusted_over <- 1 - scores$p_value
}
# Classify anomalies: must exceed alpha AND min_count
scores$anomaly <- ifelse(
scores$observed < min_count, "normal",
ifelse(scores$p_adjusted_under < alpha, "under",
ifelse(scores$p_adjusted_over < alpha, "over", "normal"))
)
# Pre-sort: anomalous first (over by ratio desc, then under by ratio asc),
# then normal
anom_over <- scores[scores$anomaly == "over", , drop = FALSE]
anom_under <- scores[scores$anomaly == "under", , drop = FALSE]
normal <- scores[scores$anomaly == "normal", , drop = FALSE]
anom_over <- anom_over[order(-anom_over$ratio), , drop = FALSE]
anom_under <- anom_under[order(anom_under$ratio), , drop = FALSE]
scores <- rbind(anom_over, anom_under, normal)
rownames(scores) <- NULL
n_over <- nrow(anom_over)
n_under <- nrow(anom_under)
# cograph-compatible fields
cg <- .ho_cograph_fields(adj, nodes, method = "hypa")
result <- list(
scores = scores,
ho_edges = scores,
edges = cg$edges,
over = anom_over,
under = anom_under,
adjacency = adj,
weights = cg$weights,
xi = xi,
k = k,
alpha = alpha,
p_adjust = p_adjust,
n_anomalous = n_over + n_under,
n_over = n_over,
n_under = n_under,
n_edges = nrow(scores),
nodes = cg$nodes,
directed = TRUE,
meta = cg$meta,
node_groups = NULL
)
class(result) <- c("net_hypa", "cograph_network")
result
}
# ---------------------------------------------------------------------------
# S3 methods
# ---------------------------------------------------------------------------
#' Print Method for net_hypa
#'
#' @param x A \code{net_hypa} object.
#' @param ... Additional arguments (ignored).
#'
#' @return The input object, invisibly.
#'
#' @examples
#' seqs <- list(c("A","B","C"), c("B","C","A"), c("A","C","B"), c("A","B","C"))
#' hyp <- build_hypa(seqs, k = 2)
#' print(hyp)
#'
#' \donttest{
#' seqs <- data.frame(
#' V1 = c("A","B","C","A","B","C","A","B","C","A"),
#' V2 = c("B","C","A","B","C","A","B","C","A","B"),
#' V3 = c("C","A","B","C","A","B","C","A","B","C"),
#' V4 = c("A","B","C","A","B","C","A","B","C","A")
#' )
#' hypa <- build_hypa(seqs, k = 2L)
#' print(hypa)
#' }
#'
#' @export
print.net_hypa <- function(x, ...) {
cat("HYPA: Path Anomaly Detection\n")
cat(sprintf(" Order k: %d\n", x$k))
cat(sprintf(" Edges: %d\n", x$n_edges))
cat(sprintf(" Anomalous: %d (alpha=%.2f, p_adjust=%s)\n",
x$n_anomalous, x$alpha, x$p_adjust %||% "none"))
cat(sprintf(" Over-repr: %d\n", x$n_over))
cat(sprintf(" Under-repr: %d\n", x$n_under))
invisible(x)
}
#' Summary Method for net_hypa
#'
#' @param object A \code{net_hypa} object.
#' @param n Integer. Maximum number of paths to display per category
#' (default: 10).
#' @param type Character. Which anomalies to show: \code{"all"} (default),
#' \code{"over"}, or \code{"under"}.
#' @param ... Additional arguments (ignored).
#'
#' @return The input object, invisibly.
#'
#' @examples
#' seqs <- list(c("A","B","C"), c("B","C","A"), c("A","C","B"), c("A","B","C"))
#' hyp <- build_hypa(seqs, k = 2)
#' summary(hyp)
#'
#' \donttest{
#' seqs <- data.frame(
#' V1 = c("A","B","C","A","B","C","A","B","C","A"),
#' V2 = c("B","C","A","B","C","A","B","C","A","B"),
#' V3 = c("C","A","B","C","A","B","C","A","B","C"),
#' V4 = c("A","B","C","A","B","C","A","B","C","A")
#' )
#' hypa <- build_hypa(seqs, k = 2L)
#' summary(hypa)
#' summary(hypa, type = "over", n = 5)
#' }
#'
#' @export
summary.net_hypa <- function(object, n = 10L,
type = c("all", "over", "under"), ...) {
type <- match.arg(type)
cat("HYPA Summary\n\n")
cat(sprintf(" Order: %d | Nodes: %d | Edges: %d\n",
object$k, nrow(object$nodes), object$n_edges))
cat(sprintf(" Alpha: %.2f | p_adjust: %s\n",
object$alpha, object$p_adjust %||% "none"))
cat(sprintf(" Anomalous: %d (over: %d, under: %d)\n\n",
object$n_anomalous, object$n_over, object$n_under))
show_cols <- c("path", "observed", "expected", "ratio", "p_value")
if (type %in% c("all", "over") && object$n_over > 0L) {
cat(" Over-represented (top", min(n, object$n_over), "):\n")
top_over <- utils::head(object$over[, show_cols, drop = FALSE], n)
print(top_over, row.names = FALSE)
cat("\n")
}
if (type %in% c("all", "under") && object$n_under > 0L) {
cat(" Under-represented (top", min(n, object$n_under), "):\n")
top_under <- utils::head(object$under[, show_cols, drop = FALSE], n)
print(top_under, row.names = FALSE)
cat("\n")
}
if (object$n_anomalous == 0L) {
cat(" No anomalous paths detected.\n")
}
invisible(object)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.