# NOTE(review): removed stray top-level token "Nothing" -- evaluating it on
# source() would raise "object 'Nothing' not found".
# ---- Higher-Order Networks (HON) ----
# Separator for encoding tuple keys as strings (non-printable, avoids
# collision with any state label)
.HON_SEP <- "\x01"  # ASCII SOH control character; state labels are assumed printable
# ---------------------------------------------------------------------------
# Encoding / decoding helpers
# ---------------------------------------------------------------------------
#' Encode a character vector (source tuple) into a single string key
#'
#' Inverse of \code{.hon_decode()}.
#'
#' @param x Character vector representing a source tuple.
#' @return Single string: elements of \code{x} joined by \code{.HON_SEP}.
#' @noRd
.hon_encode <- function(x) {
  paste0(x, collapse = .HON_SEP)
}
#' Decode a string key back to its character vector tuple
#'
#' Inverse of \code{.hon_encode()}.
#'
#' @param key Single string created by .hon_encode().
#' @return Character vector of tuple elements.
#' @noRd
.hon_decode <- function(key) {
  pieces <- strsplit(key, .HON_SEP, fixed = TRUE)
  pieces[[1L]]
}
#' Return the tuple length for one or more encoded keys
#'
#' Vectorized: works on a whole character vector of keys at once.
#'
#' @param keys Character vector of encoded keys.
#' @return Integer vector of tuple lengths.
#' @noRd
.hon_key_len <- function(keys) {
  without_sep <- gsub(.HON_SEP, "", keys, fixed = TRUE)
  # Tuple length = number of separators + 1
  as.integer(nchar(keys) - nchar(without_sep) + 1L)
}
# ---------------------------------------------------------------------------
# Input parsing
# ---------------------------------------------------------------------------
#' Parse and validate input for build_hon
#'
#' Canonicalises \code{data} into a list of character trajectories.
#' data.frame rows become trajectories with trailing NAs stripped; list
#' elements are coerced to character as-is. Optionally collapses adjacent
#' duplicate states, then drops trajectories with fewer than two states
#' (no transitions possible).
#'
#' @param data data.frame or list of character/numeric vectors.
#' @param collapse_repeats Logical. Remove adjacent duplicates.
#' @return List of character vectors, each of length >= 2.
#' @noRd
.hon_parse_input <- function(data, collapse_repeats = FALSE) {
  if (is.data.frame(data)) {
    stopifnot(
      "data.frame must have at least one column" = ncol(data) >= 1L,
      "data.frame must have at least one row" = nrow(data) >= 1L
    )
    row_to_traj <- function(i) {
      vals <- as.character(unlist(data[i, ], use.names = FALSE))
      observed <- which(!is.na(vals))
      if (length(observed) == 0L) {
        return(character(0L))
      }
      # Keep everything up to the last non-NA (interior NAs are retained)
      vals[seq_len(max(observed))]
    }
    trajectories <- lapply(seq_len(nrow(data)), row_to_traj)
    # Drop rows that were entirely NA
    trajectories <- trajectories[lengths(trajectories) > 0L]
  } else if (is.list(data)) {
    trajectories <- lapply(data, as.character)
  } else {
    stop("'data' must be a data.frame or list of character vectors") # nocov
  }
  if (collapse_repeats) {
    drop_adjacent_dups <- function(traj) {
      if (length(traj) <= 1L) return(traj) # nocov
      traj[c(TRUE, traj[-1L] != traj[-length(traj)])]
    }
    trajectories <- lapply(trajectories, drop_adjacent_dups)
  }
  # A trajectory needs at least two states to yield a transition
  trajectories[lengths(trajectories) >= 2L]
}
# ---------------------------------------------------------------------------
# Observation counting
# ---------------------------------------------------------------------------
#' Build observation counts from trajectories
#'
#' For each trajectory and each subsequence length from 2 to (max_order + 1),
#' counts how many times each (source_tuple -> target) transition occurs.
#'
#' @param trajectories List of character vectors.
#' @param max_order Integer. Maximum order to consider.
#' @return Environment mapping encoded source keys to named integer vectors
#'   of target counts.
#' @noRd
.hon_build_observations <- function(trajectories, max_order) {
  count <- new.env(hash = TRUE, parent = emptyenv())
  for (traj in trajectories) {
    n <- length(traj)
    # Guard: .hon_parse_input() already filters length-1 trajectories, but a
    # direct caller could pass one; seq.int(2L, 1L) would count backwards.
    if (n < 2L) next
    max_len <- min(max_order + 1L, n)
    for (len in seq.int(2L, max_len)) {
      n_subseqs <- n - len + 1L
      for (start in seq_len(n_subseqs)) {
        subseq <- traj[start:(start + len - 1L)]
        target <- subseq[len]
        source_key <- .hon_encode(subseq[-len])
        cur <- count[[source_key]]
        if (is.null(cur)) {
          cur <- integer(0L)
          names(cur) <- character(0L)
        }
        # Use match() rather than name-indexing: `cur[target]` read-access
        # never matches an empty-string name, which would reset that count
        # on every occurrence. This also mirrors .honp_build_order1().
        idx <- match(target, names(cur))
        if (is.na(idx)) {
          cur[target] <- 1L
        } else {
          cur[idx] <- cur[idx] + 1L
        }
        count[[source_key]] <- cur
      }
    }
  }
  count
}
# ---------------------------------------------------------------------------
# Distribution building
# ---------------------------------------------------------------------------
#' Build probability distributions from observation counts
#'
#' Counts below min_freq are zeroed (written back into \code{count} in place
#' so the KLD threshold later uses filtered totals, matching pyHON's
#' BuildDistributions); the surviving counts are normalised to probabilities.
#'
#' @param count Environment from .hon_build_observations(); modified in place.
#' @param min_freq Integer. Minimum count to keep a transition.
#' @return Environment mapping encoded source keys to named numeric vectors
#'   of probabilities (empty numeric when nothing survives the filter).
#' @noRd
.hon_build_distributions <- function(count, min_freq) {
  distr <- new.env(hash = TRUE, parent = emptyenv())
  for (key in ls(count)) {
    filtered <- count[[key]]
    filtered[filtered < min_freq] <- 0L
    count[[key]] <- filtered
    total <- sum(filtered)
    distr[[key]] <- if (total > 0L) {
      filtered[filtered > 0L] / total
    } else {
      numeric(0L)
    }
  }
  distr
}
# ---------------------------------------------------------------------------
# KL-divergence
# ---------------------------------------------------------------------------
#' KL-divergence from distribution a to distribution b
#'
#' Computes sum over targets t of a of P(a,t) * log2(P(a,t) / P(b,t)).
#' Any target with P(a,t) > 0 but P(b,t) = 0 makes the divergence Inf.
#' Targets missing from \code{b} are treated as probability 0.
#'
#' @param a Named numeric vector (probabilities).
#' @param b Named numeric vector (probabilities).
#' @return Numeric scalar (possibly Inf).
#' @noRd
.hon_kld <- function(a, b) {
  if (length(a) == 0L) return(0.0)
  p_a <- unname(a)
  p_b <- unname(b[names(a)])
  p_b[is.na(p_b)] <- 0.0
  support <- p_a > 0.0
  # A target present in a but absent (or zero) in b diverges infinitely
  if (any(support & p_b == 0.0)) return(Inf)
  keep <- support & p_b > 0.0
  sum(p_a[keep] * log2(p_a[keep] / p_b[keep]))
}
#' KL-divergence threshold for deciding whether to extend
#'
#' Threshold = new_order / log2(1 + total observations of ext_source).
#' More observations lower the threshold (easier to extend); a higher
#' proposed order raises it (harder to extend).
#'
#' @param new_order Integer. The proposed new order.
#' @param ext_source_key Encoded source key.
#' @param count Environment of observation counts.
#' @return Numeric scalar.
#' @noRd
.hon_kld_threshold <- function(new_order, ext_source_key, count) {
  observations <- sum(count[[ext_source_key]])
  new_order / log2(observations + 1.0)
}
# ---------------------------------------------------------------------------
# Source cache (suffix -> extended sources)
# ---------------------------------------------------------------------------
#' Build mapping from suffix tuples to their extended source tuples
#'
#' For every source tuple of length > 1 in \code{distr}, registers the full
#' tuple under each of its proper suffixes, keyed by the tuple's order.
#' Used later to look up candidate higher-order extensions of a source.
#'
#' @param distr Environment of distributions.
#' @return Environment: encoded suffix key -> environment mapping order
#'   (as character) -> character vector of encoded extended source keys.
#' @noRd
.hon_build_source_cache <- function(distr) {
  cache <- new.env(hash = TRUE, parent = emptyenv())
  for (src_key in ls(distr)) {
    tuple <- .hon_decode(src_key)
    len <- length(tuple)
    if (len <= 1L) next
    ord_key <- as.character(len)
    # Register src under each proper suffix (start positions 2..len)
    for (from in 2L:len) {
      suffix_key <- .hon_encode(tuple[from:len])
      bucket <- cache[[suffix_key]]
      if (is.null(bucket)) {
        bucket <- new.env(hash = TRUE, parent = emptyenv())
        cache[[suffix_key]] <- bucket
      }
      members <- bucket[[ord_key]]
      if (is.null(members)) members <- character(0L)
      # Set semantics: add each extended source at most once
      if (!(src_key %in% members)) {
        bucket[[ord_key]] <- c(members, src_key)
      }
    }
  }
  cache
}
#' Get extensions for a source at a given order from the cache
#'
#' @param curr_key Encoded key for the current source.
#' @param new_order Integer order.
#' @param cache Environment from .hon_build_source_cache().
#' @return Character vector of encoded extended source keys; character(0)
#'   when the source or order is not cached.
#' @noRd
.hon_get_extensions <- function(curr_key, new_order, cache) {
  order_env <- cache[[curr_key]]
  if (is.null(order_env)) return(character(0L))
  hits <- order_env[[as.character(new_order)]]
  if (is.null(hits)) character(0L) else hits
}
# ---------------------------------------------------------------------------
# HON+ helpers (parameter-free, lazy observation building)
# ---------------------------------------------------------------------------
#' HON+: maximum-divergence singleton distribution
#'
#' Builds the distribution that diverges maximally from \code{distr_vec}:
#' all probability mass on its least probable target (first one on ties).
#' Used by \code{.honp_extend_rule()} to prune branches that cannot
#' possibly exceed the KLD threshold.
#'
#' @param distr_vec Named numeric vector of probabilities.
#' @return Named numeric(1) equal to 1.0, named after the least probable
#'   target.
#' @noRd
.honp_max_divergence <- function(distr_vec) {
  least_likely <- names(which.min(distr_vec))
  structure(1.0, names = least_likely)
}
#' HON+: build order-1 counts, distributions, and starting points
#'
#' Single pass over the trajectories collecting order-1 transition counts,
#' plus the (trajectory-index, position) pairs that later allow lazy
#' higher-order extension.
#'
#' @param trajectories List of character vectors.
#' @param min_freq Integer. Minimum count to retain a transition.
#' @return List with three environments: \code{count}, \code{distr}, and
#'   \code{starting_points}.
#' @noRd
.honp_build_order1 <- function(trajectories, min_freq) {
  count <- new.env(hash = TRUE, parent = emptyenv())
  starting_points <- new.env(hash = TRUE, parent = emptyenv())
  for (ti in seq_along(trajectories)) {
    traj <- trajectories[[ti]]
    for (pos in seq_len(length(traj) - 1L)) {
      src_key <- .hon_encode(traj[pos])
      nxt <- traj[pos + 1L]
      # Tally the transition src -> nxt
      tally <- count[[src_key]]
      if (is.null(tally)) {
        tally <- integer(0L)
        names(tally) <- character(0L)
      }
      slot <- match(nxt, names(tally))
      if (is.na(slot)) {
        tally[nxt] <- 1L
      } else {
        tally[slot] <- tally[slot] + 1L
      }
      count[[src_key]] <- tally
      # Remember where this source occurred so higher orders can be
      # discovered lazily later
      pts <- starting_points[[src_key]]
      if (is.null(pts)) pts <- list()
      pts[[length(pts) + 1L]] <- c(ti, pos)
      starting_points[[src_key]] <- pts
    }
  }
  # min_freq filtering + normalisation is identical to the eager-HON
  # pipeline, so reuse that helper (it zeroes `count` in place too).
  distr <- .hon_build_distributions(count, min_freq)
  list(count = count, distr = distr, starting_points = starting_points)
}
#' HON+: lazily build higher-order counts for a source key
#'
#' For each starting point stored under \code{source_key}, looks one step back
#' in the trajectory to discover extended sources of order
#' \code{length(source) + 1}. Populates \code{count}, \code{distr},
#' \code{starting_points}, and \code{source_to_ext} in place.
#'
#' Recursive: before extending \code{source_key}, it makes sure the suffix
#' one order below has itself been extended, so starting points exist at
#' every level.
#'
#' @param source_key Encoded source key whose extensions are needed.
#' @param source_to_ext Environment mapping suffix keys to environments of
#'   extended source keys (set-like).
#' @param count Environment of observation counts (modified in place).
#' @param distr Environment of distributions (modified in place).
#' @param starting_points Environment of starting-point lists (modified in
#'   place).
#' @param trajectories List of character vectors.
#' @param min_freq Integer minimum frequency.
#' @return \code{NULL} invisibly.
#' @noRd
.honp_extend_observation <- function(source_key, source_to_ext,
                                     count, distr, starting_points,
                                     trajectories, min_freq) {
  source <- .hon_decode(source_key)
  order <- length(source)
  # Ensure the suffix order is already extended before going deeper
  if (order > 1L) {
    suffix_key <- .hon_encode(source[-1L])
    if (is.null(count[[suffix_key]]) || length(count[[suffix_key]]) == 0L) {
      .honp_extend_observation(suffix_key, source_to_ext, count, distr, # nocov start
                               starting_points, trajectories, min_freq) # nocov end
    }
  }
  # No starting points recorded -> nothing observable to extend
  sp <- starting_points[[source_key]]
  if (is.null(sp) || length(sp) == 0L) return(invisible(NULL)) # nocov
  # Accumulate counts for extended sources in a local environment first;
  # only sources surviving min_freq are committed to `count`/`distr` below
  local_count <- new.env(hash = TRUE, parent = emptyenv())
  for (point in sp) {
    ti <- point[1L]
    pos <- point[2L]
    traj <- trajectories[[ti]]
    # Look one step back (to grow the context) and one step forward
    # (for the target); both must stay inside the trajectory
    if (pos > 1L && (pos + order) <= length(traj)) {
      ext_source <- traj[(pos - 1L):(pos + order - 1L)]
      target <- traj[pos + order]
      ext_key <- .hon_encode(ext_source)
      # Accumulate count (match() on names so empty-string labels work)
      cur <- local_count[[ext_key]]
      if (is.null(cur)) {
        cur <- integer(0L)
        names(cur) <- character(0L)
      }
      idx <- match(target, names(cur))
      if (is.na(idx)) {
        cur[target] <- 1L
      } else {
        cur[idx] <- cur[idx] + 1L
      }
      local_count[[ext_key]] <- cur
      # Record starting point for the extended source: the extended context
      # begins one step earlier, hence pos - 1L
      esp <- starting_points[[ext_key]]
      if (is.null(esp)) esp <- list()
      esp[[length(esp) + 1L]] <- c(ti, pos - 1L)
      starting_points[[ext_key]] <- esp
    }
  }
  # Apply min_freq, build distributions, register in source_to_ext
  for (ext_key in ls(local_count)) {
    lc <- local_count[[ext_key]]
    lc[lc < min_freq] <- 0L
    count[[ext_key]] <- lc
    total <- sum(lc)
    if (total > 0L) {
      pos_counts <- lc[lc > 0L]
      distr[[ext_key]] <- pos_counts / total
      # Map suffix -> ext_source in source_to_ext (set semantics via env keys)
      suffix_key <- .hon_encode(.hon_decode(ext_key)[-1L])
      s2e <- source_to_ext[[suffix_key]]
      if (is.null(s2e)) {
        s2e <- new.env(hash = TRUE, parent = emptyenv())
        source_to_ext[[suffix_key]] <- s2e
      }
      s2e[[ext_key]] <- TRUE
    }
  }
  invisible(NULL)
}
#' HON+: lazy cache lookup for extended sources
#'
#' Returns the encoded extended-source keys for \code{curr_key} that have a
#' non-empty distribution. If the cache bucket does not exist yet, the
#' observations are built lazily first via
#' \code{.honp_extend_observation()}.
#'
#' @param curr_key Encoded key of the current source.
#' @param source_to_ext Extension cache environment.
#' @param count Count environment.
#' @param distr Distribution environment.
#' @param starting_points Starting-points environment.
#' @param trajectories List of character vectors.
#' @param min_freq Integer minimum frequency.
#' @return Character vector of extended source keys with non-empty
#'   distributions.
#' @noRd
.honp_extend_source_fast <- function(curr_key, source_to_ext,
                                     count, distr, starting_points,
                                     trajectories, min_freq) {
  # Keep only extension keys whose distribution survived min_freq filtering
  keys_with_distr <- function(bucket) {
    keys <- ls(bucket)
    ok <- vapply(keys, function(k) {
      d <- distr[[k]]
      !is.null(d) && length(d) > 0L
    }, logical(1L))
    keys[ok]
  }
  bucket <- source_to_ext[[curr_key]]
  if (!is.null(bucket)) return(keys_with_distr(bucket))
  # Not cached yet -- build lazily, then retry the lookup
  .honp_extend_observation(curr_key, source_to_ext, count, distr,
                           starting_points, trajectories, min_freq)
  bucket <- source_to_ext[[curr_key]]
  if (is.null(bucket)) return(character(0L))
  keys_with_distr(bucket)
}
#' HON+: add a source and all its prefixes to the rules (HON+ variant)
#'
#' Mirrors pyHON+'s \code{AddToRules()}: every prefix of \code{source_key}
#' is written to \code{rules}. A prefix whose distribution is missing or
#' empty is first populated lazily by extending its own suffix via
#' \code{.honp_extend_source_fast()}.
#'
#' @param source_key Encoded source key.
#' @param distr Distribution environment.
#' @param rules Rules environment (modified in place).
#' @param source_to_ext Extension cache environment.
#' @param count Count environment.
#' @param starting_points Starting-points environment.
#' @param trajectories List of character vectors.
#' @param min_freq Integer minimum frequency.
#' @noRd
.honp_add_to_rules <- function(source_key, distr, rules,
                               source_to_ext, count, starting_points,
                               trajectories, min_freq) {
  tuple <- .hon_decode(source_key)
  for (len in seq_along(tuple)) {
    pfx <- tuple[seq_len(len)]
    pfx_key <- .hon_encode(pfx)
    d <- distr[[pfx_key]]
    # Lazily populate a missing prefix distribution (only possible for
    # prefixes of length > 1, whose suffix can be extended)
    if ((is.null(d) || length(d) == 0L) && len > 1L) {
      .honp_extend_source_fast(.hon_encode(pfx[-1L]), source_to_ext, count,
                               distr, starting_points, trajectories, min_freq)
    }
    rules[[pfx_key]] <- distr[[pfx_key]]
  }
}
#' HON+: recursively extend a rule with MaxDivergence pre-check
#'
#' Extends rules to higher orders. Before attempting to extend, checks
#' whether even the maximum-possible divergence from \code{curr_distr}
#' would exceed the KLD threshold; if not, the current \code{valid_key} is
#' accepted and the branch is pruned.
#'
#' @param valid_key Encoded key of the currently accepted (valid) source.
#' @param curr_key Encoded key of the source being evaluated for extension.
#' @param order Current order level.
#' @param max_order Maximum allowed order.
#' @param distr Distribution environment.
#' @param count Count environment.
#' @param source_to_ext Extension cache environment.
#' @param starting_points Starting-points environment.
#' @param trajectories List of character vectors.
#' @param min_freq Integer minimum frequency.
#' @param rules Rules environment (modified in place).
#' @noRd
.honp_extend_rule <- function(valid_key, curr_key, order, max_order,
                              distr, count, source_to_ext, starting_points,
                              trajectories, min_freq, rules) {
  # Accept the currently valid source: record it (and its prefixes)
  accept <- function() {
    .honp_add_to_rules(valid_key, distr, rules, source_to_ext, count,
                       starting_points, trajectories, min_freq)
    invisible(NULL)
  }
  if (order >= max_order) return(accept())
  valid_distr <- distr[[valid_key]]
  curr_distr <- distr[[curr_key]]
  if (is.null(curr_distr) || length(curr_distr) == 0L) return(accept())
  # HON+ MaxDivergence pre-check: if even the most divergent possible
  # distribution cannot beat the threshold, prune here
  max_div <- .honp_max_divergence(curr_distr)
  if (.hon_kld(max_div, valid_distr) <
      .hon_kld_threshold(order + 1L, curr_key, count)) {
    return(accept())
  }
  new_order <- order + 1L
  extended <- .honp_extend_source_fast(curr_key, source_to_ext, count,
                                       distr, starting_points,
                                       trajectories, min_freq)
  if (length(extended) == 0L) return(accept())
  for (ext_key in extended) {
    ext_distr <- distr[[ext_key]]
    # A significantly divergent extension becomes the new valid source
    significant <- length(ext_distr) > 0L &&
      .hon_kld(ext_distr, valid_distr) >
        .hon_kld_threshold(new_order, ext_key, count)
    .honp_extend_rule(if (significant) ext_key else valid_key,
                      ext_key, new_order, max_order,
                      distr, count, source_to_ext, starting_points,
                      trajectories, min_freq, rules)
  }
}
#' HON+: main rule extraction pipeline
#'
#' Builds order-1 observations, then iterates over every order-1 source and
#' calls \code{.honp_extend_rule()} to discover higher-order rules via the
#' MaxDivergence pre-check.
#'
#' @param trajectories List of character vectors.
#' @param max_order Integer. Maximum order to consider.
#' @param min_freq Integer. Minimum transition count.
#' @return List with \code{rules} (environment) and \code{count}
#'   (environment).
#' @noRd
.honp_extract_rules <- function(trajectories, max_order, min_freq) {
  o1 <- .honp_build_order1(trajectories, min_freq)
  source_to_ext <- new.env(hash = TRUE, parent = emptyenv())
  rules <- new.env(hash = TRUE, parent = emptyenv())
  first_order <- Filter(function(k) .hon_key_len(k) == 1L, ls(o1$distr))
  for (source_key in first_order) {
    .honp_add_to_rules(source_key, o1$distr, rules, source_to_ext,
                       o1$count, o1$starting_points, trajectories, min_freq)
    .honp_extend_rule(source_key, source_key, 1L, max_order,
                      o1$distr, o1$count, source_to_ext,
                      o1$starting_points, trajectories, min_freq, rules)
  }
  list(rules = rules, count = o1$count)
}
# ---------------------------------------------------------------------------
# Rule generation (recursive)
# ---------------------------------------------------------------------------
#' Add a source and all its prefixes to the rules
#'
#' Iterative equivalent of pyHON's recursive AddToRules: walks from the full
#' tuple down to its shortest prefix, copying each distribution into
#' \code{rules} unless already present.
#'
#' @param source_key Encoded source key.
#' @param distr Environment of distributions.
#' @param rules Environment to store rules (modified in place).
#' @noRd
.hon_add_to_rules <- function(source_key, distr, rules) {
  tuple <- .hon_decode(source_key)
  for (len in rev(seq_along(tuple))) {
    key <- .hon_encode(tuple[seq_len(len)])
    if (is.null(rules[[key]])) {
      rules[[key]] <- distr[[key]]
    }
  }
}
#' Recursively extend a rule to higher orders if KLD justifies it
#'
#' An extension becomes the new valid source only when its KL divergence
#' from the currently valid distribution exceeds the order-dependent
#' threshold; otherwise the valid source is carried along unchanged.
#'
#' @param valid_key Encoded key of the currently valid (accepted) source.
#' @param curr_key Encoded key of the current source to try extending from.
#' @param order Current order level.
#' @param max_order Maximum order.
#' @param distr Environment of distributions.
#' @param count Environment of observation counts.
#' @param cache Source extension cache.
#' @param rules Environment to store rules (modified in place).
#' @noRd
.hon_extend_rule <- function(valid_key, curr_key, order, max_order,
                             distr, count, cache, rules) {
  if (order >= max_order) {
    return(.hon_add_to_rules(valid_key, distr, rules))
  }
  valid_distr <- distr[[valid_key]]
  new_order <- order + 1L
  extended <- .hon_get_extensions(curr_key, new_order, cache)
  if (length(extended) == 0L) {
    return(.hon_add_to_rules(valid_key, distr, rules))
  }
  for (ext_key in extended) {
    ext_distr <- distr[[ext_key]]
    significant <- length(ext_distr) > 0L &&
      .hon_kld(ext_distr, valid_distr) >
        .hon_kld_threshold(new_order, ext_key, count)
    .hon_extend_rule(if (significant) ext_key else valid_key,
                     ext_key, new_order, max_order,
                     distr, count, cache, rules)
  }
}
#' Extract all HON rules from trajectories
#'
#' Main rule extraction pipeline: builds observations, distributions, and
#' the source cache, then generates rules via recursive KLD extension
#' starting from every first-order source.
#'
#' @param trajectories List of character vectors.
#' @param max_order Integer.
#' @param min_freq Integer.
#' @return List with \code{rules} (environment) and \code{count}
#'   (environment).
#' @noRd
.hon_extract_rules <- function(trajectories, max_order, min_freq) {
  count <- .hon_build_observations(trajectories, max_order)
  distr <- .hon_build_distributions(count, min_freq)
  cache <- .hon_build_source_cache(distr)
  rules <- new.env(hash = TRUE, parent = emptyenv())
  first_order <- Filter(function(k) .hon_key_len(k) == 1L, ls(distr))
  for (source_key in first_order) {
    .hon_add_to_rules(source_key, distr, rules)
    .hon_extend_rule(source_key, source_key, 1L, max_order,
                     distr, count, cache, rules)
  }
  list(rules = rules, count = count)
}
# ---------------------------------------------------------------------------
# Network building
# ---------------------------------------------------------------------------
#' Build HON graph from rules
#'
#' Processes rules in ascending order of source tuple length, adds edges,
#' and rewires so higher-order nodes are connected into the graph.
#'
#' @param rules Environment of rules.
#' @return Environment mapping encoded source keys to environments mapping
#'   encoded target keys to numeric weights.
#' @noRd
.hon_build_network <- function(rules) {
  graph <- new.env(hash = TRUE, parent = emptyenv())
  keys <- ls(rules)
  # Lower orders must exist before higher orders try to rewire onto them
  for (source_key in keys[order(.hon_key_len(keys))]) {
    targets <- rules[[source_key]]
    if (is.null(graph[[source_key]])) {
      graph[[source_key]] <- new.env(hash = TRUE, parent = emptyenv())
    }
    is_higher_order <- length(.hon_decode(source_key)) > 1L
    for (target in names(targets)) {
      target_key <- .hon_encode(target)
      graph[[source_key]][[target_key]] <- targets[target]
      if (is_higher_order) {
        .hon_rewire(graph, source_key, target_key)
      }
    }
  }
  .hon_rewire_tails(graph)
  graph
}
#' Rewire incoming edges for a higher-order source
#'
#' When a higher-order source (A,B) is added, the edge from its prefix (A)
#' that points to (B) is redirected so it points to (A,B) instead.
#'
#' @param graph Environment (the HON graph).
#' @param source_key Encoded source (length > 1).
#' @param target_key Encoded target (unused here; kept for call-site
#'   symmetry).
#' @noRd
.hon_rewire <- function(graph, source_key, target_key) {
  tuple <- .hon_decode(source_key)
  lower_src_key <- .hon_encode(tuple[-length(tuple)])
  lower_tgt_key <- .hon_encode(tuple[length(tuple)])
  lower_graph <- graph[[lower_src_key]]
  # Nothing to do if the prefix node is absent or already points here
  if (is.null(lower_graph) || !is.null(lower_graph[[source_key]])) {
    return(invisible(NULL))
  }
  weight <- lower_graph[[lower_tgt_key]]
  if (!is.null(weight)) {
    # Redirect: lower_src -> source (instead of -> lower_tgt)
    lower_graph[[source_key]] <- weight
    rm(list = lower_tgt_key, envir = lower_graph)
  }
  invisible(NULL)
}
#' Rewire outgoing edges to point to highest-order targets
#'
#' For each edge (source -> target) with a first-order target, searches for
#' the longest higher-order version of the target (source context + target)
#' that exists as a node, and redirects the edge to it. Changes are staged
#' and applied after the scan so iteration over the graph is not disturbed.
#'
#' @param graph Environment (the HON graph); modified in place.
#' @noRd
.hon_rewire_tails <- function(graph) {
  additions <- list()
  removals <- list()
  for (src_key in ls(graph)) {
    src_tuple <- .hon_decode(src_key)
    edges <- graph[[src_key]]
    for (tgt_key in ls(edges)) {
      tgt_tuple <- .hon_decode(tgt_key)
      if (length(tgt_tuple) != 1L) next
      # Try progressively shorter contexts: (src, tgt), (src[-1], tgt), ...
      candidate <- c(src_tuple, tgt_tuple)
      while (length(candidate) > 1L) {
        cand_key <- .hon_encode(candidate)
        if (!is.null(graph[[cand_key]])) {
          additions[[length(additions) + 1L]] <- list(
            source = src_key,
            target = cand_key,
            weight = edges[[tgt_key]]
          )
          removals[[length(removals) + 1L]] <- list(
            source = src_key,
            target = tgt_key
          )
          break
        }
        candidate <- candidate[-1L]
      }
    }
  }
  for (edge in additions) {
    graph[[edge$source]][[edge$target]] <- edge$weight
  }
  for (edge in removals) {
    src_env <- graph[[edge$source]]
    if (!is.null(src_env[[edge$target]])) {
      rm(list = edge$target, envir = src_env)
    }
  }
  invisible(NULL)
}
# ---------------------------------------------------------------------------
# Node naming
# ---------------------------------------------------------------------------
#' Convert a source/target tuple to readable arrow notation
#'
#' Examples:
#'   ("A",)         -> "A"
#'   ("A","B")      -> "A -> B"
#'   ("X","A","B")  -> "X -> A -> B"
#'   character(0)   -> ""
#'
#' @param seq Character vector (decoded tuple).
#' @return Character string in arrow notation.
#' @noRd
.hon_sequence_to_node <- function(seq) {
  # paste(character(0), collapse = ...) already yields "" for empty tuples
  paste(seq, collapse = " -> ")
}
# ---------------------------------------------------------------------------
# Graph to edge list
# ---------------------------------------------------------------------------
#' Convert HON graph environment to edge list data.frame
#'
#' Returns a data.frame with unified column structure matching MOGen/HYPA:
#' \code{path} (full state sequence), \code{from} (context/conditioning states),
#' \code{to} (next state), \code{count}, \code{probability}, plus
#' \code{from_order} and \code{to_order}.
#'
#' @param graph Environment from .hon_build_network().
#' @param count_env Environment of observation counts (optional, for raw
#'   counts; \code{count} is NA when no lookup is possible).
#' @return data.frame with columns: path, from, to, count, probability,
#'   from_order, to_order.
#' @noRd
.hon_graph_to_edgelist <- function(graph, count_env = NULL) {
  rows <- list()
  idx <- 0L
  for (source_key in ls(graph)) {
    source <- .hon_decode(source_key)
    from_node <- .hon_sequence_to_node(source)
    from_order <- length(source)
    source_graph <- graph[[source_key]]
    for (target_key in ls(source_graph)) {
      target <- .hon_decode(target_key)
      to_order <- length(target)
      prob <- source_graph[[target_key]]
      # Next state = last element of target tuple
      next_state <- target[length(target)]
      # Path = source context + next state
      path <- paste(c(source, next_state), collapse = " -> ")
      # Count lookup from the count environment. Use `[` (returns NA for an
      # absent name) rather than `[[`, which raises "subscript out of
      # bounds" on a named atomic vector when the name is missing — the
      # previous `is.null(raw)` guard could never fire.
      edge_count <- NA_integer_
      if (!is.null(count_env)) {
        src_counts <- count_env[[source_key]]
        if (!is.null(src_counts)) {
          raw <- src_counts[next_state]
          if (!is.na(raw)) edge_count <- as.integer(raw)
        }
      }
      idx <- idx + 1L
      rows[[idx]] <- data.frame(
        path = path,
        from = from_node,
        to = next_state,
        count = edge_count,
        probability = as.numeric(prob),
        from_order = from_order,
        to_order = to_order,
        stringsAsFactors = FALSE
      )
    }
  }
  if (idx == 0L) {
    # Empty graph: return a zero-row frame with the full schema
    return(data.frame(
      path = character(0L),
      from = character(0L),
      to = character(0L),
      count = integer(0L),
      probability = numeric(0L),
      from_order = integer(0L),
      to_order = integer(0L),
      stringsAsFactors = FALSE
    ))
  }
  do.call(rbind, rows)
}
# ---------------------------------------------------------------------------
# Assembly
# ---------------------------------------------------------------------------
#' Assemble final net_hon output object
#'
#' Builds the edge list, the readable node set, the dense probability
#' matrix, and merges in cograph-compatible fields from
#' \code{.ho_cograph_fields()} (defined elsewhere in the package).
#'
#' @param graph Environment from .hon_build_network().
#' @param rules Environment from .hon_extract_rules().
#' @param trajectories List of character vectors (parsed input).
#' @param max_order Integer. The order requested by the caller.
#' @param min_freq Integer. Minimum transition frequency used.
#' @param count_env Environment of observation counts (for raw counts in
#'   edges).
#' @return net_hon S3 object (also inherits "cograph_network").
#' @noRd
.hon_assemble_output <- function(graph, rules, trajectories, max_order,
                                 min_freq, count_env = NULL) {
  edges <- .hon_graph_to_edgelist(graph, count_env)
  # Collect all node names from graph (both sources and targets)
  all_nodes <- character(0L)
  for (source_key in ls(graph)) {
    source <- .hon_decode(source_key)
    all_nodes <- c(all_nodes, .hon_sequence_to_node(source))
    source_graph <- graph[[source_key]]
    for (target_key in ls(source_graph)) {
      target <- .hon_decode(target_key)
      all_nodes <- c(all_nodes, .hon_sequence_to_node(target))
    }
  }
  nodes <- sort(unique(all_nodes))
  # Compute max order observed: longest source tuple among extracted rules
  rule_keys <- ls(rules)
  if (length(rule_keys) > 0L) {
    observed_max_order <- max(.hon_key_len(rule_keys))
  } else {
    observed_max_order <- 1L
  }
  # Build adjacency matrix from graph directly (using readable node names);
  # entries are the edge probabilities, absent edges stay 0
  n <- length(nodes)
  mat <- matrix(0.0, nrow = n, ncol = n, dimnames = list(nodes, nodes))
  for (source_key in ls(graph)) {
    from_node <- .hon_sequence_to_node(.hon_decode(source_key))
    source_graph <- graph[[source_key]]
    for (target_key in ls(source_graph)) {
      to_node <- .hon_sequence_to_node(.hon_decode(target_key))
      mat[from_node, to_node] <- source_graph[[target_key]]
    }
  }
  # Unique first-order states (from the raw trajectories, not the graph)
  first_order_states <- sort(unique(unlist(trajectories, use.names = FALSE)))
  # cograph-compatible fields -- .ho_cograph_fields() is a package-level
  # helper; presumably returns edges/nodes/meta/node_groups (defined
  # outside this file -- verify its contract there)
  cg <- .ho_cograph_fields(mat, nodes, method = "hon")
  result <- structure(
    c(list(
      weights = mat,
      matrix = mat,
      ho_edges = edges,
      edges = cg$edges,
      nodes = cg$nodes,
      n_nodes = n,
      n_edges = nrow(edges),
      first_order_states = first_order_states,
      max_order_requested = max_order,
      max_order_observed = observed_max_order,
      min_freq = min_freq,
      n_trajectories = length(trajectories),
      directed = TRUE
    ), cg[c("meta", "node_groups")]),
    class = c("net_hon", "cograph_network")
  )
  result
}
# ---------------------------------------------------------------------------
# Main entry point
# ---------------------------------------------------------------------------
#' Build a Higher-Order Network (HON)
#'
#' @description
#' Constructs a Higher-Order Network from sequential data, faithfully
#' implementing the BuildHON algorithm (Xu, Wickramarathne & Chawla, 2016).
#'
#' The algorithm detects when a first-order Markov model is insufficient
#' to capture sequential dependencies and automatically creates higher-order
#' nodes. Uses KL-divergence to determine whether extending a node's history
#' provides significantly different transition distributions.
#'
#' @param data One of:
#' \itemize{
#' \item \code{data.frame}: rows are trajectories, columns are time steps.
#' Trailing \code{NA}s are stripped. All non-NA values are coerced to
#' character.
#' \item \code{list}: each element is a character (or coercible) vector
#' representing one trajectory.
#' \item \code{tna}: a tna object with sequence data. Numeric state
#' IDs are automatically converted to label names.
#' \item \code{netobject}: a netobject with sequence data.
#' }
#' @param max_order Integer. Maximum order of the HON. Default 5.
#' The algorithm may produce lower-order nodes if the data do not
#' justify higher orders.
#' @param min_freq Integer. Minimum frequency for a transition to be
#' considered. Transitions observed fewer than \code{min_freq} times are
#' treated as zero. Default 1.
#' @param collapse_repeats Logical. If \code{TRUE}, adjacent duplicate states
#' within each trajectory are collapsed before analysis. Default \code{FALSE}.
#' @param method Character. Algorithm to use: \code{"hon+"} (default,
#' parameter-free BuildHON+ with lazy observation building and MaxDivergence
#' pruning) or \code{"hon"} (original BuildHON with eager observation
#' building).
#'
#' @return An S3 object of class \code{"net_hon"} containing:
#' \describe{
#' \item{matrix}{Weighted adjacency matrix (rows = from, cols = to).
#' Rows and columns use readable arrow notation (e.g., \code{"A -> B"}).}
#' \item{edges}{Data frame with columns: \code{path} (full state sequence,
#' e.g., "A -> B -> C"), \code{from} (context/conditioning states),
#' \code{to} (predicted next state), \code{count} (raw frequency),
#' \code{probability} (transition probability), \code{from_order},
#' \code{to_order}.}
#' \item{nodes}{Character vector of HON node names in arrow notation.}
#' \item{n_nodes}{Number of HON nodes.}
#' \item{n_edges}{Number of edges.}
#' \item{first_order_states}{Character vector of unique original states.}
#' \item{max_order_requested}{The \code{max_order} parameter used.}
#' \item{max_order_observed}{Highest order actually present.}
#' \item{min_freq}{The \code{min_freq} parameter used.}
#' \item{n_trajectories}{Number of trajectories after parsing.}
#' \item{directed}{Logical. Always \code{TRUE}.}
#' }
#'
#' @details
#' \strong{Node naming convention}: Higher-order nodes use readable arrow
#' notation. A first-order node is simply \code{"A"}. A second-order node
#' representing the context "came from A, now at B" is \code{"A -> B"}.
#' Third-order: \code{"A -> B -> C"}, etc.
#'
#' \strong{Algorithm overview}:
#' \enumerate{
#' \item Count all subsequence transitions up to \code{max_order + 1}.
#' \item Build probability distributions, filtering by \code{min_freq}.
#' \item For each first-order source, recursively test whether extending
#' the history (adding more context) produces a significantly different
#' distribution (via KL-divergence vs. an adaptive threshold).
#' \item Build the network from the accepted rules, rewiring edges so
#' higher-order nodes are properly connected.
#' }
#'
#' @references
#' Xu, J., Wickramarathne, T. L., & Chawla, N. V. (2016). Representing
#' higher-order dependencies in networks. \emph{Science Advances},
#' 2(5), e1600028.
#'
#' Saebi, M., Xu, J., Kaplan, L. M., Ribeiro, B., & Chawla, N. V. (2020).
#' Efficient modeling of higher-order dependencies in networks: from algorithm
#' to application for anomaly detection. \emph{EPJ Data Science}, 9(1), 15.
#'
#' @examples
#' seqs <- list(c("A","B","C","D"), c("A","B","C","A"), c("B","C","D","A"))
#' hon <- build_hon(seqs, max_order = 2)
#'
#' \donttest{
#' # From list of trajectories
#' trajs <- list(
#' c("A", "B", "C", "D", "A"),
#' c("A", "B", "D", "C", "A"),
#' c("A", "B", "C", "D", "A")
#' )
#' hon <- build_hon(trajs, max_order = 3, min_freq = 1)
#' print(hon)
#' summary(hon)
#'
#' # From data.frame (rows = trajectories)
#' df <- data.frame(T1 = c("A", "A"), T2 = c("B", "B"),
#' T3 = c("C", "D"), T4 = c("D", "C"))
#' hon <- build_hon(df, max_order = 2)
#' }
#'
#' @export
build_hon <- function(data, max_order = 5L, min_freq = 1L,
                      collapse_repeats = FALSE, method = "hon+") {
  # Accept tna / netobject inputs by coercing them to a labeled data.frame
  # before validation, so the checks below see the canonical form.
  data <- .coerce_sequence_input(data)
  # Validate arguments up front; condition names double as error messages.
  stopifnot(
    "'data' must be a data.frame or list" =
      is.data.frame(data) || is.list(data),
    "'data' must have at least one trajectory/row" =
      (is.data.frame(data) && nrow(data) >= 1L && ncol(data) >= 1L) ||
        (is.list(data) && !is.data.frame(data) && length(data) >= 1L),
    "'max_order' must be >= 1" = is.numeric(max_order) && max_order >= 1L,
    "'min_freq' must be >= 1" = is.numeric(min_freq) && min_freq >= 1L
  )
  method <- match.arg(method, c("hon+", "hon"))
  max_order <- as.integer(max_order)
  min_freq <- as.integer(min_freq)
  # Canonical list of character trajectories (trailing NAs stripped, optional
  # collapsing of adjacent duplicates).
  trajs <- .hon_parse_input(data, collapse_repeats = collapse_repeats)
  if (length(trajs) == 0L) {
    stop("No valid trajectories (each must have at least 2 states)")
  }
  # Rule extraction: parameter-free BuildHON+ or the original BuildHON.
  extraction <- switch(
    method,
    "hon+" = .honp_extract_rules(trajs, max_order, min_freq),
    "hon" = .hon_extract_rules(trajs, max_order, min_freq)
  )
  # Wire the accepted rules into a network and assemble the S3 object.
  graph <- .hon_build_network(extraction$rules)
  .hon_assemble_output(graph, extraction$rules, trajs, max_order, min_freq,
                       extraction$count)
}
# ---------------------------------------------------------------------------
# S3 methods
# ---------------------------------------------------------------------------
#' Print Method for net_hon
#'
#' @param x A \code{net_hon} object.
#' @param ... Additional arguments (ignored).
#'
#' @return The input object, invisibly.
#'
#' @examples
#' seqs <- list(c("A","B","C","D"), c("A","B","C","A"), c("B","C","D","A"))
#' hon <- build_hon(seqs, max_order = 2)
#' print(hon)
#'
#' \donttest{
#' seqs <- data.frame(
#' V1 = c("A","B","C","A","B"),
#' V2 = c("B","C","A","B","C"),
#' V3 = c("C","A","B","C","A")
#' )
#' hon <- build_hon(seqs, max_order = 2L)
#' print(hon)
#' }
#'
#' @export
print.net_hon <- function(x, ...) {
  # Compact one-screen description of a HON: node/edge counts, order
  # information, and the number of trajectories it was built from.
  text <- c(
    "Higher-Order Network (HON)\n",
    sprintf(" Nodes: %d (%d first-order states)\n",
            x$n_nodes, length(x$first_order_states)),
    sprintf(" Edges: %d\n", x$n_edges),
    sprintf(" Max order: %d (requested %d)\n",
            x$max_order_observed, x$max_order_requested),
    sprintf(" Min freq: %d\n", x$min_freq),
    sprintf(" Trajectories: %d\n", x$n_trajectories)
  )
  cat(text, sep = "")
  invisible(x)
}
#' Summary Method for net_hon
#'
#' @param object A \code{net_hon} object.
#' @param ... Additional arguments (ignored).
#'
#' @return The input object, invisibly.
#'
#' @examples
#' seqs <- list(c("A","B","C","D"), c("A","B","C","A"), c("B","C","D","A"))
#' hon <- build_hon(seqs, max_order = 2)
#' summary(hon)
#'
#' \donttest{
#' seqs <- data.frame(
#' V1 = c("A","B","C","A","B"),
#' V2 = c("B","C","A","B","C"),
#' V3 = c("C","A","B","C","A")
#' )
#' hon <- build_hon(seqs, max_order = 2L)
#' summary(hon)
#' }
#'
#' @export
summary.net_hon <- function(object, ...) {
  # Longer description than print(): adds the first-order state list and,
  # when the network is non-empty, the distribution of node orders.
  header <- c(
    "Higher-Order Network (HON) Summary\n",
    sprintf(" Nodes: %d | Edges: %d | Trajectories: %d\n",
            object$n_nodes, object$n_edges, object$n_trajectories),
    sprintf(" First-order states: %s\n",
            paste(object$first_order_states, collapse = ", ")),
    sprintf(" Max order observed: %d (requested: %d)\n",
            object$max_order_observed, object$max_order_requested),
    sprintf(" Min frequency: %d\n", object$min_freq)
  )
  cat(header, sep = "")
  if (object$n_nodes > 0L) {
    # A node's order equals the number of " -> "-separated states in its
    # arrow-notation label.
    orders <- lengths(strsplit(object$nodes$label, " -> ", fixed = TRUE))
    order_tab <- table(orders)
    cat(" Node order distribution:\n")
    for (ord in names(order_tab)) {
      cat(sprintf(" Order %s: %d nodes\n", ord, order_tab[ord]))
    }
  }
  invisible(object)
}