Nothing
# Cluster Metrics for Network Analysis
# Summary measures for between/within clusters and multilayer networks
# ==============================================================================
# 1. Edge Weight Aggregation
# ==============================================================================
#' Aggregate Edge Weights
#'
#' Aggregates a vector of edge weights using various methods.
#' Compatible with igraph's edge.attr.comb parameter.
#'
#' @param w Numeric vector of edge weights
#' @param method Aggregation method: "sum", "mean", "median", "max", "min",
#' "prod", "density", "geomean"
#' @param n_possible Number of possible edges (for density calculation)
#' @return Single aggregated value
#' @export
#' @examples
#' w <- c(0.5, 0.8, 0.3, 0.9)
#' net_aggregate_weights(w, "sum") # 2.5
#' net_aggregate_weights(w, "mean") # 0.625
#' net_aggregate_weights(w, "max") # 0.9
#' mat <- matrix(c(0, 0.5, 0.5, 0.3, 0, 0.7, 0.4, 0.6, 0), 3, 3, byrow = TRUE)
#' net_aggregate_weights(mat)
net_aggregate_weights <- function(w, method = "sum", n_possible = NULL) {
# Remove NA and zero weights
w <- w[!is.na(w) & w != 0]
if (length(w) == 0) return(0)
switch(method,
"sum" = sum(w),
"mean" = mean(w),
"median" = stats::median(w),
"max" = max(w),
"min" = min(w),
"prod" = prod(w),
"density" = if (!is.null(n_possible) && n_possible > 0) {
sum(w) / n_possible
} else {
sum(w) / length(w)
},
"geomean" = {
pos_w <- w[w > 0]
if (length(pos_w) == 0) 0 else exp(mean(log(pos_w)))
},
stop("Unknown method: ", method, call. = FALSE)
)
}
# ==============================================================================
# 2. Cluster Summary (Between/Within Aggregates)
# ==============================================================================
#' Cluster Summary Statistics
#'
#' Aggregates node-level network weights to cluster-level summaries. Computes
#' both between-cluster transitions (how clusters connect to each other) and
#' within-cluster transitions (how nodes connect within each cluster).
#'
#' This is the core function for Multi-Cluster Multi-Level (MCML) analysis.
#' Use \code{as_tna()} to convert results to tna objects for further
#' analysis with the tna package.
#'
#' @param x Network input. Accepts multiple formats:
#' \describe{
#' \item{matrix}{Numeric adjacency/weight matrix. Row and column names are
#' used as node labels. Values represent edge weights (e.g., transition
#' counts, co-occurrence frequencies, or probabilities).}
#' \item{netobject}{A cograph network object. The function extracts
#' the weight matrix from \code{x$weights} or converts via
#' \code{to_matrix()}. Clusters can be auto-detected from node attributes.}
#' \item{tna}{A tna object from the tna package. Extracts \code{x$weights}.}
#' \item{cluster_summary}{If already a cluster_summary, returns unchanged.}
#' }
#'
#' @param clusters Cluster/group assignments for nodes. Accepts multiple formats:
#' \describe{
#' \item{NULL}{(default) Auto-detect from netobject. Looks for columns
#' named 'clusters', 'cluster', 'groups', or 'group' in \code{x$nodes}.
#' Throws an error if no cluster column is found.
#' This option only works when \code{x} is a netobject.}
#' \item{vector}{Cluster membership for each node, in the same order as the
#' matrix rows/columns. Can be numeric (1, 2, 3) or character ("A", "B").
#' Cluster names will be derived from unique values.
#' Example: \code{c(1, 1, 2, 2, 3, 3)} assigns first two nodes to cluster 1.}
#' \item{data.frame}{A data frame where the first column contains node names
#' and the second column contains group/cluster names.
#' Example: \code{data.frame(node = c("A", "B", "C"), group = c("G1", "G1", "G2"))}}
#' \item{named list}{Explicit mapping of cluster names to node labels.
#' List names become cluster names, values are character vectors of node
#' labels that must match matrix row/column names.
#' Example: \code{list(Alpha = c("A", "B"), Beta = c("C", "D"))}}
#' }
#'
#' @param method Aggregation method for combining edge weights within/between
#' clusters. Controls how multiple node-to-node edges are summarized:
#' \describe{
#' \item{"sum"}{(default) Sum of all edge weights. Best for count data
#' (e.g., transition frequencies). Preserves total flow.}
#' \item{"mean"}{Average edge weight. Best when cluster sizes differ and
#' you want to control for size. Note: when input is already a transition
#' matrix (rows sum to 1), "mean" avoids size bias.
#' Example: cluster with 5 nodes won't have 5x the weight of cluster with 1 node.}
#' \item{"median"}{Median edge weight. Robust to outliers.}
#' \item{"max"}{Maximum edge weight. Captures strongest connection.}
#' \item{"min"}{Minimum edge weight. Captures weakest connection.}
#' \item{"density"}{Sum divided by number of possible edges. Normalizes
#' by cluster size combinations.}
#' \item{"geomean"}{Geometric mean of positive weights. Useful for
#' multiplicative processes.}
#' }
#'
#' @param type Post-processing applied to aggregated weights. Determines the
#' interpretation of the resulting matrices:
#' \describe{
#' \item{"tna"}{(default) Row-normalize so each row sums to 1. Creates
#' transition probabilities suitable for Markov chain analysis.
#' Interpretation: "Given I'm in cluster A, what's the probability
#' of transitioning to cluster B?"
#' Required for use with tna package functions.
#' Diagonal represents within-cluster transition probability.}
#' \item{"raw"}{No normalization. Returns aggregated counts/weights as-is.
#' Use for frequency analysis or when you need raw counts.
#' Compatible with igraph's contract + simplify output.}
#' \item{"cooccurrence"}{Symmetrize the matrix: (A + t(A)) / 2.
#' For undirected co-occurrence analysis.}
#' \item{"semi_markov"}{Row-normalize with duration weighting.
#' For semi-Markov process analysis.}
#' }
#'
#' @param directed Logical. If \code{TRUE} (default), treat network as directed.
#' A->B and B->A are separate edges. If \code{FALSE}, edges are undirected
#' and the matrix is symmetrized before processing.
#'
#' @param compute_within Logical. If \code{TRUE} (default), compute within-cluster
#' transition matrices for each cluster. Each cluster gets its own n_i x n_i
#' matrix showing internal node-to-node transitions.
#' Set to \code{FALSE} to skip this computation for better performance when
#' only between-cluster summary is needed.
#'
#' @return A \code{cluster_summary} object (S3 class) containing:
#' \describe{
#' \item{between}{List with two elements:
#' \describe{
#' \item{weights}{k x k matrix of cluster-to-cluster weights, where k is
#' the number of clusters. Row i, column j contains the aggregated
#' weight from cluster i to cluster j. Diagonal contains within-cluster
#' totals. Processing depends on \code{type}.}
#' \item{inits}{Numeric vector of length k. Initial state distribution
#' across clusters, computed from column sums of the original matrix.
#' Represents the proportion of incoming edges to each cluster.}
#' }
#' }
#' \item{within}{Named list with one element per cluster. Each element contains:
#' \describe{
#' \item{weights}{n_i x n_i matrix for nodes within that cluster.
#' Shows internal transitions between nodes in the same cluster.}
#' \item{inits}{Initial distribution within the cluster.}
#' }
#' NULL if \code{compute_within = FALSE}.}
#' \item{clusters}{Named list mapping cluster names to their member node labels.
#' Example: \code{list(A = c("n1", "n2"), B = c("n3", "n4", "n5"))}}
#' \item{meta}{List of metadata:
#' \describe{
#' \item{type}{The \code{type} argument used ("tna", "raw", etc.)}
#' \item{method}{The \code{method} argument used ("sum", "mean", etc.)}
#' \item{directed}{Logical, whether network was treated as directed}
#' \item{n_nodes}{Total number of nodes in original network}
#' \item{n_clusters}{Number of clusters}
#' \item{cluster_sizes}{Named vector of cluster sizes}
#' }
#' }
#' }
#'
#' @details
#' ## Workflow
#'
#' Typical MCML analysis workflow:
#' \preformatted{
#' # 1. Create network
#' net <- build_network(data, method = "relative")
#' net$nodes$clusters <- group_assignments
#'
#' # 2. Compute cluster summary
#' cs <- cluster_summary(net, type = "tna")
#'
#' # 3. Convert to tna models
#' tna_models <- as_tna(cs)
#'
#' # 4. Analyze/visualize
#' plot(tna_models$macro)
#' tna::centralities(tna_models$macro)
#' }
#'
#' ## Between-Cluster Matrix Structure
#'
#' The \code{macro$weights} matrix has clusters as both rows and columns:
#' \itemize{
#' \item Off-diagonal (row i, col j): Aggregated weight from cluster i to cluster j
#' \item Diagonal (row i, col i): Within-cluster total (sum of internal edges in cluster i)
#' }
#'
#' When \code{type = "tna"}, rows sum to 1 and diagonal values represent
#' "retention rate" - the probability of staying within the same cluster.
#'
#' ## Choosing method and type
#'
#' \tabular{lll}{
#' \strong{Input data} \tab \strong{Recommended} \tab \strong{Reason} \cr
#' Edge counts \tab method="sum", type="tna" \tab Preserves total flow, normalizes to probabilities \cr
#' Transition matrix \tab method="mean", type="tna" \tab Avoids cluster size bias \cr
#' Frequencies \tab method="sum", type="raw" \tab Keep raw counts for analysis \cr
#' Correlation matrix \tab method="mean", type="raw" \tab Average correlations \cr
#' }
#'
#' @export
#' @seealso
#' \code{as_tna()} to convert results to tna objects,
#' \code{plot()} for two-layer visualization,
#' \code{plot()} for flat cluster visualization
#'
#' @examples
#' # -----------------------------------------------------
#' # Basic usage with matrix and cluster vector
#' # -----------------------------------------------------
#' mat <- matrix(runif(100), 10, 10)
#' rownames(mat) <- colnames(mat) <- LETTERS[1:10]
#'
#' clusters <- c(1, 1, 1, 2, 2, 2, 3, 3, 3, 3)
#' cs <- cluster_summary(mat, clusters)
#'
#' # Access results
#' cs$macro$weights # 3x3 cluster transition matrix
#' cs$macro$inits # Initial distribution
#' cs$clusters$`1`$weights # Within-cluster 1 transitions
#' cs$meta # Metadata
#'
#' # -----------------------------------------------------
#' # Named list clusters (more readable)
#' # -----------------------------------------------------
#' clusters <- list(
#' Alpha = c("A", "B", "C"),
#' Beta = c("D", "E", "F"),
#' Gamma = c("G", "H", "I", "J")
#' )
#' cs <- cluster_summary(mat, clusters, type = "tna")
#' cs$macro$weights # Rows/cols named Alpha, Beta, Gamma
#' cs$clusters$Alpha # Within Alpha cluster
#'
#' # -----------------------------------------------------
#' # Auto-detect clusters from netobject
#' # -----------------------------------------------------
#' \donttest{
#' seqs <- data.frame(
#' V1 = sample(LETTERS[1:10], 30, TRUE), V2 = sample(LETTERS[1:10], 30, TRUE),
#' V3 = sample(LETTERS[1:10], 30, TRUE)
#' )
#' net <- build_network(seqs, method = "relative")
#' cs2 <- cluster_summary(net, c(1, 1, 1, 2, 2, 2, 3, 3, 3, 3))
#' }
#'
#' # -----------------------------------------------------
#' # Different aggregation methods
#' # -----------------------------------------------------
#' cs_sum <- cluster_summary(mat, clusters, method = "sum") # Total flow
#' cs_mean <- cluster_summary(mat, clusters, method = "mean") # Average
#' cs_max <- cluster_summary(mat, clusters, method = "max") # Strongest
#'
#' # -----------------------------------------------------
#' # Raw counts vs TNA probabilities
#' # -----------------------------------------------------
#' cs_raw <- cluster_summary(mat, clusters, type = "raw")
#' cs_tna <- cluster_summary(mat, clusters, type = "tna")
#'
#' rowSums(cs_raw$macro$weights) # Various sums
#' rowSums(cs_tna$macro$weights) # All equal to 1
#'
#' # -----------------------------------------------------
#' # Skip within-cluster computation for speed
#' # -----------------------------------------------------
#' cs_fast <- cluster_summary(mat, clusters, compute_within = FALSE)
#' cs_fast$clusters # NULL
#'
#' # -----------------------------------------------------
#' # Convert to tna objects for tna package
#' # -----------------------------------------------------
#' cs <- cluster_summary(mat, clusters, type = "tna")
#' tna_models <- as_tna(cs)
#' # tna_models$macro # tna object
#' # tna_models$clusters$Alpha # tna object
cluster_summary <- function(x,
clusters = NULL,
method = c("sum", "mean", "median", "max",
"min", "density", "geomean"),
type = c("tna", "cooccurrence", "semi_markov", "raw"),
directed = TRUE,
compute_within = TRUE) {
# If already an mcml object, return as-is
if (inherits(x, "mcml")) {
return(x) # nocov
}
type <- match.arg(type)
method <- match.arg(method)
# Store original for cluster extraction
x_orig <- x
# Extract matrix from various input types
if (inherits(x, "cograph_network") && !inherits(x, "netobject")) {
x <- .as_netobject(x)
mat <- x$weights
} else if (inherits(x, "netobject") || inherits(x, "netobject_ml")) {
mat <- x$weights
} else if (inherits(x, "tna")) {
mat <- x$weights
} else {
mat <- x
}
if (is.null(clusters)) {
stop("clusters argument is required", call. = FALSE)
}
# Validate input matrix
if (!is.matrix(mat) || !is.numeric(mat)) {
stop("x must be a netobject, tna object, or numeric matrix", call. = FALSE) # nocov
}
if (nrow(mat) != ncol(mat)) {
stop("x must be a square matrix", call. = FALSE)
}
n <- nrow(mat)
node_names <- rownames(mat)
if (is.null(node_names)) node_names <- as.character(seq_len(n))
# Convert clusters to list format
cluster_list <- .normalize_clusters(clusters, node_names)
n_clusters <- length(cluster_list)
cluster_names <- names(cluster_list)
if (is.null(cluster_names)) cluster_names <- as.character(seq_len(n_clusters)) # nocov
names(cluster_list) <- cluster_names
# Get node indices for each cluster
cluster_indices <- lapply(cluster_list, function(nodes_vec) {
match(nodes_vec, node_names)
})
# ============================================================================
# Between-cluster computation (always computed)
# ============================================================================
# Aggregate between-cluster weights
between_raw <- matrix(0, n_clusters, n_clusters,
dimnames = list(cluster_names, cluster_names))
for (i in seq_len(n_clusters)) {
idx_i <- cluster_indices[[i]]
n_i <- length(idx_i)
for (j in seq_len(n_clusters)) {
idx_j <- cluster_indices[[j]]
n_j <- length(idx_j)
# Both diagonal (within-cluster) and off-diagonal (between-cluster)
w_ij <- mat[idx_i, idx_j]
n_possible <- n_i * n_j
between_raw[i, j] <- net_aggregate_weights(as.vector(w_ij), method, n_possible)
}
}
# Process based on type
between_weights <- .process_weights(between_raw, type, directed)
# Compute inits from column sums
col_sums <- colSums(between_raw)
total <- sum(col_sums)
if (total > 0) {
between_inits <- col_sums / total
} else {
between_inits <- rep(1 / n_clusters, n_clusters)
}
names(between_inits) <- cluster_names
# Build $macro
between <- list(
weights = between_weights,
inits = between_inits,
labels = cluster_names,
data = NULL
)
# ============================================================================
# Within-cluster computation (optional)
# ============================================================================
within_data <- NULL
if (isTRUE(compute_within)) {
within_data <- lapply(seq_len(n_clusters), function(i) {
idx_i <- cluster_indices[[i]]
n_i <- length(idx_i)
cl_nodes <- cluster_list[[i]]
if (n_i <= 1) {
# Single node: preserve self-loop value
within_raw <- mat[idx_i, idx_i, drop = FALSE]
dimnames(within_raw) <- list(cl_nodes, cl_nodes)
within_weights_i <- within_raw
within_inits_i <- setNames(1, cl_nodes)
} else {
# Extract within-cluster raw weights (preserves self-loops)
within_raw <- mat[idx_i, idx_i]
dimnames(within_raw) <- list(cl_nodes, cl_nodes)
# Process based on type
within_weights_i <- .process_weights(within_raw, type, directed)
# Within-cluster inits (handle NAs)
col_sums_w <- colSums(within_raw, na.rm = TRUE)
total_w <- sum(col_sums_w, na.rm = TRUE)
within_inits_i <- if (!is.na(total_w) && total_w > 0) {
col_sums_w / total_w
} else {
rep(1 / n_i, n_i)
}
names(within_inits_i) <- cl_nodes
}
list(
weights = within_weights_i,
inits = within_inits_i,
labels = cl_nodes,
data = NULL
)
})
names(within_data) <- cluster_names
}
# ============================================================================
# Build result
# ============================================================================
result <- structure(
list(
macro = between,
clusters = within_data,
cluster_members = cluster_list,
meta = list(
type = type,
method = method,
directed = directed,
n_nodes = n,
n_clusters = n_clusters,
cluster_sizes = vapply(cluster_list, length, integer(1))
)
),
class = "mcml"
)
result
}
# ==============================================================================
# 2b. Build MCML from Raw Transition Data
# ==============================================================================
#' Build MCML from Raw Transition Data
#'
#' Builds a Multi-Cluster Multi-Level (MCML) model from raw transition data
#' (edge lists or sequences) by recoding node labels to cluster labels and
#' counting actual transitions. Unlike \code{\link{cluster_summary}} which
#' aggregates a pre-computed weight matrix, this function works from the
#' original transition data to produce the TRUE Markov chain over cluster states.
#'
#' @param x Input data. Accepts multiple formats:
#' \describe{
#' \item{data.frame with from/to columns}{Edge list. Columns named
#' from/source/src/v1/node1/i and to/target/tgt/v2/node2/j are
#' auto-detected. Optional weight column (weight/w/value/strength).}
#' \item{data.frame without from/to columns}{Sequence data. Each row is a
#' sequence, columns are time steps. Consecutive pairs (t, t+1) become
#' transitions.}
#' \item{tna object}{If \code{x$data} is non-NULL, uses sequence path on
#' the raw data. Otherwise falls back to \code{\link{cluster_summary}}.}
#' \item{netobject}{If \code{x$data} is non-NULL, detects edge list
#' vs sequence data. Otherwise falls back to \code{\link{cluster_summary}}.}
#' \item{cluster_summary}{Returns as-is.}
#' \item{square numeric matrix}{Falls back to \code{\link{cluster_summary}}.}
#' \item{non-square or character matrix}{Treated as sequence data.}
#' }
#'
#' @param clusters Cluster/group assignments. Accepts:
#' \describe{
#' \item{named list}{Direct mapping. List names = cluster names, values =
#' character vectors of node labels.
#' Example: \code{list(A = c("N1","N2"), B = c("N3","N4"))}}
#' \item{data.frame}{A data frame where the first column contains node names
#' and the second column contains group/cluster names.
#' Example: \code{data.frame(node = c("N1","N2","N3"), group = c("A","A","B"))}}
#' \item{membership vector}{Character or numeric vector. Node names are
#' extracted from the data.
#' Example: \code{c("A","A","B","B")}}
#' \item{column name string}{For edge list data.frames, the name of a
#' column containing cluster labels. The mapping is built from unique
#' (node, group) pairs in both from and to columns.}
#' \item{NULL}{Auto-detect from \code{netobject$nodes} or
#' \code{$node_groups} (same logic as \code{\link{cluster_summary}}).}
#' }
#'
#' @param method Aggregation method for combining edge weights: "sum", "mean",
#' "median", "max", "min", "density", "geomean". Default "sum".
#' @param type Post-processing: "tna" (row-normalize), "cooccurrence"
#' (symmetrize), "semi_markov", or "raw". Default "tna".
#' @param directed Logical. Treat as directed network? Default TRUE.
#' @param compute_within Logical. Compute within-cluster matrices? Default TRUE.
#'
#' @return A \code{cluster_summary} object with \code{meta$source = "transitions"},
#' fully compatible with \code{plot()}, \code{as_tna()}, and
#' \code{plot()}.
#'
#' @export
#' @seealso \code{\link{cluster_summary}} for matrix-based aggregation,
#' \code{net_as_tna()} to convert to tna objects,
#' \code{plot()} for visualization
#'
#' @examples
#' # Edge list with clusters
#' edges <- data.frame(
#' from = c("A", "A", "B", "C", "C", "D"),
#' to = c("B", "C", "A", "D", "D", "A"),
#' weight = c(1, 2, 1, 3, 1, 2)
#' )
#' clusters <- list(G1 = c("A", "B"), G2 = c("C", "D"))
#' cs <- build_mcml(edges, clusters)
#' cs$macro$weights
#'
#' # Sequence data with clusters
#' seqs <- data.frame(
#' T1 = c("A", "C", "B"),
#' T2 = c("B", "D", "A"),
#' T3 = c("C", "C", "D"),
#' T4 = c("D", "A", "C")
#' )
#' cs <- build_mcml(seqs, clusters, type = "raw")
#' cs$macro$weights
build_mcml <- function(x,
clusters = NULL,
method = c("sum", "mean", "median", "max",
"min", "density", "geomean"),
type = c("tna", "frequency", "cooccurrence",
"semi_markov", "raw"),
directed = TRUE,
compute_within = TRUE) {
# If already an mcml object, return as-is
if (inherits(x, "mcml")) {
return(x)
}
type <- match.arg(type)
method <- match.arg(method)
# Coerce cograph_network so downstream branches see a netobject
if (inherits(x, "cograph_network")) x <- .as_netobject(x)
input_type <- .detect_mcml_input(x)
switch(input_type,
"edgelist" = .build_mcml_edgelist(x, clusters, method, type,
directed, compute_within),
"sequence" = .build_mcml_sequence(x, clusters, method, type,
directed, compute_within),
"tna_data" = .build_mcml_sequence(x$data, clusters, method, type,
directed, compute_within),
"tna_matrix" = cluster_summary(x, clusters, method = method, type = type,
directed = directed,
compute_within = compute_within),
"netobject_data" = {
data <- x$data
# Auto-detect clusters from network if not provided
if (is.null(clusters)) {
clusters <- .auto_detect_clusters(x) # nocov
}
sub_type <- .detect_mcml_input(data)
if (sub_type == "edgelist") {
.build_mcml_edgelist(data, clusters, method, type,
directed, compute_within)
} else {
.build_mcml_sequence(data, clusters, method, type,
directed, compute_within)
}
},
"netobject_matrix" = {
if (is.null(clusters)) {
clusters <- .auto_detect_clusters(x) # nocov
}
cluster_summary(x, clusters, method = method, type = type,
directed = directed, compute_within = compute_within)
},
"matrix" = cluster_summary(x, clusters, method = method, type = type,
directed = directed,
compute_within = compute_within),
stop("Cannot build MCML from input of class '", class(x)[1], "'",
call. = FALSE)
)
}
#' Detect input type for build_mcml
#' @keywords internal
.detect_mcml_input <- function(x) {
if (inherits(x, "tna")) {
if (!is.null(x$data)) return("tna_data")
return("tna_matrix")
}
if (inherits(x, "netobject") || inherits(x, "netobject_ml")) {
if (!is.null(x$data)) return("netobject_data")
return("netobject_matrix")
}
if (is.data.frame(x)) {
col_names <- tolower(names(x))
from_cols <- c("from", "source", "src", "v1", "node1", "i")
to_cols <- c("to", "target", "tgt", "v2", "node2", "j")
has_from <- any(from_cols %in% col_names)
has_to <- any(to_cols %in% col_names)
if (has_from && has_to) return("edgelist")
return("sequence")
}
if (is.matrix(x)) {
if (is.numeric(x) && nrow(x) == ncol(x)) return("matrix")
return("sequence")
}
"unknown"
}
#' Auto-detect clusters from netobject
#' @keywords internal
.auto_detect_clusters <- function(x) {
clusters <- NULL
if (!is.null(x$nodes)) {
cluster_cols <- c("clusters", "cluster", "groups", "group")
for (col in cluster_cols) {
if (col %in% names(x$nodes)) {
clusters <- x$nodes[[col]]
break
}
}
}
if (is.null(clusters) && !is.null(x$node_groups)) {
ng <- x$node_groups
cluster_col <- intersect(c("cluster", "group", "layer"), names(ng))
if (length(cluster_col) > 0) {
clusters <- ng[[cluster_col[1]]]
}
}
if (is.null(clusters)) {
stop("No clusters found in netobject. ",
"Add a 'clusters' column to nodes or provide clusters argument.",
call. = FALSE)
}
clusters
}
#' Build node-to-cluster lookup from cluster specification
#' @keywords internal
.build_cluster_lookup <- function(clusters, all_nodes) {
if (is.data.frame(clusters)) { # nocov start
# Defensive: .normalize_clusters converts df to list before this is called
stopifnot(ncol(clusters) >= 2)
nodes <- as.character(clusters[[1]])
groups <- as.character(clusters[[2]])
clusters <- split(nodes, groups)
} # nocov end
if (is.list(clusters) && !is.data.frame(clusters)) {
# Named list: cluster_name -> node vector
lookup <- character(0)
for (cl_name in names(clusters)) {
nodes <- clusters[[cl_name]]
lookup[nodes] <- cl_name
}
# Verify all nodes are mapped
unmapped <- setdiff(all_nodes, names(lookup))
if (length(unmapped) > 0) {
stop("Unmapped nodes: ",
paste(utils::head(unmapped, 5), collapse = ", "),
call. = FALSE)
}
return(lookup)
}
if (is.character(clusters) || is.factor(clusters)) {
clusters <- as.character(clusters)
if (length(clusters) != length(all_nodes)) {
stop("Membership vector length (", length(clusters),
") must equal number of unique nodes (", length(all_nodes), ")",
call. = FALSE)
}
lookup <- setNames(clusters, all_nodes)
return(lookup)
}
if (is.numeric(clusters) || is.integer(clusters)) {
if (length(clusters) != length(all_nodes)) {
stop("Membership vector length (", length(clusters),
") must equal number of unique nodes (", length(all_nodes), ")",
call. = FALSE)
}
lookup <- setNames(as.character(clusters), all_nodes)
return(lookup)
}
stop("clusters must be a named list, character/numeric vector, or column name",
call. = FALSE)
}
#' Build cluster_summary from transition vectors
#' @keywords internal
.build_from_transitions <- function(from_nodes, to_nodes, weights,
cluster_lookup, cluster_list,
method, type, directed,
compute_within, data = NULL) {
# Sort clusters alphabetically (TNA convention)
cluster_list <- cluster_list[order(names(cluster_list))]
cluster_names <- names(cluster_list)
n_clusters <- length(cluster_names)
# Recode to cluster labels
from_clusters <- cluster_lookup[from_nodes]
to_clusters <- cluster_lookup[to_nodes]
# ---- Between-cluster matrix (includes diagonal = within-cluster loops) ----
between_raw <- matrix(0, n_clusters, n_clusters,
dimnames = list(cluster_names, cluster_names))
# Include ALL transitions — node-level self-loops (A->A) are valid
# cluster-level self-loops (e.g. discuss->discuss = Social->Social)
b_from <- from_clusters
b_to <- to_clusters
b_w <- weights
if (length(b_from) > 0) {
# Build pair keys and aggregate
pair_keys <- paste(b_from, b_to, sep = "\t")
names(b_w) <- pair_keys
agg_vals <- tapply(b_w, pair_keys, function(w) {
n_possible <- NULL
if (method == "density") {
parts <- strsplit(names(w)[1], "\t")[[1]]
n_i <- length(cluster_list[[parts[1]]])
n_j <- length(cluster_list[[parts[2]]])
n_possible <- n_i * n_j
}
net_aggregate_weights(w, method, n_possible)
})
for (key in names(agg_vals)) {
parts <- strsplit(key, "\t")[[1]]
between_raw[parts[1], parts[2]] <- agg_vals[[key]]
}
}
# Process between weights
between_weights <- .process_weights(between_raw, type, directed)
# Compute inits from column sums
col_sums <- colSums(between_raw)
total <- sum(col_sums)
if (total > 0) {
between_inits <- col_sums / total
} else {
between_inits <- rep(1 / n_clusters, n_clusters) # nocov
}
names(between_inits) <- cluster_names
# ---- Build recoded sequence data if input is sequences ----
between_seq_data <- NULL
within_seq_data_list <- NULL
is_seq <- is.data.frame(data) && !any(tolower(names(data)) %in%
c("from", "source", "src", "v1", "node1", "i",
"to", "target", "tgt", "v2", "node2", "j"))
if (is_seq) {
# Recode sequences to cluster labels
between_seq_data <- as.data.frame(
lapply(data, function(col) {
recoded <- unname(cluster_lookup[as.character(col)])
recoded
}),
stringsAsFactors = FALSE
)
# Filter sequences per cluster (keep only that cluster's nodes, NA others)
within_seq_data_list <- lapply(cluster_list, function(cl_nodes) {
as.data.frame(
lapply(data, function(col) {
vals <- as.character(col)
vals[!vals %in% cl_nodes] <- NA_character_
vals
}),
stringsAsFactors = FALSE
)
})
}
between <- list(
weights = between_weights,
inits = between_inits,
labels = cluster_names,
data = between_seq_data
)
# ---- Within-cluster matrices ----
within_data <- NULL
if (isTRUE(compute_within)) {
# Filter within-cluster transitions (same cluster, including self-loops)
is_within <- from_clusters == to_clusters
w_from <- from_nodes[is_within]
w_to <- to_nodes[is_within]
w_w <- weights[is_within]
within_data <- lapply(seq_len(n_clusters), function(i) {
cl_name <- cluster_names[i]
cl_nodes <- cluster_list[[cl_name]]
n_i <- length(cl_nodes)
if (n_i <= 1) {
# Single node: count self-loops
keep_self <- w_from %in% cl_nodes & w_to %in% cl_nodes
self_weight <- if (any(keep_self)) {
net_aggregate_weights(w_w[keep_self], method)
} else { 0 }
within_weights_i <- matrix(self_weight, 1, 1,
dimnames = list(cl_nodes, cl_nodes))
within_inits_i <- setNames(1, cl_nodes)
} else {
# Filter transitions for this cluster (keep self-loops)
keep <- w_from %in% cl_nodes & w_to %in% cl_nodes
cf <- w_from[keep]
ct <- w_to[keep]
cw <- w_w[keep]
within_raw <- matrix(0, n_i, n_i,
dimnames = list(cl_nodes, cl_nodes))
if (length(cf) > 0) {
pair_keys <- paste(cf, ct, sep = "\t")
agg_vals <- tapply(cw, pair_keys, function(w) {
net_aggregate_weights(w, method)
})
for (key in names(agg_vals)) {
parts <- strsplit(key, "\t")[[1]]
within_raw[parts[1], parts[2]] <- agg_vals[[key]]
}
}
within_weights_i <- .process_weights(within_raw, type, directed)
col_sums_w <- colSums(within_raw, na.rm = TRUE)
total_w <- sum(col_sums_w, na.rm = TRUE)
within_inits_i <- if (!is.na(total_w) && total_w > 0) {
col_sums_w / total_w
} else {
rep(1 / n_i, n_i) # nocov
}
names(within_inits_i) <- cl_nodes
}
# Attach filtered sequence data for this cluster
cl_seq_data <- if (!is.null(within_seq_data_list)) {
within_seq_data_list[[cl_name]]
} else {
NULL
}
list(
weights = within_weights_i,
inits = within_inits_i,
labels = cl_nodes,
data = cl_seq_data
)
})
names(within_data) <- cluster_names
}
# ---- Edges data.frame ----
edge_type <- ifelse(from_clusters == to_clusters, "within", "between")
edges <- data.frame(
from = from_nodes,
to = to_nodes,
weight = weights,
cluster_from = unname(from_clusters),
cluster_to = unname(to_clusters),
type = edge_type,
stringsAsFactors = FALSE
)
# ---- Assemble result ----
all_nodes <- sort(unique(c(from_nodes, to_nodes)))
n_nodes <- length(all_nodes)
structure(
list(
macro = between,
clusters = within_data,
cluster_members = cluster_list,
edges = edges,
data = data,
meta = list(
type = type,
method = method,
directed = directed,
n_nodes = n_nodes,
n_clusters = n_clusters,
cluster_sizes = vapply(cluster_list, length, integer(1)),
source = "transitions"
)
),
class = "mcml"
)
}
#' Build MCML from edge list data.frame
#' @keywords internal
.build_mcml_edgelist <- function(df, clusters, method, type,
directed, compute_within) {
col_names <- tolower(names(df))
# Detect from/to columns
from_col <- which(col_names %in% c("from", "source", "src",
"v1", "node1", "i"))[1]
if (is.na(from_col)) from_col <- 1L
to_col <- which(col_names %in% c("to", "target", "tgt",
"v2", "node2", "j"))[1]
if (is.na(to_col)) to_col <- 2L
# Detect weight column
weight_col <- which(col_names %in% c("weight", "w", "value", "strength"))[1]
has_weight <- !is.na(weight_col)
from_vals <- as.character(df[[from_col]])
to_vals <- as.character(df[[to_col]])
weights <- if (has_weight) as.numeric(df[[weight_col]]) else rep(1, nrow(df))
# Remove rows with NA in from/to
valid <- !is.na(from_vals) & !is.na(to_vals)
from_vals <- from_vals[valid]
to_vals <- to_vals[valid]
weights <- weights[valid]
all_nodes <- sort(unique(c(from_vals, to_vals)))
# Handle clusters parameter
if (is.character(clusters) && length(clusters) == 1 &&
clusters %in% names(df)) {
# Column name: build lookup from both from+group and to+group
group_col <- df[[clusters]]
group_col <- as.character(group_col[valid])
# Build mapping from from-side
from_map <- setNames(group_col, from_vals)
# Build mapping from to-side
to_map <- setNames(group_col, to_vals)
# Merge (from takes priority if conflicting, but shouldn't)
full_map <- c(to_map, from_map)
# Keep unique node -> cluster mapping
full_map <- full_map[!duplicated(names(full_map))]
# Build cluster_list
cluster_list <- split(names(full_map), unname(full_map))
cluster_list <- lapply(cluster_list, sort)
cluster_lookup <- full_map
# Re-derive all_nodes from the lookup
all_nodes <- sort(names(cluster_lookup))
} else {
# List or vector clusters
if (is.null(clusters)) {
stop("clusters argument is required for data.frame input", call. = FALSE)
}
if (is.list(clusters) && !is.data.frame(clusters)) {
cluster_list <- clusters
} else {
# Membership vector
cluster_list <- .normalize_clusters(clusters, all_nodes)
}
cluster_lookup <- .build_cluster_lookup(cluster_list, all_nodes)
}
.build_from_transitions(from_vals, to_vals, weights,
cluster_lookup, cluster_list,
method, type, directed, compute_within,
data = df)
}
#' Build MCML from sequence data.frame
#' @keywords internal
.build_mcml_sequence <- function(df, clusters, method, type,
directed, compute_within) {
if (is.matrix(df)) df <- as.data.frame(df, stringsAsFactors = FALSE)
stopifnot(is.data.frame(df))
nc <- ncol(df)
if (nc < 2) {
stop("Sequence data must have at least 2 columns (time steps)",
call. = FALSE)
}
# Extract consecutive pairs: (col[t], col[t+1]) for all rows
pairs <- lapply(seq_len(nc - 1), function(t) {
from_t <- as.character(df[[t]])
to_t <- as.character(df[[t + 1]])
data.frame(from = from_t, to = to_t, stringsAsFactors = FALSE)
})
pairs <- as.data.frame(data.table::rbindlist(pairs))
# Remove NA pairs
valid <- !is.na(pairs$from) & !is.na(pairs$to)
from_vals <- pairs$from[valid]
to_vals <- pairs$to[valid]
weights <- rep(1, length(from_vals))
all_nodes <- sort(unique(c(from_vals, to_vals)))
if (is.null(clusters)) {
stop("clusters argument is required for sequence data", call. = FALSE)
}
if (is.list(clusters) && !is.data.frame(clusters)) {
cluster_list <- clusters
} else {
cluster_list <- .normalize_clusters(clusters, all_nodes)
}
cluster_lookup <- .build_cluster_lookup(cluster_list, all_nodes)
.build_from_transitions(from_vals, to_vals, weights,
cluster_lookup, cluster_list,
method, type, directed, compute_within,
data = df)
}
#' Process weights based on type
#' @keywords internal
.process_weights <- function(raw_weights, type, directed = TRUE) {
if (type == "raw" || type == "frequency") {
return(raw_weights)
}
if (type == "cooccurrence") {
# Symmetrize
return((raw_weights + t(raw_weights)) / 2)
}
if (type == "tna" || type == "semi_markov") {
# Row-normalize so rows sum to 1
rs <- rowSums(raw_weights, na.rm = TRUE)
processed <- raw_weights / ifelse(rs == 0 | is.na(rs), 1, rs)
processed[is.na(processed)] <- 0
return(processed)
}
# Default: return as-is
raw_weights # nocov
}
#' Convert cluster_summary to tna Objects
#'
#' Converts a \code{cluster_summary} object to proper tna objects that can be
#' used with all functions from the tna package. Creates both a between-cluster
#' tna model (cluster-level transitions) and within-cluster tna models (internal
#' transitions within each cluster).
#'
#' This is the final step in the MCML workflow, enabling full integration with
#' the tna package for centrality analysis, bootstrap validation, permutation
#' tests, and visualization.
#'
#' @param x A \code{cluster_summary} object created by \code{\link{cluster_summary}}.
#' The cluster_summary should typically be created with \code{type = "tna"} to
#' ensure row-normalized transition probabilities. If created with
#' \code{type = "raw"}, the raw counts will be passed to \code{tna::tna()}
#' which will normalize them.
#'
#' @return A \code{cluster_tna} object (S3 class) containing:
#' \describe{
#' \item{between}{A tna object representing cluster-level transitions.
#' Contains \code{$weights} (k x k transition matrix), \code{$inits}
#' (initial distribution), and \code{$labels} (cluster names).
#' Use this for analyzing how learners/entities move between high-level
#' groups or phases.}
#' \item{within}{Named list of tna objects, one per cluster. Each tna object
#' represents internal transitions within that cluster. Contains
#' \code{$weights} (n_i x n_i matrix), \code{$inits} (initial distribution),
#' and \code{$labels} (node labels). Clusters with single nodes or zero-row
#' nodes are excluded (tna requires positive row sums).}
#' }
#'
#' @details
#' ## Requirements
#'
#' The tna package must be installed. If not available, the function throws
#' an error with installation instructions.
#'
#' ## Workflow
#'
#' \preformatted{
#' # Full MCML workflow
#' net <- build_network(data, method = "relative")
#' net$nodes$clusters <- group_assignments
#' cs <- cluster_summary(net, type = "tna")
#' tna_models <- as_tna(cs)
#'
#' # Now use tna package functions
#' plot(tna_models$macro)
#' tna::centralities(tna_models$macro)
#' tna::bootstrap(tna_models$macro, iter = 1000)
#'
#' # Analyze within-cluster patterns
#' plot(tna_models$clusters$ClusterA)
#' tna::centralities(tna_models$clusters$ClusterA)
#' }
#'
#' ## Excluded Clusters
#'
#' A within-cluster tna cannot be created when:
#' \itemize{
#' \item The cluster has only 1 node (no internal transitions possible)
#' \item Some nodes in the cluster have no outgoing edges (row sums to 0)
#' }
#'
#' These clusters are silently excluded from \code{$clusters}. The between-cluster
#' model still includes all clusters.
#'
#' @export
#' @seealso
#' \code{\link{cluster_summary}} to create the input object,
#' \code{plot()} for visualization without conversion,
#' \code{tna::tna} for the underlying tna constructor
#'
#' @examples
#' mat <- matrix(runif(36), 6, 6)
#' rownames(mat) <- colnames(mat) <- LETTERS[1:6]
#' clusters <- list(G1 = c("A", "B"), G2 = c("C", "D"), G3 = c("E", "F"))
#' cs <- cluster_summary(mat, clusters, type = "tna")
#' tna_models <- as_tna(cs)
#' tna_models
#' tna_models$macro$weights
as_tna <- function(x) {
UseMethod("as_tna")
}
#' @rdname as_tna
#' @return A \code{netobject_group} with data preserved from each sub-network.
#' @export
as_tna.mcml <- function(x) {
# Determine method from meta
meta_type <- x$meta$type
net_method <- if (!is.null(meta_type) && meta_type %in% c("raw", "frequency")) {
"frequency"
} else {
"relative"
}
directed <- if (!is.null(x$meta$directed)) x$meta$directed else TRUE
# Macro
macro_net <- .wrap_netobject(x$macro$weights, data = x$macro$data,
method = net_method, directed = directed)
# Per-cluster
cluster_nets <- if (!is.null(x$clusters)) {
nets <- lapply(names(x$clusters), function(cl) {
w <- x$clusters[[cl]]$weights
d <- x$clusters[[cl]]$data
if (any(rowSums(w) == 0)) return(NULL)
.wrap_netobject(w, data = d, method = net_method, directed = directed)
})
names(nets) <- names(x$clusters)
nets[!vapply(nets, is.null, logical(1))]
} else {
list()
}
result <- c(list(macro = macro_net), cluster_nets)
class(result) <- "netobject_group"
result
}
#' Wrap a weight matrix + optional data into a minimal netobject
#' @noRd
.wrap_netobject <- function(mat, data = NULL, method = "relative",
directed = TRUE) {
states <- rownames(mat)
edges <- .extract_edges_from_matrix(mat, directed = directed)
nodes_df <- data.frame(
id = seq_along(states), label = states, name = states,
x = NA_real_, y = NA_real_, stringsAsFactors = FALSE
)
structure(
list(
data = data,
weights = mat,
nodes = nodes_df,
edges = edges,
directed = directed,
method = method,
params = list(),
scaling = NULL,
threshold = 0,
n_nodes = length(states),
n_edges = nrow(edges),
level = NULL,
meta = list(source = "nestimate", layout = NULL,
tna = list(method = method)),
node_groups = NULL
),
class = c("netobject", "cograph_network")
)
}
#' @rdname as_tna
#' @return A \code{tna} object constructed from the input.
#' @export
as_tna.default <- function(x) {
if (inherits(x, "tna")) {
return(x)
}
stop("Cannot convert object of class '", class(x)[1], "' to tna", call. = FALSE)
}
#' Normalize cluster specification to list format
#' @keywords internal
#' @noRd
.normalize_clusters <- function(clusters, node_names) {
if (is.data.frame(clusters)) {
# Data frame with node and group columns
stopifnot(ncol(clusters) >= 2)
nodes <- as.character(clusters[[1]])
groups <- as.character(clusters[[2]])
clusters <- split(nodes, groups)
}
if (is.list(clusters)) {
# Already a list - validate node names
all_nodes <- unlist(clusters)
if (!all(all_nodes %in% node_names)) {
missing <- setdiff(all_nodes, node_names)
stop("Unknown nodes in clusters: ",
paste(utils::head(missing, 5), collapse = ", "), call. = FALSE)
}
return(clusters)
}
if (is.vector(clusters) && (is.numeric(clusters) || is.integer(clusters))) {
# Membership vector
if (length(clusters) != length(node_names)) {
stop("Membership vector length (", length(clusters),
") must equal number of nodes (", length(node_names), ")",
call. = FALSE)
}
# Convert to list
unique_clusters <- sort(unique(clusters))
cluster_list <- lapply(unique_clusters, function(k) {
node_names[clusters == k]
})
names(cluster_list) <- as.character(unique_clusters)
return(cluster_list)
}
if (is.factor(clusters) || is.character(clusters)) {
# Named membership
if (length(clusters) != length(node_names)) {
stop("Membership vector length must equal number of nodes", call. = FALSE)
}
clusters <- as.character(clusters)
unique_clusters <- unique(clusters)
cluster_list <- lapply(unique_clusters, function(k) {
node_names[clusters == k]
})
names(cluster_list) <- unique_clusters
return(cluster_list)
}
stop("clusters must be a list, numeric vector, or factor", call. = FALSE)
}
# ==============================================================================
# S3 Methods
# ==============================================================================
#' Print Method for mcml
#'
#' @param x An \code{mcml} object.
#' @param ... Additional arguments (ignored).
#'
#' @return The input object, invisibly.
#'
#' @examples
#' seqs <- data.frame(V1 = c("A","B","C","A"), V2 = c("B","C","A","B"))
#' clusters <- list(G1 = c("A","B"), G2 = c("C"))
#' cs <- build_mcml(seqs, clusters)
#' print(cs)
#' \donttest{
#' seqs <- data.frame(
#' T1 = c("A","B","A"), T2 = c("B","C","B"),
#' T3 = c("C","A","C"), T4 = c("A","B","A")
#' )
#' clusters <- c("Alpha", "Beta", "Alpha")
#' cs <- build_mcml(seqs, clusters, type = "raw")
#' print(cs)
#' }
#'
#' @export
print.mcml <- function(x, ...) {
n_clusters <- x$meta$n_clusters
n_nodes <- x$meta$n_nodes
cluster_sizes <- x$meta$cluster_sizes
cat("MCML Network\n")
cat("============\n")
cat("Type:", x$meta$type, " | Method:", x$meta$method, "\n")
cat("Nodes:", n_nodes, " | Clusters:", n_clusters, "\n")
if (!is.null(x$edges)) {
cat("Transitions:", nrow(x$edges), "\n")
n_between <- sum(x$edges$type == "between")
n_within <- sum(x$edges$type == "within")
cat(" Macro:", n_between, " | Per-cluster:", n_within, "\n\n")
} else {
cat("\n")
}
cat("Clusters:\n")
for (cl in names(x$cluster_members)) {
cat(" ", cl, " (", cluster_sizes[cl], "): ",
paste(x$cluster_members[[cl]], collapse = ", "), "\n", sep = "")
}
cat("\nMacro (cluster-level) weights:\n")
print(round(x$macro$weights, 4))
invisible(x)
}
#' Plot Method for mcml
#'
#' Plots an MCML network. When \pkg{cograph} is available, delegates to
#' \code{cograph::plot_mcml()} which renders a two-layer visualization
#' (macro summary on top, within-cluster detail on bottom). Otherwise,
#' converts to a \code{netobject_group} and plots each layer as a
#' separate panel.
#'
#' @param x An \code{mcml} object.
#' @param ... Additional arguments passed to \code{cograph::plot_mcml()}
#' (e.g., \code{colors}, \code{edge_labels}, \code{mode}).
#'
#' @return The input object, invisibly.
#'
#' @examples
#' \dontrun{
#' seqs <- data.frame(
#' T1 = sample(LETTERS[1:6], 30, TRUE),
#' T2 = sample(LETTERS[1:6], 30, TRUE),
#' T3 = sample(LETTERS[1:6], 30, TRUE)
#' )
#' clusters <- list(G1 = c("A", "B", "C"), G2 = c("D", "E", "F"))
#' cs <- build_mcml(seqs, clusters)
#' plot(cs)
#' }
#'
#' @export
plot.mcml <- function(x, ...) {
if (!requireNamespace("cograph", quietly = TRUE)) {
stop("Package 'cograph' is required for plotting. ",
"Install with: install.packages('cograph')", call. = FALSE)
}
cograph::plot_mcml(x, ...)
invisible(x)
}
#' Summary Method for mcml
#'
#' @param object An \code{mcml} object.
#' @param ... Additional arguments (ignored).
#'
#' @return The input object, invisibly.
#'
#' @examples
#' seqs <- data.frame(V1 = c("A","B","C","A"), V2 = c("B","C","A","B"))
#' clusters <- list(G1 = c("A","B"), G2 = c("C"))
#' cs <- build_mcml(seqs, clusters)
#' summary(cs)
#' \donttest{
#' seqs <- data.frame(
#' T1 = c("A","B","A"), T2 = c("B","C","B"),
#' T3 = c("C","A","C"), T4 = c("A","B","A")
#' )
#' clusters <- c("Alpha", "Beta", "Alpha")
#' cs <- build_mcml(seqs, clusters, type = "raw")
#' summary(cs)
#' }
#'
#' @export
summary.mcml <- function(object, ...) {
print(object, ...)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.