R/build.R

Defines functions nodes scale_weights compute_weights compute_transitions initialize_model create_seqdata build_model_ atna ctna ftna tna build_model.tna_data build_model.data.frame build_model.stslist build_model.matrix build_model.default build_model

Documented in atna build_model build_model.data.frame build_model.default build_model.matrix build_model.stslist build_model.tna_data ctna ftna tna

#' Build a Transition Network Analysis Model
#'
#' Construct a transition network analysis (TNA) model from sequence data.
#' The function takes a data set of sequence of events or states as input and
#' builds a TNA model. It extracts the edge weights and initial probabilities
#' from the data along with the state labels. THe function also accepts weight
#' matrices and initial state probabilities directly.
#'
#' @export
#' @family basic
#' @rdname build_model
#' @param x A `stslist` (from `TraMineR`), `data.frame`, a `matrix`, or
#'   a `tna_data` object (see [prepare_data()]).
#'   For `stslist` and `data.frame` objects `x`
#'   should describe a sequence of events or states to be used for building the
#'   Markov model.  If `x` is a matrix, it is assumed that the element on row
#'   `i` and column `j` is the weight of the edge representing the transition
#'   from state `i` to state `j`. If `x` is a `data.frame`, then
#'   it must be in wide format (see `cols` on how to define columns for the
#'   time points).
#' @param type A `character` string describing the weight matrix type.
#'   Currently supports the following types:
#'
#'   * `"relative"` for relative frequencies (probabilities, the default)
#'   * `"frequency"` for frequencies.
#'   * `"co-occurrence"` for co-occurrences.
#'   * `"n-gram"` for n-gram transitions. Captures higher-order transitions by
#'       considering sequences of n states, useful for identifying longer
#'       patterns.
#'   * `"gap"` allows transitions between non-adjacent states, with
#'       transitions weighted by the gap size.
#'   * `"window"` creates transitions between all states within a
#'       sliding window, capturing local relationships
#'       (several sequences together).
#'   * `"reverse"` considers the sequences in reverse order
#'       (resulting in what is called a reply network in some contexts).
#'       The resulting weight matrix is the transpose of the `"frequency"`
#'       option.
#'   * `"attention"` aggregates all downstream pairs of states with an
#'       exponential decay for the gap between states. The parameter `lambda`
#'       can be used to control the decay rate (the default is 1)-
#'
#' @param scaling A `character` vector describing how to scale the weights
#'   defined by `type`. When a vector is provided, the scaling options are
#'   applied in the respective order. For example, `c("rank", "minmax")` would
#'   first compute the ranks, then scale them to the unit interval using
#'   min-max normalization. An empty vector corresponds to no scaling.
#'   Currently supports the following options:
#'
#'   * `"minmax"` performs min-max normalization to scale the weights to the
#'       unit interval. Note that if the smallest weight is positive, it
#'       will be zero after scaling.
#'   * `"max"` Multiplies the weights by the reciprocal of the largest weight
#'       to scale the weights to the unit interval. This options preserves
#'       positive ranks, unlike `"minmax"` when all weights are positive.
#'   * `"rank"` Computes the ranks of the weights using [base::rank()] with
#'       `ties.method = "average"`.
#'
#' @param inits An optional `numeric` vector of initial state probabilities
#'   for each state. Can be provided only if `x` is a `matrix`. The vector will
#'   be scaled to unity.
#' @param params A `list` of additional arguments for models of specific
#'   `type`. The potential elements of this list are:
#'
#'   * `n_gram`: An `integer` for n-gram transitions specifying the number of
#'       adjacent events. The default value is 2.
#'   * `max_gap`: An `integer` for the gap-allowed transitions specifying the
#'       largest allowed gap size. The default is 1.
#'   * `window_size`: An `integer` for the sliding window transitions
#'       specifying the window size. The default is 2.
#'   * `weighted`: A `logical` value. If `TRUE`, the transitions
#'      are weighted by the inverse of the sequence length. Can be used for
#'      frequency, co-occurrence and reverse model types. The default is
#'      `FALSE`.
#'   * `lambda`: A `numeric` value for the decay rate. The default is 1.
#'
#' @param ... Ignored. For the `build_model` aliases (e.g., `tna`), this
#' argument matches the actual arguments to `build_model` beside `x`.
#' @return An object of class `tna` which is a `list` containing the
#'   following elements:
#'
#'   * `weights`: An adjacency `matrix` of the model (weight matrix).
#'   * `inits`: A `numeric` vector of initial values for each state.
#'     For `matrix` type `x`, this element will be `NULL` if `inits` is not
#'     directly provided
#'   * `labels`: A `character` vector of the state labels, or `NULL` if
#'     there are no labels.
#'   * `data`: The original sequence data that has been converted to an
#'     internal format used by the package when `x` is a `stslist` or a
#'     `data.frame` object. Otherwise `NULL`.
#'
#' @examples
#' model <- build_model(group_regulation)
#' print(model)
#'
build_model <- function(x, type = "relative", scaling = character(0L),
                        ...) {
  UseMethod("build_model")
}

#' @export
#' @rdname build_model
build_model.default <- function(x, type = "relative", scaling = character(0L),
                                inits, params = list(), ...) {
  check_missing(x)
  x <- try_(as.matrix(x))
  stopifnot_(
    !inherits(x, "try-error"),
    "Argument {.arg x} must be coercible to a {.cls matrix}."
  )
  build_model.matrix(x, type, scaling, inits, params, ...)
}

#' @export
#' @rdname build_model
build_model.matrix <- function(x, type = "relative", scaling = character(0L),
                               inits, ...) {
  check_missing(x)
  x <- try_(data.matrix(x))
  stopifnot_(
    !inherits(x, "try-error"),
    "Argument {.arg x} must be coercible to {.cls numeric}."
  )
  check_na(x)
  nc <- ncol(x)
  stopifnot_(
    nc == nrow(x),
    "Argument {.arg x} must be a square {.cls matrix}."
  )
  stopifnot_(
    nc >= 2L,
    "Argument {.arg x} must have at least two columns."
  )
  if (!missing(inits)) {
    stopifnot_(
      length(inits) >= nc,
      "Argument {.arg inits} must provide initial probabilities for all states."
    )
    inits <- try_(as.numeric(inits))
    stopifnot_(
      !inherits(inits, "try-error"),
      "Argument {.arg inits} must be coercible to {.cls numeric}."
    )
    stopifnot_(
      all(inits >= 0),
      "All elements of {.arg inits} must be non-negative."
    )
    if (length(inits) > ncol(x)) {
      warning_(
        c(
          "Argument {.arg inits} contains more values
           than the number of states.",
          `i` = "Only the first {nc} values will be used."
        )
      )
      inits <- inits[seq_len(nc)]
      inits <- inits / sum(inits)
    }
    names(inits) <- colnames(x)
  }
  type <- check_model_type(type)
  scaling <- check_model_scaling(scaling)
  x <- check_weights(x, type = type)
  x <- scale_weights(x, type = type, scaling = scaling, a = nc)
  if (is.null(colnames(x))) {
    dimnames(x) <- list(seq_len(nc), seq_len(nc))
  }
  build_model_(
    weights = x,
    inits = inits,
    labels = colnames(x),
    type = type,
    scaling = scaling
  )
}

#' @export
#' @rdname build_model
#' @param cols An `integer`/`character` vector giving the indices/names of the
#' columns that should be considered as sequence data.
#' Defaults to all columns, i.e., `seq(1, ncol(x))`.
build_model.stslist <- function(x, type = "relative", scaling = character(0L),
                                cols = seq(1, ncol(x)), params = list(), ...) {
  check_missing(x)
  check_class(x, "stslist")
  check_range(cols, type = "integer", scalar = FALSE, min = 1L, max = ncol(x))
  type <- check_model_type(type)
  scaling <- check_model_scaling(scaling)
  x <- create_seqdata(x, cols)
  model <- initialize_model(x, type, scaling, params)
  build_model_(
    weights = model$weights,
    inits = model$inits,
    labels = attr(x, "labels"),
    type = type,
    scaling = scaling,
    data = x,
    params = params
  )
}

#' @export
#' @rdname build_model
build_model.data.frame <- function(x, type = "relative",
                                   scaling = character(0L),
                                   cols = seq(1, ncol(x)),
                                   params = list(), ...) {
  check_missing(x)
  check_class(x, "data.frame")
  check_range(cols, type = "integer", scalar = FALSE, min = 1L, max = ncol(x))
  type <- check_model_type(type)
  scaling <- check_model_scaling(scaling)
  x <- create_seqdata(x, cols = cols)
  model <- initialize_model(x, type, scaling, params)
  build_model_(
    weights = model$weights,
    inits = model$inits,
    labels = model$labels,
    type = type,
    scaling = scaling,
    data = x,
    params = params
  )
}

#' @export
#' @rdname build_model
build_model.tna_data <- function(x, type = "relative", scaling = character(0),
                                 params = list(), ...) {
  check_missing(x)
  check_class(x, "tna_data")
  type <- check_model_type(type)
  scaling <- check_model_scaling(scaling)
  wide <- cbind(x$sequence_data, x$meta_data)
  x <- create_seqdata(wide, cols = seq_len(ncol(x$sequence_data)))
  model <- initialize_model(x, type, scaling, params)
  build_model_(
    weights = model$weights,
    inits = model$inits,
    labels = model$labels,
    type = type,
    scaling = scaling,
    data = x,
    params = params
  )
}

# Aliases -----------------------------------------------------------------

#' @export
#' @rdname build_model
#' @examples
#' model <- tna(group_regulation)
#'
tna <- function(x, ...) {
  check_missing(x)
  build_model(x = x, type = "relative", ...)
}

#' @export
#' @rdname build_model
#' @examples
#' model <- ftna(group_regulation)
#'
ftna <- function(x, ...) {
  build_model(x = x, type = "frequency", ...)
}

#' @export
#' @rdname build_model
#' @examples
#' model <- ctna(group_regulation)
#'
ctna <- function(x, ...) {
  build_model(x = x, type = "co-occurrence", ...)
}

#' @export
#' @rdname build_model
#' @examples
#' model <- atna(group_regulation)
#'
atna <- function(x, ...) {
  build_model(x = x, type = "attention", ...)
}


# Internal ----------------------------------------------------------------

#' Build a Transition Network Analysis object
#'
#' @param weights A `matrix` of edge weights.
#' @param inits A `numeric` vector of initial state probabilities.
#' @param labels A `character` vector of state labels.
#' @param type A `character` string defining the network type.
#' @param scaling A `character` string defining the scaling of the weights
#' @param data A `tna_seqdata` object when `weights` is
#'   created from sequence data.
#' @param params A `list` of parameters for computing the transitions
#' @return A `tna` object.
#' @noRd
build_model_ <- function(weights, inits = NULL, labels = NULL,
                         type = NULL, scaling = character(0L), data = NULL,
                         params = NULL) {
  structure(
    list(
      weights = weights,
      inits = onlyif(!missing(inits), inits),
      labels = labels,
      data = data
    ),
    type = type,
    scaling = scaling,
    params = params,
    class = "tna"
  )
}

#' Convert Sequence Data to an Internal Format
#'
#' @param x A `data.frame` or a `stslist` object.
#' @param cols An `integer` vector of column indices or a `character` vector
#' of column names.
#' @param alphabet Optional `character` vector of the alphabet.
#' @noRd
create_seqdata <- function(x, cols, alphabet) {
  cols <- ifelse_(
    is.character(cols),
    which(names(x) %in% cols),
    cols
  )
  if (inherits(x, "stslist")) {
    alphabet <- attr(x, "alphabet")
    labels <- attr(x, "labels")
    colors <- ifelse_(
      is.null(attr(x, "cpal")),
      color_palette(length(labels)),
      attr(x, "cpal")
    )
    x <- as.data.frame(x)
  } else if (is.data.frame(x)) {
    if (missing(alphabet)) {
      vals <- sort(unique(unlist(x[, cols])))
      alphabet <- labels <- vals[!is.na(vals)]
    } else {
      labels <- alphabet
    }
    colors <- color_palette(length(labels))
    x[, cols] <- as.data.frame(
      lapply(x[, cols], function(y) factor(y, levels = alphabet))
    )
  }
  x[, cols] <- as.data.frame(
    lapply(
      x[, cols],
      function(y) {
        as.integer(replace(y, which(!y %in% alphabet), NA))
      }
    )
  )
  structure(
    as.matrix(x[, cols]),
    class = c("matrix", "array"),
    alphabet = alphabet,
    labels = labels,
    colors = colors#,
    #cols = cols
  )
}

#' Compute Edge Weights from Sequence Data
#'
#' @param x A data object from `create_seqdata()`
#' @param type The type of transition network model to build.
#' @param scaling The scaling methods to apply to the weights.
#' @param params A list of parameters for the transition model.
#' @param transitions Should the individual-level transitions also be returned?
#' Defaults to `FALSE`.
#' @noRd
initialize_model <- function(x, type, scaling, params, transitions = FALSE) {
  alphabet <- attr(x, "alphabet")
  labels <- attr(x, "labels")
  #cols <- attr(x, "cols")
  #m <- as.matrix(x[, cols])
  a <- length(alphabet)
  inits <- factor(x[, 1L], levels = seq_len(a), labels = alphabet)
  inits <- as.vector(table(inits))
  trans <- compute_transitions(x, a, type, params)
  weights <- compute_weights(trans, type, scaling, a)
  inits <- inits / sum(inits)
  names(inits) <- alphabet
  dimnames(weights) <- list(alphabet, alphabet)
  list(
    weights = weights,
    inits = inits,
    labels = labels,
    trans = onlyif(transitions, trans)
  )
}

#' Compute Network Transitions Based on TNA Type
#'
#' @param m A `matrix` of sequences
#' @param a An `integer`, the number of states.
#' @param type Type of the transition network as a `character` string.
#' @param params Parameters for the transition model.
#' @noRd
compute_transitions <- function(m, a, type, params) {
  n <- nrow(m)
  p <- ncol(m)
  idx <- seq_len(n)
  trans <- array(0L, dim = c(n, a, a))
  seq_lengths <- apply(m, 1L, function(x) sum(!is.na(x)))
  weight <- ifelse_(isTRUE(params$weighted), 1.0 / seq_lengths, rep(1L, n))
  if (type %in% c("relative", "frequency")) {
    weight <- ifelse_(type == "frequency", weight, rep(1L, n))
    for (i in seq_len(p - 1L)) {
      from <- m[, i]
      to <- m[, i + 1L]
      any_na <- is.na(from) | is.na(to)
      new_trans <- cbind(idx, from, to)[!any_na, , drop = FALSE]
      trans[new_trans] <- trans[new_trans] + weight[!any_na]
    }
  } else if (type == "reverse") {
    for (i in seq_len(p - 1L)) {
      from <- m[, i + 1L]
      to <- m[, i]
      any_na <- is.na(from) | is.na(to)
      new_trans <- cbind(idx, from, to)[!any_na, , drop = FALSE]
      trans[new_trans] <- trans[new_trans] + weight[!any_na]
    }
  } else if (type == "co-occurrence") {
    for (i in seq_len(p - 1L)) {
      for (j in seq(i + 1L, p)) {
        from <- m[, i]
        to <- m[, j]
        any_na <- is.na(from) | is.na(to)
        new_trans <- rbind(
          cbind(idx, from, to)[!any_na, , drop = FALSE],
          cbind(idx, to, from)[!any_na, , drop = FALSE]
        )
        trans[new_trans] <- trans[new_trans] + weight[!any_na]
      }
    }
  } else if (type == "n-gram") {
    n_gram <- params$n_gram %||% 2L
    for (i in seq_len(p - n_gram + 1L)) {
      for (j in seq(i, i + n_gram - 2L)) {
        from <- m[, j]
        to <- m[, j + 1L]
        any_na <- is.na(from) | is.na(to)
        new_trans <- cbind(idx, from, to)[!any_na, , drop = FALSE]
        trans[new_trans] <- trans[new_trans] + 1L
      }
    }
  } else if (type == "gap") {
    max_gap <- params$max_gap %||% 1L
    for (i in seq_len(p - 1L)) {
      max_j <- min(i + max_gap + 1L, p)
      for (j in seq(i + 1L, max_j)) {
        from <- m[, i]
        to <- m[, j]
        any_na <- is.na(from) | is.na(to)
        new_trans <- cbind(idx, from, to)[!any_na, , drop = FALSE]
        trans[new_trans] <- trans[new_trans] + 1.0 / (j - i)
      }
    }
  } else if (type == "window") {
    window_size <- params$window_size %||% 2L
    for (i in seq_len(p - window_size + 1L)) {
      for (j in seq(i, i + window_size - 2L)) {
        from <- m[, j]
        for (k in seq(j + 1L, i + window_size - 1L)) {
          to <- m[, k]
          any_na <- is.na(from) | is.na(to)
          new_trans <- cbind(idx, from, to)[!any_na, , drop = FALSE]
          trans[new_trans] <- trans[new_trans] + 1L
        }
      }
    }
  } else if (type == "attention") {
    lambda <- params$lambda %||% 1.0
    for (i in seq_len(p - 1L)) {
      for (j in seq(i + 1L, p)) {
        from <- m[, i]
        to <- m[, j]
        any_na <- is.na(from) | is.na(to)
        new_trans <- cbind(idx, from, to)[!any_na, , drop = FALSE]
        trans[new_trans] <- trans[new_trans] + exp((i - j) / lambda)
      }
    }
  }
  trans
}

#' Compute Network Weights Based On TNA Type
#'
#' @param transitions An `array` of the individual-level transitions.
#' @param type Type of the transition network as a `character` string.
#' @param scaling Scaling methods to apply as a `character` vector.
#' @param a An `integer`, the number of states.
#' @return A `matrix` of transition probabilities or frequencies,
#' based on `type`.
#' @noRd
compute_weights <- function(transitions, type, scaling, a) {
  weights <- apply(transitions, c(2, 3), sum)
  scale_weights(weights, type, scaling, a)
}

#' Scale Transition Network Weights
#'
#' @param weights A `matrix` of edge weights
#' @param type Type of the transition network as a `character` string.
#' @param scaling Scaling methods to apply as a `character` vector.
#' @param a An `integer`, the number of states.
#' @noRd
scale_weights <- function(weights, type, scaling, a) {
  if (type == "relative") {
    rs <- .rowSums(weights, m = a, n = a)
    pos <- which(rs > 0)
    weights[pos, ] <- weights[pos, ] / rs[pos]
    weights[!pos, ] <- NA
  }
  for (i in seq_along(scaling)) {
    if (scaling[i] == "minmax") {
      weights[] <- ranger(weights)
    } else if (scaling[i] == "max") {
      weights[] <- weights / max(weights, na.rm = TRUE)
    } else if (scaling[i] == "rank") {
      weights[] <- rank(weights, ties.method = "average")
    }
  }
  weights
}

#' Get Network Node Count
#'
#' @param x A `tna` or a `group_tna` object, or a weight `matrix`.
#' @noRd
nodes <- function(x) {
  if (is_tna(x)) {
    dim(x$weights)[2L]
  } else if (inherits(x, "group_tna")) {
    dim(x[[1L]]$weights)[2L]
  } else {
    ncol(x)
  }
}

Try the tna package in your browser

Any scripts or data that you put into this service are public.

tna documentation built on June 8, 2025, 10:33 a.m.