R/build_lcm.R

Defines functions build_lcm

Documented in build_lcm

#' Build a Latent Class Model
#'
#' Function \code{build_lcm} is a shortcut for constructing a latent class model
#' as a restricted case of an \code{mhmm} object.
#'
#' @export
#' @param observations An \code{stslist} object (see \code{\link[TraMineR]{seqdef}}) containing
#'   the sequences, or a list of such objects (one for each channel).
#' @param n_clusters A scalar giving the number of clusters/submodels
#' (not used if starting values for model parameters are given with \code{emission_probs}).
#' @param emission_probs A matrix containing emission probabilities for each class by rows,
#'   or in case of multichannel data a list of such matrices.
#'   Note that the matrices must have dimensions k x s where k is the number of
#'   latent classes and s is the number of unique symbols (observed states) in the
#'   data. Emission probabilities should follow the ordering of the alphabet of
#'   observations (\code{alphabet(observations)}, returned as \code{symbol_names}).
#' @param formula Optional formula of class \code{\link{formula}} for the
#' mixture probabilities. Left side omitted.
#' @param data A data frame containing the variables used in the formula.
#' Ignored if no formula is provided.
#' @param coefficients An optional \eqn{k x l} matrix of regression coefficients for
#'   time-constant covariates for mixture probabilities, where \eqn{l} is the number
#'   of clusters and \eqn{k} is the number of covariates. A logit-link is used for
#'   mixture probabilities. The first column is set to zero.
#' @param cluster_names A vector of optional names for the classes/clusters.
#' @param channel_names A vector of optional names for the channels.
#' @return Object of class \code{mhmm} with the following elements:
#' \describe{
#'    \item{\code{observations}}{State sequence object or a list of such containing the data.}
#'    \item{\code{transition_probs}}{A matrix of transition probabilities.}
#'    \item{\code{emission_probs}}{A matrix or a list of matrices of emission probabilities.}
#'    \item{\code{initial_probs}}{A vector of initial probabilities.}
#'    \item{\code{coefficients}}{A matrix of parameter coefficients for covariates (covariates in rows, clusters in columns).}
#'    \item{\code{X}}{Covariate values for each subject.}
#'    \item{\code{cluster_names}}{Names for clusters.}
#'    \item{\code{state_names}}{Names for hidden states.}
#'    \item{\code{symbol_names}}{Names for observed states.}
#'    \item{\code{channel_names}}{Names for channels of sequence data}
#'    \item{\code{length_of_sequences}}{(Maximum) length of sequences.}
#'    \item{\code{n_sequences}}{Number of sequences.}
#'    \item{\code{n_symbols}}{Number of observed states (in each channel).}
#'    \item{\code{n_states}}{Number of hidden states.}
#'    \item{\code{n_channels}}{Number of channels.}
#'    \item{\code{n_covariates}}{Number of covariates.}
#'    \item{\code{n_clusters}}{Number of clusters.}
#' }
#' @seealso \code{\link{fit_model}} for estimating model parameters;
#' \code{\link{summary.mhmm}} for a summary of a mixture model;
#' \code{\link{separate_mhmm}} for organizing an \code{mhmm} object into a list of
#' separate \code{hmm} objects; and \code{\link{plot.mhmm}} for plotting
#' mixture models.
#'
#' @examples
#' # Simulate observations from two classes
#' set.seed(123)
#' obs <- seqdef(rbind(
#'   matrix(sample(letters[1:3], 500, TRUE, prob = c(0.1, 0.6, 0.3)), 50, 10),
#'   matrix(sample(letters[1:3], 200, TRUE, prob = c(0.4, 0.4, 0.2)), 20, 10)
#' ))
#'
#' # Initialize the model
#' set.seed(9087)
#' model <- build_lcm(obs, n_clusters = 2)
#'
#' # Estimate model parameters
#' fit <- fit_model(model)
#'
#' # How many of the observations were correctly classified:
#' sum(summary(fit$model)$most_probable_cluster == rep(c("Class 2", "Class 1"), times = c(500, 200)))
#'
#' ############################################################
#' \dontrun{
#' # LCM for longitudinal data
#'
#' # Define sequence data
#' data("mvad", package = "TraMineR")
#' mvad_alphabet <- c(
#'   "employment", "FE", "HE", "joblessness", "school",
#'   "training"
#' )
#' mvad_labels <- c(
#'   "employment", "further education", "higher education",
#'   "joblessness", "school", "training"
#' )
#' mvad_scodes <- c("EM", "FE", "HE", "JL", "SC", "TR")
#' mvad_seq <- seqdef(mvad, 17:86,
#'   alphabet = mvad_alphabet, states = mvad_scodes,
#'   labels = mvad_labels, xtstep = 6
#' )
#'
#' # Initialize the LCM with random starting values
#' set.seed(7654)
#' init_lcm_mvad1 <- build_lcm(
#'   observations = mvad_seq,
#'   n_clusters = 2, formula = ~male, data = mvad
#' )
#'
#' # Own starting values for emission probabilities
#' emiss <- rbind(rep(1 / 6, 6), rep(1 / 6, 6))
#'
#' # LCM with own starting values
#' init_lcm_mvad2 <- build_lcm(
#'   observations = mvad_seq,
#'   emission_probs = emiss, formula = ~male, data = mvad
#' )
#'
#' # Estimate model parameters (EM algorithm with random restarts)
#' lcm_mvad <- fit_model(init_lcm_mvad1,
#'   control_em = list(restart = list(times = 5))
#' )$model
#'
#' # Plot the LCM
#' plot(lcm_mvad, interactive = FALSE, ncol = 2)
#'
#' ###################################################################
#'
#' # Binomial regression (comparison to glm)
#'
#' require("MASS")
#' data("birthwt")
#'
#' model <- build_lcm(
#'   observations = seqdef(birthwt$low), emission_probs = diag(2),
#'   formula = ~ age + lwt + smoke + ht, data = birthwt
#' )
#' fit <- fit_model(model)
#' summary(fit$model)
#' summary(glm(low ~ age + lwt + smoke + ht, binomial, data = birthwt))
#'
#'
#' # Multinomial regression (comparison to multinom)
#'
#' require("nnet")
#'
#' set.seed(123)
#' n <- 100
#' X <- cbind(1, x1 = runif(n, 0, 1), x2 = runif(n, 0, 1))
#' coefs <- cbind(0, c(-2, 5, -2), c(0, -2, 2))
#' pr <- exp(X %*% coefs) + rnorm(n * 3)
#' pr <- pr / rowSums(pr)
#' y <- apply(pr, 1, which.max)
#' table(y)
#'
#' model <- build_lcm(
#'   observations = seqdef(y), emission_probs = diag(3),
#'   formula = ~ x1 + x2, data = data.frame(X[, -1])
#' )
#' fit <- fit_model(model)
#' summary(fit$model)
#' summary(multinom(y ~ x1 + x2, data = data.frame(X[, -1])))
#' }
build_lcm <- function(observations, n_clusters, emission_probs,
                      formula = NULL, data = NULL, coefficients = NULL,
                      cluster_names = NULL, channel_names = NULL) {
  if (missing(n_clusters) && missing(emission_probs)) {
    stop("Provide either 'n_clusters' or 'emission_probs'.")
  }
  multichannel <- is_multichannel(observations)
  # Single channel but observations is a list
  if (is.list(observations) && !inherits(observations, "stslist") && length(observations) == 1) {
    observations <- observations[[1]]
    multichannel <- FALSE
  }
  n_channels <- ifelse(multichannel, length(observations), 1L)

  if (missing(emission_probs)) {
    if (multichannel) {
      symbol_names <- lapply(observations, alphabet)
      n_symbols <- lengths(symbol_names)
    } else {
      symbol_names <- alphabet(observations)
      n_symbols <- length(symbol_names)
    }
    emission_probs <- simulate_emission_probs(1, n_symbols, n_clusters)
  } else {
    if (multichannel) {
      n_clusters <- nrow(emission_probs[[1]])
      emission_probs_list <- vector("list", n_clusters)
      for (i in 1:n_channels) {
        if (nrow(emission_probs[[i]]) != n_clusters) {
          stop("Different number of rows in 'emission_probs'.")
        }
      }
      for (i in 1:n_clusters) {
        emission_probs_list[[i]] <- vector("list", n_channels)
        for (j in 1:n_channels) {
          emission_probs_list[[i]][[j]] <- emission_probs[[j]][i, , drop = FALSE]
        }
      }
    } else {
      n_clusters <- nrow(emission_probs)
      emission_probs_list <- vector("list", n_clusters)
      for (i in 1:n_clusters) {
        emission_probs_list[[i]] <- emission_probs[i, , drop = FALSE]
      }
    }
    emission_probs <- emission_probs_list
  }
  n_states <- rep(1, n_clusters)
  initial_probs <- replicate(n_clusters, 1, simplify = FALSE)

  if (is.null(cluster_names)) {
    cluster_names <- paste("Class", 1:n_clusters)
  } else if (length(cluster_names) != n_clusters) {
    warning("The length of argument cluster_names does not match the number of clusters. Names were not used.")
    cluster_names <- paste("Class", 1:n_clusters)
  }

  transition_probs <- replicate(n_clusters, matrix(1), simplify = FALSE)
  names(transition_probs) <- names(emission_probs) <- names(initial_probs) <- cluster_names

  model <- build_mhmm(
    observations = observations, transition_probs = transition_probs,
    emission_probs = emission_probs, initial_probs = initial_probs,
    formula = formula, data = data, coefficients = coefficients,
    cluster_names = cluster_names, state_names = cluster_names,
    channel_names = channel_names)
  attr(model, "type") <- "lcm"
  model
}

Try the seqHMM package in your browser

Any scripts or data that you put into this service are public.

seqHMM documentation built on July 9, 2023, 6:35 p.m.