R/vlmc.R
In mixvlmc: Variable Length Markov Chains with Covariates

Documented in cutoff.vlmc is_vlmc prune prune.vlmc vlmc

#' Test if the object is a vlmc model
#'
#' This function returns `TRUE` for VLMC models and `FALSE` for other objects.
#'
#' @param x an R object.
#' @returns `TRUE` for VLMC models.
#' @export
#' @examples
#' pc <- powerconsumption[powerconsumption$week == 5, ]
#' dts <- cut(pc$active_power, breaks = c(0, quantile(pc$active_power, probs = c(0.25, 0.5, 0.75, 1))))
#' model <- vlmc(dts)
#' # should be true
#' is_ctx_tree(model)
#' # should be true
#' is_vlmc(model)
#' # should be false
#' is_covlmc(model)
is_vlmc <- function(x) {
  inherits(x, "vlmc")
}

assertthat::on_failure(is_vlmc) <- function(call, env) {
  paste0(deparse(call$x), " is not a vlmc")
}

kl_div <- function(p, q) {
  pratio <- p / q
  pratio <- ifelse(p == q, 1, pratio) ## handles p=q=0
  sum(p * ifelse(pratio > 0, log(pratio), 0))
}

#' Cut off values for pruning the context tree of a VLMC
#'
#' This function returns a collection of cut off values that are guaranteed to
#' induce all valid pruned trees of the context tree of a VLMC. Pruning is
#' implemented by the [prune()] function.
#'
#' By default, the function returns values that can be used directly to induce
#' pruning in the context tree. This is done by computing the log likelihood
#' ratios used by the context algorithm on the reference VLMC and by keeping the
#' relevant ones. From them the function selects intermediate values that are
#' guaranteed to generate via pruning all the VLMC models that could be
#' generated by using larger values of the `cutoff` parameter that was used to
#' build the reference model (or smaller values of the `alpha` parameter in
#' "quantile" scale).
#'
#' Setting the `raw` parameter to `TRUE` removes this operation on the values
#' and asks the function to return the relevant log likelihood ratios.
#'
#' For large VLMC, some log likelihood ratios can be almost identical, with a
#' difference of the order of the machine epsilon value. The `tolerance`
#' parameter is used to keep only values that are different enough. This is done
#' in the native scale, before transformations implemented when `raw` is
#' `FALSE`.
#'
#' As automated model selection is provided by [tune_vlmc()], the direct use of
#' `cutoff` should be reserved to advanced exploration of the set of trees that
#' can be obtained from a complex one, e.g. to implement model selection
#' techniques that are not provided by [tune_vlmc()].
#'
#' @param model a fitted VLMC model.
#' @param scale specify whether the results should be "native" log likelihood
#'   ratio values or expressed in a "quantile" scale of a chi-squared
#'   distribution (defaults to "quantile").
#' @param raw specify whether the returned values should be limit values
#'   computed in the model or modified values that guarantee pruning (see
#'   details)
#' @param tolerance specify the minimum separation between two consecutive
#'   values of the cut off in native mode (before any transformation). See
#'   details.
#' @param ... additional arguments for the cutoff function.
#' @returns a vector of cut off values.
#' @examples
#' pc <- powerconsumption[powerconsumption$week == 5, ]
#' dts <- cut(pc$active_power, breaks = c(0, quantile(pc$active_power, probs = c(0.25, 0.5, 0.75, 1))))
#' model <- vlmc(dts)
#' draw(model)
#' model_cuts <- cutoff(model)
#' model_2 <- prune(model, model_cuts[2])
#' draw(model_2)
#' @seealso [prune()] and [tune_vlmc()]
#' @export
cutoff.vlmc <- function(model, scale = c("quantile", "native"), raw = FALSE,
                        tolerance = .Machine$double.eps^0.5, ...) {
  scale <- match.arg(scale)
  recurse_kl_cutoff <- function(model, p_probs) {
    c_probs <- model$f_by / sum(model$f_by)
    local_kl <- kl_div(c_probs, p_probs) * sum(model$f_by)
    if (!is.null(model$children)) {
      child_kl <- c()
      for (v in seq_along(model$children)) {
        if (length(model$children[[v]]) > 0) {
          child_kl <- c(child_kl, recurse_kl_cutoff(model$children[[v]], c_probs))
        }
      }
      if (local_kl > max(child_kl)) {
        c(local_kl, child_kl)
      } else {
        child_kl
      }
    } else {
      local_kl
    }
  }
  pre_result <- relaxed_unique(
    sort(recurse_kl_cutoff(model, model$f_by / sum(model$f_by))),
    tolerance
  )
  guaranteed_pruning(pre_result, length(model$vals), scale, raw)
}

prune_ctx_tree <- function(tree, alpha = 0.05, cutoff = NULL, verbose = FALSE) {
  recurse_prune_kl_ctx_tree <- function(tree, p_probs, ctx, K) {
    c_probs <- tree$f_by / sum(tree$f_by)
    if (!is.null(tree$children)) {
      subtrees <- vector(mode = "list", length(tree$children))
      nb_pruned <- 0L
      for (v in seq_along(tree$children)) {
        subtrees[[v]] <- recurse_prune_kl_ctx_tree(tree$children[[v]], c_probs, c(ctx, v - 1), K)
        if (!is.null(subtrees[[v]][["kl"]])) {
          if (subtrees[[v]]$kl < K) {
            ## let's prune it
            if (verbose) { # nocov start
              cat(paste("pruning", paste(c(ctx, v - 1), sep = "", collapse = ""), subtrees[[v]]$kl), "\n")
            } # nocov end
            subtrees[[v]] <- list()
            nb_pruned <- nb_pruned + 1L
          } else {
            subtrees[[v]] <- subtrees[[v]]$tree
          }
        }
      }
      if (nb_pruned < length(tree$children)) {
        tree$children <- subtrees
        tree
      } else {
        tree$children <- NULL
        kl <- kl_div(c_probs, p_probs) * sum(tree$f_by)
        list(kl = kl, tree = tree)
      }
    } else {
      kl <- kl_div(c_probs, p_probs) * sum(tree$f_by)
      list(kl = kl, tree = tree)
    }
  }
  if (is.null(cutoff)) {
    K <- to_native(alpha, length(tree$vals))
  } else {
    K <- cutoff
  }
  pre_res <- recurse_prune_kl_ctx_tree(tree, tree$f_by / sum(tree$f_by), c(), K)
  if (!is.null(pre_res$kl)) {
    # empty result
    pre_res <- new_ctx_tree(tree$vals, list(f_by = tree$f_by), class = "vlmc")
  } else {
    ## compute stats
    pre_res <- new_ctx_tree(pre_res$vals, pre_res, class = "vlmc")
  }
  ## preserve construction information
  pre_res$max_depth <- tree$max_depth
  pre_res$keep_match <- tree$keep_match
  pre_res$ix <- tree$ix
  pre_res$data_size <- tree$data_size
  pre_res
}

#' Prune a Variable Length Markov Chain (VLMC)
#'
#' This function prunes a VLMC.
#'
#' In general, pruning a VLMC is more efficient than constructing two VLMC (the
#' base one and pruned one). Up to numerical instabilities, building a VLMC with
#' a `a` cut off and then pruning it with a `b` cut off (with `a>b`) should
#' produce the same VLMC than building directly the VLMC with a `b` cut off.
#' Interesting cut off values can be extracted from a VLMC using the [cutoff()]
#' function.
#'
#' As automated model selection is provided by [tune_vlmc()], the direct use of `cutoff`
#' should be reserved to advanced exploration of the set of trees that can be
#' obtained from a complex one, e.g. to implement model selection techniques that
#' are not provided by [tune_vlmc()].
#'
#' @param vlmc a fitted VLMC model.
#' @param alpha number in (0,1] (default: 0.05) cut off value in quantile scale
#'   for pruning.
#' @param cutoff positive number: cut off value in native (log likelihood ratio)
#'   scale for pruning. Defaults to the value obtained from `alpha`. Takes
#'   precedence over `alpha` if specified.
#' @param ... additional arguments for the prune function.
#'
#' @returns a pruned VLMC
#' @seealso [cutoff()] and [tune_vlmc()]
#' @export
#' @examples
#' pc <- powerconsumption[powerconsumption$week == 5, ]
#' dts <- cut(pc$active_power, breaks = c(0, quantile(pc$active_power, probs = c(0.25, 0.5, 0.75, 1))))
#' base_model <- vlmc(dts, alpha = 0.1)
#' model_cuts <- cutoff(base_model)
#' pruned_model <- prune(base_model, model_cuts[3])
#' draw(pruned_model)
#' direct_simple <- vlmc(dts, alpha = model_cuts[3])
#' draw(direct_simple)
#' # pruned_model and direct_simple should be identical
#' all.equal(pruned_model, direct_simple)
prune <- function(vlmc, alpha = 0.05, cutoff = NULL, ...) {
  UseMethod("prune")
}

#' @rdname prune
#' @export
prune.vlmc <- function(vlmc, alpha = 0.05, cutoff = NULL, ...) {
  if (is.null(cutoff)) {
    if (is.null(alpha) || !is.numeric(alpha) || alpha <= 0 || alpha > 1) {
      stop("the alpha parameter must be in (0, 1]")
    }
  }
  result <- prune_ctx_tree(vlmc, alpha = alpha, cutoff = cutoff)
  if (is.null(cutoff)) {
    cutoff <- to_native(alpha, length(vlmc$vals))
  } else {
    ## cutoff takes precedence
    alpha <- to_quantile(cutoff, length(vlmc$vals))
  }
  result$alpha <- alpha
  result$cutoff <- cutoff
  ## recompute the extended_ll
  if (depth(result) > 0) {
    result$ix <- result$ix[1:min(depth(result), length(result$ix))]
    ivlmc <- match_ctx(result, result$ix)
    result$extended_ll <- rec_loglikelihood_vlmc(ivlmc, TRUE)
  } else {
    result$extended_ll <- 0
    result$ix <- NULL
  }
  ## preserve match information
  result$keep_match <- vlmc$keep_match
  result$data_size <- vlmc$data_size
  if (result$keep_match) {
    ## handle the case where the root is context
    if (!is_full_node(result)) {
      result$match <- 0:(result$data_size - 1)
    }
  }
  ## preserve the construction information
  result$max_depth <- vlmc$max_depth
  result$pruned <- TRUE
  result
}

#' Fit a Variable Length Markov Chain (VLMC)
#'
#' This function fits a  Variable Length Markov Chain (VLMC) to a discrete time
#' series.
#'
#' The VLMC is built using Bühlmann and Wyner's algorithm which consists in
#' fitting a context tree (see [ctx_tree()]) to a time series and then pruning
#' it in such as way that the conditional distribution of the next state of the
#' time series given the context is significantly different from the
#' distribution given a truncated version of the context.
#'
#' The construction of the context tree is controlled by `min_size` and
#' `max_depth`, exactly as in [ctx_tree()]. Significativity is measured using a
#' likelihood ratio test (threshold can be specified in terms of the ratio
#' itself with `cutoff`) or in quantile scale with `alpha`.
#'
#' Pruning can be postponed by setting `prune=FALSE`. Using a combination of
#' [cutoff()] and [prune()], the complexity of the VLMC can then be adjusted.
#' Any VLMC model can be pruned after construction, `prune=FALSE` is a
#' convenience parameter to avoid setting `alpha=1` (which essentially prevents
#' any pruning). Automated model selection is provided by [tune_vlmc()].
#'
#' @param x a discrete time series; can be numeric, character, factor or
#'   logical.
#' @param alpha number in (0,1] (default: 0.05) cut off value in quantile scale
#'   in the pruning phase.
#' @param cutoff non negative number: cut off value in native (likelihood ratio)
#'   scale in the pruning phase. Defaults to the value obtained from `alpha`.
#'   Takes precedence over `alpha` is specified.
#' @param min_size integer >= 1 (default: 2). Minimum number of observations for
#'   a context in the growing phase of the context tree.
#' @param max_depth integer >= 1 (default: 100). Longest context considered in
#'   growing phase of the context tree.
#' @param prune logical: specify whether the context tree should be pruned
#'   (default behaviour).
#' @param keep_match logical: specify whether to keep the context matches
#'   (default to FALSE)
#' @param backend "R" or "C++" (default: as specified by the "mixvlmc.backend"
#'   option). Specifies the implementation used to represent the context tree
#'   and to built it. See details.
#' @inheritSection ctx_tree Back ends
#'
#' @returns a fitted vlmc model.
#' @examples
#' pc <- powerconsumption[powerconsumption$week == 5, ]
#' dts <- cut(pc$active_power,
#'   breaks = c(0, quantile(pc$active_power, probs = c(0.25, 0.5, 0.75, 1)))
#' )
#' model <- vlmc(dts)
#' draw(model)
#' depth(model)
#' ## reduce the detph of the model
#' shallow_model <- vlmc(dts, max_depth = 3)
#' draw(shallow_model, prob = FALSE)
#' ## improve probability estimates
#' robust_model <- vlmc(dts, min_size = 25)
#' draw(robust_model, prob = FALSE) ## show the frequencies
#' draw(robust_model)
#' @export
#' @seealso [cutoff()], [prune()] and [tune_vlmc()]
#' @references Bühlmann, P. and Wyner, A. J. (1999), "Variable length Markov
#'   chains. Ann. Statist." 27 (2) 480-513 \doi{10.1214/aos/1018031204}
vlmc <- function(x, alpha = 0.05, cutoff = NULL, min_size = 2L, max_depth = 100L,
                 prune = TRUE, keep_match = FALSE,
                 backend = getOption("mixvlmc.backend", "R")) {
  backend <- match.arg(backend, c("R", "C++"))
  # data conversion
  nx <- to_dts(x)
  ix <- nx$ix
  vals <- nx$vals
  if (is.null(cutoff)) {
    if (is.null(alpha) || !is.numeric(alpha) || alpha <= 0 || alpha > 1) {
      stop("the alpha parameter must be in (0, 1]")
    }
    cutoff <- to_native(alpha, length(vals))
  } else {
    ## cutoff takes precedence
    if (!is.numeric(cutoff) || cutoff < 0) {
      stop("the cutoff parameter must be a non negative number")
    }
    alpha <- to_quantile(cutoff, length(vals))
  }
  if (length(vals) > max(10, 0.05 * length(x))) {
    warning(paste0("x as numerous unique values (", length(vals), ")"))
  }
  if (backend == "R") {
    ctx_tree <- grow_ctx_tree(ix, vals, min_size = min_size, max_depth = max_depth, compute_stats = !prune, keep_match = keep_match)
    result <- ctx_tree
    if (prune) {
      result <- prune_ctx_tree(ctx_tree, alpha = alpha, cutoff = cutoff)
    } else {
      result <- new_ctx_tree(result$vals, result, class = "vlmc")
    }
  } else {
    cpp_tree <- build_suffix_tree(rev(ix)[-1], length(nx$vals))
    cpp_tree$compute_counts(ix[length(ix)], keep_match)
    if (prune) {
      cpp_tree$prune_context(min_size, max_depth, cutoff)
    } else {
      cpp_tree$prune(min_size, max_depth)
    }
    result <- new_ctx_tree_cpp(vals, cpp_tree, class = c("vlmc_cpp", "vlmc"))
    result$root$make_explicit()
    result$root$compute_reverse()
    ## with the C++ backend, max_depth is only used during pruning
    result$max_depth <- FALSE
    result$restoration <- cpp_tree$restoration_info()
  }
  ## prepare for loglikelihood calculation
  result$alpha <- alpha
  result$cutoff <- cutoff
  if (depth(result) > 0) {
    result$ix <- ix[1:min(depth(result), length(x))]
    if (backend == "R") {
      ivlmc <- match_ctx(result, result$ix)
      result$extended_ll <- rec_loglikelihood_vlmc(ivlmc, TRUE)
    } else {
      result$extended_ll <- result$root$loglikelihood(result$ix, 0, TRUE, FALSE)
    }
  } else {
    result$extended_ll <- 0
  }
  result$keep_match <- keep_match
  result$data_size <- length(x)
  if (backend == "R" && keep_match) {
    ## handle the case where the root is context
    if (!is_full_node(result)) {
      result$match <- 0:(length(x) - 1)
    }
  }
  result$pruned <- prune
  result
}

#' @export
print.vlmc <- function(x, ...) {
  cat(paste(
    "VLMC context tree on",
    paste(x$vals, collapse = ", ")
  ), "\n")
  cat(paste(" cutoff: ", signif(x$cutoff, 4), " (quantile: ", signif(x$alpha, 4), ")\n", sep = ""))
  if (!is.null(x$nb_ctx)) {
    cat(paste(" Number of contexts:", x$nb_ctx, "\n"))
  }
  if (!is.null(x$depth)) {
    cat(paste(" Maximum context length:", x$depth, "\n"))
  }
  invisible(x)
}