R/estimation_structure.R

Defines functions fit_graph_to_Theta emst try_complete_Gamma eglearn

Documented in eglearn emst fit_graph_to_Theta

#' Learning extremal graph structure
#'
#' Following the methodology from \insertCite{eng2022a;textual}{graphicalExtremes},
#' fits an extremal graph structure using the neighborhood selection approach
#' (see \insertCite{meins2006;textual}{graphicalExtremes}) or graphical lasso
#' (see \insertCite{friedman2008;textual}{graphicalExtremes}).
#'
#' @param data Numeric \nxd matrix, where `n` is the
#' number of observations and `d` is the dimension.
#'
#' @param p Numeric between 0 and 1 or `NULL`. If `NULL` (default),
#' it is assumed that the `data` are already on multivariate Pareto scale. Else,
#' `p` is used as the probability in the function [data2mpareto()]
#' to standardize the `data`.
#'
#' @param rholist Numeric vector of non-negative regularization parameters
#' for the lasso.
#' Default is `rholist = c(0.1, 0.15, 0.19, 0.205)`.
#' For details see [glasso::glassopath()].
#'
#' @param reg_method One of `"ns", "glasso"`, for neighborhood selection and
#' graphical lasso, respectively.
#' Default is `reg_method = "ns"`.
#' For details see \insertCite{meins2006;textual}{graphicalExtremes},
#' \insertCite{friedman2008;textual}{graphicalExtremes}.
#'
#' @param complete_Gamma Whether you want to try fto complete Gamma matrix.
#' Default is `complete_Gamma = FALSE`.
#'
#' @return List made of:
#' \item{`graph`}{
#'   A list of [`igraph::graph`] objects representing the
#'   fitted graphs for each `rho` in `rholist`.
#' }
#' \item{`Gamma`}{
#'   A list of numeric estimated \dxd
#'   variogram matrices \eGamma corresponding to the fitted graphs,
#'   for each `rho` in `rholist`. If `complete_Gamma = FALSE` or the
#'   underlying graph is not connected, it returns `NULL`.
#' }
#' \item{`rholist`}{
#'   The list of penalty coefficients.
#' }
#' \item{`graph_ic`}{
#'   A list of [`igraph::graph`] objects
#'   representing the optimal graph
#'   according to the `aic`, `bic`, and `mbic` information criteria.
#'   If `reg_method = "glasso"`, it returns a list of `NULL`.
#' }
#' \item{`Gamma_ic`}{
#'   A list of numeric \dxd estimated
#'   variogram matrices \eGamma corresponding
#'   to the `aic`, `bic`, and `mbic` information criteria.
#'   If `reg_method = "glasso"`, `complete_Gamma = FALSE`, or the underlying
#'   graph is not connected, it returns a list of `NULL`.
#' }
#' 
#' @examples
#' set.seed(2)
#' m <- generate_random_model(d=6)
#' y <- rmpareto(n=500, par=m$Gamma)
#' ret <- eglearn(y)
#' 
#' @references \insertAllCited{}
#'
#' @family structureEstimation
#' @export
eglearn <- function(
  data,
  p = NULL,
  rholist = c(0.1, 0.15, 0.19, 0.205),
  reg_method = c("ns", "glasso"),
  complete_Gamma = FALSE
) {
  # Check arguments
  reg_method <- match.arg(reg_method)
  if (any(rholist < 0)) {
    stop("The regularization parameters in `rholist` must be non-negative.")
  }

  # Normalize data
  if (!is.null(p)) {
    data <- data2mpareto(data, p)
  }

  # Initialize variables
  Gamma <- emp_vario(data)
  sel_methods <- c("aic", "bic", "mbic")

  graphs <- list()
  Gammas <- list()
  rhos <- list()
  nRho <- length(rholist)
  d <- ncol(Gamma)

  null.vote <- array(0, dim = c(d, d, length(rholist)))
  null.vote.ic <- array(0, dim = c(d, d, length(sel_methods)))

  # Loop through variables
  for (k in 1:d) {
    if (reg_method == "glasso") {
      Sk <- Gamma2Sigma(Gamma = Gamma, k = k, check = FALSE)
      gl.fit <- lapply(1:length(rholist), FUN = function(i) {
        glassoFast::glassoFast(S = Sk, rho = rholist[i], thr = 1e-8, maxIt = 100000)$wi
      })
      gl.tmp <- array(unlist(gl.fit), dim = c(d - 1, d - 1, nRho))
      null.vote[-k, -k, ] <- null.vote[-k, -k, , drop = FALSE] +
        (abs(gl.tmp) <= 1e-5) ## make sure is consistent with default threshold value
    } else if (reg_method == "ns") {
      idx_k <- which(data[, k] > 1)
      X <- log(data[idx_k, -k] / data[idx_k, k])
      gl.tmp <- glasso_mb(data = X, lambda = rholist)
      null.vote[-k, -k, ] <- null.vote[-k, -k, , drop = FALSE] + (!gl.tmp$adj.est)
      null.vote.ic[-k, -k, ] <- null.vote.ic[-k, -k, , drop = FALSE] + (!gl.tmp$adj.ic.est)
    }
  }

  adj.est <- (null.vote / (ncol(null.vote) - 2)) < 0.5
  # only makes sense for MB at the moment
  adj.ic.est <- (null.vote.ic / (ncol(null.vote.ic) - 2)) < 0.5


  # complete Gamma for rholist
  for (j in 1:nRho) {
    rho <- rholist[j]
    est_graph <- igraph::graph_from_adjacency_matrix(
      adj.est[, ,j], mode = "undirected", diag = FALSE)

    if (complete_Gamma == FALSE) {
      Gamma_curr <- NULL
    } else {
      Gamma_curr <- try_complete_Gamma(est_graph, Gamma, "rho", round(rho, 3))
    }

    graphs[j] <- list(est_graph)
    Gammas[j] <- list(Gamma_curr)
    rhos[j] <- list(rho)
  }

  # complete Gamma for ns
  graphs_ic <-  list(aic = NULL, bic = NULL, mbic = NULL)
  Gammas_ic <-  list(aic = NULL, bic = NULL, mbic = NULL)

  if (reg_method == "ns") {
    for (l in seq_along(sel_methods)){
      est_graph <-  igraph::graph_from_adjacency_matrix(
        adj.ic.est[, ,l], mode = "undirected", diag = FALSE
      )

      if (complete_Gamma == FALSE) {
        Gamma_curr <- NULL
      } else {
        Gamma_curr <- try_complete_Gamma(
          est_graph,
          Gamma,
          key = "information criterion",
          val = sel_methods[l]
        )
      }

      graphs_ic[l] <- list(est_graph)
      Gammas_ic[l] <- list(Gamma_curr)
    }
  }

  return(list(
    graph = graphs,
    Gamma = Gammas,
    rholist = rhos,
    graph_ic = graphs_ic,
    Gamma_ic = Gammas_ic
  ))
}


try_complete_Gamma <- function(graph, Gamma, key, val){
  ## igraph numeric_matrix character double -> numeric_matrix | NA
  ## tries to call `complete_Gamma`, if it fails returns NULL
  if(!igraph::is.connected(graph)){
    message(paste0(
      'The estimated graph for ',
      key, ' = ', val,
      '} is not connected,  so it is not possible to complete `Gamma`.\n'
    ))
    return(NULL)
  }

  Gamma_comp <- complete_Gamma(graph = graph, Gamma = Gamma)
  graph_comp <- Gamma2graph(Gamma_comp)

  # Check if completed Gamma matches with given graph
  if (!graphs_equal(graph_comp, graph)) {
    message(paste0(
      'The completed Gamma for ',
      key, ' = ', val,
      ' does not match the estimated graph.\n'
    ))
  }

  # Return completed Gamma
  return(Gamma_comp)
}




#' Fitting extremal minimum spanning tree
#'
#' Fits an extremal minimum spanning tree, where the edge weights are:
#' - negative maximized log-likelihoods of the bivariate Huesler-Reiss distributions, 
#'   if `method = "ML"`. See \insertCite{eng2019;textual}{graphicalExtremes} for details.
#' - empirical extremal variogram, if `method = "vario"`. See \insertCite{eng2020;textual}{graphicalExtremes} for details.
#' - empirical extremal correlation, if `method = "chi"`. See \insertCite{eng2020;textual}{graphicalExtremes} for details.
#'
#' @param data Numeric \nxd matrix, where `n` is the
#' number of observations and `d` is the dimension.
#' @param p Numeric between 0 and 1 or `NULL`. If `NULL` (default),
#' it is assumed that the `data` are already on multivariate Pareto scale. Else,
#' `p` is used as the probability in the function [data2mpareto()]
#' to standardize the `data`.
#' @param method One of `"vario", "ML", "chi"`.
#' Default is `method = "vario"`.
#' @param cens Logical. This argument is considered only if `method = "ML"`.
#' If `TRUE`, then censored likelihood contributions are used for
#' components below the threshold. By default, `cens = FALSE`.
#'
#' @return List consisting of:
#' \item{`graph`}{An [`igraph::graph`] object. The fitted minimum spanning tree.}
#' \item{`Gamma`}{
#'   Numeric \dxd estimated variogram matrix \eGamma
#'   corresponding to the fitted minimum spanning tree.
#' }
#'
#' @examples
#' ## Fitting a 4-dimensional HR minimum spanning tree
#' my_graph <- igraph::graph_from_adjacency_matrix(
#'   rbind(
#'     c(0, 1, 0, 0),
#'     c(1, 0, 1, 1),
#'     c(0, 1, 0, 0),
#'     c(0, 1, 0, 0)
#'   ),
#'   mode = "undirected"
#' )
#' n <- 100
#' Gamma_vec <- c(.5, 1.4, .8)
#' complete_Gamma(Gamma = Gamma_vec, graph = my_graph) ## full Gamma matrix
#'
#' set.seed(123)
#' my_data <- rmpareto_tree(n, "HR", tree = my_graph, par = Gamma_vec)
#' my_fit <- emst(my_data, p = NULL, method = "ML", cens = FALSE)
#' @references
#'  \insertAllCited{}
#'
#' @family structureEstimation
#' @export
emst <- function(
    data,
    p = NULL,
    method = c("vario", "ML", "chi"),
    cens = FALSE
){
  # Validate arguments
  method <- match.arg(method)

  # Check if you need to rescale data or not
  if (!is.null(p)) {
    data <- data2mpareto(data, p)
  }

  # Estimate weight matrix
  if (method == "ML") {
    res <- ml_weight_matrix(data, cens = cens)
    weight_matrix <- res$llh_hr
    estimated_gamma <- res$est_gamma
  } else if (method == "vario") {
    weight_matrix <- emp_vario(data)
    estimated_gamma <- weight_matrix
  } else if (method == "chi") {
    weight_matrix <- - emp_chi(data)
    estimated_gamma <- chi2Gamma(- weight_matrix)
  }

  # Estimate tree
  graph.full <- igraph::make_full_graph(ncol(data))
  mst.tree <- igraph::mst(
    graph = graph.full,
    weights = weight_matrix[igraph::ends(graph.full, igraph::E(graph.full))],
    algorithm = "prim"
  )

  # Complete Gamma
  Gamma_comp <- complete_Gamma(estimated_gamma, mst.tree)

  # Return tree and completed Gamma
  return(list(
    graph = mst.tree,
    Gamma = Gamma_comp
  ))
}


#' Experimental: Fit graph using empirical Theta matrix
#'
#' Fits a graph to an empirical Gamma matrix by computing the corresponding
#' Theta matrix using [Gamma2Theta()] and greedily chooses `m` edges that
#' correspond to high absolute values in Theta.
#'
#' @param data The (standardized) data from which to compute Gamma
#' @param m The number of edges to add, defaults to the number of edges in a tree
#' @param Gamma_emp The empirical Gamma matrix
#' (can be `NULL` if `data` is given)
#'
#' @return A list containing an \[`igraph::graph`\] object and a fitted `Gamma` matrix
#'
#' @family structureEstimation
#' @export
fit_graph_to_Theta <- function(data, m=NULL, Gamma_emp=NULL){

  if(is.null(Gamma_emp)){
    Gamma_emp <- emp_vario(data)
  }

  Theta_emp <- Gamma2Theta(Gamma_emp, check = FALSE)

  d <- nrow(Gamma_emp)

  if(is.null(m)){
    m <- d-1
  }
  if(m < d-1){
    stop('m must be at least d-1!')
  }

  g_c <- igraph::make_full_graph(d)
  weights <- Theta_emp[igraph::as_edgelist(g_c)]
  g <- igraph::mst(g_c, weights)

  missing_edges <- igraph::as_edgelist(igraph::complementer(g))
  missing_weights <- Theta_emp[missing_edges]

  ord <- order(abs(missing_weights), decreasing = TRUE)

  new_edges <- missing_edges[ord[seq_len(m - (d-1))],]

  g <- igraph::add_edges(g, t(new_edges))

  Gamma_g <- complete_Gamma(Gamma_emp, g, N=100000, tol=1e-6, check_tol=100)

  return(list(
    graph = g,
    Gamma = Gamma_g
  ))
}

Try the graphicalExtremes package in your browser

Any scripts or data that you put into this service are public.

graphicalExtremes documentation built on Nov. 14, 2023, 1:07 a.m.