R/bipartite_matching_pseudo.R

Defines functions bipartite_matching_pseudolikelihood

Documented in bipartite_matching_pseudolikelihood

# Input: A, B, grid of lambda values
# Ouput: Inverse covariance, Alignment matrix
#' Bipartite to unipartite matching via penalized pseudolikelihood.
#'
#' This function performs graph matching between a bipartite and a unipartite graphs that are assumed to
#' share a common set of vertices. The method performs an alignment of the non-zero parameters of a
#' graphical model for the bipartite graph and the nonzero entries of the unipartite adjacency matrix
#' via pseudolikelihood maximization.
#'
#' @param A Adjacency matrix of the unipartite graph.
#' @param B Bipartite graph incidence matrix, where rows correspond to the vertices in common
#' with the unipartite graph. Entries of B need to be binary in {0, 1}.
#' @param Q_true A permutation matrix with the true solution, if known. When the true solution is unknown, the default is NULL.
#' @param lambdas Penalty values for the optimization algorithm.
#' @param MAX_ITER Maximum number of iterations.
#' @param verbose If TRUE, the method prints an output after each iteration.
#' @param seeds If some vertices have known correspondence, a vector containing the indexes of these
#' vertices can be passed through this parameter, and the corresponding rows of A and B are assumed to be aligned.
#' The algorithm will then match the remaining vertices. The default is NULL if no seeds are available.
#' @param gamma Additional penalty for all variables. A positive small constant is needed when the sample size is small.
#' @return
#' @export
bipartite_matching_pseudolikelihood <- function(A, B, Q_true = NULL,
                                                family = "binomial",
                                                lambdas = 10^(-2:0),
                                                MAX_ITER = 20, verbose = FALSE,
                                                seeds = NULL,
                                                gamma = 1/100) {
  # algorithm parameters
  TOL <- 1e-4
  l_factor <- 1

  n <- nrow(B)
  m <- ncol(B)

  # penalty values
  Q_list_lambda <- list()
  f_list_lambda <- list()


  # check if zero rows in B
  zerorowsB <- sum(rowSums(B!=0)==0) > 0
  if(zerorowsB) whichzerorowsB <- which(rowSums(B!=0)==0)
  # report baseline loglikelihood if true solution is known
  if(!is.null(Q_true)) {
    Q_1 <- as.matrix(Matrix::t(Q_true) %*% A %*% Q_true)
    diag(Q_1) <- 0
    ground_Truth <- pseudolikelihood(B, U = Q_1, family = family)
    if(is.null(ground_Truth))
      error("Error: too many non-zeros in A, not enough samples.")
    if(verbose ==TRUE) cat(paste(sprintf("---Baseline pseudolikelihood: %.4f\n", ground_Truth$pseudolik)))
  }

  i <- 1
  for(lambda in lambdas) {
    if(verbose ==TRUE) cat("-Running lambda=", lambda, "\n")

    # run algorithm ---------------------------------------
    # initialize
    iter <- 0
    crit <- Inf
    D <- matrix(1/n, n, n)
    f_0 <- -Inf
    Q_list <- list()
    f_list <- list()
    while(iter <= MAX_ITER & crit > TOL) {
      iter <- iter + 1

      # make penalty matrix
      P <- Matrix::t(D) %*% A %*% D
      diag(P) <- 1
      P <- 1- P
      P <- as.matrix((P + Matrix::t(P))/2) # make sure it is symmetric

      # run penalized pseudolikelihood optimization =================================
      PL <- penalized_pseudolikelihood(B, lambda = lambda, W = P, family = family,
                                      gamma = gamma)
      if(is.null(PL))
        error("Error: too many non-zeros in A, not enough samples. Increase gamma parameter.")
      # inverse covariance
      U <- (abs(PL$Theta))
      diag(U) <- 0

      sparse <- sum(U!=0)/(n^2-n)

      # run graph matching ========================================================
      GM = iGraphMatch::graph_match_FW(A = A, B = (U + Matrix::t(U))/2,
                                       start = "bari", seeds = seeds)
      # matching solution
      D <- as.matrix(GM$D)

      # objective function to calculate convergence and fit
      Q <- GM$P
      Q_1 <- as.matrix(Matrix::t(Q) %*% A %*% Q)
      diag(Q_1) <- 0
      Pseudo_unpen <- pseudolikelihood(B, family = family, U = Q_1)
      if(is.null(Pseudo_unpen))
        error("Error: too many non-zeros in A, not enough samples.")

      f_t <- Pseudo_unpen$pseudolik
      lambda <- lambda * l_factor
      crit <- abs(f_0 -f_t)
      f_0 <- f_t
      # save parameters
      Q_list[[iter]] <- Q
      f_list[[iter]] <- f_t
      if(verbose == TRUE)
        cat(paste(sprintf("--- %d\t pseudologlik_t= %.4f %%nonzeros= %.4f Match.%%error==%.4f\n",
              iter, f_t, sparse,
              ifelse(is.null(Q_true), "NA",
                     100*ifelse(is.null(seeds), c(sum(abs(Q_true - D))/(2*(n))),
                                                c(sum(abs(Q_true[-seeds, -seeds] - D[-seeds, -seeds]))/(2*(n-length(seeds)))))
                     )
              )))
    }
    # choose best solution
    index <- which.max(unlist(f_list))
    Q_list_lambda[[i]] <- Q_list[[index]]
    f_list_lambda[[i]] <- f_list[[index]]
    i <- i + 1
  }
  # overall best
  index <- which.max(unlist(f_list_lambda))
  Q_best <- Q_list_lambda[[index]]
  f_best <- f_list_lambda[[index]]
  Q_1 <- as.matrix(Q_best %*% A %*% Matrix::t(Q_best))
  diag(Q_1) <- 0
  Pseudo_unpen <- pseudolikelihood(B, family = family, U = Q_1)
  Theta_hat <- Pseudo_unpen$Theta

  return(list(Q = Q_best, Theta_hat=Theta_hat,f = f_best))
}
jesusdaniel/rBipartiteUnipartiteMatch documentation built on Dec. 20, 2021, 11:06 p.m.