R/hcsvd.R
In bdsvd: Block Structure Detection Using Singular Vectors

Documented in hcsvd hcsvd.cor.sim

calc.distance <- function(R, idx.cluster, feature.names, p, linkage) {

  if (linkage == "average") {
    return(1 - mean(abs(R[feature.names %in% idx.cluster[[1]], feature.names %in% idx.cluster[[2]]])))
  }

  if (linkage == "single") {
    return(1 - max(abs(R[feature.names %in% idx.cluster[[1]], feature.names %in% idx.cluster[[2]]])))
  }

  if (linkage == "RV") {
    R1  <- R[feature.names %in% idx.cluster[[1]], feature.names %in% idx.cluster[[1]]]
    R2  <- R[feature.names %in% idx.cluster[[2]], feature.names %in% idx.cluster[[2]]]
    R12 <- R[feature.names %in% idx.cluster[[1]], feature.names %in% idx.cluster[[2]]]
    return(1 - RV.coef(R1, R2, R12))
  }
}




#' @importFrom utils head
calc.sparse.loadings <- function (X, SVD, q, n, maxit = 500, tol = 0.001) {
  n <- ncol(X) - n
  if (length(n) != q) {
    n <- rep(n, length.out = q)
  }

  soft <- function(X, u, p) {
    y <- crossprod(X, u)
    a <- abs(y)
    z <- apply(a, 2, sort)
    lambda <- vapply(seq(length(p)), function(j)  z[p[j], j], pi, USE.NAMES = FALSE)
    sign(y) * pmax(sweep(a, 2, lambda, `-`), 0)
  }
  SVD$v <- SVD$d * SVD$v
  iter <- 0
  delta.u <- Inf
  while (delta.u > tol && iter < maxit) {
    u <- SVD$u
    SVD$v <- soft(X, SVD$u, n)
    xsv <- X %*% SVD$v

    SVD$u <- qr.Q(qr(xsv))
    SVD$u <- sweep(SVD$u, 2, apply(xsv, 2, function(x) sign(head(x[x != 0], 1)))/
                     apply(SVD$u, 2, function(x) sign(head(x[x != 0], 1))), `*`)


    delta.u <- max(1 - diag(abs(crossprod(u, SVD$u))))
    iter <- iter + 1
  }
  if (iter >= maxit)
    warning("Maximum number of iterations reached before convergence: solution may not be optimal. Consider increasing 'maxit'.")
  SVD$v <- SVD$v %*% diag(1/sqrt(apply(SVD$v, 2, crossprod)), q,
                          q)
  list(v = SVD$v, u = SVD$u, iter = iter)
}




hcsvd.ht <- function(R,
                     q,
                     linkage,
                     labels,
                     max.iter,
                     trace = TRUE
) {

  p <- ncol(R)
  if (p == 2) {
    idx.cluster <- list(labels[1], labels[2])
    cluster.distance <- calc.distance(R, idx.cluster, labels, p, linkage)

    return(list(cluster1 = idx.cluster[1], cluster2 = idx.cluster[2], cluster.distance = cluster.distance, q.p = NA))
  }

  lambda <- eigen(R)$values
  if (is.numeric(q)) {
    if (q == 1){
      q <- p - 1
    } else {
      q <- min(round(q * p), p - 1)
    }
  }
  if (q == "Kaiser") {
    q <- min(sum(lambda >= 1) + 2, p - 1)
  }


  # Check if all elements are identical
  all.identical <- length(unique(R[upper.tri(R, diag = FALSE)])) == 1
  if(all.identical){
    print(paste0("all identical: rownames(R) = ", rownames(R)))
    idx.cluster <- list(labels[1], labels[-1])

    cluster.distance <- calc.distance(R, idx.cluster, labels, p, linkage)

    return(list(cluster1 = idx.cluster[[1]], cluster2 = idx.cluster[[2]], cluster.distance = cluster.distance, q.p = q / p))
  }


  dof.grid <- 1 : (p - 1)
  distance <- -Inf
  i <- 1

  for (dof in dof.grid) {
    V <- tryCatch(suppressWarnings(calc.sparse.loadings(X = R, SVD = irlba::irlba(R, q), q = q, n = p - dof, maxit = max.iter)$v), error = function(e) e)
    if (inherits(V, "error")) next

    for (i in seq_len(q)) {
      v <- V[, i]
      idx.cluster <- list(labels[v != 0], labels[v == 0])
      distance.ht <- calc.distance(R, idx.cluster, labels, p, linkage)

      if (distance.ht > distance) {
        distance <- distance.ht
        cluster1 <- idx.cluster[[1]]
        cluster2 <- idx.cluster[[2]]

      }
    }



  }

  cluster.distance <- distance

  return(list(cluster1 = cluster1, cluster2 = cluster2, cluster.distance = cluster.distance, q.p = q / p))
}




is.corr.matrix <- function(R) {

  if (!is.matrix(R)) {
    return(FALSE)
  }

  if(max(abs(R)) > 1) {
    return(FALSE)
  }

  if (nrow(R) != ncol(R)) {
    return(FALSE)
  }

  if (!isSymmetric(R)) {
    return(FALSE)
  }

  if (!all(diag(R) == 1)) {
    return(FALSE)
  }

  eigenvalues <- eigen(R, only.values = TRUE)$values
  if(any(eigenvalues < 0)) {
    return(FALSE)
  }

  return(TRUE)
}




RV.coef <- function(R1, R2, R12) {
  RV <- sum(diag(crossprod(R12, R12))) / sqrt(sum(diag(crossprod(R1))) * sum(diag(crossprod(R2))))
  return(RV)
}





#' @title Correlation Matrix Simulation for HC-SVD
#'
#' @description This function generates correlation matrices based on the simulation studies described in Bauer (202X).
#'
#' @param p Number of variables.
#'
#' @param b Number of blocks.
#'
#' @param design Simulation design "a" or "b".
#'
#' @return
#' A correlation matrix according to the chosen simulation design.
#'
#' @references \cite{Bauer, J.O. (202X). Divisive hierarchical clustering identified by singular vectors.}
#'
#' @examples
#' #The correlation matrix for simulation design (a) is given by
#' #R <- hcsvd.cov.sim(p = 40, b = 5, design = "a")
#'
#' @export
hcsvd.cor.sim <- function(p = p,
                          b = b,
                          design = design
) {

  DESIGN <- c("a", "b")
  if (!(design %in% DESIGN))
    stop(paste(design), " is an invalid design")

  if (design == "a") {
    d <- p / b / 4

    if (!((p / b) %% 4 == 0))
      stop("p/b must be divisible by 4 for simulation design a.")

    R <- matrix(0, p, p)
    for (block in 1:b) {
      eps <- runif(1, -0.05, 0.05)
      Rii <- matrix(0.25 + eps, 4 * d, 4 * d)
      for (j in 3:2) {
        eps <- runif(1, -0.1, 0.1)
        Rii[1 : (j * d), 1 : (j * d)] <- Rii[1 : (j * d), 1 : (j * d)] + (0.25 + eps) * rep(1, (j * d)) %*% t(rep(1, (j * d)))
      }
      Rii[which(kronecker(diag(4), matrix(1, d, d))  == 1)] <- 0.95
      R[((block - 1) * 4 * d + 1) : (block * 4 * d), ((block - 1) * 4 * d + 1) : (block * 4 * d)] <- Rii

    }
    diag(R) <- 1
    return(R)
  }

  if (design == "b") {
    d <- 3

    if (!((p / b) %% 3 == 0))
      stop("p/b must be divisible by 3 for simulation design b.")

    R <- matrix(0, d * b, d * b)
    Rii <- matrix(0, d, d)
    for (i in 1 : b) {
      for (j in 1 : d) {
        omega <- runif(1, 0.75, 0.85)
        Rii[, j] <- (-1)^(j) * omega^((j - 1)^2)
      }
      R[(1 + d * (i - 1)) : (d + d * (i - 1)), (1 + d * (i - 1)) : (d + d * (i - 1))] <- Rii
    }
    R[lower.tri(R, diag = TRUE)] <- t(R)[lower.tri(R, diag = TRUE)]
    diag(R) <- 1
    R
    R <- kronecker(R, matrix(1, p / b / 3, p / b / 3))
    return(R)
  }

}





#' @title Hierarchical Variable Clustering Using Singular Vectors (HC-SVD).
#'
#' @description Performs HC-SVD to reveal the hierarchical variable structure as descried in Bauer (202X). For this divise approach, each cluster is split into two clusters iteratively. Potential splits
#' are identified by the first sparse loadings (which are sparse approximations of the first right eigenvectors, i.e., vectors with many zero values, of the correlation matrix) that
#' mirror the masked shape of the correlation matrix. This procedure is continued until each variable lies in a single cluster.
#'
#' @param R A correlation matrix of dimension \eqn{p}x\eqn{p} or a data matrix of dimension \eqn{n}x\eqn{p} an be provided. If a data matrix is supplied, it must be indicated by setting
#' \code{is.corr = FALSE}, and the correlation matrix will then be calculated as \code{cor(X)}.
#'
#' @param q Number of sparse loadings to be used. This should be either a numeric value between zero and one to indicate percentages, or \code{"Kaiser"} for as many sparse loadings as
#' there are eigenvalues larger or equal to one. For a numerical value between zero and one, the number of sparse loadings is determined as the corresponding share of the total number of loadings.
#' E.g., \code{q = 1} (100%) uses all sparse loadings and \code{q = 0.5} (50%) will use half of all sparse loadings.
#'
#' @param linkage The linkage function to be used. This should be one of \code{"average"}, \code{"single"}, or
#' \code{"RV"} (for RV-coefficient).
#'
#' @param is.corr Is the supplied object a correlation matrix. Default is \code{TRUE} and this parameter must be set to \code{FALSE} is
#' a data matrix instead of a correlation matrix is supplied.
#'
#' @param max.iter How many iterations should be performed for computing the sparse loadings.
#' Default is \code{200}.
#'
#' @param trace Print out progress as \eqn{p-1} iterations for divisive hierarchical clustering are performed.
#' Default is \code{TRUE}.
#'
#' @details
#' The sparse loadings are computed using the method of Shen and Huang (2008), which is implemented based on the code
#' of Baglama, Reichel, and Lewis in \code{\link[irlba]{ssvd}} \{\link[irlba]{irlba}\}, with slight modifications to suit our method.
#'
#' @return
#' A list with four components:
#' \item{hclust}{
#'  The clustering structure identified by HC-SVD as an object of type \code{hclust}.
#' }
#' \item{dist.matrix}{
#'  The ultrametric distance matrix (cophenetic matrix) of the HC-SVD structure as an object of class \code{dist}.
#' }
#' \item{u.cor}{
#'  The ultrametric correlation matrix of \eqn{X} obtained by HC-SVD as an object of class \code{matrix}.
#' }
#' \item{q.p}{
#'  A vector of length \eqn{p-1} containing the ratio \eqn{q_i/p_i} of the \eqn{q_i} sparse loadings used relative to all sparse
#'  loadings \eqn{q_i} for the split of each cluster. The ratio is set to \code{NA} if the cluster contains only two variables as the search
#'  for sparse loadings that reflect the split is not required in this case.
#' }
#'
#' @references \cite{Bauer, J.O. (202X). Divisive hierarchical clustering identified by singular vectors.}
#' @references \cite{Shen, H. and Huang, J.Z. (2008). Sparse principal component analysis via regularized low rank matrix approximation, J. Multivar. Anal. 99, 1015–1034.}
#'
#' @examples
#' #We replicate the simulation study (a) in Bauer (202X)
#'
#' \dontrun{
#' p <- 40
#' n <- 500
#' b <- 5
#' design <- "a"
#'
#' set.seed(1)
#' Rho <- hcsvd.cor.sim(p = p, b = b, design = "a")
#' X <- mvtnorm::rmvnorm(n, mean=rep(0, p), sigma = Rho, checkSymmetry = FALSE)
#' R <- cor(X)
#' hcsvd.obj <- hcsvd(R)
#'
#' #The object of hclust with corresponding dendrogram can be obtained
#' #directly from hcsvd.obj$hclust:
#' hc <- hcsvd.obj$hclust
#' plot(hc)
#'
#' #The dendrogram can also be obtained from the ultrametric distance matrix:
#' plot(hclust(hcsvd.obj$dist.matrix))
#' }
#'
#'
#' @importFrom stats cov
#' @importFrom stats as.dist
#'
#' @export
hcsvd <- function(R,
                  q = "Kaiser",
                  linkage = "average",
                  is.corr = TRUE,
                  max.iter,
                  trace = TRUE
) {

  if (is.corr && !is.corr.matrix(R)) {
    stop("R must be a correlation matrix. Set 'is.corr = FALSE' if you want to supply a data matrix")
  }

  if (!is.corr) {
    X <- R
    if (anyNA(X)) {
      stop("X contains missing value indicator (NA)")
    }
    R <- stats::cor(X)
  }

  if (!((q == "Kaiser") | (is.numeric(q) & (q > 0) & (q <= 1)))) {
    stop(paste(q), " is an invalid argument for q")
  }

  LINKAGE <- c("average", "single", "RV")
  if (!(linkage %in% LINKAGE)) {
    stop(paste(linkage), " is an invalid linkage function")
  }

  if (missing(max.iter)) {
    max.iter <- 500
  }

  p <- ncol(R)
  if (length(colnames(R)) == 0 | length(rownames(R)) == 0) {
    dimnames(R) <- list(as.character(seq_len(p)), as.character(seq_len(p)))
  }

  if (length(unique(colnames(R))) != p) {
    stop("Variable names are not unique")
  }

  q.p <- c()

  dist.matrix <- matrix(0, p, p)
  labels <- colnames(R)
  dimnames(dist.matrix) <- list(labels, labels)

  merge <- matrix(0, p - 1, 2)
  height <- vector(length = p - 1)
  order <- labels

  sub.matrices <- list(colnames(R))
  cluster.count <- p - 2
  for (iter in 1:(p - 1)) {
    current.labels <- labels %in% sub.matrices[[1]]
    hcsvd.ht <- hcsvd.ht(R = R[current.labels, current.labels],
                         q = q,
                         linkage = linkage,
                         labels = labels[current.labels],
                         max.iter = max.iter,
                         trace = trace)

    q.p <- c(q.p, hcsvd.ht$q.p)

    cluster.rows <- labels %in% hcsvd.ht$cluster1
    cluster.cols <- labels %in% hcsvd.ht$cluster2
    dist.matrix[cluster.rows, cluster.cols] <- hcsvd.ht$cluster.distance
    dist.matrix[cluster.cols, cluster.rows] <- hcsvd.ht$cluster.distance

    sub.matrices <- sub.matrices[-1]

    height[p - iter] <- hcsvd.ht$cluster.distance
    for (i in 1:2) {
      cluster <- hcsvd.ht[[paste0("cluster", i)]]
      if (length(cluster) != 1) {
        merge[p - iter, i] <- cluster.count
        cluster.count <- cluster.count - 1
        sub.matrices <- append(sub.matrices, list(cluster))
      } else {
        merge[p - iter, i] <- -which(labels == cluster)
      }
    }

    order.cluster1 <- order %in% hcsvd.ht$cluster1
    order.cluster2 <- order %in% hcsvd.ht$cluster2
    order[order.cluster1 | order.cluster2] <- c(order[order.cluster2], order[order.cluster1])

    cat(sprintf("\rSplit %d out of %d (%.2f%%)           ", iter, p - 1, iter / (p - 1) * 100))
  }

  ordered.height <- order(height)

  merge <- merge[ordered.height, ]
  height <- height[ordered.height]

  not.changed <- matrix(TRUE, p - 1, 2)
  for (i in seq_len(p - 1)) {
    change.idx <- which(merge == i)
    merge[merge == i & not.changed] <- which(ordered.height == i)
    not.changed[change.idx] <- FALSE
  }

  hclust <- list(merge = merge, height = height, order = match(order, labels), labels = labels, method = linkage)
  class(hclust) <- "hclust"

  u.cor <- 1 - dist.matrix
  dist.matrix <- stats::as.dist(dist.matrix)
  attr(dist.matrix, "Size") <- p
  cat("\r======== FINISHED ========                    ")
  cat("\n")
  return(list(hclust = hclust, dist.matrix = dist.matrix, u.cor = u.cor, q.p = q.p))
}