# R/ComDim_Exploratory.R
#
# Defines functions: ComDim_Exploratory
#
# Documented in: ComDim_Exploratory

#' ComDim_Exploratory - Extending matrix decomposition methods to multi-block data.
#'
#' Extends any matrix decomposition method used for exploratory purposes to the multi-block
#' field. The user provides a function (\code{FUN}) to compute the scores from the
#' salience-weighted concatenated blocks; these scores are then used to derive the global
#' scores, local scores, and loadings following the traditional ComDim-PCA framework.
#' @param MB A MultiBlock object.
#' @param ndim Number of Common Dimensions.
#' @param FUN The function used as the core of the ComDim analysis. It must accept a matrix
#'   \code{W} (salience-weighted concatenated blocks) and \code{ndim} as its first two
#'   arguments, and return a matrix of scores with one column per component.
#' @param normalise To apply block normalisation. FALSE == no (default), TRUE == yes.
#' @param threshold The threshold limit to stop the iterations. Iterations stop when the
#'   change in the global score vector is below this value (1e-10 as default).
#' @param loquace To display the calculation times. TRUE == yes, FALSE == no (default).
#' @param method A string label identifying the decomposition method used (default: 'FUN').
#' @param ... Additional arguments passed to \code{FUN}.
#' @return A \code{ComDim} object. Slots for supervised analysis
#'   (\code{R2Y}, \code{Q2}, \code{DQ2}, \code{VIP}, \code{VIP.block},
#'   \code{PLS.model}, \code{cv}, \code{Prediction}) are empty. The
#'   populated slots are:
#' \describe{
#'   \item{\code{Method}}{The label supplied via the \code{method} argument.}
#'   \item{\code{ndim}}{Number of Common Dimensions extracted.}
#'   \item{\code{Q.scores}}{Global scores matrix (\eqn{n \times ndim}).
#'     Column names are \code{CC1}, \code{CC2}, etc.; row names are sample
#'     names.  Each column \eqn{\mathbf{q}_a} is a unit-norm consensus score
#'     derived from the dominant left direction of FUN applied to the
#'     salience-weighted concatenated blocks
#'     \eqn{\mathbf{W} = [\sqrt{\lambda_1}\mathbf{X}_1 \mid \cdots \mid
#'     \sqrt{\lambda_B}\mathbf{X}_B]}.}
#'   \item{\code{T.scores}}{Named list of block-specific local scores matrices
#'     (\eqn{n \times ndim} each).  For block \eqn{b} and component \eqn{a}:
#'     local loading \eqn{\mathbf{p}_{ba} = \mathbf{X}_b'\mathbf{q}_a} and
#'     local score
#'     \eqn{\mathbf{t}_{ba} =
#'     \mathbf{X}_b\,\mathbf{p}_{ba}(\mathbf{p}_{ba}'\mathbf{p}_{ba})^{-1}}.}
#'   \item{\code{P.loadings}}{Global loadings matrix (\eqn{p_{tot} \times
#'     ndim}).  Column \eqn{a} is
#'     \eqn{\mathbf{P}_a = \mathbf{X}'\mathbf{q}_a},
#'     where \eqn{\mathbf{X}} is the mean-centred (and optionally normalised)
#'     concatenated blocks.}
#'   \item{\code{Saliences}}{Block salience (weight) matrix (\eqn{ntable
#'     \times ndim}, row names = block names).  Entry \eqn{(b,a)} is
#'     \eqn{\lambda_{ba} =
#'     \mathbf{q}_a'\mathbf{X}_b\mathbf{X}_b'\mathbf{q}_a},
#'     the variance of block \eqn{b} captured by global score \eqn{a}.}
#'   \item{\code{R2X}}{Proportion of multi-block variance captured by each
#'     component (named vector, length \eqn{ndim}).  Let \eqn{\mathbf{s}_a}
#'     be the score vector returned by FUN for component \eqn{a} (before
#'     unit-normalisation to obtain \eqn{\mathbf{q}_a}), so that
#'     \eqn{sv_a = \|\mathbf{s}_a\|}; then
#'     \deqn{R2X_a = sv_a^4 \big/ \sum_k sv_k^4 =
#'     Singular_a^2 \big/ \sum_k Singular_k^2.}}
#'   \item{\code{Singular}}{Squared L2 norms of the FUN score vectors:
#'     \eqn{Singular_a = sv_a^2 = \|\mathbf{s}_a\|^2}, used to derive
#'     \code{R2X}.}
#'   \item{\code{Mean}}{List with \code{MeanMB}: named list of column-mean
#'     vectors per block, used for mean-centring.}
#'   \item{\code{Norm}}{List with \code{NormMB}: Frobenius norms used for
#'     block normalisation (all ones when \code{normalise = FALSE}).}
#'   \item{\code{variable.block}}{Character vector (length \eqn{p_{tot}})
#'     indicating the block name of each row in \code{P.loadings}.}
#'   \item{\code{runtime}}{Total computation time in seconds.}
#' }
#' @examples
#' b1 <- matrix(rnorm(500), 10, 50) # 10 samples, 50 variables
#' b2 <- matrix(rnorm(800), 10, 80) # 10 samples, 80 variables
#' mb <- MultiBlock(Data = list(b1 = b1, b2 = b2))
#'
#' if (requireNamespace("ica", quietly = TRUE)) {
#'   fun.ICA <- function(W, ndim, ...) {
#'     # W is the concatenated MB.
#'     # ndim is the number of components.
#'     result <- ica::ica(W, ndim)
#'     # The function must return the source estimates (analogous to PCA scores).
#'     return(result$S)
#'   }
#'   resultsICA <- ComDim_Exploratory(mb,
#'     ndim = 2,
#'     FUN = fun.ICA,
#'     method = "ICA"
#'   )
#' }
#' @references Jouan-Rimbaud Bouveresse D, Rutledge DN (2024).
#'   A synthetic review of some recent extensions of ComDim.
#'   \emph{Journal of Chemometrics}, 38(5), e3454.
#'   \doi{10.1002/cem.3454}
#' @export

ComDim_Exploratory <- function(MB = MB, ndim = NULL,
                               FUN = FUN,
                               normalise = FALSE, threshold = 1e-10,
                               loquace = FALSE,
                               method = "FUN",
                               ...) {
  # INITIALISATION
  #
  # The progress bar spans 0-80 arbitrary units; the total is split among the
  # initialisation, per-block normalisation, per-component extraction and
  # loadings stages (see `pieceBar` below).

  progress_bar <- utils::txtProgressBar(min = 0, max = 80, style = 3, char = "=")

  start_ini_time <- Sys.time() # To start counting calculation time for the initialization


  # Input must be a MultiBlock S4 object (defined elsewhere in this package).
  if (!inherits(MB, "MultiBlock")) {
    stop("'MB' is  not a MultiBlock.")
  }

  ntable <- length(blockNames(MB)) # number of blocks
  nrowMB <- length(sampleNames(MB)) # number of samples
  variable_number <- ncol(MB) # Number of variables per block (one entry per block)

  give_error <- 0

  # Remove all variables with 0 variance. If there are no remaining variables, give_error
  # Blocks whose variables are ALL zero-variance are collected in `numblock0`
  # and dropped entirely afterwards.
  numvar <- 0
  numblock0 <- vector()
  for (i in 1:ntable) {
    var0 <- apply(MB@Data[[i]], 2, function(x) {
      var(x)
    })
    var0 <- which(var0 == 0)

    if (length(var0) != 0) {
      numvar <- numvar + length(var0)
      if (length(var0) == length(MB@Variables[[i]])) {
        numblock0 <- append(numblock0, i)
      }
      # Keep only the non-constant columns; update the per-block variable count.
      MB@Data[[i]] <- MB@Data[[i]][, setdiff(1:variable_number[i], var0)]
      MB@Variables[[i]] <- MB@Variables[[i]][setdiff(1:variable_number[i], var0)]
      variable_number[i] <- length(MB@Variables[[i]])
    }
  }
  if (numvar != 0) {
    warning(sprintf("Number of variables excluded from the analysis because of their zero variance: %d", numvar))
  }
  if (length(numblock0) != 0) {
    numblock0 <- rev(numblock0) # To sort in decreasing order.
    # Delete in decreasing index order so earlier removals do not shift the
    # indices of the blocks still to be removed.
    for (i in seq_along(numblock0)) {
      MB@Data[[numblock0[i]]] <- NULL
      MB@Variables[[numblock0[i]]] <- NULL
      # NOTE(review): these conditions test whether the block INDEX is an
      # element of MB@Batch / MB@Metadata via %in%; presumably the intent was
      # to test whether a Batch/Metadata entry exists for that block (e.g. by
      # length or name) — confirm against the MultiBlock class definition.
      if (numblock0[[i]] %in% MB@Batch) {
        MB@Batch[[numblock0[i]]] <- NULL
      }
      if (numblock0[[i]] %in% MB@Metadata) {
        MB@Metadata[[numblock0[i]]] <- NULL
      }
    }
    warning(sprintf("Number of blocks excluded from the analysis because of their zero variance: %d", length(numblock0)))
  }

  ntable <- length(blockNames(MB)) # number of blocks
  variable_number <- ncol(MB) # In case the number of blocks has changed.

  if (length(MB@Data) == 0) { # In case all blocks had 0 variance...
    warning("All blocks had 0 variance")
    give_error <- 1
  }

  # Check for infinite or missing values (they cannot be handled with svd)
  for (i in 1:ntable) {
    if (any(is.na(MB@Data[[i]]))) {
      stop("The MB contains NA values. They must be removed first for ComDim analysis.")
    }
    if (any(is.infinite(MB@Data[[i]]))) {
      stop("The MB contains infinite values. They must be removed first for ComDim analysis.")
    }
  }

  if (give_error) {
    stop("The data is not ready for ComDim.")
  } else {
    message("The data can be used for ComDim.")
  }


  if (is.null(ndim)) {
    ndim <- ntable # If the number of components is not defined,
    # the number of components to extract is equal to the number of blocks.
  }

  pieceBar <- 4 + 2 * ntable + ndim # Number of updates in the progress bar.
  pieceBar <- 80 / pieceBar # Progress-bar units advanced per update.
  total_progress <- pieceBar

  DimLabels <- paste0("CC", 1:ndim) # One label per component.
  TableLabels <- blockNames(MB) # One label per block.

  end_ini_time <- Sys.time() # To end the count of the calculation time.

  if (loquace) {
    # NOTE(review): difftime arithmetic uses whatever units difftime picked
    # (secs/mins/...), so "millisecs" may be mislabelled for long runs — confirm.
    message(sprintf("Initialisation finished after : %s millisecs", (end_ini_time - start_ini_time) * 1000))
  }

  utils::setTxtProgressBar(progress_bar, value = total_progress)

  # NORMALISATION
  #
  # Builds two concatenated matrices over all blocks:
  #   X_mat     - raw (unprocessed) blocks side by side
  #   Xnorm_mat - the working blocks (Frobenius-normalised after mean-centring
  #               when normalise = TRUE, otherwise the raw data)
  # `temp_tabCalib` / `s_r` hold the same per-block working matrices; `s_r`
  # is deflated during component extraction while `temp_tabCalib` is deflated
  # later when computing local scores.

  X_mat <- matrix(, nrow = nrowMB, ncol = sum(variable_number))
  Xnorm_mat <- matrix(, nrow = nrowMB, ncol = sum(variable_number))

  res_calib <- list()
  temp_tabCalib <- list()
  s_r <- list()
  res_calib$SingVal <- as.vector(NULL)
  res_calib$NormMB <- as.vector(NULL)
  res_calib$MeanMB <- list()

  for (i in 1:ntable) {
    # Column means per block, used for mean-centring (stored in the output).
    res_calib$MeanMB[[TableLabels[i]]] <- colMeans(MB@Data[[i]])
    names(res_calib$MeanMB[[TableLabels[i]]]) <- MB@Variables[[i]]

    if (normalise) {
      # Normalise original blocks

      # Mean-centre, then scale the whole block to unit Frobenius norm.
      X_mean <- MB@Data[[i]] - matrix(data = rep(1, nrowMB), ncol = 1, nrow = nrowMB) %*% res_calib$MeanMB[[i]]
      XX <- X_mean * X_mean
      Norm_X <- sqrt(sum(XX))
      X_Normed <- X_mean / Norm_X
      res_calib$NormMB[i] <- Norm_X
      temp_tabCalib[[i]] <- X_Normed
      s_r[[i]] <- X_Normed
    } else {
      # NOTE(review): in this branch NormMB[[i]] is a per-variable vector of
      # ones, while in the normalise branch NormMB[i] is a single scalar norm
      # per block — the two shapes are inconsistent; confirm downstream users.
      res_calib$NormMB[[TableLabels[i]]] <- rep(1, length(MB@Variables[[i]]))
      names(res_calib$NormMB[[TableLabels[i]]]) <- MB@Variables[[i]]
      temp_tabCalib[[i]] <- MB@Data[[i]]
      s_r[[i]] <- MB@Data[[i]]
    }

    # Copy this block's columns into the concatenated matrices.
    if (i == 1) {
      X_mat[, 1:variable_number[1]] <- MB@Data[[i]]
      Xnorm_mat[, 1:variable_number[1]] <- temp_tabCalib[[i]]
    } else {
      beg <- sum(variable_number[1:(i - 1)]) + 1
      ending <- sum(variable_number[1:i])
      X_mat[, beg:ending] <- MB@Data[[i]]
      Xnorm_mat[, beg:ending] <- temp_tabCalib[[i]]
    }

    norm_comdim <- Sys.time()
    if (loquace) {
      message(sprintf("Normalization of block %s finished after : %s millisecs", i, (norm_comdim - start_ini_time) * 1000))
    }
    total_progress <- total_progress + pieceBar
    utils::setTxtProgressBar(progress_bar, value = total_progress)
  }

  names(res_calib$NormMB) <- TableLabels

  # NOTE(review): nR and nC do not appear to be used below — kept for parity.
  nR <- nrow(Xnorm_mat)
  nC <- ncol(Xnorm_mat)

  total_progress <- total_progress + pieceBar * ntable
  utils::setTxtProgressBar(progress_bar, value = total_progress)

  ## DO ComDim
  #
  # For each component: iterate (a) concatenate the salience-weighted blocks
  # W = [sqrt(lambda_1) X_1 | ... | sqrt(lambda_B) X_B], (b) obtain scores
  # from the user-supplied FUN, (c) update saliences lambda_b = q' Xb Xb' q,
  # until the consensus score q stabilises; then deflate every block by q.

  saliences <- matrix(, ncol = ndim, nrow = ntable)
  Q <- matrix(, ncol = ndim, nrow = nrowMB)
  unexplained <- ntable # Warning!: true only if the tables were set to norm=1
  varexp <- as.vector(NULL)

  for (dims in 1:ndim) {
    previousfit <- 10000
    deltafit <- 1000000
    # All block saliences start at 1 for each new component.
    lambda <- matrix(rep(1, ntable), nrow = ntable, ncol = 1)

    # Initialise q from the first column of the first (deflated) block.
    qini <- as.vector(s_r[[1]][, 1])
    qini <- qini / sqrt(as.vector(qini %*% qini)) # random initialization of t + set to unit length
    qold <- 100

    iters <- 0
    # qold starts as the scalar 100 (recycled in the subtraction) so the first
    # convergence test is always TRUE; capped at 100 iterations.
    while (norm(qold - qini, "2") > threshold && iters < 100) {
      iters <- iters + 1
      qold <- qini
      q <- 0

      # Build the salience-weighted concatenated matrix W.
      W <- matrix(, nrow = nrowMB, ncol = 0)
      for (j in 1:ntable) {
        W <- cbind(W, sqrt(lambda[j]) * s_r[[j]])
      }

      # HERE IS THE CORE COMDIM
      # FUN replaces the SVD/PCA step of classical ComDim; only the column
      # for the current component (`dims`) is used.
      scores <- FUN(W = W, ndim = ndim, ...)

      sv <- sqrt(t(scores[, dims]) %*% scores[, dims]) # For R2X
      qtmp <- scores[, dims] / as.vector(sv) # unit-norm candidate score

      for (j in 1:ntable) {
        # takes each table into account for lambda after PCA
        # salience = variance of block j captured by the candidate score
        lambda[j] <- t(qtmp) %*% (s_r[[j]] %*% t(s_r[[j]])) %*% qtmp
        q <- q + lambda[j] * qtmp
      }

      q <- q / sqrt(as.vector(t(q) %*% q)) # standardizes t
      # Sign convention: make the dominant direction of q positive.
      if (abs(min(q)) > abs(max(q))) {
        q <- -q
      }
      qini <- q
    } # end while: q has converged (or 100 iterations reached)

    saliences[, dims] <- lambda
    Q[, dims] <- q


    # sv is the norm of the FUN score vector from the LAST iteration;
    # SingVal stores sv^2 and varexp stores sv^4 (used for R2X below).
    res_calib$SingVal[dims] <- sv^2
    varexp[dims] <- res_calib$SingVal[dims]^2

    # Deflate blocks
    # Project each block onto the orthogonal complement of q.
    aux <- diag(nrowMB) - as.matrix(q) %*% t(as.matrix(q))
    for (j in 1:ntable) {
      s_r[[j]] <- aux %*% s_r[[j]]
    }

    iter_comdim <- Sys.time()
    if (loquace) {
      message(sprintf("Component %s determined after : %s millisecs", dims, (iter_comdim - start_ini_time) * 1000))
    }
    total_progress <- total_progress + pieceBar
    utils::setTxtProgressBar(progress_bar, value = total_progress)
  }

  # res_calib$SingVal$i <- 'Singular value'
  names(res_calib$SingVal) <- DimLabels # Dimensions

  # Global scores: rows = samples, columns = common components.
  # Q$i <- data[[1]]$Samples
  # Q$v <- DimLabels
  colnames(Q) <- DimLabels
  rownames(Q) <- MB@Samples
  res_calib$Q <- Q
  rm(Q)

  ## Adding metadata. Metadata extracted from the first block
  res_calib$metadata <- list()
  if (length(MB@Metadata) != 0) {
    res_calib$metadata[[1]] <- MB@Metadata
  }


  end_comdim <- Sys.time()
  if (loquace) {
    message(sprintf("Scores finished after : %s millisecs", (end_comdim - start_ini_time) * 1000))
  }
  total_progress <- total_progress + pieceBar
  utils::setTxtProgressBar(progress_bar, value = total_progress)

  # NOTE(review): meanW is computed but never used below — kept for parity.
  meanW <- as.vector(NULL)
  for (j in 1:ntable) {
    meanW <- c(meanW, mean(s_r[[j]]))
  }
  # res_calib$s_n <- s_n
  # res_calib$s_r <- s_r
  rm(s_r)

  ##
  # R2X: share of total variance per component, varexp_a / sum(varexp).
  # explained <- list()
  explained <- varexp / sum(varexp)
  names(explained) <- DimLabels
  # explained$i <-'% explained'
  # explained$v <- DimLabels #Dimensions
  res_calib$explained <- explained
  rm(varexp)
  rm(explained)

  ## Calculate Sums of saliences for each Dim
  # Sum_saliences_Dim <- list()
  Sum_saliences_Dim <- colSums(saliences)
  names(Sum_saliences_Dim) <- DimLabels
  # Sum_saliences_Dim$i <-'Sum Dim Saliences'
  # Sum_saliences_Dim$v <- DimLabels #Dimensions

  res_calib$Sum_saliences_Dim <- Sum_saliences_Dim
  rm(Sum_saliences_Dim)

  # Calculate Sums of saliences for each Table
  # Sum_saliences_Tab <- list()
  Sum_saliences_Tab <- rowSums(saliences)
  # Sum_saliences_Tab$i <- 'Sum Tab Saliences'
  names(Sum_saliences_Tab) <- TableLabels
  # Sum_saliences_Tab$v <- TableLabels #tables

  res_calib$Sum_saliences_Tab <- Sum_saliences_Tab
  rm(Sum_saliences_Tab)

  # Salience matrix: rows = blocks, columns = components.
  # saliences$i <- TableLabels #tables
  # saliences$v <- DimLabels #Dimensions
  res_calib$saliences <- saliences
  colnames(res_calib$saliences) <- DimLabels
  rownames(res_calib$saliences) <- TableLabels

  ## Calculate Normalised concatenated Xs ('Calib') from col
  # Calculate concatenated CD loadings
  # Reorganise Loadings - 1 matrix / LV

  ## If Output = NULL, nothing else is calculated
  # if(!(is.null(output))){
  L_CD_Vec <- NULL
  L_X_Vec <- NULL
  Calib <- NULL

  nCalib <- nrowMB
  # }

  ## Already defined in during ComDim initiallization
  # isT <- grepl(output, 'T')
  # isL <- grepl(output, 'L')
  # isP <- grepl(output, 'P')
  # isLP <- c(isP,isL)
  # isTLP <- c(isT,isP,isL)

  # tic
  # Calculate concatenated CD loadings
  # Reorganise Loadings - 1 matrix / LV
  # L_CD <- list()
  # L_X <- list()
  # Local (block-wise) scores, one n x ndim matrix per block.
  T_Loc <- list()
  for (i in 1:ntable) { # Prepare lists
    # L_CD[[TableLabels[i]]] <- matrix(,ncol = ndim, nrow = ncol(temp_tabCalib[[i]]))
    # L_X[[TableLabels[i]]] <- matrix(,ncol = ndim, nrow = ncol(temp_tabCalib[[i]]))
    T_Loc[[TableLabels[i]]] <- matrix(, ncol = ndim, nrow = nrowMB)
  }
  # b <- matrix(,ncol = ndim, nrow = ntable)

  # pracma is only needed for pinv() below (optional dependency).
  if (!requireNamespace("pracma", quietly = TRUE)) {
    stop("The package pracma is needed.")
  }

  for (j in 1:ndim) {

    for (i in 1:ntable) {

      # Local loading p_bj = Xb' q_j, then local score
      # t_bj = Xb p_bj (p_bj' p_bj)^-1  (least-squares projection on p_bj).
      temp <- t(temp_tabCalib[[i]]) %*% res_calib$Q[, j] # Scaled CD 'local' Loadings

      T_Loc[[TableLabels[i]]][, j] <- temp_tabCalib[[i]] %*% (temp %*% pracma::pinv(t(temp) %*% temp)) # local Scores

      # Deflate each temp_tabCalib
      temp_tabCalib[[i]] <- temp_tabCalib[[i]] - res_calib$Q[, j] %*% t(temp)
    }

  }

  for (i in 1:ntable) {
    rownames(T_Loc[[TableLabels[i]]]) <- rownames(res_calib$Q)
    colnames(T_Loc[[TableLabels[i]]]) <- colnames(res_calib$Q)
  }

  # If Output==[], nothing else is calculated
  # if(!(is.null(output))){
  # Calculate Global Loadings
  # if(!(is.null(isP))){
  L_CD_Vec <- t(Xnorm_mat) %*% res_calib$Q # Scaled CD 'global' Loadings
  # }

  # if(!(is.null(isL))){
  #  L_X_Vec <- t(X_mat)%*%Q$Data # Unscaled X 'global' Loadings
  # }
  # }

  #### If Output==[], nothing else is calculated
  load_comdim <- Sys.time()
  if (loquace) {
    message(sprintf("Loadings finished after : %s millisecs", (load_comdim - start_ini_time) * 1000))
  }

  total_progress <- total_progress + pieceBar
  utils::setTxtProgressBar(progress_bar, value = total_progress)

  # Free the large intermediates before assembling the output object.
  rm(X_mat)
  rm(temp_tabCalib)
  rm(temp)

  ## Output
  end_function <- Sys.time()


  # if(!(is.null(isT))){
  res_calib$T_Loc <- T_Loc

  # T_Loc is no longer needed
  rm(T_Loc)


  # if(!(is.null(isP))){

  # Concatenated variable names, in block order, for the loading row names.
  varnames <- vector()
  for (i in 1:ntable) {
    varnames <- append(varnames, MB@Variables[[i]])
  }
  # if(length(which(duplicated(varnames))) != 0){
  #  warning('There are at least two variables with the same name. Their name in the rownames of P has been altered.\n')
  # }

  res_calib$P <- L_CD_Vec
  # res_calib$P$i <- varnames # numbers of all variables;
  # res_calib$P$v <- DimLabels # Dimensions
  colnames(res_calib$P) <- DimLabels
  rownames(res_calib$P) <- varnames


  rm(L_CD_Vec)
  # rm(L_CD)

  ##
  rm(saliences)

  # Define block length
  # belong_block[k] gives the block name of the k-th row of P.
  belong_block <- rep(0, nrow(res_calib$P))
  k <- 1
  for (i in 1:ntable) {
    belong_block[k:(k + variable_number[i] - 1)] <- TableLabels[i]
    k <- k + variable_number[i]
  }


  end_output <- Sys.time()
  running_time <- (end_output - start_ini_time)
  if (loquace) {
    message(sprintf("Analysis finished after : %s seconds", running_time))
  }
  res_calib$runtime <- running_time # Total time of analysis

  # Force the progress bar to completion before closing it.
  progress_bar <- utils::txtProgressBar(min = 0, max = 80, style = 3, char = "=")
  utils::setTxtProgressBar(progress_bar, value = 80)

  close(progress_bar)

  # Save data in a ComDim structure
  # Supervised-analysis slots (R2Y, Q2, DQ2, PLS.model, cv, Prediction)
  # are intentionally left empty for this exploratory variant.
  res_calib <- new("ComDim",
    Method = method,
    ndim = ndim,
    Q.scores = res_calib$Q,
    T.scores = res_calib$T_Loc,
    P.loadings = res_calib$P,
    Saliences = res_calib$saliences,
    Orthogonal = list(),
    R2X = res_calib$explained,
    R2Y = vector(),
    Q2 = vector(),
    DQ2 = vector(),
    Singular = res_calib$SingVal,
    Mean = list(MeanMB = res_calib$MeanMB, MeanY = NULL),
    Norm = list(NormMB = res_calib$NormMB),
    PLS.model = list(),
    cv = list(),
    Prediction = list(),
    Metadata = res_calib$metadata,
    variable.block = belong_block,
    runtime = as.numeric(res_calib$runtime)
  )
  return(res_calib)
}

# Try the R.ComDim package in your browser.
#
# Any scripts or data that you put into this service are public.
#
# R.ComDim documentation built on May 13, 2026, 9:07 a.m.