R/ComDim_PCA.R

Defines functions RowsPartition ColumnsPartition PCA_Tall_PCT_DNR Compress_Data_2020 ComDim_PCA

Documented in ColumnsPartition ComDim_PCA Compress_Data_2020 PCA_Tall_PCT_DNR RowsPartition

#' ComDim_PCA
#'
#' Finding common dimensions in multi-block datasets.
#' @param MB A MultiBlock object.
#' @param ndim Number of Common Dimensions.
#' @param normalise To apply normalisation. FALSE == no (default), TRUE == yes.
#' @param threshold The threshold limit to stop the iterations. If the "difference of fit" < threshold (1e-10 as default).
#' @param loquace To display the calculation times. TRUE == yes, FALSE == no (default).
#' @param CompMethod To speed-up the analysis for really big MultiBlocks. 'Normal' (default), 'Kernel', 'PCT', 'Tall' or 'Wide'.
#' @param Partitions To speed-up the analysis for really big MultiBlocks. This parameter is used if CompMethod is 'Tall' or 'Wide'.
#' @return A \code{ComDim} object. Slots for supervised analysis
#'   (\code{R2Y}, \code{Q2}, \code{DQ2}, \code{VIP}, \code{VIP.block},
#'   \code{PLS.model}, \code{cv}, \code{Prediction}) are empty. The
#'   populated slots are:
#' \describe{
#'   \item{\code{Method}}{\code{"PCA"}.}
#'   \item{\code{ndim}}{Number of Common Dimensions extracted.}
#'   \item{\code{Q.scores}}{Global scores matrix (\eqn{n \times ndim}).
#'     Column names are \code{CC1}, \code{CC2}, etc.; row names are sample
#'     names.  Each column \eqn{\mathbf{q}_a} is a unit-norm consensus score,
#'     the dominant left singular vector of the salience-weighted concatenated
#'     blocks
#'     \eqn{\mathbf{W} = [\sqrt{\lambda_1}\mathbf{X}_1 \mid \cdots \mid
#'     \sqrt{\lambda_B}\mathbf{X}_B]}.}
#'   \item{\code{T.scores}}{Named list of block-specific local scores matrices
#'     (\eqn{n \times ndim} each).  For block \eqn{b} and component \eqn{a}:
#'     local loading \eqn{\mathbf{p}_{ba} = \mathbf{X}_b'\mathbf{q}_a} and
#'     local score
#'     \eqn{\mathbf{t}_{ba} =
#'     \mathbf{X}_b\,\mathbf{p}_{ba}(\mathbf{p}_{ba}'\mathbf{p}_{ba})^{-1}}.}
#'   \item{\code{P.loadings}}{Global loadings matrix (\eqn{p_{tot} \times
#'     ndim}).  Column \eqn{a} is
#'     \eqn{\mathbf{P}_a = \mathbf{X}'\mathbf{q}_a},
#'     where \eqn{\mathbf{X}} is the mean-centred (and optionally normalised)
#'     concatenated blocks.}
#'   \item{\code{Saliences}}{Block salience (weight) matrix (\eqn{ntable
#'     \times ndim}, row names = block names).  Entry \eqn{(b,a)} is
#'     \eqn{\lambda_{ba} =
#'     \mathbf{q}_a'\mathbf{X}_b\mathbf{X}_b'\mathbf{q}_a},
#'     the variance of block \eqn{b} captured by global score \eqn{a}.}
#'   \item{\code{R2X}}{Proportion of multi-block inertia captured by each
#'     component (named vector, length \eqn{ndim}).  Let
#'     \eqn{d_a} be the leading singular value of \eqn{\mathbf{W}} for
#'     component \eqn{a} (stored as \eqn{Singular_a = d_a^2}); then
#'     \deqn{R2X_a = Singular_a^2 \big/ \sum_k Singular_k^2 = d_a^4 \big/
#'     \sum_k d_k^4.}}
#'   \item{\code{Singular}}{Squared leading singular values of
#'     \eqn{\mathbf{W}}, one per component:
#'     \eqn{Singular_a = d_a^2}.}
#'   \item{\code{Mean}}{List with \code{MeanMB}: named list of column-mean
#'     vectors per block, used for mean-centring.}
#'   \item{\code{Norm}}{List with \code{NormMB}: Frobenius norms used for
#'     block normalisation (all ones when \code{normalise = FALSE}).}
#'   \item{\code{variable.block}}{Character vector (length \eqn{p_{tot}})
#'     indicating the block name of each row in \code{P.loadings}.}
#'   \item{\code{runtime}}{Total computation time in seconds.}
#' }
#' @examples
#' # Example 1: two data blocks.
#' b1 <- matrix(rnorm(500), 10, 50) # 10 samples, 50 variables
#' b2 <- matrix(rnorm(800), 10, 80) # 10 samples, 80 variables
#' mb <- MultiBlock(Data = list(b1 = b1, b2 = b2))
#' results <- ComDim_PCA(mb, 2)
#'
#' # Example 2: two data blocks, each with different replicate number
#' b1 <- matrix(rnorm(500), 10, 50)
#' batch_b1 <- rep(1, 10)
#' b2 <- matrix(rnorm(2400), 30, 80)
#' batch_b2 <- c(rep(1, 10), rep(2, 10), rep(3, 10))
#' mb <- MultiBlock(
#'   Samples = list(
#'     b1 = paste0("samples_", 1:10),
#'     b2 = rep(paste0("samples_", 1:10), 3)
#'   ),
#'   Data = list(b1 = b1, b2 = b2),
#'   Batch = list(b1 = batch_b1, b2 = batch_b2),
#'   ignore.size = TRUE
#' )
#' rw <- SplitRW(mb)
#' results <- ComDim_PCA(rw, 2)
#' @references Jouan-Rimbaud Bouveresse D, Rutledge DN (2024).
#'   A synthetic review of some recent extensions of ComDim.
#'   \emph{Journal of Chemometrics}, 38(5), e3454.
#'   \doi{10.1002/cem.3454}
#'
#'   Original MATLAB implementation: \url{https://github.com/DNRutledge/ComDim/}
#' @export
ComDim_PCA <- function(MB = MB, ndim = NULL,
                       normalise = FALSE, threshold = 1e-10,
                       loquace = FALSE,
                       CompMethod = "Normal", Partitions = 1) {
  # INITIALISATION
  #
  # Overall flow of the function:
  #   1) sanity-check the MultiBlock; drop zero-variance variables and blocks;
  #   2) mean-centre (and optionally normalise) every block;
  #   3) optionally compress the blocks (Compress_Data_2020) to speed up SVD;
  #   4) for each Common Dimension: iterate an SVD on the salience-weighted
  #      concatenated blocks until the global score q converges, then deflate
  #      the (compressed) blocks by that score;
  #   5) compute local scores, global loadings and explained variance;
  #   6) pack everything into a 'ComDim' S4 object.

  progress_bar <- utils::txtProgressBar(min = 0, max = 80, style = 3, char = "=")

  start_ini_time <- Sys.time() # To start counting calculation time for the initialization


  if (!inherits(MB, "MultiBlock")) {
    stop("'MB' is not a MultiBlock.")
  }

  ntable <- length(blockNames(MB)) # number of blocks
  nrowMB <- length(sampleNames(MB)) # number of samples
  variable_number <- ncol(MB) # Number of variables per block

  give_error <- 0

  # Remove all variables with 0 variance. If there are no remaining variables, give_error
  numvar <- 0
  numblock0 <- vector()
  for (i in 1:ntable) {
    var0 <- apply(MB@Data[[i]], 2, function(x) {
      var(x)
    })
    var0 <- which(var0 == 0)

    if (length(var0) != 0) {
      numvar <- numvar + length(var0)
      if (length(var0) == length(MB@Variables[[i]])) {
        # Every variable in this block is constant: mark the whole block
        # for removal below.
        numblock0 <- append(numblock0, i)
      }
      MB@Data[[i]] <- MB@Data[[i]][, setdiff(1:variable_number[i], var0)]
      MB@Variables[[i]] <- MB@Variables[[i]][setdiff(1:variable_number[i], var0)]
      variable_number[i] <- length(MB@Variables[[i]])
    }
  }
  if (numvar != 0) {
    warning(sprintf("Number of variables excluded from the analysis because of their zero variance: %d", numvar))
  }
  if (length(numblock0) != 0) {
    numblock0 <- rev(numblock0) # To sort in decreasing order.
    # Deleting in decreasing index order keeps the remaining indices valid.
    for (i in seq_along(numblock0)) {
      MB@Data[[numblock0[i]]] <- NULL
      MB@Variables[[numblock0[i]]] <- NULL
      # NOTE(review): `numblock0[[i]] %in% MB@Batch` tests whether the block
      # INDEX appears among the VALUES of the Batch list, not whether a Batch
      # entry exists for that block. This looks like it was meant to be a
      # length/name check (e.g. numblock0[i] <= length(MB@Batch)); as written
      # the Batch/Metadata entries of removed blocks are likely never dropped.
      # TODO confirm against the MultiBlock class definition.
      if (numblock0[[i]] %in% MB@Batch) {
        MB@Batch[[numblock0[i]]] <- NULL
      }
      if (numblock0[[i]] %in% MB@Metadata) {
        MB@Metadata[[numblock0[i]]] <- NULL
      }
    }
    warning(sprintf("Number of blocks excluded from the analysis because of their zero variance: %d", length(numblock0)))
  }

  ntable <- length(blockNames(MB)) # number of blocks
  variable_number <- ncol(MB) # In case the number of blocks has changed.

  if (length(MB@Data) == 0) { # In case all blocks had 0 variance...
    warning("All blocks had 0 variance")
    give_error <- 1
  }

  # Check for infinite or missing values (they cannot be handled with svd)
  for (i in 1:ntable) {
    if (any(is.na(MB@Data[[i]]))) {
      stop("The MB contains NA values. They must be removed first for ComDim analysis.")
    }
    if (any(is.infinite(MB@Data[[i]]))) {
      stop("The MB contains infinite values. They must be removed first for ComDim analysis.")
    }
  }

  if (give_error) {
    stop("The data is not ready for ComDim.")
  } else {
    message("The data can be used for ComDim.")
  }


  if (is.null(ndim)) {
    ndim <- ntable # If the number of components is not defined,
    # the number of components to extract is equal to the number of blocks.
  }

  # The progress bar (0..80) is advanced in fixed-size pieces: 4 one-off
  # stages + one per block (normalisation) + one per block (compression,
  # advanced in a single jump) + one per extracted dimension.
  pieceBar <- 4 + 2 * ntable + ndim # Number of updates in the progress bar.
  pieceBar <- 80 / pieceBar
  total_progress <- pieceBar

  DimLabels <- paste0("CC", 1:ndim) # One label per component.
  TableLabels <- blockNames(MB) # One label per block.

  # isT <- grepl(output, 'T')
  # isL <- grepl(output, 'L')
  # isP <- grepl(output, 'P')

  # 'Tall' compression is interactive for wide blocks: the user may switch
  # methods at the console. NOTE(review): readline() returns "" in
  # non-interactive sessions, which falls through to stop().
  if (CompMethod == "Tall" && any(variable_number > 1000)) {
    message("The method 'Tall' is not recommeded for large blocks (> 1000 columns).")
    message("Do you still want to continue with the analysis?")
    message("You can change now the CompMethod if you prefer: (yes/no)")
    ans1 <- tolower(readline("Type your choice: (y/n)"))
    if (ans1 == "y" || ans1 == "yes") {
      message("Which method do you want to use instead (Normal/Kernel/PCT/Wide)?")
      ans2 <- tolower(readline("Type your choice: (n/k/p/w)"))
      if (ans2 == "n" || ans2 == "normal") {
        CompMethod <- "Normal"
      } else if (ans2 == "k" || ans2 == "kernel") {
        CompMethod <- "Kernel"
      } else if (ans2 == "p" || ans2 == "pct") {
        CompMethod <- "PCT"
      } else if (ans2 == "w" || ans2 == "wide") {
        CompMethod <- "Wide"
      } else {
        stop("Wrong answer typed.")
      }
    } else if (ans1 == "n" || ans1 == "no") {
      message("The method 'Tall' will be used")
    } else {
      stop("Wrong answer typed.")
    }
  }

  # Kernel compression builds an n x n kernel per block and requires n < p.
  if (CompMethod == "Kernel" && nrowMB >= min(variable_number)) {
    stop(sprintf(
      paste0("CompMethod = 'Kernel' requires n < p for every block ",
             "(n = %d, smallest block p = %d). ",
             "Use 'Normal', 'PCT', or 'Wide' instead."),
      nrowMB, min(variable_number)
    ))
  }

  end_ini_time <- Sys.time() # To end the count of the calculation time.

  # NOTE(review): subtracting two POSIXct times yields a difftime whose units
  # depend on the elapsed magnitude; the "* 1000 millisecs" label assumes the
  # difference is in seconds — confirm (applies to every timing message below).
  if (loquace) {
    message(sprintf("Initialisation finished after : %s millisecs", (end_ini_time - start_ini_time) * 1000))
  }

  utils::setTxtProgressBar(progress_bar, value = total_progress)

  # NORMALISATION

  # start_norm_time <- Sys.time()  # To start counting calculation time for the initialization

  # X_mat: raw concatenated blocks; Xnorm_mat: centred/normalised version.
  X_mat <- matrix(, nrow = nrowMB, ncol = sum(variable_number))
  Xnorm_mat <- matrix(, nrow = nrowMB, ncol = sum(variable_number))

  res_calib <- list()
  temp_tabCalib <- list()
  s_n <- list()
  res_calib$SingVal <- as.vector(NULL)
  res_calib$NormMB <- as.vector(NULL)
  res_calib$MeanMB <- list()

  for (i in 1:ntable) {
    res_calib$MeanMB[[TableLabels[i]]] <- colMeans(MB@Data[[i]])
    names(res_calib$MeanMB[[TableLabels[i]]]) <- MB@Variables[[i]]
    # res_calib$MeanMB$i <- TableLabels

    if (normalise) {
      # Normalise original blocks

      # Mean-centre, then scale the whole block by its Frobenius norm so
      # every block has total inertia 1.
      X_mean <- MB@Data[[i]] - matrix(data = rep(1, nrowMB), ncol = 1, nrow = nrowMB) %*% res_calib$MeanMB[[i]]
      # In a previous version (that worked), I had used as.matrix instead of matrix.
      XX <- X_mean * X_mean

      Norm_X <- sqrt(sum(XX))
      X_Normed <- X_mean / Norm_X

      # NOTE(review): here NormMB[i] stores one scalar per block, whereas the
      # normalise = FALSE branch stores a per-variable vector of ones under a
      # name — the two branches produce differently-shaped NormMB slots.
      # Confirm downstream consumers accept both shapes.
      res_calib$NormMB[i] <- Norm_X

      temp_tabCalib[[i]] <- X_Normed
      s_n[[i]] <- X_Normed
    } else {
      res_calib$NormMB[[TableLabels[i]]] <- rep(1, length(MB@Variables[[i]]))
      names(res_calib$NormMB[[TableLabels[i]]]) <- MB@Variables[[i]]
      # res_calib$MeanMB$Variables <- MB@Variables[[i]]
      # res_calib$MeanMB$i <- TableLabels

      # NOTE(review): with normalise = FALSE the blocks are used as-is (no
      # mean-centring here), although the roxygen return section describes
      # mean-centred data — confirm intended behaviour.
      temp_tabCalib[[i]] <- MB@Data[[i]]
      s_n[[i]] <- MB@Data[[i]]
    }

    # Copy this block's columns into the concatenated matrices.
    if (i == 1) {
      X_mat[, 1:variable_number[1]] <- MB@Data[[i]]
      Xnorm_mat[, 1:variable_number[1]] <- temp_tabCalib[[i]]
    } else {
      beg <- sum(variable_number[1:(i - 1)]) + 1
      ending <- sum(variable_number[1:i])
      X_mat[, beg:ending] <- MB@Data[[i]]
      Xnorm_mat[, beg:ending] <- temp_tabCalib[[i]]
    }

    norm_comdim <- Sys.time()
    if (loquace) {
      message(sprintf("Normalization of block %s finished after : %s millisecs", i, (norm_comdim - start_ini_time) * 1000))
    }
    total_progress <- total_progress + pieceBar
    utils::setTxtProgressBar(progress_bar, value = total_progress)
  }

  # res_calib$NormMB$i <- 'Norm'
  names(res_calib$NormMB) <- TableLabels

  nR <- nrow(Xnorm_mat)
  nC <- ncol(Xnorm_mat)

  # COMPRESSION

  # Replace each block with a (possibly) smaller score matrix that spans the
  # same sample space; 'Normal' leaves the blocks untouched.
  s_r <- Compress_Data_2020(s_n, CompMethod, Partitions)

  end_comp_time <- Sys.time()

  if (loquace) {
    message(sprintf("Compression finished after : %s millisecs", (end_comp_time - start_ini_time) * 1000))
  }

  total_progress <- total_progress + pieceBar * ntable
  utils::setTxtProgressBar(progress_bar, value = total_progress)

  ## DO ComDim
  # ini_comdim <- Sys.time()

  # saliences <- list()
  saliences <- matrix(, ncol = ndim, nrow = ntable)
  # Q <- list()
  Q <- matrix(, ncol = ndim, nrow = nrowMB)
  unexplained <- ntable # Warning!: true only if the tables were set to norm=1
  varexp <- as.vector(NULL)

  # One pass per Common Dimension: alternate between (a) SVD of the
  # salience-weighted concatenation W to get a candidate global score and
  # (b) updating the per-block saliences lambda, until q stabilises.
  for (dims in 1:ndim) {
    previousfit <- 10000
    deltafit <- 1000000
    lambda <- matrix(rep(1, ntable), nrow = ntable, ncol = 1)

    qini <- as.vector(s_r[[1]][, 1])
    qini <- qini / sqrt(as.vector(qini %*% qini)) # random initialization of t + set to unit length
    qold <- 100

    # qold = 100 guarantees at least one iteration, so `usv` below is
    # always defined after the loop.
    iters <- 0
    while (norm(qold - qini, "2") > threshold && iters < 100) {
      iters <- iters + 1
      qold <- qini
      q <- 0

      # Build W = [sqrt(lambda_1) X_1 | ... | sqrt(lambda_B) X_B].
      W <- matrix(, nrow = nrowMB, ncol = 0)
      for (j in 1:ntable) {
        W <- cbind(W, sqrt(lambda[j]) * s_r[[j]])
      }

      usv <- svd(W)
      qtmp <- usv$u[, 1]

      for (j in 1:ntable) {
        # takes each table into account for lambda after PCA
        lambda[j] <- t(qtmp) %*% (s_r[[j]] %*% t(s_r[[j]])) %*% qtmp
        q <- q + lambda[j] * qtmp
      }

      q <- q / sqrt(as.vector(t(q) %*% q)) # standardizes t
      # Fix the sign so the dominant loading direction is positive
      # (the SVD sign is arbitrary).
      if (abs(min(q)) > abs(max(q))) {
        q <- -q
      }
      qini <- q
    } # deltafit>threshold

    saliences[, dims] <- lambda
    Q[, dims] <- q


    # SingVal is square of the SVD singular value
    # because here it is based on X, not X.X'
    # Does NOT work for Kernel & Wide & Tall
    if (CompMethod == "Normal" || CompMethod == "PCT") {
      res_calib$SingVal[dims] <- usv$d[1] * usv$d[1] # from SVD on W
      varexp[dims] <- res_calib$SingVal[dims]^2
    } else {
      varexp[dims] <- 0 # This will be overwritten at the end
      res_calib$SingVal[dims] <- 0 # This will be overwritten at the end
    }


    # Deflate blocks
    # Project each block onto the orthogonal complement of q so the next
    # dimension is orthogonal to this one.
    aux <- diag(nrowMB) - as.matrix(q) %*% t(as.matrix(q))
    for (j in 1:ntable) {
      s_r[[j]] <- aux %*% s_r[[j]]
    }

    iter_comdim <- Sys.time()
    if (loquace) {
      message(sprintf("Component %s determined after : %s millisecs", dims, (iter_comdim - start_ini_time) * 1000))
    }
    total_progress <- total_progress + pieceBar
    utils::setTxtProgressBar(progress_bar, value = total_progress)
  }

  # res_calib$SingVal$i <- 'Singular value'
  names(res_calib$SingVal) <- DimLabels # Dimensions

  # Q$i <- data[[1]]$Samples
  # Q$v <- DimLabels
  colnames(Q) <- DimLabels
  # NOTE(review): MB@Samples is the raw slot (elsewhere sampleNames(MB) is
  # used); confirm it yields a plain character vector suitable for rownames.
  rownames(Q) <- MB@Samples
  res_calib$Q <- Q
  rm(Q)

  ## Adding metadata. Metadata extracted from the first block
  res_calib$metadata <- list()
  if (length(MB@Metadata) != 0) {
    res_calib$metadata[[1]] <- MB@Metadata
  }


  end_comdim <- Sys.time()
  if (loquace) {
    message(sprintf("Scores finished after : %s millisecs", (end_comdim - start_ini_time) * 1000))
  }
  total_progress <- total_progress + pieceBar
  utils::setTxtProgressBar(progress_bar, value = total_progress)

  # NOTE(review): meanW is computed but never used afterwards — candidate
  # for removal (kept here for behavioural fidelity).
  meanW <- as.vector(NULL)
  for (j in 1:ntable) {
    meanW <- c(meanW, mean(s_r[[j]]))
  }
  # res_calib$s_n <- s_n
  # res_calib$s_r <- s_r
  rm(s_n)
  rm(s_r)

  ##
  # Proportion of inertia per dimension; for Kernel/Tall/Wide this is
  # recomputed further below once SingVal is reconstructed from b.
  # explained <- list()
  explained <- varexp / sum(varexp)
  names(explained) <- DimLabels
  # explained$i <-'% explained'
  # explained$v <- DimLabels #Dimensions
  res_calib$explained <- explained
  rm(varexp)
  rm(explained)

  ## Calculate Sums of saliences for each Dim
  # Sum_saliences_Dim <- list()
  Sum_saliences_Dim <- colSums(saliences)
  names(Sum_saliences_Dim) <- DimLabels
  # Sum_saliences_Dim$i <-'Sum Dim Saliences'
  # Sum_saliences_Dim$v <- DimLabels #Dimensions

  res_calib$Sum_saliences_Dim <- Sum_saliences_Dim
  rm(Sum_saliences_Dim)

  # Calculate Sums of saliences for each Table
  # Sum_saliences_Tab <- list()
  Sum_saliences_Tab <- rowSums(saliences)
  # Sum_saliences_Tab$i <- 'Sum Tab Saliences'
  names(Sum_saliences_Tab) <- TableLabels
  # Sum_saliences_Tab$v <- TableLabels #tables

  res_calib$Sum_saliences_Tab <- Sum_saliences_Tab
  rm(Sum_saliences_Tab)

  # saliences$i <- TableLabels #tables
  # saliences$v <- DimLabels #Dimensions
  res_calib$saliences <- saliences
  colnames(res_calib$saliences) <- DimLabels
  rownames(res_calib$saliences) <- TableLabels

  ## Calculate Normalised concatenated Xs ('Calib') from col
  # Calculate concatenated CD loadings
  # Reorganise Loadings - 1 matrix / LV

  ## If Output = NULL, nothing else is calculated
  # if(!(is.null(output))){
  L_CD_Vec <- NULL
  L_X_Vec <- NULL
  Calib <- NULL

  nCalib <- nrowMB
  # }

  ## Already defined in during ComDim initiallization
  # isT <- grepl(output, 'T')
  # isL <- grepl(output, 'L')
  # isP <- grepl(output, 'P')
  # isLP <- c(isP,isL)
  # isTLP <- c(isT,isP,isL)

  # tic
  # Calculate concatenated CD loadings
  # Reorganise Loadings - 1 matrix / LV
  # L_CD <- list()
  # L_X <- list()
  T_Loc <- list()
  for (i in 1:ntable) { # Prepare lists
    # L_CD[[TableLabels[i]]] <- matrix(,ncol = ndim, nrow = ncol(temp_tabCalib[[i]]))
    # L_X[[TableLabels[i]]] <- matrix(,ncol = ndim, nrow = ncol(temp_tabCalib[[i]]))
    T_Loc[[TableLabels[i]]] <- matrix(, ncol = ndim, nrow = nrowMB)
  }
  b <- matrix(, ncol = ndim, nrow = ntable) # Used to calculate R2X in Kernel and Tall

  if (!requireNamespace("pracma", quietly = TRUE)) {
    stop("The package pracma is needed.")
  }

  # For each dimension: local loadings p_b = X_b' q, local scores
  # t_b = X_b p_b (p_b' p_b)^-1, then deflate X_b by q before the next
  # dimension.
  for (j in 1:ndim) {
    T_mat <- matrix(, nrow = nrowMB, ncol = 0)

    for (i in 1:ntable) {
      # Q.d are orthonormal in ICA & PCA

      temp <- t(temp_tabCalib[[i]]) %*% res_calib$Q[, j] # Scaled CD 'local' Loadings

      # if  (!(is.null(isP))){
      # L_CD[[TableLabels[i]]][,j] <- temp
      # }

      # if  (!(is.null(isL))){
      #  #Q.d are orthonormal in ICA & PCA
      #  L_X[[TableLabels[i]]][,j] <- t(MB@Data[[i]])%*%Q$Data[,j] #Unscaled X 'local' Loadings;
      # }

      # T_mat <- cbind(T_mat,temp_tabCalib[[i]]%*%(temp/as.vector(t(temp)%*%temp))) #local Scores
      T_mat <- cbind(T_mat, temp_tabCalib[[i]] %*% (temp %*% pracma::pinv(t(temp) %*% temp))) # local Scores

      # if(!(is.null(isT))){
      # T_Loc[[TableLabels[i]]][,j] <- temp_tabCalib[[i]]%*%(temp/as.vector(t(temp)%*%temp)) # local Scores
      T_Loc[[TableLabels[i]]][, j] <- temp_tabCalib[[i]] %*% (temp %*% pracma::pinv(t(temp) %*% temp)) # local Scores
      # }


      # Deflate each temp_tabCalib
      temp_tabCalib[[i]] <- temp_tabCalib[[i]] - res_calib$Q[, j] %*% t(temp)
    }


    # For each CC
    # MLR b-coefficients between Local and Global Scores
    # [b0,b[,j]] <- mlr_DB(T_mat,Q$d[,j],0)
    # b=pinv(X'*X)*X'*Y;
    b[, j] <- pracma::pinv(t(T_mat) %*% T_mat) %*% t(T_mat) %*% res_calib$Q[, j]
    # b0 <- 0
  }

  for (i in 1:ntable) {
    rownames(T_Loc[[TableLabels[i]]]) <- rownames(res_calib$Q)
    colnames(T_Loc[[TableLabels[i]]]) <- colnames(res_calib$Q)
  }

  # If Output==[], nothing else is calculated
  # if(!(is.null(output))){
  # Calculate Global Loadings
  # if(!(is.null(isP))){
  L_CD_Vec <- t(Xnorm_mat) %*% res_calib$Q # Scaled CD 'global' Loadings
  # }

  # if(!(is.null(isL))){
  #  L_X_Vec <- t(X_mat)%*%Q$Data # Unscaled X 'global' Loadings
  # }
  # }

  #### If Output==[], nothing else is calculated
  load_comdim <- Sys.time()
  if (loquace) {
    message(sprintf("Loadings finished after : %s millisecs", (load_comdim - start_ini_time) * 1000))
  }

  total_progress <- total_progress + pieceBar
  utils::setTxtProgressBar(progress_bar, value = total_progress)

  rm(X_mat)
  rm(temp_tabCalib)
  rm(temp)

  ## Output
  end_function <- Sys.time()

  # res_calib$b <- b
  # colnames(res_calib$b) <- DimLabels
  # rownames(res_calib$b) <- TableLabels
  # res_calib$b$i <- TableLabels # tables
  # res_calib$b$v <- DimLabels  # Dimensions

  # if(!(is.null(isT))){
  res_calib$T_Loc <- T_Loc
  # res_calib$T_Loc$i <- res_calib$Q$i # samples
  # res_calib$T_Loc$v <- DimLabels # Dimensions
  # }

  # T_Loc is no longer needed
  rm(T_Loc)


  # if(!(is.null(isP))){

  varnames <- vector()
  for (i in 1:ntable) {
    varnames <- append(varnames, MB@Variables[[i]])
  }
  # if(length(which(duplicated(varnames))) != 0){
  #  warning('There are at least two variables with the same name. Their name in the rownames of P has been altered.\n')
  # }

  res_calib$P <- L_CD_Vec
  # res_calib$P$i <- varnames # numbers of all variables;
  # res_calib$P$v <- DimLabels # Dimensions
  colnames(res_calib$P) <- DimLabels
  rownames(res_calib$P) <- varnames

  # res_calib$P_Loc <- L_CD
  # res_calib$P_Loc$i <- TableLabels # Tables
  # res_calib$P_Loc$v <- DimLabels # Dimensions
  # for(i in 1:ntable){
  #  rownames(res_calib$P_Loc[[TableLabels[i]]]) <- MB@Variables[[i]]
  #  colnames(res_calib$P_Loc[[TableLabels[i]]]) <- DimLabels
  # }
  # }

  rm(L_CD_Vec)
  # rm(L_CD)

  # if(!(is.null(isL))){
  #   res_calib$Lx$Data <- L_X_Vec
  #   res_calib$Lx$i <- t(c(1:nC)) # numbers of all variables;
  #   res_calib$Lx$v <- DimLabels # Dimensions
  #   colnames(res_calib$Lx$Data) <- DimLabels
  #   rownames(res_calib$Lx$Data) <- t(c(1:nC))
  #   #res_calib$Lx_Loc$i <- TableLabels # Tables
  #   #res_calib$Lx_Loc$v <- DimLabels # Dimensions
  #   res_calib$Lx_Loc$Data <- L_X
  #   for (i in 1:ntable){
  #     colnames(res_calib$Lx_Loc$Data[[TableLabels[i]]]) <- DimLabels
  #     rownames(res_calib$Lx_Loc$Data[[TableLabels[i]]]) <- MB@Variables[[i]]
  #   }
  # }
  # rm(L_X_Vec)
  # rm(L_X)

  ##
  # Singular value calculated from b and saliences
  # Since :
  # b_T_Q(:,j)=saliences.d(:,j).*saliences.d(:,j)/SingVal.d(1,j);
  # For the compressed methods the SVD singular values of W are not those of
  # the uncompressed problem, so SingVal is reconstructed here from the MLR
  # coefficients b and the saliences.
  if (CompMethod == "Kernel" || CompMethod == "Tall" || CompMethod == "Wide") {
    for (dims in 1:ndim) {
      res_calib$SingVal[dims] <- t(b[, dims]) %*% ((saliences[, dims]^2) / as.vector(t(b[, dims]) %*% b[, dims]))
    }
    names(res_calib$SingVal) <- DimLabels
    varexp <- res_calib$SingVal^2
    res_calib$explained <- varexp / sum(varexp)
    names(res_calib$explained) <- DimLabels
  }

  rm(saliences)

  # Define block length
  # belong_block maps every row of P.loadings back to its source block name.
  belong_block <- rep(0, nrow(res_calib$P))
  k <- 1
  for (i in 1:ntable) {
    belong_block[k:(k + variable_number[i] - 1)] <- TableLabels[i]
    k <- k + variable_number[i]
  }

  #### If Output==[], nothing else is calculated
  # if(normalise == 1){
  #  res_calib$Xnorm_mat$i <- res_calib$Q$i # samples
  #  res_calib$Xnorm_mat$v <- t(c(1:nC)) # numbers of all variables;
  #  res_calib$Xnorm_mat$Data <- Xnorm_mat
  # }
  # rm(Xnorm_mat)
  #### If Output==[], nothing else is calculated

  end_output <- Sys.time()
  running_time <- (end_output - start_ini_time)
  if (loquace) {
    message(sprintf("Analysis finished after : %s seconds", running_time))
  }
  res_calib$runtime <- running_time # Total time of analysis

  progress_bar <- utils::txtProgressBar(min = 0, max = 80, style = 3, char = "=")
  utils::setTxtProgressBar(progress_bar, value = 80)

  close(progress_bar)

  # Save data in a ComDim structure
  # Supervised-analysis slots (R2Y, Q2, DQ2, PLS.model, cv, Prediction) are
  # intentionally left empty for this unsupervised (PCA) variant.
  res_calib <- new("ComDim",
    Method = "PCA",
    ndim = ndim,
    Q.scores = res_calib$Q,
    T.scores = res_calib$T_Loc,
    P.loadings = res_calib$P,
    Saliences = res_calib$saliences,
    Orthogonal = list(),
    R2X = res_calib$explained,
    R2Y = vector(),
    Q2 = vector(),
    DQ2 = vector(),
    Singular = res_calib$SingVal,
    Mean = list(MeanMB = res_calib$MeanMB, MeanY = NULL),
    Norm = list(NormMB = res_calib$NormMB),
    PLS.model = list(),
    cv = list(),
    Prediction = list(),
    Metadata = res_calib$metadata,
    variable.block = belong_block,
    runtime = as.numeric(res_calib$runtime)
  )
  return(res_calib)
}


#' Compress_Data_2020
#'
#' Compress large multi-block objects. Internal function of ComDim_PCA().
#' @param s_n The multi-block object.
#' @param CompMethod It can be 'Normal' (default), 'Kernel', 'PCT', 'Tall' or 'Wide'.
#' @param Partitions The number of partitions.
#' @return The compressed multi-block.
#' @references Original MATLAB implementation:
#'   \url{https://github.com/DNRutledge/ComDim/blob/main/Compress_Data_2020.m}
#' @keywords internal
Compress_Data_2020 <- function(s_n, CompMethod, Partitions) {
  # Compress each block of a multi-block list prior to the ComDim iterations,
  # so the repeated SVDs inside ComDim_PCA run on smaller matrices.
  #
  # s_n:        list of numeric matrices (one per block), all sharing the
  #             same number of rows (samples).
  # CompMethod: 'Normal' (no compression), 'PCT' (scores U.D from a full
  #             SVD), 'Kernel' (scores from the n x n kernel X.X'/n, valid
  #             when n < p), 'Tall' (row-partitioned PCT, see
  #             PCA_Tall_PCT_DNR) or 'Wide' (column-partitioned SVD scores).
  # Partitions: number of partitions used by 'Tall' and 'Wide'.
  #
  # Returns a list of compressed matrices, one per block.  A NULL CompMethod
  # yields an empty list (behaviour preserved from the original code).
  #
  # Note: the original signature used unusable self-referential defaults
  # (s_n = s_n, ...); dropping them is backward compatible since any call
  # relying on them would have errored.
  #
  # Bug fix: diag(x) with a length-1 numeric builds a round(x)-order
  # identity matrix instead of a 1 x 1 matrix, breaking the SVD-based
  # branches on single-column inputs; diag(x, nrow = length(x)) is used
  # everywhere instead.
  s_r <- list()
  ntable <- length(s_n)
  nR <- nrow(s_n[[1]])

  if (!is.null(CompMethod)) {
    if (CompMethod == "Tall") {
      for (i in seq_len(ntable)) {
        # Rank cannot exceed min(n, p); keep every possible component.
        MaxRank <- min(c(nrow(s_n[[i]]), ncol(s_n[[i]])))
        # Tall segmented PCT
        s_r[[i]] <- PCA_Tall_PCT_DNR(Xn = s_n[[i]], PCs = MaxRank, Partitions = Partitions)
      }
    } else if (CompMethod == "Wide") {
      for (i in seq_len(ntable)) {
        # SVD each column-partition separately and concatenate the scores.
        Xi <- ColumnsPartition(Xn = s_n[[i]], Partitions = Partitions)
        T_all <- matrix(, nrow = nrow(s_n[[i]]), ncol = 0)
        for (p in seq_len(Partitions)) {
          usv <- svd(Xi[[p]])
          T_in <- usv$u %*% diag(usv$d, nrow = length(usv$d))
          T_all <- cbind(T_all, T_in)
        }
        s_r[[i]] <- T_all
      }
    } else if (CompMethod == "Kernel") {
      for (i in seq_len(ntable)) {
        # n x n association matrix; its eigenstructure reproduces the
        # sample-space geometry of the block.
        Kern <- (s_n[[i]] %*% t(s_n[[i]])) / nR
        usv <- svd(Kern)
        s_r[[i]] <- sqrt(nR) * usv$u %*% diag(sqrt(usv$d), nrow = length(usv$d))
      }
    } else if (CompMethod == "PCT") {
      for (i in seq_len(ntable)) {
        # Principal component scores U.D carry all the block's information
        # (X = U D V') in at most min(n, p) columns.
        usv <- svd(s_n[[i]])
        s_r[[i]] <- usv$u %*% diag(usv$d, nrow = length(usv$d))
      }
    } else if (CompMethod == "Normal") {
      for (i in seq_len(ntable)) {
        s_r[[i]] <- s_n[[i]]
      }
    }
  }
  return(s_r)
}


#' PCA_Tall_PCT_DNR
#'
#' Compress a block using the PCT method. Internal function of ComDim_PCA().
#' @param Xn The block to compress.
#' @param PCs Number of PCs to keep.
#' @param Partitions The number of partitions.
#' @return The compressed block.
#' @keywords internal
PCA_Tall_PCT_DNR <- function(Xn, PCs, Partitions) {
  # Compress a tall block via a row-partitioned principal-component
  # transform: X'X is accumulated over row-partitions of Xn, its
  # eigenvectors give the loadings, and each partition's scores are stacked
  # back in the original row order.  The result equals Xn %*% V[, 1:PCs]
  # for any partition count; partitioning only bounds memory use.
  #
  # Xn:         numeric matrix to compress (samples x variables).
  # PCs:        number of leading components to keep.
  # Partitions: number of row partitions (see RowsPartition).
  #
  # Returns the nrow(Xn) x PCs matrix of principal-component scores.
  #
  # Notes vs the original: the unused `cols` local and the redundant
  # Tm/cbind step were removed; `drop = FALSE` fixes a failure when
  # PCs == 1 (the single score column collapsed to a vector and rbind()
  # against the 0-row, PCs-column accumulator errored); the unusable
  # self-referential defaults were dropped (backward compatible).
  W <- RowsPartition(Xn, Partitions)

  # Accumulate X'X as the sum of the partitions' cross-products.
  D <- t(W[[1]]) %*% W[[1]]
  if (Partitions > 1) {
    for (par in 2:Partitions) {
      D <- D + (t(W[[par]]) %*% W[[par]])
    }
  }

  vs <- eigen(D)
  # eigen() returns eigenvalues in decreasing order; the columns of
  # vs$vectors are the corresponding eigenvectors (the PCA loadings of Xn).
  vs_flipped <- vs$vectors

  Taux <- matrix(, nrow = 0, ncol = PCs)
  for (par in seq_len(Partitions)) {
    to <- W[[par]] %*% vs_flipped
    Taux <- rbind(Taux, to[, 1:PCs, drop = FALSE])
  }

  return(Taux)
}

#' ColumnsPartition
#'
#' Calculate vertical partitions in a block. Internal function of ComDim_PCA().
#' @param Xn A block.
#' @param Partitions The number of partitions.
#' @return A list of column-partitioned sub-matrices.
#' @keywords internal
ColumnsPartition <- function(Xn = Xn, Partitions = Partitions) {
  # Split the columns of `Xn` into `Partitions` consecutive chunks.
  # All chunks share the same base width floor(ncol / Partitions); the
  # final chunk absorbs any leftover columns when the count does not
  # divide evenly.  Returns a list of sub-matrices in column order.
  n_col <- ncol(Xn)
  base_width <- floor(n_col / Partitions)
  leftover <- n_col %% Partitions

  chunks <- vector("list", Partitions)
  first <- 1
  for (k in seq_len(Partitions)) {
    last <- first + base_width - 1
    if (k == Partitions) {
      # Last chunk takes the remainder columns.
      last <- last + leftover
    }
    chunks[[k]] <- Xn[, first:last]
    first <- first + base_width
  }
  chunks
}

#' RowsPartition
#'
#' Calculate horizontal partitions in a block. Internal function of ComDim_PCA().
#' @param Xn A block.
#' @param Partitions The number of partitions.
#' @return A list of row-partitioned sub-matrices.
#' @keywords internal
RowsPartition <- function(Xn = Xn, Partitions = Partitions) {
  # Split the rows of `Xn` into `Partitions` consecutive chunks.
  # All chunks share the same base height floor(nrow / Partitions); the
  # final chunk absorbs any leftover rows when the count does not divide
  # evenly.  Returns a list of sub-matrices in row order.
  n_row <- nrow(Xn)
  base_height <- floor(n_row / Partitions)
  leftover <- n_row %% Partitions

  chunks <- vector("list", Partitions)
  first <- 1
  for (k in seq_len(Partitions)) {
    last <- first + base_height - 1
    if (k == Partitions) {
      # Last chunk takes the remainder rows.
      last <- last + leftover
    }
    chunks[[k]] <- Xn[first:last, ]
    first <- first + base_height
  }
  chunks
}

Try the R.ComDim package in your browser

Any scripts or data that you put into this service are public.

R.ComDim documentation built on May 13, 2026, 9:07 a.m.