CSSCA: ClusterSSCA: Cluster-wise Simultaneous Component Analysis

Documented in MainCSSCA

#' The main function of the CSSCA method. A rational starting partition is required for the analysis.
#' To obtain the deafult starting partitions (of clusterwise PCA and of iCluster), higher-level function FixedCSSCA should be used
#'
#' @param all_data A matrix with concatenated data (the aggregation of the data blocks by rows (entries)). The CSSCA method will be performed on the data.
#' @param nvar A vector of length nblock, with the ith element indicates the number of variables assumed for the ith data block. It could also be an integer; in such cases, we assume all blocks have the same amount of variables.
#' @param nblcok A positive integer indicates the number of data blocks
#' @param ncom An integer indicates the number of common components
#' @param ndistinct A vector of length nblock, with the ith element indicates the number of distinctive components assumed for the ith data block. It could also be an integer; in such cases, we assume all data blocks have the same amount of distinctive components.
#' @param ncluster A positive integer indicates the number of clusters assumed. When ncluster = 1, CSSCA boils down to SSCA analysis.
#' @param nobservation A positive integer the number of entries that are included in the dataset
#' @param psparse A number within the range of [0,1] that indicates the sparsity level (i.e. the proportion of zero elements in the loading matrix)
#' @param feed A vector (i.e. partition vector) to serve as rational starts (or semi-rational starts)
#' @param cutoff.prop A number within the range of [0,1]. In each iteration, if the proportion of observations that have changed their cluster memberships is smaller than such value, re-estimation of cluster-specific SSCA solution will not be executed
#' @param n_replace A positive integer indicates the amount of observations that have changed their cluster memberships to create the semi-rational starts
#' @param n_replicate A positive integer implies the amount of replicates when the n_replace is fixed (e.g. when n_replace = 1, the algorithm will generate n_replicate semi-rational starts,
#' each of which is generated by randomly change the membership of one of the observation)
#' @param rate A number within the range of [0,1] to implicate the retain rate of starting partitions after the first two iterations.
#' @return a list of four elements. The first element is vector that indicates the resulting cluster partitions, the nth element refers
#' to the cluster assignment of the \code{nth} entry;
#' the second element is a numeric value that is the optimal (minimal) loss function obtained from many starts;
#' the third element is a list that displays cluster-specific loading matrices;
#' the forth element is a list that displays cluster-specific score matrices;
#' @examples
#' n_cluster <- 3
#' mem_cluster <- c(50,50,50) # 50 entries in each cluster
#' n_obs <- sum(mem_cluster)
#' n_block <- 2
#' n_com <- 2
#' n_distinct <- c(1,1) #1 distinctive components in each block
#' n_var <- c(15,9)
#' p_sparse <- 0.5
#' p_noise <- 0.3
#' p_combase <- 0.5 # moderate similarity
#' p_fixzero <- 0.5 # moderate similarity
#' mean_v  <- 0.1
#' # extract the data from the simulation
#' (not run)  sim <- CSSCASimulation(n_cluster, mem_cluster, n_block, n_com, n_distinct, n_var, p_sparse,
#'  p_noise, p_combase, p_fixzero, "both", mean_v)
#'  target_data <- sim$concatnated_data
#'  # feed the data with original cluster assignment and estimate with the CSSCA method
#'  results <- MainCSSCA(target_data, n_var, n_block, n_com, n_distinct, n_cluster, n_obs, p_sparse, sim$cluster_assign, n_replace = 5)
#'
MainCSSCA <- function(all_data, nvar, nblock, ncom, ndistinct, ncluster, nobservations, psparse, feed, cutoff.prop = 1/6, n_replace, n_replicate = 3, rate = 1/10){
#######################################################################################################
################################## the set-ups in the current algorithm
##################################################################################################
# set the upper bound of loss function
upper <- 1e9
# set the values of the flags
flag <- 0

# set the maximum iteration time for one single random start
MAXITER <- 100

if (length(nvar) == 1){
  nvar <- rep(nvar, nblock)
}

if (length(ndistinct) ==1){
  ndistinct <- rep(ndistinct, nblock)
}

# the aggregate level of information
all_member <- nobservations
all_var <- sum(nvar)
all_component <- sum(ncom, ndistinct)

# compute the fixed positions of zeros
distinct_index <- vector("numeric")
sum_var <- 0
# useless onwards
ini_var <- 0

# p: the number of block
for (p in 1:nblock){
  distinct_index <- c(distinct_index, rep(p, ndistinct[p]))
  ini_var <- ini_var + nvar[p]
  sum_var <- c(sum_var, ini_var)
}

distinct_zeros <- vector("numeric")

# r.distinct: the number of distinctive component
if(sum(ndistinct) != 0){
  for (r.distinct in 1:sum(ndistinct)){
    distinct_zeros <- c(distinct_zeros, ((all_var * (ncom + r.distinct - 1)) + sum_var[distinct_index[r.distinct]] + 1): ((all_var * (ncom + r.distinct - 1)) + sum_var[(distinct_index[r.distinct] + 1)]))
  }
}

## the amount of sparseness-induced zeros
no_fixed_zeros <- all_var * all_component - length(distinct_zeros)
# compute the number of zero loadings
zeros <- round(no_fixed_zeros * psparse)

# set the upper bound of minimum loss
loss_min <- upper

# step 1b: generate the multiple starts
## the rule: (1) n_replace in [1,5] -> 15 replications (75)
## (2) n_place in [6, 10] -> 10 replications (50)
## (3) the rest -> 3 replications (3 * 25)
# caculate the amount of multiple starts
n_starts <- 0

# if n_replace = 0, then please directly use the cssca_quick_feed algorithm
if ((n_replace > 0) & (n_replace < 6)){
  n_starts <- 15 * n_replace #overall 75 starts
}
if ((n_replace > 5) & (n_replace < 11)){
  n_starts <- 75 + (n_replace - 5) * 10 #overall 50 starts
}
if (n_replace > 10){
  n_starts <- 125 + (n_replace - 10) * n_replicate #3 * (replace - 10 starts)
}

all_starts <- list(length = n_starts)
for (i in 1:n_starts){
  if ((i > 0) & (i < 76)){
    all_starts[[i]] <- ExchangeMem(feed, ncluster, (floor((i - 1) / 15) + 1))
  }
  if ((i > 75) & (i < 126)){
    all_starts[[i]] <- ExchangeMem(feed, ncluster, (floor((i - 76) / 10) + 6))
  }
  if (i > 125) {
    all_starts[[i]] <- ExchangeMem(feed, ncluster, (floor((i - 126) / n_replicate) + 11))
  }
}

# the current_ families are the parameters that are involved in the iterations in later steps
current_data <- list()
## the current are the lists with n_starts elements (and each element is a sublist with k elements)
current_loading <- list()
current_score <- list()
current_loss <- list()

# the overall_ families are the paramters that include all the parameters in multi-start procedure
overall_data <- list()
## the current are the lists with n_starts elements (and each element is a sublist with k elements)
overall_loading <- list()
overall_score <- list()
overall_loss <- list()

###########################################################################################
## the following objects could only include ncluster things
temp_score <- list()
temp_loading <- list()
temp_mem <- list()
temp_loss <- vector(length = ncluster)
# the amount of members in a certain cluster
current_mem_n <- vector(length = ncluster)
# the proportions of the observations that are from a specific cluster and that have changed their memberships
change_prop <- vector(length = ncluster)
# the amount of the observations in a specific cluster that have changed their memberships
change <- vector(length = ncluster)
####################################################################################################

##############################################################################################
## the following objects could be a single vector (the records for t)
# the loss function in a specific iteration
loss_iteration <- vector("numeric")
# the number of observations that have changed their memberships in a given iteration
change_iteration <- vector("numeric")
# record the member updates in step 3 and realize the membership updates in step 4
mem_update <- rep(0, all_member)
# record the total loss after the first iteration
loss_first <- rep(upper, length = n_starts)
mem_first <- list(length = n_starts)

############################################################################################################
############################################################################################################
####### part 1: the first two iterations: check which starts will be pursued further #######################
for (i in 1:n_starts){
  # determine the initial partition
  cluster_mem <- all_starts[[i]]
  # initiate the elements of the current_ and temp_
  overall_data[[i]] <- list()
  overall_loading[[i]] <- list()
  overall_score[[i]] <- list()
  overall_loss[[i]] <- vector(length = ncluster)

  # the total loss function of a given start partition
  loss_tot <- 0
  flag <- 0

  ## the very first iteration
  for (k in 1:ncluster){
    temp_mem[[k]] <- which(cluster_mem == k)
    overall_data[[i]][[k]] <- all_data[which(cluster_mem == k), ]
    # check whether any cluster has < (#all components + 1) observations. if so, the SVD could not be performed, and the iterations
    # should be broken (starting from another random assignment)
    if (length(overall_data[[i]][[k]]) == 0) {
      flag <- 1
      break
    }
    if (is.null(overall_data[[i]][[k]])) {
      flag <- 1
      break
    }
    if (is.null(nrow(overall_data[[i]][[k]]))) {
      flag <- 1
      break
    }
    if (!is.null(overall_data[[i]][[k]]) & (nrow(overall_data[[i]][[k]])) < (all_component + 1)){
      flag <- 1  # flag for breaking the loop
      break      # for the loop k
    }
    ssca_results <- IntSparseSca_random_cpp(overall_data[[i]][[k]], nvar, nblock, all_component, distinct_zeros, psparse)
    loss_tot <- loss_tot + ssca_results$loss
    # in the influence approach, only the loading matrices that matter
    overall_loading[[i]][[k]] <- ssca_results$loadings
    overall_score[[i]][[k]] <- ssca_results$scores
    overall_loss[[i]][k] <- ssca_results$loss
    ## change the current position is enabled
    #current_position[[k]] <- which(current_loading[[k]] == 0)
  }

  if (flag == 1)  next

  for (j in 1:all_member){
    # set an impossible upper bound
    min_change <- upper
    for (k in 1:ncluster){
      # the member belongs to a certain cluster
      if (cluster_mem[j] == k){
        ## get the loss function without observation j
        # the data matrix without j
        new_data <- all_data[setdiff(temp_mem[[k]], j), ]
        xi <- MatrixCenter_cpp(new_data, 1, 0)

        ## update T and P for only one iteration
        xp <- t(xi %*% overall_loading[[i]][[k]])
        xp_svd <- svd(xp)
        # the new score
        matrix_xi <- xp_svd$v %*% t(xp_svd$u)
        # the new loading
        P1 <- t(xi) %*% matrix_xi
        # add the uniqueness induced zeros and sparsenes induced zeros
        P1[distinct_zeros] <- 0
        P1[ (sort(abs(P1), index.return = TRUE)$ix)[1:(zeros + length(distinct_zeros))]] <- 0
        #P1[current_position[[k]]] <- 0

        u <- sum((xi - matrix_xi %*% t(P1)) ^ 2)

        # update the current loading
        temp_loading[[k]] <- P1
        # update the current score
        temp_score[[k]] <- matrix_xi
        # update the current loss
        temp_loss[k] <- u

        # the loss function with observation j is the old loss function
        # we could thus compute the change in loss
        loss_change <- overall_loss[[i]][k] - temp_loss[k]
      }

      # the member does not belong to a certain cluster
      if (cluster_mem[j] != k) {
        # the loss function with observation j
        # the data matrix with j
        new_data <- all_data[c(temp_mem[[k]], j), ]
        xi <- MatrixCenter_cpp(new_data, 1, 0)

        ## update T and P for only one iteration
        xp <- t(xi %*% overall_loading[[i]][[k]])
        xp_svd <- svd(xp)
        # the new score
        matrix_xi <- xp_svd$v %*% t(xp_svd$u)
        # the new loading
        P1 <- t(xi) %*% matrix_xi
        P1[distinct_zeros] <- 0
        P1[ (sort(abs(P1), index.return = TRUE)$ix)[1:(zeros + length(distinct_zeros))]] <- 0
        #P1[current_position[[k]]] <- 0

        u <- sum((xi - matrix_xi %*% t(P1)) ^ 2)

        # update the current loading
        temp_loading[[k]] <- P1
        # update the current score
        temp_score[[k]] <- matrix_xi
        # update the current loss
        temp_loss[k] <- u

        # the loss function with observation j is the old loss function
        # we could thus compute the change in loss
        loss_change <- temp_loss[k] - overall_loss[[i]][k]
      }

      # assign the jth observation to the best fit cluster
      if (loss_change < min_change){
        min_change <- loss_change
        # update the membership status of this certain observation
        mem_update[j] <- k
      }
    }
  }

  # the first iteration of membership exchange
  for (j in 1:all_member){
    if (cluster_mem[j] != mem_update[j]){
      sign <- 1
      c <- cluster_mem[j]
      cluster_mem[j] <- mem_update[j]
      # remove the current observation from the previous cluster
      #current_mem[[i]][[c]] <- setdiff(current_mem[[i]][[c]], j)
      # add the current observation to the new cluster
      #current_mem[[i]][[mem_update[j]]] <- c(current_mem[[i]][[mem_update[j]]], j)
    }
  }

  # record the cluster partition
  mem_first[[i]] <- cluster_mem

  ### the second iteration of the SSCA solution
  # the total loss function of a given start partition
  loss_tot <- 0
  ## the very first iteration
  for (k in 1:ncluster){
    overall_data[[i]][[k]] <- all_data[which(cluster_mem == k), ]
    # check whether any cluster has < (#all components + 1) observations. if so, the SVD could not be performed, and the iterations
    # should be broken (starting from another random assignment)
    if (length(overall_data[[i]][[k]]) == 0) {
      flag <- 1
      loss_tot <- upper
      break
    }
    if (is.null(overall_data[[i]][[k]])) {
      flag <- 1
      loss_tot <- upper
      break
    }
    if (is.null(nrow(overall_data[[i]][[k]]))) {
      flag <- 1
      loss_tot <- upper
      break
    }
    if (!is.null(overall_data[[i]][[k]]) & (nrow(overall_data[[i]][[k]])) < (all_component + 1)){
      flag <- 1  # flag for breaking the loop
      loss_tot <- upper
      break      # for the loop k
    }

    ssca_results <- IntSparseSca_random_cpp(overall_data[[i]][[k]], nvar, nblock, all_component, distinct_zeros, psparse)
    loss_tot <- loss_tot + ssca_results$loss
    # in the influence approach, only the loading matrices that matter
    overall_loading[[i]][[k]] <- ssca_results$loadings
    overall_score[[i]][[k]] <- ssca_results$scores
    overall_loss[[i]][k] <- ssca_results$loss
    ## change the current position is enabled
    #current_position[[k]] <- which(current_loading[[k]] == 0)
  }
  if (loss_tot!= 0){
    loss_first[i] <- loss_tot
  }
  if (loss_tot == 0){
    loss_first[i] <- upper
  }
}


#############################################################################
## selection of the best several starts (ceiling(rate * n_starts))
retain_n <- floor(rate * n_starts)
# delete all the index that are zeros
number_zeros <- sum(loss_first == 0)
retain_index <- sort(loss_first, index.return = TRUE)$ix[(1 + number_zeros) : (retain_n + number_zeros)]
#############################################################################
#############################################################################

# step 3: perform the SSCA for all starts that are retained until converge
opt_score <- list()
opt_loading <- list()
opt_loss <- vector(length = ncluster)
opt_loss_tot <- upper
opt_cluster_mem <- vector()
opt_loss_iteration <- vector()
loss_overall <- vector()

for (i in 1:retain_n){

  index_starts <- retain_index[i]
  if (loss_first[index_starts] == upper) next
  conv <- 0
  current_loading <- overall_loading[[index_starts]]
  current_score <- overall_score[[index_starts]]
  current_loss <- overall_loss[[index_starts]]
  cluster_mem = mem_first[[index_starts]]
  loss_tot <- loss_first[index_starts]
  min_loss <- upper
  loss_iteration <- vector()
  change_iteration <- vector()

  for (k in 1:ncluster){
    temp_mem[[k]] <- which(cluster_mem == k)
  }

  iter = 0
  conv <- 0

  while (conv == 0){

    # the sign of whethe there are emembership exchanges
    iter = iter + 1
    sign <- 0

    # in the first iteration, this things would not have been done
    if (iter != 1){

      loss_tot <- 0

      ## Step 1: Check the current number of members in each group
      for (k in 1:ncluster){
        current_mem_n[k] <- length(which(cluster_mem == k))
        change_prop[k] <- change[k] / current_mem_n[k]
      }

      ## Step 2: Perfrom the initial full SSCA for each cluster (note that this is the special SSCA that is
      ## performed once in each and every iteration)
      # not random start (rational start from the based on the value of singular value decomposition)
      for (k in 1:ncluster){
        current_data[[k]] <- all_data[which(cluster_mem == k), ]
        # check whether any cluster has < (#all components + 1) observations. if so, the SVD could not be performed, and the iterations
        # should be broken (starting from another random assignment)
        if (length(current_data[[k]]) == 0) {
          flag <- 1
          break
        }
        if (is.null(current_data[[k]])) {
          flag <- 1
          break
        }
        if (is.null(nrow(current_data[[k]]))) {
          flag <- 1
          break
        }
        if (!is.null(current_data[[k]]) & (nrow(current_data[[k]])) < (all_component + 1)){
          flag <- 1  # flag for breaking the loop
          break      # for the loop k
        }

        # random start when the proportion of change members are higher than the pre-set cutoff value(s)
        if (change_prop[k] > cutoff.prop){
          ssca_results <- IntSparseSca_random_cpp(current_data[[k]], nvar, nblock, all_component, distinct_zeros, psparse)
          loss_tot <- loss_tot + ssca_results$loss
          # in the influence approach, only the loading matrices that matter
          current_loading[[k]] <- ssca_results$loadings
          current_score[[k]] <- ssca_results$scores
          current_loss[[k]] <- ssca_results$loss
        }
        ## when the number of changing members is smaller than a certain value, the SSCA algorithm will only perfrom with
        ## starting value equals to last loading matrices
        else{
          ssca_results <- IntSparseSca_rational_full_cpp(current_data[[k]], nvar, nblock, all_component, current_loading[[k]], distinct_zeros, psparse)
          loss_tot <- loss_tot + ssca_results$loss
          # in the influence approach, only the loading matrices that matter
          current_loading[[k]] <- ssca_results$loadings
          current_score[[k]] <- ssca_results$scores
          current_loss[[k]] <- ssca_results$loss
          ## change the current positions is enabled
          #current_position[[k]] <- which(current_loading[[k]] == 0)
        }
      }
    }

    if (flag == 1)  {
      loss_tot <- loss_first[index_starts]
      break
    }

    # the number of observations in each cluster that have changed their cluster membership in the current iteration
    change <- rep(0, ncluster)

    # record the member updates in step 3 and realize the membership updates in step 4
    mem_update <- rep(0, all_member)

    # record the loss functions after every iteraton, and stop the algorithm in case the latest loss
    # function is larger than the previous one (the loss function does not decrease anymore)
    loss_iteration <- c(loss_iteration, loss_tot)

    if (iter != 1){
      # in case due to some randomness, the influence value does not decrease anymore, we keep the previous cluster assignment
      if (loss_iteration[iter] > loss_iteration[iter - 1]) {
        cluster_mem <- cluster_mem_previous
        break
      }
    }


    # Step 3: for each pair of the observation and the cluster, quickly update the corresponding loading matrices
    # wuth and without the observation. Update the cluster assignment of the observation so that the minimal loss functions could be
    # obtained
    for (j in 1:all_member){
      # set an impossible upper bound
      min_change <- upper
      for (k in 1:ncluster){
        # the member belongs to a certain cluster
        if (cluster_mem[j] == k){
          ## get the loss function without observation j
          # the data matrix without j
          new_data <- all_data[setdiff(temp_mem[[k]], j), ]
          xi <- MatrixCenter_cpp(new_data, 1, 0)

          ## update T and P for only one iteration
          xp <- t(xi %*% current_loading[[k]])
          xp_svd <- svd(xp)
          # the new score
          matrix_xi <- xp_svd$v %*% t(xp_svd$u)
          # the new loading
          P1 <- t(xi) %*% matrix_xi
          # add the uniqueness induced zeros and sparsenes induced zeros
          P1[distinct_zeros] <- 0
          P1[ (sort(abs(P1), index.return = TRUE)$ix)[1:(zeros + length(distinct_zeros))]] <- 0
          #P1[current_position[[k]]] <- 0

          u <- sum((xi - matrix_xi %*% t(P1)) ^ 2)

          # update the current loading
          temp_loading[[k]] <- P1
          # update the current score
          temp_score[[k]] <- matrix_xi
          # update the current loss
          temp_loss[k] <- u

          # the loss function with observation j is the old loss function
          # we could thus compute the change in loss
          loss_change <- current_loss[k] - temp_loss[k]
        }

        # the member does not belong to a certain cluster
        if (cluster_mem[j] != k) {
          # the loss function with observation j
          # the data matrix with j
          new_data <- all_data[c(temp_mem[[k]], j), ]
          xi <- MatrixCenter_cpp(new_data, 1, 0)

          ## update T and P for only one iteration
          xp <- t(xi %*% current_loading[[k]])
          xp_svd <- svd(xp)
          # the new score
          matrix_xi <- xp_svd$v %*% t(xp_svd$u)
          # the new loading
          P1 <- t(xi) %*% matrix_xi
          P1[distinct_zeros] <- 0
          P1[ (sort(abs(P1), index.return = TRUE)$ix)[1:(zeros + length(distinct_zeros))]] <- 0
          #P1[current_position[[k]]] <- 0

          u <- sum((xi - matrix_xi %*% t(P1)) ^ 2)

          # update the current loading
          temp_loading[[k]] <- P1
          # update the current score
          temp_score[[k]] <- matrix_xi
          # update the current loss
          temp_loss[k] <- u

          # the loss function with observation j is the old loss function
          # we could thus compute the change in loss
          loss_change <- temp_loss[k] - current_loss[k]
        }

        # assign the jth observation to the best fit cluster
        if (loss_change < min_change){
          min_change <- loss_change
          # update the membership status of this certain observation
          mem_update[j] <- k
        }
      }
    }

    # Step 4: formally update the cluster membership
    cluster_mem_previous <- cluster_mem
    change_current <- 0

    for (j in 1:all_member){
      if (cluster_mem[j] != mem_update[j]){
        sign <- 1
        c <- cluster_mem[j]
        cluster_mem[j] <- mem_update[j]
        # remove the current observation from the previous cluster
        temp_mem[[c]] <- setdiff(temp_mem[[c]], j)
        # add the current observation to the new cluster
        temp_mem[[mem_update[j]]] <- c(temp_mem[[mem_update[j]]], j)
        change[c] <- change[c] + 1
        change_current <- change_current + 1
        change[mem_update[j]] <- change[mem_update[j]] + 1
      }
    }

    change_iteration <- c(change_iteration, change_current)
    ## potantial usable functions: the loss function of other clusters
    #loss_others <- loss_tot_ind - current_loss[[c]] - current_loss[[mem_update]]

    # update the loss function
    #loss_tot_ind <- loss_others + current_loss[[c]] + current_loss[[mem_update]]
    if (sign == 0)  conv <- 1  # when no cluster membership exchanges happen
    if (iter == MAXITER) conv <- 1
  }

  loss_overall <- c(loss_overall, loss_tot)
  # it is possible that the results gained from the starting partition finally produce NULL.
  # We should rule out such situation to avoid the collapse of the functions
  if (!is.null(loss_tot) & !is.na(loss_tot)){
    # summarize overall all starts
    if (loss_tot < opt_loss_tot){
      opt_loss_tot <- loss_tot
      opt_loading <- current_loading
      opt_score <- current_score
      opt_cluster_mem <- cluster_mem
      opt_loss_iteration <- loss_iteration
      opt_change_iteration <- change_iteration
    }
  }
}
# warn the users when the algorothm does not end normally
if (opt_loss_tot == upper){
  warning("The algorithm does not end normally. You need to check a less complex model")
}

return (list(cluster_mem = opt_cluster_mem, loss = opt_loss_tot, loadings = opt_loading, scores = opt_score))
}