#' The main function of the CSSCA method. A rational starting partition is required for the analysis.
#' To obtain the deafult starting partitions (of clusterwise PCA and of iCluster), higher-level function FixedCSSCA should be used
#'
#' @param all_data A matrix with concatenated data (the aggregation of the data blocks by rows (entries)). The CSSCA method will be performed on the data.
#' @param nvar A vector of length nblock, with the ith element indicates the number of variables assumed for the ith data block. It could also be an integer; in such cases, we assume all blocks have the same amount of variables.
#' @param nblcok A positive integer indicates the number of data blocks
#' @param ncom An integer indicates the number of common components
#' @param ndistinct A vector of length nblock, with the ith element indicates the number of distinctive components assumed for the ith data block. It could also be an integer; in such cases, we assume all data blocks have the same amount of distinctive components.
#' @param ncluster A positive integer indicates the number of clusters assumed. When ncluster = 1, CSSCA boils down to SSCA analysis.
#' @param nobservation A positive integer the number of entries that are included in the dataset
#' @param psparse A number within the range of [0,1] that indicates the sparsity level (i.e. the proportion of zero elements in the loading matrix)
#' @param feed A vector (i.e. partition vector) to serve as rational starts (or semi-rational starts)
#' @param cutoff.prop A number within the range of [0,1]. In each iteration, if the proportion of observations that have changed their cluster memberships is smaller than such value, re-estimation of cluster-specific SSCA solution will not be executed
#' @param n_replace A positive integer indicates the amount of observations that have changed their cluster memberships to create the semi-rational starts
#' @param n_replicate A positive integer implies the amount of replicates when the n_replace is fixed (e.g. when n_replace = 1, the algorithm will generate n_replicate semi-rational starts,
#' each of which is generated by randomly change the membership of one of the observation)
#' @param rate A number within the range of [0,1] to implicate the retain rate of starting partitions after the first two iterations.
#' @return a list of four elements. The first element is vector that indicates the resulting cluster partitions, the nth element refers
#' to the cluster assignment of the \code{nth} entry;
#' the second element is a numeric value that is the optimal (minimal) loss function obtained from many starts;
#' the third element is a list that displays cluster-specific loading matrices;
#' the forth element is a list that displays cluster-specific score matrices;
#' @examples
#' n_cluster <- 3
#' mem_cluster <- c(50,50,50) # 50 entries in each cluster
#' n_obs <- sum(mem_cluster)
#' n_block <- 2
#' n_com <- 2
#' n_distinct <- c(1,1) #1 distinctive components in each block
#' n_var <- c(15,9)
#' p_sparse <- 0.5
#' p_noise <- 0.3
#' p_combase <- 0.5 # moderate similarity
#' p_fixzero <- 0.5 # moderate similarity
#' mean_v <- 0.1
#' # extract the data from the simulation
#' (not run) sim <- CSSCASimulation(n_cluster, mem_cluster, n_block, n_com, n_distinct, n_var, p_sparse,
#' p_noise, p_combase, p_fixzero, "both", mean_v)
#' target_data <- sim$concatnated_data
#' # feed the data with original cluster assignment and estimate with the CSSCA method
#' results <- MainCSSCA(target_data, n_var, n_block, n_com, n_distinct, n_cluster, n_obs, p_sparse, sim$cluster_assign, n_replace = 5)
#'
MainCSSCA <- function(all_data, nvar, nblock, ncom, ndistinct, ncluster, nobservations, psparse, feed, cutoff.prop = 1/6, n_replace, n_replicate = 3, rate = 1/10){
#######################################################################################################
################################## the set-ups in the current algorithm
##################################################################################################
# set the upper bound of loss function
upper <- 1e9
# set the values of the flags
flag <- 0
# set the maximum iteration time for one single random start
MAXITER <- 100
if (length(nvar) == 1){
nvar <- rep(nvar, nblock)
}
if (length(ndistinct) ==1){
ndistinct <- rep(ndistinct, nblock)
}
# the aggregate level of information
all_member <- nobservations
all_var <- sum(nvar)
all_component <- sum(ncom, ndistinct)
# compute the fixed positions of zeros
distinct_index <- vector("numeric")
sum_var <- 0
# useless onwards
ini_var <- 0
# p: the number of block
for (p in 1:nblock){
distinct_index <- c(distinct_index, rep(p, ndistinct[p]))
ini_var <- ini_var + nvar[p]
sum_var <- c(sum_var, ini_var)
}
distinct_zeros <- vector("numeric")
# r.distinct: the number of distinctive component
if(sum(ndistinct) != 0){
for (r.distinct in 1:sum(ndistinct)){
distinct_zeros <- c(distinct_zeros, ((all_var * (ncom + r.distinct - 1)) + sum_var[distinct_index[r.distinct]] + 1): ((all_var * (ncom + r.distinct - 1)) + sum_var[(distinct_index[r.distinct] + 1)]))
}
}
## the amount of sparseness-induced zeros
no_fixed_zeros <- all_var * all_component - length(distinct_zeros)
# compute the number of zero loadings
zeros <- round(no_fixed_zeros * psparse)
# set the upper bound of minimum loss
loss_min <- upper
# step 1b: generate the multiple starts
## the rule: (1) n_replace in [1,5] -> 15 replications (75)
## (2) n_place in [6, 10] -> 10 replications (50)
## (3) the rest -> 3 replications (3 * 25)
# caculate the amount of multiple starts
n_starts <- 0
# if n_replace = 0, then please directly use the cssca_quick_feed algorithm
if ((n_replace > 0) & (n_replace < 6)){
n_starts <- 15 * n_replace #overall 75 starts
}
if ((n_replace > 5) & (n_replace < 11)){
n_starts <- 75 + (n_replace - 5) * 10 #overall 50 starts
}
if (n_replace > 10){
n_starts <- 125 + (n_replace - 10) * n_replicate #3 * (replace - 10 starts)
}
all_starts <- list(length = n_starts)
for (i in 1:n_starts){
if ((i > 0) & (i < 76)){
all_starts[[i]] <- ExchangeMem(feed, ncluster, (floor((i - 1) / 15) + 1))
}
if ((i > 75) & (i < 126)){
all_starts[[i]] <- ExchangeMem(feed, ncluster, (floor((i - 76) / 10) + 6))
}
if (i > 125) {
all_starts[[i]] <- ExchangeMem(feed, ncluster, (floor((i - 126) / n_replicate) + 11))
}
}
# the current_ families are the parameters that are involved in the iterations in later steps
current_data <- list()
## the current are the lists with n_starts elements (and each element is a sublist with k elements)
current_loading <- list()
current_score <- list()
current_loss <- list()
# the overall_ families are the paramters that include all the parameters in multi-start procedure
overall_data <- list()
## the current are the lists with n_starts elements (and each element is a sublist with k elements)
overall_loading <- list()
overall_score <- list()
overall_loss <- list()
###########################################################################################
## the following objects could only include ncluster things
temp_score <- list()
temp_loading <- list()
temp_mem <- list()
temp_loss <- vector(length = ncluster)
# the amount of members in a certain cluster
current_mem_n <- vector(length = ncluster)
# the proportions of the observations that are from a specific cluster and that have changed their memberships
change_prop <- vector(length = ncluster)
# the amount of the observations in a specific cluster that have changed their memberships
change <- vector(length = ncluster)
####################################################################################################
##############################################################################################
## the following objects could be a single vector (the records for t)
# the loss function in a specific iteration
loss_iteration <- vector("numeric")
# the number of observations that have changed their memberships in a given iteration
change_iteration <- vector("numeric")
# record the member updates in step 3 and realize the membership updates in step 4
mem_update <- rep(0, all_member)
# record the total loss after the first iteration
loss_first <- rep(upper, length = n_starts)
mem_first <- list(length = n_starts)
############################################################################################################
############################################################################################################
####### part 1: the first two iterations: check which starts will be pursued further #######################
for (i in 1:n_starts){
# determine the initial partition
cluster_mem <- all_starts[[i]]
# initiate the elements of the current_ and temp_
overall_data[[i]] <- list()
overall_loading[[i]] <- list()
overall_score[[i]] <- list()
overall_loss[[i]] <- vector(length = ncluster)
# the total loss function of a given start partition
loss_tot <- 0
flag <- 0
## the very first iteration
for (k in 1:ncluster){
temp_mem[[k]] <- which(cluster_mem == k)
overall_data[[i]][[k]] <- all_data[which(cluster_mem == k), ]
# check whether any cluster has < (#all components + 1) observations. if so, the SVD could not be performed, and the iterations
# should be broken (starting from another random assignment)
if (length(overall_data[[i]][[k]]) == 0) {
flag <- 1
break
}
if (is.null(overall_data[[i]][[k]])) {
flag <- 1
break
}
if (is.null(nrow(overall_data[[i]][[k]]))) {
flag <- 1
break
}
if (!is.null(overall_data[[i]][[k]]) & (nrow(overall_data[[i]][[k]])) < (all_component + 1)){
flag <- 1 # flag for breaking the loop
break # for the loop k
}
ssca_results <- IntSparseSca_random_cpp(overall_data[[i]][[k]], nvar, nblock, all_component, distinct_zeros, psparse)
loss_tot <- loss_tot + ssca_results$loss
# in the influence approach, only the loading matrices that matter
overall_loading[[i]][[k]] <- ssca_results$loadings
overall_score[[i]][[k]] <- ssca_results$scores
overall_loss[[i]][k] <- ssca_results$loss
## change the current position is enabled
#current_position[[k]] <- which(current_loading[[k]] == 0)
}
if (flag == 1) next
for (j in 1:all_member){
# set an impossible upper bound
min_change <- upper
for (k in 1:ncluster){
# the member belongs to a certain cluster
if (cluster_mem[j] == k){
## get the loss function without observation j
# the data matrix without j
new_data <- all_data[setdiff(temp_mem[[k]], j), ]
xi <- MatrixCenter_cpp(new_data, 1, 0)
## update T and P for only one iteration
xp <- t(xi %*% overall_loading[[i]][[k]])
xp_svd <- svd(xp)
# the new score
matrix_xi <- xp_svd$v %*% t(xp_svd$u)
# the new loading
P1 <- t(xi) %*% matrix_xi
# add the uniqueness induced zeros and sparsenes induced zeros
P1[distinct_zeros] <- 0
P1[ (sort(abs(P1), index.return = TRUE)$ix)[1:(zeros + length(distinct_zeros))]] <- 0
#P1[current_position[[k]]] <- 0
u <- sum((xi - matrix_xi %*% t(P1)) ^ 2)
# update the current loading
temp_loading[[k]] <- P1
# update the current score
temp_score[[k]] <- matrix_xi
# update the current loss
temp_loss[k] <- u
# the loss function with observation j is the old loss function
# we could thus compute the change in loss
loss_change <- overall_loss[[i]][k] - temp_loss[k]
}
# the member does not belong to a certain cluster
if (cluster_mem[j] != k) {
# the loss function with observation j
# the data matrix with j
new_data <- all_data[c(temp_mem[[k]], j), ]
xi <- MatrixCenter_cpp(new_data, 1, 0)
## update T and P for only one iteration
xp <- t(xi %*% overall_loading[[i]][[k]])
xp_svd <- svd(xp)
# the new score
matrix_xi <- xp_svd$v %*% t(xp_svd$u)
# the new loading
P1 <- t(xi) %*% matrix_xi
P1[distinct_zeros] <- 0
P1[ (sort(abs(P1), index.return = TRUE)$ix)[1:(zeros + length(distinct_zeros))]] <- 0
#P1[current_position[[k]]] <- 0
u <- sum((xi - matrix_xi %*% t(P1)) ^ 2)
# update the current loading
temp_loading[[k]] <- P1
# update the current score
temp_score[[k]] <- matrix_xi
# update the current loss
temp_loss[k] <- u
# the loss function with observation j is the old loss function
# we could thus compute the change in loss
loss_change <- temp_loss[k] - overall_loss[[i]][k]
}
# assign the jth observation to the best fit cluster
if (loss_change < min_change){
min_change <- loss_change
# update the membership status of this certain observation
mem_update[j] <- k
}
}
}
# the first iteration of membership exchange
for (j in 1:all_member){
if (cluster_mem[j] != mem_update[j]){
sign <- 1
c <- cluster_mem[j]
cluster_mem[j] <- mem_update[j]
# remove the current observation from the previous cluster
#current_mem[[i]][[c]] <- setdiff(current_mem[[i]][[c]], j)
# add the current observation to the new cluster
#current_mem[[i]][[mem_update[j]]] <- c(current_mem[[i]][[mem_update[j]]], j)
}
}
# record the cluster partition
mem_first[[i]] <- cluster_mem
### the second iteration of the SSCA solution
# the total loss function of a given start partition
loss_tot <- 0
## the very first iteration
for (k in 1:ncluster){
overall_data[[i]][[k]] <- all_data[which(cluster_mem == k), ]
# check whether any cluster has < (#all components + 1) observations. if so, the SVD could not be performed, and the iterations
# should be broken (starting from another random assignment)
if (length(overall_data[[i]][[k]]) == 0) {
flag <- 1
loss_tot <- upper
break
}
if (is.null(overall_data[[i]][[k]])) {
flag <- 1
loss_tot <- upper
break
}
if (is.null(nrow(overall_data[[i]][[k]]))) {
flag <- 1
loss_tot <- upper
break
}
if (!is.null(overall_data[[i]][[k]]) & (nrow(overall_data[[i]][[k]])) < (all_component + 1)){
flag <- 1 # flag for breaking the loop
loss_tot <- upper
break # for the loop k
}
ssca_results <- IntSparseSca_random_cpp(overall_data[[i]][[k]], nvar, nblock, all_component, distinct_zeros, psparse)
loss_tot <- loss_tot + ssca_results$loss
# in the influence approach, only the loading matrices that matter
overall_loading[[i]][[k]] <- ssca_results$loadings
overall_score[[i]][[k]] <- ssca_results$scores
overall_loss[[i]][k] <- ssca_results$loss
## change the current position is enabled
#current_position[[k]] <- which(current_loading[[k]] == 0)
}
if (loss_tot!= 0){
loss_first[i] <- loss_tot
}
if (loss_tot == 0){
loss_first[i] <- upper
}
}
#############################################################################
## selection of the best several starts (ceiling(rate * n_starts))
retain_n <- floor(rate * n_starts)
# delete all the index that are zeros
number_zeros <- sum(loss_first == 0)
retain_index <- sort(loss_first, index.return = TRUE)$ix[(1 + number_zeros) : (retain_n + number_zeros)]
#############################################################################
#############################################################################
# step 3: perform the SSCA for all starts that are retained until converge
opt_score <- list()
opt_loading <- list()
opt_loss <- vector(length = ncluster)
opt_loss_tot <- upper
opt_cluster_mem <- vector()
opt_loss_iteration <- vector()
loss_overall <- vector()
for (i in 1:retain_n){
index_starts <- retain_index[i]
if (loss_first[index_starts] == upper) next
conv <- 0
current_loading <- overall_loading[[index_starts]]
current_score <- overall_score[[index_starts]]
current_loss <- overall_loss[[index_starts]]
cluster_mem = mem_first[[index_starts]]
loss_tot <- loss_first[index_starts]
min_loss <- upper
loss_iteration <- vector()
change_iteration <- vector()
for (k in 1:ncluster){
temp_mem[[k]] <- which(cluster_mem == k)
}
iter = 0
conv <- 0
while (conv == 0){
# the sign of whethe there are emembership exchanges
iter = iter + 1
sign <- 0
# in the first iteration, this things would not have been done
if (iter != 1){
loss_tot <- 0
## Step 1: Check the current number of members in each group
for (k in 1:ncluster){
current_mem_n[k] <- length(which(cluster_mem == k))
change_prop[k] <- change[k] / current_mem_n[k]
}
## Step 2: Perfrom the initial full SSCA for each cluster (note that this is the special SSCA that is
## performed once in each and every iteration)
# not random start (rational start from the based on the value of singular value decomposition)
for (k in 1:ncluster){
current_data[[k]] <- all_data[which(cluster_mem == k), ]
# check whether any cluster has < (#all components + 1) observations. if so, the SVD could not be performed, and the iterations
# should be broken (starting from another random assignment)
if (length(current_data[[k]]) == 0) {
flag <- 1
break
}
if (is.null(current_data[[k]])) {
flag <- 1
break
}
if (is.null(nrow(current_data[[k]]))) {
flag <- 1
break
}
if (!is.null(current_data[[k]]) & (nrow(current_data[[k]])) < (all_component + 1)){
flag <- 1 # flag for breaking the loop
break # for the loop k
}
# random start when the proportion of change members are higher than the pre-set cutoff value(s)
if (change_prop[k] > cutoff.prop){
ssca_results <- IntSparseSca_random_cpp(current_data[[k]], nvar, nblock, all_component, distinct_zeros, psparse)
loss_tot <- loss_tot + ssca_results$loss
# in the influence approach, only the loading matrices that matter
current_loading[[k]] <- ssca_results$loadings
current_score[[k]] <- ssca_results$scores
current_loss[[k]] <- ssca_results$loss
}
## when the number of changing members is smaller than a certain value, the SSCA algorithm will only perfrom with
## starting value equals to last loading matrices
else{
ssca_results <- IntSparseSca_rational_full_cpp(current_data[[k]], nvar, nblock, all_component, current_loading[[k]], distinct_zeros, psparse)
loss_tot <- loss_tot + ssca_results$loss
# in the influence approach, only the loading matrices that matter
current_loading[[k]] <- ssca_results$loadings
current_score[[k]] <- ssca_results$scores
current_loss[[k]] <- ssca_results$loss
## change the current positions is enabled
#current_position[[k]] <- which(current_loading[[k]] == 0)
}
}
}
if (flag == 1) {
loss_tot <- loss_first[index_starts]
break
}
# the number of observations in each cluster that have changed their cluster membership in the current iteration
change <- rep(0, ncluster)
# record the member updates in step 3 and realize the membership updates in step 4
mem_update <- rep(0, all_member)
# record the loss functions after every iteraton, and stop the algorithm in case the latest loss
# function is larger than the previous one (the loss function does not decrease anymore)
loss_iteration <- c(loss_iteration, loss_tot)
if (iter != 1){
# in case due to some randomness, the influence value does not decrease anymore, we keep the previous cluster assignment
if (loss_iteration[iter] > loss_iteration[iter - 1]) {
cluster_mem <- cluster_mem_previous
break
}
}
# Step 3: for each pair of the observation and the cluster, quickly update the corresponding loading matrices
# wuth and without the observation. Update the cluster assignment of the observation so that the minimal loss functions could be
# obtained
for (j in 1:all_member){
# set an impossible upper bound
min_change <- upper
for (k in 1:ncluster){
# the member belongs to a certain cluster
if (cluster_mem[j] == k){
## get the loss function without observation j
# the data matrix without j
new_data <- all_data[setdiff(temp_mem[[k]], j), ]
xi <- MatrixCenter_cpp(new_data, 1, 0)
## update T and P for only one iteration
xp <- t(xi %*% current_loading[[k]])
xp_svd <- svd(xp)
# the new score
matrix_xi <- xp_svd$v %*% t(xp_svd$u)
# the new loading
P1 <- t(xi) %*% matrix_xi
# add the uniqueness induced zeros and sparsenes induced zeros
P1[distinct_zeros] <- 0
P1[ (sort(abs(P1), index.return = TRUE)$ix)[1:(zeros + length(distinct_zeros))]] <- 0
#P1[current_position[[k]]] <- 0
u <- sum((xi - matrix_xi %*% t(P1)) ^ 2)
# update the current loading
temp_loading[[k]] <- P1
# update the current score
temp_score[[k]] <- matrix_xi
# update the current loss
temp_loss[k] <- u
# the loss function with observation j is the old loss function
# we could thus compute the change in loss
loss_change <- current_loss[k] - temp_loss[k]
}
# the member does not belong to a certain cluster
if (cluster_mem[j] != k) {
# the loss function with observation j
# the data matrix with j
new_data <- all_data[c(temp_mem[[k]], j), ]
xi <- MatrixCenter_cpp(new_data, 1, 0)
## update T and P for only one iteration
xp <- t(xi %*% current_loading[[k]])
xp_svd <- svd(xp)
# the new score
matrix_xi <- xp_svd$v %*% t(xp_svd$u)
# the new loading
P1 <- t(xi) %*% matrix_xi
P1[distinct_zeros] <- 0
P1[ (sort(abs(P1), index.return = TRUE)$ix)[1:(zeros + length(distinct_zeros))]] <- 0
#P1[current_position[[k]]] <- 0
u <- sum((xi - matrix_xi %*% t(P1)) ^ 2)
# update the current loading
temp_loading[[k]] <- P1
# update the current score
temp_score[[k]] <- matrix_xi
# update the current loss
temp_loss[k] <- u
# the loss function with observation j is the old loss function
# we could thus compute the change in loss
loss_change <- temp_loss[k] - current_loss[k]
}
# assign the jth observation to the best fit cluster
if (loss_change < min_change){
min_change <- loss_change
# update the membership status of this certain observation
mem_update[j] <- k
}
}
}
# Step 4: formally update the cluster membership
cluster_mem_previous <- cluster_mem
change_current <- 0
for (j in 1:all_member){
if (cluster_mem[j] != mem_update[j]){
sign <- 1
c <- cluster_mem[j]
cluster_mem[j] <- mem_update[j]
# remove the current observation from the previous cluster
temp_mem[[c]] <- setdiff(temp_mem[[c]], j)
# add the current observation to the new cluster
temp_mem[[mem_update[j]]] <- c(temp_mem[[mem_update[j]]], j)
change[c] <- change[c] + 1
change_current <- change_current + 1
change[mem_update[j]] <- change[mem_update[j]] + 1
}
}
change_iteration <- c(change_iteration, change_current)
## potantial usable functions: the loss function of other clusters
#loss_others <- loss_tot_ind - current_loss[[c]] - current_loss[[mem_update]]
# update the loss function
#loss_tot_ind <- loss_others + current_loss[[c]] + current_loss[[mem_update]]
if (sign == 0) conv <- 1 # when no cluster membership exchanges happen
if (iter == MAXITER) conv <- 1
}
loss_overall <- c(loss_overall, loss_tot)
# it is possible that the results gained from the starting partition finally produce NULL.
# We should rule out such situation to avoid the collapse of the functions
if (!is.null(loss_tot) & !is.na(loss_tot)){
# summarize overall all starts
if (loss_tot < opt_loss_tot){
opt_loss_tot <- loss_tot
opt_loading <- current_loading
opt_score <- current_score
opt_cluster_mem <- cluster_mem
opt_loss_iteration <- loss_iteration
opt_change_iteration <- change_iteration
}
}
}
# warn the users when the algorothm does not end normally
if (opt_loss_tot == upper){
warning("The algorithm does not end normally. You need to check a less complex model")
}
return (list(cluster_mem = opt_cluster_mem, loss = opt_loss_tot, loadings = opt_loading, scores = opt_score))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.