
#' FixedCSSCA
#'
#' CSSCA with known values of the parameters (i.e. the number of clusters and the sparsity level). When such theoretical knowledge
#' is available, model selection is not necessary and this function can be used to directly compute the cluster-wise mean structure and
#' component structure
#'
#' @param all_data A matrix with the concatenated data (the data blocks placed side by side, column-wise, so that all blocks share the same rows (entries)). The CSSCA method will be performed on this data.
#' @param n_block A positive integer that indicates the number of data blocks
#' @param n_com An integer that indicates the number of common components
#' @param n_distinct A vector of length n_block, whose ith element indicates the number of distinctive components assumed for the ith data block. It can also be a single integer; in that case, all data blocks are assumed to have the same number of distinctive components.
#' @param n_var A vector of length n_block, whose ith element indicates the number of variables in the ith data block. It can also be a single integer; in that case, all blocks are assumed to have the same number of variables.
#' @param n_cluster A positive integer that indicates the number of clusters assumed. When n_cluster = 1, CSSCA boils down to an SSCA analysis.
#' @param p_sparse A number within the range [0, 1] that indicates the sparsity level (i.e. the proportion of zero elements in the loading matrix)
#' @param cutoff.prop A number within the range [0, 1]. In each iteration, if the proportion of observations that have changed their cluster membership is smaller than this value, the cluster-specific SSCA solutions are not re-estimated. Typically parameter tuning is not required.
#' @param n_replicate A positive integer that indicates the number of replicates generated for each fixed value of n_replace (e.g. when n_replace = 1, the algorithm generates n_replicate semi-rational starts,
#' each obtained by randomly changing the cluster membership of one observation). Typically parameter tuning is not required.
#' @param rate A number within the range [0, 1] that indicates the retention rate of starting partitions after the first two iterations. Typically parameter tuning is not required.
#' @param computation Either "easy", "medium" or "difficult", indicating the desired computational effort (a "test" mode with a single start is also accepted). The "easy" mode is in most cases
#' sufficient to produce accurate model estimates. However, for complex datasets it is favorable to use the "difficult" mode, which increases
#' accuracy at the cost of speed. The default computation mode is "easy"
#' @return A list of four elements. The first element is a vector that indicates the resulting cluster partition, with the nth element referring
#' to the cluster assignment of the nth entry;
#' the second element is a numeric value that is the minimal loss function value;
#' the third element is a list that contains the cluster-specific loading matrices;
#' the fourth element is a list that contains the cluster-specific score matrices;
#' @examples
#' # settings for the simulation and the estimation
#' n_cluster <- 3
#' mem_cluster <- c(30, 30, 30) # 30 entries in each cluster
#' n_obs <- sum(mem_cluster)
#' n_block <- 2
#' n_com <- 2
#' n_distinct <- c(1, 1) # 1 distinctive component in each block
#' n_var <- c(15, 9)
#' p_sparse <- 0.5
#' p_noise <- 0.3
#' p_combase <- 0.5 # moderate similarity
#' p_fixzero <- 0.5 # moderate similarity
#' mean_v <- 0.1 # covariance structure dominates
#' \dontrun{
#' # simulate the data and extract the concatenated data matrix
#' sim <- CSSCASimulation(n_cluster, mem_cluster, n_block, n_com, n_distinct, n_var, p_sparse,
#'                        p_noise, p_combase, p_fixzero, "both", mean_v)
#' target_data <- sim$concatnated_data
#' # feed the data to FixedCSSCA with the known parameter values
#' # note that some of the parameters typically do not require tuning
#' results <- FixedCSSCA(target_data, n_block, n_com, n_distinct, n_var, n_cluster, p_sparse)
#' }
#'
FixedCSSCA <- function(all_data, n_block, n_com, n_distinct, n_var, n_cluster, p_sparse, cutoff.prop = 1/6, n_replicate = 3, rate = 1/10, computation = "easy"){

  ##############################################################################################################
  ########################### obtain additional information from the input #####################################
  ##############################################################################################################

  # additional information from the input
  n_observation <- nrow(all_data)
  sum_var <- ncol(all_data)
  # split the concatenated data back into its blocks (column-wise)
  block_version_data <- vector("list", n_block)
  block_start <- 0
  for (b in 1:n_block){
    block_version_data[[b]] <- all_data[, (block_start + 1):(block_start + n_var[b])]
    block_start <- block_start + n_var[b]
  }
  n_total <- sum(n_com, n_distinct)
  # calculate the structure of loading matrices
  distinct_index <- vector("numeric")
  all_var <- 0
  ini_var <- 0
  for (p in 1:n_block){
    distinct_index <- c(distinct_index, rep(p, n_distinct[p]))
    ini_var <- ini_var + n_var[p]
    all_var <- c(all_var, ini_var)
  }
  distinct_zeros <- vector("numeric")
  for (r.distinct in 1:sum(n_distinct)){
    distinct_zeros <- c(distinct_zeros, ((sum_var * (n_com + r.distinct - 1)) + all_var[distinct_index[r.distinct]] + 1): ((sum_var * (n_com + r.distinct - 1)) + all_var[(distinct_index[r.distinct] + 1)]))
  }
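  # distinct_zeros collects the positions (in the column-major vectorisation of the
  # sum_var x n_total loading matrix) that are treated as fixed zeros for the distinctive components.
  # For instance, with n_com = 2, n_distinct = c(1, 1) and n_var = c(15, 9) (24 variables, 4 components),
  # distinct_zeros equals c(49:63, 88:96), i.e. rows 1-15 of component 3 and rows 16-24 of component 4.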

  # settings of the estimation process: the number of starts used in the
  # partition search and in the cluster-wise SCA estimation
  partition_times <- switch(computation,
                            test = 1,
                            easy = 20,
                            medium = 25,
                            difficult = 30)
  csca_times <- partition_times
  # large initial value for the minimal loss
  upper <- 1e9

  ##############################################################################################################
  #####################################  When only 1 cluster is present in the data ###########################
  ##############################################################################################################
  if(n_cluster == 1){
    ssca_results <- IntSparseSca_random_cpp(all_data, sum_var, n_block, n_total, distinct_zeros, p_sparse)
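    # pack the single-cluster SSCA solution into the documented four-element output:
    # the (trivial) cluster partition, the minimal loss, the loadings and the scores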
    global <- list()
    global[[1]] <- rep(1, n_observation)
    global[[2]] <- ssca_results$loss
    global[[3]] <- ssca_results$loadings
    global[[4]] <- ssca_results$scores
  }

  ##############################################################################################################
  #####################################  more than 1 cluster ########################################
  ##############################################################################################################
  if (n_cluster != 1){

    ## obtain the rational partitions of two methods (clusterwise PCA and icluster)
    # clusterwise PCA
    results1 <- csca_cpp(all_data, n_var, n_block, n_total, n_cluster, csca_times)
    # icluster
    results_iclust <- iCluster2(block_version_data, n_cluster)$clusters

    # results of the two rational starts
    results1_cssca <- cssca_quick_cpp(all_data, n_var, n_block, n_com, n_distinct, n_cluster, n_observation, p_sparse, results1$cluster_mem, cutoff.prop)
    results2_cssca <- cssca_quick_cpp(all_data, n_var, n_block, n_com, n_distinct, n_cluster, n_observation, p_sparse, results_iclust, cutoff.prop)
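    # each rational start is refined by the quick CSSCA routine above; below, the refined
    # solution with the smaller loss is kept as the current best (global) solution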

    ######### CSCA part
    min_loss <- upper
    if (results1_cssca$loss < min_loss) {
      min_loss<- results1_cssca$loss
      global <- results1_cssca
    }

    ######### iclust part
    if (results2_cssca$loss < min_loss) {
      min_loss <- results2_cssca$loss
      global <- results2_cssca
    }

    # Compare each rational start with its CSSCA refinement to decide which start seeds the semi-rational starts
    covariance_similarity <- adjustedRandIndex(results1$cluster_mem, results1_cssca$cluster_mem)
    mean_similarity <- adjustedRandIndex(results_iclust, results2_cssca$cluster_mem)
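    # the adjusted Rand index equals 1 for identical partitions, so a higher value indicates
    # that the corresponding rational start was more stable under CSSCA refinement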

    #### weight scheme
    ## when covariance similarity < mean similarity, mean structure probably dominates the overall structure
    if (covariance_similarity < mean_similarity){
      #### partition for the iCluster part
      partition_results_iclust <- MainCSSCA(all_data, n_var, n_block, n_com, n_distinct, n_cluster, n_observation, p_sparse, results_iclust, cutoff.prop, partition_times, n_replicate, rate)
      if (partition_results_iclust$loss < min_loss){
        min_loss <- partition_results_iclust$loss
        global <- partition_results_iclust
      }
    }
    ## when covariance similarity >= mean similarity, covariance structure probably dominates the overall structure
    if (covariance_similarity >= mean_similarity){
      ### partition for the csca part
      partition_results_csca <- MainCSSCA(all_data, n_var, n_block, n_com, n_distinct, n_cluster, n_observation, p_sparse, results1$cluster_mem, cutoff.prop, partition_times, n_replicate, rate)
      if (partition_results_csca$loss < min_loss){
        min_loss <- partition_results_csca$loss
        global <- partition_results_csca
      }
    }

  }

  return(global)
}