R/CHICKN_W1.R
In chickn: 'Compressive' Hierarchical Kernel Clustering Toolbox

Documented in CHICKN_W1

#' @title Chromatogram Hierarchical Compressive K-means with Nystrom approximation
#' @description An implementation of the complete pipeline of
#'  the CHICKN algorithm.
#' 
#' @param Data A Filebacked Big Matrix n x N.
#' @param K Number of cluster at each call of clustering method. Default is 2.
#' @param k_total An upper bound of the total number of clusters. 
#' @param K_W1 A Filebacked Big Matrix. Nystrom kernel matrix \eqn{s \times N},
#'  where N is the number of signals in the training collection and s is the Nystrom sample size. 
#'  By default is NULL and it is generated using \code{\link{Nystrom_kernel}} function.
#' @param Freq A frequency matrix m x n with frequency vectors in rows. 
#' If NULL, the frequency vectors are generated by \code{\link{GenerateFrequencies}} function.
#' @param ncores Number of cores. Default is 2.
#' @param max_neighbors Number of neighbors used to estimate the kernel parameter \code{gamma}. Default is 32.
#' @param max_Nsize Number of neighbors used to compute consensus chromatograms. 
#' @param DoPreimage logical that controls whether to compute the consensus chromatograms. Default is TRUE.
#' @param DIR_output A directory to save the results. 
#' @param DIR_tmp A directory for temporal files.
#' @inheritParams hcc_parallel
#' @inheritParams GenerateFrequencies
#' @inheritParams EstimSigma
#' @inheritParams Preimage
#' @inheritParams Nystrom_kernel
#' 
#' @return A list with the following attributes:
#' \itemize{
#' \item \code{gamma} is the estimated kernel parameter.
#' \item \code{CompressedData} is the Nystrom kernel matrix.
#' \item \code{sigma} is the estimated variance.
#' \item \code{Frequency} is the frequency matrix m x n.
#' \item \code{Clusters} is the cluster assignment.
#' }
#' @details \code{CHICKN_W1} compresses the data by computing a Nystrom kernel approximation and 
#' applying the sketching operator from \insertCite{DBLP:journals/corr/KerivenBGP16}{chickn}.
#' See \code{\link{Nystrom_kernel}} and \code{\link{Sketch}} functions.  
#' Then clusters are recovered by operating on the compressed data version.
#' It can use the kernel function based on the 
#' Wasserstein-1 or the Euclidean distances. It generates in \code{DIR_output} directory the following files:
#' \itemize{
#' \item 'Cluster_assign_out.bk' is a Filebacked Big Matrix N x \code{maxLevel}+1, which stores the cluster assignment at each hierarchical level.
#' \item 'Centroids_out.bk' is a Filebacked Big Matrix with the resulting cluster centroids in columns. 
#'  }   
#' @examples 
#' \donttest{
#' data("UPS2")
#' N = ncol(UPS2)
#' n= nrow(UPS2)
#' X_FBM = bigstatsr::FBM(init = UPS2, ncol=N, nrow = n)$save()
#' output  <- CHICKN_W1(Data = X_FBM, K = 2, k_total =8, max_neighbors = 10, ncores = 2, 
#'                      N0 = N, DoPreimage = FALSE)
#'}                     
#' @seealso \code{\link{Nystrom_kernel}}, \code{\link{GenerateFrequencies}}, 
#' \code{\link{hcc_parallel}}, \code{\link{Preimage}}, [bigstatsr](https://github.com/privefl/bigstatsr)
#' @references 
#' \itemize{
#' \item Permiakova O, Guibert R, Kraut A, Fortin T, Hesse AM, Burger T (2020) "CHICKN: Extraction of peptide chromatographic 
#' elution profiles from large scale mass spectrometry data by means of Wasserstein compressive hierarchical cluster analysis." 
#' BMC Bioinformatics (under revision).
# \item \insertRef{wang2019scalable}{chickn}
# \item \insertRef{DBLP:journals/corr/KerivenBGP16}{chickn}.
#' }
#' @export
CHICKN_W1 <- function(Data, K=2, k_total,
                      K_W1 = NULL, kernel_type = 'Gaussian', distance_type = 'W1',
                      Freq = NULL,
                      ncores = 2,
                      max_neighbors = 32,
                      nblocks = 64,
                      N0 = 1e4,
                      max_Nsize = 32, DoPreimage = FALSE, 
                      DIR_output = tempfile(), DIR_tmp = tempfile(), 
                      BIG = FALSE, verbose = FALSE, ...){
  
  RcppParallel::setThreadOptions(numThreads = ncores)
  output = list()
#======== Data compression ================================================================================
  if(is.null(K_W1)){
    if(verbose){message("Nystrom approximation")}
    c = ceiling(sqrt(ncol(Data)))
    l = ceiling(c/2)
    s = ceiling(sqrt(c*K))
    Compression = Nystrom_kernel(Data = Data, c = c, l = l, s = s,
                                 gamma = NULL,
                                 max_neighbors = max_neighbors, 
                                 DIR_output = DIR_tmp, DIR_save = DIR_output,
                                 ncores = ncores, kernel_type = kernel_type,
                                 verbose = verbose)
    
    K_W1 = Compression$K_W1
    output$gamma = Compression$gamma
  }
  output$CompressedData = K_W1
  if(is.null(Freq)){
    if(verbose){message("Generate frequencies")}
    m = K*s
    out_freq = GenerateFrequencies(Data = K_W1,
                                   m = m, 
                                   N0 = N0,
                                   nblocks = nblocks, ncores = ncores, verbose = verbose)
    Freq = out_freq$W
    norm_freq = sort.int(rowSums(Freq*Freq), decreasing = TRUE, index.return = TRUE)
    Freq = Freq[norm_freq$ix,]
    output$sigma = out_freq$sigma
  }
  output$Frequency = Freq
#=============== HIERARCHICAL CKM==========================================================
  maxLevel = floor(log(k_total, K))
  if(verbose){message('Clustering')}
  try(Clusters <- hcc_parallel(Data = K_W1, W = Freq, K = K, 
                               maxLevel = maxLevel, ncores = ncores, 
                               DIR_output = DIR_output, verbose = verbose, hybrid = FALSE, ...))
  output$Clusters = Clusters
# ===== Preimage ==========
  if(DoPreimage){
    bf_C_out = file.path(DIR_output, 'Centroids_out')
    C_out = bigstatsr::big_attach(paste0(bf_C_out, '.rds'))
    
    bf_cl_assign =  file.path(DIR_output, 'Cluster_assign_out')
    Output =bigstatsr::big_attach(paste0(bf_cl_assign, '.rds'))
    
    out <- Preimage(Data = Data, 
                    K_W1 = K_W1, 
                    C_out = C_out, 
                    Cl_assign = Output,
                    max_Nsize = max_Nsize, ncores = ncores, DIR_out = DIR_output, BIG = BIG)
  }
  return(output)
}