Nothing
#' @title Chromatogram Hierarchical Compressive K-means with Nystrom approximation
#' @description An implementation of the complete pipeline of
#' the CHICKN algorithm.
#'
#' @param Data A Filebacked Big Matrix n x N.
#' @param K Number of cluster at each call of clustering method. Default is 2.
#' @param k_total An upper bound of the total number of clusters.
#' @param K_W1 A Filebacked Big Matrix. Nystrom kernel matrix \eqn{s \times N},
#' where N is the number of signals in the training collection and s is the Nystrom sample size.
#' By default is NULL and it is generated using \code{\link{Nystrom_kernel}} function.
#' @param Freq A frequency matrix m x n with frequency vectors in rows.
#' If NULL, the frequency vectors are generated by \code{\link{GenerateFrequencies}} function.
#' @param ncores Number of cores. Default is 2.
#' @param max_neighbors Number of neighbors used to estimate the kernel parameter \code{gamma}. Default is 32.
#' @param max_Nsize Number of neighbors used to compute consensus chromatograms.
#' @param DoPreimage logical that controls whether to compute the consensus chromatograms. Default is TRUE.
#' @param DIR_output A directory to save the results.
#' @param DIR_tmp A directory for temporal files.
#' @inheritParams hcc_parallel
#' @inheritParams GenerateFrequencies
#' @inheritParams EstimSigma
#' @inheritParams Preimage
#' @inheritParams Nystrom_kernel
#'
#' @return A list with the following attributes:
#' \itemize{
#' \item \code{gamma} is the estimated kernel parameter.
#' \item \code{CompressedData} is the Nystrom kernel matrix.
#' \item \code{sigma} is the estimated variance.
#' \item \code{Frequency} is the frequency matrix m x n.
#' \item \code{Clusters} is the cluster assignment.
#' }
#' @details \code{CHICKN_W1} compresses the data by computing a Nystrom kernel approximation and
#' applying the sketching operator from \insertCite{DBLP:journals/corr/KerivenBGP16}{chickn}.
#' See \code{\link{Nystrom_kernel}} and \code{\link{Sketch}} functions.
#' Then clusters are recovered by operating on the compressed data version.
#' It can use the kernel function based on the
#' Wasserstein-1 or the Euclidean distances. It generates in \code{DIR_output} directory the following files:
#' \itemize{
#' \item 'Cluster_assign_out.bk' is a Filebacked Big Matrix N x \code{maxLevel}+1, which stores the cluster assignment at each hierarchical level.
#' \item 'Centroids_out.bk' is a Filebacked Big Matrix with the resulting cluster centroids in columns.
#' }
#' @examples
#' \donttest{
#' data("UPS2")
#' N = ncol(UPS2)
#' n= nrow(UPS2)
#' X_FBM = bigstatsr::FBM(init = UPS2, ncol=N, nrow = n)$save()
#' output <- CHICKN_W1(Data = X_FBM, K = 2, k_total =8, max_neighbors = 10, ncores = 2,
#' N0 = N, DoPreimage = FALSE)
#'}
#' @seealso \code{\link{Nystrom_kernel}}, \code{\link{GenerateFrequencies}},
#' \code{\link{hcc_parallel}}, \code{\link{Preimage}}, [bigstatsr](https://github.com/privefl/bigstatsr)
#' @references
#' \itemize{
#' \item Permiakova O, Guibert R, Kraut A, Fortin T, Hesse AM, Burger T (2020) "CHICKN: Extraction of peptide chromatographic
#' elution profiles from large scale mass spectrometry data by means of Wasserstein compressive hierarchical cluster analysis."
#' BMC Bioinformatics (under revision).
# \item \insertRef{wang2019scalable}{chickn}
# \item \insertRef{DBLP:journals/corr/KerivenBGP16}{chickn}.
#' }
#' @export
CHICKN_W1 <- function(Data, K=2, k_total,
K_W1 = NULL, kernel_type = 'Gaussian', distance_type = 'W1',
Freq = NULL,
ncores = 2,
max_neighbors = 32,
nblocks = 64,
N0 = 1e4,
max_Nsize = 32, DoPreimage = FALSE,
DIR_output = tempfile(), DIR_tmp = tempfile(),
BIG = FALSE, verbose = FALSE, ...){
RcppParallel::setThreadOptions(numThreads = ncores)
output = list()
#======== Data compression ================================================================================
if(is.null(K_W1)){
if(verbose){message("Nystrom approximation")}
c = ceiling(sqrt(ncol(Data)))
l = ceiling(c/2)
s = ceiling(sqrt(c*K))
Compression = Nystrom_kernel(Data = Data, c = c, l = l, s = s,
gamma = NULL,
max_neighbors = max_neighbors,
DIR_output = DIR_tmp, DIR_save = DIR_output,
ncores = ncores, kernel_type = kernel_type,
verbose = verbose)
K_W1 = Compression$K_W1
output$gamma = Compression$gamma
}
output$CompressedData = K_W1
if(is.null(Freq)){
if(verbose){message("Generate frequencies")}
m = K*s
out_freq = GenerateFrequencies(Data = K_W1,
m = m,
N0 = N0,
nblocks = nblocks, ncores = ncores, verbose = verbose)
Freq = out_freq$W
norm_freq = sort.int(rowSums(Freq*Freq), decreasing = TRUE, index.return = TRUE)
Freq = Freq[norm_freq$ix,]
output$sigma = out_freq$sigma
}
output$Frequency = Freq
#=============== HIERARCHICAL CKM==========================================================
maxLevel = floor(log(k_total, K))
if(verbose){message('Clustering')}
try(Clusters <- hcc_parallel(Data = K_W1, W = Freq, K = K,
maxLevel = maxLevel, ncores = ncores,
DIR_output = DIR_output, verbose = verbose, hybrid = FALSE, ...))
output$Clusters = Clusters
# ===== Preimage ==========
if(DoPreimage){
bf_C_out = file.path(DIR_output, 'Centroids_out')
C_out = bigstatsr::big_attach(paste0(bf_C_out, '.rds'))
bf_cl_assign = file.path(DIR_output, 'Cluster_assign_out')
Output =bigstatsr::big_attach(paste0(bf_cl_assign, '.rds'))
out <- Preimage(Data = Data,
K_W1 = K_W1,
C_out = C_out,
Cl_assign = Output,
max_Nsize = max_Nsize, ncores = ncores, DIR_out = DIR_output, BIG = BIG)
}
return(output)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.