#' Base Class for Clustering Algorithms
#'
#' This is the base class for all the clustering algorithms in the
#' [**sklearn.cluster**](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.cluster)
#' module. The child classes are intended to be used within some GUDHI classes
#' such as [`Atol`].
#'
#' @keywords internal
BaseClustering <- R6::R6Class(
  classname = "BaseClustering",
  inherit = SKLearnClass
)
#' Performs clustering according to the affinity propagation algorithm
#'
#' @description This is a wrapper around the Python class
#' [sklearn.cluster.AffinityPropagation](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AffinityPropagation.html#sklearn.cluster.AffinityPropagation).
#'
#' @references
#' - Brendan J. Frey and Delbert Dueck (2007). *Clustering by Passing Messages
#' Between Data Points*, Science.
#'
#' @export
AffinityPropagation <- R6::R6Class(
  classname = "AffinityPropagation",
  inherit = BaseClustering,
  public = list(
    #' @description The [AffinityPropagation] class constructor.
    #'
    #' @param damping A numeric value specifying the damping factor in the
    #'   range \eqn{[0.5, 1.0)}, i.e. the extent to which the current value is
    #'   maintained relative to incoming values (weighted `1 - damping`). This
    #'   avoids numerical oscillations when updating these values (messages).
    #'   Defaults to `0.5`.
    #' @param max_iter An integer value specifying the maximum number of
    #'   iterations. Defaults to `200L`.
    #' @param convergence_iter An integer value specifying the number of
    #'   iterations with no change in the number of estimated clusters that
    #'   stops the convergence. Defaults to `15L`.
    #' @param copy A boolean value specifying whether to make a copy of input
    #'   data. Defaults to `TRUE`.
    #' @param preference A numeric value or numeric vector specifying the
    #'   preferences for each point. Points with larger values of preferences
    #'   are more likely to be chosen as exemplars. The number of exemplars,
    #'   i.e. of clusters, is influenced by the input preferences value. If the
    #'   preferences are not passed as arguments, they will be set to the
    #'   median of the input similarities. Defaults to `NULL`.
    #' @param affinity A string specifying the affinity to use. At the moment
    #'   `"precomputed"` and `"euclidean"` are supported. `"euclidean"` uses
    #'   the negative squared euclidean distance between points. Defaults to
    #'   `"euclidean"`.
    #' @param verbose A boolean value specifying whether to be verbose.
    #'   Defaults to `FALSE`.
    #' @param random_state An integer value specifying the seed of the random
    #'   generator. Defaults to `NULL` which uses current time. Set it to a
    #'   fixed integer for reproducible results across function calls.
    #'
    #' @return An object of class [AffinityPropagation].
    #'
    #' @examplesIf reticulate::py_module_available("sklearn.cluster")
    #' cl <- AffinityPropagation$new()
    initialize = function(damping = 0.5,
                          max_iter = 200L,
                          convergence_iter = 15L,
                          copy = TRUE,
                          preference = NULL,
                          affinity = c("euclidean", "precomputed"),
                          verbose = FALSE,
                          random_state = NULL) {
      # Validate the enumerated choice against the documented options.
      affinity <- rlang::arg_match(affinity)
      # reticulate maps R NULL to Python None; only coerce when a seed is set.
      if (!is.null(random_state)) {
        random_state <- as.integer(random_state)
      }
      # Instantiate the underlying Python estimator, then register it on the
      # wrapper via the parent class.
      py_object <- skl_cluster$AffinityPropagation(
        damping = damping,
        max_iter = as.integer(max_iter),
        convergence_iter = as.integer(convergence_iter),
        copy = copy,
        preference = preference,
        affinity = affinity,
        verbose = verbose,
        random_state = random_state
      )
      super$set_python_class(py_object)
    }
  )
)
#' Performs clustering according to the agglomerative algorithm
#'
#' @description Recursively merges pair of clusters of sample data; uses linkage
#' distance. This is a wrapper around the Python class
#' [sklearn.cluster.AgglomerativeClustering](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html#sklearn.cluster.AgglomerativeClustering).
#'
#' @export
AgglomerativeClustering <- R6::R6Class(
  classname = "AgglomerativeClustering",
  inherit = BaseClustering,
  public = list(
    #' @description The [AgglomerativeClustering] class constructor.
    #'
    #' @param n_clusters An integer value specifying the number of clusters to
    #'   find. It must be `NULL` if `distance_threshold` is not `NULL`.
    #'   Defaults to `2L`.
    #' @param affinity A string specifying the metric used to compute the
    #'   linkage. Can be `"euclidean"`, `"l1"`, `"l2"`, `"manhattan"`,
    #'   `"cosine"` or `"precomputed"`. If `linkage` is `"ward"`, only
    #'   `"euclidean"` is accepted. If `"precomputed"`, a distance matrix
    #'   (instead of a similarity matrix) is needed as input for the `$fit()`
    #'   method. Defaults to `"euclidean"`.
    #' @param memory A string specifying the path to the caching directory.
    #'   Defaults to `NULL` in which case no caching is done.
    #' @param connectivity Either a numeric matrix or an object of class
    #'   [stats::dist] or an object coercible into a function by
    #'   [rlang::as_function()] specifying for each sample the neighboring
    #'   samples following a given structure of the data. This can be a
    #'   connectivity matrix itself or a function that transforms the data into
    #'   a connectivity matrix. Defaults to `NULL`, i.e., the hierarchical
    #'   clustering algorithm is unstructured.
    #' @param compute_full_tree Either a boolean value or the `"auto"` string
    #'   specifying whether to prematurely stop the construction of the tree at
    #'   `n_clusters`. This is useful to decrease computation time if the
    #'   number of clusters is not small compared to the number of samples.
    #'   This option is useful only when specifying a connectivity matrix. Note
    #'   also that when varying the number of clusters and using caching, it
    #'   may be advantageous to compute the full tree. It must be `TRUE` if
    #'   `distance_threshold` is not `NULL`. Defaults to `"auto"`, which is
    #'   equivalent to `TRUE` when `distance_threshold` is not `NULL` or that
    #'   `n_clusters` is inferior to the maximum between `100` and `0.02 *
    #'   n_samples`. Otherwise, `"auto"` is equivalent to `FALSE`.
    #' @param linkage A string specifying which linkage criterion to use. The
    #'   linkage criterion determines which distance to use between sets of
    #'   observation. The algorithm will merge the pairs of cluster that
    #'   minimize this criterion.
    #'   - `ward`: minimizes the variance of the clusters being merged;
    #'   - `average`: uses the average of the distances of each observation of
    #'   the two sets;
    #'   - `complete`: uses the maximum of the distances between all
    #'   observations of the two sets.
    #'   - `single`: uses the minimum of the distances between all observations
    #'   of the two sets.
    #'   Defaults to `"ward"`.
    #' @param distance_threshold A numeric value specifying the linkage
    #'   distance threshold above which clusters will not be merged. If not
    #'   `NULL`, `n_clusters` must be `NULL` and `compute_full_tree` must be
    #'   `TRUE`. Defaults to `NULL`.
    #' @param compute_distances A boolean value specifying whether to compute
    #'   distances between clusters even if `distance_threshold` is not used.
    #'   This can be used to make dendrogram visualization, but introduces a
    #'   computational and memory overhead. Defaults to `FALSE`.
    #'
    #' @return An object of class [AgglomerativeClustering].
    #'
    #' @examplesIf reticulate::py_module_available("sklearn.cluster")
    #' cl <- AgglomerativeClustering$new()
    initialize = function(n_clusters = 2L,
                          affinity = c("euclidean", "l1", "l2", "manhattan",
                                       "cosine", "precomputed"),
                          memory = NULL,
                          connectivity = NULL,
                          compute_full_tree = "auto",
                          linkage = c("ward", "complete", "average", "single"),
                          distance_threshold = NULL,
                          compute_distances = FALSE) {
      # scikit-learn requires `n_clusters=None` when a distance threshold is
      # provided. Coercing unconditionally would turn NULL into integer(0)
      # (which reticulate does NOT translate to Python None), so handle the
      # NULL case explicitly, mirroring FeatureAgglomeration.
      if (!is.null(distance_threshold))
        n_clusters <- NULL
      else
        n_clusters <- as.integer(n_clusters)
      affinity <- rlang::arg_match(affinity)
      if (!is.null(connectivity)) {
        # Accept a plain matrix, a stats::dist object (converted to a full
        # matrix) or anything rlang can coerce to a function.
        connectivity <- if (is.matrix(connectivity))
          connectivity
        else if (inherits(connectivity, "dist"))
          as.matrix(connectivity)
        else
          rlang::as_function(connectivity)
      }
      linkage <- rlang::arg_match(linkage)
      super$set_python_class(
        skl_cluster$AgglomerativeClustering(
          n_clusters = n_clusters,
          affinity = affinity,
          memory = memory,
          connectivity = connectivity,
          compute_full_tree = compute_full_tree,
          linkage = linkage,
          distance_threshold = distance_threshold,
          compute_distances = compute_distances
        )
      )
    }
  )
)
#' Performs clustering according to the Birch algorithm
#'
#' @description
#' It is a memory-efficient, online-learning algorithm provided as an
#' alternative to [MiniBatchKMeans]. It constructs a tree data structure with
#' the cluster centroids being read off the leaf. These can be either the final
#' cluster centroids or can be provided as input to another clustering algorithm
#' such as [AgglomerativeClustering]. This is a wrapper around the Python class
#' [sklearn.cluster.Birch](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.Birch.html#sklearn.cluster.Birch).
#'
#' @references
#' - Tian Zhang, Raghu Ramakrishnan, Maron Livny (1996). *BIRCH: An efficient
#' data clustering method for large databases*,
#' <https://www2.cs.sfu.ca/CourseCentral/459/han/papers/zhang96.pdf>.
#' - Roberto Perdisci J. *Birch - Java implementation of BIRCH clustering
#' algorithm*, <https://code.google.com/archive/p/jbirch>.
#'
#' @export
Birch <- R6::R6Class(
  classname = "Birch",
  inherit = BaseClustering,
  public = list(
    #' @description The [Birch] class constructor.
    #'
    #' @param threshold A numeric value specifying the upper bound of the
    #'   radius of the subcluster obtained by merging a new sample and the
    #'   closest subcluster. Otherwise a new subcluster is started. Setting
    #'   this value to be very low promotes splitting and vice-versa. Defaults
    #'   to `0.5`.
    #' @param branching_factor An integer value specifying the maximum number
    #'   of CF subclusters in each node. If a new sample enters such that the
    #'   number of subclusters exceeds the `branching_factor` then that node is
    #'   splitted into two nodes with the subclusters redistributed in each.
    #'   The parent subcluster of that node is removed and two new subclusters
    #'   are added as parents of the 2 split nodes. Defaults to `50L`.
    #' @param n_clusters Either an integer value or an object of class
    #'   [BaseClustering] specifying the number of clusters after the final
    #'   clustering step, which treats the subclusters from the leaves as new
    #'   samples.
    #'   - `NULL`: the final clustering step is not performed and the
    #'   subclusters are returned as they are;
    #'   - an object of class [BaseClustering]: the model is fit treating
    #'   the subclusters as new samples and the initial data is mapped to the
    #'   label of the closest subcluster;
    #'   - integer value: the model fit is [AgglomerativeClustering] with
    #'   `n_clusters` set to be equal to the integer value.
    #'   Defaults to `3L`.
    #' @param compute_labels A boolean value specifying whether to compute
    #'   labels for each fit. Defaults to `TRUE`.
    #' @param copy A boolean value specifying whether to make a copy of the
    #'   given data. If set to `FALSE`, the initial data will be overwritten.
    #'   Defaults to `TRUE`.
    #'
    #' @return An object of class [Birch].
    #'
    #' @examplesIf reticulate::py_module_available("sklearn.cluster")
    #' cl <- Birch$new()
    initialize = function(threshold = 0.5,
                          branching_factor = 50L,
                          n_clusters = 3L,
                          compute_labels = TRUE,
                          copy = TRUE) {
      branching_factor <- as.integer(branching_factor)
      if (!is.null(n_clusters)) {
        # A BaseClustering wrapper is unwrapped to its Python estimator so
        # sklearn can use it for the final clustering step; any other value is
        # interpreted as a cluster count.
        if (inherits(n_clusters, "BaseClustering"))
          n_clusters <- n_clusters$get_python_class()
        else
          n_clusters <- as.integer(n_clusters)
      }
      super$set_python_class(
        skl_cluster$Birch(
          threshold = threshold,
          branching_factor = branching_factor,
          n_clusters = n_clusters,
          compute_labels = compute_labels,
          copy = copy
        )
      )
    }
  )
)
#' Performs clustering according to the DBSCAN algorithm
#'
#' @description
#' DBSCAN - Density-Based Spatial Clustering of Applications with Noise. Finds
#' core samples of high density and expands clusters from them. Good for data
#' which contains clusters of similar density. This is a wrapper around the
#' Python class
#' [sklearn.cluster.DBSCAN](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html#sklearn.cluster.DBSCAN).
#'
#' @references
#' - Ester, M., H. P. Kriegel, J. Sander, and X. Xu (1996). *A Density-Based
#' Algorithm for Discovering Clusters in Large Spatial Databases with Noise*,
#' In: Proceedings of the 2nd International Conference on Knowledge Discovery
#' and Data Mining, Portland, OR, AAAI Press, pp. 226-231.
#' - Schubert, E., Sander, J., Ester, M., Kriegel, H. P., & Xu, X. (2017).
#' *DBSCAN revisited, revisited: why and how you should (still) use DBSCAN*, ACM
#' Transactions on Database Systems (TODS), **42**(3), p. 19.
#'
#' @export
DBSCAN <- R6::R6Class(
  classname = "DBSCAN",
  inherit = BaseClustering,
  public = list(
    #' @description The [DBSCAN] class constructor.
    #'
    #' @param eps A numeric value specifying the maximum distance between two
    #'   samples for one to be considered as in the neighborhood of the other.
    #'   This is not a maximum bound on the distances of points within a
    #'   cluster. This is the most important DBSCAN parameter to choose
    #'   appropriately for your data set and distance function. Defaults to
    #'   `0.5`.
    #' @param min_samples An integer value specifying the number of samples (or
    #'   total weight) in a neighborhood for a point to be considered as a core
    #'   point. This includes the point itself. Defaults to `5L`.
    #' @param metric Either a string or an object coercible into a function via
    #'   [rlang::as_function()] specifying the metric to use when calculating
    #'   distance between instances in a feature array. If `metric` is a
    #'   string, it must be one of the options allowed by
    #'   [sklearn.metrics.pairwise_distances](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise_distances.html#sklearn.metrics.pairwise_distances)
    #'   for its `metric` parameter. If `metric` is `"precomputed"`, `X` is
    #'   assumed to be a distance matrix and must be square. `X` may be a
    #'   sparse graph, in which case only *nonzero* elements may be considered
    #'   neighbors for DBSCAN. Defaults to `"euclidean"`.
    #' @param metric_params A named list specifying additional parameters to be
    #'   passed on to the metric function. Defaults to `NULL`.
    #' @param algorithm A string specifying the algorithm to be used by the
    #'   [sklearn.neighbors.NearestNeighbors](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.neighbors)
    #'   module to compute pointwise distances and find nearest neighbors.
    #'   Choices are `"auto"`, `"ball_tree"`, `"kd_tree"` or `"brute"`.
    #'   Defaults to `"auto"`.
    #' @param leaf_size An integer value specifying the leaf size passed to
    #'   [sklearn.neighbors.BallTree](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.BallTree.html)
    #'   or
    #'   [sklearn.neighbors.KDTree](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KDTree.html#sklearn.neighbors.KDTree).
    #'   This can affect the speed of the construction and query, as well as
    #'   the memory required to store the tree. The optimal value depends on
    #'   the nature of the problem. Defaults to `30L`.
    #' @param p An integer value specifying the power of the Minkowski metric
    #'   to be used to calculate distance between points. Defaults to `2L`.
    #' @param n_jobs An integer value specifying the number of parallel jobs to
    #'   run. Defaults to `1L`.
    #'
    #' @return An object of class [DBSCAN].
    #'
    #' @examplesIf reticulate::py_module_available("sklearn.cluster")
    #' cl <- DBSCAN$new()
    initialize = function(eps = 0.5,
                          min_samples = 5L,
                          metric = "euclidean",
                          metric_params = NULL,
                          algorithm = c("auto", "ball_tree", "kd_tree", "brute"),
                          leaf_size = 30L,
                          p = 2L,
                          n_jobs = 1L) {
      min_samples <- as.integer(min_samples)
      # A string is forwarded as a sklearn metric name; anything else must be
      # coercible into an R function.
      if (!is.character(metric)) {
        metric <- rlang::as_function(metric)
      }
      # Extra metric parameters are only meaningful as a named list.
      if (!is.null(metric_params) && !rlang::is_named(metric_params)) {
        cli::cli_abort("When provided, the {.arg metric_params} must be a named
                       list.")
      }
      algorithm <- rlang::arg_match(algorithm)
      # Build the Python estimator, coercing the remaining integer-valued
      # arguments at call time, then register it on the wrapper.
      py_object <- skl_cluster$DBSCAN(
        eps = eps,
        min_samples = min_samples,
        metric = metric,
        metric_params = metric_params,
        algorithm = algorithm,
        leaf_size = as.integer(leaf_size),
        p = as.integer(p),
        n_jobs = as.integer(n_jobs)
      )
      super$set_python_class(py_object)
    }
  )
)
#' Performs clustering according to the feature agglomeration algorithm
#'
#' @description
#' Recursively merges pair of clusters of features. This is a wrapper around the
#' Python class
#' [sklearn.cluster.FeatureAgglomeration](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.FeatureAgglomeration.html#sklearn.cluster.FeatureAgglomeration).
#'
#' @export
FeatureAgglomeration <- R6::R6Class(
  classname = "FeatureAgglomeration",
  inherit = BaseClustering,
  public = list(
    #' @description The [FeatureAgglomeration] class constructor.
    #'
    #' @param n_clusters An integer value specifying the number of clusters to
    #'   find. Defaults to `2L`.
    #' @param affinity A string or an object coercible into a function via
    #'   [rlang::as_function()] specifying the metric used to compute the
    #'   linkage. If a string, choices are `"euclidean"`, `"l1"`, `"l2"`,
    #'   `"manhattan"`, `"cosine"` or `"precomputed"`. If linkage is `"ward"`,
    #'   only `"euclidean"` is accepted. Defaults to `"euclidean"`.
    #' @param memory A string specifying path to the caching directory for
    #'   storing the computation of the tree. Defaults to `NULL` in which case
    #'   no caching is done.
    #' @param connectivity A numeric matrix or an object coercible into a
    #'   function via [rlang::as_function()] specifying the connectivity
    #'   matrix. Defines for each feature the neighboring features following a
    #'   given structure of the data. This can be a connectivity matrix itself
    #'   or a function that transforms the data into a connectivity matrix,
    #'   such as derived from
    #'   [sklearn.neighbors.kneighbors_graph()](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.kneighbors_graph.html#sklearn.neighbors.kneighbors_graph).
    #'   Defaults to `NULL` in which case the hierarchical clustering algorithm
    #'   is unstructured.
    #' @param compute_full_tree The string `"auto"` or a boolean value
    #'   specifying whether to stop early the construction of the tree at
    #'   `n_clusters`. This is useful to decrease computation time if the
    #'   number of clusters is not small compared to the number of features.
    #'   This option is useful only when specifying a connectivity matrix. Note
    #'   also that when varying the number of clusters and using caching, it
    #'   may be advantageous to compute the full tree. It must be `TRUE` if
    #'   `distance_threshold` is not `NULL`. Defaults to `"auto"`, which is
    #'   equivalent to `TRUE` when `distance_threshold` is not `NULL` or when
    #'   `n_clusters` is inferior to `max(100, 0.02 * n_samples)` and to
    #'   `FALSE` otherwise.
    #' @param linkage A string specifying which linkage criterion to use. The
    #'   linkage criterion determines which distance to use between sets of
    #'   features. The algorithm will merge the pairs of cluster that minimize
    #'   this criterion:
    #'   - `"ward"`: minimizes the variance of the clusters being merged;
    #'   - `"complete"`: maximum linkage uses the maximum distances between all
    #'   features of the two sets;
    #'   - `"average"`: uses the average of the distances of each feature of
    #'   the two sets;
    #'   - `"single"`: uses the minimum of the distances between all features
    #'   of the two sets.
    #' @param pooling_func An object coercible into a function via
    #'   [rlang::as_function()] specifying the aggregation method to combine
    #'   the values of agglomerated features into a single value. It should
    #'   take as input an array of shape \eqn{M \times N} and the optional
    #'   argument `axis = 1`, and reduce it to an array of shape \eqn{M}.
    #'   Defaults to [base::rowMeans].
    #' @param distance_threshold A numeric value specifying the linkage
    #'   distance threshold above which clusters will not be merged. If not
    #'   `NULL`, `n_clusters` must be `NULL` and `compute_full_tree` must be
    #'   `TRUE`. Defaults to `NULL`.
    #' @param compute_distances A boolean value specifying whether to compute
    #'   distances between clusters even if `distance_threshold` is not used.
    #'   This can be used to make dendrogram visualization, but introduces a
    #'   computational and memory overhead. Defaults to `FALSE`.
    #'
    #' @return An object of class [FeatureAgglomeration].
    #'
    #' @examplesIf reticulate::py_module_available("sklearn.cluster")
    #' cl <- FeatureAgglomeration$new()
    initialize = function(n_clusters = 2L,
                          affinity = c("euclidean", "l1", "l2", "manhattan",
                                       "cosine", "precomputed"),
                          memory = NULL,
                          connectivity = NULL,
                          compute_full_tree = "auto",
                          linkage = c("ward", "complete", "average", "single"),
                          pooling_func = rowMeans,
                          distance_threshold = NULL,
                          compute_distances = FALSE) {
      # sklearn requires n_clusters=None whenever a distance threshold is set.
      n_clusters <- if (!is.null(distance_threshold))
        NULL
      else
        as.integer(n_clusters)
      # A string must match one of the documented metrics; anything else is
      # treated as a user-supplied metric function.
      affinity <- if (is.character(affinity))
        rlang::arg_match(affinity)
      else
        rlang::as_function(affinity)
      # Non-matrix connectivity inputs are coerced into functions.
      if (!is.null(connectivity) && !is.matrix(connectivity)) {
        connectivity <- rlang::as_function(connectivity)
      }
      linkage <- rlang::arg_match(linkage)
      py_object <- skl_cluster$FeatureAgglomeration(
        n_clusters = n_clusters,
        affinity = affinity,
        memory = memory,
        connectivity = connectivity,
        compute_full_tree = compute_full_tree,
        linkage = linkage,
        pooling_func = pooling_func,
        distance_threshold = distance_threshold,
        compute_distances = compute_distances
      )
      super$set_python_class(py_object)
    }
  )
)
#' Performs clustering according to the k-means algorithm
#'
#' @description
#' This is a wrapper around the Python class
#' [sklearn.cluster.KMeans](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn.cluster.KMeans).
#'
#' @export
KMeans <- R6::R6Class(
  classname = "KMeans",
  inherit = BaseClustering,
  public = list(
    #' @description The [KMeans] class constructor.
    #'
    #' @param n_clusters An integer value specifying the number of clusters to
    #'   form as well as the number of centroids to generate. Defaults to `2L`.
    #' @param init Either a string or a numeric matrix of shape
    #'   \eqn{\mathrm{n_{clusters}} \times \mathrm{n_{features}}} specifying
    #'   the method for initialization. If a string, choices are:
    #'   - `"k-means++"`: selects initial cluster centroids using sampling
    #'   based on an empirical probability distribution of the points’
    #'   contribution to the overall inertia. This technique speeds up
    #'   convergence, and is theoretically proven to be
    #'   \eqn{\mathcal{O}(\log(k))}-optimal. See the description of `n_init`
    #'   for more details;
    #'   - `"random"`: chooses `n_clusters` observations (rows) at random from
    #'   data for the initial centroids.
    #'
    #'   Defaults to `"k-means++"`.
    #' @param n_init An integer value specifying the number of times the
    #'   k-means algorithm will be run with different centroid seeds. The final
    #'   results will be the best output of `n_init` consecutive runs in terms
    #'   of inertia. Defaults to `10L`.
    #' @param max_iter An integer value specifying the maximum number of
    #'   iterations of the k-means algorithm for a single run. Defaults to
    #'   `300L`.
    #' @param tol A numeric value specifying the relative tolerance with
    #'   regards to Frobenius norm of the difference in the cluster centers of
    #'   two consecutive iterations to declare convergence. Defaults to `1e-4`.
    #' @param verbose An integer value specifying the level of verbosity.
    #'   Defaults to `0L` which is equivalent to no verbose.
    #' @param random_state An integer value specifying the initial seed of the
    #'   random number generator. Defaults to `NULL` which uses the current
    #'   timestamp.
    #' @param copy_x A boolean value specifying whether the original data is to
    #'   be modified. When pre-computing distances it is more numerically
    #'   accurate to center the data first. If `copy_x` is `TRUE`, then the
    #'   original data is not modified. If `copy_x` is `FALSE`, the original
    #'   data is modified, and put back before the function returns, but small
    #'   numerical differences may be introduced by subtracting and then adding
    #'   the data mean. Note that if the original data is not C-contiguous, a
    #'   copy will be made even if `copy_x` is `FALSE`. If the original data is
    #'   sparse, but not in CSR format, a copy will be made even if `copy_x` is
    #'   `FALSE`. Defaults to `TRUE`.
    #' @param algorithm A string specifying the k-means algorithm to use. The
    #'   classical EM-style algorithm is `"lloyd"`. The `"elkan"` variation can
    #'   be more efficient on some datasets with well-defined clusters, by
    #'   using the triangle inequality. However it’s more memory-intensive due
    #'   to the allocation of an extra array of shape
    #'   \eqn{\mathrm{n_{samples}} \times \mathrm{n_{clusters}}}. Defaults to
    #'   `"lloyd"`.
    #'
    #' @return An object of class [KMeans].
    #'
    #' @examplesIf reticulate::py_module_available("sklearn.cluster")
    #' cl <- KMeans$new()
    initialize = function(n_clusters = 2L,
                          init = c("k-means++", "random"),
                          n_init = 10L,
                          max_iter = 300L,
                          tol = 1e-4,
                          verbose = 0L,
                          random_state = NULL,
                          copy_x = TRUE,
                          algorithm = c("lloyd", "elkan")) {
      n_clusters <- as.integer(n_clusters)
      # `init` is either one of the documented strategy names or a matrix of
      # starting centroids with one row per requested cluster.
      if (is.character(init)) {
        init <- rlang::arg_match(init)
      } else if (!(is.matrix(init) && nrow(init) == n_clusters)) {
        cli::cli_abort('The argument {.arg init} should be either one of
                       {.field "k-means++"} or {.field "random"} or a numeric
                       matrix with {n_clusters} row{?s}.')
      }
      n_init <- as.integer(n_init)
      max_iter <- as.integer(max_iter)
      # NULL is forwarded as Python None (seed from current time).
      if (!is.null(random_state)) {
        random_state <- as.integer(random_state)
      }
      algorithm <- rlang::arg_match(algorithm)
      py_object <- skl_cluster$KMeans(
        n_clusters = n_clusters,
        init = init,
        n_init = n_init,
        max_iter = max_iter,
        tol = tol,
        verbose = verbose,
        random_state = random_state,
        copy_x = copy_x,
        algorithm = algorithm
      )
      super$set_python_class(py_object)
    }
  )
)
#' Performs clustering according to the bisecting k-means algorithm
#'
#' @description
#' This is a wrapper around the Python class
#' [sklearn.cluster.BisectingKMeans](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.BisectingKMeans.html#sklearn.cluster.BisectingKMeans).
#'
#' @export
BisectingKMeans <- R6::R6Class(
  classname = "BisectingKMeans",
  inherit = BaseClustering,
  public = list(
    #' @description The [BisectingKMeans] class constructor.
    #'
    #' @param n_clusters An integer value specifying the number of clusters to
    #'   form as well as the number of centroids to generate. Defaults to `2L`.
    #' @param init Either a string or a numeric matrix of shape
    #'   \eqn{\mathrm{n_clusters} \times \mathrm{n_features}} specifying the
    #'   method for initialization. If a string, choices are:
    #'   - `"k-means++"`: selects initial cluster centroids using sampling
    #'   based on an empirical probability distribution of the points’
    #'   contribution to the overall inertia. This technique speeds up
    #'   convergence, and is theoretically proven to be
    #'   \eqn{\mathcal{O}(\log(k))}-optimal. See the description of `n_init`
    #'   for more details;
    #'   - `"random"`: chooses `n_clusters` observations (rows) at random from
    #'   data for the initial centroids.
    #'   Defaults to `"k-means++"`.
    #' @param n_init An integer value specifying the number of times the
    #'   k-means algorithm will be run with different centroid seeds. The final
    #'   results will be the best output of `n_init` consecutive runs in terms
    #'   of inertia. Defaults to `10L`.
    #' @param max_iter An integer value specifying the maximum number of
    #'   iterations of the k-means algorithm for a single run. Defaults to
    #'   `300L`.
    #' @param tol A numeric value specifying the relative tolerance with
    #'   regards to Frobenius norm of the difference in the cluster centers of
    #'   two consecutive iterations to declare convergence. Defaults to `1e-4`.
    #' @param verbose An integer value specifying the level of verbosity.
    #'   Defaults to `0L` which is equivalent to no verbose.
    #' @param random_state An integer value specifying the initial seed of the
    #'   random number generator. Defaults to `NULL` which uses the current
    #'   timestamp.
    #' @param copy_x A boolean value specifying whether the original data is to
    #'   be modified. When pre-computing distances it is more numerically
    #'   accurate to center the data first. If `copy_x` is `TRUE`, then the
    #'   original data is not modified. If `copy_x` is `FALSE`, the original
    #'   data is modified, and put back before the function returns, but small
    #'   numerical differences may be introduced by subtracting and then adding
    #'   the data mean. Note that if the original data is not C-contiguous, a
    #'   copy will be made even if `copy_x` is `FALSE`. If the original data is
    #'   sparse, but not in CSR format, a copy will be made even if `copy_x` is
    #'   `FALSE`. Defaults to `TRUE`.
    #' @param algorithm A string specifying the k-means algorithm to use. The
    #'   classical EM-style algorithm is `"lloyd"`. The `"elkan"` variation can
    #'   be more efficient on some datasets with well-defined clusters, by
    #'   using the triangle inequality. However it’s more memory-intensive due
    #'   to the allocation of an extra array of shape
    #'   \eqn{\mathrm{n_samples} \times \mathrm{n_clusters}}. Defaults to
    #'   `"lloyd"`.
    #' @param bisecting_strategy A string specifying how bisection should be
    #'   performed. Choices are:
    #'   - `"biggest_inertia"`: means that it will always check all calculated
    #'   cluster for cluster with biggest SSE (Sum of squared errors) and
    #'   bisect it. This approach concentrates on precision, but may be costly
    #'   in terms of execution time (especially for larger amount of data
    #'   points).
    #'   - `"largest_cluster"`: means that it will always split cluster with
    #'   largest amount of points assigned to it from all clusters previously
    #'   calculated. That should work faster than picking by SSE and may
    #'   produce similar results in most cases.
    #'   Defaults to `"biggest_inertia"`.
    #'
    #' @return An object of class [BisectingKMeans].
    #'
    #' @examplesIf reticulate::py_module_available("sklearn.cluster") && getRversion() >= "4.2"
    #' cl <- BisectingKMeans$new()
    initialize = function(n_clusters = 2L,
                          init = c("k-means++", "random"),
                          n_init = 10L,
                          max_iter = 300L,
                          tol = 1e-4,
                          verbose = 0L,
                          random_state = NULL,
                          copy_x = TRUE,
                          algorithm = c("lloyd", "elkan"),
                          bisecting_strategy = c("biggest_inertia",
                                                 "largest_cluster")) {
      # Coerce integer-valued arguments explicitly (consistent with KMeans):
      # reticulate would otherwise send R doubles as Python floats, which
      # sklearn's integral-parameter validation rejects.
      n_clusters <- as.integer(n_clusters)
      if (is.character(init))
        init <- rlang::arg_match(init)
      else if (!is.matrix(init) || dim(init)[1] != n_clusters)
        cli::cli_abort('The argument {.arg init} should be either one of
                       {.field "k-means++"} or {.field "random"} or a numeric
                       matrix with {n_clusters} row{?s}.')
      n_init <- as.integer(n_init)
      max_iter <- as.integer(max_iter)
      if (!is.null(random_state))
        random_state <- as.integer(random_state)
      algorithm <- rlang::arg_match(algorithm)
      bisecting_strategy <- rlang::arg_match(bisecting_strategy)
      super$set_python_class(
        skl_cluster$BisectingKMeans(
          n_clusters = n_clusters,
          init = init,
          n_init = n_init,
          max_iter = max_iter,
          tol = tol,
          verbose = verbose,
          random_state = random_state,
          copy_x = copy_x,
          algorithm = algorithm,
          bisecting_strategy = bisecting_strategy
        )
      )
    }
  )
)
#' Performs clustering according to the mini-batch k-means algorithm
#'
#' @description
#' This is a wrapper around the Python class
#' [sklearn.cluster.MiniBatchKMeans](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.MiniBatchKMeans.html#sklearn.cluster.MiniBatchKMeans).
#'
#' @export
MiniBatchKMeans <- R6::R6Class(
  classname = "MiniBatchKMeans",
  inherit = BaseClustering,
  public = list(
    #' @description The [MiniBatchKMeans] class constructor.
    #'
    #' @param n_clusters An integer value specifying the number of clusters to
    #'   form as well as the number of centroids to generate. Defaults to `2L`.
    #' @param init Either a string or a numeric matrix of shape
    #'   \eqn{\mathrm{n_clusters} \times \mathrm{n_features}} specifying the
    #'   method for initialization. If a string, choices are:
    #'   - `"k-means++"`: selects initial cluster centroids using sampling based
    #'   on an empirical probability distribution of the points’ contribution to
    #'   the overall inertia. This technique speeds up convergence, and is
    #'   theoretically proven to be \eqn{\mathcal{O}(\log(k))}-optimal. See the
    #'   description of `n_init` for more details;
    #'   - `"random"`: chooses `n_clusters` observations (rows) at random from
    #'   data for the initial centroids.
    #'   Defaults to `"k-means++"`.
    #' @param n_init An integer value specifying the number of times the k-means
    #'   algorithm will be run with different centroid seeds. The final results
    #'   will be the best output of `n_init` consecutive runs in terms of
    #'   inertia. Defaults to `10L`.
    #' @param max_iter An integer value specifying the maximum number of
    #'   iterations of the k-means algorithm for a single run. Defaults to
    #'   `300L`.
    #' @param tol A numeric value specifying the relative tolerance with regards
    #'   to Frobenius norm of the difference in the cluster centers of two
    #'   consecutive iterations to declare convergence. Defaults to `1e-4`.
    #' @param verbose An integer value specifying the level of verbosity.
    #'   Defaults to `0L` which is equivalent to no verbose.
    #' @param random_state An integer value specifying the initial seed of the
    #'   random number generator. Defaults to `NULL` which uses the current
    #'   timestamp.
    #' @param batch_size An integer value specifying the size of the
    #'   mini-batches. For faster computations, you can set the `batch_size`
    #'   greater than 256 * number of cores to enable parallelism on all cores.
    #'   Defaults to `1024L`.
    #' @param compute_labels A boolean value specifying whether to compute label
    #'   assignment and inertia for the complete dataset once the minibatch
    #'   optimization has converged in fit. Defaults to `TRUE`.
    #' @param max_no_improvement An integer value specifying how many
    #'   consecutive mini batches that does not yield an improvement on the
    #'   smoothed inertia should be used to call off the algorithm. To disable
    #'   convergence detection based on inertia, set `max_no_improvement` to
    #'   `NULL`. Defaults to `10L`.
    #' @param init_size An integer value specifying the number of samples to
    #'   randomly sample for speeding up the initialization (sometimes at the
    #'   expense of accuracy): the only algorithm is initialized by running a
    #'   batch [KMeans] on a random subset of the data. This needs to be larger
    #'   than `n_clusters`. If `NULL`, the heuristic is `init_size = 3 *
    #'   batch_size` if `3 * batch_size < n_clusters`, else `init_size = 3 *
    #'   n_clusters`. Defaults to `NULL`.
    #' @param reassignment_ratio A numeric value specifying the fraction of the
    #'   maximum number of counts for a center to be reassigned. A higher value
    #'   means that low count centers are more easily reassigned, which means
    #'   that the model will take longer to converge, but should converge in a
    #'   better clustering. However, too high a value may cause convergence
    #'   issues, especially with a small batch size. Defaults to `0.01`.
    #'
    #' @return An object of class [MiniBatchKMeans].
    #'
    #' @examplesIf reticulate::py_module_available("sklearn.cluster")
    #' cl <- MiniBatchKMeans$new()
    initialize = function(n_clusters = 2L,
                          init = c("k-means++", "random"),
                          n_init = 10L,
                          max_iter = 300L,
                          tol = 1e-4,
                          verbose = 0L,
                          random_state = NULL,
                          batch_size = 1024L,
                          compute_labels = TRUE,
                          max_no_improvement = 10L,
                          init_size = NULL,
                          reassignment_ratio = 0.01) {
      # Coerce integer-valued arguments so reticulate passes Python ints:
      # scikit-learn's parameter validation rejects doubles for these.
      n_clusters <- as.integer(n_clusters)
      # `init` is either an enumerated string or a matrix of initial
      # centroids with one row per requested cluster.
      if (is.character(init))
        init <- rlang::arg_match(init)
      else if (!is.matrix(init) || nrow(init) != n_clusters)
        cli::cli_abort('The argument {.arg init} should be either one of
                       {.field "k-means++"} or {.field "random"} or a numeric
                       matrix with {n_clusters} row{?s}.')
      n_init <- as.integer(n_init)
      max_iter <- as.integer(max_iter)
      verbose <- as.integer(verbose)
      if (!is.null(random_state))
        random_state <- as.integer(random_state)
      batch_size <- as.integer(batch_size)
      # `NULL` disables inertia-based convergence detection on the Python
      # side, so only coerce when a value was supplied.
      if (!is.null(max_no_improvement))
        max_no_improvement <- as.integer(max_no_improvement)
      if (!is.null(init_size)) init_size <- as.integer(init_size)
      # Instantiate the wrapped Python estimator and register it on the
      # SKLearnClass parent.
      super$set_python_class(
        skl_cluster$MiniBatchKMeans(
          n_clusters = n_clusters,
          init = init,
          n_init = n_init,
          max_iter = max_iter,
          tol = tol,
          verbose = verbose,
          random_state = random_state,
          batch_size = batch_size,
          compute_labels = compute_labels,
          max_no_improvement = max_no_improvement,
          init_size = init_size,
          reassignment_ratio = reassignment_ratio
        )
      )
    }
  )
)
#' Performs clustering according to the mean shift algorithm
#'
#' @description
#' This is a wrapper around the Python class
#' [sklearn.cluster.MeanShift](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.MeanShift.html#sklearn.cluster.MeanShift).
#'
#' @export
MeanShift <- R6::R6Class(
  classname = "MeanShift",
  inherit = BaseClustering,
  public = list(
    #' @description The [MeanShift] class constructor.
    #'
    #' @param bandwidth A numeric value specifying the bandwidth of the RBF
    #'   kernel. When `NULL`, the bandwidth is estimated via
    #'   [sklearn.cluster.estimate_bandwidth()](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.estimate_bandwidth.html).
    #'   Defaults to `NULL`.
    #' @param seeds A numeric matrix of shape \eqn{n_\mathrm{samples} \times
    #'   n_\mathrm{features}} specifying the seeds used to initialize the
    #'   kernels. When `NULL`, seeds are computed by
    #'   `sklearn.cluster.get_bin_seeds()` with the bandwidth as grid size and
    #'   default values for the other parameters. Defaults to `NULL`.
    #' @param bin_seeding A boolean value specifying whether initial kernel
    #'   locations should be the locations of a discretized version of the
    #'   points rather than the points themselves: points are binned onto a
    #'   grid whose coarseness corresponds to the bandwidth. Setting this to
    #'   `TRUE` speeds up the algorithm since fewer seeds are initialized.
    #'   Ignored when `seeds` is not `NULL`. Defaults to `FALSE`.
    #' @param min_bin_freq An integer value specifying the minimal size of
    #'   bins: to speed up the algorithm, only bins holding at least
    #'   `min_bin_freq` points are accepted as seeds. Defaults to `1L`.
    #' @param cluster_all A boolean value specifying whether every point gets
    #'   clustered, including orphans that fall within no kernel, which are
    #'   then assigned to the nearest kernel. When `FALSE`, orphans receive
    #'   the cluster label `-1`. Defaults to `TRUE`.
    #' @param n_jobs An integer value specifying the number of jobs used for
    #'   the computation. This works by computing each of the `n_init` runs in
    #'   parallel. Defaults to `1L`.
    #' @param max_iter An integer value specifying the maximum number of
    #'   iterations per seed point before the clustering operation terminates
    #'   (for that seed point) if it has not yet converged. Defaults to `300L`.
    #'
    #' @return An object of class [MeanShift].
    #'
    #' @examplesIf reticulate::py_module_available("sklearn.cluster")
    #' cl <- MeanShift$new()
    initialize = function(bandwidth = NULL,
                          seeds = NULL,
                          bin_seeding = FALSE,
                          min_bin_freq = 1L,
                          cluster_all = TRUE,
                          n_jobs = 1L,
                          max_iter = 300L) {
      # Integer-valued arguments must reach Python as ints.
      int_args <- list(
        min_bin_freq = as.integer(min_bin_freq),
        n_jobs = as.integer(n_jobs),
        max_iter = as.integer(max_iter)
      )
      # Assemble all keyword arguments and instantiate the wrapped Python
      # estimator on the SKLearnClass parent.
      all_args <- c(
        list(
          bandwidth = bandwidth,
          seeds = seeds,
          bin_seeding = bin_seeding,
          cluster_all = cluster_all
        ),
        int_args
      )
      super$set_python_class(do.call(skl_cluster$MeanShift, all_args))
    }
  )
)
#' Performs clustering according to the OPTICS algorithm
#'
#' @description
#' This is a wrapper around the Python class
#' [sklearn.cluster.OPTICS](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.OPTICS.html#sklearn.cluster.OPTICS).
#'
#' @export
OPTICS <- R6::R6Class(
  classname = "OPTICS",
  inherit = BaseClustering,
  public = list(
    #' @description The [OPTICS] class constructor.
    #'
    #' @param min_samples Either an integer value greater than 1 or a numeric
    #'   value between 0 and 1 specifying the number of samples in a
    #'   neighborhood for a point to be considered as a core point. Also, up and
    #'   down steep regions can’t have more than `min_samples` consecutive
    #'   non-steep points. Expressed as an absolute number or a fraction of the
    #'   number of samples (rounded to be at least 2). Defaults to `5L`.
    #' @param max_eps A numeric value specifying the maximum distance between
    #'   two samples for one to be considered as in the neighborhood of the
    #'   other. Reducing `max_eps` will result in shorter run times. Defaults to
    #'   `Inf`.
    #' @param metric Either a string or an object coercible into a function via
    #'   [rlang::as_function()] specifying the metric to use for distance
    #'   computation. If `metric` is a function, it is called on each pair of
    #'   instances (rows) and the resulting value recorded. The function should
    #'   take two numeric vectors as input and return one numeric value
    #'   indicating the distance between them. This works for Scipy’s metrics,
    #'   but is less efficient than passing the metric name as a string. If
    #'   metric is `"precomputed"`, `X` is assumed to be a distance matrix and
    #'   must be square. Valid string values for metric are:
    #'
    #'   - from
    #'   [**sklearn.metrics**](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics):
    #'   `"cityblock"`, `"cosine"`, `"euclidean"`, `"l1"`, `"l2"`,
    #'   `"manhattan"`;
    #'   - from
    #'   [**scipy.spatial.distance**](https://docs.scipy.org/doc/scipy/reference/spatial.distance.html):
    #'   `"braycurtis"`, `"canberra"`, `"chebyshev"`, `"correlation"`, `"dice"`,
    #'   `"hamming"`, `"jaccard"`, `"kulsinski"`, `"mahalanobis"`,
    #'   `"minkowski"`, `"rogerstanimoto"`, `"russellrao"`, `"seuclidean"`,
    #'   `"sokalmichener"`, `"sokalsneath"`, `"sqeuclidean"`, `"yule"`.
    #'
    #'   Defaults to `"minkowski"`.
    #' @param p An integer value specifying the power for the Minkowski metric.
    #'   When `p = 1`, this is equivalent to using the Manhattan distance
    #'   (\eqn{\ell_1}). When `p = 2`, this is equivalent to using the Euclidean
    #'   distance (\eqn{\ell_2}). For arbitrary \eqn{p}, the Minkowski distance
    #'   (\eqn{\ell_p}) is used. Defaults to `2L`.
    #' @param metric_params A named list specifying additional arguments for the
    #'   metric function. Defaults to `NULL`.
    #' @param cluster_method A string specifying the extraction method used to
    #'   extract clusters using the calculated reachability and ordering.
    #'   Possible values are `"xi"` and `"dbscan"`. Defaults to `"xi"`.
    #' @param eps A numeric value specifying the maximum distance between two
    #'   samples for one to be considered as in the neighborhood of the other.
    #'   Defaults to `max_eps`. Used only when `cluster_method == "dbscan"`.
    #' @param xi A numeric value in \eqn{[0,1]} specifying the minimum steepness
    #'   on the reachability plot that constitutes a cluster boundary. For
    #'   example, an upwards point in the reachability plot is defined by the
    #'   ratio from one point to its successor being at most `1 - xi`. Used only
    #'   when `cluster_method == "xi"`. Defaults to `0.05`.
    #' @param predecessor_correction A boolean value specifying whether to
    #'   correct clusters according to the predecessors calculated by OPTICS
    #'   \insertCite{schubert2018improving}{rgudhi}. This parameter has minimal
    #'   effect on most data sets. Used only when `cluster_method == "xi"`.
    #'   Defaults to `TRUE`.
    #' @param min_cluster_size Either an integer value \eqn{> 1} or a numeric
    #'   value in \eqn{[0,1]} specifying the minimum number of samples in an
    #'   OPTICS cluster, expressed as an absolute number or a fraction of the
    #'   number of samples (rounded to be at least 2). If `NULL`, the value of
    #'   `min_samples` is used instead. Used only when `cluster_method == "xi"`.
    #'   Defaults to `NULL`.
    #' @param algorithm A string specifying the algorithm used to compute the
    #'   nearest neighbors. Choices are `c("auto", "ball_tree", "kd_tree",
    #'   "brute")`. Defaults to `"auto"` which will attempt to decide the most
    #'   appropriate algorithm based on the values passed to fit method. Note:
    #'   fitting on sparse input will override the setting of this parameter,
    #'   using `algorithm == "brute"`.
    #' @param leaf_size An integer value specifying the leaf size passed to
    #'   `BallTree` or `KDTree`. This can affect the speed of the construction
    #'   and query, as well as the memory required to store the tree. The
    #'   optimal value depends on the nature of the problem. Defaults to `30L`.
    #' @param memory A string specifying the path to the caching directory into
    #'   which caching the output of the computation of the tree. Defaults to
    #'   `NULL` in which case no caching is done.
    #' @param n_jobs An integer value specifying the number of parallel jobs to
    #'   run for neighbors search. Defaults to `1L`. A value of `-1L` means
    #'   using all processors.
    #'
    #' @return An object of class [OPTICS].
    #'
    #' ## References
    #' \insertAllCited{}
    #'
    #' @examplesIf reticulate::py_module_available("sklearn.cluster") && getRversion() >= "4.2"
    #' cl <- OPTICS$new()
    initialize = function(min_samples = 5L,
                          max_eps = Inf,
                          metric = c(
                            "minkowski", "cityblock", "cosine", "euclidean",
                            "l1", "l2", "manhattan", "braycurtis", "canberra",
                            "chebyshev", "correlation", "dice", "hamming",
                            "jaccard", "kulsinski", "mahalanobis",
                            "rogerstanimoto", "russellrao", "seuclidean",
                            "sokalmichener", "sokalsneath", "sqeuclidean",
                            "yule", "precomputed"),
                          p = 2L,
                          metric_params = NULL,
                          cluster_method = c("xi", "dbscan"),
                          eps = NULL,
                          xi = 0.05,
                          predecessor_correction = TRUE,
                          min_cluster_size = NULL,
                          algorithm = c("auto", "ball_tree", "kd_tree", "brute"),
                          leaf_size = 30L,
                          memory = NULL,
                          n_jobs = 1L) {
      # A value in (0, 1] is a fraction of the sample size and must stay a
      # double; only absolute counts are coerced to integer.
      if (min_samples > 1) min_samples <- as.integer(min_samples)
      # `metric` is either an enumerated string (including "precomputed",
      # which tells sklearn that `X` is a distance matrix) or a custom
      # distance function.
      if (is.character(metric))
        metric <- rlang::arg_match(metric)
      else
        metric <- rlang::as_function(metric)
      p <- as.integer(p)
      cluster_method <- rlang::arg_match(cluster_method)
      # Same absolute-count-versus-fraction convention as `min_samples`.
      if (!is.null(min_cluster_size) && min_cluster_size > 1)
        min_cluster_size <- as.integer(min_cluster_size)
      algorithm <- rlang::arg_match(algorithm)
      leaf_size <- as.integer(leaf_size)
      n_jobs <- as.integer(n_jobs)
      # Instantiate the wrapped Python estimator and register it on the
      # SKLearnClass parent.
      super$set_python_class(
        skl_cluster$OPTICS(
          min_samples = min_samples,
          max_eps = max_eps,
          metric = metric,
          p = p,
          metric_params = metric_params,
          cluster_method = cluster_method,
          eps = eps,
          xi = xi,
          predecessor_correction = predecessor_correction,
          min_cluster_size = min_cluster_size,
          algorithm = algorithm,
          leaf_size = leaf_size,
          memory = memory,
          n_jobs = n_jobs
        )
      )
    }
  )
)
#' Performs clustering according to the spectral clustering algorithm
#'
#' @description
#' This is a wrapper around the Python class
#' [sklearn.cluster.SpectralClustering](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.SpectralClustering.html#sklearn.cluster.SpectralClustering).
#'
#' @export
SpectralClustering <- R6::R6Class(
  classname = "SpectralClustering",
  inherit = BaseClustering,
  public = list(
    #' @description The [SpectralClustering] class constructor.
    #'
    #' @param n_clusters An integer value specifying the dimension of the
    #'   projection subspace. Defaults to `8L`.
    #' @param eigen_solver A string specifying the eigenvalue decomposition
    #'   strategy to use. Choices are `c("arpack", "lobpcg", "amg")`. AMG
    #'   requires **pyamg** to be installed. It can be faster on very large,
    #'   sparse problems, but may also lead to instabilities. Defaults to
    #'   `"arpack"`.
    #' @param n_components An integer value specifying the number of
    #'   eigenvectors to use for the spectral embedding. Defaults to `NULL`, in
    #'   which case, `n_clusters` is used.
    #' @param random_state An integer value specifying a pseudo random number
    #'   generator used for the initialization of the `lobpcg` eigenvectors
    #'   decomposition when `eigen_solver == "amg"`, and for the k-means
    #'   initialization. Defaults to `NULL` which uses clock time.
    #' @param n_init An integer value specifying the number of time the k-means
    #'   algorithm will be run with different centroid seeds. The final results
    #'   will be the best output of `n_init` consecutive runs in terms of
    #'   inertia. Only used if `assign_labels == "kmeans"`. Defaults to `10L`.
    #' @param gamma A numeric value specifying the kernel coefficient for `rbf`,
    #'   `poly`, `sigmoid`, `laplacian` and `chi2` kernels. Ignored for
    #'   `affinity == "nearest_neighbors"`. Defaults to `1.0`.
    #' @param affinity Either a string or an object coercible to a function via
    #'   [`rlang::as_function()`] specifying how to construct the affinity
    #'   matrix:
    #'
    #'   - `"nearest_neighbors"`: construct the affinity matrix by computing a
    #'   graph of nearest neighbors;
    #'   - `"rbf"`: construct the affinity matrix using a radial basis function
    #'   (RBF) kernel;
    #'   - `"precomputed"`: interpret `X` as a precomputed affinity matrix,
    #'   where larger values indicate greater similarity between instances;
    #'   - `"precomputed_nearest_neighbors"`: interpret `X` as a sparse graph of
    #'   precomputed distances, and construct a binary affinity matrix from the
    #'   `n_neighbors` nearest neighbors of each instance;
    #'   - one of the kernels supported by
    #'   [pairwise_kernels](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_kernels.html).
    #'
    #'   Only kernels that produce similarity scores (non-negative values that
    #'   increase with similarity) should be used. This property is not checked
    #'   by the clustering algorithm.
    #'
    #'   Defaults to `"rbf"`.
    #' @param n_neighbors An integer value specifying the number of neighbors to
    #'   use when constructing the affinity matrix using the nearest neighbors
    #'   method. Ignored for `affinity == "rbf"`. Defaults to `10L`.
    #' @param eigen_tol A numeric value specifying the stopping criterion for
    #'   the eigen-decomposition of the Laplacian matrix. If `eigen_tol ==
    #'   "auto"`, then the passed tolerance will depend on the `eigen_solver`:
    #'
    #'   - If `eigen_solver == "arpack"`, then `eigen_tol = 0.0`;
    #'   - If `eigen_solver == "lobpcg"` or `eigen_solver == "amg"`, then
    #'   `eigen_tol == NULL` which configures the underlying `lobpcg` solver to
    #'   automatically resolve the value according to their heuristics.
    #'
    #'   Note that when using `eigen_solver == "lobpcg"` or `eigen_solver ==
    #'   "amg"` values of `tol < 1e-5` may lead to convergence issues and should
    #'   be avoided.
    #'
    #'   Defaults to `"auto"`.
    #' @param assign_labels A string specifying the strategy for assigning
    #'   labels in the embedding space. There are two ways to assign labels
    #'   after the Laplacian embedding. k-means is a popular choice
    #'   (`"kmeans"`), but it can be sensitive to initialization. Discretization
    #'   is another approach which is less sensitive to random initialization
    #'   (`"discretize"`). The `cluster_qr` method directly extract clusters
    #'   from eigenvectors in spectral clustering. In contrast to k-means and
    #'   discretization, `cluster_qr` has no tuning parameters and runs no
    #'   iterations, yet may outperform k-means and discretization in terms of
    #'   both quality and speed. Defaults to `"kmeans"`.
    #' @param degree An integer value specifying the degree of the polynomial
    #'   kernel. Ignored by other kernels. Defaults to `3L`.
    #' @param coef0 A numeric value specifying the value of the zero coefficient
    #'   for polynomial and sigmoid kernels. Ignored by other kernels. Defaults
    #'   to `1.0`.
    #' @param kernel_params A named list specifying extra arguments to the
    #'   kernels passed as functions. Ignored by other kernels. Defaults to
    #'   `NULL`.
    #' @param n_jobs An integer value specifying the number of parallel jobs to
    #'   run for neighbors search. Defaults to `1L`. A value of `-1L` means
    #'   using all processors.
    #' @param verbose A boolean value specifying the verbosity mode. Defaults to
    #'   `FALSE`.
    #'
    #' @return An object of class [SpectralClustering].
    #'
    #' @examplesIf reticulate::py_module_available("sklearn.cluster")
    #' cl <- SpectralClustering$new()
    initialize = function(n_clusters = 8L,
                          eigen_solver = c("arpack", "lobpcg", "amg"),
                          n_components = NULL,
                          random_state = NULL,
                          n_init = 10L,
                          gamma = 1.0,
                          affinity = c("rbf", "nearest_neighbors",
                                       "precomputed",
                                       "precomputed_nearest_neighbors",
                                       "additive_chi2", "chi2", "linear",
                                       "poly", "polynomial", "laplacian",
                                       "sigmoid", "cosine"),
                          n_neighbors = 10L,
                          eigen_tol = "auto",
                          assign_labels = c("kmeans", "discretize", "cluster_qr"),
                          degree = 3L,
                          coef0 = 1,
                          kernel_params = NULL,
                          n_jobs = 1L,
                          verbose = FALSE) {
      # Coerce integer-valued arguments so reticulate passes Python ints.
      n_clusters <- as.integer(n_clusters)
      eigen_solver <- rlang::arg_match(eigen_solver)
      if (!is.null(n_components)) n_components <- as.integer(n_components)
      if (!is.null(random_state)) random_state <- as.integer(random_state)
      n_init <- as.integer(n_init)
      # `affinity` is either an enumerated string (including any kernel
      # name accepted by sklearn.metrics.pairwise.pairwise_kernels) or a
      # custom affinity function.
      if (is.character(affinity))
        affinity <- rlang::arg_match(affinity)
      else
        affinity <- rlang::as_function(affinity)
      n_neighbors <- as.integer(n_neighbors)
      assign_labels <- rlang::arg_match(assign_labels)
      degree <- as.integer(degree)
      # `kernel_params` is forwarded as Python kwargs and therefore must
      # be a named list when provided.
      if (!is.null(kernel_params)) {
        if (!rlang::is_named(kernel_params))
          cli::cli_abort("The {.arg kernel_params} argument should be either
                         {.code NULL} or a named list.")
      }
      n_jobs <- as.integer(n_jobs)
      # Instantiate the wrapped Python estimator and register it on the
      # SKLearnClass parent.
      super$set_python_class(
        skl_cluster$SpectralClustering(
          n_clusters = n_clusters,
          eigen_solver = eigen_solver,
          n_components = n_components,
          random_state = random_state,
          n_init = n_init,
          gamma = gamma,
          affinity = affinity,
          n_neighbors = n_neighbors,
          eigen_tol = eigen_tol,
          assign_labels = assign_labels,
          degree = degree,
          coef0 = coef0,
          kernel_params = kernel_params,
          n_jobs = n_jobs,
          verbose = verbose
        )
      )
    }
  )
)
#' Performs clustering according to the spectral biclustering algorithm
#'
#' @description
#' This is a wrapper around the Python class
#' [sklearn.cluster.SpectralBiclustering](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.SpectralBiclustering.html#sklearn.cluster.SpectralBiclustering).
#'
#' @export
SpectralBiclustering <- R6::R6Class(
  classname = "SpectralBiclustering",
  inherit = BaseClustering,
  public = list(
    #' @description The [SpectralBiclustering] class constructor.
    #'
    #' @param n_clusters An integer value or a length-2 vector specifying the
    #'   number of row and column clusters in the checkerboard structure.
    #'   Defaults to `3L`.
    #' @param method A string specifying how singular vectors are normalized
    #'   and converted into biclusters. One of `"scale"`, `"bistochastic"` or
    #'   `"log"`. The authors recommend `"log"`; however, log-normalization
    #'   does not work on sparse data, which is why the default is
    #'   `"bistochastic"`. Warning: if `method == "log"`, the data must not be
    #'   sparse.
    #' @param n_components An integer value specifying the number of singular
    #'   vectors to check. Defaults to `6L`.
    #' @param n_best An integer value specifying the number of best singular
    #'   vectors onto which the data is projected for clustering. Defaults to
    #'   `3L`.
    #' @param svd_method A string specifying the algorithm used to find the
    #'   singular vectors, either `"randomized"` or `"arpack"`. With
    #'   `"randomized"`, `randomized_svd()` is used, which may be faster on
    #'   large matrices; with `"arpack"`, `scipy.sparse.linalg.svds()` is
    #'   used, which is more accurate but possibly slower in some cases.
    #'   Defaults to `"randomized"`.
    #' @param n_svd_vecs An integer value specifying the number of vectors
    #'   used in the SVD computation; corresponds to `ncv` when `svd_method ==
    #'   "arpack"` and to `n_oversamples` when `svd_method == "randomized"`.
    #'   Defaults to `NULL`.
    #' @param mini_batch A boolean value specifying whether mini-batch k-means
    #'   should be used, which is faster but may yield different results.
    #'   Defaults to `FALSE`.
    #' @param init A string specifying the initialization method of the
    #'   k-means algorithm, either `"k-means++"` or `"random"`. Defaults to
    #'   `"k-means++"`.
    #' @param n_init An integer value specifying how many random
    #'   initializations of the k-means algorithm are tried. With mini-batch
    #'   k-means, the best initialization is chosen and the algorithm runs
    #'   once; otherwise, the algorithm is run for each initialization and the
    #'   best solution is kept. Defaults to `10L`.
    #' @param random_state An integer value specifying a pseudo random number
    #'   generator used for the initialization of the `lobpcg` eigenvectors
    #'   decomposition when `eigen_solver == "amg"`, and for the k-means
    #'   initialization. Defaults to `NULL` which uses clock time.
    #'
    #' @return An object of class [SpectralBiclustering].
    #'
    #' @examplesIf reticulate::py_module_available("sklearn.cluster")
    #' cl <- SpectralBiclustering$new()
    initialize = function(n_clusters = 3L,
                          method = c("bistochastic", "scale", "log"),
                          n_components = 6L,
                          n_best = 3L,
                          svd_method = c("randomized", "arpack"),
                          n_svd_vecs = NULL,
                          mini_batch = FALSE,
                          init = c("k-means++", "random"),
                          n_init = 10L,
                          random_state = NULL) {
      # A length-2 `n_clusters` means (row clusters, column clusters) and is
      # forwarded to Python as a tuple; a scalar is passed as-is.
      n_clusters <- as.integer(n_clusters)
      k <- length(n_clusters)
      if (k == 2L) {
        n_clusters <- reticulate::tuple(n_clusters[1], n_clusters[2])
      } else if (k > 2L) {
        cli::cli_abort("The argument {.arg n_clusters} should be of length 1 or 2.")
      }
      method <- rlang::arg_match(method)
      n_components <- as.integer(n_components)
      n_best <- as.integer(n_best)
      svd_method <- rlang::arg_match(svd_method)
      if (!is.null(n_svd_vecs)) {
        n_svd_vecs <- as.integer(n_svd_vecs)
      }
      init <- rlang::arg_match(init)
      n_init <- as.integer(n_init)
      if (!is.null(random_state)) {
        random_state <- as.integer(random_state)
      }
      # Instantiate the wrapped Python estimator and register it on the
      # SKLearnClass parent.
      super$set_python_class(
        skl_cluster$SpectralBiclustering(
          n_clusters = n_clusters,
          method = method,
          n_components = n_components,
          n_best = n_best,
          svd_method = svd_method,
          n_svd_vecs = n_svd_vecs,
          mini_batch = mini_batch,
          init = init,
          n_init = n_init,
          random_state = random_state
        )
      )
    }
  )
)
#' Performs clustering according to the spectral coclustering algorithm
#'
#' @description
#' This is a wrapper around the Python class
#' [sklearn.cluster.SpectralCoclustering](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.SpectralCoclustering.html#sklearn.cluster.SpectralCoclustering).
#'
#' @export
SpectralCoclustering <- R6::R6Class(
  classname = "SpectralCoclustering",
  inherit = BaseClustering,
  public = list(
    #' @description The [SpectralCoclustering] class constructor.
    #'
    #' @param n_clusters An integer value specifying the number of biclusters
    #'   to find. Defaults to `3L`.
    #' @param svd_method A string specifying the algorithm used to find the
    #'   singular vectors, either `"randomized"` or `"arpack"`. With
    #'   `"randomized"`, `sklearn.utils.extmath.randomized_svd()` is used,
    #'   which may be faster on large matrices; with `"arpack"`,
    #'   `scipy.sparse.linalg.svds()` is used, which is more accurate but
    #'   possibly slower in some cases. Defaults to `"randomized"`.
    #' @param n_svd_vecs An integer value specifying the number of vectors
    #'   used in the SVD computation; corresponds to `ncv` when `svd_method ==
    #'   "arpack"` and to `n_oversamples` when `svd_method == "randomized"`.
    #'   Defaults to `NULL`.
    #' @param mini_batch A boolean value specifying whether mini-batch k-means
    #'   should be used, which is faster but may yield different results.
    #'   Defaults to `FALSE`.
    #' @param init A string specifying the initialization method of the
    #'   k-means algorithm, either `"k-means++"` or `"random"`. Defaults to
    #'   `"k-means++"`.
    #' @param n_init An integer value specifying how many random
    #'   initializations of the k-means algorithm are tried. With mini-batch
    #'   k-means, the best initialization is chosen and the algorithm runs
    #'   once; otherwise, the algorithm is run for each initialization and the
    #'   best solution is kept. Defaults to `10L`.
    #' @param random_state An integer value specifying a pseudo random number
    #'   generator used for the initialization of the `lobpcg` eigenvectors
    #'   decomposition when `eigen_solver == "amg"`, and for the k-means
    #'   initialization. Defaults to `NULL` which uses clock time.
    #'
    #' @return An object of class [SpectralCoclustering].
    #'
    #' @examplesIf reticulate::py_module_available("sklearn.cluster")
    #' cl <- SpectralCoclustering$new()
    initialize = function(n_clusters = 3L,
                          svd_method = c("randomized", "arpack"),
                          n_svd_vecs = NULL,
                          mini_batch = FALSE,
                          init = c("k-means++", "random"),
                          n_init = 10L,
                          random_state = NULL) {
      # Normalize arguments: integers for Python, enumerated strings matched
      # against their allowed values.
      n_clusters <- as.integer(n_clusters)
      svd_method <- rlang::arg_match(svd_method)
      if (!is.null(n_svd_vecs)) {
        n_svd_vecs <- as.integer(n_svd_vecs)
      }
      init <- rlang::arg_match(init)
      n_init <- as.integer(n_init)
      if (!is.null(random_state)) {
        random_state <- as.integer(random_state)
      }
      # Instantiate the wrapped Python estimator and register it on the
      # SKLearnClass parent.
      super$set_python_class(
        skl_cluster$SpectralCoclustering(
          n_clusters = n_clusters,
          svd_method = svd_method,
          n_svd_vecs = n_svd_vecs,
          mini_batch = mini_batch,
          init = init,
          n_init = n_init,
          random_state = random_state
        )
      )
    }
  )
)