#' Dimensionality Reduction with UMAP
#'
#' Carry out dimensionality reduction of a dataset using the Uniform Manifold
#' Approximation and Projection (UMAP) method (McInnes et al., 2018).
#'
#' This function behaves like \code{\link{umap}} except with some updated
#' defaults that make it behave more like the Python implementation and which
#' cannot be added to \code{\link{umap}} without breaking backwards
#' compatibility. In addition:
#'
#' \itemize{
#' \item if \href{https://cran.r-project.org/package=RcppHNSW}{RcppHNSW} is
#' installed, it will be used in preference to Annoy if a compatible metric
#' is requested.
#' \item if RcppHNSW is not present, but
#' \href{https://cran.r-project.org/package=rnndescent}{rnndescent} is
#' installed, it will be used in preference to Annoy if a compatible metric
#' is requested.
#' \item if \code{batch = TRUE} then the default \code{n_sgd_threads} is set
#' to the same value as \code{n_threads}.
#' \item if the input data \code{X} is a sparse matrix, it is interpreted
#' similarly to a dense matrix or data frame, and not as a distance matrix.
#' This requires the \code{rnndescent} package to be installed (see the
#' sketch below).
#' }
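#'
#' For example, a minimal sketch of the sparse input case (this assumes the
#' \code{rnndescent} and \code{Matrix} packages are installed; the data is
#' purely illustrative):
#' \preformatted{
#' # one observation per row, in dgCMatrix format
#' X_sparse <- Matrix::rsparsematrix(nrow = 100, ncol = 10, density = 0.3)
#' X_umap <- umap2(X_sparse, n_neighbors = 5)
#' }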
#'
#' @param X Input data. Can be a \code{\link{data.frame}}, \code{\link{matrix}},
#' \code{\link[stats]{dist}} object or \code{\link[Matrix]{sparseMatrix}}.
#' Matrices and data frames should contain one observation per row. Data frames
#' will have any non-numeric columns removed, although factor columns will be
#' used if explicitly included via \code{metric} (see the help for
#' \code{metric} for details). Sparse matrices must be in the \code{dgCMatrix}
#' format, and you must also install
#' \href{https://cran.r-project.org/package=rnndescent}{rnndescent}
#' and set \code{nn_method = "nndescent"}.
#' \code{X} can also be \code{NULL} if pre-computed nearest neighbor data is
#' passed to \code{nn_method}, and \code{init} is not \code{"spca"} or
#' \code{"pca"}.
#' @param n_neighbors The size of local neighborhood (in terms of number of
#' neighboring sample points) used for manifold approximation. Larger values
#' result in more global views of the manifold, while smaller values result in
#' more local data being preserved. In general values should be in the range
#' \code{2} to \code{100}.
#' @param n_components The dimension of the space to embed into. This defaults
#' to \code{2} to provide easy visualization, but can reasonably be set to any
#' integer value in the range \code{2} to \code{100}.
#' @param metric Type of distance metric to use to find nearest neighbors. For
#' \code{nn_method = "annoy"} this can be one of:
#' \itemize{
#' \item \code{"euclidean"} (the default)
#' \item \code{"cosine"}
#' \item \code{"manhattan"}
#' \item \code{"hamming"}
#' \item \code{"correlation"} (a distance based on the Pearson correlation)
#' \item \code{"categorical"} (see below)
#' }
#' For \code{nn_method = "hnsw"} this can be one of:
#' \itemize{
#' \item \code{"euclidean"}
#' \item \code{"cosine"}
#' \item \code{"correlation"}
#' }
#' If \href{https://cran.r-project.org/package=rnndescent}{rnndescent} is
#' installed and \code{nn_method = "nndescent"} is specified then many more
#' metrics are available, including:
#' \itemize{
#' \item \code{"braycurtis"}
#' \item \code{"canberra"}
#' \item \code{"chebyshev"}
#' \item \code{"dice"}
#' \item \code{"hamming"}
#' \item \code{"hellinger"}
#' \item \code{"jaccard"}
#' \item \code{"jensenshannon"}
#' \item \code{"kulsinski"}
#' \item \code{"rogerstanimoto"}
#' \item \code{"russellrao"}
#' \item \code{"sokalmichener"}
#' \item \code{"sokalsneath"}
#' \item \code{"spearmanr"}
#' \item \code{"symmetrickl"}
#' \item \code{"tsss"}
#' \item \code{"yule"}
#' }
#' For more details see the package documentation of \code{rnndescent}.
#' For \code{nn_method = "fnn"}, the distance metric is always "euclidean".
#'
#' If \code{X} is a data frame or matrix, then multiple metrics can be
#' specified, by passing a list to this argument, where the name of each item in
#' the list is one of the metric names above. The value of each list item should
#' be a vector giving the names or integer ids of the columns to be included in
#' a calculation, e.g. \code{metric = list(euclidean = 1:4, manhattan = 5:10)}.
#'
#' Each metric calculation results in a separate fuzzy simplicial set, which are
#' intersected together to produce the final set. Metric names can be repeated.
#' Because non-numeric columns are removed from the data frame, it is safer to
#' use column names than integer ids.
#'
#' Factor columns can also be used by specifying the metric name
#' \code{"categorical"}. Factor columns are treated different from numeric
#' columns and although multiple factor columns can be specified in a vector,
#' each factor column specified is processed individually. If you specify
#' a non-factor column, it will be coerced to a factor.
#'
#' For a given data block, you may override the \code{pca} and \code{pca_center}
#' arguments for that block, by providing a list with one unnamed item
#' containing the column names or ids, and then any of the \code{pca} or
#' \code{pca_center} overrides as named items, e.g. \code{metric =
#' list(euclidean = 1:4, manhattan = list(5:10, pca_center = FALSE))}. This
#' exists to allow mixed binary and real-valued data to be included and to have
#' PCA applied to both, but with centering applied only to the real-valued data
#' (it is typical not to apply centering to binary data before PCA is applied).
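#'
#' As an illustrative sketch (the column names here are hypothetical):
#' \preformatted{
#' metric <- list(
#'   euclidean = c("a", "b", "c"),  # one block of numeric columns
#'   categorical = "f",             # a factor column, processed individually
#'   euclidean = list(c("x", "y"), pca_center = FALSE)  # binary columns
#' )
#' X_umap <- umap2(X, metric = metric)
#' }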
#' @param n_epochs Number of epochs to use during the optimization of the
#' embedded coordinates. By default, this value is set to \code{500} for
#' datasets containing 10,000 vertices or less, and \code{200} otherwise.
#' If \code{n_epochs = 0}, then coordinates determined by \code{init} will
#' be returned.
#' @param scale Scaling to apply to \code{X} if it is a data frame or matrix:
#' \itemize{
#' \item{\code{"none"} or \code{FALSE} or \code{NULL}} No scaling.
#' \item{\code{"Z"} or \code{"scale"} or \code{TRUE}} Scale each column to
#' zero mean and variance 1.
#' \item{\code{"maxabs"}} Center each column to mean 0, then divide each
#' element by the maximum absolute value over the entire matrix.
#' \item{\code{"range"}} Range scale the entire matrix, so the smallest
#' element is 0 and the largest is 1.
#' \item{\code{"colrange"}} Scale each column in the range (0,1).
#' }
#' For UMAP, the default is \code{"none"}.
#' @param learning_rate Initial learning rate used in optimization of the
#' coordinates.
#' @param init Type of initialization for the coordinates. Options are:
#' \itemize{
#' \item \code{"spectral"} Spectral embedding using the normalized Laplacian
#' of the fuzzy 1-skeleton, with Gaussian noise added.
#' \item \code{"normlaplacian"}. Spectral embedding using the normalized
#' Laplacian of the fuzzy 1-skeleton, without noise.
#' \item \code{"random"}. Coordinates assigned using a uniform random
#' distribution between -10 and 10.
#' \item \code{"lvrandom"}. Coordinates assigned using a Gaussian
#' distribution with standard deviation 1e-4, as used in LargeVis
#' (Tang et al., 2016) and t-SNE.
#' \item \code{"laplacian"}. Spectral embedding using the Laplacian Eigenmap
#' (Belkin and Niyogi, 2002).
#' \item \code{"pca"}. The first two principal components from PCA of
#' \code{X} if \code{X} is a data frame, and from a 2-dimensional classical
#' MDS if \code{X} is of class \code{"dist"}.
#' \item \code{"spca"}. Like \code{"pca"}, but each dimension is then scaled
#' so the standard deviation is 1e-4, to give a distribution similar to that
#' used in t-SNE. This is an alias for \code{init = "pca", init_sdev =
#' 1e-4}.
#' \item \code{"agspectral"} An "approximate global" modification of
#' \code{"spectral"} which all edges in the graph to a value of 1, and then
#' sets a random number of edges (\code{negative_sample_rate} edges per
#' vertex) to 0.1, to approximate the effect of non-local affinities.
#' \item A matrix of initial coordinates.
#' }
#' For spectral initializations, (\code{"spectral"}, \code{"normlaplacian"},
#' \code{"laplacian"}, \code{"agspectral"}), if more than one connected
#' component is identified, no spectral initialization is attempted. Instead
#' a PCA-based initialization is attempted. If \code{verbose = TRUE}, the
#' number of connected components is logged to the console. The existence of
#' multiple connected components implies that a global view of the data cannot
#' be attained with this initialization. Increasing the value of
#' \code{n_neighbors} may help.
#' @param init_sdev If non-\code{NULL}, scales each dimension of the initialized
#' coordinates (including any user-supplied matrix) to this standard
#' deviation. By default (\code{init_sdev = "range"}), each column of the
#' initial coordinates is range scaled between 0-10. Scaling the input may
#' help if the unscaled versions result in initial coordinates with large
#' inter-point distances or outliers: these usually result in small gradients
#' during optimization and very little progress being made to the layout.
#' Shrinking the initial embedding by rescaling can help under these
#' circumstances. Scaling the result of \code{init = "pca"} is usually
#' recommended, which is why \code{init = "spca"} exists as an alias for
#' \code{init = "pca", init_sdev = 1e-4}; for the spectral initializations,
#' the scaled versions usually aren't necessary unless you are using a large
#' value of \code{n_neighbors} (e.g. \code{n_neighbors = 150} or higher).
#' @param spread The effective scale of embedded points. In combination with
#' \code{min_dist}, this determines how clustered/clumped the embedded points
#' are.
#' @param min_dist The effective minimum distance between embedded points.
#' Smaller values will result in a more clustered/clumped embedding where
#' nearby points on the manifold are drawn closer together, while larger
#' values will result in a more even dispersal of points. The value should be
#' set relative to the \code{spread} value, which determines the scale at
#' which embedded points will be spread out.
#' @param set_op_mix_ratio Interpolate between (fuzzy) union and intersection as
#' the set operation used to combine local fuzzy simplicial sets to obtain a
#' global fuzzy simplicial set. Both fuzzy set operations use the product
#' t-norm. The value of this parameter should be between \code{0.0} and
#' \code{1.0}; a value of \code{1.0} will use a pure fuzzy union, while
#' \code{0.0} will use a pure fuzzy intersection.
#' @param local_connectivity The local connectivity required -- i.e. the number
#' of nearest neighbors that should be assumed to be connected at a local
#' level. The higher this value the more connected the manifold becomes
#' locally. In practice this should not be more than the local intrinsic
#' dimension of the manifold.
#' @param bandwidth The effective bandwidth of the kernel if we view the
#' algorithm as similar to Laplacian Eigenmaps. Larger values induce more
#' connectivity and a more global view of the data, smaller values concentrate
#' more locally.
#' @param repulsion_strength Weighting applied to negative samples in low
#' dimensional embedding optimization. Values higher than one will result in
#' greater weight being given to negative samples.
#' @param negative_sample_rate The number of negative edge/1-simplex samples to
#' use per positive edge/1-simplex sample in optimizing the low dimensional
#' embedding.
#' @param a More specific parameters controlling the embedding. If \code{NULL}
#' these values are set automatically as determined by \code{min_dist} and
#' \code{spread}.
#' @param b More specific parameters controlling the embedding. If \code{NULL}
#' these values are set automatically as determined by \code{min_dist} and
#' \code{spread}.
#' @param nn_method Method for finding nearest neighbors. Options are:
#' \itemize{
#' \item \code{"fnn"}. Use exact nearest neighbors via the
#' \href{https://cran.r-project.org/package=FNN}{FNN} package.
#' \item \code{"annoy"} Use approximate nearest neighbors via the
#' \href{https://cran.r-project.org/package=RcppAnnoy}{RcppAnnoy} package.
#' \item \code{"hnsw"} Use approximate nearest neighbors with the
#' Hierarchical Navigable Small World (HNSW) method (Malkov and Yashunin,
#' 2018) via the
#' \href{https://cran.r-project.org/package=RcppHNSW}{RcppHNSW} package.
#' \code{RcppHNSW} is not a dependency of this package: this option is
#' only available if you have installed \code{RcppHNSW} yourself. Also,
#' HNSW only supports the following arguments for \code{metric} and
#' \code{target_metric}: \code{"euclidean"}, \code{"cosine"} and
#' \code{"correlation"}.
#' \item \code{"nndescent"} Use approximate nearest neighbors with the
#' Nearest Neighbor Descent method (Dong et al., 2011) via the
#' \href{https://cran.r-project.org/package=rnndescent}{rnndescent}
#' package. \code{rnndescent} is not a dependency of this package: this
#' option is only available if you have installed \code{rnndescent}
#' yourself.
#' }
#' By default, if \code{X} has fewer than 4,096 vertices, the exact nearest
#' neighbors are found. Otherwise, approximate nearest neighbors are used.
#' You may also pass pre-calculated nearest neighbor data to this argument. It
#' must be one of two formats, either a list consisting of two elements:
#' \itemize{
#' \item \code{"idx"}. A \code{n_vertices x n_neighbors} matrix
#' containing the integer indexes of the nearest neighbors in \code{X}. Each
#' vertex is considered to be its own nearest neighbor, i.e.
#' \code{idx[, 1] == 1:n_vertices}.
#' \item \code{"dist"}. A \code{n_vertices x n_neighbors} matrix
#' containing the distances of the nearest neighbors.
#' }
#' or a sparse distance matrix of type \code{dgCMatrix}, with dimensions
#' \code{n_vertices x n_vertices}. Distances should be arranged by column,
#' i.e. a non-zero entry in row \code{j} of the \code{i}th column indicates
#' that the \code{j}th observation in \code{X} is a nearest neighbor of the
#' \code{i}th observation with the distance given by the value of that
#' element.
#' The \code{n_neighbors} parameter is ignored when using precomputed
#' nearest neighbor data. If using the sparse distance matrix input, each
#' column can contain a different number of neighbors.
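#'
#' A minimal sketch of the list format, where \code{X} stands for a numeric
#' matrix and the \href{https://cran.r-project.org/package=FNN}{FNN} package
#' is assumed to be installed (any exact k-nearest neighbor routine would do):
#' \preformatted{
#' k <- 15
#' knn <- FNN::get.knn(X, k = k - 1)  # get.knn excludes self-neighbors
#' nn <- list(
#'   idx = cbind(seq_len(nrow(X)), knn$nn.index),  # add self as neighbor 1
#'   dist = cbind(0, knn$nn.dist)
#' )
#' X_umap <- umap2(X, nn_method = nn)
#' }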
#' @param n_trees Number of trees to build when constructing the nearest
#' neighbor index. The more trees specified, the larger the index, but the
#' better the results. With \code{search_k}, determines the accuracy of the
#' Annoy nearest neighbor search. Only used if the \code{nn_method} is
#' \code{"annoy"}. Sensible values are between \code{10} to \code{100}.
#' @param search_k Number of nodes to search during the neighbor retrieval. The
#' larger the value, the more accurate the results, but the longer the search
#' takes.
#' With \code{n_trees}, determines the accuracy of the Annoy nearest neighbor
#' search. Only used if the \code{nn_method} is \code{"annoy"}.
#' @param nn_args A list containing additional arguments to pass to the nearest
#' neighbor method. For \code{nn_method = "annoy"}, you can specify
#' \code{"n_trees"} and \code{"search_k"}, and these will override the
#' \code{n_trees} and \code{search_k} parameters.
#' For \code{nn_method = "hnsw"}, you may specify the following arguments:
#' \itemize{
#' \item \code{M} The maximum number of neighbors to keep for each vertex.
#' Reasonable values are \code{2} to \code{100}. Higher values give better
#' recall at the cost of more memory. Default value is \code{16}.
#' \item \code{ef_construction} A positive integer specifying the size of
#' the dynamic list used during index construction. A higher value will
#' provide better results at the cost of a longer time to build the index.
#' Default is \code{200}.
#' \item \code{ef} A positive integer specifying the size of the dynamic
#' list used during search. This cannot be smaller than \code{n_neighbors}
#' and cannot be higher than the number of items in the index. Default is
#' \code{10}.
#' }
#' For \code{nn_method = "nndescent"}, you may specify the following
#' arguments:
#' \itemize{
#' \item \code{n_trees} The number of trees to use in a random projection
#' forest to initialize the search. A larger number will give more accurate
#' results at the cost of a longer computation time. The default of
#' \code{NULL} means that the number is chosen based on the number of
#' observations in \code{X}.
#' \item \code{max_candidates} The number of potential neighbors to explore
#' per iteration. By default, this is set to \code{n_neighbors} or \code{60},
#' whichever is smaller. A larger number will give more accurate results at
#' the cost of a longer computation time.
#' \item \code{n_iters} The number of iterations to run the search. A larger
#' number will give more accurate results at the cost of a longer computation
#' time. By default, this will be chosen based on the number of observations
#' in \code{X}. You may also need to modify the convergence criterion
#' \code{delta}.
#' \item \code{delta} The minimum relative change in the neighbor graph
#' allowed before early stopping. Should be a value between 0 and 1. The
#' smaller the value, the smaller the amount of progress between iterations
#' that is tolerated before the search stops. The default value of
#' \code{0.001} means that at least 0.1\% of the neighbor graph must be
#' updated at each iteration for the search to continue.
#' \item \code{init} How to initialize the nearest neighbor descent. By
#' default this is set to \code{"tree"} and uses a random projection forest.
#' If you set this to \code{"rand"}, then a random selection is used. Usually
#' this is less accurate than using RP trees, but for high-dimensional cases,
#' there may be little difference in the quality of the initialization and
#' random initialization will be a lot faster. If you set this to
#' \code{"rand"}, then the \code{n_trees} parameter is ignored.
#' \item \code{pruning_degree_multiplier} The maximum number of edges per node
#' to retain in the search graph, relative to \code{n_neighbors}. A larger
#' value will give more accurate results at the cost of a longer computation
#' time. Default is \code{1.5}. This parameter only affects neighbor search
#' when transforming new data with \code{\link{umap_transform}}.
#' \item \code{epsilon} Controls the degree of the back-tracking when
#' traversing the search graph. Setting this to \code{0.0} will do a greedy
#' search with no back-tracking. A larger value will give more accurate
#' results at the cost of a longer computation time. Default is \code{0.1}.
#' This parameter only affects neighbor search when transforming new data with
#' \code{\link{umap_transform}}.
#' \item \code{max_search_fraction} Specifies the maximum fraction of the
#' search graph to traverse. By default, this is set to \code{1.0}, so the
#' entire graph (i.e. all items in \code{X}) may be visited. You may want to
#' set this to a smaller value if you have a very large dataset (in
#' conjunction with \code{epsilon}) to avoid an inefficient exhaustive search
#' of the data in \code{X}. This parameter only affects neighbor search when
#' transforming new data with \code{\link{umap_transform}}.
#' }
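#'
#' As a sketch, passing method-specific arguments via \code{nn_args} (the
#' values shown are illustrative, not recommendations):
#' \preformatted{
#' # HNSW: larger M and ef give better recall at more cost
#' X_hnsw <- umap2(X, nn_method = "hnsw",
#'                 nn_args = list(M = 32, ef_construction = 400, ef = 100))
#'
#' # NN-Descent: more iterations and a stricter convergence criterion
#' X_nnd <- umap2(X, nn_method = "nndescent",
#'                nn_args = list(n_iters = 20, delta = 0.0001))
#' }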
#' @param approx_pow If \code{TRUE}, use an approximation to the power function
#' in the UMAP gradient, from
#' \url{https://martin.ankerl.com/2012/01/25/optimized-approximative-pow-in-c-and-cpp/}.
#' Ignored if \code{dens_scale} is non-\code{NULL}.
#' @param y Optional target data for supervised dimension reduction. Can be a
#' vector, matrix or data frame. Use the \code{target_metric} parameter to
#' specify the metrics to use, using the same syntax as \code{metric}. Usually
#' either a single numeric or factor column is used, but more complex formats
#' are possible. The following types are allowed:
#' \itemize{
#' \item Factor columns with the same length as \code{X}. \code{NA} is
#' allowed for any observation with an unknown level, in which case
#' UMAP operates as a form of semi-supervised learning. Each column is
#' treated separately.
#' \item Numeric data. \code{NA} is \emph{not} allowed in this case. Use the
#' parameter \code{target_n_neighbors} to set the number of neighbors used
#' with \code{y}. If unset, \code{n_neighbors} is used. Unlike factors,
#' numeric columns are grouped into one block unless \code{target_metric}
#' specifies otherwise. For example, if you wish columns \code{a} and
#' \code{b} to be treated separately, specify
#' \code{target_metric = list(euclidean = "a", euclidean = "b")}. Otherwise,
#' the data will be effectively treated as a matrix with two columns.
#' \item Nearest neighbor data, consisting of a list of two matrices,
#' \code{idx} and \code{dist}. These represent the precalculated nearest
#' neighbor indices and distances, respectively. This
#' is the same format as that expected for precalculated data in
#' \code{nn_method}. This format assumes that the underlying data was a
#' numeric vector. Any user-supplied value of the \code{target_n_neighbors}
#' parameter is ignored in this case, because the number of columns in
#' the matrices is used for the value. Multiple nearest neighbor data using
#' different metrics can be supplied by passing a list of these lists.
#' }
#' Unlike \code{X}, all factor columns included in \code{y} are automatically
#' used.
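#'
#' For example, a sketch of semi-supervised use with a partially-labeled
#' factor target (the proportion of missing labels is illustrative):
#' \preformatted{
#' y <- iris$Species
#' y[sample(length(y), 50)] <- NA  # NA is allowed for factor targets
#' iris_sup <- umap2(iris, y = y, target_weight = 0.5)
#' }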
#' @param target_n_neighbors Number of nearest neighbors to use to construct the
#' target simplicial set. Default value is \code{n_neighbors}. Applies only if
#' \code{y} is non-\code{NULL} and \code{numeric}.
#' @param target_metric The metric used to measure distance for \code{y} if
#' using supervised dimension reduction. Used only if \code{y} is numeric.
#' @param target_weight Weighting factor between data topology and target
#' topology. A value of 0.0 weights entirely on data, a value of 1.0 weights
#' entirely on target. The default of 0.5 balances the weighting equally
#' between data and target. Only applies if \code{y} is non-\code{NULL}.
#' @param pca If set to a positive integer value, reduce data to this number of
#' columns using PCA. This is not applied if the distance \code{metric} is
#' \code{"hamming"}, or if the dimensions of the data are not larger than the
#' number specified (i.e. the number of rows and columns must both exceed the
#' value of this parameter). If you have > 100 columns in a data frame or
#' matrix, reducing the number of columns in this way may substantially
#' increase the performance of the nearest neighbor search at the cost of a
#' potential decrease in accuracy. In many t-SNE applications, a value of 50
#' is recommended, although there's no guarantee that this is appropriate for
#' all settings.
#' @param pca_center If \code{TRUE}, center the columns of \code{X} before
#' carrying out PCA. For binary data, it's recommended to set this to
#' \code{FALSE}.
#' @param pca_method Method to carry out any PCA dimensionality reduction when
#' the \code{pca} parameter is specified. Allowed values are:
#' \itemize{
#' \item{\code{"irlba"}}. Uses \code{\link[irlba]{prcomp_irlba}} from the
#' \href{https://cran.r-project.org/package=irlba}{irlba} package.
#' \item{\code{"rsvd"}}. Uses 5 iterations of \code{\link[irlba]{svdr}} from
#' the \href{https://cran.r-project.org/package=irlba}{irlba} package.
#' This is likely to give much faster but potentially less accurate results
#' than using \code{"irlba"}. For the purposes of nearest neighbor
#' calculation and coordinates initialization, any loss of accuracy doesn't
#' seem to matter much.
#' \item{\code{"bigstatsr"}}. Uses \code{\link[bigstatsr]{big_randomSVD}}
#' from the \href{https://cran.r-project.org/package=bigstatsr}{bigstatsr}
#' package. The SVD methods used in \code{bigstatsr} may be faster on
#' systems without access to efficient linear algebra libraries (e.g.
#' Windows). \strong{Note}: \code{bigstatsr} is \emph{not} a dependency of
#' uwot: if you choose to use this package for PCA, you \emph{must} install
#' it yourself.
#' \item{\code{"svd"}}. Uses \code{\link[base]{svd}} for the SVD. This is
#' likely to be slow for all but the smallest datasets.
#' \item{\code{"auto"}} (the default). Uses \code{"irlba"}, unless more than
#' 50% of the full set of singular vectors would be calculated, in which
#' case \code{"svd"} is used.
#' }
#' @param pcg_rand If \code{TRUE}, use the PCG random number generator (O'Neill,
#' 2014) during optimization. Otherwise, use the faster (but probably less
#' statistically good) Tausworthe "taus88" generator. The default is
#' \code{TRUE}.
#' @param fast_sgd If \code{TRUE}, then the following combination of parameters
#' is set: \code{pcg_rand = TRUE}, \code{n_sgd_threads = "auto"} and
#' \code{approx_pow = TRUE}. The default is \code{FALSE}. Setting this to
#' \code{TRUE} will speed up the stochastic optimization phase, but give a
#' potentially less accurate embedding, which will not be exactly
#' reproducible even with a fixed seed. For visualization, \code{fast_sgd =
#' TRUE} will give perfectly good results. For more generic dimensionality
#' reduction, it's safer to leave \code{fast_sgd = FALSE}. If \code{fast_sgd =
#' TRUE}, then user-supplied values of \code{pcg_rand}, \code{n_sgd_threads},
#' and \code{approx_pow} are ignored.
#' @param batch If \code{TRUE}, then embedding coordinates are updated at the
#' end of each epoch rather than during the epoch. In batch mode, results are
#' reproducible with a fixed random seed even with \code{n_sgd_threads > 1},
#' at the cost of a slightly higher memory use. You may also have to modify
#' \code{learning_rate} and increase \code{n_epochs}, so whether this provides
#' a speed increase over the single-threaded optimization is likely to be
#' dataset and hardware-dependent.
#' @param ret_model If \code{TRUE}, then return extra data that can be used to
#' add new data to an existing embedding via \code{\link{umap_transform}}. The
#' embedded coordinates are returned as the list item \code{embedding}. If
#' \code{FALSE}, just return the coordinates. This parameter can be used in
#' conjunction with \code{ret_nn} and \code{ret_extra}. Note that some
#' settings are incompatible with the production of a UMAP model: external
#' neighbor data (passed via a list to \code{nn_method}), and factor columns
#' that were included via the \code{metric} parameter. In the latter case, the
#' model produced is based only on the numeric data. A transformation using
#' new data is possible, but the factor columns in the new data are ignored.
#' Note that setting \code{ret_model = TRUE} forces the use of the approximate
#' nearest neighbors method. Because small datasets would otherwise use exact
#' nearest neighbor calculations, setting \code{ret_model = TRUE} means that
#' different results may be returned for small datasets in terms of both the
#' returned nearest neighbors (if requested) and the final embedded
#' coordinates, compared to \code{ret_model = FALSE}, even if the random
#' number seed is fixed. To avoid this, explicitly set
#' \code{nn_method = "annoy"} in the \code{ret_model = FALSE} case.
#' @param ret_nn If \code{TRUE}, then in addition to the embedding, also return
#' nearest neighbor data that can be used as input to \code{nn_method} to
#' avoid the overhead of repeatedly calculating the nearest neighbors when
#' manipulating unrelated parameters (e.g. \code{min_dist}, \code{n_epochs},
#' \code{init}). See the "Value" section for the names of the list items. If
#' \code{FALSE}, just return the coordinates. Note that the nearest neighbors
#' could be sensitive to data scaling, so be wary of reusing nearest neighbor
#' data if modifying the \code{scale} parameter. This parameter can be used in
#' conjunction with \code{ret_model} and \code{ret_extra}.
#' @param ret_extra A vector indicating what extra data to return. May contain
#' any combination of the following strings:
#' \itemize{
#' \item \code{"model"} Same as setting \code{ret_model = TRUE}.
#' \item \code{"nn"} Same as setting \code{ret_nn = TRUE}.
#' \item \code{"fgraph"} the high dimensional fuzzy graph (i.e. the fuzzy
#' simplicial set of the merged local views of the input data). The graph
#' is returned as a sparse symmetric N x N matrix of class
#' \link[Matrix]{dgCMatrix-class}, where a non-zero entry (i, j) gives the
#' membership strength of the edge connecting vertex i and vertex j. This
#' can be considered analogous to the input probability (or similarity or
#' affinity) used in t-SNE and LargeVis. Note that the graph is further
#' sparsified by removing edges with sufficiently low membership strength
#' that they would not be sampled by the probabilistic edge sampling
#' employed for optimization and therefore the number of non-zero elements
#' in the matrix is dependent on \code{n_epochs}. If you are only
#' interested in the fuzzy input graph (e.g. for clustering), setting
#' \code{n_epochs = 0} will avoid any further sparsifying.
#' Be aware that setting \code{binary_edge_weights = TRUE} will affect this
#' graph (all non-zero edge weights will be 1). A sketch of extracting this
#' graph follows the list.
#' \item \code{"sigma"} the normalization value for each observation in the
#' dataset when constructing the smoothed distances to each of its
#' neighbors. This gives some sense of the local density of each
#' observation in the high dimensional space: higher values of
#' \code{sigma} indicate a higher dispersion or lower density.
#' }
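#'
#' For example, a sketch of extracting the fuzzy graph for clustering without
#' carrying out any optimization:
#' \preformatted{
#' res <- umap2(X, n_epochs = 0, ret_extra = c("fgraph"))
#' fuzzy_graph <- res$fgraph  # sparse dgCMatrix of membership strengths
#' }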
#' @param n_threads Number of threads to use (except during stochastic gradient
#' descent). Default is half the number of concurrent threads supported by the
#' system. For nearest neighbor search, only applies if
#' \code{nn_method = "annoy"}. If \code{n_threads > 1}, then the Annoy index
#' will be temporarily written to disk in the location determined by
#' \code{\link[base]{tempfile}}.
#' @param n_sgd_threads Number of threads to use during stochastic gradient
#' descent. If set to > 1, then be aware that if \code{batch = FALSE}, results
#' will \emph{not} be reproducible, even if \code{set.seed} is called with a
#' fixed seed before running. Set to \code{"auto"} to use the same value as
#' \code{n_threads}. Default is to use only one thread, unless
#' \code{batch = TRUE}, in which case \code{"auto"} is used.
#' @param grain_size The minimum amount of work to do on each thread. If this
#' value is set high enough, then fewer than \code{n_threads} or
#' \code{n_sgd_threads} threads will be used for processing, which might give
#' a performance improvement if the overhead of thread management and context
#' switching outweighs the improvement due to concurrent processing.
#' This should usually be left at its default (\code{1}), in which case work
#' is spread evenly over all the specified threads.
#' @param tmpdir Temporary directory to store nearest neighbor indexes during
#' nearest neighbor search. Default is \code{\link{tempdir}}. The index is
#' only written to disk if \code{n_threads > 1} and
#' \code{nn_method = "annoy"}; otherwise, this parameter is ignored.
#' @param verbose If \code{TRUE}, log details to the console.
#' @param opt_args A list of optimizer parameters, used when
#' \code{batch = TRUE}. The default optimization method used is Adam (Kingma
#' and Ba, 2014).
#' \itemize{
#' \item \code{method} The optimization method to use. Either \code{"adam"}
#' or \code{"sgd"} (stochastic gradient descent). Default: \code{"adam"}.
#' \item \code{beta1} (Adam only). The weighting parameter for the
#' exponential moving average of the first moment estimator. Effectively the
#' momentum parameter. Should be a floating point value between 0 and 1.
#' Higher values can smooth oscillatory updates in poorly-conditioned
#' situations and may allow for a larger \code{learning_rate} to be
#' specified, but too high can cause divergence. Default: \code{0.5}.
#' \item \code{beta2} (Adam only). The weighting parameter for the
#' exponential moving average of the uncentered second moment estimator.
#' Should be a floating point value between 0 and 1. Controls the degree of
#' adaptivity in the step-size. Higher values put more weight on previous
#' time steps. Default: \code{0.9}.
#' \item \code{eps} (Adam only). Intended to be a small value to prevent
#' division by zero, but in practice can also affect convergence due to its
#' interaction with \code{beta2}. Higher values reduce the effect of the
#' step-size adaptivity and bring the behavior closer to stochastic gradient
#' descent with momentum. Typical values are between 1e-8 and 1e-3. Default:
#' \code{1e-7}.
#' \item \code{alpha} The initial learning rate. Default: the value of the
#' \code{learning_rate} parameter.
#' }
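#'
#' As a sketch, overriding some of the Adam defaults (the values shown are
#' illustrative only):
#' \preformatted{
#' X_umap <- umap2(X, batch = TRUE,
#'                 opt_args = list(beta1 = 0.9, eps = 1e-3))
#' }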
#' @param epoch_callback A function which will be invoked at the end of every
#' epoch. Its signature should be: \code{(epoch, n_epochs, coords)}, where:
#' \itemize{
#' \item \code{epoch} The current epoch number (between \code{1} and
#' \code{n_epochs}).
#' \item \code{n_epochs} Number of epochs to use during the optimization of
#' the embedded coordinates.
#' \item \code{coords} The embedded coordinates as of the end of the current
#' epoch, as a matrix with dimensions (N, \code{n_components}).
#' }
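#'
#' As a sketch, a callback that reports progress at the end of each epoch:
#' \preformatted{
#' progress_cb <- function(epoch, n_epochs, coords) {
#'   message("completed epoch ", epoch, " of ", n_epochs)
#' }
#' X_umap <- umap2(X, epoch_callback = progress_cb)
#' }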
#' @param binary_edge_weights If \code{TRUE} then edge weights in the input
#' graph are treated as binary (0/1) rather than real valued. This affects the
#' sampling frequency of neighbors and is the strategy used by the PaCMAP
#' method (Wang and co-workers, 2021). Practical (Böhm and co-workers, 2020)
#' and theoretical (Damrich and Hamprecht, 2021) work suggests this has little
#' effect on UMAP's performance.
#' @param dens_scale A value between 0 and 1. If > 0 then the output attempts
#' to preserve relative local density around each observation. This uses an
#' approximation to the densMAP method (Narayan and co-workers, 2021). The
#' larger the value of \code{dens_scale}, the greater the range of output
#' densities that will be used to map the input densities. This option is
#' ignored if using multiple \code{metric} blocks.
#' @param seed Integer seed to use to initialize the random number generator
#' state. Combined with \code{n_sgd_threads = 1} or \code{batch = TRUE}, this
#' should give consistent output across multiple runs on a given installation.
#' Setting this value is equivalent to calling \code{\link[base]{set.seed}},
#' but it may be more convenient in some situations than having to call a
#' separate function. The default is to not set a seed. If
#' \code{ret_model = TRUE}, the seed will be stored in the output model and
#' then used to set the seed inside \code{\link{umap_transform}}.
#' @return A matrix of optimized coordinates, or:
#' \itemize{
#' \item if \code{ret_model = TRUE} (or \code{ret_extra} contains
#' \code{"model"}), returns a list containing extra information that can be
#' used to add new data to an existing embedding via
#' \code{\link{umap_transform}}. In this case, the coordinates are available
#' in the list item \code{embedding}. \bold{NOTE}: The contents of
#' the \code{model} list should \emph{not} be considered stable or part of
#' the public API, and are purposely left undocumented.
#' \item if \code{ret_nn = TRUE} (or \code{ret_extra} contains \code{"nn"}),
#' returns the nearest neighbor data as a list called \code{nn}. This
#' contains one list for each \code{metric} calculated, itself containing a
#' matrix \code{idx} with the integer ids of the neighbors; and a matrix
#' \code{dist} with the distances. The \code{nn} list (or a sub-list) can be
#' used as input to the \code{nn_method} parameter.
#' \item if \code{ret_extra} contains \code{"fgraph"}, returns the high
#' dimensional fuzzy graph as a sparse matrix called \code{fgraph}, of type
#' \link[Matrix]{dgCMatrix-class}.
#' \item if \code{ret_extra} contains \code{"sigma"}, returns a vector of the
#' smooth knn distance normalization terms for each observation as
#' \code{"sigma"} and a vector \code{"rho"} containing the largest
#' distance to the locally connected neighbors of each observation.
#' \item if \code{ret_extra} contains \code{"localr"}, returns a vector of
#' the estimated local radii, the sum of \code{"sigma"} and \code{"rho"}.
#' }
#' The returned list contains the combined data from any combination of
#' specifying \code{ret_model}, \code{ret_nn} and \code{ret_extra}.
#' @examples
#'
#' iris30 <- iris[c(1:10, 51:60, 101:110), ]
#' iris_umap <- umap2(iris30, n_neighbors = 5)
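#'
#' # sketches of further usage (not run, to keep the example runtime short);
#' # the data subsets used here are illustrative
#' \dontrun{
#' # supervised embedding, using the species factor as the target
#' iris_sup <- umap2(iris30, n_neighbors = 5, y = iris30$Species)
#'
#' # return a model and embed new points with umap_transform
#' iris_model <- umap2(iris30, n_neighbors = 5, ret_model = TRUE)
#' iris_new <- umap_transform(iris[40:50, ], iris_model)
#' }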
#'
#' @references
#' Belkin, M., & Niyogi, P. (2002).
#' Laplacian eigenmaps and spectral techniques for embedding and clustering.
#' In \emph{Advances in neural information processing systems}
#' (pp. 585-591).
#' \url{http://papers.nips.cc/paper/1961-laplacian-eigenmaps-and-spectral-techniques-for-embedding-and-clustering.pdf}
#'
#' Böhm, J. N., Berens, P., & Kobak, D. (2020).
#' A unifying perspective on neighbor embeddings along the attraction-repulsion spectrum.
#' \emph{arXiv preprint} \emph{arXiv:2007.08902}.
#' \url{https://arxiv.org/abs/2007.08902}
#'
#' Damrich, S., & Hamprecht, F. A. (2021).
#' On UMAP's true loss function.
#' \emph{Advances in Neural Information Processing Systems}, \emph{34}.
#' \url{https://proceedings.neurips.cc/paper/2021/hash/2de5d16682c3c35007e4e92982f1a2ba-Abstract.html}
#'
#' Dong, W., Moses, C., & Li, K. (2011, March).
#' Efficient k-nearest neighbor graph construction for generic similarity measures.
#' In \emph{Proceedings of the 20th international conference on World Wide Web}
#' (pp. 577-586).
#' ACM.
#' \doi{10.1145/1963405.1963487}.
#'
#' Kingma, D. P., & Ba, J. (2014).
#' Adam: A method for stochastic optimization.
#' \emph{arXiv preprint} \emph{arXiv:1412.6980}.
#' \url{https://arxiv.org/abs/1412.6980}
#'
#' Malkov, Y. A., & Yashunin, D. A. (2018).
#' Efficient and robust approximate nearest neighbor search using hierarchical
#' navigable small world graphs.
#' \emph{IEEE transactions on pattern analysis and machine intelligence}, \emph{42}(4), 824-836.
#'
#' McInnes, L., Healy, J., & Melville, J. (2018).
#' UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction.
#' \emph{arXiv preprint} \emph{arXiv:1802.03426}.
#' \url{https://arxiv.org/abs/1802.03426}
#'
#' Narayan, A., Berger, B., & Cho, H. (2021).
#' Assessing single-cell transcriptomic variability through density-preserving data visualization.
#' \emph{Nature biotechnology}, \emph{39}(6), 765-774.
#' \doi{10.1038/s41587-020-00801-7}
#'
#' O’Neill, M. E. (2014).
#' \emph{PCG: A family of simple fast space-efficient statistically good
#' algorithms for random number generation}
#' (Report No. HMC-CS-2014-0905). Harvey Mudd College.
#'
#' Tang, J., Liu, J., Zhang, M., & Mei, Q. (2016, April).
#' Visualizing large-scale and high-dimensional data.
#' In \emph{Proceedings of the 25th International Conference on World Wide Web}
#' (pp. 287-297).
#' International World Wide Web Conferences Steering Committee.
#' \url{https://arxiv.org/abs/1602.00370}
#'
#' Van der Maaten, L., & Hinton, G. (2008).
#' Visualizing data using t-SNE.
#' \emph{Journal of Machine Learning Research}, \emph{9} (2579-2605).
#' \url{https://www.jmlr.org/papers/v9/vandermaaten08a.html}
#'
#' Wang, Y., Huang, H., Rudin, C., & Shaposhnik, Y. (2021).
#' Understanding How Dimension Reduction Tools Work: An Empirical Approach to Deciphering t-SNE, UMAP, TriMap, and PaCMAP for Data Visualization.
#' \emph{Journal of Machine Learning Research}, \emph{22}(201), 1-73.
#' \url{https://www.jmlr.org/papers/v22/20-1061.html}
#'
#' @export
umap2 <-
function(X,
n_neighbors = 15,
n_components = 2,
metric = "euclidean",
n_epochs = NULL,
learning_rate = 1,
scale = FALSE,
init = "spectral",
init_sdev = "range",
spread = 1,
min_dist = 0.1,
set_op_mix_ratio = 1.0,
local_connectivity = 1.0,
bandwidth = 1.0,
repulsion_strength = 1.0,
negative_sample_rate = 5.0,
a = NULL,
b = NULL,
nn_method = NULL,
n_trees = 50,
search_k = 2 * n_neighbors * n_trees,
approx_pow = FALSE,
y = NULL,
target_n_neighbors = n_neighbors,
target_metric = "euclidean",
target_weight = 0.5,
pca = NULL,
pca_center = TRUE,
pcg_rand = TRUE,
fast_sgd = FALSE,
ret_model = FALSE,
ret_nn = FALSE,
ret_extra = c(),
n_threads = NULL,
n_sgd_threads = 0,
grain_size = 1,
tmpdir = tempdir(),
verbose = getOption("verbose", TRUE),
batch = TRUE,
opt_args = NULL,
epoch_callback = NULL,
pca_method = NULL,
binary_edge_weights = FALSE,
dens_scale = NULL,
seed = NULL,
nn_args = list()) {
if (is.null(nn_method)) {
if (is_installed("RcppHNSW") &&
is.character(metric) &&
is_ok_hnsw_metric(metric) &&
is_ok_hnsw_metric(target_metric)) {
nn_method <- "hnsw"
tsmessage("Using HNSW for nearest neighbor search")
}
}
if (is.null(nn_method)) {
if (is_installed("rnndescent")) {
nn_method <- "nndescent"
tsmessage("Using NN-Descent for nearest neighbor search")
}
}
if (is.null(n_threads)) {
n_threads <- default_num_threads()
if (batch) {
n_sgd_threads <- n_threads
}
}
if (is_sparse_matrix(X)) {
if (!methods::is(X, "dgCMatrix")) {
stop("sparse X must be a dgCMatrix object")
}
if (!is.list(nn_method)) {
if (!is_installed("rnndescent")) {
stop(
"nearest neighbor search for sparse matrices requires the ",
"'rnndescent' package, please install it"
)
}
if (!is.null(nn_method) && nn_method != "nndescent") {
stop(
"nearest neighbor search for sparse matrices only supports ",
"the 'nndescent' method"
)
}
tsmessage("Using nndescent for nearest neighbor search")
nn_method <- "nndescent"
}
}
if (is.null(n_epochs)) {
n_epochs <- 500
}
if (is.numeric(a) &&
is.numeric(b) && a == 1 && b == 1 && is.null(dens_scale)) {
method <- "tumap"
} else {
method <- "umap"
}
uwot(
X = X,
n_neighbors = n_neighbors,
n_components = n_components,
metric = metric,
n_epochs = n_epochs,
alpha = learning_rate,
scale = scale,
init = init,
init_sdev = init_sdev,
spread = spread,
min_dist = min_dist,
set_op_mix_ratio = set_op_mix_ratio,
local_connectivity = local_connectivity,
bandwidth = bandwidth,
gamma = repulsion_strength,
negative_sample_rate = negative_sample_rate,
a = a,
b = b,
nn_method = nn_method,
n_trees = n_trees,
search_k = search_k,
method = method,
approx_pow = approx_pow,
n_threads = n_threads,
n_sgd_threads = n_sgd_threads,
grain_size = grain_size,
y = y,
target_n_neighbors = target_n_neighbors,
target_weight = target_weight,
target_metric = target_metric,
pca = pca,
pca_center = pca_center,
pca_method = pca_method,
pcg_rand = pcg_rand,
fast_sgd = fast_sgd,
ret_model = ret_model || "model" %in% ret_extra,
ret_nn = ret_nn || "nn" %in% ret_extra,
ret_fgraph = "fgraph" %in% ret_extra,
ret_sigma = "sigma" %in% ret_extra,
ret_localr = "localr" %in% ret_extra,
batch = batch,
opt_args = opt_args,
epoch_callback = epoch_callback,
binary_edge_weights = binary_edge_weights,
tmpdir = tmpdir,
verbose = verbose,
dens_scale = dens_scale,
seed = seed,
nn_args = nn_args,
sparse_X_is_distance_matrix = FALSE
)
}