# R/uwot.R In jlmelville/uwot: The Uniform Manifold Approximation and Projection (UMAP) Method for Dimensionality Reduction

#' Dimensionality Reduction with UMAP
#'
#' Carry out dimensionality reduction of a dataset using the Uniform Manifold
#' Approximation and Projection (UMAP) method (McInnes et al., 2018). Some of
#' the following help text is lifted verbatim from the Python reference
#' implementation at \url{https://github.com/lmcinnes/umap}.
#'
#'   Matrix and data frames should contain one observation per row. Data frames
#'   will have any non-numeric columns removed, although factor columns will be
#'   used if explicitly included via \code{metric} (see the help for
#'   \code{metric} for details). A sparse matrix is interpreted as a distance
#'   matrix, and is assumed to be symmetric, so you can also pass in an
#'   explicitly upper or lower triangular sparse matrix to save storage. There
#'   must be at least \code{n_neighbors} non-zero distances for each row. Both
#'   implicit and explicit zero entries are ignored. Set zero distances you want
#'   to keep to an arbitrarily small non-zero value (e.g. \code{1e-10}).
#'   \code{X} can also be \code{NULL} if pre-computed nearest neighbor data is
#'   passed to \code{nn_method}, and \code{init} is not \code{"spca"} or
#'   \code{"pca"}.
#' @param n_neighbors The size of local neighborhood (in terms of number of
#'   neighboring sample points) used for manifold approximation. Larger values
#'   result in more global views of the manifold, while smaller values result in
#'   more local data being preserved. In general values should be in the range
#'   \code{2} to \code{100}.
#' @param n_components The dimension of the space to embed into. This defaults
#'   to \code{2} to provide easy visualization, but can reasonably be set to any
#'   integer value in the range \code{2} to \code{100}.
#' @param metric Type of distance metric to use to find nearest neighbors. For
#'  \code{nn_method = "annoy"} this can be one of:
#' \itemize{
#'   \item \code{"euclidean"} (the default)
#'   \item \code{"cosine"}
#'   \item \code{"manhattan"}
#'   \item \code{"hamming"}
#'   \item \code{"correlation"} (a distance based on the Pearson correlation)
#'   \item \code{"categorical"} (see below)
#' }
#' For \code{nn_method = "hnsw"} this can be one of:
#' \itemize{
#'   \item \code{"euclidean"}
#'   \item \code{"cosine"}
#'   \item \code{"correlation"}
#' }
#' If \href{https://cran.r-project.org/package=rnndescent}{rnndescent} is
#' installed and \code{nn_method = "nndescent"} is specified then many more
#' metrics are avaiable, including:
#' \itemize{
#' \item \code{"braycurtis"}
#' \item \code{"canberra"}
#' \item \code{"chebyshev"}
#' \item \code{"dice"}
#' \item \code{"hamming"}
#' \item \code{"hellinger"}
#' \item \code{"jaccard"}
#' \item \code{"jensenshannon"}
#' \item \code{"kulsinski"}
#' \item \code{"rogerstanimoto"}
#' \item \code{"russellrao"}
#' \item \code{"sokalmichener"}
#' \item \code{"sokalsneath"}
#' \item \code{"spearmanr"}
#' \item \code{"symmetrickl"}
#' \item \code{"tsss"}
#' \item \code{"yule"}
#' }
#' For more details see the package documentation of \code{rnndescent}.
#' For \code{nn_method = "fnn"}, the distance metric is always "euclidean".
#'
#' If \code{X} is a data frame or matrix, then multiple metrics can be
#' specified, by passing a list to this argument, where the name of each item in
#' the list is one of the metric names above. The value of each list item should
#' be a vector giving the names or integer ids of the columns to be included in
#' a calculation, e.g. \code{metric = list(euclidean = 1:4, manhattan = 5:10)}.
#'
#' Each metric calculation results in a separate fuzzy simplicial set, which are
#' intersected together to produce the final set. Metric names can be repeated.
#' Because non-numeric columns are removed from the data frame, it is safer to
#' use column names than integer ids.
#'
#' Factor columns can also be used by specifying the metric name
#' \code{"categorical"}. Factor columns are treated different from numeric
#' columns and although multiple factor columns can be specified in a vector,
#' each factor column specified is processed individually. If you specify
#' a non-factor column, it will be coerced to a factor.
#'
#' For a given data block, you may override the \code{pca} and \code{pca_center}
#' arguments for that block, by providing a list with one unnamed item
#' containing the column names or ids, and then any of the \code{pca} or
#' \code{pca_center} overrides as named items, e.g. \code{metric =
#' list(euclidean = 1:4, manhattan = list(5:10, pca_center = FALSE))}. This
#' exists to allow mixed binary and real-valued data to be included and to have
#' PCA applied to both, but with centering applied only to the real-valued data
#' (it is typical not to apply centering to binary data before PCA is applied).
#' @param n_epochs Number of epochs to use during the optimization of the
#'   embedded coordinates. By default, this value is set to \code{500} for
#'   datasets containing 10,000 vertices or less, and \code{200} otherwise.
#'   If \code{n_epochs = 0}, then coordinates determined by \code{"init"} will
#'   be returned.
#' @param scale Scaling to apply to \code{X} if it is a data frame or matrix:
#' \itemize{
#'   \item{\code{"none"} or \code{FALSE} or \code{NULL}} No scaling.
#'   \item{\code{"Z"} or \code{"scale"} or \code{TRUE}} Scale each column to
#'   zero mean and variance 1.
#'   \item{\code{"maxabs"}} Center each column to mean 0, then divide each
#'   element by the maximum absolute value over the entire matrix.
#'   \item{\code{"range"}} Range scale the entire matrix, so the smallest
#'   element is 0 and the largest is 1.
#'   \item{\code{"colrange"}} Scale each column in the range (0,1).
#' }
#' For UMAP, the default is \code{"none"}.
#' @param learning_rate Initial learning rate used in optimization of the
#'   coordinates.
#' @param init Type of initialization for the coordinates. Options are:
#'   \itemize{
#'     \item \code{"spectral"} Spectral embedding using the normalized Laplacian
#'     of the fuzzy 1-skeleton, with Gaussian noise added.
#'     \item \code{"normlaplacian"}. Spectral embedding using the normalized
#'     Laplacian of the fuzzy 1-skeleton, without noise.
#'     \item \code{"random"}. Coordinates assigned using a uniform random
#'     distribution between -10 and 10.
#'     \item \code{"lvrandom"}. Coordinates assigned using a Gaussian
#'     distribution with standard deviation 1e-4, as used in LargeVis
#'     (Tang et al., 2016) and t-SNE.
#'     \item \code{"laplacian"}. Spectral embedding using the Laplacian Eigenmap
#'     (Belkin and Niyogi, 2002).
#'     \item \code{"pca"}. The first two principal components from PCA of
#'     \code{X} if \code{X} is a data frame, and from a 2-dimensional classical
#'     MDS if \code{X} is of class \code{"dist"}.
#'     \item \code{"spca"}. Like \code{"pca"}, but each dimension is then scaled
#'     so the standard deviation is 1e-4, to give a distribution similar to that
#'     used in t-SNE. This is an alias for \code{init = "pca", init_sdev =
#'     1e-4}.
#'     \item \code{"agspectral"} An "approximate global" modification of
#'     \code{"spectral"} which all edges in the graph to a value of 1, and then
#'     sets a random number of edges (\code{negative_sample_rate} edges per
#'     vertex) to 0.1, to approximate the effect of non-local affinities.
#'     \item A matrix of initial coordinates.
#'   }
#'  For spectral initializations, (\code{"spectral"}, \code{"normlaplacian"},
#'  \code{"laplacian"}, \code{"agspectral"}), if more than one connected
#'  component is identified, no spectral initialization is attempted. Instead
#'  a PCA-based initialization is attempted. If \code{verbose = TRUE} the
#'  number of connected components are logged to the console. The existence of
#'  multiple connected components implies that a global view of the data cannot
#'  be attained with this initialization. Increasing the value of
#'  \code{n_neighbors} may help.
#' @param init_sdev If non-\code{NULL}, scales each dimension of the initialized
#'   coordinates (including any user-supplied matrix) to this standard
#'   deviation. By default no scaling is carried out, except when \code{init =
#'   "spca"}, in which case the value is \code{0.0001}. Scaling the input may
#'   help if the unscaled versions result in initial coordinates with large
#'   inter-point distances or outliers. This usually results in small gradients
#'   during optimization and very little progress being made to the layout.
#'   Shrinking the initial embedding by rescaling can help under these
#'   circumstances. Scaling the result of \code{init = "pca"} is usually
#'   recommended and \code{init = "spca"} as an alias for \code{init = "pca",
#'   init_sdev = 1e-4} but for the spectral initializations the scaled versions
#'   usually aren't necessary unless you are using a large value of
#'   \code{n_neighbors} (e.g. \code{n_neighbors = 150} or higher). For
#'   compatibility with recent versions of the Python UMAP package, if you are
#'   using \code{init = "spectral"}, then you should also set
#'   \code{init_sdev = "range"}, which will range scale each of the columns
#'   containing the initial data between 0-10. This is not set by default to
#'   maintain backwards compatibility with previous versions of uwot.
#' @param spread The effective scale of embedded points. In combination with
#'   \code{min_dist}, this determines how clustered/clumped the embedded points
#'   are.
#' @param min_dist The effective minimum distance between embedded points.
#'   Smaller values will result in a more clustered/clumped embedding where
#'   nearby points on the manifold are drawn closer together, while larger
#'   values will result on a more even dispersal of points. The value should be
#'   set relative to the \code{spread} value, which determines the scale at
#'   which embedded points will be spread out.
#' @param set_op_mix_ratio Interpolate between (fuzzy) union and intersection as
#'   the set operation used to combine local fuzzy simplicial sets to obtain a
#'   global fuzzy simplicial sets. Both fuzzy set operations use the product
#'   t-norm. The value of this parameter should be between \code{0.0} and
#'   \code{1.0}; a value of \code{1.0} will use a pure fuzzy union, while
#'   \code{0.0} will use a pure fuzzy intersection.
#' @param local_connectivity The local connectivity required -- i.e. the number
#'   of nearest neighbors that should be assumed to be connected at a local
#'   level. The higher this value the more connected the manifold becomes
#'   locally. In practice this should be not more than the local intrinsic
#'   dimension of the manifold.
#' @param bandwidth The effective bandwidth of the kernel if we view the
#'   algorithm as similar to Laplacian Eigenmaps. Larger values induce more
#'   connectivity and a more global view of the data, smaller values concentrate
#'   more locally.
#' @param repulsion_strength Weighting applied to negative samples in low
#'   dimensional embedding optimization. Values higher than one will result in
#'   greater weight being given to negative samples.
#' @param negative_sample_rate The number of negative edge/1-simplex samples to
#'   use per positive edge/1-simplex sample in optimizing the low dimensional
#'   embedding.
#' @param a More specific parameters controlling the embedding. If \code{NULL}
#'   these values are set automatically as determined by \code{min_dist} and
#' @param b More specific parameters controlling the embedding. If \code{NULL}
#'   these values are set automatically as determined by \code{min_dist} and
#' @param nn_method Method for finding nearest neighbors. Options are:
#'   \itemize{
#'     \item \code{"fnn"}. Use exact nearest neighbors via the
#'       \href{https://cran.r-project.org/package=FNN}{FNN} package.
#'     \item \code{"annoy"} Use approximate nearest neighbors via the
#'       \href{https://cran.r-project.org/package=RcppAnnoy}{RcppAnnoy} package.
#'     \item \code{"hnsw"} Use approximate nearest neighbors with the
#'       Hierarchical Navigable Small World (HNSW) method (Malkov and Yashunin,
#'       2018) via the
#'       \href{https://cran.r-project.org/package=RcppHNSW}{RcppHNSW} package.
#'       \code{RcppHNSW} is not a dependency of this package: this option is
#'       only available if you have installed \code{RcppHNSW} yourself. Also,
#'       HNSW only supports the following arguments for \code{metric} and
#'       \code{target_metric}: \code{"euclidean"}, \code{"cosine"} and
#'       \code{"correlation"}.
#'     \item \code{"nndescent"} Use approximate nearest neighbors with the
#'       Nearest Neighbor Descent method (Dong et al., 2011) via the
#'       \href{https://cran.r-project.org/package=rnndescent}{rnndescent}
#'       package. \code{rnndescent} is not a dependency of this package: this
#'       option is only available if you have installed \code{rnndescent}
#'       yourself.
#'    }
#'   By default, if \code{X} has less than 4,096 vertices, the exact nearest
#'   neighbors are found. Otherwise, approximate nearest neighbors are used.
#'   You may also pass pre-calculated nearest neighbor data to this argument. It
#'   must be one of two formats, either a list consisting of two elements:
#'   \itemize{
#'     \item \code{"idx"}. A \code{n_vertices x n_neighbors} matrix
#'     containing the integer indexes of the nearest neighbors in \code{X}. Each
#'     vertex is considered to be its own nearest neighbor, i.e.
#'     \code{idx[, 1] == 1:n_vertices}.
#'     \item \code{"dist"}. A \code{n_vertices x n_neighbors} matrix
#'     containing the distances of the nearest neighbors.
#'   }
#'   or a sparse distance matrix of type \code{dgCMatrix}, with dimensions
#'   \code{n_vertices x n_vertices}. Distances should be arranged by column,
#'   i.e. a non-zero entry in row \code{j} of the \code{i}th column indicates
#'   that the \code{j}th observation in \code{X} is a nearest neighbor of the
#'   \code{i}th observation with the distance given by the value of that
#'   element.
#'   The \code{n_neighbors} parameter is ignored when using precomputed
#'   nearest neighbor data. If using the sparse distance matrix input, each
#'   column can contain a different number of neighbors.
#' @param n_trees Number of trees to build when constructing the nearest
#'   neighbor index. The more trees specified, the larger the index, but the
#'   better the results. With \code{search_k}, determines the accuracy of the
#'   Annoy nearest neighbor search. Only used if the \code{nn_method} is
#'   \code{"annoy"}. Sensible values are between \code{10} to \code{100}.
#' @param search_k Number of nodes to search during the neighbor retrieval. The
#'   larger k, the more the accurate results, but the longer the search takes.
#'   With \code{n_trees}, determines the accuracy of the Annoy nearest neighbor
#'   search. Only used if the \code{nn_method} is \code{"annoy"}.
#' @param nn_args A list containing additional arguments to pass to the nearest
#'   neighbor method. For \code{nn_method = "annoy"}, you can specify
#'   \code{"n_trees"} and \code{"search_k"}, and these will override the
#'   \code{n_trees} and \code{search_k} parameters.
#'   For \code{nn_method = "hnsw"}, you may specify the following arguments:
#'   \itemize{
#'   \item \code{M} The maximum number of neighbors to keep for each vertex.
#'   Reasonable values are \code{2} to \code{100}. Higher values give better
#'   recall at the cost of more memory. Default value is \code{16}.
#'   \item \code{ef_construction} A positive integer specifying the size of
#'   the dynamic list used during index construction. A higher value will
#'   provide better results at the cost of a longer time to build the index.
#'   Default is \code{200}.
#'   \item \code{ef} A positive integer specifying the size of the dynamic
#'   list used during search. This cannot be smaller than \code{n_neighbors}
#'   and cannot be higher than the number of items in the index. Default is
#'   \code{10}.
#'   }
#'   For \code{nn_method = "nndescent"}, you may specify the following
#'   arguments:
#'   \itemize{
#'   \item \code{n_trees} The number of trees to use in a random projection
#'   forest to initialize the search. A larger number will give more accurate
#'   results at the cost of a longer computation time. The default of
#'   \code{NULL} means that the number is chosen based on the number of
#'   observations in \code{X}.
#'   \item \code{max_candidates} The number of potential neighbors to explore
#'   per iteration. By default, this is set to \code{n_neighbors} or \code{60},
#'   whichever is smaller. A larger number will give more accurate results at
#'   the cost of a longer computation time.
#'   \item \code{n_iters} The number of iterations to run the search. A larger
#'   number will give more accurate results at the cost of a longer computation
#'   time. By default, this will be chosen based on the number of observations
#'   in \code{X}. You may also need to modify the convergence criterion
#'   \code{delta}.
#'   \item \code{delta} The minimum relative change in the neighbor graph
#'   allowed before early stopping. Should be a value between 0 and 1. The
#'   smaller the value, the smaller the amount of progress between iterations is
#'   allowed. Default value of \code{0.001} means that at least 0.1% of the
#'   neighbor graph must be updated at each iteration.
#'   \item \code{init} How to initialize the nearest neighbor descent. By
#'   default this is set to \code{"tree"} and uses a random project forest.
#'   If you set this to \code{"rand"}, then a random selection is used. Usually
#'   this is less accurate than using RP trees, but for high-dimensional cases,
#'   there may be little difference in the quality of the initialization and
#'   random initialization will be a lot faster. If you set this to
#'   \code{"rand"}, then the \code{n_trees} parameter is ignored.
#'   \item \code{pruning_degree_multiplier} The maximum number of edges per node
#'   to retain in the search graph, relative to \code{n_neighbors}. A larger
#'   value will give more accurate results at the cost of a longer computation
#'   time. Default is \code{1.5}. This parameter only affects neighbor search
#'   when transforming new data with \code{\link{umap_transform}}.
#'   \item \code{epsilon} Controls the degree of the back-tracking when
#'   traversing the search graph. Setting this to \code{0.0} will do a greedy
#'   search with no back-tracking. A larger value will give more accurate
#'   results at the cost of a longer computation time. Default is \code{0.1}.
#'   This parameter only affects neighbor search when transforming new data with
#'   \item \code{max_search_fraction} Specifies the maximum fraction of the
#'   search graph to traverse. By default, this is set to \code{1.0}, so the
#'   entire graph (i.e. all items in \code{X}) may be visited. You may want to
#'   set this to a smaller value if you have a very large dataset (in
#'   conjunction with \code{epsilon}) to avoid an inefficient exhaustive search
#'   of the data in \code{X}. This parameter only affects neighbor search when
#'   transforming new data with \code{\link{umap_transform}}.
#'   }
#' @param approx_pow If \code{TRUE}, use an approximation to the power function
#'   in the UMAP gradient, from
#'   \url{https://martin.ankerl.com/2012/01/25/optimized-approximative-pow-in-c-and-cpp/}.
#'   Ignored if \code{dens_scale} is non-\code{NULL}.
#' @param y Optional target data for supervised dimension reduction. Can be a
#' vector, matrix or data frame. Use the \code{target_metric} parameter to
#' specify the metrics to use, using the same syntax as \code{metric}. Usually
#' either a single numeric or factor column is used, but more complex formats
#' are possible. The following types are allowed:
#'   \itemize{
#'     \item Factor columns with the same length as \code{X}. \code{NA} is
#'     allowed for any observation with an unknown level, in which case
#'     UMAP operates as a form of semi-supervised learning. Each column is
#'     treated separately.
#'     \item Numeric data. \code{NA} is \emph{not} allowed in this case. Use the
#'     parameter \code{target_n_neighbors} to set the number of neighbors used
#'     with \code{y}. If unset, \code{n_neighbors} is used. Unlike factors,
#'     numeric columns are grouped into one block unless \code{target_metric}
#'     specifies otherwise. For example, if you wish columns \code{a} and
#'     \code{b} to be treated separately, specify
#'     \code{target_metric = list(euclidean = "a", euclidean = "b")}. Otherwise,
#'     the data will be effectively treated as a matrix with two columns.
#'     \item Nearest neighbor data, consisting of a list of two matrices,
#'     \code{idx} and \code{dist}. These represent the precalculated nearest
#'     neighbor indices and distances, respectively. This
#'     is the same format as that expected for precalculated data in
#'     \code{nn_method}. This format assumes that the underlying data was a
#'     numeric vector. Any user-supplied value of the \code{target_n_neighbors}
#'     parameter is ignored in this case, because the the number of columns in
#'     the matrices is used for the value. Multiple nearest neighbor data using
#'     different metrics can be supplied by passing a list of these lists.
#'   }
#' Unlike \code{X}, all factor columns included in \code{y} are automatically
#' used.
#' @param target_n_neighbors Number of nearest neighbors to use to construct the
#'   target simplicial set. Default value is \code{n_neighbors}. Applies only if
#'   \code{y} is non-\code{NULL} and \code{numeric}.
#' @param target_metric The metric used to measure distance for \code{y} if
#'   using supervised dimension reduction. Used only if \code{y} is numeric.
#' @param target_weight Weighting factor between data topology and target
#'   topology. A value of 0.0 weights entirely on data, a value of 1.0 weights
#'   entirely on target. The default of 0.5 balances the weighting equally
#'   between data and target. Only applies if \code{y} is non-\code{NULL}.
#' @param pca If set to a positive integer value, reduce data to this number of
#'   columns using PCA. Doesn't applied if the distance \code{metric} is
#'   \code{"hamming"}, or the dimensions of the data is larger than the
#'   number specified (i.e. number of rows and columns must be larger than the
#'   value of this parameter). If you have > 100 columns in a data frame or
#'   matrix, reducing the number of columns in this way may substantially
#'   increase the performance of the nearest neighbor search at the cost of a
#'   potential decrease in accuracy. In many t-SNE applications, a value of 50
#'   is recommended, although there's no guarantee that this is appropriate for
#'   all settings.
#' @param pca_center If \code{TRUE}, center the columns of \code{X} before
#'   carrying out PCA. For binary data, it's recommended to set this to
#'   \code{FALSE}.
#' @param pca_method Method to carry out any PCA dimensionality reduction when
#'   the \code{pca} parameter is specified. Allowed values are:
#'   \itemize{
#'     \item{\code{"irlba"}}. Uses \code{\link[irlba]{prcomp_irlba}} from the
#'     \href{https://cran.r-project.org/package=irlba}{irlba} package.
#'     \item{\code{"rsvd"}}. Uses 5 iterations of \code{\link[irlba]{svdr}} from
#'     the \href{https://cran.r-project.org/package=irlba}{irlba} package.
#'     This is likely to give much faster but potentially less accurate results
#'     than using \code{"irlba"}. For the purposes of nearest neighbor
#'     calculation and coordinates initialization, any loss of accuracy doesn't
#'     seem to matter much.
#'     from the \href{https://cran.r-project.org/package=bigstatsr}{bigstatsr}
#'     package. The SVD methods used in \code{bigstatsr} may be faster on
#'     Windows). \strong{Note}: \code{bigstatsr} is \emph{not} a dependency of
#'     uwot: if you choose to use this package for PCA, you \emph{must} install
#'     it yourself.
#'     \item{\code{"svd"}}. Uses \code{\link[base]{svd}} for the SVD. This is
#'     likely to be slow for all but the smallest datasets.
#'     \item{\code{"auto"}} (the default). Uses \code{"irlba"}, unless more than
#'     50% of the full set of singular vectors would be calculated, in which
#'     case \code{"svd"} is used.
#'   }
#' @param pcg_rand If \code{TRUE}, use the PCG random number generator (O'Neill,
#'   2014) during optimization. Otherwise, use the faster (but probably less
#'   statistically good) Tausworthe "taus88" generator. The default is
#'   \code{TRUE}.
#' @param fast_sgd If \code{TRUE}, then the following combination of parameters
#'   is set: \code{pcg_rand = TRUE}, \code{n_sgd_threads = "auto"} and
#'   \code{approx_pow = TRUE}. The default is \code{FALSE}. Setting this to
#'   \code{TRUE} will speed up the stochastic optimization phase, but give a
#'   potentially less accurate embedding, and which will not be exactly
#'   reproducible even with a fixed seed. For visualization, \code{fast_sgd =
#'   TRUE} will give perfectly good results. For more generic dimensionality
#'   reduction, it's safer to leave \code{fast_sgd = FALSE}. If \code{fast_sgd =
#'   TRUE}, then user-supplied values of \code{pcg_rand}, \code{n_sgd_threads},
#'   and \code{approx_pow} are ignored.
#' @param batch If \code{TRUE}, then embedding coordinates are updated at the
#'   end of each epoch rather than during the epoch. In batch mode, results are
#'   reproducible with a fixed random seed even with \code{n_sgd_threads > 1},
#'   at the cost of a slightly higher memory use. You may also have to modify
#'   \code{learning_rate} and increase \code{n_epochs}, so whether this provides
#'   a speed increase over the single-threaded optimization is likely to be
#'   dataset and hardware-dependent.
#' @param ret_model If \code{TRUE}, then return extra data that can be used to
#'   embedded coordinates are returned as the list item \code{embedding}. If
#'   \code{FALSE}, just return the coordinates. This parameter can be used in
#'   conjunction with \code{ret_nn} and \code{ret_extra}. Note that some
#'   settings are incompatible with the production of a UMAP model: external
#'   neighbor data (passed via a list to \code{nn_method}), and factor columns
#'   that were included via the \code{metric} parameter. In the latter case, the
#'   model produced is based only on the numeric data. A transformation using
#'   new data is possible, but the factor columns in the new data are ignored.
#'   Note that setting \code{ret_model = TRUE} forces the use of the approximate
#'   nearest neighbors method. Because small datasets would otherwise use exact
#'   nearest neighbor calculations, setting \code{ret_model = TRUE} means that
#'   different results may be returned for small datasets in terms of both the
#'   returned nearest neighbors (if requested) and the final embedded
#'   coordinates, compared to \code{ret_model = FALSE}, even if the random
#'   number seed is fixed. To avoid this, explicitly set
#'   \code{nn_method = "annoy"} in the \code{ret_model = FALSE} case.
#' @param ret_nn If \code{TRUE}, then in addition to the embedding, also return
#'   nearest neighbor data that can be used as input to \code{nn_method} to
#'   avoid the overhead of repeatedly calculating the nearest neighbors when
#'   manipulating unrelated parameters (e.g. \code{min_dist}, \code{n_epochs},
#'   \code{init}). See the "Value" section for the names of the list items. If
#'   \code{FALSE}, just return the coordinates. Note that the nearest neighbors
#'   could be sensitive to data scaling, so be wary of reusing nearest neighbor
#'   data if modifying the \code{scale} parameter. This parameter can be used in
#'   conjunction with \code{ret_model} and \code{ret_extra}.
#' @param ret_extra A vector indicating what extra data to return. May contain
#'   any combination of the following strings:
#'   \itemize{
#'     \item \code{"model"} Same as setting \code{ret_model = TRUE}.
#'     \item \code{"nn"} Same as setting \code{ret_nn = TRUE}.
#'     \item \code{"fgraph"} the high dimensional fuzzy graph (i.e. the fuzzy
#'       simplicial set of the merged local views of the input data). The graph
#'       is returned as a sparse symmetric N x N matrix of class
#'       \link[Matrix]{dgCMatrix-class}, where a non-zero entry (i, j) gives the
#'       membership strength of the edge connecting vertex i and vertex j. This
#'       can be considered analogous to the input probability (or similarity or
#'       affinity) used in t-SNE and LargeVis. Note that the graph is further
#'       sparsified by removing edges with sufficiently low membership strength
#'       that they would not be sampled by the probabilistic edge sampling
#'       employed for optimization and therefore the number of non-zero elements
#'       in the matrix is dependent on \code{n_epochs}. If you are only
#'       interested in the fuzzy input graph (e.g. for clustering), setting
#'       \code{n_epochs = 0} will avoid any further sparsifying.
#'       Be aware that setting binary_edge_weights = TRUE will affect this
#'       graph (all non-zero edge weights will be 1).
#'    \item \code{"sigma"} the normalization value for each observation in the
#'       dataset when constructing the smoothed distances to each of its
#'       neighbors. This gives some sense of the local density of each
#'       observation in the high dimensional space: higher values of
#'       \code{sigma} indicate a higher dispersion or lower density.
#'   }
#'   descent). Default is half the number of concurrent threads supported by the
#'   system. For nearest neighbor search, only applies if
#'   \code{nn_method = "annoy"}. If \code{n_threads > 1}, then the Annoy index
#'   will be temporarily written to disk in the location determined by
#'   descent. If set to > 1, then be aware that if \code{batch = FALSE}, results
#'   will \emph{not} be reproducible, even if \code{set.seed} is called with a
#'   fixed seed before running. Set to \code{"auto"} to use the same value as
#' @param grain_size The minimum amount of work to do on each thread. If this
#'   value is set high enough, then less than \code{n_threads} or
#'   \code{n_sgd_threads} will be used for processing, which might give a
#'   switching was outweighing the improvement due to concurrent processing.
#'   This should be left at default (\code{1}) and work will be spread evenly
#'   over all the threads specified.
#' @param tmpdir Temporary directory to store nearest neighbor indexes during
#'   nearest neighbor search. Default is \code{\link{tempdir}}. The index is
#'   only written to disk if \code{n_threads > 1} and
#'   \code{nn_method = "annoy"}; otherwise, this parameter is ignored.
#' @param verbose If \code{TRUE}, log details to the console.
#' @param opt_args A list of optimizer parameters, used when
#'   \code{batch = TRUE}. The default optimization method used is Adam (Kingma
#'   and Ba, 2014).
#'   \itemize{
#'     \item \code{method} The optimization method to use. Either \code{"adam"}
#'     \item \code{beta1} (Adam only). The weighting parameter for the
#'     exponential moving average of the first moment estimator. Effectively the
#'     momentum parameter. Should be a floating point value between 0 and 1.
#'     Higher values can smooth oscillatory updates in poorly-conditioned
#'     situations and may allow for a larger \code{learning_rate} to be
#'     specified, but too high can cause divergence. Default: \code{0.5}.
#'     \item \code{beta2} (Adam only). The weighting parameter for the
#'     exponential moving average of the uncentered second moment estimator.
#'     Should be a floating point value between 0 and 1. Controls the degree of
#'     adaptivity in the step-size. Higher values put more weight on previous
#'     time steps. Default: \code{0.9}.
#'     \item \code{eps} (Adam only). Intended to be a small value to prevent
#'     division by zero, but in practice can also affect convergence due to its
#'     interaction with \code{beta2}. Higher values reduce the effect of the
#'     descent with momentum. Typical values are between 1e-8 and 1e-3. Default:
#'     \code{1e-7}.
#'     \item \code{alpha} The initial learning rate. Default: the value of the
#'     \code{learning_rate} parameter.
#'   }
#' @param epoch_callback A function which will be invoked at the end of every
#'   epoch. Its signature should be: \code{(epoch, n_epochs, coords)}, where:
#'   \itemize{
#'     \item \code{epoch} The current epoch number (between \code{1} and
#'     \code{n_epochs}).
#'     \item \code{n_epochs} Number of epochs to use during the optimization of
#'     the embedded coordinates.
#'     \item \code{coords} The embedded coordinates as of the end of the current
#'     epoch, as a matrix with dimensions (N, \code{n_components}).
#'   }
#' @param binary_edge_weights If \code{TRUE} then edge weights in the input
#'   graph are treated as binary (0/1) rather than real valued. This affects the
#'   sampling frequency of neighbors and is the strategy used by the PaCMAP
#'   method (Wang and co-workers, 2020). Practical (BÃ¶hm and co-workers, 2020)
#'   and theoretical (Damrich and Hamprecht, 2021) work suggests this has little
#'   effect on UMAP's performance.
#' @param dens_scale A value between 0 and 1. If > 0 then the output attempts
#'   to preserve relative local density around each observation. This uses an
#'   approximation to the densMAP method (Narayan and co-workers, 2021). The
#'   larger the value of \code{dens_scale}, the greater the range of output
#'   densities that will be used to map the input densities. This option is
#'   ignored if using multiple \code{metric} blocks.
#' @param seed Integer seed to use to initialize the random number generator
#'   state. Combined with \code{n_sgd_threads = 1} or \code{batch = TRUE}, this
#'   should give consistent output across multiple runs on a given installation.
#'   Setting this value is equivalent to calling \code{\link[base]{set.seed}},
#'   but it may be more convenient in some situations than having to call a
#'   separate function. The default is to not set a seed. If
#'   \code{ret_model = TRUE}, the seed will be stored in the output model and
#'   then used to set the seed inside \code{\link{umap_transform}}.
#' @return A matrix of optimized coordinates, or:
#'   \itemize{
#'     \item if \code{ret_model = TRUE} (or \code{ret_extra} contains
#'     \code{"model"}), returns a list containing extra information that can be
#'     used to add new data to an existing embedding via
#'     \code{\link{umap_transform}}. In this case, the coordinates are available
#'     in the list item \code{embedding}. \bold{NOTE}: The contents of
#'     the \code{model} list should \emph{not} be considered stable or part of
#'     the public API, and are purposely left undocumented.
#'     \item if \code{ret_nn = TRUE} (or \code{ret_extra} contains \code{"nn"}),
#'     returns the nearest neighbor data as a list called \code{nn}. This
#'     contains one list for each \code{metric} calculated, itself containing a
#'     matrix \code{idx} with the integer ids of the neighbors; and a matrix
#'     \code{dist} with the distances. The \code{nn} list (or a sub-list) can be
#'     used as input to the \code{nn_method} parameter.
#'     \item if \code{ret_extra} contains \code{"fgraph"}, returns the high
#'     dimensional fuzzy graph as a sparse matrix called \code{fgraph}, of type
#'     \item if \code{ret_extra} contains \code{"sigma"}, returns a vector of the
#'     smooth knn distance normalization terms for each observation as
#'     \code{"sigma"} and a vector \code{"rho"} containing the largest
#'     distance to the locally connected neighbors of each observation.
#'     \item if \code{ret_extra} contains \code{"localr"}, returns a vector of
#'     the estimated local radii, the sum of \code{"sigma"} and \code{"rho"}.
#'   }
#'   The returned list contains the combined data from any combination of
#'   specifying \code{ret_model}, \code{ret_nn} and \code{ret_extra}.
#' @examples
#'
#' iris30 <- iris[c(1:10, 51:60, 101:110), ]
#'
#' # Non-numeric columns are automatically removed so you can pass data frames
#' # directly in a lot of cases without pre-processing
#' iris_umap <- umap(iris30, n_neighbors = 5, learning_rate = 0.5, init = "random", n_epochs = 20)
#'
#' # Faster approximation to the gradient and return nearest neighbors
#' iris_umap <- umap(iris30, n_neighbors = 5, approx_pow = TRUE, ret_nn = TRUE, n_epochs = 20)
#'
#' # Can specify min_dist and spread parameters to control separation and size
#' # of clusters and reuse nearest neighbors for efficiency
#' nn <- iris_umap$nn #' iris_umap <- umap(iris30, n_neighbors = 5, min_dist = 1, spread = 5, nn_method = nn, n_epochs = 20) #' #' # Supervised dimension reduction using the 'Species' factor column #' iris_sumap <- umap(iris30, #' n_neighbors = 5, min_dist = 0.001, y = iris30$Species,
#'   target_weight = 0.5, n_epochs = 20
#' )
#'
#' # Calculate Petal and Sepal neighbors separately (uses intersection of the resulting sets):
#' iris_umap <- umap(iris30, metric = list(
#'   "euclidean" = c("Sepal.Length", "Sepal.Width"),
#'   "euclidean" = c("Petal.Length", "Petal.Width")
#' ))
#'
#' @references
#' Belkin, M., & Niyogi, P. (2002).
#' Laplacian eigenmaps and spectral techniques for embedding and clustering.
#' In \emph{Advances in neural information processing systems}
#' (pp. 585-591).
#' \url{http://papers.nips.cc/paper/1961-laplacian-eigenmaps-and-spectral-techniques-for-embedding-and-clustering.pdf}
#'
#' BÃ¶hm, J. N., Berens, P., & Kobak, D. (2020).
#' A unifying perspective on neighbor embeddings along the attraction-repulsion spectrum.
#' \emph{arXiv preprint} \emph{arXiv:2007.08902}.
#' \url{https://arxiv.org/abs/2007.08902}
#'
#' Damrich, S., & Hamprecht, F. A. (2021).
#' On UMAP's true loss function.
#' \emph{Advances in Neural Information Processing Systems}, \emph{34}.
#' \url{https://proceedings.neurips.cc/paper/2021/hash/2de5d16682c3c35007e4e92982f1a2ba-Abstract.html}
#'
#' Dong, W., Moses, C., & Li, K. (2011, March).
#' Efficient k-nearest neighbor graph construction for generic similarity measures.
#' In \emph{Proceedings of the 20th international conference on World Wide Web}
#' (pp. 577-586).
#' ACM.
#' \doi{10.1145/1963405.1963487}.
#'
#' Kingma, D. P., & Ba, J. (2014).
#' Adam: A method for stochastic optimization.
#' \emph{arXiv preprint} \emph{arXiv}:1412.6980.
#' \url{https://arxiv.org/abs/1412.6980}
#'
#' Malkov, Y. A., & Yashunin, D. A. (2018).
#' Efficient and robust approximate nearest neighbor search using hierarchical
#' navigable small world graphs.
#' \emph{IEEE transactions on pattern analysis and machine intelligence}, \emph{42}(4), 824-836.
#'
#' McInnes, L., Healy, J., & Melville, J. (2018).
#' UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction
#' \emph{arXiv preprint} \emph{arXiv}:1802.03426.
#' \url{https://arxiv.org/abs/1802.03426}
#'
#' Narayan, A., Berger, B., & Cho, H. (2021).
#' Assessing single-cell transcriptomic variability through density-preserving data visualization.
#' \emph{Nature biotechnology}, \emph{39}(6), 765-774.
#' \doi{10.1038/s41587-020-00801-7}
#'
#' Oâ€™Neill, M. E. (2014).
#' \emph{PCG: A family of simple fast space-efficient statistically good
#' algorithms for random number generation}
#' (Report No. HMC-CS-2014-0905). Harvey Mudd College.
#'
#' Tang, J., Liu, J., Zhang, M., & Mei, Q. (2016, April).
#' Visualizing large-scale and high-dimensional data.
#' In \emph{Proceedings of the 25th International Conference on World Wide Web}
#' (pp. 287-297).
#' International World Wide Web Conferences Steering Committee.
#' \url{https://arxiv.org/abs/1602.00370}
#'
#' Van der Maaten, L., & Hinton, G. (2008).
#' Visualizing data using t-SNE.
#' \emph{Journal of Machine Learning Research}, \emph{9} (2579-2605).
#' \url{https://www.jmlr.org/papers/v9/vandermaaten08a.html}
#'
#' Wang, Y., Huang, H., Rudin, C., & Shaposhnik, Y. (2021).
#' Understanding How Dimension Reduction Tools Work: An Empirical Approach to Deciphering t-SNE, UMAP, TriMap, and PaCMAP for Data Visualization.
#' \emph{Journal of Machine Learning Research}, \emph{22}(201), 1-73.
#' \url{https://www.jmlr.org/papers/v22/20-1061.html}
#'
#' @export
umap <- function(X, n_neighbors = 15, n_components = 2, metric = "euclidean",
n_epochs = NULL, learning_rate = 1, scale = FALSE,
init = "spectral", init_sdev = NULL,
spread = 1, min_dist = 0.01,
set_op_mix_ratio = 1.0, local_connectivity = 1.0,
bandwidth = 1.0, repulsion_strength = 1.0,
negative_sample_rate = 5.0, a = NULL, b = NULL,
nn_method = NULL, n_trees = 50,
search_k = 2 * n_neighbors * n_trees,
approx_pow = FALSE,
y = NULL, target_n_neighbors = n_neighbors,
target_metric = "euclidean",
target_weight = 0.5,
pca = NULL, pca_center = TRUE,
pcg_rand = TRUE,
fast_sgd = FALSE,
ret_model = FALSE, ret_nn = FALSE, ret_extra = c(),
grain_size = 1,
tmpdir = tempdir(),
verbose = getOption("verbose", TRUE),
batch = FALSE,
opt_args = NULL, epoch_callback = NULL, pca_method = NULL,
binary_edge_weights = FALSE,
dens_scale = NULL,
seed = NULL,
nn_args = list()) {
uwot(
X = X, n_neighbors = n_neighbors, n_components = n_components,
metric = metric, n_epochs = n_epochs, alpha = learning_rate, scale = scale,
init = init, init_sdev = init_sdev,
set_op_mix_ratio = set_op_mix_ratio,
local_connectivity = local_connectivity, bandwidth = bandwidth,
gamma = repulsion_strength, negative_sample_rate = negative_sample_rate,
a = a, b = b, nn_method = nn_method, n_trees = n_trees,
search_k = search_k,
method = "umap", approx_pow = approx_pow,
grain_size = grain_size,
y = y, target_n_neighbors = target_n_neighbors,
target_weight = target_weight, target_metric = target_metric,
pca = pca, pca_center = pca_center, pca_method = pca_method,
pcg_rand = pcg_rand,
fast_sgd = fast_sgd,
ret_model = ret_model || "model" %in% ret_extra,
ret_nn = ret_nn || "nn" %in% ret_extra,
ret_fgraph = "fgraph" %in% ret_extra,
ret_sigma = "sigma" %in% ret_extra,
ret_localr = "localr" %in% ret_extra,
batch = batch,
opt_args = opt_args,
epoch_callback = epoch_callback,
binary_edge_weights = binary_edge_weights,
tmpdir = tempdir(),
verbose = verbose,
dens_scale = dens_scale,
seed = seed,
nn_args = nn_args
)
}

#' Dimensionality Reduction Using t-Distributed UMAP (t-UMAP)
#'
#' A faster (but less flexible) version of the UMAP (McInnes et al, 2018)
#'
#' By setting the UMAP curve parameters \code{a} and \code{b} to \code{1}, you
#' get back the Cauchy distribution as used in t-SNE (van der Maaten and Hinton,
#' 2008) and LargeVis (Tang et al., 2016). It also results in a substantially
#' simplified gradient expression. This can give a speed improvement of around
#' 50\%.
#'
#'   Matrix and data frames should contain one observation per row. Data frames
#'   will have any non-numeric columns removed, although factor columns will be
#'   used if explicitly included via \code{metric} (see the help for
#'   \code{metric} for details). A sparse matrix is interpreted as a distance
#'   matrix, and is assumed to be symmetric, so you can also pass in an
#'   explicitly upper or lower triangular sparse matrix to save storage. There
#'   must be at least \code{n_neighbors} non-zero distances for each row. Both
#'   implicit and explicit zero entries are ignored. Set zero distances you want
#'   to keep to an arbitrarily small non-zero value (e.g. \code{1e-10}).
#'   \code{X} can also be \code{NULL} if pre-computed nearest neighbor data is
#'   passed to \code{nn_method}, and \code{init} is not \code{"spca"} or
#'   \code{"pca"}.
#' @param n_neighbors The size of local neighborhood (in terms of number of
#'   neighboring sample points) used for manifold approximation. Larger values
#'   result in more global views of the manifold, while smaller values result in
#'   more local data being preserved. In general values should be in the range
#'   \code{2} to \code{100}.
#' @param n_components The dimension of the space to embed into. This defaults
#'   to \code{2} to provide easy visualization, but can reasonably be set to any
#'   integer value in the range \code{2} to \code{100}.
#' @param metric Type of distance metric to use to find nearest neighbors. For
#'  \code{nn_method = "annoy"} this can be one of:
#' \itemize{
#'   \item \code{"euclidean"} (the default)
#'   \item \code{"cosine"}
#'   \item \code{"manhattan"}
#'   \item \code{"hamming"}
#'   \item \code{"correlation"} (a distance based on the Pearson correlation)
#'   \item \code{"categorical"} (see below)
#' }
#' For \code{nn_method = "hnsw"} this can be one of:
#' \itemize{
#'   \item \code{"euclidean"}
#'   \item \code{"cosine"}
#'   \item \code{"correlation"}
#' }
#' If \href{https://cran.r-project.org/package=rnndescent}{rnndescent} is
#' installed and \code{nn_method = "nndescent"} is specified then many more
#' metrics are avaiable, including:
#' \itemize{
#' \item \code{"braycurtis"}
#' \item \code{"canberra"}
#' \item \code{"chebyshev"}
#' \item \code{"dice"}
#' \item \code{"hamming"}
#' \item \code{"hellinger"}
#' \item \code{"jaccard"}
#' \item \code{"jensenshannon"}
#' \item \code{"kulsinski"}
#' \item \code{"rogerstanimoto"}
#' \item \code{"russellrao"}
#' \item \code{"sokalmichener"}
#' \item \code{"sokalsneath"}
#' \item \code{"spearmanr"}
#' \item \code{"symmetrickl"}
#' \item \code{"tsss"}
#' \item \code{"yule"}
#' }
#' For more details see the package documentation of \code{rnndescent}.
#' For \code{nn_method = "fnn"}, the distance metric is always "euclidean".
#'
#' If \code{X} is a data frame or matrix, then multiple metrics can be
#' specified, by passing a list to this argument, where the name of each item in
#' the list is one of the metric names above. The value of each list item should
#' be a vector giving the names or integer ids of the columns to be included in
#' a calculation, e.g. \code{metric = list(euclidean = 1:4, manhattan = 5:10)}.
#'
#' Each metric calculation results in a separate fuzzy simplicial set, which are
#' intersected together to produce the final set. Metric names can be repeated.
#' Because non-numeric columns are removed from the data frame, it is safer to
#' use column names than integer ids.
#'
#' Factor columns can also be used by specifying the metric name
#' \code{"categorical"}. Factor columns are treated different from numeric
#' columns and although multiple factor columns can be specified in a vector,
#' each factor column specified is processed individually. If you specify
#' a non-factor column, it will be coerced to a factor.
#'
#' For a given data block, you may override the \code{pca} and \code{pca_center}
#' arguments for that block, by providing a list with one unnamed item
#' containing the column names or ids, and then any of the \code{pca} or
#' \code{pca_center} overrides as named items, e.g. \code{metric =
#' list(euclidean = 1:4, manhattan = list(5:10, pca_center = FALSE))}. This
#' exists to allow mixed binary and real-valued data to be included and to have
#' PCA applied to both, but with centering applied only to the real-valued data
#' (it is typical not to apply centering to binary data before PCA is applied).
#' @param n_epochs Number of epochs to use during the optimization of the
#'   embedded coordinates. By default, this value is set to \code{500} for
#'   datasets containing 10,000 vertices or less, and \code{200} otherwise.
#'   If \code{n_epochs = 0}, then coordinates determined by \code{"init"} will
#'   be returned.
#' @param learning_rate Initial learning rate used in optimization of the
#'   coordinates.
#' @param scale Scaling to apply to \code{X} if it is a data frame or matrix:
#' \itemize{
#'   \item{\code{"none"} or \code{FALSE} or \code{NULL}} No scaling.
#'   \item{\code{"Z"} or \code{"scale"} or \code{TRUE}} Scale each column to
#'   zero mean and variance 1.
#'   \item{\code{"maxabs"}} Center each column to mean 0, then divide each
#'   element by the maximum absolute value over the entire matrix.
#'   \item{\code{"range"}} Range scale the entire matrix, so the smallest
#'   element is 0 and the largest is 1.
#'   \item{\code{"colrange"}} Scale each column in the range (0,1).
#' }
#' For t-UMAP, the default is \code{"none"}.
#' @param init Type of initialization for the coordinates. Options are:
#'   \itemize{
#'     \item \code{"spectral"} Spectral embedding using the normalized Laplacian
#'     of the fuzzy 1-skeleton, with Gaussian noise added.
#'     \item \code{"normlaplacian"}. Spectral embedding using the normalized
#'     Laplacian of the fuzzy 1-skeleton, without noise.
#'     \item \code{"random"}. Coordinates assigned using a uniform random
#'     distribution between -10 and 10.
#'     \item \code{"lvrandom"}. Coordinates assigned using a Gaussian
#'     distribution with standard deviation 1e-4, as used in LargeVis
#'     (Tang et al., 2016) and t-SNE.
#'     \item \code{"laplacian"}. Spectral embedding using the Laplacian Eigenmap
#'     (Belkin and Niyogi, 2002).
#'     \item \code{"pca"}. The first two principal components from PCA of
#'     \code{X} if \code{X} is a data frame, and from a 2-dimensional classical
#'     MDS if \code{X} is of class \code{"dist"}.
#'     \item \code{"spca"}. Like \code{"pca"}, but each dimension is then scaled
#'     so the standard deviation is 1e-4, to give a distribution similar to that
#'     used in t-SNE. This is an alias for \code{init = "pca", init_sdev =
#'     1e-4}.
#'     \item \code{"agspectral"} An "approximate global" modification of
#'     \code{"spectral"} which all edges in the graph to a value of 1, and then
#'     sets a random number of edges (\code{negative_sample_rate} edges per
#'     vertex) to 0.1, to approximate the effect of non-local affinities.
#'     \item A matrix of initial coordinates.
#'   }
#'  For spectral initializations, (\code{"spectral"}, \code{"normlaplacian"},
#'  \code{"laplacian"}, \code{"agspectral"}), if more than one connected
#'  component is identified, no spectral initialization is attempted. Instead
#'  a PCA-based initialization is attempted. If \code{verbose = TRUE} the
#'  number of connected components are logged to the console. The existence of
#'  multiple connected components implies that a global view of the data cannot
#'  be attained with this initialization. Increasing the value of
#'  \code{n_neighbors} may help.
#' @param init_sdev If non-\code{NULL}, scales each dimension of the initialized
#'   coordinates (including any user-supplied matrix) to this standard
#'   deviation. By default no scaling is carried out, except when \code{init =
#'   "spca"}, in which case the value is \code{0.0001}. Scaling the input may
#'   help if the unscaled versions result in initial coordinates with large
#'   inter-point distances or outliers. This usually results in small gradients
#'   during optimization and very little progress being made to the layout.
#'   Shrinking the initial embedding by rescaling can help under these
#'   circumstances. Scaling the result of \code{init = "pca"} is usually
#'   recommended and \code{init = "spca"} as an alias for \code{init = "pca",
#'   init_sdev = 1e-4} but for the spectral initializations the scaled versions
#'   usually aren't necessary unless you are using a large value of
#'   \code{n_neighbors} (e.g. \code{n_neighbors = 150} or higher). For
#'   compatibility with recent versions of the Python UMAP package, if you are
#'   using \code{init = "spectral"}, then you should also set
#'   \code{init_sdev = "range"}, which will range scale each of the columns
#'   containing the initial data between 0-10. This is not set by default to
#'   maintain backwards compatibility with previous versions of uwot.
#' @param set_op_mix_ratio Interpolate between (fuzzy) union and intersection as
#'   the set operation used to combine local fuzzy simplicial sets to obtain a
#'   global fuzzy simplicial sets. Both fuzzy set operations use the product
#'   t-norm. The value of this parameter should be between \code{0.0} and
#'   \code{1.0}; a value of \code{1.0} will use a pure fuzzy union, while
#'   \code{0.0} will use a pure fuzzy intersection.
#' @param local_connectivity The local connectivity required -- i.e. the number
#'   of nearest neighbors that should be assumed to be connected at a local
#'   level. The higher this value the more connected the manifold becomes
#'   locally. In practice this should be not more than the local intrinsic
#'   dimension of the manifold.
#' @param bandwidth The effective bandwidth of the kernel if we view the
#'   algorithm as similar to Laplacian Eigenmaps. Larger values induce more
#'   connectivity and a more global view of the data, smaller values concentrate
#'   more locally.
#' @param repulsion_strength Weighting applied to negative samples in low
#'   dimensional embedding optimization. Values higher than one will result in
#'   greater weight being given to negative samples.
#' @param negative_sample_rate The number of negative edge/1-simplex samples to
#'   use per positive edge/1-simplex sample in optimizing the low dimensional
#'   embedding.
#' @param nn_method Method for finding nearest neighbors. Options are:
#'   \itemize{
#'     \item \code{"fnn"}. Use exact nearest neighbors via the
#'       \href{https://cran.r-project.org/package=FNN}{FNN} package.
#'     \item \code{"annoy"} Use approximate nearest neighbors via the
#'       \href{https://cran.r-project.org/package=RcppAnnoy}{RcppAnnoy} package.
#'     \item \code{"hnsw"} Use approximate nearest neighbors with the
#'       Hierarchical Navigable Small World (HNSW) method (Malkov and Yashunin,
#'       2018) via the
#'       \href{https://cran.r-project.org/package=RcppHNSW}{RcppHNSW} package.
#'       \code{RcppHNSW} is not a dependency of this package: this option is
#'       only available if you have installed \code{RcppHNSW} yourself. Also,
#'       HNSW only supports the following arguments for \code{metric} and
#'       \code{target_metric}: \code{"euclidean"}, \code{"cosine"} and
#'       \code{"correlation"}.
#'     \item \code{"nndescent"} Use approximate nearest neighbors with the
#'       Nearest Neighbor Descent method (Dong et al., 2011) via the
#'       \href{https://cran.r-project.org/package=rnndescent}{rnndescent}
#'       package. \code{rnndescent} is not a dependency of this package: this
#'       option is only available if you have installed \code{rnndescent}
#'       yourself.
#'    }
#'   By default, if \code{X} has less than 4,096 vertices, the exact nearest
#'   neighbors are found. Otherwise, approximate nearest neighbors are used.
#'   You may also pass pre-calculated nearest neighbor data to this argument. It
#'   must be one of two formats, either a list consisting of two elements:
#'   \itemize{
#'     \item \code{"idx"}. A \code{n_vertices x n_neighbors} matrix
#'     containing the integer indexes of the nearest neighbors in \code{X}. Each
#'     vertex is considered to be its own nearest neighbor, i.e.
#'     \code{idx[, 1] == 1:n_vertices}.
#'     \item \code{"dist"}. A \code{n_vertices x n_neighbors} matrix
#'     containing the distances of the nearest neighbors.
#'   }
#'   or a sparse distance matrix of type \code{dgCMatrix}, with dimensions
#'   \code{n_vertices x n_vertices}. Distances should be arranged by column,
#'   i.e. a non-zero entry in row \code{j} of the \code{i}th column indicates
#'   that the \code{j}th observation in \code{X} is a nearest neighbor of the
#'   \code{i}th observation with the distance given by the value of that
#'   element.
#'   The \code{n_neighbors} parameter is ignored when using precomputed
#'   nearest neighbor data. If using the sparse distance matrix input, each
#'   column can contain a different number of neighbors.
#' @param n_trees Number of trees to build when constructing the nearest
#'   neighbor index. The more trees specified, the larger the index, but the
#'   better the results. With \code{search_k}, determines the accuracy of the
#'   Annoy nearest neighbor search. Only used if the \code{nn_method} is
#'   \code{"annoy"}. Sensible values are between \code{10} to \code{100}.
#' @param search_k Number of nodes to search during the neighbor retrieval. The
#'   larger k, the more the accurate results, but the longer the search takes.
#'   With \code{n_trees}, determines the accuracy of the Annoy nearest neighbor
#'   search. Only used if the \code{nn_method} is \code{"annoy"}.
#' @param nn_args A list containing additional arguments to pass to the nearest
#'   neighbor method. For \code{nn_method = "annoy"}, you can specify
#'   \code{"n_trees"} and \code{"search_k"}, and these will override the
#'   \code{n_trees} and \code{search_k} parameters.
#'   For \code{nn_method = "hnsw"}, you may specify the following arguments:
#'   \itemize{
#'   \item \code{M} The maximum number of neighbors to keep for each vertex.
#'   Reasonable values are \code{2} to \code{100}. Higher values give better
#'   recall at the cost of more memory. Default value is \code{16}.
#'   \item \code{ef_construction} A positive integer specifying the size of
#'   the dynamic list used during index construction. A higher value will
#'   provide better results at the cost of a longer time to build the index.
#'   Default is \code{200}.
#'   \item \code{ef} A positive integer specifying the size of the dynamic
#'   list used during search. This cannot be smaller than \code{n_neighbors}
#'   and cannot be higher than the number of items in the index. Default is
#'   \code{10}.
#'   }
#'   For \code{nn_method = "nndescent"}, you may specify the following
#'   arguments:
#'   \itemize{
#'   \item \code{n_trees} The number of trees to use in a random projection
#'   forest to initialize the search. A larger number will give more accurate
#'   results at the cost of a longer computation time. The default of
#'   \code{NULL} means that the number is chosen based on the number of
#'   observations in \code{X}.
#'   \item \code{max_candidates} The number of potential neighbors to explore
#'   per iteration. By default, this is set to \code{n_neighbors} or \code{60},
#'   whichever is smaller. A larger number will give more accurate results at
#'   the cost of a longer computation time.
#'   \item \code{n_iters} The number of iterations to run the search. A larger
#'   number will give more accurate results at the cost of a longer computation
#'   time. By default, this will be chosen based on the number of observations
#'   in \code{X}. You may also need to modify the convergence criterion
#'   \code{delta}.
#'   \item \code{delta} The minimum relative change in the neighbor graph
#'   allowed before early stopping. Should be a value between 0 and 1. The
#'   smaller the value, the smaller the amount of progress between iterations is
#'   allowed. Default value of \code{0.001} means that at least 0.1% of the
#'   neighbor graph must be updated at each iteration.
#'   \item \code{init} How to initialize the nearest neighbor descent. By
#'   default this is set to \code{"tree"} and uses a random project forest.
#'   If you set this to \code{"rand"}, then a random selection is used. Usually
#'   this is less accurate than using RP trees, but for high-dimensional cases,
#'   there may be little difference in the quality of the initialization and
#'   random initialization will be a lot faster. If you set this to
#'   \code{"rand"}, then the \code{n_trees} parameter is ignored.
#'   \item \code{pruning_degree_multiplier} The maximum number of edges per node
#'   to retain in the search graph, relative to \code{n_neighbors}. A larger
#'   value will give more accurate results at the cost of a longer computation
#'   time. Default is \code{1.5}. This parameter only affects neighbor search
#'   when transforming new data with \code{\link{umap_transform}}.
#'   \item \code{epsilon} Controls the degree of the back-tracking when
#'   traversing the search graph. Setting this to \code{0.0} will do a greedy
#'   search with no back-tracking. A larger value will give more accurate
#'   results at the cost of a longer computation time. Default is \code{0.1}.
#'   This parameter only affects neighbor search when transforming new data with
#'   \item \code{max_search_fraction} Specifies the maximum fraction of the
#'   search graph to traverse. By default, this is set to \code{1.0}, so the
#'   entire graph (i.e. all items in \code{X}) may be visited. You may want to
#'   set this to a smaller value if you have a very large dataset (in
#'   conjunction with \code{epsilon}) to avoid an inefficient exhaustive search
#'   of the data in \code{X}. This parameter only affects neighbor search when
#'   transforming new data with \code{\link{umap_transform}}.
#'   }
#'   For \code{nn_method = "nndescent"}, you may specify the following
#'   arguments:
#'   \itemize{
#'   \item \code{n_trees} The number of trees to use in a random projection
#'   forest to initialize the search. A larger number will give more accurate
#'   results at the cost of a longer computation time. The default of
#'   \code{NULL} means that the number is chosen based on the number of
#'   observations in \code{X}.
#'   \item \code{max_candidates} The number of potential neighbors to explore
#'   per iteration. By default, this is set to \code{n_neighbors} or \code{60},
#'   whichever is smaller. A larger number will give more accurate results at
#'   the cost of a longer computation time.
#'   \item \code{n_iters} The number of iterations to run the search. A larger
#'   number will give more accurate results at the cost of a longer computation
#'   time. By default, this will be chosen based on the number of observations
#'   in \code{X}. You may also need to modify the convergence criterion
#'   \code{delta}.
#'   \item \code{delta} The minimum relative change in the neighbor graph
#'   allowed before early stopping. Should be a value between 0 and 1. The
#'   smaller the value, the smaller the amount of progress between iterations is
#'   allowed. Default value of \code{0.001} means that at least 0.1% of the
#'   neighbor graph must be updated at each iteration.
#'   \item \code{init} How to initialize the nearest neighbor descent. By
#'   default this is set to \code{"tree"} and uses a random project forest. If
#'   you set this to \code{"rand"}, then a random selection is used. Usually
#'   this is less accurate than using RP trees, but for high-dimensional cases,
#'   there may be little difference in the quality of the initialization and
#'   random initialization will be a lot faster. If you set this to
#'   \code{"rand"}, then the \code{n_trees} parameter is ignored.
#'   \item \code{pruning_degree_multiplier} The maximum number of edges per node
#'   to retain in the search graph, relative to \code{n_neighbors}. A larger
#'   value will give more accurate results at the cost of a longer computation
#'   time. Default is \code{1.5}. This parameter only affects neighbor search
#'   when transforming new data with \code{\link{umap_transform}}.
#'   \item \code{epsilon} Controls the degree of the back-tracking when
#'   traversing the search graph. Setting this to \code{0.0} will do a greedy
#'   search with no back-tracking. A larger value will give more accurate
#'   results at the cost of a longer computation time. Default is \code{0.1}.
#'   This parameter only affects neighbor search when transforming new data with
#'   \item \code{max_search_fraction} Specifies the maximum fraction of the
#'   search graph to traverse. By default, this is set to \code{1.0}, so the
#'   entire graph (i.e. all items in \code{X}) may be visited. You may want to
#'   set this to a smaller value if you have a very large dataset (in
#'   conjunction with \code{epsilon}) to avoid an inefficient exhaustive search
#'   of the data in \code{X}. This parameter only affects neighbor search when
#'   transforming new data with \code{\link{umap_transform}}.
#'   }
#' @param y Optional target data for supervised dimension reduction. Can be a
#' vector, matrix or data frame. Use the \code{target_metric} parameter to
#' specify the metrics to use, using the same syntax as \code{metric}. Usually
#' either a single numeric or factor column is used, but more complex formats
#' are possible. The following types are allowed:
#'   \itemize{
#'     \item Factor columns with the same length as \code{X}. \code{NA} is
#'     allowed for any observation with an unknown level, in which case
#'     UMAP operates as a form of semi-supervised learning. Each column is
#'     treated separately.
#'     \item Numeric data. \code{NA} is \emph{not} allowed in this case. Use the
#'     parameter \code{target_n_neighbors} to set the number of neighbors used
#'     with \code{y}. If unset, \code{n_neighbors} is used. Unlike factors,
#'     numeric columns are grouped into one block unless \code{target_metric}
#'     specifies otherwise. For example, if you wish columns \code{a} and
#'     \code{b} to be treated separately, specify
#'     \code{target_metric = list(euclidean = "a", euclidean = "b")}. Otherwise,
#'     the data will be effectively treated as a matrix with two columns.
#'     \item Nearest neighbor data, consisting of a list of two matrices,
#'     \code{idx} and \code{dist}. These represent the precalculated nearest
#'     neighbor indices and distances, respectively. This
#'     is the same format as that expected for precalculated data in
#'     \code{nn_method}. This format assumes that the underlying data was a
#'     numeric vector. Any user-supplied value of the \code{target_n_neighbors}
#'     parameter is ignored in this case, because the the number of columns in
#'     the matrices is used for the value. Multiple nearest neighbor data using
#'     different metrics can be supplied by passing a list of these lists.
#'   }
#' Unlike \code{X}, all factor columns included in \code{y} are automatically
#' used.
#' @param target_n_neighbors Number of nearest neighbors to use to construct the
#'   target simplicial set. Default value is \code{n_neighbors}. Applies only if
#'   \code{y} is non-\code{NULL} and \code{numeric}.
#' @param target_metric The metric used to measure distance for \code{y} if
#'   using supervised dimension reduction. Used only if \code{y} is numeric.
#' @param target_weight Weighting factor between data topology and target
#'   topology. A value of 0.0 weights entirely on data, a value of 1.0 weights
#'   entirely on target. The default of 0.5 balances the weighting equally
#'   between data and target. Only applies if \code{y} is non-\code{NULL}.
#' @param pca If set to a positive integer value, reduce data to this number of
#'   columns using PCA. Doesn't applied if the distance \code{metric} is
#'   \code{"hamming"}, or the dimensions of the data is larger than the
#'   number specified (i.e. number of rows and columns must be larger than the
#'   value of this parameter). If you have > 100 columns in a data frame or
#'   matrix, reducing the number of columns in this way may substantially
#'   increase the performance of the nearest neighbor search at the cost of a
#'   potential decrease in accuracy. In many t-SNE applications, a value of 50
#'   is recommended, although there's no guarantee that this is appropriate for
#'   all settings.
#' @param pca_center If \code{TRUE}, center the columns of \code{X} before
#'   carrying out PCA. For binary data, it's recommended to set this to
#'   \code{FALSE}.
#' @param pca_method Method to carry out any PCA dimensionality reduction when
#'   the \code{pca} parameter is specified. Allowed values are:
#'   \itemize{
#'     \item{\code{"irlba"}}. Uses \code{\link[irlba]{prcomp_irlba}} from the
#'     \href{https://cran.r-project.org/package=irlba}{irlba} package.
#'     \item{\code{"rsvd"}}. Uses 5 iterations of \code{\link[irlba]{svdr}} from
#'     the \href{https://cran.r-project.org/package=irlba}{irlba} package.
#'     This is likely to give much faster but potentially less accurate results
#'     than using \code{"irlba"}. For the purposes of nearest neighbor
#'     calculation and coordinates initialization, any loss of accuracy doesn't
#'     seem to matter much.
#'     from the \href{https://cran.r-project.org/package=bigstatsr}{bigstatsr}
#'     package. The SVD methods used in \code{bigstatsr} may be faster on
#'     Windows). \strong{Note}: \code{bigstatsr} is \emph{not} a dependency of
#'     uwot: if you choose to use this package for PCA, you \emph{must} install
#'     it yourself.
#'     \item{\code{"svd"}}. Uses \code{\link[base]{svd}} for the SVD. This is
#'     likely to be slow for all but the smallest datasets.
#'     \item{\code{"auto"}} (the default). Uses \code{"irlba"}, unless more than
#'     50% of the full set of singular vectors would be calculated, in which
#'     case \code{"svd"} is used.
#'   }
#' @param pcg_rand If \code{TRUE}, use the PCG random number generator (O'Neill,
#'   2014) during optimization. Otherwise, use the faster (but probably less
#'   statistically good) Tausworthe "taus88" generator. The default is
#'   \code{TRUE}.
#' @param fast_sgd If \code{TRUE}, then the following combination of parameters
#'   is set: \code{pcg_rand = TRUE} and \code{n_sgd_threads = "auto"}. The
#'   default is \code{FALSE}. Setting this to \code{TRUE} will speed up the
#'   stochastic optimization phase, but give a potentially less accurate
#'   embedding, and which will not be exactly reproducible even with a fixed
#'   seed. For visualization, \code{fast_sgd = TRUE} will give perfectly good
#'   results. For more generic dimensionality reduction, it's safer to leave
#'   \code{fast_sgd = FALSE}. If \code{fast_sgd = TRUE}, then user-supplied
#'   values of \code{pcg_rand} and \code{n_sgd_threads}, are ignored.
#' @param batch If \code{TRUE}, then embedding coordinates are updated at the
#'   end of each epoch rather than during the epoch. In batch mode, results are
#'   reproducible with a fixed random seed even with \code{n_sgd_threads > 1},
#'   at the cost of a slightly higher memory use. You may also have to modify
#'   \code{learning_rate} and increase \code{n_epochs}, so whether this provides
#'   a speed increase over the single-threaded optimization is likely to be
#'   dataset and hardware-dependent.
#' @param ret_model If \code{TRUE}, then return extra data that can be used to
#'   embedded coordinates are returned as the list item \code{embedding}. If
#'   \code{FALSE}, just return the coordinates. This parameter can be used in
#'   conjunction with \code{ret_nn} and \code{ret_extra}. Note that some
#'   settings are incompatible with the production of a UMAP model: external
#'   neighbor data (passed via a list to \code{nn_method}), and factor columns
#'   that were included via the \code{metric} parameter. In the latter case, the
#'   model produced is based only on the numeric data. A transformation using
#'   new data is possible, but the factor columns in the new data are ignored.
#'   Note that setting \code{ret_model = TRUE} forces the use of the approximate
#'   nearest neighbors method. Because small datasets would otherwise use exact
#'   nearest neighbor calculations, setting \code{ret_model = TRUE} means that
#'   different results may be returned for small datasets in terms of both the
#'   returned nearest neighbors (if requested) and the final embedded
#'   coordinates, compared to \code{ret_model = FALSE}, even if the random
#'   number seed is fixed. To avoid this, explicitly set
#'   \code{nn_method = "annoy"} in the \code{ret_model = FALSE} case.
#' @param ret_nn If \code{TRUE}, then in addition to the embedding, also return
#'   nearest neighbor data that can be used as input to \code{nn_method} to
#'   avoid the overhead of repeatedly calculating the nearest neighbors when
#'   manipulating unrelated parameters (e.g. \code{min_dist}, \code{n_epochs},
#'   \code{init}). See the "Value" section for the names of the list items. If
#'   \code{FALSE}, just return the coordinates. Note that the nearest neighbors
#'   could be sensitive to data scaling, so be wary of reusing nearest neighbor
#'   data if modifying the \code{scale} parameter. This parameter can be used in
#'   conjunction with \code{ret_model} and \code{ret_extra}.
#' @param ret_extra A vector indicating what extra data to return. May contain
#'   any combination of the following strings:
#'   \itemize{
#'     \item \code{"model"} Same as setting \code{ret_model = TRUE}.
#'     \item \code{"nn"} Same as setting \code{ret_nn = TRUE}.
#'     \item \code{"fgraph"} the high dimensional fuzzy graph (i.e. the fuzzy
#'       simplicial set of the merged local views of the input data). The graph
#'       is returned as a sparse symmetric N x N matrix of class
#'       \link[Matrix]{dgCMatrix-class}, where a non-zero entry (i, j) gives the
#'       membership strength of the edge connecting vertex i and vertex j. This
#'       can be considered analogous to the input probability (or similarity or
#'       affinity) used in t-SNE and LargeVis. Note that the graph is further
#'       sparsified by removing edges with sufficiently low membership strength
#'       that they would not be sampled by the probabilistic edge sampling
#'       employed for optimization and therefore the number of non-zero elements
#'       in the matrix is dependent on \code{n_epochs}. If you are only
#'       interested in the fuzzy input graph (e.g. for clustering), setting
#'       \code{n_epochs = 0} will avoid any further sparsifying. Be aware that
#'       setting \code{binary_edge_weights = TRUE} will affect this graph (all
#'       non-zero edge weights will be 1).
#'    \item \code{"sigma"} the normalization value for each observation in the
#'       dataset when constructing the smoothed distances to each of its
#'       neighbors. This gives some sense of the local density of each
#'       observation in the high dimensional space: higher values of
#'       \code{sigma} indicate a higher dispersion or lower density.
#'   }
#'   descent). Default is half the number of concurrent threads supported by the
#'   system. For nearest neighbor search, only applies if
#'   \code{nn_method = "annoy"}. If \code{n_threads > 1}, then the Annoy index
#'   will be temporarily written to disk in the location determined by
#'   descent. If set to > 1, then be aware that if \code{batch = FALSE}, results
#'   will \emph{not} be reproducible, even if \code{set.seed} is called with a
#'   fixed seed before running. Set to \code{"auto"} to use the same value as
#' @param grain_size The minimum amount of work to do on each thread. If this
#'   value is set high enough, then less than \code{n_threads} or
#'   \code{n_sgd_threads} will be used for processing, which might give a
#'   switching was outweighing the improvement due to concurrent processing.
#'   This should be left at default (\code{1}) and work will be spread evenly
#'   over all the threads specified.
#' @param tmpdir Temporary directory to store nearest neighbor indexes during
#'   nearest neighbor search. Default is \code{\link{tempdir}}. The index is
#'   only written to disk if \code{n_threads > 1} and
#'   \code{nn_method = "annoy"}; otherwise, this parameter is ignored.
#' @param verbose If \code{TRUE}, log details to the console.
#' @param opt_args A list of optimizer parameters, used when
#'   \code{batch = TRUE}. The default optimization method used is Adam (Kingma
#'   and Ba, 2014).
#'   \itemize{
#'     \item \code{method} The optimization method to use. Either \code{"adam"}
#'     \item \code{beta1} (Adam only). The weighting parameter for the
#'     exponential moving average of the first moment estimator. Effectively the
#'     momentum parameter. Should be a floating point value between 0 and 1.
#'     Higher values can smooth oscillatory updates in poorly-conditioned
#'     situations and may allow for a larger \code{learning_rate} to be
#'     specified, but too high can cause divergence. Default: \code{0.5}.
#'     \item \code{beta2} (Adam only). The weighting parameter for the
#'     exponential moving average of the uncentered second moment estimator.
#'     Should be a floating point value between 0 and 1. Controls the degree of
#'     adaptivity in the step-size. Higher values put more weight on previous
#'     time steps. Default: \code{0.9}.
#'     \item \code{eps} (Adam only). Intended to be a small value to prevent
#'     division by zero, but in practice can also affect convergence due to its
#'     interaction with \code{beta2}. Higher values reduce the effect of the
#'     descent with momentum. Typical values are between 1e-8 and 1e-3. Default:
#'     \code{1e-7}.
#'     \item \code{alpha} The initial learning rate. Default: the value of the
#'     \code{learning_rate} parameter.
#'   }
#' @param epoch_callback A function which will be invoked at the end of every
#'   epoch. Its signature should be: \code{(epoch, n_epochs, coords)}, where:
#'   \itemize{
#'     \item \code{epoch} The current epoch number (between \code{1} and
#'     \code{n_epochs}).
#'     \item \code{n_epochs} Number of epochs to use during the optimization of
#'     the embedded coordinates.
#'     \item \code{coords} The embedded coordinates as of the end of the current
#'     epoch, as a matrix with dimensions (N, \code{n_components}).
#'   }
#' @param binary_edge_weights If \code{TRUE} then edge weights in the input
#'   graph are treated as binary (0/1) rather than real valued. This affects the
#'   sampling frequency of neighbors and is the strategy used by the PaCMAP
#'   method (Wang and co-workers, 2020). Practical (BÃ¶hm and co-workers, 2020)
#'   and theoretical (Damrich and Hamprecht, 2021) work suggests this has little
#'   effect on UMAP's performance.
#' @param seed Integer seed to use to initialize the random number generator
#'   state. Combined with \code{n_sgd_threads = 1} or \code{batch = TRUE}, this
#'   should give consistent output across multiple runs on a given installation.
#'   Setting this value is equivalent to calling \code{\link[base]{set.seed}},
#'   but it may be more convenient in some situations than having to call a
#'   separate function. The default is to not set a seed. If
#'   \code{ret_model = TRUE}, the seed will be stored in the output model and
#'   then used to set the seed inside \code{\link{umap_transform}}.
#' @return A matrix of optimized coordinates, or:
#'   \itemize{
#'     \item if \code{ret_model = TRUE} (or \code{ret_extra} contains
#'     \code{"model"}), returns a list containing extra information that can be
#'     used to add new data to an existing embedding via
#'     \code{\link{umap_transform}}. In this case, the coordinates are available
#'     in the list item \code{embedding}. \bold{NOTE}: The contents of
#'     the \code{model} list should \emph{not} be considered stable or part of
#'     the public API, and are purposely left undocumented.
#'     \item if \code{ret_nn = TRUE} (or \code{ret_extra} contains \code{"nn"}),
#'     returns the nearest neighbor data as a list called \code{nn}. This
#'     contains one list for each \code{metric} calculated, itself containing a
#'     matrix \code{idx} with the integer ids of the neighbors; and a matrix
#'     \code{dist} with the distances. The \code{nn} list (or a sub-list) can be
#'     used as input to the \code{nn_method} parameter.
#'     \item if \code{ret_extra} contains \code{"fgraph"} returns the high
#'     dimensional fuzzy graph as a sparse matrix called \code{fgraph}, of type
#'     \item if \code{ret_extra} contains \code{"sigma"}, returns a vector of the
#'     smooth knn distance normalization terms for each observation as
#'     \code{"sigma"} and a vector \code{"rho"} containing the largest
#'     distance to the locally connected neighbors of each observation.
#'     \item if \code{ret_extra} contains \code{"localr"}, returns a vector of
#'     the estimated local radii, the sum of \code{"sigma"} and \code{"rho"}.
#'   }
#'   The returned list contains the combined data from any combination of
#'   specifying \code{ret_model}, \code{ret_nn} and \code{ret_extra}.
#' @examples
#' iris_tumap <- tumap(iris, n_neighbors = 50, learning_rate = 0.5)
#'
#' @references
#' Belkin, M., & Niyogi, P. (2002).
#' Laplacian eigenmaps and spectral techniques for embedding and clustering.
#' In \emph{Advances in neural information processing systems}
#' (pp. 585-591).
#' \url{http://papers.nips.cc/paper/1961-laplacian-eigenmaps-and-spectral-techniques-for-embedding-and-clustering.pdf}
#'
#' BÃ¶hm, J. N., Berens, P., & Kobak, D. (2020).
#' A unifying perspective on neighbor embeddings along the attraction-repulsion spectrum.
#' \emph{arXiv preprint} \emph{arXiv:2007.08902}.
#' \url{https://arxiv.org/abs/2007.08902}
#'
#' Damrich, S., & Hamprecht, F. A. (2021).
#' On UMAP's true loss function.
#' \emph{Advances in Neural Information Processing Systems}, \emph{34}.
#' \url{https://proceedings.neurips.cc/paper/2021/hash/2de5d16682c3c35007e4e92982f1a2ba-Abstract.html}
#'
#' Dong, W., Moses, C., & Li, K. (2011, March).
#' Efficient k-nearest neighbor graph construction for generic similarity measures.
#' In \emph{Proceedings of the 20th international conference on World Wide Web}
#' (pp. 577-586).
#' ACM.
#' \doi{10.1145/1963405.1963487}.
#'
#' Kingma, D. P., & Ba, J. (2014).
#' Adam: A method for stochastic optimization.
#' \emph{arXiv preprint} \emph{arXiv}:1412.6980.
#' \url{https://arxiv.org/abs/1412.6980}
#'
#' Malkov, Y. A., & Yashunin, D. A. (2018).
#' Efficient and robust approximate nearest neighbor search using hierarchical
#' navigable small world graphs.
#' \emph{IEEE transactions on pattern analysis and machine intelligence}, \emph{42}(4), 824-836.
#'
#' McInnes, L., Healy, J., & Melville, J. (2018).
#' UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction
#' \emph{arXiv preprint} \emph{arXiv}:1802.03426.
#' \url{https://arxiv.org/abs/1802.03426}
#'
#' Oâ€™Neill, M. E. (2014).
#' \emph{PCG: A family of simple fast space-efficient statistically good
#' algorithms for random number generation}
#' (Report No. HMC-CS-2014-0905). Harvey Mudd College.
#'
#' Tang, J., Liu, J., Zhang, M., & Mei, Q. (2016, April).
#' Visualizing large-scale and high-dimensional data.
#' In \emph{Proceedings of the 25th International Conference on World Wide Web}
#' (pp. 287-297).
#' International World Wide Web Conferences Steering Committee.
#' \url{https://arxiv.org/abs/1602.00370}
#'
#' Van der Maaten, L., & Hinton, G. (2008).
#' Visualizing data using t-SNE.
#' \emph{Journal of Machine Learning Research}, \emph{9} (2579-2605).
#' \url{https://www.jmlr.org/papers/v9/vandermaaten08a.html}
#'
#' Wang, Y., Huang, H., Rudin, C., & Shaposhnik, Y. (2021).
#' Understanding How Dimension Reduction Tools Work: An Empirical Approach to Deciphering t-SNE, UMAP, TriMap, and PaCMAP for Data Visualization.
#' \emph{Journal of Machine Learning Research}, \emph{22}(201), 1-73.
#' \url{https://www.jmlr.org/papers/v22/20-1061.html}
#'
#' @export
tumap <- function(X, n_neighbors = 15, n_components = 2, metric = "euclidean",
n_epochs = NULL,
learning_rate = 1, scale = FALSE,
init = "spectral", init_sdev = NULL,
set_op_mix_ratio = 1.0, local_connectivity = 1.0,
bandwidth = 1.0, repulsion_strength = 1.0,
negative_sample_rate = 5.0,
nn_method = NULL, n_trees = 50,
search_k = 2 * n_neighbors * n_trees,
grain_size = 1,
y = NULL, target_n_neighbors = n_neighbors,
target_metric = "euclidean",
target_weight = 0.5,
pca = NULL, pca_center = TRUE,
pcg_rand = TRUE,
fast_sgd = FALSE,
ret_model = FALSE, ret_nn = FALSE, ret_extra = c(),
tmpdir = tempdir(),
verbose = getOption("verbose", TRUE),
batch = FALSE,
opt_args = NULL, epoch_callback = NULL,
pca_method = NULL,
binary_edge_weights = FALSE,
seed = NULL,
nn_args = list()) {
uwot(
X = X, n_neighbors = n_neighbors, n_components = n_components,
metric = metric,
n_epochs = n_epochs, alpha = learning_rate, scale = scale,
init = init, init_sdev = init_sdev,
spread = NULL, min_dist = NULL, set_op_mix_ratio = set_op_mix_ratio,
local_connectivity = local_connectivity, bandwidth = bandwidth,
gamma = repulsion_strength, negative_sample_rate = negative_sample_rate,
a = NULL, b = NULL, nn_method = nn_method, n_trees = n_trees,
search_k = search_k,
method = "tumap",
grain_size = grain_size,
y = y, target_n_neighbors = target_n_neighbors,
target_weight = target_weight, target_metric = target_metric,
pca = pca, pca_center = pca_center, pca_method = pca_method,
pcg_rand = pcg_rand,
fast_sgd = fast_sgd,
ret_model = ret_model || "model" %in% ret_extra,
ret_nn = ret_nn || "nn" %in% ret_extra,
ret_fgraph = "fgraph" %in% ret_extra,
ret_sigma = "sigma" %in% ret_extra,
ret_localr = "localr" %in% ret_extra,
batch = batch,
opt_args = opt_args,
epoch_callback = epoch_callback,
binary_edge_weights = binary_edge_weights,
seed = seed,
tmpdir = tmpdir,
verbose = verbose,
nn_args = nn_args
)
}

#' Dimensionality Reduction with a LargeVis-like method
#'
#' Carry out dimensionality reduction of a dataset using a method similar to
#' LargeVis (Tang et al., 2016).
#'
#' \code{lvish} differs from the official LargeVis implementation in the
#' following:
#'
#' \itemize{
#'   \item Only the nearest-neighbor index search phase is multi-threaded.
#'   \item Matrix input data is not normalized.
#'   \item The \code{n_trees} parameter cannot be dynamically chosen based on
#'   data set size.
#'   \item Nearest neighbor results are not refined via the
#'   neighbor-of-my-neighbor method. The \code{search_k} parameter is twice
#'   as large than default to compensate.
#'   \item Gradient values are clipped to \code{4.0} rather than \code{5.0}.
#'   \item Negative edges are generated by uniform sampling of vertexes rather
#'   than their degree ^ 0.75.
#'   \item The default number of samples is much reduced. The default number of
#'   epochs, \code{n_epochs}, is set to \code{5000}, much larger than for
#'   \code{\link{umap}}, but may need to be increased further depending on your
#'   dataset. Using \code{init = "spectral"} can help.
#' }
#'
#'   Matrix and data frames should contain one observation per row. Data frames
#'   will have any non-numeric columns removed, although factor columns will be
#'   used if explicitly included via \code{metric} (see the help for
#'   \code{metric} for details). A sparse matrix is interpreted as a distance
#'   matrix, and is assumed to be symmetric, so you can also pass in an
#'   explicitly upper or lower triangular sparse matrix to save storage. There
#'   must be at least \code{n_neighbors} non-zero distances for each row. Both
#'   implicit and explicit zero entries are ignored. Set zero distances you want
#'   to keep to an arbitrarily small non-zero value (e.g. \code{1e-10}).
#'   \code{X} can also be \code{NULL} if pre-computed nearest neighbor data is
#'   passed to \code{nn_method}, and \code{init} is not \code{"spca"} or
#'   \code{"pca"}.
#' @param perplexity Controls the size of the local neighborhood used for
#'   manifold approximation. This is the analogous to \code{n_neighbors} in
#'   \code{\link{umap}}. Change this, rather than \code{n_neighbors}.
#' @param n_neighbors The number of neighbors to use when calculating the
#'  \code{perplexity}. Usually set to three times the value of the
#'  \code{perplexity}. Must be at least as large as \code{perplexity}.
#' @param n_components The dimension of the space to embed into. This defaults
#'   to \code{2} to provide easy visualization, but can reasonably be set to any
#'   integer value in the range \code{2} to \code{100}.
#' @param metric Type of distance metric to use to find nearest neighbors. For
#'  \code{nn_method = "annoy"} this can be one of:
#' \itemize{
#'   \item \code{"euclidean"} (the default)
#'   \item \code{"cosine"}
#'   \item \code{"manhattan"}
#'   \item \code{"hamming"}
#'   \item \code{"correlation"} (a distance based on the Pearson correlation)
#'   \item \code{"categorical"} (see below)
#' }
#' For \code{nn_method = "hnsw"} this can be one of:
#' \itemize{
#'   \item \code{"euclidean"}
#'   \item \code{"cosine"}
#'   \item \code{"correlation"}
#' }
#' If \href{https://cran.r-project.org/package=rnndescent}{rnndescent} is
#' installed and \code{nn_method = "nndescent"} is specified then many more
#' metrics are avaiable, including:
#' \itemize{
#' \item \code{"braycurtis"}
#' \item \code{"canberra"}
#' \item \code{"chebyshev"}
#' \item \code{"dice"}
#' \item \code{"hamming"}
#' \item \code{"hellinger"}
#' \item \code{"jaccard"}
#' \item \code{"jensenshannon"}
#' \item \code{"kulsinski"}
#' \item \code{"rogerstanimoto"}
#' \item \code{"russellrao"}
#' \item \code{"sokalmichener"}
#' \item \code{"sokalsneath"}
#' \item \code{"spearmanr"}
#' \item \code{"symmetrickl"}
#' \item \code{"tsss"}
#' \item \code{"yule"}
#' }
#' For more details see the package documentation of \code{rnndescent}.
#' For \code{nn_method = "fnn"}, the distance metric is always "euclidean".
#'
#' If \code{X} is a data frame or matrix, then multiple metrics can be
#' specified, by passing a list to this argument, where the name of each item in
#' the list is one of the metric names above. The value of each list item should
#' be a vector giving the names or integer ids of the columns to be included in
#' a calculation, e.g. \code{metric = list(euclidean = 1:4, manhattan = 5:10)}.
#'
#' Each metric calculation results in a separate fuzzy simplicial set, which are
#' intersected together to produce the final set. Metric names can be repeated.
#' Because non-numeric columns are removed from the data frame, it is safer to
#' use column names than integer ids.
#'
#' Factor columns can also be used by specifying the metric name
#' \code{"categorical"}. Factor columns are treated different from numeric
#' columns and although multiple factor columns can be specified in a vector,
#' each factor column specified is processed individually. If you specify
#' a non-factor column, it will be coerced to a factor.
#'
#' For a given data block, you may override the \code{pca} and \code{pca_center}
#' arguments for that block, by providing a list with one unnamed item
#' containing the column names or ids, and then any of the \code{pca} or
#' \code{pca_center} overrides as named items, e.g. \code{metric =
#' list(euclidean = 1:4, manhattan = list(5:10, pca_center = FALSE))}. This
#' exists to allow mixed binary and real-valued data to be included and to have
#' PCA applied to both, but with centering applied only to the real-valued data
#' (it is typical not to apply centering to binary data before PCA is applied).
#' @param n_epochs Number of epochs to use during the optimization of the
#'   embedded coordinates. The default is calculate the number of epochs
#'   dynamically based on dataset size, to give the same number of edge samples
#'   as the LargeVis defaults. This is usually substantially larger than the
#'   UMAP defaults. If \code{n_epochs = 0}, then coordinates determined by
#'   \code{"init"} will be returned.
#' @param learning_rate Initial learning rate used in optimization of the
#'   coordinates.
#' @param scale Scaling to apply to \code{X} if it is a data frame or matrix:
#' \itemize{
#'   \item{\code{"none"} or \code{FALSE} or \code{NULL}} No scaling.
#'   \item{\code{"Z"} or \code{"scale"} or \code{TRUE}} Scale each column to
#'   zero mean and variance 1.
#'   \item{\code{"maxabs"}} Center each column to mean 0, then divide each
#'   element by the maximum absolute value over the entire matrix.
#'   \item{\code{"range"}} Range scale the entire matrix, so the smallest
#'   element is 0 and the largest is 1.
#'   \item{\code{"colrange"}} Scale each column in the range (0,1).
#' }
#' For lvish, the default is \code{"maxabs"}, for consistency with LargeVis.
#' @param init Type of initialization for the coordinates. Options are:
#'   \itemize{
#'     \item \code{"spectral"} Spectral embedding using the normalized Laplacian
#'     of the fuzzy 1-skeleton, with Gaussian noise added.
#'     \item \code{"normlaplacian"}. Spectral embedding using the normalized
#'     Laplacian of the fuzzy 1-skeleton, without noise.
#'     \item \code{"random"}. Coordinates assigned using a uniform random
#'     distribution between -10 and 10.
#'     \item \code{"lvrandom"}. Coordinates assigned using a Gaussian
#'     distribution with standard deviation 1e-4, as used in LargeVis
#'     (Tang et al., 2016) and t-SNE.
#'     \item \code{"laplacian"}. Spectral embedding using the Laplacian Eigenmap
#'     (Belkin and Niyogi, 2002).
#'     \item \code{"pca"}. The first two principal components from PCA of
#'     \code{X} if \code{X} is a data frame, and from a 2-dimensional classical
#'     MDS if \code{X} is of class \code{"dist"}.
#'     \item \code{"spca"}. Like \code{"pca"}, but each dimension is then scaled
#'     so the standard deviation is 1e-4, to give a distribution similar to that
#'     used in t-SNE and LargeVis. This is an alias for \code{init = "pca",
#'     init_sdev = 1e-4}.
#'     \item \code{"agspectral"} An "approximate global" modification of
#'     \code{"spectral"} which all edges in the graph to a value of 1, and then
#'     sets a random number of edges (\code{negative_sample_rate} edges per
#'     vertex) to 0.1, to approximate the effect of non-local affinities.
#'     \item A matrix of initial coordinates.
#'   }
#'  For spectral initializations, (\code{"spectral"}, \code{"normlaplacian"},
#'  \code{"laplacian"}, \code{"agspectral"}), if more than one connected
#'  component is identified, no spectral initialization is attempted. Instead
#'  a PCA-based initialization is attempted. If \code{verbose = TRUE} the
#'  number of connected components are logged to the console. The existence of
#'  multiple connected components implies that a global view of the data cannot
#'  be attained with this initialization. Increasing the value of
#'  \code{n_neighbors} may help.
#' @param init_sdev If non-\code{NULL}, scales each dimension of the initialized
#'   coordinates (including any user-supplied matrix) to this standard
#'   deviation. By default no scaling is carried out, except when \code{init =
#'   "spca"}, in which case the value is \code{0.0001}. Scaling the input may
#'   help if the unscaled versions result in initial coordinates with large
#'   inter-point distances or outliers. This usually results in small gradients
#'   during optimization and very little progress being made to the layout.
#'   Shrinking the initial embedding by rescaling can help under these
#'   circumstances. Scaling the result of \code{init = "pca"} is usually
#'   recommended and \code{init = "spca"} as an alias for \code{init = "pca",
#'   init_sdev = 1e-4} but for the spectral initializations the scaled versions
#'   usually aren't necessary unless you are using a large value of
#'   \code{n_neighbors} (e.g. \code{n_neighbors = 150} or higher). For
#'   compatibility with recent versions of the Python UMAP package, if you are
#'   using \code{init = "spectral"}, then you should also set
#'   \code{init_sdev = "range"}, which will range scale each of the columns
#'   containing the initial data between 0-10. This is not set by default to
#'   maintain backwards compatibility with previous versions of uwot.
#' @param repulsion_strength Weighting applied to negative samples in low
#'   dimensional embedding optimization. Values higher than one will result in
#'   greater weight being given to negative samples.
#' @param negative_sample_rate The number of negative edge/1-simplex samples to
#'   use per positive edge/1-simplex sample in optimizing the low dimensional
#'   embedding.
#' @param nn_method Method for finding nearest neighbors. Options are:
#'   \itemize{
#'     \item \code{"fnn"}. Use exact nearest neighbors via the
#'       \href{https://cran.r-project.org/package=FNN}{FNN} package.
#'     \item \code{"annoy"} Use approximate nearest neighbors via the
#'       \href{https://cran.r-project.org/package=RcppAnnoy}{RcppAnnoy} package.
#'     \item \code{"hnsw"} Use approximate nearest neighbors with the
#'       Hierarchical Navigable Small World (HNSW) method (Malkov and Yashunin,
#'       2018) via the
#'       \href{https://cran.r-project.org/package=RcppHNSW}{RcppHNSW} package.
#'       \code{RcppHNSW} is not a dependency of this package: this option is
#'       only available if you have installed \code{RcppHNSW} yourself. Also,
#'       HNSW only supports the following arguments for \code{metric}:
#'       \code{"euclidean"}, \code{"cosine"} and \code{"correlation"}.
#'     \item \code{"nndescent"} Use approximate nearest neighbors with the
#'       Nearest Neighbor Descent method (Dong et al., 2011) via the
#'       \href{https://cran.r-project.org/package=rnndescent}{rnndescent}
#'       package. \code{rnndescent} is not a dependency of this package: this
#'       option is only available if you have installed \code{rnndescent}
#'       yourself.
#'    }
#'   By default, if \code{X} has less than 4,096 vertices, the exact nearest
#'   neighbors are found. Otherwise, approximate nearest neighbors are used.
#'   You may also pass precalculated nearest neighbor data to this argument. It
#'   must be a list consisting of two elements:
#'   \itemize{
#'     \item \code{"idx"}. A \code{n_vertices x n_neighbors} matrix
#'     containing the integer indexes of the nearest neighbors in \code{X}. Each
#'     vertex is considered to be its own nearest neighbor, i.e.
#'     \code{idx[, 1] == 1:n_vertices}.
#'     \item \code{"dist"}. A \code{n_vertices x n_neighbors} matrix
#'     containing the distances of the nearest neighbors.
#'   }
#'   Multiple nearest neighbor data (e.g. from two different precomputed
#'   metrics) can be passed by passing a list containing the nearest neighbor
#'   data lists as items.
#'   The \code{n_neighbors} parameter is ignored when using precomputed
#'   nearest neighbor data.
#' @param n_trees Number of trees to build when constructing the nearest
#'   neighbor index. The more trees specified, the larger the index, but the
#'   better the results. With \code{search_k}, determines the accuracy of the
#'   Annoy nearest neighbor search. Only used if the \code{nn_method} is
#'   \code{"annoy"}. Sensible values are between \code{10} to \code{100}.
#' @param search_k Number of nodes to search during the neighbor retrieval. The
#'   larger k, the more the accurate results, but the longer the search takes.
#'   With \code{n_trees}, determines the accuracy of the Annoy nearest neighbor
#'   search. Only used if the \code{nn_method} is \code{"annoy"}.
#' @param nn_args A list containing additional arguments to pass to the nearest
#'   neighbor method. For \code{nn_method = "annoy"}, you can specify
#'   \code{"n_trees"} and \code{"search_k"}, and these will override the
#'   \code{n_trees} and \code{search_k} parameters.
#'   For \code{nn_method = "hnsw"}, you may specify the following arguments:
#'   \itemize{
#'   \item \code{M} The maximum number of neighbors to keep for each vertex.
#'   Reasonable values are \code{2} to \code{100}. Higher values give better
#'   recall at the cost of more memory. Default value is \code{16}.
#'   \item \code{ef_construction} A positive integer specifying the size of
#'   the dynamic list used during index construction. A higher value will
#'   provide better results at the cost of a longer time to build the index.
#'   Default is \code{200}.
#'   \item \code{ef} A positive integer specifying the size of the dynamic
#'   list used during search. This cannot be smaller than \code{n_neighbors}
#'   and cannot be higher than the number of items in the index. Default is
#'   \code{10}.
#'   }
#'   For \code{nn_method = "nndescent"}, you may specify the following
#'   arguments:
#'   \itemize{
#'   \item \code{n_trees} The number of trees to use in a random projection
#'   forest to initialize the search. A larger number will give more accurate
#'   results at the cost of a longer computation time. The default of
#'   \code{NULL} means that the number is chosen based on the number of
#'   observations in \code{X}.
#'   \item \code{max_candidates} The number of potential neighbors to explore
#'   per iteration. By default, this is set to \code{n_neighbors} or \code{60},
#'   whichever is smaller. A larger number will give more accurate results at
#'   the cost of a longer computation time.
#'   \item \code{n_iters} The number of iterations to run the search. A larger
#'   number will give more accurate results at the cost of a longer computation
#'   time. By default, this will be chosen based on the number of observations
#'   in \code{X}. You may also need to modify the convergence criterion
#'   \code{delta}.
#'   \item \code{delta} The minimum relative change in the neighbor graph
#'   allowed before early stopping. Should be a value between 0 and 1. The
#'   smaller the value, the smaller the amount of progress between iterations is
#'   allowed. Default value of \code{0.001} means that at least 0.1% of the
#'   neighbor graph must be updated at each iteration.
#'   \item \code{init} How to initialize the nearest neighbor descent. By
#'   default this is set to \code{"tree"} and uses a random project forest.
#'   If you set this to \code{"rand"}, then a random selection is used. Usually
#'   this is less accurate than using RP trees, but for high-dimensional cases,
#'   there may be little difference in the quality of the initialization and
#'   random initialization will be a lot faster. If you set this to
#'   \code{"rand"}, then the \code{n_trees} parameter is ignored.
#'   }
#'   descent). Default is half the number of concurrent threads supported by the
#'   system. For nearest neighbor search, only applies if
#'   \code{nn_method = "annoy"}. If \code{n_threads > 1}, then the Annoy index
#'   will be temporarily written to disk in the location determined by
#'   descent. If set to > 1, then be aware that if \code{batch = FALSE}, results
#'   will \emph{not} be reproducible, even if \code{set.seed} is called with a
#'   fixed seed before running. Set to \code{"auto"} to use the same value as
#' @param grain_size The minimum amount of work to do on each thread. If this
#'   value is set high enough, then less than \code{n_threads} or
#'   \code{n_sgd_threads} will be used for processing, which might give a
#'   switching was outweighing the improvement due to concurrent processing.
#'   This should be left at default (\code{1}) and work will be spread evenly
#'   over all the threads specified.
#' @param kernel Type of kernel function to create input probabilities. Can be
#'   one of \code{"gauss"} (the default) or \code{"knn"}. \code{"gauss"} uses
#'   the usual Gaussian weighted similarities. \code{"knn"} assigns equal
#'   probabilities to every edge in the nearest neighbor graph, and zero
#'   otherwise, using \code{perplexity} nearest neighbors. The \code{n_neighbors}
#'   parameter is ignored in this case.
#' @param pca If set to a positive integer value, reduce data to this number of
#'   columns using PCA. Doesn't applied if the distance \code{metric} is
#'   \code{"hamming"}, or the dimensions of the data is larger than the
#'   number specified (i.e. number of rows and columns must be larger than the
#'   value of this parameter). If you have > 100 columns in a data frame or
#'   matrix, reducing the number of columns in this way may substantially
#'   increase the performance of the nearest neighbor search at the cost of a
#'   potential decrease in accuracy. In many t-SNE applications, a value of 50
#'   is recommended, although there's no guarantee that this is appropriate for
#'   all settings.
#' @param pca_center If \code{TRUE}, center the columns of \code{X} before
#'   carrying out PCA. For binary data, it's recommended to set this to
#'   \code{FALSE}.
#' @param pca_method Method to carry out any PCA dimensionality reduction when
#'   the \code{pca} parameter is specified. Allowed values are:
#'   \itemize{
#'     \item{\code{"irlba"}}. Uses \code{\link[irlba]{prcomp_irlba}} from the
#'     \href{https://cran.r-project.org/package=irlba}{irlba} package.
#'     \item{\code{"rsvd"}}. Uses 5 iterations of \code{\link[irlba]{svdr}} from
#'     the \href{https://cran.r-project.org/package=irlba}{irlba} package.
#'     This is likely to give much faster but potentially less accurate results
#'     than using \code{"irlba"}. For the purposes of nearest neighbor
#'     calculation and coordinates initialization, any loss of accuracy doesn't
#'     seem to matter much.
#'     from the \href{https://cran.r-project.org/package=bigstatsr}{bigstatsr}
#'     package. The SVD methods used in \code{bigstatsr} may be faster on
#'     Windows). \strong{Note}: \code{bigstatsr} is \emph{not} a dependency of
#'     uwot: if you choose to use this package for PCA, you \emph{must} install
#'     it yourself.
#'     \item{\code{"svd"}}. Uses \code{\link[base]{svd}} for the SVD. This is
#'     likely to be slow for all but the smallest datasets.
#'     \item{\code{"auto"}} (the default). Uses \code{"irlba"}, unless more than
#'     50% of the full set of singular vectors would be calculated, in which
#'     case \code{"svd"} is used.
#'   }
#' @param pcg_rand If \code{TRUE}, use the PCG random number generator (O'Neill,
#'   2014) during optimization. Otherwise, use the faster (but probably less
#'   statistically good) Tausworthe "taus88" generator. The default is
#'   \code{TRUE}.
#' @param fast_sgd If \code{TRUE}, then the following combination of parameters
#'   is set: \code{pcg_rand = TRUE} and \code{n_sgd_threads = "auto"}. The
#'   default is \code{FALSE}. Setting this to \code{TRUE} will speed up the
#'   stochastic optimization phase, but give a potentially less accurate
#'   embedding, and which will not be exactly reproducible even with a fixed
#'   seed. For visualization, \code{fast_sgd = TRUE} will give perfectly good
#'   results. For more generic dimensionality reduction, it's safer to leave
#'   \code{fast_sgd = FALSE}. If \code{fast_sgd = TRUE}, then user-supplied
#'   values of \code{pcg_rand} and \code{n_sgd_threads}, are ignored.
#' @param batch If \code{TRUE}, then embedding coordinates are updated at the
#'   end of each epoch rather than during the epoch. In batch mode, results are
#'   reproducible with a fixed random seed even with \code{n_sgd_threads > 1},
#'   at the cost of a slightly higher memory use. You may also have to modify
#'   \code{learning_rate} and increase \code{n_epochs}, so whether this provides
#'   a speed increase over the single-threaded optimization is likely to be
#'   dataset and hardware-dependent.
#' @param ret_nn If \code{TRUE}, then in addition to the embedding, also return
#'   nearest neighbor data that can be used as input to \code{nn_method} to
#'   avoid the overhead of repeatedly calculating the nearest neighbors when
#'   manipulating unrelated parameters (e.g. \code{min_dist}, \code{n_epochs},
#'   \code{init}). See the "Value" section for the names of the list items. If
#'   \code{FALSE}, just return the coordinates. Note that the nearest neighbors
#'   could be sensitive to data scaling, so be wary of reusing nearest neighbor
#'   data if modifying the \code{scale} parameter.
#' @param ret_extra A vector indicating what extra data to return. May contain
#'   any combination of the following strings:
#'   \itemize{
#'     \item \code{"nn"} same as setting \code{ret_nn = TRUE}.
#'     \item \code{"P"} the high dimensional probability matrix. The graph
#'       is returned as a sparse symmetric N x N matrix of class
#'       \link[Matrix]{dgCMatrix-class}, where a non-zero entry (i, j) gives the
#'       input probability (or similarity or affinity) of the edge connecting
#'       vertex i and vertex j. Note that the graph is further sparsified by
#'       removing edges with sufficiently low membership strength that they
#'       would not be sampled by the probabilistic edge sampling employed for
#'       optimization and therefore the number of non-zero elements in the
#'       matrix is dependent on \code{n_epochs}. If you are only interested in
#'       the fuzzy input graph (e.g. for clustering), setting
#'       \code{n_epochs = 0} will avoid any further sparsifying. Be aware that
#'       setting \code{binary_edge_weights = TRUE} will affect this graph (all
#'       non-zero edge weights will be 1).
#'    \item \code{sigma} a vector of the bandwidths used to calibrate the input
#'       Gaussians to reproduce the target \code{"perplexity"}.
#'   }
#' @param tmpdir Temporary directory to store nearest neighbor indexes during
#'   nearest neighbor search. Default is \code{\link{tempdir}}. The index is
#'   only written to disk if \code{n_threads > 1} and
#'   \code{nn_method = "annoy"}; otherwise, this parameter is ignored.
#' @param verbose If \code{TRUE}, log details to the console.
#' @param opt_args A list of optimizer parameters, used when
#'   \code{batch = TRUE}. The default optimization method used is Adam (Kingma
#'   and Ba, 2014).
#'   \itemize{
#'     \item \code{method} The optimization method to use. Either \code{"adam"}
#'     \item \code{beta1} (Adam only). The weighting parameter for the
#'     exponential moving average of the first moment estimator. Effectively the
#'     momentum parameter. Should be a floating point value between 0 and 1.
#'     Higher values can smooth oscillatory updates in poorly-conditioned
#'     situations and may allow for a larger \code{learning_rate} to be
#'     specified, but too high can cause divergence. Default: \code{0.5}.
#'     \item \code{beta2} (Adam only). The weighting parameter for the
#'     exponential moving average of the uncentered second moment estimator.
#'     Should be a floating point value between 0 and 1. Controls the degree of
#'     adaptivity in the step-size. Higher values put more weight on previous
#'     time steps. Default: \code{0.9}.
#'     \item \code{eps} (Adam only). Intended to be a small value to prevent
#'     division by zero, but in practice can also affect convergence due to its
#'     interaction with \code{beta2}. Higher values reduce the effect of the
#'     descent with momentum. Typical values are between 1e-8 and 1e-3. Default:
#'     \code{1e-7}.
#'     \item \code{alpha} The initial learning rate. Default: the value of the
#'     \code{learning_rate} parameter.
#'   }
#' @param epoch_callback A function which will be invoked at the end of every
#'   epoch. Its signature should be: \code{(epoch, n_epochs, coords)}, where:
#'   \itemize{
#'     \item \code{epoch} The current epoch number (between \code{1} and
#'     \code{n_epochs}).
#'     \item \code{n_epochs} Number of epochs to use during the optimization of
#'     the embedded coordinates.
#'     \item \code{coords} The embedded coordinates as of the end of the current
#'     epoch, as a matrix with dimensions (N, \code{n_components}).
#'   }
#' @param binary_edge_weights If \code{TRUE} then edge weights in the input
#'   graph are treated as binary (0/1) rather than real valued. This affects the
#'   sampling frequency of neighbors and is the strategy used by the PaCMAP
#'   method (Wang and co-workers, 2020). Practical (BÃ¶hm and co-workers, 2020)
#'   and theoretical (Damrich and Hamprecht, 2021) work suggests this has little
#'   effect on UMAP's performance.
#' @return A matrix of optimized coordinates, or:
#'   \itemize{
#'     \item if \code{ret_nn = TRUE} (or \code{ret_extra} contains \code{"nn"}),
#'     returns the nearest neighbor data as a list called \code{nn}. This
#'     contains one list for each \code{metric} calculated, itself containing a
#'     matrix \code{idx} with the integer ids of the neighbors; and a matrix
#'     \code{dist} with the distances. The \code{nn} list (or a sub-list) can be
#'     used as input to the \code{nn_method} parameter.
#'     \item if \code{ret_extra} contains \code{"P"}, returns the high
#'     dimensional probability matrix as a sparse matrix called \code{P}, of
#'     \item if \code{ret_extra} contains \code{"sigma"}, returns a vector of
#'     the high dimensional gaussian bandwidths for each point, and
#'     \code{"dint"} a vector of estimates of the intrinsic dimensionality at
#'     each point, based on the method given by Lee and co-workers (2015).
#'   }
#'   The returned list contains the combined data from any combination of
#'   specifying \code{ret_nn} and \code{ret_extra}.
#'
#' @examples
#' # Default number of epochs is much larger than for UMAP, assumes random
#' # initialization. Use perplexity rather than n_neighbors to control the size
#' # of the local neighborhood 20 epochs may be too small for a random
#' # initialization
#' iris_lvish <- lvish(iris,
#'   perplexity = 50, learning_rate = 0.5,
#'   init = "random", n_epochs = 20
#' )
#'
#' @references
#' Belkin, M., & Niyogi, P. (2002).
#' Laplacian eigenmaps and spectral techniques for embedding and clustering.
#' In \emph{Advances in neural information processing systems}
#' (pp. 585-591).
#' \url{http://papers.nips.cc/paper/1961-laplacian-eigenmaps-and-spectral-techniques-for-embedding-and-clustering.pdf}
#'
#' BÃ¶hm, J. N., Berens, P., & Kobak, D. (2020).
#' A unifying perspective on neighbor embeddings along the attraction-repulsion spectrum.
#' \emph{arXiv preprint} \emph{arXiv:2007.08902}.
#' \url{https://arxiv.org/abs/2007.08902}
#'
#' Damrich, S., & Hamprecht, F. A. (2021).
#' On UMAP's true loss function.
#' \emph{Advances in Neural Information Processing Systems}, \emph{34}.
#' \url{https://proceedings.neurips.cc/paper/2021/hash/2de5d16682c3c35007e4e92982f1a2ba-Abstract.html}
#'
#' Dong, W., Moses, C., & Li, K. (2011, March).
#' Efficient k-nearest neighbor graph construction for generic similarity measures.
#' In \emph{Proceedings of the 20th international conference on World Wide Web}
#' (pp. 577-586).
#' ACM.
#' \doi{10.1145/1963405.1963487}.
#'
#' Kingma, D. P., & Ba, J. (2014).
#' Adam: A method for stochastic optimization.
#' \emph{arXiv preprint} \emph{arXiv}:1412.6980.
#' \url{https://arxiv.org/abs/1412.6980}
#'
#' Lee, J. A., Peluffo-OrdÃ³Ã±ez, D. H., & Verleysen, M. (2015).
#' Multi-scale similarities in stochastic neighbour embedding: Reducing
#' dimensionality while preserving both local and global structure.
#' \emph{Neurocomputing}, \emph{169}, 246-261.
#'
#' Malkov, Y. A., & Yashunin, D. A. (2018).
#' Efficient and robust approximate nearest neighbor search using hierarchical
#' navigable small world graphs.
#' \emph{IEEE transactions on pattern analysis and machine intelligence}, \emph{42}(4), 824-836.
#'
#' McInnes, L., Healy, J., & Melville, J. (2018).
#' UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction
#' \emph{arXiv preprint} \emph{arXiv}:1802.03426.
#' \url{https://arxiv.org/abs/1802.03426}
#'
#' Oâ€™Neill, M. E. (2014).
#' \emph{PCG: A family of simple fast space-efficient statistically good
#' algorithms for random number generation}
#' (Report No. HMC-CS-2014-0905). Harvey Mudd College.
#'
#' Tang, J., Liu, J., Zhang, M., & Mei, Q. (2016, April).
#' Visualizing large-scale and high-dimensional data.
#' In \emph{Proceedings of the 25th International Conference on World Wide Web}
#' (pp. 287-297).
#' International World Wide Web Conferences Steering Committee.
#' \url{https://arxiv.org/abs/1602.00370}
#'
#' Van der Maaten, L., & Hinton, G. (2008).
#' Visualizing data using t-SNE.
#' \emph{Journal of Machine Learning Research}, \emph{9} (2579-2605).
#' \url{https://www.jmlr.org/papers/v9/vandermaaten08a.html}
#'
#' Wang, Y., Huang, H., Rudin, C., & Shaposhnik, Y. (2021).
#' Understanding How Dimension Reduction Tools Work: An Empirical Approach to Deciphering t-SNE, UMAP, TriMap, and PaCMAP for Data Visualization.
#' \emph{Journal of Machine Learning Research}, \emph{22}(201), 1-73.
#' \url{https://www.jmlr.org/papers/v22/20-1061.html}
#'
#' @export
lvish <- function(X, perplexity = 50, n_neighbors = perplexity * 3,
n_components = 2, metric = "euclidean", n_epochs = -1,
learning_rate = 1, scale = "maxabs",
init = "lvrandom", init_sdev = NULL,
repulsion_strength = 7,
negative_sample_rate = 5.0,
nn_method = NULL, n_trees = 50,
search_k = 2 * n_neighbors * n_trees,
grain_size = 1,
kernel = "gauss",
pca = NULL, pca_center = TRUE,
pcg_rand = TRUE,
fast_sgd = FALSE,
ret_nn = FALSE, ret_extra = c(),
tmpdir = tempdir(),
verbose = getOption("verbose", TRUE),
batch = FALSE,
opt_args = NULL, epoch_callback = NULL,
pca_method = NULL,
binary_edge_weights = FALSE,
nn_args = list()) {
uwot(X,
n_neighbors = n_neighbors, n_components = n_components,
metric = metric,
n_epochs = n_epochs, alpha = learning_rate, scale = scale,
init = init, init_sdev = init_sdev,
gamma = repulsion_strength, negative_sample_rate = negative_sample_rate,
nn_method = nn_method,
n_trees = n_trees, search_k = search_k,
method = "largevis", perplexity = perplexity,
pca = pca, pca_center = pca_center, pca_method = pca_method,
grain_size = grain_size,
kernel = kernel,
ret_nn = ret_nn || "nn" %in% ret_extra,
ret_fgraph = "P" %in% ret_extra,
ret_sigma = "sigma" %in% ret_extra,
pcg_rand = pcg_rand,
fast_sgd = fast_sgd,
batch = batch,
opt_args = opt_args,
epoch_callback = epoch_callback,
tmpdir = tmpdir,
binary_edge_weights = binary_edge_weights,
verbose = verbose,
nn_args = list()
)
}

#' Similarity Graph
#'
#' Create a graph (as a sparse symmetric weighted adjacency matrix) representing
#' the similarities between items in a data set. No dimensionality reduction is
#' carried out. By default, the similarities are calculated using the merged
#' fuzzy simplicial set approach in the Uniform Manifold Approximation and
#' Projection (UMAP) method (McInnes et al., 2018), but the approach from
#' LargeVis (Tang et al., 2016) can also be used.
#'
#' This is equivalent to running \code{\link{umap}} with the
#' \code{ret_extra = c("fgraph")} parameter, but without the overhead of
#' calculating (or returning) the optimized low-dimensional coordinates.
#'
#'   Matrix and data frames should contain one observation per row. Data frames
#'   will have any non-numeric columns removed, although factor columns will be
#'   used if explicitly included via \code{metric} (see the help for
#'   \code{metric} for details). A sparse matrix is interpreted as a distance
#'   matrix, and is assumed to be symmetric, so you can also pass in an
#'   explicitly upper or lower triangular sparse matrix to save storage. There
#'   must be at least \code{n_neighbors} non-zero distances for each row. Both
#'   implicit and explicit zero entries are ignored. Set zero distances you want
#'   to keep to an arbitrarily small non-zero value (e.g. \code{1e-10}).
#'   \code{X} can also be \code{NULL} if pre-computed nearest neighbor data is
#'   passed to \code{nn_method}.
#' @param n_neighbors The size of local neighborhood (in terms of number of
#'   neighboring sample points) used for manifold approximation. Larger values
#'   result in more global views of the manifold, while smaller values result in
#'   more local data being preserved. In general values should be in the range
#'   \code{2} to \code{100}.
#' @param metric Type of distance metric to use to find nearest neighbors. For
#'  \code{nn_method = "annoy"} this can be one of:
#' \itemize{
#'   \item \code{"euclidean"} (the default)
#'   \item \code{"cosine"}
#'   \item \code{"manhattan"}
#'   \item \code{"hamming"}
#'   \item \code{"correlation"} (a distance based on the Pearson correlation)
#'   \item \code{"categorical"} (see below)
#' }
#' For \code{nn_method = "hnsw"} this can be one of:
#' \itemize{
#'   \item \code{"euclidean"}
#'   \item \code{"cosine"}
#'   \item \code{"correlation"}
#' }
#' If \href{https://cran.r-project.org/package=rnndescent}{rnndescent} is
#' installed and \code{nn_method = "nndescent"} is specified then many more
#' metrics are avaiable, including:
#' \itemize{
#' \item \code{"braycurtis"}
#' \item \code{"canberra"}
#' \item \code{"chebyshev"}
#' \item \code{"dice"}
#' \item \code{"hamming"}
#' \item \code{"hellinger"}
#' \item \code{"jaccard"}
#' \item \code{"jensenshannon"}
#' \item \code{"kulsinski"}
#' \item \code{"rogerstanimoto"}
#' \item \code{"russellrao"}
#' \item \code{"sokalmichener"}
#' \item \code{"sokalsneath"}
#' \item \code{"spearmanr"}
#' \item \code{"symmetrickl"}
#' \item \code{"tsss"}
#' \item \code{"yule"}
#' }
#' For more details see the package documentation of \code{rnndescent}.
#' For \code{nn_method = "fnn"}, the distance metric is always "euclidean".
#'
#' If \code{X} is a data frame or matrix, then multiple metrics can be
#' specified, by passing a list to this argument, where the name of each item in
#' the list is one of the metric names above. The value of each list item should
#' be a vector giving the names or integer ids of the columns to be included in
#' a calculation, e.g. \code{metric = list(euclidean = 1:4, manhattan = 5:10)}.
#'
#' Each metric calculation results in a separate fuzzy simplicial set, which are
#' intersected together to produce the final set. Metric names can be repeated.
#' Because non-numeric columns are removed from the data frame, it is safer to
#' use column names than integer ids.
#'
#' Factor columns can also be used by specifying the metric name
#' \code{"categorical"}. Factor columns are treated different from numeric
#' columns and although multiple factor columns can be specified in a vector,
#' each factor column specified is processed individually. If you specify
#' a non-factor column, it will be coerced to a factor.
#'
#' For a given data block, you may override the \code{pca} and \code{pca_center}
#' arguments for that block, by providing a list with one unnamed item
#' containing the column names or ids, and then any of the \code{pca} or
#' \code{pca_center} overrides as named items, e.g. \code{metric =
#' list(euclidean = 1:4, manhattan = list(5:10, pca_center = FALSE))}. This
#' exists to allow mixed binary and real-valued data to be included and to have
#' PCA applied to both, but with centering applied only to the real-valued data
#' (it is typical not to apply centering to binary data before PCA is applied).
#' @param scale Scaling to apply to \code{X} if it is a data frame or matrix:
#' \itemize{
#'   \item{\code{"none"} or \code{FALSE} or \code{NULL}} No scaling.
#'   \item{\code{"Z"} or \code{"scale"} or \code{TRUE}} Scale each column to
#'   zero mean and variance 1.
#'   \item{\code{"maxabs"}} Center each column to mean 0, then divide each
#'   element by the maximum absolute value over the entire matrix.
#'   \item{\code{"range"}} Range scale the entire matrix, so the smallest
#'   element is 0 and the largest is 1.
#'   \item{\code{"colrange"}} Scale each column in the range (0,1).
#' }
#' For \code{method} \code{"umap"}, the default is \code{"none"}. For
#' \code{"largevis"}, the default is \code{"maxabs"}.
#' @param set_op_mix_ratio Interpolate between (fuzzy) union and intersection as
#'   the set operation used to combine local fuzzy simplicial sets to obtain a
#'   global fuzzy simplicial sets. Both fuzzy set operations use the product
#'   t-norm. The value of this parameter should be between \code{0.0} and
#'   \code{1.0}; a value of \code{1.0} will use a pure fuzzy union, while
#'   \code{0.0} will use a pure fuzzy intersection. Ignored if
#'   \code{method = "largevis"}
#' @param local_connectivity The local connectivity required -- i.e. the number
#'   of nearest neighbors that should be assumed to be connected at a local
#'   level. The higher this value the more connected the manifold becomes
#'   locally. In practice this should be not more than the local intrinsic
#'   dimension of the manifold. Ignored if \code{method = "largevis"}.
#' @param nn_method Method for finding nearest neighbors. Options are:
#'   \itemize{
#'     \item \code{"fnn"}. Use exact nearest neighbors via the
#'       \href{https://cran.r-project.org/package=FNN}{FNN} package.
#'     \item \code{"annoy"} Use approximate nearest neighbors via the
#'       \href{https://cran.r-project.org/package=RcppAnnoy}{RcppAnnoy} package.
#'     \item \code{"hnsw"} Use approximate nearest neighbors with the
#'       Hierarchical Navigable Small World (HNSW) method (Malkov and Yashunin,
#'       2018) via the
#'       \href{https://cran.r-project.org/package=RcppHNSW}{RcppHNSW} package.
#'       \code{RcppHNSW} is not a dependency of this package: this option is
#'       only available if you have installed \code{RcppHNSW} yourself. Also,
#'       HNSW only supports the following arguments for \code{metric} and
#'       \code{target_metric}: \code{"euclidean"}, \code{"cosine"} and
#'       \code{"correlation"}.
#'     \item \code{"nndescent"} Use approximate nearest neighbors with the
#'       Nearest Neighbor Descent method (Dong et al., 2011) via the
#'       \href{https://cran.r-project.org/package=rnndescent}{rnndescent}
#'       package. \code{rnndescent} is not a dependency of this package: this
#'       option is only available if you have installed \code{rnndescent}
#'       yourself.
#'    }
#'   By default, if \code{X} has less than 4,096 vertices, the exact nearest
#'   neighbors are found. Otherwise, approximate nearest neighbors are used.
#'   You may also pass pre-calculated nearest neighbor data to this argument. It
#'   must be one of two formats, either a list consisting of two elements:
#'   \itemize{
#'     \item \code{"idx"}. A \code{n_vertices x n_neighbors} matrix
#'     containing the integer indexes of the nearest neighbors in \code{X}. Each
#'     vertex is considered to be its own nearest neighbor, i.e.
#'     \code{idx[, 1] == 1:n_vertices}.
#'     \item \code{"dist"}. A \code{n_vertices x n_neighbors} matrix
#'     containing the distances of the nearest neighbors.
#'   }
#'   or a sparse distance matrix of type \code{dgCMatrix}, with dimensions
#'   \code{n_vertices x n_vertices}. Distances should be arranged by column,
#'   i.e. a non-zero entry in row \code{j} of the \code{i}th column indicates
#'   that the \code{j}th observation in \code{X} is a nearest neighbor of the
#'   \code{i}th observation with the distance given by the value of that
#'   element.
#'   The \code{n_neighbors} parameter is ignored when using precomputed
#'   nearest neighbor data. If using the sparse distance matrix input, each
#'   column can contain a different number of neighbors.
#' @param n_trees Number of trees to build when constructing the nearest
#'   neighbor index. The more trees specified, the larger the index, but the
#'   better the results. With \code{search_k}, determines the accuracy of the
#'   Annoy nearest neighbor search. Only used if the \code{nn_method} is
#'   \code{"annoy"}. Sensible values are between \code{10} to \code{100}.
#' @param search_k Number of nodes to search during the neighbor retrieval. The
#'   larger k, the more the accurate results, but the longer the search takes.
#'   With \code{n_trees}, determines the accuracy of the Annoy nearest neighbor
#'   search. Only used if the \code{nn_method} is \code{"annoy"}.
#' @param nn_args A list containing additional arguments to pass to the nearest
#'   neighbor method. For \code{nn_method = "annoy"}, you can specify
#'   \code{"n_trees"} and \code{"search_k"}, and these will override the
#'   \code{n_trees} and \code{search_k} parameters.
#'   For \code{nn_method = "hnsw"}, you may specify the following arguments:
#'   \itemize{
#'   \item \code{M} The maximum number of neighbors to keep for each vertex.
#'   Reasonable values are \code{2} to \code{100}. Higher values give better
#'   recall at the cost of more memory. Default value is \code{16}.
#'   \item \code{ef_construction} A positive integer specifying the size of
#'   the dynamic list used during index construction. A higher value will
#'   provide better results at the cost of a longer time to build the index.
#'   Default is \code{200}.
#'   \item \code{ef} A positive integer specifying the size of the dynamic
#'   list used during search. This cannot be smaller than \code{n_neighbors}
#'   and cannot be higher than the number of items in the index. Default is
#'   \code{10}.
#'   }
#'   For \code{nn_method = "nndescent"}, you may specify the following
#'   arguments:
#'   \itemize{
#'   \item \code{n_trees} The number of trees to use in a random projection
#'   forest to initialize the search. A larger number will give more accurate
#'   results at the cost of a longer computation time. The default of
#'   \code{NULL} means that the number is chosen based on the number of
#'   observations in \code{X}.
#'   \item \code{max_candidates} The number of potential neighbors to explore
#'   per iteration. By default, this is set to \code{n_neighbors} or \code{60},
#'   whichever is smaller. A larger number will give more accurate results at
#'   the cost of a longer computation time.
#'   \item \code{n_iters} The number of iterations to run the search. A larger
#'   number will give more accurate results at the cost of a longer computation
#'   time. By default, this will be chosen based on the number of observations
#'   in \code{X}. You may also need to modify the convergence criterion
#'   \code{delta}.
#'   \item \code{delta} The minimum relative change in the neighbor graph
#'   allowed before early stopping. Should be a value between 0 and 1. The
#'   smaller the value, the smaller the amount of progress between iterations is
#'   allowed. Default value of \code{0.001} means that at least 0.1% of the
#'   neighbor graph must be updated at each iteration.
#'   \item \code{init} How to initialize the nearest neighbor descent. By
#'   default this is set to \code{"tree"} and uses a random project forest.
#'   If you set this to \code{"rand"}, then a random selection is used. Usually
#'   this is less accurate than using RP trees, but for high-dimensional cases,
#'   there may be little difference in the quality of the initialization and
#'   random initialization will be a lot faster. If you set this to
#'   \code{"rand"}, then the \code{n_trees} parameter is ignored.
#'   \item \code{pruning_degree_multiplier} The maximum number of edges per node
#'   to retain in the search graph, relative to \code{n_neighbors}. A larger
#'   value will give more accurate results at the cost of a longer computation
#'   time. Default is \code{1.5}. This parameter only affects neighbor search
#'   when transforming new data with \code{\link{umap_transform}}.
#'   \item \code{epsilon} Controls the degree of the back-tracking when
#'   traversing the search graph. Setting this to \code{0.0} will do a greedy
#'   search with no back-tracking. A larger value will give more accurate
#'   results at the cost of a longer computation time. Default is \code{0.1}.
#'   This parameter only affects neighbor search when transforming new data with
#'   \item \code{max_search_fraction} Specifies the maximum fraction of the
#'   search graph to traverse. By default, this is set to \code{1.0}, so the
#'   entire graph (i.e. all items in \code{X}) may be visited. You may want to
#'   set this to a smaller value if you have a very large dataset (in
#'   conjunction with \code{epsilon}) to avoid an inefficient exhaustive search
#'   of the data in \code{X}. This parameter only affects neighbor search when
#'   transforming new data with \code{\link{umap_transform}}.
#'   }
#' @param perplexity Used only if \code{method = "largevis"}. Controls the size
#'   of the local neighborhood used for manifold approximation. Should be a
#'   value between 1 and one less than the number of items in \code{X}. If
#'   specified, you should \emph{not} specify a value for \code{n_neighbors}
#'   unless you know what you are doing.
#' @param kernel Used only if \code{method = "largevis"}. Type of kernel
#'   function to create input similiarties. Can be one of \code{"gauss"} (the
#'   default) or \code{"knn"}. \code{"gauss"} uses the usual Gaussian weighted
#'   similarities. \code{"knn"} assigns equal similiarties. to every edge in the
#'   nearest neighbor graph, and zero otherwise, using \code{perplexity} nearest
#'   neighbors. The \code{n_neighbors} parameter is ignored in this case.
#' @param method How to generate the similarities between items. One of:
#'   \itemize{
#'     \item \code{"umap"} The UMAP method of McInnes et al. (2018).
#'     \item \code{"largevis"} The LargeVis method of Tang et al. (2016).
#'   }
#' @param y Optional target data to add supervised or semi-supervised weighting
#'   to the similarity graph . Can be a vector, matrix or data frame. Use the
#'   \code{target_metric} parameter to specify the metrics to use, using the
#'   same syntax as \code{metric}. Usually either a single numeric or factor
#'   column is used, but more complex formats are possible. The following types
#'   are allowed:
#'   \itemize{
#'     \item Factor columns with the same length as \code{X}. \code{NA} is
#'     allowed for any observation with an unknown level, in which case
#'     UMAP operates as a form of semi-supervised learning. Each column is
#'     treated separately.
#'     \item Numeric data. \code{NA} is \emph{not} allowed in this case. Use the
#'     parameter \code{target_n_neighbors} to set the number of neighbors used
#'     with \code{y}. If unset, \code{n_neighbors} is used. Unlike factors,
#'     numeric columns are grouped into one block unless \code{target_metric}
#'     specifies otherwise. For example, if you wish columns \code{a} and
#'     \code{b} to be treated separately, specify
#'     \code{target_metric = list(euclidean = "a", euclidean = "b")}. Otherwise,
#'     the data will be effectively treated as a matrix with two columns.
#'     \item Nearest neighbor data, consisting of a list of two matrices,
#'     \code{idx} and \code{dist}. These represent the precalculated nearest
#'     neighbor indices and distances, respectively. This
#'     is the same format as that expected for precalculated data in
#'     \code{nn_method}. This format assumes that the underlying data was a
#'     numeric vector. Any user-supplied value of the \code{target_n_neighbors}
#'     parameter is ignored in this case, because the the number of columns in
#'     the matrices is used for the value. Multiple nearest neighbor data using
#'     different metrics can be supplied by passing a list of these lists.
#'   }
#' Unlike \code{X}, all factor columns included in \code{y} are automatically
#' used. This parameter is ignored if \code{method = "largevis"}.
#' @param target_n_neighbors Number of nearest neighbors to use to construct the
#'   target simplicial set. Default value is \code{n_neighbors}. Applies only if
#'   \code{y} is non-\code{NULL} and \code{numeric}.  This parameter is ignored
#'   if \code{method = "largevis"}.
#' @param target_metric The metric used to measure distance for \code{y} if
#'   using supervised dimension reduction. Used only if \code{y} is numeric.
#'   This parameter is ignored if \code{method = "largevis"}.
#' @param target_weight Weighting factor between data topology and target
#'   topology. A value of 0.0 weights entirely on data, a value of 1.0 weights
#'   entirely on target. The default of 0.5 balances the weighting equally
#'   between data and target. Only applies if \code{y} is non-\code{NULL}. This
#'   parameter is ignored if \code{method = "largevis"}.
#' @param pca If set to a positive integer value, reduce data to this number of
#'   columns using PCA. Doesn't applied if the distance \code{metric} is
#'   \code{"hamming"}, or the dimensions of the data is larger than the
#'   number specified (i.e. number of rows and columns must be larger than the
#'   value of this parameter). If you have > 100 columns in a data frame or
#'   matrix, reducing the number of columns in this way may substantially
#'   increase the performance of the nearest neighbor search at the cost of a
#'   potential decrease in accuracy. In many t-SNE applications, a value of 50
#'   is recommended, although there's no guarantee that this is appropriate for
#'   all settings.
#' @param pca_center If \code{TRUE}, center the columns of \code{X} before
#'   carrying out PCA. For binary data, it's recommended to set this to
#'   \code{FALSE}.
#' @param pca_method Method to carry out any PCA dimensionality reduction when
#'   the \code{pca} parameter is specified. Allowed values are:
#'   \itemize{
#'     \item{\code{"irlba"}}. Uses \code{\link[irlba]{prcomp_irlba}} from the
#'     \href{https://cran.r-project.org/package=irlba}{irlba} package.
#'     \item{\code{"rsvd"}}. Uses 5 iterations of \code{\link[irlba]{svdr}} from
#'     the \href{https://cran.r-project.org/package=irlba}{irlba} package.
#'     This is likely to give much faster but potentially less accurate results
#'     than using \code{"irlba"}. For the purposes of nearest neighbor
#'     calculation and coordinates initialization, any loss of accuracy doesn't
#'     seem to matter much.
#'     from the \href{https://cran.r-project.org/package=bigstatsr}{bigstatsr}
#'     package. The SVD methods used in \code{bigstatsr} may be faster on
#'     Windows). \strong{Note}: \code{bigstatsr} is \emph{not} a dependency of
#'     uwot: if you choose to use this package for PCA, you \emph{must} install
#'     it yourself.
#'     \item{\code{"svd"}}. Uses \code{\link[base]{svd}} for the SVD. This is
#'     likely to be slow for all but the smallest datasets.
#'     \item{\code{"auto"}} (the default). Uses \code{"irlba"}, unless more than
#'     50% of the full set of singular vectors would be calculated, in which
#'     case \code{"svd"} is used.
#'   }
#' @param ret_extra A vector indicating what extra data to return. May contain
#'   any combination of the following strings:
#'   \itemize{
#'    \item \code{"nn"} nearest neighbor data that can be used as input to
#'       \code{nn_method} to avoid the overhead of repeatedly calculating the
#'       nearest neighbors when manipulating unrelated parameters. See the
#'       "Value" section for the names of the list items. Note that the nearest
#'       neighbors could be sensitive to data scaling, so be wary of reusing
#'       nearest neighbor data if modifying the \code{scale} parameter.
#'    \item \code{"sigma"} the normalization value for each observation in the
#'       dataset when constructing the smoothed distances to each of its
#'       neighbors. This gives some sense of the local density of each
#'       observation in the high dimensional space: higher values of
#'       \code{sigma} indicate a higher dispersion or lower density.
#'   }
#' @param n_threads Number of threads to use. Default is half the number of
#'   concurrent threads supported by the system. For nearest neighbor search,
#'   only applies if \code{nn_method = "annoy"}. If \code{n_threads > 1}, then
#'   the Annoy index will be temporarily written to disk in the location
#' @param grain_size The minimum amount of work to do on each thread. If this
#'   value is set high enough, then less than \code{n_threads} will be used for
#'   processing, which might give a performance improvement if the overhead of
#'   thread management and context switching was outweighing the improvement due
#'   to concurrent processing. This should be left at default (\code{1}) and
#' @param tmpdir Temporary directory to store nearest neighbor indexes during
#'   nearest neighbor search. Default is \code{\link{tempdir}}. The index is
#'   only written to disk if \code{n_threads > 1} and
#'   \code{nn_method = "annoy"}; otherwise, this parameter is ignored.
#' @param verbose If \code{TRUE}, log details to the console.
#' @param binary_edge_weights If \code{TRUE} then edge weights of the returned
#'   graph are binary (0/1) rather than reflecting the degree of similarity.
#' @return A sparse symmetrized matrix of the similarities between the items in
#'   \code{X} or if \code{nn_method} contains pre-computed nearest neighbor
#'   data, the items in \code{nn_method}. Because of the symmetrization, there
#'   may be more non-zero items in each column than the specified value of
#'   \code{n_neighbors} (or pre-computed neighbors in \code{nn_method}).
#'   If \code{ret_extra} is specified then the return value will be a list
#'   containing:
#'   \itemize{
#'     \item \code{similarity_graph} the similarity graph as a sparse matrix
#'     as described above.
#'     \item \code{nn} (if \code{ret_extra} contained \code{"nn"}) the nearest
#'     neighbor data as a list called \code{nn}. This contains one list for each
#'     \code{metric} calculated, itself containing a matrix \code{idx} with the
#'     integer ids of the neighbors; and a matrix \code{dist} with the
#'     distances. The \code{nn} list (or a sub-list) can be used as input to the
#'     \code{nn_method} parameter.
#'     \item \code{sigma} (if \code{ret_extra} contains \code{"sigma"}),
#'     a vector of calibrated parameters, one for each item in the input data,
#'     reflecting the local data density for that item. The exact definition of
#'     the values depends on the choice of the \code{method} parameter.
#'     \item \code{rho} (if \code{ret_extra} contains \code{"sigma"}), a
#'     vector containing the largest distance to the locally connected neighbors
#'     of each item in the input data. This will exist only if
#'     \code{method = "umap"}.
#'     \item \code{localr} (if \code{ret_extra} contains \code{"localr"}) a
#'     vector of the estimated local radii, the sum of \code{"sigma"} and
#'     \code{"rho"}. This will exist only if \code{method = "umap"}.
#'   }
#' @examples
#'
#' iris30 <- iris[c(1:10, 51:60, 101:110), ]
#'
#' # return a 30 x 30 sparse matrix with similarity data based on 10 nearest
#' # neighbors per item
#' iris30_sim_graph <- similarity_graph(iris30, n_neighbors = 10)
#'
#' # Default is to use the UMAP method of calculating similarities, but LargeVis
#' # is also available: for that method, use perplexity instead of n_neighbors
#' # to control neighborhood size. Use ret_extra = "nn" to return nearest
#' # neighbor data as well as the similarity graph. Return value is a list
#' # containing similarity_graph' and 'nn' items.
#' iris30_lv_graph <- similarity_graph(iris30,
#'   perplexity = 10,
#'   method = "largevis", ret_extra = "nn"
#' )
#' # If you have the neighbor information you don't need the original data
#' iris30_lv_graph_nn <- similarity_graph(
#'   nn_method = iris30_lv_graph$nn, #' perplexity = 10, method = "largevis" #' ) #' all(iris30_lv_graph_nn == iris30_lv_graph$similarity_graph)
#'
#' @references
#' Dong, W., Moses, C., & Li, K. (2011, March).
#' Efficient k-nearest neighbor graph construction for generic similarity measures.
#' In \emph{Proceedings of the 20th international conference on World Wide Web}
#' (pp. 577-586).
#' ACM.
#' \doi{10.1145/1963405.1963487}.
#'
#' Malkov, Y. A., & Yashunin, D. A. (2018).
#' Efficient and robust approximate nearest neighbor search using hierarchical
#' navigable small world graphs.
#' \emph{IEEE transactions on pattern analysis and machine intelligence}, \emph{42}(4), 824-836.
#'
#' McInnes, L., Healy, J., & Melville, J. (2018).
#' UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction
#' \emph{arXiv preprint} \emph{arXiv}:1802.03426.
#' \url{https://arxiv.org/abs/1802.03426}
#'
#' Tang, J., Liu, J., Zhang, M., & Mei, Q. (2016, April).
#' Visualizing large-scale and high-dimensional data.
#' In \emph{Proceedings of the 25th International Conference on World Wide Web}
#' (pp. 287-297).
#' International World Wide Web Conferences Steering Committee.
#' \url{https://arxiv.org/abs/1602.00370}
#'
#' @export
similarity_graph <- function(X = NULL, n_neighbors = NULL, metric = "euclidean",
scale = NULL,
set_op_mix_ratio = 1.0, local_connectivity = 1.0,
nn_method = NULL, n_trees = 50,
search_k = 2 * n_neighbors * n_trees,
perplexity = 50,
method = "umap",
y = NULL, target_n_neighbors = n_neighbors,
target_metric = "euclidean",
target_weight = 0.5,
pca = NULL, pca_center = TRUE,
ret_extra = c(),
grain_size = 1,
kernel = "gauss",
tmpdir = tempdir(),
verbose = getOption("verbose", TRUE),
pca_method = NULL,
binary_edge_weights = FALSE,
nn_args = list()) {
if (is.null(n_neighbors)) {
if (method == "largevis") {
n_neighbors <- perplexity * 3
scale <- "maxabs"
} else {
n_neighbors <- 15
scale <- FALSE
}
}

uwot_res <- uwot(
X = X, n_neighbors = n_neighbors,
metric = metric, n_epochs = 0, scale = scale,
init = NULL,
set_op_mix_ratio = set_op_mix_ratio,
local_connectivity = local_connectivity,
nn_method = nn_method, n_trees = n_trees,
search_k = search_k,
method = method,
grain_size = grain_size,
kernel = kernel, perplexity = perplexity,
y = y, target_n_neighbors = target_n_neighbors,
target_weight = target_weight, target_metric = target_metric,
pca = pca, pca_center = pca_center, pca_method = pca_method,
ret_model = FALSE,
ret_nn = "nn" %in% ret_extra,
ret_fgraph = TRUE,
ret_sigma = "sigma" %in% ret_extra,
ret_localr = "localr" %in% ret_extra,
binary_edge_weights = binary_edge_weights,
tmpdir = tempdir(),
verbose = verbose,
nn_args = nn_args
)

res <- list()
for (name in names(uwot_res)) {
if (name == "embedding") {
# embedding will be NULL so remove it
next
}
if (name == "P" || name == "fgraph") {
res$similarity_graph <- uwot_res[[name]] } else { res[[name]] <- uwot_res[[name]] } } if (length(names(res)) == 1 && !is.null(res$similarity_graph)) {
# return just the similarity graph if no extras were requested
res <- res$similarity_graph } res } #' Optimize Graph Layout #' #' Carry out dimensionality reduction on an input graph, where the distances in #' the low dimensional space attempt to reproduce the neighbor relations in the #' input data. By default, the cost function used to optimize the output #' coordinates use the Uniform Manifold Approximation and Projection (UMAP) #' method (McInnes et al., 2018), but the approach from LargeVis (Tang et al., #' 2016) can also be used. This function can be used to produce a low #' dimensional representation of the graph produced by #' \code{\link{similarity_graph}}. #' #' @param graph A sparse, symmetric N x N weighted adjacency matrix #' representing a graph. Non-zero entries indicate an edge between two nodes #' with a given edge weight. There can be a varying number of non-zero entries #' in each row/column. #' @param X Optional input data. Used only for PCA-based initialization. #' @param n_components The dimension of the space to embed into. This defaults #' to \code{2} to provide easy visualization, but can reasonably be set to any #' integer value in the range \code{2} to \code{100}. #' @param n_epochs Number of epochs to use during the optimization of the #' embedded coordinates. By default, this value is set to \code{500} for #' datasets containing 10,000 vertices or less, and \code{200} otherwise. #' If \code{n_epochs = 0}, then coordinates determined by \code{"init"} will #' be returned. #' For UMAP, the default is \code{"none"}. #' @param learning_rate Initial learning rate used in optimization of the #' coordinates. #' @param init Type of initialization for the coordinates. Options are: #' \itemize{ #' \item \code{"spectral"} Spectral embedding using the normalized Laplacian #' of the fuzzy 1-skeleton, with Gaussian noise added. #' \item \code{"normlaplacian"}. Spectral embedding using the normalized #' Laplacian of the fuzzy 1-skeleton, without noise. #' \item \code{"random"}. Coordinates assigned using a uniform random #' distribution between -10 and 10. #' \item \code{"lvrandom"}. Coordinates assigned using a Gaussian #' distribution with standard deviation 1e-4, as used in LargeVis #' (Tang et al., 2016) and t-SNE. #' \item \code{"laplacian"}. Spectral embedding using the Laplacian Eigenmap. #' \item \code{"pca"}. The first two principal components from PCA of #' \code{X} if \code{X} is a data frame, and from a 2-dimensional classical #' MDS if \code{X} is of class \code{"dist"}. #' \item \code{"spca"}. Like \code{"pca"}, but each dimension is then scaled #' so the standard deviation is 1e-4, to give a distribution similar to that #' used in t-SNE. This is an alias for \code{init = "pca", init_sdev = #' 1e-4}. #' \item \code{"agspectral"} An "approximate global" modification of #' \code{"spectral"} which all edges in the graph to a value of 1, and then #' sets a random number of edges (\code{negative_sample_rate} edges per #' vertex) to 0.1, to approximate the effect of non-local affinities. #' \item A matrix of initial coordinates. #' } #' For spectral initializations, (\code{"spectral"}, \code{"normlaplacian"}, #' \code{"laplacian"}, \code{"agspectral"}), if more than one connected #' component is identified, no spectral initialization is attempted. Instead #' a PCA-based initialization is attempted. If \code{verbose = TRUE} the #' number of connected components are logged to the console. The existence of #' multiple connected components implies that a global view of the data cannot #' be attained with this initialization. Increasing the value of #' \code{n_neighbors} may help. #' @param init_sdev If non-\code{NULL}, scales each dimension of the initialized #' coordinates (including any user-supplied matrix) to this standard #' deviation. By default no scaling is carried out, except when \code{init = #' "spca"}, in which case the value is \code{0.0001}. Scaling the input may #' help if the unscaled versions result in initial coordinates with large #' inter-point distances or outliers. This usually results in small gradients #' during optimization and very little progress being made to the layout. #' Shrinking the initial embedding by rescaling can help under these #' circumstances. Scaling the result of \code{init = "pca"} is usually #' recommended and \code{init = "spca"} as an alias for \code{init = "pca", #' init_sdev = 1e-4} but for the spectral initializations the scaled versions #' usually aren't necessary unless you are using a large value of #' \code{n_neighbors} (e.g. \code{n_neighbors = 150} or higher). For #' compatibility with recent versions of the Python UMAP package, if you are #' using \code{init = "spectral"}, then you should also set #' \code{init_sdev = "range"}, which will range scale each of the columns #' containing the initial data between 0-10. This is not set by default to #' maintain backwards compatibility with previous versions of uwot. #' @param spread The effective scale of embedded points. In combination with #' \code{min_dist}, this determines how clustered/clumped the embedded points #' are. #' @param min_dist The effective minimum distance between embedded points. #' Smaller values will result in a more clustered/clumped embedding where #' nearby points on the manifold are drawn closer together, while larger #' values will result on a more even dispersal of points. The value should be #' set relative to the \code{spread} value, which determines the scale at #' which embedded points will be spread out. #' @param repulsion_strength Weighting applied to negative samples in low #' dimensional embedding optimization. Values higher than one will result in #' greater weight being given to negative samples. #' @param negative_sample_rate The number of negative edge/1-simplex samples to #' use per positive edge/1-simplex sample in optimizing the low dimensional #' embedding. #' @param a More specific parameters controlling the embedding. If \code{NULL} #' these values are set automatically as determined by \code{min_dist} and #' \code{spread}. #' @param b More specific parameters controlling the embedding. If \code{NULL} #' these values are set automatically as determined by \code{min_dist} and #' \code{spread}. #' @param approx_pow If \code{TRUE}, use an approximation to the power function #' in the UMAP gradient, from #' \url{https://martin.ankerl.com/2012/01/25/optimized-approximative-pow-in-c-and-cpp/}. #' @param method Cost function to optimize. One of: #' \itemize{ #' \item{\code{"umap"}}. The UMAP method of McInnes and co-workers (2018). #' \item{\code{"tumap"}}. UMAP with the \code{a} and \code{b} parameters fixed #' to 1. #' \item{\code{"largevis"}}. The LargeVis method Tang and co-workers (2016). #' } #' @param pca_method Method to carry out any PCA dimensionality reduction when #' the \code{pca} parameter is specified. Allowed values are: #' \itemize{ #' \item{\code{"irlba"}}. Uses \code{\link[irlba]{prcomp_irlba}} from the #' \href{https://cran.r-project.org/package=irlba}{irlba} package. #' \item{\code{"rsvd"}}. Uses 5 iterations of \code{\link[irlba]{svdr}} from #' the \href{https://cran.r-project.org/package=irlba}{irlba} package. #' This is likely to give much faster but potentially less accurate results #' than using \code{"irlba"}. For the purposes of nearest neighbor #' calculation and coordinates initialization, any loss of accuracy doesn't #' seem to matter much. #' \item{\code{"bigstatsr"}}. Uses \code{\link[bigstatsr]{big_randomSVD}} #' from the \href{https://cran.r-project.org/package=bigstatsr}{bigstatsr} #' package. The SVD methods used in \code{bigstatsr} may be faster on #' systems without access to efficient linear algebra libraries (e.g. #' Windows). \strong{Note}: \code{bigstatsr} is \emph{not} a dependency of #' uwot: if you choose to use this package for PCA, you \emph{must} install #' it yourself. #' \item{\code{"svd"}}. Uses \code{\link[base]{svd}} for the SVD. This is #' likely to be slow for all but the smallest datasets. #' \item{\code{"auto"}} (the default). Uses \code{"irlba"}, unless more than #' 50% of the full set of singular vectors would be calculated, in which #' case \code{"svd"} is used. #' } #' @param pcg_rand If \code{TRUE}, use the PCG random number generator (O'Neill, #' 2014) during optimization. Otherwise, use the faster (but probably less #' statistically good) Tausworthe "taus88" generator. The default is #' \code{TRUE}. #' @param fast_sgd If \code{TRUE}, then the following combination of parameters #' is set: \code{pcg_rand = TRUE}, \code{n_sgd_threads = "auto"} and #' \code{approx_pow = TRUE}. The default is \code{FALSE}. Setting this to #' \code{TRUE} will speed up the stochastic optimization phase, but give a #' potentially less accurate embedding, and which will not be exactly #' reproducible even with a fixed seed. For visualization, \code{fast_sgd = #' TRUE} will give perfectly good results. For more generic dimensionality #' reduction, it's safer to leave \code{fast_sgd = FALSE}. If \code{fast_sgd = #' TRUE}, then user-supplied values of \code{pcg_rand}, \code{n_sgd_threads}, #' and \code{approx_pow} are ignored. #' @param batch If \code{TRUE}, then embedding coordinates are updated at the #' end of each epoch rather than during the epoch. In batch mode, results are #' reproducible with a fixed random seed even with \code{n_sgd_threads > 1}, #' at the cost of a slightly higher memory use. You may also have to modify #' \code{learning_rate} and increase \code{n_epochs}, so whether this provides #' a speed increase over the single-threaded optimization is likely to be #' dataset and hardware-dependent. #' @param n_sgd_threads Number of threads to use during stochastic gradient #' descent. If set to > 1, then be aware that if \code{batch = FALSE}, results #' will \emph{not} be reproducible, even if \code{set.seed} is called with a #' fixed seed before running. If set to \code{"auto"} then half the number of #' concurrent threads supported by the system will be used. #' @param grain_size The minimum amount of work to do on each thread. If this #' value is set high enough, then less than \code{n_threads} or #' \code{n_sgd_threads} will be used for processing, which might give a #' performance improvement if the overhead of thread management and context #' switching was outweighing the improvement due to concurrent processing. #' This should be left at default (\code{1}) and work will be spread evenly #' over all the threads specified. #' @param verbose If \code{TRUE}, log details to the console. #' @param opt_args A list of optimizer parameters, used when #' \code{batch = TRUE}. The default optimization method used is Adam (Kingma #' and Ba, 2014). #' \itemize{ #' \item \code{method} The optimization method to use. Either \code{"adam"} #' or \code{"sgd"} (stochastic gradient descent). Default: \code{"adam"}. #' \item \code{beta1} (Adam only). The weighting parameter for the #' exponential moving average of the first moment estimator. Effectively the #' momentum parameter. Should be a floating point value between 0 and 1. #' Higher values can smooth oscillatory updates in poorly-conditioned #' situations and may allow for a larger \code{learning_rate} to be #' specified, but too high can cause divergence. Default: \code{0.5}. #' \item \code{beta2} (Adam only). The weighting parameter for the #' exponential moving average of the uncentered second moment estimator. #' Should be a floating point value between 0 and 1. Controls the degree of #' adaptivity in the step-size. Higher values put more weight on previous #' time steps. Default: \code{0.9}. #' \item \code{eps} (Adam only). Intended to be a small value to prevent #' division by zero, but in practice can also affect convergence due to its #' interaction with \code{beta2}. Higher values reduce the effect of the #' step-size adaptivity and bring the behavior closer to stochastic gradient #' descent with momentum. Typical values are between 1e-8 and 1e-3. Default: #' \code{1e-7}. #' \item \code{alpha} The initial learning rate. Default: the value of the #' \code{learning_rate} parameter. #' } #' @param epoch_callback A function which will be invoked at the end of every #' epoch. Its signature should be: \code{(epoch, n_epochs, coords)}, where: #' \itemize{ #' \item \code{epoch} The current epoch number (between \code{1} and #' \code{n_epochs}). #' \item \code{n_epochs} Number of epochs to use during the optimization of #' the embedded coordinates. #' \item \code{coords} The embedded coordinates as of the end of the current #' epoch, as a matrix with dimensions (N, \code{n_components}). #' } #' @param binary_edge_weights If \code{TRUE} then edge weights in the input #' graph are treated as binary (0/1) rather than real valued. #' @return A matrix of optimized coordinates. #' #' @examples #' #' iris30 <- iris[c(1:10, 51:60, 101:110), ] #' #' # return a 30 x 30 sparse matrix with similarity data based on 10 nearest #' # neighbors per item #' iris30_sim_graph <- similarity_graph(iris30, n_neighbors = 10) #' # produce 2D coordinates replicating the neighbor relations in the similarity #' # graph #' set.seed(42) #' iris30_opt <- optimize_graph_layout(iris30_sim_graph, X = iris30) #' #' # the above two steps are the same as: #' # set.seed(42); iris_umap <- umap(iris30, n_neighbors = 10) #' #' @references #' Kingma, D. P., & Ba, J. (2014). #' Adam: A method for stochastic optimization. #' \emph{arXiv preprint} \emph{arXiv}:1412.6980. #' \url{https://arxiv.org/abs/1412.6980} #' #' McInnes, L., Healy, J., & Melville, J. (2018). #' UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction #' \emph{arXiv preprint} \emph{arXiv}:1802.03426. #' \url{https://arxiv.org/abs/1802.03426} #' #' Oâ€™Neill, M. E. (2014). #' \emph{PCG: A family of simple fast space-efficient statistically good #' algorithms for random number generation} #' (Report No. HMC-CS-2014-0905). Harvey Mudd College. #' #' Tang, J., Liu, J., Zhang, M., & Mei, Q. (2016, April). #' Visualizing large-scale and high-dimensional data. #' In \emph{Proceedings of the 25th International Conference on World Wide Web} #' (pp. 287-297). #' International World Wide Web Conferences Steering Committee. #' \url{https://arxiv.org/abs/1602.00370} #' #' @export optimize_graph_layout <- function(graph, X = NULL, n_components = 2, n_epochs = NULL, learning_rate = 1, init = "spectral", init_sdev = NULL, spread = 1, min_dist = 0.01, repulsion_strength = 1.0, negative_sample_rate = 5.0, a = NULL, b = NULL, method = "umap", approx_pow = FALSE, pcg_rand = TRUE, fast_sgd = FALSE, n_sgd_threads = 0, grain_size = 1, verbose = getOption("verbose", TRUE), batch = FALSE, opt_args = NULL, epoch_callback = NULL, pca_method = NULL, binary_edge_weights = FALSE) { if (!is_sparse_matrix(graph)) { stop("graph should be a sparse matrix") } if (nrow(graph) != ncol(graph)) { stop("graph should be a square matrix") } if (!Matrix::isSymmetric(graph)) { stop("graph should be symmetric") } if (!all(diff(graph@p) > 0)) { stop("All items must have at least one neighbor similarity defined") } # Just do things the UMAP way or we will have a very slow largevis # optimization if (is.null(n_epochs)) { n_vertices <- nrow(graph) if (n_vertices <= 10000) { n_epochs <- 500 } else { n_epochs <- 200 } } uwot( X = X, nn_method = graph, is_similarity_graph = TRUE, n_components = n_components, n_epochs = n_epochs, alpha = learning_rate, init = init, init_sdev = init_sdev, spread = spread, min_dist = min_dist, gamma = repulsion_strength, negative_sample_rate = negative_sample_rate, a = a, b = b, method = method, approx_pow = approx_pow, pcg_rand = pcg_rand, fast_sgd = fast_sgd, n_sgd_threads = n_sgd_threads, grain_size = grain_size, verbose = verbose, batch = batch, opt_args = opt_args, epoch_callback = epoch_callback, pca_method = pca_method ) } #' Merge Similarity Graph by Simplicial Set Union #' #' Combine two similarity graphs by treating them as fuzzy topological sets and #' forming the union. #' #' @param x A sparse matrix representing the first similarity graph in the union #' operation. #' @param y A sparse matrix representing the second similarity graph in the #' union operation. #' @param n_threads Number of threads to use when resetting the local metric. #' Default is half the number of concurrent threads supported by the system. #' @param verbose If \code{TRUE}, log progress to the console. #' @returns A sparse matrix containing the union of \code{x} and \code{y}. #' @examples #' #' # Form two different "views" of the same data #' iris30 <- iris[c(1:10, 51:60, 101:110), ] #' iris_sg12 <- similarity_graph(iris30[, 1:2], n_neighbors = 5) #' iris_sg34 <- similarity_graph(iris30[, 3:4], n_neighbors = 5) #' #' # Combine the two representations into one #' iris_combined <- simplicial_set_union(iris_sg12, iris_sg34) #' #' # Optimize the layout based on the combined view #' iris_combined_umap <- optimize_graph_layout(iris_combined, n_epochs = 100) #' @export simplicial_set_union <- function(x, y, n_threads = NULL, verbose = FALSE) { if (!is_sparse_matrix(x)) { stop("similarity graph x must be a sparse matrix") } if (!is_sparse_matrix(y)) { stop("similarity graph y must be a sparse matrix") } if (!all(dim(x) == dim(y))) { stop("x and y must have identical dimensions") } z <- methods::as(x + y, "TsparseMatrix") z@x <- general_sset_union_cpp( x@p, x@i, x@x, y@p, y@i, y@x, z@i, z@j, z@x ) z <- Matrix::drop0(z) reset_local_connectivity( z, reset_local_metric = TRUE, n_threads = n_threads, verbose = verbose ) } #' Merge Similarity Graph by Simplicial Set Intersection #' #' Combine two similarity graphs by treating them as fuzzy topological sets and #' forming the intersection. #' #' @param x A sparse matrix representing the first similarity graph in the #' intersection operation. #' @param y A sparse matrix representing the second similarity graph in the #' intersection operation. #' @param weight A value between \code{0 - 1}, controlling the relative #' influence of \code{x} and \code{y} in the intersection. Default #' (\code{0.5}) gives equal influence. Values smaller than \code{0.5} put more #' weight on \code{x}. Values greater than \code{0.5} put more weight on #' \code{y}. #' @param n_threads Number of threads to use when resetting the local metric. #' Default is half the number of concurrent threads supported by the system. #' @param verbose If \code{TRUE}, log progress to the console. #' @returns A sparse matrix containing the intersection of \code{x} and #' \code{y}. #' @examples #' #' # Form two different "views" of the same data #' iris30 <- iris[c(1:10, 51:60, 101:110), ] #' iris_sg12 <- similarity_graph(iris30[, 1:2], n_neighbors = 5) #' iris_sg34 <- similarity_graph(iris30[, 3:4], n_neighbors = 5) #' #' # Combine the two representations into one #' iris_combined <- simplicial_set_intersect(iris_sg12, iris_sg34) #' #' # Optimize the layout based on the combined view #' iris_combined_umap <- optimize_graph_layout(iris_combined, n_epochs = 100) #' @export simplicial_set_intersect <- function(x, y, weight = 0.5, n_threads = NULL, verbose = FALSE) { if (weight < 0 || weight > 1) { stop("weight must be between 0-1") } if (!is_sparse_matrix(x)) { stop("similarity graph x must be a sparse matrix") } if (!is_sparse_matrix(y)) { stop("similarity graph y must be a sparse matrix") } if (!all(dim(x) == dim(y))) { stop("x and y must have identical dimensions") } set_intersect( A = x, B = y, weight = weight, reset_connectivity = TRUE, reset_local_metric = TRUE, n_threads = n_threads, verbose = verbose ) } # Function that does all the real work uwot <- function(X, n_neighbors = 15, n_components = 2, metric = "euclidean", n_epochs = NULL, alpha = 1, scale = FALSE, init = "spectral", init_sdev = NULL, spread = 1, min_dist = 0.01, set_op_mix_ratio = 1.0, local_connectivity = 1.0, bandwidth = 1.0, gamma = 1.0, negative_sample_rate = 5.0, a = NULL, b = NULL, nn_method = NULL, n_trees = 50, search_k = 2 * n_neighbors * n_trees, method = "umap", perplexity = 50, approx_pow = FALSE, y = NULL, target_n_neighbors = n_neighbors, target_metric = "euclidean", target_weight = 0.5, n_threads = NULL, n_sgd_threads = 0, grain_size = 1, kernel = "gauss", ret_model = FALSE, ret_nn = FALSE, ret_fgraph = FALSE, ret_sigma = FALSE, ret_localr = FALSE, pca = NULL, pca_center = TRUE, pca_method = NULL, pcg_rand = TRUE, fast_sgd = FALSE, batch = FALSE, opt_args = NULL, tmpdir = tempdir(), verbose = getOption("verbose", TRUE), epoch_callback = NULL, binary_edge_weights = FALSE, dens_scale = NULL, is_similarity_graph = FALSE, seed = NULL, nn_args = list(), sparse_X_is_distance_matrix = TRUE) { if (is.null(n_threads)) { n_threads <- default_num_threads() } method <- match.arg(tolower(method), c("umap", "tumap", "largevis")) if (method == "umap") { if (is.null(a) || is.null(b)) { ab_res <- find_ab_params(spread = spread, min_dist = min_dist) a <- ab_res[1] b <- ab_res[2] tsmessage("UMAP embedding parameters a = ", formatC(a), " b = ", formatC(b)) } else { # set min_dist and spread to NULL so if ret_model = TRUE, their default # values are not mistaken for having been used for anything min_dist <- NULL spread <- NULL } } if (n_neighbors < 2) { stop("n_neighbors must be >= 2") } if (set_op_mix_ratio < 0.0 || set_op_mix_ratio > 1.0) { stop("set_op_mix_ratio must be between 0.0 and 1.0") } if (local_connectivity < 1.0) { stop("local_connectivity cannot be < 1.0") } if (!is.null(y) && is.numeric(y) && any(is.na(y))) { stop("numeric y cannot contain NA") } if (!is.numeric(n_components) || n_components < 1) { stop("'n_components' must be a positive integer") } if (!is.null(pca)) { if (!is.numeric(pca) || pca < 1) { stop("'pca' must be a positive integer") } if (pca < n_components) { stop("'pca' must be >= n_components") } if (pca > min(nrow(X), na.rm = col(X))) { stop("'pca' must be <= min(nrow(X), ncol(X))") } } if (is.null(pca_method)) { pca_method <- "auto" } pca_method <- match.arg(pca_method, choices = c("irlba", "svdr", "bigstatsr", "svd", "auto") ) if (fast_sgd) { n_sgd_threads <- "auto" pcg_rand <- FALSE approx_pow <- TRUE } if (n_threads < 0) { stop("n_threads cannot be < 0") } if (n_threads %% 1 != 0) { n_threads <- round(n_threads) tsmessage("Non-integer 'n_threads' provided. Setting to ", n_threads) } if (n_sgd_threads == "auto") { n_sgd_threads <- n_threads } if (n_sgd_threads < 0) { stop("n_sgd_threads cannot be < 0") } if (n_sgd_threads %% 1 != 0) { n_sgd_threads <- round(n_sgd_threads) tsmessage("Non-integer 'n_sgd_threads' provided. Setting to ", n_sgd_threads) } if (!is.null(dens_scale) && approx_pow) { warning("approx_pow parameter is ignored when using dens_scale") approx_pow <- FALSE } # 110: for more consistent reproducibility set a user-supplied seed if (!is.null(seed)) { tsmessage("Setting random seed ", seed) set.seed(seed) } if (is.character(nn_method) && nn_method == "hnsw") { if (!is_installed("RcppHNSW")) { stop("RcppHNSW is required for nn_method = 'hnsw', please install it") } if (!is_ok_hnsw_metric(metric)) { stop( "bad metric: hnsw only supports 'euclidean', 'cosine' or ", "'correlation' metrics" ) } if (!is_ok_hnsw_metric(target_metric)) { stop( "bad target_metric: hnsw only supports 'euclidean', 'cosine' or ", "'correlation' metrics" ) } } if (is.character(nn_method) && nn_method == "nndescent") { if (!is_installed("rnndescent")) { stop("rnndescent is required for nn_method = 'nndescent',", " please install it") } } ret_extra <- ret_model || ret_nn || ret_fgraph || ret_sigma || ret_localr # Store categorical columns to be used to generate the graph Xcat <- NULL # number of original columns in data frame (or matrix) # will be used only if using df or matrix and ret_model = TRUE norig_col <- NULL # row names for the input data, which we will apply to the embedding if # needed Xnames <- NULL num_precomputed_nns <- 0 if (is.null(X)) { if (!nn_is_precomputed(nn_method)) { stop("If X is NULL, must provide NN data in nn_method") } if (is.character(init) && tolower(init) %in% c("spca", "pca")) { stop("init = 'pca' and 'spca' can't be used with X = NULL") } if (length(nn_method) == 0) { stop("Incorrect format for precalculated neighbor data") } n_vertices <- x2nv(nn_method) stopifnot(n_vertices > 0) num_precomputed_nns <- check_graph_list(nn_method, n_vertices, bipartite = FALSE ) Xnames <- nn_graph_row_names_list(nn_method) } else if (inherits(X, "dist")) { if (ret_model) { stop("Can only create models with dense matrix or data frame input") } checkna(X) n_vertices <- attr(X, "Size") tsmessage("Read ", n_vertices, " rows") Xnames <- labels(X) } else if (is_sparse_matrix(X) && sparse_X_is_distance_matrix) { if (ret_model) { stop("Can only create models with dense matrix or data frame input") } checkna(X) n_vertices <- nrow(X) if (ncol(X) != n_vertices) { stop("Sparse matrices are only supported as distance matrices") } tsmessage("Read ", n_vertices, " rows of sparse distance matrix") Xnames <- row.names(X) } else { cat_ids <- NULL norig_col <- ncol(X) if (methods::is(X, "data.frame") || methods::is(X, "matrix") || is_sparse_matrix(X)) { cat_res <- find_categoricals(metric) metric <- cat_res$metrics
cat_ids <- cat_res$categoricals # Convert categorical columns to factors if they aren't already if (!is.null(cat_ids)) { X[, cat_ids] <- sapply(X[, cat_ids, drop = FALSE], factor, simplify = methods::is(X, "matrix") ) Xcat <- X[, cat_ids, drop = FALSE] } if (methods::is(X, "data.frame")) { indexes <- which(vapply(X, is.numeric, logical(1))) if (length(indexes) == 0) { stop("No numeric columns found") } tsmessage("Converting dataframe to numerical matrix") if (length(indexes) != ncol(X)) { X <- X[, indexes] } X <- as.matrix(X) } if (n_components > ncol(X)) { warning( "n_components ", "> number of columns in input data: ", n_components, " > ", ncol(X), ", this may give poor or unexpected results" ) } } else { stop("Unknown input data format") } checkna(X) n_vertices <- nrow(X) tsmessage( "Read ", n_vertices, " rows and found ", ncol(X), " numeric columns", appendLF = is.null(cat_ids) ) if (length(cat_ids) > 0) { tsmessage(" and ", pluralize("categorical column", length(cat_ids)), time_stamp = FALSE ) } Xnames <- row.names(X) X <- remove_scaling_attrs(X) X <- scale_input(X, scale_type = scale, ret_model = ret_model, verbose = verbose ) } # Store number of precomputed nn if X is non-NULL (NULL X case handled above) if (nn_is_precomputed(nn_method) && num_precomputed_nns == 0) { num_precomputed_nns <- check_graph_list(nn_method, n_vertices, bipartite = FALSE ) if (is.null(Xnames)) { Xnames <- nn_graph_row_names_list(nn_method) } } if (method == "largevis" && kernel == "knn") { n_neighbors <- perplexity } if (max(n_neighbors) > n_vertices) { # pre-calculated nearest neighbors ignores the user-supplied n_neighbors # which is handled later if (!is.list(nn_method)) { if (method == "largevis") { # for LargeVis, n_neighbors normally determined from perplexity not an # error to be too large tsmessage("Setting n_neighbors to ", n_vertices) n_neighbors <- n_vertices } else { stop("n_neighbors must be smaller than the dataset size") } } } if (!is.list(metric)) { metrics <- list(c()) names(metrics) <- metric } else { metrics <- metric } # For typical case of numeric matrix X and not using binary metric, save # PCA results here in case initialization uses PCA too pca_models <- NULL pca_shortcut <- FALSE if (!is.null(pca) && length(metric) == 1 && !is_binary_metric(metric) && is.matrix(X) && ncol(X) > pca) { tsmessage("Reducing X column dimension to ", pca, " via PCA") pca_res <- pca_init(X, ndim = pca, center = pca_center, pca_method = pca_method, ret_extra = ret_model, verbose = verbose ) if (ret_model) { X <- pca_res$scores
pca_models[["1"]] <- pca_res[c("center", "rotation")]
pca_res <- NULL
} else {
X <- pca_res
}
pca_shortcut <- TRUE
}

if (is_similarity_graph) {
d2sr <-
list(
V = nn_method,
nns = NULL,
pca_models = NULL,
sigma = NULL,
rho = NULL
)
need_sigma <- FALSE
} else {
need_sigma <- ret_sigma || ret_localr || !is.null(dens_scale)
d2sr <- data2set(
X,
Xcat,
n_neighbors,
metrics,
nn_method,
n_trees,
search_k,
method,
set_op_mix_ratio,
local_connectivity,
bandwidth,
perplexity,
kernel,
need_sigma,
grain_size,
ret_model,
pca = pca,
pca_center = pca_center,
pca_method = pca_method,
n_vertices = n_vertices,
nn_args = nn_args,
tmpdir = tmpdir,
sparse_is_distance = sparse_X_is_distance_matrix,
verbose = verbose
)
}
V <- d2sr$V nns <- d2sr$nns
if (is.null(pca_models)) {
pca_models <- d2sr$pca_models } # Calculate approximate local radii sigma <- NULL rho <- NULL localr <- NULL dint <- NULL if (need_sigma) { sigma <- d2sr$sigma
rho <- d2sr$rho dint <- d2sr$dint
}
if (!is.null(dens_scale) || ret_localr) {
localr <- sigma + rho
}

if (!is.null(y)) {
tsmessage("Processing y data")

if (!is.list(target_metric)) {
target_metrics <- list(c())
names(target_metrics) <- target_metric
} else {
target_metrics <- target_metric
}

ycat <- NULL
ycat_ids <- NULL
if (methods::is(y, "data.frame")) {
ycat_res <- find_categoricals(target_metric)
target_metric <- ycat_res$metrics ycat_ids <- ycat_res$categoricals
if (!is.null(ycat_ids)) {
ycat <- y[, ycat_ids, drop = FALSE]
} else {
ycindexes <- which(vapply(y, is.factor, logical(1)))
if (length(ycindexes) > 0) {
ycat <- (y[, ycindexes, drop = FALSE])
}
}

yindexes <- which(vapply(y, is.numeric, logical(1)))

if (length(yindexes) > 0) {
y <- as.matrix(y[, yindexes])
} else {
y <- NULL
}
} else if (is.list(y)) {
nn_method <- y
} else if (is.numeric(y)) {
y <- as.matrix(y)
} else if (is.factor(y)) {
ycat <- data.frame(y)
y <- NULL
}

if (!is.null(y)) {
yd2sr <- data2set(y, ycat, target_n_neighbors, target_metrics, nn_method,
n_trees, search_k,
method,
set_op_mix_ratio = 1.0,
local_connectivity = 1.0,
bandwidth = 1.0,
perplexity = perplexity, kernel = kernel,
ret_sigma = FALSE,
ret_model = FALSE,
pca = pca, pca_center = TRUE, pca_method = pca_method,
n_vertices = n_vertices,
tmpdir = tmpdir,
verbose = verbose
)

tsmessage(
"Intersecting X and Y sets with target weight = ",
formatC(target_weight)
)
# behavior for supervised UMAP: do reset local connectivity
# don't reset metric (same as Python UMAP as of 0.5.3)
V <- set_intersect(V, yd2sr$V, target_weight, reset_connectivity = TRUE) yd2sr$V <- NULL
yd2sr$nns <- NULL } else if (!is.null(ycat)) { V <- categorical_intersection_df(ycat, V, weight = target_weight, verbose = verbose ) } } if (!(ret_model || ret_nn)) { nns <- NULL gc() } if (methods::is(init, "matrix")) { if (nrow(init) != n_vertices || ncol(init) != n_components) { stop("init matrix does not match necessary configuration for X: ", "should have dimensions (", n_vertices, ", ", n_components, ")") } tsmessage("Initializing from user-supplied matrix") embedding <- scale_coords(init, init_sdev, verbose = verbose) } else if (!(methods::is(init, "character") && length(init) == 1)) { if (is.null(init) && !is.null(n_epochs) && n_epochs == 0) { embedding <- NULL if (!ret_extra) { warning( "Neither high-dimensional nor low-dimensional data will be ", "returned with this combination of settings" ) } if (ret_model) { warning( "Returning a model but it will not be valid for transforming ", "new data" ) } } else { stop( "init should be either a matrix or string describing the ", "initialization method" ) } } else { init <- match.arg(tolower(init), c( "spectral", "random", "lvrandom", "normlaplacian", "laplacian", "spca", "pca", "inormlaplacian", "ispectral", "agspectral", "irlba_spectral", "irlba_laplacian", "pacpca" )) if (init_is_spectral(init)) { connected <- connected_components(V) if (connected$n_components > 1) {
tsmessage("Found ", connected$n_components, " connected components, ", appendLF = FALSE) if (is.null(X)) { tsmessage("falling back to random initialization", time_stamp = FALSE) init <- "random" } else { tsmessage("falling back to 'spca' initialization with init_sdev = 1", time_stamp = FALSE ) init <- "spca" init_sdev <- 1 } } } # Don't repeat PCA initialization if we've already done it once if (pca_shortcut && init %in% c("spca", "pca", "pacpca") && pca >= n_components) { embedding <- X[, 1:n_components] switch(init, spca = tsmessage("Initializing from scaled PCA"), pca = tsmessage("Initializing from PCA"), pacpca = tsmessage("Initializing from PaCMAP-style PCA"), stop("Unknown init method '", init, "'") ) } else { embedding <- switch(init, spectral = spectral_init(V, ndim = n_components, verbose = verbose), random = rand_init(n_vertices, n_components, verbose = verbose), lvrandom = rand_init_lv(n_vertices, n_components, verbose = verbose), normlaplacian = normalized_laplacian_init(V, ndim = n_components, verbose = verbose ), laplacian = laplacian_eigenmap(V, ndim = n_components, verbose = verbose), # we handle scaling pca below spca = pca_init(X, ndim = n_components, pca_method = pca_method, verbose = verbose ), pca = pca_init(X, ndim = n_components, pca_method = pca_method, verbose = verbose ), pacpca = pca_init(X, ndim = n_components, pca_method = pca_method, verbose = verbose ), ispectral = irlba_spectral_init(V, ndim = n_components, verbose = verbose), inormlaplacian = irlba_normalized_laplacian_init(V, ndim = n_components, verbose = verbose ), agspectral = agspectral_init(V, n_neg_nbrs = negative_sample_rate, ndim = n_components, verbose = verbose ), irlba_spectral = spectral_init(V, ndim = n_components, verbose = verbose, force_irlba = TRUE), irlba_laplacian = laplacian_eigenmap(V, ndim = n_components, verbose = verbose, force_irlba = TRUE), stop("Unknown initialization method: '", init, "'") ) } if (init == "pacpca") { embedding <- 0.01 * embedding } if (!is.null(init_sdev) || init == "spca") { if (is.null(init_sdev)) { init_sdev <- 1e-4 } embedding <- scale_coords(embedding, init_sdev, verbose = verbose) } } if (any(is.na(embedding))) { stop("Initial data contains NA values: is n_components too high?") } if (is.null(n_epochs) || n_epochs < 0) { if (method == "largevis") { n_epochs <- lvish_epochs(n_vertices, V) } else { if (n_vertices <= 10000) { n_epochs <- 500 } else { n_epochs <- 200 } } } full_opt_args <- get_opt_args(opt_args, alpha) if (binary_edge_weights) { V@x <- rep(1, length(V@x)) } if (n_epochs > 0) { if (any(apply(embedding, 2, stats::sd) > 10.0)) { warning( "Initial embedding standard deviation > 10.0, this can lead to ", "poor optimization" ) } # remove edges which can't be sampled due to n_epochs V@x[V@x < max(V@x) / n_epochs] <- 0 V <- Matrix::drop0(V) # Create the (0-indexed) indices of the head and tail of each directed edge # in V. Graph is symmetric, so both (i->j) and (j->i) are present if (batch) { V <- Matrix::t(V) # head is ordered in non-decreasing order of node index positive_head <- Matrix::which(V != 0, arr.ind = TRUE)[, 2] - 1 # tail is unordered positive_tail <- V@i } else { # Use the Python UMAP ordering # head is unordered positive_head <- V@i # tail is ordered in non-decreasing order of node index positive_tail <- Matrix::which(V != 0, arr.ind = TRUE)[, 2] - 1 } # start/end pointers into the ordered vector positive_ptr <- V@p epochs_per_sample <- make_epochs_per_sample(V@x, n_epochs) tsmessage( "Commencing optimization for ", n_epochs, " epochs, with ", length(positive_head), " positive edges", pluralize("thread", n_sgd_threads, " using") ) ai <- NULL if (!is.null(dens_scale)) { ai <- scale_radii(localr, dens_scale, a) method <- "leopold" if (ret_model) { # store the linear transform from localr to ai for transforming new data lai2 <- 2 * log(range(ai)) llr <- -log(rev(range(localr))) rad_coeff <- stats::lm(lai2 ~ llr)$coefficients
}
}

method <- tolower(method)
method_args <- switch(method,
umap = list(a = a, b = b, gamma = gamma, approx_pow = approx_pow),
tumap = list(),
# a = 1 b = 10 for final phase of PaCMAP optimization
pacmap = list(a = a, b = b),
largevis = list(gamma = gamma),
leopold = list(ai = ai, b = b, ndim = n_components),
stop("Unknown dimensionality reduction method '", method, "'")
)

embedding <- t(embedding)
embedding <- optimize_layout_r(
tail_embedding = NULL,
positive_tail = positive_tail,
positive_ptr = positive_ptr,
n_epochs = n_epochs,
n_tail_vertices = n_vertices,
epochs_per_sample = epochs_per_sample,
method = method,
method_args = method_args,
initial_alpha = alpha,
opt_args = full_opt_args,
negative_sample_rate = negative_sample_rate,
pcg_rand = pcg_rand,
batch = batch,
grain_size = grain_size,
move_other = TRUE,
epoch_callback = epoch_callback,
verbose = verbose
)
embedding <- t(embedding)

gc()
# Center the points before returning
embedding <- scale(embedding, center = TRUE, scale = FALSE)

if (is.null(row.names(embedding)) &&
!is.null(Xnames) && length(Xnames) == nrow(embedding)) {
row.names(embedding) <- Xnames
}
tsmessage("Optimization finished")
}

if (ret_extra) {
nblocks <- length(nns)
res <- list(embedding = embedding)
if (ret_model) {
res <- append(res, list(
scale_info = if (!is.null(X)) {
attr_to_scale_info(X)
} else {
NULL
},
search_k = search_k,
local_connectivity = local_connectivity,
n_epochs = n_epochs,
alpha = alpha,
negative_sample_rate = negative_sample_rate,
method = method,
a = a,
b = b,
gamma = gamma,
approx_pow = approx_pow,
metric = metrics,
norig_col = norig_col,
pcg_rand = pcg_rand,
batch = batch,
opt_args = full_opt_args,
num_precomputed_nns = num_precomputed_nns,
# #95: min_dist and spread are exported for documentation purposes only
min_dist = min_dist,
binary_edge_weights = binary_edge_weights,
seed = seed,
nn_method = nn_method,
nn_args = nn_args
))
if (nn_is_precomputed(nn_method)) {
res$n_neighbors <- nn_graph_nbrs_list(nn_method) } else { res$n_neighbors <- n_neighbors
}
if (method == "leopold") {
res$dens_scale <- dens_scale res$ai <- ai
res$rad_coeff <- rad_coeff } if (nblocks > 1) { if (!nn_is_precomputed(nn_method)) { res$nn_index <- list()
for (i in 1:nblocks) {
res$nn_index[[i]] <- nns[[i]]$index
}
}
} else {
if (!is.null(nns[[1]]$index)) { res$nn_index <- nns[[1]]$index if (is.null(res$metric[[1]])) {
# 31: Metric usually lists column indices or names, NULL means use all
# of them, but for loading the NN index we need the number of
# columns explicitly (we don't have access to the column dimension of
# the input data at load time)
if (res$nn_index$type %in% c("annoyv2", "hnswv1", "nndescentv1")) {
res$metric[[1]] <- list(ndim = res$nn_index$ndim) } else { # To be sure of the dimensionality, fetch the first item from the # index and see how many elements are in the returned vector. if (!is.null(X)) { rcppannoy <- get_rcppannoy(res$nn_index)
res$metric[[1]] <- list(ndim = length(rcppannoy$getItemsVector(0)))
} else {
res$metric[[1]] <- list() } } } } else { if (nn_is_precomputed(nn_method)) { tsmessage( "Note: model requested with precomputed neighbors. ", "For transforming new data, distance data must be ", "provided separately" ) } } } if (!is.null(pca_models)) { res$pca_models <- pca_models
}
}
if (ret_nn) {
res$nn <- list() for (i in 1:nblocks) { if (is.list(nns[[i]])) { res$nn[[i]] <- list(idx = nns[[i]]$idx, dist = nns[[i]]$dist)
if (!is.null(Xnames) && nrow(res$nn[[i]]$idx) == length(Xnames)) {
row.names(res$nn[[i]]$idx) <- Xnames
row.names(res$nn[[i]]$dist) <- Xnames
}
} else if (is_sparse_matrix(nns[[i]])) {
res$nn[[i]] <- nns[[i]] if (!is.null(Xnames) && nrow(res$nn[[i]]) == length(Xnames)) {
row.names(res$nn[[i]]) <- Xnames colnames(res$nn[[i]]) <- Xnames
}
}
}
names(res$nn) <- names(nns) } if (ret_fgraph) { if (method == "largevis") { res$P <- V
} else {
res$fgraph <- V } } if (ret_sigma) { res$sigma <- sigma
res$rho <- rho res$dint <- dint
}
if (ret_localr && !is.null(localr)) {
res$localr <- localr } } else { res <- embedding } res } #' Save or Load a Model #' #' Functions to write a UMAP model to a file, and to restore. #' #' @param model a UMAP model create by \code{\link{umap}}. #' @param file name of the file where the model is to be saved or read from. #' @param unload if \code{TRUE}, unload all nearest neighbor indexes for the #' model. The \code{model} will no longer be valid for use in #' \code{\link{umap_transform}} and the temporary working directory used #' during model saving will be deleted. You will need to reload the model with #' \code{load_uwot} to use the model. If \code{FALSE}, then the model can be #' re-used without reloading, but you must manually unload the NN index when #' you are finished using it if you want to delete the temporary working #' directory. To unload manually, use \code{\link{unload_uwot}}. The absolute #' path of the working directory is found in the \code{mod_dir} item of the #' return value. #' @param verbose if \code{TRUE}, log information to the console. #' @return \code{model} with one extra item: \code{mod_dir}, which contains the #' path to the working directory. If \code{unload = FALSE} then this directory #' still exists after this function returns, and can be cleaned up with #' \code{\link{unload_uwot}}. If you don't care about cleaning up this #' directory, or \code{unload = TRUE}, then you can ignore the return value. #' @examples #' iris_train <- iris[c(1:10, 51:60), ] #' iris_test <- iris[100:110, ] #' #' # create model #' model <- umap(iris_train, ret_model = TRUE, n_epochs = 20) #' #' # save without unloading: this leaves behind a temporary working directory #' model_file <- tempfile("iris_umap") #' model <- save_uwot(model, file = model_file) #' #' # The model can continue to be used #' test_embedding <- umap_transform(iris_test, model) #' #' # To manually unload the model from memory when finished and to clean up #' # the working directory (this doesn't touch your model file) #' unload_uwot(model) #' #' # At this point, model cannot be used with umap_transform, this would fail: #' # test_embedding2 <- umap_transform(iris_test, model) #' #' # restore the model: this also creates a temporary working directory #' model2 <- load_uwot(file = model_file) #' test_embedding2 <- umap_transform(iris_test, model2) #' #' # Unload and clean up the loaded model temp directory #' unload_uwot(model2) #' #' # clean up the model file #' unlink(model_file) #' #' # save with unloading: this deletes the temporary working directory but #' # doesn't allow the model to be re-used #' model3 <- umap(iris_train, ret_model = TRUE, n_epochs = 20) #' model_file3 <- tempfile("iris_umap") #' model3 <- save_uwot(model3, file = model_file3, unload = TRUE) #' #' @seealso \code{\link{load_uwot}}, \code{\link{unload_uwot}} #' @export save_uwot <- function(model, file, unload = FALSE, verbose = FALSE) { if (!all_nn_indices_are_loaded(model)) { stop("cannot save: NN index is unloaded") } wd <- getwd() model_file <- abspath(file) if (file.exists(model_file)) { stop("model file ", model_file, " already exists") } tmp_model_file <- NULL tryCatch( { # create directory to store files in mod_dir <- tempfile(pattern = "dir") tsmessage("Creating temp model dir ", mod_dir) dir.create(mod_dir) # create the tempdir/uwot subdirectory uwot_dir <- file.path(mod_dir, "uwot") tsmessage("Creating dir ", mod_dir) dir.create(uwot_dir) # save model file to tempdir/uwot/model model_tmpfname <- file.path(uwot_dir, "model") saveRDS(model, file = model_tmpfname) # save each nn index inside tempdir/uwot/model metrics <- names(model$metric)
n_metrics <- length(metrics)

for (i in 1:n_metrics) {
if (n_metrics == 1) {
nn_index <- model$nn_index } else { nn_index <- model$nn_index[[i]]
}

if (startsWith(nn_index$type, "annoy") || startsWith(nn_index$type, "hnsw")) {

nn_tmpfname <- file.path(uwot_dir, paste0("nn", i))
nn_meta_tmpfname <- file.path(uwot_dir, paste0("nn-meta", i))
nn_index$ann$save(nn_tmpfname)

# save metadata wrapper around the index separately
meta_data <- nn_index
meta_data$ann <- NULL saveRDS(meta_data, file = nn_meta_tmpfname) } else if (startsWith(nn_index$type, "nndescent")) {
nn_tmpfname <- file.path(uwot_dir, paste0("nn", i))
saveRDS(nn_index, file = nn_tmpfname)
}
else {
stop("unsupported nn index type: ", model$nn_index$type)
}
}

# archive the files under the temp dir into the single target file
# change directory so the archive only contains one directory
tsmessage("Changing to ", mod_dir)
setwd(mod_dir)
tmp_model_file <- abspath(file)
tsmessage("Creating ", tmp_model_file)

# #109: Windows 7 tar needs "--force-local" to avoid interpreting colon
# as indicating a remote machine
extra_flags <- ""
if (is_win7()) {
extra_flags <- "--force-local"
}
utils::tar(
tarfile = tmp_model_file,
extra_flags = extra_flags,
files = "uwot/"
)
},
finally = {
setwd(wd)
if (!is.null(tmp_model_file) && model_file != tmp_model_file) {
tsmessage("Copying ", tmp_model_file, " to ", model_file)
file.copy(from = tmp_model_file, to = model_file)
}
model$mod_dir <- mod_dir if (unload) { unload_uwot(model, cleanup = TRUE, verbose = verbose) } } ) model } #' Save or Load a Model #' #' Functions to write a UMAP model to a file, and to restore. #' #' @param file name of the file where the model is to be saved or read from. #' @param verbose if \code{TRUE}, log information to the console. #' @return The model saved at \code{file}, for use with #' \code{\link{umap_transform}}. Additionally, it contains an extra item: #' \code{mod_dir}, which contains the path to the temporary working directory #' used during loading of the model. This directory cannot be removed until #' this model has been unloaded by using \code{\link{unload_uwot}}. #' @examples #' library(RSpectra) #' #' iris_train <- iris[c(1:10, 51:60), ] #' iris_test <- iris[100:110, ] #' #' # create model #' model <- umap(iris_train, ret_model = TRUE, n_epochs = 20) #' #' # save without unloading: this leaves behind a temporary working directory #' model_file <- tempfile("iris_umap") #' model <- save_uwot(model, file = model_file) #' #' # The model can continue to be used #' test_embedding <- umap_transform(iris_test, model) #' #' # To manually unload the model from memory when finished and to clean up #' # the working directory (this doesn't touch your model file) #' unload_uwot(model) #' #' # At this point, model cannot be used with umap_transform, this would fail: #' # test_embedding2 <- umap_transform(iris_test, model) #' #' # restore the model: this also creates a temporary working directory #' model2 <- load_uwot(file = model_file) #' test_embedding2 <- umap_transform(iris_test, model2) #' #' # Unload and clean up the loaded model temp directory #' unload_uwot(model2) #' #' # clean up the model file #' unlink(model_file) #' #' # save with unloading: this deletes the temporary working directory but #' # doesn't allow the model to be re-used #' model3 <- umap(iris_train, ret_model = TRUE, n_epochs = 20) #' model_file3 <- tempfile("iris_umap") #' model3 <- save_uwot(model3, file = model_file3, unload = TRUE) #' #' @seealso \code{\link{save_uwot}}, \code{\link{unload_uwot}} #' @export load_uwot <- function(file, verbose = FALSE) { # create directory to store files in mod_dir <- tempfile(pattern = "dir") tsmessage("Creating temp directory ", mod_dir) dir.create(mod_dir) # #109: Windows 7 tar needs "--force-local" to avoid interpreting colon # as indicating a remote machine extras <- NULL if (is_win7()) { extras <- "--force-local" } utils::untar(abspath(file), exdir = mod_dir, extras = extras, verbose = verbose ) model_fname <- file.path(mod_dir, "uwot/model") if (!file.exists(model_fname)) { stop("Can't find model in ", file) } model <- readRDS(file = model_fname) metrics <- names(model$metric)
n_metrics <- length(metrics)

nn_method <- model$nn_method if (is.null(nn_method)) { nn_method <- "annoy" } for (i in 1:n_metrics) { nn_fname <- file.path(mod_dir, paste0("uwot/nn", i)) if (!file.exists(nn_fname)) { stop("Can't find nearest neighbor index ", nn_fname, " in ", file) } metric <- metrics[[i]] # 31: need to specify the index dimensionality when creating the index if (is.list(model$metric[[i]])) {
# in case where there is only one metric, the value is a one-item list
# named 'ndim' giving the number of dimensions directly: all columns
# are used in this metric
ndim <- model$metric[[i]]$ndim
} else {
# otherwise, metric specifies the name or index used for each metric,
# so the dimension is the number of them
ndim <- length(model$metric[[i]]) } if (nn_method == "annoy") { annoy_metric <- metric ann <- create_ann(annoy_metric, ndim = ndim) ann$load(nn_fname)

idx <-
list(
ann = ann,
type = "annoyv1",
metric = annoy_metric,
ndim = ndim
)
if (n_metrics == 1) {
model$nn_index <- idx } else { model$nn_index[[i]] <- idx
}
}
else if (nn_method == "hnsw") {
ann <- hnsw_load(metric, ndim = ndim, filename = nn_fname)
nn_meta_tmpfname <- file.path(mod_dir, paste0("uwot/nn-meta", i))
idx$ann <- ann if (n_metrics == 1) { model$nn_index <- idx
} else {
model$nn_index[[i]] <- idx } } else if (nn_method == "nndescent") { idx <- readRDS(nn_fname) if (n_metrics == 1) { model$nn_index <- idx
} else {
model$nn_index[[i]] <- idx } } else { stop("Unknown nearest neighbor method ", nn_method) } } model$mod_dir <- mod_dir

model
}

#'
#' Unloads the UMAP model. This prevents the model being used with
#' \code{\link{umap_transform}}, but allows the temporary working directory
#'
#' @param model a UMAP model create by \code{\link{umap}}.
#' @param cleanup if \code{TRUE}, attempt to delete the temporary working
#'   directory that was used in either the save or load of the model.
#'
#' @examples
#' iris_train <- iris[c(1:10, 51:60), ]
#' iris_test <- iris[100:110, ]
#'
#' # create model
#' model <- umap(iris_train, ret_model = TRUE, n_epochs = 20)
#'
#' # save without unloading: this leaves behind a temporary working directory
#' model_file <- tempfile("iris_umap")
#' model <- save_uwot(model, file = model_file)
#'
#' # The model can continue to be used
#' test_embedding <- umap_transform(iris_test, model)
#'
#' # To manually unload the model from memory when finished and to clean up
#' # the working directory (this doesn't touch your model file)
#'
#' # At this point, model cannot be used with umap_transform, this would fail:
#' # test_embedding2 <- umap_transform(iris_test, model)
#'
#' # restore the model: this also creates a temporary working directory
#' model2 <- load_uwot(file = model_file)
#' test_embedding2 <- umap_transform(iris_test, model2)
#'
#'
#' # clean up the model file
#'
#' # save with unloading: this deletes the temporary working directory but
#' # doesn't allow the model to be re-used
#' model3 <- umap(iris_train, ret_model = TRUE, n_epochs = 20)
#' model_file3 <- tempfile("iris_umap")
#' model3 <- save_uwot(model3, file = model_file3, unload = TRUE)
#'
#' @export
unload_uwot <- function(model, cleanup = TRUE, verbose = FALSE) {
if (is.null(model$nn_method) || model$nn_method == "annoy") {
metrics <- names(model$metric) n_metrics <- length(metrics) for (i in 1:n_metrics) { if (n_metrics == 1) { rcppannoy <- get_rcppannoy(model$nn_index)
rcppannoy$unload() } else { rcppannoy <- get_rcppannoy(model$nn_index[[i]])
rcppannoy$unload() } } } if (cleanup) { if (is.null(model$mod_dir)) {
tsmessage("Model is missing temp dir location, can't clean up")
return()
} else {
mod_dir <- model$mod_dir if (!file.exists(mod_dir)) { tsmessage("model temp dir location '", mod_dir, "' no longer exists") return() } tsmessage("Deleting temp model dir ", mod_dir) res <- unlink(mod_dir, recursive = TRUE) if (res != 0) { tsmessage("Unable to delete tempdir ", mod_dir) } } } } all_nn_indices_are_loaded <- function(model) { if (is.null(model$nn_index)) {
stop("Invalid model: has no 'nn_index'")
}
if (is.list(model$nn_index)) { if (is.null(model$nn_index$type)) { for (i in 1:length(model$nn_index)) {
rcppannoy <- get_rcppannoy(model$nn_index[[i]]) if (rcppannoy$getNTrees() == 0) {
return(FALSE)
}
}
}
else if (model$nn_index$type == "annoyv1") {
rcppannoy <- get_rcppannoy(model$nn_index) if (rcppannoy$getNTrees() == 0) {
return(FALSE)
}
}
else if (model$nn_index$type == "hnswv1") {
return(TRUE)
}
else if (model$nn_index$type == "nndescentv1") {
return(TRUE)
}
else {
stop("Invalid model: has unknown 'nn_index' type ", model$nn_index$type)
}
} else {
rcppannoy <- get_rcppannoy(model$nn_index) if (rcppannoy$getNTrees() == 0) {
return(FALSE)
}
}
TRUE
}

abspath <- function(filename) {
file.path(normalizePath(dirname(filename)), basename(filename))
}

# Half of whatever the C++ implementation thinks are the number of concurrent
# threads supported, but at least 1
max(1, hardware_concurrency() / 2)
}

# Get the number of vertices in X
x2nv <- function(X) {
if (is.list(X)) {
if (!is.null(X$idx)) { n_vertices <- x2nv(X$idx)
} else {
if (length(X) > 0) {
n_vertices <- x2nv(X[[1]])
} else {
stop("Can't find n_vertices for list X")
}
}
} else if (inherits(X, "dist")) {
n_vertices <- attr(X, "Size")
} else if (is_sparse_matrix(X)) {
# older code path where distance matrix was part of X rather than nn_method
# used nrow, but transform was not supported so nrow == ncol
n_vertices <- ncol(X)
} else if (methods::is(X, "data.frame") || methods::is(X, "matrix")) {
n_vertices <- nrow(X)
} else if (is.numeric(X)) {
n_vertices <- length(X)
} else {
stop("Can't find number of vertices for X of type '", class(X)[1], "'")
}
n_vertices
}

data2set <- function(X,
Xcat,
n_neighbors,
metrics,
nn_method,
n_trees,
search_k,
method,
set_op_mix_ratio,
local_connectivity,
bandwidth,
perplexity,
kernel,
ret_sigma,
grain_size,
ret_model,
n_vertices = x2nv(X),
tmpdir = tempdir(),
pca = NULL,
pca_center = TRUE,
pca_method = "irlba",
nn_args = list(),
sparse_is_distance = TRUE,
verbose = FALSE) {
V <- NULL
nns <- list()
nblocks <- length(metrics)
sigma <- NULL
# Check for precalculated NN data in nn_method
if (is.list(nn_method)) {
if (is.null(nn_method$idx)) { nblocks <- length(nn_method) if (nblocks == 0) { stop("Incorrect format for precalculated neighbor data") } } else { nblocks <- 1 # wrap nn data in a list so data is always a list of lists nn_method <- list(nn_method) } metrics <- replicate(nblocks, NULL, simplify = FALSE) names(metrics) <- rep("precomputed", nblocks) } if (nblocks > 1) { tsmessage("Found ", nblocks, " blocks of data") } mnames <- tolower(names(metrics)) if (is.null(nn_method)) { if (methods::is(X, "matrix")) { if (n_vertices < 4096 && !ret_model && all(mnames == "euclidean")) { tsmessage("Using FNN for neighbor search, n_neighbors = ", n_neighbors) nn_method <- "fnn" } else { tsmessage("Using Annoy for neighbor search, n_neighbors = ", n_neighbors) nn_method <- "annoy" } } else { # It's a dist, or an actual distance matrix (sparse or triangular) nn_method <- "matrix" } } pca_models <- list() for (i in 1:nblocks) { metric <- mnames[[i]] if (is.character(nn_method) && nn_method == "annoy") { metric <- match.arg(metric, c( "euclidean", "cosine", "manhattan", "hamming", "correlation", "precomputed" )) } # Defaults for this block which can be overridden pca_i <- pca pca_center_i <- pca_center subset <- metrics[[i]] if (is.null(subset)) { Xsub <- X } else if (is.list(subset)) { # e.g. "euclidean" = list(1:10, pca_center = FALSE), lsres <- lsplit_unnamed(subset) if (is.null(lsres$unnamed)) {
stop("Error: no subset provided for block ", i)
}
if (length(lsres$unnamed) != 1) { stop("Error: only one unnamed item should be provided for block ", i) } subset <- lsres$unnamed[[1]]

# possible overrides
if (!is.null(lsres$named)) { lsnamed <- lsres$named
lsnames <- names(lsnamed)
if (!is.null(lsnamed$pca_center)) { pca_center_i <- lsnamed$pca_center
}
# PCA argument can be NULL, so need to check if it was explicitly provided
if ("pca" %in% lsnames) {
pca_i <- lsnamed$pca } } Xsub <- X[, subset, drop = FALSE] } else { Xsub <- X[, subset, drop = FALSE] } if (!is.null(X) && is.matrix(X)) { block_size <- ncol(Xsub) if (block_size == 0) { stop("Block ", i, " has zero size") } if (nblocks > 1) { tsmessage( "Processing block ", i, " of ", nblocks, " with size ", block_size, " using metric '", metric, "'" ) } } else { # X is NULL or dist or something like that if (nblocks > 1) { tsmessage( "Processing block ", i, " of ", nblocks, " using metric '", metric, "'" ) } } if (!is.null(pca_i) && is.matrix(X) && metric != "hamming" && ncol(X) > pca_i && nrow(X) > pca_i) { tsmessage("Reducing column dimension to ", pca_i, " via PCA") pca_res <- pca_init(Xsub, pca_i, ret_extra = ret_model, center = pca_center_i, pca_method = pca_method, verbose = verbose ) if (ret_model) { Xsub <- pca_res$scores
pca_models[[as.character(i)]] <- pca_res[c("center", "rotation")]
pca_res <- NULL
} else {
Xsub <- pca_res
}
}

nn_sub <- nn_method
# Extract this block of nn data from list of lists
if (metric == "precomputed") {
nn_sub <- nn_method[[i]]
n_neighbors <- NULL
}

x2set_res <- x2set(
Xsub,
n_neighbors,
metric,
nn_method = nn_sub,
n_trees,
search_k,
method,
set_op_mix_ratio,
local_connectivity,
bandwidth,
perplexity,
kernel,
ret_sigma,
grain_size,
ret_model,
n_vertices = n_vertices,
nn_args = nn_args,
tmpdir = tmpdir,
sparse_is_distance = sparse_is_distance,
verbose = verbose
)
Vblock <- x2set_res$V nn <- x2set_res$nn
nns[[i]] <- nn
names(nns)[[i]] <- metric
if (is.null(V)) {
V <- Vblock
} else {
# TODO: should at least offer the option to reset the local metric here
# TODO: make , reset_local_metric = TRUE the default (breaking change)
V <- set_intersect(V, Vblock, weight = 0.5, reset_connectivity = TRUE)
}
if (ret_sigma && is.null(sigma)) {
# No idea how to combine different neighborhood sizes so just return the
# first set
sigma <- x2set_res$sigma rho <- x2set_res$rho
dint <- x2set_res$dint } } if (!is.null(Xcat)) { V <- categorical_intersection_df(Xcat, V, weight = 0.5, verbose = verbose) } res <- list(V = V, nns = nns, pca_models = pca_models) if (!is.null(sigma)) { res$sigma <- sigma
res$rho <- rho res$dint <- dint
}
res
}

x2nn <- function(X,
n_neighbors,
metric,
nn_method,
n_trees,
search_k,
tmpdir = tempdir(),
grain_size,
ret_model,
n_vertices = x2nv(X),
nn_args = list(),
sparse_is_distance = TRUE,
verbose = FALSE) {
if (is.list(nn_method)) {
validate_nn(nn_method, n_vertices)
nn <- nn_method
} else {
nn_method <-
match.arg(tolower(nn_method),
c("annoy", "fnn", "matrix", "hnsw", "nndescent"))
if (nn_method == "fnn" && metric != "euclidean") {
stop(
"nn_method = 'FNN' is only compatible with distance metric ",
"'euclidean'"
)
}
if (nn_method == "fnn" && ret_model) {
stop("nn_method = 'FNN' is incompatible with ret_model = TRUE")
}
nn <- find_nn(
X,
n_neighbors,
method = nn_method,
metric = metric,
n_trees = n_trees,
search_k = search_k,
nn_args = nn_args,
tmpdir = tmpdir,
grain_size = grain_size,
ret_index = ret_model,
sparse_is_distance = sparse_is_distance,
verbose = verbose
)
}
nn
}

validate_nn <- function(nn_method, n_vertices) {
if (!is.matrix(nn_method$idx)) { stop("Couldn't find precalculated 'idx' matrix") } if (nrow(nn_method$idx) != n_vertices) {
stop(
"Precalculated 'idx' matrix must have ", n_vertices,
" rows, but found ", nrow(nn_method$idx) ) } if (!is.matrix(nn_method$dist)) {
stop("Couldn't find precalculated 'dist' matrix")
}
if (nrow(nn_method$idx) != n_vertices) { stop("Precalculated 'dist' matrix must have ", n_vertices, " rows, but found ", nrow(nn_method$dist))
}
if (ncol(nn_method$dist) != ncol(nn_method$idx)) {
stop("Precalculated 'dist' matrix must have ", ncol(nn_method$idx), " cols, but found ", ncol(nn_method$dist))
}
}

nn2set <- function(method, nn,
set_op_mix_ratio, local_connectivity, bandwidth,
perplexity, kernel,
ret_sigma,
verbose = FALSE) {
sigma <- NULL
res <- list()
if (method == "largevis") {
n_vertices <- nrow(nn$dist) if (perplexity >= n_vertices) { stop("perplexity can be no larger than ", n_vertices - 1) } Vres <- perplexity_similarities( nn = nn, perplexity = perplexity, ret_sigma = ret_sigma, n_threads = n_threads, grain_size = grain_size, kernel = kernel, verbose = verbose ) res$V <- Vres$matrix if (ret_sigma && !is.null(Vres$sigma)) {
res$sigma <- Vres$sigma
res$dint <- Vres$dint
}
} else {
Vres <- fuzzy_simplicial_set(
nn = nn,
set_op_mix_ratio = set_op_mix_ratio,
local_connectivity = local_connectivity,
bandwidth = bandwidth,
ret_sigma = ret_sigma,
grain_size = grain_size,
verbose = verbose
)
if (ret_sigma) {
res$V <- Vres$matrix
res$sigma <- Vres$sigma
res$rho <- Vres$rho
} else {
res$V <- Vres } } res } x2set <- function(X, n_neighbors, metric, nn_method, n_trees, search_k, method, set_op_mix_ratio, local_connectivity, bandwidth, perplexity, kernel, ret_sigma, n_threads, grain_size, ret_model, n_vertices = x2nv(X), tmpdir = tempdir(), nn_args = list(), sparse_is_distance = TRUE, verbose = FALSE) { if (is_sparse_matrix(nn_method)) { nn <- nn_method if (nrow(nn) != ncol(nn)) { stop("Sparse distance matrix must have same number of rows and cols") } if (nrow(nn) != n_vertices) { stop("Sparse distance matrix must have same dimensions as input data") } } else { nn <- x2nn( X, n_neighbors = n_neighbors, metric = metric, nn_method = nn_method, n_trees = n_trees, search_k = search_k, tmpdir = tmpdir, n_threads = n_threads, grain_size = grain_size, ret_model = ret_model, nn_args = nn_args, n_vertices = n_vertices, sparse_is_distance = sparse_is_distance, verbose = verbose ) if (any(is.infinite(nn$dist))) {
stop("Infinite distances found in nearest neighbors")
}
}
gc()

nn2set_res <- nn2set(
method,
nn,
set_op_mix_ratio,
local_connectivity,
bandwidth,
perplexity,
kernel,
ret_sigma,
grain_size,
verbose = verbose
)
V <- nn2set_res$V if (any(is.na(V))) { stop("Non-finite entries in the input matrix") } gc() res <- list( nn = nn, V = V ) if (ret_sigma && !is.null(nn2set_res$sigma)) {
res$sigma <- nn2set_res$sigma
res$rho <- nn2set_res$rho
res$dint <- nn2set_res$dint
}
res
}

set_intersect <- function(A, B, weight = 0.5, reset_connectivity = TRUE,
reset_local_metric = FALSE, n_threads = NULL,
verbose = FALSE) {
A <- general_simplicial_set_intersection(
A, B, weight
)
A <- Matrix::drop0(A)
# https://github.com/lmcinnes/umap/issues/58#issuecomment-437633658
# For now always reset
if (reset_connectivity) {
A <- reset_local_connectivity(A,
reset_local_metric = reset_local_metric,
)
}
A
}

categorical_intersection_df <- function(X, V, weight = 0.5, verbose = FALSE) {
tsmessage(
"Carrying out categorical intersection for ",
pluralize("column", ncol(X))
)
for (i in 1:ncol(X)) {
V <- categorical_intersection(X[, i], V,
weight = weight,
verbose = (verbose && i == 1)
)
}
V
}

categorical_intersection <- function(x, V, weight, verbose = FALSE) {
if (is.null(V)) {
stop("V cannot be null for categorical intersection")
}
if (weight < 1.0) {
far_dist <- 2.5 * (1.0 / (1.0 - weight))
} else {
far_dist <- 1.0e12
}
tsmessage(
"Applying categorical set intersection, weight = ", formatC(weight),
" far distance = ", formatC(far_dist)
)

V <- categorical_simplicial_set_intersection(V, x,
far_dist = far_dist,
verbose = verbose
)
V
}

# Creates the number of epochs per sample for each weight
# weights are the non-zero input affinities (1-simplex)
# n_epoch the total number of epochs
# There is an inverse relationship between the weights and the return vector.
make_epochs_per_sample <- function(weights, n_epochs) {
result <- rep(-1, length(weights))
n_samples <- n_epochs * (weights / max(weights))
result[n_samples > 0] <- n_epochs / n_samples[n_samples > 0]
result
}

# Create the a/b parameters from spread and min_dist
find_ab_params <- function(spread = 1, min_dist = 0.001) {
xv <- seq(from = 0, to = spread * 3, length.out = 300)
yv <- rep(0, length(xv))
yv[xv < min_dist] <- 1
yv[xv >= min_dist] <- exp(-(xv[xv >= min_dist] - min_dist) / spread)
result <- try(
{
stats::nls(yv ~ 1 / (1 + a * xv^(2 * b)),
start = list(a = 1, b = 1)
)$m$getPars()
},
silent = TRUE
)
if (inherits(result, "try-error")) {
stop(
" min_dist = ", min_dist
)
}
result
}

# The default number of edge samples used by LargeVis
lvish_samples <- function(n_vertices) {
n_samples <- 0

if (n_vertices < 10000) {
n_samples <- 1000
} else if (n_vertices < 1000000) {
n_samples <- (n_vertices - 10000) * 9000 / (1000000 - 10000) + 1000
} else {
n_samples <- n_vertices / 100
}

round(n_samples * 1000000)
}

# Returns the number of epochs required to generate the default number of edge samples
# used in LargeVis
lvish_epochs <- function(n_vertices, V) {
n_samples <- lvish_samples(n_vertices)
round(n_samples * max(V) / sum(V))
}

# Scale X according to various strategies
scale_input <- function(X, scale_type, ret_model = FALSE, verbose = FALSE) {
if (is.null(scale_type)) {
scale_type <- "none"
} else if (is.logical(scale_type)) {
scale_type <- ifelse(scale_type, "scale", "none")
} else if (tolower(scale_type) == "z") {
scale_type <- "scale"
}

scale_type <- match.arg(
tolower(scale_type),
c("none", "scale", "range", "colrange", "maxabs")
)
switch(scale_type,
range = {
tsmessage("Range scaling X")
min_X <- min(X)
X <- X - min_X

max_X <- max(X)
X <- X / max_X

if (ret_model) {
attr(X, "scaled:range:min") <- min_X
attr(X, "scaled:range:max") <- max_X
}
},
colrange = {
tsmessage("Column range scaling X")
min_X <- apply(X, 2, min)
X <- sweep(X, 2, min_X)

max_X <- apply(X, 2, max)
X <- sweep(X, 2, max_X, /)

if (ret_model) {
attr(X, "scaled:colrange:min") <- min_X
attr(X, "scaled:colrange:max") <- max_X
}
},
maxabs = {
tsmessage("Normalizing by max-abs")
X <- base::scale(X, scale = FALSE)
max_abs <- max(abs(X))
X <- X / max_abs

if (ret_model) {
attr(X, "scaled:maxabs") <- max_abs
}
},
scale = {
tsmessage("Scaling to zero mean and unit variance")

varf <- function(x) {
sum((x - sum(x) / length(x))^2)
}
non_zero_var_cols <- apply(X, 2, varf) >= .Machine$double.xmin if (length(non_zero_var_cols) == 0) { stop("Matrix has zero variance") } X <- X[, non_zero_var_cols] tsmessage("Kept ", ncol(X), " non-zero-variance columns") X <- base::scale(X, scale = TRUE) if (ret_model) { attr(X, "scaled:nzvcols") <- which(non_zero_var_cols) } } ) X } attr_to_scale_info <- function(X) { Xattr <- attributes(X) Xattr <- Xattr[startsWith(names(Xattr), "scaled:")] if (length(Xattr) == 0) { Xattr <- NULL } Xattr } get_opt_args <- function(opt_args, alpha) { default_batch_opt <- "adam" default_opt_args <- list( sgd = list(alpha = alpha), adam = list(alpha = alpha, beta1 = 0.5, beta2 = 0.9, eps = 1e-7) ) if (is.null(opt_args)) { opt_args <- list() } if (is.null(opt_args$method)) {
opt_args$method <- "adam" } if (!(opt_args$method %in% names(default_opt_args))) {
stop("Unknown optimization method '", opt_args$method, "'") } lmerge(default_opt_args[[opt_args$method]], opt_args)
}

# Takes local radii from the input dimension and converts to approximate
# densities in the output space by mapping them to a vector of a parameters
# as used in the UMAP output weight: 1/(1 + a + d^2b).
# Based on testing a rough range of usable a values is 0.01-100. To get that
# we want each a value to be the product of the local density of i and j, so
# a = sqrt(a_i * a_j)
# Also, we want dens_scale to control the spread of a values and for
# dens_scale = 0, the vector of a_i give the the user-selected scalar value of
# a, so we scale the log of the reciprocal of localr to be within [log(a * 1e-(2
# * dens_scale)) ... log(a * 1e(2 * dens_scale))] We take the sqrt of the a_i in
# this function to avoid repeatedly calling it inside the optimization loop.
scale_radii <- function(localr, dens_scale, a) {
log_denso <- -log(localr)
min_densl <- a * (10^(-2 * dens_scale))
log_min_densl <- log(min_densl)
max_densl <- a * (10^(2 * dens_scale))
log_max_densl <- log(max_densl)
log_denso_scale <- range_scale(log_denso, log_min_densl, log_max_densl)
sqrt(exp(log_denso_scale))
}

#' @useDynLib uwot, .registration=TRUE
#' @importFrom Rcpp sourceCpp
}

# Remove scaling attributes from a matrix
# if the scale parameter is set then these attributes are assumed to have
# been applied by uwot's internals and the equivalent scaling will be applied
# to new data in umap_transform. However, these attributes could have been
# applied by manually scaling the data before running any code in uwot, in which
# case we should not save them as part of the model. This function is called
# before applying any other scaling
remove_scaling_attrs <- function(X) {
uwot_attrs <- c(
"scaled:range:min",
"scaled:range:max",
"scaled:colrange:min",
"scaled:colrange:max",
"scaled:maxabs",
"scaled:nzvcols",
"scaled:center",
"scaled:scale"
)
attrs <- names(attributes(X))
for (attr in attrs) {
if (attr %in% uwot_attrs) {
attributes(X)[[attr]] <- NULL
}
}
X
}
jlmelville/uwot documentation built on May 20, 2024, 1:29 a.m.