
#' Dimensionality Reduction with UMAP
#'
#' Carry out dimensionality reduction of a dataset using the Uniform Manifold
#' Approximation and Projection (UMAP) method (McInnes et al., 2018). Some of
#' the following help text is lifted verbatim from the Python reference
#' implementation at \url{https://github.com/lmcinnes/umap}.
#'
#' @param X Input data. Can be a \code{\link{data.frame}}, \code{\link{matrix}},
#'   \code{\link[stats]{dist}} object or \code{\link[Matrix]{sparseMatrix}}.
#'   Matrix and data frames should contain one observation per row. Data frames
#'   will have any non-numeric columns removed, although factor columns will be
#'   used if explicitly included via \code{metric} (see the help for
#'   \code{metric} for details). A sparse matrix is interpreted as a distance
#'   matrix, and is assumed to be symmetric, so you can also pass in an
#'   explicitly upper or lower triangular sparse matrix to save storage. There
#'   must be at least \code{n_neighbors} non-zero distances for each row. Both
#'   implicit and explicit zero entries are ignored. Set zero distances you want
#'   to keep to an arbitrarily small non-zero value (e.g. \code{1e-10}).
#'   \code{X} can also be \code{NULL} if pre-computed nearest neighbor data is
#'   passed to \code{nn_method}, and \code{init} is not \code{"spca"} or
#'   \code{"pca"}.
#' @param n_neighbors The size of local neighborhood (in terms of number of
#'   neighboring sample points) used for manifold approximation. Larger values
#'   result in more global views of the manifold, while smaller values result in
#'   more local data being preserved. In general values should be in the range
#'   \code{2} to \code{100}.
#' @param n_components The dimension of the space to embed into. This defaults
#'   to \code{2} to provide easy visualization, but can reasonably be set to any
#'   integer value in the range \code{2} to \code{100}.
#' @param metric Type of distance metric to use to find nearest neighbors. For
#'  \code{nn_method = "annoy"} this can be one of:
#' \itemize{
#'   \item \code{"euclidean"} (the default)
#'   \item \code{"cosine"}
#'   \item \code{"manhattan"}
#'   \item \code{"hamming"}
#'   \item \code{"correlation"} (a distance based on the Pearson correlation)
#'   \item \code{"categorical"} (see below)
#' }
#' For \code{nn_method = "hnsw"} this can be one of:
#' \itemize{
#'   \item \code{"euclidean"}
#'   \item \code{"cosine"}
#'   \item \code{"correlation"}
#' }
#' If \href{https://cran.r-project.org/package=rnndescent}{rnndescent} is
#' installed and \code{nn_method = "nndescent"} is specified then many more
#' metrics are available, including:
#' \itemize{
#' \item \code{"braycurtis"}
#' \item \code{"canberra"}
#' \item \code{"chebyshev"}
#' \item \code{"dice"}
#' \item \code{"hamming"}
#' \item \code{"hellinger"}
#' \item \code{"jaccard"}
#' \item \code{"jensenshannon"}
#' \item \code{"kulsinski"}
#' \item \code{"rogerstanimoto"}
#' \item \code{"russellrao"}
#' \item \code{"sokalmichener"}
#' \item \code{"sokalsneath"}
#' \item \code{"spearmanr"}
#' \item \code{"symmetrickl"}
#' \item \code{"tsss"}
#' \item \code{"yule"}
#' }
#' For more details see the package documentation of \code{rnndescent}.
#' For \code{nn_method = "fnn"}, the distance metric is always "euclidean".
#'
#' If \code{X} is a data frame or matrix, then multiple metrics can be
#' specified, by passing a list to this argument, where the name of each item in
#' the list is one of the metric names above. The value of each list item should
#' be a vector giving the names or integer ids of the columns to be included in
#' a calculation, e.g. \code{metric = list(euclidean = 1:4, manhattan = 5:10)}.
#'
#' Each metric calculation results in a separate fuzzy simplicial set, which are
#' intersected together to produce the final set. Metric names can be repeated.
#' Because non-numeric columns are removed from the data frame, it is safer to
#' use column names than integer ids.
#'
#' Factor columns can also be used by specifying the metric name
#' \code{"categorical"}. Factor columns are treated different from numeric
#' columns and although multiple factor columns can be specified in a vector,
#' each factor column specified is processed individually. If you specify
#' a non-factor column, it will be coerced to a factor.
#'
#' For a given data block, you may override the \code{pca} and \code{pca_center}
#' arguments for that block, by providing a list with one unnamed item
#' containing the column names or ids, and then any of the \code{pca} or
#' \code{pca_center} overrides as named items, e.g. \code{metric =
#' list(euclidean = 1:4, manhattan = list(5:10, pca_center = FALSE))}. This
#' exists to allow mixed binary and real-valued data to be included and to have
#' PCA applied to both, but with centering applied only to the real-valued data
#' (it is typical not to apply centering to binary data before PCA is applied).
#' @param n_epochs Number of epochs to use during the optimization of the
#'   embedded coordinates. By default, this value is set to \code{500} for
#'   datasets containing 10,000 vertices or less, and \code{200} otherwise.
#'   If \code{n_epochs = 0}, then coordinates determined by \code{"init"} will
#'   be returned.
#' @param scale Scaling to apply to \code{X} if it is a data frame or matrix:
#' \itemize{
#'   \item{\code{"none"} or \code{FALSE} or \code{NULL}} No scaling.
#'   \item{\code{"Z"} or \code{"scale"} or \code{TRUE}} Scale each column to
#'   zero mean and variance 1.
#'   \item{\code{"maxabs"}} Center each column to mean 0, then divide each
#'   element by the maximum absolute value over the entire matrix.
#'   \item{\code{"range"}} Range scale the entire matrix, so the smallest
#'   element is 0 and the largest is 1.
#'   \item{\code{"colrange"}} Scale each column in the range (0,1).
#' }
#' For UMAP, the default is \code{"none"}.
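#'
#' A minimal sketch (\code{X} here stands for any numeric matrix or data
#' frame):
#' \preformatted{
#' # z-scale each column before the nearest neighbor search
#' embedding <- umap(X, scale = "Z")
#' }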
#' @param learning_rate Initial learning rate used in optimization of the
#'   coordinates.
#' @param init Type of initialization for the coordinates. Options are:
#'   \itemize{
#'     \item \code{"spectral"} Spectral embedding using the normalized Laplacian
#'     of the fuzzy 1-skeleton, with Gaussian noise added.
#'     \item \code{"normlaplacian"}. Spectral embedding using the normalized
#'     Laplacian of the fuzzy 1-skeleton, without noise.
#'     \item \code{"random"}. Coordinates assigned using a uniform random
#'     distribution between -10 and 10.
#'     \item \code{"lvrandom"}. Coordinates assigned using a Gaussian
#'     distribution with standard deviation 1e-4, as used in LargeVis
#'     (Tang et al., 2016) and t-SNE.
#'     \item \code{"laplacian"}. Spectral embedding using the Laplacian Eigenmap
#'     (Belkin and Niyogi, 2002).
#'     \item \code{"pca"}. The first two principal components from PCA of
#'     \code{X} if \code{X} is a data frame, and from a 2-dimensional classical
#'     MDS if \code{X} is of class \code{"dist"}.
#'     \item \code{"spca"}. Like \code{"pca"}, but each dimension is then scaled
#'     so the standard deviation is 1e-4, to give a distribution similar to that
#'     used in t-SNE. This is an alias for \code{init = "pca", init_sdev =
#'     1e-4}.
#'     \item \code{"agspectral"} An "approximate global" modification of
#'     \code{"spectral"} which all edges in the graph to a value of 1, and then
#'     sets a random number of edges (\code{negative_sample_rate} edges per
#'     vertex) to 0.1, to approximate the effect of non-local affinities.
#'     \item A matrix of initial coordinates.
#'   }
#'  For spectral initializations, (\code{"spectral"}, \code{"normlaplacian"},
#'  \code{"laplacian"}, \code{"agspectral"}), if more than one connected
#'  component is identified, no spectral initialization is attempted. Instead
#'  a PCA-based initialization is attempted. If \code{verbose = TRUE} the
#'  number of connected components is logged to the console. The existence of
#'  multiple connected components implies that a global view of the data cannot
#'  be attained with this initialization. Increasing the value of
#'  \code{n_neighbors} may help.
#' @param init_sdev If non-\code{NULL}, scales each dimension of the initialized
#'   coordinates (including any user-supplied matrix) to this standard
#'   deviation. By default no scaling is carried out, except when \code{init =
#'   "spca"}, in which case the value is \code{0.0001}. Scaling the input may
#'   help if the unscaled versions result in initial coordinates with large
#'   inter-point distances or outliers. This usually results in small gradients
#'   during optimization and very little progress being made to the layout.
#'   Shrinking the initial embedding by rescaling can help under these
#'   circumstances. Scaling the result of \code{init = "pca"} is usually
#'   recommended, which is why \code{init = "spca"} exists as an alias for
#'   \code{init = "pca", init_sdev = 1e-4}; but for the spectral
#'   initializations the scaled versions usually aren't necessary unless you
#'   are using a large value of \code{n_neighbors} (e.g.
#'   \code{n_neighbors = 150} or higher). For
#'   compatibility with recent versions of the Python UMAP package, if you are
#'   using \code{init = "spectral"}, then you should also set
#'   \code{init_sdev = "range"}, which will range scale each of the columns
#'   containing the initial data between 0 and 10. This is not set by default to
#'   maintain backwards compatibility with previous versions of uwot.
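#'
#'   A sketch of the Python-compatible settings described above (\code{X}
#'   stands for your input data):
#'   \preformatted{
#'   embedding <- umap(X, init = "spectral", init_sdev = "range")
#'   }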
#' @param spread The effective scale of embedded points. In combination with
#'   \code{min_dist}, this determines how clustered/clumped the embedded points
#'   are.
#' @param min_dist The effective minimum distance between embedded points.
#'   Smaller values will result in a more clustered/clumped embedding where
#'   nearby points on the manifold are drawn closer together, while larger
#'   values will result on a more even dispersal of points. The value should be
#'   set relative to the \code{spread} value, which determines the scale at
#'   which embedded points will be spread out.
#' @param set_op_mix_ratio Interpolate between (fuzzy) union and intersection as
#'   the set operation used to combine local fuzzy simplicial sets to obtain a
#'   global fuzzy simplicial set. Both fuzzy set operations use the product
#'   t-norm. The value of this parameter should be between \code{0.0} and
#'   \code{1.0}; a value of \code{1.0} will use a pure fuzzy union, while
#'   \code{0.0} will use a pure fuzzy intersection.
#' @param local_connectivity The local connectivity required -- i.e. the number
#'   of nearest neighbors that should be assumed to be connected at a local
#'   level. The higher this value the more connected the manifold becomes
#'   locally. In practice this should be not more than the local intrinsic
#'   dimension of the manifold.
#' @param bandwidth The effective bandwidth of the kernel if we view the
#'   algorithm as similar to Laplacian Eigenmaps. Larger values induce more
#'   connectivity and a more global view of the data, smaller values concentrate
#'   more locally.
#' @param repulsion_strength Weighting applied to negative samples in low
#'   dimensional embedding optimization. Values higher than one will result in
#'   greater weight being given to negative samples.
#' @param negative_sample_rate The number of negative edge/1-simplex samples to
#'   use per positive edge/1-simplex sample in optimizing the low dimensional
#'   embedding.
#' @param a More specific parameters controlling the embedding. If \code{NULL}
#'   these values are set automatically as determined by \code{min_dist} and
#'   \code{spread}.
#' @param b More specific parameters controlling the embedding. If \code{NULL}
#'   these values are set automatically as determined by \code{min_dist} and
#'   \code{spread}.
#' @param nn_method Method for finding nearest neighbors. Options are:
#'   \itemize{
#'     \item \code{"fnn"}. Use exact nearest neighbors via the
#'       \href{https://cran.r-project.org/package=FNN}{FNN} package.
#'     \item \code{"annoy"} Use approximate nearest neighbors via the
#'       \href{https://cran.r-project.org/package=RcppAnnoy}{RcppAnnoy} package.
#'     \item \code{"hnsw"} Use approximate nearest neighbors with the
#'       Hierarchical Navigable Small World (HNSW) method (Malkov and Yashunin,
#'       2018) via the
#'       \href{https://cran.r-project.org/package=RcppHNSW}{RcppHNSW} package.
#'       \code{RcppHNSW} is not a dependency of this package: this option is
#'       only available if you have installed \code{RcppHNSW} yourself. Also,
#'       HNSW only supports the following arguments for \code{metric} and
#'       \code{target_metric}: \code{"euclidean"}, \code{"cosine"} and
#'       \code{"correlation"}.
#'     \item \code{"nndescent"} Use approximate nearest neighbors with the
#'       Nearest Neighbor Descent method (Dong et al., 2011) via the
#'       \href{https://cran.r-project.org/package=rnndescent}{rnndescent}
#'       package. \code{rnndescent} is not a dependency of this package: this
#'       option is only available if you have installed \code{rnndescent}
#'       yourself.
#'    }
#'   By default, if \code{X} has fewer than 4,096 vertices, the exact nearest
#'   neighbors are found. Otherwise, approximate nearest neighbors are used.
#'   You may also pass pre-calculated nearest neighbor data to this argument. It
#'   must be one of two formats, either a list consisting of two elements:
#'   \itemize{
#'     \item \code{"idx"}. A \code{n_vertices x n_neighbors} matrix
#'     containing the integer indexes of the nearest neighbors in \code{X}. Each
#'     vertex is considered to be its own nearest neighbor, i.e.
#'     \code{idx[, 1] == 1:n_vertices}.
#'     \item \code{"dist"}. A \code{n_vertices x n_neighbors} matrix
#'     containing the distances of the nearest neighbors.
#'   }
#'   or a sparse distance matrix of type \code{dgCMatrix}, with dimensions
#'   \code{n_vertices x n_vertices}. Distances should be arranged by column,
#'   i.e. a non-zero entry in row \code{j} of the \code{i}th column indicates
#'   that the \code{j}th observation in \code{X} is a nearest neighbor of the
#'   \code{i}th observation with the distance given by the value of that
#'   element.
#'   The \code{n_neighbors} parameter is ignored when using precomputed
#'   nearest neighbor data. If using the sparse distance matrix input, each
#'   column can contain a different number of neighbors.
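#'
#'   A minimal sketch of the list format, assuming \code{knn_idx} and
#'   \code{knn_dist} are neighbor index and distance matrices you have already
#'   computed by some external means:
#'   \preformatted{
#'   nn <- list(idx = knn_idx, dist = knn_dist)
#'   # X may be NULL with pre-computed neighbors (see the help for 'X')
#'   embedding <- umap(X = NULL, nn_method = nn, init = "random")
#'   }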
#' @param n_trees Number of trees to build when constructing the nearest
#'   neighbor index. The more trees specified, the larger the index, but the
#'   better the results. With \code{search_k}, determines the accuracy of the
#'   Annoy nearest neighbor search. Only used if the \code{nn_method} is
#'   \code{"annoy"}. Sensible values are between \code{10} to \code{100}.
#' @param search_k Number of nodes to search during the neighbor retrieval.
#'   The larger \code{search_k}, the more accurate the results, but the longer
#'   the search takes. With \code{n_trees}, determines the accuracy of the
#'   Annoy nearest neighbor search. Only used if the \code{nn_method} is
#'   \code{"annoy"}.
#' @param nn_args A list containing additional arguments to pass to the nearest
#'   neighbor method. For \code{nn_method = "annoy"}, you can specify
#'   \code{"n_trees"} and \code{"search_k"}, and these will override the
#'   \code{n_trees} and \code{search_k} parameters.
#'   For \code{nn_method = "hnsw"}, you may specify the following arguments:
#'   \itemize{
#'   \item \code{M} The maximum number of neighbors to keep for each vertex.
#'   Reasonable values are \code{2} to \code{100}. Higher values give better
#'   recall at the cost of more memory. Default value is \code{16}.
#'   \item \code{ef_construction} A positive integer specifying the size of
#'   the dynamic list used during index construction. A higher value will
#'   provide better results at the cost of a longer time to build the index.
#'   Default is \code{200}.
#'   \item \code{ef} A positive integer specifying the size of the dynamic
#'   list used during search. This cannot be smaller than \code{n_neighbors}
#'   and cannot be higher than the number of items in the index. Default is
#'   \code{10}.
#'   }
#'   For \code{nn_method = "nndescent"}, you may specify the following
#'   arguments:
#'   \itemize{
#'   \item \code{n_trees} The number of trees to use in a random projection
#'   forest to initialize the search. A larger number will give more accurate
#'   results at the cost of a longer computation time. The default of
#'   \code{NULL} means that the number is chosen based on the number of
#'   observations in \code{X}.
#'   \item \code{max_candidates} The number of potential neighbors to explore
#'   per iteration. By default, this is set to \code{n_neighbors} or \code{60},
#'   whichever is smaller. A larger number will give more accurate results at
#'   the cost of a longer computation time.
#'   \item \code{n_iters} The number of iterations to run the search. A larger
#'   number will give more accurate results at the cost of a longer computation
#'   time. By default, this will be chosen based on the number of observations
#'   in \code{X}. You may also need to modify the convergence criterion
#'   \code{delta}.
#'   \item \code{delta} The minimum relative change in the neighbor graph
#'   allowed before early stopping. Should be a value between 0 and 1. The
#'   smaller the value, the less progress is allowed between iterations before
#'   stopping. The default value of \code{0.001} means that at least 0.1\% of
#'   the neighbor graph must be updated at each iteration.
#'   \item \code{init} How to initialize the nearest neighbor descent. By
#'   default this is set to \code{"tree"} and uses a random project forest.
#'   If you set this to \code{"rand"}, then a random selection is used. Usually
#'   this is less accurate than using RP trees, but for high-dimensional cases,
#'   there may be little difference in the quality of the initialization and
#'   random initialization will be a lot faster. If you set this to
#'   \code{"rand"}, then the \code{n_trees} parameter is ignored.
#'   \item \code{pruning_degree_multiplier} The maximum number of edges per node
#'   to retain in the search graph, relative to \code{n_neighbors}. A larger
#'   value will give more accurate results at the cost of a longer computation
#'   time. Default is \code{1.5}. This parameter only affects neighbor search
#'   when transforming new data with \code{\link{umap_transform}}.
#'   \item \code{epsilon} Controls the degree of the back-tracking when
#'   traversing the search graph. Setting this to \code{0.0} will do a greedy
#'   search with no back-tracking. A larger value will give more accurate
#'   results at the cost of a longer computation time. Default is \code{0.1}.
#'   This parameter only affects neighbor search when transforming new data with
#'   \code{\link{umap_transform}}.
#'   \item \code{max_search_fraction} Specifies the maximum fraction of the
#'   search graph to traverse. By default, this is set to \code{1.0}, so the
#'   entire graph (i.e. all items in \code{X}) may be visited. You may want to
#'   set this to a smaller value if you have a very large dataset (in
#'   conjunction with \code{epsilon}) to avoid an inefficient exhaustive search
#'   of the data in \code{X}. This parameter only affects neighbor search when
#'   transforming new data with \code{\link{umap_transform}}.
#'   }
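#'
#'   For example (a sketch; the values are illustrative, not recommendations):
#'   \preformatted{
#'   embedding <- umap(X,
#'     nn_method = "hnsw",
#'     nn_args = list(M = 48, ef_construction = 400, ef = 50)
#'   )
#'   }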
#' @param approx_pow If \code{TRUE}, use an approximation to the power function
#'   in the UMAP gradient, from
#'   \url{https://martin.ankerl.com/2012/01/25/optimized-approximative-pow-in-c-and-cpp/}.
#'   Ignored if \code{dens_scale} is non-\code{NULL}.
#' @param y Optional target data for supervised dimension reduction. Can be a
#' vector, matrix or data frame. Use the \code{target_metric} parameter to
#' specify the metrics to use, using the same syntax as \code{metric}. Usually
#' either a single numeric or factor column is used, but more complex formats
#' are possible. The following types are allowed:
#'   \itemize{
#'     \item Factor columns with the same length as \code{X}. \code{NA} is
#'     allowed for any observation with an unknown level, in which case
#'     UMAP operates as a form of semi-supervised learning. Each column is
#'     treated separately.
#'     \item Numeric data. \code{NA} is \emph{not} allowed in this case. Use the
#'     parameter \code{target_n_neighbors} to set the number of neighbors used
#'     with \code{y}. If unset, \code{n_neighbors} is used. Unlike factors,
#'     numeric columns are grouped into one block unless \code{target_metric}
#'     specifies otherwise. For example, if you wish columns \code{a} and
#'     \code{b} to be treated separately, specify
#'     \code{target_metric = list(euclidean = "a", euclidean = "b")}. Otherwise,
#'     the data will be effectively treated as a matrix with two columns.
#'     \item Nearest neighbor data, consisting of a list of two matrices,
#'     \code{idx} and \code{dist}. These represent the precalculated nearest
#'     neighbor indices and distances, respectively. This
#'     is the same format as that expected for precalculated data in
#'     \code{nn_method}. This format assumes that the underlying data was a
#'     numeric vector. Any user-supplied value of the \code{target_n_neighbors}
#'     parameter is ignored in this case, because the number of columns in
#'     the matrices is used for the value. Multiple nearest neighbor data using
#'     different metrics can be supplied by passing a list of these lists.
#'   }
#' Unlike \code{X}, all factor columns included in \code{y} are automatically
#' used.
#' @param target_n_neighbors Number of nearest neighbors to use to construct the
#'   target simplicial set. Default value is \code{n_neighbors}. Applies only if
#'   \code{y} is non-\code{NULL} and \code{numeric}.
#' @param target_metric The metric used to measure distance for \code{y} if
#'   using supervised dimension reduction. Used only if \code{y} is numeric.
#' @param target_weight Weighting factor between data topology and target
#'   topology. A value of 0.0 weights entirely on data, a value of 1.0 weights
#'   entirely on target. The default of 0.5 balances the weighting equally
#'   between data and target. Only applies if \code{y} is non-\code{NULL}.
#' @param pca If set to a positive integer value, reduce data to this number of
#'   columns using PCA. Not applied if the distance \code{metric} is
#'   \code{"hamming"}, or if the dimensions of the data are not larger than the
#'   number specified (i.e. the number of rows and columns must be larger than
#'   the value of this parameter). If you have > 100 columns in a data frame or
#'   matrix, reducing the number of columns in this way may substantially
#'   increase the performance of the nearest neighbor search at the cost of a
#'   potential decrease in accuracy. In many t-SNE applications, a value of 50
#'   is recommended, although there's no guarantee that this is appropriate for
#'   all settings.
#' @param pca_center If \code{TRUE}, center the columns of \code{X} before
#'   carrying out PCA. For binary data, it's recommended to set this to
#'   \code{FALSE}.
#' @param pca_method Method to carry out any PCA dimensionality reduction when
#'   the \code{pca} parameter is specified. Allowed values are:
#'   \itemize{
#'     \item{\code{"irlba"}}. Uses \code{\link[irlba]{prcomp_irlba}} from the
#'     \href{https://cran.r-project.org/package=irlba}{irlba} package.
#'     \item{\code{"rsvd"}}. Uses 5 iterations of \code{\link[irlba]{svdr}} from
#'     the \href{https://cran.r-project.org/package=irlba}{irlba} package.
#'     This is likely to give much faster but potentially less accurate results
#'     than using \code{"irlba"}. For the purposes of nearest neighbor
#'     calculation and coordinates initialization, any loss of accuracy doesn't
#'     seem to matter much.
#'     \item{\code{"bigstatsr"}}. Uses \code{\link[bigstatsr]{big_randomSVD}}
#'     from the \href{https://cran.r-project.org/package=bigstatsr}{bigstatsr}
#'     package. The SVD methods used in \code{bigstatsr} may be faster on
#'     systems without access to efficient linear algebra libraries (e.g.
#'     Windows). \strong{Note}: \code{bigstatsr} is \emph{not} a dependency of
#'     uwot: if you choose to use this package for PCA, you \emph{must} install
#'     it yourself.
#'     \item{\code{"svd"}}. Uses \code{\link[base]{svd}} for the SVD. This is
#'     likely to be slow for all but the smallest datasets.
#'     \item{\code{"auto"}} (the default). Uses \code{"irlba"}, unless more than
#'     50\% of the full set of singular vectors would be calculated, in which
#'     case \code{"svd"} is used.
#'   }
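#'
#'   A usage sketch for high-dimensional input (\code{X} stands for your
#'   data):
#'   \preformatted{
#'   # reduce X to 50 columns via irlba before the neighbor search
#'   embedding <- umap(X, pca = 50, pca_method = "irlba")
#'   }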
#' @param pcg_rand If \code{TRUE}, use the PCG random number generator (O'Neill,
#'   2014) during optimization. Otherwise, use the faster (but probably less
#'   statistically good) Tausworthe "taus88" generator. The default is
#'   \code{TRUE}.
#' @param fast_sgd If \code{TRUE}, then the following combination of parameters
#'   is set: \code{pcg_rand = FALSE}, \code{n_sgd_threads = "auto"} and
#'   \code{approx_pow = TRUE}. The default is \code{FALSE}. Setting this to
#'   \code{TRUE} will speed up the stochastic optimization phase, but give a
#'   potentially less accurate embedding, and which will not be exactly
#'   reproducible even with a fixed seed. For visualization, \code{fast_sgd =
#'   TRUE} will give perfectly good results. For more generic dimensionality
#'   reduction, it's safer to leave \code{fast_sgd = FALSE}. If \code{fast_sgd =
#'   TRUE}, then user-supplied values of \code{pcg_rand}, \code{n_sgd_threads},
#'   and \code{approx_pow} are ignored.
#' @param batch If \code{TRUE}, then embedding coordinates are updated at the
#'   end of each epoch rather than during the epoch. In batch mode, results are
#'   reproducible with a fixed random seed even with \code{n_sgd_threads > 1},
#'   at the cost of a slightly higher memory use. You may also have to modify
#'   \code{learning_rate} and increase \code{n_epochs}, so whether this provides
#'   a speed increase over the single-threaded optimization is likely to be
#'   dataset and hardware-dependent.
#' @param ret_model If \code{TRUE}, then return extra data that can be used to
#'   add new data to an existing embedding via \code{\link{umap_transform}}. The
#'   embedded coordinates are returned as the list item \code{embedding}. If
#'   \code{FALSE}, just return the coordinates. This parameter can be used in
#'   conjunction with \code{ret_nn} and \code{ret_extra}. Note that some
#'   settings are incompatible with the production of a UMAP model: external
#'   neighbor data (passed via a list to \code{nn_method}), and factor columns
#'   that were included via the \code{metric} parameter. In the latter case, the
#'   model produced is based only on the numeric data. A transformation using
#'   new data is possible, but the factor columns in the new data are ignored.
#'   Note that setting \code{ret_model = TRUE} forces the use of the approximate
#'   nearest neighbors method. Because small datasets would otherwise use exact
#'   nearest neighbor calculations, setting \code{ret_model = TRUE} means that
#'   different results may be returned for small datasets in terms of both the
#'   returned nearest neighbors (if requested) and the final embedded
#'   coordinates, compared to \code{ret_model = FALSE}, even if the random
#'   number seed is fixed. To avoid this, explicitly set
#'   \code{nn_method = "annoy"} in the \code{ret_model = FALSE} case.
#' @param ret_nn If \code{TRUE}, then in addition to the embedding, also return
#'   nearest neighbor data that can be used as input to \code{nn_method} to
#'   avoid the overhead of repeatedly calculating the nearest neighbors when
#'   manipulating unrelated parameters (e.g. \code{min_dist}, \code{n_epochs},
#'   \code{init}). See the "Value" section for the names of the list items. If
#'   \code{FALSE}, just return the coordinates. Note that the nearest neighbors
#'   could be sensitive to data scaling, so be wary of reusing nearest neighbor
#'   data if modifying the \code{scale} parameter. This parameter can be used in
#'   conjunction with \code{ret_model} and \code{ret_extra}.
#' @param ret_extra A vector indicating what extra data to return. May contain
#'   any combination of the following strings:
#'   \itemize{
#'     \item \code{"model"} Same as setting \code{ret_model = TRUE}.
#'     \item \code{"nn"} Same as setting \code{ret_nn = TRUE}.
#'     \item \code{"fgraph"} the high dimensional fuzzy graph (i.e. the fuzzy
#'       simplicial set of the merged local views of the input data). The graph
#'       is returned as a sparse symmetric N x N matrix of class
#'       \link[Matrix]{dgCMatrix-class}, where a non-zero entry (i, j) gives the
#'       membership strength of the edge connecting vertex i and vertex j. This
#'       can be considered analogous to the input probability (or similarity or
#'       affinity) used in t-SNE and LargeVis. Note that the graph is further
#'       sparsified by removing edges with sufficiently low membership strength
#'       that they would not be sampled by the probabilistic edge sampling
#'       employed for optimization and therefore the number of non-zero elements
#'       in the matrix is dependent on \code{n_epochs}. If you are only
#'       interested in the fuzzy input graph (e.g. for clustering), setting
#'       \code{n_epochs = 0} will avoid any further sparsifying.
#'       Be aware that setting \code{binary_edge_weights = TRUE} will affect
#'       this graph (all non-zero edge weights will be 1).
#'    \item \code{"sigma"} the normalization value for each observation in the
#'       dataset when constructing the smoothed distances to each of its
#'       neighbors. This gives some sense of the local density of each
#'       observation in the high dimensional space: higher values of
#'       \code{sigma} indicate a higher dispersion or lower density.
#'     \item \code{"localr"} the estimated local radius of each observation,
#'       the sum of \code{"sigma"} and \code{"rho"} (see the "Value" section
#'       below).
#'   }
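#'
#'   For example, to retrieve the fuzzy graph for clustering without running
#'   any optimization (a sketch):
#'   \preformatted{
#'   res <- umap(X, ret_extra = c("fgraph", "sigma"), n_epochs = 0)
#'   fuzzy_graph <- res$fgraph
#'   }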
#' @param n_threads Number of threads to use (except during stochastic gradient
#'   descent). Default is half the number of concurrent threads supported by the
#'   system. For nearest neighbor search, only applies if
#'   \code{nn_method = "annoy"}. If \code{n_threads > 1}, then the Annoy index
#'   will be temporarily written to disk in the location determined by
#'   \code{\link[base]{tempfile}}.
#' @param n_sgd_threads Number of threads to use during stochastic gradient
#'   descent. If set to > 1, then be aware that if \code{batch = FALSE}, results
#'   will \emph{not} be reproducible, even if \code{set.seed} is called with a
#'   fixed seed before running. Set to \code{"auto"} to use the same value as
#'   \code{n_threads}.
#' @param grain_size The minimum amount of work to do on each thread. If this
#'   value is set high enough, then fewer than \code{n_threads} or
#'   \code{n_sgd_threads} threads will be used for processing, which might give
#'   a performance improvement if the overhead of thread management and context
#'   switching was outweighing the improvement due to concurrent processing.
#'   This should be left at the default (\code{1}), in which case work is
#'   spread evenly over all the threads specified.
#' @param tmpdir Temporary directory to store nearest neighbor indexes during
#'   nearest neighbor search. Default is \code{\link{tempdir}}. The index is
#'   only written to disk if \code{n_threads > 1} and
#'   \code{nn_method = "annoy"}; otherwise, this parameter is ignored.
#' @param verbose If \code{TRUE}, log details to the console.
#' @param opt_args A list of optimizer parameters, used when
#'   \code{batch = TRUE}. The default optimization method used is Adam (Kingma
#'   and Ba, 2014).
#'   \itemize{
#'     \item \code{method} The optimization method to use. Either \code{"adam"}
#'     or \code{"sgd"} (stochastic gradient descent). Default: \code{"adam"}.
#'     \item \code{beta1} (Adam only). The weighting parameter for the
#'     exponential moving average of the first moment estimator. Effectively the
#'     momentum parameter. Should be a floating point value between 0 and 1.
#'     Higher values can smooth oscillatory updates in poorly-conditioned
#'     situations and may allow for a larger \code{learning_rate} to be
#'     specified, but too high can cause divergence. Default: \code{0.5}.
#'     \item \code{beta2} (Adam only). The weighting parameter for the
#'     exponential moving average of the uncentered second moment estimator.
#'     Should be a floating point value between 0 and 1. Controls the degree of
#'     adaptivity in the step-size. Higher values put more weight on previous
#'     time steps. Default: \code{0.9}.
#'     \item \code{eps} (Adam only). Intended to be a small value to prevent
#'     division by zero, but in practice can also affect convergence due to its
#'     interaction with \code{beta2}. Higher values reduce the effect of the
#'     step-size adaptivity and bring the behavior closer to stochastic gradient
#'     descent with momentum. Typical values are between 1e-8 and 1e-3. Default:
#'     \code{1e-7}.
#'     \item \code{alpha} The initial learning rate. Default: the value of the
#'     \code{learning_rate} parameter.
#'   }
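#'
#'   For example (a sketch; the values are illustrative only):
#'   \preformatted{
#'   embedding <- umap(X,
#'     batch = TRUE,
#'     opt_args = list(method = "adam", beta1 = 0.9, alpha = 0.5)
#'   )
#'   }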
#' @param epoch_callback A function which will be invoked at the end of every
#'   epoch. Its signature should be: \code{(epoch, n_epochs, coords)}, where:
#'   \itemize{
#'     \item \code{epoch} The current epoch number (between \code{1} and
#'     \code{n_epochs}).
#'     \item \code{n_epochs} Number of epochs to use during the optimization of
#'     the embedded coordinates.
#'     \item \code{coords} The embedded coordinates as of the end of the current
#'     epoch, as a matrix with dimensions (N, \code{n_components}).
#'   }
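#'
#'   A minimal sketch of a callback that logs progress every 10 epochs:
#'   \preformatted{
#'   log_epochs <- function(epoch, n_epochs, coords) {
#'     if (epoch \%\% 10 == 0) {
#'       message("epoch ", epoch, " of ", n_epochs)
#'     }
#'   }
#'   embedding <- umap(X, epoch_callback = log_epochs)
#'   }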
#' @param binary_edge_weights If \code{TRUE} then edge weights in the input
#'   graph are treated as binary (0/1) rather than real valued. This affects the
#'   sampling frequency of neighbors and is the strategy used by the PaCMAP
#'   method (Wang and co-workers, 2021). Practical (Böhm and co-workers, 2020)
#'   and theoretical (Damrich and Hamprecht, 2021) work suggests this has little
#'   effect on UMAP's performance.
#' @param dens_scale A value between 0 and 1. If > 0 then the output attempts
#'   to preserve relative local density around each observation. This uses an
#'   approximation to the densMAP method (Narayan and co-workers, 2021). The
#'   larger the value of \code{dens_scale}, the greater the range of output
#'   densities that will be used to map the input densities. This option is
#'   ignored if using multiple \code{metric} blocks.
#' @param seed Integer seed to use to initialize the random number generator
#'   state. Combined with \code{n_sgd_threads = 1} or \code{batch = TRUE}, this
#'   should give consistent output across multiple runs on a given installation.
#'   Setting this value is equivalent to calling \code{\link[base]{set.seed}},
#'   but it may be more convenient in some situations than having to call a
#'   separate function. The default is to not set a seed. If
#'   \code{ret_model = TRUE}, the seed will be stored in the output model and
#'   then used to set the seed inside \code{\link{umap_transform}}.
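#'
#'   A sketch of the repeatable combinations noted above:
#'   \preformatted{
#'   embedding <- umap(X, seed = 42, n_sgd_threads = 1)
#'   }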
#' @return A matrix of optimized coordinates, or:
#'   \itemize{
#'     \item if \code{ret_model = TRUE} (or \code{ret_extra} contains
#'     \code{"model"}), returns a list containing extra information that can be
#'     used to add new data to an existing embedding via
#'     \code{\link{umap_transform}}. In this case, the coordinates are available
#'     in the list item \code{embedding}. \bold{NOTE}: The contents of
#'     the \code{model} list should \emph{not} be considered stable or part of
#'     the public API, and are purposely left undocumented.
#'     \item if \code{ret_nn = TRUE} (or \code{ret_extra} contains \code{"nn"}),
#'     returns the nearest neighbor data as a list called \code{nn}. This
#'     contains one list for each \code{metric} calculated, itself containing a
#'     matrix \code{idx} with the integer ids of the neighbors; and a matrix
#'     \code{dist} with the distances. The \code{nn} list (or a sub-list) can be
#'     used as input to the \code{nn_method} parameter.
#'     \item if \code{ret_extra} contains \code{"fgraph"}, returns the high
#'     dimensional fuzzy graph as a sparse matrix called \code{fgraph}, of type
#'     \link[Matrix]{dgCMatrix-class}.
#'     \item if \code{ret_extra} contains \code{"sigma"}, returns a vector of the
#'     smooth knn distance normalization terms for each observation as
#'     \code{"sigma"} and a vector \code{"rho"} containing the largest
#'     distance to the locally connected neighbors of each observation.
#'     \item if \code{ret_extra} contains \code{"localr"}, returns a vector of
#'     the estimated local radii, the sum of \code{"sigma"} and \code{"rho"}.
#'   }
#'   The returned list contains the combined data from any combination of
#'   specifying \code{ret_model}, \code{ret_nn} and \code{ret_extra}.
#' @examples
#'
#' iris30 <- iris[c(1:10, 51:60, 101:110), ]
#'
#' # Non-numeric columns are automatically removed so you can pass data frames
#' # directly in a lot of cases without pre-processing
#' iris_umap <- umap(iris30,
#'   n_neighbors = 5, learning_rate = 0.5, init = "random", n_epochs = 20
#' )
#'
#' # Faster approximation to the gradient and return nearest neighbors
#' iris_umap <- umap(iris30,
#'   n_neighbors = 5, approx_pow = TRUE, ret_nn = TRUE, n_epochs = 20
#' )
#'
#' # Can specify min_dist and spread parameters to control separation and size
#' # of clusters and reuse nearest neighbors for efficiency
#' nn <- iris_umap$nn
#' iris_umap <- umap(iris30,
#'   n_neighbors = 5, min_dist = 1, spread = 5, nn_method = nn, n_epochs = 20
#' )
#'
#' # Supervised dimension reduction using the 'Species' factor column
#' iris_sumap <- umap(iris30,
#'   n_neighbors = 5, min_dist = 0.001, y = iris30$Species,
#'   target_weight = 0.5, n_epochs = 20
#' )
#'
#' # Calculate Petal and Sepal neighbors separately (uses intersection of the resulting sets):
#' iris_umap <- umap(iris30, metric = list(
#'   "euclidean" = c("Sepal.Length", "Sepal.Width"),
#'   "euclidean" = c("Petal.Length", "Petal.Width")
#' ))
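#'
#' # Return a model (a sketch) so new points can be embedded later with
#' # umap_transform; iris[111:120, ] stands in for genuinely new data
#' iris_train <- umap(iris30, n_neighbors = 5, ret_model = TRUE, n_epochs = 20)
#' iris_test <- umap_transform(iris[111:120, ], iris_train)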
#'
#' @references
#' Belkin, M., & Niyogi, P. (2002).
#' Laplacian eigenmaps and spectral techniques for embedding and clustering.
#' In \emph{Advances in neural information processing systems}
#' (pp. 585-591).
#' \url{http://papers.nips.cc/paper/1961-laplacian-eigenmaps-and-spectral-techniques-for-embedding-and-clustering.pdf}
#'
#' Böhm, J. N., Berens, P., & Kobak, D. (2020).
#' A unifying perspective on neighbor embeddings along the attraction-repulsion spectrum.
#' \emph{arXiv preprint} \emph{arXiv:2007.08902}.
#' \url{https://arxiv.org/abs/2007.08902}
#'
#' Damrich, S., & Hamprecht, F. A. (2021).
#' On UMAP's true loss function.
#' \emph{Advances in Neural Information Processing Systems}, \emph{34}.
#' \url{https://proceedings.neurips.cc/paper/2021/hash/2de5d16682c3c35007e4e92982f1a2ba-Abstract.html}
#'
#' Dong, W., Moses, C., & Li, K. (2011, March).
#' Efficient k-nearest neighbor graph construction for generic similarity measures.
#' In \emph{Proceedings of the 20th international conference on World Wide Web}
#' (pp. 577-586).
#' ACM.
#' \doi{10.1145/1963405.1963487}.
#'
#' Kingma, D. P., & Ba, J. (2014).
#' Adam: A method for stochastic optimization.
#' \emph{arXiv preprint} \emph{arXiv}:1412.6980.
#' \url{https://arxiv.org/abs/1412.6980}
#'
#' Malkov, Y. A., & Yashunin, D. A. (2018).
#' Efficient and robust approximate nearest neighbor search using hierarchical
#' navigable small world graphs.
#' \emph{IEEE transactions on pattern analysis and machine intelligence}, \emph{42}(4), 824-836.
#'
#' McInnes, L., Healy, J., & Melville, J. (2018).
#' UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction.
#' \emph{arXiv preprint} \emph{arXiv}:1802.03426.
#' \url{https://arxiv.org/abs/1802.03426}
#'
#' Narayan, A., Berger, B., & Cho, H. (2021).
#' Assessing single-cell transcriptomic variability through density-preserving data visualization.
#' \emph{Nature biotechnology}, \emph{39}(6), 765-774.
#' \doi{10.1038/s41587-020-00801-7}
#'
#' O’Neill, M. E. (2014).
#' \emph{PCG: A family of simple fast space-efficient statistically good
#' algorithms for random number generation}
#' (Report No. HMC-CS-2014-0905). Harvey Mudd College.
#'
#' Tang, J., Liu, J., Zhang, M., & Mei, Q. (2016, April).
#' Visualizing large-scale and high-dimensional data.
#' In \emph{Proceedings of the 25th International Conference on World Wide Web}
#' (pp. 287-297).
#' International World Wide Web Conferences Steering Committee.
#' \url{https://arxiv.org/abs/1602.00370}
#'
#' Van der Maaten, L., & Hinton, G. (2008).
#' Visualizing data using t-SNE.
#' \emph{Journal of Machine Learning Research}, \emph{9}, 2579-2605.
#' \url{https://www.jmlr.org/papers/v9/vandermaaten08a.html}
#'
#' Wang, Y., Huang, H., Rudin, C., & Shaposhnik, Y. (2021).
#' Understanding How Dimension Reduction Tools Work: An Empirical Approach to Deciphering t-SNE, UMAP, TriMap, and PaCMAP for Data Visualization.
#' \emph{Journal of Machine Learning Research}, \emph{22}(201), 1-73.
#' \url{https://www.jmlr.org/papers/v22/20-1061.html}
#'
#' @export
umap <- function(X, n_neighbors = 15, n_components = 2, metric = "euclidean",
                 n_epochs = NULL, learning_rate = 1, scale = FALSE,
                 init = "spectral", init_sdev = NULL,
                 spread = 1, min_dist = 0.01,
                 set_op_mix_ratio = 1.0, local_connectivity = 1.0,
                 bandwidth = 1.0, repulsion_strength = 1.0,
                 negative_sample_rate = 5.0, a = NULL, b = NULL,
                 nn_method = NULL, n_trees = 50,
                 search_k = 2 * n_neighbors * n_trees,
                 approx_pow = FALSE,
                 y = NULL, target_n_neighbors = n_neighbors,
                 target_metric = "euclidean",
                 target_weight = 0.5,
                 pca = NULL, pca_center = TRUE,
                 pcg_rand = TRUE,
                 fast_sgd = FALSE,
                 ret_model = FALSE, ret_nn = FALSE, ret_extra = c(),
                 n_threads = NULL,
                 n_sgd_threads = 0,
                 grain_size = 1,
                 tmpdir = tempdir(),
                 verbose = getOption("verbose", TRUE),
                 batch = FALSE,
                 opt_args = NULL, epoch_callback = NULL, pca_method = NULL,
                 binary_edge_weights = FALSE,
                 dens_scale = NULL,
                 seed = NULL,
                 nn_args = list()) {
  uwot(
    X = X, n_neighbors = n_neighbors, n_components = n_components,
    metric = metric, n_epochs = n_epochs, alpha = learning_rate, scale = scale,
    init = init, init_sdev = init_sdev,
    spread = spread, min_dist = min_dist,
    set_op_mix_ratio = set_op_mix_ratio,
    local_connectivity = local_connectivity, bandwidth = bandwidth,
    gamma = repulsion_strength, negative_sample_rate = negative_sample_rate,
    a = a, b = b, nn_method = nn_method, n_trees = n_trees,
    search_k = search_k,
    method = "umap", approx_pow = approx_pow,
    n_threads = n_threads, n_sgd_threads = n_sgd_threads,
    grain_size = grain_size,
    y = y, target_n_neighbors = target_n_neighbors,
    target_weight = target_weight, target_metric = target_metric,
    pca = pca, pca_center = pca_center, pca_method = pca_method,
    pcg_rand = pcg_rand,
    fast_sgd = fast_sgd,
    ret_model = ret_model || "model" %in% ret_extra,
    ret_nn = ret_nn || "nn" %in% ret_extra,
    ret_fgraph = "fgraph" %in% ret_extra,
    ret_sigma = "sigma" %in% ret_extra,
    ret_localr = "localr" %in% ret_extra,
    batch = batch,
    opt_args = opt_args,
    epoch_callback = epoch_callback,
    binary_edge_weights = binary_edge_weights,
    tmpdir = tmpdir,
    verbose = verbose,
    dens_scale = dens_scale,
    seed = seed,
    nn_args = nn_args
  )
}

#' Dimensionality Reduction Using t-Distributed UMAP (t-UMAP)
#'
#' A faster (but less flexible) version of the UMAP (McInnes et al., 2018)
#' gradient. For more detail on UMAP, see the \code{\link{umap}} function.
#'
#' By setting the UMAP curve parameters \code{a} and \code{b} to \code{1}, you
#' get back the Cauchy distribution as used in t-SNE (van der Maaten and Hinton,
#' 2008) and LargeVis (Tang et al., 2016). It also results in a substantially
#' simplified gradient expression. This can give a speed improvement of around
#' 50\%.
#'
#' @param X Input data. Can be a \code{\link{data.frame}}, \code{\link{matrix}},
#'   \code{\link[stats]{dist}} object or \code{\link[Matrix]{sparseMatrix}}.
#'   Matrix and data frames should contain one observation per row. Data frames
#'   will have any non-numeric columns removed, although factor columns will be
#'   used if explicitly included via \code{metric} (see the help for
#'   \code{metric} for details). A sparse matrix is interpreted as a distance
#'   matrix, and is assumed to be symmetric, so you can also pass in an
#'   explicitly upper or lower triangular sparse matrix to save storage. There
#'   must be at least \code{n_neighbors} non-zero distances for each row. Both
#'   implicit and explicit zero entries are ignored. Set zero distances you want
#'   to keep to an arbitrarily small non-zero value (e.g. \code{1e-10}).
#'   \code{X} can also be \code{NULL} if pre-computed nearest neighbor data is
#'   passed to \code{nn_method}, and \code{init} is not \code{"spca"} or
#'   \code{"pca"}.
#' @param n_neighbors The size of local neighborhood (in terms of number of
#'   neighboring sample points) used for manifold approximation. Larger values
#'   result in more global views of the manifold, while smaller values result in
#'   more local data being preserved. In general values should be in the range
#'   \code{2} to \code{100}.
#' @param n_components The dimension of the space to embed into. This defaults
#'   to \code{2} to provide easy visualization, but can reasonably be set to any
#'   integer value in the range \code{2} to \code{100}.
#' @param metric Type of distance metric to use to find nearest neighbors. For
#'  \code{nn_method = "annoy"} this can be one of:
#' \itemize{
#'   \item \code{"euclidean"} (the default)
#'   \item \code{"cosine"}
#'   \item \code{"manhattan"}
#'   \item \code{"hamming"}
#'   \item \code{"correlation"} (a distance based on the Pearson correlation)
#'   \item \code{"categorical"} (see below)
#' }
#' For \code{nn_method = "hnsw"} this can be one of:
#' \itemize{
#'   \item \code{"euclidean"}
#'   \item \code{"cosine"}
#'   \item \code{"correlation"}
#' }
#' If \href{https://cran.r-project.org/package=rnndescent}{rnndescent} is
#' installed and \code{nn_method = "nndescent"} is specified then many more
#' metrics are available, including:
#' \itemize{
#' \item \code{"braycurtis"}
#' \item \code{"canberra"}
#' \item \code{"chebyshev"}
#' \item \code{"dice"}
#' \item \code{"hamming"}
#' \item \code{"hellinger"}
#' \item \code{"jaccard"}
#' \item \code{"jensenshannon"}
#' \item \code{"kulsinski"}
#' \item \code{"rogerstanimoto"}
#' \item \code{"russellrao"}
#' \item \code{"sokalmichener"}
#' \item \code{"sokalsneath"}
#' \item \code{"spearmanr"}
#' \item \code{"symmetrickl"}
#' \item \code{"tsss"}
#' \item \code{"yule"}
#' }
#' For more details see the package documentation of \code{rnndescent}.
#' For \code{nn_method = "fnn"}, the distance metric is always "euclidean".
#'
#' If \code{X} is a data frame or matrix, then multiple metrics can be
#' specified, by passing a list to this argument, where the name of each item in
#' the list is one of the metric names above. The value of each list item should
#' be a vector giving the names or integer ids of the columns to be included in
#' a calculation, e.g. \code{metric = list(euclidean = 1:4, manhattan = 5:10)}.
#'
#' Each metric calculation results in a separate fuzzy simplicial set, which are
#' intersected together to produce the final set. Metric names can be repeated.
#' Because non-numeric columns are removed from the data frame, it is safer to
#' use column names than integer ids.
#'
#' Factor columns can also be used by specifying the metric name
#' \code{"categorical"}. Factor columns are treated different from numeric
#' columns and although multiple factor columns can be specified in a vector,
#' each factor column specified is processed individually. If you specify
#' a non-factor column, it will be coerced to a factor.
#'
#' For a given data block, you may override the \code{pca} and \code{pca_center}
#' arguments for that block, by providing a list with one unnamed item
#' containing the column names or ids, and then any of the \code{pca} or
#' \code{pca_center} overrides as named items, e.g. \code{metric =
#' list(euclidean = 1:4, manhattan = list(5:10, pca_center = FALSE))}. This
#' exists to allow mixed binary and real-valued data to be included and to have
#' PCA applied to both, but with centering applied only to the real-valued data
#' (it is typical not to apply centering to binary data before PCA is applied).
#' @param n_epochs Number of epochs to use during the optimization of the
#'   embedded coordinates. By default, this value is set to \code{500} for
#'   datasets containing 10,000 vertices or less, and \code{200} otherwise.
#'   If \code{n_epochs = 0}, then coordinates determined by \code{"init"} will
#'   be returned.
#' @param learning_rate Initial learning rate used in optimization of the
#'   coordinates.
#' @param scale Scaling to apply to \code{X} if it is a data frame or matrix:
#' \itemize{
#'   \item{\code{"none"} or \code{FALSE} or \code{NULL}} No scaling.
#'   \item{\code{"Z"} or \code{"scale"} or \code{TRUE}} Scale each column to
#'   zero mean and variance 1.
#'   \item{\code{"maxabs"}} Center each column to mean 0, then divide each
#'   element by the maximum absolute value over the entire matrix.
#'   \item{\code{"range"}} Range scale the entire matrix, so the smallest
#'   element is 0 and the largest is 1.
#'   \item{\code{"colrange"}} Scale each column in the range (0,1).
#' }
#' For t-UMAP, the default is \code{"none"}.
#' @param init Type of initialization for the coordinates. Options are:
#'   \itemize{
#'     \item \code{"spectral"} Spectral embedding using the normalized Laplacian
#'     of the fuzzy 1-skeleton, with Gaussian noise added.
#'     \item \code{"normlaplacian"}. Spectral embedding using the normalized
#'     Laplacian of the fuzzy 1-skeleton, without noise.
#'     \item \code{"random"}. Coordinates assigned using a uniform random
#'     distribution between -10 and 10.
#'     \item \code{"lvrandom"}. Coordinates assigned using a Gaussian
#'     distribution with standard deviation 1e-4, as used in LargeVis
#'     (Tang et al., 2016) and t-SNE.
#'     \item \code{"laplacian"}. Spectral embedding using the Laplacian Eigenmap
#'     (Belkin and Niyogi, 2002).
#'     \item \code{"pca"}. The first two principal components from PCA of
#'     \code{X} if \code{X} is a data frame, and from a 2-dimensional classical
#'     MDS if \code{X} is of class \code{"dist"}.
#'     \item \code{"spca"}. Like \code{"pca"}, but each dimension is then scaled
#'     so the standard deviation is 1e-4, to give a distribution similar to that
#'     used in t-SNE. This is an alias for \code{init = "pca", init_sdev =
#'     1e-4}.
#'     \item \code{"agspectral"} An "approximate global" modification of
#'     \code{"spectral"} which all edges in the graph to a value of 1, and then
#'     sets a random number of edges (\code{negative_sample_rate} edges per
#'     vertex) to 0.1, to approximate the effect of non-local affinities.
#'     \item A matrix of initial coordinates.
#'   }
#'  For spectral initializations, (\code{"spectral"}, \code{"normlaplacian"},
#'  \code{"laplacian"}, \code{"agspectral"}), if more than one connected
#'  component is identified, no spectral initialization is attempted. Instead
#'  a PCA-based initialization is attempted. If \code{verbose = TRUE} the
#'  number of connected components is logged to the console. The existence of
#'  multiple connected components implies that a global view of the data cannot
#'  be attained with this initialization. Increasing the value of
#'  \code{n_neighbors} may help.
#' @param init_sdev If non-\code{NULL}, scales each dimension of the initialized
#'   coordinates (including any user-supplied matrix) to this standard
#'   deviation. By default no scaling is carried out, except when \code{init =
#'   "spca"}, in which case the value is \code{0.0001}. Scaling the input may
#'   help if the unscaled versions result in initial coordinates with large
#'   inter-point distances or outliers. This usually results in small gradients
#'   during optimization and very little progress being made to the layout.
#'   Shrinking the initial embedding by rescaling can help under these
#'   circumstances. Scaling the result of \code{init = "pca"} is usually
#'   recommended, which is why \code{init = "spca"} exists as an alias for
#'   \code{init = "pca", init_sdev = 1e-4}; but for the spectral
#'   initializations the scaled versions usually aren't necessary unless you
#'   are using a large value of \code{n_neighbors} (e.g.
#'   \code{n_neighbors = 150} or higher). For
#'   compatibility with recent versions of the Python UMAP package, if you are
#'   using \code{init = "spectral"}, then you should also set
#'   \code{init_sdev = "range"}, which will range scale each of the columns
#'   containing the initial data between 0 and 10. This is not set by default to
#'   maintain backwards compatibility with previous versions of uwot.
#' @param set_op_mix_ratio Interpolate between (fuzzy) union and intersection as
#'   the set operation used to combine local fuzzy simplicial sets to obtain a
#'   global fuzzy simplicial set. Both fuzzy set operations use the product
#'   t-norm. The value of this parameter should be between \code{0.0} and
#'   \code{1.0}; a value of \code{1.0} will use a pure fuzzy union, while
#'   \code{0.0} will use a pure fuzzy intersection.
#' @param local_connectivity The local connectivity required -- i.e. the number
#'   of nearest neighbors that should be assumed to be connected at a local
#'   level. The higher this value the more connected the manifold becomes
#'   locally. In practice this should be not more than the local intrinsic
#'   dimension of the manifold.
#' @param bandwidth The effective bandwidth of the kernel if we view the
#'   algorithm as similar to Laplacian Eigenmaps. Larger values induce more
#'   connectivity and a more global view of the data, smaller values concentrate
#'   more locally.
#' @param repulsion_strength Weighting applied to negative samples in low
#'   dimensional embedding optimization. Values higher than one will result in
#'   greater weight being given to negative samples.
#' @param negative_sample_rate The number of negative edge/1-simplex samples to
#'   use per positive edge/1-simplex sample in optimizing the low dimensional
#'   embedding.
#' @param nn_method Method for finding nearest neighbors. Options are:
#'   \itemize{
#'     \item \code{"fnn"}. Use exact nearest neighbors via the
#'       \href{https://cran.r-project.org/package=FNN}{FNN} package.
#'     \item \code{"annoy"} Use approximate nearest neighbors via the
#'       \href{https://cran.r-project.org/package=RcppAnnoy}{RcppAnnoy} package.
#'     \item \code{"hnsw"} Use approximate nearest neighbors with the
#'       Hierarchical Navigable Small World (HNSW) method (Malkov and Yashunin,
#'       2018) via the
#'       \href{https://cran.r-project.org/package=RcppHNSW}{RcppHNSW} package.
#'       \code{RcppHNSW} is not a dependency of this package: this option is
#'       only available if you have installed \code{RcppHNSW} yourself. Also,
#'       HNSW only supports the following arguments for \code{metric} and
#'       \code{target_metric}: \code{"euclidean"}, \code{"cosine"} and
#'       \code{"correlation"}.
#'     \item \code{"nndescent"} Use approximate nearest neighbors with the
#'       Nearest Neighbor Descent method (Dong et al., 2011) via the
#'       \href{https://cran.r-project.org/package=rnndescent}{rnndescent}
#'       package. \code{rnndescent} is not a dependency of this package: this
#'       option is only available if you have installed \code{rnndescent}
#'       yourself.
#'    }
#'   By default, if \code{X} has fewer than 4,096 vertices, the exact nearest
#'   neighbors are found. Otherwise, approximate nearest neighbors are used.
#'   You may also pass pre-calculated nearest neighbor data to this argument. It
#'   must be one of two formats, either a list consisting of two elements:
#'   \itemize{
#'     \item \code{"idx"}. A \code{n_vertices x n_neighbors} matrix
#'     containing the integer indexes of the nearest neighbors in \code{X}. Each
#'     vertex is considered to be its own nearest neighbor, i.e.
#'     \code{idx[, 1] == 1:n_vertices}.
#'     \item \code{"dist"}. A \code{n_vertices x n_neighbors} matrix
#'     containing the distances of the nearest neighbors.
#'   }
#'   or a sparse distance matrix of type \code{dgCMatrix}, with dimensions
#'   \code{n_vertices x n_vertices}. Distances should be arranged by column,
#'   i.e. a non-zero entry in row \code{j} of the \code{i}th column indicates
#'   that the \code{j}th observation in \code{X} is a nearest neighbor of the
#'   \code{i}th observation with the distance given by the value of that
#'   element.
#'   The \code{n_neighbors} parameter is ignored when using precomputed
#'   nearest neighbor data. If using the sparse distance matrix input, each
#'   column can contain a different number of neighbors.
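#'   As an illustrative sketch (assuming \code{X} is a numeric matrix and the
#'   default \code{"euclidean"} metric, so the neighbor data returned via
#'   \code{ret_nn = TRUE} is available as the \code{euclidean} item), the same
#'   neighbors can be reused across runs:
#'   \preformatted{
#'   res <- tumap(X, ret_nn = TRUE)
#'   # reuse the neighbors while varying unrelated parameters
#'   res2 <- tumap(X, nn_method = res$nn$euclidean, n_epochs = 100)
#'   }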
#' @param n_trees Number of trees to build when constructing the nearest
#'   neighbor index. The more trees specified, the larger the index, but the
#'   better the results. With \code{search_k}, determines the accuracy of the
#'   Annoy nearest neighbor search. Only used if the \code{nn_method} is
#'   \code{"annoy"}. Sensible values are between \code{10} to \code{100}.
#' @param search_k Number of nodes to search during the neighbor retrieval. The
#'   larger \code{search_k}, the more accurate the results, but the longer the
#'   search takes.
#'   With \code{n_trees}, determines the accuracy of the Annoy nearest neighbor
#'   search. Only used if the \code{nn_method} is \code{"annoy"}.
#' @param nn_args A list containing additional arguments to pass to the nearest
#'   neighbor method. For \code{nn_method = "annoy"}, you can specify
#'   \code{"n_trees"} and \code{"search_k"}, and these will override the
#'   \code{n_trees} and \code{search_k} parameters.
#'   For \code{nn_method = "hnsw"}, you may specify the following arguments:
#'   \itemize{
#'   \item \code{M} The maximum number of neighbors to keep for each vertex.
#'   Reasonable values are \code{2} to \code{100}. Higher values give better
#'   recall at the cost of more memory. Default value is \code{16}.
#'   \item \code{ef_construction} A positive integer specifying the size of
#'   the dynamic list used during index construction. A higher value will
#'   provide better results at the cost of a longer time to build the index.
#'   Default is \code{200}.
#'   \item \code{ef} A positive integer specifying the size of the dynamic
#'   list used during search. This cannot be smaller than \code{n_neighbors}
#'   and cannot be higher than the number of items in the index. Default is
#'   \code{10}.
#'   }
#'   For \code{nn_method = "nndescent"}, you may specify the following
#'   arguments:
#'   \itemize{
#'   \item \code{n_trees} The number of trees to use in a random projection
#'   forest to initialize the search. A larger number will give more accurate
#'   results at the cost of a longer computation time. The default of
#'   \code{NULL} means that the number is chosen based on the number of
#'   observations in \code{X}.
#'   \item \code{max_candidates} The number of potential neighbors to explore
#'   per iteration. By default, this is set to \code{n_neighbors} or \code{60},
#'   whichever is smaller. A larger number will give more accurate results at
#'   the cost of a longer computation time.
#'   \item \code{n_iters} The number of iterations to run the search. A larger
#'   number will give more accurate results at the cost of a longer computation
#'   time. By default, this will be chosen based on the number of observations
#'   in \code{X}. You may also need to modify the convergence criterion
#'   \code{delta}.
#'   \item \code{delta} The minimum relative change in the neighbor graph
#'   allowed before early stopping. Should be a value between 0 and 1. The
#'   smaller the value, the smaller the amount of progress between iterations
#'   that is allowed before the search stops. The default value of \code{0.001}
#'   means that at least 0.1\% of the neighbor graph must be updated at each
#'   iteration.
#'   \item \code{init} How to initialize the nearest neighbor descent. By
#'   default this is set to \code{"tree"} and uses a random project forest.
#'   If you set this to \code{"rand"}, then a random selection is used. Usually
#'   this is less accurate than using RP trees, but for high-dimensional cases,
#'   there may be little difference in the quality of the initialization and
#'   random initialization will be a lot faster. If you set this to
#'   \code{"rand"}, then the \code{n_trees} parameter is ignored.
#'   \item \code{pruning_degree_multiplier} The maximum number of edges per node
#'   to retain in the search graph, relative to \code{n_neighbors}. A larger
#'   value will give more accurate results at the cost of a longer computation
#'   time. Default is \code{1.5}. This parameter only affects neighbor search
#'   when transforming new data with \code{\link{umap_transform}}.
#'   \item \code{epsilon} Controls the degree of the back-tracking when
#'   traversing the search graph. Setting this to \code{0.0} will do a greedy
#'   search with no back-tracking. A larger value will give more accurate
#'   results at the cost of a longer computation time. Default is \code{0.1}.
#'   This parameter only affects neighbor search when transforming new data with
#'   \code{\link{umap_transform}}.
#'   \item \code{max_search_fraction} Specifies the maximum fraction of the
#'   search graph to traverse. By default, this is set to \code{1.0}, so the
#'   entire graph (i.e. all items in \code{X}) may be visited. You may want to
#'   set this to a smaller value if you have a very large dataset (in
#'   conjunction with \code{epsilon}) to avoid an inefficient exhaustive search
#'   of the data in \code{X}. This parameter only affects neighbor search when
#'   transforming new data with \code{\link{umap_transform}}.
#'   }
#'   For \code{nn_method = "nndescent"}, you may specify the following
#'   arguments:
#'   \itemize{
#'   \item \code{n_trees} The number of trees to use in a random projection
#'   forest to initialize the search. A larger number will give more accurate
#'   results at the cost of a longer computation time. The default of
#'   \code{NULL} means that the number is chosen based on the number of
#'   observations in \code{X}.
#'   \item \code{max_candidates} The number of potential neighbors to explore
#'   per iteration. By default, this is set to \code{n_neighbors} or \code{60},
#'   whichever is smaller. A larger number will give more accurate results at
#'   the cost of a longer computation time.
#'   \item \code{n_iters} The number of iterations to run the search. A larger
#'   number will give more accurate results at the cost of a longer computation
#'   time. By default, this will be chosen based on the number of observations
#'   in \code{X}. You may also need to modify the convergence criterion
#'   \code{delta}.
#'   \item \code{delta} The minimum relative change in the neighbor graph
#'   allowed before early stopping. Should be a value between 0 and 1. The
#'   smaller the value, the smaller the amount of progress between iterations is
#'   allowed. Default value of \code{0.001} means that at least 0.1% of the
#'   neighbor graph must be updated at each iteration.
#'   \item \code{init} How to initialize the nearest neighbor descent. By
#'   default this is set to \code{"tree"} and uses a random project forest. If
#'   you set this to \code{"rand"}, then a random selection is used. Usually
#'   this is less accurate than using RP trees, but for high-dimensional cases,
#'   there may be little difference in the quality of the initialization and
#'   random initialization will be a lot faster. If you set this to
#'   \code{"rand"}, then the \code{n_trees} parameter is ignored.
#'   \item \code{pruning_degree_multiplier} The maximum number of edges per node
#'   to retain in the search graph, relative to \code{n_neighbors}. A larger
#'   value will give more accurate results at the cost of a longer computation
#'   time. Default is \code{1.5}. This parameter only affects neighbor search
#'   when transforming new data with \code{\link{umap_transform}}.
#'   \item \code{epsilon} Controls the degree of the back-tracking when
#'   traversing the search graph. Setting this to \code{0.0} will do a greedy
#'   search with no back-tracking. A larger value will give more accurate
#'   results at the cost of a longer computation time. Default is \code{0.1}.
#'   This parameter only affects neighbor search when transforming new data with
#'   \code{\link{umap_transform}}.
#'   \item \code{max_search_fraction} Specifies the maximum fraction of the
#'   search graph to traverse. By default, this is set to \code{1.0}, so the
#'   entire graph (i.e. all items in \code{X}) may be visited. You may want to
#'   set this to a smaller value if you have a very large dataset (in
#'   conjunction with \code{epsilon}) to avoid an inefficient exhaustive search
#'   of the data in \code{X}. This parameter only affects neighbor search when
#'   transforming new data with \code{\link{umap_transform}}.
#'   }
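#'   For example, a minimal sketch of overriding the Annoy accuracy settings
#'   (the values shown are illustrative only, and \code{X} is assumed to be a
#'   numeric matrix):
#'   \preformatted{
#'   res <- tumap(X, nn_method = "annoy",
#'                nn_args = list(n_trees = 25, search_k = 1000))
#'   }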
#' @param y Optional target data for supervised dimension reduction. Can be a
#' vector, matrix or data frame. Use the \code{target_metric} parameter to
#' specify the metrics to use, using the same syntax as \code{metric}. Usually
#' either a single numeric or factor column is used, but more complex formats
#' are possible. The following types are allowed:
#'   \itemize{
#'     \item Factor columns with the same length as \code{X}. \code{NA} is
#'     allowed for any observation with an unknown level, in which case
#'     UMAP operates as a form of semi-supervised learning. Each column is
#'     treated separately.
#'     \item Numeric data. \code{NA} is \emph{not} allowed in this case. Use the
#'     parameter \code{target_n_neighbors} to set the number of neighbors used
#'     with \code{y}. If unset, \code{n_neighbors} is used. Unlike factors,
#'     numeric columns are grouped into one block unless \code{target_metric}
#'     specifies otherwise. For example, if you wish columns \code{a} and
#'     \code{b} to be treated separately, specify
#'     \code{target_metric = list(euclidean = "a", euclidean = "b")}. Otherwise,
#'     the data will be effectively treated as a matrix with two columns.
#'     \item Nearest neighbor data, consisting of a list of two matrices,
#'     \code{idx} and \code{dist}. These represent the precalculated nearest
#'     neighbor indices and distances, respectively. This
#'     is the same format as that expected for precalculated data in
#'     \code{nn_method}. This format assumes that the underlying data was a
#'     numeric vector. Any user-supplied value of the \code{target_n_neighbors}
#'     parameter is ignored in this case, because the number of columns in
#'     the matrices is used for the value. Multiple nearest neighbor data using
#'     different metrics can be supplied by passing a list of these lists.
#'   }
#' Unlike \code{X}, all factor columns included in \code{y} are automatically
#' used.
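#' As a minimal supervised sketch, assuming \code{X} is a numeric matrix and
#' \code{labels} is a hypothetical factor with one level per observation
#' (\code{NA} for unknown labels):
#' \preformatted{
#' res <- tumap(X, y = labels, target_weight = 0.5)
#' }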
#' @param target_n_neighbors Number of nearest neighbors to use to construct the
#'   target simplicial set. Default value is \code{n_neighbors}. Applies only if
#'   \code{y} is non-\code{NULL} and \code{numeric}.
#' @param target_metric The metric used to measure distance for \code{y} if
#'   using supervised dimension reduction. Used only if \code{y} is numeric.
#' @param target_weight Weighting factor between data topology and target
#'   topology. A value of 0.0 weights entirely on data, a value of 1.0 weights
#'   entirely on target. The default of 0.5 balances the weighting equally
#'   between data and target. Only applies if \code{y} is non-\code{NULL}.
#' @param pca If set to a positive integer value, reduce data to this number of
#'   columns using PCA. Not applied if the distance \code{metric} is
#'   \code{"hamming"}, or if the dimensions of the data are not larger than the
#'   number specified (i.e. the number of rows and columns must both be larger than the
#'   value of this parameter). If you have > 100 columns in a data frame or
#'   matrix, reducing the number of columns in this way may substantially
#'   increase the performance of the nearest neighbor search at the cost of a
#'   potential decrease in accuracy. In many t-SNE applications, a value of 50
#'   is recommended, although there's no guarantee that this is appropriate for
#'   all settings.
#' @param pca_center If \code{TRUE}, center the columns of \code{X} before
#'   carrying out PCA. For binary data, it's recommended to set this to
#'   \code{FALSE}.
#' @param pca_method Method to carry out any PCA dimensionality reduction when
#'   the \code{pca} parameter is specified. Allowed values are:
#'   \itemize{
#'     \item{\code{"irlba"}}. Uses \code{\link[irlba]{prcomp_irlba}} from the
#'     \href{https://cran.r-project.org/package=irlba}{irlba} package.
#'     \item{\code{"rsvd"}}. Uses 5 iterations of \code{\link[irlba]{svdr}} from
#'     the \href{https://cran.r-project.org/package=irlba}{irlba} package.
#'     This is likely to give much faster but potentially less accurate results
#'     than using \code{"irlba"}. For the purposes of nearest neighbor
#'     calculation and coordinate initialization, any loss of accuracy doesn't
#'     seem to matter much.
#'     \item{\code{"bigstatsr"}}. Uses \code{\link[bigstatsr]{big_randomSVD}}
#'     from the \href{https://cran.r-project.org/package=bigstatsr}{bigstatsr}
#'     package. The SVD methods used in \code{bigstatsr} may be faster on
#'     systems without access to efficient linear algebra libraries (e.g.
#'     Windows). \strong{Note}: \code{bigstatsr} is \emph{not} a dependency of
#'     uwot: if you choose to use this package for PCA, you \emph{must} install
#'     it yourself.
#'     \item{\code{"svd"}}. Uses \code{\link[base]{svd}} for the SVD. This is
#'     likely to be slow for all but the smallest datasets.
#'     \item{\code{"auto"}} (the default). Uses \code{"irlba"}, unless more than
#'     50\% of the full set of singular vectors would be calculated, in which
#'     case \code{"svd"} is used.
#'   }
#' @param pcg_rand If \code{TRUE}, use the PCG random number generator (O'Neill,
#'   2014) during optimization. Otherwise, use the faster (but probably less
#'   statistically good) Tausworthe "taus88" generator. The default is
#'   \code{TRUE}.
#' @param fast_sgd If \code{TRUE}, then the following combination of parameters
#'   is set: \code{pcg_rand = FALSE} and \code{n_sgd_threads = "auto"}. The
#'   default is \code{FALSE}. Setting this to \code{TRUE} will speed up the
#'   stochastic optimization phase, but give a potentially less accurate
#'   embedding, and one which will not be exactly reproducible even with a
#'   fixed seed. For visualization, \code{fast_sgd = TRUE} will give perfectly
#'   good results. For more generic dimensionality reduction, it's safer to
#'   leave \code{fast_sgd = FALSE}. If \code{fast_sgd = TRUE}, then
#'   user-supplied values of \code{pcg_rand} and \code{n_sgd_threads} are
#'   ignored.
#' @param batch If \code{TRUE}, then embedding coordinates are updated at the
#'   end of each epoch rather than during the epoch. In batch mode, results are
#'   reproducible with a fixed random seed even with \code{n_sgd_threads > 1},
#'   at the cost of a slightly higher memory use. You may also have to modify
#'   \code{learning_rate} and increase \code{n_epochs}, so whether this provides
#'   a speed increase over the single-threaded optimization is likely to be
#'   dataset and hardware-dependent.
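#'   For example, a run intended to be reproducible while using multiple
#'   threads during optimization (a sketch; the thread count and seed are
#'   illustrative, and \code{X} is assumed to be a numeric matrix):
#'   \preformatted{
#'   res <- tumap(X, batch = TRUE, n_sgd_threads = 4, seed = 42)
#'   }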
#' @param ret_model If \code{TRUE}, then return extra data that can be used to
#'   add new data to an existing embedding via \code{\link{umap_transform}}. The
#'   embedded coordinates are returned as the list item \code{embedding}. If
#'   \code{FALSE}, just return the coordinates. This parameter can be used in
#'   conjunction with \code{ret_nn} and \code{ret_extra}. Note that some
#'   settings are incompatible with the production of a UMAP model: external
#'   neighbor data (passed via a list to \code{nn_method}), and factor columns
#'   that were included via the \code{metric} parameter. In the latter case, the
#'   model produced is based only on the numeric data. A transformation using
#'   new data is possible, but the factor columns in the new data are ignored.
#'   Note that setting \code{ret_model = TRUE} forces the use of the approximate
#'   nearest neighbors method. Because small datasets would otherwise use exact
#'   nearest neighbor calculations, setting \code{ret_model = TRUE} means that
#'   different results may be returned for small datasets in terms of both the
#'   returned nearest neighbors (if requested) and the final embedded
#'   coordinates, compared to \code{ret_model = FALSE}, even if the random
#'   number seed is fixed. To avoid this, explicitly set
#'   \code{nn_method = "annoy"} in the \code{ret_model = FALSE} case.
#' @param ret_nn If \code{TRUE}, then in addition to the embedding, also return
#'   nearest neighbor data that can be used as input to \code{nn_method} to
#'   avoid the overhead of repeatedly calculating the nearest neighbors when
#'   manipulating unrelated parameters (e.g. \code{min_dist}, \code{n_epochs},
#'   \code{init}). See the "Value" section for the names of the list items. If
#'   \code{FALSE}, just return the coordinates. Note that the nearest neighbors
#'   could be sensitive to data scaling, so be wary of reusing nearest neighbor
#'   data if modifying the \code{scale} parameter. This parameter can be used in
#'   conjunction with \code{ret_model} and \code{ret_extra}.
#' @param ret_extra A vector indicating what extra data to return. May contain
#'   any combination of the following strings:
#'   \itemize{
#'     \item \code{"model"} Same as setting \code{ret_model = TRUE}.
#'     \item \code{"nn"} Same as setting \code{ret_nn = TRUE}.
#'     \item \code{"fgraph"} the high dimensional fuzzy graph (i.e. the fuzzy
#'       simplicial set of the merged local views of the input data). The graph
#'       is returned as a sparse symmetric N x N matrix of class
#'       \link[Matrix]{dgCMatrix-class}, where a non-zero entry (i, j) gives the
#'       membership strength of the edge connecting vertex i and vertex j. This
#'       can be considered analogous to the input probability (or similarity or
#'       affinity) used in t-SNE and LargeVis. Note that the graph is further
#'       sparsified by removing edges with sufficiently low membership strength
#'       that they would not be sampled by the probabilistic edge sampling
#'       employed for optimization and therefore the number of non-zero elements
#'       in the matrix is dependent on \code{n_epochs}. If you are only
#'       interested in the fuzzy input graph (e.g. for clustering), setting
#'       \code{n_epochs = 0} will avoid any further sparsifying. Be aware that
#'       setting \code{binary_edge_weights = TRUE} will affect this graph (all
#'       non-zero edge weights will be 1).
#'    \item \code{"sigma"} the normalization value for each observation in the
#'       dataset when constructing the smoothed distances to each of its
#'       neighbors. This gives some sense of the local density of each
#'       observation in the high dimensional space: higher values of
#'       \code{sigma} indicate a higher dispersion or lower density.
#'   }
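#'   For example, to fetch the fuzzy graph alongside the neighbor data (a
#'   sketch, assuming \code{X} is a numeric matrix; see the "Value" section
#'   for the names of the returned items):
#'   \preformatted{
#'   res <- tumap(X, ret_extra = c("nn", "fgraph"))
#'   fuzzy_graph <- res$fgraph
#'   }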
#' @param n_threads Number of threads to use (except during stochastic gradient
#'   descent). Default is half the number of concurrent threads supported by the
#'   system. For nearest neighbor search, only applies if
#'   \code{nn_method = "annoy"}. If \code{n_threads > 1}, then the Annoy index
#'   will be temporarily written to disk in the location determined by
#'   \code{\link[base]{tempfile}}.
#' @param n_sgd_threads Number of threads to use during stochastic gradient
#'   descent. If set to > 1, then be aware that if \code{batch = FALSE}, results
#'   will \emph{not} be reproducible, even if \code{set.seed} is called with a
#'   fixed seed before running. Set to \code{"auto"} to use the same value as
#'   \code{n_threads}.
#' @param grain_size The minimum amount of work to do on each thread. If this
#'   value is set high enough, then fewer than \code{n_threads} or
#'   \code{n_sgd_threads} threads will be used for processing, which might give
#'   a performance improvement if the overhead of thread management and context
#'   switching would otherwise outweigh the improvement due to concurrent
#'   processing. This should be left at the default (\code{1}), in which case
#'   work will be spread evenly over all the threads specified.
#' @param tmpdir Temporary directory to store nearest neighbor indexes during
#'   nearest neighbor search. Default is \code{\link{tempdir}}. The index is
#'   only written to disk if \code{n_threads > 1} and
#'   \code{nn_method = "annoy"}; otherwise, this parameter is ignored.
#' @param verbose If \code{TRUE}, log details to the console.
#' @param opt_args A list of optimizer parameters, used when
#'   \code{batch = TRUE}. The default optimization method used is Adam (Kingma
#'   and Ba, 2014).
#'   \itemize{
#'     \item \code{method} The optimization method to use. Either \code{"adam"}
#'     or \code{"sgd"} (stochastic gradient descent). Default: \code{"adam"}.
#'     \item \code{beta1} (Adam only). The weighting parameter for the
#'     exponential moving average of the first moment estimator. Effectively the
#'     momentum parameter. Should be a floating point value between 0 and 1.
#'     Higher values can smooth oscillatory updates in poorly-conditioned
#'     situations and may allow for a larger \code{learning_rate} to be
#'     specified, but too high can cause divergence. Default: \code{0.5}.
#'     \item \code{beta2} (Adam only). The weighting parameter for the
#'     exponential moving average of the uncentered second moment estimator.
#'     Should be a floating point value between 0 and 1. Controls the degree of
#'     adaptivity in the step-size. Higher values put more weight on previous
#'     time steps. Default: \code{0.9}.
#'     \item \code{eps} (Adam only). Intended to be a small value to prevent
#'     division by zero, but in practice can also affect convergence due to its
#'     interaction with \code{beta2}. Higher values reduce the effect of the
#'     step-size adaptivity and bring the behavior closer to stochastic gradient
#'     descent with momentum. Typical values are between 1e-8 and 1e-3. Default:
#'     \code{1e-7}.
#'     \item \code{alpha} The initial learning rate. Default: the value of the
#'     \code{learning_rate} parameter.
#'   }
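#'   For example (a sketch; the parameter values are illustrative only and
#'   \code{X} is assumed to be a numeric matrix):
#'   \preformatted{
#'   res <- tumap(X, batch = TRUE,
#'                opt_args = list(beta1 = 0.9, eps = 1e-8))
#'   }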
#' @param epoch_callback A function which will be invoked at the end of every
#'   epoch. Its signature should be: \code{(epoch, n_epochs, coords)}, where:
#'   \itemize{
#'     \item \code{epoch} The current epoch number (between \code{1} and
#'     \code{n_epochs}).
#'     \item \code{n_epochs} Number of epochs to use during the optimization of
#'     the embedded coordinates.
#'     \item \code{coords} The embedded coordinates as of the end of the current
#'     epoch, as a matrix with dimensions (N, \code{n_components}).
#'   }
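#'   As a sketch, a callback that logs progress every 10 epochs (the reporting
#'   interval is arbitrary and \code{X} is assumed to be a numeric matrix):
#'   \preformatted{
#'   cb <- function(epoch, n_epochs, coords) {
#'     if (epoch \%\% 10 == 0) {
#'       message("epoch ", epoch, " of ", n_epochs)
#'     }
#'   }
#'   res <- tumap(X, epoch_callback = cb)
#'   }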
#' @param binary_edge_weights If \code{TRUE} then edge weights in the input
#'   graph are treated as binary (0/1) rather than real valued. This affects the
#'   sampling frequency of neighbors and is the strategy used by the PaCMAP
#'   method (Wang and co-workers, 2021). Practical (Böhm and co-workers, 2020)
#'   and theoretical (Damrich and Hamprecht, 2021) work suggests this has little
#'   effect on UMAP's performance.
#' @param seed Integer seed to use to initialize the random number generator
#'   state. Combined with \code{n_sgd_threads = 1} or \code{batch = TRUE}, this
#'   should give consistent output across multiple runs on a given installation.
#'   Setting this value is equivalent to calling \code{\link[base]{set.seed}},
#'   but it may be more convenient in some situations than having to call a
#'   separate function. The default is to not set a seed. If
#'   \code{ret_model = TRUE}, the seed will be stored in the output model and
#'   then used to set the seed inside \code{\link{umap_transform}}.
#' @return A matrix of optimized coordinates, or:
#'   \itemize{
#'     \item if \code{ret_model = TRUE} (or \code{ret_extra} contains
#'     \code{"model"}), returns a list containing extra information that can be
#'     used to add new data to an existing embedding via
#'     \code{\link{umap_transform}}. In this case, the coordinates are available
#'     in the list item \code{embedding}. \bold{NOTE}: The contents of
#'     the \code{model} list should \emph{not} be considered stable or part of
#'     the public API, and are purposely left undocumented.
#'     \item if \code{ret_nn = TRUE} (or \code{ret_extra} contains \code{"nn"}),
#'     returns the nearest neighbor data as a list called \code{nn}. This
#'     contains one list for each \code{metric} calculated, itself containing a
#'     matrix \code{idx} with the integer ids of the neighbors; and a matrix
#'     \code{dist} with the distances. The \code{nn} list (or a sub-list) can be
#'     used as input to the \code{nn_method} parameter.
#'     \item if \code{ret_extra} contains \code{"fgraph"} returns the high
#'     dimensional fuzzy graph as a sparse matrix called \code{fgraph}, of type
#'     \link[Matrix]{dgCMatrix-class}.
#'     \item if \code{ret_extra} contains \code{"sigma"}, returns a vector of the
#'     smooth knn distance normalization terms for each observation as
#'     \code{"sigma"} and a vector \code{"rho"} containing the largest
#'     distance to the locally connected neighbors of each observation.
#'     \item if \code{ret_extra} contains \code{"localr"}, returns a vector of
#'     the estimated local radii, the sum of \code{"sigma"} and \code{"rho"}.
#'   }
#'   The returned list contains the combined data from any combination of
#'   specifying \code{ret_model}, \code{ret_nn} and \code{ret_extra}.
#' @examples
#' iris_tumap <- tumap(iris, n_neighbors = 50, learning_rate = 0.5)
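#'
#' # a sketch of also returning a model for later use with umap_transform
#' \dontrun{
#' iris_tumap_model <- tumap(iris, ret_model = TRUE)
#' }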
#'
#' @references
#' Belkin, M., & Niyogi, P. (2002).
#' Laplacian eigenmaps and spectral techniques for embedding and clustering.
#' In \emph{Advances in neural information processing systems}
#' (pp. 585-591).
#' \url{http://papers.nips.cc/paper/1961-laplacian-eigenmaps-and-spectral-techniques-for-embedding-and-clustering.pdf}
#'
#' Böhm, J. N., Berens, P., & Kobak, D. (2020).
#' A unifying perspective on neighbor embeddings along the attraction-repulsion spectrum.
#' \emph{arXiv preprint} \emph{arXiv:2007.08902}.
#' \url{https://arxiv.org/abs/2007.08902}
#'
#' Damrich, S., & Hamprecht, F. A. (2021).
#' On UMAP's true loss function.
#' \emph{Advances in Neural Information Processing Systems}, \emph{34}.
#' \url{https://proceedings.neurips.cc/paper/2021/hash/2de5d16682c3c35007e4e92982f1a2ba-Abstract.html}
#'
#' Dong, W., Moses, C., & Li, K. (2011, March).
#' Efficient k-nearest neighbor graph construction for generic similarity measures.
#' In \emph{Proceedings of the 20th international conference on World Wide Web}
#' (pp. 577-586).
#' ACM.
#' \doi{10.1145/1963405.1963487}.
#'
#' Kingma, D. P., & Ba, J. (2014).
#' Adam: A method for stochastic optimization.
#' \emph{arXiv preprint} \emph{arXiv}:1412.6980.
#' \url{https://arxiv.org/abs/1412.6980}
#'
#' Malkov, Y. A., & Yashunin, D. A. (2018).
#' Efficient and robust approximate nearest neighbor search using hierarchical
#' navigable small world graphs.
#' \emph{IEEE transactions on pattern analysis and machine intelligence}, \emph{42}(4), 824-836.
#'
#' McInnes, L., Healy, J., & Melville, J. (2018).
#' UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction.
#' \emph{arXiv preprint} \emph{arXiv}:1802.03426.
#' \url{https://arxiv.org/abs/1802.03426}
#'
#' O’Neill, M. E. (2014).
#' \emph{PCG: A family of simple fast space-efficient statistically good
#' algorithms for random number generation}
#' (Report No. HMC-CS-2014-0905). Harvey Mudd College.
#'
#' Tang, J., Liu, J., Zhang, M., & Mei, Q. (2016, April).
#' Visualizing large-scale and high-dimensional data.
#' In \emph{Proceedings of the 25th International Conference on World Wide Web}
#' (pp. 287-297).
#' International World Wide Web Conferences Steering Committee.
#' \url{https://arxiv.org/abs/1602.00370}
#'
#' Van der Maaten, L., & Hinton, G. (2008).
#' Visualizing data using t-SNE.
#' \emph{Journal of Machine Learning Research}, \emph{9}, 2579-2605.
#' \url{https://www.jmlr.org/papers/v9/vandermaaten08a.html}
#'
#' Wang, Y., Huang, H., Rudin, C., & Shaposhnik, Y. (2021).
#' Understanding How Dimension Reduction Tools Work: An Empirical Approach to Deciphering t-SNE, UMAP, TriMap, and PaCMAP for Data Visualization.
#' \emph{Journal of Machine Learning Research}, \emph{22}(201), 1-73.
#' \url{https://www.jmlr.org/papers/v22/20-1061.html}
#'
#' @export
tumap <- function(X, n_neighbors = 15, n_components = 2, metric = "euclidean",
                  n_epochs = NULL,
                  learning_rate = 1, scale = FALSE,
                  init = "spectral", init_sdev = NULL,
                  set_op_mix_ratio = 1.0, local_connectivity = 1.0,
                  bandwidth = 1.0, repulsion_strength = 1.0,
                  negative_sample_rate = 5.0,
                  nn_method = NULL, n_trees = 50,
                  search_k = 2 * n_neighbors * n_trees,
                  n_threads = NULL,
                  n_sgd_threads = 0,
                  grain_size = 1,
                  y = NULL, target_n_neighbors = n_neighbors,
                  target_metric = "euclidean",
                  target_weight = 0.5,
                  pca = NULL, pca_center = TRUE,
                  pcg_rand = TRUE,
                  fast_sgd = FALSE,
                  ret_model = FALSE, ret_nn = FALSE, ret_extra = c(),
                  tmpdir = tempdir(),
                  verbose = getOption("verbose", TRUE),
                  batch = FALSE,
                  opt_args = NULL, epoch_callback = NULL,
                  pca_method = NULL,
                  binary_edge_weights = FALSE,
                  seed = NULL,
                  nn_args = list()) {
  uwot(
    X = X, n_neighbors = n_neighbors, n_components = n_components,
    metric = metric,
    n_epochs = n_epochs, alpha = learning_rate, scale = scale,
    init = init, init_sdev = init_sdev,
    spread = NULL, min_dist = NULL, set_op_mix_ratio = set_op_mix_ratio,
    local_connectivity = local_connectivity, bandwidth = bandwidth,
    gamma = repulsion_strength, negative_sample_rate = negative_sample_rate,
    a = NULL, b = NULL, nn_method = nn_method, n_trees = n_trees,
    search_k = search_k,
    method = "tumap",
    n_threads = n_threads, n_sgd_threads = n_sgd_threads,
    grain_size = grain_size,
    y = y, target_n_neighbors = target_n_neighbors,
    target_weight = target_weight, target_metric = target_metric,
    pca = pca, pca_center = pca_center, pca_method = pca_method,
    pcg_rand = pcg_rand,
    fast_sgd = fast_sgd,
    ret_model = ret_model || "model" %in% ret_extra,
    ret_nn = ret_nn || "nn" %in% ret_extra,
    ret_fgraph = "fgraph" %in% ret_extra,
    ret_sigma = "sigma" %in% ret_extra,
    ret_localr = "localr" %in% ret_extra,
    batch = batch,
    opt_args = opt_args,
    epoch_callback = epoch_callback,
    binary_edge_weights = binary_edge_weights,
    seed = seed,
    tmpdir = tmpdir,
    verbose = verbose,
    nn_args = nn_args
  )
}

#' Dimensionality Reduction with a LargeVis-like method
#'
#' Carry out dimensionality reduction of a dataset using a method similar to
#' LargeVis (Tang et al., 2016).
#'
#' \code{lvish} differs from the official LargeVis implementation in the
#' following:
#'
#' \itemize{
#'   \item Only the nearest-neighbor index search phase is multi-threaded.
#'   \item Matrix input data is not normalized.
#'   \item The \code{n_trees} parameter cannot be dynamically chosen based on
#'   data set size.
#'   \item Nearest neighbor results are not refined via the
#'   neighbor-of-my-neighbor method. The \code{search_k} parameter is twice
#'   as large as the default to compensate.
#'   \item Gradient values are clipped to \code{4.0} rather than \code{5.0}.
#'   \item Negative edges are generated by uniform sampling of vertices rather
#'   than in proportion to their degree ^ 0.75.
#'   \item The default number of samples is much reduced. The default number of
#'   epochs, \code{n_epochs}, is set to \code{5000}, much larger than for
#'   \code{\link{umap}}, but may need to be increased further depending on your
#'   dataset. Using \code{init = "spectral"} can help.
#' }
#'
#' @param X Input data. Can be a \code{\link{data.frame}}, \code{\link{matrix}},
#'   \code{\link[stats]{dist}} object or \code{\link[Matrix]{sparseMatrix}}.
#'   Matrix and data frames should contain one observation per row. Data frames
#'   will have any non-numeric columns removed, although factor columns will be
#'   used if explicitly included via \code{metric} (see the help for
#'   \code{metric} for details). A sparse matrix is interpreted as a distance
#'   matrix, and is assumed to be symmetric, so you can also pass in an
#'   explicitly upper or lower triangular sparse matrix to save storage. There
#'   must be at least \code{n_neighbors} non-zero distances for each row. Both
#'   implicit and explicit zero entries are ignored. Set zero distances you want
#'   to keep to an arbitrarily small non-zero value (e.g. \code{1e-10}).
#'   \code{X} can also be \code{NULL} if pre-computed nearest neighbor data is
#'   passed to \code{nn_method}, and \code{init} is not \code{"spca"} or
#'   \code{"pca"}.
#' @param perplexity Controls the size of the local neighborhood used for
#'   manifold approximation. This is analogous to \code{n_neighbors} in
#'   \code{\link{umap}}. Change this, rather than \code{n_neighbors}.
#' @param n_neighbors The number of neighbors to use when calculating the
#'  \code{perplexity}. Usually set to three times the value of the
#'  \code{perplexity}. Must be at least as large as \code{perplexity}.
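#'  For example (a sketch, assuming \code{X} is a numeric matrix, with
#'  \code{n_neighbors} set to three times the perplexity):
#'  \preformatted{
#'  res <- lvish(X, perplexity = 30, n_neighbors = 90)
#'  }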
#' @param n_components The dimension of the space to embed into. This defaults
#'   to \code{2} to provide easy visualization, but can reasonably be set to any
#'   integer value in the range \code{2} to \code{100}.
#' @param metric Type of distance metric to use to find nearest neighbors. For
#'  \code{nn_method = "annoy"} this can be one of:
#' \itemize{
#'   \item \code{"euclidean"} (the default)
#'   \item \code{"cosine"}
#'   \item \code{"manhattan"}
#'   \item \code{"hamming"}
#'   \item \code{"correlation"} (a distance based on the Pearson correlation)
#'   \item \code{"categorical"} (see below)
#' }
#' For \code{nn_method = "hnsw"} this can be one of:
#' \itemize{
#'   \item \code{"euclidean"}
#'   \item \code{"cosine"}
#'   \item \code{"correlation"}
#' }
#' If \href{https://cran.r-project.org/package=rnndescent}{rnndescent} is
#' installed and \code{nn_method = "nndescent"} is specified then many more
#' metrics are available, including:
#' \itemize{
#' \item \code{"braycurtis"}
#' \item \code{"canberra"}
#' \item \code{"chebyshev"}
#' \item \code{"dice"}
#' \item \code{"hamming"}
#' \item \code{"hellinger"}
#' \item \code{"jaccard"}
#' \item \code{"jensenshannon"}
#' \item \code{"kulsinski"}
#' \item \code{"rogerstanimoto"}
#' \item \code{"russellrao"}
#' \item \code{"sokalmichener"}
#' \item \code{"sokalsneath"}
#' \item \code{"spearmanr"}
#' \item \code{"symmetrickl"}
#' \item \code{"tsss"}
#' \item \code{"yule"}
#' }
#' For more details see the package documentation of \code{rnndescent}.
#' For \code{nn_method = "fnn"}, the distance metric is always "euclidean".
#'
#' If \code{X} is a data frame or matrix, then multiple metrics can be
#' specified, by passing a list to this argument, where the name of each item in
#' the list is one of the metric names above. The value of each list item should
#' be a vector giving the names or integer ids of the columns to be included in
#' a calculation, e.g. \code{metric = list(euclidean = 1:4, manhattan = 5:10)}.
#'
#' Each metric calculation results in a separate fuzzy simplicial set, which are
#' intersected together to produce the final set. Metric names can be repeated.
#' Because non-numeric columns are removed from the data frame, it is safer to
#' use column names than integer ids.
#'
#' Factor columns can also be used by specifying the metric name
#' \code{"categorical"}. Factor columns are treated different from numeric
#' columns and although multiple factor columns can be specified in a vector,
#' each factor column specified is processed individually. If you specify
#' a non-factor column, it will be coerced to a factor.
#'
#' For a given data block, you may override the \code{pca} and \code{pca_center}
#' arguments for that block, by providing a list with one unnamed item
#' containing the column names or ids, and then any of the \code{pca} or
#' \code{pca_center} overrides as named items, e.g. \code{metric =
#' list(euclidean = 1:4, manhattan = list(5:10, pca_center = FALSE))}. This
#' exists to allow mixed binary and real-valued data to be included and to have
#' PCA applied to both, but with centering applied only to the real-valued data
#' (it is typical not to apply centering to binary data before PCA is applied).
#' @param n_epochs Number of epochs to use during the optimization of the
#'   embedded coordinates. The default is to calculate the number of epochs
#'   dynamically based on dataset size, to give the same number of edge samples
#'   as the LargeVis defaults. This is usually substantially larger than the
#'   UMAP defaults. If \code{n_epochs = 0}, then coordinates determined by
#'   \code{"init"} will be returned.
#' @param learning_rate Initial learning rate used in optimization of the
#'   coordinates.
#' @param scale Scaling to apply to \code{X} if it is a data frame or matrix:
#' \itemize{
#'   \item{\code{"none"} or \code{FALSE} or \code{NULL}} No scaling.
#'   \item{\code{"Z"} or \code{"scale"} or \code{TRUE}} Scale each column to
#'   zero mean and variance 1.
#'   \item{\code{"maxabs"}} Center each column to mean 0, then divide each
#'   element by the maximum absolute value over the entire matrix.
#'   \item{\code{"range"}} Range scale the entire matrix, so the smallest
#'   element is 0 and the largest is 1.
#'   \item{\code{"colrange"}} Scale each column in the range (0,1).
#' }
#' For lvish, the default is \code{"maxabs"}, for consistency with LargeVis.
#' @param init Type of initialization for the coordinates. Options are:
#'   \itemize{
#'     \item \code{"spectral"} Spectral embedding using the normalized Laplacian
#'     of the fuzzy 1-skeleton, with Gaussian noise added.
#'     \item \code{"normlaplacian"}. Spectral embedding using the normalized
#'     Laplacian of the fuzzy 1-skeleton, without noise.
#'     \item \code{"random"}. Coordinates assigned using a uniform random
#'     distribution between -10 and 10.
#'     \item \code{"lvrandom"}. Coordinates assigned using a Gaussian
#'     distribution with standard deviation 1e-4, as used in LargeVis
#'     (Tang et al., 2016) and t-SNE.
#'     \item \code{"laplacian"}. Spectral embedding using the Laplacian Eigenmap
#'     (Belkin and Niyogi, 2002).
#'     \item \code{"pca"}. The first two principal components from PCA of
#'     \code{X} if \code{X} is a data frame, and from a 2-dimensional classical
#'     MDS if \code{X} is of class \code{"dist"}.
#'     \item \code{"spca"}. Like \code{"pca"}, but each dimension is then scaled
#'     so the standard deviation is 1e-4, to give a distribution similar to that
#'     used in t-SNE and LargeVis. This is an alias for \code{init = "pca",
#'     init_sdev = 1e-4}.
#'     \item \code{"agspectral"} An "approximate global" modification of
#'     \code{"spectral"} which all edges in the graph to a value of 1, and then
#'     sets a random number of edges (\code{negative_sample_rate} edges per
#'     vertex) to 0.1, to approximate the effect of non-local affinities.
#'     \item A matrix of initial coordinates.
#'   }
#'  For spectral initializations, (\code{"spectral"}, \code{"normlaplacian"},
#'  \code{"laplacian"}, \code{"agspectral"}), if more than one connected
#'  component is identified, no spectral initialization is attempted. Instead
#'  a PCA-based initialization is attempted. If \code{verbose = TRUE} the
#'  number of connected components is logged to the console. The existence of
#'  multiple connected components implies that a global view of the data cannot
#'  be attained with this initialization. Increasing the value of
#'  \code{n_neighbors} may help.
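#'  As a sketch of supplying a matrix of initial coordinates, here derived from
#'  PCA via \code{\link[stats]{prcomp}} (any N x \code{n_components} matrix
#'  would do, and \code{X} is assumed to be a numeric matrix):
#'  \preformatted{
#'  pca2 <- prcomp(X, rank. = 2)$x
#'  res <- lvish(X, init = pca2)
#'  }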
#' @param init_sdev If non-\code{NULL}, scales each dimension of the initialized
#'   coordinates (including any user-supplied matrix) to this standard
#'   deviation. By default no scaling is carried out, except when \code{init =
#'   "spca"}, in which case the value is \code{0.0001}. Scaling the input may
#'   help if the unscaled versions result in initial coordinates with large
#'   inter-point distances or outliers. This usually results in small gradients
#'   during optimization and very little progress being made to the layout.
#'   Shrinking the initial embedding by rescaling can help under these
#'   circumstances. Scaling the result of \code{init = "pca"} is usually
#'   recommended, which is why \code{init = "spca"} is provided as an alias for
#'   \code{init = "pca", init_sdev = 1e-4}; for the spectral initializations,
#'   however, the scaled versions
#'   usually aren't necessary unless you are using a large value of
#'   \code{n_neighbors} (e.g. \code{n_neighbors = 150} or higher). For
#'   compatibility with recent versions of the Python UMAP package, if you are
#'   using \code{init = "spectral"}, then you should also set
#'   \code{init_sdev = "range"}, which will range scale each of the columns
#'   containing the initial data between 0-10. This is not set by default to
#'   maintain backwards compatibility with previous versions of uwot.
#' @param repulsion_strength Weighting applied to negative samples in low
#'   dimensional embedding optimization. Values higher than one will result in
#'   greater weight being given to negative samples.
#' @param negative_sample_rate The number of negative edge/1-simplex samples to
#'   use per positive edge/1-simplex sample in optimizing the low dimensional
#'   embedding.
#' @param nn_method Method for finding nearest neighbors. Options are:
#'   \itemize{
#'     \item \code{"fnn"}. Use exact nearest neighbors via the
#'       \href{https://cran.r-project.org/package=FNN}{FNN} package.
#'     \item \code{"annoy"} Use approximate nearest neighbors via the
#'       \href{https://cran.r-project.org/package=RcppAnnoy}{RcppAnnoy} package.
#'     \item \code{"hnsw"} Use approximate nearest neighbors with the
#'       Hierarchical Navigable Small World (HNSW) method (Malkov and Yashunin,
#'       2018) via the
#'       \href{https://cran.r-project.org/package=RcppHNSW}{RcppHNSW} package.
#'       \code{RcppHNSW} is not a dependency of this package: this option is
#'       only available if you have installed \code{RcppHNSW} yourself. Also,
#'       HNSW only supports the following arguments for \code{metric}:
#'       \code{"euclidean"}, \code{"cosine"} and \code{"correlation"}.
#'     \item \code{"nndescent"} Use approximate nearest neighbors with the
#'       Nearest Neighbor Descent method (Dong et al., 2011) via the
#'       \href{https://cran.r-project.org/package=rnndescent}{rnndescent}
#'       package. \code{rnndescent} is not a dependency of this package: this
#'       option is only available if you have installed \code{rnndescent}
#'       yourself.
#'    }
#'   By default, if \code{X} has fewer than 4,096 vertices, the exact nearest
#'   neighbors are found. Otherwise, approximate nearest neighbors are used.
#'   You may also pass precalculated nearest neighbor data to this argument. It
#'   must be a list consisting of two elements:
#'   \itemize{
#'     \item \code{"idx"}. A \code{n_vertices x n_neighbors} matrix
#'     containing the integer indexes of the nearest neighbors in \code{X}. Each
#'     vertex is considered to be its own nearest neighbor, i.e.
#'     \code{idx[, 1] == 1:n_vertices}.
#'     \item \code{"dist"}. A \code{n_vertices x n_neighbors} matrix
#'     containing the distances of the nearest neighbors.
#'   }
#'   Multiple nearest neighbor data (e.g. from two different precomputed
#'   metrics) can be passed by passing a list containing the nearest neighbor
#'   data lists as items.
#'   The \code{n_neighbors} parameter is ignored when using precomputed
#'   nearest neighbor data.
#' @param n_trees Number of trees to build when constructing the nearest
#'   neighbor index. The more trees specified, the larger the index, but the
#'   better the results. With \code{search_k}, determines the accuracy of the
#'   Annoy nearest neighbor search. Only used if the \code{nn_method} is
#'   \code{"annoy"}. Sensible values are between \code{10} to \code{100}.
#' @param search_k Number of nodes to search during the neighbor retrieval. The
#'   larger \code{search_k}, the more accurate the results, but the longer the
#'   search takes.
#'   With \code{n_trees}, determines the accuracy of the Annoy nearest neighbor
#'   search. Only used if the \code{nn_method} is \code{"annoy"}.
#' @param nn_args A list containing additional arguments to pass to the nearest
#'   neighbor method. For \code{nn_method = "annoy"}, you can specify
#'   \code{"n_trees"} and \code{"search_k"}, and these will override the
#'   \code{n_trees} and \code{search_k} parameters.
#'   For \code{nn_method = "hnsw"}, you may specify the following arguments:
#'   \itemize{
#'   \item \code{M} The maximum number of neighbors to keep for each vertex.
#'   Reasonable values are \code{2} to \code{100}. Higher values give better
#'   recall at the cost of more memory. Default value is \code{16}.
#'   \item \code{ef_construction} A positive integer specifying the size of
#'   the dynamic list used during index construction. A higher value will
#'   provide better results at the cost of a longer time to build the index.
#'   Default is \code{200}.
#'   \item \code{ef} A positive integer specifying the size of the dynamic
#'   list used during search. This cannot be smaller than \code{n_neighbors}
#'   and cannot be higher than the number of items in the index. Default is
#'   \code{10}.
#'   }
#'   For \code{nn_method = "nndescent"}, you may specify the following
#'   arguments:
#'   \itemize{
#'   \item \code{n_trees} The number of trees to use in a random projection
#'   forest to initialize the search. A larger number will give more accurate
#'   results at the cost of a longer computation time. The default of
#'   \code{NULL} means that the number is chosen based on the number of
#'   observations in \code{X}.
#'   \item \code{max_candidates} The number of potential neighbors to explore
#'   per iteration. By default, this is set to \code{n_neighbors} or \code{60},
#'   whichever is smaller. A larger number will give more accurate results at
#'   the cost of a longer computation time.
#'   \item \code{n_iters} The number of iterations to run the search. A larger
#'   number will give more accurate results at the cost of a longer computation
#'   time. By default, this will be chosen based on the number of observations
#'   in \code{X}. You may also need to modify the convergence criterion
#'   \code{delta}.
#'   \item \code{delta} The minimum relative change in the neighbor graph
#'   allowed before early stopping. Should be a value between 0 and 1. The
#'   smaller the value, the smaller the amount of progress between iterations
#'   that is allowed before the search stops. The default value of \code{0.001}
#'   means that at least 0.1\% of the neighbor graph must be updated at each
#'   iteration.
#'   \item \code{init} How to initialize the nearest neighbor descent. By
#'   default this is set to \code{"tree"} and uses a random project forest.
#'   If you set this to \code{"rand"}, then a random selection is used. Usually
#'   this is less accurate than using RP trees, but for high-dimensional cases,
#'   there may be little difference in the quality of the initialization and
#'   random initialization will be a lot faster. If you set this to
#'   \code{"rand"}, then the \code{n_trees} parameter is ignored.
#'   }
#' @param n_threads Number of threads to use (except during stochastic gradient
#'   descent). Default is half the number of concurrent threads supported by the
#'   system. For nearest neighbor search, only applies if
#'   \code{nn_method = "annoy"}. If \code{n_threads > 1}, then the Annoy index
#'   will be temporarily written to disk in the location determined by
#'   \code{\link[base]{tempfile}}.
#' @param n_sgd_threads Number of threads to use during stochastic gradient
#'   descent. If set to > 1, then be aware that if \code{batch = FALSE}, results
#'   will \emph{not} be reproducible, even if \code{set.seed} is called with a
#'   fixed seed before running. Set to \code{"auto"} to use the same value as
#'   \code{n_threads}.
#' @param grain_size The minimum amount of work to do on each thread. If this
#'   value is set high enough, then less than \code{n_threads} or
#'   \code{n_sgd_threads} will be used for processing, which might give a
#'   performance improvement if the overhead of thread management and context
#'   switching was outweighing the improvement due to concurrent processing.
#'   This should be left at default (\code{1}) and work will be spread evenly
#'   over all the threads specified.
#' @param kernel Type of kernel function to create input probabilities. Can be
#'   one of \code{"gauss"} (the default) or \code{"knn"}. \code{"gauss"} uses
#'   the usual Gaussian weighted similarities. \code{"knn"} assigns equal
#'   probabilities to every edge in the nearest neighbor graph, and zero
#'   otherwise, using \code{perplexity} nearest neighbors. The \code{n_neighbors}
#'   parameter is ignored in this case.
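#'   For example (a sketch; the perplexity value is illustrative and \code{X}
#'   is assumed to be a numeric matrix):
#'   \preformatted{
#'   res <- lvish(X, kernel = "knn", perplexity = 50)
#'   }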
#' @param pca If set to a positive integer value, reduce data to this number of
#'   columns using PCA. Not applied if the distance \code{metric} is
#'   \code{"hamming"}, or if the dimensions of the data are not larger than the
#'   number specified (i.e. the number of rows and columns must both be larger than the
#'   value of this parameter). If you have > 100 columns in a data frame or
#'   matrix, reducing the number of columns in this way may substantially
#'   increase the performance of the nearest neighbor search at the cost of a
#'   potential decrease in accuracy. In many t-SNE applications, a value of 50
#'   is recommended, although there's no guarantee that this is appropriate for
#'   all settings.
#' @param pca_center If \code{TRUE}, center the columns of \code{X} before
#'   carrying out PCA. For binary data, it's recommended to set this to
#'   \code{FALSE}.
#' @param pca_method Method to carry out any PCA dimensionality reduction when
#'   the \code{pca} parameter is specified. Allowed values are:
#'   \itemize{
#'     \item{\code{"irlba"}}. Uses \code{\link[irlba]{prcomp_irlba}} from the
#'     \href{https://cran.r-project.org/package=irlba}{irlba} package.
#'     \item{\code{"rsvd"}}. Uses 5 iterations of \code{\link[irlba]{svdr}} from
#'     the \href{https://cran.r-project.org/package=irlba}{irlba} package.
#'     This is likely to give much faster but potentially less accurate results
#'     than using \code{"irlba"}. For the purposes of nearest neighbor
#'     calculation and coordinate initialization, any loss of accuracy doesn't
#'     seem to matter much.
#'     \item{\code{"bigstatsr"}}. Uses \code{\link[bigstatsr]{big_randomSVD}}
#'     from the \href{https://cran.r-project.org/package=bigstatsr}{bigstatsr}
#'     package. The SVD methods used in \code{bigstatsr} may be faster on
#'     systems without access to efficient linear algebra libraries (e.g.
#'     Windows). \strong{Note}: \code{bigstatsr} is \emph{not} a dependency of
#'     uwot: if you choose to use this package for PCA, you \emph{must} install
#'     it yourself.
#'     \item{\code{"svd"}}. Uses \code{\link[base]{svd}} for the SVD. This is
#'     likely to be slow for all but the smallest datasets.
#'     \item{\code{"auto"}} (the default). Uses \code{"irlba"}, unless more than
#'     50\% of the full set of singular vectors would be calculated, in which
#'     case \code{"svd"} is used.
#'   }
#' @param pcg_rand If \code{TRUE}, use the PCG random number generator (O'Neill,
#'   2014) during optimization. Otherwise, use the faster (but probably less
#'   statistically good) Tausworthe "taus88" generator. The default is
#'   \code{TRUE}.
#' @param fast_sgd If \code{TRUE}, then the following combination of parameters
#'   is set: \code{pcg_rand = FALSE} and \code{n_sgd_threads = "auto"}. The
#'   default is \code{FALSE}. Setting this to \code{TRUE} will speed up the
#'   stochastic optimization phase, but give a potentially less accurate
#'   embedding, and one which will not be exactly reproducible even with a
#'   fixed seed. For visualization, \code{fast_sgd = TRUE} will give perfectly
#'   good results. For more generic dimensionality reduction, it's safer to
#'   leave \code{fast_sgd = FALSE}. If \code{fast_sgd = TRUE}, then
#'   user-supplied values of \code{pcg_rand} and \code{n_sgd_threads} are
#'   ignored.
#' @param batch If \code{TRUE}, then embedding coordinates are updated at the
#'   end of each epoch rather than during the epoch. In batch mode, results are
#'   reproducible with a fixed random seed even with \code{n_sgd_threads > 1},
#'   at the cost of a slightly higher memory use. You may also have to modify
#'   \code{learning_rate} and increase \code{n_epochs}, so whether this provides
#'   a speed increase over the single-threaded optimization is likely to be
#'   dataset and hardware-dependent.
#' @param ret_nn If \code{TRUE}, then in addition to the embedding, also return
#'   nearest neighbor data that can be used as input to \code{nn_method} to
#'   avoid the overhead of repeatedly calculating the nearest neighbors when
#'   manipulating unrelated parameters (e.g. \code{min_dist}, \code{n_epochs},
#'   \code{init}). See the "Value" section for the names of the list items. If
#'   \code{FALSE}, just return the coordinates. Note that the nearest neighbors
#'   could be sensitive to data scaling, so be wary of reusing nearest neighbor
#'   data if modifying the \code{scale} parameter.
#' @param ret_extra A vector indicating what extra data to return. May contain
#'   any combination of the following strings:
#'   \itemize{
#'     \item \code{"nn"} same as setting \code{ret_nn = TRUE}.
#'     \item \code{"P"} the high dimensional probability matrix. The graph
#'       is returned as a sparse symmetric N x N matrix of class
#'       \link[Matrix]{dgCMatrix-class}, where a non-zero entry (i, j) gives the
#'       input probability (or similarity or affinity) of the edge connecting
#'       vertex i and vertex j. Note that the graph is further sparsified by
#'       removing edges with sufficiently low membership strength that they
#'       would not be sampled by the probabilistic edge sampling employed for
#'       optimization and therefore the number of non-zero elements in the
#'       matrix is dependent on \code{n_epochs}. If you are only interested in
#'       the fuzzy input graph (e.g. for clustering), setting
#'       \code{n_epochs = 0} will avoid any further sparsifying. Be aware that
#'       setting \code{binary_edge_weights = TRUE} will affect this graph (all
#'       non-zero edge weights will be 1).
#'     \item \code{"sigma"} a vector of the bandwidths used to calibrate the input
#'       Gaussians to reproduce the target \code{"perplexity"}.
#'   }
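#'   For example, to extract the input probability matrix without running any
#'   optimization (a sketch, assuming \code{X} is a numeric matrix;
#'   \code{n_epochs = 0} avoids the extra sparsification described above):
#'   \preformatted{
#'   res <- lvish(X, ret_extra = c("P"), n_epochs = 0)
#'   P <- res$P
#'   }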
#' @param tmpdir Temporary directory to store nearest neighbor indexes during
#'   nearest neighbor search. Default is \code{\link{tempdir}}. The index is
#'   only written to disk if \code{n_threads > 1} and
#'   \code{nn_method = "annoy"}; otherwise, this parameter is ignored.
#' @param verbose If \code{TRUE}, log details to the console.
#' @param opt_args A list of optimizer parameters, used when
#'   \code{batch = TRUE}. The default optimization method used is Adam (Kingma
#'   and Ba, 2014).
#'   \itemize{
#'     \item \code{method} The optimization method to use. Either \code{"adam"}
#'     or \code{"sgd"} (stochastic gradient descent). Default: \code{"adam"}.
#'     \item \code{beta1} (Adam only). The weighting parameter for the
#'     exponential moving average of the first moment estimator. Effectively the
#'     momentum parameter. Should be a floating point value between 0 and 1.
#'     Higher values can smooth oscillatory updates in poorly-conditioned
#'     situations and may allow for a larger \code{learning_rate} to be
#'     specified, but too high can cause divergence. Default: \code{0.5}.
#'     \item \code{beta2} (Adam only). The weighting parameter for the
#'     exponential moving average of the uncentered second moment estimator.
#'     Should be a floating point value between 0 and 1. Controls the degree of
#'     adaptivity in the step-size. Higher values put more weight on previous
#'     time steps. Default: \code{0.9}.
#'     \item \code{eps} (Adam only). Intended to be a small value to prevent
#'     division by zero, but in practice can also affect convergence due to its
#'     interaction with \code{beta2}. Higher values reduce the effect of the
#'     step-size adaptivity and bring the behavior closer to stochastic gradient
#'     descent with momentum. Typical values are between 1e-8 and 1e-3. Default:
#'     \code{1e-7}.
#'     \item \code{alpha} The initial learning rate. Default: the value of the
#'     \code{learning_rate} parameter.
#'   }
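#'   As an illustrative sketch (the values here are arbitrary, not tuned
#'   recommendations):
#'   \preformatted{
#'   embedding <- lvish(X, batch = TRUE,
#'                      opt_args = list(method = "adam", beta1 = 0.9,
#'                                      eps = 1e-8))
#'   }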
#' @param epoch_callback A function which will be invoked at the end of every
#'   epoch. Its signature should be: \code{(epoch, n_epochs, coords)}, where:
#'   \itemize{
#'     \item \code{epoch} The current epoch number (between \code{1} and
#'     \code{n_epochs}).
#'     \item \code{n_epochs} Number of epochs to use during the optimization of
#'     the embedded coordinates.
#'     \item \code{coords} The embedded coordinates as of the end of the current
#'     epoch, as a matrix with dimensions (N, \code{n_components}).
#'   }
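#'   A minimal sketch of a conforming callback that just logs progress:
#'   \preformatted{
#'   progress_cb <- function(epoch, n_epochs, coords) {
#'     message("epoch ", epoch, " of ", n_epochs)
#'   }
#'   embedding <- lvish(X, epoch_callback = progress_cb)
#'   }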
#' @param binary_edge_weights If \code{TRUE} then edge weights in the input
#'   graph are treated as binary (0/1) rather than real valued. This affects the
#'   sampling frequency of neighbors and is the strategy used by the PaCMAP
#'   method (Wang and co-workers, 2021). Practical (Böhm and co-workers, 2020)
#'   and theoretical (Damrich and Hamprecht, 2021) work suggests this has little
#'   effect on UMAP's performance.
#' @return A matrix of optimized coordinates, or:
#'   \itemize{
#'     \item if \code{ret_nn = TRUE} (or \code{ret_extra} contains \code{"nn"}),
#'     returns the nearest neighbor data as a list called \code{nn}. This
#'     contains one list for each \code{metric} calculated, itself containing a
#'     matrix \code{idx} with the integer ids of the neighbors; and a matrix
#'     \code{dist} with the distances. The \code{nn} list (or a sub-list) can be
#'     used as input to the \code{nn_method} parameter.
#'     \item if \code{ret_extra} contains \code{"P"}, returns the high
#'     dimensional probability matrix as a sparse matrix called \code{P}, of
#'     type \link[Matrix]{dgCMatrix-class}.
#'     \item if \code{ret_extra} contains \code{"sigma"}, returns a vector of
#'     the high dimensional Gaussian bandwidths for each point, and
#'     \code{dint}, a vector of estimates of the intrinsic dimensionality at
#'     each point, based on the method given by Lee and co-workers (2015).
#'   }
#'   The returned list contains the combined data from any combination of
#'   specifying \code{ret_nn} and \code{ret_extra}.
#'
#' @examples
#' # The default number of epochs is much larger than for UMAP, and assumes a
#' # random initialization. Use perplexity rather than n_neighbors to control
#' # the size of the local neighborhood. 20 epochs may be too small for a
#' # random initialization.
#' iris_lvish <- lvish(iris,
#'   perplexity = 50, learning_rate = 0.5,
#'   init = "random", n_epochs = 20
#' )
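#'
#' # A sketch of reusing nearest neighbor data: return it with ret_extra and
#' # pass it back via nn_method, avoiding a repeat of the neighbor search when
#' # only optimization parameters change
#' iris_lvish_nn <- lvish(iris,
#'   perplexity = 50, init = "random", n_epochs = 20,
#'   ret_extra = c("nn")
#' )
#' iris_lvish2 <- lvish(
#'   X = NULL, nn_method = iris_lvish_nn$nn, perplexity = 50,
#'   init = "random", n_epochs = 20
#' )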
#'
#' @references
#' Belkin, M., & Niyogi, P. (2002).
#' Laplacian eigenmaps and spectral techniques for embedding and clustering.
#' In \emph{Advances in neural information processing systems}
#' (pp. 585-591).
#' \url{http://papers.nips.cc/paper/1961-laplacian-eigenmaps-and-spectral-techniques-for-embedding-and-clustering.pdf}
#'
#' Böhm, J. N., Berens, P., & Kobak, D. (2020).
#' A unifying perspective on neighbor embeddings along the attraction-repulsion spectrum.
#' \emph{arXiv preprint} \emph{arXiv:2007.08902}.
#' \url{https://arxiv.org/abs/2007.08902}
#'
#' Damrich, S., & Hamprecht, F. A. (2021).
#' On UMAP's true loss function.
#' \emph{Advances in Neural Information Processing Systems}, \emph{34}.
#' \url{https://proceedings.neurips.cc/paper/2021/hash/2de5d16682c3c35007e4e92982f1a2ba-Abstract.html}
#'
#' Dong, W., Moses, C., & Li, K. (2011, March).
#' Efficient k-nearest neighbor graph construction for generic similarity measures.
#' In \emph{Proceedings of the 20th international conference on World Wide Web}
#' (pp. 577-586).
#' ACM.
#' \doi{10.1145/1963405.1963487}.
#'
#' Kingma, D. P., & Ba, J. (2014).
#' Adam: A method for stochastic optimization.
#' \emph{arXiv preprint} \emph{arXiv}:1412.6980.
#' \url{https://arxiv.org/abs/1412.6980}
#'
#' Lee, J. A., Peluffo-Ordóñez, D. H., & Verleysen, M. (2015).
#' Multi-scale similarities in stochastic neighbour embedding: Reducing
#' dimensionality while preserving both local and global structure.
#' \emph{Neurocomputing}, \emph{169}, 246-261.
#'
#' Malkov, Y. A., & Yashunin, D. A. (2018).
#' Efficient and robust approximate nearest neighbor search using hierarchical
#' navigable small world graphs.
#' \emph{IEEE transactions on pattern analysis and machine intelligence}, \emph{42}(4), 824-836.
#'
#' McInnes, L., Healy, J., & Melville, J. (2018).
#' UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction
#' \emph{arXiv preprint} \emph{arXiv}:1802.03426.
#' \url{https://arxiv.org/abs/1802.03426}
#'
#' O’Neill, M. E. (2014).
#' \emph{PCG: A family of simple fast space-efficient statistically good
#' algorithms for random number generation}
#' (Report No. HMC-CS-2014-0905). Harvey Mudd College.
#'
#' Tang, J., Liu, J., Zhang, M., & Mei, Q. (2016, April).
#' Visualizing large-scale and high-dimensional data.
#' In \emph{Proceedings of the 25th International Conference on World Wide Web}
#' (pp. 287-297).
#' International World Wide Web Conferences Steering Committee.
#' \url{https://arxiv.org/abs/1602.00370}
#'
#' Van der Maaten, L., & Hinton, G. (2008).
#' Visualizing data using t-SNE.
#' \emph{Journal of Machine Learning Research}, \emph{9}, 2579-2605.
#' \url{https://www.jmlr.org/papers/v9/vandermaaten08a.html}
#'
#' Wang, Y., Huang, H., Rudin, C., & Shaposhnik, Y. (2021).
#' Understanding How Dimension Reduction Tools Work: An Empirical Approach to Deciphering t-SNE, UMAP, TriMap, and PaCMAP for Data Visualization.
#' \emph{Journal of Machine Learning Research}, \emph{22}(201), 1-73.
#' \url{https://www.jmlr.org/papers/v22/20-1061.html}
#'
#' @export
lvish <- function(X, perplexity = 50, n_neighbors = perplexity * 3,
                  n_components = 2, metric = "euclidean", n_epochs = -1,
                  learning_rate = 1, scale = "maxabs",
                  init = "lvrandom", init_sdev = NULL,
                  repulsion_strength = 7,
                  negative_sample_rate = 5.0,
                  nn_method = NULL, n_trees = 50,
                  search_k = 2 * n_neighbors * n_trees,
                  n_threads = NULL,
                  n_sgd_threads = 0,
                  grain_size = 1,
                  kernel = "gauss",
                  pca = NULL, pca_center = TRUE,
                  pcg_rand = TRUE,
                  fast_sgd = FALSE,
                  ret_nn = FALSE, ret_extra = c(),
                  tmpdir = tempdir(),
                  verbose = getOption("verbose", TRUE),
                  batch = FALSE,
                  opt_args = NULL, epoch_callback = NULL,
                  pca_method = NULL,
                  binary_edge_weights = FALSE,
                  nn_args = list()) {
  uwot(X,
    n_neighbors = n_neighbors, n_components = n_components,
    metric = metric,
    n_epochs = n_epochs, alpha = learning_rate, scale = scale,
    init = init, init_sdev = init_sdev,
    gamma = repulsion_strength, negative_sample_rate = negative_sample_rate,
    nn_method = nn_method,
    n_trees = n_trees, search_k = search_k,
    method = "largevis", perplexity = perplexity,
    pca = pca, pca_center = pca_center, pca_method = pca_method,
    n_threads = n_threads, n_sgd_threads = n_sgd_threads,
    grain_size = grain_size,
    kernel = kernel,
    ret_nn = ret_nn || "nn" %in% ret_extra,
    ret_fgraph = "P" %in% ret_extra,
    ret_sigma = "sigma" %in% ret_extra,
    pcg_rand = pcg_rand,
    fast_sgd = fast_sgd,
    batch = batch,
    opt_args = opt_args,
    epoch_callback = epoch_callback,
    tmpdir = tmpdir,
    binary_edge_weights = binary_edge_weights,
    verbose = verbose,
    nn_args = nn_args
  )
}

#' Similarity Graph
#'
#' Create a graph (as a sparse symmetric weighted adjacency matrix) representing
#' the similarities between items in a data set. No dimensionality reduction is
#' carried out. By default, the similarities are calculated using the merged
#' fuzzy simplicial set approach in the Uniform Manifold Approximation and
#' Projection (UMAP) method (McInnes et al., 2018), but the approach from
#' LargeVis (Tang et al., 2016) can also be used.
#'
#' This is equivalent to running \code{\link{umap}} with the
#' \code{ret_extra = c("fgraph")} parameter, but without the overhead of
#' calculating (or returning) the optimized low-dimensional coordinates.
#'
#' @param X Input data. Can be a \code{\link{data.frame}}, \code{\link{matrix}},
#'   \code{\link[stats]{dist}} object or \code{\link[Matrix]{sparseMatrix}}.
#'   Matrix and data frames should contain one observation per row. Data frames
#'   will have any non-numeric columns removed, although factor columns will be
#'   used if explicitly included via \code{metric} (see the help for
#'   \code{metric} for details). A sparse matrix is interpreted as a distance
#'   matrix, and is assumed to be symmetric, so you can also pass in an
#'   explicitly upper or lower triangular sparse matrix to save storage. There
#'   must be at least \code{n_neighbors} non-zero distances for each row. Both
#'   implicit and explicit zero entries are ignored. Set zero distances you want
#'   to keep to an arbitrarily small non-zero value (e.g. \code{1e-10}).
#'   \code{X} can also be \code{NULL} if pre-computed nearest neighbor data is
#'   passed to \code{nn_method}.
#' @param n_neighbors The size of local neighborhood (in terms of number of
#'   neighboring sample points) used for manifold approximation. Larger values
#'   result in more global views of the manifold, while smaller values result in
#'   more local data being preserved. In general values should be in the range
#'   \code{2} to \code{100}.
#' @param metric Type of distance metric to use to find nearest neighbors. For
#'  \code{nn_method = "annoy"} this can be one of:
#' \itemize{
#'   \item \code{"euclidean"} (the default)
#'   \item \code{"cosine"}
#'   \item \code{"manhattan"}
#'   \item \code{"hamming"}
#'   \item \code{"correlation"} (a distance based on the Pearson correlation)
#'   \item \code{"categorical"} (see below)
#' }
#' For \code{nn_method = "hnsw"} this can be one of:
#' \itemize{
#'   \item \code{"euclidean"}
#'   \item \code{"cosine"}
#'   \item \code{"correlation"}
#' }
#' If \href{https://cran.r-project.org/package=rnndescent}{rnndescent} is
#' installed and \code{nn_method = "nndescent"} is specified then many more
#' metrics are available, including:
#' \itemize{
#' \item \code{"braycurtis"}
#' \item \code{"canberra"}
#' \item \code{"chebyshev"}
#' \item \code{"dice"}
#' \item \code{"hamming"}
#' \item \code{"hellinger"}
#' \item \code{"jaccard"}
#' \item \code{"jensenshannon"}
#' \item \code{"kulsinski"}
#' \item \code{"rogerstanimoto"}
#' \item \code{"russellrao"}
#' \item \code{"sokalmichener"}
#' \item \code{"sokalsneath"}
#' \item \code{"spearmanr"}
#' \item \code{"symmetrickl"}
#' \item \code{"tsss"}
#' \item \code{"yule"}
#' }
#' For more details see the package documentation of \code{rnndescent}.
#' For \code{nn_method = "fnn"}, the distance metric is always \code{"euclidean"}.
#'
#' If \code{X} is a data frame or matrix, then multiple metrics can be
#' specified, by passing a list to this argument, where the name of each item in
#' the list is one of the metric names above. The value of each list item should
#' be a vector giving the names or integer ids of the columns to be included in
#' a calculation, e.g. \code{metric = list(euclidean = 1:4, manhattan = 5:10)}.
#'
#' Each metric calculation results in a separate fuzzy simplicial set, which are
#' intersected together to produce the final set. Metric names can be repeated.
#' Because non-numeric columns are removed from the data frame, it is safer to
#' use column names than integer ids.
#'
#' Factor columns can also be used by specifying the metric name
#' \code{"categorical"}. Factor columns are treated different from numeric
#' columns and although multiple factor columns can be specified in a vector,
#' each factor column specified is processed individually. If you specify
#' a non-factor column, it will be coerced to a factor.
#'
#' For a given data block, you may override the \code{pca} and \code{pca_center}
#' arguments for that block, by providing a list with one unnamed item
#' containing the column names or ids, and then any of the \code{pca} or
#' \code{pca_center} overrides as named items, e.g. \code{metric =
#' list(euclidean = 1:4, manhattan = list(5:10, pca_center = FALSE))}. This
#' exists to allow mixed binary and real-valued data to be included and to have
#' PCA applied to both, but with centering applied only to the real-valued data
#' (it is typical not to apply centering to binary data before PCA is applied).
#' @param scale Scaling to apply to \code{X} if it is a data frame or matrix:
#' \itemize{
#'   \item{\code{"none"} or \code{FALSE} or \code{NULL}} No scaling.
#'   \item{\code{"Z"} or \code{"scale"} or \code{TRUE}} Scale each column to
#'   zero mean and variance 1.
#'   \item{\code{"maxabs"}} Center each column to mean 0, then divide each
#'   element by the maximum absolute value over the entire matrix.
#'   \item{\code{"range"}} Range scale the entire matrix, so the smallest
#'   element is 0 and the largest is 1.
#'   \item{\code{"colrange"}} Scale each column in the range (0,1).
#' }
#' For \code{method} \code{"umap"}, the default is \code{"none"}. For
#' \code{"largevis"}, the default is \code{"maxabs"}.
#' @param set_op_mix_ratio Interpolate between (fuzzy) union and intersection as
#'   the set operation used to combine local fuzzy simplicial sets to obtain a
#'   global fuzzy simplicial sets. Both fuzzy set operations use the product
#'   t-norm. The value of this parameter should be between \code{0.0} and
#'   \code{1.0}; a value of \code{1.0} will use a pure fuzzy union, while
#'   \code{0.0} will use a pure fuzzy intersection. Ignored if
#'   \code{method = "largevis"}
#' @param local_connectivity The local connectivity required -- i.e. the number
#'   of nearest neighbors that should be assumed to be connected at a local
#'   level. The higher this value the more connected the manifold becomes
#'   locally. In practice this should be not more than the local intrinsic
#'   dimension of the manifold. Ignored if \code{method = "largevis"}.
#' @param nn_method Method for finding nearest neighbors. Options are:
#'   \itemize{
#'     \item \code{"fnn"}. Use exact nearest neighbors via the
#'       \href{https://cran.r-project.org/package=FNN}{FNN} package.
#'     \item \code{"annoy"} Use approximate nearest neighbors via the
#'       \href{https://cran.r-project.org/package=RcppAnnoy}{RcppAnnoy} package.
#'     \item \code{"hnsw"} Use approximate nearest neighbors with the
#'       Hierarchical Navigable Small World (HNSW) method (Malkov and Yashunin,
#'       2018) via the
#'       \href{https://cran.r-project.org/package=RcppHNSW}{RcppHNSW} package.
#'       \code{RcppHNSW} is not a dependency of this package: this option is
#'       only available if you have installed \code{RcppHNSW} yourself. Also,
#'       HNSW only supports the following arguments for \code{metric} and
#'       \code{target_metric}: \code{"euclidean"}, \code{"cosine"} and
#'       \code{"correlation"}.
#'     \item \code{"nndescent"} Use approximate nearest neighbors with the
#'       Nearest Neighbor Descent method (Dong et al., 2011) via the
#'       \href{https://cran.r-project.org/package=rnndescent}{rnndescent}
#'       package. \code{rnndescent} is not a dependency of this package: this
#'       option is only available if you have installed \code{rnndescent}
#'       yourself.
#'    }
#'   By default, if \code{X} has fewer than 4,096 vertices, the exact nearest
#'   neighbors are found. Otherwise, approximate nearest neighbors are used.
#'   You may also pass pre-calculated nearest neighbor data to this argument. It
#'   must be one of two formats, either a list consisting of two elements:
#'   \itemize{
#'     \item \code{"idx"}. A \code{n_vertices x n_neighbors} matrix
#'     containing the integer indexes of the nearest neighbors in \code{X}. Each
#'     vertex is considered to be its own nearest neighbor, i.e.
#'     \code{idx[, 1] == 1:n_vertices}.
#'     \item \code{"dist"}. A \code{n_vertices x n_neighbors} matrix
#'     containing the distances of the nearest neighbors.
#'   }
#'   or a sparse distance matrix of type \code{dgCMatrix}, with dimensions
#'   \code{n_vertices x n_vertices}. Distances should be arranged by column,
#'   i.e. a non-zero entry in row \code{j} of the \code{i}th column indicates
#'   that the \code{j}th observation in \code{X} is a nearest neighbor of the
#'   \code{i}th observation with the distance given by the value of that
#'   element.
#'   The \code{n_neighbors} parameter is ignored when using precomputed
#'   nearest neighbor data. If using the sparse distance matrix input, each
#'   column can contain a different number of neighbors.
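#'   A sketch of the list format, using hypothetical precomputed matrices
#'   \code{knn_idx} and \code{knn_dist}, both of dimension
#'   \code{n_vertices x n_neighbors}:
#'   \preformatted{
#'   graph <- similarity_graph(
#'     nn_method = list(idx = knn_idx, dist = knn_dist)
#'   )
#'   }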
#' @param n_trees Number of trees to build when constructing the nearest
#'   neighbor index. The more trees specified, the larger the index, but the
#'   better the results. With \code{search_k}, determines the accuracy of the
#'   Annoy nearest neighbor search. Only used if the \code{nn_method} is
#'   \code{"annoy"}. Sensible values are between \code{10} to \code{100}.
#' @param search_k Number of nodes to search during the neighbor retrieval. The
#'   larger \code{search_k}, the more accurate the results, but the longer the search takes.
#'   With \code{n_trees}, determines the accuracy of the Annoy nearest neighbor
#'   search. Only used if the \code{nn_method} is \code{"annoy"}.
#' @param nn_args A list containing additional arguments to pass to the nearest
#'   neighbor method. For \code{nn_method = "annoy"}, you can specify
#'   \code{"n_trees"} and \code{"search_k"}, and these will override the
#'   \code{n_trees} and \code{search_k} parameters.
#'   For \code{nn_method = "hnsw"}, you may specify the following arguments:
#'   \itemize{
#'   \item \code{M} The maximum number of neighbors to keep for each vertex.
#'   Reasonable values are \code{2} to \code{100}. Higher values give better
#'   recall at the cost of more memory. Default value is \code{16}.
#'   \item \code{ef_construction} A positive integer specifying the size of
#'   the dynamic list used during index construction. A higher value will
#'   provide better results at the cost of a longer time to build the index.
#'   Default is \code{200}.
#'   \item \code{ef} A positive integer specifying the size of the dynamic
#'   list used during search. This cannot be smaller than \code{n_neighbors}
#'   and cannot be higher than the number of items in the index. Default is
#'   \code{10}.
#'   }
#'   For \code{nn_method = "nndescent"}, you may specify the following
#'   arguments:
#'   \itemize{
#'   \item \code{n_trees} The number of trees to use in a random projection
#'   forest to initialize the search. A larger number will give more accurate
#'   results at the cost of a longer computation time. The default of
#'   \code{NULL} means that the number is chosen based on the number of
#'   observations in \code{X}.
#'   \item \code{max_candidates} The number of potential neighbors to explore
#'   per iteration. By default, this is set to \code{n_neighbors} or \code{60},
#'   whichever is smaller. A larger number will give more accurate results at
#'   the cost of a longer computation time.
#'   \item \code{n_iters} The number of iterations to run the search. A larger
#'   number will give more accurate results at the cost of a longer computation
#'   time. By default, this will be chosen based on the number of observations
#'   in \code{X}. You may also need to modify the convergence criterion
#'   \code{delta}.
#'   \item \code{delta} The minimum relative change in the neighbor graph
#'   allowed before early stopping. Should be a value between 0 and 1. The
#'   smaller the value, the longer the search will run before stopping early.
#'   The default value of \code{0.001} means that at least 0.1\% of the
#'   neighbor graph must be updated at each iteration for the search to
#'   continue.
#'   \item \code{init} How to initialize the nearest neighbor descent. By
#'   default this is set to \code{"tree"} and uses a random projection forest.
#'   If you set this to \code{"rand"}, then a random selection is used. Usually
#'   this is less accurate than using RP trees, but for high-dimensional cases,
#'   there may be little difference in the quality of the initialization and
#'   random initialization will be a lot faster. If you set this to
#'   \code{"rand"}, then the \code{n_trees} parameter is ignored.
#'   \item \code{pruning_degree_multiplier} The maximum number of edges per node
#'   to retain in the search graph, relative to \code{n_neighbors}. A larger
#'   value will give more accurate results at the cost of a longer computation
#'   time. Default is \code{1.5}. This parameter only affects neighbor search
#'   when transforming new data with \code{\link{umap_transform}}.
#'   \item \code{epsilon} Controls the degree of the back-tracking when
#'   traversing the search graph. Setting this to \code{0.0} will do a greedy
#'   search with no back-tracking. A larger value will give more accurate
#'   results at the cost of a longer computation time. Default is \code{0.1}.
#'   This parameter only affects neighbor search when transforming new data with
#'   \code{\link{umap_transform}}.
#'   \item \code{max_search_fraction} Specifies the maximum fraction of the
#'   search graph to traverse. By default, this is set to \code{1.0}, so the
#'   entire graph (i.e. all items in \code{X}) may be visited. You may want to
#'   set this to a smaller value if you have a very large dataset (in
#'   conjunction with \code{epsilon}) to avoid an inefficient exhaustive search
#'   of the data in \code{X}. This parameter only affects neighbor search when
#'   transforming new data with \code{\link{umap_transform}}.
#'   }
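#'   For example, a sketch passing HNSW index parameters (the values here are
#'   arbitrary, not tuned recommendations):
#'   \preformatted{
#'   graph <- similarity_graph(X, nn_method = "hnsw",
#'                             nn_args = list(M = 32,
#'                                            ef_construction = 400,
#'                                            ef = 50))
#'   }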
#' @param perplexity Used only if \code{method = "largevis"}. Controls the size
#'   of the local neighborhood used for manifold approximation. Should be a
#'   value between 1 and one less than the number of items in \code{X}. If
#'   specified, you should \emph{not} specify a value for \code{n_neighbors}
#'   unless you know what you are doing.
#' @param kernel Used only if \code{method = "largevis"}. Type of kernel
#'   function to create input similarities. Can be one of \code{"gauss"} (the
#'   default) or \code{"knn"}. \code{"gauss"} uses the usual Gaussian weighted
#'   similarities. \code{"knn"} assigns equal similarities to every edge in the
#'   nearest neighbor graph, and zero otherwise, using \code{perplexity} nearest
#'   neighbors. The \code{n_neighbors} parameter is ignored in this case.
#' @param method How to generate the similarities between items. One of:
#'   \itemize{
#'     \item \code{"umap"} The UMAP method of McInnes et al. (2018).
#'     \item \code{"largevis"} The LargeVis method of Tang et al. (2016).
#'   }
#' @param y Optional target data to add supervised or semi-supervised weighting
#'   to the similarity graph. Can be a vector, matrix or data frame. Use the
#'   \code{target_metric} parameter to specify the metrics to use, using the
#'   same syntax as \code{metric}. Usually either a single numeric or factor
#'   column is used, but more complex formats are possible. The following types
#'   are allowed:
#'   \itemize{
#'     \item Factor columns with the same length as \code{X}. \code{NA} is
#'     allowed for any observation with an unknown level, in which case
#'     UMAP operates as a form of semi-supervised learning. Each column is
#'     treated separately.
#'     \item Numeric data. \code{NA} is \emph{not} allowed in this case. Use the
#'     parameter \code{target_n_neighbors} to set the number of neighbors used
#'     with \code{y}. If unset, \code{n_neighbors} is used. Unlike factors,
#'     numeric columns are grouped into one block unless \code{target_metric}
#'     specifies otherwise. For example, if you wish columns \code{a} and
#'     \code{b} to be treated separately, specify
#'     \code{target_metric = list(euclidean = "a", euclidean = "b")}. Otherwise,
#'     the data will be effectively treated as a matrix with two columns.
#'     \item Nearest neighbor data, consisting of a list of two matrices,
#'     \code{idx} and \code{dist}. These represent the precalculated nearest
#'     neighbor indices and distances, respectively. This
#'     is the same format as that expected for precalculated data in
#'     \code{nn_method}. This format assumes that the underlying data was a
#'     numeric vector. Any user-supplied value of the \code{target_n_neighbors}
#'     parameter is ignored in this case, because the number of columns in
#'     the matrices is used for the value. Multiple nearest neighbor data using
#'     different metrics can be supplied by passing a list of these lists.
#'   }
#' Unlike \code{X}, all factor columns included in \code{y} are automatically
#' used. This parameter is ignored if \code{method = "largevis"}.
#' @param target_n_neighbors Number of nearest neighbors to use to construct the
#'   target simplicial set. Default value is \code{n_neighbors}. Applies only if
#'   \code{y} is non-\code{NULL} and \code{numeric}.  This parameter is ignored
#'   if \code{method = "largevis"}.
#' @param target_metric The metric used to measure distance for \code{y} if
#'   using supervised dimension reduction. Used only if \code{y} is numeric.
#'   This parameter is ignored if \code{method = "largevis"}.
#' @param target_weight Weighting factor between data topology and target
#'   topology. A value of 0.0 weights entirely on data, a value of 1.0 weights
#'   entirely on target. The default of 0.5 balances the weighting equally
#'   between data and target. Only applies if \code{y} is non-\code{NULL}. This
#'   parameter is ignored if \code{method = "largevis"}.
#' @param pca If set to a positive integer value, reduce data to this number of
#'   columns using PCA. Not applied if the distance \code{metric} is
#'   \code{"hamming"}, or if the dimensions of the data do not exceed the
#'   number specified (i.e. the number of rows and columns must both be larger
#'   than the value of this parameter). If you have > 100 columns in a data
#'   frame or matrix, reducing the number of columns in this way may substantially
#'   increase the performance of the nearest neighbor search at the cost of a
#'   potential decrease in accuracy. In many t-SNE applications, a value of 50
#'   is recommended, although there's no guarantee that this is appropriate for
#'   all settings.
#' @param pca_center If \code{TRUE}, center the columns of \code{X} before
#'   carrying out PCA. For binary data, it's recommended to set this to
#'   \code{FALSE}.
#' @param pca_method Method to carry out any PCA dimensionality reduction when
#'   the \code{pca} parameter is specified. Allowed values are:
#'   \itemize{
#'     \item{\code{"irlba"}}. Uses \code{\link[irlba]{prcomp_irlba}} from the
#'     \href{https://cran.r-project.org/package=irlba}{irlba} package.
#'     \item{\code{"rsvd"}}. Uses 5 iterations of \code{\link[irlba]{svdr}} from
#'     the \href{https://cran.r-project.org/package=irlba}{irlba} package.
#'     This is likely to give much faster but potentially less accurate results
#'     than using \code{"irlba"}. For the purposes of nearest neighbor
#'     calculation and coordinate initialization, any loss of accuracy doesn't
#'     seem to matter much.
#'     \item{\code{"bigstatsr"}}. Uses \code{\link[bigstatsr]{big_randomSVD}}
#'     from the \href{https://cran.r-project.org/package=bigstatsr}{bigstatsr}
#'     package. The SVD methods used in \code{bigstatsr} may be faster on
#'     systems without access to efficient linear algebra libraries (e.g.
#'     Windows). \strong{Note}: \code{bigstatsr} is \emph{not} a dependency of
#'     uwot: if you choose to use this package for PCA, you \emph{must} install
#'     it yourself.
#'     \item{\code{"svd"}}. Uses \code{\link[base]{svd}} for the SVD. This is
#'     likely to be slow for all but the smallest datasets.
#'     \item{\code{"auto"}} (the default). Uses \code{"irlba"}, unless more than
#'     50\% of the full set of singular vectors would be calculated, in which
#'     case \code{"svd"} is used.
#'   }
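#'   For example, a sketch combining PCA reduction with the randomized SVD
#'   (the value of \code{pca} is illustrative, not a recommendation):
#'   \preformatted{
#'   graph <- similarity_graph(X, pca = 50, pca_method = "rsvd")
#'   }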
#' @param ret_extra A vector indicating what extra data to return. May contain
#'   any combination of the following strings:
#'   \itemize{
#'    \item \code{"nn"} nearest neighbor data that can be used as input to
#'       \code{nn_method} to avoid the overhead of repeatedly calculating the
#'       nearest neighbors when manipulating unrelated parameters. See the
#'       "Value" section for the names of the list items. Note that the nearest
#'       neighbors could be sensitive to data scaling, so be wary of reusing
#'       nearest neighbor data if modifying the \code{scale} parameter.
#'    \item \code{"sigma"} the normalization value for each observation in the
#'       dataset when constructing the smoothed distances to each of its
#'       neighbors. This gives some sense of the local density of each
#'       observation in the high dimensional space: higher values of
#'       \code{sigma} indicate a higher dispersion or lower density.
#'   }
#' @param n_threads Number of threads to use. Default is half the number of
#'   concurrent threads supported by the system. For nearest neighbor search,
#'   only applies if \code{nn_method = "annoy"}. If \code{n_threads > 1}, then
#'   the Annoy index will be temporarily written to disk in the location
#'   determined by \code{\link[base]{tempfile}}.
#' @param grain_size The minimum amount of work to do on each thread. If this
#'   value is set high enough, then fewer than \code{n_threads} threads will be used for
#'   processing, which might give a performance improvement if the overhead of
#'   thread management and context switching was outweighing the improvement due
#'   to concurrent processing. This should be left at default (\code{1}) and
#'   work will be spread evenly over all the threads specified.
#' @param tmpdir Temporary directory to store nearest neighbor indexes during
#'   nearest neighbor search. Default is \code{\link{tempdir}}. The index is
#'   only written to disk if \code{n_threads > 1} and
#'   \code{nn_method = "annoy"}; otherwise, this parameter is ignored.
#' @param verbose If \code{TRUE}, log details to the console.
#' @param binary_edge_weights If \code{TRUE} then edge weights of the returned
#'   graph are binary (0/1) rather than reflecting the degree of similarity.
#' @return A sparse symmetrized matrix of the similarities between the items in
#'   \code{X} or if \code{nn_method} contains pre-computed nearest neighbor
#'   data, the items in \code{nn_method}. Because of the symmetrization, there
#'   may be more non-zero items in each column than the specified value of
#'   \code{n_neighbors} (or pre-computed neighbors in \code{nn_method}).
#'   If \code{ret_extra} is specified then the return value will be a list
#'   containing:
#'   \itemize{
#'     \item \code{similarity_graph} the similarity graph as a sparse matrix
#'     as described above.
#'     \item \code{nn} (if \code{ret_extra} contained \code{"nn"}) the nearest
#'     neighbor data as a list called \code{nn}. This contains one list for each
#'     \code{metric} calculated, itself containing a matrix \code{idx} with the
#'     integer ids of the neighbors; and a matrix \code{dist} with the
#'     distances. The \code{nn} list (or a sub-list) can be used as input to the
#'     \code{nn_method} parameter.
#'     \item \code{sigma} (if \code{ret_extra} contains \code{"sigma"}),
#'     a vector of calibrated parameters, one for each item in the input data,
#'     reflecting the local data density for that item. The exact definition of
#'     the values depends on the choice of the \code{method} parameter.
#'     \item \code{rho} (if \code{ret_extra} contains \code{"sigma"}), a
#'     vector containing the largest distance to the locally connected neighbors
#'     of each item in the input data. This will exist only if
#'     \code{method = "umap"}.
#'     \item \code{localr} (if \code{ret_extra} contains \code{"localr"}) a
#'     vector of the estimated local radii, the sum of \code{"sigma"} and
#'     \code{"rho"}. This will exist only if \code{method = "umap"}.
#'   }
#' @examples
#'
#' iris30 <- iris[c(1:10, 51:60, 101:110), ]
#'
#' # return a 30 x 30 sparse matrix with similarity data based on 10 nearest
#' # neighbors per item
#' iris30_sim_graph <- similarity_graph(iris30, n_neighbors = 10)
#'
#' # Default is to use the UMAP method of calculating similarities, but LargeVis
#' # is also available: for that method, use perplexity instead of n_neighbors
#' # to control neighborhood size. Use ret_extra = "nn" to return nearest
#' # neighbor data as well as the similarity graph. Return value is a list
#' # containing 'similarity_graph' and 'nn' items.
#' iris30_lv_graph <- similarity_graph(iris30,
#'   perplexity = 10,
#'   method = "largevis", ret_extra = "nn"
#' )
#' # If you have the neighbor information you don't need the original data
#' iris30_lv_graph_nn <- similarity_graph(
#'   nn_method = iris30_lv_graph$nn,
#'   perplexity = 10, method = "largevis"
#' )
#' all(iris30_lv_graph_nn == iris30_lv_graph$similarity_graph)
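#'
#' # A sketch of also returning the calibrated bandwidths: with
#' # ret_extra = "sigma" the result is a list with 'similarity_graph',
#' # 'sigma' (and, for method = "umap", 'rho') items
#' iris30_sim_sigma <- similarity_graph(iris30,
#'   n_neighbors = 10, ret_extra = "sigma"
#' )
#' summary(iris30_sim_sigma$sigma)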
#'
#' @references
#' Dong, W., Moses, C., & Li, K. (2011, March).
#' Efficient k-nearest neighbor graph construction for generic similarity measures.
#' In \emph{Proceedings of the 20th international conference on World Wide Web}
#' (pp. 577-586).
#' ACM.
#' \doi{10.1145/1963405.1963487}.
#'
#' Malkov, Y. A., & Yashunin, D. A. (2018).
#' Efficient and robust approximate nearest neighbor search using hierarchical
#' navigable small world graphs.
#' \emph{IEEE transactions on pattern analysis and machine intelligence}, \emph{42}(4), 824-836.
#'
#' McInnes, L., Healy, J., & Melville, J. (2018).
#' UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction
#' \emph{arXiv preprint} \emph{arXiv}:1802.03426.
#' \url{https://arxiv.org/abs/1802.03426}
#'
#' Tang, J., Liu, J., Zhang, M., & Mei, Q. (2016, April).
#' Visualizing large-scale and high-dimensional data.
#' In \emph{Proceedings of the 25th International Conference on World Wide Web}
#' (pp. 287-297).
#' International World Wide Web Conferences Steering Committee.
#' \url{https://arxiv.org/abs/1602.00370}
#'
#' @export
similarity_graph <- function(X = NULL, n_neighbors = NULL, metric = "euclidean",
                             scale = NULL,
                             set_op_mix_ratio = 1.0, local_connectivity = 1.0,
                             nn_method = NULL, n_trees = 50,
                             search_k = 2 * n_neighbors * n_trees,
                             perplexity = 50,
                             method = "umap",
                             y = NULL, target_n_neighbors = n_neighbors,
                             target_metric = "euclidean",
                             target_weight = 0.5,
                             pca = NULL, pca_center = TRUE,
                             ret_extra = c(),
                             n_threads = NULL,
                             grain_size = 1,
                             kernel = "gauss",
                             tmpdir = tempdir(),
                             verbose = getOption("verbose", TRUE),
                             pca_method = NULL,
                             binary_edge_weights = FALSE,
                             nn_args = list()) {
  if (is.null(n_neighbors)) {
    if (method == "largevis") {
      n_neighbors <- perplexity * 3
      scale <- "maxabs"
    } else {
      n_neighbors <- 15
      scale <- FALSE
    }
  }

  uwot_res <- uwot(
    X = X, n_neighbors = n_neighbors,
    metric = metric, n_epochs = 0, scale = scale,
    init = NULL,
    set_op_mix_ratio = set_op_mix_ratio,
    local_connectivity = local_connectivity,
    nn_method = nn_method, n_trees = n_trees,
    search_k = search_k,
    method = method,
    n_threads = n_threads,
    grain_size = grain_size,
    kernel = kernel, perplexity = perplexity,
    y = y, target_n_neighbors = target_n_neighbors,
    target_weight = target_weight, target_metric = target_metric,
    pca = pca, pca_center = pca_center, pca_method = pca_method,
    ret_model = FALSE,
    ret_nn = "nn" %in% ret_extra,
    ret_fgraph = TRUE,
    ret_sigma = "sigma" %in% ret_extra,
    ret_localr = "localr" %in% ret_extra,
    binary_edge_weights = binary_edge_weights,
    tmpdir = tmpdir,
    verbose = verbose,
    nn_args = nn_args
  )

  res <- list()
  for (name in names(uwot_res)) {
    if (name == "embedding") {
      # embedding will be NULL so remove it
      next
    }
    if (name == "P" || name == "fgraph") {
      res$similarity_graph <- uwot_res[[name]]
    } else {
      res[[name]] <- uwot_res[[name]]
    }
  }
  if (length(names(res)) == 1 && !is.null(res$similarity_graph)) {
    # return just the similarity graph if no extras were requested
    res <- res$similarity_graph
  }
  res
}

#' Optimize Graph Layout
#'
#' Carry out dimensionality reduction on an input graph, where the distances in
#' the low dimensional space attempt to reproduce the neighbor relations in the
#' input data. By default, the cost function used to optimize the output
#' coordinates use the Uniform Manifold Approximation and Projection (UMAP)
#' method (McInnes et al., 2018), but the approach from LargeVis (Tang et al.,
#' 2016) can also be used. This function can be used to produce a low
#' dimensional representation of the graph produced by
#' \code{\link{similarity_graph}}.
#'
#' @param graph A sparse, symmetric N x N weighted adjacency matrix
#'   representing a graph. Non-zero entries indicate an edge between two nodes
#'   with a given edge weight. There can be a varying number of non-zero entries
#'   in each row/column.
#' @param X Optional input data. Used only for PCA-based initialization.
#' @param n_components The dimension of the space to embed into. This defaults
#'   to \code{2} to provide easy visualization, but can reasonably be set to any
#'   integer value in the range \code{2} to \code{100}.
#' @param n_epochs Number of epochs to use during the optimization of the
#'   embedded coordinates. By default, this value is set to \code{500} for
#'   datasets containing 10,000 vertices or less, and \code{200} otherwise.
#'   If \code{n_epochs = 0}, then coordinates determined by \code{"init"} will
#'   be returned.
#' @param learning_rate Initial learning rate used in optimization of the
#'   coordinates.
#' @param init Type of initialization for the coordinates. Options are:
#'   \itemize{
#'     \item \code{"spectral"} Spectral embedding using the normalized Laplacian
#'     of the fuzzy 1-skeleton, with Gaussian noise added.
#'     \item \code{"normlaplacian"}. Spectral embedding using the normalized
#'     Laplacian of the fuzzy 1-skeleton, without noise.
#'     \item \code{"random"}. Coordinates assigned using a uniform random
#'     distribution between -10 and 10.
#'     \item \code{"lvrandom"}. Coordinates assigned using a Gaussian
#'     distribution with standard deviation 1e-4, as used in LargeVis
#'     (Tang et al., 2016) and t-SNE.
#'     \item \code{"laplacian"}. Spectral embedding using the Laplacian Eigenmap.
#'     \item \code{"pca"}. The first two principal components from PCA of
#'     \code{X} if \code{X} is a data frame, and from a 2-dimensional classical
#'     MDS if \code{X} is of class \code{"dist"}.
#'     \item \code{"spca"}. Like \code{"pca"}, but each dimension is then scaled
#'     so the standard deviation is 1e-4, to give a distribution similar to that
#'     used in t-SNE. This is an alias for \code{init = "pca", init_sdev =
#'     1e-4}.
#'     \item \code{"agspectral"} An "approximate global" modification of
#'     \code{"spectral"} which all edges in the graph to a value of 1, and then
#'     sets a random number of edges (\code{negative_sample_rate} edges per
#'     vertex) to 0.1, to approximate the effect of non-local affinities.
#'     \item A matrix of initial coordinates.
#'   }
#'  For spectral initializations, (\code{"spectral"}, \code{"normlaplacian"},
#'  \code{"laplacian"}, \code{"agspectral"}), if more than one connected
#'  component is identified, no spectral initialization is attempted. Instead
#'  a PCA-based initialization is attempted. If \code{verbose = TRUE} the
#'  number of connected components is logged to the console. The existence of
#'  multiple connected components implies that a global view of the data cannot
#'  be attained with this initialization. Increasing the value of
#'  \code{n_neighbors} may help.
#' @param init_sdev If non-\code{NULL}, scales each dimension of the initialized
#'   coordinates (including any user-supplied matrix) to this standard
#'   deviation. By default no scaling is carried out, except when \code{init =
#'   "spca"}, in which case the value is \code{0.0001}. Scaling the input may
#'   help if the unscaled versions result in initial coordinates with large
#'   inter-point distances or outliers. This usually results in small gradients
#'   during optimization and very little progress being made to the layout.
#'   Shrinking the initial embedding by rescaling can help under these
#'   circumstances. Scaling the result of \code{init = "pca"} is usually
#'   recommended, and \code{init = "spca"} exists as an alias for
#'   \code{init = "pca", init_sdev = 1e-4}, but for the spectral
#'   initializations the scaled versions
#'   usually aren't necessary unless you are using a large value of
#'   \code{n_neighbors} (e.g. \code{n_neighbors = 150} or higher). For
#'   compatibility with recent versions of the Python UMAP package, if you are
#'   using \code{init = "spectral"}, then you should also set
#'   \code{init_sdev = "range"}, which will range scale each of the columns
#'   containing the initial data between 0-10. This is not set by default to
#'   maintain backwards compatibility with previous versions of uwot.
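#'   For example, a sketch following the Python UMAP convention:
#'   \preformatted{
#'   coords <- optimize_graph_layout(graph, init = "spectral",
#'                                   init_sdev = "range")
#'   }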
#' @param spread The effective scale of embedded points. In combination with
#'   \code{min_dist}, this determines how clustered/clumped the embedded points
#'   are.
#' @param min_dist The effective minimum distance between embedded points.
#'   Smaller values will result in a more clustered/clumped embedding where
#'   nearby points on the manifold are drawn closer together, while larger
#'   values will result on a more even dispersal of points. The value should be
#'   set relative to the \code{spread} value, which determines the scale at
#'   which embedded points will be spread out.
#' @param repulsion_strength Weighting applied to negative samples in low
#'   dimensional embedding optimization. Values higher than one will result in
#'   greater weight being given to negative samples.
#' @param negative_sample_rate The number of negative edge/1-simplex samples to
#'   use per positive edge/1-simplex sample in optimizing the low dimensional
#'   embedding.
#' @param a More specific parameters controlling the embedding. If \code{NULL}
#'   these values are set automatically as determined by \code{min_dist} and
#'   \code{spread}.
#' @param b More specific parameters controlling the embedding. If \code{NULL}
#'   these values are set automatically as determined by \code{min_dist} and
#'   \code{spread}.
#' @param approx_pow If \code{TRUE}, use an approximation to the power function
#'   in the UMAP gradient, from
#'   \url{https://martin.ankerl.com/2012/01/25/optimized-approximative-pow-in-c-and-cpp/}.
#' @param method Cost function to optimize. One of:
#' \itemize{
#'   \item{\code{"umap"}}. The UMAP method of McInnes and co-workers (2018).
#'   \item{\code{"tumap"}}. UMAP with the \code{a} and \code{b} parameters fixed
#'   to 1.
#'   \item{\code{"largevis"}}. The LargeVis method Tang and co-workers (2016).
#' }
#' @param pca_method Method to carry out any PCA dimensionality reduction when
#'   the \code{pca} parameter is specified. Allowed values are:
#'   \itemize{
#'     \item{\code{"irlba"}}. Uses \code{\link[irlba]{prcomp_irlba}} from the
#'     \href{https://cran.r-project.org/package=irlba}{irlba} package.
#'     \item{\code{"rsvd"}}. Uses 5 iterations of \code{\link[irlba]{svdr}} from
#'     the \href{https://cran.r-project.org/package=irlba}{irlba} package.
#'     This is likely to give much faster but potentially less accurate results
#'     than using \code{"irlba"}. For the purposes of nearest neighbor
#'     calculation and coordinate initialization, any loss of accuracy doesn't
#'     seem to matter much.
#'     \item{\code{"bigstatsr"}}. Uses \code{\link[bigstatsr]{big_randomSVD}}
#'     from the \href{https://cran.r-project.org/package=bigstatsr}{bigstatsr}
#'     package. The SVD methods used in \code{bigstatsr} may be faster on
#'     systems without access to efficient linear algebra libraries (e.g.
#'     Windows). \strong{Note}: \code{bigstatsr} is \emph{not} a dependency of
#'     uwot: if you choose to use this package for PCA, you \emph{must} install
#'     it yourself.
#'     \item{\code{"svd"}}. Uses \code{\link[base]{svd}} for the SVD. This is
#'     likely to be slow for all but the smallest datasets.
#'     \item{\code{"auto"}} (the default). Uses \code{"irlba"}, unless more than
#'     50\% of the full set of singular vectors would be calculated, in which
#'     case \code{"svd"} is used.
#'   }
#' @param pcg_rand If \code{TRUE}, use the PCG random number generator (O'Neill,
#'   2014) during optimization. Otherwise, use the faster (but probably less
#'   statistically good) Tausworthe "taus88" generator. The default is
#'   \code{TRUE}.
#' @param fast_sgd If \code{TRUE}, then the following combination of parameters
#'   is set: \code{pcg_rand = FALSE}, \code{n_sgd_threads = "auto"} and
#'   \code{approx_pow = TRUE}. The default is \code{FALSE}. Setting this to
#'   \code{TRUE} will speed up the stochastic optimization phase, but give a
#'   potentially less accurate embedding, which will not be exactly
#'   reproducible even with a fixed seed. For visualization, \code{fast_sgd =
#'   TRUE} will give perfectly good results. For more generic dimensionality
#'   reduction, it's safer to leave \code{fast_sgd = FALSE}. If \code{fast_sgd =
#'   TRUE}, then user-supplied values of \code{pcg_rand}, \code{n_sgd_threads},
#'   and \code{approx_pow} are ignored.
#' @param batch If \code{TRUE}, then embedding coordinates are updated at the
#'   end of each epoch rather than during the epoch. In batch mode, results are
#'   reproducible with a fixed random seed even with \code{n_sgd_threads > 1},
#'   at the cost of a slightly higher memory use. You may also have to modify
#'   \code{learning_rate} and increase \code{n_epochs}, so whether this provides
#'   a speed increase over the single-threaded optimization is likely to be
#'   dataset and hardware-dependent.
#' @param n_sgd_threads Number of threads to use during stochastic gradient
#'   descent. If set to > 1, then be aware that if \code{batch = FALSE}, results
#'   will \emph{not} be reproducible, even if \code{set.seed} is called with a
#'   fixed seed before running. If set to \code{"auto"} then half the number of
#'   concurrent threads supported by the system will be used.
#' @param grain_size The minimum amount of work to do on each thread. If this
#'   value is set high enough, then fewer than \code{n_threads} or
#'   \code{n_sgd_threads} threads will be used for processing, which might give a
#'   performance improvement if the overhead of thread management and context
#'   switching was outweighing the improvement due to concurrent processing.
#'   This should be left at default (\code{1}) and work will be spread evenly
#'   over all the threads specified.
#' @param verbose If \code{TRUE}, log details to the console.
#' @param opt_args A list of optimizer parameters, used when
#'   \code{batch = TRUE}. The default optimization method used is Adam (Kingma
#'   and Ba, 2014).
#'   \itemize{
#'     \item \code{method} The optimization method to use. Either \code{"adam"}
#'     or \code{"sgd"} (stochastic gradient descent). Default: \code{"adam"}.
#'     \item \code{beta1} (Adam only). The weighting parameter for the
#'     exponential moving average of the first moment estimator. Effectively the
#'     momentum parameter. Should be a floating point value between 0 and 1.
#'     Higher values can smooth oscillatory updates in poorly-conditioned
#'     situations and may allow for a larger \code{learning_rate} to be
#'     specified, but too high can cause divergence. Default: \code{0.5}.
#'     \item \code{beta2} (Adam only). The weighting parameter for the
#'     exponential moving average of the uncentered second moment estimator.
#'     Should be a floating point value between 0 and 1. Controls the degree of
#'     adaptivity in the step-size. Higher values put more weight on previous
#'     time steps. Default: \code{0.9}.
#'     \item \code{eps} (Adam only). Intended to be a small value to prevent
#'     division by zero, but in practice can also affect convergence due to its
#'     interaction with \code{beta2}. Higher values reduce the effect of the
#'     step-size adaptivity and bring the behavior closer to stochastic gradient
#'     descent with momentum. Typical values are between 1e-8 and 1e-3. Default:
#'     \code{1e-7}.
#'     \item \code{alpha} The initial learning rate. Default: the value of the
#'     \code{learning_rate} parameter.
#'   }
#' @param epoch_callback A function which will be invoked at the end of every
#'   epoch. Its signature should be: \code{(epoch, n_epochs, coords)}, where:
#'   \itemize{
#'     \item \code{epoch} The current epoch number (between \code{1} and
#'     \code{n_epochs}).
#'     \item \code{n_epochs} Number of epochs to use during the optimization of
#'     the embedded coordinates.
#'     \item \code{coords} The embedded coordinates as of the end of the current
#'     epoch, as a matrix with dimensions (N, \code{n_components}).
#'   }
#' @param binary_edge_weights If \code{TRUE} then edge weights in the input
#'   graph are treated as binary (0/1) rather than real valued.
#' @return A matrix of optimized coordinates.
#'
#' @examples
#'
#' iris30 <- iris[c(1:10, 51:60, 101:110), ]
#'
#' # return a 30 x 30 sparse matrix with similarity data based on 10 nearest
#' # neighbors per item
#' iris30_sim_graph <- similarity_graph(iris30, n_neighbors = 10)
#' # produce 2D coordinates replicating the neighbor relations in the similarity
#' # graph
#' set.seed(42)
#' iris30_opt <- optimize_graph_layout(iris30_sim_graph, X = iris30)
#'
#' # the above two steps are the same as:
#' # set.seed(42); iris_umap <- umap(iris30, n_neighbors = 10)
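#'
#' # A sketch of swapping the cost function for the LargeVis one; you may need
#' # to adjust learning_rate and n_epochs to get good results with this method
#' iris30_lv_layout <- optimize_graph_layout(iris30_sim_graph,
#'   X = iris30, method = "largevis"
#' )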
#'
#' @references
#' Kingma, D. P., & Ba, J. (2014).
#' Adam: A method for stochastic optimization.
#' \emph{arXiv preprint} \emph{arXiv}:1412.6980.
#' \url{https://arxiv.org/abs/1412.6980}
#'
#' McInnes, L., Healy, J., & Melville, J. (2018).
#' UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction
#' \emph{arXiv preprint} \emph{arXiv}:1802.03426.
#' \url{https://arxiv.org/abs/1802.03426}
#'
#' O’Neill, M. E. (2014).
#' \emph{PCG: A family of simple fast space-efficient statistically good
#' algorithms for random number generation}
#' (Report No. HMC-CS-2014-0905). Harvey Mudd College.
#'
#' Tang, J., Liu, J., Zhang, M., & Mei, Q. (2016, April).
#' Visualizing large-scale and high-dimensional data.
#' In \emph{Proceedings of the 25th International Conference on World Wide Web}
#' (pp. 287-297).
#' International World Wide Web Conferences Steering Committee.
#' \url{https://arxiv.org/abs/1602.00370}
#'
#' @export
optimize_graph_layout <-
  function(graph,
           X = NULL,
           n_components = 2,
           n_epochs = NULL,
           learning_rate = 1,
           init = "spectral",
           init_sdev = NULL,
           spread = 1,
           min_dist = 0.01,
           repulsion_strength = 1.0,
           negative_sample_rate = 5.0,
           a = NULL,
           b = NULL,
           method = "umap",
           approx_pow = FALSE,
           pcg_rand = TRUE,
           fast_sgd = FALSE,
           n_sgd_threads = 0,
           grain_size = 1,
           verbose = getOption("verbose", TRUE),
           batch = FALSE,
           opt_args = NULL,
           epoch_callback = NULL,
           pca_method = NULL,
           binary_edge_weights = FALSE) {
    if (!is_sparse_matrix(graph)) {
      stop("graph should be a sparse matrix")
    }
    if (nrow(graph) != ncol(graph)) {
      stop("graph should be a square matrix")
    }
    if (!Matrix::isSymmetric(graph)) {
      stop("graph should be symmetric")
    }
    if (!all(diff(graph@p) > 0)) {
      stop("All items must have at least one neighbor similarity defined")
    }

    # Just do things the UMAP way or we will have a very slow largevis
    # optimization
    if (is.null(n_epochs)) {
      n_vertices <- nrow(graph)
      if (n_vertices <= 10000) {
        n_epochs <- 500
      } else {
        n_epochs <- 200
      }
    }

    uwot(
      X = X,
      nn_method = graph,
      is_similarity_graph = TRUE,
      n_components = n_components,
      n_epochs = n_epochs,
      alpha = learning_rate,
      init = init,
      init_sdev = init_sdev,
      spread = spread,
      min_dist = min_dist,
      gamma = repulsion_strength,
      negative_sample_rate = negative_sample_rate,
      a = a,
      b = b,
      method = method,
      approx_pow = approx_pow,
      pcg_rand = pcg_rand,
      fast_sgd = fast_sgd,
      n_sgd_threads = n_sgd_threads,
      grain_size = grain_size,
      verbose = verbose,
      batch = batch,
      opt_args = opt_args,
      epoch_callback = epoch_callback,
      pca_method = pca_method,
      binary_edge_weights = binary_edge_weights
    )
  }

#' Merge Similarity Graph by Simplicial Set Union
#'
#' Combine two similarity graphs by treating them as fuzzy topological sets and
#' forming the union.
#'
#' @param x A sparse matrix representing the first similarity graph in the union
#'   operation.
#' @param y A sparse matrix representing the second similarity graph in the
#'   union operation.
#' @param n_threads Number of threads to use when resetting the local metric.
#'   Default is half the number of concurrent threads supported by the system.
#' @param verbose If \code{TRUE}, log progress to the console.
#' @returns A sparse matrix containing the union of \code{x} and \code{y}.
#' @examples
#'
#' # Form two different "views" of the same data
#' iris30 <- iris[c(1:10, 51:60, 101:110), ]
#' iris_sg12 <- similarity_graph(iris30[, 1:2], n_neighbors = 5)
#' iris_sg34 <- similarity_graph(iris30[, 3:4], n_neighbors = 5)
#'
#' # Combine the two representations into one
#' iris_combined <- simplicial_set_union(iris_sg12, iris_sg34)
#'
#' # Optimize the layout based on the combined view
#' iris_combined_umap <- optimize_graph_layout(iris_combined, n_epochs = 100)
#' @export
simplicial_set_union <-
  function(x,
           y,
           n_threads = NULL,
           verbose = FALSE) {
    if (!is_sparse_matrix(x)) {
      stop("similarity graph x must be a sparse matrix")
    }
    if (!is_sparse_matrix(y)) {
      stop("similarity graph y must be a sparse matrix")
    }
    if (!all(dim(x) == dim(y))) {
      stop("x and y must have identical dimensions")
    }

    z <- methods::as(x + y, "TsparseMatrix")

    z@x <- general_sset_union_cpp(
      x@p,
      x@i,
      x@x,
      y@p,
      y@i,
      y@x,
      z@i,
      z@j,
      z@x
    )

    z <- Matrix::drop0(z)
    reset_local_connectivity(
      z,
      reset_local_metric = TRUE,
      n_threads = n_threads,
      verbose = verbose
    )
  }


#' Merge Similarity Graph by Simplicial Set Intersection
#'
#' Combine two similarity graphs by treating them as fuzzy topological sets and
#' forming the intersection.
#'
#' @param x A sparse matrix representing the first similarity graph in the
#'   intersection operation.
#' @param y A sparse matrix representing the second similarity graph in the
#'   intersection operation.
#' @param weight A value between \code{0} and \code{1}, controlling the
#'   relative influence of \code{x} and \code{y} in the intersection. The
#'   default (\code{0.5}) gives equal influence. Values smaller than \code{0.5}
#'   put more weight on \code{x}. Values greater than \code{0.5} put more
#'   weight on \code{y}.
#' @param n_threads Number of threads to use when resetting the local metric.
#'   Default is half the number of concurrent threads supported by the system.
#' @param verbose If \code{TRUE}, log progress to the console.
#' @returns A sparse matrix containing the intersection of \code{x} and
#'   \code{y}.
#' @examples
#'
#' # Form two different "views" of the same data
#' iris30 <- iris[c(1:10, 51:60, 101:110), ]
#' iris_sg12 <- similarity_graph(iris30[, 1:2], n_neighbors = 5)
#' iris_sg34 <- similarity_graph(iris30[, 3:4], n_neighbors = 5)
#'
#' # Combine the two representations into one
#' iris_combined <- simplicial_set_intersect(iris_sg12, iris_sg34)
#'
#' # Optimize the layout based on the combined view
#' iris_combined_umap <- optimize_graph_layout(iris_combined, n_epochs = 100)
#' @export
simplicial_set_intersect <- function(x, y, weight = 0.5, n_threads = NULL,
                                     verbose = FALSE) {
  if (weight < 0 || weight > 1) {
    stop("weight must be between 0 and 1")
  }
  if (!is_sparse_matrix(x)) {
    stop("similarity graph x must be a sparse matrix")
  }
  if (!is_sparse_matrix(y)) {
    stop("similarity graph y must be a sparse matrix")
  }
  if (!all(dim(x) == dim(y))) {
    stop("x and y must have identical dimensions")
  }
  set_intersect(
    A = x, B = y, weight = weight, reset_connectivity = TRUE,
    reset_local_metric = TRUE, n_threads = n_threads,
    verbose = verbose
  )
}

# Function that does all the real work
uwot <- function(X, n_neighbors = 15, n_components = 2, metric = "euclidean",
                 n_epochs = NULL,
                 alpha = 1, scale = FALSE,
                 init = "spectral", init_sdev = NULL,
                 spread = 1, min_dist = 0.01,
                 set_op_mix_ratio = 1.0, local_connectivity = 1.0,
                 bandwidth = 1.0, gamma = 1.0,
                 negative_sample_rate = 5.0, a = NULL, b = NULL,
                 nn_method = NULL, n_trees = 50,
                 search_k = 2 * n_neighbors * n_trees,
                 method = "umap",
                 perplexity = 50, approx_pow = FALSE,
                 y = NULL, target_n_neighbors = n_neighbors,
                 target_metric = "euclidean",
                 target_weight = 0.5,
                 n_threads = NULL,
                 n_sgd_threads = 0,
                 grain_size = 1,
                 kernel = "gauss",
                 ret_model = FALSE, ret_nn = FALSE, ret_fgraph = FALSE,
                 ret_sigma = FALSE, ret_localr = FALSE,
                 pca = NULL, pca_center = TRUE, pca_method = NULL,
                 pcg_rand = TRUE,
                 fast_sgd = FALSE,
                 batch = FALSE,
                 opt_args = NULL,
                 tmpdir = tempdir(),
                 verbose = getOption("verbose", TRUE),
                 epoch_callback = NULL,
                 binary_edge_weights = FALSE,
                 dens_scale = NULL,
                 is_similarity_graph = FALSE,
                 seed = NULL,
                 nn_args = list(),
                 sparse_X_is_distance_matrix = TRUE) {
  if (is.null(n_threads)) {
    n_threads <- default_num_threads()
  }
  method <- match.arg(tolower(method), c("umap", "tumap", "largevis"))

  if (method == "umap") {
    if (is.null(a) || is.null(b)) {
      ab_res <- find_ab_params(spread = spread, min_dist = min_dist)
      a <- ab_res[1]
      b <- ab_res[2]
      tsmessage("UMAP embedding parameters a = ", formatC(a), " b = ", formatC(b))
    } else {
      # set min_dist and spread to NULL so if ret_model = TRUE, their default
      # values are not mistaken for having been used for anything
      min_dist <- NULL
      spread <- NULL
    }
  }

  if (n_neighbors < 2) {
    stop("n_neighbors must be >= 2")
  }

  if (set_op_mix_ratio < 0.0 || set_op_mix_ratio > 1.0) {
    stop("set_op_mix_ratio must be between 0.0 and 1.0")
  }

  if (local_connectivity < 1.0) {
    stop("local_connectivity cannot be < 1.0")
  }

  if (!is.null(y) && is.numeric(y) && any(is.na(y))) {
    stop("numeric y cannot contain NA")
  }

  if (!is.numeric(n_components) || n_components < 1) {
    stop("'n_components' must be a positive integer")
  }

  if (!is.null(pca)) {
    if (!is.numeric(pca) || pca < 1) {
      stop("'pca' must be a positive integer")
    }
    if (pca < n_components) {
      stop("'pca' must be >= n_components")
    }
    if (pca > min(nrow(X), ncol(X))) {
      stop("'pca' must be <= min(nrow(X), ncol(X))")
    }
  }
  if (is.null(pca_method)) {
    pca_method <- "auto"
  }
  pca_method <-
    match.arg(pca_method,
      choices = c("irlba", "svdr", "bigstatsr", "svd", "auto")
    )

  if (fast_sgd) {
    n_sgd_threads <- "auto"
    pcg_rand <- FALSE
    approx_pow <- TRUE
  }

  if (n_threads < 0) {
    stop("n_threads cannot be < 0")
  }
  if (n_threads %% 1 != 0) {
    n_threads <- round(n_threads)
    tsmessage("Non-integer 'n_threads' provided. Setting to ", n_threads)
  }
  if (n_sgd_threads == "auto") {
    n_sgd_threads <- n_threads
  }
  if (n_sgd_threads < 0) {
    stop("n_sgd_threads cannot be < 0")
  }
  if (n_sgd_threads %% 1 != 0) {
    n_sgd_threads <- round(n_sgd_threads)
    tsmessage("Non-integer 'n_sgd_threads' provided. Setting to ", n_sgd_threads)
  }
  if (!is.null(dens_scale) && approx_pow) {
    warning("approx_pow parameter is ignored when using dens_scale")
    approx_pow <- FALSE
  }
  # #110: for more consistent reproducibility, set a user-supplied seed
  if (!is.null(seed)) {
    tsmessage("Setting random seed ", seed)
    set.seed(seed)
  }

  if (is.character(nn_method) && nn_method == "hnsw") {
    if (!is_installed("RcppHNSW")) {
      stop("RcppHNSW is required for nn_method = 'hnsw', please install it")
    }
    if (!is_ok_hnsw_metric(metric)) {
      stop(
        "bad metric: hnsw only supports 'euclidean', 'cosine' or ",
        "'correlation' metrics"
      )
    }
    if (!is_ok_hnsw_metric(target_metric)) {
      stop(
        "bad target_metric: hnsw only supports 'euclidean', 'cosine' or ",
        "'correlation' metrics"
      )
    }
  }

  if (is.character(nn_method) && nn_method == "nndescent") {
    if (!is_installed("rnndescent")) {
      stop("rnndescent is required for nn_method = 'nndescent',",
           " please install it")
    }
  }

  ret_extra <- ret_model || ret_nn || ret_fgraph || ret_sigma || ret_localr

  # Store categorical columns to be used to generate the graph
  Xcat <- NULL
  # number of original columns in data frame (or matrix)
  # will be used only if using df or matrix and ret_model = TRUE
  norig_col <- NULL
  # row names for the input data, which we will apply to the embedding if
  # needed
  Xnames <- NULL
  num_precomputed_nns <- 0
  if (is.null(X)) {
    if (!nn_is_precomputed(nn_method)) {
      stop("If X is NULL, must provide NN data in nn_method")
    }
    if (is.character(init) && tolower(init) %in% c("spca", "pca")) {
      stop("init = 'pca' and 'spca' can't be used with X = NULL")
    }
    if (length(nn_method) == 0) {
      stop("Incorrect format for precalculated neighbor data")
    }
    n_vertices <- x2nv(nn_method)
    stopifnot(n_vertices > 0)
    num_precomputed_nns <- check_graph_list(nn_method, n_vertices,
      bipartite = FALSE
    )
    Xnames <- nn_graph_row_names_list(nn_method)
  } else if (inherits(X, "dist")) {
    if (ret_model) {
      stop("Can only create models with dense matrix or data frame input")
    }
    checkna(X)
    n_vertices <- attr(X, "Size")
    tsmessage("Read ", n_vertices, " rows")
    Xnames <- labels(X)
  } else if (is_sparse_matrix(X) && sparse_X_is_distance_matrix) {
    if (ret_model) {
      stop("Can only create models with dense matrix or data frame input")
    }
    checkna(X)
    n_vertices <- nrow(X)
    if (ncol(X) != n_vertices) {
      stop("Sparse matrices are only supported as distance matrices")
    }
    tsmessage("Read ", n_vertices, " rows of sparse distance matrix")
    Xnames <- row.names(X)
  } else {
    cat_ids <- NULL
    norig_col <- ncol(X)
    if (methods::is(X, "data.frame") || methods::is(X, "matrix") || is_sparse_matrix(X)) {
      cat_res <- find_categoricals(metric)
      metric <- cat_res$metrics
      cat_ids <- cat_res$categoricals
      # Convert categorical columns to factors if they aren't already
      if (!is.null(cat_ids)) {
        X[, cat_ids] <- sapply(X[, cat_ids, drop = FALSE], factor,
          simplify = methods::is(X, "matrix")
        )
        Xcat <- X[, cat_ids, drop = FALSE]
      }

      if (methods::is(X, "data.frame")) {
        indexes <- which(vapply(X, is.numeric, logical(1)))
        if (length(indexes) == 0) {
          stop("No numeric columns found")
        }
        tsmessage("Converting dataframe to numerical matrix")
        if (length(indexes) != ncol(X)) {
          X <- X[, indexes]
        }
        X <- as.matrix(X)
      }
      if (n_components > ncol(X)) {
        warning(
          "n_components ",
          "> number of columns in input data: ",
          n_components,
          " > ",
          ncol(X),
          ", this may give poor or unexpected results"
        )
      }
    } else {
      stop("Unknown input data format")
    }
    checkna(X)
    n_vertices <- nrow(X)
    tsmessage(
      "Read ", n_vertices, " rows and found ", ncol(X),
      " numeric columns",
      appendLF = is.null(cat_ids)
    )
    if (length(cat_ids) > 0) {
      tsmessage(" and ", pluralize("categorical column", length(cat_ids)),
        time_stamp = FALSE
      )
    }
    Xnames <- row.names(X)
    X <- remove_scaling_attrs(X)
    X <- scale_input(X,
      scale_type = scale, ret_model = ret_model,
      verbose = verbose
    )
  }

  # Store number of precomputed nn if X is non-NULL (NULL X case handled above)
  if (nn_is_precomputed(nn_method) && num_precomputed_nns == 0) {
    num_precomputed_nns <- check_graph_list(nn_method, n_vertices,
      bipartite = FALSE
    )
    if (is.null(Xnames)) {
      Xnames <- nn_graph_row_names_list(nn_method)
    }
  }

  if (method == "largevis" && kernel == "knn") {
    n_neighbors <- perplexity
  }

  if (max(n_neighbors) > n_vertices) {
    # pre-calculated nearest neighbors ignores the user-supplied n_neighbors
    # which is handled later
    if (!is.list(nn_method)) {
      if (method == "largevis") {
        # for LargeVis, n_neighbors is normally determined from perplexity, so
        # it is not an error for it to be too large
        tsmessage("Setting n_neighbors to ", n_vertices)
        n_neighbors <- n_vertices
      } else {
        stop("n_neighbors must be smaller than the dataset size")
      }
    }
  }

  if (!is.list(metric)) {
    metrics <- list(c())
    names(metrics) <- metric
  } else {
    metrics <- metric
  }

  # For the typical case of a numeric matrix X with a non-binary metric, save
  # the PCA results here in case the initialization uses PCA too
  pca_models <- NULL
  pca_shortcut <- FALSE
  if (!is.null(pca) && length(metric) == 1 && !is_binary_metric(metric) &&
    is.matrix(X) && ncol(X) > pca) {
    tsmessage("Reducing X column dimension to ", pca, " via PCA")
    pca_res <- pca_init(X,
      ndim = pca, center = pca_center, pca_method = pca_method,
      ret_extra = ret_model, verbose = verbose
    )
    if (ret_model) {
      X <- pca_res$scores
      pca_models[["1"]] <- pca_res[c("center", "rotation")]
      pca_res <- NULL
    } else {
      X <- pca_res
    }
    pca_shortcut <- TRUE
  }

  if (is_similarity_graph) {
    d2sr <-
      list(
        V = nn_method,
        nns = NULL,
        pca_models = NULL,
        sigma = NULL,
        rho = NULL
      )
    need_sigma <- FALSE
  } else {
    need_sigma <- ret_sigma || ret_localr || !is.null(dens_scale)
    d2sr <- data2set(
      X,
      Xcat,
      n_neighbors,
      metrics,
      nn_method,
      n_trees,
      search_k,
      method,
      set_op_mix_ratio,
      local_connectivity,
      bandwidth,
      perplexity,
      kernel,
      need_sigma,
      n_threads,
      grain_size,
      ret_model,
      pca = pca,
      pca_center = pca_center,
      pca_method = pca_method,
      n_vertices = n_vertices,
      nn_args = nn_args,
      tmpdir = tmpdir,
      sparse_is_distance = sparse_X_is_distance_matrix,
      verbose = verbose
    )
  }
  V <- d2sr$V
  nns <- d2sr$nns
  if (is.null(pca_models)) {
    pca_models <- d2sr$pca_models
  }

  # Calculate approximate local radii
  sigma <- NULL
  rho <- NULL
  localr <- NULL
  dint <- NULL
  if (need_sigma) {
    sigma <- d2sr$sigma
    rho <- d2sr$rho
    dint <- d2sr$dint
  }
  if (!is.null(dens_scale) || ret_localr) {
    localr <- sigma + rho
  }

  if (!is.null(y)) {
    tsmessage("Processing y data")

    if (!is.list(target_metric)) {
      target_metrics <- list(c())
      names(target_metrics) <- target_metric
    } else {
      target_metrics <- target_metric
    }

    ycat <- NULL
    ycat_ids <- NULL
    if (methods::is(y, "data.frame")) {
      ycat_res <- find_categoricals(target_metric)
      target_metric <- ycat_res$metrics
      ycat_ids <- ycat_res$categoricals
      if (!is.null(ycat_ids)) {
        ycat <- y[, ycat_ids, drop = FALSE]
      } else {
        ycindexes <- which(vapply(y, is.factor, logical(1)))
        if (length(ycindexes) > 0) {
          ycat <- (y[, ycindexes, drop = FALSE])
        }
      }

      yindexes <- which(vapply(y, is.numeric, logical(1)))

      if (length(yindexes) > 0) {
        y <- as.matrix(y[, yindexes])
      } else {
        y <- NULL
      }
    } else if (is.list(y)) {
      nn_method <- y
    } else if (is.numeric(y)) {
      y <- as.matrix(y)
    } else if (is.factor(y)) {
      ycat <- data.frame(y)
      y <- NULL
    }

    if (!is.null(y)) {
      yd2sr <- data2set(y, ycat, target_n_neighbors, target_metrics, nn_method,
        n_trees, search_k,
        method,
        set_op_mix_ratio = 1.0,
        local_connectivity = 1.0,
        bandwidth = 1.0,
        perplexity = perplexity, kernel = kernel,
        ret_sigma = FALSE,
        n_threads = n_threads, grain_size = grain_size,
        ret_model = FALSE,
        pca = pca, pca_center = TRUE, pca_method = pca_method,
        n_vertices = n_vertices,
        tmpdir = tmpdir,
        verbose = verbose
      )

      tsmessage(
        "Intersecting X and Y sets with target weight = ",
        formatC(target_weight)
      )
      # behavior for supervised UMAP: do reset local connectivity, but
      # don't reset the local metric (same as Python UMAP as of 0.5.3)
      V <- set_intersect(V, yd2sr$V, target_weight, reset_connectivity = TRUE)
      yd2sr$V <- NULL
      yd2sr$nns <- NULL
    } else if (!is.null(ycat)) {
      V <- categorical_intersection_df(ycat, V,
        weight = target_weight,
        verbose = verbose
      )
    }
  }

  if (!(ret_model || ret_nn)) {
    nns <- NULL
    gc()
  }

  if (methods::is(init, "matrix")) {
    if (nrow(init) != n_vertices || ncol(init) != n_components) {
      stop("init matrix does not match necessary configuration for X: ", "should
           have dimensions (", n_vertices, ", ", n_components, ")")
    }
    tsmessage("Initializing from user-supplied matrix")
    embedding <- scale_coords(init, init_sdev, verbose = verbose)
  } else if (!(methods::is(init, "character") && length(init) == 1)) {
    if (is.null(init) && !is.null(n_epochs) && n_epochs == 0) {
      embedding <- NULL
      if (!ret_extra) {
        warning(
          "Neither high-dimensional nor low-dimensional data will be ",
          "returned with this combination of settings"
        )
      }
      if (ret_model) {
        warning(
          "Returning a model but it will not be valid for transforming ",
          "new data"
        )
      }
    } else {
      stop(
        "init should be either a matrix or string describing the ",
        "initialization method"
      )
    }
  } else {
    init <- match.arg(tolower(init), c(
      "spectral", "random", "lvrandom", "normlaplacian",
      "laplacian", "spca", "pca", "inormlaplacian", "ispectral",
      "agspectral", "irlba_spectral", "irlba_laplacian", "pacpca"
    ))

    if (init_is_spectral(init)) {
      connected <- connected_components(V)
      if (connected$n_components > 1) {
        tsmessage("Found ", connected$n_components, " connected components, ", appendLF = FALSE)
        if (is.null(X)) {
          tsmessage("falling back to random initialization", time_stamp = FALSE)
          init <- "random"
        } else {
          tsmessage("falling back to 'spca' initialization with init_sdev = 1",
            time_stamp = FALSE
          )
          init <- "spca"
          init_sdev <- 1
        }
      }
    }

    # Don't repeat PCA initialization if we've already done it once
    if (pca_shortcut && init %in% c("spca", "pca", "pacpca") && pca >= n_components) {
      embedding <- X[, 1:n_components]
      switch(init,
        spca = tsmessage("Initializing from scaled PCA"),
        pca = tsmessage("Initializing from PCA"),
        pacpca = tsmessage("Initializing from PaCMAP-style PCA"),
        stop("Unknown init method '", init, "'")
      )
    } else {
      embedding <- switch(init,
        spectral = spectral_init(V, ndim = n_components, verbose = verbose),
        random = rand_init(n_vertices, n_components, verbose = verbose),
        lvrandom = rand_init_lv(n_vertices, n_components, verbose = verbose),
        normlaplacian = normalized_laplacian_init(V,
          ndim = n_components,
          verbose = verbose
        ),
        laplacian = laplacian_eigenmap(V, ndim = n_components, verbose = verbose),
        # we handle scaling pca below
        spca = pca_init(X,
          ndim = n_components, pca_method = pca_method,
          verbose = verbose
        ),
        pca = pca_init(X,
          ndim = n_components, pca_method = pca_method,
          verbose = verbose
        ),
        pacpca = pca_init(X,
          ndim = n_components, pca_method = pca_method,
          verbose = verbose
        ),
        ispectral = irlba_spectral_init(V, ndim = n_components, verbose = verbose),
        inormlaplacian = irlba_normalized_laplacian_init(V,
          ndim = n_components,
          verbose = verbose
        ),
        agspectral = agspectral_init(V,
          n_neg_nbrs = negative_sample_rate,
          ndim = n_components, verbose = verbose
        ),
        irlba_spectral = spectral_init(V, ndim = n_components, verbose = verbose, force_irlba = TRUE),
        irlba_laplacian = laplacian_eigenmap(V, ndim = n_components, verbose = verbose, force_irlba = TRUE),
        stop("Unknown initialization method: '", init, "'")
      )
    }
    if (init == "pacpca") {
      embedding <- 0.01 * embedding
    }
    if (!is.null(init_sdev) || init == "spca") {
      if (is.null(init_sdev)) {
        init_sdev <- 1e-4
      }
      embedding <- scale_coords(embedding, init_sdev, verbose = verbose)
    }
  }
  if (any(is.na(embedding))) {
    stop("Initial data contains NA values: is n_components too high?")
  }

  if (is.null(n_epochs) || n_epochs < 0) {
    if (method == "largevis") {
      n_epochs <- lvish_epochs(n_vertices, V)
    } else {
      if (n_vertices <= 10000) {
        n_epochs <- 500
      } else {
        n_epochs <- 200
      }
    }
  }

  full_opt_args <- get_opt_args(opt_args, alpha)

  if (binary_edge_weights) {
    V@x <- rep(1, length(V@x))
  }

  if (n_epochs > 0) {
    if (any(apply(embedding, 2, stats::sd) > 10.0)) {
      warning(
        "Initial embedding standard deviation > 10.0, this can lead to ",
        "poor optimization"
      )
    }

    # remove edges which can't be sampled due to n_epochs
    V@x[V@x < max(V@x) / n_epochs] <- 0
    V <- Matrix::drop0(V)

    # Create the (0-indexed) indices of the head and tail of each directed edge
    # in V. Graph is symmetric, so both (i->j) and (j->i) are present
    if (batch) {
      V <- Matrix::t(V)
      # head is ordered in non-decreasing order of node index
      positive_head <- Matrix::which(V != 0, arr.ind = TRUE)[, 2] - 1
      # tail is unordered
      positive_tail <- V@i
    } else {
      # Use the Python UMAP ordering
      # head is unordered
      positive_head <- V@i
      # tail is ordered in non-decreasing order of node index
      positive_tail <- Matrix::which(V != 0, arr.ind = TRUE)[, 2] - 1
    }
    # start/end pointers into the ordered vector
    positive_ptr <- V@p
    epochs_per_sample <- make_epochs_per_sample(V@x, n_epochs)

    tsmessage(
      "Commencing optimization for ", n_epochs, " epochs, with ",
      length(positive_head), " positive edges",
      pluralize("thread", n_sgd_threads, " using")
    )

    ai <- NULL
    if (!is.null(dens_scale)) {
      ai <- scale_radii(localr, dens_scale, a)
      method <- "leopold"
      if (ret_model) {
        # store the linear transform from localr to ai for transforming new data
        lai2 <- 2 * log(range(ai))
        llr <- -log(rev(range(localr)))
        rad_coeff <- stats::lm(lai2 ~ llr)$coefficients
      }
    }

    method <- tolower(method)
    method_args <- switch(method,
      umap = list(a = a, b = b, gamma = gamma, approx_pow = approx_pow),
      tumap = list(),
      # a = 1 b = 10 for final phase of PaCMAP optimization
      pacmap = list(a = a, b = b),
      largevis = list(gamma = gamma),
      leopold = list(ai = ai, b = b, ndim = n_components),
      stop("Unknown dimensionality reduction method '", method, "'")
    )

    embedding <- t(embedding)
    embedding <- optimize_layout_r(
      head_embedding = embedding,
      tail_embedding = NULL,
      positive_head = positive_head,
      positive_tail = positive_tail,
      positive_ptr = positive_ptr,
      n_epochs = n_epochs,
      n_head_vertices = n_vertices,
      n_tail_vertices = n_vertices,
      epochs_per_sample = epochs_per_sample,
      method = method,
      method_args = method_args,
      initial_alpha = alpha,
      opt_args = full_opt_args,
      negative_sample_rate = negative_sample_rate,
      pcg_rand = pcg_rand,
      batch = batch,
      n_threads = n_sgd_threads,
      grain_size = grain_size,
      move_other = TRUE,
      epoch_callback = epoch_callback,
      verbose = verbose
    )
    embedding <- t(embedding)

    gc()
    # Center the points before returning
    embedding <- scale(embedding, center = TRUE, scale = FALSE)

    if (is.null(row.names(embedding)) &&
      !is.null(Xnames) && length(Xnames) == nrow(embedding)) {
      row.names(embedding) <- Xnames
    }
    tsmessage("Optimization finished")
  }

  if (ret_extra) {
    nblocks <- length(nns)
    res <- list(embedding = embedding)
    if (ret_model) {
      res <- append(res, list(
        scale_info = if (!is.null(X)) {
          attr_to_scale_info(X)
        } else {
          NULL
        },
        search_k = search_k,
        local_connectivity = local_connectivity,
        n_epochs = n_epochs,
        alpha = alpha,
        negative_sample_rate = negative_sample_rate,
        method = method,
        a = a,
        b = b,
        gamma = gamma,
        approx_pow = approx_pow,
        metric = metrics,
        norig_col = norig_col,
        pcg_rand = pcg_rand,
        batch = batch,
        opt_args = full_opt_args,
        num_precomputed_nns = num_precomputed_nns,
        # #95: min_dist and spread are exported for documentation purposes only
        min_dist = min_dist,
        spread = spread,
        binary_edge_weights = binary_edge_weights,
        seed = seed,
        nn_method = nn_method,
        nn_args = nn_args
      ))
      if (nn_is_precomputed(nn_method)) {
        res$n_neighbors <- nn_graph_nbrs_list(nn_method)
      } else {
        res$n_neighbors <- n_neighbors
      }
      if (method == "leopold") {
        res$dens_scale <- dens_scale
        res$ai <- ai
        res$rad_coeff <- rad_coeff
      }
      if (nblocks > 1) {
        if (!nn_is_precomputed(nn_method)) {
          res$nn_index <- list()
          for (i in 1:nblocks) {
            res$nn_index[[i]] <- nns[[i]]$index
          }
        }
      } else {
        if (!is.null(nns[[1]]$index)) {
          res$nn_index <- nns[[1]]$index
          if (is.null(res$metric[[1]])) {
            # #31: Metric usually lists column indices or names; NULL means use all
            # of them, but for loading the NN index we need the number of
            # columns explicitly (we don't have access to the column dimension of
            # the input data at load time)
            if (res$nn_index$type %in% c("annoyv2", "hnswv1", "nndescentv1")) {
              res$metric[[1]] <- list(ndim = res$nn_index$ndim)
            }
            else {
              # To be sure of the dimensionality, fetch the first item from the
              # index and see how many elements are in the returned vector.
              if (!is.null(X)) {
                rcppannoy <- get_rcppannoy(res$nn_index)
                res$metric[[1]] <- list(ndim = length(rcppannoy$getItemsVector(0)))
              } else {
                res$metric[[1]] <- list()
              }
            }
          }
        } else {
          if (nn_is_precomputed(nn_method)) {
            tsmessage(
              "Note: model requested with precomputed neighbors. ",
              "For transforming new data, distance data must be ",
              "provided separately"
            )
          }
        }
      }
      if (!is.null(pca_models)) {
        res$pca_models <- pca_models
      }
    }
    if (ret_nn) {
      res$nn <- list()
      for (i in 1:nblocks) {
        if (is.list(nns[[i]])) {
          res$nn[[i]] <- list(idx = nns[[i]]$idx, dist = nns[[i]]$dist)
          if (!is.null(Xnames) && nrow(res$nn[[i]]$idx) == length(Xnames)) {
            row.names(res$nn[[i]]$idx) <- Xnames
            row.names(res$nn[[i]]$dist) <- Xnames
          }
        } else if (is_sparse_matrix(nns[[i]])) {
          res$nn[[i]] <- nns[[i]]
          if (!is.null(Xnames) && nrow(res$nn[[i]]) == length(Xnames)) {
            row.names(res$nn[[i]]) <- Xnames
            colnames(res$nn[[i]]) <- Xnames
          }
        }
      }
      names(res$nn) <- names(nns)
    }
    if (ret_fgraph) {
      if (method == "largevis") {
        res$P <- V
      } else {
        res$fgraph <- V
      }
    }
    if (ret_sigma) {
      res$sigma <- sigma
      res$rho <- rho
      res$dint <- dint
    }
    if (ret_localr && !is.null(localr)) {
      res$localr <- localr
    }
  } else {
    res <- embedding
  }

  res
}

#' Save or Load a Model
#'
#' Functions to write a UMAP model to a file, and to restore it.
#'
#' @param model a UMAP model created by \code{\link{umap}}.
#' @param file name of the file where the model is to be saved or read from.
#' @param unload if \code{TRUE}, unload all nearest neighbor indexes for the
#'   model. The \code{model} will no longer be valid for use in
#'   \code{\link{umap_transform}} and the temporary working directory used
#'   during model saving will be deleted. You will need to reload the model with
#'   \code{load_uwot} to use the model. If \code{FALSE}, then the model can be
#'   re-used without reloading, but you must manually unload the NN index when
#'   you are finished using it if you want to delete the temporary working
#'   directory. To unload manually, use \code{\link{unload_uwot}}. The absolute
#'   path of the working directory is found in the \code{mod_dir} item of the
#'   return value.
#' @param verbose if \code{TRUE}, log information to the console.
#' @return \code{model} with one extra item: \code{mod_dir}, which contains the
#'   path to the working directory. If \code{unload = FALSE} then this directory
#'   still exists after this function returns, and can be cleaned up with
#'   \code{\link{unload_uwot}}. If you don't care about cleaning up this
#'   directory, or \code{unload = TRUE}, then you can ignore the return value.
#' @examples
#' iris_train <- iris[c(1:10, 51:60), ]
#' iris_test <- iris[100:110, ]
#'
#' # create model
#' model <- umap(iris_train, ret_model = TRUE, n_epochs = 20)
#'
#' # save without unloading: this leaves behind a temporary working directory
#' model_file <- tempfile("iris_umap")
#' model <- save_uwot(model, file = model_file)
#'
#' # The model can continue to be used
#' test_embedding <- umap_transform(iris_test, model)
#'
#' # To manually unload the model from memory when finished and to clean up
#' # the working directory (this doesn't touch your model file)
#' unload_uwot(model)
#'
#' # At this point, model cannot be used with umap_transform, this would fail:
#' # test_embedding2 <- umap_transform(iris_test, model)
#'
#' # restore the model: this also creates a temporary working directory
#' model2 <- load_uwot(file = model_file)
#' test_embedding2 <- umap_transform(iris_test, model2)
#'
#' # Unload and clean up the loaded model temp directory
#' unload_uwot(model2)
#'
#' # clean up the model file
#' unlink(model_file)
#'
#' # save with unloading: this deletes the temporary working directory but
#' # doesn't allow the model to be re-used
#' model3 <- umap(iris_train, ret_model = TRUE, n_epochs = 20)
#' model_file3 <- tempfile("iris_umap")
#' model3 <- save_uwot(model3, file = model_file3, unload = TRUE)
#'
#' @seealso \code{\link{load_uwot}}, \code{\link{unload_uwot}}
#' @export
save_uwot <- function(model, file, unload = FALSE, verbose = FALSE) {
  if (!all_nn_indices_are_loaded(model)) {
    stop("cannot save: NN index is unloaded")
  }

  wd <- getwd()
  model_file <- abspath(file)
  if (file.exists(model_file)) {
    stop("model file ", model_file, " already exists")
  }

  tmp_model_file <- NULL
  tryCatch(
    {
      # create directory to store files in
      mod_dir <- tempfile(pattern = "dir")
      tsmessage("Creating temp model dir ", mod_dir)
      dir.create(mod_dir)

      # create the tempdir/uwot subdirectory
      uwot_dir <- file.path(mod_dir, "uwot")
      tsmessage("Creating dir ", mod_dir)
      dir.create(uwot_dir)

      # save model file to tempdir/uwot/model
      model_tmpfname <- file.path(uwot_dir, "model")
      saveRDS(model, file = model_tmpfname)

      # save each nn index inside tempdir/uwot
      metrics <- names(model$metric)
      n_metrics <- length(metrics)

      for (i in 1:n_metrics) {
        if (n_metrics == 1) {
          nn_index <- model$nn_index
        }
        else {
          nn_index <- model$nn_index[[i]]
        }

        if (startsWith(nn_index$type, "annoy") ||
            startsWith(nn_index$type, "hnsw")) {

            nn_tmpfname <- file.path(uwot_dir, paste0("nn", i))
            nn_meta_tmpfname <- file.path(uwot_dir, paste0("nn-meta", i))
            nn_index$ann$save(nn_tmpfname)

            # save metadata wrapper around the index separately
            meta_data <- nn_index
            meta_data$ann <- NULL
            saveRDS(meta_data, file = nn_meta_tmpfname)
        }
        else if (startsWith(nn_index$type, "nndescent")) {
          nn_tmpfname <- file.path(uwot_dir, paste0("nn", i))
          saveRDS(nn_index, file = nn_tmpfname)
        }
        else {
          stop("unsupported nn index type: ", model$nn_index$type)
        }
      }

      # archive the files under the temp dir into the single target file
      # change directory so the archive only contains one directory
      tsmessage("Changing to ", mod_dir)
      setwd(mod_dir)
      tmp_model_file <- abspath(file)
      tsmessage("Creating ", tmp_model_file)

      # #109: Windows 7 tar needs "--force-local" to avoid interpreting colon
      # as indicating a remote machine
      extra_flags <- ""
      if (is_win7()) {
        extra_flags <- "--force-local"
      }
      utils::tar(
        tarfile = tmp_model_file,
        extra_flags = extra_flags,
        files = "uwot/"
      )
    },
    finally = {
      setwd(wd)
      if (!is.null(tmp_model_file) && model_file != tmp_model_file) {
        tsmessage("Copying ", tmp_model_file, " to ", model_file)
        file.copy(from = tmp_model_file, to = model_file)
      }
      model$mod_dir <- mod_dir
      if (unload) {
        unload_uwot(model, cleanup = TRUE, verbose = verbose)
      }
    }
  )
  model
}

#' Save or Load a Model
#'
#' Functions to write a UMAP model to a file, and to restore it.
#'
#' @param file name of the file where the model is to be saved or read from.
#' @param verbose if \code{TRUE}, log information to the console.
#' @return The model saved at \code{file}, for use with
#'   \code{\link{umap_transform}}. Additionally, it contains an extra item:
#'   \code{mod_dir}, which contains the path to the temporary working directory
#'   used during loading of the model. This directory cannot be removed until
#'   this model has been unloaded by using \code{\link{unload_uwot}}.
#' @examples
#' library(RSpectra)
#'
#' iris_train <- iris[c(1:10, 51:60), ]
#' iris_test <- iris[100:110, ]
#'
#' # create model
#' model <- umap(iris_train, ret_model = TRUE, n_epochs = 20)
#'
#' # save without unloading: this leaves behind a temporary working directory
#' model_file <- tempfile("iris_umap")
#' model <- save_uwot(model, file = model_file)
#'
#' # The model can continue to be used
#' test_embedding <- umap_transform(iris_test, model)
#'
#' # To manually unload the model from memory when finished and to clean up
#' # the working directory (this doesn't touch your model file)
#' unload_uwot(model)
#'
#' # At this point, model cannot be used with umap_transform, this would fail:
#' # test_embedding2 <- umap_transform(iris_test, model)
#'
#' # restore the model: this also creates a temporary working directory
#' model2 <- load_uwot(file = model_file)
#' test_embedding2 <- umap_transform(iris_test, model2)
#'
#' # Unload and clean up the loaded model temp directory
#' unload_uwot(model2)
#'
#' # clean up the model file
#' unlink(model_file)
#'
#' # save with unloading: this deletes the temporary working directory but
#' # doesn't allow the model to be re-used
#' model3 <- umap(iris_train, ret_model = TRUE, n_epochs = 20)
#' model_file3 <- tempfile("iris_umap")
#' model3 <- save_uwot(model3, file = model_file3, unload = TRUE)
#'
#' @seealso \code{\link{save_uwot}}, \code{\link{unload_uwot}}
#' @export
load_uwot <- function(file, verbose = FALSE) {
  # create directory to store files in
  mod_dir <- tempfile(pattern = "dir")
  tsmessage("Creating temp directory ", mod_dir)
  dir.create(mod_dir)

  # #109: Windows 7 tar needs "--force-local" to avoid interpreting colon
  # as indicating a remote machine
  extras <- NULL
  if (is_win7()) {
    extras <- "--force-local"
  }
  utils::untar(abspath(file),
    exdir = mod_dir,
    extras = extras,
    verbose = verbose
  )

  model_fname <- file.path(mod_dir, "uwot/model")
  if (!file.exists(model_fname)) {
    stop("Can't find model in ", file)
  }
  model <- readRDS(file = model_fname)

  metrics <- names(model$metric)
  n_metrics <- length(metrics)

  nn_method <- model$nn_method
  if (is.null(nn_method)) {
    nn_method <- "annoy"
  }
  for (i in 1:n_metrics) {
    nn_fname <- file.path(mod_dir, paste0("uwot/nn", i))
    if (!file.exists(nn_fname)) {
      stop("Can't find nearest neighbor index ", nn_fname, " in ", file)
    }
    metric <- metrics[[i]]
    # #31: need to specify the index dimensionality when creating the index
    if (is.list(model$metric[[i]])) {
      # in the case where there is only one metric, the value is a one-item list
      # named 'ndim' giving the number of dimensions directly: all columns
      # are used in this metric
      ndim <- model$metric[[i]]$ndim
    } else {
      # otherwise, metric specifies the name or index used for each metric,
      # so the dimension is the number of them
      ndim <- length(model$metric[[i]])
    }
    if (nn_method == "annoy") {
      annoy_metric <- metric
      ann <- create_ann(annoy_metric, ndim = ndim)
      ann$load(nn_fname)

      idx <-
        list(
          ann = ann,
          type = "annoyv1",
          metric = annoy_metric,
          ndim = ndim
        )
      if (n_metrics == 1) {
        model$nn_index <- idx
      } else {
        model$nn_index[[i]] <- idx
      }
    }
    else if (nn_method == "hnsw") {
      ann <- hnsw_load(metric, ndim = ndim, filename = nn_fname)
      nn_meta_tmpfname <- file.path(mod_dir, paste0("uwot/nn-meta", i))
      idx <- readRDS(nn_meta_tmpfname)
      idx$ann <- ann

      if (n_metrics == 1) {
        model$nn_index <- idx
      } else {
        model$nn_index[[i]] <- idx
      }
    }
    else if (nn_method == "nndescent") {
      idx <- readRDS(nn_fname)
      if (n_metrics == 1) {
        model$nn_index <- idx
      } else {
        model$nn_index[[i]] <- idx
      }
    }
    else {
      stop("Unknown nearest neighbor method ", nn_method)
    }
  }
  model$mod_dir <- mod_dir

  model
}


#' Unload a Model
#'
#' Unloads the UMAP model. This prevents the model being used with
#' \code{\link{umap_transform}}, but allows the temporary working directory
#' associated with saving or loading the model to be removed.
#'
#' @param model a UMAP model created by \code{\link{umap}}.
#' @param cleanup if \code{TRUE}, attempt to delete the temporary working
#'   directory that was used in either the save or load of the model.
#' @param verbose if \code{TRUE}, log information to the console.
#'
#' @examples
#' iris_train <- iris[c(1:10, 51:60), ]
#' iris_test <- iris[100:110, ]
#'
#' # create model
#' model <- umap(iris_train, ret_model = TRUE, n_epochs = 20)
#'
#' # save without unloading: this leaves behind a temporary working directory
#' model_file <- tempfile("iris_umap")
#' model <- save_uwot(model, file = model_file)
#'
#' # The model can continue to be used
#' test_embedding <- umap_transform(iris_test, model)
#'
#' # To manually unload the model from memory when finished and to clean up
#' # the working directory (this doesn't touch your model file)
#' unload_uwot(model)
#'
#' # At this point, model cannot be used with umap_transform, this would fail:
#' # test_embedding2 <- umap_transform(iris_test, model)
#'
#' # restore the model: this also creates a temporary working directory
#' model2 <- load_uwot(file = model_file)
#' test_embedding2 <- umap_transform(iris_test, model2)
#'
#' # Unload and clean up the loaded model temp directory
#' unload_uwot(model2)
#'
#' # clean up the model file
#' unlink(model_file)
#'
#' # save with unloading: this deletes the temporary working directory but
#' # doesn't allow the model to be re-used
#' model3 <- umap(iris_train, ret_model = TRUE, n_epochs = 20)
#' model_file3 <- tempfile("iris_umap")
#' model3 <- save_uwot(model3, file = model_file3, unload = TRUE)
#'
#' @seealso \code{\link{save_uwot}}, \code{\link{load_uwot}}
#' @export
unload_uwot <- function(model, cleanup = TRUE, verbose = FALSE) {
  if (is.null(model$nn_method) || model$nn_method == "annoy") {
    tsmessage("Unloading NN index: model will be invalid")
    metrics <- names(model$metric)
    n_metrics <- length(metrics)
    for (i in 1:n_metrics) {
      if (n_metrics == 1) {
        rcppannoy <- get_rcppannoy(model$nn_index)
        rcppannoy$unload()
      } else {
        rcppannoy <- get_rcppannoy(model$nn_index[[i]])
        rcppannoy$unload()
      }
    }
  }

  if (cleanup) {
    if (is.null(model$mod_dir)) {
      tsmessage("Model is missing temp dir location, can't clean up")
      return()
    } else {
      mod_dir <- model$mod_dir
      if (!file.exists(mod_dir)) {
        tsmessage("model temp dir location '", mod_dir, "' no longer exists")
        return()
      }
      tsmessage("Deleting temp model dir ", mod_dir)
      res <- unlink(mod_dir, recursive = TRUE)
      if (res != 0) {
        tsmessage("Unable to delete tempdir ", mod_dir)
      }
    }
  }
}

all_nn_indices_are_loaded <- function(model) {
  if (is.null(model$nn_index)) {
    stop("Invalid model: has no 'nn_index'")
  }
  if (is.list(model$nn_index)) {
    if (is.null(model$nn_index$type)) {
      for (i in 1:length(model$nn_index)) {
        rcppannoy <- get_rcppannoy(model$nn_index[[i]])
        if (rcppannoy$getNTrees() == 0) {
          return(FALSE)
        }
      }
    }
    else if (model$nn_index$type == "annoyv1") {
      rcppannoy <- get_rcppannoy(model$nn_index)
      if (rcppannoy$getNTrees() == 0) {
        return(FALSE)
      }
    }
    else if (model$nn_index$type == "hnswv1") {
      return(TRUE)
    }
    else if (model$nn_index$type == "nndescentv1") {
      return(TRUE)
    }
    else {
      stop("Invalid model: has unknown 'nn_index' type ", model$nn_index$type)
    }
  } else {
    rcppannoy <- get_rcppannoy(model$nn_index)
    if (rcppannoy$getNTrees() == 0) {
      return(FALSE)
    }
  }
  TRUE
}

abspath <- function(filename) {
  file.path(normalizePath(dirname(filename)), basename(filename))
}

# Half of whatever the C++ implementation thinks are the number of concurrent
# threads supported, but at least 1
default_num_threads <- function() {
  max(1, hardware_concurrency() / 2)
}

# Get the number of vertices in X
x2nv <- function(X) {
  if (is.list(X)) {
    if (!is.null(X$idx)) {
      n_vertices <- x2nv(X$idx)
    } else {
      if (length(X) > 0) {
        n_vertices <- x2nv(X[[1]])
      } else {
        stop("Can't find n_vertices for list X")
      }
    }
  } else if (inherits(X, "dist")) {
    n_vertices <- attr(X, "Size")
  } else if (is_sparse_matrix(X)) {
    # the older code path, where the distance matrix was part of X rather than
    # nn_method, used nrow; but transform was not supported, so nrow == ncol
    n_vertices <- ncol(X)
  } else if (methods::is(X, "data.frame") || methods::is(X, "matrix")) {
    n_vertices <- nrow(X)
  } else if (is.numeric(X)) {
    n_vertices <- length(X)
  } else {
    stop("Can't find number of vertices for X of type '", class(X)[1], "'")
  }
  n_vertices
}

data2set <- function(X,
                     Xcat,
                     n_neighbors,
                     metrics,
                     nn_method,
                     n_trees,
                     search_k,
                     method,
                     set_op_mix_ratio,
                     local_connectivity,
                     bandwidth,
                     perplexity,
                     kernel,
                     ret_sigma,
                     n_threads,
                     grain_size,
                     ret_model,
                     n_vertices = x2nv(X),
                     tmpdir = tempdir(),
                     pca = NULL,
                     pca_center = TRUE,
                     pca_method = "irlba",
                     nn_args = list(),
                     sparse_is_distance = TRUE,
                     verbose = FALSE) {
  V <- NULL
  nns <- list()
  nblocks <- length(metrics)
  sigma <- NULL
  # Check for precalculated NN data in nn_method
  if (is.list(nn_method)) {
    if (is.null(nn_method$idx)) {
      nblocks <- length(nn_method)
      if (nblocks == 0) {
        stop("Incorrect format for precalculated neighbor data")
      }
    } else {
      nblocks <- 1
      # wrap nn data in a list so data is always a list of lists
      nn_method <- list(nn_method)
    }
    metrics <- replicate(nblocks, NULL, simplify = FALSE)
    names(metrics) <- rep("precomputed", nblocks)
  }


  if (nblocks > 1) {
    tsmessage("Found ", nblocks, " blocks of data")
  }
  mnames <- tolower(names(metrics))
  if (is.null(nn_method)) {
    if (methods::is(X, "matrix")) {
      if (n_vertices < 4096 &&
        !ret_model &&
        all(mnames == "euclidean")) {
        tsmessage("Using FNN for neighbor search, n_neighbors = ", n_neighbors)
        nn_method <- "fnn"
      } else {
        tsmessage("Using Annoy for neighbor search, n_neighbors = ", n_neighbors)
        nn_method <- "annoy"
      }
    }
    else {
      # It's a dist, or an actual distance matrix (sparse or triangular)
      nn_method <- "matrix"
    }
  }

  pca_models <- list()
  for (i in 1:nblocks) {
    metric <- mnames[[i]]
    if (is.character(nn_method) && nn_method == "annoy") {
      metric <- match.arg(metric, c(
        "euclidean", "cosine", "manhattan",
        "hamming", "correlation", "precomputed"
      ))
    }
    # Defaults for this block which can be overridden
    pca_i <- pca
    pca_center_i <- pca_center

    subset <- metrics[[i]]
    if (is.null(subset)) {
      Xsub <- X
    } else if (is.list(subset)) {
      # e.g. "euclidean" = list(1:10, pca_center = FALSE),
      lsres <- lsplit_unnamed(subset)
      if (is.null(lsres$unnamed)) {
        stop("Error: no subset provided for block ", i)
      }
      if (length(lsres$unnamed) != 1) {
        stop("Error: only one unnamed item should be provided for block ", i)
      }
      subset <- lsres$unnamed[[1]]

      # possible overrides
      if (!is.null(lsres$named)) {
        lsnamed <- lsres$named
        lsnames <- names(lsnamed)
        if (!is.null(lsnamed$pca_center)) {
          pca_center_i <- lsnamed$pca_center
        }
        # PCA argument can be NULL, so need to check if it was explicitly provided
        if ("pca" %in% lsnames) {
          pca_i <- lsnamed$pca
        }
      }
      Xsub <- X[, subset, drop = FALSE]
    } else {
      Xsub <- X[, subset, drop = FALSE]
    }

    if (!is.null(X) && is.matrix(X)) {
      block_size <- ncol(Xsub)
      if (block_size == 0) {
        stop("Block ", i, " has zero size")
      }
      if (nblocks > 1) {
        tsmessage(
          "Processing block ", i, " of ", nblocks,
          " with size ", block_size,
          " using metric '", metric, "'"
        )
      }
    } else {
      # X is NULL or dist or something like that
      if (nblocks > 1) {
        tsmessage(
          "Processing block ", i, " of ", nblocks,
          " using metric '", metric, "'"
        )
      }
    }

    if (!is.null(pca_i) && is.matrix(X) && metric != "hamming" &&
      ncol(X) > pca_i && nrow(X) > pca_i) {
      tsmessage("Reducing column dimension to ", pca_i, " via PCA")
      pca_res <- pca_init(Xsub, pca_i,
        ret_extra = ret_model,
        center = pca_center_i,
        pca_method = pca_method,
        verbose = verbose
      )
      if (ret_model) {
        Xsub <- pca_res$scores
        pca_models[[as.character(i)]] <- pca_res[c("center", "rotation")]
        pca_res <- NULL
      } else {
        Xsub <- pca_res
      }
    }

    nn_sub <- nn_method
    # Extract this block of nn data from list of lists
    if (metric == "precomputed") {
      nn_sub <- nn_method[[i]]
      n_neighbors <- NULL
    }

    x2set_res <- x2set(
      Xsub,
      n_neighbors,
      metric,
      nn_method = nn_sub,
      n_trees,
      search_k,
      method,
      set_op_mix_ratio,
      local_connectivity,
      bandwidth,
      perplexity,
      kernel,
      ret_sigma,
      n_threads,
      grain_size,
      ret_model,
      n_vertices = n_vertices,
      nn_args = nn_args,
      tmpdir = tmpdir,
      sparse_is_distance = sparse_is_distance,
      verbose = verbose
    )
    Vblock <- x2set_res$V
    nn <- x2set_res$nn
    nns[[i]] <- nn
    names(nns)[[i]] <- metric
    if (is.null(V)) {
      V <- Vblock
    } else {
      # TODO: should at least offer the option to reset the local metric here
      # TODO: make , reset_local_metric = TRUE the default (breaking change)
      V <- set_intersect(V, Vblock, weight = 0.5, reset_connectivity = TRUE)
    }
    if (ret_sigma && is.null(sigma)) {
      # No idea how to combine different neighborhood sizes so just return the
      # first set
      sigma <- x2set_res$sigma
      rho <- x2set_res$rho
      dint <- x2set_res$dint
    }
  }

  if (!is.null(Xcat)) {
    V <- categorical_intersection_df(Xcat, V, weight = 0.5, verbose = verbose)
  }

  res <- list(V = V, nns = nns, pca_models = pca_models)
  if (!is.null(sigma)) {
    res$sigma <- sigma
    res$rho <- rho
    res$dint <- dint
  }
  res
}
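
# A hedged sketch of the per-block metric format handled by data2set above
# (the names and column choices here are illustrative, not package defaults):
#   metrics <- list(
#     euclidean = 1:4,                  # block 1: columns 1-4 of X
#     cosine = list(5:10,               # block 2: columns 5-10 of X, with
#                   pca = NULL,         # per-block overrides of the pca and
#                   pca_center = FALSE) # pca_center arguments
#   )
# Each block is converted to a fuzzy set via x2set() and successive blocks are
# combined with set_intersect() at weight = 0.5.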

x2nn <- function(X,
                 n_neighbors,
                 metric,
                 nn_method,
                 n_trees,
                 search_k,
                 tmpdir = tempdir(),
                 n_threads,
                 grain_size,
                 ret_model,
                 n_vertices = x2nv(X),
                 nn_args = list(),
                 sparse_is_distance = TRUE,
                 verbose = FALSE) {
  if (is.list(nn_method)) {
    validate_nn(nn_method, n_vertices)
    nn <- nn_method
  } else {
    nn_method <-
      match.arg(tolower(nn_method),
                c("annoy", "fnn", "matrix", "hnsw", "nndescent"))
    if (nn_method == "fnn" && metric != "euclidean") {
      stop(
        "nn_method = 'FNN' is only compatible with distance metric ",
        "'euclidean'"
      )
    }
    if (nn_method == "fnn" && ret_model) {
      stop("nn_method = 'FNN' is incompatible with ret_model = TRUE")
    }
    nn <- find_nn(
      X,
      n_neighbors,
      method = nn_method,
      metric = metric,
      n_trees = n_trees,
      search_k = search_k,
      nn_args = nn_args,
      tmpdir = tmpdir,
      n_threads = n_threads,
      grain_size = grain_size,
      ret_index = ret_model,
      sparse_is_distance = sparse_is_distance,
      verbose = verbose
    )
  }
  nn
}

validate_nn <- function(nn_method, n_vertices) {
  if (!is.matrix(nn_method$idx)) {
    stop("Couldn't find precalculated 'idx' matrix")
  }
  if (nrow(nn_method$idx) != n_vertices) {
    stop(
      "Precalculated 'idx' matrix must have ", n_vertices,
      " rows, but found ", nrow(nn_method$idx)
    )
  }

  if (!is.matrix(nn_method$dist)) {
    stop("Couldn't find precalculated 'dist' matrix")
  }
  if (nrow(nn_method$dist) != n_vertices) {
    stop(
      "Precalculated 'dist' matrix must have ", n_vertices,
      " rows, but found ", nrow(nn_method$dist)
    )
  }
  if (ncol(nn_method$dist) != ncol(nn_method$idx)) {
    stop(
      "Precalculated 'dist' matrix must have ", ncol(nn_method$idx),
      " cols, but found ", ncol(nn_method$dist)
    )
  }
}
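
# A minimal sketch of the precomputed neighbor format that validate_nn checks
# (the values are hypothetical; idx rows conventionally start with the point
# itself at distance zero):
#   nn <- list(
#     idx = matrix(c(1L, 2L, 2L, 1L), nrow = 2),  # row i: neighbor indices of i
#     dist = matrix(c(0, 0, 0.5, 0.5), nrow = 2)  # row i: distances to them
#   )
#   validate_nn(nn, n_vertices = 2) # completes silently; errors if malformed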

nn2set <- function(method, nn,
                   set_op_mix_ratio, local_connectivity, bandwidth,
                   perplexity, kernel,
                   ret_sigma,
                   n_threads, grain_size,
                   verbose = FALSE) {
  sigma <- NULL
  res <- list()
  if (method == "largevis") {
    n_vertices <- nrow(nn$dist)
    if (perplexity >= n_vertices) {
      stop("perplexity can be no larger than ", n_vertices - 1)
    }
    Vres <- perplexity_similarities(
      nn = nn, perplexity = perplexity,
      ret_sigma = ret_sigma,
      n_threads = n_threads,
      grain_size = grain_size,
      kernel = kernel,
      verbose = verbose
    )
    res$V <- Vres$matrix
    if (ret_sigma && !is.null(Vres$sigma)) {
      res$sigma <- Vres$sigma
      res$dint <- Vres$dint
    }
  } else {
    Vres <- fuzzy_simplicial_set(
      nn = nn,
      set_op_mix_ratio = set_op_mix_ratio,
      local_connectivity = local_connectivity,
      bandwidth = bandwidth,
      ret_sigma = ret_sigma,
      n_threads = n_threads,
      grain_size = grain_size,
      verbose = verbose
    )
    if (ret_sigma) {
      res$V <- Vres$matrix
      res$sigma <- Vres$sigma
      res$rho <- Vres$rho
    } else {
      res$V <- Vres
    }
  }
  res
}

x2set <- function(X,
                  n_neighbors,
                  metric,
                  nn_method,
                  n_trees,
                  search_k,
                  method,
                  set_op_mix_ratio,
                  local_connectivity,
                  bandwidth,
                  perplexity,
                  kernel,
                  ret_sigma,
                  n_threads,
                  grain_size,
                  ret_model,
                  n_vertices = x2nv(X),
                  tmpdir = tempdir(),
                  nn_args = list(),
                  sparse_is_distance = TRUE,
                  verbose = FALSE) {
  if (is_sparse_matrix(nn_method)) {
    nn <- nn_method
    if (nrow(nn) != ncol(nn)) {
      stop("Sparse distance matrix must have same number of rows and cols")
    }
    if (nrow(nn) != n_vertices) {
      stop("Sparse distance matrix must have same dimensions as input data")
    }
  } else {
    nn <- x2nn(
      X,
      n_neighbors = n_neighbors,
      metric = metric,
      nn_method = nn_method,
      n_trees = n_trees,
      search_k = search_k,
      tmpdir = tmpdir,
      n_threads = n_threads,
      grain_size = grain_size,
      ret_model = ret_model,
      nn_args = nn_args,
      n_vertices = n_vertices,
      sparse_is_distance = sparse_is_distance,
      verbose = verbose
    )
    if (any(is.infinite(nn$dist))) {
      stop("Infinite distances found in nearest neighbors")
    }
  }
  gc()

  nn2set_res <- nn2set(
    method,
    nn,
    set_op_mix_ratio,
    local_connectivity,
    bandwidth,
    perplexity,
    kernel,
    ret_sigma,
    n_threads,
    grain_size,
    verbose = verbose
  )
  V <- nn2set_res$V

  if (any(is.na(V))) {
    stop("Non-finite entries in the input matrix")
  }
  gc()

  res <- list(
    nn = nn,
    V = V
  )
  if (ret_sigma && !is.null(nn2set_res$sigma)) {
    res$sigma <- nn2set_res$sigma
    res$rho <- nn2set_res$rho
    res$dint <- nn2set_res$dint
  }
  res
}

set_intersect <- function(A, B, weight = 0.5, reset_connectivity = TRUE,
                          reset_local_metric = FALSE, n_threads = NULL,
                          verbose = FALSE) {
  A <- general_simplicial_set_intersection(
    A, B, weight
  )
  A <- Matrix::drop0(A)
  # https://github.com/lmcinnes/umap/issues/58#issuecomment-437633658
  # For now always reset
  if (reset_connectivity) {
    A <- reset_local_connectivity(A,
      reset_local_metric = reset_local_metric,
      n_threads = n_threads, verbose = verbose
    )
  }
  A
}

categorical_intersection_df <- function(X, V, weight = 0.5, verbose = FALSE) {
  tsmessage(
    "Carrying out categorical intersection for ",
    pluralize("column", ncol(X))
  )
  for (i in 1:ncol(X)) {
    V <- categorical_intersection(X[, i], V,
      weight = weight,
      verbose = (verbose && i == 1)
    )
  }
  V
}

categorical_intersection <- function(x, V, weight, verbose = FALSE) {
  if (is.null(V)) {
    stop("V cannot be null for categorical intersection")
  }
  if (weight < 1.0) {
    far_dist <- 2.5 * (1.0 / (1.0 - weight))
  } else {
    far_dist <- 1.0e12
  }
  tsmessage(
    "Applying categorical set intersection, weight = ", formatC(weight),
    " far distance = ", formatC(far_dist)
  )

  V <- categorical_simplicial_set_intersection(V, x,
    far_dist = far_dist,
    verbose = verbose
  )
  V
}
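
# A worked example of the far_dist mapping above: the default weight = 0.5
# gives far_dist = 2.5 * (1 / (1 - 0.5)) = 5, while weight = 1 effectively
# places points with mismatched categories at infinity (far_dist = 1e12).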


# Creates the number of epochs per sample for each weight
# weights are the non-zero input affinities (1-simplex)
# n_epochs is the total number of epochs
# There is an inverse relationship between the weights and the return vector.
make_epochs_per_sample <- function(weights, n_epochs) {
  result <- rep(-1, length(weights))
  n_samples <- n_epochs * (weights / max(weights))
  result[n_samples > 0] <- n_epochs / n_samples[n_samples > 0]
  result
}
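
# A worked example of the inverse relationship: with n_epochs = 100, the
# strongest edge is sampled every epoch, an edge with half its weight every
# 2 epochs, and so on:
#   make_epochs_per_sample(c(1, 0.5, 0.1), 100)
#   #> [1]  1  2 10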

# Create the a/b parameters from spread and min_dist
find_ab_params <- function(spread = 1, min_dist = 0.001) {
  xv <- seq(from = 0, to = spread * 3, length.out = 300)
  yv <- rep(0, length(xv))
  yv[xv < min_dist] <- 1
  yv[xv >= min_dist] <- exp(-(xv[xv >= min_dist] - min_dist) / spread)
  result <- try(
    {
      stats::nls(yv ~ 1 / (1 + a * xv^(2 * b)),
        start = list(a = 1, b = 1)
      )$m$getPars()
    },
    silent = TRUE
  )
  if (inherits(result, "try-error")) {
    stop(
      "Can't find a, b for provided spread = ", spread,
      " min_dist = ", min_dist
    )
  }
  result
}
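
# For example, the defaults used by the Python UMAP implementation
# (spread = 1, min_dist = 0.1) give approximately a = 1.577, b = 0.895,
# assuming the nls fit converges:
#   find_ab_params(spread = 1, min_dist = 0.1)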

# The default number of edge samples used by LargeVis: 1000 million samples
# for fewer than 10,000 vertices, interpolating linearly up to 10,000 million
# samples at 1 million vertices, and 10,000 samples per vertex beyond that
lvish_samples <- function(n_vertices) {
  if (n_vertices < 10000) {
    n_samples <- 1000
  } else if (n_vertices < 1000000) {
    n_samples <- (n_vertices - 10000) * 9000 / (1000000 - 10000) + 1000
  } else {
    n_samples <- n_vertices / 100
  }

  round(n_samples * 1000000)
}
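
# Worked example (illustrative): for n_vertices = 70000,
# (70000 - 10000) * 9000 / 990000 + 1000 is about 1545 (millions), so:
#   lvish_samples(70000)
#   # -> 1545454545 (about 1.5 billion edge samples)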

# Returns the number of epochs required to generate the default number of edge samples
# used in LargeVis
lvish_epochs <- function(n_vertices, V) {
  n_samples <- lvish_samples(n_vertices)
  round(n_samples * max(V) / sum(V))
}
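
# A sketch of the reasoning (illustrative): make_epochs_per_sample allocates
# n_epochs * w / max(w) samples to an edge of weight w, so the total number of
# samples over all edges is n_epochs * sum(V) / max(V). Setting this equal to
# the LargeVis sample budget and solving for n_epochs gives the expression
# above.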

# Scale X according to various strategies
scale_input <- function(X, scale_type, ret_model = FALSE, verbose = FALSE) {
  if (is.null(scale_type)) {
    scale_type <- "none"
  } else if (is.logical(scale_type)) {
    scale_type <- ifelse(scale_type, "scale", "none")
  } else if (tolower(scale_type) == "z") {
    scale_type <- "scale"
  }

  scale_type <- match.arg(
    tolower(scale_type),
    c("none", "scale", "range", "colrange", "maxabs")
  )
  switch(scale_type,
    range = {
      tsmessage("Range scaling X")
      min_X <- min(X)
      X <- X - min_X

      max_X <- max(X)
      X <- X / max_X

      if (ret_model) {
        attr(X, "scaled:range:min") <- min_X
        attr(X, "scaled:range:max") <- max_X
      }
    },
    colrange = {
      tsmessage("Column range scaling X")
      min_X <- apply(X, 2, min)
      X <- sweep(X, 2, min_X)

      max_X <- apply(X, 2, max)
      X <- sweep(X, 2, max_X, `/`)

      if (ret_model) {
        attr(X, "scaled:colrange:min") <- min_X
        attr(X, "scaled:colrange:max") <- max_X
      }
    },
    maxabs = {
      tsmessage("Normalizing by max-abs")
      X <- base::scale(X, scale = FALSE)
      max_abs <- max(abs(X))
      X <- X / max_abs

      if (ret_model) {
        attr(X, "scaled:maxabs") <- max_abs
      }
    },
    scale = {
      tsmessage("Scaling to zero mean and unit variance")

      # total sum of squared deviations: zero only for a zero-variance column
      varf <- function(x) {
        sum((x - sum(x) / length(x))^2)
      }
      non_zero_var_cols <- apply(X, 2, varf) >= .Machine$double.xmin

      if (!any(non_zero_var_cols)) {
        stop("Matrix has zero variance")
      }
      X <- X[, non_zero_var_cols, drop = FALSE]
      tsmessage("Kept ", ncol(X), " non-zero-variance columns")
      X <- base::scale(X, scale = TRUE)

      if (ret_model) {
        attr(X, "scaled:nzvcols") <- which(non_zero_var_cols)
      }
    }
  )
  X
}
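
# Example (illustrative): range-scale a matrix to [0, 1] and keep the scaling
# info so it can be applied to new data later:
#   X <- matrix(c(1, 2, 3, 4), nrow = 2)
#   Xs <- scale_input(X, "range", ret_model = TRUE)
#   attr(Xs, "scaled:range:min") # -> 1
#   attr(Xs, "scaled:range:max") # -> 3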

attr_to_scale_info <- function(X) {
  Xattr <- attributes(X)
  Xattr <- Xattr[startsWith(names(Xattr), "scaled:")]
  if (length(Xattr) == 0) {
    Xattr <- NULL
  }
  Xattr
}

get_opt_args <- function(opt_args, alpha) {
  default_batch_opt <- "adam"
  default_opt_args <- list(
    sgd = list(alpha = alpha),
    adam = list(alpha = alpha, beta1 = 0.5, beta2 = 0.9, eps = 1e-7)
  )
  if (is.null(opt_args)) {
    opt_args <- list()
  }
  if (is.null(opt_args$method)) {
    opt_args$method <- default_batch_opt
  }
  if (!(opt_args$method %in% names(default_opt_args))) {
    stop("Unknown optimization method '", opt_args$method, "'")
  }
  lmerge(default_opt_args[[opt_args$method]], opt_args)
}
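
# Example (illustrative, assuming lmerge gives the user's values precedence
# over the defaults):
#   get_opt_args(NULL, alpha = 1)
#   # -> list(alpha = 1, beta1 = 0.5, beta2 = 0.9, eps = 1e-7, method = "adam")
#   get_opt_args(list(method = "sgd"), alpha = 0.5)
#   # -> list(alpha = 0.5, method = "sgd")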

# Takes local radii from the input dimensions and converts them to approximate
# densities in the output space by mapping them to a vector of per-point 'a'
# parameters as used in the UMAP output weight: 1 / (1 + a * d^(2 * b)).
# Based on testing, a rough range of usable a values is 0.01-100. To get that,
# we want each edge's a value to combine the local densities of i and j, so
# a = sqrt(a_i * a_j) (their geometric mean).
# Also, we want dens_scale to control the spread of a values, and for
# dens_scale = 0 the vector of a_i to give back the user-selected scalar value
# of a, so we scale the log of the reciprocal of localr to be within
# [log(a * 10^(-2 * dens_scale)), log(a * 10^(2 * dens_scale))]. We take the
# sqrt of the a_i in this function to avoid repeatedly computing it inside the
# optimization loop.
scale_radii <- function(localr, dens_scale, a) {
  log_denso <- -log(localr)
  min_densl <- a * (10^(-2 * dens_scale))
  log_min_densl <- log(min_densl)
  max_densl <- a * (10^(2 * dens_scale))
  log_max_densl <- log(max_densl)
  log_denso_scale <- range_scale(log_denso, log_min_densl, log_max_densl)
  sqrt(exp(log_denso_scale))
}
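
# Worked example (illustrative, assuming range_scale linearly rescales its
# input to the given bounds): with a = 1 and dens_scale = 1 the a_i span
# [0.01, 100] on the log scale, so the returned square roots span [0.1, 10]:
#   scale_radii(localr = c(0.5, 1, 2), dens_scale = 1, a = 1)
#   # -> c(10, 1, 0.1): the smallest radius gets the largest 'a'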

#' @useDynLib uwot, .registration=TRUE
#' @importFrom Rcpp sourceCpp
.onUnload <- function(libpath) {
  library.dynam.unload("uwot", libpath)
}

# Remove scaling attributes from a matrix.
# If the `scale` parameter is set, then these attributes are assumed to have
# been applied by uwot's internals, and the equivalent scaling will be applied
# to new data in umap_transform. However, these attributes could also have been
# applied by manually scaling the data before running any uwot code, in which
# case we should not save them as part of the model. This function is therefore
# called before applying any other scaling.
remove_scaling_attrs <- function(X) {
  uwot_attrs <- c(
    "scaled:range:min",
    "scaled:range:max",
    "scaled:colrange:min",
    "scaled:colrange:max",
    "scaled:maxabs",
    "scaled:nzvcols",
    "scaled:center",
    "scaled:scale"
  )
  attrs <- names(attributes(X))
  for (attr in attrs) {
    if (attr %in% uwot_attrs) {
      attributes(X)[[attr]] <- NULL
    }
  }
  X
}
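
# Example (illustrative): base::scale attaches "scaled:center" and
# "scaled:scale" attributes, which would otherwise be mistaken for uwot's own
# scaling when a model is saved:
#   X <- base::scale(matrix(rnorm(20), nrow = 5))
#   names(attributes(remove_scaling_attrs(X)))
#   # -> "dim" (the "scaled:" attributes are gone)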