# R/uwot.R In jlmelville/uwot: The Uniform Manifold Approximation and Projection (UMAP) Method for Dimensionality Reduction

#' Dimensionality Reduction with UMAP
#'
#' Carry out dimensionality reduction of a dataset using the Uniform Manifold
#' Approximation and Projection (UMAP) method (McInnes & Healy, 2018). Some of
#' the following help text is lifted verbatim from the Python reference
#' implementation at \url{https://github.com/lmcinnes/umap}.
#'
#'   Matrix and data frames should contain one observation per row. Data frames
#'   will have any non-numeric columns removed, although factor columns will be
#'   used if explicitly included via \code{metric} (see the help for
#'   \code{metric} for details). A sparse matrix is interpreted as a distance
#'   matrix, and is assumed to be symmetric, so you can also pass in an
#'   explicitly upper or lower triangular sparse matrix to save storage. There
#'   must be at least \code{n_neighbors} non-zero distances for each row. Both
#'   implicit and explicit zero entries are ignored. Set zero distances you want
#'   to keep to an arbitrarily small non-zero value (e.g. \code{1e-10}).
#'   \code{X} can also be \code{NULL} if pre-computed nearest neighbor data is
#'   passed to \code{nn_method}, and \code{init} is not \code{"spca"} or
#'   \code{"pca"}.
#' @param n_neighbors The size of local neighborhood (in terms of number of
#'   neighboring sample points) used for manifold approximation. Larger values
#'   result in more global views of the manifold, while smaller values result in
#'   more local data being preserved. In general values should be in the range
#'   \code{2} to \code{100}.
#' @param n_components The dimension of the space to embed into. This defaults
#'   to \code{2} to provide easy visualization, but can reasonably be set to any
#'   integer value in the range \code{2} to \code{100}.
#' @param metric Type of distance metric to use to find nearest neighbors. One
#'   of:
#' \itemize{
#'   \item \code{"euclidean"} (the default)
#'   \item \code{"cosine"}
#'   \item \code{"manhattan"}
#'   \item \code{"hamming"}
#'   \item \code{"correlation"} (a distance based on the Pearson correlation)
#'   \item \code{"categorical"} (see below)
#' }
#' Only applies if \code{nn_method = "annoy"} (for \code{nn_method = "fnn"}, the
#' distance metric is always "euclidean").
#'
#' If \code{X} is a data frame or matrix, then multiple metrics can be
#' specified, by passing a list to this argument, where the name of each item in
#' the list is one of the metric names above. The value of each list item should
#' be a vector giving the names or integer ids of the columns to be included in
#' a calculation, e.g. \code{metric = list(euclidean = 1:4, manhattan = 5:10)}.
#'
#' Each metric calculation results in a separate fuzzy simplicial set, which are
#' intersected together to produce the final set. Metric names can be repeated.
#' Because non-numeric columns are removed from the data frame, it is safer to
#' use column names than integer ids.
#'
#' Factor columns can also be used by specifying the metric name
#' \code{"categorical"}. Factor columns are treated different from numeric
#' columns and although multiple factor columns can be specified in a vector,
#' each factor column specified is processed individually. If you specify
#' a non-factor column, it will be coerced to a factor.
#'
#' For a given data block, you may override the \code{pca} and \code{pca_center}
#' arguments for that block, by providing a list with one unnamed item
#' containing the column names or ids, and then any of the \code{pca} or
#' \code{pca_center} overrides as named items, e.g. \code{metric =
#' list(euclidean = 1:4, manhattan = list(5:10, pca_center = FALSE))}. This
#' exists to allow mixed binary and real-valued data to be included and to have
#' PCA applied to both, but with centering applied only to the real-valued data
#' (it is typical not to apply centering to binary data before PCA is applied).
#' @param n_epochs Number of epochs to use during the optimization of the
#'   embedded coordinates. By default, this value is set to \code{500} for
#'   datasets containing 10,000 vertices or less, and \code{200} otherwise.
#'   If \code{n_epochs = 0}, then coordinates determined by \code{"init"} will
#'   be returned.
#' @param scale Scaling to apply to \code{X} if it is a data frame or matrix:
#' \itemize{
#'   \item{\code{"none"} or \code{FALSE} or \code{NULL}} No scaling.
#'   \item{\code{"Z"} or \code{"scale"} or \code{TRUE}} Scale each column to
#'   zero mean and variance 1.
#'   \item{\code{"maxabs"}} Center each column to mean 0, then divide each
#'   element by the maximum absolute value over the entire matrix.
#'   \item{\code{"range"}} Range scale the entire matrix, so the smallest
#'   element is 0 and the largest is 1.
#'   \item{\code{"colrange"}} Scale each column in the range (0,1).
#' }
#' For UMAP, the default is \code{"none"}.
#' @param learning_rate Initial learning rate used in optimization of the
#'   coordinates.
#' @param init Type of initialization for the coordinates. Options are:
#'   \itemize{
#'     \item \code{"spectral"} Spectral embedding using the normalized Laplacian
#'     of the fuzzy 1-skeleton, with Gaussian noise added.
#'     \item \code{"normlaplacian"}. Spectral embedding using the normalized
#'     Laplacian of the fuzzy 1-skeleton, without noise.
#'     \item \code{"random"}. Coordinates assigned using a uniform random
#'     distribution between -10 and 10.
#'     \item \code{"lvrandom"}. Coordinates assigned using a Gaussian
#'     distribution with standard deviation 1e-4, as used in LargeVis
#'     (Tang et al., 2016) and t-SNE.
#'     \item \code{"laplacian"}. Spectral embedding using the Laplacian Eigenmap
#'     (Belkin and Niyogi, 2002).
#'     \item \code{"pca"}. The first two principal components from PCA of
#'     \code{X} if \code{X} is a data frame, and from a 2-dimensional classical
#'     MDS if \code{X} is of class \code{"dist"}.
#'     \item \code{"spca"}. Like \code{"pca"}, but each dimension is then scaled
#'     so the standard deviation is 1e-4, to give a distribution similar to that
#'     used in t-SNE. This is an alias for \code{init = "pca", init_sdev =
#'     1e-4}.
#'     \item \code{"agspectral"} An "approximate global" modification of
#'     \code{"spectral"} which all edges in the graph to a value of 1, and then
#'     sets a random number of edges (\code{negative_sample_rate} edges per
#'     vertex) to 0.1, to approximate the effect of non-local affinities.
#'     \item A matrix of initial coordinates.
#'   }
#'  For spectral initializations, (\code{"spectral"}, \code{"normlaplacian"},
#'  \code{"laplacian"}), if more than one connected component is identified,
#'  each connected component is initialized separately and the results are
#'  merged. If \code{verbose = TRUE} the number of connected components are
#'  logged to the console. The existence of multiple connected components
#'  implies that a global view of the data cannot be attained with this
#'  initialization. Either a PCA-based initialization or increasing the value of
#'  \code{n_neighbors} may be more appropriate.
#' @param init_sdev If non-\code{NULL}, scales each dimension of the initialized
#'   coordinates (including any user-supplied matrix) to this standard
#'   deviation. By default no scaling is carried out, except when \code{init =
#'   "spca"}, in which case the value is \code{0.0001}. Scaling the input may
#'   help if the unscaled versions result in initial coordinates with large
#'   inter-point distances or outliers. This usually results in small gradients
#'   during optimization and very little progress being made to the layout.
#'   Shrinking the initial embedding by rescaling can help under these
#'   circumstances. Scaling the result of \code{init = "pca"} is usually
#'   recommended and \code{init = "spca"} as an alias for \code{init = "pca",
#'   init_sdev = 1e-4} but for the spectral initializations the scaled versions
#'   usually aren't necessary unless you are using a large value of
#'   \code{n_neighbors} (e.g. \code{n_neighbors = 150} or higher).
#' @param spread The effective scale of embedded points. In combination with
#'   \code{min_dist}, this determines how clustered/clumped the embedded points
#'   are.
#' @param min_dist The effective minimum distance between embedded points.
#'   Smaller values will result in a more clustered/clumped embedding where
#'   nearby points on the manifold are drawn closer together, while larger
#'   values will result on a more even dispersal of points. The value should be
#'   set relative to the \code{spread} value, which determines the scale at
#'   which embedded points will be spread out.
#' @param set_op_mix_ratio Interpolate between (fuzzy) union and intersection as
#'   the set operation used to combine local fuzzy simplicial sets to obtain a
#'   global fuzzy simplicial sets. Both fuzzy set operations use the product
#'   t-norm. The value of this parameter should be between \code{0.0} and
#'   \code{1.0}; a value of \code{1.0} will use a pure fuzzy union, while
#'   \code{0.0} will use a pure fuzzy intersection.
#' @param local_connectivity The local connectivity required -- i.e. the number
#'   of nearest neighbors that should be assumed to be connected at a local
#'   level. The higher this value the more connected the manifold becomes
#'   locally. In practice this should be not more than the local intrinsic
#'   dimension of the manifold.
#' @param bandwidth The effective bandwidth of the kernel if we view the
#'   algorithm as similar to Laplacian Eigenmaps. Larger values induce more
#'   connectivity and a more global view of the data, smaller values concentrate
#'   more locally.
#' @param repulsion_strength Weighting applied to negative samples in low
#'   dimensional embedding optimization. Values higher than one will result in
#'   greater weight being given to negative samples.
#' @param negative_sample_rate The number of negative edge/1-simplex samples to
#'   use per positive edge/1-simplex sample in optimizing the low dimensional
#'   embedding.
#' @param a More specific parameters controlling the embedding. If \code{NULL}
#'   these values are set automatically as determined by \code{min_dist} and
#' @param b More specific parameters controlling the embedding. If \code{NULL}
#'   these values are set automatically as determined by \code{min_dist} and
#' @param nn_method Method for finding nearest neighbors. Options are:
#'   \itemize{
#'     \item \code{"fnn"}. Use exact nearest neighbors via the
#'       \href{https://cran.r-project.org/package=FNN}{FNN} package.
#'     \item \code{"annoy"} Use approximate nearest neighbors via the
#'       \href{https://cran.r-project.org/package=RcppAnnoy}{RcppAnnoy} package.
#'    }
#'   By default, if \code{X} has less than 4,096 vertices, the exact nearest
#'   neighbors are found. Otherwise, approximate nearest neighbors are used.
#'   You may also pass pre-calculated nearest neighbor data to this argument. It
#'   must be a list consisting of two elements:
#'   \itemize{
#'     \item \code{"idx"}. A \code{n_vertices x n_neighbors} matrix
#'     containing the integer indexes of the nearest neighbors in \code{X}. Each
#'     vertex is considered to be its own nearest neighbor, i.e.
#'     \code{idx[, 1] == 1:n_vertices}.
#'     \item \code{"dist"}. A \code{n_vertices x n_neighbors} matrix
#'     containing the distances of the nearest neighbors.
#'   }
#'   Multiple nearest neighbor data (e.g. from two different precomputed
#'   metrics) can be passed by passing a list containing the nearest neighbor
#'   data lists as items.
#'   The \code{n_neighbors} parameter is ignored when using precomputed
#'   nearest neighbor data.
#' @param n_trees Number of trees to build when constructing the nearest
#'   neighbor index. The more trees specified, the larger the index, but the
#'   better the results. With \code{search_k}, determines the accuracy of the
#'   Annoy nearest neighbor search. Only used if the \code{nn_method} is
#'   \code{"annoy"}. Sensible values are between \code{10} to \code{100}.
#' @param search_k Number of nodes to search during the neighbor retrieval. The
#'   larger k, the more the accurate results, but the longer the search takes.
#'   With \code{n_trees}, determines the accuracy of the Annoy nearest neighbor
#'   search. Only used if the \code{nn_method} is \code{"annoy"}.
#' @param approx_pow If \code{TRUE}, use an approximation to the power function
#'   in the UMAP gradient, from
#'   \url{https://martin.ankerl.com/2012/01/25/optimized-approximative-pow-in-c-and-cpp/}.
#' @param y Optional target data for supervised dimension reduction. Can be a
#' vector, matrix or data frame. Use the \code{target_metric} parameter to
#' specify the metrics to use, using the same syntax as \code{metric}. Usually
#' either a single numeric or factor column is used, but more complex formats
#' are possible. The following types are allowed:
#'   \itemize{
#'     \item Factor columns with the same length as \code{X}. \code{NA} is
#'     allowed for any observation with an unknown level, in which case
#'     UMAP operates as a form of semi-supervised learning. Each column is
#'     treated separately.
#'     \item Numeric data. \code{NA} is \emph{not} allowed in this case. Use the
#'     parameter \code{target_n_neighbors} to set the number of neighbors used
#'     with \code{y}. If unset, \code{n_neighbors} is used. Unlike factors,
#'     numeric columns are grouped into one block unless \code{target_metric}
#'     specifies otherwise. For example, if you wish columns \code{a} and
#'     \code{b} to be treated separately, specify
#'     \code{target_metric = list(euclidean = "a", euclidean = "b")}. Otherwise,
#'     the data will be effectively treated as a matrix with two columns.
#'     \item Nearest neighbor data, consisting of a list of two matrices,
#'     \code{idx} and \code{dist}. These represent the precalculated nearest
#'     neighbor indices and distances, respectively. This
#'     is the same format as that expected for precalculated data in
#'     \code{nn_method}. This format assumes that the underlying data was a
#'     numeric vector. Any user-supplied value of the \code{target_n_neighbors}
#'     parameter is ignored in this case, because the the number of columns in
#'     the matrices is used for the value. Multiple nearest neighbor data using
#'     different metrics can be supplied by passing a list of these lists.
#'   }
#' Unlike \code{X}, all factor columns included in \code{y} are automatically
#' used.
#' @param target_n_neighbors Number of nearest neighbors to use to construct the
#'   target simplicial set. Default value is \code{n_neighbors}. Applies only if
#'   \code{y} is non-\code{NULL} and \code{numeric}.
#' @param target_metric The metric used to measure distance for \code{y} if
#'   using supervised dimension reduction. Used only if \code{y} is numeric.
#' @param target_weight Weighting factor between data topology and target
#'   topology. A value of 0.0 weights entirely on data, a value of 1.0 weights
#'   entirely on target. The default of 0.5 balances the weighting equally
#'   between data and target. Only applies if \code{y} is non-\code{NULL}.
#' @param pca If set to a positive integer value, reduce data to this number of
#'   columns using PCA. Doesn't applied if the distance \code{metric} is
#'   \code{"hamming"}, or the dimensions of the data is larger than the
#'   number specified (i.e. number of rows and columns must be larger than the
#'   value of this parameter). If you have > 100 columns in a data frame or
#'   matrix, reducing the number of columns in this way may substantially
#'   increase the performance of the nearest neighbor search at the cost of a
#'   potential decrease in accuracy. In many t-SNE applications, a value of 50
#'   is recommended, although there's no guarantee that this is appropriate for
#'   all settings.
#' @param pca_center If \code{TRUE}, center the columns of \code{X} before
#'   carrying out PCA. For binary data, it's recommended to set this to
#'   \code{FALSE}.
#' @param pca_method Method to carry out any PCA dimensionality reduction when
#'   the \code{pca} parameter is specified. Allowed values are:
#'   \itemize{
#'     \item{\code{"irlba"}}. Uses \code{\link[irlba]{prcomp_irlba}} from the
#'     \href{https://cran.r-project.org/package=irlba}{irlba} package.
#'     \item{\code{"rsvd"}}. Uses 5 iterations of \code{\link[irlba]{svdr}} from
#'     the \href{https://cran.r-project.org/package=irlba}{irlba} package.
#'     This is likely to give much faster but potentially less accurate results
#'     than using \code{"irlba"}. For the purposes of nearest neighbor
#'     calculation and coordinates initialization, any loss of accuracy doesn't
#'     seem to matter much.
#'     from the \href{https://cran.r-project.org/package=bigstatsr}{bigstatsr}
#'     package. The SVD methods used in \code{bigstatsr} may be faster on
#'     Windows). \strong{Note}: \code{bigstatsr} is \emph{not} a dependency of
#'     uwot: if you choose to use this package for PCA, you \emph{must} install
#'     it yourself.
#'     \item{\code{"svd"}}. Uses \code{\link[base]{svd}} for the SVD. This is
#'     likely to be slow for all but the smallest datasets.
#'     \item{\code{"auto"}} (the default). Uses \code{"irlba"}, unless more than
#'     50% of the full set of singular vectors would be calculated, in which
#'     case \code{"svd"} is used.
#'   }
#' @param pcg_rand If \code{TRUE}, use the PCG random number generator (O'Neill,
#'   2014) during optimization. Otherwise, use the faster (but probably less
#'   statistically good) Tausworthe "taus88" generator. The default is
#'   \code{TRUE}.
#' @param fast_sgd If \code{TRUE}, then the following combination of parameters
#'   is set: \code{pcg_rand = TRUE}, \code{n_sgd_threads = "auto"} and
#'   \code{approx_pow = TRUE}. The default is \code{FALSE}. Setting this to
#'   \code{TRUE} will speed up the stochastic optimization phase, but give a
#'   potentially less accurate embedding, and which will not be exactly
#'   reproducible even with a fixed seed. For visualization, \code{fast_sgd =
#'   TRUE} will give perfectly good results. For more generic dimensionality
#'   reduction, it's safer to leave \code{fast_sgd = FALSE}. If \code{fast_sgd =
#'   TRUE}, then user-supplied values of \code{pcg_rand}, \code{n_sgd_threads},
#'   and \code{approx_pow} are ignored.
#' @param batch If \code{TRUE}, then embedding coordinates are updated at the
#'   end of each epoch rather than during the epoch. In batch mode, results are
#'   reproducible with a fixed random seed even with \code{n_sgd_threads > 1},
#'   at the cost of a slightly higher memory use. You may also have to modify
#'   \code{learning_rate} and increase \code{n_epochs}, so whether this provides
#'   a speed increase over the single-threaded optimization is likely to be
#'   dataset and hardware-dependent.
#' @param ret_model If \code{TRUE}, then return extra data that can be used to
#'   embedded coordinates are returned as the list item \code{embedding}. If
#'   \code{FALSE}, just return the coordinates. This parameter can be used in
#'   conjunction with \code{ret_nn} and \code{ret_extra}. Note that some
#'   settings are incompatible with the production of a UMAP model: external
#'   neighbor data (passed via a list to \code{nn_method}), and factor columns
#'   that were included via the \code{metric} parameter. In the latter case, the
#'   model produced is based only on the numeric data. A transformation using
#'   new data is possible, but the factor columns in the new data are ignored.
#' @param ret_nn If \code{TRUE}, then in addition to the embedding, also return
#'   nearest neighbor data that can be used as input to \code{nn_method} to
#'   avoid the overhead of repeatedly calculating the nearest neighbors when
#'   manipulating unrelated parameters (e.g. \code{min_dist}, \code{n_epochs},
#'   \code{init}). See the "Value" section for the names of the list items. If
#'   \code{FALSE}, just return the coordinates. Note that the nearest neighbors
#'   could be sensitive to data scaling, so be wary of reusing nearest neighbor
#'   data if modifying the \code{scale} parameter. This parameter can be used in
#'   conjunction with \code{ret_model} and \code{ret_extra}.
#' @param ret_extra A vector indicating what extra data to return. May contain
#'   any combination of the following strings:
#'   \itemize{
#'     \item \code{"model"} Same as setting \code{ret_model = TRUE}.
#'     \item \code{"nn"} Same as setting \code{ret_nn = TRUE}.
#'     \item \code{"fgraph"} the high dimensional fuzzy graph (i.e. the fuzzy
#'       simplicial set of the merged local views of the input data). The graph
#'       is returned as a sparse symmetric N x N matrix of class
#'       \link[Matrix]{dgCMatrix-class}, where a non-zero entry (i, j) gives the
#'       membership strength of the edge connecting vertex i and vertex j. This
#'       can be considered analogous to the input probability (or similarity or
#'       affinity) used in t-SNE and LargeVis. Note that the graph is further
#'       sparsified by removing edges with sufficiently low membership strength
#'       that they would not be sampled by the probabilistic edge sampling
#'       employed for optimization and therefore the number of non-zero elements
#'       in the matrix is dependent on \code{n_epochs}. If you are only
#'       interested in the fuzzy input graph (e.g. for clustering), setting
#'       \code{n_epochs = 0} will avoid any further sparsifying.
#'   }
#'   descent). Default is half the number of concurrent threads supported by the
#'   system. For nearest neighbor search, only applies if
#'   \code{nn_method = "annoy"}. If \code{n_threads > 1}, then the Annoy index
#'   will be temporarily written to disk in the location determined by
#'   descent. If set to > 1, then be aware that if \code{batch = FALSE}, results
#'   will \emph{not} be reproducible, even if \code{set.seed} is called with a
#'   fixed seed before running. Set to \code{"auto"} to use the same value as
#' @param grain_size The minimum amount of work to do on each thread. If this
#'   value is set high enough, then less than \code{n_threads} or
#'   \code{n_sgd_threads} will be used for processing, which might give a
#'   switching was outweighing the improvement due to concurrent processing.
#'   This should be left at default (\code{1}) and work will be spread evenly
#'   over all the threads specified.
#' @param tmpdir Temporary directory to store nearest neighbor indexes during
#'   nearest neighbor search. Default is \code{\link{tempdir}}. The index is
#'   only written to disk if \code{n_threads > 1} and
#'   \code{nn_method = "annoy"}; otherwise, this parameter is ignored.
#' @param verbose If \code{TRUE}, log details to the console.
#' @param opt_args A list of optimizer parameters, used when
#'   \code{batch = TRUE}. The default optimization method used is Adam (Kingma
#'   and Ba, 2014).
#'   \itemize{
#'     \item \code{method} The optimization method to use. Either \code{"adam"}
#'     \item \code{beta1} (Adam only). The weighting parameter for the
#'     exponential moving average of the first moment estimator. Effectively the
#'     momentum parameter. Should be a floating point value between 0 and 1.
#'     Higher values can smooth oscillatory updates in poorly-conditioned
#'     situations and may allow for a larger \code{learning_rate} to be
#'     specified, but too high can cause divergence. Default: \code{0.5}.
#'     \item \code{beta2} (Adam only). The weighting parameter for the
#'     exponential moving average of the uncentered second moment estimator.
#'     Should be a floating point value between 0 and 1. Controls the degree of
#'     adaptivity in the step-size. Higher values put more weight on previous
#'     time steps. Default: \code{0.9}.
#'     \item \code{eps} (Adam only). Intended to be a small value to prevent
#'     division by zero, but in practice can also affect convergence due to its
#'     interaction with \code{beta2}. Higher values reduce the effect of the
#'     descent with momentum. Typical values are between 1e-8 and 1e-3. Default:
#'     \code{1e-7}.
#'     \item \code{alpha} The initial learning rate. Default: the value of the
#'     \code{learning_rate} parameter.
#'   }
#' @param epoch_callback A function which will be invoked at the end of every
#'   epoch. Its signature should be: \code{(epoch, n_epochs, coords)}, where:
#'   \itemize{
#'     \item \code{epoch} The current epoch number (between \code{1} and
#'     \code{n_epochs}).
#'     \item \code{n_epochs} Number of epochs to use during the optimization of
#'     the embedded coordinates.
#'     \item \code{coords} The embedded coordinates as of the end of the current
#'     epoch, as a matrix with dimensions (N, \code{n_components}).
#'   }
#' @return A matrix of optimized coordinates, or:
#'   \itemize{
#'     \item if \code{ret_model = TRUE} (or \code{ret_extra} contains
#'     \code{"model"}), returns a list containing extra information that can be
#'     used to add new data to an existing embedding via
#'     \code{\link{umap_transform}}. In this case, the coordinates are available
#'     in the list item \code{embedding}. \bold{NOTE}: The contents of
#'     the \code{model} list should \emph{not} be considered stable or part of
#'     the public API, and are purposely left undocumented.
#'     \item if \code{ret_nn = TRUE} (or \code{ret_extra} contains \code{"nn"}),
#'     returns the nearest neighbor data as a list called \code{nn}. This
#'     contains one list for each \code{metric} calculated, itself containing a
#'     matrix \code{idx} with the integer ids of the neighbors; and a matrix
#'     \code{dist} with the distances. The \code{nn} list (or a sub-list) can be
#'     used as input to the \code{nn_method} parameter.
#'     \item if \code{ret_extra} contains \code{"fgraph"} returns the high
#'     dimensional fuzzy graph as a sparse matrix called \code{fgraph}, of type
#'   }
#'   The returned list contains the combined data from any combination of
#'   specifying \code{ret_model}, \code{ret_nn} and \code{ret_extra}.
#' @examples
#'
#' iris30 <- iris[c(1:10, 51:60, 101:110), ]
#'
#' # Non-numeric columns are automatically removed so you can pass data frames
#' # directly in a lot of cases without pre-processing
#' iris_umap <- umap(iris30, n_neighbors = 5, learning_rate = 0.5, init = "random", n_epochs = 20)
#'
#' # Faster approximation to the gradient and return nearest neighbors
#' iris_umap <- umap(iris30, n_neighbors = 5, approx_pow = TRUE, ret_nn = TRUE, n_epochs = 20)
#'
#' # Can specify min_dist and spread parameters to control separation and size
#' # of clusters and reuse nearest neighbors for efficiency
#' nn <- iris_umap$nn #' iris_umap <- umap(iris30, n_neighbors = 5, min_dist = 1, spread = 5, nn_method = nn, n_epochs = 20) #' #' # Supervised dimension reduction using the 'Species' factor column #' iris_sumap <- umap(iris30, n_neighbors = 5, min_dist = 0.001, y = iris30$Species,
#'                    target_weight = 0.5, n_epochs = 20)
#'
#' # Calculate Petal and Sepal neighbors separately (uses intersection of the resulting sets):
#' iris_umap <- umap(iris30, metric = list(
#'   "euclidean" = c("Sepal.Length", "Sepal.Width"),
#'   "euclidean" = c("Petal.Length", "Petal.Width")
#' ))
#'
#' @references
#' Belkin, M., & Niyogi, P. (2002).
#' Laplacian eigenmaps and spectral techniques for embedding and clustering.
#' In \emph{Advances in neural information processing systems}
#' (pp. 585-591).
#' \url{http://papers.nips.cc/paper/1961-laplacian-eigenmaps-and-spectral-techniques-for-embedding-and-clustering.pdf}
#'
#' Kingma, D. P., & Ba, J. (2014).
#' Adam: A method for stochastic optimization.
#' \emph{arXiv preprint} \emph{arXiv}:1412.6980.
#' \url{https://arxiv.org/abs/1412.6980}
#'
#' McInnes, L., & Healy, J. (2018).
#' UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction
#' \emph{arXiv preprint} \emph{arXiv}:1802.03426.
#' \url{https://arxiv.org/abs/1802.03426}
#'
#' O’Neill, M. E. (2014).
#' \emph{PCG: A family of simple fast space-efficient statistically good
#' algorithms for random number generation}
#' (Report No. HMC-CS-2014-0905). Harvey Mudd College.
#'
#' Tang, J., Liu, J., Zhang, M., & Mei, Q. (2016, April).
#' Visualizing large-scale and high-dimensional data.
#' In \emph{Proceedings of the 25th International Conference on World Wide Web}
#' (pp. 287-297).
#' International World Wide Web Conferences Steering Committee.
#' \url{https://arxiv.org/abs/1602.00370}
#'
#' Van der Maaten, L., & Hinton, G. (2008).
#' Visualizing data using t-SNE.
#' \emph{Journal of Machine Learning Research}, \emph{9} (2579-2605).
#' \url{https://www.jmlr.org/papers/v9/vandermaaten08a.html}
#' @export
umap <- function(X, n_neighbors = 15, n_components = 2, metric = "euclidean",
n_epochs = NULL, learning_rate = 1, scale = FALSE,
init = "spectral", init_sdev = NULL,
spread = 1, min_dist = 0.01,
set_op_mix_ratio = 1.0, local_connectivity = 1.0,
bandwidth = 1.0, repulsion_strength = 1.0,
negative_sample_rate = 5.0, a = NULL, b = NULL,
nn_method = NULL, n_trees = 50,
search_k = 2 * n_neighbors * n_trees,
approx_pow = FALSE,
y = NULL, target_n_neighbors = n_neighbors,
target_metric = "euclidean",
target_weight = 0.5,
pca = NULL, pca_center = TRUE,
pcg_rand = TRUE,
fast_sgd = FALSE,
ret_model = FALSE, ret_nn = FALSE, ret_extra = c(),
grain_size = 1,
tmpdir = tempdir(),
verbose = getOption("verbose", TRUE),
batch = FALSE,
opt_args = NULL, epoch_callback = NULL, pca_method = NULL) {
uwot(
X = X, n_neighbors = n_neighbors, n_components = n_components,
metric = metric, n_epochs = n_epochs, alpha = learning_rate, scale = scale,
init = init, init_sdev = init_sdev,
set_op_mix_ratio = set_op_mix_ratio,
local_connectivity = local_connectivity, bandwidth = bandwidth,
gamma = repulsion_strength, negative_sample_rate = negative_sample_rate,
a = a, b = b, nn_method = nn_method, n_trees = n_trees,
search_k = search_k,
method = "umap", approx_pow = approx_pow,
grain_size = grain_size,
y = y, target_n_neighbors = target_n_neighbors,
target_weight = target_weight, target_metric = target_metric,
pca = pca, pca_center = pca_center, pca_method = pca_method,
pcg_rand = pcg_rand,
fast_sgd = fast_sgd,
ret_model = ret_model || "model" %in% ret_extra,
ret_nn = ret_nn || "nn" %in% ret_extra,
ret_fgraph = "fgraph" %in% ret_extra,
batch = batch,
opt_args = opt_args,
epoch_callback = epoch_callback,
tmpdir = tempdir(),
verbose = verbose
)
}

#' Dimensionality Reduction Using t-Distributed UMAP (t-UMAP)
#'
#' A faster (but less flexible) version of the UMAP gradient. For more detail on
#' UMAP, see the  \code{\link{umap}} function.
#'
#' By setting the UMAP curve parameters \code{a} and \code{b} to \code{1}, you
#' get back the Cauchy distribution as used in t-SNE and LargeVis. It also
#' results in a substantially simplified gradient expression. This can give
#' a speed improvement of around 50\%.
#'
#'   Matrix and data frames should contain one observation per row. Data frames
#'   will have any non-numeric columns removed, although factor columns will be
#'   used if explicitly included via \code{metric} (see the help for
#'   \code{metric} for details). A sparse matrix is interpreted as a distance
#'   matrix, and is assumed to be symmetric, so you can also pass in an
#'   explicitly upper or lower triangular sparse matrix to save storage. There
#'   must be at least \code{n_neighbors} non-zero distances for each row. Both
#'   implicit and explicit zero entries are ignored. Set zero distances you want
#'   to keep to an arbitrarily small non-zero value (e.g. \code{1e-10}).
#'   \code{X} can also be \code{NULL} if pre-computed nearest neighbor data is
#'   passed to \code{nn_method}, and \code{init} is not \code{"spca"} or
#'   \code{"pca"}.
#' @param n_neighbors The size of local neighborhood (in terms of number of
#'   neighboring sample points) used for manifold approximation. Larger values
#'   result in more global views of the manifold, while smaller values result in
#'   more local data being preserved. In general values should be in the range
#'   \code{2} to \code{100}.
#' @param n_components The dimension of the space to embed into. This defaults
#'   to \code{2} to provide easy visualization, but can reasonably be set to any
#'   integer value in the range \code{2} to \code{100}.
#' @param metric Type of distance metric to use to find nearest neighbors. One
#'   of:
#' \itemize{
#'   \item \code{"euclidean"} (the default)
#'   \item \code{"cosine"}
#'   \item \code{"manhattan"}
#'   \item \code{"hamming"}
#'   \item \code{"correlation"} (a distance based on the Pearson correlation)
#'   \item \code{"categorical"} (see below)
#' }
#' Only applies if \code{nn_method = "annoy"} (for \code{nn_method = "fnn"}, the
#' distance metric is always "euclidean").
#'
#' If \code{X} is a data frame or matrix, then multiple metrics can be
#' specified, by passing a list to this argument, where the name of each item in
#' the list is one of the metric names above. The value of each list item should
#' be a vector giving the names or integer ids of the columns to be included in
#' a calculation, e.g. \code{metric = list(euclidean = 1:4, manhattan = 5:10)}.
#'
#' Each metric calculation results in a separate fuzzy simplicial set, which are
#' intersected together to produce the final set. Metric names can be repeated.
#' Because non-numeric columns are removed from the data frame, it is safer to
#' use column names than integer ids.
#'
#' Factor columns can also be used by specifying the metric name
#' \code{"categorical"}. Factor columns are treated different from numeric
#' columns and although multiple factor columns can be specified in a vector,
#' each factor column specified is processed individually. If you specify
#' a non-factor column, it will be coerced to a factor.
#'
#' For a given data block, you may override the \code{pca} and \code{pca_center}
#' arguments for that block, by providing a list with one unnamed item
#' containing the column names or ids, and then any of the \code{pca} or
#' \code{pca_center} overrides as named items, e.g. \code{metric =
#' list(euclidean = 1:4, manhattan = list(5:10, pca_center = FALSE))}. This
#' exists to allow mixed binary and real-valued data to be included and to have
#' PCA applied to both, but with centering applied only to the real-valued data
#' (it is typical not to apply centering to binary data before PCA is applied).
#' @param n_epochs Number of epochs to use during the optimization of the
#'   embedded coordinates. By default, this value is set to \code{500} for
#'   datasets containing 10,000 vertices or less, and \code{200} otherwise.
#'   If \code{n_epochs = 0}, then coordinates determined by \code{"init"} will
#'   be returned.
#' @param learning_rate Initial learning rate used in optimization of the
#'   coordinates.
#' @param scale Scaling to apply to \code{X} if it is a data frame or matrix:
#' \itemize{
#'   \item{\code{"none"} or \code{FALSE} or \code{NULL}} No scaling.
#'   \item{\code{"Z"} or \code{"scale"} or \code{TRUE}} Scale each column to
#'   zero mean and variance 1.
#'   \item{\code{"maxabs"}} Center each column to mean 0, then divide each
#'   element by the maximum absolute value over the entire matrix.
#'   \item{\code{"range"}} Range scale the entire matrix, so the smallest
#'   element is 0 and the largest is 1.
#'   \item{\code{"colrange"}} Scale each column in the range (0,1).
#' }
#' For t-UMAP, the default is \code{"none"}.
#' @param init Type of initialization for the coordinates. Options are:
#'   \itemize{
#'     \item \code{"spectral"} Spectral embedding using the normalized Laplacian
#'     of the fuzzy 1-skeleton, with Gaussian noise added.
#'     \item \code{"normlaplacian"}. Spectral embedding using the normalized
#'     Laplacian of the fuzzy 1-skeleton, without noise.
#'     \item \code{"random"}. Coordinates assigned using a uniform random
#'     distribution between -10 and 10.
#'     \item \code{"lvrandom"}. Coordinates assigned using a Gaussian
#'     distribution with standard deviation 1e-4, as used in LargeVis
#'     (Tang et al., 2016) and t-SNE.
#'     \item \code{"laplacian"}. Spectral embedding using the Laplacian Eigenmap
#'     (Belkin and Niyogi, 2002).
#'     \item \code{"pca"}. The first two principal components from PCA of
#'     \code{X} if \code{X} is a data frame, and from a 2-dimensional classical
#'     MDS if \code{X} is of class \code{"dist"}.
#'     \item \code{"spca"}. Like \code{"pca"}, but each dimension is then scaled
#'     so the standard deviation is 1e-4, to give a distribution similar to that
#'     used in t-SNE. This is an alias for \code{init = "pca", init_sdev =
#'     1e-4}.
#'     \item \code{"agspectral"} An "approximate global" modification of
#'     \code{"spectral"} which all edges in the graph to a value of 1, and then
#'     sets a random number of edges (\code{negative_sample_rate} edges per
#'     vertex) to 0.1, to approximate the effect of non-local affinities.
#'     \item A matrix of initial coordinates.
#'   }
#'  For spectral initializations, (\code{"spectral"}, \code{"normlaplacian"},
#'  \code{"laplacian"}), if more than one connected component is identified,
#'  each connected component is initialized separately and the results are
#'  merged. If \code{verbose = TRUE} the number of connected components are
#'  logged to the console. The existence of multiple connected components
#'  implies that a global view of the data cannot be attained with this
#'  initialization. Either a PCA-based initialization or increasing the value of
#'  \code{n_neighbors} may be more appropriate.
#' @param init_sdev If non-\code{NULL}, scales each dimension of the initialized
#'   coordinates (including any user-supplied matrix) to this standard
#'   deviation. By default no scaling is carried out, except when \code{init =
#'   "spca"}, in which case the value is \code{0.0001}. Scaling the input may
#'   help if the unscaled versions result in initial coordinates with large
#'   inter-point distances or outliers. This usually results in small gradients
#'   during optimization and very little progress being made to the layout.
#'   Shrinking the initial embedding by rescaling can help under these
#'   circumstances. Scaling the result of \code{init = "pca"} is usually
#'   recommended and \code{init = "spca"} as an alias for \code{init = "pca",
#'   init_sdev = 1e-4} but for the spectral initializations the scaled versions
#'   usually aren't necessary unless you are using a large value of
#'   \code{n_neighbors} (e.g. \code{n_neighbors = 150} or higher).
#' @param set_op_mix_ratio Interpolate between (fuzzy) union and intersection as
#'   the set operation used to combine local fuzzy simplicial sets to obtain a
#'   global fuzzy simplicial sets. Both fuzzy set operations use the product
#'   t-norm. The value of this parameter should be between \code{0.0} and
#'   \code{1.0}; a value of \code{1.0} will use a pure fuzzy union, while
#'   \code{0.0} will use a pure fuzzy intersection.
#' @param local_connectivity The local connectivity required -- i.e. the number
#'   of nearest neighbors that should be assumed to be connected at a local
#'   level. The higher this value the more connected the manifold becomes
#'   locally. In practice this should be not more than the local intrinsic
#'   dimension of the manifold.
#' @param bandwidth The effective bandwidth of the kernel if we view the
#'   algorithm as similar to Laplacian Eigenmaps. Larger values induce more
#'   connectivity and a more global view of the data, smaller values concentrate
#'   more locally.
#' @param repulsion_strength Weighting applied to negative samples in low
#'   dimensional embedding optimization. Values higher than one will result in
#'   greater weight being given to negative samples.
#' @param negative_sample_rate The number of negative edge/1-simplex samples to
#'   use per positive edge/1-simplex sample in optimizing the low dimensional
#'   embedding.
#' @param nn_method Method for finding nearest neighbors. Options are:
#'   \itemize{
#'     \item \code{"fnn"}. Use exact nearest neighbors via the
#'       \href{https://cran.r-project.org/package=FNN}{FNN} package.
#'     \item \code{"annoy"} Use approximate nearest neighbors via the
#'       \href{https://cran.r-project.org/package=RcppAnnoy}{RcppAnnoy} package.
#'    }
#'   By default, if \code{X} has less than 4,096 vertices, the exact nearest
#'   neighbors are found. Otherwise, approximate nearest neighbors are used.
#'   You may also pass pre-calculated nearest neighbor data to this argument. It
#'   must be a list consisting of two elements:
#'   \itemize{
#'     \item \code{"idx"}. A \code{n_vertices x n_neighbors} matrix
#'     containing the integer indexes of the nearest neighbors in \code{X}. Each
#'     vertex is considered to be its own nearest neighbor, i.e.
#'     \code{idx[, 1] == 1:n_vertices}.
#'     \item \code{"dist"}. A \code{n_vertices x n_neighbors} matrix
#'     containing the distances of the nearest neighbors.
#'   }
#'   Multiple nearest neighbor data (e.g. from two different pre-calculated
#'   metrics) can be passed by passing a list containing the nearest neighbor
#'   data lists as items.
#'   The \code{n_neighbors} parameter is ignored when using pre-calculated
#'   nearest neighbor data.
#' @param n_trees Number of trees to build when constructing the nearest
#'   neighbor index. The more trees specified, the larger the index, but the
#'   better the results. With \code{search_k}, determines the accuracy of the
#'   Annoy nearest neighbor search. Only used if the \code{nn_method} is
#'   \code{"annoy"}. Sensible values are between \code{10} to \code{100}.
#' @param search_k Number of nodes to search during the neighbor retrieval. The
#'   larger k, the more the accurate results, but the longer the search takes.
#'   With \code{n_trees}, determines the accuracy of the Annoy nearest neighbor
#'   search. Only used if the \code{nn_method} is \code{"annoy"}.
#' @param y Optional target data for supervised dimension reduction. Can be a
#' vector, matrix or data frame. Use the \code{target_metric} parameter to
#' specify the metrics to use, using the same syntax as \code{metric}. Usually
#' either a single numeric or factor column is used, but more complex formats
#' are possible. The following types are allowed:
#'   \itemize{
#'     \item Factor columns with the same length as \code{X}. \code{NA} is
#'     allowed for any observation with an unknown level, in which case
#'     UMAP operates as a form of semi-supervised learning. Each column is
#'     treated separately.
#'     \item Numeric data. \code{NA} is \emph{not} allowed in this case. Use the
#'     parameter \code{target_n_neighbors} to set the number of neighbors used
#'     with \code{y}. If unset, \code{n_neighbors} is used. Unlike factors,
#'     numeric columns are grouped into one block unless \code{target_metric}
#'     specifies otherwise. For example, if you wish columns \code{a} and
#'     \code{b} to be treated separately, specify
#'     \code{target_metric = list(euclidean = "a", euclidean = "b")}. Otherwise,
#'     the data will be effectively treated as a matrix with two columns.
#'     \item Nearest neighbor data, consisting of a list of two matrices,
#'     \code{idx} and \code{dist}. These represent the precalculated nearest
#'     neighbor indices and distances, respectively. This
#'     is the same format as that expected for precalculated data in
#'     \code{nn_method}. This format assumes that the underlying data was a
#'     numeric vector. Any user-supplied value of the \code{target_n_neighbors}
#'     parameter is ignored in this case, because the the number of columns in
#'     the matrices is used for the value. Multiple nearest neighbor data using
#'     different metrics can be supplied by passing a list of these lists.
#'   }
#' Unlike \code{X}, all factor columns included in \code{y} are automatically
#' used.
#' @param target_n_neighbors Number of nearest neighbors to use to construct the
#'   target simplicial set. Default value is \code{n_neighbors}. Applies only if
#'   \code{y} is non-\code{NULL} and \code{numeric}.
#' @param target_metric The metric used to measure distance for \code{y} if
#'   using supervised dimension reduction. Used only if \code{y} is numeric.
#' @param target_weight Weighting factor between data topology and target
#'   topology. A value of 0.0 weights entirely on data, a value of 1.0 weights
#'   entirely on target. The default of 0.5 balances the weighting equally
#'   between data and target. Only applies if \code{y} is non-\code{NULL}.
#' @param pca If set to a positive integer value, reduce data to this number of
#'   columns using PCA. Doesn't applied if the distance \code{metric} is
#'   \code{"hamming"}, or the dimensions of the data is larger than the
#'   number specified (i.e. number of rows and columns must be larger than the
#'   value of this parameter). If you have > 100 columns in a data frame or
#'   matrix, reducing the number of columns in this way may substantially
#'   increase the performance of the nearest neighbor search at the cost of a
#'   potential decrease in accuracy. In many t-SNE applications, a value of 50
#'   is recommended, although there's no guarantee that this is appropriate for
#'   all settings.
#' @param pca_center If \code{TRUE}, center the columns of \code{X} before
#'   carrying out PCA. For binary data, it's recommended to set this to
#'   \code{FALSE}.
#' @param pca_method Method to carry out any PCA dimensionality reduction when
#'   the \code{pca} parameter is specified. Allowed values are:
#'   \itemize{
#'     \item{\code{"irlba"}}. Uses \code{\link[irlba]{prcomp_irlba}} from the
#'     \href{https://cran.r-project.org/package=irlba}{irlba} package.
#'     \item{\code{"rsvd"}}. Uses 5 iterations of \code{\link[irlba]{svdr}} from
#'     the \href{https://cran.r-project.org/package=irlba}{irlba} package.
#'     This is likely to give much faster but potentially less accurate results
#'     than using \code{"irlba"}. For the purposes of nearest neighbor
#'     calculation and coordinates initialization, any loss of accuracy doesn't
#'     seem to matter much.
#'     from the \href{https://cran.r-project.org/package=bigstatsr}{bigstatsr}
#'     package. The SVD methods used in \code{bigstatsr} may be faster on
#'     Windows). \strong{Note}: \code{bigstatsr} is \emph{not} a dependency of
#'     uwot: if you choose to use this package for PCA, you \emph{must} install
#'     it yourself.
#'     \item{\code{"svd"}}. Uses \code{\link[base]{svd}} for the SVD. This is
#'     likely to be slow for all but the smallest datasets.
#'     \item{\code{"auto"}} (the default). Uses \code{"irlba"}, unless more than
#'     50% of the full set of singular vectors would be calculated, in which
#'     case \code{"svd"} is used.
#'   }
#' @param pcg_rand If \code{TRUE}, use the PCG random number generator (O'Neill,
#'   2014) during optimization. Otherwise, use the faster (but probably less
#'   statistically good) Tausworthe "taus88" generator. The default is
#'   \code{TRUE}.
#' @param fast_sgd If \code{TRUE}, then the following combination of parameters
#'   is set: \code{pcg_rand = TRUE} and \code{n_sgd_threads = "auto"}. The
#'   default is \code{FALSE}. Setting this to \code{TRUE} will speed up the
#'   stochastic optimization phase, but give a potentially less accurate
#'   embedding, and which will not be exactly reproducible even with a fixed
#'   seed. For visualization, \code{fast_sgd = TRUE} will give perfectly good
#'   results. For more generic dimensionality reduction, it's safer to leave
#'   \code{fast_sgd = FALSE}. If \code{fast_sgd = TRUE}, then user-supplied
#'   values of \code{pcg_rand} and \code{n_sgd_threads}, are ignored.
#' @param batch If \code{TRUE}, then embedding coordinates are updated at the
#'   end of each epoch rather than during the epoch. In batch mode, results are
#'   reproducible with a fixed random seed even with \code{n_sgd_threads > 1},
#'   at the cost of a slightly higher memory use. You may also have to modify
#'   \code{learning_rate} and increase \code{n_epochs}, so whether this provides
#'   a speed increase over the single-threaded optimization is likely to be
#'   dataset and hardware-dependent.
#' @param ret_model If \code{TRUE}, then return extra data that can be used to
#'   embedded coordinates are returned as the list item \code{embedding}. If
#'   \code{FALSE}, just return the coordinates. This parameter can be used in
#'   conjunction with \code{ret_nn} and \code{ret_extra}. Note that some
#'   settings are incompatible with the production of a UMAP model: external
#'   neighbor data (passed via a list to \code{nn_method}), and factor columns
#'   that were included via the \code{metric} parameter. In the latter case, the
#'   model produced is based only on the numeric data. A transformation using
#'   new data is possible, but the factor columns in the new data are ignored.
#' @param ret_nn If \code{TRUE}, then in addition to the embedding, also return
#'   nearest neighbor data that can be used as input to \code{nn_method} to
#'   avoid the overhead of repeatedly calculating the nearest neighbors when
#'   manipulating unrelated parameters (e.g. \code{min_dist}, \code{n_epochs},
#'   \code{init}). See the "Value" section for the names of the list items. If
#'   \code{FALSE}, just return the coordinates. Note that the nearest neighbors
#'   could be sensitive to data scaling, so be wary of reusing nearest neighbor
#'   data if modifying the \code{scale} parameter. This parameter can be used in
#'   conjunction with \code{ret_model} and \code{ret_extra}.
#' @param ret_extra A vector indicating what extra data to return. May contain
#'   any combination of the following strings:
#'   \itemize{
#'     \item \code{"model"} Same as setting \code{ret_model = TRUE}.
#'     \item \code{"nn"} Same as setting \code{ret_nn = TRUE}.
#'     \item \code{"fgraph"} the high dimensional fuzzy graph (i.e. the fuzzy
#'       simplicial set of the merged local views of the input data). The graph
#'       is returned as a sparse symmetric N x N matrix of class
#'       \link[Matrix]{dgCMatrix-class}, where a non-zero entry (i, j) gives the
#'       membership strength of the edge connecting vertex i and vertex j. This
#'       can be considered analogous to the input probability (or similarity or
#'       affinity) used in t-SNE and LargeVis. Note that the graph is further
#'       sparsified by removing edges with sufficiently low membership strength
#'       that they would not be sampled by the probabilistic edge sampling
#'       employed for optimization and therefore the number of non-zero elements
#'       in the matrix is dependent on \code{n_epochs}. If you are only
#'       interested in the fuzzy input graph (e.g. for clustering), setting
#'       \code{n_epochs = 0} will avoid any further sparsifying.
#'   }
#'   descent). Default is half the number of concurrent threads supported by the
#'   system. For nearest neighbor search, only applies if
#'   \code{nn_method = "annoy"}. If \code{n_threads > 1}, then the Annoy index
#'   will be temporarily written to disk in the location determined by
#'   descent. If set to > 1, then be aware that if \code{batch = FALSE}, results
#'   will \emph{not} be reproducible, even if \code{set.seed} is called with a
#'   fixed seed before running. Set to \code{"auto"} to use the same value as
#' @param grain_size The minimum amount of work to do on each thread. If this
#'   value is set high enough, then less than \code{n_threads} or
#'   \code{n_sgd_threads} will be used for processing, which might give a
#'   switching was outweighing the improvement due to concurrent processing.
#'   This should be left at default (\code{1}) and work will be spread evenly
#'   over all the threads specified.
#' @param tmpdir Temporary directory to store nearest neighbor indexes during
#'   nearest neighbor search. Default is \code{\link{tempdir}}. The index is
#'   only written to disk if \code{n_threads > 1} and
#'   \code{nn_method = "annoy"}; otherwise, this parameter is ignored.
#' @param verbose If \code{TRUE}, log details to the console.
#' @param opt_args A list of optimizer parameters, used when
#'   \code{batch = TRUE}. The default optimization method used is Adam (Kingma
#'   and Ba, 2014).
#'   \itemize{
#'     \item \code{method} The optimization method to use. Either \code{"adam"}
#'     \item \code{beta1} (Adam only). The weighting parameter for the
#'     exponential moving average of the first moment estimator. Effectively the
#'     momentum parameter. Should be a floating point value between 0 and 1.
#'     Higher values can smooth oscillatory updates in poorly-conditioned
#'     situations and may allow for a larger \code{learning_rate} to be
#'     specified, but too high can cause divergence. Default: \code{0.5}.
#'     \item \code{beta2} (Adam only). The weighting parameter for the
#'     exponential moving average of the uncentered second moment estimator.
#'     Should be a floating point value between 0 and 1. Controls the degree of
#'     adaptivity in the step-size. Higher values put more weight on previous
#'     time steps. Default: \code{0.9}.
#'     \item \code{eps} (Adam only). Intended to be a small value to prevent
#'     division by zero, but in practice can also affect convergence due to its
#'     interaction with \code{beta2}. Higher values reduce the effect of the
#'     descent with momentum. Typical values are between 1e-8 and 1e-3. Default:
#'     \code{1e-7}.
#'     \item \code{alpha} The initial learning rate. Default: the value of the
#'     \code{learning_rate} parameter.
#'   }
#' @param epoch_callback A function which will be invoked at the end of every
#'   epoch. Its signature should be: \code{(epoch, n_epochs, coords)}, where:
#'   \itemize{
#'     \item \code{epoch} The current epoch number (between \code{1} and
#'     \code{n_epochs}).
#'     \item \code{n_epochs} Number of epochs to use during the optimization of
#'     the embedded coordinates.
#'     \item \code{coords} The embedded coordinates as of the end of the current
#'     epoch, as a matrix with dimensions (N, \code{n_components}).
#'   }
#' @return A matrix of optimized coordinates, or:
#'   \itemize{
#'     \item if \code{ret_model = TRUE} (or \code{ret_extra} contains
#'     \code{"model"}), returns a list containing extra information that can be
#'     used to add new data to an existing embedding via
#'     \code{\link{umap_transform}}. In this case, the coordinates are available
#'     in the list item \code{embedding}. \bold{NOTE}: The contents of
#'     the \code{model} list should \emph{not} be considered stable or part of
#'     the public API, and are purposely left undocumented.
#'     \item if \code{ret_nn = TRUE} (or \code{ret_extra} contains \code{"nn"}),
#'     returns the nearest neighbor data as a list called \code{nn}. This
#'     contains one list for each \code{metric} calculated, itself containing a
#'     matrix \code{idx} with the integer ids of the neighbors; and a matrix
#'     \code{dist} with the distances. The \code{nn} list (or a sub-list) can be
#'     used as input to the \code{nn_method} parameter.
#'     \item if \code{ret_extra} contains \code{"fgraph"} returns the high
#'     dimensional fuzzy graph as a sparse matrix called \code{fgraph}, of type
#'   }
#'   The returned list contains the combined data from any combination of
#'   specifying \code{ret_model}, \code{ret_nn} and \code{ret_extra}.
#' @examples
#' iris_tumap <- tumap(iris, n_neighbors = 50, learning_rate = 0.5)
#' @export
tumap <- function(X, n_neighbors = 15, n_components = 2, metric = "euclidean",
n_epochs = NULL,
learning_rate = 1, scale = FALSE,
init = "spectral", init_sdev = NULL,
set_op_mix_ratio = 1.0, local_connectivity = 1.0,
bandwidth = 1.0, repulsion_strength = 1.0,
negative_sample_rate = 5.0,
nn_method = NULL, n_trees = 50,
search_k = 2 * n_neighbors * n_trees,
grain_size = 1,
y = NULL, target_n_neighbors = n_neighbors,
target_metric = "euclidean",
target_weight = 0.5,
pca = NULL, pca_center = TRUE,
pcg_rand = TRUE,
fast_sgd = FALSE,
ret_model = FALSE, ret_nn = FALSE, ret_extra = c(),
tmpdir = tempdir(),
verbose = getOption("verbose", TRUE),
batch = FALSE,
opt_args = NULL, epoch_callback = NULL,
pca_method = NULL) {
uwot(
X = X, n_neighbors = n_neighbors, n_components = n_components,
metric = metric,
n_epochs = n_epochs, alpha = learning_rate, scale = scale,
init = init, init_sdev = init_sdev,
spread = NULL, min_dist = NULL, set_op_mix_ratio = set_op_mix_ratio,
local_connectivity = local_connectivity, bandwidth = bandwidth,
gamma = repulsion_strength, negative_sample_rate = negative_sample_rate,
a = NULL, b = NULL, nn_method = nn_method, n_trees = n_trees,
search_k = search_k,
method = "tumap",
grain_size = grain_size,
y = y, target_n_neighbors = target_n_neighbors,
target_weight = target_weight, target_metric = target_metric,
pca = pca, pca_center = pca_center, pca_method = pca_method,
pcg_rand = pcg_rand,
fast_sgd = fast_sgd,
ret_model = ret_model || "model" %in% ret_extra,
ret_nn = ret_nn || "nn" %in% ret_extra,
ret_fgraph = "fgraph" %in% ret_extra,
batch = batch,
opt_args = opt_args,
epoch_callback = epoch_callback,
tmpdir = tmpdir,
verbose = verbose
)
}

#' Dimensionality Reduction with a LargeVis-like method
#'
#' Carry out dimensionality reduction of a dataset using a method similar to
#' LargeVis (Tang et al., 2016).
#'
#' \code{lvish} differs from the official LargeVis implementation in the
#' following:
#'
#' \itemize{
#'   \item Only the nearest-neighbor index search phase is multi-threaded.
#'   \item Matrix input data is not normalized.
#'   \item The \code{n_trees} parameter cannot be dynamically chosen based on
#'   data set size.
#'   \item Nearest neighbor results are not refined via the
#'   neighbor-of-my-neighbor method. The \code{search_k} parameter is twice
#'   as large than default to compensate.
#'   \item Gradient values are clipped to \code{4.0} rather than \code{5.0}.
#'   \item Negative edges are generated by uniform sampling of vertexes rather
#'   than their degree ^ 0.75.
#'   \item The default number of samples is much reduced. The default number of
#'   epochs, \code{n_epochs}, is set to \code{5000}, much larger than for
#'   \code{\link{umap}}, but may need to be increased further depending on your
#'   dataset. Using \code{init = "spectral"} can help.
#' }
#'
#'   Matrix and data frames should contain one observation per row. Data frames
#'   will have any non-numeric columns removed, although factor columns will be
#'   used if explicitly included via \code{metric} (see the help for
#'   \code{metric} for details). A sparse matrix is interpreted as a distance
#'   matrix, and is assumed to be symmetric, so you can also pass in an
#'   explicitly upper or lower triangular sparse matrix to save storage. There
#'   must be at least \code{n_neighbors} non-zero distances for each row. Both
#'   implicit and explicit zero entries are ignored. Set zero distances you want
#'   to keep to an arbitrarily small non-zero value (e.g. \code{1e-10}).
#'   \code{X} can also be \code{NULL} if pre-computed nearest neighbor data is
#'   passed to \code{nn_method}, and \code{init} is not \code{"spca"} or
#'   \code{"pca"}.
#' @param perplexity Controls the size of the local neighborhood used for
#'   manifold approximation. This is the analogous to \code{n_neighbors} in
#'   \code{\link{umap}}. Change this, rather than \code{n_neighbors}.
#' @param n_neighbors The number of neighbors to use when calculating the
#'  \code{perplexity}. Usually set to three times the value of the
#'  \code{perplexity}. Must be at least as large as \code{perplexity}.
#' @param n_components The dimension of the space to embed into. This defaults
#'   to \code{2} to provide easy visualization, but can reasonably be set to any
#'   integer value in the range \code{2} to \code{100}.
#' @param metric Type of distance metric to use to find nearest neighbors. One
#'   of:
#' \itemize{
#'   \item \code{"euclidean"} (the default)
#'   \item \code{"cosine"}
#'   \item \code{"manhattan"}
#'   \item \code{"hamming"}
#'   \item \code{"correlation"} (a distance based on the Pearson correlation)
#'   \item \code{"categorical"} (see below)
#' }
#' Only applies if \code{nn_method = "annoy"} (for \code{nn_method = "fnn"}, the
#' distance metric is always "euclidean").
#'
#' If \code{X} is a data frame or matrix, then multiple metrics can be
#' specified, by passing a list to this argument, where the name of each item in
#' the list is one of the metric names above. The value of each list item should
#' be a vector giving the names or integer ids of the columns to be included in
#' a calculation, e.g. \code{metric = list(euclidean = 1:4, manhattan = 5:10)}.
#'
#' Each metric calculation results in a separate fuzzy simplicial set, which are
#' intersected together to produce the final set. Metric names can be repeated.
#' Because non-numeric columns are removed from the data frame, it is safer to
#' use column names than integer ids.
#'
#' Factor columns can also be used by specifying the metric name
#' \code{"categorical"}. Factor columns are treated different from numeric
#' columns and although multiple factor columns can be specified in a vector,
#' each factor column specified is processed individually. If you specify
#' a non-factor column, it will be coerced to a factor.
#'
#' For a given data block, you may override the \code{pca} and \code{pca_center}
#' arguments for that block, by providing a list with one unnamed item
#' containing the column names or ids, and then any of the \code{pca} or
#' \code{pca_center} overrides as named items, e.g. \code{metric =
#' list(euclidean = 1:4, manhattan = list(5:10, pca_center = FALSE))}. This
#' exists to allow mixed binary and real-valued data to be included and to have
#' PCA applied to both, but with centering applied only to the real-valued data
#' (it is typical not to apply centering to binary data before PCA is applied).
#' @param n_epochs Number of epochs to use during the optimization of the
#'   embedded coordinates. The default is calculate the number of epochs
#'   dynamically based on dataset size, to give the same number of edge samples
#'   as the LargeVis defaults. This is usually substantially larger than the
#'   UMAP defaults. If \code{n_epochs = 0}, then coordinates determined by
#'   \code{"init"} will be returned.
#' @param learning_rate Initial learning rate used in optimization of the
#'   coordinates.
#' @param scale Scaling to apply to \code{X} if it is a data frame or matrix:
#' \itemize{
#'   \item{\code{"none"} or \code{FALSE} or \code{NULL}} No scaling.
#'   \item{\code{"Z"} or \code{"scale"} or \code{TRUE}} Scale each column to
#'   zero mean and variance 1.
#'   \item{\code{"maxabs"}} Center each column to mean 0, then divide each
#'   element by the maximum absolute value over the entire matrix.
#'   \item{\code{"range"}} Range scale the entire matrix, so the smallest
#'   element is 0 and the largest is 1.
#'   \item{\code{"colrange"}} Scale each column in the range (0,1).
#' }
#' For lvish, the default is \code{"maxabs"}, for consistency with LargeVis.
#' @param init Type of initialization for the coordinates. Options are:
#'   \itemize{
#'     \item \code{"spectral"} Spectral embedding using the normalized Laplacian
#'     of the fuzzy 1-skeleton, with Gaussian noise added.
#'     \item \code{"normlaplacian"}. Spectral embedding using the normalized
#'     Laplacian of the fuzzy 1-skeleton, without noise.
#'     \item \code{"random"}. Coordinates assigned using a uniform random
#'     distribution between -10 and 10.
#'     \item \code{"lvrandom"}. Coordinates assigned using a Gaussian
#'     distribution with standard deviation 1e-4, as used in LargeVis
#'     (Tang et al., 2016) and t-SNE.
#'     \item \code{"laplacian"}. Spectral embedding using the Laplacian Eigenmap
#'     (Belkin and Niyogi, 2002).
#'     \item \code{"pca"}. The first two principal components from PCA of
#'     \code{X} if \code{X} is a data frame, and from a 2-dimensional classical
#'     MDS if \code{X} is of class \code{"dist"}.
#'     \item \code{"spca"}. Like \code{"pca"}, but each dimension is then scaled
#'     so the standard deviation is 1e-4, to give a distribution similar to that
#'     used in t-SNE and LargeVis. This is an alias for \code{init = "pca",
#'     init_sdev = 1e-4}.
#'     \item \code{"agspectral"} An "approximate global" modification of
#'     \code{"spectral"} which all edges in the graph to a value of 1, and then
#'     sets a random number of edges (\code{negative_sample_rate} edges per
#'     vertex) to 0.1, to approximate the effect of non-local affinities.
#'     \item A matrix of initial coordinates.
#'   }
#'  For spectral initializations, (\code{"spectral"}, \code{"normlaplacian"},
#'  \code{"laplacian"}), if more than one connected component is identified,
#'  each connected component is initialized separately and the results are
#'  merged. If \code{verbose = TRUE} the number of connected components are
#'  logged to the console. The existence of multiple connected components
#'  implies that a global view of the data cannot be attained with this
#'  initialization. Either a PCA-based initialization or increasing the value of
#'  \code{n_neighbors} may be more appropriate.
#' @param init_sdev If non-\code{NULL}, scales each dimension of the initialized
#'   coordinates (including any user-supplied matrix) to this standard
#'   deviation. By default no scaling is carried out, except when \code{init =
#'   "spca"}, in which case the value is \code{0.0001}. Scaling the input may
#'   help if the unscaled versions result in initial coordinates with large
#'   inter-point distances or outliers. This usually results in small gradients
#'   during optimization and very little progress being made to the layout.
#'   Shrinking the initial embedding by rescaling can help under these
#'   circumstances. Scaling the result of \code{init = "pca"} is usually
#'   recommended and \code{init = "spca"} as an alias for \code{init = "pca",
#'   init_sdev = 1e-4} but for the spectral initializations the scaled versions
#'   usually aren't necessary unless you are using a large value of
#'   \code{n_neighbors} (e.g. \code{n_neighbors = 150} or higher).
#' @param repulsion_strength Weighting applied to negative samples in low
#'   dimensional embedding optimization. Values higher than one will result in
#'   greater weight being given to negative samples.
#' @param negative_sample_rate The number of negative edge/1-simplex samples to
#'   use per positive edge/1-simplex sample in optimizing the low dimensional
#'   embedding.
#' @param nn_method Method for finding nearest neighbors. Options are:
#'   \itemize{
#'     \item \code{"fnn"}. Use exact nearest neighbors via the
#'       \href{https://cran.r-project.org/package=FNN}{FNN} package.
#'     \item \code{"annoy"} Use approximate nearest neighbors via the
#'       \href{https://cran.r-project.org/package=RcppAnnoy}{RcppAnnoy} package.
#'    }
#'   By default, if \code{X} has less than 4,096 vertices, the exact nearest
#'   neighbors are found. Otherwise, approximate nearest neighbors are used.
#'   You may also pass precalculated nearest neighbor data to this argument. It
#'   must be a list consisting of two elements:
#'   \itemize{
#'     \item \code{"idx"}. A \code{n_vertices x n_neighbors} matrix
#'     containing the integer indexes of the nearest neighbors in \code{X}. Each
#'     vertex is considered to be its own nearest neighbor, i.e.
#'     \code{idx[, 1] == 1:n_vertices}.
#'     \item \code{"dist"}. A \code{n_vertices x n_neighbors} matrix
#'     containing the distances of the nearest neighbors.
#'   }
#'   Multiple nearest neighbor data (e.g. from two different precomputed
#'   metrics) can be passed by passing a list containing the nearest neighbor
#'   data lists as items.
#'   The \code{n_neighbors} parameter is ignored when using precomputed
#'   nearest neighbor data.
#' @param n_trees Number of trees to build when constructing the nearest
#'   neighbor index. The more trees specified, the larger the index, but the
#'   better the results. With \code{search_k}, determines the accuracy of the
#'   Annoy nearest neighbor search. Only used if the \code{nn_method} is
#'   \code{"annoy"}. Sensible values are between \code{10} to \code{100}.
#' @param search_k Number of nodes to search during the neighbor retrieval. The
#'   larger k, the more the accurate results, but the longer the search takes.
#'   With \code{n_trees}, determines the accuracy of the Annoy nearest neighbor
#'   search. Only used if the \code{nn_method} is \code{"annoy"}.
#'   descent). Default is half the number of concurrent threads supported by the
#'   system. For nearest neighbor search, only applies if
#'   \code{nn_method = "annoy"}. If \code{n_threads > 1}, then the Annoy index
#'   will be temporarily written to disk in the location determined by
#'   descent. If set to > 1, then be aware that if \code{batch = FALSE}, results
#'   will \emph{not} be reproducible, even if \code{set.seed} is called with a
#'   fixed seed before running. Set to \code{"auto"} to use the same value as
#' @param grain_size The minimum amount of work to do on each thread. If this
#'   value is set high enough, then less than \code{n_threads} or
#'   \code{n_sgd_threads} will be used for processing, which might give a
#'   switching was outweighing the improvement due to concurrent processing.
#'   This should be left at default (\code{1}) and work will be spread evenly
#'   over all the threads specified.
#' @param kernel Type of kernel function to create input probabilities. Can be
#'   one of \code{"gauss"} (the default) or \code{"knn"}. \code{"gauss"} uses
#'   the usual Gaussian weighted similarities. \code{"knn"} assigns equal
#'   probabilities to every edge in the nearest neighbor graph, and zero
#'   otherwise, using \code{perplexity} nearest neighbors. The \code{n_neighbors}
#'   parameter is ignored in this case.
#' @param pca If set to a positive integer value, reduce data to this number of
#'   columns using PCA. Doesn't applied if the distance \code{metric} is
#'   \code{"hamming"}, or the dimensions of the data is larger than the
#'   number specified (i.e. number of rows and columns must be larger than the
#'   value of this parameter). If you have > 100 columns in a data frame or
#'   matrix, reducing the number of columns in this way may substantially
#'   increase the performance of the nearest neighbor search at the cost of a
#'   potential decrease in accuracy. In many t-SNE applications, a value of 50
#'   is recommended, although there's no guarantee that this is appropriate for
#'   all settings.
#' @param pca_center If \code{TRUE}, center the columns of \code{X} before
#'   carrying out PCA. For binary data, it's recommended to set this to
#'   \code{FALSE}.
#' @param pca_method Method to carry out any PCA dimensionality reduction when
#'   the \code{pca} parameter is specified. Allowed values are:
#'   \itemize{
#'     \item{\code{"irlba"}}. Uses \code{\link[irlba]{prcomp_irlba}} from the
#'     \href{https://cran.r-project.org/package=irlba}{irlba} package.
#'     \item{\code{"rsvd"}}. Uses 5 iterations of \code{\link[irlba]{svdr}} from
#'     the \href{https://cran.r-project.org/package=irlba}{irlba} package.
#'     This is likely to give much faster but potentially less accurate results
#'     than using \code{"irlba"}. For the purposes of nearest neighbor
#'     calculation and coordinates initialization, any loss of accuracy doesn't
#'     seem to matter much.
#'     from the \href{https://cran.r-project.org/package=bigstatsr}{bigstatsr}
#'     package. The SVD methods used in \code{bigstatsr} may be faster on
#'     Windows). \strong{Note}: \code{bigstatsr} is \emph{not} a dependency of
#'     uwot: if you choose to use this package for PCA, you \emph{must} install
#'     it yourself.
#'     \item{\code{"svd"}}. Uses \code{\link[base]{svd}} for the SVD. This is
#'     likely to be slow for all but the smallest datasets.
#'     \item{\code{"auto"}} (the default). Uses \code{"irlba"}, unless more than
#'     50% of the full set of singular vectors would be calculated, in which
#'     case \code{"svd"} is used.
#'   }
#' @param pcg_rand If \code{TRUE}, use the PCG random number generator (O'Neill,
#'   2014) during optimization. Otherwise, use the faster (but probably less
#'   statistically good) Tausworthe "taus88" generator. The default is
#'   \code{TRUE}.
#' @param fast_sgd If \code{TRUE}, then the following combination of parameters
#'   is set: \code{pcg_rand = TRUE} and \code{n_sgd_threads = "auto"}. The
#'   default is \code{FALSE}. Setting this to \code{TRUE} will speed up the
#'   stochastic optimization phase, but give a potentially less accurate
#'   embedding, and which will not be exactly reproducible even with a fixed
#'   seed. For visualization, \code{fast_sgd = TRUE} will give perfectly good
#'   results. For more generic dimensionality reduction, it's safer to leave
#'   \code{fast_sgd = FALSE}. If \code{fast_sgd = TRUE}, then user-supplied
#'   values of \code{pcg_rand} and \code{n_sgd_threads}, are ignored.
#' @param batch If \code{TRUE}, then embedding coordinates are updated at the
#'   end of each epoch rather than during the epoch. In batch mode, results are
#'   reproducible with a fixed random seed even with \code{n_sgd_threads > 1},
#'   at the cost of a slightly higher memory use. You may also have to modify
#'   \code{learning_rate} and increase \code{n_epochs}, so whether this provides
#'   a speed increase over the single-threaded optimization is likely to be
#'   dataset and hardware-dependent.
#' @param ret_nn If \code{TRUE}, then in addition to the embedding, also return
#'   nearest neighbor data that can be used as input to \code{nn_method} to
#'   avoid the overhead of repeatedly calculating the nearest neighbors when
#'   manipulating unrelated parameters (e.g. \code{min_dist}, \code{n_epochs},
#'   \code{init}). See the "Value" section for the names of the list items. If
#'   \code{FALSE}, just return the coordinates. Note that the nearest neighbors
#'   could be sensitive to data scaling, so be wary of reusing nearest neighbor
#'   data if modifying the \code{scale} parameter.
#' @param ret_extra A vector indicating what extra data to return. May contain
#'   any combination of the following strings:
#'   \itemize{
#'     \item \code{"nn"} same as setting \code{ret_nn = TRUE}.
#'     \item \code{"P"} the high dimensional probability matrix. The graph
#'     is returned as a sparse symmetric N x N matrix of class
#'     \link[Matrix]{dgCMatrix-class}, where a non-zero entry (i, j) gives the
#'     input probability (or similarity or affinity) of the edge connecting
#'     vertex i and vertex j. Note that the graph is further sparsified by
#'     removing edges with sufficiently low membership strength that they would
#'     not be sampled by the probabilistic edge sampling employed for
#'     optimization and therefore the number of non-zero elements in the matrix
#'     is dependent on \code{n_epochs}. If you are only interested in the fuzzy
#'     input graph (e.g. for clustering), setting \code{n_epochs = 0} will avoid
#'     any further sparsifying.
#'   }
#' @param tmpdir Temporary directory to store nearest neighbor indexes during
#'   nearest neighbor search. Default is \code{\link{tempdir}}. The index is
#'   only written to disk if \code{n_threads > 1} and
#'   \code{nn_method = "annoy"}; otherwise, this parameter is ignored.
#' @param verbose If \code{TRUE}, log details to the console.
#' @param opt_args A list of optimizer parameters, used when
#'   \code{batch = TRUE}. The default optimization method used is Adam (Kingma
#'   and Ba, 2014).
#'   \itemize{
#'     \item \code{method} The optimization method to use. Either \code{"adam"}
#'     \item \code{beta1} (Adam only). The weighting parameter for the
#'     exponential moving average of the first moment estimator. Effectively the
#'     momentum parameter. Should be a floating point value between 0 and 1.
#'     Higher values can smooth oscillatory updates in poorly-conditioned
#'     situations and may allow for a larger \code{learning_rate} to be
#'     specified, but too high can cause divergence. Default: \code{0.5}.
#'     \item \code{beta2} (Adam only). The weighting parameter for the
#'     exponential moving average of the uncentered second moment estimator.
#'     Should be a floating point value between 0 and 1. Controls the degree of
#'     adaptivity in the step-size. Higher values put more weight on previous
#'     time steps. Default: \code{0.9}.
#'     \item \code{eps} (Adam only). Intended to be a small value to prevent
#'     division by zero, but in practice can also affect convergence due to its
#'     interaction with \code{beta2}. Higher values reduce the effect of the
#'     descent with momentum. Typical values are between 1e-8 and 1e-3. Default:
#'     \code{1e-7}.
#'     \item \code{alpha} The initial learning rate. Default: the value of the
#'     \code{learning_rate} parameter.
#'   }
#' @param epoch_callback A function which will be invoked at the end of every
#'   epoch. Its signature should be: \code{(epoch, n_epochs, coords)}, where:
#'   \itemize{
#'     \item \code{epoch} The current epoch number (between \code{1} and
#'     \code{n_epochs}).
#'     \item \code{n_epochs} Number of epochs to use during the optimization of
#'     the embedded coordinates.
#'     \item \code{coords} The embedded coordinates as of the end of the current
#'     epoch, as a matrix with dimensions (N, \code{n_components}).
#'   }
#' @return A matrix of optimized coordinates, or:
#'   \itemize{
#'     \item if \code{ret_nn = TRUE} (or \code{ret_extra} contains \code{"nn"}),
#'     returns the nearest neighbor data as a list called \code{nn}. This
#'     contains one list for each \code{metric} calculated, itself containing a
#'     matrix \code{idx} with the integer ids of the neighbors; and a matrix
#'     \code{dist} with the distances. The \code{nn} list (or a sub-list) can be
#'     used as input to the \code{nn_method} parameter.
#'     \item if \code{ret_extra} contains \code{"P"}, returns the high
#'     dimensional probability matrix as a sparse matrix called \code{P}, of
#'   }
#'   The returned list contains the combined data from any combination of
#'   specifying \code{ret_nn} and \code{ret_extra}.
#' @references
#' Tang, J., Liu, J., Zhang, M., & Mei, Q. (2016, April).
#' Visualizing large-scale and high-dimensional data.
#' In \emph{Proceedings of the 25th International Conference on World Wide Web}
#' (pp. 287-297).
#' International World Wide Web Conferences Steering Committee.
#' \url{https://arxiv.org/abs/1602.00370}
#'
#' @examples
#' # Default number of epochs is much larger than for UMAP, assumes random
#' # initialization. Use perplexity rather than n_neighbors to control the size
#' # of the local neighborhood 20 epochs may be too small for a random
#' # initialization
#' iris_lvish <- lvish(iris,
#'   perplexity = 50, learning_rate = 0.5,
#'   init = "random", n_epochs = 20
#' )
#' @export
lvish <- function(X, perplexity = 50, n_neighbors = perplexity * 3,
n_components = 2, metric = "euclidean", n_epochs = -1,
learning_rate = 1, scale = "maxabs",
init = "lvrandom", init_sdev = NULL,
repulsion_strength = 7,
negative_sample_rate = 5.0,
nn_method = NULL, n_trees = 50,
search_k = 2 * n_neighbors * n_trees,
grain_size = 1,
kernel = "gauss",
pca = NULL, pca_center = TRUE,
pcg_rand = TRUE,
fast_sgd = FALSE,
ret_nn = FALSE, ret_extra = c(),
tmpdir = tempdir(),
verbose = getOption("verbose", TRUE),
batch = FALSE,
opt_args = NULL, epoch_callback = NULL,
pca_method = NULL) {
uwot(X,
n_neighbors = n_neighbors, n_components = n_components,
metric = metric,
n_epochs = n_epochs, alpha = learning_rate, scale = scale,
init = init, init_sdev = init_sdev,
gamma = repulsion_strength, negative_sample_rate = negative_sample_rate,
nn_method = nn_method,
n_trees = n_trees, search_k = search_k,
method = "largevis", perplexity = perplexity,
pca = pca, pca_center = pca_center, pca_method = pca_method,
grain_size = grain_size,
kernel = kernel,
ret_nn = ret_nn || "nn" %in% ret_extra,
ret_fgraph = "P" %in% ret_extra,
pcg_rand = pcg_rand,
fast_sgd = fast_sgd,
batch = batch,
opt_args = opt_args,
epoch_callback = epoch_callback,
tmpdir = tmpdir,
verbose = verbose
)
}

# Function that does all the real work
uwot <- function(X, n_neighbors = 15, n_components = 2, metric = "euclidean",
n_epochs = NULL,
alpha = 1, scale = FALSE,
init = "spectral", init_sdev = NULL,
spread = 1, min_dist = 0.01,
set_op_mix_ratio = 1.0, local_connectivity = 1.0,
bandwidth = 1.0, gamma = 1.0,
negative_sample_rate = 5.0, a = NULL, b = NULL,
nn_method = NULL, n_trees = 50,
search_k = 2 * n_neighbors * n_trees,
method = "umap",
perplexity = 50, approx_pow = FALSE,
y = NULL, target_n_neighbors = n_neighbors,
target_metric = "euclidean",
target_weight = 0.5,
grain_size = 1,
kernel = "gauss",
ret_model = FALSE, ret_nn = FALSE, ret_fgraph = FALSE,
pca = NULL, pca_center = TRUE, pca_method = NULL,
pcg_rand = TRUE,
fast_sgd = FALSE,
batch = FALSE,
opt_args = NULL,
tmpdir = tempdir(),
verbose = getOption("verbose", TRUE),
epoch_callback = NULL) {
}
if (method == "umap" && (is.null(a) || is.null(b))) {
a <- ab_res
b <- ab_res
tsmessage("UMAP embedding parameters a = ", formatC(a), " b = ", formatC(b))
}

if (n_neighbors < 2) {
stop("n_neighbors must be >= 2")
}

if (set_op_mix_ratio < 0.0 || set_op_mix_ratio > 1.0) {
stop("set_op_mix_ratio must be between 0.0 and 1.0")
}

if (local_connectivity < 1.0) {
stop("local_connectivity cannot be < 1.0")
}

if (!is.null(y) && is.numeric(y) && any(is.na(y))) {
stop("numeric y cannot contain NA")
}

if (!is.numeric(n_components) || n_components < 1) {
stop("'n_components' must be a positive integer")
}

if (!is.null(pca)) {
if (!is.numeric(pca) || pca < 1) {
stop("'pca' must be a positive integer")
}
if (pca < n_components) {
stop("'pca' must be >= n_components")
}
if (pca > min(nrow(X), ncol(X))) {
stop("'pca' must be <= min(nrow(X), ncol(X))")
}
}
if (is.null(pca_method)) {
pca_method <- "auto"
}
pca_method <-
match.arg(pca_method,
choices = c("irlba", "svdr", "bigstatsr", "svd", "auto"))

if (fast_sgd) {
pcg_rand <- FALSE
approx_pow <- TRUE
}

}
if (n_threads %% 1 != 0) {
}
}
}
if (n_sgd_threads %% 1 != 0) {
}

# Store categorical columns to be used to generate the graph
Xcat <- NULL
# number of original columns in data frame (or matrix)
# will be used only if using df or matrix and ret_model = TRUE
norig_col <- NULL
# row names for the input data, which we will apply to the embedding if
# needed
Xnames <- NULL
if (is.null(X)) {
if (!is.list(nn_method)) {
stop("If X is NULL, must provide NN data in nn_method")
}
if (is.character(init) && tolower(init) %in% c("spca", "pca")) {
stop("init = 'pca' and 'spca' can't be used with X = NULL")
}
n_vertices <- x2nv(nn_method)
stopifnot(n_vertices > 0)
n_neighbors <- nn_graph_nbrs(nn_method)
stopifnot(n_neighbors > 1 && n_neighbors <= n_vertices)
check_graph_list(nn_method, n_vertices, n_neighbors)
Xnames <- nn_graph_row_names(nn_method)
}
else if (methods::is(X, "dist")) {
if (ret_model) {
stop("Can only create models with dense matrix or data frame input")
}
checkna(X)
n_vertices <- attr(X, "Size")
Xnames <- labels(X)
}
else if (methods::is(X, "sparseMatrix")) {
if (ret_model) {
stop("Can only create models with dense matrix or data frame input")
}
checkna(X)
n_vertices <- nrow(X)
if (ncol(X) != n_vertices) {
stop("Sparse matrices are only supported as distance matrices")
}
tsmessage("Read ", n_vertices, " rows of sparse distance matrix")
Xnames <- row.names(X)
}
else {
cat_ids <- NULL
norig_col <- ncol(X)
if (methods::is(X, "data.frame") || methods::is(X, "matrix")) {
if (methods::is(X, "matrix")) {
X <- data.frame(X)
}
cat_res <- find_categoricals(metric)
metric <- cat_res$metrics cat_ids <- cat_res$categoricals
# Convert categorical columns to factors if they aren't already
if (!is.null(cat_ids)) {
X[, cat_ids] <- lapply(X[, cat_ids, drop = FALSE], factor)
Xcat <- X[, cat_ids, drop = FALSE]
}

indexes <- which(vapply(X, is.numeric, logical(1)))
if (length(indexes) == 0) {
stop("No numeric columns found")
}
X <- as.matrix(X[, indexes])
}
checkna(X)
n_vertices <- nrow(X)
tsmessage(
"Read ", n_vertices, " rows and found ", ncol(X),
" numeric columns",
appendLF = is.null(cat_ids)
)
if (length(cat_ids) > 0) {
tsmessage(" and ", pluralize("categorical column", length(cat_ids)),
time_stamp = FALSE
)
}
Xnames <- row.names(X)
X <- scale_input(X,
scale_type = scale, ret_model = ret_model,
verbose = verbose
)
}

if (method == "largevis" && kernel == "knn") {
n_neighbors <- perplexity
}

if (n_neighbors > n_vertices) {
# pre-calculated nearest neighbors ignores the user-supplied n_neighbors
# which is handled later
if (!is.list(nn_method)) {
if (method == "largevis") {
# for LargeVis, n_neighbors normally determined from perplexity not an
# error to be too large
tsmessage("Setting n_neighbors to ", n_vertices)
n_neighbors <- n_vertices
}
else {
stop("n_neighbors must be smaller than the dataset size")
}
}
}

if (!is.list(metric)) {
metrics <- list(c())
names(metrics) <- metric
}
else {
metrics <- metric
}

# For typical case of numeric matrix X and not using hamming distance, save
# PCA results here in case initialization uses PCA too
pca_models <- NULL
pca_shortcut <- FALSE
if (!is.null(pca) && length(metric) == 1 && metric != "hamming" &&
is.matrix(X) && ncol(X) > pca) {
tsmessage("Reducing X column dimension to ", pca, " via PCA")
pca_res <- pca_scores(X,
ncol = pca, center = pca_center, pca_method = pca_method,
ret_extra = ret_model, verbose = verbose
)
if (ret_model) {
X <- pca_res$scores pca_models[["1"]] <- pca_res[c("center", "rotation")] pca_res <- NULL } else { X <- pca_res } pca_shortcut <- TRUE } d2sr <- data2set(X, Xcat, n_neighbors, metrics, nn_method, n_trees, search_k, method, set_op_mix_ratio, local_connectivity, bandwidth, perplexity, kernel, n_threads, grain_size, ret_model, pca = pca, pca_center = pca_center, pca_method = pca_method, n_vertices = n_vertices, tmpdir = tmpdir, verbose = verbose ) V <- d2sr$V
nns <- d2sr$nns if (is.null(pca_models)) { pca_models <- d2sr$pca_models
}

if (!is.null(y)) {
tsmessage("Processing y data")

if (!is.list(target_metric)) {
target_metrics <- list(c())
names(target_metrics) <- target_metric
}
else {
target_metrics <- target_metric
}

ycat <- NULL
ycat_ids <- NULL
if (methods::is(y, "data.frame")) {
ycat_res <- find_categoricals(target_metric)
target_metric <- ycat_res$metrics ycat_ids <- ycat_res$categoricals
if (!is.null(ycat_ids)) {
ycat <- y[, ycat_ids, drop = FALSE]
}
else {
ycindexes <- which(vapply(y, is.factor, logical(1)))
if (length(ycindexes) > 0) {
ycat <- (y[, ycindexes, drop = FALSE])
}
}

yindexes <- which(vapply(y, is.numeric, logical(1)))

if (length(yindexes) > 0) {
y <- as.matrix(y[, yindexes])
}
else {
y <- NULL
}
}
else if (is.list(y)) {
nn_method <- y
}
else if (is.numeric(y)) {
y <- as.matrix(y)
}
else if (is.factor(y)) {
ycat <- data.frame(y)
y <- NULL
}

if (!is.null(y)) {
yd2sr <- data2set(y, ycat, target_n_neighbors, target_metrics, nn_method,
n_trees, search_k,
method,
set_op_mix_ratio = 1.0,
local_connectivity = 1.0,
bandwidth = 1.0,
perplexity = perplexity, kernel = kernel,
ret_model = FALSE,
pca = pca, pca_center = TRUE, pca_method = pca_method,
n_vertices = n_vertices,
tmpdir = tmpdir,
verbose = verbose
)

tsmessage(
"Intersecting X and Y sets with target weight = ",
formatC(target_weight)
)
V <- set_intersect(V, yd2sr$V, target_weight, reset = TRUE) yd2sr$V <- NULL
yd2sr$nns <- NULL } else if (!is.null(ycat)) { V <- categorical_intersection_df(ycat, V, weight = target_weight, verbose = verbose ) } } if (!(ret_model || ret_nn)) { nns <- NULL gc() } if (methods::is(init, "matrix")) { if (nrow(init) != n_vertices || ncol(init) != n_components) { stop("init matrix does not match necessary configuration for X: ", "should have dimensions (", n_vertices, ", ", n_components, ")") } tsmessage("Initializing from user-supplied matrix") embedding <- init } else if (!(methods::is(init, "character") && length(init) == 1)) { stop("init should be either a matrix or string describing the ", "initialization method") } else { init <- match.arg(tolower(init), c( "spectral", "random", "lvrandom", "normlaplacian", "laplacian", "spca", "pca", "inormlaplacian", "ispectral", "agspectral" )) if (init_is_spectral(init)) { connected <- connected_components(V) if (connected$n_components > 1) {
tsmessage("Found ", connected$n_components, " connected components, ", appendLF = FALSE) if (is.null(X)) { tsmessage("falling back to random initialization", time_stamp = FALSE) init <- "random" } else { tsmessage("falling back to 'spca' initialization with init_sdev = 1", time_stamp = FALSE) init <- "spca" init_sdev <- 1 } } } # Don't repeat PCA initialization if we've already done it once if (pca_shortcut && init %in% c("spca", "pca") && pca >= n_components) { embedding <- X[, 1:n_components] if (init == "spca") { tsmessage("Initializing from scaled PCA") } else { tsmessage("Initializing from PCA") } } else { embedding <- switch(init, spectral = spectral_init(V, ndim = n_components, verbose = verbose), random = rand_init(n_vertices, n_components, verbose = verbose), lvrandom = rand_init_lv(n_vertices, n_components, verbose = verbose), normlaplacian = normalized_laplacian_init(V, ndim = n_components, verbose = verbose ), laplacian = laplacian_eigenmap(V, ndim = n_components, verbose = verbose), # we handle scaling pca below spca = pca_init(X, ndim = n_components, pca_method = pca_method, verbose = verbose), pca = pca_init(X, ndim = n_components, pca_method = pca_method, verbose = verbose), ispectral = irlba_spectral_init(V, ndim = n_components, verbose = verbose), inormlaplacian = irlba_normalized_laplacian_init(V, ndim = n_components, verbose = verbose ), agspectral = agspectral_init(V, n_neg_nbrs = negative_sample_rate, ndim = n_components, verbose = verbose ), stop("Unknown initialization method: '", init, "'") ) } if (!is.null(init_sdev) || init == "spca") { if (is.null(init_sdev)) { init_sdev <- 1e-4 } embedding <- shrink_coords(embedding, init_sdev) } } if (is.null(n_epochs) || n_epochs < 0) { if (method == "largevis") { n_epochs <- lvish_epochs(n_vertices, V) } else { if (n_vertices <= 10000) { n_epochs <- 500 } else { n_epochs <- 200 } } } full_opt_args <- get_opt_args(opt_args, alpha) if (n_epochs > 0) { V@x[V@x < max(V@x) / n_epochs] <- 0 V <- Matrix::drop0(V) # Create the (0-indexed) indices of the head and tail of each directed edge # in V. Graph is symmetric, so both (i->j) and (j->i) are present if (batch) { V <- Matrix::t(V) # head is ordered in non-decreasing order of node index positive_head <- Matrix::which(V != 0, arr.ind = TRUE)[, 2] - 1 # tail is unordered positive_tail <- V@i } else { # Use the Python UMAP ordering # head is unordered positive_head <- V@i # tail is ordered in non-decreasing order of node index positive_tail <- Matrix::which(V != 0, arr.ind = TRUE)[, 2] - 1 } # start/end pointers into the ordered vector positive_ptr <- V@p epochs_per_sample <- make_epochs_per_sample(V@x, n_epochs) tsmessage( "Commencing optimization for ", n_epochs, " epochs, with ", length(positive_head), " positive edges", pluralize("thread", n_sgd_threads, " using") ) method <- tolower(method) if (method == "umap") { method_args <- list(a = a, b = b, gamma = gamma, approx_pow = approx_pow) } else if (method == "tumap") { method_args <- list() } else { method_args <- list(gamma = gamma) } embedding <- t(embedding) embedding <- optimize_layout_r( head_embedding = embedding, tail_embedding = NULL, positive_head = positive_head, positive_tail = positive_tail, positive_ptr = positive_ptr, n_epochs = n_epochs, n_head_vertices = n_vertices, n_tail_vertices = n_vertices, epochs_per_sample = epochs_per_sample, method = method, method_args = method_args, initial_alpha = alpha, opt_args = full_opt_args, negative_sample_rate = negative_sample_rate, pcg_rand = pcg_rand, batch = batch, n_threads = n_sgd_threads, grain_size = grain_size, move_other = TRUE, epoch_callback = epoch_callback, verbose = verbose ) embedding <- t(embedding) gc() # Center the points before returning embedding <- scale(embedding, center = TRUE, scale = FALSE) if (is.null(row.names(embedding)) && !is.null(Xnames) && length(Xnames) == nrow(embedding)) { row.names(embedding) <- Xnames } tsmessage("Optimization finished") } if (ret_model || ret_nn || ret_fgraph) { nblocks <- length(nns) res <- list(embedding = embedding) if (ret_model) { res <- append(res, list( scale_info = if (!is.null(X)) { attr_to_scale_info(X) } else { NULL }, n_neighbors = n_neighbors, search_k = search_k, local_connectivity = local_connectivity, n_epochs = n_epochs, alpha = alpha, negative_sample_rate = negative_sample_rate, method = method, a = a, b = b, gamma = gamma, approx_pow = approx_pow, metric = metrics, norig_col = norig_col, pcg_rand = pcg_rand, batch = batch, opt_args = full_opt_args )) if (nblocks > 1) { res$nn_index <- list()
for (i in 1:nblocks) {
res$nn_index[[i]] <- nns[[i]]$index
}
}
else {
res$nn_index <- nns[]$index
if (is.null(res$metric[])) { # 31: Metric usually lists column indices or names, NULL means use all # of them, but for loading the NN index we need the number of # columns explicitly (we don't have access to the column dimension of # the input data at load time) # To be sure of the dimensionality, fetch the first item from the # index and see how many elements are in the returned vector. if(!is.null(X)){ rcppannoy <- get_rcppannoy(res$nn_index)
res$metric[] <- list(ndim = length(rcppannoy$getItemsVector(0)))
} else {
res$metric[] <- list() } } } if (!is.null(pca_models)) { res$pca_models <- pca_models
}
}
if (ret_nn) {
res$nn <- list() for (i in 1:nblocks) { res$nn[[i]] <- list(idx = nns[[i]]$idx, dist = nns[[i]]$dist)
if (!is.null(Xnames) && nrow(res$nn[[i]]$idx) == length(Xnames)) {
row.names(res$nn[[i]]$idx) <- Xnames
row.names(res$nn[[i]]$dist) <- Xnames
}
}
names(res$nn) <- names(nns) } if (ret_fgraph) { if (method == "largevis") { res$P <- V
}
else {
res$fgraph <- V } } } else { res <- embedding } res } #' Save or Load a Model #' #' Functions to write a UMAP model to a file, and to restore. #' #' @param model a UMAP model create by \code{\link{umap}}. #' @param file name of the file where the model is to be saved or read from. #' @param unload if \code{TRUE}, unload all nearest neighbor indexes for the #' model. The \code{model} will no longer be valid for use in #' \code{\link{umap_transform}} and the temporary working directory used #' during model saving will be deleted. You will need to reload the model with #' \code{load_uwot} to use the model. If \code{FALSE}, then the model can be #' re-used without reloading, but you must manually unload the NN index when #' you are finished using it if you want to delete the temporary working #' directory. To unload manually, use \code{\link{unload_uwot}}. The absolute #' path of the working directory is found in the \code{mod_dir} item of the #' return value. #' @param verbose if \code{TRUE}, log information to the console. #' @return \code{model} with one extra item: \code{mod_dir}, which contains the #' path to the working directory. If \code{unload = FALSE} then this directory #' still exists after this function returns, and can be cleaned up with #' \code{\link{unload_uwot}}. If you don't care about cleaning up this #' directory, or \code{unload = TRUE}, then you can ignore the return value. #' @examples #' iris_train <- iris[c(1:10, 51:60), ] #' iris_test <- iris[100:110, ] #' #' # create model #' model <- umap(iris_train, ret_model = TRUE, n_epochs = 20) #' #' # save without unloading: this leaves behind a temporary working directory #' model_file <- tempfile("iris_umap") #' model <- save_uwot(model, file = model_file) #' #' # The model can continue to be used #' test_embedding <- umap_transform(iris_test, model) #' #' # To manually unload the model from memory when finished and to clean up #' # the working directory (this doesn't touch your model file) #' unload_uwot(model) #' #' # At this point, model cannot be used with umap_transform, this would fail: #' # test_embedding2 <- umap_transform(iris_test, model) #' #' # restore the model: this also creates a temporary working directory #' model2 <- load_uwot(file = model_file) #' test_embedding2 <- umap_transform(iris_test, model2) #' #' # Unload and clean up the loaded model temp directory #' unload_uwot(model2) #' #' # clean up the model file #' unlink(model_file) #' #' # save with unloading: this deletes the temporary working directory but #' # doesn't allow the model to be re-used #' model3 <- umap(iris_train, ret_model = TRUE, n_epochs = 20) #' model_file3 <- tempfile("iris_umap") #' model3 <- save_uwot(model3, file = model_file3, unload = TRUE) #' #' @seealso \code{\link{load_uwot}}, \code{\link{unload_uwot}} #' @export save_uwot <- function(model, file, unload = FALSE, verbose = FALSE) { if (!all_nn_indices_are_loaded(model)) { stop("cannot save: NN index is unloaded") } wd <- getwd() model_file <- abspath(file) if (file.exists(model_file)) { stop("model file ", model_file, " already exists") } tryCatch( { # create directory to store files in mod_dir <- tempfile(pattern = "dir") tsmessage("Creating temp model dir ", mod_dir) dir.create(mod_dir) # create the tempdir/uwot subdirectory uwot_dir <- file.path(mod_dir, "uwot") tsmessage("Creating dir ", mod_dir) dir.create(uwot_dir) # save model file to tempdir/uwot/model model_tmpfname <- file.path(uwot_dir, "model") saveRDS(model, file = model_tmpfname) # save each nn index inside tempdir/uwot/model metrics <- names(model$metric)
n_metrics <- length(metrics)
for (i in 1:n_metrics) {
nn_tmpfname <- file.path(uwot_dir, paste0("nn", i))
if (n_metrics == 1) {
model$nn_index$ann$save(nn_tmpfname) } else { model$nn_index[[i]]$ann$save(nn_tmpfname)
}
}

# archive the files under the temp dir into the single target file
# change directory so the archive only contains one directory
tsmessage("Changing to ", mod_dir)
setwd(mod_dir)
tmp_model_file <- abspath(file)
tsmessage("Creating ", tmp_model_file)
utils::tar(tarfile = tmp_model_file, files = "uwot/")
},
finally = {
setwd(wd)
if (model_file != tmp_model_file) {
tsmessage("Copying ", tmp_model_file, " to ", model_file)
file.copy(from = tmp_model_file, to = model_file)
}
model$mod_dir <- mod_dir if (unload) { unload_uwot(model, cleanup = TRUE, verbose = verbose) } } ) model } #' Save or Load a Model #' #' Functions to write a UMAP model to a file, and to restore. #' #' @param file name of the file where the model is to be saved or read from. #' @param verbose if \code{TRUE}, log information to the console. #' @return The model saved at \code{file}, for use with #' \code{\link{umap_transform}}. Additionally, it contains an extra item: #' \code{mod_dir}, which contains the path to the temporary working directory #' used during loading of the model. This directory cannot be removed until #' this model has been unloaded by using \code{\link{unload_uwot}}. #' @examples #' iris_train <- iris[c(1:10, 51:60), ] #' iris_test <- iris[100:110, ] #' #' # create model #' model <- umap(iris_train, ret_model = TRUE, n_epochs = 20) #' #' # save without unloading: this leaves behind a temporary working directory #' model_file <- tempfile("iris_umap") #' model <- save_uwot(model, file = model_file) #' #' # The model can continue to be used #' test_embedding <- umap_transform(iris_test, model) #' #' # To manually unload the model from memory when finished and to clean up #' # the working directory (this doesn't touch your model file) #' unload_uwot(model) #' #' # At this point, model cannot be used with umap_transform, this would fail: #' # test_embedding2 <- umap_transform(iris_test, model) #' #' # restore the model: this also creates a temporary working directory #' model2 <- load_uwot(file = model_file) #' test_embedding2 <- umap_transform(iris_test, model2) #' #' # Unload and clean up the loaded model temp directory #' unload_uwot(model2) #' #' # clean up the model file #' unlink(model_file) #' #' # save with unloading: this deletes the temporary working directory but #' # doesn't allow the model to be re-used #' model3 <- umap(iris_train, ret_model = TRUE, n_epochs = 20) #' model_file3 <- tempfile("iris_umap") #' model3 <- save_uwot(model3, file = model_file3, unload = TRUE) #' #' @seealso \code{\link{save_uwot}}, \code{\link{unload_uwot}} #' @export load_uwot <- function(file, verbose = FALSE) { # create directory to store files in mod_dir <- tempfile(pattern = "dir") tsmessage("Creating temp directory ", mod_dir) dir.create(mod_dir) utils::untar(abspath(file), exdir = mod_dir, verbose = verbose) model_fname <- file.path(mod_dir, "uwot/model") if (!file.exists(model_fname)) { stop("Can't find model in ", file) } model <- readRDS(file = model_fname) metrics <- names(model$metric)
n_metrics <- length(metrics)

for (i in 1:n_metrics) {
nn_fname <- file.path(mod_dir, paste0("uwot/nn", i))
if (!file.exists(nn_fname)) {
stop("Can't find nearest neighbor index ", nn_fname, " in ", file)
}
metric <- metrics[[i]]
# 31: need to specify the index dimensionality when creating the index
if (is.list(model$metric[[i]])) { # in case where there is only one metric, the value is a one-item list # named 'ndim' giving the number of dimensions directly: all columns # are used in this metric ndim <- model$metric[[i]]$ndim } else { # otherwise, metric specifies the name or index used for each metric, # so the dimension is the number of them ndim = length(model$metric[[i]])
}
annoy_metric <- metric
if (metric == "correlation") {
annoy_metric <- "cosine"
}
ann <- create_ann(annoy_metric, ndim = ndim)
ann$load(nn_fname) if (n_metrics == 1) { model$nn_index <- ann
}
else {
model$nn_index[[i]] <- ann } } model$mod_dir <- mod_dir

model
}

#'
#' Unloads the UMAP model. This prevents the model being used with
#' \code{\link{umap_transform}}, but allows the temporary working directory
#'
#' @param model a UMAP model create by \code{\link{umap}}.
#' @param cleanup if \code{TRUE}, attempt to delete the temporary working
#'   directory that was used in either the save or load of the model.
#'
#' @examples
#' iris_train <- iris[c(1:10, 51:60), ]
#' iris_test <- iris[100:110, ]
#'
#' # create model
#' model <- umap(iris_train, ret_model = TRUE, n_epochs = 20)
#'
#' # save without unloading: this leaves behind a temporary working directory
#' model_file <- tempfile("iris_umap")
#' model <- save_uwot(model, file = model_file)
#'
#' # The model can continue to be used
#' test_embedding <- umap_transform(iris_test, model)
#'
#' # To manually unload the model from memory when finished and to clean up
#' # the working directory (this doesn't touch your model file)
#'
#' # At this point, model cannot be used with umap_transform, this would fail:
#' # test_embedding2 <- umap_transform(iris_test, model)
#'
#' # restore the model: this also creates a temporary working directory
#' model2 <- load_uwot(file = model_file)
#' test_embedding2 <- umap_transform(iris_test, model2)
#'
#'
#' # clean up the model file
#'
#' # save with unloading: this deletes the temporary working directory but
#' # doesn't allow the model to be re-used
#' model3 <- umap(iris_train, ret_model = TRUE, n_epochs = 20)
#' model_file3 <- tempfile("iris_umap")
#' model3 <- save_uwot(model3, file = model_file3, unload = TRUE)
#'
#' @export
unload_uwot <- function(model, cleanup = TRUE, verbose = FALSE) {
metrics <- names(model$metric) n_metrics <- length(metrics) for (i in 1:n_metrics) { if (n_metrics == 1) { rcppannoy <- get_rcppannoy(model$nn_index)
rcppannoy$unload() } else { rcppannoy <- get_rcppannoy(model$nn_index[[i]])
rcppannoy$unload() } } if (cleanup) { if (is.null(model$mod_dir)) {
tsmessage("Model is missing temp dir location, can't clean up")
return()
}
else {
mod_dir <- model$mod_dir if (!file.exists(mod_dir)) { tsmessage("model temp dir location '", mod_dir, "' no longer exists") return() } tsmessage("Deleting temp model dir ", mod_dir) res <- unlink(mod_dir, recursive = TRUE) if (res != 0) { tsmessage("Unable to delete tempdir ", mod_dir) } } } } all_nn_indices_are_loaded <- function(model) { if (is.null(model$nn_index)) {
stop("Invalid model: has no 'nn_index'")
}

if (is.list(model$nn_index) && is.null(model$nn_index$type)) { for (i in 1:length(model$nn_index)) {
rcppannoy <- get_rcppannoy(model$nn_index[[i]]) if (rcppannoy$getNTrees() == 0) {
return(FALSE)
}
}
}
else {
rcppannoy <- get_rcppannoy(model$nn_index) if (rcppannoy$getNTrees() == 0) {
return(FALSE)
}
}
TRUE
}

abspath <- function(filename) {
file.path(normalizePath(dirname(filename)), basename(filename))
}

# Half of whatever the C++ implementation thinks are the number of concurrent
# threads supported, but at least 1
max(1, hardware_concurrency() / 2)
}

# Get the number of vertices in X
x2nv <- function(X) {
if (is.list(X)) {
if (!is.null(X$idx)) { n_vertices <- x2nv(X$idx)
}
else {
if (length(X) > 0) {
n_vertices <- x2nv(X[])
}
else {
stop("Can't find n_vertices for list X")
}
}
}
else if (methods::is(X, "dist")) {
n_vertices <- attr(X, "Size")
}
else if (methods::is(X, "sparseMatrix")) {
n_vertices <- nrow(X)
}
else if (methods::is(X, "data.frame") || methods::is(X, "matrix")) {
n_vertices <- nrow(X)
}
else if (is.numeric(X)) {
n_vertices <- length(X)
}
else {
stop("Can't find number of vertices for X of type '", class(X), "'")
}
n_vertices
}

data2set <- function(X, Xcat, n_neighbors, metrics, nn_method,
n_trees, search_k,
method,
set_op_mix_ratio, local_connectivity, bandwidth,
perplexity, kernel,
ret_model,
n_vertices = x2nv(X),
tmpdir = tempdir(),
pca = NULL, pca_center = TRUE, pca_method = "irlba",
verbose = FALSE) {
V <- NULL
nns <- list()
nblocks <- length(metrics)

# Check for precalculated NN data in nn_method
if (is.list(nn_method)) {
if (is.null(nn_method$idx)) { nblocks <- length(nn_method) if (nblocks == 0) { stop("Incorrect format for precalculated neighbor data") } } else { nblocks <- 1 # wrap nn data in a list so data is always a list of lists nn_method <- list(nn_method) } metrics <- replicate(nblocks, NULL, simplify = FALSE) names(metrics) <- rep("precomputed", nblocks) } if (nblocks > 1) { tsmessage("Found ", nblocks, " blocks of data") } mnames <- tolower(names(metrics)) if (is.null(nn_method)) { if (n_vertices < 4096 && !ret_model && all(mnames == "euclidean")) { tsmessage("Using FNN for neighbor search, n_neighbors = ", n_neighbors) nn_method <- "fnn" } else { tsmessage("Using Annoy for neighbor search, n_neighbors = ", n_neighbors) nn_method <- "annoy" } } pca_models <- list() for (i in 1:nblocks) { metric <- mnames[[i]] metric <- match.arg(metric, c( "euclidean", "cosine", "manhattan", "hamming", "correlation", "precomputed" )) # Defaults for this block which can be overridden pca_i <- pca pca_center_i <- pca_center subset <- metrics[[i]] if (is.null(subset)) { Xsub <- X } else if (is.list(subset)) { # e.g. "euclidean" = list(1:10, pca_center = FALSE), lsres <- lsplit_unnamed(subset) if (is.null(lsres$unnamed)) {
stop("Error: no subset provided for block ", i)
}
if (length(lsres$unnamed) != 1) { stop("Error: only one unnamed item should be provided for block ", i) } subset <- lsres$unnamed[]

# possible overrides
if (!is.null(lsres$named)) { lsnamed <- lsres$named
lsnames <- names(lsnamed)
if (!is.null(lsnamed$pca_center)) { pca_center_i <- lsnamed$pca_center
}
# PCA argument can be NULL, so need to check if it was explicitly provided
if ("pca" %in% lsnames) {
pca_i <- lsnamed$pca } } Xsub <- X[, subset, drop = FALSE] } else { Xsub <- X[, subset, drop = FALSE] } if (!is.null(X) && is.matrix(X)) { block_size <- ncol(Xsub) if (block_size == 0) { stop("Block ", i, " has zero size") } if (nblocks > 1) { tsmessage( "Processing block ", i, " of ", nblocks, " with size ", block_size, " using metric '", metric, "'" ) } } else { # X is NULL or dist or something like that if (nblocks > 1) { tsmessage( "Processing block ", i, " of ", nblocks, " using metric '", metric, "'" ) } } if (!is.null(pca_i) && is.matrix(X) && metric != "hamming" && ncol(X) > pca_i && nrow(X) > pca_i) { tsmessage("Reducing column dimension to ", pca_i, " via PCA") pca_res <- pca_scores(Xsub, pca_i, ret_extra = ret_model, center = pca_center_i, pca_method = pca_method, verbose = verbose ) if (ret_model) { Xsub <- pca_res$scores
pca_models[[as.character(i)]] <- pca_res[c("center", "rotation")]
pca_res <- NULL
}
else {
Xsub <- pca_res
}
}

nn_sub <- nn_method
# Extract this block of nn data from list of lists
if (metric == "precomputed") {
nn_sub <- nn_method[[i]]
if (i == 1) {
n_neighbors <- NULL
}
else {
n_neighbors <- ncol(nn_method[]$idx) } } x2set_res <- x2set(Xsub, n_neighbors, metric, nn_method = nn_sub, n_trees, search_k, method, set_op_mix_ratio, local_connectivity, bandwidth, perplexity, kernel, n_threads, grain_size, ret_model, n_vertices = n_vertices, tmpdir = tmpdir, verbose = verbose ) Vblock <- x2set_res$V
nn <- x2set_res$nn nns[[i]] <- nn names(nns)[[i]] <- metric n_neighbors <- ncol(nn$idx)
if (is.null(V)) {
V <- Vblock
}
else {
V <- set_intersect(V, Vblock, weight = 0.5, reset = TRUE)
}
}

if (!is.null(Xcat)) {
V <- categorical_intersection_df(Xcat, V, weight = 0.5, verbose = verbose)
}

list(V = V, nns = nns, pca_models = pca_models)
}

x2nn <- function(X, n_neighbors, metric, nn_method,
n_trees, search_k,
tmpdir = tempdir(),
ret_model,
n_vertices = x2nv(X),
verbose = FALSE) {
if (is.list(nn_method)) {
# on first iteration n_neighbors is NULL
# on subsequent iterations ensure n_neighbors is consistent for all data
validate_nn(nn_method, n_vertices, n_neighbors = n_neighbors)
nn <- nn_method
}
else {
nn_method <- match.arg(tolower(nn_method), c("annoy", "fnn"))
if (nn_method == "fnn" && metric != "euclidean") {
stop(
"nn_method = 'FNN' is only compatible with distance metric ",
"'euclidean'"
)
}
if (nn_method == "fnn" && ret_model) {
stop("nn_method = 'FNN' is incompatible with ret_model = TRUE")
}
nn <- find_nn(X, n_neighbors,
method = nn_method, metric = metric,
n_trees = n_trees, search_k = search_k,
tmpdir = tmpdir,
ret_index = ret_model, verbose = verbose
)
}
nn
}

validate_nn <- function(nn_method, n_vertices, n_neighbors = NULL) {
if (!is.matrix(nn_method$idx)) { stop("Couldn't find precalculated 'idx' matrix") } if (nrow(nn_method$idx) != n_vertices) {
stop(
"Precalculated 'idx' matrix must have ", n_vertices,
" rows, but found ", nrow(nn_method$idx) ) } # set n_neighbors from these matrices if it hasn't been already set if (is.null(n_neighbors)) { n_neighbors <- ncol(nn_method$idx)
}
if (!is.matrix(nn_method$dist)) { stop("Couldn't find precalculated 'dist' matrix") } if (nrow(nn_method$idx) != n_vertices) {
stop("Precalculated 'dist' matrix must have ", n_vertices, " rows, but
found ", nrow(nn_method$dist)) } if (ncol(nn_method$dist) != n_neighbors) {
stop("Precalculated 'dist' matrix must have ", n_neighbors, " cols, but
found ", ncol(nn_method$dist)) } } nn2set <- function(method, nn, set_op_mix_ratio, local_connectivity, bandwidth, perplexity, kernel, n_threads, grain_size, verbose = FALSE) { if (method == "largevis") { n_vertices <- nrow(nn$dist)
if (perplexity >= n_vertices) {
stop("perplexity can be no larger than ", n_vertices - 1)
}
V <- perplexity_similarities(
nn = nn, perplexity = perplexity,
grain_size = grain_size,
kernel = kernel,
verbose = verbose
)
}
else {
V <- fuzzy_simplicial_set(
nn = nn,
set_op_mix_ratio = set_op_mix_ratio,
local_connectivity = local_connectivity,
bandwidth = bandwidth,
grain_size = grain_size,
verbose = verbose
)
}
}

x2set <- function(X, n_neighbors, metric, nn_method,
n_trees, search_k,
method,
set_op_mix_ratio, local_connectivity, bandwidth,
perplexity, kernel,
ret_model,
n_vertices = x2nv(X),
tmpdir = tempdir(),
verbose = FALSE) {
nn <- x2nn(X,
n_neighbors = n_neighbors,
metric = metric,
nn_method = nn_method,
n_trees = n_trees, search_k = search_k,
tmpdir = tmpdir,
ret_model = ret_model,
n_vertices = n_vertices,
verbose = verbose
)

if (any(is.infinite(nn$dist))) { stop("Infinite distances found in nearest neighbors") } gc() V <- nn2set(method, nn, set_op_mix_ratio, local_connectivity, bandwidth, perplexity, kernel, n_threads, grain_size, verbose = verbose ) if (any(is.na(V))) { stop("Non-finite entries in the input matrix") } gc() list( nn = nn, V = V ) } set_intersect <- function(A, B, weight = 0.5, reset = TRUE) { A <- general_simplicial_set_intersection( A, B, weight ) # https://github.com/lmcinnes/umap/issues/58#issuecomment-437633658 # For now always reset if (reset) { A <- reset_local_connectivity(Matrix::drop0(A)) } A } categorical_intersection_df <- function(X, V, weight = 0.5, verbose = FALSE) { tsmessage( "Carrying out categorical intersection for ", pluralize("column", ncol(X)) ) for (i in 1:ncol(X)) { V <- categorical_intersection(X[, i], V, weight = weight, verbose = (verbose && i == 1) ) } V } categorical_intersection <- function(x, V, weight, verbose = FALSE) { if (is.null(V)) { stop("V cannot be null for categorical intersection") } if (weight < 1.0) { far_dist <- 2.5 * (1.0 / (1.0 - weight)) } else { far_dist <- 1.0e12 } tsmessage( "Applying categorical set intersection, weight = ", formatC(weight), " far distance = ", formatC(far_dist) ) V <- categorical_simplicial_set_intersection(V, x, far_dist = far_dist, verbose = verbose ) V } # Creates the number of epochs per sample for each weight # weights are the non-zero input affinities (1-simplex) # n_epoch the total number of epochs # There is an inverse relationship between the weights and the return vector. make_epochs_per_sample <- function(weights, n_epochs) { result <- rep(-1, length(weights)) n_samples <- n_epochs * (weights / max(weights)) result[n_samples > 0] <- n_epochs / n_samples[n_samples > 0] result } # Create the a/b parameters from spread and min_dist find_ab_params <- function(spread = 1, min_dist = 0.001) { xv <- seq(from = 0, to = spread * 3, length.out = 300) yv <- rep(0, length(xv)) yv[xv < min_dist] <- 1 yv[xv >= min_dist] <- exp(-(xv[xv >= min_dist] - min_dist) / spread) result <- try( { stats::nls(yv ~ 1 / (1 + a * xv^(2 * b)), start = list(a = 1, b = 1) )$m$getPars() }, silent = TRUE ) if (class(result) == "try-error") { stop( "Can't find a, b for provided spread = ", spread, " min_dist = ", min_dist ) } result } # The default number of edge samples used by LargeVis lvish_samples <- function(n_vertices) { n_samples <- 0 if (n_vertices < 10000) { n_samples <- 1000 } else if (n_vertices < 1000000) { n_samples <- (n_vertices - 10000) * 9000 / (1000000 - 10000) + 1000 } else { n_samples <- n_vertices / 100 } round(n_samples * 1000000) } # Returns the number of epochs required to generate the default number of edge samples # used in LargeVis lvish_epochs <- function(n_vertices, V) { n_samples <- lvish_samples(n_vertices) round(n_samples * max(V) / sum(V)) } # Scale X according to various strategies scale_input <- function(X, scale_type, ret_model = FALSE, verbose = FALSE) { if (is.null(scale_type)) { scale_type <- "none" } else if (is.logical(scale_type)) { scale_type <- ifelse(scale_type, "scale", "none") } else if (tolower(scale_type) == "z") { scale_type <- "scale" } scale_type <- match.arg( tolower(scale_type), c("none", "scale", "range", "colrange", "maxabs") ) switch(scale_type, range = { tsmessage("Range scaling X") min_X <- min(X) X <- X - min_X max_X <- max(X) X <- X / max_X if (ret_model) { attr(X, "scaled:range:min") <- min_X attr(X, "scaled:range:max") <- max_X } }, colrange = { tsmessage("Column range scaling X") min_X <- apply(X, 2, min) X <- sweep(X, 2, min_X) max_X <- apply(X, 2, max) X <- sweep(X, 2, max_X, /) if (ret_model) { attr(X, "scaled:colrange:min") <- min_X attr(X, "scaled:colrange:max") <- max_X } }, maxabs = { tsmessage("Normalizing by max-abs") X <- base::scale(X, scale = FALSE) max_abs <- max(abs(X)) X <- X / max_abs if (ret_model) { attr(X, "scaled:maxabs") <- max_abs } }, scale = { tsmessage("Scaling to zero mean and unit variance") varf <- function(x) { sum((x - sum(x) / length(x))^2) } non_zero_var_cols <- apply(X, 2, varf) >= .Machine$double.xmin

if (length(non_zero_var_cols) == 0) {
stop("Matrix has zero variance")
}
X <- X[, non_zero_var_cols]
tsmessage("Kept ", ncol(X), " non-zero-variance columns")
X <- base::scale(X, scale = TRUE)

if (ret_model) {
attr(X, "scaled:nzvcols") <- which(non_zero_var_cols)
}
}
)
X
}

attr_to_scale_info <- function(X) {
Xattr <- attributes(X)
Xattr <- Xattr[startsWith(names(Xattr), "scaled:")]
if (length(Xattr) == 0) {
Xattr <- NULL
}
Xattr
}

get_opt_args <- function(opt_args, alpha) {
default_opt_args <- list(
sgd = list(alpha = alpha),
adam = list(alpha = alpha, beta1 = 0.5, beta2 = 0.9, eps = 1e-7)
)
if (is.null(opt_args)) {
opt_args <- list()
}
if (is.null(opt_args$method)) { opt_args$method <- "adam"
}
if (!(opt_args$method %in% names(default_opt_args))) { stop("Unknown optimization method '", opt_args$method, "'")
}
lmerge(default_opt_args[[opt_args\$method]], opt_args)
}

#' @useDynLib uwot, .registration=TRUE
#' @importFrom Rcpp sourceCpp