R/umap2.R

Defines functions umap2

Documented in umap2

#' Dimensionality Reduction with UMAP
#'
#' Carry out dimensionality reduction of a dataset using the Uniform Manifold
#' Approximation and Projection (UMAP) method (McInnes et al., 2018).
#'
#' This function behaves like \code{\link{umap}} except with some updated
#' defaults that make it behave more like the Python implementation and which
#' cannot be added to \code{\link{umap}} without breaking backwards
#' compatibility. In addition:
#'
#' \itemize{
#'   \item if \href{https://cran.r-project.org/package=RcppHNSW}{RcppHNSW} is
#'   installed, it will be used in preference to Annoy if a compatible metric
#'   is requested.
#'  \item if RcppHNSW is not present, but
#'  \href{https://cran.r-project.org/package=rnndescent}{rnndescent} is
#'   installed, it will be used in preference to Annoy if a compatible metric
#'   is requested.
#'  \item if \code{batch = TRUE} then the default \code{n_sgd_threads} is set
#'  to the same value as \code{n_threads}.
#'  \item if the input data \code{X} is a sparse matrix, it is interpreted
#'  similarly to a dense matrix or dataframe, and not as a distance matrix.
#'  This requires \code{rnndescent} package to be installed.
#' }
#'
#' @param X Input data. Can be a \code{\link{data.frame}}, \code{\link{matrix}},
#'   \code{\link[stats]{dist}} object or \code{\link[Matrix]{sparseMatrix}}.
#'   Matrix and data frames should contain one observation per row. Data frames
#'   will have any non-numeric columns removed, although factor columns will be
#'   used if explicitly included via \code{metric} (see the help for
#'   \code{metric} for details). Sparse matrices must be in the \code{dgCMatrix}
#'   format, and you must also install
#'   \href{https://cran.r-project.org/package=rnndescent}{rnndescent}
#'   and set \code{nn_method = "nndescent"}
#'   \code{X} can also be \code{NULL} if pre-computed nearest neighbor data is
#'   passed to \code{nn_method}, and \code{init} is not \code{"spca"} or
#'   \code{"pca"}.
#' @param n_neighbors The size of local neighborhood (in terms of number of
#'   neighboring sample points) used for manifold approximation. Larger values
#'   result in more global views of the manifold, while smaller values result in
#'   more local data being preserved. In general values should be in the range
#'   \code{2} to \code{100}.
#' @param n_components The dimension of the space to embed into. This defaults
#'   to \code{2} to provide easy visualization, but can reasonably be set to any
#'   integer value in the range \code{2} to \code{100}.
#' @param metric Type of distance metric to use to find nearest neighbors. For
#'  \code{nn_method = "annoy"} this can be one of:
#' \itemize{
#'   \item \code{"euclidean"} (the default)
#'   \item \code{"cosine"}
#'   \item \code{"manhattan"}
#'   \item \code{"hamming"}
#'   \item \code{"correlation"} (a distance based on the Pearson correlation)
#'   \item \code{"categorical"} (see below)
#' }
#' For \code{nn_method = "hnsw"} this can be one of:
#' \itemize{
#'   \item \code{"euclidean"}
#'   \item \code{"cosine"}
#'   \item \code{"correlation"}
#' }
#' If \href{https://cran.r-project.org/package=rnndescent}{rnndescent} is
#' installed and \code{nn_method = "nndescent"} is specified then many more
#' metrics are avaiable, including:
#' \itemize{
#' \item \code{"braycurtis"}
#' \item \code{"canberra"}
#' \item \code{"chebyshev"}
#' \item \code{"dice"}
#' \item \code{"hamming"}
#' \item \code{"hellinger"}
#' \item \code{"jaccard"}
#' \item \code{"jensenshannon"}
#' \item \code{"kulsinski"}
#' \item \code{"rogerstanimoto"}
#' \item \code{"russellrao"}
#' \item \code{"sokalmichener"}
#' \item \code{"sokalsneath"}
#' \item \code{"spearmanr"}
#' \item \code{"symmetrickl"}
#' \item \code{"tsss"}
#' \item \code{"yule"}
#' }
#' For more details see the package documentation of \code{rnndescent}.
#' For \code{nn_method = "fnn"}, the distance metric is always "euclidean".
#'
#' If \code{X} is a data frame or matrix, then multiple metrics can be
#' specified, by passing a list to this argument, where the name of each item in
#' the list is one of the metric names above. The value of each list item should
#' be a vector giving the names or integer ids of the columns to be included in
#' a calculation, e.g. \code{metric = list(euclidean = 1:4, manhattan = 5:10)}.
#'
#' Each metric calculation results in a separate fuzzy simplicial set, which are
#' intersected together to produce the final set. Metric names can be repeated.
#' Because non-numeric columns are removed from the data frame, it is safer to
#' use column names than integer ids.
#'
#' Factor columns can also be used by specifying the metric name
#' \code{"categorical"}. Factor columns are treated different from numeric
#' columns and although multiple factor columns can be specified in a vector,
#' each factor column specified is processed individually. If you specify
#' a non-factor column, it will be coerced to a factor.
#'
#' For a given data block, you may override the \code{pca} and \code{pca_center}
#' arguments for that block, by providing a list with one unnamed item
#' containing the column names or ids, and then any of the \code{pca} or
#' \code{pca_center} overrides as named items, e.g. \code{metric =
#' list(euclidean = 1:4, manhattan = list(5:10, pca_center = FALSE))}. This
#' exists to allow mixed binary and real-valued data to be included and to have
#' PCA applied to both, but with centering applied only to the real-valued data
#' (it is typical not to apply centering to binary data before PCA is applied).
#' @param n_epochs Number of epochs to use during the optimization of the
#'   embedded coordinates. By default, this value is set to \code{500} for
#'   datasets containing 10,000 vertices or less, and \code{200} otherwise.
#'   If \code{n_epochs = 0}, then coordinates determined by \code{"init"} will
#'   be returned.
#' @param scale Scaling to apply to \code{X} if it is a data frame or matrix:
#' \itemize{
#'   \item{\code{"none"} or \code{FALSE} or \code{NULL}} No scaling.
#'   \item{\code{"Z"} or \code{"scale"} or \code{TRUE}} Scale each column to
#'   zero mean and variance 1.
#'   \item{\code{"maxabs"}} Center each column to mean 0, then divide each
#'   element by the maximum absolute value over the entire matrix.
#'   \item{\code{"range"}} Range scale the entire matrix, so the smallest
#'   element is 0 and the largest is 1.
#'   \item{\code{"colrange"}} Scale each column in the range (0,1).
#' }
#' For UMAP, the default is \code{"none"}.
#' @param learning_rate Initial learning rate used in optimization of the
#'   coordinates.
#' @param init Type of initialization for the coordinates. Options are:
#'   \itemize{
#'     \item \code{"spectral"} Spectral embedding using the normalized Laplacian
#'     of the fuzzy 1-skeleton, with Gaussian noise added.
#'     \item \code{"normlaplacian"}. Spectral embedding using the normalized
#'     Laplacian of the fuzzy 1-skeleton, without noise.
#'     \item \code{"random"}. Coordinates assigned using a uniform random
#'     distribution between -10 and 10.
#'     \item \code{"lvrandom"}. Coordinates assigned using a Gaussian
#'     distribution with standard deviation 1e-4, as used in LargeVis
#'     (Tang et al., 2016) and t-SNE.
#'     \item \code{"laplacian"}. Spectral embedding using the Laplacian Eigenmap
#'     (Belkin and Niyogi, 2002).
#'     \item \code{"pca"}. The first two principal components from PCA of
#'     \code{X} if \code{X} is a data frame, and from a 2-dimensional classical
#'     MDS if \code{X} is of class \code{"dist"}.
#'     \item \code{"spca"}. Like \code{"pca"}, but each dimension is then scaled
#'     so the standard deviation is 1e-4, to give a distribution similar to that
#'     used in t-SNE. This is an alias for \code{init = "pca", init_sdev =
#'     1e-4}.
#'     \item \code{"agspectral"} An "approximate global" modification of
#'     \code{"spectral"} which all edges in the graph to a value of 1, and then
#'     sets a random number of edges (\code{negative_sample_rate} edges per
#'     vertex) to 0.1, to approximate the effect of non-local affinities.
#'     \item A matrix of initial coordinates.
#'   }
#'  For spectral initializations, (\code{"spectral"}, \code{"normlaplacian"},
#'  \code{"laplacian"}, \code{"agspectral"}), if more than one connected
#'  component is identified, no spectral initialization is attempted. Instead
#'  a PCA-based initialization is attempted. If \code{verbose = TRUE} the
#'  number of connected components are logged to the console. The existence of
#'  multiple connected components implies that a global view of the data cannot
#'  be attained with this initialization. Increasing the value of
#'  \code{n_neighbors} may help.
#' @param init_sdev If non-\code{NULL}, scales each dimension of the initialized
#'   coordinates (including any user-supplied matrix) to this standard
#'   deviation. By default, (\code{init_sdev = "range"}), each column of the
#'   initial coordinates are range scaled between 0-10. Scaling the input may
#'   help if the unscaled versions result in initial coordinates with large
#'   inter-point distances or outliers. This usually results in small gradients
#'   during optimization and very little progress being made to the layout.
#'   Shrinking the initial embedding by rescaling can help under these
#'   circumstances. Scaling the result of \code{init = "pca"} is usually
#'   recommended and \code{init = "spca"} as an alias for \code{init = "pca",
#'   init_sdev = 1e-4} but for the spectral initializations the scaled versions
#'   usually aren't necessary unless you are using a large value of
#'   \code{n_neighbors} (e.g. \code{n_neighbors = 150} or higher).
#' @param spread The effective scale of embedded points. In combination with
#'   \code{min_dist}, this determines how clustered/clumped the embedded points
#'   are.
#' @param min_dist The effective minimum distance between embedded points.
#'   Smaller values will result in a more clustered/clumped embedding where
#'   nearby points on the manifold are drawn closer together, while larger
#'   values will result on a more even dispersal of points. The value should be
#'   set relative to the \code{spread} value, which determines the scale at
#'   which embedded points will be spread out.
#' @param set_op_mix_ratio Interpolate between (fuzzy) union and intersection as
#'   the set operation used to combine local fuzzy simplicial sets to obtain a
#'   global fuzzy simplicial sets. Both fuzzy set operations use the product
#'   t-norm. The value of this parameter should be between \code{0.0} and
#'   \code{1.0}; a value of \code{1.0} will use a pure fuzzy union, while
#'   \code{0.0} will use a pure fuzzy intersection.
#' @param local_connectivity The local connectivity required -- i.e. the number
#'   of nearest neighbors that should be assumed to be connected at a local
#'   level. The higher this value the more connected the manifold becomes
#'   locally. In practice this should be not more than the local intrinsic
#'   dimension of the manifold.
#' @param bandwidth The effective bandwidth of the kernel if we view the
#'   algorithm as similar to Laplacian Eigenmaps. Larger values induce more
#'   connectivity and a more global view of the data, smaller values concentrate
#'   more locally.
#' @param repulsion_strength Weighting applied to negative samples in low
#'   dimensional embedding optimization. Values higher than one will result in
#'   greater weight being given to negative samples.
#' @param negative_sample_rate The number of negative edge/1-simplex samples to
#'   use per positive edge/1-simplex sample in optimizing the low dimensional
#'   embedding.
#' @param a More specific parameters controlling the embedding. If \code{NULL}
#'   these values are set automatically as determined by \code{min_dist} and
#'   \code{spread}.
#' @param b More specific parameters controlling the embedding. If \code{NULL}
#'   these values are set automatically as determined by \code{min_dist} and
#'   \code{spread}.
#' @param nn_method Method for finding nearest neighbors. Options are:
#'   \itemize{
#'     \item \code{"fnn"}. Use exact nearest neighbors via the
#'       \href{https://cran.r-project.org/package=FNN}{FNN} package.
#'     \item \code{"annoy"} Use approximate nearest neighbors via the
#'       \href{https://cran.r-project.org/package=RcppAnnoy}{RcppAnnoy} package.
#'     \item \code{"hnsw"} Use approximate nearest neighbors with the
#'       Hierarchical Navigable Small World (HNSW) method (Malkov and Yashunin,
#'       2018) via the
#'       \href{https://cran.r-project.org/package=RcppHNSW}{RcppHNSW} package.
#'       \code{RcppHNSW} is not a dependency of this package: this option is
#'       only available if you have installed \code{RcppHNSW} yourself. Also,
#'       HNSW only supports the following arguments for \code{metric} and
#'       \code{target_metric}: \code{"euclidean"}, \code{"cosine"} and
#'       \code{"correlation"}.
#'     \item \code{"nndescent"} Use approximate nearest neighbors with the
#'       Nearest Neighbor Descent method (Dong et al., 2011) via the
#'       \href{https://cran.r-project.org/package=rnndescent}{rnndescent}
#'       package. \code{rnndescent} is not a dependency of this package: this
#'       option is only available if you have installed \code{rnndescent}
#'       yourself.
#'    }
#'   By default, if \code{X} has less than 4,096 vertices, the exact nearest
#'   neighbors are found. Otherwise, approximate nearest neighbors are used.
#'   You may also pass pre-calculated nearest neighbor data to this argument. It
#'   must be one of two formats, either a list consisting of two elements:
#'   \itemize{
#'     \item \code{"idx"}. A \code{n_vertices x n_neighbors} matrix
#'     containing the integer indexes of the nearest neighbors in \code{X}. Each
#'     vertex is considered to be its own nearest neighbor, i.e.
#'     \code{idx[, 1] == 1:n_vertices}.
#'     \item \code{"dist"}. A \code{n_vertices x n_neighbors} matrix
#'     containing the distances of the nearest neighbors.
#'   }
#'   or a sparse distance matrix of type \code{dgCMatrix}, with dimensions
#'   \code{n_vertices x n_vertices}. Distances should be arranged by column,
#'   i.e. a non-zero entry in row \code{j} of the \code{i}th column indicates
#'   that the \code{j}th observation in \code{X} is a nearest neighbor of the
#'   \code{i}th observation with the distance given by the value of that
#'   element.
#'   The \code{n_neighbors} parameter is ignored when using precomputed
#'   nearest neighbor data. If using the sparse distance matrix input, each
#'   column can contain a different number of neighbors.
#' @param n_trees Number of trees to build when constructing the nearest
#'   neighbor index. The more trees specified, the larger the index, but the
#'   better the results. With \code{search_k}, determines the accuracy of the
#'   Annoy nearest neighbor search. Only used if the \code{nn_method} is
#'   \code{"annoy"}. Sensible values are between \code{10} to \code{100}.
#' @param search_k Number of nodes to search during the neighbor retrieval. The
#'   larger k, the more the accurate results, but the longer the search takes.
#'   With \code{n_trees}, determines the accuracy of the Annoy nearest neighbor
#'   search. Only used if the \code{nn_method} is \code{"annoy"}.
#' @param nn_args A list containing additional arguments to pass to the nearest
#'   neighbor method. For \code{nn_method = "annoy"}, you can specify
#'   \code{"n_trees"} and \code{"search_k"}, and these will override the
#'   \code{n_trees} and \code{search_k} parameters.
#'   For \code{nn_method = "hnsw"}, you may specify the following arguments:
#'   \itemize{
#'   \item \code{M} The maximum number of neighbors to keep for each vertex.
#'   Reasonable values are \code{2} to \code{100}. Higher values give better
#'   recall at the cost of more memory. Default value is \code{16}.
#'   \item \code{ef_construction} A positive integer specifying the size of
#'   the dynamic list used during index construction. A higher value will
#'   provide better results at the cost of a longer time to build the index.
#'   Default is \code{200}.
#'   \item \code{ef} A positive integer specifying the size of the dynamic
#'   list used during search. This cannot be smaller than \code{n_neighbors}
#'   and cannot be higher than the number of items in the index. Default is
#'   \code{10}.
#'   }
#'   For \code{nn_method = "nndescent"}, you may specify the following
#'   arguments:
#'   \itemize{
#'   \item \code{n_trees} The number of trees to use in a random projection
#'   forest to initialize the search. A larger number will give more accurate
#'   results at the cost of a longer computation time. The default of
#'   \code{NULL} means that the number is chosen based on the number of
#'   observations in \code{X}.
#'   \item \code{max_candidates} The number of potential neighbors to explore
#'   per iteration. By default, this is set to \code{n_neighbors} or \code{60},
#'   whichever is smaller. A larger number will give more accurate results at
#'   the cost of a longer computation time.
#'   \item \code{n_iters} The number of iterations to run the search. A larger
#'   number will give more accurate results at the cost of a longer computation
#'   time. By default, this will be chosen based on the number of observations
#'   in \code{X}. You may also need to modify the convergence criterion
#'   \code{delta}.
#'   \item \code{delta} The minimum relative change in the neighbor graph
#'   allowed before early stopping. Should be a value between 0 and 1. The
#'   smaller the value, the smaller the amount of progress between iterations is
#'   allowed. Default value of \code{0.001} means that at least 0.1% of the
#'   neighbor graph must be updated at each iteration.
#'   \item \code{init} How to initialize the nearest neighbor descent. By
#'   default this is set to \code{"tree"} and uses a random project forest.
#'   If you set this to \code{"rand"}, then a random selection is used. Usually
#'   this is less accurate than using RP trees, but for high-dimensional cases,
#'   there may be little difference in the quality of the initialization and
#'   random initialization will be a lot faster. If you set this to
#'   \code{"rand"}, then the \code{n_trees} parameter is ignored.
#'   \item \code{pruning_degree_multiplier} The maximum number of edges per node
#'   to retain in the search graph, relative to \code{n_neighbors}. A larger
#'   value will give more accurate results at the cost of a longer computation
#'   time. Default is \code{1.5}. This parameter only affects neighbor search
#'   when transforming new data with \code{\link{umap_transform}}.
#'   \item \code{epsilon} Controls the degree of the back-tracking when
#'   traversing the search graph. Setting this to \code{0.0} will do a greedy
#'   search with no back-tracking. A larger value will give more accurate
#'   results at the cost of a longer computation time. Default is \code{0.1}.
#'   This parameter only affects neighbor search when transforming new data with
#'   \code{\link{umap_transform}}.
#'   \item \code{max_search_fraction} Specifies the maximum fraction of the
#'   search graph to traverse. By default, this is set to \code{1.0}, so the
#'   entire graph (i.e. all items in \code{X}) may be visited. You may want to
#'   set this to a smaller value if you have a very large dataset (in
#'   conjunction with \code{epsilon}) to avoid an inefficient exhaustive search
#'   of the data in \code{X}. This parameter only affects neighbor search when
#'   transforming new data with \code{\link{umap_transform}}.
#'   }
#' @param approx_pow If \code{TRUE}, use an approximation to the power function
#'   in the UMAP gradient, from
#'   \url{https://martin.ankerl.com/2012/01/25/optimized-approximative-pow-in-c-and-cpp/}.
#'   Ignored if \code{dens_scale} is non-\code{NULL}.
#' @param y Optional target data for supervised dimension reduction. Can be a
#' vector, matrix or data frame. Use the \code{target_metric} parameter to
#' specify the metrics to use, using the same syntax as \code{metric}. Usually
#' either a single numeric or factor column is used, but more complex formats
#' are possible. The following types are allowed:
#'   \itemize{
#'     \item Factor columns with the same length as \code{X}. \code{NA} is
#'     allowed for any observation with an unknown level, in which case
#'     UMAP operates as a form of semi-supervised learning. Each column is
#'     treated separately.
#'     \item Numeric data. \code{NA} is \emph{not} allowed in this case. Use the
#'     parameter \code{target_n_neighbors} to set the number of neighbors used
#'     with \code{y}. If unset, \code{n_neighbors} is used. Unlike factors,
#'     numeric columns are grouped into one block unless \code{target_metric}
#'     specifies otherwise. For example, if you wish columns \code{a} and
#'     \code{b} to be treated separately, specify
#'     \code{target_metric = list(euclidean = "a", euclidean = "b")}. Otherwise,
#'     the data will be effectively treated as a matrix with two columns.
#'     \item Nearest neighbor data, consisting of a list of two matrices,
#'     \code{idx} and \code{dist}. These represent the precalculated nearest
#'     neighbor indices and distances, respectively. This
#'     is the same format as that expected for precalculated data in
#'     \code{nn_method}. This format assumes that the underlying data was a
#'     numeric vector. Any user-supplied value of the \code{target_n_neighbors}
#'     parameter is ignored in this case, because the the number of columns in
#'     the matrices is used for the value. Multiple nearest neighbor data using
#'     different metrics can be supplied by passing a list of these lists.
#'   }
#' Unlike \code{X}, all factor columns included in \code{y} are automatically
#' used.
#' @param target_n_neighbors Number of nearest neighbors to use to construct the
#'   target simplicial set. Default value is \code{n_neighbors}. Applies only if
#'   \code{y} is non-\code{NULL} and \code{numeric}.
#' @param target_metric The metric used to measure distance for \code{y} if
#'   using supervised dimension reduction. Used only if \code{y} is numeric.
#' @param target_weight Weighting factor between data topology and target
#'   topology. A value of 0.0 weights entirely on data, a value of 1.0 weights
#'   entirely on target. The default of 0.5 balances the weighting equally
#'   between data and target. Only applies if \code{y} is non-\code{NULL}.
#' @param pca If set to a positive integer value, reduce data to this number of
#'   columns using PCA. Doesn't applied if the distance \code{metric} is
#'   \code{"hamming"}, or the dimensions of the data is larger than the
#'   number specified (i.e. number of rows and columns must be larger than the
#'   value of this parameter). If you have > 100 columns in a data frame or
#'   matrix, reducing the number of columns in this way may substantially
#'   increase the performance of the nearest neighbor search at the cost of a
#'   potential decrease in accuracy. In many t-SNE applications, a value of 50
#'   is recommended, although there's no guarantee that this is appropriate for
#'   all settings.
#' @param pca_center If \code{TRUE}, center the columns of \code{X} before
#'   carrying out PCA. For binary data, it's recommended to set this to
#'   \code{FALSE}.
#' @param pca_method Method to carry out any PCA dimensionality reduction when
#'   the \code{pca} parameter is specified. Allowed values are:
#'   \itemize{
#'     \item{\code{"irlba"}}. Uses \code{\link[irlba]{prcomp_irlba}} from the
#'     \href{https://cran.r-project.org/package=irlba}{irlba} package.
#'     \item{\code{"rsvd"}}. Uses 5 iterations of \code{\link[irlba]{svdr}} from
#'     the \href{https://cran.r-project.org/package=irlba}{irlba} package.
#'     This is likely to give much faster but potentially less accurate results
#'     than using \code{"irlba"}. For the purposes of nearest neighbor
#'     calculation and coordinates initialization, any loss of accuracy doesn't
#'     seem to matter much.
#'     \item{\code{"bigstatsr"}}. Uses \code{\link[bigstatsr]{big_randomSVD}}
#'     from the \href{https://cran.r-project.org/package=bigstatsr}{bigstatsr}
#'     package. The SVD methods used in \code{bigstatsr} may be faster on
#'     systems without access to efficient linear algebra libraries (e.g.
#'     Windows). \strong{Note}: \code{bigstatsr} is \emph{not} a dependency of
#'     uwot: if you choose to use this package for PCA, you \emph{must} install
#'     it yourself.
#'     \item{\code{"svd"}}. Uses \code{\link[base]{svd}} for the SVD. This is
#'     likely to be slow for all but the smallest datasets.
#'     \item{\code{"auto"}} (the default). Uses \code{"irlba"}, unless more than
#'     50% of the full set of singular vectors would be calculated, in which
#'     case \code{"svd"} is used.
#'   }
#' @param pcg_rand If \code{TRUE}, use the PCG random number generator (O'Neill,
#'   2014) during optimization. Otherwise, use the faster (but probably less
#'   statistically good) Tausworthe "taus88" generator. The default is
#'   \code{TRUE}.
#' @param fast_sgd If \code{TRUE}, then the following combination of parameters
#'   is set: \code{pcg_rand = TRUE}, \code{n_sgd_threads = "auto"} and
#'   \code{approx_pow = TRUE}. The default is \code{FALSE}. Setting this to
#'   \code{TRUE} will speed up the stochastic optimization phase, but give a
#'   potentially less accurate embedding, and which will not be exactly
#'   reproducible even with a fixed seed. For visualization, \code{fast_sgd =
#'   TRUE} will give perfectly good results. For more generic dimensionality
#'   reduction, it's safer to leave \code{fast_sgd = FALSE}. If \code{fast_sgd =
#'   TRUE}, then user-supplied values of \code{pcg_rand}, \code{n_sgd_threads},
#'   and \code{approx_pow} are ignored.
#' @param batch If \code{TRUE}, then embedding coordinates are updated at the
#'   end of each epoch rather than during the epoch. In batch mode, results are
#'   reproducible with a fixed random seed even with \code{n_sgd_threads > 1},
#'   at the cost of a slightly higher memory use. You may also have to modify
#'   \code{learning_rate} and increase \code{n_epochs}, so whether this provides
#'   a speed increase over the single-threaded optimization is likely to be
#'   dataset and hardware-dependent.
#' @param ret_model If \code{TRUE}, then return extra data that can be used to
#'   add new data to an existing embedding via \code{\link{umap_transform}}. The
#'   embedded coordinates are returned as the list item \code{embedding}. If
#'   \code{FALSE}, just return the coordinates. This parameter can be used in
#'   conjunction with \code{ret_nn} and \code{ret_extra}. Note that some
#'   settings are incompatible with the production of a UMAP model: external
#'   neighbor data (passed via a list to \code{nn_method}), and factor columns
#'   that were included via the \code{metric} parameter. In the latter case, the
#'   model produced is based only on the numeric data. A transformation using
#'   new data is possible, but the factor columns in the new data are ignored.
#'   Note that setting \code{ret_model = TRUE} forces the use of the approximate
#'   nearest neighbors method. Because small datasets would otherwise use exact
#'   nearest neighbor calculations, setting \code{ret_model = TRUE} means that
#'   different results may be returned for small datasets in terms of both the
#'   returned nearest neighbors (if requested) and the final embedded
#'   coordinates, compared to \code{ret_model = FALSE}, even if the random
#'   number seed is fixed. To avoid this, explicitly set
#'   \code{nn_method = "annoy"} in the \code{ret_model = FALSE} case.
#' @param ret_nn If \code{TRUE}, then in addition to the embedding, also return
#'   nearest neighbor data that can be used as input to \code{nn_method} to
#'   avoid the overhead of repeatedly calculating the nearest neighbors when
#'   manipulating unrelated parameters (e.g. \code{min_dist}, \code{n_epochs},
#'   \code{init}). See the "Value" section for the names of the list items. If
#'   \code{FALSE}, just return the coordinates. Note that the nearest neighbors
#'   could be sensitive to data scaling, so be wary of reusing nearest neighbor
#'   data if modifying the \code{scale} parameter. This parameter can be used in
#'   conjunction with \code{ret_model} and \code{ret_extra}.
#' @param ret_extra A vector indicating what extra data to return. May contain
#'   any combination of the following strings:
#'   \itemize{
#'     \item \code{"model"} Same as setting \code{ret_model = TRUE}.
#'     \item \code{"nn"} Same as setting \code{ret_nn = TRUE}.
#'     \item \code{"fgraph"} the high dimensional fuzzy graph (i.e. the fuzzy
#'       simplicial set of the merged local views of the input data). The graph
#'       is returned as a sparse symmetric N x N matrix of class
#'       \link[Matrix]{dgCMatrix-class}, where a non-zero entry (i, j) gives the
#'       membership strength of the edge connecting vertex i and vertex j. This
#'       can be considered analogous to the input probability (or similarity or
#'       affinity) used in t-SNE and LargeVis. Note that the graph is further
#'       sparsified by removing edges with sufficiently low membership strength
#'       that they would not be sampled by the probabilistic edge sampling
#'       employed for optimization and therefore the number of non-zero elements
#'       in the matrix is dependent on \code{n_epochs}. If you are only
#'       interested in the fuzzy input graph (e.g. for clustering), setting
#'       \code{n_epochs = 0} will avoid any further sparsifying.
#'       Be aware that setting `binary_edge_weights = TRUE` will affect this
#'       graph (all non-zero edge weights will be 1).
#'    \item \code{"sigma"} the normalization value for each observation in the
#'       dataset when constructing the smoothed distances to each of its
#'       neighbors. This gives some sense of the local density of each
#'       observation in the high dimensional space: higher values of
#'       \code{sigma} indicate a higher dispersion or lower density.
#'   }
#' @param n_threads Number of threads to use (except during stochastic gradient
#'   descent). Default is half the number of concurrent threads supported by the
#'   system. For nearest neighbor search, only applies if
#'   \code{nn_method = "annoy"}. If \code{n_threads > 1}, then the Annoy index
#'   will be temporarily written to disk in the location determined by
#'   \code{\link[base]{tempfile}}.
#' @param n_sgd_threads Number of threads to use during stochastic gradient
#'   descent. If set to > 1, then be aware that if \code{batch = FALSE}, results
#'   will \emph{not} be reproducible, even if \code{set.seed} is called with a
#'   fixed seed before running. Set to \code{"auto"} to use the same value as
#'   \code{n_threads}. Default is to use only one thread, unless
#'   \code{batch = TRUE} in which case \code{"auto"} used.
#' @param grain_size The minimum amount of work to do on each thread. If this
#'   value is set high enough, then less than \code{n_threads} or
#'   \code{n_sgd_threads} will be used for processing, which might give a
#'   performance improvement if the overhead of thread management and context
#'   switching was outweighing the improvement due to concurrent processing.
#'   This should be left at default (\code{1}) and work will be spread evenly
#'   over all the threads specified.
#' @param tmpdir Temporary directory to store nearest neighbor indexes during
#'   nearest neighbor search. Default is \code{\link{tempdir}}. The index is
#'   only written to disk if \code{n_threads > 1} and
#'   \code{nn_method = "annoy"}; otherwise, this parameter is ignored.
#' @param verbose If \code{TRUE}, log details to the console.
#' @param opt_args A list of optimizer parameters, used when
#'   \code{batch = TRUE}. The default optimization method used is Adam (Kingma
#'   and Ba, 2014).
#'   \itemize{
#'     \item \code{method} The optimization method to use. Either \code{"adam"}
#'     or \code{"sgd"} (stochastic gradient descent). Default: \code{"adam"}.
#'     \item \code{beta1} (Adam only). The weighting parameter for the
#'     exponential moving average of the first moment estimator. Effectively the
#'     momentum parameter. Should be a floating point value between 0 and 1.
#'     Higher values can smooth oscillatory updates in poorly-conditioned
#'     situations and may allow for a larger \code{learning_rate} to be
#'     specified, but too high can cause divergence. Default: \code{0.5}.
#'     \item \code{beta2} (Adam only). The weighting parameter for the
#'     exponential moving average of the uncentered second moment estimator.
#'     Should be a floating point value between 0 and 1. Controls the degree of
#'     adaptivity in the step-size. Higher values put more weight on previous
#'     time steps. Default: \code{0.9}.
#'     \item \code{eps} (Adam only). Intended to be a small value to prevent
#'     division by zero, but in practice can also affect convergence due to its
#'     interaction with \code{beta2}. Higher values reduce the effect of the
#'     step-size adaptivity and bring the behavior closer to stochastic gradient
#'     descent with momentum. Typical values are between 1e-8 and 1e-3. Default:
#'     \code{1e-7}.
#'     \item \code{alpha} The initial learning rate. Default: the value of the
#'     \code{learning_rate} parameter.
#'   }
#' @param epoch_callback A function which will be invoked at the end of every
#'   epoch. Its signature should be: \code{(epoch, n_epochs, coords)}, where:
#'   \itemize{
#'     \item \code{epoch} The current epoch number (between \code{1} and
#'     \code{n_epochs}).
#'     \item \code{n_epochs} Number of epochs to use during the optimization of
#'     the embedded coordinates.
#'     \item \code{coords} The embedded coordinates as of the end of the current
#'     epoch, as a matrix with dimensions (N, \code{n_components}).
#'   }
#' @param binary_edge_weights If \code{TRUE} then edge weights in the input
#'   graph are treated as binary (0/1) rather than real valued. This affects the
#'   sampling frequency of neighbors and is the strategy used by the PaCMAP
#'   method (Wang and co-workers, 2020). Practical (Böhm and co-workers, 2020)
#'   and theoretical (Damrich and Hamprecht, 2021) work suggests this has little
#'   effect on UMAP's performance.
#' @param dens_scale A value between 0 and 1. If > 0 then the output attempts
#'   to preserve relative local density around each observation. This uses an
#'   approximation to the densMAP method (Narayan and co-workers, 2021). The
#'   larger the value of \code{dens_scale}, the greater the range of output
#'   densities that will be used to map the input densities. This option is
#'   ignored if using multiple \code{metric} blocks.
#' @param seed Integer seed to use to initialize the random number generator
#'   state. Combined with \code{n_sgd_threads = 1} or \code{batch = TRUE}, this
#'   should give consistent output across multiple runs on a given installation.
#'   Setting this value is equivalent to calling \code{\link[base]{set.seed}},
#'   but it may be more convenient in some situations than having to call a
#'   separate function. The default is to not set a seed. If
#'   \code{ret_model = TRUE}, the seed will be stored in the output model and
#'   then used to set the seed inside \code{\link{umap_transform}}.
#' @return A matrix of optimized coordinates, or:
#'   \itemize{
#'     \item if \code{ret_model = TRUE} (or \code{ret_extra} contains
#'     \code{"model"}), returns a list containing extra information that can be
#'     used to add new data to an existing embedding via
#'     \code{\link{umap_transform}}. In this case, the coordinates are available
#'     in the list item \code{embedding}. \bold{NOTE}: The contents of
#'     the \code{model} list should \emph{not} be considered stable or part of
#'     the public API, and are purposely left undocumented.
#'     \item if \code{ret_nn = TRUE} (or \code{ret_extra} contains \code{"nn"}),
#'     returns the nearest neighbor data as a list called \code{nn}. This
#'     contains one list for each \code{metric} calculated, itself containing a
#'     matrix \code{idx} with the integer ids of the neighbors; and a matrix
#'     \code{dist} with the distances. The \code{nn} list (or a sub-list) can be
#'     used as input to the \code{nn_method} parameter.
#'     \item if \code{ret_extra} contains \code{"fgraph"}, returns the high
#'     dimensional fuzzy graph as a sparse matrix called \code{fgraph}, of type
#'     \link[Matrix]{dgCMatrix-class}.
#'     \item if \code{ret_extra} contains \code{"sigma"}, returns a vector of the
#'     smooth knn distance normalization terms for each observation as
#'     \code{"sigma"} and a vector \code{"rho"} containing the largest
#'     distance to the locally connected neighbors of each observation.
#'     \item if \code{ret_extra} contains \code{"localr"}, returns a vector of
#'     the estimated local radii, the sum of \code{"sigma"} and \code{"rho"}.
#'   }
#'   The returned list contains the combined data from any combination of
#'   specifying \code{ret_model}, \code{ret_nn} and \code{ret_extra}.
#' @examples
#'
#' iris30 <- iris[c(1:10, 51:60, 101:110), ]
#' iris_umap <- umap2(iris30, n_neighbors = 5)
#'
#' @references
#' Belkin, M., & Niyogi, P. (2002).
#' Laplacian eigenmaps and spectral techniques for embedding and clustering.
#' In \emph{Advances in neural information processing systems}
#' (pp. 585-591).
#' \url{http://papers.nips.cc/paper/1961-laplacian-eigenmaps-and-spectral-techniques-for-embedding-and-clustering.pdf}
#'
#' Böhm, J. N., Berens, P., & Kobak, D. (2020).
#' A unifying perspective on neighbor embeddings along the attraction-repulsion spectrum.
#' \emph{arXiv preprint} \emph{arXiv:2007.08902}.
#' \url{https://arxiv.org/abs/2007.08902}
#'
#' Damrich, S., & Hamprecht, F. A. (2021).
#' On UMAP's true loss function.
#' \emph{Advances in Neural Information Processing Systems}, \emph{34}.
#' \url{https://proceedings.neurips.cc/paper/2021/hash/2de5d16682c3c35007e4e92982f1a2ba-Abstract.html}
#'
#' Dong, W., Moses, C., & Li, K. (2011, March).
#' Efficient k-nearest neighbor graph construction for generic similarity measures.
#' In \emph{Proceedings of the 20th international conference on World Wide Web}
#' (pp. 577-586).
#' ACM.
#' \doi{10.1145/1963405.1963487}.
#'
#' Kingma, D. P., & Ba, J. (2014).
#' Adam: A method for stochastic optimization.
#' \emph{arXiv preprint} \emph{arXiv}:1412.6980.
#' \url{https://arxiv.org/abs/1412.6980}
#'
#' Malkov, Y. A., & Yashunin, D. A. (2018).
#' Efficient and robust approximate nearest neighbor search using hierarchical
#' navigable small world graphs.
#' \emph{IEEE transactions on pattern analysis and machine intelligence}, \emph{42}(4), 824-836.
#'
#' McInnes, L., Healy, J., & Melville, J. (2018).
#' UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction
#' \emph{arXiv preprint} \emph{arXiv}:1802.03426.
#' \url{https://arxiv.org/abs/1802.03426}
#'
#' Narayan, A., Berger, B., & Cho, H. (2021).
#' Assessing single-cell transcriptomic variability through density-preserving data visualization.
#' \emph{Nature biotechnology}, \emph{39}(6), 765-774.
#' \doi{10.1038/s41587-020-00801-7}
#'
#' O’Neill, M. E. (2014).
#' \emph{PCG: A family of simple fast space-efficient statistically good
#' algorithms for random number generation}
#' (Report No. HMC-CS-2014-0905). Harvey Mudd College.
#'
#' Tang, J., Liu, J., Zhang, M., & Mei, Q. (2016, April).
#' Visualizing large-scale and high-dimensional data.
#' In \emph{Proceedings of the 25th International Conference on World Wide Web}
#' (pp. 287-297).
#' International World Wide Web Conferences Steering Committee.
#' \url{https://arxiv.org/abs/1602.00370}
#'
#' Van der Maaten, L., & Hinton, G. (2008).
#' Visualizing data using t-SNE.
#' \emph{Journal of Machine Learning Research}, \emph{9} (2579-2605).
#' \url{https://www.jmlr.org/papers/v9/vandermaaten08a.html}
#'
#' Wang, Y., Huang, H., Rudin, C., & Shaposhnik, Y. (2021).
#' Understanding How Dimension Reduction Tools Work: An Empirical Approach to Deciphering t-SNE, UMAP, TriMap, and PaCMAP for Data Visualization.
#' \emph{Journal of Machine Learning Research}, \emph{22}(201), 1-73.
#' \url{https://www.jmlr.org/papers/v22/20-1061.html}
#'
#' @export
umap2 <-
  function(X,
           n_neighbors = 15,
           n_components = 2,
           metric = "euclidean",
           n_epochs = NULL,
           learning_rate = 1,
           scale = FALSE,
           init = "spectral",
           init_sdev = "range",
           spread = 1,
           min_dist = 0.1,
           set_op_mix_ratio = 1.0,
           local_connectivity = 1.0,
           bandwidth = 1.0,
           repulsion_strength = 1.0,
           negative_sample_rate = 5.0,
           a = NULL,
           b = NULL,
           nn_method = NULL,
           n_trees = 50,
           search_k = 2 * n_neighbors * n_trees,
           approx_pow = FALSE,
           y = NULL,
           target_n_neighbors = n_neighbors,
           target_metric = "euclidean",
           target_weight = 0.5,
           pca = NULL,
           pca_center = TRUE,
           pcg_rand = TRUE,
           fast_sgd = FALSE,
           ret_model = FALSE,
           ret_nn = FALSE,
           ret_extra = c(),
           n_threads = NULL,
           n_sgd_threads = 0,
           grain_size = 1,
           tmpdir = tempdir(),
           verbose = getOption("verbose", TRUE),
           batch = TRUE,
           opt_args = NULL,
           epoch_callback = NULL,
           pca_method = NULL,
           binary_edge_weights = FALSE,
           dens_scale = NULL,
           seed = NULL,
           nn_args = list()) {
    if (is.null(nn_method)) {
      if (is_installed("RcppHNSW") &&
        is.character(metric) &&
        is_ok_hnsw_metric(metric) &&
        is_ok_hnsw_metric(target_metric)) {
        nn_method <- "hnsw"
        tsmessage("Using HNSW for nearest neighbor search")
      }
    }

    if (is.null(nn_method)) {
      if (is_installed("rnndescent")) {
        nn_method <- "nndescent"
        tsmessage("Using NN-Descent for nearest neighbor search")
      }
    }

    if (is.null(n_threads)) {
      n_threads <- default_num_threads()
      if (batch) {
        n_sgd_threads <- n_threads
      }
    }

    if (is_sparse_matrix(X)) {
      if (!methods::is(X, "dgCMatrix")) {
        stop("sparse X must be a dgCMatrix object")
      }
      if (!is.list(nn_method)) {
        if (!is_installed("rnndescent")) {
          stop(
            "nearest neighbor search for sparse matrices requires the ",
            "'rnndescent' package, please install it"
          )
        }
        if (!is.null(nn_method) && nn_method != "nndescent") {
          stop(
            "nearest neighbor search for sparse matrices only supports ",
            "the 'nndescent' method"
          )
        }
        tsmessage("Using nndescent for nearest neighbor search")
        nn_method <- "nndescent"
      }
    }

    if (is.null(n_epochs)) {
      n_epochs <- 500
    }

    if (is.numeric(a) &&
      is.numeric(b) && a == 1 && b == 1 && is.null(dens_scale)) {
      method <- "tumap"
    } else {
      method <- "umap"
    }
    uwot(
      X = X,
      n_neighbors = n_neighbors,
      n_components = n_components,
      metric = metric,
      n_epochs = n_epochs,
      alpha = learning_rate,
      scale = scale,
      init = init,
      init_sdev = init_sdev,
      spread = spread,
      min_dist = min_dist,
      set_op_mix_ratio = set_op_mix_ratio,
      local_connectivity = local_connectivity,
      bandwidth = bandwidth,
      gamma = repulsion_strength,
      negative_sample_rate = negative_sample_rate,
      a = a,
      b = b,
      nn_method = nn_method,
      n_trees = n_trees,
      search_k = search_k,
      method = method,
      approx_pow = approx_pow,
      n_threads = n_threads,
      n_sgd_threads = n_sgd_threads,
      grain_size = grain_size,
      y = y,
      target_n_neighbors = target_n_neighbors,
      target_weight = target_weight,
      target_metric = target_metric,
      pca = pca,
      pca_center = pca_center,
      pca_method = pca_method,
      pcg_rand = pcg_rand,
      fast_sgd = fast_sgd,
      ret_model = ret_model || "model" %in% ret_extra,
      ret_nn = ret_nn || "nn" %in% ret_extra,
      ret_fgraph = "fgraph" %in% ret_extra,
      ret_sigma = "sigma" %in% ret_extra,
      ret_localr = "localr" %in% ret_extra,
      batch = batch,
      opt_args = opt_args,
      epoch_callback = epoch_callback,
      binary_edge_weights = binary_edge_weights,
      tmpdir = tempdir(),
      verbose = verbose,
      dens_scale = dens_scale,
      seed = seed,
      nn_args = nn_args,
      sparse_X_is_distance_matrix = FALSE
    )
  }
jlmelville/uwot documentation built on April 25, 2024, 5:20 a.m.