#' Uniform Manifold Approximation and Projection in Monocle 3 alpha
#'
#' @description Monocle 3 alpha's implementation of UMAP. Finds a low dimensional embedding of the data that approximates an underlying manifold.
#' This function relies on the python implementation of UMAP (https://github.com/lmcinnes/umap). This function is a pass through to the python code. Documentation is reproduced from the UMAP package here for convenience.
#' The original publication of UMAP can be found here:
#' https://arxiv.org/abs/1802.03426 (McInnes, L, Healy, J, UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction, ArXiv e-prints 1802.03426, 2018).
#' A useful notebook (written in python) to check for the effects of each parameter in UMAP can be found here: https://nbviewer.jupyter.org/github/CrakeNotSnowman/umapNotebooks/blob/master/UMAP%20Usage.ipynb.
#'
#' @param X the dataset upon which to perform umap dimension reduction
#' @param python_home The python home directory where umap is installed
#' @param log A logical argument determining whether to log-transform the
#' stored entries of the input data (computed as log(x + 1)). Defaults to TRUE
#' @param n_neighbors float (optional, default 15)
#' The size of local neighborhood (in terms of number of neighboring
#' sample points) used for manifold approximation. Larger values
#' result in more global views of the manifold, while smaller
#' values result in more local data being preserved. In general
#' values should be in the range 2 to 100.
#' @param n_component int (optional, default 2)
#' The dimension of the space to embed into. This defaults to 2 to
#' provide easy visualization, but can reasonably be set to any
#' integer value in the range 2 to 100.
#' @param metric string or function (optional, default 'correlation')
#' The metric to use to compute distances in high dimensional space.
#' If a string is passed it must match a valid predefined metric.
#' Valid string metrics include:
#' * euclidean
#' * manhattan
#' * chebyshev
#' * minkowski
#' * canberra
#' * braycurtis
#' * mahalanobis
#' * wminkowski
#' * seuclidean
#' * cosine
#' * correlation
#' * haversine
#' * hamming
#' * jaccard
#' * dice
#' * russelrao
#' * kulsinski
#' * rogerstanimoto
#' * sokalmichener
#' * sokalsneath
#' * yule
#' Metrics that take arguments (such as minkowski, mahalanobis etc.)
#' can have arguments passed via the metric_kwds dictionary. At this
#' time care must be taken and dictionary elements must be ordered
#' appropriately; this will hopefully be fixed in the future.
#' @param n_epochs int The number of training epochs to be used in optimizing the
#' low dimensional embedding. Larger values result in more accurate
#' embeddings. If None is specified a value will be selected based on
#' the size of the input dataset (200 for large datasets, 500 for small).
#' @param negative_sample_rate int (optional, default 5)
#' The number of negative edge/1-simplex samples to use per positive
#' edge/1-simplex sample in optimizing the low dimensional embedding.
#' @param learning_rate float (optional, default 1.0)
#' The initial learning rate for the embedding optimization.
#' @param init string (optional, default 'spectral')
#' How to initialize the low dimensional embedding. Options are:
#' * 'spectral': use a spectral embedding of the fuzzy 1-skeleton
#' * 'random': assign initial embedding positions at random.
#' * A numpy array of initial embedding positions.
#' @param min_dist float (optional, default 0.1)
#' The effective minimum distance between embedded points. Smaller values
#' will result in a more clustered/clumped embedding where nearby points
#' on the manifold are drawn closer together, while larger values will
#' result in a more even dispersal of points. The value should be set
#' relative to the ``spread`` value, which determines the scale at which
#' embedded points will be spread out.
#' @param spread float (optional, default 1.0)
#' The effective scale of embedded points. In combination with ``min_dist``
#' this determines how clustered/clumped the embedded points are.
#' @param set_op_mix_ratio float (optional, default 1.0)
#' Interpolate between (fuzzy) union and intersection as the set operation
#' used to combine local fuzzy simplicial sets to obtain a global fuzzy
#' simplicial sets. Both fuzzy set operations use the product t-norm.
#' The value of this parameter should be between 0.0 and 1.0; a value of
#' 1.0 will use a pure fuzzy union, while 0.0 will use a pure fuzzy
#' intersection.
#' @param local_connectivity int (optional, default 1)
#' The local connectivity required -- i.e. the number of nearest
#' neighbors that should be assumed to be connected at a local level.
#' The higher this value the more connected the manifold becomes
#' locally. In practice this should be not more than the local intrinsic
#' dimension of the manifold.
#' @param repulsion_strength float (optional, default 1.0)
#' Weighting applied to negative samples in low dimensional embedding
#' optimization. Values higher than one will result in greater weight
#' being given to negative samples.
# #' @param transform_queue_size float (optional, default 4.0)
# #' For transform operations (embedding new points using a trained model_
# #' this will control how aggressively to search for nearest neighbors.
# #' Larger values will result in slower performance but more accurate
# #' nearest neighbor evaluation.
#' @param a float (optional, default None) (not passed in)
#' More specific parameters controlling the embedding. If None these
#' values are set automatically as determined by ``min_dist`` and
#' ``spread``.
#' @param b float (optional, default None) (not passed in)
#' More specific parameters controlling the embedding. If None these
#' values are set automatically as determined by ``min_dist`` and
#' ``spread``.
#' @param random_state int, RandomState instance or None, optional (default: 0)
#' If int, random_state is the seed used by the random number generator;
#' If RandomState instance, random_state is the random number generator;
#' If None, the random number generator is the RandomState instance used
#' by `np.random`.
#' @param metric_kwds dict (optional, default {}) (not passed in)
#' Arguments to pass on to the metric, such as the ``p`` value for
#' Minkowski distance. In R, a list should be passed in if you want to use this argument.
#' The dict function from reticulate package will then convert it into a dictionary for python to use.
#' @param angular_rp_forest bool (optional, default False)
#' Whether to use an angular random projection forest to initialise
#' the approximate nearest neighbor search. This can be faster, but is
#' mostly only useful for metrics that use an angular style distance such
#' as cosine, correlation etc. In the case of those metrics angular forests
#' will be chosen automatically.
#' @param target_n_neighbors int (optional, default -1)
#' The number of nearest neighbors to use to construct the target simplicial
#' set. If set to -1 use the ``n_neighbors`` value.
#' @param target_metric string or callable (optional, default 'categorical')
#' The metric used to measure distance for a target array when using supervised
#' dimension reduction. By default this is 'categorical' which will measure
#' distance in terms of whether categories match or are different. Furthermore,
#' if semi-supervised is required target values of -1 will be treated as
#' unlabelled under the 'categorical' metric. If the target array takes
#' continuous values (e.g. for a regression problem) then metric of 'l1'
#' or 'l2' is probably more appropriate.
#' @param target_metric_kwds dict (optional, default None)
#' Keyword argument to pass to the target metric when performing
#' supervised dimension reduction. If None then no arguments are passed on.
#' @param target_weight float (optional, default 0.5)
#' weighting factor between data topology and target topology. A value of
#' 0.0 weights entirely on data, a value of 1.0 weights entirely on target.
#' The default of 0.5 balances the weighting equally between data and target.
#' @param transform_seed int (optional, default 42)
#' Random seed used for the stochastic aspects of the transform operation.
#' This ensures consistency in transform operations.
#' @param verbose bool (optional, default False)
#' Controls verbosity of logging.
#' @param return_all Whether to return all slots after UMAP
#' @return Embedding of the training data in low-dimensional space if return_all is set to be FALSE,
#' otherwise the object returned from umap function, including the following elements:
#' a, fit_transform, metric, random_state, alpha, gamma, metric_kwds, set_op_mix_ratio, angular_rp_forest,
#' get_params, min_dist, set_params, b, graph, n_components, spread, bandwidth, init, n_epochs, verbose,
#' embedding_, initial_alpha, n_neighbors, fit, local_connectivity, negative_sample_rate
#' @importFrom Matrix as.matrix
#' @export
monocle_UMAP <- function(X, python_home = system('which python', intern = TRUE),
                         log = TRUE,
                         n_neighbors = 15L,
                         n_component = 2L,
                         metric = "correlation",
                         n_epochs = NULL,
                         negative_sample_rate = 5L,
                         learning_rate = 1.0,
                         init = 'spectral',
                         min_dist = 0.1,
                         spread = 1.0,
                         set_op_mix_ratio = 1.0,
                         local_connectivity = 1L,
                         # bandwidth = 1.0,
                         repulsion_strength = 1.0,
                         a = NULL,
                         b = NULL,
                         random_state = 0L,
                         metric_kwds = reticulate::dict(),
                         angular_rp_forest = FALSE,
                         target_n_neighbors = -1L,
                         target_metric = 'categorical',
                         target_metric_kwds = reticulate::dict(),
                         target_weight = 0.5,
                         transform_seed = 42L,
                         verbose = FALSE,
                         return_all = FALSE) {
  # Normalise a keyword-argument set into a python dict:
  #   * an existing reticulate dict is passed through unchanged (previously it
  #     was silently replaced by an empty dict, even though the parameter
  #     default itself is a reticulate dict);
  #   * an R list is converted with reticulate::dict();
  #   * anything else falls back to an empty dict.
  as_py_dict <- function(kwds) {
    if (inherits(kwds, "python.builtin.dict")) {
      kwds
    } else if (is.list(kwds)) {
      reticulate::dict(kwds)
    } else {
      reticulate::dict()
    }
  }

  # Ask reticulate to use the requested interpreter, then probe for the python
  # `umap` module so a missing installation fails early with an actionable
  # message instead of an opaque error further down.
  reticulate::use_python(python_home)
  tryCatch({
    reticulate::import("umap")
  }, warning = function(w) {
    # Warnings raised while probing for the module are deliberately ignored.
  }, error = function(e) {
    print(e)
    stop('please pass the python home directory where umap is installed with python_home argument!')
  })

  # Source the python wrapper shipped with the cellwrangler package; it is
  # presumably what defines the `umap()` function called below (the script is
  # not part of this file -- verify against umap.py).
  reticulate::source_python(
    file.path(system.file(package = "cellwrangler"), "umap.py")
  )

  # X <- Matrix::t(X)
  # Bring X into triplet sparse form so that it can be handed to python as
  # three plain vectors (row index, column index, value) plus its dimensions.
  # Objects whose class name does not contain "Matrix" are first densified
  # with as.matrix().
  if (!any(grepl("Matrix", class(X), fixed = TRUE))) {
    X <- as.matrix(X)
  }
  X <- as(X, "TsparseMatrix")

  i <- as.integer(X@i)
  j <- as.integer(X@j)
  # `log` is the logical flag here; base::log is spelled out to make clear
  # that the function, not the flag, is being called.
  val <- if (log) base::log(X@x + 1) else X@x
  dims <- as.integer(X@Dim)

  # Optional scalars stay NULL when not supplied; otherwise they are coerced
  # to the type the python side is given for the other arguments.
  if (!is.null(n_epochs)) {
    n_epochs <- as.integer(n_epochs)
  }
  if (!is.null(a)) {
    a <- as.numeric(a)
  }
  if (!is.null(b)) {
    # Fix: the coerced value used to be assigned to `n_epochs`, which both
    # clobbered the epoch count and left `b` uncoerced.
    b <- as.numeric(b)
  }

  metric_kwds <- as_py_dict(metric_kwds)
  target_metric_kwds <- as_py_dict(target_metric_kwds)

  # Arguments are positional and must stay in exactly this order to match the
  # python wrapper.
  umap_res <- umap(i, j, val, dims,
                   as.integer(n_neighbors),
                   as.integer(n_component),
                   as.character(metric),
                   n_epochs,
                   as.integer(negative_sample_rate),
                   as.numeric(learning_rate),
                   as.character(init),
                   as.numeric(min_dist),
                   as.numeric(spread),
                   as.numeric(set_op_mix_ratio),
                   as.integer(local_connectivity),
                   # as.numeric(bandwidth),
                   as.numeric(repulsion_strength),
                   a,
                   b,
                   as.integer(random_state),
                   metric_kwds,
                   as.logical(angular_rp_forest),
                   as.integer(target_n_neighbors),
                   as.character(target_metric),
                   target_metric_kwds,
                   as.numeric(target_weight),
                   as.integer(transform_seed),
                   as.logical(verbose))

  # Either hand back the whole object returned by the wrapper or only its
  # `embedding_` element.
  if (return_all) {
    return(umap_res)
  }
  umap_res$embedding_
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.