# R/snifter.R

#' Run FI-tSNE algorithm
#' 
#' See \href{https://opentsne.readthedocs.io/en/latest/}{the openTSNE documentation}
#' for further details on these arguments and the general usage of this
#' algorithm.
#' @param x Input data matrix.
#' @param simplified Logical scalar. When \code{FALSE}, the function
#' returns an object of class \code{snifter}. This contains all information
#' necessary to project new data into the embedding using
#' \code{\link{project}}. If \code{TRUE}, all extra attributes are omitted
#' and the return value is a base matrix.
#' @param n_components Number of t-SNE components to be produced.
#' @param n_jobs Integer scalar specifying the number of cores to be used.
#'  Negative values are interpreted as in scikit-learn: -1 uses all cores,
#'  and -2 uses all but one.
#' @param perplexity Numeric scalar controlling the neighborhood used
#'  when estimating the embedding.
#' @param n_iter Integer scalar specifying the number of iterations to complete.
#' @param initialization Character scalar specifying the initialization
#'  to use. "pca" may preserve global distances better than the other
#'  options.
#' @param pca Logical scalar specifying whether PCA should be run on the data
#'  before creating the embedding.
#' @param partial_pca Logical scalar specifying whether
#'  \code{\link[irlba]{prcomp_irlba}} should be used if \code{pca=TRUE}.
#'  This is useful for very large data matrices.
#' @param pca_dims Integer scalar specifying the number of principal components
#'  to be calculated in the initial PCA step if \code{pca=TRUE}.
#' @param pca_center,pca_scale Logical scalars specifying whether centering and
#'  scaling should be performed before running PCA, if \code{pca=TRUE}.
#' @param neighbors Character scalar specifying the nearest neighbour
#'  algorithm to use.
#' @param negative_gradient_method Character scalar specifying the
#'  negative gradient approximation to use. "bh", referring to Barnes-Hut,
#'  is more appropriate for smaller data sets, while "fft", referring to
#'  fast Fourier transform, is more appropriate for larger data sets.
#' @param learning_rate Numeric scalar specifying the learning rate, or the
#'  string "auto", which uses \code{max(200, N / 12)}, where \code{N} is
#'  the number of observations.
#' @param early_exaggeration Numeric scalar specifying the exaggeration factor
#'  to use during the early exaggeration phase. Typical values range from 12 to
#'  32.
#' @param early_exaggeration_iter Integer scalar specifying the number of
#'  iterations to run in the early exaggeration phase.
#' @param exaggeration Numeric scalar specifying the exaggeration factor to use
#'  during the normal optimization phase. This can be used to form more densely
#'  packed clusters and is useful for large data sets.
#' @param dof Numeric scalar specifying the degrees of freedom, as described in
#'  Kobak et al. (2019).
#' @param theta Numeric scalar, only used when negative_gradient_method="bh".
#'  This is the trade-off parameter between speed and accuracy of the tree
#'  approximation method. Typical values range from 0.2 to 0.8. The value 0
#'  indicates that no approximation is to be made, producing exact results
#'  at the cost of a longer runtime.
#' @param n_interpolation_points Integer scalar, only used when
#'  negative_gradient_method="fft". The number of interpolation points to
#'  use within each grid cell for interpolation-based t-SNE. It is highly
#'  recommended to leave this value at the default of 3.
#' @param min_num_intervals Integer scalar, only used when
#'  negative_gradient_method="fft". The minimum number of grid cells to use,
#'  regardless of the ints_in_interval parameter. Higher values provide more
#'  accurate gradient estimations.
#' @param ints_in_interval Numeric scalar, only used when
#'  negative_gradient_method="fft". Indicates how large a grid cell should
#'  be, e.g., a value of 3 indicates a grid side length of 3. Lower values
#'  provide more accurate gradient estimations.
#' @param metric Character scalar specifying the metric to be used to compute
#'  affinities between points in the original space.
#' @param metric_params Named list of additional keyword arguments for the
#'  metric function.
#' @param initial_momentum Numeric scalar specifying the momentum to use during
#'  the early exaggeration phase.
#' @param final_momentum Numeric scalar specifying the momentum to use during
#'  the normal optimization phase.
#' @param max_grad_norm Numeric scalar specifying the maximum gradient norm.
#'  If the norm exceeds this value, it will be clipped.
#' @param random_state Integer scalar specifying the seed used by the random
#'  number generator.
#' @param verbose Logical scalar controlling verbosity.
#' @return If \code{simplified=FALSE}, an object of class \code{snifter}:
#'  a matrix of t-SNE embeddings that retains the attributes needed to
#'  project new data with \code{\link{project}}. If \code{simplified=TRUE},
#'  a base matrix of embeddings.
#'
#' @references
#'  openTSNE: a modular Python library for t-SNE dimensionality reduction and
#'  embedding
#'  Pavlin G. Poličar, Martin Stražar, Blaž Zupan
#'  bioRxiv (2019) 731877; doi: \url{https://doi.org/10.1101/731877}
#'
#'  Fast interpolation-based t-SNE for improved visualization of single-cell
#'  RNA-seq data
#'  George C. Linderman, Manas Rachh, Jeremy G. Hoskins, Stefan Steinerberger,
#'  and Yuval Kluger
#'  Nature Methods 16, 243–245 (2019)
#'  doi: \url{https://doi.org/10.1038/s41592-018-0308-4}
#'
#'  Accelerating t-SNE using Tree-Based Algorithms
#'  Laurens van der Maaten
#'  Journal of Machine Learning Research (2014)
#'  \url{http://jmlr.org/papers/v15/vandermaaten14a.html}
#'
#'  Heavy-tailed kernels reveal a finer cluster structure in t-SNE
#'  visualisations
#'  Dmitry Kobak, George Linderman, Stefan Steinerberger, Yuval Kluger and
#'  Philipp Berens
#'  ECML PKDD (2019)
#'  doi: \url{https://doi.org/10.1007/978-3-030-46150-8_8}
#' @examples
#'  set.seed(42)
#'  m <- matrix(rnorm(2000), ncol=20)
#'  out <- fitsne(m, random_state = 42L)
#'  plot(out, pch = 19, xlab = "t-SNE 1", ylab = "t-SNE 2")
#'
#'  ## openTSNE allows us to project new points into the existing
#'  ## embedding - useful for extremely large data.
#'  ## see https://opentsne.readthedocs.io/en/latest/api/index.html
#'
#'  out_binding <- fitsne(m[-(1:2), ], random_state = 42L)
#'  new_points <- project(out_binding, new = m[1:2, ], old = m[-(1:2), ])
#'  plot(as.matrix(out_binding), col = "black", pch = 19,
#'      xlab = "t-SNE 1", ylab = "t-SNE 2")
#'  points(new_points, col = "red", pch = 19)
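#'
#'  ## The parameters used to generate the embedding are retained as
#'  ## attributes of the snifter object, alongside the affinity matrix:
#'  attr(out_binding, "perplexity")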
#'
#' @rdname snifter
#' @export
fitsne <- function(
        x,
        simplified = FALSE,
        n_components = 2L,
        n_jobs = 1L,
        perplexity = 30,
        n_iter = 500L,
        initialization = c("pca", "spectral", "random"),
        pca = FALSE,
        pca_dims = 50L,
        partial_pca = FALSE,
        pca_center = TRUE,
        pca_scale = TRUE,
        neighbors = c("auto", "exact", "annoy", "pynndescent", "approx"),
        negative_gradient_method = c("fft", "bh"),
        learning_rate = "auto",
        early_exaggeration = 12,
        early_exaggeration_iter = 250L,
        exaggeration = NULL,
        dof = 1,
        theta = 0.5,
        n_interpolation_points = 3L,
        min_num_intervals = 50L,
        ints_in_interval = 1,
        metric = "euclidean",
        metric_params = NULL,
        initial_momentum = 0.5,
        final_momentum = 0.8,
        max_grad_norm = NULL,
        random_state = NULL,
        verbose = FALSE
    ) {
    x <- as.matrix(x)

    initialization <- match.arg(initialization)
    neighbors <- match.arg(neighbors)
    negative_gradient_method <- match.arg(negative_gradient_method)
    if (!is.null(random_state)) random_state <- as.integer(random_state)
    early_exaggeration_iter <- as.integer(early_exaggeration_iter)
    n_jobs <- as.integer(n_jobs)
    n_components <- as.integer(n_components)
    min_num_intervals <- as.integer(min_num_intervals)
    n_iter <- as.integer(n_iter)

    if (pca) {
        if (verbose) cat("Performing PCA\n")
        if (partial_pca) {
            if (!requireNamespace("irlba", quietly = TRUE)) {
                stop(
                    "Package \"irlba\" is required for partial PCA.",
                    " Please install it.",
                    call. = FALSE)
            }
            x <- irlba::prcomp_irlba(
                x,
                n = pca_dims,
                center = pca_center,
                scale = pca_scale
            )$x
        } else {
            if (verbose && min(dim(x)) > 2500) {
                cat("Consider setting partial_pca=TRUE for large matrices\n")
            }
            x <- stats::prcomp(
                x,
                retx = TRUE,
                center = pca_center,
                scale. = pca_scale,
                rank. = pca_dims
            )$x
        }
    }

    out <- .create_tsne(
        x = x,
        n_iter = n_iter,
        n_components = n_components,
        n_jobs = n_jobs,
        learning_rate = learning_rate,
        perplexity = perplexity,
        early_exaggeration = early_exaggeration,
        early_exaggeration_iter = early_exaggeration_iter,
        exaggeration = exaggeration,
        dof = dof,
        theta = theta,
        n_interpolation_points = n_interpolation_points,
        min_num_intervals = min_num_intervals,
        ints_in_interval = ints_in_interval,
        initialization = initialization,
        metric = metric,
        metric_params = metric_params,
        initial_momentum = initial_momentum,
        final_momentum = final_momentum,
        max_grad_norm = max_grad_norm,
        neighbors = neighbors,
        negative_gradient_method = negative_gradient_method,
        random_state = random_state,
        verbose = verbose
    )
    if (simplified) {
        return(out$x)
    }
    structure(
        .Data = out$x,
        affinities = out$affinities,
        n_iter = n_iter,
        n_components = n_components,
        n_jobs = n_jobs,
        learning_rate = learning_rate,
        perplexity = perplexity,
        early_exaggeration = early_exaggeration,
        early_exaggeration_iter = early_exaggeration_iter,
        exaggeration = exaggeration,
        dof = dof,
        theta = theta,
        n_interpolation_points = n_interpolation_points,
        min_num_intervals = min_num_intervals,
        ints_in_interval = ints_in_interval,
        initialization = initialization,
        metric = metric,
        metric_params = metric_params,
        initial_momentum = initial_momentum,
        final_momentum = final_momentum,
        max_grad_norm = max_grad_norm,
        neighbors = neighbors,
        negative_gradient_method = negative_gradient_method,
        class = c("snifter", "matrix")
    )
}
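
## Illustrative only: the learning_rate = "auto" rule documented above
## computes max(200, N / 12) for N observations. This helper is a sketch
## for clarity and is not used by the package.
if (FALSE) {
    auto_learning_rate <- function(n_obs) max(200, n_obs / 12)
    auto_learning_rate(1e4)  # 833.33: larger data gets a larger step size
}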


#' @export
print.snifter <- function(x, ...) {
    ## Keep only dim/dimnames so the embedding prints as a plain matrix,
    ## rather than as a flattened vector stripped of its dimensions.
    keep <- intersect(names(attributes(x)), c("dim", "dimnames"))
    attributes(x) <- attributes(x)[keep]
    print(x, ...)
}

.create_tsne <- function(
        x,
        ...
    ) {
    .checks_snifter(x, ...)
    ## Run the fit inside the basilisk-managed Python environment;
    ## `python_env` is defined elsewhere in the package.
    proc <- basiliskStart(python_env)
    on.exit(basiliskStop(proc))
    basiliskRun(proc, .run_tsne, x = x, ...)
}
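
## For reference, `python_env` is expected to be a
## basilisk::BasiliskEnvironment defined elsewhere in the package.
## A hypothetical sketch only; the envname and version pins here are
## illustrative, not the package's actual pins:
if (FALSE) {
    python_env <- basilisk::BasiliskEnvironment(
        envname = "fitsne",
        pkgname = "snifter",
        packages = c("opentsne=0.6.2", "scipy=1.9.1")
    )
}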
.run_tsne <- function(x, ...) {
    openTSNE <- reticulate::import("openTSNE", convert = FALSE)
    obj <- openTSNE$TSNE(...)
    out <- obj$fit(x)
    ## fit() returns a TSNEEmbedding; convert the embedding matrix and
    ## the affinity matrix P back to R objects.
    list(
        x = reticulate::py_to_r(out),
        affinities = reticulate::py_to_r(out$affinities$P)
    )
}
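
## A minimal sketch of the same call made outside basilisk, assuming
## openTSNE is available in whichever Python environment reticulate
## discovers (e.g. after reticulate::py_install("openTSNE")):
if (FALSE) {
    tsne <- reticulate::import("openTSNE")$TSNE(
        n_components = 2L, random_state = 42L
    )
    embedding <- tsne$fit(matrix(rnorm(2000), ncol = 20))
}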


.checks_snifter <- function(
        x,
        n_iter,
        n_components,
        n_jobs,
        learning_rate,
        perplexity,
        early_exaggeration,
        early_exaggeration_iter,
        exaggeration,
        dof,
        theta,
        n_interpolation_points,
        min_num_intervals,
        ints_in_interval,
        initialization,
        metric,
        metric_params,
        initial_momentum,
        final_momentum,
        max_grad_norm,
        neighbors,
        negative_gradient_method,
        random_state,
        verbose
    ) {
    assert_that(
        is.numeric(x),
        length(n_iter) == 1,
        is.integer(n_iter),
        n_iter > 0,
        length(n_components) == 1,
        is.integer(n_components),
        n_components > 0,
        length(n_jobs) == 1,
        is.integer(n_jobs),
        (n_jobs %in% c(-1L, -2L)) | n_jobs > 0,
        length(learning_rate) == 1,
        (is.numeric(learning_rate) && learning_rate > 0) ||
            identical(learning_rate, "auto"),
        length(perplexity) == 1,
        is.numeric(perplexity),
        perplexity > 0,
        length(early_exaggeration) == 1,
        is.numeric(early_exaggeration),
        early_exaggeration > 0,
        length(early_exaggeration_iter) == 1,
        is.integer(early_exaggeration_iter),
        early_exaggeration_iter >= 0,
        is.null(exaggeration) ||
            (is.numeric(exaggeration) &
                exaggeration > 0 &
                length(exaggeration) == 1),
        length(dof) == 1,
        is.numeric(dof),
        dof > 0,
        length(theta) == 1,
        is.numeric(theta),
        theta >= 0,
        length(n_interpolation_points) == 1,
        is.integer(n_interpolation_points),
        n_interpolation_points > 0,
        length(min_num_intervals) == 1,
        is.integer(min_num_intervals),
        min_num_intervals > 0,
        length(ints_in_interval) == 1,
        is.numeric(ints_in_interval),
        ints_in_interval > 0,
        is.character(initialization) | is.matrix(initialization),
        is.character(metric),
        is.null(metric_params) |
            is.character(metric_params) |
            is.list(metric_params),
        length(initial_momentum) == 1,
        is.numeric(initial_momentum),
        initial_momentum > 0,
        length(final_momentum) == 1,
        is.numeric(final_momentum),
        final_momentum > 0,
        is.null(max_grad_norm) | is.numeric(max_grad_norm),
        is.null(random_state) | is.integer(random_state),
        is.logical(verbose)
    )
}
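
## Sketch of how these assertions surface from fitsne(); assertthat
## produces a message along the lines of "n_iter not greater than 0":
if (FALSE) {
    fitsne(matrix(rnorm(200), ncol = 10), n_iter = -1L)
}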
# max_grad_norm is mainly relevant when embedding new points into an
# existing embedding: if the new points overlap with the reference points,
# gradients can become very large. This can make points "shoot off" from
# the embedding, causing the interpolation method to compute a very large
# grid, and leads to worse results.