sneer: Stochastic Neighbor Embedding Experiments in R

# Distance-based embedding methods like metric MDS and Sammon Mapping.

# Distance-Based Embeddings
#
# The available embedding methods which work by reproducing input distances.
#
# @examples
# \dontrun{
# embed_dist(method = mmds(), ...)
# embed_dist(method = sammon_map(eps = 1e-4), ...)
# }
# @keywords internal
# @name distance_embedding_methods
# @family sneer distance embedding methods
NULL

# Create an embedding method which uses distances directly
dist_embedder <- function(cost, stiffness, eps = .Machine$double.eps) {
  remove_nulls(list(
    cost = cost,
    stiffness = stiffness,
    update_out_fn = update_dist,
    gradient = dist_gradient(),
    eps = eps
  ))
}

# Output update function that only updates distances
update_dist = function(inp, out, method) {
  out$dm <- distance_matrix(out$ym)
  list(out = out, inp = inp)
}


# Generic stiffness function for distance-based embedding
distance_stiffness_fn <- function(method, inp, out) {
  dc_dd <- method$cost$gr(inp, out, method)
  dc_dd / out$dm
}

distance_stiffness <- function() {
  list(
    fn = distance_stiffness_fn,
    name = "distance"
  )
}

# MMDS --------------------------------------------------------------------

mmds_stiffness_fn <- function(dxm, dym, eps = .Machine$double.eps) {
  (dxm - dym) / -dym
}

mmds_stiffness <- function() {
  list(
    fn = function(method, inp, out) {
      mmds_stiffness_fn(inp$dm, out$dm, eps = method$eps)
    },
    name = "MMDS"
  )
}

# Metric Multi-dimensional Scaling (MDS) Using STRESS Cost Function
#
# A distance-based embedding method.
#
# This function minimizes an embedding using \code{metric_stress_cost}
# as the cost function:
#
# \deqn{STRESS = \sum_{i<j} (r_{ij} - d_{ij})^2}{STRESS = sum(rij-dij)^2}
#
# \eqn{r_{ij}}{rij} is the input distance between point \eqn{i} and point
# \eqn{j} and \eqn{d_{ij}}{dij} is the corresponding output distance.
#
# MDS is an umbrella term for many algorithms, with an emphasis on
# non-metric problems, i.e. those where the input distance is not a metric.
# The non-metric stress loss function has a slightly different definition, so
# when comparing the result of this embedding method with that of other MDS
# algorithms, you need to be quite sure exactly how the stress is being
# calculated, which may not be apparent without examining the source code.
#
# @seealso
# There are a plethora of MDS algorithms implemented in R. For non-metric MDS,
# the most commonly reported loss function is the Kruskal
# Stress (also known as Stress-1), which when applied to a metric problem is:
#
# \deqn{K = \sqrt{\frac{\sum_{i<j} (r_{ij} - d_{ij})^2}{\sum_{i<j} d_{ij}^2}}}{K = sqrt(sum(rij-dij)^2/sum(dij^2))}
#
# i.e. the square root of a normalized metric stress, where the normalization
# is by the sum of the squares of the embedded distances. Some functions in other
# packages which use this include:
# \describe{
# \item{\code{\link[MASS]{isoMDS}}}{Also applies a monotonic
# transformation to the input distances to lower the stress, so the reported
# stress and the output configuration can't be directly compared to the output
# of a sneer MDS embedding.}
# \item{\code{\link[smacof]{mds}}}{Applies ratio metric MDS, which attempts
# to preserve the ratio of the distances in the input and output space. The
# result of this function is a list, containing a member \code{stress}, which
# is the Kruskal Stress, so can be compared to the
# \code{kruskal_stress_cost} of a sneer embedding. However, the embedded
# configuration in the result, \code{conf} configuration, is not on the
# same scale as the input coordinates.}
# \item{\code{\link[stats]{cmdscale}}}{Doesn't use Kruskal Stress, but
# implements a form of classical metric MDS called Principal Coordinate
# Analysis. However, note that if the input distances are Euclidean (which
# they are in sneer), the result is identical to PCA, so the output will
# always have a higher stress than the sneer MDS embedding.}
# }
# @section Output Data:
# If used in an embedding, the output data list will contain:
# \describe{
#  \item{\code{ym}}{Embedded coordinates.}
#  \item{\code{dm}}{Distance matrix generated from \code{ym}.}
# }
#
# @param eps Small floating point value used to prevent numerical problems,
# e.g. in gradients and cost functions.
# @return a list containing:
#  \item{\code{cost_fn}}{Cost function for the embedding: for this
#  implementation of MDS, the unweighted residual sum of squares between
#  the distances.}
#  \item{\code{stiffness_fn}}{Stiffness function for \code{cost_fn}.}
#  \item{\code{update_out_fn}}{Function to calculate and store any needed
#  data after a coordinate update.}
#  \item{\code{eps}}{Small floating point value used to prevent numerical
#  problems, e.g. in gradients and cost functions.}
# @family sneer embedding methods
# @family sneer distance embedding methods
mmds <- function(eps = .Machine$double.eps) {
  dist_embedder(
    cost = metric_stress_fg(),
    stiffness = mmds_stiffness(),
    eps = eps
  )
}


# SSTRESS MMDS ------------------------------------------------------------

smmds_stiffness_fn <- function(dxm, dym, eps = .Machine$double.eps) {
  -2 * (dxm * dxm - dym * dym)
}

smmds_stiffness <- function() {
  list(
    fn = function(method, inp, out) {
      smmds_stiffness_fn(inp$dm, out$dm, eps = method$eps)
    },
    name = "SMMDS"
  )
}


# Metric Multi-dimensional Scaling (MDS) using SSTRESS Cost Function.
#
# A distance-based embedding method.
#
# This function minimizes an embedding using the SSTRESS loss function:
#
# \deqn{SSTRESS = \sum_{i<j} ((r_{ij}^2 - d_{ij})^2)^2}{SSTRESS = sum(rij^2-dij^2)^2}
#
# \eqn{r_{ij}}{rij} is the input distance between point \eqn{i} and point
# \eqn{j} and \eqn{d_{ij}}{dij} is the corresponding output distance.
#
# @param eps Small floating point value used to prevent numerical problems,
# e.g. in gradients and cost functions.
# @return a list containing:
#  \item{\code{cost_fn}}{Cost function for the embedding: for this
#  implementation of MDS, the SSTRESS cost function.}
#  \item{\code{stiffness_fn}}{Stiffness function for \code{cost_fn}.}
#  \item{\code{update_out_fn}}{Function to calculate and store any needed
#  data after a coordinate update.}
#  \item{\code{eps}}{Small floating point value used to prevent numerical
#  problems, e.g. in gradients and cost functions.}
# @family sneer embedding methods
# @family sneer distance embedding methods
#
# @references
# Hughes, N. P., & Lowe, D. (2002).
# Artefactual Structure from Least-Squares Multidimensional Scaling.
# In \emph{Advances in Neural Information Processing Systems} (pp. 913-920).
smmds <- function(eps = .Machine$double.eps) {
  dist_embedder(
    cost = metric_sstress_fg(),
    stiffness = smmds_stiffness(),
    eps = eps
  )
}


# Sammon ------------------------------------------------------------------

sammon_stiffness_fn <- function(dxm, dym, inv_sum_rij, eps = .Machine$double.eps) {
  -2 * inv_sum_rij * (dxm - dym) / (dxm * dym + eps)
}

sammon_stiffness <- function() {
  list(
    fn = function(method, inp, out) {
      sammon_stiffness_fn(inp$dm, out$dm, inv_sum_rij = method$inv_sum_rij,
                          eps = method$eps)
    },
    name = "Sammon"
  )
}

# Sammon Mapping
#
# A distance-based embedding method.
#
# This function minimizes an embedding using a quadratic loss function on
# the difference between the input distances and the output distances:
#
# \deqn{S = \frac{\sum_{i<j}\frac{(r_{ij} - d_{ij})^2}{r_{ij}}}
# {\sum_{i<j} r_{ij}}}{S = sum(((rij-dij)/rij)^2)/sum(rij)}
#
# \eqn{r_{ij}}{rij} is the input distance between point \eqn{i} and point
# \eqn{j} and \eqn{d_{ij}}{dij} is the corresponding output distance.
#
# This puts a greater emphasis on short distances over long distances, compared
# to MDS. Note that the denominator is a constant, and is just there to
# normalize the stress.
#
# @section Output Data:
# If used in an embedding, the output data list will contain:
# \describe{
#  \item{\code{ym}}{Embedded coordinates.}
#  \item{\code{dm}}{Distance matrix generated from \code{ym}.}
# }
#
# @param eps Small floating point value used to prevent numerical problems,
# e.g. in gradients and cost functions.
# @return a list containing:
#  \item{\code{cost_fn}}{Cost function for the embedding: Sammon's stress.}
#  \item{\code{stiffness_fn}}{Stiffness function for \code{cost_fn}}.
#  \item{\code{update_out_fn}}{Function to calculate and store any needed
#  data after a coordinate update.}
#  \item{\code{after_init_fn}}{Function to calculate and store any needed
#  data after other initialization has been done. For Sammon mapping, we
#  store the sum of the input distances as used in the denominator of the
#  Sammon Stress as a normalization.}
#  \item{\code{eps}}{Small floating point value used to prevent numerical
#  problems, e.g. in gradients and cost functions.}
# @seealso \code{\link[MASS]{sammon}}, which also carries out Sammon mapping.
# Results should be comparable with those of a sneer embedding. The
# \code{stress} value is equivalent to the \code{sammon_stress_cost}
# function in sneer.
# @family sneer embedding methods
# @family sneer distance embedding methods
sammon_map <- function(eps = .Machine$double.eps) {
  dist_embedder(
    cost = sammon_fg(),
    stiffness = sammon_stiffness(),
    eps = eps
  )
}