sneer: Stochastic Neighbor Embedding Experiments in R

# Cost functions. Used by optimization routines to improve the embedding.

# Kullback-Leibler Divergence Cost Function
#
# A measure of embedding quality between input and output data.
#
# This cost function evaluates the embedding quality by calculating the KL
# divergence between the input probabilities and the output probabilities.
# More specifically, it considers the input probabilities to be the reference
# probabilities. See the note below for more details and whether you should
# care about the distinction.
#
# @note The KL divergence is asymmetric, so that D_KL(P||Q) != D_KL(Q||P).
# With this cost function, the input probability distribution is considered the
# "reference" probability, and a more precise way to describe this cost
# function is that it measures the divergence of the output probabilities
# \emph{from} from the input probabilities.
#
# For t-SNE and related embedding methods, only this type of KL divergence is
# calculated. However other methods (e.g. NeRV) also consider the "reverse"
# divergence, i.e. using the output probabilities as reference probabilities.
# Equivalently, this could be defined as the KL divergence of the input
# probabilities \emph{from} the output probabilities.
#
# This cost function requires the following matrices to be defined:
# \describe{
#  \item{\code{inp$pm}}{Input probabilities.}
#  \item{\code{out$qm}}{Output probabilities.}
# }
#
# For embedding methods which define their cost function over multiple
# probability distributions (e.g. \code{\link{asne}}), this cost function
# returns the sum of the divergences.
#
# @param inp Input data.
# @param out Output data.
# @param method Embedding method.
# @return KL divergence between \code{inp$pm} and \code{out$qm}.
# @seealso To use \code{out$qm} as the reference probability and calculate the
#   divergence of \code{inp$pm} from \code{out$qm}, see
#   \code{\link{reverse_kl_cost}}.
# @family sneer cost functions
kl_cost <- function(inp, out, method) {
  kl_divergence(inp$pm, out$qm, method$eps)
}
attr(kl_cost, "sneer_cost_type") <- "prob"

# Decomposes the cost into the sum of n values, where n is the number of points.
kl_cost_point <- function(inp, out, method) {
  kl_divergence_point(inp$pm, out$qm, method$eps)
}

# Kullback-Leibler Divergence
#
# A measure of embedding quality between input and output probability matrices.
#
# The Kullback-Leibler Divergence between two discrete probabilities P and Q
# is:
#
# \deqn{D_{KL}(P||Q) = \sum_{i}P(i)\log\frac{P(i)}{Q(i)}}{D_KL(P||Q) = sum(Pi*log(Pi/Qi))}
#
# The base of the log determines the units of the divergence.
#
# If a row probability matrix is provided (where each row in the matrix is
# a separate distribution), this function returns the sum of all divergences.
#
# @param pm Probability Matrix. First probability in the divergence.
# @param qm Probability Matrix. Second probability in the divergence.
# @param eps Small floating point value used to avoid numerical problems.
# @return KL divergence between \code{pm} and \code{qm}.
# @seealso \code{\link{kl_divergence_rows}} to obtain the separate KL
# divergences when using row probability matrices.
kl_divergence <- function(pm, qm, eps = .Machine$double.eps) {
  sum(kl_divergence_rows(pm, qm, eps))
}

# Kullback-Leibler Divergence per Row
#
# A measure of embedding quality between input and output row probability
# matrices.
#
# The Kullback-Leibler Divergence between two discrete probabilities P and Q
# is:
#
# \deqn{D_{KL}(P||Q) = \sum_{i}P(i)\log\frac{P(i)}{Q(i)}}{D_KL(P||Q) = sum(Pi*log(Pi/Qi))}
#
# The base of the log determines the units of the divergence.
#
# This function calculates the KL for each distribution in the provided
# matrices, one per row.
#
# @param pm Row probability Matrix. First probability in the divergence.
# @param qm Row Probability Matrix. Second probability in the divergence.
# @param eps Small floating point value used to avoid numerical problems.
# @return Vector of KL divergences from \code{qm} to \code{pm}.
kl_divergence_rows <- function(pm, qm, eps = .Machine$double.eps) {
  apply(pm * log((pm + eps) / (qm + eps)), 1, sum)
}

# Decomposes the divergence into the sum of n values, where n is the number
# of points.
kl_divergence_point <- function(pm, qm, eps = .Machine$double.eps) {
  kl_pair <- pm * log((pm + eps) / (qm + eps))
  0.5 * (apply(kl_pair, 1, sum) + apply(kl_pair, 2, sum))
}

# Create Cost Function Normalizer
#
# A function to transform a cost function into a normalized cost function.
#
# The cost function can be any function where the more positive the value,
# the worse the solution is considered to be. The corresponding normalized
# version is that which scales the cost so that a "null" model would give a
# normalized cost of 1.0.
#
# The definition of a "null" model is one which is as good as can be if one
# didn't use any information from the data at all. For methods that attempt
# to preserve distances, this would be equivalent to making all the embedded
# distances the same, which can only be achieved by making them all zero. For
# probability-based methods, the equivalent would be to make all the
# probabilities equal.
#
# The cost function should have the signature \code{cost_fn(inp, out, method)}
# and return a scalar numeric cost value. In addition it should have an
# appropriate \code{sneer_cost_type} attribute set. For cost functions that act
# on probabilities, this should be \code{"prob"}. For cost function that act on
# distances, this should be \code{"dist"}.
#
# Note that this function will attempt to synthesize a function to calculate
# a suitable normalization value, but isn't very sophisticated: it simply sets
# the output probabilities or distances (depending on the type of the cost)
# function to a uniform value. If this is insufficiently clever, you can set
# the a \code{sneer_cost_norm} attribute instead. The value of this attribute
# should be the name of a separate cost function which does the normalization
# explicitly. In this case, the function won't attempt to synthesize a
# function, but will return that custom function instead.
#
# @param cost_fn Cost function to normalize.
# @return Normalized cost function with the signature
#  \code{norm_fn(inp, out, method)} and which returns a scalar numeric cost
#  value.
make_normalized_cost_fn <- function(cost_fn) {
  # see if there's a normalized cost function already defined
  norm_fn_name <- attr(cost_fn, "sneer_cost_norm")
  if (!is.null(norm_fn_name)) {
    norm_fn <- get(norm_fn_name)
    if (is.null(norm_fn)) {
      stop("No normalized cost function: ", norm_fn_name, " could be found")
    }
    return(norm_fn)
  }

  # or use a function that will generate a suitable null "probability" matrix
  # (qm may in fact be un-normalized weights or distances)
  norm_mat_fn_name <- attr(cost_fn, "sneer_null_matrix")
  if (!is.null(norm_mat_fn_name)) {
    fn <- function(inp, out, method) {
      cost <- cost_fn(inp, out, method)
      out$qm <- do.call(norm_mat_fn_name, list(out$qm))
      null_cost <- cost_fn(inp, out, method)
      cost / null_cost
    }
    return(fn)
  }

  # otherwise, synthesize from the cost type
  cost_type <- attr(cost_fn, "sneer_cost_type")
  if (is.null(cost_type)) {
    stop("Cost function has no sneer_cost_type")
  }

  null_model_fn_name <- paste("null_model", cost_type, sep = "_")
  if (cost_type == "prob") {
    mat_name <- "qm"
  }
  else if (cost_type == "dist") {
    mat_name <- "dm"
  }
  else if (cost_type == "weight") {
    mat_name <- "wm"
  }
  else {
    stop("No known null model matrix name for cost type '", cost_type, "'")
  }

  function(inp, out, method) {
    cost <- cost_fn(inp, out, method)
    out[[mat_name]] <- do.call(null_model_fn_name, list(out[[mat_name]]))
    null_cost <- cost_fn(inp, out, method)
    cost / null_cost
  }
}

# Null Model for Probability Matrices
#
# For a given probability matrix, return the equivalent "null" model, i.e. one
# where all probabilities are equal.
#
# @param pm Probability matrix. Can be row, joint or conditional.
# @return Probability matrix where all elements are equal.
null_model_prob <- function(pm) {
  matrix(sum(pm) / (nrow(pm) * ncol(pm)), nrow = nrow(pm), ncol = ncol(pm))
}

# Kullback-Leibler Divergence Cost Wrapper Factory Function
#
# Cost wrapper factory function.
#
# Creates the a list containing the required functions for using the Kullback
# Leibler divergence, KL(P||Q), in an embedding.
#
# Provides the cost function and its gradient (with respect to Q).
#
# @return KL divergence function and gradient.
# @family sneer cost wrappers
kl_fg <- function() {
  list(
    fn = kl_cost,
    gr = kl_cost_gr,
    point = kl_cost_point,
    name = "KL"
  )
}

# Kullback Leibler Cost Gradients
#
# Measures the gradient of the KL divergence of an embedding, with respect
# to the probabilities of the ouput probabilities.
#
# @param inp Input data.
# @param out Output data.
# @param method Embedding method.
# @return Gradient of the KL divergence from \code{inp$pm} to \code{out$qm}.
kl_cost_gr <- function(inp, out, method) {
  kl_divergence_gr(inp$pm, out$qm, method$eps)
}

# Kullback Leibler Gradient
#
# Calculates the gradient of the KL divergence with respect to the
# probability Q in KL(P||Q).
#
# @param pm Probability Matrix. First probability in the divergence.
# @param qm Probability Matrix. Second probability in the divergence.
# @param eps Small floating point value used to avoid numerical problems.
# @return Gradient of the KL divergence from \code{pm} to \code{qm}.
kl_divergence_gr <- function(pm, qm, eps = .Machine$double.eps) {
  -pm / (qm + eps)
}

# Finite Difference Gradient Calculation
#
# Calculate the gradient of the cost function for a specified position using
# a finite difference.
#
# Only intended for testing that analytical gradients have been calculated
# correctly.
#
# @param inp Input data.
# @param out Output data containing the desired position.
# @param method Embedding method.
# @param diff Step size to take in finite difference calculation.
# @return Gradient matrix.
cost_gradient_fd <- function(inp, out, method, diff = 1e-4, mat_name = "qm") {
  mm <- out[[mat_name]]
  nr <- nrow(mm)
  nc <- ncol(mm)
  grad <- matrix(0, nrow = nr, ncol = nc)
  for (i in 1:nr) {
    for (j in 1:nc) {
      old <- mm[i, j]
      mm[i, j] <- old - diff
      out[[mat_name]] <- mm
      out <- out_updated(inp, out, method)
      cost_back <- calculate_cost(method, inp, out)

      mm[i, j] <- old + diff
      out[[mat_name]] <- mm
      out <- out_updated(inp, out, method)
      cost_fwd <- calculate_cost(method, inp, out)

      fd <- (cost_fwd - cost_back) / (2 * diff)
      grad[i, j] <- fd
    }
  }
  grad
}

# Evaluate Cost Function
#
# @param method Embedding method containing a cost function.
# @param inp Input data.
# @param out Output data.
# @return Scalar cost function value.
calculate_cost <- function(method, inp, out) {
  method$cost$fn(inp, out, method)
}


# Square Loss -------------------------------------------------------------

sum2_cost <- function(inp, out, method) {
  diff <- inp$pm - out$qm
  0.5 * sum(diff * diff)
}
attr(sum2_cost, "sneer_cost_type") <- "prob"

sum2_fg <- function() {
  list(
    fn = sum2_cost,
    gr = function(inp, out, method) {
      out$qm - inp$pm
    },
    point = function(inp, out, method) {
      diff <- inp$pm - out$qm
      0.5 * apply(diff * diff, 1, sum)
    },
    name = "sum2"
  )
}


# Un-normalized KL --------------------------------------------------------

null_model_weight <- function(wm) {
  matrix(1, nrow = nrow(wm), ncol = ncol(wm))
}

unkl_divergence <- function(vm, wm, eps = .Machine$double.eps) {
  sum(vm * log((vm + eps) / (wm + eps)) - vm + wm)
}

unkl_cost <- function(inp, out, method) {
  unkl_divergence(inp$pm, out$qm, method$eps)
}
attr(unkl_cost, "sneer_null_matrix") <- "null_model_weight"

unkl_fg <- function() {
  list(
    fn = unkl_cost,
    gr = function(inp, out, method) {
      1 - (inp$pm / (out$qm + method$eps))
    },
    point = function(inp, out, method) {
      vm <- inp$pm
      wm <- out$qm
      eps <- method$eps
      apply(vm * log((vm + eps) / (wm + eps)) - vm + wm, 1, sum)
    },
    name = "UNKL",
    keep_weights = TRUE,
    replace_probs_with_weights = TRUE
  )
}


# Reverse Un-normalized KL ------------------------------------------------

reverse_unkl_cost <- function(inp, out, method) {
  unkl_divergence(out$qm, inp$pm, method$eps)
}
attr(reverse_unkl_cost, "sneer_null_matrix") <- "null_model_weight"

reverse_unkl_fg <- function() {
  list(
    fn = reverse_unkl_cost,
    gr = function(inp, out, method) {
      log(out$qm / (inp$pm + method$eps))
    },
    point = function(inp, out, method) {
      vm <- out$qm
      wm <- inp$pm
      eps <- method$eps
      apply(vm * log((vm + eps) / (wm + eps)) - vm + wm, 1, sum)
    },
    name = "revUNKL",
    keep_weights = TRUE,
    replace_probs_with_weights = TRUE
  )
}

# Un-normalized NeRV ------------------------------------------------------

unnerv_cost <- function(inp, out, method) {
  method$cost$lambda * unkl_cost(inp, out, method) +
    (1 - method$cost$lambda) * reverse_unkl_cost(inp, out, method)
}
attr(unnerv_cost, "sneer_null_matrix") <- "null_model_weight"

unnerv_fg <- function(lambda = 0.5) {

  unkl <- unkl_fg()
  revunkl <- reverse_unkl_fg()

  list(
    fn = unnerv_cost,
    gr = function(inp, out, method) {
      method$cost$lambda * unkl$gr(inp, out, method) +
        (1 - method$cost$lambda) * revunkl$gr(inp, out, method)
    },
    point = function(inp, out, method) {
      method$cost$lambda * unkl$point(inp, out, method) +
        (1 - method$cost$lambda) * revunkl$point(inp, out, method)
    },
    lambda = lambda,
    name = "revNeRV",
    keep_weights = TRUE,
    replace_probs_with_weights = TRUE
  )
}

# NB un-normalized JSE expression simplifies to normalized version