R/nearest_point.R

Defines functions nearest_point nearest_profile get_journal_IDs multinomial_mix cosine_similarity dissimilarity

Documented in cosine_similarity dissimilarity get_journal_IDs multinomial_mix nearest_point nearest_profile

#' Calculate nearest point in convex hull
#'
#' Given a citation matrix and a community structure, `nearest_point` calculates the point
#' in the convex hull of community profiles that is nearest to a given journal profile.
#'
#' This function uses quadratic programming to calculate the closest point by Euclidean distance.
#' The `citations` matrix should be arranged so that citations are directed from columns to rows.
#' If `idx` is a name, it should correspond to a row name from `citations`.
#'
#' The function `nearest_profile` is a shorthand way to calculate the *profile* corresponding
#' to the nearest point.
#'
#' @param idx A journal name or index. Vectorised for `nearest_profile()`
#' @param citations a matrix of citations (from columns to rows) or an [igraph][igraph::igraph] object
#' @param communities A membership vector or [igraph::communities] object
#' @inheritParams cprofile
#'
#' @return
#' `nearest_point` returns the list generated by [quadprog::solve.QP()].
#'
#' @examples
#' counts <- citations[1:6, 1:6]
#' comms <- setNames(c(1, 2, 3, 2, 2, 4), colnames(counts))
#' nearest_point('AoS', counts, comms) # Inside hull (value == 0), because AoS is itself a community.
#' nearest_point('ANZS', counts, comms) # Outside hull, near to its own community.
#'
#' # To which cluster should 'Biometrika' belong?
#' distances <- as.dist(1 - cor(citations + t(citations)))
#' clusters <- cutree(hclust(distances), h = 0.8)
#' result <- nearest_point('Bka', citations, clusters)
#' # Verify Euclidean distance is calculated correctly:
#' point <- community_profile(citations, clusters) %*% result$solution
#' result$value %==% sum((citations[, 'Bka'] / sum(citations[, 'Bka']) - point)^2)
#'
#' @seealso
#' [quadprog::solve.QP()], [community_profile()]
#'
#' @importFrom quadprog solve.QP
#' @export
nearest_point <- function(idx, citations, communities, self = TRUE) {
  stopifnot(length(idx) == 1) # May vectorise later
  C <- community_profile(citations, communities, self = self)
  g <- ncol(C) # number of communities
  P <- cprofile(citations, self = self)
  D <- t(C) %*% C
  d <- P[, idx] %*% C
  A <- t(rbind(rep(1, g),
               diag(g)))
  b <- c(1, rep(0, g))

  soln <- quadprog::solve.QP(Dmat = D, dvec = d, Amat = A, bvec = b, meq = 1)
  soln$value <- base::drop(2 * soln$value + t(P[, idx]) %*% P[, idx]) # re-add constant term p^T p
  soln$cosine <- 1 - soln$value / 2
  soln
}

#' Calculate nearest point by cosine similarity
#'
#' Applying EM to each individual point. Like a separate mixture model for every journal.
#'
#' Find the nearest [citation profile][cprofile()] to `x` that is a convex combination of
#' the community profiles `y`.
#'
#' @inheritParams nearest_point
#'
#' @details
#' In order for an \eqn{n}-vector to be a [citation profile][cprofile()],
#'   the elements must be non-negative and sum to one.
#' This is also true for any convex combination (finite mixture distribution) of citation profiles.
#'
#' Geometrically, this represents a point on part of the surface of the unit \eqn{n-1}-sphere
#'   that is within the positive \href{https://en.wikipedia.org/wiki/Orthant}{closed orthant}
#'   in \eqn{R^n}.
#' In a 3-journal network, this corresponds to the eighth of the unit sphere in the first octant and
#'   in a 2-journal network, the quarter of the unit circle in the first quadrant.
#'
#' The largest possible angle between two valid citation profiles is
#'   \eqn{\theta = \frac{\pi}{2}}{\theta = \pi/2}, with cosine similarity
#'   \eqn{\cos\theta = 0}{cos \theta = 0}.
#'   For example, the most well-separated profiles in 2-d are \eqn{(1, 0)} and \eqn{(0, 1)} ---
#'   i.e. the \eqn{x}-axis and the \eqn{y}-axis.
#'
#' @return
#' An object exactly like that returned by [nearest_point()].
#' Use `$cosine` to extract the cosine similarity.
#'
#' @seealso [nearest_point()], [cosine_similarity()]
#'
#' @export
nearest_cosine <- nearest_point

#' @rdname nearest_point
#' @return `nearest_profile()` returns a stochastic vector whose length is equal
#'   to the number of rows/vertices in `citations`. It is also vectorised, so if
#'   `idx` has length > 1, a matrix will be returned instead.
#' @export
nearest_profile <- function(idx = NULL, citations, communities, self = TRUE) {
  idx <- get_journal_IDs(idx, citations)
  cp <- community_profile(citations, communities)
  vapply(idx,
         function(i) {
           drop(as.matrix(
            cp %*% nearest_point(i, citations, communities, self)$solution
           ))
           },
         FUN.VALUE = numeric(nrow(cp))
         )
}

#' Get all journal IDs if none specified
#'
#' If `idx` is `NULL` then the function will attempt to select *all* journals.
#' @inheritParams nearest_point
#' @return A vector of journal names
get_journal_IDs <- function(idx = NULL, citations) {
  if (!is.null(idx))
    return(idx)
  if (inherits(citations, 'igraph'))
    return(V(citations)$name)
  rownames(citations)
}

#' Multinomial mixture probabilities
#'
#' This is equivalent to performing the initial "E" step in expectation maximisation (EM)
#' for a multinomial mixture model.
#'
#' Here we are effectively computing the nearest point in the convex hull of community profiles,
#' implicitly using Kullback--Leibler divergence as the distance measure.
#' Alternative distance measures may be used; see [nearest_point()] and others.
#'
#' @seealso [nearest_point()]
#'
#' @param target A vector of citations to be compared
#' @param profiles A matrix of [community_profile()]s
#'
#' @return
#' A vector of log-probabilities that each community generated `target`'s citation profile.
#'
#' @examples
#' # To which cluster should 'Biometrika' belong?
#' distances <- as.dist(1 - cor(citations + t(citations)))
#' clusters <- cutree(hclust(distances), h = 0.8)
#' profiles <- community_profile(citations, clusters)
#' Biometrika <- citations[, 'Bka']
#' w <- multinomial_mix(Biometrika, profiles)
#' which.max(w) == clusters['Bka']
#' profiles %*% exp(w) # nearest point
#'
#' @importFrom stats dmultinom
#'
#' @export
multinomial_mix <- function(target, profiles) {
  # Calculate log density of each multinomial model
  logf <- apply(profiles * sum(target), 2, function(j) dmultinom(target, prob = j, log = TRUE))
  # Safely normalise the densities using the 'log-sum-exp' trick
  a <- max(logf)
  logsumf <- a + log(sum(exp(logf - a)))
  logf - logsumf # = log(f / sum(f))
}

#' Cosine similarity between two vectors
#'
#' This function computes the *cosine similarity* between two vectors, defined by
#' \deqn{\cos(\theta) = \frac{x \cdot y}{\|x\|_2\|y\|_2}.}{cos(\theta) = (x . y) / (|x| |y|).}
#'
#' Cosine similarity is related to Euclidean distance by
#' \deqn{\|x - y \|^2 = 2(1 - \cos(\theta)).}{|x - y|^2 = 2(1 - cos(\theta)).}
#' So \deqn{\cos(\theta) = 1 - \frac12\|x - y\|^2,}{cos(\theta) = 1 - 0.5 * |x - y|^2,}
#' assuming `x` and `y` have been normalised to be unit vectors.
#' Therefore, if we want to *maximise* cosine similarity, we can minimise Euclidean distance
#' and then make the conversion. See [nearest_point()].
#'
#' @param x A numeric vector
#' @param y A numeric vector, the same length as `x`
#'
#' @return
#' Generally, a scalar between \eqn{-1} and \eqn{1}.
#' Or, if `x` and `y` are non-negative, a value between \eqn{0} and \eqn{1}.
#'
#' @references
#' \url{https://en.wikipedia.org/wiki/Cosine_similarity}
#'
#' @seealso [nearest_cosine()], [cos()], [acos()]
#'
#' @export
cosine_similarity <- function(x, y) {
  x %*% y / sqrt(x %*% x * y %*% y)
}

#' Index of dissimilarity
#'
#' @param x A numeric vector
#' @param y A numeric vector, the same length as `x`
#'
#' @return A number between \eqn{0} and \eqn{1}. Lower is better.
#'
#' @references
#' Kuha, Jouni and Firth, David (2011).
#' On index of dissimilarity for lack of fit in loglinear and log-multiplicative models.
#' *Computational Statistics \& Data Analysis*, 55(1):375--388.
#'
#' @seealso [profile_residuals()]
#'
#' @export
dissimilarity <- function(x, y) {
  stopifnot(sum(x) == sum(y))
  stopifnot(length(x) == length(y))
  sum(abs(x - y)) / 2 / sum(x)
}
Selbosh/scrooge documentation built on May 5, 2019, 8 p.m.