#' Calculates the perplexity and affinity for a single point at a given precision
#'
#' This function is equivalent to the Python \code{get_perplexity()}. It takes
#' the distances from one point to all others in the dataset and calculates the
#' affinites for that point at a specific precision.
#'
#' @param d A vector of distances from one point to all the rest
#'
#' @param beta The precision of the gaussian distribution used
#'
#' @return A list with the following entries:
#' \describe{
#' \item{A}{A vector of affinities to the points given in d}
#' \item{H}{The perplexity of the point}
#' }
#'
#' @noRd
#'
getPerplexity <- function(d, beta) {
A <- exp(-d*beta)
H <- log(sum(A)) + beta*sum(d*A)/sum(A)
list(A=A, H=H)
}
#' Calculates the affinites for a given distance matrix
#'
#' This function is equivalent to the Python \code{d2a()}. It takes a distance
#' matrix and a desired perplexity (optionally a tolerance) and calculates the
#' affinities for all points by individually adjusting the gaussian kernel to
#' satisfy the desired perplexity for each point.
#'
#' @param dist A distance matrix; usually as returned by \code{\link[stats]{dist}}
#'
#' @param perplexity The desired perplexity
#'
#' @param tolerance The acceptable difference between the obtained and desired
#' perplexity
#'
#' @return A matrix matching the dimensions of dist with the affinities for each
#' point. Contrary to dist is it not symmetrical. It should be read row-wise in
#' the sense that at row i you will have sample i's affinity to all other
#' points. Conversily at column i you will have al other points affinity toward
#' sample i.
#'
#' @noRd
#'
affinity <- function(dist, perplexity, tolerance=1e-5) {
maxIter <- 50
dist <- as.matrix(dist)
n <- nrow(dist)
ans <- matrix(0, ncol=n, nrow=n)
beta <- rep(1, n)
logU <- log(perplexity)
for(i in 1:n) {
Di <- dist[i, -i]
betamin = -Inf
betamax = Inf
perp <- getPerplexity(Di, beta[i])
Hdiff <- perp$H - logU
tries <- 0
while(is.nan(Hdiff) || (abs(Hdiff) > tolerance && tries < maxIter)) {
if(is.nan(Hdiff)) {
beta[i] <- beta[i]/10
} else if(Hdiff > 0) {
betamin <- beta[i]
if(betamax == Inf || betamax == -Inf) {
beta[i] <- beta[i] * 2
} else {
beta[i] <- (beta[i]+betamax) / 2
}
} else {
betamax <- beta[i]
if(betamin == Inf || betamin == -Inf) {
beta[i] <- beta[i] / 2
} else {
beta[i] <- (beta[i]+betamin) / 2
}
}
perp <- getPerplexity(Di, beta[i])
Hdiff <- perp$H - logU
tries <- tries + 1
}
ans[i, -i] <- perp$A
}
ans
}
#' Calculate the binding probabilities given an affinity matrix
#'
#' This function is equivalent to the Python \code{a2b()}. It calculates the
#' probability that a point binds to another point in a stochastic neighbor
#' graph. The binding probability is proportional to the affinity and is in fact
#' just the affinity normalised to the sum of all affinities for a point.
#'
#' @param affinity An affinity matrix as returned by \code{\link{affinity}}
#'
#' @return A matrix matching the dimensions of affinity, with the binding
#' probabilities for each directed edge
#'
#' @noRd
#'
bindingProb <- function(affinity) {
affinity/apply(affinity, 1, sum)
}
#' Calculate the outlier probabilities given a binding probabilty matrix
#'
#' This function is equivalent to the Python \code{b2o()}. It calculates for
#' each point the probabilty that its in-degree in a stochastic neighbor graph
#' is 0.
#'
#' @param bindingP A matrix with binding probabilities as returned by
#' \code{\link{bindingProb}}
#'
#' @return A vector of length equal to the number of rows in bindingP, with the
#' outlier probability for each point.
#'
#' @noRd
#'
outlierProb <- function(bindingP) {
apply(1-bindingP, 2, prod)
}
#' Perform stochastic outlier selection on a distance matrix
#'
#' This function calculates for each point in a distance matrix its probabilty
#' of being an outlier, given a certain perplexity. The perplexity parameter can
#' be seen as the size of the neighborhood taken into account when assessing the
#' outlierness of a given point, though contrary to k in KNNDD and LOF it does
#' not have to be an integer but can take any positive value (should be lower
#' than the number of points though). By setting a probability threshold the
#' output can be converted to a hard outlier classification.
#'
#' @param dist A distance matrix or a dist object as returned by \code{\link[stats]{dist}}
#'
#' @param perplexity The size of the neighborhood to be considered
#'
#' @return A list with the following entries:
#' \describe{
#' \item{outlierProbabilty}{A vector of outlier probabilities}
#' \item{bindingProbability}{A matrix with the binding probabilities between all point}
#' }
#'
#' @examples
#' irisDist <- dist(iris[, 1:4])
#' irisSos <- sos(irisDist, 65)
#' irisMDS <- cmdscale(irisDist)
#' plot(irisMDS, pch=19, col=ifelse(irisSos$outlierProbabilty > 0.55, 'red', 'black'))
#'
#' @export
#'
sos <- function(dist, perplexity) {
A <- affinity(dist, perplexity)
B <- bindingProb(A)
O <- outlierProb(B)
list(outlierProbabilty=O, bindingProbability=B)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.