# Part of the philentropy package
#
# Copyright (C) 2015-2021 Hajk-Georg Drost
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# A copy of the GNU General Public License is available at
# http://www.r-project.org/Licenses/
#' @title Distances and Similarities between Probability Density Functions
#' @description This function computes the distance/dissimilarity between two probability density functions.
#' @param x a numeric \code{data.frame} or \code{matrix} (storing probability vectors) or a numeric \code{data.frame} or \code{matrix} storing counts (if \code{est.prob} is specified).
#' @param method a character string indicating the distance measure that should be computed.
#' @param p power of the Minkowski distance.
#' @param test.na a boolean value indicating whether input vectors should be tested for \code{NA} values. Faster computations if \code{test.na = FALSE}.
#' @param unit a character string specifying the logarithm unit that should be used to compute distances that depend on log computations.
#' @param epsilon a small value to address cases in the distance computation where division by zero occurs. In
#' these cases, x / 0 or 0 / 0 will be replaced by \code{epsilon}. The default is \code{epsilon = 0.00001}.
#' However, we recommend choosing a custom \code{epsilon} value depending on the size of the input vectors,
#' the expected similarity between the compared probability density functions, and
#' whether or not many 0 values are present within the compared vectors.
#' As a rough rule of thumb we suggest that when dealing with very large
#' input vectors which are very similar and contain many \code{0} values,
#' the \code{epsilon} value should be set even smaller (e.g. \code{epsilon = 0.000000001}),
#' whereas when vector sizes are small or distributions very divergent then
#' higher \code{epsilon} values may also be appropriate (e.g. \code{epsilon = 0.01}).
#' Addressing this \code{epsilon} issue is important to avoid cases where distance metrics
#' return negative values which are not defined and only occur due to the
#' technical issues of computing x / 0 or 0 / 0 cases (see the Examples section for an illustration).
#' @param est.prob method to estimate probabilities from input count vectors (i.e. non-probability vectors). Default: \code{est.prob = NULL}. Options are:
#' \itemize{
#' \item \code{est.prob = "empirical"}: The relative frequencies of each vector are computed internally. For example, an input matrix \code{rbind(1:10, 11:20)} will be transformed to the probability matrix \code{rbind(1:10 / sum(1:10), 11:20 / sum(11:20))}.
#' }
#' @param use.row.names a logical value indicating whether or not the row names of
#' the input matrix should be used as the row and column names of the output distance matrix. Default value is \code{use.row.names = FALSE}.
#' @param as.dist.obj a logical value indicating whether the returned value or matrix should be an object of class \code{\link[stats]{dist}}. Default is \code{as.dist.obj = FALSE}.
#' @param diag if \code{as.dist.obj = TRUE}, then this value indicates whether the diagonal of the distance matrix should be printed. Default is \code{diag = FALSE}.
#' @param upper if \code{as.dist.obj = TRUE}, then this value indicates whether the upper triangle of the distance matrix should be printed. Default is \code{upper = FALSE}.
#' @param mute.message a logical value indicating whether or not messages printed by \code{distance} shall be muted. Default is \code{mute.message = FALSE}.
#' @param num.threads an integer specifying the number of threads to be used for parallel computations. Default is taken from the \code{RCPP_PARALLEL_NUM_THREADS} environment variable, or \code{2} if not set.
#' @author Hajk-Georg Drost
#' @details
#' Here a distance is defined as a quantitative degree of how far apart two mathematical objects are from each other (Cha, 2007).
#'
#' This function implements the following distance/similarity measures to quantify the distance between probability density functions:
#'
#'
#' \itemize{
#' \item L_p Minkowski family
#' \itemize{
#' \item Euclidean : \eqn{d = sqrt( \sum | P_i - Q_i |^2)}
#' \item Manhattan : \eqn{d = \sum | P_i - Q_i |}
#' \item Minkowski : \eqn{d = ( \sum | P_i - Q_i |^p)^(1/p)}
#' \item Chebyshev : \eqn{d = max | P_i - Q_i |}
#' }
#'
#' \item L_1 family
#' \itemize{
#' \item Sorensen : \eqn{d = \sum | P_i - Q_i | / \sum (P_i + Q_i)}
#' \item Gower : \eqn{d = 1/n * \sum | P_i - Q_i |}, where \eqn{n} denotes the vector length
#' \item Soergel : \eqn{d = \sum | P_i - Q_i | / \sum max(P_i , Q_i)}
#' \item Kulczynski d : \eqn{d = \sum | P_i - Q_i | / \sum min(P_i , Q_i)}
#' \item Canberra : \eqn{d = \sum | P_i - Q_i | / (P_i + Q_i)}
#' \item Lorentzian : \eqn{d = \sum ln(1 + | P_i - Q_i |)}
#' }
#'
#'
#' \item Intersection family
#' \itemize{
#' \item Intersection : \eqn{s = \sum min(P_i , Q_i)}
#' \item Non-Intersection : \eqn{d = 1 - \sum min(P_i , Q_i)}
#' \item Wave Hedges : \eqn{d = \sum | P_i - Q_i | / max(P_i , Q_i)}
#' \item Czekanowski : \eqn{d = \sum | P_i - Q_i | / \sum | P_i + Q_i |}
#' \item Motyka : \eqn{d = \sum min(P_i , Q_i) / \sum (P_i + Q_i)}
#' \item Kulczynski s : \eqn{s = \sum min(P_i , Q_i) / \sum | P_i - Q_i |} ; reciprocal of Kulczynski d
#' \item Tanimoto : \eqn{d = \sum (max(P_i , Q_i) - min(P_i , Q_i)) / \sum max(P_i , Q_i)} ; equivalent to Soergel
#' \item Ruzicka : \eqn{s = \sum min(P_i , Q_i) / \sum max(P_i , Q_i)} ; equivalent to 1 - Tanimoto = 1 - Soergel
#' }
#'
#' \item Inner Product family
#' \itemize{
#' \item Inner Product : \eqn{s = \sum P_i * Q_i}
#' \item Harmonic mean : \eqn{s = 2 * \sum (P_i * Q_i) / (P_i + Q_i)}
#' \item Cosine : \eqn{s = \sum (P_i * Q_i) / ( sqrt(\sum P_i^2) * sqrt(\sum Q_i^2) )}
#' \item Kumar-Hassebrook (PCE) : \eqn{s = \sum (P_i * Q_i) / (\sum P_i^2 + \sum Q_i^2 - \sum (P_i * Q_i))}
#' \item Jaccard : \eqn{d = 1 - \sum (P_i * Q_i) / (\sum P_i^2 + \sum Q_i^2 - \sum (P_i * Q_i))} ; equivalent to 1 - Kumar-Hassebrook
#' \item Dice : \eqn{d = \sum (P_i - Q_i)^2 / (\sum P_i^2 + \sum Q_i^2)}
#' }
#'
#' \item Squared-chord family
#' \itemize{
#' \item Fidelity : \eqn{s = \sum sqrt(P_i * Q_i)}
#' \item Bhattacharyya : \eqn{d = - ln \sum sqrt(P_i * Q_i)}
#' \item Hellinger : \eqn{d = 2 * sqrt( 1 - \sum sqrt(P_i * Q_i))}
#' \item Matusita : \eqn{d = sqrt( 2 - 2 * \sum sqrt(P_i * Q_i))}
#' \item Squared-chord : \eqn{d = \sum ( sqrt(P_i) - sqrt(Q_i) )^2}
#' }
#'
#'
#' \item Squared L_2 family (\eqn{X^2} squared family)
#' \itemize{
#' \item Squared Euclidean : \eqn{d = \sum ( P_i - Q_i )^2}
#' \item Pearson \eqn{X^2} : \eqn{d = \sum ( (P_i - Q_i )^2 / Q_i )}
#' \item Neyman \eqn{X^2} : \eqn{d = \sum ( (P_i - Q_i )^2 / P_i )}
#' \item Squared \eqn{X^2} : \eqn{d = \sum ( (P_i - Q_i )^2 / (P_i + Q_i) )}
#' \item Probabilistic Symmetric \eqn{X^2} : \eqn{d = 2 * \sum ( (P_i - Q_i )^2 / (P_i + Q_i) )}
#' \item Divergence : \eqn{d = 2 * \sum ( (P_i - Q_i )^2 / (P_i + Q_i)^2 )}
#' \item Clark : \eqn{d = sqrt ( \sum (| P_i - Q_i | / (P_i + Q_i))^2 )}
#' \item Additive Symmetric \eqn{X^2} : \eqn{d = \sum ( ((P_i - Q_i)^2 * (P_i + Q_i)) / (P_i * Q_i) )}
#' }
#'
#' \item Shannon's entropy family
#' \itemize{
#' \item Kullback-Leibler : \eqn{d = \sum P_i * log(P_i / Q_i)}
#' \item Jeffreys : \eqn{d = \sum (P_i - Q_i) * log(P_i / Q_i)}
#' \item K divergence : \eqn{d = \sum P_i * log(2 * P_i / (P_i + Q_i))}
#' \item Topsoe : \eqn{d = \sum ( P_i * log(2 * P_i / (P_i + Q_i)) + Q_i * log(2 * Q_i / (P_i + Q_i)) )}
#' \item Jensen-Shannon :  \eqn{d = 0.5 * ( \sum P_i * log(2 * P_i / (P_i + Q_i)) + \sum Q_i * log(2 * Q_i / (P_i + Q_i)) )}
#' \item Jensen difference : \eqn{d = \sum ( (P_i * log(P_i) + Q_i * log(Q_i)) / 2 - ((P_i + Q_i) / 2) * log((P_i + Q_i) / 2) )}
#' }
#'
#' \item Combinations
#' \itemize{
#' \item Taneja : \eqn{d = \sum ( (P_i + Q_i) / 2 ) * log( (P_i + Q_i) / ( 2 * sqrt(P_i * Q_i) ) )}
#' \item Kumar-Johnson : \eqn{d = \sum (P_i^2 - Q_i^2)^2 / ( 2 * (P_i * Q_i)^1.5 )}
#' \item Avg(L_1, L_n) : \eqn{d = ( \sum | P_i - Q_i | + max | P_i - Q_i | ) / 2}
#' }
#'
#' In cases where \code{x} specifies a count matrix, the argument \code{est.prob} can be specified to first estimate probability vectors
#' from the input count vectors and then compute the corresponding distance measure based on the estimated probability vectors.
#'
#' The following probability estimation methods are implemented in this function:
#'
#' \itemize{
#' \item \code{est.prob = "empirical"} : relative frequencies of counts.
#' }
#' }
#' @examples
#' # Simple Examples
#'
#' # receive a list of implemented probability distance measures
#' getDistMethods()
#'
#' ## compute the euclidean distance between two probability vectors
#' distance(rbind(1:10/sum(1:10), 20:29/sum(20:29)), method = "euclidean")
#'
#' ## compute the euclidean distance between all pairwise comparisons of probability vectors
#' ProbMatrix <- rbind(1:10/sum(1:10), 20:29/sum(20:29),30:39/sum(30:39))
#' distance(ProbMatrix, method = "euclidean")
#'
#' # compute distance matrix without testing for NA values in the input matrix
#' distance(ProbMatrix, method = "euclidean", test.na = FALSE)
#'
#' # alternatively use the colnames of the input data for the rownames and colnames
#' # of the output distance matrix
#' ProbMatrix <- rbind(1:10/sum(1:10), 20:29/sum(20:29),30:39/sum(30:39))
#' rownames(ProbMatrix) <- paste0("Example", 1:3)
#' distance(ProbMatrix, method = "euclidean", use.row.names = TRUE)
#'
#' # Specialized Examples
#'
#' CountMatrix <- rbind(1:10, 20:29, 30:39)
#'
#' ## estimate probabilities from a count matrix
#' distance(CountMatrix, method = "euclidean", est.prob = "empirical")
#'
#' ## compute the euclidean distance for count data
#' ## NOTE: some distance measures are only defined for probability values;
#' ## for such measures, counts should first be converted via est.prob
#' distance(CountMatrix, method = "euclidean")
#'
#' ## compute the Kullback-Leibler Divergence with different logarithm bases:
#' ### case: unit = log (Default)
#' distance(ProbMatrix, method = "kullback-leibler", unit = "log")
#'
#' ### case: unit = log2
#' distance(ProbMatrix, method = "kullback-leibler", unit = "log2")
#'
#' ### case: unit = log10
#' distance(ProbMatrix, method = "kullback-leibler", unit = "log10")
#'
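#' # Further Examples
#'
#' ## The following calls are illustrative sketches that only use arguments
#' ## documented above; exact console output may vary between versions.
#'
#' ### return the result as a stats::dist object instead of a plain matrix
#' distance(ProbMatrix, method = "euclidean", as.dist.obj = TRUE)
#'
#' ### silence the diagnostic message printed by distance()
#' distance(ProbMatrix, method = "euclidean", mute.message = TRUE)
#'
#' ### use a smaller epsilon when comparing vectors that contain many zeros
#' ZeroMatrix <- rbind(c(0.5, 0.5, 0, 0), c(0.4, 0.4, 0.2, 0))
#' distance(ZeroMatrix, method = "kullback-leibler", epsilon = 0.000000001)
#'
#' ### run pairwise comparisons on two threads
#' distance(ProbMatrix, method = "euclidean", num.threads = 2)
#'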
#' @note According to the reference, invalid computations can occur in some
#' distance measures when dealing with 0 probabilities.
#'
#' In these cases the following conventions are applied:
#'
#' \itemize{
#' \item division by zero - case \code{0/0}: when the divisor and dividend become zero, \code{0/0} is treated as \code{0}.
#' \item division by zero - case \code{n/0}: when only the divisor becomes \code{0}, the corresponding \code{0} is replaced by a small \eqn{\epsilon = 0.00001}.
#' \item log of zero - case \code{0 * log(0)}: is treated as \code{0}.
#' \item log of zero - case \code{log(0)}: zero is replaced by a small \eqn{\epsilon = 0.00001}.
#' }
#'
#' @references Sung-Hyuk Cha. (2007). \emph{Comprehensive Survey on Distance/Similarity Measures between Probability Density Functions}. International Journal of Mathematical Models and Methods in Applied Sciences 1(4): 300-307.
#' @return
#' The following results are returned depending on the dimension of \code{x}:
#'
#' \itemize{
#' \item in case \code{nrow(x)} = 2 : a single distance value.
#' \item in case \code{nrow(x)} > 2 : a distance \code{matrix} storing distance values for all pairwise probability vector comparisons.
#' }
#' @seealso \code{\link{getDistMethods}}, \code{\link{estimate.probability}}, \code{\link{dist.diversity}}
#' @export
distance <- function(
x,
method = "euclidean",
p = NULL,
test.na = TRUE,
unit = "log",
epsilon = 0.00001,
est.prob = NULL,
use.row.names = FALSE,
as.dist.obj = FALSE,
diag = FALSE,
upper = FALSE,
mute.message = FALSE,
num.threads = NULL
) {
if (
!any(is.element(
class(x),
c(
"data.frame",
"matrix",
"data.table",
"tbl_df",
"tbl",
"array"
)
))
) {
stop(
"x should be a data.frame, data.table, tbl, tbl_df, array, or matrix.",
call. = FALSE
)
}
if (is.character(x)) {
stop(
paste0(
"Your input ",
class(x)[1],
" stores non-numeric values. Non-numeric values cannot be used to compute distances."
),
call. = FALSE
)
}
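  # retrieve the names of all distance/similarity measures implemented in this package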
dist_methods <- getDistMethods()
# transpose the matrix or data.frame
# in case of DF: DF is transformed to matrix by t()
x <- t(x)
if (!is.element(method, dist_methods)) {
stop(
"Method '",
method,
"' is not implemented in this function. Please consult getDistMethods().",
call. = FALSE
)
}
# number of input probability vectors
ncols <- ncol(x)
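  # if requested, estimate probability vectors from the input count vectors
  # (column-wise, since x was transposed above)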
if (!is.null(est.prob)) {
x <- apply(x, 2, estimate.probability, method = est.prob)
}
if (!is.element(unit, c("log", "log2", "log10"))) {
stop("You can only choose units: log, log2, or log10.", call. = FALSE)
}
message("Metric: '",method, "' with unit: '", unit,
"'; comparing: ", ncols, " vectors")
# although validation would be great, it cost a lot of computation time
# for large comparisons between multiple distributions
# here a smarter (faster) way to validate distributions needs to be implemented
# if(check.distr){
# # check for distribution validity
# apply(x,2,valid.distr)
# }
dist <- NULL
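  # for exactly two input vectors, compute a single distance value directly;
  # otherwise delegate all pairwise comparisons to the C++ backend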
if (ncols == 2) {
dist <- dist_one_one(
as.numeric(x[, 1]),
as.numeric(x[, 2]),
method,
p,
test.na,
unit,
epsilon
)
} else {
dist <- distance_cpp(x, method, p, test.na, unit, epsilon, num.threads)
}
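  # label the result: a named scalar for two vectors,
  # row/column names for a pairwise distance matrix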
if (ncols == 2) {
names(dist) <- method
} else {
if (!use.row.names) {
colnames(dist) <- paste0("v", seq_len(ncols))
rownames(dist) <- paste0("v", seq_len(ncols))
}
if (use.row.names) {
colnames(dist) <- colnames(x)
rownames(dist) <- colnames(x)
}
}
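  # optionally convert the result into a stats::dist object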
  if (!as.dist.obj) {
    return(dist)
  } else {
if (ncols == 2) {
dist <- stats::as.dist(
matrix(c(0, dist, dist, 0), nrow = 2),
diag = diag,
upper = upper
)
} else {
dist <- stats::as.dist(dist, diag = diag, upper = upper)
}
attr(dist, "method") <- method
return(dist)
}
}