# R/similarity-functions.R In sgibb/similarity: Similarity measurement for binary matrices.

#### Documented in similarity

## Copyright 2012 Sebastian Gibb
## <[email protected]>
##
## This is free software: you can redistribute it and/or modify
## the Free Software Foundation, either version 3 of the License, or
## (at your option) any later version.
##
## It is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
## GNU General Public License for more details.
##

#' Similarity measurement for binary matrices.
#'
#' This function calculates different similarity indices for a binary matrix.
#'
#' @param x a binary \code{\link{matrix}}. Please take care that rows contain
#'  the samples and columns the features.
#' @param method the similarity index to be used.
#'
#' @details
#'
#' \code{p} = number of binary variables, \eqn{n_{00} + n_{01} + n_{10} + n_{11}} \cr
#' \code{i} and \code{j} = two observations \cr
#' \code{n_{00}} = \eqn{\sum\limits_{k=1}^p I(x_{ik} = 0, x_{jk} = 0} \cr
#' \code{n_{01}} = \eqn{\sum\limits_{k=1}^p I(x_{ik} = 0, x_{jk} = 1} \cr
#' \code{n_{10}} = \eqn{\sum\limits_{k=1}^p I(x_{ik} = 1, x_{jk} = 0} \cr
#' \code{n_{11}} = \eqn{\sum\limits_{k=1}^p I(x_{ik} = 1, x_{jk} = 1} \cr
#'
#' Asymmetric indices:
#' \tabular{lr}{
#'  Soerensen/Dice (\code{soerensen-dice}) \tab
#'  \eqn{\frac{2n_{11}}{n_{01} + n_{10} + 2n_{11}}} \cr
#'
#'  Jaccard (\code{jaccard}) \tab
#'  \eqn{\frac{n_{11}}{n_{01} + n_{10} + n_{11}}} \cr
#' }
#'
#' Symmetric indices:
#' \tabular{lr}{
#'  Simple Matching (\code{simple}) \tab
#'  \eqn{\frac{n_{00} + n_{11}}{p}} \cr
#'
#'  Rogers and Tanimoto (\code{rogers-tanimoto}) \tab
#'  \eqn{\frac{n_{00} + n_{11}}{n_{00} + 2(n_{01} + n_{10}) + n_{11}}} \cr
#' }
#'
#' @return An object of class \emph{similarity}.
#'
#'  The lower triangle of the similarity matrix stored by columns in a
#'  vector.
#'
#'  \itemize{
#'    \item{\code{Size:} \code{integer}, number of observations.}
#'    \item{\code{Labels:} \code{character}, contains names of the observations.}
#'    \item{\code{diagonalValues:} \code{double}, diagonal values.}
#'    \item{\code{method:} \code{character}, the method used.}
#'    \item{\code{call:} \code{language}, the \code{call} used to create the object.}
#'  }
#'
#' @references
#'  Dice, L. R. (1945),
#'  \dQuote{Measures of the amount of ecological association between species.},
#'  \emph{Ecology} \bold{26}: 297-302. \cr
#'
#'  Jaccard, P. (1901),
#'  \dQuote{\'Etude comparative de la distribution florale dans une portion des Alpes et des Jura},
#'  \emph{Bulletin de la Soci\'et\'e Vaudoise des Sciences Naturelles} \bold{37}: 547-579. \cr
#'
#'  Rogers, D. J. and Tanimoto, T. T. (1960),
#'  \dQuote{A computer program for classifying plants.},
#'  \emph{Science} \bold{132}: 1115-1118. \cr
#'
#'  Sokal, R. R. and Michener, C. D. (1956),
#'  \dQuote{A statistical method for evaluating systematic relationships.},
#'  \emph{University of Kansas Science Bulletin} \bold{38}: 1409-1438. \cr
#'
#'  Soerensen, T. (1948),
#'  \dQuote{A method of establishing groups of equal amplitude in plant sociology based on similarity of species content.},
#'  \emph{Biologiske Skrifter} \bold{4}: 1-34. \cr
#'
#' @export
#' @examples
#' library("similarity")
#'
#' a <- matrix(c(1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1),
#'             ncol=3, nrow=4, byrow=TRUE)
#' similarity(a, "soerensen")
#'
#' @rdname similarity
#'
similarity <- function(x, method=c("soerensen-dice", "soerensen", "dice",
"jaccard", "simple", "simplematching",
"rogers", "tanimoto", "rogers-tanimoto")) {
method <- match.arg(method, c("soerensen-dice", "soerensen", "dice",
"jaccard", "simple", "simplematching",
"rogers", "tanimoto", "rogers-tanimoto"),
several.ok=FALSE)

stopifnot(is.binary.matrix(x))

## test rows for zeros
.filterZeroRows <- function(x, s, method, diagValue) {
if (any(rowSums(x) == 0)) {
warning("'x' has rows containing only zeros. ",
"It doesn't make sense to use ", sQuote(method), ".",
call.=FALSE)
s[row(s) == col(s)] <- diagValue
}
return(s)
}

rowNames <- row.names(x)
nr <- nrow(x)
nc <- ncol(x)

n11 <- tcrossprod(x)
n01 <- tcrossprod(1-x, x)
n10 <- tcrossprod(x, 1-x)
n00 <- nc - n11 - n01 - n10

s <- switch(method,
dice=,
soerensen=,
"soerensen-dice"={
s <- 2*n11/(n01+n10+2*n11)
.filterZeroRows(x, s, method=method, diagValue=1)
},
jaccard={
s <- n11/(n01+n10+n11)
.filterZeroRows(x, s, method=method, diagValue=1)
},
simple=,
simplematching={ (n00+n11)/(n00+n01+n10+n11) },
rogers=,
tanimoto=,
"rogers-tanimoto"={ (n00+n11)/(n00+2*(n01+n10)+n11) },
stop(sQuote(method), " is no suitable method!")
)

## store diagonale (only needed for as.matrix)
diag <- diag(s)

s <- s[lower.tri(s)]

attr(s, "Size") <- nr
attr(s, "Labels") <- rowNames
attr(s, "diagonalValues") <- diag
attr(s, "method") <- method
attr(s, "call") <- match.call()

class(s) <- "similarity"

return(s)
}

#' @method as.matrix similarity
#' @export
#'
as.matrix.similarity <- function(x, ...) {
n <- attr(x, "Size")
m <- matrix(0, n, n)

m[lower.tri(m)] <- x
m <- m + t(m)
m[row(m) == col(m)] <- attr(x, "diagonalValues")

rowNames <- attr(x, "Labels")

if(is.null(rowNames)) {
dimnames(m) <- list(seq_len(n), seq_len(n))
} else {
dimnames(m) <- list(rowNames, rowNames)
}

return(m)
}

#' @method print similarity
#' @export
#'
print.similarity <- function(x, ...) {
stats:::print.dist(x, ...)
}

sgibb/similarity documentation built on May 29, 2019, 8:04 p.m.