library(tidyverse)
library(collapse)
library(onezero)
tibble(foo, bar) %>%
mutate(across(.fns = ~as.numeric(as.character(.x)))) %>%
pairwise_union(cols = everything())
N <- 20
set.seed(1)
foo <- factor(rbinom(N, 1, 0.3), levels = c(1, 0))
foo[3] <- NA
bar <- factor(rbinom(N, 1, 0.3), levels = c(1, 0))
w1 <- rnorm(N, 1, 0.15)
w2 <- rep(1, times = N)
xx <- qtab(foo, bar, w = w2)
# a b
# c d
# 1/1 1/0
# 0/1 0/0
conditional <- function(x, condition.on = "row") {
a <- x[1, 1]
b <- x[1, 2]
c <- x[2, 1]
# d <- x[2, 2]
ss <- sum(x)
if (condition.on == "row") {
return(a / (a + b))
} else if (condition.on == "col") {
return(a / (a + c))
} else {
rlang::abort("Input to `condition.on` must be one of `row` or `col`.")
}
}
conditional(xx, condition.on = "row")
conditional(xx, condition.on = "col")
intersection <- function(x) {
# Alias russel_rao, joint
a <- x[1, 1]
# b <- x[1, 2]
# c <- x[2, 1]
# d <- x[2, 2]
ss <- sum(x)
a / ss
}
intersection(xx)
union <- function(x) {
a <- x[1, 1]
b <- x[1, 2]
c <- x[2, 1]
# d <- x[2, 2]
ss <- sum(x)
(a + b + c) / ss
}
union(xx)
jaccard <- function(x, distance = FALSE) {
# This is an index in which joint absences are excluded
# from consideration. Equal weight is given to matches and
# nonmatches. Also known as the similarity ratio.
a <- x[1, 1]
b <- x[1, 2]
c <- x[2, 1]
# d <- x[2, 2]
# ss <- sum(x)
out <- a / (a + b + c)
if (distance) {
return(1 - out)
} else {
return(out)
}
}
jaccard(xx, distance = TRUE)
tibble(foo, bar) %>%
map_df(~as.numeric(as.character(.x))) %>%
as.matrix() %>%
t() %>%
dist(method = "binary")
dice <- function(x) {
# This is an index in which joint absences are excluded
# from consideration, and matches are weighted double.
# Also known as the Czekanowski or Sorensen measure.
a <- x[1, 1]
b <- x[1, 2]
c <- x[2, 1]
# d <- x[2, 2]
# ss <- sum(x)
(2 * a) / (2 * a + (b + c))
}
dice(xx)
ochiai <- function(x) {
# This index is the binary form of the cosine similarity measure.
# It has a range of 0 to 1.
a <- x[1, 1]
b <- x[1, 2]
c <- x[2, 1]
# d <- x[2, 2]
# ss <- sum(x)
a / sqrt((a + b) * (a + c))
}
ochiai(xx)
simple_match <- function(x) {
# This is the ratio of matches to the total number of values.
# Equal weight is given to matches and nonmatches.
a <- x[1, 1]
# b <- x[1, 2]
# c <- x[2, 1]
d <- x[2, 2]
ss <- sum(x)
(a + d) / ss
}
simple_match(xx)
rogers_tanimoto <- function(x) {
a <- x[1, 1]
b <- x[1, 2]
c <- x[2, 1]
d <- x[2, 2]
# ss <- sum(x)
# https://www.scielo.org.mx/pdf/cys/v20n3/1405-5546-cys-20-03-00345.pdf
(a + d) / (a + 2 * (b + c) + d)
}
rogers_tanimoto(xx)
sokal_sneath1 <- function(x) {
a <- x[1, 1]
b <- x[1, 2]
c <- x[2, 1]
# d <- x[2, 2]
# ss <- sum(x)
# This is an index in which double weight is given to matches.
a / (a + 2 * (b + c))
}
sokal_sneath1(xx)
sokal_sneath2 <- function(x) {
# This is an index in which double weight is given to
# nonmatches, and joint absences are excluded from consideration.
a <- x[1, 1]
b <- x[1, 2]
c <- x[2, 1]
d <- x[2, 2]
# ss <- sum(x)
0.25 * (a / (a + b) + a / (a + c) + d / (b + d) + d / (c + d))
}
sokal_sneath2(xx)
sokal_sneath3 <- function(x) {
# This is the ratio of matches to nonmatches. This index has
# a lower bound of 0 and is unbounded above. It is
# theoretically undefined when there are no nonmatches;
# however, Distances assigns an arbitrary value of 9999.999
# when the value is undefined or is greater than this value.
a <- x[1, 1]
b <- x[1, 2]
c <- x[2, 1]
d <- x[2, 2]
# ss <- sum(x)
a * d / sqrt((a + b) * (a + c) * (d + b) * (d + c))
}
sokal_sneath3(xx)
sokal_sneath4 <- function(x) {
# This index is based on the conditional probability
# that the characteristic in one item matches the value
# in the other. The separate values for each item acting
# as predictor of the other are averaged to compute this value.
a <- x[1, 1]
b <- x[1, 2]
c <- x[2, 1]
d <- x[2, 2]
# ss <- sum(x)
(a + d)/(b + c)
}
sokal_sneath4(xx)
sokal_sneath5 <- function(x) {
# This index is the squared geometric mean of conditional
# probabilities of positive and negative matches. It is
# independent of item coding. It has a range of 0 to 1.
a <- x[1, 1]
b <- x[1, 2]
c <- x[2, 1]
d <- x[2, 2]
# ss <- sum(x)
2 * (a + d) / (2 * (a + d) + b + c)
}
sokal_sneath5(xx)
hamann <- function(x) {
# This index is the number of matches minus the number
# of nonmatches, divided by the total number of items.
# It ranges from -1 to 1.
a <- x[1, 1]
b <- x[1, 2]
c <- x[2, 1]
d <- x[2, 2]
ss <- sum(x)
((a + d) - (b + c)) / ss
}
hamann(xx)
goodman_kruskal_lambda <- function(x) {
# This index is Goodman and Kruskal's lambda.
# Corresponds to the proportional reduction of error (PRE)
# using one item to predict the other (predicting in both
# directions). Values range from 0 to 1.
# This one, like conditional probability, depends on the
# direction.
}
yules_y <- function(x) {
# This index is a function of the cross-ratio for a 2 x 2
# table, and is independent of the marginal totals. It has
# a range of -1 to 1. Also known as the coefficient of
# colligation.
# num <- sqrt(a * d) - sqrt(b * c)
# den <- sqrt(a * d) + sqrt(b * c)
# num / den
}
yules_q <- function(x) {
# This index is a special case of Goodman and Kruskal's
# gamma. It is a function of the cross-ratio and is
# independent of the marginal totals. It has a range of
# -1 to 1.
# Y <- yules_y()
# q <- 2Y / (1 + Y^2)
}
phi <- function(x) {
# https://en.wikipedia.org/wiki/Phi_coefficient
a <- x[1, 1]
b <- x[2, 1]
c <- x[1, 2]
d <- x[2, 2]
# ss <- sum(x)
((a*d)-(c*b))/sqrt((a+c)*(c+d)*(a+b)*(b+d))
}
phi(xx)
kulczynski_1 <- function(x) {
# This is the ratio of joint presences to all nonmatches.
# This index has a lower bound of 0 and is unbounded above.
# It is theoretically undefined when there are no nonmatches;
# however, Distances assigns an arbitrary value of 9999.999
# when the value is undefined or is greater than this value.
# !!!!!
# a / (b + c)
}
kulczynski_2 <- function(x) {
# This index is based on the conditional probability that
# the characteristic is present in one item, given that it
# is present in the other. The separate values for each item
# acting as predictor of the other are averaged to compute this
# value.
# 0.5 * (a / (a + b) + a / (a + c))
}
faith <- function(x) {
(a + d/2) / ss
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.