test-code/internal-mets.R

library(tidyverse)
library(collapse)
library(onezero)

tibble(foo, bar) %>%
    mutate(across(.fns = ~as.numeric(as.character(.x)))) %>%
    pairwise_union(cols = everything())

N <- 20

set.seed(1)
foo <- factor(rbinom(N, 1, 0.3), levels = c(1, 0))
foo[3] <- NA
bar <- factor(rbinom(N, 1, 0.3), levels = c(1, 0))
w1 <- rnorm(N, 1, 0.15)
w2 <- rep(1, times = N)

xx <- qtab(foo, bar, w = w2)


# a b
# c d

# 1/1 1/0
# 0/1 0/0

conditional <- function(x, condition.on = "row") {

    a <- x[1, 1]
    b <- x[1, 2]
    c <- x[2, 1]
    # d <- x[2, 2]
    ss <- sum(x)

    if (condition.on == "row") {
        return(a / (a + b))
    } else if (condition.on == "col") {
        return(a / (a + c))
    } else {
        rlang::abort("Input to `condition.on` must be one of `row` or `col`.")
    }

}


conditional(xx, condition.on = "row")
conditional(xx, condition.on = "col")




intersection <- function(x) {

    # Alias russel_rao, joint

    a <- x[1, 1]
    # b <- x[1, 2]
    # c <- x[2, 1]
    # d <- x[2, 2]
    ss <- sum(x)

    a / ss

}

intersection(xx)



union <- function(x) {

    a <- x[1, 1]
    b <- x[1, 2]
    c <- x[2, 1]
    # d <- x[2, 2]
    ss <- sum(x)

    (a + b + c) / ss
}

union(xx)

jaccard <- function(x, distance = FALSE) {

    # This is an index in which joint absences are excluded
    # from consideration. Equal weight is given to matches and
    # nonmatches. Also known as the similarity ratio.

    a <- x[1, 1]
    b <- x[1, 2]
    c <- x[2, 1]
    # d <- x[2, 2]
    # ss <- sum(x)

    out <- a / (a + b + c)

    if (distance) {
        return(1 - out)
    } else {
        return(out)
    }

}

jaccard(xx, distance = TRUE)

tibble(foo, bar) %>%
    map_df(~as.numeric(as.character(.x))) %>%
    as.matrix() %>%
    t() %>%
    dist(method = "binary")


dice <- function(x) {

    # This is an index in which joint absences are excluded
    # from consideration, and matches are weighted double.
    # Also known as the Czekanowski or Sorensen measure.

    a <- x[1, 1]
    b <- x[1, 2]
    c <- x[2, 1]
    # d <- x[2, 2]
    # ss <- sum(x)

    (2 * a) / (2 * a + (b + c))

}

dice(xx)



ochiai <- function(x) {

    # This index is the binary form of the cosine similarity measure.
    # It has a range of 0 to 1.

    a <- x[1, 1]
    b <- x[1, 2]
    c <- x[2, 1]
    # d <- x[2, 2]
    # ss <- sum(x)

    a / sqrt((a + b) * (a + c))

}

ochiai(xx)

simple_match <- function(x) {

    # This is the ratio of matches to the total number of values.
    # Equal weight is given to matches and nonmatches.

    a <- x[1, 1]
    # b <- x[1, 2]
    # c <- x[2, 1]
    d <- x[2, 2]
    ss <- sum(x)

    (a + d) / ss
}

simple_match(xx)


rogers_tanimoto <- function(x) {

    a <- x[1, 1]
    b <- x[1, 2]
    c <- x[2, 1]
    d <- x[2, 2]
    # ss <- sum(x)

    # https://www.scielo.org.mx/pdf/cys/v20n3/1405-5546-cys-20-03-00345.pdf
    (a + d) / (a + 2 * (b + c) + d)

}

rogers_tanimoto(xx)

sokal_sneath1 <- function(x) {

    a <- x[1, 1]
    b <- x[1, 2]
    c <- x[2, 1]
    # d <- x[2, 2]
    # ss <- sum(x)

    # This is an index in which double weight is given to matches.

    a / (a + 2 * (b + c))
}

sokal_sneath1(xx)

sokal_sneath2 <- function(x) {

    # This is an index in which double weight is given to
    # nonmatches, and joint absences are excluded from consideration.

    a <- x[1, 1]
    b <- x[1, 2]
    c <- x[2, 1]
    d <- x[2, 2]
    # ss <- sum(x)

    0.25 * (a / (a + b) + a / (a + c) + d / (b + d) + d / (c + d))
}

sokal_sneath2(xx)

sokal_sneath3 <- function(x) {

    # This is the ratio of matches to nonmatches. This index has
    # a lower bound of 0 and is unbounded above. It is
    # theoretically undefined when there are no nonmatches;
    # however, Distances assigns an arbitrary value of 9999.999
    # when the value is undefined or is greater than this value.

    a <- x[1, 1]
    b <- x[1, 2]
    c <- x[2, 1]
    d <- x[2, 2]
    # ss <- sum(x)

    a * d / sqrt((a + b) * (a + c) * (d + b) * (d + c))

}


sokal_sneath3(xx)

sokal_sneath4 <- function(x) {

    # This index is based on the conditional probability
    # that the characteristic in one item matches the value
    # in the other. The separate values for each item acting
    # as predictor of the other are averaged to compute this value.

    a <- x[1, 1]
    b <- x[1, 2]
    c <- x[2, 1]
    d <- x[2, 2]
    # ss <- sum(x)

    (a + d)/(b + c)
}

sokal_sneath4(xx)

sokal_sneath5 <- function(x) {

    # This index is the squared geometric mean of conditional
    # probabilities of positive and negative matches. It is
    # independent of item coding. It has a range of 0 to 1.

    a <- x[1, 1]
    b <- x[1, 2]
    c <- x[2, 1]
    d <- x[2, 2]
    # ss <- sum(x)

    2 * (a + d) / (2 * (a + d) + b + c)
}

sokal_sneath5(xx)

hamann <- function(x) {

    # This index is the number of matches minus the number
    # of nonmatches, divided by the total number of items.
    # It ranges from -1 to 1.

    a <- x[1, 1]
    b <- x[1, 2]
    c <- x[2, 1]
    d <- x[2, 2]
    ss <- sum(x)

    ((a + d) - (b + c)) / ss

}

hamann(xx)

goodman_kruskal_lambda <- function(x) {

    # This index is Goodman and Kruskal's lambda.
    # Corresponds to the proportional reduction of error (PRE)
    # using one item to predict the other (predicting in both
    # directions). Values range from 0 to 1.

    # This one, like conditional probability, depends on the
    # direction.

}

yules_y <- function(x) {

    # This index is a function of the cross-ratio for a 2 x 2
    # table, and is independent of the marginal totals. It has
    # a range of -1 to 1. Also known as the coefficient of
    # colligation.

    # num <- sqrt(a * d) - sqrt(b * c)
    # den <- sqrt(a * d) + sqrt(b * c)
    # num / den

}

yules_q <- function(x) {

    # This index is a special case of Goodman and Kruskal's
    # gamma. It is a function of the cross-ratio and is
    # independent of the marginal totals. It has a range of
    # -1 to 1.

    # Y <- yules_y()
    # q <- 2Y / (1 + Y^2)

}

phi <- function(x) {

    # https://en.wikipedia.org/wiki/Phi_coefficient

    a <- x[1, 1]
    b <- x[2, 1]
    c <- x[1, 2]
    d <- x[2, 2]
    # ss <- sum(x)

    ((a*d)-(c*b))/sqrt((a+c)*(c+d)*(a+b)*(b+d))


}

phi(xx)



kulczynski_1 <- function(x) {

    # This is the ratio of joint presences to all nonmatches.
    # This index has a lower bound of 0 and is unbounded above.
    # It is theoretically undefined when there are no nonmatches;
    # however, Distances assigns an arbitrary value of 9999.999
    # when the value is undefined or is greater than this value.

    # !!!!!
    # a / (b + c)
}

kulczynski_2 <- function(x) {

    # This index is based on the conditional probability that
    # the characteristic is present in one item, given that it
    # is present in the other. The separate values for each item
    # acting as predictor of the other are averaged to compute this
    # value.

    # 0.5 * (a / (a + b) + a / (a + c))

}


faith <- function(x) {

    (a + d/2) / ss
}
ttrodrigz/onezero documentation built on May 9, 2023, 2:59 p.m.