test-code/new-metrics.R

# Many more distance matrices to choose from
# Really need to consider using pairwise crosstabs via xtabs
# - verified below that values returned are identical
# - need to speed test, if faster or not much slower then consider switching
# - then implement other metrics (see sequentix, )
# https://www.ibm.com/docs/en/spss-statistics/SaaS?topic=measures-distances-similarity-binary-data
# https://stats.stackexchange.com/questions/61705/similarity-coefficients-for-binary-data-why-choose-jaccard-over-russell-and-rao
# https://www.sequentix.de/gelquest/help/distance_measures.htm

library(tidyverse)
library(onezero)
library(microbenchmark)

library(collapse)


ss <- 100000
a <- factor(x = rbinom(ss, 1, 0.2))
b <- factor(x = rbinom(ss, 1, 0.2))
w <- rnorm(ss, 1, 0.15)

xtabs(w ~ a + b)

qtab(a, b, w = w)

microbenchmark(
    a = xtabs(w ~ a + b),
    b = qtab(a, b, w = w),
    times = 50
)


x <-
    FoodSample %>%
    select(Bisque, Duck, weight)

y <- with(x, table(factor(Bisque, 1:0), factor(Duck, 1:0)))
a <- y[1, 1]
b <- y[1, 2]
c <- y[2, 1]
d <- y[2, 2]


pairwise_conditional(x, Bisque:Duck, tidy = F)
a / (a + b)
a / (a + c)

pairwise_intersection(x, Bisque:Duck, tidy = F)
pi <- a / sum(y)
pi

pairwise_union(x, Bisque:Duck, tidy = F)
pu <- (a + b + c) / sum(y)
pu

pairwise_jaccard(x, Bisque:Duck, stat = "index", tidy = F)
a / (a+b+c)
pi / pu



# Weighted --

z <- xtabs(weight ~ Bisque + Duck, data = FoodSample)

aa <- z[2, 2]
bb <- z[1, 2]
cc <- z[2, 1]
dd <- z[1, 1]

pairwise_conditional(x, Bisque:Duck, weight = weight, tidy = F)
aa / (aa + bb)
aa / (aa + cc)

pairwise_intersection(x, Bisque:Duck, weight = weight,tidy = F)
pi <- aa / sum(y)
pi

pairwise_union(x, Bisque:Duck, weight = weight,tidy = F)
pu <- (aa + bb + cc) / sum(y)
pu

pairwise_jaccard(x, Bisque:Duck, weight = weight, stat = "index", tidy = F)
aa / (aa+bb+cc)
pi / pu


f <- c(0, 0, 0, NA, 1, 0) %>% factor(1:0)
g <- c(0, NA, 0, 0, 0, 0) %>% factor(1:0)
m <- data.frame(f, g)

h <- sample(c(1, 0, NA), size = 10000, replace = T, prob = c(0.4, 0.4, 0.2))

xtabs(~m[[1]] + m[[2]], drop.unused.levels = F)


microbenchmark(
    factor = factor(h),
    factor_set = factor(h, 1:0),
    as_factor = as_factor(h),
    times = 50
)




# Pairwise xtabs ----------------------------------------------------------

pairwise_xtabs <- function(data, wgt.vec) {

    item.df <- data
    item.df <- mutate(
        item.df,
        across(.fn = ~factor(.x, 1:0))
    )

    M <- ncol(item.df)
    VN <- names(item.df)

    out <- matrix(
        nrow = M,
        ncol = M
    )

    for (i in 1:M) {

        for (j in 1:M) {

            # evaluate later
            if (i <= j) next

            xt <- xtabs(wgt.vec ~ item.df[[i]] + item.df[[j]])
            a <- xt[1, 1]
            b <- xt[1, 2]
            c <- xt[2, 1]
            d <- xt[2, 2]

            out[i, j] <- a / (a + c)
            out[j, i] <- a / (a + b)


        }

        diag(out) <- 1
    }

    out
}

pairwise_xtabs(
    data = FoodSample %>% select(Bisque:PorkChop),
    wgt.vec = rep(1, times = nrow(FoodSample))
)

pairwise_conditional(
    data = FoodSample,
    cols = Bisque:PorkChop,
    tidy = FALSE
)

microbenchmark(
    a = pairwise_conditional(
        data = FoodSample,
        cols = Bisque:PorkChop,
        tidy = FALSE
    ),
    b = pairwise_xtabs(
        data = FoodSample %>% select(Bisque:PorkChop),
        wgt.vec = rep(1, times = nrow(FoodSample))
    )
)


pc2 <- function(
    data, cols, weight, tidy = TRUE
) {

    # Parse out data and weights ----------------------------------------------

    # In this section, the data used in the actual turf analysis is parsed out
    # from the data set provided, and a vector of weights is either extracted
    # from the data, or is created if not provided.

    # Grab the data needed for the analysis
    item.df <- select(data, {{cols}})

    # Check and make sure the data is "onezero"
    oz.check <- sapply(item.df, is_onezero)

    bad.vars <- names(oz.check[!oz.check])

    if (length(bad.vars) > 0) {

        bad.vars.message <- paste0(
            "The following variables do not meet the requirements of `is_onezero`:\n",
            paste(bad.vars, collapse = ", ")
        )

        stop(bad.vars.message)

    }

    # Grab the names of the items
    item.names <- names(item.df)
    num.items <- length(item.names)

    # Do weights exist? If so, grab them, if not, make them.
    if (missing(weight)) {

        ss <- nrow(data)
        wgt.vec <- rep(1, times = ss)

    } else {

        wgt.df <- select(data, {{weight}})

        if (ncol(wgt.df) > 1) {
            stop("Can only provide one column of weights in `weight` argument.")
        }

        wgt.name <- names(wgt.df)

        if (wgt.name %in% item.names) {
            warning(paste0(
                "Column '",
                wgt.name,
                "' was supplied as an input to both `cols` and `weights` arguments, this is likely ill-advised."
            ))
        }

        wgt.vec <- pull(wgt.df, {{weight}})

    }

    # Initialize matrix -------------------------------------------------------

    M <- ncol(item.df)
    VN <- names(item.df)

    out <- matrix(
        nrow = M,
        ncol = M
    )


    # Probabilities -----------------------------------------------------------

    item.df <- mutate(item.df, across(.fns = ~factor(.x, 1:0)))

    for (i in 1:M) {

        for (j in 1:M) {

            # evaluate later
            if (i <= j) next

            xt <- xtabs(
                formula = wgt.vec ~ item.df[[i]] + item.df[[j]],
                drop.unused.levels = FALSE
            )
            a <- xt[1, 1]
            b <- xt[1, 2]
            c <- xt[2, 1]
            d <- xt[2, 2]

            out[i, j] <- a / (a + c)
            out[j, i] <- a / (a + b)


        }

        diag(out) <- 1
    }


    dimnames(out) <- list("Prob of A" = VN, "Given B" = VN)

    # used for tidying
    value.prefix <- "p"


    # Replace NaNs ------------------------------------------------------------

    # In the event there are no pairwise valid observations of two variables,
    # the result will return NaN, I would rather it return NA
    out[is.nan(out)] <- NA


    # Tidy results? -----------------------------------------------------------

    if (tidy) {

        out <-
            out %>%
            as_tibble(rownames = "var_a") %>%
            pivot_longer(
                cols = -1,
                names_to = "var_b",
                values_to = paste(value.prefix, "a_given_b", sep = "_")
            )

    }

    return(out)

}

microbenchmark::microbenchmark(

    "pc2" = pc2(
        data = FoodSample,
        cols = Bisque:PorkChop,
        tidy = FALSE
    ),

    "pc1" = pairwise_conditional(
        data = FoodSample,
        cols = Bisque:PorkChop,
        tidy = FALSE
    ),
    unit = "relative",
    times = 50

)


pi2 <- function(
    data, cols, weight, stat = "prob", tidy = TRUE
) {

    # Check for correct stats -------------------------------------------------

    stats.avail <- c("prob", "count")

    if (!stat %in% stats.avail) {
        stop(paste0(
            "Stat '",
            stat,
            "' not recognized, please use one of 'prob' or 'count'."
        ))
    }


    # Parse out data and weights ----------------------------------------------

    # In this section, the data used in the actual turf analysis is parsed out
    # from the data set provided, and a vector of weights is either extracted
    # from the data, or is created if not provided.

    # Grab the data needed for the analysis
    item.df <- select(data, {{cols}})

    # Check and make sure the data is "onezero"
    oz.check <- sapply(item.df, is_onezero)

    bad.vars <- names(oz.check[!oz.check])

    if (length(bad.vars) > 0) {

        bad.vars.message <- paste0(
            "The following variables do not meet the requirements of `is_onezero`:\n",
            paste(bad.vars, collapse = ", ")
        )

        stop(bad.vars.message)

    }

    # Grab the names of the items
    item.names <- names(item.df)
    num.items <- length(item.names)

    # Do weights exist? If so, grab them, if not, make them.
    if (missing(weight)) {

        ss <- nrow(data)
        wgt.vec <- rep(1, times = ss)

    } else {

        wgt.df <- select(data, {{weight}})

        if (ncol(wgt.df) > 1) {
            stop("Can only provide one column of weights in `weight` argument.")
        }

        wgt.name <- names(wgt.df)

        if (wgt.name %in% item.names) {
            warning(paste0(
                "Column '",
                wgt.name,
                "' was supplied as an input to both `cols` and `weights` arguments, this is likely ill-advised."
            ))
        }

        wgt.vec <- pull(wgt.df, {{weight}})

    }


    # Initialize matrix -------------------------------------------------------

    M <- ncol(item.df)
    VN <- names(item.df)

    out <- matrix(
        nrow = M,
        ncol = M
    )


    # Check for fully missing -------------------------------------------------

    # If a variable has 100% missing values then the iteration may be skipped
    # entirely and move on to the next. This will reduce iterations and avoid
    # returning pairwise counts of zero when they should be NA.
    all.miss <- sapply(item.df, function(x) mean(is.na(x)) == 1)

    item.df.fct <- mutate(item.df, across(.fns = ~factor(.x, 1:0)))

    # Probabilities -----------------------------------------------------------

    if (stat == "prob") {

        for (i in 1:M) {

            for (j in 1:M) {

                # evaluate later
                if (i <= j) next

                # if either i or j are 100% missing
                if (any(all.miss[c(i, j)])) {
                    out[i, j] <- NA
                    next
                }

                xt <- xtabs(wgt.vec ~ item.df.fct[[i]] + item.df.fct[[j]])
                a <- xt[1, 1]
                out[i, j] <- a / sum(xt)

            }
        }

        # fill out upper triangle and diagonal
        diag(out) <- sapply(item.df, fmean, w = wgt.vec, na.rm = TRUE)
        upperTriangle(out) <- lowerTriangle(out, byrow = TRUE)

        dimnames(out) <- list("Prob of A" = VN, "And B" = VN)

        # used for tidying
        value.prefix <- "p"

    }


    # Counts ------------------------------------------------------------------

    if (stat == "count") {

        for (i in 1:M) {

            for (j in 1:M) {

                # evaluate later
                if (i <= j) next

                # if either i or j are 100% missing
                if (any(all.miss[c(i, j)])) {
                    out[i, j] <- NA
                    next
                }

                xt <- xtabs(wgt.vec ~ item.df.fct[[i]] + item.df.fct[[j]])
                out[i, j] <- xt[1, 1]

            }
        }

        # fill out upper triangle and diagonal
        diag(out) <- sapply(item.df, function(x) sum(x * wgt.vec, na.rm = TRUE))
        diag(out)[all.miss] <- NA
        upperTriangle(out) <- lowerTriangle(out, byrow = TRUE)

        dimnames(out) <- list("Counts of A" = VN, "And B" = VN)

        # used for tidying
        value.prefix <- "n"

    }


    # Replace NaNs ------------------------------------------------------------

    # In the event there are no pairwise valid observations of two variables,
    # the result will return NaN, I would rather it return NA
    out[is.nan(out)] <- NA


    # Tidy results? -----------------------------------------------------------

    if (tidy) {

        out <-
            out %>%
            as_tibble(rownames = "var_a") %>%
            pivot_longer(
                cols = -1,
                names_to = "var_b",
                values_to = paste(value.prefix, "a_and_b", sep = "_")
            )

    }

    return(out)

}

microbenchmark::microbenchmark(

    "pi2" = pi2(
        data = FoodSample,
        cols = Bisque:PorkChop,
        tidy = FALSE
    ),

    "pc1" = pairwise_intersection(
        data = FoodSample,
        cols = Bisque:PorkChop,
        tidy = FALSE
    ),
    unit = "s",
    times = 50

)
ttrodrigz/onezero documentation built on May 9, 2023, 2:59 p.m.