# Many more distance matrices to choose from
# Really need to consider using pairwise crosstabs via xtabs
# - verified below that values returned are identical
# - need to speed test, if faster or not much slower then consider switching
# - then implement other metrics (see sequentix, )
# https://www.ibm.com/docs/en/spss-statistics/SaaS?topic=measures-distances-similarity-binary-data
# https://stats.stackexchange.com/questions/61705/similarity-coefficients-for-binary-data-why-choose-jaccard-over-russell-and-rao
# https://www.sequentix.de/gelquest/help/distance_measures.htm
library(tidyverse)
library(onezero)
library(microbenchmark)
library(collapse)
ss <- 100000
a <- factor(x = rbinom(ss, 1, 0.2))
b <- factor(x = rbinom(ss, 1, 0.2))
w <- rnorm(ss, 1, 0.15)
xtabs(w ~ a + b)
qtab(a, b, w = w)
microbenchmark(
a = xtabs(w ~ a + b),
b = qtab(a, b, w = w),
times = 50
)
x <-
FoodSample %>%
select(Bisque, Duck, weight)
y <- with(x, table(factor(Bisque, 1:0), factor(Duck, 1:0)))
a <- y[1, 1]
b <- y[1, 2]
c <- y[2, 1]
d <- y[2, 2]
pairwise_conditional(x, Bisque:Duck, tidy = F)
a / (a + b)
a / (a + c)
pairwise_intersection(x, Bisque:Duck, tidy = F)
pi <- a / sum(y)
pi
pairwise_union(x, Bisque:Duck, tidy = F)
pu <- (a + b + c) / sum(y)
pu
pairwise_jaccard(x, Bisque:Duck, stat = "index", tidy = F)
a / (a+b+c)
pi / pu
# Weighted --
z <- xtabs(weight ~ Bisque + Duck, data = FoodSample)
aa <- z[2, 2]
bb <- z[1, 2]
cc <- z[2, 1]
dd <- z[1, 1]
pairwise_conditional(x, Bisque:Duck, weight = weight, tidy = F)
aa / (aa + bb)
aa / (aa + cc)
pairwise_intersection(x, Bisque:Duck, weight = weight,tidy = F)
pi <- aa / sum(y)
pi
pairwise_union(x, Bisque:Duck, weight = weight,tidy = F)
pu <- (aa + bb + cc) / sum(y)
pu
pairwise_jaccard(x, Bisque:Duck, weight = weight, stat = "index", tidy = F)
aa / (aa+bb+cc)
pi / pu
f <- c(0, 0, 0, NA, 1, 0) %>% factor(1:0)
g <- c(0, NA, 0, 0, 0, 0) %>% factor(1:0)
m <- data.frame(f, g)
h <- sample(c(1, 0, NA), size = 10000, replace = T, prob = c(0.4, 0.4, 0.2))
xtabs(~m[[1]] + m[[2]], drop.unused.levels = F)
microbenchmark(
factor = factor(h),
factor_set = factor(h, 1:0),
as_factor = as_factor(h),
times = 50
)
# Pairwise xtabs ----------------------------------------------------------
pairwise_xtabs <- function(data, wgt.vec) {
item.df <- data
item.df <- mutate(
item.df,
across(.fn = ~factor(.x, 1:0))
)
M <- ncol(item.df)
VN <- names(item.df)
out <- matrix(
nrow = M,
ncol = M
)
for (i in 1:M) {
for (j in 1:M) {
# evaluate later
if (i <= j) next
xt <- xtabs(wgt.vec ~ item.df[[i]] + item.df[[j]])
a <- xt[1, 1]
b <- xt[1, 2]
c <- xt[2, 1]
d <- xt[2, 2]
out[i, j] <- a / (a + c)
out[j, i] <- a / (a + b)
}
diag(out) <- 1
}
out
}
pairwise_xtabs(
data = FoodSample %>% select(Bisque:PorkChop),
wgt.vec = rep(1, times = nrow(FoodSample))
)
pairwise_conditional(
data = FoodSample,
cols = Bisque:PorkChop,
tidy = FALSE
)
microbenchmark(
a = pairwise_conditional(
data = FoodSample,
cols = Bisque:PorkChop,
tidy = FALSE
),
b = pairwise_xtabs(
data = FoodSample %>% select(Bisque:PorkChop),
wgt.vec = rep(1, times = nrow(FoodSample))
)
)
pc2 <- function(
data, cols, weight, tidy = TRUE
) {
# Parse out data and weights ----------------------------------------------
# In this section, the data used in the actual turf analysis is parsed out
# from the data set provided, and a vector of weights is either extracted
# from the data, or is created if not provided.
# Grab the data needed for the analysis
item.df <- select(data, {{cols}})
# Check and make sure the data is "onezero"
oz.check <- sapply(item.df, is_onezero)
bad.vars <- names(oz.check[!oz.check])
if (length(bad.vars) > 0) {
bad.vars.message <- paste0(
"The following variables do not meet the requirements of `is_onezero`:\n",
paste(bad.vars, collapse = ", ")
)
stop(bad.vars.message)
}
# Grab the names of the items
item.names <- names(item.df)
num.items <- length(item.names)
# Do weights exist? If so, grab them, if not, make them.
if (missing(weight)) {
ss <- nrow(data)
wgt.vec <- rep(1, times = ss)
} else {
wgt.df <- select(data, {{weight}})
if (ncol(wgt.df) > 1) {
stop("Can only provide one column of weights in `weight` argument.")
}
wgt.name <- names(wgt.df)
if (wgt.name %in% item.names) {
warning(paste0(
"Column '",
wgt.name,
"' was supplied as an input to both `cols` and `weights` arguments, this is likely ill-advised."
))
}
wgt.vec <- pull(wgt.df, {{weight}})
}
# Initialize matrix -------------------------------------------------------
M <- ncol(item.df)
VN <- names(item.df)
out <- matrix(
nrow = M,
ncol = M
)
# Probabilities -----------------------------------------------------------
item.df <- mutate(item.df, across(.fns = ~factor(.x, 1:0)))
for (i in 1:M) {
for (j in 1:M) {
# evaluate later
if (i <= j) next
xt <- xtabs(
formula = wgt.vec ~ item.df[[i]] + item.df[[j]],
drop.unused.levels = FALSE
)
a <- xt[1, 1]
b <- xt[1, 2]
c <- xt[2, 1]
d <- xt[2, 2]
out[i, j] <- a / (a + c)
out[j, i] <- a / (a + b)
}
diag(out) <- 1
}
dimnames(out) <- list("Prob of A" = VN, "Given B" = VN)
# used for tidying
value.prefix <- "p"
# Replace NaNs ------------------------------------------------------------
# In the event there are no pairwise valid observations of two variables,
# the result will return NaN, I would rather it return NA
out[is.nan(out)] <- NA
# Tidy results? -----------------------------------------------------------
if (tidy) {
out <-
out %>%
as_tibble(rownames = "var_a") %>%
pivot_longer(
cols = -1,
names_to = "var_b",
values_to = paste(value.prefix, "a_given_b", sep = "_")
)
}
return(out)
}
microbenchmark::microbenchmark(
"pc2" = pc2(
data = FoodSample,
cols = Bisque:PorkChop,
tidy = FALSE
),
"pc1" = pairwise_conditional(
data = FoodSample,
cols = Bisque:PorkChop,
tidy = FALSE
),
unit = "relative",
times = 50
)
pi2 <- function(
data, cols, weight, stat = "prob", tidy = TRUE
) {
# Check for correct stats -------------------------------------------------
stats.avail <- c("prob", "count")
if (!stat %in% stats.avail) {
stop(paste0(
"Stat '",
stat,
"' not recognized, please use one of 'prob' or 'count'."
))
}
# Parse out data and weights ----------------------------------------------
# In this section, the data used in the actual turf analysis is parsed out
# from the data set provided, and a vector of weights is either extracted
# from the data, or is created if not provided.
# Grab the data needed for the analysis
item.df <- select(data, {{cols}})
# Check and make sure the data is "onezero"
oz.check <- sapply(item.df, is_onezero)
bad.vars <- names(oz.check[!oz.check])
if (length(bad.vars) > 0) {
bad.vars.message <- paste0(
"The following variables do not meet the requirements of `is_onezero`:\n",
paste(bad.vars, collapse = ", ")
)
stop(bad.vars.message)
}
# Grab the names of the items
item.names <- names(item.df)
num.items <- length(item.names)
# Do weights exist? If so, grab them, if not, make them.
if (missing(weight)) {
ss <- nrow(data)
wgt.vec <- rep(1, times = ss)
} else {
wgt.df <- select(data, {{weight}})
if (ncol(wgt.df) > 1) {
stop("Can only provide one column of weights in `weight` argument.")
}
wgt.name <- names(wgt.df)
if (wgt.name %in% item.names) {
warning(paste0(
"Column '",
wgt.name,
"' was supplied as an input to both `cols` and `weights` arguments, this is likely ill-advised."
))
}
wgt.vec <- pull(wgt.df, {{weight}})
}
# Initialize matrix -------------------------------------------------------
M <- ncol(item.df)
VN <- names(item.df)
out <- matrix(
nrow = M,
ncol = M
)
# Check for fully missing -------------------------------------------------
# If a variable has 100% missing values then the iteration may be skipped
# entirely and move on to the next. This will reduce iterations and avoid
# returning pairwise counts of zero when they should be NA.
all.miss <- sapply(item.df, function(x) mean(is.na(x)) == 1)
item.df.fct <- mutate(item.df, across(.fns = ~factor(.x, 1:0)))
# Probabilities -----------------------------------------------------------
if (stat == "prob") {
for (i in 1:M) {
for (j in 1:M) {
# evaluate later
if (i <= j) next
# if either i or j are 100% missing
if (any(all.miss[c(i, j)])) {
out[i, j] <- NA
next
}
xt <- xtabs(wgt.vec ~ item.df.fct[[i]] + item.df.fct[[j]])
a <- xt[1, 1]
out[i, j] <- a / sum(xt)
}
}
# fill out upper triangle and diagonal
diag(out) <- sapply(item.df, fmean, w = wgt.vec, na.rm = TRUE)
upperTriangle(out) <- lowerTriangle(out, byrow = TRUE)
dimnames(out) <- list("Prob of A" = VN, "And B" = VN)
# used for tidying
value.prefix <- "p"
}
# Counts ------------------------------------------------------------------
if (stat == "count") {
for (i in 1:M) {
for (j in 1:M) {
# evaluate later
if (i <= j) next
# if either i or j are 100% missing
if (any(all.miss[c(i, j)])) {
out[i, j] <- NA
next
}
xt <- xtabs(wgt.vec ~ item.df.fct[[i]] + item.df.fct[[j]])
out[i, j] <- xt[1, 1]
}
}
# fill out upper triangle and diagonal
diag(out) <- sapply(item.df, function(x) sum(x * wgt.vec, na.rm = TRUE))
diag(out)[all.miss] <- NA
upperTriangle(out) <- lowerTriangle(out, byrow = TRUE)
dimnames(out) <- list("Counts of A" = VN, "And B" = VN)
# used for tidying
value.prefix <- "n"
}
# Replace NaNs ------------------------------------------------------------
# In the event there are no pairwise valid observations of two variables,
# the result will return NaN, I would rather it return NA
out[is.nan(out)] <- NA
# Tidy results? -----------------------------------------------------------
if (tidy) {
out <-
out %>%
as_tibble(rownames = "var_a") %>%
pivot_longer(
cols = -1,
names_to = "var_b",
values_to = paste(value.prefix, "a_and_b", sep = "_")
)
}
return(out)
}
microbenchmark::microbenchmark(
"pi2" = pi2(
data = FoodSample,
cols = Bisque:PorkChop,
tidy = FALSE
),
"pc1" = pairwise_intersection(
data = FoodSample,
cols = Bisque:PorkChop,
tidy = FALSE
),
unit = "s",
times = 50
)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.