library(tidyverse)
library(tidyselect)
library(microbenchmark)
library(turfR)
library(arrangements)
library(onezero)
# Different algorithm methods:
# 1. Brute force everything
# 2. Brute force x number of set sizes then greedy the rest
# 3. Full greedy
# This has some helpful guidelines and definitions
# https://web.archive.org/web/20190530235247/https://wiki.q-researchsoftware.com/wiki/Total_Unduplicated_Reach_and_Frequency_(TURF)
# Other arguments to consider
# 1. Minimum proportion - remove items with low marginal reach
# 2. Mutual exclusivity - remove combinations where two or more items appear together
# Seems like all three options above could be combined into one argument, an
# argument that defines how many to brute force and leaves the remaining to
# greedy. Can set brute default to Inf?
terf_shapley <- function(
data, cols, weight,
depth = 1, min.prop = 0, force.in, mutual.excl,
brute = Inf, greedy.entry = "reach"
) {
# Parse information -------------------------------------------------------
# Grab the data needed for the analysis
analysis.df <- select(data, {{cols}})
analysis.names <- names(analysis.df)
# Do weights exist? If so, grab them, if not, make them.
if (missing(weight)) {
ss <- nrow(data)
wgt.vec <- rep(1, times = ss)
} else {
wgt.df <- select(data, {{weight}})
if (ncol(wgt.df) > 1) {
stop("Can only provide one column of weights in `weight` argument.")
}
wgt.name <- names(wgt.df)
if (wgt.name %in% analysis.names) {
warning(paste0(
"Column '",
wgt.name,
"' was supplied as an input to both `cols` and `weights` arguments, this is likely ill-advised."
))
}
wgt.vec <- pull(wgt.df, {{weight}})
}
# This object will be used a lot later:
item.names <- colnames(analysis.df)
# Make sure analysis data is onezero --------------------------------------
oz.check <- sapply(analysis.df, is_onezero)
bad.vars <- names(oz.check[!oz.check])
if (length(bad.vars) > 0) {
bad.vars.message <- paste0(
"The following variables do not meet the requirements of `is_onezero`:\n",
paste(bad.vars, collapse = ", ")
)
stop(bad.vars.message)
}
# Force in any items? -----------------------------------------------------
if (!missing(force.in)) {
.force.in <- enquo(force.in)
force.index <- eval_select(
expr = .force.in,
data = data
)
force.names <- names(force.index)
bad.names <- force.names[!force.names %in% item.names]
if (length(bad.names) > 0) {
stop(paste0(
"Invalid input to `force.in`, columns supplied in `force.in` must also be present in input to `cols`. The following columns must be added to `cols` if you want to force them in:\n",
paste(bad.names, collapse = ", ")
))
}
}
# Get all combos ----------------------------------------------------------
combo.list <- list()
for (i in seq_along(item.names)) {
combo.list[[i]] <- combinations(
x = item.names,
k = i
)
colnames(combo.list[[i]]) <- paste0("i", 1:i)
# subset the combos if force in
if (!missing(force.in)) {
keep.combos <- apply(
combo.list[[i]],
1,
function(x) all(force.names %in% x)
)
combo.list[[i]] <- combo.list[[i]][keep.combos, , drop = FALSE]
}
}
n.combos <- sapply(combo.list, nrow)
# Calculate ---------------------------------------------------------------
data.0 <- as.matrix(analysis.df)
data.0[is.na(data.0)] <- 0
fill.list <- list()
out.list <- list()
for (i in seq_along(combo.list)) {
cat(paste0(
"Calculating ",
length(item.names),
" choose ",
i,
"\n"
))
fill.list[[i]] <- matrix(
data = NA,
ncol = 2,
nrow = n.combos[i],
dimnames = list(
NULL,
c("reach", "freq")
)
)
combos <- combo.list[[i]]
# calculate reach
for (j in 1:nrow(combos)) {
n.reached <- Rfast::rowsums(data.0[, combos[j, ], drop = FALSE])
is.reached <- n.reached >= depth
# final versions here
reach <- weighted_mean_cpp(is.reached, wgt.vec)
freq <- sum(wgt.vec * n.reached) / sum(wgt.vec)
out <- c(reach, freq)
fill.list[[i]][j, ] <- out
}
# join the combinations to the reach/frequency values and sort
# them high to low
reach.stats <-
combos %>%
as_tibble() %>%
bind_cols(as_tibble(fill.list[[i]])) %>%
rowid_to_column("combo") %>%
arrange(desc(reach), desc(freq)) %>%
add_column(
k = i,
.before = 1
)
# calculate the reach with/without each item
item.cols <- paste0("i", 1:i)
only.items <- reach.stats[item.cols]
# This function calculates the mean with and without the item present
# in the combination, used next to apply over the vector of item names
with_without_reach <- function(cn) {
tapply(
X = reach.stats$reach,
INDEX = apply(only.items, 1, function(x) ifelse(cn %in% x, "with", "without")),
FUN = mean
)
}
with.without.reach <-
item.names %>%
lapply(with_without_reach) %>%
do.call(rbind, .) %>%
as_tibble() %>%
add_column(
item = item.names,
.before = 1
) %>%
add_column(
k = i,
.before = 1
)
# the `with_without_reach` function does not have a "without" entry
# when it is a "k choose k"
if (i == length(item.names)) {
with.without.reach$without <- 0
}
# store in the list
out.list[[i]] <- list(
"turf" = reach.stats,
"with_without_reach" = with.without.reach
)
}
names(out.list) <- paste0("k", 1:length(item.names))
# Calculate Shapley values ------------------------------------------------
shapley_values <-
out.list %>%
map(pluck, "with_without_reach") %>%
reduce(bind_rows) %>%
mutate(gap = with - without) %>%
group_by(item) %>%
summarise(shapley_value = mean(gap)) %>%
ungroup() %>%
arrange(desc(shapley_value))
list(
turf = map(out.list, pluck, "turf") %>%
reduce(bind_rows) %>%
select(k, combo, matches("^i"), reach, freq),
shapley_values = shapley_values
)
}
terf_shapley(
data = turfR::turf_ex_data,
cols = matches("item"),
depth = 1
)
shapley_approx(
data = turfR::turf_ex_data,
cols = matches("item")
) %>%
arrange(desc(shapley_approx))
# What should happen when input to `need` is 2? Should set sizes of < 2 be
# removed from the analysis? You can't have a "5 choose 1" but need 2 items
# to be considered reached, right?
terf_shapley(
data = pets,
cols = c(bird, fish, dog, cat),
need = 2
)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.