library(tidyverse)
library(tidyselect)
library(microbenchmark)
library(turfR)
library(arrangements)
library(onezero)
library(crayon)
terf <- function(
data, cols, weight,
k, depth = 1, min.prop, force.in, mutual.excl
) {
# Parse information -------------------------------------------------------
# Grab the data needed for the analysis
analysis.df <- select(data, {{cols}})
analysis.names <- names(analysis.df)
# Do weights exist? If so, grab them, if not, make them.
if (missing(weight)) {
ss <- nrow(data)
wgt.vec <- rep(1, times = ss)
} else {
wgt.df <- select(data, {{weight}})
if (ncol(wgt.df) > 1) {
stop("Can only provide one column of weights in `weight` argument.")
}
wgt.name <- names(wgt.df)
if (wgt.name %in% analysis.names) {
warning(paste0(
"Column '",
wgt.name,
"' was supplied as an input to both `cols` and `weights` arguments, this is likely ill-advised."
))
}
wgt.vec <- pull(wgt.df, {{weight}})
}
# This object will be used a lot later:
item.names <- colnames(analysis.df)
# Make sure arguments are specified correctly -----------------------------
# k can't be bigger than number of columns, or less than 1
if (!between(k, 1, length(item.names))) {
stop(paste0(
"Input to `k` must be a value between 1 and number of columns provided in `cols` (",
length(item.names),
")."
))
}
# depth can't be bigger than k
if (depth > k) {
stop("Input to `depth` cannot exceed `k`. Doing so would result in a reach of zero.")
}
# Make sure analysis data is onezero --------------------------------------
oz.check <- sapply(analysis.df, is_onezero)
bad.vars <- names(oz.check[!oz.check])
if (length(bad.vars) > 0) {
bad.vars.message <- paste0(
"The following variables do not meet the requirements of `is_onezero`:\n",
paste(bad.vars, collapse = ", ")
)
stop(bad.vars.message)
}
# Force in any items? -----------------------------------------------------
if (!missing(force.in)) {
.force.in <- enquo(force.in)
force.index <- eval_select(
expr = .force.in,
data = data
)
force.names <- names(force.index)
# if k is less than the number of items forced in then there will be no
# valid combinations to run
if (k < length(force.names)) {
stop(paste0(
"Input to `k` must be greater than or equal to the number of items being forced in (",
length(force.names),
"), otherwise no valid combinations will be available."
))
}
bad.names <- force.names[!force.names %in% item.names]
if (length(bad.names) > 0) {
stop(paste0(
"Invalid input to `force.in`, columns supplied in `force.in` must also be present in input to `cols`. The following columns must be added to `cols` if you want to force them in:\n",
paste(bad.names, collapse = ", ")
))
}
}
# Should any items be dropped due to low %? -------------------------------
if (!missing(min.prop)) {
if (!between(min.prop, 0, 1)) {
stop("Input to `min.prop` must be a value between 0 and 1.")
}
# calculate %
low.prop.items <- map_lgl(
.x = analysis.df,
.f = ~weighted.mean(.x, wgt.vec, na.rm = TRUE) < min.prop
)
low.prop.item.names <- names(low.prop.items[low.prop.items])
# forced in items will override this
if (length(force.names) > 0) {
}
}
# Get all combos ----------------------------------------------------------
combos <- combinations(
x = item.names,
k = k
)
colnames(combos) <- paste0("i", 1:k)
# subset the combos if force in
if (!missing(force.in)) {
keep.combos <- apply(combos, 1, function(x) all(force.names %in% x))
combos <- combos[keep.combos, , drop = FALSE]
}
n.combos <- nrow(combos)
# Calculate ---------------------------------------------------------------
# This matrix will receive the reach and frequency for each combination.
fill <- matrix(
data = NA,
ncol = 2,
nrow = n.combos,
dimnames = list(
NULL,
c("reach", "freq")
)
)
# The `Rfast::rowsums()` function only operates on matrices, and for some
# reason I also replaced missing values with zero. I don't remember why
# but the results are equivalent with `base::rowSums()` so I'm sticking
# to it.
data.0 <- as.matrix(analysis.df)
data.0[is.na(data.0)] <- 0
# Calculate the percent reached and frequency for each combination.
header <- paste0(
"-- turf ",
paste(rep("-", times = 50), collapse = "")
)
cat.string <- paste0(
header, "\n",
italic("Number of items..... "), length(item.names), "\n",
italic("Taken at a time..... "), k, "\n",
italic("Number of combos.... "), formatC(n.combos, big.mark = ",", format = "f", drop0trailing = TRUE),
"\n"
)
if (!missing(force.in)) {
cat.string <- paste0(
cat.string,
italic("Forced inclusions...\n "),
paste(
paste0("\U2713", " ", force.names),
collapse = "\n "
),
"\n"
)
}
cat(cat.string, "\n")
for (i in 1:nrow(combos)) {
n.reached <- Rfast::rowsums(data.0[, combos[i, ], drop = FALSE])
is.reached <- n.reached >= depth
# final versions here
reach <- weighted_mean_cpp(is.reached, wgt.vec)
# freq <- sum(wgt.vec * n.reached) / sum(wgt.vec[is.reached])
freq <- sum(wgt.vec * n.reached) / sum(wgt.vec)
out <- c(reach, freq)
fill[i, ] <- out
}
# Arrange output and return -----------------------------------------------
out <-
as_tibble(combos) %>%
add_column(n = length(item.names), .before = 1) %>%
add_column(k = k, .before = 2) %>%
bind_cols(as_tibble(fill)) %>%
rowid_to_column("combo") %>%
arrange(desc(reach), desc(freq))
out
}
# Testing mine vs hack ----------------------------------------------------
# Better turf
terf(
data = xx,
cols = matches("item"),
k = 6,
depth = 1,
weight = wgt,
min.prop = 0.4,
force.in = c(item_1:item_5)
)
xx <-
turfR::turf_ex_data %>%
as_tibble()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.