test-code/turf-working/old-attempts/turf-shapley.R

library(tidyverse)
library(tidyselect)
library(microbenchmark)
library(turfR)
library(arrangements)
library(onezero)

# Different algorithm methods:
# 1. Brute force everything
# 2. Brute force x number of set sizes then greedy the rest
# 3. Full greedy

# This has some helpful guidelines and definitions
# https://web.archive.org/web/20190530235247/https://wiki.q-researchsoftware.com/wiki/Total_Unduplicated_Reach_and_Frequency_(TURF)

# Other arguments to consider
# 1. Minimum proportion - remove items with low marginal reach
# 2. Mutual exclusivity - remove combinations where two or more items appear together

# Seems like all three options above could be combined into one argument, an
# argument that defines how many to brute force and leaves the remaining to
# greedy. Can set brute default to Inf?

terf_shapley <- function(
    data, cols, weight,
    depth = 1, min.prop = 0, force.in, mutual.excl,
    brute = Inf, greedy.entry = "reach"
) {

    # Parse information -------------------------------------------------------

    # Grab the data needed for the analysis
    analysis.df <- select(data, {{cols}})
    analysis.names <- names(analysis.df)

    # Do weights exist? If so, grab them, if not, make them.
    if (missing(weight)) {

        ss <- nrow(data)
        wgt.vec <- rep(1, times = ss)

    } else {

        wgt.df <- select(data, {{weight}})

        if (ncol(wgt.df) > 1) {
            stop("Can only provide one column of weights in `weight` argument.")
        }

        wgt.name <- names(wgt.df)

        if (wgt.name %in% analysis.names) {
            warning(paste0(
                "Column '",
                wgt.name,
                "' was supplied as an input to both `cols` and `weights` arguments, this is likely ill-advised."
            ))
        }

        wgt.vec <- pull(wgt.df, {{weight}})

    }

    # This object will be used a lot later:
    item.names <- colnames(analysis.df)


    # Make sure analysis data is onezero --------------------------------------

    oz.check <- sapply(analysis.df, is_onezero)

    bad.vars <- names(oz.check[!oz.check])

    if (length(bad.vars) > 0) {

        bad.vars.message <- paste0(
            "The following variables do not meet the requirements of `is_onezero`:\n",
            paste(bad.vars, collapse = ", ")
        )

        stop(bad.vars.message)

    }


    # Force in any items? -----------------------------------------------------

    if (!missing(force.in)) {

        .force.in <- enquo(force.in)
        force.index <- eval_select(
            expr = .force.in,
            data = data
        )

        force.names <- names(force.index)

        bad.names <- force.names[!force.names %in% item.names]

        if (length(bad.names) > 0) {
            stop(paste0(
                "Invalid input to `force.in`, columns supplied in `force.in` must also be present in input to `cols`. The following columns must be added to `cols` if you want to force them in:\n",
                paste(bad.names, collapse = ", ")
            ))
        }

    }


    # Get all combos ----------------------------------------------------------

    combo.list <- list()

    for (i in seq_along(item.names)) {

        combo.list[[i]] <- combinations(
            x = item.names,
            k = i
        )

        colnames(combo.list[[i]]) <- paste0("i", 1:i)

        # subset the combos if force in
        if (!missing(force.in)) {

            keep.combos <- apply(
                combo.list[[i]],
                1,
                function(x) all(force.names %in% x)
            )

            combo.list[[i]] <- combo.list[[i]][keep.combos, , drop = FALSE]

        }
    }

    n.combos <- sapply(combo.list, nrow)


    # Calculate ---------------------------------------------------------------

    data.0 <- as.matrix(analysis.df)
    data.0[is.na(data.0)] <- 0
    fill.list <- list()
    out.list <- list()

    for (i in seq_along(combo.list)) {

        cat(paste0(
            "Calculating ",
            length(item.names),
            " choose ",
            i,
            "\n"
        ))

        fill.list[[i]] <- matrix(
            data = NA,
            ncol = 2,
            nrow = n.combos[i],
            dimnames = list(
                NULL,
                c("reach", "freq")
            )
        )

        combos <- combo.list[[i]]

        # calculate reach
        for (j in 1:nrow(combos)) {

            n.reached <- Rfast::rowsums(data.0[, combos[j, ], drop = FALSE])

            is.reached <- n.reached >= depth

            # final versions here
            reach <- weighted_mean_cpp(is.reached, wgt.vec)
            freq  <- sum(wgt.vec * n.reached) / sum(wgt.vec)

            out <- c(reach, freq)

            fill.list[[i]][j, ] <- out

        }

        # join the combinations to the reach/frequency values and sort
        # them high to low
        reach.stats <-
            combos %>%
            as_tibble() %>%
            bind_cols(as_tibble(fill.list[[i]])) %>%
            rowid_to_column("combo") %>%
            arrange(desc(reach), desc(freq)) %>%
            add_column(
                k = i,
                .before = 1
            )


        # calculate the reach with/without each item
        item.cols <- paste0("i", 1:i)
        only.items <- reach.stats[item.cols]

        # This function calculates the mean with and without the item present
        # in the combination, used next to apply over the vector of item names
        with_without_reach <- function(cn) {
            tapply(
                X = reach.stats$reach,
                INDEX = apply(only.items, 1, function(x) ifelse(cn %in% x, "with", "without")),
                FUN = mean
            )
        }

        with.without.reach <-
            item.names %>%
            lapply(with_without_reach) %>%
            do.call(rbind, .) %>%
            as_tibble() %>%
            add_column(
                item = item.names,
                .before = 1
            ) %>%
            add_column(
                k = i,
                .before = 1
            )


        # the `with_without_reach` function does not have a "without" entry
        # when it is a "k choose k"
        if (i == length(item.names)) {
            with.without.reach$without <- 0
        }

        # store in the list
        out.list[[i]] <- list(
            "turf" = reach.stats,
            "with_without_reach" = with.without.reach
        )

    }

    names(out.list) <- paste0("k", 1:length(item.names))


    # Calculate Shapley values ------------------------------------------------

    shapley_values <-
        out.list %>%
        map(pluck, "with_without_reach") %>%
        reduce(bind_rows) %>%
        mutate(gap = with - without) %>%
        group_by(item) %>%
        summarise(shapley_value = mean(gap)) %>%
        ungroup() %>%
        arrange(desc(shapley_value))

    list(
        turf = map(out.list, pluck, "turf") %>%
            reduce(bind_rows) %>%
            select(k, combo, matches("^i"), reach, freq),
        shapley_values = shapley_values
    )

}

terf_shapley(
    data = turfR::turf_ex_data,
    cols = matches("item"),
    depth = 1
)

shapley_approx(
    data = turfR::turf_ex_data,
    cols = matches("item")
) %>%
    arrange(desc(shapley_approx))


# What should happen when input to `need` is 2? Should set sizes of < 2 be
# removed from the analysis? You can't have a "5 choose 1" but need 2 items
# to be considered reached, right?
terf_shapley(
    data = pets,
    cols = c(bird, fish, dog, cat),
    need = 2
)
ttrodrigz/onezero documentation built on May 9, 2023, 2:59 p.m.