R/turf.R

Defines functions turf

Documented in turf

#' TURF Analysis
#'
#' Runs Total Unduplicated Reach & Frequency with options for case weights,
#' constraints on combinations, item weights, and three methods of a greedy
#' algorithm.
#'
#' @details
#' Need some dang details here.
#'
#' @param data A data frame.
#' @param items Columns on which to run TURF. Must contain only ones, zeros, or
#' `NA`. Suggest using [is_onezero][onezero::is_onezero] ahead of time to check.
#' @param case_weights An optional column of case weights to use in reach
#' calculations. Rows with `NA` will be removed from the base.
#' @param item_weights An optional named vector of non-zero weights to associate
#' with each item. Items not specified will be given a default weight of 1.
#'
#' Common examples are profit, revenue, or simply relative importance weights.
#'
#' @param k Set size, number of `items` to choose in a combination. Can be a vector of
#' values from 1 to number of `items`. Values outside of that range will be
#' silently ignored. [floor][base::floor] is used to override accidental use of
#' decimals. Default is 1.
#' @param depth Number of `items` needed in order to be considered "reached."
#' Can be any number between 1 to number of `items`. Default is 1.
#' @param force_in,force_in_together,force_out,force_out_together Options for
#' reducing the number of combinations by adding constraints.
#'
#' `force_in` and `force_out` accept a single tidyselect expression. Items
#' specified here will force every combination to include or exclude those items.
#'
#' Use `force_in_together` or `force_out_together` to make items appear together
#' or not appear together in the combinations. Pass an arbitrary number of
#' tidyselect expressions to [together][onezero::together]. Duplicates and
#' inclusions/exclusions containing only one item will be silently dropped.
#'
#' @param greedy_begin Set size at which the greedy algorithm will kick in. Default
#' is `Inf`.
#' @param greedy_entry Method for entering variables into greedy algorithm.
#' Options are `"shapley"` (default) which uses approximated Shapley Values,
#' `"reach"` or `"freq"` for using the combination with the highest reach or
#' frequency, respectively, from the `k[i-1]` set size.
#' @param progress Display progress? Default is `FALSE`. Adds a slight processing
#' overhead, but not much. Useful when number of `items` exceeds 20.
#'
#' @importFrom rlang enquo abort warn arg_match
#' @importFrom glue glue
#' @importFrom tidyselect eval_select
#' @importFrom purrr map discard map_int map_lgl
#' @importFrom dplyr setdiff select all_of %>% between arrange rename matches
#' filter pull as_tibble bind_cols mutate desc distinct ungroup group_by
#' case_when
#' @importFrom tibble rowid_to_column enframe tibble deframe
#' @importFrom collapse dapply roworderv fmean seqid
#' @importFrom stringr str_pad
#' @importFrom cli cat_line rule col_yellow
#' @importFrom arrangements combinations
#' @importFrom Rfast rowsums
#' @importFrom readr parse_number
#' @importFrom utils head
#' @importFrom scales percent
#' @importFrom forcats fct_inorder
#'
#' @examples
#' library(dplyr)
#'
#' # Simple 10-item TURF
#' x <- turf(FoodSample, Bisque:Ribeye, k = 1:10)
#'
#' # With items forced in and out
#' # Forcing in "Ribeye"
#' # Forcing out items with an individual reach of < 10%
#' turf(
#'     data = FoodSample,
#'     items = Bisque:Ribeye,
#'     k = 1:10,
#'     force_in = Ribeye,
#'     force_out = where(~mean(.x, na.rm = TRUE) < 0.1)
#' )
#'
#' # Forcing items in and out together
#' turf(
#'     data = FoodSample,
#'     items = Bisque:Ribeye,
#'     case_weights = weight,
#'     k = c(1:4, 6:10),
#'     force_in_together = together(
#'         c(Chicken, Salmon),
#'         c(Chili, Tofu, Turkey)
#'     ),
#'     force_out_together = together(
#'         matches("eye"),
#'         c(2, 10)
#'     ),
#'     greedy_begin = 10,
#'     greedy_entry = "reach"
#' )
#'
#' # Item weights
#' turf(
#'     data = FoodSample,
#'     items = 2:6,
#'     k = 1:6,
#'     item_weights = c(
#'         Bisque = 1.2,
#'         Chicken = 2.5,
#'         Tofu = 2.9,
#'         Chili = 1.7,
#'         PorkChop = 3.0
#'     )
#' )
#'
#' @export
turf <- function(
        data,
        items,
        case_weights,
        item_weights,
        k = 1,
        depth = 1,
        force_in,
        force_in_together,
        force_out,
        force_out_together,
        greedy_begin = Inf,
        greedy_entry = "shapley",
        progress = FALSE
) {


    # Preliminary error checks ------------------------------------------------

    # Start total clock
    total.clock1 <- Sys.time()

    if (!is.data.frame(data)) {
        abort("Input to `data` must be a data frame.")
    }

    if (!is.numeric(k)) {
        abort("Input to `k` must be one or more numeric values.")
    }

    if (!is.numeric(depth) | length(depth) != 1) {
        abort("Input to `depth` must be a single numeric value.")
    }
    depth <- floor(depth)

    if (!is.numeric(greedy_begin) | length(greedy_begin) != 1) {
        abort("Input to `greedy_begin` must be a single numeric value.")
    }
    greedy_begin <- floor(greedy_begin)

    rlang::arg_match(
        arg = greedy_entry,
        values = c("shapley", "reach", "freq")
    )

    if (greedy_entry == "shapley" & !missing(force_out_together)) {
        abort("'shapley' greedy entry cannot be combined with `force_out_together`.")
    }


    # Get names of things -----------------------------------------------------

    # `items`
    item.names <- names(eval_select(expr = enquo(items), data = data))

    # `case_weights`
    has.weights <- FALSE

    if (!missing(case_weights)) {

        case.weights.names <- names(
            eval_select(
                expr = enquo(case_weights),
                data = data
            )
        )

        has.weights <- TRUE

    }


    # `force_in`
    do.force.in <- FALSE
    force.in.names <- character()

    if (!missing(force_in)) {

        force.in.names <- names(
            eval_select(
                expr = enquo(force_in),
                data = data
            )
        )

        do.force.in <- TRUE

    }

    # `force_in_together`
    do.force.in.together <- FALSE
    force.in.together.names <- list()

    if (!missing(force_in_together)) {

        force.in.together.names <-
            map(
                .x = force_in_together,
                .f = ~eval_select(
                    expr = .x,
                    data = data
                )
            ) %>%
            map(names) %>%
            discard(~length(.x) <= 1) %>%
            # this removes accidental duplicates
            map(sort) %>%
            unique()

        do.force.in.together <- length(force.in.together.names) > 0

    }


    # `force_out`
    do.force.out <- FALSE
    force.out.names <- character()

    if (!missing(force_out)) {

        force.out.names <- names(
            eval_select(
                expr = enquo(force_out),
                data = data
            )
        )

        do.force.out <- TRUE
    }

    # `force_out_together`
    do.force.out.together <- FALSE
    force.out.together.names <- list()

    # `force_out_together`
    if (!missing(force_out_together)) {

        force.out.together.names <-
            map(
                .x = force_out_together,
                .f = ~eval_select(
                    expr = .x,
                    data = data
                )
            ) %>%
            map(names) %>%
            discard(~length(.x) <= 1) %>%
            # this removes accidental duplicates
            map(sort) %>%
            unique()

        do.force.out.together <- length(force.out.together.names) > 0

    }


    # Set up weights ----------------------------------------------------------

    if (missing(case_weights)) {

        wgt.vec <- rep(1, times = nrow(data))

    } else {

        if (length(case.weights.names) > 1) {
            abort("Can only provide one column of weights to `case_weights`.")
        }

        if (case.weights.names %in% item.names) {
            abort(glue("Column '{case.weights.names}' cannot used both in `items` and `case_weights`."))
        }

        wgt.vec <- data[[case.weights.names]]

        if (!is.numeric(wgt.vec)) {
            abort("Input to `case_weights` must be a numeric column.")
        }

    }

    # Use this as the denominator for reach calculations later
    wgt.base <- sum(wgt.vec, na.rm = TRUE)


    # Validate: all `force_in` in `items` -------------------------------------

    if (do.force.in) {

        # any items in `force_in` that are not in `items`?
        bad.names <- setdiff(force.in.names, item.names)
        fail <- length(bad.names) > 0

        if (fail) {

            # warn that mismatches will be ignored
            bad.names.string <- paste(glue("{bad.names}"), collapse = ", ")
            msg <- glue("All items in `force_in` should appear in `items`. The following items are missing in `items` and will be ignored:\n{bad.names.string}")
            warn(msg)

            # update the vector of names
            force.in.names <- force.in.names[!force.in.names %in% bad.names]

            # set it to false if there is nothing left
            if (length(force.in.names) == 0) {
                do.force.in <- FALSE
            }

        }

    }


    # Validate: all `force_out` in `items` ------------------------------------

    if (do.force.out) {

        # any items in `force_out` that are not in `items`?
        bad.names <- setdiff(force.out.names, item.names)
        fail <- length(bad.names) > 0

        if (fail) {

            # warn that mismatches will be ignored
            bad.names.string <- paste(glue("{bad.names}"), collapse = ", ")
            msg <- glue("All items in `force_out` should appear in `items`. The following items are missing in `items` and will be ignored:\n{bad.names.string}")
            warn(msg)

            # update the vector of names
            force.out.names <- force.out.names[!force.out.names %in% bad.names]

            # set it to false if there is nothing left
            if (length(force.out.names) == 0) {
                do.force.out <- FALSE
            }

        }

    }


    # Validate: no overlap between `force_in` and `force_out` -----------------

    if (do.force.in & do.force.out) {

        # any items overlap?
        bad.names <- intersect(force.in.names, force.out.names)
        fail <- length(bad.names) > 0

        if (fail) {

            bad.names.string <- paste(glue("{bad.names}"), collapse = ", ")
            msg <- glue("Cannot have items that appear both in `force_in` and `force_out`. The following variables are in both:\n{bad.names.string}")
            abort(msg)

        }

    }


    # Validate: all `force_in_together` in `items` ----------------------------

    if (do.force.in.together) {

        # index for dropping them if their lengths become 1
        drop.fi <- numeric()

        for (fi in seq_along(force.in.together.names)) {

            bad.names <- setdiff(force.in.together.names[[fi]], item.names)
            fail <- length(bad.names) > 0

            if (fail) {

                bad.names.string <- paste(glue("{bad.names}"), collapse = ", ")
                msg <- glue("All items in `force_in_together` should appear in `items`. The following items are missing in `items` and will be ignored:\n{bad.names.string}")
                warn(msg)

                # drop them
                force.in.together.names[[fi]] <-
                    force.in.together.names[[fi]][!force.in.together.names[[fi]] %in% bad.names]

                if (length(force.in.together.names[[fi]]) < 2) {
                    drop.fi <- c(drop.fi, fi)
                }

            }

        }

        # drop the combos if necessary
        if (length(drop.fi) > 0) force.in.together.names <- force.in.together.names[-drop.fi]

        # reset the "do" things if necessary
        if (length(force.in.together.names) == 0)  do.force.in.together  <- FALSE

    }


    # Validate: all `force_in_together` in `items` ----------------------------

    if (do.force.out.together) {

        # index for dropping them if their lengths become 1
        drop.fo <- numeric()

        for (fo in seq_along(force.out.together.names)) {

            bad.names <- setdiff(force.out.together.names[[fo]], item.names)
            fail <- length(bad.names) > 0

            if (fail) {

                bad.names.string <- paste(glue("{bad.names}"), collapse = ", ")
                msg <- glue("All items in `force_in_together` should appear in `items`. The following items are missing in `items` and will be ignored:\n{bad.names.string}")
                warn(msg)

                # drop them
                force.out.together.names[[fo]] <-
                    force.out.together.names[[fo]][!force.out.together.names[[fo]] %in% bad.names]

                if (length(force.out.together.names[[fo]]) < 2) {
                    drop.fo <- c(drop.fo, fo)
                }

            }

        }

        # drop the combos if necessary
        if (length(drop.fo) > 0) force.out.together.names <- force.out.together.names[-drop.fo]

        # reset the "do" things if necessary
        if (length(force.out.together.names) == 0)  do.force.out.together  <- FALSE

    }


    # Validate: no overlap btw `force_in_together` & `force_out_together -----

    if (do.force.in.together & do.force.out.together) {

        # index of list items to drop
        drop.fi <- numeric()
        drop.fo <- numeric()

        for (fi in seq_along(force.in.together.names)) {
            for (fo in seq_along(force.out.together.names)) {

                bad.names <- intersect(
                    force.in.together.names[[fi]],
                    force.out.together.names[[fo]]
                )

                # this fails if intersection is more than 1
                fail <- length(bad.names) >= 2

                if (fail) {

                    bad.names.string <- paste(glue("{bad.names}"), collapse = ", ")
                    msg <- glue("The following items appeared both in `force_in_together` and `force_out_together`, constraints with these items will be ignored:\n{bad.names.string}")
                    warn(msg)

                    # update index of list items to drop
                    drop.fi <- c(drop.fi, fi)
                    drop.fo <- c(drop.fo, fo)

                }

            }
        }

        # drop the combos if necessary
        if (length(drop.fi) > 0) force.in.together.names <- force.in.together.names[-drop.fi]
        if (length(drop.fo) > 0) force.out.together.names <- force.out.together.names[-drop.fo]

        # reset the "do" things if necessary
        if (length(force.in.together.names) == 0)  do.force.in.together  <- FALSE
        if (length(force.out.together.names) == 0) do.force.out.together <- FALSE

    }


    # Build constraints list --------------------------------------------------

    constraints <- list(
        "force_in"           = force.in.names,
        "force_in_together"  = force.in.together.names,
        "force_out"          = force.out.names,
        "force_out_together" = force.out.together.names
    )


    # Get the turf data -------------------------------------------------------

    # Just grab the columns to be turf'd
    item.mat <- select(data, all_of(item.names))


    # Validate `k` ------------------------------------------------------------

    # Make sure user is only requesting sets of `k` that make sense.

    # These are the number of items before any forced exclusions take place
    n.items <- ncol(item.mat)

    # Establish final `k` --
    # Setting to floor will force into an integer
    # Need to sort to make sure it cycles thru the set sizes in an order
    #   that makes sense
    # Drop duplicates, no need to run set sizes more than once
    # Subset `k` to be between 1 and the number of items
    k <-
        floor(k) %>%
        sort() %>%
        unique() %>%
        .[between(., 1, n.items)]

    if (length(k) == 0) {
        abort(glue(
            "Input to `k` must contain integers between 1 and number of columns in `items` ({n.items})."
        ))
    }


    # Validate `depth` --------------------------------------------------------

    if (!between(depth, 1, n.items)) {
        abort("Input to `depth` must be a value between 1 and number of `items` ({n.items}).")
    }


    # Drop items due to exclusions --------------------------------------------

    # Doing this here because it will save from having to create unnecessary
    # combinations with {arrangements}
    if (do.force.out) {
        item.mat <- select(item.mat, -all_of(force.out.names))
    }


    # Finally prep turf data --------------------------------------------------

    # Rfast::rowsums() needs the data to be in a matrix
    item.mat <- as.matrix(item.mat)

    # this needs to be updated if there are any forced exclusions
    item.names <- colnames(item.mat)
    n.items <- ncol(item.mat)

    # Check and make sure that all of the columns contain only 1/0/NA
    oz.check <- dapply(item.mat, is_onezero)

    oz.fail <- any(!oz.check)

    if (oz.fail) {
        bad.names <- names(oz.check[!oz.check])
        bad.names.string <- paste(bad.names, sep = ", ")
        msg <- glue(
            "All variables in `items` must contain only 0/1 data, the following do not:\n{bad.names.string}"
        )
        abort(msg)
    }


    # Check and see if any rows have 100% missing data
    all.miss <-
        dapply(
            X = item.mat,
            FUN = function(x) all(is.na(x)),
            MARGIN = 1
        ) %>%
        which()

    if (length(all.miss) > 0) {
        all.miss.string <- paste(all.miss, collapse = ", ")
        msg <- glue(
            "{length(all.miss)} rows in `data` have 100% missing values for the items specified in `items`. They will still be retained in the analysis and treated as \"unreached\". If you do not want those rows in the TURF analysis, please remove them ahead of time."
        )
        warn(msg)
    }

    # Replace NA with zero, makes sense since we are operating row-wise
    # for reach. This radically improves the speed of Rfast::rowsums().
    item.mat[is.na(item.mat)] <- 0


    # Initialize objects for storing and displaying results -------------------

    # For reach values
    reach.list <- vector(
        mode = "list",
        length = length(k)
    )

    # how long does it take to make and constrain the combos?
    combo.clock <- vector("double", length = length(k))

    # how long does turf calculation take place?
    turf.clock <- vector("double", length = length(k))

    # Used for string padding
    # This gets used for setting name of reach list as well as the item number
    # identifiers in the data
    pad <- max(nchar(k))
    names(reach.list) <- paste0("k_", str_pad(k, width = pad, side = "left", pad = "0"))


    # Item weights ------------------------------------------------------------

    do.item.weights <- FALSE
    if (!missing(item_weights)) {

        item.wgt.names <- names(item_weights)

        if (is.null(item.wgt.names)) {
            abort("Input to `item_weights` must be a named vector.")
        }

        if (any(item.wgt.names == "")) {
            abort("Cannot have empty characters as names in `item_weights`.")
        }

        if (any(is.na(item_weights))) {
            abort("Every element of `item_weights` must be named.")
        }

        if (length(unique(item.wgt.names)) != length(item.wgt.names)) {
            abort("There cannot be duplicate names in the names of `item_weights`.")
        }


        pos.check <- all(sign(item_weights) == 1)

        if (!pos.check) {
            abort("All `item_weights` must be positive and non-zero.")
        }

        bad <- setdiff(item.wgt.names, item.names)

        if (length(bad) > 0) {
            bad.string <- paste(bad, collapse = ", ")
            msg <- glue(
                "The following items specified in `item_weights` were not included in `items` and will be ignored:\n{bad.string}"
            )
            warn(msg)
            item_weights <- item_weights[names(item_weights) %in% item.names]

        }

        item.wgt <- rep(1, times = n.items)
        names(item.wgt) <- item.names

        item.wgt.default <- enframe(
            x = item.wgt,
            name = "item",
            value = "default"
        )

        item.wgt.new <- enframe(item_weights, name = "item", value = "new")

        item.wgt <-
            item.wgt.default %>%
            left_join(item.wgt.new, by = "item") %>%
            mutate(wgt = coalesce(new, default)) %>%
            pull(wgt, name = item)

        # if (all(item.wgt == 1)) {
        #     warn("All `item_weights` are 1 and will not be used.")
        #     do.item.weights <- FALSE
        # } else {
        #     do.item.weights <- TRUE
        # }

        if (all(item.wgt == 1)) {
            warn("All `item_weights` have a value of 1.")
        }

        do.item.weights <- TRUE

    } else {
        # item.wgt <- rep(1, times = n.items)
        # names(item.wgt) <- item.names
        item.wgt <- NULL
    }


    # Shapley approximation ---------------------------------------------------

    # Shapley approximation used for shapley greedy entry
    # Only need if using that method and greedy will actually kick in
    # if (greedy_entry == "shapley" & greedy_begin <= max(k)) {
    if (missing(case_weights)) {

        shap.approx <-
            shapley_approx(
                data = data,
                items = all_of(item.names),
                item_weights = item_weights,
                depth = depth,
                return = "tibble"
            ) %>%
            arrange(desc(shapley_value))

    } else {

        shap.approx <-
            shapley_approx(
                data = data,
                items = all_of(item.names),
                case_weights = {{case_weights}},
                item_weights = item_weights,
                depth = depth,
                return = "tibble"
            ) %>%
            arrange(desc(shapley_value))
    }
    # }


    # Rearrange the combo matrix ----------------------------------------------

    # Rearrange in order of reach
    # TODO



    # Do TURF ----------------------------------------------------------------

    cat_line(rule("TURF", line = 2))

    # string padding item names
    pad <- nchar(n.items)
    shap.keep <- 0

    # Begin iteratin'
    for (i in seq_along(k)) {

        cat_line(glue("Running combinations of {k[i]}\n"))

        # k cannot exceed number of items - this happens when there are
        # forced exclusions and running all k from 1 to # orig items
        if (k[i] > n.items) {

            cat_line(
                col_yellow(
                    "i Skipping this set size, cannot perform due to forced exclusions"
                )
            )
            next
        }

        # depth cannot exceed k
        if (depth > k[i]) {
            cat_line(
                col_yellow(
                    glue(
                        "i Skipping this set size, `depth` cannot exceed `k`"
                    )
                )
            )
            next
        }


        # start combo clock
        combo.clock1 <- Sys.time()

        # generate full set of combinations
        combos <- combinations(
            x = item.names,
            k = k[i]
        )
        colnames(combos) <- paste0("i_", str_pad(1:k[i], width = pad, side = "left", pad = "0"))

        # reduce by forced inclusions
        if (do.force.in) {

            # index of rows to keep
            keep.combos <- dapply(
                X = combos,
                FUN = function(x) all(force.in.names %in% x),
                MARGIN = 1
            )

            # subset the combos
            combos <- combos[keep.combos, , drop = FALSE]

            # check and make sure rows remain
            if (nrow(combos) == 0) {

                cat_line(
                    col_yellow(
                        "i Skipping this set size, no combinations remain after forced inclusions"
                    )
                )

                next

            }

        }

        # reduce by forced exclusions
        if (do.force.out) {

            # index of rows to keep
            # can't have any of the exclusions
            keep.combos <- dapply(
                X = combos,
                FUN = function(x) !any(force.out.names %in% x),
                MARGIN = 1
            )

            # subset the combos
            combos <- combos[keep.combos, , drop = FALSE]

            # check and make sure rows remain
            if (nrow(combos) == 0) {

                cat_line(
                    col_yellow(
                        "i Skipping this set size, no combinations remain after forced exclusions"
                    )
                )

                next

            }

        }

        # reduce by forced in together
        if (do.force.in.together) {

            # loop thru the list
            for (j in seq_along(force.in.together.names)) {

                keep.names <- force.in.together.names[[j]]

                # no need to continue if the number of forced mutual
                # incluions exceeds the current set size
                if (length(keep.names) > k[i]) {
                    next
                }

                # if any are present, all have to be present
                keep.combos <- dapply(
                    X = combos,
                    FUN = function(x) {
                        chk1 <- ifelse(any(x %in% keep.names), 1, -1)
                        chk2 <- ifelse(all(keep.names %in% x), 1, -1)

                        (chk1 * chk2) == 1
                    },
                    MARGIN = 1
                )

                combos <- combos[keep.combos, , drop = FALSE]

                if (nrow(combos) == 0) {

                    cat_line(
                        col_yellow(
                            "i Skipping this set size, no combinations remain after items forced in together"
                        )
                    )

                    break # completely exit this part of the for loop
                }

            }

        }

        # the forced mutual inclusions will break out of its for loop
        # skip over to the next set size if there are no combinations left
        if (nrow(combos) == 0) next


        # reduce by forced out together
        if (do.force.out.together) {

            # loop thru the list
            for (j in seq_along(force.out.together.names)) {

                drop.names <- force.out.together.names[[j]]

                # no need to continue if the number of forced mutual
                # incluions exceeds the current set size
                if (length(drop.names) > k[i]) {
                    next
                }


                # keep only comos where all of the drop names aren't in
                keep.combos <- dapply(
                    X = combos,
                    MARGIN = 1,
                    FUN = function(x) !all(drop.names %in% x)
                )

                combos <- combos[keep.combos, , drop = FALSE]

                if (nrow(combos) == 0) {

                    cat_line(
                        col_yellow(
                            "i Skipping this set size, no combinations remain after items forced out together"
                        )
                    )

                    break # completely exit this part of the for loop
                }

            }

        }

        # the forced mutual inclusions will break out of its for loop
        # skip over to the next set size if there are no combinations left
        if (nrow(combos) == 0) next


        # If greedying
        # set size has to be when greedy kicks in
        # can't be the first iteration
        # the set size has to be larger than the number of force inclusions
        if (k[i] >= greedy_begin & i > 1 & k[i] > length(force.in.names)) {

            if (greedy_entry == "reach") {

                # Find the best combo from k-1
                item.keep.greedy <-
                    reach.list[[i-1]] %>%
                    roworderv(cols = "reach", decreasing = TRUE) %>%
                    head(1) %>%
                    select(matches("i")) %>%
                    unlist() %>%
                    unname()

            } else if (greedy_entry == "freq") {

                item.keep.greedy <-
                    reach.list[[i-1]] %>%
                    roworderv(cols = "freq", decreasing = TRUE) %>%
                    head(1) %>%
                    select(matches("i")) %>%
                    unlist() %>%
                    unname()

            } else if (greedy_entry == "shapley") {

                shap.keep <- shap.keep + 1

                item.keep.greedy <-
                    shap.approx %>%
                    # force-ins have to be there already
                    # this skips them
                    filter(!item %in% force.in.names) %>%
                    head(shap.keep) %>%
                    pull(1)

            }

            keep.combos <- dapply(
                X = combos,
                MARGIN = 1,
                FUN = function(x) all(item.keep.greedy %in% x)
            )

            combos <- combos[keep.combos, , drop = FALSE]

        }


        # placeholder matrix
        n.combos <- nrow(combos)

        # end combo clock
        combo.clock2 <- Sys.time()
        combo.clock[i] <- as.numeric(
            difftime(combo.clock2, combo.clock1, units = "secs")
        )

        # this matrix receives the reach and freq calcs
        fill <- matrix(
            data = NA,
            ncol = 2,
            nrow = n.combos,
            dimnames = list(
                NULL,
                c("reach", "freq")
            )
        )

        # calculate reach
        # pb <- progress_bar$new(total = n.combos)
        # It should be noted that time was spent on optimizing this part of
        # the calculation. This for-loop is marginally faster than any of the
        # apply functions I have tried.

        if (progress) prog.seq <- floor(seq(1, n.combos, length.out = 10))

        turf.clock1 <- Sys.time()

        for (j in 1:nrow(combos)) {

            if (progress) {
                if (any(prog.seq == j)) {
                    cat("\r")
                    cat("  Progress:", percent(j/n.combos))
                    cat("\r")
                }
            }

            n.reached <- rowsums(item.mat[, combos[j, ], drop = FALSE])
            is.reached <- n.reached >= depth
            reach <- fmean(x = is.reached, w = wgt.vec)

            # denominator is now calculated outside the loop
            # since it does not change
            freq  <- sum(wgt.vec * n.reached) / wgt.base

            fill[j, ] <- c(reach, freq)

        }

        # Join the reach measures to the combinations
        reach.stats <-
            combos %>%
            as_tibble() %>%
            bind_cols(as_tibble(fill), .) %>%
            rowid_to_column("combo") # %>%
        # roworderv(cols = c("reach", "freq"), decreasing = TRUE)

        # update the list
        reach.list[[i]] <- reach.stats

        # log the clock
        turf.clock2 <- Sys.time()

        turf.clock[i] <- as.numeric(
            difftime(turf.clock2, turf.clock1, units = "secs")
        )

    }


    # update clocks and reach output
    k.null       <- map_lgl(reach.list, is.null)
    combo.clock  <- combo.clock[!k.null]
    turf.clock   <- turf.clock[!k.null]
    reach.list   <- reach.list[!k.null]
    total.clock2 <- Sys.time()
    total.clock  <- as.numeric(
        difftime(total.clock2, total.clock1, units = "secs")
    )


    # Add in item weights -----------------------------------------------------

    if (do.item.weights) {

        map_item_weights <- function(data, w) {
            cn <- data %>% select(matches("i")) %>% names()

            wgt.sum <- dapply(
                X = data[cn],
                MARGIN = 1,
                FUN = function(x) {
                    fsum(w[names(w) %in% x])
                }
            )

            mutate(
                .data = data,
                weight = wgt.sum,
                value = reach * weight,
                .before = matches("i_")
            )
        }

        reach.list <- map(
            .x = reach.list,
            .f = ~map_item_weights(.x, w = item.wgt)
        )
    }


    # Add in shapley values ---------------------------------------------------

    map_sv <- function(data, sv) {

        cn <- data %>% select(matches("i")) %>% names()

        sv.sum <- dapply(
            X = data[cn],
            MARGIN = 1,
            FUN = function(x) {
                fsum(sv[names(sv) %in% x])
            }
        )

        mutate(
            .data = data,
            shap = sv.sum,
            .after = freq
        )

    }

    reach.list <- map(
        .x = reach.list,
        .f = ~map_sv(.x, sv = deframe(shap.approx))
    )



    # Organize and return output ----------------------------------------------

    reach.list <-
        reach.list %>%
        enframe(name = "k", value = "results") %>%
        mutate(k = parse_number(k))


    clock.list <- list(
        total = total.clock,
        by_k = tibble(
            k = reach.list$k,
            n_combos = map_int(reach.list$results, nrow),
            combo_secs = combo.clock,
            turf_secs = turf.clock,
            combo_per_sec = n_combos / combo_secs,
            turf_per_sec =  n_combos / turf_secs
        )
    )

    info <- list(
        n = wgt.base,
        n_items = length(item.names),
        n_combos = sum(map_int(reach.list$results, nrow), na.rm = TRUE),
        items = item.names,
        item_weights = list(
            weighted = do.item.weights,
            weights = item.wgt
        ),
        k = k,
        depth = depth,
        case_weights = list(
            weighted = has.weights,
            name = ifelse(has.weights, case.weights.names, NA_character_)
        ),
        greedy = list(
            begin = greedy_begin,
            entry = greedy_entry
        ),
        progress = progress
    )

    out <- list(
        turf = reach.list,
        info = info,
        constraints = constraints,
        clock = clock.list
    )

    class(out) <- "turf"

    cat_line()

    out

}

utils::globalVariables(c(
    ".", "item", "shapley_value", "n_combos", "combo_secs", "turf_secs",
    "weight"
))

#' @exportS3Method print turf
print.turf <- function(x, ...) {

    # Begin messages
    cat_line(rule(left = "TURF", line = 2))

    # Build string for `k`
    k <- x$turf$k
    k.seq <- seqid(k)

    k.string <-
        tibble(k, k.seq) %>%
        group_by(k.seq) %>%
        mutate(min = min(k), max = max(k)) %>%
        ungroup() %>%
        mutate(
            string = case_when(
                min == max ~ as.character(k),
                .default = paste0(min,"-",max)
            ),
            string = fct_inorder(string)
        ) %>%
        distinct(string) %>%
        pull() %>%
        paste(collapse = ", ")


    # Build string for the total runtime
    runtime <- x$clock$total
    runtime <- ifelse(
        runtime <= 2*60,
        glue("{round(runtime, 2)} secs"),
        glue("{round(runtime / 60, 2)} mins")
    )

    # Print out summary messages
    cat_line(glue("# Items       {x$info$n_items}"))
    cat_line(glue("# Combos      {scales::comma(x$info$n_combos)}"))
    cat_line(glue("Set Sizes     {k.string}"))
    cat_line(glue("Sample Size   {x$info$n}"))
    cat_line(glue("Total Runtime {runtime}"))
    cat_line()

    # Item weights and case weights
    c.wgt <- x$info$case_weights$weighted
    i.wgt <- x$info$item_weights$weighted

    if (any(c.wgt, i.wgt)) {
        cat_line(rule(left = "Weights", line = 1))
        cat_line(glue("Case Weights {ifelse(c.wgt, 'Yes', 'No')}"))
        cat_line(glue("Item Weights {ifelse(i.wgt, 'Yes', 'No')}"))
        cat_line()
    }

    # Constraints
    con.in   <- length(x$constraints$force_in) > 0
    con.out  <- length(x$constraints$force_out) > 0
    con.in2  <- length(x$constraints$force_in_together) > 0
    con.out2 <- length(x$constraints$force_out_together) > 0

    if (any(con.in, con.out, con.in2, con.out2)) {

        cat_line(rule(left = "Constraints", line = 1))

        if (con.in) {
            con.in.str <-
                x$constraints$force_in %>%
                paste(collapse = ", ") %>%
                paste("\U2022", .)

            cat_line("Items Forced In")
            cat_line(con.in.str)
            cat_line()
        }

        if (con.out) {
            con.out.str <-
                x$constraints$force_out %>%
                paste(collapse = ", ") %>%
                paste("\U2022", .)

            cat_line("Items Forced Out")
            cat_line(con.out.str)
            cat_line()
        }

        if (con.in2) {
            con.in2.str <-
                x$constraints$force_in_together %>%
                map(paste, collapse = ", ") %>%
                map(~paste("\U2022", .x)) %>%
                paste(collapse = "\n")

            cat_line("Items Forced In Together")
            cat_line(con.in2.str)
            cat_line()
        }

        if (con.out2) {
            con.out2.str <-
                x$constraints$force_out_together %>%
                map(paste, collapse = ", ") %>%
                map(~paste("\U2022", .x)) %>%
                paste(collapse = "\n")

            cat_line("Items Forced Out Together")
            cat_line(con.out2.str)
            cat_line()
        }
    }

    # Greedy?
    did.greedy <- x$info$greedy$begin < max(x$turf$k)

    if (did.greedy) {
        cat_line(rule("Greedy"))
        cat_line(glue("Begins at size {x$info$greedy$begin}"))
        cat_line(glue("Entry method   {x$info$greedy$entry}"))
    }

    invisible(x)

}

utils::globalVariables(c(
    "string"
))
ttrodrigz/onezero documentation built on May 9, 2023, 2:59 p.m.