test-code/turf-working/old-attempts/best_of.R

library(tidyverse)
library(tidyselect)
library(microbenchmark)
library(turfR)
library(arrangements)
library(onezero)
library(progress)
library(profvis)

"https://stackoverflow.com/questions/51774646/efficiency-of-matrix-rowsums-vs-colsums-in-r-vs-rcpp-vs-armadillo"

x <-
    turfR::turf_ex_data %>%
    as_tibble()

terf2 <- function(
    data, cols, k, need = 1, force.in, weight, fast
) {

    # Parse information -------------------------------------------------------

    # temp data for tidyeval
    tmp <- slice(data, 1)

    # Handle weights --

    # If no weights, create a vector of 1s
    if (missing(weight)) {

        wgt.vec <- rep(1, times = nrow(data))

    } else {

        .weight <- enquo(weight)
        weight.index <- eval_select(
            expr = .weight,
            data = tmp
        )
        weight.name <- names(weight.index)

        if (length(weight.name) > 1) {
            stop("Can only pass one column of weights to `weight` input.")
        }

        wgt.vec <- data[[weight.name]]
        data    <- select(data, -all_of(weight.name))

    }

    # Get columns used in the analysis --

    .cols <- enquo(cols)
    col.index <- eval_select(
        expr = .cols,
        data = tmp
    )
    col.names <- names(col.index)

    if (length(col.names) < k) {
        stop("Value of `k` cannot exceed number of columns specified in `cols`.")
    }

    data <- data[col.names]

    # Identify names of variables forced in --

    if (!missing(force.in)) {
        .force.in <- enquo(force.in)
        force.index <- eval_select(
            expr = .force.in,
            data = tmp
        )
        force.names <- names(force.index)

        bad.names <- force.names[!force.names %in% col.names]

        if (length(bad.names) > 0) {
            stop(paste0(
                "Tried to force in columns that are not included in `cols` argument:\n",
                paste(bad.names, collapse = ", ")
            ))
        }
    }

    # Get all combos ----------------------------------------------------------

    combos <- combinations(
        x = col.names,
        k = k
    )

    colnames(combos) <- paste0("v", 1:k)

    # subset the combos if force in
    if (!missing(force.in)) {

        keep.combos <- apply(combos, 1, function(x) all(force.names %in% x))
        combos <- combos[keep.combos, , drop = FALSE]

    }

    n.combos <- nrow(combos)


    # Calculate ---------------------------------------------------------------

    fill <- matrix(
        data = NA,
        ncol = 2,
        nrow = n.combos,
        dimnames = list(
            NULL,
            c("reach", "freq")
        )
    )

    if (fast %in% c("Rfast", "cpp")) {
        data.0 <- as.matrix(data)
        data.0[is.na(data.0)] <- 0
    }

    for (i in 1:nrow(combos)) {

        if (n.combos > 1500) {
            cat('\r', paste0("Combination ", i, "/", n.combos, " (", round(100*(i/n.combos)), "%)"))
            flush.console()
        }

        if (fast == "Rfast") {
            n.reached <- Rfast::rowsums(data.0[, combos[i, ]])
        }

        if (fast == "cpp") {
            n.reached <- row_sums_cpp(data.0[, combos[i, ]])
        }

        if (fast == "no") {
            n.reached <- rowSums(data[combos[i, ]], na.rm = TRUE)
        }

        is.reached <- n.reached >= need

        # final versions here
        reach <- weighted_mean_cpp(is.reached, wgt.vec)
        freq  <- sum(wgt.vec * n.reached) / sum(wgt.vec[is.reached])

        out <- c(reach, freq)

        fill[i, ] <- out

    }


    reach.compare <- matrix(
        nrow = 3,
        ncol = length(col.names),
        dimnames = list(
            c("in", "out", "net"),
            col.names
        )
    )

    # for (i in seq_along(col.names)) {
    #
    #     item.i <- col.names[i]
    #
    #     in.out <- apply(combos, 1, function(x) ifelse(any(item.i %in% x), "in", "out"))
    #
    #     in.out.means <- aggregate(
    #         fill[, "reach"],
    #         by = list("in_out" = in.out),
    #         FUN = mean,
    #         simplify = TRUE
    #     )
    #
    #     reach.compare[-3, i] <- in.out.means$x
    #
    # }
    #
    # reach.compare[3, ] <- reach.compare[1, ] - reach.compare[2, ]

    out <-
        as_tibble(combos) %>%
        bind_cols(as_tibble(fill)) %>%
        rowid_to_column("combo") %>%
        arrange(desc(reach), desc(freq))

    cat("\n\n")
    out

    # list(
    #     reach.compare = reach.compare,
    #     turf = out
    # )

}

identical(
    terf2(
        data = pets,
        cols = dog:bird,
        weight = weight,
        k = 2,
        fast = "Rfast"
    ),

    terf2(
        data = pets,
        cols = dog:bird,
        weight = weight,
        k = 2,
        fast = "no"
    )
)




# Compare testing ---------------------------------------------------------

make_data <- function(n, m) {
    matrix(
        data = sample(c(1, 0, NA), n * m, T, c(0.45, 0.45, 0.00)),
        nrow = n,
        ncol = m,
        dimnames = list(
            NULL, paste0("i", 1:m)
        )
    ) %>%
        as_tibble() %>%
        rowid_to_column() %>%
        add_column(weight = 1, .after = 1)
}

d1 <- make_data(n = 1000, m = 12)

microbenchmark(

    "rowSums" = terf(
        data = d1,
        cols = matches("^i"),
        k = 6,
        need = 1,
        weight = weight,
        fast = "no"
    ),

    "Rfast::rowsums" = terf(
        data = d1,
        cols = matches("^i"),
        k = 6,
        need = 1,
        weight = weight,
        fast = "Rfast"
    ),

    "row_sums_cpp" = terf(
        data = d1,
        cols = matches("^i"),
        k = 6,
        need = 1,
        weight = weight,
        fast = "cpp"
    ),

    "hack" = turf(
        data = d1,
        n = 12,
        k = 6,
        depth = 1
    ),
    times = 30,
    unit = "s"
)

microbenchmark(

    "mine" = terf(
        data = d1,
        cols = matches("^i"),
        k = 6,
        need = 1,
        weight = weight,
        fast = "Rfast"
    ),

    "hack" = turf(
        data = d1,
        n = 12,
        k = 6,
        depth = 1
    ),
    times = 10,
    unit = "s"

)



N <- 1000
m <- matrix(
    data = sample(c(1, 0, NA), N * N, T, c(0.45, 0.45, 0.1)),
    ncol = N,
    nrow = N
)

mrf <- function(x) {
    Rfast::rowsums(
        x = x
    )
}

microbenchmark::microbenchmark(
    "base" = rowSums(m, na.rm = TRUE),
    "new" = row_sums_cpp(m),
    "rfast" = mrf(m)
)

identical(
    rowSums(m, na.rm = TRUE),
    mrf(m)
)
ttrodrigz/onezero documentation built on May 9, 2023, 2:59 p.m.