misc/aqp2/jindex-benchmarks.R

spc_j_DT <- function(x, i, j) {

  #  this is already live in AQP for data.table SPCs -- is it worth enabling globally?

  h <- data.table::as.data.table(horizons(x))

  # local vars to make R CMD check happy
  .N <- NULL
  .I <- NULL
  V1 <- NULL

  # data.table can do this much more efficiently
  # if (requireNamespace("data.table", quietly = TRUE)) {
  idn <- idname(x)

  # by list @horizons idname (essentially iterating over profiles)
  bylist <- list(h[[idn]])
  names(bylist) <- idn

  # figured out the data.table way to do this
  #  not using := or . anymore

  # determine j indices to KEEP
  j.idx <- h[, .I[1:.N %in% j], by = bylist]$V1

  # determine which site indices to keep
  # in case all horizons are removed, remove sites too
  if (length(j.idx) == 0) {
    i.idx <- numeric(0)
  } else {
    # determine which profile IDs KEEP
    pids <- h[, .I[any(1:.N %in% j)][1], by = bylist]
    i.idx <- pids[, .I[!is.na(V1)]]
  }
  return(list(i.idx, j.idx))
}

# benchmark faster replacement of j subsetting of horizon data

spc_j_DT_2 <- function(x, i, j) {

  #  this is already live in AQP for data.table SPCs
  # question: is it worth enabling globally?

  h <- data.table::as.data.table(horizons(x))

  # local vars to make R CMD check happy
  .N <- NULL
  .I <- NULL
  V1 <- NULL

  idn <- idname(x)

  # by list @horizons idname (essentially iterating over profiles)
  bylist <- list(h[[idn]])
  names(bylist) <- idn

  # figured out the data.table way to do this
  #  not using := or . anymore

  # determine j indices to KEEP
  j.idx <- h[, .I[1:.N %in% j], by = bylist]$V1

  # determine which site indices to keep
  # in case all horizons are removed, remove sites too
  if (length(j.idx) == 0) {
    i.idx <- numeric(0)
  } else {
    # determine which profile IDs KEEP
    i.idx <- which(profile_id(x) %in% unique(h[j.idx,][[idn]]))
    # i.idx <- which(vapply(j.res, function(jr) { any(jr) }, FUN.VALUE = logical(1)))
  }
  return(list(i.idx, j.idx))
}

spc_j_base <- function(x, i, j) {
    h <- horizons(x)
    # retain a base R way of doing things (plenty fast with SPCs up to ~100k or so)
    j.res <- as.list(aggregate(
        h[[hzidname(x)]],
        by = list(h[[idname(x)]]),
        FUN = function(hh) {
          list(1:length(hh) %in% j)
        },
        drop = FALSE
      )$x)

    ##  https://github.com/ncss-tech/aqp/issues/89
    # fix #89, where i with no matching j e.g. @site data returned
    i.idx <- which(as.logical(lapply(j.res, function(jr) { any(jr) })))
    j.idx <-  which(do.call('c', j.res))

    return(list(i.idx, j.idx))
}

spc_j_base_2 <- function(x, i, j) {
  h <- horizons(x)
  idn <- idname(x)

  # slightly more competitive version of base
  j.res <- as.list(aggregate(
    seq_len(nrow(h)),
    by = list(h[[idn]]),
    FUN = function(hh) {
      (1:length(hh) %in% j)
    }
  )$x)

  # i.idx <- profile_id(x) %in% unique(h[unlist(j.res), ][[idn]]) # not faster?
  i.idx <- which(vapply(j.res, function(jr) { any(jr) }, FUN.VALUE = logical(1)))

  j.idx <-  which(unlist(j.res))

  return(list(i.idx, j.idx))
}

spc_j_base_3 <- function(x, i, j) {
  h <- horizons(x)
  idn <- idname(x)

  # slightly more competitive version of base
  j.res <- aggregate(
    seq_len(nrow(h)),
    by = list(h[[idn]]),
    FUN = function(hh) {
      (1:length(hh) %in% j)
    }
  )$x

  i.idx <- which(vapply(j.res, any, FUN.VALUE = logical(1)))
  j.idx <-  which(unlist(j.res))

  return(list(i.idx, j.idx))
}

library(aqp)

data(sp4)
depths(sp4) <- id ~ top + bottom

# base outperforms old DT with 10 profiles, but new solution slightly better
microbenchmark::microbenchmark(spc_j_base(sp4, 1:10, 2:3),
                               spc_j_base_2(sp4, 1:10, 2:3),
                               spc_j_base_3(sp4, 1:10, 2:3),
                               spc_j_DT(sp4, 1:10, 2:3),
                               spc_j_DT_2(sp4, 1:10, 2:3))

# whether the SPC is a "data.table SPC" ahead of time does not affect overhead much
sp4_DT <- data.table::as.data.table(horizons(sp4))
depths(sp4_DT) <- id ~ top + bottom

bench::mark(spc_j_base(sp4_DT, 1:10, 2:3),
            spc_j_base_2(sp4_DT, 1:10, 2:3),
            spc_j_base_3(sp4_DT, 1:10, 2:3),
            spc_j_DT(sp4_DT, 1:10, 2:3),
            spc_j_DT_2(sp4_DT, 1:10, 2:3))

# basically neck-and-neck at 1000 profiles; closing the gap of base over DT from ~50% to about ~5%
# the new solution is 200% faster than both old ones
thousand <- as.data.frame(data.table::rbindlist(lapply(1:1000, random_profile)))
depths(thousand) <- id ~ top + bottom

bench::mark(spc_j_base(thousand, 1:10, 2:3),
            spc_j_base_2(thousand, 1:10, 2:3),
            spc_j_base_3(thousand, 1:10, 2:3),
            spc_j_DT(thousand, 1:10, 2:3),
            spc_j_DT_2(thousand, 1:10, 2:3))

# ten thousand profiles, we see a benefit from DT (old: ~20% faster than base; new 300% faster)
tenthousand <- as.data.frame(data.table::rbindlist(lapply(1:10000, random_profile)))
depths(tenthousand) <- id ~ top + bottom

bench::mark(spc_j_base(tenthousand, 1:10, 2:3),
            spc_j_base_2(tenthousand, 1:10, 2:3),
            spc_j_base_3(tenthousand, 1:10, 2:3),
            spc_j_DT(tenthousand, 1:10, 2:3),
            spc_j_DT_2(tenthousand, 1:10, 2:3))

# hundred thousand profiles, we see a similar benefit from DT -- 2 to 3 times faster
# hundredthousand <- as.data.frame(data.table::rbindlist(lapply(1:100000, random_profile)))
# depths(hundredthousand) <- id ~ top + bottom
#
# bench::mark(spc_j_base(hundredthousand, 1:10, 2:3),
#             spc_j_base_2(hundredthousand, 1:10, 2:3),
#             spc_j_base_3(hundredthousand, 1:10, 2:3),
#             spc_j_DT(hundredthousand, 1:10, 2:3),
#             spc_j_DT_2(hundredthousand, 1:10, 2:3))

# # million profiles, we see a benefit from DT
# million <- as.data.frame(data.table::rbindlist(lapply(1:1000000, random_profile)))
# depths(million) <- id ~ top + bottom
#
# bench::mark(
#   spc_j_base(million, 1:10, 2:3),
#   spc_j_DT(million, 1:10, 2:3),
#   spc_j_DT_2(hundredthousand, 1:10, 2:3)
# )
ncss-tech/aqp documentation built on April 19, 2024, 5:38 p.m.