tests/bist_optbin.R
In optbin: Optimal Binning of Data

library(optbin)


### Helper Functions

# Short for cat(sprintf(fmt, ...)).  Automatically appends \n to fmt.
catf <- function(fmt, ...) {
  cat(sprintf(sprintf("%s\n", fmt), ...))
}

# Nearly-equal floating point test of two numbers, within the absolute
# accuracy acc.
fpnear <- function(x1, x2, acc=1e-7) {
  return(abs(x1-x2) < acc)
}


### optbin Functionality Tests

# Test the optimal binning result res with nbin bins against the expected
# endpoints expend, threshold values expthr, the total, best (M)SE expse,
# and the average value per bin expavg.  Returns TRUE if all tests pass,
# FALSE if any fail.
verify_binning <- function(nbin, res, expend, expthr, expse, expavg) {
  passed <- TRUE

  if (nbin != res$numbins) {
    catf("    expected %d bins but result has %d", nbin, res$numbins)
    passed <- FALSE
  }

  for (b in 1:nbin) {
    if (expend[b] != res$breaks[b]) {
      catf("    expected bin %d endpoint at %d but got %d",
           b, expend[b], res$breaks[b])
      passed <- FALSE
    }
  }

  for (b in 1:nbin) {
    if (!fpnear(expthr[b], res$thr[b])) {
      catf("    expected bin %d threshold %.3f but got %.3f",
           b, expthr[b], res$thr[b])
      passed <- FALSE
    }
  }

  if (!fpnear(expse, res$minse)) {
    catf("    expected bin %d metric %.5f but got %.5f",
         b, expse, res$minse)
    passed <- FALSE
  }

  for (b in 1:nbin) {
    if (!fpnear(expavg[b], res$binavg[b])) {
      catf("    expected bin %d average %.3f but got %.3f",
           b, expavg[b], res$binavg[b])
      passed <- FALSE
    }
  }

  return(passed)
}

# Compare two binning results, resA and resB.  Any difference causes the
# test to fail.  Returns TRUE if the two are the same, FALSE if there
# are differences.
compare_binning <- function(resA, resB) {
  passed <- TRUE

  if (resA$numbins != resB$numbins) {
    catf("    binning A has %d bins, B %d", resA$numbins, resB$numbins)
    passed <- FALSE
  }

  if (resA$metric != resB$metric) {
    catf("    binning A used %s, B %s", resA$metric, resB$metric)
    passed <- FALSE
  }

  if (!fpnear(resA$minse, resB$minse)) {
    catf("    binning A best %s %.5f, B %.5f", toupper(resA$metric),
         resA$minse, resB$minse)
    passed <- FALSE
  }

  for (b in 1:resA$numbins) {
    if (!fpnear(resA$thr[b], resB$thr[b])) {
      catf("    bin %d threshold in A is %.5f, B %.5f",
           b, resA$thr[b], resB$thr[b])
      passed <- FALSE
    }
  }

  for (b in 1:resA$numbins) {
    if (!fpnear(resA$binavg[b], resB$binavg[b])) {
      catf("    bin %d average in A is %.5f, B %.5f",
           b, resA$binavg[b], resB$binavg[b])
      passed <- FALSE
    }
  }

  for (b in 1:resA$numbins) {
    if (!fpnear(resA$thr[b], resB$thr[b])) {
      catf("    bin %d (M)SE in A is %.5f, B %.5f",
           b, resA$binse[b], resB$binse[b])
      passed <- FALSE
    }
  }

  for (b in 1:resA$numbins) {
    if (resA$breaks[b] != resB$breaks[b]) {
      catf("    bin %d endpoint in A is %d, B %d",
           b, resA$breaks[b], resB$breaks[b])
      passed <- FALSE
    }
  }

  return(passed)
}


# Test 1: Small linear sequence.  Check with 2, 3, and 4 bins.
optbin_test1 <- function() {
  passed <- TRUE

  data <- c(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0)

  catf("  running test set 1 with cache")

  b <- optbin(data, 2)
  if (!verify_binning(2, b, c(5, 10), c(5.0, 10.0), 20.0, c(3.0, 8.0))) {
    passed <- FALSE
  }

  b <- optbin(data, 3)
  if (!verify_binning(3, b, c(3, 6, 10), c(3.0, 6.0, 10.0),
                      9.0, c(2.0, 5.0, 8.5))) {
    passed <- FALSE
  }

  b <- optbin(data, 4)
  if (!verify_binning(4, b, c(2, 4, 7, 10), c(2.0, 4.0, 7.0, 10.0),
                      5.0, c(1.5, 3.5, 6.0, 9.0))) {
    passed <- FALSE
  }

  if (passed) {
    catf("    tests passed")
  }
  catf("  running test set 1 without cache")

  b <- optbin(data, 2, max.cache=0)
  if (!verify_binning(2, b, c(5, 10), c(5.0, 10.0), 20.0, c(3.0, 8.0))) {
    passed <- FALSE
  }

  b <- optbin(data, 3, max.cache=0)
  if (!verify_binning(3, b, c(3, 6, 10), c(3.0, 6.0, 10.0),
                      9.0, c(2.0, 5.0, 8.5))) {
    passed <- FALSE
  }

  b <- optbin(data, 4, max.cache=0)
  if (!verify_binning(4, b, c(2, 4, 7, 10), c(2.0, 4.0, 7.0, 10.0),
                      5.0, c(1.5, 3.5, 6.0, 9.0))) {
    passed <- FALSE
  }

  if (passed) {
    catf("    tests passed")
  }
  return(passed)
}

# Test 2: Two normals, N(1, 0.25), N(2, 0.25).  Check with 2, 3, and 4 bins.
optbin_test2 <- function() {
  passed <- TRUE

  data <- c(0.86, 0.91, 0.92, 1.04, 1.23, 1.23, 1.30, 1.36, 1.79, 1.89,
            1.89, 2.34, 2.41, 2.58, 2.79, 2.93, 2.96, 3.21, 3.41, 3.55)

  catf("  running test set 2 with cache")

  b <- optbin(data, 2)
  if (!verify_binning(2, b, c(11, 20), c(1.89, 3.55),
                      2.956779797980, c(1.31090909, 2.90888888))) {
    passed <- FALSE
  }

  b <- optbin(data, 3)
  if (!verify_binning(3, b, c(8, 14, 20), c(1.36, 2.58, 3.55),
                      1.269070833333, c(1.10625, 2.15, 3.14166666))) {
    passed <- FALSE
  }

  b <- optbin(data, 4)
  if (!verify_binning(4, b, c(8, 11, 17, 20), c(1.36, 1.89, 2.96, 3.55),
                      0.686537500000,
                      c(1.10625, 1.85666666, 2.668333333, 3.39))) {
    passed <- FALSE
  }

  if (passed) {
    catf("    tests passed")
  }
  catf("  running test set 2 without cache")

  b <- optbin(data, 2, max.cache=0)
  if (!verify_binning(2, b, c(11, 20), c(1.89, 3.55),
                      2.956779797980, c(1.31090909, 2.90888888))) {
    passed <- FALSE
  }

  b <- optbin(data, 3, max.cache=0)
  if (!verify_binning(3, b, c(8, 14, 20), c(1.36, 2.58, 3.55),
                      1.269070833333, c(1.10625, 2.15, 3.14166666))) {
    passed <- FALSE
  }

  b <- optbin(data, 4, max.cache=0)
  if (!verify_binning(4, b, c(8, 11, 17, 20), c(1.36, 1.89, 2.96, 3.55),
                      0.686537500000,
                      c(1.10625, 1.85666666, 2.668333333, 3.39))) {
    passed <- FALSE
  }

  if (passed) {
    catf("    tests passed")
  }
  return(passed)
}

# Test 3: Four constant bins.  Check with 2 t/m 6 bins (tie resolution 5, 6).
optbin_test3 <- function() {
  passed <- TRUE

  data <- c(1.0, 1.0, 1.0, 1.0, 3.0, 3.0, 3.0, 3.0,
            5.0, 5.0, 5.0, 5.0, 7.0, 7.0, 7.0, 7.0)

  catf("  running test set 3 with cache")

  b <- optbin(data, 2)
  if (!verify_binning(2, b, c(8, 16), c(3.0, 7.0), 16.0, c(2.0, 6.0))) {
    passed <- FALSE
  }

  b <- optbin(data, 3)
  if (!verify_binning(3, b, c(4, 8, 16), c(1.0, 3.0, 7.0), 8.0,
                      c(1.0, 3.0, 6.0))) {
    passed <- FALSE
  }

  b <- optbin(data, 4)
  if (!verify_binning(4, b, c(4, 8, 12, 16), c(1.0, 3.0, 5.0, 7.0), 
                      0.0, c(1.0, 3.0, 5.0, 7.0))) {
    passed <- FALSE
  }

  b <- optbin(data, 5)
  if (!verify_binning(5, b, c(2, 4, 8, 12, 16), c(1.0, 1.0, 3.0, 5.0, 7.0),
                      0.0, c(1.0, 1.0, 3.0, 5.0, 7.0))) {
    passed <- FALSE
  }

  b <- optbin(data, 6)
  if (!verify_binning(6, b, c(2, 4, 6, 8, 12, 16),
                      c(1.0, 1.0, 3.0, 3.0, 5.0, 7.0), 0.0,
                      c(1.0, 1.0, 3.0, 3.0, 5.0, 7.0))) {
    passed <- FALSE
  }

  if (passed) {
    catf("    tests passed")
  }
  catf("  running test set 3 without cache")

  b <- optbin(data, 2, max.cache=0)
  if (!verify_binning(2, b, c(8, 16), c(3.0, 7.0), 16.0, c(2.0, 6.0))) {
    passed <- FALSE
  }

  b <- optbin(data, 3, max.cache=0)
  if (!verify_binning(3, b, c(4, 8, 16), c(1.0, 3.0, 7.0), 8.0,
                      c(1.0, 3.0, 6.0))) {
    passed <- FALSE
  }

  b <- optbin(data, 4, max.cache=0)
  if (!verify_binning(4, b, c(4, 8, 12, 16), c(1.0, 3.0, 5.0, 7.0), 
                      0.0, c(1.0, 3.0, 5.0, 7.0))) {
    passed <- FALSE
  }

  b <- optbin(data, 5, max.cache=0)
  if (!verify_binning(5, b, c(2, 4, 8, 12, 16), c(1.0, 1.0, 3.0, 5.0, 7.0),
                      0.0, c(1.0, 1.0, 3.0, 5.0, 7.0))) {
    passed <- FALSE
  }

  b <- optbin(data, 6, max.cache=0)
  if (!verify_binning(6, b, c(2, 4, 6, 8, 12, 16),
                      c(1.0, 1.0, 3.0, 3.0, 5.0, 7.0), 0.0,
                      c(1.0, 1.0, 3.0, 3.0, 5.0, 7.0))) {
    passed <- FALSE
  }

  if (passed) {
    catf("    tests passed")
  }
  return(passed)
}

# Test 4: Data where SE and MSE bins differ.  Check with 2 and 3 bins.
optbin_test4 <- function() {
  passed <- TRUE

  data <- c(1.00, 1.25, 1.50, 1.75, 2.00, 2.25, 2.50, 2.75, 3.00,
            5.00, 8.00, 8.25, 8.50)

  catf("  running test set 2 with cache and using SE")

  b <- optbin(data, 2, metric='se')
  if (!verify_binning(2, b, c(9, 13), c(3.0, 8.5),
                      11.796875000000, c(2.0, 7.4375))) {
    passed <- FALSE
  }

  b <- optbin(data, 3, metric='se')
  if (!verify_binning(3, b, c(8, 10, 13), c(2.75, 5.0, 8.50),
                      4.750000000000, c(1.875, 4.0, 8.25))) {
    passed <- FALSE
  }

  if (passed) {
    catf("    tests passed")
  }
  catf("  running test set 2 with cache and using MSE")

  b <- optbin(data, 2, metric='mse')
  if (!verify_binning(2, b, c(10, 13), c(5.0, 8.5),
                      1.226666666667, c(2.30, 8.25))) {
    passed <- FALSE
  }

  b <- optbin(data, 3, metric='mse')
  if (!verify_binning(3, b, c(3, 10, 13), c(1.5, 5.0, 8.5),
                      1.08333333333, c(1.25, 2.75, 8.25))) {
    passed <- FALSE
  }

  if (passed) {
    catf("    tests passed")
  }
  catf("  running test set 2 without cache and using SE")

  b <- optbin(data, 2, metric='se', max.cache=0)
  if (!verify_binning(2, b, c(9, 13), c(3.0, 8.5),
                      11.796875000000, c(2.0, 7.4375))) {
    passed <- FALSE
  }

  b <- optbin(data, 3, metric='se', max.cache=0)
  if (!verify_binning(3, b, c(8, 10, 13), c(2.75, 5.0, 8.50),
                      4.750000000000, c(1.875, 4.0, 8.25))) {
    passed <- FALSE
  }

  if (passed) {
    catf("    tests passed")
  }
  catf("  running test set 2 without cache and using MSE")

  b <- optbin(data, 2, metric='mse', max.cache=0)
  if (!verify_binning(2, b, c(10, 13), c(5.0, 8.5),
                      1.226666666667, c(2.30, 8.25))) {
    passed <- FALSE
  }

  b <- optbin(data, 3, metric='mse', max.cache=0)
  if (!verify_binning(3, b, c(3, 10, 13), c(1.5, 5.0, 8.5),
                      1.08333333333, c(1.25, 2.75, 8.25))) {
    passed <- FALSE
  }

  if (passed) {
    catf("    tests passed")
  }
  return(passed)
}

# Fill a large array with random data and compare the results with and
# without caching.
optbin_caching <- function() {
  passed <- TRUE

  catf("  running random data with and without caching using SE")

  # 2520 is a multiple of 5, 7, 8, and 9 so all bins fall on endpoints.
	set.seed(17)
  data <- runif(2520)

  for (b in 2:10) {
    bcache <- optbin(data, b)
    bnocache <- optbin(data, b, max.cache=0)
    if (!compare_binning(bcache, bnocache)) {
      passed <- FALSE
    }
  }

  if (passed) {
    catf("    tests passed")
  }
  catf("  running random data with and without caching using MSE")

  for (b in 2:10) {
    bcache <- optbin(data, b, metric='mse')
    bnocache <- optbin(data, b, metric='mse', max.cache=0)
    if (!compare_binning(bcache, bnocache)) {
      passed <- FALSE
    }
  }

  if (passed) {
    catf("    tests passed")
  }
  return(passed)
}


### assign.optbin Functionality Tests

# Check that the count of values assigned to each bin match the bin size
# in points.  Returns TRUE if all tests pass, FALSE if any fail.
assign_test1 <- function() {
  passed <- TRUE

  catf("  running assign.optbin tests")

  set.seed(19)
  data <- rnorm(1000, mean=2, sd=0.5)

  binned <- optbin(data, 10)
  cnt <- table(assign.optbin(data, binned))
  stpt <- 1
  for (b in 1:10) {
    expcnt <- binned$breaks[b] - stpt + 1
    if (cnt[b] != expcnt) {
      catf("    %d values assigned to bin %d but range has %d",
           cnt[b], b, expcnt)
      passed <- FALSE
    }
    stpt <- binned$breaks[b] + 1
  }

  if (passed) {
    catf("    tests passed")
  }
  catf("  running assign.optbin tests without extending upper bound")

  dmax <- max(data)
  data2 <- c(data, dmax+1:10)
  cnt <- table(assign.optbin(data2, binned))
  stpt <- 1
  for (b in 1:10) {
    expcnt <- binned$breaks[b] - stpt + 1
    if (cnt[b] != expcnt) {
      catf("    %d values assigned to bin %d but range has %d",
           cnt[b], b, expcnt)
      passed <- FALSE
    }
    stpt <- binned$breaks[b] + 1
  }

  if (passed) {
    catf("    tests passed")
  }
  catf("  running assign.optbin tests while extending upper bound")

  cnt <- table(assign.optbin(data2, binned, extend.upper=TRUE))
  stpt <- 1
  for (b in 1:9) {
    expcnt <- binned$breaks[b] - stpt + 1
    if (cnt[b] != expcnt) {
      catf("    %d values assigned to bin %d but range has %d",
           cnt[b], b, expcnt)
      passed <- FALSE
    }
    stpt <- binned$breaks[b] + 1
  }

  b <- 10
  expcnt <- binned$breaks[b] - stpt + 1
  if (cnt[b] != (expcnt + 10)) {
    catf("    %d values assigned to bin %d but expected %d",
         cnt[b], b, expcnt+10)
    passed <- FALSE
  }

  if (passed) {
    catf("    tests passed")
  }
  return(passed)
}


### Break The Part tests

# Various error checks on input data to optbin.  Returns TRUE if all tests
# pass, FALSE if any fail.
BTP_optbin <- function() {
  passed <- TRUE

  catf("  running optbin BTP tests")

  b <- tryCatch(optbin(c(1, 2, 3, 4, 5, "blah", 6), -1),
                error=function(e) { NULL })
  if (!is.null(b)) {
    catf("    negative number of bins did not raise error")
    passed <- FALSE
  }

  b <- tryCatch(optbin(c(1, 2, 3, 4, 5, "blah", 6), 1),
                error=function(e) { NULL })
  if (!is.null(b)) {
    catf("    too few bins did not raise error")
    passed <- FALSE
  }

  b <- tryCatch(optbin(c(1, 2, 3, 4, 5, "blah", 6), 2),
                error=function(e) { NULL })
  if (!is.null(b)) {
    catf("    non-numeric entry in data without na.rm did not raise error")
    passed <- FALSE
  }

  b <- tryCatch(b <- optbin(c(1, 2, 3, 4, 5, "blah", 6), 2, na.rm=TRUE),
                error=function(e) { NULL })
  if (is.null(b)) {
    catf("    non-numeric entry in data with na.rm did raise error")
    passed <- FALSE
  } else if (!verify_binning(2, b, c(3,6), c(3.0, 6.0), 4, c(2.0, 5.0))) {
    passed <- FALSE
  }

  b <- tryCatch(optbin(c(1, 2, 3, 4, 5, 6), 10),
                error=function(e) { NULL })
  if (!is.null(b)) {
    catf("    too few values for number of bins did not raise error")
    passed <- FALSE
  }

  # This is test1 data.
  data <- c(1.0, 2.0, 3.0, 4.0, NA, 5.0, 6.0, 7.0, NA, 8.0, 9.0, NA, 10.0)

  b <- tryCatch(optbin(data, 2),
                error=function(e) { NULL })
  if (!is.null(b)) {
    catf("    NAs in data without removing did not raise error")
    passed <- FALSE
  }

  b <- tryCatch(optbin(data, 2, na.rm=TRUE),
                error=function(e) { NULL })
  if (is.null(b)) {
    catf("    removed NAs still caused error")
    passed <- FALSE
  } else if (!verify_binning(2, b, c(5, 10), c(5.0, 10.0), 20.0, c(3.0, 8.0))) {
    passed <- FALSE
  }

  b <- tryCatch(optbin(reverse(data), 2, is.sorted=TRUE),
                error=function(e) { NULL })
  if (!is.null(b)) {
    catf("    unsorted data after setting is.sorted TRUE did not raise error")
    passed <- FALSE
  }

  b <- tryCatch(optbin(data, 2, is.sorted=T, na.rm=TRUE),
                error=function(e) { NULL })
  if (is.null(b)) {
    catf("    pre-sorted data raised error")
    passed <- FALSE
  } else if (!verify_binning(2, b, c(5, 10), c(5.0, 10.0), 20.0, c(3.0, 8.0))) {
    passed <- FALSE
  }

  if (passed) {
    catf("    tests passed")
  }
  return(passed)
}

# Error checks on the arguments to plot.optbin.  Returns TRUE if all tests
# pass, FALSE if any fail.
BTP_plot <- function() {
  passed <- TRUE

  catf("  running plot.optbin BTP tests")

  data <- 1:10
  b <- optbin(data, 2)

  # These generate an error because we use them in the basic plot.
  # plot doesn't return anything so our error function will generate an NA.
  res <- tryCatch(plot(b2, ann=TRUE), error=function(e) { NA })
  if (!is.na(res)) {
    catf("    passing ann to plot.optbin did not raise error")
    passed <- FALSE
  }

  res <- tryCatch(plot(b2, xaxt='s'), error=function(e) { NA })
  if (!is.na(res)) {
    catf("    passing xaxt to plot.optbin did not raise error")
    passed <- FALSE
  }

  if (passed) {
    catf("    tests passed")
  }
  return(passed)
}



### Top Level

allpass <- TRUE
cat('\nStarting optbin verification\n')

if (!optbin_test1()) {
  allpass <- FALSE
}
if (!optbin_test2()) {
  allpass <- FALSE
}
if (!optbin_test3()) {
  allpass <- FALSE
}
if (!optbin_test4()) {
  allpass <- FALSE
}
if (!optbin_caching()) {
  allpass <- FALSE
}
if (!suppressWarnings(BTP_optbin())) {
  allpass <- FALSE
}

if (!assign_test1()) {
  allpass <- FALSE
}

if (!BTP_plot()) {
  allpass <- FALSE
}


if (allpass) {
  cat('\nAll tests PASSED\n\n')
} else {
  cat('\nSome test FAILED\n\n')
  # This should cause the build check to fail.
  error("test failure")
}