tests/testthat/test_base_measures.R

test_that("measures", {
  requirePackagesOrSkip("Hmisc", default.method = "load")

  ct = binaryclass.task
  options(warn = 2)
  on.exit(options(warn = 0))
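  # dummy measure: always returns 1; checks that a user-defined measure is
  # evaluated alongside the built-in ones for both response and prob predictions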
  mymeasure = makeMeasure(
    id = "foo", minimize = TRUE, properties = c(
      "classif", "classif.multi",
      "regr", "predtype.response", "predtype.prob"),
    fun = function(task, model, pred, feats, extra.args) {
      tt = pred
      1
    }
  )
  ms = list(mmce, acc, bac, tp, fp, tn, fn, tpr, fpr, tnr, fnr, ppv, npv, mcc, f1, mymeasure)

  lrn = makeLearner("classif.rpart")
  mod = train(lrn, task = ct, subset = binaryclass.train.inds)
  pred = predict(mod, task = ct, subset = binaryclass.test.inds)
  perf = performance(pred, measures = ms)
  expect_numeric(perf, any.missing = FALSE, len = length(ms))

  rdesc = makeResampleDesc("Holdout", split = 0.2)
  r = resample(lrn, ct, rdesc, measures = ms)
  expect_equal(
    names(r$measures.train),
    c(
      "iter", "mmce", "acc", "bac", "tp", "fp", "tn", "fn", "tpr", "fpr", "tnr",
      "fnr", "ppv", "npv", "mcc", "f1", "foo"))
  expect_equal(
    names(r$measures.test),
    c(
      "iter", "mmce", "acc", "bac", "tp", "fp", "tn", "fn", "tpr", "fpr", "tnr",
      "fnr", "ppv", "npv", "mcc", "f1", "foo"))

  # test that measures work for se
  ms = list(mse, timetrain, timepredict, timeboth, featperc)
  lrn = makeLearner("regr.lm", predict.type = "se")
  mod = train(lrn, task = regr.task, subset = regr.train.inds)
  pred = predict(mod, task = regr.task, subset = regr.test.inds)
  perf = performance(pred, measures = ms, model = mod)
  expect_numeric(perf, any.missing = FALSE, len = length(ms))

  # Test multiclass auc
  lrn = makeLearner("classif.randomForest", predict.type = "prob")
  mod = train(lrn, task = multiclass.task, subset = multiclass.train.inds)
  pred = predict(mod, task = multiclass.task, subset = multiclass.test.inds)
  perf = performance(pred, measures = list(
    multiclass.aunu, multiclass.aunp,
    multiclass.au1u, multiclass.au1p))
  expect_numeric(perf, any.missing = FALSE)

  # test survival measure
  ms = list(cindex, cindex.uno, iauc.uno)
  learners = c("surv.rpart", "surv.coxph")

  for (lrn in learners) {
    mod = suppressWarnings(train(lrn, task = surv.task, subset = surv.train.inds))
    pred = predict(mod, task = surv.task, subset = surv.test.inds)
    perf = performance(pred, model = mod, task = surv.task, measures = ms)
    Map(function(measure, perf) {
      r = range(measure$worst, measure$best)
      expect_number(perf, lower = r[1], upper = r[2], label = measure$id)
    }, measure = ms, perf = perf)
  }

  task = lung.task
  rin = makeResampleInstance("Holdout", task = task)
  for (lrn in learners) {
    res = resample(lrn, task, resampling = rin, measures = ms)$aggr
    expect_numeric(res, any.missing = FALSE)
    Map(function(measure) {
      r = range(measure$worst, measure$best)
      expect_number(res[[sprintf("%s.test.mean", measure$id)]],
        lower = r[1],
        upper = r[2], label = measure$id)
    }, measure = ms)
  }
})

test_that("classif measures do not produce integer overflow", {
  tsk = oversample(subsetTask(pid.task, features = getTaskFeatureNames(pid.task)), 1000)
  lrn = makeLearner("classif.rpart", predict.type = "prob")
  ms = listMeasures("classif", create = TRUE)
  r = holdout(lrn, tsk, measures = ms, show.info = FALSE)
  expect_numeric(r$aggr, any.missing = FALSE)
})

test_that("measures with same id still work", {
  m1 = mmce
  m2 = acc
  m1$id = m2$id = "foo"
  r = holdout("classif.rpart", iris.task, measures = list(m1, m2))
  expect_true(r$aggr[1L] < 0.2 && r$aggr[2L] > 0.8)
})

test_that("ber with faulty model produces NA", {
  data = iris
  data[, 1] = 1
  lrn = makeLearner("classif.lda", config = list(on.learner.error = "quiet"))
  task = makeClassifTask(data = data, target = "Species")
  r = holdout(lrn, task, measures = ber)
  expect_true(is.na(r$aggr))
})

test_that("db with single cluster doesn't give warnings", {
  requirePackagesOrSkip("clusterSim", default.method = "load")
  # use mtcars.task instead of the agri task here because the agri task
  # triggers a warning when column names contain 'x' or 'y'
  expect_warning(crossval("cluster.kmeans", mtcars.task), NA)
})

test_that("mcc is implemented correctly", { # see issue 363
  r = holdout("classif.rpart", sonar.task, measures = mcc)
  p = as.data.frame(r$pred)
  cm = calculateConfusionMatrix(r$pred)$result[1:2, 1:2]

  # taken from psych::phi; the phi coefficient is another name for mcc
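  # for a 2x2 table with cell proportions p_ij and marginal proportions p_i., p_.j:
  # phi = (p_11 - p_1. * p_.1) / sqrt(p_1. * p_.1 * p_2. * p_.2)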
  r.sum = rowSums(cm)
  c.sum = colSums(cm)
  total = sum(r.sum)
  r.sum = r.sum / total
  c.sum = c.sum / total
  v = prod(r.sum, c.sum)
  phi = (cm[1, 1] / total - c.sum[1] * r.sum[1]) / sqrt(v)
  expect_equal(r$aggr[[1]], phi[[1L]])
})

test_that("listMeasures", {
  mycheck = function(type) {
    xs = listMeasures(type, create = TRUE)
    expect_true(is.list(xs) && length(xs) > 0L, info = type)
    expect_true(all(vlapply(xs, inherits, what = "Measure")), info = type)
  }
  mycheck("classif")
  mycheck("regr")
  mycheck("cluster")
  mycheck("surv")
  mycheck("costsens")
  mycheck("multilabel")
})

test_that("check measure calculations", {
  requirePackagesOrSkip("clusterSim", default.method = "load")

  # skip where RWeka (needed for cluster.EM below) is not available
  skip_on_cran()
  skip_on_os("windows")

  requirePackagesOrSkip("Hmisc", default.method = "load")

  # tiny datasets for testing
  # features
  var1 = c(1, 2, 3, 4)
  var2 = c(3, 4, 1, 2)

  # for regression
  tar.regr = c(5, 10, 0, 5)
  pred.art.regr = c(4, 11, 0, 4)
  data.regr = data.frame(var1, var2, tar.regr)
  task.regr = makeRegrTask(data = data.regr, target = "tar.regr")
  lrn.regr = makeLearner("regr.rpart")
  mod.regr = train(lrn.regr, task.regr)
  pred.regr = predict(mod.regr, task.regr)
  pred.regr$data$response = pred.art.regr

  # for multiclass
  tar.classif = factor(c(1L, 2L, 0L, 1L))
  pred.art.classif = factor(c(1L, 1L, 0L, 2L))
  data.classif = data.frame(var1, var2, tar.classif)
  task.classif = makeClassifTask(data = data.classif, target = "tar.classif")
  lrn.classif = makeLearner("classif.rpart", predict.type = "prob")
  mod.classif = train(lrn.classif, task.classif)
  pred.classif = predict(mod.classif, task.classif)
  pred.classif$data$response = pred.art.classif

  # for binaryclass
  tar.bin = factor(c(1L, 0L, 0L, 1L))
  pred.art.bin = factor(c(1L, 1L, 0L, 0L))
  data.bin = data.frame(var1, var2, tar.bin)
  task.bin = makeClassifTask(data = data.bin, target = "tar.bin")
  lrn.bin = lrn.classif
  mod.bin = train(lrn.bin, task.bin)
  pred.bin = predict(mod.bin, task.bin)
  pred.bin$data$response = pred.art.bin

  # for multilabel
  tar1.multilabel = c(TRUE, FALSE, FALSE, TRUE)
  tar2.multilabel = c(TRUE, TRUE, FALSE, TRUE)
  pred.art.multilabel = cbind(c(TRUE, FALSE, FALSE, FALSE), c(FALSE, TRUE, FALSE, TRUE))
  data.multilabel = data.frame(var1, var2, tar1.multilabel, tar2.multilabel)
  label.names = c("tar1.multilabel", "tar2.multilabel")
  task.multilabel = makeMultilabelTask(data = data.multilabel, target = label.names)
  lrn.multilabel = makeLearner("multilabel.rFerns")
  mod.multilabel = train(lrn.multilabel, task.multilabel)
  pred.multilabel = predict(mod.multilabel, task.multilabel)
  pred.multilabel$data[, 4:5] = pred.art.multilabel

  # for survival
  time.surv = c(5, 10, 5, 10)
  status.surv = c(TRUE, FALSE, TRUE, FALSE)
  pred.art.surv = c(1, -1, 1, 1)
  data.surv = data.frame(var1, var2, time.surv, status.surv)
  tar.names = c("time.surv", "status.surv")
  task.surv = makeSurvTask(data = data.surv, target = tar.names)
  lrn.surv = makeLearner("surv.coxph")
  # coxph does not converge due to the tiny data set and warns
  suppressWarnings({
    mod.surv = train(lrn.surv, task.surv)
  })
  pred.surv = predict(mod.surv, task.surv)
  pred.surv$data[, "response"] = pred.art.surv

  # for costsensitive
  tar.costsens = factor(c("a", "b", "c", "a"))
  pred.art.costsens = factor(c("a", "b", "c", "c"))
  data.costsens = data.frame(var1, var2)
  costs = matrix(c(0, 1, 2, 1, 0, 2, 1, 2, 0, 0, 2, 1), nrow = 4L, byrow = TRUE)
  colnames(costs) = levels(tar.costsens)
  rownames(costs) = rownames(data.costsens)
  task.costsens = makeCostSensTask(data = data.costsens, costs = costs)
  lrn.costsens = makeLearner("classif.multinom", trace = FALSE)
  lrn.costsens = makeCostSensWeightedPairsWrapper(lrn.costsens)
  mod.costsens = train(lrn.costsens, task.costsens)
  pred.costsens = predict(mod.costsens, task = task.costsens)
  pred.costsens$data$response = pred.art.costsens

  # for clustering
  pred.art.cluster = c(1L, 1L, 2L, 1L)
  data.cluster = data.frame(var1, var2)
  task.cluster = makeClusterTask(data = data.cluster)
  lrn.cluster = makeLearner("cluster.EM")
  mod.cluster = train(lrn.cluster, task.cluster)
  pred.cluster = predict(mod.cluster, task.cluster)
  pred.cluster$data$response = pred.art.cluster

  # test regression measures

  # sse
  sq.errs = c(5 - 4, 10 - 11, 0 - 0, 5 - 4)^2L
  sse.test = sum(sq.errs)
  sse.perf = performance(pred.regr, measures = sse, model = mod.regr)
  expect_equal(sse.test, sse$fun(pred = pred.regr))
  expect_equal(sse.test, as.numeric(sse.perf))

  # mse
  mse.test = mean(sq.errs)
  mse.perf = performance(pred.regr, measures = mse, model = mod.regr)
  expect_equal(mse.test, mse$fun(pred = pred.regr))
  expect_equal(mse.test, as.numeric(mse.perf))

  # rmse
  rmse.test = sqrt(mse.test)
  rmse.perf = performance(pred.regr, measures = rmse, model = mod.regr)
  expect_equal(rmse.test, rmse$fun(pred = pred.regr))
  expect_equal(rmse.test, as.numeric(rmse.perf))

  # medse
  medse.test = median(sq.errs)
  medse.perf = performance(pred.regr, measures = medse, model = mod.regr)
  expect_equal(medse.test, medse$fun(pred = pred.regr))
  expect_equal(medse.test, as.numeric(medse.perf))

  # sae
  abs.errs = abs(c(5 - 4, 10 - 11, 0 - 0, 5 - 4))
  sae.test = sum(abs.errs)
  sae.perf = performance(pred.regr, measures = sae, model = mod.regr)
  expect_equal(sae.test, sae$fun(pred = pred.regr))
  expect_equal(sae.test, as.numeric(sae.perf))

  # mae
  mae.test = mean(abs.errs)
  mae.perf = performance(pred.regr, measures = mae, model = mod.regr)
  expect_equal(mae.test, mae$fun(pred = pred.regr))
  expect_equal(mae.test, as.numeric(mae.perf))

  # medae
  medae.test = median(abs.errs)
  medae.perf = performance(pred.regr, measures = medae, model = mod.regr)
  expect_equal(medae.test, medae$fun(pred = pred.regr))
  expect_equal(medae.test, as.numeric(medae.perf))

  # rsq
  rsq.test = 1 - (sse.test / sum((tar.regr - mean(tar.regr))^2L))
  rsq.perf = performance(pred.regr, measures = rsq, model = mod.regr)
  expect_equal(rsq.test, rsq$fun(pred = pred.regr))
  expect_equal(rsq.test, as.numeric(rsq.perf))
  expect_equal(
    1 - ((4 - 5)^2 + (11 - 10)^2 + (0 - 0)^2 + (4 - 5)^2) /
      ((5 - 5)^2 + (10 - 5)^2 + (0 - 5)^2 + (5 - 5)^2),
    measureRSQ(c(5, 10, 0, 5), c(4, 11, 0, 4)))
  suppressWarnings({
    expect_equal(NA_real_, measureRSQ(0, 0))
    expect_warning(measureRSQ(0, 0))
    expect_warning(measureRSQ(1, 1))
    expect_warning(measureRSQ(c(1, 1, 1, 1), c(1, 2, 3, 4)))
  })
  expect_silent(measureRSQ(c(1, 1, 1, 0), c(2, 2, 2, 2)))

  # expvar
  expvar.test = sum((pred.art.regr - mean(tar.regr))^2L) / sum((tar.regr - mean(tar.regr))^2L)
  expvar.perf = performance(pred.regr, measures = expvar, model = mod.regr)
  expect_equal(expvar.test, expvar$fun(pred = pred.regr))
  expect_equal(expvar.test, as.numeric(expvar.perf))
  expect_equal(
    sum((1 - 3)^2 + (2 - 3)^2 + (3 - 3)^2 + (4 - 3)^2 + (5 - 3)^2) /
      sum((5 - 3)^2 + (4 - 3)^2 + (3 - 3)^2 + (2 - 3)^2 + (1 - 3)^2),
    measureEXPVAR(5:1, 1:5))
  suppressWarnings({
    expect_equal(NA_real_, measureEXPVAR(0, 0))
    expect_warning(measureEXPVAR(0, 0))
    expect_warning(measureEXPVAR(c(1, 1, 1, 1), c(1, 2, 3, 4)))
  })
  expect_silent(measureEXPVAR(c(1, 1, 1, 0), c(2, 2, 2, 2)))

  # rrse
  rrse.test = sqrt(sum((pred.art.regr - tar.regr)^2L) / sum((tar.regr - mean(tar.regr))^2L))
  rrse.perf = performance(pred.regr, measures = rrse, model = mod.regr)
  expect_equal(rrse.test, rrse$fun(pred = pred.regr))
  expect_equal(rrse.test, as.numeric(rrse.perf))
  expect_equal(
    sqrt((4 - 5)^2 + (11 - 10)^2 + (0 - 0)^2 + (4 - 5)^2) /
      sqrt((5 - 5)^2 + (10 - 5)^2 + (0 - 5)^2 + (5 - 5)^2),
    measureRRSE(c(5, 10, 0, 5), c(4, 11, 0, 4)))
  suppressWarnings({
    expect_equal(NA_real_, measureRRSE(0, 0))
    expect_warning(measureRRSE(0, 0))
    expect_warning(measureRRSE(c(1, 1, 1, 1), c(1, 2, 3, 4)))
  })
  expect_silent(measureRRSE(c(1, 1, 1, 0), c(2, 2, 2, 2)))

  # rae
  rae.test = sum(abs(pred.art.regr - tar.regr)) / sum(abs(tar.regr - mean(tar.regr)))
  rae.perf = performance(pred.regr, measures = rae, model = mod.regr)
  expect_equal(rae.test, rae$fun(pred = pred.regr))
  expect_equal(rae.test, as.numeric(rae.perf))
  expect_equal(
    (abs(4 - 5) + abs(11 - 10) + abs(0 - 0) + abs(4 - 5)) /
      (abs(5 - 5) + abs(10 - 5) + abs(0 - 5) + abs(5 - 5)),
    measureRAE(c(5, 10, 0, 5), c(4, 11, 0, 4)))
  suppressWarnings({
    expect_equal(NA_real_, measureRAE(0, 0))
    expect_warning(measureRAE(0, 0))
    expect_warning(measureRAE(c(1, 1, 1, 1), c(1, 2, 3, 4)))
  })
  expect_silent(measureRAE(c(1, 1, 1, 0), c(2, 2, 2, 2)))

  # mape
  suppressWarnings({
    expect_equal(NA_real_, mape$fun(pred = pred.regr))
    expect_equal(NA_real_, measureMAPE(c(5, 10, 0, 5), c(4, 11, 0, 4)))
  })
  expect_warning(mape$fun(pred = pred.regr), regexp = "Measure is undefined if any truth value is equal to 0.")
  expect_warning(measureMAPE(c(5, 10, 0, 5), c(4, 11, 0, 4)), regexp = "Measure is undefined if any truth value is equal to 0.")

  pred.regr.mape = pred.regr
  pred.regr.mape$data$truth = c(5, 10, 1, 5) # replace the 0 truth value because mape is undefined for it
  mape.perf = performance(pred.regr.mape, measures = mape, model = mod.regr)
  mape.test = mean(c(abs((5 - 4) / 5), abs((10 - 11) / 10), abs((1 - 0) / 1), abs((5 - 4) / 5)))
  expect_equal(mape.test, mape$fun(pred = pred.regr.mape))
  expect_equal(mape.test, as.numeric(mape.perf))
  expect_equal(
    1 / 4 * (abs((4 - 5) / 5) + abs((11 - 10) / 10) + abs((0 - 2) / 2) + abs((4 - 5) / 5)),
    measureMAPE(c(5, 10, 2, 5), c(4, 11, 0, 4)))
  expect_warning(measureMAPE(0, 0))
  expect_warning(measureMAPE(c(1, 1, 1, 0), c(2, 2, 2, 2)))
  expect_silent(measureMAPE(c(1, 1, 1, 1), c(2, 2, 2, 2)))

  # msle
  msle.test = ((log(4 + 1) - log(5 + 1))^2 + (log(11 + 1) - log(10 + 1))^2 +
    (log(0 + 1) - log(0 + 1))^2 + (log(4 + 1) - log(5 + 1))^2) / 4
  msle.perf = performance(pred.regr, measures = msle, model = mod.regr)
  expect_equal(msle.test, msle$fun(pred = pred.regr))
  expect_equal(msle.test, as.numeric(msle.perf))
  # msle throws error for values smaller than -1
  pred.art.regr.neg = pred.art.regr
  pred.art.regr.neg[[1L]] = -3
  expect_error(
    measureMSLE(tar.regr, pred.art.regr.neg),
    "values must be greater or equal -1")

  # rmsle
  rmsle.test = sqrt(msle.test)
  rmsle.perf = performance(pred.regr, measures = rmsle, model = mod.regr)
  expect_equal(rmsle.test, rmsle$fun(pred = pred.regr))
  expect_equal(rmsle.test, as.numeric(rmsle.perf))

  # tau
  tau.test = 1
  tau.perf = performance(pred.regr, measures = kendalltau, model = mod.regr)
  expect_equal(tau.test, kendalltau$fun(pred = pred.regr))
  expect_equal(tau.test, as.numeric(tau.perf))

  # rho
  rho.test = 1
  rho.perf = performance(pred.regr, measures = spearmanrho, model = mod.regr)
  expect_equal(rho.test, spearmanrho$fun(pred = pred.regr))
  expect_equal(rho.test, as.numeric(rho.perf))

  # test multiclass measures

  # mmce
  mmce.test = mean(c(1L != 1L, 2L != 1L, 0L != 0L, 1L != 2L))
  mmce.perf = performance(pred.classif, measures = mmce, model = mod.classif)
  expect_equal(mmce.test, mmce$fun(pred = pred.classif))
  expect_equal(mmce.test, as.numeric(mmce.perf))

  # acc
  acc.test = mean(c(1L == 1L, 2L == 1L, 0L == 0L, 1L == 2L))
  acc.perf = performance(pred.classif, measures = acc, model = mod.classif)
  expect_equal(acc.test, acc$fun(pred = pred.classif))
  expect_equal(acc.test, as.numeric(acc.perf))

  # colAUC binary
  colauc.tab = as.matrix(table(tar.bin, pred.art.bin)) # confusion matrix
  colauc.truepos = unname(rev(cumsum(rev(colauc.tab[2, ])))) # Number of true positives
  colauc.falsepos = unname(rev(cumsum(rev(colauc.tab[1, ])))) # Number of false positives
  colauc.totpos = sum(colauc.tab[2, ]) # The total number of positives(one number)
  colauc.totneg = sum(colauc.tab[1, ]) # The total number of negatives(one number)
  colauc.sens = colauc.truepos / colauc.totpos # Sensitivity(fraction true positives)
  colauc.omspec = colauc.falsepos / colauc.totneg # 1 - specificity (false positives)
  colauc.sens = c(colauc.sens, 0) # Numbers when we classify all as 0
  colauc.omspec = c(colauc.omspec, 0) # Numbers when we classify all as 0
  colauc.height = (colauc.sens[-1] + colauc.sens[-length(colauc.sens)]) / 2
  colauc.width = -diff(colauc.omspec) # = diff(rev(colauc.omspec))
  expect_equal(sum(colauc.height * colauc.width), colAUC(as.numeric(pred.art.bin), truth = tar.bin)[[1]])

  # colAUC with "maximum = FALSE"
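  # with maximum = TRUE a raw AUC below 0.5 is flipped to 1 - AUC, so the two
  # calls below should mirror each other around 0.5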
  colauc.min = colAUC(c(1, 0, 1, 1), truth = tar.bin, maximum = FALSE)
  colauc.max = colAUC(c(1, 0, 1, 1), truth = tar.bin, maximum = TRUE)
  expect_equal(colauc.min[[1]], 0.25)
  expect_equal(colauc.min, 1 - colauc.max)

  # colAUC multiclass
  colauc.tab = as.matrix(table(tar.classif, pred.art.classif)) # confusion matrix
  tab = t(utils::combn(0:2, 2)) # all possible 1 vs. 1 combinations
  colauc2 = matrix(NA, 3, 1)
  for (i in 1:3) {
    # column and row indices of the i-th class combination
    cind = c(which(colnames(colauc.tab) == tab[i, 1]), which(colnames(colauc.tab) == tab[i, 2]))
    rind = c(which(rownames(colauc.tab) == tab[i, 1]), which(rownames(colauc.tab) == tab[i, 2]))
    colauc.tab.part = colauc.tab[rind, cind] # resulting partial matrix
    colauc.truepos = unname(rev(cumsum(rev(colauc.tab.part[2, ])))) # Number of true positives
    colauc.falsepos = unname(rev(cumsum(rev(colauc.tab.part[1, ])))) # Number of false positives
    colauc.totpos = sum(colauc.tab.part[2, ]) # The total number of positives(one number)
    colauc.totneg = sum(colauc.tab.part[1, ]) # The total number of negatives(one number)
    if (colauc.totpos > 0) {
      colauc.sens = colauc.truepos / colauc.totpos # Sensitivity(fraction true positives)
    } else {
      colauc.sens = c(1, 1)
    }
    if (colauc.totneg > 0) {
      colauc.omspec = colauc.falsepos / colauc.totneg # 1 - specificity (false positives)
    } else {
      colauc.omspec = c(1, 1)
    }
    colauc.sens = c(colauc.sens, 0) # Numbers when we classify all as 0
    colauc.omspec = c(colauc.omspec, 0) # Numbers when we classify all as 0
    colauc.height = (colauc.sens[-1] + colauc.sens[-length(colauc.sens)]) / 2
    colauc.width = -diff(colauc.omspec) # = diff(rev(colauc.omspec))
    if (sum(colauc.height * colauc.width) < 0.5) {
      colauc2[i, 1] = 1 - sum(colauc.height * colauc.width) # calculate AUC using formula for the area of a trapezoid
    } else {
      colauc2[i, 1] = sum(colauc.height * colauc.width) # calculate AUC using formula for the area of a trapezoid
    }
  }
  expect_equal(colauc2[, 1], as.numeric(colAUC(as.numeric(pred.art.classif), truth = tar.classif)[, 1]))

  # multiclass.auc
  expect_equal(
    as.numeric(performance(pred.bin, measures = list(
      multiclass.aunu,
      multiclass.aunp, multiclass.au1u, multiclass.au1p))),
    as.numeric(rep(performance(pred.bin, measures = auc), 4)))
  auc.lrn = makeLearner("classif.rpart", predict.type = "prob")
  auc.fit = train(auc.lrn, iris.task)
  auc.pred.constant = predict(auc.fit, subsetTask(iris.task, 1:50))
  suppressWarnings({
    expect_equal(
      c(multiclass.aunu = NA_real_, multiclass.aunp = NA_real_),
      performance(auc.pred.constant, list(multiclass.aunu, multiclass.aunp)))
    expect_warning(measureAUNU(
      getPredictionProbabilities(auc.pred.constant, auc.pred.constant$task.desc$class.levels),
      auc.pred.constant$data$truth))
    expect_warning(measureAUNP(
      getPredictionProbabilities(auc.pred.constant, auc.pred.constant$task.desc$class.levels),
      auc.pred.constant$data$truth))
  })

  p1 = p2 = matrix(c(0.1, 0.9, 0.2, 0.8), 2, 2, byrow = TRUE)
  colnames(p1) = c("a", "b")
  colnames(p2) = c("b", "a")
  y1 = factor(c("a", "b"))
  y2 = factor(c("b", "b"))

  # multiclass.brier
  expect_equal(measureMulticlassBrier(p1, y1), 0.5 * ((1 - 0.1)^2 + (0 - 0.9)^2 + (0 - 0.2)^2 + (1 - 0.8)^2))
  expect_equal(measureMulticlassBrier(p1, y2), 0.5 * ((0 - 0.1)^2 + (1 - 0.9)^2 + (0 - 0.2)^2 + (1 - 0.8)^2))
  expect_equal(measureMulticlassBrier(p2, y1), 0.5 * ((1 - 0.9)^2 + (0 - 0.1)^2 + (1 - 0.2)^2 + (0 - 0.8)^2))

  # logloss
  expect_equal(measureLogloss(p1, y1), -mean(log(c(0.1, 0.8))))
  expect_equal(measureLogloss(p1, y2), -mean(log(c(0.9, 0.8))))
  expect_equal(measureLogloss(p2, y1), -mean(log(c(0.9, 0.2))))

  pred.probs = getPredictionProbabilities(pred.classif)
  pred.probs[pred.probs > 1 - 1e-15] = 1 - 1e-15
  pred.probs[pred.probs < 1e-15] = 1e-15
  logloss.test = -1 * mean(log(pred.probs[model.matrix(~ . + 0, data = as.data.frame(tar.classif)) - pred.probs > 0]))
  logloss.perf = performance(pred.classif, measures = logloss, model = mod.classif)
  expect_equal(logloss.test, logloss$fun(pred = pred.classif))
  expect_equal(logloss.test, as.numeric(logloss.perf))

  # ssr
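  # spherical scoring rule: probability assigned to the true class divided by
  # the L2 norm of the probability vector, averaged over observations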
  pred.probs = getPredictionProbabilities(pred.classif)
  ssr.test = mean(vnapply(seq_row(pred.probs), function(i) {
    pred.probs[i, tar.classif[i]]
  }) / sqrt(rowSums(pred.probs^2)))
  ssr.perf = performance(pred.classif, measures = ssr, model = mod.classif)
  expect_equal(ssr.test, ssr$fun(pred = pred.classif))
  expect_equal(ssr.test, as.numeric(ssr.perf))
  expect_equal(measureSSR(p1, y1), 0.5 * (0.1 / sqrt(0.1^2 + 0.9^2) + 0.8 / sqrt(0.2^2 + 0.8^2)))
  expect_equal(measureSSR(p1, y2), 0.5 * (0.9 / sqrt(0.1^2 + 0.9^2) + 0.8 / sqrt(0.2^2 + 0.8^2)))
  expect_equal(measureSSR(p2, y1), 0.5 * (0.9 / sqrt(0.1^2 + 0.9^2) + 0.2 / sqrt(0.2^2 + 0.8^2)))
  expect_equal(measureSSR(p2[1, , drop = FALSE], y2[1]), 0.1 / sqrt(0.1^2 + 0.9^2))
  expect_equal(measureSSR(p2[1, , drop = FALSE], y1[1]), 0.9 / sqrt(0.1^2 + 0.9^2))

  # qsr
  qsr.test = 1 - mean(rowSums((pred.probs - model.matrix(~ . + 0, data = as.data.frame(tar.classif)))^2))
  qsr.perf = performance(pred.classif, measures = qsr, model = mod.classif)
  expect_equal(qsr.test, qsr$fun(pred = pred.classif))
  expect_equal(qsr.test, as.numeric(qsr.perf))
  expect_equal(measureQSR(p1, y1), 1 - 0.5 * ((1 - 0.1)^2 + (0 - 0.9)^2 + (0 - 0.2)^2 + (1 - 0.8)^2))
  expect_equal(measureQSR(p1, y2), 1 - 0.5 * ((0 - 0.1)^2 + (1 - 0.9)^2 + (0 - 0.2)^2 + (1 - 0.8)^2))
  expect_equal(measureQSR(p2, y1), 1 - 0.5 * ((1 - 0.9)^2 + (0 - 0.1)^2 + (1 - 0.2)^2 + (0 - 0.8)^2))
  expect_equal(measureQSR(p2[1, , drop = FALSE], y2[1]), 1 - (1 - 0.1)^2 - (0 - 0.9)^2)
  expect_equal(measureQSR(p2[1, , drop = FALSE], y1[1]), 1 - (1 - 0.9)^2 - (0 - 0.1)^2)

  # lsr
  lsr.test = mean(log(pred.probs[model.matrix(~ . + 0, data = as.data.frame(tar.classif)) - pred.probs > 0]))
  lsr.perf = performance(pred.classif, measures = lsr, model = mod.classif)
  expect_equal(lsr.test, lsr$fun(pred = pred.classif))
  expect_equal(lsr.test, as.numeric(lsr.perf))
  expect_equal(measureLSR(p1, y1), mean(log(c(0.1, 0.8))))
  expect_equal(measureLSR(p1, y2), mean(log(c(0.9, 0.8))))
  expect_equal(measureLSR(p2, y1), mean(log(c(0.9, 0.2))))
  expect_equal(measureLSR(p2[1, , drop = FALSE], y2[1]), log(0.1))
  expect_equal(measureLSR(p2[1, , drop = FALSE], y1[1]), log(0.9))

  # kappa
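  # Cohen's kappa: (p0 - pe) / (1 - pe), with p0 the observed agreement (2 of 4
  # predictions match) and pe the chance agreement from the marginal frequencies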
  p0 = 0.5
  pe = (0.25 * 0.25 + 0.5 * 0.5 + 0.25 * 0.25) / 1
  kappa.test = 1 - (1 - p0) / (1 - pe)
  kappa.perf = performance(pred.classif, measures = kappa, model = mod.classif)
  expect_equal(measureKAPPA(tar.classif, pred.art.classif), kappa.test)
  expect_equal(measureKAPPA(tar.classif, pred.art.classif), as.numeric(kappa.perf))

  # wkappa
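  # weighted kappa with quadratic weights w[i, j] = (i - j)^2 on the class indices:
  # 1 - sum(w * observed) / sum(w * expected)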
  conf.mat = matrix(c(1L, 0L, 0L, 0L, 1L, 1L, 0L, 1L, 0L), nrow = 3L) / 4L
  expected.mat = c(0.25, 0.5, 0.25) %*% t(c(0.25, 0.5, 0.25))
  weights = matrix(c(0, 1, 4, 1, 0, 1, 4, 1, 0), nrow = 3L)
  wkappa.test = 1 - sum(weights * conf.mat) / sum(weights * expected.mat)
  wkappa.perf = performance(pred.classif, measures = wkappa, model = mod.classif)
  expect_equal(measureWKAPPA(tar.classif, pred.art.classif), wkappa.test)
  expect_equal(measureWKAPPA(tar.classif, pred.art.classif), as.numeric(wkappa.perf))
  tar.classif2 = tar.classif
  pred.art.classif2 = pred.art.classif
  levels(tar.classif2) = as.numeric(levels(tar.classif))^2
  levels(pred.art.classif2) = as.numeric(levels(pred.art.classif))^2
  expect_equal(measureWKAPPA(tar.classif2, pred.art.classif2), wkappa.test)

  # test binaryclass measures

  # brier
  pred.probs = getPredictionProbabilities(pred.bin)
  brier.test = mean((as.numeric(tar.bin == "0") - pred.probs)^2)
  brier.perf = performance(pred.bin, measures = brier, model = mod.bin)
  expect_equal(brier.test, brier$fun(pred = pred.bin))
  expect_equal(brier.test, as.numeric(brier.perf))
  expect_equal(measureBrier(c(1, 1, 0), c("a", "a", "a"), "b", "a"), 1 / 3)
  expect_equal(measureBrier(c(1, 1, 1), c("a", "a", "a"), "b", "a"), 0)
  expect_equal(measureBrier(c(0, 0, 0), c("a", "a", "a"), "b", "a"), 1)

  # brier.scaled
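  # Brier skill score as computed here: 1 - brier / brier.max, with
  # brier.max = inc * (1 - inc)^2 + (1 - inc) * inc^2 and inc the mean predicted
  # probability of the positive class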
  inc = mean(pred.probs)
  brier.test.max = inc * (1 - inc)^2 + (1 - inc) * inc^2
  brier.scaled.test = 1 - brier.test / brier.test.max
  brier.scaled.perf = performance(pred.bin, measures = brier.scaled, model = mod.bin)
  expect_equal(brier.scaled.test, brier.scaled$fun(pred = pred.bin))
  expect_equal(brier.scaled.test, as.numeric(brier.scaled.perf))
  expect_equal(measureBrierScaled(c(1, 1, 0), c("a", "a", "a"), "b", "a"), 1 - ((1 / 3) / (2 / 3 * 1 / 3)))
  expect_equal(measureBrierScaled(c(1, 1, 1), c("a", "a", "a"), "b", "a"), 1 - ((0) / (1 * 0)))
  expect_equal(measureBrierScaled(c(0, 0, 0), c("a", "a", "a"), "b", "a"), 1 - ((1) / (0 * 1)))

  # tp
  tp.test = sum(tar.bin == pred.art.bin & pred.art.bin == 0L)
  tp.perf = performance(pred.bin, measures = tp, model = mod.bin)
  expect_equal(tp.test, tp$fun(pred = pred.bin))
  expect_equal(tp.test, as.numeric(tp.perf))

  # tn
  tn.test = sum(tar.bin == pred.art.bin & pred.art.bin == 1L)
  tn.perf = performance(pred.bin, measures = tn, model = mod.bin)
  expect_equal(tn.test, tn$fun(pred = pred.bin))
  expect_equal(tn.test, as.numeric(tn.perf))

  # fp
  fp.test = sum(tar.bin != pred.art.bin & pred.art.bin == 0L)
  fp.perf = performance(pred.bin, measures = fp, model = mod.bin)
  expect_equal(fp.test, fp$fun(pred = pred.bin))
  expect_equal(fp.test, as.numeric(fp.perf))

  # fn
  fn.test = sum(tar.bin != pred.art.bin & pred.art.bin == 1L)
  fn.perf = performance(pred.bin, measures = fn, model = mod.bin)
  expect_equal(fn.test, fn$fun(pred = pred.bin))
  expect_equal(fn.test, as.numeric(fn.perf))

  # tpr
  tpr.test = tp.test / sum(tar.bin == 0L)
  tpr.perf = performance(pred.bin, measures = tpr, model = mod.bin)
  expect_equal(tpr.test, tpr$fun(pred = pred.bin))
  expect_equal(tpr.test, as.numeric(tpr.perf))

  # tnr
  tnr.test = tn.test / sum(tar.bin == 1L)
  tnr.perf = performance(pred.bin, measures = tnr, model = mod.bin)
  expect_equal(tnr.test, tnr$fun(pred = pred.bin))
  expect_equal(tnr.test, as.numeric(tnr.perf))

  # fpr
  fpr.test = fp.test / sum(tar.bin != 0L)
  fpr.perf = performance(pred.bin, measures = fpr, model = mod.bin)
  expect_equal(fpr.test, fpr$fun(pred = pred.bin))
  expect_equal(fpr.test, as.numeric(fpr.perf))

  # fnr
  fnr.test = fn.test / sum(tar.bin != 1L)
  fnr.perf = performance(pred.bin, measures = fnr, model = mod.bin)
  expect_equal(fnr.test, fnr$fun(pred = pred.bin))
  expect_equal(fnr.test, as.numeric(fnr.perf))

  # ppv
  ppv.test = tp.test / sum(pred.art.bin == 0L)
  ppv.perf = performance(pred.bin, measures = ppv, model = mod.bin)
  expect_equal(ppv.test, ppv$fun(pred = pred.bin))
  expect_equal(ppv.test, as.numeric(ppv.perf))

  # npv
  npv.test = tn.test / sum(pred.art.bin == 1L)
  npv.perf = performance(pred.bin, measures = npv, model = mod.bin)
  expect_equal(npv.test, npv$fun(pred = pred.bin))
  expect_equal(npv.test, as.numeric(npv.perf))

  # fdr
  fdr.test = fp.test / sum(pred.art.bin == 0L)
  fdr.perf = performance(pred.bin, measures = fdr, model = mod.bin)
  expect_equal(fdr.test, fdr$fun(pred = pred.bin))
  expect_equal(fdr.test, as.numeric(fdr.perf))

  # bac
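  # balanced accuracy = (TPR + TNR) / 2; since tpr.test + fnr.test = 1 and
  # tnr.test + fpr.test = 1, the expression below reduces to exactly that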
  bac.test = 0.5 * (tpr.test / (tpr.test + fnr.test) + tnr.test /
    (tnr.test + fpr.test))
  bac.perf = performance(pred.bin, measures = bac, model = mod.bin)
  expect_equal(bac.test, bac$fun(pred = pred.bin))
  expect_equal(bac.test, as.numeric(bac.perf))

  # ber
  ber.test = 1L - bac.test
  ber.perf = performance(pred.bin, measures = ber, model = mod.bin)
  expect_equal(ber.test, ber$fun(pred = pred.bin))
  expect_equal(ber.test, as.numeric(ber.perf))

  # auc
  auc.test = (tpr.test + tnr.test) / 2L
  auc.perf = performance(pred.bin, measures = auc, model = mod.bin)
  expect_equal(auc.test, auc$fun(pred = pred.bin))
  expect_equal(auc.test, as.numeric(auc.perf))

  # mcc
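  # MCC = (TP * TN - FP * FN) / sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN))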
  mcc.test = (tp.test * tn.test - fp.test * fn.test) /
    sqrt((tp.test + fp.test) * (tp.test + fn.test) *
      (tn.test + fp.test) * (tn.test + fn.test))
  mcc.perf = performance(pred.bin, measures = mcc, model = mod.bin)
  expect_equal(mcc.test, mcc$fun(pred = pred.bin))
  expect_equal(mcc.test, as.numeric(mcc.perf))

  # f1
  f1.test = 2 * tp.test / (sum(tar.bin == 0L) + sum(pred.art.bin == 0L))
  f1.perf = performance(pred.bin, measures = f1, model = mod.bin)
  expect_equal(f1.test, f1$fun(pred = pred.bin))
  expect_equal(f1.test, as.numeric(f1.perf))

  # gmean
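  # geometric mean of TPR (recall) and TNR (specificity)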
  gmean.test = sqrt((tp.test / (tp.test + fn.test)) * tn.test / (tn.test + fp.test))
  gmean.perf = performance(pred.bin, measures = gmean, model = mod.bin)
  expect_equal(gmean.test, gmean$fun(pred = pred.bin))
  expect_equal(gmean.test, as.numeric(gmean.perf))

  # gpr
  gpr.test = sqrt(ppv.test * tpr.test)
  gpr.perf = performance(pred.bin, measures = gpr, model = mod.bin)
  expect_equal(gpr.test, gpr$fun(pred = pred.bin))
  expect_equal(gpr.test, as.numeric(gpr.perf))

  # test multilabel measures

  # create response and predictions using all possible combinations
  # bincombo = matrix(c(TRUE, FALSE, TRUE, TRUE, FALSE, TRUE, FALSE, FALSE), ncol = 2, byrow = TRUE)
  # multi.y = bincombo[rep(1:4, times = 4),]
  # multi.p = bincombo[rep(1:4, each = 4),]

  multi.y = getPredictionTruth(pred.multilabel)
  multi.p = getPredictionResponse(pred.multilabel)
  expect_equal(unname(multi.y), unname(as.matrix(getTaskTargets(task.multilabel))))
  expect_equal(pred.art.multilabel, unname(multi.p))

  # create true-false and true-true vectors used for the manual computation of measures
  tf = c(TRUE, FALSE)
  tt = c(TRUE, TRUE)

  # copied from mldr::mldr_evaluate; needed to compare against the mldr measures below
  counters = data.frame(
    RealPositives = rowSums(multi.y),
    RealNegatives = rowSums(!multi.y),
    PredictedPositives = rowSums(multi.p),
    PredictedNegatives = rowSums(!multi.p),
    TruePositives = rowSums(multi.y & multi.p),
    TrueNegatives = rowSums(!multi.y & !multi.p))

  # hamloss: how many values are not identical
  hamloss.test = mean(as.vector(multi.y) != as.vector(multi.p))
  hamloss.perf = performance(pred.multilabel, measures = multilabel.hamloss, model = mod.multilabel)
  expect_equal(hamloss.test, multilabel.hamloss$fun(pred = pred.multilabel))
  expect_equal(hamloss.test, as.numeric(hamloss.perf))
  # check best and worst value
  expect_equal(measureMultilabelHamloss(multi.y, multi.y), multilabel.hamloss$best)
  expect_equal(measureMultilabelHamloss(multi.y, !multi.y), multilabel.hamloss$worst)
  # compare with mldr
  expect_equal(mldr::hamming_loss(multi.y, multi.p), measureMultilabelHamloss(multi.y, multi.p))
  # mldr defines the accuracy as 1-hamloss
  expect_equal(mldr::accuracy(multi.y, multi.p), 1 - measureMultilabelHamloss(multi.y, multi.p))
  # manual checks
  expect_equal(measureMultilabelHamloss(matrix(tf, ncol = 2), matrix(tt, ncol = 2)), 1 / 2) # 1 of 2 values are wrong
  expect_equal(measureMultilabelHamloss(cbind(tf, tf), cbind(tf, tt)), 1 / 4) # 1 of 4 values are wrong

  # subset01: how many rows are not identical
  subset01.test = mean(rowSums(multi.y == multi.p) != ncol(multi.y))
  subset01.perf = performance(pred.multilabel, measures = multilabel.subset01, model = mod.multilabel)
  expect_equal(subset01.test, multilabel.subset01$fun(pred = pred.multilabel))
  expect_equal(subset01.test, as.numeric(subset01.perf))
  # check best and worst
  expect_equal(measureMultilabelSubset01(multi.y, multi.y), multilabel.subset01$best)
  expect_equal(measureMultilabelSubset01(multi.y, !multi.y), multilabel.subset01$worst)
  # compare with mldr: we have implemented the subset loss which is 1 - subset accuracy
  expect_equal(mldr::subset_accuracy(multi.y, multi.p), 1 - measureMultilabelSubset01(multi.y, multi.p))
  # manual checks
  expect_equal(measureMultilabelSubset01(matrix(tf, ncol = 2), matrix(tt, ncol = 2)), 1) # 1 of 1 obs is wrong
  expect_equal(measureMultilabelSubset01(cbind(tf, tf), cbind(tf, tt)), 1 / 2) # 1 of 2 obs is wrong

  # f1mult
  f1.test = vnapply(seq_row(multi.y), function(i) 2 * sum(multi.y[i, ] * multi.p[i, ]) / (sum(multi.y[i, ]) + sum(multi.p[i, ])))
  f1.test[is.na(f1.test)] = 1
  f1.test = mean(f1.test)
  f1.perf = performance(pred.multilabel, measures = multilabel.f1, model = mod.multilabel)
  expect_equal(f1.test, multilabel.f1$fun(pred = pred.multilabel))
  expect_equal(f1.test, as.numeric(f1.perf))
  # check best and worst

  expect_equal(measureMultilabelF1(multi.y, multi.y), multilabel.f1$best)
  expect_equal(measureMultilabelF1(multi.y, !multi.y), multilabel.f1$worst)

  # compare with mldr: copy-pasted older mldr version
  # mldr had a bug when RealPositives or PredictedPositives are 0 (see https://github.com/fcharte/mldr/issues/36)
  mldr.precision = counters[-3, ]$TruePositives / counters[-3, ]$PredictedPositives
  mldr.recall = counters[-3, ]$TruePositives / counters[-3, ]$RealPositives
  mldr.fmeasure = mean(mldr.precision * mldr.recall * 2 / (mldr.precision + mldr.recall), na.rm = TRUE)
  expect_equal(mldr.fmeasure, measureMultilabelF1(multi.y[-3, ], multi.p[-3, ]))

  # manual checks
  expect_equal(measureMultilabelF1(matrix(tf, ncol = 2), matrix(tt, ncol = 2)), 2 * 1 / 3) # 1 TRUE-TRUE match of 3 TRUE values
  expect_equal(measureMultilabelF1(rbind(tf, tf), rbind(tf, tt)), mean(c(2 * 1 / 2, 2 * 1 / 3))) # 1 TRUE-TRUE match of 2 and 3 TRUE values per obs

  # accmult
  acc.test = vnapply(seq_row(multi.y), function(i) sum(multi.y[i, ] & multi.p[i, ]) / (sum(multi.y[i, ] | multi.p[i, ])))
  acc.test[is.na(acc.test)] = 1
  acc.test = mean(acc.test)
  acc.perf = performance(pred.multilabel, measures = multilabel.acc, model = mod.multilabel)
  expect_equal(acc.test, multilabel.acc$fun(pred = pred.multilabel))
  expect_equal(acc.test, as.numeric(acc.perf))
  # check best and worst
  expect_equal(measureMultilabelACC(multi.y, multi.y), multilabel.acc$best)
  expect_equal(measureMultilabelACC(multi.y, !multi.y), multilabel.acc$worst)
  # compare with mldr: jaccard index is not implemented in mldr see https://github.com/fcharte/mldr/issues/28
  # manual checks
  expect_equal(measureMultilabelACC(matrix(tf, ncol = 2), matrix(tt, ncol = 2)), 1 / 2)
  expect_equal(measureMultilabelACC(rbind(tf, tf), rbind(tf, tt)), mean(c(1, 1 / 2)))

  # ppvmult
  ppv.test = vnapply(seq_row(multi.y), function(i) sum(multi.y[i, ] & multi.p[i, ]) / (sum(multi.p[i, ])))
  ppv.test = mean(ppv.test, na.rm = TRUE)
  ppv.perf = performance(pred.multilabel, measures = multilabel.ppv, model = mod.multilabel)
  expect_equal(ppv.test, multilabel.ppv$fun(pred = pred.multilabel))
  expect_equal(ppv.test, as.numeric(ppv.perf))
  # check best and worst
  expect_equal(measureMultilabelPPV(multi.y, multi.y), multilabel.ppv$best)
  expect_equal(measureMultilabelPPV(multi.y, !multi.y), multilabel.ppv$worst)
  # compare with mldr
  expect_equal(mldr::precision(multi.y, multi.p), measureMultilabelPPV(multi.y, multi.p))
  # manual checks
  expect_equal(measureMultilabelPPV(matrix(tf, ncol = 2), matrix(tt, ncol = 2)), 1 / 2)
  expect_equal(measureMultilabelPPV(rbind(tf, tf), rbind(tf, tt)), mean(c(1 / 1, 1 / 2)))

  # tprmult
  tpr.test = vnapply(seq_row(multi.y), function(i) sum(multi.y[i, ] & multi.p[i, ]) / (sum(multi.y[i, ])))
  tpr.test = mean(tpr.test, na.rm = TRUE)
  tpr.perf = performance(pred.multilabel, measures = multilabel.tpr, model = mod.multilabel)
  expect_equal(tpr.test, multilabel.tpr$fun(pred = pred.multilabel))
  expect_equal(tpr.test, as.numeric(tpr.perf))
  # check best and worst
  expect_equal(measureMultilabelTPR(multi.y, multi.y), multilabel.tpr$best)
  expect_equal(measureMultilabelTPR(multi.y, !multi.y), multilabel.tpr$worst)
  # compare with mldr (it throws a warning when recall() would divide by zero)
  expect_equal(
    suppressWarnings(mldr::recall(multi.y, multi.p, undefined_value = "ignore")),
    measureMultilabelTPR(multi.y, multi.p))
  # manual checks
  expect_equal(measureMultilabelTPR(matrix(tf, ncol = 2), matrix(tt, ncol = 2)), 1 / 1)
  expect_equal(measureMultilabelTPR(rbind(tf, tf), rbind(tf, tt)), mean(c(1 / 1, 1 / 1)))

  # test survival measures

  # cindex
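  # concordance index computed by hand: over all comparable (event, censored)
  # pairs, count pairs where the predicted risk ranks the event observation
  # higher; ties in the prediction count as 0.5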
  pos = pred.surv$data[pred.surv$data$truth.event == TRUE, "response"]
  neg = pred.surv$data[pred.surv$data$truth.event == FALSE, "response"]
  cons = c(ifelse(pos[1L] > neg, 1L, 0L), ifelse(pos[2L] > neg, 1L, 0L))
  ties = c(ifelse(pos[1L] == neg, 0.5, 0), ifelse(pos[2L] == neg, 0.5, 0))
  n.pairs = length(pos) * length(neg)
  cindex.test = sum(c(cons, ties)) / n.pairs
  cindex.perf = performance(pred.surv, measures = cindex, model = mod.surv)
  expect_equal(cindex.test, cindex$fun(pred = pred.surv))
  expect_equal(cindex.test, as.numeric(cindex.perf))

  # test cost-sensitive measures

  # meancosts
  meancosts.test = (0 + 0 + 0 + 1) / 4L
  meancosts.perf = performance(pred.costsens,
    measures = meancosts,
    model = mod.costsens, task = task.costsens)
  expect_equal(
    meancosts.test,
    meancosts$fun(pred = pred.costsens, task = task.costsens))
  expect_equal(meancosts.test, as.numeric(meancosts.perf))
  # mcp
  mcp.test = meancosts.test - 0
  mcp.perf = performance(pred.costsens,
    measures = mcp,
    task = task.costsens, model = mod.costsens)
  expect_equal(mcp.test, mcp$fun(pred = pred.costsens, task = task.costsens))
  expect_equal(mcp.test, as.numeric(mcp.perf))

  # test clustering

  # db
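  # Davies-Bouldin index computed by hand for the artificial assignment:
  # cluster 2 is a singleton with zero scatter, so the index reduces to the
  # scatter of cluster 1 divided by the distance between the two centroids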
  c2 = c(3, 1)
  c1 = c((1 + 2 + 4) / 3, (3 + 4 + 2) / 3)
  s1 = sqrt((sum((data.cluster[1, ] - c1)^2) + sum((data.cluster[2, ] - c1)^2) +
    sum((data.cluster[4, ] - c1)^2)) / 3L)
  M = sqrt(sum((c2 - c1)^2))
  db.test = s1 / M
  db.perf = performance(pred.cluster,
    measures = db,
    model = mod.cluster, feats = data.cluster)
  expect_equal(db.test, db$fun(
    task = task.cluster,
    pred = pred.cluster, feats = data.cluster))
  expect_equal(db.test, as.numeric(db.perf))

  # g1 index
  exsum = sqrt(sum((c(1, 3) - c(3, 1))^2)) + sqrt(sum((c(2, 4) - c(3, 1))^2)) +
    sqrt(sum((c(4, 3) - c(3, 2))^2))
  insum = sqrt(sum((c(1, 3) - c(2, 4))^2)) + sqrt(sum((c(1, 3) - c(4, 2))^2)) +
    sqrt(sum((c(2, 4) - c(4, 2))^2))
  g1.test = exsum / insum
  g1.perf = performance(pred.cluster,
    measures = G1,
    model = mod.cluster, feats = data.cluster)
  expect_equal(g1.test, G1$fun(pred = pred.cluster, feats = data.cluster))
  expect_equal(g1.test, as.numeric(g1.perf))

  # g2 index
  dists = as.matrix(dist(data.cluster, method = "euclidean"))
  c2.dists = as.vector(dists[, 3L])
  c2.dists = c2.dists[c2.dists != 0L]
  c1.dists = unique(as.vector(dists[-3L, -3L]))
  c1.dists = c1.dists[c1.dists != 0L]
  con.pairs = vapply(
    c1.dists, function(x) x < c2.dists,
    logical(length = length(c2.dists)))
  con.pairs = sum(rowSums(con.pairs))
  dis.pairs = vapply(
    c2.dists, function(x) x < c1.dists,
    logical(length = length(c1.dists)))
  dis.pairs = sum(rowSums(dis.pairs))
  g2.test = (con.pairs - dis.pairs) / (con.pairs + dis.pairs)
  g2.perf = performance(pred.cluster,
    measures = G2,
    model = mod.cluster, feats = data.cluster)
  expect_equal(g2.test, G2$fun(pred = pred.cluster, feats = data.cluster))
  expect_equal(g2.test, as.numeric(g2.perf))

  # silhouette
  dists = as.matrix(clusterSim::dist.GDM(data.cluster))
  ais = replace(dists, dists == 0, NA)[-3L, -3L]
  ais = apply(ais, MARGIN = 2L, mean, na.rm = TRUE)
  bis = dists[-3L, 3L]
  sil.data = data.frame(t(rbind(ais, bis)))
  sils = (sil.data$bis - sil.data$ais) / pmax(sil.data$bis, sil.data$ais)
  silhouette.test = sum(sils) / nrow(data.cluster)
  silhouette.perf = performance(pred.cluster,
    measures = silhouette,
    model = mod.cluster, feats = data.cluster)
  expect_equal(silhouette.test, silhouette$fun(pred = pred.cluster, feats = data.cluster))
  expect_equal(silhouette.test, as.numeric(silhouette.perf))

  # test that some measures are only transformations of each other

  # qsr is identical to the 1 - multiclass brier
  expect_equal(1 - measureMulticlassBrier(p1, y1), measureQSR(p1, y1))
  qsr.bin.perf = performance(pred.bin, measures = qsr, model = mod.bin)
  expect_equal(1 - 2 * brier.perf, qsr.bin.perf, ignore_attr = "names")

  expect_equal(lsr.perf, -1 * logloss.perf, ignore_attr = "names")

  # multiclass brier for a two class problem should be two times the binary brier score.
  multiclass.brier.twoclass.perf = performance(pred.bin, measures = multiclass.brier, model = mod.bin)
  expect_equal(2 * brier.perf, multiclass.brier.twoclass.perf, ignore_attr = "names")
})

test_that("getDefaultMeasure", {
  expect_equal(mmce, getDefaultMeasure(iris.task))
  expect_equal(mmce, getDefaultMeasure(getTaskDesc(iris.task)))
  expect_equal(mmce, getDefaultMeasure(makeLearner("classif.rpart")))
  expect_equal(mmce, getDefaultMeasure("classif.rpart"))
  expect_equal(mmce, getDefaultMeasure("classif"))
})

test_that("measure properties", {
  # hasMeasureProps yields correct properties
  expect_true(all(vlapply(
    listMeasures(create = TRUE),
    function(m) {
      res = hasMeasureProperties(m, m$properties)
      all(res) & length(res) > 0
    }
  )))
  props = listMeasureProperties()
  # all props exist in mlr$measure.properties
  expect_true(all(vlapply(
    listMeasures(create = TRUE),
    function(m) {
      res = all(getMeasureProperties(m) %in% props)
      all(res) & length(res) > 0
    }
  )))
})

test_that("measures ppv denominator 0", {
  set.seed(1)
  task = sonar.task
  lrn = makeLearner("classif.rpart", predict.type = "prob")
  r = holdout(lrn, task)
  d = generateThreshVsPerfData(r, measures = list(tpr, ppv), gridsize = 5)
  expect_equal(length(which(is.na(d$data))), 0)

  lrns = list(
    makeLearner("classif.randomForest", predict.type = "prob"),
    makeLearner("classif.rpart", predict.type = "prob"))
  tasks = list(bc.task, sonar.task)
  rdesc = makeResampleDesc("CV", iters = 2L)
  meas = list(acc, ber)
  bmrk = benchmark(lrns, tasks, rdesc, measures = meas)
  pr = generateThreshVsPerfData(bmrk, measures = list(tpr, ppv))
  expect_equal(length(which(is.na(pr$data))), 0)
})

test_that("measures MCC denominator 0 (#1736)", {
  res = measureMCC(c(TRUE, TRUE, TRUE), c(TRUE, TRUE, TRUE), TRUE, FALSE)
  expect_equal(res, 0)
})

test_that("setMeasurePars", {
  mm = mmce
  expect_list(mm$extra.args, len = 0L, names = "named")
  mm = setMeasurePars(mm, foo = 1, bar = 2)
  expect_list(mm$extra.args, len = 2L, names = "named")
  expect_equal(mm$extra.args, list(foo = 1, bar = 2))
  expect_list(mmce$extra.args, len = 0L, names = "named") # the global mmce object stays untouched

  mm = setMeasurePars(mmce, foo = 1, bar = 2, par.vals = list(foobar = 99))
  expect_equal(mm$extra.args, list(foobar = 99, foo = 1, bar = 2))

  # re-setting parameters to NULL
  mm = setMeasurePars(mmce, foo = 1, bar = 2)
  expect_list(mm$extra.args, len = 2L, names = "named")
  mm = setMeasurePars(mm, foo = NULL, bar = 2)
  expect_equal(mm$extra.args, list(foo = NULL, bar = 2))

  # precedence of ... over par.vals
  mm = setMeasurePars(mmce, foo = 1, par.vals = list(foo = 2))
  expect_equal(mm$extra.args, list(foo = 1))
})

test_that("bac works as intended with multiclass tasks (#1834)", {
  var1 = c(1, 2, 3, 4)
  var2 = c(3, 4, 1, 2)
  tar.classif = factor(c(1L, 2L, 0L, 1L))
  pred.art.classif = factor(c(1L, 1L, 0L, 2L))
  data.classif = data.frame(var1, var2, tar.classif)
  task.classif = makeClassifTask(data = data.classif, target = "tar.classif")
  lrn.classif = makeLearner("classif.rpart", predict.type = "prob")
  mod.classif = train(lrn.classif, task.classif)
  pred.classif = predict(mod.classif, task.classif)

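  # for multiclass tasks bac is the mean per-class recall, i.e. the diagonal of
  # the confusion matrix divided by the per-class truth counts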
  bac.test = mean(diag(table(pred.classif$data$truth, pred.classif$data$response) /
    table(pred.classif$data$truth, pred.classif$data$truth)))
  bac.perf = performance(pred.classif, measures = bac, model = mod.classif)
  expect_equal(bac.test, bac$fun(pred = pred.classif))
  expect_equal(bac.test, as.numeric(bac.perf))
})

test_that("new bac gives the same result as old implementation", {
  lrn = makeLearner("classif.rpart")
  task = binaryclass.task
  mod = train(lrn, task = task)
  pred = predict(mod, task = task)
  perf = performance(pred, measures = bac)

  old.bac = mean(c(
    tp$fun(pred = pred) / sum(pred$data$truth == pred$task.desc$positive),
    tn$fun(pred = pred) / sum(pred$data$truth == pred$task.desc$negative)))

  expect_equal(old.bac, perf, ignore_attr = TRUE)
})