context("Evaluation methods")
set.seed(1234)
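# Hold out part of toyml and train a Binary Relevance (br) model with an SVM
# base learner; the resulting prediction is reused by the tests below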
parts <- create_holdout_partition(toyml)
result <- predict(br(parts$train, "SVM"), parts$test)
test_that("Multi-label confusion matrix", {
mlconfmat <- multilabel_confusion_matrix(parts$test, result)
expected <- parts$test$dataset[,parts$test$labels$index]
predicted <- as.bipartition(result)
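# The confusion matrix exposes Z (predicted bipartitions), Y (true labels),
# R (label rankings), the TP/TN/FP/FN matrices and their per-instance (*i)
# and per-label (*l) aggregations, as asserted below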
expect_equal(dim(mlconfmat$Z), dim(result))
expect_equal(dim(mlconfmat$Y), dim(result))
expect_equal(dim(mlconfmat$R), dim(result))
expect_equal(mlconfmat$Z, predicted)
expect_equal(mlconfmat$Y, expected)
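# toyml has five labels, so every rank in R must fall between 1 and 5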
expect_true(all(apply(mlconfmat$R, 1, function (row) row %in% 1:5)))
expect_equal(dim(mlconfmat$TP), dim(result))
expect_equal(dim(mlconfmat$TN), dim(result))
expect_equal(dim(mlconfmat$FP), dim(result))
expect_equal(dim(mlconfmat$FN), dim(result))
expect_equal(length(mlconfmat$Zi), parts$test$measures$num.instances)
expect_equal(length(mlconfmat$Yi), parts$test$measures$num.instances)
expect_equal(length(mlconfmat$Zl), parts$test$measures$num.labels)
expect_equal(length(mlconfmat$Yl), parts$test$measures$num.labels)
expect_equal(mlconfmat$Yi, apply(expected, 1, sum))
expect_equal(mlconfmat$Zi, apply(predicted, 1, sum))
expect_equal(mlconfmat$Yl, apply(expected, 2, sum))
expect_equal(mlconfmat$Zl, apply(predicted, 2, sum))
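# per instance, TP + TN + FP + FN must add up to the number of labels (5)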
totals <- mlconfmat$TPi + mlconfmat$TNi + mlconfmat$FPi + mlconfmat$FNi
expect_true(all(totals == 5))
expect_equal(mlconfmat$TPi, apply(expected & predicted, 1, sum))
expect_equal(mlconfmat$TNi, apply(!expected & !predicted, 1, sum))
expect_equal(mlconfmat$FPi, apply(!expected & predicted, 1, sum))
expect_equal(mlconfmat$FNi, apply(expected & !predicted, 1, sum))
expect_equal(mlconfmat$TPl, apply(expected & predicted, 2, sum))
expect_equal(mlconfmat$TNl, apply(!expected & !predicted, 2, sum))
expect_equal(mlconfmat$FPl, apply(!expected & predicted, 2, sum))
expect_equal(mlconfmat$FNl, apply(expected & !predicted, 2, sum))
expect_error(multilabel_confusion_matrix(parts$train, result))
})
test_that("Bipartition measures", {
labels <- as.matrix(parts$test$dataset[, parts$test$labels$index])
expected <- parts$test$dataset[, parts$test$labels$index]
#100% correct
test.result <- multilabel_prediction(labels, labels, TRUE)
mlconfmat <- multilabel_confusion_matrix(parts$test, test.result)
expect_equal(utiml_measure_accuracy(mlconfmat), 1)
expect_equal(utiml_measure_f1(mlconfmat), 1)
expect_equal(utiml_measure_subset_accuracy(mlconfmat), 1)
expect_equal(utiml_measure_precision(mlconfmat), 1)
expect_equal(utiml_measure_recall(mlconfmat), 1)
expect_equal(utiml_measure_hamming_loss(mlconfmat), 0)
expect_equal(sum(utiml_measure_macro_AUC(mlconfmat)), 5)
expect_equal(utiml_measure_micro_AUC(mlconfmat), 1)
expect_equal(sum(utiml_measure_macro_precision(mlconfmat)), 5)
expect_equal(utiml_measure_micro_precision(mlconfmat), 1)
expect_equal(sum(utiml_measure_macro_recall(mlconfmat)), 5)
expect_equal(utiml_measure_micro_recall(mlconfmat), 1)
expect_equal(sum(utiml_measure_macro_f1(mlconfmat)), 5)
expect_equal(utiml_measure_micro_f1(mlconfmat), 1)
#100% incorrect
for (i in seq(ncol(labels))) {
pos <- labels[, i] == 1
neg <- !pos
labels[pos, i] <- 0
labels[neg, i] <- 1
}
test.result <- multilabel_prediction(labels, labels, TRUE)
mlconfmat <- multilabel_confusion_matrix(parts$test, test.result)
expect_equal(utiml_measure_accuracy(mlconfmat), 0)
expect_equal(utiml_measure_f1(mlconfmat), 0)
expect_equal(utiml_measure_subset_accuracy(mlconfmat), 0)
expect_equal(utiml_measure_precision(mlconfmat), 0)
expect_equal(utiml_measure_recall(mlconfmat), 0)
expect_equal(utiml_measure_hamming_loss(mlconfmat), 1)
expect_equal(mean(utiml_measure_macro_AUC(mlconfmat)), 0)
expect_equal(utiml_measure_micro_AUC(mlconfmat), 0)
expect_equal(mean(utiml_measure_macro_precision(mlconfmat)), 0)
expect_equal(utiml_measure_micro_precision(mlconfmat), 0)
expect_equal(mean(utiml_measure_macro_recall(mlconfmat)), 0)
expect_equal(utiml_measure_micro_recall(mlconfmat), 0)
expect_equal(mean(utiml_measure_macro_f1(mlconfmat)), 0)
expect_equal(utiml_measure_micro_f1(mlconfmat), 0)
#Random
set.seed(1234)
for (i in seq(ncol(labels))) {
labels[, i] <- utiml_normalize(rnorm(nrow(labels)))
}
labels <- fixed_threshold(labels, 0.5)
test.result <- multilabel_prediction(labels, labels, TRUE)
mlconfmat <- multilabel_confusion_matrix(parts$test, test.result)
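# Recompute the measures by hand: example-based measures average an
# instance-wise formula (e.g. accuracy_i = |Yi ∩ Zi| / |Yi ∪ Zi|), macro
# measures average a per-label formula, and micro measures pool the label
# counts before applying the formula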
measures <- list(
Accuracy = mean(rowSums(expected & labels) / rowSums(expected | labels)),
FMeasure = mean(2 * rowSums(expected & labels) /
(rowSums(expected) + rowSums(labels))),
SubsetAccuracy = mean(rowSums(expected == labels) == ncol(labels)),
Precision = mean(rowSums(expected & labels) / rowSums(labels)),
Recall = mean(rowSums(expected & labels) / rowSums(expected)),
HammingLoss = mean(unlist(lapply(seq(nrow(labels)), function (i) {
sum(xor(labels[i,], expected[i,])) / ncol(labels)
}))),
MacroPrecision = mean(
colSums(labels == 1 & expected == 1) / colSums(labels == 1)
),
MicroPrecision = sum(colSums(labels == 1 & expected == 1)) /
sum(colSums(labels == 1)),
MacroRecall = mean(
colSums(labels == 1 & expected == 1) / colSums(expected == 1)
),
MicroRecall = sum(colSums(labels == 1 & expected == 1)) /
sum(colSums(expected == 1)),
MacroFMeasure = (function (){
prec <- colSums(labels == 1 & expected == 1) / colSums(labels == 1)
rec <- colSums(labels == 1 & expected == 1) / colSums(expected == 1)
f1 <- 2 * prec * rec / (prec + rec)
mean(ifelse(is.na(f1), 0, f1))
})(),
MicroFMeasure = (function (){
prec <- sum(colSums(labels == 1 & expected == 1)) /
sum(colSums(labels == 1))
rec <- sum(colSums(labels == 1 & expected == 1)) /
sum(colSums(expected == 1))
2 * prec * rec / (prec + rec)
})()
)
expect_equal(utiml_measure_accuracy(mlconfmat), measures$Accuracy)
expect_equal(utiml_measure_f1(mlconfmat), measures$FMeasure)
expect_equal(utiml_measure_subset_accuracy(mlconfmat),
measures$SubsetAccuracy)
expect_equal(utiml_measure_precision(mlconfmat), measures$Precision)
expect_equal(utiml_measure_recall(mlconfmat), measures$Recall)
expect_equal(utiml_measure_hamming_loss(mlconfmat), measures$HammingLoss)
expect_equal(mean(utiml_measure_macro_precision(mlconfmat)),
measures$MacroPrecision)
expect_equal(utiml_measure_micro_precision(mlconfmat),
measures$MicroPrecision)
expect_equal(mean(utiml_measure_macro_recall(mlconfmat)), measures$MacroRecall)
expect_equal(utiml_measure_micro_recall(mlconfmat), measures$MicroRecall)
expect_equal(mean(utiml_measure_macro_f1(mlconfmat)), measures$MacroFMeasure)
expect_equal(utiml_measure_micro_f1(mlconfmat), measures$MicroFMeasure)
})
test_that("Ranking measures", {
labels <- as.matrix(parts$test$dataset[, parts$test$labels$index])
expected <- parts$test$dataset[, parts$test$labels$index]
#100% correct
test.result <- multilabel_prediction(labels, labels, TRUE)
mlconfmat <- multilabel_confusion_matrix(parts$test, test.result)
expect_equal(utiml_measure_one_error(mlconfmat), 0)
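# a perfect ranking covers all relevant labels after cardinality ranks,
# so coverage equals the label cardinality minus one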
expect_equal(utiml_measure_coverage(mlconfmat),
parts$test$measures$cardinality - 1)
expect_equal(utiml_measure_ranking_loss(mlconfmat), 0)
expect_equal(utiml_measure_average_precision(mlconfmat), 1)
expect_equal(utiml_measure_margin_loss(mlconfmat), 0)
expect_equal(utiml_measure_is_error(mlconfmat, mlconfmat$R), 0)
#100% incorrect
for (i in seq(ncol(labels))) {
pos <- labels[, i] == 1
neg <- !pos
labels[pos, i] <- 0
labels[neg, i] <- 1
}
test.result <- multilabel_prediction(labels, labels, TRUE)
mlconfmat <- multilabel_confusion_matrix(parts$test, test.result)
expect_equal(utiml_measure_one_error(mlconfmat), 1)
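# with every label inverted the relevant labels end up ranked last, so
# coverage reaches its maximum of num.labels - 1 = 4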
expect_equal(utiml_measure_coverage(mlconfmat), 4)
expect_equal(utiml_measure_ranking_loss(mlconfmat), 1)
#TODO determine the theoretical worst-case average precision instead of
#recomputing it below
average.precision <- mean(sapply(seq(nrow(labels)), function (row) {
Y <- mlconfmat$R[row, expected[row, ] == 1]
sum(sapply(Y, function (y){
sum(Y <= y) / y
})) / length(Y)
}))
expect_equal(utiml_measure_average_precision(mlconfmat), average.precision)
expect_equal(utiml_measure_margin_loss(mlconfmat), 4)
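# a reference ranking that disagrees with R, built by reversing its columns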
dif.rank <- mlconfmat$R[, 5:1]
colnames(dif.rank) <- colnames(mlconfmat$R)
expect_equal(utiml_measure_is_error(mlconfmat, dif.rank), 1)
#Random
set.seed(1234)
for (i in seq(ncol(labels))) {
labels[, i] <- utiml_normalize(rnorm(nrow(labels)))
}
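# derive the rank matrix from the random scores (rank 1 = highest score)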
ranking <- labels
idxRk <- t(apply(labels, 1, order, decreasing = TRUE))
for (row in seq(nrow(labels))) {
ranking[row, idxRk[row,]] <- 1:5
}
bipartition <- fixed_threshold(labels, 0.5)
test.result <- multilabel_prediction(bipartition, labels, TRUE)
mlconfmat <- multilabel_confusion_matrix(parts$test, test.result)
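# Recompute the ranking measures by hand: one-error looks at the top-ranked
# label, coverage at the worst rank of a relevant label, ranking-loss counts
# reversed relevant/irrelevant pairs, and margin-loss measures the gap between
# the worst relevant and the best irrelevant rank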
measures <- list(
OneError = mean(sapply(seq(nrow(labels)), function (row) {
ifelse(expected[row, which.min(ranking[row, ])], 0, 1)
})),
Coverage = mean(sapply(seq(nrow(labels)), function (row) {
max(ranking[row, expected[row,] == 1]) - 1
})),
RankingLoss = mean(sapply(seq(nrow(labels)), function (row) {
idxY <- expected[row, ] == 1
Y1 <- ranking[row, idxY]
Y2 <- ranking[row, !idxY]
sum(sapply(Y1, function (y) sum(y > Y2))) / (length(Y1) * length(Y2))
})),
AvgPrecision = mean(sapply(seq(nrow(labels)), function (row) {
Y <- ranking[row, expected[row, ] == 1]
sum(sapply(Y, function (y){
sum(Y <= y) / y
})) / length(Y)
})),
MarginLoss = mean(sapply(seq(nrow(labels)), function (row) {
idxY <- expected[row, ] == 1
max(0, max(ranking[row, idxY]) - min(ranking[row, !idxY]))
})),
IsError = mean(rowSums(abs(ranking - dif.rank)) > 0)
)
expect_equal(utiml_measure_one_error(mlconfmat), measures$OneError)
expect_equal(utiml_measure_coverage(mlconfmat), measures$Coverage)
expect_equal(utiml_measure_ranking_loss(mlconfmat), measures$RankingLoss)
expect_equal(utiml_measure_average_precision(mlconfmat),
measures$AvgPrecision)
expect_equal(utiml_measure_margin_loss(mlconfmat), measures$MarginLoss)
expect_equal(utiml_measure_is_error(mlconfmat, dif.rank), measures$IsError)
})
test_that("Label-problem measures", {
labels <- as.matrix(parts$test$dataset[, parts$test$labels$index])
expected <- parts$test$dataset[, parts$test$labels$index]
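# Label-problem measures: clp (constant label problem), mlp (missing label
# prediction) and wlp (wrong label prediction) give the proportion of labels
# predicted as a constant, never predicted as positive, and never predicted
# correctly, respectively (as exercised by the blocks below)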
#Constant labels
expect_equal(c(clp=0), multilabel_evaluate(parts$test, labels, "clp"))
lapply(seq(ncol(labels)), function(i){
labels[,seq(i)] <- 1
expect_equal(c(clp=i/ncol(labels)), multilabel_evaluate(parts$test, labels, "clp"))
})
#Missing labels
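# empty predictions must be allowed here, otherwise rows left with no positive
# label would be adjusted before evaluation (assumes utiml.empty.prediction
# controls this behaviour)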
options(utiml.empty.prediction = TRUE)
expect_equal(c(mlp=0), multilabel_evaluate(parts$test, labels, "mlp"))
expect_equal(c(wlp=0), multilabel_evaluate(parts$test, labels, "wlp"))
lapply(seq(ncol(labels)), function(i){
labels[,seq(i)] <- 0
expect_equal(c(mlp=i/ncol(labels)), multilabel_evaluate(parts$test, labels, "mlp"))
expect_equal(c(wlp=i/ncol(labels)), multilabel_evaluate(parts$test, labels, "wlp"))
})
#Wrong labels
lapply(seq(ncol(labels)), function(i){
for (j in seq(i)) {
labels[,j] <- ifelse(expected[,j] == 1, 0, 1)
}
expect_equal(c(mlp=0), multilabel_evaluate(parts$test, labels, "mlp"))
expect_equal(c(wlp=i/ncol(labels)), multilabel_evaluate(parts$test, labels, "wlp"))
})
options(utiml.empty.prediction = FALSE)
})
test_that("Measures names", {
expect_equal(utiml_measure_names("abc"), c("abc"))
rankings <- c("average-precision", "coverage", "margin-loss", "one-error",
"ranking-loss")
expect_equal(utiml_measure_names("ranking"), rankings)
expect_equal(utiml_measure_names(c("ranking", "ranking")), rankings)
expect_equal(utiml_measure_names(c("ranking", "xyz")),
c(rankings, "xyz"))
macro <- c("macro-AUC", "macro-F1", "macro-precision", "macro-recall")
micro <- c("micro-AUC", "micro-F1", "micro-precision", "micro-recall")
expect_equal(utiml_measure_names("macro-based"), macro)
expect_equal(utiml_measure_names("micro-based"), micro)
expect_equal(utiml_measure_names("label-based"), sort(c(micro, macro)))
example <- sort(c("accuracy", "F1", "hamming-loss", "precision", "recall",
"subset-accuracy"))
expect_equal(utiml_measure_names("example-based"), example)
label <- c("clp", "mlp", "wlp")
expect_equal(utiml_measure_names("label-problem"), label)
expect_equal(utiml_measure_names(),
sort(c(rankings, example, macro, micro, label)))
})
test_that("Evaluate", {
expect_equal(length(multilabel_evaluate(parts$test, result, "accuracy")), 1)
measures <- multilabel_evaluate(parts$test, result, "example-based")
expect_equal(length(measures), 6)
expect_true(all(measures >= 0 & measures <= 1))
expect_named(measures, utiml_measure_names("example-based"))
mlconfmat <- multilabel_confusion_matrix(parts$test, result)
expect_equal(measures, multilabel_evaluate(mlconfmat, "example-based"))
measures <- multilabel_evaluate(mlconfmat, c("hamming-loss", "macro-accuracy",
"micro-accuracy"))
expect_equal(length(measures), 3)
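# label-wise accuracy gives the same value whether averaged per label (macro)
# or pooled over all labels (micro), and it is the complement of hamming loss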
expect_true(measures["macro-accuracy"] == measures["micro-accuracy"])
expect_true(measures["macro-accuracy"] + measures["hamming-loss"] == 1)
expect_equal(multilabel_evaluate(parts$test, result),
multilabel_evaluate(parts$test, as.matrix(result)))
expect_error(multilabel_evaluate(parts$test))
expect_error(multilabel_evaluate(parts$test, result, "mymeasure"))
})
test_that("Mulan Measures", {
expected <- read.csv("../testfiles/flags-expected.csv")
bipartition <- read.csv("../testfiles/flags-bipartition.csv")
probability <- read.csv("../testfiles/flags-scores.csv")
measures <- read.csv("../testfiles/flags-measures.csv")
mnames <- measures[, 1]
measures <- measures[, 2]
names(measures) <- mnames
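# Build an mldr object from the expected labels (plus a dummy attribute) and
# compare utiml's results against the reference values loaded from
# flags-measures.csv (presumably computed with Mulan)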
dataset <- cbind(attr1=rep(1, nrow(expected)), expected)
indexes <- seq(ncol(expected)) + 1
mdata <- mldr::mldr_from_dataframe(dataset, indexes, name="flags")
mlresult <- multilabel_prediction(bipartition, probability, TRUE)
evaluation <- multilabel_evaluate(mdata, mlresult)
expect_equal(evaluation["accuracy"], measures["Accuracy"],
check.attributes = FALSE, tolerance = 1e-3)
expect_equal(evaluation["hamming-loss"], measures["Hamming-Loss"],
check.attributes = FALSE, tolerance = 1e-3)
expect_equal(evaluation["F1"], measures["F-Measure"],
check.attributes = FALSE, tolerance = 1e-3)
expect_equal(evaluation["precision"], measures["Precision"],
check.attributes = FALSE, tolerance = 1e-3)
expect_equal(evaluation["recall"], measures["Recall"],
check.attributes = FALSE, tolerance = 1e-3)
expect_equal(evaluation["subset-accuracy"], measures["Subset-Accuracy"],
check.attributes = FALSE, tolerance = 1e-3)
#expect_equal(evaluation["macro-AUC"], measures["Macro-AUC"],
# check.attributes = FALSE, tolerance = 1e-3)
expect_equal(evaluation["macro-recall"], measures["Macro-Recall"],
check.attributes = FALSE, tolerance = 1e-3)
expect_equal(evaluation["micro-F1"], measures["Micro-F-Measure"],
check.attributes = FALSE, tolerance = 1e-3)
expect_equal(evaluation["macro-F1"], measures["Macro-F-Measure"],
check.attributes = FALSE, tolerance = 1e-3)
#expect_equal(evaluation["micro-AUC"], measures["Micro-AUC"],
# check.attributes = FALSE, tolerance = 1e-3)
expect_equal(evaluation["micro-precision"], measures["Micro-Precision"],
check.attributes = FALSE, tolerance = 1e-3)
expect_equal(evaluation["macro-precision"], measures["Macro-Precision"],
check.attributes = FALSE, tolerance = 1e-3)
expect_equal(evaluation["micro-recall"], measures["Micro-Recall"],
check.attributes = FALSE, tolerance = 1e-3)
expect_equal(evaluation["average-precision"], measures["Average-Precision"],
check.attributes = FALSE, tolerance = 1e-3)
expect_equal(evaluation["coverage"], measures["Coverage"],
check.attributes = FALSE, tolerance = 1e-3)
expect_equal(evaluation["one-error"], measures["OneError"],
check.attributes = FALSE, tolerance = 1e-3)
expect_equal(evaluation["ranking-loss"], measures["Ranking-Loss"],
check.attributes = FALSE, tolerance = 1e-2)
#ranking-loss is compared with a looser tolerance because tie-breaking in the
#ranking order differs between implementations
#expect_equal(evaluation["is-error"], measures["IsError"],
# check.attributes = FALSE, tolerance = 1e-3)
#ranking_error
})
test_that("Sum mlconfmat", {
mlconfmat <- multilabel_confusion_matrix(parts$test, result)
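# adding two confusion matrices stacks their instances: row dimensions double
# and the per-label counts are summed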
dml <- mlconfmat + mlconfmat
ndim <- dim(mlconfmat$Z)
ndim[1] <- ndim[1] * 2
expect_equal(dim(dml$Z), ndim)
expect_equal(dim(dml$Y), ndim)
expect_equal(dim(dml$R), ndim)
expect_equal(dim(dml$TP), ndim)
expect_equal(dim(dml$FP), ndim)
expect_equal(dim(dml$TN), ndim)
expect_equal(dim(dml$FN), ndim)
expect_equal(length(dml$Zi), ndim[1])
expect_equal(length(dml$Yi), ndim[1])
expect_equal(dml$Zl, mlconfmat$Zl * 2)
expect_equal(dml$Yl, mlconfmat$Yl * 2)
expect_equal(length(dml$TPi), ndim[1])
expect_equal(length(dml$FPi), ndim[1])
expect_equal(length(dml$TNi), ndim[1])
expect_equal(length(dml$FNi), ndim[1])
expect_equal(dml$TPl, mlconfmat$TPl * 2)
expect_equal(dml$FPl, mlconfmat$FPl * 2)
expect_equal(dml$TNl, mlconfmat$TNl * 2)
expect_equal(dml$FNl, mlconfmat$FNl * 2)
ip1 <- seq(1, 10)
ip2 <- seq(11, parts$test$measures$num.instances)
part1 <- create_subset(parts$test, ip1)
part2 <- create_subset(parts$test, ip2)
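# summing the confusion matrices of two disjoint subsets must reproduce the
# confusion matrix of the whole test set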
dml <- multilabel_confusion_matrix(part1, result[ip1, ]) +
multilabel_confusion_matrix(part2, result[ip2, ])
expect_equal(dml, mlconfmat)
expected <- read.csv("../testfiles/flags-expected.csv")
bipartition <- read.csv("../testfiles/flags-bipartition.csv")
probability <- read.csv("../testfiles/flags-scores.csv")
dataset <- cbind(attr1=rep(1, nrow(expected)), expected)
mdata <- mldr::mldr_from_dataframe(dataset, seq(ncol(expected)) + 1,
name="flags")
mlresult <- multilabel_prediction(bipartition, probability, TRUE)
mlconfmat2 <- multilabel_confusion_matrix(mdata, mlresult)
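# confusion matrices built from incompatible datasets (different labels)
# cannot be added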
expect_error(mlconfmat + mlconfmat2)
})