Nothing
#' Hierarchy Of Multilabel classifiER (HOMER)
#'
#' Create a Hierarchy Of Multilabel classifiER (HOMER).
#'
#' HOMER is an algorithm for effective and computationally efficient multilabel
#' classification in domains with many labels. It constructs a hierarchy of
#' multilabel classifiers, each one dealing with a much smaller set of labels.
#'
#' @family Transformation methods
#' @param mdata A mldr dataset used to train the binary models.
#' @param base.algorithm A string with the name of the base algorithm. (Default:
#' \code{options("utiml.base.algorithm", "SVM")})
#' @param clusters Number maximum of nodes in each level. (Default: 3)
#' @param method The strategy used to organize the labels (create the
#' meta-labels). The options are: "balanced", "clustering" and "random".
#' (Default: "balanced").
#' @param iteration The number max of iterations, used by balanced or clustering
#' methods.
#' @param ... Others arguments passed to the base algorithm for all subproblems.
#' @param cores The number of cores to parallelize the training. Values higher
#' than 1 require the \pkg{parallel} package. (Default:
#' \code{options("utiml.cores", 1)})
#' @param seed An optional integer used to set the seed. (Default:
#' \code{options("utiml.seed", NA)})
#' @return An object of class \code{HOMERmodel} containing the set of fitted
#' models, including:
#' \describe{
#' \item{labels}{A vector with the label names.}
#' \item{clusters}{The number of nodes in each level}
#' \item{models}{The Hierarchy of BR models.}
#' }
#' @references
#' Tsoumakas, G., Katakis, I., & Vlahavas, I. (2008). Effective and efficient
#' multilabel classification in domains with large number of labels. In Proc.
#' ECML/PKDD 2008 Workshop on Mining Multidimensional Data (MMD'08)
#' (pp. 30-44). Antwerp, Belgium.
#' @export
#'
#' @examples
#' model <- homer(toyml, "RANDOM")
#' pred <- predict(model, toyml)
#'
#' \donttest{
#' ##Change default configurations
#' model <- homer(toyml, "RF", clusters=5, method="clustering", iteration=10)
#' }
homer <- function (mdata,
base.algorithm = getOption("utiml.base.algorithm", "SVM"),
clusters = 3, method = c("balanced", "clustering", "random"),
iteration = 100, ..., cores = getOption("utiml.cores", 1),
seed = getOption("utiml.seed", NA)) {
# Validations
if (!is(mdata, "mldr")) {
stop("First argument must be an mldr object")
}
if (clusters < 1) {
stop("The number of clusters must be greater than 1")
}
method <- switch (match.arg(method),
balanced = homer_balanced_kmeans,
clustering = homer_kmeans,
random = homer_random
)
# HOMER Model class
hmodel <- list(clusters = clusters, method = method, call = match.call())
hmodel$labels = rownames(mdata$labels)
if (!anyNA(seed)) {
set.seed(seed)
}
hmodel$models <- buildLabelHierarchy(mdata, base.algorithm, method, clusters,
iteration, ..., cores=cores, seed=seed)
class(hmodel) <- "HOMERmodel"
hmodel
}
#' Predict Method for HOMER
#'
#' This function predicts values based upon a model trained by
#' \code{\link{homer}}.
#'
#' @param object Object of class '\code{HOMERmodel}'.
#' @param newdata An object containing the new input data. This must be a
#' matrix, data.frame or a mldr object.
#' @param probability Logical indicating whether class probabilities should be
#' returned. (Default: \code{getOption("utiml.use.probs", TRUE)})
#' @param ... Others arguments passed to the base algorithm prediction for all
#' subproblems.
#' @param cores The number of cores to parallelize the prediction. Values higher
#' than 1 require the \pkg{parallel} package. (Default:
#' \code{options("utiml.cores", 1)})
#' @param seed An optional integer used to set the seed. (Default:
#' \code{options("utiml.seed", NA)})
#' @return An object of type mlresult, based on the parameter probability.
#' @seealso \code{\link[=homer]{Hierarchy Of Multilabel classifiER (HOMER)}}
#' @export
#'
#' @examples
#' model <- homer(toyml, "RANDOM")
#' pred <- predict(model, toyml)
predict.HOMERmodel <- function (object, newdata,
probability = getOption("utiml.use.probs", TRUE),
..., cores = getOption("utiml.cores", 1),
seed = getOption("utiml.seed", NA)) {
# Validations
if (!is(object, "HOMERmodel")) {
stop("First argument must be a HOMERmodel object")
}
previous.value <- getOption("utiml.empty.prediction")
options(utiml.empty.prediction = FALSE)
newdata <- utiml_newdata(newdata)
if (!anyNA(seed)) {
set.seed(seed)
}
prediction <- predictLabelHierarchy(object$model, newdata, ...,
cores=cores, seed=seed)
options(utiml.empty.prediction = previous.value)
as.mlresult(prediction, probability)
}
predictLabelHierarchy <- function(node, newdata, ..., cores, seed) {
prediction <- predict.BRmodel(node$model, newdata[, node$attributes], ...)
bipartition <- as.bipartition(prediction)
probability <- as.probability(prediction)
metalabel <- paste(unlist(lapply(node$metalabels, paste, collapse="*")),
collapse="|")
for(i in seq(node$metalabels)) {
labels <- node$metalabels[[i]]
if (length(labels) > 1) {
child <- node$children[[i]]
indexes <- bipartition[, i, drop=FALSE] == 1
if (any(indexes)) {
prediction <- predictLabelHierarchy(child, newdata[indexes, ], ...,
cores=cores, seed=seed)
new.bip <- new.prob <- as.data.frame(
matrix(0, ncol = ncol(prediction), nrow = nrow(bipartition),
dimnames = list(rownames(bipartition), colnames(prediction)))
)
new.bip[indexes, colnames(prediction)] <- as.bipartition(prediction)
bipartition <- cbind(bipartition, new.bip)
values <- probability[!indexes, i]
if (length(values) > 0) {
#TODO change this if
new.prob[!indexes, ] <- do.call(
cbind,
lapply(seq(ncol(new.prob)), function (j) values)
)
}
new.prob[indexes, colnames(prediction)] <- as.probability(prediction)
probability <- cbind(probability, new.prob)
} else {
#Predict all instances of the meta-label as negative
aux <- do.call(cbind, lapply(labels, function(lbl)
bipartition[, i, drop=FALSE]))
colnames(aux) <- labels
bipartition <- cbind(bipartition, aux)
aux <- do.call(cbind, lapply(labels, function(lbl)
probability[, i, drop=FALSE]))
colnames(aux) <- labels
probability <- cbind(probability, aux)
}
} else {
#Rename the meta-label because it is the label
colnames(bipartition)[i] <- colnames(probability)[i] <- labels
}
}
#cat(metalabel, "\n")
multilabel_prediction(
bipartition[, node$labels, drop=F], probability[, node$labels, drop=F]
)
}
buildLabelHierarchy <- function (mdata, base.algorithm, method, k, it,
..., cores, seed) {
node <- list(labels = rownames(mdata$labels), metalabels = list())
node$metalabels <- method(mdata, k, it)
newls <- do.call(cbind, lapply(node$metalabels, function (u){
as.numeric(rowSums(mdata$dataset[, u, drop=FALSE]) > 0)
}))
colnames(newls) <- paste('meta-lbl-', seq(node$metalabels), sep='')
rows <- which(rowSums(newls) > 0)
#Fix meta-label without positive instances
if (any(colSums(newls) == 0)) {
empty.labels <- colSums(newls) == 0
node$metalabels <- c(node$metalabels[!empty.labels],
unlist(node$metalabels[empty.labels]))
newls <- do.call(cbind, lapply(node$metalabels, function (u){
as.numeric(rowSums(mdata$dataset[, u, drop=FALSE]) > 0)
}))
colnames(newls) <- paste('meta-lbl-', seq(node$metalabels), sep='')
rows <- which(rowSums(newls) > 0)
}
ndata <- remove_unique_attributes(mldr_from_dataframe(
cbind(mdata$dataset[rows, mdata$attributesIndexes], newls[rows,, drop=F]),
mdata$measures$num.inputs + seq(length(node$metalabels)),
name = mdata$name
))
mtlbl <- paste(sapply(node$metalabels, paste, collapse='*'), collapse="|")
node$attributes <- colnames(ndata$dataset[, ndata$attributesIndexes])
node$model <- br(ndata, base.algorithm, ..., cores=cores, seed=seed)
rm(ndata)
node$children <- lapply(node$metalabels, function (metalabels) {
if (length(metalabels) > 1) {
excluded.label <- node$labels[!node$labels %in% metalabels]
ndata <- remove_unlabeled_instances(remove_labels(mdata, excluded.label))
buildLabelHierarchy(ndata, base.algorithm, method, k, it, ...,
cores=cores, seed=seed)
} else {
NULL
}
})
node
}
homer_balanced_kmeans <- function (mdata, k, it, ...) {
if (k >= mdata$measures$num.labels) {
return(as.list(rownames(mdata$labels)))
}
dataset <- t(mdata$dataset[, mdata$labels$index])
labels <- rownames(dataset)
Ci <- list()
centers <- dataset[sample(labels, k), ]
rownames(centers) <- NULL
for (i in seq(it)) {
ldist <- apply(dataset, 1, function (r1) {
apply(centers, 1, function (r2) stats::dist(rbind(r1, r2)))
})
has.extra <- TRUE
while(has.extra) {
j <- apply(ldist, 2, which.min)
Ci <- lapply(seq(k), function (i) sort(ldist[i, which(j == i)]))
extra <- which(unlist(lapply(Ci, length)) > ceiling(length(labels)/k))
for (i in extra) {
ldist[i, names(Ci[[i]])[length(Ci[[i]])]] <- Inf
}
has.extra <- length(extra) > 0
}
new.centers <- do.call(rbind, lapply(Ci, function (rows) {
colMeans(dataset[names(rows), , drop=FALSE])
}))
if (all(centers == new.centers)) {
break
}
centers <- new.centers
}
lapply(Ci, names)
}
homer_kmeans <- function (mdata, k, it, ...) {
if (k >= mdata$measures$num.labels) {
as.list(rownames(mdata$labels))
} else {
clusters <- stats::kmeans(t(mdata$dataset[, mdata$labels$index]),
k, iter.max = it)
split(rownames(mdata$labels), clusters$cluster)
}
}
homer_random <- function (mdata, k, ...) {
split(sample(rownames(mdata$labels)),
rep_len(seq(k), mdata$measures$num.labels))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.