#' Classification based on K-means clustering within the classes.
#'
#' Prototype methods represent the training data by a set of points in feature
#' space. Each prototype has an associated class label, and classification of a query
#' point \eqn{x} is made to the class of the closest prototype.
#'
#' In order to use K-means clustering for classification of labeled data K-means
#' clustering is applied to the training data of each class separately, using the number
#' of prototypes per class which is specified by \code{K} and defaults to 2.
#'
#' Usually for a test observation the class label of the closest prototype is predicted.
#' But it is also possible to use more than 1 prototype and to weigh the influence of the prototypes on
#' the classification according to their distances from the observation to be classified.
#' This is controlled by the arguments \code{wf}, \code{k}, \code{bw} and \code{nn.only}
#' (see \code{\link[=biweight]{wfs}}).
#'
#' The name of the window function (\code{wf}) can be specified as a character string.
#' In this case the window function is generated internally in \code{kmc}. Currently
#' supported are \code{"biweight"}, \code{"cauchy"}, \code{"cosine"}, \code{"epanechnikov"},
#' \code{"exponential"}, \code{"gaussian"}, \code{"optcosine"}, \code{"rectangular"} and
#' \code{"triangular"}.
#'
#' Moreover, it is possible to generate the window functions mentioned above in advance
#' (see \code{\link[=biweight]{wfs}}) and pass them to \code{kmc}.
#'
#' Any other function implementing a window function can also be used as \code{wf} argument.
#' This allows the user to try own window functions.
#' See help on \code{\link[=biweight]{wfs}} for details.
#'
#' It may be useful to \code{\link{scale}} the data first.
#'
#' If the predictor variables include factors, the formula interface must be used in order
#' to get a correct model matrix.
#'
#' @title K-Means Classification
#'
#' @param formula A \code{formula} of the form \code{groups ~ x1 + x2 + \dots}, that is, the response is the
#' grouping \code{factor} and the right hand side specifies the (usually non-\code{factor}) discriminators.
#' @param data A \code{data.frame} from which variables specified in \code{formula} are to be taken.
#' @param x (Required if no \code{formula} is given as principal argument.) A \code{matrix} or \code{data.frame} or \code{Matrix} containing the explanatory variables.
#' @param grouping (Required if no \code{formula} is given as principal argument.) A \code{factor} specifying the class membership for each observation.
#' @param K The number of prototypes per class, either a single number or a vector of length equal to the number of classes.
#' The numbers of centers have to be in the same order as the levels of grouping. Default is \code{K = 2}.
#' @param wf A window function which is used to calculate weights that are introduced into
#' the fitting process. Either a character string or a function, e.g. \code{wf = function(x) exp(-x)}.
#' For details see the documentation for \code{\link[=biweight]{wfs}}.
#' @param bw (Required only if \code{wf} is a string.) The bandwidth parameter of the window function. (See \code{\link[=biweight]{wfs}}.)
#' @param k (Required only if \code{wf} is a string.) The number of nearest neighbors of the decision boundary to be used in the fitting process. (See \code{\link[=biweight]{wfs}}.)
#' @param nn.only (Required only if \code{wf} is a string indicating a window function with infinite support and if \code{k} is specified.) Should
#' only the \code{k} nearest neighbors or all observations receive positive weights? (See \code{\link[=biweight]{wfs}}.)
#' @param nstart The number of random starts of the K-means algorithm. See \code{\link{kmeans}}.
#' @param \dots Further arguments to be passed to \code{\link{kmeans}}.
#' @param subset An index vector specifying the cases to be used in the training sample. (NOTE: If given, this argument must be named.)
#' @param na.action A function to specify the action to be taken if NAs are found. The default action is first
#' the \code{na.action} setting of \code{\link{options}} and second \code{\link{na.fail}} if that is unset.
#' An alternative is \code{\link{na.omit}}, which leads to rejection of cases with missing values on any required
#' variable. (NOTE: If given, this argument must be named.)
#'
#' @return An object of class \code{"kmc"} containing the following components:
#' \item{counts}{The number of observations per class.}
#' \item{x}{A \code{matrix} of prototypes.}
#' \item{grouping}{A \code{factor} specifying the class membership for each prototype.}
#' \item{lev}{The class labels (the levels of \code{grouping}).}
#' \item{N}{The number of training observations.}
#' \item{K}{The (used) number of prototypes per class.}
#' \item{call}{The (matched) function call.}
#'
#' @references
#' T. Hastie, R. Tibshirani, and J. Friedman. The Elements of Statistical Learning: Data Mining,
#' Inference, and Prediction. Springer Series in Statistics. Springer, New York, 2001.
#'
#' @seealso \code{\link{predict.kmc}}, \code{\link{kmeans}}.
#'
#' @examples
#' # generate waveform data
#' library(mlbench)
#' data.train <- as.data.frame(mlbench.waveform(300))
#'
#' # 3 centers per class
#' object <- kmc(classes ~ ., data = data.train, K = 3, wf = "rectangular", k = 1)
#' object <- kmc(data.train[,-22], data.train$classes, K = 3, wf = "rectangular", k = 1)
#'
#' # 2 centers in class 1, 3 centers in class 2, 4 centers in class 3
#' object <- kmc(classes ~ ., data = data.train, K = c(2,3,4), wf = "rectangular", k = 1)
#' object <- kmc(data.train[,-22], data.train$classes, K = c(2,3,4), wf = "rectangular", k = 1)
#'
#' @keywords classif multivariate
#'
#' @aliases kmc kmc.data.frame kmc.default kmc.formula kmc.matrix update.kmc
#'
#' @export
# \code{k} is the number of prototypes used for classification, \code{wf} the weight function.
# Per default \code{wf = "none"}, that is
# no weighting is done, and \code{k = 1}. Thus the class label of the closest prototype is assigned to each new observation to be classified.
# Other choices of \code{wf} require specification of the \code{bw} argument, the width of the window function. In these cases the argument \code{k}
# is optional. Per default all prototypes are used.
# This is similar to gaussian mixture-based discriminant analysis (cp. \code{\link[mda]{mda}}).
kmc <- function(x, ...)
UseMethod("kmc")
#' @rdname kmc
#' @export
kmc.formula <- function(formula, data, ..., subset, na.action) {
m <- match.call(expand.dots = FALSE)
m$... <- NULL
m[[1L]] <- as.name("model.frame")
m <- eval.parent(m)
Terms <- attr(m, "terms")
grouping <- model.response(m)
x <- model.matrix(Terms, m)
xint <- match("(Intercept)", colnames(x), nomatch = 0L)
if (xint > 0)
x <- x[, -xint, drop = FALSE]
res <- kmc.default(x, grouping, ...)
res$terms <- Terms
cl <- match.call()
cl[[1L]] <- as.name("kmc")
res$call <- cl
res$contrasts <- attr(x, "contrasts")
res$xlevels <- .getXlevels(Terms, m)
res$na.action <- attr(m, "na.action")
res
}
#' @rdname kmc
#' @export
kmc.data.frame <- function (x, ...) {
res <- kmc(structure(data.matrix(x, rownames.force = TRUE), class = "matrix"), ...)
cl <- match.call()
cl[[1L]] <- as.name("kmc")
res$call <- cl
res
}
#' @rdname kmc
#' @export
kmc.matrix <- function (x, grouping, ..., subset, na.action = na.fail) {
if (!missing(subset)) {
x <- x[subset, , drop = FALSE]
grouping <- grouping[subset]
}
if (missing(na.action)) {
if (!is.null(naa <- getOption("na.action"))) # if options(na.action = NULL) the default of na.action comes into play
if(!is.function(naa))
na.action <- get(naa, mode = "function")
else
na.action <- naa
}
dfr <- na.action(structure(list(g = grouping, x = x),
class = "data.frame", row.names = rownames(x)))
grouping <- dfr$g
x <- dfr$x
res <- kmc.default(x, grouping, ...)
cl <- match.call()
cl[[1L]] <- as.name("kmc")
res$call <- cl
res$na.action <- na.action
res
}
#' @rdname kmc
#' @export
kmc.default <- function(x, grouping, K = 2, wf = c("biweight", "cauchy", "cosine", "epanechnikov",
"exponential", "gaussian", "optcosine", "rectangular", "triangular"), bw, k, nn.only = TRUE, nstart = 1, ...) {
if (is.null(dim(x)))
stop("'x' is not a matrix")
x <- as.matrix(x)
n <- nrow(x)
if (any(!is.finite(x)))
stop("infinite, NA or NaN values in 'x'")
#if (missing(grouping))
# stop('argument "grouping" is missing, with no default')
if (!is.factor(grouping))
warning("'grouping' was coerced to a factor")
g <- as.factor(grouping)
if (n != length(g))
stop("'nrow(x)' and 'length(grouping)' are different")
lev <- lev1 <- levels(g)
if ((!is.numeric(K)) || !length(K))
stop("'K' must be numeric of length > 0")
if (length(K) != 1 && length(K) != length(lev))
stop("'length(K)' has to be 1 or equal to the number of classes")
if(any(abs(K - round(K)) > .Machine$double.eps^0.5)) {
K <- round(K)
warning("'K' must contain whole numbers and was rounded")
}
if (any(K < 0))
stop("'K' must be positive")
counts <- as.vector(table(g))
if (any(counts == 0)) {
empty <- counts == 0
warning(sprintf(ngettext(length(lev[empty]), "class %s is empty",
"classes %s are empty"), paste(lev[empty], collapse = ", ")),
domain = NA)
lev1 <- lev[counts > 0]
g <- factor(g, levels = lev1)
counts <- counts[counts > 0]
if (length(K) != 1) {
K <- K[counts > 0]
}
}
if (any(K > counts)) {
large <- K > counts
stop(sprintf(ngettext(length(lev1[large]), "too many prototypes for class %s",
"too many prototypes for classes %s"), paste(lev1[large], collapse = ", ")),
domain = NA)
}
nproto <- ifelse(length(K) == 1, length(lev1) * K, sum(K))
if (!missing(nstart)) {
if (!is.numeric(nstart) || !length(nstart))
stop("'nstart' must be numeric of length > 0")
if (nstart <= 0)
stop("'nstart' must be larger than zero")
if (abs(nstart - round(nstart)) > .Machine$double.eps^0.5) {
nstart <- round(nstart)
warning("'nstart' must be a whole number and was rounded")
}
}
if (is.character(wf)) {
m <- match.call(expand.dots = FALSE)
m$n <- nproto
m[[1L]] <- as.name("generatewf")
wf <- eval.parent(m)
} else if (is.function(wf)) {
if (!missing(k))
warning("argument 'k' is ignored")
if (!missing(bw))
warning("argument 'bw' is ignored")
if (!missing(nn.only))
warning("argument 'nn.only' is ignored")
if(!is.null(attr(wf, "adaptive"))) {
if(attr(wf, "adaptive")) {
if(!is.null(attr(wf, "k")) && attr(wf, "k") + 1 > n)
stop("'k + 1' is larger than 'nrow(x)'")
} else {
if(!is.null(attr(wf, "k")) && attr(wf, "k") > n)
stop("'k' is larger than 'nrow(x)'")
}
}
} else
stop("argument 'wf' is neither a character nor a function")
if (length(K) == 1) {
proto <- lapply(lev1, function(z) kmeans(x[g == z, , drop = FALSE], centers = K, nstart = nstart)$centers)
#by(x, g, function(z) kmeans(z, centers = K, nstart = nstart, ...)$centers) slower
cl <- factor(rep(lev1, each = K))
} else {
proto <- lapply(seq(along = lev1), function(z) kmeans(x[g == lev1[z], , drop = FALSE], centers = K[z], nstart = nstart, ...)$centers)
#lapply(lev1, function(z) kmeans(x[g == z,], centers = K[z], nstart = nstart, ...)$centers)
cl <- factor(rep(lev1, times = K))
names(K) <- lev1
}
proto <- do.call(rbind, proto)
colnames(proto) <- colnames(x)
call <- match.call()
call[[1]] <- as.name("kmc")
res <- structure(list(x = proto, grouping = cl, counts = counts, lev = lev, N = n, K = K, wf = wf, bw = attr(wf, "bw"), k = attr(wf, "k"),
nn.only = attr(wf, "nn.only"), adaptive = attr(wf, "adaptive"), call = call), class = "kmc")
return(res)
}
# @param x A \code{kmc} object.
# @param ... Further arguments to \code{\link{print}}.
#
#' @noRd
#'
#' @export
print.kmc <- function(x, ...) {
if (!is.null(cl <- x$call)) {
names(cl)[2] <- ""
cat("Call:\n")
dput(cl, control = NULL)
}
cat("\nNumber of prototypes per class:\n")
if (length(x$K) == 1) {
cat(x$K, "\n")
} else {
print(x$K, ...)
}
cat("\nPrototypes:\n")
print(data.frame(grouping = x$grouping, x$x, row.names = NULL), ...) # evtl. rownames weglassen
if(!is.null(attr(x$wf, "name"))) {
cat("\nWindow function: ")
cat(deparse(attr(x$wf, "name")), sep="\n")
} else {
cat("\nWindow function:\n")
cat(deparse(x$wf), sep="\n")
}
if(!is.null(x$bw))
cat("Bandwidth: ", x$bw, "\n")
if(!is.null(x$k))
cat("k: ", x$k, "\n")
if(!is.null(x$nn.only))
cat("Nearest neighbors only: ", x$nn.only, "\n")
if(!is.null(x$adaptive))
cat("Adaptive bandwidth: ", x$adaptive, "\n")
invisible(x)
}
#' @rdname kmc
#'
#' @export
#' @importFrom stats update
#'
#' @param object An object of class \code{"kmc"}.
# @param wf A window function which is used to calculate weights that are introduced into
# the fitting process. Either a character string or a function, e.g. \code{wf = function(x) exp(-x)}.
# For details see the documentation for \code{\link[=biweight]{wfs}}.
# @param bw (Required only if \code{wf} is a string.) The bandwidth parameter of the window function. (See \code{\link[=biweight]{wfs}}.)
# @param k (Required only if \code{wf} is a string.) The number of nearest neighbors of the decision boundary to be used in the fitting process. (See \code{\link[=biweight]{wfs}}.)
# @param nn.only (Required only if \code{wf} is a string indicating a window function with infinite support and if \code{k} is specified.) Should
# only the \code{k} nearest neighbors or all observations receive positive weights? (See \code{\link[=biweight]{wfs}}.)
update.kmc <- function(object, wf = c("biweight", "cauchy", "cosine", "epanechnikov",
"exponential", "gaussian", "optcosine", "rectangular", "triangular"),
bw, k, nn.only, ...) {
if (!inherits(object, "kmc"))
stop("object not of class \"kmc\"")
if (is.character(wf)) {
#if (!missing(k))
# if (k > nrow(object$x))
# stop("'k' is larger than total number of prototypes")
m <- match.call(expand.dots = FALSE)
m$object <- NULL
m$n <- nrow(object$x)
m[[1L]] <- as.name("generatewf")
wf <- eval.parent(m)
} else if (is.function(wf)) {
if (!missing(k))
warning("argument 'k' is ignored")
if (!missing(bw))
warning("argument 'bw' is ignored")
if (!missing(nn.only))
warning("argument 'nn.only' is ignored")
} else
stop("argument 'wf' is neither a character nor a function")
object$wf <- wf
object["bw"] <- list(bw = attr(wf, "bw"))
object["k"] <- list(bw = attr(wf, "k"))
object["nn.only"] <- list(bw = attr(wf, "nn.only"))
object["adaptive"] <- list(bw = attr(wf, "adaptive"))
object$call$wf <- m$wf
object$call$bw <- m$bw
object$call$k <- m$k
object$call$nn.only <- m$nn.only
return(object)
}
#' Classify multivariate observations using the prototype representation of the data
#' obtained with the \code{\link{kmc}} function.
#'
#' Classification of a new observation is based on the prototypes found in the training data by function \code{\link{kmc}}.
#'
#' This function is a method for the generic function \code{predict()} for class
#' \code{"kmc"}.
#' It can be invoked by calling \code{predict(x)} for an object \code{x} of the
#' appropriate class, or directly by calling \code{predict.kmc(x)} regardless of
#' the class of the object.
#'
#' @title K-means Classification
#'
#' @param object An object of class \code{"kmc"}.
#' @param newdata A \code{data.frame} of cases to be classified or, if object has a \code{formula}, a \code{data.frame}
#' with columns of the same names as the variables used. A vector will be interpreted
#' as a row vector. If \code{newdata} is missing, an attempt will be made to
#' retrieve the data used to fit the \code{kmc} object.
#' @param \dots Further arguments. Currently unused.
#'
#' @return A \code{list} with components:
#' \item{class}{The predicted class labels (a \code{factor}).}
#' \item{posterior}{Matrix of class posterior probabilities.}
#'
#' @references
#' T. Hastie, R. Tibshirani, and J. Friedman. The Elements of Statistical Learning: Data Mining,
#' Inference, and Prediction. Springer Series in Statistics. Springer, New York, 2001.
#'
#' @seealso \code{\link{kmc}}.
#'
#' @examples
#' # generate waveform data
#' library(mlbench)
#' data.train <- as.data.frame(mlbench.waveform(300))
#' data.test <- as.data.frame(mlbench.waveform(200))
#'
#' # prediction based on the nearest prototype
#' object <- kmc(classes ~ ., data = data.train, K = 3, wf = "rectangular", k = 1)
#' pred <- predict(object, data.test[,-22])
#' mean(pred$class != data.test$classes)
#'
#' # prediction based on the 3 nearest prototypes with weighting
#' object <- kmc(classes ~ ., data = data.train, K = 3, wf = "gaussian", bw = 1, k = 3)
#' pred <- predict(object, data.test[,-22])
#' mean(pred$class != data.test$classes)
#'
#' @keywords classif multivariate
#'
#' @rdname predict.kmc
#'
#' @export
# todo:
#system.time(replicate(1000, apply(object$x, 1, function(y) sum((y - newx)^2))))
#system.time(replicate(1000, colSums((t(object$x) - newx)^2)))
#system.time(replicate(1000, tapply(dense, object$grouping, sum)))
#system.time(replicate(1000, by(dense, object$grouping, sum)))
predict.kmc <- function(object, newdata = NULL, ...) {
weighted <- function(newx, object) {
distance <- sqrt(colSums((t(object$x) - newx)^2))
weights <- object$wf(distance)
res <- tapply(weights, object$grouping, sum)
return(res)
}
if (!inherits(object, "kmc"))
stop("object not of class \"kmc\"")
if (!is.null(Terms <- object$terms)) {
Terms <- delete.response(Terms)
if (missing(newdata))
newdata <- model.frame(object)
else {
newdata <- model.frame(Terms, newdata, na.action = na.pass,
xlev = object$xlevels)
if (!is.null(cl <- attr(Terms, "dataClasses")))
.checkMFClasses(cl, newdata)
}
x <- model.matrix(Terms, newdata, contrasts = object$contrasts)
xint <- match("(Intercept)", colnames(x), nomatch = 0L)
if (xint > 0)
x <- x[, -xint, drop = FALSE]
}
else {
if (missing(newdata)) {
if (!is.null(sub <- object$call$subset))
newdata <- eval.parent(parse(text = paste(deparse(object$call$x,
backtick = TRUE), "[", deparse(sub, backtick = TRUE),
",]")))
else newdata <- eval.parent(object$call$x)
if (!is.null(nas <- object$call$na.action))
newdata <- eval(call(nas, newdata))
}
if (is.null(dim(newdata)))
dim(newdata) <- c(1, length(newdata))
x <- as.matrix(newdata)
}
if (ncol(x) != ncol(object$x))
stop("wrong number of variables")
if (length(colnames(x)) > 0L && any(colnames(x) != dimnames(object$x)[[2L]]))
warning("variable names in 'newdata' do not match those in 'object'")
lev1 <- levels(object$grouping)
posterior <- matrix(0, nrow = nrow(x), ncol = length(lev1), dimnames = list(rownames(x), lev1))
posterior[,lev1] <- t(apply(x, 1, weighted, object = object))
gr <- factor(lev1[max.col(posterior)], levels = object$lev)
names(gr) <- rownames(x)
posterior <- posterior/rowSums(posterior)
res <- list(class = gr, posterior = posterior)
return(res)
}
#' @noRd
#'
#' @importFrom stats model.frame
#' @export
model.frame.kmc <- function (formula, ...) {
oc <- formula$call
oc $K <- oc$wf <- oc$bw <- oc$k <- oc$nn.only <- oc$nstart <- NULL
oc[[1L]] <- as.name("model.frame")
if (length(dots <- list(...))) {
nargs <- dots[match(c("data", "na.action", "subset"),
names(dots), 0)]
oc[names(nargs)] <- nargs
}
if (is.null(env <- environment(formula$terms)))
env <- parent.frame()
eval(oc, env)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.