#' =========================================================================
#' wll-02-10-2007: tune the best number of components
tune.pcalda <- function(x, y, ncomp = NULL, tune.pars, ...) {
n <- nrow(x)
g <- length(levels(y))
if (is.null(ncomp)) {
ncomp <- max(g, n - g)
} else {
if (ncomp < 1 || ncomp > max(g, n - g)) {
ncomp <- max(g, n - g)
}
}
if (missing(tune.pars)) {
tune.pars <- valipars(sampling = "rand", niter = 1, nreps = 4)
}
cat("ncomp tune (", ncomp, "):", sep = "")
res <- sapply(1:ncomp, function(i) {
cat(" ", i, sep = "")
flush.console()
accest(x, y,
pars = tune.pars, method = "pcalda", ncomp = i,
      tune = FALSE, ...
)$acc
})
cat("\n")
list(ncomp = which.max(res), acc.tune = res)
}
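#' A minimal usage sketch (not run), with data simulated purely for
#' illustration: 60 samples, 200 variables and three classes, so that
#' n - g = 57 candidate numbers of PCs are tried. valipars and accest are
#' assumed to be loaded from this package.
#' x <- matrix(rnorm(60 * 200), nrow = 60)
#' y <- factor(rep(c("A", "B", "C"), each = 20))
#' pars <- valipars(sampling = "rand", niter = 1, nreps = 4)
#' val <- tune.pcalda(x, y, tune.pars = pars)
#' val$ncomp      #' best number of PCs found
#' val$acc.tune   #' estimated accuracy for each candidate ncomp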
#' =========================================================================
#' wll-02-10-2007: Tune the best number of components
#' NOTE: This is a debug version of tune.pcalda, which can take a
#' user-defined range of ncomp.
tune.pcalda.1 <- function(x, y, ncomp = NULL, tune.pars, ...) {
n <- nrow(x)
g <- length(levels(y))
if (is.null(ncomp)) {
ncomp <- 1:max(g, n - g)
} else {
if (max(ncomp) < 1 || max(ncomp) > max(g, n - g)) {
ncomp <- 1:max(g, n - g)
}
}
if (missing(tune.pars)) {
tune.pars <- valipars(sampling = "rand", niter = 1, nreps = 4)
}
#' res <- sapply(1:ncomp, function(i) {
#' accest(x, y, pars = tune.pars, method = "pcalda", ncomp=i, tune=F,...)$acc
#' })
cat("ncomp tune (", max(ncomp), "):", sep = "")
func <- function(i) {
cat(" ", i, sep = "")
flush.console()
accest(
dat = x, cl = y, pars = tune.pars, method = "pcalda", ncomp = i,
      tune = FALSE, ...
)$acc
}
res <- sapply(ncomp, FUN = func)
names(res) <- ncomp
cat("\n")
  list(ncomp = ncomp[which.max(res)], acc.tune = res)
}
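#' A sketch of the debug version with a user-supplied range of PCs, reusing
#' the simulated x, y and pars from the tune.pcalda sketch above; the range
#' 5:20 is an arbitrary choice for illustration.
#' val.1 <- tune.pcalda.1(x, y, ncomp = 5:20, tune.pars = pars)
#' val.1$ncomp      #' best ncomp within the supplied range
#' val.1$acc.tune   #' accuracies named by the ncomp values tried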
#' =========================================================================
#' wll-02-10-2007: tune the best number of components
#' NOTE: This is a test version for tuning ncomp, which applies LDA directly
#' to the PCA scores and runs faster than tune.pcalda.
tune.pcalda.2 <- function(x, y, ncomp = NULL, center = TRUE, scale. = FALSE,
tune.pars, ...) {
n <- nrow(x)
g <- length(levels(y))
if (is.null(ncomp)) {
ncomp <- max(g, n - g)
} else {
if (ncomp < 1 || ncomp > max(g, n - g)) {
ncomp <- max(g, n - g)
}
}
pca.out <- prcomp(x, center = center, scale. = scale., ...)
ncomp <- min(ncomp, length(pca.out$center))
if (missing(tune.pars)) {
tune.pars <- valipars(sampling = "rand", niter = 1, nreps = 10)
}
cat("ncomp tune (", ncomp, "):", sep = "")
res <- sapply(1:ncomp, function(i) {
cat(" ", i, sep = "")
flush.console()
    accest(pca.out$x[, 1:i, drop = FALSE], y,
      pars = tune.pars,
      method = "lda"
    )$acc
})
cat("\n")
list(ncomp = which.max(res), acc.tune = res)
}
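#' A sketch of the fast test version, again with the simulated x and y from
#' above. The PCA is computed once on the full data and plain LDA is
#' re-fitted on the leading scores inside accest, which is why it runs
#' faster than tune.pcalda (the PCA is not re-estimated in each resampling
#' split).
#' val.2 <- tune.pcalda.2(x, y)
#' val.2$ncomp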
#' =========================================================================
#' PCA+LDA for classification
#' History:
#' wll-22-06-2007: commence
#' wll-01-07-2007: try number of PCs as n - g. Over-fitting.
#' wll-24-01-2008: strip off constant PCs within group
pcalda.default <- function(x, y, center = TRUE, scale. = FALSE, ncomp = NULL,
tune = FALSE, ...) {
#' arguments validity checking
if (missing(x) || missing(y)) {
stop("data set or class are missing")
}
x <- as.matrix(x)
if (nrow(x) != length(y)) stop("x and y don't match.")
y <- as.factor(y)
if (any(table(y) == 0)) stop("Can't have empty classes in y.")
  #' lwc-NOTE: Or simply apply y <- factor(y), which will drop unused factor levels
if (length(unique(y)) < 2) {
stop("Classification needs at least two classes.")
}
if (any(is.na(x)) || any(is.na(y))) {
stop("NA is not permitted in data set or class labels.")
}
n <- nrow(x)
g <- length(levels(y))
  #' The singularity problem of the within-class scatter matrix is overcome
  #' if the number of retained PCs ranges from at least g to at most n - g,
  #' where g and n are the number of classes and of training samples,
  #' respectively.
#' if(is.null(ncomp)) ncomp <- max(g,n - g)
  #' NOTE-wll: Too good on the training set, i.e. it overfits the training data.
#' if(is.null(ncomp)) ncomp <- max(g,round(n/2))
#' Check the number of components
if (is.null(ncomp)) {
ncomp <- if (tune) max(g, n - g) else max(g, round(n / 2))
} else {
if (ncomp < 1 || ncomp > max(g, n - g)) {
ncomp <- max(g, n - g)
}
}
#' find the best number of components
if (tune) {
val <- tune.pcalda(x, y, ncomp, ...)
ncomp <- val$ncomp
}
#' dimension reduction by PCA
pca.out <- prcomp(x, center = center, scale. = scale., ...)
ncomp <- min(ncomp, length(pca.out$center))
  x.tmp <- pca.out$x[, 1:ncomp, drop = FALSE]
  #' strip off PCs that are constant within groups
x.tmp <- preproc.const(x.tmp, y)
ncomp <- ncol(x.tmp)
  #' NOTE-28-01-2008: If the variables being stripped off are not at the end
  #' of the columns, their positions should be tracked somewhere. But this
  #' situation is rare for PCs. Refer to predict.pcalda, where ncomp is used.
lda.out <- lda(x.tmp, y)
pred <- predict(lda.out, x.tmp)
conf <- table(y, pred$class)
acc <- round(sum(diag(conf)) * 100 / n, 2)
x <- scale(x.tmp, center = colMeans(lda.out$means), scale = FALSE) %*%
lda.out$scaling
res <- list(
x = x, cl = y, pred = pred$class, posterior = pred$posterior,
conf = conf, acc = acc, ncomp = ncomp, pca.out = pca.out,
lda.out = lda.out
)
if (tune) res$acc.tune <- val$acc.tune
res$call <- match.call()
res$call[[1]] <- as.name("pcalda")
class(res) <- "pcalda"
return(res)
}
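#' A minimal fitting sketch with the simulated x and y used in the tuning
#' examples above. With ncomp = NULL and tune = FALSE the number of PCs
#' defaults to max(g, round(n / 2)); with tune = TRUE it is chosen by
#' tune.pcalda. The value 10 below is an arbitrary illustration.
#' model <- pcalda(x, y, ncomp = 10)
#' model$acc                    #' training accuracy (%)
#' model.t <- pcalda(x, y, tune = TRUE)
#' model.t$ncomp                #' number of PCs picked by tuning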
#' =========================================================================
predict.pcalda <- function(object, newdata, ...) {
if (!inherits(object, "pcalda")) stop("object not of class \"pcalda\"")
if (missing(newdata)) {
res <- list(class = object$pred, posterior = object$posterior, x = object$x)
return(res)
}
if (is.null(dim(newdata))) {
dim(newdata) <- c(1, length(newdata))
} #' a row vector
newdata <- as.matrix(newdata)
if (ncol(newdata) != length(object$pca.out$center)) {
stop("wrong number of variables")
}
#' rotated data (projection) by PCA
x <- predict(object$pca.out, newdata)
  x <- x[, 1:object$ncomp, drop = FALSE]
#' predict using LDA
pred <- predict(object$lda.out, x)
#' list(class=pred$class, posterior = pred$posterior, x = pred$x)
return(pred)
}
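#' A prediction sketch on a random train/test split of the simulated data
#' used above (the split size of 40 is arbitrary).
#' idx <- sample(1:nrow(x), 40)
#' model <- pcalda(x[idx, ], y[idx], ncomp = 10)
#' pred <- predict(model, x[-idx, ])
#' table(y[-idx], pred$class)   #' test-set confusion matrix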
#' ========================================================================
#' wll-22-06-2007: print method for pcalda
#' wll-15-01-2008: add ratio(svd values)
print.pcalda <- function(x, ...) {
cat("\nCall:\n", deparse(x$call), "\n")
cat("\nNumber of components considered:", x$ncomp)
cat("\nConfusion matrix of training data:\n")
print(x$conf)
cat("\nRatio of between- and within-group s.d.:\n")
df <- colnames(x$x)
ratio <- x$lda.out$svd
names(ratio) <- df
print(ratio)
cat("\nAccurary rate of training data:\n")
print(x$acc)
invisible(x)
}
#' ========================================================================
#' lwc-22-06-2007: summary method for pcalda
summary.pcalda <- function(object, ...) {
structure(object, class = "summary.pcalda")
}
#' ========================================================================
#' lwc-22-06-2007: summary method for pcalda
print.summary.pcalda <- function(x, ...) {
print.pcalda(x)
lev <- levels(x$cl)
cat("\nNumber of Classes: ", length(lev), "\n\n")
cat("Levels:", if (is.numeric(lev)) "(as integer)", "\n", lev)
cat("\n\n")
}
#' ========================================================================
#' wll-13-12-2007: plot method for pcalda using lattice.
#' wll-11-10-2008: Definition of svd in lda's documentation:
#' svd is the singular values, which give the ratio of the between- and
#' within-group standard deviations on the linear discriminant variables.
#' Their squares are the canonical F-statistics.
#' wll-15-01-2008: add svd listed in the plot.
plot.pcalda <- function(x, dimen, ...) {
ld.names <- function(object, comps) {
labs <- paste("LD", 1:length(object$means), sep = "")
if (missing(comps)) {
comps <- seq(along = labs)
} else {
labs <- labs[comps]
}
svd <- object$svd
svd.p <- 100 * svd / sum(svd)
#' svd.p <- 100 * svd^2/sum(svd^2)
#' wll-11-01-2008: check lda's print method: Proportion of trace
svd <- svd[comps]
svd.p <- svd.p[comps]
#' labs <- paste(labs, " (", format(svd.p, digits = 2, trim = TRUE),
#' " %)", sep = "")
labs <- paste(labs, " (", format(svd, digits = 2, trim = TRUE), ", ",
format(svd.p, digits = 2, trim = TRUE), "%)",
sep = ""
)
return(labs)
}
if (missing(dimen)) {
dimen <- seq(along = colnames(x$x))
} else {
#' check validity
if (!all(dimen %in% c(1:ncol(x$x)))) {
stop("dimen is not valid")
}
}
dfn <- ld.names(x$lda.out, dimen)
y <- x$cl
x <- data.frame(x$x[, dimen, drop = FALSE])
names(x) <- dfn
#' call group plot
p <- grpplot(x, y, plot = "pairs", ...)
p
}
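#' A plotting sketch for a fitted model such as the one above: with three
#' classes there are two discriminant variables, so dimen = 1:2 shows them
#' all in a pairwise grpplot.
#' plot(model, dimen = 1:2)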
#' =========================================================================
#' lwc-22-06-2007: plot method for pcalda. It plots the LDA scores.
plot.pcalda.1 <- function(x, panel = panel.pcalda, cex = 0.7, dimen,
abbrev = FALSE, ...) {
panel.pcalda <- function(x, y, ...) {
text(x, y, as.character(g.nlda), cex = tcex, col = unclass(g), ...)
}
ld.names <- function(object, comps) {
labs <- paste("LD", 1:length(object$means), sep = "")
if (missing(comps)) {
comps <- seq(along = labs)
} else {
labs <- labs[comps]
}
svd <- object$svd
svd <- 100 * svd^2 / sum(svd^2)
evar <- svd[comps]
labs <- paste(labs, " (", format(evar, digits = 2, trim = TRUE),
" %)",
sep = ""
)
return(labs)
}
xval <- x$x
g <- x$cl
if (abbrev) levels(g) <- abbreviate(levels(g), abbrev)
assign("g.nlda", g)
assign("tcex", cex)
if (missing(dimen)) {
dimen <- seq(along = colnames(xval))
} else {
#' check validity
if (!all(dimen %in% c(1:ncol(xval)))) {
stop("dimen is not valid")
}
}
xval <- xval[, dimen, drop = FALSE]
varlab <- ld.names(x$lda.out, dimen)
nDimen <- length(dimen)
if (nDimen <= 2) {
if (nDimen == 1) { #' One component
ldahist(xval[, 1], g, ...)
#' ldahist(xval, g, xlab=varlab,...)
} else { #' Second component versus first
xlab <- varlab[1]
ylab <- varlab[2]
eqscplot(xval, xlab = xlab, ylab = ylab, type = "n", ...)
panel(xval[, 1], xval[, 2], ...)
}
} else { #' Pairwise scatterplots of several components
pairs(xval, labels = varlab, panel = panel, ...)
}
invisible(NULL)
}
#' =========================================================================
pcalda <- function(x, ...) UseMethod("pcalda")
#' =========================================================================
pcalda.formula <- function(formula, data = NULL, ..., subset,
na.action = na.omit) {
call <- match.call()
if (!inherits(formula, "formula")) {
stop("method is only for formula objects")
}
m <- match.call(expand.dots = FALSE)
  if (inherits(eval.parent(m$data), "matrix")) {
m$data <- as.data.frame(eval.parent(m$data))
}
m$... <- NULL
m[[1]] <- as.name("model.frame")
m$na.action <- na.action
m <- eval(m, parent.frame())
Terms <- attr(m, "terms")
attr(Terms, "intercept") <- 0
x <- model.matrix(Terms, m)
y <- model.extract(m, "response")
attr(x, "na.action") <- attr(y, "na.action") <- attr(m, "na.action")
ret <- pcalda.default(x, y, ..., na.action = na.action)
ret$call <- call
ret$call[[1]] <- as.name("pcalda")
ret$terms <- Terms
if (!is.null(attr(m, "na.action"))) {
ret$na.action <- attr(m, "na.action")
}
class(ret) <- c("pcalda.formula", class(ret))
return(ret)
}
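#' A sketch of the formula interface, wrapping the simulated x and y from
#' above into a data frame (the column name cl is arbitrary).
#' dat <- data.frame(cl = y, x)
#' model.f <- pcalda(cl ~ ., data = dat, ncomp = 10)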
#' 1) tune.pcalda
#' 2) tune.pcalda.1
#' 3) tune.pcalda.2
#' 4) pcalda.default
#' 5) predict.pcalda
#' 6) print.pcalda
#' 7) summary.pcalda
#' 8) print.summary.pcalda
#' 9) plot.pcalda
#' 10) plot.pcalda.1
#' 11) pcalda
#' 12) pcalda.formula