#' (stray non-code line from page scrape -- original text: "Nothing")
#' ========================================================================
#' Calculate classification accuracy.
#' History:
#' 31-07-06: Commence
#' 15-09-06: Not restrict types of classification. Just check the validity
#' of models and the output of predict method.
#' 10-01-07: some minor changes.
#' 30-03-07: revise
#' 30-06-07: Add posterior, margin and AUC
#' 03-07-07: For loocv, no AUC are available. (Should check in code)
#' 14-12-07: method and pred.func can be a function or a character string
#' naming function.
#' 11-05-12: Re-write calculation of err, auc and mar. Keep res.all.
#' Estimate classification accuracy by re-sampling (default method).
#'
#' Arguments:
#'   dat       Matrix or data frame of predictors (one row per sample).
#'   cl        Class labels; coerced to a factor.
#'   method    Classifier: a function or a character string naming one.
#'   pred.func Prediction function or a character string naming one
#'             (default: predict).
#'   pars      Validation control parameters from valipars().
#'   tr.idx    Optional pre-computed training indices (as from trainind()).
#'   ...       Additional arguments for the classifier (used by the
#'             bootstrap re-substitution fit).
#' Returns an object of class "accest": accuracy, error, AUC and margin
#' summaries (overall, per iteration and per replicate), the sampling
#' scheme, an overall confusion matrix and all raw results (res.all).
accest.default <- function(dat, cl, method,
                           pred.func = predict, pars = valipars(),
                           tr.idx = NULL, ...) {
  #' ---- validity checking --------------------------------------------
  if (missing(dat) || missing(cl)) {
    stop("data set or class are missing")
  }
  if (missing(method)) {
    stop("'method' is missing")
  }
  cl <- as.factor(cl) #' some classifiers (e.g. SVM) need a factor
  if (nrow(dat) != length(cl)) stop("mat and cl don't match.")
  if (length(unique(cl)) < 2) {
    stop("Classification needs at least two classes.")
  }
  if (any(is.na(dat)) || any(is.na(cl))) {
    stop("NA is not permitted in data set or class labels.")
  }
  dat <- as.matrix(dat)
  n <- nrow(dat)
  #' strip row names: avoids 'row.names duplicated' warnings from svm()
  #' when bootstrap re-sampling duplicates rows
  rownames(dat) <- NULL
  #' 'method' and 'pred.func' may be functions or names of functions
  if (is.function(method)) {
    method <- deparse(substitute(method))
  }
  pred.func <-
    if (is.function(pred.func)) {
      pred.func
    } else if (is.character(pred.func)) {
      get(pred.func)
    } else {
      eval(pred.func)
    }
  #' ---- construct index of training data -----------------------------
  if (is.null(tr.idx)) {
    if (pars$sampling == "cv" && pars$nreps > n) {
      pars$sampling <- "loocv" #' more folds than samples: fall back
    }
    tr.idx <- trainind(cl, pars = pars)
  }
  pars$niter <- length(tr.idx)
  pars$nreps <- length(tr.idx[[1]])
  #' ---- main computation ---------------------------------------------
  res.all <- list()
  cat("\nACCEST Iteration (", method, ", ", pars$niter, "):", sep = "")
  for (i in seq_len(pars$niter)) {
    cat(" ", i, sep = "")
    flush.console() #' for Windows
    train.ind <- tr.idx[[i]]
    res <- list()
    for (j in seq_along(train.ind)) {
      dat.tr <- dat[train.ind[[j]], , drop = FALSE]
      cl.tr <- cl[train.ind[[j]]]
      dat.te <- dat[-train.ind[[j]], , drop = FALSE]
      cl.te <- cl[-train.ind[[j]]]
      #' NOTE(review): '...' is not forwarded here (only to the bootstrap
      #' re-substitution fit below) -- confirm this is intended
      res[[j]] <- classifier(dat.tr, cl.tr, dat.te, cl.te,
        method = method, pred.func = pred.func
      )
    }
    names(res) <- paste0("reps_", seq_len(pars$nreps))
    res.all[[i]] <- res
  }
  cat("\n")
  names(res.all) <- paste0("Iter_", seq_len(pars$niter))
  #' ---- error rate: one row per iteration, one col per replicate -----
  err.all <- lapply(res.all, function(x) sapply(x, function(y) y$err))
  err.all <- do.call(rbind, err.all)
  err.iter <- rowMeans(err.all)
  err <- mean(err.iter)
  #' ---- mean of margin (when the classifier provides posteriors) -----
  if (!is.null(res.all[[1]][[1]]$margin)) {
    mar.all <- lapply(res.all, function(x) {
      sapply(x, function(y) mean(y$margin))
    })
    mar.all <- do.call(rbind, mar.all)
    mar.iter <- rowMeans(mar.all)
    mar <- mean(mar.iter)
  } else {
    mar.all <- NULL
    mar.iter <- NULL
    mar <- NULL
  }
  #' ---- AUC (not available for loocv) --------------------------------
  if (!is.null(res.all[[1]][[1]]$auc) && pars$sampling != "loocv") {
    auc.all <- lapply(res.all, function(x) sapply(x, function(y) y$auc))
    auc.all <- do.call(rbind, auc.all)
    auc.iter <- rowMeans(auc.all)
    auc <- mean(auc.iter)
  } else {
    auc.all <- NULL
    auc.iter <- NULL
    auc <- NULL
  }
  #' ---- accuracy -----------------------------------------------------
  acc.all <- 1 - err.all
  acc.iter <- 1 - err.iter
  acc <- 1 - err
  #' ---- bootstrap error corrections (.632 and .632+) -----------------
  if (pars$sampling == "boot") {
    #' apparent (re-substitution) fit on the whole data set
    resub <- classifier(dat, cl,
      dat.te = NULL, cl.te = NULL,
      method = method, ...
    )
    err.boot <- lapply(err.iter, function(x) boot.err(x, resub))
    #' reshape into a matrix: one row per iteration
    err.boot <- t(sapply(err.boot, function(x) do.call("c", x)))
    acc.boot <- 1 - err.boot
  }
  #' ---- overall confusion matrix -------------------------------------
  #' use lapply (not sapply): folds can have unequal sizes
  all.cl <- lapply(res.all, function(x) lapply(x, function(y) y$cl))
  all.cl <- unlist(unlist(all.cl, recursive = FALSE, use.names = FALSE))
  all.pred <- lapply(res.all, function(x) lapply(x, function(y) y$pred))
  all.pred <- unlist(unlist(all.pred, recursive = FALSE, use.names = FALSE))
  conf <- table(all.cl, all.pred)
  #' ---- construct output ---------------------------------------------
  ret <- list(
    method = method,
    acc = acc,
    acc.iter = acc.iter,
    acc.all = acc.all,
    auc = auc,
    auc.iter = auc.iter,
    auc.all = auc.all,
    mar = mar,
    mar.iter = mar.iter,
    mar.all = mar.all,
    err = err,
    err.iter = err.iter,
    err.all = err.all,
    sampling = switch(pars$sampling,
      "loocv" = "leave-one-out cross-validation",
      "cv" = "cross validation",
      "boot" = "bootstrap",
      "rand" = "randomised validation (holdout)"
    ),
    niter = pars$niter,
    nreps = pars$nreps,
    conf = conf,
    res.all = res.all #' keep all raw results
  )
  if (pars$sampling == "boot") {
    ret$acc.boot <- acc.boot
  }
  class(ret) <- "accest"
  return(ret)
}
#' ========================================================================
#' wll-01-07-2007: add AUC and margin
#' Print method for "accest" objects: reports the classifier, summary
#' statistics (accuracy, and AUC/margin when available), the sampling
#' scheme and the overall confusion matrix. Returns x invisibly.
print.accest <- function(x, digits = 3, ...) {
  #' emit one "label value" pair exactly as cat() would
  show <- function(label, value) cat(label, value)
  show("\nMethod:\t\t\t", x$method)
  show("\nAccuracy:\t\t", round(x$acc, digits))
  if (!is.null(x$auc)) {
    show("\nAUC:\t\t\t", round(x$auc, digits))
  }
  if (!is.null(x$mar)) {
    show("\nMargin:\t\t\t", round(x$mar, digits))
  }
  show("\n\nNo. of iteration:\t", x$niter)
  show("\nSampling:\t\t", x$sampling)
  show("\nNo. of replications:\t", x$nreps)
  cat("\n\nOverall confusion matrix of training data:\n")
  print(x$conf)
  invisible(x)
}
#' ========================================================================
#' Summary method for "accest": re-classes the object so that
#' print.summary.accest() also reports the per-iteration accuracies.
summary.accest <- function(object, ...) {
  class(object) <- "summary.accest"
  object
}
#' ========================================================================
#' Print method for "summary.accest": the standard accest printout plus
#' the accuracy on each iteration. Returns x invisibly.
#' Fix: forward 'digits' (and '...') to print.accest; previously the
#' accest section always used print.accest's own default.
print.summary.accest <- function(x, digits = 3, ...) {
  print.accest(x, digits = digits, ...)
  cat("\nAccuracy on each iteration:\n")
  print(round(x$acc.iter, digits))
  invisible(x)
}
#' =======================================================================
#' Plot method for "accest": per-iteration accuracies as a point-and-line
#' chart. Requires more than one iteration.
plot.accest <- function(x, main = NULL, xlab = NULL, ylab = NULL, ...) {
  if (x$niter == 1) {
    stop("Number of iteration (niter) must be greater than 1")
  }
  #' default annotation built from the method and sampling scheme
  if (is.null(main)) {
    main <- paste("Performance of `", x$method, "'", "(", x$sampling, ")",
      sep = " "
    )
  }
  if (is.null(xlab)) xlab <- "Index of niter"
  if (is.null(ylab)) ylab <- "Accuracies"
  plot(x$acc.iter,
    type = "b", col = "blue",
    main = main, xlab = xlab, ylab = ylab, ...
  )
}
#' ========================================================================
#' Generic: estimate classification accuracy by re-sampling; dispatches
#' to accest.default (matrix/data frame) or accest.formula (formula).
accest <- function(dat, ...) UseMethod("accest")
#' ========================================================================
#' Formula interface for accest(): builds the model matrix from
#' 'formula' and 'data', then calls accest.default().
#'
#' Arguments:
#'   formula   Model formula, e.g. cl ~ .
#'   data      Data frame holding the variables (a matrix is converted).
#'   ...       Passed on to accest.default().
#'   subset    Optional row subset (handled by model.frame).
#'   na.action Function for handling NAs (default: na.omit).
accest.formula <- function(formula, data = NULL, ..., subset,
                           na.action = na.omit) {
  call <- match.call()
  if (!inherits(formula, "formula")) {
    stop("method is only for formula objects")
  }
  m <- match.call(expand.dots = FALSE)
  #' accept a matrix for 'data' by converting it to a data frame
  if (identical(class(eval.parent(m$data)), "matrix")) {
    m$data <- as.data.frame(eval.parent(m$data))
  }
  m$... <- NULL
  #' NOTE(review): 'scale' is not a formal argument here; presumably a
  #' leftover from an svm-style template -- confirm it can be dropped
  m$scale <- NULL
  m[[1]] <- as.name("model.frame")
  m$na.action <- na.action
  m <- eval(m, parent.frame())
  Terms <- attr(m, "terms")
  attr(Terms, "intercept") <- 0 #' no intercept column in the model matrix
  x <- model.matrix(Terms, m)
  y <- model.extract(m, "response")
  attr(x, "na.action") <- attr(y, "na.action") <- attr(m, "na.action")
  #' NOTE(review): na.action is also passed into accest.default's '...'
  #' (it has no such formal), so it ends up in the classifier dots --
  #' verify downstream classifiers tolerate it
  ret <- accest.default(x, y, ..., na.action = na.action)
  ret$call <- call
  ret$call[[1]] <- as.name("accest")
  ret$terms <- Terms
  if (!is.null(attr(m, "na.action"))) {
    ret$na.action <- attr(m, "na.action")
  }
  class(ret) <- c("accest.formula", class(ret))
  return(ret)
}
#' =======================================================================
#' Estimates the accuracy of pairwise comparison.
#' History:
#' 10-10-2006: Create
#' 03-12-2006: process fold of cv or scv is greater than number of data
#' row
#' 10-01-2007: vectorised
#' Estimate classification accuracy for each pairwise class comparison.
#'
#' Arguments:
#'   dat     Data matrix / data frame.
#'   cl      Class labels.
#'   choices Class pairs to compare (passed to .dat.sel).
#'   method  Classifier (as in accest).
#'   pars    Validation control parameters from valipars().
#'   ...     Passed on to accest().
#' Returns the comparison table, one accuracy per pair, and the
#' sampling settings used.
binest <- function(dat, cl, choices = NULL, method, pars = valipars(), ...) {
  #' build one data set per pairwise comparison
  dat.sub <- .dat.sel(dat, cl, choices = choices)
  len <- vapply(dat.sub$cl, length, integer(1))
  #' fall back to leave-one-out if any pair has fewer samples than folds
  if (pars$sampling == "cv" && pars$nreps > min(len)) {
    pars$sampling <- "loocv"
  }
  if (pars$sampling == "loocv") pars$niter <- 1
  n <- nrow(dat.sub$com)
  acc <- vapply(seq_len(n), function(x) {
    accest(dat.sub$dat[[x]], dat.sub$cl[[x]],
      method = method, pars = pars, ...
    )$acc
  }, numeric(1))
  #' label each accuracy with its comparison, e.g. "a ~ b"
  com <- apply(dat.sub$com, 1, paste, collapse = " ~ ")
  names(acc) <- com
  ret <- list(
    com = dat.sub$com, acc = acc, method = method,
    niter = pars$niter, sampling = pars$sampling
  )
  if (pars$sampling != "loocv") ret$nreps <- pars$nreps
  return(ret)
}
#' =======================================================================
#' wll-29-04-2008: Wrapper function for re-sampling based classification
#' with multiple classifiers
#' Wrapper: re-sampling based classification with multiple classifiers.
#' 'method' is a vector of classifier names; each is run through aam.cl.
#' Returns a matrix with one row per classifier (acc, auc, mar).
#' Fixes: pass 'pars' by name (it was positional, which is fragile), and
#' return the bound matrix explicitly -- the original ended with an
#' assignment, so the result was returned invisibly.
aam.mcl <- function(x, y, method, pars = valipars(), ...) {
  res <- lapply(method, function(m) {
    cat("\nClassifier = :", m, "\n")
    flush.console() #' for Windows
    aam.cl(x, y, method = m, pars = pars, ...)
  })
  names(res) <- method
  do.call(rbind, res)
}
#' ========================================================================
#' wll-29-04-2008: Calculate acc, auc and mar by calling accest
#' Calculate accuracy, AUC and margin for one classifier via accest().
#' AUC / margin are NA when not provided by the classifier.
#' Returns a named numeric vector c(acc, auc, mar), rounded to 3 digits.
aam.cl <- function(x, y, method, pars = valipars(), ...) {
  val <- accest(x, y, method = method, pars = pars, ...)
  acc <- val$acc
  #' use scalar if/else rather than ifelse(): ifelse() is vectorised and
  #' strips attributes, which is not wanted for these single values
  auc <- if (!is.null(val$auc)) val$auc else NA
  mar <- if (!is.null(val$mar)) val$mar else NA
  round(c(acc = acc, auc = auc, mar = mar), digits = 3)
}
#' ========================================================================
#' lwc-05-07-2006: Calculate classification rate.
#' lwc-16-09-2006: Arguments checking.
#' lwc-30-10-2006: Convert arguments as vectors
#' lwc-05-05-2010: Change function name from class.rate
#' lwc-19-05-2-12: Add kappa and change class.error to class.acc.
#' Usage:
#' a = c(1,2,1,2,1,2,1,2,1,2)
#' b = c(2,2,2,1,1,2,1,2,1,2)
#' c = rbind(a,b)
#' d = c(1,1,1,1,1,1,1,1,1,1)
#' cl.rate(a,b)
#' cl.rate(a,d)
#' cl.rate(a,c)
#' Classification rate from observed and predicted labels: accuracy,
#' error, confusion matrix (with per-class accuracy) and Cohen's kappa.
#' NOTE: if the arguments are factors with different levels, the
#' element-wise comparison triggers an error.
cl.rate <- function(obs, pre) {
  obs.vec <- as.vector(obs)
  pre.vec <- as.vector(pre)
  if (length(obs.vec) != length(pre.vec)) {
    stop(" 'observed' and 'predicted' must have the same length.")
  }
  n <- length(obs.vec)
  acc <- sum(obs.vec == pre.vec) / n
  err <- 1 - acc
  con <- table(observed = obs.vec, predicted = pre.vec)
  kappa <- classAgreement(con)$kappa #' from e1071
  #' append the per-class accuracy (diagonal over row totals)
  con <- cbind(con, class.acc = diag(con) / rowSums(con))
  list(acc = acc, err = err, con.mat = con, kappa = kappa)
}
#' =======================================================================
#' lwc-28-04-2010: Classification performance.
#' lwc-19-05-2012: add kappa, con.mat and positive in the returned list.
#' Note: Also see cl.auc and cl.roc for ROC assessment.
#' True positive rate: tpr = TP/P = TP/TP+FN
#' False positive rate: fpr = FP/N = FP/FP+TN
#' Accuracy: acc = P*tpr + N*(1-fpr)
#' References: Tom Fawcett, ROC Graphs: Notes and Practical Considerations
#' for Data Mining Researchers, January 7, 2003.
#' Arguments:
#' obs Factor or vector of observed class.
#' pre Factor or vector of predicted class.
#' Binary classification performance from observed and predicted labels:
#' accuracy, true/false positive rate, sensitivity, specificity, the
#' confusion matrix (with per-class accuracy) and Cohen's kappa.
#'   True positive rate:  tpr = TP / (TP + FN)
#'   False positive rate: fpr = FP / (FP + TN)
#' Reference: Tom Fawcett, "ROC Graphs: Notes and Practical
#' Considerations for Data Mining Researchers", 2003.
#' Arguments:
#'   obs Factor or vector of observed classes.
#'   pre Factor or vector of predicted classes.
#'   pos Level treated as "positive" (default: 2nd level of obs).
cl.perf <- function(obs, pre, pos = levels(as.factor(obs))[2]) {
  obs <- factor(obs)
  pre <- factor(pre)
  pos <- match.arg(pos, levels(obs))
  obs <- relevel(obs, ref = pos) #' put pos in the 1st position
  pre <- relevel(pre, ref = pos) #' put pos in the 1st position
  #' confusion matrix (rows: observed, columns: predicted)
  con <- table(obs, pre)
  kappa <- classAgreement(con)$kappa #' from e1071
  con <- cbind(con, class.acc = diag(con) / rowSums(con))
  #' rename the levels as "pos" (positive) and "neg" (negative)
  levels(obs) <- list("neg" = levels(obs)[2], "pos" = levels(obs)[1])
  levels(pre) <- list("neg" = levels(pre)[2], "pos" = levels(pre)[1])
  #' helper counting true/false predictions of one level (not used here)
  tf <- function(x) {
    x.obs <- grepl(x, obs)
    x.pre <- grepl(x, pre)
    Tr <- sum(x.pre & x.obs)
    Fa <- sum(x.pre) - Tr
    list(Tr = Tr, Fa = Fa)
  }
  #' true/false positives: TP = predicted-pos that are observed-pos
  pos.obs <- grepl("pos", obs)
  pos.pre <- grepl("pos", pre)
  TP <- sum(pos.pre & pos.obs)
  Pos <- sum(pos.pre) #' total predicted positives
  FP <- Pos - TP
  #' true/false negatives: FN = predicted-neg that are observed-pos
  neg.obs <- grepl("neg", obs)
  neg.pre <- grepl("neg", pre)
  TN <- sum(neg.pre & neg.obs)
  Neg <- sum(neg.pre) #' total predicted negatives
  FN <- Neg - TN
  P <- TP + FN #' total observed positives
  N <- FP + TN #' total observed negatives
  tpr <- TP / (TP + FN)
  fpr <- FP / (FP + TN)
  acc <- (TP + TN) / (P + N)
  spec <- 1 - fpr #' specificity
  sens <- tpr #' sensitivity or recall
  perf <- list(
    acc = acc, tpr = tpr, fpr = fpr, sens = sens, spec = spec,
    con.mat = con, kappa = kappa, positive = pos
  )
  return(perf)
}
#' =====================================================================
#' lwc-11-03-2007: Area under Receiver Operating Curve (ROC).
#' lwc-05-05-2010: Major modification.
#' lwc-01-05-2012: adjust auc
#' Note: 1.) AUC varies between 0.5 and 1.0 for sensible models; the higher
#' the better.
#' 2.) AUC should be adjusted as:
#' auc[auc < 0.5] <- 1 - auc[auc < 0.5]
#' 3.) This is the implementation of cutoff-independent AUC.
#' 4.) The AUC has an important statistical property: the AUC of a
#' classifier is equivalent to the probability that the classifier
#' will rank a randomly chosen positive instance higher than a
#' randomly chosen negative instance. This is equivalent to the
#' Wilcoxon test of ranks.
#' -- An introduction to ROC analysis by Tom Fawcett
#' 5.) Codes come from auRROC function in package limma.
#' Cut-off independent area under the ROC curve (AUC).
#' The AUC equals the probability that the classifier ranks a random
#' positive instance above a random negative one (Wilcoxon ranks);
#' values below 0.5 are folded to 1 - AUC.
#' Arguments:
#'   stat  Numeric decision statistic (higher => more likely positive).
#'   label Two-class labels; 'pos' names the positive level
#'         (default: 2nd level).
#' Adapted from the auROC function in package limma.
cl.auc <- function(stat, label, pos = levels(as.factor(label))[2]) {
  if (missing(label) || missing(stat)) stop("arguments miss")
  if (length(label) != length(stat)) stop("lengths differ")
  label <- factor(label)
  if (nlevels(label) != 2) stop("'label' must be two categorical data")
  #' put the positive level first, then recode the levels as "1"/"0"
  pos <- match.arg(pos, levels(label))
  label <- relevel(label, ref = pos)
  levels(label) <- list("0" = levels(label)[2], "1" = levels(label)[1])
  #' rank by decreasing statistic and integrate the cumulative TP rate
  label <- label[order(stat, decreasing = TRUE)]
  label <- as.numeric(levels(label))[as.integer(label)]
  tmp <- cumsum(label) / sum(label)
  auc <- mean(tmp[label == 0])
  #' fold so the reported AUC is always >= 0.5
  auc[auc < 0.5] <- 1 - auc[auc < 0.5]
  return(auc)
}
#' =====================================================================
#' lwc-11-03-2007:Area under Receiver Operating Curve (ROC)
#' Note: 1. Internal function with no documents.
#' 2. It is old version of cl.auc.
#' 3. It is modified from auROC function in package limma.
#' Area under the ROC curve for 0/1 labels (older version of cl.auc; no
#' level handling and no folding below 0.5). Internal, undocumented.
#' Adapted from the auROC function in package limma.
auc <- function(stat, label) {
  if (missing(label) || missing(stat)) stop("arguments miss")
  if (length(label) != length(stat)) stop("lengths differ")
  #' a factor label is converted back to its numeric level values
  if (is.factor(label)) {
    label <- as.numeric(levels(label))[as.integer(label)]
  }
  if (!all(sort(unique(label)) == c(0, 1))) {
    stop("'label' must take values 0 or 1")
  }
  #' rank by decreasing statistic and integrate the cumulative TP rate
  label <- label[order(stat, decreasing = TRUE)]
  tmp <- cumsum(label) / sum(label)
  auc <- mean(tmp[label == 0])
  return(auc)
}
#' =======================================================================
#' lwc-12-03-2007: ROC
#' lwc-05-05-2010: Major modification.
#' lwc-15-05-2012: Major changes:
#' 1.) The cutoff points are sorted stat
#' 2.) The rule is >=, not >.
#' Note: 1. This function is partly taken from ROCfuns.R in package ROC.
#' Note: sensitivity and specificity are estimated as:
#' sensitivity (true positive rate) =
#' Positives correctly classified / Total positives
#' specificity (true negative rate) =
#' Negatives correctly classified / Total negatives
#' ROC curve over all cut-off points of 'stat'; classification rule is
#' stat >= cutoff => positive (note: >=, not >).
#'   sensitivity (TPR) = positives correctly classified / total positives
#'   specificity (TNR) = negatives correctly classified / total negatives
#' Returns per-cutoff performance (acc, tpr, fpr, sens, spec), the
#' cutoff-independent AUC (via cl.auc) and the positive level used.
#' Partly taken from ROCfuns.R in package ROC.
cl.roc <- function(stat, label, pos = levels(as.factor(label))[2],
                   plot = TRUE, ...) {
  #' 0/1 prediction for one threshold
  roc_rule <- function(thres, x) ifelse(x >= thres, 1, 0)
  if (missing(label) || missing(stat)) stop("arguments miss")
  if (length(label) != length(stat)) stop("lengths differ")
  #' convert label to a factor
  label <- factor(label)
  if (nlevels(label) != 2) stop("'label' must be two categorical data")
  #' sort out which level is positive, then recode levels as "1"/"0"
  pos <- match.arg(pos, levels(label))
  label <- relevel(label, ref = pos) #' reorder levels of factor
  levels(label) <- list("0" = levels(label)[2], "1" = levels(label)[1])
  #' thresholds: every distinct value of the statistic, ascending
  thres <- unique(sort(stat))
  #' 0/1 predictions for each threshold
  pred <- lapply(thres, roc_rule, x = stat)
  perf <- lapply(pred, function(x) {
    sens <- mean(x[label == 1])
    spec <- mean(1 - x[label == 0])
    fpr <- 1 - spec #' false positive rate: 1 - spec
    tpr <- sens #' true positive rate: sens
    acc <- sum(label == x) / length(label)
    c(acc = acc, tpr = tpr, fpr = fpr, sens = sens, spec = spec)
  })
  #' one row per cutoff
  perf <- do.call(rbind, perf)
  perf <- cbind(cutoff = thres, perf)
  perf <- data.frame(perf)
  #' cutoff-independent AUC (on the recoded "0"/"1" labels)
  auc <- cl.auc(stat, label)
  if (plot) { #' plot ROC curve
    main <- "ROC Curve"
    xlab <- "False positive Rate" #' i.e. 1 - specificity
    ylab <- "True positive Rate" #' i.e. sensitivity
    plot(perf$fpr, perf$tpr,
      type = "n", xlim = c(0, 1), ylim = c(0, 1),
      main = main, xlab = xlab, ylab = ylab, ...
    )
    points(perf$fpr, perf$tpr, col = "red", type = "b", pch = 19)
    abline(
      h = seq(0, 1, by = .1), v = seq(0, 1, by = .1), lty = 3,
      lwd = 0.5, col = "grey"
    )
    abline(0, 1) #' chance diagonal
  }
  return(list(perf = perf, auc = auc, positive = pos))
}
#' =======================================================================
#' lwc-15-05-2012: Another version of ROC which call cl.perf.
#' lwc-19-05-2012: minor changes.
#' Note: 1.) The cutoff points are sorted stat
#' 2.) The rule is >=, not >.
#' 3.) call cl.perf to get acc, tpr, fpr, sens and spec.
#' Alternative ROC implementation that derives per-cutoff performance by
#' calling cl.perf(); rule is stat >= cutoff => positive (>=, not >).
#' Returns per-cutoff acc/tpr/fpr/sens/spec, the cutoff-independent AUC
#' and the positive level used.
cl.roc.1 <- function(stat, label, pos = levels(as.factor(label))[2],
                     plot = TRUE, ...) {
  if (missing(label) || missing(stat)) stop("arguments miss")
  if (length(label) != length(stat)) stop("lengths differ")
  label <- factor(label)
  if (nlevels(label) != 2) stop("'label' must be two categorical data")
  #' sort out which level is positive
  pos <- match.arg(pos, levels(label))
  label <- relevel(label, ref = pos) #' the 1st level is positive
  #' thresholds: every distinct value of the statistic, ascending
  thres <- unique(sort(stat))
  #' predicted factor for each threshold (>= threshold => positive level)
  pred <- lapply(thres, function(x) {
    tmp <- ifelse(stat >= x, levels(label)[1], levels(label)[2])
    tmp <- factor(tmp, levels = levels(label))
  })
  #' per-cutoff performance via cl.perf; keep only the rate columns
  perf <- lapply(pred, function(x) {
    cl.perf(label, x, pos = pos)[c("acc", "tpr", "fpr", "sens", "spec")]
  })
  perf <- do.call(rbind, perf)
  perf <- cbind(cutoff = thres, perf)
  perf <- data.frame(perf)
  #' cutoff-independent AUC
  auc <- cl.auc(stat, label)
  if (plot) { #' plot ROC curve
    main <- "ROC Curve"
    xlab <- "False positive Rate" #' i.e. 1 - specificity
    ylab <- "True positive Rate" #' i.e. sensitivity
    plot(perf$fpr, perf$tpr,
      type = "n", xlim = c(0, 1), ylim = c(0, 1),
      main = main, xlab = xlab, ylab = ylab, ...
    )
    points(perf$fpr, perf$tpr, col = "red", type = "b", pch = 19)
    abline(
      h = seq(0, 1, by = .1), v = seq(0, 1, by = .1), lty = 3,
      lwd = 0.5, col = "grey"
    )
    abline(0, 1) #' chance diagonal
  }
  return(list(perf = perf, auc = auc, positive = pos))
}
#' =======================================================================
#' lwc-01-12-06: Wrapper function for single classifier
#' History:
#' 15-09-06: check predict's output
#' 24-03-07: add dat.te and cl.te as NULLs.
#' 30-06-07: Add the posterior, margin and AUC. I doubt using average of
#' margin as an assessment factor like AUC for classification.
#' 02-07-07: Deal with constant variables which have zero SD. Any
#' possible bugs or mistakes in both programming and algorithm?
#' 04-07-07: check validity of auc
#' 06-07-07: Fix a bug
#' 03-10-07: Restore user defined predict function.
#' 14-12-07: method and pred.func can be a function or a character string
#' naming function.
#' 09-10-08: Spot a potential bug in dealing with collinearity
#' 19-05-12: remove names of margin.
#' 22-05-12: minor changes for dots processing for svm
#' Wrapper for a single classifier: train on (dat.tr, cl.tr), predict on
#' (dat.te, cl.te) and report error rate, predictions, posteriors,
#' margin and (for two-class problems) AUC.
#'
#' Arguments:
#'   dat.tr, cl.tr Training data and class labels.
#'   dat.te, cl.te Test data and labels; if NULL, the training data are
#'                 re-used (re-substitution mode).
#'   method        Classifier: function or name ("svm", "knn",
#'                 "randomForest", "lda", "qda", ...).
#'   pred.func     Prediction function or name (default: predict).
#'   ...           Passed to the classifier (and to pred.func).
classifier <- function(dat.tr, cl.tr, dat.te = NULL, cl.te = NULL, method,
                       pred.func = predict, ...) {
  #' 'method' and 'pred.func' may be functions or names of functions
  if (is.function(method)) {
    method <- deparse(substitute(method))
  }
  pred.func <-
    if (is.function(pred.func)) {
      pred.func
    } else if (is.character(pred.func)) {
      get(pred.func)
    } else {
      eval(pred.func)
    }
  #' special-case knn: wrap it so training and prediction follow the
  #' model/predict protocol used by the other classifiers (avoids
  #' multiple-argument clashes)
  if (method == "knn") {
    method <- c("knn_wrap")
    knn_wrap <- function(train, cl, ...) list(train = train, cl = cl, ...)
    pred.func <- function(object, newdata, k = 1, ...) {
      knn(train = object$train, test = newdata, cl = object$cl, k = k)
    }
  }
  #' re-substitution mode: no test set given, test on the training data
  if (is.null(dat.te) || is.null(cl.te)) {
    dat.te <- dat.tr
    cl.te <- cl.tr
  }
  #' drop constant columns (zero SD); especially needed for SVM
  idx <- which(apply(dat.tr, 2, sd) > .Machine$double.eps)
  dat.tr <- dat.tr[, idx, drop = F]
  dat.te <- dat.te[, idx, drop = F]
  #' NOTE(review): if idx is empty (e.g. a single constant column),
  #' dat.tr ends up with no columns -- see preproc.sd for handling
  nte <- length(cl.te) #' number of rows in test data
  #' predict on test and training data together; test rows come first
  dat.all <- rbind(dat.te, dat.tr)
  #' merge factors via as.character so the levels stay consistent
  cl.all <- factor(c(as.character(cl.te), as.character(cl.tr)))
  if (method != "svm") {
    model <- do.call(method, c(list(dat.tr, cl.tr), list(...)))
  } else {
    #' pre-process only for SVM: force probability = TRUE, dropping any
    #' user-supplied 'probability' from the dots
    dots <- list(...)
    if (hasArg("probability")) dots$probability <- NULL
    model <- do.call(method, c(list(dat.tr, cl.tr), probability = T, dots))
  }
  #' predict the entire (test + training) data set
  pred <- pred.func(model, data.frame(dat.all), ...)
  if (!is.list(pred)) {
    pred.te <- pred[1:nte] #' predictions for the test rows
    if (method == "svm") {
      prob <- attr(predict(model, dat.all, probability = TRUE), "probabilities")
      prob.te <- prob[1:nte, , drop = F] #' test prob
    } else if (method == "randomForest") {
      prob <- predict(model, dat.all, type = "prob")
      prob.te <- prob[1:nte, , drop = F] #' test prob
    } else {
      prob.te <- NULL
    }
  } else {
    if (!is.null(pred$class)) { # for 'lda' and 'qda'
      pred.te <- pred$class[1:nte]
      prob.te <- pred$posterior[1:nte, , drop = F]
    } else {
      stop("predict does not return a list with component 'class'.")
    }
  }
  #' test-set error rate
  err <- sum(cl.te != pred.te) / length(cl.te)
  #' order posterior columns by class levels for the AUC calculation;
  #' sorting by column name is wrong for labels like 3, 2, 10
  #' NOTE(review): if prob.te is NULL (classifier without posteriors),
  #' this subscript raises "incorrect number of dimensions" -- confirm
  #' all supported methods reach here with a posterior matrix
  prob.te <- prob.te[, levels(cl.te), drop = F]
  if (!is.null(prob.te)) {
    #' margin: posterior of true class minus best other class
    margin <- .marg(prob.te, cl.te)
    names(margin) <- NULL
    if (length(levels(cl.te)) == 2 && length(cl.te) > 1) {
      #' two-class problem: AUC from the 2nd-level posterior
      cl.tmp <- cl.te
      levels(cl.tmp) <- c(0, 1)
      stat <- prob.te[, 2]
      auc <- cl.auc(stat, cl.tmp)
    } else {
      auc <- NULL
    }
  } else {
    margin <- NULL
    auc <- NULL
  }
  ret <- list(
    err = err, cl = cl.te, pred = pred.te, posterior = prob.te,
    acc = 1 - err, margin = margin, auc = auc
  )
  return(ret)
}
#' ========================================================================
#' lwc-30-06-2007: calculate the margin of a classifier based on the
#' posterior
#' NOTE: 1. This function hacked from package 'randomForest'. For more
#' description, see package 'randomForest'.
#' 2. Internal function
#' Margin of a classifier: posterior of the true class minus the largest
#' posterior among the other classes (hacked from 'randomForest').
#' Internal function.
#' Fix: the original built a data.frame and used apply() over rows,
#' which coerces the numeric posteriors to character (losing precision
#' in the format/as.numeric round-trip); computed directly instead.
.marg <- function(prob, observed) {
  if (missing(observed) || missing(prob)) stop("arguments miss")
  if (length(observed) != nrow(prob)) stop("lengths differ")
  #' normalise rows when entries look like counts/votes, not
  #' probabilities
  if (any(prob > 1)) {
    prob <- sweep(prob, 1, rowSums(prob), "/")
  }
  observed <- as.factor(observed)
  pm <- as.matrix(prob)
  #' column of the true class for each row
  pos <- match(as.character(observed), colnames(pm))
  true.p <- pm[cbind(seq_len(nrow(pm)), pos)]
  #' largest posterior among the remaining classes
  other.p <- vapply(
    seq_len(nrow(pm)),
    function(i) max(pm[i, -pos[i]]),
    numeric(1)
  )
  ans <- true.p - other.p
  names(ans) <- observed
  ans
}
#' =========================================================================
#' lwc-16-08-2006: Calculate bootstrap, .632 bootstrap and .632 plus
#' bootstrap error rate
#' History:
#' 16-08-2006: truncate r
#' 23-03-2007: major changes in principles
#' Bootstrap, .632 and .632+ bootstrap error rates.
#' Arguments:
#'   err   Bootstrap error rate.
#'   resub Re-substitution results from classifier() (err, cl, pred).
#' Returns list(ae, boot, b632, b632p).
boot.err <- function(err, resub) {
  #' apparent error rate / re-substitution rate (not training error)
  err.ae <- resub$err
  cl <- resub$cl
  pred <- resub$pred
  #' .632 bootstrap error
  err.b632 <- 0.368 * err.ae + 0.632 * err
  #' no-information error rate gamma: mismatch rate over all
  #' observation/prediction pairs
  mismatch <- outer(cl, pred, function(x, y) ifelse(x == y, 0, 1))
  gamma <- sum(mismatch) / (length(cl)^2)
  #' relative overfitting rate; set to 0 when not well defined (falls
  #' outside the err > ae and gamma > ae regime)
  r <- if (err > err.ae && gamma > err.ae) {
    (err - err.ae) / (gamma - err.ae)
  } else {
    0
  }
  errprime <- min(err, gamma)
  #' .632+ bootstrap error
  err.b632p <-
    err.b632 + (errprime - err.ae) * (0.368 * 0.632 * r) / (1 - 0.368 * r)
  list(ae = err.ae, boot = err, b632 = err.b632, b632p = err.b632p)
}
#' =========================================================================
#' Control parameters for validation using in estimation of classification
#' and feature selection.
#' lwc-03-08-2006: commence
#' lwc-21-03-2007: revise
#' Control parameters for validation, used in estimation of
#' classification accuracy and feature selection.
#' Arguments:
#'   sampling One of "loocv", "cv", "boot", "rand".
#'   niter    Number of iterations (forced to 1 for loocv).
#'   nreps    Number of folds (cv) or replications.
#'   strat    Stratified sampling?
#'   div      Training fraction, used only for "rand".
#' Returns an object of class "valipars".
valipars <- function(sampling = "cv", niter = 10, nreps = 10,
                     strat = FALSE, div = 2 / 3) {
  sampling <- match.arg(sampling, c("loocv", "cv", "boot", "rand"))
  if (sampling == "cv" && nreps < 2) {
    stop("Number of fold (nreps) for cv must be greater than 1.")
  }
  #' keep only the components relevant to the chosen scheme
  res <- switch(sampling,
    "loocv" = list(sampling = sampling, niter = 1),
    "rand" = list(
      sampling = sampling, niter = niter, nreps = nreps,
      strat = strat, div = div
    ),
    list(
      sampling = sampling, niter = niter, nreps = nreps,
      strat = strat
    )
  )
  class(res) <- "valipars"
  return(res)
}
#' =======================================================================
#' Generate index for training or feature ranking
#' lwc-27-10-2006: commence
#' lwc-29-10-2006: empty class checking
#' lwc-04-12-2006: support iteration information
#' lwc-13-02-2007: Fix a bug (set.seed(71))
#' lwc-22-03-2007: revise
#' Generate training-set indices for each iteration according to the
#' re-sampling scheme in 'pars' (a valipars object).
#' Returns a named list of length pars$niter; each element is itself a
#' list of training index vectors (one per fold / replication).
trainind <- function(cl, pars = valipars()) {
  if (!inherits(pars, "valipars")) {
    stop("pars not of class valipars")
  }
  cl <- factor(cl) #' drop unused factor levels
  #' build the full index list for one iteration
  idx.func <- function(cl, pars = valipars()) {
    n <- length(cl)
    if (pars$sampling == "loocv") {
      #' leave-one-out: training index i omits sample i
      train.ind <- lapply(1:n, function(i) seq(1, n)[-i])
    } else {
      #' keep re-drawing until both conditions hold:
      #' 1) each class has at least 2 samples in every training index
      #' 2) each class has at least 1 sample in every test index
      emp_f <- T
      while (emp_f) {
        emp_f <- !emp_f
        switch(pars$sampling,
          "cv" = {
            train.ind <- cv.idx(cl, pars$nreps, strat = pars$strat)
          },
          "rand" = {
            train.ind <- rand.idx(cl, pars$nreps,
              strat = pars$strat,
              div = pars$div
            )
          },
          "boot" = {
            train.ind <- boot.idx(cl, pars$nreps, strat = pars$strat)
          }
        )
        #' validity check; any failure triggers a complete re-draw
        for (i in 1:length(train.ind)) {
          if (any(table(cl[train.ind[[i]]]) < 2) ||
            any(table(cl[-train.ind[[i]]]) < 1)) {
            emp_f <- !emp_f
            break
          }
        } #' end of for
      } #' end of while
    } #' end of else
    return(train.ind)
  }
  tr.idx <- list()
  for (i in 1:pars$niter) {
    tr.idx[[i]] <- idx.func(cl, pars = pars)
  }
  names(tr.idx) <- paste("Iter_", 1:pars$niter, sep = "")
  return(tr.idx)
}
#' ========================================================================
#' Bootstrap training indices: 'nreps' samples of size length(x) drawn
#' with replacement, optionally stratified by the class vector x.
#' Returns a list of 'nreps' index vectors.
boot.idx <- function(x, nreps, strat = FALSE) {
  n <- length(x)
  if (strat) {
    x <- factor(x) #' drop levels that do not occur
    idx <- sample(seq_len(n), n, replace = FALSE) #' shuffle original order
    x <- x[idx]
    v <- nlevels(x)
    #' NOTE(review): s.idx holds positions in the *shuffled* x but is
    #' later used to index the original data; verify the stratification
    #' is as intended (cf. the earlier idx[which(...)] variant)
    s.idx <- lapply(seq_len(v), function(i) which(x == levels(x)[i]))
    #' per replication: bootstrap within each class, then concatenate
    train.ind <- lapply(seq_len(nreps), function(y) { #' y is not used
      tmp <- lapply(s.idx, function(x) sample(x, length(x), replace = TRUE))
      do.call("c", tmp)
    })
    #' shuffle the per-class concatenation
    train.ind <-
      lapply(train.ind, function(x) sample(x, length(x), replace = FALSE))
  } else {
    train.ind <-
      lapply(seq_len(nreps), function(x) sample(seq_len(n), n, replace = TRUE))
  }
  return(train.ind)
}
#' ========================================================================
#' Random (holdout) training indices: 'nreps' draws without replacement,
#' each of size trunc(n * div), optionally stratified by class vector x.
#' Returns a list of 'nreps' index vectors.
rand.idx <- function(x, nreps, strat = FALSE, div = 2 / 3) {
  n <- length(x)
  if (strat) {
    x <- factor(x) #' drop levels that do not occur
    idx <- sample(seq_len(n), n, replace = FALSE) #' shuffle original order
    x <- x[idx]
    v <- nlevels(x)
    #' NOTE(review): s.idx holds positions in the *shuffled* x but is
    #' later used to index the original data; verify the stratification
    #' is as intended (cf. the earlier idx[which(...)] variant)
    s.idx <- lapply(seq_len(v), function(i) which(x == levels(x)[i]))
    #' per replication: take a 'div' fraction within each class
    train.ind <-
      lapply(seq_len(nreps), function(y) { #' y is not used
        tmp <- lapply(
          s.idx,
          function(x) sample(x, trunc(length(x) * div), replace = FALSE)
        )
        do.call("c", tmp)
      })
    #' shuffle the per-class concatenation
    train.ind <-
      lapply(train.ind, function(x) sample(x, length(x), replace = FALSE))
  } else {
    train.ind <-
      lapply(seq_len(nreps), function(x) {
        sample(seq_len(n), trunc(n * div), replace = FALSE)
      })
  }
  return(train.ind)
}
#' ========================================================================
#' Cross-validation training indices: split the samples into 'nreps'
#' folds (optionally stratified by the class vector x) and return, for
#' each fold, the index vector of the training samples (everything
#' except that fold).
cv.idx <- function(x, nreps, strat = FALSE) {
  #' One change has been made to get different results for each calling.
  #' split the samples into k test subsets (from package ipred)
  ssubset <- function(y, k, strat = TRUE) {
    if (!is.factor(y)) y <- as.factor(y)
    N <- length(y)
    nlevel <- table(y)
    nindx <- list()
    #' shuffle the indices so repeated calls give different folds
    indx <- sample(1:N, N, replace = F)
    y <- y[indx]
    outindx <- list()
    if (strat) {
      #' original indices of each class (aligned with shuffled y)
      for (j in 1:length(nlevel)) {
        nindx <- c(nindx, list(indx[which(y == levels(y)[j])]))
      }
      #' kmat[j, i]: number of class-j samples assigned to fold i
      kmat <- kfoldcv(k, N, nlevel)
      for (i in 1:k) {
        sset <- kmat[, i]
        kindx <- c()
        #' fold i takes the next kmat[j, i] indices of each class j
        for (j in 1:length(nlevel)) {
          if (i > 1) {
            kindx <- c(kindx, nindx[[j]][(sum(kmat[
              j,
              1:(i - 1)
            ]) + 1):sum(kmat[j, 1:i])])
          } else {
            kindx <- c(kindx, nindx[[j]][1:kmat[j, 1]])
          }
        }
        kindx <- kindx[!is.na(kindx)]
        outindx <- c(outindx, list(kindx))
      }
      return(outindx)
    } else {
      kmat <- kfoldcv(k, N)
      nindx <- indx
      #' consecutive runs of the shuffled indices form the folds
      for (i in 1:k) {
        if (i > 1) {
          outindx <- c(
            outindx,
            list(nindx[(sum(kmat[1:(i - 1)]) + 1):sum(kmat[1:i])])
          )
        } else {
          outindx <- c(outindx, list(nindx[1:kmat[1]]))
        }
      }
    }
    return(outindx)
  }
  #' fold sizes for k-fold CV of N samples (from package ipred); when
  #' 'nlevel' is given, returns one row of per-fold sizes per class
  kfoldcv <- function(k, N, nlevel = NULL) {
    if (is.null(nlevel)) { # no stratification
      #' more folds than samples: N singleton folds, rest empty
      if (k > N) {
        return(c(rep(1, N), rep(0, k - N)))
      }
      fl <- floor(N / k)
      ce <- ceiling(N / k)
      if (fl == ce) {
        return(rep(fl, k))
      } else {
        #' mix of ceiling- and floor-sized folds summing to N
        return(c(rep(ce, round((N / k - fl) * k)), rep(fl, round((1 - (N / k -
          fl)) * k))))
      }
    } else { # stratification
      kmat <- matrix(0, ncol = k, nrow = length(nlevel))
      for (i in 1:length(nlevel)) {
        kmat[i, ] <- kfoldcv(k, nlevel[i])
      }
      return(kmat)
    }
  }
  n <- length(x)
  #' get the test index of each fold
  test.ind <- ssubset(x, nreps, strat = strat)
  #' training index = complement of the test fold
  train.ind <- lapply(1:nreps, function(i) seq(1, n)[-test.ind[[i]]])
  #' shuffle the results
  train.ind <- lapply(train.ind, function(x) sample(x, length(x), replace = F))
  return(train.ind)
}
#' 1) accest.default
#' 2) print.accest
#' 3) summary.accest
#' 4) print.summary.accest
#' 5) plot.accest
#' 6) accest
#' 7) accest.formula
#' 8) binest
#' 9) aam.mcl
#' 10) aam.cl
#' 11) cl.rate
#' 12) cl.perf
#' 13) cl.auc
#' 14) auc
#' 15) cl.roc
#' 16) cl.roc.1
#' 17) classifier
#' 18) .marg
#' 19) boot.err
#' 20) valipars
#' 21) trainind
#' 22) boot.idx
#' 23) rand.idx
#' 24) cv.idx
#' (stray HTML boilerplate from the hosting page, commented out so the
#' file remains valid R:)
#' "Any scripts or data that you put into this service are public."
#' "Add the following code to your website."
#' "For more information on customizing the embed code, read Embedding
#' Snippets."