##===========================================================================
## Calculate classification accuracy.
##
##===========================================================================
accest.default <- function(dat, cl, clmeth, pars = NULL,tr.idx = NULL,verb=TRUE,clmpi=NULL,seed=NULL,...)
{
dots <-list(...)
## validity checking
if (missing(dat) || missing(cl))
stop("data set or class are missing")
if (missing(clmeth))
stop("'clmeth' is missing")
if (nrow(dat) != length(cl)) stop("mat and cl don't match.")
if (length(unique(cl)) < 2)
stop("Classification needs at least two classes.")
if (any(is.na(dat)) || any(is.na(cl)))
stop("NA is not permitted in data set or class labels.")
cl <- as.factor(cl)
nam=paste(levels(cl),collapse="~")
dat <- as.matrix(dat)
n <- nrow(dat)
rownames(dat) <- NULL
## construct index of train data
if(is.null(pars))
pars=valipars(sampling="rand", niter = 1, nreps=10,strat=TRUE)
if(is.null(tr.idx)){
if (pars$sampling == "cv" && pars$nreps > n ){
pars$sampling <- "loocv"
}
tr.idx <- trainind(cl, pars = pars)
}
pars$niter <- length(tr.idx)
pars$nreps <- length(tr.idx[[1]])
## Initialize
res.all <- pred.all<- list()
params<-list(dat=dat,cl=cl,tr.idx=tr.idx,verb=verb,seed=seed,clmeth=clmeth,dots=dots)
if(verb)
cat("ACCEST Iteration (",pars$niter,"):",sep="")
## Choose between MPI/PVM or single proc
if(!is.null(clmpi)){
tmp<-clusterApplyLB(clmpi,1:pars$niter,
# loopfct,params)
FIEmspro:::loopfct,params)
for (i in 1:pars$niter) {
res.all[[i]] <- tmp[[i]]$res.all
pred.all[[i]] <- tmp[[i]]$pred.all
}
}
else{
for (i in 1:pars$niter) {
# tmp<-loopfct(i,params)
tmp<-FIEmspro:::loopfct(i,params)
res.all[[i]] <- tmp$res.all
pred.all[[i]] <- tmp$pred.all
}
}
## End
if(verb)
cat("\n")
names(res.all) <- paste("Iter_",seq(1,pars$niter),sep="")
## accurary
err.all <- parse_vec(res.all, "err")
acc.iter <- 1-apply(err.all,2,mean)
## process bootstrap acc
if (pars$sampling == "boot"){
plist<-c(list(dat,cl,1:length(cl),clmeth),params$dots,rs.iter=1,rs.rep=1)
if(is.null(getS3method("predict",clmeth,optional=TRUE)))
resub<-do.call(FIEmspro:::classifier.1,plist)
else
resub<-do.call(FIEmspro:::classifier.0,plist)
# resub<-do.call(classifier.1,plist)
# else
# resub<-do.call(classifier.0,plist)
err.boot <- lapply(1-acc.iter, function(x) FIEmspro:::boot.err(x, resub))
err.boot <- t(sapply(err.boot, function(x) do.call("c", x)))
acc.boot <- 1 - err.boot
}
## ---------------- overall confusion matrix ----------------------
## wll-01-06-2007: Do not use sapply here because sometimes get non-equal fold.
all.cl <- lapply(res.all, function(x){
foo <- function(x) lapply(x, function(y) y$cl)
foo(x)
})
all.cl <- unlist(unlist(all.cl, recursive = F,use.names=F))
all.pred <- lapply(res.all, function(x){
foo <- function(x) lapply(x, function(y) y$pred)
foo(x)
})
all.pred <- unlist(unlist(all.pred, recursive = F,use.names=F))
## overall confusion matrix
conf <- table(all.cl, all.pred)
## ---------------------------------------------------------------
## calculate mean of margin
if (!is.null(res.all[[1]][[1]]$margin)) {
mar.all <- t(sapply(res.all, function(x) {
func <- function(x) sapply(x, function(y) mean(y$margin,na.rm=TRUE))
func(x)
}))
mar.iter <- apply(mar.all,1,mean)
mar <- mean(mar.iter)
} else {
mar.iter <- NULL
mar <- NULL
}
## calculate AUC
if (!is.null(res.all[[1]][[1]]$auc) && pars$sampling != "loocv") {
auc.all <- t(sapply(res.all, function(x) {
func <- function(x) sapply(x, function(y) FIEmspro:::auc(y$auc[,1],y$auc[,2]-1))
func(x)
}))
auc.iter <- apply(auc.all,1,mean)
auc <- mean(auc.iter)
} else {
auc.iter <- NULL
auc <- NULL
}
## ---------------------------------------------------------------
########## dle addons
## process arguments -> new component in returned list
if(length(dots)==0)
argfct="default"
else
argfct=paste(paste(names(unlist(dots)),unlist(dots),sep="="),collapse=",")
## process all $mod into one list -> new component in returned list if not null
all.mod<-list()
if(!is.null(res.all[[1]][[1]]$mod)){
for(k in 1:length(res.all)){
tmp<-list()
for(l in 1:length(res.all[[k]])){
tmp[[l]]=res.all[[k]][[l]]$mod
}
all.mod[[k]]=tmp
}
}
#####
## construct output
ret <- list(clmeth = clmeth,
acc = mean(acc.iter),
acc.iter = acc.iter,
mar = mar,
mar.iter = mar.iter,
auc = auc,
auc.iter = auc.iter,
sampling = switch(pars$sampling,
"loocv" = "leave-one-out cross-validation",
"cv" = "cross validation",
"boot" = "bootstrap",
"rand" = "randomised validation (holdout)"
),
niter = pars$niter,
nreps = pars$nreps,
conf = conf,
argfct = argfct, ### added by dle to keep trace of the arguments passed to the clmeth
mod = all.mod, ### added by dle to keep elements in $mod
pred.all = pred.all, ### added by dle to keep all predictions
cl.task = nam, ### addedd by dle to keep trace of the classification task
pars = pars,
pars.mini = parsfct(pars)
)
if (pars$sampling == "boot") {
## ret$err.boot <- err.boot
ret$acc.boot <- acc.boot
}
class (ret) <- "accest"
return(ret)
}
####
## Function to be called during parallelisation
loopfct<-function(i,params){
if(params$verb)
cat(" ", i, sep = "")
flush.console() ## for Windows
train.ind <- params$tr.idx[[i]]
dat<-params$dat
cl<-params$cl
res <- list()
tmp<-list(cl=NULL,pr=NULL,fo=NULL,auc=NULL,cl2=NULL,lsamp=NULL)
for (j in 1:length(train.ind)) {
ltr<-train.ind[[j]]
lte<-(1:length(cl))[-train.ind[[j]]]
if(!is.null(params$seed))
set.seed(params$seed+1000*i+j)
plist<-c(list(dat,cl,ltr,params$clmeth,rs.iter=i,rs.rep=j),params$dots)
if(is.null(getS3method("predict",params$clmeth,optional=TRUE)))
# res[[j]]<-do.call(classifier.1,plist)
# else
# res[[j]]<-do.call(classifier.0,plist)
res[[j]]<-do.call(FIEmspro:::classifier.1,plist)
else
res[[j]]<-do.call(FIEmspro:::classifier.0,plist)
tmp$fo=c(tmp$fo,rep(j,length(lte)))
tmp$cl=c(tmp$cl,as.character(cl[-ltr]))
tmp$pr=c(tmp$pr,as.character(res[[j]]$pred))
tmp$lsamp=c(tmp$lsamp,lte)
if(nlevels(cl)==2){
tmp$auc=c(tmp$auc,res[[j]]$auc[,1])
tmp$cl2=c(tmp$cl2,res[[j]]$auc[,2])
}
else
tmp$auc<-tmp$cl2<-NULL
}
tmp$cl <- factor(tmp$cl,levels=levels(cl))
tmp$pr <- factor(tmp$pr,levels=levels(cl))
list(res.all=res,pred.all=tmp)
}
## End
## Fct to summurise pars
parsfct<-function(pars){
if(pars$sampling=="loocv")
rep="loocv"
else{
str<-"NSt"
if(pars$strat) str<-"St"
if(pars$niter>1)
rep<-paste(pars$niter,"x",str,"-",pars$nreps,"-",pars$sampling,sep="")
else
rep<-paste(str,"-",pars$nreps,"-",pars$sampling,sep="")
}
if(!is.null(pars$div))
rep=paste(rep,"-",round(pars$div,2),sep="")
rep
}
####
##========================================================================
##
print.accest <- function(x, digits=3,...) {
cat("Method:\t\t", x$clmeth)
cat("\nArguments:\t", x$argfct)
cat("\nDiscrimination:\t", x$cl.task)
cat("\n\nNo. of iteration:\t", x$niter)
cat("\nSampling:\t\t", x$sampling)
cat("\nNo. of replications:\t", x$nreps)
cat("\n\nAccuracy:\t\t", round(x$acc,digits))
if (!is.null(x$auc))
cat("\nAUC:\t\t\t", round(x$auc,digits))
if (!is.null(x$mar))
cat("\nMargin:\t\t\t", round(x$mar,digits))
cat("\n\nOverall confusion matrix of training data:\n")
print(x$conf)
invisible(x)
}
##========================================================================
summary.accest <- function(object, ...)
structure(object, class = "summary.accest")
##========================================================================
print.summary.accest <- function(x, digits=3,...)
{
print.accest(x)
cat("\nAccuracy on each iteration:\n")
print(round(x$acc.iter,digits))
invisible(x)
}
## =======================================================================
#### Modifications by dle to allow mar, acc and auc to be plotted
#' Plot Method for Class 'accest'
#'
#' Plots accuracy, margin or AUC at each iteration.
#'
#' This function is a method for the generic function \code{plot()} for class
#' \code{accest}. It plots a performance estimate against iteration index.
#'
#' @usage \method{plotaccest}(x, toplot="acc", main = NULL, xlab = NULL, ylab =
#' NULL, \dots{})
#' @param x An object of class \code{accest}.
#' @param toplot Quantity to plot (default=acc, mar, auc)
#' @param main An overall title for the plot.
#' @param xlab A title for the x axis.
#' @param ylab A title for the y axis.
#' @param \dots Additional arguments to the plot.
#' @author David Enot and Wanchang Lin \email{dle,wll@@aber.ac.uk}.
#' @seealso \code{\link{accest}}, \code{\link{accest}}
#' @keywords hplot
#' @examples
#'
#' # Iris data
#' data(iris)
#' x <- as.matrix(subset(iris, select = -Species))
#' y <- iris$Species
#' pars <- valipars(sampling="cv", niter=10, nreps=5, strat=TRUE)
#'
#' acc <- accest(x,y, clmeth = "randomForest", pars = pars, ntree=50)
#' acc
#' ## plot accuracies
#' plot(acc)
#'
#' ## plot margins
#' plot(acc,toplot="mar")
#'
plot.accest <- function(x, toplot="acc", main = NULL, xlab = NULL, ylab = NULL, ...)
{
if (x$niter == 1)
stop("Number of iteration (niter) must be greater than 1")
if (is.null(main))
main <- paste("Performance of `",x$clmeth, "'", sep=" ","(",x$sampling,")" )
if (is.null(xlab)) xlab <- "Iterations"
if (is.null(ylab)) ylab <- paste("Indicator:",toplot)
vect<-NULL
eval(parse(text=paste("vect=x$",toplot,".iter",sep="")))
plot(vect, type = "b", main=main,xlab=xlab, ylab=ylab, col="blue",...)
}
##===========================================================================
#' Classification Wrapper Using Customised Classifiers
#'
#' Wrapper function for calculating classification estimates using pre-defined
#' data partitioning sets (\code{\link{valipars}} and \code{\link{trainind}}).
#' This function works with two type of classifiers. First generic classifiers
#' that fulfil R standards to define predictive techniques such as the ones
#' available in packages like \pkg{MASS}, \pkg{e1071} or \code{randomForest}
#' and \code{nlda} are normally handle with \code{accest}: the name of function
#' (\code{clmeth} in the \code{accest} call) must be accompanied with an S3
#' method \code{predict}; the later function should return a list with
#' component 'class' (hard classification) and if possible 'prob' or
#' 'posterior' for class probabilities. If the algorithm doesn't fulfil these
#' requirements, two postions can be adopted: 1) define explicitly the
#' algorithm so that it means R standards 2) define customised a function that
#' returns necessary informations. The second ('quicky and dirty') approach is
#' illustrated in an example given below. Unless the classifier can only cope
#' with two-class tasks, this function allows the manipulation of any problem
#' complexity. Three types of estimates are given for each replication:
#' accuracy, so-called margin and AUC (see details). Data input can be in the
#' form of \code{data} matrix + \code{class} vector, following the classic
#' formula type or derived from \code{\link{dat.sel1}}.
#'
#' Seexxxx for common details.
#'
#' @aliases accest accest.formula accest.default accest.dlist print.accest
#' summary.accest print.summary.accest
#' @usage accest(\dots{})
#'
#' \method{accestdefault}(dat, cl, clmeth, pars = NULL, tr.idx = NULL,
#' verb=TRUE, clmpi=NULL, seed=NULL, \dots{})
#'
#' \method{accestformula}(formula, data = NULL, \dots{}, subset, na.action =
#' na.omit)
#'
#' \method{accestdlist}(dlist, clmeth,pars = NULL, tr.idx = NULL, \dots{})
#' @param formula A formula of the form \code{groups ~ x1 + x2 + \dots{}} That
#' is, the response is the grouping factor and the right hand side specifies
#' the (non-factor) discriminators.
#' @param data Data frame from which variables specified in \code{formula} are
#' preferentially to be taken.
#' @param dlist A matrix or data frame containing the explanatory variables if
#' no formula is given as the principal argument.
#' @param dat A matrix or data frame containing the explanatory variables if no
#' formula is given as the principal argument.
#' @param cl A factor specifying the class for each observation if no formula
#' principal argument is given.
#' @param clmeth Classifier function. For details, see \code{note} below.
#'
#' @param pars A list of parameters using by the resampling method such as
#' \emph{Leave-one-out cross-validation}, \emph{Cross-validation},
#' \emph{Bootstrap} and \emph{Randomised validation (holdout)}. See
#' \code{\link{valipars}} for details.
#' @param tr.idx User defined index of training samples. Can be generated by
#' \code{trainind}.
#' @param verb Should iterations be printed out?
#' @param clmpi snow cluster information
#' @param seed Seed.
#' @param \dots Additional parameters to be passed to \code{clmeth}.
#' @param subset Optional vector, specifying a subset of observations to be
#' used.
#' @param na.action Function which indicates what should happen when the data
#' contains \code{NA}'s, defaults to \code{\link{na.omit}}.
#' @return An object of class \code{accest}, including the components:
#' \item{clmeth}{Classification method used.} \item{acc}{Average accuracy.}
#' \item{acc.iter}{Accuracy at each iteration.} \item{acc.std}{Standard
#' derivation of accuracy.} \item{mar}{Average predictive margin.}
#' \item{mar.iter}{Predictive margin of each iteration.} \item{auc}{Average
#' area under receiver operating curve (AUC).} \item{auc.iter}{AUC of each
#' iteration.} \item{sampling}{Sampling scheme used.} \item{niter}{Number of
#' iterations.} \item{nreps}{Number of replications at each iteration. }
#' \item{acc.boot}{Detailed bootstrap accuracy estimates when bootstrap
#' validation method is employed.} \item{argfct}{Arguments passed to the
#' classifier. } \item{pred.all}{For each iteration, list of the fold/bootstrap
#' id and the true and predicted classes.} \item{cl.task}{Discrimination task.}
#' \item{mod}{List of information return by the user defined classifier
#' function. }
#' @author David Enot \email{dle@@aber.ac.uk} and Wanchang Lin
#' \email{wll@@aber.ac.uk}
#' @seealso \code{\link{valipars}}, \code{\link{trainind}}
#' @keywords classif
#' @examples
#'
#'
#' ## -----------------------------------------------------------------
#' ## simple customised function
#' ## sameasrf simply reproduces the RF modelling task
#' sameasrf <- function(data,...){
#' dots <- list(...)
#'
#' ## Build RF model and predict dat.te
#' mod <- randomForest(data$tr,data$cl,...)
#' ## Soft predictions (optional if ROC/margin analyses required)
#' prob <- predict(mod,data$te,type="vote")
#' ## Hard predictions
#' pred <- predict(mod,data$te)
#'
#' # For illustration, mod does not contain anything
#' list(mod=NULL,pred=pred,prob=prob,arg=dots)
#' }
#'
#' ## -----------------------------------------------------------------
#' ## compare accest with randomForest
#' ## and sameasrf
#' data(iris)
#' dat=as.matrix(iris[,1:4])
#' cl=as.factor(iris[,5])
#' pars <- valipars(sampling = "boot",niter = 2, nreps=10)
#' tr.idx <- trainind(iris$Species,pars)
#'
#' set.seed(71)
#' acc.1 <- accest(dat,cl, clmeth = "sameasrf",
#' pars = pars,tr.idx = tr.idx,ntree = 200)
#' summary(acc.1)
#'
#' set.seed(71)
#' acc.2 <- accest(dat,cl, clmeth = "randomForest",
#' pars = pars,tr.idx = tr.idx,ntree = 200)
#' summary(acc.2)
#'
#' ### compare acc.1 and acc.2 bootstrap error estimates
#' print(acc.1$acc.boot-acc.2$acc.boot)
#'
#' #########################################
#' ## Try formula type
#' set.seed(71)
#' acc.3 <- accest(Species~., data = iris, clmeth = "randomForest",
#' pars = pars,tr.idx = tr.idx,ntree = 200)
#' summary(acc.3)
#'
#' ## Try dlist type from dat.sel1
#' set.seed(71)
#' dat2=dat.sel1(dat,cl,pars=pars)
#' acc.4 <- accest(dat2[[1]], clmeth = "randomForest",
#' pars = pars,tr.idx = tr.idx,ntree = 200)
#' summary(acc.4)
#'
#'
#'
accest <- function (...) UseMethod ("accest")
##===========================================================================
accest.dlist<-function(dlist,clmeth,pars=NULL,tr.idx=NULL,...)
{
# call <- match.call()
if (!inherits(dlist, "dlist"))
stop("method is only for dlist objects")
cl=as.factor(dlist$cl)
dat=as.matrix(dlist$dat)
if(is.null(pars)){
if(!is.null(dlist$pars))
pars=dlist$pars
else
pars=valipars()
}
if(!is.null(dlist$tr.idx) & is.null(tr.idx))
tr.idx=dlist$tr.idx
ret <- accest.default(dat, cl, clmeth=clmeth, pars = pars, tr.idx = tr.idx, ...)
return(ret)
}
##===========================================================================
accest.formula <- function (formula, data = NULL, ..., subset,
na.action = na.omit)
{
call <- match.call()
if (!inherits(formula, "formula"))
stop("method is only for formula objects")
m <- match.call(expand.dots = FALSE)
if (identical(class(eval.parent(m$data)), "matrix"))
m$data <- as.data.frame(eval.parent(m$data))
m$... <- NULL
m$scale <- NULL
m[[1]] <- as.name("model.frame")
m$na.action <- na.action
m <- eval(m, parent.frame())
Terms <- attr(m, "terms")
attr(Terms, "intercept") <- 0
x <- model.matrix(Terms, m)
y <- model.extract(m, "response")
ret <- accest.default (x, y, ...)
return (ret)
}
## ======================================================================
## dle: a couple of functions to parse results from classifier.1
#### Parse one value from multiple lists: you know this one!
#' Aggregation of Vectors in Nested Lists
#'
#' Parser to illustrate a possible output of \code{\link{accest}}. The function
#' loop other lists of list and aggregate vectors (single values) contained in
#' component labelled \code{nam}.
#'
#'
#' @usage parse_vec(mlist,nam)
#' @param mlist List of list
#' @param nam Name of the component of interest
#' @author David Enot \email{dle@@aber.ac.uk}.
#' @seealso \code{\link{accest}}, \code{\link{parse_freq}}
#' @keywords manip
#' @examples
#'
#'
#' l1=list(list(v=1),list(v=2),list(v=3))
#' l2=list(list(v=4),list(v=5),list(v=6))
#' res=list(l1,l2)
#'
#' ## show list of list
#' print(res)
#'
#' ## tidy up
#' parse_vec(res,nam="v")
#'
#' ######## Second example
#' ## Real world problem described in accest
#'
#'
parse_vec <- function(mlist,nam){
all.res<-NULL
for(k in 1:length(mlist)){
tmp<-NULL
for(l in 1:length(mlist[[k]])){
lv<-NULL
eval(parse(text=paste("lv=mlist[[k]][[l]]$",nam,sep="")))
tmp=c(tmp,lv)
}
all.res=cbind(all.res,tmp)
}
dimnames(all.res)[[2]]=1:length(mlist)
all.res
}
#### Parse frequency from multiple lists: fairly specific
#' Output Variable Frequencies in Nested Lists
#'
#' Parser to illustrate a possible output of \code{\link{accest}}. The function
#' loop other lists of list and retrieve the frequency of the variable names
#' contained in the component labelled \code{nam}.
#'
#'
#' @usage parse_freq(mlist,nam,sorting=TRUE)
#' @param mlist List of list
#' @param nam Name of the component of interest
#' @param sorting Should the variable name be sorted by frequency?
#' @author David Enot \email{dle@@aber.ac.uk}
#' @seealso \code{\link{accest}}, \code{\link{parse_vec}}
#' @keywords manip
#' @examples
#'
#'
#' l1=list(list(v=1),list(v=c(1,2)),list(v=1))
#' l2=list(list(v=c(1,2,3)),list(v=2),list(v=c(3,4)))
#' res=list(l1,l2)
#'
#' ## show list of list
#' print(res)
#'
#' ## tidy up
#' parse_freq(res,nam="v")
#'
#' ######## Second example
#' ## Real world problem described in accest
#'
#'
#'
parse_freq<-function (mlist, nam, sorting = TRUE)
{
lvar <- lnam <- lfreq <- NULL
for (k in 1:length(mlist)) {
for (l in 1:length(mlist[[k]])) {
lv<-NULL
eval(parse(text = paste("lv=mlist[[k]][[l]]$", nam,
sep = "")))
for (i in lv) {
if (i %in% lvar) {
lfreq[lvar == i] = lfreq[lvar == i] + 1
}
else {
lvar = c(lvar, i)
lfreq = c(lfreq, 1)
if (!is.null(names(lv))) {
lnam = c(lnam, names(lv)[lv == i])
}
}
}
}
}
mat = cbind(lvar, lfreq)
if (!is.null(lnam)) {
dimnames(mat) = list(lnam, c("VarId", "Freq"))
}
if (sorting) {
lso = sort(-mat[, 2], index.return = T)$ix
mat = mat[lso, ]
}
mat
}
###### Classifier function that handles formalized classification methods
### by wll
classifier.0<-function (dat,cl,ltr, clmeth, rs.iter,rs.rep,...)
{
dots <- list(...)
dat.tr <- dat[ltr,,drop=F]
cl.tr <- cl[ltr]
if(length(cl) > length(unique(ltr))){
dat.te <- dat[-unique(ltr),,drop=F]
cl.te <- cl[-unique(ltr)]
}
else{
dat.te<-dat
cl.te<-cl
}
if (hasArg(probability))
dots$probability <- NULL
knn.wrap <- function(train, cl, k = 1, l = 0, ...) list(train = train,
cl = cl, k = k, l = l, ...)
if (clmeth == "knn") {
clmeth <- c("knn.wrap")
predict <- function(x, ...) {
knn(train = x$train, cl = x$cl, k = x$k, l = x$l,
...)
}
}
idx <- which(apply(dat.tr, 2, sd) > .Machine$double.eps)
dat.tr <- dat.tr[, idx, drop = F]
dat.te <- dat.te[, idx, drop = F]
nte <- length(cl.te)
dat.all <- rbind(dat.te, dat.tr)
cl.all <- factor(c(as.character(cl.te), as.character(cl.tr)))
model <- do.call(clmeth, c(list(dat.tr, cl.tr), probability = T,
dots))
pred <- predict(model, dat.all)
if (!is.list(pred)) {
pred.te <- pred[1:nte]
if (clmeth == "svm") {
prob <- attr(predict(model, dat.all, probability = TRUE),
"probabilities")
prob.te <- prob[1:nte, , drop = F]
}
else if (clmeth == "randomForest") {
prob <- predict(model, dat.all, type = "prob")
prob.te <- prob[1:nte, , drop = F]
}
else {
prob.te <- NULL
}
}
else {
if (!is.null(pred$class)) {
pred.te <- pred$class[1:nte]
prob.te <- pred$posterior[1:nte, , drop = F]
}
else {
stop("predict does not return a list with component 'class'.")
}
}
err <- sum(cl.te != pred.te)/length(cl.te)
prob.te <- prob.te[, levels(cl.te), drop = F]
if (!is.null(prob.te)) {
margin <- FIEmspro:::marg(prob.te, cl.te)
if (length(levels(cl.te)) == 2 && length(cl.te) > 1) {
cl.tmp <- cl.te
levels(cl.tmp) <- c(0, 1)
stat <- prob.te[, 2]
auc <- cbind(stat, cl.tmp)
}
else {
auc <- NULL
}
}
else {
margin <- NULL
auc <- NULL
}
ret <- list(err = err, cl = cl.te, pred = pred.te, posterior = prob.te,
margin = margin, auc = auc, mod=NULL, arg = dots)
return(ret)
}
##===========================================================================
###### Classifier function that handles customised classification methods
### by dle
classifier.1<-function (dat,cl,ltr, clmeth, rs.iter,rs.rep, ...)
{
dots <- list(...)
### form training and test data
dat.tr <- dat[ltr,,drop=F]
cl.tr <- cl[ltr]
if(length(cl) > length(unique(ltr))){
dat.te <- dat[-unique(ltr),,drop=F]
cl.te <- cl[-unique(ltr)]
}
else{
dat.te<-dat
cl.te<-cl
}
data=list(tr=dat.tr,cl=cl.tr,te=dat.te,clte=cl.te,ltr=ltr,rs.iter=rs.iter,rs.rep=rs.rep)
### match arguments between clmeth with those contained in dots
argdots<-names(dots)
argfct<-names(formals(clmeth))
### check the common arguments between clmeth and dots
argint<-intersect(argdots,argfct)
### add extra arguments if clmeth allows '...'
if(is.element("...",argfct))
argint<-c(argint,setdiff(argdots,argfct))
### form the list to be passed to clmeth
lpar<-list(data=data)
for(k in argint){
eval(parse(text=paste("lpar$",k,"=dots$",k,sep="")))
}
### end of argument matching
model<-do.call(clmeth, lpar)
prob.te<-model$prob
pred.te<-model$pred
## calculate error rate
err <- sum(cl.te != pred.te)/length(cl.te)
prob.te <- prob.te[,levels(cl.te),drop=F] ## wll-06-07-07: for AUC purpose
if (!is.null(prob.te)){
margin <- FIEmspro:::marg(prob.te, cl.te)
if (length(levels(cl.te))==2 && length(cl.te) > 1) { ## calculate AUC if two-class problem
cl.tmp <- cl.te
levels(cl.tmp) <- c(0,1)
stat <- prob.te[,2]
auc <- cbind(stat,cl.tmp)
} else {
auc <- NULL
}
} else {
margin <- NULL
auc <- NULL
}
ret <- list(err=err,cl=cl.te,pred=pred.te, posterior=prob.te,
margin=margin, auc=auc, mod=model$mod, arg=model$arg)
return(ret)
}
## ======================================================================
## Calculate bootstrap, .632 bootstrap and .632 plus bootstrap error rate
boot.err <- function(err, resub)
{
## apparent error rate/resubstitution rate ( not training error rate)
err.ae <- resub$err
cl <- resub$cl
pred <- resub$pred
## .632 bootstrap error
err.b632 <- 0.368*err.ae + 0.632*err
gamma <- sum(outer(cl, pred, function(x, y) ifelse(x==y, 0, 1) )) /(length(cl)^2)
r <- (err - err.ae)/(gamma - err.ae)
r <- ifelse(err > err.ae & gamma > err.ae, r, 0)
## lwc-16-08-2006: if r still falls outside of [0,1], truncate it to 0.
## if (r > 1 || r < 0) r <- 0
errprime <- min(err, gamma)
## weight <- .632/(1-.368*r)
## err.b632p <- (1-weight)*err.ae + weight*err
err.b632p <- err.b632 + (errprime - err.ae)*(0.368*0.632*r)/(1-0.368*r)
ret <- list(ae=err.ae, boot=err, b632=err.b632, b632p=err.b632p)
return(ret)
}
## ============================================================================
## Claculate the margin of a classifier based on the posterior
## NOTE: 1. This function hacked from package 'randomForest'. For more
## description, see package 'randomForest'.
##
marg <- function(prob, observed) {
if (missing(observed) || missing(prob)) stop("arguments miss")
if(length(observed) != nrow(prob)) stop("lengths differ")
if( any(prob > 1, na.rm=TRUE) ) { ## modif dle 02/10/07
prob <- sweep(prob, 1, rowSums(prob), "/")
}
observed <- as.factor(observed)
mat <- data.frame(prob, observed)
names(mat) <- c(dimnames(prob)[[2]], "observed")
## NOTE-wll: Ater data.frame() operation, the colnames may be changed (if the
## colnames are numbers). The above line is to restore the original colnames.
nlev <- length(levels(observed))
ans <- apply(mat, 1, function(x){
pos <- match(x[nlev+1], names(x));
t1 <- as.numeric(x[pos]);
t2 <- max(as.numeric(x[-c(pos, nlev+1)]));
t1 - t2
})
names(ans) <- observed
ans
}
## ====================================================================
## Gnerates the pairwise data set based on the class label.
#' Generate Pairwise Data Set Based on Class Labels
#'
#' Generate pairwise data set based on class labels.
#'
#' This function is used to provide the data set for the binary combination of
#' the class factor. If \code{choices} is \code{NULL}, the binary combination
#' of for all class labels will be done. If \code{choices} has one class label,
#' the comparisons between this one and any other class are done. If
#' \code{choices} has more than three class lables, enumerate the combinations
#' or permutations of the elements of \code{choices}. For details, see
#' \code{examples} below.
#'
#' @usage dat.sel(dat, cl, choices = NULL)
#' @param dat A data frame or matrix.
#' @param cl A factor or vector of class.
#' @param choices The vector or list of class labels to be chosen for binary
#' classification.
#' @return A list with components: \item{dat}{ Pairwise data set. } \item{cl}{
#' Pairwise class label. } \item{com}{ A matrix of the combinations or
#' permutations of the elements of pairwise vector. }
#' @author Wanchang Lin \email{wll@@aber.ac.uk}
#' @keywords manip
#' @examples
#'
#' data(iris)
#' x <- subset(iris, select = -Species)
#' y <- iris$Species
#'
#' ## generate data set with class "setosa" and "virginica"
#' (binmat.1 <- dat.sel(x,y,choices=c("setosa","virginica")))
#'
#' ## generate data sets for "setosa" vs other classes. These are:
#' ## "setosa" and "versicolor", "setosa" and "virginica".
#' (binmat.2 <- dat.sel(x,y,choices=c("setosa")))
#'
#' ## generate data set with combination of each class. These are:
#' ## "setosa" and "versicolor", "setosa" and "virginica",
#' ## "versicolor" and "virginica"
#' (binmat.3 <- dat.sel(x,y,choices= NULL))
#'
#'
dat.sel <- function(dat, cl, choices = NULL)
{
## ----------------------------------------------------------------------
## ======================================================================
## wll-29-10-2006: combnations is from package gtools.
##
## $Id: combinations.R,v 1.7 2005/06/09 14:20:28 nj7w Exp $
##
## From email by Brian D Ripley <ripley@stats.ox.ac.uk> to r-help
## dated Tue, 14 Dec 1999 11:14:04 +0000 (GMT) in response to
## Alex Ahgarin <datamanagement@email.com>. Original version was
## named "subsets" and was Written by Bill Venables.
##
## ======================================================================
combinations <- function(n, r, v = 1:n, set = TRUE, repeats.allowed=FALSE) {
if(mode(n) != "numeric" || length(n) != 1
|| n < 1 || (n %% 1) != 0) stop("bad value of n")
if(mode(r) != "numeric" || length(r) != 1
|| r < 1 || (r %% 1) != 0) stop("bad value of r")
if(!is.atomic(v) || length(v) < n)
stop("v is either non-atomic or too short")
if( (r > n) & repeats.allowed==FALSE)
stop("r > n and repeats.allowed=FALSE")
if(set) {
v <- unique(sort(v))
if (length(v) < n) stop("too few different elements")
}
v0 <- vector(mode(v), 0)
## Inner workhorse
if(repeats.allowed)
sub <- function(n, r, v){
if(r == 0) v0 else
if(r == 1) matrix(v, n, 1) else
if(n == 1) matrix(v, 1, r) else
rbind(cbind(v[1], Recall(n, r-1, v)), Recall(n-1, r, v[-1]))
}
else
sub <- function(n, r, v){
if(r == 0) v0 else
if(r == 1) matrix(v, n, 1) else
if(r == n) matrix(v, 1, n) else
rbind(cbind(v[1], Recall(n-1, r-1, v[-1])), Recall(n-1, r, v[-1]))
}
sub(n, r, v[1:n])
}
## ---------------------------------------------------------------------
func <- function(choices){
if (is.null(choices)) {
choices <- g
} else {
choices <- unique(choices)
}
i <- pmatch(choices, g)
if (any(is.na(i)))
stop("'choices' should be one of ", paste(g, collapse = ", "))
## Get the binary combinations based on the class labels (package GTOOLS)
if (length(choices) == 1) {
com <- combinations(length(g),2,v=g)
idx <- sapply(1:nrow(com), function(x){
if (match(choices, com[x,],nomatch=0) > 0) return (T) else (F)
})
com <- com[idx,]
} else {
com <- combinations(length(choices),2,v=choices)
}
return(com)
}
## ---------------------------------------------------------------------
if(missing(dat) || missing(cl))
stop(" The data set and/or class label are missing!")
cl <- as.factor(cl)
g <- levels(cl)
if (is.list(choices)) {
com <- lapply(choices, function(x) func(x))
com <- do.call("rbind", com)
com <- unique(com)
} else {
com <- func(choices)
}
## process the data set labels being selected
dat.sub <- list()
cl.sub <- list()
for (i in (1:nrow(com))) {
idx <- (cl==com[i,][1])|(cl==com[i,][2])
cl.sub[[i]] <- cl[idx]
cl.sub[[i]] <- cl.sub[[i]][,drop=T] ## drop the levels
dat.sub[[i]] <- dat[idx,,drop = F]
}
## get comparison names
com.names <- apply(com, 1, paste, collapse="~")
names(dat.sub) <- names(cl.sub) <- com.names
return(list(dat=dat.sub,cl=cl.sub, com=com))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.