# R/FIEmspro_accest.r
#
# Defines functions: accest.default, loopfct, parsfct, print.accest,
# summary.accest, print.summary.accest, plot.accest, accest.dlist,
# parse_vec, boot.err, marg, dat.sel
#
# Documented in: accest.default, accest.dlist, dat.sel, parse_vec,
# plot.accest, print.accest, print.summary.accest, summary.accest

##===========================================================================
## Calculate classification accuracy.
##
## Estimates accuracy (and, when the classifier provides them, margin and
## AUC) of classifier 'clmeth' on (dat, cl) under the resampling scheme in
## 'pars'. 'tr.idx' holds one entry per iteration; each entry is a list of
## training-index vectors (one per replicate). See the roxygen block further
## down this file for the documented argument list and the structure of the
## returned 'accest' object.
##===========================================================================
accest.default <- function(dat, cl, clmeth, pars = NULL,tr.idx = NULL,verb=TRUE,clmpi=NULL,seed=NULL,...)
{

  dots <-list(...)

  ## validity checking
  if (missing(dat) || missing(cl))
    stop("data set or class are missing")
  if (missing(clmeth))
    stop("'clmeth' is missing")
  if (nrow(dat) != length(cl)) stop("mat and cl don't match.")
  if (length(unique(cl)) < 2)
    stop("Classification needs at least two classes.")
  if (any(is.na(dat)) || any(is.na(cl)))
    stop("NA is not permitted in data set or class labels.")

  cl <- as.factor(cl)
  nam=paste(levels(cl),collapse="~")  ## label of the discrimination task, e.g. "a~b"
  dat <- as.matrix(dat)
  n   <- nrow(dat)
  rownames(dat) <- NULL

  ## construct index of train data
  if(is.null(pars))
    pars=valipars(sampling="rand", niter = 1, nreps=10,strat=TRUE)
  if(is.null(tr.idx)){
    ## cross-validation with more folds than samples degenerates to LOOCV
    if (pars$sampling == "cv" && pars$nreps > n ){
      pars$sampling <- "loocv"
    }
    tr.idx <- trainind(cl, pars = pars)
  }
  ## keep pars consistent with the (possibly user-supplied) index structure
  pars$niter    <- length(tr.idx)
  pars$nreps    <- length(tr.idx[[1]])


## Initialize
  res.all  <- pred.all<- list()
  ## everything loopfct needs, bundled so it can be shipped to cluster nodes
  params<-list(dat=dat,cl=cl,tr.idx=tr.idx,verb=verb,seed=seed,clmeth=clmeth,dots=dots)

  if(verb)
     cat("ACCEST Iteration (",pars$niter,"):",sep="")

## Choose between MPI/PVM or single proc
  if(!is.null(clmpi)){
    ## one resampling iteration per (load-balanced) cluster job
    tmp<-clusterApplyLB(clmpi,1:pars$niter,
#       loopfct,params)
       FIEmspro:::loopfct,params)
    for (i in 1:pars$niter) {
      res.all[[i]] <- tmp[[i]]$res.all
      pred.all[[i]] <- tmp[[i]]$pred.all
    }
   }
   else{
     for (i in 1:pars$niter) {
#       tmp<-loopfct(i,params)
       tmp<-FIEmspro:::loopfct(i,params)
       res.all[[i]] <- tmp$res.all
       pred.all[[i]] <- tmp$pred.all
     }
  }
## End  
  if(verb)
    cat("\n")
  names(res.all) <- paste("Iter_",seq(1,pars$niter),sep="")

  ## accurary: err.all holds one column of replicate error rates per iteration
  err.all  <- parse_vec(res.all, "err")
  acc.iter <- 1-apply(err.all,2,mean)

  ## process bootstrap acc: the .632/.632+ estimators also need the
  ## resubstitution error (model fitted and tested on the full data set)
  if (pars$sampling == "boot"){
    plist<-c(list(dat,cl,1:length(cl),clmeth),params$dots,rs.iter=1,rs.rep=1)
    ## no S3 predict method for clmeth -> user-customised wrapper classifier.1
    if(is.null(getS3method("predict",clmeth,optional=TRUE)))
       resub<-do.call(FIEmspro:::classifier.1,plist)
    else
       resub<-do.call(FIEmspro:::classifier.0,plist)
#       resub<-do.call(classifier.1,plist)
#    else
#       resub<-do.call(classifier.0,plist)
    err.boot <- lapply(1-acc.iter, function(x) FIEmspro:::boot.err(x, resub))
    err.boot <- t(sapply(err.boot, function(x) do.call("c", x)))
    acc.boot <- 1 - err.boot
  }

  ## ---------------- overall confusion matrix ----------------------
  ## wll-01-06-2007: Do not use sapply here because sometimes get non-equal fold.
  all.cl <- lapply(res.all, function(x){
    foo <- function(x) lapply(x, function(y) y$cl)
    foo(x)
  })
  all.cl <- unlist(unlist(all.cl, recursive = F,use.names=F))
  
  all.pred <- lapply(res.all, function(x){
    foo <- function(x) lapply(x, function(y) y$pred)
    foo(x)
  })
  all.pred <- unlist(unlist(all.pred, recursive = F,use.names=F))

  ## overall confusion matrix (pooled over all iterations/replicates)
  conf <- table(all.cl, all.pred)

  ## ---------------------------------------------------------------
  ## calculate mean of margin (only when the classifier returned one)
  if (!is.null(res.all[[1]][[1]]$margin)) {
    mar.all <- t(sapply(res.all, function(x) {
      func <- function(x) sapply(x, function(y) mean(y$margin,na.rm=TRUE))
      func(x)
    }))
    mar.iter <- apply(mar.all,1,mean)
    mar      <- mean(mar.iter)
  } else {
    mar.iter <- NULL 
    mar      <- NULL
  }  
  
  ## calculate AUC (two-class tasks only; skipped for LOOCV, where each
  ## replicate holds a single test sample)
  if (!is.null(res.all[[1]][[1]]$auc) && pars$sampling != "loocv") {
      auc.all <- t(sapply(res.all, function(x) {
       func <- function(x) sapply(x, function(y) FIEmspro:::auc(y$auc[,1],y$auc[,2]-1))
       func(x)
    })) 
    auc.iter <- apply(auc.all,1,mean)
    auc      <- mean(auc.iter)
  } else {
    auc.iter <- NULL 
    auc      <- NULL
  }  
  ## ---------------------------------------------------------------

########## dle addons 
## process arguments -> new component in returned list
## ("ntree=200,..." string keeping a trace of what was passed to clmeth)
if(length(dots)==0)
 argfct="default"
else
 argfct=paste(paste(names(unlist(dots)),unlist(dots),sep="="),collapse=",")

## process all $mod into one list -> new component in returned list if not null
all.mod<-list()
if(!is.null(res.all[[1]][[1]]$mod)){
 for(k in 1:length(res.all)){
   tmp<-list()
    for(l in 1:length(res.all[[k]])){
      tmp[[l]]=res.all[[k]][[l]]$mod
    }
    all.mod[[k]]=tmp
 }
}

#####

  ## construct output
  ret <- list(clmeth   = clmeth,
              acc      = mean(acc.iter),
              acc.iter = acc.iter,
              mar      = mar,
              mar.iter = mar.iter,
              auc      = auc,
              auc.iter = auc.iter,
              sampling = switch(pars$sampling,
                            "loocv"  = "leave-one-out cross-validation",
                            "cv"     = "cross validation",
                            "boot"   = "bootstrap",
                            "rand"   = "randomised validation (holdout)"
                         ),
               niter   = pars$niter,
               nreps   = pars$nreps,
               conf    = conf,
               argfct   = argfct,   ### added by dle to keep trace of the arguments passed to the clmeth
               mod   = all.mod,   ### added by dle to keep elements in $mod
               pred.all = pred.all, ### added by dle to keep all predictions
               cl.task = nam, ### added by dle to keep trace of the classification task
               pars = pars,
               pars.mini = parsfct(pars)
              )
   if (pars$sampling == "boot") {
    ## ret$err.boot <- err.boot
    ret$acc.boot <- acc.boot
   }

   class (ret) <- "accest"
   return(ret)
}

####
## Function to be called during parallelisation (one call = one resampling
## iteration). For each replicate j it fits 'clmeth' on the training rows and
## tests on the remaining rows, via classifier.0 when an S3 predict method
## exists for clmeth, otherwise via the customised wrapper classifier.1.
## Returns list(res.all = per-replicate classifier results,
##              pred.all = flat record of fold id (fo), true class (cl),
##                         predicted class (pr), test-sample indices (lsamp)
##                         and, for two-class tasks, AUC statistic/labels).
## 'params' is the list assembled in accest.default.
loopfct<-function(i,params){

    if(params$verb)
      cat(" ", i, sep = "")
    flush.console()   ## for Windows
    train.ind <- params$tr.idx[[i]]
    dat<-params$dat
    cl<-params$cl
    res <- list()
    tmp<-list(cl=NULL,pr=NULL,fo=NULL,auc=NULL,cl2=NULL,lsamp=NULL)
    for (j in 1:length(train.ind)) {
      ltr<-train.ind[[j]]                   ## training rows
      lte<-(1:length(cl))[-train.ind[[j]]]  ## held-out test rows
      ## per-iteration/per-replicate seed: reproducible also across workers
      if(!is.null(params$seed))
        set.seed(params$seed+1000*i+j)

      plist<-c(list(dat,cl,ltr,params$clmeth,rs.iter=i,rs.rep=j),params$dots)
      if(is.null(getS3method("predict",params$clmeth,optional=TRUE)))
#        res[[j]]<-do.call(classifier.1,plist)
#      else
#        res[[j]]<-do.call(classifier.0,plist)
        res[[j]]<-do.call(FIEmspro:::classifier.1,plist)
      else
        res[[j]]<-do.call(FIEmspro:::classifier.0,plist)
      tmp$fo=c(tmp$fo,rep(j,length(lte)))
      tmp$cl=c(tmp$cl,as.character(cl[-ltr]))
      tmp$pr=c(tmp$pr,as.character(res[[j]]$pred))
      tmp$lsamp=c(tmp$lsamp,lte)
      if(nlevels(cl)==2){
        tmp$auc=c(tmp$auc,res[[j]]$auc[,1])
        tmp$cl2=c(tmp$cl2,res[[j]]$auc[,2])
      }
      else
        tmp$auc<-tmp$cl2<-NULL
    }
    ## restore factor coding (classes were accumulated as character above)
    tmp$cl <- factor(tmp$cl,levels=levels(cl))
    tmp$pr <- factor(tmp$pr,levels=levels(cl))
    list(res.all=res,pred.all=tmp)
 }
## End

## Fct to summarise pars: build a compact label for the resampling settings,
## e.g. "loocv", "NSt-10-cv", "2xSt-5-boot" or "St-3-rand-0.67".
## St/NSt = stratified / non-stratified; a "NxX" prefix appears when more
## than one iteration was requested; a trailing number is pars$div rounded
## to 2 decimal places (when present).
parsfct <- function(pars) {
  if (pars$sampling == "loocv") {
    label <- "loocv"
  } else {
    strat.tag <- if (pars$strat) "St" else "NSt"
    prefix <- if (pars$niter > 1) paste0(pars$niter, "x", strat.tag) else strat.tag
    label <- paste0(prefix, "-", pars$nreps, "-", pars$sampling)
  }
  if (!is.null(pars$div)) {
    label <- paste0(label, "-", round(pars$div, 2))
  }
  label
}
####

##========================================================================
## Print method for 'accest' objects: classifier/arguments/task header,
## resampling summary, rounded accuracy (plus AUC and margin only when they
## were computed) and the pooled confusion matrix. Returns 'x' invisibly.
print.accest <- function(x, digits=3,...) {

  cat("Method:\t\t", x$clmeth)
  cat("\nArguments:\t", x$argfct)
  cat("\nDiscrimination:\t", x$cl.task)


  cat("\n\nNo. of iteration:\t", x$niter)
  cat("\nSampling:\t\t", x$sampling)
  cat("\nNo. of replications:\t", x$nreps)
  cat("\n\nAccuracy:\t\t", round(x$acc,digits))
  if (!is.null(x$auc)) 
    cat("\nAUC:\t\t\t", round(x$auc,digits))
  if (!is.null(x$mar)) 
    cat("\nMargin:\t\t\t", round(x$mar,digits))

  cat("\n\nOverall confusion matrix of training data:\n")
  print(x$conf)
  
  invisible(x)
}


##========================================================================
## Summary method for 'accest': re-tag the object so that
## print.summary.accest (which adds the per-iteration accuracies) is
## dispatched on printing. No other component is touched.
summary.accest <- function(object, ...) {
  class(object) <- "summary.accest"
  object
}

##========================================================================
## Print method for 'summary.accest': full accest printout followed by the
## per-iteration accuracies. Returns 'x' invisibly.
print.summary.accest <- function(x, digits=3,...) 
{
  ## forward 'digits' so the header figures honour the caller's setting
  ## (previously print.accest() silently fell back to its own default)
  print.accest(x, digits = digits)
  cat("\nAccuracy on each iteration:\n")
  print(round(x$acc.iter,digits))
  invisible(x)
}

## =======================================================================
#### Modifications by dle to allow mar, acc and auc to be plotted



#' Plot Method for Class 'accest'
#' 
#' Plots accuracy, margin or AUC at each iteration.
#' 
#' This function is a method for the generic function \code{plot()} for class
#' \code{accest}. It plots a performance estimate against iteration index.
#' 
#' @usage \method{plot}{accest}(x, toplot="acc", main = NULL, xlab = NULL,
#' ylab = NULL, \dots{})
#' @param x An object of class \code{accest}.
#' @param toplot Quantity to plot (default=acc, mar, auc)
#' @param main An overall title for the plot.
#' @param xlab A title for the x axis.
#' @param ylab A title for the y axis.
#' @param \dots Additional arguments to the plot.
#' @author David Enot and Wanchang Lin \email{dle,wll@@aber.ac.uk}.
#' @seealso \code{\link{accest}}, \code{\link{accest}}
#' @keywords hplot
#' @examples
#' 
#' # Iris data
#' data(iris)
#' x <- as.matrix(subset(iris, select = -Species))
#' y <- iris$Species
#' pars <- valipars(sampling="cv", niter=10, nreps=5, strat=TRUE)
#' 
#' acc  <- accest(x,y, clmeth = "randomForest", pars = pars, ntree=50)
#' acc
#' ## plot accuracies
#' plot(acc)
#' 
#' ## plot margins
#' plot(acc,toplot="mar")
#' 
plot.accest <- function(x, toplot="acc", main = NULL, xlab = NULL, ylab = NULL, ...)
{
  ## A single value per resampling run cannot be plotted as a trend.
  if (x$niter == 1)
    stop("Number of iteration (niter) must be greater than 1")

  if (is.null(main))
    main <- paste("Performance of `",x$clmeth, "'", sep=" ","(",x$sampling,")" )

  if (is.null(xlab)) xlab <- "Iterations"
  if (is.null(ylab)) ylab <- paste("Indicator:",toplot)

  ## Look the per-iteration vector (acc.iter / mar.iter / auc.iter) up by
  ## name. The previous eval(parse(text=...)) built and parsed R code at run
  ## time, which is fragile and defeats static analysis; it also failed with
  ## an obscure error when the component did not exist.
  vect <- x[[paste(toplot, ".iter", sep = "")]]
  if (is.null(vect))
    stop("component '", toplot,
         ".iter' not found: 'toplot' should be one of 'acc', 'mar' or 'auc'")

  plot(vect, type = "b", main=main, xlab=xlab, ylab=ylab, col="blue", ...)
}

##===========================================================================


#' Classification Wrapper Using Customised Classifiers
#' 
#' Wrapper function for calculating classification estimates using pre-defined
#' data partitioning sets (\code{\link{valipars}} and \code{\link{trainind}}).
#' This function works with two types of classifiers. First, generic
#' classifiers that fulfil R standards to define predictive techniques, such
#' as the ones available in packages like \pkg{MASS}, \pkg{e1071},
#' \code{randomForest} and \code{nlda}, are normally handled with
#' \code{accest}: the name of the function
#' (\code{clmeth} in the \code{accest} call) must be accompanied with an S3
#' method \code{predict}; the latter function should return a list with
#' component 'class' (hard classification) and if possible 'prob' or
#' 'posterior' for class probabilities. If the algorithm doesn't fulfil these
#' requirements, two positions can be adopted: 1) define the algorithm
#' explicitly so that it meets R standards, or 2) define a customised
#' function that returns the necessary information. The second ('quick and
#' dirty') approach is illustrated in an example given below.  Unless the
#' classifier can only cope
#' with two-class tasks, this function allows the manipulation of any problem
#' complexity. Three types of estimates are given for each replication:
#' accuracy, so-called margin and AUC (see details). Data input can be in the
#' form of \code{data} matrix + \code{class} vector, following the classic
#' formula type or derived from \code{\link{dat.sel1}}.
#' 
#' See \code{\link{valipars}} and \code{\link{trainind}} for common details
#' of the resampling settings.
#' 
#' @aliases accest accest.formula accest.default accest.dlist print.accest
#' summary.accest print.summary.accest
#' @usage accest(\dots{})
#' 
#' \method{accest}{default}(dat, cl, clmeth, pars = NULL, tr.idx = NULL,
#' verb=TRUE, clmpi=NULL, seed=NULL, \dots{})
#' 
#' \method{accest}{formula}(formula, data = NULL, \dots{}, subset, na.action =
#' na.omit)
#' 
#' \method{accest}{dlist}(dlist, clmeth, pars = NULL, tr.idx = NULL, \dots{})
#' @param formula A formula of the form \code{groups ~ x1 + x2 + \dots{}} That
#' is, the response is the grouping factor and the right hand side specifies
#' the (non-factor) discriminators.
#' @param data Data frame from which variables specified in \code{formula} are
#' preferentially to be taken.
#' @param dlist A matrix or data frame containing the explanatory variables if
#' no formula is given as the principal argument.
#' @param dat A matrix or data frame containing the explanatory variables if no
#' formula is given as the principal argument.
#' @param cl A factor specifying the class for each observation if no formula
#' principal argument is given.
#' @param clmeth Classifier function.  For details, see \code{note} below.
#' 
#' @param pars A list of parameters using by the resampling method such as
#' \emph{Leave-one-out cross-validation}, \emph{Cross-validation},
#' \emph{Bootstrap} and \emph{Randomised validation (holdout)}.  See
#' \code{\link{valipars}} for details.
#' @param tr.idx User defined index of training samples. Can be generated by
#' \code{trainind}.
#' @param verb Should iterations be printed out?
#' @param clmpi snow cluster information
#' @param seed Seed.
#' @param \dots Additional parameters to be passed to \code{clmeth}.
#' @param subset Optional vector, specifying a subset of observations to be
#' used.
#' @param na.action Function which indicates what should happen when the data
#' contains \code{NA}'s, defaults to \code{\link{na.omit}}.
#' @return An object of class \code{accest}, including the components:
#' \item{clmeth}{Classification method used.} \item{acc}{Average accuracy.}
#' \item{acc.iter}{Accuracy at each iteration.} \item{acc.std}{Standard
#' derivation of accuracy.} \item{mar}{Average predictive margin.}
#' \item{mar.iter}{Predictive margin of each iteration.} \item{auc}{Average
#' area under receiver operating curve (AUC).} \item{auc.iter}{AUC of each
#' iteration.} \item{sampling}{Sampling scheme used.} \item{niter}{Number of
#' iterations.} \item{nreps}{Number of replications at each iteration. }
#' \item{acc.boot}{Detailed bootstrap accuracy estimates when bootstrap
#' validation method is employed.} \item{argfct}{Arguments passed to the
#' classifier. } \item{pred.all}{For each iteration, list of the fold/bootstrap
#' id and the true and predicted classes.} \item{cl.task}{Discrimination task.}
#' \item{mod}{List of information return by the user defined classifier
#' function. }
#' @author David Enot \email{dle@@aber.ac.uk} and Wanchang Lin
#' \email{wll@@aber.ac.uk}
#' @seealso \code{\link{valipars}}, \code{\link{trainind}}
#' @keywords classif
#' @examples
#' 
#' 
#' ## -----------------------------------------------------------------
#' ## simple customised function
#' ## sameasrf simply reproduces the RF modelling task
#' sameasrf <- function(data,...){
#'   dots <- list(...)
#' 
#'   ## Build RF model and predict dat.te 
#'   mod  <- randomForest(data$tr,data$cl,...)
#'   ## Soft predictions (optional if ROC/margin analyses required)
#'   prob <- predict(mod,data$te,type="vote")
#'   ## Hard predictions 
#'   pred <- predict(mod,data$te)
#' 
#'   # For illustration, mod does not contain anything 
#'   list(mod=NULL,pred=pred,prob=prob,arg=dots)
#' }
#' 
#' ## -----------------------------------------------------------------
#' ## compare accest with randomForest 
#' ## and sameasrf
#' data(iris)
#' dat=as.matrix(iris[,1:4])
#' cl=as.factor(iris[,5])
#' pars   <- valipars(sampling = "boot",niter = 2, nreps=10)
#' tr.idx <- trainind(iris$Species,pars)
#' 
#' set.seed(71)
#' acc.1 <- accest(dat,cl, clmeth = "sameasrf", 
#'                    pars = pars,tr.idx = tr.idx,ntree = 200)
#' summary(acc.1)
#' 
#' set.seed(71)
#' acc.2 <- accest(dat,cl, clmeth = "randomForest", 
#'                    pars = pars,tr.idx = tr.idx,ntree = 200)
#' summary(acc.2)
#' 
#' ### compare acc.1 and acc.2 bootstrap error estimates
#' print(acc.1$acc.boot-acc.2$acc.boot)
#' 
#' #########################################
#' ## Try formula type
#' set.seed(71)
#' acc.3 <- accest(Species~., data = iris, clmeth = "randomForest", 
#'                    pars = pars,tr.idx = tr.idx,ntree = 200)
#' summary(acc.3)
#' 
#' ## Try dlist type from dat.sel1
#' set.seed(71)
#' dat2=dat.sel1(dat,cl,pars=pars)
#' acc.4 <- accest(dat2[[1]], clmeth = "randomForest", 
#'                    pars = pars,tr.idx = tr.idx,ntree = 200)
#' summary(acc.4)
#' 
#' 
#' 
accest <- function (...) UseMethod ("accest")

##===========================================================================
## accest method for 'dlist' objects (as produced by dat.sel1): unpacks the
## data matrix and class factor (and any resampling settings stored in the
## dlist) and delegates to accest.default.
accest.dlist <- function(dlist, clmeth, pars = NULL, tr.idx = NULL, ...)
{
  if (!inherits(dlist, "dlist"))
    stop("method is only for dlist objects")

  cl  <- as.factor(dlist$cl)
  dat <- as.matrix(dlist$dat)

  ## fall back to the settings embedded in the dlist, then to the
  ## valipars() defaults
  if (is.null(pars)) {
    if (!is.null(dlist$pars))
      pars <- dlist$pars
    else
      pars <- valipars()
  }
  ## a user-supplied tr.idx takes precedence over the one stored in the
  ## dlist (scalar condition: use '&&', not the vectorised '&')
  if (!is.null(dlist$tr.idx) && is.null(tr.idx))
    tr.idx <- dlist$tr.idx

  ret <- accest.default(dat, cl, clmeth = clmeth, pars = pars, tr.idx = tr.idx, ...)
  return(ret)
}

##===========================================================================
## Formula interface for accest: builds the model frame and an
## intercept-free design matrix, then delegates to accest.default.
accest.formula <- function (formula, data = NULL, ..., subset, 
                            na.action = na.omit)
{
  call <- match.call()
  if (!inherits(formula, "formula")) 
    stop("method is only for formula objects")
  m <- match.call(expand.dots = FALSE)
  ## allow a plain matrix to be passed as 'data' by coercing to a data frame
  if (identical(class(eval.parent(m$data)), "matrix"))
    m$data <- as.data.frame(eval.parent(m$data))
  ## strip arguments that model.frame() does not understand, then rebuild
  ## the call as model.frame(formula, data, subset, na.action)
  m$... <- NULL
  m$scale <- NULL
  m[[1]] <- as.name("model.frame")
  m$na.action <- na.action
  m <- eval(m, parent.frame())
  Terms <- attr(m, "terms")
  attr(Terms, "intercept") <- 0     ## no intercept column in the design matrix
  x <- model.matrix(Terms, m)
  y <- model.extract(m, "response")
  ret <- accest.default (x, y, ...)
  return (ret)
}



## ======================================================================
## dle: a couple of functions to parse results from classifier.1

#### Parse one value from multiple lists: you know this one!


#' Aggregation of Vectors in Nested Lists
#' 
#' Parser to illustrate a possible output of \code{\link{accest}}. The function
#' loop other lists of list and aggregate vectors (single values) contained in
#' component labelled \code{nam}.
#' 
#' 
#' @usage parse_vec(mlist,nam)
#' @param mlist List of list
#' @param nam Name of the component of interest
#' @author David Enot \email{dle@@aber.ac.uk}.
#' @seealso \code{\link{accest}}, \code{\link{parse_freq}}
#' @keywords manip
#' @examples
#' 
#' 
#' l1=list(list(v=1),list(v=2),list(v=3))
#' l2=list(list(v=4),list(v=5),list(v=6))
#' res=list(l1,l2)
#' 
#' ## show list of list
#' print(res)
#' 
#' ## tidy up
#' parse_vec(res,nam="v")
#' 
#' ######## Second example
#' ## Real world problem described in accest
#' 
#' 
parse_vec <- function(mlist,nam){
  ## Aggregate the scalar component 'nam' from a two-level list-of-lists
  ## into a matrix: one column per outer element (iteration), one row per
  ## inner element (replicate); columns are named "1".."length(mlist)".
  all.res<-NULL
  for(k in 1:length(mlist)){
      tmp<-NULL
      for(l in 1:length(mlist[[k]])){
         ## direct (exact-name) component lookup; the previous
         ## eval(parse(text=...)) built and parsed R code at run time,
         ## which is fragile and unnecessary here
         lv<-mlist[[k]][[l]][[nam]]
         tmp=c(tmp,lv)
       }
      all.res=cbind(all.res,tmp)
  }
  dimnames(all.res)[[2]]=1:length(mlist)
  all.res
}

#### Parse frequency from multiple lists: fairly specific


#' Output Variable Frequencies in Nested Lists
#' 
#' Parser to illustrate a possible output of \code{\link{accest}}. The function
#' loop other lists of list and retrieve the frequency of the variable names
#' contained in the component labelled \code{nam}.
#' 
#' 
#' @usage parse_freq(mlist,nam,sorting=TRUE)
#' @param mlist List of list
#' @param nam Name of the component of interest
#' @param sorting Should the variable name be sorted by frequency?
#' @author David Enot \email{dle@@aber.ac.uk}
#' @seealso \code{\link{accest}}, \code{\link{parse_vec}}
#' @keywords manip
#' @examples
#' 
#' 
#' l1=list(list(v=1),list(v=c(1,2)),list(v=1))
#' l2=list(list(v=c(1,2,3)),list(v=2),list(v=c(3,4)))
#' res=list(l1,l2)
#' 
#' ## show list of list
#' print(res)
#' 
#' ## tidy up
#' parse_freq(res,nam="v")
#' 
#' ######## Second example
#' ## Real world problem described in accest
#' 
#' 
#' 
parse_freq<-function (mlist, nam, sorting = TRUE) 
{
    ## Count, over a two-level list-of-lists, how often each value appears
    ## in component 'nam'; returns a two-column matrix (value, frequency),
    ## optionally sorted by decreasing frequency. When the vectors carry
    ## names, these are used as row names ("VarId"/"Freq" columns).
    lvar <- lnam <- lfreq <- NULL
    for (k in 1:length(mlist)) {
        for (l in 1:length(mlist[[k]])) {
            ## direct (exact-name) component lookup; the previous
            ## eval(parse(text=...)) built and parsed R code at run time
            lv <- mlist[[k]][[l]][[nam]]
            for (i in lv) {
                if (i %in% lvar) {
                  lfreq[lvar == i] = lfreq[lvar == i] + 1
                }
                else {
                  ## first sighting: register the value (and its name)
                  lvar = c(lvar, i)
                  lfreq = c(lfreq, 1)
                  if (!is.null(names(lv))) {
                    lnam = c(lnam, names(lv)[lv == i])
                  }
                }
            }
        }
    }
    mat = cbind(lvar, lfreq)
    if (!is.null(lnam)) {
        dimnames(mat) = list(lnam, c("VarId", "Freq"))
    }
    if (sorting) {
        lso = sort(-mat[, 2], index.return = T)$ix
        mat = mat[lso, ]
    }
    mat
}

###### Classifier function that handles formalized classification methods
### by wll
##
## Fits 'clmeth' (given by name) on the training rows 'ltr' of (dat, cl),
## predicts the held-out rows and returns error rate, predictions,
## posteriors, margins and (two-class tasks only) the AUC statistic. Used by
## accest for classifiers that follow the usual R fit/predict convention.
## rs.iter/rs.rep identify the resampling iteration/replicate; they are part
## of the shared classifier.0/classifier.1 signature (not used here).
classifier.0<-function (dat,cl,ltr, clmeth, rs.iter,rs.rep,...) 
{

    dots <- list(...)
    
    ## training set; test set = rows not in ltr, or the whole data set when
    ## ltr already covers every row (i.e. resubstitution)
    dat.tr <- dat[ltr,,drop=F]
    cl.tr  <- cl[ltr]
    if(length(cl) > length(unique(ltr))){
      dat.te <- dat[-unique(ltr),,drop=F]
      cl.te  <- cl[-unique(ltr)]      
    }
    else{
      dat.te<-dat
      cl.te<-cl
    }

    ## 'probability' is forced to TRUE in the fit call below, so drop any
    ## user-supplied copy to avoid a duplicated argument
    if (hasArg(probability)) 
        dots$probability <- NULL
    ## knn has no separate fit step: "fitting" just stores the arguments and
    ## the local predict() runs knn() itself
    knn.wrap <- function(train, cl, k = 1, l = 0, ...) list(train = train, 
        cl = cl, k = k, l = l, ...)
    if (clmeth == "knn") {
        clmeth <- c("knn.wrap")
        predict <- function(x, ...) {
            knn(train = x$train, cl = x$cl, k = x$k, l = x$l, 
                ...)
        }
    }

    ## drop columns that are constant on the training set
    idx <- which(apply(dat.tr, 2, sd) > .Machine$double.eps)
    dat.tr <- dat.tr[, idx, drop = F]
    dat.te <- dat.te[, idx, drop = F]
    nte <- length(cl.te)
    ## predict on test+train stacked together; only the first nte rows (the
    ## test set) are used below
    dat.all <- rbind(dat.te, dat.tr)
    cl.all <- factor(c(as.character(cl.te), as.character(cl.tr)))
    model <- do.call(clmeth, c(list(dat.tr, cl.tr), probability = T, 
        dots))
    pred <- predict(model, dat.all)
    if (!is.list(pred)) {
        ## predict() returned a bare class vector: fetch class
        ## probabilities by classifier-specific means where known
        pred.te <- pred[1:nte]
        if (clmeth == "svm") {
            prob <- attr(predict(model, dat.all, probability = TRUE), 
                "probabilities")
            prob.te <- prob[1:nte, , drop = F]
        }
        else if (clmeth == "randomForest") {
            prob <- predict(model, dat.all, type = "prob")
            prob.te <- prob[1:nte, , drop = F]
        }
        else {
            prob.te <- NULL
        }
    }
    else {
        ## list-returning predict(): expect $class and $posterior
        if (!is.null(pred$class)) {
            pred.te <- pred$class[1:nte]
            prob.te <- pred$posterior[1:nte, , drop = F]
        }
        else {
            stop("predict does not return a list with component 'class'.")
        }
    }
    err <- sum(cl.te != pred.te)/length(cl.te)
    ## align posterior columns with the class levels (needed for the AUC)
    prob.te <- prob.te[, levels(cl.te), drop = F]
    if (!is.null(prob.te)) {
        margin <- FIEmspro:::marg(prob.te, cl.te)
        if (length(levels(cl.te)) == 2 && length(cl.te) > 1) {
            ## two-class task: keep P(second class) and the 0/1-recoded
            ## labels so that accest can compute the AUC afterwards
            cl.tmp <- cl.te
            levels(cl.tmp) <- c(0, 1)
            stat <- prob.te[, 2]
            auc <- cbind(stat, cl.tmp)
        }
        else {
            auc <- NULL
        }
    }
    else {
        margin <- NULL
        auc <- NULL
    }
    ret <- list(err = err, cl = cl.te, pred = pred.te, posterior = prob.te, 
        margin = margin, auc = auc, mod=NULL, arg = dots)
    return(ret)

}

##===========================================================================
###### Classifier function that handles customised classification methods
### by dle
##
## Like classifier.0, but for user-written wrappers: 'clmeth' names a
## function that receives a single 'data' list (tr, cl, te, clte, ltr,
## rs.iter, rs.rep) and returns list(mod, pred, prob, arg). Only the
## elements of '...' that match clmeth's formal arguments (or all of them,
## when clmeth declares '...') are forwarded.
classifier.1<-function (dat,cl,ltr, clmeth, rs.iter,rs.rep, ...) 
{

    dots <- list(...)

### form training and test data (test = rows not in ltr, or the whole data
### set when ltr already covers every row, i.e. resubstitution)
    dat.tr <- dat[ltr,,drop=F]
    cl.tr  <- cl[ltr]
    if(length(cl) > length(unique(ltr))){
      dat.te <- dat[-unique(ltr),,drop=F]
      cl.te  <- cl[-unique(ltr)]      
    }
    else{
      dat.te<-dat
      cl.te<-cl
    }
    data=list(tr=dat.tr,cl=cl.tr,te=dat.te,clte=cl.te,ltr=ltr,rs.iter=rs.iter,rs.rep=rs.rep)

### match arguments between clmeth with those contained in dots
   argdots<-names(dots)
   argfct<-names(formals(clmeth))

### check the common arguments between clmeth and dots
   argint<-intersect(argdots,argfct)
### add extra arguments if clmeth allows '...'
   if(is.element("...",argfct))
      argint<-c(argint,setdiff(argdots,argfct))
### form the list to be passed to clmeth
   lpar<-list(data=data)
   for(k in argint){
      ## plain list assignment; the previous eval(parse(text=...)) built
      ## and parsed R code at run time, which breaks for non-syntactic
      ## argument names and defeats static analysis
      lpar[[k]] <- dots[[k]]
   }
### end of argument matching

  model<-do.call(clmeth, lpar)

  prob.te<-model$prob
  pred.te<-model$pred

  ## calculate error rate
  err <- sum(cl.te != pred.te)/length(cl.te)
  prob.te <- prob.te[,levels(cl.te),drop=F]   ## wll-06-07-07: for AUC purpose
  if (!is.null(prob.te)){
    margin <- FIEmspro:::marg(prob.te, cl.te)
    if (length(levels(cl.te))==2  && length(cl.te) > 1) { ## calculate AUC if two-class problem
      cl.tmp <- cl.te
      levels(cl.tmp) <- c(0,1)
      stat <- prob.te[,2]
      auc <- cbind(stat,cl.tmp)
    } else {
      auc <- NULL
    }
  } else {
    margin <- NULL
    auc <- NULL
  }

  ret <- list(err=err,cl=cl.te,pred=pred.te, posterior=prob.te,
              margin=margin, auc=auc, mod=model$mod, arg=model$arg)
  return(ret)
}

## ======================================================================
## Bootstrap error estimators: plain bootstrap, .632 and .632+ (Efron &
## Tibshirani). 'err' is the out-of-sample bootstrap error for one
## iteration; 'resub' holds the resubstitution fit on the full data set
## (components err, cl, pred).
boot.err <- function(err, resub)
{
  ae.rate   <- resub$err    ## apparent error rate / resubstitution rate
  obs.cl    <- resub$cl     ## true labels on the full data set
  fitted.cl <- resub$pred   ## resubstitution predictions

  ## classic .632 estimator: fixed blend of apparent and bootstrap error
  b632 <- 0.368*ae.rate + 0.632*err

  ## no-information error rate gamma: expected misclassification when
  ## labels and predictions are paired completely at random
  gamma.hat <- sum(outer(obs.cl, fitted.cl,
                         function(x, y) ifelse(x == y, 0, 1))) / (length(obs.cl)^2)

  ## relative overfitting rate, forced to 0 outside its valid region
  overfit <- (err - ae.rate)/(gamma.hat - ae.rate)
  overfit <- ifelse(err > ae.rate & gamma.hat > ae.rate, overfit, 0)
  ## lwc-16-08-2006: if overfit still falls outside of [0,1], truncate to 0.

  ## cap the bootstrap error at the no-information rate for the .632+ term
  capped.err <- min(err, gamma.hat)

  b632p <- b632 + (capped.err - ae.rate)*(0.368*0.632*overfit)/(1 - 0.368*overfit)

  list(ae = ae.rate, boot = err, b632 = b632, b632p = b632p)
}


## ============================================================================
## Calculate the margin of a classifier based on the posterior.
## NOTE: 1. This function hacked from package 'randomForest'. For more 
##       description, see package 'randomForest'.
##
## margin of sample i = P(observed class) - max P(other classes), computed
## from the posterior matrix 'prob' (rows = samples, columns = class levels)
## and the observed labels. Positive margin = correct, confident call.
##
## NOTE(review): apply() below coerces the mixed data frame to a character
## matrix and the values are recovered with as.numeric(); this character
## round-trip is how the original code behaves and is kept as is.
marg <- function(prob, observed) {

  if (missing(observed) || missing(prob)) stop("arguments miss")
  if(length(observed) != nrow(prob)) stop("lengths differ")

  ## rows carry counts/votes rather than probabilities: normalise to sum 1
  if( any(prob > 1, na.rm=TRUE) ) {  ## modif dle 02/10/07
    prob <- sweep(prob, 1, rowSums(prob), "/")
  }
  observed <- as.factor(observed)
  
  mat <- data.frame(prob, observed)
  names(mat) <- c(dimnames(prob)[[2]], "observed")
  ## NOTE-wll: After data.frame() operation, the colnames may be changed (if the 
  ## colnames are numbers). The above line is to restore the original colnames.
  
  nlev <- length(levels(observed))
  
  ans <- apply(mat, 1, function(x){
    pos <- match(x[nlev+1], names(x));             ## column of the observed class
    t1  <- as.numeric(x[pos]);                     ## posterior of the true class
    t2  <- max(as.numeric(x[-c(pos, nlev+1)]));    ## best rival posterior
    t1 - t2 
  })

  names(ans) <- observed
  ans
}

## ====================================================================  
## Gnerates the pairwise data set based on the class label.


#' Generate Pairwise Data Set Based on Class Labels
#' 
#' Generate pairwise data set based on class labels.
#' 
#' This function is used to provide the data set for the binary combination of
#' the class factor. If \code{choices} is \code{NULL}, the binary combination
#' of for all class labels will be done. If \code{choices} has one class label,
#' the comparisons between this one and any other class are done. If
#' \code{choices} has more than three class lables, enumerate the combinations
#' or permutations of the elements of \code{choices}. For details, see
#' \code{examples} below.
#' 
#' @usage dat.sel(dat, cl, choices = NULL)
#' @param dat A data frame or matrix.
#' @param cl A factor or vector of class.
#' @param choices The vector or list of class labels to be chosen for binary
#' classification.
#' @return A list with components: \item{dat}{ Pairwise data set.  } \item{cl}{
#' Pairwise class label.  } \item{com}{ A matrix of the combinations or
#' permutations of the elements of pairwise vector.  }
#' @author Wanchang Lin \email{wll@@aber.ac.uk}
#' @keywords manip
#' @examples
#' 
#' data(iris)
#' x <- subset(iris, select = -Species)
#' y <- iris$Species
#'  
#' ## generate data set with class "setosa" and "virginica"
#' (binmat.1 <- dat.sel(x,y,choices=c("setosa","virginica")))
#' 
#' ## generate data sets for "setosa" vs other classes. These are: 
#' ## "setosa" and "versicolor", "setosa" and "virginica".
#' (binmat.2 <- dat.sel(x,y,choices=c("setosa")))
#' 
#' ## generate data set with combination of each class. These are:  
#' ## "setosa" and "versicolor", "setosa" and "virginica",  
#' ## "versicolor" and "virginica" 
#' (binmat.3 <- dat.sel(x,y,choices= NULL))
#' 
#' 
dat.sel <- function(dat, cl, choices = NULL) 
{
  ## ----------------------------------------------------------------------
  ## ======================================================================
  ## wll-29-10-2006: combinations is from package gtools.
  ## 
  ## $Id: combinations.R,v 1.7 2005/06/09 14:20:28 nj7w Exp $
  ##
  ## From email by Brian D Ripley <ripley@stats.ox.ac.uk> to r-help
  ## dated Tue, 14 Dec 1999 11:14:04 +0000 (GMT) in response to
  ## Alex Ahgarin <datamanagement@email.com>.  Original version was
  ## named "subsets" and was written by Bill Venables.  
  ##
  ## All size-r combinations drawn from v[1:n] (one combination per row);
  ## with repeats.allowed=TRUE, elements may be reused within a combination.
  ## ======================================================================
  combinations <- function(n, r, v = 1:n, set = TRUE, repeats.allowed=FALSE) {
    if(mode(n) != "numeric" || length(n) != 1 
       || n < 1 || (n %% 1) != 0) stop("bad value of n") 
    if(mode(r) != "numeric" || length(r) != 1 
       || r < 1 || (r %% 1) != 0) stop("bad value of r") 
    if(!is.atomic(v) || length(v) < n) 
      stop("v is either non-atomic or too short")
    if( (r > n) & repeats.allowed==FALSE)
      stop("r > n and repeats.allowed=FALSE")
    if(set) {
      v <- unique(sort(v))
      if (length(v) < n) stop("too few different elements")
    }
    v0 <- vector(mode(v), 0)
    
    ## Inner workhorse (recursive: fix v[1], recurse on the remainder)
    if(repeats.allowed)
      sub <- function(n, r, v){ 
        if(r == 0) v0 else
        if(r == 1) matrix(v, n, 1) else
        if(n == 1) matrix(v, 1, r) else
        rbind(cbind(v[1], Recall(n, r-1, v)), Recall(n-1, r, v[-1]))
      }
    else
      sub <- function(n, r, v){ 
        if(r == 0) v0 else
        if(r == 1) matrix(v, n, 1) else
        if(r == n) matrix(v, 1, n) else
        rbind(cbind(v[1], Recall(n-1, r-1, v[-1])), Recall(n-1, r, v[-1]))
      }
  
    sub(n, r, v[1:n])
  }

  ## ---------------------------------------------------------------------
  ## Expand 'choices' into a two-column matrix of class-label pairs:
  ##   NULL        -> all pairwise combinations of the levels of cl
  ##   one label   -> all pairs involving that label
  ##   >=2 labels  -> all pairs within the given labels
  func <- function(choices){
    if (is.null(choices)) {
      choices <- g
    } else {
      choices <- unique(choices)
    }
    
    i <- pmatch(choices, g)
    if (any(is.na(i)))
      stop("'choices' should be one of ", paste(g, collapse = ", "))
  
    ## Get the binary combinations based on the class labels (package GTOOLS)
    if (length(choices) == 1) {
      com <- combinations(length(g),2,v=g) 
      ## keep only the pairs that contain the single requested label
      idx <- sapply(1:nrow(com), function(x){
        if (match(choices, com[x,],nomatch=0) > 0) return (T) else (F)
      })
      com <- com[idx,]
    } else {
      com <- combinations(length(choices),2,v=choices) 
    }
    return(com)
  }
  ## ---------------------------------------------------------------------

  if(missing(dat) || missing(cl)) 
    stop(" The data set and/or class label are missing!")
  cl <- as.factor(cl)
  g  <- levels(cl)

  ## a list of 'choices' yields the union of the pairs each entry generates
  if (is.list(choices)) {
    com <- lapply(choices, function(x) func(x))
    com <- do.call("rbind", com)
    com <- unique(com)
  } else { 
    com <- func(choices)
  }
  
  ## process the data set labels being selected: one data/class subset per
  ## pair of class labels
  dat.sub <- list()
  cl.sub  <- list()
  for (i in (1:nrow(com))) {
    idx      <- (cl==com[i,][1])|(cl==com[i,][2])
    cl.sub[[i]]  <- cl[idx]
    cl.sub[[i]]  <- cl.sub[[i]][,drop=T]     ## drop the levels
    dat.sub[[i]] <- dat[idx,,drop = F]     
  }

  ## get comparison names, e.g. "setosa~virginica"
  com.names  <- apply(com, 1, paste, collapse="~") 
  names(dat.sub) <- names(cl.sub) <- com.names

  return(list(dat=dat.sub,cl=cl.sub, com=com))
}
# wilsontom/FIEmspro documentation built on May 4, 2019, 6:28 a.m.