R/etc.R

#' Expected Loss of the Threshold Classifier.
#' 
#' @description The function offers a method to select variables by univariate filtering based on the 
#' estimated loss of the optimal univariate threshold classifer. No parametric assumption about the 
#' class conditional distributions is required.
#' 
#' @param class a factor vector indicating the class membership of the instances. Must have exactly two levels.
#' @param data a data frame with variables in columns.
#' @param oc a vector containing three elements. oc[1], the cost of misclassifying a negative instance, oc[2], the cost 
#' of missclassifying a positive instance, and oc[3], the share of negative instances in the population.
#' @param positive a character object indicating the factor label of the positive class.
#' @param p.val a logical indicating whether the p-values of etc values under the null hypothesis that both classes are equal
#'  should be calculated. The exact null distribution is calculated by means of a recursive algorithm.
#' @param adj.method a character string indicating the method with which to correct the p-values for multiple testing. See 
#' ?p.adjust.
#' @param plot a logical. If TRUE a plot of the null distribution will be generated.
#' 
#' @return a list containing three components:
#' \item{etc}{ a numerical vector containing the etc values for every variable of dat.}
#' \item{p.val}{ the corresponding p-values of etc. (optional)}
#' \item{p.val.adj}{ the corresponding adjusted p-values of etc. (optional)}
#' 
#' @examples 
#' oc <- c(1,3,0.5)
#' class <- factor(c(rep(0, 25), rep(1, 25)), labels=c("neg", "pos"))
#' data <- data.frame("var1"=c(rnorm(25, 0, 1/2), rnorm(25, 1, 2)))
#' res <- etc(class, data, oc, positive="pos", p.val=TRUE)
#' 
#' @export

etc <- function(class, data, oc, positive=levels(class)[1], p.val=TRUE, plot=FALSE, adj.method="BH") {
  
  if (is.null(dim(data))) data <- as.data.frame(data)
  if (!positive%in%levels(class)) stop("positive class doesn't exist")
  if (length(class)!=dim(data)[1]) stop("Dimensions of class and data do not correspond. Verify that features are in columns.")
  if (plot==TRUE & p.val==FALSE) stop("In order to plot the result p.val must be set to TRUE.")
  
  pos <- class==positive
  neg <- !pos
  p <- ifelse(class(data)=="numeric", 1, ncol(data))
  if (class(data)=="numeric") data <- as.data.frame(data)
  epe <- vector(mode="numeric", length=p)
  names(epe) <- colnames(data)
  p.value <- vector(length=p)	
  
  npos <- as.numeric(table(class)[positive])
  nneg <- as.numeric(table(class)[levels(class)[!levels(class)==positive]])
  
  for (i in 1:p) {
    
    ord <- order(data[,i])
    class.ord <- class[ord]
    feat.ord <- data[ord,i]
    
    fp1 <- c(0,cumsum(as.numeric(class.ord==positive)))
    fn1 <- c(nneg, nneg - cumsum(as.numeric(!class.ord==positive)))
    fp2 <- c(npos, npos - cumsum(as.numeric(class.ord==positive)))
    fn2 <- c(0,cumsum(as.numeric(!class.ord==positive)))
    
    epe[i] <- min(min(fp1/npos*oc[2]*(1-oc[3]) + fn1/nneg*oc[1]*oc[3]), min(fp2/npos*oc[2]*(1-oc[3]) + fn2/nneg*oc[1]*oc[3]))
  }
  
  # generate p-values
  if (p.val) {
    ND <- etc.genND(nneg, npos, oc[1], oc[2], oc[3])			
    p.val <- cumsum(as.numeric(ND$fav.perm/ND$pos.perm))[match(round(epe,4), round(ND$val,4))]
    
    if (plot) {
      plot(stats::stepfun(ND$val, c(cumsum(as.numeric(ND$fav.perm/ND$pos.perm)),1)), main="Cumulative Null Distribution of ETC", xlab="EPE", ylab="", pch=".")
      hist.info <- hist(epe, breaks=ND$val, plot=FALSE)
      points(hist.info$mids[hist.info$count!=0], hist.info$counts[hist.info$count!=0]/sum(hist.info$counts[hist.info$count!=0]), col="red", pch=20)
      text(hist.info$mids[hist.info$count!=0], y=hist.info$counts[hist.info$count!=0]/sum(hist.info$counts[hist.info$count!=0]), labels=hist.info$counts[hist.info$count!=0], col="red")
    }
    
  } else { p.value <- NULL }
  
  return(list("etc"=epe, "p.val"=p.val, "p.val.adj"=p.adjust(p.val, method=adj.method)))
  
}
SchroederFabian/CVOC documentation built on May 9, 2019, 1:18 p.m.