R/hist.trainOcc.R

################################################################################
#' hist.trainOcc
#' @aliases hist
#' 
#' @title Diagnostic distributions plot for a \code{\link{trainOcc}} object.
#'
#' @description Plots the histogram of the predicted unlabeled data together with 
#' boxplots of the hold-out predictions of the positive and unlabeled training data. 
#'
#' @param x an object of class \code{\link{trainOcc}}.
#' @param predUn a vector (or \code{Raster*} object) of predictions of unlabeled data. If \code{NULL}, \code{x$predUn} 
#' is used if it exists; otherwise the unlabeled hold-out predictions are used to build the histogram.
#' @param th a numeric vector of threshold value(s); a vertical line is drawn in the histogram at each value.
#' @param cab for a color-coded histogram, a list with 
#' elements \code{colors} (a vector of n \code{R} colors) and \code{breaks} (a numeric vector of n+1 break points). 
#' @param main a title for the plot. If not given, a title showing the model parameters is constructed.
#' @param ylim the y limits of the plot.
#' @param xlim the x limits of the plot.
#' @param breaks see the identically named argument of \code{\link{hist}}.
#' @param col a colour to be used to fill the bars.
#' @param border the color of the border around the bars.
#' @param add_calBoxplot logical. Should a boxplot of the predicted positive training (calibration) samples be added? Defaults to \code{TRUE}.  
#' @param noWarnRasHist logical. Suppresses the warning issued when the histogram is derived from a raster.
#' @param ... other arguments that can be passed to \code{\link{plot}}. 
#' @return Invisibly, a list with the histogram object (\code{h}) and the y limits used for plotting (\code{ylim}).
#' @method hist trainOcc
#' @examples
#' \dontrun{
#' data(bananas)
#' ### an underfitted model 
#' oc <- trainOcc (x = bananas$tr[, -1], y = bananas$tr[, 1], 
#'                 tuneGrid=expand.grid(sigma=0.1, 
#'                                      cNeg=0.5, 
#'                                      cMultiplier=16))
#' ### predict 10% of the unlabeled data and plot 
#' # the diagnostic distributions plot
#' # and the model in the 2D feature space 
#' set.seed(123)
#' idx.pred <- sample(400*400, 16000)
#' hist(oc, predict(oc, bananas$x[][idx.pred,]), th=0, add_calBoxplot=FALSE)
#' featurespace(oc, th=0)
#' 
#' ### an overfitted model 
#' oc <- trainOcc (x = bananas$tr[, -1], y = bananas$tr[, 1], 
#'                 tuneGrid=expand.grid(sigma=1, 
#'                                      cNeg=32, 
#'                                      cMultiplier=16))
#' ### predict 10% of the unlabeled data and plot 
#' # the diagnostic distributions plot
#' # and the model in the 2D feature space 
#' set.seed(123)
#' idx.pred <- sample(400*400, 16000)
#' hist(oc, predict(oc, bananas$x[][idx.pred,]), th=0)
#' featurespace(oc, th=0)
#' 
#' ### a good model 
#' oc <- trainOcc (x = bananas$tr[, -1], y = bananas$tr[, 1], 
#'                 tuneGrid=expand.grid(sigma=1, 
#'                                      cNeg=0.0625, 
#'                                      cMultiplier=64))
#' ### predict 10% of the unlabeled data and plot 
#' # the diagnostic distributions plot
#' # and the model in the 2D feature space 
#' set.seed(123)
#' idx.pred <- sample(400*400, 16000)
#' pred <- predict(oc, bananas$x[][idx.pred,])
#' hist(oc, pred, th=0)
#' featurespace(oc, th=0)
#' 
#' ### color coded 
#' cab=list(colors=c("#B2182B", "#D6604D", "#F4A582", "#FDDBC7", 
#'                   "#D1E5F0", "#92C5DE", "#4393C3", "#2166AC"),
#'          breaks=c(-max(pred), seq(-1.5, 1.5, .5), max(pred)) )
#' hist(oc, pred, th=0, cab=cab, border=NA)
#' plot(predict(oc, bananas$x), col=cab$colors, breaks=cab$breaks)
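#' 
#' ### the unlabeled predictions can also be passed as a raster 
#' ### (illustrative sketch: 'predRas' is just a name chosen here; 
#' ###  noWarnRasHist=TRUE silences warnings from raster's hist)
#' predRas <- predict(oc, bananas$x)
#' hist(oc, predRas, th=0, noWarnRasHist=TRUE)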
#' }
#' @export
hist.trainOcc <- function(x, predUn=NULL, th=NULL, cab=NULL, main=NULL, 
                          ylim=NULL, breaks='Scott', col="grey", border=NA, 
                          xlim=NULL, add_calBoxplot=TRUE, 
                          noWarnRasHist=FALSE, ...) {
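  # use hold-out predictions stored with the model if available, otherwise aggregate them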
  if (!is.null(x$holdOut$pos) & !is.null(x$holdOut$un)) {
    hop <- list(pos = x$holdOut$pos, un = x$holdOut$un)
  } else {
    hop <- holdOutPredictions(x, aggregate=TRUE)
  }
  
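  # predictions of the positive training samples, shown as the optional calibration boxplot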
  trSet_pos <- x$trainingData[x$trainingData[,ncol(x$trainingData)]=="pos", 
                              -ncol(x$trainingData)]
  pred_tr_pos <- predict(x, trSet_pos)
  
  
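  # values used to build the histogram and default xlim: user-supplied predUn,
  # then x$predUn, then (with a warning) the unlabeled hold-out predictions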
  if (!is.null(predUn)) {
    predictive.value <- predUn
    if (is.null(xlim)) {
      if (.is.raster(predictive.value)) 
      {
        if (!is.finite(predictive.value@data@min) |
            !is.finite(predictive.value@data@max))
          predictive.value <- setMinMax(predictive.value)
        xlim <- c(predictive.value@data@min, 
                  predictive.value@data@max)
      } else {
        xlim <- range(predictive.value, na.rm=T)
      }
    }
  } else if (!is.null(x$predUn)) {
    predictive.value <- x$predUn
    if (is.null(xlim))
      xlim <- range(predictive.value, na.rm=T)
  } else if ( is.null(predUn) & is.null(x$predUn) ) {
    warning('No predicted unlabeled data found.\nUnlabeled hold-out predictions used to build the histogram.')
    predictive.value <- hop$un
    if (is.null(xlim))
      xlim <- range(c(unlist(hop$un), unlist(hop$pos)), na.rm=T)
  }
  
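  # raster's hist() may warn (e.g. because it computes the histogram from a sample
  # of cells); noWarnRasHist=TRUE suppresses warnings for that call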
  if (.is.raster(predictive.value) & noWarnRasHist) {
    oldw <- getOption("warn")
    options(warn = -1)
    h <- hist(predictive.value, plot=FALSE, breaks=breaks, ...)
    options(warn = oldw)
  } else {
    h <- hist(predictive.value, plot=FALSE, breaks=breaks, ...)
  }
  h$xname <- "predictive value"
  
  
  ###############################################################################
  ### set defaults if necessary 
  ### ylim
  if (is.null(ylim)) {
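    # default ylim: do not let the (typically dominant) bars of the unlabeled
    # distribution hide the density in the range of the positive hold-out predictions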
    ans <- boxplot(unlist(hop$pos), plot=FALSE)$stat[1]
    maxInRelevantRange <- max(h$density[h$mids>=ans & is.finite(h$density)])
    # ylim <- .ylimForHist( h, positives=unlist(hop$pos) )
    if (maxInRelevantRange < (max(h$density[is.finite(h$density)]))) {
      ylim <- c(0, maxInRelevantRange*2) # range(h$density)
    } else {
      ylim <- c(0, maxInRelevantRange) # range(h$density)
    }
    if (any(!is.finite(ylim)))
      ylim <- c(0, max( h$density ))
  }
  
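  # default title: tuning parameters of the selected model, its row in the
  # results table, and the number of unlabeled samples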
  if (is.null(main)) {
    parvals <- sapply(x$bestTune, 
                      function(parval) 
                        ifelse(is.numeric(parval), signif(parval,3), as.character(parval)))
    main <- paste(paste(names(x$bestTune), parvals, collapse=" / "), 
                  paste("\nrow", modelPosition(x)$row, "/ #U", length(predictive.value)))
  }
  clrs <- .clrs('PU')
  
  
  ###############################################################################
  ### ### TPR and PPP
  #   prb <- seq(0, 1, .01)
  #   prb.scld <- approx(c(0,1), ylim, prb)$y
  #   percentiles.pnp <- quantile(predUn, prb)
  #   percentiles.pnp.tr.un <- quantile(hop$un, prb)
  #   percentiles.tpr <- quantile(hop$pos, prb)
  
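  # extend ylim below zero to reserve space for the boxplots drawn underneath the histogram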
  ylim[1] <- 0-diff(c(0,ylim[2]))*.15
  if (!is.null(cab) & is.list(cab)) {
    # color-code the bars: color i covers [breaks[i], breaks[i+1]), with the
    # first and last intervals open-ended towards -Inf and Inf, respectively
    col <- rep(NA, length(h$mids))
    for(i in 1:(length(cab$colors))) {
      if (i==1) {
        idx <- h$mids<cab$breaks[i+1]
      } else if (i==length(cab$colors)) {
        idx <- h$mids>=cab$breaks[i]
      } else {
        idx <- h$mids>=cab$breaks[i] & h$mids<cab$breaks[i+1]
      }
      col[idx] <- cab$colors[i]
    }
  }
  plot(h, freq=FALSE, ylim=ylim, xlim=xlim, main=main, col=col, border=border, ...)
  #   legend("topright", c("TPR", "1-PPP (train, U)", "1-PPP (all U)"), 
  #          lwd=c(2,2,2), lty=c(1,1,5), col=c("black", "black", clrs$pos))
  #   
  #   axis(4, at=approx(c(0,1), c(0, ylim[2]), c(0, .25, .5, .75, 1))$y, labels=c(0, .25, .5, .75, 1) )
  #   lines(percentiles.tpr, ylim[2]-prb.scld, lwd=2, col=clrs$pos)
  #   lines(percentiles.pnp.tr.un, ylim[2]-prb.scld, lwd=2)
  #   lines(percentiles.pnp, ylim[2]-prb.scld, lwd=2, lty=5)
  #   
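  # horizontal boxplots below the histogram: predicted positive training (calibration) samples,
  # positive hold-out predictions, and unlabeled hold-out predictions; bxwx is the box width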
  bxwx <- abs(ylim[1])*.75
  if (add_calBoxplot)
    boxplot(pred_tr_pos, frame=FALSE, axes=FALSE, y=0, horizontal=TRUE, 
            at=ylim[1]*.25, add=TRUE, boxwex=bxwx, col="#a6cee3")
  boxplot(unlist(hop$pos), frame=FALSE, axes=FALSE, y=0, horizontal=TRUE, 
          at=ylim[1]*.5, add=TRUE, boxwex=bxwx, col=clrs$pos )
  
  if ( !is.null(hop$un) ) {
    boxplot(unlist(hop$un), y=0, horizontal=TRUE, axes=FALSE, 
            at=ylim[1]*.9, add=TRUE, boxwex=bxwx, col=clrs$un) # , col=cols[1]
  }
  
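  # mark the given threshold(s) with vertical lines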
  if(!is.null(th)){
    for (i in 1:length(th))
      lines(rep(th[i], 2), y=c(0, ylim[2]), lwd=3)
  }
  
  invisible(list(h=h, ylim=ylim))
  
}