R/nmm_selection_criteria.R

Defines functions nmm_selection_criteria

Documented in nmm_selection_criteria

#' @title nmm_selection_criteria
#' 
#' @description runs a couple of checks which can be used in order to select a model. 
#' 
#' @param x an object of class \code{trainOcc}
#' @param un ...
#' @param nmm the normal mixture (density) models (see also \code{\link[Mclust]{densityMclust}}
#' @param modRow the model in \code{x} used to predict the data with which \code{nmm} has been estimated. If \code{null} the \code{finalModel} (\code{x$finalModel}) is used.
#' @param fnamePlot if given the diagnostic is saved to the specified filename (defualt is \code{NULL})
#' @export
nmm_selection_criteria <- function(x, un=NULL, nmm=NULL, modRow=NULL, fnamePlot=NULL) {
 
  # allow 
  #  #' @param aggregateHop should the hold-out predictions be aggregated (defualt is \code{TRUE})
  aggregateHop=TRUE
  
  ##############################################################################
  ### functions 
  
  ### --------------------------------------------------------------------------
  ### theoretical positive distribution
  getTheo <- function (nmm) {
    idx <- which.max(nmm$parameters$mean)
    ans <- length(nmm$parameters$variance$sigmasq)
    idx.var <- ifelse(ans==1, 1,  ans)
    theo <- list(mean = nmm$parameters$mean[idx],
                 var = nmm$parameters$variance$sigmasq[idx.var],  ### Varianz
                 sd = sqrt(nmm$parameters$variance$sigmasq[idx.var]))
    return(theo) }
  # theo <- getTheo(nmm)
  
  ### --------------------------------------------------------------------------
  ### kruskal wallis test
  myKruskal <- function (pos, un) {
    tst <- kruskal.test(x=c(pos, un), rep(c(1, 0), c(length(pos), length(un)) ) )
    return(c(s.kruskal=tst$statistic[[1]], p.kruskal=tst$p.value[[1]]))
  }
  # myKruskal(pos, un)
  
  ### --------------------------------------------------------------------------
  ### kolmogorov-smirnoff test
  myKs <- function(pos, theo) {
    tst <- ks.test(pos, "pnorm", mean = theo$mean, sd = theo$sd)
    return(c(s.ks=tst$statistic[[1]], p.ks=tst$p.value))
  }
  # myKs(pos, theo)
  
  ### --------------------------------------------------------------------------
  ### z test 
  myZ <- function (pos, theo) {
    tst <- (mean(pos)-theo$mean)/(theo$sd)
    p.z <- pnorm(abs(tst)) - pnorm(-abs(tst))
    # ifelse(pnorm(myz) < 0.025 | pnorm(myz) > 0.975, "Test signifikant für alpha=0.05", "Sorry, not significant")
    return(c(p.z=p.z[[1]]))
  }
  # myZ(pos, theo)
  # mytest <- t.test(pos,  alternative = "two.sided", mu = theo$mean,
  #                 paired = FALSE, var.equal = FALSE, conf.level = 0.95)
  
  ### --------------------------------------------------------------------------
  ### chi-square test
  getChi <- function(pos, theo) max((var(pos)/sqrt(length(pos)))/theo$var, theo$var/(var(pos)/sqrt(length(pos))))  
  myChisq <- function(pos, theo) {
    chi <- getChi(pos, theo)
    p <- pchisq(chi, df=length(pos)-1, lower.tail = FALSE, log.p = FALSE)
    return(c(s.chi=chi,p.chi=p))
  }
  # myChisq(pos, theo)
  
  ### --------------------------------------------------------------------------
  ### error/overlap between the (assumed) positive and all other components
  ### returns the minimum densitiy 
  min_f1f2 <- function(newdata, model) {
    idx <- which.max(nmm$parameters$mean)
    d.mat <- predict(model, newdata, what="cdens")
    pro.mat <- matrix(rep(model$parameters$pro), length(newdata), model$G, byrow=TRUE)
    scaled.d <- d.mat*pro.mat
    rtrn <- pmin(rowSums(scaled.d[, -idx, drop=FALSE]), scaled.d[, idx])
    return(rtrn)
  }
  getOverlap <- function(nmm, theo) {
    rng <- range(qnorm(c(.01, .99), theo$mean, theo$sd))
    integrate(min_f1f2, lower=rng[1], upper=rng[2], model=nmm, subdivisions=100L)
  }
  
  ### --------------------------------------------------------------------------
  ### error/overlap between the theoretical distribution and the empirical
  min_f1f2_PP <- function(newdata, theo, empi) {
    pmin(dnorm(newdata, theo$mean, theo$sd), 
         dnorm(newdata, empi$mean, empi$sd))
  }
  getOverlap_PP <- function(theo, empi) {
    rng <- range(c(qnorm(c(.01, .99), theo$mean, theo$sd), 
                   qnorm(c(.01, .99), empi$mean, empi$sd)))
    integrate(min_f1f2_PP, lower=rng[1], upper=rng[2], theo=theo, empi=empi, subdivisions=1000L)
  }
  
  ### --------------------------------------------------------------------------
  ### 
  plot_theoEmpi <- function(theo, empi) {
    rng <- range(c(qnorm(c(.01, .99), theo$mean, theo$sd), 
                   qnorm(c(.01, .99), empi$mean, empi$sd)))
    x <- seq(rng[1], rng[2], length.out=50)
    plot(x, dnorm(x, theo$mean, theo$sd), xlab="predictive value", ylab="density")
    points(x, dnorm(x, empi$mean, empi$sd), pch=4)
    legend("topright", c("theoretical from nmm", "est. from P"), pch=c(1,4))
  }
  
  ### --------------------------------------------------------------------------
  ### 
  diagnosticPlot <- function(x, nmm, tsts, empi=NULL, ...) {
    
    rng <- nmm$range
    input <- seq(rng[1], rng[2], length.out=101)
    ys <- min_f1f2(input, nmm)
    xs <- c(input, input[1])
    ys <- c(ys, ys[1])
    pred <- predict(nmm, input, what="cdens")
    pred.scl <- pred*NA
    for (i in 1:nmm$G)
      pred.scl[,i] <- pred[,i]*nmm$parameters$pro[i]
    
    if (!is.null(x)) {
      hist(x, predUn=nmm$data, ...)
      lines(input, rowSums(pred.scl), lwd=3)
    } else {
      plot(nmm, what="density", lwd=3, ...)
    }
    polygon(xs, ys, col="gray")
    for (i in 1:nmm$G) {
      idx.sd <- ifelse(length(nmm$parameters$variance$sigmasq)==1, 1, i)
      # lines(input, pred[,i]*nmm$parameters$pro[i], lwd=2, col="darkgrey")
      lines(input, dnorm(input, 
                         mean=nmm$parameters$mean[i], 
                         sd=sqrt(nmm$parameters$variance$sigmasq[idx.sd]) )*
              nmm$parameters$pro[i], col="red", lwd=2 )
    }
    if (!is.null(empi))
      lines(input, dnorm(input, 
                         empi$mean, empi$sd)*nmm$parameters$pro[which.max(nmm$parameters$mean)], 
            lty=3, lwd=2)
    
    legend("topleft", legend=paste(names(tsts), signif(tsts, 3), sep=": "), cex=.8)
  }
  # diagnosticPlot(x, nmm, tsts, empi=empi)
  
  ### end of functions 
  ##############################################################################
  
  
  ### update trainOcc model x if modRow is given, else it is asumed
  if(!is.null(modRow)) {
    # modRow=modelPosition(x)$row
    x <- update(x, modRow=modRow)
  }
  hop <- holdOutPredictions(x)
  
  ### calculate nmm if it is not given
  if (is.null(nmm) & !is.null(un)) {
    nmm <- mclust(un)
  } else if (!is.null(nmm) & !is.null(un)) {
    stop("\"un\" or \"nmm\" must be given.")
  }
  
  if (aggregateHop | !is.list(hop$pos)) {
    hop <- aggregate(hop)
    hop$pos <- list(hop$pos) ### just that we can use the same loop
    hop$un <- list(hop$un)   ### as if aggregateHop=FALSE
  } 
  
  ### here a loop would begin in the bootstrap case
  pos <- hop$pos[[1]]
  un <- un <- hop$un[[1]]
  
  empi <- list(mean=mean(pos), 
               var=var(pos), 
               sd=sd(pos))
  theo <- getTheo(nmm)
  
  ### ------------------------------------
  ### overlaps
  olap <- getOverlap(nmm, theo)
  
  olap.pp <- getOverlap_PP(theo, empi)
  # plot_theoEmpi(theo, empi)
  
  ### ------------------------------------
  ### tests
  tsts <- c(myKruskal(pos, un), 
            myZ(pos, theo), 
            myChisq(pos, theo), 
            myKs(pos, theo), 
            olap=olap$value, 
            olap.pp=olap.pp$value, 
            pro.pos=nmm$parameters$pro[which.max(nmm$parameters$mean)])
  
  
  if (!is.null(fnamePlot)) {
    pdf(fnamePlot)
    diagnosticPlot(x, nmm, tsts, empi=empi)
    dev.off()
  } else {
    diagnosticPlot(x, nmm, tsts, empi=empi)
  }
  return(as.data.frame(matrix(tsts, nrow=1, dimnames=list(NULL, names(tsts)))))
}
benmack/oneClass documentation built on Dec. 15, 2020, 7:38 p.m.