neastbenchmark: Benchmark Data for Disease Clusters

Documented in npv

#' Compute negative predictive value
#'
#' Compute the empirical negative predictive value (npv)
#' rate of a method from a series of tests.  In this
#' context, the npv is the average proportion of the
#' population that lies outside the most
#' likely cluster that intersects the true null regoin.  
#' The function requires the null test
#' statistics, the results from the observed data sets
#' (i.e., the maximum test statistic and most likely cluster
#' from each data set), the true hotspot locations, and the
#' vector of population sizes for each region.  See Details.
#'
#' In this context, the npv is the proportion of the
#' population lying outside the most likely cluster that
#' intersects the true null regoins, averaged over all tests.
#'
#' @inheritParams sensitivity
#'
#' @return A vector of npv values.
#' @export
#'
#' @examples
#' tnull = 1:99
#' tdata = list(list(tmax = 96, mlc = c(50, 51)),
#'              list(tmax = 101, mlc = c(48, 57)))
#' npv(tnull, tdata, 50, pop = rep(10, 100))
npv = function(tnull, tdata, hotspot, pop, alpha = c(0.05, 0.01)) {
  if (length(pop) <= 1) {
    stop("pop should be the vector of population sizes for each region. It should have length more than 1.")
  }
  # sort all max statistics
  quants = sort(tnull)
  # determine location of appropriate quantiles
  idx = (length(quants) + 1) * (1 - alpha)
  # initial quantiles
  oq = quants[idx]
  # tmax for each simulated data set
  tmax = sapply(tdata, getElement, name = "tmax")
  # determine if tmax is less than initial quantiles
  nq = sapply(oq, function(x) tmax < x)
  # if they are, those positions need a different quantile
  quants2 = matrix(oq, nrow = length(tdata), 
                   ncol = length(oq), byrow = TRUE)
  wnq = which(nq, arr.ind = TRUE)
  for (i in seq_len(nrow(wnq))) {
    quants2[wnq[i,1], wnq[i,2]] = 
      max(tmax[wnq[i,1]], quants[idx[wnq[i,2]] - 1])
  }
  
  out = apply(quants2, 2, function(x) {
    tmax >= x
  })
  colnames(out) = alpha

  all_regions = seq_along(pop)
  null_region = setdiff(all_regions, hotspot)
  pop_hotspot = sum(pop[hotspot])
  
  p = sapply(seq_along(tdata), function(i) {
    outer = setdiff(all_regions, tdata[[i]]$mlc)
    pop_mlc_complement = sum(pop[outer])
    outer = intersect(outer, null_region)
    pop_outer = sum(pop[outer])
    # proportion of population outside mlc intersecting outer
    temp = out[i, ] * pop_outer/pop_mlc_complement
    # if we didn't reject, we "detected" the entire null region
    temp[-which(out[i,])] = 1
    temp
  })
  
  if (length(p) == length(tdata)) {
    return(mean(p))
  } else {
    return(rowMeans(p))
  }
}