#' Estimate a density on the unit interval or unit square via
#' local regression
#'
#' Provide density estimates that are needed by \code{fqvalue} and \code{estimate_fpi0}
#'
#' @param x Either a vector or a 2-column matrix
#' @param transformation Transformation applied before fitting: one of
#' \code{"probit"} (default), \code{"cloglog"} (complementary log-log), or
#' \code{"ident"} (identity; not recommended)
#' @param eval.points Points at which to evaluate the estimate, default x
#' @param subsample Number of points randomly subsampled when computing
#' the fit; useful for computational efficiency and for keeping the
#' density estimation from running out of memory. If NULL, the fit is
#' performed on all points
#' @param cv Whether to use generalized cross-validation to choose the
#' nn (nearest-neighbor) smoothing parameter
#' @param epsilon How close values are allowed to come to 0
#' @param epsilon.max How close values are allowed to come to 1
#' @param maxk maxk argument passed to locfit
#' @param trim In one-dimensional fitting, the very edges often have high
#' variance. This parameter holds the estimate constant on the intervals
#' (0, trim) and (1 - trim, 1), using the value at the nearest interior point.
#' @param ... additional arguments passed to \code{lp} in locfit; used only
#' if \code{cv = FALSE}
#'
#' @import locfit
#' @importFrom magrittr %>%
#'
#' @examples
#' set.seed(12)
#' sim.ttests <- simulate_t_tests(m = 1000)
#' p <- sim.ttests$p
#' z0 <- sim.ttests$n
#' z <- rank(z0) / length(z0)
#' lambda <- 0.3
#' phi <- as.numeric(p > lambda)
#' kernelUnitInterval(z[phi == 1], eval.points = z, cv = FALSE)
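#'
#' # an illustrative two-dimensional call (x given as a 2-column matrix);
#' # the second column here is just a rank transformation of the p-values
#' z2 <- cbind(z, rank(p) / length(p))
#' head(kernelUnitInterval(z2, cv = FALSE))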
#'
#' @export
kernelUnitInterval <- function(x, transformation = "probit",
eval.points = x, subsample = 1e5,
cv = FALSE, epsilon = 1e-15, epsilon.max = .999,
maxk = 100, trim = .02, ...) {
transformation <- match.arg(as.character(transformation),
c("ident", "cloglog", "probit"))
trans <- switch(transformation,
"ident" = identity,
"cloglog" = function(x) -log(-log(x)),
"probit" = qnorm)
inv <- switch(transformation,
"ident" = identity,
"cloglog" = function(x) exp(-exp(-x)),
"probit" = pnorm)
dens <- switch(transformation,
"ident" = function(x) 1,
"cloglog" = function(x) exp(-x) * exp(-exp(-x)),
"probit" = dnorm)
# threshold values away from 0 and 1: replace anything below epsilon with
# epsilon and anything above epsilon.max with epsilon.max
process.vals <- function(vals) {
vals <- pmax(vals, epsilon)
pmin(vals, epsilon.max)
}
x <- process.vals(x)
eval.points <- process.vals(eval.points)
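# fit on the transformed scale, where local regression is not constrained
# by the boundaries of the unit interval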
s <- trans(x)
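# optionally subsample the transformed points to bound memory use and
# computation time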
if (!is.null(subsample)) {
if (is.matrix(s) && subsample < nrow(s)) {
s <- s[sample(nrow(s), subsample), ]
} else if (!is.matrix(s) && subsample < length(s)) {
s <- s[sample(length(s), subsample)]
}
}
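# a locfit density fit (formula with no response): bivariate when x is a
# 2-column matrix, univariate otherwise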
if (is.matrix(s)) {
fitfunc <- function(...) locfit(~ lp(s[, 1], s[, 2], ...), maxk = maxk)
} else {
fitfunc <- function(...) locfit(~ lp(s, ...), maxk = maxk)
}
if (cv) {
# try fitting at several nearest-neighbor bandwidths
# it's possible some will fail (esp for many hypotheses) due to
# insufficient memory
nns <- seq(.1, .9, .1)
gcvs <- sapply(nns, function(nn) {
tryCatch(gcv(fitfunc(nn = nn))["gcv"], error = function(e) NA)
})
if (all(is.na(gcvs))) {
stop(paste("Fitting fails at all attempted bandwidths; try",
"increasing maxk or working with a subsample"))
}
opt.nn <- nns[which.min(gcvs)]
lfit <- fitfunc(nn = opt.nn)
} else {
lfit <- fitfunc(...)
}
# evaluate at the given points
eval.s <- trans(eval.points)
fs.hat <- predict(lfit, newdata = eval.s)
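# back-transform via the change of variables x = inv(s):
# f_X(x) = f_S(s) / inv'(s), where inv'(s) is dens (e.g. dnorm for probit);
# in two dimensions the Jacobian is the product of the coordinate-wise terms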
if (is.matrix(eval.points)) {
corrector <- apply(dens(eval.s), 1, prod)
} else {
corrector <- dens(eval.s)
}
fx.hat <- fs.hat / corrector
if (is.matrix(x)) {
colnames(eval.points) <- c("x1", "x2")
colnames(eval.s) <- c("s1", "s2")
}
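# assemble the result: evaluation points together with density estimates on
# the original scale (fx) and the transformed scale (fs)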
ret <- cbind(x = eval.points, fx = fx.hat, eval.s, fs = fs.hat) %>%
as.data.frame() %>%
tibble::as_tibble()
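# the edges of a one-dimensional fit are high-variance: pin the estimate on
# (0, trim) and (1 - trim, 1) to its value at the nearest interior point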
if (trim > 0 && !is.matrix(x)) {
ret$fx[ret$x < trim] <- ret %>%
dplyr::slice(which.min(abs(x - trim))) %>%
.$fx
ret$fx[ret$x > 1 - trim] <- ret %>%
dplyr::slice(which.min(abs(x - (1 - trim)))) %>%
.$fx
}
attr(ret, "lfit") = lfit
return(ret)
}