Nothing
#' High Probability Lower Bounds (HPLB) for the Total Variation Distance (TV) Based on Finite Samples
#'
#' Implementations of different HPLBs for TV as described in (Michel et al., 2020).
#'
#' @param t a numeric vector value corresponding to a natural ordering of the observations. For a two-sample
#' test 0-1 numeric values values should be provided.
#' @param rho a numeric vector value providing an ordering. This could be
#' a binary classifier, a regressor, a witness function from a MMD kernel or anything else that would witness a distributional difference.
#' @param s a numeric vector value giving split points on t.
#' @param estimator.type a character value indicating which estimator to use. One option out of:
#' \itemize{
#' \item{\code{adapt}:}{adaptive binary classification estimator (asymptotic bounding function)}
#' \item{\code{bayes}:}{binary classification estimator}
#' \item{\code{bayes_finite_sample}:}{binary classification finite sample estimator}
#' \item{\code{adapt_empirical}:}{adaptive binary classification estimator (simulation-based bounding function)}
#' \item{\code{adapt_custom}:}{adaptive binary classificatrion estimator (user-defined bounding function)}
#' \item{\code{adapt_dwit}:}{adaptive binary classificatrion estimator (for distributional witnesses estimation)}
#' }
#' @param alpha a numeric value giving the overall type-I error control level.
#' @param tv.seq a sequence of values between 0 and 1 used as the grid search for the total variation distance in case of tv-search.
#' @param custom.bounding.seq a list of bounding functions respecting the order of tv.seq used in case of estimator.type "custom-tv-search".
#' @param direction a character vector value made of "left" or "right" giving which distribution witness count to estimate (t<=s or t>s?).
#' @param cutoff a numeric value. This is the cutoff used if bayes estimators are used. The theory suggests to use 1/2 but this can be changed.
#' @param verbose.plot a boolean value for additional plots.
#' @param seed an integer value. The seed for reproducibility.
#' @param ... additional parameters for the function \code{empiricalBF}.
#'
#' @return a \code{list} containing the relevant lower bounds estimates. For the total variation distance the relevant entry is \code{tvhat}.
#'
#' @import data.table
#'
#' @author Loris Michel, Jeffrey Naef
#'
#' @references
#' L. Michel, J. Naef and N. Meinshausen (2020). High-Probability Lower Bounds for the Total Variation Distance \cr
#'
#' @examples
#' ## libs
#' library(HPLB)
#' library(ranger)
#' library(distrEx)
#'
#' ## reproducibility
#' set.seed(0)
#'
#' ## Example 1: TV lower bound based on two samples (bayes estimator), Gaussian mean-shift example
#'
#' n <- 100
#' means <- rep(c(0,2), each = n / 2)
#' x <- stats::rnorm(n, mean = means)
#' t <- rep(c(0,1), each = n / 2)
#'
#' bayesRate <- function(x) {
#' return(stats::dnorm(x, mean = 2) /
#' (stats::dnorm(x, mean = 2) + stats::dnorm(x, mean = 0)))
#' }
#'
#' # estimated HPLB
#' tvhat <- HPLB(t = t, rho = bayesRate(x), estimator.type = "bayes")
#' # true TV
#' TotalVarDist(e1 = Norm(2,1), e2 = Norm(0,1))
#'
#' ## Example 2: optimal mixture detection (adapt estimator), Gaussian mean-shift example
#'
#' n <- 100
#' mean.shift <- 2
#' t.train <- runif(n, 0 ,1)
#' x.train <- ifelse(t.train>0.5, stats::rnorm(n, mean.shift), stats::rnorm(n))
#' rf <- ranger::ranger(t~x, data.frame(t=t.train,x=x.train))
#'
#' n <- 100
#' t.test <- runif(n, 0 ,1)
#'x.test <- ifelse(t.test>0.5, stats::rnorm(n, mean.shift), stats::rnorm(n))
#' rho <- predict(rf, data.frame(t=t.test,x=x.test))$predictions
#'
#' ## out-of-sample
#' tv.oos <- HPLB(t = t.test, rho = rho, s = seq(0.1,0.9,0.1), estimator.type = "adapt")
#'
#' \donttest{
#' ## total variation values
#' tv <- c()
#' for (s in seq(0.1,0.9,0.1)) {
#'
#' if (s<=0.5) {
#'
#' D.left <- Norm(0,1)
#' } else {
#'
#' D.left <- UnivarMixingDistribution(Dlist = list(Norm(0,1),Norm(mean.shift,1)),
#' mixCoeff = c(ifelse(s<=0.5, 1, 0.5/s), ifelse(s<=0.5, 0, (s-0.5)/s)))
#' }
#' if (s < 0.5) {
#'
#' D.right <- UnivarMixingDistribution(Dlist = list(Norm(0,1),Norm(mean.shift,1)),
#' mixCoeff = c(ifelse(s<=0.5, (0.5-s)/(1-s), 0), ifelse(s<=0.5, (0.5/(1-s)), 1)))
#' } else {
#'
#' D.right <- Norm(mean.shift,1)
#' }
#' tv <- c(tv, TotalVarDist(e1 = D.left, e2 = D.right))
#' }
#'
#' ## plot
#' oldpar <- par(no.readonly =TRUE)
#' par(mfrow=c(2,1))
#' plot(t.test,x.test,pch=19,xlab="t",ylab="x")
#' plot(seq(0.1,0.9,0.1), tv.oos$tvhat,type="l",ylim=c(0,1),xlab="t", ylab="TV")
#' lines(seq(0.1,0.9,0.1), tv, col="red",type="l")
#' par(oldpar)
#' }
#' @export
HPLB <- function(t,
rho,
s = 0.5,
estimator.type = "adapt",
alpha = 0.05,
tv.seq = seq(from = 0, to = 1, by = 1/length(t)),
custom.bounding.seq = NULL,
direction = rep("left", length(s)),
cutoff = 0.5,
verbose.plot = FALSE,
seed = 0,
...) {
## parameters checks
# seed check
if (!is.null(seed) && (!is.numeric(seed) || any(!is.finite(seed)) || length(seed) != 1)) {
stop("Invalid seed, please see ?HPLB.")
}
# setting seed for reproducility
if (!is.null(seed)) {
set.seed(seed)
}
# estimator.type check
if (!is.character(estimator.type) || !(estimator.type %in% c("adapt",
"bayes",
"bayes_finite_sample",
"adapt_empirical",
"adapt_dwit"
))) {
stop("Invalid estimator type, please see ?HPLB.")
}
# t check
if (!is.numeric(t) || any(!is.finite(t)) || is.matrix(t)) {
stop("Invalid values or type for t, please see ?HPLB.")
}
# s check
if (!is.numeric(s) || any(!is.finite(s)) || any(s > max(t)) || any(s < min(t)) || is.matrix(s)) {
stop("Invalid values or type for s, please see ?HPLB.")
}
# rho check
if (!is.numeric(rho) || any(!is.finite(rho))) {
stop("Invalid values for rho, please see ?HPLB.")
}
# two cases depending on the dimension of rho
if (is.matrix(rho)) {
if (nrow(rho) != length(t)) {
stop("Incompatible dimensions between t and rho, please see ?HPLB.")
}
if (ncol(rho) != length(s)) {
stop("Incompatible dimensions between s and rho, please see ?HPLB.")
}
ordering.type = "multiple"
} else {
if (length(rho) != length(t)) {
stop("Incompatible dimensions between t and rho, please see ?HPLB.")
}
ordering.type = "single"
}
# ordering.type check
if (estimator.type == "bayes" && ordering.type == "multiple") {
stop("Invalid ordering type for binomial-test estimator, please see ?HPLB.")
}
# alpha check
if (!is.numeric(alpha) || (length(alpha) != 1) || (alpha > 1) || (alpha <= 0)) {
stop("Invalid type I error level alpha, please see ?HPLB.")
}
# tv.seq check
if (!is.numeric(tv.seq) || any(!is.finite(tv.seq)) || any(tv.seq > 1) || any(tv.seq < 0) || any(duplicated(tv.seq))) {
stop("Invalid grid for tv, please see ?HPLB.")
}
# direction check
if ((estimator.type == "adapt_dwit") && (!is.character(direction) || !(direction %in% c("left", "right")) || (length(s) != length(direction)))) {
stop("Invalid direction, please see ?HPLB.")
}
# check that with user-defined bounding sequence or empirical-tv-seach s should be uni-dim
if (estimator.type %in% c("adapt_empirical", "adapt_custom")) {
if (length(s) != 1) {
stop("Incompatible value of s and estimator.type, please see ?HPLB.")
}
if (estimator.type == "adapt_custom") {
if (is.null(custom.bounding.seq)) {
stop("Invalid custom.bounding.seq, please see ?HPLB.")
} else {
if (length(custom.bounding.seq) != length(tv.seq)) {
stop("Incompatible dimensions between custom.bounding.seq and tv.seq, please see ?HPLB.")
}
}
}
if (estimator.type %in% c("adapt_empirical")) {
custom.bounding.seq <- empiricalBF(tv.seq = tv.seq,
alpha = alpha,
m = sum(t <= s),
n = sum(t > s), ...)
}
# dimensions of s, k and tresholds check
if (length(s) != length(k)) {
stop("Incompatible dimensions between s, and k, please see ?HPLB.")
}
}
## core of the code
# sort in increasing order the grid of tv value candidates
tv.seq <- sort(tv.seq, decreasing = FALSE)
# define the returned values
dwithat.left <- rep(NA, length(s))
dwithat.right <- rep(NA, length(s))
tvhat.left <- rep(NA, length(s))
tvhat.right <- rep(NA, length(s))
tvhat <- rep(NA, length(s))
# main loop, iteration over the different splits
for (k in 1:length(s)) {
# first we obtain the ranks from the rho
if (ordering.type == "single") {
ranks <- rank(rho, ties.method = "random")
} else {
ranks <- rank(rho[,k], ties.method = "random")
}
# then we cast everything in a data.table object
ranks.table <- data.table::data.table(rank = ranks, t = t)
# let us order by rank the table
data.table::setkey(ranks.table, "rank")
# getting the parameters m and n
m <- nrow(ranks.table[t <= s[k]])
n <- nrow(ranks.table) - m
if (m == 0 || n == 0) {
warning(paste0("m or n is zero for split ", s, ", please see ?HPLB."))
next
}
## binary classifer (finite sample version of bayes)
if (estimator.type == "bayes_finite_sample") {
# compute the decisions with randomization when rho is exactly 0.5
decision <- ifelse(rho == 0.5,
sample(c(TRUE, FALSE),
size = length(rho), replace = TRUE),
rho > cutoff)
# compute the rightly classified number of points per class
obs0 <- sum(!decision & t == 0)
obs1 <- sum(decision & t == 1)
# accuracy lower bound for class 0
phat0 <- stats::binom.test(x = obs0,
n = sum(t == 0),
p = 0.5,
alternative = "greater",
conf.level = 1 - (alpha / 2))$conf.int[1]
# accuracy lower bound for class 1
phat1 <- stats::binom.test(x = obs1,
n = sum(t == 1),
p = 0.5,
alternative = "greater",
conf.level = 1 - (alpha / 2))$conf.int[1]
# compute the estimated tvhat LB value
tvhat <- max(phat0 + phat1 - 1, 0)
}
## binary classifier (bayes from the paper)
if (estimator.type == "bayes") {
# compute the decisions
decision <- ifelse(rho == 0.5,
sample(c(TRUE, FALSE),
size = length(rho), replace = TRUE),
rho > cutoff)
# accuracy for class 0
Ahat0 <- sum(!decision & t == 0) / sum(t==0)
# accuracy for class 1
Ahat1 <- sum(decision & t == 1) / sum(t==1)
# compute the asymptotic standard deviation
sigma <- sqrt(Ahat0 * (1-Ahat0) / sum(t==0) + Ahat1 * (1-Ahat1) / sum(t==1))
# compute the tvhat value
tvhat <- max(Ahat0 + Ahat1 - 1 - stats::qnorm(1-alpha) * sigma, 0)
}
## estimators based on adaptive binary classification (adapt)
# define the V function corresponding to m and z (and n and z respectively)
V.left <- cumsum(ranks.table$t <= s[k])
V.right <- cumsum(rev(ranks.table$t > s[k]))
if (estimator.type == "adapt") {
# recursive search
max.stat <- 1
step <- 1
while (max.stat > 0 ) {
# search updates
tv.cur <- tv.seq[step]
step <- step + 1
# upper-bound dWit left and right
dWit.left <- stats::qbinom(p = 1 - (alpha / 3), size = m, prob = tv.cur)
dWit.right <- stats::qbinom(p = 1 - (alpha / 3), size = n, prob = tv.cur)
# left over hypergeometric parameters m0 and n0 in the middle
m0 <- m - dWit.left
n0 <- n - dWit.right
if ((m0 <= 2) | (n0 <= 2)) {
warning(paste0("m0 or n0 smaller or equal to 2 for split: ",
s[k], ",
asymtotic regime may not hold, see ?HPLB."))
break
}
# asymptotic parameters
x.alpha <- -log(-log(1-alpha/3)/2)
beta.left <- sqrt(2 * log(log(m0))) + (log(log(log(m0)))-log(pi)+2*x.alpha) / (2 * sqrt(2 * log(log(m0))))
beta.right <- sqrt(2 * log(log(n0))) + (log(log(log(n0)))-log(pi)+2*x.alpha) / (2 * sqrt(2 * log(log(n0))))
# mean and sd under the null (asymptotics)
sd0 <- sapply(1:(m0+n0), function(z) sqrt(z * (m0 /(m0+n0)) * (n0 /(m0+n0)) * ((m0+n0-z)/(m0+n0-1))))
mean0 <- sapply(1:(m0+n0), function(z) z * (m0 /(m0+n0)))
# creating the bounding function
Q <- rep(0, m + n)
if (dWit.left != 0) {
Q[1:dWit.left] <- 1
Q <- cumsum(Q)
}
# hypergeometric in the middle
ind.hypergeom <- c((dWit.left+1):(m+n-dWit.right))
Q[ind.hypergeom] <- Q[ind.hypergeom] + mean0 + beta.left * sd0
# witnesses right
Q[(m+n-dWit.right):(m+n)] <- m
Q <- pmin(Q, m)
max.stat <- max(V.left-Q)
if (verbose.plot && (tv.cur == 0)) {
graphics::plot(V.left - mean0, type="l",
main="Observed centered V-function with asymptotic bounding function", xlab="z", ylab="V(z)",
font.main=1,font.lab=1,font.axis=1)
graphics::lines(beta.left * sd0, col="blue")
}
if (tv.cur == max(tv.seq)){
break
}
}
# return the tv LB estimate
tvhat[k] <- tv.cur
} else if (estimator.type %in% c("adapt_empirical", "adapt_custom")) {
# recursive search
max.stat <- 1
step <- 1
while (max.stat > 0 & step != length(tv.seq)) {
tv.cur <- tv.seq[step]
max.stat <- max(V.left - custom.bounding.seq[[step]])
step <- step + 1
if (verbose.plot && (tv.cur == 0)) {
graphics::plot(V.left - mean0, type="l",
main="Observed centered V-function with custom bounding function",xlab="z",ylab="V(z)",
font.main=1,font.lab=1,font.axis=1)
graphics::lines(beta.left * sd0, col="blue")
}
}
# return the tv LB estimate
tvhat[k] <- tv.cur
} else if (estimator.type == "adapt_wit") {
if (direction[k] == "left") {
m <- nrow(ranks.table[t <= s[k]])
n <- nrow(ranks.table) - m
} else if (direction[k] == "right") {
n <- nrow(ranks.table[t <= s[k]])
m <- nrow(ranks.table) - n
}
# recursive search default values
max.stat <- 1
step <- 1
dwit.left.grid <- c(0:m)
while (max.stat > 0) {
# search updates
dwit.left <- dwit.left.grid[step]
step <- step + 1
# overestimating the tv for the number of right dwit
tv.cur <- stats::binom.test(x = dwit.left,
n = m,
p = 0.5,
alternative = "less",
conf.level = 1 - (alpha / 3))$conf.int[2]
# dwit right
dwit.right <- stats::qbinom(p = 1 - (alpha / 3),
size = n,
prob = tv.cur)
# define mo and no for the hypergeometric part
m0 <- m - dwit.left
n0 <- n - dwit.right
# asymptotic values
x.alpha <- -log(-log(1-alpha/3)/2)
beta.left <- sqrt(2 * log(log(m0))) + (log(log(log(m0)))-log(pi)+2*x.alpha) / (2 * sqrt(2 * log(log(m0))))
beta.right <- sqrt(2 * log(log(n0))) + (log(log(log(n0)))-log(pi)+2*x.alpha) / (2 * sqrt(2 * log(log(n0))))
# mean and sd under the null (asymptotics)
sd0 <- sapply(1:(m0+n0), function(z) sqrt(z * (m0 /(m0+n0)) * (n0 /(m0+n0)) * ((m0+n0-z)/(m0+n0-1))))
mean0 <- sapply(1:(m0+n0), function(z) z * (m0 /(m0+n0)))
# creating the bounding function
Q <- rep(0, m + n)
# adding witnesses left
if (dwit.left != 0) {
Q[1:dwit.left] <- 1
Q <- cumsum(Q)
}
# hypergeometric filling in the middle
ind.hypergeom <- c((dwit.left+1):(m+n-dwit.right))
Q[ind.hypergeom] <- Q[ind.hypergeom] + mean0 + beta.left * sd0
# adding witnesses right
Q[(m+n-dwit.right):(m+n)] <- m
Q <- pmin(Q, m)
if (direction[k] == "left") {
max.stat <- max(V.left - Q)
} else if (direction[k] == "right") {
max.stat <- max(V.right - Q)
}
}
# updating the value for the split depending on the direction
if (direction[k] == "left") {
dwithat.left[k] <- dwit.left
} else if (direction[k] == "right") {
dwithat.right[k] <- dwit.left
}
}
}
# return a final list with relevant LB estimates
return(Filter(x = list(dwithat.left = dwithat.left,
dwithat.right = dwithat.right,
tvhat.left = tvhat.left,
tvhat.right = tvhat.right,
tvhat = tvhat),
f = function(ll) !any(is.na(ll))))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.