#' @title Homogeneity test for several polyspherical samples
#'
#' @description Permutation tests for the equality of distributions of two or
#' \eqn{k} samples of data on \eqn{\mathcal{S}^{d_1} \times \cdots \times
#' \mathcal{S}^{d_r}}. The Jensen--Shannon distance is used to construct a test
#' statistic measuring the discrepancy between the \eqn{k} kernel density
#' estimators. Tests based on the mean and scatter matrices are also
#' available, but only for two samples (\eqn{k = 2}).
#'
#' @inheritParams kde_polysph
#' @param labels vector with \code{k} different levels indicating the group of
#' each observation.
#' @param type kind of test to be performed: \code{"jsd"} (default), a test
#' comparing the kernel density estimators for \eqn{k} groups using the
#' Jensen--Shannon distance; \code{"mean"}, a simple test for the equality of
#' two means (non-omnibus for testing homogeneity); \code{"scatter"}, a simple
#' test for the equality of two scatter matrices; \code{"hd"}, a test comparing
#' the kernel density estimators for two groups using the Hellinger distance.
#' @param B number of permutations to use. Defaults to \code{1e3}.
#' @param M number of Monte Carlo samples to use when approximating the
#' Hellinger/Jensen--Shannon distance. Defaults to \code{1e4}.
#' @param cv_jsd use cross-validation to approximate the Jensen--Shannon
#' distance? Does not require Monte Carlo. Defaults to \code{TRUE}.
#' @param plot_boot flag to display a graphical output of the test decision.
#' Defaults to \code{FALSE}.
#' @param seed_jsd seed for the Monte Carlo simulations used to estimate the
#' integrals in the Jensen--Shannon distance in the original and permuted
#' statistics. Defaults to \code{NULL} (no seed is fixed).
#' @details Only \code{type = "jsd"} is able to deal with \eqn{k > 2}.
#'
#' The \code{"jsd"} statistic is the Jensen--Shannon divergence. This statistic
#' is bounded in \eqn{[0, 1]}. The \code{"mean"} statistic measures the maximum
#' (chordal) distance between the estimated group means. This statistic is
#' bounded in \eqn{[0, 1]}. The \code{"scatter"} statistic measures the maximum
#' affine invariant Riemannian metric between the estimated scatter matrices.
#' The \code{"hd"} statistic is the Bhattacharyya divergence, a monotone
#' transformation of the Hellinger distance.
#' @return An object of class \code{"htest"} with the following fields:
#' \item{statistic}{the value of the test statistic.}
#' \item{p.value}{the p-value of the test.}
#' \item{statistic_perm}{the \code{B} permuted statistics.}
#' \item{n}{a table with the sample sizes per group.}
#' \item{h}{bandwidths used.}
#' \item{B}{number of permutations.}
#' \item{alternative}{a character string describing the alternative hypothesis.}
#' \item{method}{the kind of test performed.}
#' \item{data.name}{a character string giving the name of the data.}
#' @examples
#' ## Two-sample case
#' \donttest{
#' # H0 holds
#' n <- c(50, 100)
#' X1 <- rotasym::r_vMF(n = n[1], mu = c(0, 0, 1), kappa = 1)
#' X2 <- rotasym::r_vMF(n = n[2], mu = c(0, 0, 1), kappa = 1)
#' hom_test_polysph(X = rbind(X1, X2), labels = rep(1:2, times = n),
#'                  d = 2, type = "jsd", h = 0.5)
#'
#' # H0 does not hold
#' X2 <- rotasym::r_vMF(n = n[2], mu = c(0, 1, 0), kappa = 2)
#' hom_test_polysph(X = rbind(X1, X2), labels = rep(1:2, times = n),
#'                  d = 2, type = "jsd", h = 0.5)
#'
#' ## k-sample case
#'
#' # H0 holds
#' n <- c(50, 100, 50)
#' X1 <- rotasym::r_vMF(n = n[1], mu = c(0, 0, 1), kappa = 1)
#' X2 <- rotasym::r_vMF(n = n[2], mu = c(0, 0, 1), kappa = 1)
#' X3 <- rotasym::r_vMF(n = n[3], mu = c(0, 0, 1), kappa = 1)
#' hom_test_polysph(X = rbind(X1, X2, X3), labels = rep(1:3, times = n),
#'                  d = 2, type = "jsd", h = 0.5)
#'
#' # H0 does not hold
#' X3 <- rotasym::r_vMF(n = n[3], mu = c(0, 1, 0), kappa = 2)
#' hom_test_polysph(X = rbind(X1, X2, X3), labels = rep(1:3, times = n),
#'                  d = 2, type = "jsd", h = 0.5)
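#'
#' # Sketch of the remaining two-sample tests (mean, scatter, and Hellinger
#' # distance), reusing the first two groups above (H0 holds)
#' hom_test_polysph(X = rbind(X1, X2), labels = rep(1:2, times = n[1:2]),
#'                  d = 2, type = "mean")
#' hom_test_polysph(X = rbind(X1, X2), labels = rep(1:2, times = n[1:2]),
#'                  d = 2, type = "scatter")
#' hom_test_polysph(X = rbind(X1, X2), labels = rep(1:2, times = n[1:2]),
#'                  d = 2, type = "hd", h = 0.5, M = 1e3)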
#' }
#' @export
hom_test_polysph <- function(X, d, labels,
type = c("jsd", "mean", "scatter", "hd")[1],
h = NULL, kernel = 1, kernel_type = 1, k = 10,
B = 1e3, M = 1e4, plot_boot = FALSE,
seed_jsd = NULL, cv_jsd = TRUE) {
# Dimensions and sample sizes
r <- length(d)
N <- nrow(X)
# Split X into kg groups
n <- table(labels)
stopifnot(all(n > 1))
labels_levels <- names(n)
n <- unname(n)
kg <- length(labels_levels)
# Checks
stopifnot(N == length(labels))
stopifnot(ncol(X) == sum(d + 1))
# Type of test
if (type == "mean") {
# Only two groups
stopifnot(kg == 2)
# Bandwidths are not used in this test (returned as NULL)
h <- NULL
# Index for accessing each S^dj with ind[j]:(ind[j + 1] - 1)
ind <- cumsum(c(1, d + 1))
# Test information
method <- "Permutation-based test of means equality"
alternative <- ifelse(r == 1, "both group means are different",
"at least two group means are different")
# Test statistic function
Tn <- function(perm_index) {
# Permute labels by perm_index
perm_labels <- labels[perm_index]
# Two samples
X1 <- X[perm_labels == labels_levels[1], , drop = FALSE]
X2 <- X[perm_labels == labels_levels[2], , drop = FALSE]
# Discrepancies
discs <- sapply(seq_len(r), function(j) {
# Access to S^dj
ind_j <- ind[j]:(ind[j + 1] - 1)
# Estimated means
mu_hat_1 <- DirStats::mu_ml(data = X1[, ind_j])
mu_hat_2 <- DirStats::mu_ml(data = X2[, ind_j])
# Chordal distance (monotonic relation to its intrinsic counterpart)
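# 0.5 * (1 - mu_hat_1'mu_hat_2) = ||mu_hat_1 - mu_hat_2||^2 / 4, which lies
# in [0, 1] and increases with the angle acos(mu_hat_1'mu_hat_2)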
return(0.5 * (1 - sum(mu_hat_1 * mu_hat_2)))
})
# Maximal discrepancy
max(discs)
}
} else if (type == "scatter") {
# Only two groups
stopifnot(kg == 2)
# Bandwidths are not used in this test (returned as NULL)
h <- NULL
# Index for accessing each S^dj with ind[j]:(ind[j + 1] - 1)
ind <- cumsum(c(1, d + 1))
# Test information
method <- "Permutation-based test of scatter matrices equality"
alternative <- ifelse(r == 1, "both group scatter matrices are different",
"at least two group scatter matrices are different")
# Test statistic function
Tn <- function(perm_index) {
# Permute labels by perm_index
perm_labels <- labels[perm_index]
# Two samples
X1 <- X[perm_labels == labels_levels[1], , drop = FALSE]
X2 <- X[perm_labels == labels_levels[2], , drop = FALSE]
# Discrepancies
discs <- sapply(seq_len(r), function(j) {
# Access to S^dj
ind_j <- ind[j]:(ind[j + 1] - 1)
# Estimated scatters
S_1 <- crossprod(X1[, ind_j]) / n[1]
S_2 <- crossprod(X2[, ind_j]) / n[2]
# Affine invariant Riemannian metric between SPD matrices
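# d(S_1, S_2) = ||log(S_1^{-1/2} S_2 S_1^{-1/2})||_F
#             = sqrt(sum_i log(lambda_i(S_1^{-1} S_2))^2)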
return(sqrt(sum(log(eigen(x = solve(S_1, S_2), symmetric = FALSE,
only.values = TRUE)$values)^2)))
})
# Maximal discrepancy
max(discs)
}
} else {
# Set bandwidths
if (is.null(h)) {
h <- bw_rot_polysph(X = X, d = d, kernel = kernel,
kernel_type = kernel_type, k = k)$bw
} else {
stopifnot(length(h) == r)
}
# Kernel-based tests
if (type == "jsd") {
# Test information
method <- "Permutation-based Jensen--Shannon distance test of homogeneity"
alternative <- "any alternative to homogeneity"
# Estimated probabilities
probs <- n / N
# Test statistic function
Tn <- function(perm_index) {
# The Jensen--Shannon Divergence can be conveniently written as
# JSD(f_1, ..., f_k) = H(f_0) - \sum_j \pi_j * H(f_j) where
# H(f_j) = -\int \log(f_j(x)) f_j(x) dx is the Shannon entropy of f_j
# and f_0(x) = \sum_j \pi_j * f_j(x)
# Therefore, to approximate JSD(f_1, ..., f_k) we can:
# (1) Obtain Monte Carlo samples from f_j
# (2) Obtain a Monte Carlo sample from f_0 by recycling those extracted
# in 1) and respecting the proportion pi_j of samples from f_j in
# the final sample of f_0.
# (3) Approximate H(f_j) by Monte Carlo, evaluating \log(f_j).
# (4) Approximate H(f_0) by Monte Carlo, evaluating \log(f_0) with the
#     log-sum-exp trick (to avoid underflow): \log(f_0) = log_f0_max +
#     \log(\sum_j \exp(log_f0_j - log_f0_max)), with log_f0_j = \log(\pi_j f_j).
# Alternatively to (1)-(4), replace the Monte Carlo samples by the
# observed samples, using log_cv_kde_polysph(). Therefore:
# (1) Approximate H(f_j) = -mean(log_cv_kde_polysph(X = X_j, h = h)).
# (2) Approximate H(f_0) analogously, weighting observation i in group j
#     by pi_j / n_j.
# Permute labels by perm_index
perm_labels <- labels[perm_index]
# Access groups
ind_j <- lapply(1:kg, function(j)
which(perm_labels == labels_levels[j]))
# CV approximation of the H(f_j)'s and H(f_0)?
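# cv_jsd == 1: leave-one-out cross-validation; cv_jsd == 2: in-sample
# evaluation of the kdes; otherwise, Monte Carlo sampling from the kdes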
if (cv_jsd == 1) {
# Compute the leave-one-out log f_j's and concatenate them across groups
log_fj_cv <- numeric()
for (j in seq_len(kg)) {
log_fj_cv <- c(log_fj_cv,
log_cv_kde_polysph(X = X[ind_j[[j]], ], d = d,
h = h, kernel = kernel,
kernel_type = kernel_type,
k = k, wrt_unif = TRUE))
}
log_fj_cv <- log_fj_cv / N
# Compute log_f0_cv
log_f0_cv <- log_cv_kde_polysph(X = X, d = d, h = h, kernel = kernel,
kernel_type = kernel_type, k = k,
weights = rep(probs / n, times = n),
wrt_unif = TRUE)
log_f0_cv <- log_f0_cv / N
# H_f0 - sum(probs * H_fj), removing Infs and NaNs from the sum
H_f0_fj_dif <- -log_f0_cv + log_fj_cv
H_f0_fj_dif[!is.finite(H_f0_fj_dif)] <- NA
jsd <- sum(H_f0_fj_dif, na.rm = TRUE)
} else if (cv_jsd == 2) {
# Estimate H(f_j)'s
H_fj <- numeric(kg)
for (j in seq_len(kg)) {
# H(f_j)
log_fj <- kde_polysph(x = X[ind_j[[j]], ], X = X[ind_j[[j]], ],
d = d, h = h, kernel = kernel,
kernel_type = kernel_type, k = k, log = TRUE,
wrt_unif = TRUE)
H_fj[j] <- -mean(log_fj)
}
# H(f_0)
log_f0 <- kde_polysph(x = X, X = X, d = d, h = h, kernel = kernel,
kernel_type = kernel_type, k = k,
weights = rep(probs / n, times = n),
log = TRUE, wrt_unif = TRUE)
H_f0 <- -mean(log_f0)
# Finally, compute Jensen--Shannon divergence
jsd <- H_f0 - sum(probs * H_fj)
} else {
# Estimate H(f_j)'s
mc_samp <- list()
H_fj <- numeric(kg)
for (j in seq_len(kg)) {
# j-th group Monte Carlo sample
mc_samp[[j]] <- r_kde_polysph(n = M, X = X[ind_j[[j]], ], d = d,
h = h, kernel = kernel,
kernel_type = kernel_type, k = k)
# H(f_j)
log_fj <- kde_polysph(x = mc_samp[[j]], X = X[ind_j[[j]], ], d = d,
h = h, kernel = kernel,
kernel_type = kernel_type, k = k, log = TRUE,
wrt_unif = TRUE)
H_fj[j] <- -mean(log_fj)
}
# Pooled Monte Carlo sample
M_j <- round(M * probs)
mc_samp_0 <- matrix(nrow = sum(M_j), ncol = ncol(mc_samp[[1]]))
ind_0 <- c(0, cumsum(M_j))
for (j in seq_len(kg)) {
mc_samp_0[(ind_0[j] + 1):ind_0[j + 1], ] <-
mc_samp[[j]][1:M_j[j], , drop = FALSE]
}
# log_f0 for H(f_0)
log_f0 <- matrix(nrow = nrow(mc_samp_0), ncol = kg)
for (j in seq_len(kg)) {
log_f0[, j] <- kde_polysph(x = mc_samp_0, X = X[ind_j[[j]], ],
d = d, h = h, kernel = kernel,
kernel_type = kernel_type, k = k,
log = TRUE, wrt_unif = TRUE) +
log(probs[j])
}
log_max <- apply(log_f0, 1, max)
log_f0 <- log_max + log(rowSums(exp(log_f0 - log_max)))
H_f0 <- -mean(log_f0)
# Finally, compute Jensen--Shannon divergence
jsd <- H_f0 - sum(probs * H_fj)
}
return(jsd)
}
} else if (type == "hd") {
# Only two groups
stopifnot(kg == 2)
# Test information
method <- "Permutation-based Hellinger distance test of homogeneity"
alternative <- "any alternative to homogeneity"
# Common Monte Carlo sample for integrals
mc_samp <- r_unif_polysph(n = M, d = d)
# Test statistic function
Tn <- function(perm_index) {
# Permute labels by perm_index
perm_labels <- labels[perm_index]
# Two samples
X1 <- X[perm_labels == labels_levels[1], , drop = FALSE]
X2 <- X[perm_labels == labels_levels[2], , drop = FALSE]
# Log kdes
log_kde_1 <- kde_polysph(x = mc_samp, X = X1, d = d, h = h,
kernel = kernel, kernel_type = kernel_type,
k = k, log = TRUE, wrt_unif = TRUE)
log_kde_2 <- kde_polysph(x = mc_samp, X = X2, d = d, h = h,
kernel = kernel, kernel_type = kernel_type,
k = k, log = TRUE, wrt_unif = TRUE)
# Distance
hd_mc(log_f = log_kde_1, log_g = log_kde_2, d = d, bhatta = TRUE)
}
} else {
stop("\"type\" must be \"jsd\", \"mean\", \"scatter\" or \"hd\".")
}
}
# Set seeds for the Monte Carlos inside the JSD statistic
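# Reusing the same seed before each evaluation of Tn makes the Monte Carlo
# noise common to the original and permuted statistics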
if (!is.null(seed_jsd) && type == "jsd") {
# old_seed <- .Random.seed
# on.exit({.Random.seed <<- old_seed})
set.seed(seed_jsd, kind = "Mersenne-Twister")
}
# Original statistic
Tn_orig <- Tn(perm_index = 1:N)
if (!is.finite(Tn_orig) || Tn_orig == 0) {
stop("The test statistic is not finite or is zero. Bandwidth too small?")
}
# Permutations
perms <- t(replicate(B, sample(N)))
# Perform permutations
pb <- txtProgressBar(style = 3)
Tn_star <- rep(NA, B)
for (b in seq_len(B)) {
# Permuted statistic
if (!is.null(seed_jsd) && type == "jsd") {
set.seed(seed_jsd, kind = "Mersenne-Twister")
}
Tn_star[b] <- Tn(perm_index = perms[b, ])
setTxtProgressBar(pb = pb, value = b / B)
# Current p-value
pvalue <- mean(Tn_star > Tn_orig, na.rm = TRUE)
# Plot the position of the original statistic with respect to the
# permutation replicates? Update the plot once every ten replicates
if (plot_boot && (b %% 10 == 0 || b == B)) {
hist(Tn_star, probability = TRUE, main = paste("p-value:",
sprintf("%.4f", pvalue)),
xlab = expression(T[n]^"*"),
xlim = range(c(Tn_star, Tn_orig), na.rm = TRUE))
rug(Tn_star)
abline(v = Tn_orig, col = 2)
}
}
# Construct an "htest" result
Tn_orig <- c("Tn" = Tn_orig)
result <- list(statistic = Tn_orig, p.value = pvalue,
statistic_perm = drop(Tn_star), n = n, h = h, B = B,
alternative = alternative, method = method,
data.name = deparse(substitute(X)))
class(result) <- "htest"
# Return "htest"
return(result)
}
#' @title Hellinger distance between two densities via Monte Carlo
#'
#' @description Computes the Hellinger distance
#' \deqn{H(f, g) = \sqrt{1 - \int_{\mathcal{S}^{d_1} \times \cdots \times
#' \mathcal{S}^{d_r}} \sqrt{f(\boldsymbol{x}) g(\boldsymbol{x})}
#' \,\mathrm{d}\boldsymbol{x}}} between two densities \eqn{f} and \eqn{g} on
#' \eqn{\mathcal{S}^{d_1} \times \cdots \times \mathcal{S}^{d_r}} via
#' Monte Carlo.
#'
#' @param log_f,log_g logarithms of \eqn{f} and \eqn{g} evaluated in a Monte
#' Carlo sample.
#' @inheritParams kde_polysph
#' @param bhatta compute the Bhattacharyya divergence instead of the Hellinger
#' distance? Defaults to \code{FALSE}.
#' @return A scalar with the estimated distance.
#' @examples
#' # Example with von Mises--Fisher distributions
#' M <- 1e3
#' d <- c(1, 3)
#' mu <- r_unif_polysph(n = 1, d = d)
#' kappa <- c(1, 5)
#' x_mc <- r_unif_polysph(n = M, d = d)
#' log_f <- d_vmf_polysph(x = x_mc, d = d, mu = mu, kappa = kappa, log = TRUE)
#' log_g <- d_vmf_polysph(x = x_mc, d = d, mu = -mu, kappa = kappa, log = TRUE)
#' polykde:::hd_mc(log_f = log_f, log_g = log_f, d = d)
#' polykde:::hd_mc(log_f = log_f, log_g = log_g, d = d)
#' polykde:::hd_mc(log_f = log_f, log_g = log_f, d = d, bhatta = TRUE)
#' polykde:::hd_mc(log_f = log_f, log_g = log_g, d = d, bhatta = TRUE)
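#' # Relation used internally: H(f, g) = sqrt(1 - exp(-DB(f; g))), where DB is
#' # an illustrative local name for the Bhattacharyya divergence
#' DB <- polykde:::hd_mc(log_f = log_f, log_g = log_g, d = d, bhatta = TRUE)
#' sqrt(1 - exp(-DB))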
#' @noRd
hd_mc <- function(log_f, log_g, d, bhatta = FALSE) {
# Hellinger distance: H(f, g) = \sqrt(1 - \int \sqrt(f(x) * g(x)) dx)
# Bhattacharyya divergence: DB(f; g) = -\log(\int \sqrt(f(x) * g(x)) dx)
# H(f, g) = \sqrt(1 - \int \sqrt(f(x) * g(x)) dx)
# = \sqrt(1 - \exp(\log(\int \sqrt(f(x) * g(x)) dx)))
# = \sqrt(1 - \exp(-DB(f; g)))
# DB(f; g) = -\log(\int \sqrt(f(x) * g(x)) dx)
# = -\log(\int \exp((\log(f(x)) + \log(g(x))) / 2) dx)
# ≈ -\log(w / M * \sum_i \exp((\log(f(X_i)) + \log(g(X_i))) / 2))
# = \log(M / w) -
# \log(\sum_i \exp((\log(f(X_i)) + \log(g(X_i))) / 2))
# Bhattacharyya divergence
M <- length(log_f)
stopifnot(M == length(log_g))
logs <- 0.5 * (log_f + log_g)
max_log <- max(logs)
bhatta_div <- log(M) - sum(rotasym::w_p(p = d + 1, log = TRUE)) -
(max_log + log(sum(exp(logs - max_log))))
# Hellinger distance?
return(ifelse(bhatta, bhatta_div, sqrt(1 - exp(-bhatta_div))))
}