#' Global Two-Sample Test for Network-Valued Data
#'
#' This function carries out an hypothesis test where the null hypothesis is
#' that the two populations of networks share the same underlying probabilistic
#' distribution against the alternative hypothesis that the two populations come
#' from different distributions. The test is performed in a non-parametric
#' fashion using a permutational framework in which several statistics can be
#' used, together with several choices of network matrix representations and
#' distances between networks.
#'
#' @param x Either an object of class [nvd] listing networks in sample 1 or a
#' distance matrix of size \eqn{n_1 + n_2}.
#' @param y Either an object of class [nvd] listing networks in sample 2 or an
#' integer value specifying the size of sample 1 or an integer vector
#' specifying the indices of the observations belonging to sample 1.
#' @param representation A string specifying the desired type of representation,
#' among: \code{"adjacency"}, \code{"laplacian"} and \code{"modularity"}.
#' Defaults to \code{"adjacency"}.
#' @param distance A string specifying the chosen distance for calculating the
#' test statistic, among: \code{"hamming"}, \code{"frobenius"},
#' \code{"spectral"} and \code{"root-euclidean"}. Defaults to
#' \code{"frobenius"}.
#' @param stats A character vector specifying the chosen test statistic(s),
#' among: `"original_edge_count"`, `"generalized_edge_count"`,
#' `"weighted_edge_count"`, `"student_euclidean"`, `"welch_euclidean"` or any
#' statistics based on inter-point distances available in the **flipr**
#' package: `"flipr:student_ip"`, `"flipr:fisher_ip"`, `"flipr:bg_ip"`,
#' `"flipr:energy_ip"`, `"flipr:cq_ip"`. Defaults to `c("flipr:student_ip",
#' "flipr:fisher_ip")`.
#' @param B The number of permutation or the tolerance. If this number is lower
#' than \code{1}, it is intended as a tolerance. Otherwise, it is intended as
#' the number of required permutations. Defaults to `1000L`.
#' @param test A character string specifying the formula to be used to compute
#' the permutation p-value. Choices are `"estimate"`, `"upper_bound"` and
#' `"exact"`. Defaults to `"exact"` which provides exact tests.
#' @param k An integer specifying the density of the minimum spanning tree used
#' for the edge count statistics. Defaults to `5L`.
#' @param seed An integer for specifying the seed of the random generator for
#' result reproducibility. Defaults to `NULL`.
#' @param ... Extra arguments to be passed to the distance function.
#'
#' @return A \code{\link[base]{list}} with three components: the value of the
#' statistic for the original two samples, the p-value of the resulting
#' permutation test and a numeric vector storing the values of the permuted
#' statistics.
#' @export
#'
#' @examples
#' n <- 5L
#' gnp_params <- list(p = 1/3)
#' k_regular_params <- list(k = 8L)
#'
#' # Two different models for the two populations
#' x <- nvd(model = "gnp", n = n, model_params = gnp_params)
#' y <- nvd(model = "k_regular", n = n, model_params = k_regular_params)
#' t1 <- test2_global(x, y, representation = "modularity")
#' t1$pvalue
#'
#' # Same model for the two populations
#' x <- nvd(model = "gnp", n = 10L, model_params = gnp_params)
#' y <- nvd(model = "gnp", n = 10L, model_params = gnp_params)
#' t2 <- test2_global(x, y, representation = "modularity")
#' t2$pvalue
test2_global <- function(x, y,
representation = c("adjacency", "laplacian", "modularity", "transitivity"),
distance = c("frobenius", "hamming", "spectral", "root-euclidean"),
stats = c("flipr:t_ip", "flipr:f_ip"),
B = 1000L,
test = "exact",
k = 5L,
seed = NULL,
...) {
dist_input <- inherits(x, "dist")
if (dist_input) {
if (!is.integer(y))
cli::cli_abort(
"When the input {.arg x} is an object of class {.cls dist}, the input
{.arg y} should be an integer value specifying the size of the first
sample or an integer vector specifying the indices of the observations
belonging to the first sample."
)
if (length(y) > 1) {
n1 <- length(y)
n <- attr(x, "Size")
n2 <- n - n1
dy <- setdiff(1:n, y)
xm1 <- as.matrix(x)
xm2 <- xm1
xm2[1:n1, 1:n1] <- xm1[y, y]
xm2[(n1+1):n, (n1+1):n] <- xm1[dy, dy]
xm2[1:n1, (n1+1):n] <- xm1[y, dy]
xm2[(n1+1):n, 1:n1] <- xm1[dy, y]
x <- stats::as.dist(xm2)
} else
n1 <- y
} else {
n1 <- length(x)
n2 <- length(y)
n <- n1 + n2
}
representation <- rlang::arg_match(representation)
distance <- rlang::arg_match(distance)
use_frechet_stats <- any(grepl("student_euclidean", stats)) ||
any(grepl("welch_euclidean", stats))
if (use_frechet_stats &&
(any(grepl("_ip", stats)) ||
any(grepl("edge_count", stats))))
cli::cli_abort(
"It is not possible to mix statistics based on Frechet means and
statistics based on inter-point distances."
)
if (use_frechet_stats && dist_input)
cli::cli_abort(
"You cannot use statistics based on Frechet mean computations when using a
distance matrix as input."
)
ecp <- NULL
if (use_frechet_stats)
d <- repr_nvd(x, y, representation = representation)
else {
d <- if (dist_input) x else dist_nvd(
x, y,
representation = representation,
distance = distance,
...
)
if (any(grepl("edge_count", stats)))
ecp <- edge_count_global_variables(d, n1, k = k)
}
null_spec <- function(y, parameters) {
return(y)
}
stat_functions <- stats %>%
strsplit(split = ":") %>%
purrr::map(~ {
if (length(.x) == 1) {
s <- paste0("stat_", .x)
return(rlang::as_function(s))
}
s <- paste0("stat_", .x[2])
getExportedValue(.x[1], s)
})
stat_assignments <- list(delta = 1:length(stat_functions))
if (inherits(d, "dist")) {
xx <- d
yy <- as.integer(n1)
} else {
xx <- d[1:n1]
yy <- d[(n1 + 1):(n1 + n2)]
}
pf <- flipr::PlausibilityFunction$new(
null_spec = null_spec,
stat_functions = stat_functions,
stat_assignments = stat_assignments,
xx, yy,
seed = seed
)
pf$set_nperms(B)
pf$set_pvalue_formula(test)
pf$set_alternative("right_tail")
pf$get_value(
parameters = 0,
edge_count_prep = ecp,
keep_null_distribution = TRUE
)
}
#' Local Two-Sample Test for Network-Valued Data
#'
#' @inheritParams test2_global
#' @param partition Either a list or an integer vector specifying vertex
#' memberships into partition elements.
#' @param alpha Significance level for hypothesis testing. If set to 1, the
#' function outputs properly adjusted p-values. If lower than 1, then only
#' p-values lower than alpha are properly adjusted. Defaults to `0.05`.
#' @param verbose Boolean specifying whether information on intermediate tests
#' should be printed in the process (default: \code{FALSE}).
#'
#' @return A length-2 list reporting the adjusted p-values of each element of
#' the partition for the intra- and inter-tests.
#' @export
#'
#' @examples
#' n <- 5L
#' p1 <- matrix(
#' data = c(0.1, 0.4, 0.1, 0.4,
#' 0.4, 0.4, 0.1, 0.4,
#' 0.1, 0.1, 0.4, 0.4,
#' 0.4, 0.4, 0.4, 0.4),
#' nrow = 4,
#' ncol = 4,
#' byrow = TRUE
#' )
#' p2 <- matrix(
#' data = c(0.1, 0.4, 0.4, 0.4,
#' 0.4, 0.4, 0.4, 0.4,
#' 0.4, 0.4, 0.1, 0.1,
#' 0.4, 0.4, 0.1, 0.4),
#' nrow = 4,
#' ncol = 4,
#' byrow = TRUE
#' )
#' sim <- sample2_sbm(n, 68, p1, c(17, 17, 17, 17), p2, seed = 1234)
#' m <- as.integer(c(rep(1, 17), rep(2, 17), rep(3, 17), rep(4, 17)))
#' test2_local(sim$x, sim$y, m,
#' seed = 1234,
#' alpha = 0.05,
#' B = 19)
test2_local <- function(x, y, partition,
representation = "adjacency",
distance = "frobenius",
stats = c("flipr:t_ip", "flipr:f_ip"),
B = 1000L,
alpha = 0.05,
test = "exact",
k = 5L,
seed = NULL,
verbose = FALSE) {
# Creating sigma-algebra generated by the partition
partition <- as_vertex_partition(partition)
E <- names(partition)
sa <- generate_sigma_algebra(partition)
psize <- length(sa)
# Initialize output for intra-adjusted pvalues
stop_intra <- FALSE
skip_intra <- NULL
p_intra <- utils::combn(E, 1, simplify = FALSE) %>%
purrr::transpose() %>%
purrr::simplify_all() %>%
rlang::set_names("E") %>%
tibble::as_tibble() %>%
dplyr::mutate(pvalue = 0, truncated = FALSE)
# Intialize output for inter-adjusted pvalues
stop_inter <- FALSE
skip_inter <- NULL
p_inter <- utils::combn(E, 2, simplify = FALSE) %>%
purrr::transpose() %>%
purrr::simplify_all() %>%
rlang::set_names(c("E1", "E2")) %>%
tibble::as_tibble() %>%
dplyr::mutate(pvalue = 0, truncated = FALSE)
for (i in 1:psize) {
sas <- sa[[i]]
compositions <- names(sas)
for (j in 1:length(sas)) {
if (stop_intra && stop_inter)
return(list(intra = p_intra, inter = p_inter))
element_name <- compositions[j]
update_intra <- !stop_intra && !(element_name %in% skip_intra)
update_inter <- !stop_inter && i < psize && !(element_name %in% skip_inter)
if (!update_intra && !update_intra)
next()
element_value <- sas[[j]]
individuals <- element_name %>%
strsplit(",") %>%
purrr::simplify()
# Tests on full subgraphs
p <- test2_subgraph(
x, y, element_value,
subgraph_full,
representation, distance, stats, B, test, k, seed
)
if (verbose) {
writeLines("- Type of test: FULL")
writeLines(paste0("Element of the sigma-algebra: ", element_name))
writeLines(paste0("P-value of the test: ", p))
}
# Intra-adjusted p-values from full tests
if (update_intra)
p_intra <- .update_intra_pvalues(p_intra, individuals, p, alpha)
# Inter-adjusted p-values from full tests
if (update_inter)
p_inter <- .update_inter_pvalues(p_inter, individuals, p, alpha)
# Update stopping and skipping conditions
stop_intra <- all(p_intra$truncated)
stop_inter <- all(p_inter$truncated)
if (p >= alpha) {
skip_intra <- .update_skip_list(skip_intra, individuals)
skip_inter <- .update_skip_list(skip_inter, individuals)
}
update_intra <- !stop_intra && !(element_name %in% skip_intra)
if (update_intra) {
# Tests on intra subgraphs
p <- test2_subgraph(
x, y, element_value,
subgraph_intra,
representation, distance, stats, B, test, k, seed
)
if (verbose) {
writeLines("- Type of test: INTRA")
writeLines(paste0("Element of the sigma-algebra: ", element_name))
writeLines(paste0("P-value of the test: ", p))
}
# Intra-adjusted p-values from intra tests
p_intra <- .update_intra_pvalues(p_intra, individuals, p, alpha)
# Update stopping and skipping conditions
stop_intra <- all(p_intra$truncated)
if (p >= alpha)
skip_intra <- .update_skip_list(skip_intra, individuals)
}
update_inter <- !stop_inter && i < psize && !(element_name %in% skip_inter)
if (update_inter) {
# Tests on inter subgraphs
p <- test2_subgraph(
x, y, element_value,
subgraph_inter,
representation, distance, stats, B, test, k, seed
)
if (verbose) {
writeLines("- Type of test: INTER")
writeLines(paste0("Element of the sigma-algebra: ", element_name))
writeLines(paste0("P-value of the test: ", p))
}
# Inter-adjusted p-values from inter tests
p_inter <- .update_inter_pvalues(p_inter, individuals, p, alpha)
# Update stopping and skipping conditions
stop_inter <- all(p_inter$truncated)
if (p >= alpha)
skip_inter <- .update_skip_list(skip_inter, individuals)
}
}
}
list(intra = p_intra, inter = p_inter)
}
.update_intra_pvalues <- function(output, c, p, alpha) {
output %>%
dplyr::mutate(
pvalue = purrr::map2_dbl(.data$E, .data$pvalue, ~ dplyr::if_else(.x %in% c, pmax(.y, p), .y)),
truncated = .data$pvalue >= alpha
)
}
.update_inter_pvalues <- function(output, c, p, alpha) {
output %>%
dplyr::mutate(
pvalue = purrr::pmap_dbl(
list(.data$E1, .data$E2, .data$pvalue),
~ dplyr::if_else(all(c(..1, ..2) %in% c), pmax(..3, p), ..3)
),
truncated = .data$pvalue >= alpha
)
}
.update_skip_list <- function(skip_list, individuals) {
for (k in 1:length(individuals)) {
tmp <- individuals %>%
utils::combn(k, paste0, collapse = ",", simplify = FALSE) %>%
purrr::simplify()
skip_list <- unique(c(skip_list, tmp))
}
skip_list
}
test2_subgraph <- function(x, y, subpartition, fun,
representation, distance, stats, B, test, k, seed) {
x <- x %>%
purrr::map(rlang::as_function(fun), vids = subpartition) %>%
as_nvd()
y <- y %>%
purrr::map(rlang::as_function(fun), vids = subpartition) %>%
as_nvd()
test2_global(
x, y,
representation = representation,
distance = distance,
stats = stats,
B = B,
test = test,
k = k,
seed = seed
)$pvalue
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.