#' Generate test sets for venndir
#'
#' Generate test sets for venndir
#'
#' This function generates data to use as test input to
#' Venn diagram functions. It can generate sets of items,
#' or signed sets (integer values `-1`, `1`) named by
#' item.
#'
#' This function defines a range of set sizes, using `min_size`
#' and `max_size`, with roughly square-root sequence of sizes
#' between these two extremes.
#'
#' Note that the universe size represents the total available
#' items, but not necessarily the total number of items
#' represented by the sets. For example, if `n_items=1000000`,
#' `max_size=500` and `n_sets=3` then the maximum number of
#' items actually represented is `1500`.
#'
#' The universe can be defined using optional argument `items`,
#' which takes priority over `n_items`.
#'
#' The specific size of each set can be defined with optional
#' argument `sizes`, which takes priority over `min_size`, and
#' `max_size`.
#'
#' @family venndir utility
#'
#' @return `list` of items, either as a list of item vectors,
#' or when `do_signed=TRUE` the list of vectors, where vector
#' names contain the items, and vector values are signed values
#' from `c(-1, 1)`.
#'
#' @param n_items `integer` total number of items available
#' to all sets, also known as the universe size.
#' @param n_sets `integer` number of sets that contain items.
#' @param do_signed `logical` indicating whether to return signed
#' sets, which indicate directionality with `-1` or `1` values,
#' named by the items.
#' @param concordance `numeric` between -1 and 1, used when `do_signed=TRUE`.
#' This value imposes an approximate amount of concordance between
#' random pairs of sets, using the concordance equation:
#' `concordance = (agree - disagree) / (agree + disgree)` where
#' `(agree + disagree) = n`. This equation approximates
#' the number of items that agree as:
#' `agree = ceiling((concordance * n + n) / 2)`.
#' @param min_size `integer` minimum range of items that may
#' be contained in each set.
#' @param max_size `integer` maximum range of items that may
#' be contained in each set.
#' @param items `vector` or `NULL` that contains the universe
#' of items. When `items` is defined, `n_items` is ignored.
#' @param sizes `vector` of `integer` values, or `NULL`, indicating
#' the size of each set. When `sizes` is defined, `min_size` and
#' `max_size` is ignored. When `sizes` is defined, `names(sizes)`
#' are used as names for each set.
#' @param seed `numeric` or `NULL` used with `set.seed()` for data
#' reproducibility. When `seed=NULL` then `set.seed()` is not called.
#' @param ... additional arguments are ignored.
#'
#' @examples
#' ## basic setlist without signed direction
#' setlist <- make_venn_test(n_items=100,
#' n_sets=3,
#' min_size=5,
#' max_size=25)
#' set_im <- list2im_opt(setlist);
#' table(jamba::pasteByRow(as.matrix(set_im)*1))
#'
#' ## basic setlist with signed direction
#' setlist <- make_venn_test(n_items=100,
#' n_sets=3,
#' do_signed=TRUE)
#' jamba::sdim(setlist);
#'
#' ## some example overlap summaries
#' sv1 <- signed_overlaps(setlist=setlist, "overlap")
#' sv1
#'
#' ## Familiar named overlap counts
#' jamba::nameVector(sv1[,c("count","sets")])
#'
#' ## directional count table for each combination
#' sv2 <- signed_overlaps(setlist=setlist, "each")
#' sv2
#'
#' ## directional count table for agreement or mixed
#' sv3 <- signed_overlaps(setlist=setlist, "agreement")
#' sv3
#'
#' ## signed incidence matrix
#' imv <- list2im_value(setlist)
#' dim(imv)
#' head(imv)
#'
#' ## text venn diagram
#' textvenn(setlist, overlap_type="overlap")
#'
#' ## text venn diagram with signed direction
#' textvenn(setlist, overlap_type="each")
#'
#' @export
make_venn_test <- function
(n_items=1000000,
n_sets=4,
do_signed=FALSE,
concordance=0.5,
min_size=ceiling(n_items / 50),
max_size=ceiling(n_items / 2),
items=NULL,
sizes=NULL,
seed=123,
...)
{
# create set items
if (length(seed) > 0) {
set.seed(head(seed, 1))
}
if (length(items) == 0) {
items <- paste0("item_",
jamba::padInteger(seq_len(n_items)));
} else {
n_items <- length(items);
}
if (length(sizes) > 0) {
sizes <- rep(sizes,
length.out=n_sets);
}
# define set names
if (length(names(sizes)) == 0) {
set_names <- paste0("set_",
jamba::colNum2excelName(seq_len(n_sets)));
if (length(sizes) > 0) {
names(sizes) <- set_names;
}
} else {
# make sure names are unique
names(sizes) <- jamba::makeNames(names(sizes));
set_names <- names(sizes);
}
# define sizes upfront so they are reproducible by seed
if (length(sizes) == 0) {
sizes <- lapply(jamba::nameVector(set_names), function(i){
sample_seq <- unique(ceiling(
2^seq(from=log2(min_size),
to=log2(max_size),
length.out=80)));
n <- sample(sample_seq, 1);
});
}
# define set_list items so they are reproducible by seed
# and not affected by do_signed=TRUE
set_list <- lapply(jamba::nameVector(set_names), function(i){
# choose random entries using sizes[i] length
j <- sample(items, size=sizes[[i]]);
});
# define sign
if (do_signed) {
#set_list <- lapply(set_list, function(j){
if (length(concordance) > 0) {
concordance <- jamba::noiseFloor(concordance,
minimum=-1,
ceiling=1);
concordance <- rep(concordance, length.out=length(set_names));
names(concordance) <- set_names;
}
## Randomize the sign
set_list <- lapply(jamba::nameVector(set_names), function(i){
j <- set_list[[i]];
k <- sample(c(-1, 1),
replace=TRUE,
size=length(j));
jamba::nameVector(k, j);
});
## When concordance is defined, force overlaps to have fixed concordance
## kruskalConcordance = (Agree - Disagree) / (Agree + Disagree)
## kruskalConcordance * (Agree + (n-Agree)) = (Agree - (n-Agree))
## kruskalConcordance * n = (2*Agree - n)
## kruskalConcordance * n + n = 2*Agree
## (kruskalConcordance * n + n) / 2 = Agree
##
## kruskalConcordance * (Agree + Disagree) = (Agree - Disagree)
if (length(concordance) > 0 && length(set_names) > 1) {
#set_list <- lapply(jamba::nameVector(set_names), function(i){
for (i in set_names) {
i23 <- setdiff(set_names, i);
i2 <- sample(i23, 1);
j <- set_list[[i]];
j2 <- set_list[[i2]];
jj2 <- j2[intersect(names(j), names(j2))];
n12 <- length(jj2);
KC <- concordance[[i]];
if (n12 > 0) {
KC_agree <- ceiling((KC * n12 + n12) / 2);
jj2a <- head(names(jj2), KC_agree);
jj2d <- tail(names(jj2), -KC_agree);
jj2s <- rep(c(1, -1), c(length(jj2a), length(jj2d)));
j[c(jj2a, jj2d)] <- j2[c(jj2a, jj2d)] * jj2s;
}
set_list[[i]] <- j;
}
}
}
names(set_list) <- set_names;
return(set_list);
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.