knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) source("knitr_markup.R") bmatrix.digits <- 4
set.seed(1337) library(bayessource) requireNamespace("mvtnorm")
Let's generate a test dataset with known parameters.
We will use some supporting functions:
# Generate n random integers in {min, ..., max} randi <- function(n, min, max) { round(runif(n, min, max)) } #' Generate random invertible symmetric pxp matrix #' #' @param p matrix size #' @param alpha regularization > 0 randinvmat <- function(p, M = 5, alpha = NULL) { stopifnot(p > 0) stopifnot(M > 0) if (is.null(alpha)) { alpha <- p + 1 } else { stopifnot(alpha > 0) } M * stats::rWishart(1, p + alpha, diag(p))[, , 1] / (p + alpha) } # Compute inverse of X if X is symmetric-positive definite solve_sympd <- function(X) { chol2inv(chol(X)) }
We will generate the data according to the model: for sources $i = 1, \ldots, m$,
\begin{align} X_{ij} \mid \theta_i, \; W_i &\sim N_p(\theta_i, W_i) \quad \forall j = 1, \ldots, n \ \theta_i \mid \mu, B &\sim N_p(\mu, B) \ W_i \mid U, n_w &\sim IW(U, n_w) \end{align}
p <- 4 # number of variables m <- 100 # number of sources n <- 500 # number of items per source
Upper hierarchical level: choose some generating hyperparameters $\mu$, $n_w$, $B$, $U$
list_exact <- list() list_exact$B.exact <- randinvmat(p, alpha = 2) # Between Covariance matrix list_exact$U.exact <- randinvmat(p, alpha = 2) # Inverted Wishart scale matrix list_exact$mu.exact <- randi(p, 0, 10) # mean of means list_exact$nw.exact <- 2 * (p + 1) + 1 # dof for Inverted Wishart, as small as possible
Middle hierarchical level: for each $i$-th source generate
theta.sources.exact <- list() for (i in seq(m)) { theta.sources.exact[[i]] <- mvtnorm::rmvnorm(1, list_exact$mu.exact, list_exact$B.exact) } # Sample from an inverted Wishart W.sources.exact <- list() for (i in seq(m)) { # ours: W.sources.exact[[i]] <- bayessource::riwish_Press(list_exact$nw.exact, list_exact$U.exact) # R: # W.sources.exact[[i]] <- solve(rWishart(1, nw.exact, solve(U.exact))[,,1]) }
Lower hierarchical level: generate raw data from every source
df <- list() for (i in seq(m)) { # Sample data from the i-th source # returned as a matrix df[[i]] <- mvtnorm::rmvnorm(n, theta.sources.exact[[i]], W.sources.exact[[i]]) # Store the true source in the last column df[[i]] <- cbind(df[[i]], i) } # Convert to a dataframe # columns v_1, v_2, ..., v_p, source df <- data.frame(do.call(rbind, df)) colnames(df) <- paste(c(paste0("v", seq(1:p)), "source")) head(df)
The item column is the last one (p + 1
).
Show the first two components across sources:
library(ggplot2) df_sub <- df[sample(1:nrow(df), nrow(df) * 0.05, replace = FALSE), ] ggplot(df_sub, aes(x = v1, y = v2, col = factor(source))) + geom_point(size = 1, show.legend = FALSE) + labs( x = "x1", y = "x2", title = "Raw data across sources: first two components", subtitle = paste("Color = source; ", m, "sources, ", n, "observations per source") )
The package supplies the function make_priors_and_init()
.
Suppose that df
represents a background dataset.
We can use it to elicit the model hyperparameters using the estimators described in [@Bozza2008Probabilistic].
It also chooses an initialization value for the Gibbs sampler.
col.variables <- 1:p col.item <- p + 1 use.priors <- "ML" use.init <- "random" priors <- make_priors_and_init(df, col.variables, col.item, use.priors, use.init) names(priors)
$$\theta_i \mid \mu, B \sim N_p(\mu, B)$$
priors$mu bmatrix(as.matrix(priors$mu), pre = "\\hat{\'mu} =", digits = bmatrix.digits)
Exact:
list_exact$mu.exact bmatrix(as.matrix(list_exact$mu.exact), pre = "\\mu =", digits = bmatrix.digits)
priors$B.inv bmatrix(priors$B.inv, pre = "\\hat{B}^{-1} =", digits = bmatrix.digits)
Exact:
list_exact$B.exact bmatrix(solve_sympd(list_exact$B.exact), pre = "B^{-1} =", digits = bmatrix.digits)
$$ W_i \sim IW(n_w, U) $$
$$n_w = r priors$nw
$$
priors$U bmatrix(priors$U, pre = "\\hat{U} =", digits = bmatrix.digits)
Exact:
list_exact$U.exact bmatrix(list_exact$U.exact, pre = "U =", digits = bmatrix.digits)
Within-source covariance matrices and their inverses: we initialize the chain with
priors$W.inv.1 priors$W.inv.2 bmatrix(priors$W.inv.1, pre = "W^{-1}_1 =", digits = bmatrix.digits) bmatrix(priors$W.inv.2, pre = "W^{-1}_2 =", digits = bmatrix.digits)
The list returned by make_priors_and_init()
can be used in calls to marginalLikelihood()
and samesource_C()
:
priors <- make_priors_and_init(df, col.variables, col.item, use.priors, use.init) mtx_data <- as.matrix(df[, col.variables]) ml <- marginalLikelihood( X = mtx_data, n.iter = 1000, burn.in = 100, B.inv = priors$B.inv, W.inv = priors$W.inv.1, U = priors$U, nw = priors$nw, mu = priors$mu ) ml
bf <- samesource_C( quest = mtx_data, ref = mtx_data, n.iter = 1000, burn.in = 100, B.inv = priors$B.inv, W.inv.1 = priors$W.inv.1, W.inv.2 = priors$W.inv.1, U = priors$U, nw = priors$nw, mu = priors$mu ) bf
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.