Below we derive the posterior distributions for various scenarios, given their conjugate priors.
In this first scenario, we seek the posterior for $\beta$ in the matrix normal regression model
\begin{equation} Y \sim \mathcal{MN}_{n,p}(X\beta, U, V) \end{equation}
with $X\beta$ an $n$ by $p$ matrix, $U$ an $n$ by $n$ precision matrix, and $V$ a $p$ by $p$ precision matrix.
We rewrite the model as
\begin{equation} \label{eq:matnormsetup} \begin{split} \vect(Y) &\sim \norm(\vect(X\beta), V \otimes U) \\ \vect(\beta) &\sim \norm(\mu_0, Q_0) \end{split} \end{equation}
Consider the likelihood times the prior:
\begin{equation} L(\beta \vert U, V, Y, X, \mu_0, Q_0) \propto \exp\left\{-\frac{1}{2}(\vect(Y) - \vect(X\beta))^\intercal (V \otimes U) (\vect(Y) - \vect(X\beta)) - \frac{1}{2}(\vect(\beta) - \mu_0)^\intercal Q_0 (\vect(\beta) - \mu_0)\right\} \end{equation}
Expanding the inner part (ignoring the $-1/2$):
\begin{equation} \begin{split} &\vect(Y)^\intercal (V \otimes U) \vect(Y) \\ &- 2\vect(Y)^\intercal(V \otimes U)\vect(X\beta) \\ &+ \vect(X\beta)^\intercal (V \otimes U) \vect(X\beta) \\ &+ \vect(\beta)^\intercal Q_0 \vect(\beta) \\ &- 2\mu_0^\intercal Q_0 \vect(\beta) \\ &+ \mu_0^\intercal Q_0 \mu_0 \end{split} \end{equation}
Note that $\vect(X\beta) = (I_p \otimes X)\vect(\beta)$. Then \begin{equation} \vect(X\beta)^\intercal = \vect(\beta)^\intercal(I_p \otimes X)^\intercal = \vect(\beta)^\intercal (I_p \otimes X^\intercal) \end{equation}
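As an aside, this identity is easy to check numerically. Below is a minimal Python/NumPy sketch (illustrative only, not the accompanying code), assuming the usual column-stacking convention for $\vect$:
\begin{verbatim}
# Numerical check of vec(X beta) = (I_p kron X) vec(beta); Python/NumPy sketch.
import numpy as np

rng = np.random.default_rng(0)
n, m, p = 5, 3, 4                          # X is n x m, beta is m x p
X = rng.standard_normal((n, m))
beta = rng.standard_normal((m, p))

vec = lambda A: A.reshape(-1, order="F")   # column-stacking vectorization

lhs = vec(X @ beta)
rhs = np.kron(np.eye(p), X) @ vec(beta)
print(np.allclose(lhs, rhs))               # True
\end{verbatim}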
Dropping the terms that do not involve $\beta$ and regrouping:
\begin{equation} \label{eq:matnormexpanded} \begin{split} &\vect(\beta)^\intercal ((I_p \otimes X^\intercal) (V \otimes U) (I_p \otimes X) + Q_0)\vect(\beta) \\ &- 2(\vect(Y)^\intercal(V \otimes U)(I_p \otimes X) + \mu_0^\intercal Q_0)\vect(\beta) \end{split} \end{equation}
The first line gives the posterior precision:
\begin{equation} \begin{split} \tilde{Q} &= (I_p \otimes X^\intercal) (V \otimes U) (I_p \otimes X) + Q_0 \\ &= (I_p \otimes X^\intercal) ((VI_p) \otimes (UX)) + Q_0 \\ &= (I_p V I_p) \otimes (X^\intercal U X) + Q_0 \\ &= V \otimes (X^\intercal U X) + Q_0 \end{split} \end{equation}
Of course, if $U$ is the identity, we can precompute $X^\intercal X$ for additional efficiency.
The inverse of $\tilde{Q}$ is used to complete the square, and together with the transpose of the second line gives us the posterior mean. We gain additional efficiency with one more trick: for appropriate matrices, $(C^\intercal \otimes A) \vect(B) = \vect(ABC)$. Note that $V$ is symmetric.
\begin{equation} \label{eq:matnormmu} \begin{split} \tilde{\mu} &= \tilde{Q}^{-1}\left[(I_p \otimes X^\intercal) (V \otimes U) \vect(Y) + Q_0 \mu_0 \right] \\ &= \tilde{Q}^{-1}\left[((I_p V) \otimes (X^\intercal U))\vect(Y) + Q_0 \mu_0 \right] \\ &= \tilde{Q}^{-1}\left[(V \otimes (X^\intercal U))\vect(Y) + Q_0 \mu_0 \right] \\ &= \tilde{Q}^{-1}\left[\vect(X^\intercal U Y V) + Q_0 \mu_0 \right] \end{split} \end{equation}
Finally, if $U$ is the identity matrix, we get a further efficiency by precomputing $X^\intercal Y$:
\begin{equation} \tilde{\mu} = \tilde{Q}^{-1}\left[\vect(X^\intercal Y V) + Q_0 \mu_0 \right] \end{equation}
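To make the bookkeeping concrete, here is a hedged Python/NumPy sketch (variable names are illustrative) that compares the efficient forms $\tilde{Q} = V \otimes (X^\intercal U X) + Q_0$ and $\tilde{\mu} = \tilde{Q}^{-1}[\vect(X^\intercal U Y V) + Q_0 \mu_0]$ against the brute-force expressions built from the full Kronecker products:
\begin{verbatim}
# Sketch: matrix normal regression posterior (precision parameterization).
import numpy as np

rng = np.random.default_rng(1)
n, m, p = 6, 3, 2                       # Y is n x p, X is n x m, beta is m x p
X = rng.standard_normal((n, m))
Y = rng.standard_normal((n, p))
A = rng.standard_normal((n, n)); U = A @ A.T + n * np.eye(n)     # row precision
B = rng.standard_normal((p, p)); V = B @ B.T + p * np.eye(p)     # column precision
C = rng.standard_normal((m*p, m*p)); Q0 = C @ C.T + m*p*np.eye(m*p)
mu0 = rng.standard_normal(m * p)

vec = lambda M: M.reshape(-1, order="F")

# Efficient forms
Q_tilde  = np.kron(V, X.T @ U @ X) + Q0
mu_tilde = np.linalg.solve(Q_tilde, vec(X.T @ U @ Y @ V) + Q0 @ mu0)

# Brute-force forms straight from the vectorized model
IX = np.kron(np.eye(p), X)
Q_brute  = IX.T @ np.kron(V, U) @ IX + Q0
mu_brute = np.linalg.solve(Q_brute, IX.T @ np.kron(V, U) @ vec(Y) + Q0 @ mu0)

print(np.allclose(Q_tilde, Q_brute), np.allclose(mu_tilde, mu_brute))  # True True
\end{verbatim}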
Sometimes $X$ varies by element of $\beta$. Here we consider the case where $\beta$ is $p$ by $1$, so that each column of $X$ is multiplied by one (and only one) element of $\beta$.
\begin{equation} Y \sim \mathcal{MN}_{n,p}(X\diag{\beta}, U, V) \end{equation}
We rewrite the model as in \eqref{eq:matnormsetup}, but with a slightly different prior.
\begin{equation} \begin{split} \vect(Y) &\sim \norm(\vect(X\diag{\beta}), V \otimes U) \\ \beta &\sim \norm(\mu_0, Q_0) \end{split} \end{equation}
We jump to the equivalent of \eqref{eq:matnormexpanded}:
\begin{equation} \begin{split} &\vect(\diag{\beta})^\intercal ((I_p \otimes X^\intercal) (V \otimes U) (I_p \otimes X))\vect(\diag{\beta}) + \beta^\intercal Q_0 \beta \\ &- 2(\vect(Y)^\intercal(V \otimes U)(I_p \otimes X))\vect(\diag{\beta}) - 2\mu_0^\intercal Q_0\beta \end{split} \end{equation}
Now, we note that \begin{equation} \begin{split} \vect{\diag{\beta}} &= \vect{\left(\sum_i e_i e_i^\intercal \beta e_i^\intercal\right)} \\ &= \sum_i \vect{(e_i e_i^\intercal \beta e_i^\intercal)} \\ &= \sum_i (e_i \otimes (e_i e_i^\intercal))\beta = E\beta \end{split} \end{equation}
Then we get \begin{equation} \begin{split} \tilde{Q} &= E^\intercal(I_p \otimes X^\intercal) (V \otimes U) (I_p \otimes X)E + Q_0 \\ &= E^\intercal(I_p \otimes X^\intercal) ((VI_p) \otimes (UX))E + Q_0 \\ &= E^\intercal((I_p V I_p) \otimes (X^\intercal U X))E + Q_0 \\ &= E^\intercal (V \otimes (X^\intercal U X))E + Q_0 \\ &= \sum_j E^\intercal (V \otimes (X^\intercal U X))(e_j \otimes (e_j e_j^\intercal)) + Q_0 \\ &= \sum_j E^\intercal ((Ve_j) \otimes (X^\intercal U X e_j e_j^\intercal)) + Q_0 \\ &= \sum_i \sum_j (e_i^\intercal Ve_j) \otimes (e_i e_i^\intercal X^\intercal U X e_j e_j^\intercal) + Q_0 \\ &= \sum_i \sum_j V_{ij} (e_i (X^\intercal U X)_{ij} e_j^\intercal) + Q_0 \\ &= \sum_i \sum_j V_{ij}(X^\intercal U X)_{ij} e_i e_j^\intercal + Q_0 \\ &= V \odot (X^\intercal U X) + Q_0 \end{split} \end{equation}
Then we get the mean as in \eqref{eq:matnormmu}
\begin{equation} \begin{split} \tilde{\mu} &= \tilde{Q}^{-1}\left[E^\intercal\vect{(X^\intercal U Y V)} + Q_0 \mu_0 \right] \\ &= \tilde{Q}^{-1}\left[\diag(X^\intercal U Y V) + Q_0 \mu_0 \right] \end{split} \end{equation}
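The following Python/NumPy sketch (again illustrative, using the column-stacking $\vect$ and an $E$ whose $i$-th column is $\vect(e_i e_i^\intercal)$) checks the Hadamard-product precision $\tilde{Q} = V \odot (X^\intercal U X) + Q_0$ and the $\diag(X^\intercal U Y V)$ mean term against the $E$-matrix route:
\begin{verbatim}
# Sketch: diagonal-beta case.  Checks  Q~ = V .* (X'UX) + Q0  and the
# E-matrix route, where column i of E equals vec(e_i e_i^T).
import numpy as np

rng = np.random.default_rng(2)
n, p = 7, 3                              # X is n x p, beta is p x 1
X = rng.standard_normal((n, p))
Y = rng.standard_normal((n, p))
A = rng.standard_normal((n, n)); U = A @ A.T + n * np.eye(n)
B = rng.standard_normal((p, p)); V = B @ B.T + p * np.eye(p)
C = rng.standard_normal((p, p)); Q0 = C @ C.T + p * np.eye(p)
mu0 = rng.standard_normal(p)

vec = lambda M: M.reshape(-1, order="F")
E = np.zeros((p * p, p))
for i in range(p):
    E[i * p + i, i] = 1.0                # column i = vec(e_i e_i^T)

XtUX = X.T @ U @ X
Q_tilde  = V * XtUX + Q0                 # Hadamard product
Q_brute  = E.T @ np.kron(V, XtUX) @ E + Q0
mu_tilde = np.linalg.solve(Q_tilde, np.diag(X.T @ U @ Y @ V) + Q0 @ mu0)
mu_brute = np.linalg.solve(Q_brute, E.T @ vec(X.T @ U @ Y @ V) + Q0 @ mu0)

print(np.allclose(Q_tilde, Q_brute), np.allclose(mu_tilde, mu_brute))  # True True
\end{verbatim}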
More generally, here we consider the case where $\beta$ has structural zeros. This is equivalent to setting the corresponding entries of $Q_0$ to $\infty$ (in fact, that is what the code does in some cases). However, when $\beta$ is sparse, we can gain efficiency by not computing the Kronecker product of $V$ and $U$.
Let $\beta$ be $m$ by $p$, and let $X$ be $n$ by $m$. Define an $m$ by $p$ matrix $Z$ such that $Z_{ij} = 0 \iff \beta_{ij} \equiv 0$, and $Z_{ij} = 1$ otherwise. Then let $\beta_Z$ denote the vector of the nonzero entries of $\vect(\beta)$, in vectorization order.
We will further design a matrix $E$ which is $mp$ by $z = \sum_{ij} Z_{ij}$. For $i \in \{1, \dots, z\}$, let the $i$-th nonzero element of $Z$ be counted down rows first, then across columns (as in standard vectorization); let $m(i)$ denote the row and $p(i)$ the column of that element in $Z$. Then we define $$E = \sum_{i=1}^{z} e^p_{p(i)} \otimes (e^m_{m(i)} {e^z_{i}}^\intercal)$$
Alternatively, we can define $E$ as the column-subset of the $mp$ by $mp$ identity matrix, whose columns correspond to the $z$ nonzero entries of $\vect Z$.
Note then that $E\beta_Z = \vect\beta$ and that $\beta_Z = E^\intercal \vect\beta$.
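A quick Python/NumPy sketch of the column-subset construction of $E$ (hypothetical names, arbitrary sparsity pattern) confirms these two identities:
\begin{verbatim}
# Sketch: build E as the column subset of the mp x mp identity matrix
# selected by the nonzero pattern Z, and check E beta_Z = vec(beta).
import numpy as np

rng = np.random.default_rng(3)
m, p = 4, 3
Z = (rng.uniform(size=(m, p)) < 0.5).astype(float)   # arbitrary sparsity pattern
beta = rng.standard_normal((m, p)) * Z               # structural zeros imposed

vec = lambda M: M.reshape(-1, order="F")
idx = np.flatnonzero(vec(Z))             # nonzero positions of vec(Z)
E = np.eye(m * p)[:, idx]                # mp x z selection matrix
beta_Z = vec(beta)[idx]                  # nonzero entries of vec(beta)

print(np.allclose(E @ beta_Z, vec(beta)))        # True
print(np.allclose(beta_Z, E.T @ vec(beta)))      # True
\end{verbatim}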
So then we have
\begin{equation} \begin{split} \vect(Y) &\sim \norm((I_p \otimes X)E\beta_Z, V \otimes U), \qquad \text{where } (I_p \otimes X)E\beta_Z = (I_p \otimes X)\vect(\beta) = \vect(X\beta) \\ \beta_Z &\sim \norm(\mu_0, Q_0) \end{split} \end{equation}
We jump to an intermediate result from the diagonal $\beta$ derivation:
\begin{equation} \begin{split} \tilde{Q} &= E^\intercal (V \otimes (X^\intercal U X))E + Q_0 \\ &= Q_0 + \sum_i \sum_j ({e^p_{p(i)}}^\intercal V e^p_{p(j)}) \otimes (e^z_{i} {e^m_{m(i)}}^\intercal X^\intercal U X e^m_{m(j)} {e^z_{j}}^\intercal) \\ &= Q_0 + \sum_i \sum_j V_{p(i)p(j)} (e^z_{i} (X^\intercal U X)_{m(i)m(j)} {e^z_{j}}^\intercal) \\ &= Q_0 + \sum_i \sum_j V_{p(i)p(j)}(X^\intercal U X)_{m(i)m(j)} e^z_{i} {e^z_{j}}^\intercal \end{split} \end{equation}
That is, entrywise, $\tilde{Q}_{ij} = V_{p(i)p(j)}(X^\intercal U X)_{m(i)m(j)} + (Q_0)_{ij}$.
Then we get the mean:
\begin{equation} \begin{split} \tilde{\mu} &= \tilde{Q}^{-1}\left[E^\intercal\vect{(X^\intercal U Y V)} + Q_0 \mu_0 \right] \\ &= \tilde{Q}^{-1}\left[(X^\intercal U Y V)_Z + Q_0 \mu_0 \right] \end{split} \end{equation}
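As a numerical check, the sketch below (Python/NumPy, illustrative names) builds $\tilde{Q}$ entrywise from the formula above and $\tilde{\mu}$ from $(X^\intercal U Y V)_Z$, and compares both against the $E$-matrix computation:
\begin{verbatim}
# Sketch: structural-zeros posterior without forming kron(V, U).
import numpy as np

rng = np.random.default_rng(4)
n, m, p = 8, 4, 3
X = rng.standard_normal((n, m))
Y = rng.standard_normal((n, p))
A = rng.standard_normal((n, n)); U = A @ A.T + n * np.eye(n)
B = rng.standard_normal((p, p)); V = B @ B.T + p * np.eye(p)
Z = (rng.uniform(size=(m, p)) < 0.6).astype(float)

vec = lambda M: M.reshape(-1, order="F")
idx = np.flatnonzero(vec(Z))                 # nonzero positions of vec(Z)
z = idx.size
rows, cols = idx % m, idx // m               # m(i) and p(i), 0-based
C = rng.standard_normal((z, z)); Q0 = C @ C.T + z * np.eye(z)
mu0 = rng.standard_normal(z)

XtUX = X.T @ U @ X
# Entrywise formula: Q~_ij = V_{p(i)p(j)} (X'UX)_{m(i)m(j)} + (Q0)_ij
Q_tilde  = V[np.ix_(cols, cols)] * XtUX[np.ix_(rows, rows)] + Q0
mu_tilde = np.linalg.solve(Q_tilde, vec(X.T @ U @ Y @ V)[idx] + Q0 @ mu0)

# Brute force via the selection matrix E
E = np.eye(m * p)[:, idx]
Q_brute  = E.T @ np.kron(V, XtUX) @ E + Q0
mu_brute = np.linalg.solve(Q_brute, E.T @ vec(X.T @ U @ Y @ V) + Q0 @ mu0)

print(np.allclose(Q_tilde, Q_brute), np.allclose(mu_tilde, mu_brute))  # True True
\end{verbatim}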
In this scenario, we seek the posterior for $\beta$ in the multivariate normal regression model
\begin{equation} \begin{split} y &\sim \norm(X\beta, Q) \\ \beta &\sim \norm(\mu_0, Q_0) \end{split} \end{equation}
This is a specific case of the matrix normal regression, with $V \equiv 1$ and $U \equiv Q$. We immediately get the posterior precision and mean: \begin{equation} \begin{split} \tilde{Q} &= (X^\intercal Q X) + Q_0 \\ \tilde{\mu} &= \tilde{Q}^{-1}(X^\intercal Q y + Q_0 \mu_0) \end{split} \end{equation}
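A minimal Python/NumPy sketch of this update, assuming the precision parameterization throughout (names are illustrative):
\begin{verbatim}
# Sketch: multivariate normal regression posterior (precision parameterization).
import numpy as np

rng = np.random.default_rng(5)
n, m = 10, 4                             # y is length n, X is n x m
X = rng.standard_normal((n, m))
y = rng.standard_normal(n)
A = rng.standard_normal((n, n)); Q = A @ A.T + n * np.eye(n)    # noise precision
C = rng.standard_normal((m, m)); Q0 = C @ C.T + m * np.eye(m)   # prior precision
mu0 = rng.standard_normal(m)

Q_tilde  = X.T @ Q @ X + Q0
mu_tilde = np.linalg.solve(Q_tilde, X.T @ Q @ y + Q0 @ mu0)

# Same result from the matrix normal regression formulas with p = 1, V = [1]
V1 = np.array([[1.0]])
Q_gen  = np.kron(V1, X.T @ Q @ X) + Q0
mu_gen = np.linalg.solve(Q_gen,
                         (X.T @ Q @ y.reshape(n, 1) @ V1).ravel(order="F") + Q0 @ mu0)
print(np.allclose(Q_tilde, Q_gen), np.allclose(mu_tilde, mu_gen))   # True True
\end{verbatim}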
We set $Q \equiv \tau I_n$ for independence among observations:
\begin{equation} \begin{split} y &\sim \norm(X\beta, \tau I_n) \\ \beta &\sim \norm(\mu_0, Q_0) \end{split} \end{equation}
We immediately get the posterior precision and mean: \begin{equation} \begin{split} \tilde{Q} &= \tau X^\intercal X + Q_0 \\ \tilde{\mu} &= \tilde{Q}^{-1}(\tau X^\intercal y + Q_0 \mu_0) \end{split} \end{equation}
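The sketch below (Python/NumPy, illustrative names) checks the special-case forms against the general ones with $Q = \tau I_n$:
\begin{verbatim}
# Sketch: i.i.d. noise case, Q = tau * I_n, checked against the general formula.
import numpy as np

rng = np.random.default_rng(6)
n, m, tau = 10, 4, 2.5
X = rng.standard_normal((n, m))
y = rng.standard_normal(n)
C = rng.standard_normal((m, m)); Q0 = C @ C.T + m * np.eye(m)
mu0 = rng.standard_normal(m)

# Special-case forms (X'X and X'y could be precomputed once)
Q_tilde  = tau * X.T @ X + Q0
mu_tilde = np.linalg.solve(Q_tilde, tau * X.T @ y + Q0 @ mu0)

# General forms with Q = tau * I_n
Q = tau * np.eye(n)
Q_gen  = X.T @ Q @ X + Q0
mu_gen = np.linalg.solve(Q_gen, X.T @ Q @ y + Q0 @ mu0)
print(np.allclose(Q_tilde, Q_gen), np.allclose(mu_tilde, mu_gen))   # True True
\end{verbatim}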
In this scenario, we seek the posterior for the mean $\beta$ in the model
\begin{equation} \begin{split} Y &\sim \mathcal{MN}_{n,p}(\beta, U, V) \\ \vect(\beta) &\sim \norm(\mu_0, Q_0) \end{split} \end{equation}
If there is only one realization, this is a specific case of the matrix normal regression, with $X \equiv I_n$. We immediately get the posterior precision and mean: \begin{equation} \begin{split} \tilde{Q} &= V \otimes U + Q_0 \\ \tilde{\mu} &= \tilde{Q}^{-1}\left[\vect{(UYV)} + Q_0 \mu_0 \right] \end{split} \end{equation}
Again, if $U$ is the identity, we collapse even further to \begin{equation} \tilde{\mu} = \tilde{Q}^{-1}\left[\vect{(YV)} + Q_0 \mu_0 \right] \end{equation}
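A short Python/NumPy sketch (illustrative names) verifying that the $X \equiv I_n$ specialization matches the general matrix normal regression formulas:
\begin{verbatim}
# Sketch: matrix normal mean posterior (X = I_n case of the regression).
import numpy as np

rng = np.random.default_rng(7)
n, p = 4, 3
Y = rng.standard_normal((n, p))
A = rng.standard_normal((n, n)); U = A @ A.T + n * np.eye(n)
B = rng.standard_normal((p, p)); V = B @ B.T + p * np.eye(p)
C = rng.standard_normal((n*p, n*p)); Q0 = C @ C.T + n*p*np.eye(n*p)
mu0 = rng.standard_normal(n * p)

vec = lambda M: M.reshape(-1, order="F")
Q_tilde  = np.kron(V, U) + Q0
mu_tilde = np.linalg.solve(Q_tilde, vec(U @ Y @ V) + Q0 @ mu0)

# Same thing via the general regression formulas with X = I_n
X = np.eye(n)
Q_gen  = np.kron(V, X.T @ U @ X) + Q0
mu_gen = np.linalg.solve(Q_gen, vec(X.T @ U @ Y @ V) + Q0 @ mu0)
print(np.allclose(Q_tilde, Q_gen), np.allclose(mu_tilde, mu_gen))   # True True
\end{verbatim}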
In this scenario, we seek the posterior for the mean $\mu$ in the multivariate normal model
\begin{equation} \begin{split} y &\sim \norm(\mu, Q) \\ \mu &\sim \norm(\mu_0, Q_0) \end{split} \end{equation}
If there is only one realization, this is a specific case of the normal regression, with $X \equiv I_p$. However, usually more than one realization is observed. We'll call the number of realizations $n$.
The posterior precision and mean are then
\begin{equation} \begin{split} \tilde{Q} &= nQ + Q_0 \\ \tilde{\mu} &= \tilde{Q}^{-1}\left(Q \sum_{i=1}^{n}y_i + Q_0 \mu_0\right) \end{split} \end{equation}
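One way to sanity-check these formulas is against the stacked-regression view, i.e., a design $\mathbf{1}_n \otimes I_p$ with noise precision $I_n \otimes Q$; the following Python/NumPy sketch (illustrative names, not from the source) does exactly that:
\begin{verbatim}
# Sketch: multivariate normal mean with n realizations, checked against the
# stacked-regression view (X = 1_n kron I_p, noise precision I_n kron Q).
import numpy as np

rng = np.random.default_rng(8)
n, p = 6, 3
Ys = rng.standard_normal((n, p))                 # row i is realization y_i
A = rng.standard_normal((p, p)); Q = A @ A.T + p * np.eye(p)
C = rng.standard_normal((p, p)); Q0 = C @ C.T + p * np.eye(p)
mu0 = rng.standard_normal(p)

# Closed form
Q_tilde  = n * Q + Q0
mu_tilde = np.linalg.solve(Q_tilde, Q @ Ys.sum(axis=0) + Q0 @ mu0)

# Stacked regression check
X = np.kron(np.ones((n, 1)), np.eye(p))          # (np) x p design
W = np.kron(np.eye(n), Q)                        # block-diagonal precision
y = Ys.reshape(-1)                               # stack y_1, ..., y_n
Q_gen  = X.T @ W @ X + Q0
mu_gen = np.linalg.solve(Q_gen, X.T @ W @ y + Q0 @ mu0)
print(np.allclose(Q_tilde, Q_gen), np.allclose(mu_tilde, mu_gen))   # True True
\end{verbatim}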
In this scenario, we seek the posterior for the mean $\mu$ in the univariate normal model
\begin{equation} \begin{split} y &\sim \norm(\mu, \tau) \\ \mu &\sim \norm(\mu_0, \tau_0) \end{split} \end{equation}
This is a specific case of the multivariate normal, with $Q \equiv \tau$. We immediately get the posterior precision and mean: \begin{equation} \begin{split} \tilde{\tau} &= n\tau + \tau_0 \\ \tilde{\mu} &= \tilde{\tau}^{-1}\left(\tau \sum_{i=1}^{n}y_i + \tau_0 \mu_0\right) \end{split} \end{equation}
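A tiny Python/NumPy example of this update (made-up numbers, illustrative only):
\begin{verbatim}
# Sketch: univariate normal mean with known precision tau.
import numpy as np

rng = np.random.default_rng(9)
tau, tau0, mu0 = 2.0, 0.5, 1.0           # likelihood precision, prior precision/mean
y = rng.normal(loc=3.0, scale=1/np.sqrt(tau), size=20)
n = y.size

tau_tilde = n * tau + tau0
mu_tilde  = (tau * y.sum() + tau0 * mu0) / tau_tilde
print(tau_tilde, mu_tilde)
\end{verbatim}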
In this scenario, we seek the posterior for the column precision matrix $V$ in the model
\begin{equation} \begin{split} Y &\sim \mathcal{MN}_{n,p}(\beta, U, V) \\ V &\sim \Wish(V_0, v_0) \end{split} \end{equation}
We take the $V$-dependent parts of the trace form of the matrix normal density and multiply them by the relevant parts of the Wishart density:
\begin{equation} \vert V \vert^{n/2} \exp{\left(-\frac{1}{2}\Tr{[V(Y - \beta)^\intercal U (Y - \beta)]}\right)} \cdot \vert V \vert^{(v_0 - p - 1)/2} \exp{\left(-\frac{1}{2}\Tr{[VV_0^{-1}]}\right)} \end{equation}
It's clear that the trace term becomes
\begin{equation} \Tr{[V(V_0^{-1} + (Y - \beta)^\intercal U (Y - \beta))]} \end{equation} and the determinant term becomes \begin{equation} \vert V \vert ^ {(n + v_0 - p - 1)/2} \end{equation}
So that we get \begin{equation} V \sim \Wish{\left((V_0^{-1} + (Y - \beta)^\intercal U (Y - \beta))^{-1}, \; n + v_0\right)} \end{equation}
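Finally, a Python/NumPy sketch (illustrative names) that checks, for a few random positive definite $V$, that the $V$-dependent log kernel of likelihood times prior equals the log kernel of the claimed Wishart posterior:
\begin{verbatim}
# Sketch: check that the matrix normal x Wishart kernel in V matches the
# claimed Wishart posterior kernel, term by term in V.
import numpy as np

rng = np.random.default_rng(10)
n, p = 6, 3
Y = rng.standard_normal((n, p))
Beta = rng.standard_normal((n, p))                           # mean matrix
A = rng.standard_normal((n, n)); U  = A @ A.T + n * np.eye(n)
B = rng.standard_normal((p, p)); V0 = B @ B.T + p * np.eye(p)
v0 = p + 4.0

S  = np.linalg.inv(V0) + (Y - Beta).T @ U @ (Y - Beta)       # posterior scale inverse
Vn = np.linalg.inv(S)                                        # posterior scale
vn = n + v0                                                  # posterior df

logdet = lambda M: np.linalg.slogdet(M)[1]
def lhs(V):   # V-part of matrix normal density times Wishart prior, log kernel
    return (0.5 * n * logdet(V) - 0.5 * np.trace(V @ (Y - Beta).T @ U @ (Y - Beta))
            + 0.5 * (v0 - p - 1) * logdet(V) - 0.5 * np.trace(V @ np.linalg.inv(V0)))
def rhs(V):   # claimed Wishart(Vn, vn) log kernel
    return 0.5 * (vn - p - 1) * logdet(V) - 0.5 * np.trace(V @ np.linalg.inv(Vn))

for _ in range(3):
    G = rng.standard_normal((p, p)); V = G @ G.T + p * np.eye(p)
    print(np.isclose(lhs(V), rhs(V)))                        # True each time
\end{verbatim}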