library(igraph) library(plyr) library(dplyr) library(tidyr) library(stringr) library(ggplot2) library(devtools) library(gridExtra) load_all() knitr::opts_chunk$set(echo=FALSE, message=FALSE, fig.width=12, fig.height=12)
This file shows the (analytically-derived) expected values for several quantities under the stochastic block model simulation design. There are three variants of the design: simple, nested, and two-parameter. These can be distinguished by the mixing matrices, as shown below.
In the simple mixing matrix, the probability of a tie between any two vertices that are not in exactly the same group is multiplied by $\rho$:
$$ \begin{matrix} & ~F~H & ~F \lnot H & \lnot F ~H & \lnot F \lnot H \ ~F~H & \zeta & \rho\cdot\zeta & \rho\cdot\zeta & \rho\cdot\zeta \ ~F \lnot H & \rho\cdot\zeta & \zeta & \rho\cdot\zeta & \rho\cdot\zeta \ \lnot F ~H & \rho\cdot\zeta & \rho\cdot\zeta & \zeta & \rho\cdot\zeta \ \lnot F \lnot H & \rho\cdot\zeta & \rho\cdot\zeta & \rho\cdot\zeta & \zeta \end{matrix} $$
In the nested mixing matrix, the probability of a tie between any two vertices who differ in one dimension (membership in $F$ or membership in $H$) is multiplied by $\rho$; the probability of a tie between two vertices that differ on both dimensions is multiplied by $\rho^2$:
$$ \begin{matrix} & ~F~H & ~F \lnot H & \lnot F ~H & \lnot F \lnot H \ ~F~H & \zeta & \rho\cdot\zeta & \rho\cdot\zeta & \rho^2\cdot\zeta \ ~F \lnot H & \rho\cdot\zeta & \zeta & \rho^2\cdot\zeta& \rho\cdot\zeta \ \lnot F ~H & \rho\cdot\zeta & \rho^2\cdot\zeta& \zeta & \rho\cdot\zeta \ \lnot F \lnot H & \rho^2\cdot\zeta& \rho\cdot\zeta & \rho\cdot\zeta & \zeta \end{matrix} $$
In the two-parameter mixing matrix, the probability of a tie between two vertices that differ in membership in $F$ is multiplied by $\xi$; and, independently, the probability of a tie between two vertices that differ in membership in $H$ is multiplied by $\rho$:
$$ \begin{matrix} & ~F~H & ~F \lnot H & \lnot F ~H & \lnot F \lnot H \ ~F~H & \zeta & \rho\cdot\zeta & \xi\cdot\zeta & \xi\cdot\rho\cdot\zeta \ ~F \lnot H & \rho\cdot\zeta & \zeta & \xi\cdot\rho\cdot\zeta & \xi\cdot\zeta \ \lnot F ~H & \xi\cdot\zeta & \xi\cdot\rho\cdot\zeta & \zeta & \rho\cdot\zeta \ \lnot F \lnot H & \xi\cdot\rho\cdot\zeta & \xi\cdot\zeta & \rho\cdot\zeta & \zeta \end{matrix} $$
The expected values for the number of connections between groups or sets of groups depends on the structure of the mixing matrix. Write the number of vertices in each of the four blocks as a vector, $\mathbf{n}$.
Suppose we wish to count the number of expected connections from a set $A$ to another set $B$, with both $A$ and $B$ built up from one or more of the four blocks. Let $\mathbf{a}$ be the 0/1 vector that indicates which blocks $A$ contains, and $\mathbf{b}$ be the 0/1 vector that indicates which blocks $B$ contains. The size of A is then $N_A = \mathbf{a}^T \mathbf{n}$.
Let $\mathbf{a}_N = \mathbf{a} * \mathbf{n}$, where $*$ denotes element-wise multiplication. Then
$$ \begin{equation} d_{A,B} = \mathbf{a}_N^T ~M~ \mathbf{b}_N - (\mathbf{a} * \mathbf{b} * \mathbf{n})^T \text{diag}(M). \end{equation} $$
To understand this expression, note that the number of edges between two distinct blocks $i$ and $j$ will be $N_i~N_j~M_{ij}$. Since we do not permit self-loops, the number of edges from one block to itself will be $\frac{N_i~(N_i - 1)}{2}~M_{ii}$. The expression
$$ \begin{equation} \mathbf{a}N^T~M~\mathbf{b}_N = \sum{i \in b(A)} \sum_{j \in b(B)} N_i~N_j~M_{ij} \end{equation} $$ where $b(A)$ and $b(B)$ are the sets of blocks consisting of $A$ and $B$ is almost what we want. The problem is that each block $i$ in both $b(A)$ and $b(B)$ is sending edges to itself. In the equation above, it contributes $N_i^2~M_{i,i}$ instead of $\frac{N_i~(N_i - 1)}{2}~M_{i,i}$. So instead of the equation above, we actually want
$$ \begin{equation} d_{A, B} = \sum_{i \in b(A)} \sum_{j \in b(B)} N_i~N_j~M_{ij} - \sum_{i \in b(A) \cap b(B)} \frac{(N_i^2 + N_i)}{2}~M_{i,i}, \end{equation} $$
In this equation, the subtracted term accounts for the difference between $N_i^2$ and $\frac{N_i~(N_i - 1)}{2}$.
Next, we will look at the expected values for several important quantities under all of the versions of the stochastic block-model.
step.size <- .2 param.vals <- expand.grid(# size of total population, U N=10e3, # frame population is half of total p.F=seq(from=.2, to=1, by=step.size), # hidden popn is 3 percent of total p.H=.03, p.F.given.H=1, # prob of edge between two nodes in the same group zeta=.05, xi=seq(from=.2, to=1, by=step.size), rho=seq(from=.2, to=1, by=step.size), tau=seq(from=.2, to=1, by=step.size)) all.param.list <- split(param.vals, rownames(param.vals))
## get a list of parameter objects based on the 1 param/nested specification all.param.list.nested <- llply(all.param.list, sbm_params, type="4group_1param_nested") res.nested <- ldply(all.param.list.nested, sbm_ev) res.nested$type <- "nested" ## now do the same for the simple version all.param.list.simple <- llply(all.param.list, sbm_params, type="4group_1param_simple") res.simple <- ldply(all.param.list.simple, sbm_ev) res.simple$type <- "simple" ## now do the same for the two-param version all.param.list.twoparam <- llply(all.param.list, sbm_params, type="4group_2param") res.twoparam <- ldply(all.param.list.twoparam, sbm_ev) res.twoparam$type <- "twoparam" res <- rbind(res.nested, res.simple, res.twoparam) res.nsum <- res %>% mutate(estimate=tau*d.F.H / dbar.U.F, estimator="nsum.est") res.ansum <- res %>% mutate(estimate=tau*d.F.H / dbar.F.F, estimator="adapted.est") res.gnsum <- res %>% mutate(estimate=tau*d.F.H / (tau * dbar.H.F), estimator="generalized.est") res.both <- rbind(res.nsum, res.ansum, res.gnsum) res.both <- res.both %>% mutate(N.H = N*p.H) res.both <- res.both %>% mutate(bias = N.H - estimate) ## give estimators nice labels res.both <- res.both %>% mutate(estimator.label=ifelse(estimator=="generalized", "generalized scale-up", ifelse(estimator=="adapted", "adapted scale-up", "basic scale-up")))
res.vals <- c(.2, .6, 1) dbl.in <- function(a, vals, tol=1e-10) { laply(a, function(x) { return(any(abs(x-vals) < tol)) }) } xi.fixed <- 0.6 toplot.fix.xi <- res.both %>% filter(dbl.in(tau, res.vals), dbl.in(p.F, res.vals), dbl.in(xi, xi.fixed)) %>% mutate(type=ifelse(type=="twoparam", paste0("twoparam (xi=", xi.fixed, ")"), type)) rho.fixed <- 0.6 toplot.fix.rho <- res.both %>% filter(dbl.in(tau, res.vals), dbl.in(p.F, res.vals), dbl.in(rho, rho.fixed)) %>% mutate(type=ifelse(type=="twoparam", paste0("twoparam (rho=", rho.fixed, ")"), type))
First we'll look at these holding $\xi$ fixed at r xi.fixed
(which only affects the twoparam model):
phi.byrho.lim <- ggplot(toplot.fix.xi) + geom_line(aes(x=rho, y=phi.F, color=type, linetype=type)) + geom_hline(aes(yintercept=1), linetype=1, color="gray") + facet_grid(tau ~ p.F, labeller = labeller(rows=label_bquote(tau == .(tau)), cols=label_bquote(p[F] == .(p.F)))) + xlab(bquote(amount~of~inhomogenous~mixing~(rho))) + ylab(bquote(paste("frame ratio (", phi[F], ")"))) + theme(legend.position="bottom", axis.text.x=element_text(angle=90, vjust=0.5), panel.border=element_rect(size=.5, fill=NA), panel.background=element_blank(), panel.grid.major=element_blank(), panel.grid.minor=element_blank()) delta.byrho.lim <- ggplot(toplot.fix.xi) + geom_line(aes(x=rho, y=delta.F, color=type, linetype=type)) + geom_hline(aes(yintercept=1), linetype=1, color="gray") + facet_grid(tau ~ p.F, labeller = labeller(rows=label_bquote(tau == .(tau)), cols=label_bquote(p[F] == .(p.F)))) + xlab(bquote(amount~of~inhomogenous~mixing~(rho))) + ylab(bquote(paste("degree ratio (", delta[F], ")"))) + theme(legend.position="bottom", axis.text.x=element_text(angle=90, vjust=0.5), panel.border=element_rect(size=.5, fill=NA), panel.background=element_blank(), panel.grid.major=element_blank(), panel.grid.minor=element_blank()) phidelta.byrho.lim <- ggplot(toplot.fix.xi) + geom_line(aes(x=rho, y=deltaXphi.F, color=type, linetype=type)) + geom_hline(aes(yintercept=1), linetype=1, color="gray") + facet_grid(tau ~ p.F, labeller = labeller(rows=label_bquote(tau == .(tau)), cols=label_bquote(p[F] == .(p.F)))) + xlab(bquote(amount~of~inhomogenous~mixing~(rho))) + ylab(bquote(paste("partial adjustment factor (", delta[F]~phi[F], ")"))) + theme(legend.position="bottom", axis.text.x=element_text(angle=90, vjust=0.5), panel.border=element_rect(size=.5, fill=NA), panel.background=element_blank(), panel.grid.major=element_blank(), panel.grid.minor=element_blank()) grid.arrange(phi.byrho.lim, delta.byrho.lim, phidelta.byrho.lim, nrow=3)
Next we'll look at these holding $\rho$ fixed at r rho.fixed
(which only affects the twoparam model):
phi.byxi.lim <- ggplot(toplot.fix.rho) + geom_line(aes(x=xi, y=phi.F, color=type, linetype=type)) + geom_hline(aes(yintercept=1), linetype=1, color="gray") + facet_grid(tau ~ p.F, labeller = labeller(rows=label_bquote(tau == .(tau)), cols=label_bquote(p[F] == .(p.F)))) + xlab(bquote(amount~of~inhomogenous~mixing~(xi))) + ylab(bquote(paste("frame ratio (", phi[F], ")"))) + theme(legend.position="bottom", axis.text.x=element_text(angle=90, vjust=0.5), panel.border=element_rect(size=.5, fill=NA), panel.background=element_blank(), panel.grid.major=element_blank(), panel.grid.minor=element_blank()) delta.byxi.lim <- ggplot(toplot.fix.rho) + geom_line(aes(x=xi, y=delta.F, color=type, linetype=type)) + geom_hline(aes(yintercept=1), linetype=1, color="gray") + facet_grid(tau ~ p.F, labeller = labeller(rows=label_bquote(tau == .(tau)), cols=label_bquote(p[F] == .(p.F)))) + xlab(bquote(amount~of~inhomogenous~mixing~(xi))) + ylab(bquote(paste("degree ratio (", delta[F], ")"))) + theme(legend.position="bottom", axis.text.x=element_text(angle=90, vjust=0.5), panel.border=element_rect(size=.5, fill=NA), panel.background=element_blank(), panel.grid.major=element_blank(), panel.grid.minor=element_blank()) phidelta.byxi.lim <- ggplot(toplot.fix.rho) + geom_line(aes(x=xi, y=deltaXphi.F, color=type, linetype=type)) + geom_hline(aes(yintercept=1), linetype=1, color="gray") + facet_grid(tau ~ p.F, labeller = labeller(rows=label_bquote(tau == .(tau)), cols=label_bquote(p[F] == .(p.F)))) + xlab(bquote(amount~of~inhomogenous~mixing~(xi))) + ylab(bquote(paste("partial adjustment factor (", delta[F]~phi[F], ")"))) + theme(legend.position="bottom", axis.text.x=element_text(angle=90, vjust=0.5), panel.border=element_rect(size=.5, fill=NA), panel.background=element_blank(), panel.grid.major=element_blank(), panel.grid.minor=element_blank()) grid.arrange(phi.byxi.lim, delta.byxi.lim, phidelta.byxi.lim, nrow=3)
These plots show the bias in three different scale-up estimators under the two-parameter model when $\tau=1$.
toplot.bias.twoparam <- res.both %>% filter( #dbl.in(tau, res.vals), dbl.in(tau, 1), #dbl.in(rho, res.vals), dbl.in(p.F, res.vals) #dbl.in(xi, 0.6), #dbl.in(xi, res.vals) ) %>% filter(type == "twoparam") bias.twoparam <- ggplot(toplot.bias.twoparam) + #geom_line(aes(x=rho, y=bias, color=xi, group=xi)) + geom_line(aes(x=rho, y=abs(bias), color=estimator)) + geom_hline(aes(yintercept=1), linetype=1, color="gray") + facet_grid(xi ~ p.F, labeller = labeller(rows=label_bquote(xi == .(xi)), cols=label_bquote(p[F] == .(p.F)))) + xlab(bquote(amount~of~inhomogenous~mixing~(rho))) + ylab(bquote(paste("|bias|"))) + theme(legend.position="bottom", axis.text.x=element_text(angle=90, vjust=0.5), panel.border=element_rect(size=.5, fill=NA), panel.background=element_blank(), panel.grid.major=element_blank(), panel.grid.minor=element_blank()) + ggtitle(bquote(paste(tau, "=1"))) bias.twoparam
## helper fns for plots estimator_shape_scale <- scale_shape_manual(values=c("generalized scale-up"=5, "adapted scale-up"=6, "basic scale-up"=1))
bnsum.ev.byrho.lim <- ggplot(toplot.fix.xi %>% filter(estimator == "nsum.est")) + geom_line(aes(x=rho, y=estimate, color=type, linetype=type)) + geom_hline(aes(yintercept=N.H), linetype=1, color="gray") + estimator_shape_scale + facet_grid(tau ~ p.F, labeller = labeller(rows=label_bquote(tau == .(tau)), cols=label_bquote(p[F] == .(p.F)))) + xlab(bquote(amount~of~inhomogenous~mixing~(rho))) + theme(legend.position="bottom", axis.text.x=element_text(angle=90, vjust=0.5), panel.border=element_rect(size=.5, fill=NA), panel.background=element_blank(), panel.grid.major=element_blank(), panel.grid.minor=element_blank()) + ggtitle("Expected value: basic scale-up") bnsum.ev.byrho.lim
ansum.ev.byrho.lim <- ggplot(toplot.fix.xi %>% filter(estimator == "adapted.est")) + geom_line(aes(x=rho, y=estimate, color=type, linetype=type)) + geom_hline(aes(yintercept=N.H), linetype=1, color="gray") + estimator_shape_scale + facet_grid(tau ~ p.F, labeller = labeller(rows=label_bquote(tau == .(tau)), cols=label_bquote(p[F] == .(p.F)))) + xlab(bquote(amount~of~inhomogenous~mixing~(rho))) + theme(legend.position="bottom", axis.text.x=element_text(angle=90, vjust=0.5), panel.border=element_rect(size=.5, fill=NA), panel.background=element_blank(), panel.grid.major=element_blank(), panel.grid.minor=element_blank()) + ggtitle("Expected value: adapted scale-up") ansum.ev.byrho.lim
gnsum.ev.byrho.lim <- ggplot(toplot.fix.xi %>% filter(estimator == "generalized.est")) + geom_line(aes(x=rho, y=estimate, color=type, linetype=type)) + geom_hline(aes(yintercept=N.H), linetype=1, color="gray") + estimator_shape_scale + facet_grid(tau ~ p.F, labeller = labeller(rows=label_bquote(tau == .(tau)), cols=label_bquote(p[F] == .(p.F)))) + xlab(bquote(amount~of~inhomogenous~mixing~(rho))) + theme(legend.position="bottom", axis.text.x=element_text(angle=90, vjust=0.5), panel.border=element_rect(size=.5, fill=NA), panel.background=element_blank(), panel.grid.major=element_blank(), panel.grid.minor=element_blank()) + ggtitle("Expected value: generalized scale-up") gnsum.ev.byrho.lim
Now we look at a specific scenario for just the two-parameter model.
step.size <- .1 param.vals <- expand.grid(# size of total population, U N=10e2, # frame population is half of total p.F=seq(from=.2, to=1, by=step.size), # hidden popn is 3 percent of total p.H=.03, p.F.given.H=c(0, .5, 1, NA), # prob of edge between two nodes in the same group zeta=.05, xi=seq(from=.2, to=1, by=step.size), rho=seq(from=.2, to=1, by=step.size), #rho=.6, tau=1) #tau=seq(from=.2, to=1, by=step.size)) all.param.list <- split(param.vals, rownames(param.vals))
## get a list of parameter objects based on the 1 param/nested specification ## now do the same for the two-param version all.param.list.twoparam <- llply(all.param.list, sbm_params, type="4group_2param") res.twoparam <- ldply(all.param.list.twoparam, sbm_ev) res.twoparam$type <- "twoparam" res <- res.twoparam res.nsum <- res %>% mutate(estimate=tau*d.F.H / dbar.U.F, estimator="nsum.est") res.ansum <- res %>% mutate(estimate=tau*d.F.H / dbar.F.F, estimator="adapted.est") res.gnsum <- res %>% mutate(estimate=tau*d.F.H / (tau * dbar.H.F), estimator="generalized.est") res.both <- rbind(res.nsum, res.ansum, res.gnsum) res.both <- res.both %>% mutate(N.H = N*p.H) res.both <- res.both %>% mutate(bias = N.H - estimate) ## give estimators nice labels res.both <- res.both %>% mutate(estimator.label=ifelse(estimator=="generalized", "generalized scale-up", ifelse(estimator=="adapted", "adapted scale-up", "basic scale-up")))
tol <- 1e-2 ############# ## fixing rho at 1 res.fixrho <- res.both %>% filter(rho==1, tau==1) bias.twoparam <- ggplot(res.fixrho) + geom_line(aes(x=xi, y=abs(bias), color=estimator)) + facet_grid(p.F.given.H ~ p.F, labeller = labeller(rows=label_bquote(tau == .(tau)), cols=label_bquote(p[F] == .(p.F)))) + xlab(bquote(amount~of~inhomogenous~mixing~F~and~not~F~(xi))) + ylab(bquote(paste("|bias|"))) + theme(legend.position="bottom", axis.text.x=element_text(angle=90, vjust=0.5), panel.border=element_rect(size=.5, fill=NA), panel.background=element_blank(), panel.grid.major=element_blank(), panel.grid.minor=element_blank()) + ggtitle(bquote(paste(rho, "=1 and ", tau, "=1"))) bias.twoparam ############# ## fixing xi at 0.6 res.fixxi <- res.both %>% filter(dbl.in(xi, 0.6), tau==1) %>% filter(estimator != 'adapted.est') bias.twoparam <- ggplot(res.fixxi) + geom_line(aes(x=rho, y=abs(bias), color=estimator)) + facet_grid(p.F.given.H ~ p.F, labeller = labeller(rows=label_bquote(tau == .(tau)), cols=label_bquote(p[F] == .(p.F)))) + xlab(bquote(amount~of~inhomogenous~mixing~F~and~not~F~(xi))) + ylab(bquote(paste("|bias|"))) + theme(legend.position="bottom", axis.text.x=element_text(angle=90, vjust=0.5), panel.border=element_rect(size=.5, fill=NA), panel.background=element_blank(), panel.grid.major=element_blank(), panel.grid.minor=element_blank()) + ggtitle(bquote(paste(xi, "=0.6 and ", tau, "=1"))) bias.twoparam
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.