In dfeehan/nrsimulatr: network reporting simulations

library(knitr)
opts_chunk$set(eval=FALSE)

library(igraph)
library(dplyr)
library(tidyr)
library(stringr)

library(nrsimulatr)

Two-parameter model

set.seed(12345)

Now set up the params for one random draw.

We'll set this up to explore what happens when we change $\rho$, the relative probability of an edge between a hidden population member and a non-hidden population member. We'll hold all of the other parameters fixed:

$N = 10,000$, the population size
$p_F = 0.5$, the prevalence of the frame population
$p_H = 0.03$, the prevalence of the hidden population
$p_{F|H} = 1$, the fraction of nodes in $H$ that are also in $F$; here, all nodes in $H$ are also in $F$
$\zeta = 0.05$, the probability of an edge between any two nodes in the same group (e.g., between two nodes that are both in $F$ and both not in $H$)
$\tau=0.8$, a true positive rate corresponding to a moderate lack of awareness
the sampling fraction for the simple random sample we will take of the nodes in the frame population
the sampling fraction for the relative probability sample we will take from the hidden population

We first use expand.grid to build a dataframe containing all of the combinations of the parameters we will explore with the simulation.

sim.params <- expand.grid(# size of total population, U
                          N=10e3,
                          # fraction of the population that is in the frame population (F)
                          p.F=.5,
                          # fraction of the population that is in the hidden popn (H)
                          p.H=.03,
                          # probability of being in F, conditional on being in H
                          p.F.given.H=1,
                          # prob of edge between two nodes in the same group 
                          zeta=.05,
                          # relative prob of edge between hidden and not hidden 
                          rho=c(.5, .8, 1),
                          # relative prob of edge between frame and not frame
                          xi = 0.6,
                          # sampling fraction for simple random sample w/out replacement from F
                          frame.sampling.frac=.3,
                          # sampling fraction for relative probability sample from H
                          hidden.sampling.frac=.4,
                          # true positive rate
                          tau=.8)

# add an id for each parameter combination
sim.params$param.index <- 1:nrow(sim.params)

Next, we take our dataframe and turn it into a list of rows; then, for each entry in the list of rows, we call the function required to construct a simulation parameters object.

all.param.list <- split(sim.params, rownames(sim.params))

# we would change 'type' if we wanted to use a different type of
# simulation model -- for example, a one-parameter stochastic blockmodel with four groups
all.param.list <- plyr::llply(all.param.list,
                              sbm_params,
                              type="4group_2param")

Now we conduct the actual simulation.

## we'll repeat each set of parameter combinations this number of times
## (small here to keep this reasonably fast; in practice, you probably want many more)
num.sims <- 2

sim.ests <- plyr::ldply(all.param.list,
             function(these.sim.params) {

                # construct an object containing the parameters that govern
                # reporting (in our case, this will use our tau parameter to
                # simulate false negatives)
                rep.params <- imperfect_reporting(these.sim.params)

                # construct an object containing the parameters that govern
                # sampling from the frame population
                frame.sample.params <- frame_srs(list(sampling.frac=
                                                      these.sim.params$frame.sampling.frac))

                # construct an object containing the parameters that govern
                # sampling from the hidden population
                hidden.sample.params <- hidden_relprob(list(sampling.frac=
                                                            these.sim.params$hidden.sampling.frac,
                                                            inclusion.trait='d.degree'))

                this.N <- these.sim.params$N

                ## this goes through each repetition for the parameter set we're on
                these.res <- plyr::ldply(1:num.sims,
                                   function(this.idx) {

                                     # draw a random graph from the stochastic block model
                                     this.g <- generate_graph(these.sim.params)

                                     # use the reporting parameters to convert the
                                     # undirected social network into a directed
                                     # reporting graph
                                     this.r <- reporting_graph(rep.params, this.g)

                                     # get some quantities about the entire reporting graph
                                     # from a census
                                     census.df <- sample_graph(entire_census(), this.r)
                                     #res <- sbm_census_quantities(census.df)

                                     res <- list()

                                     # get sample-based estimates from
                                     # a sample of the frame population and a sample
                                     # of the hidden population
                                     frame.df <- sample_graph(frame.sample.params,
                                                              this.r)
                                     hidden.df <- sample_graph(hidden.sample.params,
                                                               this.r)

                                     # note that we will assume that the degrees
                                     # are correct (ie, we are not simulating the
                                     # known population method)
                                     res$sample.y.F.H <- with(frame.df,
                                                              sum((y.FH + y.notFH)*
                                                                  sampling.weight))

                                     res$sample.d.F.U <- with(frame.df,
                                                              sum(d.degree*sampling.weight))

                                     res$sample.dbar.F.F <- with(frame.df,
                                                                 sum((d.FH + d.FnotH)*
                                                                     sampling.weight)/
                                                                 sum(sampling.weight))

                                     res$sample.vbar.H.F <- with(hidden.df,
                                                              sum((v.FnotH + v.FH)*
                                                                  sampling.weight)/
                                                              sum(sampling.weight))

                                     res$sample.basic <- with(res,
                                                              (sample.y.F.H / sample.d.F.U)*
                                                              this.N)

                                     res$sample.adapted <- with(res,
                                                                sample.y.F.H / sample.dbar.F.F)

                                     res$sample.generalized <- with(res,
                                                                    sample.y.F.H / sample.vbar.H.F)

                                     # for convenience, keep track of which rep this is
                                     res$rep <- this.idx

                                     return(data.frame(res))
                                   })

                ## for convenience, keep track of which parameter set this is
                these.res$param.index <- these.sim.params$param.index

                return(these.res)
             })

Finally, summarize the results

ci_low <- function(x) { quantile(x, .025, na.rm=TRUE) }
ci_high <- function(x) { quantile(x, .975, na.rm=TRUE) }

sim.agg.byest <- sim.ests %>%
                 group_by(param.index) %>%
                 rename(basic=sample.basic, 
                        adapted=sample.adapted, 
                        generalized=sample.generalized) %>%
                 summarise_each(funs(mean, ci_low, ci_high),
                                basic, adapted, generalized) %>%
                 gather(qty, value,
                        matches("basic|adapted|generalized")) %>%
                 separate(qty, c("estimator", "qty"), sep="\\_", extra="merge") %>%
                 spread(qty, value)

# merge in parameter values
sim.params <- sim.params %>% mutate(N.H = p.H * N)

sim.agg.byest <- inner_join(sim.agg.byest, sim.params, by='param.index')

Plot the bias

sim.agg.byest <- sim.agg.byest %>% mutate(bias = mean - N.H) 

bias.tab <- sim.agg.byest %>% select(estimator, rho, bias) %>%
                              spread(estimator, bias)

kable(bias.tab)