knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

A Bayesian approach to weighted networks

SimuNet's simulation process can be assimilated to a network constructed via Bayesian inference over the network's edges. Through repeated Bayesian-inferred simulations, one can assess a network uncertainty by determining the distributions of simulated network metrics.

Simulated edge weight distribution

By replicating many simulations, one can assess a distribution of edges' weights:

library(SimuNet)
library(reshape2)
library(dplyr)
library(ggplot2)
library(cowplot)
library(extraDistr)

set.seed(42)
n <- 5L
samp.effort <- 100L
Adj <- sample(1:samp.effort,n * n) |>
  matrix(nrow = n,dimnames = list(letters[1:n],letters[1:n]))
Adj[lower.tri(Adj,diag = TRUE)] <- 0L
Adj

# Running a single simulation
sL <- simunet(Adj,samp.effort,"upper",100)
sL |> sum_scans()

# Replicating many simulations
sL.list <- replicate(n = 1000,simunet(Adj,samp.effort,"upper",100) |> sum_scans())

edge.weights <- 
  melt(sL.list,varnames = c("i","j","rep"),value.name = "weight") |> 
  subset(as.numeric(i) < as.numeric(j)) |> 
  mutate(ij = paste0(i,"-",j))
head(edge.weights)

These are for instance the distributions of three edge weights with small, medium and large weights:

edge.weights <- 
  Adj |> 
  melt(varnames = c("i","j"),value.name = "original.weight") |> 
  merge(x = edge.weights,by = c("i","j"))

edge.weights |> 
  subset((i == "c" & j == "e") | (i == "a" & j == "e") | (i == "c" & j == "d")) |> 
  ggplot(aes(x = weight,colour = ij,fill = ij))+
  facet_grid(ij ~ .)+
  geom_line(size = 1,lty = "dashed",
    aes(
      y = extraDistr::dbbinom(
        weight,
        size = samp.effort,
        alpha = original.weight + 0.5,
        beta = samp.effort - original.weight + 0.5
      )
    )
  )+
  geom_text(colour = "grey70",
    aes(
      x = 50,y = 0.12,
      label = paste0("Beta-Binomial(\u03B1 = ",samp.effort - original.weight + 0.5,", \u03B2 = ",
                     original.weight + 0.5,", n = ",samp.effort,")")
    )
  )+
  geom_ribbon(aes(ymax = extraDistr::dbbinom(weight,size = samp.effort,
                                              alpha = original.weight + 0.5,
                                              beta = samp.effort - original.weight + 0.5)),
              ymin = 0,alpha = 0.2,colour = NA)+
  geom_histogram(aes(y = stat(count) / sum(count)),alpha = .1)+
  geom_point(aes(x = original.weight, y = 0.01, colour = ij),
             shape = 23,stroke = 1.3,fill = "white",size = 3)+
  labs(x = "Edge weight value\n(after 100 scans)",y = "Frequency",colour = "edge",fill = "edge")+
  cowplot::theme_half_open(15)+
  theme(plot.background = element_rect(fill = 'white', colour = NA))

with:

Sampling effort on edge weight variance

See how the distribution is more narrow/wide as the sampling effort varies in these examples:

Adj.10 <- Adj.100 <- Adj.1000 <- Adj.5000 <-
  matrix(0L,nrow = n,ncol = n,dimnames = list(letters[1:n],letters[1:n]))

Adj.10[upper.tri(Adj.10)] <- seq(1,10,length.out = (n^2 - n) / 2) |> round()
Adj.10

Adj.100[upper.tri(Adj.100)] <- seq(1,100,length.out = (n^2 - n) / 2) |> round()
Adj.100

Adj.1000[upper.tri(Adj.1000)] <- seq(1,1000,length.out = (n^2 - n) / 2) |> round()
Adj.1000

Adj.5000[upper.tri(Adj.5000)] <- seq(1,5000,length.out = (n^2 - n) / 2) |> round()
Adj.5000

We use box plots and scaled edge weights hereafter to ease comparison across orders of magnitudes of sampling efforts:

replicate_for_sampEffort <- function(Adj,samp.effort,n.scans = samp.effort,n.rep = 50) {
  Adj.df <- 
    Adj |> 
    melt(varnames = c("i","j"),value.name = "original.weight")

  replicate(n = n.rep,simunet(Adj,samp.effort,"upper",n.scans) |> sum_scans()) |> 
  melt(varnames = c("i","j","rep"),value.name = "weight") |> 
  subset(as.numeric(i) < as.numeric(j)) |> 
  merge(Adj.df,by = c("i","j")) |> 
  cbind(samp.effort = samp.effort)
}


edge.weights <- rbind(
  replicate_for_sampEffort(Adj.10,10L),
  replicate_for_sampEffort(Adj.100,100L),
  replicate_for_sampEffort(Adj.1000,1000L),
  replicate_for_sampEffort(Adj.5000,5000L)
) |> 
  subset((i == "a" & j == "c") | (i == "b" & j == "d") | (i == "c" & j == "e")) |> 
  mutate(ij = paste0(i,"-",j))
head(edge.weights)

edge.weights |> 
  ggplot(aes(weight / samp.effort,colour = ij,fill = ij))+
  facet_grid(samp.effort ~ ij,scale = "free")+
  geom_histogram(aes(y = stat(count)),alpha = .25,bins =  50)+
  geom_line(lty = "dashed",
    aes(
      y = extraDistr::dbbinom(
        weight,
        size = samp.effort,
        alpha = original.weight + 0.5,
        beta = samp.effort - original.weight + 0.5
      ) * samp.effort
    )
  )+
  geom_ribbon(aes(
      ymax = extraDistr::dbbinom(weight,size = samp.effort,alpha = original.weight + 0.5,
        beta = samp.effort - original.weight + 0.5
      ) * samp.effort
    ),ymin = 0,alpha = 0.15,colour = NA)+
  geom_point(aes(x = original.weight / samp.effort, y = 0.75, colour = ij),
             shape = 23,stroke = 1.3,fill = "white",size = 3)+
  guides(colour = "none",fill = "none")+
  labs(x = "Scaled edge weight",y = "Counts")+
  cowplot::theme_half_open(12)+
  theme(plot.background = element_rect(fill = 'white', colour = NA))

The spread around the scaled theoretical value is smaller for scaled edges weight obtained with higher sampling effort.

Edge weight variability is not affected by the number of simulated scans

However, for a given sampling effort, the number of simulated scans is not influential on the (scaled) edge weights, with the distribution converging for n.scans values high enough.

rbind(
  cbind(replicate_for_sampEffort(Adj.100,100L,10),n.scans = 10),
  cbind(replicate_for_sampEffort(Adj.100,100L,100),n.scans = 100),
  cbind(replicate_for_sampEffort(Adj.100,100L,1000),n.scans = 1000),
  cbind(replicate_for_sampEffort(Adj.100,100L,10000),n.scans = 10000),
  cbind(replicate_for_sampEffort(Adj.100,100L,20000),n.scans = 20000)
) |> 
  subset((i == "a" & j == "c") | (i == "b" & j == "d") | (i == "c" & j == "e")) |> 
  mutate(ij = paste0(i,"-",j)) |> 
  ggplot(aes(weight / n.scans,colour = ij,fill = ij))+
  facet_grid(n.scans ~ ij,scale = "free")+
  geom_histogram(aes(y = stat(count)),alpha = .25,bins =  50)+
  geom_line(lty = "dashed",
    aes(
      y = extraDistr::dbbinom(
        weight,
        size = n.scans,
        alpha = original.weight + 0.5,
        beta = samp.effort - original.weight + 0.5
      ) * n.scans
    )
  )+
  geom_ribbon(aes(
      ymax = extraDistr::dbbinom(weight,size = n.scans,alpha = original.weight + 0.5,
        beta = samp.effort - original.weight + 0.5) * n.scans
    ),ymin = 0,alpha = 0.15,colour = NA)+
  geom_point(aes(x = original.weight / samp.effort, y = 0.75, colour = ij),
             shape = 23,stroke = 1.3,fill = "white",size = 3)+
  guides(colour = "none",fill = "none")+
  labs(x = "Scaled edge weight",y = "Counts")+
  cowplot::theme_half_open(12)+
  theme(plot.background = element_rect(fill = 'white', colour = NA))

Simulated network metrics distribution

Going further from edge weights, one can similarly use SimuNet to extract network metrics from simulated networks. Let's consider for instance node eigen-vector centrality:

compute_EV <- function(mat) {
  EV <- mat |> 
    igraph::graph.adjacency("upper",weighted = TRUE) |> 
    igraph::eigen_centrality(directed = FALSE,scale = FALSE)
  EV$vector
}

replicate_EV <- function(Adj,samp.effort,n.scans = samp.effort,n.rep = 50) {
  replicate(n = n.rep,simunet(Adj,samp.effort,"upper",n.scans) |> scale_scans() |> compute_EV()) |> 
  melt(varnames = c("node","rep"),value.name = "EV")
}

EV.ori <- data.frame(node = as.factor(letters[1:n]),EV = compute_EV(Adj))

replicate_EV(Adj,samp.effort,n.rep = 1000) |> 
  ggplot(aes(node,EV,colour = node,fill = node))+
  geom_boxplot(alpha = 0.5)+
  geom_point(data = EV.ori,shape = 21,size = 3,stroke = 1.5,fill = "white")+
  guides(colour = "none",fill = "none")+
  labs(x = "Node",y = "Eigen-vector centrality")+
  theme_minimal(12)+
  theme(plot.background = element_rect(fill = 'white', colour = NA))

And again, let's replicate this procedure with variable sampling effort:

EV.ori <- rbind(
  data.frame(node = as.factor(letters[1:n]),EV = compute_EV(Adj.10),samp.effort = as.factor(10)),
  data.frame(node = as.factor(letters[1:n]),EV = compute_EV(Adj.100),samp.effort = as.factor(100)),
  data.frame(node = as.factor(letters[1:n]),EV = compute_EV(Adj.1000),samp.effort = as.factor(1000))
)

rbind(
  cbind(replicate_EV(Adj.10,10L,n.scans = 10,n.rep = 100),samp.effort = as.factor(10)),
  cbind(replicate_EV(Adj.100,100L,n.scans = 100,n.rep = 100),samp.effort = as.factor(100)),
  cbind(replicate_EV(Adj.1000,1000L,n.scans = 1000,n.rep = 100),samp.effort = as.factor(1000))
) |> 
  ggplot(aes(node,EV,colour = node,fill = node))+
  facet_grid(. ~ paste0("sampl. effort = ",samp.effort))+
  geom_boxplot(aes(group = interaction(samp.effort,node)),alpha = 0.5)+
  geom_point(data = EV.ori,aes(group = interaction(samp.effort,node)),
             position = position_dodge(.8),
             shape = 21,size = 3,stroke = 1.5,fill = "white")+
  guides(colour = "none",fill = "none")+
  labs(x = "Node",y = "Eigen-vector centrality")+
  theme_minimal(12)+
  theme(plot.background = element_rect(fill = 'white', colour = NA))

Networks metrics available are too many to cover here, but the approach shown here - simulate a network, retrieve the metric, repeat many times - can be used to infere any network metric's distribution.



R-KenK/SimuNet documentation built on Oct. 22, 2021, 1:27 a.m.