This vignette shows examples of assessing bias in literature review networks based on covariates from metadata about the studies and authors included or excluded from the review on redistricting in the main manuscript. Specifically, for each study, we collect metadata on the lead author's gender, H-Index, and total number of citations. We then assess the impact of selecting studies on covariates in two ways:

  1. First, we subset the network (e.g., to studies where the lead author is a man) and observe how many nodes and edges are missing in these subsets. This reveals the contributions of underrepresented scholars to the network by showing what we lose if they are excluded.

  2. Second, we draw random samples of 100 studies weighted by covariates. This simulates a literature review that is biased (e.g., toward scholars who are men or have many citations). We then compare these biased samples to an unweighted random sample of studies in the network.

library(knitr)
library(igraph)
library(netlit)
library(tidyverse)
library(tidyr)
library(magrittr)
library(ggraph)
knitr::opts_chunk$set(echo = TRUE, 
                      cache = FALSE,
                      fig.width=10, 
                      fig.height=7,
                      fig.show="hold", 
                      out.width = "100%",
                      #split = T,
                       fig.align = "default",
                      #fig.path='../figs/',
                      fig.cap= "",
                      out.extra = "",
                      fig.retina = 1, 
                      warning=FALSE, 
                      message=FALSE)

library(kableExtra) # Table formatting

# format kable for document type 
kable <- function(...){
  if(knitr::is_latex_output()){
    head(..., 25) %>% 
    knitr::kable(booktabs = T, format = 'latex') %>% 
  kable_styling(latex_options = c("striped", "scale_down", "HOLD_position"))
    } else {
    knitr::kable(...) %>% 
  kable_styling() %>% 
  scroll_box(height = "400px")
    }
}

Metadata

Lead Author Gender, H-Index, and Citation

# Load replication version of main data and metadata on citations
load(here::here("replication_data","literature_metadata.rda"))
load(here::here("replication_data","literature.rda"))

names(literature_metadata) %<>% janitor::make_clean_names()

literature_metadata %<>% 
  rename(author_gender = author_sex)

literature_metadata%>% kable()

# split out multiple cites per edge 
literature_long <- literature %>% 
  mutate(id = str_split(cites, ";")) %>% 
  unnest(id)

# merge edgelist with metadata
literature_long %<>% full_join(literature_metadata)
literature_long %>% 
  ggplot() +
  aes(x = author_h_index, fill = author_gender)+
  geom_histogram()

literature_long %>% 
  ggplot() +
  aes(x = author_citations, fill = author_gender)+
  geom_histogram()

library(ggraph)
clean <- . %>% 
  str_replace_all("([a-z| |-]{8}) ","\\1\n") %>%
  str_replace_all(" ([a-z| |-]{9})",  "\n\\1") %>% str_to_title() %>% 
  str_replace("\nOf\n", "\nOf ") %>% 
  str_replace("\nFellow ", " Fellow\n") %>% 
  str_replace("\nState\n", " State\n") %>% 
  str_replace("\nDistrict\n", " District\n") %>% 
  str_replace("\nWith\n", " With\n")

literature_long$from %<>% clean() 
literature_long$to %<>% clean()

The Full Graph

lit <- literature_long %>% 
  distinct(to, from) %>% 
  review()

lit

# best seed 1,4, *5*
set.seed(5)

netlit_plot <- function(g){
ggraph(g, layout = 'fr') + 
  geom_node_point(
    aes(color = degree_total %>% as.factor() ),
    size = 6, 
    alpha = .7
    ) + 
  geom_edge_arc2(
    start_cap = circle(3, 'mm'),
    end_cap = circle(6, 'mm'),
    aes(
      color = edge_betweenness,
      ),
    curvature = 0,
    arrow = arrow(length = unit(2, 'mm'), 
                  type = "open")
    ) +
  geom_edge_loop(
      start_cap = circle(5, 'mm'),
      end_cap = circle(2, 'mm'),
      aes( color = edge_betweenness),
      n = 300,
      strength = .6,
    arrow = arrow(length = unit(2, 'mm'), 
                  type = "open")
    ) +
  geom_node_text( aes(label = name), size = 2.3) + 
  ggplot2::theme_void() + 
  theme(legend.position="bottom") + 
  labs(edge_color = "Edge Betweenness",
       color = "Total Degree\nCentrality",
       edge_linetype = "") + 
scale_edge_color_viridis(option = "plasma", 
                         begin = 0, 
                         end = .9, 
                         direction = -1, 
                         guide = "legend") +
  scale_color_viridis_d(option = "mako", 
                        begin = 1, 
                        end = .5)
}


g <- literature_long %>% 
  distinct(to, from) %>% 
  review()  %>% 
  .$graph 

g %>% 
  netlit_plot()

# for plotting bias
netlit_bias_plot <- function(subgraph){

  # lit with edge attribute indicating missing from subgraph 
lit <- literature_long %>% 
  distinct(to, from) %>% 
    left_join( subgraph$edgelist %>% distinct(to, from) %>% mutate(missing_edges = "Not missing") 
) %>% 
    mutate(missing_edges = tidyr::replace_na(missing_edges, "Missing")) 

lit %<>% 
  review(edge_attributes = names(lit))  

#  missing nodes 
  missing_nodes <- lit$nodelist$node[!lit$nodelist$node %in% subgraph$nodelist$node]

  set.seed(5)

ggraph(lit$g, layout = 'fr') + 
  geom_node_point(
    aes(color = ifelse(name %in% missing_nodes, "Missing", "Not Missing")),
    size = 6, 
    alpha = .7
    ) + 
  geom_edge_arc2(
    start_cap = circle(3, 'mm'),
    end_cap = circle(6, 'mm'),
    aes(
      color = missing_edges,
      ),
    curvature = 0,
    arrow = arrow(length = unit(2, 'mm'), 
                  type = "open")
    ) +
  geom_edge_loop(
      start_cap = circle(5, 'mm'),
      end_cap = circle(2, 'mm'),
      aes(color = missing_edges),
      n = 300,
      strength = .6,
    arrow = arrow(length = unit(2, 'mm'), 
                  type = "open")
    ) +
  geom_node_text( aes(label = name), size = 2.3) + 
  ggplot2::theme_void() + 
  theme(legend.position="bottom") + 
  labs(edge_color = "",
       color = "",
       edge_linetype = "") +
  scale_color_discrete() + 
  scale_edge_color_discrete()
}


literature_long %<>%
  mutate(author_is_man = author_gender == "M")

Biased Samples

# biased sample weights 
literature_long %<>% 
    mutate(unbiased = .5,
           weight = case_when(
      author_is_man ~ .6,
      !author_is_man ~ .4,
      TRUE~ .5 
    ))


# a function to sample the network 
sample_lit <- function(n, literature_long, prob){

  # create an index for the sample
  samp_idx <- sample(seq_len(nrow(literature_long)), 
                     100, # 100 draws = number of studies to draw 
                     prob=prob # with prob var provided 
                     )

  # subset sample to index 
  sample <- literature_long %>% 
    rowid_to_column() %>% 
    filter(rowid %in% samp_idx) %>% 
    distinct(to, from) %>% 
    review()

    return(sample)
}
n_samples <-1000

Random draws of 100 studies (r n_samples draws)

There are 165 studies in the original literature review. We draw 100 of them---first at random, then weighted random samples. For each type of simulated bias we use r n_samples draws.

random_samples <- map(1:n_samples, # 100 samples 
                      sample_lit,
                      literature_long=literature_long, 
                      prob = literature_long$unbiased)

samples <- random_samples

mean_edge_betw <- . %>% pull(edge_betweenness) %>% mean()
mean_node_betw <- . %>% pull(betweenness) %>% mean()
mean_node_degree <- . %>% pull(degree_total) %>% mean()

# make a table of the total number of nodes, edges, and the graph object for plotting
summarise_samples <- function(samples){
summary <- tibble(
  #edge stats
  edges = samples %>% map(1) %>% modify(nrow) %>% unlist(),
  edge_between_mean = samples %>% map(1) %>% modify(mean_edge_betw) %>% unlist(),
  # nodes stats
    nodes = samples %>% map(2) %>% modify(nrow) %>% unlist(),
  node_between_mean = samples %>% map(2) %>% modify(mean_node_betw) %>% unlist(),
  node_degree_mean = samples %>% map(2) %>% modify(mean_node_degree) %>% unlist(),
  #graph stats 
  communities = samples %>% map(3) %>% modify(cluster_walktrap) %>% modify(length) %>% unlist(),
  diameter = samples %>% map(3) %>% modify(diameter)  %>% unlist(),
  graph = samples %>% map(3)
  )
return(summary)
}

summary <- summarise_samples(samples)

random <- summary %>% mutate(
  sample = "Random"
)

# map(random$graph, netlit_plot)
map(random_samples[1:10], netlit_bias_plot) 

Average nodes recovered: r summary$nodes %>% mean()

Average node betweenness recovered: r summary$node_between_mean %>% mean()

Average edges recovered: r summary$edges %>% mean()

Average edge betweenness recovered: r summary$edge_between_mean %>% mean()

Average node degree recovered: r summary$node_degree_mean %>% mean()

Average communities recovered: r summary$communities %>% mean()

Average diameter recovered: r summary$diameter %>% mean()


Gender-biased draws

pr(cite|man) = .60, pr(cite|woman) = .40

#  biased samples
gender_samples <- map(1:n_samples, sample_lit,literature_long=literature_long, prob = literature_long$weight)

samples <- gender_samples

summary <- summarise_samples(samples)

gender <- summary %>% mutate(sample = "Gender bias favoring men")

# map(gender_samples[1:10], netlit_bias_plot)
map(gender_samples[1:10], netlit_bias_plot)

Average nodes recovered: r summary$nodes %>% mean()

Average node betweenness recovered: r summary$node_between_mean %>% mean()

Average edges recovered: r summary$edges %>% mean()

Average edge betweenness recovered: r summary$edge_between_mean %>% mean()

Average node degree recovered: r summary$node_degree_mean %>% mean()

Average communities recovered: r summary$communities %>% mean()

Average diameter recovered: r summary$diameter %>% mean()


pr(man) = 1, pr(woman) = .30

# biased sample weights 
literature_long %<>% 
    mutate(weight = case_when(
      author_is_man ~ 1,
      !author_is_man ~ .3,
      TRUE~ .5 
    ))


#  biased samples
gender_samples <- map(1:n_samples, sample_lit,literature_long=literature_long, prob = literature_long$weight)

samples <- gender_samples

summary <- summarise_samples(samples)

gender <- summary %>% mutate(
  sample = "Gender bias favoring men"
)

#map(gender$graph, netlit_plot)
map(gender_samples[1:10], netlit_bias_plot)

Average nodes recovered: r summary$nodes %>% mean()

Average node betweenness recovered: r summary$node_between_mean %>% mean()

Average edges recovered: r summary$edges %>% mean()

Average edge betweenness recovered: r summary$edge_between_mean %>% mean()

Average node degree recovered: r summary$node_degree_mean %>% mean()

Average communities recovered: r summary$communities %>% mean()

Average diameter recovered: r summary$diameter %>% mean()


pr(man) = .30, pr(woman) = 1

# biased sample weights 
literature_long %<>% 
    mutate(weight = case_when(
      author_is_man ~ .3,
      !author_is_man ~ 1,
      TRUE~ .5 
    ))

gender_samples2 <- samples <- map(1:n_samples, sample_lit,literature_long=literature_long, prob = literature_long$weight)


# biased samples
summary <- summarise_samples(samples)

gender2 <- summary %>% mutate(
  sample = "Gender bias favoring women"
)

#map(gender$graph, netlit_plot)
map(gender_samples2[1:10], netlit_bias_plot)

Average nodes recovered: r summary$nodes %>% mean()

Average node betweenness recovered: r summary$node_between_mean %>% mean()

Average edges recovered: r summary$edges %>% mean()

Average edge betweenness recovered: r summary$edge_between_mean %>% mean()

Average node degree recovered: r summary$node_degree_mean %>% mean()

Average communities recovered: r summary$communities %>% mean()

Average diameter recovered: r summary$diameter %>% mean()


H-Index-biased draws

(replacing NA HIndex with 0)

literature_long %<>%
  mutate(author_h_index = tidyr::replace_na(author_h_index, 0 ))

#  biased samples
hindex_samples <- samples <- map(1:n_samples, sample_lit,literature_long=literature_long, prob = literature_long$weight)



summary <- summarise_samples(samples)

hindex <- summary %>% mutate(
  sample = "H-Index bias"
)

#map(gender$graph, netlit_plot)
map(hindex_samples[1:10], netlit_bias_plot)

Average nodes recovered: r summary$nodes %>% mean()

Average node betweenness recovered: r summary$node_between_mean %>% mean()

Average edges recovered: r summary$edges %>% mean()

Average edge betweenness recovered: r summary$edge_between_mean %>% mean()

Average node degree recovered: r summary$node_degree_mean %>% mean()

Average communities recovered: r summary$communities %>% mean()

Average diameter recovered: r summary$diameter %>% mean()


Citation-biased draws

(replacing NA author citations with 0)

literature_long %<>%
  mutate(author_citations = tidyr::replace_na(author_citations, 0 ))

# gender-biased samples
citations_samples <- map(1:n_samples, sample_lit,literature_long=literature_long, prob = literature_long$author_citations)

samples <- citations_samples

summary <- summarise_samples(samples)

citations <- summary %>% mutate(
  sample = "Citations bias"
)

# map(citations$graph, netlit_plot)
map(citations_samples[1:10], netlit_bias_plot) # %>% .[c(1:10)]

Average nodes recovered: r summary$nodes %>% mean()

Average node betweenness recovered: r summary$node_between_mean %>% mean()

Average edges recovered: r summary$edges %>% mean()

Average edge betweenness recovered: r summary$edge_between_mean %>% mean()

Average node degree recovered: r summary$node_degree_mean %>% mean()

Average communities recovered: r summary$communities %>% mean()

Average diameter recovered: r summary$diameter %>% mean()

Comparing Biases

s <- full_join(random, gender) %>% 
  full_join(gender2) %>% 
  full_join(hindex) %>% 
  full_join(citations)

round2 <- . %>% round(1)

s_table <- s %>% group_by(sample) %>% 
  select(where(is.numeric)) %>% summarise_all(mean) %>% 
  group_by(sample) %>% 
  mutate_all(round2) %>% 
  arrange(rev(sample))

color.me <- which(s_table$sample == "Random")

names(s_table) %<>% str_remove("_mean")

s_table %>% 
  kable(booktabs = T) %>% 
  kable_styling()  


s %>% 
  ggplot() + 
  aes(x = nodes, fill = sample, color = sample) +
  geom_density(alpha = .3) + 
  scale_color_viridis_d() + 
  scale_fill_viridis_d() +
  theme_minimal() + 
  labs(color = "", 
       fill = "", y = "Density",
       x = "Nodes Recovered (out of 56)") + 
  theme(axis.text.y = element_blank(),
        panel.grid.major.y = element_blank(),
        panel.grid.minor.y = element_blank())

s %>% 
  ggplot() + 
  aes(x = edges, fill = sample, color = sample) +
  geom_density(alpha = .3) + 
  scale_color_viridis_d() + 
  scale_fill_viridis_d() +
  theme_minimal() + 
  labs(color = "", 
       fill = "", y = "Density",
       x = "Edges Recovered (out of 69)") + 
  theme(axis.text.y = element_blank(),
        panel.grid.major.y = element_blank(),
        panel.grid.minor.y = element_blank())


s %>% 
  ggplot() + 
  aes(x = edge_between_mean, fill = sample, color = sample) +
  geom_density(alpha = .3) + 
  scale_color_viridis_d() + 
  scale_fill_viridis_d() +
  theme_minimal() + 
  labs(color = "", 
       fill = "", y = "Density",
       x = "Average Edge Betweenness") + 
  theme(axis.text.y = element_blank(),
        panel.grid.major.y = element_blank(),
        panel.grid.minor.y = element_blank())

s %>% 
  ggplot() + 
  aes(x = node_between_mean, fill = sample, color = sample) +
  geom_density(alpha = .3) + 
  scale_color_viridis_d() + 
  scale_fill_viridis_d() +
  theme_minimal() + 
  labs(color = "", 
       fill = "", y = "Density",
       x = "Average Node Betweenness") + 
  theme(axis.text.y = element_blank(),
        panel.grid.major.y = element_blank(),
        panel.grid.minor.y = element_blank())

s %>% 
  ggplot() + 
  aes(x = node_degree_mean, fill = sample, color = sample) +
  geom_density(alpha = .3) + 
  scale_color_viridis_d() + 
  scale_fill_viridis_d() +
  theme_minimal() + 
  labs(color = "", 
       fill = "", y = "Density",
       x = "Average Degree") + 
  theme(axis.text.y = element_blank(),
        panel.grid.major.y = element_blank(),
        panel.grid.minor.y = element_blank())

s %>% 
  ggplot() + 
  aes(x = communities, fill = sample, color = sample) +
  geom_density(alpha = .3) + 
  scale_color_viridis_d() + 
  scale_fill_viridis_d() +
  theme_minimal() + 
  labs(color = "", 
       fill = "", y = "Density",
       x = "Communities") + 
  theme(axis.text.y = element_blank(),
        panel.grid.major.y = element_blank(),
        panel.grid.minor.y = element_blank())

s %>% 
  ggplot() + 
  aes(x = diameter, fill = sample, color = sample) +
  geom_density(alpha = .3) + 
  scale_color_viridis_d() + 
  scale_fill_viridis_d() +
  theme_minimal() + 
  labs(color = "", 
       fill = "", y = "Density",
       x = "Diameter") + 
  theme(axis.text.y = element_blank(),
        panel.grid.major.y = element_blank(),
        panel.grid.minor.y = element_blank())


judgelord/netlit documentation built on Jan. 3, 2023, 2:42 p.m.