This vignette shows examples of assessing bias in literature review networks based on covariates from metadata about the studies and authors included or excluded from the review on redistricting in the main manuscript. Specifically, for each study, we collect metadata on the lead author's gender, H-Index, and total number of citations. We then assess the impact of selecting studies on covariates in two ways:
First, we subset the network (e.g., to studies where the lead author is a man) and observe how many nodes and edges are missing in these subsets. This reveals the contributions of underrepresented scholars to the network by showing what we lose if they are excluded.
Second, we draw random samples of 100 studies weighted by covariates. This simulates a literature review that is biased (e.g., toward scholars who are men or have many citations). We then compare these biased samples to an unweighted random sample of studies in the network.
library(knitr) library(igraph) library(netlit) library(tidyverse) library(tidyr) library(magrittr) library(ggraph) knitr::opts_chunk$set(echo = TRUE, cache = FALSE, fig.width=10, fig.height=7, fig.show="hold", out.width = "100%", #split = T, fig.align = "default", #fig.path='../figs/', fig.cap= "", out.extra = "", fig.retina = 1, warning=FALSE, message=FALSE) library(kableExtra) # Table formatting # format kable for document type kable <- function(...){ if(knitr::is_latex_output()){ head(..., 25) %>% knitr::kable(booktabs = T, format = 'latex') %>% kable_styling(latex_options = c("striped", "scale_down", "HOLD_position")) } else { knitr::kable(...) %>% kable_styling() %>% scroll_box(height = "400px") } }
# Load replication version of main data and metadata on citations load(here::here("replication_data","literature_metadata.rda")) load(here::here("replication_data","literature.rda")) names(literature_metadata) %<>% janitor::make_clean_names() literature_metadata %<>% rename(author_gender = author_sex) literature_metadata%>% kable() # split out multiple cites per edge literature_long <- literature %>% mutate(id = str_split(cites, ";")) %>% unnest(id) # merge edgelist with metadata literature_long %<>% full_join(literature_metadata)
literature_long %>% ggplot() + aes(x = author_h_index, fill = author_gender)+ geom_histogram() literature_long %>% ggplot() + aes(x = author_citations, fill = author_gender)+ geom_histogram()
library(ggraph)
clean <- . %>% str_replace_all("([a-z| |-]{8}) ","\\1\n") %>% str_replace_all(" ([a-z| |-]{9})", "\n\\1") %>% str_to_title() %>% str_replace("\nOf\n", "\nOf ") %>% str_replace("\nFellow ", " Fellow\n") %>% str_replace("\nState\n", " State\n") %>% str_replace("\nDistrict\n", " District\n") %>% str_replace("\nWith\n", " With\n") literature_long$from %<>% clean() literature_long$to %<>% clean()
lit <- literature_long %>% distinct(to, from) %>% review() lit # best seed 1,4, *5* set.seed(5) netlit_plot <- function(g){ ggraph(g, layout = 'fr') + geom_node_point( aes(color = degree_total %>% as.factor() ), size = 6, alpha = .7 ) + geom_edge_arc2( start_cap = circle(3, 'mm'), end_cap = circle(6, 'mm'), aes( color = edge_betweenness, ), curvature = 0, arrow = arrow(length = unit(2, 'mm'), type = "open") ) + geom_edge_loop( start_cap = circle(5, 'mm'), end_cap = circle(2, 'mm'), aes( color = edge_betweenness), n = 300, strength = .6, arrow = arrow(length = unit(2, 'mm'), type = "open") ) + geom_node_text( aes(label = name), size = 2.3) + ggplot2::theme_void() + theme(legend.position="bottom") + labs(edge_color = "Edge Betweenness", color = "Total Degree\nCentrality", edge_linetype = "") + scale_edge_color_viridis(option = "plasma", begin = 0, end = .9, direction = -1, guide = "legend") + scale_color_viridis_d(option = "mako", begin = 1, end = .5) } g <- literature_long %>% distinct(to, from) %>% review() %>% .$graph g %>% netlit_plot()
# for plotting bias netlit_bias_plot <- function(subgraph){ # lit with edge attribute indicating missing from subgraph lit <- literature_long %>% distinct(to, from) %>% left_join( subgraph$edgelist %>% distinct(to, from) %>% mutate(missing_edges = "Not missing") ) %>% mutate(missing_edges = tidyr::replace_na(missing_edges, "Missing")) lit %<>% review(edge_attributes = names(lit)) # missing nodes missing_nodes <- lit$nodelist$node[!lit$nodelist$node %in% subgraph$nodelist$node] set.seed(5) ggraph(lit$g, layout = 'fr') + geom_node_point( aes(color = ifelse(name %in% missing_nodes, "Missing", "Not Missing")), size = 6, alpha = .7 ) + geom_edge_arc2( start_cap = circle(3, 'mm'), end_cap = circle(6, 'mm'), aes( color = missing_edges, ), curvature = 0, arrow = arrow(length = unit(2, 'mm'), type = "open") ) + geom_edge_loop( start_cap = circle(5, 'mm'), end_cap = circle(2, 'mm'), aes(color = missing_edges), n = 300, strength = .6, arrow = arrow(length = unit(2, 'mm'), type = "open") ) + geom_node_text( aes(label = name), size = 2.3) + ggplot2::theme_void() + theme(legend.position="bottom") + labs(edge_color = "", color = "", edge_linetype = "") + scale_color_discrete() + scale_edge_color_discrete() } literature_long %<>% mutate(author_is_man = author_gender == "M")
# biased sample weights literature_long %<>% mutate(unbiased = .5, weight = case_when( author_is_man ~ .6, !author_is_man ~ .4, TRUE~ .5 )) # a function to sample the network sample_lit <- function(n, literature_long, prob){ # create an index for the sample samp_idx <- sample(seq_len(nrow(literature_long)), 100, # 100 draws = number of studies to draw prob=prob # with prob var provided ) # subset sample to index sample <- literature_long %>% rowid_to_column() %>% filter(rowid %in% samp_idx) %>% distinct(to, from) %>% review() return(sample) }
n_samples <-1000
r n_samples
draws)There are 165 studies in the original literature review. We draw 100 of them---first at random, then weighted random samples. For each type of simulated bias we use r n_samples
draws.
random_samples <- map(1:n_samples, # 100 samples sample_lit, literature_long=literature_long, prob = literature_long$unbiased) samples <- random_samples mean_edge_betw <- . %>% pull(edge_betweenness) %>% mean() mean_node_betw <- . %>% pull(betweenness) %>% mean() mean_node_degree <- . %>% pull(degree_total) %>% mean() # make a table of the total number of nodes, edges, and the graph object for plotting summarise_samples <- function(samples){ summary <- tibble( #edge stats edges = samples %>% map(1) %>% modify(nrow) %>% unlist(), edge_between_mean = samples %>% map(1) %>% modify(mean_edge_betw) %>% unlist(), # nodes stats nodes = samples %>% map(2) %>% modify(nrow) %>% unlist(), node_between_mean = samples %>% map(2) %>% modify(mean_node_betw) %>% unlist(), node_degree_mean = samples %>% map(2) %>% modify(mean_node_degree) %>% unlist(), #graph stats communities = samples %>% map(3) %>% modify(cluster_walktrap) %>% modify(length) %>% unlist(), diameter = samples %>% map(3) %>% modify(diameter) %>% unlist(), graph = samples %>% map(3) ) return(summary) } summary <- summarise_samples(samples) random <- summary %>% mutate( sample = "Random" ) # map(random$graph, netlit_plot) map(random_samples[1:10], netlit_bias_plot)
Average nodes recovered: r summary$nodes %>% mean()
Average node betweenness recovered: r summary$node_between_mean %>% mean()
Average edges recovered: r summary$edges %>% mean()
Average edge betweenness recovered: r summary$edge_between_mean %>% mean()
Average node degree recovered: r summary$node_degree_mean %>% mean()
Average communities recovered: r summary$communities %>% mean()
Average diameter recovered: r summary$diameter %>% mean()
# biased samples gender_samples <- map(1:n_samples, sample_lit,literature_long=literature_long, prob = literature_long$weight) samples <- gender_samples summary <- summarise_samples(samples) gender <- summary %>% mutate(sample = "Gender bias favoring men") # map(gender_samples[1:10], netlit_bias_plot) map(gender_samples[1:10], netlit_bias_plot)
Average nodes recovered: r summary$nodes %>% mean()
Average node betweenness recovered: r summary$node_between_mean %>% mean()
Average edges recovered: r summary$edges %>% mean()
Average edge betweenness recovered: r summary$edge_between_mean %>% mean()
Average node degree recovered: r summary$node_degree_mean %>% mean()
Average communities recovered: r summary$communities %>% mean()
Average diameter recovered: r summary$diameter %>% mean()
# biased sample weights literature_long %<>% mutate(weight = case_when( author_is_man ~ 1, !author_is_man ~ .3, TRUE~ .5 )) # biased samples gender_samples <- map(1:n_samples, sample_lit,literature_long=literature_long, prob = literature_long$weight) samples <- gender_samples summary <- summarise_samples(samples) gender <- summary %>% mutate( sample = "Gender bias favoring men" ) #map(gender$graph, netlit_plot) map(gender_samples[1:10], netlit_bias_plot)
Average nodes recovered: r summary$nodes %>% mean()
Average node betweenness recovered: r summary$node_between_mean %>% mean()
Average edges recovered: r summary$edges %>% mean()
Average edge betweenness recovered: r summary$edge_between_mean %>% mean()
Average node degree recovered: r summary$node_degree_mean %>% mean()
Average communities recovered: r summary$communities %>% mean()
Average diameter recovered: r summary$diameter %>% mean()
# biased sample weights literature_long %<>% mutate(weight = case_when( author_is_man ~ .3, !author_is_man ~ 1, TRUE~ .5 )) gender_samples2 <- samples <- map(1:n_samples, sample_lit,literature_long=literature_long, prob = literature_long$weight) # biased samples summary <- summarise_samples(samples) gender2 <- summary %>% mutate( sample = "Gender bias favoring women" ) #map(gender$graph, netlit_plot) map(gender_samples2[1:10], netlit_bias_plot)
Average nodes recovered: r summary$nodes %>% mean()
Average node betweenness recovered: r summary$node_between_mean %>% mean()
Average edges recovered: r summary$edges %>% mean()
Average edge betweenness recovered: r summary$edge_between_mean %>% mean()
Average node degree recovered: r summary$node_degree_mean %>% mean()
Average communities recovered: r summary$communities %>% mean()
Average diameter recovered: r summary$diameter %>% mean()
(replacing NA HIndex with 0)
literature_long %<>% mutate(author_h_index = tidyr::replace_na(author_h_index, 0 )) # biased samples hindex_samples <- samples <- map(1:n_samples, sample_lit,literature_long=literature_long, prob = literature_long$weight) summary <- summarise_samples(samples) hindex <- summary %>% mutate( sample = "H-Index bias" ) #map(gender$graph, netlit_plot) map(hindex_samples[1:10], netlit_bias_plot)
Average nodes recovered: r summary$nodes %>% mean()
Average node betweenness recovered: r summary$node_between_mean %>% mean()
Average edges recovered: r summary$edges %>% mean()
Average edge betweenness recovered: r summary$edge_between_mean %>% mean()
Average node degree recovered: r summary$node_degree_mean %>% mean()
Average communities recovered: r summary$communities %>% mean()
Average diameter recovered: r summary$diameter %>% mean()
(replacing NA author citations with 0)
literature_long %<>% mutate(author_citations = tidyr::replace_na(author_citations, 0 )) # gender-biased samples citations_samples <- map(1:n_samples, sample_lit,literature_long=literature_long, prob = literature_long$author_citations) samples <- citations_samples summary <- summarise_samples(samples) citations <- summary %>% mutate( sample = "Citations bias" ) # map(citations$graph, netlit_plot) map(citations_samples[1:10], netlit_bias_plot) # %>% .[c(1:10)]
Average nodes recovered: r summary$nodes %>% mean()
Average node betweenness recovered: r summary$node_between_mean %>% mean()
Average edges recovered: r summary$edges %>% mean()
Average edge betweenness recovered: r summary$edge_between_mean %>% mean()
Average node degree recovered: r summary$node_degree_mean %>% mean()
Average communities recovered: r summary$communities %>% mean()
Average diameter recovered: r summary$diameter %>% mean()
s <- full_join(random, gender) %>% full_join(gender2) %>% full_join(hindex) %>% full_join(citations) round2 <- . %>% round(1) s_table <- s %>% group_by(sample) %>% select(where(is.numeric)) %>% summarise_all(mean) %>% group_by(sample) %>% mutate_all(round2) %>% arrange(rev(sample)) color.me <- which(s_table$sample == "Random") names(s_table) %<>% str_remove("_mean") s_table %>% kable(booktabs = T) %>% kable_styling() s %>% ggplot() + aes(x = nodes, fill = sample, color = sample) + geom_density(alpha = .3) + scale_color_viridis_d() + scale_fill_viridis_d() + theme_minimal() + labs(color = "", fill = "", y = "Density", x = "Nodes Recovered (out of 56)") + theme(axis.text.y = element_blank(), panel.grid.major.y = element_blank(), panel.grid.minor.y = element_blank()) s %>% ggplot() + aes(x = edges, fill = sample, color = sample) + geom_density(alpha = .3) + scale_color_viridis_d() + scale_fill_viridis_d() + theme_minimal() + labs(color = "", fill = "", y = "Density", x = "Edges Recovered (out of 69)") + theme(axis.text.y = element_blank(), panel.grid.major.y = element_blank(), panel.grid.minor.y = element_blank()) s %>% ggplot() + aes(x = edge_between_mean, fill = sample, color = sample) + geom_density(alpha = .3) + scale_color_viridis_d() + scale_fill_viridis_d() + theme_minimal() + labs(color = "", fill = "", y = "Density", x = "Average Edge Betweenness") + theme(axis.text.y = element_blank(), panel.grid.major.y = element_blank(), panel.grid.minor.y = element_blank()) s %>% ggplot() + aes(x = node_between_mean, fill = sample, color = sample) + geom_density(alpha = .3) + scale_color_viridis_d() + scale_fill_viridis_d() + theme_minimal() + labs(color = "", fill = "", y = "Density", x = "Average Node Betweenness") + theme(axis.text.y = element_blank(), panel.grid.major.y = element_blank(), panel.grid.minor.y = element_blank()) s %>% ggplot() + aes(x = node_degree_mean, fill = sample, color = sample) + geom_density(alpha = .3) + scale_color_viridis_d() + scale_fill_viridis_d() + theme_minimal() + labs(color = "", fill = "", y = "Density", x = "Average Degree") + theme(axis.text.y = element_blank(), panel.grid.major.y = element_blank(), panel.grid.minor.y = element_blank()) s %>% ggplot() + aes(x = communities, fill = sample, color = sample) + geom_density(alpha = .3) + scale_color_viridis_d() + scale_fill_viridis_d() + theme_minimal() + labs(color = "", fill = "", y = "Density", x = "Communities") + theme(axis.text.y = element_blank(), panel.grid.major.y = element_blank(), panel.grid.minor.y = element_blank()) s %>% ggplot() + aes(x = diameter, fill = sample, color = sample) + geom_density(alpha = .3) + scale_color_viridis_d() + scale_fill_viridis_d() + theme_minimal() + labs(color = "", fill = "", y = "Density", x = "Diameter") + theme(axis.text.y = element_blank(), panel.grid.major.y = element_blank(), panel.grid.minor.y = element_blank())
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.