knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.path = "man/figures/README-",
  out.width = "100%"
)
# load libraries

library(readr)
library(dplyr)
library(stringr)
library(magrittr)
library(ggplot2)
library(readr)
library(tidytext)
library(tidyr)
library(tidyverse)
library(ggthemes)
library(sf)
library(rnaturalearth)
library(rnaturalearthdata)
library(mapsf)
library(colorspace)
library(shiny)
library(shinydashboard)
library(DT)
library(treemapify)
# global variables
who_pm2.5_guideline <- 5

# global operations
`%notin%` <- Negate(`%in%`)

# epi studies analysis raw sheet
epi <- readxl::read_xlsx("./data-raw/pm2.5_distribution/AQLI_Epidemiology Literature Research.xlsx", sheet = "AnalysisDatasetPM2.5MortalityAn")


#> change default columns types
epi$cohort_size <- as.numeric(epi$cohort_size)
epi$study_start_year<- as.numeric(epi$study_start_year)
epi$study_end_year <- as.numeric(epi$study_end_year)
epi$pm2.5_exposure_ll <- as.numeric(epi$pm2.5_exposure_ll)
epi$pm2.5_exposure_ul <- as.numeric(epi$pm2.5_exposure_ul)
epi$mean_pm2.5 <- as.numeric(epi$mean_pm2.5)
epi$sd_pm2.5 <- as.numeric(epi$sd_pm2.5)
epi$cohort_age_ll <- as.numeric(epi$cohort_age_ll)
epi$cohort_age_ul <- as.numeric(epi$cohort_age_ul)

# AQLI color file and gadm0 file
aqli_color <- read_csv("./data-raw/pm2.5_distribution/[june2023]gadm2_aqli_2021_post_waterbody_adj_finalized_internal.csv")
gadm0_aqli_2021 <- read_csv("./data-raw/pm2.5_distribution/[june2023]gadm0_aqli_2021_post_waterbody_adj_finalized_internal.csv")


# aqli_color <- aqli_color %>%
#   dplyr::rename_with(~str_replace(.x, "who", "llpp_who_"), dplyr::contains("who")) %>%
#   dplyr::rename_with(~str_replace(.x, "nat", "llpp_nat_"), dplyr::contains("nat")) %>%
#   dplyr::rename(country = name0) %>%
#   dplyr::mutate(whostandard = who_pm2.5_guideline) %>%
#   dplyr::select(objectid_gadm2:population, whostandard, everything())

aqli_color <- aqli_color %>%
  dplyr::mutate(objectid_color = objectid_gadm2)

# getting a country continent file
country_continent <- read_csv("./data-raw/pm2.5_distribution/country_continent.csv")

# adding a continent column to the color file
aqli_color <- aqli_color %>%
  left_join(country_continent, by = "country") %>%
  select(objectid_color, continent, country, everything())

gadm0_aqli_2021 <- gadm0_aqli_2021 %>%
  left_join(country_continent, by = "country")

#> Filling in missing continents-----------------------------------------------

# countries for which continent is NA: for these fill in the continent manually
countries_with_missing_continent <- gadm0_aqli_2021 %>% filter(is.na(continent)) %>% pull(country) %>% unique()

# continent fill in for missing coutries
continents_for_missing_countries <- c("Africa", "Europe", "North America", "Asia", "Asia",
                                      "Oceania", "Africa", "Africa", "Africa", "Asia")

# [CAUTION: perform a sanity check on the above 2 vectors and how they map countries to continents before proceeding]
#creating a data frame using the above 2 vectors as columns
missing_continents_df <- tibble(country = countries_with_missing_continent,
                                continent = continents_for_missing_countries)


# adding in the missing continent information in the gadmx_aqli_2021 datasets
aqli_color <- aqli_color %>%
  left_join(missing_continents_df, by = "country") %>%
  mutate(continent = ifelse(is.na(continent.x), continent.y, continent.x))


#> setting report parameters
min_sample_size <- 1000
min_study_duration_in_years <- 1

#> Filtering the epi database (minimum cohort size: >1000, minimum study duration: > 1 year): commented for now.
epi <- epi %>%
  mutate(study_duration = (study_end_year - study_start_year) + 1) %>%
  filter(cohort_size >= min_sample_size, study_duration >= min_study_duration_in_years, !is.na(cohort_uid)) %>%
  distinct(cohort_uid, .keep_all = TRUE)

# creating this temp dataset for the  continent panel dataset
epi_tmp <- epi %>%
  mutate(continent = ifelse(country == "China", "China", continent),
         continent = ifelse(continent == "Asia", "Asia - China", continent))

# continent list (specifucally for the continent panel graph, that contains the Asia and Asia -  China panels, and considers them as continents in a
# separate epi_tmp dataset, that is specially created for this graph)
continent_list <- c("China", "Asia - China", "Africa", "North America", "South America", "Europe", "Oceania")

# missing continents
missing_continents_logical <- continent_list %notin% unique(epi_tmp$continent)
missing_continents <- continent_list[missing_continents_logical]

#> Calculations needed for all sections above the "Results section".

# percent of population not in compliance with the WHO guideline
num_people_above_who <- aqli_color %>%
  filter(pm2020 > who_pm2.5_guideline) %>%
  summarise(tot_pop = sum(population, na.rm = TRUE)) %>%
  unlist()

# world population
world_pop <- aqli_color %>%
  summarise(tot_pop = sum(population, na.rm = TRUE)) %>%
  unlist()

percent_people_above_who <- round((num_people_above_who/world_pop)*100, 1)

# pop pollution graph with 2 buckets: function.

pop_pol_graph_2_buckets <- function(aqli_color, epi, thresh_ll, thresh_ul, pm2.5_col_name){

  if(thresh_ll > thresh_ul){
    stop("Lower Limit should be less than or equal to the Upper limit! Please try again!")
  }

  # aqli color population in ordered pm2.5 buckets
  aqli_color_grp_pm2.5_buckets <- aqli_color %>%
    dplyr::filter(!is.na(!!as.symbol(pm2.5_col_name))) %>%
    dplyr::mutate(region = ifelse((!!as.symbol(pm2.5_col_name) >= thresh_ll) & (!!as.symbol(pm2.5_col_name) <= thresh_ul), stringr::str_c(thresh_ll, "-", thresh_ul), !!as.symbol(pm2.5_col_name)),
           region = ifelse(!!as.symbol(pm2.5_col_name) > thresh_ul, stringr::str_c(">", thresh_ul), region),
           region = ifelse(!!as.symbol(pm2.5_col_name) < thresh_ll, stringr::str_c("<", thresh_ll), region)) %>%
    dplyr::group_by(region) %>%
    dplyr::summarise(tot_pop = sum(population, na.rm = TRUE)) %>%
    dplyr::ungroup() %>%
    dplyr::mutate(order_pollution_group = ifelse(region == stringr::str_c("<", thresh_ll), 1, 0),
           order_pollution_group = ifelse(region == stringr::str_c(thresh_ll, "-", thresh_ul), 2, order_pollution_group),
           order_pollution_group = ifelse(region == stringr::str_c(">", thresh_ul), 3, order_pollution_group)) %>%
    dplyr::ungroup() %>%
    dplyr::mutate(tot_pop_prop = (tot_pop/sum(tot_pop))*100)


  # epi total number of studies in ordered pollution buckets (initially filtering out pooled studies, will add back in using a for loop)
  epi_num_studies_grp_pm2.5_buckets <- epi %>%
    dplyr::filter(!is.na(mean_pm2.5), mean_pm2.5 != "NA", non_pm2.5 == 0) %>%
    dplyr::group_by(paper_uid) %>%
    dplyr::summarise(mean_pm2.5 = mean(mean_pm2.5, na.rm = TRUE)) %>%
    dplyr::mutate(region = ifelse(mean_pm2.5 >= thresh_ll & mean_pm2.5 <= thresh_ul, stringr::str_c(thresh_ll, "-", thresh_ul), mean_pm2.5),
           region = ifelse(mean_pm2.5 > thresh_ul, stringr::str_c(">", thresh_ul), region),
           region = ifelse(mean_pm2.5 < thresh_ll, stringr::str_c("<", thresh_ll), region)) %>%
    dplyr::mutate(order_pollution_group = ifelse(region == stringr::str_c("<", thresh_ll), 1, 0),
           order_pollution_group = ifelse(region == stringr::str_c(thresh_ll, "-", thresh_ul), 2, order_pollution_group),
           order_pollution_group = ifelse(region == stringr::str_c(">", thresh_ul), 3, order_pollution_group)) %>%
    dplyr::group_by(region) %>%
    dplyr::summarise(tot_studies = dplyr::n(), order_pollution_group = order_pollution_group[1]) %>%
    dplyr::ungroup() %>%
    dplyr::mutate(tot_studies_prop = (tot_studies/sum(tot_studies))*100)

  #> bar chart version

  # creating a master dataset for both population and studies data
  pop_epi_studies_data <- aqli_color_grp_pm2.5_buckets %>%
    dplyr::left_join(epi_num_studies_grp_pm2.5_buckets, by = c("region", "order_pollution_group")) %>%
    dplyr::select(region,  order_pollution_group, tot_pop_prop, tot_studies_prop) %>%
    tidyr::pivot_longer(cols = tot_pop_prop:tot_studies_prop, names_to = "type_of_prop", values_to = "val") %>%
    dplyr::mutate(type_of_prop = ifelse(type_of_prop == "tot_pop_prop", "Percent Population in PM₂.₅ bucket", "Percent Studies in PM₂.₅ bucket"))

pop_num_studies_in_pollution_buckets_graph <- pop_epi_studies_data %>%
  ggplot2::ggplot() +
  ggplot2::geom_col(mapping = aes(x = forcats::fct_reorder(region, order_pollution_group), y = val, fill = type_of_prop), position = position_dodge(), width = 0.3) +
  ggplot2::scale_y_continuous(breaks = seq(0, 100, 10)) +
  ggplot2::scale_fill_manual(values = c("tot_pop_prop" = "grey", "tot_studies_prop" = "cornflowerblue"), labels = c("Proportion of World Population in Bucket", "Proportion of Studies Completed in Bucket")) +
  ggplot2::labs(x = expression("Mean" ~ PM[2.5] ~ "bucket (in µg/m³)"),  y = "Percentage", fill = "",
                title = expression("Percentage of Global population in different" ~ PM[2.5] ~ "buckets"),
       caption = stringr::str_wrap("*This graph 'only' takes into account the PM2.5 specific studies. For multi-country (pooled) studies, it averages the mean PM2.5 values, across all countries."), width = 10) +
 # ggplot2::geom_text(mapping = aes(x = forcats::fct_reorder(region, order_pollution_group), y = val, label = paste0(round(val, 1), "%")), position = ggplot2::position_dodge2(width = 1, preserve = "single"), vjust = -0.5, hjust = -0.2, size = 3) +
ggthemes::theme_tufte() +
  ggplot2::theme(axis.line.y = element_line(color = "black"),
        axis.line.x = element_line(color = "black"),
        plot.caption = element_text(size = 8, hjust = 0, margin = margin(t = 0.7, unit = "cm")),
        plot.title = element_text(size = 16, margin = margin(b = 1, unit = "cm"), hjust = 0.5),
        plot.caption.position = "plot",
        axis.title.y = element_text(margin = margin(r = 0.5, unit = "cm"), size = 14),
        axis.title.x = element_text(margin = margin(t = 0.5, b = 0.5, unit = "cm"), size  = 14),
        axis.text = element_text(size = 12),
        legend.box.background = element_rect(color = "black"),
        legend.position = "bottom",
        legend.text = element_text(size = 12)) +
  viridis::scale_fill_viridis(discrete = TRUE)

return(pop_num_studies_in_pollution_buckets_graph)



}

Epidemiological Studies: Meta Analysis

This analysis is our ongoing attempt to capture all epidemiological research studies (>r min_sample_size people; max sample size: r round(max(epi$cohort_size, na.rm = TRUE)/1000000, 1) million), that are relatively long term (> r min_study_duration_in_years year(s), max study duration: r max(epi$study_duration, na.rm =TRUE) years), with unique cohorts (by unique we mean that even if a given cohort is studied by different groups over different years/different sample size, it'll be counted as separate cohorts, instead of 1) and measure the impact of ambient PM~2.5~, PM~10~, TSP, Ultra Fine Particulate Matter on Mortality (cause-specific, all cause, premature)/Life Expectancy published between r min(epi$publishing_year) and present*, findable in available peer-reviewed literature. Hereafter, we refer to these studies as "AQ epi studies" for short. As of now, this meta-analysis analyzes r epi %>% distinct(paper_uid, .keep_all = TRUE) %>% nrow() AQ epi studies. This analysis will be continually updated to incorporate new AQ epi studies.

We are seeking to make this analysis as current, complete and error-free as possible and view it as a continual work in progress. We would appreciate the air quality community’s comments, corrections, and suggestions. Please contact aqli-epic@uchicago.edu or leave a comment in this GitHub repository/directly leave comments in the analysis dataset (more on this below).

 


Major Takeaways

Purpose

The purpose of this analysis is to understand the landscape of epidemiological research on the relationship between PM~2.5~, PM~10~, TSP, Ultrafine Particulate Matter and Mortality (all types, as specified above)/Life Expectancy and to surface demographic, geographic, or other trends that may exist in the current state of literature. While the overall arc of the relationship between these pollutants and human health is clear enough to take action, understanding such trends can help the field reflect on itself, take stock of any biases or gaps – and point toward future research and policy opportunities.

Why do epidemiological studies on air pollution and mortality matter?

While global estimates of air pollution’s toll on public health vary, they all point in the same direction: air pollution poses one of the largest health risks on the planet to humans [Paper1, Paper2, Paper3, Paper4]. Epidemiological studies on air pollution and mortality help us understand the burden of air pollution on human health at global, national and regional levels. According to Vahlsing and Smith (2012), these sorts of studies can also help countries take policy action, pushing forward and shaping national-level ambient air quality standards.

The burden of air pollution across the world is also not uniform. While r percent_people_above_who percent of the world population is out of compliance with the latest WHO annual PM~2.5~ guideline of r who_pm2.5_guideline µg/m³, there is huge variation in the quality of air one breathes.

Results

 

PM~2.5~ concentration range and the Global Population Distribution

 

#> mean pm2.5 distribution segregated by country (density plot): makes sense to count different country studies separately, as we are making country wise distributions, so we want to see all PM2.5 ranges covered in a given country.
mean_pm2.5_country_wise_graph <- epi %>% 
  filter(!is.na(mean_pm2.5), mean_pm2.5 != "NA", non_pm2.5 == 0) %>%
  ggplot() +
  geom_density(mapping = aes(x = mean_pm2.5, fill = country), alpha = 0.5, color = "black", position = "identity") +
  labs(x = expression(paste("Mean PM2.5 concentration (", mu, "g", "/", m^3, ")")), 
       caption = str_wrap("*This graph 'only' takes into account the PM2.5 specific studies. For multi-country (pooled) studies, it averages the mean PM2.5 values, across all countries."), width = 10) +
  theme_hc() +
  theme(plot.caption = element_text(hjust = 0)) +
viridis::scale_fill_viridis(discrete = TRUE)

#> PM2.5 exposure range and population distribution

# aqli color population in ordered pm2.5 buckets
aqli_color_grp_pm2.5_buckets <- aqli_color %>%
  mutate(region = ifelse(pm2020 >= 0 & pm2020 <= 25, "0-25", pm2020), 
         region = ifelse(pm2020 > 25 & pm2020 <= 50, ">25-50", region), 
         region = ifelse(pm2020 > 50 & pm2020 <= 75, ">50-75", region), 
         region = ifelse(pm2020 > 75 & pm2020 <= 100, ">75-100", region), 
         region = ifelse(pm2020 > 100, ">100", region)) %>%
  group_by(region) %>%
  summarise(tot_pop = sum(population, na.rm = TRUE)) %>%
  ungroup() %>%
    mutate(order_pollution_group = ifelse(region == "0-25", 1, 0), 
         order_pollution_group = ifelse(region == ">25-50", 2, order_pollution_group), 
         order_pollution_group = ifelse(region == ">50-75", 3, order_pollution_group), 
         order_pollution_group = ifelse(region == ">75-100", 4, order_pollution_group), 
         order_pollution_group = ifelse(region == ">100", 5, order_pollution_group)) %>%
  ungroup() %>%
  mutate(tot_pop_prop = (tot_pop/sum(tot_pop))*100)


# epi total number of studies in ordered pollution buckets (initially filtering out pooled studies, will add back in using a for loop)
epi_num_studies_grp_pm2.5_buckets <- epi %>%
  filter(!is.na(mean_pm2.5), mean_pm2.5 != "NA", non_pm2.5 == 0) %>%
  group_by(paper_uid) %>%
  summarise(mean_pm2.5 = mean(mean_pm2.5, na.rm = TRUE)) %>%
    mutate(region = ifelse(mean_pm2.5 >= 0 & mean_pm2.5 <= 25, "0-25", mean_pm2.5), 
         region = ifelse(mean_pm2.5 > 25 & mean_pm2.5 <= 50, ">25-50", region), 
         region = ifelse(mean_pm2.5 > 50 & mean_pm2.5 <= 75, ">50-75", region), 
         region = ifelse(mean_pm2.5 > 75 & mean_pm2.5 <= 100, ">75-100", region), 
         region = ifelse(mean_pm2.5 > 100, ">100", region)) %>%
  mutate(order_pollution_group = ifelse(region == "0-25", 1, 0), 
         order_pollution_group = ifelse(region == ">25-50", 2, order_pollution_group), 
         order_pollution_group = ifelse(region == ">50-75", 3, order_pollution_group), 
         order_pollution_group = ifelse(region == ">75-100", 4, order_pollution_group), 
         order_pollution_group = ifelse(region == ">100", 5, order_pollution_group)) %>%
  group_by(region) %>%
  summarise(tot_studies = n(), order_pollution_group = order_pollution_group[1]) %>%
  ungroup() %>%
  mutate(tot_studies_prop = (tot_studies/sum(tot_studies))*100)

#> bar chart version

# creating a master dataset for both population and studies data
pop_epi_studies_data <- aqli_color_grp_pm2.5_buckets %>%
  left_join(epi_num_studies_grp_pm2.5_buckets, by = c("region", "order_pollution_group")) %>%
  select(region,  order_pollution_group, tot_pop_prop, tot_studies_prop) %>%
  pivot_longer(cols = tot_pop_prop:tot_studies_prop, names_to = "type_of_prop", values_to = "val")

# plotting the bar graph
pop_num_studies_in_pollution_buckets_graph <- pop_epi_studies_data %>%
  ggplot() +
  geom_col(mapping = aes(x = fct_reorder(region, order_pollution_group), y = val, fill = type_of_prop), position = position_dodge(), width = 0.7) +
  scale_y_continuous(breaks = seq(0, 100, 10)) +
  scale_fill_manual(values = c("tot_pop_prop" = "grey", "tot_studies_prop" = "cornflowerblue"), labels = c("Proportion of World Population in Bucket", "Proportion of Studies Completed in Bucket")) + 
  labs(x = "Mean PM2.5 bucket (in µg/m³)",  y = "Percentage", fill = "", 
       caption = str_wrap("*This graph 'only' takes into account the PM2.5 specific studies. For multi-country (pooled) studies, it averages the mean PM2.5 values, across all countries."), width = 10) +
  theme_hc() +
  theme(axis.line.y = element_line(color = "black"), 
        axis.line.x = element_line(color = "black"), 
        plot.caption = element_text(size = 8, hjust = 0), 
        plot.caption.position = "plot") +
  viridis::scale_fill_viridis(discrete = TRUE)


#> other calculations for this block of code

#> total pop where pollution is > 50 micrograms per cubic meter 

# thresholds
pm2.5_thresh_highly_polluted <- 50
pm2.5_thresh_severly_polluted <- 75
pm2.5_thresh_low_polluted <- 25

# number of PM2.5 specific studies
epi_pm2.5_spec_studies_total <- epi %>% 
  distinct(paper_uid, .keep_all = TRUE) %>%
  filter(non_pm2.5 == 0) %>%
  nrow()

# higher than hp

# total pop where pollution is > x micrograms per cubic meter (hp)
total_pop_above_x_hp_micr_gm <- aqli_color %>%
  filter(pm2020 > pm2.5_thresh_highly_polluted) %>%
  summarise(tot_pop = sum(population, na.rm = TRUE)) %>%
  unlist()

# percent pop where pollution is > x micrograms per cubic meter (hp)
perc_pop_above_x_hp_micr_gm <- round((total_pop_above_x_hp_micr_gm/world_pop)*100, 1)

# total pop where pollution is > x micrograms per cubic meter in millions (hp)
tot_pop_above_x_hp_micr_gm_millions <- round(total_pop_above_x_hp_micr_gm/1000000, 1)

# higher than sp

# total pop where pollution is > x micrograms per cubic meter (sp)
total_pop_above_x_sp_micr_gm <- aqli_color %>%
  filter(pm2020 > pm2.5_thresh_severly_polluted) %>%
  summarise(tot_pop = sum(population, na.rm = TRUE)) %>%
  unlist()

# percent pop where pollution is > x micrograms per cubic meter (sp)
perc_pop_above_x_sp_micr_gm <- round((total_pop_above_x_sp_micr_gm/world_pop)*100, 1)

# total pop where pollution is > x micrograms per cubic meter in millions (sp)
tot_pop_above_x_sp_micr_gm_millions <- round(total_pop_above_x_sp_micr_gm/1000000, 1)

# less than lp

# total pop where pollution is < x micrograms per cubic meter (lp)
total_pop_below_x_lp_micr_gm <- aqli_color %>%
  filter(pm2020 < pm2.5_thresh_low_polluted) %>%
  summarise(tot_pop = sum(population, na.rm = TRUE)) %>%
  unlist()

# percent pop where pollution is < x micrograms per cubic meter (lp)
perc_pop_below_x_lp_micr_gm <- round((total_pop_below_x_lp_micr_gm/world_pop)*100, 1)

# total pop where pollution is > x micrograms per cubic meter in millions (lp)
tot_pop_below_x_lp_micr_gm_millions <- round(total_pop_below_x_lp_micr_gm/1000000, 1)


#> percent studies in a given pollution bucket 


# higher than hp: 
epi_num_studies_in_hp <- epi %>%
  filter(!is.na(mean_pm2.5) , mean_pm2.5 != "NA", non_pm2.5 == 0) %>%
  group_by(paper_uid) %>%
  summarise(mean_pm2.5 = mean(mean_pm2.5, na.rm = TRUE)) %>%
  filter(mean_pm2.5 > pm2.5_thresh_highly_polluted) %>%
  nrow() %>%
  unlist()

perc_epi_studies_in_hp <- round((epi_num_studies_in_hp/nrow(epi %>% distinct(paper_uid, .keep_all = TRUE) %>% filter(non_pm2.5 == 0)))*100, 1)

# higher than sp: makes sense to count different countries in a pooled study as different studies.

epi_num_studies_in_sp <- epi %>%
  filter(!is.na(mean_pm2.5), mean_pm2.5 != "NA", non_pm2.5 == 0) %>%
    group_by(paper_uid) %>%
  summarise(mean_pm2.5 = mean(mean_pm2.5, na.rm = TRUE)) %>%
  filter(mean_pm2.5 > pm2.5_thresh_severly_polluted) %>%
  nrow() %>%
  unlist()

perc_epi_studies_in_sp <- round((epi_num_studies_in_sp/nrow(epi %>% distinct(paper_uid, .keep_all = TRUE) %>% filter(non_pm2.5 == 0)))*100, 1)

# less than lp: makes sense to count different countries in a pooled study as different studies.

epi_num_studies_in_lp <- epi %>%
  filter(!is.na(mean_pm2.5), mean_pm2.5 != "NA", non_pm2.5 == 0) %>%
  group_by(paper_uid) %>%
  summarise(mean_pm2.5 = mean(mean_pm2.5, na.rm = TRUE)) %>%
  filter(mean_pm2.5 < pm2.5_thresh_low_polluted) %>%
  nrow() %>%
  unlist()

perc_epi_studies_in_lp <- round((epi_num_studies_in_lp/nrow(epi %>% distinct(paper_uid, .keep_all = TRUE) %>% filter(non_pm2.5 == 0)))*100, 1)


# plot population and number of studies bar graph
pop_num_studies_in_pollution_buckets_graph

# plot mean pm2.5 country wise graph
mean_pm2.5_country_wise_graph

 

 


Geographic Distribution of Studies

#> Add a note mentioning that the underlying data for this graph includes all types of pollutants and also that, each country in a pooled study is counted separately.

    # generate summary dataset for total number of studies conducted in a given country
    epi_country_count <- epi %>%
      dplyr::select(country) %>%
      dplyr::count(country) %>%
      dplyr::filter(country != "NA", !is.na(country), !is.na(n))

    # making country names match before joining epi file with color file
    epi_country_count$country <- stringr::str_replace(epi_country_count$country, "USA", "United States")

    #> plot choropleth map

    # get data ready for the choropleth map
    world <- rnaturalearth::ne_countries(scale = "medium", returnclass = "sf")


    # making country names match before joining epi file with color file
    world$name_long <- stringr::str_replace(world$name_long, "Russian Federation", "Russia")
    world$name_long <- stringr::str_replace(world$name_long, "Lao PDR", "Laos")
    world$name_long <- stringr::str_replace(world$name_long, "Dem. Rep. Korea", "North Korea")
    world$name_long <- stringr::str_replace(world$name_long, "Republic of Korea", "South Korea")
    world$name_long <- stringr::str_replace(world$name_long, "Vatican", "Vatican City")
    world$name_long <- stringr::str_replace(world$name_long, "Brunei Darussalam", "Brunei")
    world$name_long <- stringr::str_replace(world$name_long, "Somaliland", "Somalia")
    world$name_long <- stringr::str_replace(world$name_long, "Mexico", "México")
    world$name_long <- stringr::str_replace(world$name_long, "Republic of Congo", "Republic of the Congo")
    world$name_long <- stringr::str_replace(world$name_long, "Czech Republic", "Czechia")

    # joining world shape file with the epi conutry wise count dataset
    world_shp_epi <- world %>%
      dplyr::left_join(epi_country_count, by = c("name_long" = "country")) %>%
      dplyr::select(name_long, n, geometry) %>%
      dplyr::rename(num_studies = n, country = name_long)

    # color 2020 collapse to country level to get the PM2.5 pollution layer (first layer of the map)
    aqli_color_country <- aqli_color %>%
      dplyr::group_by(country) %>%
      dplyr::mutate(pop_weights = population/sum(population, na.rm = TRUE),
                    pm2021_pop_weighted = pm2021*pop_weights) %>%
      dplyr::summarise(avg_pm2.5_2021 = sum(pm2021_pop_weighted, na.rm = TRUE))

    # joining the world shape + epi joined file with the aqli color file
    world_shp_epi_color <- world_shp_epi %>%
      dplyr::left_join(aqli_color_country, by = "country") %>%
      dplyr::filter(country != "Antarctica") %>%
      dplyr::select(country, num_studies, avg_pm2.5_2021, geometry)

    # In the total number of studies column, replacing NAs with 0s
    world_shp_epi_color <- world_shp_epi_color %>%
      dplyr::mutate(num_studies = ifelse(is.na(num_studies) == TRUE, 0, num_studies))

    # getting centroids for countries after correcting for problematic polygons
    world_shp_epi_color_for_centroids <<- sf::st_make_valid(world_shp_epi_color)
    country_wise_centroids <<- sf::st_centroid(world_shp_epi_color_for_centroids, of_largest_polygon = TRUE)

    # adding an identifier column for countries where no studies have happened
    country_wise_centroids$color_circle = ifelse(country_wise_centroids$num_studies == 0,  "No Studies", "Atleast 1 study")

    # creating a dataframe out of the latitude, longitude coordinates of the country centroids, so that we can plot it
    # using geom_point.
    coord_df <- dplyr::as_tibble(sf::st_coordinates(country_wise_centroids))

    # reassigning the X and Y coordinates to x_coord and y_coord
    country_wise_centroids$x_coord <- coord_df$X
    country_wise_centroids$y_coord <- coord_df$Y

    # moving the geometry column to the end and making sure that the entire dataset is a sf object
    country_wise_centroids <- country_wise_centroids %>%
      dplyr::select(-geometry, geometry) %>%
    sf::st_as_sf()

    # without geometry version of country wise centroids dataset and adding a label_plt column
    country_wise_centroids_sans_geom <- country_wise_centroids %>%
    sf::st_drop_geometry() %>%
      dplyr::as_tibble() %>%
      dplyr::mutate(label_plt = ifelse(num_studies == 0, "", num_studies))


# plotting the choropleth
geographic_dist_graph <- world_shp_epi_color %>%
        ggplot2::ggplot() +
        ggplot2::geom_sf(mapping = ggplot2::aes(fill = avg_pm2.5_2021, group = avg_pm2.5_2021), color = "darkgrey") +
        ggplot2::geom_sf(data = world_shp_epi_color %>% dplyr::filter(num_studies > 0) %>% sf::st_as_sf(), color = "black", size = 6, fill = "transparent") +
        colorspace::scale_fill_continuous_sequential(palette = "YlOrRd", name = expression("Annual Average" ~ PM[2.5] ~ "in 2021 (in µg/m³)")) +
        ggplot2::geom_jitter(data = country_wise_centroids_sans_geom, mapping = aes(x = x_coord, y = y_coord, size = num_studies, color = color_circle)) +
        ggplot2::geom_text(data = country_wise_centroids_sans_geom, mapping = aes(x = x_coord, y = y_coord, label = label_plt), size = 3, color = "white", size = 2) +
        ggplot2::theme(legend.position = "bottom") +
        ggplot2::scale_size(breaks = c(1, 5, 10, 20, 30), range = c(1, 12),
                            guide = ggplot2::guide_legend(override.aes = list(color = "darkgrey"))) +
        ggplot2::labs(caption = stringr::str_wrap("*The size of the circles is directly proportional to the total number of times a given country has been included
                                                in a study (any study, not necessarily PM2.5 specific - mentioned as the number in the circle). Countries with atleast 1 study are highlighted (black borders)."), width = 30,
                      size = "Number of Studies") +
        ggplot2::scale_color_manual(values = c("No Studies" = "cadetblue3", "Atleast 1 studys" = "darkolivegreen"), name = "") +
        ggthemes::theme_map() +
       ggplot2::ggtitle(expression("Country wise distribution of epi studies and Annual Average" ~PM[2.5] ~ "pollution (in µg/m³)")) +
        ggplot2::theme(plot.caption = ggplot2::element_text(hjust = 0, size = 7),
                       legend.position = "bottom",
                       legend.justification = c(0.5, 3),
                       plot.title = ggplot2::element_text(hjust = 0.5, size = 11), 
                       legend.text = element_text(size = 5), 
                       legend.title = element_text(size = 6))

#> Other calculations for this block of code

# Of all AQ epi studies included in this analysis how many included populations in either the US or Canada or both
num_times_usa_canada_included <- epi %>% 
  filter(country %in% c("USA", "Canada")) %>%
  distinct(paper_uid, .keep_all = TRUE) %>%
  nrow()

# total number of times USA, Canada and Europe have been included in a study
num_times_usa_canada_europe_included <- epi %>% 
  filter((country %in% c("Canada", "USA")) | (continent %in% c("Europe"))) %>% 
  distinct(paper_uid, .keep_all = TRUE) %>%
  nrow()

# Countries other than these: USA, Canada and the ones that are included in Europe that have been included in a study
areas_other_than_usa_canada_europe <- epi %>% 
  filter((country %notin% c("Canada", "USA")) & (continent %notin% c("Europe"))) %>%
  select(country) %>%
  unique() %>%
  unlist() %>%
  as.vector()

# number of times a continent have been included in a study
continent_wise_study_summary <- epi %>%
  group_by(paper_uid, continent) %>% 
  summarise(count = n()) %>%
  ungroup() %>%
  select(continent) %>%
  count(continent) %>%
  slice_max(n, n = 6) %>%
  mutate(prop_studies = round((n/sum(n, na.rm = TRUE))*100, 1))

# total number of times South America was included in a study
num_times_sa_included <- continent_wise_study_summary %>%
  filter(continent == "South America") %>%
  select(n) %>%
  unlist() %>%
  as.vector()

# total number of times Oceania was included in a study
num_times_oceania_included <- continent_wise_study_summary %>%
  filter(continent == "Oceania") %>%
  select(n) %>%
  unlist() %>%
  as.vector()

# total number of studies Africa has been included in a study
num_times_africa_included <- continent_wise_study_summary %>%
  filter(continent == "Africa") %>%
  select(n) %>%
  unlist() %>%
  as.vector()

# plot geographic distribution
geographic_dist_graph

 


AQ Epi studies over time

 

#> Add a note mentioning that the underlying data for this graph includes all types of pollutants but only one row is counted for a given pooled (multi-country) study.

epi %>% 
dplyr::distinct(paper_uid, .keep_all = TRUE) %>%
  group_by(publishing_year) %>%
  summarise(total_studies = n()) %>%
  ungroup() %>%
  filter(total_studies != "NA", !is.na(total_studies), publishing_year != "NA", !is.na(publishing_year)) %>%
  ggplot() +
  geom_line(mapping = aes(x = publishing_year, y = total_studies), lwd = 1.2, color = "cornflowerblue") +
  labs(x = "Year", y = "Total Number of Studies", 
       caption = str_wrap("*This graph displays the total number of AQ epi studies (all studies, not PM2.5 specific) published over time."), width = 10) +
  scale_x_continuous(breaks = seq(min(epi$publishing_year, na.rm = TRUE), max(epi$publishing_year, na.rm = TRUE), 2)) +
  theme_hc() +
    theme(plot.caption = element_text(hjust = 0))

 


Distribution of duration of study by continent

#>  In this, in the graph multiple countries of a pooled study are counted separately. Also, this is not a PM2.5 specific graph.

#> Density plot of study duration (commented for now, can use later if required)
# epi %>%
#   filter(!is.na(study_duration)) %>%
#   ggplot() +
#   geom_density(mapping = aes(x = study_duration, fill = continent), alpha = 0.5) +
#     labs(x = "Study Duration") +
#   ggtitle("Epi Studies Duration Distribution (Continent Wise): Density Plot") +
#   scale_x_continuous(breaks = seq(0, 40, 5)) +
#   theme_tufte() +
#   theme(legend.position = "bottom")


#> Histogram of study duration



#> adding an order panel and also dummy panels (for those cases in which we are looking at a subset of a dataset that does not include certain continents).


   if(sum(missing_continents_logical) == 0){
      continent_wise_study_duration_graph_summary_table <-   epi_tmp %>%
        dplyr::filter(continent != "NA", !is.na(continent), study_duration != "NA", !is.na(study_duration)) %>%
        dplyr::group_by(paper_uid, continent) %>%
        dplyr::summarise(average_study_duration = mean(study_duration, na.rm = TRUE)) %>%
        dplyr::ungroup() %>%
        dplyr::filter(continent != "Oceania") %>%
        #dplyr::mutate(continent = as.factor(continent)) %>%
        dplyr::mutate(order_continent = ifelse(continent == "China", 2, 0),
                      order_continent = ifelse(continent == "Asia - China", 1, order_continent),
                      order_continent = ifelse(continent == "Europe", 5, order_continent),
                      order_continent = ifelse(continent == "North America", 6, order_continent),
                      order_continent = ifelse(continent == "South America", 4, order_continent),
                      order_continent = ifelse(continent == "Africa", 3, order_continent)) %>%
        dplyr::mutate(continent = ifelse(continent == "China", "China (1402.5 million people)", continent),
                      continent = ifelse(continent == "Africa", "Africa (1373.8 million people)", continent),
                      continent = ifelse(continent == "North America", "North America (589.2 million people)", continent),
                      continent = ifelse(continent == "South America", "South America (429.2 million people)", continent),
                      continent = ifelse(continent == "Europe", "Europe (740.8 million people)", continent),
                      continent = ifelse(continent == "Asia - China", "Asia - China (3161.5 million people)", continent))

    } else{
      for(i in 1:length(missing_continents)){
        if(i == 1){
          continent_wise_study_duration_graph_summary_table <- epi_tmp %>%
            dplyr::filter(continent != "NA", !is.na(continent), study_duration != "NA", !is.na(study_duration)) %>%
            dplyr::group_by(paper_uid, continent) %>%
            dplyr::summarise(average_study_duration = mean(study_duration, na.rm = TRUE)) %>%
            dplyr::ungroup() %>%
          dplyr::add_row(continent = missing_continents[i])
        } else{
          continent_wise_study_duration_graph_summary_table <- continent_wise_study_duration_graph_summary_table %>%
            dplyr::add_row(continent = missing_continents[i])
        }

      }
      continent_wise_study_duration_graph_summary_table <- continent_wise_study_duration_graph_summary_table %>%
        dplyr::filter(continent != "Oceania") %>%
       # dplyr::mutate(continent = as.factor(continent)) %>%
        dplyr::mutate(order_continent = ifelse(continent == "China", 2, 0),
                      order_continent = ifelse(continent == "Asia - China", 1, order_continent),
                      order_continent = ifelse(continent == "Europe", 5, order_continent),
                      order_continent = ifelse(continent == "North America", 6, order_continent),
                      order_continent = ifelse(continent == "South America", 4, order_continent),
                      order_continent = ifelse(continent == "Africa", 3, order_continent)) %>%
        dplyr::mutate(continent = ifelse(continent == "China", "China (1402.5 million people)", continent),
                      continent = ifelse(continent == "Africa", "Africa (1373.8 million people)", continent),
                      continent = ifelse(continent == "North America", "North America (589.2 million people)", continent),
                      continent = ifelse(continent == "South America", "South America (429.2 million people)", continent),
                      continent = ifelse(continent == "Europe", "Europe (740.8 million people)", continent),
                      continent = ifelse(continent == "Asia - China", "Asia - China (3161.5 million people)", continent))
    }



continent_wise_study_duration_graph <- continent_wise_study_duration_graph_summary_table %>%
        ggplot2::ggplot() +
        ggplot2::geom_histogram(mapping = ggplot2::aes(x = average_study_duration, fill = continent, group = continent), color = "white", position = "identity", binwidth = 2) +
        ggplot2::labs(x = "Study Duration", y = "Count",
                      caption = stringr::str_wrap("*This graph displays distribution by continent (and for China, Asia - China) for all studies (not PM2.5 specific)."), width = 10,
                      title = expression("Continent (and China) wise Study Duration Distribution and 2021 Annual Average" ~ PM[2.5] ~ "(in µg/m³)"),
                      subtitle = "(Ordered from most polluted to least polluted)") +
        ggplot2::scale_x_continuous(breaks = seq(0, 40, 10)) +
        ggthemes::theme_hc() +
        ggplot2::theme(legend.title = element_blank()) +
        ggplot2::facet_wrap(~forcats::fct_reorder(continent, order_continent), nrow = 1) +
        ggplot2::theme(axis.line.y = ggplot2::element_line(color = "black"),
                       axis.line.x = ggplot2::element_line(color = "black"),
                       plot.caption = ggplot2::element_text(hjust = 0, size = 7),
                       plot.title = element_text(size = 10, hjust = 0.5),
                       plot.subtitle = element_text(size = 8, hjust = 0.5),
                       strip.background = element_rect(fill = "floralwhite"),
                       strip.text = element_text(size = 4.5),
                       legend.background = element_rect(color = "black"), 
                       legend.text = element_text(size = 7)) +
        ggplot2::scale_fill_manual(values = c("North America (589.2 million people)" = "#FFBA00", "Europe (740.8 million people)" = "#FF9600",
                                     "South America (429.2 million people)" = "#FF6908", "Africa (1373.8 million people)" = "#E63D23", "China (1402.5 million people)" = "#BD251C",
                                     "Asia - China (3161.5 million people)" = "#8C130E")) +
        ggplot2::scale_y_continuous(breaks = seq(0, 10, 2))





#> calculations relevant for this block of code

# long term study threshold
long_term_study_duration <- 25

# number of "really long term studies"
num_really_long_term_studies <- epi %>%
  group_by(paper_uid) %>%
  summarise(mean_duration = mean(study_duration, na.rm = TRUE)) %>%
  filter(mean_duration > long_term_study_duration) %>%
  nrow()


# continents that have seen studies greater than x (long_term) years of study duration
epi_long_term_studies_countries <- epi %>%
  group_by(paper_uid, continent) %>%
  summarise(mean_study_duration = mean(study_duration, na.rm = TRUE)) %>% 
  ungroup() %>%
  filter(mean_study_duration > long_term_study_duration, mean_study_duration != "NA", !is.na(mean_study_duration)) %>%
  select(continent) %>%
  count(continent) %>%
  mutate(prop_n = round((n/sum(n, na.rm = TRUE))*100, 1)) %>%
  slice_max(prop_n, n = 6)

# total population in missing continents combined
tot_pop_africa_missing_continents <- aqli_color %>%
  filter(continent %in% missing_continents) %>%
  summarise(tot_pop = sum(population, na.rm = TRUE)) %>%
  unlist() %>%
  as.numeric() 

tot_pop_africa_missing_continents_in_billions <- round(tot_pop_africa_missing_continents/1000000000, 1)

# total world population
world_population <- aqli_color %>%
  summarise(total_population = sum(population, na.rm = TRUE)) %>%
  unlist() %>%
  as.numeric()



# continent order list (order variable: number of studies): code present in the geographical distribution map block.

# plot continent wise study duration graph
continent_wise_study_duration_graph
# press ctrl + shift + c to uncomment this.

# ### Major takeaways
# 
# * `r round((num_times_usa_canada_included/(epi %>% distinct(paper_uid, .keep_all = TRUE) %>% nrow()))*100, 1)` percent of all AQ epi studies (`r num_times_usa_canada_included` studies) included in this analysis included populations in either the US or Canada or both.
# 
# * Of the total number of times a given continent's population has been included in any given AQ epi study, `r continent_wise_study_summary$continent[1]` dominates the rest of the continents and has been included `r continent_wise_study_summary$prop_studies[1]` percent of times (all due to studies in the US or Canada). Closely following North America, populations in `r continent_wise_study_summary$continent[2]` and `r continent_wise_study_summary$continent[3]` have been included in AQ epi studies `r continent_wise_study_summary$prop_studies[2]` and `r continent_wise_study_summary$prop_studies[3]` percent of times respectively. `r missing_continents` have seen 0 long term (> `r min_study_duration_in_years` year), large (>`r min_sample_size` people in the sample) AQ epi study.
#   
# * There have been no large (> `r min_study_duration_in_years` year) or long (>`r min_sample_size` people in the sample) PM epi studies that were performed in `r missing_continents`, with a combined population of `r tot_pop_africa_missing_continents_in_billions` billion, `r round((tot_pop_africa_missing_continents/world_population)*100, 1)` percent of the Earth's total human population.
# 
# * Post 2009, there has been a noticeable increase in the overall volume of AQ epi studies published.
# 
# * In total, `r num_really_long_term_studies` "really" long term studies (> `r long_term_study_duration` years) have been performed. That is, `r round((num_really_long_term_studies/nrow(epi %>% distinct(paper_uid, .keep_all = TRUE)))*100, 1)` percent of the total number of studies.
# 
# * All of the following continents have seen at least 1 really long term study: `r epi_long_term_studies_countries %>% filter(n >= 1) %>% select(continent) %>% unlist() %>% as.vector()`.

 


Conclusion

All studies included in this analysis point to the same overall picture:air pollution is a serious health threat. The existing state of scientific literature on air pollution and health is clear that air pollution’s impact on health is well-established and taking action to improve a polluted environment should not be delayed in order to complete multi-year large sample (> r min_sample_size) epidemiological studies in an area, even if there has not been a prior study in that particular geography. That said, it is important for the field of air quality epidemiolgy to understand the contours of its current research landscape to most effectively identify directions for future research and deploy limited resources.

 

Methodology

Through a comprehensive and ongoing* literature review, we are making an attempt at creating an exhaustive public listing of all the epidemiological studies out there (that we could find) that examine the relationship between PM~2.5~ and Life Expectancy/Mortality.

For each study, we record data on key defining features, such as: Geography, Sample Size, Study Duration, PM~2.5~ exposure range, etc. Then we used this analysis dataset to carry out a meta-analysis, results of which are detailed in the Results section above.

We are seeking to make this analysis as current, complete and error-free as possible and view it as a continual work in progress. We would welcome the air quality community’s any comments, corrections, andor suggestions. Please contact aqli-epic@uchicago.edu or leave a comment in this GitHub repository.

 

Inclusion/Exclusion Criteria for studies

 

Other important points

 

How can you (the community) help in improving this analysis?

 

 


References

 


Other Graphs and Interactive Summary Dashboard

To further explore these graphs and more in an interactive setup, visit the AQ Epi dashboard.

####  PM2.5 distributions of the lower limits and upper limits of exposure range (Density and Histogram Plots)

## All graphs below this block of code is deactivated for the blog post. This will be displayed on the dashboard, alongside other graphs.


#> Specific to PM2.5 and count separate countries in a pooled study separately

#> exposure distribution density and histogram plots (PM2.5 LL and PM2.5 UL)

# create long dataset
epi_long <- epi %>% 
  filter(!is.na(mean_pm2.5), mean_pm2.5 != "NA", non_pm2.5 == 0) %>%
  pivot_longer(cols = contains("pm2.5_exposure"), names_to =  "exposure_type", values_to = "exposure_value") %>%
  select(paper_uid, exposure_type, exposure_value) %>%
  filter(!is.na(exposure_value)) 

# density plot
epi_long %>%
  group_by(paper_uid, exposure_type) %>%
  summarise(exposure_value = ifelse(exposure_type == "pm2.5_exposure_ll", min(exposure_value, na.rm = TRUE), max(exposure_value, na.rm = TRUE))) %>%
  ggplot() +
  geom_density(mapping = aes(x = exposure_value, fill = exposure_type), alpha = 0.6, position = "identity") +
  labs(x = expression(paste("PM2.5 concentration (", mu, "g", "/", m^3, ")"))) +
  theme_tufte()

# histogram plot
epi_long %>%
  group_by(paper_uid, exposure_type) %>%
  summarise(exposure_value = ifelse(exposure_type == "pm2.5_exposure_ll", min(exposure_value, na.rm = TRUE), max(exposure_value, na.rm = TRUE))) %>%
  ggplot() +
  geom_histogram(mapping = aes(x = exposure_value, fill = exposure_type), position = "identity", alpha = 0.4, color = "white") +
  labs(x = expression(paste("PM2.5 concentration (", mu, "g", "/", m^3, ")"))) +
  theme_tufte()
#### Age distributions of lower limits and upper limits of age range (Density and Histogram plots)

#> Different countries in a pooled study should be counted separately. As of now this is PM2.5 specific, but can be generalized in the future.

#> (5) age distribution plot (LL and UL) using the long dataset (density plot)
epi_long_age_dist <- epi %>% 
  filter(((!is.na(cohort_age_ll)) | (cohort_age_ll != "NA")) & ((!is.na(cohort_age_ul)) | (cohort_age_ul != "NA"))) %>%
  pivot_longer(cols = contains("cohort_age"), names_to =  "age_dist_type", values_to = "age_value") %>%
  select(paper_uid, age_dist_type, age_value) %>%
  filter(!is.na(age_value)) 

epi_long_age_dist %>%
  ggplot() +
  geom_density(mapping = aes(x = age_value, fill = age_dist_type), alpha = 0.6, color = "black") +
  labs(x = "Age") +
  theme_tufte()

#> (6) age distribution plot (LL and UL) using the long dataset (histogram plot)
epi_long_age_dist %>%
  ggplot() +
  geom_histogram(mapping = aes(x = age_value, fill = age_dist_type), alpha = 0.5, position = "identity", color = "white") +
  labs(x = "Age") +
   theme_tufte()
#### Country wise distribution of Cohort Sizes (Density and Histogram plots)

#> Different countries in a pooled study should be counted separately. As of now this is PM2.5 specific, but can be generalized in the future.


#> (7) Country Wise Cohort Size density plot
epi %>%
    filter(cohort_size != "NA", !is.na(cohort_size)) %>%
  ggplot() +
  geom_density(mapping = aes(x = log10(cohort_size), fill = country), alpha = 0.5) +
  theme_tufte()

#> (8) Country wise Cohort Size histogram plot
epi %>%
  filter(cohort_size != "NA", !is.na(cohort_size)) %>%
  ggplot() +
  geom_histogram(mapping = aes(x = log10(cohort_size), fill = country), alpha = 0.5, position = "identity") +
  theme_tufte()
#### Country wise Distribution of Study Duration (Density and Histogram Plots)

#> Different countries in a pooled study should be counted separately. As of now this is PM2.5 specific, but can be generalized in the future.


#> (9) Country wise Study duration density plot
epi %>%
  filter(study_duration != "NA", !is.na(study_duration)) %>%
  ggplot() +
  geom_density(mapping = aes(x = study_duration, fill = country, alpha = 0.5)) +
  theme_tufte()

#> Country Wise Study Duration histogram

epi %>%
  filter(study_duration != "NA", !is.na(study_duration)) %>%
  ggplot() +
  geom_histogram(mapping = aes(x = study_duration, fill = country, alpha = 0.5), position = "identity", alpha = 0.5, color = "white", binwidth = 1) +
  theme_tufte() +
  facet_wrap(~country, nrow = 5) +
  theme(legend.position = "bottom")
# Experimental graphs (not to be put on the GitHub blog)

#### Country wise Distribution of Study Duration (Density and Histogram Plots)

#> Different countries in a pooled study should be counted separately. As of now this is PM2.5 specific, but can be generalized in the future.


#> (9) Country wise Study duration density plot
epi %>%
  filter(study_duration != "NA", !is.na(study_duration)) %>%
  ggplot() +
  geom_density(mapping = aes(x = study_duration, fill = country, alpha = 0.5)) +
  theme_tufte()

#> Country Wise Study Duration histogram

epi %>%
  filter(study_duration != "NA", !is.na(study_duration)) %>%
  ggplot() +
  geom_histogram(mapping = aes(x = study_duration, fill = country, alpha = 0.5), position = "identity", alpha = 0.5, color = "white", binwidth = 1) +
  theme_tufte() +
  facet_wrap(~country, nrow = 5) +
  theme(legend.position = "bottom")
# Experimental Graphs (others not to be put on the GitHub blog)

#### Country wise Distribution of Study Duration (Density and Histogram Plots)

#> Different countries in a pooled study should be counted separately. As of now this is PM2.5 specific, but can be generalized in the future.


#> (9) Country wise Study duration density plot
epi %>%
  filter(study_duration != "NA", !is.na(study_duration)) %>%
  ggplot() +
  geom_density(mapping = aes(x = study_duration, fill = country, alpha = 0.5)) +
  theme_tufte()

#> Country Wise Study Duration histogram

epi %>%
  filter(study_duration != "NA", !is.na(study_duration)) %>%
  ggplot() +
  geom_histogram(mapping = aes(x = study_duration, fill = country, alpha = 0.5), position = "identity", alpha = 0.5, color = "white", binwidth = 1) +
  theme_tufte() +
  facet_wrap(~country, nrow = 5) +
  theme(legend.position = "bottom")


# Experimental graphs

#> exp graph 1 - global aqli 2020 data pm2020 column distribution overlayed with vertical lines showing mean_pm2.5 values of various studies.---------------------------------------------------------
mean_pm2.5_studies <- epi$mean_pm2.5[!is.na(epi$mean_pm2.5)] 


# exp graph 1 v1
exp_graph_1_v1 <- aqli_color %>%
  ggplot() +
  geom_histogram(mapping = aes(x = pm2020), fill = "darkred", color = "black", alpha = 0.7, size = 0.8) +
  geom_vline(xintercept = mean_pm2.5_studies, color = "black", alpha = 0.4, linetype = "longdash", size = 0.4) + 
  ggthemes::theme_tufte() +
  labs(x = expression("Annual Average" ~ PM[2.5] ~ "concentrations in 2020 (in µg/m³)"), 
       y = "count") +
  theme(axis.line = element_line()) +
  scale_x_continuous(breaks = seq(0, 120, 20)) +
  scale_y_continuous(breaks = seq(0, 12000, 2000))

# exp graph 1 v2
FOO <- aqli_color %>%
  ggplot() +
  geom_histogram(mapping = aes(x = pm2020), fill = "darkred", color = "black", alpha = 0.7, size = 0.8) +
  geom_vline(xintercept = mean_pm2.5_studies, color = "black", alpha = 0.4, linetype = "longdash", size = 0.4) + 
  ggthemes::theme_tufte() +
  labs(x = expression("Annual Average" ~ PM[2.5] ~ "concentrations in 2020 (in µg/m³)"), 
       y = "count") +
  theme(axis.line = element_line()) +
  scale_x_continuous(breaks = seq(0, 120, 20)) +
  scale_y_continuous(breaks = seq(0, 12000, 2000)) + 
  annotate("segment", x = min(mean_pm2.5_studies, na.rm = TRUE), y = 12000, xend = max(mean_pm2.5_studies, na.rm = TRUE), yend = 12000, 
             arrow = arrow(length = unit(0.3, "cm"), type = "closed")) + 
  annotate("segment", x = 60, y = 12000, xend = 60, yend = 10000) + 
  annotate("segment", x = 60, y = 10000, xend = 90, yend = 10000) +  
  annotate("text", x = 99, y = 10000, label = str_wrap("Mean PM2.5 range of studies", 12))

# exp graph 1 v3

aqli_color %>%
  ggplot() +
  geom_histogram(mapping = aes(x = pm2020), fill = "darkred", color = "black", alpha = 0.7, size = 0.8)+
  ggthemes::theme_tufte() +
  labs(x = expression("Annual Average" ~ PM[2.5] ~ "concentrations in 2020 (in µg/m³)"), 
       y = "count") +
  theme(axis.line = element_line()) +
  scale_x_continuous(breaks = seq(0, 120, 20)) +
  scale_y_continuous(breaks = seq(0, 12000, 2000)) + 
  annotate("segment", x = min(mean_pm2.5_studies, na.rm = TRUE), y = 12000, xend = max(mean_pm2.5_studies, na.rm = TRUE), yend = 12000, 
             arrow = arrow(length = unit(0.3, "cm"), type = "closed")) + 
  annotate("segment", x = 60, y = 12000, xend = 60, yend = 10000) + 
  annotate("segment", x = 60, y = 10000, xend = 90, yend = 10000) +  
  annotate("text", x = 99, y = 10000, label = str_wrap("Mean PM2.5 range of studies", 12))

# exp graph 1 v4

foo <- aqli_color %>%
  ggplot() +
  geom_histogram(mapping = aes(x = pm2020), fill = "darkred", color = "black", alpha = 0.7, size = 0.8)+
  ggthemes::theme_tufte() +
  labs(x = expression("Annual Average" ~ PM[2.5] ~ "concentrations in 2020 (in µg/m³)"), 
       y = "count") +
  theme(axis.line = element_line()) +
  scale_x_continuous(breaks = seq(0, 120, 20)) +
  scale_y_continuous(breaks = seq(0, 12000, 2000)) + 
  annotate("rect", xmin = min(mean_pm2.5_studies, na.rm = TRUE), ymin = 0, xmax = max(mean_pm2.5_studies, na.rm = TRUE), ymax = 12000, alpha = 0.2) + 
  annotate("segment", x = max(mean_pm2.5_studies, na.rm = TRUE), y = 10000, xend = 90, yend = 10000) +  
  annotate("text", x = 99, y = 10000, label = str_wrap("Mean PM2.5 range of studies", 12))

# hex version


#> exp graph v2-------------------------------------------------------------------------

#> PM2.5 exposure range and population distribution

# exp graph 2 v1

thresh_ll <- 0
thresh_ul <- 25


# aqli color population in ordered pm2.5 buckets
aqli_color_grp_pm2.5_buckets <- aqli_color %>%
  mutate(region = ifelse(pm2020 >= thresh_ll & pm2020 <= thresh_ul, str_c(thresh_ll, "-", thresh_ul), pm2020), 
         region = ifelse(pm2020 > thresh_ul, str_c(">", thresh_ul), region)) %>%
  group_by(region) %>%
  summarise(tot_pop = sum(population, na.rm = TRUE)) %>%
  ungroup() %>%
    mutate(order_pollution_group = ifelse(region == str_c(thresh_ll, "-", thresh_ul), 1, 0), 
         order_pollution_group = ifelse(region == str_c(">", thresh_ul), 2, order_pollution_group)) %>%
  ungroup() %>%
  mutate(tot_pop_prop = (tot_pop/sum(tot_pop))*100)


# epi total number of studies in ordered pollution buckets (initially filtering out pooled studies, will add back in using a for loop)
epi_num_studies_grp_pm2.5_buckets <- epi %>%
  filter(!is.na(mean_pm2.5), mean_pm2.5 != "NA", non_pm2.5 == 0) %>%
  group_by(paper_uid) %>%
  summarise(mean_pm2.5 = mean(mean_pm2.5, na.rm = TRUE)) %>%
    mutate(region = ifelse(mean_pm2.5 >= thresh_ll & mean_pm2.5 <= thresh_ul, str_c(thresh_ll, "-", thresh_ul), mean_pm2.5), 
         region = ifelse(mean_pm2.5 > 25, str_c(">", thresh_ul), region)) %>%
  mutate(order_pollution_group = ifelse(region == str_c(thresh_ll, "-", thresh_ul), 1, 0), 
         order_pollution_group = ifelse(region == str_c(">", thresh_ul), 2, order_pollution_group)) %>%
  group_by(region) %>%
  summarise(tot_studies = n(), order_pollution_group = order_pollution_group[1]) %>%
  ungroup() %>%
  mutate(tot_studies_prop = (tot_studies/sum(tot_studies))*100)

#> bar chart version

# creating a master dataset for both population and studies data
pop_epi_studies_data <- aqli_color_grp_pm2.5_buckets %>%
  left_join(epi_num_studies_grp_pm2.5_buckets, by = c("region", "order_pollution_group")) %>%
  select(region,  order_pollution_group, tot_pop_prop, tot_studies_prop) %>%
  pivot_longer(cols = tot_pop_prop:tot_studies_prop, names_to = "type_of_prop", values_to = "val")

# plotting the bar graph
pop_num_studies_in_pollution_buckets_graph <- pop_epi_studies_data %>%
  ggplot() +
  geom_col(mapping = aes(x = fct_reorder(region, order_pollution_group), y = val, fill = type_of_prop), position = position_dodge(), width = 0.4) +
  scale_y_continuous(breaks = seq(0, 100, 10)) +
  scale_fill_manual(values = c("tot_pop_prop" = "grey", "tot_studies_prop" = "cornflowerblue"), labels = c("Proportion of World Population in Bucket", "Proportion of Studies Completed in Bucket")) + 
  labs(x = "Mean PM2.5 bucket (in µg/m³)",  y = "Percentage", fill = "", 
       caption = str_wrap("*This graph 'only' takes into account the PM2.5 specific studies. For multi-country (pooled) studies, it averages the mean PM2.5 values, across all countries."), width = 10) +
  theme_hc() +
  theme(axis.line.y = element_line(color = "black"), 
        axis.line.x = element_line(color = "black"), 
        plot.caption = element_text(size = 8, hjust = 0), 
        plot.caption.position = "plot")

# cartogram (using objects from the geographic distribution chunk)
foo <- world_shp_epi_color %>%
  st_as_sf()

# country level population
aqli_color_country_summary_population <- aqli_color %>%
  group_by(country) %>%
  mutate(pop_weights = population/sum(population, na.rm = TRUE), 
         pm2020_pop_weighted = pm2020*pop_weights) %>%
  summarise(avg_pm2.5_2020 = sum(pm2020_pop_weighted, na.rm = TRUE), 
            tot_pop = sum(population, na.rm = TRUE)) %>%
  select(country, tot_pop)

foo <- foo %>%
  left_join(aqli_color_country_summary_population, by = "country")


for(i in 1:nrow(foo)) {
  if(is.na(foo$num_studies[i])){
    foo$num_studies[i] <- (sample(1:20, 1))*10000
  } else {
    foo$num_studies[i] <- foo$num_studies[i]*10000
  }
}

foo <- foo %>% mutate(num_studies = ifelse(is.na(num_studies), 0, num_studies))

foo <-  foo %>% 
  mutate(num_studies_norm = num_studies/sum(num_studies, na.rm = TRUE), 
         num_studies_log = log10(num_studies + 1.00001))

foo <- foo %>%
  select(-geometry, geometry) %>%
  st_as_sf()

foo %>% 
  ggplot() + 
  geom_sf()

foo_transform <- st_transform(foo, 3857)

foo_transform_cont <- cartogram_cont(foo_transform,
                 weight = "num_studies_norm", itermax = 5)


  ggplot(foo_transform_cont) +
  geom_sf(mapping = aes(fill = num_studies))


foo_sp <- as(foo, "Spatial")



foo_sp_transform <- spTransform(foo_sp, CRS("+init=epsg:3395"))
foo_sp_transform_cont <- cartogram_cont(foo_sp_transform, "num_studies", itermax = 2)
tm_shape(foo_sp_transform_cont) + tm_polygons("total_pop", style = "jenks") +
   tm_layout(frame = FALSE, legend.position = c("left", "bottom"))


#> population pollution figure (version 2)-------------------------------
# pollution buckets on the x 


# adding pollution buckets
aqli_color_with_pol_buckets <- aqli_color %>%
  mutate(pol_buckets = ifelse(((pm2021 >= 0) & (pm2021 <= 5)), "0 - 5", NA),
         pol_buckets = ifelse(((pm2021 > 5) & (pm2021 <= 10)), ">5 - 10", pol_buckets),
         pol_buckets = ifelse(((pm2021 > 10) & (pm2021 <= 20)), ">10 - 20", pol_buckets),
         pol_buckets = ifelse(((pm2021 > 20) & (pm2021 <= 40)), ">20 - 40", pol_buckets),
         pol_buckets = ifelse(((pm2021 > 40) & (pm2021 <= 80)), ">40 - 80", pol_buckets),
         pol_buckets = ifelse(((pm2021 > 80)), ">80", pol_buckets)) %>%
  mutate(pol_bucket_order = ifelse(pol_buckets == "0 - 5", 1, NA),
         pol_bucket_order = ifelse(pol_buckets == ">5 - 10", 2, pol_bucket_order),
         pol_bucket_order = ifelse(pol_buckets == ">10 - 20", 3, pol_bucket_order),
         pol_bucket_order = ifelse(pol_buckets == ">20 - 40", 4, pol_bucket_order),
         pol_bucket_order = ifelse(pol_buckets == ">40 - 80", 5, pol_bucket_order),
         pol_bucket_order = ifelse(pol_buckets == ">80", 6, pol_bucket_order))

# converting the 2 newly created columns to type factor
aqli_color_with_pol_buckets$pol_buckets <- as.factor(aqli_color_with_pol_buckets$pol_buckets)
aqli_color_with_pol_buckets$pol_bucket_order <- as.factor(aqli_color_with_pol_buckets$pol_bucket_order)


foo <- aqli_color_with_pol_buckets %>%
  group_by(pol_buckets) %>%
  summarise(tot_pop = sum(population, na.rm = TRUE)) %>%
  ungroup() %>%
  mutate(pol_bucket_order = ifelse(pol_buckets == "0 - 5", 1, NA),
         pol_bucket_order = ifelse(pol_buckets == ">5 - 10", 2, pol_bucket_order),
         pol_bucket_order = ifelse(pol_buckets == ">10 - 20", 3, pol_bucket_order),
         pol_bucket_order = ifelse(pol_buckets == ">20 - 40", 4, pol_bucket_order),
         pol_bucket_order = ifelse(pol_buckets == ">40 - 80", 5, pol_bucket_order),
         pol_bucket_order = ifelse(pol_buckets == ">80", 6, pol_bucket_order)) %>%
  filter(!is.na(pol_buckets)) %>%
  mutate(tot_pop_millions = round(tot_pop/1000000, 1)) %>%
  ggplot() +
  geom_col(mapping = aes(x = forcats::fct_reorder(pol_buckets, pol_bucket_order), y = tot_pop_millions, fill = forcats::fct_reorder(pol_buckets, pol_bucket_order)), width = 0.2) +
  ggthemes::theme_hc() +
  labs(x =  expression("Annual Average" ~ PM[2.5] ~ ("in µg/m³")), y = "Total Population (in millions)",
       title = "", fill = expression("Annual Average" ~ PM[2.5] ~ "(in µg/m³)")) +
  theme(plot.title = element_text(hjust = 0.5, size = 12),
        axis.title.x = element_text(size = 9),
        axis.title.y = element_text(size = 9),
        legend.title = element_text(size = 8),
        legend.background = element_rect(color = "black")) +
  viridis::scale_fill_viridis(option = "magma", direction = -1, discrete = TRUE) +
  scale_y_continuous(breaks = seq(0, 3000, 500))


# doing a similar thing with the epi dataset to get the total number of studies in different pollution buckets.
epi_with_pol_buckets <- epi %>%
  mutate(pol_buckets = ifelse(((mean_pm2.5 >= 0) & (mean_pm2.5 <= 5)), "0 - 5", NA),
         pol_buckets = ifelse(((mean_pm2.5 > 5) & (mean_pm2.5 <= 10)), ">5 - 10", pol_buckets),
         pol_buckets = ifelse(((mean_pm2.5 > 10) & (mean_pm2.5 <= 20)), ">10 - 20", pol_buckets),
         pol_buckets = ifelse(((mean_pm2.5 > 20) & (mean_pm2.5 <= 40)), ">20 - 40", pol_buckets),
         pol_buckets = ifelse(((mean_pm2.5 > 40) & (mean_pm2.5 <= 80)), ">40 - 80", pol_buckets),
         pol_buckets = ifelse(((mean_pm2.5 > 80)), ">80", pol_buckets)) %>%
  mutate(pol_bucket_order = ifelse(pol_buckets == "0 - 5", 1, NA),
         pol_bucket_order = ifelse(pol_buckets == ">5 - 10", 2, pol_bucket_order),
         pol_bucket_order = ifelse(pol_buckets == ">10 - 20", 3, pol_bucket_order),
         pol_bucket_order = ifelse(pol_buckets == ">20 - 40", 4, pol_bucket_order),
         pol_bucket_order = ifelse(pol_buckets == ">40 - 80", 5, pol_bucket_order),
         pol_bucket_order = ifelse(pol_buckets == ">80", 6, pol_bucket_order))



epi_with_pol_buckets$pol_buckets <- as.factor(epi_with_pol_buckets$pol_buckets)
epi_with_pol_buckets$pol_bucket_order <- as.factor(epi_with_pol_buckets$pol_bucket_order)

foo1 <- epi_with_pol_buckets %>%
  group_by(pol_buckets) %>%
  summarise(total_studies = n()) %>%
  ungroup() %>%
  mutate(pol_bucket_order = ifelse(pol_buckets == "0 - 5", 1, NA),
         pol_bucket_order = ifelse(pol_buckets == ">5 - 10", 2, pol_bucket_order),
         pol_bucket_order = ifelse(pol_buckets == ">10 - 20", 3, pol_bucket_order),
         pol_bucket_order = ifelse(pol_buckets == ">20 - 40", 4, pol_bucket_order),
         pol_bucket_order = ifelse(pol_buckets == ">40 - 80", 5, pol_bucket_order),
         pol_bucket_order = ifelse(pol_buckets == ">80", 6, pol_bucket_order)) %>%
  filter(!is.na(pol_buckets)) %>%
  add_row(pol_bucket_order = 1, pol_buckets = "0 - 5", total_studies = 0) %>%
  add_row(pol_bucket_order = 6, pol_buckets = ">80", total_studies = 0) %>%
  ggplot() +
  geom_col(mapping = aes(x = forcats::fct_reorder(pol_buckets, pol_bucket_order), y = total_studies, fill = forcats::fct_reorder(pol_buckets, pol_bucket_order)), width = 0.2) +
  ggthemes::theme_hc() +
  labs(x =  "", y = "Total number of epi studies",
       title = "", fill = expression("Annual Average" ~ PM[2.5] ~ "(in µg/m³)")) +
  theme(plot.title = element_text(hjust = 0.5, size = 12),
        axis.title.x = element_text(size = 9),
        axis.title.y = element_text(size = 9),
        legend.title = element_text(size = 8),
        legend.background = element_rect(color = "black")) +
  viridis::scale_fill_viridis(option = "magma", direction = -1, discrete = TRUE) +
  theme(legend.position = "none")



gridExtra::grid.arrange(foo1, foo)

#> Template for adding a red line that goes over all panels in the continent wise panel plot-----------

# convert the contnent wise plot into a grob
# plt_cont_grob <- ggplotGrob(plt_cont)
# 
# # get the panel grobs
# panels <- getGrob(plt_cont_grob, "panel", grep = TRUE)
# 
# # add a horizontal line at y = 0 on top of each panel
# for (i in 1:6) {
#   vp <- viewport(layout.pos.row = i, layout.pos.col = 1)
#   pushViewport(vp)
#   grid.lines(x = c(unit(0.1, "npc"), unit(1, "npc")),
#              y = unit(0.82, "npc"),
#              gp = gpar(col = "darkred", lwd = 1),
#              arrow = arrow(length = unit(0.15, "in"), type = "closed", ends = "last",
#                            angle = 30))
#   popViewport()
# }


#-------------------------------------

#> Countries withing continent wise plot of studies in form of a bar graphs (with countries not labelled)-------

# generate plot dataset
plot_data <- epi %>%
  group_by(continent, country) %>%
  summarise(count_studies = n(), .groups = "keep")


# plot the countries within continent plot
plot_data %>%
  ggplot() +
  geom_col(mapping = aes(x = continent, y = count_studies, fill = continent), position = position_dodge2(preserve = "single"))


#> Treemaps of the number of studies that have happened in each continent by country (used plot_data object from above)-----------------

plt1 <- ggplot(plot_data %>% filter(continent == "Asia"), aes(area = count_studies, fill = country, label = count_studies)) +
  geom_treemap() +
  viridis::scale_fill_viridis(discrete = TRUE) +
  geom_treemap_text(color = "white") +
  labs(title = "Asia (24 times was part of a epi study)", fill = "Country") +
  theme(legend.position = "bottom",
        legend.background = element_rect(color = "black"),
        plot.title = element_text(hjust = 0.5),
        plot.background = element_rect(fill = "white"))

plt2 <- ggplot(plot_data %>% filter(continent == "Africa"), aes(area = count_studies, fill = country, label = count_studies)) +
  geom_treemap() +
  viridis::scale_fill_viridis(discrete = TRUE) +
  geom_treemap_text(color = "white") +
  labs(title = "Africa (0)", fill = "Country") +
  theme(legend.position = "bottom",
        legend.background = element_rect(color = "black"),
        plot.title = element_text(hjust = 0.5),
        plot.background = element_rect(fill = "white"))

plt3 <- ggplot(plot_data %>% filter(continent == "South America"), aes(area = count_studies, fill = country, label = count_studies)) +
  geom_treemap() +
  viridis::scale_fill_viridis(discrete = TRUE) +
  geom_treemap_text(color = "white") +
  labs(title = "South America (0)", fill = "Country") +
  theme(legend.position = "bottom",
        legend.background = element_rect(color = "black"),
        plot.title = element_text(hjust = 0.5),
        plot.background = element_rect(fill = "white"))

plt4 <- ggplot(plot_data %>% filter(continent == "Europe"), aes(area = count_studies, fill = country, label = count_studies)) +
  geom_treemap() +
  viridis::scale_fill_viridis(discrete = TRUE) +
  geom_treemap_text(color = "white") +
  labs(title = "Europe (15)", fill = "Country") +
  theme(legend.position = "bottom",
        legend.background = element_rect(color = "black"),
        plot.title = element_text(hjust = 0.5),
        plot.background = element_rect(fill = "white"))

plt5 <- ggplot(plot_data %>% filter(continent == "North America"), aes(area = count_studies, fill = country, label = count_studies)) +
  geom_treemap() +
  viridis::scale_fill_viridis(discrete = TRUE) +
  geom_treemap_text(color = "white") +
  labs(title = "North America (49)", fill = "Country") +
  theme(legend.position = "bottom",
        legend.background = element_rect(color = "black"),
        plot.title = element_text(hjust = 0.5),
        plot.background = element_rect(fill = "white"))


plt6 <- ggplot(plot_data %>% filter(continent == "Oceania"), aes(area = count_studies, fill = country, label = count_studies)) +
  geom_treemap() +
  viridis::scale_fill_viridis(discrete = TRUE) +
  geom_treemap_text(color = "white") +
  labs(title = "Oceania (0)", fill = "Country") +
  theme(legend.position = "bottom",
        legend.background = element_rect(color = "black"),
        plot.title = element_text(hjust = 0.5),
        plot.background = element_rect(fill = "white"))

plt_final <- gridExtra::grid.arrange(plt1, plt2, plt3, plt4, plt5, plt6, nrow = 1, top = "Number of times a country has been included in an epi study (Continent Wise)")

plt_final + ggtitle("foo")


#> Population pollution figure (v3): trying a continuous but binned scale and adding Ken's figure----------

# population v/s pollution 
foo <- aqli_color %>%
  group_by(pm2021) %>%
  summarise(count_pm2021 = sum(population, na.rm = TRUE),
            count_pm2021_millions = round(count_pm2021/1000000, 1)) %>%
  ggplot() +
 geom_hline(yintercept = c(300, 600, 900, 1200), linetype = "dashed", color = "lavenderblush2") +
  geom_histogram(mapping = aes(x = pm2021, weight = count_pm2021_millions), fill = "#8FA1AD", color = "white", alpha = 0.7, width = 0.5) + 
  ggthemes::theme_hc() +
  labs(x = expression("Annual Average" ~ PM[2.5] ~ "(in µg/m³)"),
       y = "Population (in millions)") +
  scale_y_continuous(breaks = seq(0, 1500, 300)) +
  scale_x_continuous(breaks = seq(0, 140, 15), limits = c(0, 140)) +
  theme(axis.line = element_line(),
        panel.grid.major.y = element_blank(),
        axis.title = element_text(size = 11)) 

foo_v2 <- aqli_color %>%
  group_by(pm2021) %>%
  summarise(count_pm2021 = sum(population, na.rm = TRUE),
            count_pm2021_millions = round(count_pm2021/1000000, 1)) %>%
  ggplot() +
 geom_hline(yintercept = c(500, 1000, 1500, 2000, 2500), linetype = "dashed", color = "lavenderblush2") +
  geom_col(mapping = aes(x = pm2021, y = count_pm2021_millions), fill = "#8FA1AD", alpha = 0.7, width = 0.9) + 
  ggthemes::theme_hc() +
  labs(x = expression("Annual Average" ~ PM[2.5] ~ "(in µg/m³)"),
       y = "Population (in millions)") +
  scale_x_binned(breaks = seq(0, 140, 15), limits = c(0, 140)) +
  scale_y_continuous(breaks = seq(0, 2500, 500)) +
  theme(axis.line = element_line(),
        panel.grid.major.y = element_blank(),
           axis.text.y = element_text(size = 13),
        axis.text.x = element_text(size = 13),
        axis.title = element_text(size = 15)) 


# pollution v/s number of studies
foo1 <- epi %>%
  group_by(mean_pm2.5) %>%
  summarise(count_mean_pm2.5 = n()) %>%
  ggplot() +
     geom_hline(yintercept = c(5, 10, 15, 20), linetype = "dashed", color = "lavenderblush2") +
  geom_histogram(mapping = aes(x = mean_pm2.5, weight = count_mean_pm2.5), fill = "#343434", color = "white", alpha = 0.7, width = 0.5) +  ggthemes::theme_hc() +
  labs(x = "",
       y = "Number of epi papers") +
  scale_y_continuous(breaks = seq(0, 30, 5)) +
  scale_x_continuous(breaks = seq(0, 140, 15), limits = c(0, 140)) +
  theme(axis.line = element_line(),
        panel.grid.major.y = element_blank(),
        axis.title = element_text(size = 11))


foo1_v2 <-  epi %>%
  group_by(mean_pm2.5) %>%
  summarise(count_mean_pm2.5 = n()) %>%
  ggplot() +
     geom_hline(yintercept = c(10, 20, 30, 40), linetype = "dashed", color = "lavenderblush2") +
  geom_col(mapping = aes(x = mean_pm2.5, y = count_mean_pm2.5), fill = "#343434", alpha = 0.7, width = 0.9) +  ggthemes::theme_hc() +
  labs(x = "",
       y = "Number of epi papers") +
  scale_y_continuous(breaks = seq(0, 40, 10)) +
  scale_x_binned(breaks = seq(0, 140, 15), limits = c(0, 140)) +
  theme(axis.line = element_line(),
        panel.grid.major.y = element_blank(), 
           axis.text.y = element_text(size = 13),
        axis.text.x = element_text(size = 13),
        axis.title = element_text(size = 15),)


gridExtra::grid.arrange(foo1, foo)

## Ken's figure

# get 4 datasets corresponding to gemm/ier/aqli/meta and for each keep concentrations till 120 micrograms per cubic meter and keep only relevant columns. and so some basic cleaning
gemm <- haven::read_dta("./data-raw/pm2.5_distribution/gemm.dta")
gemm <- gemm %>%
  filter(pm_gemm <= 120) %>%
  select(pm_gemm, gemm_lyl) %>%
  mutate(method = "GEMM") %>%
  rename(pm_val = pm_gemm, 
         lyl_val = gemm_lyl)

# stupid adjustment
gemm[1:3, 2] <- c(-0.49, -0.392, -0.294)

ier2019 <- haven::read_dta("./data-raw/pm2.5_distribution/ier_2019.dta")
ier2019 <- ier2019 %>% 
  filter(pm_ier_2019 <= 120) %>%
  select(pm_ier_2019, ier_2019_lyl) %>%
  mutate(method = "IER-2019") %>%
  rename(pm_val = pm_ier_2019, 
         lyl_val = ier_2019_lyl)

meta <- haven::read_dta("./data-raw/pm2.5_distribution/meta.dta")
meta <- meta %>% 
  filter(pm_meta <= 120) %>%
  select(pm_meta, meta_lyl) %>%
    mutate(method = "META") %>%
  rename(pm_val = pm_meta, 
         lyl_val = meta_lyl)

aqli <- tibble(pm_val = 0:120, lyl_val = (pm_val - who_pm2.5_guideline)*0.098, method = "AQLI")
#   mutate(lyl_val = ifelse(lyl_val < 0, 0, lyl_val))

## calculating empirical versions of above methods

# Approximating a derivative function of gemm
gemm_empirical <- splinefun(x  = gemm$pm_val, 
                                    y = gemm$lyl_val)

# Approximating a derivative function of gemm
aqli_empirical <- splinefun(x  = aqli$pm_val, 
                                    y = aqli$lyl_val)

# meta empirical
meta_empirical <- splinefun(x  = meta$pm_val, 
                                    y = meta$lyl_val)

# ier 2019 empirical

ier2019_empirical <- splinefun(x  = ier2019$pm_val, 
                                    y = ier2019$lyl_val)

## plot the 4 curves (empirical version)

foo3 <- ggplot(data = tibble(x = 1:120), mapping = aes(x = x)) +
      #geom_hline(yintercept = seq(0, 0.175, 0.05), linetype = "dashed", color = "lavenderblush2") +
  geom_function(fun = gemm_empirical, size = 1.2, args = list(deriv = 1), mapping = aes(color = "#5B96AD")) + 
  geom_function(fun = aqli_empirical, size = 1.2, args = list(deriv = 1), mapping = aes(color = "#8F3931")) +
   geom_function(fun = meta_empirical, size = 1.2, args = list(deriv = 1), mapping = aes(color = "#91AB5A")) +
  geom_function(fun = ier2019_empirical, size = 1.2, args = list(deriv = 1), mapping = aes(color = "#767676")) +
  scale_y_continuous(breaks = seq(0, 0.2, 0.05), limits = c(0, .2)) +
  scale_x_continuous(breaks = seq(0, 140, 15), limits = c(0, 140)) + 
  labs(x = "", y = "Marginal Life Expectancy Lost", color = "", 
       title = str_wrap("PM2.5 exposure in comparison with population and number of epi papers and alternative approaches to measure PM2.5 Life Expectancy relationship.", width = 90)) + 
  scale_color_identity(breaks = c("#8F3931", "#91AB5A", "#5B96AD", "#767676"),
                       labels = c("AQLI", "META", "GEMM", "IER-2019"),
                       guide = "legend") +
  ggthemes::theme_hc() %+replace%
  theme(legend.position = c(0.8, 0.9), 
        legend.direction = "horizontal",
        axis.line.x = element_line(), 
        axis.line.y = element_line(),
        plot.title = element_text(hjust =  0.5, size = 16, color = "black"), 
        axis.text.y = element_text(size = 13),
        axis.text.x = element_text(size = 13),
        axis.title = element_text(size = 15),
        legend.text = element_text(size = 10),
        plot.caption = element_text(face = "italic", hjust = 0), 
        legend.title = element_text(hjust = 0.5), 
        panel.grid.major.y = element_blank(), 
        legend.key.size = unit(0.5, "cm")) 

gridExtra::grid.arrange(foo3,foo1_v2, foo_v2, heights = c(1, 1, 1))

# Some stats for inputting China's positive story in the epi piece----------------------

# cumulative sum of all epi studies in China over the years
cum_sum_studies_chronological <- epi %>% 
  filter(country == "China") %>% 
  arrange(publishing_year) %>%
  count(country, publishing_year) %>% 
  pull(n) %>% 
  cumsum() %>%
  as_tibble() 

# naming the column for cumulative studies
colnames(cum_sum_studies_chronological) <- "cumulative_sum_studies"

# adding the cumulative study column to the summary
china_studies_year_summary <- cbind(epi %>% filter(country == "China") %>% arrange(publishing_year) %>% count(country, publishing_year), cum_sum_studies_chronological)

# adding pre-2010 (up till 1998) studies (total = 0) to the above tibble
china_studies_pre2010 <- tibble(country = "China", publishing_year = 1998:2009, n = 0, cumulative_sum_studies = 0)

china_studies_year_summary <- china_studies_year_summary %>%
  bind_rows(china_studies_pre2010)

foo <- china_studies_year_summary %>% 
  pivot_longer(n:cumulative_sum_studies, names_to = "sum_type", values_to = "value") %>%
  mutate(sum_type = ifelse(sum_type == "n", "Number of studies", "Cumulative sum of studies")) %>%
  ggplot() +
  geom_line(mapping = aes(x = publishing_year, y = value, color = sum_type), lwd = 1) +
  scale_x_continuous(breaks = c(seq(1998, 2020, 2), 2021, 2023), limits = c(1998, 2023)) +
  scale_y_continuous(breaks = seq(0, 20, 2)) +
  labs(x = "Publishing Year", y = "Studies Count", 
       title = "Epi air pollution studies published in China, 1998 to 2023", 
       color = "", 
       caption = "* Sources: (a) Does the Squeaky Wheel Get More Grease? The Direct and Indirect Effects of Citizen Participation on Environmental Governance in China, Buntaine et al. (b) AQLI epi meta analysis.") +
  ggthemes::theme_tufte() +
  theme(plot.title = element_text(size = 22, hjust = 0.5, margin = margin(b = 0.8, unit = "cm")),
        plot.subtitle = element_text(size = 13, hjust = 0.5, margin = margin(b = 0.5, unit = "cm")),
        axis.title.x = element_text(size = 15, margin = margin(t = 0.7, b = 1, unit = "cm")), 
        axis.title.y = element_text(size = 15, margin = margin(r = 0.7, unit = "cm")), 
        axis.ticks.y = element_blank(), 
        axis.text = element_text(size = 14), 
        legend.position = "bottom", 
        legend.box.background = element_rect(color = "black"),
        legend.text = element_text(size = 15),
        plot.caption = element_text(size = 9.5, hjust = 0, face = "italic", margin = margin(t = 1, b = 2, unit = "cm"))) + 
   annotate("segment", x = 2013, xend = 2013, y = 0, yend = 20, linetype = "dashed") +
   annotate("segment", x = 2014, xend = 2014, y = 0, yend = 20, linetype = "dashed") +
   annotate("segment", x = 2017, xend = 2017, y = 0, yend = 20, linetype = "dashed") +
    annotate("segment", x = 2017, xend = 2017, y = 0, yend = 20, linetype = "dashed") +
  annotate("point", x = 2023, y = 16, label = "2023", size = 3) +
  annotate("point", x = 2023, y = 1, label = "2023", size = 3, color = "yellow") +
  viridis::scale_color_viridis(discrete = TRUE)
# annotate("text", x = 2022, y = 45, label = "Satellite data not available") +
#   annotate("rect", xmin = 2021, xmax = 2023, ymin = 60, ymax = 60)


# China's pollution over time 1998 to 2021
foo1 <- aqli_color %>%
  filter(country == "China") %>% 
  group_by(country) %>%
  mutate(pop_weights = population/sum(population, na.rm = TRUE)) %>%
   dplyr::mutate(across(dplyr::starts_with("pm"), ~(.x*pop_weights), .names = "{col}_pop_weighted")) %>%
    dplyr::summarise(across(dplyr::contains("pop_weighted"), ~(round(sum(.x, na.rm = TRUE), 2)), .names = "avg_{col}"),
                     total_population = sum(population, na.rm = TRUE), objectid_color = objectid_color[1], whostandard = whostandard[1], .groups = "keep") %>%
  dplyr::rename_with(~str_replace_all(.x, "(_pop_weighted)|(avg_)", ""), dplyr::contains("_pop_weighted")) %>%
  pivot_longer(pm1998:pm2021, names_to = "year", values_to = "pm2.5_value") %>%
  mutate(year = as.numeric(str_remove(year, "pm"))) %>%
  select(country, year, pm2.5_value) %>%
   ggplot2::ggplot() +
      ggplot2::geom_line(mapping = ggplot2::aes(x = year, y = pm2.5_value), lwd = 1.1,
                color = "darkred") +
  geom_hline(mapping = aes(yintercept = 35), linetype = "dashed") +
      ggplot2::scale_x_continuous(breaks = c(seq(1998, 2020, 2), 2021, 2023), limits = c(1998, 2023)) +
      scale_y_continuous(breaks = seq(0, 60, 10), limits = c(0, 60)) +
      ggthemes::theme_tufte() +
   theme(plot.title = element_text(size = 22, hjust = 0.5, margin = margin(b = 2.5, unit = "cm")),
        plot.subtitle = element_text(size = 13, hjust = 0.5, margin = margin(b = 0.5, unit = "cm")),
        axis.title.x = element_text(size = 15, margin = margin(t = 0.7, b = 0.5, unit = "cm")), 
        axis.title.y = element_text(size = 15, margin = margin(r = 0.7, unit = "cm")), 
        axis.ticks.y = element_blank(), 
        axis.text = element_text(size = 14), 
        legend.position = "bottom", 
        legend.text = element_text(size = 15),
        legend.box.background = element_rect(color = "black"),
        plot.caption = element_text(size = 9.5, hjust = 0, face = "italic", margin = margin(t = 0.8, b = 1.5, unit = "cm"))) +
      ggplot2::labs(x = "Years",
           y = "Annual Average PM2.5 concentration (in µg/m³)", 
           title = expression(PM[2.5] ~ "pollution in China, 1998 to 2021")) +
  annotate("text", x = 2018, y = 58, label = "War on pollution begins", size = 5) +
   annotate("text", x = 2023.5, y = 15, label = "2023", size = 3) +
   annotate("text", x = 2027, y = 27, label = "", size = 3) +
   annotate("text", x = 2009.2, y = 20, label = str_wrap("Prefectural EPAs publicize CEMS data", width = 20), size = 5) +
  annotate("segment", x = 2011, xend = 2013, y = 18, yend = 18) +
   annotate("segment", x = 2014, xend = 2016, y = 58, yend = 58) +
   annotate("segment", x = 2014, xend = 2014, y = 0, yend = 58, linetype = "dashed") +
   annotate("segment", x = 2013, xend = 2013, y = 0, yend = 18, linetype = "dashed") +
   annotate("text", x = 2021, y = 45, label = str_wrap("All prefectural cities operating official Weibo and WeChat accounts", width = 30), size = 5) +
   annotate("segment", x = 2017, xend = 2017, y = 0, yend = 45, linetype = "dashed") +
     annotate("text", x = 2006, y = 37, label = expression("National" ~ PM[2.5] ~ "standard: 35 µg/m³"), size = 4) +
   annotate("segment", x = 2017, xend = 2018.5, y = 45, yend = 45) 


plt <- gridExtra::grid.arrange(foo1, foo)


aqli-epic/epi.meta.analysis documentation built on July 2, 2023, 4:18 p.m.