In aqli-epic/epi.meta.analysis: An Amazing Shiny App

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.path = "man/figures/README-",
  out.width = "100%"
)

# load libraries

library(readr)
library(dplyr)
library(stringr)
library(magrittr)
library(ggplot2)
library(readr)
library(tidytext)
library(tidyr)
library(tidyverse)
library(ggthemes)
library(sf)
library(rnaturalearth)
library(rnaturalearthdata)
library(mapsf)
library(colorspace)
library(shiny)
library(shinydashboard)
library(DT)
library(treemapify)

# global variables
who_pm2.5_guideline <- 5

# global operations
`%notin%` <- Negate(`%in%`)

# epi studies analysis raw sheet
epi <- readxl::read_xlsx("./data-raw/pm2.5_distribution/AQLI_Epidemiology Literature Research.xlsx", sheet = "AnalysisDatasetPM2.5MortalityAn")


#> change default columns types
epi$cohort_size <- as.numeric(epi$cohort_size)
epi$study_start_year<- as.numeric(epi$study_start_year)
epi$study_end_year <- as.numeric(epi$study_end_year)
epi$pm2.5_exposure_ll <- as.numeric(epi$pm2.5_exposure_ll)
epi$pm2.5_exposure_ul <- as.numeric(epi$pm2.5_exposure_ul)
epi$mean_pm2.5 <- as.numeric(epi$mean_pm2.5)
epi$sd_pm2.5 <- as.numeric(epi$sd_pm2.5)
epi$cohort_age_ll <- as.numeric(epi$cohort_age_ll)
epi$cohort_age_ul <- as.numeric(epi$cohort_age_ul)

# AQLI color file and gadm0 file
aqli_color <- read_csv("./data-raw/pm2.5_distribution/[june2023]gadm2_aqli_2021_post_waterbody_adj_finalized_internal.csv")
gadm0_aqli_2021 <- read_csv("./data-raw/pm2.5_distribution/[june2023]gadm0_aqli_2021_post_waterbody_adj_finalized_internal.csv")


# aqli_color <- aqli_color %>%
#   dplyr::rename_with(~str_replace(.x, "who", "llpp_who_"), dplyr::contains("who")) %>%
#   dplyr::rename_with(~str_replace(.x, "nat", "llpp_nat_"), dplyr::contains("nat")) %>%
#   dplyr::rename(country = name0) %>%
#   dplyr::mutate(whostandard = who_pm2.5_guideline) %>%
#   dplyr::select(objectid_gadm2:population, whostandard, everything())

aqli_color <- aqli_color %>%
  dplyr::mutate(objectid_color = objectid_gadm2)

# getting a country continent file
country_continent <- read_csv("./data-raw/pm2.5_distribution/country_continent.csv")

# adding a continent column to the color file
aqli_color <- aqli_color %>%
  left_join(country_continent, by = "country") %>%
  select(objectid_color, continent, country, everything())

gadm0_aqli_2021 <- gadm0_aqli_2021 %>%
  left_join(country_continent, by = "country")

#> Filling in missing continents-----------------------------------------------

# countries for which continent is NA: for these fill in the continent manually
countries_with_missing_continent <- gadm0_aqli_2021 %>% filter(is.na(continent)) %>% pull(country) %>% unique()

# continent fill in for missing coutries
continents_for_missing_countries <- c("Africa", "Europe", "North America", "Asia", "Asia",
                                      "Oceania", "Africa", "Africa", "Africa", "Asia")

# [CAUTION: perform a sanity check on the above 2 vectors and how they map countries to continents before proceeding]
#creating a data frame using the above 2 vectors as columns
missing_continents_df <- tibble(country = countries_with_missing_continent,
                                continent = continents_for_missing_countries)


# adding in the missing continent information in the gadmx_aqli_2021 datasets
aqli_color <- aqli_color %>%
  left_join(missing_continents_df, by = "country") %>%
  mutate(continent = ifelse(is.na(continent.x), continent.y, continent.x))


#> setting report parameters
min_sample_size <- 1000
min_study_duration_in_years <- 1

#> Filtering the epi database (minimum cohort size: >1000, minimum study duration: > 1 year): commented for now.
epi <- epi %>%
  mutate(study_duration = (study_end_year - study_start_year) + 1) %>%
  filter(cohort_size >= min_sample_size, study_duration >= min_study_duration_in_years, !is.na(cohort_uid)) %>%
  distinct(cohort_uid, .keep_all = TRUE)

# creating this temp dataset for the  continent panel dataset
epi_tmp <- epi %>%
  mutate(continent = ifelse(country == "China", "China", continent),
         continent = ifelse(continent == "Asia", "Asia - China", continent))

# continent list (specifucally for the continent panel graph, that contains the Asia and Asia -  China panels, and considers them as continents in a
# separate epi_tmp dataset, that is specially created for this graph)
continent_list <- c("China", "Asia - China", "Africa", "North America", "South America", "Europe", "Oceania")

# missing continents
missing_continents_logical <- continent_list %notin% unique(epi_tmp$continent)
missing_continents <- continent_list[missing_continents_logical]

#> Calculations needed for all sections above the "Results section".

# percent of population not in compliance with the WHO guideline
num_people_above_who <- aqli_color %>%
  filter(pm2020 > who_pm2.5_guideline) %>%
  summarise(tot_pop = sum(population, na.rm = TRUE)) %>%
  unlist()

# world population
world_pop <- aqli_color %>%
  summarise(tot_pop = sum(population, na.rm = TRUE)) %>%
  unlist()

percent_people_above_who <- round((num_people_above_who/world_pop)*100, 1)

# pop pollution graph with 2 buckets: function.

pop_pol_graph_2_buckets <- function(aqli_color, epi, thresh_ll, thresh_ul, pm2.5_col_name){

  if(thresh_ll > thresh_ul){
    stop("Lower Limit should be less than or equal to the Upper limit! Please try again!")
  }

  # aqli color population in ordered pm2.5 buckets
  aqli_color_grp_pm2.5_buckets <- aqli_color %>%
    dplyr::filter(!is.na(!!as.symbol(pm2.5_col_name))) %>%
    dplyr::mutate(region = ifelse((!!as.symbol(pm2.5_col_name) >= thresh_ll) & (!!as.symbol(pm2.5_col_name) <= thresh_ul), stringr::str_c(thresh_ll, "-", thresh_ul), !!as.symbol(pm2.5_col_name)),
           region = ifelse(!!as.symbol(pm2.5_col_name) > thresh_ul, stringr::str_c(">", thresh_ul), region),
           region = ifelse(!!as.symbol(pm2.5_col_name) < thresh_ll, stringr::str_c("<", thresh_ll), region)) %>%
    dplyr::group_by(region) %>%
    dplyr::summarise(tot_pop = sum(population, na.rm = TRUE)) %>%
    dplyr::ungroup() %>%
    dplyr::mutate(order_pollution_group = ifelse(region == stringr::str_c("<", thresh_ll), 1, 0),
           order_pollution_group = ifelse(region == stringr::str_c(thresh_ll, "-", thresh_ul), 2, order_pollution_group),
           order_pollution_group = ifelse(region == stringr::str_c(">", thresh_ul), 3, order_pollution_group)) %>%
    dplyr::ungroup() %>%
    dplyr::mutate(tot_pop_prop = (tot_pop/sum(tot_pop))*100)


  # epi total number of studies in ordered pollution buckets (initially filtering out pooled studies, will add back in using a for loop)
  epi_num_studies_grp_pm2.5_buckets <- epi %>%
    dplyr::filter(!is.na(mean_pm2.5), mean_pm2.5 != "NA", non_pm2.5 == 0) %>%
    dplyr::group_by(paper_uid) %>%
    dplyr::summarise(mean_pm2.5 = mean(mean_pm2.5, na.rm = TRUE)) %>%
    dplyr::mutate(region = ifelse(mean_pm2.5 >= thresh_ll & mean_pm2.5 <= thresh_ul, stringr::str_c(thresh_ll, "-", thresh_ul), mean_pm2.5),
           region = ifelse(mean_pm2.5 > thresh_ul, stringr::str_c(">", thresh_ul), region),
           region = ifelse(mean_pm2.5 < thresh_ll, stringr::str_c("<", thresh_ll), region)) %>%
    dplyr::mutate(order_pollution_group = ifelse(region == stringr::str_c("<", thresh_ll), 1, 0),
           order_pollution_group = ifelse(region == stringr::str_c(thresh_ll, "-", thresh_ul), 2, order_pollution_group),
           order_pollution_group = ifelse(region == stringr::str_c(">", thresh_ul), 3, order_pollution_group)) %>%
    dplyr::group_by(region) %>%
    dplyr::summarise(tot_studies = dplyr::n(), order_pollution_group = order_pollution_group[1]) %>%
    dplyr::ungroup() %>%
    dplyr::mutate(tot_studies_prop = (tot_studies/sum(tot_studies))*100)

  #> bar chart version

  # creating a master dataset for both population and studies data
  pop_epi_studies_data <- aqli_color_grp_pm2.5_buckets %>%
    dplyr::left_join(epi_num_studies_grp_pm2.5_buckets, by = c("region", "order_pollution_group")) %>%
    dplyr::select(region,  order_pollution_group, tot_pop_prop, tot_studies_prop) %>%
    tidyr::pivot_longer(cols = tot_pop_prop:tot_studies_prop, names_to = "type_of_prop", values_to = "val") %>%
    dplyr::mutate(type_of_prop = ifelse(type_of_prop == "tot_pop_prop", "Percent Population in PM₂.₅ bucket", "Percent Studies in PM₂.₅ bucket"))

pop_num_studies_in_pollution_buckets_graph <- pop_epi_studies_data %>%
  ggplot2::ggplot() +
  ggplot2::geom_col(mapping = aes(x = forcats::fct_reorder(region, order_pollution_group), y = val, fill = type_of_prop), position = position_dodge(), width = 0.3) +
  ggplot2::scale_y_continuous(breaks = seq(0, 100, 10)) +
  ggplot2::scale_fill_manual(values = c("tot_pop_prop" = "grey", "tot_studies_prop" = "cornflowerblue"), labels = c("Proportion of World Population in Bucket", "Proportion of Studies Completed in Bucket")) +
  ggplot2::labs(x = expression("Mean" ~ PM[2.5] ~ "bucket (in µg/m³)"),  y = "Percentage", fill = "",
                title = expression("Percentage of Global population in different" ~ PM[2.5] ~ "buckets"),
       caption = stringr::str_wrap("*This graph 'only' takes into account the PM2.5 specific studies. For multi-country (pooled) studies, it averages the mean PM2.5 values, across all countries."), width = 10) +
 # ggplot2::geom_text(mapping = aes(x = forcats::fct_reorder(region, order_pollution_group), y = val, label = paste0(round(val, 1), "%")), position = ggplot2::position_dodge2(width = 1, preserve = "single"), vjust = -0.5, hjust = -0.2, size = 3) +
ggthemes::theme_tufte() +
  ggplot2::theme(axis.line.y = element_line(color = "black"),
        axis.line.x = element_line(color = "black"),
        plot.caption = element_text(size = 8, hjust = 0, margin = margin(t = 0.7, unit = "cm")),
        plot.title = element_text(size = 16, margin = margin(b = 1, unit = "cm"), hjust = 0.5),
        plot.caption.position = "plot",
        axis.title.y = element_text(margin = margin(r = 0.5, unit = "cm"), size = 14),
        axis.title.x = element_text(margin = margin(t = 0.5, b = 0.5, unit = "cm"), size  = 14),
        axis.text = element_text(size = 12),
        legend.box.background = element_rect(color = "black"),
        legend.position = "bottom",
        legend.text = element_text(size = 12)) +
  viridis::scale_fill_viridis(discrete = TRUE)

return(pop_num_studies_in_pollution_buckets_graph)



}

Epidemiological Studies: Meta Analysis

This analysis is our ongoing attempt to capture all epidemiological research studies (>r min_sample_size people; max sample size: r round(max(epi$cohort_size, na.rm = TRUE)/1000000, 1) million), that are relatively long term (> r min_study_duration_in_years year(s), max study duration: r max(epi$study_duration, na.rm =TRUE) years), with unique cohorts (by unique we mean that even if a given cohort is studied by different groups over different years/different sample size, it'll be counted as separate cohorts, instead of 1) and measure the impact of ambient PM~2.5~, PM~10~, TSP, Ultra Fine Particulate Matter on Mortality (cause-specific, all cause, premature)/Life Expectancy published between r min(epi$publishing_year) and present*, findable in available peer-reviewed literature. Hereafter, we refer to these studies as "AQ epi studies" for short. As of now, this meta-analysis analyzes r epi %>% distinct(paper_uid, .keep_all = TRUE) %>% nrow() AQ epi studies. This analysis will be continually updated to incorporate new AQ epi studies.

We are seeking to make this analysis as current, complete and error-free as possible and view it as a continual work in progress. We would appreciate the air quality community’s comments, corrections, and suggestions. Please contact aqli-epic@uchicago.edu or leave a comment in this GitHub repository/directly leave comments in the analysis dataset (more on this below).

Major Takeaways

53.57 percent of all AQ epi studies (45 studies) included in this analysis included populations in either the US or Canada or both.
Of the total number of times a given continent’s population has been included in any given AQ epi study, North America dominates the rest of the continents and has been included 53.57 percent of times (all due to studies in the US or Canada). Closely following North America, populations in Asia and Europe have been included in AQ epi studies 28.57 and 17.86 percent of times respectively. Africa, South America, Oceania have seen 0 long term (>= 1 year), large (>=1000 people in the sample) AQ epi study.
There have been no large (>= 1 year) or long (>=1000 people in the sample) PM epi studies that were performed in Africa, South America, Oceania, with a combined population of 1.84 billion people , 23.8 percent of the Earth’s total human population.
In total, 7 “really” long term studies (> 25 years) have been performed. That is, 8.33 percent of the total number of studies.
All of the following continents have seen at least 1 really long term study: North America, Europe.

Purpose

The purpose of this analysis is to understand the landscape of epidemiological research on the relationship between PM~2.5~, PM~10~, TSP, Ultrafine Particulate Matter and Mortality (all types, as specified above)/Life Expectancy and to surface demographic, geographic, or other trends that may exist in the current state of literature. While the overall arc of the relationship between these pollutants and human health is clear enough to take action, understanding such trends can help the field reflect on itself, take stock of any biases or gaps – and point toward future research and policy opportunities.

Why do epidemiological studies on air pollution and mortality matter?

While global estimates of air pollution’s toll on public health vary, they all point in the same direction: air pollution poses one of the largest health risks on the planet to humans [Paper1, Paper2, Paper3, Paper4]. Epidemiological studies on air pollution and mortality help us understand the burden of air pollution on human health at global, national and regional levels. According to Vahlsing and Smith (2012), these sorts of studies can also help countries take policy action, pushing forward and shaping national-level ambient air quality standards.

The burden of air pollution across the world is also not uniform. While r percent_people_above_who percent of the world population is out of compliance with the latest WHO annual PM~2.5~ guideline of r who_pm2.5_guideline µg/m³, there is huge variation in the quality of air one breathes.

Results

PM~2.5~ concentration range and the Global Population Distribution

#> mean pm2.5 distribution segregated by country (density plot): makes sense to count different country studies separately, as we are making country wise distributions, so we want to see all PM2.5 ranges covered in a given country.
mean_pm2.5_country_wise_graph <- epi %>% 
  filter(!is.na(mean_pm2.5), mean_pm2.5 != "NA", non_pm2.5 == 0) %>%
  ggplot() +
  geom_density(mapping = aes(x = mean_pm2.5, fill = country), alpha = 0.5, color = "black", position = "identity") +
  labs(x = expression(paste("Mean PM2.5 concentration (", mu, "g", "/", m^3, ")")), 
       caption = str_wrap("*This graph 'only' takes into account the PM2.5 specific studies. For multi-country (pooled) studies, it averages the mean PM2.5 values, across all countries."), width = 10) +
  theme_hc() +
  theme(plot.caption = element_text(hjust = 0)) +
viridis::scale_fill_viridis(discrete = TRUE)

#> PM2.5 exposure range and population distribution

# aqli color population in ordered pm2.5 buckets
aqli_color_grp_pm2.5_buckets <- aqli_color %>%
  mutate(region = ifelse(pm2020 >= 0 & pm2020 <= 25, "0-25", pm2020), 
         region = ifelse(pm2020 > 25 & pm2020 <= 50, ">25-50", region), 
         region = ifelse(pm2020 > 50 & pm2020 <= 75, ">50-75", region), 
         region = ifelse(pm2020 > 75 & pm2020 <= 100, ">75-100", region), 
         region = ifelse(pm2020 > 100, ">100", region)) %>%
  group_by(region) %>%
  summarise(tot_pop = sum(population, na.rm = TRUE)) %>%
  ungroup() %>%
    mutate(order_pollution_group = ifelse(region == "0-25", 1, 0), 
         order_pollution_group = ifelse(region == ">25-50", 2, order_pollution_group), 
         order_pollution_group = ifelse(region == ">50-75", 3, order_pollution_group), 
         order_pollution_group = ifelse(region == ">75-100", 4, order_pollution_group), 
         order_pollution_group = ifelse(region == ">100", 5, order_pollution_group)) %>%
  ungroup() %>%
  mutate(tot_pop_prop = (tot_pop/sum(tot_pop))*100)


# epi total number of studies in ordered pollution buckets (initially filtering out pooled studies, will add back in using a for loop)
epi_num_studies_grp_pm2.5_buckets <- epi %>%
  filter(!is.na(mean_pm2.5), mean_pm2.5 != "NA", non_pm2.5 == 0) %>%
  group_by(paper_uid) %>%
  summarise(mean_pm2.5 = mean(mean_pm2.5, na.rm = TRUE)) %>%
    mutate(region = ifelse(mean_pm2.5 >= 0 & mean_pm2.5 <= 25, "0-25", mean_pm2.5), 
         region = ifelse(mean_pm2.5 > 25 & mean_pm2.5 <= 50, ">25-50", region), 
         region = ifelse(mean_pm2.5 > 50 & mean_pm2.5 <= 75, ">50-75", region), 
         region = ifelse(mean_pm2.5 > 75 & mean_pm2.5 <= 100, ">75-100", region), 
         region = ifelse(mean_pm2.5 > 100, ">100", region)) %>%
  mutate(order_pollution_group = ifelse(region == "0-25", 1, 0), 
         order_pollution_group = ifelse(region == ">25-50", 2, order_pollution_group), 
         order_pollution_group = ifelse(region == ">50-75", 3, order_pollution_group), 
         order_pollution_group = ifelse(region == ">75-100", 4, order_pollution_group), 
         order_pollution_group = ifelse(region == ">100", 5, order_pollution_group)) %>%
  group_by(region) %>%
  summarise(tot_studies = n(), order_pollution_group = order_pollution_group[1]) %>%
  ungroup() %>%
  mutate(tot_studies_prop = (tot_studies/sum(tot_studies))*100)

#> bar chart version

# creating a master dataset for both population and studies data
pop_epi_studies_data <- aqli_color_grp_pm2.5_buckets %>%
  left_join(epi_num_studies_grp_pm2.5_buckets, by = c("region", "order_pollution_group")) %>%
  select(region,  order_pollution_group, tot_pop_prop, tot_studies_prop) %>%
  pivot_longer(cols = tot_pop_prop:tot_studies_prop, names_to = "type_of_prop", values_to = "val")

# plotting the bar graph
pop_num_studies_in_pollution_buckets_graph <- pop_epi_studies_data %>%
  ggplot() +
  geom_col(mapping = aes(x = fct_reorder(region, order_pollution_group), y = val, fill = type_of_prop), position = position_dodge(), width = 0.7) +
  scale_y_continuous(breaks = seq(0, 100, 10)) +
  scale_fill_manual(values = c("tot_pop_prop" = "grey", "tot_studies_prop" = "cornflowerblue"), labels = c("Proportion of World Population in Bucket", "Proportion of Studies Completed in Bucket")) + 
  labs(x = "Mean PM2.5 bucket (in µg/m³)",  y = "Percentage", fill = "", 
       caption = str_wrap("*This graph 'only' takes into account the PM2.5 specific studies. For multi-country (pooled) studies, it averages the mean PM2.5 values, across all countries."), width = 10) +
  theme_hc() +
  theme(axis.line.y = element_line(color = "black"), 
        axis.line.x = element_line(color = "black"), 
        plot.caption = element_text(size = 8, hjust = 0), 
        plot.caption.position = "plot") +
  viridis::scale_fill_viridis(discrete = TRUE)


#> other calculations for this block of code

#> total pop where pollution is > 50 micrograms per cubic meter 

# thresholds
pm2.5_thresh_highly_polluted <- 50
pm2.5_thresh_severly_polluted <- 75
pm2.5_thresh_low_polluted <- 25

# number of PM2.5 specific studies
epi_pm2.5_spec_studies_total <- epi %>% 
  distinct(paper_uid, .keep_all = TRUE) %>%
  filter(non_pm2.5 == 0) %>%
  nrow()

# higher than hp

# total pop where pollution is > x micrograms per cubic meter (hp)
total_pop_above_x_hp_micr_gm <- aqli_color %>%
  filter(pm2020 > pm2.5_thresh_highly_polluted) %>%
  summarise(tot_pop = sum(population, na.rm = TRUE)) %>%
  unlist()

# percent pop where pollution is > x micrograms per cubic meter (hp)
perc_pop_above_x_hp_micr_gm <- round((total_pop_above_x_hp_micr_gm/world_pop)*100, 1)

# total pop where pollution is > x micrograms per cubic meter in millions (hp)
tot_pop_above_x_hp_micr_gm_millions <- round(total_pop_above_x_hp_micr_gm/1000000, 1)

# higher than sp

# total pop where pollution is > x micrograms per cubic meter (sp)
total_pop_above_x_sp_micr_gm <- aqli_color %>%
  filter(pm2020 > pm2.5_thresh_severly_polluted) %>%
  summarise(tot_pop = sum(population, na.rm = TRUE)) %>%
  unlist()

# percent pop where pollution is > x micrograms per cubic meter (sp)
perc_pop_above_x_sp_micr_gm <- round((total_pop_above_x_sp_micr_gm/world_pop)*100, 1)

# total pop where pollution is > x micrograms per cubic meter in millions (sp)
tot_pop_above_x_sp_micr_gm_millions <- round(total_pop_above_x_sp_micr_gm/1000000, 1)

# less than lp

# total pop where pollution is < x micrograms per cubic meter (lp)
total_pop_below_x_lp_micr_gm <- aqli_color %>%
  filter(pm2020 < pm2.5_thresh_low_polluted) %>%
  summarise(tot_pop = sum(population, na.rm = TRUE)) %>%
  unlist()

# percent pop where pollution is < x micrograms per cubic meter (lp)
perc_pop_below_x_lp_micr_gm <- round((total_pop_below_x_lp_micr_gm/world_pop)*100, 1)

# total pop where pollution is > x micrograms per cubic meter in millions (lp)
tot_pop_below_x_lp_micr_gm_millions <- round(total_pop_below_x_lp_micr_gm/1000000, 1)


#> percent studies in a given pollution bucket 


# higher than hp: 
epi_num_studies_in_hp <- epi %>%
  filter(!is.na(mean_pm2.5) , mean_pm2.5 != "NA", non_pm2.5 == 0) %>%
  group_by(paper_uid) %>%
  summarise(mean_pm2.5 = mean(mean_pm2.5, na.rm = TRUE)) %>%
  filter(mean_pm2.5 > pm2.5_thresh_highly_polluted) %>%
  nrow() %>%
  unlist()

perc_epi_studies_in_hp <- round((epi_num_studies_in_hp/nrow(epi %>% distinct(paper_uid, .keep_all = TRUE) %>% filter(non_pm2.5 == 0)))*100, 1)

# higher than sp: makes sense to count different countries in a pooled study as different studies.

epi_num_studies_in_sp <- epi %>%
  filter(!is.na(mean_pm2.5), mean_pm2.5 != "NA", non_pm2.5 == 0) %>%
    group_by(paper_uid) %>%
  summarise(mean_pm2.5 = mean(mean_pm2.5, na.rm = TRUE)) %>%
  filter(mean_pm2.5 > pm2.5_thresh_severly_polluted) %>%
  nrow() %>%
  unlist()

perc_epi_studies_in_sp <- round((epi_num_studies_in_sp/nrow(epi %>% distinct(paper_uid, .keep_all = TRUE) %>% filter(non_pm2.5 == 0)))*100, 1)

# less than lp: makes sense to count different countries in a pooled study as different studies.

epi_num_studies_in_lp <- epi %>%
  filter(!is.na(mean_pm2.5), mean_pm2.5 != "NA", non_pm2.5 == 0) %>%
  group_by(paper_uid) %>%
  summarise(mean_pm2.5 = mean(mean_pm2.5, na.rm = TRUE)) %>%
  filter(mean_pm2.5 < pm2.5_thresh_low_polluted) %>%
  nrow() %>%
  unlist()

perc_epi_studies_in_lp <- round((epi_num_studies_in_lp/nrow(epi %>% distinct(paper_uid, .keep_all = TRUE) %>% filter(non_pm2.5 == 0)))*100, 1)


# plot population and number of studies bar graph
pop_num_studies_in_pollution_buckets_graph

# plot mean pm2.5 country wise graph
mean_pm2.5_country_wise_graph

In total r length(unique(epi$paper_uid)) AQ epi studies were included in the final analysis dataset. Of these, r epi_pm2.5_spec_studies_total were PM~2.5~ specific. Others (r (length(unique(epi$paper_uid)) - epi_pm2.5_spec_studies_total) studies) are multi-pollutant studies.
r perc_pop_above_x_hp_micr_gm percent of the world population, or r tot_pop_above_x_hp_micr_gm_millions million people, live in areas where the annual average PM~2.5~ pollution is greater than r pm2.5_thresh_highly_polluted µg/m³. But, only r perc_epi_studies_in_hp percent (r epi_num_studies_in_hp PM~2.5~ specific studies) of the total PM~2.5~ studies, have been performed in these highly polluted parts of the world. These highly polluted areas are areas where the average PM~2.5~ pollution is at least r round(pm2.5_thresh_highly_polluted/who_pm2.5_guideline, 1) times the WHO PM~2.5~ safe guideline of r who_pm2.5_guideline µg/m³.
Approximately r perc_pop_above_x_sp_micr_gm percent of the world population (r tot_pop_above_x_sp_micr_gm_millions million people) live in the most severely polluted parts of the world, where annual average PM~2.5~ pollution is upwards of r pm2.5_thresh_severly_polluted µg/m³ (at least r round(pm2.5_thresh_severly_polluted/who_pm2.5_guideline, 1) times the WHO safe guideline). In these most severely polluted parts of the world, r epi_num_studies_in_sp PM~2.5~ AQ epi studies have been performed .
Most of the PM~2.5~ AQ epi studies (r perc_epi_studies_in_lp percent of the total number of PM~2.5~ studies, or r epi_num_studies_in_lp studies) performed so far, are concentrated in areas where the average PM~2.5~ concentration is in the 0-25 µg/m³ range. People living in these areas (r perc_pop_below_x_lp_micr_gm percent of the world population) are breathing air that is much less polluted relative to the people living in the most polluted parts of the world (as seen above). But, even in the 0-25 µg/m³ bucket, anyone living above r who_pm2.5_guideline µg/m³, is out of compliance with the WHO PM~2.5~ guideline.

Geographic Distribution of Studies

#> Add a note mentioning that the underlying data for this graph includes all types of pollutants and also that, each country in a pooled study is counted separately.

    # generate summary dataset for total number of studies conducted in a given country
    epi_country_count <- epi %>%
      dplyr::select(country) %>%
      dplyr::count(country) %>%
      dplyr::filter(country != "NA", !is.na(country), !is.na(n))

    # making country names match before joining epi file with color file
    epi_country_count$country <- stringr::str_replace(epi_country_count$country, "USA", "United States")

    #> plot choropleth map

    # get data ready for the choropleth map
    world <- rnaturalearth::ne_countries(scale = "medium", returnclass = "sf")


    # making country names match before joining epi file with color file
    world$name_long <- stringr::str_replace(world$name_long, "Russian Federation", "Russia")
    world$name_long <- stringr::str_replace(world$name_long, "Lao PDR", "Laos")
    world$name_long <- stringr::str_replace(world$name_long, "Dem. Rep. Korea", "North Korea")
    world$name_long <- stringr::str_replace(world$name_long, "Republic of Korea", "South Korea")
    world$name_long <- stringr::str_replace(world$name_long, "Vatican", "Vatican City")
    world$name_long <- stringr::str_replace(world$name_long, "Brunei Darussalam", "Brunei")
    world$name_long <- stringr::str_replace(world$name_long, "Somaliland", "Somalia")
    world$name_long <- stringr::str_replace(world$name_long, "Mexico", "México")
    world$name_long <- stringr::str_replace(world$name_long, "Republic of Congo", "Republic of the Congo")
    world$name_long <- stringr::str_replace(world$name_long, "Czech Republic", "Czechia")

    # joining world shape file with the epi conutry wise count dataset
    world_shp_epi <- world %>%
      dplyr::left_join(epi_country_count, by = c("name_long" = "country")) %>%
      dplyr::select(name_long, n, geometry) %>%
      dplyr::rename(num_studies = n, country = name_long)

    # color 2020 collapse to country level to get the PM2.5 pollution layer (first layer of the map)
    aqli_color_country <- aqli_color %>%
      dplyr::group_by(country) %>%
      dplyr::mutate(pop_weights = population/sum(population, na.rm = TRUE),
                    pm2021_pop_weighted = pm2021*pop_weights) %>%
      dplyr::summarise(avg_pm2.5_2021 = sum(pm2021_pop_weighted, na.rm = TRUE))

    # joining the world shape + epi joined file with the aqli color file
    world_shp_epi_color <- world_shp_epi %>%
      dplyr::left_join(aqli_color_country, by = "country") %>%
      dplyr::filter(country != "Antarctica") %>%
      dplyr::select(country, num_studies, avg_pm2.5_2021, geometry)

    # In the total number of studies column, replacing NAs with 0s
    world_shp_epi_color <- world_shp_epi_color %>%
      dplyr::mutate(num_studies = ifelse(is.na(num_studies) == TRUE, 0, num_studies))

    # getting centroids for countries after correcting for problematic polygons
    world_shp_epi_color_for_centroids <<- sf::st_make_valid(world_shp_epi_color)
    country_wise_centroids <<- sf::st_centroid(world_shp_epi_color_for_centroids, of_largest_polygon = TRUE)

    # adding an identifier column for countries where no studies have happened
    country_wise_centroids$color_circle = ifelse(country_wise_centroids$num_studies == 0,  "No Studies", "Atleast 1 study")

    # creating a dataframe out of the latitude, longitude coordinates of the country centroids, so that we can plot it
    # using geom_point.
    coord_df <- dplyr::as_tibble(sf::st_coordinates(country_wise_centroids))

    # reassigning the X and Y coordinates to x_coord and y_coord
    country_wise_centroids$x_coord <- coord_df$X
    country_wise_centroids$y_coord <- coord_df$Y

    # moving the geometry column to the end and making sure that the entire dataset is a sf object
    country_wise_centroids <- country_wise_centroids %>%
      dplyr::select(-geometry, geometry) %>%
    sf::st_as_sf()

    # without geometry version of country wise centroids dataset and adding a label_plt column
    country_wise_centroids_sans_geom <- country_wise_centroids %>%
    sf::st_drop_geometry() %>%
      dplyr::as_tibble() %>%
      dplyr::mutate(label_plt = ifelse(num_studies == 0, "", num_studies))


# plotting the choropleth
geographic_dist_graph <- world_shp_epi_color %>%
        ggplot2::ggplot() +
        ggplot2::geom_sf(mapping = ggplot2::aes(fill = avg_pm2.5_2021, group = avg_pm2.5_2021), color = "darkgrey") +
        ggplot2::geom_sf(data = world_shp_epi_color %>% dplyr::filter(num_studies > 0) %>% sf::st_as_sf(), color = "black", size = 6, fill = "transparent") +
        colorspace::scale_fill_continuous_sequential(palette = "YlOrRd", name = expression("Annual Average" ~ PM[2.5] ~ "in 2021 (in µg/m³)")) +
        ggplot2::geom_jitter(data = country_wise_centroids_sans_geom, mapping = aes(x = x_coord, y = y_coord, size = num_studies, color = color_circle)) +
        ggplot2::geom_text(data = country_wise_centroids_sans_geom, mapping = aes(x = x_coord, y = y_coord, label = label_plt), size = 3, color = "white", size = 2) +
        ggplot2::theme(legend.position = "bottom") +
        ggplot2::scale_size(breaks = c(1, 5, 10, 20, 30), range = c(1, 12),
                            guide = ggplot2::guide_legend(override.aes = list(color = "darkgrey"))) +
        ggplot2::labs(caption = stringr::str_wrap("*The size of the circles is directly proportional to the total number of times a given country has been included
                                                in a study (any study, not necessarily PM2.5 specific - mentioned as the number in the circle). Countries with atleast 1 study are highlighted (black borders)."), width = 30,
                      size = "Number of Studies") +
        ggplot2::scale_color_manual(values = c("No Studies" = "cadetblue3", "Atleast 1 studys" = "darkolivegreen"), name = "") +
        ggthemes::theme_map() +
       ggplot2::ggtitle(expression("Country wise distribution of epi studies and Annual Average" ~PM[2.5] ~ "pollution (in µg/m³)")) +
        ggplot2::theme(plot.caption = ggplot2::element_text(hjust = 0, size = 7),
                       legend.position = "bottom",
                       legend.justification = c(0.5, 3),
                       plot.title = ggplot2::element_text(hjust = 0.5, size = 11), 
                       legend.text = element_text(size = 5), 
                       legend.title = element_text(size = 6))

#> Other calculations for this block of code

# Of all AQ epi studies included in this analysis how many included populations in either the US or Canada or both
num_times_usa_canada_included <- epi %>% 
  filter(country %in% c("USA", "Canada")) %>%
  distinct(paper_uid, .keep_all = TRUE) %>%
  nrow()

# total number of times USA, Canada and Europe have been included in a study
num_times_usa_canada_europe_included <- epi %>% 
  filter((country %in% c("Canada", "USA")) | (continent %in% c("Europe"))) %>% 
  distinct(paper_uid, .keep_all = TRUE) %>%
  nrow()

# Countries other than these: USA, Canada and the ones that are included in Europe that have been included in a study
areas_other_than_usa_canada_europe <- epi %>% 
  filter((country %notin% c("Canada", "USA")) & (continent %notin% c("Europe"))) %>%
  select(country) %>%
  unique() %>%
  unlist() %>%
  as.vector()

# number of times a continent have been included in a study
continent_wise_study_summary <- epi %>%
  group_by(paper_uid, continent) %>% 
  summarise(count = n()) %>%
  ungroup() %>%
  select(continent) %>%
  count(continent) %>%
  slice_max(n, n = 6) %>%
  mutate(prop_studies = round((n/sum(n, na.rm = TRUE))*100, 1))

# total number of times South America was included in a study
num_times_sa_included <- continent_wise_study_summary %>%
  filter(continent == "South America") %>%
  select(n) %>%
  unlist() %>%
  as.vector()

# total number of times Oceania was included in a study
num_times_oceania_included <- continent_wise_study_summary %>%
  filter(continent == "Oceania") %>%
  select(n) %>%
  unlist() %>%
  as.vector()

# total number of studies Africa has been included in a study
num_times_africa_included <- continent_wise_study_summary %>%
  filter(continent == "Africa") %>%
  select(n) %>%
  unlist() %>%
  as.vector()

# plot geographic distribution
geographic_dist_graph

r round((num_times_usa_canada_included/(epi %>% distinct(paper_uid, .keep_all = TRUE) %>% nrow()))*100, 1) percent of all AQ epi studies (r num_times_usa_canada_included studies) included in this analysis included populations in either the US or Canada or both.
USA, Canada and Europe (or some combination of them) were focus countries in r round((num_times_usa_canada_europe_included/(epi %>% distinct(paper_uid, .keep_all = TRUE) %>% nrow()))*100, 1) percent of all AQ epi studies (r num_times_usa_canada_europe_included studies). Other countrie(s) that have been included in an AQ epi study: r areas_other_than_usa_canada_europe.
Of the total number of times a given continent's population has been included in any given AQ epi study, r continent_wise_study_summary$continent[1] dominates the rest of the continents and has been included r continent_wise_study_summary$prop_studies[1] percent of times (all due to studies in the US or Canada). Closely following North America, populations in r continent_wise_study_summary$continent[2] and r continent_wise_study_summary$continent[3] have been included in AQ epi studies r continent_wise_study_summary$prop_studies[2] and r continent_wise_study_summary$prop_studies[3] percent of times respectively. r missing_continents have seen 0 long term (> r min_study_duration_in_years year), large (>r min_sample_size people in the sample) AQ epi study.

AQ Epi studies over time

#> Add a note mentioning that the underlying data for this graph includes all types of pollutants but only one row is counted for a given pooled (multi-country) study.

epi %>% 
dplyr::distinct(paper_uid, .keep_all = TRUE) %>%
  group_by(publishing_year) %>%
  summarise(total_studies = n()) %>%
  ungroup() %>%
  filter(total_studies != "NA", !is.na(total_studies), publishing_year != "NA", !is.na(publishing_year)) %>%
  ggplot() +
  geom_line(mapping = aes(x = publishing_year, y = total_studies), lwd = 1.2, color = "cornflowerblue") +
  labs(x = "Year", y = "Total Number of Studies", 
       caption = str_wrap("*This graph displays the total number of AQ epi studies (all studies, not PM2.5 specific) published over time."), width = 10) +
  scale_x_continuous(breaks = seq(min(epi$publishing_year, na.rm = TRUE), max(epi$publishing_year, na.rm = TRUE), 2)) +
  theme_hc() +
    theme(plot.caption = element_text(hjust = 0))

In most of the 90’s and early 2000’s, the rate of AQ epi studies publishing was around 1 to 2 studies per annum on average.
Post 2009, there has been a noticeable increase in the overall volume of AQ epi studies published.

Distribution of duration of study by continent

#>  In this, in the graph multiple countries of a pooled study are counted separately. Also, this is not a PM2.5 specific graph.

#> Density plot of study duration (commented for now, can use later if required)
# epi %>%
#   filter(!is.na(study_duration)) %>%
#   ggplot() +
#   geom_density(mapping = aes(x = study_duration, fill = continent), alpha = 0.5) +
#     labs(x = "Study Duration") +
#   ggtitle("Epi Studies Duration Distribution (Continent Wise): Density Plot") +
#   scale_x_continuous(breaks = seq(0, 40, 5)) +
#   theme_tufte() +
#   theme(legend.position = "bottom")


#> Histogram of study duration



#> adding an order panel and also dummy panels (for those cases in which we are looking at a subset of a dataset that does not include certain continents).


   if(sum(missing_continents_logical) == 0){
      continent_wise_study_duration_graph_summary_table <-   epi_tmp %>%
        dplyr::filter(continent != "NA", !is.na(continent), study_duration != "NA", !is.na(study_duration)) %>%
        dplyr::group_by(paper_uid, continent) %>%
        dplyr::summarise(average_study_duration = mean(study_duration, na.rm = TRUE)) %>%
        dplyr::ungroup() %>%
        dplyr::filter(continent != "Oceania") %>%
        #dplyr::mutate(continent = as.factor(continent)) %>%
        dplyr::mutate(order_continent = ifelse(continent == "China", 2, 0),
                      order_continent = ifelse(continent == "Asia - China", 1, order_continent),
                      order_continent = ifelse(continent == "Europe", 5, order_continent),
                      order_continent = ifelse(continent == "North America", 6, order_continent),
                      order_continent = ifelse(continent == "South America", 4, order_continent),
                      order_continent = ifelse(continent == "Africa", 3, order_continent)) %>%
        dplyr::mutate(continent = ifelse(continent == "China", "China (1402.5 million people)", continent),
                      continent = ifelse(continent == "Africa", "Africa (1373.8 million people)", continent),
                      continent = ifelse(continent == "North America", "North America (589.2 million people)", continent),
                      continent = ifelse(continent == "South America", "South America (429.2 million people)", continent),
                      continent = ifelse(continent == "Europe", "Europe (740.8 million people)", continent),
                      continent = ifelse(continent == "Asia - China", "Asia - China (3161.5 million people)", continent))

    } else{
      for(i in 1:length(missing_continents)){
        if(i == 1){
          continent_wise_study_duration_graph_summary_table <- epi_tmp %>%
            dplyr::filter(continent != "NA", !is.na(continent), study_duration != "NA", !is.na(study_duration)) %>%
            dplyr::group_by(paper_uid, continent) %>%
            dplyr::summarise(average_study_duration = mean(study_duration, na.rm = TRUE)) %>%
            dplyr::ungroup() %>%
          dplyr::add_row(continent = missing_continents[i])
        } else{
          continent_wise_study_duration_graph_summary_table <- continent_wise_study_duration_graph_summary_table %>%
            dplyr::add_row(continent = missing_continents[i])
        }

      }
      continent_wise_study_duration_graph_summary_table <- continent_wise_study_duration_graph_summary_table %>%
        dplyr::filter(continent != "Oceania") %>%
       # dplyr::mutate(continent = as.factor(continent)) %>%
        dplyr::mutate(order_continent = ifelse(continent == "China", 2, 0),
                      order_continent = ifelse(continent == "Asia - China", 1, order_continent),
                      order_continent = ifelse(continent == "Europe", 5, order_continent),
                      order_continent = ifelse(continent == "North America", 6, order_continent),
                      order_continent = ifelse(continent == "South America", 4, order_continent),
                      order_continent = ifelse(continent == "Africa", 3, order_continent)) %>%
        dplyr::mutate(continent = ifelse(continent == "China", "China (1402.5 million people)", continent),
                      continent = ifelse(continent == "Africa", "Africa (1373.8 million people)", continent),
                      continent = ifelse(continent == "North America", "North America (589.2 million people)", continent),
                      continent = ifelse(continent == "South America", "South America (429.2 million people)", continent),
                      continent = ifelse(continent == "Europe", "Europe (740.8 million people)", continent),
                      continent = ifelse(continent == "Asia - China", "Asia - China (3161.5 million people)", continent))
    }



continent_wise_study_duration_graph <- continent_wise_study_duration_graph_summary_table %>%
        ggplot2::ggplot() +
        ggplot2::geom_histogram(mapping = ggplot2::aes(x = average_study_duration, fill = continent, group = continent), color = "white", position = "identity", binwidth = 2) +
        ggplot2::labs(x = "Study Duration", y = "Count",
                      caption = stringr::str_wrap("*This graph displays distribution by continent (and for China, Asia - China) for all studies (not PM2.5 specific)."), width = 10,
                      title = expression("Continent (and China) wise Study Duration Distribution and 2021 Annual Average" ~ PM[2.5] ~ "(in µg/m³)"),
                      subtitle = "(Ordered from most polluted to least polluted)") +
        ggplot2::scale_x_continuous(breaks = seq(0, 40, 10)) +
        ggthemes::theme_hc() +
        ggplot2::theme(legend.title = element_blank()) +
        ggplot2::facet_wrap(~forcats::fct_reorder(continent, order_continent), nrow = 1) +
        ggplot2::theme(axis.line.y = ggplot2::element_line(color = "black"),
                       axis.line.x = ggplot2::element_line(color = "black"),
                       plot.caption = ggplot2::element_text(hjust = 0, size = 7),
                       plot.title = element_text(size = 10, hjust = 0.5),
                       plot.subtitle = element_text(size = 8, hjust = 0.5),
                       strip.background = element_rect(fill = "floralwhite"),
                       strip.text = element_text(size = 4.5),
                       legend.background = element_rect(color = "black"), 
                       legend.text = element_text(size = 7)) +
        ggplot2::scale_fill_manual(values = c("North America (589.2 million people)" = "#FFBA00", "Europe (740.8 million people)" = "#FF9600",
                                     "South America (429.2 million people)" = "#FF6908", "Africa (1373.8 million people)" = "#E63D23", "China (1402.5 million people)" = "#BD251C",
                                     "Asia - China (3161.5 million people)" = "#8C130E")) +
        ggplot2::scale_y_continuous(breaks = seq(0, 10, 2))





#> calculations relevant for this block of code

# long term study threshold
long_term_study_duration <- 25

# number of "really long term studies"
num_really_long_term_studies <- epi %>%
  group_by(paper_uid) %>%
  summarise(mean_duration = mean(study_duration, na.rm = TRUE)) %>%
  filter(mean_duration > long_term_study_duration) %>%
  nrow()


# continents that have seen studies greater than x (long_term) years of study duration
epi_long_term_studies_countries <- epi %>%
  group_by(paper_uid, continent) %>%
  summarise(mean_study_duration = mean(study_duration, na.rm = TRUE)) %>% 
  ungroup() %>%
  filter(mean_study_duration > long_term_study_duration, mean_study_duration != "NA", !is.na(mean_study_duration)) %>%
  select(continent) %>%
  count(continent) %>%
  mutate(prop_n = round((n/sum(n, na.rm = TRUE))*100, 1)) %>%
  slice_max(prop_n, n = 6)

# total population in missing continents combined
tot_pop_africa_missing_continents <- aqli_color %>%
  filter(continent %in% missing_continents) %>%
  summarise(tot_pop = sum(population, na.rm = TRUE)) %>%
  unlist() %>%
  as.numeric() 

tot_pop_africa_missing_continents_in_billions <- round(tot_pop_africa_missing_continents/1000000000, 1)

# total world population
world_population <- aqli_color %>%
  summarise(total_population = sum(population, na.rm = TRUE)) %>%
  unlist() %>%
  as.numeric()



# continent order list (order variable: number of studies): code present in the geographical distribution map block.

# plot continent wise study duration graph
continent_wise_study_duration_graph

In total, r num_really_long_term_studies "really" long term studies (> r long_term_study_duration years) have been performed. That is, r round((num_really_long_term_studies/nrow(epi %>% distinct(paper_uid, .keep_all = TRUE)))*100, 1) percent of the total number of studies.
All of the following continents have seen at least 1 really long term study: r epi_long_term_studies_countries %>% filter(n >= 1) %>% select(continent) %>% unlist() %>% as.vector().
All of the following continents have seen more than 1 really long term study: r epi_long_term_studies_countries %>% filter(n > 1) %>% select(continent) %>% unlist() %>% as.vector().
In all of the really long term studies (including both multi-country/continent and single-country), i.e. ones that have a study duration of > r long_term_study_duration years - r epi_long_term_studies_countries$continent[1] was included r epi_long_term_studies_countries$n[1] times in those studies (r epi_long_term_studies_countries$prop_n[1] percent of the total number of times any given continent is included in any given study).
Similarly, r epi_long_term_studies_countries$continent[2] has been included r epi_long_term_studies_countries$n[2] times in such long term studies (r epi_long_term_studies_countries$prop_n[2] percent of the total number of times any given continent is included in any given study).

# press ctrl + shift + c to uncomment this.

# ### Major takeaways
# 
# * `r round((num_times_usa_canada_included/(epi %>% distinct(paper_uid, .keep_all = TRUE) %>% nrow()))*100, 1)` percent of all AQ epi studies (`r num_times_usa_canada_included` studies) included in this analysis included populations in either the US or Canada or both.
# 
# * Of the total number of times a given continent's population has been included in any given AQ epi study, `r continent_wise_study_summary$continent[1]` dominates the rest of the continents and has been included `r continent_wise_study_summary$prop_studies[1]` percent of times (all due to studies in the US or Canada). Closely following North America, populations in `r continent_wise_study_summary$continent[2]` and `r continent_wise_study_summary$continent[3]` have been included in AQ epi studies `r continent_wise_study_summary$prop_studies[2]` and `r continent_wise_study_summary$prop_studies[3]` percent of times respectively. `r missing_continents` have seen 0 long term (> `r min_study_duration_in_years` year), large (>`r min_sample_size` people in the sample) AQ epi study.
#   
# * There have been no large (> `r min_study_duration_in_years` year) or long (>`r min_sample_size` people in the sample) PM epi studies that were performed in `r missing_continents`, with a combined population of `r tot_pop_africa_missing_continents_in_billions` billion, `r round((tot_pop_africa_missing_continents/world_population)*100, 1)` percent of the Earth's total human population.
# 
# * Post 2009, there has been a noticeable increase in the overall volume of AQ epi studies published.
# 
# * In total, `r num_really_long_term_studies` "really" long term studies (> `r long_term_study_duration` years) have been performed. That is, `r round((num_really_long_term_studies/nrow(epi %>% distinct(paper_uid, .keep_all = TRUE)))*100, 1)` percent of the total number of studies.
# 
# * All of the following continents have seen at least 1 really long term study: `r epi_long_term_studies_countries %>% filter(n >= 1) %>% select(continent) %>% unlist() %>% as.vector()`.

Conclusion

All studies included in this analysis point to the same overall picture:air pollution is a serious health threat. The existing state of scientific literature on air pollution and health is clear that air pollution’s impact on health is well-established and taking action to improve a polluted environment should not be delayed in order to complete multi-year large sample (> r min_sample_size) epidemiological studies in an area, even if there has not been a prior study in that particular geography. That said, it is important for the field of air quality epidemiolgy to understand the contours of its current research landscape to most effectively identify directions for future research and deploy limited resources.

Methodology

Through a comprehensive and ongoing* literature review, we are making an attempt at creating an exhaustive public listing of all the epidemiological studies out there (that we could find) that examine the relationship between PM~2.5~ and Life Expectancy/Mortality.

For each study, we record data on key defining features, such as: Geography, Sample Size, Study Duration, PM~2.5~ exposure range, etc. Then we used this analysis dataset to carry out a meta-analysis, results of which are detailed in the Results section above.

We are seeking to make this analysis as current, complete and error-free as possible and view it as a continual work in progress. We would welcome the air quality community’s any comments, corrections, andor suggestions. Please contact aqli-epic@uchicago.edu or leave a comment in this GitHub repository.

Inclusion/Exclusion Criteria for studies

The underlying analysis dataset for this meta analysis focuses only on those papers that study the link between PM~2.5~, PM~10~, TSP, Fine Particulate Matter and __Mortality/Life Expectancy__. In addition to the analysis dataset, there is a master dataset that expands on the analysis dataset to include other useful information. The idea behind the master dataset is to record any additional details (whether additional facts about the paper, or details on other pollutants studied in the paper) that will not be a part of the main data analysis exercise. Master dataset will also list additional papers that do not fit into the inclusion criteria.
The analysis dataset excludes the following types of papers: meta-analysis, unpublished papers, papers studying the effects of indoor air pollution on Mortality/Life Expectancy, papers forecasting future air pollution/life expectancy/mortality. But, the master dataset lists all of these studies.
In Multi-Country (pooled) studies, one entry (one row in the analysis dataset) is recorded for each country in papers where country level data is available. There are some pooled studies where country level data is not available, but rather data is available for a custom region (e.g. South-East Asia), such pooled studies are excluded from the analysis dataset. But, the master dataset still lists all of these additional studies for reference.
In cases where the same cohort is studied by different research groups at different points in time/using different methods: we have included and counted all of those studies as separate unique studies in the analysis below.
Papers studying the health effects of pollution segregated by sectors (source apportionment type studies), regions (example, Rural v/s Urban), season (Winter v/s Summer) are excluded from this analysis.
Papers that include combined estimates of Household and Ambient Air Pollution, but not individual estimates are excluded.
All papers studying the impact of pollutants on DALY's (Disability Adjusted Life Years), are excluded from this analysis.

Other important points

In papers, where the minimum PM~2.5~ concentration was not reported, we have assumed the lowest available percentile data available on PM2.5 concentration as the minimum concentration.
In many papers, only one of the mean PM~2.5~ or PM~2.5~ range is reported but not both. In these cases, wherever the data is not available (whether it is mean PM~2.5~ or PM~2.5~ range) we have recorded the instances as "NA". Apart from this, anywhere we couldn’t find data, we have recorded it as "NA". This is one of the limitations of this analysis.
Different papers talk about different types of mean PM~2.5~ values. Some report daily averages, while others may report monthly/annual averages. Also, in some cases, the averages are population weighted and in other cases they are not. But, it is not always clearly stated as to which type of average the paper is referring to (or how it was calculated). In scenarios like these, we had to make a value judgement and this is one of the limitations of this analysis.
In cases, where mean PM~2.5~ is reported for both the start year and the end year(or also for sometime between a start year and the end year) of the study, we have reported the end year mean PM~2.5~ value.
Papers that have used a conversion factor to convert mean PM~10~ concentrations to mean PM~2.5~ concentrations are included but, there PM~2.5~ values (mean, lower limit, upper limit, standard deviation) is not recorded.
In multi-pollutant studies where different pollutants have been studied over different periods of time, we have chosen the specific time period that corresponds to the PM~2.5~ pollutant (in cases where we PM~2.5~ is not present, we have recorded the time period corresponding to one of the other pollutants).
We have recorded one mean PM~2.5~ value for each paper. But, there are papers where more than one mean PM~2.5~ value is reported (for example, one for the male group and one for the female group). In these cases, we have picked one value from the ones that are available.
In some papers, the exact start and/or end year of a study is unclear. In these cases, we have mostly chosen the first instance of the multiple “study duration ranges” (depending on multiple plausible interpretations of when the final follow up ended) the paper.
In many papers, the upper limit or lower limit for age is not precisely specified. In these cases, we have recorded a NA. For example, there are papers where the upper limit category for age is 85+. In these cases, although we know the upper limit category, we still don't know the upper limit of the age, which could be 90, 95, 100, etc.
In places, where the sample size is specifed in terms of "number of regions" (for example, 17 districts) and not "number of people", we have recorded a NA in the cohort size column.
Under the “methods” column (in the master dataset), we have only mentioned a subset of methods that were used to carry out different parts of the study. Authors may have used other methods than those mentioned in our meta analysis.
Different graphs are generated using different subsets of the analysis dataset. Most of them are PM~2.5~ specific (our main focus), others include all pollutants (PM~2.5~, PM~10~, TSP, Ultrafine Particles). The type of papers used to generate a given graph and other nuances for the graph in question are specified within the graph (as a note) and/or the accompanying text.

How can you (the community) help in improving this analysis?

Add to the analysis dataset and and grow the epi database:
If you know of other papers that: (a) are attempting to study the link between PM~2.5~ and Life Expectancy/Mortality and (b) are not included in this analysis: Please leave a comment in the analysis dataset, providing a link to the paper. You can also write to us at aqli-info@uchicago.edu (with the link to the paper mentioned in the email).
- As a next step, we’ll go through your submission, and if it fits our inclusion criteria, we will update the underlying analysis dataset and re-render the entire blog, so that it represents the most up to date data and figures.
- If you are unsure (given the inclusion criteria) about whether a particular paper (that you want to post) should be posted on not, we encourage you to post it and let us worry about the inclusion/exclusion bit.
In case you have comments on some aspects of a paper/if you find any errors, please leave a comment in the analysis dataset on the cell (tagging aqli-info@uchicago.edu, using @ symbol) where the error is found or write to us at aqli-info@uchicago.edu, detailing the error and its location (cell address on the sheet).

References

Other Graphs and Interactive Summary Dashboard

To further explore these graphs and more in an interactive setup, visit the AQ Epi dashboard.

####  PM2.5 distributions of the lower limits and upper limits of exposure range (Density and Histogram Plots)

## All graphs below this block of code is deactivated for the blog post. This will be displayed on the dashboard, alongside other graphs.


#> Specific to PM2.5 and count separate countries in a pooled study separately

#> exposure distribution density and histogram plots (PM2.5 LL and PM2.5 UL)

# create long dataset
epi_long <- epi %>% 
  filter(!is.na(mean_pm2.5), mean_pm2.5 != "NA", non_pm2.5 == 0) %>%
  pivot_longer(cols = contains("pm2.5_exposure"), names_to =  "exposure_type", values_to = "exposure_value") %>%
  select(paper_uid, exposure_type, exposure_value) %>%
  filter(!is.na(exposure_value)) 

# density plot
epi_long %>%
  group_by(paper_uid, exposure_type) %>%
  summarise(exposure_value = ifelse(exposure_type == "pm2.5_exposure_ll", min(exposure_value, na.rm = TRUE), max(exposure_value, na.rm = TRUE))) %>%
  ggplot() +
  geom_density(mapping = aes(x = exposure_value, fill = exposure_type), alpha = 0.6, position = "identity") +
  labs(x = expression(paste("PM2.5 concentration (", mu, "g", "/", m^3, ")"))) +
  theme_tufte()

# histogram plot
epi_long %>%
  group_by(paper_uid, exposure_type) %>%
  summarise(exposure_value = ifelse(exposure_type == "pm2.5_exposure_ll", min(exposure_value, na.rm = TRUE), max(exposure_value, na.rm = TRUE))) %>%
  ggplot() +
  geom_histogram(mapping = aes(x = exposure_value, fill = exposure_type), position = "identity", alpha = 0.4, color = "white") +
  labs(x = expression(paste("PM2.5 concentration (", mu, "g", "/", m^3, ")"))) +
  theme_tufte()

#### Age distributions of lower limits and upper limits of age range (Density and Histogram plots)

#> Different countries in a pooled study should be counted separately. As of now this is PM2.5 specific, but can be generalized in the future.

#> (5) age distribution plot (LL and UL) using the long dataset (density plot)
epi_long_age_dist <- epi %>% 
  filter(((!is.na(cohort_age_ll)) | (cohort_age_ll != "NA")) & ((!is.na(cohort_age_ul)) | (cohort_age_ul != "NA"))) %>%
  pivot_longer(cols = contains("cohort_age"), names_to =  "age_dist_type", values_to = "age_value") %>%
  select(paper_uid, age_dist_type, age_value) %>%
  filter(!is.na(age_value)) 

epi_long_age_dist %>%
  ggplot() +
  geom_density(mapping = aes(x = age_value, fill = age_dist_type), alpha = 0.6, color = "black") +
  labs(x = "Age") +
  theme_tufte()

#> (6) age distribution plot (LL and UL) using the long dataset (histogram plot)
epi_long_age_dist %>%
  ggplot() +
  geom_histogram(mapping = aes(x = age_value, fill = age_dist_type), alpha = 0.5, position = "identity", color = "white") +
  labs(x = "Age") +
   theme_tufte()

#### Country wise distribution of Cohort Sizes (Density and Histogram plots)

#> Different countries in a pooled study should be counted separately. As of now this is PM2.5 specific, but can be generalized in the future.


#> (7) Country Wise Cohort Size density plot
epi %>%
    filter(cohort_size != "NA", !is.na(cohort_size)) %>%
  ggplot() +
  geom_density(mapping = aes(x = log10(cohort_size), fill = country), alpha = 0.5) +
  theme_tufte()

#> (8) Country wise Cohort Size histogram plot
epi %>%
  filter(cohort_size != "NA", !is.na(cohort_size)) %>%
  ggplot() +
  geom_histogram(mapping = aes(x = log10(cohort_size), fill = country), alpha = 0.5, position = "identity") +
  theme_tufte()

#### Country wise Distribution of Study Duration (Density and Histogram Plots)

#> Different countries in a pooled study should be counted separately. As of now this is PM2.5 specific, but can be generalized in the future.


#> (9) Country wise Study duration density plot
epi %>%
  filter(study_duration != "NA", !is.na(study_duration)) %>%
  ggplot() +
  geom_density(mapping = aes(x = study_duration, fill = country, alpha = 0.5)) +
  theme_tufte()

#> Country Wise Study Duration histogram

epi %>%
  filter(study_duration != "NA", !is.na(study_duration)) %>%
  ggplot() +
  geom_histogram(mapping = aes(x = study_duration, fill = country, alpha = 0.5), position = "identity", alpha = 0.5, color = "white", binwidth = 1) +
  theme_tufte() +
  facet_wrap(~country, nrow = 5) +
  theme(legend.position = "bottom")

# Experimental graphs (not to be put on the GitHub blog)

#### Country wise Distribution of Study Duration (Density and Histogram Plots)

#> Different countries in a pooled study should be counted separately. As of now this is PM2.5 specific, but can be generalized in the future.


#> (9) Country wise Study duration density plot
epi %>%
  filter(study_duration != "NA", !is.na(study_duration)) %>%
  ggplot() +
  geom_density(mapping = aes(x = study_duration, fill = country, alpha = 0.5)) +
  theme_tufte()

#> Country Wise Study Duration histogram

epi %>%
  filter(study_duration != "NA", !is.na(study_duration)) %>%
  ggplot() +
  geom_histogram(mapping = aes(x = study_duration, fill = country, alpha = 0.5), position = "identity", alpha = 0.5, color = "white", binwidth = 1) +
  theme_tufte() +
  facet_wrap(~country, nrow = 5) +
  theme(legend.position = "bottom")

# Experimental Graphs (others not to be put on the GitHub blog)

#### Country wise Distribution of Study Duration (Density and Histogram Plots)

#> Different countries in a pooled study should be counted separately. As of now this is PM2.5 specific, but can be generalized in the future.


#> (9) Country wise Study duration density plot
epi %>%
  filter(study_duration != "NA", !is.na(study_duration)) %>%
  ggplot() +
  geom_density(mapping = aes(x = study_duration, fill = country, alpha = 0.5)) +
  theme_tufte()

#> Country Wise Study Duration histogram

epi %>%
  filter(study_duration != "NA", !is.na(study_duration)) %>%
  ggplot() +
  geom_histogram(mapping = aes(x = study_duration, fill = country, alpha = 0.5), position = "identity", alpha = 0.5, color = "white", binwidth = 1) +
  theme_tufte() +
  facet_wrap(~country, nrow = 5) +
  theme(legend.position = "bottom")


# Experimental graphs

#> exp graph 1 - global aqli 2020 data pm2020 column distribution overlayed with vertical lines showing mean_pm2.5 values of various studies.---------------------------------------------------------
mean_pm2.5_studies <- epi$mean_pm2.5[!is.na(epi$mean_pm2.5)] 


# exp graph 1 v1
exp_graph_1_v1 <- aqli_color %>%
  ggplot() +
  geom_histogram(mapping = aes(x = pm2020), fill = "darkred", color = "black", alpha = 0.7, size = 0.8) +
  geom_vline(xintercept = mean_pm2.5_studies, color = "black", alpha = 0.4, linetype = "longdash", size = 0.4) + 
  ggthemes::theme_tufte() +
  labs(x = expression("Annual Average" ~ PM[2.5] ~ "concentrations in 2020 (in µg/m³)"), 
       y = "count") +
  theme(axis.line = element_line()) +
  scale_x_continuous(breaks = seq(0, 120, 20)) +
  scale_y_continuous(breaks = seq(0, 12000, 2000))

# exp graph 1 v2
FOO <- aqli_color %>%
  ggplot() +
  geom_histogram(mapping = aes(x = pm2020), fill = "darkred", color = "black", alpha = 0.7, size = 0.8) +
  geom_vline(xintercept = mean_pm2.5_studies, color = "black", alpha = 0.4, linetype = "longdash", size = 0.4) + 
  ggthemes::theme_tufte() +
  labs(x = expression("Annual Average" ~ PM[2.5] ~ "concentrations in 2020 (in µg/m³)"), 
       y = "count") +
  theme(axis.line = element_line()) +
  scale_x_continuous(breaks = seq(0, 120, 20)) +
  scale_y_continuous(breaks = seq(0, 12000, 2000)) + 
  annotate("segment", x = min(mean_pm2.5_studies, na.rm = TRUE), y = 12000, xend = max(mean_pm2.5_studies, na.rm = TRUE), yend = 12000, 
             arrow = arrow(length = unit(0.3, "cm"), type = "closed")) + 
  annotate("segment", x = 60, y = 12000, xend = 60, yend = 10000) + 
  annotate("segment", x = 60, y = 10000, xend = 90, yend = 10000) +  
  annotate("text", x = 99, y = 10000, label = str_wrap("Mean PM2.5 range of studies", 12))

# exp graph 1 v3

aqli_color %>%
  ggplot() +
  geom_histogram(mapping = aes(x = pm2020), fill = "darkred", color = "black", alpha = 0.7, size = 0.8)+
  ggthemes::theme_tufte() +
  labs(x = expression("Annual Average" ~ PM[2.5] ~ "concentrations in 2020 (in µg/m³)"), 
       y = "count") +
  theme(axis.line = element_line()) +
  scale_x_continuous(breaks = seq(0, 120, 20)) +
  scale_y_continuous(breaks = seq(0, 12000, 2000)) + 
  annotate("segment", x = min(mean_pm2.5_studies, na.rm = TRUE), y = 12000, xend = max(mean_pm2.5_studies, na.rm = TRUE), yend = 12000, 
             arrow = arrow(length = unit(0.3, "cm"), type = "closed")) + 
  annotate("segment", x = 60, y = 12000, xend = 60, yend = 10000) + 
  annotate("segment", x = 60, y = 10000, xend = 90, yend = 10000) +  
  annotate("text", x = 99, y = 10000, label = str_wrap("Mean PM2.5 range of studies", 12))

# exp graph 1 v4

foo <- aqli_color %>%
  ggplot() +
  geom_histogram(mapping = aes(x = pm2020), fill = "darkred", color = "black", alpha = 0.7, size = 0.8)+
  ggthemes::theme_tufte() +
  labs(x = expression("Annual Average" ~ PM[2.5] ~ "concentrations in 2020 (in µg/m³)"), 
       y = "count") +
  theme(axis.line = element_line()) +
  scale_x_continuous(breaks = seq(0, 120, 20)) +
  scale_y_continuous(breaks = seq(0, 12000, 2000)) + 
  annotate("rect", xmin = min(mean_pm2.5_studies, na.rm = TRUE), ymin = 0, xmax = max(mean_pm2.5_studies, na.rm = TRUE), ymax = 12000, alpha = 0.2) + 
  annotate("segment", x = max(mean_pm2.5_studies, na.rm = TRUE), y = 10000, xend = 90, yend = 10000) +  
  annotate("text", x = 99, y = 10000, label = str_wrap("Mean PM2.5 range of studies", 12))

# hex version


#> exp graph v2-------------------------------------------------------------------------

#> PM2.5 exposure range and population distribution

# exp graph 2 v1

thresh_ll <- 0
thresh_ul <- 25


# aqli color population in ordered pm2.5 buckets
aqli_color_grp_pm2.5_buckets <- aqli_color %>%
  mutate(region = ifelse(pm2020 >= thresh_ll & pm2020 <= thresh_ul, str_c(thresh_ll, "-", thresh_ul), pm2020), 
         region = ifelse(pm2020 > thresh_ul, str_c(">", thresh_ul), region)) %>%
  group_by(region) %>%
  summarise(tot_pop = sum(population, na.rm = TRUE)) %>%
  ungroup() %>%
    mutate(order_pollution_group = ifelse(region == str_c(thresh_ll, "-", thresh_ul), 1, 0), 
         order_pollution_group = ifelse(region == str_c(">", thresh_ul), 2, order_pollution_group)) %>%
  ungroup() %>%
  mutate(tot_pop_prop = (tot_pop/sum(tot_pop))*100)


# epi total number of studies in ordered pollution buckets (initially filtering out pooled studies, will add back in using a for loop)
epi_num_studies_grp_pm2.5_buckets <- epi %>%
  filter(!is.na(mean_pm2.5), mean_pm2.5 != "NA", non_pm2.5 == 0) %>%
  group_by(paper_uid) %>%
  summarise(mean_pm2.5 = mean(mean_pm2.5, na.rm = TRUE)) %>%
    mutate(region = ifelse(mean_pm2.5 >= thresh_ll & mean_pm2.5 <= thresh_ul, str_c(thresh_ll, "-", thresh_ul), mean_pm2.5), 
         region = ifelse(mean_pm2.5 > 25, str_c(">", thresh_ul), region)) %>%
  mutate(order_pollution_group = ifelse(region == str_c(thresh_ll, "-", thresh_ul), 1, 0), 
         order_pollution_group = ifelse(region == str_c(">", thresh_ul), 2, order_pollution_group)) %>%
  group_by(region) %>%
  summarise(tot_studies = n(), order_pollution_group = order_pollution_group[1]) %>%
  ungroup() %>%
  mutate(tot_studies_prop = (tot_studies/sum(tot_studies))*100)

#> bar chart version

# creating a master dataset for both population and studies data
pop_epi_studies_data <- aqli_color_grp_pm2.5_buckets %>%
  left_join(epi_num_studies_grp_pm2.5_buckets, by = c("region", "order_pollution_group")) %>%
  select(region,  order_pollution_group, tot_pop_prop, tot_studies_prop) %>%
  pivot_longer(cols = tot_pop_prop:tot_studies_prop, names_to = "type_of_prop", values_to = "val")

# plotting the bar graph
pop_num_studies_in_pollution_buckets_graph <- pop_epi_studies_data %>%
  ggplot() +
  geom_col(mapping = aes(x = fct_reorder(region, order_pollution_group), y = val, fill = type_of_prop), position = position_dodge(), width = 0.4) +
  scale_y_continuous(breaks = seq(0, 100, 10)) +
  scale_fill_manual(values = c("tot_pop_prop" = "grey", "tot_studies_prop" = "cornflowerblue"), labels = c("Proportion of World Population in Bucket", "Proportion of Studies Completed in Bucket")) + 
  labs(x = "Mean PM2.5 bucket (in µg/m³)",  y = "Percentage", fill = "", 
       caption = str_wrap("*This graph 'only' takes into account the PM2.5 specific studies. For multi-country (pooled) studies, it averages the mean PM2.5 values, across all countries."), width = 10) +
  theme_hc() +
  theme(axis.line.y = element_line(color = "black"), 
        axis.line.x = element_line(color = "black"), 
        plot.caption = element_text(size = 8, hjust = 0), 
        plot.caption.position = "plot")

# cartogram (using objects from the geographic distribution chunk)
foo <- world_shp_epi_color %>%
  st_as_sf()

# country level population
aqli_color_country_summary_population <- aqli_color %>%
  group_by(country) %>%
  mutate(pop_weights = population/sum(population, na.rm = TRUE), 
         pm2020_pop_weighted = pm2020*pop_weights) %>%
  summarise(avg_pm2.5_2020 = sum(pm2020_pop_weighted, na.rm = TRUE), 
            tot_pop = sum(population, na.rm = TRUE)) %>%
  select(country, tot_pop)

foo <- foo %>%
  left_join(aqli_color_country_summary_population, by = "country")


for(i in 1:nrow(foo)) {
  if(is.na(foo$num_studies[i])){
    foo$num_studies[i] <- (sample(1:20, 1))*10000
  } else {
    foo$num_studies[i] <- foo$num_studies[i]*10000
  }
}

foo <- foo %>% mutate(num_studies = ifelse(is.na(num_studies), 0, num_studies))

foo <-  foo %>% 
  mutate(num_studies_norm = num_studies/sum(num_studies, na.rm = TRUE), 
         num_studies_log = log10(num_studies + 1.00001))

foo <- foo %>%
  select(-geometry, geometry) %>%
  st_as_sf()

foo %>% 
  ggplot() + 
  geom_sf()

foo_transform <- st_transform(foo, 3857)

foo_transform_cont <- cartogram_cont(foo_transform,
                 weight = "num_studies_norm", itermax = 5)


  ggplot(foo_transform_cont) +
  geom_sf(mapping = aes(fill = num_studies))


foo_sp <- as(foo, "Spatial")



foo_sp_transform <- spTransform(foo_sp, CRS("+init=epsg:3395"))
foo_sp_transform_cont <- cartogram_cont(foo_sp_transform, "num_studies", itermax = 2)
tm_shape(foo_sp_transform_cont) + tm_polygons("total_pop", style = "jenks") +
   tm_layout(frame = FALSE, legend.position = c("left", "bottom"))


#> population pollution figure (version 2)-------------------------------
# pollution buckets on the x 


# adding pollution buckets
aqli_color_with_pol_buckets <- aqli_color %>%
  mutate(pol_buckets = ifelse(((pm2021 >= 0) & (pm2021 <= 5)), "0 - 5", NA),
         pol_buckets = ifelse(((pm2021 > 5) & (pm2021 <= 10)), ">5 - 10", pol_buckets),
         pol_buckets = ifelse(((pm2021 > 10) & (pm2021 <= 20)), ">10 - 20", pol_buckets),
         pol_buckets = ifelse(((pm2021 > 20) & (pm2021 <= 40)), ">20 - 40", pol_buckets),
         pol_buckets = ifelse(((pm2021 > 40) & (pm2021 <= 80)), ">40 - 80", pol_buckets),
         pol_buckets = ifelse(((pm2021 > 80)), ">80", pol_buckets)) %>%
  mutate(pol_bucket_order = ifelse(pol_buckets == "0 - 5", 1, NA),
         pol_bucket_order = ifelse(pol_buckets == ">5 - 10", 2, pol_bucket_order),
         pol_bucket_order = ifelse(pol_buckets == ">10 - 20", 3, pol_bucket_order),
         pol_bucket_order = ifelse(pol_buckets == ">20 - 40", 4, pol_bucket_order),
         pol_bucket_order = ifelse(pol_buckets == ">40 - 80", 5, pol_bucket_order),
         pol_bucket_order = ifelse(pol_buckets == ">80", 6, pol_bucket_order))

# converting the 2 newly created columns to type factor
aqli_color_with_pol_buckets$pol_buckets <- as.factor(aqli_color_with_pol_buckets$pol_buckets)
aqli_color_with_pol_buckets$pol_bucket_order <- as.factor(aqli_color_with_pol_buckets$pol_bucket_order)


foo <- aqli_color_with_pol_buckets %>%
  group_by(pol_buckets) %>%
  summarise(tot_pop = sum(population, na.rm = TRUE)) %>%
  ungroup() %>%
  mutate(pol_bucket_order = ifelse(pol_buckets == "0 - 5", 1, NA),
         pol_bucket_order = ifelse(pol_buckets == ">5 - 10", 2, pol_bucket_order),
         pol_bucket_order = ifelse(pol_buckets == ">10 - 20", 3, pol_bucket_order),
         pol_bucket_order = ifelse(pol_buckets == ">20 - 40", 4, pol_bucket_order),
         pol_bucket_order = ifelse(pol_buckets == ">40 - 80", 5, pol_bucket_order),
         pol_bucket_order = ifelse(pol_buckets == ">80", 6, pol_bucket_order)) %>%
  filter(!is.na(pol_buckets)) %>%
  mutate(tot_pop_millions = round(tot_pop/1000000, 1)) %>%
  ggplot() +
  geom_col(mapping = aes(x = forcats::fct_reorder(pol_buckets, pol_bucket_order), y = tot_pop_millions, fill = forcats::fct_reorder(pol_buckets, pol_bucket_order)), width = 0.2) +
  ggthemes::theme_hc() +
  labs(x =  expression("Annual Average" ~ PM[2.5] ~ ("in µg/m³")), y = "Total Population (in millions)",
       title = "", fill = expression("Annual Average" ~ PM[2.5] ~ "(in µg/m³)")) +
  theme(plot.title = element_text(hjust = 0.5, size = 12),
        axis.title.x = element_text(size = 9),
        axis.title.y = element_text(size = 9),
        legend.title = element_text(size = 8),
        legend.background = element_rect(color = "black")) +
  viridis::scale_fill_viridis(option = "magma", direction = -1, discrete = TRUE) +
  scale_y_continuous(breaks = seq(0, 3000, 500))


# doing a similar thing with the epi dataset to get the total number of studies in different pollution buckets.
epi_with_pol_buckets <- epi %>%
  mutate(pol_buckets = ifelse(((mean_pm2.5 >= 0) & (mean_pm2.5 <= 5)), "0 - 5", NA),
         pol_buckets = ifelse(((mean_pm2.5 > 5) & (mean_pm2.5 <= 10)), ">5 - 10", pol_buckets),
         pol_buckets = ifelse(((mean_pm2.5 > 10) & (mean_pm2.5 <= 20)), ">10 - 20", pol_buckets),
         pol_buckets = ifelse(((mean_pm2.5 > 20) & (mean_pm2.5 <= 40)), ">20 - 40", pol_buckets),
         pol_buckets = ifelse(((mean_pm2.5 > 40) & (mean_pm2.5 <= 80)), ">40 - 80", pol_buckets),
         pol_buckets = ifelse(((mean_pm2.5 > 80)), ">80", pol_buckets)) %>%
  mutate(pol_bucket_order = ifelse(pol_buckets == "0 - 5", 1, NA),
         pol_bucket_order = ifelse(pol_buckets == ">5 - 10", 2, pol_bucket_order),
         pol_bucket_order = ifelse(pol_buckets == ">10 - 20", 3, pol_bucket_order),
         pol_bucket_order = ifelse(pol_buckets == ">20 - 40", 4, pol_bucket_order),
         pol_bucket_order = ifelse(pol_buckets == ">40 - 80", 5, pol_bucket_order),
         pol_bucket_order = ifelse(pol_buckets == ">80", 6, pol_bucket_order))



epi_with_pol_buckets$pol_buckets <- as.factor(epi_with_pol_buckets$pol_buckets)
epi_with_pol_buckets$pol_bucket_order <- as.factor(epi_with_pol_buckets$pol_bucket_order)

foo1 <- epi_with_pol_buckets %>%
  group_by(pol_buckets) %>%
  summarise(total_studies = n()) %>%
  ungroup() %>%
  mutate(pol_bucket_order = ifelse(pol_buckets == "0 - 5", 1, NA),
         pol_bucket_order = ifelse(pol_buckets == ">5 - 10", 2, pol_bucket_order),
         pol_bucket_order = ifelse(pol_buckets == ">10 - 20", 3, pol_bucket_order),
         pol_bucket_order = ifelse(pol_buckets == ">20 - 40", 4, pol_bucket_order),
         pol_bucket_order = ifelse(pol_buckets == ">40 - 80", 5, pol_bucket_order),
         pol_bucket_order = ifelse(pol_buckets == ">80", 6, pol_bucket_order)) %>%
  filter(!is.na(pol_buckets)) %>%
  add_row(pol_bucket_order = 1, pol_buckets = "0 - 5", total_studies = 0) %>%
  add_row(pol_bucket_order = 6, pol_buckets = ">80", total_studies = 0) %>%
  ggplot() +
  geom_col(mapping = aes(x = forcats::fct_reorder(pol_buckets, pol_bucket_order), y = total_studies, fill = forcats::fct_reorder(pol_buckets, pol_bucket_order)), width = 0.2) +
  ggthemes::theme_hc() +
  labs(x =  "", y = "Total number of epi studies",
       title = "", fill = expression("Annual Average" ~ PM[2.5] ~ "(in µg/m³)")) +
  theme(plot.title = element_text(hjust = 0.5, size = 12),
        axis.title.x = element_text(size = 9),
        axis.title.y = element_text(size = 9),
        legend.title = element_text(size = 8),
        legend.background = element_rect(color = "black")) +
  viridis::scale_fill_viridis(option = "magma", direction = -1, discrete = TRUE) +
  theme(legend.position = "none")



gridExtra::grid.arrange(foo1, foo)

#> Template for adding a red line that goes over all panels in the continent wise panel plot-----------

# convert the contnent wise plot into a grob
# plt_cont_grob <- ggplotGrob(plt_cont)
# 
# # get the panel grobs
# panels <- getGrob(plt_cont_grob, "panel", grep = TRUE)
# 
# # add a horizontal line at y = 0 on top of each panel
# for (i in 1:6) {
#   vp <- viewport(layout.pos.row = i, layout.pos.col = 1)
#   pushViewport(vp)
#   grid.lines(x = c(unit(0.1, "npc"), unit(1, "npc")),
#              y = unit(0.82, "npc"),
#              gp = gpar(col = "darkred", lwd = 1),
#              arrow = arrow(length = unit(0.15, "in"), type = "closed", ends = "last",
#                            angle = 30))
#   popViewport()
# }


#-------------------------------------

#> Countries withing continent wise plot of studies in form of a bar graphs (with countries not labelled)-------

# generate plot dataset
plot_data <- epi %>%
  group_by(continent, country) %>%
  summarise(count_studies = n(), .groups = "keep")


# plot the countries within continent plot
plot_data %>%
  ggplot() +
  geom_col(mapping = aes(x = continent, y = count_studies, fill = continent), position = position_dodge2(preserve = "single"))


#> Treemaps of the number of studies that have happened in each continent by country (used plot_data object from above)-----------------

plt1 <- ggplot(plot_data %>% filter(continent == "Asia"), aes(area = count_studies, fill = country, label = count_studies)) +
  geom_treemap() +
  viridis::scale_fill_viridis(discrete = TRUE) +
  geom_treemap_text(color = "white") +
  labs(title = "Asia (24 times was part of a epi study)", fill = "Country") +
  theme(legend.position = "bottom",
        legend.background = element_rect(color = "black"),
        plot.title = element_text(hjust = 0.5),
        plot.background = element_rect(fill = "white"))

plt2 <- ggplot(plot_data %>% filter(continent == "Africa"), aes(area = count_studies, fill = country, label = count_studies)) +
  geom_treemap() +
  viridis::scale_fill_viridis(discrete = TRUE) +
  geom_treemap_text(color = "white") +
  labs(title = "Africa (0)", fill = "Country") +
  theme(legend.position = "bottom",
        legend.background = element_rect(color = "black"),
        plot.title = element_text(hjust = 0.5),
        plot.background = element_rect(fill = "white"))

plt3 <- ggplot(plot_data %>% filter(continent == "South America"), aes(area = count_studies, fill = country, label = count_studies)) +
  geom_treemap() +
  viridis::scale_fill_viridis(discrete = TRUE) +
  geom_treemap_text(color = "white") +
  labs(title = "South America (0)", fill = "Country") +
  theme(legend.position = "bottom",
        legend.background = element_rect(color = "black"),
        plot.title = element_text(hjust = 0.5),
        plot.background = element_rect(fill = "white"))

plt4 <- ggplot(plot_data %>% filter(continent == "Europe"), aes(area = count_studies, fill = country, label = count_studies)) +
  geom_treemap() +
  viridis::scale_fill_viridis(discrete = TRUE) +
  geom_treemap_text(color = "white") +
  labs(title = "Europe (15)", fill = "Country") +
  theme(legend.position = "bottom",
        legend.background = element_rect(color = "black"),
        plot.title = element_text(hjust = 0.5),
        plot.background = element_rect(fill = "white"))

plt5 <- ggplot(plot_data %>% filter(continent == "North America"), aes(area = count_studies, fill = country, label = count_studies)) +
  geom_treemap() +
  viridis::scale_fill_viridis(discrete = TRUE) +
  geom_treemap_text(color = "white") +
  labs(title = "North America (49)", fill = "Country") +
  theme(legend.position = "bottom",
        legend.background = element_rect(color = "black"),
        plot.title = element_text(hjust = 0.5),
        plot.background = element_rect(fill = "white"))


plt6 <- ggplot(plot_data %>% filter(continent == "Oceania"), aes(area = count_studies, fill = country, label = count_studies)) +
  geom_treemap() +
  viridis::scale_fill_viridis(discrete = TRUE) +
  geom_treemap_text(color = "white") +
  labs(title = "Oceania (0)", fill = "Country") +
  theme(legend.position = "bottom",
        legend.background = element_rect(color = "black"),
        plot.title = element_text(hjust = 0.5),
        plot.background = element_rect(fill = "white"))

plt_final <- gridExtra::grid.arrange(plt1, plt2, plt3, plt4, plt5, plt6, nrow = 1, top = "Number of times a country has been included in an epi study (Continent Wise)")

plt_final + ggtitle("foo")


#> Population pollution figure (v3): trying a continuous but binned scale and adding Ken's figure----------

# population v/s pollution 
foo <- aqli_color %>%
  group_by(pm2021) %>%
  summarise(count_pm2021 = sum(population, na.rm = TRUE),
            count_pm2021_millions = round(count_pm2021/1000000, 1)) %>%
  ggplot() +
 geom_hline(yintercept = c(300, 600, 900, 1200), linetype = "dashed", color = "lavenderblush2") +
  geom_histogram(mapping = aes(x = pm2021, weight = count_pm2021_millions), fill = "#8FA1AD", color = "white", alpha = 0.7, width = 0.5) + 
  ggthemes::theme_hc() +
  labs(x = expression("Annual Average" ~ PM[2.5] ~ "(in µg/m³)"),
       y = "Population (in millions)") +
  scale_y_continuous(breaks = seq(0, 1500, 300)) +
  scale_x_continuous(breaks = seq(0, 140, 15), limits = c(0, 140)) +
  theme(axis.line = element_line(),
        panel.grid.major.y = element_blank(),
        axis.title = element_text(size = 11)) 

foo_v2 <- aqli_color %>%
  group_by(pm2021) %>%
  summarise(count_pm2021 = sum(population, na.rm = TRUE),
            count_pm2021_millions = round(count_pm2021/1000000, 1)) %>%
  ggplot() +
 geom_hline(yintercept = c(500, 1000, 1500, 2000, 2500), linetype = "dashed", color = "lavenderblush2") +
  geom_col(mapping = aes(x = pm2021, y = count_pm2021_millions), fill = "#8FA1AD", alpha = 0.7, width = 0.9) + 
  ggthemes::theme_hc() +
  labs(x = expression("Annual Average" ~ PM[2.5] ~ "(in µg/m³)"),
       y = "Population (in millions)") +
  scale_x_binned(breaks = seq(0, 140, 15), limits = c(0, 140)) +
  scale_y_continuous(breaks = seq(0, 2500, 500)) +
  theme(axis.line = element_line(),
        panel.grid.major.y = element_blank(),
           axis.text.y = element_text(size = 13),
        axis.text.x = element_text(size = 13),
        axis.title = element_text(size = 15)) 


# pollution v/s number of studies
foo1 <- epi %>%
  group_by(mean_pm2.5) %>%
  summarise(count_mean_pm2.5 = n()) %>%
  ggplot() +
     geom_hline(yintercept = c(5, 10, 15, 20), linetype = "dashed", color = "lavenderblush2") +
  geom_histogram(mapping = aes(x = mean_pm2.5, weight = count_mean_pm2.5), fill = "#343434", color = "white", alpha = 0.7, width = 0.5) +  ggthemes::theme_hc() +
  labs(x = "",
       y = "Number of epi papers") +
  scale_y_continuous(breaks = seq(0, 30, 5)) +
  scale_x_continuous(breaks = seq(0, 140, 15), limits = c(0, 140)) +
  theme(axis.line = element_line(),
        panel.grid.major.y = element_blank(),
        axis.title = element_text(size = 11))


foo1_v2 <-  epi %>%
  group_by(mean_pm2.5) %>%
  summarise(count_mean_pm2.5 = n()) %>%
  ggplot() +
     geom_hline(yintercept = c(10, 20, 30, 40), linetype = "dashed", color = "lavenderblush2") +
  geom_col(mapping = aes(x = mean_pm2.5, y = count_mean_pm2.5), fill = "#343434", alpha = 0.7, width = 0.9) +  ggthemes::theme_hc() +
  labs(x = "",
       y = "Number of epi papers") +
  scale_y_continuous(breaks = seq(0, 40, 10)) +
  scale_x_binned(breaks = seq(0, 140, 15), limits = c(0, 140)) +
  theme(axis.line = element_line(),
        panel.grid.major.y = element_blank(), 
           axis.text.y = element_text(size = 13),
        axis.text.x = element_text(size = 13),
        axis.title = element_text(size = 15),)


gridExtra::grid.arrange(foo1, foo)

## Ken's figure

# get 4 datasets corresponding to gemm/ier/aqli/meta and for each keep concentrations till 120 micrograms per cubic meter and keep only relevant columns. and so some basic cleaning
gemm <- haven::read_dta("./data-raw/pm2.5_distribution/gemm.dta")
gemm <- gemm %>%
  filter(pm_gemm <= 120) %>%
  select(pm_gemm, gemm_lyl) %>%
  mutate(method = "GEMM") %>%
  rename(pm_val = pm_gemm, 
         lyl_val = gemm_lyl)

# stupid adjustment
gemm[1:3, 2] <- c(-0.49, -0.392, -0.294)

ier2019 <- haven::read_dta("./data-raw/pm2.5_distribution/ier_2019.dta")
ier2019 <- ier2019 %>% 
  filter(pm_ier_2019 <= 120) %>%
  select(pm_ier_2019, ier_2019_lyl) %>%
  mutate(method = "IER-2019") %>%
  rename(pm_val = pm_ier_2019, 
         lyl_val = ier_2019_lyl)

meta <- haven::read_dta("./data-raw/pm2.5_distribution/meta.dta")
meta <- meta %>% 
  filter(pm_meta <= 120) %>%
  select(pm_meta, meta_lyl) %>%
    mutate(method = "META") %>%
  rename(pm_val = pm_meta, 
         lyl_val = meta_lyl)

aqli <- tibble(pm_val = 0:120, lyl_val = (pm_val - who_pm2.5_guideline)*0.098, method = "AQLI")
#   mutate(lyl_val = ifelse(lyl_val < 0, 0, lyl_val))

## calculating empirical versions of above methods

# Approximating a derivative function of gemm
gemm_empirical <- splinefun(x  = gemm$pm_val, 
                                    y = gemm$lyl_val)

# Approximating a derivative function of gemm
aqli_empirical <- splinefun(x  = aqli$pm_val, 
                                    y = aqli$lyl_val)

# meta empirical
meta_empirical <- splinefun(x  = meta$pm_val, 
                                    y = meta$lyl_val)

# ier 2019 empirical

ier2019_empirical <- splinefun(x  = ier2019$pm_val, 
                                    y = ier2019$lyl_val)

## plot the 4 curves (empirical version)

foo3 <- ggplot(data = tibble(x = 1:120), mapping = aes(x = x)) +
      #geom_hline(yintercept = seq(0, 0.175, 0.05), linetype = "dashed", color = "lavenderblush2") +
  geom_function(fun = gemm_empirical, size = 1.2, args = list(deriv = 1), mapping = aes(color = "#5B96AD")) + 
  geom_function(fun = aqli_empirical, size = 1.2, args = list(deriv = 1), mapping = aes(color = "#8F3931")) +
   geom_function(fun = meta_empirical, size = 1.2, args = list(deriv = 1), mapping = aes(color = "#91AB5A")) +
  geom_function(fun = ier2019_empirical, size = 1.2, args = list(deriv = 1), mapping = aes(color = "#767676")) +
  scale_y_continuous(breaks = seq(0, 0.2, 0.05), limits = c(0, .2)) +
  scale_x_continuous(breaks = seq(0, 140, 15), limits = c(0, 140)) + 
  labs(x = "", y = "Marginal Life Expectancy Lost", color = "", 
       title = str_wrap("PM2.5 exposure in comparison with population and number of epi papers and alternative approaches to measure PM2.5 Life Expectancy relationship.", width = 90)) + 
  scale_color_identity(breaks = c("#8F3931", "#91AB5A", "#5B96AD", "#767676"),
                       labels = c("AQLI", "META", "GEMM", "IER-2019"),
                       guide = "legend") +
  ggthemes::theme_hc() %+replace%
  theme(legend.position = c(0.8, 0.9), 
        legend.direction = "horizontal",
        axis.line.x = element_line(), 
        axis.line.y = element_line(),
        plot.title = element_text(hjust =  0.5, size = 16, color = "black"), 
        axis.text.y = element_text(size = 13),
        axis.text.x = element_text(size = 13),
        axis.title = element_text(size = 15),
        legend.text = element_text(size = 10),
        plot.caption = element_text(face = "italic", hjust = 0), 
        legend.title = element_text(hjust = 0.5), 
        panel.grid.major.y = element_blank(), 
        legend.key.size = unit(0.5, "cm")) 

gridExtra::grid.arrange(foo3,foo1_v2, foo_v2, heights = c(1, 1, 1))

# Some stats for inputting China's positive story in the epi piece----------------------

# cumulative sum of all epi studies in China over the years
cum_sum_studies_chronological <- epi %>% 
  filter(country == "China") %>% 
  arrange(publishing_year) %>%
  count(country, publishing_year) %>% 
  pull(n) %>% 
  cumsum() %>%
  as_tibble() 

# naming the column for cumulative studies
colnames(cum_sum_studies_chronological) <- "cumulative_sum_studies"

# adding the cumulative study column to the summary
china_studies_year_summary <- cbind(epi %>% filter(country == "China") %>% arrange(publishing_year) %>% count(country, publishing_year), cum_sum_studies_chronological)

# adding pre-2010 (up till 1998) studies (total = 0) to the above tibble
china_studies_pre2010 <- tibble(country = "China", publishing_year = 1998:2009, n = 0, cumulative_sum_studies = 0)

china_studies_year_summary <- china_studies_year_summary %>%
  bind_rows(china_studies_pre2010)

foo <- china_studies_year_summary %>% 
  pivot_longer(n:cumulative_sum_studies, names_to = "sum_type", values_to = "value") %>%
  mutate(sum_type = ifelse(sum_type == "n", "Number of studies", "Cumulative sum of studies")) %>%
  ggplot() +
  geom_line(mapping = aes(x = publishing_year, y = value, color = sum_type), lwd = 1) +
  scale_x_continuous(breaks = c(seq(1998, 2020, 2), 2021, 2023), limits = c(1998, 2023)) +
  scale_y_continuous(breaks = seq(0, 20, 2)) +
  labs(x = "Publishing Year", y = "Studies Count", 
       title = "Epi air pollution studies published in China, 1998 to 2023", 
       color = "", 
       caption = "* Sources: (a) Does the Squeaky Wheel Get More Grease? The Direct and Indirect Effects of Citizen Participation on Environmental Governance in China, Buntaine et al. (b) AQLI epi meta analysis.") +
  ggthemes::theme_tufte() +
  theme(plot.title = element_text(size = 22, hjust = 0.5, margin = margin(b = 0.8, unit = "cm")),
        plot.subtitle = element_text(size = 13, hjust = 0.5, margin = margin(b = 0.5, unit = "cm")),
        axis.title.x = element_text(size = 15, margin = margin(t = 0.7, b = 1, unit = "cm")), 
        axis.title.y = element_text(size = 15, margin = margin(r = 0.7, unit = "cm")), 
        axis.ticks.y = element_blank(), 
        axis.text = element_text(size = 14), 
        legend.position = "bottom", 
        legend.box.background = element_rect(color = "black"),
        legend.text = element_text(size = 15),
        plot.caption = element_text(size = 9.5, hjust = 0, face = "italic", margin = margin(t = 1, b = 2, unit = "cm"))) + 
   annotate("segment", x = 2013, xend = 2013, y = 0, yend = 20, linetype = "dashed") +
   annotate("segment", x = 2014, xend = 2014, y = 0, yend = 20, linetype = "dashed") +
   annotate("segment", x = 2017, xend = 2017, y = 0, yend = 20, linetype = "dashed") +
    annotate("segment", x = 2017, xend = 2017, y = 0, yend = 20, linetype = "dashed") +
  annotate("point", x = 2023, y = 16, label = "2023", size = 3) +
  annotate("point", x = 2023, y = 1, label = "2023", size = 3, color = "yellow") +
  viridis::scale_color_viridis(discrete = TRUE)
# annotate("text", x = 2022, y = 45, label = "Satellite data not available") +
#   annotate("rect", xmin = 2021, xmax = 2023, ymin = 60, ymax = 60)


# China's pollution over time 1998 to 2021
foo1 <- aqli_color %>%
  filter(country == "China") %>% 
  group_by(country) %>%
  mutate(pop_weights = population/sum(population, na.rm = TRUE)) %>%
   dplyr::mutate(across(dplyr::starts_with("pm"), ~(.x*pop_weights), .names = "{col}_pop_weighted")) %>%
    dplyr::summarise(across(dplyr::contains("pop_weighted"), ~(round(sum(.x, na.rm = TRUE), 2)), .names = "avg_{col}"),
                     total_population = sum(population, na.rm = TRUE), objectid_color = objectid_color[1], whostandard = whostandard[1], .groups = "keep") %>%
  dplyr::rename_with(~str_replace_all(.x, "(_pop_weighted)|(avg_)", ""), dplyr::contains("_pop_weighted")) %>%
  pivot_longer(pm1998:pm2021, names_to = "year", values_to = "pm2.5_value") %>%
  mutate(year = as.numeric(str_remove(year, "pm"))) %>%
  select(country, year, pm2.5_value) %>%
   ggplot2::ggplot() +
      ggplot2::geom_line(mapping = ggplot2::aes(x = year, y = pm2.5_value), lwd = 1.1,
                color = "darkred") +
  geom_hline(mapping = aes(yintercept = 35), linetype = "dashed") +
      ggplot2::scale_x_continuous(breaks = c(seq(1998, 2020, 2), 2021, 2023), limits = c(1998, 2023)) +
      scale_y_continuous(breaks = seq(0, 60, 10), limits = c(0, 60)) +
      ggthemes::theme_tufte() +
   theme(plot.title = element_text(size = 22, hjust = 0.5, margin = margin(b = 2.5, unit = "cm")),
        plot.subtitle = element_text(size = 13, hjust = 0.5, margin = margin(b = 0.5, unit = "cm")),
        axis.title.x = element_text(size = 15, margin = margin(t = 0.7, b = 0.5, unit = "cm")), 
        axis.title.y = element_text(size = 15, margin = margin(r = 0.7, unit = "cm")), 
        axis.ticks.y = element_blank(), 
        axis.text = element_text(size = 14), 
        legend.position = "bottom", 
        legend.text = element_text(size = 15),
        legend.box.background = element_rect(color = "black"),
        plot.caption = element_text(size = 9.5, hjust = 0, face = "italic", margin = margin(t = 0.8, b = 1.5, unit = "cm"))) +
      ggplot2::labs(x = "Years",
           y = "Annual Average PM2.5 concentration (in µg/m³)", 
           title = expression(PM[2.5] ~ "pollution in China, 1998 to 2021")) +
  annotate("text", x = 2018, y = 58, label = "War on pollution begins", size = 5) +
   annotate("text", x = 2023.5, y = 15, label = "2023", size = 3) +
   annotate("text", x = 2027, y = 27, label = "", size = 3) +
   annotate("text", x = 2009.2, y = 20, label = str_wrap("Prefectural EPAs publicize CEMS data", width = 20), size = 5) +
  annotate("segment", x = 2011, xend = 2013, y = 18, yend = 18) +
   annotate("segment", x = 2014, xend = 2016, y = 58, yend = 58) +
   annotate("segment", x = 2014, xend = 2014, y = 0, yend = 58, linetype = "dashed") +
   annotate("segment", x = 2013, xend = 2013, y = 0, yend = 18, linetype = "dashed") +
   annotate("text", x = 2021, y = 45, label = str_wrap("All prefectural cities operating official Weibo and WeChat accounts", width = 30), size = 5) +
   annotate("segment", x = 2017, xend = 2017, y = 0, yend = 45, linetype = "dashed") +
     annotate("text", x = 2006, y = 37, label = expression("National" ~ PM[2.5] ~ "standard: 35 µg/m³"), size = 4) +
   annotate("segment", x = 2017, xend = 2018.5, y = 45, yend = 45) 


plt <- gridExtra::grid.arrange(foo1, foo)

aqli-epic/epi.meta.analysis documentation built on July 2, 2023, 4:18 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

aqli-epic/epi.meta.analysis
An Amazing Shiny App

In aqli-epic/epi.meta.analysis: An Amazing Shiny App

Epidemiological Studies: Meta Analysis

Major Takeaways

Purpose

Why do epidemiological studies on air pollution and mortality matter?

Results

PM~2.5~ concentration range and the Global Population Distribution

Geographic Distribution of Studies

AQ Epi studies over time

Distribution of duration of study by continent

Conclusion

Methodology

Inclusion/Exclusion Criteria for studies

Other important points

How can you (the community) help in improving this analysis?

References

Other Graphs and Interactive Summary Dashboard

R Package Documentation

Browse R Packages

We want your feedback!

aqli-epic/epi.meta.analysis An Amazing Shiny App

In aqli-epic/epi.meta.analysis: An Amazing Shiny App

Epidemiological Studies: Meta Analysis

Major Takeaways

Purpose

Why do epidemiological studies on air pollution and mortality matter?

Results

PM~2.5~ concentration range and the Global Population Distribution

Geographic Distribution of Studies

AQ Epi studies over time

Distribution of duration of study by continent

Conclusion

Methodology

Inclusion/Exclusion Criteria for studies

Other important points

How can you (the community) help in improving this analysis?

References

Other Graphs and Interactive Summary Dashboard

R Package Documentation

Browse R Packages

We want your feedback!

aqli-epic/epi.meta.analysis
An Amazing Shiny App