knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.path = "man/figures/README-",
  out.width = "100%"
)
# load libraries
library(readr)
library(dplyr)
library(stringr)
library(magrittr)
library(ggplot2)
library(tidytext)
library(tidyr)
library(tidyverse)
library(ggthemes)
library(sf)
library(rnaturalearth)
library(rnaturalearthdata)
library(mapsf)
library(colorspace)
library(shiny)
library(shinydashboard)
library(DT)
library(treemapify)
# global variables
who_pm2.5_guideline <- 5

# global operations
`%notin%` <- Negate(`%in%`)

# epi studies analysis raw sheet
epi <- readxl::read_xlsx("./data-raw/pm2.5_distribution/AQLI_Epidemiology Literature Research.xlsx",
                         sheet = "AnalysisDatasetPM2.5MortalityAn")

#> change default column types
epi$cohort_size <- as.numeric(epi$cohort_size)
epi$study_start_year <- as.numeric(epi$study_start_year)
epi$study_end_year <- as.numeric(epi$study_end_year)
epi$pm2.5_exposure_ll <- as.numeric(epi$pm2.5_exposure_ll)
epi$pm2.5_exposure_ul <- as.numeric(epi$pm2.5_exposure_ul)
epi$mean_pm2.5 <- as.numeric(epi$mean_pm2.5)
epi$sd_pm2.5 <- as.numeric(epi$sd_pm2.5)
epi$cohort_age_ll <- as.numeric(epi$cohort_age_ll)
epi$cohort_age_ul <- as.numeric(epi$cohort_age_ul)

# AQLI color file and gadm0 file
aqli_color <- read_csv("./data-raw/pm2.5_distribution/[june2023]gadm2_aqli_2021_post_waterbody_adj_finalized_internal.csv")
gadm0_aqli_2021 <- read_csv("./data-raw/pm2.5_distribution/[june2023]gadm0_aqli_2021_post_waterbody_adj_finalized_internal.csv")

# aqli_color <- aqli_color %>%
#   dplyr::rename_with(~str_replace(.x, "who", "llpp_who_"), dplyr::contains("who")) %>%
#   dplyr::rename_with(~str_replace(.x, "nat", "llpp_nat_"), dplyr::contains("nat")) %>%
#   dplyr::rename(country = name0) %>%
#   dplyr::mutate(whostandard = who_pm2.5_guideline) %>%
#   dplyr::select(objectid_gadm2:population, whostandard, everything())

aqli_color <- aqli_color %>%
  dplyr::mutate(objectid_color = objectid_gadm2)

# getting a country-continent file
country_continent <- read_csv("./data-raw/pm2.5_distribution/country_continent.csv")

# adding a continent column to the color file
aqli_color <- aqli_color %>%
  left_join(country_continent, by = "country") %>%
  select(objectid_color, continent, country, everything())

gadm0_aqli_2021 <- gadm0_aqli_2021 %>%
  left_join(country_continent, by = "country")

#> Filling in missing continents-----------------------------------------------

# countries for which continent is NA: for these, fill in the continent manually
countries_with_missing_continent <- gadm0_aqli_2021 %>%
  filter(is.na(continent)) %>%
  pull(country) %>%
  unique()

# continent fill-in for missing countries
continents_for_missing_countries <- c("Africa", "Europe", "North America", "Asia", "Asia",
                                      "Oceania", "Africa", "Africa", "Africa", "Asia")

# [CAUTION: perform a sanity check on the above 2 vectors and how they map countries to continents before proceeding]

# creating a data frame using the above 2 vectors as columns
missing_continents_df <- tibble(country = countries_with_missing_continent,
                                continent = continents_for_missing_countries)

# adding in the missing continent information in the gadmx_aqli_2021 datasets
aqli_color <- aqli_color %>%
  left_join(missing_continents_df, by = "country") %>%
  mutate(continent = ifelse(is.na(continent.x), continent.y, continent.x))

#> setting report parameters
min_sample_size <- 1000
min_study_duration_in_years <- 1

#> Filtering the epi database (minimum cohort size: >= 1000, minimum study duration: >= 1 year)
epi <- epi %>% mutate(study_duration = (study_end_year - study_start_year) + 1) %>% filter(cohort_size >= min_sample_size, study_duration >= min_study_duration_in_years, !is.na(cohort_uid)) %>% distinct(cohort_uid, .keep_all = TRUE) # creating this temp dataset for the continent panel dataset epi_tmp <- epi %>% mutate(continent = ifelse(country == "China", "China", continent), continent = ifelse(continent == "Asia", "Asia - China", continent)) # continent list (specifucally for the continent panel graph, that contains the Asia and Asia - China panels, and considers them as continents in a # separate epi_tmp dataset, that is specially created for this graph) continent_list <- c("China", "Asia - China", "Africa", "North America", "South America", "Europe", "Oceania") # missing continents missing_continents_logical <- continent_list %notin% unique(epi_tmp$continent) missing_continents <- continent_list[missing_continents_logical] #> Calculations needed for all sections above the "Results section". # percent of population not in compliance with the WHO guideline num_people_above_who <- aqli_color %>% filter(pm2020 > who_pm2.5_guideline) %>% summarise(tot_pop = sum(population, na.rm = TRUE)) %>% unlist() # world population world_pop <- aqli_color %>% summarise(tot_pop = sum(population, na.rm = TRUE)) %>% unlist() percent_people_above_who <- round((num_people_above_who/world_pop)*100, 1) # pop pollution graph with 2 buckets: function. pop_pol_graph_2_buckets <- function(aqli_color, epi, thresh_ll, thresh_ul, pm2.5_col_name){ if(thresh_ll > thresh_ul){ stop("Lower Limit should be less than or equal to the Upper limit! Please try again!") } # aqli color population in ordered pm2.5 buckets aqli_color_grp_pm2.5_buckets <- aqli_color %>% dplyr::filter(!is.na(!!as.symbol(pm2.5_col_name))) %>% dplyr::mutate(region = ifelse((!!as.symbol(pm2.5_col_name) >= thresh_ll) & (!!as.symbol(pm2.5_col_name) <= thresh_ul), stringr::str_c(thresh_ll, "-", thresh_ul), !!as.symbol(pm2.5_col_name)), region = ifelse(!!as.symbol(pm2.5_col_name) > thresh_ul, stringr::str_c(">", thresh_ul), region), region = ifelse(!!as.symbol(pm2.5_col_name) < thresh_ll, stringr::str_c("<", thresh_ll), region)) %>% dplyr::group_by(region) %>% dplyr::summarise(tot_pop = sum(population, na.rm = TRUE)) %>% dplyr::ungroup() %>% dplyr::mutate(order_pollution_group = ifelse(region == stringr::str_c("<", thresh_ll), 1, 0), order_pollution_group = ifelse(region == stringr::str_c(thresh_ll, "-", thresh_ul), 2, order_pollution_group), order_pollution_group = ifelse(region == stringr::str_c(">", thresh_ul), 3, order_pollution_group)) %>% dplyr::ungroup() %>% dplyr::mutate(tot_pop_prop = (tot_pop/sum(tot_pop))*100) # epi total number of studies in ordered pollution buckets (initially filtering out pooled studies, will add back in using a for loop) epi_num_studies_grp_pm2.5_buckets <- epi %>% dplyr::filter(!is.na(mean_pm2.5), mean_pm2.5 != "NA", non_pm2.5 == 0) %>% dplyr::group_by(paper_uid) %>% dplyr::summarise(mean_pm2.5 = mean(mean_pm2.5, na.rm = TRUE)) %>% dplyr::mutate(region = ifelse(mean_pm2.5 >= thresh_ll & mean_pm2.5 <= thresh_ul, stringr::str_c(thresh_ll, "-", thresh_ul), mean_pm2.5), region = ifelse(mean_pm2.5 > thresh_ul, stringr::str_c(">", thresh_ul), region), region = ifelse(mean_pm2.5 < thresh_ll, stringr::str_c("<", thresh_ll), region)) %>% dplyr::mutate(order_pollution_group = ifelse(region == stringr::str_c("<", thresh_ll), 1, 0), order_pollution_group = ifelse(region == stringr::str_c(thresh_ll, "-", thresh_ul), 2, 
order_pollution_group), order_pollution_group = ifelse(region == stringr::str_c(">", thresh_ul), 3, order_pollution_group)) %>% dplyr::group_by(region) %>% dplyr::summarise(tot_studies = dplyr::n(), order_pollution_group = order_pollution_group[1]) %>% dplyr::ungroup() %>% dplyr::mutate(tot_studies_prop = (tot_studies/sum(tot_studies))*100) #> bar chart version # creating a master dataset for both population and studies data pop_epi_studies_data <- aqli_color_grp_pm2.5_buckets %>% dplyr::left_join(epi_num_studies_grp_pm2.5_buckets, by = c("region", "order_pollution_group")) %>% dplyr::select(region, order_pollution_group, tot_pop_prop, tot_studies_prop) %>% tidyr::pivot_longer(cols = tot_pop_prop:tot_studies_prop, names_to = "type_of_prop", values_to = "val") %>% dplyr::mutate(type_of_prop = ifelse(type_of_prop == "tot_pop_prop", "Percent Population in PM₂.₅ bucket", "Percent Studies in PM₂.₅ bucket")) pop_num_studies_in_pollution_buckets_graph <- pop_epi_studies_data %>% ggplot2::ggplot() + ggplot2::geom_col(mapping = aes(x = forcats::fct_reorder(region, order_pollution_group), y = val, fill = type_of_prop), position = position_dodge(), width = 0.3) + ggplot2::scale_y_continuous(breaks = seq(0, 100, 10)) + ggplot2::scale_fill_manual(values = c("tot_pop_prop" = "grey", "tot_studies_prop" = "cornflowerblue"), labels = c("Proportion of World Population in Bucket", "Proportion of Studies Completed in Bucket")) + ggplot2::labs(x = expression("Mean" ~ PM[2.5] ~ "bucket (in µg/m³)"), y = "Percentage", fill = "", title = expression("Percentage of Global population in different" ~ PM[2.5] ~ "buckets"), caption = stringr::str_wrap("*This graph 'only' takes into account the PM2.5 specific studies. For multi-country (pooled) studies, it averages the mean PM2.5 values, across all countries."), width = 10) + # ggplot2::geom_text(mapping = aes(x = forcats::fct_reorder(region, order_pollution_group), y = val, label = paste0(round(val, 1), "%")), position = ggplot2::position_dodge2(width = 1, preserve = "single"), vjust = -0.5, hjust = -0.2, size = 3) + ggthemes::theme_tufte() + ggplot2::theme(axis.line.y = element_line(color = "black"), axis.line.x = element_line(color = "black"), plot.caption = element_text(size = 8, hjust = 0, margin = margin(t = 0.7, unit = "cm")), plot.title = element_text(size = 16, margin = margin(b = 1, unit = "cm"), hjust = 0.5), plot.caption.position = "plot", axis.title.y = element_text(margin = margin(r = 0.5, unit = "cm"), size = 14), axis.title.x = element_text(margin = margin(t = 0.5, b = 0.5, unit = "cm"), size = 14), axis.text = element_text(size = 12), legend.box.background = element_rect(color = "black"), legend.position = "bottom", legend.text = element_text(size = 12)) + viridis::scale_fill_viridis(discrete = TRUE) return(pop_num_studies_in_pollution_buckets_graph) }
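For reference, here is a hypothetical call to the two-bucket helper defined above. The thresholds (25, 50) and the `pm2020` column name are illustrative choices, not the values used in the report; any PM~2.5~ column present in `aqli_color` works.

```r
# Illustrative call to the helper above: compare the share of world population
# with the share of PM2.5-specific studies in the "<25", "25-50" and ">50"
# mean-PM2.5 buckets, using the 2020 PM2.5 column of the AQLI color file.
pop_pol_graph_2_buckets(
  aqli_color     = aqli_color,
  epi            = epi,
  thresh_ll      = 25,
  thresh_ul      = 50,
  pm2.5_col_name = "pm2020"
)
```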
This analysis is our ongoing attempt to capture all epidemiological research studies (> `r min_sample_size` people; maximum sample size: `r round(max(epi$cohort_size, na.rm = TRUE)/1000000, 1)` million) that are relatively long term (> `r min_study_duration_in_years` year(s); maximum study duration: `r max(epi$study_duration, na.rm = TRUE)` years), use unique cohorts (by unique we mean that even if a given cohort is studied by different groups, over different years, or with different sample sizes, each instance is counted as a separate cohort rather than one), measure the impact of ambient PM~2.5~, PM~10~, TSP, or Ultrafine Particulate Matter on Mortality (cause-specific, all-cause, premature)/Life Expectancy, were published between `r min(epi$publishing_year)` and the present*, and are findable in the available peer-reviewed literature. Hereafter, we refer to these studies as "AQ epi studies" for short. As of now, this meta-analysis covers `r epi %>% distinct(paper_uid, .keep_all = TRUE) %>% nrow()` AQ epi studies, and it will be continually updated to incorporate new AQ epi studies.
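For reference, the inclusion filter applied in the setup chunk above looks like this (the `epi_included` name is only for illustration; the analysis code overwrites `epi` in place):

```r
# Inclusion filter (as applied in the setup chunk above): compute study duration,
# keep cohorts of at least `min_sample_size` people followed for at least
# `min_study_duration_in_years` year(s), and keep one row per cohort_uid.
epi_included <- epi %>%
  dplyr::mutate(study_duration = (study_end_year - study_start_year) + 1) %>%
  dplyr::filter(cohort_size >= min_sample_size,
                study_duration >= min_study_duration_in_years,
                !is.na(cohort_uid)) %>%
  dplyr::distinct(cohort_uid, .keep_all = TRUE)
```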
We are seeking to make this analysis as current, complete, and error-free as possible and view it as a continual work in progress. We would appreciate the air quality community’s comments, corrections, and suggestions. Please contact aqli-epic@uchicago.edu, leave a comment in this GitHub repository, or comment directly in the analysis dataset (more on this below).
53.57 percent of all AQ epi studies included in this analysis (45 studies) cover populations in the US, Canada, or both.

Of the total number of times a given continent’s population has been included in any given AQ epi study, North America dominates the rest of the continents, having been included 53.57 percent of the time (entirely due to studies in the US or Canada). Closely following North America, populations in Asia and Europe have been included in AQ epi studies 28.57 and 17.86 percent of the time, respectively. Africa, South America, and Oceania have seen zero long term (>= 1 year), large (>= 1000 people in the sample) AQ epi studies.

There have been no long term (>= 1 year), large (>= 1000 people in the sample) PM epi studies performed in Africa, South America, or Oceania, which together have a combined population of 1.84 billion people, or 23.8 percent of the Earth’s total human population.

In total, 7 “really” long term studies (> 25 years) have been performed, i.e. 8.33 percent of the total number of studies.

The following continents have seen at least 1 really long term study: North America and Europe.
The purpose of this analysis is to understand the landscape of epidemiological research on the relationship between PM~2.5~, PM~10~, TSP, or Ultrafine Particulate Matter and Mortality (all types, as specified above)/Life Expectancy, and to surface demographic, geographic, or other trends that may exist in the current state of the literature. While the overall arc of the relationship between these pollutants and human health is clear enough to take action, understanding such trends can help the field reflect on itself, take stock of any biases or gaps, and point toward future research and policy opportunities.
While global estimates of air pollution’s toll on public health vary, they all point in the same direction: air pollution poses one of the largest health risks on the planet to humans [Paper1, Paper2, Paper3, Paper4]. Epidemiological studies on air pollution and mortality help us understand the burden of air pollution on human health at global, national and regional levels. According to Vahlsing and Smith (2012), these sorts of studies can also help countries take policy action, pushing forward and shaping national-level ambient air quality standards.
The burden of air pollution across the world is also not uniform. While `r percent_people_above_who` percent of the world population is out of compliance with the latest WHO annual PM~2.5~ guideline of `r who_pm2.5_guideline` µg/m³, there is huge variation in the quality of the air one breathes.
#> mean pm2.5 distribution segregated by country (density plot): makes sense to count different country studies separately, as we are making country wise distributions, so we want to see all PM2.5 ranges covered in a given country. mean_pm2.5_country_wise_graph <- epi %>% filter(!is.na(mean_pm2.5), mean_pm2.5 != "NA", non_pm2.5 == 0) %>% ggplot() + geom_density(mapping = aes(x = mean_pm2.5, fill = country), alpha = 0.5, color = "black", position = "identity") + labs(x = expression(paste("Mean PM2.5 concentration (", mu, "g", "/", m^3, ")")), caption = str_wrap("*This graph 'only' takes into account the PM2.5 specific studies. For multi-country (pooled) studies, it averages the mean PM2.5 values, across all countries."), width = 10) + theme_hc() + theme(plot.caption = element_text(hjust = 0)) + viridis::scale_fill_viridis(discrete = TRUE) #> PM2.5 exposure range and population distribution # aqli color population in ordered pm2.5 buckets aqli_color_grp_pm2.5_buckets <- aqli_color %>% mutate(region = ifelse(pm2020 >= 0 & pm2020 <= 25, "0-25", pm2020), region = ifelse(pm2020 > 25 & pm2020 <= 50, ">25-50", region), region = ifelse(pm2020 > 50 & pm2020 <= 75, ">50-75", region), region = ifelse(pm2020 > 75 & pm2020 <= 100, ">75-100", region), region = ifelse(pm2020 > 100, ">100", region)) %>% group_by(region) %>% summarise(tot_pop = sum(population, na.rm = TRUE)) %>% ungroup() %>% mutate(order_pollution_group = ifelse(region == "0-25", 1, 0), order_pollution_group = ifelse(region == ">25-50", 2, order_pollution_group), order_pollution_group = ifelse(region == ">50-75", 3, order_pollution_group), order_pollution_group = ifelse(region == ">75-100", 4, order_pollution_group), order_pollution_group = ifelse(region == ">100", 5, order_pollution_group)) %>% ungroup() %>% mutate(tot_pop_prop = (tot_pop/sum(tot_pop))*100) # epi total number of studies in ordered pollution buckets (initially filtering out pooled studies, will add back in using a for loop) epi_num_studies_grp_pm2.5_buckets <- epi %>% filter(!is.na(mean_pm2.5), mean_pm2.5 != "NA", non_pm2.5 == 0) %>% group_by(paper_uid) %>% summarise(mean_pm2.5 = mean(mean_pm2.5, na.rm = TRUE)) %>% mutate(region = ifelse(mean_pm2.5 >= 0 & mean_pm2.5 <= 25, "0-25", mean_pm2.5), region = ifelse(mean_pm2.5 > 25 & mean_pm2.5 <= 50, ">25-50", region), region = ifelse(mean_pm2.5 > 50 & mean_pm2.5 <= 75, ">50-75", region), region = ifelse(mean_pm2.5 > 75 & mean_pm2.5 <= 100, ">75-100", region), region = ifelse(mean_pm2.5 > 100, ">100", region)) %>% mutate(order_pollution_group = ifelse(region == "0-25", 1, 0), order_pollution_group = ifelse(region == ">25-50", 2, order_pollution_group), order_pollution_group = ifelse(region == ">50-75", 3, order_pollution_group), order_pollution_group = ifelse(region == ">75-100", 4, order_pollution_group), order_pollution_group = ifelse(region == ">100", 5, order_pollution_group)) %>% group_by(region) %>% summarise(tot_studies = n(), order_pollution_group = order_pollution_group[1]) %>% ungroup() %>% mutate(tot_studies_prop = (tot_studies/sum(tot_studies))*100) #> bar chart version # creating a master dataset for both population and studies data pop_epi_studies_data <- aqli_color_grp_pm2.5_buckets %>% left_join(epi_num_studies_grp_pm2.5_buckets, by = c("region", "order_pollution_group")) %>% select(region, order_pollution_group, tot_pop_prop, tot_studies_prop) %>% pivot_longer(cols = tot_pop_prop:tot_studies_prop, names_to = "type_of_prop", values_to = "val") # plotting the bar graph pop_num_studies_in_pollution_buckets_graph <- 
pop_epi_studies_data %>% ggplot() + geom_col(mapping = aes(x = fct_reorder(region, order_pollution_group), y = val, fill = type_of_prop), position = position_dodge(), width = 0.7) + scale_y_continuous(breaks = seq(0, 100, 10)) + scale_fill_manual(values = c("tot_pop_prop" = "grey", "tot_studies_prop" = "cornflowerblue"), labels = c("Proportion of World Population in Bucket", "Proportion of Studies Completed in Bucket")) + labs(x = "Mean PM2.5 bucket (in µg/m³)", y = "Percentage", fill = "", caption = str_wrap("*This graph 'only' takes into account the PM2.5 specific studies. For multi-country (pooled) studies, it averages the mean PM2.5 values, across all countries."), width = 10) + theme_hc() + theme(axis.line.y = element_line(color = "black"), axis.line.x = element_line(color = "black"), plot.caption = element_text(size = 8, hjust = 0), plot.caption.position = "plot") + viridis::scale_fill_viridis(discrete = TRUE) #> other calculations for this block of code #> total pop where pollution is > 50 micrograms per cubic meter # thresholds pm2.5_thresh_highly_polluted <- 50 pm2.5_thresh_severly_polluted <- 75 pm2.5_thresh_low_polluted <- 25 # number of PM2.5 specific studies epi_pm2.5_spec_studies_total <- epi %>% distinct(paper_uid, .keep_all = TRUE) %>% filter(non_pm2.5 == 0) %>% nrow() # higher than hp # total pop where pollution is > x micrograms per cubic meter (hp) total_pop_above_x_hp_micr_gm <- aqli_color %>% filter(pm2020 > pm2.5_thresh_highly_polluted) %>% summarise(tot_pop = sum(population, na.rm = TRUE)) %>% unlist() # percent pop where pollution is > x micrograms per cubic meter (hp) perc_pop_above_x_hp_micr_gm <- round((total_pop_above_x_hp_micr_gm/world_pop)*100, 1) # total pop where pollution is > x micrograms per cubic meter in millions (hp) tot_pop_above_x_hp_micr_gm_millions <- round(total_pop_above_x_hp_micr_gm/1000000, 1) # higher than sp # total pop where pollution is > x micrograms per cubic meter (sp) total_pop_above_x_sp_micr_gm <- aqli_color %>% filter(pm2020 > pm2.5_thresh_severly_polluted) %>% summarise(tot_pop = sum(population, na.rm = TRUE)) %>% unlist() # percent pop where pollution is > x micrograms per cubic meter (sp) perc_pop_above_x_sp_micr_gm <- round((total_pop_above_x_sp_micr_gm/world_pop)*100, 1) # total pop where pollution is > x micrograms per cubic meter in millions (sp) tot_pop_above_x_sp_micr_gm_millions <- round(total_pop_above_x_sp_micr_gm/1000000, 1) # less than lp # total pop where pollution is < x micrograms per cubic meter (lp) total_pop_below_x_lp_micr_gm <- aqli_color %>% filter(pm2020 < pm2.5_thresh_low_polluted) %>% summarise(tot_pop = sum(population, na.rm = TRUE)) %>% unlist() # percent pop where pollution is < x micrograms per cubic meter (lp) perc_pop_below_x_lp_micr_gm <- round((total_pop_below_x_lp_micr_gm/world_pop)*100, 1) # total pop where pollution is > x micrograms per cubic meter in millions (lp) tot_pop_below_x_lp_micr_gm_millions <- round(total_pop_below_x_lp_micr_gm/1000000, 1) #> percent studies in a given pollution bucket # higher than hp: epi_num_studies_in_hp <- epi %>% filter(!is.na(mean_pm2.5) , mean_pm2.5 != "NA", non_pm2.5 == 0) %>% group_by(paper_uid) %>% summarise(mean_pm2.5 = mean(mean_pm2.5, na.rm = TRUE)) %>% filter(mean_pm2.5 > pm2.5_thresh_highly_polluted) %>% nrow() %>% unlist() perc_epi_studies_in_hp <- round((epi_num_studies_in_hp/nrow(epi %>% distinct(paper_uid, .keep_all = TRUE) %>% filter(non_pm2.5 == 0)))*100, 1) # higher than sp: makes sense to count different countries in a pooled study as different 
studies. epi_num_studies_in_sp <- epi %>% filter(!is.na(mean_pm2.5), mean_pm2.5 != "NA", non_pm2.5 == 0) %>% group_by(paper_uid) %>% summarise(mean_pm2.5 = mean(mean_pm2.5, na.rm = TRUE)) %>% filter(mean_pm2.5 > pm2.5_thresh_severly_polluted) %>% nrow() %>% unlist() perc_epi_studies_in_sp <- round((epi_num_studies_in_sp/nrow(epi %>% distinct(paper_uid, .keep_all = TRUE) %>% filter(non_pm2.5 == 0)))*100, 1) # less than lp: makes sense to count different countries in a pooled study as different studies. epi_num_studies_in_lp <- epi %>% filter(!is.na(mean_pm2.5), mean_pm2.5 != "NA", non_pm2.5 == 0) %>% group_by(paper_uid) %>% summarise(mean_pm2.5 = mean(mean_pm2.5, na.rm = TRUE)) %>% filter(mean_pm2.5 < pm2.5_thresh_low_polluted) %>% nrow() %>% unlist() perc_epi_studies_in_lp <- round((epi_num_studies_in_lp/nrow(epi %>% distinct(paper_uid, .keep_all = TRUE) %>% filter(non_pm2.5 == 0)))*100, 1) # plot population and number of studies bar graph pop_num_studies_in_pollution_buckets_graph # plot mean pm2.5 country wise graph mean_pm2.5_country_wise_graph
In total, `r length(unique(epi$paper_uid))` AQ epi studies were included in the final analysis dataset. Of these, `r epi_pm2.5_spec_studies_total` are PM~2.5~ specific; the remaining `r (length(unique(epi$paper_uid)) - epi_pm2.5_spec_studies_total)` studies are multi-pollutant studies.
`r perc_pop_above_x_hp_micr_gm` percent of the world population, or `r tot_pop_above_x_hp_micr_gm_millions` million people, live in areas where annual average PM~2.5~ pollution is greater than `r pm2.5_thresh_highly_polluted` µg/m³. Yet only `r perc_epi_studies_in_hp` percent of the total PM~2.5~ studies (`r epi_num_studies_in_hp` PM~2.5~ specific studies) have been performed in these highly polluted parts of the world, where average PM~2.5~ pollution is at least `r round(pm2.5_thresh_highly_polluted/who_pm2.5_guideline, 1)` times the WHO PM~2.5~ safe guideline of `r who_pm2.5_guideline` µg/m³.
Approximately `r perc_pop_above_x_sp_micr_gm` percent of the world population (`r tot_pop_above_x_sp_micr_gm_millions` million people) live in the most severely polluted parts of the world, where annual average PM~2.5~ pollution is upwards of `r pm2.5_thresh_severly_polluted` µg/m³, i.e. at least `r round(pm2.5_thresh_severly_polluted/who_pm2.5_guideline, 1)` times the WHO safe guideline. In these most severely polluted parts of the world, `r epi_num_studies_in_sp` PM~2.5~ AQ epi studies have been performed.
Most of the PM~2.5~ AQ epi studies performed so far (`r perc_epi_studies_in_lp` percent of the total number of PM~2.5~ studies, or `r epi_num_studies_in_lp` studies) are concentrated in areas where the average PM~2.5~ concentration is in the 0-25 µg/m³ range. People living in these areas (`r perc_pop_below_x_lp_micr_gm` percent of the world population) breathe air that is much less polluted than that in the most polluted parts of the world (as seen above). But even within the 0-25 µg/m³ bucket, anyone living above `r who_pm2.5_guideline` µg/m³ is out of compliance with the WHO PM~2.5~ guideline.
#> Add a note mentioning that the underlying data for this graph includes all types of pollutants and also that, each country in a pooled study is counted separately. # generate summary dataset for total number of studies conducted in a given country epi_country_count <- epi %>% dplyr::select(country) %>% dplyr::count(country) %>% dplyr::filter(country != "NA", !is.na(country), !is.na(n)) # making country names match before joining epi file with color file epi_country_count$country <- stringr::str_replace(epi_country_count$country, "USA", "United States") #> plot choropleth map # get data ready for the choropleth map world <- rnaturalearth::ne_countries(scale = "medium", returnclass = "sf") # making country names match before joining epi file with color file world$name_long <- stringr::str_replace(world$name_long, "Russian Federation", "Russia") world$name_long <- stringr::str_replace(world$name_long, "Lao PDR", "Laos") world$name_long <- stringr::str_replace(world$name_long, "Dem. Rep. Korea", "North Korea") world$name_long <- stringr::str_replace(world$name_long, "Republic of Korea", "South Korea") world$name_long <- stringr::str_replace(world$name_long, "Vatican", "Vatican City") world$name_long <- stringr::str_replace(world$name_long, "Brunei Darussalam", "Brunei") world$name_long <- stringr::str_replace(world$name_long, "Somaliland", "Somalia") world$name_long <- stringr::str_replace(world$name_long, "Mexico", "México") world$name_long <- stringr::str_replace(world$name_long, "Republic of Congo", "Republic of the Congo") world$name_long <- stringr::str_replace(world$name_long, "Czech Republic", "Czechia") # joining world shape file with the epi conutry wise count dataset world_shp_epi <- world %>% dplyr::left_join(epi_country_count, by = c("name_long" = "country")) %>% dplyr::select(name_long, n, geometry) %>% dplyr::rename(num_studies = n, country = name_long) # color 2020 collapse to country level to get the PM2.5 pollution layer (first layer of the map) aqli_color_country <- aqli_color %>% dplyr::group_by(country) %>% dplyr::mutate(pop_weights = population/sum(population, na.rm = TRUE), pm2021_pop_weighted = pm2021*pop_weights) %>% dplyr::summarise(avg_pm2.5_2021 = sum(pm2021_pop_weighted, na.rm = TRUE)) # joining the world shape + epi joined file with the aqli color file world_shp_epi_color <- world_shp_epi %>% dplyr::left_join(aqli_color_country, by = "country") %>% dplyr::filter(country != "Antarctica") %>% dplyr::select(country, num_studies, avg_pm2.5_2021, geometry) # In the total number of studies column, replacing NAs with 0s world_shp_epi_color <- world_shp_epi_color %>% dplyr::mutate(num_studies = ifelse(is.na(num_studies) == TRUE, 0, num_studies)) # getting centroids for countries after correcting for problematic polygons world_shp_epi_color_for_centroids <<- sf::st_make_valid(world_shp_epi_color) country_wise_centroids <<- sf::st_centroid(world_shp_epi_color_for_centroids, of_largest_polygon = TRUE) # adding an identifier column for countries where no studies have happened country_wise_centroids$color_circle = ifelse(country_wise_centroids$num_studies == 0, "No Studies", "Atleast 1 study") # creating a dataframe out of the latitude, longitude coordinates of the country centroids, so that we can plot it # using geom_point. 
coord_df <- dplyr::as_tibble(sf::st_coordinates(country_wise_centroids))

# reassigning the X and Y coordinates to x_coord and y_coord
country_wise_centroids$x_coord <- coord_df$X
country_wise_centroids$y_coord <- coord_df$Y

# moving the geometry column to the end and making sure that the entire dataset is an sf object
country_wise_centroids <- country_wise_centroids %>%
  dplyr::select(-geometry, geometry) %>%
  sf::st_as_sf()

# without-geometry version of the country wise centroids dataset, with a label_plt column added
country_wise_centroids_sans_geom <- country_wise_centroids %>%
  sf::st_drop_geometry() %>%
  dplyr::as_tibble() %>%
  dplyr::mutate(label_plt = ifelse(num_studies == 0, "", num_studies))

# plotting the choropleth
geographic_dist_graph <- world_shp_epi_color %>%
  ggplot2::ggplot() +
  ggplot2::geom_sf(mapping = ggplot2::aes(fill = avg_pm2.5_2021, group = avg_pm2.5_2021), color = "darkgrey") +
  ggplot2::geom_sf(data = world_shp_epi_color %>% dplyr::filter(num_studies > 0) %>% sf::st_as_sf(),
                   color = "black", size = 6, fill = "transparent") +
  colorspace::scale_fill_continuous_sequential(palette = "YlOrRd",
                                               name = expression("Annual Average" ~ PM[2.5] ~ "in 2021 (in µg/m³)")) +
  ggplot2::geom_jitter(data = country_wise_centroids_sans_geom,
                       mapping = aes(x = x_coord, y = y_coord, size = num_studies, color = color_circle)) +
  ggplot2::geom_text(data = country_wise_centroids_sans_geom,
                     mapping = aes(x = x_coord, y = y_coord, label = label_plt),
                     size = 3, color = "white") +
  ggplot2::theme(legend.position = "bottom") +
  ggplot2::scale_size(breaks = c(1, 5, 10, 20, 30), range = c(1, 12),
                      guide = ggplot2::guide_legend(override.aes = list(color = "darkgrey"))) +
  ggplot2::labs(caption = stringr::str_wrap("*The size of the circles is directly proportional to the total number of times a given country has been included in a study (any study, not necessarily PM2.5 specific - mentioned as the number in the circle). Countries with at least 1 study are highlighted (black borders).", width = 30),
                size = "Number of Studies") +
  ggplot2::scale_color_manual(values = c("No Studies" = "cadetblue3", "Atleast 1 study" = "darkolivegreen"), name = "") +
  ggthemes::theme_map() +
  ggplot2::ggtitle(expression("Country wise distribution of epi studies and Annual Average" ~ PM[2.5] ~ "pollution (in µg/m³)")) +
  ggplot2::theme(plot.caption = ggplot2::element_text(hjust = 0, size = 7),
                 legend.position = "bottom",
                 legend.justification = c(0.5, 3),
                 plot.title = ggplot2::element_text(hjust = 0.5, size = 11),
                 legend.text = element_text(size = 5),
                 legend.title = element_text(size = 6))

#> Other calculations for this block of code

# Of all AQ epi studies included in this analysis, how many included populations in either the US or Canada or both
num_times_usa_canada_included <- epi %>%
  filter(country %in% c("USA", "Canada")) %>%
  distinct(paper_uid, .keep_all = TRUE) %>%
  nrow()

# total number of times USA, Canada and Europe have been included in a study
num_times_usa_canada_europe_included <- epi %>%
  filter((country %in% c("Canada", "USA")) | (continent %in% c("Europe"))) %>%
  distinct(paper_uid, .keep_all = TRUE) %>%
  nrow()

# countries other than the USA, Canada and those in Europe that have been included in a study
areas_other_than_usa_canada_europe <- epi %>%
  filter((country %notin% c("Canada", "USA")) & (continent %notin% c("Europe"))) %>%
  select(country) %>%
  unique() %>%
  unlist() %>%
  as.vector()

# number of times a continent has been included in a study
continent_wise_study_summary <- epi %>%
  group_by(paper_uid, continent) %>%
  summarise(count = n()) %>%
  ungroup() %>%
  select(continent) %>%
  count(continent) %>%
  slice_max(n, n = 6) %>%
  mutate(prop_studies = round((n/sum(n, na.rm = TRUE))*100, 1))

# total number of times South America was included in a study
num_times_sa_included <- continent_wise_study_summary %>%
  filter(continent == "South America") %>%
  select(n) %>%
  unlist() %>%
  as.vector()

# total number of times Oceania was included in a study
num_times_oceania_included <- continent_wise_study_summary %>%
  filter(continent == "Oceania") %>%
  select(n) %>%
  unlist() %>%
  as.vector()

# total number of times Africa has been included in a study
num_times_africa_included <- continent_wise_study_summary %>%
  filter(continent == "Africa") %>%
  select(n) %>%
  unlist() %>%
  as.vector()

# plot geographic distribution
geographic_dist_graph
`r round((num_times_usa_canada_included/(epi %>% distinct(paper_uid, .keep_all = TRUE) %>% nrow()))*100, 1)` percent of all AQ epi studies included in this analysis (`r num_times_usa_canada_included` studies) cover populations in the US, Canada, or both.
The USA, Canada, and Europe (or some combination of them) were focus geographies in `r round((num_times_usa_canada_europe_included/(epi %>% distinct(paper_uid, .keep_all = TRUE) %>% nrow()))*100, 1)` percent of all AQ epi studies (`r num_times_usa_canada_europe_included` studies). The other countries that have been included in an AQ epi study are: `r areas_other_than_usa_canada_europe`.
Of the total number of times a given continent's population has been included in any given AQ epi study, `r continent_wise_study_summary$continent[1]` dominates the rest of the continents, having been included `r continent_wise_study_summary$prop_studies[1]` percent of the time (entirely due to studies in the US or Canada). Closely following North America, populations in `r continent_wise_study_summary$continent[2]` and `r continent_wise_study_summary$continent[3]` have been included in AQ epi studies `r continent_wise_study_summary$prop_studies[2]` and `r continent_wise_study_summary$prop_studies[3]` percent of the time, respectively. `r missing_continents` have seen zero long term (> `r min_study_duration_in_years` year), large (> `r min_sample_size` people in the sample) AQ epi studies.
#> Add a note mentioning that the underlying data for this graph includes all types of pollutants but only one row is counted for a given pooled (multi-country) study. epi %>% dplyr::distinct(paper_uid, .keep_all = TRUE) %>% group_by(publishing_year) %>% summarise(total_studies = n()) %>% ungroup() %>% filter(total_studies != "NA", !is.na(total_studies), publishing_year != "NA", !is.na(publishing_year)) %>% ggplot() + geom_line(mapping = aes(x = publishing_year, y = total_studies), lwd = 1.2, color = "cornflowerblue") + labs(x = "Year", y = "Total Number of Studies", caption = str_wrap("*This graph displays the total number of AQ epi studies (all studies, not PM2.5 specific) published over time."), width = 10) + scale_x_continuous(breaks = seq(min(epi$publishing_year, na.rm = TRUE), max(epi$publishing_year, na.rm = TRUE), 2)) + theme_hc() + theme(plot.caption = element_text(hjust = 0))
Through most of the 1990s and early 2000s, AQ epi studies were published at an average rate of roughly 1 to 2 studies per year.

Since 2009, there has been a noticeable increase in the overall volume of AQ epi studies published.
#> In this, in the graph multiple countries of a pooled study are counted separately. Also, this is not a PM2.5 specific graph. #> Density plot of study duration (commented for now, can use later if required) # epi %>% # filter(!is.na(study_duration)) %>% # ggplot() + # geom_density(mapping = aes(x = study_duration, fill = continent), alpha = 0.5) + # labs(x = "Study Duration") + # ggtitle("Epi Studies Duration Distribution (Continent Wise): Density Plot") + # scale_x_continuous(breaks = seq(0, 40, 5)) + # theme_tufte() + # theme(legend.position = "bottom") #> Histogram of study duration #> adding an order panel and also dummy panels (for those cases in which we are looking at a subset of a dataset that does not include certain continents). if(sum(missing_continents_logical) == 0){ continent_wise_study_duration_graph_summary_table <- epi_tmp %>% dplyr::filter(continent != "NA", !is.na(continent), study_duration != "NA", !is.na(study_duration)) %>% dplyr::group_by(paper_uid, continent) %>% dplyr::summarise(average_study_duration = mean(study_duration, na.rm = TRUE)) %>% dplyr::ungroup() %>% dplyr::filter(continent != "Oceania") %>% #dplyr::mutate(continent = as.factor(continent)) %>% dplyr::mutate(order_continent = ifelse(continent == "China", 2, 0), order_continent = ifelse(continent == "Asia - China", 1, order_continent), order_continent = ifelse(continent == "Europe", 5, order_continent), order_continent = ifelse(continent == "North America", 6, order_continent), order_continent = ifelse(continent == "South America", 4, order_continent), order_continent = ifelse(continent == "Africa", 3, order_continent)) %>% dplyr::mutate(continent = ifelse(continent == "China", "China (1402.5 million people)", continent), continent = ifelse(continent == "Africa", "Africa (1373.8 million people)", continent), continent = ifelse(continent == "North America", "North America (589.2 million people)", continent), continent = ifelse(continent == "South America", "South America (429.2 million people)", continent), continent = ifelse(continent == "Europe", "Europe (740.8 million people)", continent), continent = ifelse(continent == "Asia - China", "Asia - China (3161.5 million people)", continent)) } else{ for(i in 1:length(missing_continents)){ if(i == 1){ continent_wise_study_duration_graph_summary_table <- epi_tmp %>% dplyr::filter(continent != "NA", !is.na(continent), study_duration != "NA", !is.na(study_duration)) %>% dplyr::group_by(paper_uid, continent) %>% dplyr::summarise(average_study_duration = mean(study_duration, na.rm = TRUE)) %>% dplyr::ungroup() %>% dplyr::add_row(continent = missing_continents[i]) } else{ continent_wise_study_duration_graph_summary_table <- continent_wise_study_duration_graph_summary_table %>% dplyr::add_row(continent = missing_continents[i]) } } continent_wise_study_duration_graph_summary_table <- continent_wise_study_duration_graph_summary_table %>% dplyr::filter(continent != "Oceania") %>% # dplyr::mutate(continent = as.factor(continent)) %>% dplyr::mutate(order_continent = ifelse(continent == "China", 2, 0), order_continent = ifelse(continent == "Asia - China", 1, order_continent), order_continent = ifelse(continent == "Europe", 5, order_continent), order_continent = ifelse(continent == "North America", 6, order_continent), order_continent = ifelse(continent == "South America", 4, order_continent), order_continent = ifelse(continent == "Africa", 3, order_continent)) %>% dplyr::mutate(continent = ifelse(continent == "China", "China (1402.5 million people)", continent), 
continent = ifelse(continent == "Africa", "Africa (1373.8 million people)", continent), continent = ifelse(continent == "North America", "North America (589.2 million people)", continent), continent = ifelse(continent == "South America", "South America (429.2 million people)", continent), continent = ifelse(continent == "Europe", "Europe (740.8 million people)", continent), continent = ifelse(continent == "Asia - China", "Asia - China (3161.5 million people)", continent)) } continent_wise_study_duration_graph <- continent_wise_study_duration_graph_summary_table %>% ggplot2::ggplot() + ggplot2::geom_histogram(mapping = ggplot2::aes(x = average_study_duration, fill = continent, group = continent), color = "white", position = "identity", binwidth = 2) + ggplot2::labs(x = "Study Duration", y = "Count", caption = stringr::str_wrap("*This graph displays distribution by continent (and for China, Asia - China) for all studies (not PM2.5 specific)."), width = 10, title = expression("Continent (and China) wise Study Duration Distribution and 2021 Annual Average" ~ PM[2.5] ~ "(in µg/m³)"), subtitle = "(Ordered from most polluted to least polluted)") + ggplot2::scale_x_continuous(breaks = seq(0, 40, 10)) + ggthemes::theme_hc() + ggplot2::theme(legend.title = element_blank()) + ggplot2::facet_wrap(~forcats::fct_reorder(continent, order_continent), nrow = 1) + ggplot2::theme(axis.line.y = ggplot2::element_line(color = "black"), axis.line.x = ggplot2::element_line(color = "black"), plot.caption = ggplot2::element_text(hjust = 0, size = 7), plot.title = element_text(size = 10, hjust = 0.5), plot.subtitle = element_text(size = 8, hjust = 0.5), strip.background = element_rect(fill = "floralwhite"), strip.text = element_text(size = 4.5), legend.background = element_rect(color = "black"), legend.text = element_text(size = 7)) + ggplot2::scale_fill_manual(values = c("North America (589.2 million people)" = "#FFBA00", "Europe (740.8 million people)" = "#FF9600", "South America (429.2 million people)" = "#FF6908", "Africa (1373.8 million people)" = "#E63D23", "China (1402.5 million people)" = "#BD251C", "Asia - China (3161.5 million people)" = "#8C130E")) + ggplot2::scale_y_continuous(breaks = seq(0, 10, 2)) #> calculations relevant for this block of code # long term study threshold long_term_study_duration <- 25 # number of "really long term studies" num_really_long_term_studies <- epi %>% group_by(paper_uid) %>% summarise(mean_duration = mean(study_duration, na.rm = TRUE)) %>% filter(mean_duration > long_term_study_duration) %>% nrow() # continents that have seen studies greater than x (long_term) years of study duration epi_long_term_studies_countries <- epi %>% group_by(paper_uid, continent) %>% summarise(mean_study_duration = mean(study_duration, na.rm = TRUE)) %>% ungroup() %>% filter(mean_study_duration > long_term_study_duration, mean_study_duration != "NA", !is.na(mean_study_duration)) %>% select(continent) %>% count(continent) %>% mutate(prop_n = round((n/sum(n, na.rm = TRUE))*100, 1)) %>% slice_max(prop_n, n = 6) # total population in missing continents combined tot_pop_africa_missing_continents <- aqli_color %>% filter(continent %in% missing_continents) %>% summarise(tot_pop = sum(population, na.rm = TRUE)) %>% unlist() %>% as.numeric() tot_pop_africa_missing_continents_in_billions <- round(tot_pop_africa_missing_continents/1000000000, 1) # total world population world_population <- aqli_color %>% summarise(total_population = sum(population, na.rm = TRUE)) %>% unlist() %>% as.numeric() # continent 
order list (order variable: number of studies): code present in the geographical distribution map block. # plot continent wise study duration graph continent_wise_study_duration_graph
In total, `r num_really_long_term_studies` "really" long term studies (> `r long_term_study_duration` years) have been performed. That is, `r round((num_really_long_term_studies/nrow(epi %>% distinct(paper_uid, .keep_all = TRUE)))*100, 1)` percent of the total number of studies.
The following continents have seen at least 1 really long term study: `r epi_long_term_studies_countries %>% filter(n >= 1) %>% select(continent) %>% unlist() %>% as.vector()`.
The following continents have seen more than 1 really long term study: `r epi_long_term_studies_countries %>% filter(n > 1) %>% select(continent) %>% unlist() %>% as.vector()`.
Across all of the really long term studies (both multi-country/continent and single-country), i.e. those with a study duration of > `r long_term_study_duration` years, `r epi_long_term_studies_countries$continent[1]` was included `r epi_long_term_studies_countries$n[1]` times (`r epi_long_term_studies_countries$prop_n[1]` percent of the total number of times any given continent is included in any such study).
Similarly, `r epi_long_term_studies_countries$continent[2]` has been included `r epi_long_term_studies_countries$n[2]` times in such long term studies (`r epi_long_term_studies_countries$prop_n[2]` percent of the total number of times any given continent is included in any such study).
# press ctrl + shift + c to uncomment this. # ### Major takeaways # # * `r round((num_times_usa_canada_included/(epi %>% distinct(paper_uid, .keep_all = TRUE) %>% nrow()))*100, 1)` percent of all AQ epi studies (`r num_times_usa_canada_included` studies) included in this analysis included populations in either the US or Canada or both. # # * Of the total number of times a given continent's population has been included in any given AQ epi study, `r continent_wise_study_summary$continent[1]` dominates the rest of the continents and has been included `r continent_wise_study_summary$prop_studies[1]` percent of times (all due to studies in the US or Canada). Closely following North America, populations in `r continent_wise_study_summary$continent[2]` and `r continent_wise_study_summary$continent[3]` have been included in AQ epi studies `r continent_wise_study_summary$prop_studies[2]` and `r continent_wise_study_summary$prop_studies[3]` percent of times respectively. `r missing_continents` have seen 0 long term (> `r min_study_duration_in_years` year), large (>`r min_sample_size` people in the sample) AQ epi study. # # * There have been no large (> `r min_study_duration_in_years` year) or long (>`r min_sample_size` people in the sample) PM epi studies that were performed in `r missing_continents`, with a combined population of `r tot_pop_africa_missing_continents_in_billions` billion, `r round((tot_pop_africa_missing_continents/world_population)*100, 1)` percent of the Earth's total human population. # # * Post 2009, there has been a noticeable increase in the overall volume of AQ epi studies published. # # * In total, `r num_really_long_term_studies` "really" long term studies (> `r long_term_study_duration` years) have been performed. That is, `r round((num_really_long_term_studies/nrow(epi %>% distinct(paper_uid, .keep_all = TRUE)))*100, 1)` percent of the total number of studies. # # * All of the following continents have seen at least 1 really long term study: `r epi_long_term_studies_countries %>% filter(n >= 1) %>% select(continent) %>% unlist() %>% as.vector()`.
All studies included in this analysis point to the same overall picture: air pollution is a serious health threat. The existing scientific literature is clear that air pollution’s impact on health is well established, and taking action to improve a polluted environment should not be delayed in order to complete multi-year, large-sample (> `r min_sample_size` people) epidemiological studies in an area, even if there has not been a prior study in that particular geography. That said, it is important for the field of air quality epidemiology to understand the contours of its current research landscape, so that it can identify directions for future research and deploy limited resources most effectively.
Through a comprehensive and ongoing* literature review, we are attempting to create an exhaustive public listing of all the epidemiological studies we could find that examine the relationship between PM~2.5~ and Life Expectancy/Mortality.

For each study, we record data on key defining features, such as geography, sample size, study duration, and PM~2.5~ exposure range. We then use this analysis dataset to carry out a meta-analysis, the results of which are detailed in the Results section above.
We are seeking to make this analysis as current, complete, and error-free as possible and view it as a continual work in progress. We would welcome any comments, corrections, and suggestions from the air quality community. Please contact aqli-epic@uchicago.edu or leave a comment in this GitHub repository.
The underlying analysis dataset for this meta-analysis focuses only on papers that study the link between PM~2.5~, PM~10~, TSP, or Ultrafine Particulate Matter and __Mortality/Life Expectancy__. In addition to the analysis dataset, there is a master dataset that expands on the analysis dataset to include other useful information. The idea behind the master dataset is to record any additional details (whether additional facts about a paper, or details on other pollutants studied in the paper) that are not part of the main data analysis exercise. The master dataset also lists additional papers that do not fit the inclusion criteria.
The analysis dataset excludes the following types of papers: meta-analyses, unpublished papers, papers studying the effects of indoor air pollution on Mortality/Life Expectancy, and papers forecasting future air pollution, life expectancy, or mortality. The master dataset, however, lists all of these studies.
For Multi-Country (pooled) studies, one entry (one row in the analysis dataset) is recorded for each country, wherever country-level data is available. Some pooled studies do not report country-level data and instead report data for a custom region (e.g. South-East Asia); such pooled studies are excluded from the analysis dataset, though the master dataset still lists them for reference.
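As a minimal illustration of this convention (the identifiers and values below are made up; the column names mirror those used in the analysis code), a pooled study covering two countries contributes two rows:

```r
# Hypothetical example: a pooled study with paper_uid "P042" covering two
# countries is recorded as two rows, one per country. All values are made up.
pooled_study_rows <- tibble::tribble(
  ~paper_uid, ~cohort_uid, ~country, ~cohort_size, ~mean_pm2.5,
  "P042",     "P042-USA",  "USA",          250000,         9.8,
  "P042",     "P042-CAN",  "Canada",       120000,         7.4
)

# Counting distinct paper_uid values treats the pooled study as a single paper,
# while country-level summaries (e.g. the choropleth map) count each row.
```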
In cases where the same cohort has been studied by different research groups, at different points in time, or using different methods, we have included and counted each of those studies as a separate, unique study in the analysis below.
Papers studying the health effects of pollution segregated by source sector (source-apportionment-type studies), by region (e.g. rural vs. urban), or by season (winter vs. summer) are excluded from this analysis.
Papers that report only combined estimates of household and ambient air pollution, without separate estimates for each, are excluded.
All papers studying the impact of pollutants on DALYs (Disability-Adjusted Life Years) are excluded from this analysis.
In papers where the minimum PM~2.5~ concentration was not reported, we have used the lowest reported percentile of the PM~2.5~ concentration distribution as the minimum concentration.
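A minimal sketch of that rule, with made-up numbers (none of these values come from any actual paper):

```r
# Illustrative only: if a paper reports PM2.5 percentiles but no minimum value,
# the lowest reported percentile stands in for the minimum.
reported_min         <- NA_real_                            # minimum not reported
reported_percentiles <- c(p5 = 6.2, p25 = 8.9, p50 = 11.4)  # reported percentiles (made up)

pm2.5_exposure_ll <- if (is.na(reported_min)) min(reported_percentiles) else reported_min
pm2.5_exposure_ll
#> [1] 6.2
```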
In many papers, only one of the mean PM~2.5~ or the PM~2.5~ range is reported, but not both. In these cases, wherever the data is not available (whether it is the mean PM~2.5~ or the PM~2.5~ range), we have recorded the value as "NA". More generally, anywhere we could not find data, we have recorded it as "NA". This is one of the limitations of this analysis.
Different papers report different types of mean PM~2.5~ values. Some report daily averages, while others report monthly or annual averages. In some cases the averages are population weighted and in others they are not, and it is not always clearly stated which type of average a paper is referring to (or how it was calculated). In scenarios like these, we had to make a value judgement; this is another limitation of this analysis.
In cases where the mean PM~2.5~ is reported for both the start year and the end year of the study (or also for some time in between), we have recorded the end-year mean PM~2.5~ value.
Papers that used a conversion factor to convert mean PM~10~ concentrations to mean PM~2.5~ concentrations are included, but their PM~2.5~ values (mean, lower limit, upper limit, standard deviation) are not recorded.
In multi-pollutant studies where different pollutants were studied over different periods of time, we have recorded the time period that corresponds to PM~2.5~ (in cases where PM~2.5~ is not present, we have recorded the time period corresponding to one of the other pollutants).
We have recorded one mean PM~2.5~ value for each paper. Some papers report more than one mean PM~2.5~ value (for example, one for the male group and one for the female group); in these cases, we have picked one of the available values.
In some papers, the exact start and/or end year of the study is unclear. In these cases, we have generally chosen the first of the multiple plausible “study duration ranges” given in the paper (depending on the different plausible interpretations of when the final follow-up ended).
In many papers, the upper or lower limit of the age range is not precisely specified. In these cases, we have recorded an NA. For example, some papers report the upper age category as 85+; although we know the upper limit category, we still do not know the upper limit of the age, which could be 90, 95, 100, etc.
In places where the sample size is specified in terms of "number of regions" (for example, 17 districts) and not "number of people", we have recorded an NA in the cohort size column.
Under the “methods” column (in the master dataset), we have only mentioned a subset of the methods that were used to carry out different parts of a study. Authors may have used methods other than those mentioned in our meta-analysis.
Different graphs are generated using different subsets of the analysis dataset. Most of them are PM~2.5~ specific (our main focus); others include all pollutants (PM~2.5~, PM~10~, TSP, Ultrafine Particles). The type of papers used to generate a given graph, and any other nuances, are specified within the graph (as a note) and/or in the accompanying text; the PM~2.5~-specific subset is sketched below.
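For reference, the PM~2.5~-specific subset used by most of the graphs above is obtained with the filter below (taken from the plotting chunks; the `epi_pm2.5_specific` name is only for illustration). The all-pollutant graphs simply omit the `non_pm2.5 == 0` condition.

```r
# PM2.5-specific subset used by most graphs: keep rows with a usable mean PM2.5
# value and drop entries that are not PM2.5 specific.
epi_pm2.5_specific <- epi %>%
  dplyr::filter(!is.na(mean_pm2.5), mean_pm2.5 != "NA", non_pm2.5 == 0)
```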
Add to the analysis dataset and grow the epi database:
If you know of other papers that (a) study the link between PM~2.5~ and Life Expectancy/Mortality and (b) are not included in this analysis, please leave a comment in the analysis dataset with a link to the paper. You can also write to us at aqli-info@uchicago.edu (with the link to the paper included in the email).
As a next step, we’ll go through your submission and, if it fits our inclusion criteria, update the underlying analysis dataset and re-render the entire blog so that it reflects the most up-to-date data and figures.
If you are unsure (given the inclusion criteria) whether a particular paper you want to post should be posted or not, we encourage you to post it and let us handle the inclusion/exclusion decision.
If you have comments on some aspect of a paper, or if you find any errors, please leave a comment in the analysis dataset on the relevant cell (tagging aqli-info@uchicago.edu using the @ symbol), or write to us at aqli-info@uchicago.edu detailing the error and its location (the cell address on the sheet).
To further explore these graphs and more in an interactive setup, visit the AQ Epi dashboard.
#### PM2.5 distributions of the lower limits and upper limits of exposure range (Density and Histogram Plots) ## All graphs below this block of code is deactivated for the blog post. This will be displayed on the dashboard, alongside other graphs. #> Specific to PM2.5 and count separate countries in a pooled study separately #> exposure distribution density and histogram plots (PM2.5 LL and PM2.5 UL) # create long dataset epi_long <- epi %>% filter(!is.na(mean_pm2.5), mean_pm2.5 != "NA", non_pm2.5 == 0) %>% pivot_longer(cols = contains("pm2.5_exposure"), names_to = "exposure_type", values_to = "exposure_value") %>% select(paper_uid, exposure_type, exposure_value) %>% filter(!is.na(exposure_value)) # density plot epi_long %>% group_by(paper_uid, exposure_type) %>% summarise(exposure_value = ifelse(exposure_type == "pm2.5_exposure_ll", min(exposure_value, na.rm = TRUE), max(exposure_value, na.rm = TRUE))) %>% ggplot() + geom_density(mapping = aes(x = exposure_value, fill = exposure_type), alpha = 0.6, position = "identity") + labs(x = expression(paste("PM2.5 concentration (", mu, "g", "/", m^3, ")"))) + theme_tufte() # histogram plot epi_long %>% group_by(paper_uid, exposure_type) %>% summarise(exposure_value = ifelse(exposure_type == "pm2.5_exposure_ll", min(exposure_value, na.rm = TRUE), max(exposure_value, na.rm = TRUE))) %>% ggplot() + geom_histogram(mapping = aes(x = exposure_value, fill = exposure_type), position = "identity", alpha = 0.4, color = "white") + labs(x = expression(paste("PM2.5 concentration (", mu, "g", "/", m^3, ")"))) + theme_tufte()
#### Age distributions of lower limits and upper limits of age range (Density and Histogram plots) #> Different countries in a pooled study should be counted separately. As of now this is PM2.5 specific, but can be generalized in the future. #> (5) age distribution plot (LL and UL) using the long dataset (density plot) epi_long_age_dist <- epi %>% filter(((!is.na(cohort_age_ll)) | (cohort_age_ll != "NA")) & ((!is.na(cohort_age_ul)) | (cohort_age_ul != "NA"))) %>% pivot_longer(cols = contains("cohort_age"), names_to = "age_dist_type", values_to = "age_value") %>% select(paper_uid, age_dist_type, age_value) %>% filter(!is.na(age_value)) epi_long_age_dist %>% ggplot() + geom_density(mapping = aes(x = age_value, fill = age_dist_type), alpha = 0.6, color = "black") + labs(x = "Age") + theme_tufte() #> (6) age distribution plot (LL and UL) using the long dataset (histogram plot) epi_long_age_dist %>% ggplot() + geom_histogram(mapping = aes(x = age_value, fill = age_dist_type), alpha = 0.5, position = "identity", color = "white") + labs(x = "Age") + theme_tufte()
#### Country wise distribution of Cohort Sizes (Density and Histogram plots) #> Different countries in a pooled study should be counted separately. As of now this is PM2.5 specific, but can be generalized in the future. #> (7) Country Wise Cohort Size density plot epi %>% filter(cohort_size != "NA", !is.na(cohort_size)) %>% ggplot() + geom_density(mapping = aes(x = log10(cohort_size), fill = country), alpha = 0.5) + theme_tufte() #> (8) Country wise Cohort Size histogram plot epi %>% filter(cohort_size != "NA", !is.na(cohort_size)) %>% ggplot() + geom_histogram(mapping = aes(x = log10(cohort_size), fill = country), alpha = 0.5, position = "identity") + theme_tufte()
#### Country wise Distribution of Study Duration (Density and Histogram Plots) #> Different countries in a pooled study should be counted separately. As of now this is PM2.5 specific, but can be generalized in the future. #> (9) Country wise Study duration density plot epi %>% filter(study_duration != "NA", !is.na(study_duration)) %>% ggplot() + geom_density(mapping = aes(x = study_duration, fill = country, alpha = 0.5)) + theme_tufte() #> Country Wise Study Duration histogram epi %>% filter(study_duration != "NA", !is.na(study_duration)) %>% ggplot() + geom_histogram(mapping = aes(x = study_duration, fill = country, alpha = 0.5), position = "identity", alpha = 0.5, color = "white", binwidth = 1) + theme_tufte() + facet_wrap(~country, nrow = 5) + theme(legend.position = "bottom")
# Experimental graphs (not to be put on the GitHub blog) #> exp graph 1 - global aqli 2020 data pm2020 column distribution overlaid with vertical lines showing mean_pm2.5 values of various studies.--------------------------------------------------------- mean_pm2.5_studies <- epi$mean_pm2.5[!is.na(epi$mean_pm2.5)] # exp graph 1 v1 exp_graph_1_v1 <- aqli_color %>% ggplot() + geom_histogram(mapping = aes(x = pm2020), fill = "darkred", color = "black", alpha = 0.7, size = 0.8) + geom_vline(xintercept = mean_pm2.5_studies, color = "black", alpha = 0.4, linetype = "longdash", size = 0.4) + ggthemes::theme_tufte() + labs(x = expression("Annual Average" ~ PM[2.5] ~ "concentrations in 2020 (in µg/m³)"), y = "count") + theme(axis.line = element_line()) + scale_x_continuous(breaks = seq(0, 120, 20)) + scale_y_continuous(breaks = seq(0, 12000, 2000)) # exp graph 1 v2 FOO <- aqli_color %>% ggplot() + geom_histogram(mapping = aes(x = pm2020), fill = "darkred", color = "black", alpha = 0.7, size = 0.8) + geom_vline(xintercept = mean_pm2.5_studies, color = "black", alpha = 0.4, linetype = "longdash", size = 0.4) + ggthemes::theme_tufte() + labs(x = expression("Annual Average" ~ PM[2.5] ~ "concentrations in 2020 (in µg/m³)"), y = "count") + theme(axis.line = element_line()) + scale_x_continuous(breaks = seq(0, 120, 20)) + scale_y_continuous(breaks = seq(0, 12000, 2000)) + annotate("segment", x = min(mean_pm2.5_studies, na.rm = TRUE), y = 12000, xend = max(mean_pm2.5_studies, na.rm = TRUE), yend = 12000, arrow = arrow(length = unit(0.3, "cm"), type = "closed")) + annotate("segment", x = 60, y = 12000, xend = 60, yend = 10000) + annotate("segment", x = 60, y = 10000, xend = 90, yend = 10000) + annotate("text", x = 99, y = 10000, label = str_wrap("Mean PM2.5 range of studies", 12)) # exp graph 1 v3 aqli_color %>% ggplot() + geom_histogram(mapping = aes(x = pm2020), fill = "darkred", color = "black", alpha = 0.7, size = 0.8)+ ggthemes::theme_tufte() + labs(x = expression("Annual Average" ~ PM[2.5] ~ "concentrations in 2020 (in µg/m³)"), y = "count") + theme(axis.line = element_line()) + scale_x_continuous(breaks = seq(0, 120, 20)) + scale_y_continuous(breaks = seq(0, 12000, 2000)) + annotate("segment", x = min(mean_pm2.5_studies, na.rm = TRUE), y = 12000, xend = max(mean_pm2.5_studies, na.rm = TRUE), yend = 12000, arrow = arrow(length = unit(0.3, "cm"), type = "closed")) + annotate("segment", x = 60, y = 12000, xend = 60, yend = 10000) + annotate("segment", x = 60, y = 10000, xend = 90, yend = 10000) + annotate("text", x = 99, y = 10000, label = str_wrap("Mean PM2.5 range of studies", 12)) # exp graph 1 v4 foo <- aqli_color %>% ggplot() + geom_histogram(mapping = aes(x = pm2020), fill = "darkred", color
= "black", alpha = 0.7, size = 0.8)+ ggthemes::theme_tufte() + labs(x = expression("Annual Average" ~ PM[2.5] ~ "concentrations in 2020 (in µg/m³)"), y = "count") + theme(axis.line = element_line()) + scale_x_continuous(breaks = seq(0, 120, 20)) + scale_y_continuous(breaks = seq(0, 12000, 2000)) + annotate("rect", xmin = min(mean_pm2.5_studies, na.rm = TRUE), ymin = 0, xmax = max(mean_pm2.5_studies, na.rm = TRUE), ymax = 12000, alpha = 0.2) + annotate("segment", x = max(mean_pm2.5_studies, na.rm = TRUE), y = 10000, xend = 90, yend = 10000) + annotate("text", x = 99, y = 10000, label = str_wrap("Mean PM2.5 range of studies", 12)) # hex version #> exp graph v2------------------------------------------------------------------------- #> PM2.5 exposure range and population distribution # exp graph 2 v1 thresh_ll <- 0 thresh_ul <- 25 # aqli color population in ordered pm2.5 buckets aqli_color_grp_pm2.5_buckets <- aqli_color %>% mutate(region = ifelse(pm2020 >= thresh_ll & pm2020 <= thresh_ul, str_c(thresh_ll, "-", thresh_ul), pm2020), region = ifelse(pm2020 > thresh_ul, str_c(">", thresh_ul), region)) %>% group_by(region) %>% summarise(tot_pop = sum(population, na.rm = TRUE)) %>% ungroup() %>% mutate(order_pollution_group = ifelse(region == str_c(thresh_ll, "-", thresh_ul), 1, 0), order_pollution_group = ifelse(region == str_c(">", thresh_ul), 2, order_pollution_group)) %>% ungroup() %>% mutate(tot_pop_prop = (tot_pop/sum(tot_pop))*100) # epi total number of studies in ordered pollution buckets (initially filtering out pooled studies, will add back in using a for loop) epi_num_studies_grp_pm2.5_buckets <- epi %>% filter(!is.na(mean_pm2.5), mean_pm2.5 != "NA", non_pm2.5 == 0) %>% group_by(paper_uid) %>% summarise(mean_pm2.5 = mean(mean_pm2.5, na.rm = TRUE)) %>% mutate(region = ifelse(mean_pm2.5 >= thresh_ll & mean_pm2.5 <= thresh_ul, str_c(thresh_ll, "-", thresh_ul), mean_pm2.5), region = ifelse(mean_pm2.5 > 25, str_c(">", thresh_ul), region)) %>% mutate(order_pollution_group = ifelse(region == str_c(thresh_ll, "-", thresh_ul), 1, 0), order_pollution_group = ifelse(region == str_c(">", thresh_ul), 2, order_pollution_group)) %>% group_by(region) %>% summarise(tot_studies = n(), order_pollution_group = order_pollution_group[1]) %>% ungroup() %>% mutate(tot_studies_prop = (tot_studies/sum(tot_studies))*100) #> bar chart version # creating a master dataset for both population and studies data pop_epi_studies_data <- aqli_color_grp_pm2.5_buckets %>% left_join(epi_num_studies_grp_pm2.5_buckets, by = c("region", "order_pollution_group")) %>% select(region, order_pollution_group, tot_pop_prop, tot_studies_prop) %>% pivot_longer(cols = tot_pop_prop:tot_studies_prop, names_to = "type_of_prop", values_to = "val") # plotting the bar graph pop_num_studies_in_pollution_buckets_graph <- pop_epi_studies_data %>% ggplot() + geom_col(mapping = aes(x = fct_reorder(region, order_pollution_group), y = val, fill = type_of_prop), position = position_dodge(), width = 0.4) + scale_y_continuous(breaks = seq(0, 100, 10)) + scale_fill_manual(values = c("tot_pop_prop" = "grey", "tot_studies_prop" = "cornflowerblue"), labels = c("Proportion of World Population in Bucket", "Proportion of Studies Completed in Bucket")) + labs(x = "Mean PM2.5 bucket (in µg/m³)", y = "Percentage", fill = "", caption = str_wrap("*This graph 'only' takes into account the PM2.5 specific studies. 
For multi-country (pooled) studies, it averages the mean PM2.5 values across all countries.")) + theme_hc() + theme(axis.line.y = element_line(color = "black"), axis.line.x = element_line(color = "black"), plot.caption = element_text(size = 8, hjust = 0), plot.caption.position = "plot") # cartogram (using objects from the geographic distribution chunk; requires the cartogram, sp and tmap packages, which are not loaded at the top of this document) library(cartogram) library(sp) library(tmap) foo <- world_shp_epi_color %>% st_as_sf() # country level population aqli_color_country_summary_population <- aqli_color %>% group_by(country) %>% mutate(pop_weights = population/sum(population, na.rm = TRUE), pm2020_pop_weighted = pm2020*pop_weights) %>% summarise(avg_pm2.5_2020 = sum(pm2020_pop_weighted, na.rm = TRUE), tot_pop = sum(population, na.rm = TRUE)) %>% select(country, tot_pop) foo <- foo %>% left_join(aqli_color_country_summary_population, by = "country") for(i in 1:nrow(foo)) { if(is.na(foo$num_studies[i])){ foo$num_studies[i] <- (sample(1:20, 1))*10000 } else { foo$num_studies[i] <- foo$num_studies[i]*10000 } } foo <- foo %>% mutate(num_studies = ifelse(is.na(num_studies), 0, num_studies)) foo <- foo %>% mutate(num_studies_norm = num_studies/sum(num_studies, na.rm = TRUE), num_studies_log = log10(num_studies + 1.00001)) foo <- foo %>% select(-geometry, geometry) %>% st_as_sf() foo %>% ggplot() + geom_sf() foo_transform <- st_transform(foo, 3857) foo_transform_cont <- cartogram_cont(foo_transform, weight = "num_studies_norm", itermax = 5) ggplot(foo_transform_cont) + geom_sf(mapping = aes(fill = num_studies)) foo_sp <- as(foo, "Spatial") foo_sp_transform <- spTransform(foo_sp, CRS("+init=epsg:3395")) foo_sp_transform_cont <- cartogram_cont(foo_sp_transform, "num_studies", itermax = 2) tm_shape(foo_sp_transform_cont) + tm_polygons("tot_pop", style = "jenks") + tm_layout(frame = FALSE, legend.position = c("left", "bottom")) #> population pollution figure (version 2)------------------------------- # pollution buckets on the x # adding pollution buckets aqli_color_with_pol_buckets <- aqli_color %>% mutate(pol_buckets = ifelse(((pm2021 >= 0) & (pm2021 <= 5)), "0 - 5", NA), pol_buckets = ifelse(((pm2021 > 5) & (pm2021 <= 10)), ">5 - 10", pol_buckets), pol_buckets = ifelse(((pm2021 > 10) & (pm2021 <= 20)), ">10 - 20", pol_buckets), pol_buckets = ifelse(((pm2021 > 20) & (pm2021 <= 40)), ">20 - 40", pol_buckets), pol_buckets = ifelse(((pm2021 > 40) & (pm2021 <= 80)), ">40 - 80", pol_buckets), pol_buckets = ifelse(((pm2021 > 80)), ">80", pol_buckets)) %>% mutate(pol_bucket_order = ifelse(pol_buckets == "0 - 5", 1, NA), pol_bucket_order = ifelse(pol_buckets == ">5 - 10", 2, pol_bucket_order), pol_bucket_order = ifelse(pol_buckets == ">10 - 20", 3, pol_bucket_order), pol_bucket_order = ifelse(pol_buckets == ">20 - 40", 4, pol_bucket_order), pol_bucket_order = ifelse(pol_buckets == ">40 - 80", 5, pol_bucket_order), pol_bucket_order = ifelse(pol_buckets == ">80", 6, pol_bucket_order)) # converting the 2 newly created columns to type factor aqli_color_with_pol_buckets$pol_buckets <- as.factor(aqli_color_with_pol_buckets$pol_buckets) aqli_color_with_pol_buckets$pol_bucket_order <- as.factor(aqli_color_with_pol_buckets$pol_bucket_order) foo <- aqli_color_with_pol_buckets %>% group_by(pol_buckets) %>% summarise(tot_pop = sum(population, na.rm = TRUE)) %>% ungroup() %>% mutate(pol_bucket_order = ifelse(pol_buckets == "0 - 5", 1, NA), pol_bucket_order = ifelse(pol_buckets == ">5 - 10", 2, pol_bucket_order), pol_bucket_order = ifelse(pol_buckets == ">10 - 20", 3, pol_bucket_order), pol_bucket_order = ifelse(pol_buckets == ">20 - 40",
4, pol_bucket_order), pol_bucket_order = ifelse(pol_buckets == ">40 - 80", 5, pol_bucket_order), pol_bucket_order = ifelse(pol_buckets == ">80", 6, pol_bucket_order)) %>% filter(!is.na(pol_buckets)) %>% mutate(tot_pop_millions = round(tot_pop/1000000, 1)) %>% ggplot() + geom_col(mapping = aes(x = forcats::fct_reorder(pol_buckets, pol_bucket_order), y = tot_pop_millions, fill = forcats::fct_reorder(pol_buckets, pol_bucket_order)), width = 0.2) + ggthemes::theme_hc() + labs(x = expression("Annual Average" ~ PM[2.5] ~ ("in µg/m³")), y = "Total Population (in millions)", title = "", fill = expression("Annual Average" ~ PM[2.5] ~ "(in µg/m³)")) + theme(plot.title = element_text(hjust = 0.5, size = 12), axis.title.x = element_text(size = 9), axis.title.y = element_text(size = 9), legend.title = element_text(size = 8), legend.background = element_rect(color = "black")) + viridis::scale_fill_viridis(option = "magma", direction = -1, discrete = TRUE) + scale_y_continuous(breaks = seq(0, 3000, 500)) # doing a similar thing with the epi dataset to get the total number of studies in different pollution buckets. epi_with_pol_buckets <- epi %>% mutate(pol_buckets = ifelse(((mean_pm2.5 >= 0) & (mean_pm2.5 <= 5)), "0 - 5", NA), pol_buckets = ifelse(((mean_pm2.5 > 5) & (mean_pm2.5 <= 10)), ">5 - 10", pol_buckets), pol_buckets = ifelse(((mean_pm2.5 > 10) & (mean_pm2.5 <= 20)), ">10 - 20", pol_buckets), pol_buckets = ifelse(((mean_pm2.5 > 20) & (mean_pm2.5 <= 40)), ">20 - 40", pol_buckets), pol_buckets = ifelse(((mean_pm2.5 > 40) & (mean_pm2.5 <= 80)), ">40 - 80", pol_buckets), pol_buckets = ifelse(((mean_pm2.5 > 80)), ">80", pol_buckets)) %>% mutate(pol_bucket_order = ifelse(pol_buckets == "0 - 5", 1, NA), pol_bucket_order = ifelse(pol_buckets == ">5 - 10", 2, pol_bucket_order), pol_bucket_order = ifelse(pol_buckets == ">10 - 20", 3, pol_bucket_order), pol_bucket_order = ifelse(pol_buckets == ">20 - 40", 4, pol_bucket_order), pol_bucket_order = ifelse(pol_buckets == ">40 - 80", 5, pol_bucket_order), pol_bucket_order = ifelse(pol_buckets == ">80", 6, pol_bucket_order)) epi_with_pol_buckets$pol_buckets <- as.factor(epi_with_pol_buckets$pol_buckets) epi_with_pol_buckets$pol_bucket_order <- as.factor(epi_with_pol_buckets$pol_bucket_order) foo1 <- epi_with_pol_buckets %>% group_by(pol_buckets) %>% summarise(total_studies = n()) %>% ungroup() %>% mutate(pol_bucket_order = ifelse(pol_buckets == "0 - 5", 1, NA), pol_bucket_order = ifelse(pol_buckets == ">5 - 10", 2, pol_bucket_order), pol_bucket_order = ifelse(pol_buckets == ">10 - 20", 3, pol_bucket_order), pol_bucket_order = ifelse(pol_buckets == ">20 - 40", 4, pol_bucket_order), pol_bucket_order = ifelse(pol_buckets == ">40 - 80", 5, pol_bucket_order), pol_bucket_order = ifelse(pol_buckets == ">80", 6, pol_bucket_order)) %>% filter(!is.na(pol_buckets)) %>% add_row(pol_bucket_order = 1, pol_buckets = "0 - 5", total_studies = 0) %>% add_row(pol_bucket_order = 6, pol_buckets = ">80", total_studies = 0) %>% ggplot() + geom_col(mapping = aes(x = forcats::fct_reorder(pol_buckets, pol_bucket_order), y = total_studies, fill = forcats::fct_reorder(pol_buckets, pol_bucket_order)), width = 0.2) + ggthemes::theme_hc() + labs(x = "", y = "Total number of epi studies", title = "", fill = expression("Annual Average" ~ PM[2.5] ~ "(in µg/m³)")) + theme(plot.title = element_text(hjust = 0.5, size = 12), axis.title.x = element_text(size = 9), axis.title.y = element_text(size = 9), legend.title = element_text(size = 8), legend.background = element_rect(color = "black")) + 
viridis::scale_fill_viridis(option = "magma", direction = -1, discrete = TRUE) + theme(legend.position = "none") gridExtra::grid.arrange(foo1, foo) #> Template for adding a red line that goes over all panels in the continent wise panel plot----------- # convert the continent wise plot into a grob # plt_cont_grob <- ggplotGrob(plt_cont) # # # get the panel grobs # panels <- getGrob(plt_cont_grob, "panel", grep = TRUE) # # # add a horizontal arrow near the top of each panel # for (i in 1:6) { # vp <- viewport(layout.pos.row = i, layout.pos.col = 1) # pushViewport(vp) # grid.lines(x = c(unit(0.1, "npc"), unit(1, "npc")), # y = unit(0.82, "npc"), # gp = gpar(col = "darkred", lwd = 1), # arrow = arrow(length = unit(0.15, "in"), type = "closed", ends = "last", # angle = 30)) # popViewport() # } #------------------------------------- #> Countries within continent wise plot of studies in the form of a bar graph (with countries not labelled)------- # generate plot dataset plot_data <- epi %>% group_by(continent, country) %>% summarise(count_studies = n(), .groups = "keep") # plot the countries within continent bar chart plot_data %>% ggplot() + geom_col(mapping = aes(x = continent, y = count_studies, fill = continent), position = position_dodge2(preserve = "single")) #> Treemaps of the number of studies that have happened in each continent by country (uses the plot_data object from above)----------------- plt1 <- ggplot(plot_data %>% filter(continent == "Asia"), aes(area = count_studies, fill = country, label = count_studies)) + geom_treemap() + viridis::scale_fill_viridis(discrete = TRUE) + geom_treemap_text(color = "white") + labs(title = "Asia (24)", fill = "Country") + theme(legend.position = "bottom", legend.background = element_rect(color = "black"), plot.title = element_text(hjust = 0.5), plot.background = element_rect(fill = "white")) plt2 <- ggplot(plot_data %>% filter(continent == "Africa"), aes(area = count_studies, fill = country, label = count_studies)) + geom_treemap() + viridis::scale_fill_viridis(discrete = TRUE) + geom_treemap_text(color = "white") + labs(title = "Africa (0)", fill = "Country") + theme(legend.position = "bottom", legend.background = element_rect(color = "black"), plot.title = element_text(hjust = 0.5), plot.background = element_rect(fill = "white")) plt3 <- ggplot(plot_data %>% filter(continent == "South America"), aes(area = count_studies, fill = country, label = count_studies)) + geom_treemap() + viridis::scale_fill_viridis(discrete = TRUE) + geom_treemap_text(color = "white") + labs(title = "South America (0)", fill = "Country") + theme(legend.position = "bottom", legend.background = element_rect(color = "black"), plot.title = element_text(hjust = 0.5), plot.background = element_rect(fill = "white")) plt4 <- ggplot(plot_data %>% filter(continent == "Europe"), aes(area = count_studies, fill = country, label = count_studies)) + geom_treemap() + viridis::scale_fill_viridis(discrete = TRUE) + geom_treemap_text(color = "white") + labs(title = "Europe (15)", fill = "Country") + theme(legend.position = "bottom", legend.background = element_rect(color = "black"), plot.title = element_text(hjust = 0.5), plot.background = element_rect(fill = "white")) plt5 <- ggplot(plot_data %>% filter(continent == "North America"), aes(area = count_studies, fill = country, label = count_studies)) + geom_treemap() + viridis::scale_fill_viridis(discrete = TRUE) + geom_treemap_text(color = "white") + labs(title = "North America (49)", fill = "Country") +
theme(legend.position = "bottom", legend.background = element_rect(color = "black"), plot.title = element_text(hjust = 0.5), plot.background = element_rect(fill = "white")) plt6 <- ggplot(plot_data %>% filter(continent == "Oceania"), aes(area = count_studies, fill = country, label = count_studies)) + geom_treemap() + viridis::scale_fill_viridis(discrete = TRUE) + geom_treemap_text(color = "white") + labs(title = "Oceania (0)", fill = "Country") + theme(legend.position = "bottom", legend.background = element_rect(color = "black"), plot.title = element_text(hjust = 0.5), plot.background = element_rect(fill = "white")) plt_final <- gridExtra::grid.arrange(plt1, plt2, plt3, plt4, plt5, plt6, nrow = 1, top = "Number of times a country has been included in an epi study (Continent Wise)") plt_final + ggtitle("foo") #> Population pollution figure (v3): trying a continuous but binned scale and adding Ken's figure---------- # population v/s pollution foo <- aqli_color %>% group_by(pm2021) %>% summarise(count_pm2021 = sum(population, na.rm = TRUE), count_pm2021_millions = round(count_pm2021/1000000, 1)) %>% ggplot() + geom_hline(yintercept = c(300, 600, 900, 1200), linetype = "dashed", color = "lavenderblush2") + geom_histogram(mapping = aes(x = pm2021, weight = count_pm2021_millions), fill = "#8FA1AD", color = "white", alpha = 0.7, width = 0.5) + ggthemes::theme_hc() + labs(x = expression("Annual Average" ~ PM[2.5] ~ "(in µg/m³)"), y = "Population (in millions)") + scale_y_continuous(breaks = seq(0, 1500, 300)) + scale_x_continuous(breaks = seq(0, 140, 15), limits = c(0, 140)) + theme(axis.line = element_line(), panel.grid.major.y = element_blank(), axis.title = element_text(size = 11)) foo_v2 <- aqli_color %>% group_by(pm2021) %>% summarise(count_pm2021 = sum(population, na.rm = TRUE), count_pm2021_millions = round(count_pm2021/1000000, 1)) %>% ggplot() + geom_hline(yintercept = c(500, 1000, 1500, 2000, 2500), linetype = "dashed", color = "lavenderblush2") + geom_col(mapping = aes(x = pm2021, y = count_pm2021_millions), fill = "#8FA1AD", alpha = 0.7, width = 0.9) + ggthemes::theme_hc() + labs(x = expression("Annual Average" ~ PM[2.5] ~ "(in µg/m³)"), y = "Population (in millions)") + scale_x_binned(breaks = seq(0, 140, 15), limits = c(0, 140)) + scale_y_continuous(breaks = seq(0, 2500, 500)) + theme(axis.line = element_line(), panel.grid.major.y = element_blank(), axis.text.y = element_text(size = 13), axis.text.x = element_text(size = 13), axis.title = element_text(size = 15)) # pollution v/s number of studies foo1 <- epi %>% group_by(mean_pm2.5) %>% summarise(count_mean_pm2.5 = n()) %>% ggplot() + geom_hline(yintercept = c(5, 10, 15, 20), linetype = "dashed", color = "lavenderblush2") + geom_histogram(mapping = aes(x = mean_pm2.5, weight = count_mean_pm2.5), fill = "#343434", color = "white", alpha = 0.7, width = 0.5) + ggthemes::theme_hc() + labs(x = "", y = "Number of epi papers") + scale_y_continuous(breaks = seq(0, 30, 5)) + scale_x_continuous(breaks = seq(0, 140, 15), limits = c(0, 140)) + theme(axis.line = element_line(), panel.grid.major.y = element_blank(), axis.title = element_text(size = 11)) foo1_v2 <- epi %>% group_by(mean_pm2.5) %>% summarise(count_mean_pm2.5 = n()) %>% ggplot() + geom_hline(yintercept = c(10, 20, 30, 40), linetype = "dashed", color = "lavenderblush2") + geom_col(mapping = aes(x = mean_pm2.5, y = count_mean_pm2.5), fill = "#343434", alpha = 0.7, width = 0.9) + ggthemes::theme_hc() + labs(x = "", y = "Number of epi papers") + scale_y_continuous(breaks = seq(0, 40, 
10)) + scale_x_binned(breaks = seq(0, 140, 15), limits = c(0, 140)) + theme(axis.line = element_line(), panel.grid.major.y = element_blank(), axis.text.y = element_text(size = 13), axis.text.x = element_text(size = 13), axis.title = element_text(size = 15)) gridExtra::grid.arrange(foo1, foo) ## Ken's figure # get 4 datasets corresponding to gemm/ier/aqli/meta; for each, keep concentrations up to 120 micrograms per cubic meter, keep only the relevant columns and do some basic cleaning gemm <- haven::read_dta("./data-raw/pm2.5_distribution/gemm.dta") gemm <- gemm %>% filter(pm_gemm <= 120) %>% select(pm_gemm, gemm_lyl) %>% mutate(method = "GEMM") %>% rename(pm_val = pm_gemm, lyl_val = gemm_lyl) # manual adjustment to the first three lyl values gemm[1:3, 2] <- c(-0.49, -0.392, -0.294) ier2019 <- haven::read_dta("./data-raw/pm2.5_distribution/ier_2019.dta") ier2019 <- ier2019 %>% filter(pm_ier_2019 <= 120) %>% select(pm_ier_2019, ier_2019_lyl) %>% mutate(method = "IER-2019") %>% rename(pm_val = pm_ier_2019, lyl_val = ier_2019_lyl) meta <- haven::read_dta("./data-raw/pm2.5_distribution/meta.dta") meta <- meta %>% filter(pm_meta <= 120) %>% select(pm_meta, meta_lyl) %>% mutate(method = "META") %>% rename(pm_val = pm_meta, lyl_val = meta_lyl) aqli <- tibble(pm_val = 0:120, lyl_val = (pm_val - who_pm2.5_guideline)*0.098, method = "AQLI") # mutate(lyl_val = ifelse(lyl_val < 0, 0, lyl_val)) ## calculating empirical versions of above methods # Approximating a derivative function of gemm gemm_empirical <- splinefun(x = gemm$pm_val, y = gemm$lyl_val) # Approximating a derivative function of aqli aqli_empirical <- splinefun(x = aqli$pm_val, y = aqli$lyl_val) # meta empirical meta_empirical <- splinefun(x = meta$pm_val, y = meta$lyl_val) # ier 2019 empirical ier2019_empirical <- splinefun(x = ier2019$pm_val, y = ier2019$lyl_val) ## plot the 4 curves (empirical version) foo3 <- ggplot(data = tibble(x = 1:120), mapping = aes(x = x)) + #geom_hline(yintercept = seq(0, 0.175, 0.05), linetype = "dashed", color = "lavenderblush2") + geom_function(fun = gemm_empirical, size = 1.2, args = list(deriv = 1), mapping = aes(color = "#5B96AD")) + geom_function(fun = aqli_empirical, size = 1.2, args = list(deriv = 1), mapping = aes(color = "#8F3931")) + geom_function(fun = meta_empirical, size = 1.2, args = list(deriv = 1), mapping = aes(color = "#91AB5A")) + geom_function(fun = ier2019_empirical, size = 1.2, args = list(deriv = 1), mapping = aes(color = "#767676")) + scale_y_continuous(breaks = seq(0, 0.2, 0.05), limits = c(0, .2)) + scale_x_continuous(breaks = seq(0, 140, 15), limits = c(0, 140)) + labs(x = "", y = "Marginal Life Expectancy Lost", color = "", title = str_wrap("PM2.5 exposure in comparison with population, the number of epi papers and alternative approaches to measuring the PM2.5 life expectancy relationship.", width = 90)) + scale_color_identity(breaks = c("#8F3931", "#91AB5A", "#5B96AD", "#767676"), labels = c("AQLI", "META", "GEMM", "IER-2019"), guide = "legend") + ggthemes::theme_hc() %+replace% theme(legend.position = c(0.8, 0.9), legend.direction = "horizontal", axis.line.x = element_line(), axis.line.y = element_line(), plot.title = element_text(hjust = 0.5, size = 16, color = "black"), axis.text.y = element_text(size = 13), axis.text.x = element_text(size = 13), axis.title = element_text(size = 15), legend.text = element_text(size = 10), plot.caption = element_text(face = "italic", hjust = 0), legend.title = element_text(hjust = 0.5), panel.grid.major.y = element_blank(), legend.key.size = unit(0.5, "cm"))
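Because the AQLI relationship above is linear (0.098 life-years lost per µg/m³ above the WHO guideline), its spline derivative should be flat at roughly 0.098, which gives a quick sanity check on the geom_function(args = list(deriv = 1)) approach used for the marginal curves. A minimal sketch, reusing the aqli_empirical spline defined in the chunk above:

# quick check: the derivative of the linear AQLI curve is ~0.098 everywhere
round(aqli_empirical(seq(10, 110, by = 20), deriv = 1), 3)
# values near 0.098 confirm that the "Marginal Life Expectancy Lost" axis is in
# life-years lost per additional microgram per cubic meter of PM2.5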
gridExtra::grid.arrange(foo3,foo1_v2, foo_v2, heights = c(1, 1, 1)) # Some stats for inputting China's positive story in the epi piece---------------------- # cumulative sum of all epi studies in China over the years cum_sum_studies_chronological <- epi %>% filter(country == "China") %>% arrange(publishing_year) %>% count(country, publishing_year) %>% pull(n) %>% cumsum() %>% as_tibble() # naming the column for cumulative studies colnames(cum_sum_studies_chronological) <- "cumulative_sum_studies" # adding the cumulative study column to the summary china_studies_year_summary <- cbind(epi %>% filter(country == "China") %>% arrange(publishing_year) %>% count(country, publishing_year), cum_sum_studies_chronological) # adding pre-2010 (up till 1998) studies (total = 0) to the above tibble china_studies_pre2010 <- tibble(country = "China", publishing_year = 1998:2009, n = 0, cumulative_sum_studies = 0) china_studies_year_summary <- china_studies_year_summary %>% bind_rows(china_studies_pre2010) foo <- china_studies_year_summary %>% pivot_longer(n:cumulative_sum_studies, names_to = "sum_type", values_to = "value") %>% mutate(sum_type = ifelse(sum_type == "n", "Number of studies", "Cumulative sum of studies")) %>% ggplot() + geom_line(mapping = aes(x = publishing_year, y = value, color = sum_type), lwd = 1) + scale_x_continuous(breaks = c(seq(1998, 2020, 2), 2021, 2023), limits = c(1998, 2023)) + scale_y_continuous(breaks = seq(0, 20, 2)) + labs(x = "Publishing Year", y = "Studies Count", title = "Epi air pollution studies published in China, 1998 to 2023", color = "", caption = "* Sources: (a) Does the Squeaky Wheel Get More Grease? The Direct and Indirect Effects of Citizen Participation on Environmental Governance in China, Buntaine et al. (b) AQLI epi meta analysis.") + ggthemes::theme_tufte() + theme(plot.title = element_text(size = 22, hjust = 0.5, margin = margin(b = 0.8, unit = "cm")), plot.subtitle = element_text(size = 13, hjust = 0.5, margin = margin(b = 0.5, unit = "cm")), axis.title.x = element_text(size = 15, margin = margin(t = 0.7, b = 1, unit = "cm")), axis.title.y = element_text(size = 15, margin = margin(r = 0.7, unit = "cm")), axis.ticks.y = element_blank(), axis.text = element_text(size = 14), legend.position = "bottom", legend.box.background = element_rect(color = "black"), legend.text = element_text(size = 15), plot.caption = element_text(size = 9.5, hjust = 0, face = "italic", margin = margin(t = 1, b = 2, unit = "cm"))) + annotate("segment", x = 2013, xend = 2013, y = 0, yend = 20, linetype = "dashed") + annotate("segment", x = 2014, xend = 2014, y = 0, yend = 20, linetype = "dashed") + annotate("segment", x = 2017, xend = 2017, y = 0, yend = 20, linetype = "dashed") + annotate("segment", x = 2017, xend = 2017, y = 0, yend = 20, linetype = "dashed") + annotate("point", x = 2023, y = 16, label = "2023", size = 3) + annotate("point", x = 2023, y = 1, label = "2023", size = 3, color = "yellow") + viridis::scale_color_viridis(discrete = TRUE) # annotate("text", x = 2022, y = 45, label = "Satellite data not available") + # annotate("rect", xmin = 2021, xmax = 2023, ymin = 60, ymax = 60) # China's pollution over time 1998 to 2021 foo1 <- aqli_color %>% filter(country == "China") %>% group_by(country) %>% mutate(pop_weights = population/sum(population, na.rm = TRUE)) %>% dplyr::mutate(across(dplyr::starts_with("pm"), ~(.x*pop_weights), .names = "{col}_pop_weighted")) %>% dplyr::summarise(across(dplyr::contains("pop_weighted"), ~(round(sum(.x, na.rm = TRUE), 2)), .names = 
"avg_{col}"), total_population = sum(population, na.rm = TRUE), objectid_color = objectid_color[1], whostandard = whostandard[1], .groups = "keep") %>% dplyr::rename_with(~str_replace_all(.x, "(_pop_weighted)|(avg_)", ""), dplyr::contains("_pop_weighted")) %>% pivot_longer(pm1998:pm2021, names_to = "year", values_to = "pm2.5_value") %>% mutate(year = as.numeric(str_remove(year, "pm"))) %>% select(country, year, pm2.5_value) %>% ggplot2::ggplot() + ggplot2::geom_line(mapping = ggplot2::aes(x = year, y = pm2.5_value), lwd = 1.1, color = "darkred") + geom_hline(mapping = aes(yintercept = 35), linetype = "dashed") + ggplot2::scale_x_continuous(breaks = c(seq(1998, 2020, 2), 2021, 2023), limits = c(1998, 2023)) + scale_y_continuous(breaks = seq(0, 60, 10), limits = c(0, 60)) + ggthemes::theme_tufte() + theme(plot.title = element_text(size = 22, hjust = 0.5, margin = margin(b = 2.5, unit = "cm")), plot.subtitle = element_text(size = 13, hjust = 0.5, margin = margin(b = 0.5, unit = "cm")), axis.title.x = element_text(size = 15, margin = margin(t = 0.7, b = 0.5, unit = "cm")), axis.title.y = element_text(size = 15, margin = margin(r = 0.7, unit = "cm")), axis.ticks.y = element_blank(), axis.text = element_text(size = 14), legend.position = "bottom", legend.text = element_text(size = 15), legend.box.background = element_rect(color = "black"), plot.caption = element_text(size = 9.5, hjust = 0, face = "italic", margin = margin(t = 0.8, b = 1.5, unit = "cm"))) + ggplot2::labs(x = "Years", y = "Annual Average PM2.5 concentration (in µg/m³)", title = expression(PM[2.5] ~ "pollution in China, 1998 to 2021")) + annotate("text", x = 2018, y = 58, label = "War on pollution begins", size = 5) + annotate("text", x = 2023.5, y = 15, label = "2023", size = 3) + annotate("text", x = 2027, y = 27, label = "", size = 3) + annotate("text", x = 2009.2, y = 20, label = str_wrap("Prefectural EPAs publicize CEMS data", width = 20), size = 5) + annotate("segment", x = 2011, xend = 2013, y = 18, yend = 18) + annotate("segment", x = 2014, xend = 2016, y = 58, yend = 58) + annotate("segment", x = 2014, xend = 2014, y = 0, yend = 58, linetype = "dashed") + annotate("segment", x = 2013, xend = 2013, y = 0, yend = 18, linetype = "dashed") + annotate("text", x = 2021, y = 45, label = str_wrap("All prefectural cities operating official Weibo and WeChat accounts", width = 30), size = 5) + annotate("segment", x = 2017, xend = 2017, y = 0, yend = 45, linetype = "dashed") + annotate("text", x = 2006, y = 37, label = expression("National" ~ PM[2.5] ~ "standard: 35 µg/m³"), size = 4) + annotate("segment", x = 2017, xend = 2018.5, y = 45, yend = 45) plt <- gridExtra::grid.arrange(foo1, foo)