# app.R helper file-------------------------------------------------------------
## libraries
library(readr)
library(dplyr)
library(stringr)
library(magrittr)
library(ggplot2)
library(readr)
library(tidytext)
library(tidyr)
library(tidyverse)
library(ggthemes)
library(sf)
library(rnaturalearth)
library(rnaturalearthdata)
library(mapsf)
library(colorspace)
library(shiny)
library(shinydashboard)
library(DT)
# global variables
who_pm2.5_guideline <- 5
# global operations
`%notin%` <- Negate(`%in%`)
# epi studies analysis raw sheet
epi <- readxl::read_xlsx("./data-raw/pm2.5_distribution/AQLI_Epidemiology Literature Research.xlsx", sheet = "AnalysisDatasetPM2.5MortalityAn")
#> change default columns types
epi$cohort_size <- as.numeric(epi$cohort_size)
epi$study_start_year<- as.numeric(epi$study_start_year)
epi$study_end_year <- as.numeric(epi$study_end_year)
epi$pm2.5_exposure_ll <- as.numeric(epi$pm2.5_exposure_ll)
epi$pm2.5_exposure_ul <- as.numeric(epi$pm2.5_exposure_ul)
epi$mean_pm2.5 <- as.numeric(epi$mean_pm2.5)
epi$sd_pm2.5 <- as.numeric(epi$sd_pm2.5)
epi$cohort_age_ll <- as.numeric(epi$cohort_age_ll)
epi$cohort_age_ul <- as.numeric(epi$cohort_age_ul)
# AQLI color file and gadm0 file
aqli_color <- read_csv("./data-raw/pm2.5_distribution/[june2023]gadm2_aqli_2021_post_waterbody_adj_finalized_internal.csv")
gadm0_aqli_2021 <- read_csv("./data-raw/pm2.5_distribution/[june2023]gadm0_aqli_2021_post_waterbody_adj_finalized_internal.csv")
# aqli_color <- aqli_color %>%
# dplyr::rename_with(~str_replace(.x, "who", "llpp_who_"), dplyr::contains("who")) %>%
# dplyr::rename_with(~str_replace(.x, "nat", "llpp_nat_"), dplyr::contains("nat")) %>%
# dplyr::rename(country = name0) %>%
# dplyr::mutate(whostandard = who_pm2.5_guideline) %>%
# dplyr::select(objectid_gadm2:population, whostandard, everything())
aqli_color <- aqli_color %>%
dplyr::mutate(objectid_color = objectid_gadm2)
# getting a country continent file
country_continent <- read_csv("./data-raw/pm2.5_distribution/country_continent.csv")
# adding a continent column to the color file
aqli_color <- aqli_color %>%
left_join(country_continent, by = "country") %>%
select(objectid_color, continent, country, everything())
gadm0_aqli_2021 <- gadm0_aqli_2021 %>%
left_join(country_continent, by = "country")
#> Filling in missing continents-----------------------------------------------
# countries for which continent is NA: for these fill in the continent manually
countries_with_missing_continent <- gadm0_aqli_2021 %>% filter(is.na(continent)) %>% pull(country) %>% unique()
# continent fill in for missing coutries
continents_for_missing_countries <- c("Africa", "Europe", "North America", "Asia", "Asia",
"Oceania", "Africa", "Africa", "Africa", "Asia")
# [CAUTION: perform a sanity check on the above 2 vectors and how they map countries to continents before proceeding]
#creating a data frame using the above 2 vectors as columns
missing_continents_df <- tibble(country = countries_with_missing_continent,
continent = continents_for_missing_countries)
# adding in the missing continent information in the gadmx_aqli_2021 datasets
aqli_color <- aqli_color %>%
left_join(missing_continents_df, by = "country") %>%
mutate(continent = ifelse(is.na(continent.x), continent.y, continent.x))
#> setting report parameters
min_sample_size <- 1000
min_study_duration_in_years <- 1
#> Filtering the epi database (minimum cohort size: >1000, minimum study duration: > 1 year): commented for now.
epi <- epi %>%
mutate(study_duration = (study_end_year - study_start_year) + 1) %>%
filter(cohort_size >= min_sample_size, study_duration >= min_study_duration_in_years, !is.na(cohort_uid)) %>%
distinct(cohort_uid, .keep_all = TRUE)
# creating this temp dataset for the continent panel dataset
epi_tmp <- epi %>%
mutate(continent = ifelse(country == "China", "China", continent),
continent = ifelse(continent == "Asia", "Asia - China", continent))
# continent list (specifucally for the continent panel graph, that contains the Asia and Asia - China panels, and considers them as continents in a
# separate epi_tmp dataset, that is specially created for this graph)
continent_list <- c("China", "Asia - China", "Africa", "North America", "South America", "Europe", "Oceania")
# missing continents
missing_continents_logical <- continent_list %notin% unique(epi_tmp$continent)
missing_continents <- continent_list[missing_continents_logical]
#> Calculations needed for all sections above the "Results section".
# percent of population not in compliance with the WHO guideline
num_people_above_who <- aqli_color %>%
filter(pm2020 > who_pm2.5_guideline) %>%
summarise(tot_pop = sum(population, na.rm = TRUE)) %>%
unlist()
# world population
world_pop <- aqli_color %>%
summarise(tot_pop = sum(population, na.rm = TRUE)) %>%
unlist()
percent_people_above_who <- round((num_people_above_who/world_pop)*100, 1)
# pop pollution graph with 2 buckets: function.
pop_pol_graph_2_buckets <- function(aqli_color, epi, thresh_ll, thresh_ul, pm2.5_col_name){
if(thresh_ll > thresh_ul){
stop("Lower Limit should be less than or equal to the Upper limit! Please try again!")
}
# aqli color population in ordered pm2.5 buckets
aqli_color_grp_pm2.5_buckets <- aqli_color %>%
dplyr::filter(!is.na(!!as.symbol(pm2.5_col_name))) %>%
dplyr::mutate(region = ifelse((!!as.symbol(pm2.5_col_name) >= thresh_ll) & (!!as.symbol(pm2.5_col_name) <= thresh_ul), stringr::str_c(thresh_ll, "-", thresh_ul), !!as.symbol(pm2.5_col_name)),
region = ifelse(!!as.symbol(pm2.5_col_name) > thresh_ul, stringr::str_c(">", thresh_ul), region),
region = ifelse(!!as.symbol(pm2.5_col_name) < thresh_ll, stringr::str_c("<", thresh_ll), region)) %>%
dplyr::group_by(region) %>%
dplyr::summarise(tot_pop = sum(population, na.rm = TRUE)) %>%
dplyr::ungroup() %>%
dplyr::mutate(order_pollution_group = ifelse(region == stringr::str_c("<", thresh_ll), 1, 0),
order_pollution_group = ifelse(region == stringr::str_c(thresh_ll, "-", thresh_ul), 2, order_pollution_group),
order_pollution_group = ifelse(region == stringr::str_c(">", thresh_ul), 3, order_pollution_group)) %>%
dplyr::ungroup() %>%
dplyr::mutate(tot_pop_prop = (tot_pop/sum(tot_pop))*100)
# epi total number of studies in ordered pollution buckets (initially filtering out pooled studies, will add back in using a for loop)
epi_num_studies_grp_pm2.5_buckets <- epi %>%
dplyr::filter(!is.na(mean_pm2.5), mean_pm2.5 != "NA", non_pm2.5 == 0) %>%
dplyr::group_by(paper_uid) %>%
dplyr::summarise(mean_pm2.5 = mean(mean_pm2.5, na.rm = TRUE)) %>%
dplyr::mutate(region = ifelse(mean_pm2.5 >= thresh_ll & mean_pm2.5 <= thresh_ul, stringr::str_c(thresh_ll, "-", thresh_ul), mean_pm2.5),
region = ifelse(mean_pm2.5 > thresh_ul, stringr::str_c(">", thresh_ul), region),
region = ifelse(mean_pm2.5 < thresh_ll, stringr::str_c("<", thresh_ll), region)) %>%
dplyr::mutate(order_pollution_group = ifelse(region == stringr::str_c("<", thresh_ll), 1, 0),
order_pollution_group = ifelse(region == stringr::str_c(thresh_ll, "-", thresh_ul), 2, order_pollution_group),
order_pollution_group = ifelse(region == stringr::str_c(">", thresh_ul), 3, order_pollution_group)) %>%
dplyr::group_by(region) %>%
dplyr::summarise(tot_studies = dplyr::n(), order_pollution_group = order_pollution_group[1]) %>%
dplyr::ungroup() %>%
dplyr::mutate(tot_studies_prop = (tot_studies/sum(tot_studies))*100)
#> bar chart version
# creating a master dataset for both population and studies data
pop_epi_studies_data <- aqli_color_grp_pm2.5_buckets %>%
dplyr::left_join(epi_num_studies_grp_pm2.5_buckets, by = c("region", "order_pollution_group")) %>%
dplyr::select(region, order_pollution_group, tot_pop_prop, tot_studies_prop) %>%
tidyr::pivot_longer(cols = tot_pop_prop:tot_studies_prop, names_to = "type_of_prop", values_to = "val") %>%
dplyr::mutate(type_of_prop = ifelse(type_of_prop == "tot_pop_prop", "Percent Population in PM₂.₅ bucket", "Percent Studies in PM₂.₅ bucket"))
pop_num_studies_in_pollution_buckets_graph <- pop_epi_studies_data %>%
ggplot2::ggplot() +
ggplot2::geom_col(mapping = aes(x = forcats::fct_reorder(region, order_pollution_group), y = val, fill = type_of_prop), position = position_dodge(), width = 0.3) +
ggplot2::scale_y_continuous(breaks = seq(0, 100, 10)) +
ggplot2::scale_fill_manual(values = c("tot_pop_prop" = "grey", "tot_studies_prop" = "cornflowerblue"), labels = c("Proportion of World Population in Bucket", "Proportion of Studies Completed in Bucket")) +
ggplot2::labs(x = expression("Mean" ~ PM[2.5] ~ "bucket (in µg/m³)"), y = "Percentage", fill = "",
title = expression("Percentage of Global population in different" ~ PM[2.5] ~ "buckets"),
caption = stringr::str_wrap("*This graph 'only' takes into account the PM2.5 specific studies. For multi-country (pooled) studies, it averages the mean PM2.5 values, across all countries."), width = 10) +
# ggplot2::geom_text(mapping = aes(x = forcats::fct_reorder(region, order_pollution_group), y = val, label = paste0(round(val, 1), "%")), position = ggplot2::position_dodge2(width = 1, preserve = "single"), vjust = -0.5, hjust = -0.2, size = 3) +
ggthemes::theme_tufte() +
ggplot2::theme(axis.line.y = element_line(color = "black"),
axis.line.x = element_line(color = "black"),
plot.caption = element_text(size = 8, hjust = 0, margin = margin(t = 0.7, unit = "cm")),
plot.title = element_text(size = 16, margin = margin(b = 1, unit = "cm"), hjust = 0.5),
plot.caption.position = "plot",
axis.title.y = element_text(margin = margin(r = 0.5, unit = "cm"), size = 14),
axis.title.x = element_text(margin = margin(t = 0.5, b = 0.5, unit = "cm"), size = 14),
axis.text = element_text(size = 12),
legend.box.background = element_rect(color = "black"),
legend.position = "bottom",
legend.text = element_text(size = 12)) +
viridis::scale_fill_viridis(discrete = TRUE)
return(pop_num_studies_in_pollution_buckets_graph)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.