knitr::opts_chunk$set(echo = TRUE, warning = FALSE)
# Load packages library(tidyverse) library(WDI) # silence dplyr summarise warnings options(dplyr.summarise.inform = FALSE)
Let's start by reading-in the obesity data (using the pre-cleaned data/obesity-cleaned.csv
file).
# Read-in and tidy obesity data file <- here::here("data-raw", "raw", "obesity-cleaned.csv") colspec <- cols( X1 = col_skip(), Country = col_character(), Year = col_double(), `Obesity (%)` = col_character(), Sex = col_character() ) obesity <- read_csv(file, col_types = colspec) %>% janitor::clean_names() %>% filter(sex != "Both sexes") %>% filter(obesity_percent != "No data") %>% mutate(rate_obesity = as.numeric(str_extract(obesity_percent, "^\\d+\\.\\d")) / 100) %>% select(-obesity_percent)
Next, let's download some data from the World Bank. We'll use (sex-disaggregated) indicators on percentage completion of basic education, smoking rates, and population.
# List World Bank indicators indicators <- c( rate_primedu.Male = "SE.PRM.CMPT.MA.ZS", rate_primedu.Female = "SE.PRM.CMPT.FE.ZS", youthpop.Female = "SP.POP.0014.FE.IN", youthpop.Male = "SP.POP.0014.MA.IN", literacy.Male = "SE.ADT.LITR.MA.ZS", literacy.Female = "SE.ADT.LITR.FE.ZS", rate_smoke.Male = "SH.PRV.SMOK.MA", rate_smoke.Female = "SH.PRV.SMOK.FE", rate_unemployed.Male = "SL.UEM.TOTL.MA.ZS", rate_unemployed.Female = "SL.UEM.TOTL.FE.ZS", lifexp.Female = "SP.DYN.LE00.FE.IN", lifexp.Male = "SP.DYN.LE00.MA.IN", pop.Male = "SP.POP.TOTL.MA.IN", pop.Female = "SP.POP.TOTL.FE.IN" ) # Download World Bank data wb <- WDI(indicator = indicators, extra = TRUE)
Next, we combine both datasets and ensure that all variables are converted to counts. This ensures that each row is a proper stratum that can be aggregated any way.
# Read-in a dictionary of mappings for country names cyfile <- here::here("data-raw", "processed", "country-ids.csv") cydict <- read_csv(cyfile) %>% mutate(pref = coalesce(world_bank, obesity, altair)) # Helper function to relabel country names fix_cy <- function(x, old) { mask <- !is.na(old) plyr::mapvalues(x, old[mask], cydict$pref[mask], FALSE) }
# Reshape world bank values to counts wb_df <- wb %>% as_tibble %>% pivot_longer(contains("."), names_to = c("indicator", "sex"), names_pattern = "(.*)\\.(.*)") %>% pivot_wider(names_from = indicator, values_from = value) %>% pivot_longer(starts_with("rate_"), names_to = "indicator", values_to = "rate", names_pattern = "_(.*)$") %>% mutate(across(rate, ~ . / 100), count = round(pop * rate)) %>% select(-rate) %>% mutate(literacy = (literacy / 100) * (pop - youthpop)) %>% pivot_wider(names_from = indicator, values_from = count) %>% mutate(across(country, fix_cy, old = cydict$world_bank)) # Combine data combo <- ob <- obesity %>% mutate(across(country, fix_cy, old = cydict$obesity)) %>% full_join(wb_df, by = c("country", "year", "sex")) %>% mutate(obese = rate_obesity * pop) %>% drop_na(pop) %>% select(-rate_obesity) %>% mutate(none = "All") %>% filter(region != "Aggregates") %>% arrange(country, sex, -year) %>% group_by(country, sex) %>% mutate(flag_smoke = if_else(is.na(smoke), "missing", "observed")) %>% fill(smoke, .direction = "updown") %>% ungroup()
# save to /data directory usethis::use_data(ob, overwrite = TRUE) usethis::use_data(cydict, overwrite = TRUE) # write to disk write_csv(ob, here::here("data-raw", "processed", "obesity-combo.csv"))
Finally, for convenience, let's create a dataset containing the latest data:
# create a version that only contains the latest year combo_latest <- combo %>% arrange(desc(year)) %>% group_by(country, sex) %>% fill(where(is.numeric), .direction = "up") %>% slice(1) %>% ungroup
Let's try visualizing a few of the relationships. Let's start with looking for an association between smoking and obesity:
rate <- function(numerator, denom) { invalid <- is.na(numerator) | is.na(denom) sum(numerator[!invalid]) / sum(denom[!invalid]) } combo_latest %>% group_by(region, country) %>% summarise(smoke_rate = rate(smoke, pop), obesity_rate = rate(obese, pop)) %>% ggplot(aes(x = smoke_rate, y = obesity_rate)) + geom_point(aes(colour = region)) + geom_smooth(method = "lm") + labs(title = "Positive Relationship Between Smoking and Obesity", x = "Rate of Smoking", y = "Rate of Obesity", colour = "Region")
Next, let's look at how obesity has changed over time.
combo %>% group_by(region, year) %>% summarise(obesity_rate = rate(obese, pop)) %>% ggplot(aes(x = year, y = obesity_rate, colour = region)) + geom_line() + labs(title = "Obesity Rates Over Time, by Region", x = "Year", y = "Rate of Obesity", colour = "Region")
A few of the values are a bit erratic. That's something to look into later.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.