library(tidyverse)
library(here)
devtools::load_all()
data(demo_area_hierarchy)
#' ## Data sources
#'
#' Malawi 2018 Census population tables
#' Accessed 30 August 2019
#'
#'
## dir.create(here("data-raw/population"))
## download.file("http://www.nsomalawi.mw/images/stories/data_on_line/demography/census_2018/2018%20MPHC%20Published%20Tables/Series%20A.%20Population%20Tables.xlsx",
## here("data-raw/population", "Series A. Population Tables.xlsx"))
readxl::excel_sheets(here("data-raw/population", "Series A. Population Tables.xlsx"))
#' Table A3: Population by sex, district, residence
#' - Only read total (both urban/rural) population by sex.
#' - Possibly opportunity to refine with urban/rural data later
a3 <- readxl::read_excel(here("data-raw/population", "Series A. Population Tables.xlsx"),
sheet = "A3", skip = 2) %>%
select(1, 3:4) %>%
rename(name = 1, male = 2, female = 3) %>%
filter(!is.na(name)) %>%
mutate(
area_level = case_when(name == "Malawi" ~ 0,
name %in% c("Northern", "Central", "Southern") ~ 1,
TRUE ~ 4),
area_name = name
) %>%
left_join(demo_area_hierarchy %>% filter(area_level %in% c(0, 1, 4))) %>%
gather(sex, pop_a3, male, female)
a3 %>% count(sex, area_level, wt = pop_a3)
#' Table A5: Population by sex, single-year age, and region
#' - Lowest geographic stratification for population by age and sex
a5 <- Map(readxl::read_excel,
path = here("data-raw/population", "Series A. Population Tables.xlsx"),
sheet = "A5",
skip = 3 + 0:3 * 101,
n_max = 98) %>%
lapply(select, c(1, 3:4)) %>%
lapply(rename, age = 1, male = 2, female = 3) %>%
Map(mutate, ., area_name = list("Malawi", "Northern", "Central", "Southern")) %>%
bind_rows %>%
gather(sex, pop_a5, male, female) %>%
left_join(demo_area_hierarchy %>% filter(area_level %in% 0:1))
a5aggr <- a5 %>%
filter(age != "Total") %>%
mutate(age = sub("\\+", "", age) %>% utils::type.convert,
age_group = cut(age, c(0, 1, 1:18*5, Inf),
c("Less than 1 Year", "1-4", paste0(1:17*5, "-", 2:18*5 - 1), "90+"),
TRUE, FALSE)) %>%
count(area_id, area_level, area_name, sex, age_group, wt = pop_a5, name = "pop_a5")
#' Table A6: Population by age and district (not sex)
a6 <- Map(readxl::read_excel,
path = here("data-raw/population", "Series A. Population Tables.xlsx"),
sheet = "A6",
skip = c(2, 27, 51),
n_max = 22) %>%
lapply(rename, age_group = 1) %>%
lapply(gather, name, pop_a6, -age_group) %>%
bind_rows() %>%
mutate(area_name = sub(" Total", "", name) %>%
recode("Nklhotakota" = "Nkhotakota")) %>%
left_join(demo_area_hierarchy %>% filter(area_level %in% c(1, 4)))
#' Initial population: district by age disaggregated by region sex ratio by age
cens18 <- spread_areas(demo_area_hierarchy) %>%
left_join(
a6 %>%
filter(area_level == 4, age_group != "Total") %>%
select(area_id, age_group, pop_a6)
,
by = c("area_id4" = "area_id")
) %>%
left_join(
a3 %>%
filter(area_level == 4) %>%
select(area_id, sex, pop_a3)
,
by = c("area_id4" = "area_id")
) %>%
left_join(
a5aggr %>%
filter(area_level == 1) %>%
select(area_id, sex, age_group, pop_a5)
,
by = c("area_id1" = "area_id", "sex", "age_group")
) %>%
mutate(population = pop_a6)
#' Use inverse proportional fitting to adjust district population to match:
#' - Region population by age/sex (A5)
#' - District population by sex (A3)
#' - District population by age (A6)
for(i in 1:5) {
cens18 <- cens18 %>%
group_by(area_id1, sex, age_group) %>%
mutate(ratio_a5 = pop_a5 / sum(population),
population = population * ratio_a5) %>%
group_by(area_id4, sex) %>%
mutate(ratio_a3 = pop_a3 / sum(population),
population = population * ratio_a3) %>%
group_by(area_id4, age_group) %>%
mutate(ratio_a6 = pop_a6 / sum(population),
population = population * ratio_a6) %>%
ungroup
}
cens18 %>%
summarise(a3 = max(abs(log(ratio_a3))),
a5 = max(abs(log(ratio_a5))),
a6 = max(abs(log(ratio_a6))))
cens18 <- cens18 %>%
select(area_id, sex, age_group, population)
#' Malawi NSO Population Projections 2008-2030
#' Source: http://www.nsomalawi.mw/images/stories/data_on_line/demography/census_2008/Main%20Report/ThematicReports/Population%20Projections%20Malawi.pdf
#' Accessed: 30 August 2019
#' Excel sheets transcription obtained from Malawi DHA
nso <- here::here("data-raw", "population", "Pop projections 2008-2030 Master file_transformed.xlsx") %>%
readxl::read_excel("dataset") %>%
gather(key, value, -year, -agegr) %>%
rename(age_group_label = agegr) %>%
mutate(area_name = sub("(.*) - (.*)", "\\1", key),
sex = tolower(sub("(.*) - (.*)", "\\2", key))) %>%
filter(area_name != "Total",
age_group_label != "Total",
sex != "total") %>%
left_join(get_age_groups() %>% select(age_group, age_group_label)) %>%
left_join(
demo_area_hierarchy %>%
filter(area_level == 4) %>%
select(area_id, area_name)
) %>%
count(area_id, year, sex, age_group, wt = value, name = "population")
count(nso, area_id) %>% print(n = Inf)
count(nso, sex)
count(nso, age_group)
#' ## Adjust projection to 2018 census population
#' - Log-linear interpoloation of adjustment ratio between 2008 to 2018
#' - Assume 2018 ratio for future years.
#' - Assume 1.0 ratio for before 2008
cens18adj <- nso %>%
filter(year == 2018) %>%
left_join(
cens18 %>%
mutate(age_group_label = age_group %>%
recode("Less than 1 Year" = "0-4",
"1-4" = "0-4",
"80-84" = "80+",
"85-89" = "80+",
"90+" = "80+"),
age_group = NULL) %>%
left_join(get_age_groups() %>% select(age_group_label, age_group)) %>%
count(area_id,sex, age_group, wt = population, name = "cens18adj")
) %>%
mutate(cens18adj = cens18adj / population,
year = NULL,
population = NULL)
#' Review the adjustments
#' - Urban population for adjult 15-64 MUCH smaller than projections, especially
#' for men.
#' - Age 0-4 population much smaller: probably a combination of lower than
#' projected fertility and undercount of U5 population.
cens18adj %>%
left_join(demo_area_hierarchy %>% select(area_id, area_name, area_sort_order)) %>%
left_join(get_age_groups() %>% select(age_group, age_group_label, age_group_sort_order)) %>%
mutate(area = fct_reorder(area_name, area_sort_order),
age_group = fct_reorder(age_group_label, age_group_sort_order)) %>%
ggplot(aes(age_group, cens18adj, color = sex, group = sex)) +
geom_hline(yintercept = 1.0, linetype = "dashed") +
geom_step() +
scale_y_log10() +
coord_cartesian(ylim = c(0.6, 2.0)) +
facet_wrap(~area, ncol = 8) +
theme_light() +
theme(legend.position = "bottom",
axis.text.x = element_text(hjust = 1.0, angle = 90)) +
ggtitle("Ratio of 2018 Census to 2008 population projections")
## ggsave("demo_census2018_projections2008_comparison.pdf", h=8.5, w=16)
population_agesex <- nso %>%
filter(year %in% 2008:2025) %>%
left_join(cens18adj) %>%
mutate(
source = "Census 2018",
population = population * exp(log(cens18adj) * pmax((year - 2008) / (2018 - 2008), 0.0)),
cens18adj = NULL,
calendar_quarter = convert_calendar_quarter(year, 2),
year = NULL
) %>%
select(area_id, source, calendar_quarter, sex, age_group, population)
#' ## Add ASFR column
asfr <- read_csv(here("data-raw/population/mwi-demo-asfr.csv"))
population_agesex <- population_agesex %>%
left_join(
demo_area_hierarchy %>%
select(area_id, area_name)
) %>%
left_join(
asfr %>%
filter(area_level == 4) %>%
mutate(area_name = recode(area_name, "Nkhatabay" = "Nkhata Bay"),
sex = "female",
asfr = median,
median = NULL,
lower = NULL,
upper = NULL)
) %>%
mutate(area_level = NULL)
#' ## Save datasets
demo_population_agesex <- population_agesex
usethis::use_data(demo_population_agesex, overwrite=TRUE)
write_csv(population_agesex, here("inst/extdata/demo_population_agesex.csv"), na = "")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.