# Activate libraries ------------------------------------------------------
library(dplyr)
library(ggplot2)
library(broom)
library(segmented)
library(readr)
library(reshape2)
library(tidyr)
# Import base data --------------------------------------------------------
# Registered births in Mexico by state and by year of registration
# To obtain the information:
# - "Entidad y Municipio de registro” was selected in Características del nacimiento
# - “Sexo” was selected in Características del registrado.
# Source: https://www.inegi.org.mx/sistemas/olap/proyectos/bd/continuas/natalidad/nacimientos.asp?s=est&c=23699&proy=nat_nac
#
# In the original dataset, there is the category "Not specified" in sex variable
# this category is available but was not included
df_muj_base <- read_csv(file = "data-raw/births/Mujeres_reg.csv",
locale = locale(encoding = "latin1"))
df_hom_base <- read_csv(file = "data-raw/births/Hombres_reg.csv",
locale = locale(encoding = "latin1"))
# # Not specified sex
# df_ne_base <- read_csv(file = "data-raw/births/No_especificado_reg.csv",
# locale = locale(encoding = "latin1"))
# Modify Data -------------------------------------------------------------
# Convert from wide to long format
df_muj_long <- df_muj_base %>%
melt(id.vars = c("State_id", "State"),
variable.name = "Year",
value.name = "Births") %>%
mutate(Sex = "Female") %>%
as_tibble()
df_hom_long <- df_hom_base %>%
melt(id.vars = c("State_id", "State"),
variable.name = "Year",
value.name = "Births") %>%
mutate(Sex = "Male") %>%
as_tibble()
# Check if columns have NA
# apply(X = df_muj_long, MARGIN = 2, FUN = function(x) sum(is.na(x)))
# apply(X = df_hom_long, MARGIN = 2, FUN = function(x) sum(is.na(x)))
colSums(x = is.na(df_hom_long))
colSums(x = is.na(df_muj_long))
# Join both dataframes
df_tot_long_0 <- df_hom_long %>%
bind_rows(df_muj_long) %>%
mutate(year = as.numeric(levels(Year)[Year])) %>%
arrange(Year, State, Sex)
# Create Total category in Sex Variable
df_births_INEGI_0 <- df_tot_long_0 %>%
group_by(State_id, State, Year, year) %>%
mutate(prop = Births/sum(Births)) %>%
ungroup() %>%
bind_rows(df_tot_long_0 %>%
group_by(State_id, State, Year, year) %>%
summarise(Births = sum(Births)) %>%
mutate(Sex = "Total")) %>%
arrange(year, State, Sex) %>%
mutate(prop = replace_na(prop, replace = 1))
# Format rename columns
df_births_INEGI_1 <- df_births_INEGI_0 %>%
rename(CVE_GEO = State_id,
state = State,
sex = Sex,
births = Births,
birth_prop = prop) %>%
dplyr::select(year, state, sex, CVE_GEO, births, birth_prop)
## CVE_GEO codes
df_births_INEGI_2 <- df_births_INEGI_1 %>%
mutate(state = replace(state, CVE_GEO == 1, "Aguascalientes" ),
state = replace(state, CVE_GEO == 2, "Baja California" ),
state = replace(state, CVE_GEO == 3, "Baja California Sur"),
state = replace(state, CVE_GEO == 4, "Campeche" ),
state = replace(state, CVE_GEO == 5, "Coahuila" ),
state = replace(state, CVE_GEO == 6, "Colima" ),
state = replace(state, CVE_GEO == 7, "Chiapas" ),
state = replace(state, CVE_GEO == 8, "Chihuahua" ),
state = replace(state, CVE_GEO == 9, "Mexico City" ),
state = replace(state, CVE_GEO == 10, "Durango" ),
state = replace(state, CVE_GEO == 11, "Guanajuato" ),
state = replace(state, CVE_GEO == 12, "Guerrero" ),
state = replace(state, CVE_GEO == 13, "Hidalgo" ),
state = replace(state, CVE_GEO == 14, "Jalisco" ),
state = replace(state, CVE_GEO == 15, "State of Mexico" ),
state = replace(state, CVE_GEO == 16, "Michoacan" ),
state = replace(state, CVE_GEO == 17, "Morelos" ),
state = replace(state, CVE_GEO == 18, "Nayarit" ),
state = replace(state, CVE_GEO == 19, "Nuevo Leon" ),
state = replace(state, CVE_GEO == 20, "Oaxaca" ),
state = replace(state, CVE_GEO == 21, "Puebla" ),
state = replace(state, CVE_GEO == 22, "Queretaro" ),
state = replace(state, CVE_GEO == 23, "Quintana Roo" ),
state = replace(state, CVE_GEO == 24, "San Luis Potosi" ),
state = replace(state, CVE_GEO == 25, "Sinaloa" ),
state = replace(state, CVE_GEO == 26, "Sonora" ),
state = replace(state, CVE_GEO == 27, "Tabasco" ),
state = replace(state, CVE_GEO == 28, "Tamaulipas" ),
state = replace(state, CVE_GEO == 29, "Tlaxcala" ),
state = replace(state, CVE_GEO == 30, "Veracruz" ),
state = replace(state, CVE_GEO == 31, "Yucatan" ),
state = replace(state, CVE_GEO == 32, "Zacatecas" ),
state = replace(state, CVE_GEO == 0, "National")) %>%
mutate(state = as.factor(state))
# # Age == 200 is the population of all ages
# df_pop_filt <- df_mortrate_state_age_sex %>%
# bind_rows(df_mortrate_state_age_sex %>%
# group_by(year, state, sex, CVE_GEO) %>%
# mutate(population = sum(population),
# age = 200))
# Obtain the total population of each state by sex
df_pop_states <- df_mortrate_state_age_sex %>%
filter(state != "MCMA") %>%
group_by(year, state, sex, CVE_GEO) %>%
summarise(population = sum(population)) %>%
ungroup()
# Include population information
df_births_INEGI <- df_births_INEGI_2 %>%
left_join(y = df_pop_states,
by = c("year", "state", "CVE_GEO", "sex")) %>%
mutate(birth_rate = births/population)
# # Save data
# usethis::use_data(df_births_INEGI, overwrite = TRUE)
# Visualizations ----------------------------------------------------------
df_births_INEGI_0
df_tot_long %>%
filter(State == "Total",
Sex != "Total") %>%
ggplot(aes(x = year, y = Births, fill = Sex)) +
geom_bar(position = "fill", stat = "identity")
df_tot_long %>%
filter(State == "Total") %>%
ggplot(aes(x = year, y = Births, color = Sex)) +
geom_line()
plt_male_birts_prop <- df_births_INEGI_0 %>%
filter(Sex == "Male",
State == "Total") %>%
ggplot(aes(x = year, y = prop)) +
geom_line() +
geom_abline(slope = 0, intercept = 0.5, linetype = "dashed", color = "blue") +
scale_x_continuous(breaks = seq(from = 1985, to = 2020, by = 5),
minor_breaks = seq(from = 1985, to = 2020, by = 1)) +
scale_y_continuous(limits = c(0.49, 0.51),
breaks = seq(from = 0.49, to = 0.51, by = 0.002)) +
theme_bw() +
theme(panel.grid.major.x = element_line(size = 1.5)) +
labs(title = "Proportion of male births",
subtitle = "Mexico, 1985 - 2020",
caption = "INEGI, 2020")
# ggsave(plt_male_birts_prop,
# filename = "figs/plt_male_births_prop.png",
# width = 9, height = 5)
# df_tot_long %>%
# filter(Sex == "Male",
# State == "Total") %>%
# ggplot(aes(x = year, y = prop)) +
# geom_line() +
# geom_abline(slope = 0, intercept = 0.5, linetype = "dashed", color = "blue") +
# scale_x_continuous(breaks = seq(from = 1985, to = 2020, by = 5),
# minor_breaks = seq(from = 1985, to = 2020, by = 1)) +
# scale_y_continuous(limits = c(0.49, 0.51),
# breaks = seq(from = 0.49, to = 0.51, by = 0.002)) +
# theme_bw() +
# theme(panel.grid.major.x = element_line(size = 1.5)) +
# labs(title = "Proportion of male births",
# subtitle = "Mexico, 1985 - 2020",
# caption = "INEGI, 2020")
# df_tot_long %>%
# filter(Sex == "Male",
# State == "Total") %>%
# ggplot(aes(x = year, y = prop)) +
# geom_line() +
# geom_abline(slope = 0, intercept = 0.5, linetype = "dashed", color = "blue") +
# scale_x_continuous(breaks = seq(from = 1985, to = 2020, by = 5),
# minor_breaks = seq(from = 1985, to = 2020, by = 1)) +
# scale_y_continuous(limits = c(0.49, 0.51),
# breaks = seq(from = 0.49, to = 0.51, by = 0.002)) +
# theme_bw() +
# theme(panel.grid.major.x = element_line(size = 1.5)) +
# labs(title = "Proportion of male births",
# subtitle = "Mexico, 1985 - 2020",
# caption = "INEGI, 2020")
plt_male_births_prop_states <- df_births_INEGI_0 %>%
filter(Sex == "Male",
State != "Total") %>%
ggplot(aes(x = year, y = prop, color = State)) +
geom_line(size = 0.7) +
geom_abline(slope = 0, intercept = 0.5, linetype = "dashed", color = "blue") +
scale_x_continuous(breaks = seq(from = 1985, to = 2020, by = 5),
minor_breaks = seq(from = 1985, to = 2020, by = 1)) +
theme_bw() +
theme(panel.grid.major.x = element_line(size = 1.5),
legend.position = "bottom") +
labs(title = "Proportion of males",
subtitle = "Mexico states, 1985 - 2020")
# df_births_states <- df_births_INEGI_0 %>%
# filter(Sex == "Male",
# State != "Total")
#
# plty_prop_males_states <- plot_ly(data = df_births_states, x = ~year, y = ~prop, color = ~State,
# type = "scatter", mode = "lines+markers",
# line = list(width = 1.3),
# marker = list(size = 4)) %>%
# layout(showlegend = T, autosize = TRUE)
#
# class(plty_prop_males_states)
#
# htmlwidgets::saveWidget(widget = plty_prop_males_states,
# file = "figs/prop_males_states.html",
# selfcontained = TRUE)
df_tot_sum <- df_tot_long %>%
filter(State != "Total",
Sex == "Male") %>%
group_by(State) %>%
summarise(max = max(prop),
min = min(prop),
difmm = max - min,
mean = mean(prop),
mode = quantile(x = prop, prob = 0.5))
df_tot_long %>%
filter(Sex == "Total",
State == "Total") %>%
ggplot(aes(x = year, y = Births)) +
theme_bw() +
geom_line() +
scale_x_continuous(breaks = seq(from = 1985, to = 2020, by = 5),
minor_breaks = seq(from = 1985, to = 2020, by = 1)) +
scale_y_continuous(breaks = seq(from = 1500000,
to = 3000000,
by = 300000),
minor_breaks = seq(from = 1500000,
to = 3000000,
by = 100000),
labels = scales::comma) +
theme(panel.grid.major.x = element_line(size = 1.5),
panel.grid.major.y = element_line(size = 1.5))
# Piecewise Models --------------------------------------------------------
aa <- df_tot_long %>%
filter(Sex == "Male",
State == "Total")
plot(x = aa$year, y = aa$prop)
points(x = 1995, y = y, col = "red")
points(x = 2010, y = y, col = "green")
## with data modifications ------------------------------------------------
# Create needed new rows for pw regression
bb <- aa %>%
mutate(knot = 2004,
dummy_knot = ifelse(year > knot, 1, 0),
pos_year = year - knot,
pos_inter = dummy_knot*pos_year)
# Fit model
fit_pw <- lm(formula = prop ~ year + pos_inter,
data = bb)
# Check Model results
summary(fit_pw)
# Extract estimates
ss <- tidy(fit_pw)
intercept <- ss$estimate[1]
c_year <- ss$estimate[2]
c_year_2 <- ss$estimate[3]
# y
## if year <= 2003
y <- intercept +c_year*1995
## if year > 2003, suppose 2010
y <- intercept + c_year*2003 + (c_year+c_year_2)*(2010 - 2003)
bb2 <- bb %>%
mutate(y_value = ifelse(test = dummy_knot == 0,
yes = intercept + c_year*year,
no = intercept + c_year*year + (c_year_2)*(year-knot)))
bb2 %>%
ggplot() +
geom_point(aes(x = 2003, y = 0.4985273), size = 3, color = "green") +
geom_line(aes(x = year, y = prop)) +
geom_line(aes(x = year, y = y_value), linetype = "dotted", size = 0.8, color = "blue") +
geom_abline(slope = 0, intercept = 0.5, linetype = "dashed", color = "red") +
scale_x_continuous(breaks = seq(from = 1985, to = 2020, by = 5),
minor_breaks = seq(from = 1985, to = 2020, by = 1)) +
scale_y_continuous(limits = c(0.49, 0.51),
breaks = seq(from = 0.49, to = 0.51, by = 0.002)) +
theme_bw() +
theme(panel.grid.major.x = element_line(size = 1.5))
## With package Segment ---------------------------------------------------
# Fit Model
fit <- lm(formula = prop ~ year,
data = aa)
segmented_fit <- segmented(fit, seg.Z = ~year, psi = 2003)
summary(segmented_fit)
plot(x = aa$year, y = aa$prop)
plot(segmented_fit, add = T)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.