In joebrew/maragra: An analysis of malaria at the Maragra sugar company, Mozambique

# csl: journal-of-health-economics.csl

# Packages
library(tidyverse)
library(knitr)
library(Hmisc)
library(brew)
library(maragra)
library(RColorBrewer)
library(extrafont)
library(kableExtra)
library(raster)
library(sp)
library(tidyverse)
library(knitr)
library(Hmisc)
library(brew)
library(maragra)
library(knitr)
library(RColorBrewer)
# library(lme4)
library(lfe)
library(restorepoint)
library(regtools)

loadfonts()
## Global options
options(max.print="75")
opts_chunk$set(echo=FALSE,
                 cache=FALSE,
               prompt=FALSE,
               fig.height = 4,
               fig.width = 5,
               tidy=TRUE,
               comment=NA,
               message=FALSE,
               warning=FALSE,
               # dev = "cairo_pdf",
               fig.pos="!h",
               fig.align = 'center')
knitr::opts_chunk$set(error = FALSE)

opts_knit$set(width=75)
options(xtable.comment = FALSE)
source('code.R')

Absences: We observed r nrow(ab_panel) unique worker-days among r length(unique(ab_panel$oracle_number)) workers. The all-period average absenteeism rate was r round(length(which(ab_panel$absent)) / nrow(ab_panel) * 100, digits = 2)%, though this rate varied widely as a function of worker department, sex, residence, and season (table).

Model results (one model)

library(broom)
broom::tidy(final_model)

# Prediction based on individual stuff
fake <- expand.grid(protection = seq(0, 1, 0.5),
                    herd_protection = seq(0, 0.6, 0.1),
                    rain_var = seq(0, 6, 1))
fe <- getfe(final_model)
predictions <- exp(predict(final_model_lm, newdata = fake, interval = 'confidence'))-1
predictions <- data.frame(predictions)
fixed_effects <- mean(fe$effect)
fake$fit <- predictions$fit + fixed_effects
fake$lwr <- predictions$lwr + fixed_effects
fake$upr <- predictions$upr + fixed_effects

n_cols <- length(unique(fake$protection))
cols <- colorRampPalette(RColorBrewer::brewer.pal(n = 8, 'Spectral'))(n_cols)
cols[2] <- 'black'
library(maragra)
ggplot(data = fake %>% filter(rain_var == 3) %>% mutate(protection = factor(protection)),
       aes(x = herd_protection,
           y = fit,
           color = protection,
           group = protection)) +
  geom_ribbon(aes(ymin = lwr, ymax = upr,
                  fill = protection),
              color = NA,
              alpha = 0.15) +
  geom_point() +
  geom_line(alpha = 0.6) +
  scale_color_manual(name = 'Individual protection',
                     values = cols) +
  scale_fill_manual(name = 'Individual protection',
                     values = cols) +
  labs(x = 'Community protection score',
       title = 'Association between community protection, individual protection, and absenteeism',
       y = 'Estimated monthly absenteeism rate\n(Average fixed effect, 3 cm of 15-60 day prior precipitation)') + theme_maragra()

Model results (segregated models)

out_list <- list()
worker_groups <- names(model_list)
for(i in 1:length(worker_groups)){
  this_group <- worker_groups[i]
  out_list[[i]] <- broom::tidy(model_list[[i]]) %>% mutate(grp = this_group)
}
out <- bind_rows(out_list)
# Get the non segrated version too
non_seg <- broom::tidy(final_model) %>% mutate(grp = 'Non segregated')
options(scipen =  '999')
out <- out %>% bind_rows(non_seg) %>% dplyr::select(term, estimate, p.value, grp) %>%
  mutate(Type = 'Distance weighted')

Radial model

out_list <- list()
worker_groups <- names(radial_model_list)
for(i in 1:length(worker_groups)){
  this_group <- worker_groups[i]
  out_list[[i]] <- broom::tidy(radial_model_list[[i]]) %>% mutate(grp = this_group)
}
out2 <- bind_rows(out_list)
# Get the non segrated version too
non_seg <- broom::tidy(final_model_radial) %>% mutate(grp = 'Non segregated')
options(scipen =  '999')
out2 <- out2 %>% bind_rows(non_seg) %>% dplyr::select(term, estimate, p.value, grp) %>%
  mutate(Type = 'Hard threshold')

Combined radial and hard threshold

combined <- bind_rows(out, out2)
combined$term <- ifelse(combined$term == 'radial_herd_protection', 'herd_protection', combined$term)
combined$term <- ifelse(combined$term == 'herd_protection', 'Community protection', combined$term)
combined$term <- ifelse(combined$term == 'rain_var', 'Lagged precipitation', combined$term)
combined$term <- ifelse(combined$term == 'protection', 'Individual protection', combined$term)

combined$term <- Hmisc::capitalize(gsub('_', ' ', combined$term))
ggplot(data = combined,
       aes(x = term,
           y = estimate,
           fill = Type)) +
  facet_wrap(~grp) +
  geom_bar(stat = 'identity',
           position = position_dodge()) +
  coord_flip() +
  geom_hline(yintercept = 0) +
  theme_maragra() +
  labs(x = 'Coefficient',
       y = 'Term',
       title = 'Comparison between distance-weighted vs. hard-threshold community protection estimation')

# Also make predictions by groups
worker_groups <- names(model_list)
the_groups <- c(worker_groups, 'All')

# Do predictions on the different groups
fake <- expand.grid(protection = seq(0, 1, 0.5),
                    herd_protection = seq(0, 1, 0.1),
                    rain_var = mean(final_data$rain_var, na.rm = TRUE),
                    group = the_groups)
fake_list <- list()
for(i in 1:length(the_groups)){
  this_worker_group <- the_groups[i]
  if(this_worker_group == 'All'){
    this_model <- final_model
    this_model_lm <- final_model_lm
    this_fake_data <- fake %>% mutate(group = 'All')
  } else {
      this_model <- model_list[[i]]
    this_model_lm <- model_list_lm[[i]]
    this_fake_data <- fake %>% dplyr::filter(group == this_worker_group)
  }

  predictions <- exp(predict(this_model_lm, newdata = this_fake_data, interval = 'confidence'))-1
  predictions <- data.frame(predictions)
  this_fe <- getfe(this_model)
  fixed_effects <- mean(this_fe$effect)
  out <- this_fake_data
  out$fit <- predictions$fit + fixed_effects
  out$lwr <- predictions$lwr + fixed_effects
  out$upr <- predictions$upr + fixed_effects
  fake_list[[i]] <- out
}
fake <- bind_rows(fake_list)

n_cols <- length(unique(fake$protection))
cols <- colorRampPalette(RColorBrewer::brewer.pal(n = 8, 'Spectral'))(n_cols)
cols[2] <- 'black'
library(maragra)
ggplot(data = fake  %>% mutate(protection = factor(protection)),
       aes(x = herd_protection,
           y = fit,
           color = protection,
           group = protection)) +
  # geom_ribbon(aes(ymin = lwr, ymax = upr,
  #                 fill = protection),
  #             color = NA,
  #             alpha = 0.15) +
  geom_point() +
  facet_wrap(~group) +
  geom_line(alpha = 0.6) +
  scale_color_manual(name = 'Individual protection',
                     values = cols) +
  scale_fill_manual(name = 'Individual protection',
                     values = cols) +
  labs(x = 'Community protection score',
       title = 'Association between community protection, individual protection, and absenteeism',
       y = 'Estimated monthly absenteeism rate') +    theme_maragra()

Wage trade-offs

# Share of avoided absences by worker type
per_year <- predictions %>%
  filter(strategy == 'Zero') %>%
  mutate(year = as.numeric(format(date, '%Y'))) %>%
  # mutate(year_month = as.Date(paste0(format(date, '%Y-%m'), '-01'))) %>%
  group_by(date = year, 
           permanent_or_temporary) %>%
  summarise(n = n(),
            actual_absences = sum(absent),
            predicted_absences = sum(fit, na.rm = TRUE),
            actual = mean(absent, na.rm = TRUE),
            predicted = mean(fit, na.rm = TRUE)) %>%
  mutate(avoided = predicted_absences - actual_absences)

per_year = per_year %>% group_by(permanent_or_temporary) %>% summarise(avoided = mean(avoided))
per_year$p <- per_year$avoided / sum(per_year$avoided)

p_permanent <- per_year$p[per_year$permanent_or_temporary == 'Permanent']

# Cost to avoid one absence
cost <- 17.31
temporary <- seq(2, 30, by = 2)
permanent <- seq(2, 30, by = 2)
fake_wage <- expand.grid(temporary = temporary,
                         permanent = permanent)
fake_wage$z <- ((fake_wage$temporary * (1- p_permanent)) + (fake_wage$permanent * p_permanent)) 
fake_wage$f <- (fake_wage$z / cost * 100)-100
# Add the percentage of absences prevented in each group

# MAke chart of relation between wages of 2 and whether profitable or not

cols <- colorRampPalette(c('darkred', 'white', 'blue'))(200)
ggplot(data = fake_wage,
       aes(x = temporary,
           y = permanent,
           fill = f)) +
  geom_tile() +
  scale_fill_gradientn(name = '',
                   colors = cols) +
  geom_text(aes(label = paste0(round(f, digits = 1), '%'))) +
  labs(x = 'Average daily productivity value (wage): temporary workers',
       y = 'Average daily productivity value (wage): permanent workers',
       title = 'Return on investment of IRS program as a function of daily wages') +
  theme_maragra() +
  theme(legend.position = 'right')


# Make for different ratios of temporary and permanent
p_permanents <- c(0.15, 0.3, 0.45, 0.6, 0.75, 0.9)
fake_wage_list <- list()
for(i in 1:length(p_permanents)){
  p_permanent <- p_permanents[i]
    # Cost to avoid one absence
  cost <- 17.31
  temporary <- seq(2, 30, by = 2)
  permanent <- seq(2, 30, by = 2)
  fake_wage <- expand.grid(temporary = temporary,
                           permanent = permanent)
  fake_wage$z <- ((fake_wage$temporary * (1- p_permanent)) + (fake_wage$permanent * p_permanent)) 
  fake_wage$f <- (fake_wage$z / cost * 100)-100
  fake_wage_list[[i]] <- fake_wage %>% mutate(p = p_permanent)
}

fake_wage <- bind_rows(fake_wage_list)
fake_wage$p <- paste0('Percent permanent: ', fake_wage$p * 100)

ggplot(data = fake_wage,
       aes(x = temporary,
           y = permanent,
           fill = f)) +
  geom_tile() +
  scale_fill_gradientn(name = '',
                   colors = cols) +
  # geom_text(aes(label = paste0(round(f, digits = 1), '%')), size = 2) +
  labs(x = 'Average daily productivity value (wage): temporary workers',
       y = 'Average daily productivity value (wage): permanent workers',
       title = 'Return on investment of IRS program as a function of daily wages') +
  theme_maragra() +
  theme(legend.position = 'right') +
  facet_wrap(~p)

Simulations

# 
# # SIMULATIONS
# sim_data <- simulations
# sim_data$idx <- sim_data$oracle_number
# fe <- getfe(final_model)
# predictions <- exp(predict(final_model_lm, sim_data, interval = 'confidence'))-1
# predictions <- data.frame(predictions)
# # Add the fixed effects
# fixed_effects <- left_join(sim_data,
#                            data.frame(fe) %>% dplyr::select(idx, effect) %>%
#                              mutate(idx = as.character(idx))) %>%
#   .$effect
# fixed_effects <- as.numeric(fixed_effects)
# 
# predictions$fit <- predictions$fit + fixed_effects
# predictions$lwr <- predictions$lwr + fixed_effects
# predictions$upr <- predictions$upr + fixed_effects
# 
# # combine other info
# predictions <- predictions %>% bind_cols(sim_data %>% dplyr::select(date, absent, strategy, permanent_or_temporary)) %>%
#   mutate(absent = as.numeric(absent)) %>%
#   filter(!is.na(fit))
# 
# simple <- predictions %>%
#   group_by(strategy) %>%
#   summarise(n = n(),
#             actual_absences = sum(absent),
#             predicted_absences = sum(fit, na.rm = TRUE),
#             actual = mean(absent, na.rm = TRUE),
#             predicted = mean(fit, na.rm = TRUE)) %>%
#   dplyr::rename(value = predicted) %>%
#   dplyr::select(strategy, value, predicted_absences)
# 
# zero <- predictions %>%
#   filter(strategy == 'Zero') %>%
#   mutate(year = as.numeric(format(date, '%Y'))) %>%
#   group_by(year) %>%
#   summarise(n = n(),
#             actual_absences = sum(absent),
#             predicted_absences = sum(fit, na.rm = TRUE),
#             actual = mean(absent, na.rm = TRUE),
#             predicted = mean(fit, na.rm = TRUE)) %>%
#   mutate(avoided = predicted_absences - actual_absences)
# # dplyr::rename(value = predicted) %>%
# # dplyr::select(permanent_or_temporary, value, predicted_absences)
# 
# kable(simple)
# 
# # Eligible working days
# sum(zero$n)
# # Absences
# sum(zero$actual_absences)
# # Actual absence rate
# sum(zero$actual_absences) / sum(zero$n)
# # Would have observed
# sum(zero$predicted_absences)
# # Predicted - observ
# sum(zero$predicted_absences) - sum(zero$actual_absences)
#   # Overall reduction from
# sum(zero$predicted_absences) / sum(zero$n)
# # To
# sum(zero$actual_absences) / sum(zero$n)
# # Number of working days permanent vs temp
# table(predictions$permanent_or_temporary[predictions$strategy=='Real'])
# # Absences avoided among permanent workers
# predictions %>% filter(strategy == 'Zero') %>% group_by(permanent_or_temporary) %>% summarise(n = n(),
#                                                                                               actual_absences = sum(absent),
#                                                                                               predicted_absences = sum(fit, na.rm = TRUE),
#                                                                                               actual = mean(absent, na.rm = TRUE),
#                                                                                               predicted = mean(fit, na.rm = TRUE)) %>%
#   mutate(avoided = predicted_absences - actual_absences) %>%
#   # decline due to irs
#   mutate(decline = predicted - actual)
# # One absence every n days
# sum(zero$n) / sum(zero$avoided)
# # Annual reduction in absence
# mean(zero$avoided[zero$year < 2016])
# # Cost per absence avoided
# 68984 / 4183
# # Percentage of absence among permanent workers
# x = predictions %>%
#   filter(strategy == 'Zero') %>%
#   # filter(date < '2016-01-01') %>%
#   group_by(permanent_or_temporary) %>%
#   summarise(absences = sum(absent),
#             predicted = sum(fit)) %>%
#   mutate(averted = predicted - absences) %>%
#   mutate(p = averted / sum(averted))
# x
# # Weighted average cost
# y = sum(c(25, 5) * x$p)
# y
# # Cost of treatment per absence
# 22 / 80
# # Total savings per aversion
# 0.275 + y
# # Total yearly savings
# (0.275 + y) * 4183
# # Total yearly savings
# ((0.275 + y) * 4183) - 68984
# # ROI
# ((0.275 + y) * 4183) / 68984
# # Additional max strategy
# simple
# 47943-42131
# 
# ggplot(data = simple,
#        aes(x = strategy,
#            y = value)) +
#   geom_bar(stat = 'identity',
#            alpha = 0.6,
#            color = 'black') +
#   labs(y = 'Absenteeism rate',
#        x = 'Strategy') +
#   geom_label(aes(label = round(value, digits = 3)),
#              nudge_y = -0.005)

# SIMULATIONS WITH WORKER SPECIFIC TYPE

# SIMULATIONS
sim_data <- simulations# %>%
  # # Get grade
  # left_join(ab_panel %>% dplyr::distinct(grade, oracle_number, date)) %>%
  # mutate(year = as.numeric(format(date, '%Y'))) %>%
  # left_join(wages %>% dplyr::distinct(grade, department, year, daily_usd)) 
# sim_data$group5 <- ifelse(is.na(sim_data$daily_usd) & sim_data$group != 'Temporary not field worker', 'No wage data', sim_data$group)
sim_data$idx <- sim_data$oracle_number

worker_groups <- names(model_list)
sim_list <- list()
radial_sim_list <- list()
for(i in 1:length(worker_groups)){
  message(i)
  this_worker_group <- worker_groups[i]

  this_model <- model_list[[i]]
  this_model_lm <- model_list_lm[[i]]
  this_fake_data <- sim_data %>% filter(group == this_worker_group)

  # this_radial_model <- radial_model_list[[i]]
  # this_radial_model_lm <- radial_model_list_lm[[i]]
  # this_radial_fake_data <- radial_model_data_list[[i]]

  predictions <- exp(predict(this_model_lm, newdata = this_fake_data, interval = 'confidence'))-1
  predictions <- data.frame(predictions)

  # radial_predictions <- exp(predict(this_radial_model_lm, newdata = this_radial_fake_data, interval = 'confidence'))-1
  # radial_predictions <- data.frame(radial_predictions)

  this_fe <- getfe(this_model)
  this_radial_fe <- getfe(this_radial_model)

  # fixed_effects <- mean(this_fe$effect)
  # radial_fixed_effects <- mean(this_radial_fe$effect)

  out <- this_fake_data
  # radial_out <- this_radial_fake_data

  out$fit <- predictions$fit + fixed_effects
  out$lwr <- predictions$lwr + fixed_effects
  out$upr <- predictions$upr + fixed_effects
  sim_list[[i]] <- out

  # radial_out$fit <- radial_predictions$fit + fixed_effects
  # radial_out$lwr <- radial_predictions$lwr + fixed_effects
  # radial_out$upr <- radial_predictions$upr + fixed_effects
  # radial_sim_list[[i]] <- radial_out
}
sim_data <- bind_rows(sim_list)
# radial_sim_data <- bind_rows(radial_sim_list)

# combine other info
predictions <- sim_data %>%
  mutate(absent = as.numeric(absent)) %>%
  filter(!is.na(fit))

simple <- predictions %>%
  group_by(strategy) %>%
  summarise(n = n(),
            actual_absences = sum(absent),
            predicted_absences = sum(fit, na.rm = TRUE)
            # actual = mean(absent, na.rm = TRUE),
            # predicted = mean(fit, na.rm = TRUE)
            ) %>%
  mutate(actual = actual_absences / n,
         predicted = predicted_absences / n) 

zero <- predictions %>%
  filter(strategy == 'Zero') %>%
  mutate(year = as.numeric(format(date, '%Y'))) %>%
  group_by(year) %>%
  summarise(n = n(),
            actual_absences = sum(absent),
            predicted_absences = sum(fit, na.rm = TRUE),
            actual = mean(absent, na.rm = TRUE),
            predicted = mean(fit, na.rm = TRUE)) %>%
  mutate(avoided = predicted_absences - actual_absences)

# Scenario splot
pd <- simple
ggplot(data = pd,
       aes(x = strategy,
           y = value)) +
  geom_bar(stat = 'identity') +
  theme_maragra() +
  geom_text(aes(label = round(value * 100, digits = 2)), color = 'white',
            nudge_y = -0.01, 
            size = 7) +
  labs(x = 'Strategy',
       y = )

pd <- predictions %>%
  filter(strategy == 'Zero') %>%
  mutate(year = as.numeric(format(date, '%Y'))) %>%
  # mutate(year_month = as.Date(paste0(format(date, '%Y-%m'), '-01'))) %>%
  group_by(date = year, 
           permanent_or_temporary) %>%
  summarise(n = n(),
            actual_absences = sum(absent),
            predicted_absences = sum(fit, na.rm = TRUE),
            actual = mean(absent, na.rm = TRUE),
            predicted = mean(fit, na.rm = TRUE)) %>%
  mutate(avoided = predicted_absences - actual_absences)

pdx <- pd %>%
  dplyr::select(date, permanent_or_temporary,
                actual, predicted) %>%
  gather(key, value, actual:predicted) %>%
  mutate(key = ifelse(key == 'actual', 'Observed', 
                      'Counterfactual'))
ggplot(data = pdx,
       aes(x = date,
           y = value,
            lty = key,
           color = permanent_or_temporary,
           pch = key)) +
  geom_point() +
  geom_line() +
  theme_maragra() +
  scale_color_manual(name = '',
                     values = c('red', 'blue')) +
  scale_linetype_manual(name = '',
                        values = c(1,2)) +
  scale_shape_manual(name = '',
                     values = c(1,2)) +
  labs(x = 'Year',
       y = 'Absenteeism',
       title = 'Predicted vs. observed absenteeism')




kable(simple)

# Eligible working days
sum(zero$n)
# Absences
sum(zero$actual_absences)
# Actual absence rate
sum(zero$actual_absences) / sum(zero$n)
# Would have observed
sum(zero$predicted_absences)
# Predicted - observ
sum(zero$predicted_absences) - sum(zero$actual_absences)
  # Overall reduction from
sum(zero$predicted_absences) / sum(zero$n)
# To
sum(zero$actual_absences) / sum(zero$n)
# Number of working days permanent vs temp
table(predictions$permanent_or_temporary[predictions$strategy=='Real'])
# Absences avoided among permanent workers
predictions %>% filter(strategy == 'Zero') %>% group_by(permanent_or_temporary) %>% summarise(n = n(),
                                                                                              actual_absences = sum(absent),
                                                                                              predicted_absences = sum(fit, na.rm = TRUE),
                                                                                              actual = mean(absent, na.rm = TRUE),
                                                                                              predicted = mean(fit, na.rm = TRUE)) %>%
  mutate(avoided = predicted_absences - actual_absences) %>%
  # decline due to irs
  mutate(decline = predicted - actual)
# One absence every n days
sum(zero$n) / sum(zero$avoided)
# Annual reduction in absence
mean(zero$avoided[zero$year < 2016])
# Cost per absence avoided
68984 / 4183
# Percentage of absence among permanent workers
x = predictions %>%
  filter(strategy == 'Zero') %>%
  # filter(date < '2016-01-01') %>%
  group_by(permanent_or_temporary) %>%
  summarise(absences = sum(absent),
            predicted = sum(fit)) %>%
  mutate(averted = predicted - absences) %>%
  mutate(p = averted / sum(averted))
x
# Weighted average cost
y = sum(c(25, 5) * x$p)
y
# Cost of treatment per absence
22 / 80
# Total savings per aversion
0.275 + y
# Total yearly savings
(0.275 + y) * 4183
# Total yearly savings
((0.275 + y) * 4183) - 68984
# ROI
((0.275 + y) * 4183) / 68984
# Additional max strategy
simple
47943-42131

# Per year
per_year <- predictions %>%
  filter(strategy == 'Zero') %>%
  mutate(year = as.numeric(format(date, '%Y'))) %>%
  # mutate(year_month = as.Date(paste0(format(date, '%Y-%m'), '-01'))) %>%
  group_by(date = year, 
           permanent_or_temporary) %>%
  summarise(n = n(),
            actual_absences = sum(absent),
            predicted_absences = sum(fit, na.rm = TRUE),
            actual = mean(absent, na.rm = TRUE),
            predicted = mean(fit, na.rm = TRUE)) %>%
  mutate(avoided = predicted_absences - actual_absences)

per_year = per_year %>% group_by(permanent_or_temporary) %>% summarise(avoided = mean(avoided))
sum(per_year$avoided)

Table : Absenteeism rate by year and worker type

pd <- final_data %>%
  group_by(permanent_or_temporary, year) %>% 
  summarise(absences = length(which(absent)),
            eligibles = n()) %>%
  ungroup %>%
  mutate(p = absences / eligibles * 100)
pd

# paste0(
#   "Of the ", nrow(simulations[simulations$strategy == 'Real',]),
#   " eligible working days in our panel, we observed ", nrow(simulations[simulations$strategy == 'Real' & simulations$absent,]), 
#   " absences. Using the above approach, we estimate that we would have observed ",
#   length(which(simulations$absent[simulations$strategy == 'Zero'])),
#   " had it not been for the firm's IRS program, for a total of ",
#   length(which(simulations$absent[simulations$strategy == 'Zero'])) - length(which(simulations$absent[simulations$strategy == 'Real'])),
#   " avoided absences. This translates to an overall reduction in absenteeism from ",
#   round(length(which(simulations$absent[simulations$strategy == 'Zero'])) / nrow(simulations %>% filter(strategy == 'Zero')) * 100, digits = 2),
#   "% (without its IRS program) to the observed ", round(length(which(final_data$absent)) / nrow(final_data) * 100, digits = 2),"%.
# The number of working-days observed among permanent workers (",
# length(which(final_data$permanent_or_temporary == 'Permanent')),") was nearly thrice that of those of temporary workers (", 
# length(which(final_data$permanent_or_temporary == 'Temporary')),
# ").") 
# 
# 
# 
# paste0(
#   "By the same token, the number of absences avoided among permanent workers (39,890) was far greater than that of temporary workers (8,972). The latter had a greater relative decline in absenteeism due to IRS (from 9.3% to 3.91%) relative to the former (from 14.2% to 11%), but the aggregate numbers are less consequential since (a) temporary workers have a lower baseline absenteeism rate and (b) they work, by definition, fewer days. In total, the IRS program is estimated to prevent one absence every 26.6 working days.
# Annually, this translates to approximately 4,076 avoided absences per year (slightly greater than the simple splicing of the overall figure into the 4-year study period, since we did not have complete data for all worker types).
# "
# )



ggplot(data = simple,
       aes(x = strategy,
           y = predicted_absences)) +
  geom_bar(stat = 'identity',
           alpha = 0.6,
           color = 'black') +
  labs(y = 'Absenteeism rate',
       x = 'Strategy') +
  geom_label(aes(label = round(predicted_absences, digits = 3)),
             nudge_y = -10000)


# Costs

x <- final_data %>%
  left_join(irs %>% dplyr::select(date, unidade, days_since))

out <- data.frame(
  strategy = c('Max',
               'Real',
               'Time-optimized',
               'Zero'),
  sprays = c(nrow(final_data) / 60,
             length(which(x$days_since == 0)),
             length(which(format(final_data$date, '%m-%d') == '12-11')),
             0
  )
) %>%
  mutate(sprays = sprays / length(unique(final_data$oracle_number)))

kable(out)
ggplot(data = out,
       aes(x = strategy,
           y = sprays)) +
  geom_bar(stat = 'identity',
           alpha = 0.6,
           color = 'black') +
  labs(y = 'Number of sprays per person',
       x = 'Strategy')

Ad hoc analysis of seasonality and absenteeism by worker group

pd <- final_data %>%
  group_by(group5, month_number) %>%
  summarise(absences = length(which(absent)),
            eligibles = n()) %>%
  ungroup %>%
  mutate(p = absences / eligibles * 100)

ggplot(data = pd,
       aes(x = month_number,
           y = p,
           group = group5,
           color = group5)) +
  geom_point() +
  facet_wrap(~group5) +
  theme_maragra() +
  geom_smooth() +
  theme(legend.position = 'none') +
  labs(x = 'Month of year',
       y = 'Absenteeism rate',
       title = 'Worker type and monthly absenteeism')


pd <- final_data %>%
  group_by(grade, month_number) %>%
  summarise(absences = length(which(absent)),
            eligibles = n()) %>%
  ungroup %>%
  mutate(p = absences / eligibles * 100)

ggplot(data = pd,
       aes(x = month_number,
           y = p,
           group = grade,
           color = grade)) +
  geom_point() +
  facet_wrap(~grade) +
  theme_maragra() +
  geom_smooth() +
  theme(legend.position = 'none') +
  labs(x = 'Month of year',
       y = 'Absenteeism rate',
       title = 'Worker type and monthly absenteeism')

Ad hoc analysis of IRS and absenteeism by worker group

Individual level

pd <- final_data %>%
  group_by(protection = round(protection,2), group5) %>%
    summarise(absences = length(which(absent)),
            eligibles = n()) %>%
  ungroup %>%
  mutate(p = absences / eligibles * 100)

ggplot(data = pd,
       aes(x = protection,
           y = p,
           group = group5,
           color = group5)) +
  geom_point(alpha = 0.7, aes(size = eligibles)) +
  facet_wrap(~group5) +
  theme_maragra() +
  geom_smooth(method='lm', formula= y~x) +
  geom_smooth(se = FALSE, size = 0.5, alpha = 0.6) +
  theme(legend.position = 'none') +
  labs(x = 'Estimated INDIVIDUAL protection level',
       y = 'Absenteeism rate',
       title = 'Absenteeism as a function of time since IRS at individual level')

pd <- final_data %>%
  group_by(protection = herd_protection, group5) %>%
    summarise(absences = length(which(absent)),
            eligibles = n()) %>%
  ungroup %>%
  mutate(p = absences / eligibles * 100)

ggplot(data = pd,
       aes(x = protection,
           y = p,
           group = group5,
           color = group5)) +
  # geom_point(alpha = 0.7, size = 0.3) +
  facet_wrap(~group5) +
  theme_maragra() +
  geom_smooth(method='lm', formula= y~x) +
  # geom_smooth(se = FALSE, size = 0.5, alpha = 0.6) +
  theme(legend.position = 'none') +
  labs(x = 'Estimated COMMUNITY protection level',
       y = 'Absenteeism rate',
       title = 'Absenteeism as a function of time since IRS at community level')

Fumigations over time

pd <- mc %>%
  group_by(date = as.Date(paste0(year, '-', month, '-01'))) %>%
  tally

ggplot(data = pd,
       aes(x = date,
           y = n)) +
  geom_bar(stat = 'identity')