In joebrew/maragra: An analysis of malaria at the Maragra sugar company, Mozambique

# csl: journal-of-health-economics.csl

# Packages
library(tidyverse)
library(knitr)
library(Hmisc)
library(brew)
library(maragra)
library(RColorBrewer)
library(extrafont)
library(kableExtra)
library(raster)
loadfonts()
## Global options
options(max.print="75")
opts_chunk$set(echo=FALSE,
                 cache=FALSE,
               prompt=FALSE,
               fig.height = 4,
               fig.width = 5,
               tidy=TRUE,
               comment=NA,
               message=FALSE,
               warning=FALSE,
               # dev = "cairo_pdf",
               fig.pos="!h",
               fig.align = 'center')
knitr::opts_chunk$set(error = FALSE)

opts_knit$set(width=75)
options(xtable.comment = FALSE)

source('prepare_data.R')

Calculate household protection

Manuscript draft

Draft of the paper \href{https://docs.google.com/document/d/1bUWRBCgVcgjSPHchIQxiTG8Vwv5hV1GLU4Tlu386sWA/edit#}{HERE}.

Classify protection plot

pd <- data_frame(days_since = 1:230)
pd$protection <- classify_protection(pd$days_since)

ggplot(data = pd,
       aes(x = days_since,
           y = protection)) +
  geom_area(fill = 'lightblue', alpha = 0.8,
            lwd = 0.2,
            color = 'black') +
  geom_vline(xintercept = 0,
             lty = 2, alpha = 0.6) +
  theme_maragra() +
  labs(x = 'Days since IRS application',
       y = 'Protection score')

Descriptive overview

desc <- final_data %>%
  group_by(permanent_or_temporary) %>%
  summarise(workers = length(unique(oracle_number)),
            days = n())

Herd-only protection score

pq <- quantile(final_data$herd_protection)
x <- final_data %>%
  # mutate(protection_cat = protection_var) %>%
  mutate(protection_cat = ifelse(herd_protection <= pq[2],
                                 'Protection 1',
                                 ifelse(herd_protection <= pq[3],
                                        'Protection 2',
                                        ifelse(herd_protection <= pq[4],
                                               'Protection 3',
                                               ifelse(herd_protection <= pq[5],
                                                      'Protection 4',
                                                      NA))))) %>%
  group_by(protection_cat) %>%
  summarise(ab_rate = length(which(absent))/ n() * 100)

names(x) <- c('Quantile', 'Absenteeism rate')
kable(x)

ggplot(data = x,
       aes(x = `Quantile`,
           y = `Absenteeism rate`)) +
  geom_bar(stat = 'identity',
           alpha = 0.6,
           color = 'black') +
  labs(x = 'Protection score quantile')

\newpage

Individual-only protection score

pq <- sort(unique(final_data$protection))
# pq <- quantile(final_data$protection,
#                seq(0, 1, 0.1))
# pq <- sort(unique(pq))
x <- final_data %>%
  # mutate(protection_cat = protection_var) %>%
  mutate(protection_cat = ifelse(protection <= pq[2],
                                 'Protection 1',
                                 ifelse(protection <= pq[3],
                                        'Protection 2',
                                        ifelse(protection <= pq[4],
                                               'Protection 3',
                                               ifelse(protection <= (pq[5] + 0.1),
                                                      'Protection 4',
                                                      NA))))) %>%
  group_by(protection_cat) %>%
  summarise(ab_rate = length(which(absent))/ n() * 100)

names(x) <- c('Quantile', 'Absenteeism rate')
kable(x)

ggplot(data = x,
       aes(x = `Quantile`,
           y = `Absenteeism rate`)) +
  geom_bar(stat = 'identity',
           alpha = 0.6,
           color = 'black') +
  labs(x = 'Protection score quantile')

\newpage

Overall protection score

pq <- quantile(final_data$protection_var)
x <- final_data %>%
  # mutate(protection_cat = protection_var) %>%
  mutate(protection_cat = ifelse(protection_var <= pq[2],
                                 'Protection 1',
                                 ifelse(protection_var <= pq[3],
                                        'Protection 2',
                                        ifelse(protection_var <= pq[4],
                                               'Protection 3',
                                               ifelse(protection_var <= pq[5],
                                                      'Protection 4',
                                                      NA))))) %>%
  group_by(protection_cat, group) %>%
  summarise(ab_rate = length(which(absent))/ n() * 100) %>%
  ungroup

names(x) <- c('Quantile', 'Group', 'Absenteeism rate')
kable(x)

ggplot(data = x,
       aes(x = `Quantile`,
           y = `Absenteeism rate`)) +
  geom_bar(stat = 'identity',
           alpha = 0.8,
           color = 'black',
           lwd = 0.2,
           fill = 'lightblue') +
  labs(x = 'Protection score quantile') +
  facet_wrap(~Group) +
  theme_maragra()

\newpage

Precipitation lag

rq <- quantile(final_data$rain_var, na.rm = TRUE)
x <- final_data %>%
  # mutate(protection_cat = protection_var) %>%
  mutate(protection_cat = ifelse(rain_var <= rq[2],
                                 'Protection 1',
                                 ifelse(rain_var <= rq[3],
                                        'Protection 2',
                                        ifelse(rain_var <= rq[4],
                                               'Protection 3',
                                               ifelse(rain_var <= rq[5],
                                                      'Protection 4',
                                                      NA))))) %>%
  group_by(protection_cat, group) %>%
  summarise(ab_rate = length(which(absent))/ n() * 100) %>%
  ungroup %>%
  filter(!is.na(protection_cat))

names(x) <- c('Quantile', 'Group', 'Absenteeism rate')
kable(x)

ggplot(data = x,
       aes(x = `Quantile`,
           y = `Absenteeism rate`)) +
  geom_bar(stat = 'identity',
           alpha = 0.6,
           color = 'black') +
  labs(x = 'Lagged precipitation score quantile') +
  facet_wrap(~Group)

\newpage

Results

Regression table

The below table shows the results of the model devised thus far.

# make_final_table(model = final_model, the_caption = "Model results", type = 'latex', multiplier = 10000)

# Combined protection
summary(final_model)

Intepretation

The below charts show predicted absenteeism rates at different rain and protection levels.

# on <- sort(unique(final_data$oracle_number))
fake <- expand.grid(#group = sort(unique(monthly$group)),
  rain_var = rq,
  protection_var = pq)#,
fake$predicted <- NA #fake$predicted_upr <- fake$predicted_lwr<- NA

fe <- getfe(final_model)
# # fake <- left_join(fake, fe %>% dplyr::select(idx, effect, obs), 
# #                                              by = c('oracle_number'= 'idx'))
predictions <- exp(predict(final_model_lm, fake, interval = 'confidence'))-1
# predictions <- data.frame(predictions)
# # # Add the fixed effects
fixed_effects <- fe$effect
# 
fake$predicted<- predictions[,1]
fake$predicted_upr <- predictions[,3]
fake$predicted_lwr <- predictions[,2]
# 
# # # Add fixed effects
# # fake <- fake %>%
# #   mutate(predicted = predicted + effect,
# #          predicted_upr = predicted_upr + effect,
# #          predicted_lwr = predicted_lwr + effect)
# 
# pd <- fake %>%
#   filter(protection_var == pq[1]) 
# ggplot(data = pd,
#        aes(x = rain_var,
#            y = predicted)) +
#   geom_point()

Precipitation's effect

g1 <- ggplot(data = fake %>% filter(protection_var == pq[1]),
       aes(x = rain_var,
           y = predicted)) +
  geom_point() +
  geom_line() +
  geom_ribbon(aes(ymin = predicted_lwr,
                  ymax = predicted_upr),
              alpha = 0.4,
              fill = 'darkorange') +
  labs(y = 'Predicted absenteeism rate',
       x = '1-3 month precipitation lag',
       title = 'Lagged precipitation and absenteeism') +
  theme_maragra()

g2 <- ggplot(data = fake %>% filter(rain_var == rq[1]),
       aes(x = protection_var,
           y = predicted)) +
  geom_point() +
  geom_line() +
  geom_ribbon(aes(ymin = predicted_lwr,
                  ymax = predicted_upr),
              alpha = 0.4,
              fill = 'darkorange') +
  labs(y = 'Predicted absenteeism rate',
       x = 'Protection index',
       title = 'IRS-afforded protection and absenteeism') +
  theme_maragra()

Rmisc::multiplot(g1, g2, cols = 2)

Simulations

sim_data <- simulations
sim_data$idx <- sim_data$oracle_number
fe <- getfe(final_model)
predictions <- exp(predict(final_model_lm, sim_data, interval = 'confidence'))-1
predictions <- data.frame(predictions)
# Add the fixed effects
fixed_effects <- left_join(sim_data,
                           data.frame(fe) %>% dplyr::select(idx, effect) %>%
                             mutate(idx = as.character(idx))) %>%
  .$effect
fixed_effects <- as.numeric(fixed_effects)

predictions$fit <- predictions$fit + fixed_effects
predictions$lwr <- predictions$lwr + fixed_effects
predictions$upr <- predictions$upr + fixed_effects

# combine other info
predictions <- predictions %>% bind_cols(sim_data %>% dplyr::select(date, absent, strategy, permanent_or_temporary)) %>%
  mutate(absent = as.numeric(absent)) %>%
  filter(!is.na(fit))

simple <- predictions %>%
  group_by(strategy) %>%
  summarise(n = n(),
            actual_absences = sum(absent),
            predicted_absences = sum(fit, na.rm = TRUE),
            actual = mean(absent, na.rm = TRUE),
            predicted = mean(fit, na.rm = TRUE)) %>%
  dplyr::rename(value = predicted) %>%
  dplyr::select(strategy, value, predicted_absences)

zero <- predictions %>%
  filter(strategy == 'Zero') %>%
  mutate(year = as.numeric(format(date, '%Y'))) %>%
  group_by(year) %>%
  summarise(n = n(),
            actual_absences = sum(absent),
            predicted_absences = sum(fit, na.rm = TRUE),
            actual = mean(absent, na.rm = TRUE),
            predicted = mean(fit, na.rm = TRUE)) %>%
  mutate(avoided = predicted_absences - actual_absences)
  # dplyr::rename(value = predicted) %>%
  # dplyr::select(permanent_or_temporary, value, predicted_absences)

kable(simple)

# Eligible working days
sum(zero$n)
# Absences
sum(zero$actual_absences)
# Actual absence rate
sum(zero$actual_absences) / sum(zero$n)
# Would have observed
sum(zero$predicted_absences)
# Predicted - observ
sum(zero$predicted_absences) - sum(zero$actual_absences)
# Overall reduction from
sum(zero$predicted_absences) / sum(zero$n)
# To
sum(zero$actual_absences) / sum(zero$n)
# Number of working days permanent vs temp
table(predictions$permanent_or_temporary[predictions$strategy=='Real'])
# Absences avoided among permanent workers
predictions %>% filter(strategy == 'Zero') %>% group_by(permanent_or_temporary) %>% summarise(n = n(),
            actual_absences = sum(absent),
            predicted_absences = sum(fit, na.rm = TRUE),
            actual = mean(absent, na.rm = TRUE),
            predicted = mean(fit, na.rm = TRUE)) %>%
  mutate(avoided = predicted_absences - actual_absences) %>%
  # decline due to irs
  mutate(decline = predicted - actual)
# One absence every n days
sum(zero$n) / sum(zero$avoided)
# Annual reduction in absence
mean(zero$avoided[zero$year < 2016])
# Cost per absence avoided
68984 / 4183
# Percentage of absence among permanent workers
x = predictions %>%
  filter(strategy == 'Zero') %>%
  # filter(date < '2016-01-01') %>%
  group_by(permanent_or_temporary) %>%
  summarise(absences = sum(absent),
            predicted = sum(fit)) %>%
  mutate(averted = predicted - absences) %>%
  mutate(p = averted / sum(averted))
x
# Weighted average cost
y = sum(c(25, 5) * x$p)
y
# Cost of treatment per absence
22 / 80
# Total savings per aversion
0.275 + y
# Total yearly savings
(0.275 + y) * 4183
# Total yearly savings
((0.275 + y) * 4183) - 68984
# ROI
((0.275 + y) * 4183) / 68984
# Additional max strategy
simple
47943-42131

ggplot(data = simple,
       aes(x = strategy,
           y = value)) +
  geom_bar(stat = 'identity',
           alpha = 0.6,
           color = 'black') +
  labs(y = 'Absenteeism rate',
       x = 'Strategy') +
  geom_label(aes(label = round(value, digits = 3)),
             nudge_y = -0.005)

# paste0(
#   "Of the ", nrow(simulations[simulations$strategy == 'Real',]),
#   " eligible working days in our panel, we observed ", nrow(simulations[simulations$strategy == 'Real' & simulations$absent,]), 
#   " absences. Using the above approach, we estimate that we would have observed ",
#   length(which(simulations$absent[simulations$strategy == 'Zero'])),
#   " had it not been for the firm's IRS program, for a total of ",
#   length(which(simulations$absent[simulations$strategy == 'Zero'])) - length(which(simulations$absent[simulations$strategy == 'Real'])),
#   " avoided absences. This translates to an overall reduction in absenteeism from ",
#   round(length(which(simulations$absent[simulations$strategy == 'Zero'])) / nrow(simulations %>% filter(strategy == 'Zero')) * 100, digits = 2),
#   "% (without its IRS program) to the observed ", round(length(which(final_data$absent)) / nrow(final_data) * 100, digits = 2),"%.
# The number of working-days observed among permanent workers (",
# length(which(final_data$permanent_or_temporary == 'Permanent')),") was nearly thrice that of those of temporary workers (", 
# length(which(final_data$permanent_or_temporary == 'Temporary')),
# ").") 
# 
# 
# 
# paste0(
#   "By the same token, the number of absences avoided among permanent workers (39,890) was far greater than that of temporary workers (8,972). The latter had a greater relative decline in absenteeism due to IRS (from 9.3% to 3.91%) relative to the former (from 14.2% to 11%), but the aggregate numbers are less consequential since (a) temporary workers have a lower baseline absenteeism rate and (b) they work, by definition, fewer days. In total, the IRS program is estimated to prevent one absence every 26.6 working days.
# Annually, this translates to approximately 4,076 avoided absences per year (slightly greater than the simple splicing of the overall figure into the 4-year study period, since we did not have complete data for all worker types).
# "
# )

ggplot(data = simple,
       aes(x = strategy,
           y = predicted_absences)) +
  geom_bar(stat = 'identity',
           alpha = 0.6,
           color = 'black') +
  labs(y = 'Absenteeism rate',
       x = 'Strategy') +
  geom_label(aes(label = round(predicted_absences, digits = 3)),
             nudge_y = -10000)

Costs

x <- final_data %>%
  left_join(irs %>% dplyr::select(date, unidade, days_since))

out <- data.frame(
  strategy = c('Max',
               'Real',
               'Time-optimized',
               'Zero'),
  sprays = c(nrow(final_data) / 60,
             length(which(x$days_since == 0)),
             length(which(format(final_data$date, '%m-%d') == '12-11')),
             0
  )
) %>%
  mutate(sprays = sprays / length(unique(final_data$oracle_number)))

kable(out)
ggplot(data = out,
       aes(x = strategy,
           y = sprays)) +
  geom_bar(stat = 'identity',
           alpha = 0.6,
           color = 'black') +
  labs(y = 'Number of sprays per person',
       x = 'Strategy')

# Introduction is really all that matters - specifying contribution, summarizing results
# Journal of Health Economics (1) and then Health Economics (2).
# Elisa: American Journal of Health Economics (from American Health Economic Society)
# Ensure that we explicitly explain these 21 days issue to rule out reverse causality.
# Large firms are big enough to capture externalities
# What is the effect of anti-malaria campaign on worker producitivity
# Data special
# Identification problems (no external spraying, no home spraying, unobserved govt - omitted variable bias dealt with through worker and time fixed effects.)
# Exploring herd protection through various functional forms

Charts for paper

# x <- ab_panel %>%
#   left_join(workers %>%
#               dplyr::select(oracle_number, date_of_birth, permanent_or_temporary, sex))
x <- final_data
x <- x %>% left_join(workers %>% dplyr::select(oracle_number, date_of_birth))
x$age <-as.numeric(as.Date('2014-01-01') - x$date_of_birth) / 365.25
x$agey <- x$age
x$age <- round(x$age, digits = -1)
x$status <- x$permanent_or_temporary


calculate <- function(var= 'sex',
                      data){
  out <- 
    data %>%
    group_by_(var) %>%
    summarise(`Number of workers` = length(unique(oracle_number)),
              `Number of days observed` = n()) %>%
    # summarise(`Treatment workers` = length(unique(oracle_number[!months_since %in% c('Never', 'Before')])),
    #           `Treatment days` = length(oracle_number[!months_since %in% c('Never', 'Before')]),
    #           `Control workers` = length(unique(oracle_number[months_since %in% c('Never', 'Before')])),
    #           `Control days` = length(oracle_number[months_since %in% c('Never', 'Before')])) %>%
    ungroup() %>%
    # mutate(p = round(n / sum(n) * 100, digits = 2)) %>%
    mutate(variable = '') 
    names(out)[1] <- ' '
    out$variable[1] <- Hmisc::capitalize(var)

    out <- out %>% dplyr::select(variable, ` `, 
                                 `Number of workers`, `Number of days observed`)
                                 # `Treatment workers`, `Treatment days`, `Control workers`, `Control days`)
    out$` ` <- as.character(out$` `)
  return(out)
}
out_list <- list()
vars <- c('department', 
           'sex',
           'status',
           'age')
for(i in 1:length(vars)){
  message(i)
  y <- calculate(var = vars[i], data = x)
  out_list[[i]] <- y
}
demo_table <- bind_rows(out_list)
names(demo_table)[1:2] <- c('Variable', ' ')
kable(demo_table, format = 'html') %>% kable_styling()
# print(xtable::xtable(demo_table,
#                      caption = 'Overall worker characteristics',
#                      align = c('r', 'r', rep('l', 5))),
#       include.rownames = FALSE)

x <- weather %>%
  mutate(month = format(date, '%m')) %>%
  mutate(year = format(date, '%Y')) %>%
  mutate(date = as.Date(paste0(year, '-', month, '-01'))) %>%
  group_by(date) %>%
  summarise(precipitation = sum(precipitation, na.rm = TRUE),
            temp_max = max(temp_max, na.rm = TRUE),
            temp_min = min(temp_min, na.rm = TRUE),
            temp = mean(temp))
# x <- gather(x, key, value,precipitation:temp)

g1 <- ggplot(data = x,
       aes(x = date,
           y = precipitation)) +
  geom_bar(stat = 'identity',
           fill = 'lightblue',
           size = 0.1) +
  theme_maragra() +
  theme(axis.text.x = element_text(angle = 0)) +
  labs(x = 'Month',
       y = 'Milimeters',
       title = 'A. Monthly precipitation') +
  scale_x_date(labels = scales::date_format("%Y")) 

g2 <- ggplot(data = x,
            aes(x = date,
                y = temp)) +
  # geom_bar(stat = 'identity', fill = 'grey', size = 0.1) +
  geom_line(aes(y = temp_max),
             color = 'darkred', 
             alpha = 0.5) +
  geom_line(aes(y = temp_min),
             color = 'blue', 
             alpha = 0.5) +
  # geom_linerange(aes(ymin = temp_min,
  #                    ymax = temp_max),
  #                alpha = 0.6,
  #                color = 'blue') +
   theme_maragra() +
  theme(axis.text.x = element_text(angle = 0)) +
  labs(x = 'Month',
       y = 'Degrees (Celcius)',
       title = 'B. Monthly temperature') +
  scale_x_date(labels = scales::date_format("%Y"))
Rmisc::multiplot(g1, g2, cols = 2)

# BES incidence
g1 <- 
  ggplot(data = bes %>%
           mutate(season = ifelse(p >= median(p), 'High season',
                                  'Low season')) %>%
           filter(weekdays(date) == 'Monday',
                  date <= '2016-12-01'),
       aes(x = date,
           y = p)) +
  geom_point(alpha = 0.6,
             aes(color = season),
             size = 0.2) +
  geom_line(alpha = 0.1) +
  theme_maragra(base_size = 9) +
  labs(x = 'Date',
       y = 'Incidence',
       title = 'A: Annualized district clinical\nmalaria incidence per 1,000') +
  scale_color_manual(name = '',
                     values = c('red', 'black')) + 
  guides(color = guide_legend(ncol = 1)) +
  theme(legend.position = c(.8, .8),
        legend.text = element_text(size = 5),
        legend.background = element_rect(fill=alpha('white', 0))) +
  theme(axis.title.x = element_blank())

# Absenteeism rate
g2_data <- final_data %>%
  # filter(date <= '2016-12-01') %>%
  group_by(month = as.Date(paste0(format(date, '%Y-%m'), '-01')), permanent_or_temporary) %>%
  summarise(absences = length(which(absent)),
            # absences_sick = length(which(absent_sick)),

            eligibles = n()) %>%
  mutate(absenteeism_rate = absences / eligibles * 100)

g2 <- ggplot(data = g2_data,
             aes(x = month,
                 y = absenteeism_rate,
                 lty = permanent_or_temporary)) +
  geom_line(alpha = 0.6) +
  theme_maragra(base_size = 9) +
  labs(x = 'Date',
       y = 'Absenteeism rate',
       title = 'B: Monthly absenteeism rate') +
  scale_linetype_manual(name = '', values = c(1,4)) +
  theme(legend.position = c(.84, .8)) +
  guides(lty = guide_legend(ncol = 1)) +
    theme(axis.title.x = element_blank())



# Weather
g4 <-
  ggplot(data = weather %>%
           group_by(date = as.Date(paste0(format(date, '%Y-%m'), '-01'))) %>%
           summarise(precipitation = sum(precipitation, na.rm = TRUE)),
         aes(x = date,
             y = precipitation)) +
  geom_line(alpha = 0.6) +
  theme_maragra(base_size = 9) +
  labs(x = 'Date',
       y = 'Mililiters',
       title = 'D: Monthly precipitation') +  theme(axis.title.x = element_blank())


# Clinical incidence of malaria
g5 <- clinic_agg %>%
  filter(date >= '2013-01-01', date <= '2016-12-31') %>%
  # mutate(date = as.Date(paste0(year, '-', month, '-01'))) %>%
  group_by(date) %>%
  summarise(tested = sum(tested),
            positive = sum(positive)) %>%
  ungroup %>%
  mutate(percent_positive = positive / tested * 100) %>%
  ggplot(aes(x = date,
             y = positive)) +
  geom_line(alpha = 0.6) +
  theme_maragra(base_size = 9) +
  labs(x = 'Date',
       y = 'Cases',
       title = 'C: Monthly malaria cases\nat company clinic') +
    theme(axis.title.x = element_blank())


g6 <- clinic_agg %>%
  filter(date >= '2013-01-01', date <= '2016-12-31') %>%
  # mutate(date = as.Date(paste0(year, '-', month, '-01'))) %>%
  group_by(date) %>%
  summarise(tested = sum(tested),
            positive = sum(positive)) %>%
  ungroup %>%
  mutate(percent_positive = positive / tested * 100) %>%
  ggplot(aes(x = date,
             y = percent_positive)) +
  geom_line(alpha = 0.6) +
  theme_maragra(base_size = 9) +
  labs(x = 'Date',
       y = 'Cases',
       title = 'D: Monthly malaria test positivity rate\nat company clinic') +
    theme(axis.title.x = element_blank())

# Rmisc::multiplot(g1, g2, g3,
#                  # g4, 
#                  g5, g6, layout = matrix(c(1, 1, 2, 3, 4, 5), nrow=2, byrow=TRUE)) 
Rmisc::multiplot(g1,  g5,g2, g6, cols = 2)

fumigations <- mc %>%
  filter(date >= '2013-01-01',
         date <= '2016-12-31')
n_fumigations <- nrow(fumigations)
n_buildings <- sum(fumigations$casas_cobertas, na.rm = TRUE)
unique_agregados <- length(unique(fumigations$unidade))
people_sprayed <- length(unique(final_data$oracle_number[final_data$days_since != 'Never']))
people_observed <- length(unique(final_data$oracle_number))
pr <- function(x){prettyNum(x, big.mark = ',')}

Fumigations: During the period from January 1st, 2013 through December 31st, 2016, the Maragra Malaria Control Unit carried out r pr(n_fumigations) episodes of fumigation of residential "agregados" (household combinations), for a total of r pr(n_buildings) building-fumigation combinations. The total number of unique agregados sprayed during this period was r pr(unique_agregados). Among the r pr(people_observed) workers for whom we have reliable absenteeism and residential data, r pr(people_sprayed) had their homes fumigated at least once (the majority of workers live off of the facility).

dry_days <- length(which(weather$precipitation > 0))
n_days <- length(weather$precipitation)
p_dry <- round(dry_days / n_days * 100)
amount_rain <- round(mean(weather$precipitation[weather$precipitation > 0], na.rm = T), digits = 2)
x <- weather %>%
  group_by(month = format(date, '%m')) %>%
  summarise(precipitation = mean(precipitation, na.rm = TRUE))

Absences: We observed r pr(nrow(ab_panel)) unique worker-days among the r pr(people_observed) workers. The all-period average absenteeism rate was r round(100 * sum(ab_panel$absent) / nrow(ab_panel), digits = 2)%, though this rate varied widely as a function of worker department, sex, residence, and season (table 1).

calculate <- function(var){
  out <- 
  final_data %>%
    mutate(year = format(date, '%Y')) %>%
    # mutate(on_site = ifelse(on_site,
    #                                 # 'On site',
    #                                 # 'Off site'),
    #        rainy = ifelse(rainy,
    #                       'Rainy',
    #                       'Dry')) %>%
    # filter(!is.na(rainy)) %>%
  group_by_(var, 'year') %>%
  summarise(absences = length(which(absent)),
            eligibles = length(absent),
            workers = length(unique(oracle_number))) %>%
  ungroup %>%
  mutate(absenteeism_rate = absences / eligibles * 100) 
  names(out)[1] <- 'variable'
  out$val <- paste0(round(out$absenteeism_rate, digits = 1),
                    '%')#,
                    # ')',
                    # '\n',
                    # ' (workers:', out$workers,
                    # ')')
  out <- 
    out %>%
    dplyr::select(variable, year, val)

  out <- out %>%
    spread(key = year,
           value = val)
  out$variable <- as.character(out$variable)
  new_var <- #ifelse(var == 'on_site',
                    # 'Residence',
                    ifelse(var == 'rainy',
                           'Precipitation',
                           var)
  # )
  left <- data.frame(x = c(new_var, rep('', (nrow(out) - 1))))
  out <- cbind(left, out)
  names(out)[1:2] <- c('Variable', 'x')
  # out$variable[1] <- paste0(Hmisc::capitalize(new_var), ': ', out$variable[1])
  return(out)
}
out_list <- list()
vars <- c('field', 'permanent_or_temporary', 'sex')
# vars <- c(#'season', 
#            'field',
#           'permanent_or_temporary',
#            'sex')#,
#            # 'on_site',
#            # 'rainy'
#           # )
for(i in 1:length(vars)){
  x <- calculate(var = vars[i])
  out_list[[i]] <- x
}
ab_table <- bind_rows(out_list)
ab_table$Variable <- Hmisc::capitalize(ab_table$Variable)
# ab_table$Variable[ab_table$Variable == 'Season'] <- 'Malaria season'
ab_table$Variable[ab_table$Variable == 'Field'] <- 'Worker type'
ab_table$Variable[ab_table$Variable == 'Permanent_or_temporary'] <- 'Contract'
ab_table$x <- Hmisc::capitalize(ab_table$x)
names(ab_table)[2] <- ''
kable(ab_table, format = 'html') %>% kable_styling()
# print(xtable::xtable(ab_table,
#                      caption = 'Absenteeism rate by year and worker characteristics',
#                      align = c('r', 'r', 'l', rep('l', 4))),
#       include.rownames = FALSE)

ov_ab <- round(100 * mean(ab_panel$absent[between(ab_panel$date, as.Date('2012-01-01'), as.Date('2016-12-31'))]), digits = 2)

ov_cl <- length(which(between(clinic$date, as.Date('2012-01-01'), as.Date('2016-12-31'))))

Precipitation: Of the r n_days days observed, r dry_days had no rainfall at all (ie, approximately two-thirds). On days for which there was any rainfall at all, the average amount was approximately r amount_rain centimeters. Rain was most common in December and January (average of 5-5.5 cm daily) and least common in August and September (average of 0.5 cm daily). Days with any rainfall saw more absenteeism than days with no rainfall (Figure 6, panel A) and among days with any rainfall, more precipitation was associated with greater absenteeism (Figure 6, panel B) (correlation coefficient of 0.25).

bi_data <- final_data %>%
  group_by(date) %>%
  mutate(any_rain = mean(precipitation, na.rm = TRUE) > 0) %>%
  ungroup %>%
  filter(!is.na(any_rain)) %>%
  mutate(any_rain = ifelse(any_rain, 'Some rain', 'No rain')) %>%
  group_by(any_rain, date) %>%
  summarise(absenteeism = mean(absent) * 100)
bi_data_simple <- bi_data %>%
  group_by(any_rain) %>%
  summarise(absenteeism = mean(absenteeism))
p1 <- ggplot(data = bi_data,
       aes(x = any_rain,
           y = absenteeism)) +
  geom_jitter(alpha = 0.3, size = 0.5) +
  geom_violin(alpha = 0.4, fill = 'lightblue') +
  scale_y_log10() +
  theme_maragra() +
  geom_point(data = bi_data_simple,
             aes(x = any_rain,
                 y = absenteeism),
             alpha = 0.8) +
  geom_text(data = bi_data_simple,
             aes(x = any_rain,
                 y = absenteeism,
                 label = paste0(round(absenteeism, digits = 2), '\n(median)')),
             color = 'black') +
  labs(x = '',
       y = 'Absenteeism rate (log scale)',
       title = 'A')

plot_data <- final_data %>%
  group_by(date) %>%
  summarise(precipitation = mean(precipitation, na.rm = TRUE),
            absenteeism = mean(absent)) %>%
  ungroup %>%
  filter(precipitation > 0.1)
p2 <- ggplot(data = plot_data,
       aes(x = precipitation,
           y = absenteeism * 100)) +
  geom_point(alpha = 0.3, size = 0.5) +
  scale_x_log10() +
  geom_smooth() +
  # scale_y_log10() +
  theme_maragra() +
  labs(x = 'Daily precipitation (log scale)',
       y = 'Absenteeism rate', 
       title = 'B')
Rmisc::multiplot(p1, p2, cols = 2)

pd <- final_data %>%
  group_by(protection) %>% tally
ggplot(data = pd,
       aes(x = protection,
           y = n)) +
  geom_bar(stat = 'identity',
           fill = 'lightblue',
               alpha = 0.6) +
  theme_maragra() +
  labs(x = 'Herd protection score',
       y = 'Density')

\newpage

Sensitivity analysis

# The below shows average FE for year / worker, during malaria season, on a rainy day...

expander <- function(x){sort(unique(unlist(final_data[,x])))}
dummy <- expand.grid(#season = expander('season'),
                     months_since = expander('months_since'),
                     rainy_day = expander('rainy_day'),
                     herd = mean(final_data$herd),
                     group = expander('group'))
dummy$predicted <- NA

groups <- sort(unique(dummy$group))
for (i in 1:length(groups)){
    message(i)
    this_group <- groups[i]
    indices <- which(dummy$group == this_group)
    model <- protection_models[[this_group]]
    fe <- getfe(model)
    # Add back in the mean effect of fixed effects
    add <- fe %>% group_by(fe) %>% summarise(effect = mean(effect)) %>% summarise(sum(effect)) %>% unlist %>% as.numeric
    data <- dummy[indices,]  
    # predict_felm(model = model, data = )
    dummy$predicted[indices] <- 
      (regtools::predict.felm(object = model,
                             newdata = data,
                             use.fe = FALSE) + add) * 100
}

ggplot(data = dummy %>% filter(rainy_day),
       aes(x = months_since,
           y = predicted,
           color = season,
           group = season)) +
  geom_line() +
  geom_point() +
  facet_wrap(~group, scales = 'free_y') +
  scale_colour_manual(name = 'Malaria\nseason',
                      values = c('blue', 'red')) +
  theme_maragra() +
  labs(x = 'IRS status',
       y = 'Estimated absenteeism') #+
  # theme(legend.position = 'right')

# The below shows estimated importance of herd protection, and heterogeniety across groups.


dummy <- expand.grid(season = expander('season'),
                     months_since = expander('months_since'),
                     rainy_day = expander('rainy_day'),
                     herd = 0:1000,
                     group = expander('group'))
dummy$predicted <- NA


groups <- sort(unique(dummy$group))
for (i in 1:length(groups)){
    message(i)
    this_group <- groups[i]
    indices <- which(dummy$group == this_group)
    model <- protection_models[[this_group]]
    fe <- getfe(model)
    # Add back in the mean effect of fixed effects
    add <- fe %>% group_by(fe) %>% summarise(effect = mean(effect)) %>% summarise(sum(effect)) %>% unlist %>% as.numeric
    data <- dummy[indices,]  
    dummy$predicted[indices] <- 
      (regtools::predict.felm(object = model,
                             newdata = data,
                             use.fe = FALSE) + add) * 100
}

ggplot(data = dummy %>% filter(rainy_day, season == 'high'),
       aes(x = herd,
           y = predicted,
           color = months_since,
           group = months_since)) +
  geom_line() +
  facet_wrap(~group, scales = 'free_y') +
  scale_colour_manual(name = 'IRS status',
                      values = c('darkorange', 'darkgreen')) +
  theme_maragra() +
  labs(x = 'Herd protection score',
       y = 'Estimated absenteeism')

What would happen without IRS?

# The below chart shows the modeled average absenteeism rate during malaria season for permanent workers, by group type.

x <- final_data %>%
  # dplyr::filter(season == 'high'#,
  #               # group == groups[1],
  #               # permanent_or_temporary == 'Permanent'
  #               )
x <- x [,grepl('predicted', names(x)) | names(x) == 'group']
x <- x %>% gather(key, value, predicted_max_herd_max_irs:predicted)
dict <- data.frame(key = c('predicted', 
                           'predicted_no_herd',
                           'predicted_no_irs',
                           'predicted_no_herd_no_irs',
                           'predicted_max_herd_max_irs',
                           'predicted_max_herd',
                           'predicted_max_irs'),
                   new_key = c('As is',
                               'No herd protection',
                               'No individual protection',
                               'No protection (herd or individual)',
                               'Protection: maximum IRS and herd',
                               'Protection: maximum herd',
                               'Protection: maximum IRS'))

x <- left_join(x,
               dict)
plot_data <- x %>%
  mutate(new_key = gsub(' ', '\n', new_key)) %>%
  group_by(group, 
           new_key) %>%
  summarise(avg = mean(value, na.rm = TRUE),
            med = median(value, na.rm = TRUE),
            upr = quantile(value, 0.75, na.rm = TRUE),
            lwr = quantile(value, 0.25, na.rm = TRUE))
levs <- gsub(' ', '\n', c('No herd protection',
                               'No individual protection',
                               'No protection (herd or individual)',
                               'As is',
                               'Protection: maximum IRS',
                               'Protection: maximum herd',
                               'Protection: maximum IRS and herd'))
plot_data$new_key <- factor(plot_data$new_key,
                       levels = levs)
cols <- colorRampPalette(brewer.pal(n = 9,
                                    name = 'Set1'))(length(unique(plot_data$group)))

library(ggrepel)
ggplot(data = plot_data,
       aes(x = new_key,
           y = avg * 100,
           color = group)) +
  geom_point(size = 4) +
  theme_maragra() +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(x = 'Scenario',
       y = 'Absenteeism rate') +
  geom_line(aes(group = group),
            alpha = 0.5,
            size = 1) +
  scale_color_manual(name = '',
                     values = cols) #+
  # ggrepel::geom_label_repel(aes(label = round(avg * 100, digits = 1)))

Return on investment

Costs

costs <- maragra::costs
mean(costs$total)

# 14152 avoided absences

74103 / 14152

# # Working days per year
# x <- ab_panel %>%
#   mutate(year = as.numeric(format(date, '%Y'))) %>%
#   # remove 2016, since smaller sample
#   filter(year != 2016) %>%
#   group_by(year) %>%
#   tally %>%
#   summarise(x = mean(n))
# x
# 
# x$x / 26.6 # number of days to prevent an absence

Number of working days

x <- ab_panel %>% group_by(date) %>% tally
ggplot(data = x,aes(x = date,y = n)) + geom_line() +
  labs(title = mean(x$n))

Number of working days annually

options(scipen = '999')
x <- ab_panel %>% mutate(year = as.numeric(format(date, '%Y'))) %>% group_by(year) %>% tally
ggplot(data = x,aes(x = year,y = n)) + geom_line() +
  labs(title = mean(x$n))

Number of prevented absences per year

ratio <- length(unique(ab_panel$oracle_number)) / length(unique(final_data$oracle_number))

x <- final_data %>%
  group_by(year = format(date, '%Y')) %>%
  summarise(eligibles = n(),
            absences = mean(as.numeric(absent), na.rm = TRUE) * n(),
            counterfactual = mean(predicted_no_herd_no_irs,na.rm =TRUE) * n()) 
y <- x %>%
  summarise(x = mean(counterfactual) - mean(absences)) %>%
  unlist * ratio
# Annual prevented
y
x <- x %>%
  mutate(prevented = counterfactual - absences)
# Number of presences to prevent 1 case
sum(x$eligibles) / sum(x$prevented)

# Clinical costs
(y /
  80) * # number of absences it takes to produce one company clinic positve
  31.49 # cost per clinical case per ezenduka

# Wage costs
y * c(3.5, 22)
  # 4 # assuming USD

\newpage

Robustness and generalizability

ro_data <- final_data %>%
# ro_data <- ab_panel %>%
#   left_join(irs, by = c('date', 'unidade')) %>%
#   filter(days_since < 0) %>%
  mutate(pre_irs = between(days_since, -10, -1)) 
#   # mutate(time_period = lendable::time_period_extract(date)) %>%
#   # mutate(time_period = as.character(time_period)) %>%
#   mutate(time_period = make_season(date = date)) %>%
#   mutate(absent_sick = ifelse(is.na(absent_sick), FALSE, absent_sick)) %>%
#   left_join(workers %>%
#               dplyr::select(oracle_number,
#                             sex,
#                             department,
#                             permanent_or_temporary),
#             by = 'oracle_number')
# ro_data$time_period <- as.character(ro_data$time_period)


make_model <- function(formula = 'absent ~ pre_irs + time_period'){
  ro_model <- glm(as.formula(formula), 
                data = ro_data,
                family = binomial('logit'))
  coefs <- exp(coef(ro_model))
  ci <- confint(ro_model)
  ci <- exp(ci)
  out <- data.frame(Variable = names(coefs),
                    Estimate = coefs,
                    Lower = ci[,1],
                    Upper = ci[,2])
  row.names(out) <- NULL
  out$`P value` <- coef(summary(ro_model))[,4]
  out$Significant <- ifelse(out$`P value` <= 0.001, 'xxx',
                            ifelse(out$`P value` <= 0.01, 'xx',
                                   ifelse(out$`P value` <= 0.05, 'x',
                                          '')))
  out$Variable <- as.character(out$Variable)
  out$Variable <- gsub('pre_irsTRUE', '10 days prior to IRS', out$Variable)
  out$Variable <- gsub('time_period', '', out$Variable)
  return(kable(out, format = "html") %>%
  kable_styling()# %>%
    # row_spec(row = which(grepl('prior', out$Variable)),background = "yellow")
  )
}

# # BASIC FORMULA
# make_model('absent ~ pre_irs + time_period')

# # BASIC FORMULA WITH INTERACTIONS
# make_model('absent ~ pre_irs*time_period')

# # BASIC FORMULA WITH INTERACTIONS AND ADJUSTING FOR SEX
# make_model('absent ~ pre_irs*time_period + sex')
# 
# # BASIC FORMULA WITH INTERACTIONS AND ADJUSTING FOR SEX AND DEPARTMENT
# make_model('absent ~ pre_irs*time_period + sex + department')
# 
# 
# # BASIC FORMULA WITH INTERACTIONS AND ADJUSTING FOR SEX AND DEPARTMENT AND STATUS
# make_model('absent ~ pre_irs*time_period + sex + department + permanent_or_temporary')

# Adjust for seasonality and department
# make_model('absent ~ pre_irs*season + group + rainy_day')

# The below shows our robustness check for sick only absenteeism. Unlike with all cause absenteeism, during malaria season, being in the period 10 days prior to IRS is associated with statistically greater likelihood of being absent for illness. In other words, it appears that there is somewhat of a feedback loop: when a worker misses work due to illness, his/her likelihood of getting IRS doubles in the next 10 days.\todo{What to do about this???}
# make_model('absent_sick ~ pre_irs*time_period + department')

Bibliography

joebrew/maragra documentation built on Aug. 11, 2020, 8:39 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

joebrew/maragra
An analysis of malaria at the Maragra sugar company, Mozambique

In joebrew/maragra: An analysis of malaria at the Maragra sugar company, Mozambique

Calculate household protection

Manuscript draft

Classify protection plot

Descriptive overview

Herd-only protection score

Individual-only protection score

Overall protection score

Precipitation lag

Results

Regression table

Intepretation

Precipitation's effect

Simulations

Costs

Charts for paper

Sensitivity analysis

What would happen without IRS?

Return on investment

Robustness and generalizability

Bibliography

R Package Documentation

Browse R Packages

We want your feedback!

joebrew/maragra An analysis of malaria at the Maragra sugar company, Mozambique

In joebrew/maragra: An analysis of malaria at the Maragra sugar company, Mozambique

Calculate household protection

Manuscript draft

Classify protection plot

Descriptive overview

Herd-only protection score

Individual-only protection score

Overall protection score

Precipitation lag

Results

Regression table

Intepretation

Precipitation's effect

Simulations

Costs

Charts for paper

Sensitivity analysis

What would happen without IRS?

Return on investment

Robustness and generalizability

Bibliography

R Package Documentation

Browse R Packages

We want your feedback!

joebrew/maragra
An analysis of malaria at the Maragra sugar company, Mozambique