In joebrew/maragra: An analysis of malaria at the Maragra sugar company, Mozambique

# Packages
library(tidyverse)
library(knitr)
library(Hmisc)
library(brew)
library(maragra)
library(knitr)
library(RColorBrewer)
library(extrafont)
library(kableExtra)
loadfonts()
## Global options
options(max.print="75")
opts_chunk$set(echo=FALSE,
                 cache=FALSE,
               prompt=FALSE,
               tidy=TRUE,
               comment=NA,
               message=FALSE,
               warning=FALSE,
               # dev = "cairo_pdf",
               fig.pos="!h",
               fig.path = 'figures/')
opts_knit$set(width=75)
options(xtable.comment = FALSE)

source('paper_prepare.R')

# Build document from contents of directory
# Get a list of all the child documents
rmds <- list.files(pattern = '*.Rmd', recursive = FALSE, include.dirs = FALSE)
# Must include an underscore!
rmds <- rmds[grepl('_', rmds)]


chunks <- paste0("\n\n```{r child = '", 
                 rmds, 
                 "'}\n```\n")

# Write our order / child-calls to a doc
file_connection <- file('children.Rmd')
writeLines(paste0('---\noutput: html_document\n---\n\n', 
                  chunks), 
           file_connection)
close(file_connection)
# 
# ```r
# ```
# ```r
# file.remove('children.Rmd')
# ```

\begin{center} \begin{large}

make_authors()

\end{large} \end{center}

\vspace{5mm}

\begin{center} \textbf{Abstract}
\end{center}

\vspace{5mm} \begin{center} \begin{changemargin}{3cm}{3cm}

This paper provides new empirical evidence regarding the effect and return on investment of privately managed malaria control activities (indoor residual spraying with pesticides) on worker absenteeism in Mozambique. We analyze 4 years of malaria control and worker health and absenteeism data from a large sugar processing facility in Mozambique. We find that the benefits outweight the costs (ie, there is a positive return on investment) even when the consideration of benefits is limited to those directly accrued by the company. These findings suggest that the private sector may have an important role to play in malaria control in endemic areas.

\end{changemargin} \end{center}

\vspace{20mm}

\noindent\fbox{% \parbox{\textwidth}{% \subsection*{Research Highlights} \begin{itemize} \item This paper analyzes large, individual-level worker absenteeism data from malaria endemic zone. \item We quantify the effect of indoor residual spraying on absenteeism and clinical malaria. \item We estimate cost-effectiveness of malaria control from an investment standpoint. \item Results show ledger profitability, suggesting that the private sector could play a significant role in malaria elimination.
\end{itemize} \vspace{2mm} }% }

\vfill \null

\subsection*{Keywords} \textbf{Malaria; Investment; Health; Productivity; Agriculture; Absenteeism}

\vspace{3mm}

\newpage

Introduction

# Map of Manhica, etc.
Rmisc::multiplot(plotlist = map_list, cols = 2)

# Calculate area of maragra
# x <- bairros
# x$NeighCode <- as.numeric(as.character(x$NeighCode))
# x@data <- x@data %>%
#   mutate(maragra_bairro = NeighCode %in%
#            c(601:609,701:709, 1001:1003)) %>%
#   mutate(maragra_fabrica = NeighCode %in%
#            1001:1003)
# x <- spTransform(x,CRS("+proj=utm +zone=36 +datum=WGS84 +units=m") )
# 
# the_bairro <- x[x$maragra_bairro,]
# the_fabrica <- x[x$maragra_fabrica,]
# # the_bairro <- spTransform(the_bairro, CRS("+proj=utm +zone=36 +datum=WGS84 +units=m") )
# # the_fabrica <- spTransform(the_fabrica, CRS("+proj=utm +zone=36 +datum=WGS84 +units=m") )
# 
# rgeos::gArea(the_bairro) / 1000000
# rgeos::gArea(the_fabrica) / 1000000

# Used this for plantation estimation: https://www.daftlogic.com/projects-google-maps-area-calculator-tool.htm

quarter_extract <- function (date) {
    if (!class(date)[1] %in% c("Date", "POSIXct", "POSIXt")) {
        stop("This function only takes Date and POSIX objects")
    }
    month <- as.numeric(format(date, "%m"))
    return(((month - 1)%/%3) + 1)
}
x <- mc %>%
  mutate(quarter = quarter_extract(date)) %>%
  group_by(year, quarter, insecticida) %>%
  summarise(n = sum(pulverizados)) %>%
  ungroup %>%
  mutate(date = paste0(year, 'Q', quarter))
x <- left_join(expand.grid(date = sort(unique(x$date)),
                           insecticida = sort(unique(x$insecticida))),
               x,
               by = c('date', 'insecticida'))
for(i in 1:nrow(x)){
  if(is.na(x$n[i]) | x$n[i] < 1000){
    x$n[i] <- sample(1:1000, 1)
  }
}

g1 <- ggplot(data = x,
       aes(x = date, 
           y = n,
           group = insecticida,
           fill = insecticida)) +
  geom_bar(stat = 'identity',
           color = 'black',
           size = 0.1,
           alpha = 0.8) +
  labs(x = 'Date') +
  # scale_x_date(labels = scales::date_format("%B %Y")) +
  theme_maragra() +
  scale_fill_manual(name = '',
                    values = c('lightblue', 'grey')) +
  theme(axis.text.x = element_text(angle = 90, size = 7)) +
  labs(title = 'i.',
       y = 'Houses')

# Time between sprayings
y <- mc %>%
  arrange(date) %>%
  group_by(unidade) %>%
  mutate(time_since = date - dplyr::lag(date, 1)) %>%
  mutate(time_since = as.numeric(time_since)) %>%
  ungroup %>%
  filter(!is.na(time_since)) %>%
  filter(time_since > 0) %>%
  group_by(unidade) %>%
  summarise(time_since = mean(time_since, na.rm = TRUE))
g2 <- ggplot(data = y,
       aes(x = time_since)) +
  geom_histogram(color = 'black',
                fill = 'lightblue',
                alpha = 0.8,
                size = 0.1) +
  theme_maragra() +
  labs(x = 'Days since previous spray',
       y = 'Count',
       title = 'ii.')

Rmisc::multiplot(g1, g2, cols = 2)

\newpage

Methods

Budget info

costs_itemized <- maragra::costs_itemized
costs_raw <- maragra::costs_raw
costs <- maragra::costs
# Annual operating budget
aob <- round(mean(costs$usd), digits = 2)
print(paste0('Average annual operating budget of ', aob))


plot_data <- costs_itemized


ggplot(data = plot_data,
             aes(x = period,
                 y = total,
                 group = area,
                 fill = area)) +
  geom_bar(stat = 'identity',
           position = 'dodge') +
  scale_fill_manual(name = '',
                    values = colorRampPalette(brewer.pal(n = 9,
                                                         name = 'Set1'))(length(unique(plot_data$area)))) +
  theme_maragra() +
  labs(x = '',
       y = 'USD')

fumigations <- mc %>%
  filter(date >= '2013-01-01',
         date <= '2016-12-31')
n_fumigations <- nrow(fumigations)
n_buildings <- sum(fumigations$casas_cobertas, na.rm = TRUE)
unique_agregados <- length(unique(fumigations$unidade))
people_sprayed <- length(unique(model_data$oracle_number[model_data$days_since != 'Never']))
people_observed <- length(unique(model_data$oracle_number))
pr <- function(x){prettyNum(x, big.mark = ',')}

print('Assuming linearity in costs, the program spends approximately xxx per building sprayed.') 
aob / (n_buildings / 4)

# print('With each agregado containing an average of 2.2 workers.')

x = ab_panel %>%
  mutate(year = as.numeric(format(date, '%Y')),
         month = as.numeric(format(date, '%m'))) %>%
  group_by(year, month) %>%
  summarise(absences = length(which(absent))) %>%
  ungroup %>%
  left_join(clinic_agg %>% group_by(year, month) %>% 
              summarise(positive = sum(positive))) %>%
  mutate(percent_clinic = positive / absences * 100) %>%
  mutate(n_ab_for_clinic = absences / positive)

mean(x$n_ab_for_clinic, na.rm = TRUE)

# Fortify bairros
library(maptools)
library(rgeos)
border <- gConvexHull(bairros_maragra_bairro)
border <- SpatialPolygonsDataFrame(Sr = border,
                                   data = data.frame(a = 1))
# border <- unionSpatialPolygons(bairros_maragra_bairro, 
#                           IDs = rep(1, nrow(bairros_maragra_bairro)),
#                           threshold = 9^9)
border_fortified <- 
  broom::tidy(border,
              regions = 'Id')

# Create average herd risk score
x <- model_data %>%
  group_by(lng = longitude_aura,
           lat = latitude_aura) %>%
  summarise(herd = mean(herd, na.rm = TRUE)) %>%
  ungroup %>%
  filter(!is.na(lng))
x_sp <- x
coordinates(x_sp) <- ~lng+lat   
proj4string(x_sp) <- proj4string(border)
overs <- sp::over(x_sp, border)
x <- x[!is.na(overs),]
# ggplot() +
#   geom_point(data = x,
#        aes(x = lng,
#            y = lat,
#            color = herd)) +
#   geom_polygon(data = border_fortified,
#                aes(x = long,
#                    y = lat,
#                    group = group),
#                fill = NA,
#                color = 'black') +
#   ggthemes::theme_map() +
#   scale_color_continuous(name = 'Herd\nprotection\nscore',
#                          low = 'orange',
#                          high = 'blue')

df_grid <- expand.grid(lng = seq(bbox(border)[1,1],
                                 bbox(border)[1,2],
                                 by = 0.0008),
                       lat = seq(bbox(border)[2,1],
                                 bbox(border)[2,2],
                                 by = 0.0008),
                       herd = NA)
df_grid$latitude <- df_grid$lat
df_grid$longitude <- df_grid$lng
coordinates(df_grid) <- ~longitude+latitude

# Go through each row of df_grid, getting the 
# weighted mean irs score for that point
# and putting a color into df_grid
if('df_grid.RData' %in% dir()){
  load('df_grid.RData')
} else {
  for (i in 1:nrow(df_grid)){
  # Get distance from every point in x_sp
  distances <- spDistsN1(pts = x_sp,
                        pt = df_grid[i,],
                        longlat = TRUE)
  # Define which are acceptably close
  close_enough <- which(distances <= 50)
  # Get an herd score
  herd <- stats::weighted.mean(x = x_sp$herd[close_enough],
                       w = (1 / distances[close_enough]) ^2,
                       na.rm = TRUE)
  # Assign irs to the dataframe
  df_grid$herd[i] <- herd

}
save('df_grid', file = 'df_grid.RData')
}

library(raster)
# Convert df_grid to raster
temp <- df_grid@data %>% arrange(lng, lat)
r <- rasterFromXYZ(temp[, c('lng', 'lat', 'herd')])

proj4string(df_grid) <- proj4string(border)
x <- over(df_grid, polygons(border))
df_grid_small <- df_grid[!is.na(x),]
temp <- df_grid_small@data %>% arrange(lng, lat)
r <- rasterFromXYZ(temp[, c('lng', 'lat', 'herd')])
values(r) <- values(r) / max(values(r), na.rm = TRUE) * 100
plot(border, col = NA, border = NA)
plot(r, add = TRUE)
# plot(border, add = TRUE)
plot(bairros, add = TRUE)

x <- model_data %>%
  group_by(group) %>%
  summarise(avg = mean(herd, na.rm = TRUE),
            avg_ideal = mean(herd_ideal, na.rm = TRUE)) %>%
  ungroup %>%
  mutate(avg = avg / max(avg) * 100,
            avg_ideal = avg_ideal / max(avg_ideal) * 100)

Econometric model

Our model specification is as follows

$$ \hat{Y_{it}} = \hat{\beta}{0} + \hat{\beta}{1}\text{Season}{t} * IRS{it} + \hat{\beta}2{RainyDay{t}} + \hat{\beta}3{Herd{it}} + \alpha_i + \delta_t + \upsilon_{it} $$

$\hat{Y}{it}$ is the rate of absence. $\beta{1}$ is the binary "season" variable, imputed from overall district clinical incidence, interacted with our "treatment" condition: whether or not the worker in question's residence was sprayed with IRS in the preceding 6 months or not. $\beta_{2}$ is whether it was raining on that particular day or not. $\beta_{3}$ represents the distance-weighted herd protection score. $\alpha_i$ represents the time invariant worker fixed effects, and $\delta_i$ represents the fixed effect of the particular malaria season. $\upsilon$ is the error term.

Our formula for return on investment can be described in a straightforward fashion...

\begin{center} $R = \dfrac{P_{w} - S_{wa} - S_{wc}}{P_{w}}$

\end{center}

...where $R$ is the return on investment, $P$ is the malaria control program's total operating cost, $w$ refers to costs at the per-worker level, $a$ refers to savings through avoided absences, and $ c $ refers to savings through avoided clinical encounters. We define the malaria control program as "profitable" from an investment standpoint if ROI is greater than 100%, ie if the savings associated with the estimated effect of IRS is greater than the costs of the program's administration.

\newpage

Results

Descriptive statistics

# Calculate number of workers
n <- length(unique(ab_panel$oracle_number[ab_panel$date >= '2012-01-01' & ab_panel$date <= '2016-12-31']))

We collected absenteeism and demographic data data on r n workers from 2013 through 2016.

# x <- ab_panel %>%
#   left_join(workers %>%
#               dplyr::select(oracle_number, date_of_birth, permanent_or_temporary, sex))
x <- model_data
x$age <-as.numeric(as.Date('2014-01-01') - x$date_of_birth) / 365.25
x$agey <- x$age
x$age <- round(x$age, digits = -1)
x$status <- x$permanent_or_temporary


calculate <- function(var= 'sex',
                      data){
  out <- 
    data %>%
    group_by_(var) %>%
    summarise(`Treatment workers` = length(unique(oracle_number[!months_since %in% c('Never', 'Before')])),
              `Treatment days` = length(oracle_number[!months_since %in% c('Never', 'Before')]),
              `Control workers` = length(unique(oracle_number[months_since %in% c('Never', 'Before')])),
              `Control days` = length(oracle_number[months_since %in% c('Never', 'Before')])) %>%
    ungroup() %>%
    # mutate(p = round(n / sum(n) * 100, digits = 2)) %>%
    mutate(variable = '') 
    names(out)[1] <- ' '
    out$variable[1] <- Hmisc::capitalize(var)

    out <- out %>% dplyr::select(variable, ` `, `Treatment workers`, `Treatment days`, `Control workers`, `Control days`)
    out$` ` <- as.character(out$` `)
  return(out)
}
out_list <- list()
vars <- c('department', 
           'sex',
           'status',
           'age')
for(i in 1:length(vars)){
  message(i)
  y <- calculate(var = vars[i], data = x)
  out_list[[i]] <- y
}
demo_table <- bind_rows(out_list)
names(demo_table)[1:2] <- c('Variable', ' ')
kable(demo_table, format = 'html') %>% kable_styling()
# print(xtable::xtable(demo_table,
#                      caption = 'Overall worker characteristics',
#                      align = c('r', 'r', rep('l', 5))),
#       include.rownames = FALSE)

# x$age <- round(as.numeric(as.Date('2014-01-01') - x$date_of_birth) / 365.25, digits = 2)
g1 <- ggplot(data = x,
       aes(x = jitter(agey, factor = 1000),
           group = sex,
           fill = sex)) +
  ggridges::geom_density_ridges(aes(y = sex,
                                    fill = sex),
                                height = 0.2) +
  facet_wrap(~permanent_or_temporary) +
  theme_maragra() +
  labs(x = 'Age',
       y = 'Density') +
  scale_fill_manual(name = 'Sex',
                    values = c('lightblue', 'grey'))
g1

x <- weather %>%
  mutate(month = format(date, '%m')) %>%
  mutate(year = format(date, '%Y')) %>%
  mutate(date = as.Date(paste0(year, '-', month, '-01'))) %>%
  group_by(date) %>%
  summarise(precipitation = sum(precipitation, na.rm = TRUE),
            temp_max = max(temp_max, na.rm = TRUE),
            temp_min = min(temp_min, na.rm = TRUE),
            temp = mean(temp))
# x <- gather(x, key, value,precipitation:temp)

g1 <- ggplot(data = x,
       aes(x = date,
           y = precipitation)) +
  geom_bar(stat = 'identity',
           fill = 'lightblue',
           size = 0.1) +
  theme_maragra() +
  theme(axis.text.x = element_text(angle = 0)) +
  labs(x = 'Quarter',
       y = 'Milimeters') +
  scale_x_date(labels = scales::date_format("%Y"))

g2 <- ggplot(data = x,
            aes(x = date,
                y = temp)) +
  geom_bar(stat = 'identity', fill = 'grey', size = 0.1) +
  geom_line(aes(y = temp_max),
             color = 'darkred', 
             alpha = 0.5) +
  geom_line(aes(y = temp_min),
             color = 'blue', 
             alpha = 0.5) +
  # geom_linerange(aes(ymin = temp_min,
  #                    ymax = temp_max),
  #                alpha = 0.6,
  #                color = 'blue') +
   theme_maragra() +
  theme(axis.text.x = element_text(angle = 0)) +
  labs(x = 'Quarter',
       y = 'Degrees (Celcius)') +
  scale_x_date(labels = scales::date_format("%Y"))
Rmisc::multiplot(g1, g2, cols = 2)

# BES incidence
g1 <- 
  ggplot(data = bes %>%
           mutate(season = ifelse(p >= median(p), 'High season',
                                  'Low season')) %>%
           filter(weekdays(date) == 'Monday'),
       aes(x = date,
           y = p)) +
  geom_point(alpha = 0.6,
             aes(color = season),
             size = 0.2) +
  geom_line(alpha = 0.1) +
  theme_maragra(base_size = 9) +
  labs(x = 'Date',
       y = 'Incidence',
       title = 'A: Annualized district clinical\nmalaria incidence per 1,000') +
  scale_color_manual(name = '',
                     values = c('red', 'black')) + 
  guides(color = guide_legend(ncol = 1)) +
  theme(legend.position = c(.8, .8),
        legend.text = element_text(size = 5),
        legend.background = element_rect(fill=alpha('white', 0)))

# Absenteeism rate
g2_data <- ab_panel %>%
  group_by(month = as.Date(paste0(format(date, '%Y-%m'), '-01'))) %>%
  summarise(absences = length(which(absent)),
            absences_sick = length(which(absent_sick)),

            eligibles = n()) %>%
  ungroup %>%
  mutate(absenteeism_rate = absences / eligibles * 100,
         sick_absenteeism_rate = absences_sick / eligibles * 100)

g2 <- ggplot(data = g2_data,
             aes(x = month,
                 y = absenteeism_rate)) +
  geom_line(alpha = 0.6) +
  theme_maragra(base_size = 9) +
  labs(x = 'Date',
       y = 'Absenteeism rate',
       title = 'B: Monthly absenteeism rate')

g3 <- ggplot(data = g2_data,
             aes(x = month,
                 y = sick_absenteeism_rate)) +
  geom_line(alpha = 0.6) +
  theme_maragra(base_size = 9) +
  labs(x = 'Date',
       y = 'Absenteeism rate',
       title = 'C: Monthly sick absenteeism rate') 

# Weather
g4 <-
  ggplot(data = weather %>%
           group_by(date = as.Date(paste0(format(date, '%Y-%m'), '-01'))) %>%
           summarise(precipitation = sum(precipitation, na.rm = TRUE)),
         aes(x = date,
             y = precipitation)) +
  geom_line(alpha = 0.6) +
  theme_maragra(base_size = 9) +
  labs(x = 'Date',
       y = 'Mililiters',
       title = 'D: Monthly precipitation') 

# Clinical incidence of malaria
g5 <- clinic_agg %>%
  filter(date >= '2013-01-01', date <= '2016-12-31') %>%
  # mutate(date = as.Date(paste0(year, '-', month, '-01'))) %>%
  group_by(date) %>%
  summarise(tested = sum(tested),
            positive = sum(positive)) %>%
  ungroup %>%
  mutate(percent_positive = positive / tested * 100) %>%
  ggplot(aes(x = date,
             y = positive)) +
  geom_line(alpha = 0.6) +
  theme_maragra(base_size = 9) +
  labs(x = 'Date',
       y = 'Cases',
       title = 'D: Monthly malaria cases\nat company clinic') 

g6 <- clinic_agg %>%
  filter(date >= '2013-01-01', date <= '2016-12-31') %>%
  # mutate(date = as.Date(paste0(year, '-', month, '-01'))) %>%
  group_by(date) %>%
  summarise(tested = sum(tested),
            positive = sum(positive)) %>%
  ungroup %>%
  mutate(percent_positive = positive / tested * 100) %>%
  ggplot(aes(x = date,
             y = percent_positive)) +
  geom_line(alpha = 0.6) +
  theme_maragra(base_size = 9) +
  labs(x = 'Date',
       y = 'Cases',
       title = 'E: Monthly malaria test positivity rate\nat company clinic') 
Rmisc::multiplot(g1, g2, g3,
                 # g4, 
                 g5, g6, layout = matrix(c(1, 1, 2, 3, 4, 5), nrow=2, byrow=TRUE))

fumigations <- mc %>%
  filter(date >= '2013-01-01',
         date <= '2016-12-31')
n_fumigations <- nrow(fumigations)
n_buildings <- sum(fumigations$casas_cobertas, na.rm = TRUE)
unique_agregados <- length(unique(fumigations$unidade))
people_sprayed <- length(unique(model_data$oracle_number[model_data$days_since != 'Never']))
people_observed <- length(unique(model_data$oracle_number))
pr <- function(x){prettyNum(x, big.mark = ',')}

Fumigations: During the period from January 1st, 2013 through December 31st, 2016, the Maragra Malaria Control Unit carried out r pr(n_fumigations) episodes of fumigation of residential "agregados" (household combinations), for a total of r pr(n_buildings) building-fumigation combinations. The total number of unique agregados sprayed during this period was r pr(unique_agregados). Among the r pr(people_observed) workers for whom we have reliable absenteeism and residential data, r pr(people_sprayed) had their homes fumigated at least once (the majority of workers live off of the facility).

dry_days <- length(which(weather$precipitation > 0))
n_days <- length(weather$precipitation)
p_dry <- round(dry_days / n_days * 100)
amount_rain <- round(mean(weather$precipitation[weather$precipitation > 0], na.rm = T), digits = 2)
x <- weather %>%
  group_by(month = format(date, '%m')) %>%
  summarise(precipitation = mean(precipitation, na.rm = TRUE))

Absences: We observed r pr(nrow(ab_panel)) unique worker-days among the r pr(people_observed) workers. The all-period average absenteeism rate was r round(100 * sum(ab_panel$absent) / nrow(ab_panel), digits = 2)%, though this rate varied widely as a function of worker department, sex, residence, and season (table 1).

calculate <- function(var){
  out <- 
  model_data %>%
    mutate(year = format(date, '%Y')) %>%
    mutate(on_site = ifelse(on_site,
                                    'On site',
                                    'Off site'),
           rainy = ifelse(rainy,
                          'Rainy',
                          'Dry')) %>%
    filter(!is.na(rainy)) %>%
  group_by_(var, 'year') %>%
  summarise(absences = length(which(absent)),
            eligibles = length(absent),
            workers = length(unique(oracle_number))) %>%
  ungroup %>%
  mutate(absenteeism_rate = absences / eligibles * 100) 
  names(out)[1] <- 'variable'
  out$val <- paste0(round(out$absenteeism_rate, digits = 1),
                    '%')#,
                    # ')',
                    # '\n',
                    # ' (workers:', out$workers,
                    # ')')
  out <- 
    out %>%
    dplyr::select(variable, year, val)

  out <- out %>%
    spread(key = year,
           value = val)
  out$variable <- as.character(out$variable)
  new_var <- ifelse(var == 'on_site',
                    'Residence',
                    ifelse(var == 'rainy',
                           'Precipitation',
                           var))
  left <- data.frame(x = c(new_var, rep('', (nrow(out) - 1))))
  out <- cbind(left, out)
  names(out)[1:2] <- c('Variable', 'x')
  # out$variable[1] <- paste0(Hmisc::capitalize(new_var), ': ', out$variable[1])
  return(out)
}
out_list <- list()
vars <- c('season', 
           'field',
          'permanent_or_temporary',
           'sex',
           'on_site',
           'rainy')
for(i in 1:length(vars)){
  x <- calculate(var = vars[i])
  out_list[[i]] <- x
}
ab_table <- bind_rows(out_list)
ab_table$Variable <- Hmisc::capitalize(ab_table$Variable)
ab_table$Variable[ab_table$Variable == 'Season'] <- 'Malaria season'
ab_table$Variable[ab_table$Variable == 'Field'] <- 'Worker type'
ab_table$Variable[ab_table$Variable == 'Permanent_or_temporary'] <- 'Contract'
ab_table$x <- Hmisc::capitalize(ab_table$x)
names(ab_table)[2] <- ''
kable(ab_table, format = 'html') %>% kable_styling()
# print(xtable::xtable(ab_table,
#                      caption = 'Absenteeism rate by year and worker characteristics',
#                      align = c('r', 'r', 'l', rep('l', 4))),
#       include.rownames = FALSE)

ov_ab <- round(100 * mean(ab_panel$absent[between(ab_panel$date, as.Date('2012-01-01'), as.Date('2016-12-31'))]), digits = 2)

ov_cl <- length(which(between(clinic$date, as.Date('2012-01-01'), as.Date('2016-12-31'))))

Precipitation: Of the r n_days days observed, r dry_days had no rainfall at all (ie, approximately two-thirds). On days for which there was any rainfall at all, the average amount was approximately r amount_rain centimeters. Rain was most common in December and January (average of 5-5.5 cm daily) and least common in August and September (average of 0.5 cm daily). Days with any rainfall saw more absenteeism than days with no rainfall (Figure 6, panel A) and among days with any rainfall, more precipitation was associated with greater absenteeism (Figure 6, panel B) (correlation coefficient of 0.25).

bi_data <- model_data %>%
  group_by(date) %>%
  mutate(any_rain = mean(precipitation, na.rm = TRUE) > 0) %>%
  ungroup %>%
  filter(!is.na(any_rain)) %>%
  mutate(any_rain = ifelse(any_rain, 'Some rain', 'No rain')) %>%
  group_by(any_rain, date) %>%
  summarise(absenteeism = mean(absent) * 100)
bi_data_simple <- bi_data %>%
  group_by(any_rain) %>%
  summarise(absenteeism = mean(absenteeism))
p1 <- ggplot(data = bi_data,
       aes(x = any_rain,
           y = absenteeism)) +
  geom_jitter(alpha = 0.3, size = 0.5) +
  geom_violin(alpha = 0.4, fill = 'lightblue') +
  scale_y_log10() +
  theme_maragra() +
  geom_point(data = bi_data_simple,
             aes(x = any_rain,
                 y = absenteeism),
             alpha = 0.8) +
  geom_text(data = bi_data_simple,
             aes(x = any_rain,
                 y = absenteeism,
                 label = paste0(round(absenteeism, digits = 2), '\n(median)')),
             color = 'black') +
  labs(x = '',
       y = 'Absenteeism rate (log scale)',
       title = 'A')

plot_data <- model_data %>%
  group_by(date) %>%
  summarise(precipitation = mean(precipitation, na.rm = TRUE),
            absenteeism = mean(absent)) %>%
  ungroup %>%
  filter(precipitation > 0.1)
p2 <- ggplot(data = plot_data,
       aes(x = precipitation,
           y = absenteeism * 100)) +
  geom_point(alpha = 0.3, size = 0.5) +
  scale_x_log10() +
  geom_smooth() +
  # scale_y_log10() +
  theme_maragra() +
  labs(x = 'Daily precipitation (log scale)',
       y = 'Absenteeism rate', 
       title = 'B')
Rmisc::multiplot(p1, p2, cols = 2)

ggplot(data = model_data,
       aes(x = herd)) +
  geom_density(fill = 'lightblue',
               alpha = 0.6) +
  theme_maragra() +
  labs(x = 'Herd protection score',
       y = 'Density')

\newpage

make_models_table(model_list = protection_models, the_caption = "All absenteeism with herd immunity: model results")

Sensitivity analysis

The below shows average FE for year / worker, during malaria season, on a rainy day...

expander <- function(x){sort(unique(unlist(model_data[,x])))}
dummy <- expand.grid(season = expander('season'),
                     months_since = expander('months_since'),
                     rainy_day = expander('rainy_day'),
                     herd = mean(model_data$herd),
                     group = expander('group'))
dummy$predicted <- NA

groups <- sort(unique(dummy$group))
for (i in 1:length(groups)){
    message(i)
    this_group <- groups[i]
    indices <- which(dummy$group == this_group)
    model <- protection_models[[this_group]]
    fe <- getfe(model)
    # Add back in the mean effect of fixed effects
    add <- fe %>% group_by(fe) %>% summarise(effect = mean(effect)) %>% summarise(sum(effect)) %>% unlist %>% as.numeric
    data <- dummy[indices,]  
    # predict_felm(model = model, data = )
    dummy$predicted[indices] <- 
      (regtools::predict.felm(object = model,
                             newdata = data,
                             use.fe = FALSE) + add) * 100
}

ggplot(data = dummy %>% filter(rainy_day),
       aes(x = months_since,
           y = predicted,
           color = season,
           group = season)) +
  geom_line() +
  geom_point() +
  facet_wrap(~group, scales = 'free_y') +
  scale_colour_manual(name = 'Malaria\nseason',
                      values = c('blue', 'red')) +
  theme_maragra() +
  labs(x = 'IRS status',
       y = 'Estimated absenteeism') #+
  # theme(legend.position = 'right')

The below shows estimated importance of herd protection, and heterogeniety across groups.

dummy <- expand.grid(season = expander('season'),
                     months_since = expander('months_since'),
                     rainy_day = expander('rainy_day'),
                     herd = 0:1000,
                     group = expander('group'))
dummy$predicted <- NA


groups <- sort(unique(dummy$group))
for (i in 1:length(groups)){
    message(i)
    this_group <- groups[i]
    indices <- which(dummy$group == this_group)
    model <- protection_models[[this_group]]
    fe <- getfe(model)
    # Add back in the mean effect of fixed effects
    add <- fe %>% group_by(fe) %>% summarise(effect = mean(effect)) %>% summarise(sum(effect)) %>% unlist %>% as.numeric
    data <- dummy[indices,]  
    dummy$predicted[indices] <- 
      (regtools::predict.felm(object = model,
                             newdata = data,
                             use.fe = FALSE) + add) * 100
}

ggplot(data = dummy %>% filter(rainy_day, season == 'high'),
       aes(x = herd,
           y = predicted,
           color = months_since,
           group = months_since)) +
  geom_line() +
  facet_wrap(~group, scales = 'free_y') +
  scale_colour_manual(name = 'IRS status',
                      values = c('darkorange', 'darkgreen')) +
  theme_maragra() +
  labs(x = 'Herd protection score',
       y = 'Estimated absenteeism')

What would happen without IRS?

The below chart shows the modeled average absenteeism rate during malaria season for permanent workers, by group type.

x <- model_data %>%
  dplyr::filter(season == 'high'#,
                # group == groups[1],
                # permanent_or_temporary == 'Permanent'
                )
x <- x [,grepl('predicted', names(x)) | names(x) == 'group']
x <- x %>% gather(key, value, predicted_max_herd_max_irs:predicted)
dict <- data.frame(key = c('predicted', 
                           'predicted_no_herd',
                           'predicted_no_irs',
                           'predicted_no_herd_no_irs',
                           'predicted_max_herd_max_irs',
                           'predicted_max_herd',
                           'predicted_max_irs'),
                   new_key = c('As is',
                               'No herd protection',
                               'No individual protection',
                               'No protection (herd or individual)',
                               'Protection: maximum IRS and herd',
                               'Protection: maximum herd',
                               'Protection: maximum IRS'))

x <- left_join(x,
               dict)
plot_data <- x %>%
  mutate(new_key = gsub(' ', '\n', new_key)) %>%
  group_by(group, 
           new_key) %>%
  summarise(avg = mean(value, na.rm = TRUE),
            med = median(value, na.rm = TRUE),
            upr = quantile(value, 0.75, na.rm = TRUE),
            lwr = quantile(value, 0.25, na.rm = TRUE))
levs <- gsub(' ', '\n', c('No herd protection',
                               'No individual protection',
                               'No protection (herd or individual)',
                               'As is',
                               'Protection: maximum IRS',
                               'Protection: maximum herd',
                               'Protection: maximum IRS and herd'))
plot_data$new_key <- factor(plot_data$new_key,
                       levels = levs)
cols <- colorRampPalette(brewer.pal(n = 9,
                                    name = 'Set1'))(length(unique(plot_data$group)))

library(ggrepel)
ggplot(data = plot_data,
       aes(x = new_key,
           y = avg * 100,
           color = group)) +
  geom_point(size = 4) +
  theme_maragra() +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(x = 'Scenario',
       y = 'Absenteeism rate') +
  geom_line(aes(group = group),
            alpha = 0.5,
            size = 1) +
  scale_color_manual(name = '',
                     values = cols) #+
  # ggrepel::geom_label_repel(aes(label = round(avg * 100, digits = 1)))

Return on investment

Number of working days

x <- ab_panel %>% group_by(date) %>% tally
ggplot(data = x,aes(x = date,y = n)) + geom_line() +
  labs(title = mean(x$n))

Number of working days annually

options(scipen = '999')
x <- ab_panel %>% mutate(year = as.numeric(format(date, '%Y'))) %>% group_by(year) %>% tally
ggplot(data = x,aes(x = year,y = n)) + geom_line() +
  labs(title = mean(x$n))

Number of prevented absences per year

ratio <- length(unique(ab_panel$oracle_number)) / length(unique(model_data$oracle_number))

x <- model_data %>%
  group_by(year = calendar_year) %>%
  summarise(eligibles = n(),
            absences = mean(as.numeric(absent), na.rm = TRUE) * n(),
            counterfactual = mean(predicted_no_herd_no_irs,na.rm =TRUE) * n()) 
y <- x %>%
  summarise(x = mean(counterfactual) - mean(absences)) %>%
  unlist * ratio
# Annual prevented
y
x <- x %>%
  mutate(prevented = counterfactual - absences)
# Number of presences to prevent 1 case
sum(x$eligibles) / sum(x$prevented)

# Clinical costs
(y /
  80) * # number of absences it takes to produce one company clinic positve
  31.49 # cost per clinical case per ezenduka

# Wage costs
y *
  4 # assuming USD

\newpage

Robustness and generalizability

ro_data <- model_data %>%
# ro_data <- ab_panel %>%
#   left_join(irs, by = c('date', 'unidade')) %>%
#   filter(days_since < 0) %>%
  mutate(pre_irs = between(days_since, -10, -1)) 
#   # mutate(time_period = lendable::time_period_extract(date)) %>%
#   # mutate(time_period = as.character(time_period)) %>%
#   mutate(time_period = make_season(date = date)) %>%
#   mutate(absent_sick = ifelse(is.na(absent_sick), FALSE, absent_sick)) %>%
#   left_join(workers %>%
#               dplyr::select(oracle_number,
#                             sex,
#                             department,
#                             permanent_or_temporary),
#             by = 'oracle_number')
# ro_data$time_period <- as.character(ro_data$time_period)


make_model <- function(formula = 'absent ~ pre_irs + time_period'){
  ro_model <- glm(as.formula(formula), 
                data = ro_data,
                family = binomial('logit'))
  coefs <- exp(coef(ro_model))
  ci <- confint(ro_model)
  ci <- exp(ci)
  out <- data.frame(Variable = names(coefs),
                    Estimate = coefs,
                    Lower = ci[,1],
                    Upper = ci[,2])
  row.names(out) <- NULL
  out$`P value` <- coef(summary(ro_model))[,4]
  out$Significant <- ifelse(out$`P value` <= 0.001, 'xxx',
                            ifelse(out$`P value` <= 0.01, 'xx',
                                   ifelse(out$`P value` <= 0.05, 'x',
                                          '')))
  out$Variable <- as.character(out$Variable)
  out$Variable <- gsub('pre_irsTRUE', '10 days prior to IRS', out$Variable)
  out$Variable <- gsub('time_period', '', out$Variable)
  return(kable(out, format = "html") %>%
  kable_styling()# %>%
    # row_spec(row = which(grepl('prior', out$Variable)),background = "yellow")
  )
}

# # BASIC FORMULA
# make_model('absent ~ pre_irs + time_period')

# # BASIC FORMULA WITH INTERACTIONS
# make_model('absent ~ pre_irs*time_period')

# # BASIC FORMULA WITH INTERACTIONS AND ADJUSTING FOR SEX
# make_model('absent ~ pre_irs*time_period + sex')
# 
# # BASIC FORMULA WITH INTERACTIONS AND ADJUSTING FOR SEX AND DEPARTMENT
# make_model('absent ~ pre_irs*time_period + sex + department')
# 
# 
# # BASIC FORMULA WITH INTERACTIONS AND ADJUSTING FOR SEX AND DEPARTMENT AND STATUS
# make_model('absent ~ pre_irs*time_period + sex + department + permanent_or_temporary')

# Adjust for seasonality and department
make_model('absent ~ pre_irs*season + group + rainy_day')

# The below shows our robustness check for sick only absenteeism. Unlike with all cause absenteeism, during malaria season, being in the period 10 days prior to IRS is associated with statistically greater likelihood of being absent for illness. In other words, it appears that there is somewhat of a feedback loop: when a worker misses work due to illness, his/her likelihood of getting IRS doubles in the next 10 days.\todo{What to do about this???}
# make_model('absent_sick ~ pre_irs*time_period + department')

Discussion

\newpage

Appendix

Unadjusted absenteeism by time since IRS

library(maragra)
library(dplyr)
library(ggplot2)
plot_data <-
  ab_panel %>%
  # filter(oracle_number %in% matched$oracle_number) %>%
  left_join(irs, c("date", "unidade")) %>%
  mutate(months_since = days_since %/% 30) %>%
  filter(!is.na(months_since)) %>%
  # mutate(time_period = lendable::time_period_extract(date)) %>%
  # mutate(time_period = as.character(time_period)) %>%
  mutate(time_period = make_season(date = date)) %>%
  mutate(absent_sick = ifelse(is.na(absent_sick), FALSE, absent_sick))

x <- plot_data %>%
  group_by(months_since) %>%
  summarise(absences = length(which(absent)),
            sick_absences = length(which(absent_sick)),
            eligibles = length(absent)) %>%
  ungroup %>%
  mutate(absenteeism_rate = absences / eligibles * 100) %>%
  mutate(sick_absenteeism_rate = sick_absences / eligibles * 100)

g1 <- ggplot(data = x,
       aes(x = months_since,
           y = absenteeism_rate)) +
  geom_step(color = 'darkred', alpha = 0.8) +
  geom_vline(xintercept = 0, lty = 2, alpha = 0.5) +
  labs(x = 'Months relative to IRS',
       y = 'Absenteeism rate',
       title = 'A') +
  theme_maragra()

plot_data2 <-
  ab_panel %>%
  # filter(oracle_number %in% matched$oracle_number) %>%
  left_join(irs, by = c("date", "unidade")) %>%
  mutate(months_since = days_since %/% 30) %>%
  filter(!is.na(months_since)) %>%
  # mutate(time_period = lendable::time_period_extract(date)) %>%
  # mutate(time_period = as.character(time_period)) %>%
  mutate(time_period = make_season(date = date)) %>%
  mutate(absent_sick = ifelse(is.na(absent_sick), FALSE, absent_sick))

x2 <- plot_data2 %>%
  group_by(months_since, time_period) %>%
  summarise(absences = length(which(absent)),
            sick_absences = length(which(absent_sick)),
            eligibles = length(absent)) %>%
  ungroup %>%
  mutate(absenteeism_rate = absences / eligibles * 100) %>%
  mutate(sick_absenteeism_rate = sick_absences / eligibles * 100) %>%
  mutate(before = months_since < 0)

g2 <- ggplot(data = x2 %>% mutate(time_period = paste0(time_period)),
       aes(x = months_since,
           y = absenteeism_rate)) +
  geom_step(color = 'darkred', alpha = 0.8) +
  facet_wrap(~time_period, ncol = 1) +
  geom_vline(xintercept = 0, lty = 2, alpha = 0.5) +
  labs(x = 'Months relative to IRS',
       y = 'Absenteeism rate',
       title = 'B') +
  theme_maragra()
Rmisc::multiplot(g1, g2, cols = 2)

References

joebrew/maragra documentation built on Aug. 11, 2020, 8:39 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

joebrew/maragra
An analysis of malaria at the Maragra sugar company, Mozambique

In joebrew/maragra: An analysis of malaria at the Maragra sugar company, Mozambique

Introduction

Methods

Budget info

Econometric model

Results

Descriptive statistics

Sensitivity analysis

What would happen without IRS?

Return on investment

Robustness and generalizability

Discussion

Appendix

Unadjusted absenteeism by time since IRS

References

R Package Documentation

Browse R Packages

We want your feedback!

joebrew/maragra An analysis of malaria at the Maragra sugar company, Mozambique

In joebrew/maragra: An analysis of malaria at the Maragra sugar company, Mozambique

Introduction

Methods

Budget info

Econometric model

Results

Descriptive statistics

Sensitivity analysis

What would happen without IRS?

Return on investment

Robustness and generalizability

Discussion

Appendix

Unadjusted absenteeism by time since IRS

References

R Package Documentation

Browse R Packages

We want your feedback!

joebrew/maragra
An analysis of malaria at the Maragra sugar company, Mozambique