# Packages
library(tidyverse)
library(knitr)
library(Hmisc)
library(brew)
library(maragra)
library(knitr)
library(RColorBrewer)

## Global options
options(max.print="75")
opts_chunk$set(echo=FALSE,
                 cache=TRUE,
               prompt=FALSE,
               tidy=TRUE,
               comment=NA,
               message=FALSE,
               warning=FALSE)
opts_knit$set(width=75)


http://economicsofmalaria.com joebrew@gmail.com


# Read in data
ab <- maragra::ab
ab_panel <- maragra::ab_panel
bairros <- maragra::bairros
census <- maragra::census
clinic <- maragra::clinic
clinic_agg <- maragra::clinic_agg
irs <- maragra::irs
mc <- maragra::mc
workers <- maragra::workers

This document contains some basic preliminary analysis of data relating to malaria, IRS, and absenteeism at Maragra Açucar, in Maragra, Mozambique. To explore a specific section, click any of the below tabs.

Summary

Description of data

d <- data(package = "maragra")
## assign it to use later
nm <- d$results[, "Item"]

The "Maragra database" consists of the following r length(nm) tables:

## call the promised data
data(list = nm, package = "maragra")
## get the dimensions of each data set
x <- lapply(mget(nm), dim)
results <- list()
for (i in 1:length(x)){
  z <- x[[i]]
  results[[i]] <- data_frame(`Table name` = names(x)[i],
             `Number or rows` = z[1],
             `Number of columns` = z[2])
}
results <- bind_rows(results)

kable(results) %>%
  kableExtra::kable_styling(full_width = FALSE)

These tables all constitute three kinds of data: those pertaining to medical outcomes (generally referred to as "clinic" data), those pertaining to worker information (demographic and absenteeism-related), and those pertaining to malaria control activities.

These datasets are all readily available in the (private) maragra R package (access credentials given to collaborators upon joebrew@gmail.com). All datasets except for ab_panel are as is (ie, in their original raw format), except for 4 kinds of changes:

  1. Removal of unecessary information (columns deemed irrelevant to the study have been removed).
  2. Modification of column names (all data now employ underscores instead of spaces, and are all lowercase).
  3. Standardization of data formats (all dates are now in a YYYY-MM-DD format, all names use similar capitalization, irrelevant tablet-created timestamps, etc. have been removed).
  4. "Feature generation": certain columns were created in order to ease grouped analysis and merging of datasets (for example, year_month, day_number, etc.).

The ab_panel dataset is not raw data; rather, it is an amolgamation of the ab and workers datasets, using absence data from the former with worker elibility dates from the latter, so as to create a "panel" style dataset (ie, one row for each day in which a worker was supposed to work).

The clinic and clinic_agg are similar, but not identical, in form. The former has individual-level data (useful for pairing with absences and demographic census data), whereas the latter is simply the raw counts of cases by nationality over time. Though clinic is much richer and more detailed than clinic_agg, is only covers the period from 2014 through 2016, whereas the latter covers a larger time period (2010-2016).

For a "deep dive" into each kind of data, click the tabs at the top of this page.

In order to reproduce this analysis (and others related to these datasets), follow the instructions on this project's research compendium page.

IRS data

Data structure

Data about "fumigações" (indoor residual spraying or "IRS") dates from r min(mc$date) through r max(mc$date). The dataset consists of the following fields:

cat(paste0(names(mc), collapse = '\n'))

Each row is one fumigation activity. The location/residence key is the unidade column. The casas_cobertas column indicates the number of houses in that unidade that were sprayed, wereas the pulverizados column indicates the number of rooms.

Insecticide type

The insecticide used is either DDT or ACT. The below is a breakdown of their respective use.

ggplot(data = mc,
       aes(x = insecticida)) +
  geom_bar(fill = '#159957',
           alpha = 0.6) +
  theme_maragra() +
  labs(x = 'Insecticide',
       y = 'IRS activities',
       title = 'Maragra IRS activity by insecticide type')

The respective use over time is as follows:

x <- mc %>%
  group_by(date = as.Date(paste0(format(date, '%Y-%m'), '-01')),
           insecticida) %>%
  summarise(houses = sum(casas_cobertas)) %>%
  ungroup
# Get a left side for filling in the 0s
left <- expand.grid(date = seq(min(x$date),
                               max(x$date),
                               by = 'month'),
                    insecticida = sort(unique(x$insecticida)))
# Join together
x <- left_join(x = left,
               y = x,
               by = c('date', 
                      'insecticida'))
# Replace NAs with 0
x <- x %>%
  mutate(houses = ifelse(is.na(houses), 0, houses))
# Define a color vector
cols <- colorRampPalette(brewer.pal(n = 9, 'Spectral'))(length(unique(x$insecticida)))

ggplot(data = x,
       aes(x = date,
           y = houses,
           group = insecticida,
           color = insecticida)) +
  geom_point(alpha = 0.6) +
  geom_line(alpha = 0.6) +
  theme_maragra() +
  labs(x = 'Month',
       y = 'Houses fumigated',
       title = 'Maragra IRS activity by insecticide type over time') +
  scale_color_manual(name = 'Chemical',
                     values = cols) +
  geom_vline(xintercept = as.numeric(as.Date(paste0(2012:2017, '-01-01'))),
             alpha = 0.2)

Below is a table of the same data.

names(x) <- Hmisc::capitalize(names(x))
DT::datatable(x)

IRS seasonality

We can examine the date of fumigations to see if they are seasonal vs. randomly/uniformly distributed throughout the year.

x <- mc %>%
  group_by(year, day_number) %>%
  summarise(houses = sum(casas_cobertas)) %>%
  ungroup %>%
  mutate(fake_date = as.Date('1990-01-01') + day_number) %>%
  mutate(fake_month = as.Date(paste0(format(fake_date, '%Y-%m'), '-01'))) %>%
  group_by(year, fake_month) %>%
  summarise(houses = sum(houses)) %>%
  ungroup %>%
  mutate(year = factor(year))

# Fill the nas with 0s
left <- expand.grid(year = sort(unique(x$year)),
                    fake_month = sort(unique(x$fake_month)))

x <- left_join(x = left,
               y = x)
x$houses[is.na(x$houses)] <- 0
cols <- colorRampPalette(brewer.pal(n = 9, 'Spectral'))(length(unique(x$year)))
ggplot(data = x %>% 
         filter(!year %in% c(2011:2012, 2017)),
       aes(x = fake_month,
           y = houses)) +#,
           # group = year,
           # color = year)) +
  geom_line(alpha = 0.8) +
  # scale_color_manual(name = 'Year',
  #                    values = cols) +
  scale_x_date(labels = scales::date_format("%b")) +
  theme_maragra() +
  labs(x = 'Month',
       y = 'Houses covered',
       title = 'Seasonality of IRS operations') +
  facet_wrap(~year)

Absenteeism data

Data structure

The analysis of absenteeism data will rely on a panel-style dataset in which one row exists for each worker-day (for which the worker is estimated to have supposed to work), with columns indicating the outcome (absent or not, sick or not, etc.). The Maragra CRM does not natively store panel style data, so we construct it from a combination of the workers dataset (from the Human Resources department) and the absenteeism dataset. Certain features pertaining to illness are merged from the clinic dataset.

The ab_panel dataset has the following column names:

cat(paste0(names(ab_panel), collapse = '\n'))

Data magnitude

Overall, we have observed r nrow(ab_panel) eligible worker days (the equivalent of r round(nrow(ab_panel) / 365.25, digits = 0) years of human activity!), from the period of r min(ab_panel$date) through r max(ab_panel$date).

Absenteeism

Our dataset includes r length(which(ab_panel$absent)) absences and r length(which(!ab_panel$absent)) presences, which can be broken down below in percentage terms.

x <- ab_panel %>%
  group_by(absent = ifelse(absent, 'Absent', 'Present')) %>%
  tally %>%
  ungroup %>%
  mutate(p = n / sum(n) * 100) %>%
  mutate(p = round(p, digits = 2))

ggplot(data = x,
       aes(x = absent,
           y = p)) +
    geom_bar(fill = '#159957',
           alpha = 0.6,
           stat = 'identity') +
  theme_maragra() +
  labs(x = 'Status',
       y = 'Percent',
       title = 'Absence/presence breakdown of all workers') +
  geom_label(aes(label = paste0(p, '%')))

Among absences, the breakdown of absence "type" is as follows.

ggplot(data = ab_panel %>% filter(absent) %>%
         mutate(sick_leave = ifelse(leave_type == 'SIC',
                                    'Sick leave',
                                    'Other leave')),
       aes(x = leave_type,
           fill = sick_leave)) +
  geom_bar(alpha = 0.8) +
  labs(x = 'Leave type',
       y = 'Absences',
       title = 'Breakdown of absences by type') +
  theme_maragra() +
  theme(axis.text.x = element_text(angle = 90)) +
  scale_fill_manual(name = '',
                    values = c('black', 'red'))

Absenteeism rate

We calculate an absenteeism rate for any given time period (day, month, etc.) as the number of days not worked divided by the number of days which should have been worked, and multiplied by 100. The below shows the absenteeism rate, by day, for the entire observation period. The size of each dot is a reflection of the number of workers observed at that time.

x <- 
  ab_panel %>%
  group_by(date) %>%
  summarise(absences = length(which(absent)),
            eligibles = length(absent)) %>%
  ungroup %>%
  mutate(absenteeism_rate = absences / eligibles * 100) %>%
  mutate(Workers = eligibles)

ggplot(data = x,
       aes(x = date,
           y = absenteeism_rate)) +
  geom_point(alpha = 0.1,
             color = '#159957',
             aes(size = Workers)) +
  geom_smooth() +
  theme_maragra() +
  labs(x = 'Date',
       y = 'Absenteeism rate',
       title = 'Absenteeism rate over time') 

Sick absenteeism rate

We again calculate the absenteeism rate, but only for those who absences which are classified as "sick leave".

x <- 
  ab_panel %>%
  group_by(date) %>%
  summarise(absences = length(which(absent_sick)),
            eligibles = length(absent)) %>%
  ungroup %>%
  mutate(absenteeism_rate = absences / eligibles * 100) %>%
  mutate(Workers = eligibles)

ggplot(data = x,
       aes(x = date,
           y = absenteeism_rate)) +
  geom_point(alpha = 0.1,
             color = '#159957',
             aes(size = Workers)) +
  geom_smooth() +
  theme_maragra() +
  labs(x = 'Date',
       y = 'Absenteeism rate',
       title = 'Sick absenteeism rate over time') 

Monthly absenteeism

Daily absenteeism is problematic in that it introduces a great deal of noise. So, we examine monthly absenteeism rates for both all and sick absences. The below chart shows these metrics, as well as the percentage of monthly absences due to sickness, and the number of worker-days observed. A local regression smoothed line is overlaid to see overall trends.

x <- 
  ab_panel %>%
  group_by(date = as.Date(paste0(format(date, '%Y-%m'), '-01'))) %>%
  summarise(absences = length(which(absent)),
            sick_absences = length(which(absent_sick)),
            eligibles = length(absent)) %>%
  ungroup %>%
  mutate(absenteeism_rate = absences / eligibles * 100) %>%
  mutate(sick_absenteeism_rate = sick_absences / eligibles * 100) %>%
  mutate(Workers = eligibles) %>%
  mutate(sick_absence_p = sick_absenteeism_rate / absenteeism_rate * 100)

# Gather
x <- x %>%
  gather(key, value, absenteeism_rate:sick_absence_p) %>%
  mutate(key = ifelse(key == 'absenteeism_rate', 'Absenteeism rate',
                      ifelse(key == 'sick_absenteeism_rate', 'Sick absenteeism rate',
                             ifelse(key == 'sick_absence_p', '% of absences due to sickness',
                                    key))))

ggplot(data = x,
       aes(x = date,
           y = value)) +
  geom_point(alpha = 0.7,
             color = '#159957') +
  geom_line(alpha = 0.4,
            color = '#159957') +
  geom_smooth() +
  theme_maragra() +
  labs(x = 'Date',
       y = 'Absenteeism rate',
       title = 'Sick absenteeism rate over time') +
  facet_wrap(~key, scales = 'free_y')

Absenteeism by worker type

The below chart is a paneling of (a) type of absenteeism metric (columns) and (b) type of worker (rows).

x <- 
  ab_panel %>%
  left_join(workers %>% dplyr::select(oracle_number, department_name)) %>%
  mutate(department = ifelse(department_name %in% c('ADMIN AND FINANCE',
                                                    'CIVILS SERVICES',
                                                    'HUMAN RESOURCES SERVICES',
                                                    'RISK CONTROL'),
                             'Administrative',
                             ifelse(department_name == 'CANE GROWING', 'Field',
                                    ifelse(department_name == 'SUGAR MILLING', 'Factory', 'Other')))) %>%
  group_by(department, 
           date = as.Date(paste0(format(date, '%Y-%m'), '-01'))) %>%
  summarise(absences = length(which(absent)),
            sick_absences = length(which(absent_sick)),
            eligibles = length(absent)) %>%
  ungroup %>%
  mutate(absenteeism_rate = absences / eligibles * 100) %>%
  mutate(sick_absenteeism_rate = sick_absences / eligibles * 100) %>%
  mutate(Workers = eligibles) %>%
  mutate(sick_absence_p = sick_absenteeism_rate / absenteeism_rate * 100)

# Gather
x <- x %>%
  gather(key, value, absenteeism_rate:sick_absence_p) %>%
  mutate(key = ifelse(key == 'absenteeism_rate', 'Absenteeism rate',
                      ifelse(key == 'sick_absenteeism_rate', 'Sick absenteeism rate',
                             ifelse(key == 'sick_absence_p', '% of absences due to sickness',
                                    key)))) 

# Keep just relevant metrics
x <- x %>%
  filter(key != 'Workers')

ggplot(data = x,
       aes(x = date,
           y = value)) +
  geom_point(alpha = 0.7,
             color = '#159957') +
  geom_line(alpha = 0.4,
            color = '#159957') +
  geom_smooth() +
  theme_maragra() +
  labs(x = 'Date',
       y = 'Absenteeism rate',
       title = 'Sick absenteeism rate over time') +
  facet_grid(department~key, scales = 'free')

Clinic data

Data structure

There are two clinical datasets: clinic and clinic_agg. The former is detailed at the individual level, but the latter covers a slightly wider timespan.

The clinic dataset has the following fields:

cat(paste0(names(clinic), collapse = '\n'))

The clinic_agg dataset has the following fields:

cat(paste0(names(clinic_agg), collapse = '\n'))

Cases over time - from clinic_agg

All workers

If we combine all workers, we can examine the total incidence of malaria since 2011.

x <- 
  clinic_agg %>%
  group_by(date) %>%
  summarise(p = sum(positive),
            pp = sum(positive) / sum(tested) * 100,
            tested = sum(tested))

# Gather into long format
y <- 
  rbind(x %>% dplyr::select(date, p) %>%
          rename(val = p) %>%
          mutate(key = 'positive'),
        x %>% dplyr::select(date, tested) %>%
          rename(val = tested) %>%
          mutate(key = 'tested')) %>%
  mutate(key = factor(key, levels = c('tested', 'positive')))

ggplot(data = x,
       aes(x = date,
           y = p)) +
  geom_bar(stat = 'identity') +
  xlab('Date') +
  ylab('Number of positive cases') +
    theme_maragra() +
  ggtitle('Malaria miscroscopy positive results',
          'All workers')

By nationality

The below charts show the total number of tests and postive cases, by month, for Mozambican and foreign workers, respectively, at Ilovo-Maragra.

ggplot(data = clinic_agg %>% filter(group == 'nacionais'),
            aes(x = date,
                y = tested)) +
  geom_line() +
  geom_line(aes(y = positive)) +
  theme_maragra() +
  labs(x = 'Date',
       y = 'People (tested and positive)',
       title = 'Mozambicans')
ggplot(data = clinic_agg %>% filter(group != 'nacionais'),
            aes(x = date,
                y = tested)) +
  geom_line() +
  geom_line(aes(y = positive)) +
  theme_maragra() +
  labs(x = 'Date',
       y = 'People (tested and positive)',
       title = 'Expatriates')

Cases and tests from clinic_agg

Absolute numbers

The below chart shows both the number of positive cases (all workers, in red) and the number of tests, by month.

ggplot(data = y,
       aes(x = date,
           y = val,
           group = key,
           fill = key)) +
  geom_bar(stat = 'identity', position = 'stack', alpha = 0.5) +
  xlab('Date') +
  ylab('Number of positive cases') +
  theme_maragra() +
  scale_fill_manual(name = '',
                    values = c('black', 'darkred')) +
  ggtitle('Malaria miscroscopy tests and positive results',
          'All workers, absolute') +
    theme(title = element_text(size = 12))

Relative numbers

The below chart shows the same data as above, but convers the number of positive cases to a percentage of all tests, rather than an absolute number.

ggplot(data = x,
       aes(x = date,
           y = pp)) +
  geom_bar(stat = 'identity', fill = 'darkblue', alpha = 0.5) +
  xlab('Date') +
  ylab('Number of positive cases') +
  theme_maragra() +
  ggtitle('Malaria miscroscopy tests and positive results',
          'All workers, relative') +
    theme(title = element_text(size = 12))

Seasonality data from clinic_agg

We can examine the annual seasonality of positive cases by overlaying all years' data onto one axis.

make_month_name <- function(number){
  format(as.Date(paste0('2015-', number, '-01')), '%B')
}
x <- clinic_agg %>%
  # mutate(day_number = as.numeric(format(date, '%j'))) %>%
  # mutate(day_number = round(day_number, -1) + 5)
  mutate(month_number = as.numeric(format(date, '%m'))) %>%
  group_by(month_number, year) %>%
  summarise(p = sum(positive),
            pp = sum(positive) / sum(tested) * 100) %>%
  mutate(year = factor(year,
                       levels = as.character(2011:2016))) %>%
  ungroup %>%
  mutate(month_number = factor(make_month_name(month_number),
                               levels = make_month_name(1:12)))

cols <- colorRampPalette(brewer.pal(n = 9,  'Spectral'))(length(unique(x$year)))

ggplot(data = x,
       aes(x = month_number,
           y = p,
           group = year,
           color = year)) +
  geom_line() +
  scale_color_manual(name = 'Year',
                     values = cols) +
  theme_maragra() +
  ggtitle('Malaria miscroscopy positive results',
          'Absolute cases, year overlapping') +
    theme(title = element_text(size = 12)) +
  xlab('Month') +
  ylab('Positive results') +
  theme(axis.text.x = element_text(angle = 90))

The below chart uses the same data as above, but instead of positive cases, it shows the percent of tests which were positive.

ggplot(data = x,
       aes(x = month_number,
           y = p,
           group = year,
           color = year)) +
  geom_line() +
  scale_color_manual(name = 'Year',
                     values = cols) +
  theme_maragra() +
  ggtitle('Malaria miscroscopy positive results',
          'Percent positive, year overlapping') +
    theme(title = element_text(size = 12)) +
  xlab('Month') +
  ylab('Positive results') +
  theme(axis.text.x = element_text(angle = 90))

If we aggregate and view distributions (via "violin" charts) at the level of the month, seasonality is more apparent.

ggplot(data = x,
       aes(x = month_number,
           y = p)) +
  geom_violin(alpha = 0.6, fill = 'darkorange') +
    theme_maragra() +
  ggtitle('Malaria miscroscopy positive results',
          'Percent positive, year overlapping') +
    theme(title = element_text(size = 12)) +
  xlab('Month') +
  ylab('Positive results') +
  theme(axis.text.x = element_text(angle = 90))

Malaria severity distribution from clinic

The below chart shows the severity of all clinical malaria cases in the Maragra clinic.

ggplot(data = clinic,
       aes(x = severity)) +
  geom_bar(alpha = 0.6,
           fill = 'darkorange') +
  theme_maragra() +
  labs(x = 'Severity',
       y = 'Cases',
       title = 'Severity of all malaria cases: Maragra clinic')

The below chart shows severity, but over time.

x <- 
  clinic %>%
  group_by(severity,
           date = as.Date(paste0(year_month, '-01'))) %>%
  tally %>%
  ungroup %>%
  group_by(date) %>%
  mutate(p = n / sum(n) * 100) %>%
  ungroup %>%
  mutate(severity = factor(severity,
                           levels = as.character(rev(sort(unique(severity))))))
cols <- (colorRampPalette(brewer.pal(9, 'Spectral'))(length(unique(x$severity))))

ggplot(data = x,
       aes(x = date,
           y = n)) +
  geom_line(aes(color = severity)) +
  scale_color_manual(name = 'Severity',
                     values = cols) +
  theme_maragra() +
  labs(x = 'Date',
       y = 'Cases',
       title = 'Severity over time')
ggplot(data = x,
       aes(x = date,
           y = p,
           fill = severity)) +
  geom_bar(stat = 'identity',
           position = 'stack',
           alpha = 0.6) +
  scale_fill_manual(name = 'Severity',
                    values = cols) +
  theme_maragra() +
  labs(x = 'Date',
       y = 'Percentage of all cases',
       title = 'Distribution of severity of cases over time')

Analysis

Effectiveness analysis

Identification strategy

For our purposes we are analyzing the effect of one intervention (IRS) on 2 outcomes (absence and illness) with many confounders (age, worker type, seasonality, etc.). Our analysis can be visualized formulaically as follows:

$$ \begin{equation} \operatorname{Pr}(\text{Outcome} = 1 \mid \text{X}) = \beta_{0} + \beta_{1} \text{Location} + \beta_{2} \text{Season} + (\beta_3{IRS}*\beta_4{IRS_t} + ... ) \end{equation} $$

Our outcome is probabilistic and binomial (ie, one is either absent or present / infected or not infected). Our demographic confounders (represented by $...$) will be a function of iterative model selection. Our intervention (IRS) is not a simple yes/no, but rather the product of whether the residence of the worker in question was treated in the last year, and, if so, the time since treatment (represented above as the interaction term, where where $_t$ represents time elapsed since commencement of the most recent IRS campaign).

Propensity score matching

Since the full accounting of confounders would greatly reduce the degrees of freedom of our analysis, we employ propensity score matching to generate a matched sample of workers who are alike in characteristics and time, but not treatment. We do this by first estimating the likelihood of having ever received the intervention, given a worker's age, sex, department and temporary vs. permanent status. We justify the necessity of this matching by noting that the differences between those workers who received IRS and those who did not (see table 1) are striking and in most cases statistically significant.

right_side <- irs %>%
  group_by(unidade) %>% 
  tally %>%
  ungroup %>%
  mutate(received = ifelse(n > 0, TRUE, FALSE))

left_side <-
  workers %>%
  dplyr::select(oracle_number, unidade, permanent_or_temporary, department, sex, date_of_birth)

psm <- left_join(x = left_side,
                 y = right_side,
                 by = 'unidade') %>%
  mutate(received = ifelse(is.na(received), FALSE, received)) %>%
  mutate(age = round((as.numeric(as.Date('2016-01-01') - date_of_birth)) / 365.25, digits = 2)) %>%
  filter(!is.na(sex),
         !is.na(age)) %>%
  dplyr::select(-unidade, -date_of_birth, -n)

psmt1 <- psm
names(psmt1) <- toupper(names(psmt1))
psmt1 <- psmt1 %>%
  mutate(RECEIVED = ifelse(RECEIVED == TRUE,'IRS', 'No IRS')) %>%
  mutate(STATUS = PERMANENT_OR_TEMPORARY)

pacman::p_load(tableone)
table1 <- CreateTableOne(vars = toupper(c('STATUS', 'department', 'age',
                                  'sex', 'received')), 
                         data = psmt1, 
                         factorVars = toupper(c('STATUS', 'department',
                                  'sex')), 
                         strata = 'RECEIVED')
table1 <- print(table1, 
                printToggle = FALSE, 
                noSpaces = TRUE)
kable(table1[,1:3],  
      align = 'c', 
      caption = 'Table 1: Comparison of unmatched samples') %>%
  kableExtra::kable_styling(full_width = FALSE)

Having now demonstrated that are treatment and control groups are qualitatively different (and therefore require either statistical adjustment or a priori matching), we proceed to carry out the matching, using those best practices suggested by Ho et al 2004 "for improving parametric statistical models by preprocessing data with nonparametric matching methods" (Daniel Ho, Kosuke Imai, Gary King, and Elizabeth Stuart (2007). Matching as Nonparametric Preprocessing for Reducing Model Dependence in Parametric Causal Inference. Political Analysis 15(3): 199-236. http://gking.harvard.edu/files/abs/matchp-abs.shtml). We emply the nearest neighbor method for identifying those workers from our control group who most resemble those workers in the treatment group.

Our match is a 1-to-1 cut, meaning those control workers who do not resemble those in the treatment group are left out of primary analysis. The below table shows the match results.

library(MatchIt)
set.seed(1234)
match.it <- matchit(received ~ age + sex + permanent_or_temporary + department, data = psm, method="nearest", ratio=1)
a <- summary(match.it)
kable(a$nn, digits = 2, align = 'c', 
      caption = 'Table 2: Sample sizes') %>%
  kableExtra::kable_styling(full_width = FALSE)

The following output shows, that the distributions of our numeric variables are now extremely similar.

kable(a$sum.matched[c(1,2,3,4)], digits = 2, align = 'c', 
      caption = 'Table 3: Summary of balance for matched data') %>%
  kableExtra::kable_styling(full_width = FALSE)

The propensity scores can be visualized below.

plot(match.it, type = 'jitter', interactive = FALSE, col = adjustcolor('darkblue', alpha.f  =0.3))
# Save matched samples
matched <- match.data(match.it)[1:ncol(psm)]

Modeling

Having now created a matched sample of r nrow(matched) workers, of which 50% received IRS and 50% did not, we can confidently carry out our analysis on this sample. Since the propensity score matching effectively cancels out demographic differences, our model only need take into account those differences which are not at the person-level. In our case, these include seasonality (defined here by quarter) (later, will add other factors).

For the purposes of this first pass, we "bin" IRS exposure into 5 groups: before IRS (includes IRS > 365 days ago), 180+ days ago, 90-80 days, 60-90 days, and in the last 60 days.

Having estimated our binomial logistic regression model, we examine the odds ratios for absence as a function of our predictive variables.

# Create model data
model_data <-
  ab_panel %>%
  filter(oracle_number %in% matched$oracle_number) %>%
  left_join(irs) %>%
  left_join(matched) %>%
  mutate(days_since = ifelse(days_since > 180, '180+',
                                    ifelse(days_since > 90, '090-180',
                                           ifelse(days_since > 60, '060-090',
                                                  ifelse(days_since >= 0, '000-060',
                                                         ifelse(days_since < 0, 'Before', NA)))))) %>%
  filter(!is.na(days_since)) %>%
  mutate(days_since = factor(days_since, levels = c('Before', '000-060', '060-090', '090-180', '180+'))) %>%
  mutate(quarter = lendable::quarter_extract(date)) %>%
  mutate(quarter = as.character(quarter)) %>%
  mutate(absent_sick = ifelse(is.na(absent_sick), FALSE, absent_sick))
# Need to add seasonality
# Need to adjust for houses on and off site
fit <- glm(absent ~ days_since + quarter,
           family = binomial('logit'),
          data = model_data)
x <- exp(coef(fit))
ors <- exp(confint(fit))
ors <- data.frame(ors)
names(ors) <- c('Lower', 'Upper')

ors <- cbind(x, ors)
ors$Variable <- row.names(ors)
ors <- ors %>%
  dplyr::rename(OR = x) %>%
  dplyr::select(Variable, OR, Lower, Upper)
# Combine into a df
kable(ors) %>%
  kableExtra::kable_styling(full_width = FALSE)

We run the same model, but instead of estimating absences, we estimate only the likelihood of sick absences. The results (in form of odds ratios) are below.

# Need to add seasonality
# Need to adjust for houses on and off site
fit <- glm(absent_sick ~ days_since + quarter,
           family = binomial('logit'),
          data = model_data)
x <- exp(coef(fit))
ors <- exp(confint(fit))
ors <- data.frame(ors)
names(ors) <- c('Lower', 'Upper')

ors <- cbind(x, ors)
ors$Variable <- row.names(ors)
ors <- ors %>%
  dplyr::rename(OR = x) %>%
  dplyr::select(Variable, OR, Lower, Upper)
# Combine into a df
kable(ors) %>%
  kableExtra::kable_styling(full_width = FALSE)

Discontinuity analysis (not using propoensity score matching)

# Create model data for all obs (not just propensity score)
model_data <-
  ab_panel %>%
  # filter(oracle_number %in% matched$oracle_number) %>%
  left_join(irs) %>%
  # left_join(matched) %>%
  mutate(days_since = ifelse(days_since > 180, '180+',
                                    ifelse(days_since > 90, '090-180',
                                           ifelse(days_since > 60, '060-090',
                                                  ifelse(days_since >= 0, '000-060',
                                                         ifelse(days_since < 0, 'Before', NA)))))) %>%
  filter(!is.na(days_since)) %>%
  mutate(days_since = factor(days_since, levels = c('Before', '000-060', '060-090', '090-180', '180+'))) %>%
  mutate(quarter = lendable::quarter_extract(date)) %>%
  mutate(quarter = as.character(quarter)) %>%
  mutate(absent_sick = ifelse(is.na(absent_sick), FALSE, absent_sick)) #%>%
  # # remove the period 30 days after spraying
  # filter(days_since >= 30 | days_since <= 0)
# Need to add seasonality
# Need to adjust for houses on and off site

Raw absenteeism as a function of days since IRS

x <- model_data %>%
  group_by(days_since) %>%
  summarise(absences = length(which(absent)),
            eligibles = length(absent)) %>%
  ungroup %>%
  mutate(absenteeism_rate = absences / eligibles * 100)

ggplot(data = x,
       aes(x = days_since,
           y = absenteeism_rate)) +
  geom_bar(stat = 'identity',
           alpha = 0.6,
           fill = 'darkblue') +
  theme_maragra() +
  labs(x = 'Days since IRS',
       y = 'Absenteeism rate') +
  geom_label(aes(label = paste0(round(absenteeism_rate, digits = 2), '%')))

Raw absenteeism as a function of days since IRS by quarter

x <- model_data %>%
  group_by(days_since, quarter) %>%
  summarise(absences = length(which(absent)),
            eligibles = length(absent)) %>%
  ungroup %>%
  mutate(absenteeism_rate = absences / eligibles * 100)

ggplot(data = x %>% mutate(quarter = paste0('Q', quarter)),
       aes(x = days_since,
           y = absenteeism_rate)) +
  geom_bar(stat = 'identity',
           alpha = 0.6,
           fill = 'darkblue') +
  theme_maragra() +
  labs(x = 'Days since IRS',
       y = 'Absenteeism rate') +
  # geom_label(aes(label = paste0(round(absenteeism_rate, digits = 2), '%'))) +
  facet_wrap(~quarter) +
  theme(axis.text.x = element_text(angle = 90))

Modeling without propensity score

(Absence)

# Need to adjust for houses on and off site
fit <- glm(absent ~ days_since + quarter,
           family = binomial('logit'),
          data = model_data)
x <- exp(coef(fit))
ors <- exp(confint(fit))
ors <- data.frame(ors)
names(ors) <- c('Lower', 'Upper')

ors <- cbind(x, ors)
ors$Variable <- row.names(ors)
ors <- ors %>%
  dplyr::rename(OR = x) %>%
  dplyr::select(Variable, OR, Lower, Upper)
# Combine into a df
kable(ors) %>%
  kableExtra::kable_styling(full_width = FALSE)

(Sick absence)

# Need to add seasonality
# Need to adjust for houses on and off site
fit <- glm(absent_sick ~ days_since + quarter,
           family = binomial('logit'),
          data = model_data)
x <- exp(coef(fit))
ors <- exp(confint(fit))
ors <- data.frame(ors)
names(ors) <- c('Lower', 'Upper')

ors <- cbind(x, ors)
ors$Variable <- row.names(ors)
ors <- ors %>%
  dplyr::rename(OR = x) %>%
  dplyr::select(Variable, OR, Lower, Upper)
# Combine into a df
kable(ors) %>%
  kableExtra::kable_styling(full_width = FALSE)

Visualization of absenteeism before / after IRS

plot_data <-
  ab_panel %>%
  # filter(oracle_number %in% matched$oracle_number) %>%
  left_join(irs) %>%
  mutate(months_since = days_since %/% 30) %>%
  filter(!is.na(months_since)) %>%
  mutate(quarter = lendable::quarter_extract(date)) %>%
  mutate(quarter = as.character(quarter)) %>%
  mutate(absent_sick = ifelse(is.na(absent_sick), FALSE, absent_sick))

x <- plot_data %>%
  group_by(months_since) %>%
  summarise(absences = length(which(absent)),
            sick_absences = length(which(absent_sick)),
            eligibles = length(absent)) %>%
  ungroup %>%
  mutate(absenteeism_rate = absences / eligibles * 100) %>%
  mutate(sick_absenteeism_rate = sick_absences / eligibles * 100)

ggplot(data = x,
       aes(x = months_since,
           y = absenteeism_rate)) +
  geom_step(color = 'darkred', alpha = 0.8) +
  geom_vline(xintercept = 0, lty = 2, alpha = 0.5) +
  labs(x = 'Months relative to IRS',
       y = 'Absenteeism rate',
       title = 'Before/after IRS',
       subtitle = 'All absences') +
  theme_maragra()


ggplot(data = x,
       aes(x = months_since,
           y = sick_absenteeism_rate)) +
  geom_step(color = 'darkred', alpha = 0.8) +
  geom_vline(xintercept = 0, lty = 2, alpha = 0.5) +
  labs(x = 'Months relative to IRS',
       y = 'Absenteeism rate',
       title = 'Before/after IRS',
       subtitle = 'Sick absences only') +
  theme_maragra()
plot_data <-
  ab_panel %>%
  # filter(oracle_number %in% matched$oracle_number) %>%
  left_join(irs) %>%
  mutate(months_since = days_since %/% 30) %>%
  filter(!is.na(months_since)) %>%
  mutate(quarter = lendable::quarter_extract(date)) %>%
  mutate(quarter = as.character(quarter)) %>%
  mutate(absent_sick = ifelse(is.na(absent_sick), FALSE, absent_sick))

x <- plot_data %>%
  group_by(months_since, quarter) %>%
  summarise(absences = length(which(absent)),
            sick_absences = length(which(absent_sick)),
            eligibles = length(absent)) %>%
  ungroup %>%
  mutate(absenteeism_rate = absences / eligibles * 100) %>%
  mutate(sick_absenteeism_rate = sick_absences / eligibles * 100)

ggplot(data = x %>% mutate(quarter = paste0('Q', quarter)),
       aes(x = months_since,
           y = absenteeism_rate)) +
  geom_step(color = 'darkred', alpha = 0.8) +
  facet_wrap(~quarter) +
  geom_vline(xintercept = 0, lty = 2, alpha = 0.5) +
  labs(x = 'Months relative to IRS',
       y = 'Absenteeism rate') +
  theme_maragra()


joebrew/maragra documentation built on Aug. 11, 2020, 8:39 p.m.