In R4EPI/sitrep: Report templates and helper functions for applied epidemiology

Introduction to this template

This is a template which can be used to create an automated outbreak situation report for acute jaundice syndrome (AJS).

It is organised by time, place and person. For a more detailed explanation of this template, please visit https://r4epis.netlify.com/outbreaks
Feedback and suggestions are welcome at the GitHub issues page
Text within <! > will not show in your final document. These comments are used to explain the template. You can delete them if you want.

Installing and loading required packages

<!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ /// setup \\

Several packages are required for different aspects of analysis with R. You will need to install these before starting.

These packages can be quite large and may take a while to download in the field. If you have access to a USB key with these packages, it makes sense to copy and paste the packages into your computer's R package library (run the command .libPaths() to see the folder path).

For help installing packages, please visit https://r4epis.netlify.com/welcome ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->

## hide all code chunks in the output, but show errors
knitr::opts_chunk$set(echo = FALSE,       # hide all code chunks in output
                      error = TRUE,       # show errors if they appear, but don't stop
                      fig.width = 6*1.25, # Figure width
                      fig.height = 6      # Figure height
                     )



## set default NA to - in output, define figure width/height
options(knitr.kable.NA = "-")

# Ensures the package "pacman" is installed
if (!require("pacman")) {
     install.packages("pacman") }

# Install and load required packages for this template
pacman::p_load(
  knitr,           # create output docs
  here,            # find your files
  rio,             # read in data
  forcats,         # handle ordinal variables
  lubridate,       # handle dates
  dplyr,           # clean/shape data
  tidyr,           # clean/shape data
  stringr,         # clean text
  ggplot2,         # create plots and charts
  slider,          # moving averages
  parsedate,       # guessing dates
  patchwork,       # combine multiple plots
  sitrep,          # MSF field epi functions
  janitor,         # clean data
  matchmaker,      # matching data
  gtsummary,       # tables
  flextable,
  tsibble,         # epiweeks
  sf,              # encode spatial vector data
  ggspatial,       # plot maps
  purrr            # iterate over data
)

## replace the date below with the latest date 
## for which you want data to appear in the report
reporting_date <- ymd("2018-04-23")

## This command sets the reporting week (e.g 2018 W17) based on the reporting_date
## Mondays are the week start by default. Adjust the "1" to another number if desired.
## E.g. 7 for Sundays, 5 for Fridays
reporting_week <- tsibble::yearweek(reporting_date, 1)

## Set default options for plots and charts

## set default text size to 16 for plots
## give classic black/white axes for plots
ggplot2::theme_set(theme_classic(base_size = 18))

## sets the theme in ggplot for epicurves
epicurve_theme <- theme(
  axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1), 
  legend.title = element_blank(),
  legend.position = "bottom",
  panel.grid.major.x = element_line(color = "grey60", linetype = 3),
  panel.grid.major.y = element_line(color = "grey60", linetype = 3))

## sets the labels in ggplot for the epicurves
epicurve_labels <- labs(x = "Calendar week", 
                        y = "Cases (n)", 
                        title = "Cases by week of onset",
                        subtitle = str_glue("Source: MSF data from {reporting_week}")
                       )

linelist_raw <- gen_data("AJS")

## fake lab data
# lab_results <- linelist_cleaned %>% 
#                 select(case_number) %>% 
#                 mutate(test_result = sample(c("Positive", "Negative"),
#                                             nrow(linelist_cleaned), 
#                                             replace = TRUE)
#                        )

### Read in data ---------------------------------------------------------------

## Excel file ------------------------------------------------------------------
## For a specific sheet, use "which"
# linelist_raw <- rio::import(here::here("Data", "linelist.xlsx"), which = "Sheet1")



## Excel file with password ----------------------------------------------------
## Use this section if your Excel has a password.

# install.packages(c("excel.link", "askpass"))
# library(excel.link)

# linelist_raw <- xl.read.file(here::here("Data", "linelist.xlsx"),
#                              xl.sheet = "Sheet1",
#                              password = askpass::askpass(prompt = "please enter file password"))

## MSF AJS Dictionary ----------------------------------------------------------
## get MSF standard dictionary for AJS 
linelist_dict <- msf_dict("AJS", compact = FALSE)

## look at the standard dictionary by uncommenting the line below
# View(linelist_dict) 



## Clean column names ----------------------------------------------------------
## make a copy of your orginal dataset and name it linelist_cleaned
linelist_cleaned <- linelist_raw

## define clean variable names using clean_names from the janitor package. 
linelist_cleaned <- janitor::clean_names(linelist_cleaned)



## Standardising values --------------------------------------------------------
## This step fixes the values so that you can read them.
## values like 1/0 will be recoded as "Yes" / "No" based on the dictionary
linelist_cleaned <- matchmaker::match_df(linelist_cleaned,
  dict  = linelist_dict,
  from  = "option_code",
  to    = "option_name",
  by    = "data_element_shortname",
  order = "option_order_in_set"
)

## Excel file ------------------------------------------------------------------
## to read in a specific sheet use "which"
# linelist_raw <- rio::import(here::here("Data", "linelist.xlsx"), which = "Sheet1")



## Excel file -- Specific range ------------------------------------------------
## you can specify a range in an excel sheet.
# linelist_raw <- rio::import(here::here("Data", "linelist.xlsx"), range = "B2:J102")



## Excel file with password ----------------------------------------------------
## use this section if your Excel has a password.

# install.packages(c("excel.link", "askpass"))
# library(excel.link)

# linelist_raw <- xl.read.file(here::here("Data", "linelist.xlsx"),
#                              xl.sheet = "Sheet1",
#                              password = askpass::askpass(prompt = "please enter file password"))


## CSV file --------------------------------------------------------------------
# linelist_raw <- rio::import(here::here("Data", "linelist.csv"))



## Stata data file -------------------------------------------------------------
# linelist_raw <- rio::import(here::here("Data", "linelist.dat"))

## MSF AJS Dictionary ----------------------------------------------------------
## get MSF standard dictionary for AJS 
# linelist_dict <- msf_dict("AJS", compact = FALSE) %>%
#   select(option_code, option_name, everything())

## look at the standard dictionary by uncommenting the line below
# View(linelist_dict) 

## You will need to recode your variables to match the data dictionary. This is
## addressed below.



## Clean column names ----------------------------------------------------------
## This step fixes the column names so they are easy to use in R.

## make a copy of your orginal dataset and name it linelist_cleaned
# linelist_cleaned <- linelist_raw

## define clean variable names using clean_names from the janitor package. 
# linelist_cleaned <- janitor::clean_names(linelist_cleaned)



## Match column names ---------------------------------------------------------
## This step helps you match your variables to the standard variables.
## This step will require some patience. Courage!

## Use the function msf_dict_rename_helper() to create a template based on the
## AJS dictionary. This will copy a rename command like the one above to your
## clipboard.

# msf_dict_rename_helper("AJS")

## Paste the result below and your column names to the matching variable.
## Be careful! You still need to be aware of what each variable means and what
## values it takes.
## If there are any variables that are in the MSF dictionary that are not in
## your data set, then you should comment them out, but be aware that some
## analyses may not run because of this. 


## PASTE HERE


## Here is an EXAMPLE for changing a few specific names. function. In this
## example, we have the columns "gender" and "age" that we want to rename as
## "sex" and "age_years". 
## The formula for this is rename(data, NEW_NAME = OLD_NAME).

# linelist_cleaned <- rename(linelist_cleaned, 
#                            sex       = gender, # TEXT
#                            age_years = age     # INTEGER_POSITIVE
# )

## Read data -------------------------------------------------------------------
## This step reads in your population data from Excel.
## You may need to rename your columns.

# population_data <- rio::import(here::here("Data", "population.xlsx"), which = "Sheet1")

## repeat preparation steps as appropriate



## Enter counts directly -------------------------------------------------------

# population_data_age <- gen_population(
#   groups = c("0-2", "3-14", "15-29", "30-44", "45+"),
#   counts = c(3600, 18110, 13600, 8080, 6600),
#   strata = NULL) %>%               # do not stratify by gender
#   rename(age_group = groups,       # rename columns (syntax is NEW NAME = OLD NAME)
#     population = n)

# population_data_region <- gen_population(
#   groups = c("Village A", "Village B", "Village C", "Village D"),
#   counts = c(10000, 20000, 15000, 5000),
#   strata = NULL) %>%               # do not stratify
#   rename(patient_origin = groups,  # rename columns (syntax is NEW NAME = OLD NAME)
#          population = n)


## Create counts from proportions ----------------------------------------------
## This step helps you estimate sub-group size with proportions.
## You need to replace the total_pop and proportions. You can change the groups
## to fit your needs.


# estimate population size by age group in years
population_data_age <- gen_population(total_pop = 5000,
  groups = c("0-1", "2-14", "15-29", "30-44", "45+"),
  proportions = c(0.068, 0.3622, 0.276, 0.1616, 0.1321),
  strata = NULL) %>%
  rename(age_group = groups,
    population = n)


# estimate population size by age group in months (under 2 years) 
population_data_age_months <- gen_population(total_pop = 5000,
  groups = c("0-5", "6-8", "9-11","12-23"),
  proportions = c(0.0164, 0.088, 0.088, 0.034),
  strata = NULL) %>%
  rename(age_group_mon = groups,
    population = n)


## estimate population size by age categories in months and years 
# population_data_age_categories <- gen_population(total_pop = 5000,
#   groups = c("0-5 months", "6-8 months", "9-11 months","12-23 months",
#              "2-14 years", "15-29 years", "30-44 years", "45+ years"),
#   proportions = c(0.0164, 0.088, 0.088, 0.034, 
#                   0.3622, 0.276, 0.1616, 0.1321),
#   strata = NULL) %>%
#   rename(age_category = groups,
#     population = n)


## estimate population size by region proportion
population_data_region <- gen_population(total_pop = 5000,         # set the total population
  groups = c("Village A", "Village B", "Village C", "Village D"),  # set the groups
  proportions = c(0.221, 0.174, 0.355, 0.245),                     # set the proportions for each group
  strata = NULL) %>%               # do not stratify
  rename(patient_origin = groups,  # rename columns (syntax is NEW NAME = OLD NAME)
         population = n)

## Read data -------------------------------------------------------------------
## This step reads in your lab data from Excel.
## You may need to rename your columns.

## to read in a specific sheet use "which"
# lab_data <- rio::import(here::here("Data", "labresults.xlsx"), which = "Sheet1")

## repeat preparation steps as appropriate
## make sure your unique ID variable name matches!

## Merge lab to linelist -------------------------------------------------------
# linelist_cleaned <- left_join(linelist_cleaned, lab_results, 
#                               by = "case_number")

## view the first ten rows of data
head(linelist_cleaned, n = 10)

## view your whole dataset interactively (in an excel style format)
View(linelist_cleaned)

## overview of variable types and contents
str(linelist_cleaned)

## get summary: 
## mean, median and max values of numeric variables
## counts for categorical variables
## also gives number of NAs
summary(linelist_cleaned)

## view unique values contained in variables 
## you can run this for any column -- just replace the column name
unique(linelist_cleaned$patient_origin) 

## use the dfSummary function in combination with view
## note that view is not capitalised with this package
# summarytools::dfSummary(linelist_cleaned) %>%
#   summarytools::view()

## Drop unused rows  -----------------------------------------------------------
## This step removes blank rows that don't have both a date of admission and ID
## number.
linelist_cleaned <- linelist_cleaned %>% 
  filter(!is.na(case_number) & !is.na(date_of_consultation_admission)) 


## Drop columns ----------------------------------------------------------------
## OPTIONAL: This step shows you how you can remove certain variables.

# linelist_cleaned <- linelist_cleaned %>%
#   select(-c(msf_involvement, treatment_facility_site))


## OPTIONAL: if you want to inspect certain variables, you can select these by
## name or column number. This example creates a reduced dataset for the first
## three columns, age_years, and sex.

# linelist_reduced <- select(linelist_cleaned, c(1:3, "age_years", "sex"))

## DHIS2 standard data ---------------------------------------------------------
## If you got your data from DHIS2, use this portion of the code.
## If not, comment this section out and use the below.

## make sure all date variables are formatted as dates 

DATEVARS <- linelist_dict %>% 
  filter(data_element_valuetype == "DATE") %>%
  filter(data_element_shortname %in% names(linelist_cleaned)) %>% 
  ## filter to match the column names of your data
  pull(data_element_shortname) 

## find if there are date variables which are completely empty
## (otherwise the linelist package does not work)
EMPTY_DATEVARS <- purrr::map(DATEVARS, ~all(
  is.na(linelist_cleaned[[.x]])
  )) %>% 
  unlist() %>% 
  which()

## remove exclude the names of variables which are completely emptys
DATEVARS <- DATEVARS[-EMPTY_DATEVARS]

## change to dates 
linelist_cleaned <- linelist_cleaned %>%
    mutate(
      across(.cols = all_of(DATEVARS),
           .fns = ~ymd(parsedate::parse_date(.x))))



## Non-DHIS2 data --------------------------------------------------------------
## Use this section if you did not have DHIS2 data. 

## use the parse_dates() function to make a first pass at date variables.
# linelist_cleaned <- linelist_cleaned %>%
#   mutate(
#     across(.cols = matches("date|Date"),
#            .fns  = ~ymd(parsedate::parse_date(.x))))

## once you have run parse_date(), take a look at your date variables.
## here is an example:
# table(linelist_cleaned$date_of_consultation_admission)


## Some dates will be unrealistic or wrong.
## Here is an example of how to manually fix dates. 
## Look at your data and edit as needed.
# linelist_cleaned <- linelist_cleaned %>% 
#   mutate(date_of_onset = case_when(
#     date_of_onset < as.Date("2017-11-01")  ~ as.Date(NA),
#     date_of_onset == as.Date("2081-01-01") ~ as.Date("2018-01-01"),
#     TRUE                                   ~ date_of_onset))



## Create epiweek variable -----------------------------------------------------
## This step creates an epiweek variable from the date of onset.
## You can use date_of_consultation_admission if you are missing many date_of_onset.

linelist_cleaned <- linelist_cleaned %>% 
  mutate(
    ## create an epiweek variable 
    epiweek = tsibble::yearweek(
      date_of_onset,
      week_start = 1),  # 1 is Monday start; use 7 for Sundays or 5 for Fridays
    ## create a date version of epiweek 
    epiweek_date = as.Date(epiweek))

## Age group variables ----------------------------------------------------------
## This step shows you how to create categorical variables from numeric variables.
## We have some intermediate steps on the way.


## OPTIONAL: add under 2 years to the age_years variable
## data dictionary defines that under 2s dont have year filled in (but months/days instead)
linelist_cleaned <- linelist_cleaned %>% 
  mutate(age_months = case_when(
    is.na(age_years) & is.na(age_months) ~ as.numeric(age_days / 30), 
    TRUE                                 ~ as.numeric(age_months)
  ),
    age_years = case_when(
    is.na(age_years) & is.na(age_months) ~ as.numeric(age_days / 365.25),
    is.na(age_years)                     ~ as.numeric(age_months / 12),
    TRUE                                 ~ as.numeric(age_years)
  ))


## OPTIONAL: change those who are above or below a certain age to NA
# linelist_cleaned <- linelist_cleaned %>%
#   mutate(age_years = case_when(
#     is.na(age_years) ~ NA_integer_,
#     age_years < 0   ~ NA_integer_,
#     age_years > 120 ~ NA_integer_,
#     TRUE            ~ as.integer(age_years) #Preserves other values
#     ))

## OPTIONAL: create an age_months variable from decimal years variable
# linelist_cleaned <- linelist_cleaned %>% 
#   mutate(age_months = case_when(
#             age_years < 5 ~ age_years * 12))


## create age group variable for under 2 years based on months
linelist_cleaned <- linelist_cleaned %>% 
  mutate(age_group_mon = age_categories(
    age_months, 
    breakers = c(0, 6, 9, 12, 23), 
    ceiling = TRUE))

## create an age group variable by specifying categorical breaks
linelist_cleaned <- linelist_cleaned %>% 
  mutate(age_group = age_categories(
    age_years,
    breakers = c(0, 2, 15, 30, 45)))

## alternatively, create an age group variable by specifying a sequence
## this creates age groups for every ten years between 0 and 100.
# linelist_cleaned <- linelist_cleaned %>% 
#   mutate(age_group = age_categories(
#     age, 
#     lower = 0,
#     upper = 100,
#     by = 10)))

## If you already have an age group variable defined, you should manually
## arrange the categories in order.
# linelist_cleaned <- linelist_cleaned %>% 
#   mutate(age_group = fct_relevel(
#     age_group,
#     "0-4y", "5-14y", "15-29y", "30-44y", "45+y")


## to combine different age categories use the following function 
## this prioritises the smaller unit, i.e. if given months and years, will return months first
## generally, these delineations are NOT used for AJS.
##
# linelist_cleaned <- group_age_categories(linelist_cleaned, 
#                                          years = age_group, 
#                                          months = age_group_mon)

## drop the 0-2 years age level! (need to fix group_age_categories)
# linelist_cleaned <- linelist_cleaned %>% 
#   mutate(age_category = factor(age_category, 
#                                 levels = c("0-5 months",
#                                            "6-8 months",
#                                            "9-11 months",
#                                            "12-23 months",
#                                            "2-14 years",
#                                            "15-29 years",
#                                            "30-44 years", 
#                                            "45+ years")))

## Numeric variables -----------------------------------------------------------
## This step creates a variable for number of days under observation.
## You can adapt this step to create other calculated variables

## create number of days under observation
linelist_cleaned <- linelist_cleaned %>% 
  mutate(obs_days = as.numeric(date_of_exit - date_of_consultation_admission))


## Factor (categorical) variables ----------------------------------------------

## This step creates a variable from another character/factor variable
## You can adapt this step to create other calculated variables

## The variable DIED will be binary (TRUE/FALSE) -- if patient died or not
## if your variable is named ("Dead on arrival", "Dead in facility"), use this
linelist_cleaned <- linelist_cleaned %>% 
  mutate(DIED = str_detect(exit_status, "Dead"))

## use this version for coded values (e.g. "DOA", "DD")
# linelist_cleaned <- linelist_cleaned %>% 
#   mutate(DIED = exit_status %in% c("DD", "DOA"))



## Create a new grouping for exit status variable 
linelist_cleaned <- linelist_cleaned %>% 
  mutate(exit_status2 = case_when(
    exit_status %in% c("Dead on arrival", "Dead in facility") ~ "Died",
    exit_status %in% c("Transferred (to an MSF facility)", 
                       "Transferred (to External Facility)")  ~ "Transferred",
    exit_status == "Discharged home"                          ~ "Discharged",
    exit_status == "Left against medical advice"              ~ "Left"
  ))



## Recode character variables -------------------------------------------------
## This step shows how to fix misspellings in the geographic region variable.
## Ideally, you want these values to match your shapefile and population data!

# linelist_cleaned <- linelist_cleaned %>%
#   mutate(patient_origin = case_when(
#     patient_origin == "Valliages D"       ~ "Village D",
#     patient_origin == "VillageD"          ~ "Village D",
#     patient_origin == "Town C"            ~ "Village C",
#     TRUE ~ as.character(patient_origin))
#   ))



## sometimes, coding is inconsistent across variables -- for example, "Yes" / "No"
## may be coded as Y, y, yes, 1 / N, n, no, 0. You can change them all at once!
## Create a list of the variables you want to change, and run the following.
## You may need to edit this code if options are different in your data.

## create list of variables
# change_yn <- c("headache", "fever", "vomiting")

## standardize options
# linelist_cleaned <- linelist_cleaned %>%  
#   mutate(
#     across(
#       .cols = all_of(change_yn),
#       .fns = ~forcats::fct_recode(.x,
#                                   Yes = "y",
#                                   Yes = "Y",
#                                   Yes = "yes",
#                                   Yes = "1",
#                                   No  = "n",
#                                   No  = "N",
#                                   No  = "no",
#                                   No  = "0")))



## Create a variable based on rules from other simple character variables
## If you have access to lab results, you can create a case definition variable 
## the tilde (~) is used to assign the new values (Conf, prob, susp, unknown)
## starting from the specific to the general
## TRUE assigns all remaining rows 
## You MUST modify this section to match your case definition. The below
## uses positive RDT for Confirmed and epi link only for Probable.

linelist_cleaned <- linelist_cleaned %>%
  mutate(case_def = case_when(
    is.na(hep_e_rdt) & is.na(other_cases_in_hh)           ~ NA_character_,
    hep_e_rdt == "Positive"                               ~ "Confirmed",
    hep_e_rdt != "Positive" & other_cases_in_hh == "Yes"  ~ "Probable",
    TRUE                                                  ~ "Suspected"
  ))

linelist_cleaned <- linelist_cleaned %>% 
  mutate(case = case_def == "Confirmed")

# Force missing values to NA
# important for sex to generate age pyramids
linelist_cleaned <- linelist_cleaned %>% 
  mutate(sex = fct_recode(sex, NULL = "Unknown/unspecified"))


# change the order of levels in a single categorical variable 
# This orders the levels -- since in all figures / tables, 0-4 should come
# before 4-24, etc
linelist_cleaned <- linelist_cleaned %>% 
  mutate(time_to_death = factor(time_to_death, 
                                levels = c("0-4 hours", 
                                           ">4-24 hours", 
                                           ">24-48 hours", 
                                           ">48 hours")))



# Change the order of levels of multiple categorical variables at the same time
linelist_cleaned <- linelist_cleaned %>%
  mutate(
    across(
      # Looks for variables beginning with "test"
      .cols = starts_with("test"),  
      # Sets order of levels
      .fns = ~fct_relevel(.x, 
                          levels = c("Positive", "Negative", "Not done")))) %>% 
  ## stop a meaningless tidyverse warning from showing
  suppressWarnings()

# create a grouping of all symptoms 
SYMPTOMS <- c("history_of_fever", 
              "fever",
              "nausea_anorexia",
              "vomiting",
              "epigastric_pain_heartburn",
              "generalized_itch",
              "headache",
              "joint_pains",
              "diarrhoea",
              "bleeding", 
              "convulsions",
              "mental_state",
              "other_symptoms"  
              )



# create a grouping of all lab tests 
LABS <- c("hep_b_rdt", 
          "hep_c_rdt",
          "hep_e_rdt",
          "test_hepatitis_a",
          "test_hepatitis_b",
          "test_hepatitis_c",
          "test_hepatitis_e_igg",
          "test_hepatitis_e_igm" ,
          "test_hepatitis_e_genotype",
          "test_hepatitis_e_virus",
          "malaria_rdt_at_admission",
          "malaria_blood_film", 
          "dengue",
          "dengue_rdt", 
          "yellow_fever",
          "typhoid",
          "chikungunya_onyongnyong", 
          "ebola_marburg",
          "lassa_fever",
          "other_arthropod_transmitted_virus", 
          "other_pathogen"
          )

# return the last day of the last reporting week
obs_end   <- as.Date((reporting_week + 1)) - 1

# filter out cases after end of reporting week
linelist_cleaned <- linelist_cleaned %>% 
  filter(epiweek <= reporting_week)

# define the first week of outbreak (date of first case)
first_week <- min(linelist_cleaned$epiweek)

# outbreak start 
# return the first day in the week of first case 
obs_start <- as.Date(first_week)

## pull the unique weeks which occur
all_weeks <- seq(first_week,
                 reporting_week, by = 1) 

## change weeks to dates
all_weeks_date <-  as.Date(all_weeks)

## option 1: only keep the first occurence of duplicate case 

linelist_cleaned <- linelist_cleaned %>% 
  ## find duplicates based on case number, sex and age group 
  ## only keep the first occurence 
  distinct(case_number, sex, age_group, .keep_all = TRUE)


# ## option 2: create flagging variables for duplicates (then use to browse)
# 
# linelist_cleaned <- linelist_cleaned %>% 
#   ## choose which variables to use for finding unique rows 
#   group_by(case_number, sex, age_group) %>% 
#   mutate(
#     ## get the number of times duplicate occurs 
#     num_dupes = n(), 
#     duped = if_else(num_dupes > 1 , TRUE, FALSE)
#   )
# 
# ## browse duplicates based on flagging variables 
# linelist_cleaned %>% 
#   ## only keep rows that are duplicated
#   filter(duped) %>% 
#   ## arrange by variables of interest 
#   arrange(case_number, sex, age_group) %>% 
#   View()
# 
# ## filter duplicates to only keep the row with the earlier entry 
# linelist_cleaned %>% 
#   ## choose which variables to use for finding unique rows 
#   group_by(case_number, sex, age_group) %>% 
#   ## sort to have the earliest date by person first
#   arrange(date_of_consultation_admission) %>% 
#   ## only keep the earliest row 
#   slice(1)

# rio::export(linelist_cleaned, here::here("Data", str_glue("linelist_cleaned_{Sys.Date()}.xlsx")))

Person

From the start of the outbreak up until r reporting_week there were a total of r nrow(linelist_cleaned) cases. There were r fmt_count(linelist_cleaned, sex == "Female") females and r fmt_count(linelist_cleaned, sex == "Male") males.

The most affected age group was r tabyl(linelist_cleaned, age_group) %>% slice(which.max(n)) %>% pull(age_group ) years.

Demographics

Cases by age group and definition <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ /// describe_by_age_group_and_def \\

This chunk will create a table of cases by age group and case definition. You can only use this chunk if you have lab results and have defined a case definition variable. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->

linelist_cleaned %>% 
  ## only keep variables of interest
  select(age_group, case_def) %>% 
  ## make NAs show up as "Missing" (so they are included in counts)
  mutate(case_def = forcats::fct_explicit_na(case_def, "Missing")) %>%
  ## create a table with counts/percentages by column
  tbl_summary(by = case_def,
              percent = "column",
              label = list(age_group = "Age group")) %>% 
  ## add totals
  add_overall() %>% 
  ## make variable names bold 
  bold_labels() %>% 
  # change to flextable format
  as_flex_table() %>%
  # make header text bold (using {flextable})
  bold(part = "header") %>% 
  # make your table fit to the maximum width of the word document
  set_table_properties(layout = "autofit")

Cases by age group and sex

linelist_cleaned %>% 
  select(age_group, sex) %>% 
  mutate(sex = forcats::fct_explicit_na(sex, "Missing")) %>%
  tbl_summary(by = sex,
              percent = "cell",
              label = list(age_group = "Age group")) %>% 
  add_overall() %>% 
  ## make variable names bold 
  bold_labels() %>% 
  # change to flextable format
  as_flex_table() %>%
  # make header text bold (using {flextable})
  bold(part = "header") %>% 
  # make your table fit to the maximum width of the word document
  set_table_properties(layout = "autofit")

Age pyramid

There were r fmt_count(linelist_cleaned, is.na(sex)| sex == "Unknown/unspecified") cases missing information on sex and r fmt_count(linelist_cleaned, is.na(age_group)) missing age group.

 age_pyramid(linelist_cleaned,
                  age_group = "age_group", 
                  split_by = "sex") + 
  labs(y = "Cases (n)", x = "Age group") + # change axis  labels
  theme(legend.position = "bottom",     # move legend to bottom
        legend.title = element_blank(), # remove title
        text = element_text(size = 18)  # change text size
       )

# ## plot age pyramid by month groups, for observations under 2 years
# linelist_cleaned %>% 
#   filter(age_years < 2) %>%
#   age_pyramid(age_group = "age_group_mon",
#                    split_by = "sex") +
#                   # stack_by = "case_def") +
#   labs(y = "Cases (n)", x = "Age group (months)") + # change axis  labels (nb. x/y flip)
#   theme(legend.position = "bottom",     # move legend to bottom
#         legend.title = element_blank(), # remove title
#         text = element_text(size = 18)  # change text size
#        )

Of the patients, r fmt_count(linelist_cleaned, patient_facility_type == "Outpatient") were seen as outpatients and r fmt_count(linelist_cleaned, patient_facility_type == "Inpatient") were inpatients. Among inpatients, the median number of days admitted was r median(linelist_cleaned$obs_days, na.rm = T), with a range between r min(linelist_cleaned$obs_days, na.rm = T) and r max(linelist_cleaned$obs_days, na.rm = T) days.

Cases by symptoms <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ /// describe_by_symptoms \\

This chunk gives the counts and proportions for all of the variables in SYMPTOMS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->

linelist_cleaned %>% 
  select(all_of(SYMPTOMS)) %>% 
  tbl_summary() %>% 
  # fix the way symptoms are displayed
  # str_replace_all switches underscore for space in the variable column
  # str_to_sentence makes the first letter capital, and all others lowercase
  modify_table_body(
    ~.x %>% 
      mutate(label = str_to_sentence(str_replace_all(label, "_", " ")))
  ) %>% 
  # change to flextable format
  as_flex_table() %>%
  # make header text bold (using {flextable})
  bold(part = "header") %>% 
  # make your table fit to the maximum width of the word document
  set_table_properties(layout = "autofit")

Cases by lab results <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ /// describe_by_labs \\

This chunk gives the counts and proportions for all of the variables in LABS. If you do not have lab results, comment out this chunk. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->

linelist_cleaned %>% 
  select(all_of(LABS)) %>% 
  ## create long dataset for counting
  pivot_longer(cols = everything()) %>% 
  # fix the way lab test names are displayed
  # str_replace_all switches underscore for space in the variable column
  # str_to_sentence makes the first letter capital, and all others lowercase
  mutate(name = str_to_sentence(str_replace_all(name, "_", " "))) %>% 
  ## show percentages by row (i.e. for each test)
  tbl_summary(percent = "row",
              by = value,
              label = list(name = "Test")) %>% 
  add_overall() %>% 
  ## make variable names bold 
  bold_labels() %>% 
  # change to flextable format
  as_flex_table() %>%
  # make header text bold (using {flextable})
  bold(part = "header") %>% 
  # make your table fit to the maximum width of the word document
  set_table_properties(layout = "autofit")

Case fatality ratio

Of r fmt_count(linelist_cleaned, patient_facility_type == "Inpatient") inpatients, there have been r fmt_count(linelist_cleaned, str_detect(exit_status, "Dead"), patient_facility_type == "Inpatient") deaths, of which r fmt_count(linelist_cleaned, exit_status == "Dead on arrival") were dead on arrival.

Among inpatients who died, the time to death is shown below. <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ /// describe_time_to_death \\

If you have the time_to_death available, you can use this chunk to describe when inpatients died. If not, comment it out. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->

linelist_cleaned %>% 
  filter(DIED, patient_facility_type == "Inpatient") %>% 
  select(time_to_death) %>% 
  tbl_summary(label = list(time_to_death = "Time to Death")) %>% 
  ## make variable names bold 
  bold_labels() %>% 
  # change to flextable format
  as_flex_table() %>%
  # make header text bold (using {flextable})
  bold(part = "header") %>% 
  # make your table fit to the maximum width of the word document
  set_table_properties(layout = "autofit")

The case fatality ratio among inpatients with known outcomes is below. <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ /// overall_cfr \\

This chunk gives the case fatality ratio among inpatients with outcomes. If you have no deaths reported, the table will not be useful. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->

# use arguments from above to produce overal CFR
linelist_cleaned %>%
  filter(patient_facility_type == "Inpatient") %>%
  select(DIED) %>%
  gtsummary::tbl_summary(
    statistic = everything() ~ "{N}",
    label = DIED ~ "Inpatients"
    ) %>%
  modify_header(stat_0 = "Cases (N)") %>% 
  # Use wrapper function to calculate cfr
  epitabulate::add_cfr(deaths_var = "DIED") %>% 
  ## make variable names bold 
  bold_labels() %>% 
  # change to flextable format
  as_flex_table() %>%
  # make header text bold (using {flextable})
  bold(part = "header") %>% 
  # make your table fit to the maximum width of the word document
  set_table_properties(layout = "autofit")

The case fatality ratio by sex among inpatients with known outcomes is below. <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ /// cfr_by_sex \\

This chunk gives the case fatality ratio among inpatients with outcomes, divided by sex. If you have no deaths reported, the table will not be useful. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->

linelist_cleaned %>%
  filter(patient_facility_type == "Inpatient") %>%
  select(DIED, sex) %>%
  mutate(sex = forcats::fct_explicit_na(sex, "Missing")) %>%
  gtsummary::tbl_summary(
    include = sex,
    statistic = sex ~ "{n}",
    label = sex ~ "Gender") %>%
  modify_header(stat_0 = "Cases (N)") %>% 
  # Use wrapper function to calculate cfr
  add_cfr(deaths_var = "DIED") %>%
  ## make variable names bold 
  bold_labels() %>% 
  # change to flextable format
  as_flex_table() %>%
  # make header text bold (using {flextable})
  bold(part = "header") %>% 
  # make your table fit to the maximum width of the word document
  set_table_properties(layout = "autofit")

CFR by age group among inpatients with known outcomes <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ /// cfr_by_age_group \\

This chunk gives the case fatality ratio among inpatients with outcomes, divided by age group. If you have no deaths reported, the table will not be useful. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->

linelist_cleaned %>% 
  filter(patient_facility_type == "Inpatient") %>%
  select(DIED, age_group) %>%
  gtsummary::tbl_summary(
    include = age_group,
    statistic = age_group ~ "{n}",
    missing = "no",
    label = age_group ~ "Age group"
    ) %>%
  modify_header(stat_0 = "Cases (N)") %>%
  # Use wrapper function to calculate cfr
  add_cfr(deaths_var = "DIED") %>%
  ## make variable names bold 
  bold_labels() %>% 
  # change to flextable format
  as_flex_table() %>%
  # make header text bold (using {flextable})
  bold(part = "header") %>% 
  # make your table fit to the maximum width of the word document
  set_table_properties(layout = "autofit")

linelist_cleaned %>%
  filter(patient_facility_type == "Inpatient") %>%
  select(DIED, case_def) %>%
  gtsummary::tbl_summary(
    include = case_def,
    statistic = case_def ~ "{n}",
    missing = "no",
    label = case_def ~ "Case definition"
    ) %>%
  modify_header(stat_0 = "Cases (N)") %>%
  # Use wrapper function to calculate cfr
  add_cfr(deaths_var = "DIED") %>%
  ## make variable names bold 
  bold_labels() %>% 
  # change to flextable format
  as_flex_table() %>%
  # make header text bold (using {flextable})
  bold(part = "header") %>% 
  # make your table fit to the maximum width of the word document
  set_table_properties(layout = "autofit")

Attack rate

The attack rate per 10,000 population is below (based on available population data available for the catchment area/region of interest).

# define population 
population <- sum(population_data_age$population)

Below gives the attack rate per 10,000 population (N = r format(population, big.mark = ","))

ar <- attack_rate(nrow(linelist_cleaned), population, multiplier = 10000)

ar %>%
  merge_ci_df(e = 3, digits = 1) %>% # merge the lower and upper CI into one column
  rename("Cases (n)" = cases, 
         "Population" = population, 
         "AR (per 10,000)" = ar, 
         "95%CI" = ci) %>% 
  # produce styled output table with auto-adjusted column widths with {flextable}
  qflextable() %>% 
  # make header text bold (using {flextable})
  bold(part = "header") %>% 
  # make your table fit to the maximum width of the word document
  set_table_properties(layout = "autofit") %>% 
  ## set to only show 1 decimal place 
  colformat_double(digits = 1)

Here, we can see that the attack rate for a population of r format(population, big.mark = ",") was r fmt_ci_df(ar, percent = FALSE).

To give attack rate by age group, with appropriate population denominators, use the following code.

linelist_cleaned %>%
  ## create a dummy variable so all rows are counted
  mutate(case_total = TRUE) %>% 
  select(case_total, age_group) %>%
  gtsummary::tbl_summary(
    ## only include age group counts
    include = age_group, 
    label = list(age_group ~ "Age Group")
    ) %>%
  epitabulate::add_ar(
    ## specify which variable counts cases (dummy)
    case_var = "case_total",
    ## get the population counts as a vector
    population = pull(population_data_age, population),
    drop_tblsummary_stat = TRUE) %>% 
  ## make variable names bold 
  bold_labels() %>% 
  # change to flextable format
  as_flex_table() %>%
  # make header text bold (using {flextable})
  bold(part = "header") %>% 
  # make your table fit to the maximum width of the word document
  set_table_properties(layout = "autofit")

Mortality attributable to AJS

<!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ This section can only be used if you are in a closed population (eg refugee camp). The assumptions don't hold in an open/community setting.

This chunk calculates the attack rate by age group and then gives a table of attack rate by group.

This section gives mortality rates attributable to AJS in a closed population. It does not calculate all-cause mortality. It assumes that all AJS deaths are among inpatients.

This demonstrates three ways of calculating mortality rate based on catchment population (twice) and based on hospital population. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->

# count number of deaths 
deaths <- sum(linelist_cleaned$DIED)

# outbreak duration in days 
obs_time <- as.numeric(obs_end - obs_start)

# patient observation time 
pat_obs_time <- linelist_cleaned %>% 
  filter(!is.na(exit_status)) %>% 
  summarise(days = sum(obs_days)) %>%
  pull(days)

Mortality rate attributable to AJS per 10,000 population

mortality_rate(deaths, population, multiplier = 10000, mergeCI = TRUE) %>%
  rename("Deaths" = deaths, 
         "Population" = population, 
         "Mortality (per 10,000)" = `mortality per 10 000`, 
         "95%CI" = ci) %>% 
  # produce styled output table with auto-adjusted column widths with {flextable}
  qflextable() %>% 
  # make header text bold (using {flextable})
  bold(part = "header") %>% 
  # make your table fit to the maximum width of the word document
  set_table_properties(layout = "autofit") %>% 
  ## set to only show 1 decimal place 
  colformat_double(digits = 1)

Crude mortality rate attributable to AJS per 10,000 population per day

mortality_rate(deaths, population*obs_time, multiplier = 10000, mergeCI = TRUE) %>%
  rename("Deaths" = deaths, 
         "Person-days" = population, 
         "Mortality (per 10,000/day)" = `mortality per 10 000`, 
         "95%CI" = ci) %>% 
  # produce styled output table with auto-adjusted column widths with {flextable}
  qflextable() %>% 
  # make header text bold (using {flextable})
  bold(part = "header") %>% 
  # make your table fit to the maximum width of the word document
  set_table_properties(layout = "autofit") %>% 
  ## set to only show 1 decimal place 
  colformat_double(digits = 1)

Mortality rate attributable to AJS per 10,000 patients per day

mortality_rate(deaths, pat_obs_time, multiplier = 10000, mergeCI = TRUE) %>%
  rename("Deaths" = deaths, 
         "Person-days" = population, 
         "Mortality (per 10,000/day)" = `mortality per 10 000`, 
         "95%CI" = ci) %>% 
  # produce styled output table with auto-adjusted column widths with {flextable}
  qflextable() %>% 
  # make header text bold (using {flextable})
  bold(part = "header") %>% 
  # make your table fit to the maximum width of the word document
  set_table_properties(layout = "autofit") %>% 
  ## set to only show 1 decimal place 
  colformat_double(digits = 1)

Time

<!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ This section focuses on the when of the outbreak: - When did cases fall ill? - Are numbers increasing or decreasing?

There is code to include an epi curve. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->

There were r fmt_count(linelist_cleaned, is.na(date_of_onset)) cases missing dates of onset.

## make plot
basic_curve <- linelist_cleaned %>%
  filter(!is.na(epiweek_date)) %>%

  ## plot those not missing using the date variable 
  ggplot(aes(x = epiweek_date)) +
  ## plot as a bar
  geom_bar(width = 7,
           fill = "red",
           colour = "black") +
  ## align plot with bottom of plot
  scale_y_continuous(expand = c(0, NA)) +
  ## control x-axis breaks and display
  scale_x_date(
    breaks = all_weeks_date,
    labels = all_weeks) + 
  ## use pre-defined labels
  epicurve_labels +
  ## use pre-defined theme (design) settings
  epicurve_theme

# show your plot (stored for later use) 
basic_curve

## if the outbreak has been going on for a while, your x-axis might look messy.
## to reduce the number of labels, uncomment the below.
## you can customize the number of breaks by changing n_breaks
## you can also use months for your break (here in three-month intervals)

# basic_curve + scale_x_date(breaks = "3 months", date_labels = "%b %Y")

The peak of the outbreak was in r tabyl(linelist_cleaned, epiweek) %>% slice(which.max(n)) %>% pull(epiweek)

## define the width of the moving average window here
moving_window_days <- 14

## calculates the moving average using counts by epiweek
moving_avg <- linelist_cleaned %>% 
  ## count cases by epiweek
  count(epiweek_date) %>% 
  ## create column with sliding window for day and window prior
  mutate(indexed_window = slider::slide_index_dbl(
    n,                       # calculate on new_cases
    .i = epiweek_date,       # indexed with epiweek_date 
    .f = ~mean(.x, na.rm = TRUE),     # function is mean() with missing values removed
    .before = days(moving_window_days - 1)))

## create plot
moving_plot <-  basic_curve + 
  geom_line(data = moving_avg,
       mapping = aes(x = epiweek_date, y = indexed_window)) +

  # adjust title and subtitle for this particular plot
  labs(title = str_glue("Cases by date of onset"),
       subtitle = str_glue("{moving_window_days}-day moving average
                           Source: MSF data from {reporting_week}"))

# show your plot (stored for later use) 
moving_plot

## calculates the moving sum using counts by epiweek
moving_sum <- linelist_cleaned %>% 
  ## count cases by date of onset
  count(epiweek_date) %>% 
  mutate(biweekly = slide_sum(x = n, 
            after = 1, 
            step = 2)) %>% 
  filter(!is.na(biweekly))

ggplot(moving_sum, aes(x = epiweek_date, y = biweekly)) +
  ## plot as a bar
  geom_bar(
    ## use the count we calculated above
    stat = "identity", 
    ## make each bar two weeks wide
    width = 14,
    fill = "red",
    colour = "black") +
  ## align plot with bottom of plot
  scale_y_continuous(expand = c(0, NA)) +
  ## control x-axis breaks and display
  scale_x_date(
    breaks = all_weeks_date,
    labels = all_weeks) + 
  ## use pre-defined labels
  epicurve_labels +
  ## use pre-defined theme (design) settings
  epicurve_theme

# get counts by gender
linelist_cleaned %>%
  filter(!is.na(epiweek_date)) %>%
  ## plot those not missing using the date variable 
  ggplot(aes(x = epiweek_date, fill = fct_rev(fct_explicit_na(sex)))) +
  ## plot as a bar
  geom_bar(width = 7,
           colour = "black") +
  # align plot with bottom of plot
  scale_y_continuous(expand = c(0, NA)) +
  # control x-axis breaks and display
  scale_x_date(
    breaks = all_weeks_date,
    labels = all_weeks) + 
  # use pre-defined labels
  epicurve_labels +
  # use pre-defined theme (design) settings
  epicurve_theme

linelist_cleaned %>%
  filter(!is.na(epiweek_date)) %>%

  ## plot those not missing using the date variable 
  ggplot(aes(x = epiweek_date, fill = fct_rev(fct_explicit_na(case_def)))) +
  ## plot as a bar
  geom_bar(width = 7,
           colour = "black") +
  # align plot with bottom of plot
  scale_y_continuous(expand = c(0, NA)) +
  # control x-axis breaks and display
  scale_x_date(
    breaks = all_weeks_date,
    labels = all_weeks) +
  # use pre-defined labels
  epicurve_labels +
  # use pre-defined theme (design) settings
  epicurve_theme

linelist_cleaned %>%
  filter(!is.na(epiweek_date)) %>%

  ## plot those not missing using the date variable 
  ggplot(aes(x = epiweek_date, fill = fct_rev(fct_explicit_na(water_source)))) +
  ## plot as a bar
  geom_bar(width = 7,
           colour = "black") +
  # align plot with bottom of plot
  scale_y_continuous(expand = c(0, NA)) +
  # control x-axis breaks and display
  scale_x_date(
    breaks = all_weeks_date,
    labels = all_weeks) +
  # use pre-defined labels
  epicurve_labels +
  # use pre-defined theme (design) settings
  epicurve_theme

Cases by week of onset among inpatients by sex

linelist_cleaned %>%
  filter(!is.na(epiweek_date)) %>%
  filter(patient_facility_type == "Inpatient") %>% 

  ## plot those not missing using the date variable 
  ggplot(aes(x = epiweek_date, fill = fct_rev(fct_explicit_na(sex)))) +
  ## plot as a bar
  geom_bar(width = 7,
           colour = "black") +
  ## align plot with bottom of plot
  scale_y_continuous(expand = c(0, NA)) +
  ## control x-axis breaks and display
  scale_x_date(
    breaks = all_weeks_date,
    labels = all_weeks) +
  ## use pre-defined labels
  epicurve_labels +
  ## use pre-defined theme (design) settings
  epicurve_theme

Attack rate per 10,000 population by week

# counts and cumulative counts by week
cases <- linelist_cleaned %>%
  arrange(date_of_onset) %>%        # arrange by date of onset
  count(epiweek, .drop = FALSE) %>% # count all epiweeks and include zero counts
  mutate(cumulative = cumsum(n))    # add a cumulative sum


# attack rate for each week
ar <- attack_rate(cases$n, population, multiplier = 10000) %>% 
  bind_cols(select(cases, epiweek), .) # add the epiweek column to table


ar %>%
  merge_ci_df(e = 4) %>% # merge the lower and upper CI into one column
  rename("Epiweek" = epiweek, 
         "Cases (n)" = cases, 
         "Population" = population, 
         "AR (per 10,000)" = ar, 
         "95%CI" = ci) %>% 
  # produce styled output table with auto-adjusted column widths with {flextable}
  qflextable() %>% 
  # make header text bold (using {flextable})
  bold(part = "header") %>% 
  # make your table fit to the maximum width of the word document
  set_table_properties(layout = "autofit") %>% 
  ## set to only show 1 decimal place 
  colformat_double(j = -1, digits = 1)

Cumulative attack rate per 10,000 population per week

attack_rate(cases$cumulative, population, multiplier = 10000) %>% 
  bind_cols(select(cases, epiweek), .) %>% # add the epiweek column to table
  merge_ci_df(e = 4) %>% # merge the lower and upper CI into one column
  rename("Epiweek" = epiweek, 
         "Cases (n)" = cases, 
         "Population" = population, 
         "AR (per 10,000)" = ar, 
         "95%CI" = ci) %>% 
  # produce styled output table with auto-adjusted column widths with {flextable}
  qflextable() %>% 
  # make header text bold (using {flextable})
  bold(part = "header") %>% 
  # make your table fit to the maximum width of the word document
  set_table_properties(layout = "autofit") %>% 
  ## set to only show 1 decimal place 
  colformat_double(j = -1, digits = 1)

Case fatality ratio as a proportion among inpatients by week

# calculate CFR among inpatients by week
cfr <- linelist_cleaned %>%
  filter(patient_facility_type == "Inpatient") %>%
  mutate(epiweek = as.character(epiweek)) %>% 
  case_fatality_rate_df(str_detect(exit_status, "Dead"), group = epiweek)


cfr %>%
  merge_ci_df(e = 4) %>% # merge the lower and upper CI into one column
  rename("Epiweek" = epiweek, 
         "Deaths" = deaths, 
         "Cases" = population, 
         "CFR (%)" = cfr, 
         "95%CI" = ci) %>% 
  # produce styled output table with auto-adjusted column widths with {flextable}
  qflextable() %>% 
  # make header text bold (using {flextable})
  bold(part = "header") %>% 
  # make your table fit to the maximum width of the word document
  set_table_properties(layout = "autofit") %>% 
  ## set to only show 1 decimal place 
  colformat_double(j = -1, digits = 1)

ar_plot <- ggplot(ar, aes(x = as.Date(epiweek), group = 1)) +
  # add confidence intervals as a ribbon
  geom_ribbon(aes(ymin = lower, ymax = upper), 
              color = "blue",  fill = "blue", 
              linetype = 2, alpha = 0.2, show.legend = FALSE) +
  # add AR as a line
  geom_line(aes(y = ar), color = "blue", show.legend = FALSE) +
  # set origin for axes
  scale_y_continuous(expand = c(0, NA)) +
  # control x-axis breaks and display
  scale_x_date(
    breaks = all_weeks_date,
    labels = all_weeks) +
  # add labels to axes and below chart
  labs(x = "Calendar week", y = "AR [95% CI]", subtitle = "Attack Rate (per 10,000)") + 
  # change visuals of dates and remove legend title
  epicurve_theme

cfr_plot <- ggplot(cfr, aes(x = as.Date(yearweek(as.character(epiweek)))), group = 1) +
  # add confidence intervals as a ribbon
  geom_ribbon(aes(ymin = lower, ymax = upper), 
              color = "red", fill = "red", 
              linetype = 2, alpha = 0.2, show.legend = FALSE) +
  # add CFR as a line
  geom_line(aes(y = cfr), color = "red", show.legend = FALSE) +
  # set origin for axes
  scale_y_continuous(expand = c(0, NA)) +  
  # control x-axis breaks and display
  scale_x_date(
    breaks = all_weeks_date,
    labels = all_weeks) +
  # add labels to axes and below chart
  labs(x = "Calendar week", y = "CFR [95% CI]", 
       subtitle = "Case Fatality Ratio [95% CI] Among Inpatients") + 
  # change visuals of dates and remove legend title
  epicurve_theme

no_x <- theme(axis.text.x = element_blank(),
              axis.title.x = element_blank())

## use patchwork to stack the three plots on top of one another
(basic_curve + no_x) / (ar_plot + no_x) / cfr_plot

Inpatient admissions by case definition and week

linelist_cleaned %>% 
  filter(patient_facility_type == "Inpatient") %>%
  tbl_cross(
    row = epiweek, 
    col = case_def, 
    percent = "column", 
    label = list(epiweek = "Calendar Week", 
                 case_def = "Case definition")
  ) %>% 
  ## make variable names bold 
  bold_labels() %>% 
  # change to flextable format
  as_flex_table() %>%
  # make header text bold (using {flextable})
  bold(part = "header") %>% 
  # make your table fit to the maximum width of the word document
  set_table_properties(layout = "autofit")

Inpatient discharges by reason for exit and week

linelist_cleaned %>% 
  filter(patient_facility_type == "Inpatient") %>%
  tbl_cross(
    row = epiweek, 
    col = exit_status2, 
    percent = "column", 
    label = list(epiweek = "Calendar Week", 
                 exit_status2 = "Outcome")
  ) %>% 
  ## make variable names bold 
  bold_labels() %>% 
  # change to flextable format
  as_flex_table() %>%
  # make header text bold (using {flextable})
  bold(part = "header") %>% 
  # make your table fit to the maximum width of the word document
  set_table_properties(layout = "autofit")

Place

<!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ This section focuses on the where of the outbreak: what area is affected, how many villages, and so on.

There is code to include maps based on distribution of cases. You must have a shapefile to create this map. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->

Descriptive

Cases by region and facility type

linelist_cleaned %>% 
  tbl_cross(
    row = patient_origin, 
    col = patient_facility_type, 
    percent = "column", 
    label = list(patient_origin = "Region", 
                 patient_facility_type = "Facility type")
  ) %>% 
  ## make variable names bold 
  bold_labels() %>% 
  # change to flextable format
  as_flex_table() %>%
  # make header text bold (using {flextable})
  bold(part = "header") %>% 
  # make your table fit to the maximum width of the word document
  set_table_properties(layout = "autofit")

Inpatient discharges by reason for exit and region

linelist_cleaned %>% 
  filter(patient_facility_type == "Inpatient") %>% 
  tbl_cross(
    row = patient_origin, 
    col = exit_status2, 
    percent = "column", 
    label = list(patient_origin = "Region", 
                 exit_status2 = "Outcome")
  ) %>% 
  ## make variable names bold 
  bold_labels() %>% 
  # change to flextable format
  as_flex_table() %>%
  # make header text bold (using {flextable})
  bold(part = "header") %>% 
  # make your table fit to the maximum width of the word document
  set_table_properties(layout = "autofit")

Attack rate per 10,000 population by region

# count cases by region
cases <- linelist_cleaned %>% 
  count(patient_origin) %>%
  # add in population data
  left_join(population_data_region, by = "patient_origin") 


# calculate attack rate for region
ar_region <- attack_rate(cases$n, cases$population, multiplier = 10000) %>% 
  # add the region column to table
  bind_cols(select(cases, patient_origin), .) %>% 
  rename("Region" = patient_origin, 
         "Cases (n)" = cases, 
         "Population" = population, 
         "AR (per 10,000)" = ar, 
         "Lower 95%CI" = lower,
         "Upper 95%CI" = upper) 

ar_region %>% 
  merge_ci_df(e = 4) %>% # merge lower and upper CI in to one column 
  rename("95%CI" = ci) %>%  # rename single 95%CI column
  # produce styled output table with auto-adjusted column widths with {flextable}
  qflextable() %>% 
  # make header text bold (using {flextable})
  bold(part = "header") %>% 
  # make your table fit to the maximum width of the word document
  set_table_properties(layout = "autofit") %>% 
  ## set to only show 1 decimal place 
  colformat_double(digits = 1)

ggplot(ar_region, aes(x = reorder(Region, `AR (per 10,000)`),
               y = `AR (per 10,000)`)) +
  # plot as bars (identity = as is)
  geom_bar(stat = "identity", col = "black", fill = "red") + 
  # add CIs
  geom_errorbar(aes(ymin = `Lower 95%CI`, ymax = `Upper 95%CI`), width = 0.2) +
  # set origin for axes
  scale_y_continuous(expand = c(0,0)) +  
  # add labels to axes and below chart
  labs(x = "Region", y = "AR (per 10,000)", 
       captions = str_glue("Source: MSF data from {reporting_week}")) + 
  epicurve_theme

Mortality rate per 10,000 population by region

deaths <- group_by(linelist_cleaned, patient_origin) %>%
  filter(str_detect(exit_status, "Dead")) %>% 
  summarise(deaths = n()) %>% # count deaths by region
  left_join(population_data_region, by = "patient_origin") # merge population data 

mortality_rate(deaths$deaths, deaths$population, multiplier = 10000) %>%
  # add the region column to table
  bind_cols(select(deaths, patient_origin), .) %>% 
  merge_ci_df(e = 4) %>% # merge the lower and upper CI into one column
  rename("Region" = patient_origin, 
         "Deaths" = deaths, 
         "Population" = population, 
         "Mortality (per 10,000)" = `mortality per 10 000`, 
         "95%CI" = ci) %>% 
  # produce styled output table with auto-adjusted column widths with {flextable}
  qflextable() %>% 
  # make header text bold (using {flextable})
  bold(part = "header") %>% 
  # make your table fit to the maximum width of the word document
  set_table_properties(layout = "autofit") %>% 
  ## set to only show 1 decimal place 
  colformat_double(digits = 1)

Maps

<!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ /// read_shapefiles \\

To create maps, you need to have a shapefile of the area. Often, the MSF GIS unit can provide shapefiles.

Your shapefile can be a polygon or points. Polygons do not need to be contiguous.

The names of the polygons or points MUST match the names in your linelist.

Your coordinate reference system needs to be WGS84. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->

## fake map data - DELETE if you are using real data
map <- gen_polygon(regions = unique(linelist_cleaned$patient_origin))


## read in shapefile
# map <- read_sf(here::here("mapfolder", "region.shp"))

## check the coordinate reference system (CRS)
# st_crs(map)

## if CRS not WGS84, reset it
# map <- st_set_crs(map, value = 4326) # Sets to WGS84

# ggplot() +
#   # load shapefile
#   geom_sf(data = map, col = "grey50") + 
#   # add in case coordinates
#   geom_point(mapping = aes(x = lat, y = lon), 
#              data = linelist_cleaned)

## Create a categories of ARs by region ----------------------------------------

# define maximum
max_ar    <- max(ar_region$`Upper 95%CI`, na.rm = TRUE) # define your highest AR

# create groups - one group will be 0 only, then up to 4 more groups evenly
# divided to maximum
breakers <- as.integer(c(
                         # include zero as a standalone group
                         0, 
                         # 1 to 4 divisions, snapping the boundaries to the nearest 100
                         find_breaks(n = max_ar, breaks = 4, snap = 100)
             ))

## create a categorical variable using the age_categories function 
## (we aren't using ages - but it functions the same way!)
ar_map <- mutate(ar_region, 
             categories = age_categories(`AR (per 10,000)`, 
                                         breakers = breakers))



## Join table to shapefile -----------------------------------------------------
mapsub <- left_join(map, ar_map, by = c("name" = "Region"))



## Plot cases or AR by geography -----------------------------------------------

## you could also fill by cases using `Cases (n)` in the fill option instead of `AR (per 10,000)`
ggplot() +
  # shapefile as polygon
  geom_sf(data = mapsub, aes(fill = categories), col = "grey50") + 
  # needed to avoid gridlines being drawn
  coord_sf(datum = NA) + 
  # add a scalebar
  annotation_scale() + 
  # color the scale to be perceptually uniform 
  # drop FALSE keeps all levels 
  # name allows you to change the legend title 
  scale_fill_brewer(drop = FALSE, palette = "OrRd", name = "AR (per 10,000)") + 
  # label polygons
  geom_sf_text(data = mapsub, aes(label = name), colour = "grey50") + 
  # remove coordinates and axes
  theme_void()

## Prepare data ---------------------------------------------------------------

# change region variable to a factor so that zero counts can be included
linelist_cleaned$patient_origin <- as.factor(linelist_cleaned$patient_origin)


# case counts
cases <- linelist_cleaned %>% 
  group_by(epiweek) %>%
  count(patient_origin, .drop = FALSE) %>%   # cases for each week by region
  left_join(population_data_region, by = "patient_origin")    # merge population data 


# attack rate for region
ar <- attack_rate(cases$n, cases$population, multiplier = 10000) %>% 
  # add the region column to table
  bind_cols(select(cases, epiweek, patient_origin), .) %>% 
  rename("Region" = patient_origin, 
         "Cases (n)" = cases, 
         "Population" = population, 
         "AR (per 10,000)" = ar, 
         "Lower 95%CI" = lower,
         "Upper 95%CI" = upper)

## define the last four weeks available in your dataset
which_weeks <- tail(unique(cases$epiweek), n = 4)

## only keep the last four weeks of data 
ar <- ar %>% 
  filter(epiweek %in% which_weeks)

# define the maximum number of cases / max AR for the color palette 
max_cases <- max(cases$n, na.rm = TRUE) 
max_ar    <- max(ar$`Upper 95%CI`, na.rm = TRUE)


# define breaks for standardising color palette
breakers <- as.integer(c(0, # include zero as a standalone group
             find_breaks(max_ar, breaks = 4, snap = 100) # four breaks rounded to nearest 100
             ))


# add a categorical variable for AR
# (replace with `Cases (n)` if you want to map cases)
ar <- mutate(ar, 
             categories = age_categories(`AR (per 10,000)`, 
                                         breakers = breakers))

## define the last four weeks available in your dataset
which_weeks <- tail(unique(cases$epiweek), n = 4)


# go through each epiweek, filter and plot the data
purrr::walk(.x = which_weeks, 
           .f = ~{
  this_ar <- filter(ar, epiweek == yearweek(.x))

  ## map 
  mapsub <- left_join(map, this_ar, by = c("name" = "Region"))

  ## choropleth 
  map_plot <- ggplot() +
    ## shapefile as polygon
    geom_sf(data = mapsub, aes(fill = categories), col = "grey50") + 
    ## needed to avoid gridlines being drawn
    coord_sf(datum = NA) + 
    ## add a scalebar
    annotation_scale() + 
    ## color the scale to be perceptually uniform (keep levels)
    scale_fill_brewer(drop = FALSE, 
                    palette = "OrRd", 
                    name = "AR (per 10,000)") +  
    ## remove coordinates and axes
    theme_void() 


  ## plot with the region on the x axis sorted by increasing ar
  ## ar value on the y axis 
  barplot <- ggplot(this_ar, aes(x = reorder(Region, `AR (per 10,000)`),
                                 y = `AR (per 10,000)`)) + 
    ## plot as bars (identity = as is)
    geom_bar(stat = "identity", col = "black", fill = "red") + 
     ## add CIs
    geom_errorbar(aes(ymin = `Lower 95%CI`, ymax = `Upper 95%CI`), width = 0.2) +
    ## set origin for axes
    scale_y_continuous(expand = c(0, 0), limits = c(0, max_ar)) +  
    ## add labels to axes and below chart
    labs(x = "Region", y = "AR (per 10,000)") +  
    epicurve_theme



  ## combine the barplot and map plot into one with patchwork and add a title
  ## in the top-left corner and a caption in the bottom-right
  print(
    barplot + map_plot + plot_annotation(
    title   = str_glue("Epiweek: {.x}"),
    caption = str_glue("Source: MSF data from {reporting_week}")
    )
  )

})