library(introexercises)  # get data for exercises 
library(learnr)          # create lessons from rmd 
library(gradethis)       # evaluate exercises
library(dplyr)           # wrangle data
library(flair)           # highlight code 
library(ggplot2)         # visualise data 
library(lubridate)       # work with dates
library(forcats)         # work with factors
library(fontawesome)     # for emojis 
library(scales)          # defining axes and units 
library(stringr)         # work with character strings
library(apyramid)        # creating demographic pyramids 
library(viridis)         # defining colour palettes 
library(janitor)         # clean data
library(tsibble)         # working with epiweeks
library(tidyr)           # clean data
# library(RMariaDB)        # connect to sql database

## set options for exercises and checking ---------------------------------------

## Define how exercises are evaluated 
gradethis::gradethis_setup(
  ## note: the below arguments are passed to learnr::tutorial_options
  ## set the maximum execution time limit in seconds
  exercise.timelimit = 60, 
  ## set how exercises should be checked (defaults to NULL - individually defined)
  # exercise.checker = gradethis::grade_learnr
  ## set whether to pre-evaluate exercises (so users see answers)
  exercise.eval = FALSE 
)

# ## event recorder ---------------------------------------------------------------
# ## see for details: 
# ## https://pkgs.rstudio.com/learnr/articles/publishing.html#events
# ## https://github.com/dtkaplan/submitr/blob/master/R/make_a_recorder.R
# 
# ## connect to your sql database
# sqldtbase <- dbConnect(RMariaDB::MariaDB(),
#                        user     = Sys.getenv("userid"),
#                        password = Sys.getenv("pwd"),
#                        dbname   = 'excersize_log',
#                        host     = "144.126.246.140")
# 
# 
# ## define a function to collect data 
# ## note that tutorial_id is defined in YAML
#     ## you could set the tutorial_version too (by specifying version:) but use package version instead 
# recorder_function <- function(tutorial_id, tutorial_version, user_id, event, data) {
#     
#   ## define a sql query 
#   ## first bracket defines variable names
#   ## values bracket defines what goes in each variable
#   event_log <- paste("INSERT INTO responses (
#                        tutorial_id, 
#                        tutorial_version, 
#                        date_time, 
#                        user_id, 
#                        event, 
#                        section,
#                        label, 
#                        question, 
#                        answer, 
#                        code, 
#                        correct)
#                        VALUES('", tutorial_id,  "', 
#                        '", tutorial_version, "', 
#                        '", format(Sys.time(), "%Y-%M%-%D %H:%M:%S %Z"), "',
#                        '", Sys.getenv("SHINYPROXY_PROXY_ID"), "',
#                        '", event, "',
#                        '", data$section, "',
#                        '", data$label,  "',
#                        '", paste0('"', data$question, '"'),  "',
#                        '", paste0('"', data$answer,   '"'),  "',
#                        '", paste0('"', data$code,     '"'),  "',
#                        '", data$correct, "')",
#                        sep = '')
# 
#     # Execute the query on the sqldtbase that we connected to above
#     rsInsert <- dbSendQuery(sqldtbase, event_log)
#   
# }
# 
# options(tutorial.event_recorder = recorder_function)
# hide non-exercise code chunks ------------------------------------------------
knitr::opts_chunk$set(echo = FALSE)
# data prep --------------------------------------------------------------------
combined <- rio::import(system.file("dat/linelist_combined_20141201.rds", package = "introexercises"))

combined <- combined %>% 
  mutate(week_onset      = yearweek(date_onset, week_start = 1), ## create week of onset variable  
         week_onset_date = as.Date(week_onset))                  ## create a date version 

# Create hospital weeks for heat plot
#####################################
# count cases by hospital-week
hospital_weeks_raw <- combined %>%
  count(hospital, week_onset)

# create longer dataset of all possible hospital-weeks
expanded <- hospital_weeks_raw %>%
  select(hospital, week_onset) %>%
  tidyr::expand(., week_onset, hospital)

# merge so that all hospital-weeks are represented in the data
hospital_weeks <- hospital_weeks_raw %>%
  right_join(expanded) %>%
  mutate(n = replace_na(n, 0)) %>%
  filter(week_onset >= ymd("2014-06-15"))




# 
# 
# count_data <- combined %>%
#   group_by(hospital, date_hospitalisation) %>%
#   summarize(n_cases = dplyr::n()) %>%
#   ungroup()
# # 
# # hospitalisation_week <- incidence(combined,
# #           date_index = date_hospitalisation,
# #           interval = "weeks")
# 
# demo_agg <- combined %>%
#   count(age_cat, , name = "cases") %>%
#   pivot_wider(
#     id_cols = age_cat,
#     names_from = ,
#     values_from = cases) %>%
#   rename(`missing_` = `NA`)
# 
# aggregate_data <- demo_agg %>%
#   pivot_longer(
#     col = c(female, male, missing_),            # cols to elongate
#     names_to = "",                # name for new col of categories
#     values_to = "counts") %>%           # name for new col of counts
#   mutate(
#      = na_if(, "missing_")) # convert "missing_" to NA

Introduction to R for Applied Epidemiology and Public Health

Welcome

Welcome to the course "Introduction to R for applied epidemiology", offered by Applied Epi - a nonprofit organisation and the leading provider of R training, support, and tools to frontline public health practitioners.

knitr::include_graphics("images/logo.png", error = F)

Heat plots, epidemic curves, and pyramids

This exercise focuses on more complex plots that are common in public health work, such as heat plots, epidemic curves, and age/sex demographic pyramids.

This will involve using previously explored aspects such as:

Format

This exercise guides you through tasks that you should perform in RStudio on your local computer.

Getting Help

There are several ways to get help:

1) Look for the "helpers" (see below) 2) Ask your live course instructor/facilitator for help
3) Schedule a 1-on-1 call with an instructor for "Course Tutoring" 4) Post a question in Applied Epi Community

Here is what those "helpers" will look like:

r fontawesome::fa("lightbulb", fill = "gold") Click to read a hint

Here you will see a helpful hint!


r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

linelist %>% 
  filter(
    age > 25,
    district == "Bolo"
  )

Here is more explanation about why the solution works.


Quiz questions

Answering quiz questions will help you to comprehend the material. The answers are not recorded.

To practice, please answer the following questions:

quiz(
  question_radio("When should I view the red 'helper' code?",
    answer("After trying to write the code myself", correct = TRUE),
    answer("Before I try coding", correct = FALSE),
    correct = "Reviewing best-practice code after trying to write yourself can help you improve",
    incorrect = "Please attempt the exercise yourself, or use the hint, before viewing the answer."
  )
)
question_numeric(
 "How anxious are you about beginning this tutorial - on a scale from 1 (least anxious) to 10 (most anxious)?",
 answer(10, message = "Try not to worry, we will help you succeed!", correct = T),
 answer(9, message = "Try not to worry, we will help you succeed!", correct = T),
 answer(8, message = "Try not to worry, we will help you succeed!", correct = T),
 answer(7, message = "Try not to worry, we will help you succeed!", correct = T),
 answer(6, message = "Ok, we will get there together", correct = T),
 answer(5, message = "Ok, we will get there together", correct = T),
 answer(4, message = "I like your confidence!", correct = T),
 answer(3, message = "I like your confidence!", correct = T),
 answer(2, message = "I like your confidence!", correct = T),
 answer(1, message = "I like your confidence!", correct = T),
 allow_retry = TRUE,
 correct = "Thanks for sharing. ",
 min = 1,
 max = 10,
 step = 1
)

License

Please email contact@appliedepi.org with questions about the use of these materials.

Learning objectives

In this exercise you will:

During this exercise you will be asked to learn new functions by reading their documentation. This is intentional. If you are confused, just ask an instructor for assistance.

Refresh your memory

First, let us refresh ourselves on the basics of ggplot2!

quiz(
  question("Where are dynamic aesthetics placed in ggplot code?",
    allow_retry = TRUE,
    answer("inside aes()", correct = T),
    answer("outside aes()", message = "dynamic assignments means the aesthetic is mapped to a column in the data, and the presentation can vary for each row. These must be done inside 'mapping = aes()'")
  ),
  question("Are static aesthetics in the initial ggplot() call inherited by subsequent geoms?",
    allow_retry = TRUE,
    answer("No", correct = TRUE),
    answer("Yes", message = "No, for example if you set the size of points equal to 3, the lines will not be size 3 as well.")
  ),
  question("Are dynamic aesthetics in the initial ggplot() call inherited by subsequent geoms?",
    allow_retry = TRUE,
    answer("No", message = "Yes they are, for example, assigning wt_kg to the X-axis is inherited by all the subsequent geoms."),
    answer("Yes", correct = TRUE)
  ),
    question("Should adjustments to the theme be made before or after setting one of the default themes?",
    allow_retry = TRUE,
    answer("before", message = "A complete/default theme will override all previous micro adjustments to the theme"),
    answer("after", correct = TRUE)
  ),
  question("Which of the following are prebuilt themes in ggplot",
    allow_retry = TRUE,
    answer("theme_bw()", correct = TRUE),
    answer("theme_classic()", correct = TRUE),
    answer("theme_red()"),
    answer("scale_color_brewer()")
  ),
  question("How would you hide a legend in ggplot?",
    allow_retry = TRUE,
    answer("theme(legend.title = 'element.blank()')"),
    answer("theme(legend.position = 'right')"),
    answer("theme(legend.position(none))"),
    answer("theme(legend.position = 'none')", correct = TRUE)
  ),
  question("How would you set your legend to appear in the centre of your graph?",
    answer("theme(legend.position = 'middle')"),
    answer("theme(legend.position =  c(0.5,0.5))", correct = TRUE)
  )
)

Great, now we have refreshed our memory with a quiz, we will start on our first topic, heat plots.

Preparation

Open your ebola project. Run all the code chunks in your "ebola_sitrep.Rmd", so that you are able to use the data frame combined.

If you did not complete the joining exercise, or are seeing errors when trying to run your script, you can import a "backup" combined data frame from the "data/clean/backup/" folder using this command:

combined <- import(here("data", "clean", "backup", "linelist_combined_20141201.rds"))

Talk to an instructor if you have problems running your script!

Packages

Add the following packages and load them:

Remember to keep {tidyverse} as the last package in the command.

Your packages chunk should look like this:

# This chunk loads packages
pacman::p_load(
  rio,          # for importing data
  here,         # for locating files
  janitor,      # for data cleaning  
  lubridate,    # for date cleaning
  epikit,       # for easy inline code
  scales,       # for percents and date labels
  officer,      # border lines
  flextable,    # make HTML tables
  gtsummary,    # for nice tables
  apyramid,     # for age/sex pyramids
  RColorBrewer, # for colour palettes
  gghighlight,  # highlighting plot parts  
  ggExtra,      # special plotting functions
  viridis,      # for color-blind friendly color scales
  tsibble,      # for epiweeks and other time series analyses
  tidyverse     # for data management and visualization
)

New code chunk in your R Markdown

Add a new "Heat plots" chunk near the bottom of your R Markdown.

# Heat plots 

You can also add a heading in the text portion of the R Markdown to deliniate this new content.

Your "ebola_sitrep.Rmd" script should have approximately this outline:

  1. YAML
  2. Setup code chunk
  3. Load packages
  4. Import data
  5. (Optional) Exploratory analysis
  6. Clean and export surveillance linelist
  7. Join data
  8. Text summary of the outbreak
  9. Summary tables
  10. Plots
  11. Spotlight on a particular district
  12. Pivoting - patient timelines
  13. NEW - Heat plots
  14. (Optional) Testing area chunk

Epiweeks

Before we proceed, it is worth covering the creation of "weeks", or "epiweeks" (epidemiological weeks) in your data frame.

In order to create tables of case counts by week, you may need to create a new column that contains the "epiweek", based on a date such as date_onset.

There are many different ways to define what a "week" is. Likewise, some countries formulate epiweeks differently (e.g. starting on Sundays, Mondays, or Fridays). Most of the world uses weeks that begin on Mondays.

Below, we present the most common approach and provide you with a reference sheet for the rest (in the "intro_course/learning materials" folder).

{tsibble} to create Epidemic Weeks (e.g. "2014 W44")

We recommend the {tsibble} package for creating epidemic "Week" values. This package is often used for time series analysis and has a variety of useful time/date functions.

Its function yearweek() conveniently produces weeks from a provided date column. You can also designate the week_start = argument, where 1 is for Monday, 7 is for Sunday, etc.

Putting yearweek() inside mutate(), we can create a new column of class "yearweek", called week_onset, which will display values like "2014 W44". Importantly, these can be used to bin the rows into discrete weeks for an epidemic curve or another table/plot.

Do NOT add this code to your script, we are simply showing how yearweek() alters the column format.

# new column for week of onset (as year-week)
combined %>% 
  mutate(week_onset = yearweek(date_onset, week_start = 1))

The values of week_onset will look like this:

combined %>% 
  mutate(week_onset = yearweek(date_onset, week_start = 1)) %>% 
  pull(week_onset) %>% 
  head()

Epidemic week start dates

For maximum flexibility, we now create a second column that will be a normal date class, displaying the start date of the week.

We create it by wrapping the week_onset (class "yearweek") in the {base} function as.Date(). The "weeks" will be converted to dates. If you selected week_start = 1 (Mondays), then the date returned will be the Monday of that week (rounding down). This is saved as a new column, week_onset_date.

Do not add this code to your script, it is simply an example.

# new column for week of onset (as a date)
combined %>% 
  mutate(week_onset = yearweek(date_onset, week_start = 1)) %>% 
  mutate(week_onset_date = as.Date(week_onset)) 

The values of week_onset_date will look like this:

combined %>% 
  mutate(week_onset = yearweek(date_onset, week_start = 1)) %>% 
  mutate(week_onset_date = as.Date(week_onset)) %>% 
  pull(week_onset_date) %>% 
  head()

For now, add the below mutate() command at the end of your "Joining" code chunk, before combined is exported as a .rds file. Later, you may choose to integrate these commands into the cleaning pipe chain of the surveillance dataset instead.

Re-run the "Joining" command so that the changes are implemented in combined.

Advanced mutate() style

mutate() allows more than one statement within its parentheses, separated by commas. In an optional step, you can insert both changes within the same mutate() command.

combined <- combined %>% 
  mutate(week_onset      = yearweek(date_onset, week_start = 1), ## create week of onset variable  
         week_onset_date = as.Date(week_onset))                  ## create a date version 

Now we have two versions of "epiweek" columns that we can use to make plots and tables!

Heat plots

Heat plots, also called "heat tiles", are useful visualizations when trying to display 3 variables (x-axis, y-axis and fill).

For instance, you may want to look at a breakdown of how many cases were reported by week across several hospitals, to get an idea of how the epidemic has progressed in several places. Let's see how this works.

Let's say that we have a data frame called hospital_weeks, which has the following columns (you will create this in a moment):

head(hospital_weeks)    # print the first 6 rows

This data is in "long" format, and every possible hospital-week has a row.

Create this data frame in your "Heat plots" code chunk using this code below. Do not try to understand this code (try our advanced course in data management). In essence, it uses count() to create a data frame of case counts by week, then creates a data frame with all possible hospital-weeks, and then joins them together.

# count cases by hospital-week
hospital_weeks_raw <- combined %>%
  count(hospital, week_onset_date) 

# create longer dataset of all possible hospital-weeks
expanded <- hospital_weeks_raw %>% 
  select(hospital, week_onset_date) %>%
  tidyr::expand(., week_onset_date, hospital)

# merge so that all hospital-weeks are represented in the data
hospital_weeks <- hospital_weeks_raw %>%      
  right_join(expanded) %>%                            
  mutate(n = replace_na(n, 0)) %>% 
  filter(week_onset_date >= ymd("2014-06-15"))

Basic heat plot

Review documentation

r fontawesome::fa("eye", fill = "darkblue") Take a few minutes to review the documentation for the function geom_tile(), which is from {ggplot2}. You can access this by either:

1) Entering ?geom_tile in the Console, or
2) Click to the "Help" pane (lower-right) and entering geom_tile in the search bar

Make a heat plot

What ggplot() code would you write to create a heat plot that:

Add a heatplot to your "Heat plots" code chunk. Remember the format of ggplot(), where we use the + to add geom layers to the ggplot(). Don't forget to use the argument mapping = aes(x = , y = , fill = ) in your ggplot() function.

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

ggplot(data = hospital_weeks,
       mapping = aes(x = week_onset_date, y = hospital, fill = n)) +
  geom_tile()


quiz(
  question("Which hospital reported the most cases with onset in September?",
    allow_retry = TRUE,
    answer("SMMH"),
    answer("Port Hospital", correct = T),
    answer("Military Hospital"),
    answer("Central Hospital"),
    answer("Unknown")
  ),
    question("Which hospital reported the highest case fatality ratio?",
    allow_retry = TRUE,
    answer("SMMH", message = "It is not possible to know the case fatality ratio from this plot"),
    answer("Port Hospital", message = "It is not possible to know the case fatality ratio from this plot"),
    answer("Military Hospital", message = "It is not possible to know the case fatality ratio from this plot"),
    answer("Central Hospital", message = "It is not possible to know the case fatality ratio from this plot"),
    answer("Unknown", correct = T)
    )
)

Customising your heat plot

The plot is nice, but let us do a few more things to refine the presentation.

Update your heat plot code with the following:

1) Modify hospital_weeks to create a new data frame: hospital_weeks_adjusted. In this data frame, ensure that NA values in the column hospital are re-coded as "Missing" and this column is class "factor" with the levels in this order : "Central Hospital", "Military Hospital", "Port Hospital", "SMMH", "Other", "Missing".
* You can convert NA to "Missing" using fct_na_value_to_level() from {forcats} within mutate(). Make sure to use the level = argument to change NA to Missing with quotations around Missing.
* You can then use fct_relevel() also from {forcats}, in a subsequent mutate() to provide the desired level order

r fontawesome::fa("lightbulb", fill = "gold") Click to read a hint

Within a mutate() command, use the {forcats} package and its function fct_na_value_to_level() to re-define the column hospital to convert NA to "Missing" (this also converts the column to class "factor"). Within the fct_na_value_to_level(), include level = "Missing" to reassign NA to the word "Missing".

Pipe this into another mutate command, and use the function fct_relevel() to adjust the ordering of the levels.

If you need a refresher on how to change NA values in factor columns, and how to reorder, checkout the R Handbook chapter on factors.


r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

hospital_weeks_adjusted <- hospital_weeks %>% 
  mutate(hospital = fct_na_value_to_level(hospital, level = "Missing"),
         hospital = fct_relevel(
           hospital,
           "Central Hospital", "Military Hospital", "Port Hospital", "SMMH", "Other", "Missing"))




2) In the ggplot of the new adjusted data frame, change the color scheme to a diverging color scale. Provide a scale that extends from "skyblue" at low values to "tomato" at high values.

r fontawesome::fa("lightbulb", fill = "gold") Click to read a hint

Add scale_fill_gradient() to the ggplot() command. See the help documentation with ?scale_fill_gradient to understand the arguments.


r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

ggplot(
  data = hospital_weeks_adjusted,
  mapping = aes(
    x = week_onset_date,
    y = hospital,
    fill = n)) +
  geom_tile()+
  scale_fill_gradient(low = "skyblue", high = "tomato")




3) The default x-axis ticks show each month. Change these to efficiently show every two weeks.

r fontawesome::fa("lightbulb", fill = "gold") Click to read a hint

Add scale_x_date() to the bottom of the ggplot, and provide the arguments date_breaks = "2 weeks" (for example), and labels = scales::label_date_short().


r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

ggplot(
  data = hospital_weeks_adjusted,
  mapping = aes(
    x = week_onset_date,
    y = hospital, fill = n)) +
  geom_tile()+
  scale_fill_gradient(low = "skyblue", high = "tomato")+
  scale_x_date(
    date_breaks = "2 weeks",
    labels = scales::label_date_short()
  )




4) Make the labels for the axes and legend more presentable.

r fontawesome::fa("lightbulb", fill = "gold") Click to read a hint

Adjust the labs(). Recall that the legend title in this case is edited with fill = because it was created by the fill.


Below is all the solution code together:

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

# Edit the dataset
hospital_weeks_adjusted <- hospital_weeks %>% 
    mutate(hospital = fct_na_value_to_level(hospital, level = "Missing"),
           hospital = fct_relevel(
             hospital,
             "Central Hospital", "Military Hospital", "Port Hospital", "SMMH", "Other", "Missing"))

# Plot with adjustments
ggplot(data = hospital_weeks_adjusted) +
  geom_tile(mapping = aes(
    x = week_onset_date,
    y = hospital,
    fill = n))+
  scale_fill_gradient(low = "skyblue", high = "tomato")+
  scale_x_date(
    date_breaks = "2 weeks",
    labels = scales::label_date_short())+
  labs(
    x = "Week of symptom onset",
    y = "Hospital",
    fill = "Number of\nweekly cases")+
  theme_minimal()


In the solution code, notice how we can use \n to split our labels onto multiple lines. This is a very useful addition to our labels! Particularly if we want to format plots so they fit nicely in our report.

Setting up the data, creating and customising a heat plot

Note that to make the heat plot using geom_tile(), you had to provide a data frame with counts - aggregated data (not a linelist). There are ways to create heat plots from linelist data (see geom_density() functions) but those will not be covered today.

Now that you are familiar with the heat tile plotting syntax, we will move to an exercise where you create the counts data frame.

Try to do the following on your own. It will be challenging, but this is good exposure to intermediate-level R programming.

First, create a new code chunk for an age and outcome summary, below your heat plots chunk. Do the following:

1) Use combined to create a new data frame named age_outcome_summary.
* Drop any rows with NA values by piping to drop_na(sex, age_cat).
* Use group_by() and summarise() to show summary counts for each unique combination of age_cat and sex.
* In addition to case counts, include a column showing the number of rows in each group whose outcome is equal to "Death".
* Finally, include a column calculating the proportion in each group equal to "Death".

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

# make new data frame summarizing age, sex, and deaths
age_outcome_summary <- combined %>%
  drop_na(sex, age_cat) %>%            # remove any NA values
  group_by(age_cat, sex) %>%           # group rows by age- groups
  summarise(                           # begin creating new summary columns
    n = n(),
    n_death = sum(outcome == "Death", na.rm = TRUE), # number of rows with Death outcome
    pct_death = n_death / n)           # create proportion dead in the group




2) Create a heat plot using this new data frame. Make the tiles filled according to the percent of Death outcomes, as displayed by sex and age_cat.

Here are some extra challenges, if you have time:

Add this to your "Age outcome summary" code chunk,

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

# make plot
ggplot(data = age_outcome_summary,            # use new data frame
       mapping = aes(x = sex,                 #  on x-axis
                     y = age_cat,             # age category on y-axis
                     fill = pct_death)) +     # fill (color) is shaded by the proportion dead
  geom_tile() +                               # display data as tiles  
  scale_fill_viridis(                         # adjust colors scale
    option = "B",                             # pick any option
    limits = c(0, 1),                         # set the limits to go from 0 to 1
    labels = scales::percent,                 # adjust legend values to be percents
    direction = -1) +                         # reverses color gradient direction

  geom_text(                                  # add text over the tiles
    mapping = aes(
      label = scales::percent(                # show percents instead of proportions
        pct_death,
        accuracy = 0.1)))+
  labs(x = "Sex",                             # add labels
       y = "Age categories",
       fill = "Percent of\ncases fatal")


quiz(
  question("Who has the highest proportion of 'Death' in combined gender and age category?",
           answer("Male and 20-29", correct = TRUE),
           answer("Female and 20-29"),
           answer("Male and 50-59"),
           answer("Female and 40-49")
           ),
  question("Which  has the oldest ages?",
           answer("Female"),
           answer("Male", correct = TRUE)
           )
)

Ok, good job! You have managed to build summary datasets for a heat plot and then customize the plot so that you can analyze the information.

Even if you did not know how to do all of those edits to the ggplot, the purpose of that task was to expose you to a new plot and customization possibilities.

You can always find sample code in the Heat Plots chapter of the Epi R Handbook, and become more well-practiced later.

Epi curves

An epidemic curve (also known as an “epi curve”) is a core epidemiological chart typically used to visualize the temporal pattern of illness onset among a cluster or epidemic of cases.

Analysis of the epicurve can reveal temporal trends, outliers, the magnitude of the outbreak, the most likely time period of exposure, time intervals between case generations, and can even help identify the mode of transmission of an unidentified disease (e.g. point source, continuous common source, person-to-person propagation).

We are going to build epidemic curves with {ggplot2}, which allows for advanced customizability. There is another option, the {incidence2} package as described in the Epi R Handbook, which is perhaps more simple but less customizeable.

As you have been through several exercises using {ggplot2}, you are now familiar with the syntax and customization it allows (themes, labels, axes, etc).

Weeks or dates

When deciding to make an epidemic curve, you must first decide whether you want to display the "week" on the X-axis (e.g. "2014 W35") or some variety of calendar date.

Epicurves with calendar dates

In its most basic form, plotting epidemic curves in {ggplot2} is a case of using the ggplot function geom_histogram() to show the distribution of the date values. However, there are 3 main things to watch out for:

Simple bins

Create a new section and code chunk for "Epicurves" below the "Age outcomes summary".

Make a simple histogram from the data frame combined which shows the distribution of date_onset using 5-day intervals? The histogram "binwidth" can be specified as a number (as a static aesthetic) to binwidth =. Also, clean up the labels and use theme_bw().

Add this to your "Epicurves" code chunk. Remember, you can use your "Testing area" code chunk for any code testing you need to do first!

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

ggplot(data = combined, mapping = aes(x = date_onset)) +
    geom_histogram(binwidth = 5) +
    labs(x = "Date of onset", y = "Incidence") +
    theme_bw()


Defining manual bins

Plotting using bin_width() will start the first bin at the first case. As epidemiologists and public health practitioners we typically need to produce an epidemic curve that adheres to very particular specifications. For example, a 7-day week that starts on a particular day. Likewise, this approach does not work for monthly bins, because months can be 28, 29, 30 or 31 days!

Instead of using binwidth =, we specify the histogram bars with breaks =.

A vector of dates

breaks = expects a vector of dates to use as breaks (cut-points).

You know that a vector can be made using c(), for example c(54, 22, 89) or c("Paris", "Delhi", "Kigali").
Similarly, you could make a vector of dates like: c(ymd("2014-06-15"), ymd("2014-06-22"), ymd("2014-06-29")).
But this would be very long code to include all the weekly break points in this outbreak!

Instead, use the {base} function seq.Date() to automatically create a sequence of dates between a from = value, to a to = value, and in increments specified to by =. You can specify by values such as "day", "week", "month", etc.

The sequence below uses dates of the first and last case onsets. Write this command in your Testing chunk and run it to see the output.

seq.Date(from = ymd("2014-05-06"),
         to = ymd("2014-11-28"),
         by = "week")

This can be integrated into the epicurve command. Try it if you want to, but know that we will be changing it later.

ggplot(data = combined, aes(x = date_onset))+
  geom_histogram(
    breaks = seq.Date(
      from = ymd("2014-05-06"),
      to = ymd("2014-11-28"),
      by = "week")) +
  labs(x = "Date of onset", y = "Incidence") +
  theme_bw()

Now we are confident that the breaks are using 7-day intervals starting and ending on a specific date. But typically we need the intervals to start on Mondays (or some other specified day).

Rounding down and up to weeks

To round dates, we suggest using floor_date() and it's companion ceiling_date() from the {lubridate} package.

Try these codes in your Testing chunk. They simply showcase how these functions work in isolation.

# Monday BEFORE the earliest case
floor_date(ymd("2014-05-06"), unit = "week", week_start = 1)

# Monday AFTER the last case
ceiling_date(ymd("2014-11-28"), unit = "week", week_start = 1)

By integrating these into the seq.Date() command, we can have a weekly vector starting on the Monday before the earliest case.

Adapt your seq.Date() command to use floor_date() and ceiling_date() as below.

# Sequence of Mondays from before earliest case, to after latest case
seq.Date(from = floor_date(ymd("2014-05-06"), unit = "week", week_start = 1),
         to =   ceiling_date(ymd("2014-11-28"), unit = "week", week_start = 1),
         by =   "week")

Dynamic start and end dates

Lastly, what happens if our data are refreshed or updated with new records? Those static dates in our code will not be accurate!

Replace the static dates with min() and max(), using the column combined$date_onset. Because this is not a {dplyr} pipe, we need to specify the data frame and column with the $ index operator. Don't forget that these functions need na.rm = TRUE to remove missing values.

seq.Date(from = floor_date(min(combined$date_onset, na.rm=T), unit = "week", week_start = 1),
         to =   ceiling_date(max(combined$date_onset, na.rm=T), unit = "week", week_start = 1),
         by =   "week")

In this way, we have made our code more dynamic. Dynamic code ensures that all the plots and tables update automatically.

Separate the breaks command

The easiest way to organize this command is to separate the seq.Date() command from the ggplot() command.

Pull out your seq.Date() command and move it before the ggplot(), saving it with a name like ebola_weeks that is provided to the breaks = argument in the geom_histogram().

# Define and save the vector
ebola_weeks <- seq.Date(
  from = floor_date(min(combined$date_onset, na.rm=T), unit = "week", week_start = 1),
  to =   ceiling_date(max(combined$date_onset, na.rm=T), unit = "week", week_start = 1),
  by =   "week")

One final change before we finish is to add closed = "left" to geom_histogram(), which makes the bars also count the cases reported on the break days themselves.

# Define and save the vector
ebola_weeks <- seq.Date(
  from = floor_date(min(combined$date_onset, na.rm=T), unit = "week", week_start = 1),
  to =   ceiling_date(max(combined$date_onset, na.rm=T), unit = "week", week_start = 1),
  by =   "week")


# Run the plot, using the vector
ggplot(data = combined, aes(x = date_onset)) +
  geom_histogram(
    breaks = ebola_weeks,
    closed = "left") +
  labs(x = "Date of onset", y = "Incidence") +
  theme_bw()

Compare this plot to the histogram printed in the exercise above (7-days, no specified breaks). Do you see any small differences in how the cases have been binned? (look at the beginning of the epidemic).

The above can seem like a lot of work to just produce a list of dates! But realise the control that you have over the output. And, now that you have this code, it will be easy to modify. All this is also in the Epi R Handbook chapter on Epidemic Curves for easy reference!

Monthly epidemic curve

Now, make a new epidemic curve in your Epidemic Curves chunk, in which the histogram bin breaks are by month from the month-start prior to the first case.

Once you have done this, adjust the code so that the fill in the bars is by hospital, and the x-axis labels appear every month.

r fontawesome::fa("lightbulb", fill = "gold") Click to read a hint

As we want the values to be aggregated by calendar month, do the following:

  • Set the unit = to "month" in the two {lubridate} functions and remove the week_start arguments (check ?floor_date for details).
  • Change the by = argument of seq.Date() to "month"
  • Change the name of the vector ebola_weeks to ebola_months, and change it in the ggplot() command too.

  • To set the fill, put fill = hospital in the aesthetic mappings

  • To adjust the x-axis labels, do the following within scale_x_date():
  • set labels = label_date_short()
  • set date_breaks = "months"


r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

# define and save the vector (by month)
ebola_months <- seq.Date(
  from = floor_date(min(combined$date_onset, na.rm=T), unit = "month"),
  to =   ceiling_date(max(combined$date_onset, na.rm=T), unit = "month"),
  by =   "month")

# run the plot with monthly breaks, and fill
ggplot(data = combined, mapping = aes(x = date_onset, fill = hospital)) +
  geom_histogram(breaks = ebola_months, closed = "left") +
  scale_x_date(
    date_breaks = "months",
    labels = scales::label_date_short())+
  labs(x = "Date of onset", y = "Incidence") +
  theme_bw()


Facets

It may be difficult to read this epicurve and to understand the case trends from each hospital.

Faceting the plots means creating "small-multiple" (mini plots) for each unique value in a column (e.g. hospital).

You create facets by adding facet_wrap() to the ggplot with a +, and writing a tilde ~ with the name of the column to use. For example:

# run the plot with monthly breaks, and fill
ggplot(data = combined,
       mapping = aes(x = date_onset, fill = hospital)) +
  geom_histogram() +
  facet_wrap(~sex)

Now, create facets based on hospital for your epidemic curve. Add this to your R Markdown by updating the code in your "Epicurves" code chunk.

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

ggplot(data = combined, aes(x = date_onset, fill = hospital)) +
  geom_histogram(breaks = ebola_months, closed = "left") +
  scale_x_date(
    date_breaks = "months",
    labels = scales::label_date_short())+
  labs(x = "Date of onset", y = "Incidence") +
  theme_bw()+
  facet_wrap(~hospital)


There are MANY other adjustments that you can make to your epidemic curve made with {ggplot2}. Many of them are outlined in the Epidemic curves chapter of the Epi R Handbook.

Epicurves with week numbers

If you want to display "week numbers", leverage the fact that you created the column week_onset at the beginning of this module. Still, there will be several steps:

1) Create a vector of epiweek values (similar to what we created in the previous section, but written like "2014 W35")
2) Create a second vector of date values, from the epiweek values
3) Use count() on the discrete "yearweek" column (e.g. week_onset) to get counts by week
3) Plot with geom_col() as shown below

Add another section and code chunk within the "Epicurves" section of your R Markdown script for "Epicurve with week numbers".

Below, we create a vector using seq() - not seq.Date(), because these values are special week values, not dates.

## create vector of the unique weeks which occur
all_weeks <- seq(min(combined$week_onset, na.rm = TRUE),
                 max(combined$week_onset, na.rm = TRUE),
                 by = 1) 

all_weeks

Then we make a second vector with these weeks transformed to dates:

## change weeks to dates
all_weeks_date <-  as.Date(all_weeks)  

all_weeks_date

Then we use count() to get counts by epiweek. The as_tsibble() and fill_gaps(n = 0) are from the {tsibble} package and ensure that weeks with no cases are represented by 0, not dropped.

counts <- combined %>%
  filter(!is.na(week_onset_date)) %>%
  count(week_onset_date) %>%
  as_tsibble() %>%     # ensure that all weeks are present in the table 
  fill_gaps(n = 0)     # filled in with 0 as necessary

Finally, we plot the counts using geom_col(). Note:

## plot 
counts %>%
  ggplot(aes(x = week_onset_date, y = n)) +        # n is the counts 
  geom_col(width = 7) +                            # for full-width weekly columns                             
  theme(axis.text.x = element_text(angle = 90)) +  # pivot the labels 90 degrees
  scale_x_date(
    breaks = all_weeks_date,
    labels = all_weeks)

Age pyramids

Age pyramids are a useful way of illustrating demographics, and can be customised in a variety of different ways. For our purposes, we will be using the function age_pyramid() from the apyramid package.

Create a new "Age pyramid" section heading and code chunk below your "Epicurves" section.

apyramid

It is good to practice learning a function by reading its documentation. The function age_pyramid() is fairly simple, in that it produces high quality age pyramids with relatively few arguments.

Look up the documentation for the function (?age_pyramid) and review the arguments and examples.

Now create an age pyramid using the combined data frame and the age_cat categories, that is split by sex. Add this to a new "Age pyramids" code chunk.

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

age_pyramid(data = combined,
            age_group = age_cat,
            split_by = sex)


You can also plot the values by proportion, rather than count, and include a column for missing data.

Can you now, after reviewing the documentation, create an age pyramid of sex that includes missing (NA) values and plots the proportions?

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

age_pyramid(data = combined,
            age_group = age_cat,
            proportional = TRUE,
            na.rm = FALSE) 


Counts

In the previous example, the data is in a linelist format, where each row is a unique observation. However, you may be given data that is already aggregated into counts.

Just to practice, create this dataset demo_wide_counts by running the following command in a new code chunk.

# Creates the object demo_agg
#############################
demo_wide_counts <- structure(list(
  age_cat = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, NA),
                      .Label = c("0-9", "10-19", "20-29", "30-39", "40-49", "50-59", "60-69", "70+"), class = "factor"),
  female = c(149L, 104L, 55L, 15L, 4L, 1L, NA, NA, NA),
  male = c(96L, 80L, 65L, 39L, 19L, 5L, 5L, 1L, NA),
  missing_sex = c(5L, 4L, 4L, 2L, NA, NA, 1L, NA, 26L)),
  row.names = c(NA, -9L),
  class = c("tbl_df", "tbl", "data.frame"))

It should look like this:

demo_wide_counts

To be acceptable to age_pyramid() the data must be in "long" format, so first, you would need to pivot longer so that the information is in one column (remember tidy data!) and the counts are in one column.

You could use this command:

demo_long_counts <- demo_wide_counts %>% 
  pivot_longer(
    col = c(female, male, missing_sex),    # cols to elongate
    names_to = "sex",                      # name for new col of categories
    values_to = "counts") %>%              # name for new col of counts
  mutate(
    sex = na_if(sex, "missing_sex"))       # convert "missing_" to NA

Which results in:

demo_long_counts

Now, use what is written in the age_pyramid() documentation to adjust the plotting command to accept counts.

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

age_pyramid(data = demo_long_counts,
            age_group = age_cat, 
            split_by = sex,
            count = counts)


Adding ggplot to age pyramids

Let's try this out with our combined data frame object. Create an age pyramid stratified by sex, add this to the "Age pyramid" code chunk:

# Age pyramid with ggplot modifications
age_pyramid(
     data = combined,
     age_group = age_cat,
     split_by = sex,
     proportional = TRUE,
     show_midpoint = FALSE)

You can also easily add ggplot commands using the + to {apyramid} age pyramid to add labels and adjust the color scale.

Try this out now:

# Age pyramid with ggplot modifications
age_pyramid(
     data = combined,
     age_group = age_cat,
     split_by = sex,
     proportional = TRUE,
     show_midpoint = FALSE)+

  theme_minimal()+

  scale_fill_brewer(type = "qual", palette = 2)+

  labs(title = "Age and sex of confirmed cases",
       y = "Proportion of all cases",
       x = "Age group",
       caption = "Ebola outbreak",
       fill = "Sex")

Note, because the ggplot() function in the {apyramid} function age_pyramid() uses coord_flip(), when adding labels on an age_pyramid() plot, you need to supply the x = and y = labels to the opposite axis of where you would like them to appear.

End

Congratulations!

You have finished the Ebola case study and learned A LOT of R coding along the way!

You have learned a lot:

Make sure to save and knit your R Markdown script.

Consider adding more written content between the plots and tables, interpreting the results. Decide which plots to keep or exclude. The choice is up to you!

If you want to work through a GIS example with your ebola dataset, proceed to the Extra below...

Extras: GIS demonstration

Creating maps in R

The following is a brief demonstration of making maps with R.

You can use {ggplot2} to relatively easily produce maps and conduct spatial analyses.
The Ebola linelist includes information on latitude and longitude for each case (lat and lon columns). There are shape files (map layers) in the "ebola/data/shp" folder. We will not provide a long explanation of GIS or shapefiles, however, this exercise will demonstrate plotting the linelist data in simple maps.

If you are interested in GIS with R, ask an instructor about our advanced course.

Open a new R script. You can save the R script as "ebola_GIS_analyses.R" in your "ebola" project and in the "scripts" subfolder.

#############################################
# GIS Ebola example
# Bonus section
# Your NAME here
#############################################

You will first need to load specific packages for mapping:

Add them to your R script, and remember to include a section heading.

# Load packages -----------------------------------------------------------
pacman::p_load(
  rio, 
  here,
  sf,           # for working with geospatial data
  ggspatial,    # for base maps and north arrows
  raster,       # for spatial formatting  
  tidyverse
)

Next, import the required data. Use the combined linelist first. Then import shapefiles using the special read_sf() function.

# Import data -------------------------------------------------------------

# import cleaned and joined case linelist
combined <- import(here("data", "clean", "backup", "linelist_combined_20141201.rds"))

## import shapefile of administrative boundaries 
shapefile <- read_sf(here("data", "shp", "sle_adm3.shp"))

Before plotting we have some data cleaning to do. Create an object called districts to identify the distinct district values present in the linelist. This will allow us to filter the shapefiles to only keep districts with cases present.

# Filter data -------------------------------------------------------------

## districts we are interested in 
districts <- combined %>%    
  distinct(admin3pcod) %>%   # identify distinct values present in admin3pcod column
  drop_na() %>%   # remove NA
  pull()          # pulls out the distinct values

## filter shapefile for districts of interest 
shapefile <- shapefile %>% 
  filter(admin3Pcod %in% districts)

Now that you have filtered the shapefile to the outbreak region, create a basic ggplot using the admin boundaries.

# Basic plot of the district shapes ---------------------------------------
## open up a ggplot
shape_plot <- ggplot() + 

  ## add the shapefile on top
  geom_sf(data = shapefile, 
          fill = NA,         # no fill
          colour = "black")  # black borders
# print
shape_plot

Here we assign the map to the name shape_plot. Because we assign it to this name, it will not automatically print to the viewer pane. To view the plot run the object's name. This is the difference between saving an output (using <- to create it) and printing it.

This map will become a foundational layer of our plot. Just like we can layer geoms in non-map ggplots, this shape_plot will be the first layer in our map plots.

We will go through 3 simple plotting examples:

Points

To plot points, remove rows with NAs present in lon and lat columns. Next, define the coordinate reference system (CRS). We will not go into detail about this here, but be aware that these vary by region. See the Epi R Handbook chapter on GIS for more details.

############# POINTS  ##########################################################

combined_sf <- combined %>% 
  drop_na(lat, lon) %>% 
  st_as_sf(                                               
    # define the coordinates based on lat/long variables
    coords = c("lon", "lat"),                             
    # set the coordinate reference system to WGS84
    crs = 4326,                                           
    # do not change string variables to factors 
    stringsAsFactors = FALSE                              
  )



# view the first 10 rows, first 5 columns, and the the geometry column
combined_sf$geometry

Notice that the combined_sf object this created is not longer a simple data frame, but rather a spatial object. Within combined_sf, there is a geometry column which provides spatially-formatted case locations. We can now plot these points onto our shape_plot object, using the geom_sf() function (sf stands for spatial features).

## plot points on the district shapes
shape_plot + 
  geom_sf(data = combined_sf)+
  labs(title = "Case locations")


## plot points on the district shapes, colored by outcome
shape_plot + 
     geom_sf(data = combined_sf,
             mapping = aes(color = fct_na_value_to_level(outcome)))+
     labs(color = "Outcome",
          title = "Case locations, by outcome") + 
  theme_minimal()

Pretty cool, right! We've now created our first maps of our case linelist data in R.

Notice in the second plot, you can color the cases by "Outcome", and also add a title. What other features could you alter using ggplot syntax that you know?

Choropleths

Next we add a choropleth layer to display case counts for each impacted district.

To do this, we must format our combined object into a data frame with case counts by district. Use the admin3pcod column for our districts. Once we have created a case counts object, we can merge this with our shapefile and then plot the counts using the fill aesthetic.

############# CHOROPLETHS ####################################################

## get counts of cases by district
case_counts <- combined %>%
  count(admin3pcod, name = "counts")



## add case counts to the districts shapefile 
shapefile <- left_join(shapefile, case_counts, by = c("admin3Pcod" = "admin3pcod"))

# look at the districts shapefile and see the new column
View(shapefile)

## plot choropleth 
ggplot() + 
  ## add the shapefile on top
  geom_sf(data = shapefile, 
          # fill by case count
          aes(fill = counts),
          # black borders
          colour = "black")

Base maps

Base maps provide an underlying tile with geographic features like roads, rivers, forests, etc. We must create a "bounding box" based on the geographic extent of the shapefile. This is then used to clip a publicly-available source like Google Maps or Open Street Map. This requires an internet connection.

############ BASE MAPS #########################################################

# get the bounding box for the shapefile 
bounding_box <- shapefile %>% 
  st_bbox()


# plot a base map including scale bar 
basemap <- ggplot() +
  # change the bounding box to an sf object
  # this defines the area to download map tiles for
  geom_sf(data = st_as_sfc(bounding_box)) +
  # download map tiles and add to the plot
  annotation_map_tile(
    # define what map tiles to use
    type =  "cartolight",
    # define folder to store tile images 
    cachedir = here::here("data", "map_tiles"),
    # define if should download tiles each time
    forcedownload = FALSE,
    # hide messages about download status and zoom
    progress = "none" )


# show basemap
basemap

We can now make a ggplot with the linelist data plotted on to the basemap.

# plot cases on top of basemap
basemap + 
  ## add the shapefile on top
  geom_sf(data = shapefile, 
          # no fill
          fill = NA,
          # black borders
          colour = "black") + 
  geom_sf(data = combined_sf,
          mapping = aes(color = fct_explicit_na(outcome)))+
  labs(color = "Outcome",
       title = "Case locations, by outcome")

What if we wanted to plot case locations by outcome and month? Use the facet_wrap() function to format the same plot into monthly facets so we can see cases over time.

# plot cases on top of basemap
basemap + 
  ## add the shapefile on top
  geom_sf(data = shapefile, 
          # no fill
          fill = NA,
          # black borders
          colour = "black") + 
  geom_sf(data = combined_sf,
          mapping = aes(color = fct_na_value_to_level(outcome)))+
  labs(color = "Outcome",
       title = "Case locations, by outcome and month")+
  facet_wrap(~month(date_onset, label = TRUE))

GIS with R is an entire topic and this was a simple demonstration.

Depending on the region, you will need to download shapefiles that represent your region of interest.

Don't forget to save your script!



appliedepi/introexercises documentation built on April 22, 2024, 1:01 a.m.