library(introexercises) # get data for exercises library(learnr) # create lessons from rmd library(gradethis) # evaluate exercises library(dplyr) # wrangle data library(flair) # highlight code library(ggplot2) # visualise data library(lubridate) # work with dates library(forcats) # work with factors library(fontawesome) # for emojis library(scales) # defining axes and units library(stringr) # work with character strings library(apyramid) # creating demographic pyramids library(viridis) # defining colour palettes library(janitor) # clean data library(tsibble) # working with epiweeks library(tidyr) # clean data # library(RMariaDB) # connect to sql database ## set options for exercises and checking --------------------------------------- ## Define how exercises are evaluated gradethis::gradethis_setup( ## note: the below arguments are passed to learnr::tutorial_options ## set the maximum execution time limit in seconds exercise.timelimit = 60, ## set how exercises should be checked (defaults to NULL - individually defined) # exercise.checker = gradethis::grade_learnr ## set whether to pre-evaluate exercises (so users see answers) exercise.eval = FALSE ) # ## event recorder --------------------------------------------------------------- # ## see for details: # ## https://pkgs.rstudio.com/learnr/articles/publishing.html#events # ## https://github.com/dtkaplan/submitr/blob/master/R/make_a_recorder.R # # ## connect to your sql database # sqldtbase <- dbConnect(RMariaDB::MariaDB(), # user = Sys.getenv("userid"), # password = Sys.getenv("pwd"), # dbname = 'excersize_log', # host = "144.126.246.140") # # # ## define a function to collect data # ## note that tutorial_id is defined in YAML # ## you could set the tutorial_version too (by specifying version:) but use package version instead # recorder_function <- function(tutorial_id, tutorial_version, user_id, event, data) { # # ## define a sql query # ## first bracket defines variable names # ## values bracket defines what goes in each variable # event_log <- paste("INSERT INTO responses ( # tutorial_id, # tutorial_version, # date_time, # user_id, # event, # section, # label, # question, # answer, # code, # correct) # VALUES('", tutorial_id, "', # '", tutorial_version, "', # '", format(Sys.time(), "%Y-%M%-%D %H:%M:%S %Z"), "', # '", Sys.getenv("SHINYPROXY_PROXY_ID"), "', # '", event, "', # '", data$section, "', # '", data$label, "', # '", paste0('"', data$question, '"'), "', # '", paste0('"', data$answer, '"'), "', # '", paste0('"', data$code, '"'), "', # '", data$correct, "')", # sep = '') # # # Execute the query on the sqldtbase that we connected to above # rsInsert <- dbSendQuery(sqldtbase, event_log) # # } # # options(tutorial.event_recorder = recorder_function)
# hide non-exercise code chunks ------------------------------------------------ knitr::opts_chunk$set(echo = FALSE)
# data prep -------------------------------------------------------------------- combined <- rio::import(system.file("dat/linelist_combined_20141201.rds", package = "introexercises")) combined <- combined %>% mutate(week_onset = yearweek(date_onset, week_start = 1), ## create week of onset variable week_onset_date = as.Date(week_onset)) ## create a date version # Create hospital weeks for heat plot ##################################### # count cases by hospital-week hospital_weeks_raw <- combined %>% count(hospital, week_onset) # create longer dataset of all possible hospital-weeks expanded <- hospital_weeks_raw %>% select(hospital, week_onset) %>% tidyr::expand(., week_onset, hospital) # merge so that all hospital-weeks are represented in the data hospital_weeks <- hospital_weeks_raw %>% right_join(expanded) %>% mutate(n = replace_na(n, 0)) %>% filter(week_onset >= ymd("2014-06-15")) # # # count_data <- combined %>% # group_by(hospital, date_hospitalisation) %>% # summarize(n_cases = dplyr::n()) %>% # ungroup() # # # # hospitalisation_week <- incidence(combined, # # date_index = date_hospitalisation, # # interval = "weeks") # # demo_agg <- combined %>% # count(age_cat, , name = "cases") %>% # pivot_wider( # id_cols = age_cat, # names_from = , # values_from = cases) %>% # rename(`missing_` = `NA`) # # aggregate_data <- demo_agg %>% # pivot_longer( # col = c(female, male, missing_), # cols to elongate # names_to = "", # name for new col of categories # values_to = "counts") %>% # name for new col of counts # mutate( # = na_if(, "missing_")) # convert "missing_" to NA
Welcome to the course "Introduction to R for applied epidemiology", offered by Applied Epi - a nonprofit organisation and the leading provider of R training, support, and tools to frontline public health practitioners.
knitr::include_graphics("images/logo.png", error = F)
This exercise focuses on more complex plots that are common in public health work, such as heat plots, epidemic curves, and age/sex demographic pyramids.
This will involve using previously explored aspects such as:
ggplot()
%>%
theme_minimal()
scale_y_continuous()
) This exercise guides you through tasks that you should perform in RStudio on your local computer.
There are several ways to get help:
1) Look for the "helpers" (see below)
2) Ask your live course instructor/facilitator for help
3) Schedule a 1-on-1 call with an instructor for "Course Tutoring"
4) Post a question in Applied Epi Community
Here is what those "helpers" will look like:
r fontawesome::fa("lightbulb", fill = "gold")
Click to read a hint
Here you will see a helpful hint!
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
linelist %>% filter( age > 25, district == "Bolo" )
Here is more explanation about why the solution works.
Answering quiz questions will help you to comprehend the material. The answers are not recorded.
To practice, please answer the following questions:
quiz( question_radio("When should I view the red 'helper' code?", answer("After trying to write the code myself", correct = TRUE), answer("Before I try coding", correct = FALSE), correct = "Reviewing best-practice code after trying to write yourself can help you improve", incorrect = "Please attempt the exercise yourself, or use the hint, before viewing the answer." ) )
question_numeric( "How anxious are you about beginning this tutorial - on a scale from 1 (least anxious) to 10 (most anxious)?", answer(10, message = "Try not to worry, we will help you succeed!", correct = T), answer(9, message = "Try not to worry, we will help you succeed!", correct = T), answer(8, message = "Try not to worry, we will help you succeed!", correct = T), answer(7, message = "Try not to worry, we will help you succeed!", correct = T), answer(6, message = "Ok, we will get there together", correct = T), answer(5, message = "Ok, we will get there together", correct = T), answer(4, message = "I like your confidence!", correct = T), answer(3, message = "I like your confidence!", correct = T), answer(2, message = "I like your confidence!", correct = T), answer(1, message = "I like your confidence!", correct = T), allow_retry = TRUE, correct = "Thanks for sharing. ", min = 1, max = 10, step = 1 )
Please email contact@appliedepi.org with questions about the use of these materials.
In this exercise you will:
During this exercise you will be asked to learn new functions by reading their documentation. This is intentional. If you are confused, just ask an instructor for assistance.
First, let us refresh ourselves on the basics of ggplot2!
quiz( question("Where are dynamic aesthetics placed in ggplot code?", allow_retry = TRUE, answer("inside aes()", correct = T), answer("outside aes()", message = "dynamic assignments means the aesthetic is mapped to a column in the data, and the presentation can vary for each row. These must be done inside 'mapping = aes()'") ), question("Are static aesthetics in the initial ggplot() call inherited by subsequent geoms?", allow_retry = TRUE, answer("No", correct = TRUE), answer("Yes", message = "No, for example if you set the size of points equal to 3, the lines will not be size 3 as well.") ), question("Are dynamic aesthetics in the initial ggplot() call inherited by subsequent geoms?", allow_retry = TRUE, answer("No", message = "Yes they are, for example, assigning wt_kg to the X-axis is inherited by all the subsequent geoms."), answer("Yes", correct = TRUE) ), question("Should adjustments to the theme be made before or after setting one of the default themes?", allow_retry = TRUE, answer("before", message = "A complete/default theme will override all previous micro adjustments to the theme"), answer("after", correct = TRUE) ), question("Which of the following are prebuilt themes in ggplot", allow_retry = TRUE, answer("theme_bw()", correct = TRUE), answer("theme_classic()", correct = TRUE), answer("theme_red()"), answer("scale_color_brewer()") ), question("How would you hide a legend in ggplot?", allow_retry = TRUE, answer("theme(legend.title = 'element.blank()')"), answer("theme(legend.position = 'right')"), answer("theme(legend.position(none))"), answer("theme(legend.position = 'none')", correct = TRUE) ), question("How would you set your legend to appear in the centre of your graph?", answer("theme(legend.position = 'middle')"), answer("theme(legend.position = c(0.5,0.5))", correct = TRUE) ) )
Great, now we have refreshed our memory with a quiz, we will start on our first topic, heat plots.
Open your ebola project. Run all the code chunks in your "ebola_sitrep.Rmd", so that you are able to use the data frame combined
.
If you did not complete the joining exercise, or are seeing errors when trying to run your script, you can import a "backup" combined
data frame from the "data/clean/backup/" folder using this command:
combined <- import(here("data", "clean", "backup", "linelist_combined_20141201.rds"))
Talk to an instructor if you have problems running your script!
Add the following packages and load them:
Remember to keep {tidyverse} as the last package in the command.
Your packages chunk should look like this:
# This chunk loads packages pacman::p_load( rio, # for importing data here, # for locating files janitor, # for data cleaning lubridate, # for date cleaning epikit, # for easy inline code scales, # for percents and date labels officer, # border lines flextable, # make HTML tables gtsummary, # for nice tables apyramid, # for age/sex pyramids RColorBrewer, # for colour palettes gghighlight, # highlighting plot parts ggExtra, # special plotting functions viridis, # for color-blind friendly color scales tsibble, # for epiweeks and other time series analyses tidyverse # for data management and visualization )
Add a new "Heat plots" chunk near the bottom of your R Markdown.
# Heat plots
You can also add a heading in the text portion of the R Markdown to deliniate this new content.
Your "ebola_sitrep.Rmd" script should have approximately this outline:
Before we proceed, it is worth covering the creation of "weeks", or "epiweeks" (epidemiological weeks) in your data frame.
In order to create tables of case counts by week, you may need to create a new column that contains the "epiweek", based on a date such as date_onset
.
There are many different ways to define what a "week" is. Likewise, some countries formulate epiweeks differently (e.g. starting on Sundays, Mondays, or Fridays). Most of the world uses weeks that begin on Mondays.
Below, we present the most common approach and provide you with a reference sheet for the rest (in the "intro_course/learning materials" folder).
We recommend the {tsibble} package for creating epidemic "Week" values. This package is often used for time series analysis and has a variety of useful time/date functions.
Its function yearweek()
conveniently produces weeks from a provided date column. You can also designate the week_start =
argument, where 1 is for Monday, 7 is for Sunday, etc.
Putting yearweek()
inside mutate()
, we can create a new column of class "yearweek", called week_onset
, which will display values like "2014 W44". Importantly, these can be used to bin the rows into discrete weeks for an epidemic curve or another table/plot.
Do NOT add this code to your script, we are simply showing how yearweek()
alters the column format.
# new column for week of onset (as year-week) combined %>% mutate(week_onset = yearweek(date_onset, week_start = 1))
The values of week_onset
will look like this:
combined %>% mutate(week_onset = yearweek(date_onset, week_start = 1)) %>% pull(week_onset) %>% head()
For maximum flexibility, we now create a second column that will be a normal date class, displaying the start date of the week.
We create it by wrapping the week_onset
(class "yearweek") in the {base} function as.Date()
. The "weeks" will be converted to dates. If you selected week_start = 1
(Mondays), then the date returned will be the Monday of that week (rounding down). This is saved as a new column, week_onset_date
.
Do not add this code to your script, it is simply an example.
# new column for week of onset (as a date) combined %>% mutate(week_onset = yearweek(date_onset, week_start = 1)) %>% mutate(week_onset_date = as.Date(week_onset))
The values of week_onset_date
will look like this:
combined %>% mutate(week_onset = yearweek(date_onset, week_start = 1)) %>% mutate(week_onset_date = as.Date(week_onset)) %>% pull(week_onset_date) %>% head()
For now, add the below mutate()
command at the end of your "Joining" code chunk, before combined
is exported as a .rds file. Later, you may choose to integrate these commands into the cleaning pipe chain of the surveillance dataset instead.
Re-run the "Joining" command so that the changes are implemented in combined
.
mutate()
stylemutate()
allows more than one statement within its parentheses, separated by commas. In an optional step, you can insert both changes within the same mutate()
command.
combined <- combined %>% mutate(week_onset = yearweek(date_onset, week_start = 1), ## create week of onset variable week_onset_date = as.Date(week_onset)) ## create a date version
Now we have two versions of "epiweek" columns that we can use to make plots and tables!
Heat plots, also called "heat tiles", are useful visualizations when trying to display 3 variables (x-axis, y-axis and fill).
For instance, you may want to look at a breakdown of how many cases were reported by week across several hospitals, to get an idea of how the epidemic has progressed in several places. Let's see how this works.
Let's say that we have a data frame called hospital_weeks
, which has the following columns (you will create this in a moment):
head(hospital_weeks) # print the first 6 rows
This data is in "long" format, and every possible hospital-week has a row.
Create this data frame in your "Heat plots" code chunk using this code below. Do not try to understand this code (try our advanced course in data management). In essence, it uses count()
to create a data frame of case counts by week, then creates a data frame with all possible hospital-weeks, and then joins them together.
# count cases by hospital-week hospital_weeks_raw <- combined %>% count(hospital, week_onset_date) # create longer dataset of all possible hospital-weeks expanded <- hospital_weeks_raw %>% select(hospital, week_onset_date) %>% tidyr::expand(., week_onset_date, hospital) # merge so that all hospital-weeks are represented in the data hospital_weeks <- hospital_weeks_raw %>% right_join(expanded) %>% mutate(n = replace_na(n, 0)) %>% filter(week_onset_date >= ymd("2014-06-15"))
r fontawesome::fa("eye", fill = "darkblue")
Take a few minutes to review the documentation for the function geom_tile()
, which is from {ggplot2}. You can access this by either:
1) Entering ?geom_tile
in the Console, or
2) Click to the "Help" pane (lower-right) and entering geom_tile
in the search bar
What ggplot()
code would you write to create a heat plot that:
Add a heatplot to your "Heat plots" code chunk. Remember the format of ggplot()
, where we use the +
to add geom
layers to the ggplot()
. Don't forget to use the argument mapping = aes(x = , y = , fill = )
in your ggplot()
function.
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
ggplot(data = hospital_weeks, mapping = aes(x = week_onset_date, y = hospital, fill = n)) + geom_tile()
quiz( question("Which hospital reported the most cases with onset in September?", allow_retry = TRUE, answer("SMMH"), answer("Port Hospital", correct = T), answer("Military Hospital"), answer("Central Hospital"), answer("Unknown") ), question("Which hospital reported the highest case fatality ratio?", allow_retry = TRUE, answer("SMMH", message = "It is not possible to know the case fatality ratio from this plot"), answer("Port Hospital", message = "It is not possible to know the case fatality ratio from this plot"), answer("Military Hospital", message = "It is not possible to know the case fatality ratio from this plot"), answer("Central Hospital", message = "It is not possible to know the case fatality ratio from this plot"), answer("Unknown", correct = T) ) )
The plot is nice, but let us do a few more things to refine the presentation.
Update your heat plot code with the following:
1) Modify hospital_weeks
to create a new data frame: hospital_weeks_adjusted
. In this data frame, ensure that NA
values in the column hospital
are re-coded as "Missing" and this column is class "factor" with the levels in this order : "Central Hospital", "Military Hospital", "Port Hospital", "SMMH", "Other", "Missing".
* You can convert NA
to "Missing" using fct_na_value_to_level()
from {forcats} within mutate()
. Make sure to use the level =
argument to change NA
to Missing
with quotations around Missing
.
* You can then use fct_relevel()
also from {forcats}, in a subsequent mutate()
to provide the desired level order
r fontawesome::fa("lightbulb", fill = "gold")
Click to read a hint
Within a mutate()
command, use the {forcats} package and its function fct_na_value_to_level()
to re-define the column hospital
to convert NA
to "Missing" (this also converts the column to class "factor"). Within the fct_na_value_to_level()
, include level = "Missing"
to reassign NA
to the word "Missing".
Pipe this into another mutate command, and use the function fct_relevel()
to adjust the ordering of the levels.
If you need a refresher on how to change NA
values in factor columns, and how to reorder, checkout the R Handbook chapter on factors.
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
hospital_weeks_adjusted <- hospital_weeks %>% mutate(hospital = fct_na_value_to_level(hospital, level = "Missing"), hospital = fct_relevel( hospital, "Central Hospital", "Military Hospital", "Port Hospital", "SMMH", "Other", "Missing"))
2) In the ggplot of the new adjusted data frame, change the color scheme to a diverging color scale. Provide a scale that extends from "skyblue" at low values to "tomato" at high values.
r fontawesome::fa("lightbulb", fill = "gold")
Click to read a hint
Add scale_fill_gradient()
to the ggplot()
command. See the help documentation with ?scale_fill_gradient
to understand the arguments.
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
ggplot( data = hospital_weeks_adjusted, mapping = aes( x = week_onset_date, y = hospital, fill = n)) + geom_tile()+ scale_fill_gradient(low = "skyblue", high = "tomato")
3) The default x-axis ticks show each month. Change these to efficiently show every two weeks.
label_date_short()
function from {scales}, which you learned in the scales ggplot module. r fontawesome::fa("lightbulb", fill = "gold")
Click to read a hint
Add scale_x_date()
to the bottom of the ggplot, and provide the arguments date_breaks = "2 weeks"
(for example), and labels = scales::label_date_short()
.
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
ggplot( data = hospital_weeks_adjusted, mapping = aes( x = week_onset_date, y = hospital, fill = n)) + geom_tile()+ scale_fill_gradient(low = "skyblue", high = "tomato")+ scale_x_date( date_breaks = "2 weeks", labels = scales::label_date_short() )
4) Make the labels for the axes and legend more presentable.
r fontawesome::fa("lightbulb", fill = "gold")
Click to read a hint
Adjust the labs()
. Recall that the legend title in this case is edited with fill =
because it was created by the fill.
Below is all the solution code together:
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
# Edit the dataset hospital_weeks_adjusted <- hospital_weeks %>% mutate(hospital = fct_na_value_to_level(hospital, level = "Missing"), hospital = fct_relevel( hospital, "Central Hospital", "Military Hospital", "Port Hospital", "SMMH", "Other", "Missing")) # Plot with adjustments ggplot(data = hospital_weeks_adjusted) + geom_tile(mapping = aes( x = week_onset_date, y = hospital, fill = n))+ scale_fill_gradient(low = "skyblue", high = "tomato")+ scale_x_date( date_breaks = "2 weeks", labels = scales::label_date_short())+ labs( x = "Week of symptom onset", y = "Hospital", fill = "Number of\nweekly cases")+ theme_minimal()
In the solution code, notice how we can use \n
to split our labels onto multiple lines. This is a very useful addition to our labels! Particularly if we want to format plots so they fit nicely in our report.
Note that to make the heat plot using geom_tile()
, you had to provide a data frame with counts - aggregated data (not a linelist). There are ways to create heat plots from linelist data (see geom_density()
functions) but those will not be covered today.
Now that you are familiar with the heat tile plotting syntax, we will move to an exercise where you create the counts data frame.
Try to do the following on your own. It will be challenging, but this is good exposure to intermediate-level R programming.
First, create a new code chunk for an age and outcome summary, below your heat plots chunk. Do the following:
1) Use combined
to create a new data frame named age_outcome_summary
.
* Drop any rows with NA
values by piping to drop_na(sex, age_cat)
.
* Use group_by()
and summarise()
to show summary counts for each unique combination of age_cat
and sex
.
* In addition to case counts, include a column showing the number of rows in each group whose outcome
is equal to "Death".
* Finally, include a column calculating the proportion in each group equal to "Death".
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
# make new data frame summarizing age, sex, and deaths age_outcome_summary <- combined %>% drop_na(sex, age_cat) %>% # remove any NA values group_by(age_cat, sex) %>% # group rows by age- groups summarise( # begin creating new summary columns n = n(), n_death = sum(outcome == "Death", na.rm = TRUE), # number of rows with Death outcome pct_death = n_death / n) # create proportion dead in the group
2) Create a heat plot using this new data frame. Make the tiles filled according to the percent of Death outcomes, as displayed by sex
and age_cat
.
Here are some extra challenges, if you have time:
"Percent of\ncases fatal"
scale_fill_viridis()
and pick a color scheme (e.g. use argument option = "B"
), and use limits
argument to set the scale limits = c(0, 1)
to align with the proportions.direction = -1
within scale_fill_viridis()
to inverse the direction of the color palette, so highest percent are dark colors and lowest are light colors. geom_text()
to display the percent on each tile. Read the documentation for geom_text
first. labels = scales::percent
within aes()
. Round to one decimal place with the argument accuracy = 0.1
. Add this to your "Age outcome summary" code chunk,
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
# make plot ggplot(data = age_outcome_summary, # use new data frame mapping = aes(x = sex, # on x-axis y = age_cat, # age category on y-axis fill = pct_death)) + # fill (color) is shaded by the proportion dead geom_tile() + # display data as tiles scale_fill_viridis( # adjust colors scale option = "B", # pick any option limits = c(0, 1), # set the limits to go from 0 to 1 labels = scales::percent, # adjust legend values to be percents direction = -1) + # reverses color gradient direction geom_text( # add text over the tiles mapping = aes( label = scales::percent( # show percents instead of proportions pct_death, accuracy = 0.1)))+ labs(x = "Sex", # add labels y = "Age categories", fill = "Percent of\ncases fatal")
quiz( question("Who has the highest proportion of 'Death' in combined gender and age category?", answer("Male and 20-29", correct = TRUE), answer("Female and 20-29"), answer("Male and 50-59"), answer("Female and 40-49") ), question("Which has the oldest ages?", answer("Female"), answer("Male", correct = TRUE) ) )
Ok, good job! You have managed to build summary datasets for a heat plot and then customize the plot so that you can analyze the information.
Even if you did not know how to do all of those edits to the ggplot, the purpose of that task was to expose you to a new plot and customization possibilities.
You can always find sample code in the Heat Plots chapter of the Epi R Handbook, and become more well-practiced later.
An epidemic curve (also known as an “epi curve”) is a core epidemiological chart typically used to visualize the temporal pattern of illness onset among a cluster or epidemic of cases.
Analysis of the epicurve can reveal temporal trends, outliers, the magnitude of the outbreak, the most likely time period of exposure, time intervals between case generations, and can even help identify the mode of transmission of an unidentified disease (e.g. point source, continuous common source, person-to-person propagation).
We are going to build epidemic curves with {ggplot2}, which allows for advanced customizability. There is another option, the {incidence2} package as described in the Epi R Handbook, which is perhaps more simple but less customizeable.
As you have been through several exercises using {ggplot2}, you are now familiar with the syntax and customization it allows (themes, labels, axes, etc).
When deciding to make an epidemic curve, you must first decide whether you want to display the "week" on the X-axis (e.g. "2014 W35") or some variety of calendar date.
In its most basic form, plotting epidemic curves in {ggplot2} is a case of using the ggplot function geom_histogram()
to show the distribution of the date values. However, there are 3 main things to watch out for:
Create a new section and code chunk for "Epicurves" below the "Age outcomes summary".
Make a simple histogram from the data frame combined
which shows the distribution of date_onset
using 5-day intervals? The histogram "binwidth" can be specified as a number (as a static aesthetic) to binwidth =
. Also, clean up the labels and use theme_bw()
.
Add this to your "Epicurves" code chunk. Remember, you can use your "Testing area" code chunk for any code testing you need to do first!
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
ggplot(data = combined, mapping = aes(x = date_onset)) + geom_histogram(binwidth = 5) + labs(x = "Date of onset", y = "Incidence") + theme_bw()
Plotting using bin_width()
will start the first bin at the first case. As epidemiologists and public health practitioners we typically need to produce an epidemic curve that adheres to very particular specifications. For example, a 7-day week that starts on a particular day. Likewise, this approach does not work for monthly bins, because months can be 28, 29, 30 or 31 days!
Instead of using binwidth =
, we specify the histogram bars with breaks =
.
breaks =
expects a vector of dates to use as breaks (cut-points).
You know that a vector can be made using c()
, for example c(54, 22, 89)
or c("Paris", "Delhi", "Kigali")
.
Similarly, you could make a vector of dates like: c(ymd("2014-06-15"), ymd("2014-06-22"), ymd("2014-06-29"))
.
But this would be very long code to include all the weekly break points in this outbreak!
Instead, use the {base} function seq.Date()
to automatically create a sequence of dates between a from =
value, to a to =
value, and in increments specified to by =
. You can specify by
values such as "day", "week", "month", etc.
The sequence below uses dates of the first and last case onsets. Write this command in your Testing chunk and run it to see the output.
seq.Date(from = ymd("2014-05-06"), to = ymd("2014-11-28"), by = "week")
This can be integrated into the epicurve command. Try it if you want to, but know that we will be changing it later.
ggplot(data = combined, aes(x = date_onset))+ geom_histogram( breaks = seq.Date( from = ymd("2014-05-06"), to = ymd("2014-11-28"), by = "week")) + labs(x = "Date of onset", y = "Incidence") + theme_bw()
Now we are confident that the breaks are using 7-day intervals starting and ending on a specific date. But typically we need the intervals to start on Mondays (or some other specified day).
To round dates, we suggest using floor_date()
and it's companion ceiling_date()
from the {lubridate} package.
Try these codes in your Testing chunk. They simply showcase how these functions work in isolation.
# Monday BEFORE the earliest case floor_date(ymd("2014-05-06"), unit = "week", week_start = 1) # Monday AFTER the last case ceiling_date(ymd("2014-11-28"), unit = "week", week_start = 1)
By integrating these into the seq.Date()
command, we can have a weekly vector starting on the Monday before the earliest case.
Adapt your seq.Date()
command to use floor_date()
and ceiling_date()
as below.
# Sequence of Mondays from before earliest case, to after latest case seq.Date(from = floor_date(ymd("2014-05-06"), unit = "week", week_start = 1), to = ceiling_date(ymd("2014-11-28"), unit = "week", week_start = 1), by = "week")
Lastly, what happens if our data are refreshed or updated with new records? Those static dates in our code will not be accurate!
Replace the static dates with min()
and max()
, using the column combined$date_onset
. Because this is not a {dplyr} pipe, we need to specify the data frame and column with the $
index operator. Don't forget that these functions need na.rm = TRUE
to remove missing values.
seq.Date(from = floor_date(min(combined$date_onset, na.rm=T), unit = "week", week_start = 1), to = ceiling_date(max(combined$date_onset, na.rm=T), unit = "week", week_start = 1), by = "week")
In this way, we have made our code more dynamic. Dynamic code ensures that all the plots and tables update automatically.
The easiest way to organize this command is to separate the seq.Date()
command from the ggplot()
command.
Pull out your seq.Date()
command and move it before the ggplot()
, saving it with a name like ebola_weeks
that is provided to the breaks =
argument in the geom_histogram()
.
# Define and save the vector ebola_weeks <- seq.Date( from = floor_date(min(combined$date_onset, na.rm=T), unit = "week", week_start = 1), to = ceiling_date(max(combined$date_onset, na.rm=T), unit = "week", week_start = 1), by = "week")
One final change before we finish is to add closed = "left"
to geom_histogram()
, which makes the bars also count the cases reported on the break days themselves.
# Define and save the vector ebola_weeks <- seq.Date( from = floor_date(min(combined$date_onset, na.rm=T), unit = "week", week_start = 1), to = ceiling_date(max(combined$date_onset, na.rm=T), unit = "week", week_start = 1), by = "week") # Run the plot, using the vector ggplot(data = combined, aes(x = date_onset)) + geom_histogram( breaks = ebola_weeks, closed = "left") + labs(x = "Date of onset", y = "Incidence") + theme_bw()
Compare this plot to the histogram printed in the exercise above (7-days, no specified breaks). Do you see any small differences in how the cases have been binned? (look at the beginning of the epidemic).
The above can seem like a lot of work to just produce a list of dates! But realise the control that you have over the output. And, now that you have this code, it will be easy to modify. All this is also in the Epi R Handbook chapter on Epidemic Curves for easy reference!
Now, make a new epidemic curve in your Epidemic Curves chunk, in which the histogram bin breaks are by month from the month-start prior to the first case.
Once you have done this, adjust the code so that the fill
in the bars is by hospital, and the x-axis labels appear every month.
r fontawesome::fa("lightbulb", fill = "gold")
Click to read a hint
As we want the values to be aggregated by calendar month, do the following:
unit =
to "month" in the two {lubridate} functions and remove the week_start
arguments (check ?floor_date
for details). by =
argument of seq.Date()
to "month" Change the name of the vector ebola_weeks
to ebola_months
, and change it in the ggplot()
command too.
To set the fill, put fill = hospital
in the aesthetic mappings
scale_x_date()
: labels = label_date_short()
date_breaks = "months"
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
# define and save the vector (by month) ebola_months <- seq.Date( from = floor_date(min(combined$date_onset, na.rm=T), unit = "month"), to = ceiling_date(max(combined$date_onset, na.rm=T), unit = "month"), by = "month") # run the plot with monthly breaks, and fill ggplot(data = combined, mapping = aes(x = date_onset, fill = hospital)) + geom_histogram(breaks = ebola_months, closed = "left") + scale_x_date( date_breaks = "months", labels = scales::label_date_short())+ labs(x = "Date of onset", y = "Incidence") + theme_bw()
It may be difficult to read this epicurve and to understand the case trends from each hospital.
Faceting the plots means creating "small-multiple" (mini plots) for each unique value in a column (e.g. hospital
).
You create facets by adding facet_wrap()
to the ggplot with a +
, and writing a tilde ~ with the name of the column to use. For example:
# run the plot with monthly breaks, and fill ggplot(data = combined, mapping = aes(x = date_onset, fill = hospital)) + geom_histogram() + facet_wrap(~sex)
Now, create facets based on hospital
for your epidemic curve. Add this to your R Markdown by updating the code in your "Epicurves" code chunk.
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
ggplot(data = combined, aes(x = date_onset, fill = hospital)) + geom_histogram(breaks = ebola_months, closed = "left") + scale_x_date( date_breaks = "months", labels = scales::label_date_short())+ labs(x = "Date of onset", y = "Incidence") + theme_bw()+ facet_wrap(~hospital)
There are MANY other adjustments that you can make to your epidemic curve made with {ggplot2}. Many of them are outlined in the Epidemic curves chapter of the Epi R Handbook.
If you want to display "week numbers", leverage the fact that you created the column week_onset
at the beginning of this module. Still, there will be several steps:
1) Create a vector of epiweek values (similar to what we created in the previous section, but written like "2014 W35")
2) Create a second vector of date values, from the epiweek values
3) Use count()
on the discrete "yearweek" column (e.g. week_onset
) to get counts by week
3) Plot with geom_col()
as shown below
Add another section and code chunk within the "Epicurves" section of your R Markdown script for "Epicurve with week numbers".
Below, we create a vector using seq()
- not seq.Date()
, because these values are special week values, not dates.
## create vector of the unique weeks which occur all_weeks <- seq(min(combined$week_onset, na.rm = TRUE), max(combined$week_onset, na.rm = TRUE), by = 1) all_weeks
Then we make a second vector with these weeks transformed to dates:
## change weeks to dates all_weeks_date <- as.Date(all_weeks) all_weeks_date
Then we use count()
to get counts by epiweek. The as_tsibble()
and fill_gaps(n = 0)
are from the {tsibble} package and ensure that weeks with no cases are represented by 0, not dropped.
counts <- combined %>% filter(!is.na(week_onset_date)) %>% count(week_onset_date) %>% as_tsibble() %>% # ensure that all weeks are present in the table fill_gaps(n = 0) # filled in with 0 as necessary
Finally, we plot the counts using geom_col()
. Note:
scale_x_date()
, the breaks are defined as the vector of dates because it is a date axis ## plot counts %>% ggplot(aes(x = week_onset_date, y = n)) + # n is the counts geom_col(width = 7) + # for full-width weekly columns theme(axis.text.x = element_text(angle = 90)) + # pivot the labels 90 degrees scale_x_date( breaks = all_weeks_date, labels = all_weeks)
Age pyramids are a useful way of illustrating demographics, and can be customised in a variety of different ways. For our purposes, we will be using the function age_pyramid()
from the apyramid package.
Create a new "Age pyramid" section heading and code chunk below your "Epicurves" section.
It is good to practice learning a function by reading its documentation. The function age_pyramid()
is fairly simple, in that it produces high quality age pyramids with relatively few arguments.
Look up the documentation for the function (?age_pyramid
) and review the arguments and examples.
Now create an age pyramid using the combined
data frame and the age_cat
categories, that is split by sex
. Add this to a new "Age pyramids" code chunk.
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
age_pyramid(data = combined, age_group = age_cat, split_by = sex)
You can also plot the values by proportion, rather than count, and include a column for missing data.
Can you now, after reviewing the documentation, create an age pyramid of sex that includes missing (NA
) values and plots the proportions?
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
age_pyramid(data = combined, age_group = age_cat, proportional = TRUE, na.rm = FALSE)
In the previous example, the data is in a linelist format, where each row is a unique observation. However, you may be given data that is already aggregated into counts.
Just to practice, create this dataset demo_wide_counts
by running the following command in a new code chunk.
# Creates the object demo_agg ############################# demo_wide_counts <- structure(list( age_cat = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, NA), .Label = c("0-9", "10-19", "20-29", "30-39", "40-49", "50-59", "60-69", "70+"), class = "factor"), female = c(149L, 104L, 55L, 15L, 4L, 1L, NA, NA, NA), male = c(96L, 80L, 65L, 39L, 19L, 5L, 5L, 1L, NA), missing_sex = c(5L, 4L, 4L, 2L, NA, NA, 1L, NA, 26L)), row.names = c(NA, -9L), class = c("tbl_df", "tbl", "data.frame"))
It should look like this:
demo_wide_counts
To be acceptable to age_pyramid()
the data must be in "long" format, so first, you would need to pivot longer so that the information is in one column (remember tidy data!) and the counts are in one column.
You could use this command:
demo_long_counts <- demo_wide_counts %>% pivot_longer( col = c(female, male, missing_sex), # cols to elongate names_to = "sex", # name for new col of categories values_to = "counts") %>% # name for new col of counts mutate( sex = na_if(sex, "missing_sex")) # convert "missing_" to NA
Which results in:
demo_long_counts
Now, use what is written in the age_pyramid()
documentation to adjust the plotting command to accept counts.
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
age_pyramid(data = demo_long_counts, age_group = age_cat, split_by = sex, count = counts)
Let's try this out with our combined
data frame object. Create an age pyramid stratified by sex, add this to the "Age pyramid" code chunk:
# Age pyramid with ggplot modifications age_pyramid( data = combined, age_group = age_cat, split_by = sex, proportional = TRUE, show_midpoint = FALSE)
You can also easily add ggplot commands using the +
to {apyramid} age pyramid to add labels and adjust the color scale.
Try this out now:
# Age pyramid with ggplot modifications age_pyramid( data = combined, age_group = age_cat, split_by = sex, proportional = TRUE, show_midpoint = FALSE)+ theme_minimal()+ scale_fill_brewer(type = "qual", palette = 2)+ labs(title = "Age and sex of confirmed cases", y = "Proportion of all cases", x = "Age group", caption = "Ebola outbreak", fill = "Sex")
Note, because the ggplot()
function in the {apyramid} function age_pyramid()
uses coord_flip()
, when adding labels on an age_pyramid()
plot, you need to supply the x =
and y =
labels to the opposite axis of where you would like them to appear.
Congratulations!
You have finished the Ebola case study and learned A LOT of R coding along the way!
You have learned a lot:
Make sure to save and knit your R Markdown script.
Consider adding more written content between the plots and tables, interpreting the results. Decide which plots to keep or exclude. The choice is up to you!
If you want to work through a GIS example with your ebola dataset, proceed to the Extra below...
The following is a brief demonstration of making maps with R.
You can use {ggplot2} to relatively easily produce maps and conduct spatial analyses.
The Ebola linelist includes information on latitude and longitude for each case (lat
and lon
columns). There are shape files (map layers) in the "ebola/data/shp" folder. We will not provide a long explanation of GIS or shapefiles, however, this exercise will demonstrate plotting the linelist data in simple maps.
If you are interested in GIS with R, ask an instructor about our advanced course.
Open a new R script. You can save the R script as "ebola_GIS_analyses.R" in your "ebola" project and in the "scripts" subfolder.
############################################# # GIS Ebola example # Bonus section # Your NAME here #############################################
You will first need to load specific packages for mapping:
Add them to your R script, and remember to include a section heading.
# Load packages ----------------------------------------------------------- pacman::p_load( rio, here, sf, # for working with geospatial data ggspatial, # for base maps and north arrows raster, # for spatial formatting tidyverse )
Next, import the required data. Use the combined
linelist first. Then import shapefiles using the special read_sf()
function.
# Import data ------------------------------------------------------------- # import cleaned and joined case linelist combined <- import(here("data", "clean", "backup", "linelist_combined_20141201.rds")) ## import shapefile of administrative boundaries shapefile <- read_sf(here("data", "shp", "sle_adm3.shp"))
Before plotting we have some data cleaning to do. Create an object called districts
to identify the distinct district values present in the linelist. This will allow us to filter the shapefiles to only keep districts with cases present.
# Filter data ------------------------------------------------------------- ## districts we are interested in districts <- combined %>% distinct(admin3pcod) %>% # identify distinct values present in admin3pcod column drop_na() %>% # remove NA pull() # pulls out the distinct values ## filter shapefile for districts of interest shapefile <- shapefile %>% filter(admin3Pcod %in% districts)
Now that you have filtered the shapefile to the outbreak region, create a basic ggplot using the admin boundaries.
# Basic plot of the district shapes --------------------------------------- ## open up a ggplot shape_plot <- ggplot() + ## add the shapefile on top geom_sf(data = shapefile, fill = NA, # no fill colour = "black") # black borders # print shape_plot
Here we assign the map to the name shape_plot
. Because we assign it to this name, it will not automatically print to the viewer pane. To view the plot run the object's name. This is the difference between saving an output (using <-
to create it) and printing it.
This map will become a foundational layer of our plot. Just like we can layer geoms in non-map ggplots, this shape_plot
will be the first layer in our map plots.
We will go through 3 simple plotting examples:
To plot points, remove rows with NA
s present in lon
and lat
columns. Next, define the coordinate reference system (CRS). We will not go into detail about this here, but be aware that these vary by region. See the Epi R Handbook chapter on GIS for more details.
############# POINTS ########################################################## combined_sf <- combined %>% drop_na(lat, lon) %>% st_as_sf( # define the coordinates based on lat/long variables coords = c("lon", "lat"), # set the coordinate reference system to WGS84 crs = 4326, # do not change string variables to factors stringsAsFactors = FALSE ) # view the first 10 rows, first 5 columns, and the the geometry column combined_sf$geometry
Notice that the combined_sf
object this created is not longer a simple data frame, but rather a spatial object. Within combined_sf
, there is a geometry
column which provides spatially-formatted case locations. We can now plot these points onto our shape_plot
object, using the geom_sf()
function (sf stands for spatial features).
## plot points on the district shapes shape_plot + geom_sf(data = combined_sf)+ labs(title = "Case locations") ## plot points on the district shapes, colored by outcome shape_plot + geom_sf(data = combined_sf, mapping = aes(color = fct_na_value_to_level(outcome)))+ labs(color = "Outcome", title = "Case locations, by outcome") + theme_minimal()
Pretty cool, right! We've now created our first maps of our case linelist data in R.
Notice in the second plot, you can color the cases by "Outcome", and also add a title. What other features could you alter using ggplot syntax that you know?
Next we add a choropleth layer to display case counts for each impacted district.
To do this, we must format our combined
object into a data frame with case counts by district. Use the admin3pcod
column for our districts. Once we have created a case counts object, we can merge this with our shapefile and then plot the counts using the fill aesthetic.
############# CHOROPLETHS #################################################### ## get counts of cases by district case_counts <- combined %>% count(admin3pcod, name = "counts") ## add case counts to the districts shapefile shapefile <- left_join(shapefile, case_counts, by = c("admin3Pcod" = "admin3pcod")) # look at the districts shapefile and see the new column View(shapefile) ## plot choropleth ggplot() + ## add the shapefile on top geom_sf(data = shapefile, # fill by case count aes(fill = counts), # black borders colour = "black")
Base maps provide an underlying tile with geographic features like roads, rivers, forests, etc. We must create a "bounding box" based on the geographic extent of the shapefile. This is then used to clip a publicly-available source like Google Maps or Open Street Map. This requires an internet connection.
############ BASE MAPS ######################################################### # get the bounding box for the shapefile bounding_box <- shapefile %>% st_bbox() # plot a base map including scale bar basemap <- ggplot() + # change the bounding box to an sf object # this defines the area to download map tiles for geom_sf(data = st_as_sfc(bounding_box)) + # download map tiles and add to the plot annotation_map_tile( # define what map tiles to use type = "cartolight", # define folder to store tile images cachedir = here::here("data", "map_tiles"), # define if should download tiles each time forcedownload = FALSE, # hide messages about download status and zoom progress = "none" ) # show basemap basemap
We can now make a ggplot with the linelist data plotted on to the basemap.
# plot cases on top of basemap basemap + ## add the shapefile on top geom_sf(data = shapefile, # no fill fill = NA, # black borders colour = "black") + geom_sf(data = combined_sf, mapping = aes(color = fct_explicit_na(outcome)))+ labs(color = "Outcome", title = "Case locations, by outcome")
What if we wanted to plot case locations by outcome and month? Use the facet_wrap()
function to format the same plot into monthly facets so we can see cases over time.
# plot cases on top of basemap basemap + ## add the shapefile on top geom_sf(data = shapefile, # no fill fill = NA, # black borders colour = "black") + geom_sf(data = combined_sf, mapping = aes(color = fct_na_value_to_level(outcome)))+ labs(color = "Outcome", title = "Case locations, by outcome and month")+ facet_wrap(~month(date_onset, label = TRUE))
GIS with R is an entire topic and this was a simple demonstration.
Depending on the region, you will need to download shapefiles that represent your region of interest.
Don't forget to save your script!
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.