# load packages ----------------------------------------------------------------
library(introexercises) # get data for exercises
library(learnr)         # create lessons from rmd
library(gradethis)      # evaluate exercises
library(dplyr)          # wrangle data
library(flair)          # highlight data
library(ggplot2)        # visualise data
library(gghighlight)    # emphasise parts of visualisations
library(janitor)        # clean data
library(fontawesome)    # for emojis
library(scales)         # defining axes and units
# library(RMariaDB)     # connect to sql database

## set options for exercises and checking ---------------------------------------

## Define how exercises are evaluated
gradethis::gradethis_setup(
  ## note: the below arguments are passed to learnr::tutorial_options
  ## set the maximum execution time limit in seconds
  exercise.timelimit = 60,
  ## set how exercises should be checked (defaults to NULL - individually defined)
  # exercise.checker = gradethis::grade_learnr
  ## set whether to pre-evaluate exercises (so users see answers)
  exercise.eval = FALSE
)

# NOTE(review): the event recorder below is disabled (fully commented out).
# Before re-enabling it, confirm the timestamp format string: "%M" is minutes
# and "%D" is m/d/y, so "%Y-%M%-%D" does not look like the intended
# year-month-day. Also consider a parameterised query instead of pasting
# values into the SQL text.

# ## event recorder ---------------------------------------------------------------
# ## see for details:
# ## https://pkgs.rstudio.com/learnr/articles/publishing.html#events
# ## https://github.com/dtkaplan/submitr/blob/master/R/make_a_recorder.R
#
# ## connect to your sql database
# sqldtbase <- dbConnect(RMariaDB::MariaDB(),
#                        user = Sys.getenv("userid"),
#                        password = Sys.getenv("pwd"),
#                        dbname = 'excersize_log',
#                        host = "144.126.246.140")
#
#
# ## define a function to collect data
# ## note that tutorial_id is defined in YAML
# ## you could set the tutorial_version too (by specifying version:) but use package version instead
# recorder_function <- function(tutorial_id, tutorial_version, user_id, event, data) {
#
#   ## define a sql query
#   ## first bracket defines variable names
#   ## values bracket defines what goes in each variable
#   event_log <- paste("INSERT INTO responses (
#                        tutorial_id,
#                        tutorial_version,
#                        date_time,
#                        user_id,
#                        event,
#                        section,
#                        label,
#                        question,
#                        answer,
#                        code,
#                        correct)
#                        VALUES('", tutorial_id, "',
#                        '", tutorial_version, "',
#                        '", format(Sys.time(), "%Y-%M%-%D %H:%M:%S %Z"), "',
#                        '", Sys.getenv("SHINYPROXY_PROXY_ID"), "',
#                        '", event, "',
#                        '", data$section, "',
#                        '", data$label, "',
#                        '", paste0('"', data$question, '"'), "',
#                        '", paste0('"', data$answer, '"'), "',
#                        '", paste0('"', data$code, '"'), "',
#                        '", data$correct, "')",
#                        sep = '')
#
#   # Execute the query on the sqldtbase that we connected to above
#   rsInsert <- dbSendQuery(sqldtbase, event_log)
#
# }
#
# options(tutorial.event_recorder = recorder_function)
# hide non-exercise code chunks ------------------------------------------------
# Set the default chunk option echo = FALSE, so chunk source code is not shown
# unless a chunk overrides the option.
knitr::opts_chunk$set(echo = FALSE)
# data prep --------------------------------------------------------------------
# Import the clean surveillance linelist that ships inside {introexercises}.
# mustWork = TRUE makes system.file() stop with a clear error if the file is
# not found, instead of returning "" and handing an empty path to rio::import().
surv <- rio::import(
  system.file(
    "dat/surveillance_linelist_clean_20141201.rds",
    package = "introexercises",
    mustWork = TRUE
  )
)
Welcome to the course "Introduction to R for applied epidemiology", offered by Applied Epi - a nonprofit organisation and the leading provider of R training, support, and tools to frontline public health practitioners.
# Show the course logo. error = FALSE stops knitr from raising an error if the
# image file cannot be found. Written as FALSE rather than F, because F is an
# ordinary variable that can be reassigned.
knitr::include_graphics("images/logo.png", error = FALSE)
This exercise focuses on the fundamentals of using {ggplot2} for data visualization.
This exercise guides you through tasks that you should perform in RStudio on your local computer.
There are several ways to get help:
1) Look for the "helpers" (see below)
2) Ask your live course instructor/facilitator for help
3) Schedule a 1-on-1 call with an instructor for "Course Tutoring"
4) Post a question in Applied Epi Community
Here is what those "helpers" will look like:
r fontawesome::fa("lightbulb", fill = "gold")
Click to read a hint
Here you will see a helpful hint!
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
# Example solution: keep rows where age is above 25 and district is "Bolo"
linelist %>%
  filter(
    age > 25,
    district == "Bolo"
  )
Here is more explanation about why the solution works.
Answering quiz questions will help you to comprehend the material. The answers are not recorded.
To practice, please answer the following questions:
# Practice quiz: when to open the red "solution" helper
quiz(
  question_radio(
    "When should I view the red 'helper' code?",
    answer("After trying to write the code myself", correct = TRUE),
    answer("Before I try coding", correct = FALSE),
    correct = "Reviewing best-practice code after trying to write yourself can help you improve",
    incorrect = "Please attempt the exercise yourself, or use the hint, before viewing the answer."
  )
)
# Self-assessment question: every value from 1 to 10 is accepted as correct;
# only the feedback message differs by band (7-10, 5-6, 1-4).
# TRUE is spelled out instead of T, because T is a reassignable variable.
question_numeric(
  "How anxious are you about beginning this tutorial - on a scale from 1 (least anxious) to 10 (most anxious)?",
  answer(10, message = "Try not to worry, we will help you succeed!", correct = TRUE),
  answer(9, message = "Try not to worry, we will help you succeed!", correct = TRUE),
  answer(8, message = "Try not to worry, we will help you succeed!", correct = TRUE),
  answer(7, message = "Try not to worry, we will help you succeed!", correct = TRUE),
  answer(6, message = "Ok, we will get there together", correct = TRUE),
  answer(5, message = "Ok, we will get there together", correct = TRUE),
  answer(4, message = "I like your confidence!", correct = TRUE),
  answer(3, message = "I like your confidence!", correct = TRUE),
  answer(2, message = "I like your confidence!", correct = TRUE),
  answer(1, message = "I like your confidence!", correct = TRUE),
  allow_retry = TRUE,
  correct = "Thanks for sharing. ",
  min = 1,
  max = 10,
  step = 1
)
Please email contact@appliedepi.org with questions about the use of these materials.
In this exercise you will:
Open the R project "ebola", as usual.
Then, open your script "ebola_analysis.R". You can locate this file by clicking the menus for "File -> Open File", or by clicking your RStudio Files Pane (lower-right) and navigating to the file there.
Add the {ggExtra} and {gghighlight} packages to the pacman::p_load()
command at the top of your script.
The {ggplot2} package does not need to be listed explicitly in this command, because it is included in the {tidyverse}.
We will use {ggExtra} to add additional plotting functions to {ggplot2} and {gghighlight} to highlight portions of the plots.
Remember to always keep {tidyverse} as the last package in your pacman::p_load()
command.
In Module 1 we instructed you to adjust R's global settings so that each session begins cleanly, with no stored memory of the commands you ran or objects you created in the previous session.
This is best practice. It forces you to write scripts that are independent of previous sessions. This makes your work more easily "reproducible" by others.
Look at your R environment pane. Do you see any objects saved from the last session in the Environment? If so, click the small "broom" icon to clear all objects. Go back to Module 1 or ask an instructor to assist you in adjusting the settings.
Now, run all the important commands in your script, section-by-section:
1) Load packages
2) Import the "raw" surveillance linelist
3) Run the long cleaning command (a "pipe chain" using the %>%
operator) to produce the clean linelist
4) Export the clean linelist
5) Produce a few nice tables using the clean linelist
If you have kept your script clean and organized, these commands should all be able to run in sequence with no errors.
Verify that you have the object surv
in your Environment.
As a backup for this module - IF your script is seriously malfunctioning with lots of errors, you can import the "backup" clean surv
dataset from the folder "ebola/data/clean/backup", where it is saved as "surveillance_linelist_clean_20141201.rds". Please notify an instructor that you will need assistance cleaning your script so that it runs smoothly next time.
Add a new section heading in your script called "Simple plots".
The new header should look something like this:
# Simple plots ----------------------------------------------
This section should be placed after the "Summary tables" section and before the "Testing area".
We will build some plots to understand the outbreak dynamics in each district.
Any ggplot()
command begins with the function ggplot()
. Within this function, the first argument is data =
. Provide the name of the dataset to this argument.
Run the command below
# Open a blank plot canvas for the surveillance linelist
ggplot(data = surv)
In RStudio's Plots pane, the blank plot canvas has been "opened" and is ready for further instructions. You will add to this ggplot()
command, building the plot step-by-step.
Now that we have a canvas, we need to tell R what we want on the X-axis and on the Y-axis. The columns that you will map to the axes are simple examples of the plot's "aesthetics". Note this word "aesthetics" - the root aes becomes part of the next argument of ggplot()
.
The second argument of ggplot()
is mapping = aes()
. This is where you map columns in the data to components of the plot. Specifically, this is for components of the plot that you allow to vary for each row of data.
For example, if we "map" the Y-axis to the column age_years
, then every row in the data will be evaluated and may be placed on a different height on the Y-axis.
aes()
The mapping = aes()
argument is atypical in that you must provide the values to it within the aes()
function.
mapping = aes(x = COLUMN, y = COLUMN)
Remember that this code above is occurring within the ggplot()
function. The mapping = aes()
itself IS the second expected argument to ggplot()
.
Try to write a ggplot()
command where you map the X-axis to the column district
, and Y-axis to the column age_years
, and then answer the questions about the plot that appears in RStudio.
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
# Map district to the X-axis and age to the Y-axis (no geom added yet)
ggplot(data = surv, mapping = aes(x = district, y = age_years))
# Quiz on reading the axes of the district-by-age canvas.
# TRUE is spelled out instead of T, because T is a reassignable variable.
quiz(
  caption = "Quiz - aesthetics",
  question(
    "How many breaks are there in the X-axis?",
    allow_retry = TRUE,
    answer("12"),
    answer("9"),
    answer("10", correct = TRUE),
    answer("4")
  ),
  question(
    "What is the default Y-axis label interval?",
    allow_retry = TRUE,
    answer("5"),
    answer("20", correct = TRUE, message = "This will change depending on the range of the data."),
    answer("30"),
    answer("10")
  )
)
Now that we have our canvas, and we have designated our axes... we can begin to visualize the data!
This is done by adding geom_()
commands, which each create a different kind of visualization.
We have completed one full component of the ggplot()
command - the parentheses are closed, and the command can run without error. So how do we add additional components to the plot?
The "plus" symbol connects the components of the plotting command. In some ways, it is like the pipe operator, but links steps within an overall plotting command.
Add the plus, and on the new line write geom_point()
. Then run the command
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
# Draw one point per row: district on the X-axis, age on the Y-axis
ggplot(data = surv, mapping = aes(x = district, y = age_years)) +
  geom_point()
The geom_point()
has visualized each row of the data frame surv
as points, corresponding to their district
value (X-axis) and their age_years
value (Y-axis).
# Quiz on what the point plot can (and cannot) tell us.
# TRUE is spelled out instead of T, because T is a reassignable variable.
quiz(
  caption = "Quiz - points",
  question(
    "How many cases were reported in Central I district?",
    allow_retry = TRUE,
    answer("25"),
    answer("35"),
    answer("Not possible to know from this plot", correct = TRUE),
    answer("21")
  )
)
With a discrete variable on the X-axis and a continuous distribution on the Y-axis, it is difficult to know exactly how many cases there are! Each point represents one case, but you could be looking at 5 points of the same age that are overlapping and hiding each other.
Let's try another geom to solve this problem. Replace geom_point()
with geom_jitter()
and re-run the command.
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
# Same mapping as before, but jitter the points so overlapping cases are visible
ggplot(data = surv, mapping = aes(x = district, y = age_years)) +
  geom_jitter()
Go to the Help pane of RStudio (by the Plots pane) and search for "geom_jitter" in the search bar in the upper-right.
This is the R documentation of the function geom_jitter()
. Every function, whether in {base} R, or in an installed package, has documentation on its functions. You can access this documentation offline - it is stored within R once the package is downloaded.
Sometimes, this documentation is difficult to read. We will practice reading documentation throughout the course. However, there are often tutorials, guides, and "vignettes" online that are easier to read. We will also practice reviewing these.
For now, do the following:
Read the "Description" to understand exactly what the function is doing to avoid overlapping points.
Review the "Aesthetics" list near the bottom. These are other plot aesthetics that you can adjust, to vary the appearance of this geom layer.
Within the geom_jitter()
function, enter a few of these aesthetics and try adjusting them as follows:
color = "blue"
size = 5
shape = 4
# Quiz on the static aesthetics listed in the geom_jitter() documentation.
# TRUE is spelled out instead of T, because T is a reassignable variable.
quiz(
  caption = "Quiz - aesthetics",
  question(
    "What do you think the plot aesthetic `alpha` adjusts?",
    allow_retry = TRUE,
    answer("Alphabetical ordering"),
    answer("Axis color"),
    answer("Transparency", correct = TRUE),
    answer("Size")
  ),
  question(
    "What shape is produced by setting `shape = 3` within geom_jitter()?",
    allow_retry = TRUE,
    answer("Circle"),
    answer("Triangle"),
    answer("Plus", correct = TRUE),
    answer("Square")
  )
)
The jitter plot is nice, but let's add some statistical display to it. Add geom_boxplot()
below the geom_jitter()
command.
r fontawesome::fa("lightbulb", fill = "gold")
Click to read a hint
Don't forget to add a + at the end of geom_jitter()
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
# Jittered points with a boxplot layer drawn on top
ggplot(data = surv, mapping = aes(x = district, y = age_years)) +
  geom_jitter() +
  geom_boxplot()
We can now see where summary statistical values lie for each district, such as the median and inter-quartile ranges. But the points are now hidden by the boxplots.
Add and adjust the alpha =
aesthetic for the boxplots only. What alpha value allows you to see the points?
r fontawesome::fa("lightbulb", fill = "gold")
Click to read a hint
Try several values between 0 and 1.
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
# Make the boxplots semi-transparent so the points underneath remain visible
ggplot(data = surv, mapping = aes(x = district, y = age_years)) +
  geom_jitter() +
  geom_boxplot(alpha = 0.5)
As an alternative, replace the geom_boxplot()
with geom_violin()
, and use the same alpha =
level as before to have some transparency.
# Swap the boxplots for semi-transparent violin plots
ggplot(data = surv, mapping = aes(x = district, y = age_years)) +
  geom_jitter() +
  geom_violin(alpha = 0.5)
There is not much difference in the case ages between each district... let's try a violin plot on another metric. Try this code:
# Violin plot of height (ht_cm) by sex
ggplot(data = surv, mapping = aes(x = sex, y = ht_cm)) +
  geom_violin(alpha = 0.5, size = 2)
# Quiz on the mapping and the size aesthetic used in the violin plot.
# TRUE is spelled out instead of T, because T is a reassignable variable.
quiz(
  caption = "Quiz - violin plot",
  question(
    "What is aes(x = sex, y = ht_cm) doing in this command?",
    allow_retry = TRUE,
    answer("Sets sex as the Y axis, and height as the X axis", message = "Almost... but check again - which axes?"),
    answer("Sets the Automated Esoteric Settings (AES)", message = "No, this does not exist."),
    answer("Sets sex as the X axis, and height as the Y axis", correct = TRUE)
  ),
  question(
    "What did the 'size = ' aesthetic adjust for these violin plots?",
    allow_retry = TRUE,
    answer("Size of the violin"),
    answer("Size of the border line", correct = TRUE),
    answer("Size of the plot"),
    answer("Size of the plot labels")
  )
)
Think about the following questions:
What do you think is generally the best combination of these three geoms (points, boxplots, violins)?
Until now, you have been editing aesthetics like color
, alpha
, size
, and shape
by assigning them to static values such as 3, "blue", or 0.2.
Now, we will assign them to columns, and the display for each point in the data will vary.
Re-build the old boxplot, with the aesthetic fill
statically-assigned to "blue", and alpha
statically-assigned to 0.5:
# Boxplots with fill and alpha both set to static values
ggplot(data = surv, mapping = aes(x = district, y = age_years)) +
  geom_jitter() +
  geom_boxplot(fill = "blue", alpha = 0.5)
Now change the command so that fill
is assigned to the column district
.
# Quiz on what happens when a column name is given as a static aesthetic value.
# TRUE is spelled out instead of T, because T is a reassignable variable.
quiz(
  caption = "Quiz - aesthetic assignments",
  question(
    "What happens if you run the command above, but replace 'blue' with the column name: district?",
    allow_retry = TRUE,
    answer("The boxplots re-arrange by district"),
    answer("The boxes become the geographical shape of the district"),
    answer("The boxes fill (color) become different for each district", message = "No... this is what we wanted, but it did not happen."),
    answer("There is an R error saying 'object district not found'", correct = TRUE)
  )
)
# Quiz on where a column-based (dynamic) aesthetic assignment must be placed.
# TRUE is spelled out instead of T, because T is a reassignable variable.
quiz(
  caption = "Quiz - Dynamic aesthetic assignments",
  question(
    "What needs to change in order to make each boxplot a different fill?",
    allow_retry = TRUE,
    answer("Try another column name"),
    answer("Move this part of the command ('fill = district') inside the aes()", correct = TRUE),
    answer("Try the command five more times, as written"),
    answer("Restart R and try again")
  )
)
There are two important steps to resolve this error:
1) Change the value "blue" to district
(note the lack of quotes around district
)
2) Move this argument to within the mapping = aes()
in the top ggplot()
function
Because we are assigning a column to the aesthetic fill =
, it must be done within the "mapping" of the ggplot. Remember how we assigned columns to the X-axis and the Y-axis? Those assignments were dynamic as well, and they are located within the mapping = aes()
.
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
# fill is now mapped to the district column inside aes()
ggplot(data = surv, mapping = aes(x = district, y = age_years, fill = district)) +
  geom_jitter() +
  geom_boxplot(alpha = 0.5)
Note the legend that has automatically appeared on the right side of the plot.
Think to yourself: how have missing values been treated in this plot? How are they marked? Which color was automatically assigned to them? Where in the order do they appear, along the X-axis and in the legend?
How would you change the color of the points?
Be aware of this nuance:
The inside of shapes such as boxplots and bars is changed with the fill
aesthetic. Points (such as those from geom_jitter()
) are changed with the aesthetic color
. Change the color of the points to also vary by district, and set the box plot's alpha = 0
.
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
# Map both fill and color to district; alpha = 0 makes the box fill transparent
ggplot(data = surv, mapping = aes(x = district, y = age_years, fill = district, color = district)) +
  geom_jitter() +
  geom_boxplot(alpha = 0)
Now, remove the color = district
from the mapping = aes()
to return the points to their original black color.
Let's add a few quick labels to make our box plot more clear.
Add labs()
to the bottom of the command (using a + symbol), and provide the following arguments within the function:
title = "My title"
subtitle = "Subtitle here"
x = "District"
y = "Age (years)"
caption = "Among an Ebola outbreak, 2014"
fill = "District"
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
ggplot(                                     # start the plot
  data = surv,                              # plot the surveillance linelist
  mapping = aes(                            # map columns to plot aesthetics
    x = district,                           # X-axis: district
    y = age_years,                          # Y-axis: age in years
    fill = district)) +                     # boxplot fill varies by district
  geom_jitter() +                           # layer of jittered points
  geom_boxplot(                             # layer of semi-transparent boxplots
    alpha = 0.5) +
  labs(                                     # plot labels
    title = "My title",
    subtitle = "Subtitle here",
    x = "District",
    y = "Age (years)",
    caption = "Among an Ebola outbreak, 2014",
    fill = "District"
  )
Now look at the solution code above - What is different about it, aside from the new labs()
command?
It has been written in a "longer" format. For any parentheses, arguments are written after a newline and an indent. This can be read more easily, like a Table of Contents:
Indenting and newlines will not impact the execution of the code, but now you have space on the right to add # comments.
While it can take more time to write detailed comments, it can really help people in the future who are reading your code and wondering what you are doing (including for yourself in the future!).
# Quiz on plot labels and on why labs(fill = ) renames the legend.
# Fixes answer-text typos (stray apostrophe after "Top-right", "world" -> "word")
# and spells out TRUE instead of the reassignable T.
quiz(
  caption = "Quiz - Plot labels",
  question(
    "What side of the plot is the caption on, by default?",
    allow_retry = TRUE,
    answer("Top-center"),
    answer("Lower-left"),
    answer("Lower-right", correct = TRUE),
    answer("Top-right")
  ),
  question(
    "Why did we use the fill argument to adjust the legend title?",
    allow_retry = TRUE,
    answer("Because fill always adjusts the legend", message = "No, try again. You would actually adjust the legend using a different parameter, if it had been created another way."),
    answer("Because the legend was created when we mapped fill to a column", correct = TRUE),
    answer("Because the word 'fill' means 'legend' in old English", message = "No, this is not true.")
  )
)
Now that you have explored points, jittered points, boxplots, and violins, let's try some other geoms:
A bar plot could be used to show the number of cases reported in each district.
Try a command using the geom geom_bar()
.
Note that for geom_bar()
you should only supply an x =
aesthetic. There is no need for y =
because the number of rows in the dataset per district will be counted and be reflected in the bar height.
r fontawesome::fa("lightbulb", fill = "gold")
Click to read a hint
In this new ggplot command, remove the geom_jitter()
and geom_boxplot()
commands. Replace them with one geom_bar()
command.
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
# Bar plot: geom_bar() counts the rows per district, so no y aesthetic is given
ggplot(data = surv, mapping = aes(x = district)) +
  geom_bar()
The rows in the data have now been displayed as a bar plot!
Can you change the bars to all be the color "lightblue" using the fill
argument? (think: where should this argument be placed, seeing as "lightblue" is an assignment to a static value)
How would we change the previous plot to show the age distribution in each district?
Change the plot so that the fill =
aesthetic is assigned dynamically to a categorical column that reflects age.
r fontawesome::fa("lightbulb", fill = "gold")
Click to read a hint
Assign fill =
to the column age_cat
.
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
# Stacked bars: fill is mapped to the age category column
ggplot(data = surv, mapping = aes(x = district, fill = age_cat)) +
  geom_bar()
Now we can see the age categories in the legend, and the bars are "stacked".
What if you want to reverse the order of the bars?
Perhaps you want the youngest age category on the bottom: we'll give you this code:
# Wrap age_cat in fct_rev() to reverse the order of the age categories
ggplot(data = surv, mapping = aes(x = district, fill = fct_rev(age_cat))) +
  geom_bar()
In the above code we've used the fct_rev()
function, which comes from the {forcats} package (from the {tidyverse}). We'll explain more later, but this package contains functions that create explicit ordering of values. By wrapping the fct_rev()
around the column age_cat
, ggplot reverses the default ordering of that variable.
If the colors look ugly to you, don't worry. We'll learn later how to adjust the colors.
geom_bar()
accepts the argument position =
, which can be assigned to the value "dodge". Try to add this argument to geom_bar()
and see what happens (don't forget the quotation marks).
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
# Same bar plot with position = "dodge" passed to geom_bar()
ggplot(data = surv, mapping = aes(x = district, fill = age_cat)) +
  geom_bar(position = "dodge")
Until now, we have used our surveillance linelist to create these plots. The plots are treating each row as one data point. For example, in the district bar plot made with geom_bar()
, the bar height was determined by the number of rows per district in the surv
data frame.
But often, we have datasets which arrive to us as counts (e.g. the weekly number of malaria cases reported by each jurisdiction). This can also occur if a particular facility is overwhelmed and can no longer report detailed case information.
To explain how to handle these data, we will first practice the skills from the previous module, by creating a counts dataset:
Create a new dataset that includes only the total case counts per district. Use your knowledge of the {dplyr} functions group_by()
and summarise()
to do this, and save the data frame as case_counts_district
.
r fontawesome::fa("lightbulb", fill = "gold")
Click to read a hint
Begin with the surv
data frame and pipe into `count(district)`.
Alternatively, pipe into group_by()
and then, summarise()
. In summarise()
, create a column in the new summary data frame with n()
(to return the total number of rows in each district group).
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
# Count the rows (cases) per district; the counts are stored in the column n
case_counts_district <- surv %>%
  count(district)
# Render the district counts as a table
knitr::kable(case_counts_district)
When your data use counts, do not make bar plots with geom_bar()
. This is because geom_bar()
records one bar segment for each row of data.
Try running this command and see what the result is (note the changed dataset):
# Demonstration of the pitfall: geom_bar() on count data sees one row per
# district, so every bar has height 1
ggplot(data = case_counts_district, mapping = aes(x = district)) +
  geom_bar()
That is not the plot we want! The geom_bar()
has accepted the data and recognized only one row for each district. Consequently, it has plotted exactly one bar for each district, at height 1.
How to fix this?
1) Add y = n
within the ggplot aesthetic mappings. This assigns the bar height to the numeric count value in the column n
.
2) Change geom_bar()
to geom_col()
Now run the command. The graph should look exactly the same as the prior geom_bar()
command, when used on the surv
linelist.
# For count data: map the count column n to y and draw the bars with geom_col()
ggplot(data = case_counts_district, mapping = aes(x = district, y = n)) +
  geom_col()
What made this work?
The function geom_col()
is built to handle count data - it does not stack the dataset rows on top of each other, but rather accepts a y =
column of count values, then draws the columns with heights that correspond to those values.
It is important to remember to use geom_bar()
for linelist data, and geom_col()
for count data.
Lastly, try to add a fill =
aesthetic mapping to make the bar show age group. Can you do this with count data?
No, you cannot, because our count data does not have the detail of age group that was available in the linelist.
Write a new plotting command that uses geom_point()
on the clean surv
linelist to display the two continuous variables: age_years
and ht_cm
, assigned to the X-axis and Y-axis, respectively.
Next, add the aesthetic color =
and assign it dynamically to the column age_cat
.
r fontawesome::fa("lightbulb", fill = "gold")
Click to read a hint
Color of points is adjusted with the color =
aesthetic (fill
is for shapes like bars and histograms). Be sure to place this assignment within the mapping = aes()
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
This code has been written in a "long" style. You may have different indentation preferences.
# Scatterplot of height against age, with point color mapped to age category
ggplot(
  data = surv,
  mapping = aes(
    x = age_years,
    y = ht_cm,
    color = age_cat)) +
  geom_point()
This looks pretty, but is not actually that useful. Yet, it does serve to show that the age categories are aligning with the continuous values.
What if you dynamically assign the aesthetic color =
to the column sex
? Are you able to see any pattern in the data? It is still difficult.
Try this advanced R code below. There is a package {ggExtra} with several functions that add dimensions to ggplots.
Add this package to your pacman command, and re-run so that this package installs and loads.
Now, run the command below:
# Save the scatterplot as an object so it can be passed to other functions
height_plot <- ggplot(
  data = surv,
  mapping = aes(
    x = age_years,
    y = ht_cm,
    color = sex)) +
  geom_point()

# Print the saved plot with distributions drawn in its margins ({ggExtra})
ggMarginal(height_plot)
What is happening?
1) We have assigned the ggplot command a name (height_plot
) and saved it as an object. Look, it appears in the Environment now as an object that you can reference in other commands.
2) The second command uses the {ggExtra} function ggMarginal()
. This draws distributions in the "margins" (outsides) of the plot.
* The first argument of ggMarginal()
accepts the name of our plot object, height_plot
, and prints it with the marginal distributions.
Now run the command again, but with the groupFill = TRUE
argument specified.
# Re-draw the marginal distributions with groupFill = TRUE
ggMarginal(height_plot, groupFill = TRUE)
# Quiz on saved plot objects and on reading the marginal distributions.
# Fixes a stray apostrophe in the last answer of the first question and spells
# out TRUE instead of the reassignable T.
quiz(
  caption = "Quiz - Scatterplots",
  question(
    "In the ggMarginal command, the term 'height_plot' is referring to what exactly?",
    allow_retry = TRUE,
    answer("A setting that determines how tall the plot output is"),
    answer("It ensures that the plot has height"),
    answer("height_plot is the name of a ggplot saved in a prior command", correct = TRUE),
    answer("This is an argument that does not need to be specified")
  ),
  question(
    "What do the grouped marginal distributions show about the heights of male and female cases?",
    allow_retry = TRUE,
    answer("That the male cases are generally taller than the female cases", correct = TRUE),
    answer("That at a given age, male cases are taller than female cases", message = "No, we do not know this from these plots. We only know the overall distribution of the data."),
    answer("The cases are more likely to be male than female", message = "No, the distributions do not tell us anything about whether cases are more likely to be male or female.")
  ),
  question(
    "What is ggMarginal()?",
    allow_retry = TRUE,
    answer("A function of the {ggplot2} package"),
    answer("A function of the package {ggExtra} that can be used in coordination with {ggplot2} functions", correct = TRUE),
    answer("A {base} R function")
  )
)
Finally, try another scatterplot using date_report
on the X-axis and diff
on the Y-axis. Add geom_smooth()
to see a trend line.
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
# Scatterplot of diff against report date, with a smoothed trend line on top
ggplot(
  data = surv,
  mapping = aes(
    x = date_report,
    y = diff)) +
  geom_point() +
  geom_smooth()
Return to the Help RStudio pane (near the plots pane in the lower-right). Type geom_smooth
in the search bar. Read the "Details" section to understand exactly what geom_smooth()
does to produce its line and confidence interval.
Histograms are used to show the distribution of continuous variables. If you have a column with numbers or dates, it is better to use geom_histogram()
- not geom_bar()
or geom_col()
. This assumes that your data are in a linelist format (one row per observation).
We will cover the nuances of epidemic curves ("epicurves") in depth in a later module, but for now:
Make a basic histogram of the onset dates by assigning the X-axis to the column date_onset
. Change its default coloring to "darkgreen"
r fontawesome::fa("lightbulb", fill = "gold")
Click to read a hint
For geom_histogram()
, the inside color of the "bars" is set by the fill =
aesthetic. The bar outline is set by color =
.
If assigning the fill or color to a static value like "purple", this assignment should be made outside of aes()
, within geom_histogram()
.
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
# Histogram of onset dates; fill is a static value, so it sits outside aes()
ggplot(data = surv, mapping = aes(x = date_onset)) +
  geom_histogram(fill = "darkgreen")
Look in your R console, where the command is printed that you ran. Do you see a warning, written in red?
In R, a "warning" is given if R was able to execute your command, but it encountered something unexpected that it thinks you should know.
In contrast, an "error" means that R was unable to execute your command.
# Quiz on interpreting the "Removed XX rows" warning from geom_histogram().
# TRUE is spelled out instead of T, because T is a reassignable variable.
quiz(
  caption = "Quiz - histogram warning",
  question(
    "What does this warning mean (Removed XX rows containing non-finite values)",
    allow_retry = TRUE,
    answer("That those cases had an infinitely bad sense of humor"),
    answer("Those cases had date values outside the range of the plot", message = "We have not set any limits to the plot axis, so R will display all values possible."),
    answer("That those cases had missing date_onset values and so could not be plotted.", correct = TRUE),
    answer("That ggplot just removed those rows from your linelist data frame", message = "ggplot will not alter the contents of your dataset.")
  )
)
Now try making a histogram for the column age_years
, with fill =
assigned dynamically to sex
.
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
# Stacked histogram of age, with fill mapped to sex
ggplot(data = surv, mapping = aes(x = age_years, fill = sex)) +
  geom_histogram()
You now have "stacked" histograms. These can be difficult to interpret sometimes, but are a useful tool to have.
Histograms show "bars", but there is no space between the bars. This is because the variable is continuous. Still, the size and frequency of the "histogram bins" can be adjusted.
Try running the histogram with the following adjustments (one at a time):
- `bins = 100` within `geom_histogram()`
- `bins = 5` within `geom_histogram()`
- `binwidth = 5` within `geom_histogram()`
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
# Three variations of the age histogram, one per bin adjustment.
# Each plot is a separate statement, so each must start on its own line.

# Adjustment 1: request 100 bins
ggplot(data = surv, mapping = aes(x = age_years, fill = sex)) +
  geom_histogram(bins = 100)

# Adjustment 2: request 5 bins
ggplot(data = surv, mapping = aes(x = age_years, fill = sex)) +
  geom_histogram(bins = 5)

# Adjustment 3: set the width of each bin to 5 (the argument is `binwidth`)
ggplot(data = surv, mapping = aes(x = age_years, fill = sex)) +
  geom_histogram(binwidth = 5)
Consider the following:
- What does the `binwidth` represent?

These are all questions that epidemiologists and public health workers must ask themselves when looking at an epidemic curve.
In the coming module on public health plots, we will discuss the intricacies of making epidemic curves that start on specific dates and that use bins of specific widths (e.g. Monday weeks, Sunday weeks, etc.)
A stacked histogram can be difficult to interpret. Faceting in R is the process of creating one smaller plot for each unique level of a variable (this is also called "small-multiples").
This is done by adding the command `facet_wrap()` to the ggplot command. Within this function, write a tilde `~` and the name of the faceting column on the right side.
Try to facet this histogram by `sex` (leave `fill = sex` as well).
# Starting point for the faceting exercise: onset-date histogram
# with fill mapped to sex
ggplot(
  data = surv,
  mapping = aes(x = date_onset, fill = sex)
) +
  geom_histogram()
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
# Onset-date histogram with fill mapped to sex, plus facet_wrap(~ sex)
# to draw one panel per value of sex
ggplot(
  data = surv,
  mapping = aes(x = date_onset, fill = sex)
) +
  geom_histogram() +
  facet_wrap(~ sex)
Now try another plot: instead of `sex`, set the plot to be faceted by `district`. You can choose to remove the `fill = sex`, for clarity. It will not have an impact on the faceting.
# Quiz on reading the district-faceted histogram.
# Logical arguments are spelled TRUE (not T), because T can be reassigned.
quiz(
  caption = "Quiz - facets",
  question(
    "Which district had the earliest cases?",
    allow_retry = TRUE,
    answer("West I"),
    answer("East II"),
    answer("West III", correct = TRUE),
    answer("Central I")
  )
)
You can read more about faceting in this section of the Epi R Handbook.
Congratulations! ggplot is a difficult subject, but a tool that offers many rewards once you understand the basics. We will reinforce this more in the coming days.
Just like {ggExtra}, the package {gghighlight} is an extension that works with {ggplot2} to add functionalities for your plotting.
Many R packages have "vignettes" to assist people using the package. Go online and view this {gghighlight} vignette to answer the questions below:
# Quiz on the {gghighlight} vignette: three questions, each allowing retries.
# Logical arguments are spelled TRUE (not T), because T can be reassigned.
quiz(
  caption = "Quiz - gghighlight",
  question(
    "When was this vignette updated?",
    allow_retry = TRUE,
    answer("2021-01-01"),
    answer("1999-04-03"),
    answer("2023-12-16", correct = TRUE),
    answer("2022-03-12")
  ),
  question(
    "The author believes that this package is the equivalent of which function, but for ggplot2?",
    allow_retry = TRUE,
    answer("select()", message = "Look at the bottom of the Motivation section"),
    answer("filter()", correct = TRUE),
    answer("mutate()", message = "Look at the bottom of the Motivation section"),
    answer(" %>% ", message = "Look at the bottom of the Motivation section")
  ),
  question(
    "What is the main function of the {gghighlight} package?",
    allow_retry = TRUE,
    answer("flair()", message = "Look at the section called gghighlight()"),
    answer("stark()", message = "Look at the section called gghighlight()"),
    answer("gghighlight()", correct = TRUE),
    answer("highlighter()", message = "Look at the section called gghighlight()")
  )
)
Now, make a histogram using the following settings:
- The X-axis is `date_onset`
- The color (`fill`) is by district
- The plot is faceted by district
- Add `gghighlight()`
Finally, make a variation of this plot:

- Remove the faceting
- `gghighlight()` only highlights the cases from "West II"

r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
# Plot 1 ########
# Onset-date histogram, filled and faceted by district, with gghighlight()
# added using its defaults
ggplot(data = surv, mapping = aes(x = date_onset, fill = district)) +
  geom_histogram() +
  facet_wrap(~ district) +
  gghighlight()

# Plot 2 ########
# Same histogram without facets; the condition passed to gghighlight()
# selects "West II", the district named in the exercise instructions
ggplot(data = surv, mapping = aes(x = date_onset, fill = district)) +
  geom_histogram() +
  gghighlight(district == "West II")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.