# load packages ----------------------------------------------------------------
library(introexercises)  # get data for exercises
library(learnr)          # create lessons from rmd
library(gradethis)       # evaluate exercises
library(dplyr)           # wrangle data
library(flair)           # highlight data
library(ggplot2)         # visualise data
library(gghighlight)     # emphasise parts of visualisations
library(janitor)         # clean data
library(fontawesome)     # for emojis 
library(scales)          # defining axes and units
# library(RMariaDB)        # connect to sql database 

## set options for exercises and checking ---------------------------------------

## Configure how exercises are graded and run
## note: the arguments below are passed on to learnr::tutorial_options
gradethis::gradethis_setup(
  ## whether to pre-evaluate exercises (so users see answers) - switched off
  exercise.eval = FALSE,
  ## how exercises should be checked (defaults to NULL - individually defined)
  # exercise.checker = gradethis::grade_learnr,
  ## maximum execution time limit, in seconds
  exercise.timelimit = 60
)

# ## event recorder ---------------------------------------------------------------
# ## see for details: 
# ## https://pkgs.rstudio.com/learnr/articles/publishing.html#events
# ## https://github.com/dtkaplan/submitr/blob/master/R/make_a_recorder.R
# 
# ## connect to your sql database
# sqldtbase <- dbConnect(RMariaDB::MariaDB(),
#                        user     = Sys.getenv("userid"),
#                        password = Sys.getenv("pwd"),
#                        dbname   = 'excersize_log',
#                        host     = "144.126.246.140")
# 
# 
# ## define a function to collect data 
# ## note that tutorial_id is defined in YAML
#     ## you could set the tutorial_version too (by specifying version:) but use package version instead 
# recorder_function <- function(tutorial_id, tutorial_version, user_id, event, data) {
#     
#   ## define a sql query 
#   ## first bracket defines variable names
#   ## values bracket defines what goes in each variable
#   event_log <- paste("INSERT INTO responses (
#                        tutorial_id, 
#                        tutorial_version, 
#                        date_time, 
#                        user_id, 
#                        event, 
#                        section,
#                        label, 
#                        question, 
#                        answer, 
#                        code, 
#                        correct)
#                        VALUES('", tutorial_id,  "', 
#                        '", tutorial_version, "', 
#                        '", format(Sys.time(), "%Y-%m-%d %H:%M:%S %Z"), "',
#                        '", Sys.getenv("SHINYPROXY_PROXY_ID"), "',
#                        '", event, "',
#                        '", data$section, "',
#                        '", data$label,  "',
#                        '", paste0('"', data$question, '"'),  "',
#                        '", paste0('"', data$answer,   '"'),  "',
#                        '", paste0('"', data$code,     '"'),  "',
#                        '", data$correct, "')",
#                        sep = '')
# 
#     # Execute the query on the sqldtbase that we connected to above
#     rsInsert <- dbSendQuery(sqldtbase, event_log)
#   
# }
# 
# options(tutorial.event_recorder = recorder_function)
# hide non-exercise code chunks ------------------------------------------------
knitr::opts_chunk$set(echo = FALSE)
# data prep --------------------------------------------------------------------
## import the clean surveillance linelist that ships with {introexercises}
## mustWork = TRUE makes system.file() stop with a clear error when the file
## cannot be found, instead of returning "" and passing that to rio::import()
surv <- rio::import(
  system.file(
    "dat/surveillance_linelist_clean_20141201.rds",
    package  = "introexercises",
    mustWork = TRUE
  )
)

Introduction to R for Applied Epidemiology and Public Health

Welcome

Welcome to the course "Introduction to R for applied epidemiology", offered by Applied Epi - a nonprofit organisation and the leading provider of R training, support, and tools to frontline public health practitioners.

## show the logo; error = FALSE is spelled out because T/F can be reassigned
knitr::include_graphics("images/logo.png", error = FALSE)

Data visualization

This exercise focuses on the fundamentals of using {ggplot2} for data visualization.

Format

This exercise guides you through tasks that you should perform in RStudio on your local computer.

Getting Help

There are several ways to get help:

1) Look for the "helpers" (see below)
2) Ask your live course instructor/facilitator for help
3) Schedule a 1-on-1 call with an instructor for "Course Tutoring"
4) Post a question in Applied Epi Community

Here is what those "helpers" will look like:

r fontawesome::fa("lightbulb", fill = "gold") Click to read a hint

Here you will see a helpful hint!


r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

# keep only rows with age above 25 in the district "Bolo"
linelist %>%
  filter(age > 25, district == "Bolo")

Here is more explanation about why the solution works.


Quiz questions

Answering quiz questions will help you to comprehend the material. The answers are not recorded.

To practice, please answer the following questions:

## practice quiz: one radio-button question on when to open the solution helper
quiz(
  question_radio(
    "When should I view the red 'helper' code?",
    answer("After trying to write the code myself", correct = TRUE),
    answer("Before I try coding", correct = FALSE),
    incorrect = "Please attempt the exercise yourself, or use the hint, before viewing the answer.",
    correct = "Reviewing best-practice code after trying to write yourself can help you improve"
  )
)
## practice numeric question: every value from 1 to 10 is accepted as correct,
## only the feedback message differs by answer
## (TRUE is spelled out because T is an ordinary, reassignable variable)
question_numeric(
  "How anxious are you about beginning this tutorial - on a scale from 1 (least anxious) to 10 (most anxious)?",
  answer(10, message = "Try not to worry, we will help you succeed!", correct = TRUE),
  answer(9, message = "Try not to worry, we will help you succeed!", correct = TRUE),
  answer(8, message = "Try not to worry, we will help you succeed!", correct = TRUE),
  answer(7, message = "Try not to worry, we will help you succeed!", correct = TRUE),
  answer(6, message = "Ok, we will get there together", correct = TRUE),
  answer(5, message = "Ok, we will get there together", correct = TRUE),
  answer(4, message = "I like your confidence!", correct = TRUE),
  answer(3, message = "I like your confidence!", correct = TRUE),
  answer(2, message = "I like your confidence!", correct = TRUE),
  answer(1, message = "I like your confidence!", correct = TRUE),
  allow_retry = TRUE,
  correct = "Thanks for sharing. ",
  min = 1,
  max = 10,
  step = 1
)

License

Please email contact@appliedepi.org with questions about the use of these materials.

Learning objectives

In this exercise you will:

Prepare

Open the R project "ebola", as usual.

Then, open your script "ebola_analysis.R". You can locate this file by clicking the menus for "File -> Open File", or by clicking your RStudio Files Pane (lower-right) and navigating to the file there.

Load packages

Add the {ggExtra} and {gghighlight} packages to the pacman::p_load() command at the top of your script.

The {ggplot2} package does not need to be listed explicitly in this command, because it is included in the {tidyverse}.

We will use {ggExtra} to add additional plotting functions to {ggplot2} and {gghighlight} to highlight portions of the plots.

Remember to always keep {tidyverse} as the last package in your pacman::p_load() command.

Run the previous code in your script

In Module 1 we instructed you to adjust R's global settings so that each session begins cleanly, with no stored memory of the commands you ran or objects you created in the previous session.

This is best practice. It forces you to write scripts that are independent of previous sessions. This makes your work more easily "reproducible" by others.

Look at your R environment pane. Do you see any objects saved from the last session in the Environment? If so, click the small "broom" icon to clear all objects. Go back to Module 1 or ask an instructor to assist you in adjusting the settings.

Now, run all the important commands in your script, section-by-section:

1) Load packages
2) Import the "raw" surveillance linelist
3) Run the long cleaning command (a "pipe chain" using the %>% operator) to produce the clean linelist
4) Export the clean linelist
5) Produce a few nice tables using the clean linelist

If you have kept your script clean and organized, these commands should all be able to run in sequence with no errors.

Verify that you have the object surv in your Environment.

As a backup for this module - IF your script is seriously malfunctioning with lots of errors, you can import the "backup" clean surv dataset from the folder "ebola/data/clean/backup", where it is saved as "surveillance_linelist_clean_20141201.rds". Please notify an instructor that you will need assistance cleaning your script so that it runs smoothly next time.

Create new section for plotting

Add a new section heading in your script called "Simple plots".

The new header should look something like this:

# Simple plots ----------------------------------------------

This section should be placed after "Summary tables" section and before the "Testing area".

Build a plot

We will build some plots to understand the outbreak dynamics in each district.

Open the plot

Any ggplot() command begins with the function ggplot(). Within this function, the first argument is data =. Provide the name of the dataset to this argument.

Run the command below

# open a plot using the surv data frame; no axes or geoms are specified yet
ggplot(data = surv)

In RStudio's Plots pane, the blank plot canvas has been "opened" and is ready for further instructions. You will add to this ggplot() command, building the plot step-by-step.

Map columns to "aesthetics"

Now that we have a canvas, we need to tell R what we want on the X-axis and on the Y-axis. The columns that you will map to the axes are simple examples of the plot's "aesthetics". Note this word "aesthetics" - the root aes becomes part of the next argument of ggplot().
The second argument of ggplot() is mapping = aes(). This is where you map columns in the data to components of the plot. Specifically, this is for components of the plot that you allow to vary for each row of data.

For example, if we "map" the Y-axis to the column age_years, then every row in the data will be evaluated and may be placed on a different height on the Y-axis.

aes()

The mapping = aes() argument is atypical in that you must provide the values to it within the aes() function.

mapping = aes(x = COLUMN, y = COLUMN)

Remember that this code above is occurring within the ggplot() function. The mapping = aes() itself IS the second expected argument to ggplot().

Try to write a ggplot() command where you map the X-axis to the column district, and Y-axis to the column age_years, and then answer the questions about the plot that appears in RStudio.

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

# map the column district to the X-axis and the column age_years to the Y-axis
ggplot(data = surv, mapping = aes(x = district, y = age_years))


## quiz on the axes produced by the aesthetics mapping
## (TRUE is spelled out because T is an ordinary, reassignable variable)
quiz(caption = "Quiz - aesthetics",
  question("How many breaks are there in the X-axis?",
    allow_retry = TRUE,
    answer("12"),
    answer("9"),
    answer("10", correct = TRUE),
    answer("4")
  ),
  question("What is the default Y-axis label interval?",
    allow_retry = TRUE,
    answer("5"),
    answer("20", correct = TRUE, message = "This will change depending on the range of the data."),
    answer("30"),
    answer("10")
  )
)

Add geom layers

Now that we have our canvas, and we have designated our axes... we can begin to visualize the data!

This is done by adding geom_() commands, which each create a different kind of visualization.

We have completed one full component of the ggplot() command - the parentheses are closed, and the command can run without error. So how do we add additional components to the plot?

Add components of the command with +

The "plus" symbol connects the components of the plotting command. In some ways, it is like the pipe operator, but links steps within an overall plotting command.

Add the plus, and on the new line write geom_point(). Then run the command

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

# add a layer of points on top of the district / age_years mapping
ggplot(data = surv, mapping = aes(x = district, y = age_years)) +
  geom_point()


The geom_point() has visualized each row of the data frame surv as points, corresponding to their district value (X-axis) and their age_years value (Y-axis).

## quiz on what can (and cannot) be read from the point plot
## (TRUE is spelled out because T is an ordinary, reassignable variable)
quiz(caption = "Quiz - points",
  question("How many cases were reported in Central I district?",
    allow_retry = TRUE,
    answer("25"),
    answer("35"),
    answer("Not possible to know from this plot", correct = TRUE),
    answer("21")
  )
)

With a discrete variable on the X-axis and a continuous distribution on the Y-axis, it is difficult to know exactly how many cases there are! Each point represents one case, but you could be looking at 5 points of the same age that are overlapping and hiding each other.

Let's try another geom to solve this problem. Replace geom_point() with geom_jitter() and re-run the command.

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

# same mapping as before, but drawn with jittered points
ggplot(data = surv, mapping = aes(x = district, y = age_years)) +
  geom_jitter()



Go to the Help pane of RStudio (by the Plots pane) and search for "geom_jitter" in the search bar in the upper-right.

This is the R documentation of the function geom_jitter(). Every function, whether in {base} R or in an installed package, has its own documentation. You can access this documentation offline - it is stored within R once the package is downloaded.

Sometimes, this documentation is difficult to read. We will practice reading documentation throughout the course. However, there are often tutorials, guides, and "vignettes" online that are easier to read. We will also practice reviewing these.

For now, do the following:

Read the "Description" to understand exactly what the function is doing to avoid overlapping points.

Review the "Aesthetics" list near the bottom. These are other plot aesthetics that you can adjust, to vary the appearance of this geom layer.

Within the geom_jitter() function, enter a few of these aesthetics and try adjusting them as follows:

## quiz on the alpha and shape aesthetics of geom_jitter()
## (TRUE is spelled out because T is an ordinary, reassignable variable)
quiz(caption = "Quiz - aesthetics",
  question("What do you think the plot aesthetic `alpha` adjusts?",
    allow_retry = TRUE,
    answer("Alphabetical ordering"),
    answer("Axis color"),
    answer("Transparency", correct = TRUE),
    answer("Size")
  ),
  question("What shape is produced by setting `shape = 3` within geom_jitter()?",
    allow_retry = TRUE,
    answer("Circle"),
    answer("Triangle"),
    answer("Plus", correct = TRUE),
    answer("Square")
  )
)

Add more geoms

The jitter plot is nice, but let's add some statistical display to it. Add geom_boxplot() below the geom_jitter() command.

r fontawesome::fa("lightbulb", fill = "gold") Click to read a hint

Don't forget to add a + at the end of geom_jitter()

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

# jittered points first, then boxplots drawn as a second layer
ggplot(data = surv, mapping = aes(x = district, y = age_years)) +
  geom_jitter() +
  geom_boxplot()



We can now see where summary statistical values lie for each district, such as the median and inter-quartile ranges. But the points are now hidden by the boxplots.

Add and adjust the alpha = aesthetic for the boxplots only. What alpha value allows you to see the points?

r fontawesome::fa("lightbulb", fill = "gold") Click to read a hint

Try several values between 0 and 1.


r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

# alpha is set on the boxplot layer only, as a static value
ggplot(data = surv, mapping = aes(x = district, y = age_years)) +
  geom_jitter() +
  geom_boxplot(alpha = 0.5)


Violin plots

As an alternative, replace the geom_boxplot() with geom_violin(), and use the same alpha = level as before to have some transparency.

# violins replace the boxplots, with the same static alpha
ggplot(data = surv, mapping = aes(x = district, y = age_years)) +
  geom_jitter() +
  geom_violin(alpha = 0.5)

There is not much difference in the case ages between each district... let's try a violin plot on another metric. Try this code:

## violin plot of ht_cm by sex, with static alpha and size values
ggplot(data = surv, mapping = aes(x = sex, y = ht_cm)) +
  geom_violin(alpha = 0.5, size = 2)

## quiz on the violin plot command above
## (TRUE is spelled out because T is an ordinary, reassignable variable)
quiz(caption = "Quiz - violin plot",
  question("What is aes(x = sex, y = ht_cm) doing in this command?",
    allow_retry = TRUE,
    answer("Sets sex as the Y axis, and height as the X axis", message = "Almost... but check again - which axes?"),
    answer("Sets the Automated Esoteric Settings (AES)", message = "No, this does not exist."),
    answer("Sets sex as the X axis, and height as the Y axis", correct = TRUE)
  ),
  question("What did the 'size = ' aesthetic adjust for these violin plots?",
    allow_retry = TRUE,
    answer("Size of the violin"),
    answer("Size of the border line", correct = TRUE),
    answer("Size of the plot"),
    answer("Size of the plot labels")
  )
)

Think about the following questions:

What do you think is generally the best combination of these three geoms (points, boxplots, violins)?

Dynamic aesthetics

Until now, you have been editing aesthetics like color, alpha, size, and shape by assigning them to static values such as 3, "blue", or 0.2.

Now, we will assign them to columns, and the display for each point in the data will vary.

Boxplot fill

Re-build the old boxplot, with the aesthetic fill statically-assigned to "blue", and alpha statically-assigned to 0.5:

# fill and alpha are both static values, so they sit outside aes()
ggplot(data = surv, mapping = aes(x = district, y = age_years)) +
  geom_jitter() +
  geom_boxplot(fill = "blue", alpha = 0.5)

Now change the command so that fill is assigned to the column district.

## quiz on what happens when a column name is given as a static fill value
## (TRUE is spelled out because T is an ordinary, reassignable variable)
quiz(caption = "Quiz - aesthetic assignments",
  question("What happens if you run the command above, but replace 'blue' with the column name: district?",
    allow_retry = TRUE,
    answer("The boxplots re-arrange by district"),
    answer("The boxes become the geographical shape of the district"),
    answer("The boxes fill (color) become different for each district",
           message = "No... this is what we wanted, but it did not happen."),
    answer("There is an R error saying 'object district not found'",
           correct = TRUE)
  )
)
## quiz on where a column-based fill assignment has to be placed
## (TRUE is spelled out because T is an ordinary, reassignable variable)
quiz(caption = "Quiz - Dynamic aesthetic assignments",
  question("What needs to change in order to make each boxplot a different fill?",
    allow_retry = TRUE,
    answer("Try another column name"),
    answer("Move this part of the command ('fill = district') inside the aes()",
           correct = TRUE),
    answer("Try the command five more times, as written"),
    answer("Restart R and try again")
  )
)

There are two important steps to resolve this error:

1) Change the value "blue" to district (note the lack of quotes around district)
2) Move this argument to within the mapping = aes() in the top ggplot() function

Because we are assigning a column to the aesthetic fill =, it must be done within the "mapping" of the ggplot. Remember how we assigned columns to the X-axis and the Y-axis? Those assignments were dynamic as well, and they are located within the mapping = aes().

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

# fill is now mapped to the column district, so it sits inside aes()
ggplot(data = surv, mapping = aes(x = district, y = age_years, fill = district)) +
  geom_jitter() +
  geom_boxplot(alpha = 0.5)


Note the legend that has automatically appeared on the right side of the plot.

Think to yourself: how have missing values been treated in this plot? How are they marked? Which color was automatically assigned to them? Where in the order do they appear, along the X-axis and in the legend?

Point color

How would you change the color of the points?

Be aware of this nuance:

Change the color of the points to also vary by district, and set the box plot's alpha = 0.

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

# color is mapped to district as well; the boxplot alpha is set to 0
ggplot(data = surv, mapping = aes(x = district, y = age_years, fill = district, color = district)) +
  geom_jitter() +
  geom_boxplot(alpha = 0)



Now, remove the color = district from the mapping = aes() to return the points to their original black color.

Labels (and indenting)

Let's add a few quick labels to make our box plot more clear.

Add labs() to the bottom of the command (using a + symbol), and provide the following arguments within the function:

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

ggplot(                              # start the plot
  data = surv,                       # data: the surveillance linelist
  mapping = aes(                     # map columns to plot aesthetics
    x = district,                    # X-axis: district
    y = age_years,                   # Y-axis: age in years
    fill = district                  # boxplot fill varies by district
  )
) +
  geom_jitter() +                    # layer of jittered points
  geom_boxplot(                      # layer of boxplots, semi-transparent
    alpha = 0.5
  ) +
  labs(                              # plot labels
    title = "My title",
    subtitle = "Subtitle here",
    x = "District",
    y = "Age (years)",
    caption = "Among an Ebola outbreak, 2014",
    fill = "District"
  )


Now look at the solution code above - What is different about it, aside from the new labs() command?

It has been written in a "longer" format. For any parentheses, arguments are written after a newline and an indent. This can be read more easily, like a Table of Contents:

Indenting and newlines will not impact the execution of the code, but now you have space on the right to add # comments.

While it can take more time to write detailed comments, it can really help people in the future who are reading your code and wondering what you are doing (including for yourself in the future!).

## quiz on plot labels and the legend title
## (TRUE is spelled out because T is an ordinary, reassignable variable;
##  a stray apostrophe and the typo "world" in the answer texts are also fixed)
quiz(caption = "Quiz - Plot labels",
  question("What side of the plot is the caption on, by default?",
    allow_retry = TRUE,
    answer("Top-center"),
    answer("Lower-left"),
    answer("Lower-right", correct = TRUE),
    answer("Top-right")
  ),
  question("Why did we use the fill argument to adjust the legend title?",
    allow_retry = TRUE,
    answer("Because fill always adjusts the legend", message = "No, try again. You would actually adjust the legend using a different parameter, if it had been created another way."),
    answer("Because the legend was created when we mapped fill to a column", correct = TRUE),
    answer("Because the word 'fill' means 'legend' in old English", message = "No, this is not true.")
  )
)

Bar plots

Now that you have explored points, jittered points, boxplots, and violins, let's try some other geoms:

A bar plot could be used to show the number of cases reported in each district.

Try a command using the geom geom_bar().

Note that for geom_bar() you should only supply an x = aesthetic. There is no need for y = because the number of rows in the dataset per district will be counted and be reflected in the bar height.

r fontawesome::fa("lightbulb", fill = "gold") Click to read a hint

In this new ggplot command, remove the geom_jitter() and geom_boxplot() commands. Replace them with one geom_bar() command.


r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

# bar plot with only an x aesthetic
ggplot(data = surv, mapping = aes(x = district)) +
  geom_bar()


The rows in the data have now been displayed as a bar plot!

Can you change the bars to all be the color "lightblue" using the fill argument? (think: where should this argument be placed, seeing as "lightblue" is an assignment to a static value)

Set the fill aesthetic dynamically

How would we change the previous plot to show the age distribution in each district?

Change the plot so that the fill = aesthetic is assigned dynamically to a categorical column that reflects age.

r fontawesome::fa("lightbulb", fill = "gold") Click to read a hint

Assign fill = to the column age_cat.


r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

# bar fill is mapped to the column age_cat
ggplot(
  data = surv,
  mapping = aes(x = district, fill = age_cat)
) +
  geom_bar()


Now we can see the age categories in the legend, and the bars are "stacked".

Order of stacked bars

What if you want to reverse the order of the bars?

Perhaps you want the youngest age category on the bottom: we'll give you this code:

# fct_rev() is wrapped around age_cat inside the fill mapping
ggplot(
  data = surv,
  mapping = aes(x = district, fill = fct_rev(age_cat))
) +
  geom_bar()

In the above code we've used the fct_rev() function, which comes from the {forcats} package (from the {tidyverse}). We'll explain more later, but this package contains functions that create explicit ordering of values. By wrapping the fct_rev() around the column age_cat, ggplot reverses the default ordering of that variable.

If the colors look ugly to you, don't worry. We'll learn later how to adjust the colors.

Adjacent bars

geom_bar() accepts the argument position =, which can be assigned to the value "dodge". Try to add this argument to geom_bar() and see what happens (don't forget the quotation marks).

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

# position = "dodge" is passed to geom_bar() as a quoted value
ggplot(
  data = surv,
  mapping = aes(x = district, fill = age_cat)
) +
  geom_bar(position = "dodge")


Bar plots from count data

Until now, we have used our surveillance linelist to create these plots. The plots are treating each row as one data point. For example, in the district bar plot made with geom_bar(), the bar height was determined by the number of rows per district in the surv data frame.

But often, we have datasets which arrive to us as counts (e.g. the weekly number of malaria cases reported by each jurisdiction). This can also occur if a particular facility is overwhelmed and can no longer report detailed case information.

To explain how to handle these data, we will first practice the skills from the previous module, by creating a counts dataset:

Create a new dataset that includes only the total case counts per district. Use your knowledge of the {dplyr} functions group_by() and summarise() to do this, and save the data frame as case_counts_district.

r fontawesome::fa("lightbulb", fill = "gold") Click to read a hint

Begin with the surv data frame and pipe into `count(district)`.

Alternatively, pipe into group_by() and then, summarise(). In summarise(), create a column in the new summary data frame with n() (to return the total number of rows in each district group).


r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

# count the rows of surv per district and save the result
case_counts_district <- surv %>% count(district)


# print the counts data frame as a table
knitr::kable(case_counts_district)

When your data use counts, do not make bar plots with geom_bar(). This is because geom_bar() records one bar segment for each row of data.

Try running this command and see what the result is (note the changed dataset):

# geom_bar() applied to the counts data frame (note the changed dataset)
ggplot(data = case_counts_district, mapping = aes(x = district)) +
  geom_bar()

That is not the plot we want! The geom_bar() has accepted the data and recognized only one row for each district. Consequently, it has plotted exactly one bar for each district, at height 1.

How to fix this?

1) Add y = n within the ggplot aesthetic mappings. This assigns the bar height to the numeric count value in the column n.
2) Change geom_bar() to geom_col()

Now run the command. The graph should look exactly the same as the prior geom_bar() command, when used on the surv linelist.

# geom_col() with the count column n mapped to y
ggplot(data = case_counts_district, mapping = aes(x = district, y = n)) +
  geom_col()

What made this work?

The function geom_col() is built to handle count data - it does not stack the dataset rows on top of each other, but rather accepts a y = column of count values, then draws the columns with heights that correspond to those values.

It is important to remember to use geom_bar() for linelist data, and geom_col() for count data.

Lastly, try to add a fill = aesthetic mapping to make the bar show age group. Can you do this with count data?

No, you cannot, because our count data does not have the detail of age group that was available in the linelist.

Scatter plots

Write a new plotting command that uses geom_point() on the clean surv linelist to display the two continuous variables: age_years and ht_cm, assigned to the X-axis and Y-axis, respectively.

Next, add the aesthetic color = and assign it dynamically to the column age_cat.

r fontawesome::fa("lightbulb", fill = "gold") Click to read a hint

Color of points is adjusted with the color = aesthetic (fill is for shapes like bars and histograms). Be sure to place this assignment within the mapping = aes()


r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

This code has been written in a "long" style. You may have different indentation preferences.

ggplot(
  data = surv,
  mapping = aes(
    x = age_years,       # X-axis: age in years
    y = ht_cm,           # Y-axis: height in cm
    color = age_cat      # point color varies by age category
  )
) +
  geom_point()


This looks pretty, but is not actually that useful. Yet, it does serve to show that the age categories are aligning with the continuous values.

What if you dynamically assign the aesthetic color = to the column sex? Are you able to see any pattern in the data? It is still difficult.

ggplot extension packages

Try this advanced R code below. There is a package {ggExtra} with several functions that add dimensions to ggplots.

Add this package to your pacman command, and re-run so that this package installs and loads.

Now, run the command below:

# save the scatter plot as an object instead of printing it
height_plot <- ggplot(
  data = surv,
  mapping = aes(x = age_years, y = ht_cm, color = sex)
) +
  geom_point()

# pass the saved plot object to ggMarginal()
ggMarginal(height_plot)

What is happening?

1) We have assigned the ggplot command a name (height_plot) and saved it as an object. Look, it appears in the Environment now as an object that you can reference in other commands.

2) The second command uses the {ggExtra} function ggMarginal(). This draws distributions in the "margins" (outsides) of the plot.
* The first argument of ggMarginal() accepts the name of our plot object, height_plot, and prints it with the marginal distributions.

Now run the command again, but with the groupFill = TRUE argument specified.

## re-draw the saved plot with the groupFill argument switched on
ggMarginal(height_plot, groupFill = TRUE)

## quiz on the scatterplot and ggMarginal() commands
## (TRUE is spelled out because T is an ordinary, reassignable variable;
##  a stray apostrophe in one answer text is also removed)
quiz(caption = "Quiz - Scatterplots",
  question("In the ggMarginal command, the term 'height_plot' is referring to what exactly?",
    allow_retry = TRUE,
    answer("A setting that determines how tall the plot output is"),
    answer("It ensures that the plot has height"),
    answer("height_plot is the name of a ggplot saved in a prior command", correct = TRUE),
    answer("This is an argument that does not need to be specified")
  ),
  question("What do the grouped marginal distributions show about the heights of male and female cases?",
    allow_retry = TRUE,
    answer("That the male cases are generally taller than the female cases", correct = TRUE),
    answer("That at a given age, male cases are taller than female cases", message = "No, we do not know this from these plots. We only know the overall distribution of the data."),
    answer("The cases are more likely to be male than female", message = "No, the distributions do not tell us anything about whether cases are more likely to be male or female.")
  ),
  question("What is ggMarginal()?",
    allow_retry = TRUE,
    answer("A function of the {ggplot2} package"),
    answer("A function of the package {ggExtra} that can be used in coordination with {ggplot2} functions", correct = TRUE),
    answer("A {base} R function")
  )
)

Add smoothed means to a scatterplot

Finally, try another scatterplot using date_report on the X-axis and diff on the Y-axis. Add geom_smooth() to see a trend line.

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

# points for date_report against diff, with a geom_smooth() layer on top
ggplot(data = surv, mapping = aes(x = date_report, y = diff)) +
  geom_point() +
  geom_smooth()


Return to the Help RStudio pane (near the plots pane in the lower-right). Type geom_smooth in the search bar. Read the "Details" section to understand exactly what geom_smooth() does to produce its line and confidence interval.

Histogram

Histograms are used to show the distribution of continuous variables. If you have a column with numbers or dates, it is better to use geom_histogram() - not geom_bar() or geom_col(). This assumes that your data are in a linelist format (one row per observation).

Basic histogram syntax

We will cover the nuances of epidemic curves ("epicurves") in depth in a later module, but for now:

Make a basic histogram of the onset dates by assigning the X-axis to the column date_onset. Change the fill color of the bars from the default to "darkgreen".

r fontawesome::fa("lightbulb", fill = "gold") Click to read a hint

For geom_histogram(), the inside color of the "bars" is set by the fill = aesthetic. The bar outline is set by color =.

If assigning the fill or color to a static value like "purple", this assignment should be made outside of aes(), within geom_histogram().


r fontawesome::fa("check", fill = "red") Click to see a solution (try it yourself first!)

# Histogram of onset dates; the fill is a static value, so it is set
# inside geom_histogram() rather than inside aes()
ggplot(
  data = surv,
  mapping = aes(x = date_onset)
) +
  geom_histogram(fill = "darkgreen")


Warnings and Errors

Look in your R console, where the command you ran is printed. Do you see a warning, written in red?

In R, a "warning" is given if R was able to execute your command, but it encountered something unexpected that it thinks you should know.

In contrast, an "error" means that R was unable to execute your command.

# Quiz on the "Removed XX rows containing non-finite values" warning.
# allow_retry is spelled TRUE (not T) because T is an ordinary variable that
# can be reassigned, whereas TRUE is a reserved constant.
quiz(
  caption = "Quiz - histogram warning",
  question(
    "What does this warning mean (Removed XX rows containing non-finite values)",
    allow_retry = TRUE,
    answer("That those cases had an infinitely bad sense of humor"),
    answer("Those cases had date values outside the range of the plot", message = "We have not set any limits to the plot axis, so R will display all values possible."),
    answer("That those cases had missing date_onset values and so could not be plotted.", correct = TRUE),
    answer("That ggplot just removed those rows from your linelist data frame", message = "ggplot will not alter the contents of your dataset.")
  )
)

Stacked histograms

Now try making a histogram for the column age_years, with fill = assigned dynamically to sex.

r fontawesome::fa("check", fill = "red") Click to see a solution (try it yourself first!)

# Histogram of age_years; fill is mapped to sex inside aes(),
# so the bar colors vary with the data
ggplot(
  data = surv,
  mapping = aes(x = age_years, fill = sex)
) +
  geom_histogram()


You now have "stacked" histograms. These can be difficult to interpret sometimes, but are a useful tool to have.

Histogram breaks

Histograms show "bars", but there is no space between the bars. This is because the variable is continuous. Still, the size and frequency of the "histogram bins" can be adjusted.

Try running the histogram with the following adjustments (one at a time):

r fontawesome::fa("check", fill = "red") Click to see a solution (try it yourself first!)

# The same age-by-sex histogram drawn three times; only the bin setting differs

# Variation 1: bins = 100
ggplot(data = surv, mapping = aes(x = age_years, fill = sex)) +
  geom_histogram(bins = 100)

# Variation 2: bins = 5
ggplot(data = surv, mapping = aes(x = age_years, fill = sex)) +
  geom_histogram(bins = 5)

# Variation 3: binwidth = 5 (sets the width of each bin instead of the count)
ggplot(data = surv, mapping = aes(x = age_years, fill = sex)) +
  geom_histogram(binwidth = 5)



Consider the following:

These are all questions that epidemiologists and public health workers must ask themselves when looking at an epidemic curve.

In the coming module on public health plots, we will discuss the intricacies of making epidemic curves that start on specific dates and that use bins of specific widths (e.g. Monday weeks, Sunday weeks, etc.)

Facets

A stacked histogram can be difficult to interpret. Faceting in R is the process of creating one smaller plot for each unique level of a variable (this is also called "small-multiples").

This is done by adding the command facet_wrap() to the ggplot command. Within this function, write a tilde ~ and the name of the faceting column on the right side.

Try to facet this histogram by sex (leave fill = sex as well).

# Starting point for the faceting exercise: onset-date histogram with
# fill mapped to sex, not yet faceted
ggplot(
  data = surv,
  mapping = aes(x = date_onset, fill = sex)
) +
  geom_histogram()

r fontawesome::fa("check", fill = "red") Click to see a solution (try it yourself first!)

# Onset-date histogram with fill mapped to sex, faceted by sex
ggplot(
  data = surv,
  mapping = aes(x = date_onset, fill = sex)
) +
  geom_histogram() +
  facet_wrap(~ sex)


Now try another plot: instead of sex, set the plot to be faceted by district. You can choose to remove the fill = sex, for clarity. It will not have an impact on the faceting.

# Quiz on reading the histogram faceted by district.
# allow_retry is spelled TRUE (not T) because T is an ordinary variable that
# can be reassigned, whereas TRUE is a reserved constant.
quiz(
  caption = "Quiz - facets",
  question(
    "Which district had the earliest cases?",
    allow_retry = TRUE,
    answer("West I"),
    answer("East II"),
    answer("West III", correct = TRUE),
    answer("Central I")
  )
)

You can read more about faceting in this section of the Epi R Handbook.

End

Congratulations! ggplot is a difficult subject, but a tool that offers many rewards once you understand the basics. We will reinforce this more in the coming days.

Extra - gghighlight

Just like {ggExtra}, the package {gghighlight} is an extension that works with {ggplot2} to add functionality to your plots.

Many R packages have "vignettes" to assist people using the package. Go online and view this {gghighlight} vignette to answer the questions below:

# Quiz on the {gghighlight} vignette: three multiple-choice questions.
# allow_retry is spelled TRUE (not T) because T is an ordinary variable that
# can be reassigned, whereas TRUE is a reserved constant.
quiz(
  caption = "Quiz - gghighlight",
  # Date shown on the vignette
  question(
    "When was this vignette updated?",
    allow_retry = TRUE,
    answer("2021-01-01"),
    answer("1999-04-03"),
    answer("2023-12-16", correct = TRUE),
    answer("2022-03-12")
  ),
  # The dplyr verb the author compares the package to
  question(
    "The author believes that this package is the equivalent of which function, but for ggplot2?",
    allow_retry = TRUE,
    answer("select()", message = "Look at the bottom of the Motivation section"),
    answer("filter()", correct = TRUE),
    answer("mutate()", message = "Look at the bottom of the Motivation section"),
    answer(" %>% ", message = "Look at the bottom of the Motivation section")
  ),
  # The package's main function
  question(
    "What is the main function of the {gghighlight} package?",
    allow_retry = TRUE,
    answer("flair()", message = "Look at the section called gghighlight()"),
    answer("stark()", message = "Look at the section called gghighlight()"),
    answer("gghighlight()", correct = TRUE),
    answer("highlighter()", message = "Look at the section called gghighlight()")
  )
)

Now, make a histogram using the following settings:

Finally, make a variation of this plot:

r fontawesome::fa("check", fill = "red") Click to see a solution (try it yourself first!)

# Plot 1 ----
# Onset-date histogram filled and faceted by district, with gghighlight()
# added and no condition supplied
ggplot(
  data = surv,
  mapping = aes(x = date_onset, fill = district)
) +
  geom_histogram() +
  facet_wrap(~ district) +
  gghighlight()


# Plot 2 ----
# Same histogram without facets; gghighlight() is given the condition
# district == "West III"
ggplot(
  data = surv,
  mapping = aes(x = date_onset, fill = district)
) +
  geom_histogram() +
  gghighlight(district == "West III")




appliedepi/introexercises documentation built on April 22, 2024, 1:01 a.m.