# load packages ----------------------------------------------------------------
library(introexercises)
library(learnr)
library(gradethis)
library(dplyr)
library(flair)
library(ggplot2)
library(lubridate)
library(fontawesome)
library(tidyr)
library(forcats)
library(janitor)
library(kableExtra)
# library(RMariaDB)        # connect to sql database

## set options for exercises and checking ---------------------------------------

## Define how exercises are evaluated
gradethis::gradethis_setup(
  ## note: the below arguments are passed to learnr::tutorial_options
  ## set the maximum execution time limit in seconds
  exercise.timelimit = 60,
  ## set how exercises should be checked (defaults to NULL - individually defined)
  # exercise.checker = gradethis::grade_learnr
  ## set whether to pre-evaluate exercises (so users see answers)
  exercise.eval = FALSE
)

# ## event recorder ---------------------------------------------------------------
# ## The recorder below is disabled. It is kept as a template only.
# ## see for details:
# ## https://pkgs.rstudio.com/learnr/articles/publishing.html#events
# ## https://github.com/dtkaplan/submitr/blob/master/R/make_a_recorder.R
#
# ## NOTE(review): the query below is assembled by pasting values straight into
# ## the SQL text. Before re-enabling, switch to a parameterised statement
# ## because question/answer/code come from learners.
#
# ## connect to your sql database
# sqldtbase <- dbConnect(RMariaDB::MariaDB(),
#                        user = Sys.getenv("userid"),
#                        password = Sys.getenv("pwd"),
#                        dbname = 'excersize_log',
#                        host = "144.126.246.140")
#
#
# ## define a function to collect data
# ## note that tutorial_id is defined in YAML
# ## you could set the tutorial_version too (by specifying version:) but use package version instead
# recorder_function <- function(tutorial_id, tutorial_version, user_id, event, data) {
#
#   ## define a sql query
#   ## first bracket defines variable names
#   ## values bracket defines what goes in each variable
#   event_log <- paste("INSERT INTO responses (
#                        tutorial_id,
#                        tutorial_version,
#                        date_time,
#                        user_id,
#                        event,
#                        section,
#                        label,
#                        question,
#                        answer,
#                        code,
#                        correct)
#                        VALUES('", tutorial_id, "',
#                        '", tutorial_version, "',
#                        '", format(Sys.time(), "%Y-%m-%d %H:%M:%S %Z"), "',
#                        '", Sys.getenv("SHINYPROXY_PROXY_ID"), "',
#                        '", event, "',
#                        '", data$section, "',
#                        '", data$label, "',
#                        '", paste0('"', data$question, '"'), "',
#                        '", paste0('"', data$answer, '"'), "',
#                        '", paste0('"', data$code, '"'), "',
#                        '", data$correct, "')",
#                        sep = '')
#
#   # Execute the query on the sqldtbase that we connected to above
#   rsInsert <- dbSendQuery(sqldtbase, event_log)
#
# }
#
# options(tutorial.event_recorder = recorder_function)
# hide non-exercise code chunks ------------------------------------------------
knitr::opts_chunk$set(echo = FALSE)

# Data prep --------------------------------------------------------------------

# Read the combined linelist that is bundled with the {introexercises} package
combined <- rio::import(
  system.file(
    "dat", "linelist_combined_20141201.rds",
    package = "introexercises"
  )
)
# hide non-exercise code chunks ------------------------------------------------
# Sets echo = FALSE as the default knitr chunk option
knitr::opts_chunk$set(echo = FALSE)
Welcome to the course "Introduction to R for applied epidemiology", offered by Applied Epi - a nonprofit organisation and the leading provider of R training, support, and tools to frontline public health practitioners.
# Show the course logo. `error = FALSE` is spelled out in full because the
# shorthand `F` is an ordinary variable that can be reassigned.
knitr::include_graphics("images/logo.png", error = FALSE)
This exercise focuses on pivoting columns within data frames from wide-to-long, and introduces the column class "factor".
This exercise guides you through tasks that you should perform in RStudio on your local computer.
There are several ways to get help:
1) Look for the "helpers" (see below)
2) Ask your live course instructor/facilitator for help
3) Schedule a 1-on-1 call with an instructor for "Course Tutoring"
4) Post a question in Applied Epi Community
Here is what those "helpers" will look like:
r fontawesome::fa("lightbulb", fill = "gold")
Click to read a hintHere you will see a helpful hint!
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)

# Keep rows where age is over 25 and the district is "Bolo"
linelist %>%
  filter(
    age > 25,
    district == "Bolo"
  )
Here is more explanation about why the solution works.
Answering quiz questions will help you to comprehend the material. The answers are not recorded.
To practice, please answer the following questions:
# Practice quiz: when to open the red "solution" helper
quiz(
  question_radio(
    "When should I view the red 'helper' code?",
    answer("After trying to write the code myself", correct = TRUE),
    answer("Before I try coding"),   # `correct` defaults to FALSE
    correct = "Reviewing best-practice code after trying to write yourself can help you improve",
    incorrect = "Please attempt the exercise yourself, or use the hint, before viewing the answer."
  )
)
# Self-assessment question: every value from 1 to 10 is accepted as "correct";
# only the feedback message differs by how anxious the learner reports being.
question_numeric(
  "How anxious are you about beginning this tutorial - on a scale from 1 (least anxious) to 10 (most anxious)?",
  answer(10, message = "Try not to worry, we will help you succeed!", correct = TRUE),
  answer(9,  message = "Try not to worry, we will help you succeed!", correct = TRUE),
  answer(8,  message = "Try not to worry, we will help you succeed!", correct = TRUE),
  answer(7,  message = "Try not to worry, we will help you succeed!", correct = TRUE),
  answer(6,  message = "Ok, we will get there together", correct = TRUE),
  answer(5,  message = "Ok, we will get there together", correct = TRUE),
  answer(4,  message = "I like your confidence!", correct = TRUE),
  answer(3,  message = "I like your confidence!", correct = TRUE),
  answer(2,  message = "I like your confidence!", correct = TRUE),
  answer(1,  message = "I like your confidence!", correct = TRUE),
  allow_retry = TRUE,
  correct = "Thanks for sharing. ",
  min = 1,
  max = 10,
  step = 1
)
Please email contact@appliedepi.org with questions about the use of these materials.
In this exercise you will:
The code from this exercise is not vital to future exercises. If you are tired, you can simply read through the exercise, copy/paste the code, and absorb the material.
This exercise uses the combined
data frame that was created in the previous exercise on "Joining data".
If you did not complete that exercise, or are seeing errors when trying to use combined
, you can import and use a "backup" combined
data frame from the "data/clean/backup/" folder by adding this command to your import code chunk:
# Import the backup copy of the combined linelist from data/clean/backup/
combined <- import(
  here("data", "clean", "backup", "linelist_combined_20141201.rds")
)
Add a new code chunk for "Patient Timelines", near the bottom, following the "Spotlight analysis" section of your R Markdown script.
You can add a section heading in the code chunk for clarity:
# Patient timelines - pivoting exercise
Now that we have the combined
data frame, we can produce a more complete picture of each patient's journey through the health system. We have information on date_infection
, date_onset
, date_report
, date_hospitalisation
, and date_outcome
.
Let's create a small data frame and plot to examine the timelines of 5 patients. We will create the following figure. Each case has its own row, and the milestone dates are visualised by points of varying color and shape.
# Pivoting - patient timelines ----------------------------------------

# The five cases with the earliest onset, reduced to case_id + date columns
timelines <- combined %>%
  arrange(date_onset) %>%
  head(5) %>%
  select(case_id, starts_with("date"))

# Long format: one row per case and date type; date_type becomes a factor
# whose levels follow the order listed in fct_relevel()
timelines_long <- timelines %>%
  pivot_longer(
    cols      = starts_with("date"),
    names_to  = "date_type",
    values_to = "date"
  ) %>%
  mutate(
    date_type = fct_relevel(
      date_type,
      "date_infection",
      "date_onset",
      "date_report",
      "date_hospitalisation",
      "date_outcome"
    )
  )

# Timeline figure: points coloured/shaped by date type, one line per case
ggplot(
  data = timelines_long,
  mapping = aes(
    x = date,
    y = case_id,
    color = date_type,
    shape = date_type,
    group = case_id
  )
) +
  geom_point(size = 4) +
  geom_line() +
  theme_minimal()
Let's build this plot together, and along the way learn about pivoting longer and about factors.
First, we reduce the dataset in the following ways:
case_id
and any column that begins with "date" Add this code to your "Pivoting - Patient Timelines" code chunk, highlight and run the code.
# Build a small data frame of five patients and their date columns
timelines <- combined %>%
  # sort so that the earliest onset dates come first
  arrange(date_onset) %>%
  # keep only the first 5 rows
  head(5) %>%
  # keep the identifier plus every column whose name starts with "date"
  select(case_id, starts_with("date"))
Notice we can use the "tidyselect" helper function starts_with()
to refer to all of the date columns at once.
Let's look at this new dataset. Run a command timelines
in your Testing area, or directly in the Console, to view the records.
# Print the wide-format timelines data frame
timelines
Anticipate plotting this data with ggplot()
. As you know, ggplot()
with geom_point()
will ask for column names to use for mapping to the axes (x =
and y =
).
# Quiz: the wide data frame has no single column that can serve as the x-axis.
# (Question text corrected: "its" rather than "it's", and a single "?".)
quiz(
  question(
    "In its current form, which column would be assigned to the X-axis to create the plot?",
    answer("date_onset", message = "This will not work because in the plot, the date axis reflects all the different date types"),
    answer("date_outcome", message = "This will not work because in the plot, the date axis reflects all the different date types"),
    answer("date_infection", message = "This will not work because in the plot, the date axis reflects all the different date types"),
    answer("date", message = "This is not a column in the current dataset."),
    answer("Not possible in current format", correct = TRUE, message = "Yes, the dataset must be transformed."),
    allow_retry = TRUE
  )
)
To use this dataset in ggplot()
we need to transform or "pivot" the columns into "long" format. This will result in a new data frame called timelines_long
, with only 3 columns:
case_id
date_type
(a new column with values like "date_infection" and "date_report" - the current column names) date
(the actual date values, all in one column) To do this, we will use pivot_longer()
to collect all of the date columns and pivot their values into just those two new columns (date_type
and date
).
At its most minimal, the function needs only the argument cols =
, which should be provided with a vector of the columns to pivot (in this case, the date columns).
Thankfully, we can reference all the "date" columns with the helper starts_with("date")
. In other circumstances you might list column names within a vector c()
.
Add the following code to your "pivoting" code chunk.
# Pivot dates longer
# Only `cols =` is given, so the two new columns get pivot_longer()'s default names
timelines_long <- timelines %>%
  pivot_longer(
    cols = starts_with("date")
  )
See what this data set looks like now:
# Print the pivoted (long-format) data frame
timelines_long
Notice the following things:
name
". These are now character values. value
". case_id
- once for each possible date type.

# Quiz: effect of pivoting longer on the data frame's dimensions
# (feedback message grammar corrected: "there are now 5x as many rows")
quiz(
  question(
    "How did the dimensions of the data frame change?",
    answer("The pivoted data frame is the same as the old."),
    answer("The pivoted data frame has more columns"),
    answer("The pivoted data frame has more columns, but fewer rows"),
    answer("The pivoted data frame has fewer columns, but more rows", correct = TRUE, message = "Yes, since there were 5 date columns pivoted, there are now 5x as many rows as before. All the 5 date columns have been collapsed into 2 columns."),
    allow_retry = TRUE
  )
)
If you want, you can re-run the pivoting command and add these arguments, which allow you to change these default names for the two new columns:
names_to =
(try "date_type") values_to =
(try "date") Update your pivot_longer()
command in the "Pivoting" code chunk to this:
# Pivot dates longer
# names_to / values_to replace the default names of the two new columns
timelines_long <- timelines %>%
  pivot_longer(
    cols      = starts_with("date"),
    names_to  = "date_type",
    values_to = "date"
  )
See below how the column names of the timelines_long
data frame have been updated:
# Print timelines_long again to check the column names
timelines_long
What happens if we make the ggplot right now, using the dataset timelines_long
?
# Quiz: which columns of the long data frame map to the x- and y-axes
quiz(
  # x-axis
  question(
    "Which column in the pivoted data frame will be mapped to the X-axis?",
    answer("case_id", message = "No, this discrete column will be on the Y-axis."),
    answer("date", correct = TRUE, message = "Yes, this column is continuous date values."),
    answer("date_type", message = "No, this column contains discrete character values. It will be used as color for the points and lines."),
    allow_retry = TRUE
  ),
  # y-axis
  question(
    "Which column in the pivoted data frame will be mapped to the Y-axis?",
    answer("case_id", correct = TRUE, message = "Yes, these are discrete character values."),
    answer("date", message = "No, this column is mapped to the X-axis."),   # `correct` defaults to FALSE
    answer("date_type", message = "No, this column contains discrete character values. It will be used as color for the points and lines."),
    allow_retry = TRUE
  )
)
Add this code to your Pivoting code chunk.
# create plot of patient timelines
ggplot(data = timelines_long,   # use the long dataset
       mapping = aes(
         x = date,              # dates of all types displayed along the x-axis
         y = case_id,           # case_id are discrete, character values
         color = date_type,     # color of the points
         shape = date_type,     # shape of the points
         group = case_id))+     # one group per case, so geom_line() connects the points of each case
  geom_point(size = 4)+         # show points
  geom_line()+                  # show lines
  theme_minimal()
The points and lines are there, but are they in a sensible order in the legend?
If a variable has an inherent order, we might call it an "ordinal" variable. Think if the values in a column were "first", "second", or "third". We would want them to appear in a plot in a specific order.
In R, these types of variables should be converted to the class "factor". A factor has "levels", such that the values are ordered (first, second, third, fourth, etc.).
In this case, the expected ordering would be:
1) "date_infection"
2) "date_onset"
3) "date_report"
4) "date_hospitalisation"
5) "date_outcome"
Of course, some patients may be hospitalised before they are reported, but generally let's say that this is the order that we want to embed in the variable.
What is the current class of date_type
?
# Check the class of the date_type column
class(timelines_long$date_type)
It is not a factor. The character values have no inherent ordering. By default they will appear alphabetically.
We can change this using fct_relevel()
from the {forcats} package. {forcats} is part of the tidyverse. This function converts the column to class "factor" and gives you the opportunity to set the desired order.
Below, we add a mutate()
step to the pipe chain that re-defines this new column, and then lists the values in the order we want. Update your pivot_longer()
command in the "Pivoting" code chunk to include the mutate()
step below:
# Pivot dates longer
timelines_long <- timelines %>%

  # step 1: pivot the date columns into date_type / date
  pivot_longer(
    cols      = starts_with("date"),
    names_to  = "date_type",
    values_to = "date"
  ) %>%

  # step 2: make date_type a factor, with levels in the order listed here
  mutate(
    date_type = fct_relevel(
      date_type,
      "date_infection",
      "date_onset",
      "date_report",
      "date_hospitalisation",
      "date_outcome"
    )
  )
The class of the column date_type
is now "factor".
# Check the class of date_type again, after the mutate() step
class(timelines_long$date_type)
The column date_type
now has "levels".
# List the levels of date_type
levels(timelines_long$date_type)
After re-running the pipe chain above (with the mutate()
function added), we try the ggplot again - see how the ordering has changed (look at the legend):
# Re-draw the timeline plot now that date_type is a factor.
# The data frame is supplied once, through `data =`. Previously it was also
# piped in, so the piped copy landed in ggplot()'s `...` as a stray argument.
ggplot(
  data = timelines_long,
  mapping = aes(
    x = date,
    y = case_id,
    color = date_type,
    shape = date_type,
    group = case_id
  )
) +
  geom_point(size = 4) +
  geom_line() +
  theme_minimal()
# Quiz: spot the case whose outcome date looks wrong in the plot
quiz(
  question(
    "Which case seems to have an error in date_outcome?",
    answer("dce5cc"),
    answer("9d4019"),
    answer("974bc1"),
    answer("76b97a"),
    answer(
      "2ae019",
      correct = TRUE,
      message = "Yes, The recorded date of outcome is prior to the recorded date of onset."
    ),
    allow_retry = TRUE
  )
)
There are many other {forcats} functions to handle factors, see this chapter of the Epi R Handbook.
One {forcats} function that is worth showing you is fct_lump()
. This function will aggregate together values in a column into an "Other" category based on frequency.
See this epidemic curve - because the column district
is assigned to the aesthetic fill =
, it shows every district in the legend. This is quite overwhelming and difficult to interpret! Add this in a chunk to your plotting area and run the command:
# Epidemic curve by onset date, with bar fill mapped to district
ggplot(
  data = combined,
  mapping = aes(
    x = date_onset,
    fill = district
  )
) +
  geom_histogram(binwidth = 7)
We can use fct_lump()
and its variations like fct_lump_n()
to reduce the number of districts that are shown in the plot:
fct_lump_n()
shows only the top "n" values (by counts), with all remaining put in "Other" fct_lump_prop()
shows only those that exceed n proportion of rows, with all remaining in "Other"Below, we wrap district
within fct_lump_n()
and specify that we want to keep only the 3 most-common districts. Update your code to use fct_lump_n()
for the fill =
argument, as seen below:
# Epidemic curve with district wrapped in fct_lump_n(district, 3) for the fill
ggplot(
  data = combined,
  mapping = aes(
    x = date_onset,
    fill = fct_lump_n(district, 3)
  )
) +
  geom_histogram(binwidth = 7) +
  labs(fill = "District")   # set the legend title
Note that applying this function within the ggplot()
does not change the underlying district
data. The data are lumped only for this plot. If you want to lump the underlying data you can do that with mutate()
in a cleaning pipe.
We will not focus on pivoting wider in this exercise, as it is less common. However, know that if you need to pivot data wider you can find good examples in these two chapters of the Epi R Handbook:
Congratulations! You are done with this exercise. You have made case timelines, practiced pivoting data longer, and used some functions to handle factors!
If you want to learn a bit more about pivoting, you can go on to the next extra topic.
This is a demonstration of how to fix pivoting longer of columns of multiple classes. When we say multi-class, we mean we would like to pivot multiple columns (from wide to long format) that include different classes in each column.
It is a common occurrence for datasets that track cases or people over time. Our example will be of daily follow-up checks on the health of contacts of cases (people exposed to cases).
First, open a new classic format R script. Save it in your "ebola" project in the sub-folder "scripts" as "pivot_multiclass_example.R". Add a description at the top.
############################################# # Pivoting multi-class example # Bonus section # Your NAME here #############################################
Next, load the {tidyverse} packages in a "Load packages" section.
# Load packages ----------------------------------------------------------------
# p_load() is called with the pacman:: prefix
pacman::p_load(tidyverse)
We will not be using our surv
data for this example, but rather a simple dataset we create ourselves using R code.
Add a "Create data" section to your script and run the code below in it. This code creates a mini data frame in the object followup
. You should have 7 columns and 3 rows present.
The tribble()
function allows you to create a data frame by providing row and column values in a standard table layout. The top row of values (with tildes in front) is the column names.
# Create demo data ----------------------------------------------------------------

# One row per contact (id), with a date/status pair of columns for each of 3 days
followup <- tribble(
  ~id, ~day1_date,   ~day1_status, ~day2_date,   ~day2_status, ~day3_date,   ~day3_status,
  "A", "2022-04-01", "Healthy",    "2022-04-02", "Healthy",    "2022-04-03", "Sick",
  "B", "2022-04-01", "Healthy",    "2022-04-02", "Sick",       "2022-04-03", "Sick",
  "C", "2022-04-01", "Healthy",    "2022-04-02", "Healthy",    "2022-04-03", "Sick"
)

# review the dataset
followup
Note how the columns alternate between character values like "Healthy" and date values.
We would like to pivot the data frame longer, so that there are 4 columns:
id
containing the patient id (e.g. A, B, C)Day
containing the day number (e.g. day 1, day 2, day 3)Date
containing the date (e.g. 22-04-01, 22-04-02, 22-04-03)Status
containing the status (e.g. Healthy, Sick)First, try to pivot the data frame simply using pivot_longer()
. Pivot all columns except id
. Use names_to =
and values_to =
arguments to adjust the new column names.
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)

# Pivoting the data ----------------------------------------------------------------

# initial pivot: every column except id goes into a single name/value pair,
# so dates and status values end up together in one column
followup %>%
  pivot_longer(
    cols      = -id,
    names_to  = "Day",
    values_to = "Status"
  )
As you will notice, this creates only 3 columns, not 4!
What do you notice about the Status
column? It has both date and status values inside, and is class character. This is not helpful for analysis.
We must pivot the data frame differently, so that these values are in separate columns with the correct class, respectively.
To prevent this situation of mixed classes, take advantage of the syntax structure of the original column names.
There is a common naming structure, with the observation number, an underscore, and then either “status” or “date”. We can leverage this syntax to keep these two data types in separate columns after the pivot.
We do this by:
names_to =
argument, with the second item being (".value" ). This special term indicates that the pivoted columns will be split based on a character in their name... names_sep =
argument. In this case, it is the underscore “_“. Thus, the naming and split of new columns is based around the underscore in the existing variable names.
Update your command as below, and review the result.
# correct pivot
# Column names are split at the underscore: the first part goes into "Day";
# the special term ".value" turns the second part into output column names.
followup %>%
  pivot_longer(
    cols      = -id,
    names_to  = c("Day", ".value"),
    names_sep = "_"
  )
Now we have our 4 desired columns.
This data frame is "long" and "tidy". We can pipe it into a ggplot()
command to visualise how the health of the contacts changed over time. Note the intermediate step that ensures the date
column has the correct Date class.
# Pivot data, then plot each contact's status over time
followup %>%
  pivot_longer(
    cols      = -id,
    names_to  = c("Day", ".value"),
    names_sep = "_"
  ) %>%
  # the dates were entered as text in tribble(); parse them with ymd()
  mutate(date = ymd(date)) %>%
  ggplot(
    mapping = aes(
      x = date,
      y = id,
      color = status,
      shape = status,
      group = id
    )
  ) +
  geom_line() +
  geom_point() +
  scale_color_manual(
    values = c(
      "Healthy" = "darkgreen",
      "Sick"    = "red"
    )
  ) +
  labs(
    title = "Follow-up of patients over time",
    y     = "Patient ID",
    x     = "Date",
    color = "Status",
    shape = "Status"
  )
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.