In appliedepi/introexercises:

# load packages ----------------------------------------------------------------
library(introexercises)
library(learnr)
library(gradethis)
library(dplyr)
library(flair)
library(ggplot2)
library(lubridate)
library(fontawesome)
library(tidyr)
library(forcats)
library(janitor)
library(kableExtra)
# library(RMariaDB)        # connect to sql database 

## set options for exercises and checking ---------------------------------------


## Define how exercises are evaluated and checked (gradethis setup)
gradethis::gradethis_setup(
  ## note: the below arguments are passed to learnr::tutorial_options
  ## maximum execution time allowed for an exercise, in seconds
  exercise.timelimit = 60, 
  ## how exercises should be checked (defaults to NULL - individually defined);
  ## left commented out, so no checker is set here
  # exercise.checker = gradethis::grade_learnr
  ## whether to pre-evaluate exercises (so users see answers); turned off here
  exercise.eval = FALSE 
)

# ## event recorder ---------------------------------------------------------------
# ## see for details: 
# ## https://pkgs.rstudio.com/learnr/articles/publishing.html#events
# ## https://github.com/dtkaplan/submitr/blob/master/R/make_a_recorder.R
# 
# ## connect to your sql database
# sqldtbase <- dbConnect(RMariaDB::MariaDB(),
#                        user     = Sys.getenv("userid"),
#                        password = Sys.getenv("pwd"),
#                        dbname   = 'excersize_log',
#                        host     = "144.126.246.140")
# 
# 
# ## define a function to collect data 
# ## note that tutorial_id is defined in YAML
#     ## you could set the tutorial_version too (by specifying version:) but use package version instead 
# recorder_function <- function(tutorial_id, tutorial_version, user_id, event, data) {
#     
#   ## define a sql query 
#   ## first bracket defines variable names
#   ## values bracket defines what goes in each variable
#   event_log <- paste("INSERT INTO responses (
#                        tutorial_id, 
#                        tutorial_version, 
#                        date_time, 
#                        user_id, 
#                        event, 
#                        section,
#                        label, 
#                        question, 
#                        answer, 
#                        code, 
#                        correct)
#                        VALUES('", tutorial_id,  "', 
#                        '", tutorial_version, "', 
#                        '", format(Sys.time(), "%Y-%m-%d %H:%M:%S %Z"), "',
#                        '", Sys.getenv("SHINYPROXY_PROXY_ID"), "',
#                        '", event, "',
#                        '", data$section, "',
#                        '", data$label,  "',
#                        '", paste0('"', data$question, '"'),  "',
#                        '", paste0('"', data$answer,   '"'),  "',
#                        '", paste0('"', data$code,     '"'),  "',
#                        '", data$correct, "')",
#                        sep = '')
# 
#     # Execute the query on the sqldtbase that we connected to above
#     rsInsert <- dbSendQuery(sqldtbase, event_log)
#   
# }
# 
# options(tutorial.event_recorder = recorder_function)

# hide non-exercise code chunks ------------------------------------------------
# set once; the option applies to every later chunk that does not override echo
knitr::opts_chunk$set(echo = FALSE)


# Data prep --------------------------------------------------------------------
# Import the combined linelist that ships inside the {introexercises} package
combined <- rio::import(
  system.file("dat/linelist_combined_20141201.rds", package = "introexercises")
)

Introduction to R for Applied Epidemiology and Public Health

Welcome

Welcome to the course "Introduction to R for applied epidemiology", offered by Applied Epi - a nonprofit organisation and the leading provider of R training, support, and tools to frontline public health practitioners.

# show the logo; error = FALSE means a missing image file does not stop rendering
knitr::include_graphics("images/logo.png", error = FALSE)

Pivoting data

This exercise focuses on pivoting columns within data frames from wide-to-long, and introduces the column class "factor".

Format

This exercise guides you through tasks that you should perform in RStudio on your local computer.

Getting Help

There are several ways to get help:

1) Look for the "helpers" (see below)
2) Ask your live course instructor/facilitator for help
3) Schedule a 1-on-1 call with an instructor for "Course Tutoring"
4) Post a question in Applied Epi Community

Here is what those "helpers" will look like:

`r fontawesome::fa("lightbulb", fill = "gold")` Click to read a hint

Here you will see a helpful hint!

`r fontawesome::fa("check", fill = "red")` Click to see a solution (try it yourself first!)

# example solution: keep only rows where age is over 25 and district is "Bolo"
linelist %>% 
  filter(
    age > 25,
    district == "Bolo"
  )

Here is more explanation about why the solution works.

Quiz questions

Answering quiz questions will help you to comprehend the material. The answers are not recorded.

To practice, please answer the following questions:

# practice quiz: one single-choice question, with feedback text for
# correct and incorrect answers
quiz(
  question_radio("When should I view the red 'helper' code?",
    answer("After trying to write the code myself", correct = TRUE),
    answer("Before I try coding", correct = FALSE),
    correct = "Reviewing best-practice code after trying to write yourself can help you improve",
    incorrect = "Please attempt the exercise yourself, or use the hint, before viewing the answer."
  )
)

# self-assessment question: every value from 1 to 10 is accepted as correct,
# only the feedback message changes with the value entered
question_numeric(
  "How anxious are you about beginning this tutorial - on a scale from 1 (least anxious) to 10 (most anxious)?",
  answer(10, message = "Try not to worry, we will help you succeed!", correct = TRUE),
  answer(9, message = "Try not to worry, we will help you succeed!", correct = TRUE),
  answer(8, message = "Try not to worry, we will help you succeed!", correct = TRUE),
  answer(7, message = "Try not to worry, we will help you succeed!", correct = TRUE),
  answer(6, message = "Ok, we will get there together", correct = TRUE),
  answer(5, message = "Ok, we will get there together", correct = TRUE),
  answer(4, message = "I like your confidence!", correct = TRUE),
  answer(3, message = "I like your confidence!", correct = TRUE),
  answer(2, message = "I like your confidence!", correct = TRUE),
  answer(1, message = "I like your confidence!", correct = TRUE),
  allow_retry = TRUE,
  correct = "Thanks for sharing. ",
  min = 1,
  max = 10,
  step = 1
)

License

Please email contact@appliedepi.org with questions about the use of these materials.

Learning objectives

In this exercise you will:

Practice pivoting a dataset from wide to long
See the class "factor" applied to provide order to a column's values

The code from this exercise is not vital to future exercises. If you are tired, you can simply read through the exercise, copy/paste the code, and absorb the material.

Preparation

This exercise uses the combined data frame that was created in the previous exercise on "Joining data".

If you did not complete that exercise, or are seeing errors when trying to use combined, you can import and use a "backup" combined data frame from the "data/clean/backup/" folder by adding this command to your import code chunk:

# import the backup copy of the combined data frame from data/clean/backup/
combined <- import(here("data", "clean", "backup", "linelist_combined_20141201.rds"))

New chunk for Pivoting

Add a new code chunk for "Patient Timelines", near the bottom, following the "Spotlight analysis" section of your R Markdown script.

You can add a section heading in the code chunk for clarity:

# Patient timelines - pivoting exercise

Pivoting to plot patient timelines

Now that we have the combined data frame, we can produce a more complete picture of each patient's journey through the health system. We have information on date_infection, date_onset, date_report, date_hospitalisation, and date_outcome.

Let's create a small data frame and plot to examine the timelines of 5 patients. We will create the following figure. Each case has its own row, and the milestone dates are visualised by points of varying color and shape.

# Pivoting - patient timelines ----------------------------------------

# the 5 cases that come first when sorted by onset date,
# keeping only case_id and the date columns
timelines <- combined %>% 
  arrange(date_onset) %>% 
  head(5) %>% 
  select(case_id, starts_with("date"))

# long format: one row per case and date type, with date_type as a factor
timelines_long <- timelines %>% 
  pivot_longer(
    cols = starts_with("date"),
    names_to = "date_type",
    values_to = "date"
  ) %>% 
  mutate(
    date_type = fct_relevel(
      date_type,
      "date_infection",
      "date_onset",
      "date_report",
      "date_hospitalisation",
      "date_outcome"
    )
  )

# one line per case, with a point for each milestone date
ggplot(
  data = timelines_long,
  mapping = aes(
    x = date,
    y = case_id,
    color = date_type,
    shape = date_type,
    group = case_id
  )
) +
  geom_point(size = 4) +
  geom_line() +
  theme_minimal()

Let's build this plot together, and along the way learn about pivoting longer and about factors.

Select cases

First, we reduce the dataset in the following ways:

Sort the dataset so the cases with earliest onset are the top
Filter to only the top 5 rows
Select only the columns case_id and any column that begins with "date"

Add this code to your "Pivoting - Patient Timelines" code chunk, highlight and run the code.

# build a small data frame of 5 cases to use for the timelines plot
timelines <- combined %>% 
  arrange(date_onset) %>%                 # sort dataset so that earliest are at the top
  head(5) %>%                             # keep only the top 5 rows
  select(case_id, starts_with("date"))    # keep only case_id and the columns whose names start with "date"

Notice we can use the "tidyselect" helper function starts_with() to refer to all of the date columns at once.

Let's look at this new dataset. Run a command timelines in your Testing area, or directly in the Console, to view the records.

# print the timelines data frame
timelines

Anticipate plotting this data with ggplot(). As you know, ggplot() with geom_point() will ask for column names to use for mapping to the axes (x = and y =).

# quiz: in wide format no single column holds all the dates for the x-axis
quiz(
  question("In its current form, which column would be assigned to the X-axis to create the plot?",
    answer("date_onset", message = "This will not work because in the plot, the date axis reflects all the different date types"),
    answer("date_outcome", message = "This will not work because in the plot, the date axis reflects all the different date types"),
    answer("date_infection", message = "This will not work because in the plot, the date axis reflects all the different date types"),
    answer("date", message = "This is not a column in the current dataset."),
    answer("Not possible in current format", correct = TRUE, message = "Yes, the dataset must be transformed."),
    allow_retry = TRUE
  )
)

Pivot longer

To use this dataset in ggplot() we need to transform or "pivot" the columns into "long" format. This will result in a new data frame called timelines_long, with only 3 columns:

case_id
date_type (a new column with values like "date_infection" and "date_report" - the current column names)
date (the actual date values, all in one column)

To do this, we will use pivot_longer() to collect all of the date columns and pivot their values into just those two new columns (date_type and date).

At its most minimal, the function needs only the argument cols =, which should be provided with a vector of the columns to pivot (in this case, the date columns).

Thankfully, we can reference all the "date" columns with the helper starts_with("date"). In other circumstances you might list column names within a vector c().

Add the following code to your "pivoting" code chunk.

# Pivot dates longer
# only cols = is supplied, so the two new columns get the default
# names "name" and "value"
timelines_long <- timelines %>% 
  pivot_longer(cols = starts_with("date"))

See what this data set looks like now:

# print the pivoted (long) data frame
timelines_long

Notice the following things:

The function has taken all the date column names and placed them in a new column called "name". These are now character values.
It has also taken all the date values and placed them in a new column called "value".
There are now 5 rows for each case_id - one for each possible date type.

# quiz: pivoting longer trades columns for rows
quiz(
  question("How did the dimensions of the data frame change?",
    answer("The pivoted data frame is the same as the old."),
    answer("The pivoted data frame has more columns"),
    answer("The pivoted data frame has more columns, but fewer rows"),
    answer("The pivoted data frame has fewer columns, but more rows", correct = TRUE, message = "Yes, since there were 5 date columns pivoted, there are now 5x as many rows as before. All the 5 date columns have been collapsed into 2 columns."),
    allow_retry = TRUE
  )
)

If you want, you can re-run the pivoting command and add these arguments, which allow you to change these default names for the two new columns:

names_to = (try "date_type")
values_to = (try "date")

Update your pivot_longer() command in the "Pivoting" code chunk to this:

# Pivot dates longer
timelines_long <- timelines %>% 
  pivot_longer(
    cols = starts_with("date"),   # the columns to pivot
    names_to = "date_type",       # new column that holds the old column names
    values_to = "date")           # new column that holds the date values

See below how the column names of the timelines_long data frame have been updated:

# print the long data frame to see the updated column names
timelines_long

Plotting

What happens if we make the ggplot right now, using the dataset timelines_long?

# quiz: which columns of the long data frame map to the x-axis and y-axis
quiz(
  question(
    "Which column in the pivoted data frame will be mapped to the X-axis?",
    answer(
      "case_id",
      message = "No, this discrete column will be on the Y-axis."
    ),
    answer(
      "date",
      correct = TRUE,
      message = "Yes, this column is continuous date values."
    ),
    answer(
      "date_type",
      message = "No, this column contains discrete character values. It will be used as color for the points and lines."
    ),
    allow_retry = TRUE
  ),
  question(
    "Which column in the pivoted data frame will be mapped to the Y-axis?",
    answer(
      "case_id",
      correct = TRUE,
      message = "Yes, these are discrete character values."
    ),
    answer(
      "date",
      correct = FALSE,
      message = "No, this column is mapped to the X-axis."
    ),
    answer(
      "date_type",
      message = "No, this column contains discrete character values. It will be used as color for the points and lines."
    ),
    allow_retry = TRUE
  )
)

Add this code to your Pivoting code chunk.

# create plot of patient timelines
ggplot(data = timelines_long,      # use the long dataset
         mapping = aes(
           x = date,               # dates of all types displayed along the x-axis
           y = case_id,            # case_id are discrete, character values
           color = date_type,      # color of the points
           shape = date_type,      # shape of the points
           group = case_id))+      # group by case, so each case gets one connecting line
  geom_point(size = 4)+            # show points
  geom_line()+                     # show lines
  theme_minimal()                  # apply the minimal theme

The points and lines are there, but are they in a sensible order in the legend?

Factors

If a variable has an inherent order, we might call it an "ordinal" variable. Think if the values in a column were "first", "second", or "third". We would want them to appear in a plot in a specific order.

In R, these types of variables should be converted to the class "factor". A factor has "levels", such that the values are ordered (first, second, third, fourth, etc.).

In this case, the expected ordering would be:

1) "date_infection"
2) "date_onset"
3) "date_report"
4) "date_hospitalisation"
5) "date_outcome"

Of course, some patients may be hospitalised before they are reported, but generally let's say that this is the order that we want to embed in the variable.

What is the current class of date_type?

# check the class of the date_type column
class(timelines_long$date_type)

It is not a factor. The character values have no inherent ordering. By default they will appear alphabetically.

We can change this using fct_relevel() from the {forcats} package. {forcats} is part of the tidyverse. This function converts the column to class "factor" and gives you the opportunity to set the desired order.

Below, we add a mutate() step to the pipe chain that re-defines this new column, and then lists the values in the order we want. Update your pivot_longer() command in the "Pivoting" code chunk to include the mutate() step below:

# Pivot dates longer, then order the date types
timelines_long <- timelines %>% 
  # collapse all the date columns into two columns: date_type and date
  pivot_longer(
    cols = starts_with("date"),
    names_to = "date_type",
    values_to = "date"
  ) %>% 
  # make date_type a factor whose levels follow the order listed here
  mutate(
    date_type = fct_relevel(
      date_type,
      "date_infection",
      "date_onset",
      "date_report",
      "date_hospitalisation",
      "date_outcome"
    )
  )

The class of the column date_type is now "factor".

# check the class of date_type again, after the mutate() step
class(timelines_long$date_type)

The column date_type now has "levels".

# list the levels of date_type, in order
levels(timelines_long$date_type)

After re-running the pipe chain above (with the mutate() function added), we try the ggplot again - see how the ordering has changed (look at the legend):

# re-create the timelines plot now that date_type is a factor
# (the data frame is supplied once, through data =; the earlier version also
# piped it in, which passed a second, unnamed copy to ggplot())
ggplot(
  data = timelines_long,
  mapping = aes(
    x = date,
    y = case_id,
    color = date_type,
    shape = date_type,
    group = case_id
  )
) +
  geom_point(size = 4) +
  geom_line() +
  theme_minimal()

# quiz: spot the case whose outcome date falls before its onset date
quiz(
  question("Which case seems to have an error in date_outcome?",
    answer("dce5cc"),
    answer("9d4019"),
    answer("974bc1"),
    answer("76b97a"),
    answer("2ae019", correct = TRUE, message = "Yes, the recorded date of outcome is prior to the recorded date of onset."),
    allow_retry = TRUE
  )
)

There are many other {forcats} functions to handle factors, see this chapter of the Epi R Handbook.

fct_lump()

One {forcats} function that is worth showing you is fct_lump(). This function will aggregate together values in a column into an "Other" category based on frequency.

See this epidemic curve - because the column district is assigned to the aesthetic fill =, it shows every district in the legend. This is quite overwhelming and difficult to interpret! Add this in a chunk to your plotting area and run the command:

# epidemic curve by onset date, with bars filled by district (all districts shown)
ggplot(data = combined, 
       mapping = aes(
         x = date_onset,
         fill = district))+
  geom_histogram(binwidth = 7)      # bins are 7 units wide along the x-axis

We can use fct_lump() and its variations like fct_lump_n() to reduce the number of districts that are shown in the plot:

fct_lump_n() shows only the top "n" values (by counts), with all remaining put in "Other"
fct_lump_prop() shows only those that exceed n proportion of rows, with all remaining in "Other"
There are other variations that you can see in the R documentation

Below, we wrap district within fct_lump_n() and specify that we want to keep only the 3 most-common districts. Update your code to use fct_lump_n() for the fill = argument, as seen below:

# epidemic curve with district lumped for this plot only: the 3 most common
# districts are kept and the rest are grouped as "Other"
ggplot(data = combined, 
       mapping = aes(
         x = date_onset,
         fill = fct_lump_n(district, 3)))+
  geom_histogram(binwidth = 7)+
  labs(fill = "District")           # set the legend title for the fill

Note that applying this function within the ggplot() does not change the underlying district data. The data are lumped only for this plot. If you want to lump the underlying data you can do that with mutate() in a cleaning pipe.

Pivoting wider

We will not focus on pivoting wider in this exercise, as it is less common. However, know that if you need to pivot data wider you can find good examples in these two chapters of the Epi R Handbook:

End

Congratulations! You are done with this exercise. You have made case timelines, have practiced pivoting data longer, and using some functions to handle factors!

If you want to learn a bit more about pivoting, you can go on to the next extra topic.

Extras: Advanced pivoting

Pivoting with multiple classes

This is a demonstration of how to fix pivoting longer of columns of multiple classes. When we say multi-class, we mean we would like to pivot multiple columns (from wide to long format) that include different classes in each column.

It is a common occurrence for datasets that track cases or people over time. Our example will be of daily follow-up checks on the health of contacts of cases (people exposed to cases).

First, open a new classic format R script. Save it in your "ebola" project in the sub-folder "scripts" as "pivot_multiclass_example.R". Add a description at the top.

#############################################
# Pivoting multi-class example
# Bonus section
# Your NAME here
#############################################

Next, load the {tidyverse} packages in a "Load packages" section.

# Load packages ----------------------------------------------------------------
# load the {tidyverse} packages via {pacman}
pacman::p_load(tidyverse)

Create a demo dataset

We will not be using our surv data for this example, but rather a simple dataset we create ourselves using R code.

Add a "Create data" section to your script and run the code below in it. This code creates a mini data frame in the object followup. You should have 7 columns and 3 rows present.

The tribble() function allows you to create a data frame by providing row and column values in a standard table layout. The top row of values (with tildes in front) is the column names.

# Create demo data ----------------------------------------------------------------
# one row per contact; each follow-up day has a date column and a status column
followup <- tribble(
  ~id, ~day1_date,   ~day1_status, ~day2_date,   ~day2_status, ~day3_date,   ~day3_status,
  "A", "2022-04-01", "Healthy",    "2022-04-02", "Healthy",    "2022-04-03", "Sick",
  "B", "2022-04-01", "Healthy",    "2022-04-02", "Sick",       "2022-04-03", "Sick",
  "C", "2022-04-01", "Healthy",    "2022-04-02", "Healthy",    "2022-04-03", "Sick",
)

# print the data frame to review it
followup

Note how the columns alternate between character values like "Healthy" and date values.

We would like to pivot the data frame longer, so that there are 4 columns:

A column id containing the patient id (e.g. A, B, C)
A column Day containing the day number (e.g. day 1, day 2, day 3)
A column Date containing the date (e.g. 2022-04-01, 2022-04-02, 2022-04-03)
A column Status containing the status (e.g. Healthy, Sick)

Initial attempt

First, try to pivot the data frame simply using pivot_longer(). Pivot all columns except id. Use names_to = and values_to = arguments to adjust the new column names.

`r fontawesome::fa("check", fill = "red")` Click to see a solution (try it yourself first!)

# Pivoting the data ----------------------------------------------------------------

# initial pivot (note dates and status values are combined into one column)
pivot_longer(
  followup,
  cols = -id,             # pivot every column except id
  names_to = "Day",       # old column names go into the column "Day"
  values_to = "Status")   # all values, dates and statuses alike, go into "Status"

As you will notice, this creates only 3 columns, not 4!

What do you notice about the Status column? It has both date and status values inside, and is class character. This is not helpful for analysis.

We must pivot the data frame differently, so that these values are in separate columns with the correct class, respectively.

Base the pivot on column name structure

To prevent this situation of mixed classes, take advantage of the syntax structure of the original column names.

There is a common naming structure, with the observation number, an underscore, and then either “status” or “date”. We can leverage this syntax to keep these two data types in separate columns after the pivot.

We do this by:

Providing a character vector to the names_to = argument, with the second item being ".value". This special term indicates that the pivoted columns will be split based on a character in their name...
You must also provide the “splitting” character to the names_sep = argument. In this case, it is the underscore “_”.

Thus, the naming and split of new columns is based around the underscore in the existing variable names.

Update your command as below, and review the result.

# correct pivot
# .value is a special term. Pivoted columns are split based on a character in their name. 
# Here each name is split at the underscore: the first part goes into "Day",
# the second part ("date" or "status") becomes the name of a value column
pivot_longer(followup,
             cols = -id,
             names_to = c("Day", ".value"), 
             names_sep = "_")

Now we have our 4 desired columns.

This data frame is "long" and "tidy". We can pipe it into a ggplot() command to visualise how the health of the contacts changed over time. Note the intermediate step that ensures the date column has the correct Date class.

# Pivot data, fix the date class, then plot each contact's status over time
followup %>% 
  pivot_longer(
    cols = -id,
    names_to = c("Day", ".value"),
    names_sep = "_"
  ) %>% 
  # convert the character dates to class Date; lubridate:: is written out
  # because this script loads only {tidyverse}, and older tidyverse versions
  # do not attach {lubridate}
  mutate(date = lubridate::ymd(date)) %>% 
  ggplot(
    mapping = aes(
      x = date,
      y = id,
      color = status,
      shape = status,
      group = id       # one connecting line per contact
    )
  ) +
  geom_line() +
  geom_point() +
  scale_color_manual(
    values = c(
      "Healthy" = "darkgreen",
      "Sick" = "red"
    )
  ) +
  labs(
    title = "Follow-up of patients over time",
    y = "Patient ID",
    x = "Date",
    color = "Status",
    shape = "Status"
  )