# load packages ----------------------------------------------------------------
library(introexercises) # get data for exercises  
library(learnr)         # create lessons from rmd
library(gradethis)      # evaluate exercises
library(dplyr)          # wrangle data  
library(stringr)        # strings
library(flair)          # highlight code
library(ggplot2)        # visualise data     
library(lubridate)      # work with dates  
library(epikit)         # for emojis   
library(skimr)          # browse data
library(fontawesome)    # for emojis  
library(janitor)        # clean data
# library(RMariaDB)       # connect to sql database 

## set options for exercises and checking ---------------------------------------

## Define how exercises are evaluated 
gradethis::gradethis_setup(
  ## note: the below arguments are passed to learnr::tutorial_options
  ## set the maximum execution time limit in seconds
  exercise.timelimit = 60, 
  ## set how exercises should be checked (defaults to NULL - individually defined)
  # exercise.checker = gradethis::grade_learnr
  ## set whether to pre-evaluate exercises (so users see answers)
  exercise.eval = FALSE 
)

# ## event recorder ---------------------------------------------------------------
# ## see for details: 
# ## https://pkgs.rstudio.com/learnr/articles/publishing.html#events
# ## https://github.com/dtkaplan/submitr/blob/master/R/make_a_recorder.R
# 
# ## connect to your sql database
# sqldtbase <- dbConnect(RMariaDB::MariaDB(),
#                        user     = Sys.getenv("userid"),
#                        password = Sys.getenv("pwd"),
#                        dbname   = 'excersize_log',
#                        host     = "144.126.246.140")
# 
# 
# ## define a function to collect data 
# ## note that tutorial_id is defined in YAML
#     ## you could set the tutorial_version too (by specifying version:) but use package version instead 
# recorder_function <- function(tutorial_id, tutorial_version, user_id, event, data) {
#     
#   ## define a sql query 
#   ## first bracket defines variable names
#   ## values bracket defines what goes in each variable
#   event_log <- paste("INSERT INTO responses (
#                        tutorial_id, 
#                        tutorial_version, 
#                        date_time, 
#                        user_id, 
#                        event, 
#                        section,
#                        label, 
#                        question, 
#                        answer, 
#                        code, 
#                        correct)
#                        VALUES('", tutorial_id,  "', 
#                        '", tutorial_version, "', 
#                        '", format(Sys.time(), "%Y-%M%-%D %H:%M:%S %Z"), "',
#                        '", Sys.getenv("SHINYPROXY_PROXY_ID"), "',
#                        '", event, "',
#                        '", data$section, "',
#                        '", data$label,  "',
#                        '", paste0('"', data$question, '"'),  "',
#                        '", paste0('"', data$answer,   '"'),  "',
#                        '", paste0('"', data$code,     '"'),  "',
#                        '", data$correct, "')",
#                        sep = '')
# 
#     # Execute the query on the sqldtbase that we connected to above
#     rsInsert <- dbSendQuery(sqldtbase, event_log)
#   
# }
# 
# options(tutorial.event_recorder = recorder_function)
# hide non-exercise code chunks ------------------------------------------------
knitr::opts_chunk$set(echo = FALSE)
# data prep --------------------------------------------------------------------
surv_raw <- rio::import(system.file("dat/surveillance_linelist_20141201.csv", package = "introexercises"))
surv <- rio::import(system.file("dat/surveillance_linelist_clean_20141201.rds", package = "introexercises"))

Introduction to R for Applied Epidemiology and Public Health

Welcome

Welcome to the course "Introduction to R for applied epidemiology", offered by Applied Epi - a nonprofit organisation and the leading provider of R training, support, and tools to frontline public health practitioners.

knitr::include_graphics("images/logo.png", error = F)

Data cleaning

This exercise focuses on cleaning data, including filtering rows, selecting columns, deduplicating rows, creating new columns, creating categorical variables, and cleaning dates.

You will import and clean a fictional Ebola outbreak case linelist.

Format

This exercise guides you through tasks that you should perform in RStudio on your local computer.

Getting Help

There are several ways to get help:

1) Look for the "helpers" (see below) 2) Ask your live course instructor/facilitator for help
3) Schedule a 1-on-1 call with an instructor for "Course Tutoring" 4) Post a question in Applied Epi Community

Here is what those "helpers" will look like:

r fontawesome::fa("lightbulb", fill = "gold") Click to read a hint

Here you will see a helpful hint!


r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

linelist %>% 
  filter(
    age > 25,
    district == "Bolo"
  )

Here is more explanation about why the solution works.


Quiz questions

Answering quiz questions will help you to comprehend the material. The answers are not recorded.

To practice, please answer the following questions:

quiz(
  question_radio("When should I view the red 'helper' code?",
    answer("After trying to write the code myself", correct = TRUE),
    answer("Before I try coding", correct = FALSE),
    correct = "Reviewing best-practice code after trying to write yourself can help you improve",
    incorrect = "Please attempt the exercise yourself, or use the hint, before viewing the answer."
  )
)
question_numeric(
 "How anxious are you about beginning this tutorial - on a scale from 1 (least anxious) to 10 (most anxious)?",
 answer(10, message = "Try not to worry, we will help you succeed!", correct = T),
 answer(9, message = "Try not to worry, we will help you succeed!", correct = T),
 answer(8, message = "Try not to worry, we will help you succeed!", correct = T),
 answer(7, message = "Try not to worry, we will help you succeed!", correct = T),
 answer(6, message = "Ok, we will get there together", correct = T),
 answer(5, message = "Ok, we will get there together", correct = T),
 answer(4, message = "I like your confidence!", correct = T),
 answer(3, message = "I like your confidence!", correct = T),
 answer(2, message = "I like your confidence!", correct = T),
 answer(1, message = "I like your confidence!", correct = T),
 allow_retry = TRUE,
 correct = "Thanks for sharing. ",
 min = 1,
 max = 10,
 step = 1
)

License

Please email contact@appliedepi.org with questions about the use of these materials.

Learning objectives

In this exercise you will:

New RStudio project

For the next several modules, we will work on simulated data from a fictional Ebola outbreak.
Create a new RStudio project inside the "ebola" subfolder.

This is a distinct outbreak and analysis, so it deserves its own project (self-contained and portable R environment with data, scripts, and outputs). If you do not remember how to make a new project, see the "hint" below.

r fontawesome::fa("lightbulb", fill = "gold") Click to read a hint

1) Open RStudio (ensure that you open RStudio and not just R).

2) In RStudio click File -> New Project. In the pop-up window, select "Existing directory".

knitr::include_graphics("images/create_project.png")

3) Create the project in the "intro_course/ebola" subfolder
* Click "Browse" and navigate to the "intro_course" folder that you downloaded and unzipped earlier (probably saved on your Desktop) and then into the "ebola" subfolder. We are creating a project for all the Ebola outbreak analysis activities.
* Click "Create project" (RStudio may briefly close and re-open)



Confirm that you are in the correct project

Confirm that the name of the project in the upper-right corner of the current RStudio session is "ebola". If you are not in any RStudio project, it will read "Project: (None)".

# adding xfun::relative_path() creates a dynamic file path between the Rmd location and the here() path.
# It dynamically creates the ../../etc filepath.

knitr::include_graphics("images/ebola_project_dropdown.png", error = F)

Review your RStudio project

Your "ebola" RStudio project should have the following subfolders:

# adding xfun::relative_path() creates a dynamic file path between the Rmd location and the here() path.
# It dynamically creates the ../../etc filepath.

knitr::include_graphics("images/ebola_project_home.png", error = F)

Typically, public health analysis projects will contain subfolders such as:

Here is how your folder structure should look:

r emo::ji("warning") You may also have some other files like ".Rhistory" - do not delete these. Just leave them.

Begin a new R script for Ebola analysis

Open a new R script and save it in the "ebola/scripts" subfolder, with the name "ebola_analysis.R". You can open a new script by clicking File -> New File -> R Script. This will be the primary R script for your work in the next several sessions of the course.

Add documentation

Let's get coding! Add a section heading at the top called "About this script". To do this, place your cursor where the new section should start and press Ctrl, Shift, and R at the same time (or Cmd Shift R on a Mac). In the pop-up, name the section "About this script".

The new header should look something like this:

# About this script ----------------------------------------------

Next, use # to write some comment lines below the heading to document:

# About this script ----------------------------------------------
# Purpose: Ebola outbreak analysis
# Author: Your name
# Date: Today's date

You will be able to navigate between section headers using the "Outline" button in the upper-right of the R script.

Script organization

Script organisation is VERY important. Recall that a script is usually run from top-to-bottom. Therefore, you will need to organize your script in a logical manner.

Typically, you begin with loading packages, and then continue to importing data. Recall that we only want one pacman::p_load() command, near the top of the script, for installing/loading packages.

Load packages

The first R command of any script is typically to install and load the packages for the entire script.

Add a new section heading called "Load packages".

Your script should look something like this:

# About this script ----------------------------------------------
# Purpose: Ebola outbreak analysis
# Author: Your name
# Date: Today's date

# Load packages  ----------------------------------------------

Now, write a command using {pacman} and its function p_load() that will install/load the following packages:

Write your command vertically so that you can add brief comments about each package. Don't forget to write pacman::p_load(), a special syntax style which also loads the {pacman} package.

Try to write this command on your own first, and then check the solution code below.

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

# Load packages ----------------------------------------------
# The pacman package will install each package if necessary,
# and load it for use in the current session

pacman::p_load(
  rio,          # for importing data
  here,         # for relative file paths
  skimr,        # for reviewing the data
  janitor,      # for cleaning data  
  epikit,       # for creating age categories
  tidyverse     # for data management and visualization
)


Import data

We will work with a "case linelist" dataset (each row represents one case of Ebola in an outbreak). This is stored as a CSV file in the "ebola/data/raw/" folder, with the name "surveillance_linelist_20141201.csv".

We will name the imported data frame in R as surv_raw, as shorthand for "surveillance linelist raw".

Add a new section heading titled "Import data".

Your script should now look something like this:

# About this script ----------------------------------------------
# Purpose: Ebola outbreak analysis
# Author: Your name
# Date: Today's date

# Load packages ----------------------------------------------
# The pacman package will install each package if necessary,
# and load it for use in the current session

pacman::p_load(
  rio,          # for importing data
  here,         # for relative file paths
  skimr,        # for reviewing the data
  janitor,      # for cleaning data  
  epikit,       # for creating age categories
  tidyverse     # for data management and visualization
)

# Import data ----------------------------------------------

Importing data from a subfolder

The command to import the Ebola linelist will differ from the import command in Module 1 because the data are saved in the "ebola/data/raw" subfolder, not in the "root" folder the project.

Store data in subfolders is usually advised - perhaps by status as "raw" or "clean", or by time period if the data are updated routinely.

File paths

Any import() function must know where to find the file to import - this is called the file path.

An import command with an "absolute" or "fixed" file path might look like the command below.

DO NOT ADD THIS CODE INTO YOUR SCRIPT

surv_raw <- import("C:/Users/Laura/Documents/intro_course/ebola/data/raw/surveillance_linelist_20141201.csv")

What is the problem with this? Recall that an RStudio project may be transferred to another colleague to run the script on their computer... but most of this file path (the C:, "Laura", "Documents", etc.) may not exist in the colleague's computer, and this command would not run!

Relative files paths with the RStudio project

Use of an RStudio project allows us to start the file path at the "ebola" project root. For example, this command would work to import the linelist on most computers:

surv_raw <- import("data/raw/surveillance_linelist_20141201.csv")

But this is still not optimal.

We suggest using the here() function to write the file path. This has the following advantages:

1) If avoids complications due to slash direction.

2) It makes importing data from subfolders much easier when writing automated reports ("R Markdown"), which we will cover later in the course.

The solution below solves both of these problems:


The {here} package

We recommend using the here() function to build the most flexible file paths. Make sure you have installed and loaded the package {here} by adding it to your pacman::p_load() command and re-running it.

Run the command here() in your script - leave the parentheses empty.

here()

What do you see?

here() automatically prints your full file path up to the "ebola" RStudio project folder. Your output should match your computer's directory situation.

In Laura's example computer, the output of here() printed to her Console is:

"C:/Users/Laura/Documents/intro_course/ebola")

Add sub-folders to the end of this printed path, in quotes and separated by commas like this:

here("data", "raw")

For Laura, this would print:

"C:/Users/Laura/Documents/intro_course/ebola/data/raw"

End your here() command with the dataset file name and extension, like this:

here("data", "raw", "surveillance_linelist_20141201.csv")

On Laura's computer, this would print:

"C:/Users/Laura/Documents/intro_course/ebola/data/raw/surveillance_linelist_20141201.csv")

The output of this here() command is the full file path to the data from the RStudio project root, adapted to the user's computer. Importantly, this also works for automated reports.


here() and import() together

Now comes the secret combination: insert the here() command into the import() command! "Nest" one function inside another!

# import the file from the "data" and "raw" subfolders
surv_raw <- import(here("data", "raw", "surveillance_linelist_20141201.csv"))

Note the two closing parentheses at the end of the command - one for each function.

The file path produced by here() is provided as a character value to the first argument of import(). This tells import() where to find the dataset.

In your script, write a command with import() and here() that imports the "surveillance_linelist_20141201.csv" from the "data/raw" subfolder of the ebola R project. Name the object surv_raw (a shorthand for "surveillance linelist raw") using the assignment operator <-.

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

# Import data ----------------------------------------------
# Import raw surveillance linelist data 
surv_raw <- import(here("data", "raw", "surveillance_linelist_20141201.csv"))


Remember to clean up your script as you work through the exercises!

Your ebola script should now look like this:

# About this script ----------------------------------------------
# Purpose: Ebola outbreak analysis
# Author: Your name
# Date: Today's date

# Load packages ----------------------------------------------
# The pacman package will install each package if necessary,
# and load it for use in the current session

pacman::p_load(
  rio,          # for importing data
  here,         # for relative file paths
  skimr,        # for reviewing the data
  janitor,      # for cleaning data  
  epikit,       # for creating age categories
  tidyverse     # for data management and visualization
)

# Import data ----------------------------------------------
# Import raw surveillance linelist data 
surv_raw <- import(here("data", "raw", "surveillance_linelist_20141201.csv"))

NOTE, although you wrote some test here() commands in your script, you should remove or de-activate them by placing a # symbol on their left.

As you develop your "Ebola outbreak analysis" script, try to only keep commands that are necessary for the analysis. Other commands can go in a "Testing Area" at the bottom of the script, or be deactivated.

Preparing to clean

First let's run some code to look at items in the dataset that might require cleaning.

Add a new section heading "Exploratory analysis", below the "Import Data" section.

# About this script ----------------------------------------------
# Purpose: Ebola outbreak analysis
# Author: Your name
# Date: Today's date

# Load packages --------------------------------------------------
# The pacman package will install each package if necessary,
# and load it for use in the current session

pacman::p_load(
  rio,          # for importing data
  here,         # for relative file paths
  skimr,        # for reviewing the data
  janitor,      # for cleaning data  
  epikit,       # for creating age categories
  tidyverse     # for data management and visualization
)

# Import data --------------------------------------------------
# Import raw surveillance linelist data 
surv_raw <- import(here("data", "raw", "surveillance_linelist_20141201.csv"))


# Exploratory analysis -----------------------------------------

Example commands

In the coming sections, you will see unfamiliar commands using functions like tabyl() and ggplot().

For now, simply copy-paste the code and run it to see the outputs. You will learn more about these functions in upcoming modules.

Column names

The {base} function names() returns the column names of a data frame. Write the dataset name inside the function, and view the result in the Console.

names(surv_raw)

Recall that in Module 1, all column names were standardised with no spaces or special characters. That is not the case for the ebola data!

quiz(caption = "Quiz - column names",
  question("Which of the column names contain non-standard syntax and will need to be cleaned?",
    allow_retry = T,
    answer("onset date", correct = T, message = "This one contains a space"),
    answer("wt (kg)", correct = T, message = "Spaces and parentheses need to be removed"),
    answer("hospital", message = "Spaces and parentheses need to be removed"),
    answer("age unit", correct = T, message = "space needs to be removed"),
    answer("ht (cm)", correct = T, message = "Spaces and parentheses need to be removed"),
    answer("date of report", correct = T, message = "Spaces need to be removed or converted")

  )
)

Sex tabulation

The package {janitor} and its function tabyl() can be used to make a rapid tabulation of values in a specified column. Simply write tabyl() with the name of the data frame as the first argument, and the name of the column as the second argument.

Try the following code out now by add it to your "Exploratory analysis" section, and answer the quiz question that follows:

# tabulate sex
tabyl(surv_raw, sex)
quiz(caption = "Quiz - Sex",
  question_numeric(
    "How many rows are coded as 'Unknown' sex?",
    answer(length(surv_raw$sex[surv_raw$sex == "Unknown"]), correct = T),
    allow_retry = TRUE,
    correct = "Correct, nice work.",
    min = 1,
    max = 700,
    step = 1
  )
)

Note that missing values have been coded in the sex column as the word "Unknown". Missing data in R is most properly coded as the special value NA. Let's explore this more...

Missing values in R

Missing values in R should be represented by a reserved (special) value: NA. This ensures the R properly handles these values as missing. This is important for certain data cleaning and visualisation functions.

Note that NA as the special value for missingness is typed without quotes.

“NA” with quotes is just a normal character value (and is a Beatles lyric from the song Hey Jude).

Your data may represent missingness in other ways... the number 99, a period, “Missing”, “Unknown”, or empty cells. Often, empty cells in a CSV or XLSX can import into R with a blank space - it will look empty but actually there is a character space like this: " ", or even a character no space, like this: "".

Below is a short table to show you how these scenarios look.

Only the final value in this table will be recognized as a truly "missing value" by R.

# create the demo dataset
#########################


demo_missing <- data.frame(
  `As written in data` = c('"Missing"', '"Unknown"', '" "', '""', '"."', '"NA"', NA),
  `As displayed in column` = c("Missing", "Unknown", " ", "", ".", "NA", NA)) %>% 

  rename(
    `As written in data` = `As.written.in.data`,
    `As displayed in column` = `As.displayed.in.column`
  ) %>% 

  mutate(`As assessed by is.na()` = is.na(`As displayed in column`))

#   
# demo_missing <- tribble(
#   
#   ~`As written in data`, ~`As displayed in column`,
#   '"Missing"',            "Missing",       
#   '"Unknown"',            "Unknown",       
#   '" "',                  " ",             
#   '""',                   "",              
#   NA,                      NA) %>% 
#   
#   mutate(`As assessed by is.na()` = is.na(`As displayed in column`))

demo_missing

When importing data from Excel you can sometimes have an empty character value which will look blank but would actually be referenced in R as the following character: "" with no space. For example, the code below produces a table of the counts of unique values in the column fever.

# tabulate fever in the surv_raw data frame
tabyl(surv_raw, fever)

Do you see the counts for "no", "yes", and the third value is ... an empty space. This is not properly recognized as NA (missing) by R, so we will need to recode this value in our cleaning.

Age distribution

Let's review the age distribution using the {ggplot2} package, which is the most common R package for data visualization.

Do not worry about understanding this command now - we will have two entire modules that cover {ggplot2}. In the command, the function ggplot() uses our data frame surv_raw. The column age is mapped to the x-axis, and the data are visualized as a histogram.

Copy and paste the following code into your script in the "Exploratory Analysis" section.

# a histogram of the case ages 
ggplot(data = surv_raw, mapping = aes(x = age))+
  geom_histogram()

A warning message may appear when you run this command. A "Warning" message in the Console means that R is able to execute your command, but there is something you should probably know. In contrast, an "Error" message means that R was not able to execute your command.

Below, the command is changed only slightly, to display a boxplot instead of a histogram.

Copy and paste the following code into your script in the "Exploratory analysis" section, run it through your console and answer the question that follows:

# a boxplot of the case age distribution
ggplot(data = surv_raw, mapping = aes(x = age))+
  geom_boxplot()
quiz(caption = "Quiz - age distribution",
  question("The median age is:",
    allow_retry = T,
    answer("Below 20", correct = T, message = ""),
    answer("Above 20", message = "The median age can be found in the boxplot")
  )
)

HOLD ON! You may have noticed something important if you looked through the other columns. Some of the cases have recorded age in months instead of year!

We can use the tabyl() function from the {janitor} package to quickly summarise the values in the age unit column of our surv_raw object. Add this code to your "Exploratory analysis" section and run it:

tabyl(surv_raw, `age unit`)

Note that we had to write the column name surrounded by backticks, because it has a space in it.. Cleaning this column name to remove spaces will be one of our first cleaning actions.

Backticks are NOT single quotation marks. See the difference:

On US and UK keyboards the backtick is often near the ESC, 1, or ~ key (for other keyboard layouts, see this guide.

quiz(caption = "Quiz - age unit",
  question_numeric(
    "How many rows contain age recorded in months?",
    answer(surv_raw %>% filter(`age unit` == "months") %>% nrow(),
           message = "",
           correct = T),
    allow_retry = TRUE,
    correct = "Correct, nice work.",
    min = 1,
    max = 700,
    step = 1
  )
)

Date of onset

When "cleaning" a date column in R, the first concern is which "class" the column has been assigned by default. Remember that "class" represents which type of values are contained in each columns. For example, age column would ideally be of "class" numeric, where as a district column would ideally be of "class" character.

The class() command is a {base} function that returns the class of an object.

Note the class of the object surv_raw, our linelist. Add the following code to your "Exploratory analysis" section and run it.

class(surv_raw)

Our data set is class "data frame".

Below, we inspect the class of the column age. Use the $ operator to specify that within the data frame surv_raw, you want to examine the column age.

Type this command letter-by-letter. When you type the $, you should see a list appear of all the columns in the data frame (this is an auto-complete feature, which can be adjusted in RStudio -> Tools -> Global Options).

Add the following code to your "Exploratory analysis" section and run it.

class(surv_raw$age)

Add the following code to your "Exploratory analysis" section and run it.

class(surv_raw$`onset date`)

Note again the use of backticks, because this column's name contains spaces. We need to fix this soon!

During the import of the data, R understood this column as character values (e.g. "11/03/2014") and did not assume that these were dates. Indeed - how would R know which digits referred to months, or to days? The value "11/03/2014" could be November 3rd, or the 11th of March!

For analysis purposes, it is essential that this column is properly recognized as class "date" and the days and months correctly identified. We will address this in the cleaning steps.

Now try this on your own to look at the "class" of the column date of report. Use the class() function just like above to look at the date of report column. Remember, since there are spaces in the column name, you need to use back ticks to reference the column in your R code.

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

class(surv_raw$`date of report`)


quiz(caption = "Quiz - column class",
  question("What is the class of the column `date of report`?",
    allow_retry = TRUE,
    answer("Numeric"),
    answer("Character", correct = T),
    answer("Proletariat"),
    answer("Logical"),
    answer("Date")
  )
)

Numeric columns

We can use histograms to examine other numeric columns like weight, height, temperature, etc.

Does anything seem amiss? Are there any values to clean? Values that are impossible?

Copy and paste the following code into your "Exploratory analysis" section and run it to plot the weight distribution:

ggplot(data = surv_raw, mapping = aes(x = `wt (kg)`))+
     geom_histogram()

Copy and paste this code into your "Exploratory analysis" section and run it to plot the height distribution:

ggplot(data = surv_raw, mapping = aes(x = `ht (cm)`))+
     geom_histogram()

Note again the need for backticks because the height and weight column names contain spaces...

Location

There are two columns in the dataset that describe the district of the case:

How well do these two align? We can run a cross-tabulation:

Copy and paste the following code into your "Exploratory analysis" section and run it. The first-listed column (district of residence) will appear in the rows, and the district of detection will appear as the columns.

tabyl(surv_raw, adm3_name_res, adm3_name_det)
quiz(caption = "Quiz - residence",
  question("How many cases were residents of Mountain Rural but were detected in Central II?",
    allow_retry = T,

    answer("12", message = "residence is the rows, and detected is the columns"),

    answer("4", message = "residence is the rows, and detected is the columns"),

    answer(surv_raw %>% filter(adm3_name_res == "Mountain Rural" & adm3_name_det == "Central II") %>% nrow(),
           correct = T,
           message = "residence is the rows, and detected is the columns"),

    answer("209", message = "residence is the rows, and detected is the columns")
  )
)

See the first row and the last column of the tabulation - it appears that again, missing values were imported into R as a blank character space. This may be a problem in all of the columns that are class "character"! Don't worry, we will address this when cleaning the data.

r fontawesome::fa("exclamation", fill = "red") Have you saved your script lately? If not, make sure to save your script now before moving ahead.

Your script should now look something like this:

# About this script ----------------------------------------------
# Purpose: Ebola outbreak analysis
# Author: Your name
# Date: Today's date

# Load packages --------------------------------------------------
# The pacman package will install each package if necessary,
# and load it for use in the current session

pacman::p_load(
  rio,          # for importing data
  here,         # for relative file paths
  skimr,        # for reviewing the data
  janitor,      # for cleaning data  
  epikit,       # for creating age categories
  tidyverse     # for data management and visualization
)

# Import data --------------------------------------------------
# Import raw surveillance linelist data 
surv_raw <- import(here("data", "raw", "surveillance_linelist_20141201.csv"))


# Exploratory analysis -----------------------------------------

# print out column names
names(surv_raw)

# print table of sex column 
tabyl(surv_raw, sex)

# print table of fever column 
tabyl(surv_raw, fever)

# histogram of age column
ggplot(data = surv_raw, mapping = aes(x = age))+
  geom_histogram()

# box plot of age column
ggplot(data = surv_raw, mapping = aes(x = age))+
  geom_boxplot()

# print table of `age unit` column
tabyl(surv_raw, `age unit`)

# print class of surv_raw data frame
class(surv_raw)

# print class of age column 
class(surv_raw$age)

# print class of `onset date` column
class(surv_raw$`onset date`)

# print class of `date of report` column 
class(surv_raw$`date of report`)

# plot histogram of `wt kg` column
ggplot(data = surv_raw, mapping = aes(x = `wt (kg)`))+
     geom_histogram()

# plot histogram of `ht cm` column
ggplot(data = surv_raw, mapping = aes(x = `ht (cm)`))+
     geom_histogram()

# print cross tabulation of two district columns 
tabyl(surv_raw, adm3_name_res, adm3_name_det)

NOTE Have you been adding comments as you work through? Remember, this can explain to yourself and others what your code is doing and why.

One advanced tip: To "minimize" or "fold" the entire Explanatory Analysis section, click the small grey arrow on the left side of the section heading (# Exploratory Analysis). This collapses the section and provides a purple button to expand it later.

Cleaning command

Now that we have explored the raw surveillance case linelist, you can begin to build a command that will apply a series of cleaning actions to it, and produce a "clean" version.

The "pipe" operator %>%

A multi-step cleaning command is sometimes called a "pipe chain" because the "pipe" operator %>% is used to link, or "chain" together multiple operations in a single command that alter the original data frame.

It can be helpful to think of the pipe operator %>% in these analogies:

Keyboard shortcut

R coders use the pipe operator so frequently that there is a keyboard shortcut! You should memorize this:

Windows/Linux: Press Ctrl, Shift, and "m" at the same time
Mac: Press Cmd, Shift, and "m" at the same time

The "m" is probably a reference to the {magrittr} package, which hosts the pipe. {magrittr} is part of the tidyverse so you do not need to load this package separately.

As of R 4.1.0, there is also a "native pipe" from {base} R which looks like this: |>. You can change your RStudio to use the native pipe by going into Tools -> Global Options -> Code, but we recommend using the {magrittr} pipe for this course. If interested, you can read about the subtle differences here.

Create a cleaning section in your script

To begin, add a new section to your script, below the exploratory analysis section:

# Cleaning surveillance data -----------------------------------------

Reminder: press Ctrl+Shift+R (or CMD+Shift+R on a mac) to quickly create a new section in the script.

One, long cleaning command

It is best to store all the cleaning steps in one connected sequence of commands. Each step is linked by "pipe" operators %>%. You will build this command step-by-step, adding functions to the bottom. You should only write ONE cleaning command.

Do NOT write the code below - simply view it as an example. This is the command that you will write over the next two sessions. Note the pipe operators %>% highlighted after each step.

knitr::include_graphics("images/cleaning_pipe_chain.png", error = F)

Because all the steps are linked with %>% pipe operators, the entire sequence of actions is one command that can be run all at one time.

Note that there is no pipe after the last line in the command.

Begin the cleaning command

A first line of a multi-step cleaning command begins with:
1) The name of "clean" dataset that you want to create
2) The <- assignment operator 3) The name of the "raw" dataset to start from

In effect, this makes a copy of the raw data and saves it as surv.

surv <- surv_raw        # make a copy of the raw data 

Run this command and notice surv appear in your Environment. Click on it to see the values. It is an exact copy of surv_raw.

Add modifications with the pipe

The next step is to modify surv_raw before it is saved as surv.

This is done by adding a pipe operator, which "passes" surv_raw to another function that makes a change such as renaming columns, de-duplicating, recoding values, or filtering the rows by logical criteria.

Standardize column names

Do you remember how annoying the spaces in the column names are? Typically, the first function in a cleaning command is to standardize the column names.

Start by adding a pipe after surv_raw and then on the next line write the function clean_names(). Do not put anything inside the parentheses.

surv_raw is passed to (and processed by) clean_names() before it is saved as the clean surv dataset. clean_names() standardizes the column names as lower-case, with spaces converted to underscores, and special characters removed.

Add clean_names() to your command, and re-run it.

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

# Cleaning command
surv <- surv_raw %>%    # the raw dataset 
  clean_names()         # standardize the column names 


View surv by clicking on it in RStudio. See that some column names have changed! There is a difference now between surv_raw and surv.

quiz(caption = "Quiz - clean_names()",
  question("How is the column name 'ht (cm)' written after using clean_names()?",
    allow_retry = T,
    answer("height"),
    answer("ht_cm", correct = T),
    answer("ht_(cm)"),
    answer("HT-CM")
  ),

  question("What is the new name of the column that contains date of symptom onset?",
    allow_retry = T,
    answer("onset date"),
    answer("date-of-onset"),
    answer("onset_date", correct = T),
    answer("onset")
  )
)

Never leave a pipe at the end of your command. R will assume the command is not complete and unexpected results can occur.

This is the beginning of your cleaning command, but there is much more to do...

Manually clean column names

Next, make manual changes to column names using the rename() function. This function is from the package {dplyr}, which you loaded within the {tidyverse} mega-package.

A few things to note about the rename() syntax:

Look at this generic example. Do not run this code as it will not work as written. We will replace the OLDname and NEWname with appropriate column names.

surv <- surv_raw %>%    # start with surv_raw; save the final output as surv
  clean_names() %>%     # standardize column names
  rename(               # make manual column name changes
    NEWname = OLDname,    # first change
    NEWname = OLDname     # second change
  )                     # end of rename() function

ATTENTION: Think for a moment - what if you wanted to edit the name of the column which holds date of symptom onset... what is the "old" name to write in the rename() function?

quiz(caption = "Quiz - rename()",
  question("In surv_raw, what is the name of the column that stores date of symptom onset?",
    allow_retry = T,
    answer("date_onset"),
    answer("onset_date"),
    answer("onset date", correct = T),
    answer("date of onset")
  ),

  question("When surv_raw is piped into the rename() function, what is the name of the column that stores date of symptom onset?",
    allow_retry = T,
    answer("date_onset"),
    answer("onset_date", correct = T),
    answer("onset date"),
    answer("date of onset")
  )
)

In surv_raw, this column name contains a space. However, because we pipe the surv_raw data through clean_names() before piping it to rename(), it's column names have underscores instead of spaces. Therefore, the rename() command should use the column name with an underscore.

Remember - a pipe takes the output of one function and passes it to the next function.

Add a renaming step to the cleaning command

In your cleaning command, add the rename() function after clean_names() and change the following column names:

Change the date columns so that "date" is at the beginning of their name (this will be useful later):

Simplify the location column names for easier typing:

Finally, check the placement of your pipes. Look at the image below and consider where you should see pipes in the code (position A,B,C, and/or D).

knitr::include_graphics("images/pipe_placement.png", error = F)
quiz(caption = "Quiz - Pipe placement",
  question("Which locations in the image above should have a pipe operator? (select all that apply)",
    allow_retry = T,
    answer("A", correct = T),
    answer("B", correct = T),
    answer("C", message = "No pipe within a function."),
    answer("D", message = "No pipe at the end.")
  )
)

Re-run the cleaning command to implement the changes. View surv to see the updated column names.

You should have only one (1) cleaning command, which is getting longer. It should look similar to this:

# begin cleaning command
surv <- surv_raw %>%    # start with the raw data

   # automatically standardize the column names
   clean_names() %>% 

   # manually clean column names   
   rename(
      date_onset = onset_date,
      date_report = date_of_report,
      district_res = adm3_name_res,
      district_det = adm3_name_det)

Running commands

Running a multi-step command with pipes can be nuanced. Pick a strategy that works best for you.

Always ensure that you do not have an extra pipe at the end of your command.

knitr::include_graphics("images/run_pipe_hang.png", error = F)

You can run the entire cleaning command by highlighting it -from the top- and clicking "Run" (or using a keyboard shortcut to run)

knitr::include_graphics("images/run_highlight_top.png", error = F)

Be careful to not miss the last parenthesis.

knitr::include_graphics("images/run_highlight_top_miss.png", error = F)

You can also run the entire cleaning command by placing your cursor at the top line and clicking "Run" (or using a keyboard shortcut to run)

knitr::include_graphics("images/run_cursor_top_ok.png", error = F)

You can place your cursor anywhere in the command and press run, as long as all the necessary pipes are present.

knitr::include_graphics("images/run_cursor_middle_ok.png", error = F)

BUT! You cannot run a highlighted portion of your command without the top line. The lower commands do not work unless the dataset is piped into them from the beginning.

knitr::include_graphics("images/run_highlight_middle.png", error = F)

If you run only a part of your command, start from the top and do not highlight a final pipe Only the changes you have highlighted will be saved to the clean dataset.

knitr::include_graphics("images/run_highlight_some_ok.png", error = F)

If you accidentally highlight and run with a "hanging pipe", this will result in unexpected results. Click the "esc" key to reset the Console.

knitr::include_graphics("images/run_highlight_some_bad.png", error = F)
quiz(caption = "Quiz - Running a multi-step command",
  question("Which of these approaches work to run the cleaning command (select all that apply)",
    allow_retry = T,
    answer("Highlight the entire command and press the Run button",
           correct = T),
    answer("Highlight the entire command and press Ctrl and Enter",
           correct = T),
    answer("Place the cursor anywhere inside the command, and press the Run button", 
           correct = T),
    answer("Place the cursor anywhere inside the command, and press Ctrl and Enter",
           correct = T)
    ),

  question("Which of these are common mistakes when running a multi-step command with pipes?(select all that apply)",
    allow_retry = T,
    answer("Highlighting and running without starting at the top of the command.",
           correct = T),
    answer("Running the command in reverse.",
           correct = F),
    answer("Deleting the command by accident", 
           correct = F),
    answer("Highlighting an extra pipe by accident",
           correct = T)
    )
)

Create a "Testing" area

Add a new section called "Testing area", below your "Cleaning surveillance data" section. This will be used to run commands that test the impact of your cleaning code.

# Testing area -----------------------------------------

Think of this as a "playground" to hold stand-alone, independent commands that are not part of your cleaning command.

For example, use the names() function in your "Testing area" to review the surv column names and ensure the rename() changes were correct.

# Testing area -----------------------------------------
names(surv_raw)  # columns in the raw data
names(surv)      # columns in the clean data

The surv column names no longer have spaces (replaced by underscores) nor special characters such as parentheses, and are all in lower case! Additionally, note the names of the date and district columns.

Select columns

The select() function is versatile and often used to:

Select columns to keep

The primary purpose of select() is to select columns to keep in the dataset. Any column name that is not included in the parentheses will be dropped.

To practice the use of select(), run this command in your "Testing area".

surv %>% 
  select(epilink, age, sex)
quiz(caption = "Quiz - Select",
  question("Is this command part of your cleaning command?",
    allow_retry = T,
    answer("No, it is separate and in the Testing area",
           correct = T),
    answer("Yes, because it has a pipe",
           correct = F,
           message = "No, just because it has a pipe does not mean it is part of your cleaning command. A pipe can be used in many circumstances.")
  ),

  question("Is this a PRINTING or SAVING command?",
    allow_retry = T,
    answer("Printing",
           correct = T,
           message = "This is a printing command because there is no arrow assignment operator. No changes are actually saved to the dataset."),
    answer("Saving",
           correct = F,
           message = "This is a printing command because there is no arrow assignment operator. No changes are actually saved to the dataset.")
  ),

question("Where did the modified dataset appear?",
    allow_retry = T,
    answer("In the Environment",
           correct = F,
           message = "No, there were no changes saved, so there are no changes in the Environment."),
    answer("In the Console",
           correct = T,
           message = "Because no changes were saved, the output was printed to the Console.")
  )
)

Notice how select() also re-orders columns in the order you write.

Some useful shortcuts for select()

It may be taxing to type every column individually that you want to keep. You can use a colon ( : ) to keep two named columns and all the columns in between them.

Modify the code in your "Testing area" as follows:

# keeps fever, vomit, and the columns in between them 
surv %>% 
  select(fever:vomit)

You will see only the columns between fever and vomit printed.

Another tip is to use select() to adjust the order of the columns.

Write a few columns at the beginning, and then write everything() which includes all the other columns (note the empty parentheses at the end of everything()).

everything() is a "tidyselect" helper function. Others include contains() and starts_with() (read more about in this Epi R Handbook chapter).

Modify your Testing Area command to be as below:

# use everything()
surv %>% 
  select(fever, sex, starts_with("date"), everything())

Drop columns by name

To drop specific columns, you can add a minus symbol (-) before a column name in the select() function. All other columns will be retained.

In the "Testing area", practice using a minus symbol ( - ) to drop the column epilink. All other columns are retained.

surv %>% 
  select(-epilink)

Next, update the code to practice dropping multiple columns by wrapping all their names c() - this approach will apply the negative sign to all of them. We learned the {base} function c() in the previous module.

# Removes these three columns
surv %>% 
  select(-c(epilink, district_det, age))

# an equivalent command to above
surv %>% 
  select(-epilink, -district_det, -age)

A more advanced manoeuvre is to use select() to rename columns! If you try this, it uses the same syntax as rename(), but remember that it only keeps the columns that you include in the select() function!

Add select() to cleaning command

OK, enough practice with select()! Now, add a select() statement to your cleaning command!

1) Return to the "Cleaning surveillance data" section of your script.
2) Add a step to your command that removes the column row_num.
3) Re-run the cleaning command to see the effect in surv.

If you have forgotten, the keyboard shortcut for a pipe (%>%) is Ctrl Shift M for Windows and CMD Shift M for Macs.

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

Your cleaning command should now look like this:

surv <- surv_raw %>% 

  # automatically clean column names
  clean_names() %>% 

  # manually clean column names   
  rename(
    # New name    # Old name 
    date_onset = onset_date,
    date_report = date_of_report,
    district_res = adm3_name_res,
    district_det = adm3_name_det) %>%

  # remove unnecessary column
  select(-row_num)


Remember there should be only one cleaning command. Add additional lines to it as you complete this exercise.

Deduplicate

At this point, we should check for duplicates in our data. This can be done with the distinct() function.

If distinct() is run with empty parentheses, it will remove rows that are 100% duplicates (it will keep only the first row of any that have exactly the same values in every column).

Add the function distinct() to the cleaning command in your "Clean surveillance data" section of your script, and re-run your cleaning command.

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

Your cleaning command should now look like this:

surv <- surv_raw %>% 

  # automatically clean column names
  clean_names() %>% 

  # manually clean column names   
  rename(
    date_onset = onset_date,
    date_report = date_of_report,
    district_res = adm3_name_res,
    district_det = adm3_name_det) %>%

  # remove unnecessary column
  select(-row_num) %>% 

  # de-duplicate rows  
  distinct()


Were any rows removed?

r fontawesome::fa("lightbulb", fill = "gold") Click to read a hint

Compare the number of rows in surv to the number of rows in surv_raw (in the Environment pane). How many rows (observations) are present in surv compared to surv_raw?


distinct() can be used in advanced ways, such as to only consider certain columns when evaluating duplicates (e.g. only remove rows that have the same age, sex, and first name).

For example, adjusting your command to read: distinct(case_id) will only consider a row as a "duplicate" if the case_id values are identical (values in ALL columns no longer have to be identical to be considered a "duplicate").

Notice however, that the default function behaviour if you run distinct(case_id) - all the other columns are dropped from the dataset! You can add the argument .keep_all = TRUE and all other columns will be kept.

distinct(case_id, .keep_all = TRUE)

You can read more about the distinct() function its documentation, and in the Epi R Handbook's data cleaning chapter or de-duplication chapter.

Before you move on, ensure that your de-duplication line is written as distinct() with nothing included within the parentheses.

distinct()

r fontawesome::fa("exclamation", fill = "red") Have you saved your script lately?

Column classes

The class() function

Each column in the data frame has a specific "class". Classes reflect how R understands the values in the column.

Previously, you reviewed the classes of each column with the skim() function from the {skimr} package. You can also view the class of a single column using the class() function from {base} R.

Type and run these commands in the "Testing area" of your R script:

# print the class of a column
class(surv$sex)
# print the class of a column
class(surv$date_onset)

Let's review what we've learned so far!

How is this command different from your cleaning command / "pipe chain"?

In a long cleaning command with pipes, the $ is not necessary because the data frame is being piped from one function to the next with the pipe operator. In this command which does not use a pipe, we must write the data frame explicitly.

What are possible column classes?

Common column classes include:

An early (and important!) part of the data cleaning process is ensuring that R correctly understands the class for each column. R is generally good about guessing this, but sometimes it needs your help.

Date class

Above, you recognized that the column date_onset is recognized by R as class "character". Dates in Excel worksheets imported into R are often recognized this way, or in numeric (e.g. 48256), or in a strange class called "POSIXct".

We want to designate this column as class "Date".

First, we need to understand how the date is currently written in the data.

In your "Testing area", run the following command to show the first six values of the column date_onset in the data frame surv:

head(surv$date_onset)
quiz(caption = "Quiz - raw date format",
  question("Which format is date_onset??",
    allow_retry = T,
    answer("YYYY-MM-DD"),
    answer("DD/MM/YYYY"),
    answer("MM/DD/YYYY", correct = T)
  )
)

Working with dates using {lubridate}

{lubridate} has several functions designed to convert character values to class Date in an intuitive and lenient way.

The 3 functions ymd(), mdy(), and dmy() correspond to the format of the RAW date values, prior to use of the function. Thankfully, they are smart in that they allow for a variety of separators (dashes, slashes, spaces) and date synonyms (01 vs Jan vs January):

Day, Month, Year

The dmy() function should be used for raw values written as: day, month , year. Copy and paste the following code into your "Testing area" to have a look:

dmy("11 October 2020")
dmy("11/10/2020")

Month, Day, Year

The mdy() function does the same but for raw values written as: month, day, year. Copy and paste the following code into your "Testing area" to have a look:

mdy("Oct 11 20")
mdy("10/11/20")

Year, Month, Day

ymd() should be used to convert raw values that are written as: year, month, day. Copy and paste the following code into your "Testing area" to have a look:

ymd("2020-10-11")
ymd("20201011")

In ALL of the above commands, the outcome is values in date class that are displayed as YYYY-MM-DD.

quiz(caption = "Quiz - date format",
  question("Which function should be used to convert the column date_onset?",
    allow_retry = T,
    answer("ymd()"),
    answer("dmy()"),
    answer("mdy()", correct = T)
  )
)

The most important thing to remember is to use the function that matches the raw date format.

These examples apply {lubridate} functions to single values. In the next section we will learn how to apply these functions to an entire column.

mutate()

The function mutate() is used to change an entire column, or create a new column. You can pipe a dataset into mutate().

Within mutate(), the syntax is as follows:

mutate(column name = a function you use to create/modify the column)

Thus, to convert the column date_onset to class "Date", you would add this line to the cleaning command with a pipe:

mutate(date_onset = mdy(date_onset))

The code is saying: "change (mutate) the date_onset column, such that it contains the values of the date_onset column as modified by the function mdy()."

Add two mutate() lines to your cleaning command, to convert date_onset and date_report columns to class "Date".

Remember to re-run the whole command once you add these lines.

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

surv <- surv_raw %>% 

  # automatically clean column names
  clean_names() %>% 

  # manually clean column names   
  rename(
    date_onset = onset_date,
    date_report = date_of_report,
    district_res = adm3_name_res,
    district_det = adm3_name_det) %>%

  # remove unnecessary column
  select(-row_num) %>% 

  # de-duplicate rows  
  distinct() %>% 

  # convert date_onset to date class
  mutate(date_onset = mdy(date_onset)) %>% 
  mutate(date_report = mdy(date_report))


As class Date, you can use this column as a continuous variable and produce a simple histogram (epidemic curve) using the code below. With the dates as character values, this would not be possible.

Paste the following code in your "Testing area" and run it.

# plot a histogram of dates using the cleaned dataset and date column
ggplot(data = surv, mapping = aes(x = date_onset))+
  geom_histogram()
quiz(caption = "Quiz - Date",
  question("In the most recent weeks, what is the apparent trend in the number of reported cases?",
    allow_retry = T,
    answer("Increasing"),
    answer("Decreasing", correct = T)
  )
)

Age as numeric

There is another column that we should check the class of: age. What is the class of this column?

Write and run this command in your "Testing area" section:

class(surv$age)

The command above returned "integer" (whole values). While this is OK for the moment... we may want to perform calculations that require that this column is class "numeric" to accept decimal values.

Thankfully in {base} R there are functions like as.numeric(), as.character(), and as.integer() to easily convert columns from one class to another.

In your cleaning command, add another mutate() line that converts the age column using the as.numeric() function.

r fontawesome::fa("lightbulb", fill = "gold") Click to read a hint

In the mutate() function, re-define age as a numeric class by wrapping age within the as.numeric() function, and set it equal to a "new" column called age.


r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

surv <- surv_raw %>% 

  # automatically clean column names
  clean_names() %>% 

  # manually clean column names   
  rename(
    date_onset = onset_date,
    date_report = date_of_report,
    district_res = adm3_name_res,
    district_det = adm3_name_det) %>%

  # remove unnecessary column
  select(-row_num) %>% 

  # de-duplicate rows  
  distinct() %>% 

  # convert date_onset to date class
  mutate(date_onset = mdy(date_onset)) %>% 
  mutate(date_report = mdy(date_report)) %>% 

  # convert age to numeric class
  mutate(age = as.numeric(age))


r fontawesome::fa("exclamation", fill = "red") Have you saved your script lately?

Simple recoding

There are a variety of functions to perform simple recoding of values.

Missing values

As identified earlier, in the "raw" data there is a value "Unknown" in the column sex. Missing data should be stored in R as NA, which will have benefits later in the course.

Run the code below in your "Testing area". It creates a counts table of the unique values in the column sex in the current surv data.

# Table of sex values  
tabyl(surv, sex) 
surv_raw %>% 
     # automatically clean column names
     clean_names() %>% 

     # manually clean column names   
     rename(
          date_onset = onset_date,
          date_report = date_of_report,
          district_res = adm3_name_res,
          district_det = adm3_name_det) %>%

     # remove unnecessary column
     select(-row_num) %>% 

     # de-duplicate rows  
     distinct() %>% 

     # convert date_onset to date class
     mutate(date_onset = mdy(date_onset)) %>% 
     mutate(date_report = mdy(date_report)) %>% 

     # convert age to numeric class
     mutate(age = as.numeric(age)) %>% 

  tabyl(sex)

We want to tell R that if the value in sex is "Unknown", then R should change the value to NA. To do this, we will add a step to our cleaning command that uses mutate() and the function na_if() to change the sex column.

The function na_if() replaces a specified value with NA:
Its first argument is the column name (sex)
Its second argument is the value present in the column that you want to change to NA ("Unknown"). Note: you must include quotations around "Unknown" so that R can recognize it as a character value.

Add this step to the cleaning command.

Return to the "Clean surveillance data" section of your R script.

At the bottom of your cleaning command, add a pipe and another mutate() line.

Inside mutate(), set the column sex equal to the function na_if(). In the parentheses of na_if() again write sex, then a comma, and then "Unknown" in quote marks:

mutate(sex = na_if(sex, "Unknown"))

This code is saying: "Change the column sex, such that its values equal the current column sex, except for where the value is "Unknown", in which case that value should be changed to NA".

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

surv <- surv_raw %>% 

  # automatically clean column names
  clean_names() %>% 

  # manually clean column names   
  rename(
    date_onset = onset_date,
    date_report = date_of_report,
    district_res = adm3_name_res,
    district_det = adm3_name_det) %>%

  # remove unnecessary column
  select(-row_num) %>% 

  # de-duplicate rows  
  distinct() %>% 

  # convert date_onset to date class
  mutate(date_onset = mdy(date_onset)) %>% 
  mutate(date_report = mdy(date_report)) %>% 

  # convert age to numeric class
  mutate(age = as.numeric(age)) %>% 

  # convert "Unknown" sex to NA
  mutate(sex = na_if(sex, "Unknown"))


Note: it is also useful to know the opposite function, replace_na(), which replaces NA with a specified value.

Testing the output

Now return to the "Testing area" and re-run the tabyl() command on column sex, using the clean dataset surv.

# Table of sex values  
tabyl(surv, sex)
surv_raw %>% 
  # automatically clean column names
     clean_names() %>% 

     # manually clean column names   
     rename(
          date_onset = onset_date,
          date_report = date_of_report,
          district_res = adm3_name_res,
          district_det = adm3_name_det) %>%

     # remove unnecessary column
     select(-row_num) %>% 

     # de-duplicate rows  
     distinct() %>% 

     # convert date_onset to date class
     mutate(date_onset = mdy(date_onset)) %>% 
     mutate(date_report = mdy(date_report)) %>% 

     # convert age to numeric class
     mutate(age = as.numeric(age)) %>% 

  # convert "Unknown" sex to NA
  mutate(sex = na_if(sex, "Unknown")) %>%

  tabyl(sex)

The missing values should now be represented as NA.

Across

As mentioned earlier, when R imports dataset files from CSV or XLSX, sometimes it records missing values in "character" columns as an empty character space "", instead of as the special symbol NA.
Run the code below in your "Testing area", which creates a counts table of the unique values in the column fever.

tabyl(surv, fever)

We need to tell R that if the value in fever is empty space "" then R should change the value to NA. We know that we can achieve this by adding this to our cleaning command with a pipe:

mutate(fever = na_if(fever, ""))

But there are many columns where missing data are recorded as "". How can we make all these changes efficiently?

The command below uses across() within the mutate() to change multiple columns at once. This is an intermediate R command, so do not spend much time trying to understand it. Later, you can ask an instructor, or read more about across() in the Epi R Handbook here. In short, this command is saying: "mutate all the columns that are character class, and use the function na_if() to convert any instances of empty space like "" to NA".

Add the following line as the next step n your cleaning command. Remember to re-run the whole cleaning command.

mutate(across(.cols = where(is.character), .fns = ~na_if(.x, "")))

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

surv <- surv_raw %>% 

  # automatically clean column names
  clean_names() %>% 

  # manually clean column names   
  rename(
    date_onset = onset_date,
    date_report = date_of_report,
    district_res = adm3_name_res,
    district_det = adm3_name_det) %>%

  # remove unnecessary column
  select(-row_num) %>% 

  # de-duplicate rows  
  distinct() %>% 

  # convert date_onset to date class
  mutate(date_onset = mdy(date_onset)) %>% 
  mutate(date_report = mdy(date_report)) %>% 

  # convert age to numeric class
  mutate(age = as.numeric(age)) %>% 

  # convert "Unknown" sex to NA
  mutate(sex = na_if(sex, "Unknown")) %>% 

  # properly record missing values in many character columns
  mutate(across(.cols = where(is.character), .fns = ~na_if(.x, "")))


Assessing missingness

While we are discussing missing values, let's introduce you to the core R functions used to assess missingness.

We have two date columns in the data frame: date_onset and date_report. The first is the date of symptom onset, as recorded by the case investigation team. The second is the date the case was reported to the surveillance team.

Which one should we be primarily using?

There are many factors that go into a decision like this, such as the purpose of the analysis, the audience, etc. One important factor is data quality and missingness.

Missingness in R is represented by NA - or at least it should be - as we discussed above. The function to assess missingness is is.na(). Likewise, the expression !is.na() is used to test for non-missingness - the exclamation point is a negation operator. In both of these functions, the object you want to assess is placed within the parentheses.

Please remember the below commands related to missingness in R.

Run this command in the "Testing area":

is.na(surv$date_onset)

In your console, you should see one TRUE or FALSE for every row in the column. It is not very useful.
To count the number of rows that are TRUE, you can wrap this command in sum().

Run these two commands in the "Testing area" of your R script:

sum(is.na(surv$date_onset))   # number of rows where is.na(surv$date_onset) evaluates to TRUE
sum(is.na(surv$date_report))  # number of rows where is.na(surv$date_report) evaluates to TRUE
quiz(caption = "Quiz - Date",
  question("Which column has more missing data?",
    allow_retry = T,
    answer("date_onset", correct = T),
    answer("date_report")
  ),
  question("Which percent of 'date_onset' is missing? Hint: you have the count of NAs in date_onset and you know the number of rows in the whole surv data frame...",
    allow_retry = T,
    answer("16.4 %"),
    answer("10.2 %"),
    answer("1 %"),
    answer(paste(round(sum(is.na(surv$date_onset))/length(surv$date_onset), 3) * 100, "%"), correct = T)
  )
)

Re-coding hospital names

Our cleaning command is now quite long! Each step is important, and thankfully it is all well-documented in our code. This transparency is useful for ourselves, anyone reviewing our code later, and for scientific integrity!

Let's now have a look at the column hospital.

Write and run this code in your "Testing area". It will print a tabulation of all the values in this column:

tabyl(surv, hospital)

It seems there is some cleaning to be done! Note the different spellings of "hospital" and "hopital", of "Military" and "Mitilary", and of "Port" and "Port Hospital". These need to be synchronized.

An easy option is recode(). Review the generic syntax below. The code can be understood as: "Re-define hospital as hospital with these recodes applied". Note that this uses the opposite syntax order from rename(), which was NEW = OLD. recode() uses OLD = NEW.

mutate(hospital = recode(hospital,
  "OLD value" = "NEW value",
  "OLD value" = "NEW value",
  "OLD value" = "NEW value"))

Using the above code as a template, add a line at the END of your cleaning command that aligns all the hospital names to the following:

We will leave NA hospitals as they are. Don't forget to put quotes around the hospital names (OLD and NEW names), as they are character values.

r fontawesome::fa("lightbulb", fill = "gold") Click to read a hint

Begin by piping to a new mutate() function. Within the mutate() parentheses, write the name of the column to re-define, hospital and an equals sign,. On the right side of the equals sign, you write the function recode(). Within the parentheses of recode(), the first argument is again the name of the column to re-define.

After a comma, continue with one line per change, with the old value on the left, and the new value on the right. As these are character values, put them in quotes. Don't forget commas between each line, and to close with the correct number of parentheses.


r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

surv <- surv_raw %>% 

  # automatically clean column names
  clean_names() %>% 

  # manually clean column names   
  rename(
    date_onset = onset_date,
    date_report = date_of_report,
    district_res = adm3_name_res,
    district_det = adm3_name_det) %>%

  # remove unnecessary column
  select(-row_num) %>% 

  # de-duplicate rows  
  distinct() %>% 

  # convert date_onset to date class
  mutate(date_onset = mdy(date_onset)) %>% 
  mutate(date_report = mdy(date_report)) %>% 

  # convert age to numeric class
  mutate(age = as.numeric(age)) %>% 

  # convert "Unknown" sex to NA
  mutate(sex = na_if(sex, "Unknown")) %>% 

  # properly record missing values in many character columns
  mutate(across(.cols = where(is.character), .fns = ~na_if(.x, ""))) %>% 

  # re-code hospital column
  mutate(hospital = recode(hospital,
    # for reference: OLD = NEW
    "Mitilary Hospital"  = "Military Hospital",
    "Port"               = "Port Hospital",
    "Port Hopital"       = "Port Hospital",
    "St. Mark's Maternity Hospital (SMMH)" = "SMMH"))


Recoding sex

Using the same method as above, add another line to your cleaning command to recode() the values in the column sex from "m" to "male" and from "f" to "female".

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

surv <- surv_raw %>% 

  # automatically clean column names
  clean_names() %>% 

  # manually clean column names   
  rename(
    date_onset = onset_date,
    date_report = date_of_report,
    district_res = adm3_name_res,
    district_det = adm3_name_det) %>%

  # remove unnecessary column
  select(-row_num) %>% 

  # de-duplicate rows  
  distinct() %>% 

  # convert date_onset to date class
  mutate(date_onset = mdy(date_onset)) %>% 
  mutate(date_report = mdy(date_report)) %>% 

  # convert age to numeric class
  mutate(age = as.numeric(age)) %>% 

  # convert "Unknown" sex to NA
  mutate(sex = na_if(sex, "Unknown")) %>% 

  # properly record missing values in many character columns
  mutate(across(.cols = where(is.character), .fns = ~na_if(.x, ""))) %>% 

  # re-code hospital column
  mutate(hospital = recode(hospital,
    # for reference: OLD = NEW
    "Mitilary Hospital"  = "Military Hospital",
    "Port"               = "Port Hospital",
    "Port Hopital"       = "Port Hospital",
    "St. Mark's Maternity Hospital (SMMH)" = "SMMH")) %>% 

  # recode sex
  mutate(sex = recode(sex,
    "m" = "male",
    "f" = "female"))


End

Congratulations! You have finished the first part of the data cleaning in this course!

Save your script, and check in with your instructor to let them know that you finished.

If you have extra time, ensure that your script is aligned with the script below. Does it look similar?

Consider how you might clean up your script! Ask your instructors to review script organisation if you are confused about format or flow.

NOTE: the script below can also be found in the "ebola/scripts/backups" folder. You may notice the "Exploratory analysis" section and "Testing area" section in the backup scripts are empty, because the contents may vary.

# About this script ----------------------------------------------
# Purpose: Ebola outbreak analysis
# Author: Your name
# Date: Today's date

# Load packages --------------------------------------------------
# The pacman package will install each package if necessary,
# and load it for use in the current session

pacman::p_load(
  rio,          # for importing data
  here,         # for relative file paths
  skimr,        # for reviewing the data
  janitor,      # for cleaning data  
  epikit,       # for creating age categories
  tidyverse     # for data management and visualization
)

# Import data --------------------------------------------------
# Import raw surveillance linelist data 
surv_raw <- import(here("data", "raw", "surveillance_linelist_20141201.csv"))


# Exploratory analysis -----------------------------------------
# Contents will vary


# Clean surveillance data -----------------------------------------

surv <- surv_raw %>% 

  # automatically clean column names
  clean_names() %>% 

  # manually clean column names   
  rename(
    date_onset = onset_date,
    date_report = date_of_report,
    district_res = adm3_name_res,
    district_det = adm3_name_det) %>%

  # remove unnecessary column
  select(-row_num) %>% 

  # de-duplicate rows  
  distinct() %>% 

  # convert date_onset to date class
  mutate(date_onset = mdy(date_onset)) %>% 
  mutate(date_report = mdy(date_report)) %>% 

  # convert age to numeric class
  mutate(age = as.numeric(age)) %>% 

  # convert "Unknown" sex to NA
  mutate(sex = na_if(sex, "Unknown")) %>% 

  # properly record missing values in many character columns
  mutate(across(.cols = where(is.character), .fns = ~na_if(.x, ""))) %>% 

  # re-code hospital column
  mutate(hospital = recode(hospital,
    # for reference: OLD = NEW
    "Mitilary Hospital"  = "Military Hospital",
    "Port"               = "Port Hospital",
    "Port Hopital"       = "Port Hospital",
    "St. Mark's Maternity Hospital (SMMH)" = "SMMH")) %>% 

  # recode sex
  mutate(sex = recode(sex,
    "m" = "male",
    "f" = "female"))


# Testing area -----------------------------------------
# Contents will vary

If you have extra time after cleaning up your script, continue onwards to the extras on the next page.

Extras

Manual file selection for data import

A useful alternative to providing a file path is using the {base} R function file.choose(). This triggers a pop-up window to appear, from which you can manually select the file to import.

Your import command would look like this - note the empty parentheses at the end of file.choose():

surv_raw <- import(file.choose())

This might not work with some of the more recent RStudio versions, due to a bug.

This approach has the disadvantage of not being as well documented (for reproducibility) but can be extremely useful if you are routinely receiving updated data by email and simply need to select the most up-to-date version saved to a local folder.

In the Epi R Handbook chapter on importing, there is sample code on how to write code to automatically import the most recent dataset from a folder. However, file.choose() can often be a more simple alternative.

Try to write a command with file.choose() in your R script and test this process. If you do not see the pop-up window, check behind your RStudio window.

{tidylog}

If you have extra time, try to install the {tidylog} package by inserting it into your pacman::p_load() command and re-running it.

Once loaded, this function records every data cleaning step that you run (e.g. filter(), mutate(), select(), etc.) and will print a record of the changes in the Console.

Re-run your cleaning command to see the impact of each step in the process printed to the Console.

Deduplication

Deduplication is a significant topic that is often more complex than the simple example in this exercise. We have written about more complex deduplication scenarios is this the Epi R Handbook chapter on de-duplication.

More date cleaning

Date and times involve many special circumstances. Familiarise yourself with this Epi R Handbook chapter on working with dates.

Mix of date formats

Of particular note, review the package {parsedate} and its function parse_date(). This is used to clean those very messy date columns for which the {lubridate} functions mdy(), dmy(), and ymd() will not work because the dates are not in the same general format across all the rows. parse_date() will evaluate each row individually, intelligently guessing the format of each cell. This takes more time, but is useful if the dates are very messy.

Excel dates

See also the excel_numeric_to_date() function from the {janitor} package. There is also the function convert_to_date() that helps when Excel numeric dates are mixed with other formats.

File naming best practices

File naming can seem like a small thing to focus on, but the ramifications can be significant. Proper file naming can really save you time and headache, and switching your practice can be easy.

Imagine the power of R - you can write code to automatically import the most recent data files by the date in their file name! But this can be hindered by poor file naming practices.

Even if you never try coding such as that, you surely know that bad file naming practices can cause significant problems for version control and archiving.

Good and bad file names

Have you ever seen files like these?

r emo::ji("cross mark") notes.docx
r emo::ji("cross mark") a.R
r emo::ji("cross mark") 2b.xlsx
r emo::ji("cross mark") notes.txt

How about something like this?

r emo::ji("cross mark") case control San Jose factory.pptx
r emo::ji("cross mark") abstract for Mark?.xlsx
r emo::ji("cross mark") December 20 update.R
r emo::ji("cross mark") draft "malaria study" report(1).docs

These are very difficult to work with. Try to follow these principles when naming files (your future self will thank you!):

Let's discuss tips to achieve each of these principles.

Names that are machine-readable

Don't use white (empty) space in file names (machines sometimes get confused by spaces)

r emo::ji("cross mark") Draft Vax outreach team.docx
r emo::ji("check") vaccination-outreach_draft01.docx

Use letters, numbers, hyphens (-), and underscores (_) only (special characters can have special meanings and confuse machines e.g. ^.*?+|$ )

r emo::ji("cross mark") Ministry report ?.docx
r emo::ji("check") ministry_report_draft01.docx

Never have two file names that differ only by case (some operating systems treat b and B the same, while others treat them as different), and be consistent with your case (usually lowercase is better)

r emo::ji("cross mark") study.docx
r emo::ji("cross mark") Study.docx
r emo::ji("cross mark") Belgium-and-france.docx

r emo::ji("check") study.docx
r emo::ji("check") belgium-and-france.docx

Names that are human-readable

Use hyphens and underscores differently

r emo::ji("check") 20200316_goma_linelist.xlsx
r emo::ji("check") transmission-analysis_california_mmwr.docx

Names that sort easily

Dates should be written as YYYY-MM-DD (the ISO 8601 standard). Usually, it is best to place the dates before other name components to preserve chronology.

File names with this style of dates do not sort in chronological order!

r emo::ji("cross mark") 1-April-2012_linelist.xlsx
r emo::ji("cross mark") 1-Jan-2009_linelist.xlsx
r emo::ji("cross mark") 1-Jan-2012_linelist.xlsx
r emo::ji("cross mark") 12-Jan-2012_linelist.xlsx
r emo::ji("cross mark") 2-Jan-2012_linelist.xlsx
r emo::ji("cross mark") 31-Dec-2009_linelist.xlsx

But this style does sort correctly!

r emo::ji("check") 2009-01-01_linelist.xlsx
r emo::ji("check") 2009-12-01_linelist.xlsx
r emo::ji("check") 2009-12-31_linelist.xlsx
r emo::ji("check") 2012-01-01_linelist.xlsx
r emo::ji("check") 2012-01-02_linelist.xlsx
r emo::ji("check") 2012-04-01_linelist.xlsx

To order files without dates, use numbers as prefixes (left pad with 0 so all numbers have the same length!)

r emo::ji("check") 01_introduction.docx
r emo::ji("check") 02_methods.docx
r emo::ji("check") 03_analysis.docx

... (more chapters)...

r emo::ji("check") 19_appendix-04.docx
r emo::ji("check") 20_appendix-05.docx
r emo::ji("check") 21_appendix-06.docx

In the end, file naming does not come with hard rules - it is a personal choice. However, certain choices can lead to more frustration, lost files, and coding headaches later.

quiz(caption = "Quiz - file names",

  question("What components of this file name do not conform to the guidance above (select ALL that apply):\n\nSARS.Exposure.listing 1stdraft 01032016.csv",

    allow_retry = T,
    answer("The date is incorrectly ordered", correct = T, message = "The date should be in YYYY-MM-DD format"),
    answer("It is actually for Measles, not SARS ", message = "You don't know that! This is an example :-)"),
    answer("It uses special characters", correct = T, message = "File names should avoid periods"),
    answer("It uses spaces", correct = T, message = "Avoid spaces in file names"),
    answer("The date should be at the beginning", correct = T, message = "For proper ordering, the date should be at the beginning")
  )
)

Concepts in this section on file names were borrowed from this slide deck on project structure by DJ Navarro.

Additional reading

If you have extra time, we encourage you to review the following for more data cleaning tips:

If you have questions about any of the above subjects, contact your facilitator and if they are not busy helping other students they can help you.



appliedepi/introexercises documentation built on April 22, 2024, 1:01 a.m.