# load packages ----------------------------------------------------------------
library(introexercises) # get data for exercises  
library(learnr)         # create lessons from rmd
library(gradethis)      # evaluate exercises
library(dplyr)          # wrangle data  
library(stringr)        # strings
library(flair)          # highlight code
library(ggplot2)        # visualise data     
library(lubridate)      # work with dates  
library(epikit)         # for emojis   
library(skimr)          # browse data
library(fontawesome)    # for emojis  
library(janitor)        # clean data
# library(RMariaDB)       # connect to sql database 

## set options for exercises and checking ---------------------------------------

## Define how exercises are evaluated 
gradethis::gradethis_setup(
  ## note: the below arguments are passed to learnr::tutorial_options
  ## set the maximum execution time limit in seconds
  exercise.timelimit = 60, 
  ## set how exercises should be checked (defaults to NULL - individually defined)
  # exercise.checker = gradethis::grade_learnr
  ## set whether to pre-evaluate exercises (so users see answers)
  exercise.eval = FALSE 
)

# ## event recorder ---------------------------------------------------------------
# ## see for details: 
# ## https://pkgs.rstudio.com/learnr/articles/publishing.html#events
# ## https://github.com/dtkaplan/submitr/blob/master/R/make_a_recorder.R
# 
# ## connect to your sql database
# sqldtbase <- dbConnect(RMariaDB::MariaDB(),
#                        user     = Sys.getenv("userid"),
#                        password = Sys.getenv("pwd"),
#                        dbname   = 'excersize_log',
#                        host     = "144.126.246.140")
# 
# 
# ## define a function to collect data 
# ## note that tutorial_id is defined in YAML
#     ## you could set the tutorial_version too (by specifying version:) but use package version instead 
# recorder_function <- function(tutorial_id, tutorial_version, user_id, event, data) {
#     
#   ## define a sql query 
#   ## first bracket defines variable names
#   ## values bracket defines what goes in each variable
#   event_log <- paste("INSERT INTO responses (
#                        tutorial_id, 
#                        tutorial_version, 
#                        date_time, 
#                        user_id, 
#                        event, 
#                        section,
#                        label, 
#                        question, 
#                        answer, 
#                        code, 
#                        correct)
#                        VALUES('", tutorial_id,  "', 
#                        '", tutorial_version, "', 
#                        '", format(Sys.time(), "%Y-%M%-%D %H:%M:%S %Z"), "',
#                        '", Sys.getenv("SHINYPROXY_PROXY_ID"), "',
#                        '", event, "',
#                        '", data$section, "',
#                        '", data$label,  "',
#                        '", paste0('"', data$question, '"'),  "',
#                        '", paste0('"', data$answer,   '"'),  "',
#                        '", paste0('"', data$code,     '"'),  "',
#                        '", data$correct, "')",
#                        sep = '')
# 
#     # Execute the query on the sqldtbase that we connected to above
#     rsInsert <- dbSendQuery(sqldtbase, event_log)
#   
# }
# 
# options(tutorial.event_recorder = recorder_function)
# hide non-exercise code chunks ------------------------------------------------
knitr::opts_chunk$set(echo = FALSE)
# data prep --------------------------------------------------------------------
surv_raw <- rio::import(system.file("dat/old_version/surveillance_linelist_20141201.csv", package = "introexercises"))

# for quizes
surv <- rio::import(system.file("dat/old_version/surveillance_linelist_clean_20141201.rds", package = "introexercises"))

Introduction to R for Applied Epidemiology and Public Health

Welcome

Welcome to the live course "Introduction to R for applied epidemiology", offered by Applied Epi - a nonprofit organisation that offers open-source tools, training, and support to frontline public health practitioners.

knitr::include_graphics("images/logo.png", error = F)

Data cleaning

This exercise focuses on logical recoding of values, filtering rows, creating age categories, and other more complex data cleaning tasks.

Format

This exercise will guide you through a set of tasks.
You should perform these tasks in RStudio and on your local computer.

Getting Help

There are several ways to get help:

1) Look for the "helpers" (see below) 2) Ask your live course instructor/facilitator for help
3) Ask a colleague or other participant in the course for tips
4) Post a question in Applied Epi Community in the category for questions about Applied Epi Training

Here is what those "helpers" will look like:

r fontawesome::fa("lightbulb", fill = "gold") Click to read a hint

Here you will see a helpful hint!


r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

linelist %>% 
  filter(
    age > 25,
    district == "Bolo"
  )

Here is more explanation about why the solution works.


Quiz questions

Please complete the quiz questions that you encounter throughout the tutorial. Answering will help you to comprehend the material, and will also help us to improve the exercises for future students.

To practice, please answer the following questions:

quiz(
  question_radio("When should I view the red 'helper' code?",
    answer("After trying to write the code myself", correct = TRUE),
    answer("Before I try coding", correct = FALSE),
    correct = "Reviewing best-practice code after trying to write yourself can help you improve",
    incorrect = "Please attempt the exercise yourself, or use the hint, before viewing the answer."
  )
)
question_numeric(
 "How anxious are you about beginning this tutorial - on a scale from 1 (least anxious) to 10 (most anxious)?",
 answer(10, message = "Try not to worry, we will help you succeed!", correct = T),
 answer(9, message = "Try not to worry, we will help you succeed!", correct = T),
 answer(8, message = "Try not to worry, we will help you succeed!", correct = T),
 answer(7, message = "Try not to worry, we will help you succeed!", correct = T),
 answer(6, message = "Ok, we will get there together", correct = T),
 answer(5, message = "Ok, we will get there together", correct = T),
 answer(4, message = "I like your confidence!", correct = T),
 answer(3, message = "I like your confidence!", correct = T),
 answer(2, message = "I like your confidence!", correct = T),
 answer(1, message = "I like your confidence!", correct = T),
 allow_retry = TRUE,
 correct = "Thanks for sharing. ",
 min = 1,
 max = 10,
 step = 1
)

Icons

You will see these icons throughout the exercises:

Icon |Meaning ------|-------------------- r fontawesome::fa("eye", fill = "darkblue")|Observe
r fontawesome::fa("exclamation", fill = "red")|Alert!
r fontawesome::fa("pen", fill = "brown")|An informative note
r fontawesome::fa("terminal", fill = "black")|Time for you to code!
r fontawesome::fa("window-restore", fill = "darkgrey")|Change to another window
r fontawesome::fa("bookmark", fill = "orange")|Remember this for later

License

Creative Commons License Applied Epi Incorporated, 2022
This work is licensed by Applied Epi Incorporated under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License.

Please email contact@appliedepi.org with questions about the use of these materials for academic courses and epidemiologist training programs.

Learning objectives

In this exercise you will:

Setup

As before, open RStudio by double-clicking on the intro_course.Rproj file in the "intro_course" folder.

Re-open the "ebola_analysis.R" script that you began writing in the previous session. You will continue your work in this script, by expanding the cleaning R code.

Then do the following:

You should also have "Exploratory analysis" or "Testing" sections in your script. Take a moment to organize these in a coherent manner so they do not interrupt the flow of your script by causing errors. For purposes of reproducibility, you should be able to run your entire script with no errors. This will make sharing the script with colleagues much easier.

Now we will continue by adding further commands to your cleaning pipe chain...

Logical recoding

Sometimes, the recoding we want is not as simple as replacing one value for another. Instead, we use logical criteria and logical operators, and often reference values in several columns.

Negative values

In the weight column wt_kg, there are some negative values. Open the data frame in the viewer (e.g. by clicking the data frame name in the Environment pane) and sort the dataset by clicking on the wt_kg column. Note the negative values.

Alternatively, in your Testing section, run the {base} command summary() on the column surv$wt_kg to return key summary statistics on this numeric column:

summary(surv$wt_kg)

This must be some data entry error. To avoid errors in any future calculations or plots, convert these values to NA (missing). Remember, you should make edits like this here in the script. This is better than editing in CSV file, because here there is a record of changes performed.

The logic we want to apply is simple: "If wt_kg negative, convert it to NA, otherwise remain the same." Therefore, we can use the ifelse() function within mutate().

The syntax is: ifelse(CONDITION APPLIED TO ROW, OUTCOME IF TRUE, OUTCOME IF FALSE)

mutate(wt_kg = ifelse(wt_kg < 0, NA, wt_kg))

Add this to your pipe chain and verify that it worked correctly.

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

surv <- surv_raw %>% 

  # automatically clean column names
  clean_names() %>% 

  # manually clean column names   
  rename(
    date_onset = onset_date,
    date_report = date_of_report,
    district_res = adm3_name_res,
    district_det = adm3_name_det) %>%

  # remove unnecessary column
  select(-row_num) %>% 

  # de-duplicate rows  
  distinct() %>% 

  # convert date_onset to date class
  mutate(date_onset = mdy(date_onset)) %>% 
  mutate(date_report = mdy(date_report)) %>% 

  # convert age to numeric class
  mutate(age = as.numeric(age)) %>% 

  # convert "Unknown" gender to NA
  mutate(gender = na_if(gender, "Unknown")) %>% 

  # properly record missing values in many character columns
  mutate(across(.cols = where(is.character), .fns = na_if, "")) %>% 

  # re-code hospital column
  mutate(hospital = recode(hospital,
    # for reference: OLD = NEW
    "Mitilary Hospital"  = "Military Hospital",
    "Port"               = "Port Hospital",
    "Port Hopital"       = "Port Hospital",
    "St. Mark's Maternity Hospital (SMMH)" = "SMMH")) %>%

  # recode gender
  mutate(gender = recode(gender,
    "m" = "male",
    "f" = "female")) %>% 

  # convert negative weight values to NA
  mutate(wt_kg = ifelse(wt_kg < 0, NA, wt_kg)) 


Now re-run your summary() command to check the new range of the values:

summary(surv$wt_kg)

Review logical operators

As you clean, it is important to know these logical and relational operators. There is no need to memorize them (see the Epi R Handbook basics chapter), but they are here for you review quickly.

Relational operators

Relational operators compare values and are often used when defining new variables and subsets of datasets. Here are the common relational operators in R:

+--------------------------+------------+--------------+--------------------------------------------------------------------------------------------------------------------------------------------------------+ | Meaning | Operator | Example | Example Result | +==========================+============+==============+========================================================================================================================================================+ | Equal to | == | "A" == "a" | FALSE (because R is case sensitive) Note that == (double equals) is different from = (single equals), see note below | +--------------------------+------------+--------------+--------------------------------------------------------------------------------------------------------------------------------------------------------+ | Not equal to | != | 2 != 0 | TRUE | +--------------------------+------------+--------------+--------------------------------------------------------------------------------------------------------------------------------------------------------+ | Greater than | > | 4 > 2 | TRUE | +--------------------------+------------+--------------+--------------------------------------------------------------------------------------------------------------------------------------------------------+ | Less than | < | 4 < 2 | FALSE | +--------------------------+------------+--------------+--------------------------------------------------------------------------------------------------------------------------------------------------------+ | Greater than or equal to | >= | 6 >= 4 | TRUE | +--------------------------+------------+--------------+--------------------------------------------------------------------------------------------------------------------------------------------------------+ | Less than or equal to | <= | 6 <= 4 | FALSE | +--------------------------+------------+--------------+--------------------------------------------------------------------------------------------------------------------------------------------------------+ | Value is missing | is.na() | is.na(7) | FALSE | +--------------------------+------------+--------------+--------------------------------------------------------------------------------------------------------------------------------------------------------+ | Value is not missing | !is.na() | !is.na(7) | TRUE | +--------------------------+------------+--------------+--------------------------------------------------------------------------------------------------------------------------------------------------------+

Note that the == (double equals) asks the question to R: "IS the value on the right equal to the value on the left?. The result is either TRUE or FALSE.

In contrast, the = (single equals) is used to assign values, such as for an argument within a function:

Logical operators

Logical operators, such as AND and OR, are often used to connect relational operators and create more complicated criteria. Complex statements might require parentheses ( ) for grouping and order of application.

+---------------------+-----------------------------------------------------------------------+ | Meaning | Operator | +=====================+=======================================================================+ | AND | & | +---------------------+-----------------------------------------------------------------------+ | OR | | (vertical bar) | +---------------------+-----------------------------------------------------------------------+ | Parentheses | ( ) Used to group criteria together and clarify order of operations | +---------------------+-----------------------------------------------------------------------+

Other simple logic recoding options

Now that you have learned simple logical re-coding, you can use ifelse() for many situations. Here are two other simple logical recoding functions that you should bookmark to read about later:

case_when() complex recoding

For more complex re-coding, you will likely switch to using case_when() from {dplyr}.

This function allows you to run the equivalent of multiple ifelse() statements in a series - it is R's equivalent to the SQL CASE WHEN statement.

It is just a coincidence that the case_when() function has a name that sounds like it was made for building case definitions! It is a general function - it is not specific to public health. Still, here is a demonstration of a case_when() command, used within mutate(), to create a case definition column:

mutate(case_def = case_when(            # create new column case_def, using the function case_when()
    lab_confirmed == TRUE             ~ "Confirmed",       # if lab confirmed, assign as Confirmed
    epilink == "yes" & fever == "yes" ~ "Suspect",         # else, if epilink and fever, assign as Suspect
    TRUE                              ~ "To investigate")) # else, assign as "To investigate"

Let us examine what is happening in the above command:

Setup

Structure within the case_when():

How the new values are assigned:

Every row in the data frame is evaluated for the top logical statement (lab_confirmed == TRUE). If this statement is TRUE for that data frame row, its value in column case_def is set as "Confirmed". However if lab_confirmed is not TRUE, then the case_when() function advances to check if the second logical statement is valid for that row, and the process repeats. Thus, for each row in the data frame, the first logical statement that evaluates to TRUE is used.

Therefore, it is very important to write the formulae in an intentional order - typically from very specific (at the top) to very broad (at the bottom).

If none of the logical statements apply to a row in the data, that row's value is assigned NA by default, unless a TRUE statement is written as the last statement, as shown above. When the bottom logical statement is simply written as TRUE, this applies for any other situation that has not triggered one of the logical statements above.

In the case definition example above, if a patient is lab confirmed they are immediately categorized as a "Confirmed". If lab confirmation is FALSE or missing, then the second formula is checked: if they have an epidemiological link and a fever the patient is categorized as a "Suspect". Finally, any row that has not met either of those criteria is marked as "To investigate".

Add the above mutate() command to the bottom of your pipe chain.

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

surv <- surv_raw %>% 

  # automatically clean column names
  clean_names() %>% 

  # manually clean column names   
  rename(
    date_onset = onset_date,
    date_report = date_of_report,
    district_res = adm3_name_res,
    district_det = adm3_name_det) %>%

  # remove unnecessary column
  select(-row_num) %>% 

  # de-duplicate rows  
  distinct() %>% 

  # convert date_onset to date class
  mutate(date_onset = mdy(date_onset)) %>% 
  mutate(date_report = mdy(date_report)) %>% 

  # convert age to numeric class
  mutate(age = as.numeric(age)) %>% 

  # convert "Unknown" gender to NA
  mutate(gender = na_if(gender, "Unknown")) %>% 

  # properly record missing values in many character columns
  mutate(across(.cols = where(is.character), .fns = na_if, "")) %>% 

  # re-code hospital column
  mutate(hospital = recode(hospital,
    # for reference: OLD = NEW
    "Mitilary Hospital"  = "Military Hospital",
    "Port"               = "Port Hospital",
    "Port Hopital"       = "Port Hospital",
    "St. Mark's Maternity Hospital (SMMH)" = "SMMH")) %>%

  # recode gender
  mutate(gender = recode(gender,
    "m" = "male",
    "f" = "female")) %>% 

  # convert negative weight values to NA
  mutate(wt_kg = ifelse(wt_kg < 0, NA, wt_kg)) %>% 

  # create case definition
  mutate(case_def = case_when(
    lab_confirmed == TRUE             ~ "Confirmed",
    epilink == "yes" & fever == "yes" ~ "Suspect",
    TRUE                              ~ "To investigate"))


Now we can tabulate the new column case_def and see the results (in the Testing section):

surv %>% 
  tabyl(case_def)

It is also wise to go look at the dataset to confirm that the logic was applied the way that you intended!

r fontawesome::fa("exclamation", fill = "red") DANGER: All outcome values on the RIGHT-side must all be the same class - either numeric, integer, character, date, logical, etc. Sometimes, this requires assigning variations of NA such as NA_character_, NA_real_ (for numeric), and as.Date(NA). Read more in the dates chapter of the Epi R Handbook.

Create column age_years

Now apply your case_when() knowledge to this next scenario.

Create a new column age_years that converts the value in column age correctly, depending on whether the value in age_unit is "years" or "months".

Below we give you the outline of the command - it is your responsibility to fill in the logic statements on the right, and the calculations on the right.

  # create age-in-years
  mutate(age_years = case_when(

    # logic for age_unit    # adjustment to make age_years  
    [if months]               ~ [calculation],    # return: age / 12
    [if years]                ~ [calculation],    # return: age 
    [if missing]              ~ [calculation]))   # if unit is missing, assume years 

r fontawesome::fa("lightbulb", fill = "gold") Click to read a hint

Within the parentheses of case_when(), on the left side of each ~ you are writing logical criteria that will be assessed for each row in the dataset, and on the right side of the ~ you write the resulting value to be returned.

The first logical criteria should be: "is age_unit equal to "months"? In code, this is written age_unit == "months" (note the double equals, which asks the question of equivalence). One the right side of the ~, you would write an equation that would produce the correct outcome in the new column age_years. In this case, it would be the value in age divided by 12. In code this would be written as: age_unit == "months" ~ age / 12. Don't forget a comma at the end, and to continue on the next line with the next logical criteria.

If you happen to encounter this error: 'names' attribute [1] must be the same length as the vector [0], then confirm that you converted the age column to class numeric earlier in the pipe chain. This error is because by dividing by 12, we are introducing decimals, which is not acceptable if the column is still "integer" class.


r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

surv <- surv_raw %>% 

  # automatically clean column names
  clean_names() %>% 

  # manually clean column names   
  rename(
    date_onset = onset_date,
    date_report = date_of_report,
    district_res = adm3_name_res,
    district_det = adm3_name_det) %>%

  # remove unnecessary column
  select(-row_num) %>% 

  # de-duplicate rows  
  distinct() %>% 

  # convert date_onset to date class
  mutate(date_onset = mdy(date_onset)) %>% 
  mutate(date_report = mdy(date_report)) %>% 

  # convert age to numeric class
  mutate(age = as.numeric(age)) %>% 

  # convert "Unknown" gender to NA
  mutate(gender = na_if(gender, "Unknown")) %>% 

  # properly record missing values in many character columns
  mutate(across(.cols = where(is.character), .fns = na_if, "")) %>% 

  # re-code hospital column
  mutate(hospital = recode(hospital,
    # for reference: OLD = NEW
    "Mitilary Hospital"  = "Military Hospital",
    "Port"               = "Port Hospital",
    "Port Hopital"       = "Port Hospital",
    "St. Mark's Maternity Hospital (SMMH)" = "SMMH")) %>%

  # recode gender
  mutate(gender = recode(gender,
    "m" = "male",
    "f" = "female")) %>% 

  # convert negative weight values to NA
  mutate(wt_kg = ifelse(wt_kg < 0, NA, wt_kg))  %>% 

  # create case definition
  mutate(case_def = case_when(
    lab_confirmed == TRUE             ~ "Confirmed",
    epilink == "yes" & fever == "yes" ~ "Suspect",
    TRUE                              ~ "To investigate")) %>% 

  # create age-in-years
  mutate(age_years = case_when(
    age_unit == "months" ~ age/12,   # if age is given in months
    age_unit == "years"  ~ age,      # if age is given in years
    is.na(age_unit)      ~ age))     # if age_unit missing assume years, else NA


That was difficult! If the above exercise with case_when() was very challenging for you, do not despair! case_when() can be difficult, but will come with practice. Talk with your facilitator about the errors or difficulties you encountered. It will come!

It is important to remember that in case_when(), all of the right-side values must be of the same class (numeric, integer, character, date, logical, etc). Sometimes, this requires assigning variations of NA such as NA_character_, NA_real_ (for numeric), and as.Date(NA). Read more in the dates chapter of the Epi R Handbook. In this scenario, we want any circumstance not caught by a logic statement to return NA, so we can leave it.

Age categories

Now that we have a valid age_years column, let's create a categorical column for "age category".

There are several ways to do this (see this chapter of the Epi R Handbook for details), but here we just apply the most simple method.

The function age_categories() from {epikit} can be used within mutate() to define a new column age_cat with 10-year age categories.

The arguments for age_categories are:

1) The numeric column to divide into categories (age_years)
2) lower = A number that is the lower numeric boundary (0)
3) upper = A number that is the upper numeric boundary for categories (e.g. set this to 70, so that all ages above 70 are put in the category "70+")
4) by = A number for the width of the categories (10)

Try adding this command to the pipe chain If you get stuck, see the solution code hidden below.

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

surv <- surv_raw %>% 

  # automatically clean column names
  clean_names() %>% 

  # manually clean column names   
  rename(
    date_onset = onset_date,
    date_report = date_of_report,
    district_res = adm3_name_res,
    district_det = adm3_name_det) %>%

  # remove unnecessary column
  select(-row_num) %>% 

  # de-duplicate rows  
  distinct() %>% 

  # convert date_onset to date class
  mutate(date_onset = mdy(date_onset)) %>% 
  mutate(date_report = mdy(date_report)) %>% 

  # convert age to numeric class
  mutate(age = as.numeric(age)) %>% 

  # convert "Unknown" gender to NA
  mutate(gender = na_if(gender, "Unknown")) %>% 

  # properly record missing values in many character columns
  mutate(across(.cols = where(is.character), .fns = na_if, "")) %>% 

  # re-code hospital column
  mutate(hospital = recode(hospital,
    # for reference: OLD = NEW
    "Mitilary Hospital"  = "Military Hospital",
    "Port"               = "Port Hospital",
    "Port Hopital"       = "Port Hospital",
    "St. Mark's Maternity Hospital (SMMH)" = "SMMH")) %>%

  # recode gender
  mutate(gender = recode(gender,
    "m" = "male",
    "f" = "female")) %>% 

  # convert negative weight values to NA
  mutate(wt_kg = ifelse(wt_kg < 0, NA, wt_kg))  %>% 

  # create case definition
  mutate(case_def = case_when(
    lab_confirmed == TRUE             ~ "Confirmed",
    epilink == "yes" & fever == "yes" ~ "Suspect",
    TRUE                              ~ "To investigate")) %>% 

  # create age-in-years
  mutate(age_years = case_when(
    age_unit == "months" ~ age/12,   # if age is given in months
    age_unit == "years"  ~ age,      # if age is given in years
    is.na(age_unit)      ~ age)) %>% # if unit missing assume years, else NA

  # create age category column
  mutate(age_cat = age_categories(         # create new column
    age_years,                             # numeric column to make groups from
    lower = 0,
    upper = 70,
    by = 10))


You can also define your age categories other ways, for example, by providing a vector of age breaks such as c(0, 5, 10, 15, 20, 30, 40, 50). Ask your instructor if you want to know how to do this. Or read about in the function's documentation by entering ?age_categories in the R console, or by searching for age_categories in the Help pane of RStudio (lower-right).

r fontawesome::fa("exclamation", fill = "red") Have you saved your script lately?

Date differences

Now that both of the date columns are correctly classified as "date", we can do math on them with simple + and - operators.

Add a mutate() command to the pipe chain, to create a new column named diff. Define this new column to equal the number of days between date_onset and date_report for each case.

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

surv <- surv_raw %>% 

  # automatically clean column names
  clean_names() %>% 

  # manually clean column names   
  rename(
    date_onset = onset_date,
    date_report = date_of_report,
    district_res = adm3_name_res,
    district_det = adm3_name_det) %>%

  # remove unnecessary column
  select(-row_num) %>% 

  # de-duplicate rows  
  distinct() %>% 

  # convert date_onset to date class
  mutate(date_onset = mdy(date_onset)) %>% 
  mutate(date_report = mdy(date_report)) %>% 

  # convert age to numeric class
  mutate(age = as.numeric(age)) %>% 

  # convert "Unknown" gender to NA
  mutate(gender = na_if(gender, "Unknown")) %>% 

  # properly record missing values in many character columns
  mutate(across(.cols = where(is.character), .fns = na_if, "")) %>% 

  # re-code hospital column
  mutate(hospital = recode(hospital,
    # for reference: OLD = NEW
    "Mitilary Hospital"  = "Military Hospital",
    "Port"               = "Port Hospital",
    "Port Hopital"       = "Port Hospital",
    "St. Mark's Maternity Hospital (SMMH)" = "SMMH")) %>%

  # recode gender
  mutate(gender = recode(gender,
    "m" = "male",
    "f" = "female")) %>% 

  # convert negative weight values to NA
  mutate(wt_kg = ifelse(wt_kg < 0, NA, wt_kg))  %>% 

  # create case definition
  mutate(case_def = case_when(
    lab_confirmed == TRUE             ~ "Confirmed",
    epilink == "yes" & fever == "yes" ~ "Suspect",
    TRUE                              ~ "To investigate")) %>% 

  # create age-in-years
  mutate(age_years = case_when(
    age_unit == "months" ~ age/12,   # if age is given in months
    age_unit == "years"  ~ age,      # if age is given in years
    is.na(age_unit)      ~ age)) %>% # if unit missing assume years, else NA

  # create age category column
  mutate(age_cat = age_categories(         # create new column
    age_years,                             # numeric column to make groups from
    lower = 0,
    upper = 70,
    by = 10)) %>% 

  # Make date-difference column  
  mutate(diff = date_report - date_onset)


Check the calculation for a few rows, manually. Is it how you expect?

Note that this column's values appear like "4 days" - this is a special class called "difftime". If you want the result to be purely numeric, you have to wrap the result of the calculation in the {base} function as.numeric(). That would look like this:

mutate(diff = as.numeric(date_report - date_onset))

Note that currently, a patient with date_onset and date_report on 9 November 2014 would be recorded with diff of 0 days. In your work context, you may want to add a +1 after the end of the as.numeric() command, if you prefer same day admission to be recorded as 1 and not 0.

Location

We have two columns that express districts (locations) - of the cases's residence, and where they were detected as a case.

How similar are these two columns?

One way to assess how similar the columns are, is to create a new column that indicates whether the two column's values in that row are different. Then, you can tabulate this new column.

The syntax is actually quite simple, using mutate() and the != operator (which means NOT EQUAL TO). Add this to your pipe chain.

# New column will be TRUE if the two values are different
mutate(moved = district_res != district_det)

Now try to assess the differences manually in RStudio. Open the surv dataset in the viewer (by clicking on it in the Environment pane) and click the filter button on the upper-left to limit the view to rows in which the new column moved is TRUE. Was the logic for the new column applied correctly? Are TRUE and FALSE appearing where they should be?

quiz(caption = "Quiz - Moving cases",
  question_numeric(
    "How many cases have different districts of residence and of detection?",
    answer(surv_raw %>%
             distinct() %>%
             mutate(across(.cols = c(adm3_name_res, adm3_name_det), .fns = na_if, "")) %>% 
             mutate(moved = adm3_name_res != adm3_name_det) %>% 
             filter(moved) %>%
             nrow(),
           correct = T),
    allow_retry = TRUE,
    correct = "Correct, nice work.",
    min = 1,
    max = 700,
    step = 1
  )
)

You can use R code to find the answer as well, by using the tabyl() function on the new column:

surv %>% 
  tabyl(moved)

Prioritize certain values

Now that you know there is some discrepancy, which of these values should be prioritized? District of residence, or district of detection?

This is a question that each outbreak epidemiology team must answer given their knowledge of the data collection process, and of the outbreak itself. For this exercise, we will prioritize location of detection, because it might better reflect the nearby populations who are at higher risk for infection (which can be best understood by good case investigation).

Create a new column in the data simply named district. Create it using the coalesce() function, which will take its first value if it is available, else it will take its second value. This function allows us to "fill-in" missing values by using the first non-missing value (given a preference order), or to prioritize one value over another if they differ.

# Add this command to the pipechain
mutate(district = coalesce(district_det, district_res))

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

surv <- surv_raw %>% 

  # automatically clean column names
  clean_names() %>% 

  # manually clean column names   
  rename(
    date_onset = onset_date,
    date_report = date_of_report,
    district_res = adm3_name_res,
    district_det = adm3_name_det) %>%

  # remove unnecessary column
  select(-row_num) %>% 

  # de-duplicate rows  
  distinct() %>% 

  # convert date_onset to date class
  mutate(date_onset = mdy(date_onset)) %>% 
  mutate(date_report = mdy(date_report)) %>% 

  # convert age to numeric class
  mutate(age = as.numeric(age)) %>% 

  # convert "Unknown" gender to NA
  mutate(gender = na_if(gender, "Unknown")) %>% 

  # properly record missing values in many character columns
  mutate(across(.cols = where(is.character), .fns = na_if, "")) %>% 

  # re-code hospital column
  mutate(hospital = recode(hospital,
    # for reference: OLD = NEW
    "Mitilary Hospital"  = "Military Hospital",
    "Port"               = "Port Hospital",
    "Port Hopital"       = "Port Hospital",
    "St. Mark's Maternity Hospital (SMMH)" = "SMMH")) %>%

  # recode gender
  mutate(gender = recode(gender,
    "m" = "male",
    "f" = "female")) %>% 

  # convert negative weight values to NA
  mutate(wt_kg = ifelse(wt_kg < 0, NA, wt_kg))  %>% 

  # create case definition
  mutate(case_def = case_when(
    lab_confirmed == TRUE             ~ "Confirmed",
    epilink == "yes" & fever == "yes" ~ "Suspect",
    TRUE                              ~ "To investigate")) %>% 

  # create age-in-years
  mutate(age_years = case_when(
    age_unit == "months" ~ age/12,   # if age is given in months
    age_unit == "years"  ~ age,      # if age is given in years
    is.na(age_unit)      ~ age)) %>% # if unit missing assume years, else NA

  # create age category column
  mutate(age_cat = age_categories(         # create new column
    age_years,                             # numeric column to make groups from
    lower = 0,
    upper = 70,
    by = 10)) %>% 

  # Make date-difference column  
  mutate(diff = date_report - date_onset) %>% 

  # create column marking TRUE if district of residence and detection differ
  mutate(moved = district_res != district_det) %>% 

  # create new column that prioritizes district of detection
  mutate(district = coalesce(district_det, district_res))


View the dataset in the RStudio viewer again, and verify with your eye that this code worked as expected

Use the following command to answer the question below:

# cross tabulate "moved" column with "district_res" column
surv %>% 
  tabyl(moved, district_res)
quiz(caption = "Quiz - Moving cases origin",
  question_numeric(
    "Of cases with different residence and detection locations, how many had residence in 'Mountain Rural' district?",
    answer(surv_raw %>%
             distinct() %>%
             mutate(across(
               .cols = c(adm3_name_res, adm3_name_det),
               .fns = na_if, "")) %>% 
             mutate(moved = adm3_name_res != adm3_name_det) %>% 
             filter(moved & adm3_name_res == "Mountain Rural") %>%
             nrow(),
           correct = T),
    allow_retry = TRUE,
    correct = "Correct, nice work.",
    min = 1,
    max = 700,
    step = 1
  )
)

r fontawesome::fa("exclamation", fill = "red") Have you saved your script lately?

Filter rows

Sub-setting rows is done with the function filter(). Inside the parentheses you write logical criteria that are applied to each row. Rows that meet the criteria are kept.

Remove suspect cases

For the purposes of this exercise, we will remove any "suspect" cases from the dataset.

The simplest way of filtering, is to use a logical condition ("==", "!=", ">", "<", etc), so that only rows where the logical statement is TRUE are kept. For example here, we are only keeping rows where the gender is equal to "female".

filter(gender == "female")

Multiple logical criteria can be combined using & (and), | (or), and parentheses.

Add a filter() command that restricts the dataset to only cases where case_def is "Confirmed".

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

surv <- surv_raw %>% 

  # automatically clean column names
  clean_names() %>% 

  # manually clean column names   
  rename(
    date_onset = onset_date,
    date_report = date_of_report,
    district_res = adm3_name_res,
    district_det = adm3_name_det) %>%

  # remove unnecessary column
  select(-row_num) %>% 

  # de-duplicate rows  
  distinct() %>% 

  # convert date_onset to date class
  mutate(date_onset = mdy(date_onset)) %>% 
  mutate(date_report = mdy(date_report)) %>% 

  # convert age to numeric class
  mutate(age = as.numeric(age)) %>% 

  # convert "Unknown" gender to NA
  mutate(gender = na_if(gender, "Unknown")) %>% 

  # properly record missing values in many character columns
  mutate(across(.cols = where(is.character), .fns = na_if, "")) %>% 

  # re-code hospital column
  mutate(hospital = recode(hospital,
    # for reference: OLD = NEW
    "Mitilary Hospital"  = "Military Hospital",
    "Port"               = "Port Hospital",
    "Port Hopital"       = "Port Hospital",
    "St. Mark's Maternity Hospital (SMMH)" = "SMMH")) %>%

  # recode gender
  mutate(gender = recode(gender,
    "m" = "male",
    "f" = "female")) %>% 

  # convert negative weight values to NA
  mutate(wt_kg = ifelse(wt_kg < 0, NA, wt_kg))  %>% 

  # create case definition
  mutate(case_def = case_when(
    lab_confirmed == TRUE             ~ "Confirmed",
    epilink == "yes" & fever == "yes" ~ "Suspect",
    TRUE                              ~ "To investigate")) %>% 

  # create age-in-years
  mutate(age_years = case_when(
    age_unit == "months" ~ age/12,   # if age is given in months
    age_unit == "years"  ~ age,      # if age is given in years
    is.na(age_unit)      ~ age)) %>% # if unit missing assume years, else NA

  # create age category column
  mutate(age_cat = age_categories(         # create new column
    age_years,                             # numeric column to make groups from
    lower = 0,
    upper = 70,
    by = 10)) %>% 

  # Make date-difference column  
  mutate(diff = date_report - date_onset) %>% 

  # create column marking TRUE if district of residence and detection differ
  mutate(moved = district_res != district_det) %>% 

  # create new column that prioritizes district of detection
  mutate(district = coalesce(district_det, district_res)) %>% 

  # remove suspect cases
  filter(case_def == "Confirmed")


r fontawesome::fa("eye", fill = "darkblue") Now review your cleaning pipe chain, and the one in the solution above. Ensure that all the cleaning steps in the solution are also present in your R script "ebola_analysis.R".

Re-arrange columns

Click on your data frame surv and review the order of the column.

quiz(caption = "Quiz - Re-arrange columns",
  question("Where did the new columns appear, that you have created during this exercise?",
    allow_retry = TRUE,
    answer("On the left-most side of the data frame"),
    answer("On the right-most side of the data frame", correct = TRUE),
    answer("In the middle of the data frame")
  )
)

This location has no effect on being able to reference columns within a pipe chain, nor referencing columns with a $. However, if we want to export this clean data frame and send to it a colleague, we may prefer that some columns are naturally placed next to each other - for example age and age_years.

If you remember from the data cleaning section on select(), you can use the select() command to re-arrange columns as well as to select certain columns.

You can also list some columns, to move them to the beginning, and finish the select() statement by using the "helper" everything(). This will include all the other columns.

Note the empty parentheses at the end of everything()). There are other such "tidyselect" helpers available to you such as contains() and starts_with(), which you can write within the select() command like contains("date_"). Read more about these helpers in this Epi R Handbook chapter.

Can you add a select() command to your pipe chain that does re-arranges the columns into the following order:

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

surv <- surv_raw %>% 

  # automatically clean column names
  clean_names() %>% 

  # manually clean column names   
  rename(
    date_onset = onset_date,
    date_report = date_of_report,
    district_res = adm3_name_res,
    district_det = adm3_name_det) %>%

  # remove unnecessary column
  select(-row_num) %>% 

  # de-duplicate rows  
  distinct() %>% 

  # convert date_onset to date class
  mutate(date_onset = mdy(date_onset)) %>% 
  mutate(date_report = mdy(date_report)) %>% 

  # convert age to numeric class
  mutate(age = as.numeric(age)) %>% 

  # convert "Unknown" gender to NA
  mutate(gender = na_if(gender, "Unknown")) %>% 

  # properly record missing values in many character columns
  mutate(across(.cols = where(is.character), .fns = na_if, "")) %>% 

  # re-code hospital column
  mutate(hospital = recode(hospital,
    # for reference: OLD = NEW
    "Mitilary Hospital"  = "Military Hospital",
    "Port"               = "Port Hospital",
    "Port Hopital"       = "Port Hospital",
    "St. Mark's Maternity Hospital (SMMH)" = "SMMH")) %>%

  # recode gender
  mutate(gender = recode(gender,
    "m" = "male",
    "f" = "female")) %>% 

  # convert negative weight values to NA
  mutate(wt_kg = ifelse(wt_kg < 0, NA, wt_kg))  %>% 

  # create case definition
  mutate(case_def = case_when(
    lab_confirmed == TRUE             ~ "Confirmed",
    epilink == "yes" & fever == "yes" ~ "Suspect",
    TRUE                              ~ "To investigate")) %>% 

  # create age-in-years
  mutate(age_years = case_when(
    age_unit == "months" ~ age/12,   # if age is given in months
    age_unit == "years"  ~ age,      # if age is given in years
    is.na(age_unit)      ~ age)) %>% # if unit missing assume years, else NA

  # create age category column
  mutate(age_cat = age_categories(         # create new column
    age_years,                             # numeric column to make groups from
    lower = 0,
    upper = 70,
    by = 10)) %>% 

  # Make date-difference column  
  mutate(diff = date_report - date_onset) %>% 

  # create column marking TRUE if district of residence and detection differ
  mutate(moved = district_res != district_det) %>% 

  # create new column that prioritizes district of detection
  mutate(district = coalesce(district_det, district_res)) %>% 

  # remove suspect cases
  filter(case_def == "Confirmed") %>% 

  # re-arrange columns
  select(case_id, starts_with("date"), diff, gender, age, age_unit, age_years, age_cat, hospital, district, district_res, district_det, moved, everything())


r fontawesome::fa("exclamation", fill = "red") It is important to note that filtering will by default remove missing values. E.g. filter(case_def == "Confirmed") will remove rows in which case_def is missing (NA).

Likewise, filtering based on a numeric or date criteria will also remove missing values. For example:

To keep rows with missing values, add an "OR" bar and then the is.na() function to the filter:

The statements above allow rows to remain that meet the first criteria, or that have missing values.

Export clean dataset

Now that we have cleaned this dataset, we should save it in our file directory for later use, and perhaps to share with other people.

We can use the export() function from the {rio} package - this is the same package that we used to import() the raw data. This is a standalone command - not part of your pipe chain. The arguments that it expects are:

1) The R object (data frame) that should be exported
2) The desired name for the file, in quotation marks, with extension (e.g. "surveillance_linelist_clean_20141201.csv")

The standalone command below (not part of your pipe chain) will save surv as a CSV file to the root folder of your R project. Run the command and go look for it with your file explorer.

# Export cleaned file to the root folder of the R project  
export(surv, "surveillance_linelist_clean_20141201.csv")

Saving as .rds

If you save the file as .xlsx or .csv, any person using R to further analyze the data will have to re-clean it by again specifying the column classes. To avoid this, save the clean dataset as an ".rds" file. RDS is a file format specific to R, and is very useful if you know you will work with the exported data again in R.

For example, if you work in an epidemiology team and need to send files to a GIS team for mapping, and they use R as well, just send them the .rds file! Then all the column classes are retained and they have less cleaning work to do.

In the example below, we save surv as a .rds file. We also specify that we want it to be saved in a subfolder ("data/clean") by writing a file path using here() in the same way that we did to import the data. Try this one too!

# Export cleaned file  
export(surv, here("data", "clean", "surveillance_linelist_clean_20141201.rds"))

Above, we write the name of the R object (surv), and then provide the file path and file namethat we want to save it to (using the same here() syntax that we used to import data). Note the file extension ".rds".

If you wanted to export the data as CSV or Excel, you would simply change the file extension in the command to: .csv or .xlsx

End

Congratulations! You have completed the cleaning exercise! This is a huge step in your introduction to R. You have now been introduced to most of the important data management functions in R.

Be sure to save your R script, and check-in with your facilitator.

In case you did not finish, do not worry. In the case that you can not finish, look in the "scripts/backup" folder. You will find R scripts for the Ebola analysis at every stage of this course. So you can always us these as a "backup" for later modules.

Extras

Here are some exercises to practice what you have learned. Can you create these data frames, starting from the clean surv data frame?

Spotlight on children

Create a separate data frame for children and save it to the "outputs" folder as "children_spotlight.csv"

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

# create data frame with only child cases
children <- surv %>% 
  filter(age_years < 18) %>% 
  mutate(infant = ifelse(age_years < 1, "Infant", "Non-infant")) %>% 
  select(case_id, age_years, infant, gender, district, hospital, date_report)

# export
export(children, "children_spotlight.csv")


GIS dataset

Your GIS team needs a data frame to put into their routine maps for this outbreak. They use R, so they expect a .rds file containin the case_id, any columns containing information on location or whether the patient moved significantly from their residence before being detected.

They have access to your RStudio project and expect the file to be saved into the "outputs" sub-folder with the current date of the data at the end of the file name.

Can you make this file for them?

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

# create data frame with only gis information
gis <- surv %>% 
  select(case_id, district, district_res, district_det, moved, lat, lon)

# export
export(gis, here("outputs", "gis_ebola_cases_20141201.rds"))


String detection

The R package {stringr} is part of the {tidyverse} and contains many different functions to handle words and character "strings". If you ever want to split, unite, or search within a character value, you should look in the Characters and strings chapter of the Epi R Handbook.

For example the function str_detect() returns TRUE or FALSE depending on whether particular characters are found within a specified value. The true arguments are:

1) string = this is the column or text to search in
2) pattern = this the character pattern to search for

For example:

Be aware that this function is case sensitive by default. You can adjust this using the tip found here in the Epi R Handbook.

Now, starting with the clean data frame surv, create a new data frame that contains only the cases reported from district whose names contains "West". Then export this as "western_district_cases.csv" to the "outputs" folder.

quiz(caption = "Quiz - Western districts",
  question("How many cases were there in western districts?",
    allow_retry = TRUE,
    answer("170"),
    answer("211", correct = TRUE),
    answer("418")
  )
)

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

# create data frame with only cases from the "West" districts
western_district_cases <- surv %>% 
  filter(str_detect(string = district, pattern = "West"))

# export
export(western_district_cases, here("outputs", "western_district_cases.csv"))


Searching for multiple patterns

You can search for multiple patterns at the same time, for example: "Doctor" OR "Physician" OR "Surgeon" by using the "OR bar" in your pattern string. For example:

str_detect(string = district, pattern = "West|North|west|north")

Read more about this function, and how to make it not case-sensitive in this section of the Epi R Handbook.



appliedepi/introexercises documentation built on April 22, 2024, 1:01 a.m.