# load packages ---------------------------------------------------------------- library(introexercises) # get data for exercises library(learnr) # create lessons from rmd library(gradethis) # evaluate exercises library(dplyr) # wrangle data library(stringr) # strings library(flair) # highlight code library(ggplot2) # visualise data library(lubridate) # work with dates library(epikit) # for emojis library(skimr) # browse data library(fontawesome) # for emojis library(janitor) # clean data # library(RMariaDB) # connect to sql database ## set options for exercises and checking --------------------------------------- ## Define how exercises are evaluated gradethis::gradethis_setup( ## note: the below arguments are passed to learnr::tutorial_options ## set the maximum execution time limit in seconds exercise.timelimit = 60, ## set how exercises should be checked (defaults to NULL - individually defined) # exercise.checker = gradethis::grade_learnr ## set whether to pre-evaluate exercises (so users see answers) exercise.eval = FALSE ) # ## event recorder --------------------------------------------------------------- # ## see for details: # ## https://pkgs.rstudio.com/learnr/articles/publishing.html#events # ## https://github.com/dtkaplan/submitr/blob/master/R/make_a_recorder.R # # ## connect to your sql database # sqldtbase <- dbConnect(RMariaDB::MariaDB(), # user = Sys.getenv("userid"), # password = Sys.getenv("pwd"), # dbname = 'excersize_log', # host = "144.126.246.140") # # # ## define a function to collect data # ## note that tutorial_id is defined in YAML # ## you could set the tutorial_version too (by specifying version:) but use package version instead # recorder_function <- function(tutorial_id, tutorial_version, user_id, event, data) { # # ## define a sql query # ## first bracket defines variable names # ## values bracket defines what goes in each variable # event_log <- paste("INSERT INTO responses ( # tutorial_id, # tutorial_version, # date_time, # user_id, # event, # section, # label, # question, # answer, # code, # correct) # VALUES('", tutorial_id, "', # '", tutorial_version, "', # '", format(Sys.time(), "%Y-%M%-%D %H:%M:%S %Z"), "', # '", Sys.getenv("SHINYPROXY_PROXY_ID"), "', # '", event, "', # '", data$section, "', # '", data$label, "', # '", paste0('"', data$question, '"'), "', # '", paste0('"', data$answer, '"'), "', # '", paste0('"', data$code, '"'), "', # '", data$correct, "')", # sep = '') # # # Execute the query on the sqldtbase that we connected to above # rsInsert <- dbSendQuery(sqldtbase, event_log) # # } # # options(tutorial.event_recorder = recorder_function)
# hide non-exercise code chunks ------------------------------------------------ knitr::opts_chunk$set(echo = FALSE)
# data prep -------------------------------------------------------------------- surv_raw <- rio::import(system.file("dat/old_version/surveillance_linelist_20141201.csv", package = "introexercises")) # for quizes surv <- rio::import(system.file("dat/old_version/surveillance_linelist_clean_20141201.rds", package = "introexercises"))
Welcome to the live course "Introduction to R for applied epidemiology", offered by Applied Epi - a nonprofit organisation that offers open-source tools, training, and support to frontline public health practitioners.
knitr::include_graphics("images/logo.png", error = F)
This exercise focuses on logical recoding of values, filtering rows, creating age categories, and other more complex data cleaning tasks.
This exercise will guide you through a set of tasks.
You should perform these tasks in RStudio and on your local computer.
There are several ways to get help:
1) Look for the "helpers" (see below)
2) Ask your live course instructor/facilitator for help
3) Ask a colleague or other participant in the course for tips
4) Post a question in Applied Epi Community in the category for questions about Applied Epi Training
Here is what those "helpers" will look like:
r fontawesome::fa("lightbulb", fill = "gold")
Click to read a hint
Here you will see a helpful hint!
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
linelist %>% filter( age > 25, district == "Bolo" )
Here is more explanation about why the solution works.
Please complete the quiz questions that you encounter throughout the tutorial. Answering will help you to comprehend the material, and will also help us to improve the exercises for future students.
To practice, please answer the following questions:
quiz( question_radio("When should I view the red 'helper' code?", answer("After trying to write the code myself", correct = TRUE), answer("Before I try coding", correct = FALSE), correct = "Reviewing best-practice code after trying to write yourself can help you improve", incorrect = "Please attempt the exercise yourself, or use the hint, before viewing the answer." ) )
question_numeric( "How anxious are you about beginning this tutorial - on a scale from 1 (least anxious) to 10 (most anxious)?", answer(10, message = "Try not to worry, we will help you succeed!", correct = T), answer(9, message = "Try not to worry, we will help you succeed!", correct = T), answer(8, message = "Try not to worry, we will help you succeed!", correct = T), answer(7, message = "Try not to worry, we will help you succeed!", correct = T), answer(6, message = "Ok, we will get there together", correct = T), answer(5, message = "Ok, we will get there together", correct = T), answer(4, message = "I like your confidence!", correct = T), answer(3, message = "I like your confidence!", correct = T), answer(2, message = "I like your confidence!", correct = T), answer(1, message = "I like your confidence!", correct = T), allow_retry = TRUE, correct = "Thanks for sharing. ", min = 1, max = 10, step = 1 )
You will see these icons throughout the exercises:
Icon |Meaning
------|--------------------
r fontawesome::fa("eye", fill = "darkblue")
|Observe
r fontawesome::fa("exclamation", fill = "red")
|Alert!
r fontawesome::fa("pen", fill = "brown")
|An informative note
r fontawesome::fa("terminal", fill = "black")
|Time for you to code!
r fontawesome::fa("window-restore", fill = "darkgrey")
|Change to another window
r fontawesome::fa("bookmark", fill = "orange")
|Remember this for later
Applied Epi Incorporated, 2022
This work is licensed by Applied Epi Incorporated under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License.
Please email contact@appliedepi.org with questions about the use of these materials for academic courses and epidemiologist training programs.
In this exercise you will:
As before, open RStudio by double-clicking on the intro_course.Rproj file in the "intro_course" folder.
Re-open the "ebola_analysis.R" script that you began writing in the previous session. You will continue your work in this script, by expanding the cleaning R code.
Then do the following:
You should also have "Exploratory analysis" or "Testing" sections in your script. Take a moment to organize these in a coherent manner so they do not interrupt the flow of your script by causing errors. For purposes of reproducibility, you should be able to run your entire script with no errors. This will make sharing the script with colleagues much easier.
Now we will continue by adding further commands to your cleaning pipe chain...
Sometimes, the recoding we want is not as simple as replacing one value for another. Instead, we use logical criteria and logical operators, and often reference values in several columns.
In the weight column wt_kg
, there are some negative values. Open the data frame in the viewer (e.g. by clicking the data frame name in the Environment pane) and sort the dataset by clicking on the wt_kg
column. Note the negative values.
Alternatively, in your Testing section, run the {base} command summary()
on the column surv$wt_kg
to return key summary statistics on this numeric column:
summary(surv$wt_kg)
This must be some data entry error. To avoid errors in any future calculations or plots, convert these values to NA
(missing). Remember, you should make edits like this here in the script. This is better than editing in CSV file, because here there is a record of changes performed.
The logic we want to apply is simple: "If wt_kg
negative, convert it to NA
, otherwise remain the same." Therefore, we can use the ifelse()
function within mutate()
.
The syntax is: ifelse(CONDITION APPLIED TO ROW, OUTCOME IF TRUE, OUTCOME IF FALSE)
mutate(wt_kg = ifelse(wt_kg < 0, NA, wt_kg))
Add this to your pipe chain and verify that it worked correctly.
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
surv <- surv_raw %>% # automatically clean column names clean_names() %>% # manually clean column names rename( date_onset = onset_date, date_report = date_of_report, district_res = adm3_name_res, district_det = adm3_name_det) %>% # remove unnecessary column select(-row_num) %>% # de-duplicate rows distinct() %>% # convert date_onset to date class mutate(date_onset = mdy(date_onset)) %>% mutate(date_report = mdy(date_report)) %>% # convert age to numeric class mutate(age = as.numeric(age)) %>% # convert "Unknown" gender to NA mutate(gender = na_if(gender, "Unknown")) %>% # properly record missing values in many character columns mutate(across(.cols = where(is.character), .fns = na_if, "")) %>% # re-code hospital column mutate(hospital = recode(hospital, # for reference: OLD = NEW "Mitilary Hospital" = "Military Hospital", "Port" = "Port Hospital", "Port Hopital" = "Port Hospital", "St. Mark's Maternity Hospital (SMMH)" = "SMMH")) %>% # recode gender mutate(gender = recode(gender, "m" = "male", "f" = "female")) %>% # convert negative weight values to NA mutate(wt_kg = ifelse(wt_kg < 0, NA, wt_kg))
Now re-run your summary()
command to check the new range of the values:
summary(surv$wt_kg)
As you clean, it is important to know these logical and relational operators. There is no need to memorize them (see the Epi R Handbook basics chapter), but they are here for you review quickly.
Relational operators compare values and are often used when defining new variables and subsets of datasets. Here are the common relational operators in R:
+--------------------------+------------+--------------+--------------------------------------------------------------------------------------------------------------------------------------------------------+
| Meaning | Operator | Example | Example Result |
+==========================+============+==============+========================================================================================================================================================+
| Equal to | ==
| "A" == "a"
| FALSE
(because R is case sensitive) Note that == (double equals) is different from = (single equals), see note below |
+--------------------------+------------+--------------+--------------------------------------------------------------------------------------------------------------------------------------------------------+
| Not equal to | !=
| 2 != 0
| TRUE
|
+--------------------------+------------+--------------+--------------------------------------------------------------------------------------------------------------------------------------------------------+
| Greater than | >
| 4 > 2
| TRUE
|
+--------------------------+------------+--------------+--------------------------------------------------------------------------------------------------------------------------------------------------------+
| Less than | <
| 4 < 2
| FALSE
|
+--------------------------+------------+--------------+--------------------------------------------------------------------------------------------------------------------------------------------------------+
| Greater than or equal to | >=
| 6 >= 4
| TRUE
|
+--------------------------+------------+--------------+--------------------------------------------------------------------------------------------------------------------------------------------------------+
| Less than or equal to | <=
| 6 <= 4
| FALSE
|
+--------------------------+------------+--------------+--------------------------------------------------------------------------------------------------------------------------------------------------------+
| Value is missing | is.na()
| is.na(7)
| FALSE
|
+--------------------------+------------+--------------+--------------------------------------------------------------------------------------------------------------------------------------------------------+
| Value is not missing | !is.na()
| !is.na(7)
| TRUE
|
+--------------------------+------------+--------------+--------------------------------------------------------------------------------------------------------------------------------------------------------+
Note that the ==
(double equals) asks the question to R: "IS the value on the right equal to the value on the left?. The result is either TRUE
or FALSE
.
age == 12
Asks is the value in column age
for a given row equal to 12? In contrast, the =
(single equals) is used to assign values, such as for an argument within a function:
max(age, na.rm = TRUE)
This sets the na.rm=
argument to TRUE
paste(jurisdictions, collapse = ",")
This sets the collapse argument to a commaLogical operators, such as AND and OR, are often used to connect relational operators and create more complicated criteria. Complex statements might require parentheses ( ) for grouping and order of application.
+---------------------+-----------------------------------------------------------------------+
| Meaning | Operator |
+=====================+=======================================================================+
| AND | &
|
+---------------------+-----------------------------------------------------------------------+
| OR | |
(vertical bar) |
+---------------------+-----------------------------------------------------------------------+
| Parentheses | ( )
Used to group criteria together and clarify order of operations |
+---------------------+-----------------------------------------------------------------------+
Now that you have learned simple logical re-coding, you can use ifelse()
for many situations. Here are two other simple logical recoding functions that you should bookmark to read about later:
if_else()
is particularly useful if re-coding dates replace()
can be a shortcut for ifelse()
logical recoding case_when()
complex recodingFor more complex re-coding, you will likely switch to using case_when()
from {dplyr}.
This function allows you to run the equivalent of multiple ifelse()
statements in a series - it is R's equivalent to the SQL CASE WHEN
statement.
It is just a coincidence that the case_when()
function has a name that sounds like it was made for building case definitions! It is a general function - it is not specific to public health. Still, here is a demonstration of a case_when()
command, used within mutate()
, to create a case definition column:
mutate(case_def = case_when( # create new column case_def, using the function case_when() lab_confirmed == TRUE ~ "Confirmed", # if lab confirmed, assign as Confirmed epilink == "yes" & fever == "yes" ~ "Suspect", # else, if epilink and fever, assign as Suspect TRUE ~ "To investigate")) # else, assign as "To investigate"
Let us examine what is happening in the above command:
Setup
case_when()
is nested within a mutate()
- just like if_else()
and na_if()
case_def
case_when()
function, used to assing the values for this new column Structure within the case_when()
:
case_when()
commands consist of formulae that have a Right-Hand Side (RHS) and a Left-Hand Side (LHS), separated by a “tilde” ~ TRUE
or FALSE
for a given row in the data TRUE
for a given row How the new values are assigned:
Every row in the data frame is evaluated for the top logical statement (lab_confirmed == TRUE
). If this statement is TRUE for that data frame row, its value in column case_def
is set as "Confirmed". However if lab_confirmed
is not TRUE
, then the case_when()
function advances to check if the second logical statement is valid for that row, and the process repeats. Thus, for each row in the data frame, the first logical statement that evaluates to TRUE
is used.
Therefore, it is very important to write the formulae in an intentional order - typically from very specific (at the top) to very broad (at the bottom).
If none of the logical statements apply to a row in the data, that row's value is assigned NA
by default, unless a TRUE
statement is written as the last statement, as shown above. When the bottom logical statement is simply written as TRUE
, this applies for any other situation that has not triggered one of the logical statements above.
In the case definition example above, if a patient is lab confirmed they are immediately categorized as a "Confirmed". If lab confirmation is FALSE
or missing, then the second formula is checked: if they have an epidemiological link and a fever the patient is categorized as a "Suspect". Finally, any row that has not met either of those criteria is marked as "To investigate".
Add the above mutate()
command to the bottom of your pipe chain.
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
surv <- surv_raw %>% # automatically clean column names clean_names() %>% # manually clean column names rename( date_onset = onset_date, date_report = date_of_report, district_res = adm3_name_res, district_det = adm3_name_det) %>% # remove unnecessary column select(-row_num) %>% # de-duplicate rows distinct() %>% # convert date_onset to date class mutate(date_onset = mdy(date_onset)) %>% mutate(date_report = mdy(date_report)) %>% # convert age to numeric class mutate(age = as.numeric(age)) %>% # convert "Unknown" gender to NA mutate(gender = na_if(gender, "Unknown")) %>% # properly record missing values in many character columns mutate(across(.cols = where(is.character), .fns = na_if, "")) %>% # re-code hospital column mutate(hospital = recode(hospital, # for reference: OLD = NEW "Mitilary Hospital" = "Military Hospital", "Port" = "Port Hospital", "Port Hopital" = "Port Hospital", "St. Mark's Maternity Hospital (SMMH)" = "SMMH")) %>% # recode gender mutate(gender = recode(gender, "m" = "male", "f" = "female")) %>% # convert negative weight values to NA mutate(wt_kg = ifelse(wt_kg < 0, NA, wt_kg)) %>% # create case definition mutate(case_def = case_when( lab_confirmed == TRUE ~ "Confirmed", epilink == "yes" & fever == "yes" ~ "Suspect", TRUE ~ "To investigate"))
Now we can tabulate the new column case_def
and see the results (in the Testing section):
surv %>% tabyl(case_def)
It is also wise to go look at the dataset to confirm that the logic was applied the way that you intended!
r fontawesome::fa("exclamation", fill = "red")
DANGER: All outcome values on the RIGHT-side must all be the same class - either numeric, integer, character, date, logical, etc. Sometimes, this requires assigning variations of NA
such as NA_character_
, NA_real_
(for numeric), and as.Date(NA)
. Read more in the dates chapter of the Epi R Handbook.
age_years
Now apply your case_when()
knowledge to this next scenario.
Create a new column age_years
that converts the value in column age
correctly, depending on whether the value in age_unit
is "years" or "months".
Below we give you the outline of the command - it is your responsibility to fill in the logic statements on the right, and the calculations on the right.
# create age-in-years mutate(age_years = case_when( # logic for age_unit # adjustment to make age_years [if months] ~ [calculation], # return: age / 12 [if years] ~ [calculation], # return: age [if missing] ~ [calculation])) # if unit is missing, assume years
r fontawesome::fa("lightbulb", fill = "gold")
Click to read a hint
Within the parentheses of case_when()
, on the left side of each ~ you are writing logical criteria that will be assessed for each row in the dataset, and on the right side of the ~ you write the resulting value to be returned.
The first logical criteria should be: "is age_unit
equal to "months"? In code, this is written age_unit == "months"
(note the double equals, which asks the question of equivalence). One the right side of the ~, you would write an equation that would produce the correct outcome in the new column age_years
. In this case, it would be the value in age
divided by 12. In code this would be written as: age_unit == "months" ~ age / 12
. Don't forget a comma at the end, and to continue on the next line with the next logical criteria.
If you happen to encounter this error: 'names' attribute [1] must be the same length as the vector [0]
, then confirm that you converted the age
column to class numeric earlier in the pipe chain. This error is because by dividing by 12, we are introducing decimals, which is not acceptable if the column is still "integer" class.
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
surv <- surv_raw %>% # automatically clean column names clean_names() %>% # manually clean column names rename( date_onset = onset_date, date_report = date_of_report, district_res = adm3_name_res, district_det = adm3_name_det) %>% # remove unnecessary column select(-row_num) %>% # de-duplicate rows distinct() %>% # convert date_onset to date class mutate(date_onset = mdy(date_onset)) %>% mutate(date_report = mdy(date_report)) %>% # convert age to numeric class mutate(age = as.numeric(age)) %>% # convert "Unknown" gender to NA mutate(gender = na_if(gender, "Unknown")) %>% # properly record missing values in many character columns mutate(across(.cols = where(is.character), .fns = na_if, "")) %>% # re-code hospital column mutate(hospital = recode(hospital, # for reference: OLD = NEW "Mitilary Hospital" = "Military Hospital", "Port" = "Port Hospital", "Port Hopital" = "Port Hospital", "St. Mark's Maternity Hospital (SMMH)" = "SMMH")) %>% # recode gender mutate(gender = recode(gender, "m" = "male", "f" = "female")) %>% # convert negative weight values to NA mutate(wt_kg = ifelse(wt_kg < 0, NA, wt_kg)) %>% # create case definition mutate(case_def = case_when( lab_confirmed == TRUE ~ "Confirmed", epilink == "yes" & fever == "yes" ~ "Suspect", TRUE ~ "To investigate")) %>% # create age-in-years mutate(age_years = case_when( age_unit == "months" ~ age/12, # if age is given in months age_unit == "years" ~ age, # if age is given in years is.na(age_unit) ~ age)) # if age_unit missing assume years, else NA
That was difficult! If the above exercise with case_when()
was very challenging for you, do not despair! case_when()
can be difficult, but will come with practice. Talk with your facilitator about the errors or difficulties you encountered. It will come!
It is important to remember that in case_when()
, all of the right-side values must be of the same class (numeric, integer, character, date, logical, etc). Sometimes, this requires assigning variations of NA
such as NA_character_
, NA_real_
(for numeric), and as.Date(NA)
. Read more in the dates chapter of the Epi R Handbook. In this scenario, we want any circumstance not caught by a logic statement to return NA
, so we can leave it.
Now that we have a valid age_years
column, let's create a categorical column for "age category".
There are several ways to do this (see this chapter of the Epi R Handbook for details), but here we just apply the most simple method.
The function age_categories()
from {epikit} can be used within mutate()
to define a new column age_cat
with 10-year age categories.
The arguments for age_categories
are:
1) The numeric column to divide into categories (age_years
)
2) lower =
A number that is the lower numeric boundary (0)
3) upper =
A number that is the upper numeric boundary for categories (e.g. set this to 70, so that all ages above 70 are put in the category "70+")
4) by =
A number for the width of the categories (10)
Try adding this command to the pipe chain If you get stuck, see the solution code hidden below.
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
surv <- surv_raw %>% # automatically clean column names clean_names() %>% # manually clean column names rename( date_onset = onset_date, date_report = date_of_report, district_res = adm3_name_res, district_det = adm3_name_det) %>% # remove unnecessary column select(-row_num) %>% # de-duplicate rows distinct() %>% # convert date_onset to date class mutate(date_onset = mdy(date_onset)) %>% mutate(date_report = mdy(date_report)) %>% # convert age to numeric class mutate(age = as.numeric(age)) %>% # convert "Unknown" gender to NA mutate(gender = na_if(gender, "Unknown")) %>% # properly record missing values in many character columns mutate(across(.cols = where(is.character), .fns = na_if, "")) %>% # re-code hospital column mutate(hospital = recode(hospital, # for reference: OLD = NEW "Mitilary Hospital" = "Military Hospital", "Port" = "Port Hospital", "Port Hopital" = "Port Hospital", "St. Mark's Maternity Hospital (SMMH)" = "SMMH")) %>% # recode gender mutate(gender = recode(gender, "m" = "male", "f" = "female")) %>% # convert negative weight values to NA mutate(wt_kg = ifelse(wt_kg < 0, NA, wt_kg)) %>% # create case definition mutate(case_def = case_when( lab_confirmed == TRUE ~ "Confirmed", epilink == "yes" & fever == "yes" ~ "Suspect", TRUE ~ "To investigate")) %>% # create age-in-years mutate(age_years = case_when( age_unit == "months" ~ age/12, # if age is given in months age_unit == "years" ~ age, # if age is given in years is.na(age_unit) ~ age)) %>% # if unit missing assume years, else NA # create age category column mutate(age_cat = age_categories( # create new column age_years, # numeric column to make groups from lower = 0, upper = 70, by = 10))
You can also define your age categories other ways, for example, by providing a vector of age breaks such as c(0, 5, 10, 15, 20, 30, 40, 50)
. Ask your instructor if you want to know how to do this. Or read about in the function's documentation by entering ?age_categories
in the R console, or by searching for age_categories
in the Help pane of RStudio (lower-right).
r fontawesome::fa("exclamation", fill = "red")
Have you saved your script lately?
Now that both of the date columns are correctly classified as "date", we can do math on them with simple + and - operators.
Add a mutate()
command to the pipe chain, to create a new column named diff
. Define this new column to equal the number of days between date_onset
and date_report
for each case.
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
surv <- surv_raw %>% # automatically clean column names clean_names() %>% # manually clean column names rename( date_onset = onset_date, date_report = date_of_report, district_res = adm3_name_res, district_det = adm3_name_det) %>% # remove unnecessary column select(-row_num) %>% # de-duplicate rows distinct() %>% # convert date_onset to date class mutate(date_onset = mdy(date_onset)) %>% mutate(date_report = mdy(date_report)) %>% # convert age to numeric class mutate(age = as.numeric(age)) %>% # convert "Unknown" gender to NA mutate(gender = na_if(gender, "Unknown")) %>% # properly record missing values in many character columns mutate(across(.cols = where(is.character), .fns = na_if, "")) %>% # re-code hospital column mutate(hospital = recode(hospital, # for reference: OLD = NEW "Mitilary Hospital" = "Military Hospital", "Port" = "Port Hospital", "Port Hopital" = "Port Hospital", "St. Mark's Maternity Hospital (SMMH)" = "SMMH")) %>% # recode gender mutate(gender = recode(gender, "m" = "male", "f" = "female")) %>% # convert negative weight values to NA mutate(wt_kg = ifelse(wt_kg < 0, NA, wt_kg)) %>% # create case definition mutate(case_def = case_when( lab_confirmed == TRUE ~ "Confirmed", epilink == "yes" & fever == "yes" ~ "Suspect", TRUE ~ "To investigate")) %>% # create age-in-years mutate(age_years = case_when( age_unit == "months" ~ age/12, # if age is given in months age_unit == "years" ~ age, # if age is given in years is.na(age_unit) ~ age)) %>% # if unit missing assume years, else NA # create age category column mutate(age_cat = age_categories( # create new column age_years, # numeric column to make groups from lower = 0, upper = 70, by = 10)) %>% # Make date-difference column mutate(diff = date_report - date_onset)
Check the calculation for a few rows, manually. Is it how you expect?
Note that this column's values appear like "4 days" - this is a special class called "difftime". If you want the result to be purely numeric, you have to wrap the result of the calculation in the {base} function as.numeric()
. That would look like this:
mutate(diff = as.numeric(date_report - date_onset))
Note that currently, a patient with date_onset
and date_report
on 9 November 2014 would be recorded with diff
of 0 days. In your work context, you may want to add a +1
after the end of the as.numeric()
command, if you prefer same day admission to be recorded as 1 and not 0.
We have two columns that express districts (locations) - of the cases's residence, and where they were detected as a case.
How similar are these two columns?
One way to assess how similar the columns are, is to create a new column that indicates whether the two column's values in that row are different. Then, you can tabulate this new column.
The syntax is actually quite simple, using mutate()
and the !=
operator (which means NOT EQUAL TO). Add this to your pipe chain.
# New column will be TRUE if the two values are different mutate(moved = district_res != district_det)
Now try to assess the differences manually in RStudio. Open the surv
dataset in the viewer (by clicking on it in the Environment pane) and click the filter button on the upper-left to limit the view to rows in which the new column moved
is TRUE. Was the logic for the new column applied correctly? Are TRUE and FALSE appearing where they should be?
quiz(caption = "Quiz - Moving cases", question_numeric( "How many cases have different districts of residence and of detection?", answer(surv_raw %>% distinct() %>% mutate(across(.cols = c(adm3_name_res, adm3_name_det), .fns = na_if, "")) %>% mutate(moved = adm3_name_res != adm3_name_det) %>% filter(moved) %>% nrow(), correct = T), allow_retry = TRUE, correct = "Correct, nice work.", min = 1, max = 700, step = 1 ) )
You can use R code to find the answer as well, by using the tabyl()
function on the new column:
surv %>% tabyl(moved)
Now that you know there is some discrepancy, which of these values should be prioritized? District of residence, or district of detection?
This is a question that each outbreak epidemiology team must answer given their knowledge of the data collection process, and of the outbreak itself. For this exercise, we will prioritize location of detection, because it might better reflect the nearby populations who are at higher risk for infection (which can be best understood by good case investigation).
Create a new column in the data simply named district
. Create it using the coalesce()
function, which will take its first value if it is available, else it will take its second value. This function allows us to "fill-in" missing values by using the first non-missing value (given a preference order), or to prioritize one value over another if they differ.
# Add this command to the pipechain mutate(district = coalesce(district_det, district_res))
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
surv <- surv_raw %>% # automatically clean column names clean_names() %>% # manually clean column names rename( date_onset = onset_date, date_report = date_of_report, district_res = adm3_name_res, district_det = adm3_name_det) %>% # remove unnecessary column select(-row_num) %>% # de-duplicate rows distinct() %>% # convert date_onset to date class mutate(date_onset = mdy(date_onset)) %>% mutate(date_report = mdy(date_report)) %>% # convert age to numeric class mutate(age = as.numeric(age)) %>% # convert "Unknown" gender to NA mutate(gender = na_if(gender, "Unknown")) %>% # properly record missing values in many character columns mutate(across(.cols = where(is.character), .fns = na_if, "")) %>% # re-code hospital column mutate(hospital = recode(hospital, # for reference: OLD = NEW "Mitilary Hospital" = "Military Hospital", "Port" = "Port Hospital", "Port Hopital" = "Port Hospital", "St. Mark's Maternity Hospital (SMMH)" = "SMMH")) %>% # recode gender mutate(gender = recode(gender, "m" = "male", "f" = "female")) %>% # convert negative weight values to NA mutate(wt_kg = ifelse(wt_kg < 0, NA, wt_kg)) %>% # create case definition mutate(case_def = case_when( lab_confirmed == TRUE ~ "Confirmed", epilink == "yes" & fever == "yes" ~ "Suspect", TRUE ~ "To investigate")) %>% # create age-in-years mutate(age_years = case_when( age_unit == "months" ~ age/12, # if age is given in months age_unit == "years" ~ age, # if age is given in years is.na(age_unit) ~ age)) %>% # if unit missing assume years, else NA # create age category column mutate(age_cat = age_categories( # create new column age_years, # numeric column to make groups from lower = 0, upper = 70, by = 10)) %>% # Make date-difference column mutate(diff = date_report - date_onset) %>% # create column marking TRUE if district of residence and detection differ mutate(moved = district_res != district_det) %>% # create new column that prioritizes district of detection mutate(district = coalesce(district_det, district_res))
View the dataset in the RStudio viewer again, and verify with your eye that this code worked as expected
Use the following command to answer the question below:
# cross tabulate "moved" column with "district_res" column surv %>% tabyl(moved, district_res)
quiz(caption = "Quiz - Moving cases origin", question_numeric( "Of cases with different residence and detection locations, how many had residence in 'Mountain Rural' district?", answer(surv_raw %>% distinct() %>% mutate(across( .cols = c(adm3_name_res, adm3_name_det), .fns = na_if, "")) %>% mutate(moved = adm3_name_res != adm3_name_det) %>% filter(moved & adm3_name_res == "Mountain Rural") %>% nrow(), correct = T), allow_retry = TRUE, correct = "Correct, nice work.", min = 1, max = 700, step = 1 ) )
r fontawesome::fa("exclamation", fill = "red")
Have you saved your script lately?
Sub-setting rows is done with the function filter()
. Inside the parentheses you write logical criteria that are applied to each row. Rows that meet the criteria are kept.
For the purposes of this exercise, we will remove any "suspect" cases from the dataset.
The simplest way of filtering, is to use a logical condition ("==", "!=", ">", "<", etc), so that only rows where the logical statement is TRUE
are kept. For example here, we are only keeping rows where the gender is equal to "female".
filter(gender == "female")
Multiple logical criteria can be combined using & (and), | (or), and parentheses.
Add a filter()
command that restricts the dataset to only cases where case_def
is "Confirmed".
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
surv <- surv_raw %>% # automatically clean column names clean_names() %>% # manually clean column names rename( date_onset = onset_date, date_report = date_of_report, district_res = adm3_name_res, district_det = adm3_name_det) %>% # remove unnecessary column select(-row_num) %>% # de-duplicate rows distinct() %>% # convert date_onset to date class mutate(date_onset = mdy(date_onset)) %>% mutate(date_report = mdy(date_report)) %>% # convert age to numeric class mutate(age = as.numeric(age)) %>% # convert "Unknown" gender to NA mutate(gender = na_if(gender, "Unknown")) %>% # properly record missing values in many character columns mutate(across(.cols = where(is.character), .fns = na_if, "")) %>% # re-code hospital column mutate(hospital = recode(hospital, # for reference: OLD = NEW "Mitilary Hospital" = "Military Hospital", "Port" = "Port Hospital", "Port Hopital" = "Port Hospital", "St. Mark's Maternity Hospital (SMMH)" = "SMMH")) %>% # recode gender mutate(gender = recode(gender, "m" = "male", "f" = "female")) %>% # convert negative weight values to NA mutate(wt_kg = ifelse(wt_kg < 0, NA, wt_kg)) %>% # create case definition mutate(case_def = case_when( lab_confirmed == TRUE ~ "Confirmed", epilink == "yes" & fever == "yes" ~ "Suspect", TRUE ~ "To investigate")) %>% # create age-in-years mutate(age_years = case_when( age_unit == "months" ~ age/12, # if age is given in months age_unit == "years" ~ age, # if age is given in years is.na(age_unit) ~ age)) %>% # if unit missing assume years, else NA # create age category column mutate(age_cat = age_categories( # create new column age_years, # numeric column to make groups from lower = 0, upper = 70, by = 10)) %>% # Make date-difference column mutate(diff = date_report - date_onset) %>% # create column marking TRUE if district of residence and detection differ mutate(moved = district_res != district_det) %>% # create new column that prioritizes district of detection mutate(district = coalesce(district_det, district_res)) %>% # remove suspect cases filter(case_def == "Confirmed")
r fontawesome::fa("eye", fill = "darkblue")
Now review your cleaning pipe chain, and the one in the solution above. Ensure that all the cleaning steps in the solution are also present in your R script "ebola_analysis.R".
Click on your data frame surv
and review the order of the column.
quiz(caption = "Quiz - Re-arrange columns", question("Where did the new columns appear, that you have created during this exercise?", allow_retry = TRUE, answer("On the left-most side of the data frame"), answer("On the right-most side of the data frame", correct = TRUE), answer("In the middle of the data frame") ) )
This location has no effect on being able to reference columns within a pipe chain, nor referencing columns with a $
. However, if we want to export this clean data frame and send to it a colleague, we may prefer that some columns are naturally placed next to each other - for example age
and age_years
.
If you remember from the data cleaning section on select()
, you can use the select()
command to re-arrange columns as well as to select certain columns.
You can also list some columns, to move them to the beginning, and finish the select()
statement by using the "helper" everything()
. This will include all the other columns.
Note the empty parentheses at the end of everything()
). There are other such "tidyselect" helpers available to you such as contains()
and starts_with()
, which you can write within the select()
command like contains("date_")
. Read more about these helpers in this Epi R Handbook chapter.
Can you add a select()
command to your pipe chain that does re-arranges the columns into the following order:
case_id
, any column that starts with "date_", diff
, gender
, age
, age_unit
, age_years
, age_cat5
, hospital
, district
, district_res
, district_det
, moved
, all other columns r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
surv <- surv_raw %>% # automatically clean column names clean_names() %>% # manually clean column names rename( date_onset = onset_date, date_report = date_of_report, district_res = adm3_name_res, district_det = adm3_name_det) %>% # remove unnecessary column select(-row_num) %>% # de-duplicate rows distinct() %>% # convert date_onset to date class mutate(date_onset = mdy(date_onset)) %>% mutate(date_report = mdy(date_report)) %>% # convert age to numeric class mutate(age = as.numeric(age)) %>% # convert "Unknown" gender to NA mutate(gender = na_if(gender, "Unknown")) %>% # properly record missing values in many character columns mutate(across(.cols = where(is.character), .fns = na_if, "")) %>% # re-code hospital column mutate(hospital = recode(hospital, # for reference: OLD = NEW "Mitilary Hospital" = "Military Hospital", "Port" = "Port Hospital", "Port Hopital" = "Port Hospital", "St. Mark's Maternity Hospital (SMMH)" = "SMMH")) %>% # recode gender mutate(gender = recode(gender, "m" = "male", "f" = "female")) %>% # convert negative weight values to NA mutate(wt_kg = ifelse(wt_kg < 0, NA, wt_kg)) %>% # create case definition mutate(case_def = case_when( lab_confirmed == TRUE ~ "Confirmed", epilink == "yes" & fever == "yes" ~ "Suspect", TRUE ~ "To investigate")) %>% # create age-in-years mutate(age_years = case_when( age_unit == "months" ~ age/12, # if age is given in months age_unit == "years" ~ age, # if age is given in years is.na(age_unit) ~ age)) %>% # if unit missing assume years, else NA # create age category column mutate(age_cat = age_categories( # create new column age_years, # numeric column to make groups from lower = 0, upper = 70, by = 10)) %>% # Make date-difference column mutate(diff = date_report - date_onset) %>% # create column marking TRUE if district of residence and detection differ mutate(moved = district_res != district_det) %>% # create new column that prioritizes district of detection mutate(district = coalesce(district_det, district_res)) %>% # remove suspect cases filter(case_def == "Confirmed") %>% # re-arrange columns select(case_id, starts_with("date"), diff, gender, age, age_unit, age_years, age_cat, hospital, district, district_res, district_det, moved, everything())
r fontawesome::fa("exclamation", fill = "red")
It is important to note that filtering will by default remove missing values. E.g. filter(case_def == "Confirmed")
will remove rows in which case_def
is missing (NA
).
Likewise, filtering based on a numeric or date criteria will also remove missing values. For example:
filter(age > 12)
will keep only rows in which age
is greater than 12. Rows with missing age will be removed. filter(date_onset > ymd("2014-11-25")
will keep only rows in which date_onset
is after 25 November 2014. Rows with missing date of onset will be removed. To keep rows with missing values, add an "OR" bar and then the is.na()
function to the filter:
filter(age > 12 | is.na(age))
filter(date_onset > ymd("2014-11-25") | is.na(date_onset))
The statements above allow rows to remain that meet the first criteria, or that have missing values.
Now that we have cleaned this dataset, we should save it in our file directory for later use, and perhaps to share with other people.
We can use the export()
function from the {rio} package - this is the same package that we used to import()
the raw data. This is a standalone command - not part of your pipe chain. The arguments that it expects are:
1) The R object (data frame) that should be exported
2) The desired name for the file, in quotation marks, with extension (e.g. "surveillance_linelist_clean_20141201.csv")
The standalone command below (not part of your pipe chain) will save surv
as a CSV file to the root folder of your R project. Run the command and go look for it with your file explorer.
# Export cleaned file to the root folder of the R project export(surv, "surveillance_linelist_clean_20141201.csv")
If you save the file as .xlsx or .csv, any person using R to further analyze the data will have to re-clean it by again specifying the column classes. To avoid this, save the clean dataset as an ".rds" file. RDS is a file format specific to R, and is very useful if you know you will work with the exported data again in R.
For example, if you work in an epidemiology team and need to send files to a GIS team for mapping, and they use R as well, just send them the .rds file! Then all the column classes are retained and they have less cleaning work to do.
In the example below, we save surv
as a .rds file. We also specify that we want it to be saved in a subfolder ("data/clean") by writing a file path using here()
in the same way that we did to import the data. Try this one too!
# Export cleaned file export(surv, here("data", "clean", "surveillance_linelist_clean_20141201.rds"))
Above, we write the name of the R object (surv
), and then provide the file path and file namethat we want to save it to (using the same here()
syntax that we used to import data). Note the file extension ".rds".
If you wanted to export the data as CSV or Excel, you would simply change the file extension in the command to: .csv or .xlsx
Congratulations! You have completed the cleaning exercise! This is a huge step in your introduction to R. You have now been introduced to most of the important data management functions in R.
Be sure to save your R script, and check-in with your facilitator.
In case you did not finish, do not worry. In the case that you can not finish, look in the "scripts/backup" folder. You will find R scripts for the Ebola analysis at every stage of this course. So you can always us these as a "backup" for later modules.
Here are some exercises to practice what you have learned. Can you create these data frames, starting from the clean surv
data frame?
Create a separate data frame for children and save it to the "outputs" folder as "children_spotlight.csv"
surv
and assign this new data frame with the name children
infant
that says "Infant" if the case is less than 1 year old, and says "Non-infant" otherwise case_id
, age_years
, infant
, gender
, district
, hospital
, and date_report
children
data frame to the root folder of your RStudio project as "children_spotlight.csv" r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
# create data frame with only child cases children <- surv %>% filter(age_years < 18) %>% mutate(infant = ifelse(age_years < 1, "Infant", "Non-infant")) %>% select(case_id, age_years, infant, gender, district, hospital, date_report) # export export(children, "children_spotlight.csv")
Your GIS team needs a data frame to put into their routine maps for this outbreak. They use R, so they expect a .rds file containin the case_id
, any columns containing information on location or whether the patient moved significantly from their residence before being detected.
They have access to your RStudio project and expect the file to be saved into the "outputs" sub-folder with the current date of the data at the end of the file name.
Can you make this file for them?
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
# create data frame with only gis information gis <- surv %>% select(case_id, district, district_res, district_det, moved, lat, lon) # export export(gis, here("outputs", "gis_ebola_cases_20141201.rds"))
The R package {stringr} is part of the {tidyverse} and contains many different functions to handle words and character "strings". If you ever want to split, unite, or search within a character value, you should look in the Characters and strings chapter of the Epi R Handbook.
For example the function str_detect()
returns TRUE
or FALSE
depending on whether particular characters are found within a specified value. The true arguments are:
1) string =
this is the column or text to search in
2) pattern =
this the character pattern to search for
For example:
str_detect(string = name, pattern = "Isha")
will return TRUE
if the characters "Isha" are found in a particular row of the colunm name
str_detect(string = occupation, pattern = "nurse")
will return TRUE
if the characters "nurse" are found in a particular row of the colunm occupation
Be aware that this function is case sensitive by default. You can adjust this using the tip found here in the Epi R Handbook.
Now, starting with the clean data frame surv
, create a new data frame that contains only the cases reported from district whose names contains "West". Then export this as "western_district_cases.csv" to the "outputs" folder.
quiz(caption = "Quiz - Western districts", question("How many cases were there in western districts?", allow_retry = TRUE, answer("170"), answer("211", correct = TRUE), answer("418") ) )
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
# create data frame with only cases from the "West" districts western_district_cases <- surv %>% filter(str_detect(string = district, pattern = "West")) # export export(western_district_cases, here("outputs", "western_district_cases.csv"))
You can search for multiple patterns at the same time, for example: "Doctor" OR "Physician" OR "Surgeon" by using the "OR bar" in your pattern string. For example:
str_detect(string = district, pattern = "West|North|west|north")
Read more about this function, and how to make it not case-sensitive in this section of the Epi R Handbook.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.