In appliedepi/introexercises:

# load packages ----------------------------------------------------------------
library(introexercises)
library(learnr)
library(gradethis)
library(dplyr)
library(flair)
library(ggplot2)
library(lubridate)
library(fontawesome)
library(janitor)
library(kableExtra)
# library(RMariaDB)        # connect to sql database

## set options for exercises and checking ---------------------------------------

## Define how exercises are evaluated 
gradethis::gradethis_setup(
  ## note: the below arguments are passed to learnr::tutorial_options
  ## set the maximum execution time limit in seconds
  exercise.timelimit = 60, 
  ## set how exercises should be checked (defaults to NULL - individually defined)
  # exercise.checker = gradethis::grade_learnr
  ## set whether to pre-evaluate exercises (so users see answers)
  exercise.eval = FALSE 
)

# ## event recorder ---------------------------------------------------------------
# ## see for details: 
# ## https://pkgs.rstudio.com/learnr/articles/publishing.html#events
# ## https://github.com/dtkaplan/submitr/blob/master/R/make_a_recorder.R
# 
# ## connect to your sql database
# sqldtbase <- dbConnect(RMariaDB::MariaDB(),
#                        user     = Sys.getenv("userid"),
#                        password = Sys.getenv("pwd"),
#                        dbname   = 'excersize_log',
#                        host     = "144.126.246.140")
# 
# 
# ## define a function to collect data 
# ## note that tutorial_id is defined in YAML
#     ## you could set the tutorial_version too (by specifying version:) but use package version instead 
# recorder_function <- function(tutorial_id, tutorial_version, user_id, event, data) {
#     
#   ## define a sql query 
#   ## first bracket defines variable names
#   ## values bracket defines what goes in each variable
#   event_log <- paste("INSERT INTO responses (
#                        tutorial_id, 
#                        tutorial_version, 
#                        date_time, 
#                        user_id, 
#                        event, 
#                        section,
#                        label, 
#                        question, 
#                        answer, 
#                        code, 
#                        correct)
#                        VALUES('", tutorial_id,  "', 
#                        '", tutorial_version, "', 
#                        '", format(Sys.time(), "%Y-%M%-%D %H:%M:%S %Z"), "',
#                        '", Sys.getenv("SHINYPROXY_PROXY_ID"), "',
#                        '", event, "',
#                        '", data$section, "',
#                        '", data$label,  "',
#                        '", paste0('"', data$question, '"'),  "',
#                        '", paste0('"', data$answer,   '"'),  "',
#                        '", paste0('"', data$code,     '"'),  "',
#                        '", data$correct, "')",
#                        sep = '')
# 
#     # Execute the query on the sqldtbase that we connected to above
#     rsInsert <- dbSendQuery(sqldtbase, event_log)
#   
# }
# 
# options(tutorial.event_recorder = recorder_function)

# hide non-exercise code chunks ------------------------------------------------
knitr::opts_chunk$set(echo = FALSE)


# Data prep --------------------------------------------------------------------
# Import
surv <- rio::import(system.file("dat/surveillance_linelist_clean_20141201.rds", package = "introexercises")) 

hospitals <- bind_rows(rio::import(system.file("dat/hospitals/20141201_hosp_port.csv", package = "introexercises")),
                    rio::import(system.file("dat/hospitals/20141201_hosp_central.csv", package = "introexercises"))) %>% 
             select(hospital, date_hospitalisation, outcome, date_outcome) %>% 
             janitor::clean_names()

## NOT USED ANYMORE
# geo_data <- rio::import(system.file("dat/pop/sle_admpop_adm3_2020.csv", package = "introexercises")) %>% 
#                  select(-c(Female, Male), -starts_with("T"))


## NOT USED ANYMORE Make the hospital information dataframe
# hospital_dirty = data.frame(
#   hosp_name     = c("central hospital", "military", "military", "port", "St. Mark's", "ignace", "sisters"),
#   catchment_pop = c(1950280, 40500, 10000, 50280, 12000, 5000, 4200),
#   level         = c("Tertiary", "Secondary", "Primary", "Secondary", "Secondary", "Primary", "Primary")
# )
# 
# hospital_clean <- hospital_dirty %>% 
#   mutate(
#     hosp_name = case_when(
#       # criteria                       # new value
#       hosp_name == "military"          ~ "Military Hospital",
#       hosp_name == "port"              ~ "Port Hospital",
#       hosp_name == "St. Mark's"        ~ "St. Mark's Maternity Hospital (SMMH)",
#       hosp_name == "central hospital"  ~ "Central Hospital",
#       TRUE                             ~ hosp_name
#       )
#     )

# Create smaller linelists for the easier exemples
patients <- tibble(ID = c("patient_1", "patient_2", "patient_3", 
                            "patient_4", "patient_10"), 
                     sexe = c("F", "M", "M", "F", "F"), 
                     age = c(5, 10, 2, 15, 14), 
                     age_unit = c("Year", "Year", "Year", "Year", "Year"))

results <- tibble(ID = c("patient_1", "patient_2", "patient_4", 
                        "patient_5", "patient_6"), 
                 test_result = c("positive", "negative", 
                                 "negative", "positive", "positive"))


df1 <- tibble(ID = c("patient_1", "patient_2", "patient_3"),
              sexe = c("F", "M", "M"))

df2 <- tibble(ID = c("patient_1", "patient_1", "patient_1", "patient_2", "patient_4"),
              date_test = as.Date(c("2021-12-01", "2021-12-26", "2022-01-05", "2021-12-18", "2022-01-01")),
                 test_result = c("positive", "negative", "negative", "positive", "positive"))

hosp_central <- rio::import(system.file("dat/hospitals/20141201_hosp_central.csv", package = "introexercises"))
hosp_military <- rio::import(system.file("dat/hospitals/20141201_hosp_military.csv", package = "introexercises"))
hosp_port <- rio::import(system.file("dat/hospitals/20141201_hosp_port.csv", package = "introexercises"))
hosp_smmh <- rio::import(system.file("dat/hospitals/20141201_hosp_smmh.csv", package = "introexercises"))
hosp_other <- rio::import(system.file("dat/hospitals/20141201_hosp_other.csv", package = "introexercises"))
hosp_missing <- rio::import(system.file("dat/hospitals/20141201_hosp_missing.csv", package = "introexercises"))

lab <- rio::import(system.file("dat/lab_results_20141201.xlsx", package = "introexercises"))
investigations <- rio::import(system.file("dat/case_investigations_20141201.xlsx", package = "introexercises"))

# hide non-exercise code chunks ------------------------------------------------
knitr::opts_chunk$set(echo = FALSE)

Introduction to R for Applied Epidemiology and Public Health

Welcome

Welcome to the course "Introduction to R for applied epidemiology", offered by Applied Epi - a nonprofit organisation and the leading provider of R training, support, and tools to frontline public health practitioners.

knitr::include_graphics("images/logo.png", error = F)

Joining data

This exercise focuses on joining data frames.

Format

This exercise guides you through tasks that you should perform in RStudio on your local computer.

Getting Help

There are several ways to get help:

1) Look for the "helpers" (see below) 2) Ask your live course instructor/facilitator for help
3) Schedule a 1-on-1 call with an instructor for "Course Tutoring" 4) Post a question in Applied Epi Community

Here is what those "helpers" will look like:

r fontawesome::fa("lightbulb", fill = "gold") Click to read a hint

Here you will see a helpful hint!

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

linelist %>% 
  filter(
    age > 25,
    district == "Bolo"
  )

Here is more explanation about why the solution works.

Quiz questions

Answering quiz questions will help you to comprehend the material. The answers are not recorded.

To practice, please answer the following questions:

quiz(
  question_radio("When should I view the red 'helper' code?",
    answer("After trying to write the code myself", correct = TRUE),
    answer("Before I try coding", correct = FALSE),
    correct = "Reviewing best-practice code after trying to write yourself can help you improve",
    incorrect = "Please attempt the exercise yourself, or use the hint, before viewing the answer."
  )
)

question_numeric(
 "How anxious are you about beginning this tutorial - on a scale from 1 (least anxious) to 10 (most anxious)?",
 answer(10, message = "Try not to worry, we will help you succeed!", correct = T),
 answer(9, message = "Try not to worry, we will help you succeed!", correct = T),
 answer(8, message = "Try not to worry, we will help you succeed!", correct = T),
 answer(7, message = "Try not to worry, we will help you succeed!", correct = T),
 answer(6, message = "Ok, we will get there together", correct = T),
 answer(5, message = "Ok, we will get there together", correct = T),
 answer(4, message = "I like your confidence!", correct = T),
 answer(3, message = "I like your confidence!", correct = T),
 answer(2, message = "I like your confidence!", correct = T),
 answer(1, message = "I like your confidence!", correct = T),
 allow_retry = TRUE,
 correct = "Thanks for sharing. ",
 min = 1,
 max = 10,
 step = 1
)

License

Please email contact@appliedepi.org with questions about the use of these materials.

Learning objectives

In this exercise you will:

Understand the types of joins, their corresponding commands in R, and when to use them
Combine several hospital case linelists together by binding rows
Join together the surveillance linelist, medical linelist, and other lab and case investigation datasets
Better understand script organization and structure

Preparation

Open your R project and R Markdown script.

Run each code chunk in your R Markdown script, chunk by chunk, using the green play button in the top right corner of each chunk. The following three chunks are of particular importance:

Load packages
Import data
Clean the surveillance linelist surv

You should now have successfully loaded the required packages and populated your environment with surv_raw and surv.

Create a code chunk for Joining Data

Create a new chunk for "Joining data" below your data cleaning code chunk (for creating surv).

You can write a comment in the chunk to note its purpose

# Joining data

Script organization

NOTE: We will be navigate through our R Markdown script frequently, so make sure that it is well organised!

Ideally, your script should approximately align with this outline:

YAML
Setup options
Load packages
Import data
(Optional) Exploratory analysis
Clean surveillance linelist
NEW - Joining data
Text summary of the outbreak
Summary tables
Plots
(Optional) Testing area

Ensure that eval = FALSE is set for the two optional code chunks so they are not printed to the report.

Binding rows

Combining datasets can sometimes be as simple as combining the rows of data frames that have exactly the same columns. For example, if you have multiple study sites, or clinical sites, which are sending you data in exactly the same format.

Hospital datasets

In our case study Ebola outbreak, there are six datasets from various hospitals in the "data/raw/hospitals" subfolder of the "ebola" R project. :

20141201_hosp_central.csv
20141201_hosp_military.csv
20141201_hosp_port.csv
20141201_hosp_smmh.csv
20141201_hosp_other.csv
20141201_hosp_missing.csv

Each file corresponds with a particular hospital (Central, Military, Port, St. Mark's). One contains records for patients from other hospitals, and another with patients for whom the hospital data is missing.

In the code chunk for Importing Data (near the top of your script), add commands to import the hospital files from the "data/raw/hospitals" subfolder. Name the data frames in R as hosp_central, hosp_port, hosp_missing, etc.

Remember, you can run entire code chunks with their green play button, or you can run specific lines within a chunk by placing the cursor and pressing Ctrl and Enter (or Cmd Enter on a Mac).

Once you have added these commands, re-run the entire chunk, to import the 6 hospital datasets.

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

As a reminder, you should write these commands in the importing data code chunk of your R Markdown, for optimal organization. Here is how your import data code chunk should look:

# Import data -------------------------------------------------------------

# surveillance dataset
surv_raw <- import(here("data", "raw", "surveillance_linelist_20141201.csv"))

# hospital datasets
hosp_central  <- import(here("data", "raw", "hospitals", "20141201_hosp_central.csv"))
hosp_military <- import(here("data", "raw", "hospitals", "20141201_hosp_military.csv"))
hosp_other    <- import(here("data", "raw", "hospitals", "20141201_hosp_other.csv"))
hosp_port     <- import(here("data", "raw", "hospitals", "20141201_hosp_port.csv"))
hosp_smmh     <- import(here("data", "raw", "hospitals", "20141201_hosp_smmh.csv"))
hosp_missing  <- import(here("data", "raw", "hospitals", "20141201_hosp_missing.csv"))

Navigate to your Environment and click on the data frames to observe their column names.

quiz(caption = "Quiz - Hospital linelists",
    question("Are the column names the same in each of the six hospital linelists?",
    allow_retry = T,
    answer("Yes", correct = T),
    answer("No")
  )
)

Because the column names are the same, you can simply "bind" the rows - digitally stacking the 6 data frames on top of one another.

This is best done with the function bind_rows(), from the package {dplyr}. This function is flexible in that the columns do not have to be in the same order for the data frames to be bound together. It will align the columns automatically - they must only have the same names.

Here is the syntax - it is quite simple:

bind_rows(df1, df2, df3, df4, ...)

Above, the "dfX" values would be replaced by the names of the data frames you wish to combine.

In your "Joining data" code chunk (just below your data cleaning chunk), write and run a command that binds the six hospital datasets, and save the result into an object called hosp.

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

hosp <- bind_rows(hosp_central, 
                  hosp_port, 
                  hosp_military, 
                  hosp_smmh, 
                  hosp_other, 
                  hosp_missing)

The order of the data frames within the bind_rows() function is not important, for the purposes of this example, since all 6 hospital datasets have the same column names and number of columns.

You will know the commands have worked if you see hosp object appear in your environment.

Now click on hosp to view the data frame that you created. Does it look correct?

Is there a column hospital with various values?
Do the number of rows equal the sum of all the smaller datasets combined?
Is the number of columns consistent with the smaller datasets?
Are the column names the same as in the smaller datasets?
What is the name of the unique identifier column?

Joins review

Often, you will want to combine data using joins, not simply by binding rows together. Let's take a moment to review the types of joins:

Review of join types

Take a few minutes to review the information below. We give it to you as text descriptions, as a schematic, and as animations (source).

NOTE: the most common joins in public health contexts are left joins and anti joins.

Schematic

knitr::include_graphics("images/summary_joins.png", error = F)

Text descriptions

Mutating joins: add new variables to a data frame

Left join: keep all rows from the baseline data frame; add columns and matched rows from the second data frame. For unmatched records, the columns of the baseline data frame are filled as NA.
Right join: keep only matched rows from the baseline data frame, and add columns and unmatched rows from the second data frame. For unmatched records, the columns of the second data frame are filled as NA. Another way to think about this is: it is a left join in which the second-listed data frame is the baseline.
Full join: combine two full data frames, keeps all rows and columns.
Inner join: take the intersection of two data frames, keeps only rows present in both data frames

Filtering joins: filter a data frame based on another one

Anti join: keep all lines from baseline data frame without matches in the second data frame; no columns are added from the second data frame.
Semi join: keep all lines from baseline data frame with matches in the second data frame; no columns are added from the second data frame.

Animations

knitr::include_graphics("images/left-join.gif", error = F)

knitr::include_graphics("images/right-join.gif", error = F)

knitr::include_graphics("images/full-join.gif", error = F)

knitr::include_graphics("images/inner-join.gif", error = F)

knitr::include_graphics("images/anti-join.gif", error = F)

knitr::include_graphics("images/semi-join.gif", error = F)

Further resources

Tidyverse site presentation of mutating joins and filtering joins: some explanations + examples
Tidyverse reference page on mutating joins: here you will find the full list of arguments for the joins functions
Epi R Handbook page on joins

Show your understanding

Review these two mini dataframes:

Patient data (named patients)

patients %>% kbl() %>% kable_styling(full_width = FALSE, 
                                       bootstrap_options = c("striped", "hover", "condensed"))

Lab results (named results)

results %>% kbl() %>% kable_styling(full_width = FALSE, 
                                   bootstrap_options = c("striped", "hover", "condensed"))

Now answer the quiz questions below.

quiz(caption = "Which join would you use:",
     question("To add the age and sex for all patients into the lab database, if available there",
              answer("left_join(patients, results, by = 'ID')"),
              answer("right_join(patients, results, by = 'ID')",
                     correct = TRUE),
              answer("inner_join(patients, results, by = 'ID')",
                     message = "An inner join would not include patients from the results database without a match in the patient database. Rows for patient 5 and 6 would be lost"),
              answer("full_join(patients, results, by = 'ID')",
                     message = "An full join would bring in rows with no results from the patient database and add unwanted NA. Here we want to use the results dataframe as the reference."),
              answer("anti_join(patients, results, by = 'ID')"),
              answer("semi_join(patients, results, by = 'ID')"),
              allow_retry = TRUE),


     question("To get a dataframe with patients for which we have data for age, sex and test result",
              answer("left_join(patients, results, by = 'ID')"),
              answer("right_join(patients, results, by = 'ID')"),
              answer("inner_join(patients, results, by = 'ID')",
                     correct = TRUE),
              answer("full_join(patients, results, by = 'ID')",
                     message = "The full join would include all lines from the two dataframes, even if they do not have a match in the other dataframe"),
              answer("anti_join(patients, results, by = 'ID')"),
              answer("semi_join(patients, results, by = 'ID')"),
              allow_retry = TRUE),


     question("To keep all the patients from the results dataframe for which we have age and sex data (but not add those columns to the results dataframe)",
              answer("left_join(results, patients, by = 'ID')"),
              answer("right_join(patients, results, by = 'ID')"),
              answer("inner_join(results, patients, by = 'ID')",
                     message = "You are close. But an inner join in this case would import the columns from the patients dataframe, which we do not want"),
              answer("full_join(results, patients,  by = 'ID')"),
              answer("anti_join(patients, results, by = 'ID')"),
              answer("semi_join(results, patients, by = 'ID')",
                     correct = TRUE),
              allow_retry = TRUE), 


     question("To get the list of patients for whom we have age and sex, but no test result",
              answer("left_join(patients, results, by = 'ID')"),
              answer("right_join(patients, results, by = 'ID')"),
              answer("inner_join(patients, results, by = 'ID')"),
              answer("full_join(patients, results, by = 'ID')"),
              answer("anti_join(patients, results, by = 'ID')",
                     correct = TRUE),
              answer("semi_join(results, patients, by = 'ID')"),
              allow_retry = TRUE),


     question("Which of these would return only the rows found in patients?",
              answer("inner_join(patients, results, by = 'ID')",
                     correct = FALSE),
              answer("full_join(patients, results, by = 'ID')",
                     correct = FALSE),
              answer("left_join(patients, results, by = 'ID')",
                     correct = TRUE),
              allow_retry = TRUE),


     question("Which of these would use NA to fill-in values in rows that did not match?",
              answer("full_join(patients, results, by = 'ID')",
                     correct = TRUE),
              answer("semi_join(patients, results, by = 'ID')",
                     correct = FALSE),
              answer("anti_join(patients, results, by = 'ID')",
                     correct = FALSE),
              answer("inner_join(patients, results, by = 'ID')",
                     correct = FALSE),
              allow_retry = TRUE)
)

Joining

Syntax

The syntax of the join functions is always the same:

The first data frame listed is the "baseline"
The second data frame listed is joined to the "baseline"

DO NOT ADD THE BELOW CODE INTO YOUR R Markdown. This is simply showcasing the format of the join functions.

XXX_join(dataframe1,      # "baseline" dataframe
         dataframe2,      # other dataframe
         by = "column")   # column used to match rows


# Or, with a pipe:
dataframe1 %>%                # "baseline" dataframe
     XXX_join(dataframe2,     # other dataframe
              by = "column")  # columns used to match rows

If you want to match data frames based on two or more columns, you can provide the columns to the by = argument within a vector c(), separated by commas. Rows will only match to each other if the values in all these columns are exactly the same. In the join below, both data frames have the columns age and sex, and rows are matched if they have the same values in both these columns.

DO NOT ADD THE BELOW CODE INTO YOUR R Markdown. This is simply show-casing how to join based on two columns.

XXX_join(dataframe1, dataframe2,      
         by = c("age", "sex")   # columns used to match rows

Many times, the names of comparable columns in the two data frames are different. For example, ages might be recorded in column age in one data frame, and recorded in the column Ages in the second data frame. You can tell the join both of the names by using the = equals symbol within the vector c(). This join below matches rows when the value in age in the first data frame is the same as the value in Ages in the second data frame.

DO NOT ADD THE BELOW CODE INTO YOUR R Markdown. This is only for example.

XXX_join(dataframe1, dataframe2,      
         by = c("age" = "Ages")   # columns with different names

Finally, below is an example of both syntaxes together. This command matches rows when:

The value in age in the first data frame is the same as the value in Ages in the second data frame, AND
The value in sex in the first data frame is the same as the value in Sex in the second data frame, AND
The value in city is the first data frame is the same as the value in city in the second data frame

DO NOT ADD THE BELOW CODE INTO YOUR R Markdown. This is simply show-casing how to join based on matching multiple columns, where some of the columns to match are named differently in each respective data frame.

XXX_join(dataframe1, dataframe2,      
         by = c("age" = "Ages",  # different column names
                "sex" = "Sex",   # different column names
                "city")          # both data frames have column "city"

Join hospital data

Now that we have a hosp (hospital) linelist, let's join it into the surv surveillance linelist.

Inspect data before joining

It is critically important to know your datasets before joining them together and anticipate what will happen when you join.

So much can go wrong and be hidden from view. Spend time in advance to understand:

The dimensions (number of rows and columns) of your datasets
Whether there are duplicates in either of the datasets
Whether the identifiers are written similarly in both datasets
How the contents of the two datasets overlap
Which column names are present in both datasets

Much of this pre-join testing will occur in a Testing area code chunk.

Review dimensions

In the Testing area code chunk, use the function dim() on the two data frames.

dim() is a shortcut to return the number of rows and the number of columns. Alternatively, you can use nrow() and ncol() commands individually.

How do the dimensions of surv and hosp compare?

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

dim(surv)
dim(hosp)

Review duplicates

Now, it would be good to understand whether there are duplicated identifiers in the surv data frame, or the hosp data frame. Duplicates may result in joins that increase the number of rows in the baseline data frame.

Using your knowledge from the module on summary tables, write a command in your Testing area chunk that shows the surv identifiers (case_id) that appear more than once.

r fontawesome::fa("lightbulb", fill = "gold") Click to read a hint

You can pipe the surv dataset into count(case_id) to produce a table of counts for all case_id values in surv. Then pipe to filter() and specify a logic criteria so that only those which appear more than once remain.

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

surv %>% 
  count(case_id) %>% 
  filter(n > 1)

Now do the same for the data frame hosp using the identifier column ID, also in your "Testing area" chunk.

We now know that there are some patients who have several rows in surv. Ideally, we would investigate these duplicates to see if they correspond to different visits, or if some patients were wrongly entered twice. However, today our focus is on joining so let's keep all these records.

You may ask: Why were these duplicates not removed during the data cleaning steps? The answer is that we used distinct(), which by default removes rows that are entirely duplicates of another row. These rows we identified here have duplicate case_id, but different values in other columns.

Review identifier values

Before joining, it is extremely important to consider whether the column(s) used to match the two data frames are clean.

The join functions from {dplyr} are using exact matching: the values need to be exactly the same in both data frames to be matched.

Note: Other packages perform probabilistic ("fuzzy") matching (see the chapter on joining in the Epi R Handbook).

Do we know whether the patient IDs were entered in the same way in both data frames? If data comes from a clean database and is a primary key, probably. With Excel data entered by many different people in emergency settings, maybe not. In general, it is a good rule to scrutinize the columns you want to use as identifier/keys to join your data frames.

Spend some time looking at the structure of the values in the surv column case_id. Do they follow the same pattern as ID in the hosp data frame?

Let's look together at a sample of IDs from both data frames. The commands below arrange both data frames by their identifier column, then return only that column with pull(), and then take only the first 10 values.

Run the below code in your "Testing area" code chunk, if you would like to view the output in RStudio. Alternatively, see below the first 10 identifier values from surv.

# for each dataset, sort and print the first 10 identifier values

# for the surveillance dataset
surv %>% 
  arrange(case_id) %>%
  pull(case_id) %>%
  head(10)

Note that if you see the above output appearing on two lines, this is not because of any difference in the values. There are 10 values produced by head(10), but because the window is narrow they have been wrapped around. Note the numbers in brackets (e.g. [9]) that tell you which value is starting the new line.

Run the same code in your "Testing area" code chunk, but this time for hosp.

# for the hospital dataset
hosp %>%
  arrange(ID) %>%
  pull(ID) %>%
  head(10)

It seems the identifiers use similar patterns and style. With very dirty data, you should inspect the IDs in case several patterns of IDs were used within the same file.

Review overlap

We will now run an anti_join() to understand how the two data frames relate to each other.

Write a command in your "Joining" chunk, that performs an anti_join(), first using hosp as the "baseline" data frame. Assign the output to an object called anti_surv.

r fontawesome::fa("lightbulb", fill = "gold") Click to read a hint

Use anti_join(dataframe1, dataframe2, by = ). Recall that the identifier columns in the two data frames are named differently, so you must tell the join that they should be compared with an =, and within a vector c().

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

anti_surv <- anti_join(hosp, surv, by = c("ID" = "case_id"))  # Which rows in hosp are NOT present in surv?

We can see that there are some patients in hosp that are not present in surv. Now perform the opposite anti_join(), using hosp as the baseline, also in your "Joining" code chunk.

Are there any patients in surv not present in hosp? Remember to swap the column names in the by = argument to align with the order of the joined data frames.

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

Did you remember to switch the order of the by = as well as the data frames?

anti_hosp <- anti_join(surv, hosp, by = c("case_id" = "ID")) # Which rows in surv are not present in hosp?

So we see that there are 0 rows from that second anti join. The surv data frame has all the patients that are in the hosp data frame, and that the hosp data frame is a subset of the surv patients.

quiz(caption = "Quiz - Data inspections",
    question("If you were to conduct a full_join() with surv as the baseline and hosp joined into it, how many *additional* rows would you expect to appear in the joined data frame?",
    allow_retry = T,
    answer("0", message = "We are asking about a 'full' join."),
    answer("17"),
    answer("40", correct = T, message = "Remember, a full join includes ALL rows from both data frames."),
    answer("5 rows would be removed")
  )
)

Review column names

It is important to know what the column names are in each data frame.

Review again the column names of each data frame by running the following code in your "Testing area" code chunk:

surv %>% names()

hosp %>% names()

quiz(caption = "Quiz - Compare columns",
  question("Which columns in the hosp linelist also exist in the surv linelist?",
    allow_retry = T,
    answer("hospital", correct = T),
    answer("date_outcome"),
    answer("age", correct = T),
    answer("age_unit", correct = T),
    answer("sex", correct = T)
  ),
  question("Which columns exist only in the hosp linelist?",
    allow_retry = T,
    answer("date_hospitalisation", correct = T),
    answer("age"),
    answer("time_admission", correct = T),
    answer("outcome", correct = T),
    answer("date_outcome", correct = T)
  )
)

Let's do an experiment: write and run a command in the "Joining" chunk to join the two data frames:

Use a left_join()
Use surv as the baseline/reference data frame
Use c("case_id" = "ID") as the link between the data frames
Save the result as the object test_join

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

test_join <- surv %>% 
  left_join(hosp, by = c("case_id" = "ID"))

What are the dimensions of the new data frame? Run dim(test_join) in your "Testing area" chunk and compare it with the previous dimensions of surv.

Now, click on test_join in your R Environment to view it closely. See the columns near the left side that were part of the original surv data frame, and then scroll to the right to see the new columns that have been added, from the hosp data frame.

What do you notice about the columns sex, age, age_unit, and hospital?

quiz(caption = "Quiz - Joins",
    question("What happened to these columns?",
    allow_retry = T,
    answer("They were removed"),
    answer("They were all moved to the left end of the data frame"),
    answer("They appear twice - once with '.x' appended, and again with '.y' appended", correct = T)
  )
)

These column names existed in both data frames and were NOT used in by = to match rows, so when the data were joined, R had to differentiate between the duplicate columns by adding the suffixes .x and .y.

Are the values the same in both sex.x and sex.y How can you examine this?

Since they are now in the same data frame, you can cross-tabulate the two columns. Run the following code in your "Testing area" code chunk to print the results.

# cross tabulate the two columns in the joined dataset
test_join %>% 
  tabyl(sex.x, sex.y)

You can see that sex.x (from the original surv dataset, which was cleaned by you in a previous module) has the values "female" and "male".

In contrast, sex.y, which came from the hospital linelist, has uncleaned values "f", "m", "", and NA.

What to do with this situation? This is a classic scenario when trying to combine datasets - what to do with redundant columns?

But are they truly redundant columns? You will need to ask yourself these questions:

Do you want to keep the sex, age, hospital, etc values from the hospital linelist - for later reference?
If the age or sex for a patient differ between surv and hosp, which one will you consider to be correct?

These are questions that only you can answer about your dataset.

You have three options:

1) Perform the join as above, to keep all the columns
* This will create the .x and .y suffices, which you can edit with rename()

2) Set all of the redundant columns as identifiers in the join
* This keep only the columns of the baseline data frame. DO NOT ADD THIS CODE TO YOUR SCRIPT, it is simply an example.

test <- full_join(surv, hosp,
          by = c(
            "case_id" = "ID",
            "age",                    # the other columns are
            "age_unit",               # named the same in both 
            "sex",
            "hospital"))

3) Remove the redundant columns before the merge
* If you do not care to retain the redundant columns in the second data frame, remove them with select() prior to the join.

For this exercise, return to the "Joining" code chunk of your R Markdown and write code that uses Option 3.

1) Go to the bind_rows() command that creates hosp, and pipe to a select command that leaves only "ID" and the new columns.
2) Take advantage of the re-naming functionality of select(), to rename "ID" to "case_id", so that the join command will be more simple.
3) Re-run the command

hosp <- bind_rows(hosp_central, hosp_port, hosp_military,
                  hosp_smmh, hosp_other, hosp_missing) %>% 

  # select specific columns from hosp, and re-name ID as case_ID
  select(
    case_id = ID,          # select and rename
    date_hospitalisation,  # select
    time_admission,        # select
    date_outcome,          # select
    outcome)               # select

4) Now, in your Joining code chunk, write a left_join() for surv and hosp, using case_id as the common identifier. Assign the output joined object to a data frame called combined.

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

# Join the two data frames with a left join
combined <- left_join(surv, hosp, by = "case_id")

Congratulations on completing your first join!

Join lab data

Now we need to join the laboratory testing results.

Complete the following steps to join the lab data frame into the combined data frame

Import and clean the lab data

In the importing code chunk of (near the top), write and run a command to do the following:

Import the XLSX file located at "data/raw/lab_results_20141201.xlsx"
Clean the column names using clean_names()
Assign it to an object called lab.

Make sure to run the code once you have written the command!

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

# Import lab data
# (place this in the importing chunk of your script)
lab <- import(here("data", "raw", "lab_results_20141201.xlsx")) %>% 
  clean_names()

Don't forget to use the correct file extension for the lab file (.xlsx)! And make sure to highlight and run this code to ensure the object lab is added to the R Environment.

Join the lab data

In the "Testing area" chunk, write and run commands to compare the combined and lab data frames, such as dim().

Then, in the "Joining" chunk (below your hosp join), conduct a left_join() to add the lab column ct_blood to the combined data frame, matching on case_id.

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

# Join the two data frames with a left-join
combined <- left_join(combined, lab, by = "case_id")

Verify manually in the RStudio Viewer that the join occurred correctly.

Add case investigation data

Finally, we need to add new data from case investigations. Teams have made phone calls and conducted home visits to do epidemiological detective work. They have recorded the likely source cases, dates of infection, and context of infection for some of the cases.

Complete the following steps to left join the case investigations data frame to the combined data frame

1) In the "Importing" chunk of your R Markdown (near the top), write and run a command to import the XLSX file located at "data/raw/case_investigations_20141201.xlsx" and save it as investigations.
* Pipe the imported data into a select() function in which you keep only the column(s) necessary for the join and new columns to be added to combined. We want to include all columns except for age, age_unit and sex.

2) In the Testing area chunk, write and run commands to compare the dimensions of investigations and combined

3) In the Joining chunk, write and run commands to conduct a left_join() to add the investigations columns to combined

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

In the import chunk:

Don't forget to use the correct file extension for the file (.xlsx)!

# In the import data code chunk 

# Import lab data 
# (add to the import code chunk of your R script)
investigations <- import(here("data", "raw", "case_investigations_20141201.xlsx")) %>% 
 # remove unnecessary columns  
 select(-c(age, age_unit, sex))

In the testing chunk:

# In the testing area code chunk

# Compare data frame dimensions
# (write and run directly in the Console)
dim(investigations)
dim(combined)

# Compare case_id (our matching variable)
# (write and run directly in the Console)
# for the investigations dataset
investigations %>% arrange(case_id) %>% pull(case_id) %>% head(10)

# for the combined dataset
combined %>% arrange(case_id) %>% pull(case_id) %>% head(10)

In the joining chunk:

# In the joining code chunk

# Join the two data frames with a left-join
# (add to the joining code chunk of your R script)
combined <- left_join(combined, investigations, by = "case_id")

Verify manually by looking at the data frame that the join occurred correctly.

Post-join cleaning

Now that we have these new columns, we should do some final cleaning on the combined data frame.

Column names and classes

We have new columns in our data frame, so we should write code to ensure that they are the correct class.

Write a short command in your Joining chunk to address the following issues in combined:

Use clean_names() to ensure all the column names have proper syntax
Use mutate() to properly classify the three new date columns as class Date, using the {lubridate} functions you learned earlier
Redefine date_hospitalisation as class Date
Redefine date_outcome as class Date
Redefine date_infection as class Date
Use mutate() to replace empty characters "" with NA, using na_if(), for the columns outcome and hospital (see your cleaning pipe chain code chunk for an example)

quiz(caption = "Quiz - Date conversions",
  question("Which {lubridate} function should you use to convert date_hospitalisation to class date?",
    allow_retry = T,
    answer("mdy()", correct = T),
    answer("ymd()", message = "No, you need to choose the function that aligns with the format of the data as written PRIOR to use of the function."),
    answer("dmy()", message = "No, you need to choose the function that aligns with the format of the data as written PRIOR to use of the function."),
    answer("imho()", message = "In my humble opinion, this is not a real function.")
  ),
    question("Which {lubridate} function should you use to convert date_outcome to class date?",
    allow_retry = T,
    answer("mdy()", correct = T),
    answer("ymd()", message = "No, you need to choose the function that aligns with the format of the data as written PRIOR to use of the function."),
    answer("dmy()", message = "No, you need to choose the function that aligns with the format of the data as written PRIOR to use of the function."),
    answer("imho()", message = "In my humble opinion, this is not a real function.")
  ),
    question("Which {lubridate} function should you use to convert date_infection to class date?",
    allow_retry = T,
    answer("mdy()", message = "No, you need to choose the function that aligns with the format of the data as written PRIOR to use of the function."),
    answer("ymd()", correct = T),
    answer("dmy()", message = "No, you need to choose the function that aligns with the format of the data as written PRIOR to use of the function."),
    answer("imho()", message = "In my humble opinion, this is not a real function.")
  )
)

r fontawesome::fa("lightbulb", fill = "gold") Click to read a hint

Use clean_names() first to standardize the syntax. To establish date class, choose between mdy(), ymd(), or dmy() to place within the mutate() function, based on the current format of the date (look at the data frame) for example:

mutate(date_outcome = mdy(date_outcome))

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

Add to your Joining chunk

# (add this to your joining code chunk)

# Clean the new columns that have been joined to 'combined'
combined <- combined %>% 

  # convert all column names to lower case and remove spaces
  clean_names() %>% 

  # covert new columns to class date
  mutate(date_hospitalisation = mdy(date_hospitalisation),
         date_outcome         = mdy(date_outcome),
         date_infection       = ymd(date_infection)) %>% 

  # clean outcome and hospital missings
  mutate(outcome = na_if(outcome, ""),
         hospital = na_if(hospital, ""))

You have some flexibility on where to put this data cleaning:

You could do this cleaning to each of the lab and investigations data frames when they are imported
You could have a part of your "Cleaning pipe chain" code chunk in which you clean these data frames
You can do this final cleaning once you join in your joining code chunk, as we have done in this exercise

Clean up script

At this point, we now have joins and several new import commands.

It is important to keep our R Markdown script orderly and in a logical flow. You should have the following general organization:

1) Load packages
3) Import data
4) Clean primary data frame
5) Perform joins (and perform any residual cleaning) 6) Tables and visualizations

Take a moment to ensure that your script aligns with the above, and that you can run it from top to bottom without any errors.

Final checks

Knit your report to ensure there are no errors. If you encounter errors, try to "de-bug" them by:

Identifying which line caused the problem
Run the chunks above the error location
Run code in the problem chunk line-by-line
Ask an instructor for help if needed

Add explanatory text

Most of the code you have added will run "silently" and should not appear in the knitted report. However, review the steps and see if you want to add any informative text about the datasets that have been added to the analysis.

Also ensure the code is well documented with # symbols, so that other readers can understand your actions and rationale.

End

Congratulations! You have finished this exercise on joining data.

We now have a much more complete dataset to work with, including columns like outcome, date_hospitalisation, blood_ct, and source.

Remember to save your script before leaving RStudio.

If you want to learn a bit more about shortcuts in R Studio and other helpful tips, you can go on to the next extra topic.

Extras

File paths

R Studio can help you quickly and correctly specify file paths by using Tab to auto-complete.

Start typing a file path (in quotes) and press Tab on your keyboard. R Studio will list the available sub-folders and files for that location. The example below shows the result of pressing Tab after typing "data/". Click the desired sub-folder or file or navigate to it with your keyboard and press enter.

knitr::include_graphics("images/filepaths1.png", error = F)

You can keep doing this until reaching your final destination:

knitr::include_graphics("images/filepaths2.PNG", error = F)

Especially in an R Markdown it is important to still write this file path within here() because of the behavior noted in the Extras of the R Markdown module.

Keyboard shortcuts

Some very useful keyboard shortcuts are below.

See a full list by going to Tools menu and "Keyboard Shortcuts Help".

See more examples here: https://epirhandbook.com/en/r-basics.html#keyboard-shortcuts.

| Windows/Linux | Mac | Action | |---------------------|---------------------|-------------------------------| | Esc | Esc | Interrupt current command (useful if code being run is stuck) | | Ctrl+s | Cmd+s | Save (script) | | Tab | Tab | Auto-complete | | Ctrl + Enter | Cmd + Enter | Run current line(s)/selection of code | | Ctrl + Shift + C | Cmd + Shift + c | comment/uncomment the highlighted lines | | Alt + - | Option + - | Insert \<- | | Ctrl + Shift + m | Cmd + Shift + m | Insert %>% | | Ctrl + Alt + i | Cmd + Shift + r | Insert code chunk (into R Markdown) | | Ctrl + f | Cmd + f | Find and replace in current script | | Ctrl + Shift + f | Cmd + Shift + f | Find in files (search/replace across many scripts) |

appliedepi/introexercises documentation built on April 22, 2024, 1:01 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

appliedepi/introexercises

In appliedepi/introexercises:

Introduction to R for Applied Epidemiology and Public Health

Welcome

Joining data

Format

Getting Help

Quiz questions

License

Learning objectives

Preparation

Create a code chunk for Joining Data

Script organization

Binding rows

Hospital datasets

Joins review

Review of join types

Schematic

Text descriptions

Mutating joins: add new variables to a data frame

Filtering joins: filter a data frame based on another one

Animations

Further resources

Show your understanding

Joining

Syntax

Join hospital data

Inspect data before joining

Review dimensions

Review duplicates

Review identifier values

Review overlap

Review column names

Join lab data

Import and clean the lab data

Join the lab data

Add case investigation data

Post-join cleaning

Column names and classes

Clean up script

Final checks

End

Extras

File paths

Keyboard shortcuts

R Package Documentation

Browse R Packages

We want your feedback!