In appliedepi/introexercises:

# load packages ----------------------------------------------------------------
library(introexercises)
library(learnr)
library(gradethis)
library(dplyr)
library(flair)
library(ggplot2)
library(lubridate)
library(fontawesome)
library(janitor)
library(kableExtra)
# library(RMariaDB)        # connect to sql database

## set options for exercises and checking ---------------------------------------

## Define how exercises are evaluated 
gradethis::gradethis_setup(
  ## note: the below arguments are passed to learnr::tutorial_options
  ## set the maximum execution time limit in seconds
  exercise.timelimit = 60, 
  ## set how exercises should be checked (defaults to NULL - individually defined)
  # exercise.checker = gradethis::grade_learnr
  ## set whether to pre-evaluate exercises (so users see answers)
  exercise.eval = FALSE 
)

# ## event recorder ---------------------------------------------------------------
# ## see for details: 
# ## https://pkgs.rstudio.com/learnr/articles/publishing.html#events
# ## https://github.com/dtkaplan/submitr/blob/master/R/make_a_recorder.R
# 
# ## connect to your sql database
# sqldtbase <- dbConnect(RMariaDB::MariaDB(),
#                        user     = Sys.getenv("userid"),
#                        password = Sys.getenv("pwd"),
#                        dbname   = 'excersize_log',
#                        host     = "144.126.246.140")
# 
# 
# ## define a function to collect data 
# ## note that tutorial_id is defined in YAML
#     ## you could set the tutorial_version too (by specifying version:) but use package version instead 
# recorder_function <- function(tutorial_id, tutorial_version, user_id, event, data) {
#     
#   ## define a sql query 
#   ## first bracket defines variable names
#   ## values bracket defines what goes in each variable
#   event_log <- paste("INSERT INTO responses (
#                        tutorial_id, 
#                        tutorial_version, 
#                        date_time, 
#                        user_id, 
#                        event, 
#                        section,
#                        label, 
#                        question, 
#                        answer, 
#                        code, 
#                        correct)
#                        VALUES('", tutorial_id,  "', 
#                        '", tutorial_version, "', 
#                        '", format(Sys.time(), "%Y-%M%-%D %H:%M:%S %Z"), "',
#                        '", Sys.getenv("SHINYPROXY_PROXY_ID"), "',
#                        '", event, "',
#                        '", data$section, "',
#                        '", data$label,  "',
#                        '", paste0('"', data$question, '"'),  "',
#                        '", paste0('"', data$answer,   '"'),  "',
#                        '", paste0('"', data$code,     '"'),  "',
#                        '", data$correct, "')",
#                        sep = '')
# 
#     # Execute the query on the sqldtbase that we connected to above
#     rsInsert <- dbSendQuery(sqldtbase, event_log)
#   
# }
# 
# options(tutorial.event_recorder = recorder_function)

# hide non-exercise code chunks ------------------------------------------------
knitr::opts_chunk$set(echo = FALSE)


# Data prep --------------------------------------------------------------------
# Import
surv <- rio::import(system.file("dat/old_version/surveillance_linelist_clean_20141201.rds", package = "introexercises")) 

hospitals <- bind_rows(rio::import(system.file("dat/old_version/hospitals/20141201_hosp_port.csv", package = "introexercises")),
                    rio::import(system.file("dat/old_version/hospitals/20141201_hosp_central.csv", package = "introexercises"))) %>% 
             select(hospital, date_hospitalisation, outcome, date_outcome) %>% 
             janitor::clean_names()

## NOT USED ANYMORE
# geo_data <- rio::import(system.file("dat/pop/sle_admpop_adm3_2020.csv", package = "introexercises")) %>% 
#                  select(-c(Female, Male), -starts_with("T"))


## NOT USED ANYMORE Make the hospital information dataframe
# hospital_dirty = data.frame(
#   hosp_name     = c("central hospital", "military", "military", "port", "St. Mark's", "ignace", "sisters"),
#   catchment_pop = c(1950280, 40500, 10000, 50280, 12000, 5000, 4200),
#   level         = c("Tertiary", "Secondary", "Primary", "Secondary", "Secondary", "Primary", "Primary")
# )
# 
# hospital_clean <- hospital_dirty %>% 
#   mutate(
#     hosp_name = case_when(
#       # criteria                       # new value
#       hosp_name == "military"          ~ "Military Hospital",
#       hosp_name == "port"              ~ "Port Hospital",
#       hosp_name == "St. Mark's"        ~ "St. Mark's Maternity Hospital (SMMH)",
#       hosp_name == "central hospital"  ~ "Central Hospital",
#       TRUE                             ~ hosp_name
#       )
#     )

# Create smaller linelists for the easier exemples
patients <- tibble(ID = c("patient_1", "patient_2", "patient_3", 
                            "patient_4", "patient_10"), 
                     sexe = c("F", "M", "M", "F", "F"), 
                     age = c(5, 10, 2, 15, 14), 
                     age_unit = c("Year", "Year", "Year", "Year", "Year"))

results <- tibble(ID = c("patient_1", "patient_2", "patient_4", 
                        "patient_5", "patient_6"), 
                 test_result = c("positive", "negative", 
                                 "negative", "positive", "positive"))


df1 <- tibble(ID = c("patient_1", "patient_2", "patient_3"),
              sexe = c("F", "M", "M"))

df2 <- tibble(ID = c("patient_1", "patient_1", "patient_1", "patient_2", "patient_4"),
              date_test = as.Date(c("2021-12-01", "2021-12-26", "2022-01-05", "2021-12-18", "2022-01-01")),
                 test_result = c("positive", "negative", "negative", "positive", "positive"))

hosp_central <- rio::import(system.file("dat/old_version/hospitals/20141201_hosp_central.csv", package = "introexercises"))
hosp_military <- rio::import(system.file("dat/old_version/hospitals/20141201_hosp_military.csv", package = "introexercises"))
hosp_port <- rio::import(system.file("dat/old_version/hospitals/20141201_hosp_port.csv", package = "introexercises"))
hosp_smmh <- rio::import(system.file("dat/old_version/hospitals/20141201_hosp_smmh.csv", package = "introexercises"))
hosp_other <- rio::import(system.file("dat/old_version/hospitals/20141201_hosp_other.csv", package = "introexercises"))
hosp_missing <- rio::import(system.file("dat/old_version/hospitals/20141201_hosp_missing.csv", package = "introexercises"))

lab <- rio::import(system.file("dat/old_version/lab_results_20141201.xlsx", package = "introexercises"))
investigations <- rio::import(system.file("dat/old_version/case_investigations_20141201.xlsx", package = "introexercises"))

# hide non-exercise code chunks ------------------------------------------------
knitr::opts_chunk$set(echo = FALSE)

Introduction to R for Applied Epidemiology and Public Health

Welcome

Welcome to the live course "Introduction to R for applied epidemiologists", offered by Applied Epi - a nonprofit organisation that offers open-source tools, training, and support to frontline public health practitioners.

knitr::include_graphics("images/logo.png", error = F)

Joining data

This exercise focuses on joining data frames.

Format

This exercise will guide you through a set of tasks.
You should perform these tasks in RStudio and on your local computer.

Getting Help

There are several ways to get help:

1) Look for the "helpers" (see below) 2) Ask your live course instructor/facilitator for help
3) Ask a colleague or other participant in the course for tips
4) Post a question in Applied Epi Community in the category for questions about Applied Epi Training

Here is what those "helpers" will look like:

r fontawesome::fa("lightbulb", fill = "gold") Click to read a hint

Here you will see a helpful hint!

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

linelist %>% 
  filter(
    age > 25,
    district == "Bolo"
  )

Here is more explanation about why the solution works.

Quiz questions

Please complete the quiz questions that you encounter throughout the tutorial. Answering will help you to comprehend the material, and will also help us to improve the exercises for future students.

To practice, please answer the following questions:

quiz(
  question_radio("When should I view the red 'helper' code?",
    answer("After trying to write the code myself", correct = TRUE),
    answer("Before I try coding", correct = FALSE),
    correct = "Reviewing best-practice code after trying to write yourself can help you improve",
    incorrect = "Please attempt the exercise yourself, or use the hint, before viewing the answer."
  )
)

question_numeric(
 "How anxious are you about beginning this tutorial - on a scale from 1 (least anxious) to 10 (most anxious)?",
 answer(10, message = "Try not to worry, we will help you succeed!", correct = T),
 answer(9, message = "Try not to worry, we will help you succeed!", correct = T),
 answer(8, message = "Try not to worry, we will help you succeed!", correct = T),
 answer(7, message = "Try not to worry, we will help you succeed!", correct = T),
 answer(6, message = "Ok, we will get there together", correct = T),
 answer(5, message = "Ok, we will get there together", correct = T),
 answer(4, message = "I like your confidence!", correct = T),
 answer(3, message = "I like your confidence!", correct = T),
 answer(2, message = "I like your confidence!", correct = T),
 answer(1, message = "I like your confidence!", correct = T),
 allow_retry = TRUE,
 correct = "Thanks for sharing. ",
 min = 1,
 max = 10,
 step = 1
)

Icons

You will see these icons throughout the exercises:

Icon |Meaning ------|-------------------- r fontawesome::fa("eye", fill = "darkblue")|Observe
r fontawesome::fa("exclamation", fill = "red")|Alert!
r fontawesome::fa("pen", fill = "brown")|An informative note
r fontawesome::fa("terminal", fill = "black")|Time for you to code!
r fontawesome::fa("window-restore", fill = "darkgrey")|Change to another window
r fontawesome::fa("bookmark", fill = "orange")|Remember this for later

License

Applied Epi Incorporated, 2022
This work is licensed by Applied Epi Incorporated under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License.

Please email contact@appliedepi.org with questions about the use of these materials for academic courses and epidemiologist training programs.

Learning objectives

In this exercise you will:

Review the types of joins, and their corresponding commands in R
Understand which joins to apply and when
Combine several hospital case linelists together by binding rows
Join together surveillance linelist, medical linelist, and other lab and case investigation datasets

Prepare

Open your R project and R script.

Run the code from the top of your script:

Load packages
Import and clean the surveillance linelist surv

Below your cleaning code (pipe chain), create a new script section for "Joining data".

The new header should look something like this:

# Joining data ----------------------------------------------

Binding rows

Combining datasets can sometimes be as simple as combining the rows of data frames that have exactly the same columns. For example, if you have multiple study sites, or clinical sites, that are sending you data in exactly the same format.

Hospital datasets

r fontawesome::fa("window-restore", fill = "darkgrey")

In our case study Ebola outbreak, there are six datasets from various hospitals in the "data/raw/hospitals" subfolder of the "intro_course" R project. In Module 1, you should have re-named them to the following standardized syntax (if not, please do so now, or replace them using the "backup" renamed files in the "data/raw/hospitals/backup" folder):

20141201_hosp_central.csv
20141201_hosp_military.csv
20141201_hosp_port.csv
20141201_hosp_smmh.csv
20141201_hosp_other.csv
20141201_hosp_missing.csv

Each file corresponds with a particular hospital (Central, Military, Port, St. Mark's) and there are datasets containing record for patients from other hospitals, or for whom the hospital is missing.

Near the top of your R script (in the section for importing data), write and run some commands to import the hospital linelist files from the "data/raw/hospitals" subfolder. Name the data frames in R as hosp_central, hosp_port, hosp_missing, etc.

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

As a reminder, you should write these commands in the importing section of your script, for optimal organization.

hosp_central <- import(here("data", "raw", "hospitals", "20141201_hosp_central.csv"))
hosp_military <- import(here("data", "raw", "hospitals", "20141201_hosp_military.csv"))
hosp_port <- import(here("data", "raw", "hospitals", "20141201_hosp_port.csv"))
hosp_smmh <- import(here("data", "raw", "hospitals", "20141201_hosp_smmh.csv"))
hosp_other <- import(here("data", "raw", "hospitals", "20141201_hosp_other.csv"))
hosp_missing <- import(here("data", "raw", "hospitals", "20141201_hosp_missing.csv"))

r fontawesome::fa("eye", fill = "darkblue") Open the data frames in R, and observe the column names

quiz(caption = "Quiz - Hospital linelists",
    question("Are the column names the same in each of the six hospital linelists?",
    allow_retry = T,
    answer("Yes", correct = T),
    answer("No")
  )
)

Because the columns have the same names, you should be able to simply "bind" the rows together - digitally stacking the 6 data frames on top of one another.

This is best done with the function bind_rows(), from the package {dplyr}. This function is flexible in that the columns do not even have to be in the same order for the data frames to be bound together. The function will align the columns automatically - the columns only have to have the same names.

Here is the syntax - it is quite simple:

bind_rows(df1, df2, df3, df4, ...)

Above, "dfX" would be replaced by the names of the data frames that you want to combine.

In the "Joining data" section of your script, write and run a command that binds together the six hospital datasets, and saves the result as hosp.

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

hosp <- bind_rows(hosp_central, hosp_port, hosp_military, hosp_smmh, hosp_other, hosp_missing)

The order of the data frames is not important, for the purposes of this example.

r fontawesome::fa("eye", fill = "darkblue") Now click open and view the hosp data frame that you have created. Does it look correct?

Is there a column hospital with various values?
Do the number of rows equal the sum of all the smaller datasets combined?
Is the number of columns consistent with the smaller datasets?
Are the column names the same as in the smaller datasets?
What is the name of the unique identifier column?

Joins review

Other times, you want to combine data using joins, not simply binding more rows. Let us take a moment to review the different types of joins:

Review of join types

Take a few minutes to review the information below. We give it to you as text descriptions, as a schematic, and as animations (source).

To note, the most common joins in public health contexts are left joins and anti joins.

Schematic

knitr::include_graphics("images/summary_joins.png", error = F)

Text descriptions

Mutating joins: add new variables to a data frame

Left join: keep all rows from the baseline data frame; add columns and matched rows from the second data frame. For unmatched records, the columns of the baseline data frame are filled as NA.
Right join: keep only matched rows from the baseline data frame, and add columns and unmatched rows from the second data frame. For unmatched records, the columns of the second data frame are filled as NA. Another way to think about this is: it is a left join in which the second-listed data frame is the baseline.
Full join: combine two full data frames, keeps all rows and columns.
Inner join: take the intersection of two data frames, keeps only rows present in both data frames

Filtering joins: filter a data frame based on another one

Anti join: keep all lines from baseline data frame without matches in the second data frame; no columns are added from the second data frame.
Semi join: keep all lines from baseline data frame with matches in the second data frame; no columns are added from the second data frame.

Animations

knitr::include_graphics("images/left-join.gif", error = F)

knitr::include_graphics("images/right-join.gif", error = F)

knitr::include_graphics("images/full-join.gif", error = F)

knitr::include_graphics("images/inner-join.gif", error = F)

knitr::include_graphics("images/anti-join.gif", error = F)

knitr::include_graphics("images/semi-join.gif", error = F)

Further resources

Tidyverse site presentation of mutating joins and filtering joins: some explanations + examples
Tidyverse reference page on mutating joins: here you will find the full list of arguments for the joins functions
Epi R Handbook page on joins

Show your understanding

Review these two mini dataframes containing patient data (named patients) and lab results (named results).

patients %>% kbl() %>% kable_styling(full_width = FALSE, 
                                       bootstrap_options = c("striped", "hover", "condensed"))

results %>% kbl() %>% kable_styling(full_width = FALSE, 
                                   bootstrap_options = c("striped", "hover", "condensed"))

Now answer the quiz questions below.

quiz(caption = "Which join would you use:",
     question("To add the age and sex for all patients into the lab database, if available there",
              answer("left_join(patients, results, by = 'ID')"),
              answer("right_join(patients, results, by = 'ID')", correct = TRUE),
              answer("inner_join(patients, results, by = 'ID')", message = "An inner join would not include patients from the results database without a match in the patient database. Rows for patient 5 and 6 would be lost"),
              answer("full_join(patients, results, by = 'ID')", message = "An full join would bring in rows with no results from the patient database and add unwanted NA. Here we want to use the results dataframe as the reference."),
              answer("anti_join(patients, results, by = 'ID')"),
              answer("semi_join(patients, results, by = 'ID')"),
              allow_retry = TRUE),


     question("To get a dataframe with patients for which we have data for age, sex and test result",
              answer("left_join(patients, results, by = 'ID')"),
              answer("right_join(patients, results, by = 'ID')"),
              answer("inner_join(patients, results, by = 'ID')", correct = TRUE),
              answer("full_join(patients, results, by = 'ID')", message = "The full join would include all lines from the two dataframes, even if they do not have a match in the other dataframe"),
              answer("anti_join(patients, results, by = 'ID')"),
              answer("semi_join(patients, results, by = 'ID')"),
              allow_retry = TRUE),


     question("To keep all the patients from the results dataframe for which we have age and sex data (but not add those columns to the results dataframe)",
              answer("left_join(results, patients, by = 'ID')"),
              answer("right_join(patients, results, by = 'ID')"),
              answer("inner_join(results, patients, by = 'ID')", message = "You are close. But an inner join in this case would import the columns from the patients dataframe, which we do not want"),
              answer("full_join(results, patients,  by = 'ID')"),
              answer("anti_join(patients, results, by = 'ID')"),
              answer("semi_join(results, patients, by = 'ID')", correct = TRUE),
              allow_retry = TRUE), 


     question("To get the list of patients for whom we have age and sex, but no test result",
              answer("left_join(patients, results, by = 'ID')"),
              answer("right_join(patients, results, by = 'ID')"),
              answer("inner_join(patients, results, by = 'ID')"),
              answer("full_join(patients, results, by = 'ID')"),
              answer("anti_join(patients, results, by = 'ID')", correct = TRUE),
              answer("semi_join(results, patients, by = 'ID')"),
              allow_retry = TRUE),


     question("Which of these would return only the rows found in patients?",
              answer("inner_join(patients, results, by = 'ID')", correct = TRUE),
              answer("full_join(patients, results, by = 'ID')", correct = FALSE),
              answer("left_join(patients, results, by = 'ID')", correct = TRUE),
              allow_retry = TRUE),


     question("Which of these would use NA to fill-in values in rows that did not match?",
              answer("full_join(patients, results, by = 'ID')", correct = TRUE),
              answer("semi_join(patients, results, by = 'ID')", correct = FALSE),
              answer("anti_join(patients, results, by = 'ID')", correct = FALSE),
              answer("inner_join(patients, results, by = 'ID')", correct = FALSE),
              allow_retry = TRUE)
)

Joining

Syntax

The syntax of the join functions is always the same:

The first data frame listed is the "baseline"
The second data frame listed is joining to the "baseline"

XXX_join(dataframe1,      # "baseline" dataframe
         dataframe2,      # other dataframe
         by = "column")   # column used to match rows


# Or, with a pipe:
dataframe1 %>%                # "baseline" dataframe
     XXX_join(dataframe2,     # other dataframe
              by = "column")  # columns used to match rows

If you want to match based on two or more columns, you can list them within a vector c(), separated by commas. Rows will only match to each other if the values in all these columns are exactly the same. In the join below, both data frames have the columns age and gender, and rows are matched if they have the same values in both these columns.

XXX_join(dataframe1, dataframe2,      
         by = c("age", "gender")   # columns used to match rows

Many times, the names of comparable columns in the two data frames are different. For example, ages might be recorded in column age in one data frame, and recorded in the column Ages in the second data frame. You can tell the join both of the names by using the = equals symbol within the vector c(). This join matches rows when the value in age in the first data frame is the same as the value in Ages in the second data frame.

XXX_join(dataframe1, dataframe2,      
         by = c("age" = "Ages")   # columns with different column names

Finally, here is an example of both syntaxes together. This command matches rows when:

The value in age in the first data frame is the same as the value in Ages in the second data frame, AND
The value in gender in the first data frame is the same as the value in Sex in the second data frame, AND
The value in city is the first data frame is the same as the value in city in the second data frame

XXX_join(dataframe1, dataframe2,      
         by = c("age"    = "Ages",  # different column names
                "gender" = "Sex",   # different column names
                "city")             # both data frames have column "city"

Join hospital data

Now that we have a hosp hospital linelist, let's join it into the surv surveillance linelist.

Inspect data before joining

It is critically important to know your datasets before joining them together. This prepares you to anticipate what will happen when you join.

There is so much that can potentially go wrong when joining (and hidden from view!) that you must know things in advance such as:

The dimensions (number of rows and columns) of your datasets
Whether there are duplicates in either of the datasets
Whether the identifiers are written similarly in both datasets
How the contents of the two datasets overlap
Which column names are present in both datasets

Review dimensions

Use the function dim() on the two data frames. This is a shortcut to return the number of rows and the number of columns. Alternatively, you can use nrow() and ncol() commands individually.

Check the dimensions (number of lines and columns) of the two data frames.

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

dim(surv)
dim(hosp)

Review duplicates

Now, it would be good to understand whether there are duplicated identifiers in the surv or the hosp data frames. Recall that duplicates may result in joins that increase the number of rows in the baseline data frame.

Using your knowledge from the module on summary tables, can you write a command that shows the surv identifiers (case_id) that appear more than once.

r fontawesome::fa("lightbulb", fill = "gold") Click to read a hint

You can pipe the surv dataset into count(case_id) to produce a table counts for all case_id values in surv. Then pipe to filter() and specify a logic criteria so that only those which appear more than once remain.

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

surv %>% 
  count(case_id) %>% 
  filter(n > 1)

Now do the same for the data frame hosp, using its identifier column ID.

So we know that there are some patients who have several rows in the surv file. Ideally we would go and investigate these duplicates to see if they correspond to different visits, or if some patients were wrongly entered twice, but since our focus is on joining, let's keep them all for now.

You may ask: why were these duplicates not removed during the data cleaning steps? The answer is that we used distinct() which by default removes rows that are entirely duplicates of another row. These rows we have identified here have duplicate case_id, but different values in other columns.

Review identifier values

Before joining, it is extremely important to consider whether the column(s) used to match the two data frames are clean.

The join functions from {dplyr} are using exact matching: the values need to be exactly the same in both data frames to be matched.

Note: There are packages that perform probabilistic ("fuzzy") matching, but we will not see them today (refer to the chapter on joining in the Epi R Handbook).

In our case, do we know if the patient IDs were entered in the same way in both data frames? If data comes from a clean SQL database and is a primary key, probably. With Excel data entered by many different people in emergency settings, maybe not. In general, it is a good rule to scrutinize the columns you want to use as identifier/keys to join your data frames.

Spend a bit of time looking at the structure of the values in the surv column case_id. Do they follow the same pattern?

Let's look together at a sample of IDs from both data frames. These commands arrange both data frames by their identifier column, then return only that column with pull(), and then take only the first 10 values.

# for each dataset, sort and print the first 10 identifier values

# for the surveillance dataset
surv %>% arrange(case_id) %>% pull(case_id) %>% head(10)

Note that if you see the above result appearing on two lines, this is not because of any difference in the values. There are 10 values produced by head(10), but because the window is narrow they have been wrapped around. Note the numbers in brackets (e.g. [9]) that tell you which value is starting the new line.

# for the hospital dataset
hosp %>% arrange(ID) %>% pull(ID) %>% head(10)

They seem to match. With very dirty data, you would want to inspect your IDs more, especially the ones who don't match, in case several patterns of IDs were used within the same file.

Review overlap

Now, to understand how the two data frames relate to each other, perform an anti_join(), first using hosp as the "baseline" data frame.

r fontawesome::fa("lightbulb", fill = "gold") Click to read a hint

Use anti_join(dataframe1, dataframe2, by = ). Recall that the identifier columns in the two data frames are named differently, so you must tell the join that they should be compared with an =, and within a vector c().

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

anti_join(hosp, surv, by = c("ID" = "case_id"))

We can see that there are some patients in hosp that are not present in surv. Now perform the opposite anti_join(), using hosp as the baseline. Are there any patients in surv not present in hosp?

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

Did you remember to switch the order of the by = as well as the data frames?

anti_join(surv, hosp, by = c("case_id" = "ID"))

So we see that there are 0 rows from that second anti join. The surv data frame has all the patients that are in the hosp data frame, and that the hosp data frame is a subset of the surv patients.

quiz(caption = "Quiz - Data inspections",
    question("If you were to conduct a full_join() with surv as the baseline and hosp joined into it, how many additional rows would you expect to appear in the joined data frame?",
    allow_retry = T,
    answer("0", message = "We are asking about a 'full' join."),
    answer("17"),
    answer("129", correct = T, message = "Remember, a full join includes ALL rows from both data frames."),
    answer("5 rows would be removed")
  )
)

Review column names

It is important to know what the column names are in each data frame.

Review again the column names of each data frame:

surv %>% colnames()

hosp %>% colnames()

quiz(caption = "Quiz - Compare columns",
  question("Which columns in the hosp linelist also exist in the surv linelist?",
    allow_retry = T,
    answer("hospital", correct = T),
    answer("date_outcome"),
    answer("age", correct = T),
    answer("age_unit", correct = T),
    answer("gender", correct = T)
  ),
  question("Which columns exist only in the hosp linelist?",
    allow_retry = T,
    answer("date_hospitalisation", correct = T),
    answer("age"),
    answer("time_admission", correct = T),
    answer("outcome", correct = T),
    answer("date_outcome", correct = T)
  )
)

Let's do an experiment: run a command in your RStudio to join the two data frames:

Use a left_join()
Use surv as the baseline/reference data frame
Use c("case_id" = "ID") as the link between the data frames
Save the result as the object test_join

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

test_join <- surv %>% 
  left_join(hosp, by = c("case_id" = "ID"))

What are the dimensions of the new data frame?

Run dim(test_join) and compare with the previous dimensions of surv.

r fontawesome::fa("eye", fill = "darkblue")

Now, click on test_join in your R Environment to view it closely. See the columns near the left side that were part of the original surv data frame, and then scroll to the right to see the new columns that have been added, from the hosp data frame.

What do you notice about the columns gender, age, age_unit, and hospital?

quiz(caption = "Quiz - Joins",
    question("What happened to these columns?",
    allow_retry = T,
    answer("They were removed"),
    answer("They were all moved to the left end of the data frame"),
    answer("They appear twice - once with '.x' appended, and again with '.y' appended", correct = T)
  )
)

These column names existed in both data frames and were NOT used in by = to match rows, so when the data were joined, R had to differentiate between the duplicate columns by adding the suffixes .x and .y.

Are the values the same in both gender.x and gender.y How can you examine this?

Since they are now in the same data frame, you can cross-tabulate the two columns.

# cross tabulate the two columns in the joined dataset
test_join %>% 
  tabyl(gender.x, gender.y)

You can see that gender.x (from the original surv dataset, which was cleaned by you in a previous module) has the values "female" and "male".

In contrast, gender.y, which came from the hospital linelist, has uncleaned values "f", "m", "", and NA.

What to do with this situation? This is a classic scenario when trying to combine datasets - what to do with redundant columns?

But are they truly redundant columns? You will need to ask yourself these questions:

Do you want to keep the gender, age, hospital, etc values from the hospital linelist - for records?
If the age or gender for a patient differ between surv and hosp, which one will you consider to be correct?

These are questions that only you can answer about your dataset.

You have three options:

1) Perform the join as above, to keep all the columns
* This will create the .x and .y suffices, which you can edit with rename()

2) Set all of the redundant columns as identifiers in the join
* This keep only the columns of the baseline data frame. The command would look like this:

test <- full_join(surv, hosp,
          by = c(
            "case_id" = "ID",
            "age",                    # the other columns are
            "age_unit",               # named the same in both 
            "gender",
            "hospital"))

3) Remove the redundant columns before the merge
* If you do not care about retaining the redundant columns from the second data frame, simply remove them with select() prior to the join.

For this exercise, write code that uses *Option 3 above*

1) Go to the bind_rows() command that creates hosp, and pipe to a select command that leaves only "ID" and the new columns.
2) Take advantage of the re-naming functionality of select(), to rename "ID" to "case_id", so that the join command will be more simple.
3) Re-run the command

hosp <- bind_rows(hosp_central, hosp_port, hosp_military,
                  hosp_smmh, hosp_other, hosp_missing) %>% 

  # select specific columns from hosp, and re-name ID as case_ID
  select(
    case_id = ID,          # select and rename
    date_hospitalisation,  # select
    time_admission,        # select
    date_outcome,          # select
    outcome)               # select

4) Now, run a left_join() on surv and hosp, using "case_id" as the common identifier, and add it to your script in the "Joining" section.

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

# Join the two data frames with a left join
combined <- left_join(surv, hosp, by = "case_id")

Join lab data

Now we need to join the data frame of laboratory testing results.

Complete the following steps to join the lab data frame into the combined data frame

Import and clean the lab data

In the importing section of your script (near the top), write and run a command to import the XLSX file located at "data/raw/lab_results_20141201.xlsx", clean the column names using clean_names(), and save it as lab in your R Environment.

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

# Import lab data
# (place this near the top of your script, in the importing section)
lab <- import(here("data", "raw", "lab_results_20141201.xlsx")) %>% 
  clean_names()

Don't forget to use the correct file extension for the lab file (.xlsx)!

Join the lab data

In the Joining data section of your script (below your hosp join), write and run commands to compare the combined and lab data frames, and then conduct a left_join() to add the lab column ct_blood to the combined data frame, matching on case_id.

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

# Join the two data frames with a left-join
combined <- left_join(combined, lab, by = "case_id")

r fontawesome::fa("eye", fill = "darkblue") Verify manually in RStudio viewer that the join occurred correctly.

Add case investigation data

Finally, we need to add new data from case investigations. Teams have called and conducted home visits to do epidemiological detective work. They have recorded the likely source cases, dates of infection, and context of infection for some of the cases.

Complete the following steps to join the case investigations data frame into the combined data frame

1) In the importing section of your script (near the top), write and run a command to import the XLSX file located at "data/raw/case_investigations_20141201.xlsx" and save it as investigations.
* Pipe the data into a cleaning command in which you select only the column(s) necessary for the join and new columns to be added to combined

2) In the Joining data section of your script, write and run commands to:
* Compare the combined and investigations data frames
* Conduct a left_join() to add the investigations columns to combined

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

# Import lab data 
# (add to the import section of your R script)
investigations <- import(here("data", "raw", "case_investigations_20141201.xlsx")) %>% 
 # remove unnecessary columns  
 select(-c(age, age_unit, gender))

# Join the two data frames with a left-join
# (add to the joining section of your R script)
combined <- left_join(combined, investigations, by = "case_id")

Don't forget to use the correct file extension for the file (.xlsx)!

r fontawesome::fa("eye", fill = "darkblue") Verify manually that the join occurred correctly.

Post-join cleaning

Now that we have these new columns, we should do some final cleaning on the combined data frame, after the joins.

Column names and classes

Now that we have new columns in our data frame, we should write some code to ensure that the new columns names have correct class.

Write a short pipe chain to address the following issues in combined:

Use clean_names() to ensure all the column names have proper syntax
Use mutate() to properly classify the three new date columns as class date, using the {lubridate} functions you learned earlier
Redefine date_hospitalisation as class date
Redefine date_outcome as class date
Redefine date_infection as class date
Use mutate() to replace empty characters "" with NA, using na_if(), for the columns outcome and hospital (see your cleaning pipe chain for an example)

quiz(caption = "Quiz - Date conversions",
  question("Which {lubridate} function should you use to convert date_hospitalisation to class date?",
    allow_retry = T,
    answer("mdy()", correct = T),
    answer("ymd()", message = "No, you need to choose the function that aligns with the format of the data as written PRIOR to use of the function."),
    answer("dmy()", message = "No, you need to choose the function that aligns with the format of the data as written PRIOR to use of the function."),
    answer("imho()", message = "In my humble opinion, this is not a real function.")
  ),
    question("Which {lubridate} function should you use to convert date_outcome to class date?",
    allow_retry = T,
    answer("mdy()", correct = T),
    answer("ymd()", message = "No, you need to choose the function that aligns with the format of the data as written PRIOR to use of the function."),
    answer("dmy()", message = "No, you need to choose the function that aligns with the format of the data as written PRIOR to use of the function."),
    answer("imho()", message = "In my humble opinion, this is not a real function.")
  ),
    question("Which {lubridate} function should you use to convert date_infection to class date?",
    allow_retry = T,
    answer("mdy()", message = "No, you need to choose the function that aligns with the format of the data as written PRIOR to use of the function."),
    answer("ymd()", correct = T),
    answer("dmy()", message = "No, you need to choose the function that aligns with the format of the data as written PRIOR to use of the function."),
    answer("imho()", message = "In my humble opinion, this is not a real function.")
  )
)

r fontawesome::fa("lightbulb", fill = "gold") Click to read a hint

Use clean_names() first to standardize the syntax. To establish date class, choose between mdy(), ymd(), or dmy() to place within the mutate() function, based on the current format of the date (look at the data frame) for example: mutate(date_outcome = mdy(date_outcome))

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

# Clean the new columns that have been joined to 'combined'
combined <- combined %>% 

  # convert all column names to lower case and remove spaces
  clean_names() %>% 

  # covert new columns to class date
  mutate(date_hospitalisation = mdy(date_hospitalisation),
         date_outcome         = mdy(date_outcome),
         date_infection       = ymd(date_infection)) %>% 

  # clean outcome and hospital missings
  mutate(outcome = na_if(outcome, ""),
         hospital = na_if(hospital, ""))

You have some flexibility on where to put this data cleaning:

You could do this cleaning to each of the lab and investigations data frames when they are imported
You could have a part of your "Cleaning" section in which you clean these data frames
You can do this final cleaning once you join, as we have done in this exercise

Clean up script

At this point, we now have joins and several new import commands.

It is important to keep our R script orderly and in a logical flow:

1) Load packages
3) Import data
4) Clean primary data frame
5) Join other data (and perform any residual cleaning) 6) Tables and visualizations

Take a moment to ensure that your script aligns with the above, and that you can run it from top to bottom without any errors.

End

Congratulations! You have finished this exercise on joining data.

We now have a much more complete dataset to work with, including columns like outcome, date_hospitalisation, blood_ct, and source.

Remember to save your R script before leaving RStudio.

If you want to learn a bit more about shortcuts in R Studio and other helpful tips, you can go on to the next extra topic.

Extras: Tips and tricks

File paths

R Studio can help you quickly and correctly specify file paths by using Tab to auto-complete.

Start typing a file path (in quotes) and press Tab on your keyboard. R Studio will list the available sub-folders and files for that location. The example below shows the result of pressing Tab after typing "data/". Click the desired sub-folder or file or navigate to it with your keyboard and press enter.

knitr::include_graphics("images/filepaths1.png", error = F)

You can keep doing this until reaching your final destination:

knitr::include_graphics("images/filepaths2.PNG", error = F)

Note that the full file path is within one set of quotation marks, and has forward slashes in between the folders and files. This is how paths are usually written, although the direction of the slash can change depending on your operating system. When using the {here} package, we have been writing folders and files in their own quotation marks and separated by commas. The {here} package evaluates this into a full file path with slashes in the correct direction.

Keyboard shortcuts

Some very useful keyboard shortcuts are below - see more examples here: https://epirhandbook.com/en/r-basics.html#keyboard-shortcuts.

| Windows/Linux | Mac | Action | |---------------------|---------------------|-------------------------------| | Esc | Esc | Interrupt current command (useful if code being run is stuck) | | Ctrl+s | Cmd+s | Save (script) | | Tab | Tab | Auto-complete | | Ctrl + Enter | Cmd + Enter | Run current line(s)/selection of code | | Ctrl + Shift + C | Cmd + Shift + c | comment/uncomment the highlighted lines | | Alt + - | Option + - | Insert \<- | | Ctrl + Shift + m | Cmd + Shift + m | Insert %>% | | Ctrl + Alt + i | Cmd + Shift + r | Insert code chunk (into R Markdown) | | Ctrl + f | Cmd + f | Find and replace in current script | | Ctrl + Shift + f | Cmd + Shift + f | Find in files (search/replace across many scripts) |

appliedepi/introexercises documentation built on April 22, 2024, 1:01 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

appliedepi/introexercises

In appliedepi/introexercises:

Introduction to R for Applied Epidemiology and Public Health

Welcome

Joining data

Format

Getting Help

Quiz questions

Icons

License

Learning objectives

Prepare

Binding rows

Hospital datasets

Joins review

Review of join types

Schematic

Text descriptions

Mutating joins: add new variables to a data frame

Filtering joins: filter a data frame based on another one

Animations

Further resources

Show your understanding

Joining

Syntax

Join hospital data

Inspect data before joining

Review dimensions

Review duplicates

Review identifier values

Review overlap

Review column names

Join lab data

Import and clean the lab data

Join the lab data

Add case investigation data

Post-join cleaning

Column names and classes

Clean up script

End

Extras: Tips and tricks

File paths

Keyboard shortcuts

R Package Documentation

Browse R Packages

We want your feedback!