# load packages ---------------------------------------------------------------- library(introexercises) library(learnr) library(gradethis) library(dplyr) library(flair) library(ggplot2) library(lubridate) library(fontawesome) library(janitor) library(kableExtra) # library(RMariaDB) # connect to sql database ## set options for exercises and checking --------------------------------------- ## Define how exercises are evaluated gradethis::gradethis_setup( ## note: the below arguments are passed to learnr::tutorial_options ## set the maximum execution time limit in seconds exercise.timelimit = 60, ## set how exercises should be checked (defaults to NULL - individually defined) # exercise.checker = gradethis::grade_learnr ## set whether to pre-evaluate exercises (so users see answers) exercise.eval = FALSE ) # ## event recorder --------------------------------------------------------------- # ## see for details: # ## https://pkgs.rstudio.com/learnr/articles/publishing.html#events # ## https://github.com/dtkaplan/submitr/blob/master/R/make_a_recorder.R # # ## connect to your sql database # sqldtbase <- dbConnect(RMariaDB::MariaDB(), # user = Sys.getenv("userid"), # password = Sys.getenv("pwd"), # dbname = 'excersize_log', # host = "144.126.246.140") # # # ## define a function to collect data # ## note that tutorial_id is defined in YAML # ## you could set the tutorial_version too (by specifying version:) but use package version instead # recorder_function <- function(tutorial_id, tutorial_version, user_id, event, data) { # # ## define a sql query # ## first bracket defines variable names # ## values bracket defines what goes in each variable # event_log <- paste("INSERT INTO responses ( # tutorial_id, # tutorial_version, # date_time, # user_id, # event, # section, # label, # question, # answer, # code, # correct) # VALUES('", tutorial_id, "', # '", tutorial_version, "', # '", format(Sys.time(), "%Y-%M%-%D %H:%M:%S %Z"), "', # '", Sys.getenv("SHINYPROXY_PROXY_ID"), "', # '", event, "', # '", data$section, "', # '", data$label, "', # '", paste0('"', data$question, '"'), "', # '", paste0('"', data$answer, '"'), "', # '", paste0('"', data$code, '"'), "', # '", data$correct, "')", # sep = '') # # # Execute the query on the sqldtbase that we connected to above # rsInsert <- dbSendQuery(sqldtbase, event_log) # # } # # options(tutorial.event_recorder = recorder_function)
# hide non-exercise code chunks ------------------------------------------------ knitr::opts_chunk$set(echo = FALSE) # Data prep -------------------------------------------------------------------- # Import surv <- rio::import(system.file("dat/old_version/surveillance_linelist_clean_20141201.rds", package = "introexercises")) hospitals <- bind_rows(rio::import(system.file("dat/old_version/hospitals/20141201_hosp_port.csv", package = "introexercises")), rio::import(system.file("dat/old_version/hospitals/20141201_hosp_central.csv", package = "introexercises"))) %>% select(hospital, date_hospitalisation, outcome, date_outcome) %>% janitor::clean_names() ## NOT USED ANYMORE # geo_data <- rio::import(system.file("dat/pop/sle_admpop_adm3_2020.csv", package = "introexercises")) %>% # select(-c(Female, Male), -starts_with("T")) ## NOT USED ANYMORE Make the hospital information dataframe # hospital_dirty = data.frame( # hosp_name = c("central hospital", "military", "military", "port", "St. Mark's", "ignace", "sisters"), # catchment_pop = c(1950280, 40500, 10000, 50280, 12000, 5000, 4200), # level = c("Tertiary", "Secondary", "Primary", "Secondary", "Secondary", "Primary", "Primary") # ) # # hospital_clean <- hospital_dirty %>% # mutate( # hosp_name = case_when( # # criteria # new value # hosp_name == "military" ~ "Military Hospital", # hosp_name == "port" ~ "Port Hospital", # hosp_name == "St. Mark's" ~ "St. Mark's Maternity Hospital (SMMH)", # hosp_name == "central hospital" ~ "Central Hospital", # TRUE ~ hosp_name # ) # ) # Create smaller linelists for the easier exemples patients <- tibble(ID = c("patient_1", "patient_2", "patient_3", "patient_4", "patient_10"), sexe = c("F", "M", "M", "F", "F"), age = c(5, 10, 2, 15, 14), age_unit = c("Year", "Year", "Year", "Year", "Year")) results <- tibble(ID = c("patient_1", "patient_2", "patient_4", "patient_5", "patient_6"), test_result = c("positive", "negative", "negative", "positive", "positive")) df1 <- tibble(ID = c("patient_1", "patient_2", "patient_3"), sexe = c("F", "M", "M")) df2 <- tibble(ID = c("patient_1", "patient_1", "patient_1", "patient_2", "patient_4"), date_test = as.Date(c("2021-12-01", "2021-12-26", "2022-01-05", "2021-12-18", "2022-01-01")), test_result = c("positive", "negative", "negative", "positive", "positive")) hosp_central <- rio::import(system.file("dat/old_version/hospitals/20141201_hosp_central.csv", package = "introexercises")) hosp_military <- rio::import(system.file("dat/old_version/hospitals/20141201_hosp_military.csv", package = "introexercises")) hosp_port <- rio::import(system.file("dat/old_version/hospitals/20141201_hosp_port.csv", package = "introexercises")) hosp_smmh <- rio::import(system.file("dat/old_version/hospitals/20141201_hosp_smmh.csv", package = "introexercises")) hosp_other <- rio::import(system.file("dat/old_version/hospitals/20141201_hosp_other.csv", package = "introexercises")) hosp_missing <- rio::import(system.file("dat/old_version/hospitals/20141201_hosp_missing.csv", package = "introexercises")) lab <- rio::import(system.file("dat/old_version/lab_results_20141201.xlsx", package = "introexercises")) investigations <- rio::import(system.file("dat/old_version/case_investigations_20141201.xlsx", package = "introexercises"))
# hide non-exercise code chunks ------------------------------------------------ knitr::opts_chunk$set(echo = FALSE)
Welcome to the live course "Introduction to R for applied epidemiologists", offered by Applied Epi - a nonprofit organisation that offers open-source tools, training, and support to frontline public health practitioners.
knitr::include_graphics("images/logo.png", error = F)
This exercise focuses on joining data frames.
This exercise will guide you through a set of tasks.
You should perform these tasks in RStudio and on your local computer.
There are several ways to get help:
1) Look for the "helpers" (see below)
2) Ask your live course instructor/facilitator for help
3) Ask a colleague or other participant in the course for tips
4) Post a question in Applied Epi Community in the category for questions about Applied Epi Training
Here is what those "helpers" will look like:
r fontawesome::fa("lightbulb", fill = "gold")
Click to read a hint
Here you will see a helpful hint!
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
linelist %>% filter( age > 25, district == "Bolo" )
Here is more explanation about why the solution works.
Please complete the quiz questions that you encounter throughout the tutorial. Answering will help you to comprehend the material, and will also help us to improve the exercises for future students.
To practice, please answer the following questions:
quiz( question_radio("When should I view the red 'helper' code?", answer("After trying to write the code myself", correct = TRUE), answer("Before I try coding", correct = FALSE), correct = "Reviewing best-practice code after trying to write yourself can help you improve", incorrect = "Please attempt the exercise yourself, or use the hint, before viewing the answer." ) )
question_numeric( "How anxious are you about beginning this tutorial - on a scale from 1 (least anxious) to 10 (most anxious)?", answer(10, message = "Try not to worry, we will help you succeed!", correct = T), answer(9, message = "Try not to worry, we will help you succeed!", correct = T), answer(8, message = "Try not to worry, we will help you succeed!", correct = T), answer(7, message = "Try not to worry, we will help you succeed!", correct = T), answer(6, message = "Ok, we will get there together", correct = T), answer(5, message = "Ok, we will get there together", correct = T), answer(4, message = "I like your confidence!", correct = T), answer(3, message = "I like your confidence!", correct = T), answer(2, message = "I like your confidence!", correct = T), answer(1, message = "I like your confidence!", correct = T), allow_retry = TRUE, correct = "Thanks for sharing. ", min = 1, max = 10, step = 1 )
You will see these icons throughout the exercises:
Icon |Meaning
------|--------------------
r fontawesome::fa("eye", fill = "darkblue")
|Observe
r fontawesome::fa("exclamation", fill = "red")
|Alert!
r fontawesome::fa("pen", fill = "brown")
|An informative note
r fontawesome::fa("terminal", fill = "black")
|Time for you to code!
r fontawesome::fa("window-restore", fill = "darkgrey")
|Change to another window
r fontawesome::fa("bookmark", fill = "orange")
|Remember this for later
Applied Epi Incorporated, 2022
This work is licensed by Applied Epi Incorporated under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License.
Please email contact@appliedepi.org with questions about the use of these materials for academic courses and epidemiologist training programs.
In this exercise you will:
Open your R project and R script.
Run the code from the top of your script:
surv
Below your cleaning code (pipe chain), create a new script section for "Joining data".
The new header should look something like this:
# Joining data ----------------------------------------------
Combining datasets can sometimes be as simple as combining the rows of data frames that have exactly the same columns. For example, if you have multiple study sites, or clinical sites, that are sending you data in exactly the same format.
r fontawesome::fa("window-restore", fill = "darkgrey")
In our case study Ebola outbreak, there are six datasets from various hospitals in the "data/raw/hospitals" subfolder of the "intro_course" R project. In Module 1, you should have re-named them to the following standardized syntax (if not, please do so now, or replace them using the "backup" renamed files in the "data/raw/hospitals/backup" folder):
Each file corresponds with a particular hospital (Central, Military, Port, St. Mark's) and there are datasets containing record for patients from other hospitals, or for whom the hospital is missing.
Near the top of your R script (in the section for importing data), write and run some commands to import the hospital linelist files from the "data/raw/hospitals" subfolder. Name the data frames in R as hosp_central
, hosp_port
, hosp_missing
, etc.
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
As a reminder, you should write these commands in the importing section of your script, for optimal organization.
hosp_central <- import(here("data", "raw", "hospitals", "20141201_hosp_central.csv")) hosp_military <- import(here("data", "raw", "hospitals", "20141201_hosp_military.csv")) hosp_port <- import(here("data", "raw", "hospitals", "20141201_hosp_port.csv")) hosp_smmh <- import(here("data", "raw", "hospitals", "20141201_hosp_smmh.csv")) hosp_other <- import(here("data", "raw", "hospitals", "20141201_hosp_other.csv")) hosp_missing <- import(here("data", "raw", "hospitals", "20141201_hosp_missing.csv"))
r fontawesome::fa("eye", fill = "darkblue")
Open the data frames in R, and observe the column names
quiz(caption = "Quiz - Hospital linelists", question("Are the column names the same in each of the six hospital linelists?", allow_retry = T, answer("Yes", correct = T), answer("No") ) )
Because the columns have the same names, you should be able to simply "bind" the rows together - digitally stacking the 6 data frames on top of one another.
This is best done with the function bind_rows()
, from the package {dplyr}. This function is flexible in that the columns do not even have to be in the same order for the data frames to be bound together. The function will align the columns automatically - the columns only have to have the same names.
Here is the syntax - it is quite simple:
bind_rows(df1, df2, df3, df4, ...)
Above, "dfX" would be replaced by the names of the data frames that you want to combine.
In the "Joining data" section of your script, write and run a command that binds together the six hospital datasets, and saves the result as hosp
.
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
hosp <- bind_rows(hosp_central, hosp_port, hosp_military, hosp_smmh, hosp_other, hosp_missing)
The order of the data frames is not important, for the purposes of this example.
r fontawesome::fa("eye", fill = "darkblue")
Now click open and view the hosp
data frame that you have created. Does it look correct?
hospital
with various values? Other times, you want to combine data using joins, not simply binding more rows. Let us take a moment to review the different types of joins:
Take a few minutes to review the information below. We give it to you as text descriptions, as a schematic, and as animations (source).
To note, the most common joins in public health contexts are left joins and anti joins.
knitr::include_graphics("images/summary_joins.png", error = F)
Left join: keep all rows from the baseline data frame; add columns and matched rows from the second data frame. For unmatched records, the columns of the baseline data frame are filled as NA
.
Right join: keep only matched rows from the baseline data frame, and add columns and unmatched rows from the second data frame. For unmatched records, the columns of the second data frame are filled as NA
. Another way to think about this is: it is a left join in which the second-listed data frame is the baseline.
Full join: combine two full data frames, keeps all rows and columns.
Inner join: take the intersection of two data frames, keeps only rows present in both data frames
Anti join: keep all lines from baseline data frame without matches in the second data frame; no columns are added from the second data frame.
Semi join: keep all lines from baseline data frame with matches in the second data frame; no columns are added from the second data frame.
knitr::include_graphics("images/left-join.gif", error = F)
knitr::include_graphics("images/right-join.gif", error = F)
knitr::include_graphics("images/full-join.gif", error = F)
knitr::include_graphics("images/inner-join.gif", error = F)
knitr::include_graphics("images/anti-join.gif", error = F)
knitr::include_graphics("images/semi-join.gif", error = F)
Tidyverse site presentation of mutating joins and filtering joins: some explanations + examples
Tidyverse reference page on mutating joins: here you will find the full list of arguments for the joins functions
Epi R Handbook page on joins
Review these two mini dataframes containing patient data (named patients
) and lab results (named results
).
patients %>% kbl() %>% kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover", "condensed")) results %>% kbl() %>% kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover", "condensed"))
Now answer the quiz questions below.
quiz(caption = "Which join would you use:", question("To add the age and sex for all patients into the lab database, if available there", answer("left_join(patients, results, by = 'ID')"), answer("right_join(patients, results, by = 'ID')", correct = TRUE), answer("inner_join(patients, results, by = 'ID')", message = "An inner join would not include patients from the results database without a match in the patient database. Rows for patient 5 and 6 would be lost"), answer("full_join(patients, results, by = 'ID')", message = "An full join would bring in rows with no results from the patient database and add unwanted NA. Here we want to use the results dataframe as the reference."), answer("anti_join(patients, results, by = 'ID')"), answer("semi_join(patients, results, by = 'ID')"), allow_retry = TRUE), question("To get a dataframe with patients for which we have data for age, sex and test result", answer("left_join(patients, results, by = 'ID')"), answer("right_join(patients, results, by = 'ID')"), answer("inner_join(patients, results, by = 'ID')", correct = TRUE), answer("full_join(patients, results, by = 'ID')", message = "The full join would include all lines from the two dataframes, even if they do not have a match in the other dataframe"), answer("anti_join(patients, results, by = 'ID')"), answer("semi_join(patients, results, by = 'ID')"), allow_retry = TRUE), question("To keep all the patients from the results dataframe for which we have age and sex data (but not add those columns to the results dataframe)", answer("left_join(results, patients, by = 'ID')"), answer("right_join(patients, results, by = 'ID')"), answer("inner_join(results, patients, by = 'ID')", message = "You are close. But an inner join in this case would import the columns from the patients dataframe, which we do not want"), answer("full_join(results, patients, by = 'ID')"), answer("anti_join(patients, results, by = 'ID')"), answer("semi_join(results, patients, by = 'ID')", correct = TRUE), allow_retry = TRUE), question("To get the list of patients for whom we have age and sex, but no test result", answer("left_join(patients, results, by = 'ID')"), answer("right_join(patients, results, by = 'ID')"), answer("inner_join(patients, results, by = 'ID')"), answer("full_join(patients, results, by = 'ID')"), answer("anti_join(patients, results, by = 'ID')", correct = TRUE), answer("semi_join(results, patients, by = 'ID')"), allow_retry = TRUE), question("Which of these would return only the rows found in patients?", answer("inner_join(patients, results, by = 'ID')", correct = TRUE), answer("full_join(patients, results, by = 'ID')", correct = FALSE), answer("left_join(patients, results, by = 'ID')", correct = TRUE), allow_retry = TRUE), question("Which of these would use NA to fill-in values in rows that did not match?", answer("full_join(patients, results, by = 'ID')", correct = TRUE), answer("semi_join(patients, results, by = 'ID')", correct = FALSE), answer("anti_join(patients, results, by = 'ID')", correct = FALSE), answer("inner_join(patients, results, by = 'ID')", correct = FALSE), allow_retry = TRUE) )
The syntax of the join functions is always the same:
XXX_join(dataframe1, # "baseline" dataframe dataframe2, # other dataframe by = "column") # column used to match rows # Or, with a pipe: dataframe1 %>% # "baseline" dataframe XXX_join(dataframe2, # other dataframe by = "column") # columns used to match rows
If you want to match based on two or more columns, you can list them within a vector c()
, separated by commas. Rows will only match to each other if the values in all these columns are exactly the same. In the join below, both data frames have the columns age
and gender
, and rows are matched if they have the same values in both these columns.
XXX_join(dataframe1, dataframe2, by = c("age", "gender") # columns used to match rows
Many times, the names of comparable columns in the two data frames are different. For example, ages might be recorded in column age
in one data frame, and recorded in the column Ages
in the second data frame. You can tell the join both of the names by using the = equals symbol within the vector c()
. This join matches rows when the value in age
in the first data frame is the same as the value in Ages
in the second data frame.
XXX_join(dataframe1, dataframe2, by = c("age" = "Ages") # columns with different column names
Finally, here is an example of both syntaxes together. This command matches rows when:
age
in the first data frame is the same as the value in Ages
in the second data frame, AND gender
in the first data frame is the same as the value in Sex
in the second data frame, AND city
is the first data frame is the same as the value in city
in the second data frame XXX_join(dataframe1, dataframe2, by = c("age" = "Ages", # different column names "gender" = "Sex", # different column names "city") # both data frames have column "city"
Now that we have a hosp
hospital linelist, let's join it into the surv
surveillance linelist.
It is critically important to know your datasets before joining them together. This prepares you to anticipate what will happen when you join.
There is so much that can potentially go wrong when joining (and hidden from view!) that you must know things in advance such as:
Use the function dim()
on the two data frames. This is a shortcut to return the number of rows and the number of columns. Alternatively, you can use nrow()
and ncol()
commands individually.
Check the dimensions (number of lines and columns) of the two data frames.
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
dim(surv) dim(hosp)
Now, it would be good to understand whether there are duplicated identifiers in the surv
or the hosp
data frames. Recall that duplicates may result in joins that increase the number of rows in the baseline data frame.
Using your knowledge from the module on summary tables, can you write a command that shows the surv
identifiers (case_id
) that appear more than once.
r fontawesome::fa("lightbulb", fill = "gold")
Click to read a hint
You can pipe the surv
dataset into count(case_id)
to produce a table counts for all case_id
values in surv
. Then pipe to filter()
and specify a logic criteria so that only those which appear more than once remain.
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
surv %>% count(case_id) %>% filter(n > 1)
Now do the same for the data frame hosp
, using its identifier column ID
.
So we know that there are some patients who have several rows in the surv
file. Ideally we would go and investigate these duplicates to see if they correspond to different visits, or if some patients were wrongly entered twice, but since our focus is on joining, let's keep them all for now.
You may ask: why were these duplicates not removed during the data cleaning steps? The answer is that we used distinct()
which by default removes rows that are entirely duplicates of another row. These rows we have identified here have duplicate case_id
, but different values in other columns.
Before joining, it is extremely important to consider whether the column(s) used to match the two data frames are clean.
The join
functions from {dplyr} are using exact matching: the values need to be exactly the same in both data frames to be matched.
Note: There are packages that perform probabilistic ("fuzzy") matching, but we will not see them today (refer to the chapter on joining in the Epi R Handbook).
In our case, do we know if the patient IDs were entered in the same way in both data frames? If data comes from a clean SQL database and is a primary key, probably. With Excel data entered by many different people in emergency settings, maybe not. In general, it is a good rule to scrutinize the columns you want to use as identifier/keys to join your data frames.
Spend a bit of time looking at the structure of the values in the surv
column case_id
. Do they follow the same pattern?
Let's look together at a sample of IDs from both data frames. These commands arrange both data frames by their identifier column, then return only that column with pull()
, and then take only the first 10 values.
# for each dataset, sort and print the first 10 identifier values # for the surveillance dataset surv %>% arrange(case_id) %>% pull(case_id) %>% head(10)
Note that if you see the above result appearing on two lines, this is not because of any difference in the values. There are 10 values produced by head(10)
, but because the window is narrow they have been wrapped around. Note the numbers in brackets (e.g. [9]) that tell you which value is starting the new line.
# for the hospital dataset hosp %>% arrange(ID) %>% pull(ID) %>% head(10)
They seem to match. With very dirty data, you would want to inspect your IDs more, especially the ones who don't match, in case several patterns of IDs were used within the same file.
Now, to understand how the two data frames relate to each other, perform an anti_join()
, first using hosp
as the "baseline" data frame.
r fontawesome::fa("lightbulb", fill = "gold")
Click to read a hint
Use anti_join(dataframe1, dataframe2, by = )
. Recall that the identifier columns in the two data frames are named differently, so you must tell the join that they should be compared with an =, and within a vector c()
.
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
anti_join(hosp, surv, by = c("ID" = "case_id"))
We can see that there are some patients in hosp
that are not present in surv
. Now perform the opposite anti_join(), using hosp
as the baseline. Are there any patients in surv
not present in hosp
?
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
Did you remember to switch the order of the by =
as well as the data frames?
anti_join(surv, hosp, by = c("case_id" = "ID"))
So we see that there are 0 rows from that second anti join. The surv
data frame has all the patients that are in the hosp
data frame, and that the hosp
data frame is a subset of the surv
patients.
quiz(caption = "Quiz - Data inspections", question("If you were to conduct a full_join() with surv as the baseline and hosp joined into it, how many additional rows would you expect to appear in the joined data frame?", allow_retry = T, answer("0", message = "We are asking about a 'full' join."), answer("17"), answer("129", correct = T, message = "Remember, a full join includes ALL rows from both data frames."), answer("5 rows would be removed") ) )
It is important to know what the column names are in each data frame.
Review again the column names of each data frame:
surv %>% colnames()
hosp %>% colnames()
quiz(caption = "Quiz - Compare columns", question("Which columns in the hosp linelist also exist in the surv linelist?", allow_retry = T, answer("hospital", correct = T), answer("date_outcome"), answer("age", correct = T), answer("age_unit", correct = T), answer("gender", correct = T) ), question("Which columns exist only in the hosp linelist?", allow_retry = T, answer("date_hospitalisation", correct = T), answer("age"), answer("time_admission", correct = T), answer("outcome", correct = T), answer("date_outcome", correct = T) ) )
Let's do an experiment: run a command in your RStudio to join the two data frames:
left_join()
surv
as the baseline/reference data frame c("case_id" = "ID")
as the link between the data frames test_join
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
test_join <- surv %>% left_join(hosp, by = c("case_id" = "ID"))
What are the dimensions of the new data frame?
Run dim(test_join)
and compare with the previous dimensions of surv
.
r fontawesome::fa("eye", fill = "darkblue")
Now, click on test_join
in your R Environment to view it closely. See the columns near the left side that were part of the original surv
data frame, and then scroll to the right to see the new columns that have been added, from the hosp
data frame.
What do you notice about the columns gender
, age
, age_unit
, and hospital
?
quiz(caption = "Quiz - Joins", question("What happened to these columns?", allow_retry = T, answer("They were removed"), answer("They were all moved to the left end of the data frame"), answer("They appear twice - once with '.x' appended, and again with '.y' appended", correct = T) ) )
These column names existed in both data frames and were NOT used in by =
to match rows, so when the data were joined, R had to differentiate between the duplicate columns by adding the suffixes .x
and .y
.
Are the values the same in both gender.x
and gender.y
How can you examine this?
Since they are now in the same data frame, you can cross-tabulate the two columns.
# cross tabulate the two columns in the joined dataset test_join %>% tabyl(gender.x, gender.y)
You can see that gender.x
(from the original surv
dataset, which was cleaned by you in a previous module) has the values "female" and "male".
In contrast, gender.y
, which came from the hospital linelist, has uncleaned values "f", "m", "", and NA.
What to do with this situation? This is a classic scenario when trying to combine datasets - what to do with redundant columns?
But are they truly redundant columns? You will need to ask yourself these questions:
surv
and hosp
, which one will you consider to be correct? These are questions that only you can answer about your dataset.
You have three options:
1) Perform the join as above, to keep all the columns
* This will create the .x and .y suffices, which you can edit with rename()
2) Set all of the redundant columns as identifiers in the join
* This keep only the columns of the baseline data frame. The command would look like this:
test <- full_join(surv, hosp, by = c( "case_id" = "ID", "age", # the other columns are "age_unit", # named the same in both "gender", "hospital"))
3) Remove the redundant columns before the merge
* If you do not care about retaining the redundant columns from the second data frame, simply remove them with select()
prior to the join.
For this exercise, write code that uses *Option 3 above*
1) Go to the bind_rows()
command that creates hosp
, and pipe to a select command that leaves only "ID" and the new columns.
2) Take advantage of the re-naming functionality of select()
, to rename "ID" to "case_id", so that the join command will be more simple.
3) Re-run the command
hosp <- bind_rows(hosp_central, hosp_port, hosp_military, hosp_smmh, hosp_other, hosp_missing) %>% # select specific columns from hosp, and re-name ID as case_ID select( case_id = ID, # select and rename date_hospitalisation, # select time_admission, # select date_outcome, # select outcome) # select
4) Now, run a left_join()
on surv
and hosp
, using "case_id" as the common identifier, and add it to your script in the "Joining" section.
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
# Join the two data frames with a left join combined <- left_join(surv, hosp, by = "case_id")
Now we need to join the data frame of laboratory testing results.
Complete the following steps to join the lab data frame into the combined
data frame
In the importing section of your script (near the top), write and run a command to import the XLSX file located at "data/raw/lab_results_20141201.xlsx", clean the column names using clean_names()
, and save it as lab
in your R Environment.
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
# Import lab data # (place this near the top of your script, in the importing section) lab <- import(here("data", "raw", "lab_results_20141201.xlsx")) %>% clean_names()
Don't forget to use the correct file extension for the lab file (.xlsx)!
In the Joining data section of your script (below your hosp
join), write and run commands to compare the combined
and lab
data frames, and then conduct a left_join()
to add the lab
column ct_blood
to the combined
data frame, matching on case_id
.
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
# Join the two data frames with a left-join combined <- left_join(combined, lab, by = "case_id")
r fontawesome::fa("eye", fill = "darkblue")
Verify manually in RStudio viewer that the join occurred correctly.
Finally, we need to add new data from case investigations. Teams have called and conducted home visits to do epidemiological detective work. They have recorded the likely source cases, dates of infection, and context of infection for some of the cases.
Complete the following steps to join the case investigations data frame into the combined
data frame
1) In the importing section of your script (near the top), write and run a command to import the XLSX file located at "data/raw/case_investigations_20141201.xlsx" and save it as investigations
.
* Pipe the data into a cleaning command in which you select only the column(s) necessary for the join and new columns to be added to combined
2) In the Joining data section of your script, write and run commands to:
* Compare the combined
and investigations
data frames
* Conduct a left_join()
to add the investigations
columns to combined
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
# Import lab data # (add to the import section of your R script) investigations <- import(here("data", "raw", "case_investigations_20141201.xlsx")) %>% # remove unnecessary columns select(-c(age, age_unit, gender)) # Join the two data frames with a left-join # (add to the joining section of your R script) combined <- left_join(combined, investigations, by = "case_id")
Don't forget to use the correct file extension for the file (.xlsx)!
r fontawesome::fa("eye", fill = "darkblue")
Verify manually that the join occurred correctly.
Now that we have these new columns, we should do some final cleaning on the combined
data frame, after the joins.
Now that we have new columns in our data frame, we should write some code to ensure that the new columns names have correct class.
Write a short pipe chain to address the following issues in combined
:
clean_names()
to ensure all the column names have proper syntax mutate()
to properly classify the three new date columns as class date, using the {lubridate} functions you learned earlier date_hospitalisation
as class date date_outcome
as class date date_infection
as class date mutate()
to replace empty characters "" with NA
, using na_if()
, for the columns outcome
and hospital
(see your cleaning pipe chain for an example) quiz(caption = "Quiz - Date conversions", question("Which {lubridate} function should you use to convert date_hospitalisation to class date?", allow_retry = T, answer("mdy()", correct = T), answer("ymd()", message = "No, you need to choose the function that aligns with the format of the data as written PRIOR to use of the function."), answer("dmy()", message = "No, you need to choose the function that aligns with the format of the data as written PRIOR to use of the function."), answer("imho()", message = "In my humble opinion, this is not a real function.") ), question("Which {lubridate} function should you use to convert date_outcome to class date?", allow_retry = T, answer("mdy()", correct = T), answer("ymd()", message = "No, you need to choose the function that aligns with the format of the data as written PRIOR to use of the function."), answer("dmy()", message = "No, you need to choose the function that aligns with the format of the data as written PRIOR to use of the function."), answer("imho()", message = "In my humble opinion, this is not a real function.") ), question("Which {lubridate} function should you use to convert date_infection to class date?", allow_retry = T, answer("mdy()", message = "No, you need to choose the function that aligns with the format of the data as written PRIOR to use of the function."), answer("ymd()", correct = T), answer("dmy()", message = "No, you need to choose the function that aligns with the format of the data as written PRIOR to use of the function."), answer("imho()", message = "In my humble opinion, this is not a real function.") ) )
r fontawesome::fa("lightbulb", fill = "gold")
Click to read a hint
Use clean_names()
first to standardize the syntax. To establish date class, choose between mdy()
, ymd()
, or dmy()
to place within the mutate()
function, based on the current format of the date (look at the data frame) for example: mutate(date_outcome = mdy(date_outcome))
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
# Clean the new columns that have been joined to 'combined' combined <- combined %>% # convert all column names to lower case and remove spaces clean_names() %>% # covert new columns to class date mutate(date_hospitalisation = mdy(date_hospitalisation), date_outcome = mdy(date_outcome), date_infection = ymd(date_infection)) %>% # clean outcome and hospital missings mutate(outcome = na_if(outcome, ""), hospital = na_if(hospital, ""))
You have some flexibility on where to put this data cleaning:
lab
and investigations
data frames when they are imported At this point, we now have joins and several new import commands.
It is important to keep our R script orderly and in a logical flow:
1) Load packages
3) Import data
4) Clean primary data frame
5) Join other data (and perform any residual cleaning)
6) Tables and visualizations
Take a moment to ensure that your script aligns with the above, and that you can run it from top to bottom without any errors.
Congratulations! You have finished this exercise on joining data.
We now have a much more complete dataset to work with, including columns like outcome
, date_hospitalisation
, blood_ct
, and source
.
Remember to save your R script before leaving RStudio.
If you want to learn a bit more about shortcuts in R Studio and other helpful tips, you can go on to the next extra topic.
R Studio can help you quickly and correctly specify file paths by using Tab
to auto-complete.
Start typing a file path (in quotes) and press Tab
on your keyboard. R Studio will list the available sub-folders and files for that location. The example below shows the result of pressing Tab
after typing "data/"
. Click the desired sub-folder or file or navigate to it with your keyboard and press enter
.
knitr::include_graphics("images/filepaths1.png", error = F)
You can keep doing this until reaching your final destination:
knitr::include_graphics("images/filepaths2.PNG", error = F)
Note that the full file path is within one set of quotation marks, and has forward slashes in between the folders and files. This is how paths are usually written, although the direction of the slash can change depending on your operating system. When using the {here} package, we have been writing folders and files in their own quotation marks and separated by commas. The {here} package evaluates this into a full file path with slashes in the correct direction.
Some very useful keyboard shortcuts are below - see more examples here: https://epirhandbook.com/en/r-basics.html#keyboard-shortcuts.
| Windows/Linux | Mac | Action | |---------------------|---------------------|-------------------------------| | Esc | Esc | Interrupt current command (useful if code being run is stuck) | | Ctrl+s | Cmd+s | Save (script) | | Tab | Tab | Auto-complete | | Ctrl + Enter | Cmd + Enter | Run current line(s)/selection of code | | Ctrl + Shift + C | Cmd + Shift + c | comment/uncomment the highlighted lines | | Alt + - | Option + - | Insert \<- | | Ctrl + Shift + m | Cmd + Shift + m | Insert %>% | | Ctrl + Alt + i | Cmd + Shift + r | Insert code chunk (into R Markdown) | | Ctrl + f | Cmd + f | Find and replace in current script | | Ctrl + Shift + f | Cmd + Shift + f | Find in files (search/replace across many scripts) |
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.