# Packages ---------------------------------------------------------------------
# Attach everything the tutorial relies on (order kept as in the original chunk).
library(introexercises)
library(learnr)
library(gradethis)
library(dplyr)
library(flair)
library(ggplot2)
library(lubridate)
library(fontawesome)
library(janitor)
library(kableExtra)
# library(RMariaDB)        # only needed for the (disabled) SQL event recorder

# Exercise evaluation and checking ---------------------------------------------
# The arguments below are forwarded to learnr::tutorial_options().
gradethis::gradethis_setup(
  # Do not pre-evaluate exercises (otherwise users would see the answers)
  exercise.eval = FALSE,
  # Exercise checker is left at its default (NULL, i.e. defined per exercise)
  # exercise.checker = gradethis::grade_learnr
  # Maximum execution time per exercise, in seconds
  exercise.timelimit = 60
)

# Event recorder (disabled) ----------------------------------------------------
# Kept for reference only. See:
#   https://pkgs.rstudio.com/learnr/articles/publishing.html#events
#   https://github.com/dtkaplan/submitr/blob/master/R/make_a_recorder.R
#
# NOTE(review): before re-enabling, check the timestamp format string below
# ("%Y-%M%-%D ..." uses %M = minutes and %D = m/d/y) and consider a
# parameterised query instead of paste()-built SQL.
#
# ## connect to your sql database
# sqldtbase <- dbConnect(RMariaDB::MariaDB(),
#                        user = Sys.getenv("userid"),
#                        password = Sys.getenv("pwd"),
#                        dbname = 'excersize_log',
#                        host = "144.126.246.140")
#
#
# ## define a function to collect data
# ## note that tutorial_id is defined in YAML
# ## you could set the tutorial_version too (by specifying version:) but use package version instead
# recorder_function <- function(tutorial_id, tutorial_version, user_id, event, data) {
#
#   ## define a sql query
#   ## first bracket defines variable names
#   ## values bracket defines what goes in each variable
#   event_log <- paste("INSERT INTO responses (
#                        tutorial_id,
#                        tutorial_version,
#                        date_time,
#                        user_id,
#                        event,
#                        section,
#                        label,
#                        question,
#                        answer,
#                        code,
#                        correct)
#                        VALUES('", tutorial_id, "',
#                        '", tutorial_version, "',
#                        '", format(Sys.time(), "%Y-%M%-%D %H:%M:%S %Z"), "',
#                        '", Sys.getenv("SHINYPROXY_PROXY_ID"), "',
#                        '", event, "',
#                        '", data$section, "',
#                        '", data$label, "',
#                        '", paste0('"', data$question, '"'), "',
#                        '", paste0('"', data$answer, '"'), "',
#                        '", paste0('"', data$code, '"'), "',
#                        '", data$correct, "')",
#                        sep = '')
#
#   # Execute the query on the sqldtbase that we connected to above
#   rsInsert <- dbSendQuery(sqldtbase, event_log)
#
# }
#
# options(tutorial.event_recorder = recorder_function)
# Chunk defaults ---------------------------------------------------------------
# Do not echo the source of non-exercise chunks.
knitr::opts_chunk$set(echo = FALSE)

# Data prep --------------------------------------------------------------------

# Surveillance linelist shipped with the {introexercises} package
surv <- rio::import(
  system.file(
    "dat/surveillance_linelist_clean_20141201.rds",
    package = "introexercises"
  )
)

# Port + Central hospital linelists, stacked and reduced to the columns used
hospitals <- bind_rows(
  rio::import(
    system.file("dat/hospitals/20141201_hosp_port.csv", package = "introexercises")
  ),
  rio::import(
    system.file("dat/hospitals/20141201_hosp_central.csv", package = "introexercises")
  )
) %>%
  select(hospital, date_hospitalisation, outcome, date_outcome) %>%
  janitor::clean_names()

## NOT USED ANYMORE
# geo_data <- rio::import(system.file("dat/pop/sle_admpop_adm3_2020.csv", package = "introexercises")) %>%
#   select(-c(Female, Male), -starts_with("T"))

## NOT USED ANYMORE Make the hospital information dataframe
# hospital_dirty = data.frame(
#   hosp_name     = c("central hospital", "military", "military", "port", "St. Mark's", "ignace", "sisters"),
#   catchment_pop = c(1950280, 40500, 10000, 50280, 12000, 5000, 4200),
#   level         = c("Tertiary", "Secondary", "Primary", "Secondary", "Secondary", "Primary", "Primary")
# )
#
# hospital_clean <- hospital_dirty %>%
#   mutate(
#     hosp_name = case_when(
#       # criteria                       # new value
#       hosp_name == "military"          ~ "Military Hospital",
#       hosp_name == "port"              ~ "Port Hospital",
#       hosp_name == "St. Mark's"        ~ "St. Mark's Maternity Hospital (SMMH)",
#       hosp_name == "central hospital"  ~ "Central Hospital",
#       TRUE                             ~ hosp_name
#     )
#   )

# Small demo linelists for the easier examples
patients <- tibble(
  ID       = c("patient_1", "patient_2", "patient_3", "patient_4", "patient_10"),
  sexe     = c("F", "M", "M", "F", "F"),
  age      = c(5, 10, 2, 15, 14),
  age_unit = rep("Year", 5)
)

results <- tibble(
  ID          = c("patient_1", "patient_2", "patient_4", "patient_5", "patient_6"),
  test_result = c("positive", "negative", "negative", "positive", "positive")
)

df1 <- tibble(
  ID   = c("patient_1", "patient_2", "patient_3"),
  sexe = c("F", "M", "M")
)

df2 <- tibble(
  ID          = c("patient_1", "patient_1", "patient_1", "patient_2", "patient_4"),
  date_test   = as.Date(c("2021-12-01", "2021-12-26", "2022-01-05", "2021-12-18", "2022-01-01")),
  test_result = c("positive", "negative", "negative", "positive", "positive")
)

# Individual hospital linelists
hosp_central <- rio::import(
  system.file("dat/hospitals/20141201_hosp_central.csv", package = "introexercises")
)
hosp_military <- rio::import(
  system.file("dat/hospitals/20141201_hosp_military.csv", package = "introexercises")
)
hosp_port <- rio::import(
  system.file("dat/hospitals/20141201_hosp_port.csv", package = "introexercises")
)
hosp_smmh <- rio::import(
  system.file("dat/hospitals/20141201_hosp_smmh.csv", package = "introexercises")
)
hosp_other <- rio::import(
  system.file("dat/hospitals/20141201_hosp_other.csv", package = "introexercises")
)
hosp_missing <- rio::import(
  system.file("dat/hospitals/20141201_hosp_missing.csv", package = "introexercises")
)

# Laboratory results and case investigation workbooks
lab <- rio::import(
  system.file("dat/lab_results_20141201.xlsx", package = "introexercises")
)
investigations <- rio::import(
  system.file("dat/case_investigations_20141201.xlsx", package = "introexercises")
)
# Chunk defaults: do not echo the source of non-exercise chunks ----------------
knitr::opts_chunk$set(
  echo = FALSE
)
Welcome to the course "Introduction to R for applied epidemiology", offered by Applied Epi - a nonprofit organisation and the leading provider of R training, support, and tools to frontline public health practitioners.
# Show the Applied Epi logo. `error = FALSE` asks knitr not to signal an error
# when the image file cannot be found; spelled out as FALSE because `F` is an
# ordinary variable that can be reassigned.
knitr::include_graphics("images/logo.png", error = FALSE)
This exercise focuses on developing a COVID-19 situation report using R Markdown.
This exercise guides you through tasks that you should perform in RStudio on your local computer.
Please email contact@appliedepi.org with questions about the use of these materials.
In this exercise you will:
THIS IS NOT A TEST. This is a challenging exercise, but you can engage with it at the difficulty level that suits you.
Open RStudio and create a new RStudio project, inside the "covid" folder within your "intro_course" directory.
You will be starting a distinct outbreak and analysis, so it deserves its own project (self-contained and portable R environment with data, scripts, outputs, etc).
If you do not remember how to make a new project, see the "hint" below.
r fontawesome::fa("lightbulb", fill = "gold")
Click to read a hint
1) Open RStudio (ensure that you open RStudio and not just R).
2) In RStudio click File -> New Project. In the pop-up window, select "Existing directory".
# Screenshot illustrating the hint's project-creation steps
knitr::include_graphics(path = "images/create_project.png")
3) Create the project in the "intro_course/covid" subfolder
Confirm that the name of the project in the upper-right corner of the current RStudio session is "covid". If you are not in any RStudio project, it will read "Project: (None)".
Here is how your "covid" folder structure should look:
r emo::ji("folder")
Desktop r emo::ji("folder")
intro_course r emo::ji("folder")
module1 r emo::ji("folder")
ebola r emo::ji("folder")
learning_materials r emo::ji("folder")
covid r emo::ji("file")
covid_sitrep.docx r emo::ji("folder")
scripts r emo::ji("folder")
backup r emo::ji("folder")
data r emo::ji("file")
city_pops.xlsx r emo::ji("file")
covid_example_data.xlsx This case study is an opportunity to apply all of the skills you have learned in the course. Have fun!
These COVID-19 data are anonymised and jittered. They were provided to Applied Epi by the public health department of Fulton County, Georgia, United States for use in training epidemiologists.
As background, the City of Atlanta is a major metropolitan area in the southeastern US state of Georgia. About 90% of Atlanta lies within Fulton County. Fulton County has a population of around 1 million people, and includes numerous other smaller municipalities in addition to Atlanta.
Open "covid_example_data.xlsx" in Excel. There are approximately 80,000 records. Note the columns:
report_dt
and sym_startdt
case_age
case_gender
City
is the municipality of their residence, within the County. See the corresponding dataset "city_pops.xlsx"
case_zip
refers to the "ZIP code" (postal code)
hospitalized
died
confirmed_case
Open the "city_pops.xlsx" dataset in Excel.
You are tasked with re-creating the "covid_sitrep.docx" located in the "covid" folder. You must write an R Markdown script that produces a Word document as similar to this report as possible.
All the functions and methods required to create this output report have been included in the folder.
Open and review "covid_sitrep.docx".
Note the title, subtitle, and date at the top. These can all be specified in the YAML section of the R Markdown.
Notice the summary information about the COVID-19 outbreak - a heading and some bullets which include numbers and dates from the data (likely created with "in-line" R code).
Do you remember how to create headings and bullets in the written text portion of an RMD?
The age / sex pyramid is stratified by gender (do you remember the age_pyramid()
function?)
The epidemic curve is colored by city (remember the fill =
argument).
Advanced users will note the caption and other plot elements that could make use of str_glue()
. Also that not all cities are explicitly given colors - only those that appear the most frequently in the data (... fct_lump()
).
Two summary tables are included in the report:
A demographic table on hospitalisation status by race (remember the tbl_summary()
function?)
A table showing cumulative case incidence per 10,000, stratified by city (this will require grouping and summarising the data, and probably joining!)
NOTE: the exercise today will provide you some tips and tricks for re-creating the R Markdown which would output this report. However, answers will not be provided.
You are welcome to proceed with no assistance, to truly test your skills.
Alternatively, the sections of the exercise below can be used as a "tips sheet" that offers suggestions and reminders, but it will not provide solutions.
You can also request assistance from an instructor at any point.
Remember: this is not a test. Have fun. Try your best.
Open a new R Markdown script and save it in your "covid/scripts" folder as "covid_sitrep_2.Rmd". Choose a different name than "covid_sitrep.docx" to avoid over-writing the original Word report by accident, for example: "covid_sitrep_YOUR_NAME.docx".
Remove the example content in the template R Markdown script. Keep the YAML section and "setup" code chunk.
Begin with the standard YAML provided in the template. Edit the YAML to include a title and subtitle to match the Word report:
Recognize that the report is as of "21 June 2021", so one approach is to specify that date as a parameter in the YAML, so that you can use it to filter the data in the cleaning process. If you are not comfortable with parameters, you can skip this part or ask for help.
The easiest method is to simply add a params
section at the bottom of the YAML and add this date as below. Remember YAML is very sensitive about spaces, colons, and indentations. Do not forget the ! before the r.
---
params:
  data_date: !r lubridate::ymd("2021-06-21")
---
Alternatively, you can make the date selection process a point-and-click interface using the below YAML.
Now, if you click to “Knit with parameters”, a pop-up window will ask you to select the data_date. Ask the instructors if you are confused about the point-and-click interface option.
---
params:
  data_date:
    label: "Date of data:"
    value: !r lubridate::ymd("2021-06-21")
    input: date
---
Whichever way you choose - you must reference the date parameter in your code as params$data_date
.
Finally, make sure your YAML is set to output a word document.
The very first "setup" code chunk in an R Markdown typically sets the default visibility options for all chunks. Consider setting the following:
eval = TRUE
(the code chunk is run and its output is included; however you may want to set this to FALSE
for testing or exploratory analysis code chunks)
echo = FALSE
(code itself is not printed in the report)
warning = FALSE
(warnings do not appear in the report)
message = FALSE
(messages do not appear in the report)
error = TRUE
(show errors in the report)
Create a loading packages code chunk in your RMD. Consider which packages you will need to create this document. We recommend you adapt code from the Ebola analysis situation report RMD as you work through this exercise.
Consider the following packages in particular:
Use pacman::p_load()
to install packages if needed and load them for use. Additionally, remember to put {tidyverse} last in the command.
Data import should have its own chunk. Because this is an R Markdown, it is especially important to use the here()
function within the import()
function to locate the files.
Remember, there are two files of interest for the present analysis:
Both datasets are located in the "data" sub-folder, which means you will need to include "data" in your here()
function.
Give the datasets names that are easy to reference, e.g. linelist_raw
and pops
.
Before cleaning the data, consider examining the data in an exploratory analysis code chunk. Consider setting this chunk to eval=FALSE
so the outputs are not printed to the report.
Remember, when you have multiple commands in one chunk, you can highlight specific commands and press Ctrl + Enter to run them alone.
For running R code from code chunks through the console, we can either:
Make sure you have run the packages and importing chunks before beginning your exploratory analysis.
Examine the case linelist data frame and consider the following when developing a cleaning pipeline:
names()
are present in the data frame?class()
of columns of interest?clean_names()
?rename()
some columns for standardisation? ymd()
, mdy()
, or dmy()
order?filter()
our data frame based on a date (e.g. June 21 2021)?filter()
based on case status? (We only want confirmed cases!)age_categories()
column for our age pyramid? gender
column? Remember, when you have multiple commands in one chunk, you can highlight specific commands and press Ctrl + Enter to run them alone.
Make sure you have run the packages and importing chunks before beginning your exploratory analysis.
Examine the case linelist:
names()
are present in the data frame? class()
of columns of interest? clean_names()
? rename()
some columns for standardisation? ymd()
, mdy()
, or dmy()
order? filter()
our data frame based on a date (e.g. June 21 2021)? filter()
based on case status? (if we only want confirmed cases!) age_categories()
column for our age pyramid? gender
column? Create a cleaning code chunk. Your exact number of cases may not align perfectly with the report - this is OK. You can take liberty to make decisions about inclusion and exclusion criteria.
clean_names()
function from {janitor} to convert all column names to lowercase with no spaces or special characters rename()
- REMEMBER the expected syntax is NEW = OLD dmy()
, mdy()
, or ymd()
. Use the function that corresponds to how the dates are formatted prior to applying the function. filter()
. Remember to assign your cleaning pipe chain to a new object, called something easy to reference like linelist
or linelist_clean
.
Think about the order of your code. Don’t forget that in order to print these text statements with summary statistics inline, those calculations and data cleaning need to happen above this text in the R Markdown (but not emit any visible output).
To display dates in a specific format, you can wrap the date in format()
and supply the strptime
syntax that you prefer, for example: * To display dates in a specific format, you can wrap the date in format()
and supply the strptime
syntax that you prefer, for example:
format(params$data_date, format = "%d %B, %Y")
fmt_count()
function from {epikit}. This accepts the name of a data frame, and then a logical statement used to filter rows.
fmt_count(combined, outcome == "Death")
paste("242 (44.9%)")
Create a cleaning code chunk. Your exact number of cases may not align perfectly with the report - this is OK. You can take liberty to make decisions about inclusion and exclusion criteria.
clean_names()
function from {janitor} to convert all column names to lowercase with no spaces or special characters rename()
- REMEMBER the expected syntax is NEW = OLD dmy()
, mdy()
, or ymd()
. Use the function that corresponds to how the dates are formatted prior to applying the function. filter()
. Remember to assign your cleaning pipe chain to a new object, called something easy to reference like linelist
or linelist_clean
.
To get page breaks in specific places (e.g. after the summary text portion), insert \pagebreak
into the R Markdown text area. This is not R code.
Use the {apyramid} package and its age_pyramid()
function (don’t forget to add {apyramid} to your packages!).
Remember, this creates a ggplot object, so you can modify it by adding a +
afterward and then use labs()
to add labels like title and caption. Review the epidemiological plots exercise (module 9) or your ebola_sitrep.Rmd
if you do not remember this function. Alternatively, search for the function documentation using ?age_pyramid
.
Create a new code chunk for the epidemic curve.
binwidth = 7
, or to be more precise you can define a series of weekly break values that start the Monday prior to the first case, and end the Monday after the last case.
weekly_breaks <- seq.Date(
  from = floor_date(min(linelist$date_report, na.rm=T), "week", week_start = 1), # Monday before first case
  to = ceiling_date(max(linelist$date_report, na.rm=T), "week", week_start = 1), # Monday after last case
  by = "week")
Then you need to provide weekly_breaks
to the breaks =
argument within geom_histogram()
.
To make the dates appear nicely and efficiently (no duplicate years), recall how to assign label_date_short()
to the labels =
argument of scale_x_date()
.
Observe how the list of cities (displayed, and in the legend) has been truncated to 5
and “Other”
... this was done using fct_lump_n()
within the ggplot command, for example:
ggplot(data = linelist, mapping = aes( x = report_dt, fill = fct_lump_n(city, 5)))+ geom....
labs()
to add a title, subtitle, axis names, legend name, and figure caption. Remember str_glue()
is a very helpful function for creating figure captions that glues together values from your dataframe with written context.
This is actually a very simple table, thanks to the {gtsummary} package!
tbl_summary()
. add_p()
to add p-values as well! select()
the columns you need before sending the data into tbl_summary()
using a pipe (%>%
) race
column first, consider including the following in your cleaning pipe chain section of the R Markdown:
# Change values in race column to title case
mutate(race = str_to_title(race))
Create a new code chunk for the cumulative incidence table, the last output in your R Markdown report!
This is the most advanced output in the report, as it involves multiple cleaning and calculation steps prior to creating the table.
group_by()
to group case counts by city and then use summarise()
to sum()
the number of cases per city.
sum()
and logical statements within the summarise()
function (don’t forget na.rm = T
). data_date
parameter, you could include this within the summarise()
as: recent_14d = sum(date_report >= params$data_date - 14)
(total_cases / population) * 10000
to calculate the incidence rate per 10000 population. round()
with the argument digits = 1
to get the rounding on the cumulative incidence. select()
to remove any unwanted columns for the final table. You can use qflextable()
to create a table easily from the summarised data frame, and set_header_labels()
to update column headings in the table.
Well done! Even if your R Markdown doesn't re-create the exact same Word document, don't worry, just do the best you can!
This is not a test, and it is a VERY challenging exercise, but the learning process involved makes it worth it!
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.