In appliedepi/introexercises:

# load packages ----------------------------------------------------------------
library(introexercises)
library(learnr)
library(gradethis)
library(dplyr)
library(flair)
library(ggplot2)
library(lubridate)
library(fontawesome)
library(janitor)
library(kableExtra)
# library(RMariaDB)        # connect to sql database

## set options for exercises and checking ---------------------------------------

## Configure how learnr exercises are run and checked
gradethis::gradethis_setup(
  ## note: these arguments are handed on to learnr::tutorial_options
  ## do not pre-evaluate exercises (pre-evaluating would show users the answers)
  exercise.eval = FALSE,
  ## exercise checker left at its default (NULL - defined per exercise)
  # exercise.checker = gradethis::grade_learnr
  ## maximum execution time for an exercise, in seconds
  exercise.timelimit = 60
)

# ## event recorder ---------------------------------------------------------------
# ## see for details: 
# ## https://pkgs.rstudio.com/learnr/articles/publishing.html#events
# ## https://github.com/dtkaplan/submitr/blob/master/R/make_a_recorder.R
# 
# ## connect to your sql database
# sqldtbase <- dbConnect(RMariaDB::MariaDB(),
#                        user     = Sys.getenv("userid"),
#                        password = Sys.getenv("pwd"),
#                        dbname   = 'excersize_log',
#                        host     = "144.126.246.140")
# 
# 
# ## define a function to collect data 
# ## note that tutorial_id is defined in YAML
#     ## you could set the tutorial_version too (by specifying version:) but use package version instead 
# recorder_function <- function(tutorial_id, tutorial_version, user_id, event, data) {
#     
#   ## define a sql query 
#   ## first bracket defines variable names
#   ## values bracket defines what goes in each variable
#   event_log <- paste("INSERT INTO responses (
#                        tutorial_id, 
#                        tutorial_version, 
#                        date_time, 
#                        user_id, 
#                        event, 
#                        section,
#                        label, 
#                        question, 
#                        answer, 
#                        code, 
#                        correct)
#                        VALUES('", tutorial_id,  "', 
#                        '", tutorial_version, "', 
#                        '", format(Sys.time(), "%Y-%m-%d %H:%M:%S %Z"), "',
#                        '", Sys.getenv("SHINYPROXY_PROXY_ID"), "',
#                        '", event, "',
#                        '", data$section, "',
#                        '", data$label,  "',
#                        '", paste0('"', data$question, '"'),  "',
#                        '", paste0('"', data$answer,   '"'),  "',
#                        '", paste0('"', data$code,     '"'),  "',
#                        '", data$correct, "')",
#                        sep = '')
# 
#     # Execute the query on the sqldtbase that we connected to above
#     rsInsert <- dbSendQuery(sqldtbase, event_log)
#   
# }
# 
# options(tutorial.event_recorder = recorder_function)

# hide non-exercise code chunks ------------------------------------------------
# Set the knitr chunk default to echo = FALSE, so chunks that do not override
# `echo` do not print their source code in the rendered tutorial.
knitr::opts_chunk$set(echo = FALSE)


# Data prep --------------------------------------------------------------------

# Import one data file that ships inside the {introexercises} package.
#   path: file path relative to the installed package root,
#         e.g. "dat/lab_results_20141201.xlsx"
# Returns whatever rio::import() returns for that file.
# mustWork = TRUE makes a missing file stop here with a clear error; without it
# system.file() returns "" and the failure only surfaces inside rio::import().
import_course_file <- function(path) {
  rio::import(system.file(path, package = "introexercises", mustWork = TRUE))
}

# Import
surv <- import_course_file("dat/surveillance_linelist_clean_20141201.rds")

# Hospital linelists: one data frame per source file
hosp_central  <- import_course_file("dat/hospitals/20141201_hosp_central.csv")
hosp_military <- import_course_file("dat/hospitals/20141201_hosp_military.csv")
hosp_port     <- import_course_file("dat/hospitals/20141201_hosp_port.csv")
hosp_smmh     <- import_course_file("dat/hospitals/20141201_hosp_smmh.csv")
hosp_other    <- import_course_file("dat/hospitals/20141201_hosp_other.csv")
hosp_missing  <- import_course_file("dat/hospitals/20141201_hosp_missing.csv")

# Combined hospital records (port rows first, then central), restricted to the
# four columns kept below. Built from the data frames imported above so that
# each CSV is read only once.
hospitals <- bind_rows(hosp_port, hosp_central) %>%
  select(hospital, date_hospitalisation, outcome, date_outcome) %>%
  janitor::clean_names()

# Laboratory results and case investigation records
lab            <- import_course_file("dat/lab_results_20141201.xlsx")
investigations <- import_course_file("dat/case_investigations_20141201.xlsx")

## NOT USED ANYMORE
# geo_data <- rio::import(system.file("dat/pop/sle_admpop_adm3_2020.csv", package = "introexercises")) %>% 
#                  select(-c(Female, Male), -starts_with("T"))


## NOT USED ANYMORE Make the hospital information dataframe
# hospital_dirty = data.frame(
#   hosp_name     = c("central hospital", "military", "military", "port", "St. Mark's", "ignace", "sisters"),
#   catchment_pop = c(1950280, 40500, 10000, 50280, 12000, 5000, 4200),
#   level         = c("Tertiary", "Secondary", "Primary", "Secondary", "Secondary", "Primary", "Primary")
# )
# 
# hospital_clean <- hospital_dirty %>% 
#   mutate(
#     hosp_name = case_when(
#       # criteria                       # new value
#       hosp_name == "military"          ~ "Military Hospital",
#       hosp_name == "port"              ~ "Port Hospital",
#       hosp_name == "St. Mark's"        ~ "St. Mark's Maternity Hospital (SMMH)",
#       hosp_name == "central hospital"  ~ "Central Hospital",
#       TRUE                             ~ hosp_name
#       )
#     )

# Create smaller linelists for the easier examples
# `patients` and `results` share only some ID values
patients <- tibble(
  ID       = c("patient_1", "patient_2", "patient_3", "patient_4", "patient_10"),
  sexe     = c("F", "M", "M", "F", "F"),
  age      = c(5, 10, 2, 15, 14),
  age_unit = c("Year", "Year", "Year", "Year", "Year")
)

results <- tibble(
  ID          = c("patient_1", "patient_2", "patient_4", "patient_5", "patient_6"),
  test_result = c("positive", "negative", "negative", "positive", "positive")
)

# `df1` has one row per ID; `df2` repeats "patient_1" across three test dates
df1 <- tibble(
  ID   = c("patient_1", "patient_2", "patient_3"),
  sexe = c("F", "M", "M")
)

df2 <- tibble(
  ID          = c("patient_1", "patient_1", "patient_1", "patient_2", "patient_4"),
  date_test   = as.Date(c("2021-12-01", "2021-12-26", "2022-01-05", "2021-12-18", "2022-01-01")),
  test_result = c("positive", "negative", "negative", "positive", "positive")
)

# hide non-exercise code chunks ------------------------------------------------
# NOTE(review): this repeats the identical knitr::opts_chunk$set(echo = FALSE)
# call made earlier in this file; presumably redundant - confirm before removing.
knitr::opts_chunk$set(echo = FALSE)

Introduction to R for Applied Epidemiology and Public Health

Welcome

Welcome to the course "Introduction to R for applied epidemiology", offered by Applied Epi - a nonprofit organisation and the leading provider of R training, support, and tools to frontline public health practitioners.

# Show the course logo; error = FALSE means a missing image file does not raise an error
knitr::include_graphics("images/logo.png", error = FALSE)

COVID-19 situation report

This exercise focuses on developing a COVID-19 situation report using R Markdown.

Format

This exercise guides you through tasks that you should perform in RStudio on your local computer.

License

Please email contact@appliedepi.org with questions about the use of these materials.

Learning objectives

In this exercise you will:

Practice the skillsets you have learned throughout the course
Apply your learning to a new data frame
Develop an R Markdown for a COVID-19 situation report

THIS IS NOT A TEST. This is a challenging exercise, but you can engage with it at the difficulty level that suits you.

Prepare

Open RStudio and create a new RStudio project, inside the "covid" folder within your "intro_course" directory.

You will be starting a distinct outbreak and analysis, so it deserves its own project (self-contained and portable R environment with data, scripts, outputs, etc).

If you do not remember how to make a new project, see the "hint" below.

r fontawesome::fa("lightbulb", fill = "gold") Click to read a hint

1) Open RStudio (ensure that you open RStudio and not just R).

2) In RStudio click File -> New Project. In the pop-up window, select "Existing directory".

knitr::include_graphics("images/create_project.png")

3) Create the project in the "intro_course/covid" subfolder

Click "Browse" and navigate to the "intro_course" folder that you downloaded and unzipped earlier (probably saved on your Desktop) and then into the "covid" subfolder.
Click "Create project" (RStudio may briefly close and re-open)

Confirm that you are in the correct project

Confirm that the name of the project in the upper-right corner of the current RStudio session is "covid". If you are not in any RStudio project, it will read "Project: (None)".

Here is how your "covid" folder structure should look:

r emo::ji("folder") Desktop
r emo::ji("folder") intro_course
- r emo::ji("folder") module1
- r emo::ji("folder") ebola
- r emo::ji("folder") learning_materials
- r emo::ji("folder") covid
- covid.Rproj (COVID R project file)
- r emo::ji("file") covid_sitrep.docx
- r emo::ji("folder") scripts
- r emo::ji("folder") backup
- r emo::ji("folder") data
  - r emo::ji("file") city_pops.xlsx
  - r emo::ji("file") covid_example_data.xlsx

The scenario

This case study is an opportunity to apply all of the skills you have learned in the course. Have fun!

These COVID-19 data are anonymised and jittered. They were provided to Applied Epi by the public health department of Fulton County, Georgia, United States for use in training epidemiologists.

As background, the City of Atlanta is a major metropolitan area in the southeastern US state of Georgia. About 90% of Atlanta lies within Fulton County. Fulton County has a population of around 1 million people, and includes numerous other smaller municipalities in addition to Atlanta.

Review the data

Open "covid_example_data.xlsx" in Excel. There are approximately 80,000 records. Note the columns:

report_dt and sym_startdt
case_age
case_gender
Self-reported "race" and ethnicity are common demographic variables collected in the United States
City is the municipality of their residence, within the County. See the corresponding dataset "city_pops.xlsx"
case_zip refers to the "ZIP code" (postal code)
A series of symptoms columns
hospitalized
died
confirmed_case

Open the "city_pops.xlsx" dataset in Excel.

This includes all the cities in the State of Georgia and their populations.

Review the situation report document

You are tasked with re-creating the "covid_sitrep.docx" located in the "covid" folder. You must write an R Markdown script that produces a Word document as similar to this report as possible.

All the functions and methods required to create this output report have been included in the folder.

Open and review "covid_sitrep.docx".

Title elements

Note the title, subtitle, and date at the top. These can all be specified in the YAML section of the R Markdown.

Summary text and bullets

Notice the summary information about the COVID-19 outbreak - a heading and some bullets which include numbers and dates from the data (likely created with "in-line" R code).

Do you remember how to create headings and bullets in the written text portion of an RMD?

Demographic pyramid

The age / sex pyramid is stratified by gender (do you remember the age_pyramid() function?)

Epidemic curve

The epidemic curve is colored by city (remember the fill = argument).

Advanced users will note the caption and other plot elements that could make use of str_glue(). Also that not all cities are explicitly given colors - only those that appear the most frequently in the data (... fct_lump()).

Tables

Two summary tables are included in the report:

A demographic table on hospitalisation status by race (remember the tbl_summary() function?)
A table showing cumulative case incidence per 10,000, stratified by city (this will require grouping and summarising the data, and probably joining!)

NOTE: the exercise today will provide you some tips and tricks for re-creating the R Markdown which would output this report. However, answers will not be provided.

How to proceed

You are welcome to proceed with no assistance, to truly test your skills.

Alternatively, you can treat the sections of the exercise below as a "tips sheet" that offers suggestions and reminders. Note that it will not provide solutions.

You can also request assistance from an instructor at any point.

Remember: this is not a test. Have fun. Try your best.

New R Markdown script

Open a new R Markdown script and save it in your "covid/scripts" folder as "covid_sitrep_2.Rmd". Choose a different name than "covid_sitrep.docx" to avoid over-writing the original Word report by accident, for example: "covid_sitrep_YOUR_NAME.docx".

Remove the example content in the template R Markdown script. Keep the YAML section and "setup" code chunk.

YAML

Begin with the standard YAML provided in the template. Edit the YAML to include a title and subtitle to match the Word report:

Title: "Covid-19 outbreak situation report"
Subtitle: "Exercise for Applied Epi Intro to R Course"

Recognize that the report is as of "21 June 2021", so one approach is to specify that date as a parameter in the YAML, so that you can use it to filter the data in the cleaning process. If you are not comfortable with parameters, you can skip this part or ask for help.

The easiest method is to simply add a params section at the bottom of the YAML and add this date as below. Remember YAML is very sensitive about spaces, colons, and indentations. Do not forget the ! before the r.

---
params:
  data_date: !r lubridate::ymd("2021-06-21")
---

Alternatively, you can make the date selection process a point-and-click interface using the below YAML.

Now, if you click to “Knit with parameters”, a pop-up window will ask you to select the data_date. Ask the instructors if you are confused about the point-and-click interface option.

---
params:
  data_date:
    label: "Date of data:"
    value: !r lubridate::ymd("2021-06-21")
    input: date
---

Whichever way you choose - you must reference the date parameter in your code as params$data_date.

Finally, make sure your YAML is set to output a word document.

Setup code chunk

The very first "setup" code chunk in an R Markdown typically sets the default visibility options for all chunks. Consider setting the following:

eval = TRUE (output from code chunk is included; however you may want to set this to FALSE for testing or exploratory analysis code chunks)
echo = FALSE (code itself is not printed in the report)
warning = FALSE (warnings do not appear in the report)
message = FALSE (messages do not appear in the report)
error = TRUE (show errors in the report)

Load packages

Create a loading packages code chunk in your RMD. Consider which packages you will need to create this document. We recommend you adapt code from the Ebola analysis situation report RMD as you work through this exercise.

Consider the following packages in particular:

{tidyverse} (for data management, cleaning, and visualisation)
{lubridate} (for dates)
{apyramid} (for creating age pyramids)
{gtsummary} (for creating tables)

Use pacman::p_load() to install packages if needed and load them for use. Additionally, remember to put {tidyverse} last in the command.

Data import should have its own chunk. Because this is an R Markdown, it is especially important to use the here() function within the import() function to locate the files.

Remember, there are two files of interest for the present analysis:

"covid_example_data.xlsx", covid cases linelist
"city_pops.xlsx", population data by city in Fulton County, Georgia (for the incidence calculation table)

Both datasets are located in the "data" sub-folder, which means you will need to include "data" in your here() function.

Give the datasets names that are easy to reference, e.g. linelist_raw and pops.

Before cleaning the data, consider examining the data in an exploratory analysis code chunk. Consider setting this chunk to eval=FALSE so the outputs are not printed to the report.

Remember, when you have multiple commands in one chunk, you can highlight specific commands and press Ctrl + Enter to run them alone.

For running R code from code chunks through the console, we can either:

Run whole code chunks using the green triangle in the top-right corner
Highlight specific commands or portions of code within a code chunk and hit run (or Ctrl + Enter for windows, CMD + Enter for Mac)

Make sure you have run the packages and importing chunks before beginning your exploratory analysis.

Examine the case linelist data frame and consider the following when developing a cleaning pipeline:

What column names() are present in the data frame?
What is the class() of columns of interest?
Do we have some messy columns that could use clean_names()?
Do we need to rename() some columns for standardisation?
Do we need to reformat our dates columns to the correct class?
Are the date columns in ymd(), mdy(), or dmy() order?
Do we need to filter() our data frame based on a date (e.g. June 21 2021)?
Do we need to filter() based on case status? (We only want confirmed cases!)
Do we need to create an age_categories() column for our age pyramid?
Do we need to deal with how missing cases are coded in our gender column?

Remember, when you have multiple commands in one chunk, you can highlight specific commands and press Ctrl + Enter to run them alone.

Make sure you have run the packages and importing chunks before beginning your exploratory analysis.

Examine the case linelist:

What column names() are present in the data frame?
What is the class() of columns of interest?
Do we have some messy columns that could use clean_names()?
Do we need to rename() some columns for standardisation?
Do we need to reformat our dates columns to the correct class?
Are the date columns in ymd(), mdy(), or dmy() order?
Do we need to filter() our data frame based on a date (e.g. June 21 2021)?
Do we need to filter() based on case status? (if we only want confirmed cases!)
Do we need to create an age_categories() column for our age pyramid?
Do we need to deal with how missing values are coded in our gender column?

Create a cleaning code chunk. Your exact number of cases may not align perfectly with the report - this is OK. You can take liberty to make decisions about inclusion and exclusion criteria.

Consider using the clean_names() function from {janitor} to convert all column names to lowercase with no spaces or special characters
Consider using rename() - REMEMBER the expected syntax is NEW = OLD
Convert the important date columns to class “Date” using one of the {lubridate} functions, dmy(), mdy(), or ymd(). Use the function that corresponds to how the dates are formatted prior to applying the function.
The report is for confirmed cases before 21 June 2021, so do not forget to apply those criteria using filter().

Remember to assign your cleaning pipe chain to a new object, called something easy to reference like linelist or linelist_clean.

Text and bullet points

Think about the order of your code. Don’t forget that in order to print these text statements with summary statistics inline, those calculations and data cleaning need to happen above this text in the R Markdown (but not emit any visible output).
To display dates in a specific format, you can wrap the date in format() and supply the strptime syntax that you prefer, for example:

format(params$data_date, format = "%d %B, %Y")

To display counts and % symbols quickly, use the fmt_count() function from {epikit}. This accepts the name of a data frame, and then a logical statement used to filter rows.

fmt_count(combined, outcome == "Death")

paste("242 (44.9%)")

Cleaning pipe chain

Create a cleaning code chunk. Your exact number of cases may not align perfectly with the report - this is OK. You can take liberty to make decisions about inclusion and exclusion criteria.

Consider using the clean_names() function from {janitor} to convert all column names to lowercase with no spaces or special characters
Consider using rename() - REMEMBER the expected syntax is NEW = OLD
Convert the important date columns to class “Date” using one of the {lubridate} functions, dmy(), mdy(), or ymd(). Use the function that corresponds to how the dates are formatted prior to applying the function.
The report is for confirmed cases before 21 June 2021, so do not forget to apply those criteria using filter().

Remember to assign your cleaning pipe chain to a new object, called something easy to reference like linelist or linelist_clean.

Page breaks

To get page breaks in specific places (e.g. after the summary text portion), insert \pagebreak into the R Markdown text area. This is not R code.

Age pyramid

Use the {apyramid} package and its age_pyramid() function (don’t forget to add {apyramid} to your packages!).

Remember, this creates a ggplot object, so you can modify it by adding a + afterward and then use labs() to add labels like title and caption. Review the epidemiological plots exercise (module 9) or your ebola_sitrep.Rmd if you do not remember this function. Alternatively, search for the function documentation using ?age_pyramid.

Epidemic curve

Create a new code chunk for the epidemic curve.

To break the histogram into weeks, you can either set binwidth = 7, or to be more precise you can define a series of weekly break values that start the Monday prior to the first case, and end the Monday after the last case.

 weekly_breaks <- seq.Date(
      from = floor_date(min(linelist$date_report, na.rm = TRUE),   "week", week_start = 1), # Monday before first case
      to   = ceiling_date(max(linelist$date_report, na.rm = TRUE), "week", week_start = 1), # Monday after last case
      by   = "week")

Then you need to provide weekly_breaks to the breaks = argument within geom_histogram().
To make the dates appear nicely and efficiently (no duplicate years), recall how to assign label_date_short() to the labels = argument of scale_x_date().
Observe how the list of cities (displayed, and in the legend) has been truncated to 5 and “Other”... this was done using fct_lump_n() within the ggplot command, for example:

ggplot(data = linelist, 
       mapping = aes(
         x = report_dt, 
         fill = fct_lump_n(city, 5)))+
     geom....

Use labs() to add a title, subtitle, axis names, legend name, and figure caption.

Remember str_glue() is a very helpful function for creating figure captions that glues together values from your dataframe with written context.

Outcome table

This is actually a very simple table, thanks to the {gtsummary} package!

This table is produced with {gtsummary} and its function tbl_summary().
Since we are looking at an outcome, we can pipe the table into add_p() to add p-values as well!
Don't forget to only select() the columns you need before sending the data into tbl_summary() using a pipe (%>%)
If you would like to nicely format the race column first, consider including the following in your cleaning pipe chain section of the R Markdown:

# Change values in race column to title case  

mutate(race = str_to_title(race))

Case incidence table

Create a new code chunk for the cumulative incidence table, the last output in your R Markdown report!

This is the most advanced output in the report, as it involves multiple cleaning and calculation steps prior to creating the table.

You will need to first use group_by() to group case counts by city and then use summarise() to sum() the number of cases per city.
You add other columns using sum() and logical statements within the summarise() function (don’t forget na.rm = T).
For example, to calculate cases reported in the 14 days prior to a data_date parameter, you could include this within the summarise() as:
recent_14d = sum(date_report >= params$data_date - 14, na.rm = TRUE)
The final column in the table for cumulative incidence is created by joining the summarized table to the city populations data frame.
You want to keep all the rows in the summarized table, but only the rows in city populations that match.
You then have the population data available to calculate the number of cumulative cases per population.
Use (total_cases / population) * 10000 to calculate the incidence rate per 10000 population.
You can use round() with the argument digits = 1 to get the rounding on the cumulative incidence.
Make sure to use select() to remove any unwanted columns for the final table.

You can use qflextable() to create a table easily from the summarised data frame, and set_header_labels() to update column headings in the table.

Good luck!

Well done! Even if your R Markdown does not re-create the exact same Word document, don't worry, just do the best you can!
This is not a test, and it is a VERY challenging exercise, but the learning process involved makes it worth it!

appliedepi/introexercises documentation built on April 22, 2024, 1:01 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

Tweet to @rdrrHQ

GitHub issue tracker

ian@mutexlabs.com