# load packages ----------------------------------------------------------------
library(introexercises)
library(learnr)
library(gradethis)
library(dplyr)
library(flair)
library(ggplot2)
library(lubridate)
library(fontawesome)
library(janitor)
library(kableExtra)
# library(RMariaDB)        # connect to sql database

## set options for exercises and checking ---------------------------------------

## Configure how gradethis / learnr evaluate the exercises in this tutorial
gradethis::gradethis_setup(
  ## NOTE: the arguments below are passed on to learnr::tutorial_options
  ## maximum execution time allowed for an exercise, in seconds
  exercise.timelimit = 60, 
  ## how exercises should be checked (defaults to NULL - defined per exercise);
  ## deliberately left disabled here
  # exercise.checker = gradethis::grade_learnr
  ## whether to pre-evaluate exercises (so users see answers); FALSE = do not
  exercise.eval = FALSE 
)

# ## event recorder ---------------------------------------------------------------
# ## see for details: 
# ## https://pkgs.rstudio.com/learnr/articles/publishing.html#events
# ## https://github.com/dtkaplan/submitr/blob/master/R/make_a_recorder.R
# 
# ## connect to your sql database
# sqldtbase <- dbConnect(RMariaDB::MariaDB(),
#                        user     = Sys.getenv("userid"),
#                        password = Sys.getenv("pwd"),
#                        dbname   = 'excersize_log',
#                        host     = "144.126.246.140")
# 
# 
# ## define a function to collect data 
# ## note that tutorial_id is defined in YAML
#     ## you could set the tutorial_version too (by specifying version:) but use package version instead 
# recorder_function <- function(tutorial_id, tutorial_version, user_id, event, data) {
#     
#   ## define a sql query 
#   ## first bracket defines variable names
#   ## values bracket defines what goes in each variable
#   event_log <- paste("INSERT INTO responses (
#                        tutorial_id, 
#                        tutorial_version, 
#                        date_time, 
#                        user_id, 
#                        event, 
#                        section,
#                        label, 
#                        question, 
#                        answer, 
#                        code, 
#                        correct)
#                        VALUES('", tutorial_id,  "', 
#                        '", tutorial_version, "', 
#                        '", format(Sys.time(), "%Y-%m-%d %H:%M:%S %Z"), "',
#                        '", Sys.getenv("SHINYPROXY_PROXY_ID"), "',
#                        '", event, "',
#                        '", data$section, "',
#                        '", data$label,  "',
#                        '", paste0('"', data$question, '"'),  "',
#                        '", paste0('"', data$answer,   '"'),  "',
#                        '", paste0('"', data$code,     '"'),  "',
#                        '", data$correct, "')",
#                        sep = '')
# 
#     # Execute the query on the sqldtbase that we connected to above
#     rsInsert <- dbSendQuery(sqldtbase, event_log)
#   
# }
# 
# options(tutorial.event_recorder = recorder_function)
# Hide non-exercise code chunks ------------------------------------------------
knitr::opts_chunk$set(echo = FALSE)


# Data prep --------------------------------------------------------------------

# Surveillance linelist (clean version) shipped inside {introexercises}
surv <- system.file(
  "dat/surveillance_linelist_clean_20141201.rds",
  package = "introexercises"
) %>%
  rio::import()

# Port and Central hospital linelists: import both, stack them (Port first),
# keep only the hospitalisation/outcome columns, then standardise column names
hospitals <- list(
  system.file("dat/hospitals/20141201_hosp_port.csv", package = "introexercises"),
  system.file("dat/hospitals/20141201_hosp_central.csv", package = "introexercises")
) %>%
  lapply(rio::import) %>%
  bind_rows() %>%
  select(hospital, date_hospitalisation, outcome, date_outcome) %>%
  janitor::clean_names()

## NOT USED ANYMORE
# geo_data <- rio::import(system.file("dat/pop/sle_admpop_adm3_2020.csv", package = "introexercises")) %>% 
#                  select(-c(Female, Male), -starts_with("T"))


## NOT USED ANYMORE Make the hospital information dataframe
# hospital_dirty = data.frame(
#   hosp_name     = c("central hospital", "military", "military", "port", "St. Mark's", "ignace", "sisters"),
#   catchment_pop = c(1950280, 40500, 10000, 50280, 12000, 5000, 4200),
#   level         = c("Tertiary", "Secondary", "Primary", "Secondary", "Secondary", "Primary", "Primary")
# )
# 
# hospital_clean <- hospital_dirty %>% 
#   mutate(
#     hosp_name = case_when(
#       # criteria                       # new value
#       hosp_name == "military"          ~ "Military Hospital",
#       hosp_name == "port"              ~ "Port Hospital",
#       hosp_name == "St. Mark's"        ~ "St. Mark's Maternity Hospital (SMMH)",
#       hosp_name == "central hospital"  ~ "Central Hospital",
#       TRUE                             ~ hosp_name
#       )
#     )

# Small toy linelists used in the simpler examples

# Patient demographics: one row per patient ID
patients <- tibble(
  ID       = c("patient_1", "patient_2", "patient_3", "patient_4", "patient_10"),
  sexe     = c("F", "M", "M", "F", "F"),
  age      = c(5, 10, 2, 15, 14),
  age_unit = rep("Year", 5)
)

# Test results: one row per patient ID (only partly overlapping with `patients`)
results <- tibble(
  ID          = c("patient_1", "patient_2", "patient_4", "patient_5", "patient_6"),
  test_result = c("positive", "negative", "negative", "positive", "positive")
)


# Minimal pair of data frames: df1 has one row per ID, df2 has repeated IDs
df1 <- tibble(
  ID   = c("patient_1", "patient_2", "patient_3"),
  sexe = c("F", "M", "M")
)

df2 <- tibble(
  ID = c("patient_1", "patient_1", "patient_1", "patient_2", "patient_4"),
  date_test = as.Date(
    c("2021-12-01", "2021-12-26", "2022-01-05", "2021-12-18", "2022-01-01")
  ),
  test_result = c("positive", "negative", "negative", "positive", "positive")
)

# Individual hospital linelists: one object per source file --------------------
hosp_central <- system.file("dat/hospitals/20141201_hosp_central.csv", package = "introexercises") %>%
  rio::import()
hosp_military <- system.file("dat/hospitals/20141201_hosp_military.csv", package = "introexercises") %>%
  rio::import()
hosp_port <- system.file("dat/hospitals/20141201_hosp_port.csv", package = "introexercises") %>%
  rio::import()
hosp_smmh <- system.file("dat/hospitals/20141201_hosp_smmh.csv", package = "introexercises") %>%
  rio::import()
hosp_other <- system.file("dat/hospitals/20141201_hosp_other.csv", package = "introexercises") %>%
  rio::import()
hosp_missing <- system.file("dat/hospitals/20141201_hosp_missing.csv", package = "introexercises") %>%
  rio::import()

# Laboratory results and case investigations (Excel workbooks)
lab <- system.file("dat/lab_results_20141201.xlsx", package = "introexercises") %>%
  rio::import()
investigations <- system.file("dat/case_investigations_20141201.xlsx", package = "introexercises") %>%
  rio::import()

# Hide non-exercise code chunks ------------------------------------------------
knitr::opts_chunk$set(echo = FALSE)

Introduction to R for Applied Epidemiology and Public Health

Welcome

Welcome to the course "Introduction to R for applied epidemiology", offered by Applied Epi - a nonprofit organisation and the leading provider of R training, support, and tools to frontline public health practitioners.

# Insert the logo image; error = FALSE means a missing file does not stop knitting
knitr::include_graphics("images/logo.png", error = FALSE)

COVID-19 situation report

This exercise focuses on developing a COVID-19 situation report using R Markdown.

Format

This exercise guides you through tasks that you should perform in RStudio on your local computer.

License

Please email contact@appliedepi.org with questions about the use of these materials.

Learning objectives

In this exercise you will:

THIS IS NOT A TEST. This is a challenging exercise, but you can engage with it at the difficulty level that suits you.

Prepare

Open RStudio and create a new RStudio project, inside the "covid" folder within your "intro_course" directory.

You will be starting a distinct outbreak and analysis, so it deserves its own project (self-contained and portable R environment with data, scripts, outputs, etc).

If you do not remember how to make a new project, see the "hint" below.

`r fontawesome::fa("lightbulb", fill = "gold")` Click to read a hint

1) Open RStudio (ensure that you open RStudio and not just R).

2) In RStudio click File -> New Project. In the pop-up window, select "Existing directory".

knitr::include_graphics("images/create_project.png")

3) Create the project in the "intro_course/covid" subfolder

  • Click "Browse" and navigate to the "intro_course" folder that you downloaded and unzipped earlier (probably saved on your Desktop) and then into the "covid" subfolder.
  • Click "Create project" (RStudio may briefly close and re-open)


Confirm that you are in the correct project

Confirm that the name of the project in the upper-right corner of the current RStudio session is "covid". If you are not in any RStudio project, it will read "Project: (None)".

Here is how your "covid" folder structure should look:

The scenario

This case study is an opportunity to apply all of the skills you have learned in the course. Have fun!

These COVID-19 data are anonymised and jittered. They were provided to Applied Epi by the public health department of Fulton County, Georgia, United States for use in training epidemiologists.

As background, the City of Atlanta is a major metropolitan area in the southeastern US state of Georgia. About 90% of Atlanta lies within Fulton County. Fulton County has a population of around 1 million people, and includes numerous other smaller municipalities in addition to Atlanta.

Review the data

Open "covid_example_data.xlsx" in Excel. There are approximately 80,000 records. Note the columns:

Open the "city_pops.xlsx" dataset in Excel.

Review the situation report document

You are tasked with re-creating the "covid_sitrep.docx" located in the "covid" folder. You must write an R Markdown script that produces a Word document as similar to this report as possible.

All the functions and methods required to create this output report have been included in the folder.

Open and review "covid_sitrep.docx".

Title elements

Note the title, subtitle, and date at the top. These can all be specified in the YAML section of the R Markdown.

Summary text and bullets

Notice the summary information about the COVID-19 outbreak - a heading and some bullets which include numbers and dates from the data (likely created with "in-line" R code).

Do you remember how to create headings and bullets in the written text portion of an RMD?

Demographic pyramid

The age / sex pyramid is stratified by gender (do you remember the age_pyramid() function?)

Epidemic curve

The epidemic curve is colored by city (remember the fill = argument).

Advanced users will note the caption and other plot elements that could make use of str_glue(). Also that not all cities are explicitly given colors - only those that appear the most frequently in the data (... fct_lump()).

Tables

Two summary tables are included in the report:

NOTE: the exercise today will provide you some tips and tricks for re-creating the R Markdown which would output this report. However, answers will not be provided.

How to proceed

You are welcome to proceed with no assistance, to truly test your skills.

Alternatively, the sections of the exercise below can be treated as a "tips sheet" that offers suggestions and reminders. Note that they will not provide solutions.

You can also request assistance from an instructor at any point.

Remember: this is not a test. Have fun. Try your best.

New R Markdown script

Open a new R Markdown script and save it in your "covid/scripts" folder as "covid_sitrep_2.Rmd". Choose a different name than "covid_sitrep.docx" to avoid over-writing the original Word report by accident, for example: "covid_sitrep_YOUR_NAME.docx".

Remove the example content in the template R Markdown script. Keep the YAML section and "setup" code chunk.

YAML

Begin with the standard YAML provided in the template. Edit the YAML to include a title and subtitle to match the Word report:

Recognize that the report is as of "21 June 2021", so one approach is to specify that date as a parameter in the YAML, so that you can use it to filter the data in the cleaning process. If you are not comfortable with parameters, you can skip this part or ask for help.

The easiest method is to simply add a params section at the bottom of the YAML and add this date as below. Remember YAML is very sensitive about spaces, colons, and indentations. Do not forget the ! before the r.

---
params:
  data_date: !r lubridate::ymd("2021-06-21")
---

Alternatively, you can make the date selection process a point-and-click interface using the below YAML.

Now, if you click “Knit with parameters”, a pop-up window will ask you to select the data_date. Ask the instructors if you are confused about the point-and-click interface option.

---
params:
  data_date:
    label: "Date of data:"
    value: !r lubridate::ymd("2021-06-21")
    input: date
---

Whichever way you choose - you must reference the date parameter in your code as params$data_date.

Finally, make sure your YAML is set to output a Word document.

Setup code chunk

The very first "setup" code chunk in an R Markdown typically sets the default visibility options for all chunks. Consider setting the following:

Load packages

Create a loading packages code chunk in your RMD. Consider which packages you will need to create this document. We recommend you adapt code from the Ebola analysis situation report RMD as you work through this exercise.

Consider the following packages in particular:

Use pacman::p_load() to install packages if needed and load them for use. Additionally, remember to put {tidyverse} last in the command.

Data import should have its own chunk. Because this is an R Markdown, it is especially important to use the here() function within the import() function to locate the files.

Remember, there are two files of interest for the present analysis:

Both datasets are located in the "data" sub-folder, which means you will need to include "data" in your here() function.

Give the datasets names that are easy to reference, e.g. linelist_raw and pops.

Before cleaning the data, consider examining the data in an exploratory analysis code chunk. Consider setting this chunk to eval=FALSE so the outputs are not printed to the report.

Remember, when you have multiple commands in one chunk, you can highlight specific commands and press Ctrl + Enter to run them alone.

For running R code from code chunks through the console, we can either:

Make sure you have run the packages and importing chunks before beginning your exploratory analysis.

Examine the case linelist data frame and consider the following when developing a cleaning pipeline:

Remember, when you have multiple commands in one chunk, you can highlight specific commands and press Ctrl + Enter to run them alone.

Make sure you have run the packages and importing chunks before beginning your exploratory analysis.

Examine the case linelist:

Create a cleaning code chunk. Your exact number of cases may not align perfectly with the report - this is OK. You can take liberty to make decisions about inclusion and exclusion criteria.

Remember to assign your cleaning pipe chain to a new object, called something easy to reference like linelist or linelist_clean.

Text and bullet points

format(params$data_date, format = "%d %B, %Y")
fmt_count(combined, outcome == "Death")
paste("242 (44.9%)")

Cleaning pipe chain

Create a cleaning code chunk. Your exact number of cases may not align perfectly with the report - this is OK. You can take liberty to make decisions about inclusion and exclusion criteria.

Remember to assign your cleaning pipe chain to a new object, called something easy to reference like linelist or linelist_clean.

Page breaks

To get page breaks in specific places (e.g. after the summary text portion), insert \pagebreak into the R Markdown text area. This is not R code.

Age pyramid

Use the {apyramid} package and its age_pyramid() function (don’t forget to add {apyramid} to your packages!).

Remember, this creates a ggplot object, so you can modify it by adding a + afterward and then use labs() to add labels like title and caption. Review the epidemiological plots exercise (module 9) or your ebola_sitrep.Rmd if you do not remember this function. Alternatively, search for the function documentation using ?age_pyramid.

Epidemic curve

Create a new code chunk for the epidemic curve.

 # Weekly breaks running from Monday to Monday across all report dates
 weekly_breaks <- seq.Date(
      from = floor_date(min(linelist$date_report, na.rm = TRUE),   "week", week_start = 1), # Monday before first case
      to   = ceiling_date(max(linelist$date_report, na.rm = TRUE), "week", week_start = 1), # Monday after last case
      by   = "week")  
# NOTE(review): the breaks above use `date_report` but the plot maps `report_dt`;
# use whichever name your cleaned linelist actually contains - TODO confirm
ggplot(data = linelist, 
       mapping = aes(
         x = report_dt, 
         fill = fct_lump_n(city, 5)))+   # keep the 5 most frequent cities, lump the rest
     geom....  

Remember str_glue() is a very helpful function for creating figure captions that glues together values from your dataframe with written context.

Outcome table

This is actually a very simple table, thanks to the {gtsummary} package!

# Change values in race column to title case  

mutate(race = str_to_title(race))

Case incidence table

Create a new code chunk for the cumulative incidence table, the last output in your R Markdown report!

This is the most advanced output in the report, as it involves multiple cleaning and calculation steps prior to creating the table.

You can use qflextable() to create a table easily from the summarised data frame, and set_header_labels() to update column headings in the table.

Good luck!

Well done! Even if your R Markdown does not re-create the exact same Word document, don't worry, just do the best you can!
This is not a test, and it is a VERY challenging exercise, but the learning process involved makes it worth it!



appliedepi/introexercises documentation built on April 22, 2024, 1:01 a.m.