# load packages ----------------------------------------------------------------
library(introexercises)
library(learnr)
library(gradethis)
library(flair)
library(dplyr)
library(ggplot2)
library(lubridate)
library(janitor)
library(fontawesome)
library(gtsummary)
library(scales)
# library(RMariaDB)        # connect to sql database

## set options for exercises and checking ---------------------------------------

## Define how exercises are evaluated 
gradethis::gradethis_setup(
  ## note: the below arguments are passed to learnr::tutorial_options
  ## set the maximum execution time limit in seconds
  exercise.timelimit = 60, 
  ## set how exercises should be checked (defaults to NULL - individually defined)
  # exercise.checker = gradethis::grade_learnr
  ## set whether to pre-evaluate exercises (so users see answers)
  exercise.eval = FALSE 
)

# ## event recorder ---------------------------------------------------------------
# ## see for details: 
# ## https://pkgs.rstudio.com/learnr/articles/publishing.html#events
# ## https://github.com/dtkaplan/submitr/blob/master/R/make_a_recorder.R
# 
# ## connect to your sql database
# sqldtbase <- dbConnect(RMariaDB::MariaDB(),
#                        user     = Sys.getenv("userid"),
#                        password = Sys.getenv("pwd"),
#                        dbname   = 'excersize_log',
#                        host     = "144.126.246.140")
# 
# 
# ## define a function to collect data 
# ## note that tutorial_id is defined in YAML
#     ## you could set the tutorial_version too (by specifying version:) but use package version instead 
# recorder_function <- function(tutorial_id, tutorial_version, user_id, event, data) {
#     
#   ## define a sql query 
#   ## first bracket defines variable names
#   ## values bracket defines what goes in each variable
#   event_log <- paste("INSERT INTO responses (
#                        tutorial_id, 
#                        tutorial_version, 
#                        date_time, 
#                        user_id, 
#                        event, 
#                        section,
#                        label, 
#                        question, 
#                        answer, 
#                        code, 
#                        correct)
#                        VALUES('", tutorial_id,  "', 
#                        '", tutorial_version, "', 
#                        '", format(Sys.time(), "%Y-%M%-%D %H:%M:%S %Z"), "',
#                        '", Sys.getenv("SHINYPROXY_PROXY_ID"), "',
#                        '", event, "',
#                        '", data$section, "',
#                        '", data$label,  "',
#                        '", paste0('"', data$question, '"'),  "',
#                        '", paste0('"', data$answer,   '"'),  "',
#                        '", paste0('"', data$code,     '"'),  "',
#                        '", data$correct, "')",
#                        sep = '')
# 
#     # Execute the query on the sqldtbase that we connected to above
#     rsInsert <- dbSendQuery(sqldtbase, event_log)
#   
# }
# 
# options(tutorial.event_recorder = recorder_function)
# hide non-exercise code chunks ------------------------------------------------
knitr::opts_chunk$set(echo = FALSE)
# data prep --------------------------------------------------------------------
surv <- rio::import(system.file("dat/surveillance_linelist_clean_20141201.rds", package = "introexercises"))

Introduction to R for Applied Epidemiology and Public Health

Welcome

Welcome to the course "Introduction to R for applied epidemiology", offered by Applied Epi - a nonprofit organisation and the leading provider of R training, support, and tools to frontline public health practitioners.

knitr::include_graphics("images/logo.png", error = F)

Grouping and summarizing

This exercise focuses on grouping and summarizing data in descriptive tables.

Format

This exercise guides you through tasks that you should perform in RStudio on your local computer.

Getting Help

There are several ways to get help:

1) Look for the "helpers" (see below) 2) Ask your live course instructor/facilitator for help
3) Schedule a 1-on-1 call with an instructor for "Course Tutoring" 4) Post a question in Applied Epi Community

Here is what those "helpers" will look like:

r fontawesome::fa("lightbulb", fill = "gold") Click to read a hint

Here you will see a helpful hint!


r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

linelist %>% 
  filter(
    age > 25,
    district == "Bolo"
  )

Here is more explanation about why the solution works.


Quiz questions

Answering quiz questions will help you to comprehend the material. The answers are not recorded.

To practice, please answer the following questions:

quiz(
  question_radio("When should I view the red 'helper' code?",
    answer("After trying to write the code myself", correct = TRUE),
    answer("Before I try coding", correct = FALSE),
    correct = "Reviewing best-practice code after trying to write yourself can help you improve",
    incorrect = "Please attempt the exercise yourself, or use the hint, before viewing the answer."
  )
)
question_numeric(
 "How anxious are you about beginning this tutorial - on a scale from 1 (least anxious) to 10 (most anxious)?",
 answer(10, message = "Try not to worry, we will help you succeed!", correct = T),
 answer(9, message = "Try not to worry, we will help you succeed!", correct = T),
 answer(8, message = "Try not to worry, we will help you succeed!", correct = T),
 answer(7, message = "Try not to worry, we will help you succeed!", correct = T),
 answer(6, message = "Ok, we will get there together", correct = T),
 answer(5, message = "Ok, we will get there together", correct = T),
 answer(4, message = "I like your confidence!", correct = T),
 answer(3, message = "I like your confidence!", correct = T),
 answer(2, message = "I like your confidence!", correct = T),
 answer(1, message = "I like your confidence!", correct = T),
 allow_retry = TRUE,
 correct = "Thanks for sharing. ",
 min = 1,
 max = 10,
 step = 1
)

License

Please email contact@appliedepi.org with questions about the use of these materials.

Learning objectives

In this exercise you will:

Prepare for analysis

Open the RStudio project

r fontawesome::fa("window-restore", fill = "darkgrey") Double-click on the "ebola" RStudio project file to open it (as in the previous module).

Re-open your script: "ebola_analysis.R" where you wrote the cleaning code from the previous module.

Add packages

Ensure that the following packages are included in the pacman::p_load() command in the Load packages section of your script:

However, make sure when you add packages that the {tidyverse} package is still listed last in this command. This is so that any {tidyverse} functions will take precedence over functions in other packages with the same name.

When you run this command, these packages will be installed (if not already) and loaded for use.

Remember, you should only have one pacman::p_load() command at the top of your script.

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

pacman::p_load(
  rio,          # for importing data
  here,         # for locating files
  janitor,      # for data cleaning  
  lubridate,    # for date cleaning  
  epikit,       # creating age categories
  gtsummary,    # creating tables  
  scales,       # percents in tables  
  flextable,    # for making pretty tables
  tidyverse     # for data management and visualization
)


Run your R script

Run all the code in your script. If you have successfully finished the previous modules, this will do the following:

Now you can write more code at the bottom of the script, using the cleaned data frame surv.

Backup

If your R script causes to many errors, or you did not finish the last module, alert your instructor. You can import a clean version of the surveillance linelist with the following command and use it for this exercise:

surv <- import(here("data", "clean", "backup", "surveillance_linelist_clean_20141201.rds"))

Add a new section

Ensure that your script is well-commented (#) so that it is easy to know what is happening in each part of the script.

Add a new section to your script for "Summary Tables", just above the "Testing area". Place your cursor where the new section should start and press Ctrl, Shift, and R at the same time (or Cmd Shift R on a Mac) to create a new section heading.

# Summary tables ----------------------------------------------

Remember that you can navigate your script using these section headers with the "Outline" button located in the upper-right corner of the R script.

{janitor} functions

Often, we want just a quick tabulation of counts of a categorical variable, or simply to know "what are the unique values" in a particular column.

The tabyl() function from {janitor} does this quickly by tabulating the unique values of a column provided to it. Easy customization can follow, using the related adorn_() functions.

The use of pipes

Pipes are not only for "cleaning" commands. Pipes simply pass a dataset from one function to the next.

Making tables with {janitor} functions can require multiple steps, and you can use pipes %>% to pass the data from one step to the next.

Write a command in your Summary Tables section that pipes the surv data frame into the function tabyl(), and then specify the column district in the tabyl() function.

r fontawesome::fa("lightbulb", fill = "gold") Click to read a hint

Pipe the clean surv data frame into tabyl() and write the column district in the parentheses.


r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

surv %>% 
  tabyl(district)


quiz(caption = "Quiz - tabyl()",
  question("Is this command a PRINTING or SAVING command?",
    allow_retry = T,
    answer("A printing command",
           correct = T,
           message  = "This is a printing command because there is no assignment operator arrow. The data are converted to a table and simply printed to the console. There is no lasting change to the dataset."),
    answer("A saving command",
           correct = F,
           message = "This is a printing command because there is no assignment operator arrow. The data are converted to a table and simply printed to the console. There is no lasting change to the dataset."),
    answer("A universal command",
           correct = F,
           message = "There is no such thing as a universal command")
    ),
  question("What is the district with the most cases recorded?",
    allow_retry = T,
    answer("Mountain Rural", correct = T),
    answer("West II"),
    answer("Central II"),
    answer("Manhattan")
  ),
  question("How many rows are missing (NA) district?",
    allow_retry = T,
    answer("203"),
    answer("37"),
    answer("None"),
    answer(surv %>% filter(is.na(district)) %>% nrow(),
           correct = T)
  )
)

show_na = FALSE

The tabyl() output, when used on a column with missing values, by default contains a "valid_percent" column that shows the proportions as calculated when excluding any missing values.

Run the tabyl() command again but specify the argument show_na = FALSE. How does the output change?

surv %>% 
  tabyl(district, show_na = FALSE)

How would you know to run this argument, or what the defaults are? This information is written in the function's documentation, which you can read by either:

In this documentation, you can see that the default for this argument is TRUE, but that if you set it to FALSE it will remove NA values from the table.

knitr::include_graphics("images/show_na_argument.png", error = F)

Adorn formatting

{janitor} includes a series of "adorn" functions that you can pipe this table into, which will make the formatting nicer.

Add the following functions to the command, using pipes, *in this order*. Leave their parentheses empty. Observe how each function changes the table.

Sorting

Currently, the table of districts is sorted alphabetically by district name. Change this by piping the result into the {dplyr} function arrange() and specifying to sort on column n descending.

To specify descending, you can either wrap the column name in the desc() function, or put a - (minus) symbol in front of the column name.

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

surv %>% 
  tabyl(district) %>% 
  adorn_totals() %>% 
  adorn_pct_formatting() %>% 
  arrange(desc(n))

or

surv %>% 
  tabyl(district) %>% 
  adorn_totals() %>% 
  adorn_pct_formatting() %>% 
  arrange(-n)


Experiment with the order of your steps... what happens if you add adorn_totals() before arranging by column n?

Re-arrange your steps so that the rows are in descending order by n, but the Total row remains at the bottom.

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

surv %>% 
  tabyl(district) %>% 
  arrange(desc(n)) %>% 
  adorn_totals() %>% 
  adorn_pct_formatting()


Cross-tabulation

Now cross-tabulate district and hospital, by placing both column names in the tabyl() function, separated by a comma.

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

surv %>% 
  tabyl(district, hospital)


quiz(caption = "Quiz - hospital tabulation",
  question("Based on this cross-tabulation, which district is SMMH likely located in?",
    allow_retry = T,
    answer("West II"),
    answer("Central II"),
    answer("Mountain Rural", correct = T),
    answer("East I")
  ),
  question("How does the argument show_na = FALSE impact this cross-tabulation?",
    allow_retry = T,
    answer("No effect"),
    answer("Removes NA from the rows"),
    answer("Removes NA from rows and columns", correct = T),
    answer("Removes NA from the columns")
  )
)

Adorn a cross tabulation

Try to add the adorn_() functions to your tabyl() command. See how they behave differently on cross-tabulations than they did on tabulation of a single column.

Now, try adding these functions (only one at a time):

Final tabyl() exercise

Explore the surv data frame with tabyl().

Choose 2 tables to keep in your ebola_analysis.R script in the "Summmary tables" section which showcase your new abilities to use tabyl(). Remove all the other tables you have created (or move them to the "Testing area" if you would like to keep them for reference).

{dplyr} functions

Add a sub-section in your "Summary Tables" section that is called "Tables with dplyr". Ensure it has two hash symbols so that it appears properly indented in the script's outline (see the grey Outline button in the script's top-right corner).

# Summary tables ----------------------------------------------


## Tables with janitor ----------------------------------------


## Tables with dplyr ------------------------------------------

count()

The {dplyr} function count() provides a simple alternative way to tabulate counts.

Try running this code in your "Summary tables" section:

## tables with dplyr
surv %>% 
  count(hospital)

How does it compare to the simple tabyl() on the same column?

Now, adjust the command so that instead of printing the table in the R Console, the table is saved as a new data frame named hospital_counts.

Once done, open the new data frame by clicking on it in the R Environment, or by running View(hospital_counts).

This step is to show you, that you in are in fact creating another data frame! You could conduct analyses on this data frame, or even export it as a csv or xlsx file.

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

hospital_counts <- surv %>% 
  count(hospital)




Now try entering 2 categorical columns into the count() command, like district and sex (separated by a comma).

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

hospital_counts <- surv %>% 
  count(hospital, sex)




What does this table look like? How does it compare to the cross-tabulation feature of tabyl()? What might be some advantages or disadvantages of this "long" format?

Which of the two formats better adheres to the guidelines of "tidy data" explained in the Extras of the previous module? (Remember, in tidy data, each variable has its own column and each observation has its own row...)

NOTE: Tidy data structure was addressed in the Extras section of the 2nd data cleaning module, please return when you have the time to review "tidy data" formatting in depth!

group_by() and summarise()

The functions group_by() and summarise() together are the most versatile tool to make a new summary data frame to hold summary statistics.

One major advantage of summarise() is the ability to return more customized statistical summaries like median(), mean(), max(), min(), sd() (standard deviation), percentiles, and the number/percent of rows that meet certain logical criteria.

Summary statistics

Try this command below. It should produce exactly the same table as you made with count().

surv %>% 
  group_by(hospital) %>% 
  summarise(n_rows = n())      # amount of rows per group

You might wonder why anyone would write the command this way, when they could simply write count()? The answer is that you can add more lines within summarise() that will create new calculated summary columns.

Regard below how the command is expanded to create 3 summary columns, each with their own respective calculations.

surv %>% 
  group_by(hospital) %>% 
  summarise(
    n_rows  = n(),                           # amount of rows per group
    age_avg = mean(age_years, na.rm = T),    # mean age in the group
    max_onset = max(date_onset, na.rm = T)   # latest onset date
)

In the two lines that create age_avg and max_onset summaries, the mathematical functions mean() and max() are used. In these functions, the first argument is the column in the original dataset that is being summarised (e.g. age_years). This is followed by any other relevant arguments (e.g. na.rm = TRUE for most mathematical functions).

The argument na.rm = TRUE

We have mentioned before how in most mathematical functions you should include the argument na.rm = TRUE. This is because R wants to alert you to any missing values in the calculation, and will return NA if any are present. Setting na.rm = TRUE (NA "remove") turns off this behavior.

As an experiment, temporarily re-run your above code but without na.rm = T in the max_onset calculation. What changes? Do you see how the result is impacted by the fact that there are missing values of date_onset within the max() calculation?

Logical criteria using sum()

Similar to how we used max() and mean() within summarise(), you can use sum() to return the number of rows per group that meet logical criteria. The row in the original dataset is "counted" if this logical expression evaluates to TRUE for that row. For example:

surv %>% 
  group_by(hospital) %>% 
  summarise(num_adults = sum(age_years >= 18, na.rm = T))

Other examples are:

Note the use of na.rm = TRUE within sum(), for reasons described just above.

Note also the difference between sum() and summarise():
sum() is a basic mathematical function from {base} R summarise() (or summarize()) is from the {dplyr} package and allows you to summarise data by group
Now, write a summarise() command that returns, for each hospital, the number of female cases.

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

surv %>% 
  group_by(hospital) %>% 
  summarise(fem = sum(sex == "female", na.rm = TRUE))

You may find this more clear to read, written as:

surv %>% 
  group_by(hospital) %>% 
  summarise(
    fem = sum(      # column name
      sex == "female",   # column name used to calculate sum
      na.rm = TRUE))     # remove NAs for accurate calculation


What would you change to the calculation to summarize number of male cases?

What would you change to summarize the number of cases with unknown/missing sex? Hint: it involves using is.na()

Percents

Once you begin calculating these columns, you may want to divide one by another, to create a percent.

Once you have defined a column within summarise(), you can reference it in lower parts of the same summarise() command. For example:

1) You calculate the total number of cases per hospital, and call it n_cases
2) You calculate the number of male cases per hospital, and call it males
3) You calculate the proportion male using the two previous

surv %>% 
  group_by(hospital) %>%                        # group by hospital
  summarise(
    n_cases   = n(),                            # number of cases
    males     = sum(sex == "male", na.rm=T),    # count males
    males_pct = males / n_cases                 # percent males of total, using previous two columns created
  )

What does this percent look like? Is it really a percent? Or more of a (decimal) proportion?

There is a function you can use to quickly transform this proportion into a percent - it is the function percent() from the package {scales}.

Try this now

surv %>% 
  group_by(hospital) %>%                      # group by hospital
  summarise(
    n_cases   = n(),                          # number of cases
    males     = sum(sex == "male", na.rm=T),  # count males
    males_pct = percent(males / n_cases)      # percent males of total
  )

Ensure that you have the {scales} package loaded in your pacman command.

For more details, see the chapter on Descriptive tables in the Epi R Handbook.

Rounding

If you need to round a number produced by summarise(), wrap it in the {base} function round() and use argument digits = to adjust the number of decimals.

Try running this code, with, and without the round() function:

surv %>% 
  group_by(hospital) %>% 
  summarise(
    mean_age = round(mean(age_years, na.rm = T), digits = 0)
  )

As you remove round(), don't forget to also remove the comma and digits = 0, which are the second argument of that function.

Un-nesting the functions and writing the command "longer" with strategic indentations can help some coders understand which arguments belong to which function. This can be personal preference.

surv %>% 
  group_by(hospital) %>% 
  summarise(
    mean_age = round(
      mean(
        age_years,
        na.rm = T),
      digits = 0
      )
  )

Conditional statistics

One of the more advanced summarise() calculation tools you can use are subsetting brackets [ ]. These symbols can be used after a column and will filter it according to whatever logical criteria you write inside.

For example, placed within summarise(), the statement max_temp_fvr = max(temp[fever == "yes"], na.rm = T) will return the maximum temperature recorded in the group, but only among the cases that did report a fever.

It is a complicated command - ask your facilitator if you do not understand it.

Final exercise

Create a data frame that summarises the following, for each hospital:

Be sure to write this command into your R script, and adjust your command to save this data frame in your R Environment as hospital_info, to keep for later.

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

hospital_info <- surv %>% 
  group_by(hospital) %>%                      # get statistics for each hospital
  summarise(
    n_cases   = n(),                                      # number of rows (cases)
    max_onset = max(date_onset, na.rm = T),               # latest onset date
    under5    = sum(age_years < 5, na.rm = T),            # number of children under 5
    vomit_n   = sum(vomit == "yes", na.rm=T),             # number vomiting
    vomit_pct = percent(vomit_n / n_cases),               # percent vomiting
    max_wt_male = max(wt_kg[sex == "male"], na.rm = T)    # max weight among men
  )


When making a complex summary table with {dplyr}, be sure to read the chapter in the Epi R Handbook, as there are details that we did not have time to cover here.

Let's clean up our code now! Keep only one version of the hospital_info {dplyr}-style table in your "Summary tables" section. Remove other practice {dplyr} code or move it to the "Testing area".

Your "Summary tables" section may look something like this:

# Summary tables ----------------------------------------------

## tables with janitor ----------------------------------------

surv %>% 
  tabyl(district, hospital)


surv %>% 
  tabyl(district) %>% 
  arrange(desc(n)) %>% 
  adorn_totals() %>% 
  adorn_pct_formatting()


## tables with dplyr ------------------------------------------

hospital_info <- surv %>% 
  group_by(hospital) %>%                       # get statistics for each hospital
  summarise(
    n_cases   = n(),                                      # number of rows (cases)
    max_onset = max(date_onset, na.rm = T),               # latest onset date
    under5    = sum(age_years < 5, na.rm = T),            # number of children under 5
    vomit_n   = sum(vomit == "yes", na.rm=T),             # number vomiting
    vomit_pct = percent(vomit_n / n_cases),               # percent vomiting
    max_wt_male = max(wt_kg[sex == "male"], na.rm = T)    # max weight among men
  )

Notice how clear and clean the section looks!

{flextable}

The {flextable} package can accept a data frame and convert or export it into an pretty HTML table.

This is useful if you create a table with {janitor} or {dplyr}, but want to put it in a report or print it as a PNG image.

qflextable()

The most simple command from {flextable} is qflextable(), which stands for "quick flextable". It will convert a table into an HTML image, after making some quick display ratio and formatting adjustments. Look for it to appear in your RStudio Viewer pane.

Add the following code to your "Tables with dplyr" section of your code:

surv %>% 
  group_by(hospital) %>%                        # get statistics for each hospital
  summarise(
    n_cases   = n(),                            # number of rows (cases)
    max_onset = max(date_onset, na.rm = T),     # latest onset date
    under5    = sum(age_years < 5, na.rm = T),  # number of children under 5
    vomit_n   = sum(vomit == "yes", na.rm=T),   # number vomiting
    vomit_pct = percent(vomit_n / n_cases),     # percent vomiting
    max_wt_male = max(wt_kg[sex == "male"], na.rm = T)) %>%  # max weight among men
  qflextable()

Review the nice table that has appeared in the RStudio Viewer pane.

Try out the same thing with one of your {janitor} tables (e.g. add qflextable() to the end of a {janitor} table pipe).

There are a lot of further adjustments that you can make to your flextable. See the Tables for presentation chapter of the Epi R Handbook.

Here is one example of a different table, used in our Epi R Handbook:

knitr::include_graphics("images/flextable.png", error = F)

The Extras section of this exercise contains more practice using {flextable}.

Saving an image

Once your table is converted to a flextable object, you can export it to Word, PowerPoint or HTML or as an image (PNG) files.

To do this:

1) Save the table as a named object (e.g. hospital_table) using the assignment operator <-

hospital_table <- surv %>% 
  group_by(hospital) %>%                          # get statistics for each hospital
  summarise( 
    n_cases   = n(),                              # number of rows (cases)
    max_onset = max(date_onset, na.rm = T),       # latest onset date
    under5    = sum(age_years < 5, na.rm = T),    # number of children under 5
    vomit_n   = sum(vomit == "yes", na.rm=T),     # number vomiting
    vomit_pct = percent(vomit_n / n_cases),       # percent vomiting
    max_wt_male = max(wt_kg[sex == "male"], na.rm = T)) %>%   # max weight among men
  qflextable()

Note that this is a saving command, not a printing command. Therefore, to see this new table in the Viewer pane, you have to run the command hospital_table.

2) In a separate command, write one of the {flextable} functions below. Do not pipe from your previous code.

Within the parentheses of the function, first provide the name of the flextable object that you saved (e.g. my_table). Then provide the argument path = and supply the desired file name, in quotes, that you want to save to (including the file extension), for example:

# saves the table object my_table as "hospital_table.docx"
save_as_docx(hospital_table, path = "hospital_table.docx")

Because no subfolders were specified in the example above, the file will be saved in the root/top folder of your RStudio project, the "ebola" folder.

How would we specify that we want to save the table in the "outputs" sub-folder of the "ebola" folder? Remember the here() function?

Update your export command hospital_table is saved in the correct folder:

# saves the table object my_table as "hospital_table.docx"
save_as_docx(hospital_table, path = here("outputs", "hospital_table.docx"))

To save as a PNG image, you will need to install Phantom JS (free) for the save_as_image() function to work. You can do this by installing the package {webshot} with your pacman command, then running the command webshot::install_phantomjs(). Consider doing this after finishing this exercise.

# saves the table as hospital_table.png in the root folder of your R project
save_as_image(my_table, path = here("outputs", "hospital_table.png"))

{gtsummary} functions

The final approach to making tables that you should practice is function tbl_summary() from the {gtsummary} package.

This package aims to produce publication-ready images with very simple and short code. Indeed, you can make very complex and detailed adjustments to these tables, but it is also easy to get a nice table with very little code.

Select columns

Different from the previous methods is that you must first select() only the columns that you want to use in the table.

Pipe surv into select(), select two columns, and pipe into tbl_summary()

What was produced for the categorical column(s)?

What was produced for the continuous column(s)?

Stratified tables

You can also add a by = argument to the tbl_summary() command, and designate a column to stratify the table (in columns). Do not forget to include this column in the select() command above!

Try to make a table that assesses district, age_cat, all of the symptoms columns, weight, and height

Now make the same table, stratified by sex.

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

surv %>% 
  select(district, age_cat, fever, chills, cough, aches, vomit, wt_kg, ht_cm, sex) %>% 
  tbl_summary(by = sex)


Changing to flextable format

An important note with {gtsummary} is you cannot directly export the table to a Word document. To allow this, pipe the table to the function as_flex_table() from {flextable}. This allows the {gtsummary} table to then be exported to a Word document.

gt_to_flex <- surv %>% 
  select(district, age_cat, fever, chills, cough, aches, vomit, wt_kg, ht_cm, sex) %>% 
  tbl_summary(by = sex) %>% 
  as_flex_table()

save_as_docx(gt_to_flex, "my_table.docx")

Add p-values

{gtsummary} makes it very easy to run statistical tests. For example, the function add_p() can be added if you have the appropriate configuration of the table.

Read more about the many ways to customize a {gtsummary} table here.

End

Congratulations! You finished the module on summary tables!

Extras

More fun with {flextable}

{flextable} is a useful tool for creating presentation-ready tables.

Start by creating a new script, called "ebola_flextable.R".

This is the table we aim to create:

knitr::include_graphics("images/table_final.png", error = F)

Introduction

Just like with the "ebola_analyses.R" script, write some descriptive information at the top of the script.

#############################################
# Ebola flextable example
# Bonus section
# Your NAME here
#############################################

Packages

Next, add a section heading and load these packages using pacman_p_load():

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

# Load packages ----------------------------------------------------------------
pacman::p_load(
  rio,            # import/export
  here,           # file pathways
  flextable,      # make HTML tables
  scales,         # helper functions
  officer,        # helper functions for nice tables
  tidyverse)      # data management, summary, and visualization


Data import

Next, create an import section and import the cleaned ebola linelist, which should be saved in your "ebola/data/clean" folder as "surveillance_linelist_clean_20141201.rds". As a backup, you can use the version at the "ebola/data/clean/backup" folder.

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

# Import data -------------------------------------------------------------

# Your saved clean linelist
surv <- import(here("data", "clean", "surveillance_linelist_clean_20141201.rds"))    

# Back-up version
surv <- import(here("data", "clean", "backup", "surveillance_linelist_clean_20141201.rds"))    


Border style

Next, define the border style of the flextable using the {officer} package. Use the fp_border() function to specify the border color of the table (black), and the border line width (set to 1). Feel free to play around with this function later.

# define border style -----------------------------------------------------
border_style = officer::fp_border(color="black", width=1)

Table creation

Now begin creating your table, with step-by-step changes.

First, create a table using group_by() and summarise() that groups the linelist rows by hospital. Create summary columns that include:

r fontawesome::fa("lightbulb", fill = "gold") Click to read a hint

Pipe the linelist into group_by() and specify to group by unique values in the column hospital.

Pipe to summarise() and inside this, specify three new columsn:

  • To get the number of rows per group you can use n()
  • To get the number who vomited, use sum() to count cases where vomit == "yes" (dont' forget na.rm = TRUE!
  • To get the percent of cases who vomited, divide the vomit by case counts.



r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

# Create table -----------------------------------------------------------
surv %>% 
  group_by(hospital) %>%                           # get statistics for each hospital
  summarise(
    n_cases   = n(),                               # number of rows (cases)
    vomit_n   = sum(vomit == "yes", na.rm=T),      # number vomiting
    vomit_pct = percent(vomit_n / n_cases))        # percent vomiting


Include "Missing" values

In this course we explained that missing values are most properly recorded in R as NA. This is true, but now that we are preparing analysis for presentation, we can transform these values to more appropriate words. Still, this change should occur only in the output table, not the underlying dataset.

Add a mutate() above the group_by() function, and transform the hospital column with the function fct_na_value_to_level() from the {forcats} package, setting the explicit value as "Missing".

Finally, pipe the table to qflextable() and save the table as an object called vom_table.

r fontawesome::fa("lightbulb", fill = "gold") Click to read a hint

Try searching the Help documentation for the fct_na_value_to_level() function.


r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

# Create table -----------------------------------------------------------
vom_table <- surv %>% 
  mutate(hospital = fct_na_value_to_level(hospital, "Missing")) %>% # explicit NA values
  group_by(hospital) %>%                                     # get statistics for each hospital
  summarise(
    n_cases   = n(),                                         # number of rows (cases)
    vomit_n   = sum(vomit == "yes", na.rm=T),                # number vomiting
    vomit_pct = percent(vomit_n / n_cases)) %>%              # percent vomiting

  # convert to flextable
  qflextable()


Remember, now that the command saves the table with <- to the object vom_table, in order to view the updated table you need to run the command vom_table.

Table headers

Let's begin formatting the table for presentation!

First, use the {flextable} function add_header_row() to add a header row on top of the existing header. Here is the code to add to your command (with a pipe).

  add_header_row(
    top = TRUE,                           # new header on top of existing header
    values = c("Hospital",                # values for new header
               "Total cases", 
               "Cases with vomiting"),    # This will span 2 columns
    colwidths = c(1,1,2))                 # span the 4 columns

This section sets three new header values (the text), and the how many of the underlying four columns each should span. The last header value should span two underlying columns.

r fontawesome::fa("exclamation", fill = "red") Alert! if you add a header row with fewer column names than the number of underlying columns, you will see an error like this:

Error in inverse.rle(structure(list(lengths = colwidths, values = values), : invalid 'rle' structure

Run the command vom_table and see the changes in your RStudio Viewer. You should see the second header and how the "Cases with vomiting" spans the right-most two columns.

Let's make some manual adjustments to the header values. Pipe the table to this code:

  # Re-label secondary headers
  set_header_labels(n_cases   = "No.",                                         
                    vomit_n   = "No.",                           
                    vomit_pct = "Percent of total")

Run the command vom_table and see the changes in your RStudio Viewer.

Your full table command should look like this:

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

# Create table -----------------------------------------------------------
vom_table <- surv %>% 

  mutate(hospital = fct_na_value_to_level(hospital, "Missing")) %>% # explicit NA values

  group_by(hospital) %>%                                     # get statistics for each hospital
  summarise(
    n_cases   = n(),                                         # number of rows (cases)
    vomit_n   = sum(vomit == "yes", na.rm=T),                # number vomiting
    vomit_pct = percent(vomit_n / n_cases)) %>%              # percent vomiting

  # convert to flextable
  qflextable() %>% 

  add_header_row(
    top = TRUE,                             # new header on top of existing header
    colwidths = c(1,1,2),                   # span the 4 columns
    values = c("Hospital",                  # values for new header
               "Total cases", 
               "Cases with vomiting")) %>%  # This will span 2 columns

  # Re-label secondary headers
  set_header_labels(n_cases   = "No.",                                         
                    vomit_n   = "No.",                           
                    vomit_pct = "Percent of total")


Formatting rows and columns

Next, we will adjust the alignment of the values in the cells, and merge some of the values together.

To make specific formatting edits to specific rows, columns, or cells we must understand {flextable}'s location-targeting syntax. Many of this package's functions utilise this syntax:

function(table, part = "", i = , j = )

Let's understand it:

You can find the complete list of flextable formatting function here or review the documentation by entering ?flextable. You can also see another worked example in the Epi R Handbook.

Pipe your table into the following code:

  # Align and merge cells
  align(part = "all",
        align = "center",
        j = c(2:4)) %>%           # center values in columns 2-4

  merge_at(i = 1:2,
           j = 1,
           part = "header")       # merge vertically the 'hospital' title

Run the command vom_table and see the changes in your RStudio Viewer.

Your full table command should look like this:

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

# Create table -----------------------------------------------------------
vom_table <- surv %>% 

  mutate(hospital = fct_na_value_to_level(hospital, "Missing")) %>% # explicit NA values

  group_by(hospital) %>%                                     # get statistics for each hospital
  summarise(
    n_cases   = n(),                                         # number of rows (cases)
    vomit_n   = sum(vomit == "yes", na.rm=T),                # number vomiting
    vomit_pct = percent(vomit_n / n_cases)) %>%              # percent vomiting

  # convert to flextable
  qflextable() %>% 

  add_header_row(
    top = TRUE,                             # new header on top of existing header
    colwidths = c(1,1,2),                   # span the 4 columns
    values = c("Hospital",                  # values for new header
               "Total cases", 
               "Cases with vomiting")) %>%  # This will span 2 columns

  # Re-label secondary headers
  set_header_labels(n_cases   = "No.",                                         
                    vomit_n   = "No.",                           
                    vomit_pct = "Percent of total") %>% 

  # Align and merge cells
  align(part = "all",
        align = "center",
        j = c(2:4)) %>%           # center values in columns 2-4

  merge_at(i = 1:2,
           j = 1,
           part = "header")       # merge vertically the 'hospital' title


Flair

Finally, we will adjust the font and background color of the table.

Add the code below to your pipe chain.

  # Change background for specific columns
  bg(part = "body",         # body of the table (exclusing header)
     j = 4,                 # 4th column
     bg = "gray95") %>%     # grey color

  # Change background conditionally
  bg(j = 4,                 # 4th column
     i = ~ vomit_pct >= 55, # The ~ shows that the rows affected are conditional
     part = "body",         # the body of the table - not the header
     bg = "orange") %>%     # the color

  # Main headers are bold
  bold(i = 1,           # row 1 of the header
       bold = TRUE,
       part = "header") 

Title and footer rows

The most simple option is to use add_footer_lines() to append one large cell that spans with width of the entire table.

Adding references in the table (with superscript numbers and footer references) can be more complicated.

Add a footer row to your table that explains the data source and the logic behind the conditional coloring. Extra points if you create it with str_glue() to make some of the text dynamic, and make it italicised!

Then use the inverse function add_header_lines() to give your table a title. Extra points if you then align this title to the center!

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

Pipe your table into this code like this:

  #add footer
  add_footer_lines(values = str_glue(
    "Cases reported as of {max_report_date}. Hospitals with >=55% vomiting highlighted.",
    max_report_date = max(surv$date_report, na.rm = T))) %>% 
  italic(part = "footer") %>% 

  # add title
  add_header_lines(values = "Vomiting case, by hospital") %>% 
  align(part = "header", i = 1, align = "center")


Your full table command should look like this:

r fontawesome::fa("check", fill = "red")Click to see a solution (try it yourself first!)

# Create table -----------------------------------------------------------
vom_table <- surv %>% 

  mutate(hospital = fct_na_value_to_level(hospital, "Missing")) %>%     # explicit NA values as "Missing"

  group_by(hospital) %>%                                     # get statistics for each hospital
  summarise(
    n_cases   = n(),                                         # number of rows (cases)
    vomit_n   = sum(vomit == "yes", na.rm=T),                # number vomiting
    vomit_pct = percent(vomit_n / n_cases)) %>%              # percent vomiting

  # convert to flextable
  qflextable() %>% 

  add_header_row(
    top = TRUE,                             # new header on top of existing header
    colwidths = c(1,1,2),                   # span the 4 columns
    values = c("Hospital",                  # values for new header
               "Total cases", 
               "Cases with vomiting")) %>%  # This will span 2 columns

  # Re-label secondary headers
  set_header_labels(n_cases   = "No.",                                         
                    vomit_n   = "No.",                           
                    vomit_pct = "Percent of total") %>% 

  # Align and merge cells
  align(part = "all",
        align = "center",
        j = c(2:4)) %>%           # center values in columns 2-4

  merge_at(i = 1:2,
           j = 1,
           part = "header") %>%   # merge vertically the 'hospital' title

  # Change background for specific columns
  bg(part = "body",         # body of the table (exclusing header)
     j = 4,                 # 4th column
     bg = "gray95") %>%     # grey color

  # Change background conditionally
  bg(j = 4,                 # 4th column
     i = ~ vomit_pct >= 55, # The ~ shows that the rows affected are conditional
     part = "body",         # the body of the table - not the header
     bg = "orange") %>%     # the color

  # Main headers are bold
  bold(i = 1,           # row 1 of the header
       bold = TRUE,
       part = "header") %>% 

  #add footer
  add_footer_lines(values = str_glue(
    "Cases reported as of {max_report_date}. Hospitals with >=55% vomiting highlighted.",
    max_report_date = max(surv$date_report, na.rm = T))) %>% 
  italic(part = "footer") %>% 

  # add title
  add_header_lines(values = "Vomiting cases, by hospital") %>% 
  align(part = "header", i = 1, align = "center")


Save the table

Finally, save the table to a Word document, Powerpoint, or your preferred output. Remember that this command should not be connected by a pipe to the table creation command.

This code should work to save it as Word to an "outputs" folder in the RStudio project.

# Save
save_as_docx(vom_table, path = here("outputs", "vom_table.docx"))


appliedepi/introexercises documentation built on April 22, 2024, 1:01 a.m.