# load packages ---------------------------------------------------------------- library(introexercises) library(learnr) library(gradethis) library(flair) library(dplyr) library(ggplot2) library(lubridate) library(janitor) library(fontawesome) library(gtsummary) library(scales) # library(RMariaDB) # connect to sql database ## set options for exercises and checking --------------------------------------- ## Define how exercises are evaluated gradethis::gradethis_setup( ## note: the below arguments are passed to learnr::tutorial_options ## set the maximum execution time limit in seconds exercise.timelimit = 60, ## set how exercises should be checked (defaults to NULL - individually defined) # exercise.checker = gradethis::grade_learnr ## set whether to pre-evaluate exercises (so users see answers) exercise.eval = FALSE ) # ## event recorder --------------------------------------------------------------- # ## see for details: # ## https://pkgs.rstudio.com/learnr/articles/publishing.html#events # ## https://github.com/dtkaplan/submitr/blob/master/R/make_a_recorder.R # # ## connect to your sql database # sqldtbase <- dbConnect(RMariaDB::MariaDB(), # user = Sys.getenv("userid"), # password = Sys.getenv("pwd"), # dbname = 'excersize_log', # host = "144.126.246.140") # # # ## define a function to collect data # ## note that tutorial_id is defined in YAML # ## you could set the tutorial_version too (by specifying version:) but use package version instead # recorder_function <- function(tutorial_id, tutorial_version, user_id, event, data) { # # ## define a sql query # ## first bracket defines variable names # ## values bracket defines what goes in each variable # event_log <- paste("INSERT INTO responses ( # tutorial_id, # tutorial_version, # date_time, # user_id, # event, # section, # label, # question, # answer, # code, # correct) # VALUES('", tutorial_id, "', # '", tutorial_version, "', # '", format(Sys.time(), "%Y-%M%-%D %H:%M:%S %Z"), "', # '", Sys.getenv("SHINYPROXY_PROXY_ID"), "', # '", event, "', # '", data$section, "', # '", data$label, "', # '", paste0('"', data$question, '"'), "', # '", paste0('"', data$answer, '"'), "', # '", paste0('"', data$code, '"'), "', # '", data$correct, "')", # sep = '') # # # Execute the query on the sqldtbase that we connected to above # rsInsert <- dbSendQuery(sqldtbase, event_log) # # } # # options(tutorial.event_recorder = recorder_function)
# hide non-exercise code chunks ------------------------------------------------ knitr::opts_chunk$set(echo = FALSE)
# data prep -------------------------------------------------------------------- surv <- rio::import(system.file("dat/surveillance_linelist_clean_20141201.rds", package = "introexercises"))
Welcome to the course "Introduction to R for applied epidemiology", offered by Applied Epi - a nonprofit organisation and the leading provider of R training, support, and tools to frontline public health practitioners.
knitr::include_graphics("images/logo.png", error = F)
This exercise focuses on grouping and summarizing data in descriptive tables.
This exercise guides you through tasks that you should perform in RStudio on your local computer.
There are several ways to get help:
1) Look for the "helpers" (see below)
2) Ask your live course instructor/facilitator for help
3) Schedule a 1-on-1 call with an instructor for "Course Tutoring"
4) Post a question in Applied Epi Community
Here is what those "helpers" will look like:
r fontawesome::fa("lightbulb", fill = "gold")
Click to read a hint
Here you will see a helpful hint!
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
linelist %>% filter( age > 25, district == "Bolo" )
Here is more explanation about why the solution works.
Answering quiz questions will help you to comprehend the material. The answers are not recorded.
To practice, please answer the following questions:
quiz( question_radio("When should I view the red 'helper' code?", answer("After trying to write the code myself", correct = TRUE), answer("Before I try coding", correct = FALSE), correct = "Reviewing best-practice code after trying to write yourself can help you improve", incorrect = "Please attempt the exercise yourself, or use the hint, before viewing the answer." ) )
question_numeric( "How anxious are you about beginning this tutorial - on a scale from 1 (least anxious) to 10 (most anxious)?", answer(10, message = "Try not to worry, we will help you succeed!", correct = T), answer(9, message = "Try not to worry, we will help you succeed!", correct = T), answer(8, message = "Try not to worry, we will help you succeed!", correct = T), answer(7, message = "Try not to worry, we will help you succeed!", correct = T), answer(6, message = "Ok, we will get there together", correct = T), answer(5, message = "Ok, we will get there together", correct = T), answer(4, message = "I like your confidence!", correct = T), answer(3, message = "I like your confidence!", correct = T), answer(2, message = "I like your confidence!", correct = T), answer(1, message = "I like your confidence!", correct = T), allow_retry = TRUE, correct = "Thanks for sharing. ", min = 1, max = 10, step = 1 )
Please email contact@appliedepi.org with questions about the use of these materials.
In this exercise you will:
tabyl()
and adorn_()
functions group_by()
, summarise()
, and count()
tbl_summary()
r fontawesome::fa("window-restore", fill = "darkgrey")
Double-click on the "ebola" RStudio project file to open it (as in the previous module).
Re-open your script: "ebola_analysis.R" where you wrote the cleaning code from the previous module.
Ensure that the following packages are included in the pacman::p_load()
command in the Load packages section of your script:
However, make sure when you add packages that the {tidyverse} package is still listed last in this command. This is so that any {tidyverse} functions will take precedence over functions in other packages with the same name.
When you run this command, these packages will be installed (if not already) and loaded for use.
Remember, you should only have one pacman::p_load()
command at the top of your script.
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
pacman::p_load( rio, # for importing data here, # for locating files janitor, # for data cleaning lubridate, # for date cleaning epikit, # creating age categories gtsummary, # creating tables scales, # percents in tables flextable, # for making pretty tables tidyverse # for data management and visualization )
Run all the code in your script. If you have successfully finished the previous modules, this will do the following:
surv_raw
data frame using a "pipe chain", and save the clean data as surv
Now you can write more code at the bottom of the script, using the cleaned data frame surv
.
If your R script causes to many errors, or you did not finish the last module, alert your instructor. You can import a clean version of the surveillance linelist with the following command and use it for this exercise:
surv <- import(here("data", "clean", "backup", "surveillance_linelist_clean_20141201.rds"))
Ensure that your script is well-commented (#) so that it is easy to know what is happening in each part of the script.
Add a new section to your script for "Summary Tables", just above the "Testing area". Place your cursor where the new section should start and press Ctrl, Shift, and R at the same time (or Cmd Shift R on a Mac) to create a new section heading.
# Summary tables ----------------------------------------------
Remember that you can navigate your script using these section headers with the "Outline" button located in the upper-right corner of the R script.
Often, we want just a quick tabulation of counts of a categorical variable, or simply to know "what are the unique values" in a particular column.
The tabyl()
function from {janitor} does this quickly by tabulating the unique values of a column provided to it. Easy customization can follow, using the related adorn_()
functions.
Pipes are not only for "cleaning" commands. Pipes simply pass a dataset from one function to the next.
Making tables with {janitor} functions can require multiple steps, and you can use pipes %>%
to pass the data from one step to the next.
Write a command in your Summary Tables section that pipes the surv
data frame into the function tabyl()
, and then specify the column district
in the tabyl()
function.
r fontawesome::fa("lightbulb", fill = "gold")
Click to read a hint
Pipe the clean surv
data frame into tabyl()
and write the column district
in the parentheses.
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
surv %>% tabyl(district)
quiz(caption = "Quiz - tabyl()", question("Is this command a PRINTING or SAVING command?", allow_retry = T, answer("A printing command", correct = T, message = "This is a printing command because there is no assignment operator arrow. The data are converted to a table and simply printed to the console. There is no lasting change to the dataset."), answer("A saving command", correct = F, message = "This is a printing command because there is no assignment operator arrow. The data are converted to a table and simply printed to the console. There is no lasting change to the dataset."), answer("A universal command", correct = F, message = "There is no such thing as a universal command") ), question("What is the district with the most cases recorded?", allow_retry = T, answer("Mountain Rural", correct = T), answer("West II"), answer("Central II"), answer("Manhattan") ), question("How many rows are missing (NA) district?", allow_retry = T, answer("203"), answer("37"), answer("None"), answer(surv %>% filter(is.na(district)) %>% nrow(), correct = T) ) )
show_na = FALSE
The tabyl()
output, when used on a column with missing values, by default contains a "valid_percent" column that shows the proportions as calculated when excluding any missing values.
Run the tabyl()
command again but specify the argument show_na = FALSE
. How does the output change?
surv %>% tabyl(district, show_na = FALSE)
How would you know to run this argument, or what the defaults are? This information is written in the function's documentation, which you can read by either:
?tabyl
, or tabyl
in the search bar In this documentation, you can see that the default for this argument is TRUE
, but that if you set it to FALSE
it will remove NA
values from the table.
knitr::include_graphics("images/show_na_argument.png", error = F)
{janitor} includes a series of "adorn" functions that you can pipe this table into, which will make the formatting nicer.
Add the following functions to the command, using pipes, *in this order*. Leave their parentheses empty. Observe how each function changes the table.
adorn_totals()
adorn_pct_formatting()
Currently, the table of districts is sorted alphabetically by district name. Change this by piping the result into the {dplyr} function arrange()
and specifying to sort on column n
descending.
To specify descending, you can either wrap the column name in the desc()
function, or put a - (minus) symbol in front of the column name.
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
surv %>% tabyl(district) %>% adorn_totals() %>% adorn_pct_formatting() %>% arrange(desc(n))
or
surv %>% tabyl(district) %>% adorn_totals() %>% adorn_pct_formatting() %>% arrange(-n)
Experiment with the order of your steps... what happens if you add adorn_totals()
before arranging by column n
?
Re-arrange your steps so that the rows are in descending order by n
, but the Total row remains at the bottom.
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
surv %>% tabyl(district) %>% arrange(desc(n)) %>% adorn_totals() %>% adorn_pct_formatting()
Now cross-tabulate district and hospital, by placing both column names in the tabyl()
function, separated by a comma.
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
surv %>% tabyl(district, hospital)
quiz(caption = "Quiz - hospital tabulation", question("Based on this cross-tabulation, which district is SMMH likely located in?", allow_retry = T, answer("West II"), answer("Central II"), answer("Mountain Rural", correct = T), answer("East I") ), question("How does the argument show_na = FALSE impact this cross-tabulation?", allow_retry = T, answer("No effect"), answer("Removes NA from the rows"), answer("Removes NA from rows and columns", correct = T), answer("Removes NA from the columns") ) )
Try to add the adorn_()
functions to your tabyl()
command. See how they behave differently on cross-tabulations than they did on tabulation of a single column.
adorn_totals()
adorn_percentages()
(converts counts to decimal proportions) adorn_pct_formatting()
(converts decimal proportions to % format. Also try this with the argument digits = 0
) Now, try adding these functions (only one at a time):
adorn_ns()
or adorn_ns("rear")
or adorn_ns("front")
Explore the surv
data frame with tabyl()
.
Choose 2 tables to keep in your ebola_analysis.R script in the "Summmary tables" section which showcase your new abilities to use tabyl()
. Remove all the other tables you have created (or move them to the "Testing area" if you would like to keep them for reference).
Add a sub-section in your "Summary Tables" section that is called "Tables with dplyr". Ensure it has two hash symbols so that it appears properly indented in the script's outline (see the grey Outline button in the script's top-right corner).
# Summary tables ---------------------------------------------- ## Tables with janitor ---------------------------------------- ## Tables with dplyr ------------------------------------------
count()
The {dplyr} function count()
provides a simple alternative way to tabulate counts.
Try running this code in your "Summary tables" section:
## tables with dplyr surv %>% count(hospital)
How does it compare to the simple tabyl()
on the same column?
Now, adjust the command so that instead of printing the table in the R Console, the table is saved as a new data frame named hospital_counts
.
Once done, open the new data frame by clicking on it in the R Environment, or by running View(hospital_counts)
.
This step is to show you, that you in are in fact creating another data frame! You could conduct analyses on this data frame, or even export it as a csv or xlsx file.
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
hospital_counts <- surv %>% count(hospital)
Now try entering 2 categorical columns into the count()
command, like district
and sex
(separated by a comma).
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
hospital_counts <- surv %>% count(hospital, sex)
What does this table look like? How does it compare to the cross-tabulation feature of tabyl()
? What might be some advantages or disadvantages of this "long" format?
Which of the two formats better adheres to the guidelines of "tidy data" explained in the Extras of the previous module? (Remember, in tidy data, each variable has its own column and each observation has its own row...)
NOTE: Tidy data structure was addressed in the Extras section of the 2nd data cleaning module, please return when you have the time to review "tidy data" formatting in depth!
group_by()
and summarise()
The functions group_by()
and summarise()
together are the most versatile tool to make a new summary data frame to hold summary statistics.
One major advantage of summarise()
is the ability to return more customized statistical summaries like median()
, mean()
, max()
, min()
, sd()
(standard deviation), percentiles, and the number/percent of rows that meet certain logical criteria.
Try this command below. It should produce exactly the same table as you made with count()
.
surv %>% group_by(hospital) %>% summarise(n_rows = n()) # amount of rows per group
You might wonder why anyone would write the command this way, when they could simply write count()
? The answer is that you can add more lines within summarise()
that will create new calculated summary columns.
n_rows
is the name we are designating for a new column in the summary dataset = n()
is a function n()
that counts the number of rows within the group (hospital) Regard below how the command is expanded to create 3 summary columns, each with their own respective calculations.
surv %>% group_by(hospital) %>% summarise( n_rows = n(), # amount of rows per group age_avg = mean(age_years, na.rm = T), # mean age in the group max_onset = max(date_onset, na.rm = T) # latest onset date )
In the two lines that create age_avg
and max_onset
summaries, the mathematical functions mean()
and max()
are used. In these functions, the first argument is the column in the original dataset that is being summarised (e.g. age_years
). This is followed by any other relevant arguments (e.g. na.rm = TRUE
for most mathematical functions).
na.rm = TRUE
We have mentioned before how in most mathematical functions you should include the argument na.rm = TRUE
. This is because R wants to alert you to any missing values in the calculation, and will return NA
if any are present. Setting na.rm = TRUE
(NA
"remove") turns off this behavior.
As an experiment, temporarily re-run your above code but without na.rm = T
in the max_onset
calculation. What changes? Do you see how the result is impacted by the fact that there are missing values of date_onset
within the max()
calculation?
sum()
Similar to how we used max()
and mean()
within summarise()
, you can use sum()
to return the number of rows per group that meet logical criteria. The row in the original dataset is "counted" if this logical expression evaluates to TRUE for that row. For example:
surv %>% group_by(hospital) %>% summarise(num_adults = sum(age_years >= 18, na.rm = T))
Other examples are:
sum(sex == "male", na.rm=T)
sum(district %in% c("West I", "West II"))
Note the use of na.rm = TRUE
within sum()
, for reasons described just above.
Note also the difference between sum()
and summarise()
:
sum()
is a basic mathematical function from {base} R
summarise()
(or summarize()
) is from the {dplyr} package and allows you to summarise data by group
Now, write a summarise()
command that returns, for each hospital
, the number of female cases.
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
surv %>% group_by(hospital) %>% summarise(fem = sum(sex == "female", na.rm = TRUE))
You may find this more clear to read, written as:
surv %>% group_by(hospital) %>% summarise( fem = sum( # column name sex == "female", # column name used to calculate sum na.rm = TRUE)) # remove NAs for accurate calculation
What would you change to the calculation to summarize number of male cases?
What would you change to summarize the number of cases with unknown/missing sex? Hint: it involves using is.na()
Once you begin calculating these columns, you may want to divide one by another, to create a percent.
Once you have defined a column within summarise()
, you can reference it in lower parts of the same summarise()
command. For example:
1) You calculate the total number of cases per hospital, and call it n_cases
2) You calculate the number of male cases per hospital, and call it males
3) You calculate the proportion male using the two previous
surv %>% group_by(hospital) %>% # group by hospital summarise( n_cases = n(), # number of cases males = sum(sex == "male", na.rm=T), # count males males_pct = males / n_cases # percent males of total, using previous two columns created )
What does this percent look like? Is it really a percent? Or more of a (decimal) proportion?
There is a function you can use to quickly transform this proportion into a percent - it is the function percent()
from the package {scales}.
Try this now
surv %>% group_by(hospital) %>% # group by hospital summarise( n_cases = n(), # number of cases males = sum(sex == "male", na.rm=T), # count males males_pct = percent(males / n_cases) # percent males of total )
Ensure that you have the {scales} package loaded in your pacman command.
For more details, see the chapter on Descriptive tables in the Epi R Handbook.
If you need to round a number produced by summarise()
, wrap it in the {base} function round()
and use argument digits =
to adjust the number of decimals.
Try running this code, with, and without the round()
function:
surv %>% group_by(hospital) %>% summarise( mean_age = round(mean(age_years, na.rm = T), digits = 0) )
As you remove round()
, don't forget to also remove the comma and digits = 0
, which are the second argument of that function.
Un-nesting the functions and writing the command "longer" with strategic indentations can help some coders understand which arguments belong to which function. This can be personal preference.
surv %>% group_by(hospital) %>% summarise( mean_age = round( mean( age_years, na.rm = T), digits = 0 ) )
One of the more advanced summarise()
calculation tools you can use are subsetting brackets [ ]
. These symbols can be used after a column and will filter it according to whatever logical criteria you write inside.
For example, placed within summarise()
, the statement max_temp_fvr = max(temp[fever == "yes"], na.rm = T)
will return the maximum temperature recorded in the group, but only among the cases that did report a fever.
It is a complicated command - ask your facilitator if you do not understand it.
Create a data frame that summarises the following, for each hospital:
Be sure to write this command into your R script, and adjust your command to save this data frame in your R Environment as hospital_info
, to keep for later.
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
hospital_info <- surv %>% group_by(hospital) %>% # get statistics for each hospital summarise( n_cases = n(), # number of rows (cases) max_onset = max(date_onset, na.rm = T), # latest onset date under5 = sum(age_years < 5, na.rm = T), # number of children under 5 vomit_n = sum(vomit == "yes", na.rm=T), # number vomiting vomit_pct = percent(vomit_n / n_cases), # percent vomiting max_wt_male = max(wt_kg[sex == "male"], na.rm = T) # max weight among men )
When making a complex summary table with {dplyr}, be sure to read the chapter in the Epi R Handbook, as there are details that we did not have time to cover here.
Let's clean up our code now! Keep only one version of the hospital_info
{dplyr}-style table in your "Summary tables" section. Remove other practice {dplyr} code or move it to the "Testing area".
Your "Summary tables" section may look something like this:
# Summary tables ---------------------------------------------- ## tables with janitor ---------------------------------------- surv %>% tabyl(district, hospital) surv %>% tabyl(district) %>% arrange(desc(n)) %>% adorn_totals() %>% adorn_pct_formatting() ## tables with dplyr ------------------------------------------ hospital_info <- surv %>% group_by(hospital) %>% # get statistics for each hospital summarise( n_cases = n(), # number of rows (cases) max_onset = max(date_onset, na.rm = T), # latest onset date under5 = sum(age_years < 5, na.rm = T), # number of children under 5 vomit_n = sum(vomit == "yes", na.rm=T), # number vomiting vomit_pct = percent(vomit_n / n_cases), # percent vomiting max_wt_male = max(wt_kg[sex == "male"], na.rm = T) # max weight among men )
Notice how clear and clean the section looks!
The {flextable} package can accept a data frame and convert or export it into an pretty HTML table.
This is useful if you create a table with {janitor} or {dplyr}, but want to put it in a report or print it as a PNG image.
The most simple command from {flextable} is qflextable()
, which stands for "quick flextable". It will convert a table into an HTML image, after making some quick display ratio and formatting adjustments. Look for it to appear in your RStudio Viewer pane.
Add the following code to your "Tables with dplyr" section of your code:
surv %>% group_by(hospital) %>% # get statistics for each hospital summarise( n_cases = n(), # number of rows (cases) max_onset = max(date_onset, na.rm = T), # latest onset date under5 = sum(age_years < 5, na.rm = T), # number of children under 5 vomit_n = sum(vomit == "yes", na.rm=T), # number vomiting vomit_pct = percent(vomit_n / n_cases), # percent vomiting max_wt_male = max(wt_kg[sex == "male"], na.rm = T)) %>% # max weight among men qflextable()
Review the nice table that has appeared in the RStudio Viewer pane.
Try out the same thing with one of your {janitor} tables (e.g. add qflextable()
to the end of a {janitor} table pipe).
There are a lot of further adjustments that you can make to your flextable. See the Tables for presentation chapter of the Epi R Handbook.
Here is one example of a different table, used in our Epi R Handbook:
knitr::include_graphics("images/flextable.png", error = F)
The Extras section of this exercise contains more practice using {flextable}.
Once your table is converted to a flextable
object, you can export it to Word, PowerPoint or HTML or as an image (PNG) files.
To do this:
1) Save the table as a named object (e.g. hospital_table
) using the assignment operator <-
hospital_table <- surv %>% group_by(hospital) %>% # get statistics for each hospital summarise( n_cases = n(), # number of rows (cases) max_onset = max(date_onset, na.rm = T), # latest onset date under5 = sum(age_years < 5, na.rm = T), # number of children under 5 vomit_n = sum(vomit == "yes", na.rm=T), # number vomiting vomit_pct = percent(vomit_n / n_cases), # percent vomiting max_wt_male = max(wt_kg[sex == "male"], na.rm = T)) %>% # max weight among men qflextable()
Note that this is a saving command, not a printing command. Therefore, to see this new table in the Viewer pane, you have to run the command hospital_table
.
2) In a separate command, write one of the {flextable} functions below. Do not pipe from your previous code.
save_as_docx()
save_as_pptx()
save_as_image()
save_as_html()
Within the parentheses of the function, first provide the name of the flextable object that you saved (e.g. my_table
). Then provide the argument path =
and supply the desired file name, in quotes, that you want to save to (including the file extension), for example:
# saves the table object my_table as "hospital_table.docx" save_as_docx(hospital_table, path = "hospital_table.docx")
Because no subfolders were specified in the example above, the file will be saved in the root/top folder of your RStudio project, the "ebola" folder.
How would we specify that we want to save the table in the "outputs" sub-folder of the "ebola" folder? Remember the here()
function?
Update your export command hospital_table
is saved in the correct folder:
# saves the table object my_table as "hospital_table.docx" save_as_docx(hospital_table, path = here("outputs", "hospital_table.docx"))
To save as a PNG image, you will need to install Phantom JS (free) for the save_as_image()
function to work. You can do this by installing the package {webshot} with your pacman command, then running the command webshot::install_phantomjs()
. Consider doing this after finishing this exercise.
# saves the table as hospital_table.png in the root folder of your R project save_as_image(my_table, path = here("outputs", "hospital_table.png"))
The final approach to making tables that you should practice is function tbl_summary()
from the {gtsummary} package.
This package aims to produce publication-ready images with very simple and short code. Indeed, you can make very complex and detailed adjustments to these tables, but it is also easy to get a nice table with very little code.
Different from the previous methods is that you must first select()
only the columns that you want to use in the table.
Pipe surv
into select()
, select two columns, and pipe into tbl_summary()
What was produced for the categorical column(s)?
What was produced for the continuous column(s)?
You can also add a by =
argument to the tbl_summary()
command, and designate a column to stratify the table (in columns). Do not forget to include this column in the select()
command above!
Try to make a table that assesses district
, age_cat
, all of the symptoms columns, weight, and height
Now make the same table, stratified by sex
.
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
surv %>% select(district, age_cat, fever, chills, cough, aches, vomit, wt_kg, ht_cm, sex) %>% tbl_summary(by = sex)
An important note with {gtsummary} is you cannot directly export the table to a Word document. To allow this, pipe the table to the function as_flex_table()
from {flextable}. This allows the {gtsummary} table to then be exported to a Word document.
gt_to_flex <- surv %>% select(district, age_cat, fever, chills, cough, aches, vomit, wt_kg, ht_cm, sex) %>% tbl_summary(by = sex) %>% as_flex_table() save_as_docx(gt_to_flex, "my_table.docx")
{gtsummary} makes it very easy to run statistical tests. For example, the function add_p()
can be added if you have the appropriate configuration of the table.
Read more about the many ways to customize a {gtsummary} table here.
Congratulations! You finished the module on summary tables!
{flextable} is a useful tool for creating presentation-ready tables.
Start by creating a new script, called "ebola_flextable.R".
This is the table we aim to create:
knitr::include_graphics("images/table_final.png", error = F)
Just like with the "ebola_analyses.R" script, write some descriptive information at the top of the script.
############################################# # Ebola flextable example # Bonus section # Your NAME here #############################################
Next, add a section heading and load these packages using pacman_p_load()
:
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
# Load packages ---------------------------------------------------------------- pacman::p_load( rio, # import/export here, # file pathways flextable, # make HTML tables scales, # helper functions officer, # helper functions for nice tables tidyverse) # data management, summary, and visualization
Next, create an import section and import the cleaned ebola linelist, which should be saved in your "ebola/data/clean" folder as "surveillance_linelist_clean_20141201.rds". As a backup, you can use the version at the "ebola/data/clean/backup" folder.
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
# Import data ------------------------------------------------------------- # Your saved clean linelist surv <- import(here("data", "clean", "surveillance_linelist_clean_20141201.rds")) # Back-up version surv <- import(here("data", "clean", "backup", "surveillance_linelist_clean_20141201.rds"))
Next, define the border style of the flextable using the {officer} package. Use the fp_border()
function to specify the border color of the table (black), and the border line width (set to 1). Feel free to play around with this function later.
# define border style ----------------------------------------------------- border_style = officer::fp_border(color="black", width=1)
Now begin creating your table, with step-by-step changes.
First, create a table using group_by()
and summarise()
that groups the linelist rows by hospital. Create summary columns that include:
r fontawesome::fa("lightbulb", fill = "gold")
Click to read a hint
Pipe the linelist into group_by()
and specify to group by unique values in the column hospital
.
Pipe to summarise()
and inside this, specify three new columsn:
n()
sum()
to count cases where vomit == "yes"
(dont' forget na.rm = TRUE
!
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
# Create table ----------------------------------------------------------- surv %>% group_by(hospital) %>% # get statistics for each hospital summarise( n_cases = n(), # number of rows (cases) vomit_n = sum(vomit == "yes", na.rm=T), # number vomiting vomit_pct = percent(vomit_n / n_cases)) # percent vomiting
In this course we explained that missing values are most properly recorded in R as NA
. This is true, but now that we are preparing analysis for presentation, we can transform these values to more appropriate words. Still, this change should occur only in the output table, not the underlying dataset.
Add a mutate()
above the group_by()
function, and transform the hospital
column with the function fct_na_value_to_level()
from the {forcats} package, setting the explicit value as "Missing".
Finally, pipe the table to qflextable()
and save the table as an object called vom_table
.
r fontawesome::fa("lightbulb", fill = "gold")
Click to read a hint
Try searching the Help documentation for the fct_na_value_to_level()
function.
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
# Create table ----------------------------------------------------------- vom_table <- surv %>% mutate(hospital = fct_na_value_to_level(hospital, "Missing")) %>% # explicit NA values group_by(hospital) %>% # get statistics for each hospital summarise( n_cases = n(), # number of rows (cases) vomit_n = sum(vomit == "yes", na.rm=T), # number vomiting vomit_pct = percent(vomit_n / n_cases)) %>% # percent vomiting # convert to flextable qflextable()
Remember, now that the command saves the table with <-
to the object vom_table
, in order to view the updated table you need to run the command vom_table
.
Let's begin formatting the table for presentation!
First, use the {flextable} function add_header_row()
to add a header row on top of the existing header. Here is the code to add to your command (with a pipe).
add_header_row( top = TRUE, # new header on top of existing header values = c("Hospital", # values for new header "Total cases", "Cases with vomiting"), # This will span 2 columns colwidths = c(1,1,2)) # span the 4 columns
This section sets three new header values (the text), and the how many of the underlying four columns each should span. The last header value should span two underlying columns.
r fontawesome::fa("exclamation", fill = "red")
Alert! if you add a header row with fewer column names than the number of underlying columns, you will see an error like this:
Error in inverse.rle(structure(list(lengths = colwidths, values = values), : invalid 'rle' structure
Run the command vom_table
and see the changes in your RStudio Viewer. You should see the second header and how the "Cases with vomiting" spans the right-most two columns.
Let's make some manual adjustments to the header values. Pipe the table to this code:
# Re-label secondary headers set_header_labels(n_cases = "No.", vomit_n = "No.", vomit_pct = "Percent of total")
Run the command vom_table
and see the changes in your RStudio Viewer.
Your full table command should look like this:
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
# Create table ----------------------------------------------------------- vom_table <- surv %>% mutate(hospital = fct_na_value_to_level(hospital, "Missing")) %>% # explicit NA values group_by(hospital) %>% # get statistics for each hospital summarise( n_cases = n(), # number of rows (cases) vomit_n = sum(vomit == "yes", na.rm=T), # number vomiting vomit_pct = percent(vomit_n / n_cases)) %>% # percent vomiting # convert to flextable qflextable() %>% add_header_row( top = TRUE, # new header on top of existing header colwidths = c(1,1,2), # span the 4 columns values = c("Hospital", # values for new header "Total cases", "Cases with vomiting")) %>% # This will span 2 columns # Re-label secondary headers set_header_labels(n_cases = "No.", vomit_n = "No.", vomit_pct = "Percent of total")
Next, we will adjust the alignment of the values in the cells, and merge some of the values together.
To make specific formatting edits to specific rows, columns, or cells we must understand {flextable}'s location-targeting syntax. Many of this package's functions utilise this syntax:
function(table, part = "", i = , j = )
Let's understand it:
width()
to determine column widths, bg()
to set background colours, align()
to set centre/right/left alignment. table =
is the name of the data frame. This is not necessary if the flextable is piped into the function. part =
refers to which part of the table the function is being applied to. E.g. "header", "body" or "all". i =
specifies the row numbers to apply the function to, e.g. i = 4
(fourth row) i = c(1:3)
(first to third rows) j =
specifies the column numbers to apply the function to, e.g. j = 1
(first column) j = c(5, 8)
(5th and 8th column) You can find the complete list of flextable formatting function here or review the documentation by entering ?flextable
. You can also see another worked example in the Epi R Handbook.
Pipe your table into the following code:
# Align and merge cells align(part = "all", align = "center", j = c(2:4)) %>% # center values in columns 2-4 merge_at(i = 1:2, j = 1, part = "header") # merge vertically the 'hospital' title
Run the command vom_table
and see the changes in your RStudio Viewer.
Your full table command should look like this:
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
# Create table ----------------------------------------------------------- vom_table <- surv %>% mutate(hospital = fct_na_value_to_level(hospital, "Missing")) %>% # explicit NA values group_by(hospital) %>% # get statistics for each hospital summarise( n_cases = n(), # number of rows (cases) vomit_n = sum(vomit == "yes", na.rm=T), # number vomiting vomit_pct = percent(vomit_n / n_cases)) %>% # percent vomiting # convert to flextable qflextable() %>% add_header_row( top = TRUE, # new header on top of existing header colwidths = c(1,1,2), # span the 4 columns values = c("Hospital", # values for new header "Total cases", "Cases with vomiting")) %>% # This will span 2 columns # Re-label secondary headers set_header_labels(n_cases = "No.", vomit_n = "No.", vomit_pct = "Percent of total") %>% # Align and merge cells align(part = "all", align = "center", j = c(2:4)) %>% # center values in columns 2-4 merge_at(i = 1:2, j = 1, part = "header") # merge vertically the 'hospital' title
Finally, we will adjust the font and background color of the table.
Add the code below to your pipe chain.
# Change background for specific columns bg(part = "body", # body of the table (exclusing header) j = 4, # 4th column bg = "gray95") %>% # grey color # Change background conditionally bg(j = 4, # 4th column i = ~ vomit_pct >= 55, # The ~ shows that the rows affected are conditional part = "body", # the body of the table - not the header bg = "orange") %>% # the color # Main headers are bold bold(i = 1, # row 1 of the header bold = TRUE, part = "header")
The most simple option is to use add_footer_lines()
to append one large cell that spans with width of the entire table.
Adding references in the table (with superscript numbers and footer references) can be more complicated.
Add a footer row to your table that explains the data source and the logic behind the conditional coloring. Extra points if you create it with str_glue()
to make some of the text dynamic, and make it italicised!
Then use the inverse function add_header_lines()
to give your table a title. Extra points if you then align this title to the center!
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
Pipe your table into this code like this:
#add footer add_footer_lines(values = str_glue( "Cases reported as of {max_report_date}. Hospitals with >=55% vomiting highlighted.", max_report_date = max(surv$date_report, na.rm = T))) %>% italic(part = "footer") %>% # add title add_header_lines(values = "Vomiting case, by hospital") %>% align(part = "header", i = 1, align = "center")
Your full table command should look like this:
r fontawesome::fa("check", fill = "red")
Click to see a solution (try it yourself first!)
# Create table ----------------------------------------------------------- vom_table <- surv %>% mutate(hospital = fct_na_value_to_level(hospital, "Missing")) %>% # explicit NA values as "Missing" group_by(hospital) %>% # get statistics for each hospital summarise( n_cases = n(), # number of rows (cases) vomit_n = sum(vomit == "yes", na.rm=T), # number vomiting vomit_pct = percent(vomit_n / n_cases)) %>% # percent vomiting # convert to flextable qflextable() %>% add_header_row( top = TRUE, # new header on top of existing header colwidths = c(1,1,2), # span the 4 columns values = c("Hospital", # values for new header "Total cases", "Cases with vomiting")) %>% # This will span 2 columns # Re-label secondary headers set_header_labels(n_cases = "No.", vomit_n = "No.", vomit_pct = "Percent of total") %>% # Align and merge cells align(part = "all", align = "center", j = c(2:4)) %>% # center values in columns 2-4 merge_at(i = 1:2, j = 1, part = "header") %>% # merge vertically the 'hospital' title # Change background for specific columns bg(part = "body", # body of the table (exclusing header) j = 4, # 4th column bg = "gray95") %>% # grey color # Change background conditionally bg(j = 4, # 4th column i = ~ vomit_pct >= 55, # The ~ shows that the rows affected are conditional part = "body", # the body of the table - not the header bg = "orange") %>% # the color # Main headers are bold bold(i = 1, # row 1 of the header bold = TRUE, part = "header") %>% #add footer add_footer_lines(values = str_glue( "Cases reported as of {max_report_date}. Hospitals with >=55% vomiting highlighted.", max_report_date = max(surv$date_report, na.rm = T))) %>% italic(part = "footer") %>% # add title add_header_lines(values = "Vomiting cases, by hospital") %>% align(part = "header", i = 1, align = "center")
Finally, save the table to a Word document, Powerpoint, or your preferred output. Remember that this command should not be connected by a pipe to the table creation command.
This code should work to save it as Word to an "outputs" folder in the RStudio project.
# Save save_as_docx(vom_table, path = here("outputs", "vom_table.docx"))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.