library(learnr)
# Default chunk options: do not echo code, warnings, or messages.
knitr::opts_chunk$set(
  echo = FALSE,
  warning = FALSE,
  message = FALSE
)

Pre-wrangling exploration


Plot univariate distributions {data-progressive=TRUE}

Let's get comfortable creating some univariate histograms to start exploring the data. Create histograms for a couple of variables. The ggplot2 package has been loaded.

# Setup: load the dataset and ggplot2. Anything printed while doing so is
# written to a throwaway temporary file.
capture.output(
  {
    data("tidier_framingham", package = "acdcourse")
    library(ggplot2)
  },
  file = tempfile()
)

Exercise step

Instructions:

# Exercise template: replace each ___ blank.
# Examine the age histogram
ggplot(tidier_framingham, aes(x = ___)) +
    ___()
# Hint for the blanks above.
"In the `aes()`, the argument should be `x = participant_age`."
# Completed version of the template.
# Examine the age histogram
ggplot(tidier_framingham, aes(x = participant_age)) +
    geom_histogram()
# Message accompanying the completed version.
"Nice!"

Exercise step

Instructions:

# Exercise template: replace each ___ blank.
# Examine the systolic blood pressure histogram
ggplot(tidier_framingham, aes(x = ___)) +
    ___()
# Hint for the blanks above.
"The `aes()` should have `x = systolic_blood_pressure`."
# Completed version of the template.
# Examine the systolic blood pressure histogram
ggplot(tidier_framingham, aes(x = systolic_blood_pressure)) +
    geom_histogram()
# Message accompanying the completed version.
"Great job! You've created histograms and examined two variables."

Long data and visualizing variables over time {data-progressive=TRUE}

Now that you've learned how to create histograms, let's convert some of the Framingham dataset into the long data format using gather(). Then, using the long data form, create histograms for multiple variables simultaneously for each followup visit. This will give us a quick overview of the data and their distribution. Pay attention to what the distribution of each variable looks like.

# Setup: load the dataset plus dplyr, tidyr, and ggplot2. Anything printed
# while doing so is written to a throwaway temporary file.
capture.output(
  {
    data(tidier_framingham, package = "acdcourse")
    library(dplyr)
    library(tidyr)
    library(ggplot2)
  },
  file = tempfile()
)

Exercise step

Instructions:

# Exercise template: replace each ___ blank.
tidier_framingham %>%
    select(
        followup_visit_number,
        # Select the three cholesterol-based variables
        ___, ___, ___
    ) %>%
    # Convert to long form: key column name, value column name, column to leave out
    gather(___, ___, -___)
# Hint for the blanks above.
"The `gather()` function should look like `gather(variable, value, -followup_visit_number)`."
# Completed version of the template.
tidier_framingham %>%
    select(
        followup_visit_number,
        # Select the three cholesterol-based variables
        total_cholesterol, high_density_lipoprotein, low_density_lipoprotein
    ) %>%
    # Convert to long form, keeping followup_visit_number as its own column
    gather(variable, value, -followup_visit_number)
# Message accompanying the completed version.
"Great!"

Exercise step

Instructions:

# Exercise template: replace each ___ blank.
tidier_framingham %>%
    select(
        followup_visit_number,
        # Select the three cholesterol-based variables
        total_cholesterol, high_density_lipoprotein, low_density_lipoprotein
    ) %>%
    gather(variable, value, -followup_visit_number) %>%
    ggplot(aes(x = value)) +
    geom_histogram() +
    # Facet by followup and variables
    ___(___(___, ___), 
        scales = "free")
# Hint for the blanks above.
"The `facet_wrap()` variables need to be within the `vars()` function and separated by a comma."
# Completed version of the template.
tidier_framingham %>%
    select(
        followup_visit_number,
        # Select the three cholesterol-based variables
        total_cholesterol, high_density_lipoprotein, low_density_lipoprotein
    ) %>%
    gather(variable, value, -followup_visit_number) %>%
    ggplot(aes(x = value)) +
    geom_histogram() +
    # Facet by followup and variables
    facet_wrap(vars(followup_visit_number, variable), 
               scales = "free")
# Message accompanying the completed version.
"Great!"

Exercise step

Instructions:

# Exercise template: replace each ___ blank.
tidier_framingham %>%
    select(
        followup_visit_number,
        # Select the three characteristics
        ___, ___, ___
    ) %>%
    gather(variable, value, -followup_visit_number) %>%
    ggplot(aes(x = value)) +
    geom_histogram() +
    facet_wrap(vars(followup_visit_number, variable), 
               scales = "free")
# Hint for the blanks above.
"Put the variables in the `select()` function."
# Completed version of the template.
tidier_framingham %>%
    select(
        followup_visit_number,
        # Select the three characteristics
        body_mass_index, participant_age, cigarettes_per_day
    ) %>%
    gather(variable, value, -followup_visit_number) %>%
    ggplot(aes(x = value)) +
    geom_histogram() +
    facet_wrap(vars(followup_visit_number, variable), 
               scales = "free")
# Message accompanying the completed version.
"Amazing!"

Exercise step

There were several things to observe from the distributions of the variables and some things to consider for later analyses. Did you notice a few of them?

Hint: Run the code again to check the histogram plots.

# Multiple-choice check on the histograms above. "All of the above." is the
# option marked correct, and allow_retry = TRUE permits further attempts.
question(
  # The prompt is a question, so it ends with a question mark.
  "Which of the answers below describes some observations about the data?",
  answer("The lipoprotein data was not available at visits 1 and 2.", correct = FALSE),
  answer("Most people smoked zero cigarettes per day.", correct = FALSE),
  answer(
    "The participants' age had a 'jagged', uneven distribution.",
    correct = FALSE
  ),
  answer("All of the above.", correct = TRUE),
  answer("None of the above.", correct = FALSE),
  allow_retry = TRUE
)

Visually examine the outcomes with the exposures

Boxplots are great for showing a distribution by a grouping variable (e.g. sex or disease status). Create multiple boxplots of several exposure variables with the outcome variable (CVD) by combining what we learned previously about converting to long form and using faceting. Since we want to plot CVD status on the x-axis, we'll need to exclude it from being "gathered".

Instructions:

# Setup: load the dataset and packages quietly, then store the CVD outcome
# as a character column. Anything printed is written to a temporary file.
capture.output(
  {
    data(tidier_framingham, package = "acdcourse")
    library(dplyr, quietly = TRUE)
    library(tidyr, quietly = TRUE)
    library(ggplot2, quietly = TRUE)
    tidier_framingham <- mutate(
      tidier_framingham,
      got_cvd = as.character(got_cvd)
    )
  },
  file = tempfile()
)
# Exercise template: replace each ___ blank.
tidier_framingham %>%
    select(followup_visit_number,
           # Select the disease and the three continuous variables
           ___, ___,
           ___, ___) %>%
    # Exclude also the disease
    gather(variable, value, -followup_visit_number, -___) %>%
    ggplot(aes(y = ___, x = variable)) +
    # Plot boxplots
    ___() +
    facet_wrap(vars(followup_visit_number), ncol = 1) +
    # Flip the plot
    ___()
# Hints for the blanks above. The aes() hint matches the template and the
# solution: values go on y and variable names on x.
"- The initial `ggplot2` setup should be `ggplot(aes(y = value, x = variable))`."
"- Include `-got_cvd` after `-followup_visit_number` in `gather()`."
# Completed version of the template.
tidier_framingham %>%
    select(followup_visit_number,
           # Select the disease and the three continuous variables
           got_cvd, total_cholesterol,
           participant_age, body_mass_index) %>%
    # Exclude also the disease
    gather(variable, value, -followup_visit_number, -got_cvd) %>%
    ggplot(aes(y = value, x = variable)) +
    # Plot boxplots
    geom_boxplot() +
    facet_wrap(vars(followup_visit_number), ncol = 1) +
    # Flip the plot
    coord_flip()
# Message accompanying the completed version.
"Excellent! You quickly created a figure showing several continuous variables by the outcome, and over time! Notice how some variables are a bit higher in the `got_cvd` group and that over time these differences decreased? Also notice the problem of showing multiple variables that have vastly different values such as between body mass and cholesterol."

Discrete data and tidying it for later analysis


Make discrete variables human-readable {data-progressive=TRUE}

As you may have noticed, there are several discrete variables with ambiguous values. For instance, sex takes the value 1 or 2, but what do those numbers mean? Often, you will encounter discrete data as integers rather than descriptive strings when working with cohort datasets. With data like this, you need to have a data dictionary to know what the numbers mean. Let's fix this problem and tidy up the data so it is more intuitive and descriptive.

# Setup: load the dataset and dplyr. Anything printed while doing so is
# written to a throwaway temporary file.
capture.output(
  {
    data(tidier_framingham, package = "acdcourse")
    library(dplyr)
  },
  file = tempfile()
)

Exercise step

Instructions:

# Exercise template: replace each ___ blank, adding one line per
# number-string pairing.
tidier2_framingham <- tidier_framingham %>%
    mutate(education = case_when(
      # Use the format: variable == number ~ "string"
      education == ___ ~ ___,
      # Any value not matched above becomes a missing string
      TRUE ~ NA_character_))

# Check changed education
count(tidier2_framingham, education)
# Hint for the blanks above. The example uses single inner quotes so the hint
# itself remains a valid double-quoted string.
"The form for the `case_when()` should look like `education == 1 ~ '0-11 years'`, for each number-string pairing."
# Completed version of the template.
tidier2_framingham <- tidier_framingham %>%
    mutate(education = case_when(
      # Use the format: variable == number ~ "string"
      education == 1 ~ "0-11 years",
      education == 2 ~ "High School",
      education == 3 ~ "Vocational",
      education == 4 ~ "College",
      # Any value not matched above becomes a missing string
      TRUE ~ NA_character_))

# Check changed education
count(tidier2_framingham, education)
# Message accompanying the completed version.
"Excellent!"

Exercise step

Instructions:

# Exercise template: replace each ___ blank, adding one line per
# number-string pairing.
tidier2_framingham <- tidier_framingham %>% 
    mutate(sex = case_when(
      # Use the format: variable == number ~ "string"
      sex == ___ ~ ___,
      # Any value not matched above becomes a missing string
      TRUE ~ NA_character_))

# Check changed sex
count(tidier2_framingham, sex)
# Hint for the blanks above.
"The form for the `case_when()` should look like `sex == 1 ~ 'Man'`, for each number-string pairing."
# Completed version of the template.
tidier2_framingham <- tidier_framingham %>% 
    mutate(sex = case_when(
      # Use the format: variable == number ~ "string"
      sex == 1 ~ "Man",
      sex == 2 ~ "Woman",
      # Any value not matched above becomes a missing string
      TRUE ~ NA_character_))

# Check changed sex
count(tidier2_framingham, sex)
# Message accompanying the completed version.
"Awesome! You've tidied up discrete values to be understandable to humans!"

Merge factor categories together

Sometimes, categorical variables (as factors or characters) have many levels but only a few observations in one or more of the levels. It might make sense to combine categories together for some analyses or particular questions.

The forcats package has been preloaded as well as the previous tidier2_framingham dataset you tidied.

Instructions:

# Setup: load the tidied dataset, forcats, and dplyr. Anything printed while
# doing so is written to a throwaway temporary file.
capture.output(
  {
    data(tidier2_framingham, package = "acdcourse")
    library(forcats)
    library(dplyr)
    # Remove any existing education_combined column; the exercise creates it.
    tidier2_framingham$education_combined <- NULL
  },
  file = tempfile()
)
# Exercise template: replace each ___ blank.
tidier2_framingham <- tidier2_framingham %>% 
    mutate(education_combined = ___(
        # Merge college and vocational levels
        education, 
        # Form is: "new" = "old"
        ___ = ___,
        ___ = ___))

# Confirm changes to variable
count(tidier2_framingham, ___)
# Hint for the blanks above.
'`fct_recode()` recoding should be in the form `"new name" = "old name"`, for example: `"Post-Secondary" = "College"`.'
# Completed version of the template.
tidier2_framingham <- tidier2_framingham %>% 
    mutate(education_combined = fct_recode(
        # Merge college and vocational levels
        education, 
        # Form is: "new" = "old"; both old levels map to the same new name
        "Post-Secondary" = "College",
        "Post-Secondary" = "Vocational"))

# Confirm changes to variable
count(tidier2_framingham, education_combined)
# Message accompanying the completed version.
"Great! You've combined two factor levels together into a new level."

Variable transformations


Apply variable transformations

There are several types of transformations you can choose from. Which one you choose depends on the question, the type of data and their values (e.g. discrete vs continuous), the statistical method you will use, and how you want your results to be interpreted.

Recall the form for mutate_at() is:

# Schematic form only (not runnable as written).
mutate_at(
    # List the variables to transform in here:
    vars(...), 
    # List functions in here as name-function pairs (e.g. log = log):
    list(name = function, ...)
)

Instructions:

# Setup: load the tidied dataset and dplyr, and define the helper used in the
# exercise. Anything printed is written to a throwaway temporary file.
capture.output(
  {
    data(tidier2_framingham, package = "acdcourse")
    library(dplyr)
    # Reciprocal transformation: returns 1 divided by x.
    invert <- function(x) {
      1 / x
    }
  },
  file = tempfile()
)
# Exercise template: replace each ___ blank.
# Use three transformations on body mass index and cigarettes per day
transformed_framingham <- tidier2_framingham %>% 
    mutate_at(vars(___, ___), 
              list(___ = ___, ___ = ___, ___ = ___))

# Check the created variable summaries
transformed_framingham %>% 
    select(contains(___), 
           contains(___)) %>% 
    summary()
# Hint for the blanks above.
'The `select()` function form should look like `contains("body_mass_index")`.'
# Completed version of the template.
# Use three transformations on body mass index and cigarettes per day
transformed_framingham <- tidier2_framingham %>% 
    mutate_at(vars(body_mass_index, cigarettes_per_day), 
              list(log = log, sqrt = sqrt, invert = invert))

# Check the created variable summaries
transformed_framingham %>% 
    select(contains("body_mass_index"), 
           contains("cigarettes_per_day")) %>% 
    summary()
# Message accompanying the completed version.
"Excellent! You've transformed two variables into several forms."

Compare the different transformations {data-progressive=TRUE}

Visualize how each transformation influences the distribution of the data. Graphing these transformations can provide insight into helping you choose a transformation for the variable.

Since we have several transformations, we want to plot them all on one plot. As we've done several times throughout the course, we need to use a long data format combined with facets to achieve this.

The transformed_framingham dataset you previously wrangled has been loaded.

# Setup: load the transformed dataset plus tidyr, dplyr, and ggplot2. Anything
# printed while doing so is written to a throwaway temporary file.
capture.output(
  {
    data(transformed_framingham, package = "acdcourse")
    library(tidyr)
    library(dplyr)
    library(ggplot2)
  },
  file = tempfile()
)

Exercise step

Instructions:

# Exercise template: replace each ___ blank.
# `scales` is spelled out in full instead of relying on partial argument
# matching of `scale`.
# Plot a histogram of body mass transforms
___ %>%
    # Keep variables with string in variable name
    select(contains(___)) %>%
    gather(variable, value) %>%
    ggplot(aes(x = value)) +
    geom_histogram() +
    facet_wrap(vars(variable), scales = "free")
# Hint for the blanks above.
'Select the variables with `contains("body_mass_index")`.'
# Completed version of the template.
# Plot a histogram of body mass transforms
transformed_framingham %>%
    # Keep variables with string in variable name
    select(contains("body_mass_index")) %>%
    gather(variable, value) %>%
    ggplot(aes(x = value)) +
    geom_histogram() +
    facet_wrap(vars(variable), scales = "free")
# Message accompanying the completed version.
"Amazing!"

Exercise step

Instructions:

# Exercise template: replace the ___ blank.
# `scales` is spelled out in full instead of relying on partial argument
# matching of `scale`.
# Plot a histogram of cigarettes per day transforms
transformed_framingham %>%
    # Keep variables with string in variable name
    select(contains("___")) %>%
    gather(variable, value) %>%
    ggplot(aes(x = value)) +
    geom_histogram() +
    facet_wrap(vars(variable), scales = "free")
# Hint for the blank above.
'Use `contains("cigarettes_per_day")`.'
# Completed version of the template.
# Plot a histogram of cigarettes per day transforms
transformed_framingham %>%
    # Keep variables with string in variable name
    select(contains("cigarettes_per_day")) %>%
    gather(variable, value) %>%
    ggplot(aes(x = value)) +
    geom_histogram() +
    facet_wrap(vars(variable), scales = "free")
# Message accompanying the completed version.
"Great! Check out how each transformation influences the distribution of body mass index and of the number of cigarettes smoked."

How does the distribution change?

Understanding how each transformation influences the units and the distribution of the data is an important step in properly applying these transformations. Try answering these questions about the shape of the data after each transformation.

Both bmi_transforms_plot and cpd_transforms_plot are loaded for you to examine. Looking at the graphs, observe how each transformation influences the distribution of body mass index or cigarettes per day and think about how these new distributions might influence later analyses.

# Load the transformed dataset and the packages needed to draw the plots.
data(transformed_framingham, package = "acdcourse")
library(tidyr)
library(dplyr)
library(ggplot2)

# Histograms of every column whose name contains "body_mass_index", one facet
# per column. `scales` is spelled out in full instead of relying on partial
# argument matching of `scale`.
transformed_framingham %>%
    select(contains("body_mass_index")) %>%
    gather(variable, value) %>%
    ggplot(aes(x = value)) +
    geom_histogram() +
    facet_wrap(vars(variable), scales = "free")

# Same plot for every column whose name contains "cigarettes_per_day".
transformed_framingham %>%
    select(contains("cigarettes_per_day")) %>%
    gather(variable, value) %>%
    ggplot(aes(x = value)) +
    geom_histogram() +
    facet_wrap(vars(variable), scales = "free")
# Multiple-choice check on the transformation plots. Each individual statement
# is marked incorrect with an "Almost" message; "All of the above." is the
# option marked correct, and allow_retry = TRUE permits further attempts.
question(
  "Which statement is true?",
  answer(
    "Square root and scale don't change the distribution but do change the unit.",
    message = "Almost. While this is true, it's not the only true answer.",
    correct = FALSE
  ),
  answer(
    "Logarithm changes the distribution and unit.",
    message = "Almost. While this is true, it's not the only true answer.",
    correct = FALSE
  ),
  answer(
    "Body mass already has a good distribution and has the original unit.",
    message = "Almost. While this is true, it's not the only true answer.",
    correct = FALSE
  ),
  answer(
    "Scale can make interpreting easier as 1 unit = 1 standard deviation of the original unit.",
    message = "Almost. While this is true, it's not the only true answer.",
    correct = FALSE
  ),
  answer(
    "All of the above.",
    message = "Yes! Which type of and when you might transform really depends on the research question, the data values, and how you will want the results from your analyses to be interpreted. This means you need to carefully think about and have justifications for what you do to the data.",
    correct = TRUE
  ),
  allow_retry = TRUE
)


lwjohnst86/acdcourse documentation built on June 18, 2019, 8:26 p.m.