library(learnr)
library(tutorial.helpers)
library(tidyverse)
library(primer.data)
library(brms)
library(broom.mixed)
library(patchwork)

knitr::opts_chunk$set(echo = FALSE)
options(tutorial.exercise.timelimit = 600, 
        tutorial.storage = "local") 

x <- kenya |> 
  filter(rv13 > 0)

rv_p <- x |> 
  ggplot(aes(rv13)) + 
    geom_histogram(bins = 100) +
    labs(x = "Registered Voters",
         y = NULL) 

log_rv_p <- x |> 
  ggplot(aes(log(rv13))) + 
    geom_histogram(bins = 100) +
    labs(x = "Log of Registered Voters",
         y = NULL) +
    expand_limits(y = c(0, 175))

# Create and save all the models we use.

# fit_1 <- brm(formula = income ~ age, 
#              data = trains,
#              family = gaussian(),
#              silent = 2,
#              refresh = 0,
#              seed = 45)
# write_rds(fit_1, "data/fit_1.rds")
fit_1 <- read_rds("data/fit_1.rds")

trains_2 <- trains |> 
  mutate(c_age = age - mean(age))

# fit_1_c <- brm(formula = income ~ c_age, 
#                data = trains_2,
#                family = gaussian(),
#                silent = 2,
#                refresh = 0,
#                seed = 16)
# write_rds(fit_1_c, "data/fit_1_c.rds")
fit_1_c <- read_rds("data/fit_1_c.rds")

trains_3 <- trains |> 
  mutate(s_age = age / sd(age))

# fit_1_s <- brm(formula = income ~ s_age, 
#                data = trains_3,
#                family = gaussian(),
#                silent = 2,
#                refresh = 0,
#                seed = 16)
# write_rds(fit_1_s, "data/fit_1_s.rds")
fit_1_s <- read_rds("data/fit_1_s.rds")

trains_4 <- trains |> 
  mutate(z_age = scale(age))

# fit_1_z <- brm(formula = income ~ z_age, 
#                data = trains_4,
#                family = gaussian(),
#                silent = 2,
#                refresh = 0,
#                seed = 16)
# write_rds(fit_1_z, "data/fit_1_z.rds")
fit_1_z <- read_rds("data/fit_1_z.rds")

no_na_nhanes <- nhanes |> 
  select(height, age) |> 
  drop_na() 

# nhanes_1 <- brm(height ~ age,
#                 data = no_na_nhanes,
#                 family = gaussian(),
#                 silent = 2,
#                 refresh = 0,
#                 seed = 16)
# write_rds(nhanes_1, "data/nhanes_1.rds")
nhanes_1 <- read_rds("data/nhanes_1.rds")

# nhanes_2 <- brm(formula = height ~ age + I(age^2),
#                 data = no_na_nhanes,
#                 family = gaussian(),
#                 silent = 2,
#                 refresh = 0,
#                 seed = 27)
# write_rds(nhanes_2, "data/nhanes_2.rds")
nhanes_2 <- read_rds("data/nhanes_2.rds")

# nhanes_3 <- brm(formula = height ~ 
#                   I(ifelse(age > 18, 18, age)),
#                 data = no_na_nhanes,
#                 family = gaussian(),
#                 silent = 2,
#                 refresh = 0,
#                 seed = 87)
# write_rds(nhanes_3, "data/nhanes_3.rds")
nhanes_3 <- read_rds("data/nhanes_3.rds")

# fit_1_model <- brm(formula = 
#                      att_end ~ treatment + att_start,
#                    data = trains,
#                    family = gaussian(),
#                    silent = 2,
#                    refresh = 0,
#                    seed = 16)
# write_rds(fit_1_model, "data/fit_1_model.rds")
fit_1_model <- read_rds("data/fit_1_model.rds")

# fit_2_model <- brm(formula = income ~ age + liberal,
#                    data = trains,
#                    family = gaussian(),
#                    silent = 2,
#                    refresh = 0,
#                    seed = 16)
# write_rds(fit_2_model, "data/fit_2_model.rds")
fit_2_model <- read_rds("data/fit_2_model.rds")

# fit_liberal <- brm(formula = att_end ~ liberal,
#                    data = trains,
#                    family = gaussian(),
#                    silent = 2,
#                    refresh = 0,
#                    seed = 12)
# write_rds(fit_liberal, "data/fit_liberal.rds")
fit_liberal <- read_rds("data/fit_liberal.rds")

loo_liberal <- loo(fit_liberal)

# fit_att_start <- brm(formula = att_end ~ att_start,
#                      data = trains,
#                      family = gaussian(),
#                      silent = 2,
#                      refresh = 0,
#                      seed = 37)
# write_rds(fit_att_start, "data/fit_att_start.rds")
fit_att_start <- read_rds("data/fit_att_start.rds")

loo_att_start <- loo(fit_att_start) 


Introduction

This tutorial covers Chapter 7: Mechanics of Preceptor’s Primer for Bayesian Data Science: Using the Cardinal Virtues for Inference by David Kane.

In our haste to make progress --- to get all the way through the process of building, interpreting and using models --- we have given short shrift to some of the messy details of model building and evaluation. This chapter fills in those lacunae.

Transforming variables

It is often convenient to transform a predictor variable so that our model makes more sense.

Let's say we are interested in predicting a person's income based on their age. We have a model of income as a function of age: $$ income_i = \beta_0 + \beta_1 age_i + \epsilon_i$$

Exercise 1

Load the brms package.


library(brms)
library(brms)

Since income is a continuous variable, we use the brms package to build Bayesian regression models. A model that establishes the relationship between two variables is called a simple regression model.

Exercise 2

Load the tidyverse package.


library(tidyverse)
library(tidyverse)

The general form of the simple regression model is: $$ Y_i = \beta_0 + \beta_1 X_i + \epsilon_i$$ The model is simple not only because there is only one variable on the right side of the equation, but also because $X$ enters the equation untransformed, so the relationship between $X$ and $Y$ is constant, or linear. Thus, we can also call it linear regression.

Exercise 3

We will be using the trains data set from primer.data for our model. Load the primer.data package.


library(primer.data)
library(primer.data)

$\beta_0$ and $\beta_1$ are the coefficients. When we interpret coefficients, it is important to know the difference between across-unit and within-unit comparisons. The former compares two different units without making any causal claim, while the latter compares one unit under treatment versus under control.

Exercise 4

We will also be using a new package, broom.mixed, which allows us to tidy regression data for plotting. Load the broom.mixed package.


library(broom.mixed)
library(broom.mixed)

In terms of the Preceptor Tables, a within-unit comparison looks at one row of data: we are comparing one unit, or (in this case) one person, to itself under two conditions (control versus treatment). Meanwhile, an across-unit comparison looks at multiple rows of data, with a focus on differences across columns, without making any causal claims.

Exercise 5

Type trains and hit "Run Code".


trains
trains

The model we are using takes only one variable, age, on the right-hand side to predict income. However, are there other variables in the data set that could also predict income, or potentially affect the relationship between age and income? We will discuss this in the following sections.

Exercise 6

Create a model using brm() from the brms. Use the arguments formula = income ~ age, data = trains, family = gaussian(), refresh = 0, silent = 2, and seed = 45. Assign the result to an object called fit_1.


fit_1 <- brm(... = income ~ age, 
             data = ...,
             ... = gaussian(),
             ... = 2,
             refresh = ...,
             ... = 45)

Since the model is linear, we assume that the relationship between income and age is stable. However, this is not true in reality: the effect of increasing age from 5 to 6 is not the same as from 17 to 18, or from 40 to 41.

Exercise 7

Type fit_1 and hit "Run Code." This will return a regression table.


fit_1
fit_1

In the Regression Coefficients section, we can see that the model returns the estimates and the 95% Confidence Intervals for the Intercept ($\beta_0$) and age ($\beta_1$). Recall that when calculating our quantity of interest, which is the coefficient in this case, we need to come up with a posterior probability distribution for our estimates.

Exercise 8

Run fixef() on fit_1 and hit "Run Code."


fixef(...)
fixef(fit_1)

The value of $\beta_0$, the intercept in the regression, is `r scales::comma(round(fixef(fit_1)["Intercept", "Estimate"], 0))`. This represents the estimated average income for a person with an age of zero, which is awkward and useless since there are no people of zero age in our data.

Exercise 9

One way to make the intercept more meaningful is to transform age. Start a pipe with trains to mutate with the argument c_age = age - mean(age). Assign the result to a new object called trains_2.


... <- trains |> 
  mutate(... = age - ...(age))
trains_2 <- trains |> 
  mutate(c_age = age - mean(age))

By doing this, we are subtracting the mean of a variable age in the data set from each of its values. The new distribution will be centered around zero meaning that it will have a mean of zero, but it will retain the same spread (standard deviation) and shape as the original distribution.
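
As a quick check (a sketch, assuming the trains_2 object created above), we can verify that centering moves the mean to zero while leaving the spread unchanged:

# Verify: centering shifts the mean to zero but keeps the spread.
trains_2 |> 
  summarize(mean_c_age = mean(c_age),   # essentially zero
            sd_age = sd(age),
            sd_c_age = sd(c_age))       # identical to sd_age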

Exercise 10

Copy your previous code from Exercise 6, but set formula equal to income ~ c_age and data equal to trains_2. Assign the result to an object called fit_1_c.


fit_1_c <- brm(formula = ..., 
               ... = trains_2,
               family = gaussian(),
               silent = 2,
               refresh = 0,
               seed = 16)

Using this centered version of age does not change the predictions or residuals in the model, but it does make the intercept easier to interpret.

Exercise 11

Type fit_1_c and hit "Run Code." This will return a regression table.


fit_1_c
fit_1_c

Note that the estimate for the Intercept in this model is higher than that of the previous model, which used the original values of age, because the intercept now represents the expected value of the outcome variable income when age is at its mean value (instead of zero).

Exercise 12

Run fixef() on fit_1_c and hit "Run Code."


fixef(fit_1_c)
fixef(fit_1_c)

The intercept, `r scales::comma(round(fixef(fit_1_c)["Intercept", "Estimate"], 0))`, is the expected income for someone with c_age = 0, i.e. someone of an average age in the data, which is around `r round(mean(trains$age), 0)`.

Centering --- changing a variable via addition/subtraction --- often makes the intercept easier to interpret. Scaling --- changing a variable via multiplication/division --- often makes it easier to interpret coefficients.

Exercise 13

The most common scaling method is to divide the variable by its standard deviation. Start a pipe with trains to mutate() with the argument s_age = age / sd(age). Assign the result to a new object called trains_3.


... <- trains |> 
  mutate(... = age / ...(age))
trains_3 <- trains |> 
  mutate(s_age = age / sd(age))

By doing this, we are scaling the variable age so that its values are measured in terms of standard deviations from the mean.
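
A similar check (a sketch, assuming trains_3 from above) confirms that scaling fixes the standard deviation at one but, unlike centering, does not move the mean to zero:

# Verify: scaling sets the standard deviation to one.
trains_3 |> 
  summarize(sd_s_age = sd(s_age),       # one
            mean_s_age = mean(s_age))   # not zero; scaling does not center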

Exercise 14

Create a model using brm() from the brms package. Use the arguments formula = income ~ s_age, data = trains_3, family = gaussian(), refresh = 0, silent = 2, and seed = 16. Assign the result to an object called fit_1_s.


... <- brm(formula = ..., 
               ... = trains_3,
               family = ...,
               ... = 2,
               refresh = ...,
               seed = ...)

s_age is age scaled by its own standard deviation. A change of one unit in s_age is the same as a change of one standard deviation in age, which is about 12 years.

Exercise 15

Type fit_1_s and hit "Run Code." This will return a regression table.


fit_1_s
fit_1_s

In a standardized form, the coefficients of a regression model can be interpreted as the effect of a one standard deviation change in the predictor variable on the outcome variable. The interpretation of $\beta_1$ is now:

When comparing two people, one about one standard deviation's worth of years older than the other, we expect the older person to earn about 11,000 dollars more.

Exercise 16

Run fixef() on fit_1_s and hit "Run Code."


fixef(...)
fixef(fit_1_s)

But, because we scaled without centering, the intercept is now back to the (nonsensical) meaning of the expected income for people of age 0.

Exercise 17

The most common transformation applies both centering and scaling. Start a pipe with trains to mutate with the argument z_age = scale(age). Assign the result to a new object called trains_4.


... <- ... |> 
  mutate(z_age = scale(...))
trains_4 <- trains |> 
  mutate(z_age = scale(age))

The base R function scale() subtracts the mean and divides by the standard deviation. A variable so transformed is a “z-score”, meaning a variable with a mean of zero and a standard deviation of one.
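
We can confirm both properties at once (a sketch, assuming trains_4 from above). Note that scale() returns a one-column matrix, so we coerce it to a plain vector before summarizing:

# Verify: a z-score has mean zero and standard deviation one.
trains_4 |> 
  summarize(mean_z = mean(as.numeric(z_age)),   # essentially zero
            sd_z = sd(as.numeric(z_age)))       # one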

Exercise 18

Create a model using brm() from the brms package. Use the arguments formula = income ~ z_age, data = trains_4, family = gaussian(), refresh = 0, silent = 2, and seed = 16. Assign the result to an object called fit_1_z.


... <- brm(formula = ..., 
              ... = trains_4,
               family = ...,
               ... = 2,
               refresh = ...,
               ... = 16)

Using z-scores makes interpretation easier, especially when we seek to compare the importance of different predictors. Note that, when using z-scores, we would often phrase this comparison in terms of "sigmas." Saying that one person is "one sigma" older than another means that they are one standard deviation older.

Exercise 19

Type fit_1_z and hit "Run Code." This will return a regression table.


fit_1_z
fit_1_z

The two parameters are easy to interpret after this transformation.

The expected income of someone of average age, which is about `r round(mean(trains$age))` in this study, is about `r scales::comma(round(fixef(fit_1_z)["Intercept", "Estimate"], -3))` dollars.

When comparing two individuals who differ in age by one standard deviation, which is about `r round(sd(trains$age))` years in this study, the older person is expected to earn about `r scales::comma(round(fixef(fit_1_z)["z_age", "Estimate"], -3))` dollars more than the younger.

Exercise 20

Run fixef() on fit_1_z and hit "Run Code".


fixef(...)
fixef(fit_1_z)

Note that the term "sigma" might be confusing since we are already using the word "sigma" to mean $\sigma$, the standard deviation of $\epsilon_i$. You will hear the same word "sigma" applied to both concepts, even in the same sentence. Determine the meaning from context.

Exercise 21

It is often helpful to take the log of predictor variables, especially in cases in which their distribution is skewed. We will be using the kenya data set from primer.data. Type kenya and hit "Run Code".


kenya
kenya

Whether or not to take the log of a variable may depend on the conventions in your field. If everyone does X, then you should probably do X, unless you have a good reason not to, in which case you should explain it prominently.

Exercise 22

Pipe kenya to summary() to return some statistics about the data.


... |> 
  summary()
kenya |> 
  summary()

Let's say we are interested in the number of registered voters, rv13. The summary shows that there are no missing values (NAs) in this column. However, there are rows with 0 voters, resulting in a minimum of 0.

Exercise 23

To fix this, pipe kenya to the command filter() with the argument rv13 > 0. Assign the result to an object called x.


... <- ... |> 
  filter(... > 0)
x <- kenya |> 
  filter(rv13 > 0)

You should generally only take the log of variables whose values are all strictly positive. The log of zero or of a negative number is not defined.
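
You can see why in R itself. A quick illustration: log() returns -Inf at zero and NaN (with a warning) for negative inputs, either of which would break a model fit.

log(c(100, 1, 0, -5))
# [1] 4.60517 0.00000    -Inf     NaN   (the NaN comes with a warning)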

Exercise 24

Next, we will look at the distribution of rv13 in our data. Behind the scenes, we have created the graph for you. Type rv_p and hit "Run Code".


rv_p
rv_p

Using the raw data, the distribution is heavily skewed to the right. However, we do not know the "true" model: who is to say that a model using the raw values is right or wrong?

Exercise 25

Now let's see what the distribution looks like after transforming rv13 into its log version. Type log_rv_p and hit "Run Code".


log_rv_p
log_rv_p

The distribution after taking the log is more symmetric and closer to a normal distribution. Check whether or not this choice meaningfully affects the answer to your question.

Exercise 26

Let's put the two distributions next to each other to see the differences. Type rv_p and log_rv_p, connecting them with +. Hit "Run Code".


... + ...
rv_p + log_rv_p

Our inferences are often fairly "robust" to small changes in the model. If you get the same answer with rv13 as with log(rv13), then no one cares which you use.

Exercise 27

Lastly, add a title and subtitle to the graph using the command plot_annotation().


... + ... + 
  plot_annotation(title = ..., 
                  subtitle = ...)

Remember that this is what your graph should look like.

rv_p + log_rv_p +
  plot_annotation(title = 'Registered Votes In Kenya Communities',
                  subtitle = "Taking logs helps us deal with outliers")
rv_p + log_rv_p +
  plot_annotation(title = "Registered Votes In Kenya Communities",
                  subtitle = "Taking logs helps us deal with outliers")

Most professionals, when presented with data distributed like rv13, would take the log. Professionals (irrationally?) hate outliers. Any transformation which makes a distribution look more normal is generally considered a good idea.

Exercise 28

Instead of simply transforming variables, we can add more terms which are transformed versions of a variable. We will be using the nhanes data set. Type nhanes and hit "Run Code."


nhanes
nhanes

The following exercises build up in complexity: the more complex our model, the better the predictions it makes and thus the better the answers it gives to our questions. Complexity here refers to the number of variables we put into our formulas, whether we transform them or use their raw versions, and so on.

Exercise 29

Consider the relation of height to age in nhanes. Pipe nhanes to select() with height and age as the arguments.


... |> 
  select(..., ...)
nhanes |> 
  select(height, age)

Is age the best predictor of height? There are other potential variables in the data set, such as race and sex. Which ones should we select, and which should we discard? We will see in later sections.

Exercise 30

Let’s start by dropping the missing values. Copy your previous code, continue the pipe with drop_na(). Assign the result to an object called no_na_nhanes.


... <- nhanes |> 
  select(..., ...) |> 
  ...
no_na_nhanes <- nhanes |> 
  select(height, age) |> 
  drop_na() 

When building models, we rely on assumptions to construct a mathematical formula that describes the relationship between variables. The fewer assumptions we need to make, the more robust the model is at reflecting the true underlying patterns in the data.

Exercise 31

Let's create a model to study the relationship between height and age. Create a model using brm() from the brms package. Use the arguments formula = height ~ age, data = no_na_nhanes, family = gaussian(), refresh = 0, silent = 2, and seed = 16. Assign the result to an object called nhanes_1.


... <- brm(formula = ...,
                data = ...,
                family = ...,
                ... = 2,
                refresh = ...,
                ... = 16)

This model assumes the relationship between age and height is always constant, meaning that height always increases as age increases. That is obviously not a very good model; we will soon see why.

Exercise 32

How accurate our model is can be judged by how well it represents the pattern in our data. Hit "Run Code".

no_na_nhanes |> 
  ggplot(aes(x = age, y = height)) +
    geom_point(alpha = 0.1) +
    geom_line(aes(y = fitted(nhanes_1)[, "Estimate"]),
             color = "red",
             linewidth = 2) +
    labs(title = "Age and Height",
         subtitle = "Children are shorter, but a linear fit is poor",
         x = "Age",
         y = "Height (cm)",
         caption = "Data source: NHANES")
no_na_nhanes |> 
  ggplot(aes(x = age, y = height)) +
    geom_point(alpha = 0.1) +
    geom_line(aes(y = fitted(nhanes_1)[, "Estimate"]),
             color = "red",
             linewidth = 2) +
    labs(title = "Age and Height",
         subtitle = "Children are shorter, but a linear fit is poor",
         x = "Age",
         y = "Height (cm)",
         caption = "Data source: NHANES")

From the graph, we see that before the age of 20, the data points cluster along a steep line, indicating that people grow quickly during this period. After 20, however, there is little or no increase in height as age increases. The red line created by the model does not capture this pattern.

Exercise 33

Run the brm() command. Use the arguments formula = height ~ age + I(age^2), data = no_na_nhanes, family = gaussian(), refresh = 0, silent = 2, and seed = 27. Assign the result to an object called nhanes_2.


... <- brm(... = height ~ age + I(age^2),
                data = ...,
                ... = gaussian(),
                silent = ...,
                ... = 0,
                seed = ...)

Note the need for I() when creating the squared term within the formula argument. Adding a quadratic term improves the model because it relaxes the assumption of a linear relationship, which does not hold between these two variables.

Exercise 34

Let's see whether this model is better in terms of demonstrating the relationship between age and height in our data set. Hit "Run Code".

no_na_nhanes |> 
  ggplot(aes(x = age, y = height)) +
    geom_point(alpha = 0.1) +
    geom_line(aes(y = fitted(nhanes_2)[, "Estimate"]), 
              color = "red") +
    labs(title = "Age and Height",
         subtitle = "Quadratic fit is much better, but still poor",
         x = "Age",
         y = "Height (cm)",
         caption = "Data source: NHANES")
no_na_nhanes |> 
  ggplot(aes(x = age, y = height)) +
    geom_point(alpha = 0.1) +
    geom_line(aes(y = fitted(nhanes_2)[, "Estimate"]), 
              color = "red") +
    labs(title = "Age and Height",
         subtitle = "Quadratic fit is much better, but still poor",
         x = "Age",
         y = "Height (cm)",
         caption = "Data source: NHANES")

This line looks better, but not the best we could do. We have not used our background knowledge that people don't get any taller after age 18 or so.

Exercise 35

Let's create variables which capture that break. Run the brm() command. Use the arguments formula = height ~ I(ifelse(age > 18, 18, age)), data = no_na_nhanes, family = gaussian(), refresh = 0, silent = 2, and seed = 87. Assign the result to an object called nhanes_3.


... <- brm(... = ... ~ I(ifelse(age > 18, 18, age)),
                data = ...,
                ... = gaussian(),
                silent = ...,
                ... = 0,
                seed = ...)

Using the ifelse() command, we check every value of age in the data set: values of 18 or below are kept as-is, while every value above that threshold is recorded as 18. The model does not know that people (generally) stop growing after the age of 18, so we build that knowledge in by telling it that age-related changes in height are relevant only up to age 18.
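
To see what the ifelse() call does on its own, here is a minimal sketch with a few made-up ages:

ages <- c(5, 16, 18, 30, 45)
ifelse(ages > 18, 18, ages)
# [1]  5 16 18 18 18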

Exercise 36

Let's see how this model does in terms of demonstrating the relationship between the two variables. Hit "Run Code".

no_na_nhanes |> 
  ggplot(aes(x = age, y = height)) +
    geom_point(alpha = 0.1) +
    geom_line(aes(y = fitted(nhanes_3)[, "Estimate"]), 
              color = "red") +
    labs(title = "Age and Height",
         subtitle = "Domain knowledge makes for better models",
         x = "Age",
         y = "Height (cm)",
         caption = "Data source: NHANES")
no_na_nhanes |> 
  ggplot(aes(x = age, y = height)) +
    geom_point(alpha = 0.1) +
    geom_line(aes(y = fitted(nhanes_3)[, "Estimate"]), 
              color = "red") +
    labs(title = "Age and Height",
         subtitle = "Domain knowledge makes for better models",
         x = "Age",
         y = "Height (cm)",
         caption = "Data source: NHANES")

The new line correctly captures the pattern in our data. Note that models only do what we tell them to do. We are the captains of our souls, using the knowledge we have to transform variables as needed.

Selecting variables

How do we decide which variables to include in a model? There is no one right answer to this question.

Exercise 1

We will be using variables in the trains data set as an example. Type trains and hit "Run Code". The trains data set includes gender, liberal, party, age, income, att_start, treatment, att_end.


trains
trains

Which variables would be best to include in a model depends on the question we are asking. For example, if we want to know if the ending attitude toward immigration differs between men and women, we need to include gender in the model.

Exercise 2

Let's say we want to investigate the causal effect of exposing people to Spanish-speakers on their attitudes towards immigration. Pipe trains to select() with att_end, treatment, att_start as the arguments.


... |> 
  select(..., ..., ...)
trains |> 
  select(att_end, treatment, att_start)

We also keep X if the underlying theory or observation suggests that X has a meaningful connection to the outcome variable. It can also be the case that it is standard in your field to include X in such regressions, or simply that your boss wants to include X.

Exercise 3

Run the brm() command. Use the arguments formula = att_end ~ treatment + att_start, data = trains, family = gaussian(), silent = 2, refresh = 0, seed = 16. Assign the result to an object called fit_1_model.


... <- brm(... = att_end ~ treatment + att_start,
                   data = ...,
                   ... = gaussian(),
                   silent = ...,
                   ... = 0,
                   seed = ...)

att_end, the variable before the tilde, is our outcome. The explanatory variables are treatment, which says whether a commuter received the treatment or control condition, and att_start, which measures attitude at the start of the study.

Exercise 4

Type fit_1_model and hit "Run Code."


fit_1_model
fit_1_model

Deciding if a variable is worth including in our model depends on whether it has a large and well-estimated coefficient. This means, roughly, that the 95% confidence interval excludes zero.

Exercise 5

Run fixef() on fit_1_model. Hit "Run Code".


fixef(...)
fixef(fit_1_model)

The 95% confidence interval for the coefficient of treatmentControl is -1.43 to -0.45. This means that, if we compare two people, one of whom received the treatment (i.e., heard Spanish speakers on the platform) and one of whom did not, we would expect the one who did not (i.e., the one who was a "control" in our experiment) to have a lower score after the experiment.

Exercise 6

How do we decide which variables are useful? First, let's interpret our coefficients. In your own words, describe the magnitude of the relationship between the variable treatmentControl and our outcome variable, att_end, and what that means.

(Hint: Look at the sign of the estimate for that coefficient.)

question_text(NULL,
    message = "The point estimate for `treatmentControl` is -0.95, indicating the negative effect of being in the control group (not being exposed to Spanish-speakers) on the attitude towards immigration at the end of the experiment.",
    answer(NULL, correct = TRUE),
    allow_retry = FALSE,
    incorrect = NULL,
    rows = 6)

The coefficient treatmentControl represents the offset in att_end from the estimate for our Intercept for the group of people who were in the control group. This means that, compared with the predicted att_end for those under treatment, those in the control group have a predicted attitude that is almost one entire point lower.

Exercise 7

In your own words, describe the magnitude of the relationship between att_start and att_end. Again, look at the sign of the estimated coefficient.

question_text(NULL,
    message = "The estimated coefficient for `att_start` is positive, indicating that the attitude at the end of the experiment is higher for every one unit increase in `att_start`.",
    answer(NULL, correct = TRUE),
    allow_retry = FALSE,
    incorrect = NULL,
    rows = 6)

Recall that we keep variables that have a "large" coefficient, which can only be defined in the context of the specific model. Speaking roughly, removing a variable with a large coefficient meaningfully changes the predictions of the model.

Exercise 8

Referring back to our regression table for fit_1_model, what is the 95% Confidence Interval for treatmentControl? Round to 2 decimal places.

question_text(NULL,
    message = "The 95% CI for `treatmentControl` is [-1.44; -0.45].",
    answer(NULL, correct = TRUE),
    allow_retry = FALSE,
    incorrect = NULL,
    rows = 6)

The 95% CI of this variable excludes zero, suggesting that it is worth including in our model.

Exercise 9

What is the 95% Confidence Interval for att_start? Round to 2 decimal places.

question_text(NULL,
    message = "The 95% CI for `att_start` is [0.72; 0.88].",
    answer(NULL, correct = TRUE),
    allow_retry = FALSE,
    incorrect = NULL,
    rows = 6)

Applying the same requirement that the 95% Confidence Interval does NOT include zero, att_start is also a meaningful variable to our model. Therefore, our conclusion is to keep both variables, treatment and att_start.

Exercise 10

Let us now consider income as a function of age and liberal, a proxy for political party. Pipe trains to select() with income, age, liberal as the arguments.


... |> 
  select(..., ..., ...)
trains |> 
  select(income, age, liberal)

The rule of thumb is to keep variables for which the estimated coefficient is more than two standard errors away from zero. Some of these variables won’t “matter” much to the model since their coefficients, although well-estimated, are small enough that removing the variable from the model will not affect the model’s predictions very much.
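
One way to apply this rule of thumb (a sketch, assuming a fitted brms model such as fit_1_model from above) is to divide each estimate by its standard error and flag coefficients whose absolute ratio exceeds two:

# Which coefficients are more than two standard errors from zero?
coefs <- fixef(fit_1_model)
abs(coefs[, "Estimate"] / coefs[, "Est.Error"]) > 2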

Exercise 11

Run the brm() command. Use the arguments formula = income ~ age + liberal, data = trains, family = gaussian(), silent = 2, refresh = 0, seed = 16. Assign the result to an object called fit_2_model.


... <- brm(... = income ~ age + liberal,
                   data = ...,
                   ... = gaussian(),
                   silent = ...,
                   ... = 0,
                   seed = ...)

The variable before the tilde, income, is our outcome. The explanatory variables are liberal, a logical value of TRUE or FALSE, and age, a numeric variable.

Exercise 12

Type fit_2_model and hit "Run Code."


fit_2_model
fit_2_model

The Intercept estimates income where liberal == FALSE and age = 0. Therefore, it is the estimated income for commuters who are not liberal and who have an age of zero. The estimate for age shows the increase in income with every additional year of age.

Exercise 13

Run fixef() on fit_2_model.


fixef(...)
fixef(fit_2_model)

The estimate for liberalTRUE represents the offset in predicted income for commuters who are liberal. To find the estimate, we must add the coefficient to our Intercept value. We see that, on average, liberal commuters make less money.

Exercise 14

In your own words, describe the magnitude of the relationship between liberalTRUE and income. Again, look at the sign of the estimated coefficient.

question_text(NULL,
    message = "The coefficient for `liberalTRUE` is -32434 (negative), indicating that the liberals in our data set make less on average than non-liberals.",
    answer(NULL, correct = TRUE),
    allow_retry = FALSE,
    incorrect = NULL,
    rows = 6)

It is important to note that we are not looking at a causal relationship for either liberal or age. We are noting the differences between two groups without making causal claims. Recall the difference between across-unit and within-unit comparisons.

Exercise 15

What is the 95% Confidence Interval for liberalTRUE? Round to 2 decimal places.

question_text(NULL,
    message = "The 95% CI for `liberalTRUE` is [-59756.45; -5266.50].",
    answer(NULL, correct = TRUE),
    allow_retry = FALSE,
    incorrect = NULL,
    rows = 6)

With rough mental math, we see that the 95% confidence interval excludes 0. Therefore, liberal is a helpful variable.

Exercise 16

What is the 95% Confidence Interval for age? Round to 2 decimal places.

question_text(NULL,
    message = "The 95% CI for `age` is [-22.75; 2177.09].",
    answer(NULL, correct = TRUE),
    allow_retry = FALSE,
    incorrect = NULL,
    rows = 6)

The variable age, however, does not appear to matter much, since its coefficient is small and the 95% confidence interval does not exclude 0. Therefore, keep liberal; whether to include age is really a matter of preference at this point.

Comparing models in theory

Deciding which variables to include in a model is a subset of the larger question: How do we decide which model, out of the set of possible models, to choose?

Exercise 1

Consider the first model, which seeks to explain attitudes towards immigration among Boston commuters using political ideology (liberal) as the explanatory variable. Run the brm() command. Use the arguments formula = att_end ~ liberal, data = trains, family = gaussian(), silent = 2, refresh = 0, seed = 12. Assign the result to an object called fit_liberal.


fit_liberal <- brm(... = att_end ~ liberal,
                   data = ...,
                   ... = gaussian(),
                   silent = ...,
                   ... = 0,
                   seed = ...)

A better model not only makes better predictions on the current data set but also makes better predictions on new data. For instance, if we were to predict how someone's attitude toward immigration changes among Boston commuters based on political affiliation, we would want to go out and test our theories on new Boston commuters.

Exercise 2

Run fixef() on fit_liberal and hit "Run Code".


fixef(...)
fixef(fit_liberal)

The coefficient for liberalTRUE is negative, which makes sense. People who are liberal have more liberal attitudes about immigration, so we would expect their att_end scores to be lower.

Exercise 3

Consider another model which seeks to explain the attitudes towards immigration among Boston commuters, using att_start as the explanatory variable. Run the brm() command. Use the arguments formula = att_end ~ att_start, data = trains, family = gaussian(), silent = 2, refresh = 0, seed = 37. Assign the result to an object called fit_att_start.


... <- brm(... = att_end ~ att_start,
                     data = ...,
                     ... = gaussian(),
                     silent = ...,
                     ... = 0,
                     seed = ...)

When thinking of generalizing our model's prediction to new data, it is important to consider what is relevant new data in the context of the modeling problem. Some models are used to predict the future and, in those cases, we can wait and eventually observe the future and check how good our model is for making predictions.

Exercise 4

Run fixef() on fit_att_start and hit "Run Code".


fixef(...)
fixef(fit_att_start)

We would also expect people to provide similar answers in two surveys administered a week or two apart. It makes sense that those with higher (more conservative) values for att_start would also have higher values for att_end.

Exercise 5

The most obvious criterion for comparing models is the accuracy of their predictions. For example, consider the use of liberal to predict att_end. Pipe trains to mutate() with the argument pred_liberal = fitted(fit_liberal)[,"Estimate"].


... |> 
  mutate(... = fitted(...)[,"Estimate"])
trains |> 
  mutate(pred_liberal = fitted(fit_liberal)[,"Estimate"])

By doing this, we are creating a new column named pred_liberal that takes on the predicted values for att_end generated by the fit_liberal model. Adding [, "Estimate"] extracts just the "Estimate" column from the predictions, ignoring other columns like standard errors or confidence intervals.
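
To see why the [, "Estimate"] subsetting is needed, inspect the raw output of fitted(). For a brms model it is a matrix with one row per observation and four columns:

# First rows of the prediction matrix: Estimate, Est.Error, Q2.5, Q97.5.
head(fitted(fit_liberal))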

Exercise 6

Copy the previous code, continue the pipe to select() and choose pred_liberal and att_end as the argument.


... |> 
  select(att_end, pred_liberal)
trains |> 
  mutate(pred_liberal = fitted(fit_liberal)[,"Estimate"]) |> 
  select(att_end, pred_liberal)

The att_end column contains the actual attitude towards immigration at the end of the experiment, while pred_liberal contains the values for att_end predicted by the fit_liberal model. Note that there are rows where the prediction matches the actual value, but that's not always the case.

Exercise 7

Copy your code from Exercise 5, continue the pipe with ggplot(). Map x to pred_liberal, y to att_end in the aes() argument. This will return a plain graph with two axes.


... |> 
  ggplot(...(x = ..., y = ...))
trains |> 
  mutate(pred_liberal = fitted(fit_liberal)[,"Estimate"]) |> 
  ggplot(aes(x = pred_liberal, y = att_end))

For each data point, the x axis gives the predicted value of att_end for an individual and the y axis gives the corresponding actual value. For some individuals, these are perfect predictions, but for others, they are poor predictions.

Exercise 8

Copy the previous code, add geom_jitter() as a another layer. Set width equal to 0.05, height equal to 0.2, and alpha equal to 0.5.


... +
    geom_jitter(width = 0.05, height = 0.2, alpha = 0.5)
trains |> 
  mutate(pred_liberal = fitted(fit_liberal)[,"Estimate"]) |> 
  ggplot(aes(x = pred_liberal, y = att_end)) +
    geom_jitter(width = 0.05, height = 0.2, alpha = 0.5)

Because there are only two possible values for liberal --- TRUE and FALSE --- there are only two predictions which this model will make: about 10 for liberal == FALSE and about 8 for liberal == TRUE. (The points in the above plot are jittered.)

Exercise 9

Copy the previous code, add the annotate() command as another layer. The first argument is "point", set x equal to 8, y equal to 8, size equal to 20, pch equal to 1, and color equal to "red".


... + 
    annotate("...", ... = 8, ... = 8, size = ..., pch = ..., color = "red")
trains |> 
  mutate(pred_liberal = fitted(fit_liberal)[,"Estimate"]) |> 
  ggplot(aes(x = pred_liberal, y = att_end)) +
    geom_jitter(width = 0.05, height = 0.2, alpha = 0.5) + 
    annotate("point", x = 8, y = 8, size = 20, pch = 1, color = "red")

By running this command, we add a red circle where our predictions are most accurate (where x and y values are the same, which is where our predictions equal the true attitudes).

Exercise 10

Copy the previous code, add another annotate() command. The first argument is "point", set x equal to 10, y equal to 10, size equal to 20, pch equal to 1, and color equal to "red". Setting pch equal to 1 makes the inside of the point translucent to show the number of correct predictions.


... + 
    annotate("point", x = 10, y = 10, size = 20, pch = 1, color = "red")
trains |> 
  mutate(pred_liberal = fitted(fit_liberal)[,"Estimate"]) |> 
  ggplot(aes(x = pred_liberal, y = att_end)) +
    geom_jitter(width = 0.05, height = 0.2, alpha = 0.5) + 
    annotate("point", x = 8, y = 8, size = 20, pch = 1, color = "red") + 
    annotate("point", x = 10, y = 10, size = 20, pch = 1, color = "red")

As we can see, the model isn't great at predicting att_end. (Note the two individuals who are liberal == TRUE, and who the model thinks will have att_end == 8, but who have att_end == 15. The model got them both very, very wrong.)

Exercise 11

Finally, add title, subtitle, labels for x and y.


... + 
  labs(title = ...,
       subtitle = ..., 
       x = ..., 
       y = ...)

Remember that your graph should look like this.

trains |> 
  mutate(pred_liberal = fitted(fit_liberal)[,"Estimate"]) |> 
  ggplot(aes(x = pred_liberal, y = att_end)) +
    geom_jitter(width = 0.05, height = 0.2, alpha = 0.5) +
    annotate("point", x = 8, y = 8, size = 20, pch = 1, color = "red") +
    annotate("point", x = 10, y = 10, size = 20, pch = 1, color = "red") +
    labs(title = "Modeling Attitude Toward Immigration",
         subtitle = "Liberals are less conservative",
         x = "Predicted Attitude",
         y = "True Attitude")
trains |> 
  mutate(pred_liberal = fitted(fit_liberal)[,"Estimate"]) |> 
  ggplot(aes(x = pred_liberal, y = att_end)) +
    geom_jitter(width = 0.05, height = 0.2, alpha = 0.5) +
    annotate("point", x = 8, y = 8, size = 20, pch = 1, color = "red") +
    annotate("point", x = 10, y = 10, size = 20, pch = 1, color = "red") +
    labs(title = "Modeling Attitude Toward Immigration",
         subtitle = "Liberals are less conservative",
         x = "Predicted Attitude",
         y = "True Attitude")

The most sensible way to test a model is to use the model to make predictions and compare those predictions to new data. After fitting the model using brm(), we would use posterior_predict() to obtain simulations representing the predictive distribution for new cases.
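
Here is a minimal sketch of that workflow, using a hypothetical new commuter (the newdata row is made up for illustration). posterior_predict() returns a matrix of simulated outcomes with one column per new case:

new_commuter <- tibble(liberal = TRUE)
pp <- posterior_predict(fit_liberal, newdata = new_commuter)
dim(pp)    # draws x 1: thousands of simulated att_end values
mean(pp)   # the average prediction for this new case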

Exercise 12

Consider another model, using att_start to forecast att_end. Pipe trains to mutate() with the argument pred_as = fitted(fit_att_start)[,"Estimate"].


... |> 
  mutate(... = fitted(...)[,"Estimate"])
trains |> 
  mutate(pred_as = fitted(fit_att_start)[,"Estimate"])

Similar to the previous model, we create a new column that contains the predicted values for att_end, but from a different model, fit_att_start. Then we again compare our predictions with the actual values.

Exercise 13

Copy the previous code, continue the pipe to select() and choose pred_as and att_end as the argument.


... |> 
  select(pred_as, att_end)
trains |> 
  mutate(pred_as = fitted(fit_att_start)[,"Estimate"]) |> 
  select(pred_as, att_end)

Note that the predictions can be larger than, smaller than, or in some cases equal to the actual values. If our prediction is smaller than the actual value, then we have underestimated the actual value, and vice versa.

Exercise 14

Copy your code from Exercise 12, continue the pipe with ggplot(). Map x to pred_as, y to att_end in the aes() argument. This will return a plain graph with two axes.


... |> 
  ggplot(...(x = ..., y = ...))
trains |> 
  mutate(pred_as = fitted(fit_att_start)[,"Estimate"]) |> 
  ggplot(aes(x = pred_as, y = att_end))

Because att_end takes on `r length(unique(trains$att_end))` unique values, the model makes `r length(unique(trains$att_end))` unique predictions. Some of those predictions are perfect! But others are very wrong.

Exercise 15

Copy the previous code, add geom_jitter() as another layer. Set width equal to 0.05, height equal to 0.2, and alpha equal to 0.5.


... +
    geom_jitter(width = 0.05, height = 0.2, alpha = 0.5)
trains |> 
  mutate(pred_as = fitted(fit_att_start)[,"Estimate"]) |> 
  ggplot(aes(x = pred_as, y = att_end)) +
    geom_jitter(width = 0.05, height = 0.2, alpha = 0.5)

Note that when we map the data points, they are spread throughout the graph instead of clustering around just two values like in the previous model. This is because liberal takes only two values (TRUE and FALSE), while att_start can vary, which leads to more variation in the predicted values.

Exercise 16

Copy the previous code, add geom_abline() as another layer. Set intercept equal to 0, slope equal to 1, and color equal to "red". This will insert a red line where our predictions are correct using geom_abline() with an intercept, slope and color.



trains |> 
  mutate(pred_as = fitted(fit_att_start)[,"Estimate"]) |> 
  ggplot(aes(x = pred_as, y = att_end)) +
    geom_jitter(width = 0.05, height = 0.2, alpha = 0.5) + 
    geom_abline(intercept = 0, slope = 1, color = "red")

Note the individual with a predicted att_end of around 9 but with an actual value of 15. That is a big miss!

Exercise 17

Finally, add title, subtitle, labels for x and y axes for your graph.


... + 
  labs(title = ..., 
       subtitle = ...,
       x = ...,
       y = ...)

Remember this is what your graph should look like.

trains |> 
  mutate(pred_as = fitted(fit_att_start)[,"Estimate"]) |> 
  ggplot(aes(x = pred_as, y = att_end)) +
    geom_jitter(width = 0.05, height = 0.2, alpha = 0.5) + 
  geom_abline(intercept = 0, slope = 1, color = "red") +
    labs(title = "Modeling Attitude Toward Immigration",
         subtitle = "Survey responses are somewhat consistent",
         x = "Predicted Attitude",
         y = "True Attitude")
trains |> 
  mutate(pred_as = fitted(fit_att_start)[,"Estimate"]) |> 
  ggplot(aes(x = pred_as, y = att_end)) +
    geom_jitter(width = 0.05, height = 0.2, alpha = 0.5) + 
  geom_abline(intercept = 0, slope = 1, color = "red") +
    labs(title = "Modeling Attitude Toward Immigration",
         subtitle = "Survey responses are somewhat consistent",
         x = "Predicted Attitude",
         y = "True Attitude")

Rather than looking at individual cases, we need to look at the errors for all the predictions. Fortunately, a prediction error is the same thing as a residual, which is easy enough to calculate.

Exercise 18

Pipe trains to select() with att_end, att_start, and liberal.


... |> 
  ...(..., ..., ...)
trains |> 
  select(att_end, att_start, liberal)

A residual is the difference between the observed value of the dependent variable (actual value) and the predicted value (fitted value) produced by a regression model.

Exercise 19

Continue the pipe with mutate(). Set pred_lib equal to fitted(fit_liberal)[,"Estimate"] as the argument.


... |> 
  mutate(... = fitted(...)[,"Estimate"])
trains |> 
  select(att_end, att_start, liberal) |> 
  mutate(pred_lib = fitted(fit_liberal)[,"Estimate"])

In mathematical terms, for each observation $i$:

$$\text{Residual}_i = y_i - \hat{y}_i$$

where $y_i$ is the actual value of the dependent variable, att_end, and $\hat{y}_i$ is the predicted value of att_end generated by the model.
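
For a brms model we do not have to compute this by hand: the residuals() method returns a summary of $y_i - \hat{y}_i$ for each observation. (The exercises below compute the difference in the opposite order, prediction minus actual; since we square the errors later, the sign does not matter.) A sketch:

# One row per observation; the Estimate column is y_i minus the fitted value.
head(residuals(fit_liberal))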

Exercise 20

Add another column resid_lib equal to pred_lib - att_end to the current tibble using the mutate() command.


...|> 
  mutate(resid_lib = pred_lib - att_end)
trains |> 
  select(att_end, att_start, liberal) |> 
  mutate(pred_lib = fitted(fit_liberal)[,"Estimate"]) |> 
  mutate(resid_lib = pred_lib - att_end)

Larger residuals mean larger deviations from the actual values. Smaller residuals indicate a model that fits the data well.

Exercise 21

Continue the pipe with mutate(), create a new column called pred_as and set it equal to fitted(fit_att_start)[,"Estimate"].


... |> 
  mutate(pred_as = fitted(fit_att_start)[,"Estimate"])
trains |> 
  select(att_end, att_start, liberal) |> 
  mutate(pred_lib = fitted(fit_liberal)[,"Estimate"]) |> 
  mutate(resid_lib = pred_lib - att_end) |> 
  mutate(pred_as = fitted(fit_att_start)[,"Estimate"])

Sadly, it is not wise to simply select the model which fits the data best, because doing so can be misleading. We are using our data to select parameters and then, after using the data once, turning around and "checking" to see how well our model fits that same data. It better fit!

Exercise 22

Finally, calculate the residual when using att_start to forecast att_end. Using mutate(), add resid_as and set it equal to pred_as - att_end.



trains |> 
  select(att_end, att_start, liberal) |> 
  mutate(pred_lib = fitted(fit_liberal)[,"Estimate"]) |> 
  mutate(resid_lib = pred_lib - att_end) |> 
  mutate(pred_as = fitted(fit_att_start)[,"Estimate"]) |> 
  mutate(resid_as = pred_as - att_end)

If the only criteria we cared about was how well the model predicts using the data on which the parameters were estimated, then a model with more parameters will always be better. But that is not what truly matters. What matters is how well the model works on data which was not used to create the model.

Exercise 23

Let's look at the square root of the average squared error. Pipe trains to select() to pull out att_end, att_start and liberal.


... |> 
  ...(..., ..., ...) 
trains |> 
  select(att_end, att_start, liberal) 

There are many different measures of the error which we might calculate. The squared difference is most common for historical reasons: it was the most mathematically tractable in the pre-computer age.
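
The squared difference is not the only option, though. As a sketch (assuming the fitted models from above), here is the mean absolute error computed in the same pipeline style:

trains |> 
  summarize(lib_mae = mean(abs(fitted(fit_liberal)[, "Estimate"] - att_end)),
            as_mae = mean(abs(fitted(fit_att_start)[, "Estimate"] - att_end)))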

Exercise 24

Continue the pipe with mutate(). Create a column named lib_err and set it equal to (fitted(fit_liberal)[,"Estimate"] - att_end)^2.


... |>
  ...(... = (fitted(...)[,"Estimate"] - ...)^2)
trains |> 
  select(att_end, att_start, liberal) |> 
  mutate(lib_err = (fitted(fit_liberal)[,"Estimate"] - att_end)^2) 

Since residuals can be negative, positive and negative errors could cancel when summed, and we could not take the square root of a negative total. Thus, we square the errors so that all values are positive.

Exercise 25

Continue the pipe with mutate(). Create a column named as_err and set it equal to (fitted(fit_att_start)[,"Estimate"] - att_end)^2.


... |> 
  ...(... = (fitted(...)[,"Estimate"] - ...)^2)
trains |> 
  select(att_end, att_start, liberal) |> 
  mutate(lib_err = (fitted(fit_liberal)[,"Estimate"] - att_end)^2) |> 
  mutate(as_err = (fitted(fit_att_start)[,"Estimate"] - att_end)^2)

Having calculated a squared difference for each observation, we can sum them or take their average or take the square root of their average.

Exercise 26

Continue the pipe with summarize(). The first argument is lib_sigma and set it equal to sqrt(mean(lib_err)), the second argument is as_sigma and set it equal to sqrt(mean(as_err)).


... |> 
  summarize(lib_sigma = ...,
            as_sigma = ...)
trains |> 
  select(att_end, att_start, liberal) |> 
  mutate(lib_err = (fitted(fit_liberal)[,"Estimate"] - att_end)^2) |> 
  mutate(as_err = (fitted(fit_att_start)[,"Estimate"] - att_end)^2) |> 
  summarize(lib_sigma = sqrt(mean(lib_err)),
            as_sigma = sqrt(mean(as_err))) 

The better model is the one with the smaller root mean squared error. as_sigma is lower than lib_sigma, and thus fit_att_start is the better model.

Comparing models in practice

In cross validation, part of the data is used to fit the model, while the rest of the data is used as a proxy for future data. We can hold out individual observations, called leave-one-out (LOO) cross validation; or groups of observations, called leave-one-group-out cross validation; or use past data to predict future observations, called leave-future-out cross validation.

Exercise 1

First, we will familiarize ourselves with our first model, fit_liberal. Run fixef() on fit_liberal to print out the previous result.


fixef(...)
fixef(fit_liberal)

In essence: One piece of data is excluded from our model, the model is re-fit, the model attempts to predict the value of the missing piece, we compare the true value to the predicted value, and we assess the accuracy of our model's prediction. This process occurs for each piece of data, allowing us to assess the model's accuracy in making predictions.
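
To make the procedure concrete, here is a hand-rolled leave-one-out loop as a sketch. We use lm() instead of brm() because refitting a Bayesian model once per observation would be slow; the logic is the same.

# For each row: fit without it, predict it, record the held-out error.
errors <- map_dbl(seq_len(nrow(trains)), function(i) {
  fit <- lm(att_end ~ liberal, data = trains[-i, ])
  pred <- predict(fit, newdata = trains[i, ])
  unname(trains$att_end[i] - pred)
})
sqrt(mean(errors^2))   # out-of-sample RMSE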

Exercise 2

Now, we will perform loo() on our model and look at the results. Run the loo() command on fit_liberal and assign the result to an object called loo_liberal. Type loo_liberal in a separate line and hit "Run Code".


... <- loo(...)

loo_liberal
loo_liberal <- loo(fit_liberal)

loo_liberal

elpd_loo is the estimated log score, along with a standard error representing uncertainty due to using only 115 data points. p_loo is the estimated "effective number of parameters" in the model. looic is the LOO information criterion, -2 * elpd_loo, which we compute for comparability to deviance.

Exercise 3

Let’s turn our attention to our second model. To begin, let’s observe the qualities of fit_att_start once again. Run fixef() on fit_att_start to print out the previous result.


fixef(...)
fixef(fit_att_start)

Basically, when we run loo(), we are telling R to take a piece of data out of our data set, re-estimate all parameters, and then predict the value of the missing piece of data. The value of elpd_loo is based on how close our estimate was to the truth and thus tells us how effective our model is at predicting data it has not seen before.

Exercise 4

Great! Now, let’s perform loo() on this model. Run the loo() command on fit_att_start and assign the result to an object called loo_att_start. Type loo_att_start in a separate line and hit "Run Code".


... <- loo(...) 

loo_att_start
loo_att_start <- loo(fit_att_start) 

loo_att_start

The closer our value of elpd_loo is to 0, the more effective our model is. The elpd_loo value for this model is higher (closer to zero) than the elpd_loo for fit_liberal, implying that this model is superior.

Exercise 5

To compare the two models directly, we can use the function loo_compare() with our two loo objects created above. This will calculate the difference in elpd_loo() between our models for us, making our job easier. Run the command loo_compare() with loo_att_start and loo_liberal as the arguments.


loo_compare(..., ...)
loo_compare(loo_att_start, loo_liberal)

When the better model is compared to itself, the values of elpd_diff and se_diff will be 0. The following rows show the offset in elpd and se values between the less effective model, fit_liberal, and the more effective model, fit_att_start.

Exercise 6

In one or two sentences, summarize our work so far, stating which model is better and conclude the better variable between att_start and liberal in terms of predicting the attitude towards immigration at the end of the experiment (att_end).

question_text(NULL,
    message = "The better model is clear: `fit_att_start`. Therefore, the attitude at the start of the `trains` study is more significant to predicting the final attitude when compared with the variable `liberal`, which is an analog for political affiliation.",
    answer(NULL, correct = TRUE),
    allow_retry = FALSE,
    incorrect = NULL,
    rows = 6)

What do we do when the value of loo_compare() is small? As a general practice, when elpd_diff is less than 4, there is no advantage to one model over the other.

Summary

This tutorial covers Chapter 7: Mechanics of Preceptor’s Primer for Bayesian Data Science: Using the Cardinal Virtues for Inference by David Kane.



