library(learnr) library(tutorial.helpers) library(tidyverse) library(primer.data) knitr::opts_chunk$set(echo = FALSE) options(tutorial.exercise.timelimit = 60, tutorial.storage = "local") shoesize_p <- tibble(Wshoes = rnorm(n = 100000, mean = 8, sd = 1.5), Mshoes = rnorm(n = 100000, mean = 10.5, sd = 1.5)) |> pivot_longer(cols = everything(), names_to = "distribution", values_to = "value") |> ggplot(aes(x = value, fill = distribution)) + geom_histogram(aes(y = after_stat(count/sum(count))), alpha = 0.5, bins = 100, position = "identity") + labs(title = "Normal Distributions of shoe sizes", x = "Value", y = "Probability") emp_p <- tibble(results = sample(c(0, 1), 100, replace = TRUE)) |> ggplot(aes(x = results)) + geom_histogram(aes(y = after_stat(count/sum(count))), binwidth = 0.5, color = "white") + labs(title = "Empirical Probability Distribution", subtitle = "Flipping one coin a hundred times", x = "Outcome\nResult of Coin Flip", y = "Probability") + scale_x_continuous(breaks = c(0, 1), labels = c("Heads", "Tails")) + scale_y_continuous(labels = scales::percent_format(accuracy = 1)) + theme_classic() set.seed(1) emp_dist_dice <- tibble(ID = 1:100) |> mutate(die_1 = map_dbl(ID, ~ sample(c(1:6), size = 1))) |> mutate(die_2 = map_dbl(ID, ~ sample(c(1:6), size = 1))) |> mutate(sum = die_1 + die_2) |> ggplot(aes(x = sum)) + geom_histogram(aes(y = after_stat(count/sum(count))), binwidth = 1, color = "white") + labs(title = "Empirical Probability Distribution", subtitle = "Sum from rolling two dice, replicated one hundred times", x = "Outcome\nSum of Two Die", y = "Probability") + scale_x_continuous(breaks = seq(2, 12, 1), labels = 2:12) + scale_y_continuous(labels = scales::percent_format(accuracy = 1)) + theme_classic() emp_mpg <- mpg |> filter(drv == "f", cty >= 16) |> select(displ)|> drop_na() |> ggplot(aes(x = displ)) + geom_histogram(aes(y = after_stat(count/sum(count))), binwidth = 1, color = "white")+ labs(title = "Empirical Probability Distribution", subtitle = "Engine Displacement for front-wheel drive cars from mpg", x = "Engine Displacement", y = "Probability",caption = "Source:mpg") + scale_y_continuous(labels = scales::percent_format(accuracy = 1)) + theme_classic() jd_disease <- tibble(ID = 1:10000, have_disease = rep(c(TRUE, FALSE), 5000)) |> mutate(positive_test = if_else(have_disease, map_int(have_disease, ~ rbinom(n = 1, size = 1, p = 0.99)), map_int(have_disease, ~ rbinom(n = 1, size = 1, p = 0.5)))) joint_p <- tibble(in_bag = rep(c(0, 1, 2), 1000)) |> mutate(in_sample = map_int(in_bag, ~ rbinom(n = 1, size = 3, p = ./2))) |> ggplot(aes(x = in_sample, y = in_bag)) + geom_jitter(alpha = 0.5) + labs(title = "Black and White Marbles", subtitle = "More white marbles in bag mean more white marbles selected", x = "White Marbles Selected", y = "White Marbles in the Bag") + scale_y_continuous(breaks = c(0, 1, 2)) + theme_classic() emp_p <- tibble(p = rep(seq(0, 1, 0.1), 1000)) |> mutate(heads = map_int(p, ~ rbinom(n = 1, size = 20, p = .))) |> ggplot(aes(y = p, x = heads)) + geom_jitter(alpha = 0.1) + labs(title = "Empirical Distribution of Number of Heads", subtitle = expression(paste("Based on simulations with various values of ", rho[h])), x = "Number of Heads out of 20 Tosses", y = expression(rho[h])) + scale_y_continuous(breaks = seq(0, 1, 0.1)) + theme_classic()
This tutorial covers Chapter 2: Probability of Preceptor’s Primer for Bayesian Data Science: Using the Cardinal Virtues for Inference by David Kane.
Let's create this normal distribution by the end of this section.
shoesize_p <- tibble(Wshoes = rnorm(n = 100000, mean = 8, sd = 1.5), Mshoes = rnorm(n = 100000, mean = 10.5, sd = 1.5)) |> pivot_longer(cols = everything(), names_to = "distribution", values_to = "value") |> ggplot(aes(x = value, fill = distribution)) + geom_histogram(aes(y = after_stat(count/sum(count))), alpha = 0.5, bins = 100, position = "identity") + labs(title = "Normal Distributions of shoe sizes", x = "Value", y = "Probability") shoesize_p
Use tibble()
to create a tibble with one variable: numbers
, and set numbers
equal to the consecutive integers from 1 to 101 inclusive using :
.
tibble(... = 1:101)
tibble(numbers = 1:101)
Note that the vector numbers contains integers. If you put c() around the two endpoints and replace : with a comma, i.e. c(1, 101), the two numbers become doubles instead.
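A quick way to see the difference is typeof(), which reports how R stores a vector; a minimal sketch:
# The colon operator produces an integer vector.
typeof(1:101)      # "integer"
# Writing the endpoints with c() produces a double vector instead.
typeof(c(1, 101))  # "double"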
Pipe the result to summarize() and create a new column named avg, set equal to the mean() of numbers.
...|> summarize(avg = mean(...))
tibble(numbers = 1:101) |> summarize(avg = mean(numbers))
summarize() collapses the tibble and does not keep any of the variables in the input, unless grouping variables are supplied via the .by argument.
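As a small illustration of the .by argument (available in dplyr 1.1.0 and later), here is a sketch with made-up column names in which the grouping variable survives the summary:
# .by keeps the grouping column in the summarized result.
tibble(group = rep(c("a", "b"), each = 3), value = 1:6) |>
  summarize(avg = mean(value), .by = group)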
Within the summarize()
, add a new argument, middle
, and set it equal to the median()
of numbers
. Remember to use commas to separate your arguments.
... |> summarize(..., middle = median(...))
tibble(numbers = 1:101) |> summarize(avg = mean(numbers), middle = median(numbers))
Both mean() and median() have an na.rm argument whose default is FALSE. With na.rm = FALSE, any NA in the input makes the result NA; setting na.rm = TRUE tells the function to drop NA values before computing.
Within the summarize()
, add a new argument, stan_dev
, set it equal to the standard deviation of numbers
using sd()
.
... |> summarize(..., ..., stan_dev = sd(...))
tibble(numbers = 1:101) |> summarize(avg = mean(numbers), middle = median(numbers), stan_dev = sd(numbers))
Note that the input to mean(), median(), and sd() must be a numeric vector; a factor or character vector will not work. Recall that the standard deviation measures how spread out the numbers are around their mean.
Within the summarize()
, add a new argument, med_dev
and set it equal to mad(numbers)
.
... |> summarize(..., ..., ..., med_dev = mad(...))
tibble(numbers = 1:101) |> summarize(avg = mean(numbers), middle = median(numbers), stan_dev = sd(numbers), med_dev = mad(numbers))
mad() stands for median absolute deviation: essentially, how spread out the values are around the median. Note that the result differs from the standard deviation here. sd() works with squared differences from the mean, while mad() takes the median of the absolute differences from the median (scaled by a constant, 1.4826 by default).
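To see the two measures of spread side by side on the same vector, a minimal sketch:
# sd() is based on squared deviations from the mean; mad() is based on the
# median of absolute deviations from the median, scaled by 1.4826 by default.
sd(1:101)   # about 29.3
mad(1:101)  # about 37.1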
Last but not least, within summarize()
, set qtile
to quantile()
. Within it, set the first argument to numbers
, and the second to prob
which is equal to c(0, 0.5, 1)
.
... |> summarize(..., ..., ..., ..., qtile = quantile(..., prob = c(..., ..., ...)))
tibble(numbers = 1:101) |> summarize(avg = mean(numbers), middle = median(numbers), stan_dev = sd(numbers), med_dev = mad(numbers), qtile = quantile(numbers, prob = c(0, 0.5, 1)))
This will create another two rows, because quantile() returns three values. If we want to keep everything in one row, we can wrap quantile() in list(), creating a list-column for qtile and keeping the whole summary in a single row.
Create a new tibble. Name the first column ID and set it equal to the consecutive integers from 1 to 100. Then make a second column named scores and set it equal to a vector built with c(). Inside the vector, use rep() to make the value 1350 occur 20 times, 1380 occur 30 times, 1440 occur 40 times, and 1520 occur 10 times.
tibble(... = 1:100 , scores = c(rep(..., 20), rep(1380, ...), rep(1440, ...), rep(1520, ...)))
tibble(ID = 1:100 , scores = c(rep(1350, 20), rep(1380, 30), rep(1440, 40), rep(1520, 10)))
We can use c() to combine the outputs of rep() into a single vector, and within tibble() or mutate() we can turn that vector into a column.
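For example, a minimal sketch of building a column from repeated values:
# rep() repeats each value; c() glues the pieces into one vector,
# which tibble() stores as a column.
tibble(scores = c(rep(1350, 2), rep(1520, 3)))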
Now suppose you are an admissions officer looking at these scores and you want to know which score sits at a given percentile. After the previous code, pipe into summarize() to make a new column named percentile, set equal to quantile() with scores as the first argument and prob set to c(.35, .50, .75) as the second.
... |> summarize(percentile = quantile(scores, prob = c(..., ..., ...)))
tibble(ID = 1:100 , scores = c(rep(1350, 20), rep(1380, 30), rep(1440, 40), rep(1520, 10))) |> summarize(percentile = quantile(scores, prob = c(.35, .50, .75)))
The function quantile()
produces sample quantiles corresponding to the given probabilities. The smallest observation corresponds to a probability of 0 and the largest to a probability of 1.
Now suppose that, instead of looking at percentiles, you want to randomly pick 10 people from this 1350-to-1520 score range to admit. Within summarize(), replace percentile with admit and set it equal to sample(). Within sample(), set the first argument to scores, the second to size = 10, and the third to replace = FALSE.
... |> summarize(admit = sample(..., size = ..., replace = ...))
tibble(ID = 1:100 , scores = c(rep(1350, 20), rep(1380, 30), rep(1440, 40), rep(1520, 10))) |> summarize(admit = sample(scores, size = 10, replace = FALSE))
Within sample(), the first argument is the data source, size is the number of values we are picking, and replace controls whether the same value can be picked more than once.
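The effect of replace is easiest to see on a tiny vector; a quick sketch:
# Without replacement each value can appear at most once.
sample(1:5, size = 5, replace = FALSE)  # a shuffled permutation of 1 to 5
# With replacement the same value may be drawn repeatedly.
sample(1:5, size = 5, replace = TRUE)   # duplicates are possible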
Let's now use runif()
to draw from a uniform distribution. Within runif()
, set n
to 5 (the number of draws), min
to 5, and max
to 8.
runif(n = ..., min = ..., max =...)
runif(n = 5, min = 5, max = 8)
runif() draws random values from a uniform distribution between your chosen minimum and maximum values.
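Two quick checks on that behavior (note that the defaults are min = 0 and max = 1):
# With no min or max supplied, draws fall between 0 and 1.
runif(n = 3)
# Every draw stays inside the chosen range.
range(runif(n = 1000, min = 5, max = 8))  # both endpoints fall between 5 and 8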
Run tibble()
, set the argument heads
equal to rbinom()
, with the first argument n
equal to 1000
, second argument size
equal to 1
, and the third argument prob
equal to 0.5
.
tibble(heads = rbinom(n = ..., size = ..., prob = ...))
tibble(heads = rbinom(n = 1000, size = 1, prob = 0.5))
rbinom() is used when each trial has only two possible outcomes, as with Boolean data; the coin toss here is an example. Within rbinom(), n is how many values are drawn, size is the number of trials behind each value (size = 1 means each draw is a single coin flip), and prob is the probability of the outcome 1.
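To see the role of size, a short sketch contrasting single flips with batches of flips:
# size = 1: each draw is a single coin flip, so the values are 0 or 1.
rbinom(n = 5, size = 1, prob = 0.5)
# size = 10: each draw is the number of heads in ten flips, so values run from 0 to 10.
rbinom(n = 5, size = 10, prob = 0.5)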
We can take a quick look at what the graph looks like. Pipe the tibble() from the previous code into ggplot(); within ggplot(), use aes() and set x = heads, then add the layer geom_histogram(). Set binwidth to 0.5.
... |> ggplot(aes(...)) + geom_histogram(binwidth = ...)
tibble(heads = rbinom(n = 1000, size = 1, prob = 0.5)) |> ggplot(aes(x = heads)) + geom_histogram(binwidth = 0.5)
In this case geom_col() won't work because it requires a y aesthetic. geom_bar() would also work: it counts each distinct value of heads directly, so it needs no binwidth, whereas geom_histogram() requires you to choose the binwidth yourself.
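A sketch of the geom_bar() alternative, which counts the 0s and 1s directly without any binning:
# geom_bar() tallies each distinct value of heads, so no binwidth is needed.
tibble(heads = rbinom(n = 1000, size = 1, prob = 0.5)) |>
  ggplot(aes(x = heads)) +
  geom_bar()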
Start a pipe with tibble(). Create a column named Wshoes and set it equal to rnorm(), with the first argument set to 100000, the second argument mean equal to 8, and the third argument sd equal to 1.5. Create another column named Mshoes in the same tibble(), set equal to rnorm() with the first argument 100000, mean equal to 10.5, and sd equal to 1.5.
tibble(Wshoes = rnorm(..., mean = ..., sd = ...), Mshoes = rnorm(..., mean = ..., sd = ...))
tibble(Wshoes = rnorm(100000, mean = 8, sd = 1.5), Mshoes = rnorm(100000, mean = 10.5, sd = 1.5))
Within rnorm(), the first argument is the number of draws, the second argument mean is the mean of the distribution, and the third argument sd is its standard deviation. The normal distribution is commonly used for quantities whose values cluster near an average and become increasingly rare the farther they are from it: height, blood pressure, IQ scores, shoe sizes, and so on.
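A quick sanity check on the draws; the sample mean and standard deviation should sit close to the values we asked for:
draws <- rnorm(100000, mean = 8, sd = 1.5)
mean(draws)  # close to 8
sd(draws)    # close to 1.5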
Let's prepare to plot our tibble. Continue the pipe with pivot_longer()
. Set cols
to everything()
, names_to
to "Distribution", and values_to
to "draw".
... |> pivot_longer(cols = ..., names_to = ..., values_to = ...)
tibble(Wshoes = rnorm(100000, mean = 8, sd = 1.5), Mshoes = rnorm(100000, mean = 10.5, sd = 1.5)) |> pivot_longer(cols = everything(), names_to = "Distribution", values_to = "draw")
pivot_longer() "lengthens" data, increasing the number of rows and decreasing the number of columns. The inverse transformation is pivot_wider().
Continue your pipe with ggplot()
to map x
to draw
and fill
to Distribution
. Add the layer geom_histogram()
. Within geom_histogram()
, inside aes()
, set y
equal to after_stat()
with count/sum(count)
as the argument.
...|> ggplot(aes(x = ..., fill = ...)) + geom_histogram(aes(y = after_stat(...)))
tibble(Wshoes = rnorm(100000, mean = 8, sd = 1.5), Mshoes = rnorm(100000, mean = 10.5, sd = 1.5)) |> pivot_longer(cols = everything(), names_to = "Distribution", values_to = "draw") |> ggplot(aes(x = draw, fill = Distribution)) + geom_histogram(aes(y = after_stat(count/sum(count))))
Using the previous code, set alpha
to 0.5
, bins
to 100
, and position
to "identity"
within geom_histogram()
.
... |> geom_histogram(..., alpha = ..., bins = ..., position = ...)
tibble(Wshoes = rnorm(100000, mean = 8, sd = 1.5), Mshoes = rnorm(100000, mean = 10.5, sd = 1.5)) |> pivot_longer(cols = everything(), names_to = "Distribution", values_to = "draw") |> ggplot(aes(x = draw, fill = Distribution)) + geom_histogram(aes(y = after_stat(count/sum(count))), alpha = 0.5, bins = 100, position = "identity")
Finally, use labs()
to title your graph "Normal Distributions of shoe sizes". Also label your x-axis "Value" and y-axis "Probability".
tibble(Wshoes = rnorm(n = 100000, mean = 8, sd = 1.5), Mshoes = rnorm(n = 100000, mean = 10.5, sd = 1.5)) |> pivot_longer(cols = everything(), names_to = "distribution", values_to = "value") |> ggplot(aes(x = value, fill = distribution)) + geom_histogram(aes(y = after_stat(count/sum(count))), alpha = 0.5, bins = 100, position = "identity") + labs(title = "Normal Distributions of shoe sizes", x = "Value", y = "Probability")
Reminder: This is what your plot should look like.
shoesize_p
Let's create the following empirical distribution for rolling two dice.
set.seed(1) emp_dist_dice <- tibble(ID = 1:100) |> mutate(die_1 = map_dbl(ID, ~ sample(c(1:6), size = 1))) |> mutate(die_2 = map_dbl(ID, ~ sample(c(1:6), size = 1))) |> mutate(sum = die_1 + die_2) |> ggplot(aes(x = sum)) + geom_histogram(aes(y = after_stat(count/sum(count))), binwidth = 1, color = "white") + labs(title = "Empirical Probability Distribution", subtitle = "Sum from rolling two dice, one hundred times", x = "Outcome\nSum of Two Die", y = "Probability") + scale_x_continuous(breaks = seq(2, 12, 1), labels = 2:12) + scale_y_continuous(labels = scales::percent_format(accuracy = 1)) + theme_classic() emp_dist_dice
Create a tibble with the variable ID, set equal to the integers from 1 to 100.
tibble(ID = ... : ...)
tibble(ID = 1:100)
The tibble()
function provides us with a structure in which to organize the data which we generate.
Pipe the tibble to the map_dbl() function to create the results for a random dice-rolling experiment. It takes two arguments: the first is the data source, which in this case is ID, and the second is the formula ~ sample(c(1:6), size = 1).
...|> map_dbl(ID, ~sample(c(...:...), size = 1))
# tibble(ID = 1:100) |> # map_dbl(ID, ~sample(c(1:6), size = 1))
This may return an error, but you may still continue on to the next exercise where the code should run without any errors.
We use 1 through 6 to generate any one of the six faces of a die. The size is 1 because we only want one roll per draw.
Now let's pipe the tibble to mutate(). Assign the map_dbl() call we wrote above to a column named die_1.
tibble(ID = 1:100) |> mutate(... = map_dbl(ID, ~sample(c(1:6), size = 1)))
tibble(ID = 1:100) |> mutate(die_1 = map_dbl(ID, ~sample(c(1:6), size = 1)))
Now copy and paste the result from above and do the exact same thing again, but change the name from die_1 to die_2, because we need the results for two dice.
...|> mutate(..., ... =map_dbl(ID, ~sample(c(...:...), size = 1)))
tibble(ID = 1:100) |> mutate(die_1 = map_dbl(ID, ~sample(c(1:6), size = 1)), die_2 = map_dbl(ID, ~sample(c(1:6), size = 1)))
Use mutate() once again to create a column named sum, equal to die_1 plus die_2.
...|> mutate(..., ..., sum = die_1 + die_2)
tibble(ID = 1:100) |> mutate(die_1 = map_dbl(ID, ~sample(c(1:6), size = 1)), die_2 = map_dbl(ID, ~sample(c(1:6), size = 1)), sum = die_1+die_2)
Now use ggplot() to graph the empirical distribution: set x = sum within aes(), then add a geom_histogram() layer.
...|> ggplot(aes(...))+ geom_histogram()
tibble(ID = 1:100) |> mutate(die_1 = map_dbl(ID, ~sample(c(1:6), size = 1)), die_2 = map_dbl(ID, ~sample(c(1:6), size = 1)), sum = die_1+die_2) |> ggplot(aes(x = sum)) + geom_histogram()
Within geom_histogram()
, inside aes()
, set the argument y
equal to after_stat(count/sum(count))
to put percents on the y-axis.
... + geom_histogram(aes(y = after_stat(...)))
tibble(ID = 1:100) |> mutate(die_1 = map_dbl(ID, ~sample(c(1:6), size = 1)), die_2 = map_dbl(ID, ~sample(c(1:6), size = 1)), sum = die_1+die_2) |> ggplot(aes(x = sum)) + geom_histogram(aes(y = after_stat(count/sum(count))))
Also set binwidth
to 0.5
and color
to "white"
within geom_histogram()
.
... + geom_histogram(..., binwidth = ..., color = "..."))
tibble(ID = 1:100) |> mutate(die_1 = map_dbl(ID, ~sample(c(1:6), size = 1)), die_2 = map_dbl(ID, ~sample(c(1:6), size = 1)), sum = die_1+die_2) |> ggplot(aes(x = sum)) + geom_histogram(aes(y = after_stat(count/sum(count))), binwidth = 0.5, color = "white")
Now use scale_x_continuous() so the x-axis breaks follow the sequence from 2 to 12 in steps of 1, built with seq(). Set the labels for those breaks to the integers 2 through 12.
... + scale_x_continuous(breaks = seq(...,..., ...), labels = ...:...)
tibble(ID = 1:100) |> mutate(die_1 = map_dbl(ID, ~sample(c(1:6), size = 1)), die_2 = map_dbl(ID, ~sample(c(1:6), size = 1)), sum = die_1+die_2) |> ggplot(aes(x = sum)) + geom_histogram(aes(y = after_stat(count/sum(count))), binwidth = 0.5, color = "white") + scale_x_continuous(breaks = seq(2, 12, 1), labels = 2:12)
Now use scale_y_continuous() to put the y-axis in percent format. Within scale_y_continuous(), set labels to scales::percent_format(). Within percent_format(), set accuracy to 1.
... + scale_y_continuous(labels = scales::percent_format(...))
tibble(ID = 1:100) |> mutate(die_1 = map_dbl(ID, ~sample(c(1:6), size = 1)), die_2 = map_dbl(ID, ~sample(c(1:6), size = 1)), sum = die_1+die_2) |> ggplot(aes(x = sum)) + geom_histogram(aes(y = after_stat(count/sum(count))), binwidth = 0.5, color = "white") + scale_x_continuous(breaks = seq(2, 12, 1), labels = 2:12) + scale_y_continuous(labels = scales::percent_format(accuracy = 1))
Finally, use labs()
to add the appropriate title, subtitle, and axis labels. Also add the layer theme_classic()
.
tibble(ID = 1:100) |> mutate(die_1 = map_dbl(ID, ~ sample(c(1:6), size = 1))) |> mutate(die_2 = map_dbl(ID, ~ sample(c(1:6), size = 1))) |> mutate(sum = die_1 + die_2) |> ggplot(aes(x = sum)) + geom_histogram(aes(y = after_stat(count/sum(count))), binwidth = 1, color = "white") + labs(title = "Empirical Probability Distribution", subtitle = "Sum from rolling two dice, replicated one hundred times", x = "Outcome\nSum of Two Die", y = "Probability") + scale_x_continuous(breaks = seq(2, 12, 1), labels = 2:12) + scale_y_continuous(labels = scales::percent_format(accuracy = 1)) + theme_classic()
Reminder: Your plot should look similar to the one below.
emp_dist_dice
All we can know is that we know nothing. And that’s the height of human wisdom. - Leo Tolstoy
In your own words, describe the key components of Wisdom for working on a data science problem.
question_text(NULL, message = "Wisdom requires the creation of a Preceptor Table, an examination of our data, and a determination, using the concept of validity, as to whether or not we can (reasonably!) assume that the two come from the same population.", answer(NULL, correct = TRUE), allow_retry = FALSE, incorrect = NULL, rows = 6)
Define a Preceptor Table.
question_text(NULL, message = "A Preceptor Table is the smallest possible table of data with rows and columns such that, if there is no missing data, it is easy to calculate the quantities of interest.", answer(NULL, correct = TRUE), allow_retry = FALSE, incorrect = NULL, rows = 6)
One key aspect of a Preceptor Table is whether or not we need more than one potential outcome in order to calculate our estimand. For example, if we want to know the causal effect of exposure to Spanish-speakers on attitude toward immigration then we need a causal model, one which estimates that attitude for each person under both treatment and control.
Describe the key components of Preceptor Tables in general. Use words like "units," "outcomes," and "covariates."
question_text(NULL, message = "The rows of the Preceptor Table are the units. The outcome is at least one of the columns. If the problem is causal, there will be at least two (potential) outcome columns. The other columns are covariates. If the problem is causal, at least one of the covariates will be a treatment.", answer(NULL, correct = TRUE), allow_retry = FALSE, incorrect = NULL, rows = 6)
Write two sentences explaining a predictive model in your life. Include the one outcome variable of interest.
question_text(NULL, message = "A predictive model in my life can be seen as the money that I will potentially spend on groceries per week. I can take all of the data that I have from the previous weeks to estimate how much I will spend on groceries with a predictive model. Outcome: How much will I spend on groceries?", answer(NULL, correct = TRUE), allow_retry = FALSE, incorrect = NULL, rows = 6)
A predictive model has just one outcome column in its Preceptor Table.
Write two sentences explaining a causal model in your life. Include the two potential outcomes and the treatment variable.
question_text(NULL, message = "A causal model in my life can be seen as the money that I will spend on groceries based on whether I go to the gym to exercise. Two potential outcomes: dollars spent on groceries if I do go to the gym and dollars spent on groceries if I don't go to the gym. The treatment is going to the gym or not going to the gym. The following situation is an example of a causal model because there are two outcome columns in the Preceptor Table.", answer(NULL, correct = TRUE), allow_retry = FALSE, incorrect = NULL, rows = 6)
Causal models aim to determine the effect of manipulating a specific variable on the outcome by comparing two or more potential outcomes for each unit. The difference between predictive models and causal models is that the former has one column for the outcome variable in the Preceptor Table and the latter has more than one column.
Scenario: You are an analyst at the mayor's office and want to find out what it will take for people to be more kind towards immigrants. You have previous data on individuals who were exposed to one of two possible conditions, after which their attitudes towards immigrants were recorded. One condition was waiting on a train platform near individuals speaking Spanish. The other was waiting on a train platform without Spanish-speakers. You are going to use this data to conduct your own analysis. You will give speeches, containing a positive message about immigrants, to citizens selected through random assignment. Your hope is that the causal effect of those speeches is to make people kinder.
Is this scenario a predictive model or a causal model? Explain why in two sentences.
question_text(NULL, message = "This scenario is a causal model because there are multiple potential outcomes in the Preceptor Table. Causal models have more than one potential outcome while predictive models only have one outcome.", answer(NULL, correct = TRUE), allow_retry = FALSE, incorrect = NULL, rows = 6)
Let's create a Preceptor Table by finding each column needed from the scenario. Write two sentences about what the rows are of the situation and how it relates to the scenario.
question_text(NULL, message = "We are going to use three rows: Citizens, Treatments, and Potential Outcomes. We are concerned about each citizen and the data that each one of them will contain.", answer(NULL, correct = TRUE), allow_retry = FALSE, incorrect = NULL, rows = 6)
What is the treatment for this problem?
question_text(NULL, message = "Which citizens have heard the speech?", answer(NULL, correct = TRUE), allow_retry = FALSE, incorrect = NULL, rows = 6)
Write two sentences about what the potential outcomes are of the situation and how it relates to the scenario. Remember this is a causal model which means that there will be multiple potential outcomes.
question_text(NULL, message = "Potential Outcomes: 1. People who heard the speech and changed to be kind 2. People who heard the speech and changed to be not kind 3. People who didn't hear the speech and are kind 4. People who didn't hear the speech are not kind These are the four potential outcomes at hand because of the influence from the treatment that will change the outcome of the scenario.", answer(NULL, correct = TRUE), allow_retry = FALSE, incorrect = NULL, rows = 6)
In your own words, define "validity" as we use the term.
question_text(NULL, message = "Validity is the consistency, or lack thereof, in the columns of the data set and the corresponding columns in the Preceptor Table.", answer(NULL, correct = TRUE), allow_retry = FALSE, incorrect = NULL, rows = 6)
In order to consider the two datasets to be drawn from the same population, the columns from one must have a valid correspondence with the columns in the other.
Provide a reason why the assumption of validity might not hold with the data from immigrants with Spanish-speakers on the train to our scenario of giving citizens a speech to be more kind. Explain how that failure might impact the results.
question_text(NULL, message = "The treatment in the data (Spanish speakers on a train platform) and the treatment in the Preceptor Table (a speech from the mayor about immigrants) are not similar enough that we can reasonably expect them to have the same causal effects. People are often annoyed by stangers but can react very differently to an elected official.", answer(NULL, correct = TRUE), allow_retry = FALSE, incorrect = NULL, rows = 6)
Validity, if true (or at least reasonable), allows us to construct the Population Table.
The arc of the moral universe is long, but it bends toward justice. - Theodore Parker
In your own words, name the four key components of Justice for working on a data science problem.
question_text(NULL, message = "Justice concerns four topics: the Population Table, stability, representativeness, and unconfoundedness.", answer(NULL, correct = TRUE), allow_retry = FALSE, incorrect = NULL, rows = 6)
In your own words, define a Population Table.
question_text(NULL, message = "The Population Table includes a row for each unit/time combination in the underlying population from which both the Preceptor Table and the data are drawn.", answer(NULL, correct = TRUE), allow_retry = FALSE, incorrect = NULL, rows = 6)
In your own words, define the assumption of "stability" when employed in the context of data science.
question_text(NULL, message = "Stability means that the relationship between the columns in the Population Table is the same for three categories of rows: the data, the Preceptor Table, and the larger population from which both are drawn.", answer(NULL, correct = TRUE), allow_retry = FALSE, incorrect = NULL, rows = 6)
Stability is all about time. Is the relationship among the columns in the Population Table stable over time? In particular, is the relationship --- which is another way of saying "mathematical formula" --- at the time the data was gathered the same as the relationship at the (generally later) time referenced by the Preceptor Table?
Scenario: You are an analyst at the mayor's office and want to find out what it will take for people to be more kind towards immigrants. You have previous data on individuals who were exposed to one of two possible conditions, after which their attitudes towards immigrants were recorded. One condition was waiting on a train platform near individuals speaking Spanish. The other was waiting on a train platform without Spanish-speakers. You are going to use this data to conduct your own analysis. You will give speeches, containing a positive message about immigrants, to citizens selected through random assignment. Your hope is that the causal effect of those speeches is to make people kinder.
Provide one reason why the assumption of stability might not be true in this case.
question_text(NULL, message = "The assumption of stability might not hold because the relationship with our data (Spanish speakers on a train platform) and Preceptor Table (a speech from the mayor about immigrants) are not similar enough. ", answer(NULL, correct = TRUE), allow_retry = FALSE, incorrect = NULL, rows = 6)
In your own words, define the assumption of "representativeness" when employed in the context of data science.
question_text(NULL, message = "Representativeness, or the lack thereof, concerns two relationship, among the rows in the Population Table. The first is between the Preceptor Table and the other rows. The second is between our data and the other rows.", answer(NULL, correct = TRUE), allow_retry = FALSE, incorrect = NULL, rows = 6)
Ideally, we would like both the Preceptor Table and our data to be random samples from the population. Sadly, this is almost never the case.
Provide one reason why the assumption of representativeness might not be true in this case.
question_text(NULL, message = "The assumption of representatives might not be true because the sample of our data (Spanish speakers on a train platform) and Preceptor Table (a speech from the mayor about immigrants) are not random enough.", answer(NULL, correct = TRUE), allow_retry = FALSE, incorrect = NULL, rows = 6)
In your own words, define the assumption of "unconfoundedness" when employed in the context of data science.
question_text(NULL, message = "Unconfoundedness means that the treatment assignment is independent of the potential outcomes, when we condition on pre-treatment covariates.", answer(NULL, correct = TRUE), allow_retry = FALSE, incorrect = NULL, rows = 6)
This assumption is only relevant for causal models. We describe a model as "confounded" if this is not true.
In one sentence, provide a reason why the assumption of unconfoundedness would hold.
question_text(NULL, message = "The assumption of unconfoundedness would hold because the sample of our data (Spanish speakers on a train platform) and Preceptor Table (a speech from the mayor about immigrants) are part of a causal model and we are randomly assigning treatment to the citizens.", answer(NULL, correct = TRUE), allow_retry = FALSE, incorrect = NULL, rows = 6)
Unconfoundedness is related to the absence of confounding variables that may bias the relationship between the outcome variable and covariates. It implies that there are no hidden factors influencing both the treatment (if applicable) and the outcome.
Courage is the commitment to begin without any guarantee of success. - Johann Wolfgang von Goethe
In your own words, describe the components of the virtue of Courage for analyzing data.
question_text(NULL, message = "Courage begins with the exploration and testing of different models. It concludes with the creation of a data generating mechanism.", answer(NULL, correct = TRUE), allow_retry = FALSE, incorrect = NULL, rows = 6)
Temperance to be a virtue must be free, and not forced. - Philip Massinger
In your own words, describe the use of Temperance in finishing your data science project.
question_text(NULL, message = "Temperance uses the data generating mechanism to answer the questions with which we began. Humility reminds us that this answer is always a lie. We can also use the DGM to calculate many similar quantities of interest, displaying the results graphically.", answer(NULL, correct = TRUE), allow_retry = FALSE, incorrect = NULL, rows = 6)
This tutorial covered Chapter 2: Probability of Preceptor’s Primer for Bayesian Data Science: Using the Cardinal Virtues for Inference by David Kane.