# load packages ----------------------------------------------------------------
library(learnr)
library(gradethis)
library(tidyverse)
library(tidymodels)
library(dsbox)

# set options for exercises and checking ---------------------------------------

gradethis_setup()


# hide non-exercise code chunks ------------------------------------------------
knitr::opts_chunk$set(echo = FALSE)

# prepare the data shared by the exercises -------------------------------------
# Weekly email time in minutes: the minutes component plus the hours component
# converted to minutes.
gss16 <- dsbox::gss16 |>
  mutate(email = emailmin + (emailhr * 60))

# Rows with a non-missing `email` value. Derived from `gss16` so the `email`
# formula is written in exactly one place.
gss16_email <- gss16 |>
  filter(!is.na(email))

# bootstrap distributions used by the later exercises --------------------------
# The seed is reset before each bootstrap so that each distribution is
# reproducible on its own, whichever one is generated first.
set.seed(1234)

boot_df <- gss16_email |>
  specify(response = email) |>
  generate(reps = 1000, type = "bootstrap") |>
  calculate(stat = "mean")

set.seed(1234)

boot_df_median <- gss16_email |>
  specify(response = email) |>
  generate(reps = 1000, type = "bootstrap") |>
  calculate(stat = "median")

Introduction

# Embed the image that accompanies the introduction section.
knitr::include_graphics("images/christian-wiediger-70ku6P7kgmc-unsplash.jpg")

Learning goals

Packages

In this assignment we will work with the following packages. You can load them with the following code block:

# Exercise code block: load the packages used in this tutorial.
library(tidyverse)
library(tidymodels)
library(dsbox)
# The same three calls repeated - presumably the reference solution the
# submission is compared against (TODO confirm against the tutorial source).
library(tidyverse)
library(tidymodels)
library(dsbox)
# Message passed to grade_this_code() for this exercise.
grade_this_code("You've successfully loaded the packages.")

Data

For this tutorial, we're going to continue our exploration of the 2016 GSS dataset from last week, which is available as part of the dsbox package.

Data dictionary

| Variable name | Description/Question in GSS |
|:--------|:-------------------------------------------------------------|
| harass5 | "Over the past five years, have you been harassed by your superiors or co-workers at your job, for example, have you experienced any bullying, physical or psychological abuse?" |
| emailmin | Number of minutes spent on email weekly, extra to the hours in emailhr (e.g. emailmin = 30 for 2.5 hours on email). |
| emailhr | Number of hours spent on email weekly. |
| educ | Number of years in education. |
| polviews | Political views. Possible answers are Extremely liberal, Liberal, Slightly liberal, Moderate, Slghtly conservative, Conservative, Extrmly conservative. |
| advfront | "Even if it brings no immediate benefits, scientific research that advances the frontiers of knowledge is necessary and should be supported by the federal government." |
| snapchat | Whether respondent uses Snapchat or not. |
| instagram | Whether respondent uses Instagram or not. |
| wrkstat | Work status. |

Exercises

email variable

Remember that the GSS asked respondents how many hours and minutes they spend on email weekly. The responses to these questions are recorded in the emailhr and emailmin variables. For example, if the response is 2.5 hrs, this would be recorded as emailhr = 2 and emailmin = 30.

For the first exercise, create a new variable called email that combines these two variables to report the number of minutes the respondents spend on email weekly. (Yes, this exercise is a repeat of what you did last week!)

# Starter code: the learner replaces the ___ placeholder.
gss16 <- gss16 |>
  ___
# Hint: the new variable is created inside mutate().
gss16 <- gss16 |>
  mutate(___)
# Hint: the new variable is named `email`.
gss16 <- gss16 |>
  mutate(email = ___)
# Completed code: the minutes plus the hours converted to minutes (hours * 60).
gss16 <- gss16 |>
  mutate(email = emailmin + (emailhr * 60))
# Message passed to grade_this_code() for this exercise.
grade_this_code("You've successfully created the 'email' variable.")

Filter the data

Next, filter the data for only those who have non-NA entries for email. Do not overwrite the data frame (you’ll need the full data later). Instead, save the resulting data frame as gss16_email.


# Starter code: the learner replaces the ___ placeholder.
gss16_email <- gss16 |>
  ___
# Hint: use filter() with a function applied to `email`.
gss16_email <- gss16 |>
  filter(___(email))
# Completed code: keep only the rows where `email` is not NA.
gss16_email <- gss16 |>
  filter(!is.na(email))
# Message passed to grade_this_code() for this exercise.
grade_this_code("You've successfully filtered the data.")

Bootstrap confidence intervals

In the following exercise, we'll be using the tidymodels package to construct intervals rather than using for loops.

What we want to do now is calculate a 95% bootstrap confidence interval for the mean amount of time Americans spend on email weekly.

Since the bootstrapping involves some random sampling, we want some way of essentially defining how the random sampling is done. For example, this might be done so that if another person runs your R code, they'll produce the same confidence interval as you.

Fortunately, this can be done quite easily by setting a seed. For this tutorial, we'll settle on the rather arbitrary 1234, but you can use any seed you like in your own assignments.

Use the following code block to set this seed:

# Exercise code block: fix the random seed so the bootstrap is reproducible.
set.seed(1234)
# The same call repeated - presumably the reference solution the submission is
# compared against (TODO confirm against the tutorial source).
set.seed(1234)
# Message passed to grade_this_code() for this exercise.
grade_this_code("You've set the seed ready for bootstrapping.")

Creating the data frame

When bootstrapping, we first need to create a data frame containing our bootstrapped data - in this case, sample means.

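There are a few functions we use from the tidymodels package in doing this:

- specify() identifies the response variable we are interested in.
- generate() draws the bootstrap samples.
- calculate() computes the sample statistic for each bootstrap sample.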

Fill in the blanks in the following code block to create the bootstrapped data frame:

# Starter code: every ___ is a blank for the learner to fill in.
boot_df <- ___ |>
  specify(response = ___) |> 
  generate(reps = 1000, type = "bootstrap") |> 
  ___(stat = "___")
# Hint: the pipeline starts from gss16_email.
boot_df <- gss16_email |>
  specify(response = ___) |> 
  generate(reps = 1000, type = "bootstrap") |> 
  ___(stat = "___")
# Hint: the response variable is email.
boot_df <- gss16_email |>
  specify(response = email) |> 
  generate(reps = 1000, type = "bootstrap") |> 
  ___(stat = "___")
# Completed code: 1000 bootstrap replicates, with the mean calculated for each.
boot_df <- gss16_email |>
  specify(response = email) |> 
  generate(reps = 1000, type = "bootstrap") |> 
  calculate(stat = "mean")
# Message passed to grade_this_code() for this exercise.
grade_this_code("You've successfully created the bootstrap data frame.")

Glimpse the bootstrapped data frame

Let's have a quick look at the bootstrapped data frame:

# Inspect boot_df, the bootstrap data frame built in the previous exercise.
glimpse(boot_df)

Note that each value of the stat field is a particular bootstrapped sample mean.

Find the confidence interval

Now that we have the boot_df data frame, we can construct the confidence interval itself by picking out a certain percentage of the centre of the distribution. Since we're after a 95% confidence interval, we take the centre 95% of the bootstrap distribution, and find the range of these values. This gives the desired interval.

To find this range, we calculate percentiles so that the central 95% of the distribution is contained within them.

Fill in the blanks in this code block to construct the 95% bootstrap confidence interval for the mean amount of time Americans spend on email weekly.

# Starter code: every ___ is a blank for the learner to fill in.
___ |>
  summarize(lower = quantile(___, ___),
            upper = ___(___, ___))
# Hint: the pipeline starts from boot_df.
boot_df |>
  summarize(lower = quantile(___, ___),
            upper = ___(___, ___))
# Hint: the quantiles are taken over the `stat` column.
boot_df |>
  summarize(lower = quantile(stat, ___),
            upper = ___(___, ___))
# Hint: the lower bound is the 0.025 quantile.
boot_df |>
  summarize(lower = quantile(stat, 0.025),
            upper = ___(___, ___))
# Completed code: the 0.025 and 0.975 quantiles bound the central 95%.
boot_df |>
  summarize(lower = quantile(stat, 0.025),
            upper = quantile(stat, 0.975))
# Grade the 95% interval by comparing the whole-number part of each bound with
# the values this exercise expects. The checks rely on gradethis's pass() and
# fail() ending the grading as soon as they are called, so the final fail() is
# only reached when neither condition matched.
grade_this({
  # floor() makes the comparison ignore the decimal part of each bound.
  lower <- floor(as.numeric(.result$lower[1]))
  upper <- floor(as.numeric(.result$upper[1]))

  # `&&` rather than `&`: an `if` condition is a single logical value.
  if (identical(lower, 385) && identical(upper, 452)) {
    pass("That is a 95% confidence interval for mean weekly time spent on email.")
  }
  # Bounds that the feedback below attributes to a 90% interval.
  if (identical(lower, 389) && identical(upper, 444)) {
    fail("It looks like you've calculated a 90% confidence interval. Remember that to pick out the central 95% of the distribution, we need to discard the most extreme 2.5% at each side.")
  }
  fail("Not quite. Take a look at the hints if you need some help.")
})

Would you expect a 99% confidence interval to be wider or narrower than the interval you calculated above?

# Multiple-answer quiz on how the confidence level affects interval width.
# Two of the four statements are flagged as correct; learners may retry and the
# answers are shown in random order.
question(
  text = "Which statements are correct? Choose all that apply.",
  answer(
    text = "The 99% confidence interval will be narrower, because we have to allow for less variation in order to be 99% accurate"
  ),
  answer(
    text = "The 99% confidence interval will be wider, because we need to allow for more variation in order to be more certain that any given mean will lie within the interval",
    correct = TRUE
  ),
  answer(
    text = "The 95% confidence interval will be narrower, because we have to be less certain that any given mean will lie within the interval compared to a 99% confidence interval",
    correct = TRUE
  ),
  answer(
    text = "The 95% confidence interval will be wider, because we can allow for more variation in order to be 95% accurate"
  ),
  allow_retry = TRUE,
  random_answer_order = TRUE
)

Using the bootstrap distribution from a previous exercise, calculate a 99% bootstrap confidence interval for the mean amount of time Americans spend on email weekly.

# Starter code: every ___ is a blank for the learner to fill in.
___ |>
  ___
# Text hint shown to the learner (not R code):
Look at the previous question for help!
# Hint: start from boot_df and summarize() a lower and a second value.
boot_df |>
  summarize(lower = ___(___, ___),
            ___ = ___)
# Hint: the lower bound is the 0.005 quantile.
boot_df |>
  summarize(lower = quantile(___, 0.005),
            upper = quantile(___, ___))
# Completed code: the 0.005 and 0.995 quantiles bound the central 99%.
boot_df |>
  summarize(lower = quantile(stat, 0.005),
            upper = quantile(stat, 0.995))
# Grade the 99% interval by comparing the whole-number part of each bound with
# the values this exercise expects. The checks rely on gradethis's pass() and
# fail() ending the grading as soon as they are called, so the final fail() is
# only reached when neither condition matched.
grade_this({
  # floor() makes the comparison ignore the decimal part of each bound.
  lower <- floor(as.numeric(.result$lower[1]))
  upper <- floor(as.numeric(.result$upper[1]))

  # `&&` rather than `&`: an `if` condition is a single logical value.
  if (identical(lower, 375) && identical(upper, 465)) {
    pass("You have calculated the 99% confidence interval correctly!")
  }
  # Bounds that the feedback below attributes to a 98% interval.
  if (identical(lower, 378) && identical(upper, 460)) {
    fail("It looks like you've calculated a 98% confidence interval. Remember that to pick out the central 99% of the distribution, we need to discard the most extreme 0.5% at each side.")
  }
  fail("Not quite. Take a look at the hints if you need some help.")
})

Bootstrapping the median

And finally, we want to construct a 90% confidence interval for the median amount of time Americans spend on email weekly. First, create a bootstrapped data frame just like you did in a previous exercise, only now, we want to calculate bootstrap medians. Your data frame should be called boot_df_median.

# Starter code: every ___ is a blank for the learner to fill in.
boot_df_median <- ___ |>
  ___
# Text hint shown to the learner (not R code):
Look at how you created your first bootstrapped data frame, just remember to calculate the median instead of the mean!
# Hint: the pipeline has three steps, each taking the arguments shown.
boot_df_median <- ___ |>
  ___(response = ___) |> 
  ___(reps =___, type = ___) |> 
  ___(stat = "___")
# Hint: start from gss16_email; only the response and the statistic remain blank.
boot_df_median <- gss16_email |>
  specify(response = ___) |> 
  generate(reps = 1000, type = "bootstrap") |> 
  calculate(stat = "___")
# Completed code: 1000 bootstrap replicates, with the median calculated for each.
boot_df_median <- gss16_email |>
  specify(response = email) |> 
  generate(reps = 1000, type = "bootstrap") |> 
  calculate(stat = "median")
# Message passed to grade_this_code() for this exercise.
grade_this_code("You have created a new bootstrapped data frame!")

Now, calculate the 90% confidence interval for the median amount of time spent on email.

# Starter code: every ___ is a blank for the learner to fill in.
___ |>
  ___
# Text hints shown to the learner (not R code):
Look at the previous question for help!
Remember to use the 'boot_df_median' dataframe.
# Hint: start from boot_df_median and summarize() a lower and a second value.
boot_df_median |>
  summarize(lower = ___(___, ___),
            ___ = ___)
# Hint: the lower bound is the 0.05 quantile.
boot_df_median |>
  summarize(lower = quantile(___, 0.05),
            upper = quantile(___, ___))
# Completed code: the 0.05 and 0.95 quantiles bound the central 90%.
boot_df_median |>
  summarize(lower = quantile(stat, 0.05),
            upper = quantile(stat, 0.95))
# Grade the 90% interval for the median: both bounds must equal exactly 120.
# The check relies on gradethis's pass() ending the grading when it is called,
# so the final fail() is only reached when the bounds did not match.
grade_this({
  lower <- as.numeric(.result$lower[1])
  upper <- as.numeric(.result$upper[1])

  # `&&` rather than `&`: an `if` condition is a single logical value.
  if (identical(lower, 120) && identical(upper, 120)) {
    pass("You have calculated the 90% confidence interval correctly!")
  }
  fail("Not quite. Take a look at the hints if you need some help.")
})

Based on your findings, answer the following question:

# Single-answer quiz on interpreting the 90% interval for the median.
# One statement is flagged as correct; learners may retry and the answers are
# shown in random order.
question(
  text = "Which statement is correct?",
  answer(
    text = "90% of Americans spend 2 hours per week on email."
  ),
  answer(
    text = "In 90% of random samples of size 1649 taken from the population, the median amount of time the respondents spent on email weekly was 2 hours",
    correct = TRUE
  ),
  answer(
    text = "We can be 90% certain that the mean amount of time spent on email per week is 2 hours."
  ),
  answer(
    text = "In any given sample of Americans, we can predict the median amount of time spent on email per week to be 2 hours with a margin of error of 12 minutes."
  ),
  allow_retry = TRUE,
  random_answer_order = TRUE
)

Wrap Up

Great work! We hope you enjoyed this exploration into bootstrapping and confidence intervals. Only one more tutorial to go!



rstudio-education/dsbox documentation built on Oct. 22, 2023, 12:20 a.m.