In emeyers/SDS100: Tools for the class Introductory Statistics

# Set the knitr chunk option echo = TRUE as the default for all chunks,
# so the R code is displayed along with its output in the knitted document.
knitr::opts_chunk$set(echo = TRUE)

Normal quantiles

We use quantiles from a standard normal distribution to create confidence intervals for different confidence levels.

For a given confidence level C, we want to find the quantile value $q^*$ such that C percent of the area of a standard normal distribution is between $-q^*$ and $q^*$.

# install.packages("mosaic")   # uncomment and run once if mosaic is missing

# Quantiles of the standard normal distribution at cumulative
# probabilities 0.025 and 0.975; 95% of the area lies between them.
quantile_values <- mosaic::xqnorm(c(0.025, 0.975))

# display the two quantile values
quantile_values

$\\$

Simple example of sampling with replacement

library(SDS100)


# a small example vector holding six values
my_sample <- c(3, 1, 4, 1, 5, 9)

# draw six values from the vector with replacement, so the same
# element may show up more than once in the resample
boot_sample <- sample(x = my_sample, size = 6, replace = TRUE)

$\\$

Simple fake data bootstrap example

# Bootstrap confidence interval for a mean, using a small made-up sample

my_sample <- c(21, 29, 25, 19, 24, 22, 25, 26, 25, 29)

# observed sample mean (the x-bar statistic)
obs_mean <- mean(my_sample)

# build the bootstrap distribution: each repetition resamples 10 values
# with replacement from the original sample and records their mean
bootstrap_dist <- do_it(10000) * {
  curr_boot <- sample(my_sample, size = 10, replace = TRUE)
  mean(curr_boot)
}

# histogram of the bootstrap distribution
hist(bootstrap_dist, breaks = 100)

# the bootstrap standard error is the standard deviation of the
# bootstrap distribution
SE_boot <- sd(bootstrap_dist)

# 95% confidence interval: statistic plus or minus two standard errors
CI_lower <- obs_mean - 2 * SE_boot
CI_upper <- obs_mean + 2 * SE_boot

c(CI_lower, CI_upper)

$\\$

Confidence interval example 1: One true love?

A survey asked 2625 people whether they agreed with the statement “There is only one true love for each person”.

1812 of the respondents disagreed.

Compute a 90% confidence interval for the proportion who disagreed.

# create the data set: one logical value per respondent
# (FALSE for the 1812 respondents who disagreed, TRUE for the other 813)

belives_in_true_love <- c(rep(FALSE, 1812), rep(TRUE, 2625 - 1812))


# The question asks about the respondents who DISAGREED with the statement,
# so flip the indicator: TRUE now marks a respondent who disagreed.
disagreed <- !belives_in_true_love


# The observed proportion of people who disagreed
# (note: we are using the mean function here to get a proportion,
# since the mean of a logical vector is the fraction of TRUE values)
# NOTE(review): SDS100's get_proportion() may be usable here as an
# alternative - confirm its arguments against the package documentation.

(obs_prop <- mean(disagreed))


# get an example of a single bootstrap statistic

# We can use sample(data_vec, n, replace = TRUE) to get a resample of length n:
boot_sample <- sample(disagreed, length(disagreed), replace = TRUE)
bootstrap_prop <- mean(boot_sample)


# Create a bootstrap distribution in R: each repetition resamples all
# respondents with replacement and records the proportion who disagreed
boot_dist <- do_it(10000) * {

  boot_sample <- sample(disagreed, length(disagreed), replace = TRUE)

  mean(boot_sample)

}


# visualize the bootstrap distribution
hist(boot_dist, 20)


# get the bootstrap standard error SE*
SE_boot <- sd(boot_dist)


# 90% confidence interval
# (qnorm(.95) leaves 5% in each tail, i.e. 90% of the area in the middle)
CI_lower <- obs_prop - qnorm(.95) * SE_boot
CI_upper <- obs_prop + qnorm(.95) * SE_boot

c(CI_lower, CI_upper)

$\\$

Confidence interval example 2: Average movie revenue

Let's use the Bechdel data to calculate a confidence interval for the mean revenue that movies make.

library(fivethirtyeight)

# domestic gross (2013 dollars) for each movie, with missing values dropped
revenue <- na.omit(bechdel$domgross_2013)

# observed mean revenue (wrapped in parentheses so it is also printed)
(obs_mean <- mean(revenue))

# bootstrap distribution: each repetition resamples the revenues with
# replacement, keeping the original sample size, and records the mean
boot_dist <- do_it(10000) * {
  boot_sample <- sample(revenue, size = length(revenue), replace = TRUE)
  mean(boot_sample)
}

# histogram of the bootstrap distribution
hist(boot_dist, breaks = 100)

# bootstrap standard error SE*: standard deviation of the bootstrap means
SE_boot <- sd(boot_dist)

# 95% confidence interval: statistic plus or minus two standard errors
CI_lower <- obs_mean - 2 * SE_boot
CI_upper <- obs_mean + 2 * SE_boot

c(CI_lower, CI_upper)