In emeyers/SDS100: Tools for the class Introductory Statistics

# knitr setup: make echo = TRUE the default chunk option, so the code in each
# chunk is shown in the rendered document along with its output
knitr::opts_chunk$set(echo = TRUE)

library(SDS100)


# download the data
# (download_data() is not a base R function; presumably it comes from SDS100
# and saves each file under the given name -- the load() calls later in this
# document read files with these same names)
download_data("Gettysburg_sample_ave_word_lengths.rda")
download_data("gettysburg.Rda")

$\\$

Gettysburg address word length distribution

Let's create a histogram of the lengths of the 268 words in the Gettysburg address.

# load the actual Gettysburg address words; load() restores the objects saved
# in the file, which is expected to provide the `gettysburg` object used below
load("gettysburg.Rda")

# get the actual number of letters in each word of the Gettysburg address
# (this is the population)
real_num_letters <- gettysburg$num_letters

# calculate the average of the population of word lengths (mu)
mean_num_letters <- mean(real_num_letters)


# create a histogram of the word lengths
hist(real_num_letters,
     xlab = "Number of letters",
     main = "Number of letters in words")


# plot the average word length (mu) as a blue vertical line; blue is the
# color used for mu in the other plots of this document
abline(v = mean_num_letters, col = "blue")

$\\$

Student samples of average word lengths from 10 words

Let's create a histogram of the average word length $\bar{x}$ from n = 10 words that students sampled from the Gettysburg address.

# read in the class data: the students' average word lengths (x-bar values)
load("Gettysburg_sample_ave_word_lengths.rda")

# one x-bar per student, pulled out of the loaded data as a vector
average_word_lengths <- word_lengths$Ave_word_lengths

# E[x-bar]: the mean of the students' sample means
mean_class_average_word_len <- mean(average_word_lengths)


# plot the distribution of the students' x-bar values
hist(
  average_word_lengths,
  main = "Histogram of average of 10 words",
  xlab = "Average word length of 10 words",
  breaks = 30
)


# overlay vertical reference lines: mu in blue, then E[x-bar] in red
# (abline() accepts vectors for v and col and draws the lines in order)
abline(
  v = c(mean_num_letters, mean_class_average_word_len),
  col = c("blue", "red")
)

$\\$

Using R to get samples of average word lengths from 10 words

Let's use R to randomly (uniformly) sample 10 words from the Gettysburg address and calculate the average $\bar{X}$ of 10 randomly sampled words.

You can also explore this using this app: https://ethan-meyers-test.shinyapps.io/gettysburg_sampling_distribution/

library(SDS100)


# you can ignore this code for now
# we will explain how this works soon...
# (each evaluation of the braced expression draws 10 word lengths without
# replacement -- the default for sample() -- and returns their mean;
# do_it(10000) presumably repeats it 10,000 times and collects the results)
sampling_distribution <- do_it(10000) * {

  random_sample <- sample(real_num_letters, 10)
  mean(random_sample)

}


# get the mean of the sampling distribution
# (the spelling of this variable name is kept in case other code uses it)
mean_sampling_distributon <- mean(sampling_distribution)


# create a histogram of the randomly sampled average word lengths
hist(sampling_distribution,
     breaks = 30,
     xlab = "Average word length of 10 words",
     main = "Histogram of average of 10 words")


# add a line at mu (the population mean word length)
abline(v = mean_num_letters, col = "blue")


# add a line at E[x-bar] (the mean of the class's sample averages)
abline(v = mean_class_average_word_len, col = "red")


# add a line at the mean of the simulated sampling distribution
abline(v = mean_sampling_distributon, col = "green")