In emeyers/SDS100: Tools for the class Introductory Statistics

knitr::opts_chunk$set(echo = TRUE)

# download the data

SDS100::download_data("stereograms.txt")
SDS100::download_data("freshman-15.txt")
SDS100::download_data("zodiac.csv")

library(SDS100)

$\$

Hypothesis tests for two means: Stereograms

step 1:

$H_0: \mu_{visual} = \mu_{no_visual}$

$H_A: \mu_{visual} < \mu_{no_visual}$

# step 2a: 

# SDS100::download_data("stereograms.txt")
stereograms <- read.table("stereograms.txt", header = TRUE)

no_visual <- subset(stereograms, group == 'NV')$fusion_time
visual <- subset(stereograms, group == 'VV')$fusion_time


boxplot(visual, no_visual, 
        names = c("visual", "no visual"),
        ylab = "Fusion time (s)")





# step 2b: 

mean_no_visual <- mean(no_visual)
mean_visual <- mean(visual)

n_no_visual <- length(no_visual)
n_visual <- length(visual)


var_no_visual <- var(no_visual)
var_visual <- var(visual)


SE <- sqrt( (var_visual/n_visual)  +  (var_no_visual/n_no_visual)) 


t_stat <- (mean_visual - mean_no_visual)/SE   

degree_free <- min(n_no_visual, n_visual) - 1





# step 3: visualize null distribution

x_vals <- seq(-5, 5, length.out = 1000)
y_vals <- dt(x_vals, degree_free)
plot(x_vals, y_vals, type = "l")

abline(v = t_stat, col = "red")






# step 4: calculate the p-value

p_value <- pt(t_stat, df = degree_free)







# step 5: reject!


# can also run t.test()

t.test(visual, no_visual, alternative = "less")

$\$

Paired t-test: The freshman 15

Do freshman gain weight their first semester in college?

Step 1:

$H_0: \mu_{diff} = 0$
$H_A: \mu_{diff} > 0$

The data is from: https://dasl.datadescription.com/datafile/freshman-15/

# SDS100::download_data("freshman-15.txt")
freshman <- read.table("freshman-15.txt", header = TRUE)

initial_weight <- freshman$Initial.Weight
final_weight <- freshman$Terminal.Weight


# Let's define: mu_diff  =  mu_final - mu_initial  


weight_diff <- final_weight - initial_weight


# 2a. not a good visualization
boxplot(final_weight, initial_weight, 
        names = c("Final", "Initial"))


# 2a. stripchart and boxplot and 
stripchart(weight_diff, method = "jitter")
hist(weight_diff, breaks = 20)
abline(v = 0, col = "green")
abline(v = mean(weight_diff), col = "red")



# 2b. calculate the observed statistic
mean_diff <- mean(weight_diff)
sd_diff <- sd(weight_diff)
n_diff <- length(final_weight)

SE <- sd_diff/sqrt(n_diff)

t_stat <- mean_diff/SE

degree_free <- n_diff - 1




# 3. plot the null distribution 
x_vals <- seq(-5, 5, length.out = 1000)
y_vals <- dt(x_vals, degree_free)
plot(x_vals, y_vals, type = "l")




# 4.  p-value
p_val <- pt(t_stat, degree_free, lower.tail = FALSE)




# 5. conclusion!


t.test(initial_weight, final_weight, 
       paired = TRUE,
       alternative = "less")


# compare to non-paired t-test
t.test(initial_weight, final_weight, alternative = "less")



# confidence interval on the weight gain...
mean_diff + qt(.975, degree_free) * c(-SE, SE)

# more like the freshman 1.5 lbs!

# sanity check
# t.test(final_weight, initial_weight, paired = TRUE)$conf.int

$\$

Inference on more than 2 proportions

Let's run a chi-square test for goodness of fit test to see if there are the same number of CEOs born under different astrological signs.

Step 1:

$H_0$: same proportion of CEO born under each sign $H_A$: some signs yield more CEOs

#SDS100::download_data("zodiac.csv")


zodiac <- read.csv("zodiac.csv", header = TRUE)


# 2a: visualize the data

births <- zodiac$Births

barplot(births, 
        names.arg = zodiac$Month, 
        horiz = TRUE, 
        las = 2)



# step 2b. calculate the observed statistic

expected <- sum(births)/12

chi_stat <- sum((births - expected)^2/expected)

degree_free <- 12 - 1   # k - 1


# step 3: visualize the null
x_vals <- seq(0, 50, length.out = 1000)
y_vals <- dchisq(x_vals, degree_free)
plot(x_vals, y_vals, type = "l")
abline(v = chi_stat, col = "red")

# step 4: p-value
pchisq(chi_stat, degree_free, lower.tail = FALSE)


# 5. fail to reject

# sanity check using built in R functions
chisq.test(zodiac$Births)

$\$

Inference on more than 2 proportions using resampling methods

Can we run the test using resampling methods?

Step 1:

$H_0$: same proportion of CEO born under each sign. $H_A$: some signs yield more CEOs.

# step 2:  calculate the observed statistic


expected_props <- rep(1/12, 12) # expected born each month

(obs_stat <- get_chisqr_stat(births, expected_props))



# step 3: create the null distribution 

tot_births <- sum(births)  # n = total number of CEOs born

null_dist <- do_it(10000) * {

  simulated_births <- rmultinom(1, tot_births, expected_props)
  get_chisqr_stat(simulated_births, expected_props)

}


# view the null distribution
hist(null_dist, freq = FALSE)

x_vals <- seq(0, 40, by = .1)
y_vals <- dchisq(x_vals, df = degree_free)  # df = k-1
points(x_vals, y_vals, col = "green", type = "l")

abline(v = obs_stat, col = "red")


# how does this compare with the parametric p-value?
pnull(obs_stat, null_dist, lower.tail = FALSE)

$\$

Example 2: Alameda county jury selection

total_selected <- 1453

(obs_counts <- total_selected * c(0.26, 0.08, 0.08, 0.54, 0.04))

expected_props <- c(0.15, 0.18, 0.12, 0.54, 0.01)

chisq.test(obs_counts, p = expected_props)