In emeyers/SDS100: Tools for the class Introductory Statistics

# Set the knitr chunk option `echo` to TRUE through opts_chunk$set().
knitr::opts_chunk$set(echo = TRUE)

library(SDS100)

$\\$

Confidence interval for a single mean

A study by Nierenberg et al. (1989) investigated the relationship between personal characteristics and dietary factors, and plasma concentrations of carotenoids.

Please use the data they collected to create a 98% confidence interval for the mean number of grams of fiber US adults get in a day.

# Uncomment to fetch the data file into the working directory first:
# download.file("https://www.lock5stat.com/datasets3e/NutritionStudy.csv", "NutritionStudy.csv")

# Load the study data and pull out the Fiber column.
nutrition_df <- read.csv("NutritionStudy.csv")
fiber <- nutrition_df[["Fiber"]]

# Ingredients of the t-interval: sample size, sample mean, standard error.
n <- length(fiber)
xbar <- mean(fiber)
SE <- sd(fiber) / sqrt(n)

# A 98% interval leaves 1% in each tail, so use the 0.99 quantile of the
# t distribution with n - 1 degrees of freedom.
t_star <- qt(0.99, df = n - 1)

# Interval endpoints: xbar -/+ t_star * SE.
CI <- xbar + c(-1, 1) * t_star * SE

CI

$\\$

Hypothesis test for a single mean

It is recommended that adults sleep at least 8 hours a night.

A Statistics professor asked 12 undergraduate students how much sleep they were getting and found the average was 6.2 hours with a standard deviation of 1.7 hours.

Assuming this is representative of all students in a Statistics class, does this provide evidence that students in the class are not getting enough sleep on average?

# One-sample t-test of H0: mu = 8 against HA: mu < 8, from summary statistics.
mu0 <- 8     # hypothesized mean (hours of sleep)
xbar <- 6.2  # sample mean
n <- 12      # sample size
s <- 1.7     # sample standard deviation

# Observed t statistic; the outer parentheses print the value as it is assigned.
(t_stat <- (xbar - mu0) / (s / sqrt(n)))

# p-value: lower-tail area of the t distribution with n - 1 degrees of freedom.
pt(t_stat, df = n - 1)

$\\$

Hypothesis tests for two means: Stereograms

step 1:

$H_0: \mu_{visual} = \mu_{no\_visual}$

$H_A: \mu_{visual} < \mu_{no\_visual}$

# step 2a: load the data and plot it

# SDS100::download_data("stereograms.txt")
stereograms <- read.table("stereograms.txt", header = TRUE)

# Fusion times for each group; which() skips rows whose group is NA,
# the same rows subset() would drop.
no_visual <- stereograms$fusion_time[which(stereograms$group == "NV")]
visual <- stereograms$fusion_time[which(stereograms$group == "VV")]

# Side-by-side boxplots of the two groups.
boxplot(visual, no_visual,
        ylab = "Fusion time (s)",
        names = c("visual", "no visual"))





# step 2b: compute the observed t statistic

# Summary statistics for the visual group.
n_visual <- length(visual)
mean_visual <- mean(visual)
var_visual <- var(visual)

# Summary statistics for the no-visual group.
n_no_visual <- length(no_visual)
mean_no_visual <- mean(no_visual)
var_no_visual <- var(no_visual)

# Standard error of the difference in means (variances are not pooled).
SE <- sqrt(var_visual / n_visual + var_no_visual / n_no_visual)

# Observed statistic: difference in sample means in standard-error units.
t_stat <- (mean_visual - mean_no_visual) / SE

# Degrees of freedom: the smaller sample size minus one.
degree_free <- min(n_visual, n_no_visual) - 1





# step 3: visualize null distribution

# Density of the t distribution with degree_free degrees of freedom on [-5, 5].
x_vals <- seq(from = -5, to = 5, length.out = 1000)
y_vals <- dt(x_vals, df = degree_free)
plot(x_vals, y_vals, type = "l")

# Red vertical line at the observed statistic.
abline(v = t_stat, col = "red")


# step 4: calculate the p-value

# Lower-tail area to the left of the observed statistic, matching
# H_A: mu_visual < mu_no_visual.
p_value <- pt(t_stat, df = degree_free)







# step 5: reject!


# The built-in test runs the same one-sided comparison in a single call.
# NOTE: t.test() computes its own degrees of freedom (see ?t.test), so its
# p-value need not match the hand-computed p_value exactly.
t.test(x = visual, y = no_visual, alternative = "less")

$\\$

Paired t-test: The freshman 15

The data is from: https://dasl.datadescription.com/datafile/freshman-15/

# SDS100::download_data("freshman-15.txt")
freshman <- read.table("freshman-15.txt", header = TRUE)

# Initial and terminal weights, one pair per row of the data.
initial_weight <- freshman$Initial.Weight
final_weight <- freshman$Terminal.Weight


# Let's define: mu_diff  =  mu_final - mu_initial

# H0: mu_diff  = 0
# HA: mu_diff  > 0
#   (weight gain; this is the direction the code tests: the p-value is an
#    upper-tail area, and t.test(initial_weight, final_weight,
#    alternative = "less") asks whether initial - final is below zero)

# Row-wise change in weight; positive values mean the final weight is higher.
weight_diff <- final_weight - initial_weight





# 2a. side-by-side boxplots of the two weights -- not a good visualization
boxplot(final_weight, initial_weight, names = c("Final", "Initial"))


# 2a. plot the differences themselves: strip chart and histogram
stripchart(weight_diff, method = "jitter")
hist(weight_diff, breaks = 20)
# Reference lines on the histogram: green at zero (no change),
# red at the observed mean difference.
abline(v = 0, col = "green")
abline(v = mean(weight_diff), col = "red")



# 2b. calculate the observed statistic

# Summary statistics of the differences. The count comes from weight_diff
# itself, the same vector whose mean and sd are used.
n_diff <- length(weight_diff)
mean_diff <- mean(weight_diff)
sd_diff <- sd(weight_diff)

# Standard error of the mean difference.
SE <- sd_diff / sqrt(n_diff)

# Observed t statistic (the null value of mu_diff is 0).
t_stat <- mean_diff / SE

# Degrees of freedom: number of differences minus one.
degree_free <- n_diff - 1






# 3. plot the null distribution
# Density of the t distribution with degree_free degrees of freedom on [-5, 5].
x_vals <- seq(from = -5, to = 5, length.out = 1000)
y_vals <- dt(x_vals, df = degree_free)
plot(x_vals, y_vals, type = "l")


# 4.  p-value
# Upper-tail area: probability of a t statistic above the observed one.
p_val <- pt(t_stat, df = degree_free, lower.tail = FALSE)






# 5. conclusion!


# Built-in paired test. With initial_weight given first, alternative = "less"
# asks whether the mean of (initial - final) is below zero.
t.test(x = initial_weight, y = final_weight,
       alternative = "less",
       paired = TRUE)


# compare to non-paired t-test
t.test(x = initial_weight, y = final_weight, alternative = "less")



# confidence interval on the weight gain...
# 95% t-interval: mean_diff -/+ t* times SE, with t* the 0.975 quantile.
mean_diff + c(-1, 1) * qt(0.975, df = degree_free) * SE

# more like the freshman 1.5 lbs!

# sanity check
# t.test(final_weight, initial_weight, paired = TRUE)$conf.int