knitr::opts_chunk$set(echo = TRUE)
# download the data SDS100::download_data("stereograms.txt") SDS100::download_data("freshman-15.txt") SDS100::download_data("zodiac.csv")
library(SDS100)
$\$
step 1:
$H_0: \mu_{visual} = \mu_{no_visual}$
$H_A: \mu_{visual} < \mu_{no_visual}$
# step 2a: # SDS100::download_data("stereograms.txt") stereograms <- read.table("stereograms.txt", header = TRUE) no_visual <- subset(stereograms, group == 'NV')$fusion_time visual <- subset(stereograms, group == 'VV')$fusion_time boxplot(visual, no_visual, names = c("visual", "no visual"), ylab = "Fusion time (s)") # step 2b: mean_no_visual <- mean(no_visual) mean_visual <- mean(visual) n_no_visual <- length(no_visual) n_visual <- length(visual) var_no_visual <- var(no_visual) var_visual <- var(visual) SE <- sqrt( (var_visual/n_visual) + (var_no_visual/n_no_visual)) t_stat <- (mean_visual - mean_no_visual)/SE degree_free <- min(n_no_visual, n_visual) - 1 # step 3: visualize null distribution x_vals <- seq(-5, 5, length.out = 1000) y_vals <- dt(x_vals, degree_free) plot(x_vals, y_vals, type = "l") abline(v = t_stat, col = "red") # step 4: calculate the p-value p_value <- pt(t_stat, df = degree_free) # step 5: reject! # can also run t.test() t.test(visual, no_visual, alternative = "less")
$\$
Do freshman gain weight their first semester in college?
Step 1:
$H_0: \mu_{diff} = 0$
$H_A: \mu_{diff} > 0$
The data is from: https://dasl.datadescription.com/datafile/freshman-15/
# SDS100::download_data("freshman-15.txt") freshman <- read.table("freshman-15.txt", header = TRUE) initial_weight <- freshman$Initial.Weight final_weight <- freshman$Terminal.Weight # Let's define: mu_diff = mu_final - mu_initial weight_diff <- final_weight - initial_weight # 2a. not a good visualization boxplot(final_weight, initial_weight, names = c("Final", "Initial")) # 2a. stripchart and boxplot and stripchart(weight_diff, method = "jitter") hist(weight_diff, breaks = 20) abline(v = 0, col = "green") abline(v = mean(weight_diff), col = "red") # 2b. calculate the observed statistic mean_diff <- mean(weight_diff) sd_diff <- sd(weight_diff) n_diff <- length(final_weight) SE <- sd_diff/sqrt(n_diff) t_stat <- mean_diff/SE degree_free <- n_diff - 1 # 3. plot the null distribution x_vals <- seq(-5, 5, length.out = 1000) y_vals <- dt(x_vals, degree_free) plot(x_vals, y_vals, type = "l") # 4. p-value p_val <- pt(t_stat, degree_free, lower.tail = FALSE) # 5. conclusion! t.test(initial_weight, final_weight, paired = TRUE, alternative = "less") # compare to non-paired t-test t.test(initial_weight, final_weight, alternative = "less") # confidence interval on the weight gain... mean_diff + qt(.975, degree_free) * c(-SE, SE) # more like the freshman 1.5 lbs! # sanity check # t.test(final_weight, initial_weight, paired = TRUE)$conf.int
$\$
Let's run a chi-square test for goodness of fit test to see if there are the same number of CEOs born under different astrological signs.
Step 1:
$H_0$: same proportion of CEO born under each sign $H_A$: some signs yield more CEOs
#SDS100::download_data("zodiac.csv") zodiac <- read.csv("zodiac.csv", header = TRUE) # 2a: visualize the data births <- zodiac$Births barplot(births, names.arg = zodiac$Month, horiz = TRUE, las = 2) # step 2b. calculate the observed statistic expected <- sum(births)/12 chi_stat <- sum((births - expected)^2/expected) degree_free <- 12 - 1 # k - 1 # step 3: visualize the null x_vals <- seq(0, 50, length.out = 1000) y_vals <- dchisq(x_vals, degree_free) plot(x_vals, y_vals, type = "l") abline(v = chi_stat, col = "red") # step 4: p-value pchisq(chi_stat, degree_free, lower.tail = FALSE) # 5. fail to reject # sanity check using built in R functions chisq.test(zodiac$Births)
$\$
Can we run the test using resampling methods?
Step 1:
$H_0$: same proportion of CEO born under each sign. $H_A$: some signs yield more CEOs.
# step 2: calculate the observed statistic expected_props <- rep(1/12, 12) # expected born each month (obs_stat <- get_chisqr_stat(births, expected_props)) # step 3: create the null distribution tot_births <- sum(births) # n = total number of CEOs born null_dist <- do_it(10000) * { simulated_births <- rmultinom(1, tot_births, expected_props) get_chisqr_stat(simulated_births, expected_props) } # view the null distribution hist(null_dist, freq = FALSE) x_vals <- seq(0, 40, by = .1) y_vals <- dchisq(x_vals, df = degree_free) # df = k-1 points(x_vals, y_vals, col = "green", type = "l") abline(v = obs_stat, col = "red") # how does this compare with the parametric p-value? pnull(obs_stat, null_dist, lower.tail = FALSE)
$\$
total_selected <- 1453 (obs_counts <- total_selected * c(0.26, 0.08, 0.08, 0.54, 0.04)) expected_props <- c(0.15, 0.18, 0.12, 0.54, 0.01) chisq.test(obs_counts, p = expected_props)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.