knitr::opts_chunk$set(echo = TRUE)
library(SDS100)
# download the data SDS100::download_data("zodiac.csv") SDS100::download_data("popularkids.txt") SDS100::download_data("analgesics.txt")
$\$
Chi-square test for goodness of it
CEO zodiac signs
# SDS100::download_data("zodiac.csv") zodiac <- read.csv("zodiac.csv", header = TRUE) # step 1: # H0: same proportion of CEO born under each sign # HA: some signs yield more CEOs # 2a: visualize the data births <- zodiac$Births barplot(births, names.arg = zodiac$Month, horiz = TRUE, las = 2) # step 2b. calculate the observed statistic expected <- sum(births)/12 chi_stat <- sum((births - expected)^2/expected) degree_free <- 12 - 1 # k - 1 # step 3: visualize the null x_vals <- seq(0, 50, length.out = 1000) y_vals <- dchisq(x_vals, degree_free) plot(x_vals, y_vals, type = "l") abline(v = chi_stat, col = "red") # step 4: p-value pchisq(chi_stat, degree_free, lower.tail = FALSE) # 5. fail to reject # sanity check using built in R functions chisq.test(zodiac$Births)
$\$
CEO zodiac signs continued...
# step 1: # H0: same proportion of CEO born under each sign # HA: some signs yield more CEOs # step 2: calculate the observed statistic expected_props <- rep(1/12, 12) # expected born each month (obs_stat <- get_chisqr_stat(births, expected_props)) # step 3: create the null distribution tot_births <- sum(births) # n = total number of CEOs born null_dist <- do_it(10000) * { simulated_births <- rmultinom(1, tot_births, expected_props) get_chisqr_stat(simulated_births, expected_props) } # view the null distribution hist(null_dist, freq = FALSE) x_vals <- seq(0, 40, by = .1) y_vals <- dchisq(x_vals, df = degree_free) # df = k-1 points(x_vals, y_vals, col = "green", type = "l") abline(v = obs_stat, col = "red") # how does this compare with the parametric p-value? pnull(obs_stat, null_dist, lower.tail = FALSE)
$\$
The data is from:
Chase, M. A., and Dummer, G. M. (1992), "The Role of Sports as a Social Determinant for Children," Research Quarterly for Exercise and Sport, 63, 418-424
The subjects, students in grades 4-6 in selected schools in Michigan, were asked the following question: What would you most like to do at school?
A. Make good grades. B. Be good at sports. C. Be popular.
Demographic information was also collected for each student.
# chi-square test for association # grades, popularity, sports preference across grades # data code book # https://math.tntech.edu/machida/ISR/3070/DASL/a/popularkids.html?dataset=3070/DASL/a/popularkids.txt library(SDS100) #download_data("popularkids.txt") kids <- read.table("popularkids.txt", header = TRUE) grade <- kids$Grade goals <- kids$Goals # Step 1: # H0: There is no difference between preference (of grades, popularity and sports) between grades # HA: There is difference in preference between grades # Step 2: # observed table (observed_counts <- table(goals, grade)) # visualize the data par(mfrow = c(3, 1)) barplot(observed_counts[, 1], main = "4th grade") barplot(observed_counts[, 2], main = "5th grade") barplot(observed_counts[, 3], main = "6th grade") # expected counts row_sums <- rowSums(observed_counts) col_sums <- colSums(observed_counts) n_tot <- sum(observed_counts) expected_counts <- outer(row_sums, col_sums)/n_tot # can visualize the expected counts par(mfrow = c(3, 1)) barplot(expected_counts[, 1], main = "4th grade") barplot(expected_counts[, 2], main = "5th grade") barplot(expected_counts[, 3], main = "6th grade") chi_stat <- sum(((observed_counts - expected_counts)^2)/expected_counts) # step 3: null dist degree_free <- (nrow(observed_counts) - 1) * (ncol(observed_counts) - 1) par(mfrow = c(1, 1)) x_vals <- seq(0, 20, length.out = 1000) y_vals <- dchisq(x_vals, degree_free) plot(x_vals, y_vals, type = "l") abline(v = chi_stat, col = "red") # step 4: p-value pchisq(chi_stat, degree_free, lower.tail = FALSE) # step 5: decision # built in R function to do it! chisq.test(observed_counts) # Extra: can also look at rural, suburan and urban # Is there a difference between these? living_location <- kids$Urban.Rural # observed table (observed_counts <- table(goals, living_location)) # visualize the data par(mfrow = c(3, 1)) barplot(observed_counts[, 1], main = "Rural") barplot(observed_counts[, 2], main = "Suburban") barplot(observed_counts[, 3], main = "Urban") chisq.test(observed_counts)
$\$
A pharmaceutical company tested three formulations of a pain relief medicine for migraine headache sufferers. For the experiment, 27 volunteers were selected and 9 were randomly assigned to one of three drug formulations. The subjects were instructed to take the drug during their next migraine headache episode and to report their pain on a scale of 1 = no pain to 10 = extreme pain 30 minutes after taking the drug.
data from: https://dasl.datadescription.com/datafile/analgesics/?_sfm_methods=Analysis+of+Variance&_sfm_cases=4+59943
# download_data("analgesics.txt") drugs <- read.table("analgesics.txt", header = TRUE) # step 1: # H0: all mu the same # HA: one mu different # step 2 drug_type <- drugs$Drug pain_rating <- drugs$Pain by(pain_rating, drug_type, mean) par(mfrow = c(1, 1)) boxplot(pain_rating ~ drug_type) # variances barely meet criteria by(pain_rating, drug_type, sd) F_stat <- get_F_stat(pain_rating, drug_type) # step 3: visualize null df1 <- 2 # K - 1 df2 <- 27 - 3 # N - K x_vals <- seq(0, 15, length.out = 1000) y_vals <- df(x_vals, df1, df2) plot(x_vals, y_vals, type = "l") abline(v = F_stat, col = "red") # step 4: pf(F_stat, df1, df2, lower.tail = FALSE) # step 5: # built in R functions anova_model <- aov(pain_rating ~ drug_type) summary(anova_model)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.