In emeyers/SDS100: Tools for the class Introductory Statistics

knitr::opts_chunk$set(echo = TRUE)

library(SDS100)

# download the data

SDS100::download_data("zodiac.csv")
SDS100::download_data("popularkids.txt")
SDS100::download_data("analgesics.txt")

$\$

Inference on more than 2 proportions

Chi-square test for goodness of it

CEO zodiac signs

# SDS100::download_data("zodiac.csv")


zodiac <- read.csv("zodiac.csv", header = TRUE)


# step 1: 
# H0: same proportion of CEO born under each sign
# HA: some signs yield more CEOs


# 2a: visualize the data

births <- zodiac$Births

barplot(births, 
        names.arg = zodiac$Month, 
        horiz = TRUE, 
        las = 2)



# step 2b. calculate the observed statistic

expected <- sum(births)/12

chi_stat <- sum((births - expected)^2/expected)

degree_free <- 12 - 1   # k - 1


# step 3: visualize the null
x_vals <- seq(0, 50, length.out = 1000)
y_vals <- dchisq(x_vals, degree_free)
plot(x_vals, y_vals, type = "l")
abline(v = chi_stat, col = "red")

# step 4: p-value
pchisq(chi_stat, degree_free, lower.tail = FALSE)


# 5. fail to reject

# sanity check using built in R functions
chisq.test(zodiac$Births)

$\$

Inference on more than 2 proportions using resampling methods

CEO zodiac signs continued...

# step 1: 
# H0: same proportion of CEO born under each sign
# HA: some signs yield more CEOs




# step 2:  calculate the observed statistic


expected_props <- rep(1/12, 12) # expected born each month

(obs_stat <- get_chisqr_stat(births, expected_props))



# step 3: create the null distribution 

tot_births <- sum(births)  # n = total number of CEOs born

null_dist <- do_it(10000) * {

  simulated_births <- rmultinom(1, tot_births, expected_props)
  get_chisqr_stat(simulated_births, expected_props)

}


# view the null distribution
hist(null_dist, freq = FALSE)

x_vals <- seq(0, 40, by = .1)
y_vals <- dchisq(x_vals, df = degree_free)  # df = k-1
points(x_vals, y_vals, col = "green", type = "l")

abline(v = obs_stat, col = "red")


# how does this compare with the parametric p-value?
pnull(obs_stat, null_dist, lower.tail = FALSE)

$\$

Differences in categorical distributions:chi-square test for association

The data is from:

Chase, M. A., and Dummer, G. M. (1992), "The Role of Sports as a Social Determinant for Children," Research Quarterly for Exercise and Sport, 63, 418-424

The subjects, students in grades 4-6 in selected schools in Michigan, were asked the following question: What would you most like to do at school?

A. Make good grades. B. Be good at sports. C. Be popular.

Demographic information was also collected for each student.

# chi-square test for association
# grades, popularity, sports preference across grades 


# data code book
# https://math.tntech.edu/machida/ISR/3070/DASL/a/popularkids.html?dataset=3070/DASL/a/popularkids.txt


library(SDS100)

#download_data("popularkids.txt")


kids <- read.table("popularkids.txt", header = TRUE)

grade <- kids$Grade
goals <- kids$Goals


# Step 1: 
# H0: There is no difference between preference (of grades, popularity and sports) between grades
# HA: There is difference in preference between grades




# Step 2:

# observed table
(observed_counts <- table(goals, grade))


# visualize the data
par(mfrow = c(3, 1))
barplot(observed_counts[, 1], main = "4th grade")
barplot(observed_counts[, 2], main = "5th grade")
barplot(observed_counts[, 3], main = "6th grade")


# expected counts
row_sums <- rowSums(observed_counts)
col_sums <- colSums(observed_counts)
n_tot <- sum(observed_counts)

expected_counts <- outer(row_sums, col_sums)/n_tot

# can visualize the expected counts
par(mfrow = c(3, 1))
barplot(expected_counts[, 1], main = "4th grade")
barplot(expected_counts[, 2], main = "5th grade")
barplot(expected_counts[, 3], main = "6th grade")


chi_stat <- sum(((observed_counts - expected_counts)^2)/expected_counts)





# step 3: null dist
degree_free <- (nrow(observed_counts) - 1) * (ncol(observed_counts) - 1)

par(mfrow = c(1, 1))
x_vals <- seq(0, 20, length.out = 1000)
y_vals <- dchisq(x_vals, degree_free)
plot(x_vals, y_vals, type = "l")
abline(v = chi_stat, col = "red")





# step 4: p-value
pchisq(chi_stat, degree_free, lower.tail = FALSE)




# step 5: decision


# built in R function to do it!
chisq.test(observed_counts)





# Extra: can also look at rural, suburan and urban
# Is there a difference between these? 


living_location <- kids$Urban.Rural


# observed table
(observed_counts <- table(goals, living_location))


# visualize the data
par(mfrow = c(3, 1))
barplot(observed_counts[, 1], main = "Rural")
barplot(observed_counts[, 2], main = "Suburban")
barplot(observed_counts[, 3], main = "Urban")


chisq.test(observed_counts)

$\$

One-way ANOVA

A pharmaceutical company tested three formulations of a pain relief medicine for migraine headache sufferers. For the experiment, 27 volunteers were selected and 9 were randomly assigned to one of three drug formulations. The subjects were instructed to take the drug during their next migraine headache episode and to report their pain on a scale of 1 = no pain to 10 = extreme pain 30 minutes after taking the drug.

data from: https://dasl.datadescription.com/datafile/analgesics/?_sfm_methods=Analysis+of+Variance&_sfm_cases=4+59943

# download_data("analgesics.txt")

drugs <- read.table("analgesics.txt", header = TRUE)


# step 1: 

# H0: all mu the same
# HA: one mu different




# step 2

drug_type <- drugs$Drug
pain_rating <- drugs$Pain

by(pain_rating, drug_type, mean)


par(mfrow = c(1, 1))
boxplot(pain_rating ~ drug_type)


# variances barely meet criteria
by(pain_rating, drug_type, sd)


F_stat <- get_F_stat(pain_rating, drug_type)






# step 3: visualize null

df1 <- 2    # K - 1
df2 <- 27 - 3   # N - K

x_vals <- seq(0, 15, length.out = 1000)
y_vals <- df(x_vals, df1, df2)
plot(x_vals, y_vals, type = "l")
abline(v = F_stat, col = "red")





# step 4:

pf(F_stat, df1, df2, lower.tail = FALSE)



# step 5:




# built in R functions
anova_model <- aov(pain_rating ~ drug_type)

summary(anova_model)