require(tidyverse) #you may need to run install.packages('tidyverse') first, only once on a machine require(dplyr) require(lubridate) #same goes for any new package require(rvest) require(stringr) require(knitr) require(ggplot2) load("../data/honors.RData") load(file = "../data/athletic_summary.RData") require(gridExtra) load("../data/students.RData") load("../data/roster.RData") load("../data/classified.RData") load(file = "../data/ethnic_summary.RData") knitr::opts_chunk$set(tidy=TRUE, message=FALSE)
Motivation
Informal Response to the article : What Does it Mean to be the Best?: An Alum Considers the Relative Importance of Admission Criteria
Personal Interest in seeing how students of Asian origins are performing academically
Approach
Attain information from graduation catalog and team rosters
Use the wru
package to identify someone's ethnicity through surname analysis
Compare the percentage of students attaining cum laude
among different demographics.
students
dataset comes from the Williams College catalog archive, containing graduation data from the class of 2003 to the class of 2015Has r nrow(students)
entries
r
head(students, 3) %>%
mutate(name = c('*Amy Eph, with highest honor in Economics', '*Chris Somebody', '*Lauren Nobody')) %>%
kable()
The roster
dataset comes forom Williams College athletic archives, including rosters from the class of 2002 to the class of 2015 and teams such as soccer, football, and cross country
r nrow(roster)
entries
r
head(roster, 3) %>%
mutate(name = c('John Lax', 'Ryan Tennis', 'Will Football')) %>%
kable()
r
kable(head(ethnic_summary, 3))
A graphical summary of the data
r
p1 <- ethnic_summary %>%
ggplot(aes(x = year, y = ratio, color = race)) + geom_point() + theme_bw()
p2 <- honors %>%
mutate(latin = ifelse(summa == 1 | magna == 1 | cum == 1, 1, 0)) %>%
group_by(year, team) %>%
summarize(ratio = sum(latin) / length(name)) %>%
filter(team == "Women's Cross Country" | team == "Men's Soccer" | team == "football" | is.na(team)) %>%
ggplot(aes(x = year, y = ratio, color = team)) + geom_point() + theme_bw()
grid.arrange(p1, p2)
-
hispanic <- classified %>% filter(hispanic == 1) %>% group_by(latin) %>% summarize(value = length(name)) non_hispanic = classified %>% filter(hispanic == 0) %>% group_by(latin) %>% summarize(value = length(name)) prop.test(c(filter(hispanic, latin == 1)$value, filter(non_hispanic, latin == 1)$value), c(sum(hispanic$value), sum(non_hispanic$value)))
athlete <- athletic_summary %>% filter(athlete == 1) non_athlete <- athletic_summary %>% filter(athlete == 0) t.test(athlete$ratio, non_athlete$ratio)
r
p1 <- ggplot(athlete, aes(x = ratio, fill = "athlete")) +
geom_histogram(bins = 6, aes(y = ..density..)) + theme_bw() +
stat_function(fun = dnorm,
args = c(mean = mean(athlete$ratio),
sd = sd(athlete$ratio)))
p2 <- ggplot(non_athlete, aes(x = ratio, fill = "non-athlete")) +
geom_histogram(bins = 6, aes(y = ..density..)) + theme_bw() +
stat_function(fun = dnorm,
args = c(mean = mean(non_athlete$ratio),
sd = sd(non_athlete$ratio)))
grid.arrange(p1, p2, nrow = 1)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.