  collapse = TRUE,
  comment = "#>",
  fig.path = "README-files/",
  cache = TRUE,
  message = FALSE,
  warning = FALSE,
  fig.height = 10

About scorecard

Travis-CI Build Status AppVeyor Build Status

The scorecard package includes processed datasets from the College Scorecard, 1996-2017.

The Scorecard datasets are unbalanced panels at the college-by-school-year level. The data was last updated in 2019. See the changelog for more details.

The following datasets are available:

The following functions are implemented:

Related R Package

Benjamin Skinner has created a wonderful R client, rscorecard, for the College Scorecard GET API. If you're interested in getting some specific variables quickly, I suggest using the rscorecard package.


You can also download the datasets as an R package. It might take a while to install and load into memory. To download the most recent stable release, use

# install.packages("devtools")

# To uninstall the package, use:
# remove.packages("scorecard")


Loading the merged file for school year 2014-15

All datasets are tibbles:


Working with variable and value labels

All the datasets have variable labels attached, which can be viewed in RStudio's Data Viewer:


You can also use the labelled package:

scorecard::mf2014_15 %>% 
  select(1:8) %>% 

Or work with the codebook directly:

## List every variable name in the codebook next to its label
select(scorecard::codebook, var_name, var_label)

## Build a small function to show value labels
show_val_label = . %>% {
    filter(scorecard::codebook, var_name == .) %>% 
    mutate(val_label = glue::glue("{val_label}  = {value}")) %>% 

## Show value labels:

Exploring the codebook and plotting distributions of in-state tuition

# Columns needed for the in-state tuition plot
vars <- c(
  "mf_year",
  "iclevel",
  "control",
  "tuitionfee_in"
)

scorecard::codebook %>% 
  select(var_name, var_label, value, val_label) %>% 
  filter(var_name %in% vars) %>% 

dplyr_seq = . %>% 
  select(one_of(vars)) %>%
  haven::as_factor() %>% 
  filter(iclevel %in% c("4-year", "2-year")) %>% 
  mutate(year = mf_year %>% parse_number() %>% as.factor()) %>% 
  group_by(iclevel, control) %>% 
            ~statar::winsorise(., probs = c(0.02, 0.98), verbose = FALSE)) %>% 

## Try the functional sequence on a single school year
dplyr_seq(scorecard::mf2014_15)

  scorecard::mf2017_18 %>% dplyr_seq(),
  scorecard::mf2016_17 %>% dplyr_seq(),
  scorecard::mf2015_16 %>% dplyr_seq(),
  scorecard::mf2014_15 %>% dplyr_seq(),
  scorecard::mf2013_14 %>% dplyr_seq(),
  scorecard::mf2012_13 %>% dplyr_seq(),
  scorecard::mf2011_12 %>% dplyr_seq(),
  scorecard::mf2010_11 %>% dplyr_seq(),
  scorecard::mf2009_10 %>% dplyr_seq(),
  scorecard::mf2008_09 %>% dplyr_seq(),
  scorecard::mf2007_08 %>% dplyr_seq(),
  scorecard::mf2006_07 %>% dplyr_seq(),
  scorecard::mf2005_06 %>% dplyr_seq(),
  scorecard::mf2004_05 %>% dplyr_seq(),
  scorecard::mf2003_04 %>% dplyr_seq(),
  scorecard::mf2002_03 %>% dplyr_seq(),
  scorecard::mf2001_02 %>% dplyr_seq(),
  scorecard::mf2000_01 %>% dplyr_seq()
) -> df

# Ridgeline plot of in-state tuition by year, with one panel per
# institution level (rows) and control type (columns).
ggplot(df, aes(x = tuitionfee_in, y = year, fill = iclevel)) +
  ggjoy::geom_joy(scale = 2, alpha = 0.8, colour = "white") +
  facet_grid(iclevel ~ control, scales = "free") +
  scale_x_continuous(labels = scales::dollar) +
  # Only label every third year on the y axis, counting down from 2017
  scale_y_discrete(
    breaks = seq(from = 2017, to = 2000, by = -3),
    expand = c(0.01, 0)
  ) +
  labs(
    title = "In-State Tuition and Fees, 2000-2017",
    x = NULL,
    y = NULL
  ) +
  # The theme() tweaks come after the complete theme so they are not reset
  ggjoy::theme_joy() +
  theme(
    legend.position = "none",
    axis.text = element_text(size = 8)
  )

Comparing in-state and out-of-state tuition and fees

# Columns needed for the in-state vs. out-of-state comparison
vars <- c(
  "mf_year",
  "iclevel",
  "control",
  "tuitionfee_in",
  "tuitionfee_out"
)

# Functional sequence that prepares one merged Scorecard file for the
# in-state vs. out-of-state comparison. `vars` is resolved when the
# sequence is called, so it has to be defined before use.
dplyr_seq <- . %>%
  select(one_of(vars)) %>%
  haven::as_factor() %>%
  # Keep public 2-year and 4-year institutions only
  filter(
    iclevel %in% c("4-year", "2-year"),
    control == "Public"
  ) %>%
  mutate(
    type = paste(control, iclevel),
    year = as.factor(parse_number(mf_year))
  ) %>%
  # Winsorise both tuition columns at the 2nd/98th percentiles within type
  group_by(type) %>%
  mutate_at(
    c("tuitionfee_in", "tuitionfee_out"),
    function(x) statar::winsorise(x, probs = c(0.02, 0.98), verbose = FALSE)
  ) %>%
  ungroup() %>%
  # Stack the two tuition columns into key/value form for plotting
  gather(key = in_or_out, value = tuitionfee, tuitionfee_in:tuitionfee_out) %>%
  mutate(
    in_or_out = if_else(
      in_or_out == "tuitionfee_in",
      "In-state tuition and fees",
      "Out-of-state tuition and fees"
    )
  )

## Test the functional sequence
## scorecard::mf2014_15 %>% dplyr_seq()

  scorecard::mf2017_18 %>% dplyr_seq(),
  scorecard::mf2016_17 %>% dplyr_seq(),
  scorecard::mf2015_16 %>% dplyr_seq(),
  scorecard::mf2014_15 %>% dplyr_seq(),
  scorecard::mf2013_14 %>% dplyr_seq(),
  scorecard::mf2012_13 %>% dplyr_seq(),
  scorecard::mf2011_12 %>% dplyr_seq(),
  scorecard::mf2010_11 %>% dplyr_seq(),
  scorecard::mf2009_10 %>% dplyr_seq(),
  scorecard::mf2008_09 %>% dplyr_seq(),
  scorecard::mf2007_08 %>% dplyr_seq(),
  scorecard::mf2006_07 %>% dplyr_seq(),
  scorecard::mf2005_06 %>% dplyr_seq(),
  scorecard::mf2004_05 %>% dplyr_seq(),
  scorecard::mf2003_04 %>% dplyr_seq(),
  scorecard::mf2002_03 %>% dplyr_seq(),
  scorecard::mf2001_02 %>% dplyr_seq(),
  scorecard::mf2000_01 %>% dplyr_seq()
) -> df

# Ridgeline plot comparing in-state and out-of-state tuition by year,
# with one panel per institution type.
ggplot(df, aes(x = tuitionfee, y = year, fill = in_or_out)) +
  ggjoy::geom_joy(scale = 2, alpha = 0.8, colour = "white") +
  facet_wrap(~ type, scales = "free") +
  scale_x_continuous(labels = scales::dollar) +
  # Only label every third year on the y axis, counting down from 2017
  scale_y_discrete(
    breaks = seq(from = 2017, to = 2000, by = -3),
    expand = c(0.01, 0)
  ) +
  labs(
    title = "In-State Vs. Out-of-State Tuition and Fees for Public Colleges",
    caption = "Source: College Scorecard, 2000-2017",
    x = NULL,
    y = NULL
  ) +
  # The theme() tweaks come after the complete theme so they are not reset
  ggjoy::theme_joy() +
  theme(
    legend.title = element_blank(),
    legend.position = "top",
    legend.justification = "center",
    axis.text = element_text(size = 9)
  )

Distribution of Average Age of Entry

# Histogram of average age of entry for 2-year colleges, 2016-17 file.
# NOTE(review): as.numeric() assumes age_entry is not a factor after
# haven::as_factor() -- confirm against the dataset.
scorecard::mf2016_17 %>%
  select(instnm, control, iclevel, age_entry) %>%
  haven::as_factor() %>%
  filter(iclevel == "2-year") %>%
  mutate(age_entry = as.numeric(age_entry)) %>%
  ggplot(mapping = aes(x = age_entry)) +
  geom_histogram(colour = "black", fill = "white") +
  labs(
    title = "Distribution of Average Age of Entry",
    subtitle = "for 2-Year Colleges",
    caption = "Source: College Scorecard, 2016-17.",
    x = "Average Age of Entry",
    y = "Count"
  ) +
  hrbrthemes::theme_ipsum_rc()

# Histograms of average age of entry, 2016-17 file, with one panel per
# control type (rows) and institution level (columns).
# NOTE(review): as.numeric() assumes age_entry is not a factor after
# haven::as_factor() -- confirm against the dataset.
scorecard::mf2016_17 %>%
  select(instnm, control, iclevel, age_entry) %>%
  haven::as_factor() %>%
  mutate(age_entry = as.numeric(age_entry)) %>%
  ggplot(mapping = aes(x = age_entry)) +
  # Alternative geometry left disabled by the author: geom_freqpoly()
  geom_histogram(colour = "black", fill = "white") +
  facet_grid(control ~ iclevel) +
  labs(
    title = "Distribution of Average Age of Entry",
    subtitle = "by Control and Level of Institution",
    caption = "Source: College Scorecard, 2016-17.",
    x = "Average Age of Entry",
    y = "Count"
  ) +
  hrbrthemes::theme_ipsum()

jjchern/scorecard documentation built on Jan. 1, 2020, 12:59 p.m.