library(tidyverse)
library(janitor)
library(lubridate)
library(stringr)
library(usethis)
library(devtools)
# Datasets cleaned and written by Albert Y. Kim
# airline_safety ---------------------------------------------------------------
# Read the raw airline safety table and normalise its column names to lower
# snake_case.
airline_safety <- read_csv("data-raw/airline-safety/airline-safety.csv")
names(airline_safety) <- str_replace_all(
  tolower(names(airline_safety)), " ", "_"
)
# A trailing asterisk on the airline name marks that regional subsidiaries are
# included: record that in its own logical column, strip the marker from the
# name, and move both columns to the front.
airline_safety <- airline_safety %>%
  mutate(
    incl_reg_subsidiaries = str_sub(airline, -1) == "*",
    airline = ifelse(incl_reg_subsidiaries, str_sub(airline, end = -2), airline)
  ) %>%
  select(airline, incl_reg_subsidiaries, everything())
usethis::use_data(airline_safety, overwrite = TRUE)
# alcohol_consumption ----------------------------------------------------------
# Read the alcohol consumption table; the only cleaning is lower snake_case
# column names.
drinks <- read_csv("data-raw/alcohol-consumption/drinks.csv")
names(drinks) <- str_replace_all(tolower(names(drinks)), " ", "_")
usethis::use_data(drinks, overwrite = TRUE)
# avengers ---------------------------------------------------------------------
# Read the Avengers roster and normalise column names to lower snake_case.
avengers <- read_csv("data-raw/avengers/avengers.csv")
names(avengers) <- str_replace_all(tolower(names(avengers)), " ", "_")
avengers <- avengers %>%
  # Replace the column names containing "/" or "?" with syntactic ones.
  rename(
    name_alias = `name/alias`,
    current = `current?`,
    probationary_intro = `probationary_introl`,
    full_reserve_avengers_intro = `full/reserve_avengers_intro`
  ) %>%
  # Recode the indicator columns as logicals: TRUE when the value is "YES",
  # FALSE for any other value, NA when the value is missing.
  mutate(
    current = current == "YES",
    death1 = death1 == "YES",
    return1 = return1 == "YES",
    death2 = death2 == "YES",
    return2 = return2 == "YES",
    death3 = death3 == "YES",
    return3 = return3 == "YES",
    death4 = death4 == "YES",
    return4 = return4 == "YES",
    death5 = death5 == "YES",
    return5 = return5 == "YES"
    # Convert full_reserve_avengers_intro to date object?
    # month = str_replace_all(full_reserve_avengers_intro, "[:digit:]", ""),
    # month = str_replace_all(month, "-", ""),
    # 14 cases where month was missing
    # date = ifelse(is.na(month), paste(year, "Jan", "01", sep="-"),
    # paste(year, month, "01", sep="-")),
    # full_reserve_avengers_intro2 = parse_date_time(date, "y-b-d")
  )
usethis::use_data(avengers, overwrite = TRUE)
# bad-drivers ------------------------------------------------------------------
# Read the bad drivers table and normalise column names to lower snake_case.
bad_drivers <- read_csv("data-raw/bad-drivers/bad-drivers.csv")
names(bad_drivers) <- str_replace_all(tolower(names(bad_drivers)), " ", "_")
# Swap the sentence-length column names for short ones.
bad_drivers <- rename(
  bad_drivers,
  num_drivers = `number_of_drivers_involved_in_fatal_collisions_per_billion_miles`,
  perc_speeding = `percentage_of_drivers_involved_in_fatal_collisions_who_were_speeding`,
  perc_alcohol = `percentage_of_drivers_involved_in_fatal_collisions_who_were_alcohol-impaired`,
  perc_not_distracted = `percentage_of_drivers_involved_in_fatal_collisions_who_were_not_distracted`,
  perc_no_previous = `percentage_of_drivers_involved_in_fatal_collisions_who_had_not_been_involved_in_any_previous_accidents`,
  insurance_premiums = `car_insurance_premiums_($)`,
  losses = `losses_incurred_by_insurance_companies_for_collisions_per_insured_driver_($)`
)
usethis::use_data(bad_drivers, overwrite = TRUE)
# bechdel ----------------------------------------------------------------------
# Read the Bechdel-test movie table and normalise column names to lower
# snake_case.
bechdel <- read_csv("data-raw/bechdel/movies.csv")
colnames(bechdel) <- colnames(bechdel) %>%
  tolower() %>%
  str_replace_all(" ", "_")
bechdel <- bechdel %>%
  # Drop the trailing "$" so these columns have syntactic names
  rename(
    budget_2013 = `budget_2013$`,
    domgross_2013 = `domgross_2013$`,
    intgross_2013 = `intgross_2013$`
  ) %>%
  mutate(
    # Clean some movie titles: decode the HTML entities for "&" and "'".
    # The previous patterns replaced "&" with "&" and "'" with "'", which
    # left every title unchanged.
    # NOTE(review): assumes the raw titles carry "&amp;" / "&#39;" -- confirm
    # against movies.csv.
    title = str_replace_all(title, fixed("&amp;"), "&"),
    title = str_replace_all(title, fixed("&#39;"), "'"),
    # Convert strings to numeric
    # Avatar and Titanic overflow integer capacity
    domgross = as.numeric(domgross),
    intgross = as.numeric(intgross),
    domgross_2013 = as.numeric(domgross_2013),
    intgross_2013 = as.numeric(intgross_2013),
    # Store the test outcome as an ordered factor using the level order below
    clean_test = factor(
      clean_test,
      levels = c("nowomen", "notalk", "men", "dubious", "ok"),
      ordered = TRUE
    )
  )
usethis::use_data(bechdel, overwrite = TRUE)
# biopics ----------------------------------------------------------------------
# Read the biopics table and normalise column names to lower snake_case.
biopics <- read_csv("data-raw/biopics/biopics.csv")
names(biopics) <- str_replace_all(tolower(names(biopics)), " ", "_")
biopics <- biopics %>%
  mutate(
    # box_office arrives as text: strip "-" and "$", treat what is then empty
    # as missing, and split the trailing unit letter from the number.
    box_office = str_replace_all(box_office, "[-$]", ""),
    box_office = ifelse(box_office == "", NA, box_office),
    units = str_sub(box_office, -1),
    box_office = as.numeric(str_sub(box_office, end = -2)),
    # "M" scales by a million, "K" by a thousand; any other unit gives NA.
    units = ifelse(units == "M", 10^6, ifelse(units == "K", 10^3, NA)),
    box_office = box_office * units,
    # Store the person_of_color indicator as a logical
    person_of_color = as.logical(person_of_color),
    # Keep only characters 27-35 of the IMDB link
    site = str_sub(site, 27, 35)
  ) %>%
  # units was only a helper for the conversion above
  select(-units)
usethis::use_data(biopics, overwrite = TRUE)
# births -----------------------------------------------------------------------
# Read the 1994-2003 births file and normalise column names to lower
# snake_case.
US_births_1994_2003 <-
  read_csv("data-raw/births/US_births_1994-2003_CDC_NCHS.csv")
names(US_births_1994_2003) <- str_replace_all(
  tolower(names(US_births_1994_2003)), " ", "_"
)
# Assemble a Date from the year / month / day-of-month columns, then replace
# day_of_week with the labelled weekday derived from that date.
US_births_1994_2003 <- US_births_1994_2003 %>%
  mutate(date = ymd(paste(year, month, date_of_month))) %>%
  mutate(day_of_week = wday(date, label = TRUE)) %>%
  select(year, month, date_of_month, date, everything())
usethis::use_data(US_births_1994_2003, overwrite = TRUE)
# Read the 2000-2014 births file and normalise column names to lower
# snake_case.
US_births_2000_2014 <- read_csv("data-raw/births/US_births_2000-2014_SSA.csv")
names(US_births_2000_2014) <- str_replace_all(
  tolower(names(US_births_2000_2014)), " ", "_"
)
# Assemble a Date from the year / month / day-of-month columns, then replace
# day_of_week with the labelled weekday derived from that date.
US_births_2000_2014 <- US_births_2000_2014 %>%
  mutate(date = ymd(paste(year, month, date_of_month))) %>%
  mutate(day_of_week = wday(date, label = TRUE)) %>%
  select(year, month, date_of_month, date, everything())
usethis::use_data(US_births_2000_2014, overwrite = TRUE)
# bob-ross ---------------------------------------------------------------------
# Read the per-episode element table and normalise column names to lower
# snake_case.
bob_ross <- read_csv("data-raw/bob-ross/elements-by-episode.csv")
names(bob_ross) <- str_replace_all(tolower(names(bob_ross)), " ", "_")
bob_ross <- bob_ross %>%
  mutate(
    # Drop the first and last character of each title
    title = str_sub(title, 2, -2),
    # Characters 2-3 of the episode code hold the season number and
    # characters 5-6 the episode number within that season
    season = as.numeric(str_sub(episode, 2, 3)),
    episode_num = as.numeric(str_sub(episode, 5, 6))
  ) %>%
  select(episode, season, episode_num, everything())
usethis::use_data(bob_ross, overwrite = TRUE)
# buster-posey_mvp -------------------------------------------------------------
# No Data
# classic-rock -----------------------------------------------------------------
# Read the raw classic rock play log and normalise column names to lower
# snake_case.
classic_rock_raw_data <-
  read_csv("data-raw/classic-rock/classic-rock-raw-data.csv")
names(classic_rock_raw_data) <- str_replace_all(
  tolower(names(classic_rock_raw_data)), " ", "_"
)
classic_rock_raw_data <- classic_rock_raw_data %>%
  # Keep only the cleaned song / artist columns, under shorter names
  select(-song_raw, -artist_raw, -`first?`) %>%
  rename(song = song_clean, artist = artist_clean) %>%
  # time is a count of seconds from 1970-01-01; add it as a date-time too
  mutate(date_time = as.POSIXct(time, origin = "1970-01-01")) %>%
  select(song, artist, callsign, time, date_time, unique_id, combined)
usethis::use_data(classic_rock_raw_data, overwrite = TRUE)
# Read the classic rock song list and normalise column names to lower
# snake_case.
classic_rock_song_list <-
  read_csv("data-raw/classic-rock/classic-rock-song-list.csv")
names(classic_rock_song_list) <- str_replace_all(
  tolower(names(classic_rock_song_list)), " ", "_"
)
classic_rock_song_list <- classic_rock_song_list %>%
  select(-`first?`) %>%
  # Give the cleaned and the punctuation-named columns readable names
  rename(
    song = song_clean,
    artist = artist_clean,
    has_year = `year?`,
    playcount_has_year = `f*g`
  ) %>%
  mutate(has_year = as.logical(has_year))
usethis::use_data(classic_rock_song_list, overwrite = TRUE)
# college-majors ---------------------------------------------------------------
# Read the all-ages college majors table and normalise column names to lower
# snake_case.
college_all_ages <- read_csv("data-raw/college-majors/all-ages.csv")
names(college_all_ages) <- str_replace_all(
  tolower(names(college_all_ages)), " ", "_"
)
college_all_ages <- college_all_ages %>%
  rename(employed_fulltime_yearround = employed_full_time_year_round) %>%
  mutate(
    # Store these two earnings columns as doubles
    median = as.double(median),
    p25th = as.double(p25th),
    # Title-case the major names
    major = str_to_title(major)
  ) %>%
  # Fix the column order
  select(
    major_code, major, major_category, total, employed,
    employed_fulltime_yearround, unemployed, unemployment_rate,
    p25th, median, p75th
  )
usethis::use_data(college_all_ages, overwrite = TRUE)
# Read the graduate-student college majors table and normalise column names to
# lower snake_case.
college_grad_students <- read_csv("data-raw/college-majors/grad-students.csv")
colnames(college_grad_students) <- colnames(college_grad_students) %>%
  tolower() %>%
  str_replace_all(" ", "_")
college_grad_students <- college_grad_students %>%
  rename(
    grad_employed_fulltime_yearround = grad_full_time_year_round,
    nongrad_employed_fulltime_yearround = nongrad_full_time_year_round,
    # Each percentile column keeps its own percentile. The grad pair was
    # previously crossed (grad_p25th taken from grad_p75 and vice versa),
    # unlike the nongrad pair below.
    grad_p25th = grad_p25,
    grad_p75th = grad_p75,
    nongrad_p25th = nongrad_p25,
    nongrad_p75th = nongrad_p75
  ) %>%
  mutate(
    # Store these two percentile columns as doubles
    grad_p25th = as.double(grad_p25th),
    nongrad_p25th = as.double(nongrad_p25th),
    # Title-case the major names
    major = str_to_title(major)
  ) %>%
  # Fix the column order: identifiers, grad block, nongrad block, comparisons
  select(
    major_code, major, major_category,
    grad_total, grad_sample_size, grad_employed, grad_employed_fulltime_yearround,
    grad_unemployed, grad_unemployment_rate,
    grad_p25th, grad_median, grad_p75th,
    nongrad_total, nongrad_employed, nongrad_employed_fulltime_yearround,
    nongrad_unemployed, nongrad_unemployment_rate,
    nongrad_p25th, nongrad_median, nongrad_p75th,
    grad_share, grad_premium
  )
usethis::use_data(college_grad_students, overwrite = TRUE)
# Redundant: These are the first three columns of college_all_ages and
# college_grad_students
# college_majors <- read_csv("data-raw/college-majors/majors-list.csv")
# colnames(college_majors) <- colnames(college_majors) %>%
# tolower() %>%
# str_replace_all(" ", "_")
# college_majors <- college_majors %>%
# rename(major_code = fod1p) %>%
# mutate(major = str_to_title(major))
# usethis::use_data(college_majors, overwrite = TRUE)
# Read the recent-graduates college majors table and normalise column names to
# lower snake_case.
college_recent_grads <- read_csv("data-raw/college-majors/recent-grads.csv")
names(college_recent_grads) <- str_replace_all(
  tolower(names(college_recent_grads)), " ", "_"
)
college_recent_grads <- college_recent_grads %>%
  # Prefix the employment-type columns with "employed_"
  rename(
    employed_fulltime_yearround = full_time_year_round,
    employed_fulltime = full_time,
    employed_parttime = part_time
  ) %>%
  mutate(
    # Store the earnings columns as doubles
    p25th = as.double(p25th),
    median = as.double(median),
    p75th = as.double(p75th),
    # Title-case the major names
    major = str_to_title(major)
  ) %>%
  # Fix the column order
  select(
    rank, major_code, major, major_category, total, sample_size,
    men, women, sharewomen,
    employed, employed_fulltime, employed_parttime, employed_fulltime_yearround,
    unemployed, unemployment_rate,
    p25th, median, p75th,
    college_jobs, non_college_jobs, low_wage_jobs
  )
usethis::use_data(college_recent_grads, overwrite = TRUE)
# Redundant: Simply a subset of college_recent_grads
# college_women_stem <- read_csv("data-raw/college-majors/women-stem.csv")
# colnames(college_women_stem) <- colnames(college_women_stem) %>%
# tolower() %>%
# str_replace_all(" ", "_")
# college_women_stem <- college_women_stem %>%
# mutate(
# median = as.double(median),
# major = str_to_title(major)
# )
# usethis::use_data(college_women_stem, overwrite = TRUE)
# comma-survey ------------------------------------------------------------
# Read the comma survey responses and normalise column names to lower
# snake_case.
comma_survey <- read_csv("data-raw/comma-survey/comma-survey.csv")
names(comma_survey) <- str_replace_all(tolower(names(comma_survey)), " ", "_")
comma_survey <- comma_survey %>%
  # Replace the full question texts with short variable names
  rename(
    respondent_id = respondentid,
    location = `location_(census_region)`,
    more_grammar_correct = `in_your_opinion,_which_sentence_is_more_gramatically_correct?`,
    heard_oxford_comma = `prior_to_reading_about_it_above,_had_you_heard_of_the_serial_(or_oxford)_comma?`,
    care_oxford_comma = `how_much,_if_at_all,_do_you_care_about_the_use_(or_lack_thereof)_of_the_serial_(or_oxford)_comma_in_grammar?`,
    write_following = `how_would_you_write_the_following_sentence?`,
    data_singular_plural = `when_faced_with_using_the_word_\"data\",_have_you_ever_spent_time_considering_if_the_word_was_a_singular_or_plural_noun?`,
    care_data = `how_much,_if_at_all,_do_you_care_about_the_debate_over_the_use_of_the_word_\"data\"_as_a_singluar_or_plural_noun?`,
    care_proper_grammar = `in_your_opinion,_how_important_or_unimportant_is_proper_use_of_grammar?`
  ) %>%
  mutate(
    # Demographics as ordered factors
    age = ordered(age, levels = c("18-29", "30-44", "45-60", "> 60")),
    household_income = ordered(household_income, levels = c(
      "$0 - $24,999", "$25,000 - $49,999", "$50,000 - $99,999",
      "$100,000 - $149,999", "$150,000+"
    )),
    education = ordered(education, levels = c(
      "Less than high school degree", "High school degree",
      "Some college or Associate degree", "Bachelor degree", "Graduate degree"
    )),
    # "Yes" answers become TRUE, other answers FALSE, missing stays NA
    heard_oxford_comma = heard_oxford_comma == "Yes",
    data_singular_plural = data_singular_plural == "Yes",
    # Attitude scales as ordered factors
    care_oxford_comma = ordered(care_oxford_comma, levels = c(
      "Not at all", "Not much", "Some", "A lot"
    )),
    care_data = ordered(care_data, levels = c(
      "Not at all", "Not much", "Some", "A lot"
    )),
    care_proper_grammar = ordered(care_proper_grammar, levels = c(
      "Very unimportant", "Somewhat unimportant",
      "Neither important nor unimportant (neutral)", "Somewhat important",
      "Very important"
    ))
  ) %>%
  # Respondent id and demographics first, survey answers after
  select(
    respondent_id, gender, age, household_income,
    education, location, everything()
  )
usethis::use_data(comma_survey, overwrite = TRUE)
# congress-age -----------------------------------------------------------------
# Manually edited original CSV:
# -Lines 7054, 7581, 8088:
# (Pierre,Samuel, IV,du Pont,,) to (Pierre,Samuel,du Pont,IV,)
# -Lines 9719, 10235, 10764, 11290:
# (Harold,John, Jr.,,Daub,,) to (Harold,John, Daub, Jr.,)
# -Lines 10019, 10550, 11075, 11606, 12141, 12664:
# (Itimous,Thaddeus, Jr.,,Valentine,,) to (Itimous,Thaddeus,Valentine,Jr.,)
# -Lines 10641, 11168, 11697, 12230, 12738:
# (John,Alexander, III,McMillan,,) to (John,Alexander,McMillan,III)
# Read the (manually corrected) congress terms file and normalise column names
# to lower snake_case.
congress_age <- read_csv("data-raw/congress-age/congress-terms.csv")
names(congress_age) <- str_replace_all(tolower(names(congress_age)), " ", "_")
# incumbent becomes a logical: TRUE for "Yes", FALSE otherwise, NA if missing
congress_age <- mutate(congress_age, incumbent = incumbent == "Yes")
usethis::use_data(congress_age, overwrite = TRUE)
# cousin-marriage --------------------------------------------------------------
# Read the cousin marriage table; the only cleaning is lower snake_case column
# names.
cousin_marriage <- read_csv("data-raw/cousin-marriage/cousin-marriage-data.csv")
names(cousin_marriage) <- str_replace_all(
  tolower(names(cousin_marriage)), " ", "_"
)
usethis::use_data(cousin_marriage, overwrite = TRUE)
# daily_show_guests ------------------------------------------------------------
# Read the Daily Show guest list and normalise column names to lower
# snake_case.
daily_show_guests <-
  read_csv("data-raw/daily-show-guests/daily_show_guests.csv")
names(daily_show_guests) <- str_replace_all(
  tolower(names(daily_show_guests)), " ", "_"
)
daily_show_guests <- daily_show_guests %>%
  # Correct the misspelt column name
  rename(google_knowledge_occupation = googleknowlege_occupation) %>%
  mutate(
    # Parse the month/day/year show date into a date object
    show = mdy(show),
    # Lower-case occupations so spellings differing only by case coincide
    google_knowledge_occupation = tolower(google_knowledge_occupation)
  )
usethis::use_data(daily_show_guests, overwrite = TRUE)
# democratic-bench -------------------------------------------------------------
# Read the democratic bench table, normalise column names to lower snake_case
# and spell out the abbreviated candidate column.
democratic_bench <- read_csv("data-raw/democratic-bench/democratic-bench.csv")
names(democratic_bench) <- str_replace_all(
  tolower(names(democratic_bench)), " ", "_"
)
democratic_bench <- rename(democratic_bench, candidate = cand)
usethis::use_data(democratic_bench, overwrite = TRUE)
# drug-use-by-age --------------------------------------------------------------
# Read the drug use by age table; "-" cells are read as missing alongside the
# usual "" and "NA".
drug_use <- read_csv(
  "data-raw/drug-use-by-age/drug-use-by-age.csv",
  na = c("", "NA", "-")
)
# Lower snake_case names: spaces and hyphens both become underscores, and
# "frequency" is shortened to "freq".
names(drug_use) <- names(drug_use) %>%
  tolower() %>%
  str_replace_all("[ -]", "_") %>%
  str_replace_all("frequency", "freq")
# Age groups as an ordered factor: single years 12-21, then the wider bands.
drug_use <- drug_use %>%
  mutate(age = ordered(age, levels = c(
    as.character(12:21),
    "22-23", "24-25", "26-29", "30-34", "35-49", "50-64", "65+"
  )))
usethis::use_data(drug_use, overwrite = TRUE)
# early-senate-polls -----------------------------------------------------------
# Read the early senate polls table; the only cleaning is lower snake_case
# column names.
senate_polls <- read_csv("data-raw/early-senate-polls/early-senate-polls.csv")
names(senate_polls) <- str_replace_all(tolower(names(senate_polls)), " ", "_")
usethis::use_data(senate_polls, overwrite = TRUE)
# elo-blatter ------------------------------------------------------------------
# Read the elo_blatter table; the only cleaning is lower snake_case column
# names.
elo_blatter <- read_csv("data-raw/elo-blatter/elo_blatter.csv")
names(elo_blatter) <- str_replace_all(tolower(names(elo_blatter)), " ", "_")
usethis::use_data(elo_blatter, overwrite = TRUE)
# endorsements-june-30 ---------------------------------------------------------
# Read the endorsements table and normalise column names to lower snake_case.
endorsements <-
  read_csv("data-raw/endorsements-june-30/endorsements-june-30.csv")
names(endorsements) <- str_replace_all(tolower(names(endorsements)), " ", "_")
# won_primary becomes a logical: TRUE for "Yes", FALSE otherwise, NA if missing
endorsements <- mutate(endorsements, won_primary = won_primary == "Yes")
usethis::use_data(endorsements, overwrite = TRUE)
# fandango ---------------------------------------------------------------------
# Read the Fandango score comparison and normalise column names to lower
# snake_case.
fandango <- read_csv("data-raw/fandango/fandango_score_comparison.csv")
names(fandango) <- str_replace_all(tolower(names(fandango)), " ", "_")
# Split film at " (" into film and year, then strip the closing parenthesis
# from year and store it as a number.
fandango <- fandango %>%
  separate(film, c("film", "year"), sep = " \\(") %>%
  mutate(year = as.numeric(str_replace_all(year, "\\)", "")))
usethis::use_data(fandango, overwrite = TRUE)
# Read the Fandango scrape and normalise column names to lower snake_case.
# The remaining steps for this table are commented out below.
fandango_scrape <- read_csv("data-raw/fandango/fandango_scrape.csv")
names(fandango_scrape) <- str_replace_all(
  tolower(names(fandango_scrape)), " ", "_"
)
# double parentheses at some points
# fandango_scrape <- fandango_scrape %>%
# separate(film, c("film", "year"), sep=" \\(") %>%
# mutate(
# year = str_replace_all(year, "\\)", ""),
# year = as.numeric(year)
# )
# usethis::use_data(fandango_scrape, overwrite = TRUE)
# fifa -------------------------------------------------------------------------
# Read the FIFA audience table; the only cleaning is lower snake_case column
# names.
fifa_audience <- read_csv("data-raw/fifa/fifa_countries_audience.csv")
names(fifa_audience) <- str_replace_all(tolower(names(fifa_audience)), " ", "_")
usethis::use_data(fifa_audience, overwrite = TRUE)
# flying-etiquette-survey ------------------------------------------------------
# Read the flying etiquette survey. Each survey question is renamed to a short
# variable name; the question texts are matched exactly as they appear in the
# file header.
flying <- read_csv("data-raw/flying-etiquette-survey/flying-etiquette.csv")
flying <- rename(
  flying,
  respondent_id = RespondentID,
  location = `Location (Census Region)`,
  frequency = `How often do you travel by plane?`,
  recline_frequency = `Do you ever recline your seat when you fly?`,
  height = `How tall are you?`,
  children_under_18 = `Do you have any children under 18?`,
  two_arm_rests = `In a row of three seats, who should get to use the two arm rests?`,
  middle_arm_rest = `In a row of two seats, who should get to use the middle arm rest?`,
  shade = `Who should have control over the window shade?`,
  unsold_seat = `Is itrude to move to an unsold seat on a plane?`,
  talk_stranger = `Generally speaking, is it rude to say more than a few words tothe stranger sitting next to you on a plane?`,
  get_up = `On a 6 hour flight from NYC to LA, how many times is it acceptable to get up if you're not in an aisle seat?`,
  recline_obligation = `Under normal circumstances, does a person who reclines their seat during a flight have any obligation to the person sitting behind them?`,
  recline_rude = `Is itrude to recline your seat on a plane?`,
  recline_eliminate = `Given the opportunity, would you eliminate the possibility of reclining seats on planes entirely?`,
  switch_seats_friends = `Is it rude to ask someone to switch seats with you in order to be closer to friends?`,
  switch_seats_family = `Is itrude to ask someone to switch seats with you in order to be closer to family?`,
  wake_up_bathroom = `Is it rude to wake a passenger up if you are trying to go to the bathroom?`,
  wake_up_walk = `Is itrude to wake a passenger up if you are trying to walk around?`,
  baby = `In general, is itrude to bring a baby on a plane?`,
  unruly_child = `In general, is it rude to knowingly bring unruly children on a plane?`,
  electronics = `Have you ever used personal electronics during take off or landing in violation of a flight attendant's direction?`,
  smoked = `Have you ever smoked a cigarette in an airplane bathroom when it was against the rules?`
)
# Columns that were not renamed above still need lower snake_case names.
names(flying) <- str_replace_all(tolower(names(flying)), " ", "_")
# Recode the survey answers: demographics and ordinal answers become ordered
# factors, yes/no answers become logicals, and the columns are reordered.
flying <- flying %>%
  mutate(
    # Demographic info
    age = ordered(age, levels = c("18-29", "30-44", "45-60", "> 60")),
    household_income = ordered(household_income, levels = c(
      "$0 - $24,999", "$25,000 - $49,999", "$50,000 - $99,999",
      "$100,000 - $149,999", "$150,000+"
    )),
    education = ordered(education, levels = c(
      "Less than high school degree", "High school degree",
      "Some college or Associate degree", "Bachelor degree", "Graduate degree"
    )),
    # Yes/No answers: TRUE for the affirmative answer, FALSE for any other
    # answer, NA when unanswered
    electronics = electronics == "Yes",
    smoked = smoked == "Yes",
    children_under_18 = children_under_18 == "Yes",
    recline_obligation = recline_obligation ==
      "Yes, they should not recline their chair if the person behind them asks them not to",
    recline_eliminate = recline_eliminate == "Yes",
    # Ordinal answers as ordered factors
    height = ordered(height, levels = c(
      "Under 5 ft.", "5'0\"", "5'1\"", "5'2\"", "5'3\"", "5'4\"", "5'5\"",
      "5'6\"", "5'7\"", "5'8\"", "5'9\"", "5'10\"", "5'11\"",
      "6'0\"", "6'1\"", "6'2\"", "6'3\"", "6'4\"", "6'5\"", "6'6\" and above"
    )),
    frequency = ordered(frequency, levels = c(
      "Never", "Once a year or less", "Once a month or less", "A few times per month",
      "A few times per week", "Every day"
    )),
    recline_frequency = ordered(recline_frequency, levels = c(
      "Never", "Once in a while", "About half the time", "Usually", "Always"
    )),
    unsold_seat = ordered(unsold_seat, levels = c(
      "No, not rude at all", "Yes, somewhat rude", "Yes, very rude"
    )),
    talk_stranger = ordered(talk_stranger, levels = c(
      "No, not at all rude", "Yes, somewhat rude", "Yes, very rude"
    )),
    get_up = ordered(get_up, levels = c(
      "It is not okay to get up during flight", "Once", "Twice", "Three times", "Four times",
      "More than five times times"
    )),
    recline_rude = ordered(recline_rude, levels = c(
      "No, not rude at all", "Yes, somewhat rude", "Yes, very rude"
    )),
    switch_seats_friends = ordered(switch_seats_friends, levels = c(
      "No, not at all rude", "Yes, somewhat rude", "Yes, very rude"
    )),
    switch_seats_family = ordered(switch_seats_family, levels = c(
      "No, not at all rude", "Yes, somewhat rude", "Yes, very rude"
    )),
    wake_up_bathroom = ordered(wake_up_bathroom, levels = c(
      "No, not at all rude", "Yes, somewhat rude", "Yes, very rude"
    )),
    wake_up_walk = ordered(wake_up_walk, levels = c(
      "No, not at all rude", "Yes, somewhat rude", "Yes, very rude"
    )),
    baby = ordered(baby, levels = c(
      "No, not at all rude", "Yes, somewhat rude", "Yes, very rude"
    )),
    unruly_child = ordered(unruly_child, levels = c(
      "No, not at all rude", "Yes, somewhat rude", "Yes, very rude"
    ))
  ) %>%
  # Respondent id and demographics first, then the answers grouped by topic
  select(
    respondent_id, gender, age, height, children_under_18, household_income, education, location,
    frequency, recline_frequency, recline_obligation, recline_rude, recline_eliminate,
    switch_seats_friends, switch_seats_family,
    wake_up_bathroom, wake_up_walk,
    baby, unruly_child,
    two_arm_rests, middle_arm_rest,
    everything()
  )
# Shorten the three rudeness levels of every "is it rude" column to
# No / Somewhat / Very, keeping their order.
flying <- flying %>%
  mutate(across(
    c(
      recline_rude, switch_seats_friends, switch_seats_family,
      wake_up_bathroom, wake_up_walk, baby, unruly_child,
      unsold_seat, talk_stranger
    ),
    function(answer) {
      levels(answer) <- c("No", "Somewhat", "Very")
      answer
    }
  ))
# Tabulate the relabelled columns as a check; the result is not stored.
flying %>%
  select(
    recline_rude, switch_seats_friends, switch_seats_family, wake_up_bathroom,
    wake_up_walk, baby, unruly_child, unsold_seat, talk_stranger
  ) %>%
  apply(2, table)
usethis::use_data(flying, overwrite = TRUE)
# food-world-cup ---------------------------------------------------------------
# Read the food world cup survey.
food_world_cup <- read_csv("data-raw/food-world-cup/food-world-cup-data.csv")
# Build the new column names. Columns whose header starts with
# "Please rate how much you like" are reduced to characters 58 through the
# second-to-last of the header; columns 1, 2, 3 and 48 are named by hand.
varnames <- colnames(food_world_cup)
country_indices <- startsWith(varnames, "Please rate how much you like")
varnames[country_indices] <- str_sub(varnames[country_indices], 58, -2)
varnames[c(1, 2, 3, 48)] <- c(
  "respondent_id", "knowledge", "interest", "location"
)
names(food_world_cup) <- str_replace_all(tolower(varnames), " ", "_")
food_world_cup <- food_world_cup %>%
  mutate(
    # Remove the stray "\xca" byte from interest before matching its levels
    interest = str_replace_all(interest, "\xca", ""),
    knowledge = ordered(knowledge, levels = c(
      "Novice", "Intermediate", "Advanced", "Expert"
    )),
    interest = ordered(interest, levels = c(
      "Not at all", "Not much", "Some", "A lot"
    )),
    # NOTE(review): age and household_income are plain (unordered) factors
    # here, unlike knowledge, interest and education -- confirm this is
    # intended.
    age = factor(age, levels = c("18-29", "30-44", "45-60", "> 60")),
    household_income = factor(household_income, levels = c(
      "$0 - $24,999", "$25,000 - $49,999", "$50,000 - $99,999",
      "$100,000 - $149,999", "$150,000+"
    )),
    education = ordered(education, levels = c(
      "Less than high school degree", "High school degree",
      "Some college or Associate degree", "Bachelor degree", "Graduate degree"
    ))
  ) %>%
  # Respondent info first, then the country columns in this fixed order
  select(
    respondent_id, knowledge, interest, gender, age, household_income,
    education, location, algeria, argentina, australia, belgium,
    bosnia_and_herzegovina,
    brazil, cameroon, chile, china, colombia, costa_rica,
    croatia, cuba, ecuador, england, ethiopia, france,
    germany, ghana, greece, honduras, india, iran, ireland,
    italy, ivory_coast, japan, mexico, nigeria, portugal,
    russia, south_korea, spain, switzerland, thailand,
    the_netherlands, turkey, united_states, uruguay, vietnam
  )
usethis::use_data(food_world_cup, overwrite = TRUE)
# love-actually -------------------------------------------------------------------------
# Read the Love Actually adjacency table; missing cells are treated as 0.
love_actually_adj <- read_csv("data-raw/love-actually/love_actually_adjacencies.csv")
love_actually_adj <- replace(love_actually_adj, is.na(love_actually_adj), 0)
# HACK: Make adjacency matrix symmetric by overwriting row i (all columns but
# the first) with column i + 1. The rows are processed in order and later
# iterations read columns already modified by earlier ones, so the statement
# order must not change.
# seq_len() rather than 1:nrow() so that an empty table runs zero iterations
# instead of indexing rows 1 and 0.
for (i in seq_len(nrow(love_actually_adj))) {
  love_actually_adj[i, -1] <- t(love_actually_adj[, i + 1])
}
usethis::use_data(love_actually_adj, overwrite = TRUE)
# Read the Love Actually appearances table, keeping only the first 71 rows
# (the only ones that had data).
love_actually_appearance <-
  read_csv("data-raw/love-actually/love_actually_appearances.csv")
love_actually_appearance <- slice(love_actually_appearance, 1:71)
# Every missing cell becomes FALSE
love_actually_appearance <- love_actually_appearance %>%
  replace(is.na(.), FALSE)
usethis::use_data(love_actually_appearance, overwrite = TRUE)
# Datasets cleaned and written by Chester Ismay
# Get list of variable names in df with newline
get_names <- function(x) {
  # Print the names of x to the console, one per line, with no trailing
  # newline after the last one.
  cat(paste(names(x), collapse = "\n"))
}
# Chester: forecast-methodology thru police-killings
# Pick the data-raw sub-folders by their position in the directory listing
# (entries 26 to 51, skipping 31) and print their paths, one per line.
# NOTE(review): the selection is positional, so it shifts whenever a folder is
# added to or removed from data-raw.
chester_folders <- list.files(path = "data-raw")[setdiff(26:51, 31)]
dirs <- paste0("data-raw/", chester_folders, "\n")
cat(dirs)
# To make headers, guess at data frame names, and use_data
# Build a copy-and-paste code template for one data-raw folder: a section
# header padded with dashes, an empty assignment stub, and the use_data() call.
#
# x: a folder name such as "nba-elo"; its dashes become underscores to form
#    the guessed data frame name.
# Returns the template as a single string.
prep <- function(x) {
  # The padding is sized from the original folder name. pmax() keeps the count
  # non-negative for names longer than 81 characters, where rep() used to
  # stop with an "invalid 'times' argument" error.
  dashes <- strrep("-", pmax(0L, 81L - nchar(x)))
  x <- gsub("-", "_", x, fixed = TRUE)
  paste0(
    "# ", x, " ", dashes, "\n",
    x, " <-", "\n\n",
    # Emit usethis::use_data(): use_data() moved from devtools to usethis, and
    # the rest of this script already calls it from usethis.
    "usethis::use_data(", x, ", overwrite = TRUE) \n\n\n\n\n"
  )
}
# Print the generated template for every selected folder.
cat(unlist(lapply(chester_folders, prep)))
# Get state info
# NOTE(review): presumably this defines `state_info`, which the hate-crimes
# section below joins on; it is not defined anywhere else in this script --
# confirm against data-raw/state_info.R.
source("data-raw/state_info.R")
# forecast_methodology -------------------------------------------------------------
# Read the historical senate predictions, dropping winflag (a redundant column
# not specified in README.md), and normalise column names to lower snake_case.
hist_senate_preds <-
  read_csv("data-raw/forecast-methodology/historical-senate-predictions.csv") %>%
  select(-winflag)
colnames(hist_senate_preds) <- colnames(hist_senate_preds) %>%
  tolower() %>%
  str_replace_all(" ", "_")
# use_data() moved from devtools to usethis; call it from usethis like the
# other sections of this script.
usethis::use_data(hist_senate_preds, overwrite = TRUE)
# hate-crimes ----------------------------------------------------------------------
# Read the hate crimes table and shorten the longest column names.
hate_crimes <- read_csv("data-raw/hate-crimes/hate_crimes.csv")
hate_crimes <- rename(
  hate_crimes,
  "share_pop_HS" = share_population_with_high_school_degree,
  "share_pop_metro" = share_population_in_metro_areas,
  "share_vote_trump" = share_voters_voted_trump,
  "share_unemp_seas" = share_unemployed_seasonal,
  "median_house_inc" = median_household_income
)
# Lower snake_case for every column name (this also lower-cases the "HS"
# introduced above).
names(hate_crimes) <- str_replace_all(tolower(names(hate_crimes)), " ", "_")
# Join the state lookup but keep only its abbreviation: division and region
# are dropped again, and state / state_abbrev are moved to the front.
hate_crimes <- hate_crimes %>%
  left_join(state_info, by = "state") %>%
  select(-division, -region) %>%
  select(state, state_abbrev, everything())
usethis::use_data(hate_crimes, overwrite = TRUE)
# hip_hop_candidate_lyrics ---------------------------------------------------------
# Read the hip-hop candidate lyrics table, treating "N/A" as missing.
hiphop_cand_lyrics <-
  read_csv("data-raw/hip-hop-candidate-lyrics/genius_hip_hop_lyrics.csv",
    na = "N/A"
  ) %>%
  # id is not specified in README.md
  select(-id) %>%
  # Sentiment as an ordered factor running from negative to positive
  mutate(sentiment = factor(
    sentiment,
    levels = c("negative", "neutral", "positive"),
    ordered = TRUE
  ))
colnames(hiphop_cand_lyrics) <- colnames(hiphop_cand_lyrics) %>%
  tolower() %>%
  str_replace_all(" ", "_")
# use_data() moved from devtools to usethis; call it from usethis like the
# other sections of this script.
usethis::use_data(hiphop_cand_lyrics, overwrite = TRUE)
# historical_ncaa_forecasts --------------------------------------------------------
# Read the historical NCAA tournament model results.
path <- "data-raw/historical-ncaa-forecasts/"
hist_ncaa_bball_casts <-
  read_csv(paste0(path, "historical-538-ncaa-tournament-model-results.csv")) %>%
  rename(
    favorite_prob = favorite_probability,
    favorite_win = favorite_win_flag
  ) %>%
  # The win flag becomes a logical: TRUE when it equals 1, FALSE otherwise,
  # NA when missing
  mutate(favorite_win = favorite_win == 1)
colnames(hist_ncaa_bball_casts) <- colnames(hist_ncaa_bball_casts) %>%
  tolower() %>%
  str_replace_all(" ", "_")
# use_data() moved from devtools to usethis; call it from usethis like the
# other sections of this script.
usethis::use_data(hist_ncaa_bball_casts, overwrite = TRUE)
# infrastructure_jobs --------------------------------------------------------------
# Read the state payrolls table and drop the row number variable X1.
state_payrolls <- read_csv("data-raw/infrastructure-jobs/payroll-states.csv")
state_payrolls <- select(state_payrolls, -X1) # %>%
# gather(key = "state", value = "value", -date)
names(state_payrolls) <- str_replace_all(
  tolower(names(state_payrolls)), " ", "_"
)
# Saving this table is currently disabled:
# devtools::use_data(state_payrolls, overwrite = TRUE)
# librarians -----------------------------------------------------------------------
# Read the librarians-by-MSA table.
librarians <- read_csv("data-raw/librarians/librarians-by-msa.csv") %>%
  # Remove missing data rows (the first two)
  slice(-c(1, 2)) %>%
  rename(loc_quotient = loc.quotient) %>%
  # Store both columns as numbers
  mutate(
    jobs_1000 = as.numeric(jobs_1000),
    loc_quotient = as.numeric(loc_quotient)
  )
colnames(librarians) <- colnames(librarians) %>%
  tolower() %>%
  str_replace_all(" ", "_")
# use_data() moved from devtools to usethis; call it from usethis like the
# other sections of this script.
usethis::use_data(librarians, overwrite = TRUE)
# mad_men --------------------------------------------------------------------------
# Read the Mad Men show data, replacing the column names that contain "#",
# "?" or "/" with syntactic ones.
mad_men <- read_csv("data-raw/mad-men/show-data.csv") %>%
  rename(
    num_lead = `#LEAD`,
    num_support = `#SUPPORT`,
    num_shows = `#Shows`,
    status = `Status?`,
    score_div_y = `Score/Y`
  )
colnames(mad_men) <- colnames(mad_men) %>%
  tolower() %>%
  str_replace_all(" ", "_")
# use_data() moved from devtools to usethis; call it from usethis like the
# other sections of this script.
usethis::use_data(mad_men, overwrite = TRUE)
# male_flight_attendants -----------------------------------------------------------
# Read the tab-separated male flight attendants table; the only cleaning is
# lower snake_case column names.
male_flight_attend <-
  read_tsv("data-raw/male-flight-attendants/male-flight-attendants.tsv")
colnames(male_flight_attend) <- colnames(male_flight_attend) %>%
  tolower() %>%
  str_replace_all(" ", "_")
# use_data() moved from devtools to usethis; call it from usethis like the
# other sections of this script.
usethis::use_data(male_flight_attend, overwrite = TRUE)
# march_madness_predictions --------------------------------------------------------
#-- MULTIPLE DATA FILES
# march_madness_predictions_2015 ---------------------------------------------------
#-- MULTIPLE DATA FILES
# marriage -------------------------------------------------------------------------
#-- Needs tidying and MULTIPLE DATA FILES
# mlb_allstar_teams ----------------------------------------------------------------
# Team-level table
mlb_as_team_talent <- read_csv("data-raw/mlb-allstar-teams/allstar_team_talent.csv")
colnames(mlb_as_team_talent) <- colnames(mlb_as_team_talent) %>%
  tolower() %>%
  str_replace_all(" ", "_")
# use_data() is provided by usethis; current devtools no longer exports it
usethis::use_data(mlb_as_team_talent, overwrite = TRUE)
# Player-level table
mlb_as_play_talent <- read_csv("data-raw/mlb-allstar-teams/allstar_player_talent.csv")
colnames(mlb_as_play_talent) <- colnames(mlb_as_play_talent) %>%
  tolower() %>%
  str_replace_all(" ", "_")
usethis::use_data(mlb_as_play_talent, overwrite = TRUE)
# most_common_name -----------------------------------------------------------------
#-- MULTIPLE DATA FILES
# murder_2016 ----------------------------------------------------------------------
# Preliminary 2016 counts
murder_2016_prelim <- read_csv("data-raw/murder_2016/murder_2016_prelim.csv") %>%
  # Column names starting with a digit are not syntactic; move the year last
  rename(
    murders_2015 = `2015_murders`,
    murders_2016 = `2016_murders`
  ) %>%
  # as_of is parsed as month/day/year
  mutate(as_of = mdy(as_of))
colnames(murder_2016_prelim) <- colnames(murder_2016_prelim) %>%
  tolower() %>%
  str_replace_all(" ", "_")
# use_data() is provided by usethis; current devtools no longer exports it
usethis::use_data(murder_2016_prelim, overwrite = TRUE)
# Final 2015 counts
murder_2015_final <- read_csv("data-raw/murder_2016/murder_2015_final.csv") %>%
  rename(
    murders_2014 = `2014_murders`,
    murders_2015 = `2015_murders`
  )
colnames(murder_2015_final) <- colnames(murder_2015_final) %>%
  tolower() %>%
  str_replace_all(" ", "_")
usethis::use_data(murder_2015_final, overwrite = TRUE)
# nba_draft_2015 -------------------------------------------------------------------
nba_draft_2015 <- read_csv("data-raw/nba-draft-2015/historical_projections.csv")
# Lower-case the column names and replace spaces with underscores
colnames(nba_draft_2015) <- colnames(nba_draft_2015) %>%
  tolower() %>%
  str_replace_all(" ", "_")
# use_data() is provided by usethis; current devtools no longer exports it
usethis::use_data(nba_draft_2015, overwrite = TRUE)
# nba_elo --------------------------------------------------------------------------
nba_elo <- read_csv("data-raw/nba-elo/nbaallelo.csv")
nba_elo <- rename(nba_elo, is_copy = `_iscopy`)
nba_elo <- mutate(
  nba_elo,
  # 0/1 indicators become logicals (NA stays NA)
  is_playoffs = is_playoffs == 1,
  is_copy = is_copy == 1,
  # Game dates are parsed as month/day/year
  date_game = mdy(date_game)
)
# Lower-case the column names and replace spaces with underscores
colnames(nba_elo) <- str_replace_all(tolower(colnames(nba_elo)), " ", "_")
# Left out of the package because of its size
# devtools::use_data(nba_elo, overwrite = TRUE)
# nba_tattoos ----------------------------------------------------------------------
nba_tattoos <- read_csv("data-raw/nba-tattoos/nba-tattoos-data.csv") %>%
  rename(
    player_name = `Player Name`,
    tattoos = `Tattoos yes/no`
  ) %>%
  # "yes" becomes TRUE, any other non-missing value FALSE
  mutate(tattoos = ifelse(tattoos == "yes", TRUE, FALSE))
colnames(nba_tattoos) <- colnames(nba_tattoos) %>%
  tolower() %>%
  str_replace_all(" ", "_")
# use_data() is provided by usethis; current devtools no longer exports it
usethis::use_data(nba_tattoos, overwrite = TRUE)
# nba_winprobs ---------------------------------------------------------------------
# Still needs tidying; no documentation has been written for it yet
nba_winprobs <- read_tsv("data-raw/nba-winprobs/nba.tsv")
# Lower-case the column names and replace spaces with underscores
colnames(nba_winprobs) <- str_replace_all(
  tolower(colnames(nba_winprobs)), " ", "_"
)
# Export is currently disabled:
# devtools::use_data(nba_winprobs, overwrite = TRUE)
# nfl_favorite_team ----------------------------------------------------------------
nfl_fav_team <-
  read_csv("data-raw/nfl-favorite-team/team-picking-categories.csv") %>%
  # Expand the three-letter category codes into descriptive names
  rename(
    big_market = BMK,
    uniform = UNI,
    coaching = CCH,
    stadium_exp = STX,
    small_market = SMK,
    afford = AFF,
    stlouis_prox = SLP,
    nyc_prox = NYP,
    fan_relations = FRL,
    bang_buck = BNG,
    tradition = TRD,
    bandwagon = BWG,
    future_wins = FUT,
    players = PLA,
    ownership = OWN,
    behavior = BEH
  )
# Lower-case the remaining column names and replace spaces with underscores
colnames(nfl_fav_team) <- colnames(nfl_fav_team) %>%
  tolower() %>%
  str_replace_all(" ", "_")
# use_data() is provided by usethis; current devtools no longer exports it
usethis::use_data(nfl_fav_team, overwrite = TRUE)
# nfl_suspensions ------------------------------------------------------------------
nfl_suspensions <- read_csv("data-raw/nfl-suspensions/nfl-suspensions-data.csv") %>%
  # Spell out the abbreviated header
  rename(description = `desc.`)
colnames(nfl_suspensions) <- colnames(nfl_suspensions) %>%
  tolower() %>%
  str_replace_all(" ", "_")
# use_data() is provided by usethis; current devtools no longer exports it
usethis::use_data(nfl_suspensions, overwrite = TRUE)
# nfl_ticket_prices ----------------------------------------------------------------
# 2014 average ticket price table
nfltix_div_avgprice <-
  read_csv("data-raw/nfl-ticket-prices/2014-average-ticket-price.csv") %>%
  rename(avg_tix_price = `Avg TP, $`)
colnames(nfltix_div_avgprice) <- colnames(nfltix_div_avgprice) %>%
  tolower() %>%
  str_replace_all(" ", "_")
# use_data() is provided by usethis; current devtools no longer exports it
usethis::use_data(nfltix_div_avgprice, overwrite = TRUE)
# Read only, not exported: needs cleaning (the file holds multiple subtables)
nfltix_jets_buy <- read_csv("data-raw/nfl-ticket-prices/jets-buyer.csv")
# National average table
nfltix_usa_avg <- read_csv("data-raw/nfl-ticket-prices/national-average.csv") %>%
  rename(
    avg_tix_price = `Avg TP, $`,
    team = Genre
  ) %>%
  # Drop the " Tickets" text from each team label
  mutate(team = str_replace_all(team, pattern = " Tickets", replacement = ""))
colnames(nfltix_usa_avg) <- colnames(nfltix_usa_avg) %>%
  tolower() %>%
  str_replace_all(" ", "_")
usethis::use_data(nfltix_usa_avg, overwrite = TRUE)
# nfl_wide_receivers ---------------------------------------------------------------
# The literal string "NULL" marks missing values in this file
nflwr_hist <- read_csv("data-raw/nfl-wide-receivers/advanced-historical.csv",
  na = "NULL"
)
# Clean the names of nflwr_hist itself (the previous code read them from the
# undefined object `nfl_wr_hist`, which stopped the script with an error)
colnames(nflwr_hist) <- colnames(nflwr_hist) %>%
  tolower() %>%
  str_replace_all(" ", "_")
# use_data() is provided by usethis; current devtools no longer exports it
usethis::use_data(nflwr_hist, overwrite = TRUE)
nflwr_aging_curve <- read_csv("data-raw/nfl-wide-receivers/try-per-game-aging-curve.csv")
colnames(nflwr_aging_curve) <- colnames(nflwr_aging_curve) %>%
  tolower() %>%
  str_replace_all(" ", "_")
usethis::use_data(nflwr_aging_curve, overwrite = TRUE)
# nutrition_studies ----------------------------------------------------------------
# Raw survey responses: read and renamed, but not exported
nutrition_survey <- read_csv("data-raw/nutrition-studies/raw_anonymized_data.csv")
colnames(nutrition_survey) <- colnames(nutrition_survey) %>%
  tolower() %>%
  str_replace_all(" ", "_")
# Needs to be reshaped so that the survey respondent is the observational unit
# p-value analysis table
nutrition_pvalues <- read_csv("data-raw/nutrition-studies/p_values_analysis.csv")
colnames(nutrition_pvalues) <- colnames(nutrition_pvalues) %>%
  tolower() %>%
  str_replace_all(" ", "_")
# use_data() is provided by usethis; current devtools no longer exports it
usethis::use_data(nutrition_pvalues, overwrite = TRUE)
# pew_religions --------------------------------------------------------------------
# A transition matrix of data is included here.
# police_deaths --------------------------------------------------------------------
police_deaths <- read_csv("data-raw/police-deaths/clean_data.csv") %>%
  # Keep person, cause_short and the columns from date through state
  select(person, cause_short, date:state) %>%
  rename(cause_of_death = cause_short)
colnames(police_deaths) <- colnames(police_deaths) %>%
  tolower() %>%
  str_replace_all(" ", "_")
# use_data() is provided by usethis; current devtools no longer exports it
usethis::use_data(police_deaths, overwrite = TRUE)
# police_killings ------------------------------------------------------------------
# "Unknown", "-" and "NA" are all read as missing values
police_killings <- read_csv("data-raw/police-killings/police_killings.csv",
  na = c("Unknown", "-", "NA")
)
colnames(police_killings) <- colnames(police_killings) %>%
  tolower() %>%
  str_replace_all(" ", "_")
# use_data() is provided by usethis; current devtools no longer exports it
usethis::use_data(police_killings, overwrite = TRUE)
# presidential-campaign-trail ------------------------------------------------------
# One file per candidate; tag each with the candidate and parse the dates
clin_trail <- read_csv("data-raw/presidential-campaign-trail/clinton.csv") %>%
  mutate(candidate = "Clinton", date = mdy(date))
trum_trail <- read_csv("data-raw/presidential-campaign-trail/trump.csv") %>%
  mutate(candidate = "Trump", date = mdy(date))
# Stack both candidates and move the candidate column to the front
pres_2016_trail <- bind_rows(clin_trail, trum_trail) %>%
  select(candidate, everything())
colnames(pres_2016_trail) <- colnames(pres_2016_trail) %>%
  tolower() %>%
  str_replace_all(" ", "_")
# use_data() is provided by usethis; current devtools no longer exports it
usethis::use_data(pres_2016_trail, overwrite = TRUE)
# Datasets cleaned and written by Jennifer Chunn
# police-locals ---------------------------------------------------------------
# Empty strings, "NA" and "**" are all read as missing values
police_locals <- read_csv(
  "data-raw/police-locals/police-locals.csv",
  na = c("", "NA", "**")
)
# Lower-case the column names and turn hyphens into underscores
colnames(police_locals) <- str_replace_all(
  tolower(colnames(police_locals)), "-", "_"
)
police_locals <- rename(police_locals, force_size = police_force_size)
usethis::use_data(police_locals, overwrite = TRUE)
# poll-of-pollsters ---------------------------------------------------------------
#-- MULTIPLE DATA FILES
# pollster-ratings ---------------------------------------------------------------
#-- MULTIPLE DATA FILES
# potential-candidates ---------------------------------------------------------------
# Two snapshots (2015_01_14 and 2015_01_30) share the same file layout, so the
# cleaning steps are written once and applied to each snapshot.

# Read an events file, lower-case its column names and parse `date`.
# The raw dates carry no year, so "2015" is appended before parsing as
# day-month-year.
read_cand_events <- function(path) {
  events <- read_csv(path)
  colnames(events) <- tolower(colnames(events))
  mutate(events, date = dmy(paste(date, "2015")))
}

# Read a statements file, lower-case and shorten its column names, parse `date`
# (same year handling as the events files) and label the statement scores.
read_cand_statements <- function(path) {
  statements <- read_csv(path)
  colnames(statements) <- tolower(colnames(statements))
  statements %>%
    rename(
      date = `statement date`,
      latest = `latest statement`,
      score = `statement score`
    ) %>%
    mutate(
      date = dmy(paste(date, "2015")),
      # factor() assigns the labels to the sorted distinct scores, so each file
      # must contain exactly four distinct score values
      score = factor(score, labels = c(
        "Not running", "Haven't ruled out running but leaning towards no",
        "Unsure", "Actively exploring or seriously considering"
      ))
    )
}

cand_events_20150114 <-
  read_cand_events("data-raw/potential-candidates/2015_01_14/events.csv")
usethis::use_data(cand_events_20150114, overwrite = TRUE)
cand_state_20150114 <-
  read_cand_statements("data-raw/potential-candidates/2015_01_14/statements.csv")
usethis::use_data(cand_state_20150114, overwrite = TRUE)
cand_events_20150130 <-
  read_cand_events("data-raw/potential-candidates/2015_01_30/events.csv")
usethis::use_data(cand_events_20150130, overwrite = TRUE)
cand_state_20150130 <-
  read_cand_statements("data-raw/potential-candidates/2015_01_30/statements.csv")
usethis::use_data(cand_state_20150130, overwrite = TRUE)
# presidential-commencement-speeches ---------------------------------------------------------------
pres_commencement <-
  read_csv("data-raw/presidential-commencement-speeches/commencement_speeches.csv") %>%
  # Parse the month/day/year strings into Date objects
  mutate(date = mdy(date)) %>%
  # Shorten the president columns
  rename(pres = president, pres_name = president_name)
usethis::use_data(pres_commencement, overwrite = TRUE)
# pulitzer ---------------------------------------------------------------
pulitzer <- read_csv("data-raw/pulitzer/pulitzer-circulation-data.csv")
# Normalise the headers: lower-case, drop the "pulitzer prize " prefix, remove
# commas, and use underscores in place of spaces and hyphens
names(pulitzer) <- names(pulitzer) %>%
  tolower() %>%
  str_replace_all("pulitzer prize ", "") %>%
  str_replace_all(" ", "_") %>%
  str_replace_all(",", "") %>%
  str_replace_all("-", "_")
# Shorten the long cleaned names
pulitzer <- rename(
  pulitzer,
  circ2004 = daily_circulation_2004,
  circ2013 = daily_circulation_2013,
  pctchg_circ = change_in_daily_circulation_2004_2013,
  num_finals1990_2003 = winners_and_finalists_1990_2003,
  num_finals2004_2014 = winners_and_finalists_2004_2014,
  num_finals1990_2014 = winners_and_finalists_1990_2014
)
# Strip the percent sign so the change can be stored as an integer
pulitzer <- mutate(
  pulitzer,
  pctchg_circ = as.integer(str_replace(pctchg_circ, "%", ""))
)
usethis::use_data(pulitzer, overwrite = TRUE)
# region-survey ---------------------------------------------------------------
#-- MULTIPLE DATA FILES
# religion-survey ---------------------------------------------------------------
# religion_survey <- read_csv("data-raw/religion-survey/religion-survey-results.csv")
# need to create all variable names
# usethis::use_data(religion_survey, overwrite = TRUE)
# san-andreas ---------------------------------------------------------------
san_andreas <- read_csv("data-raw/san-andreas/earthquake_data.csv")
# Replace the headers wholesale with short names (positional: one per column)
colnames(san_andreas) <- c(
  "worry_general", "worry_bigone", "will_occur", "experience", "prepared",
  "fam_san_andreas", "fam_yellowstone", "age", "female", "hhold_income",
  "region"
)
san_andreas <- san_andreas %>%
  # Both worry questions share one ordered scale
  mutate_at(
    vars(worry_general, worry_bigone),
    factor,
    levels = c(
      "Not at all worried", "Not so worried", "Somewhat worried",
      "Very worried", "Extremely worried"
    ),
    ordered = TRUE
  ) %>%
  # Both familiarity questions share one ordered scale
  mutate_at(
    vars(fam_san_andreas, fam_yellowstone),
    factor,
    levels = c(
      "Not at all familiar", "Not so familiar", "Somewhat familiar",
      "Very familiar", "Extremely familiar"
    ),
    ordered = TRUE
  ) %>%
  mutate(
    # Yes/No and Female/Male answers become logicals (NA stays NA)
    will_occur = will_occur == "Yes",
    prepared = prepared == "Yes",
    female = female == "Female",
    experience = factor(
      experience,
      levels = c("No", "Yes, one or more minor ones", "Yes, one or more major ones"),
      ordered = TRUE
    ),
    # The raw top age bracket is "60"; it is relabelled "60+"
    age = factor(
      age,
      levels = c("18 - 29", "30 - 44", "45 - 59", "60"),
      labels = c("18 - 29", "30 - 44", "45 - 59", "60+"),
      ordered = TRUE
    ),
    hhold_income = factor(
      hhold_income,
      levels = c(
        "$0 to $9,999", "$10,000 to $24,999", "$25,000 to $49,999",
        "$50,000 to $74,999", "$75,000 to $99,999", "$100,000 to $124,999",
        "$125,000 to $149,999", "$150,000 to $174,999", "$175,000 to $199,999",
        "$200,000 and up", "Prefer not to answer"
      ),
      ordered = TRUE
    )
  )
usethis::use_data(san_andreas, overwrite = TRUE)
# sleeping-alone-data ---------------------------------------------------------------
# sleeping_alone_data <- read_csv("data-raw/sleeping-alone-data/sleeping-alone-data.csv")
# much data editing needed
# usethis::use_data(sleeping_alone_data, overwrite = TRUE)
# star-wars-survey ---------------------------------------------------------------
# star_wars_survey <- read_csv("data-raw/star-wars-survey/StarWars.csv")
# much data editing needed
# usethis::use_data(star_wars_survey, overwrite = TRUE)
# steak-survey ---------------------------------------------------------------
steak_survey <- read_csv("data-raw/steak-survey/steak-risk-survey.csv")
# Replace the headers wholesale with short names (positional: one per column)
colnames(steak_survey) <- c(
  "respondent_id",
  "lottery_a",
  "smoke",
  "alcohol",
  "gamble",
  "skydiving",
  "speed",
  "cheated",
  "steak",
  "steak_prep",
  "female",
  "age",
  "hhold_income",
  "educ",
  "region"
)
# Remove the row that only contains "Response" for each variable (it has no
# respondent id)
steak_survey <- steak_survey %>% filter(!is.na(respondent_id))
steak_survey <- steak_survey %>%
  mutate(
    # Two-option answers become logicals
    lottery_a = ifelse(lottery_a == "Lottery A", TRUE, FALSE),
    smoke = ifelse(smoke == "Yes", TRUE, FALSE),
    alcohol = ifelse(alcohol == "Yes", TRUE, FALSE),
    gamble = ifelse(gamble == "Yes", TRUE, FALSE),
    skydiving = ifelse(skydiving == "Yes", TRUE, FALSE),
    speed = ifelse(speed == "Yes", TRUE, FALSE),
    cheated = ifelse(cheated == "Yes", TRUE, FALSE),
    steak = ifelse(steak == "Yes", TRUE, FALSE),
    steak_prep = factor(steak_prep,
      levels = c("Rare", "Medium rare", "Medium", "Medium Well", "Well"),
      ordered = TRUE
    ),
    female = ifelse(female == "Female", TRUE, FALSE),
    age = factor(age, levels = c("18-29", "30-44", "45-60", "> 60"), ordered = TRUE),
    # `ordered = TRUE` must be an argument of factor(); it used to sit inside
    # c(), which added a spurious "TRUE" level and left the factor unordered
    hhold_income = factor(hhold_income,
      levels = c(
        "$0 - $24,999", "$25,000 - $49,999", "$50,000 - $99,999",
        "$100,000 - $149,999", "$150,000+"
      ),
      ordered = TRUE
    ),
    educ = factor(educ, levels = c(
      "Less than high school degree", "High school degree",
      "Some college or Associate degree", "Bachelor degree",
      "Graduate degree"
    ), ordered = TRUE)
  )
usethis::use_data(steak_survey, overwrite = TRUE)
# study-drugs ---------------------------------------------------------------
# DATA IS CONFIDENTIAL
# tarantino ---------------------------------------------------------------
tarantino <- read_csv("data-raw/tarantino/tarantino.csv") %>%
  # Rows of type "word" are flagged as profanity; other types get FALSE
  mutate(profane = type == "word") %>%
  # Keep four columns only (this drops `type`)
  select(movie, profane, word, minutes_in)
usethis::use_data(tarantino, overwrite = TRUE)
# tennis-time ---------------------------------------------------------------
tennis_events_time <- read_csv("data-raw/tennis-time/events_time.csv")
# Split `years` on "-" once: an entry yields one piece (a single value) or
# several (presumably a "start-end" range)
year_parts <- str_split(tennis_events_time$years, "-")
tennis_events_time <- tennis_events_time %>%
  mutate(
    surface = factor(surface),
    # First piece is the start year
    year_start = as.integer(
      vapply(year_parts, function(x) x[1], character(1))
    ),
    # Second piece is the end year; a single-piece entry ends where it starts
    year_end = as.integer(
      vapply(
        year_parts,
        function(x) if (length(x) == 1) x[1] else x[2],
        character(1)
      )
    )
  ) %>%
  rename(sec_added = seconds_added_per_point) %>%
  select(-years)
usethis::use_data(tennis_events_time, overwrite = TRUE)
tennis_players_time <- read_csv("data-raw/tennis-time/players_time.csv")
tennis_players_time <- tennis_players_time %>% rename(sec_added = seconds_added_per_point)
usethis::use_data(tennis_players_time, overwrite = TRUE)
tennis_serve_time <- read_csv("data-raw/tennis-time/serve_times.csv")
tennis_serve_time <- tennis_serve_time %>%
  # `day` is parsed as day-month-year into `date`, then dropped
  mutate(date = dmy(day)) %>%
  rename(sec_between = seconds_before_next_point) %>%
  select(-day)
usethis::use_data(tennis_serve_time, overwrite = TRUE)
# terrorism ---------------------------------------------------------------
# MULTIPLE FILES
# thanksgiving-2015 ---------------------------------------------------------------
# thanksgiving_2015 <- read_csv("data-raw/thanksgiving-2015/thanksgiving-2015-poll-data.csv")
#
# need to create all variable names
# usethis::use_data(thanksgiving_2015, overwrite = TRUE)
# trump-news ---------------------------------------------------------------
trump_news <- read_csv("data-raw/trump-news/trump_news_data.csv") %>%
  # Dates are parsed as month/day/year
  mutate(date = mdy(date)) %>%
  rename(major_cat = major_category)
usethis::use_data(trump_news, overwrite = TRUE)
# trump-twitter ---------------------------------------------------------------
trump_twitter <- mutate(
  read_csv("data-raw/trump-twitter/realDonaldTrump_poll_tweets.csv"),
  # Timestamps are parsed as month/day/year hour:minute:second
  created_at = mdy_hms(created_at)
)
usethis::use_data(trump_twitter, overwrite = TRUE)
# unisex-names ---------------------------------------------------------------
# Read the table and discard the row-number column (X1)
unisex_names <- select(
  read_csv("data-raw/unisex-names/unisex_names_table.csv"),
  -X1
)
usethis::use_data(unisex_names, overwrite = TRUE)
# us-weather-history ---------------------------------------------------------------
#-- MULTIPLE FILES
# weather-check ---------------------------------------------------------------
# Empty strings, "NA" and "-" are all read as missing values
weather_check <- read_csv(
  "data-raw/weather-check/weather-check.csv",
  na = c("", "NA", "-")
)
# Replace the headers wholesale with short names (positional: one per column)
colnames(weather_check) <- c(
  "respondent_id", "ck_weather", "weather_source", "weather_source_site",
  "ck_weather_watch", "age", "female", "hhold_income", "region"
)
weather_check <- mutate(
  weather_check,
  # Yes/No and Female/Male answers become logicals (NA stays NA)
  ck_weather = ck_weather == "Yes",
  female = female == "Female",
  ck_weather_watch = factor(
    ck_weather_watch,
    levels = c(
      "Very unlikely", "Somewhat unlikely",
      "Somewhat likely", "Very likely"
    ),
    ordered = TRUE
  ),
  # Age is a plain (unordered) factor here
  age = factor(age, levels = c("18 - 29", "30 - 44", "45 - 59", "60+")),
  hhold_income = factor(
    hhold_income,
    levels = c(
      "$0 to $9,999", "$10,000 to $24,999", "$25,000 to $49,999",
      "$50,000 to $74,999", "$75,000 to $99,999", "$100,000 to $124,999",
      "$125,000 to $149,999", "$150,000 to $174,999", "$175,000 to $199,999",
      "$200,000 and up", "Prefer not to answer"
    ),
    ordered = TRUE
  )
)
usethis::use_data(weather_check, overwrite = TRUE)
# womens-world-cup-predictions ---------------------------------------------------------------
#--MULTIPLE FILES
# world-cup-predictions ---------------------------------------------------------------
#--MULTIPLE FILES
# obama-commutations ---------------------------------------------------------------
#-- not possible
# riddler-castles ---------------------------------------------------------------
# Empty strings, "NA" and "-" are all read as missing values
riddler_castles <- read_csv(
  "data-raw/riddler-castles/castle-solutions.csv",
  na = c("", "NA", "-")
) %>%
  # Give the free-text question a short name
  rename(reason = `Why did you choose your troop deployment?`)
# Lower-case the column names and delete spaces (no underscore is inserted)
colnames(riddler_castles) <- str_replace_all(
  tolower(colnames(riddler_castles)), " ", ""
)
usethis::use_data(riddler_castles, overwrite = TRUE)
# riddler-castles2 ---------------------------------------------------------------
# Second solutions file, cleaned the same way as the first
riddler_castles2 <- read_csv(
  "data-raw/riddler-castles/castle-solutions-2.csv",
  na = c("", "NA", "-")
) %>%
  rename(reason = `Why did you choose your troop deployment?`)
# Lower-case the column names and delete spaces (no underscore is inserted)
colnames(riddler_castles2) <- str_replace_all(
  tolower(colnames(riddler_castles2)), " ", ""
)
usethis::use_data(riddler_castles2, overwrite = TRUE)
# antiquities-act ---------------------------------------------------------------
# Empty strings and "NA" are read as missing values.
antiquities_act <- read_csv("data-raw/antiquities-act/actions_under_antiquities_act.csv", na = c("", "NA"))
# The first column is renamed by position.
names(antiquities_act)[1] <- "current_name"
antiquities_act <- antiquities_act %>%
# fix date for Acadia NP
# The steps below run in order and each one uses the result of the previous.
mutate(
# A 4-character date is treated as a bare year and copied into `year`
year = ifelse(str_length(date) == 4, date, year),
# ...and such a date is rebuilt as "1/1/yy" from its last two digits
date = ifelse(str_length(date) == 4, paste("1", "1", str_sub(date, 3, 4), sep = "/"), date),
# Parse as month/day/year
date = mdy(date),
# ensure century is correct
# (the date is re-assembled from its month and day plus the `year` column)
date = mdy(paste(month(date), day(date), year, sep = "/")),
# remove text from acres_affected variable
# (everything except digits and periods is stripped before the conversion)
acres_affected = as.numeric(str_replace_all(acres_affected, "[^0-9\\.]", ""))
)
usethis::use_data(antiquities_act, overwrite = TRUE)
# tenth-circuit ---------------------------------------------------------------
tenth_circuit <- read_csv(
  "data-raw/tenth-circuit/tenth-circuit.csv",
  na = c("", "NA")
)
# Lower-case the headers, shorten " citation" to "_cit", then delete any
# remaining spaces
colnames(tenth_circuit) <- str_replace_all(
  str_replace_all(tolower(colnames(tenth_circuit)), " citation", "_cit"),
  " ", ""
)
# Parse the dates (month/day/year), then rename the three vote columns
tenth_circuit <- mutate(tenth_circuit, date = mdy(date))
tenth_circuit <- rename(
  tenth_circuit,
  vote1_liberal = vote1,
  vote2_liberal = vote2,
  vote3_liberal = vote3
)
usethis::use_data(tenth_circuit, overwrite = TRUE)
# Datasets cleaned and written by Meredith Manley
# ahca_polls ----------------------------------------------------------------------
ahca_polls <- clean_names(read_csv("data-raw/ahca-polls/ahca_polls.csv"))
ahca_polls <- mutate(
  ahca_polls,
  # Poll start/end dates use a two-digit year (month/day/year)
  start = as.Date(start, format = "%m/%d/%y"),
  end = as.Date(end, format = "%m/%d/%y"),
  pollster = as.factor(pollster)
)
usethis::use_data(ahca_polls, overwrite = TRUE)
# bachelorette ---------------------------------------------------------------------
# Read the file, clean the column names, recode the elimination_* and dates_*
# columns, and convert `season` to integer.
bachelorette <- read_csv("data-raw/bachelorette/bachelorette.csv") %>%
clean_names() %>%
# Every column whose name starts with "elimination" is converted to a factor,
# then values equal to the literal string "<NA>" are replaced with NA.
# NOTE(review): ifelse() does not keep factor attributes, so these columns
# likely end up holding the integer level codes rather than labelled factors;
# confirm this is intended.
mutate_at(vars(starts_with("elimination")), as.factor) %>%
mutate_at(vars(starts_with("elimination")), funs(ifelse(. == "<NA>", NA, .))) %>%
# Same two steps (and the same caveat) for columns starting with "dates"
mutate_at(vars(starts_with("dates")), as.factor) %>%
mutate_at(vars(starts_with("dates")), funs(ifelse(. == "<NA>", NA, .))) %>%
# Order matters here: as.integer() turns non-numeric season values (such as the
# "SEASON" text targeted below, presumably repeated header rows) into NA, and
# filter() then drops those rows because comparing NA yields NA.
mutate(season = as.integer(season)) %>%
filter(season != "SEASON")
usethis::use_data(bachelorette, overwrite = TRUE)
# candy-power-rankings -------------------------------------------------------------
candy_rankings <- read_csv("data-raw/candy-power-ranking/candy-data.csv") %>%
  clean_names() %>%
  # The attribute indicator columns become logicals
  mutate(
    chocolate = as.logical(chocolate),
    fruity = as.logical(fruity),
    caramel = as.logical(caramel),
    peanutyalmondy = as.logical(peanutyalmondy),
    nougat = as.logical(nougat),
    crispedricewafer = as.logical(crispedricewafer),
    hard = as.logical(hard),
    bar = as.logical(bar),
    pluribus = as.logical(pluribus)
  ) %>%
  # Replace the garbled apostrophe sequence in candy names with "'".
  # This step used to be cut off from the pipeline by a stray `mutate(win)`
  # and a missing pipe, so it ran without any data and raised an error.
  # NOTE(review): the pattern below itself looks like a mis-encoded character;
  # confirm it matches what read_csv() actually returns for this file.
  mutate(competitorname = gsub("Ă•", "'", competitorname))
usethis::use_data(candy_rankings, overwrite = TRUE)
# chess-transfers ----------------------------------------------------------------
chess_transfers <- clean_names(read_csv("data-raw/chess-transfers/transfers.csv"))
chess_transfers <- mutate(
  chess_transfers,
  # Transfer dates use a two-digit year (month/day/year)
  transfer_date = as.Date(transfer_date, format = "%m/%d/%y"),
  # Ids are stored as character rather than as numbers
  id = as.character(id)
)
usethis::use_data(chess_transfers, overwrite = TRUE)
# congress-generic-ballot --------------------------------------------------------
# Both tables are downloaded from the FiveThirtyEight projects site each time
# this script runs.
# generic_polllist
generic_polllist <-
  read_csv("https://projects.fivethirtyeight.com/generic-ballot-data/generic_polllist.csv") %>%
  clean_names() %>%
  # The four date columns share the month/day/four-digit-year format
  mutate_at(
    vars(modeldate, startdate, enddate, createddate),
    as.Date,
    format = "%m/%d/%Y"
  ) %>%
  mutate(
    # Timestamps hold a clock time followed by a day-month-year date
    timestamp = parse_date_time(timestamp, "HMS dmY"),
    subgroup = as.factor(subgroup),
    pollster = as.factor(pollster),
    # Ordered from lowest ("C-") to highest ("A+") grade
    grade = factor(
      grade,
      levels = rev(c("A+", "A", "A-", "B+", "B", "B-", "C+", "C", "C-")),
      ordered = TRUE
    ),
    population = as.factor(population),
    # Ids are stored as character rather than as numbers
    poll_id = as.character(poll_id),
    question_id = as.character(question_id),
    # The literal string "<NA>" is turned into a real missing value
    multiversions = ifelse(multiversions == "<NA>", NA, multiversions)
  )
usethis::use_data(generic_polllist, overwrite = TRUE)
# generic_topline
generic_topline <-
  read_csv("https://projects.fivethirtyeight.com/generic-ballot-data/generic_topline.csv") %>%
  clean_names() %>%
  mutate(
    modeldate = as.Date(modeldate, format = "%m/%d/%Y"),
    timestamp = parse_date_time(timestamp, "HMS dmY"),
    subgroup = as.factor(subgroup)
  )
usethis::use_data(generic_topline, overwrite = TRUE)
# soccer-spi ------------------------------------------------------------------------
# spi_global_rankings
# Downloaded from the FiveThirtyEight projects site and stored without changes
spi_global_rankings <- read_csv(
  file = "https://projects.fivethirtyeight.com/soccer-api/club/spi_global_rankings.csv"
)
usethis::use_data(spi_global_rankings, overwrite = TRUE)
# Datasets cleaned and written by Maggie Shea
# nfl-fandom---------------------------------------------------------------------
# Google Trends table; the first line of the file is skipped
nfl_fandom_google <- read_csv("data-raw/nfl-fandom/NFL_fandom_data-google_trends.csv", skip = 1) %>%
  clean_names() %>%
  rename(
    trump_2016_vote = "trump_2016_votepercent"
  ) %>%
  # Strip the percent signs and store every share as a number
  mutate(
    nfl = as.numeric(str_replace_all(nfl, "%", "")),
    nba = as.numeric(str_replace_all(nba, "%", "")),
    mlb = as.numeric(str_replace_all(mlb, "%", "")),
    nhl = as.numeric(str_replace_all(nhl, "%", "")),
    nascar = as.numeric(str_replace_all(nascar, "%", "")),
    cbb = as.numeric(str_replace_all(cbb, "%", "")),
    cfb = as.numeric(str_replace_all(cfb, "%", "")),
    trump_2016_vote = as.numeric(str_replace_all(trump_2016_vote, "%", ""))
  )
usethis::use_data(nfl_fandom_google, overwrite = TRUE)
# SurveyMonkey table; the first line of the file is skipped
nfl_fandom_surveymonkey <- read_csv(
  "data-raw/nfl-fandom/NFL_fandom_data-surveymonkey.csv",
  skip = 1
) %>%
  clean_names() %>%
  rename(
    total_respondents = "tot_respondents",
    gop_percent = "goppercent",
    dem_percent = "dempercent",
    ind_percent = "indpercent",
    white_percent = "whitepercent",
    nonwhite_percent = "nonwhitepercent",
    asian_dem = "asian",
    black_dem = "black",
    hispanic_dem = "hispanic",
    other_dem = "other",
    white_dem = "white",
    total_dem = "total"
  ) %>%
  mutate(team = as.factor(team)) %>%
  # Strip the percent signs AND convert to numeric, matching the Google Trends
  # table above (these columns used to be left as character strings)
  mutate_at(
    vars(gop_percent, dem_percent, ind_percent, white_percent, nonwhite_percent),
    function(x) as.numeric(str_replace_all(x, "%", ""))
  )
# Repeated headers carry numeric suffixes after clean_names(); map them to the
# party they belong to.
# NOTE(review): this relies on duplicates being suffixed "_1" and "_2";
# confirm against the installed janitor version.
colnames(nfl_fandom_surveymonkey) <- colnames(nfl_fandom_surveymonkey) %>%
  str_replace_all(pattern = "_1", replacement = "_ind") %>%
  str_replace_all(pattern = "_2", replacement = "_gop")
usethis::use_data(nfl_fandom_surveymonkey, overwrite = TRUE)
# puerto-rico-media---------------------------------------------------------------------
# Google trend searches for hurricanes Harvey, Irma, Jose, and Maria
google_trends <- clean_names(read_csv("data-raw/puerto-rico-media/google_trends.csv"))
google_trends <- rename(
  google_trends,
  date = day,
  hurricane_harvey_us = hurricane_harvey_united_states,
  hurricane_irma_us = hurricane_irma_united_states,
  hurricane_maria_us = hurricane_maria_united_states,
  hurricane_jose_us = hurricane_jose_united_states
)
usethis::use_data(google_trends, overwrite = TRUE)

# Number of sentences per day that mention Hurricanes Harvey, Irma, Jose, and
# Maria in online news
mediacloud_hurricanes <-
  clean_names(read_csv("data-raw/puerto-rico-media/mediacloud_hurricanes.csv"))
# Dates use a two-digit year (month/day/year)
mediacloud_hurricanes <- mutate(
  mediacloud_hurricanes,
  date = as.Date(date, format = "%m / %d / %y")
)
usethis::use_data(mediacloud_hurricanes, overwrite = TRUE)

# Number of sentences per day that mention Puerto Rico, Texas, and Florida in
# online news
mediacloud_states <-
  clean_names(read_csv("data-raw/puerto-rico-media/mediacloud_states.csv"))
mediacloud_states <- mutate(
  mediacloud_states,
  date = as.Date(date, format = "%m / %d / %y")
)
usethis::use_data(mediacloud_states, overwrite = TRUE)

# Sources included in Media Cloud's "U.S. Top Online News" collection
# (stored exactly as read)
mediacloud_online_news <- read_csv(
  file = "data-raw/puerto-rico-media/mediacloud_top_online_news.csv"
)
usethis::use_data(mediacloud_online_news, overwrite = TRUE)

# Number of headlines that mention Puerto Rico, Texas, and Florida, as well as
# headlines that mention each location and 'President' or 'Trump'
mediacloud_trump <-
  clean_names(read_csv("data-raw/puerto-rico-media/mediacloud_trump.csv"))
mediacloud_trump <- rename(
  mediacloud_trump,
  puerto_rico = title_puerto_rico,
  puerto_rico_and_trump = title_puerto_rico_and_title_trump_or_title_president,
  florida = title_florida,
  florida_and_trump = title_florida_and_title_trump_or_title_president,
  texas = title_texas,
  texas_and_trump = title_texas_and_title_trump_or_title_president
)
usethis::use_data(mediacloud_trump, overwrite = TRUE)

# Percent of sentences per day in TV news that mention Hurricanes Harvey, Irma,
# Jose, and Maria
tv_hurricanes <- clean_names(read_csv("data-raw/puerto-rico-media/tv_hurricanes.csv"))
tv_hurricanes <- mutate(
  tv_hurricanes,
  date = as.Date(date, format = "%m / %d / %y")
)
usethis::use_data(tv_hurricanes, overwrite = TRUE)

# The same measure broken down by network
tv_hurricanes_by_network <-
  clean_names(read_csv("data-raw/puerto-rico-media/tv_hurricanes_by_network.csv"))
tv_hurricanes_by_network <- mutate(
  tv_hurricanes_by_network,
  date = as.Date(date, format = "%m / %d / %y"),
  query = as.factor(query)
)
usethis::use_data(tv_hurricanes_by_network, overwrite = TRUE)

# Percent of sentences per day in TV news that mention Puerto Rico, Texas, and
# Florida
tv_states <- clean_names(read_csv("data-raw/puerto-rico-media/tv_states.csv"))
tv_states <- mutate(tv_states, date = as.Date(date, format = "%m / %d / %y"))
usethis::use_data(tv_states, overwrite = TRUE)
# riddler-pick-lowest---------------------------------------------------------------------
# Only the column names are cleaned; the values are stored as read
riddler_pick_lowest <- clean_names(
  read_csv("data-raw/riddler-pick-lowest/low_numbers.csv")
)
usethis::use_data(riddler_pick_lowest, overwrite = TRUE)
# sandy-311-calls---------------------------------------------------------------------
sandy_311 <- clean_names(read_csv("data-raw/sandy-311-calls/sandy-311-calls-by-day.csv"))
# Repair the names that clean_names() split or ran together
sandy_311 <- rename(
  sandy_311,
  nyc_311 = nyc_3_1_1,
  nyc_service = nycservice,
  nys_emergency_mg = nysemergencymg
)
# Dates use a two-digit year (month/day/year)
sandy_311 <- mutate(sandy_311, date = as.Date(date, format = "%m / %d / %y"))
usethis::use_data(sandy_311, overwrite = TRUE)
# trump-approval-ratings---------------------------------------------------------------------
# Both tables are downloaded from the FiveThirtyEight projects site each time
# this script runs.
trump_approval_poll <- read_csv("https://projects.fivethirtyeight.com/trump-approval-data/approval_polllist.csv") %>%
  # These three columns are not kept; drop them before any conversion work
  select(-c(president, modeldate, influence)) %>%
  mutate(
    # TRUE only for the "*" marker; missing and any other value give FALSE.
    # (The previous two-step recode turned every non-missing value into TRUE.)
    multiversions = multiversions %in% "*",
    # TRUE whenever any value is present
    tracking = !is.na(tracking),
    subgroup = as.factor(subgroup),
    startdate = as.Date(startdate, format = "%m / %d / %Y"),
    enddate = as.Date(enddate, format = "%m / %d / %Y"),
    pollster = as.factor(pollster),
    # Ordered from lowest ("C-") to highest ("A+") grade
    grade = factor(grade, levels = rev(c("A+", "A", "A-", "B+", "B", "B-", "C+", "C", "C-")), ordered = TRUE),
    population = as.factor(population),
    url = as.factor(url),
    createddate = as.Date(createddate, format = "%m / %d / %Y"),
    timestamp = as.POSIXct(timestamp, tz = "GMT", format = "%H:%M:%S %d %b %Y")
  ) %>%
  rename(
    start_date = startdate,
    end_date = enddate,
    sample_size = samplesize,
    created_date = createddate
  )
usethis::use_data(trump_approval_poll, overwrite = TRUE)
trump_approval_trend <- read_csv("https://projects.fivethirtyeight.com/trump-approval-data/approval_topline.csv") %>%
  clean_names() %>%
  # `president` is not kept, so it is dropped rather than converted first
  select(-c(president)) %>%
  mutate(
    subgroup = as.factor(subgroup),
    modeldate = as.Date(modeldate, format = "%m / %d / %Y"),
    timestamp = as.POSIXct(timestamp, tz = "GMT", format = "%H:%M:%S %d %b %Y")
  ) %>%
  rename(
    approve_high = approve_hi,
    approve_low = approve_lo,
    disapprove_high = disapprove_hi,
    disapprove_low = disapprove_lo
  )
usethis::use_data(trump_approval_trend, overwrite = TRUE)
# trump-world-trust ------------------------------------------------------------
# Read one TRUMPWORLD-issue-<n>.csv file, clean its headers, store `country`
# as a factor and tag every row with the issue number `issue_number`.
read_trumpworld_issue <- function(issue_number) {
  issue_path <- paste0(
    "data-raw/trump-world-trust/TRUMPWORLD-issue-", issue_number, ".csv"
  )
  issue_data <- clean_names(read_csv(issue_path))
  issue_data$country <- as.factor(issue_data$country)
  # New column, appended last; a length-one value is recycled to every row.
  issue_data$issue <- issue_number
  issue_data
}
trumpworld_issue_1 <- read_trumpworld_issue(1)
trumpworld_issue_2 <- read_trumpworld_issue(2)
trumpworld_issue_3 <- read_trumpworld_issue(3)
trumpworld_issue_4 <- read_trumpworld_issue(4)
trumpworld_issue_5 <- read_trumpworld_issue(5)
# Stack the five per-issue tables into one data set.
trumpworld_issues <- bind_rows(
  trumpworld_issue_1,
  trumpworld_issue_2,
  trumpworld_issue_3,
  trumpworld_issue_4,
  trumpworld_issue_5
)
usethis::use_data(trumpworld_issues, overwrite = TRUE)
# Two poll files are stacked into one table; `question` records which survey
# question each row comes from. The labels were previously swapped: the
# "-pres" file was tagged "Favorable view of US" and the "-us" file
# "Trust President".
# NOTE(review): mapping follows the file names (pres = president,
# us = United States); confirm against the data source's README.
trumpworld_pres <- read_csv("data-raw/trump-world-trust/TRUMPWORLD-pres.csv") %>%
  clean_names() %>%
  mutate(question = "Trust President")
trumpworld_us <- read_csv("data-raw/trump-world-trust/TRUMPWORLD-us.csv") %>%
  clean_names() %>%
  mutate(question = "Favorable view of US")
trumpworld_polls <- bind_rows(trumpworld_pres, trumpworld_us)
usethis::use_data(trumpworld_polls, overwrite = TRUE)
# undefeated-boxers ------------------------------------------------------------
# Stored exactly as read; no cleaning step is applied.
undefeated <- "data-raw/undefeated-boxers/undefeated.csv" %>%
  read_csv()
usethis::use_data(undefeated, overwrite = TRUE)
# Datasets cleaned and written by Starry Yujia Zhou
# august-senate-polls ----------------------------------------------------------
# Clean the headers, then coerce the cycle, categorical and date columns.
august_senate_polls <- "data-raw/august-senate-polls/august_senate_polls.csv" %>%
  read_csv() %>%
  clean_names()
august_senate_polls <- august_senate_polls %>%
  mutate(
    end_date = as.Date(end_date),
    start_date = as.Date(start_date),
    senate_class = as.factor(senate_class),
    state = as.factor(state),
    cycle = as.numeric(cycle)
  )
usethis::use_data(august_senate_polls, overwrite = TRUE)
# endorsements -----------------------------------------------------------------
# Downloaded from the FiveThirtyEight URL; the five descriptive columns below
# are stored as factors.
endorsements_2020 <- "https://projects.fivethirtyeight.com/endorsements-2020-data/endorsements-2020.csv" %>%
  read_csv() %>%
  clean_names()
endorsements_2020 <- endorsements_2020 %>%
  mutate(
    category = as.factor(category),
    endorser_party = as.factor(endorser_party),
    state = as.factor(state),
    city = as.factor(city),
    position = as.factor(position)
  )
usethis::use_data(endorsements_2020, overwrite = TRUE)
# forecast-review --------------------------------------------------------------
# Shorten the two win-probability column names, then convert the categorical,
# date and outcome columns.
forecast_results_2018 <- "data-raw/forecast-review/forecast_results_2018.csv" %>%
  read_csv() %>%
  clean_names() %>%
  rename(
    dem_win_prob = democrat_win_probability,
    rep_win_prob = republican_win_probability
  ) %>%
  mutate(
    branch = as.factor(branch),
    forecastdate = as.Date(forecastdate, format = "%m/%d/%y"),
    version = as.factor(version),
    # Values not listed in `levels` become NA.
    # NOTE(review): the list starts with "Solid D" but ends with "Safe R" -
    # confirm both spellings match the raw data.
    category = factor(
      category,
      levels = c("Solid D", "Likely D", "Lean D", "Tossup (Tilt D)", "Tossup (Tilt R)", "Lean R", "Likely R", "Safe R")
    ),
    # The outcome indicators are stored as logical.
    democrat_won = as.logical(democrat_won),
    republican_won = as.logical(republican_won),
    uncalled = as.logical(uncalled)
  )
usethis::use_data(forecast_results_2018, overwrite = TRUE)
# governors-forecast-2018 ------------------------------------------------------
# National forecast: `state` is dropped; party and model become factors.
governor_national_forecast <- "https://projects.fivethirtyeight.com/congress-model-2018/governor_national_forecast.csv" %>%
  read_csv() %>%
  clean_names() %>%
  select(-state) %>%
  mutate(
    model = as.factor(model),
    party = as.factor(party)
  )
# State-level forecast: `district` and `special` are dropped; the descriptive
# columns become factors and `incumbent` becomes logical.
governor_state_forecast <- "https://projects.fivethirtyeight.com/congress-model-2018/governor_state_forecast.csv" %>%
  read_csv() %>%
  clean_names() %>%
  select(-district, -special) %>%
  mutate(
    model = as.factor(model),
    incumbent = as.logical(incumbent),
    party = as.factor(party),
    candidate = as.factor(candidate),
    state = as.factor(state)
  )
usethis::use_data(governor_national_forecast, overwrite = TRUE)
usethis::use_data(governor_state_forecast, overwrite = TRUE)
# house-forecast-2018 ----------------------------------------------------------
# National forecast: `state` is dropped; party and model become factors.
house_national_forecast <- read_csv("https://projects.fivethirtyeight.com/congress-model-2018/house_national_forecast.csv") %>%
  clean_names() %>%
  select(-state) %>%
  mutate(
    model = as.factor(model),
    party = as.factor(party)
  )
# NOTE(review): `house_district_forecast` is saved here but is not created
# anywhere in this section - confirm it is defined earlier in the script,
# otherwise this call fails before the national forecast is written.
usethis::use_data(house_district_forecast, overwrite = TRUE)
usethis::use_data(house_national_forecast, overwrite = TRUE)
# mueller-polls ----------------------------------------------------------------
# Parse the start/end dates (month/day/two-digit year) and store the
# descriptive columns as factors.
mueller_approval_polls <- "data-raw/mueller-polls/mueller-approval-polls.csv" %>%
  read_csv() %>%
  clean_names()
mueller_approval_polls <- mueller_approval_polls %>%
  mutate(
    text = as.factor(text),
    population = as.factor(population),
    pollster = as.factor(pollster),
    end = as.Date(end, format = "%m/%d/%y"),
    start = as.Date(start, format = "%m/%d/%y")
  )
usethis::use_data(mueller_approval_polls, overwrite = TRUE)
# ncaa-womens-basketball-tournament --------------------------------------------
# Tournament history: rename the home-game column, then convert the
# categorical, numeric and indicator columns.
ncaa_w_bball_tourney <- read_csv("data-raw/ncaa-womens-basketball-tournament/ncaa-womens-basketball-tournament-history.csv") %>%
  clean_names() %>%
  rename(first_home_game = x1st_game_at_home) %>%
  mutate(
    school = as.factor(school),
    seed = as.numeric(seed),
    conference = as.factor(conference),
    conf_w = as.numeric(conf_w),
    conf_l = as.numeric(conf_l),
    conf_percent = as.numeric(conf_percent),
    conf_place = as.factor(conf_place),
    how_qual = as.factor(how_qual),
    # Kept numeric like conf_w / conf_l. These were previously passed through
    # as.logical(), which maps every non-zero number to TRUE and so loses the
    # value itself (presumably win/loss counts - confirm against the raw file).
    tourney_w = as.numeric(tourney_w),
    tourney_l = as.numeric(tourney_l),
    # Values not listed in `levels` become NA.
    tourney_finish = factor(
      tourney_finish,
      levels = c("OR", "1st", "2nd", "RSF", "RF", "NSF", "N2nd", "Champ")
    ),
    full_percent = as.numeric(full_percent),
    # Strip the "^" marker, then TRUE for "Y". Done in one step; the earlier
    # intermediate factor conversion had no effect on the result.
    first_home_game = str_replace_all(first_home_game, "\\^", "") == "Y"
  )
usethis::use_data(ncaa_w_bball_tourney, overwrite = TRUE)
# partisan-lean ----------------------------------------------------------------
# District file: split `district` into state + district number and `pvi_538`
# into party + amount, then swap the state abbreviation for the full name
# using the built-in state.abb / state.name vectors.
partisan_lean_district <- "data-raw/partisan-lean/fivethirtyeight_partisan_lean_DISTRICTS.csv" %>%
  read_csv() %>%
  clean_names() %>%
  separate(district, into = c("state", "district_number")) %>%
  separate(pvi_538, into = c("pvi_party", "pvi_amount")) %>%
  mutate(
    state = as.factor(state.name[match(state, state.abb)]),
    district_number = as.numeric(district_number),
    pvi_party = as.factor(pvi_party),
    pvi_amount = as.numeric(pvi_amount)
  )
usethis::use_data(partisan_lean_district, overwrite = TRUE)
# State file: same party/amount split; `state` is stored as read, as a factor.
partisan_lean_state <- "data-raw/partisan-lean/fivethirtyeight_partisan_lean_STATES.csv" %>%
  read_csv() %>%
  clean_names() %>%
  separate(pvi_538, into = c("pvi_party", "pvi_amount")) %>%
  mutate(
    pvi_amount = as.numeric(pvi_amount),
    pvi_party = as.factor(pvi_party),
    state = as.factor(state)
  )
usethis::use_data(partisan_lean_state, overwrite = TRUE)
# political-elasticity-scores --------------------------------------------------
# District file: split `district` into state + district number, then swap the
# state abbreviation for the full name via state.abb / state.name.
elasticity_by_district <- "data-raw/political-elasticity-scores/elasticity-by-district.csv" %>%
  read_csv() %>%
  clean_names() %>%
  separate(district, into = c("state", "district_number")) %>%
  mutate(
    state = as.factor(state.name[match(state, state.abb)]),
    district_number = as.numeric(district_number)
  )
usethis::use_data(elasticity_by_district, overwrite = TRUE)
# State file: join on `state_info` (created outside this section) by state
# abbreviation; the joined table's own `state` column arrives as `state.y`
# and is kept as `state_name`.
elasticity_by_state <- "data-raw/political-elasticity-scores/elasticity-by-state.csv" %>%
  read_csv() %>%
  clean_names() %>%
  left_join(state_info, by = c("state" = "state_abbrev")) %>%
  select(state, state_name = state.y, elasticity)
usethis::use_data(elasticity_by_state, overwrite = TRUE)
# russia-investigation ---------------------------------------------------------
# The four descriptive columns are stored as factors.
russia_investigation <- "data-raw/russia-investigation/russia-investigation.csv" %>%
  read_csv() %>%
  clean_names()
russia_investigation <- russia_investigation %>%
  mutate(
    president = as.factor(president),
    type = as.factor(type),
    name = as.factor(name),
    investigation = as.factor(investigation)
  )
usethis::use_data(russia_investigation, overwrite = TRUE)
# senate-forecast-2018 ---------------------------------------------------------
# National forecast: `state` is dropped; party and model become factors.
senate_national_forecast <- "https://projects.fivethirtyeight.com/congress-model-2018/senate_national_forecast.csv" %>%
  read_csv() %>%
  clean_names() %>%
  select(-state) %>%
  mutate(
    model = as.factor(model),
    party = as.factor(party)
  )
usethis::use_data(senate_national_forecast, overwrite = TRUE)
# Seat-level forecast: the descriptive columns become factors.
senate_seat_forecast <- "https://projects.fivethirtyeight.com/congress-model-2018/senate_seat_forecast.csv" %>%
  read_csv() %>%
  clean_names()
senate_seat_forecast <- senate_seat_forecast %>%
  mutate(
    model = as.factor(model),
    party = as.factor(party),
    candidate = as.factor(candidate),
    state = as.factor(state)
  )
usethis::use_data(senate_seat_forecast, overwrite = TRUE)
# trump-lawsuits ---------------------------------------------------------------
# Every descriptive column listed below is stored as a factor.
trump_lawsuits <- "data-raw/trump-lawsuits/trump-lawsuits.csv" %>%
  read_csv() %>%
  clean_names()
trump_lawsuits <- trump_lawsuits %>%
  mutate(
    status = as.factor(status),
    issue = as.factor(issue),
    trump_category = as.factor(trump_category),
    type = as.factor(type),
    capacity = as.factor(capacity),
    judge = as.factor(judge),
    previous_location = as.factor(previous_location),
    current_location = as.factor(current_location),
    defendant = as.factor(defendant),
    plaintiff = as.factor(plaintiff),
    case_name = as.factor(case_name)
  )
usethis::use_data(trump_lawsuits, overwrite = TRUE)
# masculinity-survey -----------------------------------------------------------
# Question and response text are stored as factors.
masculinity_survey <- "data-raw/masculinity-survey/masculinity-survey-r.csv" %>%
  read_csv() %>%
  clean_names()
masculinity_survey <- masculinity_survey %>%
  mutate(
    response = as.factor(response),
    question = as.factor(question)
  )
usethis::use_data(masculinity_survey, overwrite = TRUE)
# Datasets cleaned and written by Natalia Iannucci
# media-mentions-2020 ----------------------------------------------------------
# Both weekly files are read up front, then cleaned and saved one at a time.
media_mentions_cable <- "data-raw/media-mentions-2020/cable_weekly.csv" %>%
  read_csv()
media_mentions_online <- "data-raw/media-mentions-2020/online_weekly.csv" %>%
  read_csv()
# Cable: the three clip-count columns are stored as numeric.
media_mentions_cable <- clean_names(media_mentions_cable) %>%
  mutate(
    total_clips = as.numeric(total_clips),
    all_candidate_clips = as.numeric(all_candidate_clips),
    matched_clips = as.numeric(matched_clips)
  )
usethis::use_data(media_mentions_cable, overwrite = TRUE)
# Online: the two story-count columns are stored as numeric.
media_mentions_online <- clean_names(media_mentions_online) %>%
  mutate(
    all_candidate_stories = as.numeric(all_candidate_stories),
    matched_stories = as.numeric(matched_stories)
  )
usethis::use_data(media_mentions_online, overwrite = TRUE)
# Datasets cleaned and written by Marium Tapal
# index ------------------------------------------------------------------------
# State and party are stored as factors; headers are used as read.
state_index <- "data-raw/state-of-the-state/index.csv" %>%
  read_csv()
state_index <- state_index %>%
  mutate(
    party = as.factor(party),
    state = as.factor(state)
  )
usethis::use_data(state_index, overwrite = TRUE)
# words ------------------------------------------------------------------------
# Category is stored as a factor; the three count columns as integer.
state_words <- "data-raw/state-of-the-state/words.csv" %>%
  read_csv()
state_words <- state_words %>%
  mutate(
    total = as.integer(total),
    r_speeches = as.integer(r_speeches),
    d_speeches = as.integer(d_speeches),
    category = as.factor(category)
  )
usethis::use_data(state_words, overwrite = TRUE)
# Datasets cleaned and written by Alina Barylsky
# dem_candidates ---------------------------------------------------------------
# Columns recorded as "Yes"/"No" in the raw file; stored as logical below.
dem_yes_no_cols <- c(
  "won_primary", "veteran", "lgbtq", "elected_official", "self_funder",
  "stem", "obama_alum", "party_support", "emily_endorsed",
  "guns_sense_candidate", "biden_endorsed", "warren_endorsed",
  "sanders_endorsed", "our_revolution_endorsed", "justice_dems_endorsed",
  "pccc_endorsed", "indivisible_endorsed", "wfp_endorsed",
  "vote_vets_endorsed", "no_labels_support"
)
dem_candidates <- read_csv("data-raw/primary-candidates-2018/dem_candidates.csv") %>%
  clean_names() %>%
  mutate(
    # Change classes from character to factor and date to date.
    state = as.factor(state),
    office_type = as.factor(office_type),
    race_type = as.factor(race_type),
    primary_status = as.factor(primary_status),
    primary_runoff_status = as.factor(primary_runoff_status),
    general_status = as.factor(general_status),
    race = as.factor(race),
    race_primary_election_date = as.Date(race_primary_election_date, format = "%m/%d/%y"),
    # Transform `district` into two variables: the body of government ...
    body = as.factor(case_when(
      str_detect(district, "Governor") ~ "governor",
      str_detect(district, "House") ~ "house",
      str_detect(district, "Senate") ~ "senate"
    )),
    # ... and the district number (NA when `district` contains no digits).
    district_num = as.double(str_extract(district, "[[:digit:]]+"))
  ) %>%
  # Remove the original district variable.
  select(-district)
# Map "Yes" to TRUE, any other value to FALSE, and keep NA as NA.
# The previous approach turned each column into a factor, overwrote its
# levels with c(FALSE, TRUE) by position and then called as.logical(). That
# (a) left `vote_vets_endorsed` as all NA, because it was never converted to a
# factor and as.logical() does not understand "Yes"/"No", and (b) would label
# "Yes" as FALSE in any column where "Yes" is the only value present.
dem_candidates[dem_yes_no_cols] <- lapply(
  dem_candidates[dem_yes_no_cols],
  function(answer) answer == "Yes"
)
dem_candidates <- dem_candidates %>%
  select(candidate, state, body, district_num, office_type, everything())
usethis::use_data(dem_candidates, overwrite = TRUE)
# Datasets cleaned and written by Sunni Raleigh
## wwc_matches -----------------------------------------------------------------
# Drop the league identifiers and store team names in snake_case without the
# "_women" suffix.
# to_snake_case() is namespace-qualified so this section does not depend on
# the snakecase package having been attached elsewhere in the script.
wwc_2019_matches <- readr::read_csv("https://projects.fivethirtyeight.com/soccer-api/international/2019/wwc_matches.csv") %>%
  clean_names() %>%
  select(-c(league, league_id)) %>%
  mutate(
    team1 = str_remove_all(snakecase::to_snake_case(team1), "_women"),
    team2 = str_remove_all(snakecase::to_snake_case(team2), "_women")
  )
usethis::use_data(wwc_2019_matches, overwrite = TRUE)
## wwc_forecasts ---------------------------------------------------------------
# Store team names in snake_case as a factor, keep the forecast timestamp as a
# plain date named `date`, drop `timestamp`, and sort by date and group.
wwc_2019_forecasts <- readr::read_csv("https://projects.fivethirtyeight.com/soccer-api/international/2019/wwc_forecasts.csv") %>%
  clean_names() %>%
  mutate(
    team = as.factor(snakecase::to_snake_case(team)),
    forecast_timestamp = as.Date(forecast_timestamp),
    group = as.factor(group)
  ) %>%
  rename(date = forecast_timestamp) %>%
  select(-timestamp) %>%
  arrange(date, group)
usethis::use_data(wwc_2019_forecasts, overwrite = TRUE)
# Datasets cleaned and written by Anna Ballou
# foul-balls -------------------------------------------------------------------
# Normalise two spellings in `matchup` ("VS" and "Diamondsbacks") and store the
# hit-type and zone columns as factors.
foul_balls <- "data-raw/foul-balls/foul-balls.csv" %>%
  read_csv() %>%
  clean_names()
foul_balls <- foul_balls %>%
  mutate(
    predicted_zone = as.factor(predicted_zone),
    used_zone = as.factor(used_zone),
    camera_zone = as.factor(camera_zone),
    type_of_hit = as.factor(type_of_hit),
    matchup = str_replace_all(matchup, c("VS" = "vs", "Diamondsbacks" = "Diamondbacks"))
  )
usethis::use_data(foul_balls, overwrite = TRUE)
# Datasets cleaned and written by Jane Bang
# nba-draymond -----------------------------------------------------------------
# Only the column names are cleaned; values are stored as read.
nba_draymond <- "data-raw/nba-draymond/draymond.csv" %>%
  read_csv() %>%
  clean_names()
usethis::use_data(nba_draymond, overwrite = TRUE)
# Datasets cleaned and written by Jordan Moody
# fight-songs ------------------------------------------------------------------
# Import dataset from csv
fight_songs <- read_csv("data-raw/fight-songs/fight-songs.csv")
# Change variable types
fight_songs <- fight_songs %>%
  rename(num_fights = number_fights) %>%
  mutate(
    year = as.numeric(year),
    # `conference` is deliberately left as read. It was previously run through
    # the same == "Yes" conversion as the flag columns below, which replaces
    # the column with TRUE/FALSE and discards whatever it held.
    # NOTE(review): presumably this column holds the school's athletic
    # conference name rather than a Yes/No flag - confirm against the raw file.
    # Yes/No columns -> logical ("Yes" is TRUE, anything else FALSE, NA kept).
    student_writer = student_writer == "Yes",
    official_song = official_song == "Yes",
    contest = contest == "Yes",
    fight = fight == "Yes",
    victory = victory == "Yes",
    win_won = win_won == "Yes",
    victory_win_won = victory_win_won == "Yes",
    rah = rah == "Yes",
    nonsense = nonsense == "Yes",
    colors = colors == "Yes",
    men = men == "Yes",
    opponents = opponents == "Yes",
    spelling = spelling == "Yes"
  )
# Names are clean, checked with janitor's clean_names()
# Overwrite .rda files
usethis::use_data(fight_songs, overwrite = TRUE)
# Datasets cleaned and written by Kara Van Allen
# nba-elo-latest ---------------------------------------------------------------
# Download the latest Elo file and clear its row names.
nba_elo_latest <- "https://projects.fivethirtyeight.com/nba-model/nba_elo_latest.csv" %>%
  read_csv()
rownames(nba_elo_latest) <- NULL
# Clean the headers, turn every logical column into numeric, then restore
# `neutral` and `playoff` to logical.
nba_elo_latest <- clean_names(nba_elo_latest) %>%
  mutate_if(is.logical, as.numeric) %>%
  mutate(
    neutral = as.logical(neutral),
    playoff = as.logical(playoff)
  )
# Write the .rda file
usethis::use_data(nba_elo_latest, overwrite = TRUE)
# Datasets cleaned and written by Lizette Carpenter
# cabinet_turnover -------------------------------------------------------------
# Order in which the presidents appear as factor levels.
presidents <- c("Carter", "Reagan", "Bush 41", "Clinton", "Bush 43", "Obama", "Trump")
cabinet_turnover <- "data-raw/cabinet-turnover/cabinet-turnover.csv" %>%
  read_csv() %>%
  clean_names() %>%
  mutate(
    # Flag rows whose length is marked "combined" before stripping that
    # marker and converting the remainder to a number.
    combined = str_detect(length, "combined"),
    length = as.numeric(str_replace_all(length, " combined", "")),
    # Hand-entered start dates for six appointees; every other row keeps the
    # value from the file. All of them are then parsed as month/day/year.
    start = mdy(case_when(
      appointee == "Bob Gates" ~ "12/18/2006",
      appointee == "George Tenet" ~ "7/3/95",
      appointee == "Lauro Cavazos" ~ "9/20/88",
      appointee == "Dick Thornburgh" ~ "8/15/88",
      appointee == "Bill Webster" ~ "5/26/87",
      appointee == "Nicholas Brady" ~ "9/15/88",
      TRUE ~ start
    )),
    # Same treatment for the end dates.
    end = mdy(case_when(
      appointee == "Bob Gates" ~ "6/30/2011",
      appointee == "George Tenet" ~ "7/11/97",
      appointee == "Lauro Cavazos" ~ "12/12/1990",
      appointee == "Dick Thornburgh" ~ "8/15/1991",
      appointee == "Bill Webster" ~ "8/31/1991",
      appointee == "Nicholas Brady" ~ "1/17/1993",
      TRUE ~ end
    )),
    # Names not listed in `presidents` become NA.
    president = factor(president, levels = presidents)
  )
usethis::use_data(cabinet_turnover, overwrite = TRUE)
# Datasets cleaned and written by Fatima Keita
# impeachment-polls ------------------------------------------------------------
# Parse the start/end dates as month/day/year, store the descriptive columns
# as factors and `include` as logical.
impeachment_polls <- "data-raw/impeachment-polls/IMPEACHMENT-POLLS - Public Master List.csv" %>%
  read_csv() %>%
  clean_names()
impeachment_polls <- impeachment_polls %>%
  mutate(
    # NOTE(review): as.logical() only recognises TRUE/FALSE/T/F-style strings
    # (other text becomes NA) - confirm the raw `include` values use that form.
    include = as.logical(include),
    category = as.factor(category),
    pop = as.factor(pop),
    pollster = as.factor(pollster),
    sponsor = as.factor(sponsor),
    end = mdy(end),
    start = mdy(start)
  )
usethis::use_data(impeachment_polls, overwrite = TRUE)
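# The two lines below are web-page residue, not part of the script; they are
# kept as comments so the file still parses as R.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.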