Parse HTML page: get House of Representatives chronology

Parses data-raw/chronology-page.html to extract data on representatives.

Outputs data-raw/chronology-raw.csv

library(rvest)
library(tidyverse)
library(here)

Parse out the table with rvest and convert it to a data frame:

# Read the saved chronology page from data-raw/ (path resolved with here()).
reps <- read_html(here("data-raw", "chronology-page.html"))

# The data lives in a <table>; collect every table on the page ...
tables_on_page <- html_nodes(reps, css = "table")

# ... and find the one whose `summary` attribute identifies the chronology.
# Tables without a `summary` attribute compare as NA, which which() ignores.
rep_table <- which(
  html_attr(tables_on_page, "summary") == "LegislatorsChronological"
)

# `[[` below needs exactly one index. Fail with a clear message rather than
# an obscure subscript error when the page layout is not what we expect.
if (length(rep_table) != 1L) {
  stop(
    "Expected exactly one <table summary=\"LegislatorsChronological\"> in ",
    "chronology-page.html, found ", length(rep_table), ".",
    call. = FALSE
  )
}

# Convert the matched table node to a data frame.
rep_df_raw <- tables_on_page[[rep_table]] %>%
  html_table(fill = TRUE)

Drop unnamed columns and empty rows, label each row with its session, and remove the session header rows:

# Keep only columns that have a name. `drop = FALSE` guarantees the result is
# still a data frame even if a single column survives.
rep_df <- rep_df_raw[, !is.na(names(rep_df_raw)), drop = FALSE]

# Drop rows with no legislator, normalise column names, and mark session rows:
#   session_line - TRUE for rows whose legislator text contains "Session :"
#   session_id   - running count of session rows seen so far, so every row
#                  shares an id with the session header that precedes it
rep_df <- rep_df %>%
  drop_na(Legislator) %>%
  rename_all(tolower) %>%
  rename(desk_number = `desk number`) %>%
  mutate(
    session_line = str_detect(legislator, "Session :"),
    session_id = cumsum(session_line)
  )

# Copy each session header's text into a `session` column for the rows that
# belong to it, then remove the header rows and the helper columns.
# The label is taken from the header row itself and header rows are removed by
# their flag, not by position: rows that appear before the first header
# (session_id 0) are kept with an NA session instead of losing their first row.
rep_with_session <- rep_df %>%
  group_by(session_id) %>%
  mutate(session = first(legislator[session_line])) %>%
  ungroup() %>%
  filter(!session_line) %>%
  select(-session_id, -session_line)

Output

# Write rep_with_session to data-raw/chronology-raw.csv (path resolved with
# here()).
write_csv(rep_with_session, here("data-raw", "chronology-raw.csv"))


or-house-vis/history documentation built on May 15, 2019, 1:11 p.m.