# R/scrape_rcp.R

#' Scrape poll data from realclearpolitics.com
#'
#' Downloads a polling page, extracts the poll table, and returns the cleaned
#' poll data together with a plot of the spread between the first two result
#' columns over time.
#'
#' The page's \code{Date} column is parsed as month/day only, so the year has
#' to be inferred: the current year is used unless that would put the closing
#' date in the future (or yield an invalid date such as 02/29), in which case
#' the previous year is used. Polls that closed more than a year ago therefore
#' cannot be dated correctly from the page alone.
#'
#' @param url the URL of the page with the html table of your data
#' @return A list with two elements:
#'   \describe{
#'     \item{poll_data}{A data frame with one row per poll and the columns
#'       \code{poll}, \code{poll_id}, \code{close_date}, \code{sample_size},
#'       the first two result columns (divided by 100 and named as on the
#'       page), and \code{spread} (first minus second result column, divided
#'       by 100).}
#'     \item{plot}{A ggplot object showing \code{spread} against
#'       \code{close_date} with a smoothed trend line.}
#'   }
#' @export
#' @import ggplot2
#' @import dplyr
#' @import stringr
#' @import rvest
#' @examples
#' \dontrun{
#' trump_approval <- scrape_rcp(url = "https://www.realclearpolitics.com/epolls/other/president_trump_job_approval-6179.html")
#' }
scrape_rcp <- function(url) {

  url <- as.character(url)
  if (length(url) != 1L || is.na(url)) {
    stop("`url` must be a single, non-missing URL.", call. = FALSE)
  }

  # Pull every html table on the page
  page <- read_html(url)
  tables <- html_table(html_nodes(page, "table"))

  # The poll data is taken from the fourth table on the page; fail with a
  # clear message instead of "subscript out of bounds" when it is absent.
  if (length(tables) < 4L) {
    stop("Expected at least 4 html tables at `url`, found ", length(tables),
         ".", call. = FALSE)
  }
  raw_table <- tables[[4]]

  # Drop the first two rows (presumably summary rows such as the polling
  # average -- confirm against the page layout)
  raw_table <- raw_table[-c(1:2), ]

  # Format poll dates: keep the closing date of each "start - end" range
  month_day <- vapply(str_split(raw_table$Date, "- "),
                      function(parts) tail(parts, 1),
                      character(1))
  month_day <- trimws(month_day)

  # The page gives no year. Start from the current year and fall back to the
  # previous year for dates that would otherwise be in the future or invalid.
  today <- Sys.Date()
  this_year <- as.integer(format(today, "%Y"))
  close_date <- as.Date(paste(this_year, month_day, sep = "/"),
                        format = "%Y/%m/%d")
  use_prior_year <- is.na(close_date) | close_date > today
  close_date[use_prior_year] <- as.Date(
    paste(this_year - 1L, month_day[use_prior_year], sep = "/"),
    format = "%Y/%m/%d"
  )

  # Identify horserace columns: everything that is not poll metadata
  meta_cols <- c("Poll", "Date", "Spread", "MoE", "Sample")
  hr <- raw_table[!names(raw_table) %in% meta_cols]
  if (ncol(hr) < 2L) {
    stop("Expected at least 2 result columns in the poll table, found ",
         ncol(hr), ".", call. = FALSE)
  }

  # Extract with [[ ]] so the results are plain vectors even when the table
  # is a tibble, and coerce in case the column was read as text.
  first_result <- as.numeric(hr[[1]])
  second_result <- as.numeric(hr[[2]])
  spread <- (first_result - second_result) / 100

  # Format sample size: keep the leading token (e.g. "1500" from "1500 RV")
  sample_size <- vapply(str_split(raw_table$Sample, " "),
                        function(parts) head(parts, 1),
                        character(1))

  # Identify polls: strip asterisks and number pollsters by first appearance
  poll <- str_replace_all(raw_table$Poll, "[*]", "")
  poll_id <- match(poll, unique(poll))

  # Combine into one data frame
  poll_data <- data.frame(poll,
                          poll_id,
                          close_date,
                          sample_size,
                          first_result / 100,
                          second_result / 100,
                          spread)
  # Only the first two result columns are kept, so only take their names
  names(poll_data)[5:6] <- names(hr)[1:2]

  # Plot the spread over time
  spread_plot <- ggplot(poll_data, aes(x = close_date)) +
    geom_hline(yintercept = 0,
               linetype = 2,
               colour = "darkgrey") +
    geom_point(aes(y = spread),
               colour = "red") +
    geom_smooth(aes(y = spread),
                fill = "pink",
                colour = "red") +
    xlab(" ") +
    theme(panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          panel.background = element_blank(),
          axis.line = element_line(colour = "black"),
          axis.text = element_text(colour = "black",
                                   size = 11),
          axis.title = element_text(colour = "black",
                                    size = 16,
                                    face = "bold"),
          plot.title = element_text(size = 20,
                                    face = "bold"))

  list(poll_data = poll_data,
       plot = spread_plot)
}
# alexpavlakis/rcpR documentation built on May 11, 2019, 6:26 p.m.