library(rvest)
library(pbapply)
library(tidyverse, quietly = TRUE)
library(lubridate)

# One index page URL per year in the covered range.
years <- 1998:2017
urls <- sprintf("https://www.ecb.europa.eu/press/pressconf/%d/html/index.en.html", years)

#' Extract the press conference links found on one index page.
#'
#' @param url Address of an index page to download and parse.
#' @return Character vector holding the `href` attribute of every anchor of
#'   class "arrow" inside the content column whose target contains
#'   "pressconf".
get_pressconf_url <- function(url) {
  url %>%
    read_html() %>%
    html_nodes(xpath = '//div[@id="ecb-content-col"]//a[@class="arrow" and contains(@href, "pressconf")]') %>%
    html_attr("href")
}

# Visit every index page (with a progress bar) and flatten the results.
links_by_year <- pbsapply(urls, get_pressconf_url, USE.NAMES = FALSE)
links <- unlist(links_by_year)

sprintf("Number of press conferences: %d", length(links))
head(links)
Crawl each press conference page and scrape its data.
library(rvest)
library(stringr)
library(data.table)

#' Scrape one press conference page into a long data frame.
#'
#' The page is split into the introductory statement and, when a Q&A marker
#' is found, into individual questions and answers.
#'
#' @param url Address of the press conference page; it is opened by jumping
#'   from a session started at `baseurl`.
#' @param baseurl Site root used to start the browsing session.
#' @return A data frame with one row per text unit and the columns `text`,
#'   `date` (parsed with `dmy()` from the breadcrumb), `type` ("speech",
#'   "question" or "answer") and `speaker`.
crawl_ecb <- function(url, baseurl = "https://www.ecb.europa.eu/") {
  # Split each string at the first match of `pattern` into two columns.
  # The empty case is handled explicitly so that `[, 2]` is always valid.
  split_once <- function(x, pattern) {
    if (length(x) == 0) {
      return(matrix(character(0), nrow = 0, ncol = 2))
    }
    str_split(x, pattern, n = 2, simplify = TRUE)
  }

  sess <- html_session(baseurl)
  webpage <- read_html(jump_to(sess, url))

  # Find labelling data
  press_date <- html_nodes(webpage, xpath = '//nav[@id="ecb-breadcrumbs"]//span[@class="ecb-bcCurrent"]') %>%
    html_text %>%
    dmy
  # Speaker label: subtitle text up to the first comma.
  speech_speaker <- html_nodes(webpage, xpath = '//h2[@class="ecb-pressContentSubtitle"]') %>%
    html_text %>%
    str_split(",", n = 2) %>%
    .[[1]] %>%
    .[1]

  # Break into speech and Q&A.
  # The first matched node is dropped (presumably a lead-in paragraph --
  # TODO confirm against the page layout).
  data <- html_nodes(webpage, xpath = '//*[@id="ecb-content-col"]//article//p | //*[@id="ecb-content-col"]//article//h2[contains(text(),"Transcript") or contains(text(),"transcript") or contains(*/text(),"Transcript") or contains(*/text(),"transcript")] | //*[@id="ecb-content-col"]//article//a[@id="qa"]')[-1]
  search_qa <- which(grepl('^<h2', data) |
                       grepl('id="qa"', data) |
                       grepl('disposal for questions', data) |
                       grepl('open to questions', data))

  if (length(search_qa) > 0) {
    qaRow <- max(search_qa) # Select last marker, in case leading H2s are captured

    # seq_len() keeps both ranges empty (instead of counting backwards) when
    # the marker is the first or the last node.
    speech_text <- html_text(data[seq_len(qaRow - 1)])
    qa_data <- trimws(html_text(data[qaRow + seq_len(length(data) - qaRow)]))

    # Further break Q&A
    ## Identify the first paragraph of each question / response
    label <- substr(qa_data, 1, str_locate(qa_data, ":")[, 1] - 1) # text before the first ":"
    ided <- grepl("^ *[Qq]uestion.*:", qa_data) | # Starts with "Question:"
      grepl("^ *[Qq]:", qa_data) | # Starts with "Q:"
      (grepl('^[a-zA-Z_ ()]{1,22}:', qa_data, perl = TRUE) & ## Starts with some words followed by ":"
         grepl('^((?![q|Q]uestion:).)*$', substr(qa_data, 1, 30), perl = TRUE) & ## AND must not start with "Question"
         !grepl('^(on|and|about) ', tolower(qa_data)) & ## AND must not start with "on ", "and " or "about "
         !grepl('point:', substr(qa_data, 1, 30), perl = TRUE) & ## AND must not have "point:"
         !grepl('first|second|third|fourth|fifth|sixth', tolower(label)) & ## AND not have counters
         str_count(label, " ") < 3) ## AND less than 3 spaces

    ## Number the turns: every identified paragraph opens a new turn and the
    ## unidentified paragraphs that follow it share its number.
    turn_id <- cumsum(ided)

    ## Paragraphs ahead of the first identified turn have no owner; keep them
    ## with the speech text rather than indexing with max() of an empty set.
    unattributed <- turn_id == 0
    speech_text <- c(speech_text, qa_data[unattributed])

    ## Append unidentified paragraphs to the identified paragraph they follow
    qa_data <- unname(vapply(
      split(qa_data[!unattributed], turn_id[!unattributed]),
      paste, character(1), collapse = ". "
    ))

    ## A logical mask is used because negative indexing with an empty index
    ## vector would select nothing and drop every answer.
    is_question <- grepl("^[Qq]uestion.*:", tolower(qa_data)) | grepl("^ *[Qq]:", qa_data)
    qn_text <- trimws(split_once(qa_data[is_question], "^[Qq].*?:")[, 2]) # Remove the leading "Question:"

    ## Split answer into speaker and text
    split_text <- split_once(qa_data[!is_question], ":")
    ans_speaker <- trimws(split_text[, 1])
    ans_text <- trimws(split_text[, 2])
  } else {
    # No Q&A marker: the whole page is treated as speech.
    speech_text <- html_text(data)
    qn_text <- NULL
    ans_text <- NULL
    ans_speaker <- NULL
  }

  data.frame(
    text = c(speech_text, qn_text, ans_text),
    date = press_date,
    type = c(rep("speech", length(speech_text)),
             rep("question", length(qn_text)),
             rep("answer", length(ans_text))),
    speaker = c(rep(speech_speaker, length(speech_text)),
                rep("press", length(qn_text)),
                ans_speaker)
  )
}

# Scrape every press conference page (with a progress bar).
l <- pblapply(links, crawl_ecb)

# These are not policy meetings
drop_dates <- ymd(c("2001-12-13", "2003-09-17", "2003-10-13", "2005-01-20", "2005-01-21"))

df <- rbindlist(l) %>%
  mutate(text = as.character(text), speaker = as.character(speaker)) %>%
  filter(!(date %in% drop_dates))

rm(list = c("l")) # Clear some memory
# Raw speaker labels as they appear in the scraped data, keyed by the
# surname each label is normalised to.
speaker_aliases <- list(
  Duisenberg = c("Introductory statement Willem F. Duisenberg", "(Duisenberg",
                 "Willem F. Duisenberg", "WFD"),
  Trichet = c("held by Jean-Claude Trichet", "Jean-Claude Trichet",
              "Trichet (translation)", "Trichet (Translation)"),
  Draghi = c("Mario Draghi"),
  Papademos = c("Lucas Papademos"),
  Constancio = c("Vítor Constâncio"),
  Fazio = c("Fazio (Translation)"),
  Rato = c("Statements by Rodrigo Rato"),
  Noyer = c("Noyer (translation)")
)

# Lookup table with one row per alias: `from` is the raw label, `to` the
# normalised name. Built explicitly instead of relying on column recycling
# followed by de-duplication.
people <- tibble(
  to = rep(names(speaker_aliases), lengths(speaker_aliases)),
  from = unlist(speaker_aliases, use.names = FALSE)
)

# NOTE(review): not referenced anywhere in the visible code -- confirm
# whether a later chunk uses it.
replace_text <- tibble(from = c("\u2019"), to = c(""))

# Normalise speaker names and strip every character that cannot be
# represented in ASCII (`sub = ""` removes them instead of yielding NA).
df <- df %>%
  mutate(
    speaker = plyr::mapvalues(speaker, people$from, people$to),
    text = iconv(text, "UTF-8", "ASCII", sub = "")
  )
library(ggplot2)

# Number of text rows per press conference date and row type.
# `drop = FALSE` used to be passed to summarise(), where it is not an option
# but a new constant column named `drop`; it has been removed. The result is
# ungrouped so later verbs do not silently operate per date.
df_summary <- df %>%
  group_by(date, type) %>%
  summarise(count = n()) %>%
  ungroup()

# One point per date and type, with a zero reference line.
g <- ggplot(df_summary, aes(date, count))
g + geom_point(aes(colour = type)) + geom_hline(yintercept = 0)
# Save the cleaned data. The target folder is created first so the export
# does not fail, after the whole crawl has run, just because "data/" is
# missing; an existing folder is left untouched.
out_file <- "data/ecb_speeches.csv"
dir.create(dirname(out_file), showWarnings = FALSE, recursive = TRUE)
write.csv(df, file = out_file, row.names = FALSE)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.