Extracting the links to each of the ECB press conferences

library(rvest)
library(pbapply)
library(tidyverse, quietly = TRUE)
library(lubridate)

years <- 1998:2017
urls <- sprintf("https://www.ecb.europa.eu/press/pressconf/%d/html/index.en.html", years)

get_pressconf_url <- function(url) {
  webpage <- read_html(url)
  new_urls <- html_nodes(webpage, xpath='//div[@id="ecb-content-col"]//a[@class="arrow" 
                         and contains(@href, "pressconf")]') %>%
    html_attr("href")
  new_urls
}

links <- unlist(pbsapply(urls, get_pressconf_url, USE.NAMES=FALSE))

sprintf("Number of press conferences: %d", length(links))
head(links)
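
If one year's index page fails to load, the whole sapply call errors out. A minimal, more forgiving wrapper might look like this (the tryCatch fallback to an empty vector is my own addition, not part of the original code):

get_pressconf_url_safe <- function(url) {
  tryCatch(get_pressconf_url(url),
           error = function(e) {
             warning("Failed to fetch ", url, ": ", conditionMessage(e))
             character(0)  # Empty vector, so unlist() still works
           })
}

# links <- unlist(pbsapply(urls, get_pressconf_url_safe, USE.NAMES = FALSE))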

Scraping each press conference speech

Crawl each press conference page and scrape its contents, splitting the introductory statement from the Q&A

library(rvest)
library(stringr)
library(data.table)

crawl_ecb <- function(url, baseurl = "https://www.ecb.europa.eu/") {
  sess <- html_session(baseurl)
  webpage <- read_html(jump_to(sess, url))

  # Find labelling data
  press_date <- html_nodes(webpage, xpath='//nav[@id="ecb-breadcrumbs"]//span[@class="ecb-bcCurrent"]') %>%
    html_text %>%
    dmy
  speech_speaker <- html_nodes(webpage, xpath='//h2[@class="ecb-pressContentSubtitle"]') %>%
    html_text %>%
    str_split(",", n=2) %>%
    .[[1]] %>% .[1]
  # Break into speech and Q&A
  data <- html_nodes(webpage, xpath='//*[@id="ecb-content-col"]//article//p | 
                     //*[@id="ecb-content-col"]//article//h2[contains(text(),"Transcript") or
                        contains(text(),"transcript") or 
                        contains(*/text(),"Transcript") or 
                        contains(*/text(),"transcript")] |
                     //*[@id="ecb-content-col"]//article//a[@id="qa"]')[-1]

  search_qa <- which(grepl('^<h2', data) | 
                       grepl('id="qa"', data) | 
                       grepl('disposal for questions',data) | 
                       grepl('open to questions', data))

  if (length(search_qa) > 0) {
    qaRow <- max(search_qa)  # Select the last match, in case leading <h2>s are captured

    speech_text <- html_text(data[1:(qaRow-1)])
    qa_data <- trimws(html_text(data[(qaRow+1):length(data)]))


    # Further break Q&A

    ## Identify the first paragraph of each question / response
    ided <- grepl("^ *[Qq]uestion.*:", qa_data) |  # Starts with "Question:"
              grepl("^ *[Qq]:", qa_data) |         # Starts with "Q:"
              (grepl('^[a-zA-Z_ ()]{1,22}:', qa_data, perl=TRUE) &  ## Starts with a short phrase followed by ":"
                 grepl('^((?![qQ]uestion:).)*$', substr(qa_data, 1, 30), perl=TRUE) &  ## AND must not start with "Question:"
                 !grepl('^(on|and|about) ', tolower(qa_data)) &  ## AND must not start with "on ", "and " or "about "
                 !grepl('point:', substr(qa_data, 1, 30), perl=TRUE) &  ## AND must not contain "point:"
                 !grepl('first|second|third|fourth|fifth|sixth', tolower(substr(qa_data, 1, str_locate(qa_data, ":")[,1]-1))) &  ## AND no ordinal counters before the ":"
                 str_count(substr(qa_data, 1, str_locate(qa_data, ":")[,1]-1), " ") < 3)  ## AND fewer than 3 spaces before the ":"
    unid_rows <- which(!ided)
    id_rows <- which(ided)

    if (length(unid_rows) > 0) {
      ## Append unidentified paras to identified paras
      belongs_to <- sapply(unid_rows, function(x) { max(id_rows[id_rows < x])})

      for (i in 1:length(belongs_to)) {
        qa_data[belongs_to[i]] <- paste(qa_data[belongs_to[i]], qa_data[unid_rows[i]], sep=". ")
      }

      ## Delete unidentified paras
      qa_data <- qa_data[-unid_rows]
    } 

    qnRows <- which(grepl("^[Qq]uestion.*:", tolower(qa_data)) | grepl("^ *[Qq]:", qa_data))
    qn_text <- trimws(str_split(qa_data[qnRows],"^[Qq].*?:", n=2, simplify=TRUE)[,2])  # Remove the leading "Question:"
    speaker_ans_text <- qa_data[-qnRows]

    ## Split answer into speaker and text
    split_text <- str_split(speaker_ans_text,":", n=2, simplify=TRUE)
    ans_speaker <- trimws(split_text[,1])
    ans_text <- trimws(split_text[,2])

  } else {
      speech_text <- html_text(data)
      qn_text <- NULL
      ans_text <- NULL
      ans_speaker <- NULL
  }
  data.frame(text = c(speech_text, qn_text, ans_text),
             date = press_date,
             type = c(rep("speech", length(speech_text)),
                      rep("question", length(qn_text)),
                      rep("answer", length(ans_text))),
             speaker = c(rep(speech_speaker, length(speech_text)),
                         rep("press", length(qn_text)),
                         ans_speaker))
}
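
It can be worth spot-checking the scraper on a single page before crawling everything, e.g.:

# Spot check: structure of the parsed first press conference
str(crawl_ecb(links[1]))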
l <- pblapply(links, crawl_ecb)

# These are not policy meetings
drop_dates <- ymd(c("2001-12-13", "2003-09-17", "2003-10-13", "2005-01-20", "2005-01-21"))

df <- rbindlist(l) %>%
  mutate(text = as.character(text), 
         speaker = as.character(speaker)) %>% 
  filter(!(date %in% drop_dates))

rm(list=c("l")) # Clear some memory
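
The crawl takes a while, so it may be worth caching the result to disk; the .rds path here is just an illustration:

# Cache the scraped data so the crawl need not be repeated (hypothetical path)
saveRDS(df, "data/ecb_speeches_raw.rds")
# df <- readRDS("data/ecb_speeches_raw.rds")  # Restore in a later session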

Standardise the speaker names

people <- data_frame(Duisenberg = c("Introductory statement Willem F. Duisenberg", "(Duisenberg", "Willem F. Duisenberg", "WFD"),
              Trichet = c("held by Jean-Claude Trichet", "Jean-Claude Trichet", "Trichet (translation)", "Trichet (Translation)"),
              Draghi = c("Mario Draghi"),
              Papademos = c("Lucas Papademos"),
              Constancio = c("Vítor Constâncio"),
              Fazio = c("Fazio (Translation)"),
              Rato = c("Statements by Rodrigo Rato"),
              Noyer = c("Noyer (translation)")) %>%
  gather(to, from) %>%
  unique

# Characters to strip explicitly before the ASCII conversion (curly apostrophes)
replace_text <- data_frame(from = c("\u2019"),
                           to = c(""))

df <- df %>%
  mutate(speaker = plyr::mapvalues(speaker, people$from, people$to),
         text = str_replace_all(text, fixed(replace_text$from), replace_text$to),
         text = iconv(text, "UTF-8", "ASCII", ""))  # Drop any remaining non-ASCII
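
A quick check for speaker strings the mapping missed; anything printed here would need a new entry in people:

# Speaker labels not covered by the people table ("press" marks questions)
setdiff(unique(df$speaker), c(people$to, "press"))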

Check

library(ggplot2)

df_summary <- df %>%
  group_by(date, type) %>%
  summarise(count = n())

g <- ggplot(df_summary, aes(date, count))
g + geom_point(aes(colour = type)) +
  geom_hline(yintercept = 0)
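
Meetings with no question rows would suggest the Q&A split failed; a tabular check along these lines (using tidyr's spread, my own addition) complements the plot:

# Meetings where no questions were extracted -- possible parsing failures
df_summary %>%
  spread(type, count, fill = 0) %>%
  filter(question == 0)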

Write to file

write.csv(df, file = "data/ecb_speeches.csv", row.names = FALSE)
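
As a last check, the file can be read back to confirm the round trip:

# Optional round-trip check
chk <- read.csv("data/ecb_speeches.csv", stringsAsFactors = FALSE)
stopifnot(nrow(chk) == nrow(df))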

