if(!require(pacman)) install.packages("pacman")
pacman::p_load("tidyverse")
pacman::p_load("XML")

# remotes::install_github("giocomai/castarter")

library("quotefinder")

This document outlines an approach for getting basic information about all members of the European Parliament (MEPs). The data collected will then be fed to the Quote Finder.

First, let's get the list of all MEPs, provided by the European Parliament in a convenient XML format.

all_MEPs <- XML::xmlToDataFrame(doc = "http://www.europarl.europa.eu/meps/en/full-list/xml",
                                stringsAsFactors = FALSE)

These are the fields provided:

colnames(all_MEPs)

And this is what the actual data look like:

pander::pander(head(all_MEPs))

This is useful information, but we still do not have references to their social media accounts, which are instead listed on each MEP's individual page. They can be extracted from there, although some minor data cleaning is needed.

library("castarter")

SetCastarter(project = "ep", website = "meps")
index_links <- paste0("http://www.europarl.europa.eu/meps/en/", all_MEPs$id)
CreateFolders()
DownloadContents(links = index_links, type = "index")
fullName <- ExtractTitles(container = "span",
                          containerClass = "ep_name erpl-member-card-full-member-name",
                          htmlLocation = fs::path("castarter", "ep", "meps", "IndexHtml"))

twitter_urls <- 
  purrr::map_chr(.x = seq_along(fullName),
                 .f = function (x) {
    temp <- ExtractLinks(container = "div",
                         containerClass = "ep-p_text link_twitt",
                         id = x)
    if (length(temp)==0) {
      NA_character_
    } else {
      temp[[1]]
    }
  })

facebook_urls <- 
  purrr::map_chr(.x = seq_along(fullName),
                 .f = function (x) {
    temp <- ExtractLinks(container = "div",
                         containerClass = "ep-p_text link_fb",
                         id = x)
    if (length(temp)==0) {
      NA_character_
    } else {
      temp[[1]]
    }
  })

twitter_urls <- twitter_urls %>%
  str_remove(pattern = "\\?.*") %>%
  str_replace(pattern = "http://@",
              replacement = "https://twitter.com/") %>%
  str_replace(pattern = "https://www.twitter.com/",
              replacement = "https://twitter.com/") %>%
  str_replace(pattern = "http://www.twitter.com/",
              replacement = "https://twitter.com/") %>%
  str_remove(pattern = "/$")


# drop entries where a facebook.com URL ended up in the Twitter field
twitter_urls[str_detect(string = twitter_urls, pattern = "www.facebook.com")&is.na(twitter_urls)==FALSE] <- NA


email <- 
  purrr::map_chr(.x = seq_along(fullName),
                 .f = function (x) {
    temp <- ExtractLinks(container = "div",
                         containerClass = "ep-p_text link_email",
                         id = x)
    if (length(temp)==0) {
      NA_character_
    } else {
      temp[[1]]
    }
  })
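
Since the same extract-or-NA pattern is repeated for Twitter, Facebook, and email links, it could also be wrapped in a small helper (a sketch that reuses the same ExtractLinks() arguments as above; extract_first_link is just an illustrative name, not part of castarter):

extract_first_link <- function(container_class) {
  purrr::map_chr(.x = seq_along(fullName),
                 .f = function(x) {
                   # take the first matching link on the MEP's page, or NA if there is none
                   temp <- ExtractLinks(container = "div",
                                        containerClass = container_class,
                                        id = x)
                   if (length(temp) == 0) NA_character_ else temp[[1]]
                 })
}

# equivalent to the three blocks above, e.g.:
# twitter_urls <- extract_first_link("ep-p_text link_twitt")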

Now, since it is both more common and more practical to refer to EP groups by the short version of their name, we need to match the long version provided on the official website with the standard short version.

groups <- tibble::tribble(~GroupLong, ~GroupShort,
                          "Group of the European People's Party (Christian Democrats)", "EPP",
                          "Non-attached Members", "NA",
                          "Identity and Democracy Group", "ID",
                          "Group of the Progressive Alliance of Socialists and Democrats in the European Parliament", "S&D",
                          "European Conservatives and Reformists Group", "ECR",
                          "Group of the Greens/European Free Alliance", "Greens–EFA",
                          "Renew Europe Group", "RE",
                          "Confederal Group of the European United Left - Nordic Green Left", "GUE-NGL"
)

pander::pander(groups)
readr::write_rds(x = as.list(groups$GroupShort), path = "EPGroupShort.rds")

It's now time to combine all these data in tabular format to facilitate further use.

all_MEPs_group_twitter <- 
  all_MEPs %>% 
  left_join(y = tibble::tibble(twitter_urls = twitter_urls,
                               facebook_urls = facebook_urls, 
                               fullName = fullName),
            by = "fullName")  %>% 
  mutate(officialURL = paste0("https://www.europarl.europa.eu/meps/en/", id)) %>% 
  mutate(screen_name = str_remove(string = twitter_urls,
                                  pattern = "https://twitter.com/")) %>% 
  left_join(y = groups %>%
              dplyr::rename(politicalGroup=GroupLong),
            by = "politicalGroup") %>% 
  mutate(email = str_replace(fullName, " ", ".") %>% 
           str_remove_all(" ") %>% 
           str_to_lower() %>% 
           paste0(., "@europarl.europa.eu") %>% 
           stringi::stri_trans_general(id = "Latin-ASCII"))

write_csv(x = all_MEPs_group_twitter, path = "all_MEPs_group_official.csv")
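
As a quick sanity check (a sketch assuming the objects created above), it is worth verifying that the join has not duplicated any MEP, which could happen if two MEPs shared exactly the same full name:

# should be TRUE: one row per MEP, as in the original list
nrow(all_MEPs_group_twitter) == nrow(all_MEPs)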

Does this adequately capture all the MEPs who are on Twitter?

all_MEPs_group_twitter %>% 
  mutate(`On Twitter` = is.na(twitter_urls)==FALSE) %>% 
  group_by(`On Twitter`) %>% 
  count() %>% 
  pander::pander()

Unfortunately, not quite: more than 300 MEPs have not included information about their Twitter account on their official page on the European Parliament's website, perhaps because they prefer to give more visibility to their Facebook account, or for other reasons.

Where else would it be possible to find a complete list of MEPs on Twitter?

The EP's Newshub collects information from social media accounts of MEPs and other EP-related figures, but does not seem to offer anything like an actual list of accounts.

The press service of the European Parliament conveniently offers a Twitter list of current MEPs.

library("rtweet")

twitter_token <- readRDS(file = "twitter_token.rds")

if (fs::file_exists("mep_df.rds")==FALSE) {
  mep_df <- rtweet::lists_members(slug = "meps-2019-2024",
                                  owner_user = "EuroParlPress",
                                  token = twitter_token)
  saveRDS(object = mep_df, file = "mep_df.rds")
} else {
  mep_df <- readRDS(file = "mep_df.rds")
}

The list includes r nrow(mep_df) MEPs, among them about a hundred who did not list their account on the official website of the EP. It seems likely, however, that this is not yet a complete list.

eliflab_meps <- read_csv(file = "https://raw.githubusercontent.com/eliflab/European-Parliament-Open-Data/master/meps_full_list_with_twitter_accounts.csv") 

Eliflab maintains a nicely formatted list of MEPs on Twitter, which includes the Twitter accounts of r nrow(eliflab_meps) MEPs.

By matching these three sources, and after some manual checks, it is possible to reach a reasonably complete and updated coverage of MEPs who are on Twitter.
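
For example, one way to see which accounts are still missing from the official-page scrape is to compare screen names across the three sources (a sketch assuming the objects created above; handles are lower-cased before the comparison, since Twitter handles are case-insensitive):

official_handles <- all_MEPs_group_twitter %>% 
  filter(is.na(screen_name)==FALSE) %>% 
  pull(screen_name) %>% 
  str_to_lower()

list_handles <- str_to_lower(mep_df$screen_name)

eliflab_handles <- eliflab_meps %>% 
  filter(is.na(SCREEN_NAME)==FALSE) %>% 
  pull(SCREEN_NAME) %>% 
  str_remove(pattern = "@") %>% 
  str_to_lower()

# handles found in the other two sources but not on the official pages
setdiff(union(list_handles, eliflab_handles), official_handles)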

Creating a list

While it is possible to request tweets for each user separately, to facilitate the regular collection of tweets from a relatively large number of users, as in this case, it is probably easier to create a Twitter list with all relevant users and then ask Twitter for new tweets in that list (rather than making ~600 separate requests, one per MEP).

all_mep_handles <- read_csv(file = "https://raw.githubusercontent.com/giocomai/European-Parliament-Open-Data/master/meps_full_list_with_twitter_accounts.csv") %>% 
  filter(is.na(SCREEN_NAME)==FALSE) %>% 
  pull(SCREEN_NAME) %>% 
  str_remove(pattern = "@")

post_list(users = all_mep_handles,
          name = "current_meps",
          description = "Current MEPs on Twitter",
          token = twitter_token)

The Twitter list is now available at the following link: https://twitter.com/EdjQuoteFinder/lists/current-meps
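
With the list in place, new tweets can be retrieved with a single call per polling cycle, for example with rtweet's lists_statuses() (a sketch; n = 1000 is an arbitrary choice that depends on how often the list is polled):

mep_tweets <- rtweet::lists_statuses(slug = "current-meps",
                                     owner_user = "EdjQuoteFinder",
                                     n = 1000,
                                     token = twitter_token)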

Get back some data from Twitter

The problem is that, as anyone who has been analysing Twitter for a while will know, Twitter users can change their handle. And many MEPs do, for different reasons: some add "MEP" at the end of their handle, some add "EU", some include another reference to their political affiliation, or remove something that was fine during the campaign but does not sound serious enough once elected.

To facilitate consistent pairing of the MEP with a Twitter profile, it is useful to retrieve the unchangeable numeric user id that Twitter assigns to every user.

qf_twitter_list <- rtweet::lists_members(slug = "current-meps",
                                         owner_user = "EDJQuoteFinder",
                                         token = twitter_token)

write_rds(x = qf_twitter_list, path = "qf_twitter_list.rds")
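
Since the numeric user_id does not change even when a handle does, it can later be used to recover the current screen name of each account, for instance with rtweet's lookup_users() (a sketch assuming the object stored above):

current_profiles <- rtweet::lookup_users(users = qf_twitter_list$user_id,
                                         token = twitter_token)

# accounts whose handle has changed since the list was stored
anti_join(x = current_profiles %>% select(user_id, screen_name),
          y = qf_twitter_list %>% select(user_id, screen_name),
          by = c("user_id", "screen_name"))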

Bringing all these data together

qf_twitter_list <- readr::read_rds(file = "qf_twitter_list.rds")

all_MEPs_eliflab <- read_csv(file = "https://raw.githubusercontent.com/giocomai/European-Parliament-Open-Data/master/meps_full_list_with_twitter_accounts.csv") %>% 
  select(NAME, TWITTER_URL, SCREEN_NAME) %>% 
  mutate(screen_name = str_remove(SCREEN_NAME, "@")) %>% 
  rename(fullName = NAME) %>% 
  select(-SCREEN_NAME,-TWITTER_URL)

all_MEPs_full <- all_MEPs_group_twitter %>% 
  select(-screen_name) %>% 
  left_join(all_MEPs_eliflab, by = "fullName") %>% 
  left_join(qf_twitter_list %>% select(user_id, screen_name), by = "screen_name")

It is now possible to store these data for reference.

readr::write_rds(x = all_MEPs_full,
                 path = "all_MEPs_full.rds")
readr::write_csv(x = all_MEPs_full,
                 path = "all_MEPs_full.csv")

You can download them in csv format.


