# ---------------------------------------------------------------------------------------
# legislatoR
# Sascha Göbel and Simon Munzert
# Script: functions
# August 2019
# ---------------------------------------------------------------------------------------
##### LANGUAGE CONVERSION OF MONTHS =====================================================
convMonth <- function(string, patterns, replacements) {
for (i in seq_along(patterns))
string <- gsub(patterns[i], replacements[i], string, perl=TRUE)
string
}
months_foreign <- c("Januar", "J\u00E4nner", "Februar", "M\u00E4rz", "Mai", "Juni", "Juli",
"Oktober", "Dezember")
months_english <- c("January", "January", "February", "March", "May", "June", "July",
"October", "December")
##### ENCODING DEBUGGING ================================================================
debugEnc <- function(string) {
debugtab <- read_html("http://www.i18nqa.com/debug/utf8-debug.html") %>%
html_table(fill = TRUE)
replacements <- c(debugtab[[1]][-1,9]) %>%
c("\u010d", "\U0153", "-", "i", "g", "\u0161", "c", "c", "c", "\u0159",
"\u011b", "\u017d", "\u017e", "\u0160", "\u0148", "\u010c", "\u0158", "\u016f", "\u0165", "\u010f", "\u0164",
"\u2013", "\u2014", ., "'", "\"")
patterns <- str_replace_all(debugtab[[1]][-1,11], " |%C3", "") %>%
c("%C4%8D", "%C5%93", "_%E2%80%A0_", "%C4%B1", "%C4%9F", "%C5%A1", "%C4%87", "%C4%8D", "%C4%87", "%C5%99",
"%C4%9B", "%C5%BD", "%C5%BE", "%C5%A0", "%C5%88", "%C4%8C", "%C5%98", "%C5%AF", "%C5%A5", "%C4%8F", "%C5%A4",
"%E2%80%93", "%E2%80%94", ., "%27", "%22")
string <- str_replace_all(string, "%C3", "")
for (i in seq_along(patterns)) {
#string <- gsub(patterns[i], replacements[i], string, perl=TRUE)
string <- str_replace_all(string, patterns[i], replacements[i])
}
string
}
##### DOWNLOAD OF HTMLS =================================================================
getHtml <- function(source, folder, country) {
for(i in 1:length(source)) {
if (!dir.exists(folder)) dir.create(path = folder)
download.file(source[i], destfile = str_c(folder, country, "-", i, ".html"))
Sys.sleep(1)
}
}
#### RETRIEVAL OF POLITICIANS' WIKIDATA AND PAGE IDS ====================================
wikiIDs <- function(url, corp = NULL) {
if (!missing(corp)) {
url <- url$url
}
url <- debugEnc(string = url)
title <- str_extract(url, "(?<=/)[[:upper:]].+")
langcode <- str_extract(url, "(?<=//)[[:alpha:]]{2}") # str_extract(url[1], "(?<=//)[[:alpha:]]{2}")
ids <- data.table(pageid = rep(as.integer(NA), length(title)),
wikidataid = rep(as.character(NA), length(title)))
k <- 1
cat("|")
#cat(k) # for debug purposes
for (i in 1:length(url)) {
if (is.na(url[i])) {
next
}
# for (i in 201:300) {
id <- NULL
while (is.null(id)) {
try(
id <- as.character(RETRY("POST", str_c("https://", langcode[i],".wikipedia.org/w/api.php"), # langcode
query = list(action = "query",
titles = title[i],
format = "json",
prop = "info|pageprops",
ppprop = "wikibase_item",
redirects = 1),
times = 1000)),
silent = TRUE)
}
ids[i,1] <- fromJSON(id)$query$pages[[1]]$pageid
# ids[i,2] <- fromJSON(id)$query$pages[[1]]$pageprops$wikibase_item # for debug purposes
check <- fromJSON(id)$query$pages[[1]]$pageprops$wikibase_item
if (!is.null(check)) {
ids[i,2] <- check
}
k <- k + 1 # for debug purposes
cat(k, ", ")
cat(".")
}
return(ids)
}
#### RETRIEVAL OF POLITICIANS' WIKIPEDIA URLS ===========================================
wikiURLS <- function(ids, langcode) { # wikidataid
cat("|")
urls <- rep(NA, length(ids))
for (i in 1:length(ids)) {
cat(".")
if (is.na(ids[i])) {
next
}
url <- as.character(RETRY("POST", "https://www.wikidata.org/w/api.php",
query = list(action = "wbgetentities",
format = "json",
props = "sitelinks/urls",
ids = ids[i],
sitefilter = langcode),
times = 1000))
Sys.sleep(1)
check <- fromJSON(url)$entities[[1]]$sitelinks
if (length(check) > 0) {
urls[i] <- fromJSON(url)$entities[[1]]$sitelinks[[1]][[4]]
}
}
return(urls)
}
#### RETRIEVAL OF WIKIPEDIA PAGE REVISIONS ==============================================
pageRev <- function(data, from, to, interval, fill = NULL, download = NULL, folder = NULL) {
if (!(from %in% 2001:2017)|!(to %in% 2001:2017))
stop ("Please select a year between 2001 and 2017", call. = FALSE)
if (from > to)
stop ("The year specified in 'from' must occur before or equal the year specified in 'to'", call. = FALSE)
if (!(interval %in% c("monthly", "weekly")))
stop ("The specified 'interval' must be either ''monthly'' or ''weekly''", call. = FALSE)
run <- 1
for (z in 1:length(data)) {
cat("\nDownloading data for legislature", z, "of", length(data), "\n")
if (interval == "monthly") {
begin <- seq(as.Date(str_c(from, "-01-01")), length = (to - from + 1) * 12, by = "1 month")
end <- seq(as.Date(str_c(from, "-02-01")), length = (to - from + 1) * 12, by = "1 month") -1
ids <- data.table(oldid = as.integer(NA), timestamp = as.character(NA), month = rep(month(begin), times = dim(data[[z]])[1]),
year = rep(year(begin), times = dim(data[[z]])[1]), pageid = rep(data[[z]]$pageid, each = length(begin)))
#ids <- rep(list(data.table(oldid = as.integer(NA), timestamp = as.character(NA), month = month(begin), year = year(begin))), times = length(url))
#names(ids) <- url
}
if (interval == "weekly") {
begin <- floor_date(seq(as.Date(str_c(from, "-01-01")), length = (to - from + 1) * 53, by="1 week"), unit = "week") + 1
end <- as.Date(ceiling_date(seq(as.Date(str_c(from, "-01-01")), length = (to - from + 1) * 53, by="1 week"), unit = "week"))
ids <- data.table(oldid = as.integer(NA), timestamp = as.character(NA), week = rep(week(begin), times = dim(data[[z]])[1]),
year = rep(year(begin), times = dim(data[[z]])[1]), pageid = rep(data[[z]]$pageid, each = length(begin)))
#ids <- rep(list(data.table(oldid = as.integer(NA), timestamp = as.character(NA), week = week(begin), year = year(begin))), times = length(url))
#names(ids) <- url
}
pageid <- data[[z]]$pageid
#title <- str_extract(url, "(?<=/)([[:upper:]]|%).+")
langcode <- str_extract(data[[z]]$url[1], "(?<=//)[[:alpha:]]{2}")
#for (i in 1:length(url)) {
for (i in 1:dim(data[[z]])[1]) {
for (k in 1:length(begin)) {
rev <- NULL
while (is.null(rev)) {
try(
rev <- as.character(RETRY("POST", str_c("https://", langcode,".wikipedia.org/w/api.php"),
query = list(action = "query",
pageids = pageid[i],
format = "json",
prop = "revisions",
rvprop = "ids|timestamp",
rvlimit = "1",
rvstart = str_c(begin[k], "T00:00:00Z"),
rvend = str_c(end[k], "T24:59:59Z"),
rvdir = "newer"),
times = 1000)),
silent = TRUE)
}
oldid <- fromJSON(rev)$query$pages[[1]]$revisions[,c(1,3)]
ids[ids$pageid == pageid[i],][k, 1:2] <- if(is.null(oldid)) NA else oldid
# ids[[1]][k,1:2] <- if(is.null(oldid)) NA else oldid
if (i > 1) {
cat(".")
}
}
if (i == 1) {
cat("---- MP ", i, " ")
} else {
cat(" MP ", i, " ")
}
}
#ids <- if(isTRUE(fill)) lapply(ids, na.locf) else ids
ids <- if(isTRUE(fill)) na.locf(ids) else ids
if (isTRUE(download)) {
if(!dir.exists(folder)) dir.create(path = folder)
#for (i in 1:length(url)) {
for (i in 1:dim(data[[z]])[1]) {
#if(!dir.exists(str_c(folder, "/", title[i]))) dir.create(path = str_c(folder, "/", title[i]))
if(!dir.exists(str_c(folder, "/", pageid[i]))) dir.create(path = str_c(folder, "/", pageid[i]))
#link <- na.omit(str_c("https://en.wikipedia.org/w/index.php?title=", title, "&oldid=",
# ids[[i]][[1]]))
link <- na.omit(str_c("https://", langcode, ".wikipedia.org/w/index.php?curid=", pageid[i], "&oldid=",
ids[ids$pageid == pageid[i],oldid]))
#dest <- na.omit(str_c(folder, "/", title[i], "/", title[i], "_", ids[[i]][[1]], ".html"))
dest <- na.omit(str_c(folder, "/", pageid[i], "/", pageid[i], "_", ids[ids$pageid == pageid[i],oldid], ".html"))
invisible(mapply(function(x, y) {
download.file(x, y)
# Sys.sleep(sample(seq(0, 1, by=0.001), 1))
}, link, dest))
}
}
if (run == 1) {
ids_list <- list(ids)
} else {
ids_list <- c(ids_list, list(ids))
cat("\nLegislature", z, "finished\n")
}
run <- run + 1
}
return(ids_list)
}
#### DATA EXTRACTION FROM WIKIDATA ======================================================
wikiData <- function (item, entity = NULL, property, date = FALSE, location = FALSE, serial = FALSE,
qualifier = NULL, qualifier_names = NULL, qualifier_class = NULL, unique = NULL) {
if (is.null(entity)) {
entity <- get_item(id = item)
}
claims <- extract_claims(items = entity, claims = property)
item <- item[!unlist(lapply(claims, is.na))]
claims <- claims[!unlist(lapply(claims, is.na))]
if (!isTRUE(date) & !isTRUE(location) & !isTRUE(serial)) {
property_ids <- rep(list(NA), length(item))
for (i in 1:length(item)) {
property_ids[[i]] <- claims[[i]][[1]]$mainsnak$datavalue$value$id
}
if (!isTRUE(unique)) {
idscount <- lapply(property_ids, table) %>%
mapply(`[`, ., lapply(., `>`, 1)) %>%
unlist %>%
subtract(1)
property_ids_unique <- sort(c(unique(unlist(property_ids)), rep(names(idscount), idscount)))
} else {
property_ids_unique <- na.omit(unique(unlist(property_ids))) # remove na.omit
}
property_entities_unique <- get_item(id = property_ids_unique)
property_values_unique <- lapply(property_entities_unique, `$.data.frame`, "labels") %>%
lapply(`$.data.frame`, "en") %>%
lapply(`$.data.frame`, "value") %>%
lapply(., function(x) ifelse(is.null(x), "unknown", x)) %>%
unlist
property_values <- rep(list(NA), length(property_ids_unique))
for (i in 1:length(property_ids_unique)) {
idx <- lapply(property_ids ,is.element, property_ids_unique[i])
property_values[[i]] <- unlist(lapply(idx, any))
property_values[[i]] <- ifelse(property_values[[i]], property_values_unique[i], NA)
}
property_values <- data.table(do.call(cbind, property_values))
if (!missing(qualifier)) {
qualifier_values <- rep(list(rep(list(NA), length(item))), length(qualifier))
for (k in 1:length(qualifier)) {
for (i in 1:length(item)) {
if (is.null(claims[[i]][[1]]$qualifiers) | !(qualifier[k] %in% names(claims[[i]][[1]]$qualifiers))) {
qualifier_values[[k]][[i]] <- NA
} else {
qualifier_values[[k]][[i]] <- lapply(claims[[i]][[1]]$qualifiers[qualifier[k]][[1]],
`$.data.frame`, "datavalue") %>%
lapply(`$.data.frame`, 1) %>%
lapply(`$.data.frame`, 1) %>%
lapply(., function(x) ifelse(is.null(x), NA, x)) %>%
unlist
}
}
if (!missing(qualifier_class)) {
if (qualifier_class == "Date") {
qualifier_values[[k]] <- lapply(X = qualifier_values[[k]], FUN = str_replace_all,
pattern = "\\+", replacement = "") %>%
lapply(X = ., FUN = str_replace_all,
pattern = "T", replacement = " ") %>%
lapply(X = ., FUN = str_replace_all,
pattern = "-00", replacement = "-01")
}
}
}
qualifier_columns <- rep(list(rep(list(NA), length(property_ids_unique))), length(qualifier))
for (k in 1:length(qualifier)) {
qualifier_values[[k]] <- lapply(qualifier_values[[k]], `length<-`, max(lengths(qualifier_values[[k]])))
property_ids2 <- property_ids
for (i in 1:length(property_ids_unique)) {
qualifier_pos <- lapply(property_ids2, function(x)
which(x %in% property_ids_unique[i])) %>%
lapply(., `[`, 1)
qualifier_columns[[k]][[i]] <- mapply(`[`, qualifier_values[[k]], qualifier_pos) %>%
lapply(., function(x) ifelse(!length(x), NA, x)) %>%
unlist
property_ids2 <- mapply(replace, property_ids2, qualifier_pos, NA)
}
qualifier_columns[[k]] <- data.table(do.call(cbind, qualifier_columns[[k]]))
}
qualifier_columns <- do.call(cbind, qualifier_columns)
if (!missing(qualifier_class)) {
if (qualifier_class == "Date") {
qualifier_columns[, 1:dim(qualifier_columns)[2]] <- lapply(qualifier_columns[, 1:dim(qualifier_columns)[2]], as.POSIXct)
}
}
data <- cbind(ifelse(is.na(property_values), FALSE, TRUE), qualifier_columns)
col_order <- order(names(data))
col_names <- c(str_replace_all(tolower(property_values_unique), " ", "_"),
unlist(lapply(qualifier_names, str_c, "_", seq_along(property_ids_unique))))
colnames(data) <- col_names
setcolorder(data, col_order)
data <- cbind(wikidataid = item, data)
} else {
data <- as.data.table(ifelse(is.na(property_values), FALSE, TRUE))
col_order <- order(names(data))
col_names <- str_replace_all(tolower(property_values_unique), " ", "_")
colnames(data) <- col_names
setcolorder(data, col_order)
data <- cbind(wikidataid = item, data)
}
} else {
if (isTRUE(date)) {
property_ids <- rep(NA, length(item))
for (i in 1:length(item)) {
property_ids[[i]] <- ifelse(is.null(claims[[i]][[1]]$mainsnak$datavalue$value$time[1]),
NA, claims[[i]][[1]]$mainsnak$datavalue$value$time[1])
}
data <- as.data.table(cbind(wikidataid = item, date = property_ids))
}
if (isTRUE(serial)) {
property_ids <- rep(NA, length(item))
for (i in 1:length(item)) {
property_ids[[i]] <- ifelse(is.null(claims[[i]][[1]]$mainsnak$datavalue$value[1]),
NA, claims[[i]][[1]]$mainsnak$datavalue$value[1])
}
data <- as.data.table(cbind(wikidataid = item, value = property_ids))
}
if (isTRUE(location)) {
property_ids <- rep(list(NA), length(item))
for (i in 1:length(item)) {
property_ids[[i]] <- claims[[i]][[1]]$mainsnak$datavalue$value$id
}
property_ids <- unlist(lapply(property_ids, `[[`, 1))
item <- item[!unlist(lapply(property_ids, is.na))]
property_ids <- property_ids[!unlist(lapply(property_ids, is.na))]
property_entities <- get_item(id = property_ids)
claims2 <- extract_claims(items = property_entities, claims = "P625")
item <- item[!unlist(lapply(claims2, is.na))]
claims2 <- claims2[!unlist(lapply(claims2, is.na))]
coordinates <- cbind(lat = rep(NA, length(item)), lon = rep(NA, length(item)))
for (i in 1:length(item)) {
test_lon <- claims2[[i]][[1]]$mainsnak$datavalue$value$longitude[1]
test_lat <- claims2[[i]][[1]]$mainsnak$datavalue$value$latitude[1]
if (length(test_lon == 1) & length(test_lat == 1)) {
coordinates[i,"lon"] <- claims2[[i]][[1]]$mainsnak$datavalue$value$longitude[1]
coordinates[i,"lat"] <- claims2[[i]][[1]]$mainsnak$datavalue$value$latitude[1]
}
}
data <- cbind(wikidataid = item, as.data.table(coordinates))
}
}
return(data)
}
#### RETRIEVAL OF WIKIPEDIA REVISION HISTORIES ==========================================
wikiHist <- function(pageid, project, uniqueid = NULL) {
nms <- c("revid", "parentid", "user", "userid", "userhidden", "anon",
"timestamp", "size", "minor", "comment", "commenthidden", "tags")
cat("|")
revision <- NULL
while(is.null(revision)) {
try(
revision <- as.character(httr::RETRY("POST", str_c("https://", project,".org/w/api.php"),
query = list(action = "query",
pageids = pageid,
prop = "revisions",
rvprop = "ids|flags|timestamp|user|userid|size|comment|tags",
rvlimit = "500",
format = "json",
redirects = ""),
times = 1000)),
silent = TRUE
)
}
revisions_full <- fromJSON(revision)$query$pages[[1]]$revisions
missing <- setdiff(nms, names(revisions_full))
revisions_full[missing] <- NA
revisions_full <- revisions_full[nms]
continue <- str_detect(revision, "rvcontinue")
rvcontinue <- str_extract(revision, "[[:digit:]]+\\|[[:digit:]]+")
cat(".")
Sys.sleep(1)
while(isTRUE(continue)) {
revision <- NULL
while(is.null(revision)) {
try(
revision <-as.character(RETRY("POST", str_c("https://", project,".org/w/api.php"),
query = list(action = "query",
pageids = pageid,
prop = "revisions",
rvprop = "ids|flags|timestamp|user|userid|size|comment|tags",
rvlimit = "500",
format = "json",
redirects = "",
rvcontinue = rvcontinue),
times = 1000)),
silent = TRUE
)
}
continue <- str_detect(revision, "rvcontinue")
rvcontinue <- str_extract(revision, "[[:digit:]]+\\|[[:digit:]]+")
revision <- fromJSON(revision)$query$pages[[1]]$revisions
missing <- setdiff(nms, names(revision))
revision[missing] <- NA
revision <- revision[nms]
revisions_full <- rbind(revisions_full, revision)
Sys.sleep(1)
cat(".")
}
revisions_full <- revisions_full %>% mutate(pageid = pageid)
if (!is.null(uniqueid)) {
revisions_full <- revisions_full %>% mutate(pageid_unique = uniqueid)
}
return(revisions_full)
}
#### RETRIEVAL OF WIKIPEDIA PAGE TRAFFIC ================================================
wikiTraffic <- function(data, project) {
title <- data$title
pageid <- data$pageid
traffic <- NULL
for (i in 1:length(title)) {
cat(".")
er <- FALSE
while(is.null(traffic)) {
traffic <- try(article_pageviews(project = project,
article = title[i],
user_type = "user",
start = "2015070100",
end = "2018123100") %>%
dplyr::select(article, agent, date, views),
silent = TRUE
)
if (class(traffic) == "try-error") {
traffic <- data.frame(article = NA, agent = NA, date = NA, views = NA, stringsAsFactors = FALSE)
er <- TRUE
}
}
if (!isTRUE(er)) {
traffic$article <- pageid[i]
names(traffic)[c(1,4)] <- c("pageid", "traffic")
traffic <- suppressMessages(pad(traffic, interval = "day"))
traffic[,1:3] <- na.locf(traffic[,1:3])
traffic[is.na(traffic[4]), 4] <- 0
if (i == 1) {
traffic_list <- list(traffic)
} else {
traffic_list <- c(traffic_list, list(traffic))
}
}
traffic <- NULL
}
traffic <- bind_rows(traffic_list)
return(traffic)
}
wikiTrafficNew <- function(data, project, uniqueid = NULL) {
title <- data$title
if (is.null(uniqueid)) {
pageid <- data$pageid
} else {
pageid_unique <- data$pageid_unique
}
traffic <- NULL
for (i in 1:length(title)) {
cat(".")
while(is.null(traffic)) {
traffic <- try(wp_trend(page = title[i],
lang = project[i],
from = "2007-12-01",
to = Sys.Date()) %>%
dplyr::select(date, views) %>%
filter(views > 0) %>%
mutate(date = lubridate::as_date(date)),
silent = TRUE
)
if (class(traffic) == "try-error") {
traffic <- try(wp_trend(page = title[i],
lang = project[i],
from = "2016-01-01",
to = Sys.Date()) %>%
dplyr::select(date, views) %>%
filter(views > 0) %>%
mutate(date = lubridate::as_date(date)),
silent = TRUE
)
if (class(traffic) == "try-error") {
next
}
}
}
if (is.null(uniqueid)) {
traffic$pageid <- pageid[i]
traffic <- traffic %>%
dplyr::select(pageid, date, traffic = views)
} else {
traffic$pageid_unique <- pageid_unique[i]
traffic <- traffic %>%
dplyr::select(pageid = pageid_unique, date, traffic = views)
}
traffic <- suppressMessages(padr::pad(traffic, interval = "day"))
traffic[,1:2] <- zoo::na.locf(traffic[,1:2])
traffic[is.na(traffic[3]), 3] <- 0
if (i == 1) {
traffic_list <- list(traffic)
} else {
traffic_list <- c(traffic_list, list(traffic))
}
traffic <- NULL
}
traffic <- bind_rows(traffic_list)
return(traffic)
}
#### RETRIEVAL OF WIKIPEDIA IMAGE URLS ==================================================
imageUrl <- function(pageid, project, uniqueid = NULL) {
image <- NULL
urls <- rep(NA, times = length(pageid))
for (i in 1:length(urls)) {
while(is.null(image)) {
try(
image <- as.character(httr::RETRY("POST", str_c("https://", project,".org/w/api.php"), # project[i]
query = list(action = "query",
pageids = pageid[i],
prop = "pageimages",
format = "json",
redirects = ""),
times = 1000)),
silent = TRUE
)
}
image <- fromJSON(image)$query$pages[[1]]$thumbnail$source
urls[i] <- if(is.null(image)) NA else image
if (!is.na(urls[i])) {
urls[i] <- str_replace(urls[i], pattern = "jpg\\/[[:digit:]].+?px",
replacement = "jpg\\/600px") %>%
str_replace(., pattern = "JPG\\/[[:digit:]].+?px",
replacement = "JPG\\/600px") %>%
str_replace(., pattern = "png\\/[[:digit:]].+?px",
replacement = "png\\/600px") %>%
str_replace(., pattern = "jpeg\\/[[:digit:]].+?px",
replacement = "jpeg\\/600px")
}
image <- NULL
cat(".")
}
urls <- na.omit(data.table(pageid = pageid, image_url = urls)) # pageid = uniqueid
return(urls)
}
#### RETRIEVAL OF UNDIRECTED WIKIPEDIA PAGE TITLES ======================================
undirectedTitle <- function(pageid, project, uniqueid = NULL) {
revision <- NULL
urls <- rep(NA, times = length(pageid))
for (i in 1:length(urls)) {
while(is.null(revision)) {
try(
revision <- as.character(httr::RETRY("POST", str_c("https://", project,".org/w/api.php"), # project[i] - adjust if error
query = list(action = "query",
pageids = pageid[i],
prop = "info",
inprop= "url",
format = "json",
redirects = ""),
times = 1000)),
silent = TRUE
)
}
urls[i] <- str_extract(fromJSON(revision)$query$pages[[1]]$fullurl, "(?<=\\/wiki\\/).+")
revision <- NULL
cat(".")
}
urls <- debugEnc(urls)
if (is.null(uniqueid)) {
urls <- data.table(title = urls, pageid = pageid)
} else {
urls <- data.table(title = urls, pageid = pageid, pageid_unique = uniqueid)
}
return(urls)
}
#### FACE++ API BINDING =================================================================
authFacepp <- function(api_key, api_secret){
auth <- structure(list(api_key = api_key, api_secret = api_secret), class="FaceppProxy")
}
faceEst <- function(data, auth) {
url <- data$image_url
pageid <- data$pageid
face <- NULL
faces <- data.table(ethnicity = as.character(NA),
#smile_threshold = as.numeric(NA),
#smile_intensity = as.numeric(NA), emo_sadness = as.numeric(NA),
#emo_neutral = as.numeric(NA), emo_disgust = as.numeric(NA),
#emo_anger = as.numeric(NA), emo_surprise = as.numeric(NA),
#emo_fear = as.numeric(NA), emo_hapiness = as.numeric(NA),
#beauty_female = as.numeric(NA), beauty_male = as.numeric(NA),
#skin_dark_circles = as.numeric(NA), skin_stain = as.numeric(NA),
#skin_acne = as.numeric(NA), skin_health = as.numeric(NA),
#image_quality_threshold = as.numeric(NA), image_quality = as.numeric(NA),
pageid = data$pageid)
run <- 0
for (i in 1:length(url)) {
run <- run + 1
cat(run)
while(is.null(face)) {
try(
face <- as.character(httr::RETRY("POST", "https://api-us.faceplusplus.com/facepp/v3/detect",
query = list(api_key = auth$api_key,
api_secret = auth$api_secret,
image_url = url[i],
return_landmark = 0,
return_attributes = "ethnicity"),
times = 20, encode = "json")),
silent = TRUE
)
}
if (length(fromJSON(face)$faces) != 0) {
ethnicity <- fromJSON(face)$faces[[1]]$ethnicity
#smile <- fromJSON(face)$faces[[1]]$smile
#emotion <- fromJSON(face)$faces[[1]]$emotion
#beauty <- fromJSON(face)$faces[[1]]$beauty
#skin <- fromJSON(face)$faces[[1]]$skinstatus
#quality <- fromJSON(face)$faces[[1]]$facequality
#faces[faces$pageid == pageid[i],][,1:18] <- c(ethnicity,smile,emotion,beauty,skin,quality)
faces[faces$pageid == pageid[i],][,1] <- c(ethnicity)
face <- NULL
Sys.sleep(5)
} else {
face <- NULL
Sys.sleep(5)
}
}
return(faces)
}
#### UPDATE SOCIAL MEDIA INFORMATION ====================================================
updateSocial <- function(previous,current) {
previous <- readRDS(previous)
current <- readRDS(current)
#missing_1 <- which(!(previous$wikidataid %in% current$wikidataid))
#current <- rbind(current, previous[missing_1,])
cols <- colnames(previous)
for (i in 2:length(cols)) {
na_curr <- current$wikidataid[which(is.na(current[,cols[i]]))]
na_prev <- previous$wikidataid[which(previous$wikidataid %in% na_curr)]
na_prev_idx <- which(!is.na(previous[previous$wikidataid %in% na_prev, cols[i]]))
if(length(na_prev_idx) > 0) {
current[current$wikidataid %in% na_prev[na_prev_idx],cols[i]] <- previous[previous$wikidataid %in% na_prev[na_prev_idx],cols[i]]
}
print(i)
}
return(current)
}
#### CANADA WIKIPEDIA INFORMATION EXTRACTION ============================================
collectorCanada <- function(source) {
#source <- str_c("https://en.wikipedia.org/wiki/List_of_House_members_of_the_",
# sapply(1:42, toOrdinal), "_Parliament_of_Canada")
source <- mixedsort(list.files(source, full.names = TRUE), decreasing = TRUE)
langcode <- "en"
duration <- list(c("1867-09-20", "1872-10-11"), c("1872-10-12", "1874-01-21"),
c("1874-01-22", "1878-09-16"), c("1878-09-17", "1882-06-19"),
c("1882-06-20", "1887-02-21"), c("1887-02-21", "1891-03-04"),
c("1891-03-05", "1896-06-22"), c("1896-06-23", "1900-11-06"),
c("1900-11-07", "1904-11-02"), c("1904-11-03", "1908-10-25"),
c("1908-10-26", "1911-09-20"), c("1911-09-21", "1917-12-16"),
c("1917-12-17", "1921-12-05"), c("1921-12-06", "1925-10-28"),
c("1925-10-29", "1926-09-13"), c("1926-09-14", "1930-07-27"),
c("1930-07-28", "1935-10-13"), c("1935-10-14", "1940-03-25"),
c("1940-03-26", "1945-06-10"), c("1945-06-11", "1949-06-26"),
c("1949-06-27", "1953-08-09"), c("1953-08-10", "1957-06-09"),
c("1957-06-10", "1958-03-30"), c("1958-03-31", "1962-06-17"),
c("1962-06-18", "1963-04-07"), c("1963-04-08", "1965-11-07"),
c("1965-11-08", "1968-06-24"), c("1968-06-25", "1972-10-29"),
c("1972-10-30", "1974-07-07"), c("1974-07-08", "1979-05-21"),
c("1979-05-22", "1980-02-17"), c("1980-02-18", "1984-09-03"),
c("1984-09-04", "1988-11-20"), c("1988-11-21", "1993-10-24"),
c("1993-10-25", "1997-06-01"), c("1997-06-02", "2000-11-26"),
c("2000-11-27", "2004-06-27"), c("2004-06-28", "2006-01-22"),
c("2006-01-23", "2008-10-13"), c("2008-10-14", "2011-05-01"),
c("2011-05-02", "2015-10-18"), c("2015-10-19", "2019-10-20"),
c("2019-10-21", "2023-10-16"))
condition1 <- "contains(@title, 'Bloc')"
condition2 <- "contains(., 'members') or contains(., 'membership') or contains(., 'Membership') or contains(., 'Standings')"
condition3 <- "contains(@title, 'Lieutenant') or contains(@title, 'Minister') or
contains(@title, 'Superior') or contains(@title, 'Supreme') or
contains(@title, 'Honourable') or contains(@title, 's Land') or
contains(@title, 'Welland Canal') or contains(@title, 'Template')"
condition4 <- "contains(@title, 'Andrew McNaughton') or contains(@title, 'Arthur Meighen')"
condition5 <- "contains(@href, 'cite') or contains(@href, 'endnote')"
condition6 <- "contains(@title, 'Elmer MacKay') or contains(@title, 'Yvon L') or
contains(@title, 'opold Corriveau') or contains(@title, 'Maurice Dupras') or
contains(@title, 'Claude G. Lajoie') or contains(@title, 'Derek Blackburn') or
contains(@title, 'Jack Murta') or contains(@title, 'Doug Rowland') or
contains(@title, 'Bill Knight') or contains(@title, 'Thomas Speakman Barnett') or
contains(@title, 'Tommy Douglas')"
condition7 <- "contains(@title, 'By-election')"
query <- c(rep(str_c("//h3/span/ancestor::h3/following-sibling::table[1]/tbody/tr/
td[position()>=2 and position() <= (last() - 1)]//a[not(", condition3,
" or ", condition5, ")]"), times = 18),
str_c("//h3/span/ancestor::h3/following-sibling::table[1]/tbody/tr/
td[position()>=2 and position() <= (last() - 1)]//a[not(", condition3,
" or ", condition4, " or ", condition5, ")]"),
rep(str_c("//h3/span/ancestor::h3/following-sibling::table[1]/tbody/tr/
td[position()>=2 and position() <= (last() - 1)]//a[not(", condition3,
" or ", condition5, ")]"), times = 8),
str_c("//h3/span/ancestor::h3/following-sibling::table[1]/tbody/tr/
td[position()>=2 and position() <= (last() - 1)]//a[not(", condition3,
" or ", condition6, " or ", condition5, ")]" ),
str_c("//h3/span/ancestor::h3/following-sibling::table[1]/tbody/tr/
td[position()>=2 and position() <= (last() - 1)]//a[not(", condition3,
" or ", condition5, ")]"),
str_c("//h3/span/ancestor::h3/following-sibling::table[1]/tbody/tr/
td[(last() - 1)]//a[1][not(", condition3, " or ", condition7, " or ",
condition5, ")]"),
rep(str_c("//h3[not(", condition2, ")]", "/span/ancestor::h3/following-sibling::table[1]/tbody/tr/
td[3]/ancestor::tr/td[3]//a[1][not(", condition3, " or ", condition7, " or ",
condition5, ")]"), times = 2),
str_c("//h3/span/ancestor::h3/following-sibling::table[1]/tbody/tr/
td[2]//a[1][not(", condition3, " or ", condition7, " or ", condition5, ")]"),
rep(str_c("//h3/span/ancestor::h3/following-sibling::table[1]/tbody/tr/
td[(last() - 1)]//a[1][not(", condition3, " or ", condition7, " or ",
condition5, ")]"), times = 3),
rep(str_c("//h3[not(", condition2, ")]", "/span/ancestor::h3/following-sibling::table[1]/tbody/tr/
td[3]/ancestor::tr/td[2]//a[1][not(", condition3, " or ", condition7, " or ",
condition5, ")]"), times = 7))
query_party <- c(rep("//h3/span/ancestor::h3/following-sibling::table[1]/tbody/tr/
td[last()]", times = 30),
rep(str_c("//h3[not(", condition2, ")]", "/span/ancestor::h3/following-sibling::table[1]/tbody/tr/
td[3]/ancestor::tr/td[last()]"), times = 2),
"//h3/span/ancestor::h3/following-sibling::table[1]/tbody/tr/
td[3]",
str_c("//h3/span/ancestor::h3/following-sibling::table[1]/tbody/tr/
td[last()]//a[not(", condition1, ")]"),
str_c("//h3[not(", condition2, ")]", "/span/ancestor::h3/following-sibling::table[1]/tbody/tr/
td[3]/ancestor::tr/td[last()]"),
"//h3/span/ancestor::h3/following-sibling::table[1]/tbody/tr/
td[last()]",
rep(str_c("//h3[not(", condition2, ")]", "/span/ancestor::h3/following-sibling::table[1]/tbody/tr/
td[3]/ancestor::tr/td[3]"), times = 7)
)
regex_term <- "[[:digit:]]+"
query_constituency <- c(rep("//h3/span/ancestor::h3/following-sibling::table[1]/tbody/tr/td[4]/ancestor::tr/td[1]/a", times = 2),
"//h3/span/ancestor::h3/following-sibling::table[1]/tbody/tr/td[4]/ancestor::tr/td[1]",
rep("//h3/span/ancestor::h3/following-sibling::table[1]/tbody/tr/td[4]/ancestor::tr/td[1]/a", times = 24),
rep("//h3/span/ancestor::h3/following-sibling::table[1]/tbody/tr/td[3]/ancestor::tr/td[1]/a", times = 2),
rep("//h3/span/ancestor::h3/following-sibling::table[1]/tbody/tr/td[3]/ancestor::tr/td[position()>=1 and position() <= (last() - 2)]/a",
times = 3),
str_c("//h3[not(", condition2, ")]/span/ancestor::h3/following-sibling::table[1]/tbody/tr/td[4]/ancestor::tr/td[last()]/a"),
rep("//h3/span/ancestor::h3/following-sibling::table[1]/tbody/tr/td[4]/ancestor::tr/td[2]/a", times = 2),
"//h3/span/ancestor::h3/following-sibling::table[1]/tbody/tr/td[3]/ancestor::tr/td[1]/a",
rep(str_c("//h3[not(", condition2, ")]/span/ancestor::h3/following-sibling::table[1]/tbody/tr/td[4]/ancestor::tr/td[last()]/a"), times = 7))
query_multiplier <- c(rep("//h3/span/ancestor::h3/following-sibling::table[1]/tbody/tr/td[4]/ancestor::tr/td[1]/a/parent::td", times = 2),
"//h3/span/ancestor::h3/following-sibling::table[1]/tbody/tr/td[4]/ancestor::tr/td[1]",
rep("//h3/span/ancestor::h3/following-sibling::table[1]/tbody/tr/td[4]/ancestor::tr/td[1]/a/parent::td", times = 24),
rep("//h3/span/ancestor::h3/following-sibling::table[1]/tbody/tr/td[3]/ancestor::tr/td[1]/a/parent::td", times = 2),
rep("//h3/span/ancestor::h3/following-sibling::table[1]/tbody/tr/td[3]/ancestor::tr/td[position()>=1 and position() <= (last() - 2)]/a/parent::td",
times = 3),
str_c("//h3[not(", condition2, ")]/span/ancestor::h3/following-sibling::table[1]/tbody/tr/td[4]/ancestor::tr/td[last()]/a/parent::td"),
rep("//h3/span/ancestor::h3/following-sibling::table[1]/tbody/tr/td[4]/ancestor::tr/td[2]/a/parent::td", times = 2),
"//h3/span/ancestor::h3/following-sibling::table[1]/tbody/tr/td[3]/ancestor::tr/td[1]/a/parent::td",
rep(str_c("//h3[not(", condition2, ")]/span/ancestor::h3/following-sibling::table[1]/tbody/tr/td[4]/ancestor::tr/td[last()]/a/parent::td"),
times = 7))
replace_idx <- c(list(56), rep(list(0), times = 10), list(54), rep(list(0), times = 18), list(71), list(280), list(0),
list(c(12, 37, 57, 62, 71, 76, 87, 91, 94, 99, 100, 107, 247)), list(c(113, 153, 204, 240)), list(0),
list(c(1:6,7,8:12,14,15:26,28:51,52,55,60:62,64,65,66,68,78,82,83,87,90,91, 93, 97, 98, 100, 104,124,
142,161,169,173,224,228,262,264,269,275,278,285:287,290,291,293:296,298)),
list(c(14, 56, 67, 149, 155, 159, 290)), list(c(10, 89, 97, 107, 124,134, 137, 158, 198, 301, 305)),
list(c(2, 51, 189, 238)), list(c(11,15,42,89,110,153,172,177,194,197,219,238,244,260,271,272,283,284,290)),
list(c(6,7,8,13,27,33,51,71,102,114,121,127,170,174,175,198,202,210,233,250,253,271,283,289,291,302,
306,310,311,316,322,328,337)), list(0))
replacement <- c(list(1), rep(list(0), times = 10), list(1), rep(list(0), times = 18), list(1), list(1), list(0),
list(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)), list(c(1, 1, 1, 1)), list(0),
list(c(rep(1,6),2,rep(1,5),1,rep(1,12),rep(1,24),1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,2,
1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1)),
list(c(1,1,1,1,1,1,1)), list(c(1,1,1,1,1,1,1,1,1,1,1)), list(c(1,1,1,1)),
list(c(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)), list(1), list(0))
query_constituency2 <- c(rep("//h3[not(contains(., 'members') or contains(., 'membership') or contains(., 'Membership') or contains(., 'Standings'))]/span[@id]", times = 9),
rep("//h3[not(contains(., 'members') or contains(., 'membership') or contains(., 'Membership') or contains(., 'Standings'))]/span[@id]/a/ancestor::span", times = 18),
rep("//h3[not(contains(., 'members') or contains(., 'membership') or contains(., 'Membership') or contains(., 'Standings'))]/span[@id]", times = 16))
query_multiplier2_1 <- unlist(lapply(str_split(query, "/tbody"), `[`, 1))
query_multiplier2_2 <- unlist(lapply(str_split(query, "table\\[1\\]"), `[`, 2)) %>%
str_replace("^", "\\.")
query_service <- str_c(query, "/ancestor::td")
replace_idx2 <- c(list(0),list(0),list(c(107,11)),list(c(204,162,117)),
list(c(25,111,112,166, 171, 172, 175,177,184,204, 205, 239, 244)),
list(c(261, 162, 159, 160, 142, 142, 141, 124, 125, 118, 119, 107, 106, 86,
82, 71, 72, 69, 70, 67, 68, 50, 51, 61, 63, 48, 49, 45, 46, 1)),
list(c(294, 295, 285, 286, 284, 282, 283, 273, 254, 251, 252, 242, 243, 224, 218,
219, 192, 188, 189, 186, 180, 175, 176, 173, 174, 169, 165, 166, 167, 168,
160, 157, 155, 153, 147, 139, 137, 138, 135, 118, 119, 108, 109, 102, 103,
95, 96, 88, 84, 82, 83, 74, 75, 71, 72, 69, 70, 64, 65, 58, 59, 60, 61,
55, 56, 52, 53, 54, 48, 37, 32, 22, 18, 19, 15, 13, 14, 10, 6, 7)),
list(c(241,242,231,232,229,230,225,226,204,205,189,190,185,186,176,173,
174,163,164,142,142,130,131,51,52,42,7)),
list(c(251,252,231,232,210,211,198,199,191,192,177,178,175,172,165,
166,141,130,129,123,30,31,32,33)),
list(c(252,253,245,246,190,191,170,168,166,167,163,144,133,130,125,116,117,67,68,41,42)),
list(c(185,186,171,142,143,115,66)),
list(c(242,243,229,230,224,225,219,220,196,197,192,193,188,189,181,
182,156,157,150,151,152,153,145,140,87,88,81,82,25,26,23,24,
21,22,15,16)),
list(c(146,135,122,61,62,27,24,25)),
list(c(270,271,249,250,230,231,226,227,208,209,191,192,180,181,177
,170,148,149,150,151,96,97,93,94,76,77,78,79,73,74,56,57,41,
42)),
list(c(161,110,111)),
list(c(269,270,266,267,262,263,249,250,238,239,229,230,225,226,196,197,
192,193,179,163,164,143,125,126,112,113,86,87,54,55,34,35,21,22)),
list(c(261,262,251,252,239,240,227,228,210,211,190,191,173,170,171,
153,154,155,156,135,133,134,120,121,95,96,61,62,58,59,39,40,7,8)),
list(c(148,136,92,93)), list(c(161,151)), list(c(136,131,64)), list(c(169, 142)),
list(c(169,142,262,251,66,194,196,208,213,215,217,223,231,237,246,249)),
list(c(4,49,61,141,154,156,161,185)),list(c(267)), list(c(164,154)),
list(c(0)),list(c(159,62)), list(c(16,42,56,58,102,108,198,202,208,244,250)),
list(c(0)), list(c(243,206,27,5,23,112,117,131,156,160,168,194,74,95,109,13,66,75,
103,104,107,8,155,32,58)), list(c(223,2)),
list(c(273,17,212,115,151,194,186,59,71,155,8,141)),
list(c(63,6,138,287,251,100,64,7,139,288,252,101)),
list(c(23,167,68,46,240,24,168,69,47,241)),
list(c(4,6,137,65,85,103,176,47,98,5,7,138,66,86,104,177,48,99)),
list(c(17,57,76,100,206,210,234,279,281)),
list(c(206,283,68,87,7,90,294,246,225,173,288,255,165,285,128,63,
207,284,69,88,8,91,295,247,226,174,289,256,232)),
list(c(92,56,291,93)),
list(c(88,155,89,156,81,98,205,213,310,94,238,95,82,99,111,138,128,239,311,214,206)),
list(c(44,101,267,244,102,45,268,245,78,206,69,243,52,2,207,79,70)),
list(c(0)),list(c(0)), list(c(0)))
replacement2 <- c(list(0),list(0),list(c(0,0)),list(c(1036,552,1292)),
list(c(1707, 524, 428, 310, 1168, 538, 73, 863, 1500, 492, 537, 406, 56)),
list(c(1472, 162, 1472, 1472, 1167, 1167, 304, 1167, 1167, 1167, 1167, 197,
1255, 1126, 1231, 1472, 1472, 1472, 1472, 1174, 1174, 963, 963, 1472,
1472, 508, 508, 541, 541, 831)),
list(c(1475, 1475, 1600, 1600, 335, 1935, 1935, 312, 371, 1935, 1935, 1935, 1935,
530, 1935, 1935, 421, 1935, 1935, 343, 357, 1935, 1935, 1935, 1935, 371, 1935,
1935, 1935, 1935, 1008, 352, 1743, 376, 373, 358, 1935, 1935, 336, 1935, 1935,
1935, 1935, 1935, 1935, 1935, 1935, 1755, 343, 1935, 1935, 1935, 1935, 1935,
1935, 1935, 1935, 1935, 1935, 1935, 1935, 1935, 1935, 1935, 1935, 1935, 1935,
1935, 1797, 1633, 357, 628, 1935, 1935, 993, 1935, 1935,498, 1935, 1935)),
list(c(1596,1596,1596,1596,1596,1596,1596,1596,1596,1596,1596,1596,1596,
1596,904,1596,1596,1596,1596,1596,1596,1596,1596,1596,1596,
301,157)),
list(c(1456,1456,1456,1456,1456,1456,1178,1178,1456,1456,1456,1456,
427,756,1456,1456,894,1456,1456,853,1456,1456,1456,1456)),
list(c(1452,1452,1452,1452,1452,1452,384,1145,1452,1452,1452,1090,
705,222,1145,1090,889,1452,1452,1452,1452)),
list(c(1059,1059,738,1059,1059,460,119)),
list(c(1125,1125,2278,2278,2278,2278,1125,1125,893,893,1476,1476,
2278,2278,2278,2278,2278,2278,2278,2278,2278,2278,263,
154,2278,2278,2278,2278,2278,2278,2278,2278,1890,1890,2278,2278)),
list(c(842,1148,722,1449,1449,1071,1449,1449)),
list(c(1422,1422,1422,1422,1422,1422,1422,1422,1422,1422,1422,1422,
1422,1422,84,1422,1422,1422,1422,1422,1422,1422,1422,407,
1422,1422,1422,1422,491,491,1422,1422,1422,1422)),
list(c(39,319,319)),
list(c(1412,1412,1412,1412,1412,1412,1412,1412,1412,1412,1412,1412,1412,
1412,1412,1412,1154,1154,1231,1412,1412,1049,1412,1412,1412,1412,
1412,1412,1412,1412,1203,1203,1412,1412)),
list(c(1903,1903,1903,1903,1903,1903,1903,1903,1903,1903,1903,1903,
1519,1903,1903,1903,1903,1519,1519,1358,1903,1903,1903,1903,
1903,1903,1903,1903,1903,1903,1903,1903,1903,1903)),
list(c(539,378,1624,1624)), list(c(685,146)), list(c(497,1093,861)),
list(c(1064, 1064)),
list(c(1064,1064,1027,439,439,439,1146,1384,1404,1040,1384,1384,1384,306,306,1027)),
list(c(737,943,621,224,805,455,455,777)),list(c(189)), list(c(259,553)),
list(c(0)),list(c(581,581)),
list(c(1070,875,1044,864,911,1070,792,657,1231,287,230)), list(c(0)),
list(c(1561,1561,1561,1561,1561,1561,1561,1561,1561,1561,1561,1561,1561,1561,
1561,1051,1051,1051,1051,1051,1051,833,833,463,463)), list(c(181,120)),
list(c(1288,1288,1191,967,967,967,546,546,441,420,420,203)),
list(c(1385,1049,1049,1049,755,755,152,488,488,488,782,782)),
list(c(749,630,630,448,112,1048,1167,1167,1349,1685)),
list(c(882,882,882,882,882,882,476,476,476,432,432,432,432,432,432,838,838,838)),
list(c(943,578,578,578,213,578,578,578,213)),
list(c(413,413,414,424,430,484,484,526,537,683,837,840,1016,
1110,1110,1256,894,894,893,883,877,823,823,781,770,624,470,
467,197)),
list(c(171,376,499,401)),
list(c(217,240,685,685,370,394,525,535,550,552,585,391,391,391,732,805,921,209,209,209,209)),
list(c(181,198,219,337,591,591,591,591,563,688,688,738,741,761,152,152,152)),
list(c(0)), list(c(0)), list(c(0)))
regex_service1 <- c(rep("(from|acclaimed in|re-elected).+?[[:digit:]]{4}?", times = 4),
rep("(from|acclaimed in|re-elected|by-election|declared).+?[[:digit:]]{4}-[[:digit:]]{2}-[[:digit:]]{2}?", times = 7),
rep(c("(from|acclaimed in|re-elected|by-election|declared).+?[[:digit:]]{4}?",
"(from|acclaimed in|re-elected|by-election|declared).+?[[:digit:]]{4}-[[:digit:]]{2}-[[:digit:]]{2}?"), times = 6),
rep("(from|acclaimed in|re-elected|by-election|declared).+?[[:digit:]]{4}-[[:digit:]]{2}-[[:digit:]]{2}?",times =4),
rep(".+", times =13),
rep("(from|acclaimed in|re-elected|after).+?[[:digit:]]{4}?", times = 2))
regex_service2 <- c(rep("(to |died|until|overturned|defeated in|ended term|resigned).+?[[:digit:]]{4}?", times = 4),
rep("(to |died|until|overturned|defeated in|ended term|resigned|appointed|unseated|voided|expelled).+?[[:digit:]]{4}?", times = 23),
rep(".+", times =13),
rep("(to |died|until|overturned|defeated in|ended term|resigned).+?[[:digit:]]{4}?", times = 2))
append_names <- c(rep(list(0), 41),
list(c("Bob Benzen", "Stephanie Kusie", "Glen Motz", "Dane Lloyd",
"Churence Rogers", "Michael Barrett", "Mary Ng", "Mona Fortier",
"Jean Yip", "Emmanuella Lambropoulos", "Rosemarie Falk")))
append_urls <- c(rep(list(0), 41),
list(c("https://en.wikipedia.org/wiki/Bob_Benzen",
"https://en.wikipedia.org/wiki/Stephanie_Kusie",
"https://en.wikipedia.org/wiki/Glen_Motz",
"https://en.wikipedia.org/wiki/Dane_Lloyd",
"https://en.wikipedia.org/wiki/Churence_Rogers",
"https://en.wikipedia.org/wiki/Michael_Barrett_(Canadian_politician)",
"https://en.wikipedia.org/wiki/Mary_Ng",
"https://en.wikipedia.org/wiki/Mona_Fortier",
"https://en.wikipedia.org/wiki/Jean_Yip",
"https://en.wikipedia.org/wiki/Emmanuella_Lambropoulos",
"https://en.wikipedia.org/wiki/Rosemarie_Falk")))
append_parties <- c(rep(list(0), 41),
list(c("Conservative", "Conservative", "Conservative", "Conservative",
"Liberal", "conservative", "Liberal",
"Liberal", "Liberal", "Liberal", "Conservative")))
append_constituencies <- c(rep(list(0), 41),
list(c("Calgary Heritage", "Calgary Midnapore", "Medicine Hat—Cardston—Warner", "Sturgeon River—Parkland",
"Bonavista—Burin—Trinity", "Leeds—Grenville—Thousand Islands and Rideau Lakes",
"Markham—Thornhill", "Ottawa—Vanier", "Scarborough—Agincourt", "Saint-Laurent",
"Battlefords—Lloydminster")))
append_constituencies2 <- c(rep(list(0), 41),
list(c("Alberta", "Alberta","Alberta", "Alberta", "Newfoundland and Labrador",
"Ontario", "Ontario", "Ontario", "Ontario", "Quebec", "Saskatchewan")))
append_services <- c(rep(list(0), 41),
list(c(930,930,1091,727,678,321,920,920,678,930,678)))
append_session_start <- c(rep(list(0), 41),
list(rep("2015-10-19", times = 11)))
append_session_end <- c(rep(list(0), 41),
list(rep("2019-10-20", times = 11)))
urls <- rep(list(NA), times = length(source))
htmls <- rep(list(NA), times = length(source))
names <- rep(list(NA), times = length(source))
parties <- rep(list(NA), times = length(source))
constituencies <- rep(list(NA), times = length(source))
constituencies2 <- rep(list(NA), times = length(source))
multipliers1 <- rep(list(NA), times = length(source))
multipliers2 <- rep(list(NA), times = length(source))
beginnings <- rep(list(NA), times = length(source))
endings <- rep(list(NA), times = length(source))
services <- rep(list(NA), times = length(source))
session_start <- rep(list(NA), times = length(source))
session_end <- rep(list(NA), times = length(source))
for (i in 1:length(urls)) {
htmls[[i]] <- read_html(source[i], encoding = "UTF-8")
urls[[i]] <- html_nodes(x = htmls[[i]], xpath = query[i]) %>%
html_attr("href") %>%
str_c("https://", langcode,".wikipedia.org", .)
names[[i]] <- html_nodes(x = htmls[[i]], xpath = query[i]) %>%
html_text()
parties[[i]] <- html_nodes(x = htmls[[i]], xpath = query_party[i]) %>%
html_text() %>%
str_replace_all("\\n|Vacant|\\r", "") %>%
extract(. !="")
constituencies[[i]] <- html_nodes(x = htmls[[i]], xpath = query_constituency[i]) %>%
html_text() %>%
str_replace("\n", "")
multipliers1[[i]] <- html_nodes(x = htmls[[i]], xpath = query_multiplier[i]) %>%
html_attr("rowspan") %>%
as.numeric() %>%
replace(replace_idx[[i]], replacement[[i]]) %>%
na.replace(1)
constituencies[[i]] <- rep(constituencies[[i]], times = multipliers1[[i]])
constituencies2[[i]] <- html_nodes(x = htmls[[i]], xpath = query_constituency2[i]) %>%
html_text()
multipliers2[[i]] <- html_nodes(x = htmls[[i]], xpath = query_multiplier2_1[i]) %>%
lapply(., html_nodes, xpath = query_multiplier2_2[i]) %>%
lengths
constituencies2[[i]] <- rep(constituencies2[[i]], times = multipliers2[[i]])
beginnings[[i]] <- html_nodes(x = htmls[[i]], xpath = query_service[i]) %>%
html_text() %>%
str_replace_all("\n", "") %>%
str_extract(regex_service1[i]) %>%
str_replace_all("from |acclaimed in |re-elected in |by-election |,|of |declared elected |after ", "")
if (i %in% c(1,2,3,20,41,42,43)) {
beginnings[[i]] <- beginnings[[i]] %>%
str_replace("^(?=[[:digit:]]{4})", "January 2, ") %>%
str_replace("(?<=[[:alpha:]])((,)? )(?=[[:digit:]]{4})", " 2, ") %>%
mdy() %>%
na.replace(duration[[i]][1])
} else if (i %in% c(4,12,14,16,18,22)) {
if (i %in% c(22)) {
beginnings[[i]] <- str_replace(beginnings[[i]],"^(?=[[:digit:]]{4})", "1 January ") %>%
str_replace("^(?=[[:alpha:]])", "1 ")
}
beginnings[[i]] <- beginnings[[i]] %>%
dmy() %>%
na.replace(duration[[i]][1])
} else if (i %in% c(28:40)) {
beginnings[[i]][1:length(beginnings[[i]])] <- NA
beginnings[[i]] <- na.replace(beginnings[[i]], duration[[i]][1])
} else {
beginnings[[i]] <- beginnings[[i]] %>%
ymd() %>%
na.replace(duration[[i]][1])
}
endings[[i]] <- html_nodes(x = htmls[[i]], xpath = query_service[i]) %>%
html_text() %>%
str_replace_all("\n", "") %>%
str_extract(regex_service2[i]) %>%
str_replace_all("to |,|died |until |overturned in |defeated in |by-election |ended term |resigned |
unseated |Quebec legislative council |appointment |election voided |appointed |New Brunswick's |
Lieutenant-Governor |Manitoba |voided |expelled |by petition |Secretary of State |unseated |House of Commons Clerk on ", "")
if (i %in% c(1,2,3,20,41,42,43)) {
endings[[i]] <- endings[[i]] %>%
str_replace("^(?=[[:digit:]]{4})", "January 1, ") %>%
str_replace("(?<=[[:alpha:]])((,)? )(?=[[:digit:]]{4})", " 1, ") %>%
mdy() %>%
na.replace(duration[[i]][2])
} else if (i %in% c(4:19,21,22,23,24,25,26,27)) {
if (i %in% c(5:19,21,22,23,24,25,26,27)) {
endings[[i]] <- str_replace(endings[[i]],"^(?=[[:digit:]]{4})", "1 January ") %>%
str_replace("^(?=[[:alpha:]])", "1 ")
}
endings[[i]] <- endings[[i]] %>%
dmy() %>%
na.replace(duration[[i]][2])
} else if (i %in% c(28:40)) {
endings[[i]][1:length(endings[[i]])] <- NA
endings[[i]] <- na.replace(endings[[i]], duration[[i]][2])
}
services[[i]] <- difftime(time1 = endings[[i]], time2 = beginnings[[i]], units = "days")
services[[i]] <- ifelse(services[[i]] < 0, 0, services[[i]]) %>%
replace(replace_idx2[[i]], replacement2[[i]]) %>%
round()
session_start[[i]] <- duration[[i]][1]
session_end[[i]] <- duration[[i]][2]
if (i %in% c(42)) {
urls[[i]] <- append(urls[[i]], append_urls[[i]])
names[[i]] <- append(names[[i]], append_names[[i]])
parties[[i]] <- append(parties[[i]], append_parties[[i]])
constituencies[[i]] <- append(constituencies[[i]], append_constituencies[[i]])
constituencies2[[i]] <- append(constituencies2[[i]], append_constituencies2[[i]])
services[[i]] <- append(services[[i]], append_services[[i]])
# session_start[[i]] <- append(session_start[[i]], append_session_start[[i]])
# session_end[[i]] <- append(session_end[[i]], append_session_end[[i]])
}
}
urls <- lapply(urls, data.table) %>%
lapply(rename, url = V1)
names <- lapply(names, data.table) %>%
lapply(rename, name = V1)
parties <- lapply(parties, data.table) %>%
lapply(rename, party = V1)
constituencies <- lapply(constituencies, data.table) %>%
lapply(rename, constituency = V1)
constituencies2 <- lapply(constituencies2, data.table) %>%
lapply(rename, constituency2 = V1)
services <- lapply(services, data.table) %>%
lapply(rename, service = V1)
session_start <- lapply(session_start, data.table) %>%
lapply(rename, session_start = V1)
session_end <- lapply(session_end, data.table) %>%
lapply(rename, session_end = V1)
urls <- mapply(cbind, urls, names, parties, constituencies,
constituencies2, services, session_start,
session_end, SIMPLIFY = FALSE)
urls <- as.numeric(as.roman(str_extract(source, regex_term))) %>%
mapply(function(urls, x) mutate(urls, term = x), urls, ., SIMPLIFY=FALSE) %>%
mapply(function(., x) mutate(., country = x), ., rep("CAN", times = 43), SIMPLIFY=FALSE)
}
#### AUSTRIA WIKIPEDIA INFORMATION EXTRACTION ===========================================
collectorAustria <- function(source) {
# read and sort filepaths to Wikipedia htmls
source <- mixedsort(list.files(source, full.names = TRUE), decreasing = TRUE)
# assign start and end date for each legislative period
duration <- list(c("1920-11-10", "1923-11-20"), c("1923-11-20", "1927-05-18"),
c("1927-05-18", "1930-10-01"), c("1930-12-02", "1934-05-02"),
c("1945-12-19", "1949-11-08"), c("1949-11-08", "1953-03-18"),
c("1953-03-18", "1956-06-08"), c("1956-06-08", "1959-06-09"),
c("1959-06-09", "1962-12-14"), c("1962-12-14", "1966-03-30"),
c("1966-03-30", "1970-03-30"), c("1970-03-31", "1971-11-04"),
c("1971-11-04", "1975-11-04"), c("1975-11-04", "1979-06-04"),
c("1979-06-05", "1983-05-18"), c("1983-05-19", "1986-12-16"),
c("1986-12-17", "1990-11-04"), c("1990-11-05", "1994-11-06"),
c("1994-11-07", "1996-01-14"), c("1996-01-15", "1999-10-29"),
c("1999-10-29", "2002-12-19"), c("2002-12-20", "2006-10-29"),
c("2006-10-30", "2008-10-27"), c("2008-10-28", "2013-10-28"),
c("2013-10-29", "2017-10-15"), c("2017-10-16", "2019-09-29"),
c("2019-09-30", "2024-10-01"))
# assign session specific XPath query for URLs of legislators
query <- rep("//table/tbody/tr/td[1]/a", times = 27)
# assign session specific XPath query for party affiliation of legislators
query_party <- c(rep("//table/tbody/tr/td[4]", times = 24),
rep("//table/tbody/tr/td[6]", times = 3)) ##########
# assign session specific XPath query for constituency of legislators
query_constituency <- c(rep(NA, times = 22),
rep("//table/tbody/tr/td[5]", times = 2),
rep("//table/tbody/tr/td[7]", times = 3))
# assign session specific XPath query for constituency of legislators
query_service <- c(rep("//table/tbody/tr/td[5]", times = 22),
rep("//table/tbody/tr/td[6]", times = 2),
rep("//table/tbody/tr/td[8]", times = 3))
# assign empty lists to store session speficic htmls
htmls <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic legislator URLs
urls <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic legislator names
names <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic legislator party affiliations
parties <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic legislator constituencies
constituencies <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic legislator beginning of services
beginnings <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic legislator end of services
endings <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic legislator service in days
services <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic beginning dates
session_start <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic ending dates
session_end <- rep(list(NA), times = length(source))
# assign session specific manual replacement index for services
replace_idx <- c(rep(list(0), times = 21), list(c(c(65))),
rep(list(0), times = 5))
# assign session specific manual replacement for services
replacement <- c(rep(list(0), times = 21), list(c(c(429))),
rep(list(0), times = 5))
# loop through all html files
for (i in 1:length(htmls)) {
# read html file
htmls[[i]] <- read_html(source[i], encoding = "UTF-8")
# locate legislator biography URLs in html via XPath query, extract and complete URL
urls[[i]] <- html_nodes(x = htmls[[i]], xpath = query[i]) %>%
html_attr("href") %>%
str_c("https://", "de",".wikipedia.org", .)
# locate legislator names in html via query, extract names
names[[i]] <- html_nodes(x = htmls[[i]], xpath = query[i]) %>%
html_text()
# locate legislator party affiliations in html via query_party, extract and format party affiliation
parties[[i]] <- html_nodes(x = htmls[[i]], xpath = query_party[i]) %>%
html_text() %>%
str_replace_all("\\r|\\n", "")
# if a query was set for the respective session
if (!is.na(query_constituency[[i]])) {
# locate legislator constituencies in html via query_constituency and format constituencies
constituencies[[i]] <- html_nodes(x = htmls[[i]], xpath = query_constituency[i]) %>%
html_text() %>%
str_replace_all("\\r|\\n", "")
}
beginnings[[i]] <- html_nodes(x = htmls[[i]], xpath = query_service[i]) %>%
html_text() %>%
str_trim() %>%
str_replace("(Wechsel von der SPÖ zum Team Stronach)|Nachfolger von Reinhold|und von 26. April 2011|und ab 20. September 2011 als |und Vorgänger von Gerald Klug| Bundesministerin für Gesundheit und Frauen|Nachfolger und Vorgänger jeweils| und seit 1. Juli 2016.+|Nachfolgerin von Monika Lindner.+|7 – Tirol; ab |9 – Wien, ab ", "") %>%
str_replace_all("(Bis|bis).+[[:digit:]]{4}( )?|Wechselte am .+|(A|a)m .+ (übergetreten|übergetreten|ausgetreten|ausgeschlossen|ausgeschieden)|bis 14. Oktober 2012 BZÖ,|.+Ruhendstellung wieder aufgehoben|Bis 27. Dezember 2009 BZÖ, seit 23.|Ab 28. April 2006 BZÖ|Bis 29. Dezember 2009 BZÖ|Bis 1. August 2011 FPÖ,|bis 27. Dezember 2009 BZÖ|.+als Abgeordneter der Landesliste Wien|bis 15. September 2011 BZÖ|.+vom Bundeswahlvorschlag auf|^.+Klubobfrau|.+seit 11. August 2017 Mitglied im FPÖ-Klub|.+bis 23. Dezember 2015 fraktionslos|Nach Mandatsverzicht.+|.+ohne Klubzugehörigkeit|Nationalratspräsident.+|.+Klubobmann der Grünen", "") %>%
str_extract("([[:digit:]])?[[:digit:]].+?[[:digit:]]{4}|ab 10. April für|ab April 2017") %>%
str_replace("ab 10. April für", "10. April 2007") %>%
str_replace("^ab", "01.") %>%
convMonth(patterns = months_foreign, replacements = months_english) %>%
dmy() %>%
na.replace(duration[[i]][1])
endings[[i]] <- html_nodes(x = htmls[[i]], xpath = query_service[i]) %>%
html_text() %>%
str_trim() %>%
str_replace("(Wechsel von der SPÖ zum Team Stronach)|Nachfolger von Reinhold|und von 26. April 2011|und ab 20. September 2011 als |und Vorgänger von Gerald Klug| Bundesministerin für Gesundheit und Frauen|Nachfolger und Vorgänger jeweils| und seit 1. Juli 2016.+", "") %>%
str_replace_all("(Vom|vom|ab|Ab|Von|von|seit).+?[[:digit:]]{4}( )?|^([[:digit:]])?[[:digit:]].+?[[:digit:]]{4}|.+? und|Wechselte am .+|(A|a)m .+ (angelobt|nachgerückt|übergetreten|übergetreten|ausgetreten|ausgeschlossen)|bis 14. Oktober 2012 BZÖ,|.+Ruhendstellung wieder aufgehoben|Bis 27. Dezember 2009 BZÖ, seit 23.|Bis 29. Dezember 2009 BZÖ|Bis 1. August 2011 FPÖ,|bis 27. Dezember 2009 BZÖ|.+als Abgeordneter der Landesliste Wien|bis 15. September 2011 BZÖ|.+vom Bundeswahlvorschlag auf|^.+Klubobfrau|.+seit 11. August 2017 Mitglied im FPÖ-Klub|.+bis 23. Dezember 2015 fraktionslos", "") %>%
str_extract("([[:digit:]])?[[:digit:]].+?[[:digit:]]{4}|Bis September 2008|bis April 2017") %>%
str_replace("^(B|b)is", "01.") %>%
convMonth(patterns = months_foreign, replacements = months_english) %>%
dmy() %>%
na.replace(duration[[i]][2])
if (i == 22) {
beginnings[[i]] <- append(beginnings[[i]], ymd(duration[[i]][1]), after = 196)
endings[[i]] <- append(endings[[i]], ymd(duration[[i]][2]), after = 196)
}
services[[i]] <- difftime(time1 = endings[[i]], time2 = beginnings[[i]], units = "days")
if (i == 22) {
services[[i]] <- replace(services[[i]], replace_idx[[i]], replacement[[i]])
}
session_start[[i]] <- duration[[i]][1]
session_end[[i]] <- duration[[i]][2]
}
urls <- lapply(urls, data.table) %>%
lapply(rename, url = V1)
names <- lapply(names, data.table) %>%
lapply(rename, name = V1)
parties <- lapply(parties, data.table) %>%
lapply(rename, party = V1)
constituencies <- lapply(constituencies, data.table) %>%
lapply(rename, constituency = V1)
services <- lapply(services, data.table) %>%
lapply(rename, service = V1)
session_start <- lapply(session_start, data.table) %>%
lapply(rename, session_start = V1)
session_end <- lapply(session_end, data.table) %>%
lapply(rename, session_end = V1)
urls <- mapply(cbind, urls, names, parties, constituencies,
services, session_start, session_end, SIMPLIFY = FALSE)
urls <- as.numeric(as.roman(str_extract(source, "[[:digit:]]+"))) %>%
mapply(function(urls, x) mutate(urls, term = x), urls, ., SIMPLIFY=FALSE) %>%
mapply(function(., x) mutate(., country = x), ., rep("AUT", times = 27), SIMPLIFY=FALSE)
}
#### GERMANY WIKIPEDIA INFORMATION EXTRACTION ===========================================
collectorGermany <- function(source) {
# read and sort filepaths to Wikipedia htmls
source <- mixedsort(list.files(source, full.names = TRUE), decreasing = TRUE)
duration <- list(c("1949-09-07", "1953-10-06"), c("1953-10-06", "1957-10-15"),
c("1957-10-15", "1961-10-17"), c("1961-10-17", "1965-10-19"),
c("1965-10-19", "1969-10-20"), c("1969-10-20", "1972-12-13"),
c("1972-12-13", "1976-12-14"), c("1976-12-14", "1980-10-04"),
c("1980-10-04", "1983-03-29"), c("1983-03-29", "1987-02-18"),
c("1987-02-18", "1990-12-20"), c("1990-12-20", "1994-11-10"),
c("1994-11-10", "1998-10-26"), c("1998-10-26", "2002-10-17"),
c("2002-10-17", "2005-10-18"), c("2005-10-18", "2009-10-27"),
c("2009-10-27", "2013-10-22"), c("2013-10-22", "2017-10-24"),
c("2017-10-24", "2021-10-24"))
query <- c(rep("//h2/span[contains(text(), 'Abgeordnete')]/ancestor::h2/following-sibling::table[1]/tbody/tr/td[1]/a[1]",
times = 11),
"//h2/span[contains(text(), 'Abgeordnete')]/ancestor::h2/following-sibling::table[1]/tbody/tr/td[1]/i/a[1]|
//h2/span[contains(text(), 'Abgeordnete')]/ancestor::h2/following-sibling::table[1]/tbody/tr/td[1]/a[1]",
rep("//h2/span[contains(text(), 'Abgeordnete')]/ancestor::h2/following-sibling::table[1]/tbody/tr/td[1]/a[1]",
times = 5),
"//h2/span[contains(text(), 'Abgeordnete')]/ancestor::h2/following-sibling::table[1]/tbody/tr/td[1]/a[1]|
//h2/span[contains(text(), 'Abgeordnete')]/ancestor::h2/following-sibling::table[1]/tbody/tr/td[1]/i/a[1]",
"//h2/span[contains(text(), 'Abgeordnete')]/ancestor::h2/following-sibling::table[1]/tbody/tr/td[2]/a[1]")
query_party <- c(rep("//h2/span[contains(text(), 'Abgeordnete')]/ancestor::h2/following-sibling::table[1]/tbody/tr/td[3]",
times = 18),
"//h2/span[contains(text(), 'Abgeordnete')]/ancestor::h2/following-sibling::table[1]/tbody/tr/td[4]")
query_constituency <- c(rep("//h2/span[contains(text(), 'Abgeordnete')]/ancestor::h2/following-sibling::table[1]/tbody/tr/td[4]",
times = 18),
"//h2/span[contains(text(), 'Abgeordnete')]/ancestor::h2/following-sibling::table[1]/tbody/tr/td[5]")
query_constituency2 <- c(rep("//h2/span[contains(text(), 'Abgeordnete')]/ancestor::h2/following-sibling::table[1]/tbody/tr/td[5]",
times = 18),
"//h2/span[contains(text(), 'Abgeordnete')]/ancestor::h2/following-sibling::table[1]/tbody/tr/td[6]")
query_service <- c(rep("//h2/span[contains(text(), 'Abgeordnete')]/ancestor::h2/following-sibling::table[1]/tbody/tr/td[7]",
times = 18),
"//h2/span[contains(text(), 'Abgeordnete')]/ancestor::h2/following-sibling::table[1]/tbody/tr/td[8]")
# assign empty lists to store session speficic htmls
htmls <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic legislator URLs
urls <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic legislator names
names <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic legislator party affiliations
parties <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic legislator constituencies
constituencies <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic legislator constituencies2
constituencies2 <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic legislator beginning of services
beginnings <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic legislator end of services
endings <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic legislator service in days
services <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic beginning dates
session_start <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic ending dates
session_end <- rep(list(NA), times = length(source))
# assign session specific manual replacement index for services
replace_idx <- c(rep(list(0), times = 18), list(c(710)))
# assign session specific manual replacement for services
replacement <- c(rep(list(0), times = 18), list(c(373)))
# assign removal index
removal_idx <- c(rep(list(0), times = 18), list(c(711)))
# loop through all html files
for (i in 1:length(htmls)) {
# read html file
htmls[[i]] <- read_html(source[i], encoding = "UTF-8")
# locate legislator biography URLs in html via XPath query, extract and complete URL
urls[[i]] <- html_nodes(x = htmls[[i]], xpath = query[i]) %>%
html_attr("href") %>%
str_c("https://", "de",".wikipedia.org", .)
# locate legislator names in html via query, extract names
names[[i]] <- html_nodes(x = htmls[[i]], xpath = query[i]) %>%
html_text()
# locate legislator party affiliations in html via query_party, extract and format party affiliation
parties[[i]] <- html_nodes(x = htmls[[i]], xpath = query_party[i]) %>%
html_text() %>%
str_replace_all("\\r|\\n", "")
constituencies[[i]] <- html_nodes(x = htmls[[i]], xpath = query_constituency[i]) %>%
html_text() %>%
str_replace_all("\\r|\\n", "")
constituencies2[[i]] <- html_nodes(x = htmls[[i]], xpath = query_constituency2[i]) %>%
html_text() %>%
str_replace_all("\\r|\\n", "") %>%
inset(.=="", NA)
fill <- html_nodes(x = htmls[[i]], xpath = "//h2/span[contains(text(), 'Abgeordnete')]/ancestor::h2/following-sibling::table[1]/tbody/tr") %>%
lapply(xml_children) %>%
lapply(length) %>%
unlist %>%
extract(-1) %>%
as.numeric %>%
is_less_than(7) %>%
which()
beginnings[[i]] <- html_nodes(x = htmls[[i]], xpath = query_service[i]) %>%
html_text() %>%
str_extract_all("(eingetreten am|nachgewählt am|nachgerückt am).+?[[:digit:]]{4}?") %>%
lapply(., tail, n=1) %>%
lapply(., function(x) if(identical(x, character(0))) NA_character_ else x) %>%
unlist %>%
str_extract("([[:digit:]])?[[:digit:]].+?[[:digit:]]{4}") %>%
convMonth(patterns = months_foreign, replacements = months_english) %>%
dmy() %>%
na.replace(duration[[i]][1])
beginnings[[i]] <- numeric(length(beginnings[[i]]) + length(fill)) %>%
replace(fill, NA) %>%
replace(!is.na(.), beginnings[[i]]) %>%
as.Date() %>%
na.replace(duration[[i]][1])
endings[[i]] <- html_nodes(x = htmls[[i]], xpath = query_service[i]) %>%
html_text() %>%
str_extract_all("((A|a)usgeschieden am|verstorben am).+?[[:digit:]]{4}?") %>%
lapply(., function(x) if(identical(x, character(0))) NA_character_ else x) %>%
unlist %>%
str_extract("([[:digit:]])?[[:digit:]].+?[[:digit:]]{4}") %>%
convMonth(patterns = months_foreign, replacements = months_english) %>%
dmy() %>%
na.replace(duration[[i]][2])
endings[[i]] <- numeric(length(endings[[i]]) + length(fill)) %>%
replace(fill, NA) %>%
replace(!is.na(.), endings[[i]]) %>%
as.Date() %>%
na.replace(duration[[i]][2])
services[[i]] <- difftime(time1 = endings[[i]], time2 = beginnings[[i]], units = "days")
if (i == 19) {
services[[i]] <- replace(services[[i]], replace_idx[[i]], replacement[[i]])
}
if (i %in% 19) {
parties[[i]] <- parties[[i]][-removal_idx[[i]]]
constituencies[[i]] <- constituencies[[i]][-removal_idx[[i]]]
constituencies2[[i]] <- constituencies2[[i]][-removal_idx[[i]]]
services[[i]] <- services[[i]][-removal_idx[[i]]]
}
if (i %in% 9) {
constituencies2[[i]] <- append(constituencies2[[i]], NA, after = 522)
}
session_start[[i]] <- duration[[i]][1]
session_end[[i]] <- duration[[i]][2]
}
urls <- lapply(urls, data.table) %>%
lapply(rename, url = V1)
names <- lapply(names, data.table) %>%
lapply(rename, name = V1)
parties <- lapply(parties, data.table) %>%
lapply(rename, party = V1)
constituencies <- lapply(constituencies, data.table) %>%
lapply(rename, constituency = V1)
constituencies2 <- lapply(constituencies2, data.table) %>%
lapply(rename, constituency2 = V1)
services <- lapply(services, data.table) %>%
lapply(rename, service = V1)
session_start <- lapply(session_start, data.table) %>%
lapply(rename, session_start = V1)
session_end <- lapply(session_end, data.table) %>%
lapply(rename, session_end = V1)
urls <- mapply(cbind, urls, names, parties, constituencies,
constituencies2, services, session_start,
session_end, SIMPLIFY = FALSE)
urls <- as.numeric(as.roman(str_extract(source, "[[:digit:]]+"))) %>%
mapply(function(urls, x) mutate(urls, term = x), urls, ., SIMPLIFY=FALSE) %>%
mapply(function(., x) mutate(., country = x), ., rep("DEU", times = 19), SIMPLIFY=FALSE)
}
#### IRELAND WIKIPEDIA INFORMATION EXTRACTION ===========================================
collectorIreland <- function(source) {
# read and sort filepaths to Wikipedia htmls
source <- mixedsort(list.files(source, full.names = TRUE), decreasing = TRUE)
duration <- list(c("1918-12-14", "1921-05-24"), c("1921-05-24", "1922-06-16"),
c("1922-06-16", "1923-08-27"), c("1923-08-27", "1927-06-09"),
c("1927-06-09", "1927-09-15"), c("1927-09-15", "1932-02-16"),
c("1932-02-16", "1933-01-24"), c("1933-01-24", "1937-07-01"),
c("1937-07-01", "1938-06-17"), c("1938-06-17", "1943-06-23"),
c("1943-06-23", "1944-05-30"), c("1944-05-30", "1948-02-04"),
c("1948-02-04", "1951-05-30"), c("1951-05-30", "1954-05-18"),
c("1954-05-18", "1957-03-05"), c("1957-03-05", "1961-10-04"),
c("1961-10-04", "1965-04-07"), c("1965-04-07", "1969-06-18"),
c("1969-06-18", "1973-02-28"), c("1973-02-28", "1977-06-16"),
c("1977-06-16", "1981-06-11"), c("1981-06-11", "1982-02-18"),
c("1982-02-18", "1982-11-24"), c("1982-11-24", "1987-02-17"),
c("1987-02-17", "1989-06-15"), c("1989-06-15", "1992-11-25"),
c("1992-11-25", "1997-06-06"), c("1997-06-06", "2002-05-17"),
c("2002-05-17", "2007-05-24"), c("2007-05-24", "2011-02-25"),
c("2011-02-25", "2016-02-26"), c("2016-02-26", "2020-02-08"),
c("2020-02-08", "2025-02-20"))
query <- c("//h2/span[contains(text(), 'Members by constituency')]/ancestor::h2/
following-sibling::table[1]/tbody/tr/td[2]/a",
"//h2/span[contains(text(), 'Members by constituency')]/ancestor::h2/
following-sibling::table[1]/tbody/tr/td[position() < last()]/
a[not(contains(@title, 'constituency'))]",
rep("//h2/span[contains(text(), 'TDs by constituency')]/ancestor::h2/
following-sibling::table[1]/tbody/tr/td[position() < last()]/
a[not(contains(@title, 'constituency'))]", times = 26),
rep("//h2/span[contains(text(), 'TDs by party')]/ancestor::h2/
following-sibling::table[1]/tbody/tr/td[not(@rowspan) and position() <
last()]/a[not(contains(@title, 'Unemployed Action'))]",
times = 4),
rep("//h2/span[contains(text(), 'List of TDs')]/ancestor::h2/
following-sibling::table[1]/tbody/tr/td[not(@rowspan) and position() <
last()]/a[not(contains(@title, 'Unemployed Action'))]",
times = 1))
query_party <- c(rep("//h2/span[contains(text(), 'Members by constituency')]/
ancestor::h2/following-sibling::table[1]/tbody/tr/
td[last()]/a", times = 2),
rep("//h2/span[contains(text(), 'TDs by constituency')]/
ancestor::h2/following-sibling::table[1]/tbody/tr/
td[last()]/a", times = 26),
rep("//h2/span[contains(text(), 'TDs by party')]/
ancestor::h2/following-sibling::table[1]/tbody/tr/
td[contains(text(), ')')]/a", times = 4),
rep("//h2/span[contains(text(), 'List of TDs')]/
ancestor::h2/following-sibling::table[1]/tbody/tr/
td[contains(text(), ')')]/a", times = 1)
)
query_constituency <- c(rep("//h2/span[contains(text(), 'Members by constituency')]/
ancestor::h2/following-sibling::table[1]/tbody/tr/td/
a[contains(@title, 'constituency')]", times = 2),
rep("//h2/span[contains(text(), 'TDs by constituency')]/
ancestor::h2/following-sibling::table[1]/tbody/tr/td/
a[contains(@title, 'constituency')]", times = 26),
rep("//h2/span[contains(text(), 'TDs by party')]/
ancestor::h2/following-sibling::table[1]/tbody/tr/td/
a[contains(@title, 'constituency')]", times = 4),
rep("//h2/span[contains(text(), 'List of TDs')]/
ancestor::h2/following-sibling::table[1]/tbody/tr/td/
a[contains(@title, 'constituency')]", times = 1))
query_multiplier <- c(rep("//h2/span[contains(text(), 'Members by constituency')]/
ancestor::h2/following-sibling::table[1]/tbody/tr/td/
a[contains(@title, 'constituency')]/parent::td", times = 2),
rep("//h2/span[contains(text(), 'TDs by constituency')]/
ancestor::h2/following-sibling::table[1]/tbody/tr/td/
a[contains(@title, 'constituency')]/parent::td", times = 26),
rep("//h2/span[contains(text(), 'TDs by party')]/
ancestor::h2/following-sibling::table[1]/tbody/tr/td/
a[contains(@title, 'constituency')]/parent::td", times = 4),
rep("//h2/span[contains(text(), 'List of TDs')]/
ancestor::h2/following-sibling::table[1]/tbody/tr/td/
a[contains(@title, 'constituency')]/parent::td", times = 1))
query_multiplier2 <- c(rep(NA, times = 28),
rep("//h2/span[contains(text(), 'TDs by party')]/
ancestor::h2/following-sibling::table[1]/tbody/tr/
td[contains(text(), ')')]", times = 4),
rep("//h2/span[contains(text(), 'List of TDs')]/
ancestor::h2/following-sibling::table[1]/tbody/tr/
td[contains(text(), ')')]", times = 1))
replace_idx <- c(list(c(25,29,92)), list(c(0)), list(c(0)),
list(c(59,129,57,42,102,24,113,17,39,3,8,50,53,61,86,91,115,132,43,97)),
list(c(62,48)), list(c(1,53,49,89,108,42,83)), list(0),
list(c(65,46,75,71,147)), list(0), list(c(53,69)),list(0), list(c(76,12,
51,74,103,132,19,44,125,129)), list(c(31,29,35)), list(c(92,109,139,50,20,145,
73,18,105)), list(c(97,77,45,87,18,5,62)), list(c(42,71,56,10,61,115,3,124)),
list(c(45,16,86,120,71,18)), list(c(81,137,14,97,10,143,96)),
list(c(69,90,104,34,46,23)), list(c(119,25,78,80,114,36,70)), list(c(16,25,32)),
list(0),list(c(79,89)),list(c(38,42,109)),list(0),list(0), list(c(68,131,21,28,162,35,82)),
list(c(114,45,28,69,149,147)),list(c(50,84)),list(c(14,162,34,25,49,7)),
list(c(122,49,50,153,38,41,76)),list(c(158, 132, 22, 69, 133, 37)), list(c(0)))
replacement <- c(list(c(681,601,82)), list(c(0)), list(c(0)),
list(c(61,68,198,205,275,449,449,450,451,562,562,562,562,562,562,562,562,562,906,906)),
list(c(76,76)), list(c(49,201,546,631,1002,1181,1383)),list(0),
list(c(262,874,876,1297,1301)),list(0),list(c(354,713)),list(0),list(c(164,553,553,
553,553,553,745,1247,1247,1247)), list(c(307,497,651)), list(c(393,393,393,532,750,750,
814,1008,1008)), list(c(574,652,713,713,807,911,911)), list(c(254,451,477,869,869,869,
1206,1092)), list(c(603,868,868,1008,1156,1253)), list(c(609,609,946,946,1072,1072,1141)),
list(c(259,300,300,532,532,410)), list(c(272,613,734,734,987,1198,1198)),
list(c(874,874,1239)),list(0), list(c(96,152)),list(c(170,364,568)),list(0),list(0),
list(c(561,561,715,715,946,1224,1224)), list(c(113,136,281,626,1019,1202)),list(c(898,898)),
list(c(412,589,746,991,1034,1258)),list(c(105,665,1124,1186,1186,1343,1784)),list(c(13,1221,1221,1221,1221,1376)),list(0))
# assign session specific manual addition
append_urls <- c(list(0), list(0), list(0),
list(c("https://en.wikipedia.org/wiki/Hugh_Kennedy",
"https://en.wikipedia.org/wiki/Patrick_McGilligan",
"https://en.wikipedia.org/wiki/James_O%27Mara",
"https://en.wikipedia.org/wiki/Batt_O%27Connor",
"https://en.wikipedia.org/wiki/Richard_O%27Connell_(politician)",
"https://en.wikipedia.org/wiki/Michael_K._Noonan",
"https://en.wikipedia.org/wiki/Se%C3%A1n_Lemass",
"https://en.wikipedia.org/wiki/John_Madden_(Irish_politician)",
"https://en.wikipedia.org/wiki/Michael_Egan_(Irish_politician)",
"https://en.wikipedia.org/wiki/Denis_McCullough",
"https://en.wikipedia.org/wiki/Thomas_Bolger",
"https://en.wikipedia.org/wiki/John_Joe_O%27Reilly",
"https://en.wikipedia.org/wiki/Patrick_Leonard_(politician)",
"https://en.wikipedia.org/wiki/Oscar_Traynor",
"https://en.wikipedia.org/wiki/Thomas_Hennessy",
"https://en.wikipedia.org/wiki/Samuel_Holt",
"https://en.wikipedia.org/wiki/Martin_Roddy",
"https://en.wikipedia.org/wiki/Michael_Tierney_(politician)",
"https://en.wikipedia.org/wiki/Martin_Conlon",
"https://en.wikipedia.org/wiki/William_Norton",
"https://en.wikipedia.org/wiki/James_Dwyer_(politician)")),
list(c("https://en.wikipedia.org/wiki/Thomas_Hennessy",
"https://en.wikipedia.org/wiki/Gear%C3%B3id_O%27Sullivan")),
list(c("https://en.wikipedia.org/wiki/Denis_Gorey",
"https://en.wikipedia.org/wiki/Vincent_Rice",
"https://en.wikipedia.org/wiki/Thomas_F._O%27Higgins",
"https://en.wikipedia.org/wiki/Se%C3%A1n_Mac_Eoin",
"https://en.wikipedia.org/wiki/James_Geoghegan",
"https://en.wikipedia.org/wiki/Thomas_Finlay_(Cumann_na_nGaedheal_politician)",
"https://en.wikipedia.org/wiki/Thomas_Harris_(Irish_politician)")),
list(0), list(c("https://en.wikipedia.org/wiki/Robert_Rowlette",
"https://en.wikipedia.org/wiki/Cecil_Lavery",
"https://en.wikipedia.org/wiki/Eamon_Corbett",
"https://en.wikipedia.org/wiki/Martin_Neilan",
"https://en.wikipedia.org/wiki/Denis_Allen_(politician)"
)),list(0), list(c("https://en.wikipedia.org/wiki/John_McCann_(Irish_politician)",
"https://en.wikipedia.org/wiki/John_J._Keane_(politician)")),
list(0), list(c("https://en.wikipedia.org/wiki/Donal_O%27Donoghue",
"https://en.wikipedia.org/wiki/Patrick_Shanahan_(politician)",
"https://en.wikipedia.org/wiki/Vivion_de_Valera",
"https://en.wikipedia.org/wiki/Honor_Crowley",
"https://en.wikipedia.org/wiki/Bernard_Commons",
"https://en.wikipedia.org/wiki/Brendan_Corish",
"https://en.wikipedia.org/wiki/Patrick_McGrath_(Irish_politician)",
"https://en.wikipedia.org/wiki/Se%C3%A1n_MacBride",
"https://en.wikipedia.org/wiki/Patrick_Kinane",
"https://en.wikipedia.org/wiki/John_Ormonde")),
list(c("https://en.wikipedia.org/wiki/Neil_Blaney",
"https://en.wikipedia.org/wiki/William_J._Murphy_(Labour_politician)",
"https://en.wikipedia.org/wiki/Patrick_O%27Donnell_(Irish_politician)")),
list(c("https://en.wikipedia.org/wiki/John_Carew_(Irish_politician)",
"https://en.wikipedia.org/wiki/Phelim_Calleary",
"https://en.wikipedia.org/wiki/William_Kenneally",
"https://en.wikipedia.org/wiki/Thomas_Byrne_(Dublin_politician)",
"https://en.wikipedia.org/wiki/Richard_Barry_(Irish_politician)",
"https://en.wikipedia.org/wiki/Mark_Deering",
"https://en.wikipedia.org/wiki/Robert_Lahiffe",
"https://en.wikipedia.org/wiki/Stephen_Barrett_(Irish_politician)",
"https://en.wikipedia.org/wiki/George_Coburn")),
list(c("https://en.wikipedia.org/wiki/Michael_Colbert",
"https://en.wikipedia.org/wiki/Kathleen_O%27Connor",
"https://en.wikipedia.org/wiki/Patrick_Byrne_(Irish_politician)",
"https://en.wikipedia.org/wiki/Kieran_Egan_(politician)",
"https://en.wikipedia.org/wiki/John_Galvin_(Irish_politician)",
"https://en.wikipedia.org/wiki/Martin_Medlar",
"https://en.wikipedia.org/wiki/Noel_Lemass")),
list(c("https://en.wikipedia.org/wiki/Frank_Sherwin",
"https://en.wikipedia.org/wiki/Anthony_Millar",
"https://en.wikipedia.org/wiki/Patrick_Cummins_(politician)",
"https://en.wikipedia.org/wiki/Se%C3%A1n_%C3%93_Ceallaigh_(Clare_politician)",
"https://en.wikipedia.org/wiki/Richie_Ryan_(politician)",
"https://en.wikipedia.org/wiki/Henry_Johnston_(Irish_politician)",
"https://en.wikipedia.org/wiki/Patrick_Teehan",
"https://en.wikipedia.org/wiki/Joseph_McLoughlin")),
list(c("https://en.wikipedia.org/wiki/Paddy_Belton",
"https://en.wikipedia.org/wiki/Sheila_Galvin",
"https://en.wikipedia.org/wiki/Terence_Boylan_(politician)",
"https://en.wikipedia.org/wiki/Joan_Burke",
"https://en.wikipedia.org/wiki/John_Donnellan",
"https://en.wikipedia.org/wiki/Eileen_Desmond")),
list(c("https://en.wikipedia.org/wiki/John_O%27Leary_(Kerry_politician)",
"https://en.wikipedia.org/wiki/Fad_Browne",
"https://en.wikipedia.org/wiki/Se%C3%A1n_French_(1931%E2%80%932011)",
"https://en.wikipedia.org/wiki/Gerry_Collins_(politician)",
"https://en.wikipedia.org/wiki/Sylvester_Barrett",
"https://en.wikipedia.org/wiki/Godfrey_Timmins",
"https://en.wikipedia.org/wiki/Desmond_O%27Malley")),
list(c("https://en.wikipedia.org/wiki/Se%C3%A1n_Sherwin",
"https://en.wikipedia.org/wiki/Patrick_Malone_(Irish_politician)",
"https://en.wikipedia.org/wiki/Patrick_Cooney",
"https://en.wikipedia.org/wiki/Patrick_Delap",
"https://en.wikipedia.org/wiki/Larry_McMahon",
"https://en.wikipedia.org/wiki/Gene_Fitzgerald")),
list(c("https://en.wikipedia.org/wiki/Brendan_Toal",
"https://en.wikipedia.org/wiki/Se%C3%A1n_Brosnan",
"https://en.wikipedia.org/wiki/Michael_P._Kitt",
"https://en.wikipedia.org/wiki/M%C3%A1ire_Geoghegan-Quinn",
"https://en.wikipedia.org/wiki/Enda_Kenny",
"https://en.wikipedia.org/wiki/Paddy_Keaveney",
"https://en.wikipedia.org/wiki/Brendan_Halligan")),
list(c("https://en.wikipedia.org/wiki/Liam_Burke",
"https://en.wikipedia.org/wiki/Myra_Barry",
"https://en.wikipedia.org/wiki/Clement_Coughlan")),list(0),
list(c("https://en.wikipedia.org/wiki/Liam_Skelly",
"https://en.wikipedia.org/wiki/Noel_Treacy")),
list(c("https://en.wikipedia.org/wiki/Cathal_Coughlan_(politician)",
"https://en.wikipedia.org/wiki/Tom_Leonard_(Irish_politician)",
"https://en.wikipedia.org/wiki/Brian_Cowen")),list(0),list(0),
list(c("https://en.wikipedia.org/wiki/Eric_Byrne",
"https://en.wikipedia.org/wiki/Michael_Ring",
"https://en.wikipedia.org/wiki/Kathleen_Lynch_(politician)",
"https://en.wikipedia.org/wiki/Hugh_Coveney",
"https://en.wikipedia.org/wiki/Mildred_Fox",
"https://en.wikipedia.org/wiki/Cecilia_Keaveney",
"https://en.wikipedia.org/wiki/Brian_Lenihan_Jnr")),
list(c("https://en.wikipedia.org/wiki/Jan_O%27Sullivan",
"https://en.wikipedia.org/wiki/Se%C3%A1n_Ryan_(politician)",
"https://en.wikipedia.org/wiki/Simon_Coveney",
"https://en.wikipedia.org/wiki/Mary_Upton",
"https://en.wikipedia.org/wiki/S%C3%A9amus_Healy",
"https://en.wikipedia.org/wiki/Tom_Hayes_(Irish_politician)")),
list(c("https://en.wikipedia.org/wiki/Shane_McEntee",
"https://en.wikipedia.org/wiki/Catherine_Murphy_(politician)")),
list(c("https://en.wikipedia.org/wiki/George_Lee_(journalist)",
"https://en.wikipedia.org/wiki/Maureen_O%27Sullivan_(politician)",
"https://en.wikipedia.org/wiki/Pearse_Doherty")),
list(c("https://en.wikipedia.org/wiki/Patrick_Nulty",
"https://en.wikipedia.org/wiki/Helen_McEntee",
"https://en.wikipedia.org/wiki/Ruth_Coppinger",
"https://en.wikipedia.org/wiki/Gabrielle_McFadden",
"https://en.wikipedia.org/wiki/Paul_Murphy_(Irish_politician)",
"https://en.wikipedia.org/wiki/Michael_Fitzmaurice_(politician)",
"https://en.wikipedia.org/wiki/Bobby_Aylward")),
list(c("https://en.wikipedia.org/wiki/Mark_Ward_(politician)",
"https://en.wikipedia.org/wiki/P%C3%A1draig_O%27Sullivan",
"https://en.wikipedia.org/wiki/Malcolm_Byrne",
"https://en.wikipedia.org/wiki/Joe_O%27Brien_(politician)")))
# assign session specific manual addition for names
append_names <- c(list(0), list(0), list(0),
list(c("Hugh Kennedy", "Patrick McGilligan",
"James O'Mara", "Batt O'Connor",
"Richard O'Connell", "Michael Noonan",
"Seán Lemass", "John Madden", "Michael Egan",
"Denis McCullough", "Thomas Bolger", "John Joe O'Reilly",
"Patrick Leonard", "Oscar Traynor", "Thomas Hennessy",
"Samuel Holt", "Martin Roddy", "Michael Tierney",
"Martin Conlon", "William Norton", "James Dwyer")),
list(c("Thomas Hennessy", "Gearóid O'Sullivan")),
list(c("Denis Gorey", "Vincent Rice", "Thomas F. O'Higgins",
"Seán Mac Eoin", "James Geoghegan", "Thomas Finlay",
"Thomas Harris")), list(0),list(c("Robert Rowlette",
"Cecil Lavery", "Eamon Corbett", "Martin Neilan",
"Denis Allen")),list(0), list(c("John McCann", "John J. Keane")),
list(0), list(c("Donal O'Donoghue", "Patrick Shanahan", "Vivion de Valera",
"Honor Crowley", "Bernard Commons", "Brendan Corish", "Patrick McGrath",
"Seán MacBride","Patrick Kinane", "John Ormonde")),
list(c("Neil Blaney", "William J. Murphy", "Patrick O'Donnell")),
list(c("John Carew", "Phelim Calleary", "William Kenneally", "Thomas Byrne",
"Richard Barry", "Mark Deering", "Robert Lahiffe", "Stephen Barrett",
"George Coburn")), list(c("Michael Colbert", "Kathleen O'Connor",
"Patrick Byrne", "Kieran Egan", "John Galvin", "Martin Medlar",
"Noel Lemass")),list(c("Frank Sherwin", "Anthony Millar", "Patrick Cummins",
"Seán Ó Ceallaigh", "Richie Ryan", "Henry Johnston", "Patrick Teehan", "Joseph McLoughlin")),
list(c("Paddy Belton","Sheila Galvin","Terence Boylan","Joan Burke","John Donnellan","Eileen Desmond")),
list(c("John O'Leary","Fad Browne","Seán French","Gerry Collins","Sylvester Barrett","Godfrey Timmins",
"Desmond O'Malley")), list(c("Seán Sherwin","Patrick Malone","Patrick Cooney","Patrick Delap",
"Larry McMahon","Gene Fitzgerald")), list(c("Brendan Toal","Seán Brosnan","Michael P. Kitt",
"Máire Geoghegan-Quinn","Enda Kenny","Paddy Keaveney","Brendan Halligan")),
list(c("Liam Burke","Myra Barry","Clement Coughlan")),list(0), list(c("Liam Skelly","Noel Treacy")),
list(c("Cathal Coughlan","Tom Leonard","Brian Cowen")),list(0),list(0),
list(c("Eric Byrne","Michael Ring","Kathleen Lynch","Hugh Coveney","Mildred Fox","Cecilia Keaveney",
"Brian Lenihan Jnr")), list(c("Jan O'Sullivan","Seán Ryan","Simon Coveney","Mary Upton",
"Séamus Healy","Tom Hayes")),list(c("Shane McEntee","Catherine Murphy")), list(c("George Lee",
"Maureen O'Sullivan","Pearse Doherty")), list(c("Patrick Nulty","Helen McEntee","Ruth Coppinger",
"Gabrielle McFadden","Paul Murphy","Michael Fitzmaurice","Bobby Aylward")),
list(c("Mark Ward", "Pádraig O'Sullivan", "Malcolm Byrne", "Joe O'Brien")))
# assign session specific manual addition for parties
append_parties <- c(list(0), list(0), list(0),
list(c(rep("Cumann na nGaedheal", times = 6),
"Republican","Republican", rep("Cumann na nGaedheal", times = 5),
"Republican", "Cumann na nGaedheal","Republican",
"Cumann na nGaedheal","Cumann na nGaedheal","Cumann na nGaedheal",
"Labour Party", "Cumann na nGaedheal")),
list(c("Cumann na nGaedheal", "Cumann na nGaedheal")),
list(c(rep("Cumann na nGaedheal", times = 4), "Fianna Fáil",
"Cumann na nGaedheal", "Fianna Fáil")),list(0),list(c("Independent",
"Fine Gael", "Fianna Fáil", "Fianna Fáil", "Fianna Fáil")),list(0),
list(c("Fianna Fáil", "Fianna Fáil")),
list(0), list(c(rep("Fianna Fáil", times = 4), "Clann na Talmhan",
"Labour Party", "Fianna Fáil", "Clann na Poblachta",
"Clann na Poblachta", "Fianna Fáil")), list(c("Fianna Fáil", "Labour Party",
"Fine Gael")), list(c("Fine Gael","Fianna Fáil","Fianna Fáil",
"Independent", "Fine Gael", "Fine Gael", "Fianna Fáil", "Fine Gael",
"Fine Gael")), list(c("Fianna Fáil","Clann na Poblachta","Independent",
rep("Fianna Fáil", times = 4))), list(c("Independent","Fianna Fáil","Fianna Fáil",
"Fianna Fáil","Fine Gael","Fianna Fáil","Fianna Fáil","Fine Gael")),
list(c("Fine Gael","Fianna Fáil","Fianna Fáil","Fine Gael","Fine Gael","Labour Party")),
list(c(rep("Fianna Fáil", times = 5),"Fine Gael","Fianna Fáil")), list(c("Fianna Fáil","Fine Gael",
"Fine Gael","Fianna Fáil","Fine Gael","Fianna Fáil")),list(c("Fine Gael",
rep("Fianna Fáil",times=3),"Fine Gael","Independent Fianna Fáil","Labour Party")),
list(c("Fine Gael","Fine Gael","Fianna Fáil")),list(0), list(c("Fine Gael","Fianna Fáil")),
list(c(rep("Fianna Fáil", times = 3))),list(0),list(0), list(c("Democratic Left","Fine Gael",
"Democratic Left","Fine Gael","Independent","Fianna Fáil","Fianna Fáil")),
list(c("Labour Party","Labour Party","Fine Gael","Labour Party","Independent","Fine Gael")),
list(c("Fine Gael","Independent")),list(c("Fine Gael","Independent","Sinn Féin")),
list(c("Labour Party","Fine Gael","Socialist Party","Fine Gael","Anti-Austerity Alliance",
"Independent","Fianna Fáil")),
list(c("Sinn Féin", "Fianna Fáil", "Fianna Fáil", "Green Party")))
# assign session specific manual addition for constituencies
append_constituencies <- c(list(0), list(0), list(0),
list(c("Dublin South", "National University of Ireland", "Dublin South",
"Dublin County", "Limerick", "Cork East", "Dublin South",
"Mayo North", "Cork Borough", "Donegal", "Carlow–Kilkenny", "Cavan",
"Dublin North", "Dublin North", "Dublin South", "Leitrim–Sligo",
"Leitrim–Sligo", "Mayo North", "Roscommon", "Dublin County",
"Leix–Offaly")),
list(c("Dublin South", "Dublin County")),
list(c("Carlow–Kilkenny", "Dublin North", "Dublin North",
"Leitrim–Sligo", "Longford–Westmeath", "Dublin County",
"Kildare")),list(0),list(c("Dublin University",
"Dublin County", "Galway", "Galway", "Wexford")),list(0),
list(c("Dublin South", "Galway West")),
list(0), list(c("Kerry South", "Clare", "Dublin North-West",
"Kerry South", "Mayo South", "Wexford", "Cork Borough", "Dublin County",
"Tipperary", "Waterford")), list(c("Donegal East", "Cork West", "Donegal West")),
list(c("Limerick East", "Mayo North", "Waterford", "Dublin North-West",
"Cork East", "Wicklow", "Galway South", "Cork Borough", "Louth")),
list(c("Limerick West", "Kerry North", "Dublin North-East", "Leix–Offaly",
"Cork Borough", "Carlow–Kilkenny","Dublin South-West")),
list(c("Dublin North-Central","Galway South","Dublin South-Central","Clare",
"Dublin South-West","Meath","Carlow–Kilkenny","Sligo–Leitrim")),
list(c("Dublin North-East","Cork Borough","Kildare","Roscommon","Galway East","Cork Mid")),
list(c("Kerry South","Waterford","Cork Borough","Limerick West","Clare","Wicklow",
"Limerick East")), list(c("Dublin South-West","Kildare","Longford–Westmeath",
"Donegal–Leitrim","Dublin County South","Cork Mid")),list(c("Monaghan",
"Cork North-East","Galway North-East","Galway West","Mayo West","Donegal North-East",
"Dublin South-West")), list(c("Cork City","Cork North-East","Donegal")),list(0),
list(c("Dublin West","Galway East")),list(c("Donegal South-West","Dublin Central",
"Laois–Offaly")),list(0),list(0), list(c("Dublin South-Central","Mayo West",
"Cork North-Central","Cork South-Central","Wicklow","Donegal North-East",
"Dublin West")),list(c("Limerick East","Dublin North","Cork South-Central",
"Dublin South-Central","Tipperary South","Tipperary South")),list(c("Meath",
"Kildare North")),list(c("Dublin South","Dublin Central","Donegal South-West")),
list(c("Dublin West","Meath East","Dublin West","Longford–Westmeath","Dublin South-West",
"Roscommon–South Leitrim","Carlow–Kilkenny")),
list(c("Dublin Mid-West", "Cork North-Central", "Wexford", "Dublin Fingal")))
# assign session specific manual addition for services
append_services <- c(list(0), list(0), list(0),
list(c(388,1313,1183,1176,1106,932,932,932,931,930,819,819,819,819,819,819,
819,819,819,475,475)),
list(c(22,22)),list(c(1565,1413,1068,983,612,433,231)),list(0),
list(c(1356,744,742,321,317)),list(0), list(c(1477,1118)),
list(0), list(c(1180,791,791,791,791,791,599,97,97,97)),
list(c(903,713,559)), list(c(690,690,690,551,333,333,269,75,75)),
list(c(447,369,308,308,214,110,110)), list(c(1419,1222,1196,804,804,
804,467,581)),list(c(677,412,412,272,124,27)),list(c(923,923,
586,586,460,460,391)), list(c(1091,1050,1050,818,818,940)),
list(c(1296,945,834,834,581,370,370)), list(c(581,581,216)),list(0),
list(c(182,126)), list(c(1375,1181,977)),list(0),list(0),
list(c(1092,1092,938,938,707,429,429)),list(c(1527,1527,1301,932,
693,319)), list(c(803,803)),list(c(247,628,90)),list(c(875,1065,643,643,
502,502,278)),
list(c(70,70,70,70)))
# assign session specific manual addition for session_star
append_session_start <- c(list(0), list(0), list(0),
list(rep("1923-08-27", times = 21)),
list(c("1927-06-09","1927-06-09")),
list(rep("1927-09-15", times = 7)),list(0),
list(rep("1933-01-24", times = 5)),list(0),
list(c("1938-06-17","1938-06-17")),list(0),
list(rep("1944-05-30", times = 10)), list(c("1948-02-04", "1948-02-04",
"1948-02-04")), list(rep("1951-05-30", times = 9)),
list(rep("1954-05-18", times = 7)),list(rep("1957-03-05", times = 8)),
list(rep("1961-10-04", times = 6)),list(rep("1965-04-07", times = 7)),
list(rep("1969-06-18", times = 6)),list(rep("1973-02-28", times = 7)),
list(rep("1977-06-16", times = 3)),list(0),
list(c("1982-02-18","1982-02-18")),list(rep("1982-11-24", times = 3)),
list(0),list(0),list(rep("1992-11-25",times =7)),
list(rep("1997-06-06",times = 6)),list(c("2002-05-17","2002-05-17")),
list(rep("2007-05-24",times = 3)),list(rep("2011-02-25",times = 7)),
list(rep("2016-02-26",times = 4)))
# assign session specific manual addition for session_end
append_session_end <- c(list(0), list(0), list(0),
list(rep("1927-06-09", times = 21)),
list(c("1927-09-15", "1927-09-15")),
list(rep("1932-02-16", times = 7)),list(0),
list(rep("1937-07-01", times = 5)),list(0),
list(c("1943-06-23","1943-06-23")),list(0),
list(rep("1948-02-04", times = 10)), list(c("1951-05-30","1951-05-30",
"1951-05-30")), list(rep("1954-05-18", times = 9)),
list(rep("1957-03-05", times = 7)),list(rep("1961-10-04", times = 8)),
list(rep("1965-04-07", times = 6)),list(rep("1969-06-18", times = 7)),
list(rep("1973-02-28", times = 6)),list(rep("1977-06-16", times = 7)),
list(rep("1981-06-11", times = 3)),list(0),
list(c("1982-11-24","1982-11-24")),list(rep("1987-02-17", times = 3)),
list(0),list(0),list(rep("1997-06-06",times = 7)),
list(rep("2002-05-17",times = 6)),list(c("2007-05-24","2007-05-24")),
list(rep("2011-02-25",times = 3)),list(rep("2016-02-26",times = 7)),
list(rep("2020-02-08",times = 4)))
# assign empty lists to store session speficic htmls
htmls <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic legislator URLs
urls <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic legislator names
names <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic legislator party affiliations
parties <- rep(list(NA), times = length(source))
multiplier <- rep(list(NA), times = length(source))
multiplier2 <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic legislator constituencies
constituencies <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic beginning dates
session_start <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic ending dates
session_end <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic legislator service in days
services <- rep(list(NA), times = length(source))
# loop through all html files
for (i in 1:length(htmls)) {
# read html file
htmls[[i]] <- read_html(source[i], encoding = "UTF-8")
# locate legislator biography URLs in html via XPath query, extract and complete URL
urls[[i]] <- html_nodes(x = htmls[[i]], xpath = query[i]) %>%
html_attr("href") %>%
str_c("https://", "en",".wikipedia.org", .)
# locate legislator names in html via query, extract names
names[[i]] <- html_nodes(x = htmls[[i]], xpath = query[i]) %>%
html_text()
# locate legislator party affiliations in html via query_party, extract and format party affiliation
parties[[i]] <- html_nodes(x = htmls[[i]], xpath = query_party[i]) %>%
html_text() %>%
str_trim()
if (!equals(length(parties[[i]]), length(urls[[i]]))) {
multiplier[[i]] <- html_nodes(x = htmls[[i]], xpath = query_multiplier2[i]) %>%
html_attrs() %>%
as.numeric %>%
na.replace(1)
parties[[i]] <- rep(parties[[i]], times = multiplier[[i]])
}
constituencies[[i]] <- html_nodes(x = htmls[[i]], xpath = query_constituency[i]) %>%
html_text()
if (!equals(length(constituencies[[i]]), length(urls[[i]]))) {
multiplier2[[i]] <- html_nodes(x = htmls[[i]], xpath = query_multiplier[i]) %>%
html_attrs() %>%
as.numeric %>%
na.replace(1)
constituencies[[i]] <- rep(constituencies[[i]], times = multiplier2[[i]])
}
session_start[[i]] <- rep(duration[[i]][1], times = length(urls[[i]]))
session_end[[i]] <- rep(duration[[i]][2], times = length(urls[[i]]))
services[[i]] <- round(rep(difftime(time1 = session_end[[i]][1], time2 = session_start[[i]][1], units = "days"),
length(urls[[i]])))
services[[i]] <- replace(services[[i]], replace_idx[[i]], replacement[[i]])
if (i %in% c(4:6,8,10,12:21,23:24,27:32)) {
urls[[i]] <- append(urls[[i]], append_urls[[i]])
names[[i]] <- append(names[[i]], append_names[[i]])
parties[[i]] <- append(parties[[i]], append_parties[[i]])
constituencies[[i]] <- append(constituencies[[i]], append_constituencies[[i]])
services[[i]] <- append(services[[i]], append_services[[i]])
session_start[[i]] <- append(session_start[[i]], append_session_start[[i]])
session_end[[i]] <- append(session_end[[i]], append_session_end[[i]])
}
}
urls <- lapply(urls, data.table) %>%
lapply(rename, url = V1)
names <- lapply(names, data.table) %>%
lapply(rename, name = V1)
parties <- lapply(parties, data.table) %>%
lapply(rename, party = V1)
constituencies <- lapply(constituencies, data.table) %>%
lapply(rename, constituency = V1)
services <- lapply(services, data.table) %>%
lapply(rename, service = V1)
session_start <- lapply(session_start, data.table) %>%
lapply(rename, session_start = V1)
session_end <- lapply(session_end, data.table) %>%
lapply(rename, session_end = V1)
urls <- mapply(cbind, urls, names, parties, constituencies,
services, session_start,session_end, SIMPLIFY = FALSE)
urls <- as.numeric(as.roman(str_extract(source, "[[:digit:]]+"))) %>%
mapply(function(urls, x) mutate(urls, term = x), urls, ., SIMPLIFY=FALSE) %>%
mapply(function(., x) mutate(., country = x), ., rep("IRL", times = 33), SIMPLIFY=FALSE)
}
#### FRANCE WIKIPEDIA INFORMATION EXTRACTION ============================================
collectorFrance <- function(source) {
source <- mixedsort(list.files(source, full.names = TRUE), decreasing = TRUE)
duration <- list(c("1958-11-30", "1962-11-25"), c("1962-11-25", "1967-03-12"),
c("1967-03-12", "1968-06-30"), c("1968-06-30", "1973-03-11"),
c("1973-03-11", "1978-03-19"), c("1978-03-19", "1981-06-21"),
c("1981-06-21", "1986-03-16"), c("1986-03-16", "1988-06-12"),
c("1988-06-12", "1993-03-28"), c("1993-03-28", "1997-06-01"),
c("1997-06-01", "2002-06-16"), c("2002-06-16", "2007-06-17"),
c("2007-06-17", "2012-06-17"), c("2012-06-17", "2017-06-18"),
c("2017-06-18", "2022-06-01"))
condition <- str_c("[", str_c("@id = '", LETTERS, "' or ", collapse = ""),
"@id = 'Liste_des_d.C3.A9put.C3.A9s']")
query <- c(str_c("//h2/span", condition,
"/ancestor::h2/following-sibling::table[1]/tbody/tr/td[2]/a"),
str_c("//h2/span", condition,
"/ancestor::h2/following-sibling::table[1]/tbody/tr/td[1]/a"),
str_c("//h2/span", str_replace(condition, " or @id = 'U'", ""),
"/ancestor::h2/following-sibling::table[1]/tbody/tr/td[1]/a"),
rep(str_c("//h2/span", condition,
"/ancestor::h2/following-sibling::table[1]/tbody/tr/td[1]/a"),
times = 10),
str_c("//h2/span", condition,
"/ancestor::h2/following-sibling::table[1]/tbody/tr/td[1]/a|//h2/span", condition,
"/ancestor::h2/following-sibling::table[1]/tbody/tr/td[1]/s/a"),
str_c("//h2/span", condition,
"/ancestor::h2/following-sibling::table[1]/tbody/tr/td[2]/a|//h2/span", condition,
"/ancestor::h2/following-sibling::table[1]/tbody/tr/td[2]/s/a"))
query_party <- c(rep(str_c("//h2/span", condition,
"/ancestor::h2/following-sibling::table[1]/tbody/tr/td[3]"),
times = 13),
str_c("//h2/span", condition,
"/ancestor::h2/following-sibling::table[1]/tbody/tr/td[4]/a"),
str_c("//h2/span", condition,
"/ancestor::h2/following-sibling::table[1]/tbody/tr/td[9]/a[1]"))
query_constituency <- c(rep(str_c("//h2/span", condition,
"/ancestor::h2/following-sibling::table[1]/tbody/tr/td[4]"),
times = 13),
str_c("//h2/span", condition,
"/ancestor::h2/following-sibling::table[1]/tbody/tr/td[2]"),
str_c("//h2/span", condition,
"/ancestor::h2/following-sibling::table[1]/tbody/tr/td[3]"))
# assign empty lists to store session speficic htmls
htmls <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic legislator URLs
urls <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic legislator names
names <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic legislator party affiliations
parties <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic legislator constituencies
constituencies <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic beginning dates
session_start <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic ending dates
session_end <- rep(list(NA), times = length(source))
for (i in 1:length(htmls)) {
# read html file
htmls[[i]] <- read_html(source[i], encoding = "UTF-8")
# locate legislator biography URLs in html via XPath query, extract and complete URL
urls[[i]] <- html_nodes(x = htmls[[i]], xpath = query[i]) %>%
html_attr("href") %>%
str_c("https://", "fr",".wikipedia.org", .)
# locate legislator names in html via query, extract names
names[[i]] <- html_nodes(x = htmls[[i]], xpath = query[i]) %>%
html_text()
# locate legislator party affiliations in html via query_party, extract and format party affiliation
parties[[i]] <- html_nodes(x = htmls[[i]], xpath = query_party[i]) %>%
html_text() %>%
str_replace_all("\\r|\\n", "")
constituencies[[i]] <- html_nodes(x = htmls[[i]], xpath = query_constituency[i]) %>%
html_text() %>%
str_replace_all("\\r|\\n", "")
session_start[[i]] <- duration[[i]][1]
session_end[[i]] <- duration[[i]][2]
}
urls <- lapply(urls, data.table) %>%
lapply(rename, url = V1)
names <- lapply(names, data.table) %>%
lapply(rename, name = V1)
parties <- lapply(parties, data.table) %>%
lapply(rename, party = V1)
constituencies <- lapply(constituencies, data.table) %>%
lapply(rename, constituency = V1)
session_start <- lapply(session_start, data.table) %>%
lapply(rename, session_start = V1)
session_end <- lapply(session_end, data.table) %>%
lapply(rename, session_end = V1)
urls <- mapply(cbind, urls, names, parties, constituencies,
session_start, session_end, SIMPLIFY = FALSE)
urls <- as.numeric(as.roman(str_extract(source, "[[:digit:]]+"))) %>%
mapply(function(urls, x) mutate(urls, term = x), urls, ., SIMPLIFY=FALSE) %>%
mapply(function(., x) mutate(., country = x), ., rep("FRA", times = 15), SIMPLIFY=FALSE)
}
#### USA WIKIPEDIA INFORMATION EXTRACTION ===============================================
collectorUSA <- function(source,chamber) {
source <- mixedsort(list.files(source, full.names = TRUE), decreasing = TRUE)
if (chamber == "house") {
query_a <- "(//table[@class='multicol'])[2]//ul/li/a[not(contains(@title, 'district'))
and not(contains(@title, 'Party'))
and not(contains(@title, 'Commissioner of Puerto'))
and not(contains(@title, 'at-large'))
and not(contains(@title, 'Independent politician'))
and not(contains(@title, 'Free Silver'))
and not(contains(@title, 'Southwest Territory'))
and not(contains(@title, 'Know Nothing'))
and not(contains(@title, 'Independent Democrat'))
and not(contains(@title, 'Independent (politics)'))
and not(contains(@title, 'Independent (United States)'))
and not(contains(@title, 'Independent (politician)'))
and not(contains(@title, 'congressional delegations'))"
query_b <- " and not(contains(@title, 'Tennessee'))"
query_c <- "(//table[@class='multicol'])[2]//ul/li/dl/dd/a[not(contains(@title, 'district'))
and not(contains(@title, 'Party'))
and not(contains(@title, 'Commissioner of Puerto'))
and not(contains(@title, 'at-large'))
and not(contains(@title, 'Independent politician'))
and not(contains(@title, 'Free Silver'))
and not(contains(@title, 'Southwest Territory'))
and not(contains(@title, 'Know Nothing'))
and not(contains(@title, 'Independent Democrat'))
and not(contains(@title, 'Independent (politics)'))
and not(contains(@title, 'Independent (United States)'))
and not(contains(@title, 'Independent (politician)'))
and not(contains(@title, 'congressional delegations'))"
query_d <- " and not(contains(@title, 'William Wilkins'))"
query_e <- " and not(contains(@title, 'States Rights party'))"
query_f <- " and not(contains(@title, 'States-Rights Whig'))"
query_g <- str_replace(query_a, "@class='multicol'", "@width='75%'")
query_h <- str_replace(query_c, "@class='multicol'", "@width='75%'")
query <- c(rep(str_c(query_a, query_b, "]|", query_c, query_b, "]"), times = 16),
rep(str_c(query_a, "]|", query_c, "]"), times = 4),
rep(str_c(query_a, query_d, "]|", query_c, query_d, "]"), times = 1),
rep(str_c(query_a, "]|", query_c, "]"), times = 2),
rep(str_c(query_a, query_e, "]|", query_c, query_e, "]"), times = 1),
rep(str_c(query_a, "]|", query_c, "]"), times = 2),
rep(str_c(query_a, query_f, "]|", query_c, query_f, "]"), times = 1),
rep(str_c(query_a, "]|", query_c, "]"), times = 63),
str_c(query_g, "]|", query_h, "]"),
rep(str_c(query_a, "]|", query_c, "]"), times = 25)
)
query_constituency <- c(rep("(//table[@class='multicol'])[2]//ul/li/a[1]", 90),
"(//table[@width='75%'])[2]//ul/li/a[1]",
rep("(//table[@class='multicol'])[2]//ul/li/a[1]", 25))
query_party <- c(rep("(//table[@class='multicol'])[2]//ul/li", 90),
"(//table[@width='75%'])[2]//ul/li",
rep("(//table[@class='multicol'])[2]//ul/li", 25))
regex_exception1 <- "\n.+|\\(Resident Commissioner\\)|\\(.Southwest Territory.+?\\)|\\(politician\\)"
regex_exception2 <- ".acant(,)? (until|to|admitted|moved)|^Vacant.+thereafter$|newly admitted|2\\. and 3\\. Joint|.acant(,)? from|^Vacant|Vacant$|^4\\. vacant"
regex_exception3 <- "^Vacant|^[[:digit:]]([[:digit:]])?(\\.|\\:) Vacant|At-large..(v|V)acant|2. and 3. Joint district with two seats\\.|, vacant until|\\. vacant to|(\\.|\\:) Vacant until|^[[:digit:]]([[:digit:]])?(\\.|\\:) vacant"
query_ttime <- "//table[@class = 'infobox vevent']/tbody/tr[4]"
}
if (chamber == "senate") {
query <- c(rep("(//table[@class='multicol'])[1]//ul/li/a[1]", 42),
"(//table[@class='multicol'])[1]//ul/li/a[not(contains(@title, 'Senatorial'))
and not(contains(@title, 'Liberal'))]",
rep("(//table[@class='multicol'])[1]//ul/li/a[1]", 7),
"(//table[@class='multicol'])[1]//ul/li/a[not(contains(@title, 'Senatorial'))]",
rep("(//table[@class='multicol'])[1]//ul/li/a[1]", 39),
"//table[2]//ul/li/a[1]",
rep("(//table[@class='multicol'])[1]//ul/li/a[1]", 25))
query_constituency <- c(rep("(//table[@class='multicol'])[1]//ul/li", 90),
"//table[2]//ul/li",
rep("(//table[@class='multicol'])[1]//ul/li", 25))
query_party <- c(rep("(//table[@class='multicol'])[1]//ul/li", times = 90), "//table[2]//ul/li",
rep("(//table[@class='multicol'])[1]//ul/li", times = 25))
regex_exception1 <- "\n.+|\\(Resident Commissioner\\)|\\(.Southwest Territory.+?\\)|\\(politician\\)| until December 3, 1928.+"
regex_exception2 <- ".acant(,)? (until|to|admitted|moved)|^Vacant.+thereafter$|newly admitted|2\\. and 3\\. Joint|.acant(,)? from|^Vacant|Vacant$|(\\.|\\:) .acant"
regex_exception3 <- "^Vacant|^[[:digit:]]([[:digit:]])?(\\.|\\:) Vacant|At-large..(v|V)acant|2. and 3. Joint district with two seats\\.|, vacant until|\\. vacant to|(\\.|\\:) Vacant until|^[[:digit:]]([[:digit:]])?(\\.|\\:) vacant"
regex_constituency <- c(rep("h4/span[1]/a", times = 12), "h4/span[1]",
rep("h4/span[1]/a", times = 76), "h4/span[1]", "h4/span[1]",
rep("h4/span[1]/a", times = 25))
query_ttime <- "//table[@class = 'infobox vevent']/tbody/tr[4]"
}
# assign empty lists to store session speficic htmls
htmls <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic legislator URLs
urls <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic legislator names
names <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic legislator party affiliations
parties <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic legislator constituencies
constituencies <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic legislator beginning of services
beginnings <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic legislator end of services
endings <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic legislator service in days
services <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic beginning dates
session_start <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic ending dates
session_end <- rep(list(NA), times = length(source))
vacant <- rep(list(NA), times = length(source))
term_times <- rep(list(NA), times = length(source))
replace_idx <- c(rep(list(0), times = 4),list(c(86)),rep(list(0), times = 8),list(c(35,82)),
rep(list(0), times = 2),list(c(27)),list(c(64,113)),list(c(0)),list(c(11)),
list(c(156,158,209)),list(c(13)),rep(list(0), times = 8),list(c(223)))
replacement <- c(rep(list(0), times = 4),list(c(178)),rep(list(0), times = 8),list(c(0,275)),
rep(list(0), times = 2),list(c(0)),list(c(461,48)),list(c(0)),list(c(0)),
list(c(0,0,327)),list(c(0)),rep(list(0), times = 8),list(c(274)))
replace_idx2 <- c(list(c(0)),list(c(23)),list(c(0)),list(c(7)),rep(list(0), times = 47),
list(c(14)),rep(list(0), times = 62),list(c(2)))
replacement2 <- c(list(c(0)),list(c(0)),list(c(0)),list(c(257)),rep(list(0), times = 47),
list(c(648)),rep(list(0), times = 62),list(c(328)))
for (i in 1:116) {
# read html file
htmls[[i]] <- read_html(source[i], encoding = "UTF-8")
# locate legislator biography URLs in html via XPath query, extract and complete URL
urls[[i]] <- html_nodes(x = htmls[[i]], xpath = query[i]) %>%
html_attr("href") %>%
str_c("https://", "en",".wikipedia.org", .)
# locate legislator names in html via query, extract names
names[[i]] <- html_nodes(x = htmls[[i]], xpath = query[i]) %>%
html_text()
parties[[i]] <- html_nodes(x = htmls[[i]], xpath = query_party[i]) %>%
html_text() %>%
str_replace_all(regex_exception1, "") %>%
str_replace_all("^.+?(?=\\()|(?<=\\)).+", "") %>%
str_replace_all("\\(|\\)|\n|\r", "") %>%
extract(!str_detect(., regex_exception2))
if (chamber == "house") {
vacant[[i]] <- html_nodes(x = htmls[[i]], xpath = query_party[i]) %>%
html_text() %>%
str_replace_all(regex_exception1, "") %>%
str_replace_all("^.+?(?=\\()|(?<=\\)).+", "") %>%
str_replace_all("\\(|\\)|\n", "") %>%
str_detect(., regex_exception2)
constituencies[[i]] <- html_nodes(x = htmls[[i]], xpath = query_constituency[i]) %>%
html_attr("title") %>%
replace(., !str_detect(., "district|At-large"), NA) %>%
na.locf()
if (equals(length(vacant[[i]]), length(constituencies[[i]]))) {
constituencies[[i]] <- constituencies[[i]] %>%
extract(!vacant[[i]])
}
}
if (chamber == "senate") {
vacant[[i]] <- html_nodes(x = htmls[[i]], xpath = query_party[i]) %>%
html_text() %>%
str_replace_all(regex_exception1, "") %>%
str_replace_all("^.+?(?=\\()|(?<=\\)).+", "") %>%
str_replace_all("\\(|\\)|\n", "") %>%
str_detect(., regex_exception2)
constituency <- html_nodes(x = htmls[[i]], xpath = str_replace(query_constituency[i], "ul/li", regex_constituency[i])) %>%
html_text()
replicate <- html_nodes(x = htmls[[i]], xpath = query_constituency[i]) %>%
html_text %>%
str_extract("[[:digit:]](?=\\.|\\:)") %>%
as.numeric
constituencies[[i]] <- replace(replicate, which(!is.na(replicate))[-seq(from = 2,
to = length(which(!is.na(replicate))), by = 2)],
constituency) %>%
replace(., str_detect(., "[[:digit:]]"), NA) %>%
na.locf() %>%
extract(!vacant[[i]])
}
term_times[[i]] <- html_nodes(x = htmls[[i]], xpath = query_ttime) %>%
html_text() %>%
str_replace_all(".\\(.+?\\)", "") %>%
str_split(pattern = "(?<=[[:digit:]]) ", n = 2) %>%
unlist %>%
str_replace_all(pattern = "^.+(?=[[:upper:]])|\\n", replacement = "")
beginnings[[i]] <- html_nodes(x = htmls[[i]], xpath = query_party[i]) %>%
html_text() %>%
str_replace_all("\n.+|\n", "") %>%
extract(!str_detect(., regex_exception3)) %>%
str_replace_all("([^[[:digit:]]]+$)|\\[.(.\\])?|, changed.+", "") %>%
replace(.,list = !str_detect(., "[[:digit:]]{4}"), "") %>%
str_replace_all("(?<=[[:digit:]]{4} ).+| until.+| resigned.+| died.+|.\\?\\?(\\?)?(\\?)?", "") %>%
str_extract("[[:upper:]][[:alpha:]]+ [[:digit:]]([[:digit:]])?,.+") %>%
na.replace(term_times[[i]][1]) %>%
mdy()
endings[[i]] <- html_nodes(x = htmls[[i]], xpath = query_party[i]) %>%
html_text() %>%
str_replace_all("\n.+|\n", "") %>%
extract(!str_detect(., regex_exception3)) %>%
str_replace(" - End", str_c(" - ", term_times[[i]][2])) %>%
str_replace_all("([^[[:digit:]]]+$)| before April 6|, changed.+", "") %>%
replace(.,list = !str_detect(., "[[:digit:]]{4}"), "") %>%
str_replace_all("^.+[[:digit:]]{4} |( )?from.+?[[:digit:]]{4}|( )?seated.+|.\\?\\?(\\?)?(\\?)?(,)?", "") %>%
str_extract("([[:upper:]][[:alpha:]]+)?(,)? ([[:digit:]]([[:digit:]])?, )?[[:digit:]]{4}") %>%
str_trim %>%
str_replace("^(?=[[:digit:]]{4})", "January 1, ") %>%
str_replace("(?<=[[:alpha:]])((,)? )(?=[[:digit:]]{4})", " 1, ") %>%
na.replace(term_times[[i]][2]) %>%
mdy()
services[[i]] <- difftime(time1 = endings[[i]], time2 = beginnings[[i]], units = "days")
if (i %in% c(5,14,17,18,20:22,31) & chamber == "house") {
services[[i]] <- replace(services[[i]], replace_idx[[i]], replacement[[i]])
}
if (i %in% c(2,4,52,115) & chamber == "senate") {
services[[i]] <- replace(services[[i]], replace_idx2[[i]], replacement2[[i]])
}
session_start[[i]] <- term_times[[i]][1]
session_end[[i]] <- term_times[[i]][2]
}
urls <- lapply(urls, data.table) %>%
lapply(rename, url = V1)
names <- lapply(names, data.table) %>%
lapply(rename, name = V1)
parties <- lapply(parties, data.table) %>%
lapply(rename, party = V1)
constituencies <- lapply(constituencies, data.table) %>%
lapply(rename, constituency = V1)
services <- lapply(services, data.table) %>%
lapply(rename, service = V1)
session_start <- lapply(session_start, data.table) %>%
lapply(rename, session_start = V1)
session_end <- lapply(session_end, data.table) %>%
lapply(rename, session_end = V1)
urls <- mapply(cbind, urls, names, parties, constituencies,
services, session_start, session_end, SIMPLIFY = FALSE)
urls <- as.numeric(as.roman(str_extract(source, "[[:digit:]]+"))) %>%
mapply(function(urls, x) mutate(urls, term = x), urls, ., SIMPLIFY=FALSE) %>%
mapply(function(., x) mutate(., country = x), ., rep("USA-H", times = 116), SIMPLIFY=FALSE)
}
#### CZECH REPUBLIC WIKIPEDIA INFORMATION EXTRACTION ====================================
collectorCzech <- function(source,chamber) {
source <- mixedsort(list.files(source, full.names = TRUE), decreasing = TRUE)
duration <- list(c("1992-06-06","1996-06-01"),c("1996-06-01","1998-06-20"),
c("1998-06-20","2002-06-15"),c("2002-06-15","2006-06-03"),
c("2006-06-03","2010-05-29"), c("2010-05-29", "2013-10-26"),
c("2013-10-26", "2017-10-21"), c("2017-10-21", "2021-10-21"))
query <- c("//h2/span[contains(text(), 'seznam')]/ancestor::h2/following-sibling::ul/li/a[1]",
"//h2/span[contains(text(), 'seznam')]/ancestor::h2/following-sibling::ul/li/a[1]",
"//h2/span[contains(text(), 'seznam')]/ancestor::h2/following-sibling::ul/li/a[1]",
"//h2/span[contains(text(), 'Seznam')]/ancestor::h2/following-sibling::table/tbody/tr/td[1]/b/a[1]",
"//h2/span[contains(text(), 'Seznam')]/ancestor::h2/following-sibling::table/tbody/tr/td[1]//a[1]",
"//h2/span[contains(text(), 'Seznam')]/ancestor::h2/following-sibling::table[2]/tbody/tr/td[1]/a[1]|
//h2/span[contains(text(), 'Seznam')]/ancestor::h2/following-sibling::table[2]/tbody/tr/td[1]/span[@class = 'vcard fn']/a[1]",
rep("//h2/span[contains(text(), 'Seznam')]/ancestor::h2/following-sibling::table[1]/tbody/tr/td[1]/a[1]|
//h2/span[contains(text(), 'Seznam')]/ancestor::h2/following-sibling::table[1]/tbody/tr/td[1]/span[@class = 'vcard fn']/a[1]",
times = 2))
query_party <- c("//h2/span[contains(text(), 'seznam')]/ancestor::h2/following-sibling::ul/li/a[2]","","",
"//h2/span[contains(text(), 'Seznam')]/ancestor::h2/following-sibling::table/tbody/tr/td[3]/a",
"",
"//h2/span[contains(text(), 'Seznam')]/ancestor::h2/following-sibling::table[2]/tbody/tr/td[2]",
"//h2/span[contains(text(), 'Seznam')]/ancestor::h2/following-sibling::table[1]/tbody/tr/td[2]//a",
"//h2/span[contains(text(), 'Seznam')]/ancestor::h2/following-sibling::table[1]/tbody/tr/td[2]")
query_constituency <- c("","","",
"//h2/span[contains(text(), 'Seznam')]/ancestor::h2/following-sibling::table/tbody/tr/td[6]/a",
"",
"//h2/span[contains(text(), 'Seznam')]/ancestor::h2/following-sibling::table[2]/tbody/tr/td[4]",
rep("//h2/span[contains(text(), 'Seznam')]/ancestor::h2/following-sibling::table[1]/tbody/tr/td[4]",
times = 2))
# assign empty lists to store session speficic htmls
htmls <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic legislator URLs
urls <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic legislator names
names <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic legislator party affiliations
parties <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic legislator constituencies
constituencies <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic beginning dates
session_start <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic ending dates
session_end <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic legislator service in days
services <- rep(list(NA), times = length(source))
# assign session specific manual replacement index for services
replace_idx <- c(list(c(0)),
list(c(0)),
list(c(0)),
list(c(5,7,11,15,18,22,41,83,120,141,158,161,228,236,25,33,39,44,49,55,70,71,84,86,89,
95,100,111,116,119,121,129,151,152,155,160,162,164,171,173,196,201,216,219,220,
221,224,231)),
list(c(0)),
list(c(13,15,19,24,33,42,50,51,52,63,66,76,77,81,86,94,101,106,107,110,119,142,144,145,146,174,177,186,187,188,189,196,218)),
list(c(10,23,26,50,52,58,77,79,80,94,100,113,137,138,143,146,148,149,150,164,167,174,176,179,190,194,212)),
list(c(32,60,88,115,133,137,138,144,147,152,166,168,172,188)))
# assign session specific manual replacement for services
replacement <- c(list(c(0)),
list(c(0)),
list(c(0)),
list(c(695,695,754,1149,13,766,766,766,766,766,766,766,766,766,58,0,832,695,
142,326,1055,149,771,225,707,653,188,195,1366,632,146,632,631,6,695,
695,1196,312,1109,695,695,695,695,1105,623,188,996,107)),
list(c(0)),
list(c(131,352,352,875,1,875,185,893,276,367,1048,619,267,875,203,184,367,1042,969,875,197,300,352,626,1060,1098,944,147,367,1114,893,893,367)),
list(c(32,1235,1299,1269,1047,186,1322,111,360,247,1417,1208,306,290,1208,408,1165,247,237,220,6,1149,1095,1344,133,1218,156)),
list(c(1219,34,409,1426,1426,241,1051,34,1426,34,1299,161,345,1115)))
# assign removal index
for (i in 1:length(htmls)) {
# read html file
htmls[[i]] <- read_html(source[i], encoding = "UTF-8")
# locate legislator biography URLs in html via XPath query, extract and complete URL
urls[[i]] <- html_nodes(x = htmls[[i]], xpath = query[i]) %>%
html_attr("href") %>%
str_c("https://", "cs",".wikipedia.org", .)
# locate legislator names in html via query, extract names
names[[i]] <- html_nodes(x = htmls[[i]], xpath = query[i]) %>%
html_text() %>%
str_squish()
session_start[[i]] <- duration[[i]][1]
session_end[[i]] <- duration[[i]][2]
if (i %in% c(4,6:8)) {
# locate legislator party affiliations in html via query_party, extract and format party affiliation
parties[[i]] <- html_nodes(x = htmls[[i]], xpath = query_party[i]) %>%
html_text() %>%
str_replace_all("\\r|\\n", "")
if (i %in% 6) { # check whether this ndex is correct
parties[[i]] <- str_replace(parties[[i]], ".+(?= [[:alpha:]])", "") %>%
str_squish()
}
constituencies[[i]] <- html_nodes(x = htmls[[i]], xpath = query_constituency[i]) %>%
html_text() %>%
str_replace_all("\\r|\\n", "")
services[[i]] <- round(rep(difftime(time1 = session_end[[i]][1], time2 = session_start[[i]][1], units = "days"),
length(urls[[i]])))
services[[i]] <- replace(services[[i]], replace_idx[[i]], replacement[[i]])
services[[i]] <- as.numeric(services[[i]])
}
if (i %in% c(1,2,3,5)) {
if (i %in% 5) {
data2 <- data.frame(urls = urls[[i]], names = names[[i]], nameso = names[[i]], stringsAsFactors = FALSE)
data2$names <- str_c(str_replace(data2$names, ".+? ", ""), " ", str_extract(data2$names, ".+? ")) %>%
str_replace(" \\(politik\\)", "")
data2 <- arrange(data2, names)
redist <- data2[c(29,30,58,88,99,160,162,163,166,168,169,170,173,174,175,178,179,182,188,189,191,192,193,198),]
data2 <- data2[-c(4,29,30,33,58,69,80,88,99,120,140,160,162,163,166,168,169,170,171,173,174,175,178,179,182,184,188,189,191,192,193,197,198,210,225,232,234),]
rownames(data2) <- seq(length=nrow(data2))
data2 <- rbind(data2[1:54,],redist[3,],data2[55:62,],redist[1:2,],data2[63:81,],redist[4,],data2[82:92,],redist[5,],data2[93:151,],
redist[6:8,],data2[152:153,],redist[9,],data2[154:168,],redist[10:24,],data2[169:197,])
urls[[i]] <- data2$urls
names[[i]] <- data2$nameso
}
if (i %in% c(1,2,3)) {
urls[[i]] <- urls[[i]][-length(urls[[i]])]
names[[i]] <- names[[i]][-length(names[[i]])]
}
if (i %in% 1) {
urls[[i]] <- urls[[i]][-c(57,64)]
names[[i]] <- names[[i]][-c(57,64)]
parties[[i]] <- html_nodes(x = htmls[[i]], xpath = query_party[i]) %>%
html_text() %>%
extract(-c(57,64))
}
htmls2 <- lapply(mixedsort(list.files(str_c("./data/htmls/czech_official/",i,"/"), full.names = TRUE)), read_html)
dates <- lapply(htmls2, html_nodes, xpath = "(//li[@class='previous'])[1]") %>%
lapply(., html_text) %>% unlist %>% str_squish
beginnings <- dates %>% str_replace(".+?od", "") %>% str_split("do") %>% lapply(`[`,1) %>% unlist %>% str_replace_all(" ", "") %>% dmy
endings <- dates %>% str_replace(".+?od", "") %>% str_split("do") %>% lapply(`[`,2) %>% unlist %>% str_replace_all(" ", "") %>% dmy
services[[i]] <- as.numeric(round(difftime(time1 = endings, time2 = beginnings, units = "days")))
parcon <- lapply(htmls2, html_nodes, xpath = "//div[@class = 'figcaption']/p/text()[preceding-sibling::br]") %>% lapply(html_text)
constituencies[[i]] <- lapply(parcon, `[`, 1) %>% unlist %>% str_squish %>% str_replace(".+?: ", "")
if (i %in% c(2,3,5)) {
parties[[i]] <- lapply(parcon, `[`, 2) %>% unlist %>% str_squish %>% str_replace(".+?: ", "")
}
}
}
urls <- lapply(urls, data.table) %>%
lapply(rename, url = V1)
names <- lapply(names, data.table) %>%
lapply(rename, name = V1)
parties <- lapply(parties, data.table) %>%
lapply(rename, party = V1)
constituencies <- lapply(constituencies, data.table) %>%
lapply(rename, constituency = V1)
services <- lapply(services, data.table) %>%
lapply(rename, service = V1)
session_start <- lapply(session_start, data.table) %>%
lapply(rename, session_start = V1)
session_end <- lapply(session_end, data.table) %>%
lapply(rename, session_end = V1)
urls <- mapply(cbind, urls, names, parties, constituencies,
services, session_start,
session_end, SIMPLIFY = FALSE)
urls <- as.numeric(as.roman(str_extract(source, "[[:digit:]]+"))) %>%
mapply(function(urls, x) mutate(urls, term = x), urls, ., SIMPLIFY=FALSE) %>%
mapply(function(., x) mutate(., country = x), ., rep("CZE", times = 8), SIMPLIFY=FALSE)
}
#### SCOTLAND WIKIPEDIA INFORMATION EXTRACTION ==========================================
collectorScotland <- function(source) {
source <- mixedsort(list.files(source, full.names = TRUE), decreasing = TRUE)
duration <- list(c("1999-05-06","2003-05-01"),c("2003-05-01","2007-05-03"),
c("2007-05-03","2011-05-05"),c("2011-05-05","2016-05-05"),
c("2016-05-05","2021-05-06"))
query <- rep("//h2/span[contains(text(), 'MSPs')]/ancestor::h2/following-sibling::table[2]/tbody/tr/td[1]//a[1]|
//h3/span[contains(text(), 'Former MSPs')]/ancestor::h3/following-sibling::table[1]/tbody/tr/td[1]//a[1]",
times = 5)
query_party <- rep("//h2/span[contains(text(), 'MSPs')]/ancestor::h2/following-sibling::table[2]/tbody/tr/td[5]//a[1]|
//h3/span[contains(text(), 'Former MSPs')]/ancestor::h3/following-sibling::table[1]/tbody/tr/td[5]//a[1]",
times = 5)
query_constituency <- rep("//h2/span[contains(text(), 'MSPs')]/ancestor::h2/following-sibling::table[2]/tbody/tr/td[3]//a[1]|
//h3/span[contains(text(), 'Former MSPs')]/ancestor::h3/following-sibling::table[1]/tbody/tr/td[3]//a[1]",
times = 5)
query_constituency2 <- rep("//h2/span[contains(text(), 'MSPs')]/ancestor::h2/following-sibling::table[2]/tbody/tr/td[4]|
//h3/span[contains(text(), 'Former MSPs')]/ancestor::h3/following-sibling::table[1]/tbody/tr/td[4]",
times = 5)
# assign empty lists to store session speficic htmls
htmls <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic legislator URLs
urls <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic legislator names
names <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic legislator party affiliations
parties <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic legislator constituencies
constituencies <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic legislator constituencies2
constituencies2 <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic beginning dates
session_start <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic ending dates
session_end <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic legislator service in days
services <- rep(list(NA), times = length(source))
# assign session specific manual replacement index for services
replace_idx <- c(list(c(133,105,130,9,131,134,114,29,132,30)),list(c(131,4,132,15,130,42,133,62,125,134,100)),
list(c(130,114,131,80)),list(c(129,7,130,84,4,131,17,132,54,133,106,134)),
list(c(130,12,131,8,132,53,133,85)))
# assign session specific manual replacement for services
replacement <- c(list(c(229,1140,524,888,739,739,692,692,827,628)),list(c(620,842,778,684,854,580,1055,1443,390,1072,390)),
list(c(120,1342,645,817)),list(c(582,1244,721,1791,1086,830,996,856,970,919,907,1065)),
list(c(216,1610,364,1462,404,1422,404,1422)))
# loop through all html files
for (i in 1:length(htmls)) {
# read html file
htmls[[i]] <- read_html(source[i], encoding = "UTF-8")
# locate legislator biography URLs in html via XPath query, extract and complete URL
urls[[i]] <- html_nodes(x = htmls[[i]], xpath = query[i]) %>%
html_attr("href") %>%
str_c("https://", "en",".wikipedia.org", .)
# locate legislator names in html via query, extract names
names[[i]] <- html_nodes(x = htmls[[i]], xpath = query[i]) %>%
html_text()
# locate legislator party affiliations in html via query_party, extract and format party affiliation
parties[[i]] <- html_nodes(x = htmls[[i]], xpath = query_party[i]) %>%
html_text() %>%
str_replace_all("\\r|\\n", "")
constituencies[[i]] <- html_nodes(x = htmls[[i]], xpath = query_constituency[i]) %>%
html_text() %>%
str_replace_all("\\r|\\n", "")
constituencies2[[i]] <- html_nodes(x = htmls[[i]], xpath = query_constituency2[i]) %>%
html_text() %>%
str_replace_all("\\r|\\n", "")
session_start[[i]] <- duration[[i]][1]
session_end[[i]] <- duration[[i]][2]
services[[i]] <- round(rep(difftime(time1 = session_end[[i]][1], time2 = session_start[[i]][1], units = "days"),
length(urls[[i]])))
services[[i]] <- replace(services[[i]], replace_idx[[i]], replacement[[i]])
}
urls <- lapply(urls, data.table) %>%
lapply(rename, url = V1)
names <- lapply(names, data.table) %>%
lapply(rename, name = V1)
parties <- lapply(parties, data.table) %>%
lapply(rename, party = V1)
constituencies <- lapply(constituencies, data.table) %>%
lapply(rename, constituency = V1)
constituencies2 <- lapply(constituencies2, data.table) %>%
lapply(rename, constituency2 = V1)
services <- lapply(services, data.table) %>%
lapply(rename, service = V1)
session_start <- lapply(session_start, data.table) %>%
lapply(rename, session_start = V1)
session_end <- lapply(session_end, data.table) %>%
lapply(rename, session_end = V1)
urls <- mapply(cbind, urls, names, parties, constituencies,
constituencies2, services, session_start,
session_end, SIMPLIFY = FALSE)
urls <- as.numeric(as.roman(str_extract(source, "[[:digit:]]+"))) %>%
mapply(function(urls, x) mutate(urls, term = x), urls, ., SIMPLIFY=FALSE) %>%
mapply(function(., x) mutate(., country = x), ., rep("SCO", times = 5), SIMPLIFY=FALSE)
}
#### UNITED KINGDOM WIKIPEDIA INFORMATION EXTRACTION ====================================
collectorUk <- function(source) {
source <- mixedsort(list.files(source, full.names = TRUE), decreasing = TRUE)
duration <- list(c("1801-01-01", "1802-07-05"), c("1802-07-05", "1806-10-29"),
c("1806-10-29", "1807-05-04"), c("1807-05-04", "1812-10-05"),
c("1812-10-05", "1818-08-04"), c("1818-08-04", "1820-03-06"),
c("1820-03-06", "1826-06-07"), c("1826-06-07", "1830-07-29"),
c("1830-07-29", "1831-04-28"), c("1831-04-28", "1832-12-08"),
c("1832-12-08", "1835-01-06"),
c("1852-07-07", "1857-03-27"), c("1857-03-27", "1859-04-28"),
c("1868-11-17", "1874-01-31"), c("1874-01-31", "1880-03-31"),
c("1880-03-31", "1885-11-24"), c("1885-11-24", "1886-07-01"),
c("1886-07-01", "1892-07-04"), c("1892-07-04", "1895-07-13"),
c("1895-07-13", "1900-09-26"), c("1900-09-26", "1906-01-12"),
c("1906-01-12", "1910-01-15"), c("1910-01-15", "1910-12-03"),
c("1910-12-03", "1918-12-14"), c("1918-12-14", "1922-11-15"),
c("1922-11-15", "1923-12-06"), c("1923-12-06", "1924-10-29"),
c("1924-10-29", "1929-05-30"), c("1929-05-30", "1931-10-27"),
c("1931-10-27", "1935-11-14"), c("1935-11-14", "1945-07-05"),
c("1945-07-05","1950-02-23"), c("1950-02-23","1951-10-25"),
c("1951-10-25","1955-05-26"), c("1955-05-26","1959-10-08"),
c("1959-10-08","1964-10-15"), c("1964-10-15","1966-03-31"),
c("1966-03-31","1970-06-18"), c("1970-06-18","1974-02-28"),
c("1974-02-28","1974-10-10"), c("1974-10-10","1979-05-03"),
c("1979-05-03","1983-06-09"), c("1983-06-09","1987-06-11"),
c("1987-06-11","1992-04-09"), c("1992-04-09","1997-05-01"),
c("1997-05-01","2001-06-07"), c("2001-06-07","2005-05-05"),
c("2005-05-05","2010-05-06"), c("2010-05-06","2015-05-07"),
c("2015-05-07","2017-06-08"), c("2017-06-08","2019-12-12"),
c("2019-12-12","2024-05-02"))
query <- c("", "", "", "", "", "", "", "", "", "",
"//h2/span[contains(text(), 'MPs')]/ancestor::h2/following-sibling::table[2]/tbody/tr/td[7]/b/span/a[1]|
//h2/span[contains(text(), 'MPs')]/ancestor::h2/following-sibling::table[2]/tbody/tr/td[7]/b/a[1]",
"//table[@class = 'wikitable']/tbody/tr/td[2]/a[1][not(contains(., 'Independent'))]|
//table[@class = 'wikitable']/tbody/tr/td[2]/i/a[1]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='2']/ancestor::tr/following-sibling::tr[1]/td[1]//a[1]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='3']/ancestor::tr/following-sibling::tr[1]/td[1]//a[1]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='3']/ancestor::tr/following-sibling::tr[2]/td[1]//a[1]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='4']/ancestor::tr/following-sibling::tr[1]/td[1]//a[1]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='4']/ancestor::tr/following-sibling::tr[2]/td[1]//a[1]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='4']/ancestor::tr/following-sibling::tr[3]/td[1]//a[1]",
"//table[@class = 'wikitable']/tbody/tr/td[2]/a[1][not(contains(., 'Independent'))]|
//table[@class = 'wikitable']/tbody/tr/td[2]/i/a[1]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='2']/ancestor::tr/following-sibling::tr[1]/td[1]//a[1]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='3']/ancestor::tr/following-sibling::tr[1]/td[1]//a[1]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='3']/ancestor::tr/following-sibling::tr[2]/td[1]//a[1]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='4']/ancestor::tr/following-sibling::tr[1]/td[1]//a[1]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='4']/ancestor::tr/following-sibling::tr[2]/td[1]//a[1]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='4']/ancestor::tr/following-sibling::tr[3]/td[1]//a[1]",
"//table[@class = 'wikitable']/tbody/tr/td[2]//a[1]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='2']/ancestor::tr/following-sibling::tr[1]/td[1]//a[1]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='3']/ancestor::tr/following-sibling::tr[1]/td[1]//a[1]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='3']/ancestor::tr/following-sibling::tr[2]/td[1]//a[1]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='4']/ancestor::tr/following-sibling::tr[1]/td[1]//a[1]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='4']/ancestor::tr/following-sibling::tr[2]/td[1]//a[1]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='4']/ancestor::tr/following-sibling::tr[3]/td[1]//a[1]",
"//table[@class = 'wikitable']/tbody/tr/td[2]/a[1]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='2']/ancestor::tr/following-sibling::tr[1]/td[1]/a[1]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='3']/ancestor::tr/following-sibling::tr[1]/td[1]/a[1]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='3']/ancestor::tr/following-sibling::tr[2]/td[1]/a[1]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='4']/ancestor::tr/following-sibling::tr[1]/td[1]/a[1]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='4']/ancestor::tr/following-sibling::tr[2]/td[1]/a[1]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='4']/ancestor::tr/following-sibling::tr[3]/td[1]/a[1]",
"//table[@class = 'wikitable']/tbody/tr/td[2]/a[1]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='2']/ancestor::tr/following-sibling::tr[1]/td[1]/a[1]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='3']/ancestor::tr/following-sibling::tr[1]/td[1]/a[1]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='3']/ancestor::tr/following-sibling::tr[2]/td[1]/a[1]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='4']/ancestor::tr/following-sibling::tr[1]/td[1]/a[1]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='4']/ancestor::tr/following-sibling::tr[2]/td[1]/a[1]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='4']/ancestor::tr/following-sibling::tr[3]/td[1]/a[1]",
"//table[@class = 'wikitable']/tbody/tr/td[2]/a[1][not(contains(., 'Party'))] |
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='2']/ancestor::tr/following-sibling::tr[1]/td[1]/a[1]",
"//table[@class = 'wikitable']/tbody/tr/td[2]/a[1][not(contains(., 'Party'))] |
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='2']/ancestor::tr/following-sibling::tr[1]/td[1]/a[1]",
"//table[@class = 'wikitable']/tbody/tr/td[2]/a[1][not(contains(., 'Federation'))] |
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='2']/ancestor::tr/following-sibling::tr[1]/td[1]/a[1]",
"//table[@class = 'wikitable']/tbody/tr/td[2]/a[1][not(contains(., 'Parnellite'))] |
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='2']/ancestor::tr/following-sibling::tr[1]/td[1]/a[1]",
"//table[@class = 'wikitable']/tbody/tr/td[2]/a[1][not(contains(., 'Party'))] |
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='2']/ancestor::tr/following-sibling::tr[1]/td[1]/a[1]",
"//h2/span[contains(text(), 'MPs')]/ancestor::h2/following-sibling::table[1]/tbody/tr/td[2]/a[1][not(contains(., 'Party'))] |
//h2/span[contains(text(), 'MPs')]/ancestor::h2/following-sibling::table[1]/tbody/tr/td[@rowspan='2']/ancestor::tr/following-sibling::tr[1]/td[1]/a[1]",
"//table[@class = 'wikitable']/tbody/tr/td[2]/a[1][not(contains(., 'Party'))] |
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='2']/ancestor::tr/following-sibling::tr[1]/td[1]/a[1]",
"//h2/span[contains(text(), 'MPs')]/ancestor::h2/following-sibling::table[1]/tbody/tr/td[2]//a[1]",
"//table[@class = 'wikitable']/tbody/tr/td[2]/a[1]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='2']/ancestor::tr/following-sibling::tr[1]/td[1]/a[1]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='3']/ancestor::tr/following-sibling::tr[1]/td[1]/a[1]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='3']/ancestor::tr/following-sibling::tr[2]/td[1]/a[1]", # 31 -
"//table[3]/tbody/tr/td[2]/a[1]|
//table[3]/tbody/tr/td[@rowspan='2']/ancestor::tr/following-sibling::tr[1]/td[1]/a[1]|
//table[3]/tbody/tr/td[@rowspan='3']/ancestor::tr/following-sibling::tr[1]/td[1]/a[1]|
//table[3]/tbody/tr/td[@rowspan='3']/ancestor::tr/following-sibling::tr[2]/td[1]/a[1]",
"//table[@class = 'wikitable']/tbody/tr/td[2]/a[1]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='2']/ancestor::tr/following-sibling::tr[1]/td[1]/a[1]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='3']/ancestor::tr/following-sibling::tr[1]/td[1]/a[1]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='3']/ancestor::tr/following-sibling::tr[2]/td[1]/a[1]",
"//table[@class = 'wikitable']/tbody/tr/td[2]/a[1]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='2']/ancestor::tr/following-sibling::tr[1]/td[1]/a[1]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='3']/ancestor::tr/following-sibling::tr[1]/td[1]/a[1]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='3']/ancestor::tr/following-sibling::tr[2]/td[1]/a[1]",
"//table[@class = 'wikitable']/tbody/tr/td[2]/a[1]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='2']/ancestor::tr/following-sibling::tr[1]/td[1]/a[1]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='3']/ancestor::tr/following-sibling::tr[1]/td[1]/a[1]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='3']/ancestor::tr/following-sibling::tr[2]/td[1]/a[1]",
"//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[3]/ancestor::tr/td[2]/a[1]|
//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[@rowspan='2']/ancestor::tr/following-sibling::tr[1]/td[1]/a[1]|
//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[@rowspan='3']/ancestor::tr/following-sibling::tr[1]/td[1]/a[1]|
//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[@rowspan='3']/ancestor::tr/following-sibling::tr[2]/td[1]/a[1]", # 36
"//table[@class = 'wikitable']/tbody/tr/td[2]/a[1]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='2']/ancestor::tr/following-sibling::tr[1]/td[1]/a[1]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='3']/ancestor::tr/following-sibling::tr[1]/td[1]/a[1]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='3']/ancestor::tr/following-sibling::tr[2]/td[1]/a[1]", # 37
"//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[3]/ancestor::tr/td[2]/a[1]|
//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[@rowspan='2']/ancestor::tr/following-sibling::tr[1]/td[1]/a[1]|
//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[@rowspan='3']/ancestor::tr/following-sibling::tr[1]/td[1]/a[1]|
//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[@rowspan='3']/ancestor::tr/following-sibling::tr[2]/td[1]/a[1]",
"//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[2]/a[1]",
"//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[2]/a[1]",
"//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[2]/a[1]",
"//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[2]/a[1]",
"//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[2]/a[1]",
"//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[2]/a[1]",
"//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[2]/a[1]",
"//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[2]/a[1]",
"//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[2]/a[1]",
"//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[2]/a[1]",
"//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[2]/tbody/tr/td[2]/a[1]",
"//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[2]/a[1]",
"//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[2]/a[1]",
"//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[2]/tbody/tr/td[2]/a[1]",
"//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[2]/a[1]",
"//h2/span[contains(text(), 'List of MPs')]/ancestor::h2/following-sibling::table[2]/tbody/tr/td[2]/a[1]",
"//h2/span[contains(text(), 'List of MPs')]/ancestor::h2/following-sibling::table[1]/tbody/tr/td[5]/a[1]",
"//h2/span[contains(text(), 'List of MPs')]/ancestor::h2/following-sibling::table[1]/tbody/tr/td[5]/span//a[1]",
"//h2/span[contains(text(), 'List of MPs')]/ancestor::h2/following-sibling::table[1]/tbody/tr/td[3]/b/a[1]",
"//h2/span[contains(text(), 'List of MPs')]/ancestor::h2/following-sibling::table[1]/tbody/tr/td[3]/b/a[1]")
query_party <- c("", "", "", "", "", "", "", "", "", "",
"//h2/span[contains(text(), 'MPs')]/ancestor::h2/following-sibling::table[2]/tbody/tr/td[8]/a[1]",
"//table[@class = 'wikitable']/tbody/tr/td[3]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='2']/ancestor::tr/following-sibling::tr[1]/td[2]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='3']/ancestor::tr/following-sibling::tr[1]/td[2]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='3']/ancestor::tr/following-sibling::tr[2]/td[2]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='4']/ancestor::tr/following-sibling::tr[1]/td[2]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='4']/ancestor::tr/following-sibling::tr[2]/td[2]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='4']/ancestor::tr/following-sibling::tr[3]/td[2]",
"//table[@class = 'wikitable']/tbody/tr/td[3]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='2']/ancestor::tr/following-sibling::tr[1]/td[2]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='3']/ancestor::tr/following-sibling::tr[1]/td[2]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='3']/ancestor::tr/following-sibling::tr[2]/td[2]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='4']/ancestor::tr/following-sibling::tr[1]/td[2]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='4']/ancestor::tr/following-sibling::tr[2]/td[2]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='4']/ancestor::tr/following-sibling::tr[3]/td[2]",
"//table[@class = 'wikitable']/tbody/tr/td[3]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='2']/ancestor::tr/following-sibling::tr[1]/td[2]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='3']/ancestor::tr/following-sibling::tr[1]/td[2]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='3']/ancestor::tr/following-sibling::tr[2]/td[2]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='4']/ancestor::tr/following-sibling::tr[1]/td[2]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='4']/ancestor::tr/following-sibling::tr[2]/td[2]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='4']/ancestor::tr/following-sibling::tr[3]/td[2]",
"//table[@class = 'wikitable']/tbody/tr/td[3]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='2']/ancestor::tr/following-sibling::tr[1]/td[2]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='3']/ancestor::tr/following-sibling::tr[1]/td[2]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='3']/ancestor::tr/following-sibling::tr[2]/td[2]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='4']/ancestor::tr/following-sibling::tr[1]/td[2]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='4']/ancestor::tr/following-sibling::tr[2]/td[2]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='4']/ancestor::tr/following-sibling::tr[3]/td[2]",
"//table[@class = 'wikitable']/tbody/tr/td[3]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='2']/ancestor::tr/following-sibling::tr[1]/td[2]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='3']/ancestor::tr/following-sibling::tr[1]/td[2]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='3']/ancestor::tr/following-sibling::tr[2]/td[2]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='4']/ancestor::tr/following-sibling::tr[1]/td[2]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='4']/ancestor::tr/following-sibling::tr[2]/td[2]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='4']/ancestor::tr/following-sibling::tr[3]/td[2]",
"//table[@class = 'wikitable']/tbody/tr/td[3]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='2']/ancestor::tr/following-sibling::tr[1]/td[2]",
"//table[@class = 'wikitable']/tbody/tr/td[3]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='2']/ancestor::tr/following-sibling::tr[1]/td[2]",
"//table[@class = 'wikitable']/tbody/tr/td[3]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='2']/ancestor::tr/following-sibling::tr[1]/td[2]",
"//table[@class = 'wikitable']/tbody/tr/td[3]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='2']/ancestor::tr/following-sibling::tr[1]/td[2]",
"//table[@class = 'wikitable']/tbody/tr/td[3]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='2']/ancestor::tr/following-sibling::tr[1]/td[2]",
"//h2/span[contains(text(), 'MPs')]/ancestor::h2/following-sibling::table[1]/tbody/tr/td[3]|
//h2/span[contains(text(), 'MPs')]/ancestor::h2/following-sibling::table[1]/tbody/tr/td[@rowspan='2']/ancestor::tr/following-sibling::tr[1]/td[2]",
"//table[@class = 'wikitable']/tbody/tr/td[3]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='2']/ancestor::tr/following-sibling::tr[1]/td[2]",
"//h2/span[contains(text(), 'MPs')]/ancestor::h2/following-sibling::table[1]/tbody/tr/td[3]",
"//table[@class = 'wikitable']/tbody/tr/td[3]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='2']/ancestor::tr/following-sibling::tr[1]/td[2]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='3']/ancestor::tr/following-sibling::tr[1]/td[2]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='3']/ancestor::tr/following-sibling::tr[2]/td[2]",
"//table[3]/tbody/tr/td[3]|
//table[3]/tbody/tr/td[@rowspan='2']/ancestor::tr/following-sibling::tr[1]/td[2]|
//table[3]/tbody/tr/td[@rowspan='3']/ancestor::tr/following-sibling::tr[1]/td[2]|
//table[3]/tbody/tr/td[@rowspan='3']/ancestor::tr/following-sibling::tr[2]/td[2]",
"//table[@class = 'wikitable']/tbody/tr/td[3]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='2']/ancestor::tr/following-sibling::tr[1]/td[2]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='3']/ancestor::tr/following-sibling::tr[1]/td[2]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='3']/ancestor::tr/following-sibling::tr[2]/td[2]",
"//table[@class = 'wikitable']/tbody/tr/td[3]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='2']/ancestor::tr/following-sibling::tr[1]/td[2]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='3']/ancestor::tr/following-sibling::tr[1]/td[2]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='3']/ancestor::tr/following-sibling::tr[2]/td[2]",
"//table[@class = 'wikitable']/tbody/tr/td[3]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='2']/ancestor::tr/following-sibling::tr[1]/td[2]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='3']/ancestor::tr/following-sibling::tr[1]/td[2]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='3']/ancestor::tr/following-sibling::tr[2]/td[2]",
"//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[3]/ancestor::tr/td[3]|
//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[@rowspan='2']/ancestor::tr/following-sibling::tr[1]/td[2]|
//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[@rowspan='3']/ancestor::tr/following-sibling::tr[1]/td[2]|
//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[@rowspan='3']/ancestor::tr/following-sibling::tr[2]/td[2]",
"//table[@class = 'wikitable']/tbody/tr/td[3]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='2']/ancestor::tr/following-sibling::tr[1]/td[2]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='3']/ancestor::tr/following-sibling::tr[1]/td[2]|
//table[@class = 'wikitable']/tbody/tr/td[@rowspan='3']/ancestor::tr/following-sibling::tr[2]/td[2]", # 37
"//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[3]/ancestor::tr/td[3]|
//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[@rowspan='2']/ancestor::tr/following-sibling::tr[1]/td[2]|
//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[@rowspan='3']/ancestor::tr/following-sibling::tr[1]/td[2]|
//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[@rowspan='3']/ancestor::tr/following-sibling::tr[2]/td[2]",
"//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[3]",
"//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[3]",
"//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[3]",
"//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[3]",
"//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[3]",
"//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[3]",
"//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[3]",
"//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[3]",
"//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[3]",
"//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[3]",
"//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[2]/tbody/tr/td[3]",
"//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[3]",
"//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[3]",
"//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[2]/tbody/tr/td[3]",
"//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[3]",
"//h2/span[contains(text(), 'List of MPs')]/ancestor::h2/following-sibling::table[2]/tbody/tr/td[3]",
"//h2/span[contains(text(), 'List of MPs')]/ancestor::h2/following-sibling::table[1]/tbody/tr/td[5]/a[2]",
"//h2/span[contains(text(), 'List of MPs')]/ancestor::h2/following-sibling::table[1]/tbody/tr/td[5]/a[1]",
"//h2/span[contains(text(), 'List of MPs')]/ancestor::h2/following-sibling::table[1]/tbody/tr/td[2]/a[1]",
"//h2/span[contains(text(), 'List of MPs')]/ancestor::h2/following-sibling::table[1]/tbody/tr/td[5]/a[1]")
query_constituency <- c("", "", "", "", "", "", "", "", "", "",
"//h2/span[contains(text(), 'MPs')]/ancestor::h2/following-sibling::table[2]/tbody/tr/td[1]/b/a[1]|
//h2/span[contains(text(), 'MPs')]/ancestor::h2/following-sibling::table[2]/tbody/tr/td[1]/b/span/a[1]",
"//table[@class = 'wikitable']/tbody/tr/td[3]/ancestor::tr/td[1]/a[1]",
"//table[@class = 'wikitable']/tbody/tr/td[3]/ancestor::tr/td[1]/a[1]",
"//table[@class = 'wikitable']/tbody/tr/td[3]/ancestor::tr/td[1]/a[1]",
"//table[@class = 'wikitable']/tbody/tr/td[3]/ancestor::tr/td[1]/a[1]",
"//table[@class = 'wikitable']/tbody/tr/td[3]/ancestor::tr/td[1]/a[1]",
"//table[@class = 'wikitable']/tbody/tr/td[3]/ancestor::tr/td[1]/a[1]",
"//table[@class = 'wikitable']/tbody/tr/td[3]/ancestor::tr/td[1]/a[1]",
"//table[@class = 'wikitable']/tbody/tr/td[3]/ancestor::tr/td[1]/a[1]",
"//table[@class = 'wikitable']/tbody/tr/td[3]/ancestor::tr/td[1]/a[1]",
"//table[@class = 'wikitable']/tbody/tr/td[3]/ancestor::tr/td[1]/a[1]",
"//h2/span[contains(text(), 'MPs')]/ancestor::h2/following-sibling::table[1]/tbody/tr/td[3]/ancestor::tr/td[1]/a[1]",
"//table[@class = 'wikitable']/tbody/tr/td[3]/ancestor::tr/td[1]/a[1]",
"//h2/span[contains(text(), 'MPs')]/ancestor::h2/following-sibling::table[1]/tbody/tr/td[1]",
"//table[@class = 'wikitable']/tbody/tr/td[3]/ancestor::tr/td[1]/a[1]",
"//table[3]/tbody/tr/td[3]/ancestor::tr/td[1]/a[1]",
"//table[@class = 'wikitable']/tbody/tr/td[3]/ancestor::tr/td[1]/a[1]",
"//table[@class = 'wikitable']/tbody/tr/td[3]/ancestor::tr/td[1]/a[1]",
"//table[@class = 'wikitable']/tbody/tr/td[3]/ancestor::tr/td[1]/a[1]",
"//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[3]/ancestor::tr/td[1]/a[1]",
"//table[@class = 'wikitable']/tbody/tr/td[3]/ancestor::tr/td[1]/a[1]", # 37
"//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[3]/ancestor::tr/td[1]/a[1]",
"//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[1]/a[1]",
"//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[1]/a[1]",
"//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[1]/a[1]",
"//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[1]/a[1]",
"//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[1]/a[1]",
"//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[1]/a[1]",
"//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[1]/a[1]",
"//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[1]/a[1]",
"//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[1]/a[1]",
"//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[1]/a[1]",
"//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[2]/tbody/tr/td[1]/a[1]",
"//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[1]/a[1]",
"//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[1]/a[1]",
"//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[2]/tbody/tr/td[1]/a[1]",
"//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[1]/a[1]",
"//h2/span[contains(text(), 'List of MPs')]/ancestor::h2/following-sibling::table[2]/tbody/tr/td[1]/a[1]",
"//h2/span[contains(text(), 'List of MPs')]/ancestor::h2/following-sibling::table[1]/tbody/tr/td[1]/a[1]",
"//h2/span[contains(text(), 'List of MPs')]/ancestor::h2/following-sibling::table[1]/tbody/tr/td[1]/a[1]",
"//h2/span[contains(text(), 'List of MPs')]/ancestor::h2/following-sibling::table[1]/tbody/tr/th[1]/a[1]",
"//h2/span[contains(text(), 'List of MPs')]/ancestor::h2/following-sibling::table[1]/tbody/tr/th[1]/a[1]")
query_multiplier <- c("", "", "", "", "", "", "", "", "", "",
"",
"//table[@class = 'wikitable']/tbody/tr/td[3]/ancestor::tr/td[1]",
"//table[3]/tbody/tr/td[3]/ancestor::tr/td[1]",
"//table[3]/tbody/tr/td[3]/ancestor::tr/td[1]",
"//table[3]/tbody/tr/td[3]/ancestor::tr/td[1]",
"//table[3]/tbody/tr/td[3]/ancestor::tr/td[1]",
"//table[@class = 'wikitable']/tbody/tr/td[3]/ancestor::tr/td[1]",
"//table[@class = 'wikitable']/tbody/tr/td[3]/ancestor::tr/td[1]",
"//table[@class = 'wikitable']/tbody/tr/td[3]/ancestor::tr/td[1]",
"//table[@class = 'wikitable']/tbody/tr/td[3]/ancestor::tr/td[1]",
"//table[@class = 'wikitable']/tbody/tr/td[3]/ancestor::tr/td[1]",
"//h2/span[contains(text(), 'MPs')]/ancestor::h2/following-sibling::table[1]/tbody/tr/td[3]/ancestor::tr/td[1]",
"//table[3]/tbody/tr/td[3]/ancestor::tr/td[1]", "",
"//table[@class = 'wikitable']/tbody/tr/td[3]/ancestor::tr/td[1]",
"//table[3]/tbody/tr/td[3]/ancestor::tr/td[1]",
"//table[@class = 'wikitable']/tbody/tr/td[3]/ancestor::tr/td[1]",
"//table[@class = 'wikitable']/tbody/tr/td[3]/ancestor::tr/td[1]",
"//table[@class = 'wikitable']/tbody/tr/td[3]/ancestor::tr/td[1]",
"//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[3]/ancestor::tr/td[1]",
"//table[@class = 'wikitable']/tbody/tr/td[3]/ancestor::tr/td[1]",
"//h2/span[contains(text(), 'Composition')]/ancestor::h2/following-sibling::table[3]/tbody/tr/td[3]/ancestor::tr/td[1]")
query_be <- c("", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "",
"//h3/span/a[contains(text(), '27th')]/ancestor::tr/following-sibling::tr/td[6]/a[preceding::a[contains(text(), '27th')]]",
"//h3/span/a[contains(text(), '28th')]/ancestor::tr/following-sibling::tr/td[6]/a[following::a[contains(text(), '27th')]]",
"//h3/span/a[contains(text(), '29th')]/ancestor::tr/following-sibling::tr/td[6]/a[following::a[contains(text(), '28th')]]",
"//h3/span/a[contains(text(), '30th')]/ancestor::tr/following-sibling::tr/td[6]/a[following::a[contains(text(), '29th')]]",
"//h3/span/a[contains(text(), '31st')]/ancestor::tr/following-sibling::tr/td[6]/a[preceding::a[contains(text(), '31st')]]",
"//h3/span/a[contains(text(), '32nd')]/ancestor::tr/following-sibling::tr/td[6]/a[following::a[contains(text(), '31st')]]",
"//h3/span/a[contains(text(), '33rd')]/ancestor::tr/following-sibling::tr/td[6]/a[following::a[contains(text(), '32nd')]]",
"//h3/span/a[contains(text(), '34th')]/ancestor::tr/following-sibling::tr/td[6]/a[following::a[contains(text(), '33rd')]]",
"//h3/span/a[contains(text(), '35th')]/ancestor::tr/following-sibling::tr/td[6]/a[following::a[contains(text(), '34th')]]",
"//h3/span/a[contains(text(), '36th')]/ancestor::tr/following-sibling::tr/td[6]/a[preceding::a[contains(text(), '36th')]]",
"//h3/span/a[contains(text(), '37th')]/ancestor::tr/following-sibling::tr/td[6]/a[following::a[contains(text(), '36th')]]",
"//h3/span/a[contains(text(), '38th')]/ancestor::tr/following-sibling::tr/td[6]/a[following::a[contains(text(), '37th')]]",
"//h3/span/a[contains(text(), '39th')]/ancestor::tr/following-sibling::tr/td[6]/a[preceding::a[contains(text(), '39th')]]",
"//h3/span/a[contains(text(), '40th')]/ancestor::tr/following-sibling::tr/td[6]/a[following::a[contains(text(), '39th')]]",
"//h3/span/a[contains(text(), '41st')]/ancestor::tr/following-sibling::tr/td[6]/a[following::a[contains(text(), '40th')]]",
"//h3/span/a[contains(text(), '42nd')]/ancestor::tr/following-sibling::tr/td[6]/a[following::a[contains(text(), '41st')]]",
"//h3/span/a[contains(text(), '43rd')]/ancestor::tr/following-sibling::tr/td[6]/a[following::a[contains(text(), '42nd')]]",
"//h3/span/a[contains(text(), '44th')]/ancestor::tr/following-sibling::tr/td[6]/a[following::a[contains(text(), '43rd')]]",
"//h3/span/a[contains(text(), '45th')]/ancestor::tr/following-sibling::tr/td[6]/a[following::a[contains(text(), '44th')]]",
"//h3/span/a[contains(text(), '46th')]/ancestor::tr/following-sibling::tr/td[6]/a[following::a[contains(text(), '45th')]]",
"//h3/span/a[contains(text(), '47th')]/ancestor::tr/following-sibling::tr/td[6]/a[following::a[contains(text(), '46th')]]",
"//h3/span[contains(text(), '1983 Parliament')]/ancestor::h3/following-sibling::table[1]/tbody/tr/td[6]/a",
"//h3/span[contains(text(), '1987 Parliament')]/ancestor::h3/following-sibling::table[1]/tbody/tr/td[6]/a",
"//h3/span[contains(text(), '1992 Parliament')]/ancestor::h3/following-sibling::table[1]/tbody/tr/td[6]/a",
"//h3/span[contains(text(), '1997 Parliament')]/ancestor::h3/following-sibling::table[1]/tbody/tr/td[6]/a",
"//h3/span[contains(text(), '2001 Parliament')]/ancestor::h3/following-sibling::table[1]/tbody/tr/td[6]/a",
"//h3/span[contains(text(), '2005 Parliament')]/ancestor::h3/following-sibling::table[1]/tbody/tr/td[6]/a",
"//h3/span[contains(text(), '2010 Parliament')]/ancestor::h3/following-sibling::table[1]/tbody/tr/td[6]/a",
"//h3/span[contains(text(), '15 Parliament')]/ancestor::h3/following-sibling::table[1]/tbody/tr/td[6]/a",
"//h3/span[contains(text(), '17 Parliament')]/ancestor::h3/following-sibling::table[1]/tbody/tr/td[6]/a",
"//h3/span[contains(text(), 'present Parliament')]/ancestor::h3/following-sibling::table[1]/tbody/tr/td[6]/a")
query_be_party <- c("", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "",
"//h3/span/a[contains(text(), '27th')]/ancestor::tr/following-sibling::tr/td[8]/a[preceding::a[contains(text(), '27th')]]",
"//h3/span/a[contains(text(), '28th')]/ancestor::tr/following-sibling::tr/td[8]/a[following::a[contains(text(), '27th')]]",
"//h3/span/a[contains(text(), '29th')]/ancestor::tr/following-sibling::tr/td[8]/a[following::a[contains(text(), '28th')]]",
"//h3/span/a[contains(text(), '30th')]/ancestor::tr/following-sibling::tr/td[8]/a[following::a[contains(text(), '29th')]]",
"//h3/span/a[contains(text(), '31st')]/ancestor::tr/following-sibling::tr/td[8]/a[preceding::a[contains(text(), '31st')]]",
"//h3/span/a[contains(text(), '32nd')]/ancestor::tr/following-sibling::tr/td[8]/a[following::a[contains(text(), '31st')]]",
"//h3/span/a[contains(text(), '33rd')]/ancestor::tr/following-sibling::tr/td[8]/a[following::a[contains(text(), '32nd')]]",
"//h3/span/a[contains(text(), '34th')]/ancestor::tr/following-sibling::tr/td[8]/a[following::a[contains(text(), '33rd')]]",
"//h3/span/a[contains(text(), '35th')]/ancestor::tr/following-sibling::tr/td[8]/a[following::a[contains(text(), '34th')]]",
"//h3/span/a[contains(text(), '36th')]/ancestor::tr/following-sibling::tr/td[8]/a[preceding::a[contains(text(), '36th')]]",
"//h3/span/a[contains(text(), '37th')]/ancestor::tr/following-sibling::tr/td[8]/a[following::a[contains(text(), '36th')]]",
"//h3/span/a[contains(text(), '38th')]/ancestor::tr/following-sibling::tr/td[8]/a[following::a[contains(text(), '37th')]]",
"//h3/span/a[contains(text(), '39th')]/ancestor::tr/following-sibling::tr/td[8]/a[preceding::a[contains(text(), '39th')]]",
"//h3/span/a[contains(text(), '40th')]/ancestor::tr/following-sibling::tr/td[8]/a[1][following::a[contains(text(), '39th')]]",
"//h3/span/a[contains(text(), '41st')]/ancestor::tr/following-sibling::tr/td[8]/a[following::a[contains(text(), '40th')]]",
"//h3/span/a[contains(text(), '42nd')]/ancestor::tr/following-sibling::tr/td[8]/a[following::a[contains(text(), '41st')]]",
"//h3/span/a[contains(text(), '43rd')]/ancestor::tr/following-sibling::tr/td[8]/a[following::a[contains(text(), '42nd')]]",
"//h3/span/a[contains(text(), '44th')]/ancestor::tr/following-sibling::tr/td[8]/a[following::a[contains(text(), '43rd')]]",
"//h3/span/a[contains(text(), '45th')]/ancestor::tr/following-sibling::tr/td[8]/a[following::a[contains(text(), '44th')]]",
"//h3/span/a[contains(text(), '46th')]/ancestor::tr/following-sibling::tr/td[8]/a[following::a[contains(text(), '45th')]]",
"//h3/span/a[contains(text(), '47th')]/ancestor::tr/following-sibling::tr/td[8]/a[following::a[contains(text(), '46th')]]",
"//h3/span[contains(text(), '1983 Parliament')]/ancestor::h3/following-sibling::table[1]/tbody/tr/td[8]/a",
"//h3/span[contains(text(), '1987 Parliament')]/ancestor::h3/following-sibling::table[1]/tbody/tr/td[8]/a",
"//h3/span[contains(text(), '1992 Parliament')]/ancestor::h3/following-sibling::table[1]/tbody/tr/td[8]/a",
"//h3/span[contains(text(), '1997 Parliament')]/ancestor::h3/following-sibling::table[1]/tbody/tr/td[8]/a",
"//h3/span[contains(text(), '2001 Parliament')]/ancestor::h3/following-sibling::table[1]/tbody/tr/td[8]/a",
"//h3/span[contains(text(), '2005 Parliament')]/ancestor::h3/following-sibling::table[1]/tbody/tr/td[8]/a",
"//h3/span[contains(text(), '2010 Parliament')]/ancestor::h3/following-sibling::table[1]/tbody/tr/td[8]/a",
"//h3/span[contains(text(), '15 Parliament')]/ancestor::h3/following-sibling::table[1]/tbody/tr/td[8]/a",
"//h3/span[contains(text(), '17 Parliament')]/ancestor::h3/following-sibling::table[1]/tbody/tr/td[8]/a",
"//h3/span[contains(text(), 'present Parliament')]/ancestor::h3/following-sibling::table[1]/tbody/tr/td[8]/a")
query_be_constituency <- c("", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "",
"//h3/span/a[contains(text(), '27th')]/ancestor::tr/following-sibling::tr/td[1]/a[preceding::a[contains(text(), '27th')]]",
"//h3/span/a[contains(text(), '28th')]/ancestor::tr/following-sibling::tr/td[1]/a[following::a[contains(text(), '27th')]]",
"//h3/span/a[contains(text(), '29th')]/ancestor::tr/following-sibling::tr/td[1]/a[following::a[contains(text(), '28th')]]",
"//h3/span/a[contains(text(), '30th')]/ancestor::tr/following-sibling::tr/td[1]/a[following::a[contains(text(), '29th')]]",
"//h3/span/a[contains(text(), '31st')]/ancestor::tr/following-sibling::tr/td[1]/a[preceding::a[contains(text(), '31st')]]",
"//h3/span/a[contains(text(), '32nd')]/ancestor::tr/following-sibling::tr/td[1]/a[following::a[contains(text(), '31st')]]",
"//h3/span/a[contains(text(), '33rd')]/ancestor::tr/following-sibling::tr/td[1]/a[following::a[contains(text(), '32nd')]]",
"//h3/span/a[contains(text(), '34th')]/ancestor::tr/following-sibling::tr/td[1]/a[following::a[contains(text(), '33rd')]]",
"//h3/span/a[contains(text(), '35th')]/ancestor::tr/following-sibling::tr/td[1]/a[following::a[contains(text(), '34th')]]",
"//h3/span/a[contains(text(), '36th')]/ancestor::tr/following-sibling::tr/td[1]/a[preceding::a[contains(text(), '36th')]]",
"//h3/span/a[contains(text(), '37th')]/ancestor::tr/following-sibling::tr/td[1]/a[following::a[contains(text(), '36th')]]",
"//h3/span/a[contains(text(), '38th')]/ancestor::tr/following-sibling::tr/td[1]/a[following::a[contains(text(), '37th')]]",
"//h3/span/a[contains(text(), '39th')]/ancestor::tr/following-sibling::tr/td[1]/a[preceding::a[contains(text(), '39th')]]",
"//h3/span/a[contains(text(), '40th')]/ancestor::tr/following-sibling::tr/td[1]/a[following::a[contains(text(), '39th')]]",
"//h3/span/a[contains(text(), '41st')]/ancestor::tr/following-sibling::tr/td[1]/a[following::a[contains(text(), '40th')]]",
"//h3/span/a[contains(text(), '42nd')]/ancestor::tr/following-sibling::tr/td[1]/a[following::a[contains(text(), '41st')]]",
"//h3/span/a[contains(text(), '43rd')]/ancestor::tr/following-sibling::tr/td[1]/a[following::a[contains(text(), '42nd')]]",
"//h3/span/a[contains(text(), '44th')]/ancestor::tr/following-sibling::tr/td[1]/a[following::a[contains(text(), '43rd')]]",
"//h3/span/a[contains(text(), '45th')]/ancestor::tr/following-sibling::tr/td[1]/a[following::a[contains(text(), '44th')]]",
"//h3/span/a[contains(text(), '46th')]/ancestor::tr/following-sibling::tr/td[1]/a[following::a[contains(text(), '45th')]]",
"//h3/span/a[contains(text(), '47th')]/ancestor::tr/following-sibling::tr/td[1]/a[following::a[contains(text(), '46th')]]",
"//h3/span[contains(text(), '1983 Parliament')]/ancestor::h3/following-sibling::table[1]/tbody/tr/td[1]/a",
"//h3/span[contains(text(), '1987 Parliament')]/ancestor::h3/following-sibling::table[1]/tbody/tr/td[1]/a",
"//h3/span[contains(text(), '1992 Parliament')]/ancestor::h3/following-sibling::table[1]/tbody/tr/td[1]/a",
"//h3/span[contains(text(), '1997 Parliament')]/ancestor::h3/following-sibling::table[1]/tbody/tr/td[1]/a",
"//h3/span[contains(text(), '2001 Parliament')]/ancestor::h3/following-sibling::table[1]/tbody/tr/td[1]/a",
"//h3/span[contains(text(), '2005 Parliament')]/ancestor::h3/following-sibling::table[1]/tbody/tr/td[1]/a",
"//h3/span[contains(text(), '2010 Parliament')]/ancestor::h3/following-sibling::table[1]/tbody/tr/td[1]/a",
"//h3/span[contains(text(), '15 Parliament')]/ancestor::h3/following-sibling::table[1]/tbody/tr/td[1]/a",
"//h3/span[contains(text(), '17 Parliament')]/ancestor::h3/following-sibling::table[1]/tbody/tr/td[1]/a",
"//h3/span[contains(text(), 'present Parliament')]/ancestor::h3/following-sibling::table[1]/tbody/tr/td[1]/a")
query_be_service <- c("", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "",
"//h3/span/a[contains(text(), '27th')]/ancestor::tr/following-sibling::tr/td[2][preceding::a[contains(text(), '27th')]]",
"//h3/span/a[contains(text(), '28th')]/ancestor::tr/following-sibling::tr/td[2][following::a[contains(text(), '27th')]]",
"//h3/span/a[contains(text(), '29th')]/ancestor::tr/following-sibling::tr/td[2][following::a[contains(text(), '28th')]]",
"//h3/span/a[contains(text(), '30th')]/ancestor::tr/following-sibling::tr/td[2][following::a[contains(text(), '29th')]]",
"//h3/span/a[contains(text(), '31st')]/ancestor::tr/following-sibling::tr/td[2][preceding::a[contains(text(), '31st')]]",
"//h3/span/a[contains(text(), '32nd')]/ancestor::tr/following-sibling::tr/td[2][following::a[contains(text(), '31st')]]",
"//h3/span/a[contains(text(), '33rd')]/ancestor::tr/following-sibling::tr/td[2][following::a[contains(text(), '32nd')]]",
"//h3/span/a[contains(text(), '34th')]/ancestor::tr/following-sibling::tr/td[2][following::a[contains(text(), '33rd')]]",
"//h3/span/a[contains(text(), '35th')]/ancestor::tr/following-sibling::tr/td[2][following::a[contains(text(), '34th')]]",
"//h3/span/a[contains(text(), '36th')]/ancestor::tr/following-sibling::tr/td[2][preceding::a[contains(text(), '36th')]]",
"//h3/span/a[contains(text(), '37th')]/ancestor::tr/following-sibling::tr/td[2][following::a[contains(text(), '36th')]]",
"//h3/span/a[contains(text(), '38th')]/ancestor::tr/following-sibling::tr/td[2][following::a[contains(text(), '37th')]]",
"//h3/span/a[contains(text(), '39th')]/ancestor::tr/following-sibling::tr/td[2][preceding::a[contains(text(), '39th')]]",
"//h3/span/a[contains(text(), '40th')]/ancestor::tr/following-sibling::tr/td[2][following::a[contains(text(), '39th')]]",
"//h3/span/a[contains(text(), '41st')]/ancestor::tr/following-sibling::tr/td[2][following::a[contains(text(), '40th')]]",
"//h3/span/a[contains(text(), '42nd')]/ancestor::tr/following-sibling::tr/td[2][following::a[contains(text(), '41st')]]",
"//h3/span/a[contains(text(), '43rd')]/ancestor::tr/following-sibling::tr/td[2][following::a[contains(text(), '42nd')]]",
"//h3/span/a[contains(text(), '44th')]/ancestor::tr/following-sibling::tr/td[2][following::a[contains(text(), '43rd')]]",
"//h3/span/a[contains(text(), '45th')]/ancestor::tr/following-sibling::tr/td[2][following::a[contains(text(), '44th')]]",
"//h3/span/a[contains(text(), '46th')]/ancestor::tr/following-sibling::tr/td[2][following::a[contains(text(), '45th')]]",
"//h3/span/a[contains(text(), '47th')]/ancestor::tr/following-sibling::tr/td[2][following::a[contains(text(), '46th')]]",
"//h3/span[contains(text(), '1983 Parliament')]/ancestor::h3/following-sibling::table[1]/tbody/tr/td[2]",
"//h3/span[contains(text(), '1987 Parliament')]/ancestor::h3/following-sibling::table[1]/tbody/tr/td[2]",
"//h3/span[contains(text(), '1992 Parliament')]/ancestor::h3/following-sibling::table[1]/tbody/tr/td[2]",
"//h3/span[contains(text(), '1997 Parliament')]/ancestor::h3/following-sibling::table[1]/tbody/tr/td[2]",
"//h3/span[contains(text(), '2001 Parliament')]/ancestor::h3/following-sibling::table[1]/tbody/tr/td[2]",
"//h3/span[contains(text(), '2005 Parliament')]/ancestor::h3/following-sibling::table[1]/tbody/tr/td[2]",
"//h3/span[contains(text(), '2010 Parliament')]/ancestor::h3/following-sibling::table[1]/tbody/tr/td[2]",
"//h3/span[contains(text(), '15 Parliament')]/ancestor::h3/following-sibling::table[1]/tbody/tr/td[2]",
"//h3/span[contains(text(), '17 Parliament')]/ancestor::h3/following-sibling::table[1]/tbody/tr/td[2]",
"//h3/span[contains(text(), 'present Parliament')]/ancestor::h3/following-sibling::table[1]/tbody/tr/td[2]")
# assign empty lists to store session speficic htmls
htmls <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic legislator URLs
urls <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic legislator names
names <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic legislator party affiliations
parties <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic legislator constituencies
constituencies <- rep(list(NA), times = length(source))
multipliers1 <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic beginning dates
session_start <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic ending dates
session_end <- rep(list(NA), times = length(source))
# assign empty lists to store session speficic legislator service in days
services <- rep(list(NA), times = length(source))
# assign session specific manual replacement index for services
replace_idx <- c(rep(list(c(0)),31),list(c(88,347,540,506,273,38,249,212,548,247,537,92,163,18,618,453,244,216,303,
261,323,364,324,430,331,2,156,4,451,492,243,39,466,55,187,441,153,283,291,
245,24,473,579,328,83,101,145,407,208,20,528)),
list(c(595,614,377,285,444,109,7,108,51,66,447,256,354,101,195,514)),
list(c(621,541,213,570,445,425,323,19,375,409,554,2,11,575,517,163,410,211,
282,20,90,287,276,319,448,303,444,162,113,63,8,552,188,289,544,33,326,
126,228,71,50,622,16,150,196,351,529,89)),
list(c(604,457,244,430,285,50,531,520,5,465,142,411,18,603,606,503,201,
332,570,255,485,371,358,327,263,184,310,204,215,423,45,588,109,
132,590,363,398,139,141,425,568,585,563,243,296,350,80,569,271,
246,576)),
list(c(373,231,610,498,178,118,540,553,505,394,196,384,343,52,484,
549,595,172,352,560,491,154,433,427,262,185,144,356,598,177,
404,409,173,545,464,449,403,78,368,248,450,234,393,108,587,
454,72,621,303,153,123,81,208,567,461,383,133,84,47,213,286,
101)),
list(c(318,222,148,313,66,7,493,501,509,203,12,442,366)),
list(c(28,97,593,380,562,452,419,332,252,68,142,577,605,585,103,416,39,120,
415,514,446,590,398,190,10,337,177,391,358,279,586,122,99,307,481,442,
257,132)),
list(c(312,252,212,57,482,327,390,603,597,367,194,141,575,555,483,532,341,400,384,
607,538,271,289,265,112,526,20,372,218,503)),
list(c(439)),
list(c(383,357,157,474,58,466,406,285,632,230,358,255,485,90,68,510,277,21,74,153,
125,628,590,431,577,500,620,563,165,165)),
list(c(175,538,262,540,70,270,159,415,47,257,56,170,173,601,241,253,533,312,404,134,491)),
list(c(348,367,580,281,349,420,495,184,252,427,584,83,233,468,562,533,172,135,458)),
list(c(354,354,304,379,408,414,478,453,452,79,214,350,74,588,532,591,261,590,480,464,234,263,341)),
list(c(397,188,191,628,24,536,303,372,195,459,334,198,406,423,217,173,80,22,490,141,418)),
list(c(637,224,621,464,241,520,586,481,125,314,635,269,204,327,637,35,450,595)),
list(c(278,49,333,80,440,307)),
list(c(238,422,242,236,257,275,152,490,184,94,61,180,340,126)),
list(c(284,385,646,520,366,204,158,368,477,114,150,354,77,226,298,38,327,23,435)),
list(c(436,495,433,576,30,631,468,503,149,549)),
list(c(619,331,392,447,79)), list(c(0)))
# assign session specific manual replacement for services
replacement <- c(rep(list(c(0)),31),list(c(1617,1477,1350,1344,1330,1323,1239,1227,1190,1183,1029,993,980,974,973,958,937,882,875,
874,812,798,671,586,518,518,510,509,503,502,420,385,383,382,336,334,256,232,231,
222,217,210,161,138,133,132,118,118,90,89,88)),
list(c(483,476,450,422,406,357,280,280,279,266,252,244,217,70,61,41)),
list(c(1239,1197,1190,1188,1183,1176,1153,1122,1120,1106,1106,1099,1099,1092,1092,
1071,902,896,868,866,847,840,840,832,770,756,749,749,693,616,614,566,538,
524,523,523,504,476,399,399,376,376,368,364,266,105,104,104)),
list(c(1484,1477,1414,1399,1393,1393,1344,1281,1274,1264,1260,1260,1113,
1113,1113,1113,1113,1085,1036,1022,993,924,917,882,840,763,735,735,
734,665,665,651,651,644,644,630,573,539,490,407,378,280,264,264,264,
259,252,203,196,195,77)),
list(c(1708,1701,1680,1680,1680,1680,1526,1519,1519,1519,1505,1491,
1491,1475,1412,1407,1365,1365,1351,1267,1267,1260,1141,1141,
1141,1141,1141,1008,980,972,972,950,922,910,896,888,888,887,
882,770,763,763,762,574,560,560,532,525,525,525,525,413,406,
405,405,405,405,405,405,224,161,161)),
list(c(469,392,385,280,203,168,160,159,112,112,112,98,98)),
list(c(1449,1442,1344,1344,1309,1309,1309,1309,1309,1183,1148,1113,1092,1092,
1092,952,945,840,819,805,805,728,728,728,728,714,602,581,581,581,
539,539,392,350,343,343,343,105)),
list(c(1239,1239,1239,1239,1134,1134,1105,1071,1071,987,987,987,903,903,861,686,686,
665,469,462,455,385,364,343,343,343,287,287,154,126)),
list(c(84)),
list(c(1631,1603,1603,1477,1477,1372,1372,1329,1295,1295,1288,1281,1239,1141,1043,1001,
931,931,903,868,784,756,756,756,644,623,518,518,511,259)),
list(c(1421,1393,1309,1274,1274,1232,1148,1127,1120,1057,1036,938,903,805,707,420,315,224,147,1379,1417)),
list(c(1411,1417,1372,1358,1253,1134,1064,1064,1036,959,910,756,553,371,329,329,329,266,49)),
list(c(1610,1610,1610,1484,1435,1393,1365,1267,1267,1246,1225,1204,1078,1071,1015,735,735,693,623,623,553,518,399)),
list(c(1740,1743,1842,1785,1708,1463,1393,1204,1162,1141,1043,980,812,791,791,791,791,791,756,476,392)),
list(c(158,1330,1302,1302,1302,1239,1148,1099,1008,938,875,875,812,770,203,203,189,91)),
list(c(1211,1134,1134,833,252,168)),
list(c(1652,1540,1281,1176,1162,1148,1113,805,805,420,420,280,147,70)),
list(c(1617,1491,1379,1092,1036,1029,938,938,938,924,924,924,693,588,420,399,364,301,252)),
list(c(167,273,321,368,406,494,537,547,627,627)),
list(c(329,372,665,728,784)), list(c(0)))
replace_idx2 <- c(rep(list(c(0)), 20), list(c(75)), list(c(0)), list(c(0)), list(c(246)),
list(c(83,105)), list(c(0)), list(c(0)), list(c(35)), list(c(0)), list(c(6)), list(c(54)))
replacement2 <- c(rep(list(c(0)), 20), list(c("18 November 1902")), list(c(0)), list(c(0)), list(c("11 February 1911")),
list(c("27 March 1920", "19 March 1919")), list(c(0)), list(c(0)), list(c("26 April 1927")),
list(c(0)), list(c("17 June 1935")), list(c("10 June 1942")))
remove_table <- c(list(c(1,32,45,51,90,93,164,211,232,241,244,267,300,307,314,331,336,391,427,470,
484,509,514,535,591,618,677)),
list(c(1,32,89,90,138,164,211,232,241,267,300,307,314,336,391,427,428,470,484,509,
514,535,591,618,677)),
list(c(1,32,89,92,158,161,208,229,238,264,297,304,311,328,333,388,424,462,476,501,
506,528,561,584,611,676)),
list(c(1,32,88,89,135,161,208,229,238,264,297,304,311,333,388,424,425,
462,476,494,501,506,528,584,611,676)),
list(c(1,32,89,92,158,161,208,229,238,264,297,304,311,328,333,388,424,462,
476, 501,506,528,584,611,676)),
list(c(1,32,88,89,135,161,208,229,238,264,297,304,311,333,388,424,425,462,476,
501,506,522,528,584,611,630,676)),
list(c(1,2,33,34,91,92,95,161,164,165,211,212,233,234,243,244,270,271,
310,311,318,319,336,341,342,397,398,434,435,473,474,488,489,514,515,519,
520,542,543,599,600,627,628,633,646,693,694)),
list(c(1,2,32,33,89,90,91,99,135,141,162,163,209,210,218,231,232,241,242,266,267,
308,309,317,332,338,339,385,394,395,432,433,434,445,470,471,485,486,511,
512,517,518,541,542,598,599,626,627,690,691,694)),
list(c(1,2,32,33,90,91,94,99,141,159,162,163,209,210,218,231,232,241,242,266,267,
308,309,316,317,332,333,338,339,394,395,432,433,445,470,471,485,486,511,
512,517,518,541,542,598,599,626,627,690,691,694)),
list(c(1,2,32,33,89,90,91,135,141,162,163,209,210,218,231,232,241,242,266,267,
308,309,316,317,332,338,339,394,395,432,433,434,445,470,471,485,486,511,
512,517,518,541,542,598,599,626,627,690,691,694)))
ext_by_table <- c("//table[@class = 'wikitable'][1]","//table[@class = 'wikitable'][2]","//table[@class = 'wikitable'][1]","//table[@class = 'wikitable'][2]","//table[@class = 'wikitable'][3]",
"//table[@class = 'wikitable'][1]","//table[@class = 'wikitable'][2]","//table[@class = 'wikitable'][3]","//table[@class = 'wikitable'][4]","//table[@class = 'wikitable'][5]",
"//table[@class = 'wikitable'][1]","//table[@class = 'wikitable'][1]","//table[@class = 'wikitable'][1]","//table[@class = 'wikitable'][1]","//table[@class = 'wikitable'][1]",
"//table[@class = 'wikitable'][1]","//table[@class = 'wikitable'][1]","//table[@class = 'wikitable'][1]","//table[@class = 'wikitable'][1]","//table[@class = 'wikitable'][1]")
cut_by_table <- c(list(c(1:71)),list(c(1:219)),list(c(1:54)),list(c(1:214)),list(c(1:211)),
list(c(1:67)),list(c(1:178)),list(c(1:165)),list(c(1:61)),list(c(1:64)),
list(c(444:502)),list(c(3:221)),list(c(371:460)),list(c(396:573)),list(c(199:393)),
list(c(3:196)),list(c(407:444)),list(c(225:403)),list(c(119:221)),list(c(3:115)))
col_by_table_1 <- c(rep(5, 10), rep(6, 10))
col_by_table_2 <- c(rep(0, 10), rep(8, 10))
col_by_table_3 <- c(rep(2, 10), rep(1, 10))
col_by_table_4 <- c(rep(1, 10), rep(2, 10))
for (i in 1:length(htmls)) {
if (i %in% c(11:52)) {
# read html file
htmls[[i]] <- read_html(source[i], encoding = "UTF-8")
# locate legislator biography URLs in html via XPath query, extract and complete URL
urls[[i]] <- html_nodes(x = htmls[[i]], xpath = query[i]) %>%
html_attr("href") %>%
str_c("https://", "en",".wikipedia.org", .)
if (i == 15) {
urls[[i]] <- c(urls[[i]][1:368], "", urls[[i]][369:651])
}
if (i == 12) {
urls[[i]] <- c(urls[[i]][1:302], "", urls[[i]][303:653])
}
# locate legislator names in html via query, extract names
names[[i]] <- html_nodes(x = htmls[[i]], xpath = query[i]) %>%
html_text()
if (i == 15) {
names[[i]] <- c(names[[i]][1:368], "George Errington", names[[i]][369:651])
}
if (i == 12) {
names[[i]] <- c(names[[i]][1:302], "John Green", names[[i]][303:653])
}
# locate legislator party affiliations in html via query_party, extract and format party affiliation
parties[[i]] <- html_nodes(x = htmls[[i]], xpath = query_party[i]) %>%
html_text() %>%
str_replace_all("\\r|\\n", "")
constituencies[[i]] <- html_nodes(x = htmls[[i]], xpath = query_constituency[i]) %>%
html_text() %>%
str_replace_all("\\r|\\n", "")
if (i %in% c(1:31)) {
services[[i]] <- rep(NA, length(urls[[i]]))
session_start[[i]] <- duration[[i]][1]
session_end[[i]] <- duration[[i]][2]
} else {
session_start[[i]] <- duration[[i]][1]
session_end[[i]] <- duration[[i]][2]
services[[i]] <- round(rep(difftime(time1 = session_end[[i]][1], time2 = session_start[[i]][1], units = "days"),
length(urls[[i]])))
services[[i]] <- replace(services[[i]], replace_idx[[i]], replacement[[i]])
}
if (i %in% c(12:23,25:32)) {# 31:38
if (i == 27) {
constituencies[[i]] <- c(constituencies[[i]][1:105], "Cambridge", constituencies[[i]][106:594])
}
multipliers1[[i]] <- html_nodes(x = htmls[[i]], xpath = query_multiplier[i]) %>%
html_attr("rowspan") %>%
as.numeric() %>%
na.replace(1)
constituencies[[i]] <- rep(constituencies[[i]], times = multipliers1[[i]])
}
}
if (i %in% c(1:10)) {
html_table2 <- trace(rvest:::html_table.xml_node, quote({
values <- lapply(lapply(cells, html_node, "a"), html_attr, name = "href")
values[[1]] <- html_text(cells[[1]])
}), at = 14)
htmls[[i]] <- read_html(source[i], encoding = "UTF-8")
table_a <- html_nodes(x = htmls[[i]], xpath = "//table[@class = 'wikitable']") %>%
html_table(fill = TRUE) %>%
extract2(1)
table_a <- table_a[-remove_table[[i]],]
urls[[i]] <- table_a$X2 %>%
str_c("https://", "en",".wikipedia.org", .)
untrace(rvest:::html_table.xml_node)
table_b <- html_nodes(x = htmls[[i]], xpath = "//table[@class = 'wikitable']") %>%
html_table(fill = TRUE) %>%
extract2(1)
table_b <- table_b[-remove_table[[i]],]
names[[i]] <- table_b$X2 %>%
str_replace_all("\\[.+| died.+| Died.+| appointed.+| Appointed.+|seat Vacant|seat vacant| elected.+| resigned.+| Resigned.+| became.+| void.+| Void.+| sat for.+| Sat for.+| election.+| Election.+| 18.+| took office.+| raised to.+| unseated on.+|( )?Replaced.+| succeeded.+| expelled.+| ennobled.+| Ennobled.+| Expelled.+| assassinated.+| –", "") %>%
str_trim
parties[[i]] <- table_b$X3
constituencies[[i]] <- table_b$X1 %>%
str_replace_all("\\(two.+|\\(three.+|\\(four.+|\\(seat.+", "") %>%
str_trim
services[[i]] <- rep(NA, length(urls[[i]]))
session_start[[i]] <- duration[[i]][1]
session_end[[i]] <- duration[[i]][2]
}
if (i %in% c(1:2)) {
by_elections <- read_html("./data/htmls/uk_by_elections/by_elections14.html")
}
if (i %in% c(3:5)) {
by_elections <- read_html("./data/htmls/uk_by_elections/by_elections13.html")
}
if (i %in% c(6:10)) {
by_elections <- read_html("./data/htmls/uk_by_elections/by_elections12.html")
}
if (i %in% c(11)) {
by_elections <- read_html("./data/htmls/uk_by_elections/by_elections11.html")
}
if (i %in% c(12)) {
by_elections <- read_html("./data/htmls/uk_by_elections/by_elections10.html")
}
if (i %in% c(13)) {
by_elections <- read_html("./data/htmls/uk_by_elections/by_elections9.html")
}
if (i %in% c(14:16)) {
by_elections <- read_html("./data/htmls/uk_by_elections/by_elections8.html")
}
if (i %in% c(17:20)) {
by_elections <- read_html("./data/htmls/uk_by_elections/by_elections7.html")
}
if (i %in% c(21:24)) {
by_elections <- read_html("./data/htmls/uk_by_elections/by_elections6.html")
}
if (i %in% c(25:29)) {# 31-35
if (i == 25) {
urls[[i]] <- urls[[i]][-168]
names[[i]] <- names[[i]][-168]
services[[i]] <- services[[i]][-168]
}
by_elections <- read_html("./data/htmls/uk_by_elections/by_elections1.html")
}
# add legislators elected at by-elections
if (i %in% c(30:32)) {#38
by_elections <- read_html("./data/htmls/uk_by_elections/by_elections2.html")
}
if (i %in% c(33:41)) {#39-
by_elections <- read_html("./data/htmls/uk_by_elections/by_elections3.html")
}
if (i %in% c(42:48)) {#48-
by_elections <- read_html("./data/htmls/uk_by_elections/by_elections4.html")
}
if (i %in% c(49:50)) {#48-
by_elections <- read_html("./data/htmls/uk_by_elections/by_elections5.html")
}
if (i %in% c(51)) {#48-
by_elections <- read_html("./data/htmls/uk_by_elections/by_elections15.html")
}
if (i %in% c(21:52)) {
be_urls <- html_nodes(x = by_elections, xpath = query_be[i]) %>%
html_attr("href") %>%
str_c("https://", "en",".wikipedia.org", .)
be_names <- html_nodes(x = by_elections, xpath = query_be[i]) %>%
html_text()
# locate legislator party affiliations in html via query_party, extract and format party affiliation
be_parties <- html_nodes(x = by_elections, xpath = query_be_party[i]) %>%
html_text() %>%
# remove the two additionals, for 37
str_replace_all("\\r|\\n", "")
if (i %in% 31) {
be_parties <- be_parties[-c(57,194)]
}
be_constituencies <- html_nodes(x = by_elections, xpath = query_be_constituency[i]) %>%
html_text() %>%
str_replace_all("\\r|\\n", "")
if (i %in% 31) {
be_constituencies <- be_constituencies[-c(57,194)]
}
# remove the two additionals, for 37
be_services <- html_nodes(x = by_elections, xpath = query_be_service[i]) %>%
html_text()
if (i %in% c(21,24,25,28,30,31)) {
be_services <- replace(be_services, replace_idx2[[i]], replacement2[[i]])
}
be_services <- be_services %>%
str_extract("^.+[[:digit:]]{4}") %>%
dmy() %>%
difftime(time1 = duration[[i]][2], time2 = .)
if (i %in% 31) {
be_services <- be_services[-c(57,194)]
}
} else {
html_table2 <- trace(rvest:::html_table.xml_node, quote({
values <- lapply(lapply(cells, html_node, "a"), html_attr, name = "href")
values[[1]] <- html_text(cells[[1]])
}), at = 14)
table_a <- html_nodes(x = by_elections, xpath = ext_by_table[i]) %>%
html_table(fill = TRUE) %>%
extract2(1)
table_a <- table_a[cut_by_table[[i]],]
be_urls <- table_a[,col_by_table_1[i]] %>%
str_c("https://", "en",".wikipedia.org", .)
untrace(rvest:::html_table.xml_node)
table_b <- html_nodes(x = by_elections, xpath = ext_by_table[i]) %>%
html_table(fill = TRUE) %>%
extract2(1)
table_b <- table_b[cut_by_table[[i]],]
be_names <- table_b[,col_by_table_1[i]] %>%
str_trim
if (i %in% c(11:20)) {
be_parties <- table_b[,col_by_table_2[i]]
} else {
be_parties <- rep(NA, length(be_urls))
}
be_constituencies <- table_b[,col_by_table_3[i]] %>%
str_replace_all("[[:digit:]]", "") %>%
str_trim
be_services <- table_b[,col_by_table_4[i]] %>%
str_extract("^.+ [[:digit:]]{4}") %>%
str_replace_all("–[[:digit:]] |–[[:digit:]][[:digit:]] ", "") %>%
dmy() %>%
difftime(time1 = duration[[i]][2], time2 = .)
}
if (!(i %in% 52)) {
urls[[i]] <- c(urls[[i]], be_urls)
names[[i]] <- c(names[[i]], be_names)
parties[[i]] <- c(parties[[i]], be_parties)
constituencies[[i]] <- c(constituencies[[i]], be_constituencies)
services[[i]] <- c(services[[i]], be_services)
}
}
urls <- lapply(urls, data.table) %>%
lapply(rename, url = V1)
names <- lapply(names, data.table) %>%
lapply(rename, name = V1)
parties <- lapply(parties, data.table) %>%
lapply(rename, party = V1)
constituencies <- lapply(constituencies, data.table) %>%
lapply(rename, constituency = V1)
services <- lapply(services, data.table) %>%
lapply(rename, service = V1)
session_start <- lapply(session_start, data.table) %>%
lapply(rename, session_start = V1)
session_end <- lapply(session_end, data.table) %>%
lapply(rename, session_end = V1)
urls <- mapply(cbind, urls, names, parties, constituencies,
services, session_start,
session_end, SIMPLIFY = FALSE)
urls <- c(1:11,16,17,20:58) %>%
mapply(function(urls, x) mutate(urls, term = x), urls, ., SIMPLIFY=FALSE) %>%
mapply(function(., x) mutate(., country = x), ., rep("GBR", times = 52), SIMPLIFY=FALSE)
}
#### SPAIN WIKIPEDIA INFORMATION EXTRACTION =============================================
collectorSpain <- function(source) {
source <- mixedsort(list.files(source, full.names = TRUE), decreasing = TRUE)
for (i in 1:length(source)) {
subsource <- mixedsort(list.files(source[i], full.names = TRUE), decreasing = FALSE)
cat("|")
for (j in 1:length(subsource)) {
urls <- read_html(subsource[j]) %>%
html_nodes(xpath = "//div[@class = 'mw-category']//ul/li/a") %>%
html_attr("href")
names <- read_html(subsource[j]) %>%
html_nodes(xpath = "//div[@class = 'mw-category']//ul/li/a") %>%
html_text()
if (j == 1) {
urls_all <- urls
names_all <- names
} else {
urls_all <- c(urls_all, urls)
names_all <- c(names_all, names)
}
cat(".")
}
if (i %in% c(2:5)) {
urls_all <- urls_all[-1]
names_all <- names_all[-1]
}
if (i == 1) {
output <- list(data.frame(name = names_all, url = urls_all,
stringsAsFactors = FALSE))
} else {
output <- c(output,
list(data.frame(name = names_all, url = urls_all,
stringsAsFactors = FALSE)))
}
}
return(output)
}
#### SPAIN OFFICIAL INFORMATION EXTRACTION ==============================================
collectorSpainOfficial <- function(source) {
source <- mixedsort(list.files(source, full.names = TRUE), decreasing = TRUE)
for (i in 1:length(source)) {
subsource <- mixedsort(list.files(source[i], full.names = TRUE), decreasing = TRUE)
hull <- data.frame(name = rep(NA, length(subsource)),
party = rep(NA, length(subsource)),
group = rep(NA, length(subsource)),
constituency = rep(NA, length(subsource)),
start_time = rep(NA, length(subsource)),
end_time = rep(NA, length(subsource)),
portrait = rep(NA, length(subsource)))
cat("|")
for (j in 1:length(subsource)) {
parsed <- read_html(subsource[j])
hull$name[j] <- html_node(parsed, xpath = "//div[@class = 'nombre_dip']") %>%
html_text()
hull$party[j] <- html_node(parsed, xpath = "//p[@class = 'nombre_grupo']") %>%
html_text()
hull$group[j] <- html_node(parsed, xpath = "//div[@class = 'dip_rojo']/a") %>%
html_text()
hull$constituency[j] <- html_node(parsed, xpath = "//div[@class = 'dip_rojo'][1]") %>%
html_text() %>%
str_replace_all("\\\r|\\\n|\\.", "") %>%
str_squish()
if (i %in% c(1,2)) {
hull$start_time[j] <- html_node(parsed, xpath = "//div[@class = 'texto_dip'][2]/ul/li//div[contains(text(), 'Condici')]") %>%
html_text()
} else {
hull$start_time[j] <- html_node(parsed, xpath = "//div[@class = 'texto_dip'][2]/ul/li//div[contains(text(), 'Fecha')]") %>%
html_text()
}
hull$end_time[j] <- html_node(parsed, xpath = "//div[@class = 'texto_dip'][2]/ul/li//div[contains(text(), 'Caus')]") %>%
html_text()
hull$portrait[j] <- html_node(parsed, xpath = "//img[@name = 'foto']") %>%
html_attr(name = "src") %>%
str_c("http://www.congreso.es", .)
cat(".")
}
if (i == 1) {
output <- list(hull)
} else {
output <- c(output, list(hull))
}
}
return(output)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.