source/integration.R

# ---------------------------------------------------------------------------------------
# legislatoR
# Sascha Göbel and Simon Munzert
# Script: integration
# Part of the code in this script written by Johana Sperlova
# April 2019
# ---------------------------------------------------------------------------------------


#### PREPARATIONS =======================================================================

# clear workspace -----------------------------------------------------------------------
# NOTE(review): removes every object (including hidden ones) from the environment
# the script is run in; run it in a fresh session to avoid losing unrelated work.
rm(list = ls(all = TRUE))

# set working directory -----------------------------------------------------------------
# NOTE(review): machine-specific absolute path; every relative path used further
# down is resolved against this directory.
setwd("D:/Sascha/Projects/legislatoR")

# install and load packages and functions -----------------------------------------------
# NOTE(review): presumably these two files attach dplyr, stringr, lubridate, haven
# etc. and define project helpers — confirm against ./code/packages.R.
source("./code/packages.R")
source("./code/functions.R")


#### INTEGRATE SIEBERER ET AL BUNDESTAG ROLL CALL VOTE DATA WITH LEGISLATOR =============
# full data and codebook available at:
# https://dataverse.harvard.edu/dataverse/btvote
# the mp_characteristics data set to which legislatoR was integrated is here
# https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/QSFXLQ

# import, join, and process legislatoR data with necessary columns ----------------------
# Read the German legislatoR table, attach the birth table via the Wikidata ID,
# and keep page ID, lower-cased name, birth date (as text) and term.
ger <- readRDS("./data/germany")[, c("pageid", "wikidataid", "name", "term")] %>%
  left_join(readRDS("./data/germany_birth"), by = "wikidataid") %>%
  select(pageid, name, date, term) %>%
  mutate(date = as.character(date),
         name = tolower(name))

# import, join, process btvote data with necessary columns ------------------------------
# Build a "firstname lastname" full name, keep the identifying columns, and bring
# names to lower case / birth dates to text so they compare with legislatoR.
bt_vote <- read_dta("./data/pol_sci_data/mp_characteristics.dta") %>%
  mutate(name = str_c(firstname, " ", lastname)) %>%
  select(mp_id, name, lastname, elecper, date_birth) %>%
  mutate(date_birth = as.character(date_birth),
         name = tolower(name),
         lastname = tolower(lastname))

# build unique match string (full name, term, birthdate) --------------------------------
# str_c() yields NA as soon as one component is NA, so rows with a missing name,
# term or birth date end up with an NA match string.
ger <- mutate(ger, match_string = str_c(name, "_", term, "_", date))
bt_vote <- mutate(bt_vote,
                  match_string = str_c(name, "_", elecper, "_", date_birth))

# initialise empty columns --------------------------------------------------------------
# filled in by the matching steps that follow
bt_vote <- mutate(bt_vote,
                  match_name = NA,
                  match_birth = NA,
                  pageid = NA)

# match bt_vote and legislatoR data based on unique match string ------------------------
# For every legislatoR row, look up the first btvote row with the same match
# string and copy name, birth date and page ID over to that btvote row.
# `incomparables = NA` keeps rows whose match string is NA (a missing name, term
# or birth date) from being paired with each other; plain match()/%in% treat two
# NAs as equal, which would attach an arbitrary page ID to such rows.
hit_row <- match(ger$match_string, bt_vote$match_string, nomatch = 0L,
                 incomparables = NA)
has_hit <- hit_row > 0L
bt_vote$match_name[hit_row[has_hit]] <- ger$name[has_hit]
bt_vote$match_birth[hit_row[has_hit]] <- ger$date[has_hit]
bt_vote$pageid[hit_row[has_hit]] <- ger$pageid[has_hit]
rm(hit_row, has_hit)
# one row per btvote MP
bt_vote_matched <- distinct(bt_vote, mp_id, .keep_all = TRUE)
# discard matches whose birth dates disagree
bt_vote_matched$pageid[which(bt_vote_matched$date_birth != bt_vote_matched$match_birth)] <- NA
matched1 <- bt_vote_matched[!is.na(bt_vote_matched$pageid), ]

# match unmatched bt_vote and legislatoR data based on non-duplicated birth dates -------
# Second pass: among the legislators and MPs not matched so far, keep only those
# whose birth date occurs exactly once on their side and pair them on that date.
matched_ids <- bt_vote_matched$pageid[!is.na(bt_vote_matched$pageid)]
# logical indexing instead of `-which(...)`: a negative empty index would drop
# every row if no page ID had been matched yet
ger3 <- ger[!(ger$pageid %in% matched_ids), ]
ger3 <- distinct(ger3, pageid, .keep_all = TRUE)
ger3 <- ger3[!(duplicated(ger3$date) | duplicated(ger3$date, fromLast = TRUE)), ]
bt_vote_unmatched1 <- bt_vote_matched[is.na(bt_vote_matched$pageid), ]
bt_vote_unmatched1 <- bt_vote_unmatched1[!(duplicated(bt_vote_unmatched1$date_birth) |
                                             duplicated(bt_vote_unmatched1$date_birth,
                                                        fromLast = TRUE)), ]
# `incomparables = NA`: a missing birth date must never count as a match
birth_row <- match(ger3$date, bt_vote_unmatched1$date_birth, nomatch = 0L,
                   incomparables = NA)
has_birth <- birth_row > 0L
bt_vote_unmatched1$match_birth[birth_row[has_birth]] <- ger3$date[has_birth]
bt_vote_unmatched1$match_name[birth_row[has_birth]] <- ger3$name[has_birth]
bt_vote_unmatched1$pageid[birth_row[has_birth]] <- ger3$pageid[has_birth]
matched2 <- bt_vote_unmatched1[!is.na(bt_vote_unmatched1$pageid), ]
matched <- rbind(matched1, matched2)
rm(matched_ids, bt_vote_matched, matched1, matched2, birth_row, has_birth)

# manually match remaining unmatched bt_vote and legislatoR data ------------------------
# btvote MPs (one row each) that neither automated pass could match
unmatched3 <- bt_vote[!(bt_vote$mp_id %in% matched$mp_id), ]
unmatched3 <- distinct(unmatched3, mp_id, .keep_all = TRUE)
# legislatoR rows still unmatched after the birth-date pass; logical indexing
# instead of `-which(...)`, which would return zero rows if nothing had matched
matched_ids2 <- bt_vote_unmatched1$pageid[!is.na(bt_vote_unmatched1$pageid)]
ger4 <- ger3[!(ger3$pageid %in% matched_ids2), ]
# Hand-curated key for the MPs left over after the automated passes: the i-th
# btvote mp_id belongs to the i-th legislatoR pageid (the vectors are paired by
# position, so entries must only ever be added or removed in pairs).
# Several pageids occur more than once, i.e. more than one mp_id maps to them.
matched3 <- data.frame(mp_id = c(37,59,489,1143,1900,1902,2070,2521,2948,3114,3300,4752,4778,5172,
                                 5183,5821,5916,6331,7917,7940,8594,9158,9476,9623,9811,9817,10068,
                                 10090,71309,80435,90561,90985,100481,110342,110887,111693,120426,
                                 120532,120732,121208,122027,122556,132083,136472,138434,141888,
                                 141953,142773,151852,152506,152558,153372,161568,161663,162207,
                                 162949,162950,163021,163431,171853,172083,172840,180292,180870,
                                 181758,182623,182773,111731,121863,120753,111384,143826,1814,81291,
                                 152050,121478,151513,131585,121701,131337,143475,152132,121293,
                                 130924,120739,182771,2571,141990,181997,142045,7138,90156,6718,
                                 5355,90095,71794,143508,70305,132189,90970,90923,110847,182471,
                                 181217,121173,8505,173107,2528,151147,5299,2898,123098,111471,
                                 131760,181753,1173,2576,3898,111870,142331,71196,122214,488,
                                 180744,120781,152012,120958,121211,172038,101096,6676,130909,
                                 153065,182683,10483,122302,162511,161654,144343,141915,90199,
                                 111358,122401,131178,121794,181426,2650,182387,90991),
                       pageid = c(171955,171031,1735815,73442,59646,275654,461545,1107969,384715,
                                  306741,249207,662444,236401,73028,671031,72707,249182,2458535,
                                  268250,3529205,175594,68496,396004,249589,44167,3139967,492786,
                                  929399,2453129,984854,385674,309188,6102529,4237239,1198825,
                                  314710,4371301,67204,6105368,655561,1640511,528146,4330989,
                                  175482,6676676,587829,454894,381427,667865,984976,517256,
                                  984828,984862,182032,955303,6211362,4449876,977008,516094,
                                  4763942,3849486,4754626,1640511,528146,67204,6964599,655561,
                                  174000,6585374,173925,92453,4764351,73446,6099304,985060,
                                  145,412478,173786,6098250,293347,908175,2882304,6585379,448542,
                                  466451,3883230,256024,544256,4411342,397042,366157,236382,
                                  2568922,2756753,1999142,174340,422346,1116341,173801,2907199,
                                  5105098,1549269,1045238,805760,72217,3111614,4541620,271490,
                                  937009,68600,281241,1073329,174118,984868,1156743,241226,
                                  278972,1636023,67208,663326,261246,286279,174379,6574489,
                                  6574489,1027175,10541132,3793999,985385,237748,109100,
                                  175484,667926,44691,44691,219996,984963,4750832,4802838,
                                  292724,413127,952741,174349,964075,2171574,175490,1113748,
                                  6069711,174368))
# attach the btvote columns to the manual key; the manually assigned page ID
# (pageid.x) replaces the still-empty one coming from btvote (pageid.y)
matched3 <- matched3 %>%
  left_join(unmatched3, by = "mp_id") %>%
  mutate(pageid = pageid.x) %>%
  select(-c(pageid.x, pageid.y))
# append the manual matches to the automated ones and tidy up
matched <- rbind(matched, matched3)
rm(bt_vote_unmatched1, ger3, ger4, matched3, unmatched3, matched_ids2)

# check multiple matches on same pageid -------------------------------------------------
# several mp_ids pointing at one pageid mostly means that btvote gave the same
# person more than one mp_id across terms; otherwise the match is still correct
#View(matched[(duplicated(matched$pageid) |
#                duplicated(matched$pageid, fromLast = TRUE)), ])
# single manual correction
matched <- mutate(matched,
                  pageid = replace(pageid, which(mp_id == 131746), 4764692))
# key table (pageid, mp_id), later joined to deu_ids
bt_vote_id <- matched %>%
  select(pageid, mp_id)
rm(bt_vote, ger, matched)

# import legislator IDs and Core datasets for germany -----------------------------------
deu_ids <- readRDS("./package/legislatoR-data-v0.1.0/data/deu_ids")
# NOTE(review): columns are picked by position; the joins below need `pageid`
# and `wikidataid`, so presumably these are columns 2 and 3 — confirm
deu_core <- readRDS("./package/legislatoR-data-v0.1.0/data/deu_core")[, 2:3]

# join matches to wikidata ids ----------------------------------------------------------
bt_vote_id <- bt_vote_id %>%
  left_join(deu_core, by = "pageid")
rm(deu_core)
saveRDS(bt_vote_id, file = "./data/pol_sci_data/bt_vote_id")

# join matches to IDs dataset -----------------------------------------------------------
bt_vote_id <- readRDS("./data/pol_sci_data/bt_vote_id")
deu_ids <- full_join(x = deu_ids, y = bt_vote_id, by = "wikidataid")

# clean up ------------------------------------------------------------------------------
# Drop rows without a Wikidata ID. Logical indexing is used instead of
# `-which(is.na(...))`, which would return zero rows whenever no ID is missing.
deu_ids <- deu_ids[!is.na(deu_ids$wikidataid), ]
deu_ids <- select(deu_ids, -pageid)
# NOTE(review): positional rename; assumes the joined mp_id column ends up in
# position 12 — confirm against the column layout of deu_ids
colnames(deu_ids)[12] <- "btvote"
saveRDS(deu_ids, "./package/legislatoR-data-v0.2.0/deu_ids")


#### INTEGRATE RAUH ET AL BUNDESTAG PARL SPEECH DATA V1 WITH LEGISLATOR =================
# full data and codebook available at:
# https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/E4RSP9
# note: ParlSpeech covers 12:17, "1991-03-12"-"2013-09-03" sessions only and includes 
# speakers that were not MPs of the national assembly

# import, join, and process legislatoR data with necessary columns ----------------------
# keep only legislators who served in one of the sessions 12 to 17
deu_political <- readRDS("./package/legislatoR-data/data/deu_political") %>%
  filter(session %in% 12:17)
deu_core <- readRDS("./package/legislatoR-data/data/deu_core")
leg_ger <- deu_core %>%
  semi_join(deu_political, by = "pageid")

# import ParlSpeech Bundestag corpus ----------------------------------------------------
# NOTE(review): the code below expects this file to provide `bt.corpus` — confirm
load("./data/pol_sci_data/Corp_Bundestag.RData")

# match legislator and parlspeech
# Names that are identical on both sides need no treatment; the remaining names
# are paired on an approximate last name and the result is patched by hand.
# NOTE(review): all hard-coded positions below were derived from one specific
# version of the input data and silently change meaning if the data change.
origin <- leg_ger$name
destin <- unique(bt.corpus$speaker)
#length(unique(str_c(bt.corpus$speaker, bt.corpus$party)))
# names that occur on one side only
remainder1 <- origin[which(is.na(match(origin, destin)))]
remainder2 <- destin[which(is.na(match(destin, origin)))]
# approximate last name: turn the first hyphen into a space, then strip everything
# up to and including the last space
remainder1_last <- str_replace(remainder1, "-", " ") %>%
  str_replace("^.+ ", "")
remainder2_last <- str_replace(remainder2, "-", " ") %>%
  str_replace("^.+ ", "")
# keep the first name for every last name
# NOTE(review): `x[-which(cond)]` returns an empty vector when `cond` is never TRUE
remainder1_2 <- remainder1[-which(duplicated(remainder1_last))]
remainder2_2 <- remainder2[-which(duplicated(remainder2_last))]
# pairs of names sharing a last name: legislatoR spelling and ParlSpeech spelling
replace_1 <- remainder1_2[which(!is.na(match(unique(remainder1_last), unique(remainder2_last))))] # orig
replace_2 <- na.omit(remainder2_2[match(unique(remainder1_last), unique(remainder2_last))]) # dest
# replace in id orig with dest
# pair 15 is left out — NOTE(review): presumably a false positive found by
# inspection; confirm
id <- leg_ger$name
id[match(replace_1[-c(15)], origin)] <- replace_2[-c(15)]
# remove those matched from destin, repeat with first names, remove from destin,
origin <- leg_ger$name[-which(!is.na(match(leg_ger$name, unique(bt.corpus$speaker))))] # minus these matched from the start
destin <- unique(bt.corpus$speaker)[-which(!is.na(match(unique(bt.corpus$speaker), leg_ger$name)))] # minus these matched from the start
destin <- sort(destin[-match(replace_2[-c(15)], destin)])
origin <- sort(origin[-match(replace_1[-c(15)], origin)])
# three manual pairs, addressed by position in the sorted leftovers
id[match(origin[c(4,6,7)], leg_ger$name)] <- destin[c(18,26,31)]
# Brigitte Traupe == Brigitte Schulte
# Christian von Stetten == Christian Freiherr von Stetten
# Cornelia von Teichman und Logischen == Cornelia von Teichman
# keep only legislators whose (possibly replaced) name occurs in the corpus
leg_ger$parl_speech <- id
leg_ger <- leg_ger[which(id %in% unique(bt.corpus$speaker)),]
# 50 MPs from legislator in the respective period are left unmatched.
leg_ger <- select(leg_ger, wikidataid, parl_speech)
saveRDS(leg_ger, "./data/pol_sci_data/leg_ger")

# import legislator IDs dataset and the ParlSpeech key for germany ----------------------
leg_ger <- readRDS("./data/pol_sci_data/leg_ger")
deu_ids <- readRDS("./package/legislatoR-data-v0.2.0/deu_ids")

# join matches to IDs dataset -----------------------------------------------------------
deu_ids <- deu_ids %>%
  full_join(leg_ger, by = "wikidataid")

# clean up ------------------------------------------------------------------------------
# NOTE(review): positional rename; assumes the joined `parl_speech` column is
# column 13 — confirm
names(deu_ids)[13] <- "parlspeech"
saveRDS(deu_ids, file = "./package/legislatoR-data-v0.2.0/deu_ids")

#### INTEGRATE ANDREW EGGERS AND ARTHUR SPIRLING DATABASE WITH LEGISLATOR ===============
# full data and codebook available at:
# http://andy.egge.rs/eggers_spirling_database.html
# this part of the code was written by Johana Sperlova

# load core dataset (core) --------------------------------------------------------------
# one row per (pageid, wikidataid, name, birth) with the number of source rows,
# plus the birth date as Date and the birth year as a number
gbr_core <- readRDS("./data/pol_sci_data/gbr_core.rds") %>%
  group_by(pageid, wikidataid, name, birth) %>%
  count(name = "record_count_in_group") %>%
  ungroup() %>%
  as.data.frame() %>%
  mutate(birth = as.Date(birth),
         birthyear = as.numeric(format(birth, "%Y")))

# load external dataset -----------------------------------------------------------------
# NOTE(review): 'mps.csv' is read from the working directory itself, unlike the
# other inputs under ./data — confirm the file location
gbr_mp_characteristics <- read.csv("mps.csv") %>%
  group_by(member.id, mp.name, mp.sname, mp.fname) %>%
  ungroup() %>%
  as.data.frame() %>%
  mutate(mp.dob = as.Date(mp.dob),
         birthyear = as.numeric(format(mp.dob, "%Y")))

## extract first name and last name -----------------------------------------------------
# core data: first name = leading run of word characters/hyphens,
# last name = final whitespace-separated word; both lower-cased
gbr_core <- gbr_core %>%
  mutate(firstname = tolower(trimws(str_extract(name, "^([\\w\\-]+)"))),
         lastname = tolower(trimws(str_extract(name, "\\s(\\w+)$"))))
# external data already ships separate name columns
gbr_mp_characteristics <- gbr_mp_characteristics %>%
  mutate(lastname = tolower(mp.sname),
         firstname = tolower(mp.fname))

# first merge; merging the core dataset with the external database ----------------------
# Strict pass: first name, last name and birth year must all agree.
merge1 <- gbr_core %>%
  left_join(gbr_mp_characteristics,
            by = c("firstname", "lastname", "birthyear")) %>%
  select(pageid, wikidataid, member.id, firstname, lastname, birthyear)
# matched rows (to later join); missingness is tested with is.na() rather than by
# comparing against the string "NA"
finaljoin1 <- merge1 %>%
  filter(!is.na(member.id))
# all the unmatched observations; the empty ID column is renamed so it does not
# clash with `member.id` in the second merge
nas1 <- merge1 %>%
  filter(is.na(member.id)) %>%
  rename(nacolumn = member.id)

# second merge; merging the NAs1 with the external database ----------------------------
# Relaxed pass for the leftovers: only last name and birth year must agree.
# `firstname.x` is the first name from the core data (the join suffixes the
# clashing first-name columns).
merge2 <- nas1 %>%
  left_join(gbr_mp_characteristics,
            by = c("lastname", "birthyear")) %>%
  select(pageid, wikidataid, member.id, firstname.x, lastname, birthyear)
# matched rows (to later join); missingness is tested with is.na() rather than by
# comparing against the string "NA"
finaljoin2 <- merge2 %>%
  filter(!is.na(member.id)) %>%
  rename(firstname = firstname.x)
# all the still unmatched observations
nas2 <- merge2 %>%
  filter(is.na(member.id)) %>%
  rename(nacolumn = member.id, firstname = firstname.x)

# finalize ------------------------------------------------------------------------------
# stack the matches of both passes
totaljoin <- rbind(finaljoin1, finaljoin2)
# export the unmatched rows (to be sorted manually), the external data and the
# matches; the remaining observations (around 900) were then edited manually in excel
write.csv(nas2, file = "gbr_nas2.csv")
write.csv(gbr_mp_characteristics, file = "gbr_external_data.csv")
write.csv(totaljoin, file = "totaljoin_gbr.csv")

# import the manually completed key and the legislator IDs dataset for gbr --------------
# only key rows that actually carry a member.id are kept
gbr_eggers <- read.csv("./data/pol_sci_data/gbr_wikidataid_eggers_key.csv",
                       stringsAsFactors = FALSE) %>%
  select(wikidataid, member.id) %>%
  subset(!is.na(member.id))
gbr_ids <- readRDS("./package/legislatoR-data-v0.2.0/gbr_ids")

# join matches to IDs dataset -----------------------------------------------------------
gbr_ids <- gbr_ids %>%
  full_join(gbr_eggers, by = "wikidataid")

# clean up ------------------------------------------------------------------------------
# NOTE(review): positional rename; assumes the joined member.id is column 18 — confirm
names(gbr_ids)[18] <- "eggersspirling"
saveRDS(gbr_ids, file = "./package/legislatoR-data-v0.2.0/gbr_ids")


## Czech ParlSpeech V1 ==================================================================
# add the ParlSpeech key to the Czech IDs table
czech_ids <- readRDS("./package/legislatoR-data-v0.1.0/data/cze_ids")
cze_output <- readRDS("./data/pol_sci_data/cze_output")
czech_ids <- czech_ids %>%
  full_join(cze_output, by = "wikidataid")
# NOTE(review): positional rename; assumes the joined column is column 11 — confirm
names(czech_ids)[11] <- "parlspeech"
#czech_ids <- czech_ids[-which(rowSums(is.na(czech_ids)) == ncol(czech_ids)-1),]
# NOTE(review): written as "czech_ids" while the input is "cze_ids" — confirm the
# intended file name
saveRDS(czech_ids, file = "./package/legislatoR-data-v1.0.0/czech_ids")

## UK ParlSpeech V1 =====================================================================
# add the ParlSpeech key to the UK IDs table
uk_output <- readRDS("./data/pol_sci_data/uk_output")
uk_ids <- readRDS("./package/legislatoR-data-v1.0.0/gbr_ids")
uk_ids <- full_join(uk_ids, uk_output, by = "wikidataid")
# NOTE(review): positional rename; assumes the joined column is column 19 — confirm
colnames(uk_ids)[19] <- "parlspeech"
# Drop rows in which all columns but one are NA. A logical condition is used
# instead of `-which(...)`, which would return zero rows whenever no such row exists.
uk_ids <- uk_ids[rowSums(is.na(uk_ids)) != ncol(uk_ids) - 1, ]
saveRDS(uk_ids, "./package/legislatoR-data-v1.0.0/gbr_ids")

## Spain  ParlSpeech V2 =================================================================
# names in parlSpeech are unique identifiers
# and they match perfectly because legislator gathered official data
# from the same source used by parlspeech
# Corpus side: parse dates, trim speaker names, drop speakers without a party
# (president etc. are not MPs), turn "last, first" into "first last" and keep
# one row per speaker.
spain_corp <- readRDS("./data/pol_sci_data/Corp_Congreso_V2") %>%
  mutate(date = ymd(as.character(date)),
         speaker = str_trim(as.character(speaker))) %>%
  tidyr::drop_na(party) %>%
  mutate(first = str_trim(str_replace(str_extract(speaker, ",.+"), "^,", "")),
         last = str_replace(speaker, ",.+", ""),
         name = str_c(first, " ", last)) %>%
  distinct(speaker, .keep_all = TRUE)
# legislatoR side: names occurring more than once cannot identify a person and
# are removed before joining
spain_core <- readRDS("./package/legislatoR-data-v1.0.0/esp_core") %>%
  mutate(name = replace(name,
                        duplicated(name) | duplicated(name, fromLast = TRUE),
                        NA)) %>%
  tidyr::drop_na(name)
spain_ids <- left_join(select(spain_corp, party, name, speaker),
                       select(spain_core, name, wikidataid),
                       by = "name")
# re-read the complete core table (the copy used for the join had all rows with
# duplicated names removed)
# NOTE(review): spain_core is not used again in the visible part of the script
spain_core <- readRDS("./package/legislatoR-data-v1.0.0/esp_core")
# In the following NAs are persons to which an ID was already assigned, these are duplicates in parlspeech
# with the name written slightly differently, or persons which are not in legislatoR, could be non-mp speakers
# Manual completion: the vector is positional, one entry per row of spain_ids
# that is still without a wikidataid, in row order. It is only valid for the
# exact input data it was compiled against.
spain_ids[is.na(spain_ids$wikidataid),]$wikidataid <- c(NA,NA,NA, "Q455748",NA, "Q701362",NA, "Q3109576","Q1366326",NA,
                                                        NA, "Q3154831", NA, NA, NA, NA, NA, NA, NA, NA,
                                                        NA, NA, NA, NA, NA, "Q4890993", NA, "Q292095", NA, NA,
                                                        NA, NA, "Q3327534", "Q2749833", NA, NA, NA, "Q3187630", NA, "Q270883",
                                                        NA, NA, NA, NA, NA, "Q1311580", NA, "Q1690734", NA, "Q2986637",
                                                        NA, NA, NA, "Q2893019", NA, NA, NA, NA, NA, NA,
                                                        NA, NA, NA, "Q1336629", NA, NA, NA, NA, "Q16297359", NA,
                                                        NA, NA, "Q271658", NA, NA, NA, NA, NA, NA, "Q9009373",
                                                        NA, NA, NA)
# keep the identified speakers only; the raw speaker string becomes the key
spain_ids <- spain_ids %>% tidyr::drop_na(wikidataid) %>%
  dplyr::select(speaker, wikidataid)
colnames(spain_ids)[1] <- "parlspeech"
# add the key to the Spanish IDs table
spain_ids_full <- readRDS("./package/legislatoR-data-v1.0.0/esp_ids")
spain_ids_full <- full_join(spain_ids_full, spain_ids[,c("wikidataid","parlspeech")], by = "wikidataid")
#spain_ids_full <- spain_ids_full[-which(rowSums(is.na(spain_ids_full)) == ncol(spain_ids_full)-1),]
saveRDS(spain_ids_full, "./package/legislatoR-data-v1.0.0/esp_ids")

# Austria V2 ============================================================================
# Reduce the ParlSpeech speaker strings to plain person names: one row per
# speaker, the raw string is kept in `speaker_original`, then office titles and
# ministry designations are stripped.
austria_corp <- readRDS("./data/pol_sci_data/Corp_Nationalrat_V2")
austria_corp <- austria_corp %>% 
  distinct(speaker, .keep_all = TRUE)
austria_corp$speaker_original <- austria_corp$speaker
austria_corp$speaker <- austria_corp$speaker %>% str_remove("Bundesminister(in)?|Präsident(in)?|Bundeskanzler(in)?|Schriftführer(in)?")
austria_corp$speaker <- austria_corp$speaker %>% str_remove(".+Abg\\.|Vizekanzler(in)?")
austria_corp$speaker <- austria_corp$speaker %>% str_remove(".+Forschung |\\. Redezeit|.+Verbraucherschutz |.+Bundesminister")
# Ministry designations ("für ...") minus their last two words, which are taken
# to be the person's name. The pattern previously contained a mis-encoded "ü"
# and therefore could not match the correctly encoded corpus text.
office_regex <- unique(str_extract(austria_corp$speaker, "für .+"))
office_regex <- gsub(" [^ ]*$", "", office_regex)
office_regex <- gsub(" [^ ]*$", "", office_regex)
office_regex <- na.omit(unique(office_regex))
# seq_along() instead of 1:length(): with no designation found, the loop must not
# run at all (index 1 of an empty vector is NA and would turn every speaker into NA)
# NOTE(review): each designation is applied as a regular expression, not as a
# fixed string
for (i in seq_along(office_regex)) {
  austria_corp$speaker <- austria_corp$speaker %>% str_remove(office_regex[i])
}
austria_corp$speaker <- str_trim(austria_corp$speaker)
# session is filled in once the session dates are available
austria_corp$session <- NA
austria_corp$date <- ymd(austria_corp$date)
austria_corp$party <- austria_corp$party %>% str_replace("Jetzt – Liste ", "")

# legislatoR core and political data for Austria (session 27 is excluded)
aut <- readRDS("./package/legislatoR-data-v1.0.0/aut_core")
aut_pol <- readRDS("./package/legislatoR-data-v1.0.0/aut_political")
aut_pol <- aut_pol %>% filter(session != 27)
# Assign every corpus row the session whose date interval contains its date.
# The distinct start/end dates are computed once, and seq_along() keeps the loop
# from running on indices 1 and 0 when there are no sessions.
# NOTE(review): the session number is the position of the interval among the
# distinct start dates — assumes sessions are numbered 1, 2, ... in that order
session_starts <- unique(aut_pol$session_start)
session_ends <- unique(aut_pol$session_end)
for (i in seq_along(session_starts)) {
  in_session <- austria_corp$date %within% lubridate::interval(session_starts[i], session_ends[i])
  austria_corp$session <- ifelse(in_session == TRUE, i, austria_corp$session)
}
rm(session_starts, session_ends)
# legislators of sessions 20 to 26, one row per legislator and session
aut_pol <- aut_pol %>% filter(session %in% 20:26)
aut <- semi_join(aut, aut_pol, by = "pageid")
aut <- left_join(aut, aut_pol, by = "pageid")
# name as in the Wikipedia title: underscores to spaces, bracketed suffix removed
aut$speaker <- aut$wikititle %>% str_replace_all("_|\\(.+", " ") %>% str_trim()
# match on name + session, keeping one corpus speaker string per legislator
aut$matchid_1 <- str_c(aut$speaker, aut$session)
austria_corp$matchid_1 <- str_c(austria_corp$speaker, austria_corp$session)
ids_1 <- left_join(austria_corp[,c("speaker_original", "session", "party", "matchid_1")], 
                   aut[,c("speaker", "session", "party", "wikidataid", "matchid_1")], by = "matchid_1") %>%
  dplyr::distinct(wikidataid, .keep_all = TRUE) %>%
  tidyr::drop_na(wikidataid)
# inspection did not yield any mismatches
# manually match remaining
aut <- aut %>% filter(!(wikidataid %in% ids_1$wikidataid)) %>%
  distinct(wikidataid, .keep_all = TRUE)
# Hand-curated key: raw ParlSpeech speaker string -> Wikidata ID, paired by
# position. The strings must equal the corpus values exactly; three names that
# were stored with mis-encoded umlauts/sharp s (Fürntrath, Steßl-Mühlbacher,
# Aygül) are written with the proper characters, consistent with the other
# entries of this vector.
# NOTE(review): " Eva Glawischnig-Piesczek" carries a leading space — presumably
# the raw corpus value; confirm
ids_2 <- data.frame(speaker_original = c("Erich L. Schreiner", "Willi Brauneder", " Eva Glawischnig-Piesczek",
                                         "Herbert L. Graf", "Beate Hartinger", "Evelin Lichtenberger",
                                         "Susanne Riess-Passer", "Ulrike Sima", "Karl-Heinz Dernoscheg",
                                         "Dagmar Belakowitsch-Jenewein", "Adelheid Irina Fürntrath-Moretti",
                                         "Johann Georg Schelling", "Johannes Hahn", "Sonja Steßl-Mühlbacher",
                                         "Nikolaus Alm", "Aygül Berivan Aslan", "Angelika Rosa Mlinar",
                                         "Josef Schellhorn", "Ulrike Weigerstorfer", "Ricarda Berger",
                                         "Douglas Hoyos-Trauttmansdorff", "Barbara Krenn", "Maria Theresia Niss",
                                         "Bundesminister für Land- und Forstwirtschaft, Umwelt und Wasserwirtschaft Andrä Rupprechter",
                                         "Birgit Silvia Sandler", "Andrea Michaela Schartel", "Gabriela Schwarz"),
                    wikidataid = c("Q1353204", "Q1237043", "Q93870", "Q28919536",
                                   "Q813144", "Q79073", "Q78904", "Q2477473", "Q1731521",
                                   "Q90426", "Q354661", "Q1580606", "Q78647", "Q2301742",
                                   "Q964070", "Q15792244", "Q524225", "Q17352992", "Q94004",
                                   "Q46013387", "Q19278490", "Q42313276", "Q42304258", "Q15428608",
                                   "Q43231483", "Q19501935", "Q36808597")) 
# automated and manual matches together; the raw speaker string becomes the key
ids <- rbind(ids_1[,c("speaker_original", "wikidataid")], ids_2)
colnames(ids)[1] <- "parlspeech"
aut_ids <- readRDS("./package/legislatoR-data-v1.0.0/aut_ids")
aut_ids <- full_join(aut_ids, ids, by = "wikidataid")
saveRDS(aut_ids, "./package/legislatoR-data-v1.0.0/aut_ids")

  
#### INTEGRATE SILVA/PROKSCH TWITTER DATA WITH LEGISLATOR ===============================

# import twitter data -------------------------------------------------------------------
# empty cells are read as NA
sp_twitter <- read.csv(file = "./data/twitter handles/politicians_working_JAN2019_2.csv",
                       na.strings = "",
                       stringsAsFactors = FALSE)

# filter relevant countries and columns -------------------------------------------------
# Per country: keep the Wikidata ID and both handles under the names used for
# the joins (wikidataid, twitter_new, facebook_new).
# Austria, Czechia and Ireland keep rows with a twitter handle; France, Germany
# and the UK keep rows with at least one of the two handles.
sp_twitter_austria <- sp_twitter %>%
  filter(country == "austria") %>%
  select(wikidataid = wikidata, twitter_new = twitter, facebook_new = facebook) %>%
  subset(!is.na(twitter_new))
sp_twitter_czech <- sp_twitter %>%
  filter(country == "czechia") %>%
  select(wikidataid = wikidata, twitter_new = twitter, facebook_new = facebook) %>%
  subset(!is.na(twitter_new))
sp_twitter_france <- sp_twitter %>%
  filter(country == "france") %>%
  select(wikidataid = wikidata, twitter_new = twitter, facebook_new = facebook) %>%
  subset(!(is.na(twitter_new) & is.na(facebook_new)))
sp_twitter_germany <- sp_twitter %>%
  filter(country == "germany") %>%
  select(wikidataid = wikidata, twitter_new = twitter, facebook_new = facebook) %>%
  subset(!(is.na(twitter_new) & is.na(facebook_new)))
sp_twitter_ireland <- sp_twitter %>%
  filter(country == "ireland") %>%
  select(wikidataid = wikidata, twitter_new = twitter, facebook_new = facebook) %>%
  subset(!is.na(twitter_new))
sp_twitter_gbr <- sp_twitter %>%
  filter(country == "uk") %>%
  select(wikidataid = wikidata, twitter_new = twitter, facebook_new = facebook) %>%
  subset(!(is.na(twitter_new) & is.na(facebook_new)))

# import legislator social datasets (v0.1.0) --------------------------------------------
aut_social <- readRDS(file = "./package/legislatoR-data-v0.1.0/data/aut_social")
cze_social <- readRDS(file = "./package/legislatoR-data-v0.1.0/data/cze_social")
fra_social <- readRDS(file = "./package/legislatoR-data-v0.1.0/data/fra_social")
deu_social <- readRDS(file = "./package/legislatoR-data-v0.1.0/data/deu_social")
irl_social <- readRDS(file = "./package/legislatoR-data-v0.1.0/data/irl_social")
gbr_social <- readRDS(file = "./package/legislatoR-data-v0.1.0/data//gbr_social")

# join twitter and legislatoR social datasets -------------------------------------------
# Per country: join the new handles, drop rows without a Wikidata ID and fill a
# missing legislatoR handle with the new one (existing handles are never
# overwritten). The bare difference expressions report how many handles were
# gained; the counts noted below them are those of the original run.

# austria
aut_social <- full_join(aut_social, sp_twitter_austria, by = "wikidataid") %>%
  subset(!is.na(wikidataid)) %>%
  mutate(twitter_full = ifelse(is.na(twitter), twitter_new, twitter))
sum(!is.na(aut_social$twitter_full)) - sum(!is.na(aut_social$twitter))
# 61 twitter handles added
aut_social <- aut_social %>%
  select(wikidataid, twitter = twitter_full, facebook:googlep)

# czech republic
cze_social <- full_join(cze_social, sp_twitter_czech, by = "wikidataid") %>%
  subset(!is.na(wikidataid)) %>%
  mutate(twitter_full = ifelse(is.na(twitter), twitter_new, twitter))
sum(!is.na(cze_social$twitter_full)) - sum(!is.na(cze_social$twitter))
# 58 twitter handles added
cze_social <- cze_social %>%
  select(wikidataid, twitter = twitter_full, facebook:linkedin)

# france
fra_social <- full_join(fra_social, sp_twitter_france, by = "wikidataid") %>%
  subset(!is.na(wikidataid)) %>%
  mutate(twitter_full = ifelse(is.na(twitter), twitter_new, twitter),
         facebook_full = ifelse(is.na(facebook), facebook_new, facebook))
sum(!is.na(fra_social$twitter_full)) - sum(!is.na(fra_social$twitter))
# 26 twitter handles added
sum(!is.na(fra_social$facebook_full)) - sum(!is.na(fra_social$facebook))
# 0 facebook handles added, hence the original facebook column is kept
fra_social <- fra_social %>%
  select(wikidataid, twitter = twitter_full, facebook:website)

# germany
deu_social <- full_join(deu_social, sp_twitter_germany, by = "wikidataid") %>%
  subset(!is.na(wikidataid)) %>%
  mutate(twitter_full = ifelse(is.na(twitter), twitter_new, twitter),
         facebook_full = ifelse(is.na(facebook), facebook_new, facebook))
sum(!is.na(deu_social$twitter_full)) - sum(!is.na(deu_social$twitter))
# 232 twitter handles added
sum(!is.na(deu_social$facebook_full)) - sum(!is.na(deu_social$facebook))
# 6 facebook handles added
deu_social <- deu_social %>%
  select(wikidataid, twitter = twitter_full,
         facebook = facebook_full, youtube:website)

# ireland
irl_social <- full_join(irl_social, sp_twitter_ireland, by = "wikidataid") %>%
  subset(!is.na(wikidataid)) %>%
  mutate(twitter_full = ifelse(is.na(twitter), twitter_new, twitter),
         facebook_full = ifelse(is.na(facebook), facebook_new, facebook))
sum(!is.na(irl_social$twitter_full)) - sum(!is.na(irl_social$twitter))
# 129 twitter handles added
sum(!is.na(irl_social$facebook_full)) - sum(!is.na(irl_social$facebook))
# 0 facebook handles added, hence the original facebook column is kept
irl_social <- irl_social %>%
  select(wikidataid, twitter = twitter_full, facebook:website)

# united kingdom
gbr_social <- full_join(gbr_social, sp_twitter_gbr, by = "wikidataid") %>%
  subset(!is.na(wikidataid)) %>%
  mutate(twitter_full = ifelse(is.na(twitter), twitter_new, twitter),
         facebook_full = ifelse(is.na(facebook), facebook_new, facebook))
sum(!is.na(gbr_social$twitter_full)) - sum(!is.na(gbr_social$twitter))
# 15 twitter handles added
sum(!is.na(gbr_social$facebook_full)) - sum(!is.na(gbr_social$facebook))
# 4 facebook handles added
gbr_social <- gbr_social %>%
  select(wikidataid, twitter = twitter_full,
         facebook = facebook_full, youtube:googlep)

# overall 521 missing twitter handles added
# overall 10 missing facebook handles added
saveRDS(aut_social, file = "./package/legislatoR-data-v0.2.0/aut_social")
saveRDS(cze_social, file = "./package/legislatoR-data-v0.2.0/cze_social")
saveRDS(fra_social, file = "./package/legislatoR-data-v0.2.0/fra_social")
saveRDS(deu_social, file = "./package/legislatoR-data-v0.2.0/deu_social")
saveRDS(irl_social, file = "./package/legislatoR-data-v0.2.0/irl_social")
saveRDS(gbr_social, file = "./package/legislatoR-data-v0.2.0/gbr_social")



#### INTEGRATE VOTEVIEW DATABASE WITH LEGISLATOR ========================================
# full data and codebook available at:
# https://www.voteview.com/
# this part of the code was written by Johana Sperlova

# house
# import core dataset for house members (core) ------------------------------------------
# one row per (pageid, wikidataid, name, birth) with the number of source rows,
# plus the birth date as Date and the birth year as a number
usa_house_core <- readRDS("./data/pol_sci_data/usa_house_core.rds") %>%
  group_by(pageid, wikidataid, name, birth) %>%
  count(name = "record_count_in_group") %>%
  ungroup() %>%
  as.data.frame() %>%
  mutate(birth = as.Date(birth),
         birthyear = as.numeric(format(birth, "%Y")))

# import external dataset (voteview) ----------------------------------------------------
# one row per (bioname, bioguide_id, born, died) with the number of source rows

# extract politicians names (external dataset) ------------------------------------------
# last name: everything before the first comma of `bioname`
# first name: the first word following ", "
# both are trimmed and lower-cased
us_house_mp_characteristics <- read.csv("./data/pol_sci_data/us_house_mp_characteristics.csv") %>%
  group_by(bioname, bioguide_id, born, died) %>%
  count(name = "record_count_in_group") %>%
  ungroup() %>%
  as.data.frame() %>%
  mutate(lastname = tolower(trimws(str_extract(bioname, "^([^,])+"))),
         firstname = tolower(trimws(str_extract(bioname, "(?<=\\,\\s)(\\w+)"))))

# extract politicians names (core dataset) ----------------------------------------------
# first name = leading run of word characters/hyphens,
# last name = final whitespace-separated word; both trimmed and lower-cased
usa_house_core <- usa_house_core %>%
  mutate(firstname = tolower(trimws(str_extract(name, "^([\\w\\-]+)"))),
         lastname = tolower(trimws(str_extract(name, "\\s(\\w+)$"))))

# merge legislator with voteview Level 1 ------------------------------------------------
# first merge; merging the core dataset with the external database
# Strict pass: first name, last name and birth year must all agree.
merge1 <- usa_house_core %>%
  left_join(us_house_mp_characteristics,
            by = c("firstname" = "firstname",
                   "lastname" = "lastname",
                   "birthyear" = "born")) %>%
  select(pageid, wikidataid, bioguide_id, firstname, lastname, birthyear)
# matched rows (to later join); missingness is tested with is.na() rather than by
# comparing against the string "NA"
finaljoin1 <- merge1 %>%
  filter(!is.na(bioguide_id))
# all the unmatched observations; the empty ID column is renamed so it does not
# clash with `bioguide_id` in the second merge
nas1 <- merge1 %>%
  filter(is.na(bioguide_id)) %>%
  rename(nacolumn = bioguide_id)

## merge legislator and voteview Level 2 ------------------------------------------------
# second merge: retry the observations left unmatched by the first merge, this
# time on last name and birth year only
merge2 <- nas1 %>%
  left_join(us_house_mp_characteristics,
            by = c("lastname" = "lastname",
                   "birthyear" = "born")) %>%
  select(pageid, wikidataid, bioguide_id, firstname.x, lastname, birthyear)
# matched observations (kept for the final join); firstname.x is the first name
# coming from the left-hand (core) side of the join and is renamed by name
# instead of by column position
finaljoin2 <- merge2 %>%
  filter(!is.na(bioguide_id)) %>%
  rename(firstname = firstname.x)
# observations that are still unmatched after the second merge
nas2 <- merge2 %>%
  filter(is.na(bioguide_id)) %>%
  rename(nacolumn = bioguide_id, firstname = firstname.x)
# stack the matches of both merges
totaljoin <- rbind(finaljoin1, finaljoin2)
# export the unmatched observations (to be sorted manually), the external data,
# and the matches
# NOTE(review): the first two files are written to the working directory,
# whereas the senate section writes to ./data/pol_sci_data/ - confirm intended
write.csv(nas2, "us_nas2.csv")
write.csv(us_house_mp_characteristics, "us_external_data.csv")
write.csv(totaljoin, "./data/pol_sci_data/totaljoin.csv")
# the rest of the observations were edited manually in excel (around 900)!

# senate
# import core dataset for senate members (core) -----------------------------------------
# one row per distinct pageid/wikidataid/name/birth combination;
# record_count_in_group holds the number of collapsed rows
usa_senate_core <- readRDS("./data/pol_sci_data/usa_senate_core.rds") %>%
  group_by(pageid, wikidataid, name, birth) %>%
  tally(name = "record_count_in_group") %>%
  ungroup() %>%
  as.data.frame()
# convert birth to Date and derive the numeric birth year from it
usa_senate_core$birth <- as.Date(usa_senate_core$birth)
usa_senate_core$birthyear <- as.numeric(
  format(as.Date(usa_senate_core$birth, format = "%d/%m/%Y"), "%Y")
)

# import external dataset (voteview) ----------------------------------------------------
# collapse the file to one row per distinct bioname/bioguide_id/born/died
# combination; record_count_in_group holds the number of collapsed rows
us_senate_mp_characteristics <-
  read.csv("./data/pol_sci_data/us_senate_mp_characteristics.csv") %>%
  count(bioname, bioguide_id, born, died, name = "record_count_in_group") %>%
  as.data.frame()

# extract politicians names (external dataset) ------------------------------------------
# last name: everything in bioname before the first comma, trimmed and lowercased
us_senate_mp_characteristics$lastname <- us_senate_mp_characteristics$bioname %>%
  str_extract("^([^,])+") %>%
  trimws() %>%
  tolower()
# first name: the first word following the comma and whitespace, trimmed and
# lowercased
us_senate_mp_characteristics$firstname <- us_senate_mp_characteristics$bioname %>%
  str_extract("(?<=\\,\\s)(\\w+)") %>%
  trimws() %>%
  tolower()

# extract politicians names (core dataset) ----------------------------------------------
# first name: leading run of word characters and hyphens, lowercased
usa_senate_core$firstname <- usa_senate_core$name %>%
  str_extract("^([\\w\\-]+)") %>%
  trimws() %>%
  tolower()
# last name: final run of word characters preceded by whitespace; the matched
# whitespace is removed by trimws() before lowercasing
usa_senate_core$lastname <- usa_senate_core$name %>%
  str_extract("\\s(\\w+)$") %>%
  trimws() %>%
  tolower()

# merge legislator with voteview Level 1 ------------------------------------------------
# first merge: join the core dataset with the external dataset on first name,
# last name, and birth year
merge1 <- usa_senate_core %>%
  left_join(us_senate_mp_characteristics,
            by = c("firstname" = "firstname",
                   "lastname" = "lastname",
                   "birthyear" = "born")) %>%
  select(pageid, wikidataid, bioguide_id, firstname, lastname, birthyear)
# matched observations (kept for the final join); missing values are tested
# with is.na() rather than by comparison against the string "NA"
finaljoin1 <- merge1 %>%
  filter(!is.na(bioguide_id))
# unmatched observations, passed on to the second merge; the empty bioguide_id
# column is renamed so it does not clash with the bioguide_id joined in there
nas1 <- merge1 %>%
  filter(is.na(bioguide_id)) %>%
  rename(nacolumn = bioguide_id)

# merge legislator with voteview Level 2 ------------------------------------------------
# second merge: retry the observations left unmatched by the first merge, this
# time on last name and birth year only
merge2 <- nas1 %>%
  left_join(us_senate_mp_characteristics,
            by = c("lastname" = "lastname",
                   "birthyear" = "born")) %>%
  select(pageid, wikidataid, bioguide_id, firstname.x, lastname, birthyear)
# matched observations (kept for the final join); firstname.x is the first name
# coming from the left-hand (core) side of the join and is renamed by name
# instead of by column position
finaljoin2 <- merge2 %>%
  filter(!is.na(bioguide_id)) %>%
  rename(firstname = firstname.x)
# observations that are still unmatched after the second merge
nas2 <- merge2 %>%
  filter(is.na(bioguide_id)) %>%
  rename(nacolumn = bioguide_id, firstname = firstname.x)
# stack the matches of both merges
totaljoin <- rbind(finaljoin1, finaljoin2)
# export the unmatched observations (to be sorted manually), the external data,
# and the matches
write.csv(nas2, "./data/pol_sci_data/us_senate_nas2.csv")
write.csv(us_senate_mp_characteristics, "./data/pol_sci_data/us_senate_external_data.csv")
write.csv(totaljoin, "./data/pol_sci_data/totaljoin_senate.csv")
# the rest of the observations were edited manually in excel (around 380)!

# join house and senate -----------------------------------------------------------------
# load usa_nominate dataset
usa_nominate <- as.data.frame(
  read_excel("./data/pol_sci_data/US_Nominate_Voteview_Total_Key.xlsx")
)
# load voteview original dataset
usa_nominate_original <- as.data.frame(
  read.csv("./data/pol_sci_data/Voteview_scpsr.csv")
)

# merge on icpsr & bioguide id ----------------------------------------------------------
# join the two datasets on bioguide_id and keep the identifier columns plus icpsr
merge1 <- left_join(usa_nominate, usa_nominate_original, by = "bioguide_id")
merge1 <- select(merge1, pageid, wikidataid, bioguide_id, firstname, lastname,
                 birthyear, icpsr)
# drop duplicate rows: keep the first row for every pageid
merge2 <- distinct(merge1, pageid, .keep_all = TRUE)
# export the key to csv
write.csv(merge2, "./data/pol_sci_data/usa_congress_wikidataid_icspr_bioguideid_final_key.csv")

# import legislator core and IDs dataset for usa house and senate -----------------------
usa_house_core <- readRDS("./package/legislatoR-data-v0.1.0/data/usa_house_core")
usa_senate_core <- readRDS("./package/legislatoR-data-v0.1.0/data/usa_senate_core")
usa_house_ids <- readRDS("./package/legislatoR-data-v0.1.0/data/usa_house_ids")
usa_senate_ids <- readRDS("./package/legislatoR-data-v0.1.0/data/usa_senate_ids")

# join matches to IDs dataset -----------------------------------------------------------
# read the key exported above and drop rows without a wikidataid
usa_voteview <- read.csv(
  "./data/pol_sci_data/usa_congress_wikidataid_icspr_bioguideid_final_key.csv",
  stringsAsFactors = FALSE
)
usa_voteview <- usa_voteview[!is.na(usa_voteview$wikidataid), ]
# per chamber: keep the key rows whose wikidataid occurs in the chamber's core
# data, reduced to the identifier columns
usa_voteview_house <- usa_voteview %>%
  filter(wikidataid %in% unique(usa_house_core$wikidataid)) %>%
  select(wikidataid, icpsr, bioguide_id)
usa_voteview_senate <- usa_voteview %>%
  filter(wikidataid %in% unique(usa_senate_core$wikidataid)) %>%
  select(wikidataid, icpsr, bioguide_id)
# full join onto the IDs data; where no bioguide_id was matched, fall back to
# the value in parlid
usa_house_ids <- usa_house_ids %>%
  full_join(usa_voteview_house, by = "wikidataid") %>%
  mutate(bioguide_id = ifelse(is.na(bioguide_id), parlid, bioguide_id))
usa_senate_ids <- usa_senate_ids %>%
  full_join(usa_voteview_senate, by = "wikidataid") %>%
  mutate(bioguide_id = ifelse(is.na(bioguide_id), parlid, bioguide_id))

# clean up ------------------------------------------------------------------------------
# keep the identifier columns in their final order
usa_house_ids <- select(usa_house_ids, wikidataid, bioguide_id, icpsr, gndid:politfacts)
usa_senate_ids <- select(usa_senate_ids, wikidataid, bioguide_id, icpsr, gndid:politfacts)
# drop rows in which all columns but one are NA; a logical index is used
# because x[-which(cond), ] removes every row when no row satisfies cond
usa_house_ids <- usa_house_ids[
  rowSums(is.na(usa_house_ids)) != ncol(usa_house_ids) - 1,
]
usa_senate_ids <- usa_senate_ids[
  rowSums(is.na(usa_senate_ids)) != ncol(usa_senate_ids) - 1,
]
# the second column is bioguide_id (see select() above); rename it to bioguide
colnames(usa_house_ids)[2] <- "bioguide"
colnames(usa_senate_ids)[2] <- "bioguide"

saveRDS(usa_house_ids, "./package/legislatoR-data-v0.2.0/usa_house_ids")
saveRDS(usa_senate_ids, "./package/legislatoR-data-v0.2.0/usa_senate_ids")


#### INTEGRATE DATABASE OF PARLIAMENTARY SPEECHES IN IRELAND WITH LEGISLATOR ============
library(vroom)
library(dplyr)
library(magrittr)
library(stringr)
library(lubridate)
# read only the columns needed for matching from the Dail debates file
dpsi <- vroom::vroom(file = "./data/pol_sci_data/Dail_debates_1919-2013/Dail_debates_1919-2013.tab",
                     col_select = c(memberID,date,member_name,party_name))
# keep the first row of every memberID/party_name combination
dpsi <- dpsi %>% distinct(memberID, party_name, .keep_all = TRUE)
irl <- readRDS("./package/legislatoR-data-v1.0.0/irl_core")
irl_pol <- readRDS("./package/legislatoR-data-v1.0.0/irl_political")
#dpsi$member_name <- str_remove(dpsi$member_name, "^.+?\\.|General|Professor") %>%
#  str_trim()
# last name: the last word of member_name, lowercased
dpsi$last_name <- dpsi$member_name %>%
  stringi::stri_extract_last_words(1) %>%
  tolower()
# assign a session number to every dpsi row: row gets i when its date lies
# within the interval spanned by the i-th unique session_start and the i-th
# unique session_end
# NOTE(review): assumes unique(session_start) and unique(session_end) line up
# pairwise and that the position i equals the value of irl_pol$session used for
# matching below - confirm
# NOTE(review): ifelse() yields NA wherever in_session is NA, overwriting a
# session assigned in an earlier iteration - confirm no interval bound is NA
dpsi$session <- NA
for (i in 1:length(unique(irl_pol$session_start))) {
  in_session <- dpsi$date %within% lubridate::interval(unique(irl_pol$session_start)[i], unique(irl_pol$session_end)[i])
  dpsi$session <- ifelse(in_session == TRUE, i, dpsi$session)
}
# combine the political and core legislatoR data and build the same last-name key
irl <- left_join(irl_pol, irl, by = "pageid")
irl$last_name <- irl$name %>%
  stringi::stri_extract_last_words(1) %>%
  tolower()
# build the match key (last name pasted to session) on the legislatoR side:
# within each session the first row per last_name is kept, then only rows whose
# combination of all column values occurs exactly once; the result stays
# grouped by all columns because there is no ungroup() after group_by_all()
irl2 <- irl %>%
  group_by(session) %>%
  distinct(last_name, .keep_all = TRUE) %>%
  ungroup() %>%
  group_by_all() %>%
  filter(n() == 1) %>%
  mutate(matchID = str_c(last_name, session))
# same key construction on the dpsi side
dpsi2 <- dpsi %>%
  group_by(session) %>%
  distinct(last_name, .keep_all = TRUE) %>%
  ungroup() %>%
  group_by_all() %>%
  filter(n() == 1) %>%
  mutate(matchID = str_c(last_name, session))
# match both sides on the key, then carry pageid and name over to dpsi via
# memberID; the row order of the resulting dpsi is relied upon by the
# hard-coded row positions in the corrections that follow
irl2 <- left_join(irl2, dpsi2, by = "matchID")
dpsi <- left_join(dpsi, irl2[,c("pageid", "name", "memberID")], by = "memberID")
# correct mismatches
# overwrite pageid at 110 hard-coded row positions of dpsi with 110 replacement
# values; positions and replacements are paired by order
dpsi$pageid[c(13,30,76,79,180,185,
              197,206,223,261,265,
              274,275,292,307,310,
              334,357,366,391,399,
              406,421,425,426,453,
              457,471,476,497,499,
              504,512,513,518,536,
              545,550,560,564,567,
              568,571,573,579,597,
              600,615,616,625,629,
              630,631,633,635,663,
              670,671,682,683,707,
              715,720,721,731,740,
              750,758,773,781,784,
              791,795,803,817,844,
              852,858,865,888,899,
              901,923,933,945,951,
              967,968,969,985,990,
              999,1002,1004,1012,
              1015,1016,1025,1059,
              1074,1075,1085,1091,
              1110,1113,1115,1121,
              1123,1146,1149)] <- as.integer(c(12962224,10204324,268250,559763,15316591,19887055,
                                         18493722,5076916,21003759,21117442,21056219,
                                         15694619, 262963, 20675980,4923955,20162801,
                                         21279436,25165814,11571030,21445057,20723993,
                         8313964,4951236,155297,19886919,18993945,
                         5211624,8839393,3311885,1749922,4980676,
                         16762121,4951611,15525105,563162,18399837,
                         390173,19553185,11562473,5209531,20423340,
                         18401494,2051931,5230730,17419286,1076658,
                         17767979,1080484,11591809,5211949,21056316,
                         4944644,9563739,9509723,5031154,5223847,
                         30872323,254418,15472425,17748676,533405,
                         5220051,148009,1786645,8896919,2363705,
                         15878770,1087220,2780061,482923,9039250,
                         2671345,969786,1508578,5940560,9719032,
                         15085414,1479176,5226017,316088,1095335,
                         30872401,3322765,4069955,4972475,5160886,
                         9631623,1479223,1387412,1326887,3422685,
                         1508486,3422946,30865860,4892571,3191097,
                         3193024,1479223,3199789,11451134,11435896,
                         11437479,4150590,11416881,31026346,22690975,
                         31022071,7268221,1508492,31022086))
# manually match NAs
# fill the rows whose pageid is still NA, in row order, with the 84 values below
# NOTE(review): assumes exactly 84 rows have an NA pageid at this point - confirm
dpsi[is.na(dpsi$pageid),]$pageid <- as.integer(c(10519936, 15849075, 97509, 12960359, 2218736,
                                                 20480488, 11329755, 20640403, 147390, 13067392,
                                                 15330744, 16654206, 4115669, 20654463, 20723667,
                                                 18709803, 1683145, 11591480, 20251372, 2126221,
                                                 4976440, 21103761, 5238527, 17696717, 18630820,
                                                 21131499,8089293,21321425, 20356044,8166418,
                                                 20580168,1076125,4931235,18301731,478888,
                                                 18504589,18301893,3172374,5230730,15734696,
                                                 5170107,20680931,15614093,983143,5040122,13755540,
                                                 4930848,1085153,2329531,3774806,1087209,
                                                 15755004,2560179,19285376,316094,1474357,15850298,
                                                 30875049,1724217,3313007,15766118,172298,
                                                 556748, 5160977,22450394,3214938,4983885,
                                                 3422568,1474327,3191118,1374439,2199319,
                                                 2695519,11436629,1508611,11417803,
                                                 12529585,23121623,31029261,31022106,31021272,
                                                 1508492,31022071,31023259))
# reset to NA every pageid that occurs more than once in dpsi
dpsi$pageid[which(duplicated(dpsi$pageid) | duplicated(dpsi$pageid, fromLast=TRUE))] <- NA
# print the number of rows that now have an NA pageid (interactive check)
dpsi[is.na(dpsi$pageid),]$pageid %>% length
# assign pageids for 43 hard-coded memberIDs; match() returns the position of
# the first dpsi row with each memberID, and memberIDs and pageids are paired
# by order
dpsi[match(c(835,229,405,798,137,
             2093,93,1032,267,1033,
             800,896,127,1021,1017,
             138,612,91,611,404,
             128,228,268,203,652,
             895,804,654,799,840,
             144,400,145,1315,408,
             1013,2028,2292,2293,2004,
             2323,410,199),dpsi$memberID),]$pageid <- 
  c(10204324,1796630,20527274,20675980,4155405,
    25166139,4916567,20958411,4976440,20971662,
    18913597,1089066,2371351,21278998,11572276,
    18407583,896958,5010169,18444718,20527274,
    18914011,262966,4976287,20601375,146950,
    272755,5230730,5165717,18570544,10519936,
    15129693,1076644,3378954,5174095,3191282,
    1479223, 984957,22751226,31025105,1508492,
    31022071,1080636,271593)
# attach the dpsi memberID to the legislatoR data via pageid and keep the first
# row for every wikidataid
irl <- irl %>%
  left_join(dpsi[, c("pageid", "memberID")], by = "pageid") %>%
  dplyr::distinct(wikidataid, .keep_all = TRUE)
# add the memberID to the IDs dataset and name the new column dpsi
# NOTE(review): the rename is positional and assumes memberID ends up as
# column 11; the file read here is also the file overwritten below, so a
# second run would start from a dataset that already has this column - confirm
ireland_ids <- readRDS("./package/legislatoR-data-v1.0.0/irl_ids")
ireland_ids <- full_join(ireland_ids, irl[, c("wikidataid", "memberID")], by = "wikidataid")
colnames(ireland_ids)[11] <- "dpsi"
# drop rows in which all columns but one are NA; a logical index is used
# because x[-which(cond), ] removes every row when no row satisfies cond
ireland_ids <- ireland_ids[
  rowSums(is.na(ireland_ids)) != ncol(ireland_ids) - 1,
]
saveRDS(ireland_ids, "./package/legislatoR-data-v1.0.0/irl_ids")


#### INTEGRATE parlScot WITH LEGISLATOR =================================================
# read the parlScot data and add its parl_id to the IDs dataset via wikidataid
parlScot <- vroom("./data/pol_sci_data/parlScotch_CLD.csv")
scotland_ids <- readRDS("./package/legislatoR-data-v2.0.0/sco_ids") %>%
  full_join(parlScot[, c("parl_id", "wikidataid")], by = "wikidataid")
# the ninth column is renamed to parlScot
names(scotland_ids)[9] <- "parlScot"
saveRDS(scotland_ids, "./package/legislatoR-data-v2.0.0/sco_ids")
# saschagobel/legislatoR documentation built on Jan. 30, 2024, 7:52 p.m.