library(pageviews)
library(lubridate)
library(vilaweb)
library(rtweet)
library(tidyverse)
library(databrew)
library(translateR)
library(sentimentr) # https://github.com/trinker/sentimentr
require(RPostgreSQL)
require(readr)
require(DBI)
library(webshot)
# Get newspaper headlines ----
# Downloads one front-page image per newspaper per day from img.kiosko.net
# into 'newspaper_headlines/'. A file that already exists locally is skipped,
# so the script can be re-run after an interruption.
if (!dir.exists('newspaper_headlines')) {
  dir.create('newspaper_headlines')
}
# Newspaper identifiers as they appear in the image URL
newspapers <- c('elpais', 'abc',
                'elmundo',
                'larazon',
                'lavanguardia',
                'elperiodico')
dates <- seq(as.Date('2017-09-15'), as.Date('2017-12-24'), by = 1)
for (i in seq_along(newspapers)) {
  for (j in seq_along(dates)) {
    this_newspaper <- newspapers[i]
    this_date <- dates[j]
    formatted_date <- format(this_date, '%Y/%m/%d')
    this_path <- paste0("http://img.kiosko.net/",
                        formatted_date,
                        "/es/",
                        this_newspaper,
                        ".750.jpg")
    message(this_newspaper, ' - ', this_date)
    this_file <- paste0('newspaper_headlines/',
                        this_date,
                        '_',
                        this_newspaper,
                        '.jpg')
    if (!file.exists(this_file)) {
      message('...Downloading')
      # mode = 'wb': the target is a binary JPEG; the default text mode
      # corrupts it on Windows.
      # A single missing image must not abort the remaining downloads.
      fetched <- tryCatch(
        download.file(url = this_path,
                      destfile = this_file,
                      mode = 'wb') == 0,
        error = function(e) {
          warning('Could not download ', this_path, ': ',
                  conditionMessage(e), call. = FALSE)
          FALSE
        }
      )
      # Remove a partial file so the next run retries instead of skipping it
      if (!isTRUE(fetched) && file.exists(this_file)) {
        unlink(this_file)
      }
      Sys.sleep(1)
    } else {
      message('...Skipping')
    }
  }
}
# Move a 'YYYYMMDD00' timestamp string forward by one month.
# x: string in the layout used for the `start` argument of
#    article_pageviews() in this script (e.g. "2017010100").
# Returns the shifted timestamp in the same layout.
add_month <- function(x) {
  stamp_layout <- '%Y%m%d00'
  parsed <- as.Date(x, format = stamp_layout)
  month(parsed) <- month(parsed) + 1
  format(parsed, stamp_layout)
}
# Get Catalan page views ----
# Page views on ca.wikipedia for each person, cached in 'data_ca.RData'.
if (!file.exists('data_ca.RData')) {
  people <- c("Carles Puigdemont",
              "Pedro Sánchez (politician)",
              "Pablo Casado",
              "Albert Rivera",
              "Oriol Junqueras",
              "Jordi Turull",
              "Jordi Cuixart",
              "Jordi Sànchez i Picanyol",
              "Josep Rull",
              "Dolors Bassa",
              "Raül Romeva",
              "Carme Forcadell",
              "Joaquim Forn",
              "Marta Rovira",
              "Antoni Comín",
              "Meritxell Serret",
              "Lluís Puig",
              "Anna Gabriel i Sabaté",
              "Clara Ponsatí",
              "Valtònyc",
              "Inés Arrimadas",
              "Miquel Iceta",
              "Xavier García Albiol",
              "Josep Borrell")
  # Article titles to query on ca.wikipedia when they differ from the label
  # stored in `person`. People not listed here (Jordi Sànchez i Picanyol,
  # Anna Gabriel i Sabaté, Valtònyc, Xavier García Albiol) are queried under
  # their label.
  ca_titles <- c(
    'Pedro Sánchez (politician)' = 'Pedro Sánchez Pérez-Castejón',
    'Carles Puigdemont' = 'Carles Puigdemont i Casamajó',
    'Pablo Casado' = 'Pablo Casado Blanco',
    'Albert Rivera' = 'Albert Rivera Díaz',
    'Oriol Junqueras' = 'Oriol Junqueras i Vies',
    'Jordi Turull' = 'Jordi Turull i Negre',
    'Jordi Cuixart' = 'Jordi Cuixart i Navarro',
    'Josep Rull' = 'Josep Rull i Andreu',
    'Dolors Bassa' = 'Dolors Bassa i Coll',
    'Raül Romeva' = 'Raül Romeva i Rueda',
    'Carme Forcadell' = 'Carme Forcadell i Lluís',
    'Joaquim Forn' = 'Joaquim Forn i Chiariello',
    'Marta Rovira' = 'Marta Rovira i Vergés',
    'Antoni Comín' = 'Antoni Comín i Oliveres',
    'Meritxell Serret' = 'Meritxell Serret i Aleu',
    'Lluís Puig' = 'Lluís Puig i Gordi',
    'Clara Ponsatí' = 'Clara Ponsatí i Obiols',
    'Inés Arrimadas' = 'Inés Arrimadas García',
    'Miquel Iceta' = 'Miquel Iceta i Llorens',
    'Josep Borrell' = 'Josep Borrell i Fontelles'
  )
  end_date <- "2018121500"
  out_list <- vector('list', length(people))
  for (i in seq_along(people)) {
    start_date <- "2017010100"
    person <- people[i]
    new_person <- if (person %in% names(ca_titles)) {
      ca_titles[[person]]
    } else {
      person
    }
    message(person, '---------')
    # Retry with a start date one month later each time the request fails,
    # until it succeeds. Every attempt is wrapped in try() and its result
    # re-checked, so the loop ends on the first success.
    repeat {
      res <- try(
        article_pageviews(project = "ca.wikipedia",
                          article = new_person,
                          platform = "all",
                          user_type = "all",
                          start = start_date,
                          end = end_date,
                          reformat = TRUE)
      )
      if (!inherits(res, 'try-error')) {
        break
      }
      Sys.sleep(1)
      start_date <- add_month(start_date)
      # Both stamps are fixed-width digit strings, so string comparison works
      if (start_date > end_date) {
        stop('No ca.wikipedia page views found for "', new_person,
             '" before ', end_date, call. = FALSE)
      }
      message('---New date: ', start_date)
    }
    res$person <- person
    out_list[[i]] <- res
  }
  pv_ca <- bind_rows(out_list)
  save(pv_ca, file = 'data_ca.RData')
} else {
  load('data_ca.RData')
}
pv_ca$date <- as.Date(pv_ca$date)
# Get English page views ----
# Page views on en.wikipedia for each person, cached in 'data.RData'.
if (!file.exists('data.RData')) {
  people <- c("Carles Puigdemont",
              "Pedro Sánchez (politician)",
              "Pablo Casado",
              "Albert Rivera",
              "Oriol Junqueras",
              "Jordi Turull",
              "Jordi Cuixart",
              "Jordi Sànchez i Picanyol",
              "Josep Rull",
              "Dolors Bassa",
              "Raül Romeva",
              "Carme Forcadell",
              "Joaquim Forn",
              "Marta Rovira",
              "Antoni Comín",
              "Meritxell Serret",
              "Lluís Puig",
              "Anna Gabriel i Sabaté",
              "Clara Ponsatí",
              "Valtònyc",
              "Inés Arrimadas",
              "Miquel Iceta",
              "Xavier García Albiol",
              "Josep Borrell")
  end_date <- "2018121500"
  out_list <- vector('list', length(people))
  for (i in seq_along(people)) {
    start_date <- "2017010100"
    person <- people[i]
    message(person, '---------')
    # Retry with a start date one month later each time the request fails,
    # until it succeeds. Every attempt is wrapped in try() and its result
    # re-checked, so the loop ends on the first success.
    repeat {
      res <- try(
        article_pageviews(project = "en.wikipedia",
                          article = person,
                          platform = "all",
                          user_type = "all",
                          start = start_date,
                          end = end_date,
                          reformat = TRUE)
      )
      if (!inherits(res, 'try-error')) {
        break
      }
      Sys.sleep(1)
      start_date <- add_month(start_date)
      # Both stamps are fixed-width digit strings, so string comparison works
      if (start_date > end_date) {
        stop('No en.wikipedia page views found for "', person,
             '" before ', end_date, call. = FALSE)
      }
      message('---New date: ', start_date)
    }
    res$person <- person
    out_list[[i]] <- res
  }
  pv <- bind_rows(out_list)
  save(pv, file = 'data.RData')
} else {
  load('data.RData')
}
pv$date <- as.Date(pv$date)
# Get Spanish page views too ----
# Page views on es.wikipedia for each person, cached in 'data_es.RData'.
if (!file.exists('data_es.RData')) {
  people <- c("Carles Puigdemont",
              "Pedro Sánchez",
              "Pablo Casado",
              "Albert Rivera",
              "Oriol Junqueras",
              "Jordi Turull",
              "Jordi Cuixart",
              "Jordi Sànchez i Picanyol",
              "Josep Rull",
              "Dolors Bassa",
              "Raül Romeva",
              "Carme Forcadell",
              "Joaquim Forn",
              "Marta Rovira",
              "Antoni Comín",
              "Meritxell Serret",
              "Lluís Puig",
              "Anna Gabriel",
              "Clara Ponsatí",
              "Valtònyc",
              "Inés Arrimadas",
              "Miquel Iceta",
              "Xavier García Albiol",
              "Josep Borrell")
  end_date <- "2018121500"
  out_list <- vector('list', length(people))
  # Iterate over every person (not a partial range), so the cache holds the
  # same set of people as the English and Catalan data.
  for (i in seq_along(people)) {
    start_date <- "2017010100"
    person <- people[i]
    message(person, '---------')
    # Retry with a start date one month later each time the request fails,
    # until it succeeds. The per-person result is kept in `pv_person`, not
    # `pv`, because `pv` already holds the English page views at this point.
    repeat {
      pv_person <- try(
        article_pageviews(project = "es.wikipedia",
                          article = person,
                          platform = "all",
                          user_type = "all",
                          start = start_date,
                          end = end_date,
                          reformat = TRUE)
      )
      if (!inherits(pv_person, 'try-error')) {
        break
      }
      Sys.sleep(1)
      start_date <- add_month(start_date)
      # Both stamps are fixed-width digit strings, so string comparison works
      if (start_date > end_date) {
        stop('No es.wikipedia page views found for "', person,
             '" before ', end_date, call. = FALSE)
      }
      message('---New date: ', start_date)
    }
    pv_person$person <- person
    out_list[[i]] <- pv_person
  }
  pv_es <- bind_rows(out_list)
  # Conform names to the labels used in the English and Catalan data
  pv_es <- pv_es %>%
    mutate(person = ifelse(grepl('Gabriel', person),
                           'Anna Gabriel i Sabaté',
                           ifelse(grepl('Pedro Sánchez', person),
                                  "Pedro Sánchez (politician)",
                                  person)))
  save(pv_es, file = 'data_es.RData')
} else {
  load('data_es.RData')
}
pv_es$date <- as.Date(pv_es$date)
pv$date <- as.Date(pv$date)
# Combine ----
# Stack the three language editions into one table, tagging each row with
# the edition it came from.
pv <- bind_rows(
  pv %>% mutate(language = 'English'),
  pv_es %>% mutate(language = 'Español'),
  pv_ca %>% mutate(language = 'Catalan')
)
# Clean up a bit: strip the disambiguation suffixes so every person has a
# single short label across languages
pv <- pv %>%
  mutate(person = gsub(' (politician)', '', person, fixed = TRUE),
         person = gsub(' i Picanyol', '', person, fixed = TRUE),
         person = gsub(' i Sabaté', '', person, fixed = TRUE))
# Per-person attributes, one row per person so the values cannot drift out of
# alignment.
#   indepe: logical flag (presumably pro-independence or not; confirm with
#           the analysis that consumes it)
#   exile:  'Exile', 'Prison' or NA
cleaner <- tribble(
  ~person,                ~indepe, ~exile,
  'Albert Rivera',        FALSE,   NA_character_,
  'Anna Gabriel',         TRUE,    'Exile',
  'Antoni Comín',         TRUE,    'Exile',
  'Carles Puigdemont',    TRUE,    'Exile',
  'Carme Forcadell',      TRUE,    'Prison',
  'Clara Ponsatí',        TRUE,    'Exile',
  'Dolors Bassa',         TRUE,    'Prison',
  'Inés Arrimadas',       FALSE,   NA_character_,
  'Joaquim Forn',         TRUE,    'Prison',
  'Jordi Cuixart',        TRUE,    'Prison',
  'Jordi Sànchez',        TRUE,    'Prison',
  'Jordi Turull',         TRUE,    'Prison',
  'Josep Borrell',        FALSE,   NA_character_,
  'Josep Rull',           TRUE,    'Prison',
  'Lluís Puig',           TRUE,    'Exile',
  'Marta Rovira',         TRUE,    'Exile',
  'Meritxell Serret',     TRUE,    'Exile',
  'Miquel Iceta',         FALSE,   NA_character_,
  'Oriol Junqueras',      TRUE,    'Prison',
  'Pablo Casado',         FALSE,   NA_character_,
  'Pedro Sánchez',        FALSE,   NA_character_,
  'Raül Romeva',          TRUE,    'Prison',
  'Valtònyc',             TRUE,    'Exile',
  'Xavier García Albiol', FALSE,   NA_character_
)
# Explicit key: the only column the two tables share is `person`
pv <- left_join(pv, cleaner, by = 'person')
# Get wikipedia data for charlottesville organizer ----
# Page views on en.wikipedia for the two people below, cached in
# 'pv_char.RData'.
if (file.exists('pv_char.RData')) {
  load('pv_char.RData')
} else {
  end_date <- "2018121500"
  char_people <- c('Jason Kessler', 'Richard B. Spencer')
  out_list <- vector('list', length(char_people))
  for (i in seq_along(char_people)) {
    # Reset per person so a start date shifted for one article does not
    # carry over to the next
    start_date <- "2017010100"
    person <- char_people[i]
    message(person, '---------')
    # Retry with a start date one month later each time the request fails,
    # until it succeeds. Every attempt is wrapped in try() and its result
    # re-checked, so the loop ends on the first success.
    repeat {
      res <- try(
        article_pageviews(project = "en.wikipedia",
                          article = person,
                          platform = "all",
                          user_type = "all",
                          start = start_date,
                          end = end_date,
                          reformat = TRUE)
      )
      if (!inherits(res, 'try-error')) {
        break
      }
      Sys.sleep(1)
      start_date <- add_month(start_date)
      # Both stamps are fixed-width digit strings, so string comparison works
      if (start_date > end_date) {
        stop('No en.wikipedia page views found for "', person,
             '" before ', end_date, call. = FALSE)
      }
      message('---New date: ', start_date)
    }
    res$person <- person
    out_list[[i]] <- res
  }
  pv_char <- bind_rows(out_list)
  pv_char$date <- as.Date(pv_char$date)
  save(pv_char,
       file = 'pv_char.RData')
}
# Bar chart of total Wikipedia page views per person, one facet per language
# edition. Reads the global `pv` table.
#   language: 'ca' gives Catalan axis label and title; any other value gives
#             the English ones.
#   since:    only rows with `date >= since` are counted.
# The title text always mentions 2018, whatever `since` is.
# Returns the ggplot object.
make_wiki_plot <- function(language = 'en',
                           since = '2018-01-01') {
  # Total views per person and edition. Inside the dplyr verbs `language`
  # is the column of `pv`.
  plot_data <- summarise(
    group_by(filter(pv, date >= since), person, language),
    views = sum(views)
  )
  # Label text depends on the `language` argument
  in_catalan <- language == 'ca'
  x <- ''
  y <- if (in_catalan) 'Visites' else 'Visits'
  title <- if (in_catalan) {
    'Visites de pàgines Wikipedia, 2018'
  } else {
    'Wikipedia page visits, 2018'
  }
  # Three fill colours, one per language edition
  cols <- databrew::make_colors(10)[c(3, 5, 7)]
  rotated_labels <- theme(axis.text.x = element_text(angle = 90,
                                                     vjust = 0.5,
                                                     hjust = 1))
  ggplot(data = plot_data,
         aes(x = person, y = views, fill = language)) +
    geom_bar(stat = 'identity') +
    theme_vilaweb() +
    rotated_labels +
    scale_fill_manual(name = '', values = cols) +
    labs(x = x, y = y, title = title) +
    facet_wrap(~language, ncol = 3)
}
# make_wiki_plot()
# make_wiki_plot('ca')
# Get tweets from our people of interest ----
# `tl` is the full contents of the `twitter` table, cached in 'tl.RData'.
if (file.exists('tl.RData')) {
  load('tl.RData')
} else {
  # Connect to the db
  pg <- DBI::dbDriver("PostgreSQL")
  con <- DBI::dbConnect(pg, dbname = "twitter")
  # `finally` closes the connection even when the query fails
  tl <- tryCatch(
    RPostgreSQL::dbGetQuery(con, "SELECT * FROM twitter"),
    finally = DBI::dbDisconnect(con)
  )
  save(tl, file = 'tl.RData')
}
# GET TWEETS FROM POLITICIANS, SEP 20-22 ----
# Produces three cached tables:
#   people_tweets       politicians/parties, 2017-09-20 to 2017-09-22
#   people_tweets_long  politicians/parties, 2017-09-10 to 2017-10-31
#   newspaper_tweets    newspapers,          2017-09-10 to 2017-10-31
# All three caches are required; if any is missing everything is rebuilt, so
# a partial cache cannot make load() fail.
tweet_caches <- c('people_tweets.RData',
                  'people_tweets_long.RData',
                  'newspaper_tweets.RData')
if (all(file.exists(tweet_caches))) {
  load('people_tweets.RData')
  load('people_tweets_long.RData')
  load('newspaper_tweets.RData')
} else {
  # The 2 week period starting 20 sep
  dates <- seq(as.Date('2017-09-20'), (as.Date('2017-09-20') + 13), 1)
  long_dates <- seq(as.Date('2017-09-10'), (as.Date('2017-10-31')), 1)
  # Usernames are matched in lower case; unique() drops handles that were
  # listed twice
  people <-
    unique(tolower(c('Santi_ABASCAL',
                     'Albert_Rivera',
                     'InesArrimadas',
                     'sanchezcastejon',
                     'pablocasado_',
                     'ALevySoler',
                     'miqueliceta',
                     'Pablo_Iglesias_',
                     'albiol_xg',
                     'carrizosacarlos',
                     'ciudadanoscs',
                     'ciutadanscs',
                     'eva_granados',
                     'j_zaragoza_',
                     'marianorajoy',
                     'meritxell_batet',
                     'miqueliceta',
                     'pablocasado_',
                     'ppcatalunya',
                     'pnique',
                     'ppopular',
                     'psoe',
                     'santi_abascal',
                     'sanchezcastejon',
                     'socialistes_cat',
                     'societatcc',
                     'vox_es')))
  newspapers <- c('cronicaglobal',
                  'elespanolcom',
                  'elmundoespana',
                  'elconfidencial',
                  'okdiario',
                  'elpais_espana',
                  'lavanguardia',
                  'elperiodico')
  # Tweets from any tracked account, flagged by account type (computed once
  # and reused for all three tables)
  tracked <- tl %>%
    filter(username %in% c(newspapers, people)) %>%
    mutate(is_newspaper = username %in% newspapers,
           is_person = username %in% people)
  df <- tracked %>%
    filter(date %in% dates)
  # Each person's tweets on each day, 20-22 sep
  people_tweets <- df %>%
    filter(is_person) %>%
    filter(date <= '2017-09-22') %>%
    arrange(username, date)
  people_tweets_long <- tracked %>%
    filter(is_person) %>%
    filter(date %in% long_dates)
  newspaper_tweets <- tracked %>%
    filter(is_newspaper) %>%
    filter(date %in% long_dates)
  # Save for later use
  save(people_tweets_long,
       file = 'people_tweets_long.RData')
  save(newspaper_tweets,
       file = 'newspaper_tweets.RData')
  save(people_tweets,
       file = 'people_tweets.RData')
}
#' Prepend zero(s) to a number
#'
#' Prepend one or more 0's to a number. Useful for alphabetizing factor
#' levels named with numbers.
#'
#' @param x Vector of values to pad; converted with `as.character()`.
#' @param n Target width in characters. Values already `n` characters or
#'   longer are returned unchanged.
#' @return A character vector the same length as `x`. `NA` elements stay
#'   `NA`; zero-length input gives zero-length output.
add_zero <- function(x, n) {
  x <- as.character(x)
  # Number of zeros each element needs (never negative), aligned to `x`
  adders <- pmax(n - nchar(x), 0)[seq_along(x)]
  present <- !is.na(x)
  x[present] <- paste0(strrep('0', adders[present]), x[present])
  x
}
# Screenshot every tweet in `people_tweets` ----
# Each row gets 'screenshots/<zero-padded row index>.png', produced by the
# external `screenshot-tweet` command. Existing files are not regenerated.
# `picture_df` then pairs each file name with that row's metadata.
if (!dir.exists('screenshots')) {
  dir.create('screenshots')
}
n_tweets <- nrow(people_tweets)
person_dates <- rep(NA, n_tweets)
the_dates <- rep(NA, n_tweets)
the_times <- rep(NA, n_tweets)
the_timezones <- rep(NA, n_tweets)
the_ids <- rep(NA, n_tweets)
# File name for each row, recorded as we go so it always lines up with the
# metadata, whatever else is in the directory
files <- rep(NA_character_, n_tweets)
done <- TRUE
for (i in seq_len(n_tweets)) {
  message(i)
  this_person <- people_tweets$username[i]
  this_url <- people_tweets$link[i]
  this_date <- people_tweets$date[i]
  this_id <- people_tweets$id[i]
  person_dates[i] <- paste0(this_person, ' ', this_date)
  the_dates[i] <- this_date
  the_times[i] <- as.character(people_tweets$time[i])
  the_timezones[i] <- as.character(people_tweets$timezone[i])
  the_ids[i] <- this_id
  file_name <- paste0("screenshots/",
                      add_zero(i, 5),
                      ".png")
  files[i] <- basename(file_name)
  if (!file.exists(file_name)) {
    # Quote both arguments: the URL comes from the database and must not be
    # interpreted by the shell
    system2("screenshot-tweet",
            args = c(shQuote(this_url), shQuote(file_name)))
  } else {
    message('Skipping ', i, ' because file already exists.')
  }
}
# Report screenshots that could not be produced instead of failing later on
# a length mismatch
missing_shots <- !file.exists(file.path('screenshots', files))
if (any(missing_shots)) {
  warning(sum(missing_shots),
          ' tweet screenshot(s) are missing from screenshots/',
          call. = FALSE)
}
picture_df <-
  tibble(file = files,
         person_dates,
         the_dates,
         the_times,
         the_timezones,
         id = the_ids)
# `the_dates` lost its Date class when stored element by element; restore it
# and keep it as text
picture_df <- picture_df %>%
  arrange(person_dates, the_times) %>%
  mutate(the_dates = as.character(as.Date(the_dates, origin = '1970-01-01')))
# GET TWEETS FROM POLITICIANS, OCT 16-18 ----
# `people_tweets2`: tweets from the tracked politicians/parties dated
# 2017-10-16 to 2017-10-18, cached in 'people_tweets2.RData'.
if (!('people_tweets2.RData' %in% dir())) {
  load('tl.RData')
  # The three days of interest
  dates <- seq(as.Date('2017-10-16'), (as.Date('2017-10-18')), 1)
  df <- filter(tl, date %in% dates)
  # Usernames are matched in lower case
  people <- tolower(c(
    'Santi_ABASCAL', 'Albert_Rivera', 'InesArrimadas', 'sanchezcastejon',
    'pablocasado_', 'ALevySoler', 'miqueliceta', 'Pablo_Iglesias_',
    'albiol_xg', 'carrizosacarlos', 'ciudadanoscs', 'ciutadanscs',
    'eva_granados', 'j_zaragoza_', 'marianorajoy', 'meritxell_batet',
    'miqueliceta', 'pablocasado_', 'ppcatalunya', 'pnique',
    'ppopular', 'psoe', 'santi_abascal', 'sanchezcastejon',
    'socialistes_cat', 'societatcc', 'vox_es'
  ))
  newspapers <- c(
    'cronicaglobal', 'elespanolcom', 'elmundoespana', 'elconfidencial',
    'okdiario', 'elpais_espana', 'lavanguardia', 'elperiodico'
  )
  # Keep tracked accounts only and flag the account type
  tracked_accounts <- c(newspapers, people)
  df <- df %>%
    filter(username %in% tracked_accounts) %>%
    mutate(is_newspaper = username %in% newspapers,
           is_person = username %in% people)
  # Politicians' tweets, ordered by account and day (`df` is already
  # restricted to `dates`)
  people_tweets2 <- arrange(filter(df, is_person, date %in% dates),
                            username, date)
  # Save for later use
  save(people_tweets2, file = 'people_tweets2.RData')
} else {
  load('people_tweets2.RData')
}
# Screenshot every tweet in `people_tweets2` ----
# Each row gets 'screenshots2/<zero-padded row index>.png', produced by the
# external `screenshot-tweet` command. Existing files are not regenerated.
# `picture_df2` then pairs each file name with that row's metadata.
if (!dir.exists('screenshots2')) {
  dir.create('screenshots2')
}
n_tweets2 <- nrow(people_tweets2)
person_dates <- rep(NA, n_tweets2)
the_dates <- rep(NA, n_tweets2)
the_times <- rep(NA, n_tweets2)
the_timezones <- rep(NA, n_tweets2)
# File name for each row, recorded as we go so it always lines up with the
# metadata, whatever else is in the directory
files <- rep(NA_character_, n_tweets2)
done <- TRUE
for (i in seq_len(n_tweets2)) {
  message(i)
  this_person <- people_tweets2$username[i]
  this_url <- people_tweets2$link[i]
  this_date <- people_tweets2$date[i]
  person_dates[i] <- paste0(this_person, ' ', this_date)
  the_dates[i] <- this_date
  the_times[i] <- as.character(people_tweets2$time[i])
  the_timezones[i] <- as.character(people_tweets2$timezone[i])
  file_name <- paste0("screenshots2/",
                      add_zero(i, 5),
                      ".png")
  files[i] <- basename(file_name)
  if (!file.exists(file_name)) {
    # Quote both arguments: the URL comes from the database and must not be
    # interpreted by the shell
    system2("screenshot-tweet",
            args = c(shQuote(this_url), shQuote(file_name)))
  } else {
    message('Skipping ', i, ' because file already exists.')
  }
}
# Report screenshots that could not be produced instead of failing later on
# a length mismatch
missing_shots <- !file.exists(file.path('screenshots2', files))
if (any(missing_shots)) {
  warning(sum(missing_shots),
          ' tweet screenshot(s) are missing from screenshots2/',
          call. = FALSE)
}
picture_df2 <-
  tibble(file = files,
         person_dates,
         the_dates,
         the_times,
         the_timezones)
# `the_dates` lost its Date class when stored element by element; restore it
# and keep it as text
picture_df2 <- picture_df2 %>%
  arrange(person_dates, the_times) %>%
  mutate(the_dates = as.character(as.Date(the_dates, origin = '1970-01-01')))
# Tweets from the accounts in `congress`, 2017-08-01 to 2017-08-30 ----
# Cached in 'usa.RData'. `congress` is not defined in this script; it is
# presumably a dataset from one of the attached packages (TODO confirm).
if (!file.exists('usa.RData')) {
  load('tl.RData')
  # Lower-cased like every other account list matched against `username`
  # in this script, so mixed-case handles are not silently dropped
  keep <- tolower(congress$user_name)
  usa <- tl %>%
    filter(username %in% keep) %>%
    filter(date >= '2017-08-01',
           date <= '2017-08-30')
  save(usa, file = 'usa.RData')
} else {
  load('usa.RData')
}
# Tweets from the accounts in `news`, 2017-08-01 to 2017-08-30 ----
# Cached in 'news.RData' as `usa_news`.
if ('news.RData' %in% dir()) {
  load('news.RData')
} else {
  load('tl.RData')
  # Account names are matched in lower case
  keep <- tolower(news$user_name)
  usa_news <- filter(
    filter(tl, username %in% keep),
    date >= '2017-08-01',
    date <= '2017-08-30'
  )
  save(usa_news, file = 'news.RData')
}
# Get google search trends ----
# Interest over time for four search terms, each within its own country,
# stacked into `gt` and cached in 'gt.RData'.
library(gtrendsR)
if (file.exists('gt.RData')) {
  load('gt.RData')
} else {
  g_df <-
    tibble(people = c("Jordi Cuixart",
                      'Jordi Sànchez',
                      'Jason Kessler',
                      'Richard Spencer'),
           geo = c('ES', 'ES', 'US', 'US'))
  out_list <- lapply(seq_len(nrow(g_df)), function(i) {
    g <- gtrends(g_df$people[i], geo = g_df$geo[i])
    interest <- g$interest_over_time
    interest$date <- as.Date(interest$date)
    # Hits may arrive as text containing '<1'; store that as 0.5 and make
    # the column numeric in every table so they can be row-bound
    interest$hits[interest$hits %in% '<1'] <- '0.5'
    interest$hits <- as.numeric(interest$hits)
    interest
  })
  gt <- bind_rows(out_list)
  save(gt, file = 'gt.RData')
}
# Prepare turkish coup data ----
# July 2016 tweets from the Spanish newspaper accounts and from the accounts
# in `news`, cached in 'turkey.RData'.
if (file.exists('turkey.RData')) {
  load('turkey.RData')
} else {
  # Define the newspaper usernames here: depending on which caches existed
  # earlier in the script, `newspapers` could otherwise still hold the
  # identifiers used for the front-page image downloads.
  newspapers <- c('cronicaglobal',
                  'elespanolcom',
                  'elmundoespana',
                  'elconfidencial',
                  'okdiario',
                  'elpais_espana',
                  'lavanguardia',
                  'elperiodico')
  international <- tolower(news$user_name)
  turkey <- tl %>%
    filter(username %in% newspapers |
             username %in% international) %>%
    filter(date >= '2016-07-01',
           date <= '2016-07-31') %>%
    mutate(is_spanish = username %in% newspapers,
           is_international = username %in% international)
  save(turkey,
       file = 'turkey.RData')
}
# Flag strings that contain 'violen' or 'violèn' (case-sensitive).
# x: character vector.
# Returns a logical vector the same length as `x`.
detect_violence <- function(x) {
  violence_pattern <- 'violen|violèn'
  grepl(pattern = violence_pattern, x = x)
}
# Newspaper usernames, as matched against `username` in the tweet table
newspapers <- c(
  'cronicaglobal', 'elespanolcom', 'elmundoespana', 'elconfidencial',
  'okdiario', 'elpais_espana', 'lavanguardia', 'elperiodico'
)
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.