In joebrew/vilaweb: Reproducible data analysis for www.vilaweb.cat

# Report-wide knitr chunk defaults
library(knitr)
# The same option values are handed to opts_chunk$set() as one named list.
do.call(
  opts_chunk$set,
  list(
    comment = NA,
    echo = FALSE,
    warning = FALSE,
    message = FALSE,
    error = TRUE,
    cache = FALSE,
    fig.height = 8,
    fig.path = 'cotxes_figures/'
  )
)

# Libraries
library(vilaweb)
library(rtweet)
library(tidyverse)
library(databrew)
library(translateR)
library(sentimentr) # https://github.com/trinker/sentimentr
require(RPostgreSQL)
require(readr)  
require(DBI)
library(webshot)

# #Coches dañados
# system("python3 ../../foreign/twint/Twint.py -s 'coches dañados' --since 2017-09-15 --until 2017-09-25 -o cotxes/coches_danados.csv --csv")
# # Cotxes danyats
# system("python3 ../../foreign/twint/Twint.py -s 'cotxes danyats' --since 2017-09-15 --until 2017-09-25 -o cotxes/cotxes_danyats.csv --csv")
# #Coches destruidos
# system("python3 ../../foreign/twint/Twint.py -s 'coches destruidos' --since 2017-09-15 --until 2017-09-25 -o cotxes/coches_destruidos.csv --csv")
# # Cotxes destruits
# system("python3 ../../foreign/twint/Twint.py -s 'cotxes destruïts' --since 2017-09-15 --until 2017-09-25 -o cotxes/cotxes_destruits.csv --csv")
# # Vehículos dañados
# system("python3 ../../foreign/twint/Twint.py -s 'vehículos dañados' --since 2017-09-15 --until 2017-09-25 -o cotxes/vehiculos_danados.csv --csv")
# # vehicles danyats
# system("python3 ../../foreign/twint/Twint.py -s 'vehicles danyats' --since 2017-09-15 --until 2017-09-25 -o cotxes/vehicles_danyats.csv --csv")
# # destrozar coches
# system("python3 ../../foreign/twint/Twint.py -s 'destrozar coches' --since 2017-09-15 --until 2017-09-25 -o cotxes/destrozar_coches.csv --csv")
# # destrossar cotxes
# system("python3 ../../foreign/twint/Twint.py -s 'destrossar cotxes' --since 2017-09-15 --until 2017-09-25 -o cotxes/destrossar_cotxes --csv")
# 
# system("python3 ../../foreign/twint/Twint.py -s '\"coches\" AND \"guardia civil\"' --since 2017-09-15 --until 2017-09-25 -o cotxes/x.csv --csv")

# system("python3 ../../foreign/twint/Twint.py -s '(\"coches\" OR \"cotxes\") AND (\"guardia civil\" OR \"dañados\" OR \"danyats\" OR \"destruidos\" OR \"destruïts\" OR \"dañar\" OR \"danyar\" OR \"destrossar\" OR \"destrozar\")' --since 2017-09-15 --until 2017-09-30 -o cotxes/final.csv --csv")


# Load the scraped tweet files found (recursively) under 'cotxes/'.
# Only the combined search result, 'final/tweets.csv', is read; each table is
# tagged with the search it came from.
cotxes_dir <- dir('cotxes', recursive = TRUE)
selected_files <- cotxes_dir[cotxes_dir == 'final/tweets.csv']
# To read every scraped file instead, use: selected_files <- cotxes_dir
if (length(selected_files) == 0) {
  # Fail here with a clear message rather than later on a missing `id` column.
  stop("No 'final/tweets.csv' found under 'cotxes/'", call. = FALSE)
}
out_list <- lapply(selected_files, function(rel_path) {
  # The first component of the path *relative to* 'cotxes/' names the search
  # (e.g. 'final'). Taking the first component of the full path always gave
  # the constant 'cotxes'.
  search_label <- unlist(strsplit(rel_path, '/'))[1]
  read_csv(file.path('cotxes', rel_path)) %>%
    mutate(search_string = !!search_label)
})


# Stack all loaded tables and keep only the first row seen for each tweet id
df <- out_list %>%
  bind_rows() %>%
  filter(!duplicated(id))

# Build one timestamp per tweet from its date, time and timezone columns
library(lubridate)
# NOTE(review): as.POSIXct() is called without an explicit format, so the
# trailing '+0<timezone>' offset is presumably not parsed and the clock time is
# read in the session time zone in effect at this point - confirm.
df <- df %>%
  mutate(date_time = as.POSIXct(paste0(date, ' ', time, ' ', '+0', timezone)))
# The session time zone is switched to CET only after the timestamps are built
Sys.setenv(TZ = 'CET')


# df$date_time <- with_tz(df$date_time, 'CET')


# Hourly activity table: one row per clock hour with the number of tweets (n),
# retweets, likes and their total (interactions).
agg <- df %>%
  # Truncate every timestamp to the start of its hour
  mutate(hour = as.POSIXct(cut(date_time, 'hour'))) %>%
  group_by(hour) %>%
  summarise(n = n(),
            # NOTE(review): the '+ 1' is added once per hour, not once per
            # tweet - confirm whether '+ n()' was intended.
            retweets = sum(retweets_count, na.rm = TRUE) + 1,
            # na.rm = TRUE, as for retweets: otherwise a single missing like
            # count makes the hour NA, and the NA is turned into 0 below.
            likes = sum(likes_count, na.rm = TRUE))

# Complete the series so that hours without any tweet appear as explicit rows
left <- data.frame(hour  = seq(min(agg$hour),
                               max(agg$hour),
                               by = 'hour'))
# Explicit key: no reliance on (or message about) the guessed join column
agg <- left_join(left, agg, by = 'hour')

# Hours introduced by the join carry NA counts; they mean "no activity"
agg$n[is.na(agg$n)] <- 0
agg$retweets[is.na(agg$retweets)] <- 0
agg$likes[is.na(agg$likes)] <- 0
agg$interactions <- agg$n + agg$retweets + agg$likes

# Restrict to the period of interest (both bounds inclusive)
agg <- agg %>%
  filter(hour >= '2017-09-19',
         hour <= '2017-09-24')

# Helper vectors for the time axis of the figures:
#   keep_breaks  - axis breaks every four hours
#   strong_lines - midnights, drawn as dashed vertical day separators
#   shader       - one row per midnight with the end of that day (start + 24h)
date_breaks <- data.frame(date_time = sort(unique(agg$hour)))
# Date and hour are both taken from the formatted local clock time so they
# agree with each other. as.Date() on a POSIXct converts in UTC by default,
# which can place a local midnight on the previous day.
date_breaks$date <- as.Date(format(date_breaks$date_time, '%Y-%m-%d'))
# Use an explicit '%H' format instead of substr() on the printed timestamp:
# as.character() on a date-time may omit the time part for exact midnights
# (R >= 4.3), which turned the midnight hour into NA and dropped the day lines.
date_breaks$hour <- as.numeric(format(date_breaks$date_time, '%H'))

keep_breaks <- date_breaks %>%
  filter(hour %% 4 == 0) %>%
  pull(date_time)
strong_lines <- date_breaks %>%
  filter(hour == 0) %>%
  pull(date_time)

shader <- date_breaks %>% filter(hour == 0)
shader$end <- shader$date_time + hours(24)

Unique tweets

# Figure: hourly number of unique tweets over the whole retained period
local({
  # Caption quoting the search expression
  search_caption <- paste0('Search string:\n', 
                           "(\"coches\" OR \"cotxes\") AND (\"guardia civil\" OR \"dañados\" OR \"danyats\" OR \"destruidos\" OR\n\"destruïts\" OR \"dañar\" OR \"danyar\" OR \"destrossar\" OR \"destrozar\")")
  # Text sizes and rotated x labels, applied on top of the databrew theme
  text_tweaks <- theme(
    axis.text.x = element_text(angle = 90, size = 10, vjust = 0.5),
    plot.title = element_text(size = 30),
    plot.subtitle = element_text(size = 25)
  )
  ggplot(data = agg, aes(x = hour, y = n)) +
    # Dashed vertical line at every midnight
    geom_vline(xintercept = strong_lines, alpha = 0.7, lty = 2) +
    geom_line() +
    theme_databrew() +
    # Breaks every four hours (alternative: sort(unique(agg$hour)))
    scale_x_datetime(breaks = keep_breaks) +
    text_tweaks +
    labs(title = 'Hourly tweets',
         subtitle = 'Unique tweets only',
         x = 'Hour',
         y = 'Tweets',
         caption = search_caption)
})

Zoom-in of the above

# Figure: hourly number of unique tweets, restricted to 2017-09-20 .. 2017-09-22
local({
  # Rows of the hourly table inside the zoom window (bounds inclusive)
  zoomed <- agg %>%
    filter(hour >= '2017-09-20',
           hour <= '2017-09-22')
  # Caption quoting the search expression
  search_caption <- paste0('Search string:\n', 
                           "(\"coches\" OR \"cotxes\") AND (\"guardia civil\" OR \"dañados\" OR \"danyats\" OR \"destruidos\" OR\n\"destruïts\" OR \"dañar\" OR \"danyar\" OR \"destrossar\" OR \"destrozar\")")
  # Text sizes and rotated x labels, applied on top of the databrew theme
  text_tweaks <- theme(
    axis.text.x = element_text(angle = 90, size = 15, vjust = 0.5),
    plot.title = element_text(size = 30),
    plot.subtitle = element_text(size = 25)
  )
  ggplot(data = zoomed, aes(x = hour, y = n)) +
    # Dashed vertical line at every midnight
    geom_vline(xintercept = strong_lines, alpha = 0.7, lty = 2) +
    geom_point() +
    geom_line() +
    theme_databrew() +
    # Breaks every four hours (alternative: sort(unique(agg$hour)))
    scale_x_datetime(breaks = keep_breaks) +
    text_tweaks +
    labs(title = 'Hourly tweets',
         subtitle = 'Unique tweets only',
         x = 'Hour',
         y = 'Tweets',
         caption = search_caption)
})

Same as above but including retweets

# Figure: hourly `retweets` column of the aggregate, 2017-09-20 .. 2017-09-22
local({
  # Rows of the hourly table inside the zoom window (bounds inclusive)
  zoomed <- agg %>%
    filter(hour >= '2017-09-20',
           hour <= '2017-09-22')
  # Caption quoting the search expression
  search_caption <- paste0('Search string:\n', 
                           "(\"coches\" OR \"cotxes\") AND (\"guardia civil\" OR \"dañados\" OR \"danyats\" OR \"destruidos\" OR\n\"destruïts\" OR \"dañar\" OR \"danyar\" OR \"destrossar\" OR \"destrozar\")")
  # Text sizes and rotated x labels, applied on top of the databrew theme
  text_tweaks <- theme(
    axis.text.x = element_text(angle = 90, size = 15, vjust = 0.5),
    plot.title = element_text(size = 30),
    plot.subtitle = element_text(size = 25)
  )
  ggplot(data = zoomed, aes(x = hour, y = retweets)) +
    # Dashed vertical line at every midnight
    geom_vline(xintercept = strong_lines, alpha = 0.7, lty = 2) +
    geom_point() +
    geom_line() +
    theme_databrew() +
    # Breaks every four hours (alternative: sort(unique(agg$hour)))
    scale_x_datetime(breaks = keep_breaks) +
    text_tweaks +
    labs(title = 'Hourly tweets',
         subtitle = 'Including retweets',
         x = 'Hour',
         y = 'Tweets',
         caption = search_caption)
})

Same as above but including all interactions (tweet + number of retweets + number of likes)

# Figure: hourly `interactions` (n + retweets + likes), 2017-09-20 .. 2017-09-22
local({
  # Rows of the hourly table inside the zoom window (bounds inclusive)
  zoomed <- agg %>%
    filter(hour >= '2017-09-20',
           hour <= '2017-09-22')
  # Caption quoting the search expression
  search_caption <- paste0('Search string:\n', 
                           "(\"coches\" OR \"cotxes\") AND (\"guardia civil\" OR \"dañados\" OR \"danyats\" OR \"destruidos\" OR\n\"destruïts\" OR \"dañar\" OR \"danyar\" OR \"destrossar\" OR \"destrozar\")")
  # Text sizes and rotated x labels, applied on top of the databrew theme
  text_tweaks <- theme(
    axis.text.x = element_text(angle = 90, size = 15, vjust = 0.5),
    plot.title = element_text(size = 30),
    plot.subtitle = element_text(size = 25)
  )
  ggplot(data = zoomed, aes(x = hour, y = interactions)) +
    # Dashed vertical line at every midnight
    geom_vline(xintercept = strong_lines, alpha = 0.7, lty = 2) +
    geom_point() +
    geom_line() +
    theme_databrew() +
    # Breaks every four hours (alternative: sort(unique(agg$hour)))
    scale_x_datetime(breaks = keep_breaks) +
    text_tweaks +
    labs(title = 'Hourly tweets',
         subtitle = 'All interactions',
         x = 'Hour',
         y = 'Tweets',
         caption = search_caption)
})

Technical details

Data scraped from Twitter on February 21, 2019 using the Python twint library.
Data processed, aggregated, and visualized using R.
The numbers of retweets and likes are as of the date of data retrieval.
All code for this analysis at https://github.com/joebrew/vilaweb/tree/master/analyses/sep20/cotxes.Rmd

joebrew/vilaweb documentation built on Sept. 11, 2020, 3:42 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

Tweet to @rdrrHQ

GitHub issue tracker

ian@mutexlabs.com