## data-raw/collect_news.R

## code to prepare `collect_news` dataset goes here
##-----------------------------------------------------------------------
library(tidyverse)
library(glue)
library(qdapRegex)
library(tidytext)
library(syuzhet)
library(lubridate)
library(newsanchor)
library(stopwords)
library(quanteda)
library(stm)

# combine three Dutch stop-word lists and de-duplicate them
stop_words <- unique(c(stopwords('nl'),
                       stopwords('nl', source = 'nltk'),
                       stopwords('nl', source = 'stopwords-iso')))
dutch_stop_words <- tibble(word = stop_words)


# Keywords used to collect news items
##-----------------------------------------------------------------------
search_items <- c('vaccineren',
                  'vaccineren autism',
                  'vaccineren mmr',
                  'vaccineren measles',
                  'vaccineren corona',
                  'vaccineren coronavirus',
                  'vaccineren covid',
                  'vaccinatie autism',
                  'vaccinatie corona',
                  'Vaccinatie covid',
                  'Vaccinatie autism',
                  'vaxxhappened',
                  'CoronaVaccin',
                  'plandemic',
                  'autism vaccine',
                  'vrijheid vaccine',
                  'mmr vaccine',
                  'measles autism',
                  'Vaccinatie',
                  'antivax',
                  'vaccination autism',
                  'vaccination corona',
                  'antivax corona',
                  'antivax covid',
                  'coronavaccin',
                  'viruswaanzin')


news_item_results_df <- data.frame()
news_item_metadata <- data.frame()
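
# `api_key` must be defined before the collection loop runs. A minimal
# sketch (assumption: the key is stored in the NEWS_API_KEY environment
# variable, which is also the default newsanchor looks up):
api_key <- Sys.getenv('NEWS_API_KEY')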

for (keyword in search_items) {

  df_tmp <- get_everything_all(keyword, api_key = api_key, language = 'nl', from = '2020-07-23')
  tmp_result_df <- df_tmp$results_df
  tmp_metadata_df <- df_tmp$metadata
  tmp_result_df$searched_keyword <- keyword
  tmp_metadata_df$searched_keyword <- keyword
  news_item_results_df <- bind_rows(news_item_results_df, tmp_result_df)
  news_item_metadata <- bind_rows(news_item_metadata, tmp_metadata_df)

  # pause between queries to respect the NewsAPI rate limit
  Sys.sleep(10)
}
date <- Sys.Date()

news_item_results_df %>%
  write_csv(glue('../vaccinationinNL/tmp/news_item_results_df{date}.csv'))

news_item_metadata %>%
  write_csv(glue('../vaccinationinNL/tmp/news_item_metadata{date}.csv'))



#  Preprocessing
###---------------------------------------------------

news_files <- list.files('../vaccinationinNL/tmp',
                         pattern = 'news_item_results_df',
                         full.names = TRUE)
news_dfs <- purrr::map(news_files, read_csv)
all_news <- bind_rows(news_dfs)



# drop the keyword column so the same article found via multiple keywords
# collapses to a single row in the distinct() call below
all_news$searched_keyword <- NULL

all_news <- all_news %>%
  mutate(published_at = lubridate::ymd_hms(published_at),
         by_day = lubridate::floor_date(published_at, "day")) %>%
  distinct()
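
# Optional sanity check (sketch): inspect article volume per day after
# deduplication before running the sentiment analysis below.
# all_news %>% count(by_day, sort = TRUE)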



#Sentiment Analysis
#------------------------------------------------------
all_news_sentiment_description <- all_news %>%
  filter(!is.na(description)) %>%
  bind_cols(get_nrc_sentiment(.$description, language = 'dutch')) %>%
  filter(str_detect(str_to_lower(title), 'vaccin') |
           str_detect(str_to_lower(description), 'vaccin') |
           str_detect(str_to_lower(content), 'vaccin'))

all_news_sentiment_content <- all_news %>%
  filter(!is.na(content)) %>%
  # strip cookie banners, NewsAPI's '[+n chars]' truncation marker, and numbers
  mutate(content = str_remove_all(content, 'Deze website maakt gebruik van cookies'),
         content = str_remove_all(content, 'chars'),
         content = str_remove_all(content, 'cookies'),
         content = qdapRegex::rm_number(content)) %>%
  # score the cleaned content, not the raw column
  bind_cols(get_nrc_sentiment(.$content, language = 'dutch')) %>%
  filter(str_detect(str_to_lower(title), 'vaccin') |
           str_detect(str_to_lower(description), 'vaccin') |
           str_detect(str_to_lower(content), 'vaccin'))

sentiment_Words_content <- all_news  %>% 
  unnest_tokens(input =  content,output = word,drop  = FALSE) %>% 
  inner_join(get_sentiment_dictionary('nrc',language ='dutch'))


sentiment_Words_description_stop_words  <- all_news_sentiment_description  %>% 
  unnest_tokens(input =  description ,output = word,drop  = FALSE) %>% 
  rowid_to_column() %>% 
  filter(word != 'chars') %>% 
  anti_join(dutch_stop_words) 

count_sentiment_Words_description_stop_words <- sentiment_Words_description_stop_words%>% 
  inner_join(get_sentiment_dictionary('nrc',language ='dutch')) %>% 
  count(sentiment)
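
# Optional sketch: visualise the aggregated NRC sentiment counts. The
# `sentiment` and `n` columns come from the count() call above.
# count_sentiment_Words_description_stop_words %>%
#   ggplot(aes(reorder(sentiment, n), n)) +
#   geom_col(show.legend = FALSE) +
#   coord_flip() +
#   labs(x = 'NRC sentiment category', y = 'word count')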



all_news_sentiment_content_tidy_stop_word <- all_news_sentiment_content %>% 
  rowid_to_column() %>% 
  unnest_tokens(word,content) %>% 
  anti_join(dutch_stop_words) 


all_news_sentiment_description_tidy_stop_word <-  all_news_sentiment_description %>% 
  rowid_to_column() %>% 
  unnest_tokens(word,description) %>% 
  anti_join(dutch_stop_words) 
# STM (structural topic model)
#---------------------------------------------------


# one row per (article, token) after stop-word removal
d_tidy <- sentiment_Words_description_stop_words

# document-term matrix for stm: one row per article, one column per word
d_sparse <- all_news_sentiment_description_tidy_stop_word %>%
  count(rowid, word, sort = TRUE) %>%
  cast_sparse(rowid, word, n)

n_outlets <- n_distinct(d_tidy$name)
n_docs <- n_distinct(d_tidy$rowid)
n_words <- nrow(d_tidy)
n_authors <- n_distinct(d_tidy$author)

m_topic <- stm(d_sparse, K = 8, verbose = TRUE, max.em.its = 75)
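
# Optional: inspect the fitted topics with stm's built-in labeller before
# tidying; FREX/Lift/Score labels complement the raw betas extracted below.
# stm::labelTopics(m_topic, n = 10)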

d_beta <- tidy(m_topic)

d_top_terms <- d_beta %>%
  # keep only one of the terms that map to the same lemmatised term
  group_by(topic, term) %>%
  summarise(beta = max(beta)) %>%
  # select the top words by topic
  group_by(topic) %>%
  top_n(100, beta) %>%
  mutate(top_word = beta == max(beta)) %>%
  arrange(topic) %>%
  ungroup()

d_gamma <- tidy(m_topic, matrix = "gamma", document_names = rownames(d_sparse))

# compute topic prevalence
d_gamma_terms <- 
  d_gamma %>%
  group_by(topic) %>%
  summarise(gamma = mean(gamma)) %>%
  arrange(desc(gamma)) %>%
  left_join(d_top_terms, by = "topic")
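
# Optional sketch: plot expected topic prevalence (mean gamma per topic);
# distinct() is needed because d_gamma_terms carries one row per top term.
# d_gamma_terms %>%
#   distinct(topic, gamma) %>%
#   ggplot(aes(reorder(factor(topic), gamma), gamma)) +
#   geom_col() +
#   coord_flip() +
#   labs(x = 'topic', y = 'expected topic prevalence (mean gamma)')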

m_corr <- stm::topicCorr(m_topic, cutoff = 0.1)  # alternative cutoff: 0.075

# extract adjacency matrix from topic correlations and build a network via igraph
g_vis <-
  igraph::simplify(igraph::graph.adjacency(m_corr$poscor, mode = "undirected", weighted = TRUE)) %>%
  visNetwork::toVisNetworkData()



d_nodes <- 
  g_vis$nodes %>% 
  left_join(d_top_terms %>% filter(top_word) %>% select(-top_word), by = c(id = "topic")) %>% 
  left_join(
    d_gamma_terms %>% filter(top_word) %>% select(topic, gamma),
    by = c(id = "topic")
  ) %>%
  mutate(size = 2000 * gamma) %>%
  select(id, label = term, size) %>% 
  mutate(topic = id) %>% 
  left_join(
    d_top_terms %>% 
      group_by(topic) %>% 
      top_n(10, beta) %>% 
      ungroup() %>% 
      mutate(node_id = 1000 * (1:n())) %>% 
      select(topic, term, beta, node_id),
    by = "topic"
  ) %>% 
  mutate(
    id = ifelse(label == term, id, node_id),
    shape = ifelse(label == term, "dot", "square"),
    color = ifelse(label == term, "#3fb1e3", "#888888"),
    # font.color = ifelse(label == term, "#3fb1e3", "#888888"),
    size = 12,
    font.size =12,
    label = term
  )


# add additional properties to the graph edges
d_edges <- 
  g_vis$edges %>% 
  mutate(width = weight * 20) %>% 
  bind_rows(
    d_nodes %>% 
     # filter(id > 100) %>% 
      mutate(weight = 1, width = 1) %>% 
      select(from = topic, to = id, weight, width)
  )
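
# Optional sketch: render the topic network interactively (assumes the
# visNetwork package is installed).
# visNetwork::visNetwork(d_nodes, d_edges) %>%
#   visNetwork::visIgraphLayout()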




#tf-idf
#---------------------------------------------------
# all_news_sentiment_content_tidy_stop_word %>% 
#   count(rowid,word,sort = TRUE) %>% 
#   bind_tf_idf(word,rowid,n) %>% 
#   arrange(-tf_idf) %>% 
#   group_by(rowid) %>% 
#   top_n(10) %>% 
#   ungroup() %>%
#   filter(rowid <10) %>% 
#   mutate(word = reorder_within(word, tf_idf, rowid)) %>%
#   ggplot(aes(word, tf_idf, fill = rowid)) +
#   geom_col(alpha = 0.8, show.legend = FALSE) +
#   facet_wrap(~ rowid, scales = "free", ncol = 3) +
#   scale_x_reordered() +
#   coord_flip() +
#   theme(strip.text=element_text(size=11)) 
# 
# 
# news_documents <- textProcessor(all_news_sentiment_content$content, metadata = all_news_sentiment_content, lowercase = TRUE,
#               removestopwords = TRUE, removenumbers = TRUE,
#               removepunctuation = TRUE, ucp = FALSE, stem = FALSE,
#               wordLengths = c(3, Inf), sparselevel = 1, language = "dutch",
#               verbose = TRUE, onlycharacter = FALSE, striphtml = FALSE,
#               customstopwords = dutch_stop_words$word, custompunctuation = NULL, v1 = FALSE)
# 
# 
# out <- prepDocuments(news_documents$documents, news_documents$vocab, news_documents$meta,lower.thresh = 0)
# 
# poliblogPrevFit <- stm(out$documents, out$vocab, K=8, prevalence=~positive+negative, 
#                        max.em.its=100, data=out$meta, init.type="Spectral", 
#                        seed=8458159)
# plot(poliblogPrevFit, type="summary", xlim=c(0,10))
# plot(poliblogPrevFit, type="labels")
# 
# plot(poliblogPrevFit, type="perspectives", topics=c(1,5))
d_sparse_news <- d_sparse

# Save datasets for use in the package
#---------------------------------------------------------------------------------------------------
usethis::use_data(all_news, overwrite = TRUE)
usethis::use_data(all_news_sentiment_description, overwrite = TRUE)
usethis::use_data(all_news_sentiment_content, overwrite = TRUE)
usethis::use_data(sentiment_Words_content, overwrite = TRUE)
usethis::use_data(all_news_sentiment_content_tidy_stop_word, overwrite = TRUE)
usethis::use_data(sentiment_Words_description_stop_words, overwrite = TRUE)
usethis::use_data(count_sentiment_Words_description_stop_words, overwrite = TRUE)
usethis::use_data(d_nodes, overwrite = TRUE)
usethis::use_data(d_edges, overwrite = TRUE)
usethis::use_data(d_sparse_news, overwrite = TRUE)
usethis::use_data(d_top_terms, overwrite = TRUE)