Due to copyright reasons, we cannot bundle the full text of New York Times, Süddeutsche Zeitung and Le Fegaro news articles in this package.
The first few rows of the data look like so.
require(rectr) require(tibble) require(dplyr) require(quanteda) readRDS("~/dev/infocrap/final_data_endefr.RDS")
## # A tibble: 3,391 x 10 ## path id pubdate headline lede body lang content nt tokenized_conte… ## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <int> <list> ## 1 ./pa… arti… 2 Nove… "Maladi… "Ave… "Dan… FR "Avec … 1026 <chr [1,138]> ## 2 ./pa… arti… 2 Nove… "« Cela… "LE … "Pat… FR "LE FI… 734 <chr [820]> ## 3 ./pa… arti… 2 Nove… "L'Iran… "L'a… "de … FR "L'anc… 984 <chr [1,081]> ## 4 ./pa… arti… 2 Nove… "Matthi… "Le … "Mat… FR "Le pr… 1077 <chr [1,213]> ## 5 ./pa… arti… 2 Nove… "Les 31… "Lan… "EUR… FR "Lancé… 1012 <chr [1,115]> ## 6 ./pa… arti… 2 Nove… "Genera… "Apr… "À l… FR "Après… 800 <chr [874]> ## 7 ./pa… arti… 2 Nove… "La dis… "Seu… "Les… FR "Seul … 1171 <chr [1,318]> ## 8 ./pa… arti… 2 Nove… "Les ca… "Ils… "Ce … FR "Ils d… 552 <chr [612]> ## 9 ./pa… arti… 2 Nove… "Le pré… "FRA… "À l… FR "FRANÇ… 506 <chr [589]> ## 10 ./pa… arti… 2 Nove… "La nui… " Ç… "Au … FR " ÇA … 516 <chr [552]> ## # … with 3,381 more rows
Unfortunately, one must have the original data to reproduce this part of the analysis. Also, the code is not optimized. Thus, don't run this on your machine.
Reproduce the analyses in the paper.
require(rectr) require(tidyverse) require(quanteda) paris_corpus
paris_dfm
emb <- read_ft(c("fr", "de", "en")) paris_dfm_filtered <- filter_dfm(paris_dfm, paris_corpus, k = 5) paris_dfm_filtered
paris_gmm <- calculate_gmm(paris_dfm_filtered, seed = 42) paris_gmm
readRDS("final_data_endefr.RDS") %>% mutate(content = paste(lede, body), lang = tolower(lang), id = row_number(), outlet = recode(lang, 'en' = 'NYT', 'de' = 'SZ', 'fr' = 'LF')) %>% select(content, lang, pubdate, headline, id, outlet) -> textdata excluded_id <- c(1729, 1815, 1843) textdata <- textdata[-excluded_id,] max_docfreq <- (nrow(textdata) * 0.99) %>% ceiling dfm(textdata$content, tolower = TRUE, stem = TRUE, remove = stopwords("en"), remove_number = TRUE, remove_punct = TRUE) %>% dfm_trim(min_docfreq = 3, max_docfreq = max_docfreq) %>% dfm_tfidf -> dfm_tfidf eng_dfm <- dfm_tfidf[textdata$lang == 'en',] keywords <- function(topic = 1, thetax, eng_dfm, textdata, lang = "en") { print(topic) dfm_col <- ncol(eng_dfm) cor_t1 <- map_dbl(1:dfm_col, ~ cor(as.vector(eng_dfm[,.]), thetax[textdata$lang == lang,topic])) tibble(token = colnames(eng_dfm), cor = cor_t1, topic = topic) } keywords_matrix <- map_dfr(1:5, keywords, thetax = paris_gmm$theta, eng_dfm = eng_dfm, textdata = textdata) keywords_matrix %>% group_by(topic) %>% top_n(n = 10, wt = cor) %>% summarise(desc = paste(token, collapse = ", ")) -> rectr_keywords
rectr_keywords %>% knitr::kable()
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.