For copyright reasons, we cannot bundle the full text of the New York Times, Süddeutsche Zeitung, and Le Figaro news articles in this package.
The first few rows of the data look like this:
```r
require(rectr)
require(tibble)
require(dplyr)
require(quanteda)
require(tidyverse)
readRDS("final_data_endefr.RDS")
```
```
## # A tibble: 3,391 x 10
##    path  id    pubdate headline lede  body  lang  content    nt tokenized_conte…
##    <chr> <chr> <chr>   <chr>    <chr> <chr> <chr> <chr>   <int> <list>
##  1 ./pa… arti… 2 Nove… "Maladi… "Ave… "Dan… FR    "Avec …  1026 <chr [1,138]>
##  2 ./pa… arti… 2 Nove… "« Cela… "LE … "Pat… FR    "LE FI…   734 <chr [820]>
##  3 ./pa… arti… 2 Nove… "L'Iran… "L'a… "de … FR    "L'anc…   984 <chr [1,081]>
##  4 ./pa… arti… 2 Nove… "Matthi… "Le … "Mat… FR    "Le pr…  1077 <chr [1,213]>
##  5 ./pa… arti… 2 Nove… "Les 31… "Lan… "EUR… FR    "Lancé…  1012 <chr [1,115]>
##  6 ./pa… arti… 2 Nove… "Genera… "Apr… "À l… FR    "Après…   800 <chr [874]>
##  7 ./pa… arti… 2 Nove… "La dis… "Seu… "Les… FR    "Seul …  1171 <chr [1,318]>
##  8 ./pa… arti… 2 Nove… "Les ca… "Ils… "Ce … FR    "Ils d…   552 <chr [612]>
##  9 ./pa… arti… 2 Nove… "Le pré… "FRA… "À l… FR    "FRANÇ…   506 <chr [589]>
## 10 ./pa… arti… 2 Nove… "La nui… " Ç… "Au … FR    " ÇA …    516 <chr [552]>
## # … with 3,381 more rows
```
The `content` column holds the full text of the news articles. For the rectr method, processed versions of the corpus and the DFM are available in this package. They were generated using the following code.
```r
require(rectr)
require(tidyverse)
require(quanteda)
readRDS("final_data_endefr.RDS") %>%
    mutate(content = paste(lede, body),
           lang = tolower(lang),
           id = row_number(),
           outlet = recode(lang, "en" = "NYT", "de" = "SZ", "fr" = "LF")) %>%
    select(content, lang, pubdate, headline, id, outlet) -> paris
```
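A quick look at the corpus composition (a sketch, not part of the original pipeline; `outlet` is a pure recoding of `lang`, so each outlet maps to exactly one language tag):

```r
## Sketch: number of articles per outlet/language tag.
paris %>% count(outlet, lang)
```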
There are three French-language articles published in the NYT. To prevent them from interfering with the analysis, these three articles were excluded.
```r
excluded_id <- c(1729, 1815, 1843)
paris$outlet[excluded_id]
paris$headline[excluded_id]
```
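The ids above were determined beforehand. As a rough sketch of how such articles could be flagged automatically (not part of the original pipeline; assumes the cld2 package for language detection):

```r
## Sketch: flag articles whose detected text language disagrees with the
## outlet's language tag (NYT articles are tagged "en").
require(cld2)
detected <- cld2::detect_language(paris$content)
which(paris$outlet == "NYT" & detected == "fr")
```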
get_ft("fr") get_ft("de") get_ft("en")
```r
emb <- read_ft(c("fr", "de", "en"))
## Build the multilingual corpus and attach the metadata as docvars.
paris_corpus <- create_corpus(paris$content, paris$lang)
docvars(paris_corpus, "headline") <- paris$headline
docvars(paris_corpus, "pubdate") <- paris$pubdate
docvars(paris_corpus, "id") <- paris$id
docvars(paris_corpus, "outlet") <- paris$outlet
paris_corpus <- paris_corpus[-excluded_id]
## Transform into embedding-based document representations.
paris_dfm <- transform_dfm_boe(paris_corpus, emb)
saveRDS(paris_dfm, "paris_dfm.RDS")
paris_dfm_bert <- transform_dfm_boe(paris_corpus, mode = "bert", noise = TRUE, path = "./")
saveRDS(paris_dfm_bert, "paris_dfm_bert.RDS")
```
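The transformed objects are what the later steps consume. A quick structural check (a sketch, using only fields referenced later in this document):

```r
## Sketch: both transformed objects carry a document matrix in $dfm,
## with one row per remaining article (3,391 - 3 = 3,388).
nrow(paris_dfm$dfm)
nrow(paris_dfm_bert$dfm)
```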
Again, for copyright reasons, we cannot provide the full text or the version translated by Google (using the DTM translation method of Reber 2019). The DTM (in STM format) is available in this package. It was created using the following code.
```r
require(tidyverse)
## require(googleLanguageR)
require(stm)
require(quanteda)
textdata <- readRDS("final_data_endefr.RDS") %>%
    mutate(content = paste(lede, body),
           lang = tolower(lang),
           id = row_number()) %>%
    select(content, lang, pubdate, headline, id)
textdata %>% filter(lang != "en") -> FR_DE_content
gl_auth("")  ## path to your Google Cloud service account JSON
glx <- function(content, source) {
    print(substr(content, 1, 30))
    Sys.sleep(sample(1:5, 1))
    res <- gl_translate(content, source, target = "en", format = "text")
    print(substr(res$translatedText, 1, 30))
    return(res)
}
### You need to pay Google over USD 200 to run this line. Also 5 hrs of your time.
FR_DE_trans <- map2(FR_DE_content$content, FR_DE_content$lang, safely(glx))
## Retry the failed documents, truncated to 20,000 characters.
dead <- map_lgl(map(FR_DE_trans, "error"), Negate(is.null))
FR_DE_trans2 <- map2(substr(FR_DE_content$content[dead], 1, 20000),
                     FR_DE_content$lang[dead], safely(glx))
FR_DE_trans[dead] <- FR_DE_trans2
## saveRDS(FR_DE_trans, "FR_DE_trans.RDS")
## Reassemble
FR_DE_trans %>% map_chr(~.$result$translatedText) -> translated_content
textdata$google_translate <- textdata$content
textdata$google_translate[textdata$lang == "fr"] <- translated_content[FR_DE_content$lang == "fr"]
textdata$google_translate[textdata$lang == "de"] <- translated_content[FR_DE_content$lang == "de"]
textdata %>% sample_n(20) %>% select(content, google_translate, lang)
saveRDS(textdata, "paris_ft_translate.RDS")
```
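For readers unfamiliar with `purrr::safely()`: it wraps a function so that errors are captured in an `$error` slot instead of aborting the whole `map2()` run, which is what the `dead`/retry logic above relies on. A minimal illustration:

```r
## Minimal illustration of purrr::safely(): failures are captured, not thrown.
safe_log <- purrr::safely(log)
safe_log(10)$result   # 2.302585
safe_log("a")$error   # a captured error object; nothing is thrown
```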
```r
## Exclusion of the three French NYT articles.
textdata <- readRDS("paris_ft_translate.RDS")[-excluded_id, ]
min_docfreq <- (nrow(textdata) * 0.005) %>% ceiling
max_docfreq <- (nrow(textdata) * 0.99) %>% ceiling
dfm(textdata$google_translate, tolower = TRUE, stem = TRUE,
    remove = stopwords("en"), remove_numbers = TRUE, remove_punct = TRUE) %>%
    dfm_trim(min_docfreq = min_docfreq, max_docfreq = max_docfreq) %>%
    convert(to = "stm") -> paris_ft_dfm

####### tdtm
textdata <- readRDS("paris_ft_translate.RDS")
textdata %>% filter(lang == "fr") %>% pull(content) -> FR_CONTENT
textdata %>% filter(lang == "de") %>% pull(content) -> DE_CONTENT
textdata %>% filter(lang == "en") %>% pull(content) -> EN_CONTENT
FR_DFM <- dfm(FR_CONTENT, remove = stopwords("fr"),
              remove_numbers = TRUE, remove_punct = TRUE) %>%
    dfm_trim(min_docfreq = 2)
FR_terms <- colnames(FR_DFM)
## Uncomment the following 3 lines if you want to do the Google translation.
## FR_trans_terms <- gl_translate(FR_terms, source = "fr")
## saveRDS(FR_trans_terms, "FR_trans_terms.RDS")
## saveRDS(FR_terms, "FR_terms.RDS")
DE_DFM <- dfm(DE_CONTENT, remove = stopwords("de"),
              remove_numbers = TRUE, remove_punct = TRUE) %>%
    dfm_trim(min_docfreq = 2)
DE_terms <- colnames(DE_DFM)
## Uncomment the following 3 lines if you want to do the Google translation.
## DE_trans_terms <- gl_translate(DE_terms, source = "de")
## saveRDS(DE_trans_terms, "DE_trans_terms.RDS")
## saveRDS(DE_terms, "DE_terms.RDS")
DE_trans_terms <- readRDS("DE_trans_terms.RDS")
FR_trans_terms <- readRDS("FR_trans_terms.RDS")
FR_tokens <- tokens(FR_CONTENT, remove_numbers = TRUE, remove_punct = TRUE)
DE_tokens <- tokens(DE_CONTENT, remove_numbers = TRUE, remove_punct = TRUE)
## Rebuild an English pseudo-document by translating token by token.
recon <- function(token_obj, trans_terms) {
    tibble(text = tolower(token_obj)) %>%
        left_join(trans_terms, by = "text") %>%
        filter(!is.na(translatedText)) %>%
        pull(translatedText) %>%
        paste(collapse = " ")
}
FR_recon_en_text <- map_chr(FR_tokens, recon, trans_terms = FR_trans_terms)
DE_recon_en_text <- map_chr(DE_tokens, recon, trans_terms = DE_trans_terms)
recon_complete_text <- c(FR_recon_en_text, EN_CONTENT, DE_recon_en_text)
textdata$translated_text <- recon_complete_text
### Exclusion of the three French NYT articles.
textdata <- textdata[-excluded_id, ]
min_docfreq <- (nrow(textdata) * 0.005) %>% ceiling
max_docfreq <- (nrow(textdata) * 0.99) %>% ceiling
names(textdata$translated_text) <- NULL
dfm(textdata$translated_text, tolower = TRUE, stem = TRUE,
    remove = stopwords("en"), remove_numbers = TRUE, remove_punct = TRUE) %>%
    dfm_trim(min_docfreq = min_docfreq, max_docfreq = max_docfreq) %>%
    convert(to = "stm") -> paris_tdtm_dfm
```
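To see what `recon()` does, here is a toy run with a made-up two-entry translation table (hypothetical data; in the real pipeline the table comes from `gl_translate()`):

```r
## Toy run of recon(): tokens without a dictionary entry ("à") are dropped.
toy_terms <- tibble(text = c("attentat", "paris"),
                    translatedText = c("attack", "paris"))
recon(c("Attentat", "à", "Paris"), toy_terms)
## [1] "attack paris"
```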
Finally, save everything into the package's data directory, with the full text purged so that it doesn't get us into legal trouble.
```r
paris_dfm <- readRDS("paris_dfm.RDS")
paris_dfm_bert <- readRDS("paris_dfm_bert.RDS")
## Delete all text content, sorry, researchers!
paris_corpus[1:length(paris_corpus)] <- ""
paris_dfm$corpus <- paris_corpus
paris_dfm_bert$corpus <- paris_corpus
testthat::expect_true(all(nrow(paris_dfm$dfm) == 3388,
                          nrow(paris_dfm_bert$dfm) == 3388,
                          nrow(paris_tdtm_dfm$meta) == 3388,
                          nrow(paris_ft_dfm$meta) == 3388,
                          length(paris_corpus) == 3388))
testthat::expect_true(all(paris_dfm$corpus == ""))
testthat::expect_true(all(paris_dfm_bert$corpus == ""))
usethis::use_data(paris_corpus, overwrite = TRUE)
usethis::use_data(paris_dfm, overwrite = TRUE)
usethis::use_data(paris_dfm_bert, overwrite = TRUE)
usethis::use_data(paris_ft_dfm, overwrite = TRUE)
usethis::use_data(paris_tdtm_dfm, overwrite = TRUE)
```
Reproduce the analyses in the paper.
Aligned fastText
```r
require(rectr)
require(tidyverse)
require(quanteda)
paris_corpus
```
```r
paris_dfm
```
```r
paris_dfm_filtered <- filter_dfm(paris_dfm, k = 5, noise = TRUE)
paris_dfm_filtered
```
```r
paris_gmm <- calculate_gmm(paris_dfm_filtered, seed = 42)
paris_gmm
```
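The fitted object carries the topic-proportion matrix `theta` used throughout the rest of this document. A quick peek (a sketch):

```r
## Sketch: theta has one row per article and one column per topic;
## each row is a probability distribution over the k = 5 topics.
round(head(paris_gmm$theta), 3)
rowSums(head(paris_gmm$theta))  # numerically 1 for every article
```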
M-BERT
```r
paris_dfm_bert
```
```r
paris_dfm_bert_filtered <- filter_dfm(paris_dfm_bert, k = 5, noise = TRUE)
paris_dfm_bert_filtered
```
```r
paris_gmm_bert <- calculate_gmm(paris_dfm_bert_filtered, seed = 43)
paris_gmm_bert
```
```r
## Sample up to 5 high-probability (> threshold) headlines per language for topic i.
get_sample <- function(i, paris_corpus, theta, threshold = 0.8, replace = FALSE) {
    tibble(hl = docvars(paris_corpus, "headline"),
           lang = docvars(paris_corpus, "lang"),
           prob = theta[, i]) %>%
        group_by(lang) %>%
        filter(prob > threshold) %>%
        sample_n(size = 5, weight = prob, replace = replace) %>%
        select(hl, lang, prob) %>%
        ungroup %>%
        arrange(lang, prob) %>%
        mutate(topic = i)
}
get_headline <- function(paris_gmm) {
    map_dfr(seq_len(ncol(paris_gmm$theta)), get_sample,
            paris_gmm$dfm$corpus, theta = paris_gmm$theta) %>% unique
}
set.seed(42)
hl <- get_headline(paris_gmm)
rio::export(select(hl, topic, lang, hl), "af_rectr_hl.csv")
print(hl, n = 100)
```
```r
set.seed(42)
hl <- get_headline(paris_gmm_bert)
rio::export(select(hl, topic, lang, hl), "mb_rectr_hl.csv")
print(hl, n = 100)
```
```r
require(stm)
set.seed(42)
translated_stm <- stm(paris_tdtm_dfm$documents, paris_tdtm_dfm$vocab, K = 5)
set.seed(42)
map_dfr(1:5, get_sample, paris_corpus, theta = translated_stm$theta, replace = TRUE) %>%
    unique -> hl
rio::export(select(hl, topic, lang, hl), "tdtm_stm_hl.csv")
print(hl, n = 100)
```
```r
require(stm)
require(tidyverse)
## require(googleLanguageR)
require(quanteda)
set.seed(42)
ft_stm <- stm(paris_ft_dfm$documents, paris_ft_dfm$vocab, K = 5)
set.seed(42)
map_dfr(1:5, get_sample, paris_corpus, theta = ft_stm$theta, replace = TRUE) %>%
    unique -> hl
rio::export(select(hl, topic, lang, hl), "ft_stm_hl.csv")
print(hl, n = 100)
```
The data coded by the two human coders is available in this package.
```r
paris_human_coding
```
Correlation between two coders
```r
cor.test(paris_human_coding$coder1, paris_human_coding$coder2)
```
Interrater reliability
```r
require(irr)
## kripp.alpha expects a coder-by-unit matrix (here: 2 rows, one per coder).
irr::kripp.alpha(matrix(c(paris_human_coding$coder1, paris_human_coding$coder2),
                        nrow = 2, byrow = TRUE), method = "ordinal")
```
Unadjusted Kappa (for comparison with Hatzivassiloglou et al. 1999)
```r
irr::kappa2(t(matrix(c(paris_human_coding$coder1, paris_human_coding$coder2),
                     nrow = 2, byrow = TRUE)), weight = "squared")
```
Topical similarity
```r
require(lsa)
## Map an article id to its row in the corpus (ids are not row numbers
## after the exclusion of the three French NYT articles).
id_row <- function(id, corpus) {
    which(id == docvars(corpus, "id"))
}
## Cosine similarity between the theta rows of two articles.
cal_sim <- function(id1, id2, thetax, corpus) {
    cosine(thetax[id_row(id1, corpus), ], thetax[id_row(id2, corpus), ])[1, 1]
}
paris_human_coding %>%
    mutate(rectr_topicsim = map2_dbl(a1id, a2id, cal_sim,
                                     thetax = paris_gmm$theta, corpus = paris_corpus),
           bert_topicsim = map2_dbl(a1id, a2id, cal_sim,
                                    thetax = paris_gmm_bert$theta, corpus = paris_corpus),
           tstm_topicsim = map2_dbl(a1id, a2id, cal_sim,
                                    thetax = translated_stm$theta, corpus = paris_corpus),
           ft_topicsim = map2_dbl(a1id, a2id, cal_sim,
                                  thetax = ft_stm$theta, corpus = paris_corpus),
           human = (coder1 + coder2) / 2) -> paris_topic_sim
```
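For intuition, a toy check of the cosine measure on two hypothetical theta rows that share the same dominant topic:

```r
## Toy check (hypothetical theta rows): two articles dominated by the same
## topic get a similarity close to 1.
lsa::cosine(c(0.80, 0.05, 0.05, 0.05, 0.05),
            c(0.70, 0.10, 0.10, 0.05, 0.05))[1, 1]
```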
```r
cor.test(paris_topic_sim$human, paris_topic_sim$rectr_topicsim)
cor.test(paris_topic_sim$human, paris_topic_sim$bert_topicsim)
cor.test(paris_topic_sim$human, paris_topic_sim$tstm_topicsim)
cor.test(paris_topic_sim$human, paris_topic_sim$ft_topicsim)
```
```r
paris_topic_sim %>%
    select(rectr_topicsim, tstm_topicsim, ft_topicsim, bert_topicsim, human) %>%
    gather(key = "model", value = "similarity", -human) %>%
    mutate(model = recode(model,
                          rectr_topicsim = "af-rectr",
                          bert_topicsim = "mb-rectr",
                          tstm_topicsim = "tdtm-stm",
                          ft_topicsim = "ft-stm")) %>%
    ggplot(aes(x = human, y = similarity)) +
    geom_point(alpha = 0.5) +
    facet_grid(fct_relevel(model, "af-rectr", "mb-rectr") ~ .) +
    geom_smooth(method = "lm") +
    ylab("Topical Similarity") + xlab("Human evaluation") -> fig
height <- 6
ggsave("rectr_cmm_r1_fig7.pdf", fig, height = height, width = height * 1.77)
fig
```
```r
require(ggridges)
## Stack the 5 theta columns into long format for ridgeline plotting.
theta_to_tibble <- function(theta, method = "af-rectr", paris_corpus) {
    tibble(theta = c(theta[, 1], theta[, 2], theta[, 3], theta[, 4], theta[, 5]),
           lang = rep(docvars(paris_corpus, "lang"), 5),
           pubdate = rep(docvars(paris_corpus, "pubdate"), 5),
           topic = c(sort(rep(1:5, length(paris_corpus)))),
           method = method)
}
map2_dfr(list(paris_gmm$theta, paris_gmm_bert$theta, translated_stm$theta, ft_stm$theta),
         c("af-rectr", "mb-rectr", "tdtm-stm", "ft-stm"),
         theta_to_tibble, paris_corpus = paris_corpus) %>%
    ggplot(aes(x = theta, y = topic, group = topic)) +
    geom_density_ridges(aes(fill = lang), alpha = .5, color = "white", from = 0, to = 1) +
    scale_fill_brewer(palette = "Dark2") +
    facet_grid(lang ~ fct_relevel(method, "af-rectr", "mb-rectr")) +
    scale_y_continuous(breaks = seq_len(5), labels = c("t1", "t2", "t3", "t4", "t5")) +
    xlab(expression(theta[t])) -> fig
ggsave("rectr_cmm_r1_fig5.pdf", fig, height = height, width = height * 1.77)
fig
```
```r
## Concentration of each topic across languages (Herfindahl-Hirschman Index),
## computed over the articles whose best-fitting topic is best_fit.
cal_hhi <- function(model, method = "af-rectr") {
    tibble(best_fit = apply(model$theta, 1, which.max),
           lang = docvars(paris_corpus, "lang")) %>%
        count(best_fit, lang) %>%
        group_by(best_fit) %>%
        mutate(tn = sum(n), prob = (n / tn)^2) %>%
        summarise(hhi = sum(prob)) %>%
        mutate(method = method)
}
map2_dfr(list(paris_gmm, paris_gmm_bert, translated_stm, ft_stm),
         c("af-rectr", "mb-rectr", "tdtm-stm", "ft-stm"), cal_hhi) %>%
    ggplot(aes(x = best_fit, y = hhi)) +
    geom_bar(stat = "identity") +
    facet_grid(. ~ fct_relevel(method, "af-rectr", "mb-rectr")) +
    xlab("topic") + ylab("Hirschman-Herfindahl Index") -> fig
ggsave("rectr_cmm_r1_fig6.pdf", fig, height = height, width = height * 1.77)
fig
```
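For intuition: the HHI of a topic is the sum of squared language shares among its best-fitting articles, so a topic covered evenly in three languages scores near 1/3 and a monolingual topic scores 1. A toy calculation with made-up counts:

```r
## Toy HHI on hypothetical counts of best-fitting articles per language.
hhi <- function(n) sum((n / sum(n))^2)
hhi(c(10, 10, 10))  # 1/3: topic covered evenly in three languages
hhi(c(28, 1, 1))    # ~0.87: topic dominated by one language
```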
```r
require(lubridate)
tibble(theta = c(paris_gmm$theta[, 1], paris_gmm$theta[, 2], paris_gmm$theta[, 3],
                 paris_gmm$theta[, 4], paris_gmm$theta[, 5],
                 translated_stm$theta[, 1], translated_stm$theta[, 2], translated_stm$theta[, 3],
                 translated_stm$theta[, 4], translated_stm$theta[, 5]),
       lang = rep(docvars(paris_corpus, "lang"), 10),
       pubdate = rep(docvars(paris_corpus, "pubdate"), 10),
       topic = c(sort(rep(1:5, length(paris_corpus))), sort(rep(1:5, length(paris_corpus)))),
       method = c(rep("af-rectr", 3388 * 5), rep("tdtm-stm", 3388 * 5))) %>%
    mutate(pubdate = lubridate::dmy(pubdate)) %>%
    group_by(topic, lang, pubdate, method) %>%
    summarise(mean_theta = mean(theta)) %>%
    ggplot(aes(x = pubdate, y = mean_theta, color = lang)) +
    geom_line() +
    facet_grid(topic ~ method) +
    ylab(expression(theta[t])) + xlab("Date") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
    scale_color_brewer(palette = "Dark2") -> fig
ggsave("rectr_cmm_r1_fig8.pdf", fig, height = height, width = height * 1.77)
fig
```
Appendix VII
```r
tibble(theta = c(paris_gmm_bert$theta[, 1], paris_gmm_bert$theta[, 2], paris_gmm_bert$theta[, 3],
                 paris_gmm_bert$theta[, 4], paris_gmm_bert$theta[, 5],
                 ft_stm$theta[, 1], ft_stm$theta[, 2], ft_stm$theta[, 3],
                 ft_stm$theta[, 4], ft_stm$theta[, 5]),
       lang = rep(docvars(paris_corpus, "lang"), 10),
       pubdate = rep(docvars(paris_corpus, "pubdate"), 10),
       topic = c(sort(rep(1:5, length(paris_corpus))), sort(rep(1:5, length(paris_corpus)))),
       method = c(rep("mb-rectr", 3388 * 5), rep("ft-stm", 3388 * 5))) %>%
    mutate(pubdate = lubridate::dmy(pubdate)) %>%
    group_by(topic, lang, pubdate, method) %>%
    summarise(mean_theta = mean(theta)) %>%
    ggplot(aes(x = pubdate, y = mean_theta, color = lang)) +
    geom_line() +
    facet_grid(topic ~ method) +
    ylab(expression(theta[t])) + xlab("Date") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
    scale_color_brewer(palette = "Dark2")
```
```r
sessionInfo()
```