# data-raw/ECB_speeches_and_conferences.R

# Speeches ----------------------------------------------------------------

# library(quanteda)
# 
# ECB_speeches <- get_ECB_speeches(
#   filter_english = TRUE,
#   clean_footnotes = TRUE,
#   compute_sentiment = TRUE,
#   tokenize_w_POS = TRUE
# )
# 
# ## Remove set of stopwords
# ECB_speeches <- tokens_remove(ECB_speeches, c(stopwords("en"), "speech", "see", "ladies", "gentlemen", "year", "years", "also", "can", "may", "new", "one", "time", "important", "level", "however", "need", "first", "even", "let", "like", "two", "three", "four", "five", "framework", "second", "third", "now", "well", "since", "recent", "process", "current", "large", "many", "future", "low", "still", "way", "high", "across", "key", "today", "last", "main", "different", "much", "take", "make", "higher", "lower", "made", "towards", "within", "period", "view", "past", "present", "already", "use", "ensure", "fact", "analysis", "number", "numbers", "objective", "part"))
# ## Detect bigrams
# ECB_speeches <- tokens_compound(
#   ECB_speeches,
#   phrase(quanteda.textstats::textstat_collocations(ECB_speeches, min_count = 1500, size = 2)$collocation)
# )
# 
# ECB_speeches <- tokens_tolower(ECB_speeches)
# 
# ## Frequency-based cleaning
# dfm <- dfm(ECB_speeches)
# dfm
# 
# dfm |> dfm_remove(c(stopwords("en"), "non")) |>
#   dfm_trim(min_docfreq = .02, docfreq_type = "prop") |>
#   dfm_trim(max_docfreq = 1, docfreq_type = "prop") |>
#   dfm_select(min_nchar = 3) -> dfm2
# 
# dfm2
# 
# intersect(
#   c(stopwords("en"), "speech", "see", "ladies", "gentlemen", "year", "years", "also", "can", "may", "new", "one", "time", "important", "level", "however", "need", "first", "even", "let", "like", "two", "three", "four", "five", "framework", "second", "third", "now", "well", "since", "recent", "process", "current", "large", "many", "future", "low", "still", "way", "high", "across", "key", "today", "last", "main", "different", "much", "take", "make", "higher", "lower", "made", "towards", "within", "period", "view", "past", "present", "already", "use", "ensure", "fact", "analysis", "number", "numbers", "objective", "part"),
#   unlist(LoughranMcDonald, use.names = FALSE)
# )
# 
# toks <- as.tokens(dfm2, tokens = ECB_speeches, padding = TRUE, ignore_list = unlist(LoughranMcDonald, use.names = FALSE))
# 
# ## Subset on dates to limit dataset size
# toks <- toks[toks$.date >= "2016-01-01"]
# toks <- toks[ntoken(toks) > 200]
# 
# ECB_speeches <- toks
# 
# usethis::use_data(ECB_speeches, overwrite = TRUE)


# Press conference --------------------------------------------------------

library(quanteda)
ECB_press_conferences <- get_ECB_press_conferences()
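## Quick sanity check on the scrape (assumes one row per paragraph, with
## doc_id, paragraph_id, date, title, section_title and content columns,
## as used further below):
# str(ECB_press_conferences)
# anyNA(ECB_press_conferences$content)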
# ECB_press_conferences2 <- ECB_press_conferences[, list(
#   date = unique(date),
#   title = paste0(unique(title), " -- ", section_title[1]),
#   content = paste0(content, collapse = "\n\n")
# ), by = "doc_id"]

# ECB_press_conferences2 <- corpus(ECB_press_conferences2, docid_field = "doc_id", text_field = "content")
# ECB_press_conferences22 <- corpus_reshape(ECB_press_conferences2, "paragraphs")
# ECB_press_conferences22
ECB_press_conferences <- corpus(ECB_press_conferences, docid_field = "paragraph_id", text_field = "content")
# 
# head(attr(ECB_press_conferences22, "docvars"))
# head(attr(ECB_press_conferences, "docvars"))
# 
# waldo::compare(ECB_press_conferences, ECB_press_conferences22)

attr(ECB_press_conferences, "meta")$object$unit <- "paragraph"
attr(ECB_press_conferences, "docvars")$docid_ <- factor(ECB_press_conferences$doc_id, levels = unique(ECB_press_conferences$doc_id))
data.table::setnames(attr(ECB_press_conferences, "docvars"), "date", ".date")
ECB_press_conferences$.date <- as.Date(ECB_press_conferences$.date)
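## Minimal consistency check for the surgery above (hedged: docid_ and the
## docvars layout follow quanteda internals, which may change across versions):
# stopifnot(
#   inherits(ECB_press_conferences$.date, "Date"),
#   is.factor(attr(ECB_press_conferences, "docvars")$docid_)
# )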
# ee <- corpus_reshape(ECB_press_conferences, "documents")

{
  ## Paragraph-level sentiment with the Loughran-McDonald lexicon and
  ## English valence shifters, attached below as the .sentiment docvar
  s <- sentometrics::compute_sentiment(
    ECB_press_conferences,
    lexicons = sentometrics::sento_lexicons(sentometrics::list_lexicons["LM_en"],
                                            sentometrics::list_valence_shifters$en),
    how = "proportional"
  )
  ECB_press_conferences$.sentiment <- s$LM_en
  rm(s)
}
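## Illustrative look at the distribution of the attached sentiment scores:
# summary(ECB_press_conferences$.sentiment)
# hist(ECB_press_conferences$.sentiment, breaks = 50)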
usethis::use_data(ECB_press_conferences, overwrite = TRUE)

## Drop paragraphs that are too short
{
  n <- ntoken(ECB_press_conferences, remove_punct = TRUE,
              remove_symbols = TRUE, remove_numbers = TRUE)
  # summary(n)
  # aggregate(n, list(ECB_press_conferences$doc_id), sum)$x |> hist()
  ECB_press_conferences_tokens <- ECB_press_conferences[n > 10]
}
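## Illustrative count of paragraphs kept vs dropped by the length filter:
# table(n > 10)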

{
  spacyr::spacy_initialize(entity = FALSE)
  ## Parse with spaCy and print the elapsed time; the assignment still
  ## happens inside system.time(), so `parsed` is created as a side effect
  (parsed <- spacyr::spacy_parse(ECB_press_conferences_tokens,
                                 entity = FALSE, nounphrase = FALSE)) |>
    system.time() |>
    print()
  ## Keep only content-word lemmas (nouns, adjectives, proper nouns)
  toks <- quanteda::as.tokens(subset(parsed, pos %in% c("NOUN", "ADJ", "PROPN")), use_lemma = TRUE)
  ## Drop documents that lost every token in the POS filter
  ECB_press_conferences_tokens <- ECB_press_conferences_tokens[names(ECB_press_conferences_tokens) %in% names(toks)]
  ## Earlier manual docvar copies, superseded by the docvars() call below
  # toks$.date <- as.Date(ECB_press_conferences_tokens$.date)
  # ECB_press_conferences_tokens$date <- NULL
  # toks$.sentiment <- ECB_press_conferences_tokens$.sentiment
  # ECB_press_conferences_tokens$sentiment <- NULL
  docvars(toks) <- docvars(ECB_press_conferences_tokens)
  ECB_press_conferences_tokens <- toks
}
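## Rough check of what survives the NOUN/ADJ/PROPN filter (illustrative):
# summary(ntoken(ECB_press_conferences_tokens))
# head(ECB_press_conferences_tokens, 2)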

## Remove set of stopwords
ECB_press_conferences_tokens <- tokens_remove(
  ECB_press_conferences_tokens,
  c(stopwords("en", source = "marimo"),
    "lady", "term", "long", "medium", "short", "gentleman", "speech", "see",
    "ladies", "gentlemen", "year", "years", "also", "can", "may", "new",
    "one", "time", "important", "level", "however", "need", "first", "even",
    "let", "like", "two", "three", "four", "five", "framework", "second",
    "third", "now", "well", "since", "recent", "process", "current", "large",
    "many", "future", "low", "still", "way", "high", "across", "key",
    "today", "last", "main", "different", "much", "take", "make", "higher",
    "lower", "made", "towards", "within", "period", "view", "past",
    "present", "already", "use", "ensure", "fact", "analysis", "number",
    "numbers", "objective", "part")
)
## Detect bigrams (compute the collocations once, inspect, then compound)
colls <- quanteda.textstats::textstat_collocations(ECB_press_conferences_tokens,
                                                   min_count = 500, size = 2)
colls
ECB_press_conferences_tokens <- tokens_compound(
  ECB_press_conferences_tokens,
  phrase(colls$collocation)
)

ECB_press_conferences_tokens <- tokens_tolower(ECB_press_conferences_tokens)
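## Quick look at the most frequent compounded bigrams (tokens containing "_");
## an illustrative check only, not part of the build:
# topfeatures(dfm(tokens_select(ECB_press_conferences_tokens, "*_*")), 10)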

length(types(ECB_press_conferences_tokens))

## Frequency-based cleaning
dfm <- dfm(ECB_press_conferences_tokens)
dfm

dfm |> dfm_remove(c(stopwords("en", source = "marimo"), "non")) |>
  dfm_trim(min_docfreq = 10) |>
  dfm_trim(max_docfreq = 1, docfreq_type = "prop") |>
  dfm_select(min_nchar = 3) -> dfm2
dfm2
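## Effect of the frequency trimming, features before vs after (illustrative):
# c(before = nfeat(dfm), after = nfeat(dfm2))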

ECB_press_conferences_tokens <- as.tokens(
  dfm2,
  tokens = ECB_press_conferences_tokens,
  padding = TRUE,
  ignore_list = unlist(LoughranMcDonald, use.names = FALSE)
)
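## With padding = TRUE the removed positions presumably stay as empty pads
## (as in quanteda's tokens_remove), and ignore_list exempts the
## LoughranMcDonald terms from trimming. Final vocabulary size (illustrative):
# length(types(ECB_press_conferences_tokens))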

usethis::use_data(ECB_press_conferences_tokens, overwrite = TRUE)