suppressMessages(library(here)) suppressMessages(library(tidyverse)) library(tidytext) library(SnowballC) library(topicmodels) library(ldatuning) library(here) assets <- purrr::partial(here, "data-raw", "util", "assets") data_folder <- purrr::partial(here, "data-raw", ".temp", "data")
Used data as of Oct 12, 2020.
Original exploration can be found at: https://julianbarg.github.io/spills/inquiries/incident_variance/2020-08-03.slides.html#/
# incidents <- readRDS(data_folder("incidents_merged.rds")) # readr::write_rds( # select(incidents, incident_ID, ID, name, on_offshore, commodity, year, # narrative), # assets("narratives.rds")) narratives <- readRDS(assets("narratives.rds")) head(narratives)
company_names <- unique(word(narratives$name, 1)) paste(company_names, collapse = ", ")
Limite data set to observation period and only retain relevant columns.
```r, slideshow={'slide_type': 'subslide'}} narratives <- subset(narratives, on_offshore == "onshore" & commodity %in% c("rpp", "crude") & year >= 2004 & year < 2020 & !is.na(narrative))
```r} word_counts <- narratives %>% unnest_tokens(word, narrative) %>% anti_join(data.frame(word = str_to_lower(company_names)), by = "word") %>% anti_join(stop_words, by = "word") %>% filter(! str_detect(word, "^[0-9]")) %>% mutate(word = wordStem(word)) %>% count(incident_ID, word, sort = T) %>% cast_dtm(document = incident_ID, term = word, value = n) word_counts
Models based on data as of Oct 12, 2020.
```r, slideshow={'slide_type': 'subslide'}}
results_1 <- readRDS(assets("results_1.rds"))
```r, slideshow={'slide_type': 'subslide'}} FindTopicsNumber_plot(results_1)
```r, slideshow={'slide_type': 'subslide'}}
results_2 <- readRDS(assets("results_2.rds"))
```r, slideshow={'slide_type': 'subslide'}} FindTopicsNumber_plot(results_2)
```r, slideshow={'slide_type': 'subslide'}}
results_3 <- readRDS(assets("results_3.rds"))
```r, slideshow={'slide_type': 'subslide'}} FindTopicsNumber_plot(results_3)
Models based on data as of Oct 12, 2020.
# narratives_lda <- LDA(word_counts, method = "Gibbs", k = 23, control = list(seed = 912)) # saveRDS(narratives_lda, assets("narratives_lda.rds")) narratives_lda <- readRDS(assets("narratives_lda.rds")) narratives_lda
gammas <- tidy(narratives_lda, matrix = "gamma") head(gammas)
```r, slideshow={'slide_type': 'subslide'}} betas <- tidy(narratives_lda, matrix = "beta") head(betas)
<!-- #region slideshow={"slide_type": "slide"} --> ## Save model <!-- #endregion --> ```r} saveRDS(betas, data_folder("betas.rds"))
```r, slideshow={'slide_type': 'subslide'}} gammas <- narratives %>% mutate(incident_ID = as.character(incident_ID)) %>% select(incident_ID, ID, year, commodity, on_offshore) %>% right_join(gammas, by = c("incident_ID" = "document"))
head(gammas)
```r, slideshow={'slide_type': 'subslide'}} saveRDS(gammas, data_folder("gammas.rds"))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.