Create Topic Model for Incidents

suppressMessages(library(here))
suppressMessages(library(tidyverse))
library(tidytext)
library(SnowballC)
library(topicmodels)
library(ldatuning)
library(here)

assets <- purrr::partial(here, "data-raw", "util", "assets")
data_folder <- purrr::partial(here, "data-raw", ".temp", "data")

Used data as of Oct 12, 2020.

Original exploration can be found at: https://julianbarg.github.io/spills/inquiries/incident_variance/2020-08-03.slides.html#/

# incidents <- readRDS(data_folder("incidents_merged.rds"))
# readr::write_rds(
#   select(incidents, incident_ID, ID, name, on_offshore, commodity, year,
#          narrative),
#   assets("narratives.rds"))
narratives <- readRDS(assets("narratives.rds"))
head(narratives)

We want to grab the company names, so we can remove those from the dataset

company_names <- unique(word(narratives$name, 1))
paste(company_names, collapse = ", ")

Limite data set to observation period and only retain relevant columns.

Create document-term matrix

```r, slideshow={'slide_type': 'subslide'}} narratives <- subset(narratives, on_offshore == "onshore" & commodity %in% c("rpp", "crude") & year >= 2004 & year < 2020 & !is.na(narrative))

```r}
word_counts <- narratives %>%
    unnest_tokens(word, narrative) %>%
    anti_join(data.frame(word = str_to_lower(company_names)), by = "word") %>%
    anti_join(stop_words, by = "word") %>%
    filter(! str_detect(word, "^[0-9]")) %>%
    mutate(word = wordStem(word)) %>%
    count(incident_ID, word, sort = T) %>%
    cast_dtm(document = incident_ID, term = word, 
             value = n)
word_counts

Run topicmodels

Models based on data as of Oct 12, 2020.

```r, slideshow={'slide_type': 'subslide'}}

results_1 <- FindTopicsNumber(word_counts,

topics = c(5, 10, 20, 40, 80, 120, 160),

metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),

method = "Gibbs",

control = list(seed = 532))

saveRDS(results_1, file = assets("results_1.rds"))

results_1 <- readRDS(assets("results_1.rds"))

```r, slideshow={'slide_type': 'subslide'}}
FindTopicsNumber_plot(results_1)

```r, slideshow={'slide_type': 'subslide'}}

results_2 <- FindTopicsNumber(word_counts,

topics = c(15, 18, 19, 20, 21, 22, 25),

metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),

method = "Gibbs",

control = list(seed = 544))

saveRDS(results_2, file = assets("results_2.rds"))

results_2 <- readRDS(assets("results_2.rds"))

```r, slideshow={'slide_type': 'subslide'}}
FindTopicsNumber_plot(results_2)

```r, slideshow={'slide_type': 'subslide'}}

results_3 <- FindTopicsNumber(word_counts,

topics = seq(15, 35),

metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),

method = "Gibbs",

control = list(seed = 912))

saveRDS(results_3, file = assets("results_3.rds"))

results_3 <- readRDS(assets("results_3.rds"))

```r, slideshow={'slide_type': 'subslide'}}
FindTopicsNumber_plot(results_3)

Create model to be used

Models based on data as of Oct 12, 2020.

# narratives_lda <- LDA(word_counts, method = "Gibbs", k = 23, control = list(seed = 912))
# saveRDS(narratives_lda, assets("narratives_lda.rds"))
narratives_lda <- readRDS(assets("narratives_lda.rds"))
narratives_lda

Get betas and gammas

gammas <- tidy(narratives_lda, matrix = "gamma")
head(gammas)

```r, slideshow={'slide_type': 'subslide'}} betas <- tidy(narratives_lda, matrix = "beta") head(betas)

<!-- #region slideshow={"slide_type": "slide"} -->
## Save model
<!-- #endregion -->

```r}
saveRDS(betas, data_folder("betas.rds"))

```r, slideshow={'slide_type': 'subslide'}} gammas <- narratives %>% mutate(incident_ID = as.character(incident_ID)) %>% select(incident_ID, ID, year, commodity, on_offshore) %>% right_join(gammas, by = c("incident_ID" = "document"))

head(gammas)

```r, slideshow={'slide_type': 'subslide'}}
saveRDS(gammas, data_folder("gammas.rds"))


julianbarg/oildata documentation built on Nov. 27, 2020, 4 p.m.