In julianbarg/oildata: Access data on the American oil pipeline system and oil spills

Create Topic Model for Incidents

suppressMessages(library(here))
suppressMessages(library(tidyverse))
library(tidytext)
library(SnowballC)
library(topicmodels)
library(ldatuning)
library(here)

assets <- purrr::partial(here, "data-raw", "util", "assets")
data_folder <- purrr::partial(here, "data-raw", ".temp", "data")

Used data as of Oct 12, 2020.

Original exploration can be found at: https://julianbarg.github.io/spills/inquiries/incident_variance/2020-08-03.slides.html#/

# incidents <- readRDS(data_folder("incidents_merged.rds"))
# readr::write_rds(
#   select(incidents, incident_ID, ID, name, on_offshore, commodity, year,
#          narrative),
#   assets("narratives.rds"))
narratives <- readRDS(assets("narratives.rds"))
head(narratives)

We want to grab the company names, so we can remove those from the dataset

company_names <- unique(word(narratives$name, 1))
paste(company_names, collapse = ", ")

Limite data set to observation period and only retain relevant columns.

Create document-term matrix

```r, slideshow={'slide_type': 'subslide'}} narratives <- subset(narratives, on_offshore == "onshore" & commodity %in% c("rpp", "crude") & year >= 2004 & year < 2020 & !is.na(narrative))

```r}
word_counts <- narratives %>%
    unnest_tokens(word, narrative) %>%
    anti_join(data.frame(word = str_to_lower(company_names)), by = "word") %>%
    anti_join(stop_words, by = "word") %>%
    filter(! str_detect(word, "^[0-9]")) %>%
    mutate(word = wordStem(word)) %>%
    count(incident_ID, word, sort = T) %>%
    cast_dtm(document = incident_ID, term = word, 
             value = n)
word_counts

Run topicmodels

Models based on data as of Oct 12, 2020.

```r, slideshow={'slide_type': 'subslide'}}

results_1 <- FindTopicsNumber(word_counts,

topics = c(5, 10, 20, 40, 80, 120, 160),

metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),

method = "Gibbs",

control = list(seed = 532))

saveRDS(results_1, file = assets("results_1.rds"))

results_1 <- readRDS(assets("results_1.rds"))

```r, slideshow={'slide_type': 'subslide'}}
FindTopicsNumber_plot(results_1)

```r, slideshow={'slide_type': 'subslide'}}

results_2 <- FindTopicsNumber(word_counts,

topics = c(15, 18, 19, 20, 21, 22, 25),

metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),

method = "Gibbs",

control = list(seed = 544))

saveRDS(results_2, file = assets("results_2.rds"))

results_2 <- readRDS(assets("results_2.rds"))

```r, slideshow={'slide_type': 'subslide'}}
FindTopicsNumber_plot(results_2)

```r, slideshow={'slide_type': 'subslide'}}

results_3 <- FindTopicsNumber(word_counts,

topics = seq(15, 35),

metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),

method = "Gibbs",

control = list(seed = 912))

saveRDS(results_3, file = assets("results_3.rds"))

results_3 <- readRDS(assets("results_3.rds"))

```r, slideshow={'slide_type': 'subslide'}}
FindTopicsNumber_plot(results_3)

Create model to be used

Models based on data as of Oct 12, 2020.

# narratives_lda <- LDA(word_counts, method = "Gibbs", k = 23, control = list(seed = 912))
# saveRDS(narratives_lda, assets("narratives_lda.rds"))
narratives_lda <- readRDS(assets("narratives_lda.rds"))
narratives_lda

Get betas and gammas

gammas <- tidy(narratives_lda, matrix = "gamma")
head(gammas)

```r, slideshow={'slide_type': 'subslide'}} betas <- tidy(narratives_lda, matrix = "beta") head(betas)

<!-- #region slideshow={"slide_type": "slide"} -->
## Save model
<!-- #endregion -->

```r}
saveRDS(betas, data_folder("betas.rds"))

```r, slideshow={'slide_type': 'subslide'}} gammas <- narratives %>% mutate(incident_ID = as.character(incident_ID)) %>% select(incident_ID, ID, year, commodity, on_offshore) %>% right_join(gammas, by = c("incident_ID" = "document"))

head(gammas)

```r, slideshow={'slide_type': 'subslide'}}
saveRDS(gammas, data_folder("gammas.rds"))

julianbarg/oildata documentation built on Nov. 27, 2020, 4 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

julianbarg/oildata
Access data on the American oil pipeline system and oil spills

In julianbarg/oildata: Access data on the American oil pipeline system and oil spills

Create Topic Model for Incidents

We want to grab the company names, so we can remove those from the dataset

Create document-term matrix

Run topicmodels

results_1 <- FindTopicsNumber(word_counts,

topics = c(5, 10, 20, 40, 80, 120, 160),

metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),

method = "Gibbs",

control = list(seed = 532))

saveRDS(results_1, file = assets("results_1.rds"))

results_2 <- FindTopicsNumber(word_counts,

topics = c(15, 18, 19, 20, 21, 22, 25),

metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),

method = "Gibbs",

control = list(seed = 544))

saveRDS(results_2, file = assets("results_2.rds"))

results_3 <- FindTopicsNumber(word_counts,

topics = seq(15, 35),

metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),

method = "Gibbs",

control = list(seed = 912))

saveRDS(results_3, file = assets("results_3.rds"))

Create model to be used

Get betas and gammas

R Package Documentation

Browse R Packages

We want your feedback!

julianbarg/oildata Access data on the American oil pipeline system and oil spills

In julianbarg/oildata: Access data on the American oil pipeline system and oil spills

Create Topic Model for Incidents

We want to grab the company names, so we can remove those from the dataset

Create document-term matrix

Run topicmodels

results_1 <- FindTopicsNumber(word_counts,

topics = c(5, 10, 20, 40, 80, 120, 160),

metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),

method = "Gibbs",

control = list(seed = 532))

saveRDS(results_1, file = assets("results_1.rds"))

results_2 <- FindTopicsNumber(word_counts,

topics = c(15, 18, 19, 20, 21, 22, 25),

metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),

method = "Gibbs",

control = list(seed = 544))

saveRDS(results_2, file = assets("results_2.rds"))

results_3 <- FindTopicsNumber(word_counts,

topics = seq(15, 35),

metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),

method = "Gibbs",

control = list(seed = 912))

saveRDS(results_3, file = assets("results_3.rds"))

Create model to be used

Get betas and gammas

R Package Documentation

Browse R Packages

We want your feedback!

julianbarg/oildata
Access data on the American oil pipeline system and oil spills