# Document-wide chunk defaults: code is echoed, output lines carry no
# comment prefix, errors are reported in the output (error = TRUE),
# warnings and messages are hidden, and figures are centred.
knitr::opts_chunk$set(
    echo = TRUE,
    comment = NA,
    error = TRUE,
    warning = FALSE,
    message = FALSE,
    fig.align = "center"
)

Extracting 2-gram words from the papers

Load paper object from OnePetro scan

To save time, we do not run the OnePetro scan here. The scan was done in another notebook, and its saved results are loaded below.

library(petro.One)
library(RWeka)
library(tm)
library(qdap)
library(tidyverse)
library(wordcloud)
library(knitr)
library(kableExtra)


# Load the previously saved search results instead of scanning again.
# load() restores whatever objects were saved in the .rda file into the
# current environment; the next line presumably relies on it providing
# `paper_search_obj` (verify against the notebook that saved the file).
load(file = paste0("rese_rese_mach_mach", ".rda"))
# Keep only the `papers` component, which all later chunks work on.
papers <- paper_search_obj$papers
# Display the papers in the rendered document.
papers

Create corpus for paper titles

# Build a corpus from the paper titles and normalise it: lowercase, strip
# punctuation, collapse whitespace, drop stop words and, as the last step,
# stem every word. The 2-gram stop words defined later in this notebook are
# written as stems for that reason.

data("stopwords")  # presumably supplies `custom_stopwords` used below
vdocs <- papers$book_title %>%
    VectorSource() %>%
    VCorpus() %>%
    tm_map(content_transformer(tolower)) %>%       # convert to lowercase
    # removeNumbers is intentionally skipped: with numbers removed,
    # terms such as CO2 or 3D would not be recognized
    tm_map(removePunctuation) %>%                  # remove all punctuation
    tm_map(stripWhitespace) %>%                    # collapse whitespace
    tm_map(removeWords, stopwords("english")) %>%  # built-in stop words
    tm_map(removeWords, custom_stopwords) %>%      # custom stop words
    tm_map(stemDocument, language = "english")     # apply stemming

Analysis on two-words

# Set the mc.cores option to 1 before tokenizing (presumably to keep the
# tokenizer from being run in parallel -- confirm).
options(mc.cores = 1)

# Tokenizer handed to TermDocumentMatrix(): with min = max = 2 the n-gram
# tokenizer is asked for two-word terms only.
twogramTokenizer <- function(x) {
    bigram_control <- Weka_control(min = 2, max = 2)
    NGramTokenizer(x, control = bigram_control)
}

# Count every 2-gram in the corpus and rank the terms by total frequency.
tdm2 <- TermDocumentMatrix(vdocs, control = list(tokenize = twogramTokenizer))

# Dense terms-by-documents matrix; row sums give the count per term.
tdm2.matrix <- as.matrix(tdm2)
tdm2.rs <- tdm2.matrix %>%
    rowSums() %>%
    sort(decreasing = TRUE)

# The counts are passed unnamed so the data frame gets default row numbers
# instead of the terms as row names.
tdm2.df <- data.frame(
    word = names(tdm2.rs),
    freq = unname(tdm2.rs),
    stringsAsFactors = FALSE
)

# The 60 most frequent terms, used for the summary table.
tdm2_df <- head(tdm2.df, n = 60)

# Show the 60 most frequent terms as three tables of 20 rows each.
list(1:20, 21:40, 41:60) %>%
    lapply(function(rows) tdm2_df[rows, ]) %>%
    kable(booktabs = TRUE, row.names = FALSE)

Plots on raw data

# A minimum frequency is set because the cloud would otherwise be too
# crowded. The seed is fixed before drawing.
set.seed(1234)
wordcloud(
    words = tdm2.df[["word"]],
    freq = tdm2.df[["freq"]],
    min.freq = 8,
    max.words = 200,
    random.order = TRUE,
    rot.per = 0.35,
    colors = brewer.pal(8, "Dark2"),
    font = 2
)

# Horizontal bar chart of the terms that occur more than 8 times, with the
# bars ordered by count.
p2 <- tdm2.df %>%
    subset(freq > 8) %>%
    ggplot(aes(x = reorder(word, freq), y = freq)) +
    geom_col() +
    labs(x = "Terms", y = "Count") +
    coord_flip()

p2

We see many terms that are not directly related to the reservoir engineering (RE) discipline. We will remove them by first creating a vector of additional stop words, split into three groups: data analytics terms, common or geographical terms, and terms outside the RE discipline, such as electric submersible pumps.

Refining the results

Additional custom stop words

# Additional 2-gram stop words, written in stemmed form and kept in three
# groups that are combined at the end.

# Data analytics vocabulary.
datalytics <- c(
    "machin learn", "artifici intellig", "genet algorithm",
    "data analyt", "data mine", "data driven",
    "vector machin", "data analysi", "deep learn",
    "technolog focus", "pattern recognit", "datadriven model",
    "fuzzi logic", "kalman filter", "ensembl kalman",
    "optim techniqu", "neural network", "algorithm applic",
    "data manag", "data scienc", "datadriven method"
)

# Common or geographical terms.
common_terms <- c(
    "oil gas", "marcellus shale", "eagl ford", "oil field",
    "midland basin", "permian basin", "barnett shale", "crude oil"
)

# Terms that belong to other disciplines.
not_in_discipline <- c(
    "ep note", "electr submers", "drill data", "well complet"
)

# Single vector with all three groups, in the order defined above.
custom_stopwords_2 <- c(datalytics, common_terms, not_in_discipline)

Final Frequency table

# Refine the corpus by removing the additional 2-gram stop words, then
# rebuild the term counts on the cleaned corpus.
vdocs <- vdocs %>% tm_map(removeWords, custom_stopwords_2)

tdm2 <- TermDocumentMatrix(vdocs, control = list(tokenize = twogramTokenizer))

# Dense terms-by-documents matrix; row sums give the count per term.
tdm2.matrix <- as.matrix(tdm2)
tdm2.rs <- tdm2.matrix %>%
    rowSums() %>%
    sort(decreasing = TRUE)

# The counts are passed unnamed so the data frame gets default row numbers
# instead of the terms as row names.
tdm2.df <- data.frame(
    word = names(tdm2.rs),
    freq = unname(tdm2.rs),
    stringsAsFactors = FALSE
)

# The 60 most frequent remaining terms, used for the final table.
tdm2_df <- head(tdm2.df, n = 60)

# Final frequency table: the top 60 terms as three tables of 20 rows each.
list(1:20, 21:40, 41:60) %>%
    lapply(function(rows) tdm2_df[rows, ]) %>%
    kable(booktabs = TRUE, row.names = FALSE)

Word cloud

# Word cloud of the refined terms, drawn with the same seed and settings
# as the word cloud of the raw terms.
set.seed(1234)
wordcloud(
    words = tdm2.df[["word"]],
    freq = tdm2.df[["freq"]],
    min.freq = 8,
    max.words = 200,
    random.order = TRUE,
    rot.per = 0.35,
    colors = brewer.pal(8, "Dark2"),
    font = 2
)

Bar plot

# Horizontal bar chart of the refined terms that occur more than 8 times,
# with the bars ordered by count.
p2 <- tdm2.df %>%
    subset(freq > 8) %>%
    ggplot(aes(x = reorder(word, freq), y = freq)) +
    geom_col() +
    labs(x = "Terms", y = "Count") +
    coord_flip()

p2

Get papers for top 10 terms

# Get the papers for the top 10 terms, then list each distinct term with
# its frequency in a striped table.
top_papers <- get_top_term_papers(papers, tdm2.matrix, 10)
distinct(top_papers, word, freq) %>%
    kable(booktabs = TRUE) %>%
    kable_styling(latex_options = "striped")

Notice that the terms above are stemmed, that is, reduced to the root of each word. In the next table, the stemmed terms are replaced by the complete words.

Report

# Report: title (cut to 80 characters) and id of every paper in
# `top_papers`, rendered as a striped long table whose header repeats on
# each page, with the rows labelled by the full (unstemmed) term.
#
# group_rows() is called through the kableExtra namespace because dplyr
# also exports a function named group_rows(); without the prefix the call
# only works while kableExtra is attached after the tidyverse.
#
# NOTE(review): the row ranges below are hard-coded and presumably match
# the row order and per-term counts of `top_papers` for the saved search
# results -- re-check them whenever the input data changes.
top_papers %>%
    select(book_title, paper_id) %>%
    mutate(book_title = substr(book_title, 1, 80)) %>%
    kable(booktabs = TRUE, longtable = TRUE) %>%
    kable_styling(latex_options = c("striped", "repeat_header")) %>%
    kableExtra::group_rows("history matching", 1, 46) %>%
    kableExtra::group_rows("reservoir simulation", 47, 66) %>%
    kableExtra::group_rows("well logging", 67, 85) %>%
    kableExtra::group_rows("oil recovery", 86, 104) %>%
    kableExtra::group_rows("reservoir modeling", 105, 122) %>%
    kableExtra::group_rows("unconventional reservoirs", 123, 139) %>%
    kableExtra::group_rows("uncertainty quantification", 140, 156) %>%
    kableExtra::group_rows("hydraulic fracturing", 157, 173) %>%
    kableExtra::group_rows("carbonate reservoirs", 174, 189) %>%
    kableExtra::group_rows("horizontal wells", 190, 202)

Sample of papers for Candidate Selection and Permeability Prediction

# Render the papers found for one term as a striped table.
#
# term_papers: data frame with (at least) the columns book_title, paper_id
#              and authors, as returned by get_top_term_papers() below.
# caption:     table caption.
#
# Titles are cut to 65 characters, paper ids to 14 and author lists to 20
# so that the table fits the page width. Returns the styled kable object.
format_term_papers <- function(term_papers, caption) {
    term_papers %>%
        select(book_title, paper_id, authors) %>%
        mutate(
            book_title = substr(book_title, 1, 65),
            paper_id = substr(paper_id, 1, 14),
            authors = substr(authors, 1, 20)
        ) %>%
        kable(booktabs = TRUE, caption = caption) %>%
        kable_styling(latex_options = "striped")
}

# Papers for the stemmed term "candid select" (candidate selection).
papers_2t <- get_top_term_papers(papers, tdm2.matrix,
                                 terms = c("candid select"))
format_term_papers(papers_2t, caption = "Candidate Selection")

# Papers for the stemmed term "permeabl predict" (permeability prediction).
papers_2t <- get_top_term_papers(papers, tdm2.matrix,
                                 terms = c("permeabl predict"))
format_term_papers(papers_2t, caption = "Permeability Prediction")


f0nzie/petro.One documentation built on May 29, 2019, 12:05 a.m.