# Global knitr chunk options for this notebook: echo the code, print output
# without a comment prefix, keep rendering when a chunk raises an error,
# hide warnings and messages, and center the figures.
knitr::opts_chunk$set(
  echo = TRUE,
  comment = NA,
  error = TRUE,
  warning = FALSE,
  message = FALSE,
  fig.align = "center"
)
To save time, we do not run the scan here. The scan of OnePetro papers was done in another notebook.
# Packages used throughout this notebook.
library(petro.One)
library(RWeka)
library(tm)
library(qdap)
library(tidyverse)
library(wordcloud)
library(knitr)
library(kableExtra)

# Load previous findings: the saved search results are read from disk instead
# of being queried again.
# NOTE(review): assumes the .rda file defines `paper_search_obj` -- confirm.
load(file = paste0("rese_rese_mach_mach", ".rda"))

# Keep only the papers table of the search object and print it.
papers <- paper_search_obj$papers
papers
# Build the text corpus from the paper titles. This time we remove all
# punctuation and extra whitespace, convert to lowercase, drop the stop words,
# and finally apply stemming.

# NOTE(review): presumably this data set provides `custom_stopwords`, which is
# used below -- confirm.
data("stopwords")

vdocs <- VCorpus(VectorSource(papers$book_title))

# convert to lowercase
vdocs <- tm_map(vdocs, content_transformer(tolower))

# Numbers are kept on purpose: if we remove them, CO2, 3D won't be recognized.
# vdocs <- tm_map(vdocs, removeNumbers)

# remove all punctuation
vdocs <- tm_map(vdocs, removePunctuation)

# remove whitespaces
vdocs <- tm_map(vdocs, stripWhitespace)

# remove built-in stopwords
vdocs <- tm_map(vdocs, removeWords, stopwords("english"))

# remove custom stopwords
vdocs <- tm_map(vdocs, removeWords, custom_stopwords)

# Apply stemming. It runs last, so from here on the corpus holds word stems.
vdocs <- tm_map(vdocs, stemDocument, language = "english")
# Set the `mc.cores` option to 1 before tokenizing.
options(mc.cores = 1)

# Tokenizer that splits a document into 2-grams (minimum and maximum n-gram
# size are both 2).
twogramTokenizer <- function(x) {
  bigram_control <- Weka_control(min = 2, max = 2)
  NGramTokenizer(x, bigram_control)
}

# Term-document matrix of the 2-grams found in the corpus.
tdm2 <- TermDocumentMatrix(
  vdocs,
  control = list(tokenize = twogramTokenizer)
)
tdm2.matrix <- as.matrix(tdm2)

# Total count of each 2-gram across all documents, most frequent first.
tdm2.rs <- sort(rowSums(tdm2.matrix), decreasing = TRUE)
tdm2.df <- data.frame(
  word = names(tdm2.rs),
  freq = tdm2.rs,
  stringsAsFactors = FALSE
)
row.names(tdm2.df) <- NULL

# Show the 60 most frequent 2-grams as three tables of 20 rows each.
tdm2_df <- head(tdm2.df, 60)
kable(
  list(
    tdm2_df[1:20, ],
    tdm2_df[21:40, ],
    tdm2_df[41:60, ]
  ),
  booktabs = TRUE,
  row.names = FALSE
)
# Word cloud of the 2-grams. We change the minimum frequency of appearance;
# otherwise the cloud will be too crowded.
set.seed(1234)
wordcloud(
  words = tdm2.df$word,
  freq = tdm2.df$freq,
  min.freq = 8,
  max.words = 200,
  random.order = TRUE,
  rot.per = 0.35,
  colors = brewer.pal(8, "Dark2"),
  font = 2
)

# Horizontal bar chart of the 2-grams that appear more than 8 times, ordered
# by frequency.
p2 <- ggplot(
  subset(tdm2.df, freq > 8),
  aes(x = reorder(word, freq), y = freq)
) +
  geom_col() +
  labs(x = "Terms", y = "Count") +
  coord_flip()
p2
We see many words that are not directly related to the reservoir engineering discipline. We will remove them by first creating a vector of common stop words. We split them into three groups: data analytics stop words, common or geographical terms, and terms that are not in the RE discipline, such as electrical submersible pumps.
# Additional 2-gram stop words, written in the same stemmed form as the terms
# of the corpus.

# Group 1: data analytics terms.
datalytics <- c(
  "machin learn",
  "artifici intellig",
  "genet algorithm",
  "data analyt",
  "data mine",
  "data driven",
  "vector machin",
  "data analysi",
  "deep learn",
  "technolog focus",
  "pattern recognit",
  "datadriven model",
  "fuzzi logic",
  "kalman filter",
  "ensembl kalman",
  "optim techniqu",
  "neural network",
  "algorithm applic",
  "data manag",
  "data scienc",
  "datadriven method"
)

# Group 2: common or geographical terms.
common_terms <- c(
  "oil gas",
  "marcellus shale",
  "eagl ford",
  "oil field",
  "midland basin",
  "permian basin",
  "barnett shale",
  "crude oil"
)

# Group 3: terms that do not belong to the discipline.
not_in_discipline <- c(
  "ep note",
  "electr submers",
  "drill data",
  "well complet"
)

# All three groups combined into one stop-word vector.
custom_stopwords_2 <- c(datalytics, common_terms, not_in_discipline)
# Refine the paper corpus by removing more stop words: the 2-grams collected
# in `custom_stopwords_2`.
vdocs <- tm_map(vdocs, removeWords, custom_stopwords_2)

# Rebuild the 2-gram term-document matrix from the refined corpus.
tdm2 <- TermDocumentMatrix(
  vdocs,
  control = list(tokenize = twogramTokenizer)
)
tdm2.matrix <- as.matrix(tdm2)

# Total count of each 2-gram across all documents, most frequent first.
tdm2.rs <- sort(rowSums(tdm2.matrix), decreasing = TRUE)
tdm2.df <- data.frame(
  word = names(tdm2.rs),
  freq = tdm2.rs,
  stringsAsFactors = FALSE
)
row.names(tdm2.df) <- NULL

# Show the 60 most frequent 2-grams as three tables of 20 rows each.
tdm2_df <- head(tdm2.df, 60)
kable(
  list(
    tdm2_df[1:20, ],
    tdm2_df[21:40, ],
    tdm2_df[41:60, ]
  ),
  booktabs = TRUE,
  row.names = FALSE
)
# Word cloud of the 2-grams left after the extra stop words were removed.
# The seed is set right before drawing.
set.seed(1234)
wordcloud(
  words = tdm2.df$word,
  freq = tdm2.df$freq,
  min.freq = 8,
  max.words = 200,
  random.order = TRUE,
  rot.per = 0.35,
  colors = brewer.pal(8, "Dark2"),
  font = 2
)
# Horizontal bar chart of the 2-grams that appear more than 8 times, ordered
# by frequency.
p2 <- ggplot(
  subset(tdm2.df, freq > 8),
  aes(x = reorder(word, freq), y = freq)
) +
  geom_col() +
  labs(x = "Terms", y = "Count") +
  coord_flip()
p2
# Get the papers for the top 10 terms of the 2-gram matrix.
top_papers <- get_top_term_papers(papers, tdm2.matrix, 10)
# One row per distinct term/frequency pair, rendered as a striped table.
top_papers %>%
  distinct(word, freq) %>%
  kable(booktabs = TRUE) %>%
  kable_styling(latex_options = "striped")
Notice that we are using stemmed words, that is, the roots of the words. In the next table, the stemmed words are replaced by the complete words.
# Table of the papers behind the top terms. Titles are cut to 80 characters
# and the rows are grouped under the complete (unstemmed) form of each term.
# NOTE(review): the row ranges below are hard-coded; presumably they match the
# number of papers per term in the saved search -- re-check them if the data
# changes.
top_papers %>%
  select(book_title, paper_id) %>%
  mutate(book_title = substr(book_title, 1, 80)) %>%
  kable(booktabs = TRUE, longtable = TRUE) %>%
  kable_styling(latex_options = c("striped", "repeat_header")) %>%
  group_rows("history matching", 1, 46) %>%
  group_rows("reservoir simulation", 47, 66) %>%
  group_rows("well logging", 67, 85) %>%
  group_rows("oil recovery", 86, 104) %>%
  group_rows("reservoir modeling", 105, 122) %>%
  group_rows("unconventional reservoirs", 123, 139) %>%
  group_rows("uncertainty quantification", 140, 156) %>%
  group_rows("hydraulic fracturing", 157, 173) %>%
  group_rows("carbonate reservoirs", 174, 189) %>%
  group_rows("horizontal wells", 190, 202)
# Query the papers for the stemmed term "candid select".
papers_2t <- get_top_term_papers(
  papers,
  tdm2.matrix,
  terms = c("candid select")
)

# Tabulate title, id and authors, truncating each field so the table fits.
papers_2t %>%
  select(book_title, paper_id, authors) %>%
  mutate(book_title = substr(book_title, 1, 65)) %>%
  mutate(paper_id = substr(paper_id, 1, 14)) %>%
  mutate(authors = substr(authors, 1, 20)) %>%
  kable(booktabs = TRUE, caption = "Candidate Selection") %>%
  kable_styling(latex_options = "striped")
# Query the papers for the stemmed term "permeabl predict".
papers_2t <- get_top_term_papers(
  papers,
  tdm2.matrix,
  terms = c("permeabl predict")
)

# Tabulate title, id and authors, truncating each field so the table fits.
papers_2t %>%
  select(book_title, paper_id, authors) %>%
  mutate(book_title = substr(book_title, 1, 65)) %>%
  mutate(paper_id = substr(paper_id, 1, 14)) %>%
  mutate(authors = substr(authors, 1, 20)) %>%
  kable(booktabs = TRUE, caption = "Permeability Prediction") %>%
  kable_styling(latex_options = "striped")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.