knitr::opts_chunk$set(echo = TRUE, 
                      comment = NA, 
                      error = TRUE, 
                      warning = FALSE, 
                      message = FALSE, 
                      fig.align = 'center')

Load the paper object saved from the previous OnePetro scan.
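For reference, the .rda file loaded below is assumed to have been written at the end of the previous scan with base save(); the object and file names in this sketch are taken from the loading code that follows, not from the original scan script.

# (done at the end of the previous section, not here)
# save(paper_search_obj, file = paste0("rese_rese_mach_mach", ".rda"))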

library(petro.One)
library(RWeka)
library(tm)
library(tidyverse)
library(wordcloud)


# load previous findings
load(file = paste0("rese_rese_mach_mach", ".rda"))
papers <- paper_search_obj$papers
papers
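Before building the corpus it helps to take a quick look at the structure of the dataframe returned by the scan. This is only a quick check; it assumes nothing beyond the papers object loaded above, which contains at least the book_title and keyword columns used later on.

glimpse(papers)               # column names, types and a preview of the values
head(papers$book_title, 3)    # the paper titles are what feed the corpus below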

Create the corpus from the papers dataframe, using the paper titles (book_title) as the documents.

## create the corpus
data("stopwords")  # remove custom stopwords

vdocs <- VCorpus(VectorSource(papers$book_title))

vdocs <- tm_map(vdocs, content_transformer(tolower)) # convert to lowercase
vdocs <- tm_map(vdocs, removePunctuation)      # remove all punctuation
vdocs <- tm_map(vdocs, stripWhitespace)        # remove whitespaces
vdocs <- tm_map(vdocs, removeWords, stopwords("english")) # remove built-in stopwords
vdocs <- tm_map(vdocs, removeWords, custom_stopwords)       # remove the custom stopwords defined in a previous section

vdocs <- tm_map(vdocs, stemDocument, language = "english")  # stem the terms
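To confirm that the cleaning behaved as expected, one quick check is to compare a single title before and after the transformations; the index 1 below is arbitrary.

papers$book_title[1]    # one original title
content(vdocs[[1]])     # the same title after the transformations above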
# get the 3-grams and frequency table
options(mc.cores = 1)    # avoid parallel issues between tm and the RWeka tokenizer

threegramTokenizer <- function(x) {
    NGramTokenizer(x, Weka_control(min = 3, max = 3))
}
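# quick, illustrative check (the sample phrase is made up): the tokenizer should
# split any text it receives into its overlapping 3-grams
threegramTokenizer("machin learn model predict reservoir perform")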

tdm3 <- TermDocumentMatrix(vdocs,
                           control=list(tokenize=threegramTokenizer))

tdm3.matrix <- as.matrix(tdm3)
tdm3.df     <- as.data.frame(tdm3.matrix)

tdm3.rs <- sort(rowSums(tdm3.matrix), decreasing=TRUE)
tdm3.freq <- data.frame(word = names(tdm3.rs), freq = tdm3.rs, stringsAsFactors = FALSE)
row.names(tdm3.freq) <- NULL
head(tdm3.freq, 35)

Some of the 3-gram terms are not related to reservoir engineering itself; they describe the data-analytics techniques or generic industry phrases, and need to be removed.
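One way to spot candidates for removal is to filter the frequency table for telltale stemmed fragments; the pattern below is only an example and can be extended as new terms show up.

subset(tdm3.freq, grepl("intellig|neural|learn|kalman|algorithm", word))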

Remove the custom 3-gram stopwords.

datalytics <- c("artifici neural network",
                "artifici intellig",
                "applic artifici intellig",
                "ensembl kalman filter",
                "genet algorithm optim",
                "machin learn",
                "histori match",
                "data mine"
                )

not_discipline <- c(
    "oil gas industri",
    "electr submers pump"
)


custom_stopwords_3 <- c(datalytics, not_discipline)
# remove the custom stop phrases
vdocs <- tm_map(vdocs, removeWords, custom_stopwords_3)
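# optional extra pass (an addition, not in the original flow): removeWords leaves
# blank gaps where the phrases used to be, so collapsing whitespace keeps the corpus tidy
vdocs <- tm_map(vdocs, stripWhitespace)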

tdm3 <- TermDocumentMatrix(vdocs,
                           control=list(tokenize = threegramTokenizer))

tdm3.matrix <- as.matrix(tdm3)
tdm3.df <- as.data.frame(tdm3.matrix)

tdm3.rs <- sort(rowSums(tdm3.matrix), decreasing=TRUE)
tdm3.freq <- data.frame(word = names(tdm3.rs), freq = tdm3.rs, stringsAsFactors = FALSE)
row.names(tdm3.freq) <- NULL
head(tdm3.freq, 25)
set.seed(1234)    # make the word cloud layout reproducible
wordcloud(words = tdm3.freq$word, freq = tdm3.freq$freq, min.freq = 3,
          max.words = 200, random.order=TRUE, rot.per=0.35, 
          colors=brewer.pal(8, "Dark2"), font = 2)
# bar plot, top 10
p2 <- ggplot(head(tdm3.freq, 10), aes(x = reorder(word, freq), y=freq)) + 
    geom_bar(stat = "identity") + 
    xlab("Terms") + ylab("Count") + 
    coord_flip()

p2
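As a side note, geom_col() is the ggplot2 shorthand for geom_bar(stat = "identity"), so an equivalent way to write the same plot is:

ggplot(head(tdm3.freq, 10), aes(x = reorder(word, freq), y = freq)) +
    geom_col() +
    xlab("Terms") + ylab("Count") +
    coord_flip()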

Inverse process: starting from the most frequent terms, retrieve the papers in which each of those terms appears.

# function that maps the top terms back to the papers they appear in
get_top_term_papers <- function(papers, tdm_matrix, top_terms, verbose = FALSE) {
    tdm.rs <- sort(rowSums(tdm_matrix), decreasing = TRUE)
    tdm.freq <- data.frame(word = names(tdm.rs), freq = tdm.rs, stringsAsFactors = FALSE)
    tdm.freq <- head(tdm.freq, top_terms)  # keep only the top # of terms
    row.names(tdm.freq) <- NULL

    # transpose so the documents become the rows and the terms the columns
    tdtm_matrix <- t(tdm_matrix)          # transpose the matrix
    dtm.df <- as.data.frame(tdtm_matrix)  # convert to dataframe

    # iterate through the term frequency dataframe and get the papers at those indices
    df_cum <- data.frame()        # accumulator
    for (i in 1:nrow(tdm.freq)) {
        w <- tdm.freq$word[i]
        f <- tdm.freq$freq[i]

        indices <- which(dtm.df[w] > 0)  # documents where the term occurs
        if (verbose) {
            cat(sprintf("%-25s %3d \t", w, f))
            cat(indices, "\n")
        }
        df <- papers[indices, ]          # get the papers
        df$word <- w                     # add variable word
        df$freq <- f                     # add variable frequency
        df_cum <- rbind(df, df_cum)      # cumulative dataframe
    }
    df_cum <- df_cum[with(df_cum, order(-freq)), ]             # sort by frequency
    subset(df_cum, select = c(word, freq, book_title:keyword)) # select columns
}

# get the papers for the top 10 terms
top_papers <- get_top_term_papers(papers, tdm3.matrix, 10)
top_papers
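As a usage note, the returned dataframe can be narrowed down to a single term of interest; the term picked below is simply whichever 3-gram came out on top of the frequency table.

subset(top_papers, word == tdm3.freq$word[1])    # papers behind the most frequent 3-gram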

