# Global knitr chunk options applied to the chunks of this document.
knitr::opts_chunk$set(
  echo = TRUE,
  comment = NA,
  error = TRUE,
  warning = FALSE,
  message = FALSE,
  fig.align = 'center'
)
library(petro.One)
library(RWeka)
library(tm)
library(tidyverse)
library(wordcloud)

# load previous findings
# NOTE(review): the .rda file is expected to provide `paper_search_obj`, which
# is read right below - confirm against the script that saved it.
load(file = paste0("rese_rese_mach_mach", ".rda"))

# keep only the papers table of the saved search object and print it
papers <- paper_search_obj$papers
papers
## create the corpus
# NOTE(review): `custom_stopwords` (used below) is presumably provided by this
# dataset - confirm.
data("stopwords")

# one document per paper title
vdocs <- VCorpus(VectorSource(papers$book_title))
vdocs <- tm_map(vdocs, content_transformer(tolower))        # convert to lowercase
vdocs <- tm_map(vdocs, removePunctuation)                   # remove all punctuation
vdocs <- tm_map(vdocs, removeWords, stopwords("english"))   # remove built-in stopwords
vdocs <- tm_map(vdocs, removeWords, custom_stopwords)       # remove custom stopwords
# Collapse whitespace only AFTER the word removals: removing a word leaves the
# blanks that surrounded it, and multi-word phrases removed later are matched
# with single spaces between their words.
vdocs <- tm_map(vdocs, stripWhitespace)
vdocs <- tm_map(vdocs, stemDocument, language = "english")  # stem the words
# get the 3-grams and frequency table
# NOTE(review): mc.cores is pinned to 1, presumably so the RWeka tokenizer is
# not run in parallel workers - confirm.
options(mc.cores = 1)

# Tokenizer that emits n-grams of exactly three words (min = max = 3).
threegramTokenizer <- function(x) {
  NGramTokenizer(x, Weka_control(min = 3, max = 3))
}

tdm3 <- TermDocumentMatrix(vdocs, control = list(tokenize = threegramTokenizer))
tdm3.matrix <- as.matrix(tdm3)
tdm3.df <- as.data.frame(tdm3.matrix)

# row sums of the term-document matrix, largest first
tdm3.rs <- sort(rowSums(tdm3.matrix), decreasing = TRUE)
tdm3.freq <- data.frame(
  word = names(tdm3.rs),
  freq = tdm3.rs,
  stringsAsFactors = FALSE
)
row.names(tdm3.freq) <- NULL
head(tdm3.freq, 35)
Some of the 3-gram terms are not related to reservoir engineering, so we need to remove them.
# Stemmed phrases about data analytics methods that should be dropped from the
# corpus before counting 3-grams.
datalytics <- c(
  "artifici neural network",
  "artifici intellig",
  "applic artifici intellig",
  "ensembl kalman filter",
  "genet algorithm optim",
  "machin learn",
  "histori match",
  "data mine"
)

# Stemmed phrases that do not belong to the discipline under study.
not_discipline <- c(
  "oil gas industri",
  "electr submers pump"
)

# Full list of phrases to remove, in the order analytics first, then the rest.
custom_stopwords_3 <- c(datalytics, not_discipline)
# remove the 3-g stop words
vdocs <- tm_map(vdocs, removeWords, custom_stopwords_3)

# rebuild the 3-gram term-document matrix from the cleaned corpus
tdm3 <- TermDocumentMatrix(vdocs, control = list(tokenize = threegramTokenizer))
tdm3.matrix <- as.matrix(tdm3)
tdm3.df <- as.data.frame(tdm3.matrix)

# row sums of the term-document matrix, largest first
tdm3.rs <- sort(rowSums(tdm3.matrix), decreasing = TRUE)
tdm3.freq <- data.frame(
  word = names(tdm3.rs),
  freq = tdm3.rs,
  stringsAsFactors = FALSE
)
row.names(tdm3.freq) <- NULL
head(tdm3.freq, 25)
# Word cloud of the 3-grams. The RNG seed is fixed before drawing because the
# words are plotted in random order (random.order = TRUE).
set.seed(1234)
wordcloud(
  words = tdm3.freq$word,
  freq = tdm3.freq$freq,
  min.freq = 3,
  max.words = 200,
  random.order = TRUE,
  rot.per = 0.35,
  colors = brewer.pal(8, "Dark2"),
  font = 2
)
# bar plot, top 10
# The first ten rows of tdm3.freq are plotted; terms are reordered by freq so
# the bars appear sorted, and the axes are flipped to give horizontal bars.
p2 <- ggplot(head(tdm3.freq, 10), aes(x = reorder(word, freq), y = freq)) +
  geom_bar(stat = "identity") +
  xlab("Terms") +
  ylab("Count") +
  coord_flip()
p2
# get papers for top 10 terms
top_papers <- get_top_term_papers(papers, tdm3.matrix, 10)
top_papers
#' Get the papers that contain the most frequent terms
#'
#' Ranks the rows (terms) of `tdm_matrix` by their row sums, keeps the first
#' `top_terms` of them and, for every kept term, collects the rows of `papers`
#' whose column in `tdm_matrix` holds a value greater than zero for that term.
#'
#' @param papers Data frame of papers. Row `i` is matched to column `i` of
#'   `tdm_matrix`. Must contain the columns `book_title` through `keyword`.
#' @param tdm_matrix Numeric matrix with the terms as row names and one column
#'   per document.
#' @param top_terms Number of most frequent terms to keep.
#' @param verbose If `TRUE`, print every term, its frequency and the indices of
#'   the matching papers.
#' @return A data frame with the columns `word`, `freq` and
#'   `book_title:keyword`, sorted by decreasing `freq`. It has zero rows when
#'   there are no terms or no term matches any paper.
get_top_term_papers <- function(papers, tdm_matrix, top_terms, verbose = FALSE) {
  if (nrow(tdm_matrix) > 0L && is.null(rownames(tdm_matrix))) {
    stop("`tdm_matrix` must have the terms as row names", call. = FALSE)
  }

  term_totals <- sort(rowSums(tdm_matrix), decreasing = TRUE)
  term_freq <- data.frame(
    word = as.character(names(term_totals)),
    freq = unname(term_totals),
    stringsAsFactors = FALSE
  )
  term_freq <- head(term_freq, top_terms)  # select top # of rows
  row.names(term_freq) <- NULL

  # one data frame of papers per term; seq_len() makes zero terms a no-op
  pieces <- lapply(seq_len(nrow(term_freq)), function(i) {
    w <- term_freq$word[i]
    f <- term_freq$freq[i]
    # documents (columns) in which the term occurs at least once
    hits <- unname(which(tdm_matrix[w, ] > 0))
    if (verbose) {
      cat(sprintf("%-25s %3d \t", w, f))
      cat(hits, "\n")
    }
    if (length(hits) == 0L) {
      return(NULL)  # nothing to collect for a term that matches no paper
    }
    df <- papers[hits, , drop = FALSE]  # get papers
    df$word <- w                        # add variable word
    df$freq <- f                        # add variable frequency
    df
  })
  pieces <- Filter(Negate(is.null), pieces)

  if (length(pieces) > 0L) {
    # Later terms go first, so that terms with equal frequency keep the same
    # relative order after the stable sort below as when each new term was
    # prepended to the accumulator.
    df_cum <- do.call(rbind, rev(pieces))
  } else {
    # empty result that still carries every column selected below
    df_cum <- papers[0, , drop = FALSE]
    df_cum$word <- character(0)
    df_cum$freq <- numeric(0)
  }

  df_cum <- df_cum[order(-df_cum$freq), , drop = FALSE]       # sort by frequency
  subset(df_cum, select = c(word, freq, book_title:keyword))  # select columns
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.