library(petro.One)
library(RWeka)
library(tm)
library(qdap)
library(tidyverse)
library(wordcloud)
library(knitr)
library(kableExtra)

# load previous findings
load(file = paste0("rese_rese_mach_mach", ".rda"))

papers <- paper_search_obj$papers
papers
# compute term frequencies for n-grams of the paper titles;
# returns a tibble of n-grams sorted by decreasing frequency
term_frequency_n_grams <- function(df, gram.min = 2, gram.max = 2, mc.cores = 2,
                                   stemming = TRUE, more_stopwords = NULL) {
    vdocs <- VCorpus(VectorSource(df$book_title))
    vdocs <- tm_map(vdocs, stripWhitespace)
    vdocs <- tm_map(vdocs, removePunctuation)
    vdocs <- tm_map(vdocs, content_transformer(tolower))
    vdocs <- tm_map(vdocs, removeWords, stopwords("english"))
    vdocs <- tm_map(vdocs, removeWords, custom_stopwords)    # from data

    # apply more stopwords
    if (!is.null(more_stopwords)) vdocs <- tm_map(vdocs, removeWords, more_stopwords)

    # apply stemming
    if (stemming) vdocs <- tm_map(vdocs, stemDocument, language = "english")

    tdm <- TermDocumentMatrix(vdocs)
    tdm.matrix <- as.matrix(tdm)
    tdm.rs <- sort(rowSums(tdm.matrix), decreasing = TRUE)
    tdm.df <- data.frame(word = names(tdm.rs), freq = as.integer(tdm.rs),
                         stringsAsFactors = FALSE)

    options(mc.cores = mc.cores)

    twogramTokenizer <- function(x) {
        NGramTokenizer(x, Weka_control(min = gram.min, max = gram.max))
    }

    tdm2 <- TermDocumentMatrix(vdocs, control = list(tokenize = twogramTokenizer))
    tdm2.matrix <- as.matrix(tdm2)
    tdm2.rs <- sort(rowSums(tdm2.matrix), decreasing = TRUE)
    tdm2.df <- data.frame(word = names(tdm2.rs), freq = tdm2.rs,
                          stringsAsFactors = FALSE)

    tibble::as_tibble(tdm2.df)
}
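The defaults gram.min = 2 and gram.max = 2 produce bigram frequencies over the paper titles. A minimal usage sketch, assuming papers and the custom_stopwords vector are available from the objects loaded above:

# bigrams, using the defaults
term_frequency_n_grams(papers)

# bigrams through trigrams, keeping the unstemmed words
term_frequency_n_grams(papers, gram.min = 2, gram.max = 3, stemming = FALSE)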
# build a term-document matrix from the paper titles and return it, together with
# its matrix form, the row sums, and a word-frequency data frame
get_term_document_matrix <- function(df) {
    vdocs <- VCorpus(VectorSource(df$book_title))

    # content transformer that replaces every match of a pattern with a space
    toSpace <- content_transformer(function(x, pattern) return(gsub(pattern, " ", x)))

    # substitute non-graphical characters with a space
    vdocs <- tm_map(vdocs, toSpace, "[^[:graph:]]")

    vdocs <- tm_map(vdocs, stripWhitespace)
    vdocs <- tm_map(vdocs, removePunctuation)
    vdocs <- tm_map(vdocs, content_transformer(tolower))
    vdocs <- tm_map(vdocs, removeWords, stopwords("english"))
    vdocs <- tm_map(vdocs, removeWords, custom_stopwords)

    tdm <- TermDocumentMatrix(vdocs)
    tdm.matrix <- as.matrix(tdm)
    tdm.rs <- sort(rowSums(tdm.matrix), decreasing = TRUE)
    tdm.df <- data.frame(word = names(tdm.rs), freq = as.integer(tdm.rs),
                         stringsAsFactors = FALSE)

    tdm_list <- list(tdm = tdm, matrix = tdm.matrix, rowsums = tdm.rs, freq = tdm.df)
    tdm_list
}
# unigram frequency table for the paper titles
term_frequency <- function(df) {
    tibble::as_tibble(get_term_document_matrix(df)$freq)
}

term_frequency(papers)
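Because the wordcloud package is attached at the top, the unigram frequencies can also be visualized. This sketch is not part of the original analysis; the cutoffs are illustrative, and it only assumes the word and freq columns returned by term_frequency():

tf <- term_frequency(papers)
set.seed(1234)                                  # reproducible word placement
wordcloud(words = tf$word, freq = tf$freq,
          min.freq = 5, max.words = 100, random.order = FALSE,
          colors = brewer.pal(8, "Dark2"))      # palette from RColorBrewer, attached by wordcloud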
term_frequency_n_grams(papers, gram.min = 1, gram.max = 1)
# extra stopwords; most are stems produced by stemDocument (e.g. "analysi", "machin")
my_custom_stopwords <- c("data", "oil", "learn", "analysi", "machin", "applic",
                         "algorithm", "method", "analyt", "system", "technolog")

term_frequency_n_grams(papers, gram.min = 1, gram.max = 1,
                       more_stopwords = my_custom_stopwords)
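With knitr and kableExtra already attached, the filtered frequency table can also be rendered as a formatted table in an HTML report. A sketch under that assumption; the top-10 cutoff is arbitrary:

term_frequency_n_grams(papers, gram.min = 1, gram.max = 1,
                       more_stopwords = my_custom_stopwords) %>%
    head(10) %>%                                           # keep the ten most frequent terms
    kable(caption = "Most frequent terms after removing custom stopwords") %>%
    kable_styling(bootstrap_options = c("striped", "condensed"), full_width = FALSE)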