library(petro.One)
library(RWeka)
library(tm)
library(qdap)
library(tidyverse)
library(wordcloud)
library(knitr)
library(kableExtra)


# Load previous findings: load() restores the objects stored in
# "rese_rese_mach_mach.rda" into the current environment.
load(file = paste0("rese_rese_mach_mach", ".rda"))
# Extract the `papers` element of `paper_search_obj`.
# NOTE(review): `paper_search_obj` is not created in this file; presumably it
# is one of the objects restored by load() above -- confirm.
papers <- paper_search_obj$papers
# Top-level expression: prints `papers` for inspection.
papers
# Compute n-gram frequencies over the `book_title` column of `df`.
#
# Each title is whitespace-normalised, stripped of punctuation, lower-cased and
# filtered against the English stopword list, `custom_stopwords` and any
# `more_stopwords`. The text is optionally stemmed and then tokenised into
# n-grams with RWeka's NGramTokenizer.
#
# Arguments:
#   df              Object with a `book_title` element holding the texts.
#   gram.min        Minimum n-gram size passed to Weka_control(min = ).
#   gram.max        Maximum n-gram size passed to Weka_control(max = ).
#   mc.cores        Value set for options(mc.cores) while the n-gram matrix is
#                   built; the previous value is restored when the function
#                   exits.
#   stemming        If TRUE, stem the documents (English) before tokenising.
#   more_stopwords  Optional character vector of extra words to remove. It is
#                   applied before stemming and, when `stemming` is TRUE, once
#                   more after stemming so that stemmed forms such as
#                   "analysi" or "machin" are matched as well.
#
# Returns:
#   A tibble with columns `word` (the n-gram) and `freq` (its total count
#   across all documents), sorted by decreasing `freq`.
#
# NOTE(review): `custom_stopwords` is not defined in this file; presumably it
# is a dataset supplied by an attached package -- confirm.
term_frequency_n_grams <- function(df, gram.min = 2, gram.max = 2,
                                   mc.cores = 2,
                                   stemming = TRUE,
                                   more_stopwords = NULL) {

    vdocs <- VCorpus(VectorSource(df$book_title))
    vdocs <- tm_map(vdocs, stripWhitespace)
    vdocs <- tm_map(vdocs, removePunctuation)
    vdocs <- tm_map(vdocs, content_transformer(tolower))
    vdocs <- tm_map(vdocs, removeWords, stopwords("english"))
    vdocs <- tm_map(vdocs, removeWords, custom_stopwords)

    # length() is 0 for both NULL and character(0), so neither triggers removal
    has_more_stopwords <- length(more_stopwords) > 0

    # remove the extra stopwords from the unstemmed text
    if (has_more_stopwords) {
        vdocs <- tm_map(vdocs, removeWords, more_stopwords)
    }

    if (stemming) {
        vdocs <- tm_map(vdocs, stemDocument, language = "english")

        # removeWords matches whole words only, so stopwords supplied in their
        # stemmed form can only match once the documents are stemmed too
        if (has_more_stopwords) {
            vdocs <- tm_map(vdocs, removeWords, more_stopwords)
        }
    }

    # set the option only for the duration of this call; restore it on exit
    old_options <- options(mc.cores = mc.cores)
    on.exit(options(old_options), add = TRUE)

    ngram_tokenizer <- function(x) {
        NGramTokenizer(x, Weka_control(min = gram.min, max = gram.max))
    }

    ngram_tdm <- TermDocumentMatrix(vdocs,
                                    control = list(tokenize = ngram_tokenizer))

    ngram_freq <- sort(rowSums(as.matrix(ngram_tdm)), decreasing = TRUE)

    # as.character() keeps the `word` column present when no terms survive
    # (names() would then be NULL and the column would be dropped)
    ngram_df <- data.frame(word = as.character(names(ngram_freq)),
                           freq = unname(ngram_freq),
                           stringsAsFactors = FALSE)
    tibble::as_tibble(ngram_df)
}
# Build a term-document matrix from the `book_title` element of `df`.
#
# The texts have every non-graphical character replaced by a space, are
# whitespace-normalised, stripped of punctuation, lower-cased and filtered
# against the English stopword list and `custom_stopwords`.
#
# Returns a list with four elements:
#   tdm      the TermDocumentMatrix object
#   matrix   the same matrix in dense form
#   rowsums  per-term totals across documents, sorted in decreasing order
#   freq     data frame of `word` and integer `freq`, in the same order
get_term_document_matrix <- function(df) {
    # transformer that swaps every match of `pattern` for a single space
    blank_out <- content_transformer(function(x, pattern) gsub(pattern, " ", x))

    corpus <- VCorpus(VectorSource(df$book_title))
    corpus <- tm_map(corpus, blank_out, "[^[:graph:]]")
    corpus <- tm_map(corpus, stripWhitespace)
    corpus <- tm_map(corpus, removePunctuation)
    corpus <- tm_map(corpus, content_transformer(tolower))
    corpus <- tm_map(corpus, removeWords, stopwords("english"))
    corpus <- tm_map(corpus, removeWords, custom_stopwords)

    term_doc <- TermDocumentMatrix(corpus)
    dense <- as.matrix(term_doc)
    totals <- sort(rowSums(dense), decreasing = TRUE)

    list(
        tdm = term_doc,
        matrix = dense,
        rowsums = totals,
        freq = data.frame(word = names(totals), freq = as.integer(totals),
                          stringsAsFactors = FALSE)
    )
}
# Single-term frequencies for the texts in `df`, as a tibble.
#
# Takes the `freq` element of the list returned by
# get_term_document_matrix(df) and converts it to a tibble.
term_frequency <- function(df) {
    # as_tibble() replaces the deprecated as.tibble() alias
    tibble::as_tibble(get_term_document_matrix(df)$freq)
}

# Single-term frequencies via the plain term-document matrix.
term_frequency(papers)
# Same corpus through the n-gram path with n fixed at 1 (single terms);
# stemming is left at its default (TRUE).
term_frequency_n_grams(papers, gram.min = 1, gram.max = 1)
# Extra words to exclude from the next run. Several entries are stemmed forms
# (e.g. "analysi", "machin", "applic") rather than full words.
my_custom_stopwords <- c("data", "oil", "learn", "analysi", "machin", "applic", 
                         "algorithm", "method", "analyt", "system", "technolog")

# Repeat the single-term run, passing the extra stopwords.
term_frequency_n_grams(papers, gram.min = 1, gram.max = 1, 
                       more_stopwords = my_custom_stopwords)


# Source: f0nzie/petro.One documentation, built on May 29, 2019, 12:05 a.m.