# R/ngram_functions.R

# Defines functions: bigram.tokenizer, trigram.tokenizer, wordcloud.fromcorpus.bigram, wordcloud.fromcorpus.trigram, wordcloud.fromtdm, quanteda.sentence.VCorpus

#####################
#FUNCTIONS
#####################
#' Tokenize text into bigrams
#'
#' Thin wrapper around `NGramTokenizer()` with the minimum and maximum n-gram
#' size both fixed at 2.
#'
#' @param x the object to tokenize; passed unchanged to `NGramTokenizer()`.
#'
#' @return whatever `NGramTokenizer()` returns for `x` under these settings.
#' @export
bigram.tokenizer <- function(x) {
  # min == max == 2, so only n-grams of size exactly two are requested
  bigram_control <- Weka_control(min = 2, max = 2)
  NGramTokenizer(x, bigram_control)
}
#' Tokenize text into trigrams
#'
#' Thin wrapper around `NGramTokenizer()` with the minimum and maximum n-gram
#' size both fixed at 3.
#'
#' @param x the object to tokenize; passed unchanged to `NGramTokenizer()`.
#'
#' @return whatever `NGramTokenizer()` returns for `x` under these settings.
#' @export
trigram.tokenizer <- function(x) {
  # min == max == 3, so only n-grams of size exactly three are requested
  trigram_control <- Weka_control(min = 3, max = 3)
  NGramTokenizer(x, trigram_control)
}

#' Generate a bigram wordcloud from a `tm` volatile corpus
#'
#' Builds a term-document matrix from `corpus` using `bigram.tokenizer`,
#' prints the 30 most frequent bigrams, and draws a wordcloud of at most 100
#' bigrams.
#'
#' @param corpus a volatile corpus generated using package `tm`, such as those generated by function `pubmed.corpus` in this package.
#'
#' @return the value of the final `wordcloud()` call. The function is used for
#'   its side effects: a printed frequency table and a plot.
#'
#' @export
#'
#' @examples
#' \dontrun{
#' wordcloud.fromcorpus.bigram(tm_volatilecorpus)
#' }
wordcloud.fromcorpus.bigram <- function(corpus) {
  # inherits() checks the whole class vector instead of only its first entry
  if (!inherits(corpus, "VCorpus")) {
    stop("The argument to this function must be a Volatile Corpus (class = VCorpus) generated with package 'tm'. Stopping.")
  }

  bigram_tdm <- TermDocumentMatrix(
    corpus,
    control = list(tokenize = bigram.tokenizer)
  )

  # Row sums of the term-document matrix give one total count per bigram
  freq <- sort(rowSums(as.matrix(bigram_tdm)), decreasing = TRUE)
  freq.df <- data.frame(word = names(freq), freq = freq)
  print(head(freq.df, 30))

  # Use the "Blues" palette without its first three colours
  pal <- brewer.pal(8, "Blues")[-(1:3)]

  # FALSE rather than F: F is an ordinary variable that can be reassigned
  wordcloud(
    freq.df$word,
    freq.df$freq,
    max.words = 100,
    random.order = FALSE,
    colors = pal
  )
}

#' Generate a trigram wordcloud from a `tm` volatile corpus
#'
#' Builds a term-document matrix from `corpus` using `trigram.tokenizer`,
#' prints the 10 most frequent trigrams, and draws a wordcloud of at most 100
#' trigrams.
#'
#' @param corpus a volatile corpus generated using package `tm`, such as those generated by function `pubmed.corpus` in this package.
#'
#' @return the value of the final `wordcloud()` call. The function is used for
#'   its side effects: a printed frequency table and a plot.
#'
#' @export
#'
#' @examples
#' \dontrun{
#' wordcloud.fromcorpus.trigram(tm_volatilecorpus)
#' }
wordcloud.fromcorpus.trigram <- function(corpus) {
  # inherits() checks the whole class vector instead of only its first entry
  if (!inherits(corpus, "VCorpus")) {
    stop("The argument to this function must be a Volatile Corpus (class = VCorpus) generated with package 'tm'. Stopping.")
  }

  trigram_tdm <- TermDocumentMatrix(
    corpus,
    control = list(tokenize = trigram.tokenizer)
  )

  # Row sums of the term-document matrix give one total count per trigram
  freq <- sort(rowSums(as.matrix(trigram_tdm)), decreasing = TRUE)
  freq.df <- data.frame(word = names(freq), freq = freq)
  print(head(freq.df, 10))

  # Use the "Blues" palette without its first three colours
  pal <- brewer.pal(8, "Blues")[-(1:3)]

  # FALSE rather than F: F is an ordinary variable that can be reassigned
  wordcloud(
    freq.df$word,
    freq.df$freq,
    max.words = 100,
    random.order = FALSE,
    colors = pal
  )
}

#' Generate a wordcloud from a `tm` document matrix. This can be a term-document or document-term matrix.
#'
#' Sums the counts of every term across documents, prints the 10 most frequent
#' terms, and draws a wordcloud of at most 100 terms.
#'
#' @param tdm a term-document or document-term matrix.
#'
#' @return the value of the final `wordcloud()` call. The function is used for
#'   its side effects: a printed frequency table and a plot.
#'
#' @export
#'
#' @examples
#' \dontrun{
#' wordcloud.fromtdm(tm_termdocumentmatrix)
#' }
wordcloud.fromtdm <- function(tdm) {
  is_tdm <- inherits(tdm, "TermDocumentMatrix")
  is_dtm <- inherits(tdm, "DocumentTermMatrix")
  if (!is_tdm && !is_dtm) {
    stop("The argument to this function must be a term-document or document-term matrix (class = TermDocumentMatrix or DocumentTermMatrix) generated with package 'tm'. Stopping")
  }

  counts <- as.matrix(tdm)
  # Terms are the rows of a term-document matrix but the columns of a
  # document-term matrix, so the margin to sum over depends on the class.
  freq <- if (is_tdm) rowSums(counts) else colSums(counts)
  freq <- sort(freq, decreasing = TRUE)

  freq.df <- data.frame(word = names(freq), freq = freq)
  print(head(freq.df, 10))

  # Use the "Blues" palette without its first three colours
  pal <- brewer.pal(8, "Blues")[-(1:3)]

  # FALSE rather than F: F is an ordinary variable that can be reassigned
  wordcloud(
    freq.df$word,
    freq.df$freq,
    max.words = 100,
    random.order = FALSE,
    colors = pal
  )
}

#' Create a `tm` volatile corpus from a `quanteda corpus`
#'
#' Reshapes the corpus to sentences, cleans each sentence (HTML removed,
#' punctuation and digits replaced by spaces, lowercased, words of one or two
#' characters dropped, whitespace squished) and wraps the result in a `tm`
#' volatile corpus.
#'
#' @param quanteda_corpus a corpus available through package `quanteda`
#'
#' @return a `tm` volatile corpus with each observation representing one sentence
#' @export
#'
quanteda.sentence.VCorpus <- function(quanteda_corpus) {
  # Re-segment the corpus so that each document is a single sentence
  sentences <- corpus_reshape(quanteda_corpus, to = "sentences")

  # NOTE(review): `sentences[[1]][[1]]` depends on the internal layout of the
  # corpus object (assumed to hold the sentence texts) -- confirm against the
  # installed quanteda version.
  # The column is named at construction so nothing depends on the name that
  # data.frame() would otherwise generate from the expression.
  sentences_df <- data.frame(value = sentences[[1]][[1]]) %>%
    mutate(
      value = replace_html(value),                          # cleaning html
      value = gsub("([[:punct:]])|\\s+", " ", value),       # cleaning special characters
      value = gsub("[[:digit:]]+", " ", value),             # removing numbers, replacing with space
      value = tolower(value),                               # lowercase
      value = gsub("\\b\\w{1,2}\\b", "", value),            # removing 1-2 letter words
      value = str_squish(value)                             # removing extra spaces
    ) %>%
    # DataframeSource expects `doc_id` and `text` columns.
    # seq_len() stays empty for zero rows, whereas 1:n() would give c(1, 0).
    transmute(
      doc_id = seq_len(n()),
      text = value
    ) %>%
    as_tibble() # as.tibble() is deprecated in favour of as_tibble()

  tm_source <- DataframeSource(sentences_df)
  VCorpus(tm_source, readerControl = list(language = "en"))
}
# joelmlevin/uglypaRse documentation built on Sept. 4, 2019, 2:45 p.m.