knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.path = "man/figures/README-",
  out.width = "100%"
)

Badges: Travis build status, Lifecycle: experimental

textanalysis

Text Analysis in R via Julia.

Installation

textanalysis is a wrapper for the Julia package TextAnalysis.jl, so the latter (and Julia itself) must be installed.

# install.packages("remotes")
remotes::install_github("news-r/textanalysis") # github
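
The Julia dependency can then be installed from R. A minimal sketch using install_textanalysis(), the same function the Setup section below calls with version = "latest"; here we assume that calling it without arguments installs the stable release:

install_textanalysis() # install the TextAnalysis.jl Julia package (stable)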

Setup

You must run init_textanalysis at the beginning of every session; otherwise you will encounter errors and be prompted to do so.

library(textanalysis) # load the package

init_textanalysis() # initialise

Some functions depend on the development version of the Julia package; to install it, run:

install_textanalysis(version = "latest")

Basic Examples

# build document
str <- paste(
  "They <span>write</span>, it writes too!!!",
  "This is another sentence.",
  "More stuff in this document."
)
doc <- string_document(str)

# basic cleanup
prepare(doc)
get_text(doc)

# stem
stem_words(doc)
get_text(doc)
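
prepare() bundles several cleanup steps, and individual steps can be toggled, as the corpus example below does with strip_html_tags = FALSE. A hedged sketch, assuming the other preprocessing flags of the underlying TextAnalysis.jl package are exposed under analogous strip_* names (assumed, not verified):

# keep digits when cleaning (strip_numbers is an assumed flag name)
doc3 <- string_document("Call me at 555!")
prepare(doc3, strip_numbers = FALSE)
get_text(doc3)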

# build a second document
doc2 <- token_document("Hey write another document.")

# combine
corpus <- corpus(doc, doc2)

# standardize
standardize(corpus, "token_document")

# prepare corpus
prepare(corpus, strip_html_tags = FALSE)
get_text(corpus)

# lexicon + lexical stats
(lexicon <- lexicon(corpus))
lexical_frequency(corpus, "document")

# inverse index
inverse_index(corpus)

# dtm
m <- document_term_matrix(corpus)
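
The object returned by document_term_matrix() can be materialised as a plain R matrix with dtm_matrix(), the same helper the LDA section below uses:

dtm_matrix(m, "dense") # dense R matrix of term counts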

# create a function to easily add the lexicon to matrix outputs
library(dplyr) # provides the pipe used below

bind_lexicon <- function(data){
  data %>% 
    as.data.frame() %>% 
    dplyr::bind_cols(
      lexicon %>% 
        dplyr::select(-n),
      .
    )
}

# term-frequency
tf(m) %>% bind_lexicon()

# tf-idf
tf_idf(m) %>% bind_lexicon()

# bm-25
# https://opensourceconnections.com/blog/2015/10/16/bm25-the-next-generation-of-lucene-relevation/
bm_25(m) %>% bind_lexicon()

# sentiment
sentiment(corpus)

# summarise in 2 sentences
summarize(string_document(str), ns = 2L)

Latent Dirichlet Allocation

Fit LDA on the gensimr data.

set_seed(42L)

data("corpus", package = "gensimr")
documents <- to_documents(corpus) # convert vector to documents

crps <- corpus(documents)
dtm <- document_term_matrix(crps)

# 2 topics
# 1K iterations
lda_data <- lda(dtm, 2L, 1000L)

# classification
lda_data$ntopics_ndocs
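
As its name suggests, ntopics_ndocs pairs topics with documents; assuming it comes back as a base matrix with topics as rows and documents as columns (not verified), the dominant topic of each document could be read off with:

apply(lda_data$ntopics_ndocs, 2, which.max) # most probable topic per document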

mat <- dtm_matrix(dtm, "dense")

tfidf <- tf_idf(mat)

km <- kmeans(tfidf, centers = 2)
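
km is a regular stats::kmeans fit, so the usual base R accessors apply, e.g.:

table(km$cluster) # number of documents assigned to each of the 2 centers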

Hash trick

hash_func <- create_hash_function(10L)
hash("a", hash_func)
hash(doc) # doc has a built-in hash
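
The same hash function maps any token into one of the 10 buckets; a quick illustrative loop (token choice arbitrary):

for(token in c("buy", "sell", "hold")) print(hash(token, hash_func))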

Naive Bayes Classifier

classes <- factor(c("financial", "legal"))
model <- init_naive_classifer(classes)

train <- tibble::tibble(
  text = c("this is financial doc", "this is legal doc"),
  labels = factor(c("financial", "legal"))
)

train_naive_classifier(model, train, text, labels)

test <- tibble::tibble(
  text = "this should be predicted as a legal document"
)
predict_class(model, test, text)

Co-occurrence Matrix

The plot method uses echarts4r.

matrix <- coom(crps)
plot(matrix)
