# knitr chunk defaults for rendering the README
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.path = "man/figures/README-",
  out.width = "100%"
)
Text Analysis in R via Julia.
Being a wrapper to a Julia package, textanalysis requires the latter to be installed.
# Install from GitHub (uncomment the next line if remotes is missing)
# install.packages("remotes")
remotes::install_github("news-r/textanalysis")
You must run init_textanalysis
at the beginning of every session; otherwise you will encounter errors and be prompted to do so.
# Attach the package and initialise the underlying Julia session
library(textanalysis)
init_textanalysis()
Some functions depend on the development version of the Julia package; to install it, run:
# Install the development ("latest") version of the Julia package
install_textanalysis(version = "latest")
# Build a document from raw text
str <- paste(
  "They <span>write</span>, it writes too!!!",
  "This is another sentence.",
  "More stuff in this document."
)
doc <- string_document(str)

# Basic cleanup
prepare(doc)
get_text(doc)

# Stemming
stem_words(doc)
get_text(doc)

# A second document, tokenised
doc2 <- token_document("Hey write another document.")

# Combine both into a corpus
corpus <- corpus(doc, doc2)

# Standardise all documents to the same type
standardize(corpus, "token_document")

# Prepare the whole corpus (HTML tags already stripped above)
prepare(corpus, strip_html_tags = FALSE)
get_text(corpus)

# Lexicon and lexical statistics
(lexicon <- lexicon(corpus))
lexical_frequency(corpus, "document")

# Inverse index
inverse_index(corpus)

# Document-term matrix
m <- document_term_matrix(corpus)

# Helper: prepend the lexicon terms to a matrix turned data frame
bind_lexicon <- function(data) {
  data %>%
    as.data.frame() %>%
    dplyr::bind_cols(
      lexicon %>% dplyr::select(-n),
      .
    )
}

# Term frequency
tf(m) %>% bind_lexicon()

# TF-IDF
tf_idf(m) %>% bind_lexicon()

# Okapi BM25
# https://opensourceconnections.com/blog/2015/10/16/bm25-the-next-generation-of-lucene-relevation/
bm_25(m) %>% bind_lexicon()

# Sentiment analysis
sentiment(corpus)

# Summarise the original document in 2 sentences
summarize(string_document(str), ns = 2L)
Fit an LDA (Latent Dirichlet Allocation) model on the gensimr data.
# Seed the Julia RNG for reproducibility
set_seed(42L)

# Load the example corpus shipped with gensimr
data("corpus", package = "gensimr")
documents <- to_documents(corpus) # convert the character vector to documents
crps <- corpus(documents)
dtm <- document_term_matrix(crps)

# LDA with 2 topics and 1,000 iterations
lda_data <- lda(dtm, 2L, 1000L)

# Topic assignment per document
lda_data$ntopics_ndocs

# k-means clustering on the TF-IDF of the dense matrix
mat <- dtm_matrix(dtm, "dense")
tfidf <- tf_idf(mat)
km <- kmeans(tfidf, centers = 2)
# Feature hashing
hash_func <- create_hash_function(10L)
hash("a", hash_func)
hash(doc) # a document carries its own built-in hash
# Naive Bayes classification
classes <- factor(c("financial", "legal"))
# NOTE(review): "classifer" is the spelling exposed by the package API —
# do not "correct" it here or the call breaks.
model <- init_naive_classifer(classes)

# Training data
train <- tibble::tibble(
  text = c("this is financial doc", "this is legal doc"),
  labels = factor(c("financial", "legal"))
)
train_naive_classifier(model, train, text, labels)

# Predict the class of unseen text
test <- tibble::tibble(
  text = "this should be predicted as a legal document"
)
predict_class(model, test, text)
Plot method uses echarts4r
# Co-occurrence matrix, plotted via the echarts4r-backed plot method
matrix <- coom(crps)
plot(matrix)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.