# Nothing
## ----setup, include=FALSE-----------------------------------------------------
# Set the knitr chunk options used for the rest of the document:
# collapse on, "#>" as the output comment prefix, warnings off.
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  warning = FALSE
)
## -----------------------------------------------------------------------------
################################################################################
# Example: Using tidytext with textmineR
################################################################################
library(tidytext)
library(textmineR)
library(dplyr)
library(tidyr)
# Load the sample documents that ship with textmineR into a data frame.
docs <- textmineR::nih_sample

# Tokenize the abstracts with tidytext's unnest_tokens(), producing unigrams
# and bigrams (n_min = 1, n = 2). The extra arguments (stopwords, n_min, n)
# are handed on to the tokenizer selected by `token`.
# Tokens are then counted per document and kept only when they
#   * occur more than once within a document (per document, not per corpus),
#   * are not made up solely of digits.
tidy_docs <- docs %>%
  select(APPLICATION_ID, ABSTRACT_TEXT) %>%
  unnest_tokens(
    output = word,
    input = ABSTRACT_TEXT,
    stopwords = c(
      stopwords::stopwords("en"),
      stopwords::stopwords(source = "smart")
    ),
    token = "ngrams",
    n_min = 1,
    n = 2
  ) %>%
  count(APPLICATION_ID, word) %>%
  filter(n > 1) %>%
  # Use the masked `word` column of the data flowing through the pipe, not
  # `tidy_docs$word`: the latter reads a global object and is only correct
  # while that object happens to be identical to the piped-in data.
  filter(!stringr::str_detect(word, "^[0-9]+$"))
# Cast the tidy token counts into a sparse matrix (dgCMatrix) for textmineR:
# rows come from APPLICATION_ID, columns from word, cell values from n.
d <- cast_sparse(
  data = tidy_docs,
  row = APPLICATION_ID,
  column = word,
  value = n
)
# Fit an LDA topic model on the document-term matrix `d`:
# k = 20 topics, 200 iterations, burnin set to 175.
m <- FitLdaModel(
  dtm = d,
  k = 20,
  iterations = 200,
  burnin = 175
)
# Long-format topic-term table with one row per (topic, term) pair and
# columns topic (integer), term (character) and beta (numeric).
# Intended as the equivalent of tidy_beta <- tidy(x = m, matrix = "beta").
#
# The table is assembled straight from the matrix rather than via
# data.frame(): data.frame() defaults to check.names = TRUE and rewrites
# column names with make.names(), so a bigram such as "stem cell" would come
# out as "stem.cell" and a token starting with a digit would gain an "X"
# prefix, leaving `term` out of sync with the vocabulary of the DTM.
#
# as.vector() walks the matrix column by column, so all topics of one term
# are contiguous; this is the same row order the previous gather() produced.
# NOTE(review): assumes m$phi is a dense topics-by-terms matrix whose row
# names have the form "t_<k>" - confirm against FitLdaModel()'s return value.
tidy_beta <- local({
  phi <- m$phi
  topic_ids <- as.integer(
    stringr::str_replace_all(rownames(phi), "t_", "")
  )
  tibble::tibble(
    topic = rep(topic_ids, times = ncol(phi)),
    term = rep(colnames(phi), each = nrow(phi)),
    beta = as.vector(phi)
  )
})
# Long-format document-topic table with one row per (document, topic) pair
# and columns document, topic and gamma.
# Intended as the equivalent of tidy_gamma <- tidy(x = m, matrix = "gamma").
tidy_gamma <- tibble::as_tibble(
  gather(
    data.frame(
      document = rownames(m$theta),
      m$theta,
      stringsAsFactors = FALSE
    ),
    topic, gamma, -document
  )
)
# Any scripts or data that you put into this service are public.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.