# Global chunk options: evaluate every chunk and echo its source.
# Use TRUE rather than T, since T is an ordinary variable that can be reassigned.
knitr::opts_chunk$set(eval = TRUE, echo = TRUE)
# Load keyATM and the example bills data set shipped with it.
library(keyATM)
data(keyATM_data_bills)

# Covariate matrix for the bills corpus.
bills_cov <- keyATM_data_bills$cov
dim(bills_cov) # We have 140 documents and a single covariate

# Time index, shifted down by 100 and coerced to integer.
bills_time_index <- keyATM_data_bills$time_index
bills_time_index <- as.integer(bills_time_index - 100)

# Convert the document-feature matrix into keyATM's document format.
library(quanteda)
bills_dfm <- keyATM_data_bills$doc_dfm # quanteda object
keyATM_docs <- keyATM_read(bills_dfm)

# Keywords for each keyword topic; the list names are the topic labels.
bills_keywords <- list(
  Education = c("education", "child", "student"),
  Law       = c("court", "law", "attorney"),
  Health    = c("public", "health", "program"),
  Drug      = c("drug", "treatment")
)
# Fit the base keyATM model on the bills corpus.
out <- keyATM(
  docs              = keyATM_docs,      # text input
  no_keyword_topics = 3,                # number of topics without keywords
  keywords          = bills_keywords,   # keywords
  model             = "base",           # base, covariates, or dynamic
  options           = list(seed = 100),
  keep              = c("Z", "S")       # You need to keep `Z` and `S`
)
We have a binary variable indicating the party ID of each bill's proposer: `0` indicates Democrat and `1` indicates Republican.
# Count how many rows of bills_cov take each value of the RepParty column.
table(bills_cov[, "RepParty"])
The `by_strata_TopicWord()` function calculates topic-word distributions for subsets of documents defined by a vector you provide.
# Stratifying vector: one RepParty value per document.
# the length should be the same as the number of documents
RepParty <- as.vector(bills_cov[, "RepParty"])

# Topic-word distributions computed separately for each value of RepParty.
strata_tw <- by_strata_TopicWord(out, keyATM_docs, by = RepParty)
You can get the top words with `top_words()`.
# Display the top 5 words from the stratified topic-word result.
top_words(strata_tw, n = 5)
As long as its length is the same as the number of documents, you can use a character vector for the `by` argument.
# Recode the 0/1 party indicator as character labels.
RepParty_chr <- ifelse(bills_cov[, "RepParty"] == 0, "Democrat", "Republican")

# Stratify by the character vector; `by` is named to match the numeric example.
strata_tw_chr <- by_strata_TopicWord(out, keyATM_docs, by = RepParty_chr)
top_words(strata_tw_chr, n = 3)
With keyATM covariates, we can calculate the predicted posterior distribution of the document-topic distributions.
# Refit with the covariates model, using RepParty as the document covariate.
out <- keyATM(
  docs              = keyATM_docs,      # text input
  no_keyword_topics = 3,                # number of topics without keywords
  keywords          = bills_keywords,   # keywords
  model             = "covariates",     # select the model
  model_settings    = list(
    covariates_data    = bills_cov,
    covariates_formula = ~ RepParty
  ),
  options           = list(seed = 100)
)
Now let's calculate the mean of theta for two different values of `RepParty`. The figure shows 90% credible intervals.
# Document-topic distributions evaluated at RepParty = 0 and RepParty = 1.
strata_topic <- by_strata_DocTopic(
  out,
  by_name   = "RepParty",
  by_values = c(0, 1)
)

# Plot the result for topics 1, 3 and 5 only.
plot(strata_topic, topics = c(1, 3, 5))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.