# Chunk defaults for this vignette: evaluate every chunk and echo its source.
# Spell out TRUE rather than `T`: `T` is an ordinary variable that can be
# reassigned, TRUE is a reserved word.
knitr::opts_chunk$set(eval = TRUE, echo = TRUE)
library(keyATM)
data(keyATM_data_bills)
# Covariate data that accompanies the bills documents.
bills_cov <- keyATM_data_bills$cov
dim(bills_cov)  # We have 140 documents and a single covariate

# Shift the bundled time index down by 100 and store it as an integer vector.
bills_time_index <- as.integer(keyATM_data_bills$time_index - 100)

library(quanteda)
# Document-feature matrix bundled with the bills data (a quanteda object).
bills_dfm <- keyATM_data_bills$doc_dfm  # quanteda object
# Read the dfm with keyATM_read(); the result is what keyATM() receives as
# its `docs` argument in the model fits.
keyATM_docs <- keyATM_read(bills_dfm)

# Keywords for each keyword topic, as a named list: the element name is the
# topic label and the character vector holds that topic's keywords.
bills_keywords <- list(
  Education = c("education", "child", "student"),
  Law = c("court", "law", "attorney"),
  Health = c("public", "health", "program"),
  Drug = c("drug", "treatment")
)

Topic-Word distribution by covariates

# Fit the base keyATM model: the keyword topics defined in `bills_keywords`
# plus three topics without keywords, with a fixed seed.
out <- keyATM(
  docs = keyATM_docs,            # text input
  no_keyword_topics = 3,         # number of topics without keywords
  keywords = bills_keywords,     # keywords
  model = "base",                # one of "base", "covariates", or "dynamic"
  options = list(seed = 100),
  keep = c("Z", "S")             # `Z` and `S` must be kept in the output
)

We have a binary variable for the party ID of the bill's proposer: 0 indicates Democrat and 1 indicates Republican.

# Count how many documents take each value of the RepParty covariate.
table(bills_cov[, "RepParty"])

The by_strata_TopicWord() function calculates the topic-word distribution for each subset of documents defined by the provided vector.

# One stratum label per document; the length should be the same as the
# number of documents.
RepParty <- as.vector(bills_cov[, "RepParty"])
# Topic-word distributions computed separately for each value of RepParty.
strata_tw <- by_strata_TopicWord(
  out,
  keyATM_docs,
  by = RepParty
)

You can get top words with top_words().

# Print the top words (n = 5) from the by-strata result.
top_words(strata_tw, n = 5)

As long as its length is the same as the number of documents, you can use a character vector for the by argument.

# Recode the 0/1 party indicator into character labels, one per document.
RepParty_chr <- ifelse(bills_cov[, "RepParty"] == 0, "Democrat", "Republican")
# Pass the strata vector by name (`by =`), matching the numeric example, so
# the call does not depend on argument position.
strata_tw_chr <- by_strata_TopicWord(out, keyATM_docs, by = RepParty_chr)
# Print the top words (n = 3) for the character-labelled strata.
top_words(strata_tw_chr, n = 3)

Document-Topic distribution by covariates

With keyATM covariates, we can calculate the predicted posterior distribution of the document-topic distributions.

# Refit with the covariates model, using RepParty from `bills_cov` as the
# only covariate. This overwrites the earlier `out`.
out <- keyATM(
  docs = keyATM_docs,            # text input
  no_keyword_topics = 3,         # number of topics without keywords
  keywords = bills_keywords,     # keywords
  model = "covariates",          # select the model
  model_settings = list(
    covariates_data = bills_cov,
    covariates_formula = ~ RepParty
  ),
  options = list(seed = 100)
)

Now let's calculate the mean of theta for two different values of RepParty. The figure shows 90% credible intervals.

# Document-topic distributions evaluated at RepParty = 0 and RepParty = 1.
strata_topic <- by_strata_DocTopic(
  out,
  by_name = "RepParty",
  by_values = c(0, 1)
)
# Plot the result for topics 1, 3 and 5 only.
plot(strata_topic, topics = c(1, 3, 5))


Shusei-E/keyATM documentation built on Dec. 23, 2019, 6:34 p.m.