Description Usage Arguments Value Examples
Plotting of key terms across preprocessing decisions.
topic_key_term_plot(topic_key_term_results, labs, key_term_columns = 2:6,
custom_col_names = c("Iraq", "Terrorism", "Al Qaeda", "Insurance",
"Stem Cell"), custom_labels = c("0%", "<1%", "1-2%", "2-3%", "3-4%",
"4-5%", "5-6%", "6-7%", "7-8%", "8-9%", "9-10%", "10%+"),
one_matrix = FALSE, thresholds = c(-1e-04, 0, 0.0099, 0.0199, 0.0299,
0.0399, 0.0499, 0.0599, 0.0699, 0.0799, 0.0899, 0.0999), heat_ramp = FALSE,
return_data = FALSE)
|
topic_key_term_results |
A data.frame with one column per key term and one row for each set of topic model results. The entries in each cell should be the proportion of topics in which a term appears. |
labs |
Labels for the preprocessing specifications associated with each set of topic model results. |
key_term_columns |
The columns containing key term results. |
custom_col_names |
Names for the key terms. |
custom_labels |
Labels for the provided key. Must be of length 12. |
one_matrix |
Logical indicating whether results should be displayed as a one column matrix. Defaults to FALSE. |
thresholds |
A numeric vector of thresholds for inclusion in the various heat map categories. The default shown in Usage has 12 values, one for each entry of custom_labels. |
heat_ramp |
Option to use heat ramp (yellow-red-purple) instead of a white to blue ramp. |
return_data |
Logical indicating whether rescaled data should be returned. Defaults to FALSE. |
A plot
## Not run:
# Fix the random seed so the sampling below is reproducible.
set.seed(12345)

# Attach preText and bring the UK_Manifestos example data into the workspace.
library(preText)
data("UK_Manifestos")

# Preprocess the documents; the result is used below through its
# `dfm_list` and `labels` components.
preprocessed_documents <- factorial_preprocessing(
  UK_Manifestos,
  use_ngrams = TRUE,
  infrequent_term_threshold = 0.02,
  verbose = TRUE
)
cross_validation_splits <- 10

# Build one train/test index split per cross-validation fold.
n_documents <- length(UK_Manifestos)
all_inds <- seq_len(n_documents)
train_inds <- vector(mode = "list", length = cross_validation_splits)
test_inds <- vector(mode = "list", length = cross_validation_splits)

# Sample CV indices: each fold holds out round(n_documents / 5) documents.
for (i in seq_len(cross_validation_splits)) {
  test <- sample(all_inds, size = round(n_documents / 5), replace = FALSE)
  # setdiff() keeps the remaining indices in their original order and, unlike
  # repeated `train[-which(...)]`, does not empty `train` when `test` is
  # empty (negative indexing with integer(0) selects nothing).
  train <- setdiff(all_inds, test)
  train_inds[[i]] <- train
  test_inds[[i]] <- test
}
# Get the optimal number of topics (this will take a very long time).
optimal_k <- optimal_k_comparison(
  train_inds,
  test_inds,
  preprocessed_documents$dfm_list,
  topics = c(25, 50, 75, 100, 125, 150, 175, 200),
  names = preprocessed_documents$labels
)

# Run a topic model with the optimal number of topics for each preprocessing
# specification. The number of specifications is taken from dfm_list instead
# of being hard-coded, so the loop stays correct if the preprocessing options
# change.
n_specifications <- length(preprocessed_documents$dfm_list)
top_terms_list <- vector(mode = "list", length = n_specifications)
for (i in seq_len(n_specifications)) {
  dtm <- quanteda::convert(
    preprocessed_documents$dfm_list[[i]],
    to = "topicmodels"
  )
  fit <- topicmodels::LDA(dtm, k = optimal_k[i])
  # Extract the top 20 terms for each topic. terms() is namespace-qualified
  # because this script never attaches topicmodels; an unqualified call would
  # resolve to stats::terms() instead.
  top_terms <- topicmodels::terms(fit, 20)
  top_terms_list[[i]] <- top_terms
}
# !!!!!! You will need to look for some key terms, and store them in a
# data.frame. Your code should be based off of the following. !!!!
# Search a vector of strings for any of a set of patterns.
#
# vec:  character vector to search (e.g. the top terms of one topic).
# term: character vector of regular-expression patterns; matching is
#       case-insensitive.
#
# Returns TRUE if at least one pattern matches at least one element of `vec`,
# and FALSE otherwise (including when `term` or `vec` is empty).
find_term <- function(vec, term) {
  # One grepl() pass per pattern. vapply() iterates zero times for an empty
  # `term`, whereas `1:length(term)` would iterate over c(1, 0) and fail.
  pattern_found <- vapply(
    term,
    function(pattern) any(grepl(pattern, vec, ignore.case = TRUE)),
    logical(1)
  )
  # na.rm = TRUE: an NA pattern yields NA from grepl(); treat it as "no match".
  any(pattern_found, na.rm = TRUE)
}
# Look for topics containing the terms below -- this is from our example with
# press releases so it will have to be modified.
# Each entry may list several patterns related to the same concept.
search_list <- list(iraq = c("iraq"),
                    terror = c("terror"),
                    al_qaeda = c("qaeda"),
                    insurance = c("insur"),
                    stem_cell = c("stem"))

# Where we will store our results: one row per preprocessing specification,
# one column per key term (named after the entries of search_list).
num_topics <- rep(0, length.out = length(preprocessed_documents$labels))
topics_in_results <- data.frame(
  preprocessing_steps = preprocessed_documents$labels,
  iraq = num_topics,
  terror = num_topics,
  al_qaeda = num_topics,
  insurance = num_topics,
  stem_cell = num_topics,
  optimal_number_of_topics = optimal_k,
  stringsAsFactors = FALSE)

# Count the number of topics in which each key term appears.
# NOTE(review): the argument description asks for proportions of topics, while
# this stores raw counts -- confirm whether topic_key_term_plot() rescales by
# optimal_number_of_topics before changing this.
for (i in seq_len(nrow(topics_in_results))) {
  top_terms <- top_terms_list[[i]]
  for (concept in names(search_list)) {
    # Each column of top_terms holds the top terms of one topic.
    in_topic <- apply(top_terms, 2, find_term, term = search_list[[concept]])
    # Index the result column by name instead of by a positional offset, so
    # reordering the data.frame columns cannot misplace the counts.
    topics_in_results[i, concept] <- sum(in_topic, na.rm = TRUE)
  }
}
# Finally, draw the key-term plot. The arguments are assembled first so the
# call itself stays short.
key_term_names <- c("Iraq", "Terrorism", "Al Qaeda", "Insurance", "Stem Cell")
category_labels <- c(
  "0%", "<1%", "1-2%", "2-3%", "3-4%", "4-5%",
  "5-6%", "6-7%", "7-8%", "8-9%", "9-10%", "10%+"
)
category_thresholds <- c(
  -0.0001, 0, 0.0099, 0.0199, 0.0299, 0.0399,
  0.0499, 0.0599, 0.0699, 0.0799, 0.0899, 0.0999
)

topic_key_term_plot(
  topics_in_results,
  preprocessed_documents$labels,
  key_term_columns = 2:6,
  custom_col_names = key_term_names,
  custom_labels = category_labels,
  one_matrix = FALSE,
  thresholds = category_thresholds,
  heat_ramp = FALSE,
  return_data = FALSE
)
## End(Not run)
|
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.