topic_key_term_plot: Plot Prevalence of Topic Key Terms

Description

Plots the prevalence of user-specified key terms in topic model results across different preprocessing decisions.

Usage

topic_key_term_plot(topic_key_term_results, labs, key_term_columns = 2:6,
  custom_col_names = c("Iraq", "Terrorism", "Al Qaeda", "Insurance",
  "Stem Cell"), custom_labels = c("0%", "<1%", "1-2%", "2-3%", "3-4%",
  "4-5%", "5-6%", "6-7%", "7-8%", "8-9%", "9-10%", "10%+"),
  one_matrix = FALSE, thresholds = c(-1e-04, 0, 0.0099, 0.0199, 0.0299,
  0.0399, 0.0499, 0.0599, 0.0699, 0.0799, 0.0899, 0.0999), heat_ramp = FALSE,
  return_data = FALSE)

Arguments

topic_key_term_results

A data.frame with one column per key term and one row for each set of topic model results. The entries in each cell should be the proportion of topics in which a term appears.

labs

Labels for the preprocessing specifications associated with each set of topic model results.

key_term_columns

The columns containing key term results.

custom_col_names

Names for the key terms.

custom_labels

Labels for the heat map key (legend). Must be of length 12.

one_matrix

Logical indicating whether results should be displayed as a one-column matrix. Defaults to FALSE.

thresholds

A numeric vector of thresholds for inclusion in the various heat map categories. The default has 12 entries, one per label in custom_labels (an illustrative binning sketch follows the argument descriptions below).

heat_ramp

Logical indicating whether to use a heat color ramp (yellow-red-purple) instead of the default white-to-blue ramp. Defaults to FALSE.

return_data

Logical indicating whether rescaled data should be returned. Defaults to FALSE.
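
The default thresholds and labels above imply a simple binning of term prevalence into twelve categories. The following sketch is only illustrative of that binning (it is not the package's internal code):

    thresholds <- c(-1e-04, 0, 0.0099, 0.0199, 0.0299, 0.0399, 0.0499,
                    0.0599, 0.0699, 0.0799, 0.0899, 0.0999)
    labels <- c("0%", "<1%", "1-2%", "2-3%", "3-4%", "4-5%", "5-6%",
                "6-7%", "7-8%", "8-9%", "9-10%", "10%+")
    # bin some example proportions of topics containing a key term
    cut(c(0, 0.005, 0.031, 0.15), breaks = c(thresholds, Inf), labels = labels)
    # expected categories: 0%, <1%, 3-4%, 10%+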

Value

A heat map plot showing the prevalence of each key term across preprocessing specifications.

Examples

## Not run: 
set.seed(12345)
# load the package
library(preText)
# load in the data
data("UK_Manifestos")
# preprocess data
preprocessed_documents <- factorial_preprocessing(
    UK_Manifestos,
    use_ngrams = TRUE,
    infrequent_term_threshold = 0.02,
    verbose = TRUE)
cross_validation_splits <- 10
# create 10 test/train splits
train_inds <- vector(mode = "list", length = cross_validation_splits)
test_inds <- vector(mode = "list", length = cross_validation_splits)
# sample CV indices
for (i in 1:cross_validation_splits) {
    test <- sample(1:length(UK_Manifestos),
                   size = round(length(UK_Manifestos)/5),
                   replace = FALSE)
    train <- 1:length(UK_Manifestos)
    for (j in 1:length(test)) {
        train <- train[-which(train == test[j])]
    }
    train_inds[[i]] <- train
    test_inds[[i]] <- test
}
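# note: the inner loop above simply removes the test indices from the full
# index set; an equivalent one-liner (not in the original example) would be
# train <- setdiff(1:length(UK_Manifestos), test)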
# get the optimal number of topics (this will take a very long time):
optimal_k <- optimal_k_comparison(
     train_inds,
     test_inds,
     preprocessed_documents$dfm_list,
     topics = c(25,50,75,100,125,150,175,200),
     names  = preprocessed_documents$labels)
# run a topic model with the optimal number of topics for each preproc. spec.
top_terms_list <- vector(mode = "list", length = 128)
for (i in 1:128) {
     fit <- topicmodels::LDA(quanteda::convert(preprocessed_documents$dfm_list[[i]],
                                               to = "topicmodels"),
                             k = optimal_k[i])
     # extract out top 20 terms for each topic
     top_terms <- topicmodels::terms(fit, 20)
     top_terms_list[[i]] <- top_terms
}
# NOTE: you will need to search for your own key terms and store the results
# in a data.frame. Base your code on the following.
# function to search for a term
find_term <- function(vec, term) {
     tc <- 0
     for(i in 1:length(term)) {
         tc <- tc + sum(grepl(term[i], vec, ignore.case = TRUE))
     }
     if (tc > 0) {
         return(TRUE)
     } else {
         return(FALSE)
     }
}
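# quick illustrative check of the helper: "iraqi" matches the pattern "iraq"
find_term(c("iraqi", "war", "troops"), "iraq") # TRUE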

# look for topics containing the terms below -- this is from our example with
# press releases so it will have to be modified.
# allows for multiple top terms related to the same concept
num_topics <- rep(0, length = 128)
search_list <- list(iraq = c("iraq"),
                    terror = c("terror"),
                    al_qaeda = c("qaeda"),
                    insurance = c("insur"),
                    stem_cell = c("stem"))

# where we will store our results
topics_in_results <- data.frame(
    preprocessing_steps = preprocessed_documents$labels,
    iraq = num_topics,
    terror = num_topics,
    al_qaeda = num_topics,
    insurance = num_topics,
    stem_cell = num_topics,
    optimal_number_of_topics = optimal_k,
    stringsAsFactors = FALSE)
# count the number of topics in which each top term appears
for (i in 1:128) {
    # allows for multiple top terms related to the same concept
    top_terms <- top_terms_list[[i]]
    for (j in 1:length(search_list)) {
        in_topic <- apply(top_terms,2,find_term, term = search_list[[j]])
        which_topics <- which(in_topic)
        topics_in_results[i,(j+1)] <- length(which_topics)
    }
}
# now make a plot:
topic_key_term_plot(
     topics_in_results,
     preprocessed_documents$labels,
     key_term_columns  = 2:6,
     custom_col_names = c("Iraq", "Terrorism", "Al Qaeda", "Insurance", "Stem Cell"),
     custom_labels = c("0%","<1%","1-2%","2-3%","3-4%","4-5%","5-6%","6-7%","7-8%",
                       "8-9%","9-10%","10%+"),
     one_matrix = FALSE,
     thresholds = c(-0.0001,0,0.0099,0.0199,0.0299,0.0399,0.0499,0.0599,0.0699,
                    0.0799,0.0899,0.0999),
     heat_ramp = FALSE,
     return_data = FALSE)
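
# A minimal toy sketch (not from the package authors): fabricated counts with
# the same column layout as topics_in_results above, only to illustrate the
# expected input format for topic_key_term_plot().
toy_results <- data.frame(
    preprocessing_steps = paste("spec", 1:4),
    iraq = c(0, 1, 2, 3),
    terror = c(1, 1, 0, 2),
    al_qaeda = c(0, 0, 1, 1),
    insurance = c(2, 1, 1, 0),
    stem_cell = c(0, 0, 0, 1),
    optimal_number_of_topics = c(50, 50, 75, 100),
    stringsAsFactors = FALSE)
topic_key_term_plot(toy_results,
                    labs = toy_results$preprocessing_steps,
                    key_term_columns = 2:6)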

## End(Not run)
