#' Topicanalysis Class.
#'
#' Analyse topicmodels.
#'
#' @field topicmodel A topicmodel of class \code{TopicModel}, generated from
#' package \code{topicmodels}.
#' @field posterior Slot to store posterior, not used at this point.
#' @field terms The \code{matrix} with the terms of a topicmodel. Keeping the
#' terms may speed up subsequent operations.
#' @field topics The \code{matrix} with the topics present in documents. Keeping
#' this matrix may speed up subsequent operations.
#' @field bundle A \code{partition_bundle}, required to use method \code{read}
#' to access full text.
#' @field labels A \code{character} vector, labels for the topics.
#' @field name A name for the \code{Topicanalysis} object. Useful if combining
#' several objects into a bundle.
#' @field categories A \code{character} vector with categories.
#' @field grouping Not used at this stage.
#' @field exclude Topics to exclude from further analysis.
#' @field type Corpus type, necessary for applying correct template for fulltext output.
#'
#' @section Arguments:
#' \describe{
#' \item{new}{New value for a label or a category.}
#' \item{n}{Number of a topic.}
#' \item{n_words}{An \code{integer}, the number of words to be displayed in a wordcloud.}
#' \item{x}{Number or name of a topics.}
#' \item{y}{Number or name of a topic cooccurring with x.}
#' \item{k}{Number of top topics of a document to consider.}
#' \item{exclude}{A \code{logical} value, whether to to exclude topics
#' earmarked in logical vector in field exclude.}
#' \item{aggregation}{Level of aggregation of \code{as.zoo} method.}
#' \item{...}{Further parameters passed to worker function (such as
#' \code{wordcloud::wordcloud} when calling \code{$wordcloud()}, for
#' instance).}
#' \item{regex}{A regular expression that will limit the evaluation to those documents only
#' that are matched by the regular expression.}
#' }
#'
#' @section Methods:
#' \describe{
#' \item{\code{$initialize(topicmodel)}}{Instantiate new \code{Topicanalysis}
#' object. Upon initialization, labels will be the plain numbers of the
#' topics, all exclude values are \code{FALSE}.}
#' \item{\code{$cooccurrences(k = 3, regex = NULL, docs = NULL, renumber = NULL,
#' progress = TRUE, exclude = TRUE)}}{Get cooccurrences of topics. Arguments are
#' documented with the S4 cooccurrences-method for TopicModel-objects.}
#' \item{\code{$relabel(n, new)}}{Relabel topic \code{n}, assigning new label
#' \code{new}.}
#' \item{\code{$add_category(new)}}{Add \code{new}, a \code{character} vector
#' as a new category to the \code{character} vector in the field
#' \code{category}.}
#' \item{\code{$ignorance(n, new)}}{Exclude topic \code{n} (i.e. add to ignore).}
#' \item{\code{$wordcloud(n, n = 50, ...)}}{Generate wordcloud for topic
#' \code{n}, with \code{n_words} words. Further arguments can be passed into
#' \code{wordcloud::wordcloud} usint the three dots.}
#' \item{\code{$docs(x, y = NULL, n = 3L, s_attributes = NULL)}}{Get documents
#' where topic \code{x} occurrs among the top \code{n} topics. If \code{y} is
#' provided, documents are returned where \code{x} and \code{y} are among the
#' \code{n} top topics. If \code{x} or \code{y} are provided as a \code{character}
#' vector, the method will look up this label in the \code{labels} field.}
#' \item{\code{$read(x, n = 3, no_token = 100)}}{Read document \code{x},
#' highlighting the number of topics specified by \code{n}, indicated by
#' \code{no_token}.}
#' \item{\code{$as.zoo(x = NULL, y = NULL, k = 3, exclude = TRUE, aggregation
#' = c(NULL, "month", "quarter", "year"))}}{Generate \code{zoo} object from
#' topicmodel.}
#' \item{\code{$compare(x, ...)}}{Compare the similarity of two topicmodels.}
#' \item{\code{$find_topics(x, n = 100, word2vec = NULL)}}{Find a topic.}
#' }
#' @importFrom polmineR as.speeches
#' @examples
#' data(BE_lda)
#' data(BE_labels)
#' data(BE_exclude)
#'
#' BE <- Topicanalysis$new(topicmodel = BE_lda)
#' BE$labels <- BE_labels
#' BE$exclude <- BE_exclude
#' BE$exclude <- grepl("^\\((split|)\\)$", BE$labels)
#' BE$name <- "Berlin"
#' BE$type <- "plpr_partition"
#'
#' z <- BE$as.zoo(x = "Flucht, Asyl, vorläufiger Schutz", aggregation = "year")
#' plot(z)
#'
#' y <- BE$as.zoo(
#' x = grep("Asyl", BE_labels),
#' y = grep("Europ", BE_labels),
#' aggregation = "year"
#' )
#' plot(y)
#'
#' BE$exclude <- grepl("^\\(.*?\\)$", BE$labels)
#' dt <- BE$cooccurrences(k = 3L, exclude = TRUE)
#' dt_min <- dt[chisquare >= 10.83]
#'
#'
#' if (requireNamespace("igraph")){
#' g <- igraph::graph_from_data_frame(
#' d = data.frame(
#' from = dt_min[["a_label"]],
#' to = dt_min[["b_label"]],
#' n = dt_min[["count_coi"]],
#' stringsAsFactors = FALSE
#' ),
#' directed = TRUE
#' )
#' g <- igraph::as.undirected(g, mode = "collapse")
#' if (interactive()){
#' igraph::plot.igraph(
#' g, shape = "square", vertex.color = "steelblue",
#' label = igraph::V(g)$name, label.family = 11, label.cex = 0.5
#' )
#' }
#' }
#'
#' topic_flucht <- 125L
#' topic_integration <- 241
#' BE$docs(x = "Flucht, Asyl, vorläufiger Schutz")
#' BE$docs(x = grep("Flucht", BE$labels))
#' BE$docs(x = 125L)
#' docs <- BE$docs(x = 125L, y = 241L)
#'
#' \dontrun{
#' li <- lapply(
#' docs,
#' function(doc){
#' polmineR::as.speeches(
#' polmineR::partition(
#' "BE",
#' who = gsub("^(.*?)_.*$", "\\1", doc),
#' date = gsub("^.*(\\d{4}-\\d{2}-\\d{2})_\\d+$", "\\1", doc)
#' ),
#' s_attribute_name = "who"
#' )[[as.integer(gsub("^.*?_(\\d+)$", "\\1", doc))]]
#' })
#' BE$bundle <- as.partition_bundle(li)
#'
#' read(BE$topicmodel, BE$bundle[[1]], n = 3L, no_token = 250)
#' read(BE$topicmodel, BE$bundle[[2]], n = 3L, no_token = 250)
#' read(BE$topicmodel, BE$bundle[[3]], n = 3L, no_token = 250)
#' for (doc in docs){
#' print(doc)
#' p <- BE$bundle[[doc]]
#' read(BE$topicmodel, p, n = 3L, no_token = 250)
#' readline(prompt = "Hit any key to continue.")
#' }
#' }
#'
#' #############################
#'
#' data(SL_lda)
#' data(SL_labels)
#' data(SL_exclude)
#'
#' SL <- Topicanalysis$new(topicmodel = SL_lda)
#' SL$labels <- SL_labels
#' SL$exclude <- SL_exclude
#' SL$exclude <- grepl("^\\((split|)\\)$", SL$labels)
#' SL$name <- "Hamburg"
#'
#' cp_1 <- BE$compare(SL, BE)
#' cp_2 <- BE$compare(SL, BE)
#'
#' @export Topicanalysis
#' @name Topicanalysis
#' @aliases Topicanalysis
#' @importFrom R6 R6Class
#' @importFrom wordcloud wordcloud
#' @importFrom topicmodels posterior terms
Topicanalysis <- R6Class(
"Topicanalysis",
public = list(
topicmodel = NULL,
posterior = NULL,
terms = NULL,
topics = NULL,
bundle = NULL,
labels = c(),
name = c(),
categories = c(),
grouping = list(),
exclude = c(),
type = NULL,
initialize = function (topicmodel){
self$topicmodel <- topicmodel
self$labels <- as.character(1:topicmodel@k)
self$exclude <- as.logical(rep(FALSE, times = topicmodel@k))
self$categories <- character()
invisible(self)
},
relabel = function(n, new){
stopifnot(is.character(new))
if (as.character(n) %in% names(self$labels)){
self$labels[which(names(self$labels) == as.character(n))] <- new
} else {
self$labels[n] <- new
}
},
add_category = function(new){
stopifnot(is.character(new))
if (new != ""){
new_categories <- c(new, self$categories)
self$categories <- new_categories[order(new_categories)]
}
},
ignorance = function(n, new){
stopifnot(is.numeric(n), is.logical(new))
self$exclude[n] <- new
},
cooccurrences = function(k = 3, regex = NULL, docs = NULL, renumber = NULL, progress = TRUE, verbose = FALSE, exclude = TRUE){
if (is.null(self$topics)){
self$topics <- topics(self$topicmodel, k = k)
} else {
if (nrow(self$topics) != k) self$topics <- topics(self$topicmodel, k = k)
}
if (is.null(renumber)){
cooc <- cooccurrences(self$topics, regex = regex, docs = docs, progress = progress)
cooc[, "a_label" := self$labels[ cooc[["a"]] ] ]
cooc[, "b_label" := self$labels[cooc[["b"]] ] ]
if (exclude){
cooc <- cooc[!cooc$a %in% which(self$exclude == TRUE),]
cooc <- cooc[!cooc$b %in% which(self$exclude == TRUE),]
}
return(cooc)
} else {
stopifnot(is.integer(renumber))
cooc <- cooccurrences(
self$topics, regex = regex, docs = docs, renumber = unname(renumber),
progress = progress, verbose = verbose
)
if (exclude){
cooc <- cooc[x > 0][y > 0]
labelVector <- renumber[which(renumber > 0)]
labelVector2 <- sapply(split(labelVector, names(labelVector)), function(x) unique(x))
labelVector3 <- sort(labelVector2)
labelVector4 <- names(labelVector3)
names(labelVector4) <- as.character(labelVector3)
if (!is.null(names(renumber))){
cooc[, x_label := labelVector4[as.character(cooc[["x"]])]]
cooc[, y_label := labelVector4[as.character(cooc[["y"]])]]
}
}
return(cooc)
}
},
wordcloud = function(n, n_words = 50, ...){
tokens <- posterior(self$topicmodel)[["terms"]][n, ]
tokens <- tokens[order(tokens, decreasing = TRUE)][1:n_words]
wordcloud::wordcloud(words = names(tokens), freq = unname(tokens), ...)
},
docs = function(x, y = NULL, n = 3L, s_attributes = NULL, regex = NULL){
if (is.null(self$topics)){
self$topics <- topics(self$topicmodel, k = n)
} else {
if (nrow(self$topics) != n) self$topics <- topics(self$topicmodel, k = n)
}
topic_matrix <- self$topics
if (!is.null(regex)) topic_matrix <- topic_matrix[,grepl(regex, colnames(topic_matrix))]
if (is.character(x)) x <- which(self$labels == x)
x_present <- apply(topic_matrix, 2, function(column) any(x %in% column))
if (!is.null(y)){
if (is.character(y)) y <- which(self$labels == y)
y_present <- apply(topic_matrix, 2, function(column) any(y %in% column))
xy_present <- apply(matrix(data = c(x_present, y_present), ncol = 2), 1, all)
if (length(xy_present) > 0){
return( colnames(topic_matrix)[xy_present] )
} else {
return( character() )
}
} else {
if (length(x_present) > 0){
return( colnames(topic_matrix)[x_present] )
} else {
return( character() )
}
}
},
read = function(x, n = 3L, no_token = 100L){
read(self$topicmodel, as(self$bundle[[x]], self$type), no_topics = n, no_token = no_token)
},
as.zoo = function(x = NULL, y = NULL, k = 3L, exclude = TRUE, aggregation = c(NULL, "month", "quarter", "year")){
if (is.null(y)){
retval <- as.zoo(self$topicmodel, k = k, aggregation = aggregation)
colnames(retval) <- self$labels[as.integer(colnames(retval))]
if (!is.null(x)) retval <- retval[,x]
if (exclude) retval <- retval[, -which(self$exclude == TRUE)]
return(retval)
} else {
if (is.character(y)){
labelPosition <- sapply(y, function(x) grep(x, self$labels))
y <- as.integer(names(self$labels)[labelPosition])
}
z <- as.zoo(self$topicmodel, k = k, select = c(x, y), aggregation = aggregation)
colnamesSplit <- strsplit(colnames(z), "-")
colnames(z) <- sapply(
colnamesSplit,
function(x) paste(self$labels[x[1]], self$labels[x[2]], sep = " <-> ")
)
return(z)
}
},
compare = function(...){
# missing here: check that names exist and are unique
ta_list <- list(self, ...)
names(ta_list) <- sapply(ta_list, function(x) x$name)
DTs <- lapply(
names(ta_list),
function(ta_name){
message("... generating DT for ", ta_name)
m <- topicmodels::posterior(ta_list[[ta_name]]$topicmodel)[["terms"]]
rownames(m) <- paste(ta_name, ta_list[[ta_name]]$labels, sep = "::")
m <- m[which(ta_list[[ta_name]]$exclude == FALSE), ]
dt <- data.table(m)
dt[, "label" := rownames(m)]
y <- melt.data.table(data = dt, id.vars = "label")
setnames(y, old = "variable", new = "feature")
y
}
)
message("... creating aggregated DT")
DT <- rbindlist(DTs)
message("... crosstabulation")
DT_crosstab <- dcast.data.table(DT, feature ~ label, fun.aggregate = sum)
features <- as.vector(DT_crosstab[["feature"]])
DT_crosstab[, feature := NULL]
merged <- as.matrix(DT_crosstab)
rownames(merged) <- features
message("... calculating cosine similarity")
similarity <- lsa::cosine(merged)
similarity
},
find_topics = function(x, n = 100, word2vec = NULL){
if (is.null(self$terms)){
self$terms <- terms(self$topicmodel, n)
}
if (!is.null(word2vec)){
vocab <- unlist(lapply(x, function(x) as.vector(word2vec$similarity(x, 50)[,"word"])))
} else {
vocab <- x
}
y <- apply(
self$terms, 2,
function(column) sum(unlist(lapply(vocab, function(t) which(t == rev(column)))))
)
y <- y[order(y, decreasing = TRUE)]
y <- round(y / (length(vocab) * n), 3L)
df <- data.frame(
topic = as.integer(gsub("^.*?\\s+(\\d+)$", "\\1", names(y))),
score = unname(y)
)
df <- subset(df, score > 0)
for (i in 1L:10L){
df[[paste("word", i, sep = "_")]] <- self$terms[i, df$topic]
}
df
}
)
)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.