#' Plotting Counts/Proportion of Words/Docs in LDA-generated Topic-Subcorpora over Time
#'
#' Creates a plot of the counts/proportion of words/docs in corpora which are
#' generated by a \code{ldaresult}. Therefore an article is allocated to a topic
#' - and then to the topics corpus - if there are enough (see \code{limit} and
#' \code{alloc}) allocations of words in the article to the corresponding topic.
#' Additionally the corpora are reduced by \code{\link{filterWord}} and a
#' \code{search}-argument. The plot shows counts of subcorpora or if
#' \code{rel = TRUE} proportion of subcorpora to its corresponding whole corpus.
#'
#' @param object \code{\link{textmeta}} object with strictly tokenized
#' \code{text} component (character vectors) - such as a result of
#' \code{\link{cleanTexts}}
#' @param ldaresult The result of a function call \code{\link{LDAgen}}
#' @param ldaID Character vector of IDs of the documents in
#' \code{ldaresult}
#' @param limit Integer/numeric: How often a word must be
#' allocated to a topic to count these article as belonging
#' to this topic - if \code{0<limit<1} proportion is used (default: \code{10})?
#' @param alloc Character: Should every article
#' be allocated to multiple topics (\code{"multi"}), or maximum one
#' topic (\code{"unique"}), or the most represantative - exactly one -
#' topic (\code{"best"}) (default: \code{"multi"})? If \code{alloc = "best"} \code{limit} has no effect.
#' @param select Integer vector: Which topics of
#' \code{ldaresult} should be plotted (default: all topics)?
#' @param tnames Character vector of same length as \code{select}
#' - labels for the topics (default are the first returned words of
#' \code{\link[lda]{top.topic.words}} from the \code{lda} package for each topic)
#' @param search See \code{\link{filterWord}}
#' @param ignore.case See \code{\link{filterWord}}
#' @param type Character: Should counts/proportion of documents, where every
#' \code{"docs"} or words \code{"words"} be plotted (default: \code{"docs"})?
#' @param rel Logical. Should counts (\code{FALSE})
#' or proportion (\code{TRUE}) be plotted (default: \code{TRUE})?
#' @param mark Logical: Should years be marked by
#' vertical lines (default: \code{TRUE})?
#' @param unit Character: To which unit should dates be floored
#' (default: \code{"month"})? Other possible units are "bimonth", "quarter", "season",
#' "halfyear", "year", for more units see \code{\link[lubridate]{round_date}}
#' @param curves Character: Should \code{"exact"},
#' \code{"smooth"} curve or \code{"both"} be plotted (default: \code{"exact"})?
#' @param smooth Numeric: Smoothing parameter
#' which is handed over to \code{\link{lowess}} as \code{f} (default: \code{0.05})
#' @param both.lwd Graphical parameter for smoothed values
#' if \code{curves = "both"}
#' @param both.lty Graphical parameter for smoothed values
#' if \code{curves = "both"}
#' @param main Character: Graphical parameter
#' @param xlab Character: Graphical parameter
#' @param ylab Character: Graphical parameter
#' @param ylim Graphical parameter (default if \code{rel = TRUE}: \code{c(0, 1)})
#' @param col Graphical parameter, could be a vector. If \code{curves = "both"}
#' the function will for every wordgroup plot at first the exact and then the
#' smoothed curve - this is important for your col order.
#' @param legend Character: Value(s) to specify the
#' legend coordinates (default: "topright"). If "none" no legend is plotted.
#' @param natozero Logical. Should NAs be coerced
#' to zeros (default: \code{TRUE})? Only has effect if \code{rel = TRUE}.
#' @param file Character: File path if a pdf should be created
#' @param ... Additional graphical parameters
#' @return A plot.
#' Invisible: A dataframe with columns \code{date} and \code{tnames} with the
#' counts/proportion of the selected topics.
#' @examples
#' \dontrun{
#' data(politics)
#' poliClean <- cleanTexts(politics)
#' poliPraesidents <- filterWord(object=poliClean, search=c("bush", "obama"))
#' words10 <- makeWordlist(text=poliPraesidents$text)
#' words10 <- words10$words[words10$wordtable > 10]
#' poliLDA <- LDAprep(text=poliPraesidents$text, vocab=words10)
#' LDAresult <- LDAgen(documents=poliLDA, K=5, vocab=words10)
#' plotWordSub(object=poliClean, ldaresult=LDAresult, ldaID=names(poliLDA), search="obama")
#' }
#' @export plotWordSub
# AUSBLICK:
# select: auch liste zulassen + link-argument (schwierige Implementierung...)
# bei "or"-link im prinzip fuer rel = TRUE mean und bei rel = FALSE sum
plotWordSub = function(object, ldaresult, ldaID, limit = 10,
alloc = c("multi", "unique", "best"), select = 1:nrow(ldaresult$document_sums),
tnames, search, ignore.case = TRUE, type = c("docs", "words"), rel = TRUE,
mark = TRUE, unit = "month", curves = c("exact", "smooth", "both"),
smooth = 0.05, main, xlab, ylab, ylim, both.lwd, both.lty, col,
legend = "topright", natozero = TRUE, file, ...){
if(!missing(file)) pdf(file, width = 15, height = 8)
if(missing(tnames)) tnames <- paste0("T", select, ".",
lda::top.topic.words(ldaresult$topics, num.words = 1, by.score = TRUE)[select])
if(missing(main))
main <- paste(ifelse(rel, "Proportion", "Count"), "of",
ifelse(type[1] == "docs", "texts", "words"), "in given subcorpus over time")
if(missing(xlab)) xlab <- "date"
if(missing(ylab)) ylab <- paste(ifelse(rel, "proportion", "counts"), "per", unit)
if(missing(both.lwd)) both.lwd <- 1
if(missing(both.lty)) both.lty <- 1
if(mark)
markYears <- seq(
from = lubridate::floor_date(min(object$meta$date, na.rm = TRUE), unit = "year"),
to = lubridate::ceiling_date(max(object$meta$date, na.rm = TRUE), unit = "year"),
by = "year")
else markYears <- NA
# create allocs (list with article ids per topics):
allocs <- lapply(ldaresult$assignments, function(x) sort(table(x), decreasing = TRUE))
if(limit > 0 && limit < 1) allocs <- lapply(allocs, function(x) x/sum(x))
allocs <- switch(alloc[1],
multi = lapply(allocs, function(x) names(x)[x > limit]),
unique = lapply(allocs, function(x) names(x[1])[x[1] > limit]),
best = lapply(allocs, function(x) names(x)[1]))
articleids <- rep(ldaID, lengths(allocs))
allocs <- split(articleids, unlist(allocs))
allocs <- allocs[match(select-1, names(allocs))]
# create textmeta with text-component as result of filterWord (mention
# articles which appears in list allocs) and meta-component (all articles
# which appears in list allocs):
subs <- lapply(allocs, function(x)
textmeta(text = object$text[names(object$text) %in% x],
meta = object$meta[object$meta$id %in% x,]))
subids <- lapply(subs, function(object)
names(object$text)[filterWord(text = object$text,
search = search, ignore.case = ignore.case, out = "bin")])
### help-function (simplifier of plotScot):
levs <- unique(lubridate::floor_date(seq(from = min(object$meta$date, na.rm = TRUE),
to = max(object$meta$date, na.rm = TRUE), by = "day"), unit = unit))
foo <- function(subobject, id, type. = type, rel. = rel, unit. = unit,
natozero. = natozero){
dates <- lubridate::floor_date(
subobject$meta$date[match(id, subobject$meta$id)], unit.)
if (type.[1] == "words"){
docLengths <- lengths(subobject$text[match(id, names(subobject$text))])
counts <- sapply(split(docLengths, dates), sum)
}
else counts <- table(dates)
if (rel.){
# compute normalisation
if (type.[1] == "words"){
allDates <- lubridate::floor_date(
subobject$meta$date[match(names(subobject$text), subobject$meta$id)], unit.)
allCounts <- sapply(split(lengths(subobject$text), allDates), sum)
}
else{
allDates <- lubridate::floor_date(subobject$meta$date, unit.)
allCounts <- table(allDates)
}
# compute proportions
proportion <- counts[match(names(allCounts), names(counts))] / allCounts
# some preparation for plotting
dateNames <- as.Date(names(allCounts))
proportion <- as.vector(proportion)
proportion[is.na(proportion)] <- 0
tab <- data.frame(date = dateNames, proportion = proportion)
}
else{
# some preparation for plotting
dateNames <- as.Date(names(counts))
counts <- as.vector(counts)
tab <- data.frame(date = dateNames, counts = counts)
}
zerosToAdd <- !(levs %in% tab$date)
if(any(zerosToAdd)){
# add NA for proportion or zero for counts
zerosToAdd <- data.frame(levs[zerosToAdd], ifelse(rel., NA, 0))
names(zerosToAdd) <- names(tab)
tab <- rbind(tab, zerosToAdd)
}
tab <- tab[order(tab$date),]
if(natozero.) tab[is.na(tab)] <- 0
row.names(tab) <- 1:nrow(tab)
return(tab[,2])
}
###
toplot <- mapply(foo, subobject = subs, id = subids)
tab <- data.frame(levs, toplot)
colnames(tab) <- c("date", tnames)
# plotting:
if(missing(ylim)) ylim <- c(0, ifelse(rel, 1, max(tab[,-1])))
plot(tab$date, toplot[, 1], type = "n",
main = main, xlab = xlab, ylab = ylab, ylim = ylim, ...)
abline(v = markYears, lty = 3)
switch(curves[1],
exact = {
# set colors if missing
if (missing(col)) col <- RColorBrewer::brewer.pal(8, "Dark2")
col <- rep(col, length.out = length(select))
for (i in 1:ncol(toplot))
lines(tab$date, toplot[, i], col = col[i], ...)
},
smooth = {
# set colors if missing
if (missing(col)) col <- RColorBrewer::brewer.pal(8, "Dark2")
col <- rep(col, length.out = length(select))
for (i in 1:ncol(toplot))
lines(lowess(tab$date, toplot[, i], f = smooth), col = col[i], ...)
},
both = {
# set colors if missing
if (missing(col)) col <- RColorBrewer::brewer.pal(12, "Paired")
col <- rep(col, length.out = 2*length(select))
# plot both curves
for (i in 1:ncol(toplot)){
lines(tab$date, toplot[, i], col = col[2*i-1], ...)
lines(lowess(tab$date, toplot[, i], f = smooth), col = col[2*i],
lwd = both.lwd, lty = both.lty)
}
# reduce col-vector for legend
col <- col[seq_along(col) %% 2 == 0]
})
# plot legend
if (legend != "none") legend(legend, legend = tnames, col = col, pch = 20)
if(!missing(file)) dev.off()
# return data.frame as invisible
invisible(tab)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.