tosca: Tools for Statistical Content Analysis

Documented in plotWordSub

#' Plotting Counts/Proportion of Words/Docs in LDA-generated Topic-Subcorpora over Time
#'
#' Creates a plot of the counts/proportion of words/docs in corpora which are
#' generated by a \code{ldaresult}. Therefore an article is allocated to a topic
#' - and then to the topics corpus - if there are enough (see \code{limit} and
#' \code{alloc}) allocations of words in the article to the corresponding topic.
#' Additionally the corpora are reduced by \code{\link{filterWord}} and a
#' \code{search}-argument. The plot shows counts of subcorpora or if
#' \code{rel = TRUE} proportion of subcorpora to its corresponding whole corpus.
#'
#' @param object \code{\link{textmeta}} object with strictly tokenized
#' \code{text} component (character vectors) - such as a result of
#' \code{\link{cleanTexts}}
#' @param ldaresult The result of a function call \code{\link{LDAgen}}
#' @param ldaID Character vector of IDs of the documents in
#' \code{ldaresult}
#' @param limit Integer/numeric: How often a word must be
#'  allocated to a topic to count these article as belonging
#' to this topic - if \code{0<limit<1} proportion is used (default: \code{10})?
#' @param alloc Character:  Should every article
#' be allocated to multiple topics (\code{"multi"}), or maximum one
#' topic (\code{"unique"}), or the most represantative  - exactly one -
#' topic (\code{"best"}) (default: \code{"multi"})? If \code{alloc = "best"} \code{limit} has no effect.
#' @param select Integer vector: Which topics of
#' \code{ldaresult} should be plotted (default: all topics)?
#' @param tnames Character vector of same length as \code{select}
#' - labels for the topics (default are the first returned words of
#' \code{\link[lda]{top.topic.words}} from the \code{lda} package for each topic)
#' @param search See \code{\link{filterWord}}
#' @param ignore.case See \code{\link{filterWord}}
#' @param type Character: Should counts/proportion of documents, where every
#'  \code{"docs"} or words \code{"words"} be plotted (default: \code{"docs"})?
#' @param rel Logical. Should counts (\code{FALSE})
#'  or proportion (\code{TRUE}) be plotted (default: \code{TRUE})?
#' @param mark Logical:  Should years be marked by
#' vertical lines (default: \code{TRUE})?
#' @param unit Character: To which unit should dates be floored
#' (default: \code{"month"})? Other possible units are "bimonth", "quarter", "season",
#' "halfyear", "year", for more units see \code{\link[lubridate]{round_date}}
#' @param curves Character: Should \code{"exact"},
#' \code{"smooth"} curve or \code{"both"} be plotted (default: \code{"exact"})?
#' @param smooth Numeric: Smoothing parameter
#' which is handed over to \code{\link{lowess}} as \code{f} (default: \code{0.05})
#' @param both.lwd Graphical parameter for smoothed values
#' if \code{curves = "both"}
#' @param both.lty Graphical parameter for smoothed values
#' if \code{curves = "both"}
#' @param main Character: Graphical parameter
#' @param xlab Character: Graphical parameter
#' @param ylab Character: Graphical parameter
#' @param ylim Graphical parameter (default if \code{rel = TRUE}: \code{c(0, 1)})
#' @param col Graphical parameter, could be a vector. If \code{curves = "both"}
#' the function will for every wordgroup plot at first the exact and then the
#' smoothed curve - this is important for your col order.
#' @param legend Character: Value(s) to specify the
#' legend coordinates (default: "topright"). If "none" no legend is plotted.
#' @param natozero Logical. Should NAs be coerced
#' to zeros (default: \code{TRUE})? Only has effect if \code{rel = TRUE}.
#' @param file Character: File path if a pdf should be created
#' @param ... Additional graphical parameters
#' @return A plot.
#' Invisible: A dataframe with columns \code{date} and \code{tnames} with the
#' counts/proportion of the selected topics.
#' @examples
#' \dontrun{
#' data(politics)
#' poliClean <- cleanTexts(politics)
#' poliPraesidents <- filterWord(object=poliClean, search=c("bush", "obama"))
#' words10 <- makeWordlist(text=poliPraesidents$text)
#' words10 <- words10$words[words10$wordtable > 10]
#' poliLDA <- LDAprep(text=poliPraesidents$text, vocab=words10)
#' LDAresult <- LDAgen(documents=poliLDA, K=5, vocab=words10)
#' plotWordSub(object=poliClean, ldaresult=LDAresult, ldaID=names(poliLDA), search="obama")
#' }
#' @export plotWordSub

# AUSBLICK:
# select: auch liste zulassen + link-argument (schwierige Implementierung...)
# bei "or"-link im prinzip fuer rel = TRUE mean und bei rel = FALSE sum

plotWordSub = function(object, ldaresult, ldaID, limit = 10,
  alloc = c("multi", "unique", "best"), select = 1:nrow(ldaresult$document_sums),
  tnames, search, ignore.case = TRUE, type = c("docs", "words"), rel = TRUE,
  mark = TRUE, unit = "month", curves = c("exact", "smooth", "both"),
  smooth = 0.05, main, xlab, ylab, ylim, both.lwd, both.lty, col,
  legend = "topright", natozero = TRUE, file, ...){

  if(!missing(file)) pdf(file, width = 15, height = 8)
  if(missing(tnames)) tnames <- paste0("T", select, ".",
    lda::top.topic.words(ldaresult$topics, num.words = 1, by.score = TRUE)[select])
  if(missing(main))
    main <- paste(ifelse(rel, "Proportion", "Count"), "of",
      ifelse(type[1] == "docs", "texts", "words"), "in given subcorpus over time")
  if(missing(xlab)) xlab <- "date"
  if(missing(ylab)) ylab <- paste(ifelse(rel, "proportion", "counts"), "per", unit)
  if(missing(both.lwd)) both.lwd <- 1
  if(missing(both.lty)) both.lty <- 1
  if(mark)
    markYears <- seq(
      from = lubridate::floor_date(min(object$meta$date, na.rm = TRUE), unit = "year"),
      to = lubridate::ceiling_date(max(object$meta$date, na.rm = TRUE), unit = "year"),
      by = "year")
  else markYears <- NA

  # create allocs (list with article ids per topics):
  allocs <- lapply(ldaresult$assignments, function(x) sort(table(x), decreasing = TRUE))
  if(limit > 0 && limit < 1) allocs <- lapply(allocs, function(x) x/sum(x))
  allocs <- switch(alloc[1],
    multi = lapply(allocs, function(x) names(x)[x > limit]),
    unique = lapply(allocs, function(x) names(x[1])[x[1] > limit]),
    best = lapply(allocs, function(x) names(x)[1]))
  articleids <- rep(ldaID, lengths(allocs))
  allocs <- split(articleids, unlist(allocs))
  allocs <- allocs[match(select-1, names(allocs))]

  # create textmeta with text-component as result of filterWord (mention
  # articles which appears in list allocs) and meta-component (all articles
  # which appears in list allocs):
  subs <- lapply(allocs, function(x)
    textmeta(text = object$text[names(object$text) %in% x],
      meta = object$meta[object$meta$id %in% x,]))
  subids <- lapply(subs, function(object)
    names(object$text)[filterWord(text = object$text,
      search = search, ignore.case = ignore.case, out = "bin")])

  ### help-function (simplifier of plotScot):
  levs <- unique(lubridate::floor_date(seq(from = min(object$meta$date, na.rm = TRUE),
    to = max(object$meta$date, na.rm = TRUE), by = "day"), unit = unit))
  foo <- function(subobject, id, type. = type, rel. = rel, unit. = unit,
    natozero. = natozero){
    dates <- lubridate::floor_date(
      subobject$meta$date[match(id, subobject$meta$id)], unit.)
    if (type.[1] == "words"){
      docLengths <- lengths(subobject$text[match(id, names(subobject$text))])
      counts <- sapply(split(docLengths, dates), sum)
    }
    else counts <- table(dates)
    if (rel.){
      # compute normalisation
      if (type.[1] == "words"){
        allDates <- lubridate::floor_date(
          subobject$meta$date[match(names(subobject$text), subobject$meta$id)], unit.)
        allCounts <- sapply(split(lengths(subobject$text), allDates), sum)
      }
      else{
        allDates <- lubridate::floor_date(subobject$meta$date, unit.)
        allCounts <- table(allDates)
      }
      # compute proportions
      proportion <- counts[match(names(allCounts), names(counts))] / allCounts
      # some preparation for plotting
      dateNames <- as.Date(names(allCounts))
      proportion <- as.vector(proportion)
      proportion[is.na(proportion)] <- 0
      tab <- data.frame(date = dateNames, proportion = proportion)
    }
    else{
      # some preparation for plotting
      dateNames <- as.Date(names(counts))
      counts <- as.vector(counts)
      tab <- data.frame(date = dateNames, counts = counts)
    }
    zerosToAdd <- !(levs %in% tab$date)
    if(any(zerosToAdd)){
      # add NA for proportion or zero for counts
      zerosToAdd <- data.frame(levs[zerosToAdd], ifelse(rel., NA, 0))
      names(zerosToAdd) <- names(tab)
      tab <- rbind(tab, zerosToAdd)
    }
    tab <- tab[order(tab$date),]
    if(natozero.) tab[is.na(tab)] <- 0
    row.names(tab) <- 1:nrow(tab)
    return(tab[,2])
  }
  ###

  toplot <- mapply(foo, subobject = subs, id = subids)
  tab <- data.frame(levs, toplot)
  colnames(tab) <- c("date", tnames)

  # plotting:
  if(missing(ylim)) ylim <- c(0, ifelse(rel, 1, max(tab[,-1])))
  plot(tab$date, toplot[, 1], type = "n",
    main = main, xlab = xlab, ylab = ylab, ylim = ylim, ...)
  abline(v = markYears, lty = 3)
  switch(curves[1],
    exact = {
      # set colors if missing
      if (missing(col)) col <- RColorBrewer::brewer.pal(8, "Dark2")
      col <- rep(col, length.out = length(select))
      for (i in 1:ncol(toplot))
        lines(tab$date, toplot[, i], col = col[i], ...)
    },
    smooth = {
      # set colors if missing
      if (missing(col)) col <- RColorBrewer::brewer.pal(8, "Dark2")
      col <- rep(col, length.out = length(select))
      for (i in 1:ncol(toplot))
        lines(lowess(tab$date, toplot[, i], f = smooth), col = col[i], ...)
    },
    both = {
      # set colors if missing
      if (missing(col)) col <- RColorBrewer::brewer.pal(12, "Paired")
      col <- rep(col, length.out = 2*length(select))
      # plot both curves
      for (i in 1:ncol(toplot)){
        lines(tab$date, toplot[, i], col = col[2*i-1], ...)
        lines(lowess(tab$date, toplot[, i], f = smooth), col = col[2*i],
          lwd = both.lwd, lty = both.lty)
      }
      # reduce col-vector for legend
      col <- col[seq_along(col) %% 2 == 0]
    })
  # plot legend
  if (legend != "none") legend(legend, legend = tnames, col = col, pch = 20)
  if(!missing(file)) dev.off()
  # return data.frame as invisible
  invisible(tab)
}