#' Functions for plotting STM objects
#'
#' Produces one of four types of plots for an STM object. The default option
#' \code{"summary"} prints topic words with their corpus frequency.
#' \code{"labels"} is for easy printing of tables of indicative words for each
#' topic. \code{"perspectives"} depicts differences between two topics,
#' content covariates or combinations. \code{"hist"} creates a histogram of the
#' expected distribution of topic proportions across the documents.
#'
#' The function can produce three types of plots which summarize an STM object
#' which is chosen by the argument \code{type}. \code{summary} produces a plot
#' which displays the topics ordered by their expected frequency across the
#' corpus. \code{labels} plots the top words selected according to the chosen
#' criteria for each selected topics. \code{perspectives} plots two topic or
#' topic-covariate combinations. Words are sized proportional to their use
#' within the plotted topic-covariate combinations and oriented along the
#' X-axis based on how much they favor one of the two configurations. If the
#' words cluster on top of each other the user can either set the plot size to
#' be larger or shrink the total number of words on the plot. The vertical
#' configuration of the words is random and thus can be rerun to produce
#' different results each time. Note that \code{perspectives} plots do
#' not use any of the labeling options directly. \code{hist} plots a histogram of the MAP
#' estimates of the document-topic loadings across all documents. The median
#' is also denoted by a dashed red line.
#'
#' @param x Model output from stm.
#' @param type Sets the desired type of plot. See details for more
#' information.
#' @param n Sets the number of words used to label each topic. In perspective
#' plots it approximately sets the total number of words in the plot. The
#' defaults are 3, 20 and 25 for \code{summary}, \code{labels} and
#' \code{perspectives} respectively. n must be greater than or equal to 2
#' @param topics Vector of topics to display. For plot perspectives this must
#' be a vector of length one or two. For the other two types it defaults to all
#' topics.
#' @param labeltype Determines which option of \code{"prob", "frex", "lift",
#' "score"} is used for choosing the most important words. See
#' \code{\link{labelTopics}} for more detail. Passing an argument to
#' \code{custom.labels} will override this. Note that this does not apply to
#' \code{perspectives} type which always uses highest probability words.
#' @param frexw If "frex" labeltype is used, this will be the frex weight.
#' @param main Title to the plot
#' @param xlim Range of the X-axis.
#' @param ylim Range of the Y-axis.
#' @param xlab Labels for the X-axis. For perspective plots, use
#' \code{plabels} instead.
#' @param family The Font family. Most of the time the user will not need to
#' specify this but if using other character sets can be useful see \link{par}.
#' @param width Sets the width in number of characters used for string wrapping
#' in type \code{"labels"}
#' @param covarlevels A vector of length one or length two which contains the
#' levels of the content covariate to be used in perspective plots.
#' @param plabels This option can be used to override the default labels in the
#' perspective plot that appear along the x-axis. It should be a character
#' vector of length two which has the left hand side label first.
#' @param text.cex Controls the scaling constant on text size.
#' @param custom.labels A vector of custom labels if labeltype is equal to
#' "custom".
#' @param topic.names A vector of custom topic names. Defaults to "Topic #: ".
#' @param \dots Additional parameters passed to plotting functions.
#' @seealso \code{\link{plotQuote}}, \code{\link{plot.topicCorr}}
#' @references Roberts, Margaret E., Brandon M. Stewart, Dustin Tingley,
#' Christopher Lucas, Jetson Leder-Luis, Shana Kushner Gadarian, Bethany
#' Albertson, and David G. Rand. "Structural Topic Models for Open-Ended
#' Survey Responses." American Journal of Political Science 58, no 4 (2014):
#' 1064-1082.
#' @examples
#' \donttest{
#'
#' #Examples with the Gadarian Data
#' plot(gadarianFit)
#' plot(gadarianFit,type="labels")
#' plot(gadarianFit, type="perspectives", topics=c(1,2))
#' plot(gadarianFit,type="hist")
#' }
#' @export plot.STM
#' @export
plot.STM <- function(x,
type=c("summary", "labels", "perspectives", "hist"),
n=NULL, topics=NULL,
labeltype=c("prob", "frex", "lift", "score"),
frexw=.5,
main=NULL, xlim=NULL, ylim=NULL, xlab=NULL, family="",
width=80,
covarlevels=NULL, plabels=NULL, text.cex=1, custom.labels=NULL,
topic.names=NULL,
...){
model <- x
type <- match.arg(type)
contentcov <- length(model$beta$logbeta)!=1
if(contentcov & !missing(labeltype)) stop("Cannot specify label type for content covariate models.")
labeltype <- match.arg(labeltype)
if(!is.null(custom.labels)) labeltype <- "custom"
if(is.null(n)) n <- switch(type,
summary=3,
labels=20,
perspectives=25,
hist=3)
if(type!="perspectives" & is.null(topics)) topics <- 1:model$settings$dim$K
if(labeltype!="custom"){
if(type != "perspectives") {
lab <- labelTopics(model, topics=topics, n = n, frexweight=frexw)
if(contentcov) {
lab <- lab$topics
} else {
lab <- lab[[labeltype]]
}
}
} else {
lab <- custom.labels
if(length(lab)!=length(topics)) lab <- rep_len(lab, length.out=length(topics))
}
if(!is.null(topic.names)) topic.names <- rep_len(topic.names, length.out=length(topics))
##############
# Summary Plot
##############
if(type=="summary") {
if(labeltype!="custom"){
lab <- apply(lab, 1, commas)
if(!contentcov) lab <- lab[topics]
}
if(is.null(topic.names)) {
topic.names <- sprintf("Topic %i:", topics)
}
lab <- sprintf("%s %s", topic.names, lab)
frequency <- colMeans(model$theta[,topics])
invrank <- order(frequency, decreasing=FALSE)
if(is.null(xlim)) xlim <- c(0,min(2*max(frequency), 1))
if(is.null(ylim)) ylim <- c(0,length(topics))
if(is.null(main)) main <- "Top Topics"
if(is.null(xlab)) xlab <- "Expected Topic Proportions"
ylab <- ""
plot(c(0,0), type="n", xlim=xlim, ylim=ylim, main=main,
yaxt="n",
ylab=ylab, xlab=xlab, ...)
for(i in 1:length(invrank)) {
lines(c(0,frequency[invrank[i]]), c(i, i))
text(frequency[invrank[i]]+ min(2*max(frequency), 1)/100, i , lab[invrank[i]],family=family,pos=4, cex=text.cex)
}
}
##############
# Labels
##############
if(type=="labels") {
if(contentcov) {
#we want to marginalize over the aspects here
weights <- model$settings$covariates$betaindex
tab <- table(weights)
weights <- tab/sum(tab)
#marginalize
beta <- exp(model$beta$logbeta[[1]])*weights[1]
for(i in 2:length(model$beta$logbeta)) {
beta <- beta + exp(model$beta$logbeta[[i]])*weights[i]
}
lab <- t(apply(beta, 1, function(x) model$vocab[order(x, decreasing=TRUE)[1:n]]))
}
plot(c(0,0), type="n", main=main,
ylim=c((length(topics)+.9),1), #note- this flips the coordinate system
xlim=c(1,n+1),
xaxt="n", yaxt="n", xlab="", ylab="",...)
if(labeltype!="custom") {
lab <- apply(lab, 1, commas)
lab <- lab[topics]
}
if(is.null(topic.names)) {
topic.names <- sprintf("Topic %i:", topics)
}
lab <- lapply(lab, strwrap, width=width)
for(i in 1:length(topics)){
if(i!=length(topics)) lines(c(-1,n+3), c(i+1,i+1), lty=2)
string <- paste0(lab[[i]],collapse="\n")
string <- sprintf("%s \n %s", topic.names[i], string)
text(n/2, (i + .5), string, family=family, cex=text.cex)
}
}
##############
# Perspectives
##############
if(type=="perspectives") {
if(!contentcov) covarlevels <- c(1,1)
#check the topics
if(is.null(topics)) stop("Must specify one or two topic numbers using topics")
if(length(topics)>2) stop("Too many topics specified.")
if(length(topics)==1) {
topics <- rep(topics,2)
}
sametopics <- topics[1]==topics[2]
#check the aspects
if(is.null(covarlevels)) {
covarlevels <- c(1,2)
} else {
if(length(covarlevels)>2) stop("More than two covariate levels specified.")
if(length(covarlevels)==1) covarlevels <- rep(covarlevels,2)
covlabels <- model$seetings$covariates$yvarlevels
if(is.character(covarlevels)) {
covarlevels <- pmatch(covarlevels, model$settings$covariates$yvarlevels)
if(any(is.na(covarlevels))) stop("Unrecognized covariate levels")
}
}
samecovars <- covarlevels[1]==covarlevels[2]
#determine what the axis labels should be
if(is.null(plabels)) {
plabels <- vector(length=2)
if(!contentcov) {
plabels[1] <- paste0(c("Topic ", topics[1]),collapse="")
plabels[2] <- paste0(c("Topic ", topics[2]),collapse="")
} else {
if(sametopics & !samecovars) {
plabels[1] <- paste0(c(model$settings$covariates$yvarlevels[covarlevels[1]],
"\n", "(Topic ", topics[1], ")"),
collapse="")
plabels[2] <- paste0(c(model$settings$covariates$yvarlevels[covarlevels[2]],
"\n", "(Topic ", topics[2], ")"),
collapse="")
} else {
plabels[1] <- paste0(c("Topic ", topics[1], ") \n",
"(",
model$settings$covariates$yvarlevels[covarlevels[1]], ")"),
collapse="")
plabels[2] <- paste0(c("Topic ", topics[2], ") \n",
"(",
model$settings$covariates$yvarlevels[covarlevels[2]], ")"),
collapse="")
}
}
}
#Retrieve the relevant logbetas
# okay next thing to try is regularized estimates of the ratio.
left <- model$beta$logbeta[[covarlevels[1]]][topics[1],]
right <- model$beta$logbeta[[covarlevels[2]]][topics[2],]
nd2 <- floor(n/2)
words <- unique(c(order(left, decreasing=TRUE)[1:nd2], order(right,decreasing=TRUE)[1:nd2]))
#Take the probability for the topic they are the highest in.
# Scale it so the max is 4 times as large, and then take the inverse hyperbolic sine func.
# this basically has the effect of flattening it out a bit so the giant things aren't huge.
scale <- pmax(exp(left[words]) + exp(right[words]))
scale <- asinh((scale/max(scale))*4)
diff <- exp(left[words]) - exp(right[words])
diff <- diff/max(abs(diff))
#Create the plot
plot(c(0,0), xlim=c(max(diff)+.1,min(diff) - .1), ylim=c(-2*length(diff),6*length(diff)), type="n",
xaxt="n", xlab="", main=main, yaxt="n", ylab="", bty="n",...)
segments(0,0, 0, -2*length(diff),lty=2)
rand <- sample(seq(1, 6*length(diff),by=2), length(diff), replace=F)
thresh <- .1
negdiff <- diff*(diff< -thresh)
posdiff <- diff*(diff>thresh)
middiff <- diff*(diff <thresh & diff > -thresh)
colors <- grDevices::rgb(posdiff+.75, .75, -negdiff+.75, maxColorValue=2)
text(diff, rand, model$vocab[words], cex=text.cex*scale, col=colors, family=family)
#X-axis
text(.75*min(diff), -length(diff), as.character(plabels[2]),
col=grDevices::rgb(.5,.5,1.6,2,maxColorValue=2), cex=2, pos=1)
text(.75*max(diff), -length(diff), as.character(plabels[1]),
col=grDevices::rgb(1.6,.5,.5,2,maxColorValue=2), cex=2, pos=1)
segments(min(diff),-.5*length(diff),max(diff),-.5*length(diff))
}
###############
# Theta Histogram plots
###############
if(type=="hist") {
#setup components of the plot size.
N <- length(topics)
root <- ceiling(sqrt(N))
oldpar <- par(no.readonly=TRUE)
par(mfrow=c(root,root), oma=c(0,.5,1.5,0), mar=c(2,2,4,1))
if(labeltype!="custom"){
lab <- apply(lab, 1, commas)
if(!contentcov) lab <- lab[topics]
}
if(is.null(topic.names)) {
topic.names <- sprintf("Topic %i:", topics)
}
lab <- sprintf("%s %s", topic.names, lab)
for(i in 1:length(topics)){
theta_median <- median(model$theta[,topics[i]])
if(is.null(xlab)) xlab <- ""
#Now call the histogram. Note this is a bit kludgey but the default for xlim keeps us
# from being able to pass the NULL value. Because it references something internally
# it would be difficult to make it one call. So instead we just split on condition.
if(is.null(xlim)) {
hist(model$theta[,i], main=lab[i], xlab = xlab, ylab = "", ylim=ylim,...)
} else {
hist(model$theta[,i], main=lab[i], xlab = xlab, ylab = "", xlim=xlim, ylim=ylim,...)
}
abline(v = theta_median, col = "red", lwd = 1,lty=2)
}
if(is.null(main)) {
title("Distribution of MAP Estimates of Document-Topic Proportions", outer=TRUE)
} else {
title(main, outer=TRUE)
}
par(oldpar)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.