Nothing
#' Label topics
#'
#' Generate a set of words describing each topic from a fitted STM object.
#' Uses a variety of labeling algorithms (see details).
#'
#' Four different types of word weightings are printed with label topics.
#'
#' Highest Prob: are the words within each topic with the highest probability
#' (inferred directly from topic-word distribution parameter \eqn{\beta}).
#'
#' FREX: are the words that are both frequent and exclusive, identifying words
#' that distinguish topics. This is calculated by taking the harmonic mean of
#' rank by probability within the topic (frequency) and rank by distribution of
#' topic given word \eqn{p(z|w=v)} (exclusivity). In estimating exclusivity we
#' use a James-Stein type shrinkage estimator of the distribution
#' \eqn{p(z|w=v)}. More information can be found in the documentation for the
#' internal function \code{\link{calcfrex}} and \code{\link{js.estimate}}.
#'
#' Score and Lift are measures provided in two other popular text mining
#' packages. For more information on type Score, see the R package
#' \code{\link{lda}} or the internal function \code{\link{calcscore}}.
#' For more information on type Lift, see the R package \code{maptpx}
#' or or the internal function \code{\link{calclift}}.
#'
#' @aliases labelTopics print.labelTopics
#' @param model An \code{STM} model object.
#' @param topics A vector of numbers indicating the topics to include. Default
#' is all topics.
#' @param n The desired number of words (per type) used to label each topic.
#' Must be 1 or greater.
#' @param frexweight A weight used in our approximate FREX scoring algorithm
#' (see details).
#' @return A labelTopics object (list) \item{prob }{matrix of highest
#' probability words} \item{frex }{matrix of highest ranking frex words}
#' \item{lift }{matrix of highest scoring words by lift} \item{score }{matrix
#' of best words by score} \item{topicnums }{a vector of topic numbers which
#' correspond to the rows}
#' @seealso \code{\link{stm}} \code{\link{plot.STM}}
#' \code{\link{calcfrex}} \code{\link{js.estimate}} \code{\link{calcscore}} \code{\link{calclift}}
#' @examples
#' labelTopics(gadarianFit)
#' @export
labelTopics <- function (model, topics=NULL, n = 7, frexweight=.5) {
if(n<1) stop("n must be 1 or greater")
logbeta <- model$beta$logbeta
K <- model$settings$dim$K
vocab <- model$vocab
if(is.null(topics)) topics <- 1:nrow(logbeta[[1]])
#make a switch for presence of contnet covariate
aspect <- length(logbeta)>1
out <- list()
if(!aspect) {
out$prob <- list()
out$frex <- list()
out$lift <- list()
out$score <- list()
logbeta <- logbeta[[1]]
wordcounts <- model$settings$dim$wcounts$x
#Calculate FREX Score
frexlabels <- try(calcfrex(logbeta, frexweight, wordcounts),silent=TRUE)
#Calculate Lift (Taddys thing this is beta_k,v divided by the empirical term probability)
liftlabels <- try(calclift(logbeta, wordcounts), silent=TRUE)
#Calculate score (Chang in LDA package etc.)
scorelabels <- try(calcscore(logbeta), silent=TRUE)
#standard labels
problabels <- apply(logbeta, 1, order, decreasing=TRUE)
for(k in 1:K) {
out$prob[[k]] <- vocab[problabels[1:n,k]]
if(inherits(frexlabels,"try-error")) {
out$frex[[k]] <- "FREX encountered an error and failed to run"
} else {
out$frex[[k]] <- vocab[frexlabels[1:n,k]]
}
if(inherits(liftlabels,"try-error")) {
out$lift[[k]] <- "Lift encountered an error and failed to run"
} else {
out$lift[[k]] <- vocab[liftlabels[1:n,k]]
}
if(inherits(scorelabels,"try-error")) {
out$lift[[k]] <- "Score encountered an error and failed to run"
} else {
out$score[[k]] <- vocab[scorelabels[1:n,k]]
}
}
out <- lapply(out, do.call, what=rbind)
} else {
labs <- lapply(model$beta$kappa$params, function(x) {
windex <- order(x,decreasing=TRUE)[1:n]
#we want to threshold by some minimal value.
ifelse(x[windex]>1e-3, vocab[windex], "")
})
labs <- do.call(rbind, labs)
A <- model$settings$dim$A
anames <- model$settings$covariates$yvarlevels
i1 <- K + 1; i2 <- K + A;
intnums <- (i2+1):nrow(labs)
out$topics <- labs[topics,,drop=FALSE]
out$covariate <- labs[i1:i2,,drop=FALSE]
rownames(out$covariate) <- anames
if(model$settings$kappa$interactions) {
tindx <- rep(1:K, each=A)
intnums <- intnums[tindx %in% topics]
out$interaction <- labs[intnums,,drop=FALSE]
}
}
out$topicnums <- topics
class(out) <- "labelTopics"
return(out)
}
#' @method print labelTopics
#' @export
print.labelTopics <- function(x,...) {
#test of its an aspect model or not
if(names(x)[1]!="topics") {
#Standard Non-aspect models
for(i in 1:length(x$topicnums)) {
toprint <- sprintf("Topic %i Top Words:\n \t Highest Prob: %s \n \t FREX: %s \n \t Lift: %s \n \t Score: %s \n",
x$topicnums[i],
commas(x$prob[x$topicnums[i],]),
commas(x$frex[x$topicnums[i],]),
commas(x$lift[x$topicnums[i],]),
commas(x$score[x$topicnums[i],]))
cat(toprint)
}
} else {
covlevels <- rownames(x$covariate)
#Aspect Models
topiclabs <- c("Topic Words:\n",sprintf("Topic %i: %s \n",
x$topicnums, apply(x$topics, 1, commas)))
aspects <- c("Covariate Words:\n", sprintf("Group %s: %s \n",
covlevels,
apply(x$covariate, 1, commas)))
if("interaction" %in% names(x)) {
interactions <- c("Topic-Covariate Interactions:\n")
intlabs <- apply(x$interaction,1,commas)
topicnums <- rep(x$topicnums, times=length(covlevels))
for(i in x$topicnums) {
out <- sprintf("Topic %i, Group %s: %s \n", i, covlevels, intlabs[which(topicnums==i)])
interactions <- c(interactions,out,"\n")
}
labels <- c(topiclabs,"\n",aspects,"\n",interactions)
} else {
labels <- c(topiclabs, "\n", aspects)
}
cat(labels)
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.