Nothing
#' Assists the user in selecting the best STM model.
#'
#' Discards models with the low likelihood values based on a small number of EM
#' iterations (cast net stage), then calculates semantic coherence,
#' exclusivity, and sparsity (based on default STM run using selected
#' convergence criteria) to allow the user to choose between models with high
#' likelihood values.
#'
#' @param documents The documents to be modeled. Object must be a list of with
#' each element corresponding to a document. Each document is represented as
#' an integer matrix with two rows, and columns equal to the number of unique
#' vocabulary words in the document. The first row contains the 1-indexed
#' vocabulary entry and the second row contains the number of times that term
#' appears.
#'
#' This is similar to the format in the \pkg{lda} package except that
#' (following R convention) the vocabulary is indexed from one. Corpora can be
#' imported using the reader function and manipulated using the
#' \code{\link{prepDocuments}}.
#' @param vocab Character vector specifying the words in the corpus in the
#' order of the vocab indices in documents. Each term in the vocabulary index
#' must appear at least once in the documents. See
#' \code{\link{prepDocuments}} for dropping unused items in the vocabulary.
#' @param K A positive integer (of size 2 or greater) representing the desired
#' number of topics. Additional detail on choosing the number of topics in
#' details.
#' @param prevalence A formula object with no response variable or a matrix
#' containing topic prevalence covariates. Use \code{s()}, \code{ns()} or
#' \code{bs()} to specify smooth terms. See details for more information.
#' @param content A formula containing a single variable, a factor variable or
#' something which can be coerced to a factor indicating the category of the
#' content variable for each document.
#' @param runs Total number of STM runs used in the cast net stage.
#' Approximately 15 percent of these runs will be used for running a STM until
#' convergence.
#' @param data Dataset which contains prevalence and content covariates.
#' @param init.type The method of initialization. Must be either Latent
#' Dirichlet Allocation (LDA), Dirichlet Multinomial Regression Topic Model
#' (DMR), a random initialization or a previous STM object.
#' @param seed Seed for the random number generator. \code{stm} saves the seed
#' it uses on every run so that any result can be exactly reproduced. Setting
#' the seed here simply ensures that the sequence of models will be exactly the
#' same when respecified. Individual seeds can be retrieved from the component
#' model objects.
#' @param max.em.its The maximum number of EM iterations. If convergence has
#' not been met at this point, a message will be printed.
#' @param emtol Convergence tolerance. EM stops when the relative change in
#' the approximate bound drops below this level. Defaults to .001\%.
#' @param verbose A logical flag indicating whether information should be
#' printed to the screen.
#' @param frexw Weight used to calculate exclusivity
#' @param net.max.em.its Maximum EM iterations used when casting the net
#' @param netverbose Whether verbose should be used when calculating net
#' models.
#' @param M Number of words used to calculate semantic coherence and
#' exclusivity. Defaults to 10.
#' @param N Total number of models to retain in the end. Defaults to .2 of
#' runs.
#' @param to.disk Boolean. If TRUE, each model is saved to disk at the current
#' directory in a separate RData file. This is most useful if one needs to run
#' \code{multiSTM()} on a large number of output models.
#' @param \dots Additional options described in details of stm.
#' @return \item{runout}{List of model outputs the user has to choose from.
#' Take the same form as the output from a stm model.} \item{semcoh}{Semantic
#' coherence values for each topic within each model in runout}
#' \item{exclusivity}{Exclusivity values for each topic within each model in
#' runout. Only calculated for models without a content covariate}
#' \item{sparsity}{Percent sparsity for the covariate and interaction kappas
#' for models with a content covariate.}
#' @examples
#'
#' \dontrun{
#'
#' temp<-textProcessor(documents=gadarian$open.ended.response, metadata=gadarian)
#' meta<-temp$meta
#' vocab<-temp$vocab
#' docs<-temp$documents
#' out <- prepDocuments(docs, vocab, meta)
#' docs<-out$documents
#' vocab<-out$vocab
#' meta <-out$meta
#' set.seed(02138)
#' mod.out <- selectModel(docs, vocab, K=3, prevalence=~treatment + s(pid_rep),
#' data=meta, runs=5)
#' plotModels(mod.out)
#' selected<-mod.out$runout[[1]]
#' }
#' @export
selectModel <- function(documents, vocab, K,
prevalence=NULL, content=NULL, data=NULL,
max.em.its=100, verbose=TRUE, init.type = "LDA",
emtol= 1e-05, seed=NULL,runs=50, frexw=.7,
net.max.em.its=2, netverbose=FALSE, M=10, N=NULL,
to.disk=F, ...){
if(!is.null(seed)) set.seed(seed)
if(is.null(N)){
N <- round(.2*runs)
}
if(runs<2){
stop("Number of runs must be two or greater.")
}
if(runs<N){
stop("Number in the net must be greater or equal to the number of final models.")
}
seedout <- NULL
likelihood <- NULL
cat("Casting net \n")
for(i in 1:runs){
cat(paste(i, "models in net \n"))
mod.out <- stm(documents, vocab, K,
prevalence=prevalence, content=content, data=data, init.type=init.type,
max.em.its=net.max.em.its, emtol=emtol, verbose=netverbose,...)
seedout[i] <- mod.out$settings$seed
likelihood[i] <- mod.out$convergence$bound[length(mod.out$convergence$bound)]
}
keep <- order(likelihood, decreasing=T)[1:N]
keepseed <- seedout[keep]
cat("Running select models \n")
runout <- list()
semcoh <- list()
exclusivity <- list()
sparsity <- list()
for(i in 1:length(keepseed)){
cat(paste(i, "select model run \n"))
initseed <- keepseed[i]
mod.out <- stm(documents, vocab, K,
prevalence=prevalence, content=content, data=data, init.type=init.type, seed=initseed,
max.em.its=max.em.its, emtol=emtol, verbose=verbose,...)
runout[[i]] <- mod.out
if(to.disk==T){
mod <- mod.out
save(mod, file=paste("runout", i, ".RData", sep=""))
}
semcoh[[i]] <- semanticCoherence(mod.out, documents, M)
if(length(mod.out$beta$logbeta)<2){
exclusivity[[i]] <- exclusivity(mod.out, M=M, frexw=.7)
sparsity[[i]] = "Sparsity not calculated for models without content covariates"
}
if(length(mod.out$beta$logbeta)>1){
exclusivity[[i]] = "Exclusivity not calculated for models with content covariates"
kappas <- t(matrix(unlist(mod.out$beta$kappa$params), ncol=length(mod.out$beta$kappa$params)))
topics <-mod.out$settings$dim$K
numsparse = apply(kappas[(K+1):nrow(kappas),], 1,function (x) sum(x<emtol))
sparsity[[i]] = numsparse/ncol(kappas)
}
}
out <- list(runout=runout, semcoh=semcoh, exclusivity=exclusivity, sparsity=sparsity)
class(out) <- "selectModel"
return(out)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.