Nothing
#' Plot predictions using topics
#'
#' Use the \pkg{glmnet} package to plot LASSO based estimates of relationship
#' between an arbitrary dependent variable with topics and additional variables
#' as predictors. This function is experimental (see below).
#'
#' This function is used for estimating the most important topical predictors
#' of an arbitrary outcome. The idea is to run an L1 regularized regression
#' using \code{\link{cv.glmnet}} in the \pkg{glmnet} package where the
#' document-level dependent variable is chosen by the user and the predictors
#' are the document-topic proportions in the \code{\link{stm}} model along with
#' any other variables of interest.
#'
#' The function uses cross-validation to choose the regularization parameter
#' and generates a plot of which loadings were the most influential in
#' predicting the outcome. It also invisibly returns the glmnet model so that
#' it can be used for prediction.
#'
#' NOTE: This function is still very experimental and may have stability
#' issues. If stability issues are encountered see the documentation in
#' \pkg{glmnet} for arguments that can be passed to improve convergence. Also,
#' it is unlikely to work well with multivariate gaussian or multinomial
#' families.
#'
#' @param formula Formula specifying the dependent variable and additional
#' variables to included in the LASSO beyond the topics present in the stmobj.
#' Just pass a 1 on the right-hand side in order to run without additional
#' controls.
#' @param data Data file containing the dependent variable. Typically will be
#' the metadata file used in the stm analysis. It must have a number of rows
#' equal to the number of documents in the stmobj.
#' @param stmobj The STM object, and output from the \code{stm} function.
#' @param subset A logical statement that will be used to subset the corpus.
#' @param omit.var Pass a character vector of variable names to be excluded
#' from the plot. Note this does not exclude them from the calculation, only
#' the plot.
#' @param family The family parameter used in \code{\link{glmnet}}. See
#' explanation there. Defaults to "gaussian"
#' @param main Character string for the main title.
#' @param xlab Character string giving an x-axis label.
#' @param labeltype Type of example words to use in labeling each topic. See
#' \code{\link{labelTopics}}. Defaults to "prob".
#' @param seed The random seed for replication of the cross-validation samples.
#' @param xlim Width of the x-axis.
#' @param standardize Whether to standardize variables. Default is FALSE, which
#' is different from the \pkg{glmnet} default because the topics are already
#' standardized. Note that glmnet standardizes the variables by default but
#' then projects them back to their original scales before reporting
#' coefficients.
#' @param nfolds the number of cross-validation folds. Defaults to 20.
#' @param ... Additional arguments to be passed to glmnet. This can be useful
#' for addressing convergence problems.
#' @seealso \pkg{glmnet}
#' @references Friedman, Jerome, Trevor Hastie, and Rob Tibshirani.
#' "Regularization paths for generalized linear models via coordinate descent."
#' Journal of statistical software 33.1 (2010): 1.
#' @examples
#'
#' \donttest{
#'
#' #Load the poliblog data
#' data(poliblog5k)
#' #estimate a model with 50 topics
#' stm1 <- stm(poliblog5k.docs, poliblog5k.voc, 50,
#' prevalence=~rating + blog, data=poliblog5k.meta,
#' init.type="Spectral")
#' #make a plot of the topics most predictive of "rating"
#' out <- topicLasso(rating ~ 1, family="binomial", data=poliblog5k.meta,stmobj=stm1)
#' #generate some in-sample predictions
#' pred <- predict(out, newx=stm1$theta,type="class")
#' #check the accuracy of the predictions
#' table(pred, poliblog5k.meta$rating)
#' }
#' @export
topicLasso <- function(formula, data, stmobj=NULL, subset=NULL,
omit.var=NULL, family="gaussian",
main="Topic Effects on Outcome",
xlab=expression("Lower Outcome Higher Outcome"),
labeltype=c("prob", "frex", "lift", "score"),seed=02138,
xlim=c(-4,4), standardize=FALSE, nfolds=20, ...) {
X <- model.matrix(formula, data)
pvarnames <- colnames(X)
p <- ncol(X)
z <- stmobj
X <- cbind(X,z$theta)
#if there is an intercept remove it.
if(colnames(X)[1]=="(Intercept)") {
p <- p - 1
X <- X[,-1]
pvarnames <- pvarnames[-1]
}
mframe <- model.frame(formula,data)
y <- model.response(mframe)
if(!is.null(subset)) {
X <- X[subset,]
y <- y[subset]
}
X <- Matrix(X)
if(!is.null(seed)) set.seed(seed)
#pick the label type- if its a conten covariate fix it
labeltype <- match.arg(labeltype)
if(!z$settings$kappa$LDAbeta) labeltype <- "topics"
topiclabs <- labelTopics(z)
topiclabs <- apply(topiclabs[[labeltype]], 1, commas)
topiclabs <- sprintf("Topic %i: %s", 1:length(topiclabs), topiclabs)
varnames <- c(pvarnames, topiclabs)
linmod <- glmnet::cv.glmnet(x=X,y=y, family=family, standardize=standardize, nfolds=nfolds, ...)
loadings <- coef(linmod)[-1]
posind <- which(loadings>0)
if(length(posind!=0)) {
cat(c("Positive Effect \n",sprintf("%s \n",
varnames[posind])))
} else {
cat("No Positive Effects. \n")
}
negind <- which(loadings<0)
if(length(negind!=0)) {
cat(c("Negative Effect \n",sprintf("%s \n",
varnames[negind])))
} else {
cat("No Negative Effects. \n")
}
coefind <- which(loadings!=0)
drop <- c()
if(!is.null(omit.var)) {
for(i in 1:length(omit.var)) {
drop <- c(drop,agrep(omit.var[i], varnames))
}
coefind <- coefind[!(coefind %in% drop)]
}
if(length(coefind)!=0) {
coefval <- loadings[coefind]
labels <- varnames[coefind]
labels <- labels[order(coefval)]
values <- sort(coefval)
plot(x=values, y=1:length(values), pch=19, main=main,
xlim=xlim, ylab="", type="p", xlab=xlab,
ylim=c(-1, length(values)+1),yaxt="n", bty="n")
text(values, 1:length(values), labels, pos=2)
segments(0, -1, 0, length(values)+1,lty=2)
} else {
cat("No Non Zero Loadings")
}
return(invisible(linmod))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.