#' Create a projection for text matching
#'
#' Calculates the linear information in the word counts apart from the topics
#' about the treatment.
#'
#' The function returns one loading per document, per level of the factor.
#' Thus in the standard case of two levels (treatment/control) the projection
#' is actually two-dimensional (indicating words that are particularly indicative
#' of treatment and words particularly indicative of control).
#'
#' When \code{interactions=FALSE} only the content covariate parameters
#' are used (and not the topic-covariate interactions). This may often
#' be a decent approximation to the full calculation because the topic-covariate
#' interactions are typically very sparse.
#'
#' When \code{interactions=TRUE} information from the interaction of the topic
#' and the content covariate is included in the projections. The software offers
#' three ways to do this based on the options set for \code{type}. In each case
#' the difference is how we reweight the topic-specific components of the interaction.
#'
#' When \code{type="theta"} (the option used in the paper), we simply use the theta
#' values estimated under the model. When \code{type="phi"}, we recompute the token-level
#' topic loadings conditional on the document-topics theta. This allows individual words
#' to have their own topic-specific loadings. When \code{type="refit"}, we recompute
#' the token-level topic loadings but under each different level of the content covariate.
#' The option is called \code{"refit"} because it is essentially refitting the tokens under
#' each different potential level of the content covariate when calculating the projection to
#' that level.
#'
#' @param stm_model the stm content covariate model from which to develop the projection
#' @param documents the documents that we want the projection for. Note that
#' these must be aligned with the vocabulary. If they were not the original documents
#' used to fit the model, see \link[stm]{alignCorpus}.
#' @param interactions a logical which defaults to TRUE. Determines whether or not the topic-aspect
#' interactions are included.
#' @param type determines how the topic-covariate interactions are included (see details below).
#' If interactions=FALSE this has no effect.
#' @param verbose a logical indicating if progress should be printed to the screen
#' @return list
#'
#' \item{projection}{the projection of word count information on the document}
#' \item{diagnostic}{the sum of interaction projections for each word type
#' normalized by the total number of words. This is helpful for assessing which
#' elements of the vocabularly are contributing to the topic-specific elements of
#' the projection. For non-topic specific parts, the relative contributions can
#' be read directly off the kappa object in stm.}
#' @references
#' Roberts, M., Stewart, B., Nielsen, R. (2020)
#' "Adjusting for Confounding with Text Matching."
#' In American Journal of Political Science
#' @examples
#' \donttest{
#' data(sim)
#' projection <- project(sim_topics,sim_documents)
#' }
#' @export
project <- function(stm_model, documents,
interactions=TRUE,
type=c("theta", "phi","refit"),
verbose=TRUE) {
#match arguments and check type
int.type <- match.arg(type)
if(!inherits(stm_model, "STM")) stop("stm_model is not an stm object")
if(length(stm_model$beta$logbeta)<2) stop("stm_model does not have a content covariate.")
#deal with older versions of stm by automatically coding contrast
if(is.null(stm_model$settings$kappa$contrast)) stm_model$settings$kappa$contrast <- FALSE
#if we have the contrast coding we are effectively working with one less element
contrast <- stm_model$settings$kappa$contrast
if(contrast) {
index <- index[-length(index)]
}
if(interactions & contrast) {
if(int.type%in% c("refit", "phi")) stop("Only type='theta' works with contrast")
}
#let's start a timer
t1 <- proc.time()
#pull some key elements out of the model for easy reference
kappa <- stm_model$beta$kappa$params
K <- stm_model$setting$dim$K
A <- stm_model$setting$dim$A
index <- sort(unique(stm_model$settings$covariates$betaindex))
#Compute the projection for the non-interactive elements
# in Equation 5 of the paper this is \sum_l w_{i,l}\kappa_{t,c}^{cov}
proj <- vector(mode="list", length=length(index))
for(i in 1:length(index)) {
#for each value grab the weighted sum of the kappa elements
#for each level of A
proj[[i]] <- unlist(lapply(documents, function(doc) {
sum((kappa[[K+index[i]]])[doc[1,]]*doc[2,])/sum(doc[2,])
}))
}
if(interactions & int.type=="refit") {
#Plan:
# for each aspect {
# - compute phi (the posterior mean of the topic indicator)
# - for each word, multiply through the relevant interaction
# - sum over the document and normalize
# }
if(verbose) {
dstar <- max(round(length(documents)/100),1)
msg <- sprintf("Starting Interactions...\nThis may take a while. I will print a dot every %i docs. \n", dstar)
cat(msg)
}
logbeta <- stm_model$beta$logbeta
#we want to store the contributions aggregated over word
diagnostic <- vector(mode="list", length=length(index))
for(i in 1:length(diagnostic)) {
diagnostic[[i]] <- rep(0,stm_model$settings$dim$V)
}
agg.wcounts <- rep(0, length(index))
for(d in 1:length(documents)) {
doc <- documents[[d]]
for(a in 1:A) {
expeta <- c(exp(stm_model$eta[d,]),1)
#pieces for the derivatives of the exp(eta)beta part
EB <- expeta*exp(logbeta[[index[a]]][,doc[1,],drop=FALSE]) #calculate exp(eta)\beta for each word
EB <- t(EB)/colSums(EB) #transpose and norm by (now) the row
#at this point EB is the phi matrix
phi <- t(EB*(doc[2,])) #multiply through by word count
#create a sparse K by V matrix containing interaction components
kappamat <- Matrix::Matrix(do.call(rbind,kappa[(K+ length(index) + (a-1)*K + 1):(K+ length(index) + a*K)]))
kappamat <- kappamat[,doc[1,]]
word.proj <- Matrix::colSums(kappamat*phi)
#add in contributions from interaction and word counts
diagnostic[[a]][doc[1,]] <-diagnostic[[a]][doc[1,]] + word.proj
agg.wcounts[a] <- agg.wcounts[a] + sum(doc[2,])
#elementwise multiply and sum
#now add it to the existing projection
proj[[a]][d] <- proj[[a]][d] + sum(word.proj)/sum(doc[2,])
}
if(verbose & d%%dstar==0) cat(".")
}
}
if(interactions & int.type=="phi") {
if(verbose) {
dstar <- max(round(length(documents)/100),1)
msg <- sprintf("Starting Interactions...\nThis may take a while. I will print a dot every %i docs. \n", dstar)
cat(msg)
}
logbeta <- stm_model$beta$logbeta
#we want to store the contributions aggregated over word
diagnostic <- vector(mode="list", length=length(index))
for(i in 1:length(diagnostic)) {
diagnostic[[i]] <- rep(0,stm_model$settings$dim$V)
}
agg.wcounts <- rep(0, length(index))
bindex <- stm_model$settings$covariates$betaindex
for(d in 1:length(documents)) {
doc <- documents[[d]]
expeta <- c(exp(stm_model$eta[d,]),1)
#pieces for the derivatives of the exp(eta)beta part
EB <- expeta*exp(logbeta[[bindex[d]]][,doc[1,],drop=FALSE]) #calculate exp(eta)\beta for each word
EB <- t(EB)/colSums(EB) #transpose and norm by (now) the row
#at this point EB is the phi matrix
phi <- t(EB*(doc[2,])) #multiply through by word count
for(a in 1:length(index)) {
#create a sparse K by V matrix containing interaction components
kappamat <- Matrix::Matrix(do.call(rbind,kappa[(K+ length(index) + (a-1)*K + 1):(K+ length(index) + a*K)]))
kappamat <- kappamat[,doc[1,]]
word.proj <- Matrix::colSums(kappamat*phi)
#add in contributions from interaction and word counts
diagnostic[[a]][doc[1,]] <-diagnostic[[a]][doc[1,]] + word.proj
agg.wcounts[a] <- agg.wcounts[a] + sum(doc[2,])
#elementwise multiply and sum
#now add it to the existing projection
proj[[a]][d] <- proj[[a]][d] + sum(word.proj)/sum(doc[2,])
}
if(verbose & d%%dstar==0) cat(".")
}}
if(interactions & int.type=="theta") {
if(verbose) {
dstar <- max(round(length(documents)/100),1)
msg <- sprintf("Starting Interactions...\nThis may take a while. I will print a dot every %i docs. \n", dstar)
cat(msg)
}
diagnostic <- vector(mode="list", length=length(index))
for(i in 1:length(diagnostic)) {
diagnostic[[i]] <- rep(0,stm_model$settings$dim$V)
}
agg.wcounts <- rep(0, length(index))
for(d in 1:length(documents)) {
doc <- documents[[d]]
theta <- stm_model$theta[d,]
wct <- matrix(rep(doc[2,], each=K), nrow=K)
theta.wct <- wct*theta
for(a in 1:length(index)) {
#create a sparse K by V matrix containing interaction components
kappamat <- Matrix::Matrix(do.call(rbind,kappa[(K+ length(index) + (a-1)*K + 1):(K+ length(index) + a*K)]))
kappamat <- kappamat[,doc[1,]]
word.proj <- Matrix::colSums(kappamat*theta.wct)
#add in contributions from interaction and word counts
diagnostic[[a]][doc[1,]] <-diagnostic[[a]][doc[1,]] + word.proj
agg.wcounts[a] <- agg.wcounts[a] + sum(doc[2,])
#elementwise multiply and sum
#now add it to the existing projection
proj[[a]][d] <- proj[[a]][d] + sum(word.proj)/sum(doc[2,])
}
if(verbose & d%%dstar==0) cat(".")
}}
if(!interactions) {
cat(sprintf("\n Calculation Complete in %.2f seconds",(proc.time() - t1)[3]))
return(list(projection=proj))
} else {
for(a in 1:length(index)) {
diagnostic[[a]] <- diagnostic[[a]]/agg.wcounts[a]
}
cat(sprintf("\n Calculation Complete in %.2f seconds",(proc.time() - t1)[3]))
return(list(projection=proj, diagnostic=diagnostic))
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.