#' @title Wordfish Comparison.
#' @description Calculated Wordfish scores for a list of dfm objects with
#' temporal filtering.
#'
#' @param dfm_list A list of quanteda dfm objects generated by the
#' `factorial_preprocessing()` and returned in the `$dfm_list` field
#' @param years A numeric vector giving the year for each document.
#' @param anchors A numeric vector of length two used to anchor the Wordfish
#' estimates. Defaults to c(1,24) which should work for the UK parliament docs.
#' @param proportion_threshold proportion of years a term must be included in
#' to be included in the Wordfish analysis.
#' @param document_inidices An option vector of row indices to be used. Useful
#' for using a subset of the data for analysis.
#' @return A result list object
#' @examples
#' \dontrun{
#' # replicates wordfish aanalysis from Denny and Spirling (2016)
#' # load the package
#' library(preText)
#' # load in the data
#' data("UK_Manifestos")
#' # preprocess data
#' preprocessed_documents <- factorial_preprocessing(
#' UK_Manifestos,
#' use_ngrams = TRUE,
#' infrequent_term_threshold = 0.02,
#' verbose = TRUE)
#' # get the years each document was written and store them as a numeric vector
#' dfm <- preprocessed_documents$dfm_list[[1]]
#' rl <- function(str) {
#' stringr::str_replace_all(str,"[A-Za-z]+","")
#' }
#' years <- as.numeric(sapply(rownames(dfm),rl))
#'
#' # use the wordfish_comparison function to compare all dfms. We are using
#' # conservative and labour manifestos from 1983, 1987, 1992, and 1997 for a total
#' # of 8 manifestos. These are indicated by the document_inidices = c(19:22,42:45)
#' # argument. You can see the document names by entering rownames(dfm) into the
#' # console. We need to set the anchors to 5,1 because anchoring is applied in the
#' # reduced dfm. We are also only including terms that appear atleast once in a
#' # manifesto from each of the 4 years, to deal with the strong temporal effects.
#' wordfish_results <- wordfish_comparison(
#' preprocessed_documents$dfm_list,
#' years,
#' anchors = c(1,5),
#' proportion_threshold = 1,
#' document_inidices = c(19:22,42:45))
#' }
#' @export
wordfish_comparison <- function(dfm_list,
years,
anchors = c(1,24),
proportion_threshold = 1,
document_inidices = NULL){
dfm_object_list <- dfm_list
# get the number of dfms
num_dfms <- length(dfm_object_list)
# ceate data structures to store information
score_list <- vector(mode = "list", length = num_dfms)
max_min <- matrix(0, nrow = num_dfms,ncol = 2)
for (i in 1:num_dfms) {
cat("Currently working on dfm",i,"of",num_dfms,"\n")
ptm <- proc.time()
# apply temporal filter
cur_dfm <- dfm_object_list[[i]]
if (!is.null(document_inidices)) {
cur_dfm <- cur_dfm[document_inidices,]
doc_counts <- quanteda::colSums(cur_dfm)
rm_words <- which(doc_counts == 0)
if(length(rm_words) > 0) {
cur_dfm <- cur_dfm[,-rm_words]
}
reduced_years <- years[document_inidices]
} else {
reduced_years <- years
}
print(cur_dfm)
dfm <- temporal_filter(cur_dfm,
reduced_years,
proportion_threshold = proportion_threshold)
print(dfm)
# run wordfish
result <- quanteda.textmodels::textmodel_wordfish(dfm,dir = anchors)
# create a summary data frame
tp <- data.frame(document = dfm@Dimnames$docs[order(result@theta)],
score = result@theta[order(result@theta)],
std_error = result@se.theta[order(result@theta)],
stringsAsFactors = FALSE)
# store everything
score_list[[i]] <- tp
max_min[i,] <- c(order(result@theta)[1], order(result@theta)[length(result@theta)])
t2 <- proc.time() - ptm
cat("Complete in:",t2[[3]],"seconds...\n")
}
return(list(summary_results = max_min,
results_by_dfm = score_list))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.