R/wordfish_comparison.R

#' @title Wordfish Comparison.
#' @description Calculated Wordfish scores for a list of dfm objects with
#' temporal filtering.
#'
#' @param dfm_list A list of quanteda dfm objects generated by the
#' `factorial_preprocessing()` and returned in the `$dfm_list` field
#' @param years A numeric vector giving the year for each document.
#' @param anchors A numeric vector of length two used to anchor the Wordfish
#' estimates. Defaults to c(1,24) which should work for the UK parliament docs.
#' @param proportion_threshold proportion of years a term must be included in
#' to be included in the Wordfish analysis.
#' @param document_inidices An option vector of row indices to be used. Useful
#' for using a subset of the data for analysis.
#' @return A result list object
#' @examples
#' \dontrun{
#' # replicates wordfish aanalysis from Denny and Spirling (2016)
#' # load the package
#' library(preText)
#' # load in the data
#' data("UK_Manifestos")
#' # preprocess data
#' preprocessed_documents <- factorial_preprocessing(
#'     UK_Manifestos,
#'     use_ngrams = TRUE,
#'     infrequent_term_threshold = 0.02,
#'     verbose = TRUE)
#' # get the years each document was written and store them as a numeric vector
#' dfm <- preprocessed_documents$dfm_list[[1]]
#' rl <- function(str) {
#'     stringr::str_replace_all(str,"[A-Za-z]+","")
#' }
#' years <- as.numeric(sapply(rownames(dfm),rl))
#'
#' # use the wordfish_comparison function to compare all dfms. We are using
#' # conservative and labour manifestos from 1983, 1987, 1992, and 1997 for a total
#' # of 8 manifestos. These are indicated by the document_inidices = c(19:22,42:45)
#' # argument. You can see the document names by entering rownames(dfm) into the
#' # console. We need to set the anchors to 5,1 because anchoring is applied in the
#' # reduced dfm. We are also only including terms that appear atleast once in a
#' # manifesto from each of the 4 years, to deal with the strong temporal effects.
#' wordfish_results <- wordfish_comparison(
#'     preprocessed_documents$dfm_list,
#'     years,
#'     anchors = c(1,5),
#'     proportion_threshold = 1,
#'     document_inidices = c(19:22,42:45))
#' }
#' @export
wordfish_comparison <- function(dfm_list,
                                years,
                                anchors = c(1,24),
                                proportion_threshold = 1,
                                document_inidices = NULL){

    dfm_object_list <- dfm_list
    # get the number of dfms
    num_dfms <- length(dfm_object_list)

    # ceate data structures to store information
    score_list <- vector(mode = "list", length = num_dfms)
    max_min <- matrix(0, nrow = num_dfms,ncol = 2)

    for (i in 1:num_dfms) {
        cat("Currently working on dfm",i,"of",num_dfms,"\n")
        ptm <- proc.time()
        # apply temporal filter
        cur_dfm <- dfm_object_list[[i]]

        if (!is.null(document_inidices)) {
            cur_dfm <- cur_dfm[document_inidices,]
            doc_counts <- quanteda::colSums(cur_dfm)
            rm_words <- which(doc_counts == 0)
            if(length(rm_words) > 0) {
                cur_dfm <- cur_dfm[,-rm_words]
            }
            reduced_years <- years[document_inidices]
        } else {
            reduced_years <- years
        }

        print(cur_dfm)

        dfm <- temporal_filter(cur_dfm,
                               reduced_years,
                               proportion_threshold = proportion_threshold)

        print(dfm)

        # run wordfish
        result <- quanteda::textmodel_wordfish(dfm,dir = anchors)

        # create a summary data frame
        tp <- data.frame(document = dfm@Dimnames$docs[order(result@theta)],
                         score = result@theta[order(result@theta)],
                         std_error = result@se.theta[order(result@theta)],
                         stringsAsFactors = FALSE)

        # store everything
        score_list[[i]] <- tp
        max_min[i,] <- c(order(result@theta)[1], order(result@theta)[length(result@theta)])
        t2 <- proc.time() - ptm
        cat("Complete in:",t2[[3]],"seconds...\n")
    }

    return(list(summary_results = max_min,
                results_by_dfm = score_list))

}

Try the preText package in your browser

Any scripts or data that you put into this service are public.

preText documentation built on May 1, 2019, 8:27 p.m.