R/feature_selection.R

#' A function that implements a number of feature selection methods for finding
#' the top words that distinguish between two or more classes.
#'
#' @param contingency_table A contingency table generated by the `contingency_table()` function.
#' @param rows_to_compare A numeric vector containing the indices of the rows
#' in the contingency table we wish to compare against one another. Defaults to
#' NULL, in which case all rows are compared against one another.
#' @param alpha The Dirichlet hyperparameter to be used if method =
#' "informed Dirichlet". Suggested value is the average number of terms that
#' appear in a document. If a small value is selected, then more (globally)
#' common terms may be selected as top words. Increasing the value selects
#' for less globally common words. Defaults to 1 (not usually a good choice
#' for most analyses).
#' @param method Defaults to "informed Dirichlet", which implements the model
#'  described in section 3.5.1 of Monroe et al. (2008), "Fightin' Words...".
#'  Can also be "TF-IDF", in which case canonical TF-IDF ranking is used. The
#'  user may also select "TF-IDF-log(tf)", in which case the TF term is logged
#'  following Manning and Schutze (1999, p.544), or "TF-IDF-augmented(tf)", in
#'  which case the TF term is augmented, also following Manning and Schutze
#'  (1999, p.544).
#' @param maximum_top_words Controls the maximum number of top words returned in
#' each category. Defaults to 5000.
#' @param document_term_matrix The document term matrix used to construct the
#' contingency_table. Required if the user selects one of the TF-IDF methods
#' or sets subsume_ngrams = TRUE. Defaults to NULL.
#' @param subsume_ngrams Optional argument allowing the user to combine highly
#' correlated n-grams in the resulting output. Only useful if terms in the
#' document term matrix can overlap. Defaults to FALSE.
#' @param ngram_subsumption_correlation_threshold Defaults to 0.9; can be set
#' higher or lower depending on the correlation threshold at which the user
#' would like to subsume n-grams.
#' @param rank_by_log_odds Only applicable for the "informed Dirichlet" method.
#' Defaults to FALSE. If TRUE, then terms are ranked by log odds instead of z-score.
#' @return A list object containing one dataframe of ranked top words per
#' comparison category, along with a Term_Ordering dataframe, a
#' Subsumed_NGrams list when subsume_ngrams = TRUE, and a rank_by_log_odds
#' indicator. For the "informed Dirichlet" method, each category dataframe
#' only includes words with a z-score greater than 1.96.
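#' @examples
#' \dontrun{
#' # A minimal sketch (not run): `my_metadata` and `my_dtm` are hypothetical
#' # objects, and the exact contingency_table() arguments may need adjusting.
#' ct <- contingency_table(metadata = my_metadata,
#'                         document_term_matrix = my_dtm)
#'
#' # compare the first two categories using the informed Dirichlet model,
#' # with alpha set near the average number of tokens per document:
#' top_words <- feature_selection(ct,
#'                                rows_to_compare = c(1, 2),
#'                                alpha = 200,
#'                                method = "informed Dirichlet")
#'
#' # TF-IDF ranking requires the original document term matrix:
#' top_words_tfidf <- feature_selection(ct,
#'                                      method = "TF-IDF",
#'                                      document_term_matrix = my_dtm)
#' }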
#' @export
feature_selection <- function(contingency_table,
                              rows_to_compare = NULL,
                              alpha = 1,
                              method = c("informed Dirichlet",
                                         "TF-IDF",
                                         "TF-IDF-log(tf)",
                                         "TF-IDF-augmented(tf)"),
                              maximum_top_words = 5000,
                              document_term_matrix = NULL,
                              subsume_ngrams = FALSE,
                              ngram_subsumption_correlation_threshold = 0.9,
                              rank_by_log_odds = FALSE){

    if (is.null(rows_to_compare)) {
        rows_to_compare <- 1:nrow(contingency_table)
    }

    # make sure we only use one method
    method <- method[1]

    if (method %in% c("TF-IDF", "TF-IDF-log(tf)", "TF-IDF-augmented(tf)") &&
        is.null(document_term_matrix)) {
        stop("You must provide a document_term_matrix when using a TF-IDF method.")
    }

    # determine whether we are working with a sparse matrix
    is_sparse_matrix <- inherits(contingency_table, "simple_triplet_matrix")

    # get the vocabulary
    vocabulary <- colnames(contingency_table)

    cat("Generating column sums...\n")
    # get column sums
    category_list <- vector(mode = "list", length = length(rows_to_compare))
    if (is_sparse_matrix) {
        colsums <- as.numeric(slam::col_sums(contingency_table))
        for (i in 1:length(rows_to_compare)) {
            category_list[[i]] <- as.numeric(slam::col_sums(
                contingency_table[rows_to_compare[i],]))
        }
        all_classes <- as.numeric(slam::col_sums(
            contingency_table[rows_to_compare,]))
    } else {
        colsums <- as.numeric(colSums(contingency_table))
        for (i in 1:length(rows_to_compare)) {
            category_list[[i]] <- as.numeric(
                contingency_table[rows_to_compare[i],])
        }
        all_classes <- as.numeric(colSums(contingency_table[rows_to_compare, ]))
    }

    to_return <- vector(mode = "list", length = length(rows_to_compare))
    #############################################
    ######## INFORMED DIRICHLET RANKING #########
    #############################################
    if (method == "informed Dirichlet") {
        # get the total number of tokens
        total_tokens <- sum(colsums)

        # calculate the prior contribution
        prior_contribution <- colsums * (alpha/total_tokens)
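        # each term gets a share of the total prior mass (alpha) proportional
        # to its global frequency, so prior_contribution sums to alpha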

        cat("Calcualting log-odds ratios and variances...\n")

        for (i in 1:length(rows_to_compare)) {
            cat1 <- category_list[[i]]
            cat2 <- (all_classes - category_list[[i]])
            # now calculate the log-odds ratio for each token
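            # delta = log-odds of the token in this category minus its log-odds
            # in all other categories, each smoothed by the informed Dirichlet
            # prior (Monroe et al. 2008, section 3.5.1):
            #   log[(y1 + a_w)/(n1 + alpha - y1 - a_w)]
            #     - log[(y2 + a_w)/(n2 + alpha - y2 - a_w)]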
            log_odds_ratio <- log(cat1 + prior_contribution) -
                log(sum(cat1) + alpha - cat1 - prior_contribution) -
                log(cat2 + prior_contribution) +
                log(sum(cat2) + alpha - cat2 - prior_contribution)

            # calculate the variance of the log-odds ratio using equation 19
            variance <- 1/(cat1 + prior_contribution) +
                1/(sum(cat1) + alpha - cat1 - prior_contribution) +
                1/(cat2 + prior_contribution) +
                1/(sum(cat2) + alpha - cat2 - prior_contribution)

            # subset everything to remove any NaNs (these arise when we are
            # using a subset of documents in which some words never appear)
            vocab <- vocabulary
            column_indicies <- 1:length(vocabulary)
            remove <- unique(c(which(is.nan(log_odds_ratio)),
                               which(is.na(log_odds_ratio))))
            if (length(remove) > 0) {
                log_odds_ratio <- log_odds_ratio[-remove]
                variance <- variance[-remove]
                vocab <- vocabulary[-remove]
                cat1 <- cat1[-remove]
                cat2 <- cat2[-remove]
                all_c <- all_classes[-remove]
                column_indicies <- column_indicies[-remove]
                global_count <- colsums[-remove]
            } else {
                all_c <- all_classes
                global_count <- colsums
            }


            # calculate z scores
            z_scores <- log_odds_ratio/sqrt(variance)

            # generate the overall rankings for use in Fightin' Words plots
            if (i == 1) {
                if (rank_by_log_odds) {
                    ordering <- order(log_odds_ratio, decreasing = TRUE)
                    ordered_data <- data.frame(scores = log_odds_ratio[ordering],
                                               log_odds_ratio= log_odds_ratio[ordering],
                                               z_scores = z_scores[ordering],
                                               total_count = all_c[ordering],
                                               terms = vocab[ordering],
                                               term_indicies = column_indicies[ordering],
                                               stringsAsFactors = FALSE)
                } else {
                    ordering <- order(z_scores, decreasing = TRUE)
                    ordered_data <- data.frame(scores = z_scores[ordering],
                                               log_odds_ratio = log_odds_ratio[ordering],
                                               z_scores = z_scores[ordering],
                                               total_count = all_c[ordering],
                                               terms = vocab[ordering],
                                               term_indicies = column_indicies[ordering],
                                               stringsAsFactors = FALSE)
                }
            }

            # now get the significant words and rank them
            cat("Finding top words in each category...\n\n")
            inds <- which(z_scores > 1.96)
            category_1_significant_words <- data.frame(
                term = vocab[inds],
                log_odds_ratio = log_odds_ratio[inds],
                variance = variance[inds],
                z_scores = z_scores[inds],
                count = cat1[inds],
                other_count = cat2[inds],
                global_count = global_count[inds],
                term_indicies = column_indicies[inds],
                stringsAsFactors = FALSE)

            if (rank_by_log_odds) {
                ordering <- order(category_1_significant_words$log_odds_ratio,
                                  decreasing = TRUE)
            } else {
                ordering <- order(category_1_significant_words$z_scores,
                                  decreasing = TRUE)
            }

            category_1_significant_words <- category_1_significant_words[ordering,]
            rownames(category_1_significant_words) <- category_1_significant_words$term

            # print out results
            cat("Top 20 terms for category:",
                rownames(contingency_table)[rows_to_compare[i]], "...\n")
            print(head(category_1_significant_words[,2:7],n = 20))

            # make sure we do not exceed the max number of words
            if (nrow(category_1_significant_words) > maximum_top_words) {
                category_1_significant_words <- category_1_significant_words[1:maximum_top_words,]
            }

            to_return[[i]] <- category_1_significant_words
        }
    }

    #################################
    ######## TF-IDF RANKING #########
    #################################
    if (method == "TF-IDF" | method == "TF-IDF-log(tf)"| method ==
        "TF-IDF-augmented(tf)") {
        # calcualte document frequency (not logged)
        if (class(document_term_matrix) == "simple_triplet_matrix") {
            document_frequency <- rep(0, ncol(document_term_matrix))
            printseq <- round(seq(1,length(document_term_matrix$i),
                                  length.out = 11)[2:11],0)
            # fast C++ implementation
            document_frequency <- Sparse_Document_Frequencies(
                length(document_term_matrix$j),
                document_term_matrix$j,
                document_frequency,
                printseq,
                length(printseq))
            document_frequency <- as.numeric(document_frequency)
        } else {
            document_frequency <- calculate_document_frequency(
                document_term_matrix)
        }

        document_indices <- attr(contingency_table,"document_indices")

        num_docs <- nrow(document_term_matrix)
        for (i in 1:length(rows_to_compare)) {
            cat1 <- category_list[[i]]
            cat1 <- cat1/length(document_indices[[rows_to_compare[i]]])

            if (method == "TF-IDF-log(tf)") {
                cat1 <- 1 + log(cat1)
            }
            remove <- NULL
            if(method == "TF-IDF-augmented(tf)") {
                cat1 <- 0.5 + (0.5 * cat1)/max(cat1)
                temp <- which(cat1 == 0.5)
                if(length(temp) > 0){
                    remove <- temp
                }
            }

            # now calculate scores
            scores_1 <- cat1*log(num_docs/(1 + document_frequency))
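            # inverse document frequency is smoothed as log(N / (1 + df)), so
            # the argument of the log stays finite for zero document frequencies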

            # remove NaN entries
            column_indicies <- 1:length(vocabulary)
            vocabulary1 <- vocabulary
            category_11 <- cat1
            doc_frequency1 <- document_frequency
            remove <- c(remove,which(is.nan(scores_1)))
            remove <- c(remove,which(is.infinite(scores_1)))
            remove <- unique(remove)
            if (length(remove) > 0) {
                scores_1 <- scores_1[-remove]
                vocabulary1 <- vocabulary[-remove]
                category_11 <- cat1[-remove]
                doc_frequency1 <- document_frequency[-remove]
                column_indicies <- column_indicies[-remove]
            }

            # make the dataframe to return
            category_1_significant_words <- data.frame(
                term = vocabulary1,
                tfidf = scores_1,
                count = category_11,
                idf = doc_frequency1,
                term_indicies = column_indicies,
                stringsAsFactors = FALSE)
            ordering <- order(category_1_significant_words$tfidf,
                              decreasing = TRUE)
            category_1_significant_words <- category_1_significant_words[ordering,]
            rownames(category_1_significant_words) <- category_1_significant_words$term

            # print out results
            cat("Top 20 terms for category:",
                rownames(contingency_table)[rows_to_compare[i]], "...\n")
            print(head(category_1_significant_words[,2:4],n = 20))

            # make sure we do not exceed the max number of words
            if (nrow(category_1_significant_words) > maximum_top_words) {
                category_1_significant_words <- category_1_significant_words[1:maximum_top_words,]
            }

            to_return[[i]] <- category_1_significant_words
        }

        # now generate the full ordered dataset for the first two categories
        # (using only the retained top words)

        scores <- c(to_return[[1]]$tfidf, rev(to_return[[2]]$tfidf))
        cnts <- c(to_return[[1]]$count, rev(to_return[[2]]$count))
        vcb <- c(to_return[[1]]$term, rev(to_return[[2]]$term))
        term_inds <- c(to_return[[1]]$term_indicies,
                           rev(to_return[[2]]$term_indicies))
        ordered_data <- data.frame(scores = scores,
                                   total_count = cnts,
                                   terms = vcb,
                                   term_indicies = term_inds,
                                   stringsAsFactors = FALSE)

    }

    # now if we are subsuming n-grams, then do so and provide an additional list
    if (subsume_ngrams) {
        cat("Subsuming N-Grams...\n")
        # list with subsumed n-grams for each entry
        subsumed_ngrams <- vector(mode = "list", length = length(rows_to_compare))
        # get the row indices which will stay constant.
        document_indices <- attr(contingency_table,"document_indices")
        row_inds <- NULL
        for (k in 1:length(rows_to_compare)) {
            row_inds <- c(row_inds,
                          as.numeric(document_indices[[rows_to_compare[k]]]))
        }
        for (k in 1:length(rows_to_compare)) {
            # get the document term matrix
            col_inds <- as.numeric(to_return[[k]]$term_indicies)
            doc_term_mat <- document_term_matrix[row_inds, col_inds]
            # subsume the n-grams
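            # note: the logical argument subsume_ngrams shares a name with the
            # subsume_ngrams() function called below; R still resolves the call
            # correctly because only function bindings are searched in a call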
            subsumed <- subsume_ngrams(
                ranked_terms = to_return[[k]],
                document_term_matrix = doc_term_mat,
                term_clusters_to_output = 200,
                top_terms_to_search = 200,
                correlation_threshold = ngram_subsumption_correlation_threshold)

            subsumed_ngrams[[k]] <- subsumed
        }
        # name everything correctly
        names(subsumed_ngrams) <- rownames(contingency_table)[rows_to_compare]
        to_return <- append(to_return,
                            list(ordered_data))
        to_return <- append(to_return,
                            list(subsumed_ngrams))
        names(to_return) <- c(rownames(contingency_table)[rows_to_compare],
                              "Term_Ordering",
                              "Subsumed_NGrams")
    } else {
        # if we are not subsuming n-grams, then just name stuff and return it.
        to_return <- append(to_return, list(ordered_data))
        names(to_return) <- c(rownames(contingency_table)[rows_to_compare],
                              "Term_Ordering")
    }

    # add in indicator for whether we are rank_by_log_odds
    to_return$rank_by_log_odds <- rank_by_log_odds

    return(to_return)
}