#' A function that implements a number of feature selection methods for finding top
#' words which distinguish between two classes.
#'
#' @param contingency_table A contingency table generated by the `contingency_table()` function.
#' @param rows_to_compare A numeric vector containing the indicies of the rows
#' in the contingency table we wish to compare against eachother. Defaults to
#' NULL, in which case all rows are compared against eachother.
#' @param alpha The Dirichlet hyperparameter to be used if method =
#' "informed_Dirichlet". Suggested value is the average number of terms that
#' appear in a document. If a small value is selected, then more (globally)
#' common terms may be selected as top words. Increasing the value will select
#' for less globally common words. Defaults to 1 (not usually a good choice for
#' most analyses).
#' @param method Defaults to "informed_Dirichlet", which implements the model
#' described in section 3.5.1 of Monroe et al. "Fightin Words...". Can also be
#' "TF-IDF", in which case canonical TF-IDF ranking is used. The user may also
#' select "TF-IDF-log(tf)", in which case the TF term is logged following
#' Manning and Schutze (1999, p.544), or "TF-IDF-augmented(tf)", in which case
#' the TF term is augmented also following Manning and Schutze (1999, p.544).
#' @param maximum_top_words Controls the maximum number of top words returned in
#' each category. Defaults to 5000.
#' @param document_term_matrix The document term matrix used to construct the
#' contingency_table. Necessary if the user selects method = "TF-IDF". Defaults
#' to NULL.
#' @param subsume_ngrams Optional argument allowing the user to combine highly
#' correlated ngrams in resulting output. Only useful if terms in the document
#' term matrix can overlap.
#' @param ngram_subsumption_correlation_threshold Defualts to 0.9, can be set
#' higher or lower depending on the correlation threshold at which the user
#' would like to subsume n-grams.
#' @param rank_by_log_odds Only applicable for the "informed_Dirichlet" method.
#' Defaults to FALSE. If TRUE, then terms are ranked by log odds instead of z-score.
#' @return A list object containing two dataframes (one for each comparison
#' category) with ranked top words. All words included in each dataset obtain
#' a z-score greater in magnitude than 1.96.
#' @export
feature_selection <- function(contingency_table,
rows_to_compare = NULL,
alpha = 1,
method = c("informed Dirichlet",
"TF-IDF",
"TF-IDF-log(tf)",
"TF-IDF-augmented(tf)"),
maximum_top_words = 5000,
document_term_matrix = NULL,
subsume_ngrams = FALSE,
ngram_subsumption_correlation_threshold = 0.9,
rank_by_log_odds = FALSE){
if (is.null(rows_to_compare)) {
rows_to_compare <- 1:nrow(contingency_table)
}
method <- method[1]
if ((method == "TF-IDF" | method == "TF-IDF-log(tf)"| method ==
"TF-IDF-augmented(tf)") & is.null(document_term_matrix)) {
stop("You must provide a document_term_matrix if method == 'TF-IDF'")
}
# make sure we only use one method
method <- method[1]
# determine whether we are working with a sparse matrix
is_sparse_matrix <- FALSE
if(class(contingency_table) == "simple_triplet_matrix"){
is_sparse_matrix <- TRUE
}
# get the vocabulary
vocabulary <- colnames(contingency_table)
cat("Generating column sums...\n")
# get column sums
category_list <- vector(mode = "list", length = length(rows_to_compare))
if (is_sparse_matrix) {
colsums <- as.numeric(slam::col_sums(contingency_table))
for (i in 1:length(rows_to_compare)) {
category_list[[i]] <- as.numeric(slam::col_sums(
contingency_table[rows_to_compare[i],]))
}
all_classes <- as.numeric(slam::col_sums(
contingency_table[rows_to_compare,]))
} else {
colsums <- as.numeric(apply(contingency_table,2,sum))
for (i in 1:length(rows_to_compare)) {
category_list[[i]] <- as.numeric(
contingency_table[rows_to_compare[i],])
}
all_classes <- as.numeric(apply(contingency_table[rows_to_compare,],
2,sum))
}
to_return <- vector(mode = "list", length = length(rows_to_compare))
#############################################
######## INFORMED DIRICHLET RANKING #########
#############################################
if (method == "informed Dirichlet") {
# get the total number of tokens
total_tokens <- sum(colsums)
#calculate the prior contribution
prior_contribution <- colsums * (alpha/total_tokens)
cat("Calcualting log-odds ratios and variances...\n")
for (i in 1:length(rows_to_compare)) {
cat1 <- category_list[[i]]
cat2 <- (all_classes - category_list[[i]])
# now calculate the log-odds ratio for each token
log_odds_ratio <- log(cat1 + prior_contribution) -
log(sum(cat1) + alpha - cat1 - prior_contribution) -
log(cat2 + prior_contribution) +
log(sum(cat2) + alpha - cat2 - prior_contribution)
# calculate the variance of the log-odds ratio using equation 19
variance <- 1/(cat1 + prior_contribution) +
1/(sum(cat1) + alpha - cat1 - prior_contribution) +
1/(cat2 + prior_contribution) +
1/(sum(cat2) + alpha - cat2 - prior_contribution)
# subset everything to remove any NaNs (this comes in if we are using a
# subset of documents where some words do not appear at all in any of them)
vocab <- vocabulary
column_indicies <- 1:length(vocabulary)
remove <- unique(c(which(is.nan(log_odds_ratio)),
which(is.na(log_odds_ratio))))
if (length(remove) > 0) {
log_odds_ratio <- log_odds_ratio[-remove]
variance <- variance[-remove]
vocab <- vocabulary[-remove]
cat1 <- cat1[-remove]
cat2 <- cat2[-remove]
all_c <- all_classes[-remove]
column_indicies <- column_indicies[-remove]
global_count <- colsums[-remove]
} else {
all_c <- all_classes
global_count <- colsums
}
# calculate z scores
z_scores <- log_odds_ratio/sqrt(variance)
# generate the overal rankings for use in fightin words plots
if (i == 1) {
if (rank_by_log_odds) {
ordering <- order(log_odds_ratio, decreasing = TRUE)
ordered_data <- data.frame(scores = log_odds_ratio[ordering],
log_odds_ratio= log_odds_ratio[ordering],
z_scores = z_scores[ordering],
total_count = all_c[ordering],
terms = vocab[ordering],
term_indicies = column_indicies[ordering],
stringsAsFactors = FALSE)
} else {
ordering <- order(z_scores, decreasing = TRUE)
ordered_data <- data.frame(scores = z_scores[ordering],
log_odds_ratio = log_odds_ratio[ordering],
z_scores = z_scores[ordering],
total_count = all_c[ordering],
terms = vocab[ordering],
term_indicies = column_indicies[ordering],
stringsAsFactors = FALSE)
}
}
#now get the significant words and rank them.
cat("Finding top words in each category...\n\n")
inds <- which(z_scores > 1.96)
category_1_significant_words <- data.frame(
term = vocab[inds],
log_odds_ratio = log_odds_ratio[inds],
variance = variance[inds],
z_scores = z_scores[inds],
count = cat1[inds],
other_count = cat2[inds],
global_count = global_count[inds],
term_indicies = column_indicies[inds],
stringsAsFactors = FALSE)
if (rank_by_log_odds) {
ordering <- order(category_1_significant_words$log_odds_ratio,
decreasing = T)
} else {
ordering <- order(category_1_significant_words$z_scores,
decreasing = T)
}
category_1_significant_words <- category_1_significant_words[ordering,]
rownames(category_1_significant_words) <- category_1_significant_words$term
#print out resutls
cat("Top 20 terms for category:",
rownames(contingency_table)[rows_to_compare[i]], "...\n")
print(head(category_1_significant_words[,2:7],n = 20))
# make sure we do not excede the max number of words
if (nrow(category_1_significant_words) > maximum_top_words) {
category_1_significant_words <- category_1_significant_words[1:maximum_top_words,]
}
to_return[[i]] <- category_1_significant_words
}
}
#################################
######## TD-IDF RANKING #########
#################################
if (method == "TF-IDF" | method == "TF-IDF-log(tf)"| method ==
"TF-IDF-augmented(tf)") {
# calcualte document frequency (not logged)
if (class(document_term_matrix) == "simple_triplet_matrix") {
document_frequency <- rep(0, ncol(document_term_matrix))
printseq <- round(seq(1,length(document_term_matrix$i),
length.out = 11)[2:11],0)
# fast C++ implementation
document_frequency <- Sparse_Document_Frequencies(
length(document_term_matrix$j),
document_term_matrix$j,
document_frequency,
printseq,
length(printseq))
document_frequency <- as.numeric(document_frequency)
} else {
document_frequency <- calculate_document_frequency(
document_term_matrix)
}
document_indices <- attr(contingency_table,"document_indices")
num_docs <- nrow(document_term_matrix)
for (i in 1:length(rows_to_compare)) {
cat1 <- category_list[[i]]
cat1 <- cat1/length(document_indices[[rows_to_compare[i]]])
if (method == "TF-IDF-log(tf)") {
cat1 <- 1 + log(cat1)
}
remove <- NULL
if(method == "TF-IDF-augmented(tf)") {
cat1 <- 0.5 + (0.5 * cat1)/max(cat1)
temp <- which(cat1 == 0.5)
if(length(temp) > 0){
remove <- temp
}
}
# now calculate scores
scores_1 <- cat1*log(num_docs/(1 + document_frequency))
#remove NaN entries
column_indicies <- 1:length(vocabulary)
vocabulary1 <- vocabulary
category_11 <- cat1
doc_frequency1 <- document_frequency
remove <- c(remove,which(is.nan(scores_1)))
remove <- c(remove,which(is.infinite(scores_1)))
remove <- unique(remove)
if (length(remove) > 0) {
scores_1 <- scores_1[-remove]
vocabulary1 <- vocabulary[-remove]
category_11 <- cat1[-remove]
doc_frequency1 <- document_frequency[-remove]
column_indicies <- column_indicies[-remove]
}
# make the dataframe to return
category_1_significant_words <- data.frame(
term = vocabulary1,
tfidf = scores_1,
count = category_11,
idf = doc_frequency1,
term_indicies = column_indicies,
stringsAsFactors = FALSE)
ordering <- order(category_1_significant_words$tfidf,
decreasing = T)
category_1_significant_words <- category_1_significant_words[ordering,]
rownames(category_1_significant_words) <- category_1_significant_words$term
#print out resutls
cat("Top 20 terms for category:",
rownames(contingency_table)[rows_to_compare[i]], "...\n")
print(head(category_1_significant_words[,2:4],n = 20))
# make sure we do not excede the max number of words
if (nrow(category_1_significant_words) > maximum_top_words) {
category_1_significant_words <- category_1_significant_words[1:maximum_top_words,]
}
to_return[[i]] <- category_1_significant_words
}
# now generate full ordered dataset if for first two categories
# (only using significant words)
scores <- c(to_return[[1]]$tfidf, rev(to_return[[2]]$tfidf))
cnts <- c(to_return[[1]]$count, rev(to_return[[2]]$count))
vcb <- c(to_return[[1]]$term, rev(to_return[[2]]$term))
term_inds <- c(to_return[[1]]$term_indicies,
rev(to_return[[2]]$term_indicies))
ordered_data <- data.frame(scores = scores,
total_count = cnts,
terms = vcb,
term_indicies = term_inds,
stringsAsFactors = FALSE)
}
# now if we are subsuming n-grams, then do so and provide an additional list
if (subsume_ngrams) {
cat("Subsuming N-Grams...\n")
# list with subsumed n-grams for each entry
subsumed_ngrams <- vector(mode = "list", length = length(rows_to_compare))
# get the row indices which will stay constant.
document_indices <- attr(contingency_table,"document_indices")
row_inds <- NULL
for (k in 1:length(rows_to_compare)) {
row_inds <- c(row_inds,
as.numeric(document_indices[[rows_to_compare[k]]]))
}
for (k in 1:length(rows_to_compare)) {
# get the document term matrix
col_inds <- as.numeric(to_return[[k]]$term_indicies)
doc_term_mat <- document_term_matrix[row_inds, col_inds]
# subsume the n-grams
subsumed <- subsume_ngrams(
ranked_terms = to_return[[k]],
document_term_matrix = doc_term_mat ,
term_clusters_to_output = 200,
top_terms_to_search = 200,
correlation_threshold = ngram_subsumption_correlation_threshold)
subsumed_ngrams[[k]] <- subsumed
}
# name everything correctly
names(subsumed_ngrams) <- rownames(contingency_table)[rows_to_compare]
to_return <- append(to_return,
list(ordered_data))
to_return <- append(to_return,
list(subsumed_ngrams))
names(to_return) <- c(rownames(contingency_table)[rows_to_compare],
"Term_Ordering",
"Subsumed_NGrams")
} else {
# if we are not subsuming n-grams, then just name stuff and return it.
to_return <- append(to_return, list(ordered_data))
names(to_return) <- c(rownames(contingency_table)[rows_to_compare],
"Term_Ordering")
}
# add in indicator for whether we are rank_by_log_odds
to_return$rank_by_log_odds <- rank_by_log_odds
return(to_return)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.