#' A function to calculate a number of information-theoretic measures on terms in a contingency table, including point-wise mutual information.
#'
#' @param contingency_table A contingency table generated by the `contingency_table()` function.
#' @param display_top_x_terms Defaults to 20, the number of top ranked terms to display for each measure.
#' @param term_threshold The threshold at which terms are eliminated from the contingency table for the purposes of calculating information-theoretic quantities. THis gets around issues with terms that only appear once having very high PMI.
#' @param every_category_counts Defaults to FALSE, if TRUE, then terms are removed if they do not appear at least term_threshold times in every row (category) of the contingency table.
#' @return A list object containing lots of different information theoretic measures calculated on the contingency table. If a sparse matrix was provided, then a sparse PMI table is returned. Note that the "zero" entries in this sparse matrix are actually -Inf, but cannot be represented as such using the slam sparse matrix libraries (which this package does), so you will manually need to replace the zero entries with -Inf if you want to compare to a dense matrix.
#' @export
pmi <- function(contingency_table,
display_top_x_terms = 20,
term_threshold = 5,
every_category_counts = FALSE){
is_sparse_matrix <- FALSE
if(class(contingency_table) == "simple_triplet_matrix"){
is_sparse_matrix <- TRUE
}
#remove rows with sum zero
if(is_sparse_matrix){
rowsums <- slam::row_sums(contingency_table)
}else{
rowsums <- apply(contingency_table,1,sum)
}
zero_rows <- which(rowsums == 0)
if(length(zero_rows) > 0){
cat("Removing categories:",rownames(contingency_table)[zero_rows], "becasue they contain zero terms...\n")
contingency_table <- contingency_table[-zero_rows,]
}
Names <- rownames(contingency_table)
Terms <- colnames(contingency_table)
categories <- nrow(contingency_table)
temp = keep1 = NULL
check <- function(index){
return(length(which(temp == keep1[index])))
}
cat("Thresholding for terms that appear at least",term_threshold,"times...\n")
orig_terms <- ncol(contingency_table)
if(every_category_counts){
for(i in 1:categories){
if(is_sparse_matrix){
if(i == 1){
colsums <- slam::col_sums(contingency_table[1,])
keep1 <- which(colsums >= term_threshold)
}else{
colsums <- slam::col_sums(contingency_table[i,])
temp <- which(colsums >= term_threshold)
temp2 <- unlist(sapply(1:length(keep1),check))
keep1 <- keep1[which(temp2 > 0)]
}
}else{
if(i == 1){
keep1 <- which(contingency_table[1,] >= term_threshold)
}else{
temp <- which(contingency_table[i,] >= term_threshold)
temp2 <- unlist(sapply(1:length(keep1),check))
keep1 <- keep1[which(temp2 > 0)]
}
}
}
}else{
if (is_sparse_matrix) {
colsums <- slam::col_sums(contingency_table)
}else{
colsums <- apply(contingency_table,2,sum)
}
keep1 <- which(colsums >= term_threshold)
}
contingency_table <- contingency_table[,keep1]
cat("Remaining terms in contingency table after thresholding:",ncol(contingency_table),"of",orig_terms, "\n")
#allocate tables
unique_terms <- ncol(contingency_table)
table_sum <- sum(contingency_table)
if (is_sparse_matrix) {
colsums <- slam::col_sums(contingency_table)
rowsums <- slam::row_sums(contingency_table)
} else {
colsums <- apply(contingency_table,2,sum)
rowsums <- apply(contingency_table,1,sum)
}
cat("Generating token PMI table...\n")
if (is_sparse_matrix) {
printseq <- round(seq(1,length(contingency_table$i), length.out = 11)[2:11],0)
stats <- Sparse_PMI_Statistics(length(contingency_table$i),
table_sum,
as.numeric(colsums),
as.numeric(rowsums),
contingency_table$j,
contingency_table$i,
as.numeric(contingency_table$v),
printseq,
length(printseq))
# print(str(stats))
#now create the sparse matrix objects
pmi_table <- contingency_table
pmi_table$v <- stats[[1]]
distinctiveness_table <- contingency_table
distinctiveness_table$v <- stats[[2]]
saliency_table <- contingency_table
saliency_table$v <- stats[[3]]
}else{
pmi_table <- matrix(0,nrow = categories,ncol = unique_terms )
distinctiveness_table <- matrix(0,nrow = categories,ncol = unique_terms )
saliency_table <- matrix(0,nrow = categories,ncol = unique_terms )
#generate tables
for(i in 1:nrow(contingency_table)){
cat("Category", i,"of",nrow(contingency_table),"\n")
for(j in 1:ncol(contingency_table)){
pmi_table[i,j] <- log((contingency_table[i,j]/table_sum)/((colsums[j]/table_sum)*(rowsums[i]/table_sum)))
distinctiveness_table[i,j] <- (contingency_table[i,j]/colsums[j])*log((contingency_table[i,j]/colsums[j])/(rowsums[i]/table_sum))
saliency_table[i,j] <- (colsums[j]/table_sum)*distinctiveness_table[i,j]
}
}
}
# reduce terms to only those we are keeping
Terms <- Terms[keep1]
if(is_sparse_matrix){
## get token top and bottom words
top_terms <- vector(mode = "list", length = categories)
for(i in 1:categories){
counts <- pmi_table$v[which(pmi_table$i == i)]
indices <- pmi_table$j[which(pmi_table$i == i)]
top_terms[[i]] <- list(indices = indices[order(counts,decreasing = T)],
counts = counts[order(counts,decreasing = T)])
}
pmi_ranked_terms <- vector(mode = "list", length = categories)
for(i in 1:categories){
terms <- rep("",length(top_terms[[i]]$indices))
for(j in 1:length(terms)){
terms[j] <- Terms[top_terms[[i]]$indices[j]]
}
pmi_ranked_terms[[i]] <- terms
}
ranked_pmi <- vector(mode = "list", length = categories)
for(i in 1:categories){
ranked_pmi[[i]] <- top_terms[[i]]$counts
}
cat("Top terms by category:\n\n")
# print(str(top_terms))
for(i in 1:categories){
cat("Category: ",Names[i], "\n")
disp <- min(display_top_x_terms,length(pmi_ranked_terms[[i]]))
for(j in 1:disp){
cat(pmi_ranked_terms[[i]][j]," ePMI:",exp(ranked_pmi[[i]][j]), ": Local Count --",contingency_table[i,top_terms[[i]]$indices[j]]$v, "Global Count --",sum(contingency_table[,top_terms[[i]]$indices[j]]$v),"\n")
}
cat("\n\n")
}
distinctiveness <- slam::col_sums(distinctiveness_table)
saliency <- slam::col_sums(saliency_table)
}else{
# DENSE MATRICES
## get token top and bottom words
top_terms <- matrix(0,nrow = categories,ncol = ncol(pmi_table) )
for(i in 1:categories){
top_terms[i,] <- order(pmi_table[i,],decreasing = T)
}
pmi_ranked_terms <- matrix("",nrow = categories,ncol = ncol(pmi_table) )
for(i in 1:categories){
for(j in 1:ncol(pmi_table)){
pmi_ranked_terms[i,j] <- Terms[top_terms[i,j]]
}
}
ranked_pmi <- matrix(0,nrow = categories,ncol = ncol(pmi_table) )
for(i in 1:categories){
ranked_pmi[i,] <- pmi_table[i,top_terms[i,]]
}
cat("Top terms by category:\n\n")
for(i in 1:categories){
cat("Category: ",Names[i], "\n")
for(j in 1:display_top_x_terms){
cat(Terms[top_terms[i,j]]," ePMI:",exp(pmi_table[i,top_terms[i,j]]), ": Local Count --",contingency_table[i,top_terms[i,j]], "Global Count --",sum(contingency_table[,top_terms[i,j]]),"\n")
}
cat("\n\n")
}
distinctiveness <- apply(distinctiveness_table,2,sum)
saliency <- apply(saliency_table,2,sum)
}
dist_terms <- Terms[order(distinctiveness ,decreasing = T)]
non_dist_terms <- Terms[order(distinctiveness ,decreasing = F)]
sal_terms <- Terms[order(saliency ,decreasing = T)]
non_sal_terms <- Terms[order(saliency ,decreasing = F)]
cat("Highest distinctiveness terms...\n")
for(i in 1:display_top_x_terms){
cat(dist_terms[i], ", ")
}
cat("\n\n")
cat("Lowest distinctiveness terms...\n")
for(i in 1:display_top_x_terms){
cat(non_dist_terms[i], ", ")
}
cat("\n\n")
cat("Highest salience terms...\n")
for(i in 1:display_top_x_terms){
cat(sal_terms[i], ", ")
}
cat("\n\n")
cat("Lowest salience terms...\n")
for(i in 1:display_top_x_terms){
cat(non_sal_terms[i], ", ")
}
cat("\n\n")
return(list(pmi_table = pmi_table,
pmi_ranked_terms = pmi_ranked_terms,
ranked_pmi = ranked_pmi,
distinctive_terms = dist_terms,
non_distinctive_terms = non_dist_terms,
salient_terms = sal_terms,
non_salient_terms = non_sal_terms,
contingency_table = contingency_table))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.