SpeedReader: High Performance Text Analysis

Documented in pmi

#' A function to calculate a number of information-theoretic measures on terms in a contingency table, including point-wise mutual information.
#'
#' @param contingency_table A contingency table generated by the `contingency_table()` function.
#' @param display_top_x_terms Defaults to 20, the number of top ranked terms to display for each measure.
#' @param term_threshold The threshold at which terms are eliminated from the contingency table for the purposes of calculating information-theoretic quantities. THis gets around issues with terms that only appear once having very high PMI.
#' @param every_category_counts Defaults to FALSE, if TRUE, then terms are removed if they do not appear at least term_threshold times in every row (category) of the contingency table.
#' @return A list object containing lots of different information theoretic measures calculated on the contingency table. If a sparse matrix was provided, then a sparse PMI table is returned. Note that the "zero" entries in this sparse matrix are actually -Inf, but cannot be represented as such using the slam sparse matrix libraries (which this package does), so you will manually need to replace the zero entries with -Inf if you want to compare to a dense matrix.
#' @export
pmi <- function(contingency_table,
                display_top_x_terms = 20,
                term_threshold = 5,
                every_category_counts = FALSE){

    is_sparse_matrix <- FALSE
    if(class(contingency_table) == "simple_triplet_matrix"){
        is_sparse_matrix <- TRUE
    }

    #remove rows with sum zero
    if(is_sparse_matrix){
        rowsums <- slam::row_sums(contingency_table)
    }else{
        rowsums <- apply(contingency_table,1,sum)
    }

    zero_rows <- which(rowsums == 0)
    if(length(zero_rows) > 0){
        cat("Removing categories:",rownames(contingency_table)[zero_rows], "becasue they contain zero terms...\n")
        contingency_table <- contingency_table[-zero_rows,]
    }


    Names <- rownames(contingency_table)
    Terms <- colnames(contingency_table)
    categories <- nrow(contingency_table)
    temp = keep1 = NULL
    check  <- function(index){
        return(length(which(temp == keep1[index])))
    }
    cat("Thresholding for terms that appear at least",term_threshold,"times...\n")
    orig_terms <- ncol(contingency_table)

    if(every_category_counts){
        for(i in 1:categories){
            if(is_sparse_matrix){
                if(i == 1){
                    colsums <- slam::col_sums(contingency_table[1,])
                    keep1 <- which(colsums >= term_threshold)
                }else{
                    colsums <- slam::col_sums(contingency_table[i,])
                    temp <- which(colsums >= term_threshold)
                    temp2 <- unlist(sapply(1:length(keep1),check))
                    keep1 <- keep1[which(temp2 > 0)]
                }
            }else{
                if(i == 1){
                    keep1 <- which(contingency_table[1,] >= term_threshold)
                }else{
                    temp <- which(contingency_table[i,] >= term_threshold)
                    temp2 <- unlist(sapply(1:length(keep1),check))
                    keep1 <- keep1[which(temp2 > 0)]
                }
            }
        }
    }else{
        if (is_sparse_matrix) {
            colsums <- slam::col_sums(contingency_table)
        }else{
            colsums <- apply(contingency_table,2,sum)
        }
        keep1 <- which(colsums >= term_threshold)
    }

    contingency_table <- contingency_table[,keep1]
    cat("Remaining terms in contingency table after thresholding:",ncol(contingency_table),"of",orig_terms, "\n")

    #allocate tables
    unique_terms <- ncol(contingency_table)
    table_sum <- sum(contingency_table)
    if (is_sparse_matrix) {
        colsums <- slam::col_sums(contingency_table)
        rowsums <- slam::row_sums(contingency_table)
    } else {
        colsums <- apply(contingency_table,2,sum)
        rowsums <- apply(contingency_table,1,sum)
    }

    cat("Generating token PMI table...\n")
    if (is_sparse_matrix) {
        printseq <- round(seq(1,length(contingency_table$i), length.out = 11)[2:11],0)
        stats <- Sparse_PMI_Statistics(length(contingency_table$i),
            table_sum,
            as.numeric(colsums),
            as.numeric(rowsums),
            contingency_table$j,
            contingency_table$i,
            as.numeric(contingency_table$v),
            printseq,
            length(printseq))

        # print(str(stats))
        #now create the sparse matrix objects
        pmi_table <- contingency_table
        pmi_table$v <- stats[[1]]
        distinctiveness_table <- contingency_table
        distinctiveness_table$v <- stats[[2]]
        saliency_table <- contingency_table
        saliency_table$v <- stats[[3]]

    }else{
        pmi_table <- matrix(0,nrow = categories,ncol = unique_terms )
        distinctiveness_table <- matrix(0,nrow = categories,ncol = unique_terms )
        saliency_table <- matrix(0,nrow = categories,ncol = unique_terms )

        #generate tables
        for(i in 1:nrow(contingency_table)){
            cat("Category", i,"of",nrow(contingency_table),"\n")
            for(j in 1:ncol(contingency_table)){
                pmi_table[i,j] <- log((contingency_table[i,j]/table_sum)/((colsums[j]/table_sum)*(rowsums[i]/table_sum)))
                distinctiveness_table[i,j] <- (contingency_table[i,j]/colsums[j])*log((contingency_table[i,j]/colsums[j])/(rowsums[i]/table_sum))
                saliency_table[i,j] <- (colsums[j]/table_sum)*distinctiveness_table[i,j]
            }
        }

    }


    # reduce terms to only those we are keeping
    Terms <- Terms[keep1]

    if(is_sparse_matrix){
        ## get token top and bottom words
        top_terms <- vector(mode = "list", length = categories)
        for(i in 1:categories){
            counts <- pmi_table$v[which(pmi_table$i == i)]
            indices <- pmi_table$j[which(pmi_table$i == i)]
            top_terms[[i]] <- list(indices = indices[order(counts,decreasing = T)],
                                   counts = counts[order(counts,decreasing = T)])
        }
        pmi_ranked_terms <- vector(mode = "list", length = categories)
        for(i in 1:categories){
            terms <- rep("",length(top_terms[[i]]$indices))
            for(j in 1:length(terms)){
                terms[j] <- Terms[top_terms[[i]]$indices[j]]
            }
            pmi_ranked_terms[[i]] <- terms
        }
        ranked_pmi <- vector(mode = "list", length = categories)
        for(i in 1:categories){
            ranked_pmi[[i]] <- top_terms[[i]]$counts
        }
        cat("Top terms by category:\n\n")
        # print(str(top_terms))

        for(i in 1:categories){
            cat("Category: ",Names[i], "\n")

            disp <- min(display_top_x_terms,length(pmi_ranked_terms[[i]]))
            for(j in 1:disp){
                cat(pmi_ranked_terms[[i]][j],"    ePMI:",exp(ranked_pmi[[i]][j]), ": Local Count --",contingency_table[i,top_terms[[i]]$indices[j]]$v, "Global Count --",sum(contingency_table[,top_terms[[i]]$indices[j]]$v),"\n")
            }
            cat("\n\n")
        }

        distinctiveness <- slam::col_sums(distinctiveness_table)
        saliency <- slam::col_sums(saliency_table)

    }else{
        # DENSE MATRICES
        ## get token top and bottom words
        top_terms <- matrix(0,nrow = categories,ncol = ncol(pmi_table) )
        for(i in 1:categories){
            top_terms[i,] <- order(pmi_table[i,],decreasing = T)
        }
        pmi_ranked_terms <- matrix("",nrow = categories,ncol = ncol(pmi_table) )
        for(i in 1:categories){
            for(j in 1:ncol(pmi_table)){
                pmi_ranked_terms[i,j] <- Terms[top_terms[i,j]]
            }
        }
        ranked_pmi <-  matrix(0,nrow = categories,ncol = ncol(pmi_table) )
        for(i in 1:categories){
            ranked_pmi[i,] <- pmi_table[i,top_terms[i,]]
        }
        cat("Top terms by category:\n\n")


        for(i in 1:categories){
            cat("Category: ",Names[i], "\n")

            for(j in 1:display_top_x_terms){
                cat(Terms[top_terms[i,j]],"    ePMI:",exp(pmi_table[i,top_terms[i,j]]), ": Local Count --",contingency_table[i,top_terms[i,j]], "Global Count --",sum(contingency_table[,top_terms[i,j]]),"\n")
            }
            cat("\n\n")
        }

        distinctiveness <- apply(distinctiveness_table,2,sum)
        saliency <- apply(saliency_table,2,sum)
    }

    dist_terms <- Terms[order(distinctiveness ,decreasing = T)]
    non_dist_terms <- Terms[order(distinctiveness ,decreasing = F)]

    sal_terms <- Terms[order(saliency ,decreasing = T)]
    non_sal_terms <- Terms[order(saliency ,decreasing = F)]

    cat("Highest distinctiveness terms...\n")
    for(i in 1:display_top_x_terms){
        cat(dist_terms[i], ", ")
    }
    cat("\n\n")

    cat("Lowest distinctiveness terms...\n")
    for(i in 1:display_top_x_terms){
        cat(non_dist_terms[i], ", ")
    }
    cat("\n\n")


    cat("Highest salience terms...\n")
    for(i in 1:display_top_x_terms){
        cat(sal_terms[i], ", ")
    }
    cat("\n\n")
    cat("Lowest salience terms...\n")
    for(i in 1:display_top_x_terms){
        cat(non_sal_terms[i], ", ")
    }
    cat("\n\n")

    return(list(pmi_table = pmi_table,
                pmi_ranked_terms = pmi_ranked_terms,
                ranked_pmi = ranked_pmi,
                distinctive_terms = dist_terms,
                non_distinctive_terms = non_dist_terms,
                salient_terms = sal_terms,
                non_salient_terms = non_sal_terms,
                contingency_table = contingency_table))
}