R/ACMI_contribution.R

#' @title Calculate Average Conditional Mutual Information (ACMI) contribution
#' of each vocabulary term in a sparse DTM.
#' @description Calculates the contribution of each vocabulary term in a DTM
#' to ACMI, based on a contingency table generated by the contingency_table()
#' function.
#'
#' @param joint_dist A matrix of class "simple_triplet_matrix" generated by the
#' contingency_table() function. Matching pairs of metadata combinations must be
#' sequential rows.
#' @return A list with four elements: average_contribution, a vector giving
#' each vocabulary term's weighted-average ACMI contribution; negative_vocab,
#' the unique column indices of terms with a negative contribution in at least
#' one row pair; and all_negative_vocab and all_negative_vocab_inds, which
#' record each such occurrence and the row pair in which it occurred.
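#' @examples
#' \dontrun{
#' # Illustrative sketch only: `joint` is assumed to be a contingency table
#' # produced by contingency_table(), with matched metadata pairs stored as
#' # sequential rows.
#' result <- ACMI_contribution(joint)
#' # weighted-average ACMI contribution of each vocabulary term
#' head(result$average_contribution)
#' # terms that received a negative contribution in at least one row pair
#' colnames(joint)[result$negative_vocab]
#' }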
#' @export
ACMI_contribution <- function(joint_dist){

    ptm <- proc.time()

    # convert to a sparse simple_triplet_matrix if we were not given one
    if (!inherits(joint_dist, "simple_triplet_matrix")) {
        joint_dist <- slam::as.simple_triplet_matrix(joint_dist)
    }

    average_contribution <- rep(0, joint_dist$ncol)

    total_weight <- sum(joint_dist$v)

    cat("Normalizing, calculating row and column sums...\n")
    # normalize the joint distribution and take row and column sums
    num_row_pairs <- joint_dist$nrow/2

    # track vocabulary terms that receive a negative contribution in any row pair
    # storage_list <- vector(mode = "list", length = num_row_pairs)
    all_negative_vocab <- NULL
    all_negative_vocab_inds <- NULL

    # loop over row pairs and calculate ACMI contribution
    for (l in seq_len(num_row_pairs)) {
        ptm1 <- proc.time()

        cat("Currently working on row pair:",l, "of",num_row_pairs,"\n")
        cur <- joint_dist[(2*l - 1):(2*l),]
        cur_weight <- sum(cur$v)
        cur$v <- cur$v/sum(cur$v)
        normalized_column_sums <- slam::col_sums(cur)
        normalized_row_sums <- slam::row_sums(cur)

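        # shift to zero-based indices, as expected by the lower-level
        # Fast_Sparse_Mutual_Information_Full() routine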
        i <- cur$i - 1
        j <- cur$j - 1
        v <- cur$v
        num_entries <- length(i)

        ret <- Fast_Sparse_Mutual_Information_Full(
            i,
            j,
            v,
            normalized_column_sums,
            normalized_row_sums,
            num_entries)

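        # ret[[1]] is the full mutual information for this pair's joint
        # distribution; ret[[2]] holds per-column quantities that are passed on
        # to calculate_mutual_info_contributions() below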
        MI <- ret[[1]]
        column_contributions <- as.numeric(ret[[2]])

        cat("Full Mutual Information:", MI,"\n")

        if (MI > 0) {
            dense <- sparse_to_dense_matrix(cur)
            dist_sum <- sum(cur)

            contributions <- calculate_mutual_info_contributions(
                normalized_column_sums,
                normalized_row_sums,
                column_contributions,
                dense,
                cur$ncol,
                dist_sum)

            if (any(is.na(contributions))) {
                cat("There was a problem with NA contributions! \n")
            } else {
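                # each term's contribution is the full MI minus the per-term
                # value returned by calculate_mutual_info_contributions()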
                contributions <- MI - contributions

                # make sure to zero out contributions for terms that did not appear
                # at all
                zero_out <- which(normalized_column_sums == 0)
                cat("Terms to zero out:",length(zero_out),"\n")
                cat("Sum of terms to zero out:",sum(contributions[zero_out]),"\n")
                contributions[zero_out] <- 0

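                # accumulate a weighted average across row pairs, weighting each
                # pair by its share of the total mass of the contingency table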
                average_contribution <- average_contribution + (cur_weight/total_weight) * contributions

                #store the current entries
                # colinds <- which(contributions != 0)
                # curdat <- data.frame(column_index = colinds,
                #                      contribution = contributions[colinds])

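                # record terms with a negative contribution for this pair, along
                # with the index of the pair in which they occurred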
                temp <- which(contributions < 0)
                if (length(temp) > 0) {
                    # storage_list[[l]] <- cur$dimnames[[2]][temp]
                    all_negative_vocab <- c(all_negative_vocab,temp)
                    all_negative_vocab_inds <- c(all_negative_vocab_inds,rep(l,length(temp)))
                }
            }
        } else {
            cat("MI too close to zero or negative (problem with machine precision)...\n")
        }

        t3 <- proc.time() - ptm1
        cat("Current calculation complete in:",t3[[3]],"seconds...\n")
    }

    negative_vocab <- unique(all_negative_vocab)

    ret <- list(average_contribution = average_contribution,
                negative_vocab = negative_vocab,
                all_negative_vocab = all_negative_vocab,
                all_negative_vocab_inds = all_negative_vocab_inds)

    t2 <- proc.time() - ptm
    cat("Full calculation complete in:",t2[[3]],"seconds...\n")
    return(ret)
}