#' @title Calculate Average Conditional Mutual Information (ACMI) contribution
#' of each vocabulary term in a sparse DTM.
#' @description Generates a vector of contributions to ACMI for each vocabulary
#' term in a DTM, based on a contingency table generated by the
#' contingency_table() function.
#'
#' @param joint_dist A matrix of class "simple_triplet_matrix" generated by the
#' contingency_table() function. Matching pairs of metadata combinations must be
#' sequential rows.
#' @return A vector recording the ACMI contribution of each variable.
#' @export
ACMI_contribution <- function(joint_dist){
ptm <- proc.time()
# deal with the case where we have a sparse matrix
if (class(joint_dist) != "simple_triplet_matrix") {
joint_dist <- slam::as.simple_triplet_matrix(joint_dist)
}
average_contribution <- rep(0, joint_dist$ncol)
total_weight <- sum(joint_dist$v)
cat("Normalizing, calculating row and column sums...\n")
#noramlize joint dist and take row and column sums
num_row_pairs <- joint_dist$nrow/2
# store all contributions for each pair
# storage_list <- vector(mode = "list", length = num_row_pairs)
all_negative_vocab <- NULL
all_negative_vocab_inds <- NULL
# loop over row pairs and calculate ACMI contribution
for (l in 1:num_row_pairs) {
ptm1 <- proc.time()
cat("Currently working on row pair:",l, "of",num_row_pairs,"\n")
cur <- joint_dist[(2*l - 1):(2*l),]
cur_weight <- sum(cur$v)
cur$v <- cur$v/sum(cur$v)
normalized_column_sums <- slam::col_sums(cur)
normalized_row_sums <- slam::row_sums(cur)
i <- cur$i - 1
j <- cur$j - 1
v <- cur$v
num_entries <- length(i)
ret <- Fast_Sparse_Mutual_Information_Full(
i,
j,
v,
normalized_column_sums,
normalized_row_sums,
num_entries)
MI <- ret[[1]]
column_contributions <- as.numeric(ret[[2]])
cat("Full Mutual Information:", MI,"\n")
if (MI > 0) {
dense <- sparse_to_dense_matrix(cur)
dist_sum <- sum(cur)
contributions <- calculate_mutual_info_contributions(
normalized_column_sums,
normalized_row_sums,
column_contributions,
dense,
cur$ncol,
dist_sum)
if (length(which(is.na(contributions))) > 0) {
cat("There was a problem with NA contributions! \n")
} else {
contributions <- MI - contributions
# make sure to zero out contributions for terms that did not appear
# at all
zero_out <- which(normalized_column_sums == 0)
cat("Terms to zero out:",length(zero_out),"\n")
cat("Sum of terms to zero out:",sum(contributions[zero_out]),"\n")
contributions[zero_out] <- 0
average_contribution <- average_contribution + (cur_weight/total_weight) * contributions
#store the current entries
# colinds <- which(contributions != 0)
# curdat <- data.frame(column_index = colinds,
# contribution = contributions[colinds])
temp <- which(contributions < 0)
if (length(temp) > 0) {
# storage_list[[l]] <- cur$dimnames[[2]][temp]
all_negative_vocab <- c(all_negative_vocab,temp)
all_negative_vocab_inds <- c(all_negative_vocab_inds,rep(l,length(temp)))
}
}
} else {
cat("MI too close to zero or negative (problem with machine precision)...\n")
}
t3 <- proc.time() - ptm1
cat("Current calculation complete in:",t3[[3]],"seconds...\n")
}
negative_vocab <- unique(all_negative_vocab)
ret <- list(average_contribution = average_contribution,
negative_vocab = negative_vocab,
all_negative_vocab = all_negative_vocab,
all_negative_vocab_inds = all_negative_vocab_inds)
t2 <- proc.time() - ptm
cat("Full calculation complete in:",t2[[3]],"seconds...\n")
return(ret)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.