#' Calculates multiple 2 class mutual information scores from confusion matrix probabilities in dplyr friendly manner
#'
#' The purpose of this is to make it possible to calculate MI in a DBPLYR sql table
#'
#' @param df a dataframe containing one observation per row & full confusion matrix and marginal probabilities: i.e. p_x1, p_x0, p_y1, p_y0, p_x1y1, p_x0y1, p_x1y0, and p_x0y0 columns (see probabilitiesFromCounts)
#' @return the datatable with additional columns for entropy, mutual information, pointwise mutual information and normalised pointwise mutual information for all various combinations of outcome
#' @import dplyr
#' @export
calculateBinaryMI = function(df) {
return(
df %>% mutate(
pmi_x1y1 = ifelse( p_x1y1==0, ifelse(p_x1==0 | p_y1==0, 0, NA), log(p_x1y1/(p_x1*p_y1)) ),
pmi_x0y1 = ifelse( p_x0y1==0, ifelse(p_x0==0 | p_y1==0, 0, NA), log(p_x0y1/(p_x0*p_y1)) ),
pmi_x1y0 = ifelse( p_x1y0==0, ifelse(p_x1==0 | p_y0==0, 0, NA), log(p_x1y0/(p_x1*p_y0)) ),
pmi_x0y0 = ifelse( p_x0y0==0, ifelse(p_x0==0 | p_y0==0, 0, NA), log(p_x0y0/(p_x0*p_y0)) )
) %>% mutate(
H_x = ifelse(p_x1==0|p_x0==0,0,-(p_x1*log(p_x1)+(p_x0*log(p_x0)))),
H_y = ifelse(p_y1==0|p_y0==0,0,-(p_y1*log(p_y1)+(p_y0*log(p_y0)))),
I = (
ifelse(p_x1y1==0|p_x1==0|p_y1==0, 0, p_x1y1*pmi_x1y1)+
ifelse(p_x0y1==0|p_x0==0|p_y1==0, 0, p_x0y1*pmi_x0y1)+
ifelse(p_x1y0==0|p_x1==0|p_y0==0, 0, p_x1y0*pmi_x1y0)+
ifelse(p_x0y0==0|p_x0==0|p_y0==0, 0, p_x0y0*pmi_x0y0)
)
) %>% mutate(
H_xy = H_x + H_y - I,
npmi_x1y1 = ifelse( p_x1y1==0, ifelse(p_x1==0 | p_y1==0, 0, -1), pmi_x1y1 / (-log(p_x1y1)) ),
npmi_x0y1 = ifelse( p_x0y1==0, ifelse(p_x0==0 | p_y1==0, 0, -1), pmi_x0y1 / (-log(p_x0y1)) ),
npmi_x1y0 = ifelse( p_x1y0==0, ifelse(p_x1==0 | p_y0==0, 0, -1), pmi_x1y0 / (-log(p_x1y0)) ),
npmi_x0y0 = ifelse( p_x0y0==0, ifelse(p_x0==0 | p_y0==0, 0, -1), pmi_x0y0 / (-log(p_x0y0)) )
)
)
}
#' Calculates multiple confusion matrix stats from mariginal probabilities in dplyr friendly manner
#'
#' The purpose of this is to make it possible to calculate accuracy stats in a DPLYR table (including dbplyr). Typically
#' this will come from some sort of threshold based classification task where the classification output
#' is a probability and the prediction class is a binary outcome.
#'
#' see also fBeta score vectorised function
#'
#' @param df a dataframe containing one observation per row & p_x1y1, p_x0y1, p_x1y0, and p_x0y0 columns (see probabilitiesFromConfusionMatrix)
#' @return the datatable with additional columns for confusion matrix stats: true_pos_rate / true_neg_rate / etc...
#' @import dplyr
#' @export
calculateConfusionMatrixStats = function(df) {
return(df %>% mutate(
true_pos_rate = p_x1y1/p_x1,
true_neg_rate = p_x0y0/p_x0,
false_pos_rate = p_x0y1/p_x0,
false_neg_rate = p_x1y0/p_x1,
neg_pred_value = p_x0y0/p_y0,
pos_pred_value = p_x1y1/p_y1,
specificity = true_neg_rate,
sensitivity = true_pos_rate,
precision = pos_pred_value,
recall = true_pos_rate,
accuracy = p_x1y1+p_x0y0,
f1 = 2*precision*recall/(precision+recall),
mcc = (p_x1y1*p_x0y0 - p_x0y1*p_x1y0) / sqrt(p_x1*p_x0*p_y1*p_y0),
informedness = true_pos_rate+true_neg_rate-1,
youdens_j = informedness
))
}
#' Calculate single mutual information score from multiclass groups in dplyr friendly manner.
#'
#' The purpose of this is to make it possible to calculate MI from tidy data. This is useful where you have a a data from that
#' represents a multi-class confusion matrix with unique combinations of inputs and probabilities for the co-occurrence and marginal probabilities
#' already calculated. Typically this will be generated by the probabilitiesFromCooccurrence function.
#'
#' @param df a dataframe containing one observation per row & minimally p_x1y1, p_x1, p_y1 columns (see probabilitiesFromCounts / probabilitiesFromCooccurrence)
#' @return the datatable with additional columns for MI
#' @import dplyr
#' @export
calculateMultiClassMI = function(df) {
return(
df %>% mutate(
pmi_x1y1 = ifelse( p_x1y1==0, ifelse(p_x1==0 | p_y1==0, 0, NA), log(p_x1y1/(p_x1*p_y1)) ),
I_xy = ifelse(p_x1y1==0|p_x1==0|p_y1==0, 0, p_x1y1*pmi_x1y1)
) %>% summarise(
N = NA,
I = sum(I_xy, na.rm=TRUE),
I_sd = NA,
method = "Empirical"
)
)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.