R/bayeserror.R

## bayeserror.R

randomForest_classify <- function(x, y, newdata) {
    fit <- randomForest::randomForest(x = x, y = factor(y))
    preds <- predict(fit, newdata = newdata, type = "prob")
    return(preds)
}
attr(randomForest_classify, "name") <- "randomForest"
attr(randomForest_classify, "shortname") <- "RF"

naivebayes_classify <- function(x, y, newdata) {
    fit <- naivebayes::naive_bayes(x = x, y = factor(y))
    preds <- predict(fit, newdata = newdata, type = "prob")
    return(preds)
}
attr(naivebayes_classify, "name") <- "naiveBayes"
attr(naivebayes_classify, "shortname") <- "NB"

svm_classify <- function(x, y, newdata) {
    fit <- e1071::svm(x = x, y = factor(y), probability = TRUE)
    preds <- attr(predict(fit, newdata = newdata, probability = TRUE),
                  "probabilities")
    return(preds)
}
attr(svm_classify, "name") <- "svm"
attr(svm_classify, "shortname") <- "SVM"

#' Generate predictions on a cross-validation fold.
#'
#' @param x a numeric feature matrix
#' @param y a 0/1 response vector
#' @param fold an origami fold object
#' @param classifiers a list of `f(x, y, newdata)` classifier functions
#'
#' @return an array of class probability predictions with dimensions
#'     (observation, class, classifier)
#'
#' @export
#' @keywords internal
predict_on_fold <- function(x, y, fold, classifiers) {
    checkmate::assert_matrix(x,
                             mode = "numeric",
                             any.missing = FALSE)
    checkmate::assert_integerish(y,
                                 lower = 0L, upper = 1L,
                                 any.missing = FALSE,
                                 len = nrow(x))
    checkmate::assert_list(fold)
    checkmate::assert_list(classifiers,
                           types = c("character", "function"))
    ## Get training and prediction folds.
    x_train <- origami::training(x = x, fold = fold)
    x_val <- origami::validation(x = x, fold = fold)
    y_train <- origami::training(x = y, fold = fold)
    y_val <- origami::validation(x = y, fold = fold)
    ## Make predictions on prediction set, returning a 3-dimensional
    ## array of probabilities: observation x class x classifier
    ps <- abind::abind(lapply(classifiers,
                              function(classify)
                                  classify(x_train, y_train, x_val)),
                       along = 3L)
    checkmate::assert_array(ps,
                            mode = "numeric",
                            d = 3L,
                            any.missing = FALSE)
    ## Prediction probabilities for the ensemble classifier, computed by
    ## averaging the class probabilities across the base classifiers.
    p_ave <- apply(ps,
                   MARGIN = c(1L, 2L),
                   FUN = mean)
    ## Collect predictions
    classifier_names <- lapply(classifiers,
                               function(classify)
                                   attr(classify, "shortname"))
    ps <- abind::abind(ps, p_ave,
                       along = 3L,
                       new.names = list(NULL, NULL,
                                        c(classifier_names, "AVE")))
    attr(ps, "index") <- fold$validation
    return(ps)
}

#' Merge folded data
#'
#' @param folded_data a list of arrays containing the results of a
#'     computation on folded data, one array for each fold.
#'
#' @return a single array with the per-fold arrays merged along the
#'     first (observation) dimension, using each fold's `index` attribute
#'
#' @export
#' @keywords internal
merge_folds <- function(folded_data) {
    checkmate::assert_list(folded_data, types = "array")
    lapply(folded_data,
           function(fold)
               assertthat::assert_that(assertthat::has_attr(fold, "index"),
                                       msg = "Fold is missing index."))
    merged_array <- array(data = NA,
                          dim = attr(folded_data, "ds"))
    for(arr in folded_data) {
        index <- as.integer(attr(arr, "index"))
        merged_array[index, , ] <- arr
    }
    dimnames(merged_array) <- attr(folded_data, "dnames")
    ## checkmate
    return(merged_array)
}

#' Generate class prediction probabilities
#'
#' This function will generate class prediction probabilities on a
#' dataset from a list of classifiers using a specified resampling
#' method.
#' 
#' @param x A matrix with observations as rows and features as
#'     columns.
#'
#' @param y A vector of 0/1 response classes
#'
#' @param classifiers A list of classifier functions. Each function
#'     should be of the form `f(x, y, newdata)` and have the attribute
#'     `shortname` to identify its predictions.
#'
#' @param v For k-fold cross-validation, an integer specifying the
#'     number of folds to use when making predictions; for instance,
#'     `v = 10` means to use 10-fold CV.
#'
#' @return A three-dimensional array containing class prediction
#'     probabilities. Its dimensions are (observation, class,
#'     classifier). The last entry on the third dimension is `"AVE"`,
#'     the predictions for the ensemble classifier.
#' @export
predict_ensemble <- function(x, y, classifiers, v = 10L) {
    checkmate::assert_matrix(x)
    checkmate::assert_numeric(y)
    checkmate::assert_list(classifiers)
    ## checkmate
    folds <- origami::make_folds(x,
                                 ## v-fold CV
                                 fold_fun = origami::folds_vfold, V = v,
                                 ## Balance samples across classes
                                 strata_ids = y)
    folded_data <- lapply(folds,
                          function(fold)
                              predict_on_fold(x, y, fold, classifiers))
    attr(folded_data, "ds") <- c(nrow(x),
                                 2, # binary classification
                                 length(classifiers) + 1)
    attr(folded_data, "dnames") <- list(NULL, NULL,
                                        append(lapply(classifiers,
                                                      function(c)
                                                          attr(c, "shortname")),
                                               "AVE"))
    merged_data <- merge_folds(folded_data)
    ## checkmate
    return(merged_data)
}
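
## Usage sketch (illustrative only): cross-validated class probabilities
## for the three wrappers defined above on simulated data. The names
## `x_demo` and `y_demo` are placeholders, not package objects.
##     x_demo <- matrix(rnorm(200 * 4), ncol = 4)
##     y_demo <- rbinom(200, size = 1, prob = plogis(x_demo[, 1]))
##     probs <- predict_ensemble(x_demo, y_demo,
##                               classifiers = list(randomForest_classify,
##                                                  naivebayes_classify,
##                                                  svm_classify),
##                               v = 10L)
##     dim(probs)  # 200 x 2 x 4; the last slice of the 3rd dimension is "AVE"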

#' Compute classifier error from a matrix of class predictions.
#'
#' @param predictions a matrix of 0/1 class predictions, one column per
#'     classifier
#' @param true_classes a vector of the observed 0/1 classes
#' @param method the error metric; currently the misclassification rate
#'     (one minus accuracy) is always used
#'
#' @return a vector of error rates, one per classifier
ensemble_errors <- function(predictions, true_classes, method = "accuracy") {
    checkmate::assert_matrix(predictions, mode = "integerish")
    checkmate::assert_integerish(predictions,
                                 lower = 0L, upper = 1L)
    checkmate::assert_integerish(true_classes,
                                 lower = 0L, upper = 1L)
    ## Misclassification rate (zero-one loss) for one classifier's predictions.
    loss <- function(y_pred) {
        MLmetrics::ZeroOneLoss(y_pred, true_classes)
    }
    err <- apply(X = predictions,
                 MARGIN = 2,
                 FUN = loss)
    ## checkmate
    return(err)
}

#' Dual Total Correlation for Binary Data
#'
#' @param X a matrix whose columns are 0/1 class vectors
#' @param normalized if `TRUE`, return the dual total correlation as a
#'     fraction of the joint entropy of `X`
dual_total_correlation <- function(X, normalized = FALSE) {
    checkmate::assert_matrix(X, mode = "integerish")
    entropy <- infotheo::entropy(X)
    cond_entropy <-
        lapply(seq_len(ncol(X)),
               function(i)
                   infotheo::condentropy(X=X[, i], Y=X[, -i]))
    cond_entropy <- unlist(cond_entropy)
    sum_cond_entropy <- sum(cond_entropy)
    dtc <- entropy - sum_cond_entropy
    result <- ifelse(normalized,
                     dtc / entropy,
                     dtc)
#    checkmate::assert_number(result, lower = 0.0, upper = 1.0)
    return(result)
}

#' Average Mutual Information
#'
#' The mutual information between each column of `X` and `ave`, summed
#' and expressed as a fraction of the joint entropy of `X`.
#'
#' @param X a matrix whose columns are 0/1 class vectors
#' @param ave a 0/1 vector of ensemble class predictions
average_mutual_information <- function(X, ave) {
    checkmate::assert_matrix(X, mode = "integerish")
    checkmate::assert_integerish(ave)
    mi <- apply(X = X,
                MARGIN = 2,
                FUN = function (x) infotheo::mutinformation(x, ave))
    ent <- infotheo::entropy(X)
    sum_mi <- sum(mi)
    ami <- sum_mi / ent
#    checkmate::assert_number(ami, lower = 0.0, upper = 1.0)
    return(ami)
}

#' Get the mutual-information-based correlated error for predicted classes.
#'
#' @param total matrix of base classifier 0/1 predictions
#' @param ave vector of ensemble 0/1 predictions
#' @param method the correlation measure: `"ami"` for average mutual
#'     information with the ensemble predictions, or `"dtc"` for the
#'     normalized dual total correlation of the base predictions
error_correlation <- function(total, ave, method = "ami") {
    checkmate::assert_matrix(total, mode = "numeric")
    checkmate::assert_numeric(ave)
    ## Discretize the predictions before computing information measures.
    X <- as.matrix(infotheo::discretize(total))
    ave <- as.matrix(infotheo::discretize(ave))
    ## Estimate the correlated error with the requested measure.
    delta <- switch(method,
                    "ami" = average_mutual_information(X, ave),
                    "dtc" = dual_total_correlation(X, normalized = TRUE),
                    dual_total_correlation(X, normalized = TRUE))
#    checkmate::assert_number(delta, lower = 0.0, upper = 1.0)
    return(delta)
}

bayeserror_formula <- function(N, e_total, e_ave, delta) {
    checkmate::assert_int(N, lower = 2L)
    checkmate::assert_number(e_total, lower = 0.0, upper = 1.0)
    checkmate::assert_number(e_ave, lower = 0.0, upper = 1.0)
#    checkmate::assert_number(delta, lower = 0.0, upper = 1.0)
    be <- ((N * e_ave - ((N - 1L) * delta + 1L) * e_total) /
           ((N - 1L) * (1L - delta)))
    return(be)
}
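
## Worked example with hypothetical numbers: for N = 3 base classifiers
## with mean base error e_total = 0.12, ensemble error e_ave = 0.10, and
## error correlation delta = 0.2,
##     bayeserror_formula(3L, 0.12, 0.10, 0.2)
## gives (3 * 0.10 - (2 * 0.2 + 1) * 0.12) / (2 * (1 - 0.2)) = 0.0825.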

#' Estimate the Bayes error for a data set using a list of classifiers.
#'
#' @param x a matrix of features
#' @param y a vector of the observed 0/1 classes
#' @param classifiers a list of classifier functions (or their names)
#' @param err_method the classification loss method
#' @param corr_method the method used to estimate the correlation of errors
#' @param v the number of cross-validation folds
#'
#' @return a list with the per-classifier errors, the mean base
#'     classifier error `e_total`, the ensemble error `e_ave`, the error
#'     correlation `delta`, and the Bayes error estimate `e_bayes`
bayeserror <- function(x,
                       y,
                       classifiers,
                       err_method = "accuracy",
                       corr_method = "ami",
                       v = 10L) {
    ## Check input.
    checkmate::assert_matrix(x,
                             mode = "numeric")
    checkmate::assert_integerish(y,
                                 lower = 0L, upper = 1L,
                                 any.missing = FALSE,
                                 len = nrow(x))
    checkmate::assert_list(classifiers,
                           types = c("character", "function"))

    ## Posterior class probabilities.
    pred_prob <- predict_ensemble(x, y, v = v, classifiers = classifiers)
    ## Convert to 0/1 class predictions: the second column of the class
    ## dimension holds the predicted probability of class 1 (assuming the
    ## probability columns follow the factor level order c("0", "1")).
    pred_class <- round(pred_prob[, 2, ]) # array with dims (obs, classifier)
    ## Compute errors of the predictions
    errs <- ensemble_errors(pred_class, y, err_method)
    ## Form bayes error estimator terms.
    ## Number of base classifiers
    N <- length(classifiers)
    ## Class predictions of base classifiers
    class_total <- pred_class[, 1:N]
    ## Class predictions of ensemble classifier
    class_ave <- pred_class[, N + 1]
    ## Mean error of base classifiers
    e_total <- mean(head(errs, -1))
    ## Error of ensemble classifier
    e_ave <- tail(errs, 1)
    names(e_ave) <- NULL
    ## Estimated error correlation
    delta <- error_correlation(class_total, class_ave, corr_method)
    e_bayes <- bayeserror_formula(N, e_total, e_ave, delta)
    return(list("errors" = errs, "e_total" = e_total,
                "e_ave" = e_ave, "delta" = delta,
                "e_bayes" = e_bayes))
}
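
## End-to-end sketch (illustrative only): estimate the Bayes error on
## simulated data with the three wrappers defined above. `x_demo` and
## `y_demo` are placeholder names.
##     x_demo <- matrix(rnorm(500 * 3), ncol = 3)
##     y_demo <- rbinom(500, size = 1, prob = plogis(2 * x_demo[, 1]))
##     bayeserror(x_demo, y_demo,
##                classifiers = list(randomForest_classify,
##                                   naivebayes_classify,
##                                   svm_classify))
## The result is a list with the per-classifier errors, `e_total`,
## `e_ave`, the error correlation `delta`, and the estimate `e_bayes`.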


########## Nonparametric Estimate ##########

#' Upper Bound of the Bayes Error Rate given by the Mahalanobis Distance
#'
#' Provides an upper bound on the Bayes error rate of a dataset with
#' binary classes. The bound requires no assumptions about the
#' distribution of the data.
#' 
#' @param x a data matrix with numeric values
#' @param y a binary vector
#' @return an upper bound on the Bayes error rate of `x` with respect to `y`
#'
#' @details
#' The Mahalanobis distance between class 0 and class 1 is given by
#' \deqn{M_d = (\mu_0 - \mu_1)^T \Sigma^{-1}(\mu_0 - \mu_1)}{%
#' M_d = (mu_0 - mu_1)' Sigma^-1 (mu_0 - mu_1)}
#' where \eqn{\mu_0} and \eqn{\mu_1} are the class means in `x` for
#' class 0 and 1, respectively. This provides an upper bound on the Bayes
#' error \eqn{\epsilon_{bayes}} by
#' \deqn{\epsilon_{bayes} \leq \frac{2 Pr(0) Pr(1)}{1 + Pr(0) Pr(1) M_d}}{%
#' \epsilon_bayes <= (2 Pr(0) Pr(1)) / (1 + Pr(0) Pr(1) M_d)}
#' where \eqn{Pr(0)} and \eqn{Pr(1)} are the class probabilities of 0 and
#' 1.
#' 
#' @export
mahalanobis_bound <- function(x, y) {
    checkmate::assertMatrix(x, mode = "numeric", any.missing = FALSE)
    checkmate::assertIntegerish(y, lower = 0, upper = 1)
    ## Empirical class probabilities
    p_1 <- mean(y)
    p_0 <- 1 - p_1
    ## Probability-weighted pooled covariance matrix
    sigma_0 <- cov(x[y == 0, ])
    sigma_1 <- cov(x[y == 1, ])
    sigma <- sigma_0 * p_0 + sigma_1 * p_1
    ## Squared Mahalanobis distance between the class means
    m_d <- mahalanobis(colMeans(x[y == 0, ]),
                       colMeans(x[y == 1, ]),
                       sigma)
    ## Upper Bound of the Bayes error rate of x
    upper <- (2 * p_0 * p_1) / (1 + p_0 * p_1 * m_d)
    names(upper) <- "upper"
    return(upper)
}
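
## Sketch (illustrative only): for two well-separated Gaussian classes
## the bound is small; as the class means approach each other it rises
## toward the prior-based limit 2 * Pr(0) * Pr(1).
##     x0 <- matrix(rnorm(100 * 2, mean = 0), ncol = 2)
##     x1 <- matrix(rnorm(100 * 2, mean = 2), ncol = 2)
##     mahalanobis_bound(rbind(x0, x1), rep(c(0L, 1L), each = 100L))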


#' Bhattacharyya distance bounds on the Bayes error rate
#'
#' @param x a data matrix with numeric values
#' @param y a binary vector
#' @return a named vector with the lower and upper bounds
#'
#' @export
bhattacharyya_bound <- function(x, y) {
    checkmate::assertMatrix(x, mode = "numeric", any.missing = FALSE)
    checkmate::assertIntegerish(y, lower = 0, upper = 1)
    ## Bhattacharyya distance between class conditional distributions
    x_0 <- x[y == 0, ]
    x_1 <- x[y == 1, ]
    mu_0 <- colMeans(x_0)
    mu_1 <- colMeans(x_1)
    sigma_0 <- cov(x_0)
    sigma_1 <- cov(x_1)
    sigma <- 0.5 * (sigma_0 + sigma_1)
    rho <- (1/8) * t(mu_1 - mu_0) %*% solve(sigma) %*% (mu_1 - mu_0) +
        0.5 * log(det(sigma) / sqrt(det(sigma_0) * det(sigma_1)))
    p_1 <- mean(y)
    p_0 <- 1 - p_1
    lower <- 0.5 * ( 1 - sqrt(1 - 4 * p_0 * p_1 * exp(-2 * rho)))
    upper <- exp(-rho) * sqrt(p_0 * p_1)
    bounds <- c(lower, upper)
    names(bounds) <- c("lower", "upper")
    return(bounds)
}

#' Nearest-neighbor bounds on the Bayes error rate
#'
#' @param x a data matrix with numeric values
#' @param y a binary vector
#' @return a named vector with the lower and upper bounds
#'
#' @export
nearest_neighbor_bound <- function(x, y) {
    checkmate::assertMatrix(x, mode = "numeric", any.missing = FALSE)
    checkmate::assertIntegerish(y, lower = 0, upper = 1)
    ## Leave-one-out 1-nearest-neighbor error rate
    e_nn <- MLmetrics::ZeroOneLoss(class::knn.cv(x, y), y)
    lower <- 0.5 * (1 - sqrt(1 - 2 * e_nn))
    upper <- e_nn
    bounds <- c(lower, upper)
    names(bounds) <- c("lower", "upper")
    return(bounds)
}
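
## Sketch (illustrative only): the nonparametric bounds can be compared
## on the same simulated data used for `mahalanobis_bound()` above.
##     x_all <- rbind(x0, x1)
##     y_all <- rep(c(0L, 1L), each = 100L)
##     bhattacharyya_bound(x_all, y_all)    # named lower/upper bounds
##     nearest_neighbor_bound(x_all, y_all) # leave-one-out 1-NN bounds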