R/matthewCor.R

Defines functions .mcc mcc

Documented in mcc

## Matthews correlatipon coefficient
## TODO:: Give this function a more descriptive name
#' Compute a Mathews Correlation Coefficient
#'
#' The function computes a Matthews correlation coefficient for two factors
#' provided to the function. It assumes each factor is a factor of class labels,
#' and the enteries are paired in order of the vectors.
#'
#' Please note: we recommend you call set.seed() before using this function to
#' ensure the reproducibility of your results. Write down the seed number or
#' save it in a script if you intend to use the results in a publication.
#'
#' @examples
#' x <- factor(c(1,2,1,2,3,1))
#' y <- factor(c(2,1,1,1,2,2))
#' mcc(x,y)
#'
#' @param x,y \code{factor} of the same length with the same number of levels
#' @param nperm \code{numeric} number of permutations for significance
#' estimation. If 0, no permutation testing is done
#' @param nthread \code{numeric} can parallelize permutation texting using
#'   BiocParallels bplapply
#' @param alternative indicates the alternative hypothesis and must be one of
#' ‘"two.sided"’, ‘"greater"’ or ‘"less"’.  You can specify just
#' the initial letter.  ‘"greater"’ corresponds to positive
#' association, ‘"less"’ to negative association.
#' @param ... \code{list} Additional arguments
#'
#' @return A list with the MCC as the $estimate, and p value as $p.value
#' @export
mcc <- function(x, y, nperm = 1000, nthread = 1, alternative=c("two.sided", "less", "greater"), ...) {
    # PARAMETER CHANGE WARNING
    if (!missing(...)) {
        if ("setseed" %in% names(...)) {
            warning("The setseed parameter has been removed in this release to conform
              to Bioconductor coding standards. Please call set.seed in your
              script before running this function.")
        }
    }
    alternative <- match.arg(alternative)
    if ((length(x) != length(y)) || (!is.factor(x) || length(levels(x)) < 2) || (!is.factor(y) || length(levels(y)) < 2)) {
        stop("x and y must be factors of the same length with at least two levels")
    }
    if (length(levels(x)) != length(levels(y))) {

        warning("The number of levels x and y was different. Taking the union of all class labels.")
        levels(x) <- union(levels(x), levels(y))
        levels(y) <- union(levels(x), levels(y))

    }
    res <- list(estimate = NA, p.value = NA)
    ## compute MCC
    res[["estimate"]] <- .mcc(ct = table(x, y))
    ## compute significance of MCC using a permutation test
    if (nperm > 0) {
        splitix <- parallel::splitIndices(nx = nperm, ncl = nthread)
        splitix <- splitix[vapply(splitix, length, FUN.VALUE = numeric(1)) > 0]
        mcres <- BiocParallel::bplapply(splitix, function(x, xx, yy) {
            res <- vapply(x, function(x, xx, yy) {
                xx <- sample(xx)
                yy <- sample(yy)
                return(.mcc(ct = table(xx, yy)))
            }, xx = xx, yy = yy, FUN.VALUE = numeric(1))
            return(res)
        }, xx = x, yy = y)
        mcres <- unlist(mcres)
        #res[["p.value"]] <- sum(mcres > res["estimate"])/sum(!is.na(mcres))
        switch(alternative, "two.sided" = {
            if(res[["estimate"]] > 0) {
                res[["p.value"]] <- 2*sum(res[["estimate"]] <=  mcres )/sum(!is.na(mcres))
            } else {
                res[["p.value"]] <- 2*sum(res[["estimate"]] >=  mcres )/sum(!is.na(mcres))
            }
        }, "less" = {
            res[["p.value"]] <- sum(res[["estimate"]] >=  mcres )/sum(!is.na(mcres)) 
        }, "greater" = {
            res[["p.value"]] <- sum(res[["estimate"]] <=  mcres )/sum(!is.na(mcres))
        })
        if (res["p.value"] == 0) {
            res["p.value"] <- 1/(nperm + 1)
        }
    }
    return(res)
}

## Helper functions


## Just a lot of math, multiclass MCC
## https://en.wikipedia.org/wiki/Matthews_correlation_coefficient#Multiclass_case
.mcc <-
  function(ct, nbcat=nrow(ct)) {
    if(nrow(ct) != ncol(ct)) { stop("the confusion table should be square!") }

    ## The following code checks if there would be a division by 0 error in the computation on the Matthew's
    ## correlation coefficient. In practice, this occurs when there are multiple categories but all predictions end up
    ## in only one category - in this case, the denominator is 0. This is dealt with by adding a psuedocount to each.
    ## This is chosen because the mcc of a matrix of 1s is 0, and such should not affect the value in expectation.
    ## Note this is not necessary when alll entries lie on the diagonal.
    if( !(sum(ct)==sum(diag(ct))) && ## Check if all entries are on the diagonal. If they are, no adjustment necessary.
        ((sum(rowSums(ct) == 0) == (nbcat-1)) || ## Otherwise, check if there is entries only in one column or only in one row.
        (sum(colSums(ct) == 0) == (nbcat-1)))) {
        ct <- ct + matrix(1, ncol=nbcat, nrow=nbcat)
      } ### add element to categories if nbcat-1 predictive categories do not contain elements. Not in case where all are correct!

    if (sum(ct, na.rm = TRUE) <= 0) {
        return(NA)
    }

    myx <- matrix(TRUE, nrow = nrow(ct), ncol = ncol(ct))
    diag(myx) <- FALSE
    if (sum(ct[myx]) == 0) {
        return(1)
    }
    myperf <- 0
    for (k in seq_len(nbcat)) {
        for (m in seq_len(nbcat)) {
            for (l in seq_len(nbcat)) {
                myperf <- myperf + ((ct[k, k] * ct[m, l]) - (ct[l, k] * ct[k, m]))
            }
        }
    }
    aa <- 0
    for (k in seq_len(nbcat)) {
        cc <- 0
        for (l in seq_len(nbcat)) {
            cc <- cc + ct[l, k]
        }
        dd <- 0
        for (f in seq_len(nbcat)) {
            for (g in seq_len(nbcat)) {
                if (f != k) {
                  dd <- dd + ct[g, f]
                }
            }
        }
        aa <- aa + (cc * dd)
    }
    bb <- 0
    for (k in seq_len(nbcat)) {
        cc <- 0
        for (l in seq_len(nbcat)) {
            cc <- cc + ct[k, l]
        }
        dd <- 0
        for (f in seq_len(nbcat)) {
            for (g in seq_len(nbcat)) {
                if (f != k) {
                  dd <- dd + ct[f, g]
                }
            }
        }
        bb <- bb + (cc * dd)
    }

    myperf <- myperf/(sqrt(aa) * sqrt(bb))
    return(myperf)
}
bhklab/CoreGx documentation built on March 14, 2024, 3:04 a.m.