R/aurocs.R

Defines functions auroc_p_value colVars_sparse compute_tie_correction rank_zero rank_sparse compute_aurocs design_matrix

Documented in compute_aurocs design_matrix

#' Convert character vector to one-hot encoded matrix, with one column per
#' unique string.
#'
#' @param cell_type Character vector.
#' @param scale_columns Boolean, should columns be normalized to sum to 1?
#'
#' @export
design_matrix = function(cell_type, scale_columns=FALSE) {
  factors = levels(as.factor(cell_type))
  if (length(factors) > 1) {
    result = stats::model.matrix(~as.factor(cell_type)-1)
  } else {
    result = matrix(1, nrow = length(cell_type), ncol = 1)
  }
  colnames(result) = factors
  if (scale_columns) {
    result = scale(result, center = FALSE, scale = colSums(result))
  }
  return(result)
}

#' Efficient computation of AUROCs (vectorized for predictors and categories).
#'
#' @param predictors Matrix where each column is a predictor and each row is a sample.
#' @param label_matrix One-hot encoded matrix where columns are categories and each row is a sample.
#' The number of rows must be identical to the number of rows in predictors.
#' 1 indicates that the sample on this row belongs to the category on this column.
#' @param compute_tie_correction Boolean. If TRUE, for each AUROC, compute classical
#' tie correction (only useful for p-value computation).
#' @return An AUROC matrix of size #predictors x #categories, containing 
#' all (predictor, category) combinations.
compute_aurocs = function(predictors, label_matrix, compute_tie_correction = FALSE) {
    label_matrix = as.matrix(label_matrix)
    n_positives = colSums(label_matrix)
    n_negatives = nrow(label_matrix) - n_positives
    if (methods::is(predictors, "dgCMatrix")) {
        # we shift all ranks after the matrix multiplication to keep
        # the predictor matrix sparse
        ranks = rank_sparse(predictors)
        sum_of_positive_ranks = as.matrix(Matrix::crossprod(label_matrix, ranks)) +
            outer(n_positives, rank_zero(predictors))
    } else {
        predictors = as.matrix(predictors)
        ranks = matrixStats::colRanks(predictors, ties.method = "average", preserveShape=TRUE)
        sum_of_positive_ranks = crossprod(label_matrix, ranks)
        colnames(sum_of_positive_ranks) = colnames(predictors)
    }
    result = (sum_of_positive_ranks / n_positives - (n_positives+1)/2) / n_negatives

    if (compute_tie_correction) {
        tie_correction = compute_tie_correction(ranks)
    } else {
        tie_correction = 0
    }
    return(list(aurocs = result, tie_corrections = tie_correction))
}

# these 2 functions only rank non-zeros, implicitly shifting the matrix of ranks
# to keep the matrix sparse according to the formula:
#   true_ranks(M) = rank_zero(M) + rank_sparse(M), where:
#     + rank_zero(M) = (#zeros + 1)/2 = ((nrow(M) - diff(M@p)) + 1) / 2
#     + rank_sparse(M) = rank(nonzero element) + (#zeros - 1)/2
# faster than solution using base::tapply, probably data.table would be faster
rank_sparse = function(M) {
    nnz = diff(M@p)
    # declare variables for compatibility with R CMD check
    x = j = NULL
    data_ = data.table::data.table(x = M@x, j = rep.int(1:ncol(M), nnz))
    ranks = data_[, list(rank = c(matrixStats::colRanks(as.matrix(x), ties.method = "average"))), by=j]
    R = M
    R@x = ranks$rank + rep.int((nrow(M)-nnz-1)/2, nnz)
    return(R)
}

rank_zero = function(M) {
    return(((nrow(M) - diff(M@p)) + 1) / 2)
}

# For the following functions, see
#
#   https://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test#Normal_approximation_and_tie_correction
#
# The tie correction effectively computes lost variance because of ties (compared to discrete uniform).
# Computing the Wikipedia formula naively is slow, this method is equivalent and fast.
compute_tie_correction = function(ranks) {
    if (methods::is(ranks, "dgCMatrix")) {
        observed_var = colVars_sparse(ranks)
    } else {
        observed_var = matrixStats::colVars(as.matrix(ranks))
    }
    max_var = stats::var(seq_len(nrow(ranks)))
    return((max_var-observed_var) * 12 / nrow(ranks))
}

colVars_sparse = function(M) {
    result = (Matrix::colMeans(M**2) - Matrix::colMeans(M)**2)*nrow(M)/(nrow(M)-1)
    return(result)
}

auroc_p_value = function(aurocs, label_matrix, two_tailed = TRUE, tie_correction = 0, log.p = FALSE) {
    p = colSums(label_matrix)
    n = nrow(label_matrix) - p
  
    # Careful: NAs may arise from tie_correction (predictor with 0 variance)
    if (length(tie_correction) > 1) {
        Z = (aurocs - 0.5) * sqrt(12*n*p)
        Z = t(t(Z) / sqrt(nrow(label_matrix)+1-tie_correction))
    } else {
        Z = (aurocs - 0.5) / sqrt((nrow(label_matrix)+1-tie_correction)/(12*n*p))
    }
  
    result = Z
    if (two_tailed) {
        is_not_na = !is.na(Z)
        neg_tail = stats::pnorm(Z[Z<=0 & is_not_na], log.p = log.p)
        pos_tail = stats::pnorm(Z[Z>0 & is_not_na], lower.tail=FALSE, log.p = log.p)
        if (log.p) {
            neg_tail = neg_tail + log(2)
            pos_tail = pos_tail + log(2)
        } else {
            neg_tail = neg_tail*2
            pos_tail = pos_tail*2
        }
        result[Z<=0 & is_not_na] = neg_tail
        result[Z>0 & is_not_na] = pos_tail
    } else {
        result = stats::pnorm(Z, lower.tail=FALSE, log.p = log.p)
    }
    return(result)
}
gillislab/MetaMarkers documentation built on April 24, 2021, 9:25 p.m.