R/metrics.R

Defines functions logFC auroc normalize.vec sv fast.sv rank1.residuals fast.condNumber condNumber.norm condNumber pinv.svd cdev.from.scaling.factors cdev cdev.generic

#' Condition-number-based deviation of X and Y,
#' for generally given X and Y.
#'
#' Much faster with thin matrix (nrow >> ncol).
#'
#' @import MASS
#' @param X m x n matrix
#' @param Y m x n matrix
#' @param always.small [=TRUE] if set to TRUE, cdev will be calculate for the smaller version of transformation matrix
#' @export
cdev.generic <- function(X, Y, always.small = TRUE) {
    if (!all(dim(X) == dim(Y))) stop("cdev is only defined for 2 matrices with the same shape.")
    if (always.small & (dim(X)[2] > dim(X)[1])) {
        return(fast.condNumber(MASS::ginv(t(X)) %*% t(Y)))
    } else {
        return(fast.condNumber((MASS::ginv(X) %*% Y)))
    }
}

#' Condition-number-based deviation of X and Y,
#' assuming X can be transformed into Y by scaling each columns with
#' a coefficient.
#'
#' @param X a matrix
#' @param Y another matrix with the same shape as X
#' @param generic whether to use the generic definition of cdev, without assuming only column-scaling is involved between X and Y
#' @export
cdev <- function(X, Y, scale = FALSE, generic = FALSE, ...) {
    if (generic) return(cdev.generic(X,Y,...))
    # Pick one of the genes with minimum counts > 0,
    # reverse-engineer the scaling factors
    an_eligible_feature = which(apply(X, MARGIN = 1, min) > 0)[1]
    scalingFactors = X[an_eligible_feature,] / Y[an_eligible_feature,]
    return(cdev.from.scaling.factors(scalingFactors))
}

#' @export
cdev.from.scaling.factors <- function(scalingFactors) {
    return(max(scalingFactors) / min(scalingFactors))
}

#' Compute the pseudoinverse using SVD
#'
pinv.svd = function(X) {
    x.svd = svd(X)
    return(x.svd$v %*% diag(1/x.svd$d) %*% t(x.svd$u))
}
#' Condition number of a matrix X
condNumber <- function(X) {
    sigmas = svd(X)[['d']]
    return(sigmas[1] / sigmas[length(sigmas)])
}

condNumber.norm <- function(X) {
    return(norm(X,'2') %*% norm(MASS::ginv(X), '2'))
}



#' Condition number using fast.svd
#'
#' When X is rectangular, the SVD of XX' (or X'X) will be faster if ncol > nrow (or nrow > ncol)
#' Since we only care about the ratio between first and last singular values,
#' there's no need to calculate V'
fast.condNumber <- function(X) {
    sigmas = svd(X, nu = 0, nv = 0)[['d']]
    return(sigmas[1] / sigmas[length(sigmas)])
}

#' rank-1 residuals
#'
#' @export
rank1.residuals <- function(X) {
    return(norm(normalize.vec(fast.sv(X))[-1], type='2'))
}

fast.sv <- function(X) {
     if (which.min(dim(X)) == 1) {
        xx = X %*% t(X)
    } else {
        xx = t(X) %*% X
    }
    return(sqrt(svd(xx, nu = 0, nv = 0)[['d']]))
}

sv <- function(X) {
    return(svd(X, nu = 0, nv = 0)[['d']])
}

normalize.vec <- function(x) {
    return(x / sqrt(sum(x^2)))
}

#' Area under the ROC curve
#' Source: https://mbq.me/blog/augh-roc/
#' @param score array of scores
#' @param labels boolean array of true labels, with TRUE being positive
auroc<-function(score, labels){
    n1<-sum(!labels); sum(labels)->n2;
    U<-sum(rank(score)[!labels])-n1*(n1+1)/2;
    return(1-U/n1/n2);
}

logFC = function(x, groups, log.base = 2, pseudocount = 0) {
    G = unique(groups)
    log(rowMeans(x[,groups == G[2], drop= FALSE]) + pseudocount, base = log.base) -
        log(rowMeans(x[,groups == G[1], drop = FALSE]) + pseudocount, base = log.base)
}
ttdtrang/cdev-paper documentation built on Dec. 23, 2021, 1:01 p.m.