R/auxFuns.R

# exprData=expr.data; gSets=gene.sets; minGsetSize=min.cl.sz
filterInputs <- function(exprData, gSets, minGsetSize) {
    # keep only genes in exprData and gene sets
    exprDataGenes <- rownames(exprData);
    geneSetsGenes <- unique(do.call(c, gSets));
    commonGenes <- intersect(exprDataGenes, geneSetsGenes);

    flog.info(paste("Genes removed from expression matrix:", length(exprDataGenes)-length(commonGenes)));
    exprData <- exprData[commonGenes,];
    flog.info(paste("Genes removed from gene sets:", length(geneSetsGenes)-length(commonGenes)));
    gSets <- lapply(gSets, intersect, commonGenes);

    # remove gene sets with less than minSz genes (detected in the experiment)
    keepGsets <- unlist(lapply(gSets, length) > minGsetSize);
    gSets <- gSets[keepGsets];
    flog.info(paste("Gene sets removed by min size:", sum(!keepGsets), ', kept:', length(gSets)));
    return(list(exprData=exprData, gSets=gSets));
}

#' @importFrom BiocParallel bpparam bplapply
#' @importFrom futile.logger flog.info
#' @include rankFunctions.R
# classes=l
getRankings <- function(exprData, classes, nPerm, rankFn, rankInParallel) {
    if (is.character(rankFn) && rankFn == 'MA') {
        flog.info('ebayes rank selected.');
        rankFn <- maRank;
    } else if (is.character(rankFn) && rankFn == 'RNA') {
        flog.info('Voom rank selected.');
        rankFn <- rnaRank;
    } else if (!is.function(rankFn)) {
        flog.info('Custon ranking function provided.');
        stop('rankFn must be one of "MA", "RNA", an R function, or a matrix.');
    }

    # generate permutations
    perms <- replicate(nPerm, sample(1:ncol(exprData), replace=FALSE));
    permsConds <- do.call(cbind, lapply(seq_len(ncol(perms)), function(i) {
        as.character(classes[perms[,i]]);
    }))
    perms <- t(perms[,!duplicated(t(permsConds))]);
    rm(permsConds);

    nPerm <- nrow(perms);
    flog.info(paste("Number of unique permutations:", nPerm));

    rankings <- matrix(0, nrow=nrow(exprData), ncol=nPerm+1);
    # gene scores for real data
    rankings[,1] <- rankFn(exprData, classes);

    # gene scores for permuted data
    if (rankInParallel) {
        whichLapply <- bplapply;
        flog.info(paste("Getting ranking at cores:", bpparam()$workers));
    } else {
        whichLapply <- lapply;
        flog.info(paste("Getting ranking at cores:", 1));
    }

    rankList <- whichLapply(seq_len(nPerm), function(i) {
        flog.info(paste0('Getting rank for perm: ', i));
        actRank <- rankFn(exprData, classes[perms[i,]]);
        return(actRank);
    });
    rankings[,-1] <- do.call(cbind, rankList);

    ## todo: maybe the best alternative would be delete the gene from
    # everywhere
    flog.info(paste("Percentage of NA in ranking:", sum(is.na(rankings))/length(rankings)));
    rankings[is.na(rankings)] <- mean(rankings, na.rm=TRUE);
    # genes which are NA are given the mean value of all genes,
    # so they dont contribute to enrichment score.

    return(rankings);
}

countHygeVarMean <- function (M, K) {
    N <- matrix(rep(seq_len(M), length(K)), ncol=length(K));
    K <- matrix(rep(K, M), byrow=TRUE, ncol=length(K));
    mean <- N * K/M;
    var <- N * (K * (M - K)/M^2) * ((M - N)/(M - 1));
    return(list(mean=mean, var=var));
}

countProbSum <- function (M, hyge.mean, hyge.var) {
    tulos <- matrix(0, nrow=length(hyge.mean));
    N_tab <- matrix(c(2:M), byrow=FALSE);
    tulos[1] <- hyge.mean[1];
    tulos[2:M] <- N_tab/(N_tab-1) * hyge.mean[2:M] - (hyge.mean[2:M]^2 +
                  hyge.var[2:M])/(N_tab-1);
    return(tulos);
}

sumVarMeanCalc <- function(ranking, preVar, normFactors) {
    divider <- seq_along(ranking);
    mean_table <- cumsum(ranking)/divider;
    mean_table_sq <- mean_table^2;

    # not sure if it is +2*preVar or just +preVar , in mGSZ it does +preVar
    # and again +preVar
    var_table <- cumsum(ranking^2)/divider - (mean_table)^2 + 2*preVar;

    z_means <- do.call(cbind, lapply(seq_len(ncol(normFactors$normMean)),
        function(j) {
            mean_table * (2 * normFactors$normMean[,j] - divider);
    }));
    colnames(z_means) <- colnames(normFactors$normMean);

    z_vars <- do.call(cbind, lapply(seq_len(ncol(normFactors$normVar)),
        function(j) {
            4 * (var_table * normFactors$probSum[,j] + mean_table_sq *
                normFactors$normVar[,j]);
    }));
    colnames(z_vars) <- colnames(normFactors$normVar);

    return(list(Z_means=z_means, Z_vars=z_vars));
}

zVarCalc <- function(z_vars, wgt2, varConstant) {
    medianPart <- matrix(matrixStats::rowMedians(t(z_vars)) *
                     wgt2, nrow=nrow(z_vars), ncol=ncol(z_vars), byrow=!FALSE);
    z_var <- (z_vars + medianPart + varConstant)^0.5;
    return(z_var);
}

getEnrichScore <- function(ranking, actGset, zM_d, zM_i, zV_d, zV_i) {
    startVal <- 5; # hard coded
    nMG <- !names(ranking) %in% actGset;
    ranking[nMG] <- -ranking[nMG];

    es_dec <- cumsum(ranking) - zM_d;
    es_inc <- cumsum(rev(ranking)) - zM_i;

    es_dec[1:startVal] <- 0; es_inc[1:startVal] <- 0;

    es_dec <- es_dec/zV_d;
    es_inc <- es_inc/zV_i;

    return(c(es_dec, es_inc));
}

#' @importFrom compiler cmpfun
getEnrichScore_c <- cmpfun(getEnrichScore);

#' @importFrom matrixStats colSds colVars
#' @importFrom ismev gum.fit
getPvalues <- function(realESs, permESs) {
    # some normalization
    mean.prof <- colMeans(permESs);
    std.prof <- matrixStats::colSds(permESs);
    std.prof[std.prof == 0] <- 0.1;
    realESs <- (realESs - mean.prof)/std.prof;
    permESs <- do.call(cbind, lapply(seq_len(ncol(permESs)), function(k) {
        (permESs[, k] - mean.prof[k])/std.prof[k];
    }))
    rm(mean.prof); rm(std.prof);

    col.ind <- matrixStats::colVars(permESs) > 0;
    permESs <- permESs[, col.ind];
    realESs <- realESs[col.ind];
    ev.p.val.class <- rep(0, length(realESs))

    pvals <- do.call(c, lapply(seq_along(realESs), function(k) {
        ev.param.class <- ismev::gum.fit(as.vector(permESs[, k]),
                                         show=FALSE)$mle
        logEVcdf(realESs[[k]], ev.param.class);
    }))
    return(pvals);
}

logEVcdf <- function (x, fitParams) {
    mu <- fitParams[[1]];
    sigma <- fitParams[[2]];
    x <- (mu - x)/sigma;
    out <- rep(0, length(x));
    if (!(is.vector(x))) {
        out <- matrix(out, nrow(x), ncol(x));
    }
    po1 <- which(x < 5);
    out[po1] <- -log(1 - exp(-exp(x[po1])));
    x <- x[-po1];
    out[-po1] <- -x + exp(x)/2 - exp(2 * x)/24 + exp(4 * x)/2880;
    out <- out/log(10);
    out <- 10^(-out);
    return(out);
}
jcrodriguez1989/mGSZ2 documentation built on May 24, 2019, 9:51 a.m.