R/pathwayAnalysis.R

Defines functions pathwayRankBasedEnrichment pathwayOverrepresent

Documented in pathwayOverrepresent pathwayRankBasedEnrichment

#' Gene set over-representation analysis
#'
#' @description This function performes gene set over-representation analysis
#' using Fisher's exact test.
#'
#' @param geneSet an array of gene or phosphosite IDs (IDs are gene symbols etc
#' that match to your pathway annotation list).
#' @param annotation a list of pathways with each element containing an array of
#' gene or phosphosite IDs.
#' @param universe the universe/backgrond of all genes or phosphosites in your
#' profiled dataset.
#' @param alter test for enrichment ('greater', default), depletion ('less'), or
#' 'two.sided'.
#'
#' @return A matrix of pathways and their associated substrates and p-values.
#'
#' @examples
#'
#' library(limma)
#'
#' data('phospho_L6_ratio')
#' data('SPSs')
#'
#' grps = gsub('_.+', '', colnames(phospho.L6.ratio))
#'
#' # Cleaning phosphosite label
#' phospho.site.names = rownames(phospho.L6.ratio)
#' L6.sites = gsub(' ', '', sapply(strsplit(rownames(phospho.L6.ratio), '~'),
#'                                 function(x){paste(toupper(x[2]), x[3], '',
#'                                                 sep=';')}))
#' phospho.L6.ratio = t(sapply(split(data.frame(phospho.L6.ratio), L6.sites),
#'                             colMeans))
#' phospho.site.names = split(phospho.site.names, L6.sites)
#'
#' # Construct a design matrix by condition
#' design = model.matrix(~ grps - 1)
#'
#' # phosphoproteomics data normalisation using RUV
#' ctl = which(rownames(phospho.L6.ratio) %in% SPSs)
#' phospho.L6.ratio.RUV = RUVphospho(phospho.L6.ratio, M = design, k = 3,
#'                                 ctl = ctl)
#'
#'
#' # divides the phospho.L6.ratio data into groups by phosphosites
#' L6.sites <- gsub(' ', '', gsub('~[STY]', '~',
#'     sapply(strsplit(rownames(phospho.L6.ratio.RUV), '~'),
#'         function(x){paste(toupper(x[2]), x[3], sep='~')})))
#' phospho.L6.ratio.sites <- t(sapply(split(data.frame(phospho.L6.ratio.RUV),
#'                                         L6.sites), colMeans))
#'
#' # fit linear model for each phosphosite
#' f <- gsub('_exp\\d', '', colnames(phospho.L6.ratio.RUV))
#' X <- model.matrix(~ f - 1)
#' fit <- lmFit(phospho.L6.ratio.RUV, X)
#'
#' # extract top-ranked phosphosites for each condition compared to basal
#' table.AICAR <- topTable(eBayes(fit), number=Inf, coef = 1)
#' table.Ins <- topTable(eBayes(fit), number=Inf, coef = 3)
#' table.AICARIns <- topTable(eBayes(fit), number=Inf, coef = 2)
#'
#' DE1.RUV <- c(sum(table.AICAR[,'adj.P.Val'] < 0.05),
#'     sum(table.Ins[,'adj.P.Val'] < 0.05),
#'     sum(table.AICARIns[,'adj.P.Val'] < 0.05))
#'
#' # extract top-ranked phosphosites for each group comparison
#' contrast.matrix1 <- makeContrasts(fAICARIns-fIns, levels=X)
#' contrast.matrix2 <- makeContrasts(fAICARIns-fAICAR, levels=X)
#' fit1 <- contrasts.fit(fit, contrast.matrix1)
#' fit2 <- contrasts.fit(fit, contrast.matrix2)
#' table.AICARInsVSIns <- topTable(eBayes(fit1), number=Inf)
#' table.AICARInsVSAICAR <- topTable(eBayes(fit2), number=Inf)
#'
#' DE2.RUV <- c(sum(table.AICARInsVSIns[,'adj.P.Val'] < 0.05),
#'     sum(table.AICARInsVSAICAR[,'adj.P.Val'] < 0.05))
#'
#' o <- rownames(table.AICARInsVSIns)
#' Tc <- cbind(table.Ins[o,'logFC'], table.AICAR[o,'logFC'],
#'             table.AICARIns[o,'logFC'])
#' rownames(Tc) = gsub('(.*)(;[A-Z])([0-9]+)(;)', '\\1;\\3;', o)
#' colnames(Tc) <- c('Ins', 'AICAR', 'AICAR+Ins')
#'
#' # summary phosphosite-level information to proteins for performing downstream
#' # gene-centric analyses.
#' Tc.gene <- phosCollapse(Tc, id=gsub(';.+', '', rownames(Tc)),
#'     stat=apply(abs(Tc), 1, max), by = 'max')
#' geneSet <- names(sort(Tc.gene[,1],
#'                     decreasing = TRUE))[1:round(nrow(Tc.gene) * 0.1)]
#' #lapply(PhosphoSite.rat, function(x){gsub(';[STY]', ';', x)})
#'
#' # 1D gene-centric pathway analysis
#' path1 <- pathwayOverrepresent(geneSet, annotation=Pathways.reactome,
#'     universe = rownames(Tc.gene), alter = 'greater')
#'
#' @export
pathwayOverrepresent <- function(geneSet, annotation, universe,
    alter = "greater") {
    if (missing(geneSet)) {
        stop("Parameter geneSet is missing!")
    }
    if (missing(annotation)) {
        stop("Parameter annotation is missing!")
    }
    if (missing(universe)) {
        stop("Parameter universe is missing!")
    }

    fisherTest.mat <- matrix("NA", ncol = 3, nrow = length(annotation))
    colnames(fisherTest.mat) <- c("pvalue", "# of substrates",
        "substrates")
    for (i in seq_len(length(annotation))) {
        di <- length(intersect(geneSet, annotation[[i]]))
        dn <- length(setdiff(geneSet, annotation[[i]]))
        ndi <- length(setdiff(annotation[[i]], geneSet))
        ndn <- length(setdiff(universe, union(geneSet, annotation[[i]])))
        p <- stats::fisher.test(rbind(c(di, ndi), c(dn, ndn)),
            alternative = alter)$p.value
        substrates <- paste(intersect(geneSet, annotation[[i]]),
            collapse = "|")
        fisherTest.mat[i, ] <- c(p, di, substrates)
    }
    rownames(fisherTest.mat) <- names(annotation)
    fisherTest.mat <- fisherTest.mat[order(as.numeric(fisherTest.mat[,
        1])), ]
    return(fisherTest.mat)
}

#' Gene set enrichment analysis
#'
#' This function performes gene set enrichment analysis using Wilcoxon Rank Sum
#' test.
#'
#' @param geneStats an array of statistics (e.g. log2 FC) of all quantified
#' genes or phosphosite with names of the array as gene or phosphosite IDs.
#' @param annotation a list of pathways with each element containing an array of
#'  gene IDs.
#' @param alter test for enrichment ('greater', default), depletion ('less'), or
#'  'two.sided'.
#'
#' @return A matrix of pathways and their associated substrates and p-values.
#'
#' @examples
#'
#' library(limma)
#'
#' data('phospho_L6_ratio')
#' data('SPSs')
#'
#' grps = gsub('_.+', '', colnames(phospho.L6.ratio))
#'
#' # Cleaning phosphosite label
#' phospho.site.names = rownames(phospho.L6.ratio)
#' L6.sites = gsub(' ', '', sapply(strsplit(rownames(phospho.L6.ratio), '~'),
#'                                 function(x){paste(toupper(x[2]), x[3], '',
#'                                                 sep=';')}))
#' phospho.L6.ratio = t(sapply(split(data.frame(phospho.L6.ratio), L6.sites),
#'                             colMeans))
#' phospho.site.names = split(phospho.site.names, L6.sites)
#'
#' # Construct a design matrix by condition
#' design = model.matrix(~ grps - 1)
#'
#' # phosphoproteomics data normalisation using RUV
#' ctl = which(rownames(phospho.L6.ratio) %in% SPSs)
#' phospho.L6.ratio.RUV = RUVphospho(phospho.L6.ratio, M = design, k = 3,
#'                                 ctl = ctl)
#'
#'
#' # divides the phospho.L6.ratio data into groups by phosphosites
#' L6.sites <- gsub(' ', '', gsub('~[STY]', '~',
#'     sapply(strsplit(rownames(phospho.L6.ratio.RUV), '~'),
#'         function(x){paste(toupper(x[2]), x[3], sep='~')})))
#' phospho.L6.ratio.sites <- t(sapply(split(data.frame(phospho.L6.ratio.RUV),
#'                                         L6.sites), colMeans))
#'
#' # fit linear model for each phosphosite
#' f <- gsub('_exp\\d', '', colnames(phospho.L6.ratio.RUV))
#' X <- model.matrix(~ f - 1)
#' fit <- lmFit(phospho.L6.ratio.RUV, X)
#'
#' # extract top-ranked phosphosites for each condition compared to basal
#' table.AICAR <- topTable(eBayes(fit), number=Inf, coef = 1)
#' table.Ins <- topTable(eBayes(fit), number=Inf, coef = 3)
#' table.AICARIns <- topTable(eBayes(fit), number=Inf, coef = 2)
#'
#' DE1.RUV <- c(sum(table.AICAR[,'adj.P.Val'] < 0.05),
#'     sum(table.Ins[,'adj.P.Val'] < 0.05),
#'     sum(table.AICARIns[,'adj.P.Val'] < 0.05))
#'
#' # extract top-ranked phosphosites for each group comparison
#' contrast.matrix1 <- makeContrasts(fAICARIns-fIns, levels=X)
#' contrast.matrix2 <- makeContrasts(fAICARIns-fAICAR, levels=X)
#' fit1 <- contrasts.fit(fit, contrast.matrix1)
#' fit2 <- contrasts.fit(fit, contrast.matrix2)
#' table.AICARInsVSIns <- topTable(eBayes(fit1), number=Inf)
#' table.AICARInsVSAICAR <- topTable(eBayes(fit2), number=Inf)
#'
#' DE2.RUV <- c(sum(table.AICARInsVSIns[,'adj.P.Val'] < 0.05),
#'     sum(table.AICARInsVSAICAR[,'adj.P.Val'] < 0.05))
#'
#' o <- rownames(table.AICARInsVSIns)
#' Tc <- cbind(table.Ins[o,'logFC'], table.AICAR[o,'logFC'],
#'             table.AICARIns[o,'logFC'])
#' rownames(Tc) = gsub('(.*)(;[A-Z])([0-9]+)(;)', '\\1;\\3;', o)
#' colnames(Tc) <- c('Ins', 'AICAR', 'AICAR+Ins')
#'
#' # summary phosphosite-level information to proteins for performing downstream
#' #  gene-centric analyses.
#' Tc.gene <- phosCollapse(Tc, id=gsub(';.+', '', rownames(Tc)),
#'     stat=apply(abs(Tc), 1, max), by = 'max')
#'
#' # 1D gene-centric pathway analysis
#' path2 <- pathwayRankBasedEnrichment(Tc.gene[,1],
#'                                     annotation=Pathways.reactome,
#'                                     alter = 'greater')
#' @export
pathwayRankBasedEnrichment <- function(geneStats, annotation,
    alter = "greater") {
    if (missing(geneStats))
        stop("Parameter geneStats is missing!")
    if (missing(annotation))
        stop("Parameter annotation is missing!")

    wilcoxTest.mat <- matrix("NA", ncol = 3, nrow = length(annotation))
    colnames(wilcoxTest.mat) <- c("pvalue", "# of substrates",
        "substrates")

    # perform wilcox sum rank test for pathway enrichment
    # analysis
    for (i in seq_len(length(annotation))) {
        p.in <- intersect(names(geneStats), annotation[[i]])
        p.out <- setdiff(names(geneStats), annotation[[i]])

        if (length(geneStats[p.in]) <= 3) {
            wilcoxTest.mat[i, seq_len(3)] <- NA
        } else {
            wilcoxTest.mat[i, 1] <- stats::wilcox.test(geneStats[p.in],
                geneStats[p.out], alternative = alter)$p.value
            wilcoxTest.mat[i, 2] <- length(p.in)
            wilcoxTest.mat[i, 3] <- paste(p.in, collapse = ";")
        }
    }

    rownames(wilcoxTest.mat) <- names(annotation)
    wilcoxTest.mat <- wilcoxTest.mat[order(as.numeric(wilcoxTest.mat[,
        1])), ]
    return(wilcoxTest.mat)
}
PengyiYang/PhosR documentation built on June 21, 2020, 8:37 a.m.