R/MLP.R

#' This function calculates p-values for each gene set based on row permutations
#' of the gene p values or column permutations of the expression matrix; the p values
#' can be obtained either as individual gene set p values or p values based on smoothing
#' across gene sets of similar size.
#' @param geneSet is the input list of gene sets (components) and gene IDs (character vectors). 
#' A gene set can, for example, be a GO category with for each category Entrez Gene identifiers; 
#' The \link{getGeneSets} function can be used to construct the geneSet argument for different
#' pathway sources.
#' @param geneStatistic is either a named numeric vector (if rowPermutations is TRUE)
#'   or a numeric matrix of pvalues (if rowPermutations is FALSE). The names of the numeric vector
#'   or row names of the matrix should represent the gene IDs.
#' @param minGenes minimum number of genes in a gene set for it to be considered (lower threshold
#'    for gene set size)
#' @param maxGenes maximum number of genes in a gene set for it to be considered (upper threshold
#'    for gene set size)
#' @param rowPermutations logical indicating whether to use row permutations (TRUE; default) or column 
#'    permutations (FALSE) 
#' @param nPermutations is the number of simulations. By default 100 permutations are conducted.
#' @param smoothPValues logical indicating whether one wants to calculate smoothed cut-off thresholds (TRUE; default)
#'    or not (FALSE).
#' @param probabilityVector vector of quantiles at which p values for each gene set are desired
#' @param df degrees of freedom for the smooth.spline function used in getSmoothedPValues 
#' @param addGeneSetDescription logical indicating whether a column with the gene set description be added to
#' the output data frame; defaults to TRUE.
#' @return data frame with four (or five) columns: totalGeneSetSize, testedGeneSetSize, geneSetStatistic and geneSetPValue
#' and (if addDescription is set to TRUE) geneSetDescription; the rows of the data frame are ordered by 
#' ascending geneSetPValue.
#' @references Raghavan, Nandini et al. (2007). The high-level similarity of some disparate gene expression measures,
#' Bioinformatics, 23, 22, 3032-3038.
#' @examples 
#' if (require(GO.db)){
#'   pathExampleGeneSet <- system.file("exampleFiles", "exampleGeneSet.rda", package = "MLP")
#'   pathExamplePValues <- system.file("exampleFiles", "examplePValues.rda", package = "MLP")
#'   load(pathExampleGeneSet)
#'   load(pathExamplePValues)
#'   head(examplePValues)
#'   head(exampleGeneSet)
#'   mlpResult <- MLP(geneSet = exampleGeneSet, geneStatistic = examplePValues)
#'   head(mlpResult)
#' }
#' 
#' @export
MLP <- function (geneSet, geneStatistic, minGenes = 5, maxGenes = 100, 
    rowPermutations = TRUE, nPermutations = 100, smoothPValues = TRUE, 
    probabilityVector = c(0.5, 0.9, 0.95, 0.99, 0.999, 0.9999, 0.99999), 
    df = 9, addGeneSetDescription = TRUE){
  if (!inherits(geneSet, "geneSetMLP")) {
    stop("The 'geneSet' should be an object of class 'geneSetMLP' as produced by getGeneSets")
  }
  species <- attr(geneSet, "species")
  if (is.vector(geneStatistic)) 
    if (!rowPermutations) 
      stop("If rowPermutations is set to FALSE, the 'geneStatistic' needs to be a matrix")
  geneStatistic <- as.matrix(geneStatistic)
  if (!is.numeric(geneStatistic) & !is.matrix(geneStatistic)) {
    warning("Argument 'geneStatistic' should be a numeric vector or matrix")
  }
  if (!rowPermutations) {
    nPermutations <- ncol(geneStatistic) - 2
  }
  totalGeneSetSize <- sapply(geneSet, length)
  geneSets <- sapply(geneSet, function(x) x[x %in% rownames(geneStatistic)])
  geneSetSize <- sapply(geneSets, function(x) length(x))
  geneSetIndices <- which(geneSetSize >= minGenes & geneSetSize <= 
          maxGenes)
  geneSets <- geneSets[geneSetIndices]
  geneStatistic <- na.omit(geneStatistic)
  geneStatistic <- geneStatistic[grep("^[[:digit:]]+$", rownames(geneStatistic)), 
      , drop = FALSE]
  mapResult <- mapGeneSetStatistic(geneSets, geneStatistic)
  pValueNAlist <- lapply(mapResult, function(x) which(!is.na(x)))
  filterFunction <- function(x, y) {
    x[y]
  }
  reducedMapResult <- mapply(filterFunction, mapResult)
  reducedGeneSet <- mapply(filterFunction, geneSets)
  n2 <- nrow(geneStatistic)
  w0 <- t(mlpStatistic(reducedMapResult))
  n11 <- length(geneSets)
  
  if (!rowPermutations) {
    w <- w0[,-2] # take out observed gene set statistics
    w0 <- w0[, 1:2] # take out gene set statistics for permuted data
  } else {
    p1 <- apply(matrix(1:n2, n2, nPermutations), 2, sample)
    y1 <- matrix(geneStatistic[p1, 1], n2, nPermutations)
    row.names(y1) <- row.names(geneStatistic)
    rm(p1)
    mapResultPermuted <- mapGeneSetStatistic(reducedGeneSet, 
        y1)
    w <- t(mlpStatistic(mapResultPermuted))
  }
  if (smoothPValues) {
    pw0 <- getSmoothedPValues(w0, w[, -1], q.cutoff = probabilityVector,
        df = df)
  } else {
    pw0 <- getIndividualPValues(w0, w[, -1])
  }
  res <- data.frame(testedGeneSetSize = w0[, 1], geneSetStatistic = w0[, 
          2], geneSetPValue = pw0)
  res <- res[order(res$geneSetPValue), ]
  res <- data.frame(totalGeneSetSize = totalGeneSetSize[rownames(res)], 
      res)
  if (is.null(attr(res, "species"))) 
    attr(res, "species") <- species
  if (addGeneSetDescription) {
    if(is.null(attr(geneSet, "descriptions"))){
      geneSetSource <- attr(geneSet, "geneSetSource")
      res <- addGeneSetDescription(object = res, geneSetSource = geneSetSource)
    } else{
      descriptions <- attr(geneSet, "descriptions")
      res <- data.frame(res, geneSetDescription = descriptions[row.names(res)], stringsAsFactors = FALSE)
    }
  }
  attr(res, "geneSetSource") <- attr(geneSet, "geneSetSource")
  if (!is.null(attr(pw0, "quantileCurveInformation"))) 
    attr(res, "quantileCurveInformation") <- attr(pw0, "quantileCurveInformation")
  class(res) <- c("MLP", class(res))
  return(res)
}

Try the MLP package in your browser

Any scripts or data that you put into this service are public.

MLP documentation built on Nov. 8, 2020, 8:23 p.m.