R/analyzeGeneSetCollections_MSF_windows.r

#' A function called from HyperGeneSetCollection_MSF
#' @description This function runs a function that takes a list of gene set collections, a named phenotype vector
#'              (with names of the phenotype vector as the universe), a vector of hits (gene names only)
#'              and returns the results of hypergeometric and gene set enrichment analyses for all of the gene set collections
#'              (with multiple hypothesis testing corrections).
#' @param listOfGeneSetCollections a list of gene set collections (a 'gene set collection' is a list of gene sets).
#'        Even if only one collection is being tested, it must be entered as an element of a 1-element list,
#'        e.g. ListOfGeneSetCollections = list(YourOneGeneSetCollection).
#'        Naming the elements of listOfGeneSetCollections will result in these names being associated
#'        with the relevant data frames in the output (meaningful names are advised)
#' @param geneList a numeric or integer vector of phenotypes in descending or ascending order
#'        with elements named by their EntrezIds (no duplicates nor NA values)
#' @param hits a character vector of the EntrezIds of hits, as determined by the user
#' @param pAdjustMethod a single character value specifying the p-value adjustment method to be used (see 'p.adjust' for details)
#' @param pValueCutoff numeric value, p value of cut of
#' @param nPermutations numeric value, number of permutations
#' @param minGeneSetSize numeric value, minimun number of genes accepted
#' @param doGSEA if TRUE do GSEA type of analysis
#' @param doGSOA if TRUE do Over representation Analysis using HyperGeom test
#'
analyzeGeneSetCollections_MSF_windows<-function (listOfGeneSetCollections, geneList, hits, pAdjustMethod = "BH",
                                         pValueCutoff = 0.05, nPermutations = 1000, minGeneSetSize = 15,
                                         exponent = 1, verbose = TRUE, doGSOA = TRUE, doGSEA = TRUE)
{ require(HTSanalyzeR)
  if ((!doGSOA) && (!doGSEA))
    stop("Please choose to perform hypergeometric tests and/or GSEA by specifying 'doGSOA' and 'doGSEA'!\n")
  if (doGSOA || doGSEA) {
    #paraCheck("gscs", listOfGeneSetCollections)
    #paraCheck("genelist", geneList)
    #paraCheck("pAdjustMethod", pAdjustMethod)
    #paraCheck("pValueCutoff", pValueCutoff)
    #paraCheck("minGeneSetSize", minGeneSetSize)
    #paraCheck("verbose", verbose)
  }
  if (doGSOA) {
    #  paraCheck("hits", hits)
  }
  if (doGSEA) {
    # paraCheck("nPermutations", nPermutations)
    # paraCheck("exponent", exponent)
  }


  numGeneSetCollections <- length(listOfGeneSetCollections)
  max.size <- 0
  for (l in 1:numGeneSetCollections) {
    gs.size <- unlist(lapply(lapply(listOfGeneSetCollections[[l]],
                                        intersect, y = names(geneList)), length))
    max.size <- max(max(gs.size), max.size)
    gs.id <- which(gs.size >= minGeneSetSize)
    n.gs.discarded <- length(listOfGeneSetCollections[[l]]) -
      length(gs.id)
    listOfGeneSetCollections[[l]] <- listOfGeneSetCollections[[l]][gs.id]
    if (verbose && n.gs.discarded > 0)
      cat(paste("--", n.gs.discarded, " gene sets don't have >= ",
                minGeneSetSize, " overlapped genes with universe in gene",
                " set collection named ", names(listOfGeneSetCollections)[l],
                "!\n", sep = ""))
  }

  if (all(unlist(lapply(listOfGeneSetCollections, length)) ==  0))

  #  if (all(unlist(lapply(listOfGeneSetCollections, length)) == 0))
    stop(paste("No gene set has >= ", minGeneSetSize, " overlapped ",
               "genes with universe!\n The largest number of overlapped ",
               "genes between gene sets and universe is: ", max.size,
               sep = ""))
  result.names <- names(listOfGeneSetCollections)

  if (doGSOA) {
    #HGTresults <- list()
    # cat("-Performing hypergeometric analysis ...\n")

    #     for (i in 1:length(listOfGeneSetCollections)) {
    #       if (verbose) {
    #         cat("--For", names(listOfGeneSetCollections)[i], "\n")
    #       }
    #       if (length(listOfGeneSetCollections[[i]]) > 0)
    #            HGTresults[[i]] <- multiHyperGeoTest(listOfGeneSetCollections[[i]],
    #            universe = names(geneList), hits = hits, minGeneSetSize = minGeneSetSize,
    #            pAdjustMethod = pAdjustMethod, verbose = verbose)
    #
    #       else {
    #         HGTresults[[i]] <- matrix(, nrow = 0, ncol = 7)
    #         colnames(HGTresults[[i]]) <- c("Universe Size",
    #                                        "Gene Set Size", "Total Hits", "Expected Hits",
    #                                        "Observed Hits", "Pvalue", "Adjusted.Pvalue")
    #       }
    #     }
    HGTresults<-lapply(listOfGeneSetCollections, function(L) {
      #if (verbose) { cat("--For", , "\n")      }
      if (length(L) > 0) out<-multiHyperGeoTest(L, universe = names(geneList), hits = hits,
                                                minGeneSetSize = minGeneSetSize,  pAdjustMethod = pAdjustMethod, verbose = verbose)
      else {
        out <- matrix(, nrow = 0, ncol = 7)
        colnames(out) <- c("Universe Size",  "Gene Set Size", "Total Hits", "Expected Hits",
                           "Observed Hits", "Pvalue", "Adjusted.Pvalue")
      }
      out<-cbind(out[,1:5],OR=out[,"Observed Hits"]/out[, "Expected Hits"],out[,6:7])
      return(out)
    })

    pvals <- NULL
    sapply(1:numGeneSetCollections, function(i) {
      if (nrow(HGTresults[[i]]) > 0) {
        pv <- HGTresults[[i]][, "Pvalue"]
        names(pv) <- rownames(HGTresults[[i]])
        pvals <<- c(pvals, pv)
      }
    })

    HGTpvals <- p.adjust(pvals, method = pAdjustMethod)
    sapply(1:numGeneSetCollections, function(i) {
      if (nrow(HGTresults[[i]]) > 0) {
        ind <- match(rownames(HGTresults[[i]]), names(HGTpvals))
        Adjusted.Pvalue <- HGTpvals[ind]
        HGTresults[[i]][, "Adjusted.Pvalue"] <<- Adjusted.Pvalue
      }
    })
    names(HGTresults) <- result.names
    cat("-Hypergeometric analysis complete\n\n")

    sign.hgt <- lapply(HGTresults, function(x) {
      if (nrow(x) > 0) {
        a <- which(x[, "Pvalue"] < pValueCutoff)
        x <- x[a, , drop = FALSE]
        if (length(a) > 1) {
          x <- x[order(x[, "Pvalue"]), , drop = FALSE]
        }
        return(x)
      }
    })

    sign.hgt.adj <- lapply(HGTresults, function(x) {
      if (nrow(x) > 0) {
        a <- which(x[, "Adjusted.Pvalue"] < pValueCutoff)
        x <- x[a, , drop = FALSE]
        if (length(a) > 1) {
          x <- x[order(x[, "Adjusted.Pvalue"]), , drop = FALSE]
        }
        return(x)
      }
    })

  }
  else {  HGTresults = NULL  }

  if (doGSEA) {
    GSEA.results.list <- list()
    cat("-Performing gene set enrichment analysis ...\n")
    #     test.collection <- list()
    #     sapply(1:length(listOfGeneSetCollections), function(i) {
    #                     if (verbose) { cat("--For", names(listOfGeneSetCollections)[i],  "\n")}
    #       if (length(listOfGeneSetCollections[[i]]) > 0)
    #           test.collection[[i]] <<- collectionGsea(listOfGeneSetCollections[[i]],
    #           geneList = geneList, exponent = exponent, nPermutations = nPermutations,
    #           minGeneSetSize = minGeneSetSize, verbose = verbose)
    #
    #       else {test.collection[[i]] <<- list(Observed.scores = NULL, Permutation.scores = NULL)}
    #     })
    test.collection<-lapply(listOfGeneSetCollections, function(L) {
      if (verbose) { cat("--For", 'names(listOfGeneSetCollections)[i]',  "\n")}
      if (length(L) > 0)
        collectionGsea(L, geneList = geneList, exponent = exponent, nPermutations = nPermutations,
                       minGeneSetSize = minGeneSetSize, verbose = verbose)
      else {test.collection[[i]] <<- list(Observed.scores = NULL, Permutation.scores = NULL)}
    })

    obs.scores <- NULL
    prm.scores <- NULL
    sapply(1:numGeneSetCollections, function(i) {
      obs.scores <<- c(obs.scores, test.collection[[i]]$Observed.scores)
      prm.scores <<- rbind(prm.scores, test.collection[[i]]$Permutation.scores)
    })

    if (length(obs.scores) > 0) {
      total.test.collect <- list(Observed.scores = obs.scores, Permutation.scores = prm.scores)
      test.FDR.collection <- FDRcollectionGsea(permScores = total.test.collect$Permutation.scores,
                                               dataScores = total.test.collect$Observed.scores)
      test.pvalues.collection <- permutationPvalueCollectionGsea(permScores = total.test.collect$Permutation.scores,
                                                                 dataScores = (total.test.collect$Observed.scores))
      gsea.adjust.pval <- p.adjust(test.pvalues.collection, method = pAdjustMethod)
      test.GSEA.results <- cbind(total.test.collect$Observed.scores, test.pvalues.collection,
                                 gsea.adjust.pval, test.FDR.collection)
      colnames(test.GSEA.results) <- c("Observed.score",  "Pvalue", "Adjusted.Pvalue", "FDR")

      sapply(1:numGeneSetCollections, function(i) {
        if (!is.null(test.collection[[i]])) {
          match.ind <- match(names(listOfGeneSetCollections[[i]]), rownames(test.GSEA.results))
          match.ind <- match.ind[which(!is.na(match.ind))]
          GSEA.res.mat <- test.GSEA.results[match.ind, , drop = FALSE]
          GSEA.res.mat <- GSEA.res.mat[order(GSEA.res.mat[,  "Adjusted.Pvalue"]), , drop = FALSE]
          GSEA.results.list[[i]] <<- GSEA.res.mat
        }
      })
      names(GSEA.results.list) <- result.names

    }
    else {
      sapply(1:numGeneSetCollections, function(i) {
        GSEA.results.list[[i]] <<- matrix(, nrow = 0, ncol = 4)
        colnames(GSEA.results.list[[i]]) <<- c("Observed.score",
                                               "Pvalue", "Adjusted.Pvalue", "FDR")
      })
    }
    sign.gsea <- lapply(GSEA.results.list, function(x) {
      if (nrow(x) > 0) {
        a <- which(x[, "Pvalue"] < pValueCutoff)
        x <- x[a, , drop = FALSE]
        if (length(a) > 1) {
          x <- x[order(x[, "Pvalue"]), , drop = FALSE]
        }
        return(x)
      }
    })
    sign.gsea.adj <- lapply(GSEA.results.list, function(x) {
      if (nrow(x) > 0) {
        a <- which(x[, "Adjusted.Pvalue"] <= pValueCutoff)
        x <- x[a, , drop = FALSE]
        if (length(a) > 1) {
          x <- x[order(x[, "Adjusted.Pvalue"]), , drop = FALSE]
        }
        return(x)
      }
    })
  }
  else {
    GSEA.results.list = NULL
  }

  if (doGSOA && doGSEA) {
    overlap <- list()
    overlap.adj <- list()
    sapply(1:numGeneSetCollections, function(i) {
      a1 <- intersect(rownames(sign.gsea[[i]]), rownames(sign.hgt[[i]]))
      a2 <- intersect(rownames(sign.gsea.adj[[i]]), rownames(sign.hgt.adj[[i]]))
      Hypergeometric.Pvalue <- HGTresults[[i]][a1, "Pvalue",
                                               drop = FALSE]
      Hypergeometric.Adj.Pvalue <- HGTresults[[i]][a2,
                                                   "Adjusted.Pvalue", drop = FALSE]
      GSEA.Pvalue <- GSEA.results.list[[i]][a1, "Pvalue",
                                            drop = FALSE]
      GSEA.Adj.Pvalue <- GSEA.results.list[[i]][a2, "Adjusted.Pvalue",
                                                drop = FALSE]
      overlap[[i]] <<- cbind(Hypergeometric.Pvalue, GSEA.Pvalue)
      colnames(overlap[[i]]) <<- c("HyperGeo.Pvalue", "GSEA.Pvalue")
      overlap.adj[[i]] <<- cbind(Hypergeometric.Adj.Pvalue,
                                 GSEA.Adj.Pvalue)
      colnames(overlap.adj[[i]]) <<- c("HyperGeo.Adj.Pvalue", "GSEA.Adj.Pvalue")
    })
    names(overlap) <- result.names
    names(overlap.adj) <- result.names
  }
  else {
    overlap = NULL
    overlap.adj = NULL
  }
  cat("-Gene set enrichment analysis complete \n")
  final.results <- list(HyperGeo.results = HGTresults, GSEA.results = GSEA.results.list,
                        Sig.pvals.in.both = overlap, Sig.adj.pvals.in.both = overlap.adj)
  print('Finish Running GSOA/GSEA')
  return(final.results)
}
mssm-msf-2019/BiostatsALL documentation built on May 22, 2019, 12:16 p.m.