R/Crispr_Enrichment_test.R

Defines functions .doHyperGInternal ct.targetSetEnrichment ct.PantherPathwayEnrichment

Documented in ct.PantherPathwayEnrichment ct.targetSetEnrichment

##' @title Run a (limited) Pathway Enrichment Analysis on the results of a Crispr experiment.
##' @description This function enables some limited geneset enrichment-type analysis of data derived from a pooled Crispr
##' screen using the PANTHER pathway database. Specifically, it identifies the set of targets significantly enriched or
##' depleted in a \code{summaryDF} object returned from \code{ct.generateResults} and compares that set to the remaining
##' targets in the screening library using a hypergeometric test.
##'
##' Note that many Crispr gRNA libraries specifically target biased sets of genes, often focusing on genes involved
##' in a particular pathway or encoding proteins with a shared biological property. Consequently, the enrichment results
##' returned by this function represent the pathways containing genes disproportionately targeted *within the context
##' of the screen*, and may or may not be informative of the underlying biology in question. This means that
##' pathways not targeted by a Crispr library will obviously never be enriched within the positive target set regardless of
##' their biological relevance, and pathways enriched within a focused library screen are similarly expected to partially
##' reflect the composition of the library and other confounding issues (e.g., number of targets within a pathway).
##' Analysts should therefore use this function with care. For example, it might be unsurprising to detect pathways related
##' to histone modification within a screen employing a crispr library targeting epigenetic regulators.
##'
##' @param summaryDF A dataframe summarizing the results of the screen, returned by the function \code{\link{ct.generateResults}}.
##' @param pvalue.cutoff A gene-level p-value cutoff defining targets of interest within the screen. Note that this is a
##' nominal p-value cutoff to preserve end-user flexibility.
##' @param enrich Logical indicating whether to consider guides that are enriched (default) or depleted within the screen.
##' @param organism The species of the cell line used in the screen; currently only 'human' or 'mouse' are supported.
##' @param db.cut Minimum number of genes annotated to a given to a pathway within the screen in order to consider it in the enrichment test.
##' @return A dataframe of enriched pathways.
##' @author Russell Bainer, Steve Lianoglou
##' @examples data('resultsDF')
##' ct.PantherPathwayEnrichment(resultsDF, organism = 'mouse')
##' @export

ct.PantherPathwayEnrichment <- function(summaryDF, pvalue.cutoff = 0.01, enrich = TRUE, organism = 'human', db.cut = 10){
  if (!requireNamespace("PANTHER.db")) {
    stop("The PANTHER.db bioconductor package is required")
  }
  if (!requireNamespace("AnnotationDbi")) {
    stop("The AnnotationDbi bioconductor package is required")
  }

  #Check the input:
    organism <- match.arg(organism, c('human', 'mouse'))

    if(!(enrich %in% c(TRUE, FALSE))){
      stop('enrich must be either TRUE or FALSE.')
    }

    if(!ct.resultCheck(summaryDF)){
      stop("Execution halted.")
    }

    if(!(pvalue.cutoff <= 1 & pvalue.cutoff >= 0 & is.numeric(pvalue.cutoff))){
      stop("pvalue.cutoff must be a numeric value between 0 and 1.")
    }

    if(!(is.numeric(db.cut))){
      stop("db.cut must be a numeric value.")
    }


    message("WARNING: The interpretation of gene set enrichment analyses in Crispr screens is tricky. See the man page for further details.")

    #Condense the summary frame to gene-level estimates and isolate the ones that we are testing
    summaryDF <- summaryDF[!duplicated(summaryDF$geneID),]
    summaryDF <- summaryDF[!is.na(summaryDF$geneID),]
    summaryDF <- summaryDF[!grepl('^[A-Za-z]+$', summaryDF$geneID),]
    universe <- summaryDF$geneID
    #Pull out the significant hits
    selected <- summaryDF$geneID[summaryDF[,"Target-level Enrichment P"] <= pvalue.cutoff]

    if(enrich == FALSE){
      selected <- summaryDF$geneID[summaryDF[,"Target-level Depletion P"] <= pvalue.cutoff]
    }

    #make the database and conform it to the provided resultDF:
    pdb <- suppressMessages(ct.getPanther(organism))
    pathways <- names(pdb)
    pdb.cut <- lapply(pdb, function(x){x[x %in% universe]})
    genesInCats <- vapply(pdb.cut, length, numeric(1))
    pdb.cut <- pdb.cut[genesInCats > db.cut]
    genesInCats <- genesInCats[genesInCats > db.cut]

    message(paste('Omitting PANTHER pathways containing fewer than', db.cut, 'genes.'))

    if(length(pdb.cut) == 0){
      stop(paste('No acceptable gene sets found; consider reducing db.cut? \nExiting.'))
    }

    message(paste('Performing Hypergeometric tests with', length(pdb.cut), 'gene sets...'))

    #Set up the output and run the tests
    out <- data.frame('PATHWAY' = names(pdb.cut),
                      'nGenes' = genesInCats,
                      'sigGenes' = vapply(pdb.cut, function(x){sum(x %in% selected, na.rm = TRUE)}, numeric(1)),
                      stringsAsFactors = FALSE)
    testResults <- t(mapply(.doHyperGInternal,
                         'numW' = out$nGenes,
                         'numB' = rep(length(universe), nrow(out)),
                         'numDrawn' = rep(length(selected), nrow(out)),
                         'numWdrawn' = out$sigGenes,
                         'over' = rep(TRUE, nrow(out)),
                         SIMPLIFY =TRUE))
    out <- cbind(out, testResults[,3:1])
    out[,'FDR'] <- p.adjust(out$p, 'BH')
    out[,2:7] <- apply(out[,2:7], 2, as.numeric)
    out <- out[order(out$p, decreasing = FALSE),]
    row.names(out) <- 1:nrow(out)

    return(out)
    }

##' @title  Extract a Named List of Entrez IDs Annotated to Each Pathway in \code{PANTHER.db}
##' @description This is a function that invokes the \link[PANTHER.db]{PANTHER.db} Bioconductor library to extract a list of pathway mappings
##' to be used in gene set enrichment tests. Specifically, the function returns a named list of pathways, where each element contains
##' Entrez IDs. Users should not generally call this function directly as it is invoked internally by the higher-level
##' \code{ct.PantherPathwayEnrichment()} function.
##' @param species The species of the cells used in the screen. Currently only 'human' or 'mouse' are supported.
##' @return A named list of pathways from \code{PANTHER.db}.
##' @author Russell Bainer, Steve Lianoglou.
##' @keywords internal

ct.getPanther <- function (species = c("human", "mouse")){
  species <- match.arg(species, c("human", "mouse"))

  if (!requireNamespace("PANTHER.db")) {
    stop("The PANTHER.db bioconductor package is required")
  }
  if (!requireNamespace("AnnotationDbi")) {
    stop("The AnnotationDbi bioconductor package is required")
  }
  if (species == "human") {
    org.pkg <- "org.Hs.eg.db"
    xorg <- "Homo_sapiens"
  }
  else {
    org.pkg <- "org.Mm.eg.db"
    xorg <- "Mus_musculus"
  }
  if (!requireNamespace(org.pkg)) {
    stop(org.pkg, " bioconductor package required for this species query")
  }

  p.db <- PANTHER.db

  #This is a prefiltering step that appears to be unnecessary now
  #but potentially breaks if the user doesn't have proper database permissions.
  #pthOrganisms(p.db) <- toupper(species)

  org.db <- getFromNamespace(org.pkg, org.pkg)

  p.all <- select(p.db, keys(p.db, keytype="PATHWAY_ID"),
                    columns=c("PATHWAY_ID", "PATHWAY_TERM", "UNIPROT"),
                    'PATHWAY_ID')
  # Map uniprot to entrez
  umap <- suppressMessages(select(org.db, p.all$UNIPROT, c('UNIPROT', 'ENTREZID'), 'UNIPROT'))
  m <- merge(p.all, umap, by='UNIPROT')
  m <- subset(m, !is.na(m$ENTREZID))

  paths <- split(m[,3:4], as.factor(m$PATHWAY_TERM))
  lapply(paths, function(x){unique(x$ENTREZID)})
}

##' @title Test Whether a Specified Target Set is Enriched Within a Pooled Screen
##' @description This function takes in a \code{resultsDF} and a vector of \code{targets} (contained in the \code{geneID} column of
##' \code{resultsDF}) and determines whether the specified targets are enriched within the set of all significantly altered targets.
##' It does this by iteratively testing whether \code{targets} are more likely to be among the set of enriched or depleted targets
##' at various significance thresholds using a hypergeometric test. Note that the returned Hypergeometric P-values are not corrected
##' for multiple testing.
##'
##' Returns a list detailing the \code{targets} used in the tests, and tables indicating the results of the hypergeometric test
##' at various significance thresholds.
##' @param summaryDF A dataframe summarizing the results of the screen, returned by the function \code{\link{ct.generateResults}}.
##' @param targets A character vector containing the names of the targets to be tested. Only targets contained in the \code{geneID}
##' column of the provided \code{summaryDF} are considered.
##' @param enrich Logical indicating whether to consider guides that are enriched (default) or depleted within the screen.
##' @param ignore Optionally, a character vector containing elements of the \code{geneID} column of the provided \code{summaryDF}
##' that should be ignored in the analysis (e.g., unassignable or nonfunctional targets, such as nontargeting controls). By default,
##' this function omits targets with \code{geneSymbol} 'NoTarget'.
##' @return A named list containing the tested target set and tables detailing the hypergeometric test results using various P-value and
##' Q-value thresholds.
##' @examples data(resultsDF)
##' tar <-  sample(unique(resultsDF$geneID), 20)
##' res <- ct.targetSetEnrichment(resultsDF, tar)
##' @author Russell Bainer
##' @export
ct.targetSetEnrichment <- function(summaryDF, targets, enrich = TRUE, ignore = NULL){

  #input checks

  if(!ct.resultCheck(summaryDF)){
    stop("Execution halted.")
  }

  if(!(enrich %in% c(TRUE, FALSE))){
      stop('enrich must be either TRUE or FALSE.')
  }

  valid <- intersect(targets, summaryDF$geneID)

  if(length(valid) == 0){
    stop('None of the targets in the supplied vector are contained in the geneID column of the summary dataframe.')
  }
  if(length(setdiff(targets, valid)) != 0){
    warning(paste('Not all of the supplied targets are present in the geneID column of the summary dataframe. Proceeding with',
            length(valid), 'targets.'))
  }

  #Condense the summary frame to gene-level estimates and isolate the ones that we are testing
  summaryDF <- summaryDF[!duplicated(summaryDF$geneID),]
  summaryDF <- summaryDF[!is.na(summaryDF$geneID),]
  summaryDF <- summaryDF[!grepl('NoTarget', summaryDF$geneSymbol),]  #Remove NoTarget Genes rather than constraining to Entrez

  if(!is.null(ignore)){
    summaryDF <- summaryDF[!(summaryDF$geneID %in% ignore),]
  }

  #Pull out the P-values
  if(enrich){
   selected <- summaryDF[,c("Target-level Enrichment P", "Target-level Enrichment Q")]
  } else {
    selected <- summaryDF[,c("Target-level Depletion P", "Target-level Depletion Q")]
  }
  row.names(selected) <- summaryDF$geneID

  #Set the cutoffs
  cuts <- c(0,1/(10^(5:1)), 0.5, 1)

  out <- list('targets' = valid)
  out$P.values <- cbind(cuts,
                        vapply(cuts, function(x){sum(selected[valid,1] <= x)}, numeric(1)),
                        vapply(cuts, function(x){
                                            .doHyperGInternal(length(valid),
                                                nrow(selected),
                                                sum(selected[,1] <= x),
                                                sum(selected[valid,1] <= x),
                                                TRUE)$p},
                                      numeric(1))

                          )
  out$Q.values <- cbind(cuts,
                        vapply(cuts, function(x){sum(selected[valid,2] <= x)}, numeric(1)),
                        vapply(cuts, function(x){
                          .doHyperGInternal(length(valid),
                                            nrow(selected),
                                            sum(selected[,2] <= x),
                                            sum(selected[valid,2] <= x),
                                            TRUE)$p},
                          numeric(1))
                        )


  colnames(out$P.values) <- colnames(out$Q.values) <- c('Cutoff', 'Sig', 'Hypergeometric_P')
  return(out)
}






## -----------------------------------------------------------------------------
## These were copied from the Bioconductor collection package

## We envision the test as follows:
##
## The urn contains genes from the gene universe.  Genes annotated at a
## given collection term are white and the rest black.
##
## The number drawn is the size of the selected gene list.  The
## number of white drawn is the size of the intersection of the
## selected list and the genes annotated at the collection.
## Here's a diagram based on using GO as the collection:
##
##          inGO    notGO
##          White   Black
## selected  n11     n12
## not       n21     n22
##
## numW: number of genes in GO category
## numB: size of universe
## numDrawn: number of differentially expressed genes
## numWdrawn: the number of genes differentially expressed in category
##' @keywords internal

.doHyperGInternal <- function(numW, numB, numDrawn, numWdrawn, over) {
  n21 <- numW - numWdrawn
  n12 <- numDrawn - numWdrawn
  n22 <- numB - n12

  odds_ratio <-  (numWdrawn * n22) / (n12 * n21)

  expected <- (numWdrawn + n12) * (numWdrawn + n21)
  expected <- expected / (numWdrawn + n12 + n21 + n22)

  if (over) {
    ## take the -1 because we want evidence for as extreme or more
    pvals <- phyper(numWdrawn - 1L, numW, numB,
                    numDrawn, lower.tail=FALSE)
  } else {
    pvals <- phyper(numWdrawn, numW, numB,
                    numDrawn, lower.tail=TRUE)
  }

  list(p=pvals, odds=odds_ratio, expected=expected)
}

Try the gCrisprTools package in your browser

Any scripts or data that you put into this service are public.

gCrisprTools documentation built on Nov. 8, 2020, 8:17 p.m.