R/get-panther.R

Defines functions getPantherGOSLIM getPantherPathways getGOslimGeneSetDb getPantherGeneSetDb

Documented in getGOslimGeneSetDb getPantherGeneSetDb

#' Get pathways/GOslim information from PANTHER.db Biocondcutor package.
#'
#' @description
#' This is a convience function that orchestrates the PANTHER.db package to
#' return GeneSetDb objects for either pathway or GOslim information for
#' human or mouse.
#'
#' Note that for some reason the `PANTHER.db` package needs to be
#' installed in a user-writable package location for this to work properly.
#' If you see an error like "Error in resqlite_send_query ... attempt to
#' write a readonly database", this is the problem. Please install another
#' version of the `PANTHER.db` package in a user-writable directory using
#' [BiocManager::install()].
#'
#' @section GOSLIM:
#' [GO Slims](http://geneontology.org/page/go-slim-and-subset-guide) are
#' "cut down" versions of the GO ontology that contain a subset of the terms in
#' the whole GO.
#'
#' PANTHER provides their own set of
#' [GO slims](http://www.pantherdb.org/panther/ontologies.jsp), although
#' it's not clear how often these get updated.
#'
#' @rdname getPantherGeneSetDb
#' @export
#' @importFrom GSEABase EntrezIdentifier
#' @param type "pathway" or, "goslim"
#' @param species "human" or "mouse"
#'
#' @return A wired up GeneSetDb
getPantherGeneSetDb <- function(type=c('pathway', 'goslim'),
                                species=c('human', 'mouse')) {
  species <- match.arg(species)
  type <- match.arg(type)

  if (!requireNamespace('PANTHER.db', quietly=TRUE)) {
    stop("The PANTHER.db bioconductor package is required")
  }
  if (species == 'human') {
    org.pkg <- 'org.Hs.eg.db'
    xorg <- 'Homo_sapiens'
  } else {
    org.pkg <- 'org.Mm.eg.db'
    xorg <- 'Mus_musculus'
  }
  on.exit({
    unloadNamespace('PANTHER.db')
    unloadNamespace(org.pkg)
  })
  if (!requireNamespace(org.pkg, quietly=TRUE)) {
    stop(org.pkg, " bioconductor package required for this species query")
  }

  p.db <- PANTHER.db::PANTHER.db
  PANTHER.db::pthOrganisms(p.db) <- toupper(species)
  org.db <- getFromNamespace(org.pkg, org.pkg)

  out <- switch(type,
                pathway=getPantherPathways(p.db, org.db),
                goslim=getPantherGOSLIM(p.db, org.db))
  mapIds <- getFromNamespace('mapIds', 'AnnotationDbi')
  out@db$symbol <- mapIds(org.db, out@db$feature_id, 'SYMBOL', 'ENTREZID')
  org(out) <- xorg
  out
}

#' @rdname getPantherGeneSetDb
#' @export
getGOslimGeneSetDb <- function(species=c('human', 'mouse')) {
  getPantherGeneSetDb('goslim', species)
}

#' @noRd
getPantherPathways <- function(p.db, org.db) {
  aselect <- getFromNamespace('select', 'AnnotationDbi')
  p.all <- aselect(p.db, AnnotationDbi::keys(p.db, keytype="PATHWAY_ID"),
                   columns=c("PATHWAY_ID", "PATHWAY_TERM", "UNIPROT"),
                   'PATHWAY_ID')
  ## Map uniprot to entrez
  umap <- aselect(org.db, p.all$UNIPROT, c('UNIPROT', 'ENTREZID'), 'UNIPROT')
  m <- merge(p.all, umap, by='UNIPROT')
  m <- m[!is.na(m[['ENTREZID']]),,drop=FALSE]
  lol <- list(`panther pathway`=split(m$ENTREZID, m$PATHWAY_TERM))

  idxref <- unique(as.data.table(p.all)[, c('PATHWAY_TERM', 'PATHWAY_ID'), with=FALSE])
  setkeyv(idxref, 'PATHWAY_TERM')

  url.fn <- function(coll, name, ...) {
    ## Captures the lookup table for future use from parent.frame(!)
    ## Not sure if this will cause any serious memory leak, but ...
    pid <- idxref[list(name)]$PATHWAY_ID
    if (is.na(pid)) {
      return("http://pantherdb.org/panther/prowler.jsp?reset=1&selectedView=5")
    }
    paste0('http://pantherdb.org/pathway/pathwayDiagram.jsp?catAccession=', pid)
  }

  gdb <- GeneSetDb(lol)
  geneSetCollectionURLfunction(gdb, names(lol)) <- url.fn
  featureIdType(gdb, names(lol)) <- EntrezIdentifier()
  gdb
}

#' @noRd
getPantherGOSLIM <- function(p.db, org.db) {
  if (!requireNamespace("GO.db")) {
    stop("GO.db is required for this functionality")
  }
  aselect <- getFromNamespace('select', 'AnnotationDbi')
  p.all <- aselect(p.db,
                   AnnotationDbi::keys(p.db, keytype='GOSLIM_ID'),
                   columns=c('ENTREZ', 'GOSLIM_ID', 'GOSLIM_TERM'),
                   'GOSLIM_ID')
  p.all <- p.all[!is.na(p.all[['ENTREZ']]),,drop=FALSE]
  p.all <- p.all[order(p.all[['ENTREZ']]),]

  go <- aselect(GO.db::GO.db,
                unique(p.all[['GOSLIM_ID']]),
                c('GOID', 'TERM'),
                'GOID')
  go.missed <- go[is.na(go[['TERM']]),,drop=FALSE]
  ## 2015-09-02 (Bioc 3.1)
  ##       GOID TERM
  ## GO:0005083   NA
  ## GO:0006917   NA
  ## GO:0019204   NA
  go.add <- data.frame(
    GOID=c("GO:0005083", "GO:0006917", "GO:0019204"),
    TERM=c(
      "GTPase regulator activity",
      "apoptotic process",
      "nucleotide phosphatase activity (obsolete)"
    ), stringsAsFactors=FALSE)
  go <- rbind(
    go[!is.na(go[['TERM']]),,drop=FALSE],
    go.add
  )

  GO <- merge(p.all, go, by.x='GOSLIM_ID', by.y='GOID', all.x=TRUE)

  missed <- is.na(GO$TERM)
  mids <- unique(GO$GOSLIM_ID[missed])
  if (length(mids)) {
    warning(length(missed), " GOSLIM terms were not found in GO.db",
            immediate.=TRUE)
    GO$TERM <- ifelse(missed, GO$GOSLIM_ID, GO$TERM)
  }

  lol <- split(GO$ENTREZ, GO$TERM)
  url.fn <- function(x, y, ...) {
    "http://www.pantherdb.org/panther/ontologies.jsp"
  }

  gdb <- GeneSetDb(lol, collectionName='GOSLIM')
  xref <- match(gdb@table$name, GO$TERM)
  gdb@table$GOID <- GO$GOSLIM_ID[xref]
  gdb@table$ontology <- GO$GOSLIM_TERM[xref]
  geneSetCollectionURLfunction(gdb, 'GOSLIM') <- url.fn
  featureIdType(gdb, 'GOSLIM') <- EntrezIdentifier()
  gdb
}
lianos/multiGSEA documentation built on Nov. 17, 2020, 1:26 p.m.