R/rrvgo.R

Defines functions calculateSimMatrix reduceSimMatrix getGoSize getGoTerm loadOrgdb

Documented in calculateSimMatrix getGoSize getGoTerm loadOrgdb reduceSimMatrix

#' calculateSimMatrix
#' Calculate the score similarity matrix between terms
#' 
#' @param x vector of GO terms
#' @param orgdb one of org.* Bioconductor packages (the package name, or the
#'   package itself)
#' @param semdata object with prepared GO DATA for measuring semantic similarity
#' @param ont ontlogy. One of c("BP", "MF", "CC")
#' @param method distance method. One of the supported methods by GOSemSim:
#'   c("Resnik", "Lin", "Rel", "Jiang", "Wang")
#' @return a square matrix with similarity scores between terms
#' @details 
#' All similarity measures available are those implemented in the
#' [GOSemSim package](https://www.bioconductor.org/packages/release/bioc/html/GOSemSim.html),
#' namely the Resnik, Lin, Relevance, Jiang and Wang methods. See the
#' [Semantic Similarity Measurement Based on GO](https://www.bioconductor.org/packages/release/bioc/vignettes/GOSemSim/inst/doc/GOSemSim.html#semantic-similarity-measurement-based-on-go)
#' section from the GOSeSim documentation for more details.
#' @examples
#' go_analysis <- read.delim(system.file("extdata/example.txt", package="rrvgo"))
#' simMatrix <- calculateSimMatrix(go_analysis$ID, orgdb="org.Hs.eg.db", ont="BP", method="Rel")
#' @import GOSemSim
#' @importFrom methods is
#' @export
calculateSimMatrix <- function(x,
                               orgdb,
                               semdata=GOSemSim::godata(orgdb, ont=ont),
                               ont=c("BP", "MF", "CC"),
                               method=c("Resnik", "Lin", "Rel", "Jiang", "Wang")) {
 
  # check function args
  ont <- match.arg(ont) 
  method <- match.arg(method) 
  
  # load orgdb object
  if(all(is(orgdb) != "OrgDb")) {
    orgdb <- loadOrgdb(orgdb)
  }
 
  # filter GO terms not in orgdb
  x <- unique(x)
  found <- x %in% names(semdata@IC)
  hasAncestor <- !is.na(sapply(x, function(x) tryCatch(GOSemSim:::getAncestors(ont)[x], error=function(e) NA)))
  if(all(!found)) {
    warning("No terms were found in orgdb for ", ont,
            "\nCheck that the organism and ontology match the ones provided by orgdb")
    return(NA)
  } else if(!all(found)) {
    warning("Removed ", length(x) - sum(found), " terms that were not found in orgdb for ", ont)
  }
  x <- x[found & hasAncestor]
  
 
  # return the similarity matrix
  m <- matrix(GOSemSim::goSim(x, x, semData=semdata, measure=method),
              ncol=length(x), dimnames=list(x, x))
  
  # removing terms which the similarity couldn't be calculated
  out <- apply(m, 2, function(x) all(is.na(x)))
  m[!out, !out]
}

#' reduceSimMatrix
#' Reduce a set of GO terms based on their semantic similarity and scores.
#' 
#' @details
#' Remove terms with a similarity higher than `threshold`. Decide which term
#' remains based on a score. If no score is provided, the select either the
#' broader or the narrower one (`untie` parm).
#' 
#' @param simMatrix a (square) similarity matrix
#' @param scores *named* vector with scores (weights) assigned to each term.
#'   Higher is better. Can be NULL (default, means no scores. In this case, a default score
#'   based on set size is assigned, thus favoring larger sets). Note: if you have
#'   p-values as scores, consider -1*log-transforming them (`-log(p)`)
#' @param threshold similarity threshold (0-1). Some guidance:
#'  Large (allowed similarity=0.9), Medium (0.7), Small (0.5), Tiny (0.4)
#'  Defaults to Medium (0.7)
#' @param orgdb one of org.* Bioconductor packages (the package name, or the
#'   orgdb object itself)
#' @return a data.frame with all terms and it's "reducer" (NA if the term was not reduced)
#' @examples
#' go_analysis <- read.delim(system.file("extdata/example.txt", package="rrvgo"))
#' simMatrix <- calculateSimMatrix(go_analysis$ID, orgdb="org.Hs.eg.db", ont="BP", method="Rel")
#' scores <- setNames(-log10(go_analysis$qvalue), go_analysis$ID)
#' reducedTerms <- reduceSimMatrix(simMatrix, scores, threshold=0.7, orgdb="org.Hs.eg.db")
#' @importFrom stats cutree hclust
#' @export
reduceSimMatrix <- function(simMatrix, scores=NULL, threshold=0.7, orgdb) {
 
  # check function arguments
  if(!is.null(scores) && !all(rownames(simMatrix) %in% names(scores))) {
    stop("scores vector does not contain all terms in the similarity matrix")
  }
  scores <- scores[match(rownames(simMatrix), names(scores))]
  
  # reorder the similarity matrix as in the scores, just in case they don't come in the same order
  orows <- match(rownames(simMatrix), names(scores))
  ocols <- match(colnames(simMatrix), names(scores))
  simMatrix <- simMatrix[orows, ocols]
  
  # get category size, and use it as scores if they were not provided
  sizes <- getGoSize(rownames(simMatrix), orgdb)
  if(is.null(scores)) {
    scores <- sizes
  }
  
  # sort matrix based on the score
  o <- rev(order(scores, sizes, na.last=FALSE))
  simMatrix <- simMatrix[o, o]
  
  # cluster terms and cut the tree at the desired threshold.
  # Then find the term with the highest score as the representative of each cluster
  cluster <- cutree(hclust(as.dist(1-simMatrix)), h=threshold)
  clusterRep <- tapply(rownames(simMatrix), cluster, function(x) x[which.max(scores[x])])
  
  # return
  data.frame(go=rownames(simMatrix),
             cluster=cluster,
             parent=clusterRep[cluster],
             parentSimScore=unlist(Map(seq_len(nrow(simMatrix)), clusterRep[cluster], f=function(i, j) simMatrix[i, j])),
             score=scores[match(rownames(simMatrix), names(scores))],
             size=sizes[match(rownames(simMatrix), names(sizes))],
             term=getGoTerm(rownames(simMatrix)),
             parentTerm=getGoTerm(clusterRep[cluster]))
}

#' getGoSize
#' Get GO term size (# of genes)
#' 
#' @param terms GO terms
#' @param orgdb one of org.* Bioconductor packages (the package name, or the
#'   package itself)
#' @importFrom AnnotationDbi select keys
#' @importFrom stats setNames
#' @importFrom methods is
#' @return number of genes associated with each term
getGoSize <- function(terms, orgdb) {
  if(all(is(orgdb) != "OrgDb")) {
    orgdb <- loadOrgdb(orgdb)
  }
  
  # get all GO terms with genes associated
  go <- suppressMessages(
          AnnotationDbi::select(orgdb,
                                keytype="ENTREZID",
                                columns=c("GO", "ONTOLOGY"),
                                keys=AnnotationDbi::keys(orgdb, keytype="ENTREZID")))
  go <- go[!is.na(go$GO), ]
  go <- go[go$GO %in% terms, ]
  
  # count
  counts   <- table(go$GO)
  go <- go[go$GO %in% terms, ]
  empty    <- terms[!(terms %in% names(counts))]
  nocounts <- setNames(rep(0, length(empty)), empty)
  
  c(counts, nocounts)
}

#' getGoTerm
#' Get the description of a GO term
#' 
#' @param x GO terms
#' @import GO.db
#' @return the Term slot in GO.db::GOTERM[[x]]
getGoTerm <- function(x) {
  sapply(x, function(x) GO.db::GOTERM[[x]]@Term)
}

#' loadOrgdb
#' Load an orgdb object
#'
#' @param orgdb one of org.* Bioconductor packages
#' @return the loaded orgdb
loadOrgdb <- function(orgdb) {
  if(!requireNamespace(orgdb, quietly = TRUE)) {
    stop("Bioconductor orgdb for ", orgdb, " not found. Consider installing it.",
         call. = FALSE)
  }
  eval(parse(text=paste0(orgdb, "::", orgdb)))
}
ssayols/rrvgo-ci documentation built on March 17, 2020, 12:14 a.m.