R/findSTRINGKnodes.R

# findSTRINGKnodes.R

#' \code{findSTRINGKnodes} output the K node values for vertices in the STRING interaction graph given the system of genes 'sys'.
#'
#' \code{findSTRINGKnodes} outputs a the K node values (as defined by Cornish & Markowetz, 2014),
#' a measure of the vertex association to the high-weighted vertices defined by the input
#' curated gene system 'sys'. Also plots the knode the top \emph{n} knode values
#'
#' @param sys (character)       5-character system code of genes

#' @return (numeric) Knode values
#' @examples
#' \dontrun{
#' # Find the Knodes values for genes in the STRING graph sorted by association to PHALY genes
#' findSTRINGKnodes("PHALY")
#' }
#' @export
findSTRINGKnodes <- function(sys) {

  # Get genes that are system components
  myDB <- fetchData("SysDB")
  geneSet <- SyDBgetSysSymbols(myDB, sys)[[1]]

  if (length(geneSet) == 0) {
    print("Invalid system")
    return (invisible(NULL))
  }

  # Load high-confidence STRING edges
  STRINGedges <- fetchData("STRINGedges0.9")

  # Create graph from STRING edges
  geneGraph <- igraph::graph_from_edgelist(matrix(c(STRINGedges$a, STRINGedges$b), ncol = 2, byrow = FALSE), directed = FALSE)

  # Find IDs of vertices that are part of geneSet
  ids <- match(geneSet, igraph::V(geneGraph)$name)
  ids <- ids[!is.na(ids)]

  # Create a vertex attribute called "annotated" that identifies if it was annotated as part of the
  # input system (sys)
  geneGraph <- igraph::set_vertex_attr(geneGraph, name="annotated", value=0)
  geneGraph <- igraph::set_vertex_attr(geneGraph, name="annotated", index=ids, value=1)

  # Calculate Knodes (rank of vertex association to annotated vertices)
  # https://www.rdocumentation.org/packages/SANTA/versions/2.10.2/topics/Knode
  knodes <- SANTA::Knode(geneGraph, dist.method = c("shortest.paths"), vertex.attr = "annotated", verbose = TRUE)

  # Within the first n nodes in the ranked list (knodes), find which genes
  # were not included in geneSet (genes in sys)
  topRankedGenes <- names(knodes[1:length(geneSet)])
  inGeneSetID <- which(topRankedGenes %in% geneSet)
  notInGeneSetID <- which(!topRankedGenes %in% geneSet)

  # Create R dataframe made of the top n nodes in the ranked list for plotting
  numPlotPts <- length(geneSet)
  knodeDF <- data.frame(knodes)
  knodeDF <- knodeDF[1:numPlotPts, , drop=FALSE] # drop parameter to retain row names
  knodeDF$inGeneSet <- topRankedGenes %in% geneSet # Mark the genes that were in the original system sys

  # Plot scatterpoint of above dataframe, and have the genes that were in sys
  # labelled in green, genes that were not in sys labelled in red

  # (BS Note) inGeneSet has no visible binding - changing to knodeDF$inGeneSet
  #           assuming that was intended.
  print(ggplot2::ggplot(knodeDF, ggplot2::aes(x=1:numPlotPts, y=knodes, label = row.names(knodeDF), color = factor(knodeDF$inGeneSet, levels=c(TRUE, FALSE)))) +
          ggrepel::geom_label_repel() +
          ggplot2::geom_point(color = 'black') +
          ggplot2::theme_classic(base_size = 16) +
          ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5)) +
          ggplot2::scale_color_manual(values = c("green", "red"), name = "In gene set") +
          ggplot2::labs(x="Gene Knode Rank", y="Knode Value", title=paste("Top Knode Values Through Association with Genes in ", sys)))

  return (knodes)
}

# [END]
hyginn/BCB420.2019.ESA documentation built on May 29, 2019, 1:23 p.m.