R/jamenrich-communities-nodegroups.R

# jamenrich-communities-to-nodegroups.R

#' Convert communities object to nodegroups list format
#'
#' Convert communities object to nodegroups list format
#'
#' Note that this function is "lossy", in that the output `list`
#' does not contain all the information necessary to reconstitute
#' the input `communities` object in detail. However, the output
#' `list` can be converted to a `communities` object that will
#' be accepted by most `igraph` related functions that require
#' that object type as an input value.
#'
#'
#' @family jam igraph functions
#'
#' @return `list` of `character` vectors, where each vector contains
#'    names of `igraph` nodes. When `algorithm` is defined in the
#'    input object, it is included as an attribute of the output `list`,
#'    accessible with `attr(out, "algorithm")`.
#'
#'    When optional value `"cluster_names"` is present in the `communities`
#'    object, they are used to define the output `list` names.
#'
#' @param wc `communities` object as returned by `igraph` functions
#'    such as `cluster_optimal()`, `cluster_walktrap()`, or
#'    `cluster_leading_eigen()`.
#' @param ... additional arguments are ignored.
#'
#' @export
communities2nodegroups <- function
(wc,
 ...)
{
   #
   if (length(wc$names) == 0) {
      wc$names <- seq_along(wc$membership);
   }
   nodegroups <- split(
      wc$names,
      wc$membership)
   if ("cluster_names" %in% names(wc)) {
      names(nodegroups) <- wc$cluster_names
   }
   if ("algorithm" %in% names(wc)) {
      attr(nodegroups, "algorithm") <- wc$algorithm;
   } else {
      attr(nodegroups, "algorithm") <- "nodegroups";
   }
   return(nodegroups)
}


#' Convert nodegroups list to communities object
#'
#' Convert nodegroups list to communities object
#'
#' Note that this function is "lossy", in that the output `communities`
#' object will not contain any supporting data specific to the
#' community detection algorithm originally used.
#' However, the output `communities` object will be accepted
#' by most `igraph` related functions that require
#' that object type as an input value.
#'
#' The `names(nodegroups)` are used to define a new element in the
#' output `communities` object `"cluster_names"`, so the names
#' will be maintained in the data. Default `igraph` functions
#' do not use these names, but they are used by `multienrichjam`
#' for example by function `make_point_hull()` which uses these
#' names to label each cluster during plotting.
#'
#' @family jam igraph functions
#'
#' @return `community` object, which is essentially a `list` with
#'    specific required elements:
#'    * `"membership"` - `integer` assignment of nodes to clusters
#'    * `"names"` - `character` list of node names
#'    * `"vcount"` - `integer` number of nodes
#'    * `"algorithm"` - `character` string with the name of the community
#'    detection method used.
#'    * `"cluster_names"` - `character` labels associated with `membership`
#'    index values. These names are not generated by `igraph` community
#'    detection, and are therefore optional for use in most `igraph`
#'    workflows. However, they are used in some `multienrichjam` functions,
#'    specifically `make_point_hull()` which optionally displays a
#'    label beside each node cluster during plotting.
#'
#' @param wc `communities` object as returned by `igraph` functions
#'    such as `cluster_optimal()`, `cluster_walktrap()`, or
#'    `cluster_leading_eigen()`.
#' @param algorithm `character` or `NULL`, indicating the name of the
#'    community detection algorithm used.
#'    * When `algorithm` is defined, it is used instead of
#'    `attr(nodegroups, "algorithm")`.
#'    * When `algorithm` is `NULL`, `attribute(nodegroups, "algorithm")` is
#'    used if defined, otherwise `algorithm="nodegroups"`.
#' @param ... additional arguments are ignored.
#'
#' @export
nodegroups2communities <- function
(nodegroups,
 algorithm=NULL,
 ...)
{
   #
   if (length(nodegroups) == 0) {
      stop("Input nodegroups was empty.")
   }
   if (length(names(nodegroups)) == 0) {
      names(nodegroups) <- seq_along(nodegroups)
   }
   if (length(algorithm) == 0) {
      if ("algorithm" %in% names(attributes(nodegroups))) {
         algorithm <- attr(nodegroups, "algorithm");
      } else {
         algorithm <- "nodegroups";
      }
   }
   nodegroup_factors <- factor(names(nodegroups),
      levels=names(nodegroups));
   nodegroup_df <- data.frame(check.names=FALSE,
      stringsAsFactors=FALSE,
      name=unlist(unname(nodegroups)),
      nodegroup=rep(nodegroup_factors, lengths(nodegroups)),
      membership=as.integer(rep(nodegroup_factors, lengths(nodegroups))))
   if (is.numeric(nodegroup_df$name)) {
      nodegroup_df <- jamba::mixedSortDF(nodegroup_df,
         byCols=c("name"));
   }
   wc <- list(
      membership=nodegroup_df$membership,
      names=nodegroup_df$name,
      vcount=nrow(nodegroup_df),
      algorithm=algorithm,
      cluster_names=levels(nodegroup_factors))
   class(wc) <- "communities";
   return(wc)
}

#' Assign labels to igraph communities
#'
#' Assign labels to igraph communities
#'
#' @family jam igraph functions
#'
#' @return `communities` or `list` format matching the input `wc` format.
#'    * When `communities` is input, additional value `cluster_names`
#'    will contain a `character` vector of names corresponding to each
#'    integer index in `wc$membership`.
#'    * When `nodegroups` is input, the `list` names will be a `character`
#'    vector of cluster labels.
#'
#' @param wc `communities` object, or `list` in form of nodegroups,
#'    which is a `list` of `character` vectors that contain `igraph`
#'    node names.
#' @param labels `character` vector of optional labels to assign directly
#'    to community clusters. When not defined, the auto-detection method
#'    is used.
#' @param add_catchwords `character` of optional words to include as
#'    catchwords, to be excluded from use in the final label.
#' @param num_keep_terms `integer` maximum number of terms to be included
#'    in the final output label, when auto-detection is used.
#' @param keep_terms_sep `character` string used as a delimited to separate
#'    each term when multiple terms are concatenated together to form
#'    the cluster label.
#' @param ... additional arguments are ignored.
#'
#' @export
label_communities <- function
(wc,
 labels=NULL,
 add_catchwords=NULL,
 num_keep_terms=3,
 keep_terms_sep=",\n",
 ...)
{
   # define catchwords
   catchwords <- unique(c(
      add_catchwords,
      "the", "an", "a", "of", "in", "between", "to", "and",
      "peptide", "peptides",
      "protein", "proteins",
      "gene", "genes",
      "system", "systems",
      "role", "roles",
      "base", "bases", "based", "basing", "basic",
      "acid", "acids", "acidic",
      "cells", "cell", "cellular",
      "space", "spaces", "spaced", "spacing",
      "positive", "positives", "positively",
      "negative", "negatives", "negatively",
      "pathway", "pathways",
      "set", "sets",
      "position", "positions", "positioned", "positioning",
      "function", "functions", "functioning", "functioned",
      "signaling", "signal", "signaling", "signals", "signaled",
      "activity", "activation", "activate", "activates", "activated", "activating",
      "involve", "involved", "involves", "involving",
      "response", "responses", "respond", "responds", "responded", "responding",
      "transcript", "transcripts", "transcribe", "transcribed", "transcribes",
      "organization", "organize", "organizes", "organized", "organizing",
      "formation", "form", "forms", "formed", "forming",
      "enhanced", "enhance", "enhances", "enhancing",
      "mediated", "mediate", "mediates", "mediating",
      "expression", "express", "expresses", "expressed", "expressing",
      "compound", "compounds", "compounding", "compounded",
      "process", "processes", "processed", "processing",
      "regulation", "regulate", "regulates", "regulated", "regulating",
      # "up-regulation", "up-regulate", "up-regulates", "up-regulated", "up-regulating",
      # "down-regulation", "down-regulate", "down-regulates", "down-regulated", "down-regulating",
      "the"))
   hyphen_pattern <- paste0(
      "-(",
      paste(catchwords, collapse="|"),
      ")$")

   input_type <- NULL;
   if ("communities" %in% class(wc) ||
         (is.list(wc) && all(c("membership", "names") %in% names(wc)))) {
      # define list
      input_type <- "communities";
      nodegroups_wc <- communities2nodegroups(wc);
   } else if (is.list(wc)) {
      input_type <- "nodegroups";
      nodegroups_wc <- wc;
   } else {
      stop("Input wc must be 'communities' or 'nodegroups' list object.");
   }

   if (length(labels) > 0) {
      if (length(labels) != length(nodegroups_wc)) {
         stop("length(labels) must equal the number of clusters in wc")
      }
   }

   # assign most common terms as a cluster label
   nodegroup_labels <- lapply(seq_along(nodegroups_wc), function(inum){
      i <- nodegroups_wc[[inum]];
      # split on whitespace, tab, or newline
      j <- tolower(unlist(strsplit(i, "[\t\r\n ]+")));
      # remove catchword from the second word of hyphenated phrases
      j <- gsub(hyphen_pattern, "", j);
      # remove non-alphanumeric characters
      j <- gsub("[():;,]+", "", j)

      # keep words which are not catchwords, and with two or more characters
      j_keep <- (!j %in% catchwords & nchar(j) > 1);
      j <- j[j_keep];
      if (length(j) == 0) {
         # if no words remain, name the cluster by number
         return(inum)
      }
      names(head(tcount(j), num_keep_terms))
   })

   names(nodegroups_wc) <- nodegroup_labels;
   if ("nodegroups" %in% input_type) {
      return(nodegroups_wc)
   }

   # assign cluster_names to communities object
   wc$cluster_names <- nodegroup_labels;
   return(wc);
}

#' Sync igraph nodes and communities
#'
#' Sync igraph nodes and communities
#'
#' This function ensures that `igraph` nodes and corresponding
#' community clusters are synchronized for proper downstream use.
#' In particular, when using a subgraph, or when communities only
#' assign a subset of nodes to clusters, this function ensures the
#' two objects are in sync, the same order, and with the same nodes.
#'
#' @return `list` with two elements:
#'    * `"g"` - the `igraph` object after subsetting to match node names
#'    shared with `wc`, as necessary.
#'    * `"wc'` - the `communities` object after subsetting to match
#'    node names shared with `g`, as necessary. When input `wc` is
#'    in `list` nodegroups format, that same format is returned.
#'
#' @family jam igraph functions
#'
#' @param g `igraph` object
#' @param wc `communities` object, or `list` in form of nodegroups,
#'    which is a `list` of `character` vectors that contain `igraph`
#'    node names.
#' @param verbose `logical` indicating whether to print verbose output.
#' @param ... additional arguments are passed to `nodegroups2communities()`
#'    only when input `wc` is supplied in `list` nodegroups format.
#'
#' @export
sync_igraph_communities <- function
(g,
 wc,
 verbose=TRUE,
 ...)
{
   # validate input
   input_type <- NULL;
   if ("communities" %in% class(wc) ||
         (is.list(wc) && all(c("membership", "names") %in% names(wc)))) {
      # define list
      input_type <- "communities";
   } else if (is.list(wc)) {
      input_type <- "nodegroups";
      nodegroups_wc <- wc;
      wc <- nodegroups2communities(nodegroups_wc,
         ...)
   } else {
      stop("Input wc must be 'communities' or 'nodegroups' list object.");
   }

   g_nodes <- igraph::V(g)$name;
   wc_nodes <- wc$names;
   observed_nodes <- intersect(g_nodes, wc_nodes)


   # subset igraph
   if (!all(g_nodes %in% observed_nodes)) {
      g_keep <- which(g_nodes %in% observed_nodes)
      g <- igraph::subgraph(g, v=g_keep)
      # if layout is defined, subset in place
      if ("layout" %in% igraph::graph_attr_names(g)) {
         if (verbose) {
            jamba::printDebug("sync_igraph_communities(): ",
               "igraph layout was subset to match the remaining nodes.")
         }
         g_layout <- igraph::graph_attr(g, "layout");
         igraph::graph_attr(g, "layout") <- g_layout[g_keep, , drop=FALSE];
      }
   }

   # subset (and order) communities
   wc_keep <- match(igraph::V(g)$name, wc_nodes)
   wc$membership <- wc$membership[wc_keep]
   wc$names <- wc$names[wc_keep]
   if (length(wc$modularity) > 1) {
      wc$modularity <- wc$modularity[wc_keep]
   }
   if (length(wc$memberships) > 1) {
      wc$memberships <- wc$memberships[wc_keep, , drop=FALSE]
   }
   if (length(wc$merges) > 0) {
      if (verbose) {
         jamba::printDebug("sync_igraph_communities(): ",
            "community merges were removed.")
      }
      wc$merges <- NULL;
   }
   wc$vcount <- igraph::vcount(g);
   class(wc) <- "communities";

   if ("cluster_names" %in% names(wc)) {
      cluster_names <- wc$cluster_names;
      names(cluster_names) <- seq_along(cluster_names);
      if (!all(names(cluster_names) %in% as.character(wc$membership))) {
         if (verbose) {
            jamba::printDebug("sync_igraph_communities(): ",
               "cluster_names were reduced due to match the remaining nodes.")
         }
         cn_keep <- (names(cluster_names) %in% as.character(wc$membership));
         new_cluster_names <- unname(cluster_names[cn_keep]);
         new_membership <- as.numeric(as.factor(wc$membership));
         wc$cluster_names <- new_cluster_names;
         wc$membership <- new_membership
      }
   }

   if ("nodegroups" %in% input_type) {
      wc <- communities2nodegroups(wc);
   }

   return(list(
      g=g,
      wc=wc));
}
jmw86069/multienrichjam documentation built on Nov. 3, 2024, 10:29 p.m.