R/topology.R

Defines functions get_hub_degree get_hub_high_co build_graph_from_sq_mat

Documented in build_graph_from_sq_mat get_hub_degree get_hub_high_co

# Removing errors about magrittr's placeholder `.` not declared
utils::globalVariables(c("."))

# Removing errors about dplyr data-variables
utils::globalVariables(c("gene_from", "gene_to"))

#' Return graph from squared matrix network
#'
#' Takes a squared matrix containing the pairwise similarity scores for each
#' gene and return a igraph object.
#'
#' @param sq_mat matrix or data.frame, squared matrix representing
#'
#' @importFrom igraph graph_from_data_frame
#' @importFrom dplyr filter
#' @importFrom magrittr %>%
#' @importFrom tibble rownames_to_column
#' @importFrom tidyr pivot_longer
#'
#' @return An igraph object
#'
#' @examples
#' mat <- matrix(runif(40*40), 40)
#' build_graph_from_sq_mat(mat)
#'
#' @export

build_graph_from_sq_mat <- function(sq_mat) {
  # Checks
  if (!(is.matrix(sq_mat) | is.data.frame((sq_mat))))
    stop("sq_mat should be a matrix or a data.frame")
  if (any(is.na(sq_mat)))
    warning("sq_mat should not contain any missing value")
  if ((any(sq_mat > 1) | any(sq_mat < -1)) & !any(is.na(sq_mat)))
    stop("sq_mat should be filled with value in the [-1,1] range")
  if (nrow(sq_mat) != ncol(sq_mat))
    stop("sq_mat must be a squared matrix")

  # From matrix to graph
  sq_mat %>%
    as.data.frame %>%
    tibble::rownames_to_column("gene_from") %>%
    tidyr::pivot_longer(-gene_from,
                        names_to = "gene_to",
                        values_to = "weight") %>%
    # removing pairwise duplicates
    .[!duplicated(t(apply(.[, c("gene_from", "gene_to")], 1, sort))), ] %>%
    as.data.frame %>%
    # removing self edge
    dplyr::filter(gene_from != gene_to) %>%
    igraph::graph_from_data_frame(directed = FALSE)
}


#' Determine hub genes based on connectivity
#'
#' Compute connectivity of each gene by module if provided or for whole network
#' if not, and return the top_n highest connected ones.
#'
#' @param network matrix or data.frame, square table representing connectivity
#' between each genes as returned by build_net. Can be whole network or a
#' single module.
#' @param modules list, modules defined as list of gene vectors. If null,
#' network is supposed to be the whole network or an already split module
#' @param top_n integer, number of genes to be considered as hub genes
#'
#' @return A list of vectors, or single vector of gene names
#'
#' @importFrom magrittr %>%
#'
#' @examples
#' mat <- matrix(runif(40*40), 40)
#' colnames(mat) <- paste0("gene_", seq_len(ncol(mat)))
#' rownames(mat) <- paste0("gene_", seq_len(nrow(mat)))
#' get_hub_high_co(mat)
#'
#' @export

get_hub_high_co <- function(network, modules = NULL, top_n = 5) {
  # Checks
  .check_network(network)
  if (!is.null(modules)) {
    .check_module(modules, is.list(modules))}
  if (length(top_n) > 1) stop("top_n must be a single numeric value")
  if (!is.numeric(top_n)) stop("top_n must be a numeric value")
  if (top_n < 1 | top_n %% 1 != 0) stop("If not NULL, block_size must be a ",
  " whole number >= 1")

  # Highly connected genes selection
  # Considered network is already a single module, or looking for hubs
  # independently of modules split
  if (is.null(modules)) {
    if (top_n > ncol(network)) stop("top_n > to number of genes into network")
    modules <- list(colnames(network))
  }

  hubs <- lapply(modules, function(x){
    net <- network[x,x]
    if (top_n > ncol(net)) {
      warning("top_n > to number of genes into one of the modules. Taking all",
              " genes in this case.")
      top_n <- ncol(net)
    } else {
      hubs_name <- rowSums(net) %>%
        sort(decreasing = TRUE) %>%
        .[seq_len(top_n)]
    }
  })
  return(hubs)
}

#' Determine hub genes based on degree
#'
#' Remove edges from the graph which value is under weight_th then compute
#' degree of each node (gene). Hub gene are genes whose degree value is above
#' average degree value of the thresholded network.
#'
#' @param network matrix or data.frame, square table representing connectivity
#' between each genes as returned by build_net. Can be whole network or a
#' single module.
#' @param modules list, modules defined as list of gene vectors. If null,
#' network is supposed to be the whole network or an already split module
#' @param weight_th decimal, weight threshold under or equal to which edges
#' will be removed
#'
#' @details GWENA natively build networks using WGCNA. These networks are
#' complete in a graph theory sens, meaning all nodes are connected to each
#' other. Therefore a threshold need to be applied so degree of all nodes
#' isn't the same.
#'
#' @importFrom igraph delete.edges degree E
#'
#' @return A list of vectors, or single vector of gene names
#'
#' @examples
#' mat <- matrix(runif(40*40), 40)
#' colnames(mat) <- paste0("gene_", seq_len(ncol(mat)))
#' rownames(mat) <- paste0("gene_", seq_len(nrow(mat)))
#' get_hub_degree(mat)
#'
#' @export

get_hub_degree <- function(network, modules = NULL, weight_th = 0.2) {
  # Checks
  .check_network(network)
  if (!is.null(modules)) {
    .check_module(modules, is.list(modules))}
  if (!is.null(weight_th)) {
    if (length(weight_th) > 1) stop("weight_th must be a single numeric value")
    if (!is.numeric(weight_th)) stop("weight_th must be a numeric value")
    if (weight_th < 0 | weight_th >= 1) stop("weight_th must be a in [0;1[")
  }
  # TODO : check if can avoid transforming to igraph object.
  # It takes a lot of time...

  # Above average degree genes
  # Considered network is already a single module, or looking for hubs
  # independently of modules split
  if (is.null(modules)) {
    modules <- list(colnames(network))
  }
  hubs <- lapply(modules, function(x){
    net_degree <- network[x,x] %>%
      build_graph_from_sq_mat() %>%
      igraph::delete.edges(which(igraph::E(.)$weight <= weight_th)) %>%
      igraph::degree()
    if (table(net_degree) %>% length == 1) {
      x_hubs <- c()
    } else {
      x_hubs <- net_degree[which(net_degree > (net_degree %>% mean))] }

  })

  return(hubs)
}


#' Determine hub genes based on Kleinberg's score
#'
#' Compute Kleinberg's score (defined as the principal eigenvector of A*t(A),
#' where A is the similarity matrix of the graph) of each gene by module if
#' provided or for whole network if not, and return the top_n highest ones.
#'
#' @param network matrix or data.frame, square table representing connectivity
#' between each genes as returned by build_net. Can be whole network or a
#' single module.
#' @param modules list, modules defined as list of gene vectors. If null,
#' network is supposed to be the whole network or an already split module
#' @param top_n integer, number genes to be considered as hub genes
#' @param k_th decimal, Kleinberg's score threshold above or equal to which
#' genes are considered as hubs
#'
#' @details If you provide a top_n value, you can't provide a k_th value and
#' vice versa. If none of them is provided, top_n = 5.
#' For more information on Kleinberg's score, look at
#' \code{\link[igraph]{hub_score}} from igraph.
#'
#' @importFrom igraph hub_score
#'
#' @return A list of vectors, or single vector of gene names
#'
#' @examples
#' mat <- matrix(runif(40*40), 40)
#' colnames(mat) <- paste0("gene_", seq_len(ncol(mat)))
#' rownames(mat) <- paste0("gene_", seq_len(nrow(mat)))
#' get_hub_degree(mat)
#' get_hub_kleinberg(mat, top_n = NULL, k_th = 0.9)
#'
#' @export

get_hub_kleinberg <- function(network, modules = NULL, top_n = 5,
                              k_th = NULL) {
  # Checks
  .check_network(network)
  if (!is.null(modules)) {
    .check_module(modules, is.list(modules))}
  if (!is.null(top_n) & !is.null(k_th)) {
    k_th <- NULL
    warning("Conflict: top_n and k_th value provided.",
    " Keeping only top_n value") }
  if (!is.null(k_th)) {
    if (length(k_th) > 1) stop("k_th must be a single numeric value")
    if (!is.numeric(k_th)) stop("k_th must be a numeric value")
    if (k_th < 0 | k_th >= 1) stop("k_th must be a in [0;1[")
  }
  if (!is.null(top_n)) {
    if (length(top_n) > 1) stop("top_n must be a single numeric value")
    if (!is.numeric(top_n)) stop("top_n must be a numeric value")
    if (top_n < 1 | top_n %% 1 != 0)
      stop("If not NULL, block_size must be a whole number >= 1")
  }

  # Considered network is already a single module, or looking for hubs
  # independently of modules split
  if (is.null(modules)) {
    modules <- list(module = colnames(network))
  }
  hubs <- lapply(modules, function(x){
    net_hub_score <- network[x,x] %>%
      build_graph_from_sq_mat() %>%
      igraph::hub_score(scale = TRUE) %>%
      .$vector

    # With top_n
    if (!is.null(top_n)) {
      x_hubs <- net_hub_score %>% sort(decreasing = TRUE) %>% .[seq_len(top_n)]
    } else { # With k_th
      x_hubs <- net_hub_score[which(net_hub_score >= k_th)]
    }
    return(x_hubs)
  })

  return(hubs)
}


#' Determine hub genes inside each module
#'
#' Return genes considered as hub genes inside each module of a network
#' following the selected method. Method will be lauched with default
#' parameters. If specific parameters desired, please use directly the
#' function \code{get_hub_...} itself.
#'
#' @param network matrix or data.frame, square table representing connectivity
#' between each genes as returned by build_net. Can be whole network or a
#' single module.
#' @param modules list, modules defined as list of gene vectors. If null,
#' network is supposed to be the whole network or an already split module
#' @param method string, name of the method to be used for hub gene detection.
#' See details.
#'
#' @details
#' \describe{
#'   \item{highest connectivity}{Select the top n (n depending on
#'   parameter given) highest connected genes. Similar to
#'   WGCNA::chooseTopHubInEachModule.}
#'   \item{superior degree}{Select genes which degree is greater than average
#'   connection degree of the network. Definition from network theory.}
#'   \item{Kleinberg's score}{Select genes which Kleinberg's score superior to
#'   provided threshold.}
#' }
#'
#' @return A list of vectors representing hub genes, by module
#'
#' @examples
#' mat <- matrix(runif(40*40), 40)
#' colnames(mat) <- paste0("gene_", seq_len(ncol(mat)))
#' rownames(mat) <- paste0("gene_", seq_len(nrow(mat)))
#' get_hub_genes(mat)
#'
#' @export

get_hub_genes <- function(network, modules = NULL, method =
                            c("highest connectivity", "superior degree",
                              "Kleinberg's score")) {
  # Checks
  method = match.arg(method)

  # Call
  if (method == "highest connectivity") {
    hubs <- get_hub_high_co(network, modules)
  } else if (method == "superior degree") {
    hubs <- get_hub_degree(network, modules)
  } else if (method == "Kleinberg's score") {
    hubs <- get_hub_kleinberg(network, modules) }

  return(hubs)
}


# Removing errors about dplyr data-variables
utils::globalVariables(c("vertex.size", "edge.width"))

#' Plot co-expression network
#'
#' Display a graph representing the co-expression network and different
#' informations like hubs, enrichments
#'
#' @param graph_module igraph object, module to plot.
#' @param hubs character vector or numeric vector with names, optionnal,
#' vector of gene names or vector of numeric values named with gene names.
#' @param groups matrix or data.frame, a two cols table with the gene id in the
#' first one, and the group assignation in the second one.
#' @param lower_weight_th,upper_weight_th decimal, weight threshold above
#' lower_weight_th or below upper_weight_th which edges will be removed.
### @param enrichment list, representing a gost object. # Not yet implemented
#' @param title string, main title that will be displayed on the plot.
#' @param degree_node_scaling boolean, indicates if node size should represent
#' the degree of this node.
#' @param node_scaling_min,node_scaling_max integer, if degree_node_scaling is
#' TRUE, it is the min/max size of the node, else it is the exact size of all
#' node.
#' @param edge_scaling_min,edge_scaling_max integer, min/max width of the edge
#' @param nb_row_legend integer, number of levels in the legend.
#' @param layout numeric matrix or function or string, numeric matrix for nodes
#' coordinates, or function for layout, or name of a layout function available
#' in \code{igraph}. Default "auto" will choose the best layout depending on
#' the graph. For more information, see \code{\link{igraph.plotting}}.
#' @param zoom integer, scaling factor by which it's possible to have
#' compact graph (< 1) or larger graph (> 1) display.
#' @param vertex.label.cex,legend_cex float, font size for vertex labels. It is
#' interpreted as a multiplication factor of some device-dependent base font
#' size. If 0, no labels displayed.
#' @param vertex.label.color,edge.color,vertex.frame.color,vertex.color
#' character and/or integer vector , color of the labels.
#' It may either contain integer values, named colors or RGB specified colors
#' with three or four bytes. All strings starting with ‘#’ are assumed to be
#' RGB color specifications. It is possible to mix named color and RGB colors.
#' @param vertex.label.family character, font family to be used for vertex
#' labels.
#' @param vertex.label.dist integer, distance of the label from the center of
#' the vertex. If it is 0 then the label is centered on the vertex. If it is 1
#' then the label is displayed beside the vertex.
#' @param groups_palette character and/or integer vector, vertices group palette
#' of colors for the groups specified. It may either contain integer values, 
#' named colors or RGB specified colors with three or four bytes. All strings 
#' starting with ‘#’ are assumed to be RGB color specifications. It is possible 
#' to mix named color and RGB colors.
#' @param window_x_min decimal, value for the bottom limit of the window.
#' @param window_x_max decimal, value for the top limit of the window.
#' @param window_y_min decimal, value for the left limit of the window.
#' @param window_y_max decimal, value for the right limit of the window.
#' @param legend boolean, indicates if the legend should be plotted.
# #' @param dymamic_plot boolean, does the final plot should be dynamic (plotly
# #' plot) or static (ggplot2 plot).
#' @param ... any other parameter compatible with the
#' \code{\link[igraph]{plot.igraph}} function.
#'
#' @details Take care of you intend to compare modules' graphs, the same size
#' of node will not correspond to the same values because of the scaling.
#'
#' @import igraph
#' @importFrom magrittr %>%
#' @importFrom graphics legend symbols
#' @importFrom utils lsf.str
#'
#' @return matrix, layout of the graph as a two column matrix (x, y)
#'
#' @examples
#' mat <- matrix(runif(40*40), 40)
#' g <- build_graph_from_sq_mat(mat)
#' plot_module(g, lower_weight_th = -0.5, upper_weight_th = 0.5)
#'
#' @export

plot_module <- function(graph_module, hubs = NULL, groups = NULL,
                        lower_weight_th = NULL, upper_weight_th = NULL,
                        # enrichment = NULL,
                        # dynamic_plot = TRUE,
                        title = "Module", degree_node_scaling = TRUE,
                        node_scaling_min = 1, edge_scaling_min = 0.2,
                        node_scaling_max = 6, edge_scaling_max = 1,
                        nb_row_legend = 6, layout = "auto", zoom = 1,
                        vertex.label.cex = 0.7,
                        vertex.label.color = "gray20",
                        vertex.label.family = "Helvetica",
                        edge.color = "gray70", vertex.frame.color = "white",
                        vertex.color = "gray60", vertex.label.dist = 1, 
                        legend_cex = 0.8, groups_palette = NULL,
                        window_x_min = -1, window_x_max = 1,
                        window_y_min = -1, window_y_max = 1,
                        legend = TRUE, ...) {
  # TODO: add a layer arg that could take a list where each named element could
  # be list of genes of interest (like one displaying gene known in aging and
  # another for inflammation)

  # Checking args
  if (!igraph::is.igraph(graph_module))
    stop("graph_module must be an igraph object")
  named_num_vec <- char_vec <- FALSE
  if (!is.null(hubs)) {
    # Named vector with numeric values
    if (is.vector(hubs, "numeric") & !is.null(names(hubs)))
      named_num_vec <- TRUE
    # Vector of characters
    if (is.vector(hubs, "character")) char_vec <- TRUE
    if (!(named_num_vec & char_vec))
      stop("hubs must be a named vector of numeric values, or a vector of",
      " characters")
  }
  if (!is.null(groups)) {
    if (!is.data.frame(groups) & !is.matrix(groups))
      stop("groups must be a matrix or a data.frame")
    if (length(groups) != 2)
      stop("groups must be a two columns matrix with genes id and group",
           " assignation")
  }
  if (!is.null(groups_palette)){
    if (is.null(groups)) 
      stop("groups need to be provided to use groups_palette")
  }
  lapply(c(lower_weight_th, upper_weight_th), function(weight_th){
    if (!is.null(weight_th)) {
      if (length(weight_th) > 1)
        stop("Weight threshold must be a single numeric value")
      if (!is.numeric(weight_th))
        stop("Weight threshold must be a numeric value")
      # if (weight_th <= -1 | weight_th >= 1)
      #   stop("Weight thresholds must be a in ]-1;1[")
    }
  })
  # TODO: Change when implemented
  # if (!is.null(enrichment)) .check_gost(enrichment)
  if (!is.character(layout) & !is.matrix(layout) & !is.function(layout)) {
    stop("layout must be a layout function, its name as a string, or a matrix",
    " giving position of each node ") }
  if (!is.numeric(zoom))
    stop("zoom must be a numeric value superior to 0.")
  if (!is.logical(degree_node_scaling))
    stop("degree_node_scaling must be a boolean value")
  if (isTRUE(degree_node_scaling) & exists("vertex.size"))
    stop("If degree_node_scaling is TRUE, you cannot specify a vertex.size")
  if (!is.numeric(c(node_scaling_max, edge_scaling_max,
                    node_scaling_min, edge_scaling_min)))
      stop("All scaling values must be a numeric value")
  if (any(node_scaling_max < node_scaling_min,
          edge_scaling_max < edge_scaling_min))
      stop("Scalling values must have a min inferior to the corresponding max")
  if (!is.null(edge_scaling_max) & exists("edge.width"))
    stop("If edge_scaling_max is not NULL, you cannot specify a edge.width")

  # Keeping only gene names if hubs is a named numeric vector
  if (named_num_vec) hubs <- names(hubs)
  if (!(all(hubs %in% igraph::V(graph_module)$names)))
    stop("Not all hubs are in graph_module")

  # Removing edges whose weight is below or above thresholds given
  if (!is.null(lower_weight_th) & !is.null(upper_weight_th)) {
    graph_to_plot <- graph_module %>%
      igraph::delete.edges(which(igraph::E(.)$weight > lower_weight_th &
                                   igraph::E(.)$weight < upper_weight_th))
  } else if (!is.null(lower_weight_th)) {
    graph_to_plot <- graph_module %>%
      igraph::delete.edges(which(igraph::E(.)$weight > lower_weight_th))
  } else if (!is.null(upper_weight_th)) {
    graph_to_plot <- graph_module %>%
      igraph::delete.edges(which(igraph::E(.)$weight < upper_weight_th))
  } else graph_to_plot <- graph_module

  # Détermining layout
  if (is.character(layout)) {
    if (layout != "auto") {
      # Checking if layout function name exists
      igraph_layouts <- grep("^layout_\\w[\\w|_]*",
                             utils::lsf.str("package:igraph"),
                             value = TRUE)
      if (!any(layout %in% igraph_layouts))
        stop("layout name provided not found in igraph layout functions")
      l <- igraph::layout_(graph_to_plot, get(layout))
    } else {
      l <- igraph::layout_nicely(graph_to_plot)}
  } else if (is.function(layout)) {
    l <- layout(graph_to_plot)
  } else if (is.matrix(layout)){
    l <- layout
  } else stop("Should never be triggered.")

  # Normalizing boundaries to ensure right plotting
  if (zoom != 1) {
      l <- igraph::norm_coords(l, xmin = window_x_min, xmax = window_x_max,
                          ymin = window_y_min, ymax = window_y_max) }

  # Node scaling on degree or provided value
  if (degree_node_scaling) {
    deg <- igraph::degree(graph_to_plot)
    # Checking deg is the same for all (meaning graph is fully connected)
    delta_deg <- max(deg) - min(deg)
    if (delta_deg == 0) {
      warning("max and min degree of the nodes are equal. ",
      "Consider changing the weight thresholds value.")
      delta_deg <- 1
    }
    vertex_size <- lapply(deg, function(x) {
      (x - min(deg)) / delta_deg *
      (node_scaling_max - node_scaling_min) +
      node_scaling_min }) %>% unlist
  } else {
    if (exists("vertex.size")) {
      vertex_size <- vertex.size
    } else {
      vertex_size <- node_scaling_max
    }
  }

  # Edge scaling if no edge width provided
  if (exists("edge.width")) {
    edge_width <- edge.width
  } else {
    edge <- igraph::E(graph_to_plot)$weight
    edge_width <- lapply(edge, function(x) {
      (x - min(edge)) / (max(edge) - min(edge)) *
      (edge_scaling_max - edge_scaling_min) +
      edge_scaling_min}) %>% unlist
  }
  
  
  # FIXME: hotfix to manage no label
  if (vertex.label.cex == 0) vertex_label = NA else vertex_label = NULL

  # Plotting static or dynamic graph
  # TODO: implementer
  # if (dynamic_plot) {
  # 
  # 
  # } else {
    # Vertex color if groups provided
    if (!is.null(groups)) {
      # No palette specified
      if (is.null(groups_palette)) {
        graph_to_plot$palette <- RColorBrewer::brewer.pal(
          length(unique(groups[,2])), "Paired")
      # Palette name specified
      } else if (is.character(groups_palette) & length(groups_palette) == 1) {
        graph_to_plot$palette <- RColorBrewer::brewer.pal(
          length(unique(groups[,2])), groups_palette)
      # Palette of custom colors specified
      } else {
        if (length(unique(groups[,2])) > length(groups_palette)) {
          warning("Number of groups superior to number of colors in the",
                  "palette. Using ggplot2 like colors instead.")
          graph_to_plot$palette <- gg_palette(length(unique(groups[,2])))
        } else {
          graph_to_plot$palette <- groups_palette
        }
      }
      graph_to_plot <- igraph::set_vertex_attr(graph_to_plot, "color", 
                                               groups[,1], groups[,2])
      vertex.color <- NULL
    }
    
    igraph::plot.igraph(graph_to_plot,
                        vertex.label.color = vertex.label.color,
                        vertex.label.family = vertex.label.family,
                        vertex.label.cex = vertex.label.cex,
                        vertex.label.dist = vertex.label.dist,
                        edge.color = edge.color,
                        vertex.label = vertex_label,
                        vertex.frame.color = vertex.frame.color,
                        vertex.color = vertex.color,
                        vertex.size = vertex_size,
                        edge.width = edge_width,
                        layout = l * zoom,
                        rescale = ifelse(zoom != 1, FALSE, TRUE),
                        main = title,
                        ...)


    # Adding a legend
    if (legend) {
      # Legend nodes size
      scale_vertex_size <- seq(node_scaling_min, node_scaling_max,
                               length.out = nb_row_legend)
      if (degree_node_scaling) {
        label_legend_node <- seq(
          min(deg), max(deg), length.out = nb_row_legend) %>%
          round %>%
          as.character
      } else {label_legend_node <- as.character(scale_vertex_size)}

      legend_nodes_size <- graphics::legend(
        "bottomright", label_legend_node,
        pt.cex = scale_vertex_size/200, col = 'white', 
        pch = 21, title = "Degree", bty = "n", 
        y.intersp = 0.7, cex = legend_cex)
      x_nodes_size <- (legend_nodes_size$text$x + legend_nodes_size$rect$left)/2
      y_nodes_size <- legend_nodes_size$text$y
      graphics::symbols(x_nodes_size, y_nodes_size, 
                        circles = scale_vertex_size/200, inches = FALSE,
                        add = TRUE, bg = 'gray', fg = 'white')
      
      # Legend nodes colors if groups provided
      if (!is.null(groups)) {
        graphics::legend("topleft", as.character(unique(groups[ ,2])),
                         pt.bg = graph_to_plot$palette, title = names(groups)[2], 
                         pt.cex = 2, pch = 21, col = "white",
                         bty = "n", cex = legend_cex)
      }

      # Legend vertex
      scale_edge_width <- seq(edge_scaling_min, edge_scaling_max,
                              length.out = nb_row_legend) %>% signif
      label_legend_edge <- seq(
        min(edge), max(edge), length.out = nb_row_legend) %>%
        signif %>%
        as.character
      graphics::legend("topright", label_legend_edge, col='gray', title = "Weight",
                       lwd = scale_edge_width, bty = "n", cex = legend_cex)
    }
  # }
  return(l)
}


# Removing errors about dplyr data-variables
utils::globalVariables(c("avg_sil_width", "k", "sub_module"))

#' Detect sub clusters
#'
#' Use a partitioning around medoid (PAM, or k-medoid) clustering method to
#' detect clusters into a provided module using the strength matrix of the
#' network
#'
#' @param network matrix or data.frame, strength of gene co-expression (edge
#' values).
#' @param seq_k vector, sequence of k number of cluster to test
#' @param fit_plot boolean, does the plot with silhouette coefficient depending
#' on the k tested should be plotted.
#' @param ... any other parameter compatible with the
#' \code{\link[cluster]{pam}} function.
#'
#' @importFrom magrittr %>%
#' @importFrom cluster pam
#' @importFrom tibble rownames_to_column
#' @importFrom dplyr filter select
#' @importFrom graphics plot
#'
#' @return data.frame, a two cols table with the gene id in the first one, and
#' the cluster number assignation in the second one.
#'
#' @examples
#' df <- kuehne_expr[1:24, 1:350]
#' net <- build_net(df, n_threads = 1)
#' mods <- detect_modules(df, net$network)
#' net_mod_1 <- net$network[mods$modules$`1`, mods$modules$`1`]
#' get_sub_clusters(net_mod_1)
#'
#' @export

get_sub_clusters <- function(network, seq_k = seq_len(15), fit_plot = TRUE,
                             ...) {
  # Checking args
  is_network(network)
  if(!is.numeric(seq_k))
    stop("seq_k must be a numeric vector")
  if(length(seq_k) < 2)
    stop("seq_k must at leat have two power values to test")
  if (any(lapply(seq_k, function(x) x < 1 | x %% 1 != 0) %>% unlist))
    stop("seq_k must contain only whole numbers >= 1")
  if(!is.logical(fit_plot))
    stop("fit_plot must be a boolean")

  # Computing the k-medoid
  list_k_tests <- lapply(seq_k, function(k) {
    k_res <- cluster::pam(network, k, diss = TRUE, ...)
    return(k_res)
  }) %>% stats::setNames(paste0("k_", seq_k))

  # Summarizing needed results for plot and returning optimal k
  df_k_tests <- list_k_tests %>%
    lapply(`[[`, "silinfo") %>%
    lapply(`[[`, "avg.width") %>%
    unlist %>%
    data.frame(k = seq_k[-1], avg_sil_width = .)

  # If asked, plotting the silhouette coefficient against k tested
  if (fit_plot) {
    graphics::plot(df_k_tests, type="b", pch = 19,
         frame = FALSE, xlab="Number of clusters K",
         ylab="Average silhouette width")
  }

  # Isolating optimal k regarding the silhouette coefficient
  opti_k <- df_k_tests %>%
    dplyr::filter(avg_sil_width == max(avg_sil_width)) %>%
    dplyr::select(k) %>% as.numeric()

  # Formatting the table to return the gene id association to sub_module
  clusters <- list_k_tests[[opti_k]]$clustering %>%
    data.frame(sub_module = .) %>%
    tibble::rownames_to_column("gene") %>%
    dplyr::mutate(sub_module = as.character(sub_module))

  return(clusters)
}
Kumquatum/GWENA documentation built on July 7, 2023, 3:41 p.m.