# Start from an empty workspace and fix the RNG seed so that re-running the
# notebook gives the same results. `seed` is also passed to community_oslom()
# further down.
rm(list = ls())
seed <- 1
set.seed(seed)

# Attach dependencies with library() rather than require(): library() stops
# with an error as soon as a package is missing, whereas require() only warns
# and returns FALSE, postponing the failure to the first call of a missing
# function.
library(kaiaulu)
library(visNetwork)
library(reactable)
library(data.table)
library(igraph)
library(yaml)
library(stringi)
library(knitr)
library(RColorBrewer)

Introduction

This notebook showcases the use of community detection using OSLOM [1], which requires a separate installation to obtain the binary (see Kaiaulu README.md for details). OSLOM differs from other community detection algorithms in that its algorithm can account for directed and weighted graphs. In addition, OSLOM will also identify nodes that belong to multiple communities and automatically verify how statistically significant the identified communities are.

[1] Finding statistically significant communities in networks. A. Lancichinetti, F. Radicchi, J.J. Ramasco and S. Fortunato PLoS ONE 6, e18961 (2011).

To facilitate the usage of the OSLOM algorithm, Kaiaulu currently provides two functions: community_oslom and recolor_network_by_community. The former requires the path to one of OSLOM's two binaries (directed or undirected) and parses the OSLOM output into R memory as a list of tables that closely mirrors the raw output. The latter uses that list of tables to re-color a network by assigned community, for up to 12 different communities (the limit applies only to the available colors; the OSLOM output will still contain as many communities as were detected). Nodes that are assigned to more than one community are colored black.

recolor_network_by_community leverages the existing visualization library in Kaiaulu, which means communities can be explored interactively. This is of particular interest to some of the social smells, where community detection is a required step, and therefore a poor community detection would lead to erroneous results.

Although the algorithm is commonly referred to as community detection, it can be applied to any graph. Therefore, any network obtained from the family of transform_*_to_*network() functions can be used as input for community detection. For example, applying the algorithm to a network obtained from transform_reply_to_bipartite_network() would identify communities of developers based on their e-mail exchanges, whereas applying it to a network obtained from transform_dependencies_to_network() would identify groups of files that share several static dependencies with one another (e.g. function calls, class inheritance, etc.). Applying community detection to a co-change network could identify files co-changed across multiple commits, and so on.

The remainder of this notebook illustrates these two functions on some of Kaiaulu's networks for the APR project.

Project Configuration File

As usual, the first step is to load the project configuration file.

# Load the two configuration files: one for third-party tools, one for the
# project under analysis.
tool <- parse_config("../tools.yml")
conf <- parse_config("../conf/apr.yml")

# Paths to third-party tools, looked up by key in the tools configuration.
perceval_path    <- get_tool_project("perceval", tool)
utags_path       <- get_tool_project("utags", tool)
oslom_dir_path   <- get_tool_project("oslom_dir", tool)
oslom_undir_path <- get_tool_project("oslom_undir", tool)

# Project-specific settings read from the project configuration:
# local Git repository folder, file filters, and Ctags line types.
git_repo_path      <- get_git_repo_path(conf)
file_extensions    <- get_file_extensions(conf)
substring_filepath <- get_substring_filepath(conf)
kinds              <- get_uctags_line_types(conf)

Parse Git Log

# Parse the Git log of the local repository (via the Perceval path), then
# filter it by the configured file extensions and file path substrings.
project_git <- parse_gitlog(perceval_path, git_repo_path)
project_git <- project_git %>%
  filter_by_file_extension(file_extensions, "file_pathname") %>%
  filter_by_filepath_substring(substring_filepath, "file_pathname")

# Convert both timestamp columns from text to POSIXct in UTC.
# Each column must be parsed from itself: previously the parsed committer
# timestamp was assigned to author_datetimetz, which overwrote the author
# time and left committer_datetimetz unconverted.
project_git$author_datetimetz <- as.POSIXct(project_git$author_datetimetz,
                                            format = "%a %b %d %H:%M:%S %Y %z",
                                            tz = "UTC")
project_git$committer_datetimetz <- as.POSIXct(project_git$committer_datetimetz,
                                               format = "%a %b %d %H:%M:%S %Y %z",
                                               tz = "UTC")

#project_git_slice <- project_git[author_datetimetz >= as.POSIXct("2015-01-01", format = "%Y-%m-%d",tz = "UTC") & author_datetimetz < as.POSIXct("2015-12-31", format = "%Y-%m-%d",tz = "UTC")]

Identity Matching

# Run identity matching over the Git log. identity_match() takes a named list
# of tables, so the Git log is wrapped in a list and extracted again afterwards.
project_log <- identity_match(
  list(project_git = project_git),
  name_column = c("author_name_email"),
  assign_exact_identity,
  use_name_only = TRUE,
  label = "raw_name"
)
project_git <- project_log[["project_git"]]

Create Temporal Network

# Build the temporal collaboration network from the Git log, in author mode.
project_collaboration_network <- transform_gitlog_to_temporal_network(
  project_git,
  mode = "author"
)

Plot Network

# Convert the network's edgelist and node table into a directed igraph object
# and render it interactively; the layout seed is fixed so the plot is stable.
i_project_temporal_collaboration_network <- igraph::graph_from_data_frame(
  d = project_collaboration_network[["edgelist"]],
  directed = TRUE,
  vertices = project_collaboration_network[["nodes"]]
)
visIgraph(i_project_temporal_collaboration_network, randomSeed = 1)

Community Detection

Use oslom_dir_path or oslom_undir_path as preferred for directed or undirected community detection.

# Run OSLOM (directed binary) on the weighted collaboration network and
# display the parsed result.
community <- community_oslom(
  oslom_dir_path,
  project_collaboration_network,
  seed = seed,
  n_runs = 100,
  is_weighted = TRUE
)
community

# Re-color the network by the detected communities.
project_collaboration_network <- recolor_network_by_community(
  project_collaboration_network,
  community
)

# Rebuild the directed igraph object from the re-colored network and plot it.
gcid <- igraph::graph_from_data_frame(
  d = project_collaboration_network[["edgelist"]],
  directed = TRUE,
  vertices = project_collaboration_network[["nodes"]]
)

visIgraph(gcid, randomSeed = 1)

Dependency Modules

# Depends parameters
depends_jar_path <- get_tool_project("depends", tool)
language <- get_depends_code_language(conf)
keep_dependencies_type <- get_depends_keep_dependencies_type(conf)
project_dependencies <- parse_dependencies(depends_jar_path,git_repo_path,language=language)
project_dependencies[["nodes"]] <- project_dependencies[["nodes"]]  %>%
  filter_by_file_extension(file_extensions,"filepath")  %>% 
  filter_by_filepath_substring(substring_filepath,"filepath")

project_dependencies[["edgelist"]] <- project_dependencies[["edgelist"]]  %>%
  filter_by_file_extension(file_extensions,"src_filepath")  %>% 
  filter_by_file_extension(file_extensions,"dest_filepath")  %>% 
  filter_by_filepath_substring(substring_filepath,"src_filepath") %>%
  filter_by_filepath_substring(substring_filepath,"dest_filepath")
project_file_network <- transform_dependencies_to_network(project_dependencies,
                                                   weight_types = keep_dependencies_type)
module <- community_oslom(oslom_undir_path,
              project_file_network,
              seed=seed,
              n_runs = 100,
              is_weighted = TRUE)
module
project_file_network <- recolor_network_by_community(project_file_network,module)

gcid <- igraph::graph_from_data_frame(d=project_file_network[["edgelist"]], 
                      directed = FALSE,
                      vertices = project_file_network[["nodes"]])

visIgraph(gcid,randomSeed = 1)

Co-Change Community Detection

# Restrict the Git log to rows whose author timestamp falls in
# [2015-01-01, 2019-12-31) UTC.
project_git_slice <- project_git[
  author_datetimetz >= as.POSIXct("2015-01-01", format = "%Y-%m-%d", tz = "UTC") &
    author_datetimetz < as.POSIXct("2019-12-31", format = "%Y-%m-%d", tz = "UTC")
]

# Build the commit-file bipartite network for the slice, then project it into
# a co-change network, using the sum-of-edges weighting scheme.
project_commit_network <- transform_gitlog_to_bipartite_network(
  project_git_slice,
  mode = "commit-file"
)
co_change_network <- bipartite_graph_projection(
  project_commit_network,
  mode = FALSE,
  weight_scheme_function = weight_scheme_sum_edges
)

# Run OSLOM (undirected binary) on the weighted co-change network and display
# the parsed result.
co_change_module <- community_oslom(
  oslom_undir_path,
  co_change_network,
  seed = seed,
  n_runs = 5,
  is_weighted = TRUE
)
co_change_module

# Re-color the co-change network by the detected modules.
co_change_network <- recolor_network_by_community(co_change_network,
                                                  co_change_module)

# Build an undirected igraph object from the re-colored network and plot it.
gcid <- igraph::graph_from_data_frame(
  d = co_change_network[["edgelist"]],
  directed = FALSE,
  vertices = co_change_network[["nodes"]]
)
visIgraph(gcid, randomSeed = 1)


sailuh/kaiaulu documentation built on Dec. 10, 2024, 3:14 a.m.