rm(list = ls())
seed <- 1
set.seed(seed)
require(kaiaulu)
require(visNetwork)
require(data.table)
require(stringi)
require(igraph)
require(gh)
require(yaml)
require(magrittr)
require(knitr)
require(gt)
This notebook computes three sets of metrics: architectural flaw metrics (via DV8), social smell anti-motif metrics, and outcome metrics (line metrics, churn, and bug counts).
For implementation details, please refer to the function reference documentation. If you are reading this in the package documentation, you can simply click a function name to access its documentation.
The first step is loading the project configuration file, which contains both the project's data provenance information and the various parameters used by all the tools. Refer to Kaiaulu's repo conf folder for the configuration file used in the code block below.
tool <- parse_config("../tools.yml")
#conf <- parse_config("../conf/tse_cassandra.yml")
conf <- parse_config("../conf/camel.yml")
perceval_path <- get_tool_project("perceval", tool)
dv8_path <- get_tool_project("dv8", tool)
scc_path <- get_tool_project("scc", tool)

# Gitlog parameters
git_repo_path <- get_git_repo_path(conf)
git_branch <- get_git_branches(conf)[1] # camel 1.6.0

# Depends parameters
depends_jar_path <- get_tool_project("depends", tool)
language <- get_depends_code_language(conf)
keep_dependencies_type <- get_depends_keep_dependencies_type(conf)

# Mailing List
# Specify project_key_index in get_mbox_path() (e.g. "project_key_1")
mbox_path <- get_mbox_path(conf, "project_key_1")

# DV8 parameters
project_path <- get_dv8_folder_path(conf)
project_name <- stringi::stri_split_regex(project_path, pattern = "/")[[1]]
project_name <- project_name[length(project_name)]
flaws_params <- get_dv8_flaws_params(conf)

# Filters
file_extensions <- get_file_extensions(conf)
substring_filepath <- get_substring_filepath(conf)
filter_commit_size <- get_filter_commit_size(conf)

# Issue ID Regex on Commit Messages
issue_id_regex <- get_issue_id_regex(conf)

# Path to Jira Issues (obtained using the `download_jira_data` Notebook)
# Specify project_key_index in get_jira_issues_path() (e.g. "project_key_1")
jira_issues_path <- get_jira_issues_path(conf, "project_key_1")
# Specify project_key_index in get_jira_issues_comments_path() (e.g. "project_key_1")
jira_issue_comments_path <- get_jira_issues_comments_path(conf, "project_key_1")
In order to compute architectural flaws, our first step is to parse the git log and the file dependencies for DV8. These will be used to construct historical and structural dependencies in DV8.
For the git log, we first parse it into Kaiaulu as a table, and filter it based on the project configuration file's criteria. Here's a sample:
#git_repo_path <- "~/Downloads/testing-cochange/.git"
git_checkout(git_branch, git_repo_path)
project_git <- parse_gitlog(perceval_path, git_repo_path)
project_git <- project_git %>%
  filter_by_file_extension(file_extensions, "file_pathname") %>%
  filter_by_filepath_substring(substring_filepath, "file_pathname") %>%
  filter_by_commit_size(commit_size = filter_commit_size)
project_git$author_datetimetz <- as.POSIXct(project_git$author_datetimetz,
                                            format = "%a %b %d %H:%M:%S %Y %z", tz = "UTC")
project_git$committer_datetimetz <- as.POSIXct(project_git$committer_datetimetz,
                                               format = "%a %b %d %H:%M:%S %Y %z", tz = "UTC")
kable(head(project_git))
Checking the git log's first and last timestamps is also advised, as git logs on GitHub are sometimes mirrors that have not been updated for years (e.g. Geronimo's), or your local copy may not be up to date.
Earliest Date:

project_git[order(author_datetimetz)]$author_datetimetz[1]

Latest Date:

project_git[order(-author_datetimetz)]$author_datetimetz[1]
We then transform it into an hdsm JSON (i.e. hdsmj):
hdsmj_path <- transform_gitlog_to_hdsmj(project_git, hdsmj_path = file.path(project_path,paste0(project_name,"-hdsm.json")))
Next, we parse file dependencies using Depends, filter the files by the project configuration file's criteria, and proceed to convert the result into a DV8 Structural Dependency Matrix binary.
project_dependencies <- parse_dependencies(depends_jar_path, git_repo_path, language = language)

project_dependencies[["nodes"]] <- project_dependencies[["nodes"]] %>%
  filter_by_file_extension(file_extensions, "filepath") %>%
  filter_by_filepath_substring(substring_filepath, "filepath")

project_dependencies[["edgelist"]] <- project_dependencies[["edgelist"]] %>%
  filter_by_file_extension(file_extensions, "src_filepath") %>%
  filter_by_file_extension(file_extensions, "dest_filepath") %>%
  filter_by_filepath_substring(substring_filepath, "src_filepath") %>%
  filter_by_filepath_substring(substring_filepath, "dest_filepath")

sdsmj_path <- transform_dependencies_to_sdsmj(project_dependencies,
                                              sdsmj_path = file.path(project_path, paste0(project_name, "-sdsm.json")))

list.files(project_path)
We are now ready to convert the json files into binary format, so they can be merged:
hdsmb_path <- dv8_dsmj_to_dsmb(dv8_path = dv8_path,
                               dsmj_path = hdsmj_path,
                               dsmb_path = file.path(project_path, paste0(project_name, "-hdsm.dv8-dsm")))
sdsmb_path <- dv8_dsmj_to_dsmb(dv8_path = dv8_path,
                               dsmj_path = sdsmj_path,
                               dsmb_path = file.path(project_path, paste0(project_name, "-sdsm.dv8-dsm")))
list.files(project_path)
Our next step in the pipeline is to combine both binary DSMs, i.e. *-sdsm.dv8-dsm and *-hdsm.dv8-dsm, into a merged DSM, *-merge.dv8-dsm.
mdsmb_path <- dv8_hdsmb_sdsmb_to_mdsmb(dv8_path = dv8_path,
                                       hdsmb_path = hdsmb_path,
                                       sdsmb_path = sdsmb_path,
                                       mdsmb_path = file.path(project_path, paste0(project_name, "-merge.dv8-dsm")))
list.files(project_path)
We can load the -hdsm, -sdsm, or -mdsm files in the DV8 GUI to inspect their matrices, or export their .xlsx counterparts via dv8-console. Let's inspect the -mdsm. First, we will perform clustering over the files. The clusters will later be displayed in the DSM as black rectangles. This is an optional step:
hierclsxb_path <- dv8_mdsmb_to_hierclsxb(dv8_path = dv8_path,
                                         mdsmb_path = mdsmb_path,
                                         hierclsxb_path = file.path(project_path, paste0(project_name, "-clsx.dv8-clsx")))
list.files(project_path)
Now that we have a merged DSM, we can perform various analyses in DV8.
DV8 computes a variety of metrics. Let's first observe Architectural Flaws.
# Format Architectural Flaw Parameters to DV8 Command
flaws_params$cliqueDepends <- stringi::stri_c(flaws_params$cliqueDepends, collapse = ",")
flaws_params$uihDepends <- stringi::stri_c(flaws_params$uihDepends, collapse = ",")
flaws_params$uihInheritance <- stringi::stri_c(flaws_params$uihInheritance, collapse = ",")
flaws_folder <- dv8_mdsmb_to_flaws(dv8_path = dv8_path,
                                   mdsmb_path = mdsmb_path,
                                   flaws_path = file.path(project_path, paste0(project_name, "_flaws")),
                                   is_file_only_metric = FALSE,
                                   cliqueDepends = flaws_params$cliqueDepends,
                                   crossingCochange = flaws_params$crossingCochange,
                                   crossingFanIn = flaws_params$crossingFanIn,
                                   crossingFanOut = flaws_params$crossingFanOut,
                                   mvCochange = flaws_params$mvCochange,
                                   uiCochange = flaws_params$uiCochange,
                                   uihDepends = flaws_params$uihDepends,
                                   uihInheritance = flaws_params$uihInheritance,
                                   uiHistoryImpact = flaws_params$uiHistoryImpact,
                                   uiStructImpact = flaws_params$uiStructImpact)
list.files(project_path)
We can see a folder of architectural flaws was generated. The folder organization provides us with a means to understand which files are assigned to the various types of architectural flaws. The following is a general example from Geronimo (not computed here, but the structure carries over to other larger projects):
geronimo_flaws/
├── modularity-violation
└── package-cycle
This shows two types of architectural flaws were found in Geronimo: modularity-violation and package-cycle. Moreover, inside each of these folders we see numbered folders:
geronimo_flaws/
├── modularity-violation
│   ├── 1
│   │   ├── 1-clsx.dv8-clsx
│   │   ├── 1-hdsm.dv8-dsm
│   │   ├── 1-merge.dv8-dsm
│   │   ├── 1-sdsm.dv8-dsm
│   │   └── 1.dv8-issue
│   ├── 10
│   │   ├── 10-clsx.dv8-clsx
│   │   ├── 10-hdsm.dv8-dsm
│   │   ├── 10-merge.dv8-dsm
│   │   ├── 10-sdsm.dv8-dsm
│   │   └── 10.dv8-issue
Each numbered folder represents an architectural flaw ID. For example, above we have the modularity violation flaws with ID 1 and ID 10. We can further see the sdsm, hdsm, merged dsm, and cluster files occur within both folder IDs. If we load any of these files in the DV8 GUI, or export them to Excel (as done in the prior section), we can see which files participate in these architectural flaws and the clusters they are assigned to. We will now see how to represent this information as a table we can parse in R (more specifically, the file-to-architectural-flaw assignment), so it can be combined with other analyses in Kaiaulu.
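As a minimal sketch (assuming only the folder layout shown above), the flaw IDs per flaw type can be enumerated directly with base R, before any parsing:

# Minimal sketch: list flaw IDs per flaw type from the folder layout above.
# Assumes `flaws_folder` as computed earlier in this notebook.
flaw_types <- list.dirs(flaws_folder, recursive = FALSE)
for (flaw_type in flaw_types) {
  flaw_ids <- basename(list.dirs(flaw_type, recursive = FALSE))
  cat(basename(flaw_type), ":", paste(flaw_ids, collapse = ", "), "\n")
}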
file_to_flaws_map <- parse_dv8_architectural_flaws(dv8_path, flaws_folder, progress_bar = TRUE)
fwrite(file_to_flaws_map, file.path(project_path, paste0(project_name, "-flaws_map.csv")))
kable(head(file_to_flaws_map))
file_to_flaws_map <- fread(file.path(project_path,paste0(project_name,"-flaws_map.csv")))
With the flaws mapping, we can then group the table by file path and type of flaw to obtain the primary granularity of this analysis: flaw metrics per file.
file_flaws <- file_to_flaws_map[, .(n_flaws = length(architecture_issue_id)),
                                by = c("file_path", "architecture_issue_type")]
file_flaws <- dcast(file_flaws, file_path ~ ..., value.var = "n_flaws")
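To make the reshape concrete, here is a toy illustration (the file paths and counts are hypothetical) of what the dcast above produces: one row per file, one column per flaw type.

# Toy illustration of dcast: long table of per-file, per-flaw-type counts
# becomes one row per file with one column per flaw type.
toy <- data.table(file_path = c("a.java", "a.java", "b.java"),
                  architecture_issue_type = c("modularity-violation", "package-cycle",
                                              "modularity-violation"),
                  n_flaws = c(2L, 1L, 3L))
dcast(toy, file_path ~ architecture_issue_type, value.var = "n_flaws")
# => a.java: 2 modularity-violation, 1 package-cycle; b.java: 3 modularity-violation, NA package-cycle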
The computation of the flaws mapping can be very slow, as it is done outside DV8. If the interest is only in computing file metrics, and not in aggregating them, then we can request them directly from DV8 like so:
flaws_folder <- dv8_mdsmb_to_flaws(dv8_path = dv8_path,
                                   mdsmb_path = mdsmb_path,
                                   flaws_path = file.path(project_path, paste0(project_name, "_flaws")),
                                   is_file_only_metric = TRUE,
                                   cliqueDepends = flaws_params$cliqueDepends,
                                   crossingCochange = flaws_params$crossingCochange,
                                   crossingFanIn = flaws_params$crossingFanIn,
                                   crossingFanOut = flaws_params$crossingFanOut,
                                   mvCochange = flaws_params$mvCochange,
                                   uiCochange = flaws_params$uiCochange,
                                   uihDepends = flaws_params$uihDepends,
                                   uihInheritance = flaws_params$uihInheritance,
                                   uiHistoryImpact = flaws_params$uiHistoryImpact,
                                   uiStructImpact = flaws_params$uiStructImpact)
list.files(project_path)
DV8 always generates its file metrics report inside the flaws folder and names it file-measure-report.csv:
flaws_folder <- file.path(project_path, paste0(project_name, "_flaws"))
file_flaws <- fread(file.path(flaws_folder, "file-measure-report.csv"))
setnames(file_flaws,
         old = c("FileName", "Clique", "Crossing", "ModularityViolation", "PackageCycle",
                 "UnhealthyInheritance", "UnstableInterface"),
         new = c("file_path", "clique", "crossing", "modularity-violation", "package-cycle",
                 "unhealthy-inheritance", "unstable-interface"))
With the flaws computed, we move on to the next set of file metrics in this analysis.
To compute the social smell motifs, we require communication data. For this dataset, we will use mailing list communication (JIRA issue comments are parsed below as an alternative reply source).
project_mbox <- parse_mbox(perceval_path,mbox_path)
project_jira <- parse_jira_replies(parse_jira(jira_issue_comments_path))

# Timezone is embedded in a separate field. All times shown in UTC.
project_jira$reply_tz <- "0000"
project_jira$reply_datetimetz <- as.POSIXct(project_jira$reply_datetimetz,
                                            format = "%Y-%m-%dT%H:%M:%S.000+0000", tz = "UTC")
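As a quick sanity check of the format string (the timestamp below is hypothetical), the literal ".000+0000" suffix is matched as-is:

# Hypothetical timestamp; verifies the format string parses as intended.
as.POSIXct("2009-02-17T15:43:02.000+0000",
           format = "%Y-%m-%dT%H:%M:%S.000+0000", tz = "UTC")
# [1] "2009-02-17 15:43:02 UTC"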
# All replies are combined into a single reply table.
project_reply <- project_mbox
project_git <- project_git[order(author_datetimetz)]
project_reply <- project_reply[order(reply_datetimetz)]
#project_reply <- project_reply[reply_datetimetz >= start_date & reply_datetimetz <= end_date]
# Alternative: use JIRA issue comments as the reply data instead of the mailing list.
project_reply <- project_jira
project_git <- project_git[order(author_datetimetz)]
project_reply <- project_reply[order(reply_datetimetz)]
The Git, Reply, and Source Code (via git_checkout) Snapshot Time Windows must align
start_timestamp <- max(min(project_git$author_datetimetz, na.rm = TRUE),
                       min(project_reply$reply_datetimetz, na.rm = TRUE))
end_timestamp <- min(max(project_git$author_datetimetz, na.rm = TRUE),
                     max(project_reply$reply_datetimetz, na.rm = TRUE))

if (start_timestamp > end_timestamp) {
  stop("Non-overlapping git log and reply datasets")
}

project_git <- project_git[(author_datetimetz >= start_timestamp) &
                           (author_datetimetz <= end_timestamp)]
project_reply <- project_reply[(reply_datetimetz >= start_timestamp) &
                               (reply_datetimetz <= end_timestamp)]
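Optionally, we can inspect how long the resulting overlapping analysis window is:

# Length of the aligned analysis window, in days.
difftime(end_timestamp, start_timestamp, units = "days")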
Because developers may use different e-mails within Git, and between Git and the mailing list, we use a set of heuristics to unify their identities across these different e-mails.
project_log <- list(project_git = project_git, project_reply = project_reply)
project_log <- identity_match(project_log,
                              name_column = c("author_name_email", "reply_from"),
                              assign_exact_identity,
                              # use_name_only = FALSE,
                              label = "raw_name")
project_git <- project_log[["project_git"]]
project_reply <- project_log[["project_reply"]]
Having performed the necessary transformations on our data sources, we are ready to transform them into networks, where our motifs will be computed. Our goal is to create a single graph containing all the information of interest, so that we can search it for sub-graphs of interest (i.e. our defined motifs).
A number of transformation functions are available in Kaiaulu to transform the various logs into networks. First, we transform our git log data into a bipartite author-file network:
git_network <- transform_gitlog_to_bipartite_network(project_git, mode="author-file")
Next we apply the same transformation to obtain our reply network. Note this reply network is also a bipartite graph, of the type developer-thread. Since communication here occurs on a mailing list or issue tracker, an issue is treated as equivalent to an e-mail thread. Because we wish to "add" communication edges between developers to the git log network, we perform a bipartite projection over developer-thread to obtain a developer-developer network. Here, we choose the weight scheme that sums the existing edge weights (i.e. the number of replies to a thread) of the deleted thread node.
reply_network <- transform_reply_to_bipartite_network(project_reply)
reply_network <- bipartite_graph_projection(reply_network,
                                            mode = TRUE,
                                            weight_scheme_function = weight_scheme_sum_edges)
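To build intuition for the projection, here is a toy example (hypothetical developers and thread) using igraph directly: two developers replying to the same thread become directly connected once the thread node is projected away.

# Toy example: bipartite developer-thread graph projected to developer-developer.
g <- igraph::graph_from_data_frame(data.frame(from = c("dev_a", "dev_b"),
                                              to = c("thread_1", "thread_1")),
                                   directed = FALSE)
igraph::V(g)$type <- grepl("^thread", igraph::V(g)$name)  # TRUE marks thread nodes
proj <- igraph::bipartite_projection(g, which = "false")  # keep developer nodes
igraph::as_edgelist(proj)  # dev_a -- dev_b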
We can then add the developer-developer network nodes and edges to the developer-file network:
git_reply_network <- list()
git_reply_network[["nodes"]] <- unique(rbind(git_network[["nodes"]],
                                             reply_network[["nodes"]]))
git_reply_network[["edgelist"]] <- rbind(git_network[["edgelist"]],
                                         reply_network[["edgelist"]])
To perform motif search, we rely on the igraph library. First, we transform the networks to igraph's network representation:
i_git_reply_network <- igraph::graph_from_data_frame(d = git_reply_network[["edgelist"]],
                                                     directed = FALSE,
                                                     vertices = git_reply_network[["nodes"]])
#visIgraph(i_git_reply_network,randomSeed = 1)
We also create our anti-motif triangle sub-graph and display it:
motif_triangle <- motif_factory("anti_triangle")
i_triangle_motif <- igraph::graph_from_data_frame(d = motif_triangle[["edgelist"]],
                                                  directed = FALSE,
                                                  vertices = motif_triangle[["nodes"]])
visIgraph(i_triangle_motif)
Because the motif search expects the "color" node attribute to be numeric, we convert the node color to 1 if black, or 2 otherwise in the igraph network representation:
V(i_triangle_motif)$color <- ifelse(V(i_triangle_motif)$color == "black", 1, 2)
V(i_git_reply_network)$color <- ifelse(V(i_git_reply_network)$color == "black", 1, 2)
We can then count the motifs:
## Count subgraph isomorphisms
motif_count <- igraph::count_subgraph_isomorphisms(i_triangle_motif,
                                                   i_git_reply_network,
                                                   method = "vf2",
                                                   edge.color1 = NULL,
                                                   edge.color2 = NULL)
motif_count
Or obtain the list of every sub-graph match of the triangle motif:
i_motif_vertice_sequence <- subgraph_isomorphisms(i_triangle_motif,
                                                  i_git_reply_network,
                                                  method = "vf2",
                                                  edge.color1 = NULL,
                                                  edge.color2 = NULL)
motif_vertice_sequence <- lapply(i_motif_vertice_sequence, igraph::as_ids)
motif_anti_triangle_dt <- rbindlist(lapply(lapply(motif_vertice_sequence, t), data.table))
#kable(motif_anti_triangle_dt)
The anti-motif triangle metric is then the number of times a file participates in an instance of the triangle anti-motif:
setnames(motif_anti_triangle_dt,
         old = c("V1", "V2", "V3"),
         new = c("dev1", "dev2", "file_pathname"))
motif_anti_triangle_count_dt <- motif_anti_triangle_dt[, .(anti_triangle_motif = .N),
                                                       by = "file_pathname"]
For the square motif, we now also consider the file dependencies. As before, we combine the networks, now also including the file network.
file_network <- copy(project_dependencies)
setnames(file_network[["nodes"]], old = "filepath", new = "name")
file_network[["nodes"]]$type <- FALSE  # mark as file nodes for the bipartite type attribute
file_network[["nodes"]]$color <- "#f4dbb5"

# Collapse the per-dependency-type counts (columns 3 onward) into a single edge weight.
edgelist <- file_network[["edgelist"]][, .(from = src_filepath, to = dest_filepath)]
edgelist$weight <- rowSums(file_network[["edgelist"]][, 3:ncol(file_network[["edgelist"]]), with = FALSE])
file_network[["edgelist"]] <- edgelist
file_git_reply_network <- list()
file_git_reply_network[["nodes"]] <- unique(rbind(git_network[["nodes"]],
                                                  reply_network[["nodes"]],
                                                  file_network[["nodes"]]))
file_git_reply_network[["edgelist"]] <- rbind(git_network[["edgelist"]],
                                              reply_network[["edgelist"]],
                                              file_network[["edgelist"]])
We then use igraph for visualizing and computing the motif. First, the network:
i_file_git_reply_network <- igraph::graph_from_data_frame(d = file_git_reply_network[["edgelist"]],
                                                          directed = FALSE,
                                                          vertices = file_git_reply_network[["nodes"]])
#visIgraph(i_file_git_reply_network,randomSeed = 1)
And then the square motif:
motif_square <- motif_factory("anti_square")
i_square_motif <- igraph::graph_from_data_frame(d = motif_square[["edgelist"]],
                                                directed = FALSE,
                                                vertices = motif_square[["nodes"]])
visIgraph(i_square_motif)
Once more, we transform the color of the nodes to numeric to perform the motif search.
V(i_square_motif)$color <- ifelse(V(i_square_motif)$color == "black", 1, 2)
V(i_file_git_reply_network)$color <- ifelse(V(i_file_git_reply_network)$color == "black", 1, 2)
We can then count the motif occurrences:
## Count subgraph isomorphisms
motif_count <- count_subgraph_isomorphisms(i_square_motif,
                                           i_file_git_reply_network,
                                           method = "vf2",
                                           edge.color1 = NULL,
                                           edge.color2 = NULL)
motif_count
Or enumerate where it occurred:
i_motif_vertice_sequence <- subgraph_isomorphisms(i_square_motif,
                                                  i_file_git_reply_network,
                                                  method = "vf2",
                                                  edge.color1 = NULL,
                                                  edge.color2 = NULL)
motif_vertice_sequence <- lapply(i_motif_vertice_sequence, igraph::as_ids)
motif_anti_square_dt <- rbindlist(lapply(lapply(motif_vertice_sequence, t), data.table))
#kable(motif_anti_square_dt)

setnames(motif_anti_square_dt,
         old = c("V1", "V2", "V3", "V4"),
         new = c("dev1", "dev2", "file_pathname1", "file_pathname2"))

motif_anti_square_count_dt_1 <- motif_anti_square_dt[, .(anti_motif_square = .N), by = "file_pathname1"]
setnames(motif_anti_square_count_dt_1, old = "file_pathname1", new = "file_pathname")

motif_anti_square_count_dt_2 <- motif_anti_square_dt[, .(anti_motif_square = .N), by = "file_pathname2"]
setnames(motif_anti_square_count_dt_2, old = "file_pathname2", new = "file_pathname")

motif_anti_square_count_dt <- rbind(motif_anti_square_count_dt_1,
                                    motif_anti_square_count_dt_2)
motif_anti_square_count_dt <- motif_anti_square_count_dt[, .(anti_motif_square = sum(anti_motif_square)),
                                                         by = file_pathname]
We also compute line metrics per file (lines, code, comments, blanks, and cyclomatic complexity) using scc over the checked-out snapshot:

line_metrics_dt <- parse_line_metrics(scc_path, git_repo_path)
line_metrics_dt <- line_metrics_dt[, .(file_pathname = Location,
                                       lines = Lines,
                                       code = Code,
                                       comments = Comments,
                                       blanks = Blanks,
                                       complexity = Complexity)]
The various outcome metrics used in this notebook rely on the traceability between files and issues. This is obtained from the commit messages. We can use a built-in Kaiaulu function to search for a regular expression (regex) matching the issue id. First, we use the regex to calculate how many commits contain issue ids. Ideally, you should consider projects with high enough coverage, or the results may not be representative.
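For illustration (the commit message and regex below are hypothetical; the actual regex comes from the project configuration file), the extraction works like this:

# Hypothetical example: extract a JIRA-style issue id from a commit message.
stringi::stri_match_first_regex("CAMEL-1512: Fix route builder", "CAMEL-[0-9]+")
# [,1] "CAMEL-1512"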
The total number of commits with issue ids in the chosen git slice is:
commit_message_id_coverage(project_git,issue_id_regex)
Proportion of commit messages containing issue ids relative to all commits in the slice:
normalized_coverage <- commit_message_id_coverage(project_git, issue_id_regex) /
  length(unique(project_git$commit_hash))
normalized_coverage
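A minimal sketch of a guard based on this coverage (the 0.5 threshold is an arbitrary illustration, not a Kaiaulu recommendation):

# Warn if fewer than half the commits reference an issue id.
if (normalized_coverage < 0.5) {
  warning("Low issue id coverage; bug-related metrics may not be representative.")
}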
We will calculate the following metrics: file churn, bug frequency, non-bug frequency, bug churn, and non-bug churn.
First, we parse the issue ids out of project_git's commit messages and add them to a separate column. Note, as mentioned above, not every commit will have an issue annotated to it.
# Alternative parser, for issues stored as JIRA RSS XML:
project_git <- parse_commit_message_id(project_git, issue_id_regex)
jira_issues <- parse_jira_rss_xml(jira_issues_path)
project_git <- parse_commit_message_id(project_git, issue_id_regex)
jira_issues <- parse_jira(jira_issues_path)[["issues"]]

# Timezone is embedded in a separate field. All times shown in UTC.
jira_issues$issue_tz <- "0000"
jira_issues$issue_updated_datetimetz <- as.POSIXct(jira_issues$issue_updated_datetimetz,
                                                   format = "%Y-%m-%dT%H:%M:%S.000+0000", tz = "UTC")
Time Window Alignment
jira_issues <- jira_issues[(issue_updated_datetimetz >= start_timestamp) &
                           (issue_updated_datetimetz <= end_timestamp)]
file_churn <- metric_file_churn(project_git)
file_bug_frequency <- metric_file_bug_frequency(project_git, jira_issues)
file_non_bug_frequency <- metric_file_non_bug_frequency(project_git, jira_issues)
file_bug_churn <- metric_file_bug_churn(project_git, jira_issues)
file_non_bug_churn <- metric_file_non_bug_churn(project_git, jira_issues)
#kable(head(file_non_bug_frequency[order(-file_bug_frequency)],20))
The left table of the join is the project_dependencies nodes table. All other tables should be left-joined to it, including the git log ones. Otherwise, files no longer present in the snapshot would be included via the git log, which is incorrect.
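A toy illustration (with hypothetical files) of why the dependency nodes table must be the left table: a file present only in the git log, e.g. one deleted before the snapshot, is correctly dropped by the left join.

# Toy left join: "old.java" exists only in the git log and is excluded.
nodes <- data.table(file_pathname = c("a.java", "b.java"))
git_metric <- data.table(file_pathname = c("a.java", "old.java"), churn = c(10L, 5L))
merge(nodes, git_metric, by = "file_pathname", all.x = TRUE)
# => a.java churn 10; b.java churn NA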
flaws_and_outcomes_dt <- copy(project_dependencies[["nodes"]])
setnames(flaws_and_outcomes_dt, old = "filepath", new = "file_pathname")
setnames(file_flaws, old = "file_path", new = "file_pathname")

flaws_and_outcomes_dt <- merge(flaws_and_outcomes_dt, line_metrics_dt, by = "file_pathname", all.x = TRUE)
flaws_and_outcomes_dt <- merge(flaws_and_outcomes_dt, file_flaws, by = "file_pathname", all.x = TRUE)
flaws_and_outcomes_dt <- merge(flaws_and_outcomes_dt, motif_anti_triangle_count_dt, by = "file_pathname", all.x = TRUE)
flaws_and_outcomes_dt <- merge(flaws_and_outcomes_dt, motif_anti_square_count_dt, by = "file_pathname", all.x = TRUE)
flaws_and_outcomes_dt <- merge(flaws_and_outcomes_dt, file_churn, by = "file_pathname", all.x = TRUE)
flaws_and_outcomes_dt <- merge(flaws_and_outcomes_dt, file_bug_frequency, by = "file_pathname", all.x = TRUE)
flaws_and_outcomes_dt <- merge(flaws_and_outcomes_dt, file_non_bug_frequency, by = "file_pathname", all.x = TRUE)
flaws_and_outcomes_dt <- merge(flaws_and_outcomes_dt, file_bug_churn, by = "file_pathname", all.x = TRUE)
flaws_and_outcomes_dt <- merge(flaws_and_outcomes_dt, file_non_bug_churn, by = "file_pathname", all.x = TRUE)
When combining tables, if a file is not listed in a metric calculated from the git log, it is because no file change was associated with the bug. The same is true for file_flaws: if a file was not assigned to an architectural flaw, it will not occur in that table. We therefore fill these missing values with 0:
setnafill(flaws_and_outcomes_dt,
          cols = colnames(flaws_and_outcomes_dt)[2:length(colnames(flaws_and_outcomes_dt))],
          fill = 0)
A sample of the final table is shown below:
head(flaws_and_outcomes_dt) %>% gt(auto_align = FALSE)
#fwrite(flaws_and_outcomes_dt, file.path(project_path, paste0(project_name, "-flaws_smells_vs_outcome_metrics.csv")))