rm(list = ls())
seed <- 1
set.seed(seed)
require(kaiaulu)
require(visNetwork)
require(data.table)
require(stringi)
require(igraph)
require(gh)
require(yaml)
require(magrittr)
require(knitr)
require(gt)

Introduction

This notebook computes three sets of metrics: architectural flaws (via DV8), file social smell motif metrics, and outcome metrics (e.g. file churn and bug frequency), complemented by per-file line metrics.

For implementation details, please refer to the function reference documentation. If you are reading this on the package documentation site, you can simply click a function name to access its documentation.

Project Configuration File

The first step is loading the project configuration file, which contains information on both the project's data provenance and the various parameters used by all the tools. Refer to Kaiaulu's repo conf folder for the configuration file used in the code block below.

tool <- parse_config("../tools.yml")
#conf <- parse_config("../conf/tse_cassandra.yml")
conf <- parse_config("../conf/camel.yml")
perceval_path <- get_tool_project("perceval", tool)
dv8_path <- get_tool_project("dv8", tool)
scc_path <- get_tool_project("scc", tool)

# Gitlog parameters
git_repo_path <- get_git_repo_path(conf)
git_branch <- get_git_branches(conf)[1] # camel 1.6.0


# Depends parameters
depends_jar_path <- get_tool_project("depends", tool)
language <- get_depends_code_language(conf)
keep_dependencies_type <- get_depends_keep_dependencies_type(conf)

# Mailing List
# Specify project_key_index in get_mbox_path() (e.g. "project_key_1")
mbox_path <- get_mbox_path(conf, "project_key_1")

# DV8 parameters 
project_path <- get_dv8_folder_path(conf)
project_name <- stringi::stri_split_regex(project_path,pattern = "/")[[1]]
project_name <- project_name[length(project_name)]
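# Note: basename(project_path) is an equivalent base R alternative to the two lines above.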

flaws_params <- get_dv8_flaws_params(conf)

# Filters
file_extensions <- get_file_extensions(conf)
substring_filepath <- get_substring_filepath(conf)
filter_commit_size <- get_filter_commit_size(conf)


# Issue ID Regex on Commit Messages
issue_id_regex <- get_issue_id_regex(conf)
# Path to Jira Issues (obtained using the `download_jira_data` notebook)
# Specify project_key_index in get_jira_issues_path() (e.g. "project_key_1")
jira_issues_path <- get_jira_issues_path(conf, "project_key_1")
# Specify project_key_index in get_jira_issues_comments_path (e.g. "project_key_1")
jira_issue_comments_path <- get_jira_issues_comments_path(conf, "project_key_1")

Raw Data Pre-Processing for DV8

In order to compute architectural flaws, our first step is to parse the git log and file dependencies for DV8. These will be used to construct historical and structural dependencies in DV8.

Git Log

For the git log, we first parse it into Kaiaulu as a table, and filter it based on the project configuration file's criteria. Here's a sample:

#git_repo_path <- "~/Downloads/testing-cochange/.git"
git_checkout(git_branch,git_repo_path)
project_git <- parse_gitlog(perceval_path,git_repo_path)
project_git <- project_git  %>%
  filter_by_file_extension(file_extensions,"file_pathname")  %>% 
  filter_by_filepath_substring(substring_filepath,"file_pathname") %>% 
  filter_by_commit_size(commit_size = filter_commit_size)

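# The format below matches git's default author/committer date string,
# e.g. "Mon Feb 5 14:45:57 2018 -0800".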
project_git$author_datetimetz <- as.POSIXct(project_git$author_datetimetz,
                                              format = "%a %b %d %H:%M:%S %Y %z", tz = "UTC")
project_git$committer_datetimetz <- as.POSIXct(project_git$committer_datetimetz,
                                              format = "%a %b %d %H:%M:%S %Y %z", tz = "UTC")

kable(head(project_git))

Checking the git log's earliest and latest timestamps is also advised, as git logs on GitHub are sometimes mirrors which have not been updated for years (e.g. Geronimo's), or your local copy may not be up to date.

Earliest date:

project_git[order(author_datetimetz)]$author_datetimetz[1]

Latest date:

project_git[order(-author_datetimetz)]$author_datetimetz[1]

We then transform it into a hdsm JSON file (i.e. hdsmj):

hdsmj_path <- transform_gitlog_to_hdsmj(project_git,
                                        hdsmj_path = file.path(project_path,paste0(project_name,"-hdsm.json")))

Dependencies

Next, we parse file dependencies using Depends, filter the files by the project configuration file's criteria, and proceed to convert them into a DV8 structural dependency matrix JSON (i.e. sdsmj).

project_dependencies <- parse_dependencies(depends_jar_path,git_repo_path,language=language)

project_dependencies[["nodes"]] <- project_dependencies[["nodes"]]  %>%
  filter_by_file_extension(file_extensions,"filepath")  %>% 
  filter_by_filepath_substring(substring_filepath,"filepath")

project_dependencies[["edgelist"]] <- project_dependencies[["edgelist"]]  %>%
  filter_by_file_extension(file_extensions,"src_filepath")  %>% 
  filter_by_file_extension(file_extensions,"dest_filepath")  %>% 
  filter_by_filepath_substring(substring_filepath,"src_filepath") %>%
  filter_by_filepath_substring(substring_filepath,"dest_filepath")


sdsmj_path <- transform_dependencies_to_sdsmj(project_dependencies,
                                    sdsmj_path = file.path(project_path,paste0(project_name,"-sdsm.json")))

list.files(project_path)

Converting to DV8 Binaries

We are now ready to convert the json files into binary format, so they can be merged:

hdsmb_path <- dv8_dsmj_to_dsmb(dv8_path = dv8_path,
                                     dsmj_path = hdsmj_path, 
                                     dsmb_path = file.path(project_path,paste0(project_name,
                                                                               "-hdsm.dv8-dsm")))

sdsmb_path <- dv8_dsmj_to_dsmb(dv8_path = dv8_path,
                                     dsmj_path = sdsmj_path, 
                                     dsmb_path = file.path(project_path,paste0(project_name,
                                                                               "-sdsm.dv8-dsm")))

list.files(project_path)

Merging DSMs

Our next step in the pipeline is to combine both of our binary DSMs, i.e. *-sdsm.dv8-dsm and *-hdsm.dv8-dsm, into a merged DSM, *-merge.dv8-dsm.

mdsmb_path <- dv8_hdsmb_sdsmb_to_mdsmb(dv8_path=dv8_path,
                                         hdsmb_path=hdsmb_path,
                                         sdsmb_path = sdsmb_path,
                                         mdsmb_path = file.path(project_path,
                                                                paste0(project_name,"-merge.dv8-dsm")))
list.files(project_path)

Exploring Merged DSMs

We can load the -hdsm, -sdsm, or -mdsm files in the DV8 GUI to inspect their matrices, or export their .xlsx counterparts via dv8-console. Let's inspect the -mdsm. First, we will perform clustering over the files. The clusters will later be displayed in the DSM as black rectangles. This is an optional step:

hierclsxb_path <- dv8_mdsmb_to_hierclsxb(dv8_path = dv8_path,
                                          mdsmb_path = mdsmb_path,
                                          hierclsxb_path = file.path(project_path,
                                                                     paste0(project_name,
                                                                            "-clsx.dv8-clsx")))
list.files(project_path)

DV8 Analysis

Now that we have a merged DSM, we can perform various analyses in DV8.

Architectural Flaws

DV8 computes a variety of metrics. Let's first observe Architectural Flaws.

# Format Architectural Flaw Parameters to DV8 Command
flaws_params$cliqueDepends <- stringi::stri_c(flaws_params$cliqueDepends,collapse=",")
flaws_params$uihDepends <- stringi::stri_c(flaws_params$uihDepends,collapse=",")
flaws_params$uihInheritance <- stringi::stri_c(flaws_params$uihInheritance,collapse=",")
flaws_folder <- dv8_mdsmb_to_flaws(dv8_path = dv8_path, 
                                    mdsmb_path = mdsmb_path,
                                    flaws_path = file.path(project_path,paste0(project_name,"_flaws")),
                                    is_file_only_metric = FALSE,
                                    cliqueDepends=flaws_params$cliqueDepends,
                                    crossingCochange=flaws_params$crossingCochange,
                                    crossingFanIn=flaws_params$crossingFanIn,
                                    crossingFanOut=flaws_params$crossingFanOut,
                                    mvCochange=flaws_params$mvCochange,
                                    uiCochange=flaws_params$uiCochange,
                                    uihDepends=flaws_params$uihDepends,
                                    uihInheritance=flaws_params$uihInheritance,
                                    uiHistoryImpact=flaws_params$uiHistoryImpact,
                                    uiStructImpact=flaws_params$uiStructImpact)
list.files(project_path)

Exploring Flaws DSMs

We can see a folder of architectural flaws was generated. The folder organization provides us a means to understand which files are assigned to the various types of architectural flaws. The following is a general example from Geronimo (not computed here, but the structure carries over to other larger projects):

geronimo_flaws/
├── modularity-violation
└── package-cycle

This tells us two types of architectural flaws were found in Geronimo: modularity-violation and package-cycle. Moreover, inside each of these folders we see numbered folders:

geronimo_flaws/
├── modularity-violation
│   ├── 1
│   │   ├── 1-clsx.dv8-clsx
│   │   ├── 1-hdsm.dv8-dsm
│   │   ├── 1-merge.dv8-dsm
│   │   ├── 1-sdsm.dv8-dsm
│   │   └── 1.dv8-issue
│   ├── 10
│   │   ├── 10-clsx.dv8-clsx
│   │   ├── 10-hdsm.dv8-dsm
│   │   ├── 10-merge.dv8-dsm
│   │   ├── 10-sdsm.dv8-dsm
│   │   └── 10.dv8-issue

Each numbered folder represents an architectural flaw ID. For example, above we have modularity violation flaw IDs 1 and 10. We can further see the sdsm, hdsm, merged dsm, and cluster files occur within both folder IDs. If we load any of these files in the DV8 GUI, or export them to Excel (as done in the prior section), we can see which files participate in these architectural flaws, and the clusters they are assigned to. We will now see how to represent this information as a table we can parse in R, more specifically the file-to-architectural-flaw assignment, so it can be combined with other analyses in Kaiaulu.
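As a quick sanity check, we can list the flaw-type folders and a few of their numbered flaw IDs directly from R (a minimal sketch using base R; flaws_folder is the path returned by dv8_mdsmb_to_flaws() above):

# List the flaw types found (e.g. modularity-violation, package-cycle)
flaw_types <- list.dirs(flaws_folder, recursive = FALSE)
basename(flaw_types)

# List the first few architectural flaw IDs of the first flaw type
head(basename(list.dirs(flaw_types[1], recursive = FALSE)))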

Creating a Flaws Mapping

The function parse_dv8_architectural_flaws traverses the flaws folder and returns a table assigning each file to the architectural flaw IDs and types it participates in:

file_to_flaws_map <- parse_dv8_architectural_flaws(dv8_path, flaws_folder,progress_bar = TRUE)
fwrite(file_to_flaws_map,file.path(project_path,paste0(project_name,"-flaws_map.csv")))
kable(head(file_to_flaws_map))
file_to_flaws_map <- fread(file.path(project_path,paste0(project_name,"-flaws_map.csv")))

With the flaws mapping, we can then group the table by file path and type of flaw to obtain the primary granularity of this analysis: metrics per file.

file_flaws <- file_to_flaws_map[,.(n_flaws=length(architecture_issue_id)),by=c("file_path",
                                                                 "architecture_issue_type")]
file_flaws <- dcast(file_flaws, file_path ~ ...,value.var = "n_flaws")
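A quick preview of the resulting wide table, one row per file and one column per flaw type, following the notebook's kable convention:

kable(head(file_flaws))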

Alternative to Flaws Mapping

The computation of the flaws mapping can be very slow, as it is done outside DV8. If the interest is only in computing file metrics, and not in aggregating them, then we can request them directly from DV8 like so:

flaws_folder <- dv8_mdsmb_to_flaws(dv8_path = dv8_path, 
                                    mdsmb_path = mdsmb_path,
                                    flaws_path = file.path(project_path,paste0(project_name,"_flaws")),
                                    is_file_only_metric = TRUE,
                                    cliqueDepends=flaws_params$cliqueDepends,
                                    crossingCochange=flaws_params$crossingCochange,
                                    crossingFanIn=flaws_params$crossingFanIn,
                                    crossingFanOut=flaws_params$crossingFanOut,
                                    mvCochange=flaws_params$mvCochange,
                                    uiCochange=flaws_params$uiCochange,
                                    uihDepends=flaws_params$uihDepends,
                                    uihInheritance=flaws_params$uihInheritance,
                                    uiHistoryImpact=flaws_params$uiHistoryImpact,
                                    uiStructImpact=flaws_params$uiStructImpact)
list.files(project_path)

DV8 always generates its file metrics file inside the flaws folder and names it file-measure-report.csv:

flaws_folder <- file.path(project_path,paste0(project_name,"_flaws"))
file_flaws <- fread(file.path(flaws_folder,"file-measure-report.csv"))
setnames(file_flaws,
         old = c("FileName","Clique","Crossing","ModularityViolation","PackageCycle",
                 "UnhealthyInheritance","UnstableInterface"),
         new = c("file_path","clique","crossing","modularity-violation","package-cycle",
                 "unhealthy-inheritance",
                 "unstable-interface"))

With the flaws computed, we move on to the next file metrics of this analysis.

File Social Smell Motif Metrics

To compute the social smell motifs, we require communication data. For this dataset, we will use mailing list and JIRA issue comment communication.

project_mbox <- parse_mbox(perceval_path,mbox_path)
project_jira <- parse_jira_replies(parse_jira(jira_issue_comments_path)) 

# Timezone is embedded in a separate field. All times are shown in UTC.
project_jira$reply_tz <- "0000"

project_jira$reply_datetimetz <- as.POSIXct(project_jira$reply_datetimetz,
                                      format = "%Y-%m-%dT%H:%M:%S.000+0000", tz = "UTC")

# All replies are combined into a single reply table (both parsers share
# Kaiaulu's reply schema; fill = TRUE guards against column mismatches).
project_reply <- rbind(project_mbox, project_jira, fill = TRUE)

project_git <- project_git[order(author_datetimetz)]

project_reply <- project_reply[order(reply_datetimetz)]

The Git, Reply and Source Code (via git_checkout) Snapshot Time Windows Must Align

start_timestamp <- max(min(project_git$author_datetimetz,na.rm = TRUE),
                       min(project_reply$reply_datetimetz,na.rm = TRUE))

end_timestamp <- min(max(project_git$author_datetimetz,na.rm = TRUE),
                     max(project_reply$reply_datetimetz,na.rm = TRUE))

if(start_timestamp > end_timestamp){
  stop("Non-overlapping git log and reply datasets")
}

project_git <- project_git[(author_datetimetz >= start_timestamp) &
                             (author_datetimetz <= end_timestamp )]

project_reply <- project_reply[(reply_datetimetz >= start_timestamp) &
                             (reply_datetimetz <= end_timestamp )]

Identity Match

Because developers may utilize different e-mails within Git, and between Git and the mailing list, we use a set of heuristics to unify their identities across the different e-mails.

project_log <- list(project_git=project_git,project_reply=project_reply)
project_log <- identity_match(project_log,
                                name_column = c("author_name_email","reply_from"),
                                assign_exact_identity,
                               # use_name_only=FALSE,
                                label = "raw_name")

project_git <- project_log[["project_git"]]
project_reply <- project_log[["project_reply"]]

Networks

Having performed the necessary transformations on our data sources, we are ready to transform them into networks, over which our motifs will be computed. Our goal is to create a single graph containing all the information of interest, so we can search it for sub-graphs of interest (i.e. our defined motifs).

A number of transformation functions are available in Kaiaulu to transform the various logs into networks. First, we transform our git log data into a bipartite author-file network:

git_network <- transform_gitlog_to_bipartite_network(project_git,
                                                     mode="author-file")

Next we apply the same transformation to obtain our reply network. Note this reply network is also a bipartite graph, of the type developer-thread. Since the communication occurs on the mailing list and in JIRA, an issue is treated as equivalent to an e-mail thread. Because we wish to "add" communication edges between developers to the git log network, we perform a bipartite projection over developer-thread to obtain a developer-developer network. Here, we chose the weight scheme that, when a thread node is removed by the projection, sums its existing edge weights (i.e. the number of replies to the thread) into the new developer-developer edges.

reply_network <- transform_reply_to_bipartite_network(project_reply)

reply_network <- bipartite_graph_projection(reply_network,
                                                      mode = TRUE,
                                                      weight_scheme_function = weight_scheme_sum_edges)   
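To build intuition for the projection, here is a toy example using plain igraph on hypothetical data (note igraph's multiplicity weight counts shared threads, whereas Kaiaulu's weight_scheme_sum_edges sums the original reply-edge weights):

# Hypothetical toy data: developers A and B both reply to thread T1.
toy <- igraph::graph_from_data_frame(
  d = data.frame(from = c("A","B"), to = c("T1","T1")),
  directed = FALSE)
# Mark the thread node as one side of the bipartite graph.
igraph::V(toy)$type <- igraph::V(toy)$name == "T1"
# Project onto the developer side: A and B become directly connected.
proj <- igraph::bipartite_projection(toy, multiplicity = TRUE)$proj1
igraph::as_data_frame(proj) # edge A--B with weight 1 (one shared thread)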

We can then add the developer-developer network nodes and edges to the developer-file network:

git_reply_network <- list()

git_reply_network[["nodes"]] <- unique(rbind(git_network[["nodes"]],
                                             reply_network[["nodes"]]))

git_reply_network[["edgelist"]] <- rbind(git_network[["edgelist"]],
                                         reply_network[["edgelist"]])

Anti Triangle Motif

To perform motif search, we rely on the igraph library. First, we transform the networks to igraph's network representation:

i_git_reply_network <- igraph::graph_from_data_frame(d=git_reply_network[["edgelist"]], 
                      directed = FALSE, 
                      vertices = git_reply_network[["nodes"]])
#visIgraph(i_git_reply_network,randomSeed = 1)

We also create our anti-motif triangle sub-graph and display it:

motif_triangle <- motif_factory("anti_triangle")
i_triangle_motif <- igraph::graph_from_data_frame(d=motif_triangle[["edgelist"]], 
                      directed = FALSE, 
                      vertices = motif_triangle[["nodes"]])
visIgraph(i_triangle_motif)

Because the motif search expects the "color" node attribute to be numeric, we convert the node color to 1 if black, or 2 otherwise in the igraph network representation:

V(i_triangle_motif)$color <- ifelse(V(i_triangle_motif)$color == "black",1,2)
V(i_git_reply_network)$color <- ifelse(V(i_git_reply_network)$color == "black",1,2)

We can then count the motifs:

## Count subgraph isomorphisms
motif_count <- igraph::count_subgraph_isomorphisms(i_triangle_motif, i_git_reply_network, method="vf2",
                                           edge.color1 = NULL,
                                           edge.color2 = NULL)
motif_count

Or obtain the list of every sub-graph match of the triangle motif:

i_motif_vertice_sequence <- subgraph_isomorphisms(i_triangle_motif, i_git_reply_network, method="vf2",
                                           edge.color1 = NULL,
                                           edge.color2 = NULL)
motif_vertice_sequence <- lapply(i_motif_vertice_sequence,igraph::as_ids)

motif_anti_triangle_dt <- rbindlist(lapply((lapply(motif_vertice_sequence,t)),data.table))
#kable(motif_anti_triangle_dt)

The anti-triangle motif metric is then the number of times a file participates in an instance of the triangle anti-motif:

setnames(motif_anti_triangle_dt,
         old=c("V1","V2","V3"),
         new=c("dev1","dev2","file_pathname"))
motif_anti_triangle_count_dt <- motif_anti_triangle_dt[,.(anti_triangle_motif=.N),by="file_pathname"]
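A preview of the per-file anti-triangle counts:

kable(head(motif_anti_triangle_count_dt))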

Anti Square Motif

For the square motif, we now also consider the file dependencies. Similar to before, we combine the networks, now also including the file network.

file_network <- copy(project_dependencies)
setnames(file_network[["nodes"]],
         old = "filepath",
         new = "name")
file_network[["nodes"]]$type <- FALSE
file_network[["nodes"]]$color <- "#f4dbb5"

edgelist <- file_network[["edgelist"]][,.(from=src_filepath,
                                          to=dest_filepath)]
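# Sum all dependency-type counts (columns 3 onward) into a single edge weight.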
edgelist$weight <- rowSums(file_network[["edgelist"]][,3:ncol(file_network[["edgelist"]]),with=FALSE])
file_network[["edgelist"]] <- edgelist
file_git_reply_network <- list()

file_git_reply_network[["nodes"]] <- unique(rbind(git_network[["nodes"]],
                                             reply_network[["nodes"]],
                                             file_network[["nodes"]]))

file_git_reply_network[["edgelist"]] <- rbind(git_network[["edgelist"]],
                                         reply_network[["edgelist"]],
                                         file_network[["edgelist"]])

We then use igraph for visualizing and computing the motif. First, the network:

i_file_git_reply_network <- igraph::graph_from_data_frame(d=file_git_reply_network[["edgelist"]], 
                      directed = FALSE, 
                      vertices = file_git_reply_network[["nodes"]])

#visIgraph(i_file_git_reply_network,randomSeed = 1)

And then the square motif:

motif_square <- motif_factory("anti_square")
i_square_motif <- igraph::graph_from_data_frame(d=motif_square[["edgelist"]], 
                      directed = FALSE, 
                      vertices = motif_square[["nodes"]])

visIgraph(i_square_motif)

Once more, we transform the color of the nodes to numeric to perform the motif search.

V(i_square_motif)$color <- ifelse(V(i_square_motif)$color == "black",1,2)
V(i_file_git_reply_network)$color <- ifelse(V(i_file_git_reply_network)$color == "black",1,2)

We can then count the motif occurrences:

## Count subgraph isomorphisms
motif_count <- count_subgraph_isomorphisms(i_square_motif, i_file_git_reply_network, method="vf2",
                                           edge.color1 = NULL,
                                           edge.color2 = NULL)
motif_count

Or enumerate where it occurred:

i_motif_vertice_sequence <- subgraph_isomorphisms(i_square_motif, i_file_git_reply_network, method="vf2",
                                           edge.color1 = NULL,
                                           edge.color2 = NULL)
motif_vertice_sequence <- lapply(i_motif_vertice_sequence,igraph::as_ids)

motif_anti_square_dt <- rbindlist(lapply((lapply(motif_vertice_sequence,t)),data.table))
#kable(motif_anti_square_dt)

setnames(motif_anti_square_dt,
         old=c("V1","V2","V3","V4"),
         new=c("dev1","dev2","file_pathname1","file_pathname2"))
motif_anti_square_count_dt_1 <- motif_anti_square_dt[,.(anti_motif_square=.N),by="file_pathname1"]
setnames(motif_anti_square_count_dt_1,
         old = "file_pathname1",
         new = "file_pathname")
motif_anti_square_count_dt_2 <- motif_anti_square_dt[,.(anti_motif_square=.N),by="file_pathname2"]
setnames(motif_anti_square_count_dt_2,
         old = "file_pathname2",
         new = "file_pathname")
motif_anti_square_count_dt <- rbind(motif_anti_square_count_dt_1,
                                    motif_anti_square_count_dt_2)
motif_anti_square_count_dt <- motif_anti_square_count_dt[,.(anti_motif_square=sum(anti_motif_square)),by = file_pathname]
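And likewise a preview of the per-file anti-square counts:

kable(head(motif_anti_square_count_dt))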

Line Metrics

We use scc to compute per-file line metrics (total lines, code, comments, blanks, and complexity):

line_metrics_dt <- parse_line_metrics(scc_path,git_repo_path)
line_metrics_dt <- line_metrics_dt[,.(file_pathname=Location,
                                      lines=Lines,
                                      code=Code,
                                      comments=Comments,
                                      blanks=Blanks,
                                      complexity=Complexity)]
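A preview of the per-file line metrics:

kable(head(line_metrics_dt))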

Calculating Outcome Metrics

Identifying Issue IDs in Commit Messages

The various outcome metrics used in this notebook rely on the traceability between files and issues. This is obtained from the commit messages. We can use a built-in Kaiaulu function to search commit messages for a regular expression (regex) matching the issue id. First, we use the regex to calculate how many commits contain issue ids. Ideally, you should consider projects with high enough coverage, or the results may not be representative.

The total number of commits with issue ids in the chosen git slice is:

commit_message_id_coverage(project_git,issue_id_regex)

Proportion of commit messages containing issue ids relative to all commits in the slice:

normalized_coverage <- commit_message_id_coverage(project_git,issue_id_regex)/length(unique(project_git$commit_hash))
normalized_coverage

We will calculate the following outcome metrics: file churn, bug frequency, non-bug frequency, bug churn, and non-bug churn.

First, we parse the issue ids out of project_git's commit messages and add them to a separate column. Note that, as mentioned above, not every commit will have an issue annotated to it.

project_git <- parse_commit_message_id(project_git, issue_id_regex)
jira_issues <- parse_jira(jira_issues_path)[["issues"]]

# Timezone is embedded in a separate field. All times are shown in UTC.
jira_issues$issue_tz <- "0000"

jira_issues$issue_updated_datetimetz <- as.POSIXct(jira_issues$issue_updated_datetimetz,
                                      format = "%Y-%m-%dT%H:%M:%S.000+0000", tz = "UTC")

Time Window Alignment

As with the git log and reply tables, we filter the JIRA issues to the overlapping time window, and then compute the outcome metrics:

jira_issues <- jira_issues[(issue_updated_datetimetz >= start_timestamp) &
                             (issue_updated_datetimetz <= end_timestamp )]
file_churn <- metric_file_churn(project_git)
file_bug_frequency <- metric_file_bug_frequency(project_git,jira_issues)
file_non_bug_frequency <- metric_file_non_bug_frequency(project_git,jira_issues)
file_bug_churn <- metric_file_bug_churn(project_git,jira_issues)
file_non_bug_churn <- metric_file_non_bug_churn(project_git,jira_issues)


#kable(head(file_non_bug_frequency[order(-file_bug_frequency)],20))

Merge

The base table of the left join is the project_dependencies nodes table. All other tables should be left-joined to it, including the git log ones. Otherwise, files no longer present in the snapshot would be included via the git log, which is incorrect.

flaws_and_outcomes_dt <- copy(project_dependencies[["nodes"]])

setnames(flaws_and_outcomes_dt,
         old = "filepath",
         new = "file_pathname")

setnames(file_flaws,
         old = "file_path",
         new = "file_pathname")

flaws_and_outcomes_dt <- merge(flaws_and_outcomes_dt,line_metrics_dt, by = "file_pathname", all.x = TRUE)
flaws_and_outcomes_dt <- merge(flaws_and_outcomes_dt,file_flaws, by = "file_pathname", all.x = TRUE)
flaws_and_outcomes_dt <- merge(flaws_and_outcomes_dt,motif_anti_triangle_count_dt, by = "file_pathname", all.x = TRUE)
flaws_and_outcomes_dt <- merge(flaws_and_outcomes_dt,motif_anti_square_count_dt, by = "file_pathname", all.x = TRUE)
flaws_and_outcomes_dt <- merge(flaws_and_outcomes_dt,file_churn, by = "file_pathname", all.x = TRUE)
flaws_and_outcomes_dt <- merge(flaws_and_outcomes_dt,file_bug_frequency, by = "file_pathname", all.x = TRUE)
flaws_and_outcomes_dt <- merge(flaws_and_outcomes_dt,file_non_bug_frequency, by = "file_pathname", all.x = TRUE)
flaws_and_outcomes_dt <- merge(flaws_and_outcomes_dt,file_bug_churn, by = "file_pathname", all.x = TRUE)
flaws_and_outcomes_dt <- merge(flaws_and_outcomes_dt,file_non_bug_churn, by = "file_pathname", all.x = TRUE)

Fill Non-Join Matches with 0s

When combining tables, if a file was not listed in a metric calculated from the git log, it is because no change to that file was associated with a bug. The same is true for file_flaws: if a file was not assigned to an architectural flaw, it will not occur in that table. We therefore fill the non-matches with 0:

setnafill(flaws_and_outcomes_dt,
          cols = colnames(flaws_and_outcomes_dt)[-1],
          fill = 0)

A sample of the final table is shown below:

head(flaws_and_outcomes_dt)  %>%
  gt(auto_align = FALSE) 
#fwrite(flaws_and_outcomes_dt,file.path(project_path,paste0(project_name,"-flaws_smells_vs_outcome_metrics.csv")))

