# Kaiaulu - https://github.com/sailuh/kaiaulu
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at https://mozilla.org/MPL/2.0/.
############## Understand Project Builder ##############
#' Build Understand DB
#'
#' Uses Scitools Understand to create an Understand database (.und) from a project's source code.
#'
#' @param scitools_path path to the scitools binary `und`
#' @param project_path path to the project source code folder to create the Understand DB.
#' @param language the primary language of the project (language must be supported by Understand)
#' @param output_dir path to the output directory where `Understand.und` will be created (e.g. `/tmp/`)
#'
#' @return The created Scitools Understand DB path
#' @references See pg. 352 in https://documentation.scitools.com/pdf/understand.pdf Sept. 2024 Edition
#' @export
#' @family parsers
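#' @examples
#' \dontrun{
#' # Illustrative sketch only: the paths below are hypothetical placeholders and must
#' # point to a local Scitools `und` binary and a project source folder.
#' db_path <- build_understand_project(scitools_path = "~/scitools/bin/und",
#'                                     project_path = "~/git_repos/my_project/src",
#'                                     language = "java",
#'                                     output_dir = "/tmp/")
#' }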
build_understand_project <- function(scitools_path, project_path, language, output_dir){
scitools_path <- path.expand(scitools_path)
# Create variables for command line
command <- scitools_path
project_path <- shQuote(project_path) # Quoting the project path
db_dir <- file.path(output_dir, "Understand.und")
args <- c("create", "-db", db_dir, "-languages", language)
# Build the Understand project by parsing through using Understand's und command
build_output <- system2(command, args)
args <- c("-db", db_dir, "add", project_path)
db_output <- system2(command, args)
args <- c("analyze", db_dir)
analyze_output <- system2(command, args)
return(db_dir)
}
#' Extract Understand Dependencies
#'
#' Extracts the XML dependency file, at either class or file granularity, from
#' an Understand DB.
#'
#' @param scitools_path path to the scitools binary `und`
#' @param db_filepath path to the Scitools Understand DB (see \code{\link{build_understand_project}})
#' @param parse_type Type of dependencies to generate into xml (either "file" or "class")
#' @param output_filepath path to the output XML filepath of dependencies
#'
#' @return The path to the exported XML dependency file, i.e. the output_filepath parameter.
#' @references See pg. 352 in https://documentation.scitools.com/pdf/understand.pdf Sept. 2024 Edition
#' @export
#' @family parsers
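#' @examples
#' \dontrun{
#' # Illustrative sketch only: assumes a DB previously created by build_understand_project(),
#' # with placeholder paths.
#' xml_path <- export_understand_dependencies(scitools_path = "~/scitools/bin/und",
#'                                            db_filepath = "/tmp/Understand.und",
#'                                            parse_type = "file",
#'                                            output_filepath = "/tmp/fileDependencies.xml")
#' }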
export_understand_dependencies <- function(scitools_path, db_filepath, parse_type = c("file", "class"), output_filepath){
scitools_path <- path.expand(scitools_path)
# Before running, check if parse_type is correct
parse_type <- match.arg(parse_type)
# Create the variables used in command lines
#db_dir <- file.path(understand_dir, "Understand.und")
#file_name <- paste0(parse_type, "Dependencies.xml")
#xml_dir <- file.path(db_dir, file_name)
# Generate the XML file
# Derived from pg. 352 in https://documentation.scitools.com/pdf/understand.pdf Sept. 2024 Edition
args <- c("export", "-dependencies", parse_type, "cytoscape", output_filepath, db_filepath)
output <- system2(scitools_path, args)
return(output_filepath)
# Generated XML file is assumed to be in this approximate format (regardless of parse_type) using Understand Build 1202
# <graph ...>
# ... [Irrelevant graph attributes and rdf grandchildren]
# <node id="67" label="ObjectMapper id:67">
# <att type="string" name="node.shape" value="rect"/>
# <att type="string" name="node.fontSize" value="5"/>
# <att type="string" name="node.label" value="ObjectMapper"/>
# <att type="string" name="longName" value="com.fasterxml.jackson.databind.ObjectMapper"/>
# <att type="string" name="kind" value="Unknown Class"/>
# <graphics type="RECTANGLE" h="35" w="35" x="0" y="0" fill="#ffffff" width="1" outline="#000000" cy:nodeTransparency="1.0" cy:nodeLabelFont="Default-0-8" cy:borderLineType="solid"/>
# </node>
# ... [Other nodes sharing the format]
# <edge source="2" target="9" label="App(Depends On)CalculatorUI">
# <att type="string" name="edge.targetArrowShape" value="ARROW"/>
# <att type="string" name="edge.color" value="#0000FF"/>
# <att type="string" name="canonicalName" value="App(Depends On)CalculatorUI"/>
# <att type="string" name="interaction" value="Depends On"/>
# <att type="string" name="dependency kind" value="Call, Create"/>
# </edge>
# ... [Other edges sharing the format]
}
############## Parsers ##############
#' Parse Scitools Understand Dependencies XML
#'
#' Parses a file- or class-level Scitools Understand dependency XML into node and edge tables.
#'
#' @param dependencies_path path to the exported Understand dependencies XML file (see \code{\link{export_understand_dependencies}}).
#' @return A named list with two data.tables: `node_list` and `edge_list`.
#' @export
#' @family parsers
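#' @examples
#' \dontrun{
#' # Illustrative sketch only: assumes an XML previously exported by
#' # export_understand_dependencies() to the placeholder path below.
#' parsed <- parse_understand_dependencies("/tmp/fileDependencies.xml")
#' parsed[["node_list"]]
#' parsed[["edge_list"]]
#' }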
parse_understand_dependencies <- function(dependencies_path) {
# Parse the XML file
xml_data <- xmlParse(dependencies_path) # Creates pointer to file
xml_nodes <- xmlRoot(xml_data) # Finds the head: graph
xml_nodes <- xmlChildren(xml_nodes)
# xml_nodes now contains the nodes and edges (which were children of graph) and also graph's atts
# From child nodes- filter for those with name "node"
# Create a list by iterating through all the children in xml_nodes
node_elements <- lapply(xml_nodes, function(child) {
if (xmlName(child) == "node") { # We're searching for nodes, not att or edges
id <- xmlGetAttr(child, "id") # Extract the id from the node line
att_nodes <- xmlChildren(child) # To access the atts of the node
node_label <- xmlGetAttr(att_nodes[[3]], "value") # Relevant att is the 3rd line
long_name <- xmlGetAttr(att_nodes[[4]], "value") # Relevant att is the 4th line
return(data.table(node_label = node_label, id = id, long_name = long_name)) # Returns the table containing the filtered node data
} else {
return(NULL) # Return NULL for the entry to be filtered out later
}
})
# Remove NULLs and combine the results from the node_elements list
node_list <- rbindlist(node_elements[!sapply(node_elements, is.null)], use.names = TRUE, fill = TRUE)
# From child nodes- filter for those with name "edge"
# Create a list by iterating through all the children in xml_nodes
edge_elements <- lapply(xml_nodes, function(child) {
if (xmlName(child) == "edge") { # We're searching for edges, not att or nodes
# Extract the id_from and id_to from the edge line
id_from <- xmlGetAttr(child, "source")
id_to <- xmlGetAttr(child, "target")
att_nodes <- xmlChildren(child) # To access the atts of the edge
dependency_kind <- xmlGetAttr(att_nodes[[5]], "value") # Relevant att is the 5th line
# Handle empty or NULL dependency_kind values: some edges in the exported XML lack a
# "dependency kind" attribute, so those edges are skipped to avoid downstream errors.
if (!is.null(dependency_kind) && dependency_kind != "") {
dependency_kind <- unlist(stri_split(dependency_kind, regex = ",\\s*")) # Separates the string into a vector
return(data.table(id_from = id_from, id_to = id_to, dependency_kind = dependency_kind)) # Returns the table containing the filtered node data
} else {
return(NULL) # Return NULL for the entry to be filtered out later
}
} else {
return(NULL) # Return NULL for the entry to be filtered out later
}
})
# Remove NULLs and combine the results from the edge_elements list
edge_list <- rbindlist(edge_elements[!sapply(edge_elements, is.null)], use.names = TRUE, fill = TRUE)
# Merge edges with nodes to get label_from
edge_list <- merge(edge_list, node_list[, .(id, node_label)], by.x = "id_from", by.y = "id", all.x = TRUE)
setnames(edge_list, "node_label", "label_from")
# Merge again to get label_to
edge_list <- merge(edge_list, node_list[, .(id, node_label)], by.x = "id_to", by.y = "id", all.x = TRUE)
setnames(edge_list, "node_label", "label_to")
# Reorder columns to have label_from and label_to on the left
edge_list <- edge_list[, .(label_from, label_to, id_from, id_to, dependency_kind)]
# Create a list of the network to return
graph <- list(node_list = node_list, edge_list = edge_list)
return(graph)
}
#' Parse dependencies from Depends
#'
#' @param depends_jar_path path to the Depends jar
#' @param git_repo_path path to the git repo (ends in .git)
#' @param language the language of the .git repo (accepts cpp, java, ruby, python, pom)
#' @param output_dir path to the output directory, including a trailing slash (e.g. `/tmp/`)
#' @return A named list with `nodes` and `edgelist` data.tables of file dependencies.
#' @export
#' @family parsers
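#' @examples
#' \dontrun{
#' # Illustrative sketch only: the jar and repository paths are hypothetical placeholders.
#' depends_parsed <- parse_dependencies(depends_jar_path = "~/tools/depends.jar",
#'                                      git_repo_path = "~/git_repos/my_project/.git",
#'                                      language = "java",
#'                                      output_dir = "/tmp/")
#' }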
parse_dependencies <- function(depends_jar_path,git_repo_path,language,output_dir="/tmp/"){
# Expand paths (e.g. "~/Desktop" => "/Users/someuser/Desktop")
depends_jar_path <- path.expand(depends_jar_path)
git_repo_path <- path.expand(git_repo_path)
# Remove ".git"
folder_path <- stri_replace_last(git_repo_path,replacement="",regex=".git")
project_name <- stri_split_regex(folder_path,pattern="/")[[1]]
project_name <- project_name[length(project_name)-1]
# Use Depends to parse the code folder.
system2("java",
args = c("-jar",depends_jar_path,
language,folder_path,
project_name,'--dir',
output_dir,
'--auto-include',
'--granularity=file', '--namepattern=/',
'--format=json'),
stdout = FALSE,
stderr = FALSE)
# Construct /output_dir/ file path
output_path <- stri_c(output_dir, project_name,".json")
# Parsed JSON output.
depends_parsed <- jsonlite::read_json(output_path)
# The JSON has two main parts. The first is a vector of all file names.
file_names <- unlist(depends_parsed[["variables"]])
# Depends will create full filepaths, but folder_path may be a relative path.
# We must guarantee our folder_path is also a full path in order to turn all
# full filepaths generated by Depends into relative paths.
# "../rawdata/git_repo/helix/" => "/Users/cvp/Desktop/kaiaulu/rawdata/git_repo/helix"
normalized_folder_path <- paste0(normalizePath(folder_path),"/")
# /Users/user/git_repos/APR/xml/apr_xml_xmllite.c => "xml/apr_xml_xmllite.c"
file_names <- stri_replace_first(file_names,replacement="",regex=normalized_folder_path)
# The second part is the dependencies itself, which refer to the file name indices.
dependencies <- depends_parsed[["cells"]]
# The types of dependencies is a list of lists. First we unlist the various types.
dependencies_types <- rbindlist(lapply(dependencies,
function(x) as.data.table(x$values)),
fill=TRUE)
# Fixes column types to numeric, and replace NAs by 0s, as an NA means 0 dependencies.
dependencies_types <- data.table(sapply(dependencies_types,as.numeric))
dependencies_types[is.na(dependencies_types)] <- 0
# Then we unlist the src and dest files.
dependencies_files <- rbindlist(lapply(dependencies,
function(x) as.data.table(x[c("src","dest")])),
fill=TRUE)
# And finally we combine them
depends_parsed <- cbind(dependencies_files,dependencies_types)
# We use the file_names to re-label the files for further analysis
# Note the +1: The json assumes a file index starts at 0. R index starts 1, hence the + 1.
depends_parsed$src <- file_names[depends_parsed$src + 1]
depends_parsed$dest <- file_names[depends_parsed$dest + 1]
edgelist <- depends_parsed
data.table::setnames(x = edgelist,
old = c("src","dest"),
new = c("src_filepath","dest_filepath"))
nodes <- data.table(filepath=file_names)
graph <- list(nodes=nodes,edgelist=edgelist)
return(graph)
}
#' Parse Java Code Refactorings
#'
#' @param rminer_path The path to RMiner binary.
#' See \url{https://github.com/tsantalis/RefactoringMiner#running-refactoringminer-from-the-command-line}
#' @param git_repo_path path to git repo (ends in .git)
#' @param start_commit the start commit hash
#' @param end_commit the end commit hash
#' @export
#' @references Nikolaos Tsantalis, Matin Mansouri, Laleh Eshkevari,
#' Davood Mazinanian, and Danny Dig, "Accurate and Efficient Refactoring
#' Detection in Commit History," 40th
#' International Conference on Software Engineering (ICSE 2018),
#' Gothenburg, Sweden, May 27 - June 3, 2018.
#' @keywords internal
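#' @examples
#' \dontrun{
#' # Illustrative sketch only: the binary path is a hypothetical placeholder and the
#' # commit hashes must be replaced by real hashes from the repository.
#' refactorings <- parse_java_code_refactoring_json(rminer_path = "~/tools/RefactoringMiner/bin/RefactoringMiner",
#'                                                  git_repo_path = "~/git_repos/my_project/.git",
#'                                                  start_commit = "<start commit hash>",
#'                                                  end_commit = "<end commit hash>")
#' }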
parse_java_code_refactoring_json <- function(rminer_path,git_repo_path,start_commit,end_commit){
# Expand paths (e.g. "~/Desktop" => "/Users/someuser/Desktop")
rminer_path <- path.expand(rminer_path)
git_repo_path <- path.expand(git_repo_path)
# Remove ".git"
git_uri <- stri_replace_last(git_repo_path,replacement="",regex=".git")
# Use RefactoringMiner to detect refactorings between start_commit and end_commit.
rminer_output <- system2(rminer_path,
args = c('-bc',git_uri,start_commit,end_commit),
stdout = TRUE,
stderr = FALSE)
# Parse the JSON output into a nested list.
rminer_parsed <- jsonlite::parse_json(rminer_output)
return(rminer_parsed)
}
#' Parse File Line Metrics
#'
#' @param scc_path The path to scc binary.
#' See \url{https://github.com/boyter/scc}
#' @param git_repo_path path to git repo (ends in .git)
#' @export
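#' @examples
#' \dontrun{
#' # Illustrative sketch only: the scc binary and repository paths are hypothetical placeholders.
#' line_metrics <- parse_line_metrics(scc_path = "~/tools/scc",
#'                                    git_repo_path = "~/git_repos/my_project/.git")
#' }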
parse_line_metrics <- function(scc_path,git_repo_path){
# Expand paths (e.g. "~/Desktop" => "/Users/someuser/Desktop")
scc_path <- path.expand(scc_path)
git_repo_path <- path.expand(git_repo_path)
# Remove ".git"
folder_path <- stri_replace_last(git_repo_path,replacement="",regex=".git")
# Use scc to compute per-file line metrics for the code folder.
stdout <- system2(
scc_path,
args = c(folder_path, '--by-file','--format','csv'),
stdout = TRUE,
stderr = FALSE
)
line_metrics <- fread(stri_c(stdout,collapse = "\n"))
# /Users/user/git_repos/APR/xml/apr_xml_xmllite.c => "xml/apr_xml_xmllite.c"
line_metrics$Location <- stri_replace_first(line_metrics$Location,
replacement="",
regex=folder_path)
return(line_metrics)
}
#' Parse File Line Type
#'
#' @param utags_path The path to utags binary.
#' See \url{https://github.com/universal-ctags/ctags}
#' @param filepath path to file
#' @param kinds a named list, keyed by lowercased file extension, of the single-letter ctags entity kinds utags should identify per line.
#' @export
#' @keywords internal
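#' @examples
#' \dontrun{
#' # Illustrative sketch only: paths are hypothetical placeholders, and the kind letters
#' # below ("c" for class, "m" for method) are an assumption; check the kinds supported
#' # by your universal-ctags build.
#' parsed_tags <- parse_line_type_file(utags_path = "~/tools/ctags",
#'                                     filepath = "~/git_repos/my_project/src/Main.java",
#'                                     kinds = list(java = c("c", "m")))
#' }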
parse_line_type_file <- function(utags_path,filepath,kinds){
# Expand paths (e.g. "~/Desktop" => "/Users/someuser/Desktop")
utags_path <- path.expand(utags_path)
filepath <- path.expand(filepath)
language <- stri_trans_tolower(last(stri_split_regex(filepath,"\\.")[[1]]))
# Entity Kinds e.g. (function, class, etc) are specified by user.
file_kinds <- kinds[[language]]
# Specify fields of uctags output this function will parse and show to user:
# n = start line
# e = end line
# k = entity kind specified as single letter (i.e. 'f','c', etc)
fields <- c("n","e","k")
stdout <- system2(
command = utags_path,
args = c(
stri_c("--fields=",stri_c(fields, collapse = "")),
stri_c("--kinds-", language,"=",stri_c(file_kinds,collapse=""), collapse =""),
'-f','-',filepath),
stdout = TRUE,
stderr = FALSE
)
parsed_tags <- data.table(
stri_match_first_regex(stdout,
pattern = '^(\\S+)\\t(\\S+)\\t/\\^(.+)\\$?\\/;\"\\t(\\w)\\tline:(\\d+)\\tend:(\\d+)')
)
setnames(parsed_tags,
c("raw_ctags","entity_name","filepath","line_content","entity_type","line_start","line_end"))
parsed_tags[]
return(parsed_tags)
}
#' Parse R File and Function Dependencies
#'
#' @param folder_path The path to a folder containing R source files.
#' @export
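#' @examples
#' \dontrun{
#' # Illustrative sketch only: the folder path is a hypothetical placeholder pointing to
#' # a folder of R source files (e.g. an R package's R/ folder).
#' r_edgelist <- parse_r_dependencies("~/git_repos/kaiaulu/R")
#' }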
parse_r_dependencies <- function(folder_path){
all_filepaths <- list.files(file.path(path.expand(folder_path)),recursive=TRUE,pattern="\\.[Rr]$",full.names=TRUE)
parsed_r_files <- lapply(all_filepaths,parse_rfile_ast)
names(parsed_r_files) <- all_filepaths
parsed_r_files <- lapply(parsed_r_files,function(x)
x[,filepath:= stri_replace_first(filepath,regex = stri_c(folder_path,"/"),replacement = "")])
definitions <- lapply(parsed_r_files,parse_r_function_definition)
definitions <- rbindlist(definitions)
unique_definitions <- unique(definitions)
parse_r_network <- function(parsed_r_file,unique_definitions){
function_edgelist <- parse_r_function_dependencies(parsed_r_file,unique_definitions)
return(function_edgelist)
}
edgelists <- lapply(parsed_r_files,parse_r_network,unique_definitions)
filter_by_ownership <- function(edgelist,unique_definitions){
edgelist <- edgelist[src_functions_call_name %in% unique_definitions$src_functions_name &
src_functions_caller_name %in% unique_definitions$src_functions_name]
return(edgelist)
}
edgelists <- lapply(edgelists,filter_by_ownership,unique_definitions)
edgelists <- rbindlist(edgelists)
names(parsed_r_files) <- sapply(stri_split_regex(all_filepaths,"/"),data.table::last)
return(edgelists)
}
############## Network Transform ##############
#' Transform Understand Dependencies
#'
#' @description Subsets the edges of a table parsed by \code{\link{parse_understand_dependencies}}
#' to the requested dependency kinds, and appends node ids to labels so they are unique.
#'
#' @param parsed Parsed table from \code{\link{parse_understand_dependencies}}
#' @param weight_types The dependency kinds as defined by Understand (e.g. "Call", "Create"). Accepts single string and vector input
#' @export
#' @family edgelists
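#' @examples
#' \dontrun{
#' # Illustrative sketch only: "Call" and "Create" are examples of dependency kinds
#' # Understand may export; adjust to the kinds present in your project.
#' parsed <- parse_understand_dependencies("/tmp/fileDependencies.xml")
#' graph <- transform_understand_dependencies_to_network(parsed,
#'                                                       weight_types = c("Call", "Create"))
#' }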
transform_understand_dependencies_to_network <- function(parsed, weight_types) {
nodes <- parsed[["node_list"]]
edges <- parsed[["edge_list"]]
# Append the node id to each label, as the same file or class name may occur
# again in other parts of the code.
nodes$node_label <- stringi::stri_c(nodes$node_label,"|",nodes$id)
edges$label_from <- stringi::stri_c(edges$label_from,"|",edges$id_from)
edges$label_to <- stringi::stri_c(edges$label_to,"|",edges$id_to)
# Filter out by weights if vector provided
if (length(weight_types) > 0) {
edges <- edges[dependency_kind %in% weight_types]
}
# If filter removed all edges:
if (nrow(edges) == 0) {
stop("Error: No edges found under weight_types.")
}
# Create a list to return
graph <- list(node_list = nodes, edge_list = edges)
return(graph)
}
#' Transform parsed dependencies into a network
#'
#' @param depends_parsed A parsed dependency list from \code{\link{parse_dependencies}}.
#' @param weight_types The weight types as defined in Depends.
#'
#' @export
#' @family edgelists
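#' @examples
#' \dontrun{
#' # Illustrative sketch only: "Call" and "Import" are examples of Depends dependency
#' # types; the default (NA) sums over all types Depends found.
#' depends_parsed <- parse_dependencies(depends_jar_path = "~/tools/depends.jar",
#'                                      git_repo_path = "~/git_repos/my_project/.git",
#'                                      language = "java",
#'                                      output_dir = "/tmp/")
#' file_network <- transform_dependencies_to_network(depends_parsed,
#'                                                   weight_types = c("Call", "Import"))
#' }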
transform_dependencies_to_network <- function(depends_parsed,weight_types=NA){
src <- dest <- weight <- NULL # due to NSE notes in R CMD check
nodes <- depends_parsed[["nodes"]]
edgelist <- depends_parsed[["edgelist"]]
dependency_edgelist <- edgelist[,.(src_filepath,dest_filepath)]
if(any(is.na(weight_types))){
# The default (NA) applies no filter: sum over every dependency type Depends found.
dependency_edgelist$weight <- rowSums(edgelist[,3:ncol(edgelist),with=FALSE])
}else{
# Can only include types the user wants if Depends found them at least once in the codebase.
# Note intersect() drops NA, so the NA check above must happen before this line.
weight_types <- intersect(names(edgelist)[3:ncol(edgelist)],weight_types)
dependency_edgelist$weight <- rowSums(edgelist[,weight_types,with=FALSE])
}
# Remove dependencies not chosen by user
dependency_edgelist <- dependency_edgelist[weight != 0]
setnames(dependency_edgelist,
old=c("src_filepath","dest_filepath"),
new=c("from","to"))
# Select relevant columns for nodes
dependency_nodes <- nodes
setnames(x=dependency_nodes,
old="filepath",
new="name")
# Color files yellow
dependency_nodes <- data.table(name=dependency_nodes$name,color="#f4dbb5")
# Return the parsed JSON output as nodes and edgelist.
file_network <- list()
file_network[["nodes"]] <- dependency_nodes
file_network[["edgelist"]] <- dependency_edgelist
return(file_network)
}
#' Transform parsed R dependencies into a graph
#' @param r_dependencies_edgelist A parsed R folder by \code{\link{parse_r_dependencies}}.
#' @param dependency_type The granularity of the network to build: either "function" or "file"
#' @export
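#' @examples
#' \dontrun{
#' # Illustrative sketch only: builds a function-level network from the edgelist
#' # returned by parse_r_dependencies(); the folder path is a placeholder.
#' r_edgelist <- parse_r_dependencies("~/git_repos/kaiaulu/R")
#' graph <- transform_r_dependencies_to_network(r_edgelist, dependency_type = "function")
#' }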
transform_r_dependencies_to_network <- function(r_dependencies_edgelist,dependency_type=c("function","file")){
mode <- match.arg(dependency_type)
if(mode == "function"){
graph <- model_directed_graph(r_dependencies_edgelist[,.(from=src_functions_call_name,
to=src_functions_caller_name)],
is_bipartite = FALSE,
color = c("#fafad2"))
}else if(mode == "file"){
graph <- model_directed_graph(r_dependencies_edgelist[,.(from=src_functions_call_filename,
to=src_functions_caller_filename)],
is_bipartite = FALSE,
color = c("#f4dbb5"))
}
return(graph)
}
############## Syntax Extractor ##############
#' Creates srcML XML
#'
#' Parses a source code zip, folder, or file and outputs its annotated
#' srcML XML representation.
#'
#' @param srcml_path The path to srcML binary
#' @param src_folder The path to the source code zip, folder or file of analysis
#' @param srcml_filepath The path, filename and extension (.xml) of the output XML file.
#'
#' @return The path where the output xml was saved (i.e. srcml_filepath)
#' @references For details, see \url{https://www.srcml.org/tutorials/creating-srcml.html}.
#' @seealso \code{\link{query_src_text}} to query the output file.
#' @export
#'
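#' @examples
#' \dontrun{
#' # Illustrative sketch only: the srcml binary and source folder paths are hypothetical placeholders.
#' srcml_filepath <- annotate_src_text(srcml_path = "/usr/local/bin/srcml",
#'                                     src_folder = "~/git_repos/my_project/src",
#'                                     srcml_filepath = "/tmp/my_project.xml")
#' }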
annotate_src_text <- function(srcml_path,src_folder,srcml_filepath){
srcml_path <- path.expand(srcml_path)
src_folder <- path.expand(src_folder)
srcml_filepath <- path.expand(srcml_filepath)
srcml_output <- system2(srcml_path,
args = c(src_folder, '--output',srcml_filepath),
stdout = FALSE,
stderr = FALSE)
return(srcml_filepath)
}
#' Query srcML XML
#'
#' Queries srcML XML for code units (e.g. function names, declarations, etc.).
#' For a list of code units and languages supported by srcML XML see:
#' \url{https://www.srcml.org/documentation.html}.
#'
#' Note that a query result, unless explicitly converted to text (e.g. by using the
#' srcML string() function), is itself srcML XML, which can be queried again.
#'
#' @param srcml_path The path to srcML binary
#' @param xpath_query The XPath query to be performed on the .xml
#' @param srcml_filepath The path to the srcML file to be queried
#' (see \code{\link{annotate_src_text}}).
#'
#' @return The query result captured from standard output (srcML XML, or text if the string() function was used).
#' @references For details, see \url{https://www.srcml.org/tutorials/xpath-query.html}.
#' @export
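#' @examples
#' \dontrun{
#' # Illustrative sketch only: queries all class names from a previously annotated file;
#' # the XPath below is the same one used by query_src_text_class_names().
#' result <- query_src_text(srcml_path = "/usr/local/bin/srcml",
#'                          xpath_query = "//src:class/src:name",
#'                          srcml_filepath = "/tmp/my_project.xml")
#' }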
query_src_text <- function(srcml_path,xpath_query,srcml_filepath){
srcml_path <- path.expand(srcml_path)
xpath_query <- path.expand(xpath_query)
srcml_filepath <- path.expand(srcml_filepath)
#srcml --xpath "//src:class/src:name" depends.xml
srcml_output <- system2(srcml_path,
args = c('--xpath',paste0('"',xpath_query,'"'),
srcml_filepath),
stdout = TRUE,
stderr = FALSE)
return(srcml_output)
}
#' Query srcML Class Names
#'
#' This is a convenience function to parse class names out of a project.
#'
#'
#' @param srcml_path The path to srcML binary
#' @param srcml_filepath The path to the srcML file to be queried
#' (see \code{\link{annotate_src_text}}).
#'
#' @return A data.table containing filepath and class name.
#' @references For details, see \url{https://www.srcml.org/documentation.html}.
#' @export
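#' @examples
#' \dontrun{
#' # Illustrative sketch only: paths are hypothetical placeholders.
#' class_names <- query_src_text_class_names(srcml_path = "/usr/local/bin/srcml",
#'                                           srcml_filepath = "/tmp/my_project.xml")
#' }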
query_src_text_class_names <- function(srcml_path,srcml_filepath){
srcml_path <- path.expand(srcml_path)
srcml_filepath <- path.expand(srcml_filepath)
xpath_query <- "//src:class/src:name"
srcml_output <- query_src_text(srcml_path,xpath_query,srcml_filepath)
srcml_output <- XML::xmlTreeParse(srcml_output)
srcml_root <- XML::xmlRoot(srcml_output)
# The children of the root node is a list of unit nodes
srcml_class_names <- XML::xmlChildren(srcml_root)
# Each unit node is of the form:
# <unit revision="1.0.0" language="Java" filename="/path/to/file.java" item="2"><name>someClassName</name></unit>
parse_filepath_and_class_name <- function(unit){
# The class name is a child node of each node
class_name <- XML::xmlValue(unit[[1]])
# The attribute filename contains the filename the class belongs to
filepath <- XML::xmlGetAttr(unit,"filename")
return(data.table(filepath=filepath,classname=class_name))
}
dt_filepath_classname <- rbindlist(lapply(srcml_class_names,parse_filepath_and_class_name))
return(dt_filepath_classname)
}
#' Query srcML Namespace
#'
#' This is a convenience function to parse namespace (package) names out of a project.
#'
#'
#' @param srcml_path The path to srcML binary
#' @param srcml_filepath The path to the srcML file to be queried
#' (see \code{\link{annotate_src_text}}).
#'
#' @return A data.table containing Namespace.
#' @references For details, see \url{https://www.srcml.org/documentation.html}.
#' @export
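#' @examples
#' \dontrun{
#' # Illustrative sketch only: paths are hypothetical placeholders. Note the relative
#' # path computation currently assumes the project folder is named "iotdb"
#' # (see the hard-coded project_name in the function body).
#' namespaces <- query_src_text_namespace(srcml_path = "/usr/local/bin/srcml",
#'                                        srcml_filepath = "/tmp/iotdb.xml")
#' }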
query_src_text_namespace <- function(srcml_path,srcml_filepath){
srcml_path <- path.expand(srcml_path)
srcml_filepath <- path.expand(srcml_filepath)
xpath_query <- "//src:package"
srcml_output <- query_src_text(srcml_path,xpath_query,srcml_filepath)
srcml_output <- XML::xmlTreeParse(srcml_output)
srcml_root <- XML::xmlRoot(srcml_output)
# The children of the root node is a list of unit nodes
srcml_class_names <- XML::xmlChildren(srcml_root)
# Each unit node is of the form:
# <unit revision="1.0.0" language="Java" filename="/Users/lzhan/Desktop/rawdata/git_repo/iotdb/tsfile/src/test/java/org/apache/iotdb/tsfile/read/reader/FakedMultiBatchReader.java" item="1"><package>package <name><name>org</name><operator>.</operator><name>apache</name><operator>.</operator><name>iotdb</name><operator>.</operator><name>tsfile</name><operator>.</operator><name>read</name><operator>.</operator><name>reader</name></name>;</package></unit>
parse_namespace <- function(unit){
# The package declaration text is the child node of each unit node
class_name <- XML::xmlValue(unit[[1]])
class_name <- sub("^package", "", class_name)
class_name <- sub(";$","",class_name)
# The attribute filename contains the filename the class belongs to
filepath <- XML::xmlGetAttr(unit,"filename")
# Note: the project name is currently hard-coded; it is used to strip the absolute path prefix.
project_name <- 'iotdb'
# Create a regular expression pattern using project_name
pattern <- paste0('.*?', project_name, '/')
# Get relative path
filepath <- sub(pattern, '', filepath)
filename <- sub("\\.java$", "", basename(filepath))
full_path <- paste(class_name, filename, sep=".")
return(data.table(filepath=filepath, namespace=full_path))
}
dt_filepath_classname <- rbindlist(lapply(srcml_class_names,parse_namespace))
return(dt_filepath_classname)
}
############## GoF Detection ##############
#' Write GoF Patterns
#'
#' Runs Tsantalis' `pattern4.jar` on a folder of compiled class files and writes
#' the detected GoF patterns to an XML file.
#' Pattern4.jar is available on
#' [Tsantalis' homepage](https://users.encs.concordia.ca/~nikolaos/pattern_detection.html).
#'
#' @param pattern4_path The path to Tsantalis' pattern4 jar
#' @param class_folder_path The path to a folder one
#' level above subdirectories that contain the class files.
#' @param output_filepath Optional path to store the XML generated by pattern4. If not
#' specified, it will be saved to `/tmp/gof.xml`.
#'
#' @return The path to the XML file written by pattern4 (i.e. output_filepath), which can be parsed by \code{\link{parse_gof_patterns}}.
#' @references N. Tsantalis, A. Chatzigeorgiou, G. Stephanides, S. T. Halkidis,
#' "Design Pattern Detection Using Similarity Scoring",
#' IEEE Transactions on Software Engineering,
#' vol. 32, no. 11, pp. 896-909, November, 2006.
#' @export
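#' @examples
#' \dontrun{
#' # Illustrative sketch only: the jar and compiled class folder paths are hypothetical placeholders.
#' write_gof_patterns(pattern4_path = "~/tools/pattern4.jar",
#'                    class_folder_path = "~/git_repos/my_project/target/classes",
#'                    output_filepath = "/tmp/gof.xml")
#' }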
write_gof_patterns <- function(pattern4_path,class_folder_path,output_filepath='/tmp/gof.xml'){
pattern4_path <- path.expand(pattern4_path)
class_folder_path <- path.expand(class_folder_path)
if(!file.exists(pattern4_path)) stop("The specified pattern4_path does not exist!")
if(!dir.exists(class_folder_path)) stop("The specified class_folder_path does not exist!")
# java -Xms32m -Xmx512m -jar pattern4.jar -target "C:\foo\myclasses" -output "C:\foo\output.xml"
pattern4_output <- system2("java",
args = c('-Xms64m','-Xmx100000m','-jar',
pattern4_path,
'-target',paste0('"',class_folder_path,'"'),
'-output',paste0('"',output_filepath,'"')),
stdout = TRUE,
stderr = FALSE)
return(output_filepath)
}
#' Parse GoF Patterns
#'
#' Parses the GoF patterns XML generated by \code{\link{write_gof_patterns}} into a table.
#' Pattern4.jar is available on
#' [Tsantalis' homepage](https://users.encs.concordia.ca/~nikolaos/pattern_detection.html).
#'
#' @param output_filepath Optional path to read the XML generated by \code{\link{write_gof_patterns}}. If not
#' specified, it will be assumed saved to the temporary folder path `/tmp/gof.xml`.
#'
#' @return A data.table containing the parsed gof patterns per class.
#' @references N. Tsantalis, A. Chatzigeorgiou, G. Stephanides, S. T. Halkidis,
#' "Design Pattern Detection Using Similarity Scoring",
#' IEEE Transactions on Software Engineering,
#' vol. 32, no. 11, pp. 896-909, November, 2006.
#' @export
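#' @examples
#' \dontrun{
#' # Illustrative sketch only: assumes the XML written by write_gof_patterns() exists
#' # at the default placeholder path.
#' gof_patterns <- parse_gof_patterns("/tmp/gof.xml")
#' }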
parse_gof_patterns <- function(output_filepath='/tmp/gof.xml'){
gof_pattern_xml <- XML::xmlTreeParse(output_filepath)
# The <system> root node enumerates a fixed number of <pattern> tags.
gof_root <- XML::xmlRoot(gof_pattern_xml) #class => XML Node
patterns <- XML::xmlChildren(gof_root) #class => XMLNodeList (lapply safe)
parse_instance <- function(instance){
roles <- XML::xmlChildren(instance)
role_names <- sapply(roles,XML::xmlGetAttr,"name")
element <- sapply(roles,XML::xmlGetAttr,"element")
instance <- data.table(instance_id,
role_name = role_names,
element = element)
instance_id <<- instance_id + 1
return(instance)
}
parse_pattern <- function(pattern){
# Each GoF pattern, if occurring on the code, is assigned an instance
n_instances <- XML::xmlSize(pattern)
# The XML mentions the pattern name even with no instances detected. We do not
# include the pattern name if no instances are detected.
if(n_instances > 0){
# Note counter bypasses lapply scope <<-
instance_id <<- 1
pattern_name <- XML::xmlGetAttr(pattern,"name")
instances <- XML::xmlChildren(pattern)
instances_dt <- rbindlist(lapply(instances,parse_instance))
instances_dt$pattern_name <- pattern_name
return(instances_dt)
}else{
return(data.table())
}
}
patterns_dt <- rbindlist(lapply(patterns,parse_pattern))
patterns_dt <- patterns_dt[,.(pattern_name,instance_id,role_name,element)]
return(patterns_dt)
}
#' Subset GoF Classes
#'
#' The output of \code{\link{write_gof_patterns}} contains not only
#' the participation of a class in a GoF Pattern, but also
#' the participation of methods and variables when applicable.
#' To distinguish a row entry among class, method or variable,
#' we must subset the role names that are associated to classes.
#' This information can be obtained by inspecting the source code
#' of a similar tool to pattern4 by Tsantalis.
#'
#' More specifically, every pattern that pattern4.jar can identify is
#' defined as a PatternDescriptor in DPD4Eclipse/src/gr/uom/java/pattern
#' /PatternGenerator.java (see: https://github.com/tsantalis/DPD4Eclipse).
#'
#' E.g. The PatternDescriptor Decorator has rowNameList.add("Component");
#' rowNameList.add("Decorator"); Therefore, in the XML output by pattern4.jar,
#' it is guaranteed when (pattern_name == "Decorator" & role_name == "Component") or
#' (pattern_name == "Decorator" & role_name == "Decorator").
#'
#' By following this process, a list of role names can be defined to subset the table
#' to only contain classes.
#'
#' Note pattern4 operates on bytecode, hence the classes are identified by their namespace.
#' Refer to \code{\link{query_src_text_namespace}} to obtain a table to map namespace classes
#' to filepaths.
#' @param gof_patterns A table of parsed GoF Patterns
#' obtained from \code{\link{parse_gof_patterns}}.
#' @export
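#' @examples
#' \dontrun{
#' # Illustrative sketch only: keeps only the rows of the GoF table whose role names
#' # correspond to classes.
#' gof_patterns <- parse_gof_patterns("/tmp/gof.xml")
#' gof_classes <- subset_gof_class(gof_patterns)
#' }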
subset_gof_class <- function(gof_patterns){
lead_patterns <- c('Creator', 'Abstraction', 'Adapter', 'Singleton', 'Prototype', 'Decorator', 'AbstractClass', 'Composite', 'Subject', 'State', 'Visitor', 'Strategy', 'Observer', 'Command', 'Handler', 'Component', 'Context', 'Implementor', 'ConcreteElement', 'Prototype', 'Client', 'Proxy', 'RealSubject', 'Subject', 'FamilyHead', 'Redirecter')
return(gof_patterns[role_name %in% lead_patterns])
}
# Various imports
utils::globalVariables(c("."))
#' @importFrom magrittr %>%
#' @importFrom stringi stri_replace_last
#' @importFrom stringi stri_replace_first
#' @importFrom stringi stri_match_all
#' @importFrom stringi stri_match_first_regex
#' @importFrom stringi stri_detect_regex
#' @importFrom stringi stri_c
#' @importFrom stringi stri_split_regex
#' @importFrom stringi stri_trans_tolower
#' @importFrom data.table data.table
#' @importFrom data.table is.data.table
#' @importFrom data.table as.data.table
#' @importFrom data.table .N
#' @importFrom data.table transpose
#' @importFrom data.table :=
#' @importFrom data.table copy
#' @importFrom data.table rbindlist
#' @importFrom data.table setkey
#' @importFrom data.table setkeyv
#' @importFrom data.table setnames
#' @importFrom data.table last
NULL