# Attach the packages used by the examples below, silencing their start-up
# banners. Each library() call sits on its own line so the block parses.
suppressPackageStartupMessages({
  library(dplyr)
  library(magrittr)
  library(tidygraph)
  library(ggraph)
  library(stringdist)
})
https://github.com/ropensci/git2r
# Dependencies:
# - lintr, dplyr, purrr, tibble, magrittr, methods, stringdist

# Install dupree from GitHub only when it is not already available.
# requireNamespace() is used for the check because it does not scan every
# installed package the way installed.packages() does.
if (!requireNamespace("dupree", quietly = TRUE)) {
  # devtools is only needed for the installation step; stop with a clear
  # message instead of letting require() return FALSE and carry on.
  if (!requireNamespace("devtools", quietly = TRUE)) {
    stop("Package 'devtools' is required to install 'dupree'.", call. = FALSE)
  }
  devtools::install_github(
    repo = "russHyde/dupree",
    dependencies = FALSE
  )
}

# Attach the packages used in the remaining examples without start-up noise.
suppressPackageStartupMessages({
  library(lintr)
  library(dupree)
  library(git2r)
})
"The most common design problems result from code that
Is duplicated
Is unclear
Is complicated"
Quote: Kerievsky 'Refactoring to Patterns'
See also Fowler 'Refactoring', Martin 'Clean Code' and Jenny Bryan's talk 'Code smells and feels'
# Address of a remote GIF; presumably the image shown on this slide (TODO confirm).
url <- "https://static.fjcdn.com/large/pictures/9d/b7/9db733_1672275.gif"
{ width=80% }
Trivial stuff (`library(dplyr)`)
Copy/paste-driven development (similar logic & code)
Functional duplication (same logic, different code)
? False duplication (different logic, similar code)
Python
pylint
(looks for identical lines between files)
Java / C++ / C# etc
R: nothing for source code (AFAIK)
stringdist
ropensci:textreuse
(goodpractice, lintr, styler, cyclocomp, pkgnet)
dupree
All community input is welcome
Most data input is welcome:
`dupree()`
`dupree_dir()`
`dupree_package()`
# min_block_size: used to prevent dupree analysing really small code blocks
dupree("duplication_heavy.R", min_block_size = 3) %>%
  # drop the two file-name columns from the result
  dplyr::select(-file_a, -file_b)
Longest Common Subsequence (called "longest common substring" in stringdist)
# Alignment of the two words (gaps are marked with "-"):
#   breakf-a---st
#   break-dance--
stringdist::stringdist("breakfast", "breakdance", method = "lcs")
Code blocks
-> Sentences of function / variable names
-> "Sentences" of integers
-> Compute similarity score based on longest-common-subsequence
Use `stringdist::seq_sim` to compute an LCS-based similarity between vectors of integers
# Convert a word into the alphabet positions of its characters.
#
# word: a character string; only its first element is used.
# Returns an integer vector with one entry per character: 1 for "a" up to
# 26 for "z", and NA for any character that is not a lower-case letter.
to_ints <- function(word) {
  chars <- strsplit(word, "")[[1]]
  # match() gives each character's index within `letters`
  match(chars, letters)
}

to_ints("breakfast")
# Similarity = 1 - d_lcs / (|seq1| + |seq2|), where d_lcs is the LCS distance
stringdist::seq_sim(
  list(to_ints("breakfast")),
  list(to_ints("breakdance")),
  method = "lcs"
)
Downloaded the source code for `lintr` from GitHub using ropensci/git2r.
# temporary dir for storing `lintr`'s source code
lintr_path <- file.path(tempdir(), "lintr")

# clone the lintr repository from GitHub into that directory
lintr_repo <- git2r::clone(
  "https://github.com/jimhester/lintr",
  lintr_path
)
Ran dupree on lintr
# Run dupree over the downloaded lintr package directory
dups <- dupree::dupree_package(
  lintr_path,
  min_block_size = 40
)
# Plot each row's similarity score against its row index.
# seq_len() is used instead of seq(): seq(0) is c(1, 0), so x and y would
# have different lengths if `dups` had no rows.
ggplot2::qplot(
  x = seq_len(nrow(dups)),
  y = dups[["score"]],
  xlab = "Index",
  ylab = "Similarity score:\nversus closest matching block"
)
# Show the first few higher-scoring pairs whose blocks are in different files
dups %>%
  dplyr::filter(score > 0.4, file_a != file_b) %>%
  # keep only the file name, not the full path
  dplyr::mutate_at(c("file_a", "file_b"), basename) %>%
  head()
We make a tidygraph structure from the similarity scores
# Build a graph of code blocks, with an edge for each similar pair.
dup_graph <- dups %>%
  # keep code-block pairs with moderate similarity:
  dplyr::filter(score > 0.4) %>%
  dplyr::transmute(
    # indicate code-blocks by filename and start-line
    from = paste(basename(file_a), line_a),
    to = paste(basename(file_b), line_b),
    type = "duplication",
    score = score
  ) %>%
  tidygraph::as_tbl_graph() %>%
  # distinguish the file each code block came from
  # (strip the trailing " <line>" from the node name)
  mutate(filename = gsub("(.*) \\d+$", "\\1", name))
# Draw the duplication graph: edges are coloured by type and sized by score,
# nodes are coloured by the file they belong to.
graph_image <- dup_graph %>%
  ggraph(layout = "gem") +
  geom_edge_link(
    aes(colour = type, edge_width = score)
  ) +
  geom_node_point(
    aes(colour = filename),
    size = 4,
    show.legend = FALSE
  ) +
  theme_graph()

# Plain version of the figure
graph_image

# Same figure with each node labelled by its name
graph_image + geom_node_text(aes(label = name), repel = TRUE)
What is lintr, by the way?
style / syntax checker for R
configurable
can be run
in RStudio / vim / atom etc
or on Travis
(and dupree uses lintr's file parsers)
Improving the structure of code (without modifying its function)
The rule of 3
Examples
Figures: Global theming / %+%
Statements: Replace with function call
Common functions: Move to a package
RMarkdown: Configurable reports / child-stubs
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.