suppressPackageStartupMessages({
  library(dplyr)
  library(magrittr)
  library(tidygraph)
  library(ggraph)
  library(stringdist)
})

Background and Links:

Today's packages

Me

Preamble

# Dependencies:
# - lintr, dplyr, purrr, tibble, magrittr, methods, stringdist

# Install `dupree` from GitHub only when it cannot already be loaded.
# `requireNamespace()` asks for the package by name, whereas
# `"dupree" %in% installed.packages()` compared the name against every cell
# of the installed-packages matrix.
if (!requireNamespace("dupree", quietly = TRUE)) {
  # `devtools` is only needed to perform the installation; stop with a clear
  # message instead of relying on `require()`, which just returns FALSE when
  # the package is missing.
  if (!requireNamespace("devtools", quietly = TRUE)) {
    stop(
      "Package 'devtools' is required to install 'dupree' from GitHub",
      call. = FALSE
    )
  }
  devtools::install_github(
    repo = "russHyde/dupree", dependencies = FALSE
  )
}

suppressPackageStartupMessages({
  library(lintr)
  library(dupree)
  library(git2r)
})

Code Smells & Architectural Ideals

"The most common design problems result from code that

Quote: Kerievsky 'Refactoring to Patterns'

See also Fowler 'Refactoring', Martin 'Clean Code' and Jenny Bryan's talk 'Code smells and feels'

Types of duplication

# Address of a remote GIF.
# NOTE(review): presumably the image displayed on this slide; no visible code
# reads `url` -- confirm against the original Rmd.
url <- "https://static.fjcdn.com/large/pictures/9d/b7/9db733_1672275.gif"

{ width=80% }

How to detect duplication?

dupree

Duplication in a script

# Run dupree on a single script and print the result without the two file
# columns. `min_block_size` prevents dupree analysing really small code blocks.
dplyr::select(
  dupree("duplication_heavy.R", min_block_size = 3),
  -file_a, -file_b
)

Duplication in a script (cont.) {.smaller}


Mechanics

Longest Common Subsequence

# LCS distance between two words. Alignment of the words, where "-" marks a
# character left unmatched in the other word:
#   breakf-a---st
#   break-dance--
stringdist::stringdist(a = "breakfast", b = "breakdance", method = "lcs")

Code blocks

-> Sentences of function / variable names

-> "Sentences" of integers

-> Compute similarity score based on longest-common-subsequence

Mechanics (cont.)

Use seq_sim to compute LCS-based similarity between vectors of integers

# Encode a word as an integer vector: each character is replaced by its
# position in the lowercase alphabet (`letters`). Characters that are not
# lowercase letters become NA. Only the first element of `word` is used.
to_ints <- function(word) {
  chars <- strsplit(word, "")[[1]]
  match(chars, letters)
}

# Integer encoding of a single word
to_ints("breakfast")

# LCS-based similarity between the two integer sequences:
#   1 - d_lcs / (|seq1| + |seq2|), where d_lcs is the LCS distance
stringdist::seq_sim(
  a = list(to_ints("breakfast")),
  b = list(to_ints("breakdance")),
  method = "lcs"
)

Duplication in a package

Downloaded the source code for lintr from GitHub using ropensci/git2r.

# temporary dir for storing `lintr`'s source code
lintr_path <- file.path(tempdir(), "lintr")
lintr_repo <- git2r::clone(
  "https://github.com/jimhester/lintr",
  lintr_path
)

Duplication in a package (cont)

Ran dupree on lintr

# Run dupree on the package source at `lintr_path`; `min_block_size = 40` is
# passed straight through to dupree_package().
dups <- dupree::dupree_package(
  lintr_path, min_block_size = 40
)

# Plot every similarity score against its position in the results.
# - `seq_along()` replaces `seq(nrow(dups))`, which yields c(1, 0) when there
#   are no rows.
# - An explicit ggplot() call replaces `qplot()`, which is deprecated in
#   ggplot2 (>= 3.4.0).
ggplot2::ggplot(
  data.frame(
    index = seq_along(dups[["score"]]),
    score = dups[["score"]]
  ),
  ggplot2::aes(x = index, y = score)
) +
  ggplot2::geom_point() +
  ggplot2::labs(
    x = "Index",
    y = "Similarity score:\nversus closest matching block"
  )

Duplication in a package (cont) {.smaller}

# First few block pairs that score above 0.4 and come from two different
# files; file paths are shortened to their base names for display
dups %>%
  dplyr::filter(score > 0.4, file_a != file_b) %>%
  dplyr::mutate(
    file_a = basename(file_a),
    file_b = basename(file_b)
  ) %>%
  head()

GOTO: equals_na_lintr.R

Visualisation of duplication results

We make a tidygraph structure from the similarity scores

# Build a tidygraph object from the similarity scores.
dup_graph <- tidygraph::as_tbl_graph(
  # Edge table: one row per code-block pair with moderate similarity. Each
  # code block is labelled by its file's base name and its start line.
  dplyr::transmute(
    dplyr::filter(dups, score > 0.4),
    from = paste(basename(file_a), line_a),
    to = paste(basename(file_b), line_b),
    type = "duplication",
    score = score
  )
) %>%
  # Strip the trailing " <line>" from each node name to recover the file the
  # code block came from
  dplyr::mutate(filename = gsub("(.*) \\d+$", "\\1", name))

Visualisation of duplication results (cont)

# Draw the duplication graph with the "gem" layout: edges are coloured by
# `type` and sized by `score`; nodes are coloured by `filename`, with the
# node legend switched off
graph_image <- ggraph(dup_graph, layout = "gem") +
  geom_edge_link(
    mapping = aes(colour = type, edge_width = score)
  ) +
  geom_node_point(
    mapping = aes(colour = filename),
    size = 4,
    show.legend = FALSE
  ) +
  theme_graph()

Visualisation of duplication results (cont)

# Evaluate the stored plot object at top level so that it is printed
graph_image

Visualisation of duplication results (cont)

# Same plot with a text label on every node (the node `name`); `repel = TRUE`
# is passed to geom_node_text()
graph_image +
  geom_node_text(
    mapping = aes(label = name),
    repel = TRUE
  )

What was lintr by the way?

refactoRing

Thanks



russHyde/dupree documentation built on April 8, 2024, 10:37 a.m.