###############################################################################
# Classes for `dupree`
###############################################################################
# Class definition: `EnumeratedCodeTable`
#' An S4 class to represent the code blocks as strings of integers
#'
#' @name EnumeratedCodeTable-class
#' @slot blocks A tbl_df with columns `file`, `block`, `start_line` and
#' `enumerated_code`
#'
methods::setClass("EnumeratedCodeTable", slots = list(blocks = "tbl_df"))
###############################################################################
#' `EnumeratedCodeTable` validation
#'
#' @noRd
#'
.is_enumerated_code_table <- function(object) {
required_cols <- c("file", "block", "start_line", "enumerated_code")
observed_cols <- colnames(object@blocks)
if (
all(required_cols %in% observed_cols)
) {
TRUE
} else {
missing_cols <- setdiff(required_cols, observed_cols)
paste("Column", missing_cols, "should be in object@blocks")
}
}
methods::setValidity(
"EnumeratedCodeTable",
function(object) .is_enumerated_code_table(object)
)
###############################################################################
#' Initialise an `EnumeratedCodeTable`
#'
#' An `EnumeratedCodeTable` contains a `blocks` table. Each row of this table
#' contains details for a block of R code: the filename, block-id and startline
#' of the block, and a tokenized version of the code within that block.
#'
#' Once initialised, the blocks table is ordered by filename and then block-id.
#'
#' @importFrom methods callNextMethod setMethod validObject
#' @importFrom tibble tibble
#'
#' @noRd
#'
methods::setMethod(
"initialize",
"EnumeratedCodeTable",
function(.Object, blocks = NULL, ...) {
.Object <- methods::callNextMethod(...)
default_code_table <- tibble::tibble(
file = character(0), block = integer(0), start_line = integer(0),
enumerated_code = list()
)
if (is.null(blocks)) {
.Object@blocks <- default_code_table
} else {
# we ensure that the code blocks are ordered by file and then block
.Object@blocks <- dplyr::arrange(
blocks, .data[["file"]], .data[["block"]]
)
}
methods::validObject(.Object)
.Object
}
)
###############################################################################
# `find_best_matches`
###############################################################################
# By default we use `lcs` as the sequence-similarity measure
# - for two integer vectors, the lcs-distance is the minimum number of entries
# that need to be removed from both vectors before identity is reached
# - then the lcs-similarity score is 1 - distance / max_length; where
# max_length is the sum of the lengths of the two input vectors
# - d((1, 2, 3, 4), (1, 4, 5, 6)) = 4; s(..., ...) = 1 - 4 / 8
# - we use lcs because it's simple to explain
#' `find_best_matches` between code blocks
#'
#' @noRd
#'
# nocov start
methods::setGeneric("find_best_matches", function(x, ...) {
methods::standardGeneric("find_best_matches")
})
# nocov end
#' `find_best_matches` between code blocks in an `EnumeratedCodeTable`
#'
#' The code blocks are assumed to be ordered within the
#' `EnumeratedCodeTable`, as such when two code blocks are
#' mutually-best-matches, the results returned by this function only contains
#' a single row for those two code blocks; when this happens we guarantee that
#' `file_a` <= `file_b` and `block_a` <= `block_b`
#'
#' @noRd
#'
methods::setMethod(
"find_best_matches",
methods::signature("EnumeratedCodeTable"),
function(x, ...) {
blocks <- x@blocks
enum_codes <- x@blocks$enumerated_code
index_matches <- find_indexes_of_best_matches(enum_codes, ...)
details_a <- blocks[index_matches$index_a, ]
details_b <- blocks[index_matches$index_b, ]
score <- index_matches$score
tibble::tibble(
file_a = details_a$file,
file_b = details_b$file,
block_a = details_a$block,
block_b = details_b$block,
line_a = details_a$start_line,
line_b = details_b$start_line,
score = score
)
}
)
###############################################################################
# Related Functions
###############################################################################
#' One against all search
#'
#' @noRd
#'
.one_against_all <- function(subject_index, enum_codes, sim_func) {
subject <- enum_codes[subject_index]
scores <- sim_func(subject, enum_codes)
scores[subject_index] <- -1
list(
index_a = subject_index,
index_b = which.max(scores),
score = max(scores)
)
}
#' All against all search
#'
#' @param enum_codes List of vectors of integers. Each `int` is an
#' enumerated code for some code-symbol (like a conversion of the
#' code-symbols into a factor).
#' @param method Alignment method for use in
#' `stringdist::seq_sim`.
#' @param ... Further parameters for passing to
#' `stringdist::seq_sim`.
#'
#' @importFrom dplyr arrange desc mutate select
#' @importFrom purrr map_df
#' @importFrom stringdist seq_sim
#' @importFrom tibble tibble
#' @importFrom rlang .data
#'
#' @noRd
#'
find_indexes_of_best_matches <- function(enum_codes, method = "lcs", ...) {
empty_result <- tibble::tibble(
index_a = integer(0), index_b = integer(0), score = numeric(0)
)
if (length(enum_codes) <= 1) {
return(empty_result)
}
sim_func <- function(x, y) {
stringdist::seq_sim(x, y, method = method, ...)
}
# .one_against_all returns df: (index_a, index_b, score)
# For each code-block we want to identify it's closest matching code-block
#
# We only return a code-block pair once (ie, if A-B is a pair and B-A is a
# pair, then we return A-B, but not B-A)
#
# When C-A is a pair but the index of C is greater than that of A, we return
# the pair A-C
scores <- purrr::map_df(
seq_along(enum_codes),
.one_against_all,
enum_codes,
sim_func
) %>%
# ensure the index of A is less than the index of B
dplyr::mutate(
temp = pmax(.data[["index_a"]], .data[["index_b"]]),
index_a = pmin(.data[["index_a"]], .data[["index_b"]]),
index_b = .data[["temp"]]
) %>%
dplyr::select(
-"temp"
) %>%
# only return each code-block pair once
unique() %>%
# order the code-block pairs by decreasing score
dplyr::arrange(
dplyr::desc(.data[["score"]]), .data[["index_a"]], .data[["index_b"]]
)
scores
}
###############################################################################
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.