
# Defines functions: preprocess_code_blocks, tokenize_code_blocks,
#   import_parsed_code_blocks, import_parsed_code_blocks_from_one_file,
#   summarise_enumerated_blocks, enumerate_code_symbols,
#   remove_trivial_code_symbols, get_localised_parsed_code_blocks,
#   annotate_parsed_content, .get_default_annotated_parsed_content


# Functions / Classes for extracting / collapsing / filtering parsed-code
# blocks from a set of files
# - All non-trivial symbols in the code blocks are enumerated (converted to an
#   integer) for use in similarity measurement


#' `.get_default_annotated_parsed_content`
#'
#' Build an empty (zero-row) table that has the column names and column types
#' of annotated parsed content: the parse-data style columns (`line1` ...
#' `text`) plus the `file`, `block` and `start_line` annotation columns.
#'
#' @return       A zero-row table with integer columns `line1`, `col1`,
#'   `line2`, `col2`, `id`, `parent`, `block`, `start_line`; character columns
#'   `token`, `text`, `file`; and a logical column `terminal`.
#' @importFrom   dplyr         tibble
#' @noRd
.get_default_annotated_parsed_content <- function() {
  i0 <- integer(0)
  c0 <- character(0)
  l0 <- logical(0)

  # NOTE(review): the constructor call was missing from this block; a tibble
  # is assumed because the rest of the file is dplyr-based -- confirm.
  dplyr::tibble(
    line1 = i0, col1 = i0, line2 = i0, col2 = i0, id = i0, parent = i0,
    token = c0, terminal = l0, text = c0, file = c0, block = i0,
    start_line = i0
  )
}

#' Add filename, block-number and start-line for the parsed-content for each
#'   code block in a given file
#' @param        parsed_content   The parsed-content for a specific code-block
#'   from running get_source_expressions on a file.
#' @param        file             The filename for the content.
#' @param        block            The block from which the content came.
#' @param        start_line       The start-line of the block in the filename.
#' @return       `parsed_content` with the columns `file`, `block` and
#'   `start_line` added (or overwritten).
#' @importFrom   dplyr         mutate
#' @include      dupree_data_validity.R
#' @noRd
annotate_parsed_content <- function(parsed_content, file, block, start_line) {
  # NOTE(review): the `@include dupree_data_validity.R` tag suggests argument
  # validation may have been performed here; none is visible, so none is
  # added -- confirm against the package source.
  parsed_content %>%
    dplyr::mutate(
      file = file, block = block, start_line = start_line
    )
}

#' Convert a list of source_expressions to a data-frame that contains the
#'   parsed-content from each source expression, and indicates the file,
#'   block-number and start-line for that source expression
#' @param        source_exprs   A list of source-expressions, obtained from
#'   lintr::get_source_expressions.
#' @return       A single data-frame made by row-binding the annotated
#'   parsed-content of every retained source expression. The block number is
#'   the position of the expression among the retained expressions.
#' @importFrom   dplyr         bind_rows
#' @importFrom   purrr         keep   map2
#' @include   dupree_data_validity.R
#' @noRd
get_localised_parsed_code_blocks <- function(source_exprs) {
  # NOTE(review): the arguments to `purrr::keep` were missing from this block.
  # It is assumed that the expressions live in `source_exprs$expressions` and
  # that only entries carrying a `parsed_content` element are kept (the
  # mapper below reads `x$parsed_content`) -- confirm.
  source_blocks <- purrr::keep(
    source_exprs$expressions,
    function(x) "parsed_content" %in% names(x)
  )

  if (length(source_blocks) == 0) {
    # NOTE(review): the body of this branch was missing; returning the empty
    # default table is assumed -- confirm.
    return(.get_default_annotated_parsed_content())
  }

  # Pair each retained expression with its 1-based position, which is used
  # as the block number.
  parsed_content <- purrr::map2(
    source_blocks,
    seq_along(source_blocks),
    function(x, y) {
      annotate_parsed_content(x$parsed_content, x$file, y, x$line)
    }
  )

  dplyr::bind_rows(parsed_content)
}

#' `remove_trivial_code_symbols`
#'
#' Drop the rows of a parsed-content table whose `token` is considered
#' trivial: single-quoted punctuation tokens (e.g. `'('`, `','`) and a fixed
#' set of named tokens (`expr`, `COMMENT`, `LEFT_ASSIGN`, ...).
#'
#' @param        df            A data-frame with a `token` column.
#' @return       `df` without the rows whose `token` is in the drop-list.
#' @importFrom   dplyr         filter
#' @importFrom   rlang         .data
#' @noRd
remove_trivial_code_symbols <- function(df) {
  # TODO: check for presence of `token` column

  # Wrap each string in single quotes, e.g. "(" becomes "'('", so that it can
  # be compared against the quoted punctuation values of the `token` column.
  .quote_wrap <- function(x) {
    gsub(pattern = "^(.*)$", replacement = "'\\1'", x)
  }

  drop_tokens <- c(
    .quote_wrap(
      c("-", "+", ",", "(", ")", "[", "]", "{", "}", "$", "@", ":")
    ),
    "AND2", "NS_GET", "expr", "COMMENT", "LEFT_ASSIGN", "LBB", "EQ_SUB"
  )

  df %>%
    dplyr::filter(!.data[["token"]] %in% drop_tokens)
}

#' enumerate_code_symbols
#'
#' Add an integer code for every distinct value of the `text` column. Equal
#' `text` values receive the same code; codes are the factor-level indices of
#' `text`.
#'
#' @param        df            A data-frame with a `text` column.
#' @return       `df` with an extra integer column `symbol_enum`.
#' @importFrom   dplyr         mutate
#' @importFrom   rlang         .data
#' @noRd
enumerate_code_symbols <- function(df) {
  # TODO: check for `text` column
  df %>%
    dplyr::mutate(symbol_enum = as.integer(factor(.data[["text"]])))
}

#' summarise_enumerated_blocks
#'
#' Collapse an enumerated parsed-content table to one row per code block
#' (identified by `file`, `block` and `start_line`).
#'
#' @param        df            A data-frame with the columns `file`, `block`,
#'   `start_line` and `symbol_enum`.
#' @return       A data-frame with one row per (`file`, `block`,
#'   `start_line`) combination, a list-column `enumerated_code` holding the
#'   `symbol_enum` values of that block, and `block_size`, the number of rows
#'   that were collapsed.
#' @importFrom   dplyr         across   all_of   group_by   summarise   n
#' @importFrom   rlang         .data
#' @noRd
summarise_enumerated_blocks <- function(df) {
  grouping_cols <- c("file", "block", "start_line")
  # NOTE(review): the `group_by` arguments were missing from this block;
  # grouping on `grouping_cols` is assumed since that variable is otherwise
  # unused. No `ungroup()` / `.groups` is visible, so none is added here --
  # confirm whether the result is meant to stay grouped.
  df %>%
    dplyr::group_by(
      dplyr::across(dplyr::all_of(grouping_cols))
    ) %>%
    dplyr::summarise(
      enumerated_code = list(c(.data[["symbol_enum"]])),
      block_size = dplyr::n()
    )
}


#' import_parsed_code_blocks_from_one_file
#'
#' Parse a single file with `lintr::get_source_expressions`, convert the
#' result to an annotated parsed-content table and drop the rows whose
#' `token` is `"COMMENT"`.
#'
#' @param        file          The file to parse; passed on unchanged to
#'   `lintr::get_source_expressions`.
#' @return       The annotated parsed-content table for `file`, without
#'   comment tokens.
#' @importFrom   dplyr         filter
#' @importFrom   lintr         get_source_expressions
#' @importFrom   rlang         .data
#' @noRd
import_parsed_code_blocks_from_one_file <- function(file) {
  file %>%
    lintr::get_source_expressions() %>%
    get_localised_parsed_code_blocks() %>%
    dplyr::filter(!.data[["token"]] %in% "COMMENT")
}

#' import_parsed_code_blocks
#'
#' Import the annotated parsed-content of every file in `files` and stack the
#' per-file tables into a single table.
#'
#' @param        files         The files to import; each element is passed to
#'   `import_parsed_code_blocks_from_one_file`.
#' @return       The row-bound parsed-content tables of all `files`.
#' @importFrom   dplyr         bind_rows
#' @importFrom   purrr         map
#' @noRd
import_parsed_code_blocks <- function(files) {
  files %>%
    purrr::map(import_parsed_code_blocks_from_one_file) %>%
    dplyr::bind_rows()
}

#' tokenize_code_blocks
#'
#' Reduce a parsed-content table to one row of enumerated symbols per code
#' block: trivial tokens are removed, the remaining symbols are enumerated,
#' and the result is summarised per block.
#'
#' @param        block_df      A parsed-content table with (at least) the
#'   columns `token`, `text`, `file`, `block` and `start_line`.
#' @return       The per-block summary produced by
#'   `summarise_enumerated_blocks`.
#' @noRd
tokenize_code_blocks <- function(block_df) {
  # NOTE(review): the final pipeline step was missing from this block;
  # `summarise_enumerated_blocks()` is assumed because the caller filters on
  # the `block_size` column that it creates -- confirm.
  block_df %>%
    remove_trivial_code_symbols() %>%
    enumerate_code_symbols() %>%
    summarise_enumerated_blocks()
}


#' preprocess_code_blocks
#'
#' Import, tokenize and size-filter the code blocks of a set of files, and
#' wrap the result in an `EnumeratedCodeTable`.
#'
#' @param        files         A set of *.R or *.Rmd files over which dupree is
#'   to perform duplicate-identification
#' @param        min_block_size   An integer >= 1. How many non-trivial symbols
#'   must be present in a code-block if that block is to be used in
#'   code-duplication detection.
#' @return       An `EnumeratedCodeTable` built from the blocks whose
#'   `block_size` is at least `min_block_size`.
#' @importFrom   dplyr         filter
#' @importFrom   methods       new
#' @importFrom   rlang         .data
#' @include      dupree_classes.R
#' @noRd
preprocess_code_blocks <- function(files, min_block_size = 40) {
  blocks <- files %>%
    import_parsed_code_blocks() %>%
    tokenize_code_blocks() %>%
    # Keep only the blocks that contain enough non-trivial symbols.
    dplyr::filter(
      .data[["block_size"]] >= min_block_size
    )

  methods::new("EnumeratedCodeTable", blocks)
}

# russHyde/dupree documentation built on April 8, 2024, 10:37 a.m.