R/03b_doubleSpaceLinter.R

Defines functions get_comment_nodes_with_prior_code get_code_lints adjust_lint_location extract_double_space_nodes double_space_linter

Documented in adjust_lint_location double_space_linter extract_double_space_nodes get_code_lints get_comment_nodes_with_prior_code

#' @describeIn INWTLinters Are there double whitespaces?
#' @export
double_space_linter <- function() {
  # X-Path logic to find the deepest node with in-text double spaces. Either
  # COMMENT or STR_CONST.
  comment_string_xpath <- "//descendant::*[contains(., '  ') and not(descendant::*[contains(., '  ')])]"
  # Same logic for non-in-text double spaces (in remaining script content).
  code_xpath <- "//descendant::*[not(descendant::*) and @end and not(ancestor-or-self::COMMENT)]"
  Linter(function(source_expression) {
    if (!is_lint_level(source_expression, level = "file")) {
      return(list())
    }

    # Set the correct format of the xml-parsed script.
    xml <- source_expression$full_xml_parsed_content
    xml_df <- source_expression$full_parsed_content

    # Type 1: Lints in strings and comments
    comment_string_nodes <- xml_find_all(xml, comment_string_xpath)
    comment_string_nodes_filt <- extract_double_space_nodes(comment_string_nodes)
    adjust_lint_location(xml_df, comment_string_nodes_filt)

    # The `range_start_xpath` is altered such that we correct for the loc_diff
    comment_string_lints <- xml_nodes_to_lints(
      comment_string_nodes_filt,
      source_expression = source_expression,
      lint_message = "Inappropriate spacing between elements in non-code content detected.",
      type = "style",
      range_start_xpath = "number(./@lint_loc) + number(./@loc_diff)",
      range_end_xpath = "number(./@col2)"
    )

    # Type 2: Lints in code
    code_nodes <- xml_find_all(xml, code_xpath)
    code_lines <- xml_attr(code_nodes, "line1") # code lines with same line number
    unique_lines <- unique(code_lines)

    # Nodes have already been converted to lints in `get_code_lints()`.
    code_lints <- get_code_lints(unique_lines, code_nodes, source_expression)

    # Type 3: code and comment in same line
    code_before_comment_nodes <- get_comment_nodes_with_prior_code(comment_string_nodes, code_nodes)

    code_before_comment_lints <- xml_nodes_to_lints(
      code_before_comment_nodes,
      source_expression = source_expression,
      lint_message = "Inappropriate spacing between elements in code-before-comment content detected.",
      type = "style",
      range_start_xpath = "number(./@col1) + 1",
      range_end_xpath = "number(./@col2)"
    )
    # Put collected lints of all three types together.
    lintlist <- c(comment_string_lints, code_lints, code_before_comment_lints)
  })
}

#' @title Extract nodes with incorrect double spaces
#' @param comment_nodes Node list extracted with the XPath for Comments/Strings
#' @returns Node list of valid lints, so invalid double spaces
#' @description First collects the locations of the in-text double spaces in
#' a dataframe. Distinguishes between "#" and "#'" in the beginning.
extract_double_space_nodes <- function(comment_nodes) {
  valid_nodes <- list()
  for (node in comment_nodes) {
    text <- xml_text(node)
    double_spaces <- str_locate_all(text, "\\s{2,}")[[1]]

    if (nrow(double_spaces) > 0) {

      has_invalid_match <- FALSE
      starts <- double_spaces[, "start"]
      ends <- double_spaces[, "end"]
      # This loop picks out the problematic double spaces and neglects the other
      for (position in seq_along(starts)) {
        start_pos <- starts[position]
        end_pos <- ends[position]
        char_before <- ifelse((start_pos > 1), substr(text, start_pos - 1, start_pos - 1), "")
        char_after <- ifelse((end_pos < nchar(text)), substr(text, end_pos + 1, end_pos + 1), "")
        # First condition to flag general invalid patterns. Double spaces after
        # "#" could be an indented line which was commented out, whereas double
        # spaces before "#" are often used in headers as a measure of style.
        if (char_before != "#" && char_after != "#") {
          has_invalid_match <- TRUE
        }
        # Second condition to handle exceptions for "#'". This refers to roxygen2
        # type documentation, where we use "#'" in the beginning. Using only a
        # single position character as before, "'     CHAR" would be categorized
        # a lint, which is not true.

        # roxygen2 correction
        if (substr(text, start_pos - 2, start_pos - 1) == "#'" && char_after != "#") {
          has_invalid_match <- FALSE
        }
        # Rule out cases where line breaks produce double spaces
        if (xml_attr(node, "line1") != xml_attr(node, "line2") && grepl("\n", xml_text(node))) {
          has_invalid_match <- FALSE
        }
        if (has_invalid_match) {
          # Stops at the first lint detection.
          break
        }
      }
      if (has_invalid_match) {
        valid_nodes[[length(valid_nodes) + 1]] <- node
        xml2::xml_set_attr(node, "lint_loc", as.character(start_pos))
        # "lint_loc" is used in the `range_start_xpath` for `comment_string_lints`.
      }
    }
  }

  return(valid_nodes)
}

#' @title Correcting Lint Locations
#' @param df A dataframe of the full parsed content of the underlying script
#' @param valid_nodes A node list containing detected lints from comments/strings
#' @returns An node list extended by the loc_diff
#' @description The lint location must be corrected and the
#' following code seeks to add the amount of spaces that was overlooked
#' when interpreting a COMMENT with code in front of it (code is invisible).
adjust_lint_location <- function(df, valid_nodes) {
  unique_lines <- unique(df[, "line1"])

  for (line_number in unique_lines) {
    line_data <- df[df[, "line1"] == line_number, ]
    if (any(line_data[, "token"] %in% c("STR_CONST", "COMMENT"))) {
      target_col1 <-
        line_data[line_data[, "token"] %in% c("STR_CONST", "COMMENT"), "col1"][1]
      # Subtracting by 1 gives us the amount of space before comments. It consists of
      # code and white space.
      diff <- target_col1 - 1
      for (node in valid_nodes) {
        if (as.integer(xml_attr(node, "line1")) == line_number) {
          xml2::xml_set_attr(node, "loc_diff", as.character(diff))
        }
      }
    }
  }
}

#' @title Extract lints from code
#' @param unique_lines A vector of line numbers for each instance of written code
#' @param code_nodes Node list extracted with the XPath for code
#' @param source_expression Contains the XML parsed content. Needed for
#' transforming nodes to lints
#' @returns List of lints containing invalid double spaces
#' @description The function logic is to compare locations of nodes in the
#' same `line1` value, which stands for the line number of a node in a Script.
#' A node in code can be, for example, a comma, an assignment arrow  or a number.
#' F.e., consider these three nodes: "(", "," and ")". If the first node ends at
#' position "4" and the second node starts at a position larger than "7", we
#' have at least a double space in between (position 5 + 6). The second node
#' `i+1` (the right part of a double space) is directly converted into a lint
#' list.
get_code_lints <- function(unique_lines, code_nodes, source_expression) {
  # Extract attributes for each node
  code_lints <- list()
  code_lines <- xml_attr(code_nodes, "line1") # code lines with same line number
  code_starts <- as.integer(xml_attr(code_nodes, "start"))
  code_ends <- as.integer(xml_attr(code_nodes, "end"))

  for (line in unique_lines) {
    indices_on_line <- which(code_lines == line)

    for (index in indices_on_line[-length(indices_on_line)]) {
      current_end <- code_ends[index]
      next_start <- code_starts[index + 1]

      if (next_start - current_end > 2) {
        lint <- xml_nodes_to_lints(
          code_nodes[index + 1],
          source_expression = source_expression,
          lint_message = "Inappropriate spacing between elements in code content detected.",
          type = "style",
          range_start_xpath = "number(./@col1) - 1",
          range_end_xpath = "number(./@col2)"
        )
        code_lints <- c(code_lints, lint)
      }
    }
  }
  return(code_lints)
}

#' @title Extract lints between code and comments
#' @param comment_nodes Node list extracted with the XPath for comments/strings
#' @param code_nodes Node list extracted with the XPath for code
#' @returns Node List containing lints between code and comments
#' @description We need to account for special cases like
#' "x <- 3      #   Hello world", where both double spaces are invalid and
#' should therefore be detected as lint. This function first pairs nodes
#' that have code and text on the same line and then compares the amount of
#' spaces
#' - between code and "#",
#' - between "#" and text.
get_comment_nodes_with_prior_code <- function(comment_nodes, code_nodes) {
  valid_comments <- list()

  for (node in comment_nodes) {
    comment_node <- node
    comment_line <- xml_attr(comment_node, "line1")
    comment_start <- as.integer(xml_attr(comment_node, "start"))
    comment_text <- xml_text(comment_node)

    hash_pos <- regexpr("#", comment_text)

    if (hash_pos != -1) { # Default if there is no "#" equals -1.
      substring_after_hash <- substring(comment_text, hash_pos + 1)
      space_count_after_hash <- str_count(substring_after_hash, "\\G\\s")
      num_spaces_after_hash <- ifelse((space_count_after_hash > 0),
                                      space_count_after_hash,
                                      nchar(substring_after_hash)
      )

      code_on_same_line <- code_nodes[xml_attr(code_nodes, "line1") == comment_line]
      if (length(code_on_same_line) > 0) {
        code_ends <- as.integer(xml_attr(code_on_same_line, "end"))
        valid_code_ends <- code_ends[code_ends < comment_start]

        if (length(valid_code_ends) > 0) {
          last_code_end <- max(valid_code_ends)
          # The node is written in a node list if at least one case is
          # met ...
          space_between_code_and_comment <- comment_start - last_code_end - 1
          if (space_between_code_and_comment > 1 ||
              num_spaces_after_hash > 1) {
            valid_comments[[length(valid_comments) + 1]] <- comment_node
          }
        }
      }
    }
  }
  return(valid_comments)
}
INWT/INWTUtils documentation built on May 22, 2024, 4:45 p.m.