R/fix_links.R

Defines functions make_link text_to_links links_within_text_regex resolve_end_node add_sibling_sourcepos fix_broken_link_too fix_broken_link get_start_asis get_link_fragment_nodes gsb ctext make_link_patterns fix_broken_links find_broken_links fix_links

Documented in find_broken_links fix_broken_link fix_broken_links fix_links get_link_fragment_nodes links_within_text_regex make_link make_link_patterns text_to_links

#' Find and fix unresolved template links within an Episode XML body
#'
#' @description
#' Links like `[link text]({{ page.root }}/destination.html)` are not parsed
#' correctly by our commonmark parser and are output as text. Use this to find
#' these missing links and transform them into link or image elements.
#'
#'
#' @param body an XML document.
#' @return `fix_links()`: the modified body
#'
#' @details
#'
#' ## Motivation 
#'
#' Jekyll implements [the liquid template
#' language](https://shopify.github.io/liquid/), which can break some syntax
#' expected by commonmark. If this syntax appears in a link context, that link
#' is rendred as text. Carpentries Lessons created before 2023 use Jekyll
#' and have this templating embedded for many links. 
#'
#' In order to convert a pre-workbench lesson to use The Workbench, we need to
#' make sure all the links are accurately represented to avoid invalid syntax
#' and broken links from sneaking into the lesson. 
#'
#' ## Implementation Details
#'
#' For example, a valid line with a link that looks
#' like `[Home](index.html) and other text` will appear in XML as:
#'
#' ```xml
#' ...
#' <link destination="index.html">Home</link>
#' <text> and other text</text>
#' ...
#' ```
#'
#' However, if a link uses liquid templating for a variable such as: 
#' `[Home]({{ page.root }}/index.html) and other text`, it will appear in XML as
#'
#' ```xml
#' ...
#' <text asis="true">[</text>
#' <text>Home</text>
#' <text asis="true">]</text>
#' <text>({{ page.root }}/index.html) and other text</text>
#' ...
#' ```
#' 
#' Note: the nodes with `asis` elements are from \pkg{tinkr} protecting square
#' brackets. When we run `fix_links()`, these nodes are collapsed into a link:
#'
#' ```xml
#' ...
#' <link destination="{{ page.root }}/index.html">Home</link>
#' <text> and other text</text>
#' ...
#' ```
#'
#' And with that we can further transform the link to replace the liquid
#' templating with something that makes sense in \pkg{sandpaper}.
#'
#' @rdname fix_links
#' @keywords internal
#' @examples
#' loop <- fs::path(lesson_fragment(), "_episodes", "14-looping-data-sets.md")
#' e <- Episode$new(loop, fix_links = FALSE)
#' e$links  # five links
#' e$images # four images
#' 
#' # fix_links() ---------------------------------------------------------------
#' e$body <- asNamespace("pegboard")$fix_links(e$body)
#' e$links  # eight links
#' e$images # five images
#'
fix_links <- function(body) {
  fragments <- find_broken_links(body)
  fix_broken_links(fragments)
  invisible(xml2::read_xml(as.character(body)))
}

#' @rdname fix_links
#'
#' @details
#' `find_broken_links()` uses the pattern generated by `make_link_patterns()`
#' to search for potential links.
#'
#' @return 
#'  - `find_broken_link()`: a list where each element represents a fragmented
#'  link. Inside each element are two elements:
#'   - parent: the parent paragraph node for the link
#'   - nodes: the series of four or five nodes that make up the link text
find_broken_links <- function(body) {
  nodes <- xml2::xml_find_all(body, make_link_patterns(), ns = get_ns())
  purrr::map(nodes, get_link_fragment_nodes)
}

#' @rdname fix_links
#'
#' @details
#' `fix_broken_links()` uses the output of `find_broken_links()` to replace the
#' node fragments with links. 
fix_broken_links <- function(fragments) {
  purrr::walk(fragments, fix_broken_link_too)
}

#' @details
#' `make_link_patterns()` a generator to create an XPath query that will search
#' for liquid markup following a closing bracket.
#'
#' @param ns the namespace prefix to use for the pattern
#' @rdname fix_links
#' @examples
#' asNamespace("pegboard")$make_link_patterns()
make_link_patterns <- function(ns = "md:") {

  predicate <- gsb("(<ctext('({{')> and <ctext('}}')>)")
  asis_nodes <- "text[@asis][text()=']']"
  destination <- glue::glue(
    ".//{ns}{asis_nodes}/following-sibling::{ns}text[{predicate}]"
  )
  return(destination)
}

ctext <- function(x) glue::glue("contains(text(), '{x}')")
gsb <- function(x) glue::glue(x, .open = "<", .close = ">")

#' @details
#' `get_link_fragment_nodes()`: Get the source for the link node fragments
#'
#' @param node a node determined to be a text representation of a link
#'   destination
#' @return 
#'  - `get_link_fragments()`: the preceding three or four nodes, which will be
#'  the text of the link or the alt text of the image.
#' @rdname fix_links
get_link_fragment_nodes <- function(node) {
  the_parent <- xml2::xml_parent(node)
  the_children <- xml2::xml_children(the_parent)
  # find the node in question by testing for identity since they represent the
  # same object, they will be identical. 
  id <- which(purrr::map_lgl(the_children, identical, node))
  # test for image with endsWith because they may have an inline image.
  openid <- get_start_asis(the_children, id)
  is_image <- id > 4
  is_image <- is_image && endsWith(xml2::xml_text(the_children[[openid - 1L]]), "!")
  offset <- openid - is_image
  the_children[seq(offset, id)]
}

# find the asis node that is opener of our fragment 
get_start_asis <- function(chillns, id) {
  XPath <- "boolean(./self::*[@asis][text()='['])"
  openers <- which(xml2::xml_find_lgl(chillns, XPath))
  # get the differences between the id (our link pattern) and the opener
  idx <- id - openers
  # the smallest non-negative integer of differences is the one we want
  non_neg <- idx > 0L
  idx <- which.min(idx[non_neg])
  openers[idx]
}

#' @rdname fix_links
#'
#' @details
#' `fix_broken_link()` takes a set of nodes that comprises a single link and
#' recomposes them into a link or image node.
fix_broken_link <- function(nodes) {
  # get_link_fragment_nodes() returns 4 nodes for a link and 5 nodes for an
  # image to account for the extra "!" in markdown.
  type <- if (is.na(xml2::xml_attr(nodes[[1]], "asis"))) "image" else "link"
  text <- paste(xml2::xml_text(nodes), collapse = "")
  # create the nodes that we use to replace the link fragment nodes
  to_replace <- text_to_links(text, ns = xml2::xml_ns(nodes[[1]]), type = type)
  # insert the replacements before the link fragment nodes
  purrr::walk(to_replace, 
    ~xml2::xml_add_sibling(nodes[[1]], .x, .where = "before")
  )
  # remove the link fragment nodes
  xml2::xml_remove(nodes)
}

fix_broken_link_too <- function(nodes) {
  # find the boundaries and node type
  is_image <- is.na(xml2::xml_attr(nodes[[1]], "asis"))
  start <- if (is_image) 3L else 2L
  new_node_type <- if (is_image) "image" else "link"

  # extract destination node
  final_node <- nodes[[length(nodes)]]
  end <- resolve_end_node(final_node)

  # extract link children nodes
  txt <- nodes[seq(start, length(nodes) - 2L)]
  new_txt <- make_text_nodes(as.character(txt))

  # create new link node before the nodes
  new_node <- xml2::xml_add_sibling(nodes[[1]], new_node_type, 
    destination = end$destination, .where = "before")
  # re-add text into the node
  purrr::walk(new_txt, function(node) {
    xml2::xml_add_child(new_node, node)
  })

  # if we have an extra bit, do not remove the end
  if (length(end$extra)) {
    xml2::xml_set_text(final_node, end$extra)
    nodes <- nodes[-length(nodes)]
  }

  xml2::xml_remove(nodes)
  add_sibling_sourcepos(new_node)
  new_node
}

# When we create a new node for a link,
add_sibling_sourcepos <- function(node) {
  if (!is.na(xml2::xml_attr(node, "sourcepos"))) {
    # bail early if we have a source position
    return(invisible(node))
  }
  # find the sibling node right before this node
  preceding <- xml2::xml_find_first(node, ".//preceding-sibling::*[1]")
  sourcepos <- xml2::xml_attr(preceding, "sourcepos")
  if (is.na(sourcepos)) {
    # add the parent source position if we can't find a sibling
    sourcepos <- xml2::xml_attr(xml2::xml_parent(node), "sourcepos")
  }
  xml2::xml_set_attr(node, "sourcepos", sourcepos)
}

resolve_end_node <- function(node) {
  txt <- xml2::xml_text(node)
  if (startsWith(txt, "(") && endsWith(txt, ")")) {
    destination <- substring(txt, 2L , nchar(txt) - 1L)
    extra <- character(0)
  } else {
    rgx <- "^[(]([^)]+?)[)](.*$)"
    destination <- sub(rgx, "\\1", txt)
    extra <- sub(rgx, "\\2", txt)
  }
  if (length(extra) && destination == extra) {
    XPath <- "./following-sibling::*[@sourcepos][1]"
    next_node <- xml2::xml_find_first(node, XPath)
    next_text <- xml2::xml_text(next_node)
    rgx <- "^([^)].+?)[)](.*$)"
    donor <- sub(rgx, "\\1", next_text)
    trimmed <- sub(rgx, "\\2", next_text)
    if (donor != trimmed) {
      xml2::xml_set_text(next_node, trimmed)
      new <- paste(destination, donor)
      destination <- substring(new, 2L , nchar(new))
      extra <- character(0)
    }
  }
  list(destination = destination, extra = extra)
}

#' @details
#' `links_within_text_regex()`: finding different types of links within markdown
#' text can be challenging because it involves characters used in regex for
#' grouping and character classes. In general, I want to do two things with text
#' that I get back from a document:
#'
#'  1. split the links out from the text
#'  2. identify which parts of the resulting vector are links.
#'
#' This way, I can convert the links to links and the text to text.
#'
#' @rdname fix_links
#' @examples
#'
#' # links_within_text_regex() -------------------------------------------------
#' helpers <- pegboard:::links_within_text_regex()
#' helpers
#' txt <- "text ![image text](a.png) with [a link](b.org) and text"
#' res <- strsplit(txt, helpers["to_split"], perl = TRUE)[[1]]
#' data.frame(res)
#' grepl(helpers["find_links"], res, perl = TRUE)
links_within_text_regex <- function() {

  b1 <- "\\["
  b2 <- "\\]"
  p1 <- "\\("
  p2 <- "\\)"
  woo <- "\\!"
  # Does not match: ][ or )[ ![
  first_b1 <- glue::glue("(?<!({b2}|{p2}|{woo})){b1}")
  # Does not match: ]] or ][ or ](
  last_b2  <- glue::glue("{b2}(?!({b2}|{b1}|{p1}))")
  rgx      <- glue::glue("{first_b1}|{last_b2}|{p2}")
  # Does not match [][ or []( (image links)
  lnk      <- glue::glue("(?<!{b1}){b2}({b1}|{p1})")

  return(c(to_split = rgx, find_links = lnk))
}

#' @details
#' `text_to_links()`: Splits links away from text and returns a nodeset to insert
#' 
#' @param txt text derived from `xml2::xml_text()`
#' @param ns a namespace object
#' @param type either "image" or "link".
#' @param sourcepos defaults to NULL. If this is not NULL, it's the sourcepos
#'   attribute of the text node(s) and will be applied to the new nodes.
#' @return `text_to_links()`: if ns is NULL: a character vector of XML text
#'   nodes, otherwise, new XML text nodes.
#' @rdname fix_links
#' @examples
#'
#' # text_to_links() -----------------------------------------------------------
#' txt <- "Some text [and _a link_]({{ page.root }}/link.to#thing), 
#' some other text."
#' pegboard:::text_to_links(txt, type = "link")
#' md <- c(md = "http://commonmark.org/xml/1.0")
#' class(md) <- "xml_namespace"
#' pegboard:::text_to_links(txt, md, "link")
text_to_links <- function(txt, ns = NULL, type, sourcepos = NULL) {

  regex_helpers <- links_within_text_regex()
  rgx <- regex_helpers["to_split"]
  lnk <- regex_helpers["find_links"]

  texts <- strsplit(txt, rgx, perl = TRUE)[[1]]
  texts <- texts[texts != ""]
  # escape ampersands that are not valid code points, though this will still
  # allow invalid code points, but it's better than nothing
  texts <- gsub("[&](?![#]?[A-Za-z0-9]+?[;])", "&amp;", texts, perl = TRUE)
  are_links <- grepl(lnk, texts, perl = TRUE)
  texts[are_links]  <- purrr::map_chr(texts[are_links], make_link, pattern = lnk, type = type)
  texts[!are_links] <- glue::glue("<text>{texts[!are_links]}</text>")
  if (!is.null(ns)) {
    # TODO: fix this process for creating new nodes. Use the process from 
    # {tinkr} to do this. 
    texts <- xml_new_paragraph(glue::glue_collapse(texts), ns, tag = FALSE)
    texts <- xml2::xml_children(texts)
    xml2::xml_set_attr(texts, "sourcepos", sourcepos)
  }
  texts
}



#' @details
#' `make_link()`: makes a link depending on the link type
#'
#' @param pattern a regular expression that is used for splitting the link
#' from the surrounding text. 
#' @rdname fix_links
make_link <- function(txt, pattern, type = "rel_link") {
  # relative tags are processed
  txt <- if (type == "image") sub("^\\!\\[", '', txt) else txt
  # split the link text and text into a two-element vector
  text_link <- rev(strsplit(txt, pattern, perl = TRUE)[[1]])
  link <- glue::glue_collapse(text_link, sep = "'><text>")
  glue::glue("<{type} destination='{link}</text></{type}>")
}
carpentries/pegboard documentation built on Nov. 13, 2024, 8:53 a.m.