#' Find and fix unresolved template links within an Episode XML body
#'
#' @description
#' Links like `[link text]({{ page.root }}/destination.html)` are not parsed
#' correctly by our commonmark parser and are output as text. Use this to find
#' these missing links and transform them into link or image elements.
#'
#'
#' @param body an XML document.
#' @return `fix_links()`: the modified body
#'
#' @details
#'
#' ## Motivation
#'
#' Jekyll implements [the liquid template
#' language](https://shopify.github.io/liquid/), which can break some syntax
#' expected by commonmark. If this syntax appears in a link context, that link
#' is rendred as text. Carpentries Lessons created before 2023 use Jekyll
#' and have this templating embedded for many links.
#'
#' In order to convert a pre-workbench lesson to use The Workbench, we need to
#' make sure all the links are accurately represented to avoid invalid syntax
#' and broken links from sneaking into the lesson.
#'
#' ## Implementation Details
#'
#' For example, a valid line with a link that looks
#' like `[Home](index.html) and other text` will appear in XML as:
#'
#' ```xml
#' ...
#' <link destination="index.html">Home</link>
#' <text> and other text</text>
#' ...
#' ```
#'
#' However, if a link uses liquid templating for a variable such as:
#' `[Home]({{ page.root }}/index.html) and other text`, it will appear in XML as
#'
#' ```xml
#' ...
#' <text asis="true">[</text>
#' <text>Home</text>
#' <text asis="true">]</text>
#' <text>({{ page.root }}/index.html) and other text</text>
#' ...
#' ```
#'
#' Note: the nodes with `asis` elements are from \pkg{tinkr} protecting square
#' brackets. When we run `fix_links()`, these nodes are collapsed into a link:
#'
#' ```xml
#' ...
#' <link destination="{{ page.root }}/index.html">Home</link>
#' <text> and other text</text>
#' ...
#' ```
#'
#' And with that we can further transform the link to replace the liquid
#' templating with something that makes sense in \pkg{sandpaper}.
#'
#' @rdname fix_links
#' @keywords internal
#' @examples
#' loop <- fs::path(lesson_fragment(), "_episodes", "14-looping-data-sets.md")
#' e <- Episode$new(loop, fix_links = FALSE)
#' e$links # five links
#' e$images # four images
#'
#' # fix_links() ---------------------------------------------------------------
#' e$body <- asNamespace("pegboard")$fix_links(e$body)
#' e$links # eight links
#' e$images # five images
#'
fix_links <- function(body) {
fragments <- find_broken_links(body)
fix_broken_links(fragments)
invisible(xml2::read_xml(as.character(body)))
}
#' @rdname fix_links
#'
#' @details
#' `find_broken_links()` uses the pattern generated by `make_link_patterns()`
#' to search for potential links.
#'
#' @return
#' - `find_broken_link()`: a list where each element represents a fragmented
#' link. Inside each element are two elements:
#' - parent: the parent paragraph node for the link
#' - nodes: the series of four or five nodes that make up the link text
find_broken_links <- function(body) {
nodes <- xml2::xml_find_all(body, make_link_patterns(), ns = get_ns())
purrr::map(nodes, get_link_fragment_nodes)
}
#' @rdname fix_links
#'
#' @details
#' `fix_broken_links()` uses the output of `find_broken_links()` to replace the
#' node fragments with links.
fix_broken_links <- function(fragments) {
purrr::walk(fragments, fix_broken_link_too)
}
#' @details
#' `make_link_patterns()` a generator to create an XPath query that will search
#' for liquid markup following a closing bracket.
#'
#' @param ns the namespace prefix to use for the pattern
#' @rdname fix_links
#' @examples
#' asNamespace("pegboard")$make_link_patterns()
make_link_patterns <- function(ns = "md:") {
predicate <- gsb("(<ctext('({{')> and <ctext('}}')>)")
asis_nodes <- "text[@asis][text()=']']"
destination <- glue::glue(
".//{ns}{asis_nodes}/following-sibling::{ns}text[{predicate}]"
)
return(destination)
}
ctext <- function(x) glue::glue("contains(text(), '{x}')")
gsb <- function(x) glue::glue(x, .open = "<", .close = ">")
#' @details
#' `get_link_fragment_nodes()`: Get the source for the link node fragments
#'
#' @param node a node determined to be a text representation of a link
#' destination
#' @return
#' - `get_link_fragments()`: the preceding three or four nodes, which will be
#' the text of the link or the alt text of the image.
#' @rdname fix_links
get_link_fragment_nodes <- function(node) {
the_parent <- xml2::xml_parent(node)
the_children <- xml2::xml_children(the_parent)
# find the node in question by testing for identity since they represent the
# same object, they will be identical.
id <- which(purrr::map_lgl(the_children, identical, node))
# test for image with endsWith because they may have an inline image.
openid <- get_start_asis(the_children, id)
is_image <- id > 4
is_image <- is_image && endsWith(xml2::xml_text(the_children[[openid - 1L]]), "!")
offset <- openid - is_image
the_children[seq(offset, id)]
}
# find the asis node that is opener of our fragment
get_start_asis <- function(chillns, id) {
XPath <- "boolean(./self::*[@asis][text()='['])"
openers <- which(xml2::xml_find_lgl(chillns, XPath))
# get the differences between the id (our link pattern) and the opener
idx <- id - openers
# the smallest non-negative integer of differences is the one we want
non_neg <- idx > 0L
idx <- which.min(idx[non_neg])
openers[idx]
}
#' @rdname fix_links
#'
#' @details
#' `fix_broken_link()` takes a set of nodes that comprises a single link and
#' recomposes them into a link or image node.
fix_broken_link <- function(nodes) {
# get_link_fragment_nodes() returns 4 nodes for a link and 5 nodes for an
# image to account for the extra "!" in markdown.
type <- if (is.na(xml2::xml_attr(nodes[[1]], "asis"))) "image" else "link"
text <- paste(xml2::xml_text(nodes), collapse = "")
# create the nodes that we use to replace the link fragment nodes
to_replace <- text_to_links(text, ns = xml2::xml_ns(nodes[[1]]), type = type)
# insert the replacements before the link fragment nodes
purrr::walk(to_replace,
~xml2::xml_add_sibling(nodes[[1]], .x, .where = "before")
)
# remove the link fragment nodes
xml2::xml_remove(nodes)
}
fix_broken_link_too <- function(nodes) {
# find the boundaries and node type
is_image <- is.na(xml2::xml_attr(nodes[[1]], "asis"))
start <- if (is_image) 3L else 2L
new_node_type <- if (is_image) "image" else "link"
# extract destination node
final_node <- nodes[[length(nodes)]]
end <- resolve_end_node(final_node)
# extract link children nodes
txt <- nodes[seq(start, length(nodes) - 2L)]
new_txt <- make_text_nodes(as.character(txt))
# create new link node before the nodes
new_node <- xml2::xml_add_sibling(nodes[[1]], new_node_type,
destination = end$destination, .where = "before")
# re-add text into the node
purrr::walk(new_txt, function(node) {
xml2::xml_add_child(new_node, node)
})
# if we have an extra bit, do not remove the end
if (length(end$extra)) {
xml2::xml_set_text(final_node, end$extra)
nodes <- nodes[-length(nodes)]
}
xml2::xml_remove(nodes)
add_sibling_sourcepos(new_node)
new_node
}
# When we create a new node for a link,
add_sibling_sourcepos <- function(node) {
if (!is.na(xml2::xml_attr(node, "sourcepos"))) {
# bail early if we have a source position
return(invisible(node))
}
# find the sibling node right before this node
preceding <- xml2::xml_find_first(node, ".//preceding-sibling::*[1]")
sourcepos <- xml2::xml_attr(preceding, "sourcepos")
if (is.na(sourcepos)) {
# add the parent source position if we can't find a sibling
sourcepos <- xml2::xml_attr(xml2::xml_parent(node), "sourcepos")
}
xml2::xml_set_attr(node, "sourcepos", sourcepos)
}
resolve_end_node <- function(node) {
txt <- xml2::xml_text(node)
if (startsWith(txt, "(") && endsWith(txt, ")")) {
destination <- substring(txt, 2L , nchar(txt) - 1L)
extra <- character(0)
} else {
rgx <- "^[(]([^)]+?)[)](.*$)"
destination <- sub(rgx, "\\1", txt)
extra <- sub(rgx, "\\2", txt)
}
if (length(extra) && destination == extra) {
XPath <- "./following-sibling::*[@sourcepos][1]"
next_node <- xml2::xml_find_first(node, XPath)
next_text <- xml2::xml_text(next_node)
rgx <- "^([^)].+?)[)](.*$)"
donor <- sub(rgx, "\\1", next_text)
trimmed <- sub(rgx, "\\2", next_text)
if (donor != trimmed) {
xml2::xml_set_text(next_node, trimmed)
new <- paste(destination, donor)
destination <- substring(new, 2L , nchar(new))
extra <- character(0)
}
}
list(destination = destination, extra = extra)
}
#' @details
#' `links_within_text_regex()`: finding different types of links within markdown
#' text can be challenging because it involves characters used in regex for
#' grouping and character classes. In general, I want to do two things with text
#' that I get back from a document:
#'
#' 1. split the links out from the text
#' 2. identify which parts of the resulting vector are links.
#'
#' This way, I can convert the links to links and the text to text.
#'
#' @rdname fix_links
#' @examples
#'
#' # links_within_text_regex() -------------------------------------------------
#' helpers <- pegboard:::links_within_text_regex()
#' helpers
#' txt <- "text ![image text](a.png) with [a link](b.org) and text"
#' res <- strsplit(txt, helpers["to_split"], perl = TRUE)[[1]]
#' data.frame(res)
#' grepl(helpers["find_links"], res, perl = TRUE)
links_within_text_regex <- function() {
b1 <- "\\["
b2 <- "\\]"
p1 <- "\\("
p2 <- "\\)"
woo <- "\\!"
# Does not match: ][ or )[ ![
first_b1 <- glue::glue("(?<!({b2}|{p2}|{woo})){b1}")
# Does not match: ]] or ][ or ](
last_b2 <- glue::glue("{b2}(?!({b2}|{b1}|{p1}))")
rgx <- glue::glue("{first_b1}|{last_b2}|{p2}")
# Does not match [][ or []( (image links)
lnk <- glue::glue("(?<!{b1}){b2}({b1}|{p1})")
return(c(to_split = rgx, find_links = lnk))
}
#' @details
#' `text_to_links()`: Splits links away from text and returns a nodeset to insert
#'
#' @param txt text derived from `xml2::xml_text()`
#' @param ns a namespace object
#' @param type either "image" or "link".
#' @param sourcepos defaults to NULL. If this is not NULL, it's the sourcepos
#' attribute of the text node(s) and will be applied to the new nodes.
#' @return `text_to_links()`: if ns is NULL: a character vector of XML text
#' nodes, otherwise, new XML text nodes.
#' @rdname fix_links
#' @examples
#'
#' # text_to_links() -----------------------------------------------------------
#' txt <- "Some text [and _a link_]({{ page.root }}/link.to#thing),
#' some other text."
#' pegboard:::text_to_links(txt, type = "link")
#' md <- c(md = "http://commonmark.org/xml/1.0")
#' class(md) <- "xml_namespace"
#' pegboard:::text_to_links(txt, md, "link")
text_to_links <- function(txt, ns = NULL, type, sourcepos = NULL) {
regex_helpers <- links_within_text_regex()
rgx <- regex_helpers["to_split"]
lnk <- regex_helpers["find_links"]
texts <- strsplit(txt, rgx, perl = TRUE)[[1]]
texts <- texts[texts != ""]
# escape ampersands that are not valid code points, though this will still
# allow invalid code points, but it's better than nothing
texts <- gsub("[&](?![#]?[A-Za-z0-9]+?[;])", "&", texts, perl = TRUE)
are_links <- grepl(lnk, texts, perl = TRUE)
texts[are_links] <- purrr::map_chr(texts[are_links], make_link, pattern = lnk, type = type)
texts[!are_links] <- glue::glue("<text>{texts[!are_links]}</text>")
if (!is.null(ns)) {
# TODO: fix this process for creating new nodes. Use the process from
# {tinkr} to do this.
texts <- xml_new_paragraph(glue::glue_collapse(texts), ns, tag = FALSE)
texts <- xml2::xml_children(texts)
xml2::xml_set_attr(texts, "sourcepos", sourcepos)
}
texts
}
#' @details
#' `make_link()`: makes a link depending on the link type
#'
#' @param pattern a regular expression that is used for splitting the link
#' from the surrounding text.
#' @rdname fix_links
make_link <- function(txt, pattern, type = "rel_link") {
# relative tags are processed
txt <- if (type == "image") sub("^\\!\\[", '', txt) else txt
# split the link text and text into a two-element vector
text_link <- rev(strsplit(txt, pattern, perl = TRUE)[[1]])
link <- glue::glue_collapse(text_link, sep = "'><text>")
glue::glue("<{type} destination='{link}</text></{type}>")
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.