## File: scripts/generate_known_tags.R

## Source with `source("./scripts/generate_known_tags.R", echo = TRUE, prompt.echo = ">")`

## This script scrapes two Mozilla (MDN) web pages for the names of all
## HTML and SVG tag elements.


library(magrittr)


## Scrape a sorted, de-duplicated vector of tag names from a web page.
##
## Arguments:
##   url  URL of the page to download.
##   css  CSS selector matching the nodes whose text holds a tag name.
##
## Returns the tag names (character vector) with a leading `<` and a trailing
## `>` removed, sorted and unique. The vector is also printed so that it shows
## up in the console when the script is sourced.
##
## Raises an error when the HTTP request fails or when the selector matches
## nothing, so that a broken download or a stale selector cannot silently
## produce an empty tag list.
get_tags <- function(url, css) {
  tags <- url %>%
    httr::GET() %>%
    httr::stop_for_status() %>%
    httr::content() %>%
    rvest::html_nodes(css) %>%
    rvest::html_text() %>%
    sub("^<", "", .) %>%
    sub(">$", "", .) %>%
    sort() %>%
    unique()

  if (length(tags) == 0) {
    stop(
      "No tags found at `", url, "` using CSS selector `", css, "`",
      call. = FALSE
    )
  }

  # `print()` returns its argument, so the tags are both displayed and returned
  print(tags)
}

## W3 Schools
## Mozilla seemed to have a more up to date set of what is possible / not obsolete
# w3html_tags <- get_tags("https://www.w3schools.com/tags/default.asp", "#htmltags tr td:first-child a:not(.notsupported)")
## Had extra tags not seen in other places (e.g. `altGlyph`)
# w3svg_tags <- get_tags("https://www.w3schools.com/graphics/svg_reference.asp", "#main td:first-child")

## W3 Standard
# # The original spec websites made it very hard to determine what was obsolete / shouldn't be used and what was to be used
# html_tags <- get_tags("https://www.w3.org/TR/2018/WD-html53-20181018/single-page.html", "dfn[data-dfn-type='element']")
# svg_tags <- get_tags("https://svgwg.org/svg2-draft/single-page.html", "dfn[data-dfn-type='element']")


## Mozilla (MDN)
# Skip the last table on the page: it is the section of obsolete tags
html_tags <- get_tags(
  url = "https://developer.mozilla.org/en-US/docs/Web/HTML/Element",
  css = "article table:not(:last-child) td:first-child code"
)
# html_tags_obsolete <- get_tags("https://developer.mozilla.org/en-US/docs/Web/HTML/Element", "#content table:last-child td:first-child a")

# Skip tags that do not have a documentation article.
# Pull from the index only, as elements missing from the index are considered obsolete. (ex: altGlyph or font-face)
svg_tags <- get_tags(
  url = "https://developer.mozilla.org/en-US/docs/Web/SVG/Element",
  css = "article .index a:not([rel='nofollow']) code"
)


# Tags that exist in both SVG2 and HTML5
# (both vectors are already sorted and de-duplicated by `get_tags()`)
intersect(svg_tags, html_tags)

# Run in a fresh R session via callr::r so that a devtools-loaded
# htmltools::tags namespace cannot interfere with the result
cran_tags <- callr::r(
  func = function() {
    # Make sure the CRAN release of htmltools is the one being inspected
    remotes::install_cran("htmltools")
    names(htmltools::tags)
  },
  show = TRUE
)

new_tags <- sort(unique(c(svg_tags, html_tags)))

# Tags in the CRAN release that are not part of the scraped HTML5 / SVG2 tags
setdiff(x = cran_tags, y = new_tags)
#> "command"     "eventsource" "keygen"


# HTML5 tags that the CRAN release does not have yet
setdiff(x = html_tags, y = cran_tags)
#> "rb"   "rtc"  "slot"
# SVG2 tags that the CRAN release does not have yet
setdiff(x = svg_tags, y = cran_tags)
### ...basically all svg tags

# Merge the scraped tags with the CRAN tags so that old tags are not lost
save_tags <- sort(union(new_tags, cran_tags))

# Build one line of R source per tag: the quoted tag (comma-terminated except
# for the last one), padded to a common width, followed by a `#` comment that
# records whether the tag is an html and/or svg tag.
# `local()` keeps the intermediate vectors out of the calling environment.
save_line <- local({
  is_last <- seq_along(save_tags) == length(save_tags)
  quoted <- paste0("  \"", save_tags, "\"", ifelse(is_last, "", ","))
  padded <- format(quoted, justify = "left")
  html_flag <- ifelse(save_tags %in% html_tags, " html", "     ")
  svg_flag <- ifelse(save_tags %in% svg_tags, " svg", "")
  # Drop the padding left behind when a tag is not an svg tag
  sub("\\s+$", "", paste0(padded, "#", html_flag, svg_flag))
})

# Write the generated `known_tags` definition into the package's R/ directory
cat(
  "## Generated by `./scripts/generate_known_tags.R`: do not edit by hand",
  "## Please call `source(\"./scripts/generate_known_tags.R\", echo = TRUE, prompt.echo = \">\")` to update",
  "known_tags <- c(",
  paste(save_line, collapse = "\n"),
  ")",
  file = rprojroot::find_package_root_file(file.path("R", "known_tags.R")),
  sep = "\n"
)
message("Saved to `./R/known_tags.R`")
## rstudio/htmltools documentation built on March 29, 2024, 2:22 p.m.