R/clean.R

Defines functions fix_img_tags remove_highlight_tags normalize_chars process_bare_urls remove_extra_empty_lines process_file

# these functions are mainly for cleaning up Markdown files generated by Exitwp
# with the XML file exported from WordPress

# thin wrapper around xfun::process_file(): the text of `f` (read as UTF-8
# unless `x` is supplied) is run through `FUN` and the result is written back
process_file = function(f, FUN, x = read_utf8(f)) xfun::process_file(f, FUN, x)

# collapse runs of empty lines: trailing whitespace is stripped from every
# line first, then any run of three or more \n is squeezed down to two
remove_extra_empty_lines = function(...) {
  squeeze = function(x) {
    txt = paste(gsub('\\s+$', '', x), collapse = '\n')
    txt = gsub('\n{3,}', '\n\n', txt)
    trim_ws(txt)
  }
  xfun::process_file(..., fun = squeeze)
}

# turn Markdown links whose text equals their target, i.e. [url](url), into
# the autolink form <url>; an extra trailing slash on the target is tolerated
process_bare_urls = function(...) {
  to_autolink = function(x) gsub('\\[([^]]+)]\\(\\1/?\\)', '<\\1>', x)
  xfun::process_file(..., fun = to_autolink)
}

# replace a few typographic characters with plain ASCII equivalents; the
# arguments in ... are passed on to xfun::process_file()
normalize_chars = function(...) xfun::process_file(..., fun = function(x) {
  # curly single (U+2018, U+2019) and double (U+201C, U+201D) quotes to
  # straight quotes
  x = gsub(paste0('[', intToUtf8(8216:8217), ']'), "'", x)
  x = gsub(paste0('[', intToUtf8(8220:8221), ']'), '"', x)
  # the remaining patterns are single literal characters, so they are matched
  # as fixed strings rather than as regular expressions
  # horizontal ellipsis (U+2026) to three dots
  x = gsub(intToUtf8(8230), '...', x, fixed = TRUE)
  # non-breaking space (U+00A0) to a normal space
  x = gsub(intToUtf8(160), ' ', x, fixed = TRUE)
  x
})

# strip the HTML tags that Pandoc's syntax highlighting leaves inside indented
# code blocks
remove_highlight_tags = function(...) {
  strip_tags = function(lines) {
    # opening <code> tag (optionally with a class attribute) that follows the
    # leading whitespace; the indentation and the rest of the line are kept
    lines = gsub('^(\\s+)<code( class="[^"]*")?>(.*)', '\\1\\3', lines)
    # closing </code> tag at the end of a line
    lines = gsub('</code>\\s*$', '', lines)
    # every opening or closing <span> tag
    gsub('</?span([^>])*>', '', lines)
  }
  xfun::process_file(..., fun = function(x) {
    # only lines indented by at least 4 spaces are touched
    code_lines = grep('^( {4,}.*)', x)
    x[code_lines] = strip_tags(x[code_lines])
    x
  })
}

# <img></img> to <img />
fix_img_tags = function(...) {
  # drop the separate closing tag and make the opening tag self-closing
  self_close = function(x) gsub('></img>', ' />', x)
  xfun::process_file(..., fun = self_close)
}

Try the blogdown package in your browser

Any scripts or data that you put into this service are public.

blogdown documentation built on July 9, 2023, 5:28 p.m.