R/pandoc.R

Defines functions quoted pandoc_citeproc pandoc detect_generic_lang with_pandoc_safe_environment get_pandoc_version find_pandoc pandoc_html_highlight_args unix_mathjax_path pandoc_mathjax_local_path pandoc_mathjax_args validate_self_contained pandoc_self_contained_html pandoc_template pandoc_path_arg pandoc_toc_args pandoc_latex_engine_args pandoc_highlight_args pandoc_include_args pandoc_variable_arg pandoc_version pandoc_available pandoc_convert

Documented in pandoc_available pandoc_convert pandoc_highlight_args pandoc_include_args pandoc_latex_engine_args pandoc_path_arg pandoc_self_contained_html pandoc_template pandoc_toc_args pandoc_variable_arg pandoc_version

#' Convert a document with pandoc
#'
#' Convert documents to and from various formats using the pandoc utility.
#'
#' @param input Character vector containing paths to input files
#'   (files must be UTF-8 encoded)
#' @param to Format to convert to (if not specified, you must specify
#'   \code{output})
#' @param from Format to convert from (if not specified then the format is
#'   determined based on the file extension of \code{input}).
#' @param output Output file (if not specified then determined based on format
#'   being converted to)
#' @param citeproc \code{TRUE} to run the pandoc-citeproc filter (for processing
#'   citations) as part of the conversion
#' @param options Character vector of command line options to pass to pandoc.
#' @param verbose \code{TRUE} to show the pandoc command line which was executed
#' @param wd Working directory in which code will be executed. If not
#'   supplied, defaults to the common base directory of \code{input}
#'
#' @details Supported input and output formats are described in the
#'   \href{http://johnmacfarlane.net/pandoc/README.html}{pandoc user guide}.
#'
#'   The system path as well as the version of pandoc shipped with RStudio (if
#'   running under RStudio) are scanned for pandoc and the highest version
#'   available is used.
#'
#' @examples
#' \dontrun{
#' library(rmarkdown)
#'
#' # convert markdown to various formats
#' pandoc_convert("input.md", to = "html")
#' pandoc_convert("input.md", to = "pdf")
#'
#' # process citations
#' pandoc_convert("input.md", to = "html", citeproc = TRUE)
#'
#' # add some pandoc options
#' pandoc_convert("input.md", to="pdf, options = c("--listings"))
#' }
#'
#' @export
pandoc_convert <- function(input,
                           to = NULL,
                           from = NULL,
                           output = NULL,
                           citeproc = FALSE,
                           options = NULL,
                           verbose = FALSE,
                           wd = NULL) {

  # ensure we've scanned for pandoc
  find_pandoc()

  # execute in specified working directory
  if (is.null(wd)) {
    wd <- base_dir(input)
  }
  oldwd <- setwd(wd)
  on.exit(setwd(oldwd), add = TRUE)


  # input file and formats
  args <- c(input)
  if (!is.null(to))
    args <- c(args, "--to", to)
  if (!is.null(from))
    args <- c(args, "--from", from)

  #  output file
  if (!is.null(output))
    args <- c(args, "--output", output)

  # citeproc filter if requested
  if (citeproc)
    args <- c(args, "--filter", pandoc_citeproc())

  # set pandoc stack size
  stack_size <- getOption("pandoc.stack.size", default = "512m")
  args <- c(c("+RTS", paste0("-K", stack_size), "-RTS"), args)
  
  # additional command line options
  args <- c(args, options)

  # build the conversion command
  command <- paste(quoted(pandoc()), paste(quoted(args), collapse = " "))

  # show it in verbose mode
  if (verbose)
    cat(command, "\n")

  # run the conversion
  with_pandoc_safe_environment({
    result <- system(command)
  })
  if (result != 0)
    stop("pandoc document conversion failed with error ", result, call. = FALSE)

  invisible(NULL)
}

#' Check pandoc availabilty and version
#' 
#' Determine whether pandoc is currently available on the system (optionally 
#' checking for a specific version or greater). Determine the specific version 
#' of pandoc available.
#' 
#' @param version Required version of pandoc
#'   
#' @return \code{pandoc_available} returns a logical indicating whether the 
#'   required version of pandoc is available. \code{pandoc_version} returns a 
#'   \code{\link[base]{numeric_version}} with the version of pandoc found.
#'   
#' @details
#' 
#' The system path as well as the version of pandoc shipped with RStudio (if 
#' running under RStudio) are scanned for pandoc and the highest version 
#' available is used.
#'
#' @examples
#' \dontrun{
#' library(rmarkdown)
#'
#' if (pandoc_available())
#'   cat("pandoc", as.character(pandoc_version()), "is available!\n")
#'
#' if (pandoc_available("1.12.3"))
#'   cat("requried version of pandoc is available!\n")
#' }
#' @export
pandoc_available <- function(version = NULL) {

  # ensure we've scanned for pandoc
  find_pandoc()

  # check availability
  if (!is.null(.pandoc$dir))
    if (!is.null(version))
      .pandoc$version >= version
  else
    TRUE
  else
    FALSE
}


#' @rdname pandoc_available
#' @export
pandoc_version <- function() {
  find_pandoc()
  .pandoc$version
}

#' Functions for generating pandoc command line arguments
#'
#' Functions that assist in creating various types of pandoc command line
#' arguments (e.g. for templates, table of contents, highlighting, and content
#' includes)
#'
#' @inheritParams includes
#'
#' @param name Name of template variable to set.
#' @param value Value of template variable.
#' @param toc \code{TRUE} to include a table of contents in the output.
#' @param toc_depth Depth of headers to include in table of contents.
#' @param highlight The name of a pandoc syntax highlighting theme.
#' @param latex_engine LaTeX engine for producing PDF output. Options are
#'   "pdflatex", "lualatex", and "xelatex".
#' @param default The highlighting theme to use if "default"
#'   is specified.
#'
#' @return A character vector with pandoc command line arguments
#'
#' @details Non-absolute paths for resources referenced from the
#'   \code{in_header}, \code{before_body}, and \code{after_body}
#'   parameters are resolved relative to the directory of the input document.
#'
#' @examples
#' \dontrun{
#'
#' library(rmarkdown)
#'
#' pandoc_include_args(before_body = "header.htm")
#' pandoc_include_args(before_body = "header.tex")
#'
#' pancoc_highlight_args("kate")
#' 
#' pandoc_latex_engine_args("pdflatex")
#'
#' pandoc_toc_args(toc = TRUE, toc_depth = 2)
#'
#' }
#' @name pandoc_args
NULL

#' @rdname pandoc_args
#' @export
pandoc_variable_arg <- function(name, value) {
  c("--variable", paste(name, "=", value, sep = ""))
}


#' @rdname pandoc_args
#' @export
pandoc_include_args <- function(in_header = NULL,
                                before_body = NULL,
                                after_body = NULL) {
  args <- c()

  for (file in in_header)
    args <- c(args, "--include-in-header", pandoc_path_arg(file))

  for (file in before_body)
    args <- c(args, "--include-before-body", pandoc_path_arg(file))

  for (file in after_body)
    args <- c(args, "--include-after-body", pandoc_path_arg(file))

  args
}

#' @rdname pandoc_args
#' @export
pandoc_highlight_args <- function(highlight, default = "tango") {

  args <- c()

  if (is.null(highlight))
    args <- c(args, "--no-highlight")
  else {
    if (identical(highlight, "default"))
      highlight <- default
    args <- c(args, "--highlight-style", highlight)
  }

  args
}

#' @rdname pandoc_args
#' @export
pandoc_latex_engine_args <- function(latex_engine) {
  
  # use a full path to the latex engine on OSX since the stripping
  # of the PATH environment variable by OSX 10.10 Yosemite prevents
  # pandoc from finding the engine in e.g. /usr/texbin
  if (Sys.info()["sysname"] == "Darwin") {
    # resolve path if it's not already an absolute path
    if (!grepl("/", latex_engine, fixed = TRUE))
      program_path <- find_program(latex_engine)
      if (nzchar(program_path))
        latex_engine <- program_path  
  }
  
  c("--latex-engine", latex_engine)
}

#' @rdname pandoc_args
#' @export
pandoc_toc_args <- function(toc, toc_depth = 3) {

  args <- c()

  if (toc) {
    args <- c(args, "--table-of-contents")
    args <- c(args, "--toc-depth", toc_depth)
  }

  args
}


#' Transform path for passing to pandoc
#' 
#' Transform a path for passing to pandoc on the command line. Calls 
#' \code{\link[base:path.expand]{path.expand}} on all platforms. On Windows, 
#' transform it to a short path name if it contains spaces, and then convert 
#' forward slashes to back slashes (as required by pandoc for some path
#' references)
#' 
#' @param path Path to transform
#'   
#' @return Transformed path that can be passed to pandoc on the command line
#'   
#' @export
pandoc_path_arg <- function(path) {

  path <- path.expand(path)

  # remove redundant ./ prefix if present
  path <- sub('^[.]/', '', path)

  if (is_windows()) {
    i <- grep(' ', path)
    if (length(i))
      path[i] <- utils::shortPathName(path[i])
    path <- gsub('/', '\\\\', path)
  }

  path
}


#' Render a pandoc template.
#'
#' Use the pandoc templating engine to render a text file. Substitutions are
#' done using the \code{metadata} list passed to the function.
#'
#' @param metadata A named list containing metadata to pass to template.
#' @param template Path to a pandoc template.
#' @param output Path to save output.
#' @param verbose \code{TRUE} to show the pandoc command line which was
#'   executed.
#' @return (Invisibly) The path of the generated file.
#'
#' @export
pandoc_template <- function(metadata, template, output, verbose = FALSE) {

  tmp <- tempfile(fileext = ".md")
  on.exit(unlink(tmp))

  cat("---\n", file = tmp)
  cat(yaml::as.yaml(metadata), file = tmp, append = TRUE)
  cat("---\n", file = tmp, append = TRUE)
  cat("\n", file = tmp, append = TRUE)

  pandoc_convert(tmp, "markdown", output = output,
                 options = paste0("--template=", template),
                 verbose = verbose)

  invisible(output)
}

#' Create a self-contained HTML document using pandoc.
#'
#' Create a self-contained HTML document by base64 encoding images,
#' scripts, and stylesheets referended by the input document.
#'
#' @param input Input html file to create self-contained version of.
#' @param output Path to save output.
#'
#' @return (Invisibly) The path of the generated file.
#'
#' @export
pandoc_self_contained_html <- function(input, output) {

  # make input file path absolute
  input <- normalizePath(input)

  # ensure output file exists and make it's path absolute
  if (!file.exists(output))
    file.create(output)
  output <- normalizePath(output)

  # create a simple body-only template
  template <- tempfile(fileext = ".html")
  writeLines("$body$", template)

  # call pandoc with from format of "markdown_strict" to
  # get as close as possible to html -> html conversion
  rmarkdown::pandoc_convert(
    input = input,
    from = "markdown_strict",
    output = output,
    options = c(
      "--self-contained",
      "--template", template
    )
  )

  invisible(output)
}



validate_self_contained <- function(mathjax) {
  if (identical(mathjax, "local"))
    stop("Local MathJax isn't compatible with self_contained\n",
         "(you should set self_contained to FALSE)", call. = FALSE)
}

pandoc_mathjax_args <- function(mathjax,
                                template,
                                self_contained,
                                files_dir,
                                output_dir) {
  args <- c()

  if (!is.null(mathjax)) {

    if (identical(mathjax, "default")) {
      if (identical(template, "default"))
        mathjax <- default_mathjax()
      else
        mathjax <- NULL
    }
    else if (identical(mathjax, "local")) {
      mathjax_path <- pandoc_mathjax_local_path()
      mathjax_path <- render_supporting_files(mathjax_path,
                                              files_dir,
                                              "mathjax-local")
      mathjax <- paste(normalized_relative_to(output_dir, mathjax_path), "/",
                       mathjax_config(), sep = "")
    }

    if (identical(template, "default")) {
      args <- c(args, "--mathjax")
      args <- c(args, "--variable", paste("mathjax-url:", mathjax, sep=""))
    } else if (!self_contained) {
      args <- c(args, "--mathjax")
      if (!is.null(mathjax))
        args <- c(args, mathjax)
    } else {
      warning("MathJax doesn't work with self_contained when not ",
              "using the rmarkdown \"default\" template.", call. = FALSE)
    }

  }

  args
}


pandoc_mathjax_local_path <- function() {
  
  local_path <- Sys.getenv("RMARKDOWN_MATHJAX_PATH", unset = NA)
  if (is.na(local_path)) {
    local_path <- unix_mathjax_path()
    if (is.na(local_path)) {
      stop("For mathjax = \"local\", please set the RMARKDOWN_MATHJAX_PATH ",
           "environment variable to the location of MathJax. ",
           "On Linux systems you can also install MathJax using your ",
           "system package manager.")
    } else {
      local_path
    }
  } else {
    local_path
  }
}


unix_mathjax_path <- function() {
  if (identical(.Platform$OS.type, "unix")) {
    mathjax_path <- "/usr/share/javascript/mathjax"
    if (file.exists(file.path(mathjax_path, "MathJax.js")))
      mathjax_path
    else
      NA
  } else {
    NA
  }
}



pandoc_html_highlight_args <- function(highlight,
                                       template,
                                       self_contained,
                                       files_dir,
                                       output_dir) {

  args <- c()

  if (is.null(highlight)) {
    args <- c(args, "--no-highlight")
  }
  else if (!identical(template, "default")) {
    if (identical(highlight, "default"))
      highlight <- "pygments"
    args <- c(args, "--highlight-style", highlight)
  }
  else {
    highlight <- match.arg(highlight, html_highlighters())
    if (highlight %in% c("default", "textmate")) {
      highlight_path <- rmarkdown_system_file("rmd/h/highlight")
      if (self_contained)
        highlight_path <- pandoc_path_arg(highlight_path)
      else
      {
        highlight_path <- normalized_relative_to(output_dir,
          render_supporting_files(highlight_path, files_dir))
      }
      args <- c(args, "--no-highlight")
      args <- c(args,
                "--variable", paste("highlightjs=", highlight_path, sep=""))
      if (identical(highlight, "textmate")) {
        args <- c(args,
                  "--variable",
                  paste("highlightjs-theme=", highlight, sep=""))
      }
    }
    else {
      args <- c(args, "--highlight-style", highlight)
    }
  }

  args
}


# Scan for a copy of pandoc and set the internal cache if it's found.
find_pandoc <- function() {

  if (is.null(.pandoc$dir)) {

    # define potential sources
    sys_pandoc <- find_program("pandoc")
    sources <- c(Sys.getenv("RSTUDIO_PANDOC"),
                 ifelse(nzchar(sys_pandoc), dirname(sys_pandoc), ""))
    if (!is_windows())
      sources <- c(sources, path.expand("~/opt/pandoc"))

    # determine the versions of the sources
    versions <- lapply(sources, function(src) {
      if (dir_exists(src))
        get_pandoc_version(src)
      else
        numeric_version("0")
    })

    # find the maximum version
    found_src <- NULL
    found_ver <- numeric_version("0")
    for (i in 1:length(sources)) {
      ver <- versions[[i]]
      if (ver > found_ver) {
        found_ver <- ver
        found_src <- sources[[i]]
      }
    }

    # did we find a version?
    if (!is.null(found_src)) {
      .pandoc$dir <- found_src
      .pandoc$version <- found_ver
    }
  }
}

# Get an S3 numeric_version for the pandoc utility at the specified path
get_pandoc_version <- function(pandoc_dir) {
  pandoc_path <- file.path(pandoc_dir, "pandoc")
  with_pandoc_safe_environment({
    version_info <- system(paste(shQuote(pandoc_path), "--version"),
                           intern = TRUE)
  })
  version <- strsplit(version_info, "\n")[[1]][1]
  version <- strsplit(version, " ")[[1]][2]
  numeric_version(version)
}

# wrap a system call to pandoc so that LC_ALL is not set
# see: https://github.com/rstudio/rmarkdown/issues/31
# see: https://ghc.haskell.org/trac/ghc/ticket/7344
with_pandoc_safe_environment <- function(code) {
  lc_all <- Sys.getenv("LC_ALL", unset = NA)
  if (!is.na(lc_all)) {
    Sys.unsetenv("LC_ALL")
    on.exit(Sys.setenv(LC_ALL = lc_all), add = TRUE)
  }
  lc_ctype <- Sys.getenv("LC_CTYPE", unset = NA)
  if (!is.na(lc_ctype)) {
    Sys.unsetenv("LC_CTYPE")
    on.exit(Sys.setenv(LC_CTYPE = lc_ctype), add = TRUE)
  }
  if (Sys.info()['sysname'] == "Linux" &&
        is.na(Sys.getenv("HOME", unset = NA))) {
    stop("The 'HOME' environment variable must be set before running Pandoc.")
  }
  if (Sys.info()['sysname'] == "Linux" &&
        is.na(Sys.getenv("LANG", unset = NA))) {
    # fill in a the LANG environment variable if it doesn't exist
    Sys.setenv(LANG=detect_generic_lang())
    on.exit(Sys.unsetenv("LANG"), add = TRUE)
  }
  if (Sys.info()['sysname'] == "Linux" &&
    identical(Sys.getenv("LANG"), "en_US")) {
    Sys.setenv(LANG="en_US.UTF-8")
    on.exit(Sys.setenv(LANG="en_US"), add = TRUE)
  }
  force(code)
}

# if there is no LANG environment variable set pandoc is going to hang so
# we need to specify a "generic" lang setting. With glibc >= 2.13 you can
# specify C.UTF-8 so we prefer that. If we can't find that then we fall back
# to en_US.UTF-8.
detect_generic_lang <- function() {

  locale_util <- Sys.which("locale")

  if (nzchar(locale_util)) {
    locales <- system(paste(locale_util, "-a"), intern = TRUE)
    locales <- suppressWarnings(
        strsplit(locales, split = "\n", fixed = TRUE)
    )
    if ("C.UTF-8" %in% locales)
      return ("C.UTF-8")
  }

  # default to en_US.UTF-8
  "en_US.UTF-8"
}



# get the path to the pandoc binary
pandoc <- function() {
  find_pandoc()
  file.path(.pandoc$dir, "pandoc")
}

# get the path to the pandoc-citeproc binary
pandoc_citeproc <- function() {
  find_pandoc()
  citeproc_path = file.path(.pandoc$dir, "pandoc-citeproc")
  if (file.exists(citeproc_path))
    citeproc_path
  else
    "pandoc-citeproc"
}

# quote args if they need it
quoted <- function(args) {
  # some characters are legal in filenames but without quoting are likely to be
  # interpreted by the shell (e.g. redirection, wildcard expansion, etc.) --
  # wrap arguments containing these characters in quotes. 
  shell_chars <- grepl(.shell_chars_regex, args)
  args[shell_chars] <- shQuote(args[shell_chars])
  args
}

# Environment used to cache the current pandoc directory and version
.pandoc <- new.env()
.pandoc$dir <- NULL
.pandoc$version <- NULL
UNFAOstatistics/faodoc documentation built on June 3, 2017, 6:42 a.m.