R/read_md.R

Defines functions join_eml_attributes parse_eml_attributes add_standard_yaml safe_temp_directory check_valid_suffix place_eml_first read_md

Documented in read_md

#' Read markdown-formatted metadata
#' 
#' @description
#' `read_md()` imports metadata from a markdown file into the workspace as a 
#' `tibble`.
#' @param file Filename to read from. Must be either `.Rmd` or `.qmd` file.
#' @details
#' [read_md()] is unusual in that it calls [rmarkdown::render()] or 
#' [quarto::quarto_render()] internally to ensure code blocks and snippets 
#' are parsed correctly. This ensures dynamic content is rendered correctly in 
#' the resulting `EML` document, but makes this function considerably slower 
#' than a standard import function. Conceptually, therefore, it is closer to a 
#' renderer with output type `tibble` than a traditional `read_` function.
#' 
#' This approach has one unusual consequence; it prevents 'round-tripping' of 
#' embedded code. That is, dynamic content in code snippets within the 
#' metadata statement is rendered to plain text in `EML.` If that `EML` document 
#' is later re-imported to `Rmd` using [read_eml()] and [write_md()], formerly 
#' dynamic content will be shown as plain text. 
#' 
#' Internally, [read_md()] calls [lightparser::split_to_tbl()].
#' @returns `read_md()` returns an object of class `tbl_df`, `tbl` and 
#' `data.frame` (i.e. a `tibble`).
#' @examples
#' source_file <- system.file("extdata", 
#'                            "bionet_metadata.Rmd",
#'                            package = "delma")
#' read_md(source_file)
#' @export
read_md <- function(file){
  
  # check file is specified
  if(missing(file)){
    cli::cli_abort("`file` is missing, with no default.")
  }
  # check file exists
  if(!file.exists(file)){
    cli::cli_abort("Specified file \"{file}\" does not exist.")
  }
  # check file is correctly specified
  check_is_single_character(file)
  format <- check_valid_suffix(file) # check and return a valid format
  temp_dir <- safe_temp_directory() # set a working directory
  
  # create a temporary file and convert output format to markdown
  temp_source <- glue::glue("{temp_dir}/temp_source.Rmd")
  file.copy(from = file, 
            to = temp_source,
            overwrite = TRUE) |>
    invisible()

  # create a rendered version of this doc, as needed for the supplied `format`
  temp_md <- glue::glue("{temp_dir}/temp_md.md")
  
  switch(format, 
         "Quarto" = {
           # Copy .qmd file from local directory to temp directory; a
           # workaround required due to quarto's inability to render a document
           # anywhere except the directory where the .qmd file is located.
           
           # This solution copies qmd to temp directory and explicitly 
           # sets the temp directory to run `quarto_render()`
           invisible(
             file.copy(
               from = file,
               to = glue::glue("{temp_dir}/{file}"),
               overwrite = TRUE
             )
           )
           xfun::in_dir(glue::glue("{temp_dir}/"), 
                        # run Quarto in the directory of the input file
                        report <- quarto::quarto_render(
                          input = basename(glue::glue("{file}")),
                          output_format = "md",
                          output_file = "temp_md.md", # output file will be created in the temp directory
                          quiet = TRUE
                          )
           )
           
           },
         {rmarkdown::render(input = temp_source,
                            output_format = "md_document", # rmarkdown::md_document() ?
                            output_file = temp_md,
                            quiet = TRUE)}
  )
  # NOTE: we MUST call `render()` here, and not `knit()`.
  # Only `render()` uses `pandoc`, meaning it will extract and 
  # calculate metadata that is necessary to place the
  # title and date properly in the body of the markdown file
  
  # rendered markdown files have no yaml, but need one for importing via
  # lightparser. Add a placeholder YAML here (note: data not used)
  add_standard_yaml(temp_md)
  
  # Parse markdown info as a tibble
  result <- read_lp(temp_md) |>
    place_eml_first() |>
    parse_lp_to_tibble()
  
  # import 'unrendered' tibble, extract hidden lists as attributes
  eml_attributes <- read_lp(file) |>
    parse_eml_attributes(tags = result$label)
  
  # clean up
  unlink(temp_dir, recursive = TRUE)
  
  # join and return
  if(is.null(eml_attributes)){
    result
  }else{
    join_eml_attributes(result, eml_attributes)
  }
}

#' Internal function to ensure no headings are placed before eml
#' @noRd
#' @keywords Internal
place_eml_first <- function(df){
  # Remove any headings placed *before* EML
  # NOTE: This step is important for Quarto documents. lightparser reads  
  #       yaml title as a heading in qmd but not Rmd, which must be removed
  eml_checker <- df$heading |>
    tolower() |>
    grepl("^eml", x = _) # accounts for both `eml:eml` and `EML`
  if(any(eml_checker)){
    eml_row <- which(eml_checker)[1]
    if(eml_row > 1){ # if EML is first, impossible for a header to be earlier
      df <- df[seq(from = eml_row, to = nrow(df)), ]
    }
  }
  # Note that the approach above *always* removes yaml. If that is needed later
  # this approach will require revision
  df
}


#' Internal function to check file name contains a supported suffix
#' @noRd
#' @keywords Internal
check_valid_suffix <- function(file){
  suffix <- stringr::str_extract(file, "\\.[:alnum:]+$")
  if(!(suffix %in% c(".Rmd", ".Qmd", ".qmd"))){
    c("Invalid file suffix", 
      "Accepted formats are `.Rmd` or `.qmd`") |>
    cli::cli_abort(call = rlang::caller_env())
  }else{
    switch(suffix,
           # ".md" = "basic",
           ".Rmd" = "Rmarkdown",
           ".Qmd" = "Quarto",
           ".qmd" = "Quarto")
  }
}

#' Internal function to create a temporary working directory.
#' This is needed because wiping `tempdir()` breaks heaps of stuff in R,
#' apparently including stored data from loaded packages.
#' @noRd
#' @keywords Internal
safe_temp_directory <- function(){
  safe_location <- glue::glue("{tempdir()}/delma-temp-working-directory") |>
    as.character()
  if(!dir.exists(safe_location)){
    dir.create(safe_location)  
  }
  safe_location
}

#' Internal function to add yaml to a file that is missing one
#' @returns Called for side-effect of editing file given by `input`
#' @noRd
#' @keywords Internal
add_standard_yaml <- function(input){
  # write new yaml as a text string
  # NOTE: This isn't actually used anywhere; it's a patch to get past
  # lightparser's import requirements
  yaml_new <- c("---" ,
                "author: unknown",
                "date: today",
                "---" )
  # write to supplied file
  c(yaml_new,
    readLines(input)) |>
    writeLines(con = input)
}

#' Internal function to extract tagged code blocks, to parse as attributes
#' @param x A tibble to extract attributes from
#' @param tags A vector of EML tags (`labels` col in source df) that are available for joining
#' @noRd
#' @keywords Internal
parse_eml_attributes <- function(x, tags){
  eml_check <- x$label == "eml"
  eml_check[is.na(eml_check)] <- FALSE
  if(any(eml_check)){
    x$label[which(eml_check)[1]] <- "eml:eml"
  }
  attr_chunks <- x |>
      dplyr::filter(.data$type == "block" &
                    .data$label %in% tags)
  if(nrow(attr_chunks) < 1){
    NULL
  }else{
    result <- purrr::map(attr_chunks$code,
               \(a){
                 contains_list_check <- any(grepl("^\\s?list\\(", a))
                 if(contains_list_check){
                   outcome <- glue::glue_collapse(a, sep = "\n") |>
                     parse(text = _) |>
                     eval() |>
                     try()
                   if(!inherits(outcome, "list")){ # try-error?
                     c("One of your named code blocks did not parse",
                       i = "These code blocks are used by `delma` to assign EML attributes",
                       i = "Consider revising code block '{attr(a, 'chunk_opts')$label}' to return a `list()`") |>
                     cli::cli_abort(call = rlang::caller_env())
                   }else{
                     outcome
                   }
                 }else{
                   c("One of your named code blocks does not contain `list()`",
                     i = "These code blocks are used by `delma` to assign EML attributes",
                     i = "Consider labelling code block '{attr(a, 'chunk_opts')$label}' to something else, or adding a `list()`") |>
                   cli::cli_abort(call = rlang::caller_env())
                 }
               })
    names(result) <- attr_chunks$label
    result
  }
}

#' Internal function to extract tagged code blocks, to parse as attributes
#' @param df A tibble
#' @param attr_list Named list of attributes to be appended to df
#' @noRd
#' @keywords Internal
join_eml_attributes <- function(df, attr_list){
  for(i in seq_along(attr_list)){
    row <- which(df$label == names(attr_list)[i])
    df$attributes[[row]] <- attr_list[[i]]
  }
  df
}

Try the delma package in your browser

Any scripts or data that you put into this service are public.

delma documentation built on June 8, 2025, 1:56 p.m.