R/split_to_tbl.R
In lightparser: From 'Rmarkdown' and 'Quarto' Files to Tibble and Back

Documented in split_to_tbl

# WARNING - Generated by {fusen} from dev/flat_split_combine.Rmd: do not edit by hand

#' Group lines of a Rmd or Qmd file by types in a tibble
#'
#' @param file A Rmd or Qmd file path
#' @return A tibble with 6 columns:
#' - `type`: type of the part (yaml, heading, inline, block)
#' - `label`: label of the part (if any)
#' - `params`: parameters of the part (if any)
#' - `text`: text of the part (if any)
#' - `code`: code of the part (if any)
#' - `heading`: heading of the part (if any)
#' - `heading_level`: level of the heading (if any)
#' - `section`: section of the Rmd file, according to headings (if any)
#'
#' @importFrom tibble tibble
#' @importFrom utils getFromNamespace
#'
#' @export
#' @examples
#' file <- system.file(
#'   "dev-template-parsing.Rmd",
#'   package = "lightparser"
#' )
#' split_to_tbl(file)
split_to_tbl <- function(file) {
  rmd_lines <- readLines(file)
  # Get yaml header
  if (!grepl("^---$", rmd_lines[1])) {
    stop(
      "Rmd/Qmd file parsed needs to have a yaml header",
      " starting from line 1 with: ---"
    )
  }
  yaml_begin <- 1
  yaml_end <- which(grepl("^---", rmd_lines))[2]
  yaml_content <- yaml::yaml.load(rmd_lines[yaml_begin:yaml_end])
  yaml_tbl <- tibble(
    type = "yaml",
    label = NA,
    params = list(yaml_content),
    text = NA,
    code = NA,
    heading = NA,
    heading_level = NA,
    section = NA
  )

  # Get the rest of the document without yaml
  rmd_lines_no_yaml <- rmd_lines[-c(yaml_begin:yaml_end)]

  if (length(rmd_lines_no_yaml) == 0) {
    # yaml only
    return(yaml_tbl)
  }

  # Are we inside a Rmd/Qmd that is currently knitted ?
  # If so, we cannot use knitr::split_file() in the current session
  # because it will affect the hidden knit environment variables.
  outside_knit <- is.null(knitr::opts_knit$get("out.format"))

  if (outside_knit) {
    res_split <- knitr_split(rmd_lines_no_yaml)
  } else {
    message(
      "It seems you are currently knitting a Rmd/Qmd file.",
      " The parsing of the file will be done in a new R session."
    )
    rlang::check_installed("callr", reason = "to extract Rmd when knitting")

    res_split <- callr::r(function() knitr_split(rmd_lines_no_yaml),
      package = TRUE
    )
  }

  res_split$text <- lapply(res_split$text, split_headers_from_text)

  # duplicate rows according to number of elements in "text"
  res_split_unnested <- res_split[rep(
    seq_len(nrow(res_split)),
    lengths(res_split$text)
  ), ]
  # replace "text" by the proper not duplicated element of "text"
  which_element <- unlist(sapply(lengths(res_split$text), seq_len))
  res_split_unnested$text <- sapply(
    seq_len(nrow(res_split_unnested)),
    function(x) {
      res_split_unnested$text[[x]][which_element[x]]
    }
  )

  # Get headings
  res_split_unnested$heading <- sapply(
    seq_len(nrow(res_split_unnested)),
    function(x) {
      if (isTRUE(grepl("-heading-", names(res_split_unnested$text)[x]))) {
        gsub("^#*\\s*", "", res_split_unnested$text[x])
      } else {
        NA
      }
    }
  )

  # Get level according to heading name
  res_split_unnested$heading_level <- sapply(
    seq_len(nrow(res_split_unnested)),
    function(x) {
      if (!is.na(res_split_unnested$heading[x])) {
        # extract level number after heading in names
        as.numeric(
          gsub(
            "^.*-heading-level-([0-9]+)$",
            "\\1",
            names(res_split_unnested$text)[x]
          )
        )
      } else {
        NA
      }
    }
  )

  # Get section according to heading : duplicate headings down the column
  change_heading <- cumsum(!is.na(res_split_unnested$heading))
  change_heading[change_heading == 0] <- NA
  res_split_unnested$section <- res_split_unnested$heading[
    !is.na(res_split_unnested$heading)
  ][change_heading]

  res_split_unnested$type[!is.na(res_split_unnested$heading)] <- "heading"

  # Put back yaml in 'res'
  res_full <- rbind(yaml_tbl, res_split_unnested)

  return(res_full)
}

#' Split text and chunks from a Rmd or Qmd file into a tibble
#' @noRd
knitr_split <- function(rmd_lines_no_yaml) {
  # Use hidden functions of {knitr}
  # Code extracted will be stored in a new environment
  # with `knitr:::split_file()`
  # Using `knitr::knit_code$restore()` makes sure there is no current Rmd to be
  # splitted
  # Is it compatible with the function beeing included in a Rmd file ?
  # We'll see.
  getFromNamespace("knit_log", "knitr")$restore()
  knitr::knit_code$restore()
  getFromNamespace("chunk_counter", "knitr")(reset = TRUE)
  options(knitr.duplicate.label = "allow")
  # Restore after splitting
  on.exit(options(knitr.duplicate.label = NULL), add = TRUE)
  # restore unnamed-chunk counter
  on.exit(getFromNamespace("chunk_counter", "knitr")(reset = TRUE), add = TRUE)
  on.exit(knitr::knit_code$restore(), add = TRUE)
  on.exit(getFromNamespace("knit_log", "knitr")$restore(), add = TRUE)


  # Split files
  out <- getFromNamespace("split_file", "knitr")(
    rmd_lines_no_yaml,
    patterns = getFromNamespace("all_patterns", "knitr")$md,
    set.preamble = TRUE
  )

  res <- tibble(
    type = sapply(out, class),
    label = sapply(out, function(x) {
      ifelse(
        !is.null(x$params$label), x$params$label, NA
      )
    }),
    params = lapply(out, function(x) {
      if (!is.null(x$params) &&
        (is.null(x$params$engine) || x$params$engine == "r")) {
        # if chunk is not 'r' chunk, it must be accounted as text
        x$params
      } else {
        NA
      }
    }),
    text = lapply(out, function(x) {
      if (inherits(x, "inline")) {
        unlist(strsplit(x$input, "\n"))
      } else if (
        !is.null(x$params) &&
          !is.null(x$params$engine) &&
          x$params$engine != "r"
      ) {
        knitr::knit_code$get(x$params$label)
      } else {
        NA
      }
    }),
    code = lapply(out, function(x) {
      if (
        !is.null(x$params) &&
          !is.null(x$params$engine) &&
          x$params$engine != "r"
      ) {
        NA
      } else if (inherits(x, "block")) {
        knitr::knit_code$get(x$params$label)
      } else {
        NA
      }
    })
  )

  return(res)
}

#' split_headers_in_text
#' @noRd
split_headers_from_text <- function(the_text) {
  new_group <- rep(FALSE, length(the_text))
  which_header <- grep("^#", the_text)
  which_level <-
    sapply(the_text[which_header], function(x) {
      nchar(gsub("[^#]", "", x))
    })
  if (length(which_header) != 0) {
    new_group[which_header] <- TRUE
    # Change group just after header
    which_header_plus <- 1 + which_header[
      (which_header + 1) <= length(new_group)
    ]
    if (length(which_header_plus) != 0) {
      new_group[which_header_plus] <- TRUE
    }
    groups <- formatC(
      cumsum(new_group),
      width = max(nchar(cumsum(new_group))),
      flag = "0"
    )
    groups[which_header] <-
      paste0(groups[which_header], "-heading-level-", which_level)

    split_text <- split(the_text, groups)
  } else {
    split_text <- list(the_text)
  }
  return(split_text)
}