R/parse_lp_to_tibble.R
In delma: Convert 'R Markdown' and 'Quarto' Documents to Ecological Metadata Language

#' Internal function to convert `lightparser` tibble to `delma` format
#' @param x A tibble
#' @noRd
#' @keywords Internal
parse_lp_to_tibble <- function(x){
  result <- x |>
    clean_header_level() |> # downfill heading level
    dplyr::mutate(attributes = as.list(rep(NA, nrow(x)))) |>
    dplyr::filter(.data$type != "yaml",
                  is.na(.data$heading),
                  .data$heading_level > 0) |>
    clean_lp_text() |>
    clean_urls() |>
    dplyr::select("heading_level", "section", "text", "attributes") |>
    dplyr::rename("level" = "heading_level", 
                  "label" = "section") |>
    clean_eml_tags()
  class(result) <- c("tbl_df", "tbl", "data.frame")
  result
}

#' Internal function to clean header levels
#' @param x a tibble with the column heading level
#' @noRd
#' @keywords Internal
clean_header_level <- function(x){
  heading_value <- 0
  all_rows <- seq_len(nrow(x))
  for(i in all_rows){
    if(!is.na(x$heading_level[i])){
      heading_value <- x$heading_level[i]
    }else{
      x$heading_level[i] <- heading_value
    }
  }
  x
}

#' Internal function to clean text column
#' @param x A tibble
#' @noRd
#' @keywords Internal
clean_lp_text <- function(x){
  result <- purrr::map(x$text,
                       \(a){
                         a <- trimws(a)
                         if(length(a) < 2){
                           if(is.na(a)){ # important to parse NAs first
                             a
                           }else if(a == ""){
                             NA
                           }else{
                             a
                           }
                         }else{
                           if(any(a == "")){
                             a[a == ""] <- "{BREAKPOINT}"
                             b <- glue::glue_collapse(a, sep = " ") |>
                               strsplit(split = "\\{BREAKPOINT\\}") |>
                               purrr::pluck(!!!list(1)) |>
                               trimws() |>
                               stringr::str_replace("\\s{2,}", "\\s")
                             b <- b[b != ""] # remove empty spaces; typically a leading ""
                             n <- length(b)
                             if(n < 1){
                               NA # multiple "" are replaced with NA
                             }else if(n == 1L){
                               b # single text pasted as is
                             }else{
                               as.list(b) # lists are parsed as paragraphs
                             }
                           }else{
                             glue::glue_collapse(x, sep = " ") |>
                               as.character()
                           }                 
                         }
                       })
  names(result) <- NULL
  x$text <- result
  x
}

#' Internal function to clean urls in a text column (remove leading and trailing <>)
#' @param x A tibble
#' @noRd
#' @keywords Internal
clean_urls <- function(x){
  result <- purrr::map(x$text, \(a){
    if(inherits(a, "list")){
      purrr::map(a, clean_url)  
    }else{
      clean_url(a)
    }
  })
  x$text <- result
  x
}

#' Internal function to clean_urls() for a single string
#' @param x A length-1 string
#' @noRd
#' @keywords Internal
clean_url <- function(x){
  if(is.na(x)){
    x
  }else{
    if(grepl("^<", x) & grepl(">$", x)){ # includes urls and html comments
      if(grepl("^<!--", x) & grepl("-->$", x)){ # html comments only
        NA
      }else{ # urls and emails only
        x |>
          stringr::str_replace("^<", "") |>
          stringr::str_replace(">$", "")
      }
    }else{
      x
    }      
  }
}

#' Internal function to clean EML headings that behave oddly
#' @noRd
#' @keywords Internal
#' @importFrom rlang .data
clean_eml_tags <- function(df){
  # modify the tibble to the required conventions for list/xml
  # `label` should be camel case
  df |>
    dplyr::mutate(label = snakecase::to_lower_camel_case(.data$label)) |>
    dplyr::mutate(label = dplyr::case_when(.data$label == "eml" ~ "eml:eml",
                                           .data$label == "surname" ~ "surName",
                                           .data$label == "pubdate" ~ "pubDate",
                                           .default = .data$label))
}

Any scripts or data that you put into this service are public.

delma documentation built on June 8, 2025, 1:56 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

delma
Convert 'R Markdown' and 'Quarto' Documents to Ecological Metadata Language

R/parse_lp_to_tibble.R
In delma: Convert 'R Markdown' and 'Quarto' Documents to Ecological Metadata Language

Defines functions clean_eml_tags clean_url clean_urls clean_lp_text clean_header_level parse_lp_to_tibble

Try the delma package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

delma Convert 'R Markdown' and 'Quarto' Documents to Ecological Metadata Language

R/parse_lp_to_tibble.R In delma: Convert 'R Markdown' and 'Quarto' Documents to Ecological Metadata Language

Defines functions clean_eml_tags clean_url clean_urls clean_lp_text clean_header_level parse_lp_to_tibble

Try the delma package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

delma
Convert 'R Markdown' and 'Quarto' Documents to Ecological Metadata Language

R/parse_lp_to_tibble.R
In delma: Convert 'R Markdown' and 'Quarto' Documents to Ecological Metadata Language