R/parse_list_to_tibble.R
In delma: Convert 'R Markdown' and 'Quarto' Documents to Ecological Metadata Language

#' @rdname parse_
#' @order 5
#' @noRd
#' @keywords Internal
parse_list_to_tibble <- function(x){
  result <- list_to_tibble_recurse(x)
  for(i in seq_len(purrr::pluck_depth(result))){
    result <- purrr::list_flatten(result)
  }
  result_tibble <- result |>
    clean_tbl_text_list() |>
    dplyr::bind_rows()
  
  # break pipe here to remove duplicates
  result_tibble <- result_tibble[!duplicated(result_tibble), ] 
  
  # restart pipe
  result_tibble |>
    dplyr::mutate(index_number = dplyr::row_number()) |>
    dplyr::select(
      dplyr::any_of(c("level", "label", "text", "attributes"))) |>
    clean_empty_lists() |>
    clean_paragraphs()
}

#' Internal recursive function
#' @param x (list) A list constructed from xml (via `xml2::as_list()`)
#' @param level (integer) what depth are we currently in within the list
#' @param file (string) file name to save changes
#' @noRd
#' @keywords Internal
list_to_tibble_recurse <- function(x, 
                                  level = 1,
                                  outcome = xml_tibble()){
  x_names <- names(x)
  purrr::map(.x = seq_along(x),
             .f = \(a){
               result <- extract_list_to_tibble(a, x_names, x, level)
               if(!is.null(result)){
                 if(nrow(result) > 0){
                   current_index <- c(
                     outcome$index[[nrow(outcome)]],
                     result$index[[1]])
                   outcome <- dplyr::bind_rows(outcome, result)
                   outcome$index[[nrow(outcome)]] <- current_index[!is.na(current_index)]
                 }
               }
               if(is.list(x[[a]])){
                 if(length(x[[a]]) > 0){
                   list_to_tibble_recurse(x[[a]], 
                                          level = level + 1, 
                                          outcome = outcome) 
                 }
                 else{
                   format_xml_tibble(outcome)
                 }
               }else if(is.character(x[[a]])){
                 result <- format_xml_tibble(outcome)
                 # This code is rather convoluted.
                 # Where a `para` tag includes text FOLLOWED by other tags,
                 # indexing was failing to detect it. This 're-adds' 
                 # text missing for this reason.
                 match_check <- result$text[nrow(result)] == x[[a]]
                 if(is.na(match_check) | isFALSE(match_check)){
                   result <- result |>
                     tibble::add_row(
                       level = level,
                       label = NA,
                       attributes = list(NA),
                       text = x[[a]]
                     )
                 }
                 result 
               }else{
                 format_xml_tibble(outcome)
               }
             }
  )
}

#' get information as tibble in list_to_tibble_recurse
#' @noRd
#' @keywords Internal
extract_list_to_tibble <- function(index, list_names, list_data, level){
  if(is.null(list_names[index])){
    xml_tibble(level = level, 
               text = list_data[[1]],
               index = index)
  }else if(list_names[index] != ""){
    current_contents <- list_data[[index]]
    current_attr <- attributes(current_contents)
    # current_title <- snakecase::to_title_case(list_names[index])
    result <- xml_tibble(level = level,
                         label = list_names[index],
                         index = index)
    if(length(current_attr) >= 1){
      non_name_attributes <- current_attr[names(current_attr) != "names"] |>
        replace_xml_quotes()
      result$attributes[1] <- list(non_name_attributes)
    }else{
      result$attributes <- list(NA)
    }
    if(inherits(current_contents, "character")){
      result$text <- current_contents
    }
    result
  }else{
    NULL
  }
}

#' Internal function to replace "\"" with "&quot;"
#' Former is created by xml2 somewhere, even when source says latter.
#' @noRd
#' @keywords Internal
replace_xml_quotes <- function(x){
  purrr::map(x,
    \(a){
      if(a == "\""){
        "&quot;"
      }else{
        a
      }
    })
}

#' empty tibble content
#' @noRd
#' @keywords Internal
xml_tibble <- function(level = NA,
                       label = NA,
                       attributes = NA,
                       text = NA,
                       index = NA){
  tibble::tibble(
    level = as.integer(level),
    label = as.character(label),
    attributes = as.list(attributes),
    text = as.character(text),
    index = as.list(index)
    )
}

#' Internal function to format a tibble from list
#' @noRd
#' @keywords Internal
format_xml_tibble <- function(df){
  df <- df[-1, ] # top row is empty
  index_names <- purrr::map(.x = seq_len(nrow(df)),
                      .f = \(a){
                        paste(df$label[seq_len(a)], collapse = "_")
                      }) |>
    unlist()
  df$index_names <- index_names
  df
}

#' Internal function to clean tibbles for aggregating
#' @param x a flat list (i.e. no nesting) containing one tibble per entry
#' @noRd
#' @keywords Internal
clean_tbl_text_list <- function(x){
  purrr::map(x, \(a){
    if(is.null(a)){
      a
    }else{
      n <- nrow(a)
      if(!is.na(a$text[n]) & is.na(a$text[(n - 1)]) & is.na(a$label[n])){
        a$text[n - 1] <- a$text[n]
        dplyr::slice_head(a, n = (n - 1))
      }else{
        a
      }      
    }
  })
}

#' Internal function to look for empty named lists; replace with list(NA)
#' @param x a tibble with a list-column named `attributes`
#' @noRd
#' @keywords Internal
clean_empty_lists <- function(x){
  list_check <- purrr::map(x$attributes, \(a){inherits(a, "list")}) |>
    unlist()
  list_entries <- x$attributes[list_check]
  x$attributes[list_check] <- purrr::map(list_entries, 
                                         \(a){if(length(a) < 1){NA}else{a}})
  x
}

#' Internal function to consolidate paragraphs
#' @param x a tibble with a chr-column named `label`
#' @noRd
#' @keywords Internal
clean_paragraphs <- function(x){
  x <- x |> 
    dplyr::mutate(text = as.list(.data$text),
                  index = seq_len(nrow(x)),
                  group = detect_paras(.data$label))
  if(any(x$group > 0)){
    x_subset <- x |>
      dplyr::filter(.data$group > 0) 
    x_split <- split(x_subset, x_subset$group)
    result_split <- purrr::map(x_split, \(a){
      tibble::tibble(
        level = a$level[1],
        label = a$label[1],
        text =  list(as.list(a$text[seq(from = 2, to = nrow(a), by = 1)])),
        attributes = a$attributes[1],
        group = 0,
        index = a$index[1])
    })
    dplyr::bind_rows(result_split) |>
      dplyr::bind_rows(x) |>
      dplyr::filter(.data$group < 1) |>
      dplyr::arrange(.data$index) |>
      dplyr::select(c("level", "label", "text", "attributes"))
  }else{
    x |>
      dplyr::select(c("level", "label", "text", "attributes"))
  }
}

#' Internal function to detect paragraphs
#' @param y a vector of labels
#' @noRd
#' @keywords Internal
detect_paras <- function(y){
  n <- length(y)
  result <- vector(mode = "integer", length = n)
  value <- 0
  for(i in seq_len(n)){
    if(i > 1 & i < n){
      if(y[i] != "Para" & y[(i + 1)] == "Para"){
        value <- value + 1
        result[i] <- value
      }else if(y[i] == "Para"){
        result[i] <- value
      }else{
        result[i] <- 0
      }
    }else{
      result[i] <- 0
    }
  }
  result
}

Any scripts or data that you put into this service are public.

delma documentation built on June 8, 2025, 1:56 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

delma
Convert 'R Markdown' and 'Quarto' Documents to Ecological Metadata Language

R/parse_list_to_tibble.R
In delma: Convert 'R Markdown' and 'Quarto' Documents to Ecological Metadata Language

Defines functions detect_paras clean_paragraphs clean_empty_lists clean_tbl_text_list format_xml_tibble xml_tibble replace_xml_quotes extract_list_to_tibble list_to_tibble_recurse parse_list_to_tibble

Try the delma package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

delma Convert 'R Markdown' and 'Quarto' Documents to Ecological Metadata Language

R/parse_list_to_tibble.R In delma: Convert 'R Markdown' and 'Quarto' Documents to Ecological Metadata Language

Defines functions detect_paras clean_paragraphs clean_empty_lists clean_tbl_text_list format_xml_tibble xml_tibble replace_xml_quotes extract_list_to_tibble list_to_tibble_recurse parse_list_to_tibble

Try the delma package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

delma
Convert 'R Markdown' and 'Quarto' Documents to Ecological Metadata Language

R/parse_list_to_tibble.R
In delma: Convert 'R Markdown' and 'Quarto' Documents to Ecological Metadata Language