# nolint start: line_length_linter.
#' Read Bib file
#'
#' Read a bib file into a data.frame
#'
#' @details
#' Inspired and partially credited to `bib2df::bib2df()` although this has no
#' dependencies outside of base functions and much quicker. This speed seems
#' to come from removing `stringr` functions and simplifying a few *apply
#' functions.
#' This will also include as many categories as possible from the entry.
#'
#' @param file File or connection
#' @param skip The lines to skip
#' @param max_lines The maximum number of lines to read
#' @param encoding Assumed encoding of file (passed to [readLines()]
#'
#' @return A `data.frame` with each row as a bib entry and each column as a
#' field
#' @seealso `?bib2df::bib2df`
#'
#' @export
#'
#' @examples
#' file <- "https://raw.githubusercontent.com/jmbarbone/bib-references/master/references.bib"
#' bibdf <- read_bib(file, max_lines = 51L)
#'
#' if (package_available("tibble")) {
#' tibble::as_tibble(bibdf)
#' } else {
#' head(bibdf)
#' }
#'
#' if (package_available("bib2df") & package_available("bench")) {
#' file <- system.file("extdata", "bib2df_testfile_3.bib", package = "bib2df")
#'
#' # Doesn't include the 'tidying' up
#' foo <- function(file) {
#' bib <- ("bib2df" %colons% "bib2df_read")(file)
#' ("bib2df" %colons% "bib2df_gather")(bib)
#' }
#'
#' \donttest{
#' bench::mark(
#' read_bib = read_bib(file),
#' bib2df = bib2df::bib2df(file),
#' foo = foo(file),
#' check = FALSE
#' )[1:9]
#' }
#' }
# nolint end: line_length_linter.
read_bib <- function(file, skip = 0L, max_lines = NULL, encoding = "UTF-8") {
# Account for nul values found in encoding?
# skipNul = TRUE could do this but an error can still be caused later
bib <- readLines(file, encoding = encoding, skipNul = FALSE)
if (!is.null(max_lines)) {
start <- skip + 1L
n_lines <- length(bib)
bib <- bib[seq.int(start, min(skip + max_lines, n_lines))]
}
bib <- gsub("[^[:graph:]]", " ", bib)
# Find start of entries
from <- grep("[@]", bib)
if (!length(from)) {
stop(cond_read_bib_entries())
}
# shift over (may contain white space?)
to <- c(from[-1L] - 1L, length(bib))
item_list <- Map(function(x, y) bib[x:y], x = from, y = to - 1L)
# Extract first line for speediness
first_line <- sapply(item_list, `[`, 1L)
keys <- gsub(".*((?<=\\{)[^,]+).*", "\\1", first_line, perl = TRUE)
fields <- gsub(".*((?<=@)[^\\{]+).*", "\\1", first_line, perl = TRUE)
fields <- tolower(fields)
# TODO Implement checks for duplicate categories?
out <- lapply(item_list, parse_bib)
categories <- lapply(out, "[[", "cat")
values <- lapply(out, "[[", "val")
as_bib(
process_bib_dataframe(categories, values, fields, keys),
bib_list = process_bib_list(keys, fields, categories, values)
)
}
# FUNS --------------------------------------------------------------------
parse_bib <- function(x) {
x <- x[-1L]
m <- regexpr("=", x)
ok <- m > 0L
m <- m[ok]
x <- x[ok]
res <- list(
cat = substring(x, 1L, m - 1L),
val = substring(x, m + 1L, nchar(x))
)
res$cat <- parse_bib_cat(res$cat)
res$val <- parse_bib_val(res$val)
res
}
parse_bib_cat <- function(x) {
x <- trimws(x)
x[x %in% c("{", "}")] <- NA_character_
tolower(x)
}
parse_bib_val <- function(x) {
x[!lengths(x)] <- NA_character_
# There may be something better than this
# Would like to maintain the { and }
x <- trimws(x)
x <- gsub("^(\\{|\")|(\"|\\})[,]?$", "", x)
x <- gsub(",$", "", x)
x
}
#' Process bib values
#'
#' Generates a data frame of values from bibs
#'
#' @param categories A list of categories
#' @param values A list of values
#' @param fields a Vector of fields
#' @param keys a Vector of keys
#'
#' @return A wide `data.frame` with explicit `NA`s
process_bib_dataframe <- function(categories, values, fields, keys) {
# Determine all categories for missing values inside Map
ucats <- unique(remove_na(unlist(categories)))
ucats_df <- quick_dfl(
category = ucats,
value = rep(NA_character_, length(ucats))
)
x <- mapply(
function(cats, vals, field, key) {
# Determine valid categories/values
valids <- !is.na(cats)
cats <- cats[valids]
vals <- vals[valids]
# Check for duplicate categories
lens <- counts(cats)
bad <- lens > 1L
if (any(bad)) {
stop(cond_process_bib_dataframe_dupe(key, names(lens)[bad]))
}
# Append vectors
cats <- c("key", "field", cats)
vals <- c(key, field, vals)
# Create data.frame
data <- quick_dfl(category = cats, value = vals)
# Check for missing categories
toadd <- ucats %out% cats
if (any(toadd)) {
data <- rbind(data, ucats_df[toadd, ])
}
# Transpose to prep for reduce rbinding
quick_df(set_names(as.list(data[[2L]]), data[[1L]]))
},
cats = categories,
vals = values,
field = fields,
key = keys,
SIMPLIFY = FALSE
)
# TODO consider mark::row_bind(), when ready
Reduce(rbind, x)[, c("key", "field", ucats), ]
}
process_bib_list <- function(keys, fields, categories, values) {
valid <- !is.na(values) & values != ""
x <- mapply(
function(key, field, cats, vals) {
struct(
c(key, field, vals),
class = c("character", "mark_bib_entry"),
names = c("key", "field", cats)
)
},
key = keys[valid],
field = fields[valid],
cats = categories[valid],
vals = values[valid],
SIMPLIFY = FALSE
)
as_bib_list(x, keys[valid])
}
as_bib_list <- function(x, names = NULL) {
if (!is.list(x)) {
stop(cond_as_bib_list_class())
}
class(x) <- c("list", "mark_bib_list")
x
}
as_bib <- function(x, bib_list = NULL) {
if (!is.data.frame(x)) {
stop(cond_as_bib_class())
}
class(x) <- c("mark_bib_df", "data.frame")
attr(x, "bib_list") <- bib_list
x
}
# Prints ------------------------------------------------------------------
#' @export
print.mark_bib_list <- function(x, ...) {
nm <- names(x)
out <- x
names(out) <- NULL
# TODO run format() on all categories for equal lengths
for (i in seq_along(out)) {
nmi <- nm[i]
bordern <- getOption("width") - nchar(nmi) - 1L
header <- if (bordern) {
paste0(" ", collapse0(rep("-", bordern), sep = ""))
} else {
""
}
co <- utils::capture.output(print(out[[i]]))[-1L]
cat(
paste0(if (i == 1L) "" else "\n", nmi, header),
collapse0(paste0(" ", co), sep = "\n"),
sep = "\n"
)
}
invisible(x)
}
#' @export
print.mark_bib_entry <- function(x, ...) {
out <- x[!(is.na(x) | x == "")]
nm <- names(out)
chars <- nchar(nm)
chars[is.na(chars)] <- 0L
chars_max <- max(chars)
blanks <- vap_chr(
chars_max - chars + 2L,
function(x) {
collapse0(rep(" ", x), sep = "")
}
)
cat(paste(nm, blanks, out), sep = "\n")
invisible(x)
}
#' Print bib data frame
#'
#' Print bib dataframe, or as list
#'
#' @param x The `mark_bib_df` object
#' @param list If `TRUE` will print as a list rather than the `data.frame`
#' @param ... Additional arguments passed to methods
#' @returns `x`, invisibly, called for its side effects
#' @export
print.mark_bib_df <- function(x, list = FALSE, ...) {
if (list) {
print(attr(x, "bib_list"), ...)
} else {
y <- x
attr(y, "bib_list") <- NULL
NextMethod()
}
invisible(x)
}
# conditions --------------------------------------------------------------
cond_read_bib_entries <- function() {
new_condition(
"No entries detected",
"read_bib_entries"
)
}
cond_process_bib_dataframe_dupe <- function(key, categories) { # nolint: object_length_linter, line_length_linter.
new_condition(
sprintf(
"The key `%s` has duplicate categories of `%s`",
key,
categories
),
"process_bib_dataframe_dupe"
)
}
cond_as_bib_list_class <- function() {
new_condition(
"`x` must be a list",
"as_bib_list_class"
)
}
cond_as_bib_class <- function() {
new_condition(
"`x` must be a data.frame",
"as_bib_class"
)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.