R/bib_parser.R
In TeXCheckR: Parses LaTeX Documents for Errors

Documented in fread_bib

#' Functions for parsing .bib files
#' @name bib_parser
#' @param file.bib \code{.bib} file.
#' @param to_sort Include only author, title, year, and date.
#' @param check.dup.keys If \code{TRUE}, the default, return error if any bib keys are duplicates.
#' @param strip.braces If \code{TRUE}, the default, braces in fields are removed. 
#' @param check.unescaped.percent If \code{TRUE}, the default, fields with unescaped
#' percent signs are an error. (Unescaped percent signs in URLs are permitted.) Set
#' to \code{FALSE} to skip this check.
#' @param .bib_expected (logical, default: \code{TRUE}) Should \code{file.bib} be expected to 
#' have file extension \code{.bib}? If expectation violated, a warning is emitted.
#' @param halt Whether to halt on error. If \code{NULL}, the default, the value
#' \code{getOption("TeXCheckR.halt_on_error")} is used. Otherwise, \code{TRUE} or
#' \code{FALSE} to halt
#' regardless of the value of the option.
#' @param rstudio (logical, default: \code{FALSE}) If \code{TRUE}, pop the RStudio session
#' to the location in \code{file.bib} of the first error.
#' @param .report_error A function like \code{\link{report2console}} to handle errors.
#' @details \code{bib2DT} returns a \code{data.table} of the entries in \code{file.bib}. The function
#' \code{reorder_bib} rewrites \code{file.bib}, to put it in surname, year, title, line number order.
#' 
#' @export bib2DT fread_bib

fread_bib <- function(file.bib,
                      check.dup.keys = TRUE,
                      strip.braces = TRUE,
                      check.unescaped.percent = TRUE,
                      .bib_expected = TRUE,
                      halt = TRUE,
                      rstudio = FALSE,
                      .report_error) {
  stopifnot(length(file.bib) == 1L)
  check_TF(check.dup.keys)
  check_TF(strip.braces)
  check_TF(check.unescaped.percent)
  check_TF(.bib_expected)
  if (is.null(halt)) {
    halt <- getOption("TeXCheckR.halt_on_error")
  }
  check_TF(halt)
  check_TF(rstudio)
  
  if (XOR(.bib_expected, endsWith(file.bib, ".bib"))) {
    warning("File extension is not '.bib'.")
  }
  
  if (missing(.report_error)) {
    .report_error <- report2console
  }
  

  bib <-
    read_lines(file.bib) %>%  # Consider: fread("~/Road-congestion-2017/bib/Transport.bib", sep = "\n", fill = TRUE, encoding = "UTF-8", header = FALSE) 
    
    # Avoid testing }\\s+$ rather than just == }
    stri_trim_both %>%
    .[!startsWith(., "@Comment")]
  
  is_at <- startsWith(bib, "@")
  is_closing <- bib == "}"
  
  # Need to exit if bib is empty (also makes testing above convenient)
  if (!any(is_at) || !any(is_closing)) {
    message("`file.bib = ", file.bib, "` had no bib entries. ", 
            "Returning empty data.table.")
    return(data.table())
  }

  sep <- NULL
  # Can't use = as separator (almost certainly occurs in a URL)
  # Try these:
  for (sep_candidate in c("\t", "^", "|")){
    if (!any(grepl(sep_candidate, bib, fixed = TRUE))){
      sep <- sep_candidate
      break
    }
  }
  if (is.null(sep)){
    stop("No suitable separator found for bibliography file. That is, all candidates tried already appeared in the file")
  }

  bib_just_key_and_fields <- bib
  bib_just_key_and_fields[is_closing] <- ""
  bib_just_key_and_fields[is_at] <- sub("@", "key = ", bib_just_key_and_fields[is_at], fixed = TRUE)

  # Make sure the sep is detected (in case of >author   ={John Daley}<)
  bib_just_key_and_fields <- sub("={", "= {", bib_just_key_and_fields, fixed = TRUE)
  bib_just_key_and_fields <- sub(" = ", sep, bib_just_key_and_fields, fixed = TRUE)
  used_line_nos <- which(nzchar(bib_just_key_and_fields))
  bib_just_key_and_fields <- bib_just_key_and_fields[used_line_nos]

  x <- line_no <- NULL
  bibDT <- setDT(list(line_no = used_line_nos,
                      x = bib_just_key_and_fields))

  field <- value <- NULL
  bibDT[, c("field", "value") := tstrsplit(x, sep, fixed = TRUE)]

  is_key <- NULL
  bibDT[, is_key := field == "key"]
  
  key_value <- NULL
  bibDT[(is_key), key_value := tolower(sub("^.*\\{", "", value, perl = TRUE))]
  
  if (check.dup.keys && anyDuplicated(stats::na.omit(bibDT[["key_value"]]))) {
    duplicates <- duplicated_rows(bibDT, by = "key_value", na.rm = TRUE)
    if (getOption("TeXCheckR.messages", TRUE)) {
      print(duplicates[, "bib_file" := file.bib])
    }
    .report_error(file = file.bib,
                   line_no = if (!is.null(duplicates[["line_no"]])) first(duplicates[["line_no"]]),
                   context = first(duplicates[["value"]]),
                   error_message = "Duplicate bib key used.",
                   advice = paste0("Compare the two entries above. If they are identical, delete one. ", 
                                   "If they are distinct, choose a different bib key. ", 
                                   "(Note: keys are case-insensitive.)"),
                  halt = halt,
                  rstudio = rstudio)
  }
  
  bibDT[, key_value := NULL]

  key_line <- entry_type <- NULL
  bibDT[(is_key), key_line := value]
  bibDT[, key_line := zoo::na.locf(key_line, na.rm = FALSE)]
  bibDT <- bibDT[(!is_key)]
  bibDT[, x := NULL]

  bibDT[, lapply(.SD, stri_trim_both)]
  bibDT[, key_line := sub(",$", "", key_line, perl = TRUE)]
  bibDT[, c("entry_type", "key") := tstrsplit(key_line, "{", fixed = TRUE)]
  bibDT[, field := tolower(stri_trim_both(field))]
  if (strip.braces) {
    bibDT[, value := gsub("[{}]", "", value, perl = TRUE)]
  }
  bibDT[, value := sub(",$", "", value, perl = TRUE)]

  
  dups <- NULL
  duplicate_fields <-
    bibDT[, .(dups = anyDuplicated(field)), by = key]
  
  if (any(duplicate_fields[["dups"]])){
    keys <-
      duplicate_fields %>%
      .[as.logical(dups)] %>%
      .[["key"]]

    n_keys <- length(keys)

    if (n_keys <= 5L){
      top_keys <- keys
      stop("Duplicate fields found in ", paste0(keys, collapse = " "), ".")
    } else {
      top_keys <- keys[1:5]
      if (n_keys == 6L){
        stop("Duplicate fields found in ", paste0(keys, collapse = " "), ".")
      } else {
        stop("Duplicate fields found in ", paste0(keys, collapse = " "), " and ", n_keys - 5L, " others.")
      }
    }
  }
  
  # Check for unescaped % in fields (except URL)
  if (check.unescaped.percent) {
    unescaped_percent <- 
      bibDT[field != "url"][grep("(?<!(?:\\\\))[%]", value, perl = TRUE)]
    
    if (nrow(unescaped_percent)) {
      orig_file.bib <- read_lines(file.bib)
      first_line_no <- unescaped_percent[1L][["line_no"]]
      first_bad_text <- orig_file.bib[first_line_no]
      
      column <- 
        first_bad_text %>%
        stri_locate_all_regex_no_stri("(?<!(?:\\\\))[%]") %>%
        .[[1L]] %>%
        .[1, "start"] %>%
        as.vector
      
      first_bad_field <- unescaped_percent[1L][["field"]]
      first_bad_key <- unescaped_percent[1L][["key"]]
      
      .report_error(file = file.bib,
                    line_no = first_line_no,
                    column = column,
                    error_message = paste0("Field <", first_bad_field, "> ",
                                           "in entry <", first_bad_key, "> ",
                                           "contains unescaped %."),
                    advice = "Insert a backslash before this %.",
                    context = first_bad_text,
                    halt = halt,
                    caret = 2L,
                    rstudio = rstudio)
    }
  }
  
  bibDT[, .(line_no, entry_type, key, field, value)]

}

#' @rdname bib_parser

bib2DT <- function(file.bib, to_sort = FALSE){
  stopifnot(length(file.bib) == 1L)
    if (!grepl("\\.bib$", file.bib)){
      warning("File extension is not '.bib'.")
    }

    bib <-
      read_lines(file.bib) %>%
      # Avoid testing }\\s+$ rather than just == }
      stri_trim_both %>%
      .[!grepl("@Comment", ., fixed = TRUE)]
    is_at <- substr(bib, 0L, 1L) == "@" #grepl("^@", bib, perl = TRUE)
    is_closing <- bib == "}"
    entry_no <- cumsum(is_at)
  # We make assumptions about the structure
  # Namely that the closing brace for each entry
  # is the only character on its own line.
  n_ats <- sum(is_at)
  n_closing_braces <- sum(is_closing)

  if (n_ats != n_closing_braces){
    cat("@'s: ", n_ats, "\n",
        "}'s: ", n_closing_braces, "\n")
    stop("Unable to parse: Braces to close a bib entry should be on their own line.")
  }

  # All entries must have a nonempty key to be valid
  if (any(grepl("{,", bib[is_at], fixed = TRUE))){
    cat("At line: ", grep("{,", bib[is_at], fixed = TRUE)[[1]])
    stop("All bib entries must have a non-empty key.")
  }

  line_by_entry_no <- cumsum(is_at)
  keys <- gsub("^.*\\{(.*),$", "\\1", bib[is_at], perl = TRUE)

  if (uniqueN(keys) != n_ats){
    stop("Number of unique keys not equal to number of key entries.")
  }

  is_field <-
    !(is_at | is_closing | bib == "")

  extract_field_from <- function(field, from){
    pattern <- sprintf("%s%s%s", "^", field, "\\s+[=]\\s+[{](.*)[}],?$")
    if_else(grepl(pattern, from, perl = TRUE),
            gsub(pattern = pattern,
                 "\\1",
                 from), # perl = TRUE slower
            NA_character_)
  }

  # CRAN NOTE avoidance
  Line_no <- Surname <- Year <- annote <- author <- booktitle <- chapter <- crossref <- edition <-
    editor <- endyear <- howpublished <- institution <- intra_key_line_no <- line_no <-
    note <- number <- organization <- pages <- publisher <- related <- school <- series <- title <-
    type <- volume <- Year_as_date <- Date <- NULL

  bib_orig <- field_name <- NULL

  data.table(line_no = seq_along(bib),
             bib = bib,
             line_by_entry_no = line_by_entry_no,
             is_field = is_field,
             is_closing = is_closing) %>%
    .[, key := if_else(is_at,
                       gsub("^@[A-Za-z]+\\{(.*),$", "\\1", bib, perl = TRUE),
                       NA_character_)] %>%
    .[, key := zoo::na.locf(key, na.rm = FALSE)] %>%
    .[, author := extract_field_from("author", bib)] %>%
    .[, title  := extract_field_from("title", bib)] %>%
    .[, year   := extract_field_from("year", bib)] %>%
    .[, date   := extract_field_from("date", bib)] %>%
    {
      dot <- .
      if (!to_sort){
        out <-
          dot %>%
          .[, address := extract_field_from("address", bib)] %>%
          .[, annote   := extract_field_from("annote", bib)] %>%
          .[, booktitle   := extract_field_from("booktitle", bib)] %>%
          .[, booktitle   := extract_field_from("booktitle", bib)] %>%
          .[, chapter   := extract_field_from("chapter", bib)] %>%
          .[, crossref   := extract_field_from("crossref", bib)] %>%
          .[, options   := extract_field_from("options", bib)] %>%
          .[, related   := extract_field_from("related", bib)] %>%
          .[, edition   := extract_field_from("edition", bib)] %>%
          .[, editor   := extract_field_from("editor", bib)] %>%
          .[, howpublished   := extract_field_from("howpublished", bib)] %>%
          .[, institution   := extract_field_from("institution", bib)] %>%
          .[, month   := extract_field_from("month", bib)] %>%
          .[, note   := extract_field_from("note", bib)] %>%
          .[, number   := extract_field_from("number", bib)] %>%
          .[, organization   := extract_field_from("organization", bib)] %>%
          .[, pages   := extract_field_from("pages", bib)] %>%
          .[, publisher   := extract_field_from("publisher", bib)] %>%
          .[, school   := extract_field_from("school", bib)] %>%
          .[, series   := extract_field_from("series", bib)] %>%
          .[, type   := extract_field_from("type", bib)] %>%
          .[, volume   := extract_field_from("volume", bib)] %>%
          .[, url   := extract_field_from("url", bib)] %>%
          .[, edition   := extract_field_from("edition", bib)]
      } else {
        out <- dot
      }
      out
    } %>%
    # 1999/2015 --> 1999
    .[, date := if_else(grepl("/", date, fixed = TRUE),
                        substr(date, 0, 4),
                        date)] %>%
    .[, Year_as_date := if_else(grepl("[0-9]{4}", year),
                                paste0(year, "-01-01"),
                                year)] %>%
    .[, Date := coalesce(date, Year_as_date)] %>%
    .[, Surname := if_else(!is.na(author),
                           # If protected, just use as-is
                           # recalling that the outer braces have been removed already
                           if_else(grepl("^[{]", author, perl = TRUE),
                                   gsub("[{}]", "", author, perl = TRUE),
                                   # Daley, John
                                   if_else(grepl("^[A-Z][a-z]+,", author, perl = TRUE),
                                           author,
                                           rev_forename_surname_bibtex(author))
                           ),
                           # Sort first
                           NA_character_)] %>%
    .[, intra_key_line_no := seq_len(.N), by = "key"] %>%
    .[, Line_no := line_no] %>%
    .[, field_name := gsub("[^a-z]", "", gsub("[=].*$", "", bib, perl = TRUE), perl = TRUE)] %>%
    .[, field_name := if_else(is_at, "AT", field_name)] %>%
    .[, field_name := if_else(is_closing, "}", field_name)] %>%
    .[, field_name := factor(field_name,
                             levels = c("AT", "author", "title", "year", "date", "address", "annote",
                                        "booktitle", "chapter", "crossref", "options", "related", "edition",
                                        "editor", "howpublished", "institution", "month", "note", "number",
                                        "organization", "pages", "publisher", "school", "series", "type",
                                        "volume", "url", "Year_as_date", "Date", "Surname", "intra_key_line_no",
                                        "Line_no", "}"),
                             ordered = TRUE)] %>%
    .[, bib_orig := bib] %>%
    .[, lapply(.SD, zoo::na.locf, na.rm = FALSE, fromLast = FALSE), by = "key", .SDcols = author:bib_orig] %>%
    .[, lapply(.SD, zoo::na.locf, na.rm = FALSE, fromLast = TRUE) , by = "key", .SDcols = author:bib_orig] %>%
    setorder(Surname, Date, title, field_name, Line_no) %>%
    .[]
}

#' @rdname bib_parser
#' @param outfile.bib File to write the reordered bib to. Defaults to \code{file.bib}.
#' @export reorder_bib
reorder_bib <- function(file.bib, outfile.bib = file.bib){
  out_no <- bib2DT(file.bib, to_sort = TRUE)[["Line_no"]]
  y <- readLines(file.bib, encoding = "UTF-8")
  writeLines(y[out_no], outfile.bib, useBytes = TRUE)
}


enkey <- function(file.bib, outfile.bib = file.bib, placeholder = "PLACEHOLDER") {
  out <- readLines(file.bib, encoding = "UTF-8")
  out <- trimws(out)
  stopifnot(length(placeholder) == 1, is.character(placeholder))
  while (any(grepl(placeholder, out, fixed = TRUE))) {
    placeholder <- paste0(placeholder, "r")
  }
  is_at <- startsWith(out, "@")
  is_invalid_at <- is_at & grepl("\\{,$", out)
  wis_invalid_at <- which(is_invalid_at)
  for (i in wis_invalid_at) {
    out[i] <- sub(",$", paste0(placeholder, i, ","), out[i])
  } 
  writeLines(out, outfile.bib, useBytes = TRUE)
}