R/utils.R

Defines functions is_docx .onLoad check_dup_sections header_to_bold get_pdf_pagenumber clean_string find_pages process_pdf evaluate_inline print.revise_manuscript extract_refs read_md checksum_exists.revise_manuscript checksum_exists.revise_corpus checksum_exists

Documented in extract_refs read_md

checksum_exists <- function(corpus, checksum){
  UseMethod("checksum_exists", corpus)
}
#' @export
checksum_exists.revise_corpus <- function(corpus, checksum){
  all_csums <- sapply(corpus, `[[`, "checksum")
  return(isTRUE(checksum %in% all_csums))
}

#' @export
checksum_exists.revise_manuscript <- function(corpus, checksum){
  return(isTRUE(checksum == corpus[["checksum"]]))
}

#' read_md
#'
#' Read in a markdown document for further analysis
#' @param path to markdown manuscript

read_md <- function(path){
  unname(unlist(utils::read.delim(path, sep = "\n", quote = "", header = FALSE)))
}

#' extract_refs
#'
#' Extract table and figure references
#' @param path to markdown manuscript

extract_refs <- function(path){
  lines <- read_md(path)
  lines <- paste(lines, collapse = "\n")
  references <- unlist(stringr::str_extract_all(lines, "\\\\\\@ref.+?\\)"))
  figures <- data.frame(text = unique(grep("fig:",references, value = TRUE)))
  tables <- data.frame(text = unique(grep("tab:",references, value = TRUE)))
  figures$ref <- seq_along(figures[,1])
  tables$ref <- seq_along(tables[,1])
  list(figures = figures, tables = tables)

}


#' @export
#' @importFrom cli cli_h2
print.revise_manuscript = function(x, ...){
  cli::cli_h2("<Manuscript>")
  cli_msg("*" = "{length(x$sec)} section{c('s', '')[(length(x$sec) == 1)+1L]}")
  if(is.null(x$PDF)){
    cli_msg("x" = "No PDF attached")
  }else{
    cli_msg("*" = "{nrow(x$PDF)} pages")
  }
}

# evaluate_inline
#
# Evaluates embedded rchunks within a string of text
# @param string a section of text with includes inline elements
# @examples
# revise:::evaluate_inline("1+1 = `r 1+1`")

evaluate_inline <- function(string){
  glue::glue(string, .open = "`r ", .close = "`")
}

# process_pdf
#
# process pdf file
# @param path path to pdf

process_pdf <- function(path){
  ext <- tolower(tools::file_ext(path))
  if(!ext %in% c("pdf")){
    stop("Extracting page numbers only works for pdf documents")
  }
  doc <- get_pdf_text(path)
  running_head = gsub("\n.*","",doc$text[2])
  running_head = trimws(gsub("[0-9]","",running_head))
  running_head = gsub("\\s{1,}"," ", running_head)
  is_head <- unlist(lapply(doc$text, function(x) grepl(running_head,x)))
  prop_head <- prop.table(table(is_head))["TRUE"]

  doc$text <- tolower(doc$text)
  if(!is.null(running_head)){
    doc$text <- stringr::str_remove_all(doc$text, tolower(glue::glue("^{running_head}")))
  }

  doc <-  dplyr::summarise(
    dplyr::group_by(doc, page_id),
    text = paste(text, collapse = " "),
    .groups = "drop"
  )

  doc$text <- trimws(stringr::str_remove_all(doc$text, "[[:digit:]]")) # remove all numbers
  doc$text <- stringr::str_remove_all(doc$text, "\\[.{0,50}\\]") # remove square brackets
  doc$text <- stringr::str_remove_all(doc$text, "\\(.{0,50}\\)") # remove parentheses
  doc$text <- trimws(stringr::str_remove_all(doc$text, "[[:punct:]]")) # remove all punctuation
  if(prop_head > .7){
    attr(doc, "running_head") <- running_head
  }else{
    attr(doc, "running_head") <- NULL
  }
  doc
}

# find_pages
#
# Searches through a manuscript for a page reference
# @param manuscript the manuscript object to search
# @param string the string to find
# @export

find_pages <- function(manuscript, string){
  if(is.null(manuscript$PDF)) stop("No PDF attached to the manuscript object: not possible to identify page numbers.")
  get_pdf_pagenumber(string, pdf_text = manuscript$PDF)
}

# clean_string
#
# Cleaning steps for identifying strings
# @param string string to clean

clean_string <- function(string){
  string <- gsub("\\[.{0,50}\\]","",string) # remove square brackets
  string <- gsub("\\*|\\#", "", string) # remove rmarkdown formatting
  string <- gsub("[0-9]","", string) # remove numbers
  string <- gsub("\\(.{0,50}\\)", "", tolower(string)) # remove parentheses
  string <- gsub("[[:punct:]]", "", tolower(string)) # remove parentheses
  string <- gsub("\\s{2,}", " ", string) # remove additional spaces
  string <- gsub("\\n", " ", string)
  string
}

# get_page_number
#
# Finds page number from pdf based on text matching
# @param string text to match
# @param pdf_text text to search
# @param max.distance argument passed to agrep

get_pdf_pagenumber = function(string, pdf_text, max.distance = .15){

  string <- clean_string(string)

  doc <- clean_string(pdf_text$text)

  pnum <- agrep(string, doc, ignore.case = TRUE, max.distance = max.distance)

  if(length(pnum) > 0) return(paste(pnum, collapse = ", "))

  l <- lapply(seq_len(length(doc)), function(p){ # look at combinations of pages if no match

    pages = sapply(c(doc[p], doc[p + 1]), function(x){
      tolower(unlist(x))
    })

    pages <- lapply(seq_along(pages), function(i){
      page <- pages[[i]]
      page <- gsub("\\\n", " ", page)
      page <- gsub(r"(\s{2,})", " ", page)
      page <- trimws(page)
      page
    })

    data.frame(
      page_id = glue::glue("{p}-{p + 1}"),
      text = trimws(paste(pages, collapse = " "))
    )
  }) # ---

  doc <- do.call(rbind, l)
  pnum <- agrep(string, doc$text, ignore.case = TRUE, max.distance = max.distance)
  doc$page_id[pnum]

}

# header_to_bold
#
# Converts headers to bold
# @param string a string
# @return string

header_to_bold = function(string){

  while(grepl("(?<!\\{)#{1,}.{0,100}\\\n",string, perl = TRUE)){
    target <- stringr::str_extract(string, "(?<!\\{)#{1,}.+?(\\n){1,}")
    n_hash <- sum(strsplit(target, split = "")[[1]]=="#")
    replacement <- gsub("(?<!\\{)#{1,}\\s?","**", target, perl = TRUE)
    if(n_hash < 3){
      replacement <- gsub("\\n{1,}","**\\\n\\\n", replacement)
    } else{
      replacement <- gsub("\\n{1,}",".** ", replacement)
    }
    string <- gsub(target, replacement, string)
  }

  string

}


utils::globalVariables(c("text", "page_id", ".revise_manuscripts"))

check_dup_sections <- function(sect_nams, revise_errors = getOption("revise_errors")){
  if(any(duplicated(sect_nams))){
    m <- paste("The following sections have duplicate names. When referencing sections by name, the first section with that name will be selected. Please use unique section names:\n", paste0("  '", sect_nams[duplicated(sect_nams)], "'\n"))

    if(!revise_errors){
    warning(m, call. = FALSE)
    }else{
      stop(m, call. = FALSE)
    }

  }
}

.onLoad <- function(libname, pkgname){

  if(is.null(getOption("revise_errors"))){
    options(revise_errors = TRUE)
  }

  if(is.null(getOption("reviewer_chunkname"))){
    options(reviewer_chunkname = c("asis", "reviewer"))
  }

}

# path
#
# Determines whether a path is to a .doc or .docx
# @param path to file

is_docx <- function(path) {
  ext <- tolower(tools::file_ext(path))
  ext %in% c("doc", "docx")
}

Try the revise package in your browser

Any scripts or data that you put into this service are public.

revise documentation built on April 3, 2025, 11:47 p.m.