R/candidate_sections.R

Defines functions candidate_sections

#' Detect candidate section subheadings
#'
#' @description Detect potential candidate subheadings from a corpus
#' of line-separated full texts. Each text is first trimmed to the span
#' running from the first line containing "Abstract" to the last line
#' containing "Reference" (whichever of the two markers is present). Lines
#' shorter than `length` characters are then kept, stripped of
#' non-alphanumeric characters and deduplicated across all texts.
#' @param text An input text stored as a character vector split by
#' lines, or a list of such character vectors (one element per text).
#' @param length Maximum character length for potential candidate
#' subheadings; only lines strictly shorter than this are kept.
#' Default value is 40.
#' @return A sorted character vector of unique candidate subheadings.
#' Empty strings and entries consisting only of digits are dropped.
#' @export
#' @examples
#' dois <- c('10.1186/s13750-021-00219-x',
#'           '10.1186/s13750-018-0116-4',
#'           '10.1186/s13750-018-0144-0',
#'           '10.1186/s13750-017-0113-z')
#' text <- doi2html(doi = dois[1])
#' secs <- candidate_sections(text)
#' secs
#' texts <- lapply(dois, doi2html)
#' secs <- candidate_sections(texts)
#' secs
candidate_sections <- function(text,
                               length = 40) {

  # The `length` argument shadows base::length() as a *variable* only; calls
  # such as length(x) still resolve to the base function because R skips
  # non-function bindings when looking up a function name. Copy the value to
  # an unambiguous local name anyway.
  max_chars <- length

  # Trim one document (a character vector of lines) to the span
  # "first Abstract line" -> "last Reference line".
  shorten <- function(lines) {
    n_lines <- length(lines)
    if (n_lines == 0L) {
      # Nothing to trim; also avoids the 1:0 index pitfall.
      return(lines)
    }

    # Use grep() + a hit count instead of `if (grepl(...))`: a vector-valued
    # `if` condition is an error in R >= 4.2 and only inspected the first
    # line in earlier versions.
    abstract_hits <- grep("Abstract", lines, fixed = TRUE)
    reference_hits <- grep("Reference", lines, fixed = TRUE)

    start <- if (length(abstract_hits) > 0L) min(abstract_hits) else 1L
    end <- if (length(reference_hits) > 0L) max(reference_hits) else n_lines

    # If every "Reference" line precedes the abstract, start:end would be a
    # reversed range selecting the text *before* the abstract; keep
    # everything from the abstract onwards instead.
    if (end < start) {
      end <- n_lines
    }

    lines[start:end]
  }

  # Keep only the lines strictly shorter than `max_chars` characters.
  short_lines <- function(lines) {
    trimmed <- shorten(lines)
    trimmed[nchar(trimmed) < max_chars]
  }

  # A list is treated as several documents, anything else as a single one.
  if (is.list(text)) {
    candidates <- unlist(lapply(text, short_lines))
  } else {
    candidates <- short_lines(text)
  }

  # Tidy output: drop every character that is not alphanumeric or a space,
  # then trim surrounding whitespace.
  cleaned <- trimws(gsub("[^[:alnum:] ]", "", candidates))

  # Keep entries containing at least one non-digit character. This excludes
  # digit-only entries as well as empty strings and NA values.
  cleaned <- cleaned[grepl("\\D", cleaned)]

  # Deduplicate across all texts and return in sorted order.
  sort(unique(cleaned))
}


candidate_sections(text)
ESHackathon/doi2txt documentation built on Dec. 17, 2021, 5:39 p.m.