bignlp: Fast and Memory-Efficient Annotation of Big Corpora

Documented in corenlp_parse_conll corenlp_parse_json corenlp_parse_xml

#' @title Parse CoreNLP output formats.
#' @description Parse output from CoreNLP.
#' @param x character vector, the JSON string(s) to be parsed
#' @param cols_to_keep columns to keep
#' @param output a destfile
#' @param logfile a character string naming the file to write an error log to;
#'   if provided, JSON strings that cannot be parsed will be written to this
#'   file
#' @param progress logical, whether to show a progress bar when \code{x} has
#'   more than one element
#' @details The JSON results of applying the Stanford CoreNLP annotators can be
#'   written to a streaming JSON file (ndjson format). \code{corenlp_parse_json}
#'   will parse a json string to a data.frame. If output is specified, the
#'   output will be appended to the file provided. If \code{output} is \code{NULL}, a
#'   data.frame is returned. Strings that cannot be parsed are written to the
#'   logfile, if it is defined. If \code{filename} is present, the function will
#'   process one or more files with the output of Stanford CoreNLP in a NDJSON
#'   format. If the argument \code{output} has been defined during initialization, results are
#'   written/appended to that file. Otherwise, a \code{data.frame} is returned.
#' @importFrom data.table fread rbindlist as.data.table setkeyv setorderv
#' @importFrom jsonlite fromJSON
#' @importFrom utils write.table
#' @export corenlp_parse_json
#' @rdname parse
corenlp_parse_json <- function(x, cols_to_keep = c("sentence", "index", "word", "pos", "ner"), output = NULL, logfile = NULL, progress = TRUE){
  # Parse CoreNLP JSON output into a token table.
  #
  # x:            character vector of JSON strings; each string is parsed
  #               separately and the per-string results are row-bound.
  # cols_to_keep: NOTE(review): this argument is only handed on in the
  #               recursive call and is never used to subset columns; all
  #               token columns are returned. Confirm intended behaviour
  #               before applying it, as callers may rely on extra columns.
  # output:       if not NULL, a file the tab-separated result is appended
  #               to (the header is written only if the file does not exist).
  # logfile:      if not NULL, a file that strings which cannot be parsed or
  #               contain no tokens are appended to, one string per line.
  # progress:     if TRUE and length(x) > 1, iterate with pblapply().
  #
  # Returns a data.frame (or NULL if nothing could be parsed) when `output`
  # is NULL; otherwise NULL, invisibly, after writing to `output`.

  # Append a problematic string to the logfile. The trailing newline keeps
  # consecutive entries on separate lines.
  .log_failure <- function(json){
    if (!is.null(logfile)) cat(json, "\n", file = logfile, sep = "", append = TRUE)
  }

  if (length(x) == 0L) return( invisible( NULL ) )

  if (length(x) > 1L){
    .parse <- function(line) corenlp_parse_json(line, cols_to_keep = cols_to_keep, output = output, logfile = logfile, progress = FALSE)
    dfs <- if (progress) pblapply(x, .parse) else lapply(x, .parse)
    if (is.null(output)) return( do.call(rbind, dfs) )
    return( invisible( NULL ) )
  }

  # run the parsing within try - coding issues may cause problems
  json_parsed <- try( jsonlite::fromJSON(x) )
  if (inherits(json_parsed, "try-error")){
    warning("Cannot parse character vector: ", x)
    .log_failure(x)
    return( NULL )
  }

  # Only descend with `[[` if the parsed value is a list / data.frame; a bare
  # JSON scalar or array of scalars would make the extraction fail.
  sentences <- if (is.list(json_parsed)) json_parsed[["sentences"]] else NULL
  tokens <- if (is.list(sentences)) sentences[["tokens"]] else NULL

  # to cope with '{"chunk": 2859285,  "sentences": [ ] }'
  if (length(tokens) == 0L){
    warning("JSON string without tokens: ", x)
    .log_failure(x)
    return( NULL )
  }

  dfs <- lapply(
    seq_along(tokens),
    function(i){
      tok <- tokens[[i]]
      # A sentence with an empty token array is not a data.frame (ncol() would
      # be NULL); skip it, as well as data.frames without rows or columns.
      if (is.data.frame(tok) && ncol(tok) > 0L && nrow(tok) > 0L){
        data.frame(sentence = i, tok)
      } else {
        NULL
      }
    }
  )
  y <- do.call(rbind, dfs)

  if (is.null(output)) return( y )

  # Nothing to write if every sentence was empty; writing NULL would leave an
  # empty header line in the output file.
  if (!is.null(y)){
    target_exists <- file.exists(output)
    write.table(
      y, file = output,
      sep = "\t",
      append = target_exists,
      row.names = FALSE,
      col.names = !target_exists
    )
  }
  invisible( NULL )
}


#' Parse the CoNLL output of CoreNLP.
#' 
#' Read CoNLL output from a file and return a `data.table` with the annotation
#' data.
#' 
#' @details `corenlp_parse_conll` uses `data.table::fread()` and supplies
#'   settings that prevent undesired behaviour. The resulting `data.table` will
#'   have the columns "idx", "word", "lemma", "pos", "ner", "headidx", "deprel",
#'   see the [documentation of the CoNLLOutputter
#'   class](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/pipeline/CoNLLOutputter.html).
#'   
#' @param x A filename, or a `character` vector of filenames. If `x` is a `list`
#'   (of `character` vectors of filenames), it will be unlisted to yield a
#'   `character` vector.
#' @param progress logical, whether to show a progress bar when `x` has more
#'   than one element
#' @importFrom data.table fread rbindlist setcolorder
#' @importFrom jsonlite fromJSON
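#' @return A `data.table` with 8 columns (7 if `x` is CoNLL text rather than
#'   the name of an existing file, in which case **doc_id** is not added):
#'   - **doc_id**: Document id, an integer value taken from the leading digits
#'     of the file name.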
#'   - **idx**: Token Counter, starting at 1 for each new sentence.
#'   - **word**: Word form or punctuation symbol.
#'   - **lemma**: Lemma of word form, or an underscore if not available.
#'   - **pos**: Fine-grained part-of-speech tag, or underscore if not available.
#'   - **ner**: Named Entity tag, or underscore if not available.
#'   - **headidx**: Head of the current token, which is either a value of ID or zero ('0'). This is underscore if not available.
#'   - **deprel**: Dependency relation to the HEAD, or underscore if not available.
#'   
#' Note that Column 1 is generated by bignlp, columns 2-8 map the CoNLL output
#' of CoreNLP; the description of the columns is taken from the [documentation
#' of the CoNLLOutputter
#' class](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/pipeline/CoNLLOutputter.html)
#' @export corenlp_parse_conll
#' @importFrom utils read.table
corenlp_parse_conll <- function(x, progress = TRUE){
  # Read CoNLL output of CoreNLP into a data.table.
  #
  # x:        a filename, a character vector of filenames, or a list of such
  #           vectors (which is unlisted). A single string that is not the
  #           name of an existing file is parsed as tab-separated CoNLL text.
  # progress: if TRUE and length(x) > 1, iterate with pblapply().
  #
  # Returns a data.table with the CoNLL columns; for input read from a file
  # a leading integer column "doc_id" is added, taken from the digits before
  # the first dot of the file name. Returns NULL invisibly for empty input.
  conll_cols <- c("idx", "word", "lemma", "pos", "ner", "headidx", "deprel")

  if (is.list(x)) x <- unlist(x)
  if (length(x) == 0L) return( invisible( NULL ) )

  if (length(x) > 1L){
    .parse <- function(f) corenlp_parse_conll(f, progress = FALSE)
    dts <- if (progress) pblapply(x, .parse) else lapply(x, .parse)
    return(rbindlist(dts))
  }

  if (file.exists(x)){
    dt <- fread(x, na.strings = NULL, blank.lines.skip = TRUE, quote = "", header = FALSE)
    colnames(dt) <- conll_cols
    dt[, "doc_id" := as.integer(sub("^(\\d+)\\..*$", "\\1", basename(x)))]
    setcolorder(dt, neworder = "doc_id")
  } else {
    # read.table() defaults would treat "#" as the start of a comment and the
    # token "NA" as a missing value; both are ordinary tokens in a corpus, so
    # switch these conversions off (in line with the fread() call above).
    dt <- as.data.table(
      read.table(
        text = x, blank.lines.skip = TRUE, header = FALSE, sep = "\t", quote = "",
        comment.char = "", na.strings = character(), stringsAsFactors = FALSE
      )
    )
    colnames(dt) <- conll_cols
  }
  dt
}

#' @rdname parse
#' @export corenlp_parse_xml
#' @importFrom coreNLP loadXMLAnnotation getToken
corenlp_parse_xml <- function(x){
  # Parse CoreNLP XML output: `x` is written to a temporary file, loaded with
  # loadXMLAnnotation(), and the result of getToken() on the loaded
  # annotation is returned.
  xmlfile <- tempfile()
  # Remove the temporary file even if loading the annotation fails.
  on.exit(unlink(xmlfile), add = TRUE)
  cat(x, file = xmlfile)
  a <- loadXMLAnnotation(xmlfile)
  getToken(a)
}