# R/nlp.R

# http://nlp.stanford.edu/software/stanford-corenlp-full-2015-01-29.zip

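# Package-local environment holding the Java objects created by
# initCoreNLP(): the CoreNLP pipeline (cNLP) and the XML outputter (xmlOut).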
volatiles = new.env(parent=emptyenv())

#' Initialize the CoreNLP java object
#'
#' This must be run prior to calling any other CoreNLP
#' functions. It may be called multiple times in order
#' to specify a different parameter set, but note that
#' each distinct configuration used during the same
#' R session must have a uniquely named properties file.
#'
#' @importFrom           rJava .jinit .jaddClassPath .jcall .jnew
#' @param libLoc         a string giving the location of the CoreNLP java
#'                       files. This should point to a directory which
#'                       contains, for example, the file
#'                       "stanford-corenlp-*.jar", where "*" is the version
#'                       number. If missing, the function will try to find the
#'                       library in the environment variable CORENLP_HOME, and
#'                       otherwise will fail.
#' @param type           type of model to load. Ignored if parameterFile is set.
#' @param parameterFile  the path to a parameter file. See the CoreNLP documentation for
#'                       an extensive list of options. If missing, the package will simply
#'                       specify a list of standard annotators and otherwise only use default
#'                       values.
#' @param mem            a string giving the amount of memory to be assigned to the rJava
#'                       engine. For example, "6g" assigns 6 gigabytes of memory. At least
#'                       2 gigabytes are recommended for running the CoreNLP
#'                       package. On a 32-bit machine, where this is not possible, setting
#'                       "1800m" may also work. This option only has an effect the first
#'                       time \code{initCoreNLP} is called, and has no effect if
#'                       the java engine was already started by a separate process.
#'@examples
#'\dontrun{
#'initCoreNLP()
#'sIn <- "Mother died today. Or, maybe, yesterday; I can't be sure."
#'annoObj <- annotateString(sIn)
#'}
#' @export
initCoreNLP = function(libLoc, type = c("english", "english_all", "english_fast", "arabic", "chinese", "french", "german", "spanish"),
      parameterFile = NULL, mem = "4g") {
  # Find location of the CoreNLP Libraries
  if (missing(libLoc)) {
    libLoc = paste0(system.file("extdata",package="coreNLP"),
                    "/stanford-corenlp-full-2015-12-09")
    if (!file.exists(libLoc))
      stop("Please run downloadCoreNLP() in order to install required jar files.")
  }
  if (!file.exists(libLoc) || !file.info(libLoc)$isdir)
    stop("libLoc does not point to an existing directory path")
  path = Sys.glob(paste0(libLoc,"/*.jar"))

  # Start java engine, if not already done, and add to classpath
  options(java.parameters = paste0("-Xmx", mem))
  rJava::.jinit()
  rJava::.jaddClassPath(path)

  # Determine if the corenlp files have been loaded correctly
  len = length(grep("stanford-corenlp-", basename(rJava::.jclassPath())))
  if (len == 0L)
    stop("The coreNLP jar files are were not found in libLoc.")
  if (len < 4L)
    warning("The set of coreNLP jar files may be incomplete. Proceed with caution")

  # Read parameter file and add to classpath
  if (is.null(parameterFile)) {
    type = match.arg(type)
    basepath = system.file("extdata",package="coreNLP")
    if (type == "english") path = sprintf("%s/%s", basepath, "/StanfordCoreNLP.properties")
    if (type == "english_all") path = sprintf("%s/%s", basepath, "/StanfordCoreNLP-english-all.properties")
    if (type == "english_fast") path = sprintf("%s/%s", basepath, "/StanfordCoreNLP-english-fast.properties")
    if (type == "arabic") path = sprintf("%s/%s", basepath, "/StanfordCoreNLP-arabic.properties")
    if (type == "chinese") path = sprintf("%s/%s", basepath, "/StanfordCoreNLP-chinese.properties")
    if (type == "french") path = sprintf("%s/%s", basepath, "/StanfordCoreNLP-french.properties")
    if (type == "german") path = sprintf("%s/%s", basepath, "/StanfordCoreNLP-german.properties")
    if (type == "spanish") path = sprintf("%s/%s", basepath, "/StanfordCoreNLP-spanish.properties")
  } else {
    path = Sys.glob(parameterFile[[1]])
  }
  rJava::.jaddClassPath(dirname(path))

  if (!is.null(volatiles$cNLP))
    rJava::.jcall(volatiles$cNLP, "V", "clearAnnotatorPool")

  volatiles$cNLP = rJava::.jnew("edu.stanford.nlp.pipeline.StanfordCoreNLP", basename(path))
  volatiles$xmlOut = rJava::.jnew("edu.stanford.nlp.pipeline.XMLOutputter")
}

#' Annotate a string of text
#'
#' Runs the CoreNLP annotators over a given string of text. The details
#' for which annotators to run and how to run them are specified in the
#' properties file loaded in via the \code{initCoreNLP} function (which
#' must be run prior to any annotation).
#'
#' @importFrom        rJava .jcall
#' @param text        a vector of strings for which an annotation is desired.
#'                    Will be collapsed to length 1 using new line characters
#'                    prior to the annotation.
#' @param format      the desired output format. Option \code{obj}, the default,
#'                    returns an R object of class \code{annotation} and will
#'                    likely be the desired choice for most users. The \code{xml}
#'                    and \code{text} options exist primarily for subsequently
#'                    saving the output to disk.
#' @param outputFile  character string indicating where to put the output. If
#'                    set to NA, the output will be returned by the function.
#' @param includeXSL  logical. Whether the xml style sheet should be included
#'                    in the output. Only used if format is \code{xml} and
#'                    outputFile is not \code{NA}.
#'@examples
#'\dontrun{
#'initCoreNLP()
#'sIn <- "Mother died today. Or, maybe, yesterday; I can't be sure."
#'annoObj <- annotateString(sIn)
#'}
#' @export
annotateString = function(text, format=c("obj", "xml", "text"), outputFile=NA,
                          includeXSL=FALSE) {
  if (is.null(volatiles$cNLP))
    stop("Must initilize with 'initCoreNLP'!")

  format = match.arg(format)
  if (length(text) > 1L) text = paste(text,collapse="\n")
  if (!is.na(outputFile) && format == "obj") {
    warning("Cannot output 'obj' as text file, saving xml instead.")
    format = "xml"
  }

  anno = rJava::.jcall(volatiles$cNLP, "Ledu/stanford/nlp/pipeline/Annotation;", "process", text)

  # Parse the output
  if (format == "text") {
    rJava::.jcall(volatiles$cNLP, "V", "prettyPrint", anno, .jnew("java.io.PrintWriter", tf <- tempfile()))
    on.exit(file.remove(tf))
    out = readLines(tf)
  } else {
    d = rJava::.jcall(volatiles$xmlOut, "Lnu/xom/Document;", "annotationToDoc", anno, volatiles$cNLP)
    xml = rJava::.jcall(d, "Ljava/lang/String;", "toXML")
    if (format == "xml") out = xml else out = parseAnnoXML(xml)
  }

  # Return the output if asked for; otherwise just write to disk
  if (is.na(outputFile)) return(out)
  outputCon = file(outputFile, "w")
  on.exit(close(outputCon),add=TRUE)
  writeLines(out, outputCon)
  if (includeXSL & format == "xml")
    file.copy(from=paste0(system.file("extdata",package="coreNLP"), "/CoreNLP-to-HTML.xsl"),
      to=dirname(outputFile))

  invisible(outputFile)
}

#' Annotate a text file
#'
#' Runs the CoreNLP annotators for the text contained in a given file.
#' The details for which annotators to run and how to run them are
#' specified in the properties file loaded in via the \code{initCoreNLP}
#' function (which must be run prior to any annotation).
#'
#' @importFrom        rJava .jcall
#' @param file        a string giving the location of the file to be loaded.
#' @param format      the desired output format. Option \code{obj}, the default,
#'                    returns an R object of class \code{annotation} and will
#'                    likely be the desired choice for users loading the output
#'                    into R. The \code{xml} and \code{text} options exist
#'                    primarily for saving the output to disk.
#' @param outputFile  character string indicating where to put the output. If
#'                    set to NA, the output will be returned by the function.
#' @param includeXSL  logical. Whether the xml style sheet should be included
#'                    in the output. Only used if format is \code{xml} and
#'                    outputFile is not \code{NA}.
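#'@examples
#'\dontrun{
#'# A minimal sketch: write the sample text to a temporary file, then
#'# annotate it (assumes initCoreNLP() can locate the CoreNLP jar files).
#'initCoreNLP()
#'sIn <- "Mother died today. Or, maybe, yesterday; I can't be sure."
#'writeLines(sIn, tf <- tempfile(fileext = ".txt"))
#'annoObj <- annotateFile(tf)
#'}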
#' @export
annotateFile = function(file, format=c("obj", "xml", "text"), outputFile=NA,
                            includeXSL=FALSE) {

  if (is.null(volatiles$cNLP))
    stop("Must initilize with 'initCoreNLP'!")

  # Processing inputs
  format = match.arg(format)
  if (!is.na(outputFile) && format == "obj") {
    warning("Cannot output 'obj' as text file, saving xml instead.")
    format = "xml"
  }
  if (is.character(file)) {
    file = file(file, "r")
    on.exit(close(file))
  }

  # Read input into R and annotate
  #   TODO: Use native java interface instead of reading into R.
  text = readLines(file)
  if (length(text) > 1L) text = paste(text,collapse="\n")
  anno = rJava::.jcall(volatiles$cNLP, "Ledu/stanford/nlp/pipeline/Annotation;",
                "process", text)

  # Parse the output
  if (format == "text") {
    rJava::.jcall(volatiles$cNLP, "V", "prettyPrint", anno, .jnew("java.io.PrintWriter", tf <- tempfile()))
    on.exit(file.remove(tf))
    out = readLines(tf)
  } else {
    d = rJava::.jcall(volatiles$xmlOut, "Lnu/xom/Document;", "annotationToDoc", anno, volatiles$cNLP)
    xml = rJava::.jcall(d, "Ljava/lang/String;", "toXML")
    if (format == "xml") out = xml else out = parseAnnoXML(xml)
  }

  # Return the output if asked for; otherwise just write to disk
  if (is.na(outputFile)) return(out)
  outputCon = file(outputFile, "w")
  on.exit(close(outputCon),add=TRUE)
  writeLines(out, outputCon)
  if (includeXSL && format == "xml")
    file.copy(from=paste0(system.file("extdata",package="coreNLP"), "/CoreNLP-to-HTML.xsl"),
      to=dirname(outputFile))

  invisible(outputFile)
}

#' Print a summary of an annotation object
#'
#' @method print annotation
#' @param x    an annotation object
#' @param ...  other arguments. Currently unused.
#'
#'@examples
#'print(annoEtranger)
#'
#' @export
print.annotation = function(x, ...) {
  cat("\nA CoreNLP Annotation:\n")
  cat("  num. sentences:", x$token[nrow(x$token),1], "\n")
  cat("  num. tokens:", nrow(x[[1]]), "\n")
  cat("\n")
  invisible(x)
}

#' Get tokens as data frame
#'
#' Returns a data frame of the tokens from an annotation
#' object.
#'
#' @param annotation    an annotation object
#'
#'@examples
#'getToken(annoEtranger)
#'
#' @export
getToken = function(annotation) {
  annotation$token
}

#' Get parse tree as character vector
#'
#' Returns a character vector of the parse trees.
#' Mostly used for visualization; the output of
#' \code{\link{getToken}} will generally be more
#' convenient for manipulating in R.
#'
#' @param annotation    an annotation object
#'
#'@examples
#'getParse(annoEtranger)
#'
#' @export
getParse = function(annotation) {
  annotation$parse
}

#' Get Dependencies
#'
#' Returns a data frame of the dependencies from an annotation. The added
#' columns \code{govIndex} and \code{depIndex} give the row numbers in the
#' token table of the governor and the dependent of each relation.
#'
#' @param annotation    an annotation object
#' @param type          the type of dependency representation desired
#'
#'@examples
#'getDependency(annoEtranger)
#'getDependency(annoHp)
#'
#' @export
getDependency = function(annotation, type=c("CCprocessed","basic","collapsed")) {
  type = match.arg(type)
  dep <- if (type == "basic") annotation$basicDep
  else if (type == "collapsed") annotation$collapsedDep
  else if (type == "CCprocessed") annotation$collapsedProcDep

  dep$govIndex = match(paste0(dep$sentence,"-",dep$governorIdx),
                     paste0(annotation$token$sentence,"-",annotation$token$id))
  dep$depIndex = match(paste0(dep$sentence,"-",dep$dependentIdx),
                     paste0(annotation$token$sentence,"-",annotation$token$id))
  dep
}

#' Get Sentiment scores
#'
#' Returns a data frame of the sentiment scores from an annotation
#'
#' @param annotation    an annotation object
#'
#'@examples
#'getSentiment(annoEtranger)
#'getSentiment(annoHp)
#'
#' @export
getSentiment = function(annotation) {
  annotation$sentiment
}

#' Get Coreference
#'
#' Returns a data frame containing all coreferences detected in the text.
#'
#' @param annotation    an annotation object
#'
#'@examples
#'getCoreference(annoHp)
#'
#' @export
getCoreference = function(annotation) {
  coref = annotation$coref[,-grep("text",names(annotation$coref))]
  coref$startIndex = match(paste0(coref$sentence,"-",coref$start),
                     paste0(annotation$token$sentence,"-",annotation$token$id))
  coref$endIndex = match(paste0(coref$sentence,"-",as.numeric(coref$end)-1),
                     paste0(annotation$token$sentence,"-",annotation$token$id))
  coref
}

#' Get OpenIE
#'
#' Returns a data frame containing all OpenIE triples.
#'
#' @param annotation    an annotation object
#'
#'@examples
#'getOpenIE(annoHp)
#'
#' @export
getOpenIE = function(annotation) {
  annotation$openie
}

#' Load CoreNLP XML file
#'
#' Loads a properly formatted XML file output by the CoreNLP
#' library into an \code{annotation} object in R.
#'
#' @param file      connection or character string giving the file name to load
#' @param encoding  encoding to be assumed for input strings.  It is used to mark
#'                  character strings as known to be in Latin-1 or UTF-8: it is
#'                  not used to re-encode the input. Passed to \code{readLines}.
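#'@examples
#'\dontrun{
#'# A minimal sketch: round-trip an annotation through an XML file on disk
#'# (assumes initCoreNLP() has been run).
#'initCoreNLP()
#'xmlFile <- tempfile(fileext = ".xml")
#'annotateString("Mother died today.", format = "xml", outputFile = xmlFile)
#'anno <- loadXMLAnnotation(xmlFile)
#'}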
#' @export
loadXMLAnnotation = function(file, encoding="unknown") {
  if (is.character(file)) {
    file = file(file, "r")
    on.exit(close(file))
  }
  x = readLines(file)
  tryCatch(output <- parseAnnoXML(x),
           error = function(e) stop("Not a valid CoreNLP XML file."))
  output
}

#' Convert Penn TreeBank POS to Universal Tagset
#'
#' Maps a character vector of English Penn TreeBank
#' part-of-speech tags into the universal tagset
#' codes. This provides a reduced set of tags (12), and
#' a better cross-linguistic model of parts of speech.
#'
#' @param pennPOS   a character vector of penn tags to match
#'
#'@examples
#'tok <- getToken(annoEtranger)
#'cbind(tok$POS,universalTagset(tok$POS))
#'
#' @export
universalTagset = function(pennPOS) {
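  # Two-column lookup table: column one holds the Penn TreeBank tag and
  # column two the corresponding universal tag; unmatched tags become "X".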
  mtab = structure(c("!", "#", "$", "''", "(", ")", ",", "-LRB-",
      "-RRB-",  ".", ":", "?", "CC", "CD", "CD|RB", "DT", "EX", "FW", "IN",
      "IN|RP",  "JJ", "JJR", "JJRJR", "JJS", "JJ|RB", "JJ|VBG", "LS", "MD", "NN",
      "NNP", "NNPS", "NNS", "NN|NNS", "NN|SYM", "NN|VBG", "NP", "PDT",  "POS", "PRP",
      "PRP$", "PRP|VBP", "PRT", "RB", "RBR", "RBS", "RB|RP",  "RB|VBG", "RN", "RP", "SYM",
      "TO", "UH", "VB", "VBD", "VBD|VBN",  "VBG", "VBG|NN", "VBN", "VBP", "VBP|TO",
      "VBZ", "VP", "WDT",  "WH", "WP", "WP$", "WRB", "``", ".", ".", ".", ".", ".",
      ".",  ".", ".", ".", ".", ".", ".", "CONJ", "NUM", "X", "DET", "DET",  "X",
      "ADP", "ADP", "ADJ", "ADJ", "ADJ", "ADJ", "ADJ", "ADJ",  "X", "VERB", "NOUN",
      "NOUN", "NOUN", "NOUN", "NOUN", "NOUN",  "NOUN", "NOUN", "DET", "PRT", "PRON",
      "PRON", "PRON", "PRT",  "ADV", "ADV", "ADV", "ADV", "ADV", "X", "PRT", "X",
      "PRT", "X",  "VERB", "VERB", "VERB", "VERB", "VERB", "VERB", "VERB", "VERB",
      "VERB", "VERB", "DET", "X", "PRON", "PRON", "ADV", "."), .Dim = c(68L,  2L))

  index = match(pennPOS, mtab[,1])
  output = rep("X", length(pennPOS))
  output[!is.na(index)] = mtab[index[!is.na(index)],2]
  output
}

#' Parse annotation xml
#'
#' Returns an annotation object from a character vector containing
#' the xml. Not exported; use \code{loadXMLAnnotation} instead.
#'
#' @importFrom   XML xmlRoot xmlParse xmlToDataFrame xmlChildren xmlAttrs xmlValue
#' @param xml    character vector containing the xml file from an annotation
parseAnnoXML = function(xml) {
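  # The XML produced by CoreNLP's XMLOutputter has the (abridged) layout
  #   <root><document><sentences>...</sentences><coreference>...</coreference></document></root>
  # so the first child of <document> holds the sentences and the second
  # holds the coreference chains.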
  xml = XML::xmlRoot(XML::xmlParse(xml))[[1]]
  coref = xml[[2]]
  xml = xml[[1]]
  sentences = XML::xmlChildren(xml)

  out = list(token = NULL,parse = NULL,basicDep = NULL,collapsedDep = NULL,
              collapsedProcDep = NULL, coref = NULL, openie = NULL)

  if (length(sentences)==0L) {
    class(out) = "annotation"
    return(out)
  }

  tokenNames = c("sentence", "id", "word", "lemma", "CharacterOffsetBegin",
                 "CharacterOffsetEnd", "POS", "NER", "Speaker")
  depNames = c("sentence", "governor", "dependent", "type", "governorIdx",
               "dependentIdx")
  corefNames = c("corefId", "sentence", "start", "end", "head", "text")
  sentNames = c("id", "sentimentValue", "sentiment")
  openieNames = c("subject_start", "subject_end", "subject",
                  "relation_start", "relation_end", "relation",
                  "object_start", "object_end", "object")

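  # Walk each <sentence> node, pulling out tokens, the parse tree, the three
  # dependency representations, OpenIE triples, and sentiment attributes.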
  for (i in seq_along(sentences)) {
    sent = sentences[[i]]
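    # One row per token: xmlToDataFrame turns each <token> node into a row,
    # and xmlAttrs recovers its id attribute.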
    df = data.frame(sentence=i, id=sapply(XML::xmlChildren(sent[[1L]]),function(v) XML::xmlAttrs(v)[1]),
                    XML::xmlToDataFrame(sent[[1]], stringsAsFactors=FALSE), stringsAsFactors=FALSE)

    index = match(tokenNames, names(df))
    if (length(index) != ncol(df)) df = df[,index[!is.na(index)]]
    if (any(is.na(index))) df = fillDF(df, tokenNames)
    df = df[,tokenNames]

    out$token = rbind(out$token, df)

    elementNames <- names(XML::xmlChildren(sent))
    elementAttributes <- sapply(XML::xmlChildren(sent),
        function(v) {v <- as.character(XML::xmlAttrs(v)['type']); if (length(v)) v else "" })
    names(elementAttributes) <- NULL

    parseIndex <- which(elementNames == "parse" & elementAttributes == "")[1]
    basicDepIndex <- which(elementNames == "dependencies" & elementAttributes == "basic-dependencies")[1]
    collapsedDepIndex <- which(elementNames == "dependencies" & elementAttributes == "collapsed-dependencies")[1]
    collapsedProcDepIndex <- which(elementNames == "dependencies" & elementAttributes == "collapsed-ccprocessed-dependencies")[1]
    openieIndex <- which(elementNames == "openie" & elementAttributes == "")[1]
    machineReadingIndex <- which(elementNames == "MachineReading" & elementAttributes == "")[1]

    if (!is.na(parseIndex))
      out$parse = c(out$parse, XML::xmlValue(sent[[parseIndex]]))

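    # Each dependency table stores the relation type (a <dep> attribute)
    # together with the governor and dependent words and their token indices.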
    if (!is.na(basicDepIndex) && length(XML::xmlToDataFrame(sent[[basicDepIndex]]))) {
      df = data.frame(sentence=i, XML::xmlToDataFrame(sent[[basicDepIndex]],stringsAsFactors=FALSE),
                type=sapply(XML::xmlChildren(sent[[basicDepIndex]]),function(v) XML::xmlAttrs(v)[[1]]),
                governorIdx=sapply(XML::xmlChildren(sent[[basicDepIndex]]), function(v) XML::xmlAttrs(v[[1]])[1]),
                dependentIdx=sapply(XML::xmlChildren(sent[[basicDepIndex]]), function(v) XML::xmlAttrs(v[[2]])[1]),
                stringsAsFactors=FALSE)

      index = match(depNames, names(df))
      if (length(index) != ncol(df)) df = df[,index[!is.na(index)]]
      if (any(is.na(index))) df = fillDF(df, depNames)

      out$basicDep = rbind(out$basicDep, df)
    }

    if (!is.na(collapsedDepIndex) && length(XML::xmlToDataFrame(sent[[collapsedDepIndex]]))) {
      df = data.frame(sentence=i, XML::xmlToDataFrame(sent[[collapsedDepIndex]],stringsAsFactors=FALSE),
                type=sapply(XML::xmlChildren(sent[[collapsedDepIndex]]),function(v) XML::xmlAttrs(v)[[1]]),
                governorIdx=sapply(XML::xmlChildren(sent[[collapsedDepIndex]]), function(v) XML::xmlAttrs(v[[1]])[1]),
                dependentIdx=sapply(XML::xmlChildren(sent[[collapsedDepIndex]]), function(v) XML::xmlAttrs(v[[2]])[1]),
                stringsAsFactors=FALSE)

      index = match(depNames, names(df))
      if (length(index) != ncol(df)) df = df[,index[!is.na(index)]]
      if (any(is.na(index))) df = fillDF(df, depNames)

      out$collapsedDep = rbind(out$collapsedDep, df)
    }

    if (!is.na(collapsedProcDepIndex) && length(XML::xmlToDataFrame(sent[[collapsedProcDepIndex]]))) {
      df = data.frame(sentence=i, XML::xmlToDataFrame(sent[[collapsedProcDepIndex]],stringsAsFactors=FALSE),
                type=sapply(XML::xmlChildren(sent[[collapsedProcDepIndex]]),function(v) XML::xmlAttrs(v)[[1]]),
                governorIdx=sapply(XML::xmlChildren(sent[[collapsedProcDepIndex]]), function(v) XML::xmlAttrs(v[[1]])[1]),
                dependentIdx=sapply(XML::xmlChildren(sent[[collapsedProcDepIndex]]), function(v) XML::xmlAttrs(v[[2]])[1]),
                stringsAsFactors=FALSE)

      index = match(depNames, names(df))
      if (length(index) != ncol(df)) df = df[,index[!is.na(index)]]
      if (any(is.na(index))) df = fillDF(df, depNames)

      out$collapsedProcDep = rbind(out$collapsedProcDep, df)
    }

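    # Each child of <openie> is a triple; its subject, relation, and object
    # nodes carry span attributes plus a text value.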
    if (!is.na(openieIndex) && length(XML::xmlToDataFrame(sent[[openieIndex]]))) {

      these <- XML::xmlChildren(sent[[openieIndex]])

      ids <-
      cbind(t(sapply(these, function(v) XML::xmlAttrs(v[[1]]))),
            sapply(these, function(v) XML::xmlValue(v[[1]][[1]])),
            t(sapply(these, function(v) XML::xmlAttrs(v[[2]]))),
            sapply(these, function(v) XML::xmlValue(v[[2]][[1]])),
            t(sapply(these, function(v) XML::xmlAttrs(v[[3]]))),
            sapply(these, function(v) XML::xmlValue(v[[3]][[1]])))

      ids <- data.frame(ids, stringsAsFactors=FALSE, row.names=NULL)
      colnames(ids) <- openieNames

      out$openie = rbind(out$openie, ids)
    }

    # if (!is.na(machineReadingIndex) && length(XML::xmlToDataFrame(sent[[machineReadingIndex]]))) {

    #   these <- XML::xmlChildren(sent[[machineReadingIndex]])

    #   ids <-
    #   cbind(t(sapply(these, function(v) XML::xmlAttrs(v[[1]]))),
    #         sapply(these, function(v) XML::xmlValue(v[[1]][[1]])),
    #         t(sapply(these, function(v) XML::xmlAttrs(v[[2]]))),
    #         sapply(these, function(v) XML::xmlValue(v[[2]][[1]])),
    #         t(sapply(these, function(v) XML::xmlAttrs(v[[3]]))),
    #         sapply(these, function(v) XML::xmlValue(v[[3]][[1]])))

    #   ids <- data.frame(ids, stringsAsFactors=FALSE, row.names=NULL)
    #   colnames(ids) <- openieNames

    #   out$mr = rbind(out$mr, ids)
    # }

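    # Sentence-level sentiment is stored as attributes on the <sentence>
    # node itself (id, sentimentValue, sentiment).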
    sm = XML::xmlAttrs(sent)
    df = data.frame(matrix(sm,nrow=1),stringsAsFactors=FALSE)
    names(df) = names(sm)

    index = match(sentNames, names(df))
    if (length(index) != ncol(df)) df = df[,index[!is.na(index)],drop=FALSE]
    if (any(is.na(index))) df = fillDF(df, sentNames)

    if (nrow(df)) out$sentiment = rbind(out$sentiment, df)
  }

  if (!is.null(out$token)) {
    if (!is.na(index <- match("word", names(out$token))[1]))
      names(out$token)[index] = "token"
    if (sum(!is.na((index <- match(c("CharacterOffsetBegin","CharacterOffsetEnd"),names(out$token))))))
      out$token[,index] = apply(out$token[,index,drop=FALSE],2,as.integer)
  }

  if (!is.null(out$basicDep)) {
    if (sum(!is.na((index <- match(c("governorIdx","dependentIdx"),names(out$basicDep))))))
      out$basicDep[,index] = apply(out$basicDep[,index,drop=FALSE],2,as.integer)
  }

  if (!is.null(out$collapsedDep)) {
    if (sum(!is.na((index <- match(c("governorIdx","dependentIdx"),names(out$collapsedDep))))))
      out$collapsedDep[,index] = apply(out$collapsedDep[,index,drop=FALSE],2,as.integer)
  }

  if (!is.null(out$collapsedProcDep)) {
    if (sum(!is.na((index <- match(c("governorIdx","dependentIdx"),names(out$collapsedProcDep))))))
      out$collapsedProcDep[,index] = apply(out$collapsedProcDep[,index,drop=FALSE],2,as.integer)
  }

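  # Each child of <coreference> is one chain of mentions; every mention in
  # a chain shares the same corefId.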
  if (!is.null(coref)) {
    coref = XML::xmlChildren(coref)
    for (corefId in seq_along(coref)) {
      df = data.frame(corefId=corefId, XML::xmlToDataFrame(coref[[corefId]], stringsAsFactors=FALSE),
                      stringsAsFactors=FALSE)

      index = match(corefNames, names(df))
      if (length(index) != ncol(df)) df = df[,index[!is.na(index)]]
      if (any(is.na(index))) df = fillDF(df, corefNames)

      out$coref = rbind(out$coref, df)
    }
  }

  if (!is.null(out$sentiment)) {
    if (sum(!is.na(index <- match(c("id","sentimentValue"),names(out$sentiment)))))
      out$sentiment[,index[!is.na(index)]] = apply(out$sentiment[,index[!is.na(index)],drop=FALSE],2,as.integer)
  }

  class(out) = "annotation"
  out
}

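# Internal helper: add any columns named in nameVec that are missing from df,
# filled with NA, so that rbind across sentences stays aligned.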
fillDF = function(df, nameVec) {
  if (nrow(df) == 0L) return(df)
  index = match(nameVec, names(df))
  df[,nameVec[is.na(index)]] = NA
  return(df)
}
