R/nlp.R

# http://nlp.stanford.edu/software/stanford-corenlp-full-2015-12-09.zip

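# package-local environment holding the initialized Java pipeline objects
# (the annotator and the XML outputter) between calls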
volatiles = new.env(parent=emptyenv())

#' Initialize the CoreNLP java object
#'
#' This must be run prior to calling any other CoreNLP
#' functions. It may be called multiple times in order
#' to specify a different parameter set, but note that
#' if you use a different configuration during the same
#' R session it must have a unique name.
#'
#' @importFrom           rJava .jinit .jaddClassPath .jcall .jnew
#' @param libLoc         a string giving the location of the CoreNLP java
#'                       files. This should point to a directory which
#'                       contains, for example, the file
#'                       "stanford-corenlp-*.jar", where "*" is the
#'                       version number. If missing, the function will try to find the
#'                       library in the environment variable CORENLP_HOME, and
#'                       will otherwise fail.
#' @param type           type of model to load. Ignored if parameterFile is set.
#' @param parameterFile  the path to a parameter file. See the CoreNLP documentation for
#'                       an extensive list of options. If missing, the package will simply
#'                       specify a list of standard annotators and otherwise only use default
#'                       values.
#' @param mem            a string giving the amount of memory to be assigned to the rJava
#'                       engine. For example, "6g" assigns 6 gigabytes of memory. At least
#'                       2 gigabytes are recommended for running the CoreNLP
#'                       package. On a 32bit machine, where this is not possible, setting
#'                       "1800m" may also work. This option only has an effect the first
#'                       time \code{initCoreNLP} is called, and will have no effect if
#'                       the java engine has already been started by a separate process.
#'@examples
#'\dontrun{
#'initCoreNLP()
#'sIn <- "Mother died today. Or, maybe, yesterday; I can't be sure."
#'annoObj <- annotateString(sIn)
#'}
#' @export
initCoreNLP = function(libLoc, type = c("english", "english_all", "english_fast", "arabic", "chinese", "french", "german", "spanish"),
      parameterFile = NULL, mem = "4g") {
  # Find location of the CoreNLP Libraries
  if (missing(libLoc)) {
    libLoc = paste0(system.file("extdata",package="coreNLP"),
                    "/stanford-corenlp-full-2015-12-09")
    if (!file.exists(libLoc))
      stop("Please run downloadCoreNLP() in order to install required jar files.")
  }
  if (!file.exists(libLoc) || !file.info(libLoc)$isdir)
    stop("libLoc does not point to an existing directory path")
  path = Sys.glob(paste0(libLoc,"/*.jar"))

  # Start java engine, if not already done, and add to classpath
  options(java.parameters = paste0("-Xmx", mem))
  rJava::.jinit()
  rJava::.jaddClassPath(path)

  # Determine if the corenlp files have been loaded correctly
  len = length(grep("stanford-corenlp-", basename(rJava::.jclassPath())))
  if (len == 0L)
    stop("The coreNLP jar files are were not found in libLoc.")
  if (len < 4L)
    warning("The set of coreNLP jar files may be incomplete. Proceed with caution")

  # Read parameter file and add to classpath
  if (is.null(parameterFile)) {
    type = match.arg(type)
    basepath = system.file("extdata", package="coreNLP")
    propFile = switch(type,
      english      = "StanfordCoreNLP.properties",
      english_all  = "StanfordCoreNLP-english-all.properties",
      english_fast = "StanfordCoreNLP-english-fast.properties",
      arabic       = "StanfordCoreNLP-arabic.properties",
      chinese      = "StanfordCoreNLP-chinese.properties",
      french       = "StanfordCoreNLP-french.properties",
      german       = "StanfordCoreNLP-german.properties",
      spanish      = "StanfordCoreNLP-spanish.properties")
    path = file.path(basepath, propFile)
  } else {
    path = Sys.glob(parameterFile[[1]])
  }
  rJava::.jaddClassPath(dirname(path))

  if (!is.null(volatiles$cNLP))
    rJava::.jcall(volatiles$cNLP, "V", "clearAnnotatorPool")

  volatiles$cNLP = rJava::.jnew("edu.stanford.nlp.pipeline.StanfordCoreNLP", basename(path))
  volatiles$xmlOut = rJava::.jnew("edu.stanford.nlp.pipeline.XMLOutputter")
}

#' Annotate a string of text
#'
#' Runs the CoreNLP annotators over a given string of text. The details
#' for which annotators to run and how to run them are specified in the
#' properties file loaded in via the \code{initCoreNLP} function (which
#' must be run prior to any annotation).
#'
#' @importFrom        rJava .jcall
#' @param text        a vector of strings for which an annotation is desired.
#'                    Will be collapsed to length 1 using newline characters
#'                    prior to the annotation.
#' @param format      the desired output format. Option \code{obj}, the default,
#'                    returns an R object of class \code{annotation} and will
#'                    likely be the desired choice for most users. The \code{xml}
#'                    and \code{text} options exist primarily for subsequently
#'                    saving the output to disk.
#' @param outputFile  character string indicating where to put the output. If
#'                    set to NA, the output will be returned by the function.
#' @param includeXSL  boolean. Whether the xml style sheet should be included
#'                    in the output. Only used if format is \code{xml} and
#'                    outputFile is not \code{NA}.
#'@examples
#'\dontrun{
#'initCoreNLP()
#'sIn <- "Mother died today. Or, maybe, yesterday; I can't be sure."
#'annoObj <- annotateString(sIn)
#'}
#' @export
annotateString = function(text, format=c("obj", "xml", "text"), outputFile=NA,
                          includeXSL=FALSE) {
  if (is.null(volatiles$cNLP))
    stop("Must initilize with 'initCoreNLP'!")

  format = match.arg(format)
  if (length(text) > 1L) text = paste(text,collapse="\n")
  if (!is.na(outputFile) & format == "obj") {
    warning("Cannot output 'obj' as text file, saving xml instead.")
    format = "xml"
  }

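  # hand the text to the Java pipeline, which runs all configured annotators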
  anno = rJava::.jcall(volatiles$cNLP, "Ledu/stanford/nlp/pipeline/Annotation;", "process", text)

  # Parse the output
  if (format == "text") {
    rJava::.jcall(volatiles$cNLP, "V", "prettyPrint", anno, .jnew("java.io.PrintWriter", tf <- tempfile()))
    on.exit(file.remove(tf))
    out = readLines(tf)
  } else {
    d = rJava::.jcall(volatiles$xmlOut, "Lnu/xom/Document;", "annotationToDoc", anno, volatiles$cNLP)
    xml = rJava::.jcall(d, "Ljava/lang/String;", "toXML")
    if (format == "xml") out = xml else out = parseAnnoXML(xml)
  }

  # Return the output if asked for; otherwise just write to disk
  if (is.na(outputFile)) return(out)
  outputCon = file(outputFile, "w")
  on.exit(close(outputCon),add=TRUE)
  writeLines(out, outputCon)
  if (includeXSL & format == "xml")
    file.copy(from=paste0(system.file("extdata",package="coreNLP"), "/CoreNLP-to-HTML.xsl"),
      to=dirname(outputFile))

  invisible(outputFile)
}

#' Annotate a text file
#'
#' Runs the CoreNLP annotators for the text contained in a given file.
#' The details for which annotators to run and how to run them are
#' specified in the properties file loaded in via the \code{initCoreNLP}
#' function (which must be run prior to any annotation).
#'
#' @importFrom        rJava .jcall
#' @param file        a string giving the location of the file to be loaded.
#' @param format      the desired output format. Option \code{obj}, the default,
#'                    returns an R object of class \code{annotation} and will
#'                    likely be the desired choice for users loading the output
#'                    into R. The \code{xml} and \code{text} options exist
#'                    primarily for saving the output to disk.
#' @param outputFile  character string indicating where to put the output. If
#'                    set to NA, the output will be returned by the function.
#' @param includeXSL  boolean. Whether the xml style sheet should be included
#'                    in the output. Only used if format is \code{xml} and
#'                    outputFile is not \code{NA}.
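#'@examples
#'\dontrun{
#'# assumes a plain text file "input.txt" exists in the working directory
#'initCoreNLP()
#'annoObj <- annotateFile("input.txt")
#'}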
#' @export
annotateFile = function(file, format=c("obj", "xml", "text"), outputFile=NA,
                            includeXSL=FALSE) {

  if (is.null(volatiles$cNLP))
    stop("Must initilize with 'initCoreNLP'!")

  # Processing inputs
  format = match.arg(format)
  if (!is.na(outputFile) & format == "obj") {
    warning("Cannot output 'obj' as text file, saving xml instead.")
    format = "xml"
  }
  if (is.character(file)) {
    file = file(file, "r")
    on.exit(close(file))
  }

  # Read input into R and annotate
  #   TODO: Use native java interface instead of reading into R.
  text = readLines(file)
  if (length(text) > 1L) text = paste(text,collapse="\n")
  anno = rJava::.jcall(volatiles$cNLP, "Ledu/stanford/nlp/pipeline/Annotation;",
                "process", text)

  # Parse the output
  if (format == "text") {
    rJava::.jcall(volatiles$cNLP, "V", "prettyPrint", anno, .jnew("java.io.PrintWriter", tf <- tempfile()))
    on.exit(file.remove(tf))
    out = readLines(tf)
  } else {
    d = rJava::.jcall(volatiles$xmlOut, "Lnu/xom/Document;", "annotationToDoc", anno, volatiles$cNLP)
    xml = rJava::.jcall(d, "Ljava/lang/String;", "toXML")
    if (format == "xml") out = xml else out = parseAnnoXML(xml)
  }

  # Return the output if asked for; otherwise just write to disk
  if (is.na(outputFile)) return(out)
  outputCon = file(outputFile, "w")
  on.exit(close(outputCon),add=TRUE)
  writeLines(out, outputCon)
  if (includeXSL && format == "xml")
    file.copy(from=paste0(system.file("extdata",package="coreNLP"), "/CoreNLP-to-HTML.xsl"),
      to=dirname(outputFile))

  invisible(outputFile)
}

#' Print a summary of an annotation object
#'
#' @method print annotation
#' @param x    an annotation object
#' @param ...  other arguments. Currently unused.
#'
#'@examples
#'print(annoEtranger)
#'
#' @export
print.annotation = function(x, ...) {
  cat("\nA CoreNLP Annotation:\n")
  cat("  num. sentences:", x$token[nrow(x$token),1], "\n")
  cat("  num. tokens:", nrow(x[[1]]), "\n")
  cat("\n")
  invisible(x)
}

#' Get tokens as data frame
#'
#' Returns a data frame of the tokens from an annotation
#' object.
#'
#' @param annotation    an annotation object
#'
#'@examples
#'getToken(annoEtranger)
#'
#' @export
getToken = function(annotation) {
  annotation$token
}

#' Get parse tree as character vector
#'
#' Returns a character vector of the parse trees.
#' Mostly used for visualization; the output of
#' \code{\link{getToken}} will generally be more
#' convenient for manipulating in R.
#'
#' @param annotation    an annotation object
#'
#'@examples
#'getParse(annoEtranger)
#'
#' @export
getParse = function(annotation) {
  annotation$parse
}

#' Get Dependencies
#'
#' Returns a data frame of the dependencies of an annotation
#'
#' @param annotation    an annotation object
#' @param type          the class of dependencies desired
#'
#'@examples
#'getDependency(annoEtranger)
#'getDependency(annoHp)
#'
#' @export
getDependency = function(annotation, type=c("CCprocessed","basic","collapsed")) {
  type = match.arg(type)
  dep <- if (type == "basic") annotation$basicDep
  else if (type == "collapsed") annotation$collapsedDep
  else if (type == "CCprocessed") annotation$collapsedProcDep

  dep$govIndex = match(paste0(dep$sentence,"-",dep$governorIdx),
                     paste0(annotation$token$sentence,"-",annotation$token$id))
  dep$depIndex = match(paste0(dep$sentence,"-",dep$dependentIdx),
                     paste0(annotation$token$sentence,"-",annotation$token$id))
  dep
}

#' Get Sentiment scores
#'
#' Returns a data frame of the sentiment scores from an annotation
#'
#' @param annotation    an annotation object
#'
#'@examples
#'getSentiment(annoEtranger)
#'getSentiment(annoHp)
#'
#' @export
getSentiment = function(annotation) {
  annotation$sentiment
}

#' Get Coreference
#'
#' Returns a data frame containing all coreferences detected in the text.
#'
#' @param annotation    an annotation object
#'
#'@examples
#'getCoreference(annoHp)
#'
#' @export
getCoreference = function(annotation) {
  coref = annotation$coref[,-grep("text",names(annotation$coref))]
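  # CoreNLP stores the end position as one past the last token, hence the -1 below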
  coref$startIndex = match(paste0(coref$sentence,"-",coref$start),
                     paste0(annotation$token$sentence,"-",annotation$token$id))
  coref$endIndex = match(paste0(coref$sentence,"-",as.numeric(coref$end)-1),
                     paste0(annotation$token$sentence,"-",annotation$token$id))
  coref
}

#' Get OpenIE
#'
#' Returns a data frame containing all OpenIE triples.
#'
#' @param annotation    an annotation object
#'
#'@examples
#'getOpenIE(annoHp)
#'
#' @export
getOpenIE = function(annotation) {
  annotation$openie
}

#' Load CoreNLP XML file
#'
#' Loads a properly formatted XML file output by the CoreNLP
#' library into an \code{annotation} object in R.
#'
#' @param file      connection or character string giving the file name to load
#' @param encoding  encoding to be assumed for input strings.  It is used to mark
#'                  character strings as known to be in Latin-1 or UTF-8: it is
#'                  not used to re-encode the input. Passed to \code{readLines}.
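#'@examples
#'\dontrun{
#'# assumes "anno.xml" was previously written, e.g. by
#'# annotateString(sIn, format="xml", outputFile="anno.xml")
#'anno <- loadXMLAnnotation("anno.xml")
#'getToken(anno)
#'}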
#' @export
loadXMLAnnotation = function(file, encoding="unknown") {
  if (is.character(file)) {
    file = file(file, "r")
    on.exit(close(file))
  }
  x = readLines(file)
  tryCatch(output <- parseAnnoXML(x),
           error = function(e) stop("Not a valid CoreNLP XML file."))
  output
}

#' Convert Penn TreeBank POS to Universal Tagset
#'
#' Maps a character string of English Penn TreeBank
#' part of speech tags into the universal tagset
#' codes. This provides a reduced set of tags (12), and
#' a better cross-linguistic model of speech.
#'
#' @param pennPOS   a character vector of penn tags to match
#'
#'@examples
#'tok <- getToken(annoEtranger)
#'cbind(tok$POS,universalTagset(tok$POS))
#'
#' @export
universalTagset = function(pennPOS) {
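  # two-column lookup table (68 rows): Penn TreeBank tag -> universal tag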
  mtab = structure(c("!", "#", "$", "''", "(", ")", ",", "-LRB-",
      "-RRB-",  ".", ":", "?", "CC", "CD", "CD|RB", "DT", "EX", "FW", "IN",
      "IN|RP",  "JJ", "JJR", "JJRJR", "JJS", "JJ|RB", "JJ|VBG", "LS", "MD", "NN",
      "NNP", "NNPS", "NNS", "NN|NNS", "NN|SYM", "NN|VBG", "NP", "PDT",  "POS", "PRP",
      "PRP$", "PRP|VBP", "PRT", "RB", "RBR", "RBS", "RB|RP",  "RB|VBG", "RN", "RP", "SYM",
      "TO", "UH", "VB", "VBD", "VBD|VBN",  "VBG", "VBG|NN", "VBN", "VBP", "VBP|TO",
      "VBZ", "VP", "WDT",  "WH", "WP", "WP$", "WRB", "``", ".", ".", ".", ".", ".",
      ".",  ".", ".", ".", ".", ".", ".", "CONJ", "NUM", "X", "DET", "DET",  "X",
      "ADP", "ADP", "ADJ", "ADJ", "ADJ", "ADJ", "ADJ", "ADJ",  "X", "VERB", "NOUN",
      "NOUN", "NOUN", "NOUN", "NOUN", "NOUN",  "NOUN", "NOUN", "DET", "PRT", "PRON",
      "PRON", "PRON", "PRT",  "ADV", "ADV", "ADV", "ADV", "ADV", "X", "PRT", "X",
      "PRT", "X",  "VERB", "VERB", "VERB", "VERB", "VERB", "VERB", "VERB", "VERB",
      "VERB", "VERB", "DET", "X", "PRON", "PRON", "ADV", "."), .Dim = c(68L,  2L))

  index = match(pennPOS, mtab[,1])
  output = rep("X", length(pennPOS))
  output[!is.na(index)] = mtab[index[!is.na(index)],2]
  output
}

#' Parse annotation xml
#'
#' Returns an annotation object from a character vector containing
#' the xml. Not exported; use \code{loadXMLAnnotation} instead.
#'
#' @importFrom   XML xmlRoot xmlParse xmlToDataFrame xmlChildren xmlAttrs xmlValue
#' @param xml    character vector containing the xml file from an annotation
parseAnnoXML = function(xml) {
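  # the <document> node holds the sentences as its first child and the
  # coreference chains (when present) as its second child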
  xml = XML::xmlRoot(XML::xmlParse(xml))[[1]]
  coref = xml[[2]]
  xml = xml[[1]]
  sentences = XML::xmlChildren(xml)

  out = list(token = NULL,parse = NULL,basicDep = NULL,collapsedDep = NULL,
              collapsedProcDep = NULL, coref = NULL, openie = NULL)

  if (length(sentences)==0L) {
    class(out) = "annotation"
    return(out)
  }

  tokenNames = c("sentence", "id", "word", "lemma", "CharacterOffsetBegin",
                 "CharacterOffsetEnd", "POS", "NER", "Speaker")
  depNames = c("sentence", "governor", "dependent", "type", "governorIdx",
               "dependentIdx")
  corefNames = c("corefId", "sentence", "start", "end", "head", "text")
  sentNames = c("id", "sentimentValue", "sentiment")
  openieNames = c("subject_start", "subject_end", "subject",
                  "relation_start", "relation_end", "relation",
                  "object_start", "object_end", "object")

  for (i in seq_along(sentences)) {
    sent = sentences[[i]]
    df = data.frame(sentence=i, id=sapply(XML::xmlChildren(sent[[1L]]),function(v) XML::xmlAttrs(v)[1]),
                    XML::xmlToDataFrame(sent[[1]], stringsAsFactors=FALSE), stringsAsFactors=FALSE)

    index = match(tokenNames, names(df))
    if (length(index) != ncol(df)) df = df[,index[!is.na(index)]]
    if (any(is.na(index))) df = fillDF(df, tokenNames)
    df = df[,tokenNames]

    out$token = rbind(out$token, df)

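    # record each child element's tag name and its "type" attribute so the
    # annotation layers (parse, dependencies, openie, ...) can be located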
    elementNames <- names(XML::xmlChildren(sent))
    elementAttributes <- sapply(XML::xmlChildren(sent),
        function(v) {v <- as.character(XML::xmlAttrs(v)['type']); if (length(v)) v else "" })
    names(elementAttributes) <- NULL


    parseIndex <- which(elementNames == "parse" & elementAttributes == "")[1]
    basicDepIndex <- which(elementNames == "dependencies" & elementAttributes == "basic-dependencies")[1]
    collapsedDepIndex <- which(elementNames == "dependencies" & elementAttributes == "collapsed-dependencies")[1]
    collapsedProcDepIndex <- which(elementNames == "dependencies" & elementAttributes == "collapsed-ccprocessed-dependencies")[1]
    openieIndex <- which(elementNames == "openie" & elementAttributes == "")[1]
    machineReadingIndex <- which(elementNames == "MachineReading" & elementAttributes == "")[1]

    if (!is.na(parseIndex))
      out$parse = c(out$parse, XML::xmlValue(sent[[parseIndex]]))

    if (!is.na(basicDepIndex) && length(XML::xmlToDataFrame(sent[[basicDepIndex]]))) {
      df = data.frame(sentence=i, XML::xmlToDataFrame(sent[[basicDepIndex]],stringsAsFactors=FALSE),
                type=sapply(XML::xmlChildren(sent[[basicDepIndex]]),function(v) XML::xmlAttrs(v)[[1]]),
                governorIdx=sapply(XML::xmlChildren(sent[[basicDepIndex]]), function(v) XML::xmlAttrs(v[[1]])[1]),
                dependentIdx=sapply(XML::xmlChildren(sent[[basicDepIndex]]), function(v) XML::xmlAttrs(v[[2]])[1]),
                stringsAsFactors=FALSE)

      index = match(depNames, names(df))
      if (length(index) != ncol(df)) df = df[,index[!is.na(index)]]
      if (any(is.na(index))) df = fillDF(df, depNames)

      out$basicDep = rbind(out$basicDep, df)
    }

    if (!is.na(collapsedDepIndex) && length(XML::xmlToDataFrame(sent[[collapsedDepIndex]]))) {
      df = data.frame(sentence=i, XML::xmlToDataFrame(sent[[collapsedDepIndex]],stringsAsFactors=FALSE),
                type=sapply(XML::xmlChildren(sent[[collapsedDepIndex]]),function(v) XML::xmlAttrs(v)[[1]]),
                governorIdx=sapply(XML::xmlChildren(sent[[collapsedDepIndex]]), function(v) XML::xmlAttrs(v[[1]])[1]),
                dependentIdx=sapply(XML::xmlChildren(sent[[collapsedDepIndex]]), function(v) XML::xmlAttrs(v[[2]])[1]))

      index = match(depNames, names(df))
      if (length(index) != ncol(df)) df = df[,index[!is.na(index)]]
      if (any(is.na(index))) df = fillDF(df, depNames)

      out$collapsedDep = rbind(out$collapsedDep, df)
    }

    if (!is.na(collapsedProcDepIndex) && length(XML::xmlToDataFrame(sent[[collapsedProcDepIndex]]))) {
      df = data.frame(sentence=i, XML::xmlToDataFrame(sent[[collapsedProcDepIndex]],stringsAsFactors=FALSE),
                type=sapply(XML::xmlChildren(sent[[collapsedProcDepIndex]]),function(v) XML::xmlAttrs(v)[[1]]),
                governorIdx=sapply(XML::xmlChildren(sent[[collapsedProcDepIndex]]), function(v) XML::xmlAttrs(v[[1]])[1]),
                dependentIdx=sapply(XML::xmlChildren(sent[[collapsedProcDepIndex]]), function(v) XML::xmlAttrs(v[[2]])[1]))

      index = match(depNames, names(df))
      if (length(index) != ncol(df)) df = df[,index[!is.na(index)]]
      if (any(is.na(index))) df = fillDF(df, depNames)

      out$collapsedProcDep = rbind(out$collapsedProcDep, df)
    }

    if (!is.na(openieIndex) && length(XML::xmlToDataFrame(sent[[openieIndex]]))) {

      these <- XML::xmlChildren(sent[[openieIndex]])

      ids <-
      cbind(t(sapply(these, function(v) XML::xmlAttrs(v[[1]]))),
            sapply(these, function(v) XML::xmlValue(v[[1]][[1]])),
            t(sapply(these, function(v) XML::xmlAttrs(v[[2]]))),
            sapply(these, function(v) XML::xmlValue(v[[2]][[1]])),
            t(sapply(these, function(v) XML::xmlAttrs(v[[3]]))),
            sapply(these, function(v) XML::xmlValue(v[[3]][[1]])))

      ids <- data.frame(ids, stringsAsFactors=FALSE, row.names=NULL)
      colnames(ids) <- openieNames

      out$openie = rbind(out$openie, ids)
    }

    # if (!is.na(machineReadingIndex) && length(XML::xmlToDataFrame(sent[[machineReadingIndex]]))) {

    #   these <- XML::xmlChildren(sent[[machineReadingIndex]])

    #   ids <-
    #   cbind(t(sapply(these, function(v) XML::xmlAttrs(v[[1]]))),
    #         sapply(these, function(v) XML::xmlValue(v[[1]][[1]])),
    #         t(sapply(these, function(v) XML::xmlAttrs(v[[2]]))),
    #         sapply(these, function(v) XML::xmlValue(v[[2]][[1]])),
    #         t(sapply(these, function(v) XML::xmlAttrs(v[[3]]))),
    #         sapply(these, function(v) XML::xmlValue(v[[3]][[1]])))

    #   ids <- data.frame(ids, stringsAsFactors=FALSE, row.names=NULL)
    #   colnames(ids) <- openieNames



    #   out$mr = rbind(out$mr, ids)
    # }

    sm = XML::xmlAttrs(sent)
    df = data.frame(matrix(sm,nrow=1),stringsAsFactors=FALSE)
    names(df) = names(sm)

    index = match(sentNames, names(df))
    if (length(index) != ncol(df)) df = df[,index[!is.na(index)],drop=FALSE]
    if (any(is.na(index))) df = fillDF(df, sentNames)

    if (nrow(df)) out$sentiment = rbind(out$sentiment, df)
  }

  if (!is.null(out$token)) {
    if (!is.na(index <- match("word", names(out$token))[1]))
      names(out$token)[index] = "token"
    if (sum(!is.na((index <- match(c("CharacterOffsetBegin","CharacterOffsetEnd"),names(out$token))))))
      out$token[,index] = apply(out$token[,index,drop=FALSE],2,as.integer)
  }

  if (!is.null(out$basicDep)) {
    if (sum(!is.na((index <- match(c("governorIdx","dependentIdx"),names(out$basicDep))))))
      out$basicDep[,index] = apply(out$basicDep[,index,drop=FALSE],2,as.integer)
  }

  if (!is.null(out$collapsedDep)) {
    if (sum(!is.na((index <- match(c("governorIdx","dependentIdx"),names(out$collapsedDep))))))
      out$collapsedDep[,index] = apply(out$collapsedDep[,index,drop=FALSE],2,as.integer)
  }

  if (!is.null(out$collapsedProcDep)) {
    if (sum(!is.na((index <- match(c("governorIdx","dependentIdx"),names(out$collapsedProcDep))))))
      out$collapsedProcDep[,index] = apply(out$collapsedProcDep[,index,drop=FALSE],2,as.integer)
  }

  if (!is.null(coref)) {
    coref = XML::xmlChildren(coref)
    for (corefId in seq_along(coref)) {
      df = data.frame(corefId=corefId, XML::xmlToDataFrame(coref[[corefId]], stringsAsFactors=FALSE),
                      stringsAsFactors=FALSE)

      index = match(corefNames, names(df))
      if (length(index) != ncol(df)) df = df[,index[!is.na(index)]]
      if (any(is.na(index))) df = fillDF(df, corefNames)

      out$coref = rbind(out$coref, df)
    }
  }

  if (!is.null(out$sentiment)) {
    if (sum(!is.na(index <- match(c("id","sentimentValue"),names(out$sentiment)))))
      out$sentiment[,index[!is.na(index)]] = apply(out$sentiment[,index[!is.na(index)],drop=FALSE],2,as.integer)
  }

  class(out) = "annotation"
  out
}

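# Helper: add any columns named in nameVec that are missing from df, filled with NA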
fillDF = function(df, nameVec) {
  if (nrow(df) == 0L) return(df)
  index = match(nameVec, names(df))
  df[,nameVec[is.na(index)]] = NA
  return(df)
}