# R/parser.R

#' Parse Sentences
#'
#' A wrapper for \pkg{NLP}/\pkg{openNLP}'s named sentence parsing tools.
#'
#' @param text.var The text string variable.
#' @param engine The backend parsing engine, either "openNLP" or "coreNLP".
#' The default "openNLP" uses the \pkg{openNLP} package.  If the user has the
#' Stanford CoreNLP suite (\file{http://stanfordnlp.github.io/CoreNLP/})
#' installed this can be used as the tagging backend instead.
#' @param parse.annotator A parse annotator.  See \code{?parse_annotator}.  Due
#' to \pkg{Java} memory allocation limits the user must generate the annotator
#' and supply it directly to \code{parser} (only used if \code{lib = "openNLP"}).
#' @param word.annotator A word annotator (only used if \code{lib = "openNLP"}).
#' @param java.path The path to where \pkg{Java} is located (only used if
#' \code{lib = "coreNLP"}).
#' @param element.chunks The number of elements to include in a chunk. Chunks are
#' passed through an \code{\link[base]{lapply}} and size is kept within a tolerance
#' because of memory allocation in the tagging process with \pkg{Java}.
#' @return Returns a list of character vectors of parsed sentences.
#' @keywords parse sentence
#' @export
#' @examples
#' \dontrun{
#' txt <- c(
#'     "Really, I like chocolate because it is good. It smells great.",
#'     "Robots are rather evil and most are devoid of decency.",
#'     "He is my friend.",
#'     "Clifford the big red dog ate my lunch.",
#'     "Professor Johns can not teach",
#'     "",
#'     NA
#' )
#'
#' ## openNLP parser
#' if(!exists('parse_ann')) {
#'     parse_ann <- parse_annotator()
#' }
#'
#' (x2 <- parser(txt, engine = 'openNLP',  parse.annotator = parse_ann))
#' dev.new()
#' par(
#'     mfrow = c(3, 2),
#'     mar = c(0,0,1,1) + 0.1
#' )
#' frame(); text(.5, .5, "openNLP", cex=2)
#' lapply(x2[1:5], plot)
#'
#' ## coreNLP parser
#' (x <- parser(txt, engine = "coreNLP"))
#'
#' par(mar = c(0,0,0,.7) + 0.2)
#' plot(x[[2]])
#' par(
#'     mfrow = c(3, 2),
#'     mar = c(0,0,1,1) + 0.1
#' )
#' frame(); text(.5, .5, "coreNLP", cex=2)
#' lapply(x[1:5], plot)
#' }
parser <- function(text.var, engine = "openNLP", parse.annotator = easy_parse_annotator(),
    word.annotator = word_annotator(), java.path = "java",
    element.chunks = floor(2000 * (23.5/mean(sapply(text.var, nchar), na.rm = TRUE)))){

    ## Fail fast on an unknown engine; previously this was only checked
    ## inside the per-chunk loop, after annotator/Java setup had begun.
    if (!engine %in% c("openNLP", "coreNLP")) {
        stop("`engine` must be either \"openNLP\" or \"coreNLP\".")
    }

    len <- length(text.var)

    ## locate empty or missing text elements
    nas <- sort(union(which(is.na(text.var)), grep("^\\s*$", text.var)))

    ## replace empty text with a period so the backends receive a parseable token
    if (length(nas) > 0) {
        text.var[nas] <- "."
    }

    ## Chunking the text into memory-sized chunks:
    ## calculate the start/end indexes of the chunks
    ends <- c(utils::tail(seq(0, by = element.chunks,
        length.out = ceiling(len/element.chunks)), -1), len)
    starts <- c(1, utils::head(ends + 1, -1))

    ## chunk the text
    text_list <- Map(function(s, e) {text.var[s:e]}, starts, ends)

    ## loop through the chunks and parse them; gc() between chunks frees
    ## memory tied up by the Java-backed annotation of the previous chunk
    out <- unlist(lapply(text_list, function(x){
        x <- switch(engine,
            coreNLP = core_parsify(x),
            openNLP = open_parsify(x, word.annotator, parse.annotator)
        )
        gc()
        x
    }))

    ## restore NA for the elements that were originally empty/missing
    out[nas] <- NA

    ## return a list; each non-NA element is a parse string tagged with the
    ## "parsed_character" class so print/plot methods dispatch on it
    out <- as.list(out)
    lapply(out, function(x){
        if (is.na(x)) return(NA)
        class(x) <- c("parsed_character", "character")
        x
    })
}

open_parsify <- function(text.var, word, parse, ...){

    ## Strip leading/trailing whitespace, then collapse every element (one
    ## sentence each) into a single NLP String with no separator; sentence
    ## offsets are reconstructed manually from the element lengths below.
    trimmed <- gsub("^\\s+|\\s+$", "", text.var)
    big_string <- NLP::as.String(paste(trimmed, collapse=""))

    ## Character span of each sentence within the collapsed string
    sent_lens <- sapply(trimmed, nchar)
    sent_ends <- cumsum(sent_lens)
    sent_starts <- c(1, utils::head(sent_ends + 1, -1))

    ## Seed the pipeline with explicit sentence annotations, add word
    ## annotations, then run the parse annotator over the result.
    sent_ann <- NLP::Annotation(
        seq_along(sent_starts),
        rep("sentence", length(sent_starts)),
        sent_starts,
        sent_ends
    )
    word_ann <- NLP::annotate(big_string, word, sent_ann)
    parsed <- parse(big_string, word_ann)

    ## One bracketed parse string per sentence
    sapply(parsed$features, `[[`, "parse")

}


## Parse sentences by shelling out to the Stanford CoreNLP pipeline.
##
## text.var        - character vector; each element is piped to Java as one
##                   line and, via -ssplit.eolonly, treated as one sentence.
## stanford.tagger - path to the CoreNLP installation directory.
## java.path       - path to the java executable.
## ...             - passed to coreNLPsetup::check_stanford_installed().
##
## Returns a character vector of bracketed parse strings, with the root
## label rewritten from "(ROOT" to "(TOP" (the label the rest of this
## package works with; e.g. plot.parsed_character skips a 'TOP' root).
core_parsify <- function (text.var,
    stanford.tagger = coreNLPsetup::coreNLP_loc(), java.path = "java", ...) {

    ## Prompt installation if CoreNLP is not found at the expected location
    if (!file.exists(stanford.tagger)) {
        coreNLPsetup::check_stanford_installed(...)
    }

    ## Inner gsub: collapse repeated trailing terminal punctuation to one
    ## (e.g. "!!!" -> "!").  Outer gsub: blank out any NON-final .?! so the
    ## eolonly splitter cannot be confused into multiple sentences per line.
    text.var <- gsub("[.?!](?!$)", " ", gsub("(?<=[.?!])[.?!]+$", "", text.var, perl = TRUE), perl = TRUE)

    #message("\nAnalyzing text for sentiment...\n")

    ## Build the CoreNLP command: tokenize/ssplit/parse annotators, 5 GB
    ## Java heap, split sentences only at end-of-line
    cmd <- sprintf(
        "%s -cp \"%s/*\" -mx5g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators \"tokenize,ssplit,parse\" -ssplit.eolonly",
        #"%s -cp \"%s/*\" -mx5g edu.stanford.nlp.sentiment.SentimentPipeline -stdin",
        java.path, stanford.tagger
    )

    ## Pipe the text to CoreNLP on stdin and capture its stdout
    results <- system(cmd, input = text.var, intern = TRUE, ignore.stderr = TRUE)

    ## CoreNLP prints each parse as a multi-line, indented "(ROOT ..." block
    ## followed by dependency lines beginning "root(ROOT".  Each Map() call
    ## stitches one block (from its "(ROOT" line to 2 lines before the next
    ## "root(ROOT" line) back into a single string; whitespace runs are then
    ## collapsed and the root label renamed to "(TOP".
    gsub("^\\(ROOT", "(TOP", gsub("\\s+", " ", Map(function(b, e){paste(results[b:e], collapse="")},
        grep("^\\(ROOT", results),
        grep("^root\\(ROOT", results) - 2
    )))

}



#' Prints a parsed_character Object
#'
#' Prints a parsed_character object
#'
#' @param x A parsed_character Object.
#' @param \ldots ignored.
#' @method print parsed_character
#' @export
print.parsed_character <- function(x, ...){
    ## Drop the custom class so the default character printing is used.
    print(unclass(x))
}


#' Plots a plot.parsed_character Object
#'
#' Plots a plot.parsed_character object
#'
#' @param x A \code{parsed_character} object (see \code{\link[parsent]{parser}}).
#' @param vertex.color The vertex color (see \code{?igraph::igraph_options}).
#' @param vertex.frame.color The vertex frame color (see \code{?igraph::igraph_options}).
#' @param vertex.label.font The vertex label font (see \code{?igraph::igraph_options}).
#' @param vertex.label.cex The vertex label scaled relative to the default (see
#' \code{?igraph::igraph_options}).
#' @param edge.width The edge width (see \code{?igraph::igraph_options}).
#' @param edge.color The edge color (see \code{?igraph::igraph_options}).
#' @param edge.arrow.size The edge arrow size (see \code{?igraph::igraph_options}).
#' @param leaf.color The color of the leaves (tokens).
#' @param phrase.marker.color The color of the non-terminal grammar categories
#' (e.g., NP, VP, D, etc.).
#' @param title The main title of the graph.
#' @param cex.title The size of the title relative to the default.
#' @param asp The y/x aspect ratio.
#' @param \ldots Other arguments passed to \code{\link[igraph]{plot.igraph}}.
#' @author StackOverflow's \href{http://stackoverflow.com/users/2415684/thetime}{TheTime} and Tyler Rinker <tyler.rinker@@gmail.com>.
#' @references \url{http://stackoverflow.com/a/33536291/1000343}
#' @method plot parsed_character
#' @export
plot.parsed_character <- function(x, vertex.color=NA, vertex.frame.color=NA,
    vertex.label.font=2, vertex.label.cex=1, edge.width=1.5,
    edge.color='black', edge.arrow.size=0, leaf.color='chartreuse4',
    phrase.marker.color='blue4', title=NULL, cex.title=.9, asp=0.5, ...) {

    ## Append a unique integer id to every token so repeated labels/words
    ## become distinct graph vertices (e.g. two "NP" nodes -> "NP3", "NP7")
    ms <- gregexpr("[^() ]+", x)                                      # every non-bracket, non-space token
    words <- regmatches(x, ms)[[1]]                                   # just words
    regmatches(x, ms) <- list(paste0(words, seq.int(length(words))))  # add id to words

    ## Pre-allocate the edgelist: a tree with n nodes has n - 1 edges, and
    ## one more edge is dropped because the 'TOP' root is excluded
    edgelist <- matrix('', nrow=length(words)-2, ncol=2)

    ## Closure that walks the parse tree recursively, filling `edgelist`
    ## in place via `<<-`
    edgemaker <- (function() {
        i <- 0                                       # row counter
        g <- function(node) {                        # the recursive function
            if (inherits(node, "Tree")) {            # only recurse subtrees
                if ((val <- node$value) != 'TOP1') { # skip 'TOP' node (it received id '1' above)
                    for (child in node$children) {
                        childval <- if(inherits(child, "Tree")) child$value else child
                        i <<- i+1
                        edgelist[i,1:2] <<- c(val, childval)
                    }
                }
                invisible(lapply(node$children, g))
            }
        }
    })()

    ## Create the edgelist from the parse tree
    edgemaker(NLP::Tree_parse(x))

    ## Make the graph; leaves (out-degree 0) get a separate label color
    g <- igraph::graph_from_edgelist(edgelist)
    igraph::vertex_attr(g, 'label.color') <- phrase.marker.color  # non-leaf colors
    igraph::vertex_attr(g, 'label.color', igraph::V(g)[!igraph::degree(g, mode='out')]) <- leaf.color

    ## Strip only the TRAILING id digits appended above.  The original
    ## sub("\\d+", ...) removed the FIRST digit run anywhere in the name,
    ## mangling tokens that legitimately contain digits (e.g. "2nd" with
    ## id 5 -> "2nd5" -> "nd5" instead of "2nd").
    igraph::V(g)$label <- sub("\\d+$", '', igraph::V(g)$name)

    igraph::plot.igraph(g, layout=igraph::layout.reingold.tilford, vertex.color = vertex.color,
        vertex.frame.color = vertex.frame.color,
        vertex.label.font = vertex.label.font,
        vertex.label.cex = vertex.label.cex, asp = asp, edge.width = edge.width,
        edge.color = edge.color, edge.arrow.size = edge.arrow.size)

    ## Test the documented default (NULL) rather than missing(): a title
    ## passed programmatically as NULL now behaves the same as the default
    if (!is.null(title)) graphics::title(title, cex.main=cex.title)
}
# trinker/parsent documentation built on May 31, 2019, 9:41 p.m.