#' Parse Sentences
#'
#' A wrapper for \pkg{NLP},/\pkg{openNLP}'s named sentence parsing tools.
#'
#' @param text.var The text string variable.
#' @param engine The backend pat of speech tagger, either "openNLP" or "coreNLP".
#' The default "openNLP" uses the \pkg{openNLP} package. If the user has the
#' Stanford CoreNLP suite (\file{http://stanfordnlp.github.io/CoreNLP/})
#' installed this can be used as the tagging backend instead.
#' @param parse.annotator A parse annotator. See \code{?parse_annotator}. Due
#' to \pkg{Java} memory allocation limits the user must generate the annotator
#' and supply it directly to \code{parser} (only used if \code{lib = "openNLP"}).
#' @param word.annotator A word annotator (only used if \code{lib = "openNLP"}).
#' @param java.path The path the where \pkg{Java} is located (only used if
#' \code{lib = "coreNLP"}).
#' @param element.chunks The number of elements to include in a chunk. Chunks are
#' passed through an \code{\link[base]{lapply}} and size is kept within a tolerance
#' because of memory allocation in the tagging process with \pkg{Java}.
#' @return Returns a list of character vectors of parsed sentences.
#' @keywords parse sentence
#' @export
#' @examples
#' \dontrun{
#' txt <- c(
#' "Really, I like chocolate because it is good. It smells great.",
#' "Robots are rather evil and most are devoid of decency.",
#' "He is my friend.",
#' "Clifford the big red dog ate my lunch.",
#' "Professor Johns can not teach",
#' "",
#' NA
#' )
#'
#' ## openNLP parser
#' if(!exists('parse_ann')) {
#' parse_ann <- parse_annotator()
#' }
#'
#' (x2 <- parser(txt, engine = 'openNLP', parse.annotator = parse_ann))
#' dev.new()
#' par(
#' mfrow = c(3, 2),
#' mar = c(0,0,1,1) + 0.1
#' )
#' frame(); text(.5, .5, "openNLP", cex=2)
#' lapply(x2[1:5], plot)
#'
#' ## coreNLP parser
#' (x <- parser(txt, engine = "coreNLP"))
#'
#' par(mar = c(0,0,0,.7) + 0.2)
#' plot(x[[2]])
#' par(
#' mfrow = c(3, 2),
#' mar = c(0,0,1,1) + 0.1
#' )
#' frame(); text(.5, .5, "coreNLP", cex=2)
#' lapply(x[1:5], plot)
#' }
parser <- function(text.var, engine = "openNLP", parse.annotator = easy_parse_annotator(),
word.annotator = word_annotator(), java.path = "java",
element.chunks = floor(2000 * (23.5/mean(sapply(text.var, nchar), na.rm = TRUE)))){
len <- length(text.var)
## locate empty or missing text elements
nas <- sort(union(which(is.na(text.var)), grep("^\\s*$", text.var)))
## replace empty text with a period
if(!identical(nas, integer(0))){
text.var[nas] <- "."
}
## Chunking the text into memory sized chunks:
## caluclate the start/end indexes of the chunks
ends <- c(utils::tail(seq(0, by = element.chunks,
length.out = ceiling(len/element.chunks)), -1), len)
starts <- c(1, utils::head(ends + 1 , -1))
## chunk the text
text_list <- Map(function(s, e) {text.var[s:e]}, starts, ends)
## loop through the chunks and tag them
out <- unlist(lapply(text_list, function(x){
switch(engine,
coreNLP = {x <- core_parsify(x)},
openNLP = {x <- open_parsify(x, word.annotator, parse.annotator)},
stop("`engine` must be either \"openNLP\" or \"coreNLP\".")
)
gc()
x
}))
out[nas] <- NA
out <- as.list(out)
lapply(out, function(x){
if (is.na(x)) return(NA)
class(x) <- c("parsed_character", "character")
x
})
}
open_parsify <- function(text.var, word, parse, ...){
#text.var <- gsub("-+", " ", text.var)
text.var <- gsub("^\\s+|\\s+$", "", text.var)
s <- NLP::as.String(paste(text.var, collapse=""))
## Manually calculate the starts and ends via nchar
lens <- sapply(text.var, nchar)
ends <- cumsum(lens)
starts <- c(1, utils::head(ends + 1, -1))
a2 <- NLP::Annotation(seq_along(starts), rep("sentence", length(starts)), starts, ends)
a2 <- NLP::annotate(s, word, a2)
p <- parse(s, a2)
sapply(p$features, `[[`, "parse")
}
core_parsify <- function (text.var,
stanford.tagger = coreNLPsetup::coreNLP_loc(), java.path = "java", ...) {
if (!file.exists(stanford.tagger)) {
coreNLPsetup::check_stanford_installed(...)
}
text.var <- gsub("[.?!](?!$)", " ", gsub("(?<=[.?!])[.?!]+$", "", text.var, perl = TRUE), perl = TRUE)
#message("\nAnalyzing text for sentiment...\n")
cmd <- sprintf(
"%s -cp \"%s/*\" -mx5g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators \"tokenize,ssplit,parse\" -ssplit.eolonly",
#"%s -cp \"%s/*\" -mx5g edu.stanford.nlp.sentiment.SentimentPipeline -stdin",
java.path, stanford.tagger
)
results <- system(cmd, input = text.var, intern = TRUE, ignore.stderr = TRUE)
gsub("^\\(ROOT", "(TOP", gsub("\\s+", " ", Map(function(b, e){paste(results[b:e], collapse="")},
grep("^\\(ROOT", results),
grep("^root\\(ROOT", results) - 2
)))
}
#' Prints a parsed_character Object
#'
#' Prints a parsed_character object
#'
#' @param x A parsed_character Object.
#' @param \ldots ignored.
#' @method print parsed_character
#' @export
print.parsed_character <- function(x, ...){
class(x) <- "character"
print(x)
}
#' Plots a plot.parsed_character Object
#'
#' Plots a plot.parsed_character object
#'
#' @param x A \code{parsed_character} object (see \code{\link[parsent]{parser}}.
#' @param vertex.color The vertex color (see \code{?igraph::igraph_options}).
#' @param vertex.frame.color The vertex frame color (see \code{?igraph::igraph_options}).
#' @param vertex.label.font The vertex label font (see \code{?igraph::igraph_options}).
#' @param vertex.label.cex The vertex label scaled relative to the default (see
#' \code{?igraph::igraph_options}).
#' @param edge.width The edge width (see \code{?igraph::igraph_options}).
#' @param edge.color The edge color (see \code{?igraph::igraph_options}).
#' @param edge.arrow.size The edge arrow size (see \code{?igraph::igraph_options}).
#' @param leaf.color The color of the leaves (tokens).
#' @param phrase.marker.color The color of the non-terminal grammar categories
#' (e.g., NP, VP, D, etc.).
#' @param title The main title of the graph.
#' @param cex.title The size of the title relative to the default.
#' @param asp The y/x aspect ratio.
#' @param \ldots Other arguments passed to \code{\link[igraph]{plot.igraph}}.
#' @author StackOverflow's \href{http://stackoverflow.com/users/2415684/thetime}{TheTime} and Tyler Rinker <tyler.rinker@@gmail.com>.
#' @references \url{http://stackoverflow.com/a/33536291/1000343}
#' @method plot parsed_character
#' @export
plot.parsed_character <- function(x, vertex.color=NA, vertex.frame.color=NA,
vertex.label.font=2, vertex.label.cex=1, edge.width=1.5,
edge.color='black', edge.arrow.size=0, leaf.color='chartreuse4',
phrase.marker.color='blue4', title=NULL, cex.title=.9, asp=0.5, ...) {
## Replace words with unique versions
ms <- gregexpr("[^() ]+", x) # just ignoring spaces and brackets?
words <- regmatches(x, ms)[[1]] # just words
regmatches(x, ms) <- list(paste0(words, seq.int(length(words)))) # add id to words
## Going to construct an edgelist and pass that to igraph
## allocate here since we know the size (number of nodes - 1) and -1 more to exclude 'TOP'
edgelist <- matrix('', nrow=length(words)-2, ncol=2)
## Function to fill in edgelist in place
edgemaker <- (function() {
i <- 0 # row counter
g <- function(node) { # the recursive function
if (inherits(node, "Tree")) { # only recurse subtrees
if ((val <- node$value) != 'TOP1') { # skip 'TOP' node (added '1' above)
for (child in node$children) {
childval <- if(inherits(child, "Tree")) child$value else child
i <<- i+1
edgelist[i,1:2] <<- c(val, childval)
}
}
invisible(lapply(node$children, g))
}
}
})()
## Create the edgelist from the parse tree
edgemaker(NLP::Tree_parse(x))
## Make the graph, add options for coloring leaves separately
g <- igraph::graph_from_edgelist(edgelist)
igraph::vertex_attr(g, 'label.color') <- phrase.marker.color # non-leaf colors
igraph::vertex_attr(g, 'label.color', igraph::V(g)[!igraph::degree(g, mode='out')]) <- leaf.color
igraph::V(g)$label <- sub("\\d+", '', igraph::V(g)$name) # remove the numbers for labels
igraph::plot.igraph(g, layout=igraph::layout.reingold.tilford, vertex.color = vertex.color,
vertex.frame.color = vertex.frame.color,
vertex.label.font = vertex.label.font,
vertex.label.cex = vertex.label.cex, asp = asp, edge.width = edge.width,
edge.color = edge.color, edge.arrow.size = edge.arrow.size)
if (!missing(title)) title(title, cex.main=cex.title)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.