#' Wrappper to Standford's coreNLP Sentiment Tagger
#'
#' Tag sentiment as most negative (-1) to most positive (+1). A
#' reimplementation of Matthew Jocker's Stanford coreNLP wrapper in \pkg{syuzhet}.
#'
#' @param text.var The text variable.
#' @param stanford.tagger path to the Stanford tagger.
#' @param hyphen The character string to replace hyphens with. Default replaces
#' with nothing so 'sugar-free' becomes 'sugarfree'. Setting \code{hyphen = " "}
#' would result in a space between words (e.g., 'sugar free').
#' @param missing_value A value to replace \code{NA}/\code{NaN} with. Use
#' \code{NULL} to retain missing values.
#' @param java.path Path to where \pkg{Java} resides. If \pkg{Java} is on your
#' path the minimal \code{java.path = "java"} is all that should be required.
#' @param \ldots Other arguments passed to \code{check_stanford_installed}.
#' @return Returns a \pkg{data.table} of:
#' \itemize{
#' \item element_id - The id number of the original vector passed to \code{sentiment}
#' \item sentence_id - The id number of the sentences within each \code{element_id}
#' \item word_count - Word count
#' \item sentiment - Sentiment/polarity score
#' }
#' @references http://nlp.stanford.edu/software/corenlp.shtml \cr
#' \url{http://www.matthewjockers.net/2015/03/04/some-thoughts-on-annies-thoughts-about-syuzhet}
#' @keywords sentiment
#' @export
#' @examples
#' \dontrun{
#' mytext <- c(
#' 'do you like it? But I hate really bad dogs',
#' 'I am the best friend.',
#' 'Do you really like it? I\'m not a fan'
#' )
#' sentiment_stanford(mytext)
#'
#' library(sentimentr)
#' data(sam_i_am)
#' (sam <- sentiment_stanford(gsub("Sam-I-am", "Sam I am", sam_i_am)))
#' plot(sam)
#' plot(sam, scale_range = TRUE, low_pass_size = 5)
#' plot(sam, scale_range = TRUE, low_pass_size = 10)
#' y <- "He was not the sort of man that one would describe as especially handsome."
#' sentiment_stanford(y)
#'
#' if (!require("pacman")) install.packages("pacman")
#' pacman::p_load_gh("trinker/textshape", "trinker/sentimentr")
#' pacman::p_load(syuzhet)
#'
#' temp <- tempdir()
#' pang_et_al <- "http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz"
#' download.file(pang_et_al, file.path(temp, basename(pang_et_al)))
#'
#' untar(file.path(temp, basename(pang_et_al)), exdir = file.path(temp, "out"))
#' dirs <- sprintf(file.path(temp, "out/txt_sentoken/%s"), c("neg", "pos"))
#' text_vector <- paste(unlist(lapply(
#' c(file.path(dirs[1], dir(dirs[1])[1]),
#' file.path(dirs[2], dir(dirs[2])[1])
#' ), readLines)), collapse = " ")
#'
#' sents <- textshape::split_sentence(text_vector)[[1]]
#'
#' syuzhet <- setNames(as.data.frame(lapply(c("bing", "afinn", "nrc"),
#' function(x) syuzhet::get_sentiment(sents[[1]], method=x))), c("bing", "afinn", "nrc"))
#'
#' width <- options()$width
#' options(width=1000)
#'
#' left_just(data.frame(
#' stanford = round(sentiment_stanford(sents)[["sentiment"]], 2),
#' sentimentr = round(sentiment(sents, question.weight = 0)[["sentiment"]], 2),
#' syuzhet,
#' sentences = unlist(sents),
#' stringsAsFactors = FALSE
#' ), "sentences")
#'
#' options(width=width)
#' }
sentiment_stanford <- function(text.var, hyphen = "", missing_value = 0,
stanford.tagger = coreNLPsetup::coreNLP_loc(), java.path = "java", ...){
sentiment <- .N <- NULL
# break rows into sentences, count words
sents <- textshape::split_sentence(gsub('\\s+', ' ', text.var))
sent_dat <- data.frame(element_id = seq_along(sents))
sent_dat[["sentences"]] <- sents
data.table::setDT(sent_dat)
sent_dat <- sent_dat[, list('sentences' = unlist(sentences)), by = 'element_id'][,
'sentences' := punctuation_reducer(sentences, hyphen = hyphen)][,
'sentence_id' := seq_len(.N), by='element_id'][,
'word_count' := replace_zero(stringi::stri_count_words(sentences))][,
'sentiment' := sentiment_stanford_helper(sentences,
stanford.tagger = stanford.tagger, java.path = java.path, ...)]
if (!is.null(missing_value)){
sent_dat[, 'sentiment' := replace_na(sentiment, y = missing_value)]
}
sent_dat <- sent_dat[, c("element_id", "sentence_id", "word_count", "sentiment"), with = FALSE]
class(sent_dat) <- unique(c("sentiment", class(sent_dat)))
sentences <- new.env(FALSE)
sentences[["sentences"]] <- sents
attributes(sent_dat)[["sentences"]] <- sentences
sent_dat[]
}
punctuation_reducer <- function(x, hyphen){
x <- gsub("[.?!](?!$)", " ", gsub("(?<=[.?!])[.?!]+$", "",
x, perl = TRUE), perl = TRUE)
if (!is.null(hyphen)){
x <- gsub("-", hyphen, x)
}
x
}
replace_na <- function(x, y = 0) {x[is.na(x)] <- y; x}
replace_zero <- function(x, y = NA) {x[x == 0] <- y; x}
#' Plots a sentiment object
#'
#' Plots a sentiment object.
#'
#' @param x The sentiment object.
#' @param transformation.function A transformation function to smooth the sentiment
#' scores.
#' @param \ldots Other arguments passed to \code{\link[syuzhet]{get_transformed_values}}.
#' @details Utilizes Matthew Jocker's \pkg{syuzhet} package to calculate smoothed
#' sentiment across the duration of the text.
#' @return Returns a \pkg{ggplot2} object.
#' @method plot sentiment
#' @importFrom syuzhet get_dct_transform
#' @export
plot.sentiment <- function(x, transformation.function = syuzhet::get_dct_transform, ...){
m <- transformation.function(stats::na.omit(x[["sentiment"]]), ...)
dat <- data.frame(
Emotional_Valence = m,
Duration = seq_along(m)
)
ggplot2::ggplot(dat, ggplot2::aes_string('Duration', 'Emotional_Valence')) +
ggplot2::geom_path(size=1, color="blue") +
ggplot2::theme_bw() +
ggplot2::theme(plot.margin = grid::unit(c(5.1, 15.1, 4.1, 2.1), "pt")) +
ggplot2::ylab("Emotional Valence") +
ggplot2::theme(panel.grid = ggplot2::element_blank()) +
ggplot2::scale_x_continuous(label=function(x) paste0(x, "%"),
expand = c(0,0), limits = c(0,100))
}
sentiment_stanford_helper <- function (text.var,
stanford.tagger = coreNLPsetup::coreNLP_loc(), java.path = "java", ...) {
if (!file.exists(stanford.tagger)) {
coreNLPsetup::check_stanford_installed(...)
}
text.var <- gsub("[.?!](?!$)", " ", gsub("(?<=[.?!])[.?!]+$", "", text.var, perl = TRUE), perl = TRUE)
#message("\nAnalyzing text for sentiment...\n")
cmd <- sprintf(
#"%s -cp \"%s/*\" -mx5g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators \"tokenize,ssplit,parse,sentiment\" -ssplit.eolonly",
"%s -cp \"%s/*\" -mx5g edu.stanford.nlp.sentiment.SentimentPipeline -stdin",
java.path, stanford.tagger
)
results <- system(cmd, input = text.var, intern = TRUE, ignore.stderr = TRUE)
as.numeric(.mgsub(c(".*Very negative", ".*Negative", ".*Neutral",
".*Positive", ".*Very positive"), seq(-1, 1, by = 0.5),
results, fixed = FALSE))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.