#' Sentiment analysis
#'
#' Performs sentiment analysis of given object (vector of strings, document-term
#' matrix, corpus).
#'
#' @param x A vector of characters, a \code{data.frame}, an object of type
#' \code{\link[tm]{Corpus}}, \code{\link[tm]{TermDocumentMatrix}} or
#' \code{\link[tm]{DocumentTermMatrix}}
#' @param language Language used for preprocessing operations (default:
#' English)
#' @param aggregate A factor variable by which documents can be grouped.
#' This helpful when joining e.g. news from the same day or move reviews
#' by the same author
#' @param rules A named list containing individual sentiment metrics.
#' Therefore, each entry consists itself of a list with first a method,
#' followed by an optional dictionary.
#' @param removeStopwords Flag indicating whether to remove stopwords or not (default: yes)
#' @param stemming Perform stemming (default: TRUE)
#' @param ... Additional parameters passed to function for e.g.
#' preprocessing
#' @return Result is a matrix with sentiment values for each document across
#' all defined rules
#' @examples
#' \dontrun{
#' library(tm)
#'
#' # via vector of strings
#' corpus <- c("Positive text", "Neutral but uncertain text", "Negative text")
#' sentiment <- analyzeSentiment(corpus)
#' compareToResponse(sentiment, c(+1, 0, -2))
#'
#' # via Corpus from tm package
#' data("crude")
#' sentiment <- analyzeSentiment(crude)
#'
#' # via DocumentTermMatrix (with stemmed entries)
#' dtm <- DocumentTermMatrix(VCorpus(VectorSource(c("posit posit", "negat neutral"))))
#' sentiment <- analyzeSentiment(dtm)
#' compareToResponse(sentiment, convertToBinaryResponse(c(+1, -1)))
#'
#' # By adapting the parameter rules, one can incorporate customized dictionaries
#' # e.g. in order to adapt to arbitrary languages
#' dictionaryAmplifiers <- SentimentDictionary(c("more", "much"))
#' sentiment <- analyzeSentiment(corpus,
#' rules=list("Amplifiers"=list(ruleRatio,
#' dictionaryAmplifiers)))
#'
#' # On can also restrict the number of computed methods to the ones of interest
#' # in order to achieve performance optimizations
#' sentiment <- analyzeSentiment(corpus,
#' rules=list("SentimentLM"=list(ruleSentiment,
#' loadDictionaryLM())))
#' sentiment
#' }
#'
#' @details This function returns a data.frame with continuous values. If one desires
#' other formats, one needs to convert these. Common examples of such formats are
#' binary response values (positive / negative) or tertiary (positive, neutral,
#' negative). Hence, consider using the functions \code{\link{convertToBinaryResponse}} and
#' \code{\link{convertToDirection}}, which can convert a vector of continuous sentiment
#' scores into a factor object.
#' @keywords sentiment
#' @seealso \code{\link{compareToResponse}} for evaluating the results,
#' \code{\link{convertToBinaryResponse}} and \code{\link{convertToDirection}} for
#' for getting binary results, \code{\link{generateDictionary}} for dictionary generation,
#' \code{\link{plotSentiment}} and \code{\link{plotSentimentResponse}} for visualization
#' @rdname analyzeSentiment
#' @export
"analyzeSentiment" <- function(x, language="english", aggregate=NULL,
rules=defaultSentimentRules(),
removeStopwords=TRUE, stemming=TRUE,
...) {
UseMethod("analyzeSentiment", x)
}
#' @rdname analyzeSentiment
#' @export
"analyzeSentiment.Corpus" <- function(x, language="english", aggregate=NULL,
rules=defaultSentimentRules(),
removeStopwords=TRUE, stemming=TRUE,
...) {
dtm <- toDocumentTermMatrix(x, language=language,
removeStopwords=removeStopwords, stemming=stemming,
weighting=function(x) tm::weightTf(x))
return(analyzeSentiment(dtm, language, aggregate, rules, removeStopwords, stemming, ...))
}
#' @rdname analyzeSentiment
#' @export
"analyzeSentiment.character" <- function(x, language="english", aggregate=NULL,
rules=defaultSentimentRules(),
removeStopwords=TRUE, stemming=TRUE,
...) {
corpus <- transformIntoCorpus(x)
return(analyzeSentiment(corpus, language, aggregate, rules, removeStopwords, stemming, ...))
}
#' @rdname analyzeSentiment
#' @export
"analyzeSentiment.data.frame" <- function(x, language="english", aggregate=NULL,
rules=defaultSentimentRules(),
removeStopwords=TRUE, stemming=TRUE,
...) {
corpus <- transformIntoCorpus(x)
return(analyzeSentiment(corpus, language, aggregate, rules, removeStopwords, stemming, ...))
}
#' @rdname analyzeSentiment
#' @export
"analyzeSentiment.TermDocumentMatrix" <- function(x, language="english", aggregate=NULL,
rules=defaultSentimentRules(),
removeStopwords=TRUE, stemming=TRUE,
...) {
analyzeSentiment(t(x), language, aggregate, rules, removeStopwords, stemming, ...)
}
#' @rdname analyzeSentiment
#' @export
"analyzeSentiment.DocumentTermMatrix" <- function(x, language="english", aggregate=NULL,
rules=defaultSentimentRules(),
removeStopwords=TRUE, stemming=TRUE,
...) {
sent <- list()
for (n in names(rules)) {
rule <- rules[[n]]
if (length(rule) == 1) {
sent[[n]] <- rule[[1]](x)
} else {
sent[[n]] <- rule[[1]](x, rule[[2:length(rule)]])
}
}
df <- as.data.frame(t(do.call(rbind, sent)),
stringsAsFactors=FALSE,
row.names=NULL)
rownames(df) <- NULL
return(df)
}
#' Convert continuous sentiment to direction
#'
#' This function converts continuous sentiment scores into a their corresponding
#' binary sentiment class. As such, the result is a factor with two levels
#' indicating positive and negative content. Neutral documents (with a sentiment
#' score of 0) are counted as positive.
#'
#' @param sentiment Vector, matrix or data.frame with sentiment scores.
#' @return If a vector is supplied, it returns a factor with two levels representing
#' positive and negative content. Otherwise, it returns a data.frame with the
#' corresponding columns being exchanged.
#' @details If a matrix or data.frame is provided, this routine does not touch
#' all columns. In fact, it scans for those where the column name starts with
#' "Sentiment" and changes these columns only. Hence, columns with pure
#' negativity, positivity or ratios or word counts are ignored.
#' @examples
#' sentiment <- c(-1, -0.5, +1, 0.6, 0)
#' convertToBinaryResponse(sentiment)
#' convertToDirection(sentiment)
#'
#' df <- data.frame(No=1:5, Sentiment=sentiment)
#' df
#' convertToBinaryResponse(df)
#' convertToDirection(df)
#' @keywords sentiment evaluation
#' @seealso \code{\link{convertToDirection}}
#' @export
convertToBinaryResponse <- function(sentiment) {
if (is.null(sentiment)) {
stop("Input argument is NULL.")
}
if (is.matrix(sentiment) || is.data.frame(sentiment)) {
if (is.matrix(sentiment)) {
sentiment <- data.frame(sentiment,
stringsAsFactors=FALSE)
}
for (i in grep("^Sentiment", colnames(sentiment))) {
sentiment[, i] <- convertToBinaryResponse(sentiment[, i])
}
return(sentiment)
}
direction <- sign(sentiment)
direction[which(direction==0)] <- 1
return(factor(direction,
levels=c(-1, +1),
labels=c("negative", "positive")))
}
#' Convert continuous sentiment to direction
#'
#' This function converts continuous sentiment scores into a their corresponding
#' sentiment direction. As such, the result is a factor with three levels
#' indicating positive, neutral and negative content. In contrast
#' to \code{\link{convertToBinaryResponse}}, neutral documents have their own category.
#'
#' @param sentiment Vector, matrix or data.frame with sentiment scores.
#' @return If a vector is supplied, it returns a factor with three levels representing
#' positive, neutral and negative content. Otherwise, it returns a data.frame with the
#' corresponding columns being exchanged.
#' @details If a matrix or data.frame is provided, this routine does not touch
#' all columns. In fact, it scans for those where the column name starts with
#' "Sentiment" and changes these columns only. Hence, columns with pure
#' negativity, positivity or ratios or word counts are ignored.
#' @examples
#' sentiment <- c(-1, -0.5, +1, 0.6, 0)
#' convertToBinaryResponse(sentiment)
#' convertToDirection(sentiment)
#'
#' df <- data.frame(No=1:5, Sentiment=sentiment)
#' df
#' convertToBinaryResponse(df)
#' convertToDirection(df)
#' @keywords sentiment evaluation
#' @seealso \code{\link{convertToBinaryResponse}}
#' @export
convertToDirection <- function(sentiment) {
if (is.null(sentiment)) {
stop("Input argument is NULL.")
}
if (is.matrix(sentiment) || is.data.frame(sentiment)) {
if (is.matrix(sentiment)) {
sentiment <- data.frame(sentiment,
stringsAsFactors=FALSE)
}
for (i in grep("^Sentiment", colnames(sentiment))) {
sentiment[, i] <- convertToDirection(sentiment[, i])
}
return(sentiment)
}
return(factor(sign(sentiment),
levels=c(-1, 0, +1),
labels=c("negative", "neutral", "positive")))
}
#' Compare sentiment values to existing response variable
#'
#' This function compares the calculated sentiment values with an external
#' response variable. Examples of such an exogenous response are stock market
#' movements or IMDb move rating. Both usually reflect a "true" value that
#' the sentiment should match.
#'
#' @param sentiment Matrix with sentiment scores for each document across several
#' sentiment rules
#' @param response Vector with "true" response. This vector can either be of a
#' continuous numeric or binary values. In case of the latter, FALSE is matched
#' to a negative sentiment value, while TRUE is matched to a non-negative one.
#' @return Matrix with different performance metrics for all given sentiment
#' rules
#' @examples
#' sentiment <- matrix(c(5.5, 2.9, 0.9, -1),
#' dimnames=list(c("A", "B", "C", "D"), c("Sentiment")))
#'
#' # continuous numeric response variable
#' response <- c(5, 3, 1, -1)
#' compareToResponse(sentiment, response)
#'
#' # binary response variable
#' response <- c(TRUE, TRUE, FALSE, FALSE)
#' compareToResponse(sentiment, response)
#' @keywords evaluation
#' @rdname compareToResponse
#' @export
"compareToResponse" <- function(sentiment, response) {
UseMethod("compareToResponse", response)
}
#' @rdname compareToResponse
#' @export
"compareToResponse.logical" <- function(sentiment, response) {
compareToResponse.factor(sentiment, factor(response, levels=c("negative", "positive")))
}
#' @rdname compareToResponse
#' @export
"compareToResponse.factor" <- function(sentiment, response) {
r <- list()
if (all(levels(response) == c("negative", "positive"))) {
for (n in colnames(sentiment)) {
r[[n]] <- c(evalBinaryClassifier(convertToBinaryResponse(sentiment[, n]), response),
"avg.sentiment.pos.response"=mean(sentiment[response == "positive", n]),
"avg.sentiment.neg.response"=mean(sentiment[response == "negative", n]))
}
} else if (all(levels(response) == c("negative", "neutral", "positive"))) {
stop("Not yet implemented; use binary response instead.")
} else {
stop("Factor levels do not match expected format")
}
return(t(do.call(rbind, r)))
}
#' @rdname compareToResponse
#' @export
"compareToResponse.integer" <- function(sentiment, response) {
compareToResponse.numeric(sentiment, response)
}
#' @rdname compareToResponse
#' @export
"compareToResponse.data.frame" <- function(sentiment, response) {
compareToResponse.numeric(sentiment, response)
}
#' @rdname compareToResponse
#' @importFrom stats lm cor.test
#' @export
"compareToResponse.numeric" <- function(sentiment, response) {
result <- list("cor"=cor(sentiment, response)[,1],
"cor.t.statistic"=unlist(lapply(colnames(sentiment), function(x) cor.test(sentiment[, x], response)$statistic)),
"cor.p.value"=unlist(lapply(colnames(sentiment), function(x) cor.test(sentiment[, x], response)$p.val)),
"lm.t.value"=unlist(lapply(colnames(sentiment), function(x) {
tryCatch(
{
return(summary(lm(response ~ sentiment[, x]))$coefficients[2,3])
},
error = function(e) {
return(NA)
})
})),
"r.squared"=(cor(sentiment, response)^2)[,1],
"RMSE"=unlist(lapply(colnames(sentiment), function(x) sqrt(mean((sentiment[, x]-response)^2)))),
"MAE"=unlist(lapply(colnames(sentiment), function(x) mean(abs(sentiment[, x]-response)))))
return(rbind(do.call(rbind, result),
compareToResponse(sentiment, convertToBinaryResponse(response))))
}
"evalBinaryClassifier" <- function(pred, true) {
cm <- table(pred, true)
# TODO: Make possible for tertiary
r <- c("Accuracy"=(cm[1,1]+cm[2,2])/(cm[1,1]+cm[1,2]+cm[2,1]+cm[2,2]),
"Precision"=cm[1,1]/(cm[1,1]+cm[1,2]),
"Sensitivity"=cm[1,1]/(cm[1,1]+cm[2,1]),
"Specificity"=cm[2,2]/(cm[1,2]+cm[2,2]))
r["F1"] <- (2*r["Sensitivity"]*r["Precision"])/(r["Sensitivity"]+r["Precision"])
r["BalancedAccuracy"] <- (r["Sensitivity"]+r["Specificity"])/2
return(r)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.