View source: R/generateDictionary.R
| generateDictionary | R Documentation | 
Routine applies method for dictionary generation (LASSO, ridge regularization, elastic net, ordinary least squares, generalized linear model or spike-and-slab regression) to the document-term matrix in order to extract decisive terms that have a statistically significant impact on the response variable.
generateDictionary(
  x,
  response,
  language = "english",
  modelType = "lasso",
  filterTerms = NULL,
  control = list(),
  minWordLength = 3,
  sparsity = 0.9,
  weighting = function(x) tm::weightTfIdf(x, normalize = FALSE),
  ...
)
## S3 method for class 'Corpus'
generateDictionary(
  x,
  response,
  language = "english",
  modelType = "lasso",
  filterTerms = NULL,
  control = list(),
  minWordLength = 3,
  sparsity = 0.9,
  weighting = function(x) tm::weightTfIdf(x, normalize = FALSE),
  ...
)
## S3 method for class 'character'
generateDictionary(
  x,
  response,
  language = "english",
  modelType = "lasso",
  filterTerms = NULL,
  control = list(),
  minWordLength = 3,
  sparsity = 0.9,
  weighting = function(x) tm::weightTfIdf(x, normalize = FALSE),
  ...
)
## S3 method for class 'data.frame'
generateDictionary(
  x,
  response,
  language = "english",
  modelType = "lasso",
  filterTerms = NULL,
  control = list(),
  minWordLength = 3,
  sparsity = 0.9,
  weighting = function(x) tm::weightTfIdf(x, normalize = FALSE),
  ...
)
## S3 method for class 'TermDocumentMatrix'
generateDictionary(
  x,
  response,
  language = "english",
  modelType = "lasso",
  filterTerms = NULL,
  control = list(),
  minWordLength = 3,
  sparsity = 0.9,
  weighting = function(x) tm::weightTfIdf(x, normalize = FALSE),
  ...
)
## S3 method for class 'DocumentTermMatrix'
generateDictionary(
  x,
  response,
  language = "english",
  modelType = "lasso",
  filterTerms = NULL,
  control = list(),
  minWordLength = 3,
  sparsity = 0.9,
  weighting = function(x) tm::weightTfIdf(x, normalize = FALSE),
  ...
)
| x | A vector of characters, a `Corpus`, a `data.frame`, a `TermDocumentMatrix` or a `DocumentTermMatrix` containing the documents. | 
| response | Response variable including the given gold standard. | 
| language | Language used for preprocessing operations (default: English). | 
| modelType | A string denoting the estimation method. Allowed values are `"lasso"`, `"ridge"`, `"enet"`, `"lm"`, `"glm"` or `"spikeslab"` (default: `"lasso"`). | 
| filterTerms | Optional vector of strings (default: `NULL`) that restricts the dictionary generation to the given list of terms. | 
| control | (optional) A list of parameters defining the model used for dictionary generation.
 If `modelType = "lasso"` or `modelType = "ridge"`, the entry `s` selects which value of the penalty parameter lambda is used for extracting the coefficients; `s = "lambda.min"` picks the lambda that minimizes the cross-validation error (see the example below), while the default uses the more conservative `"lambda.1se"` choice.
 If `modelType = "enet"`, `s` works as above and an additional mixing parameter controls the balance between the LASSO and ridge penalties.
 If `modelType = "lm"` or `modelType = "glm"`, the estimation is performed via (generalized) least squares; for a GLM, the error distribution can be set via the `family` argument (e.g. `family = "poisson"`).
 If `modelType = "spikeslab"`, the entries of `control` are passed on to the underlying spike-and-slab regression routine.
 | 
| minWordLength | Removes words given a specific minimum length (default: 3). This preprocessing is applied when the input is a character vector or a corpus and the document-term matrix is generated inside the routine. | 
| sparsity | A numeric for removing sparse terms in the document-term matrix. The
argument `sparsity` specifies the maximal allowed sparsity in the range from 0 to 1 (default: 0.9). | 
| weighting | Weights a document-term matrix by e.g. term frequency - inverse
document frequency (default). Other weighting schemes can be used from the
`tm` package, e.g. `tm::weightTf`. | 
| ... | Additional parameters passed to the function for e.g. 
preprocessing or to the underlying estimation routine (such as `family` for a GLM). | 
Result is an object of type `SentimentDictionaryWeighted` containing the extracted terms together with their estimated coefficients (sentiment weights) and an intercept.
Pröllochs and Feuerriegel (2018). Statistical Inferences for Polarity Identification in Natural Language, PLoS ONE 13(12).
analyzeSentiment, predict.SentimentDictionaryWeighted, 
plot.SentimentDictionaryWeighted and compareToResponse for
advanced evaluations
# Create a vector of strings
documents <- c("This is a good thing!",
               "This is a very good thing!",
               "This is okay.",
               "This is a bad thing.",
               "This is a very bad thing.")
response <- c(1, 0.5, 0, -0.5, -1)
# Generate dictionary with LASSO regularization
dictionary <- generateDictionary(documents, response)
# Show dictionary
dictionary
summary(dictionary)
plot(dictionary)
# Compute in-sample performance
sentiment <- predict(dictionary, documents)
compareToResponse(sentiment, response)
plotSentimentResponse(sentiment, response)
# Generate new dictionary with spike-and-slab regression instead of LASSO regularization
library(spikeslab)
dictionary <- generateDictionary(documents, response, modelType="spikeslab")
# Generate new dictionary with tf weighting instead of tf-idf
library(tm)
dictionary <- generateDictionary(documents, response, weighting=weightTf)
sentiment <- predict(dictionary, documents)
compareToResponse(sentiment, response)
# Use instead lambda.min from the LASSO estimation
dictionary <- generateDictionary(documents, response, control=list(s="lambda.min"))
sentiment <- predict(dictionary, documents)
compareToResponse(sentiment, response)
# Use instead OLS as estimation method
dictionary <- generateDictionary(documents, response, modelType="lm")
sentiment <- predict(dictionary, documents)
sentiment
dictionary <- generateDictionary(documents, response, modelType="lm", 
                                 filterTerms = c("good", "bad"))
sentiment <- predict(dictionary, documents)
sentiment
dictionary <- generateDictionary(documents, response, modelType="lm", 
                                 filterTerms = extractWords(loadDictionaryGI()))
sentiment <- predict(dictionary, documents)
sentiment
# Generate dictionary without LASSO intercept
dictionary <- generateDictionary(documents, response, intercept=FALSE)
dictionary$intercept
 
## Not run: 
imdb <- loadImdb()
# Generate Dictionary
dictionary_imdb <- generateDictionary(imdb$Corpus, imdb$Rating, family="poisson")
summary(dictionary_imdb)
compareDictionaries(dictionary_imdb,
                    loadDictionaryGI())
                    
# Show estimated coefficients with Kernel Density Estimation (KDE)
plot(dictionary_imdb)
plot(dictionary_imdb) + xlim(c(-0.1, 0.1))
# Compute in-sample performance
pred_sentiment <- predict(dictionary_imdb, imdb$Corpus)
compareToResponse(pred_sentiment, imdb$Rating)
# Test a different sparsity parameter
dictionary_imdb <- generateDictionary(imdb$Corpus, imdb$Rating, family="poisson", sparsity=0.99)
summary(dictionary_imdb)
pred_sentiment <- predict(dictionary_imdb, imdb$Corpus)
compareToResponse(pred_sentiment, imdb$Rating)
## End(Not run)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.