#' @title Fit a Generalized Word Power model.
#' @description Fit a Generalized Word Power model on a target variable using word frequency data with an elastic-net model.
#' @param frequencies frequencies data.table generated using the function \code{\link{computeFrequencies}}.
#' @param responseData data.frame with column \code{regID} and \code{y}.
#' \code{regID} should match with the \code{regID} of the \code{frequencies} input while \code{y} is the response variable.
#' @param alpha alpha parameter for an elastic-net model.
#' @param lowerLimit lower limit on the number of time a sentiment word must appear over all the period.
#' If the value is above or equal to 1, this value is absolute. If the value is below 1, this value is in percentage term.
#' @return A list with the following elements:
#' \itemize{
#' \item scores: data.frame wiht the column \code{word}, \code{score}, and \code{importance}.
#' the word column correspond to the sentiment words, the score column correspond to the polarity of the sentiment words,
#' and the importance column correspond to the importance of each word withing the calbration sample.
#' \item sentiment: data.frame with colmn \code{regID}, \code{SentimentScores}, and \code{y}.
#' the column regID correspond to the regID of each observation, the columne SentimentScores is the sentiment for that observation,
#' and the column y is the initially given reponse variable.
#' }
#' @examples
#' # Load example data
#' data("corpus", package = "GWP")
#' data("vix", package = "GWP")
#' # Setup the lexicons
#' sentimentWord <- sentometrics::list_lexicons$LM_en$x
#' shifterWord <- sentometrics::list_valence_shifters$en[, c("x", "y")]
#'
#' # Generate the frequency data
#' frequencies <- computeFrequencies(corpus, sentimentWord, shifterWord, clusterSize = 5)
#'
#' # Calibrate the lexicon
#' res = fitGWP(frequencies = frequencies, responseData = vix)
#' @import caret glmnet
#' @importFrom stats sd
#' @importFrom stats var
#' @export
fitGWP <- function(frequencies, responseData, alpha = c(0, 0.2, 0.4, 0.6, 0.8, 1), lowerLimit = 0.2) {
aggFreq <- aggregateByRegID(frequencies)
sparseAggFreq <- toSparseMatrix(aggFreq)
if(lowerLimit >= 1){
id.freq <- colSums(as.matrix(sparseAggFreq) != 0) >= lowerLimit
} else {
id.freq <- colSums(as.matrix(sparseAggFreq) != 0)/nrow(sparseAggFreq) >= lowerLimit
}
sparseAggFreq = sparseAggFreq[,id.freq]
id.match <- match(rownames(sparseAggFreq), responseData$regID)
missing.obs <- rownames(sparseAggFreq)[is.na(id.match)]
if(length(missing.obs) > 0){
warning(paste0(" Cannot match ",
length(missing.obs),
" regIDs or ",
round(length(missing.obs)/nrow(sparseAggFreq),2),
" percent of the regIDs. Ignoring those observations for calibration."))
}
y <- responseData$y[id.match[!is.na(id.match)]]
sparseAggFreq <- sparseAggFreq[!is.na(id.match),]
flds <- caret::createFolds(y,
k = 10,
list = TRUE,
returnTrain = FALSE)
foldids <- rep(1, length(y))
for(i in 2:length(flds)){
foldids[flds[[i]]] <- i
}
cv.res <- matrix(nrow = length(alpha), ncol = 3)
for(i in 1:length(alpha)){
reg <- glmnet::cv.glmnet(sparseAggFreq,y,
alpha = alpha[i],
standardize = TRUE,
foldid = foldids)
cv.res[i,] <- c(alpha[i], reg$cvm[which(reg$lambda == reg$lambda.min)], reg$lambda.min)
}
id.best <- which.min(cv.res[1,])
reg <- glmnet::cv.glmnet(sparseAggFreq,
y,
alpha = alpha[id.best],
standardize = TRUE)
tmp_coef <- as.matrix(coef(reg, s = "lambda.min"))
coef <- as.vector(tmp_coef)[-1]
names(coef) <- rownames(tmp_coef)[-1]
scores <- (coef - mean(coef))/sd(coef)
scores <- data.frame(word = names(coef), score = scores)
rownames(scores) <- NULL
importance <- NA
for(i in 1:ncol(sparseAggFreq)){
importance[i] <- coef[i]^2*var(sparseAggFreq[, i])
}
importance <- importance/sum(importance)
scores$importance <- importance
aggFrequenciesWithSentimentScores <- addSentimentScores(aggFreq, scores)
sentiment <- computeSentimentByRegID(aggFrequenciesWithSentimentScores)
sentiment$responseData = NA
sentiment$responseData[!is.na(id.match)] <- y
res <- list(scores = scores, sentiment = sentiment, param = cv.res[id.best,], reg = reg)
return(res)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.